source_monitor 0.11.1 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.claude/commands/rails-audit.md +77 -0
- data/CHANGELOG.md +50 -0
- data/CLAUDE.md +2 -2
- data/Gemfile.lock +7 -20
- data/RAILS_AUDIT.md +424 -0
- data/VERSION +1 -1
- data/app/assets/builds/source_monitor/application.css +4 -24
- data/app/assets/builds/source_monitor/application.js +57 -89
- data/app/assets/builds/source_monitor/application.js.map +4 -4
- data/app/assets/javascripts/source_monitor/application.js +3 -6
- data/app/assets/javascripts/source_monitor/controllers/dropdown_controller.js +6 -86
- data/app/assets/javascripts/source_monitor/controllers/filter_submit_controller.js +13 -0
- data/app/assets/javascripts/source_monitor/controllers/modal_controller.js +56 -0
- data/app/assets/javascripts/source_monitor/controllers/notification_controller.js +3 -13
- data/app/components/source_monitor/application_component.rb +10 -0
- data/app/components/source_monitor/filter_dropdown_component.rb +62 -0
- data/app/components/source_monitor/icon_component.rb +140 -0
- data/app/components/source_monitor/status_badge_component.html.erb +8 -0
- data/app/components/source_monitor/status_badge_component.rb +96 -0
- data/app/controllers/concerns/source_monitor/sanitizes_search_params.rb +4 -0
- data/app/controllers/concerns/source_monitor/set_source.rb +13 -0
- data/app/controllers/source_monitor/application_controller.rb +17 -0
- data/app/controllers/source_monitor/bulk_scrape_enablements_controller.rb +6 -10
- data/app/controllers/source_monitor/dashboard_controller.rb +5 -1
- data/app/controllers/source_monitor/import_history_dismissals_controller.rb +1 -1
- data/app/controllers/source_monitor/import_sessions_controller.rb +30 -9
- data/app/controllers/source_monitor/item_scrapes_controller.rb +70 -0
- data/app/controllers/source_monitor/items_controller.rb +2 -69
- data/app/controllers/source_monitor/source_bulk_scrapes_controller.rb +1 -4
- data/app/controllers/source_monitor/source_favicon_fetches_controller.rb +2 -12
- data/app/controllers/source_monitor/source_fetches_controller.rb +1 -6
- data/app/controllers/source_monitor/source_health_checks_controller.rb +9 -16
- data/app/controllers/source_monitor/source_health_resets_controller.rb +1 -6
- data/app/controllers/source_monitor/source_retries_controller.rb +1 -6
- data/app/controllers/source_monitor/source_scrape_tests_controller.rb +2 -4
- data/app/controllers/source_monitor/source_turbo_responses.rb +1 -3
- data/app/controllers/source_monitor/sources_controller.rb +15 -20
- data/app/helpers/source_monitor/application_helper.rb +15 -31
- data/app/helpers/source_monitor/health_badge_helper.rb +8 -0
- data/app/jobs/source_monitor/download_content_images_job.rb +1 -59
- data/app/jobs/source_monitor/favicon_fetch_job.rb +1 -58
- data/app/jobs/source_monitor/fetch_feed_job.rb +2 -52
- data/app/jobs/source_monitor/import_opml_job.rb +6 -145
- data/app/jobs/source_monitor/import_session_health_check_job.rb +15 -76
- data/app/jobs/source_monitor/item_cleanup_job.rb +5 -0
- data/app/jobs/source_monitor/log_cleanup_job.rb +13 -2
- data/app/jobs/source_monitor/schedule_fetches_job.rb +8 -0
- data/app/jobs/source_monitor/scrape_item_job.rb +6 -52
- data/app/jobs/source_monitor/source_health_check_job.rb +1 -72
- data/app/models/concerns/source_monitor/loggable.rb +12 -0
- data/app/models/source_monitor/fetch_log.rb +0 -8
- data/app/models/source_monitor/health_check_log.rb +0 -8
- data/app/models/source_monitor/import_history.rb +14 -0
- data/app/models/source_monitor/import_session.rb +2 -0
- data/app/models/source_monitor/item.rb +15 -0
- data/app/models/source_monitor/item_content.rb +4 -3
- data/app/models/source_monitor/scrape_log.rb +4 -6
- data/app/models/source_monitor/source.rb +28 -19
- data/app/presenters/source_monitor/base_presenter.rb +19 -0
- data/app/presenters/source_monitor/source_details_presenter.rb +61 -0
- data/app/presenters/source_monitor/sources_filter_presenter.rb +61 -0
- data/app/views/source_monitor/dashboard/_recent_activity.html.erb +3 -3
- data/app/views/source_monitor/dashboard/_stat_card.html.erb +2 -1
- data/app/views/source_monitor/dashboard/_stats.html.erb +5 -7
- data/app/views/source_monitor/items/_details.html.erb +11 -14
- data/app/views/source_monitor/items/index.html.erb +10 -35
- data/app/views/source_monitor/logs/index.html.erb +20 -41
- data/app/views/source_monitor/shared/_form_errors.html.erb +14 -0
- data/app/views/source_monitor/source_scrape_tests/_result.html.erb +1 -29
- data/app/views/source_monitor/source_scrape_tests/_result_content.html.erb +33 -0
- data/app/views/source_monitor/source_scrape_tests/show.html.erb +1 -29
- data/app/views/source_monitor/sources/_bulk_scrape_enable_modal.html.erb +2 -2
- data/app/views/source_monitor/sources/_bulk_scrape_modal.html.erb +7 -5
- data/app/views/source_monitor/sources/_details.html.erb +24 -52
- data/app/views/source_monitor/sources/_health_status_badge.html.erb +4 -6
- data/app/views/source_monitor/sources/_row.html.erb +7 -18
- data/app/views/source_monitor/sources/edit.html.erb +1 -10
- data/app/views/source_monitor/sources/index.html.erb +26 -46
- data/app/views/source_monitor/sources/new.html.erb +1 -10
- data/config/routes.rb +1 -1
- data/db/migrate/20260313120000_add_composite_indexes_to_log_tables.rb +14 -0
- data/db/migrate/20260314120000_align_health_status_default.rb +11 -0
- data/lib/source_monitor/analytics/sources_index_metrics.rb +15 -0
- data/lib/source_monitor/dashboard/queries/recent_activity_query.rb +10 -4
- data/lib/source_monitor/dashboard/turbo_broadcaster.rb +21 -5
- data/lib/source_monitor/favicons/fetcher.rb +86 -0
- data/lib/source_monitor/fetching/cloudflare_bypass.rb +14 -5
- data/lib/source_monitor/fetching/completion/event_publisher.rb +12 -0
- data/lib/source_monitor/fetching/completion/follow_up_handler.rb +15 -2
- data/lib/source_monitor/fetching/completion/retention_handler.rb +11 -3
- data/lib/source_monitor/fetching/feed_fetcher.rb +2 -21
- data/lib/source_monitor/fetching/fetch_runner.rb +12 -3
- data/lib/source_monitor/fetching/retry_orchestrator.rb +102 -0
- data/lib/source_monitor/fetching/stalled_fetch_reconciler.rb +9 -0
- data/lib/source_monitor/health/source_health_check_orchestrator.rb +95 -0
- data/lib/source_monitor/health.rb +1 -0
- data/lib/source_monitor/images/downloader.rb +6 -7
- data/lib/source_monitor/images/processor.rb +98 -0
- data/lib/source_monitor/import_sessions/health_check_updater.rb +95 -0
- data/lib/source_monitor/import_sessions/opml_importer.rb +163 -0
- data/lib/source_monitor/items/item_creator.rb +0 -21
- data/lib/source_monitor/logs/query.rb +20 -0
- data/lib/source_monitor/queries/scrape_candidates_query.rb +30 -0
- data/lib/source_monitor/queries.rb +7 -0
- data/lib/source_monitor/scheduler.rb +5 -0
- data/lib/source_monitor/scraping/bulk_result_presenter.rb +11 -8
- data/lib/source_monitor/scraping/runner.rb +52 -0
- data/lib/source_monitor/scraping/scheduler.rb +5 -0
- data/lib/source_monitor/scraping/state.rb +4 -2
- data/lib/source_monitor/security/parameter_sanitizer.rb +7 -0
- data/lib/source_monitor/version.rb +1 -1
- data/lib/source_monitor.rb +7 -0
- data/source_monitor.gemspec +1 -0
- metadata +47 -1
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SourceMonitor
|
|
4
|
+
module Health
|
|
5
|
+
# Orchestrates a source health check: runs the probe, broadcasts
|
|
6
|
+
# UI updates with toast notifications, triggers a follow-up fetch
|
|
7
|
+
# for degraded sources, and handles unexpected errors gracefully.
|
|
8
|
+
# Extracted from SourceHealthCheckJob.
|
|
9
|
+
class SourceHealthCheckOrchestrator
|
|
10
|
+
DEGRADED_STATUSES = %w[declining failing].freeze
|
|
11
|
+
|
|
12
|
+
def initialize(source)
|
|
13
|
+
@source = source
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
def call
|
|
17
|
+
result = SourceMonitor::Health::SourceHealthCheck.new(source: source).call
|
|
18
|
+
broadcast_outcome(result)
|
|
19
|
+
trigger_fetch_if_degraded(result)
|
|
20
|
+
rescue StandardError => error
|
|
21
|
+
log_error(error)
|
|
22
|
+
record_unexpected_failure(error)
|
|
23
|
+
broadcast_outcome(nil, error)
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
private
|
|
27
|
+
|
|
28
|
+
attr_reader :source
|
|
29
|
+
|
|
30
|
+
def trigger_fetch_if_degraded(result)
|
|
31
|
+
return unless result&.success?
|
|
32
|
+
return unless DEGRADED_STATUSES.include?(source.health_status.to_s)
|
|
33
|
+
|
|
34
|
+
SourceMonitor::FetchFeedJob.perform_later(source.id, force: true)
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
def record_unexpected_failure(error)
|
|
38
|
+
SourceMonitor::HealthCheckLog.create!(
|
|
39
|
+
source: source,
|
|
40
|
+
success: false,
|
|
41
|
+
started_at: Time.current,
|
|
42
|
+
completed_at: Time.current,
|
|
43
|
+
duration_ms: 0,
|
|
44
|
+
error_class: error.class.name,
|
|
45
|
+
error_message: error.message
|
|
46
|
+
)
|
|
47
|
+
rescue StandardError
|
|
48
|
+
nil
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
def broadcast_outcome(result, error = nil)
|
|
52
|
+
SourceMonitor::Realtime.broadcast_source(source)
|
|
53
|
+
|
|
54
|
+
message, level = toast_payload(result, error)
|
|
55
|
+
return if message.blank?
|
|
56
|
+
|
|
57
|
+
SourceMonitor::Realtime.broadcast_toast(message: message, level: level)
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
def toast_payload(result, error)
|
|
61
|
+
if error
|
|
62
|
+
return [
|
|
63
|
+
"Health check failed for #{source.name}: #{error.message}",
|
|
64
|
+
:error
|
|
65
|
+
]
|
|
66
|
+
end
|
|
67
|
+
|
|
68
|
+
if result&.success?
|
|
69
|
+
[
|
|
70
|
+
"Health check succeeded for #{source.name}.",
|
|
71
|
+
:success
|
|
72
|
+
]
|
|
73
|
+
else
|
|
74
|
+
failure_reason = result&.error&.message
|
|
75
|
+
http_status = result&.log&.http_status
|
|
76
|
+
message = "Health check failed for #{source.name}"
|
|
77
|
+
message += " (HTTP #{http_status})" if http_status.present?
|
|
78
|
+
message += ": #{failure_reason}" if failure_reason.present?
|
|
79
|
+
[
|
|
80
|
+
"#{message}.",
|
|
81
|
+
:error
|
|
82
|
+
]
|
|
83
|
+
end
|
|
84
|
+
end
|
|
85
|
+
|
|
86
|
+
def log_error(error)
|
|
87
|
+
return unless defined?(Rails) && Rails.respond_to?(:logger)
|
|
88
|
+
|
|
89
|
+
Rails.logger&.error(
|
|
90
|
+
"[SourceMonitor::Health::SourceHealthCheckOrchestrator] error for source #{source.id}: #{error.class}: #{error.message}"
|
|
91
|
+
)
|
|
92
|
+
end
|
|
93
|
+
end
|
|
94
|
+
end
|
|
95
|
+
end
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
require "source_monitor/health/source_health_monitor"
|
|
4
4
|
require "source_monitor/health/source_health_reset"
|
|
5
5
|
require "source_monitor/health/source_health_check"
|
|
6
|
+
require "source_monitor/health/source_health_check_orchestrator"
|
|
6
7
|
require "source_monitor/health/import_source_health_check"
|
|
7
8
|
|
|
8
9
|
module SourceMonitor
|
|
@@ -43,13 +43,12 @@ module SourceMonitor
|
|
|
43
43
|
private
|
|
44
44
|
|
|
45
45
|
def fetch_image
|
|
46
|
-
connection =
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
end
|
|
46
|
+
connection = SourceMonitor::HTTP.client(
|
|
47
|
+
timeout: settings.download_timeout,
|
|
48
|
+
open_timeout: [ settings.download_timeout / 2, 5 ].min,
|
|
49
|
+
headers: { "Accept" => "image/*" },
|
|
50
|
+
retry_requests: false
|
|
51
|
+
)
|
|
53
52
|
|
|
54
53
|
response = connection.get(url)
|
|
55
54
|
return response if response.status == 200
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "faraday"
|
|
4
|
+
|
|
5
|
+
module SourceMonitor
|
|
6
|
+
module Images
|
|
7
|
+
# Orchestrates downloading images from an item's HTML content, attaching
|
|
8
|
+
# them via ActiveStorage, and rewriting the HTML to use local blob URLs.
|
|
9
|
+
# Extracted from DownloadContentImagesJob for testability and reuse.
|
|
10
|
+
class Processor
|
|
11
|
+
TRANSIENT_ERRORS = [
|
|
12
|
+
Timeout::Error, Errno::ETIMEDOUT,
|
|
13
|
+
Faraday::TimeoutError, Faraday::ConnectionFailed,
|
|
14
|
+
Net::OpenTimeout, Net::ReadTimeout
|
|
15
|
+
].freeze
|
|
16
|
+
|
|
17
|
+
def initialize(item)
|
|
18
|
+
@item = item
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
def call
|
|
22
|
+
return unless SourceMonitor.config.images.download_enabled?
|
|
23
|
+
|
|
24
|
+
html = item.content
|
|
25
|
+
return if html.blank?
|
|
26
|
+
|
|
27
|
+
item_content = item.item_content || item.build_item_content
|
|
28
|
+
|
|
29
|
+
# Skip if images already attached (idempotency)
|
|
30
|
+
return if item_content.persisted? && item_content.images.attached?
|
|
31
|
+
|
|
32
|
+
base_url = item.url
|
|
33
|
+
rewriter = SourceMonitor::Images::ContentRewriter.new(html, base_url: base_url)
|
|
34
|
+
image_urls = rewriter.image_urls
|
|
35
|
+
return if image_urls.empty?
|
|
36
|
+
|
|
37
|
+
# Save item_content first so we can attach blobs to it
|
|
38
|
+
item_content.save! unless item_content.persisted?
|
|
39
|
+
|
|
40
|
+
# Download images and build URL mapping
|
|
41
|
+
url_mapping = download_images(item_content, image_urls)
|
|
42
|
+
return if url_mapping.empty?
|
|
43
|
+
|
|
44
|
+
# Rewrite HTML with Active Storage URLs
|
|
45
|
+
rewritten_html = rewriter.rewrite do |original_url|
|
|
46
|
+
url_mapping[original_url]
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
# Update the item content with rewritten HTML
|
|
50
|
+
item.update!(content: rewritten_html)
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
private
|
|
54
|
+
|
|
55
|
+
attr_reader :item
|
|
56
|
+
|
|
57
|
+
def download_images(item_content, image_urls)
|
|
58
|
+
url_mapping = {}
|
|
59
|
+
settings = SourceMonitor.config.images
|
|
60
|
+
|
|
61
|
+
image_urls.each do |image_url|
|
|
62
|
+
result = SourceMonitor::Images::Downloader.new(image_url, settings: settings).call
|
|
63
|
+
next unless result
|
|
64
|
+
|
|
65
|
+
blob = ActiveStorage::Blob.create_and_upload!(
|
|
66
|
+
io: result.io,
|
|
67
|
+
filename: result.filename,
|
|
68
|
+
content_type: result.content_type
|
|
69
|
+
)
|
|
70
|
+
item_content.images.attach(blob)
|
|
71
|
+
|
|
72
|
+
url_mapping[image_url] = Rails.application.routes.url_helpers.rails_blob_path(blob, only_path: true)
|
|
73
|
+
rescue ActiveRecord::Deadlocked
|
|
74
|
+
raise # let job framework retry on database deadlock
|
|
75
|
+
rescue *TRANSIENT_ERRORS
|
|
76
|
+
raise # re-raise transient errors to abort job for framework retry
|
|
77
|
+
rescue StandardError => error
|
|
78
|
+
# Individual image failure should not block others.
|
|
79
|
+
# Original URL will be preserved (graceful fallback).
|
|
80
|
+
log_image_error(image_url, error)
|
|
81
|
+
next
|
|
82
|
+
end
|
|
83
|
+
|
|
84
|
+
url_mapping
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
def log_image_error(image_url, error)
|
|
88
|
+
return unless defined?(Rails) && Rails.respond_to?(:logger) && Rails.logger
|
|
89
|
+
|
|
90
|
+
Rails.logger.warn(
|
|
91
|
+
"[SourceMonitor::Images::Processor] Skipping image #{image_url}: #{error.class} - #{error.message}"
|
|
92
|
+
)
|
|
93
|
+
rescue StandardError
|
|
94
|
+
nil
|
|
95
|
+
end
|
|
96
|
+
end
|
|
97
|
+
end
|
|
98
|
+
end
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "source_monitor/health/import_source_health_check"
|
|
4
|
+
require "source_monitor/import_sessions/entry_normalizer"
|
|
5
|
+
require "source_monitor/import_sessions/health_check_broadcaster"
|
|
6
|
+
|
|
7
|
+
module SourceMonitor
|
|
8
|
+
module ImportSessions
|
|
9
|
+
# Performs a health check for a single OPML import entry: probes the feed
|
|
10
|
+
# URL, acquires a row lock to merge the result into the import session,
|
|
11
|
+
# and broadcasts UI updates. Extracted from ImportSessionHealthCheckJob.
|
|
12
|
+
class HealthCheckUpdater
|
|
13
|
+
def initialize(import_session:, entry_id:)
|
|
14
|
+
@import_session = import_session
|
|
15
|
+
@entry_id = entry_id
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
def call
|
|
19
|
+
return unless active_for?(import_session)
|
|
20
|
+
|
|
21
|
+
result = perform_health_check
|
|
22
|
+
return unless result
|
|
23
|
+
|
|
24
|
+
updated_entry = nil
|
|
25
|
+
|
|
26
|
+
import_session.with_lock do
|
|
27
|
+
import_session.reload
|
|
28
|
+
return unless active_for?(import_session)
|
|
29
|
+
|
|
30
|
+
entries = Array(import_session.parsed_sources).map(&:to_h)
|
|
31
|
+
index = entries.index { |candidate| entry_id_for(candidate) == entry_id.to_s }
|
|
32
|
+
return unless index
|
|
33
|
+
|
|
34
|
+
entries[index] = entries[index].merge(
|
|
35
|
+
"health_status" => result.status,
|
|
36
|
+
"health_error" => result.error_message
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
selected_ids = Array(import_session.selected_source_ids).map(&:to_s)
|
|
40
|
+
selected_ids -= [ entry_id.to_s ] if result.status == "unhealthy"
|
|
41
|
+
|
|
42
|
+
attrs = {
|
|
43
|
+
parsed_sources: entries,
|
|
44
|
+
selected_source_ids: selected_ids,
|
|
45
|
+
health_check_completed_at: completion_time(entries, import_session.health_check_targets)
|
|
46
|
+
}.compact
|
|
47
|
+
|
|
48
|
+
import_session.update!(attrs)
|
|
49
|
+
normalized_entry = SourceMonitor::ImportSessions::EntryNormalizer.normalize(entries[index])
|
|
50
|
+
updated_entry = normalized_entry.merge(selected: selected_ids.include?(entry_id.to_s))
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
broadcaster = SourceMonitor::ImportSessions::HealthCheckBroadcaster.new(import_session)
|
|
54
|
+
broadcaster.broadcast_row(updated_entry) if updated_entry
|
|
55
|
+
broadcaster.broadcast_progress
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
private
|
|
59
|
+
|
|
60
|
+
attr_reader :import_session, :entry_id
|
|
61
|
+
|
|
62
|
+
def active_for?(session)
|
|
63
|
+
session.current_step == "health_check" && session.health_checks_active?
|
|
64
|
+
end
|
|
65
|
+
|
|
66
|
+
def perform_health_check
|
|
67
|
+
entry = find_entry
|
|
68
|
+
return unless entry
|
|
69
|
+
|
|
70
|
+
SourceMonitor::Health::ImportSourceHealthCheck.new(feed_url: entry_feed_url(entry)).call
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
def find_entry
|
|
74
|
+
Array(import_session.parsed_sources).find { |entry| entry_id_for(entry) == entry_id.to_s }
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
def entry_id_for(entry)
|
|
78
|
+
entry.to_h["id"].presence || entry.to_h[:id].presence || entry.to_h["feed_url"].to_s
|
|
79
|
+
end
|
|
80
|
+
|
|
81
|
+
def entry_feed_url(entry)
|
|
82
|
+
entry.to_h["feed_url"] || entry.to_h[:feed_url]
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
def completion_time(entries, targets)
|
|
86
|
+
normalized = Array(entries).map { |entry| SourceMonitor::ImportSessions::EntryNormalizer.normalize(entry) }
|
|
87
|
+
filtered = normalized.select { |entry| targets.include?(entry[:id]) }
|
|
88
|
+
return nil if filtered.empty?
|
|
89
|
+
|
|
90
|
+
completed = filtered.count { |entry| %w[healthy unhealthy].include?(entry[:health_status].to_s) }
|
|
91
|
+
completed >= filtered.size ? Time.current : nil
|
|
92
|
+
end
|
|
93
|
+
end
|
|
94
|
+
end
|
|
95
|
+
end
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "set"
|
|
4
|
+
require "source_monitor/import_sessions/entry_normalizer"
|
|
5
|
+
require "source_monitor/realtime/broadcaster"
|
|
6
|
+
require "source_monitor/sources/params"
|
|
7
|
+
|
|
8
|
+
module SourceMonitor
|
|
9
|
+
module ImportSessions
|
|
10
|
+
# Orchestrates OPML import: selects entries, deduplicates, creates sources,
|
|
11
|
+
# records results, and broadcasts completion. Extracted from ImportOpmlJob
|
|
12
|
+
# so import logic can be invoked synchronously (console, tests).
|
|
13
|
+
class OPMLImporter
|
|
14
|
+
def initialize(import_session:, import_history:)
|
|
15
|
+
@import_session = import_session
|
|
16
|
+
@import_history = import_history
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
def call
|
|
20
|
+
import_history.update_columns(started_at: Time.current) unless import_history.started_at
|
|
21
|
+
|
|
22
|
+
processed = Set.new
|
|
23
|
+
|
|
24
|
+
selected_entries.each do |entry|
|
|
25
|
+
process_entry(entry, processed)
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
import_history.update!(
|
|
29
|
+
imported_sources: imported_sources,
|
|
30
|
+
failed_sources: failed_sources,
|
|
31
|
+
skipped_duplicates: skipped_duplicates,
|
|
32
|
+
bulk_settings: import_session.bulk_settings.presence || {},
|
|
33
|
+
completed_at: Time.current
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
broadcast_completion(import_history)
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
private
|
|
40
|
+
|
|
41
|
+
attr_reader :import_session, :import_history
|
|
42
|
+
|
|
43
|
+
def selected_entries
|
|
44
|
+
ids = Array(import_session.selected_source_ids).map(&:to_s)
|
|
45
|
+
|
|
46
|
+
Array(import_session.parsed_sources)
|
|
47
|
+
.map { |entry| SourceMonitor::ImportSessions::EntryNormalizer.normalize(entry) }
|
|
48
|
+
.select { |entry| ids.include?(entry[:id]) }
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
def process_entry(entry, processed)
|
|
52
|
+
feed_url = entry[:feed_url].to_s
|
|
53
|
+
|
|
54
|
+
if feed_url.blank?
|
|
55
|
+
failed_sources << failure_payload(feed_url, "MissingFeedURL", "Feed URL is missing")
|
|
56
|
+
return
|
|
57
|
+
end
|
|
58
|
+
|
|
59
|
+
normalized_url = feed_url.downcase
|
|
60
|
+
|
|
61
|
+
if processed.include?(normalized_url)
|
|
62
|
+
skipped_duplicates << skipped_payload(feed_url, "duplicate in import selection")
|
|
63
|
+
return
|
|
64
|
+
end
|
|
65
|
+
|
|
66
|
+
if duplicate_source?(normalized_url)
|
|
67
|
+
skipped_duplicates << skipped_payload(feed_url, "already exists")
|
|
68
|
+
processed << normalized_url
|
|
69
|
+
return
|
|
70
|
+
end
|
|
71
|
+
|
|
72
|
+
source = SourceMonitor::Source.new(build_attributes(entry))
|
|
73
|
+
|
|
74
|
+
if source.save
|
|
75
|
+
imported_sources << { id: source.id, feed_url: source.feed_url, name: source.name }
|
|
76
|
+
SourceMonitor::FaviconFetchJob.perform_later(source.id) if should_fetch_favicon?(source)
|
|
77
|
+
processed << normalized_url
|
|
78
|
+
else
|
|
79
|
+
failed_sources << failure_payload(feed_url, "ValidationFailed", source.errors.full_messages.to_sentence)
|
|
80
|
+
end
|
|
81
|
+
rescue ActiveRecord::RecordNotUnique
|
|
82
|
+
skipped_duplicates << skipped_payload(feed_url, "already exists")
|
|
83
|
+
processed << normalized_url
|
|
84
|
+
rescue StandardError => error
|
|
85
|
+
failed_sources << failure_payload(feed_url, error.class.name, error.message)
|
|
86
|
+
end
|
|
87
|
+
|
|
88
|
+
def duplicate_source?(normalized_feed_url)
|
|
89
|
+
SourceMonitor::Source.where("LOWER(feed_url) = ?", normalized_feed_url).exists?
|
|
90
|
+
end
|
|
91
|
+
|
|
92
|
+
def build_attributes(entry)
|
|
93
|
+
defaults = SourceMonitor::Sources::Params.default_attributes.deep_dup
|
|
94
|
+
settings = SourceMonitor::Security::ParameterSanitizer.sanitize(import_session.bulk_settings.presence || {})
|
|
95
|
+
settings = settings.deep_symbolize_keys
|
|
96
|
+
|
|
97
|
+
defaults.merge(settings).merge(identity_attributes(entry))
|
|
98
|
+
end
|
|
99
|
+
|
|
100
|
+
def identity_attributes(entry)
|
|
101
|
+
{
|
|
102
|
+
name: entry[:title].presence || entry[:feed_url],
|
|
103
|
+
feed_url: entry[:feed_url],
|
|
104
|
+
website_url: entry[:website_url]
|
|
105
|
+
}
|
|
106
|
+
end
|
|
107
|
+
|
|
108
|
+
def imported_sources
|
|
109
|
+
@imported_sources ||= []
|
|
110
|
+
end
|
|
111
|
+
|
|
112
|
+
def failed_sources
|
|
113
|
+
@failed_sources ||= []
|
|
114
|
+
end
|
|
115
|
+
|
|
116
|
+
def skipped_duplicates
|
|
117
|
+
@skipped_duplicates ||= []
|
|
118
|
+
end
|
|
119
|
+
|
|
120
|
+
def failure_payload(feed_url, error_class, message)
|
|
121
|
+
{
|
|
122
|
+
feed_url: feed_url,
|
|
123
|
+
error_class: error_class,
|
|
124
|
+
error_message: message
|
|
125
|
+
}
|
|
126
|
+
end
|
|
127
|
+
|
|
128
|
+
def skipped_payload(feed_url, reason)
|
|
129
|
+
{
|
|
130
|
+
feed_url: feed_url,
|
|
131
|
+
reason: reason
|
|
132
|
+
}
|
|
133
|
+
end
|
|
134
|
+
|
|
135
|
+
def should_fetch_favicon?(source)
|
|
136
|
+
defined?(ActiveStorage) &&
|
|
137
|
+
SourceMonitor.config.favicons.enabled? &&
|
|
138
|
+
source.website_url.present?
|
|
139
|
+
rescue StandardError
|
|
140
|
+
false
|
|
141
|
+
end
|
|
142
|
+
|
|
143
|
+
def broadcast_completion(history)
|
|
144
|
+
return unless defined?(Turbo::StreamsChannel)
|
|
145
|
+
|
|
146
|
+
histories = SourceMonitor::ImportHistory.recent_for(history.user_id).limit(5)
|
|
147
|
+
|
|
148
|
+
Turbo::StreamsChannel.broadcast_replace_to(
|
|
149
|
+
SourceMonitor::Realtime::Broadcaster::SOURCE_INDEX_STREAM,
|
|
150
|
+
target: "source_monitor_import_history_panel",
|
|
151
|
+
html: SourceMonitor::SourcesController.render(
|
|
152
|
+
partial: "source_monitor/sources/import_history_panel",
|
|
153
|
+
locals: { import_histories: histories }
|
|
154
|
+
)
|
|
155
|
+
)
|
|
156
|
+
rescue StandardError => error
|
|
157
|
+
if defined?(Rails) && Rails.respond_to?(:logger) && Rails.logger
|
|
158
|
+
Rails.logger.error("[SourceMonitor::ImportSessions::OPMLImporter] broadcast failed: #{error.class}: #{error.message}")
|
|
159
|
+
end
|
|
160
|
+
end
|
|
161
|
+
end
|
|
162
|
+
end
|
|
163
|
+
end
|
|
@@ -120,7 +120,6 @@ module SourceMonitor
|
|
|
120
120
|
new_item = SourceMonitor::Item.new(source_id: source.id)
|
|
121
121
|
apply_attributes(new_item, attributes)
|
|
122
122
|
new_item.save!
|
|
123
|
-
new_item.ensure_feed_content_record
|
|
124
123
|
Result.new(item: new_item, status: :created)
|
|
125
124
|
rescue ActiveRecord::RecordNotUnique
|
|
126
125
|
handle_concurrent_duplicate(attributes, raw_guid_present:)
|
|
@@ -176,26 +175,6 @@ module SourceMonitor
|
|
|
176
175
|
def content_extractor
|
|
177
176
|
@content_extractor ||= ContentExtractor.new(source: source)
|
|
178
177
|
end
|
|
179
|
-
|
|
180
|
-
# Forwarding methods for backward compatibility with tests
|
|
181
|
-
def process_feed_content(raw_content, title:) = content_extractor.process_feed_content(raw_content, title: title)
|
|
182
|
-
def should_process_feed_content?(raw_content) = content_extractor.should_process_feed_content?(raw_content)
|
|
183
|
-
def feed_content_parser_class = content_extractor.feed_content_parser_class
|
|
184
|
-
def wrap_content_for_readability(content, title:) = content_extractor.wrap_content_for_readability(content, title: title)
|
|
185
|
-
def default_feed_readability_options = content_extractor.default_feed_readability_options
|
|
186
|
-
def build_feed_content_metadata(result:, raw_content:, processed_content:)
|
|
187
|
-
content_extractor.build_feed_content_metadata(result: result, raw_content: raw_content, processed_content: processed_content)
|
|
188
|
-
end
|
|
189
|
-
def html_fragment?(value) = content_extractor.html_fragment?(value)
|
|
190
|
-
def deep_copy(value) = content_extractor.deep_copy(value)
|
|
191
|
-
def string_or_nil(value) = entry_parser.string_or_nil(value)
|
|
192
|
-
def sanitize_string_array(values) = entry_parser.sanitize_string_array(values)
|
|
193
|
-
def split_keywords(value) = entry_parser.split_keywords(value)
|
|
194
|
-
def safe_integer(value) = entry_parser.safe_integer(value)
|
|
195
|
-
def json_entry? = entry_parser.json_entry?
|
|
196
|
-
def atom_entry? = entry_parser.atom_entry?
|
|
197
|
-
def normalize_metadata(value) = entry_parser.normalize_metadata(value)
|
|
198
|
-
def generate_fingerprint(title, url, content) = entry_parser.generate_fingerprint(title, url, content)
|
|
199
178
|
end
|
|
200
179
|
end
|
|
201
180
|
end
|
|
@@ -9,6 +9,7 @@ module SourceMonitor
|
|
|
9
9
|
:per_page,
|
|
10
10
|
:has_next_page,
|
|
11
11
|
:has_previous_page,
|
|
12
|
+
:total_count,
|
|
12
13
|
:filter_set,
|
|
13
14
|
keyword_init: true
|
|
14
15
|
) do
|
|
@@ -19,6 +20,24 @@ module SourceMonitor
|
|
|
19
20
|
def has_previous_page?
|
|
20
21
|
!!self[:has_previous_page]
|
|
21
22
|
end
|
|
23
|
+
|
|
24
|
+
def next_page
|
|
25
|
+
return nil unless has_next_page?
|
|
26
|
+
|
|
27
|
+
page + 1
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
def previous_page
|
|
31
|
+
return nil unless has_previous_page?
|
|
32
|
+
|
|
33
|
+
[ page - 1, 1 ].max
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
def total_pages
|
|
37
|
+
return 1 if total_count.nil? || total_count <= 0
|
|
38
|
+
|
|
39
|
+
[ 1, (total_count.to_f / per_page).ceil ].max
|
|
40
|
+
end
|
|
22
41
|
end
|
|
23
42
|
|
|
24
43
|
def initialize(params:)
|
|
@@ -38,6 +57,7 @@ module SourceMonitor
|
|
|
38
57
|
per_page: pagination_result.per_page,
|
|
39
58
|
has_next_page: pagination_result.has_next_page?,
|
|
40
59
|
has_previous_page: pagination_result.has_previous_page?,
|
|
60
|
+
total_count: pagination_result.total_count,
|
|
41
61
|
filter_set:
|
|
42
62
|
)
|
|
43
63
|
end
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SourceMonitor
|
|
4
|
+
module Queries
|
|
5
|
+
class ScrapeCandidatesQuery
|
|
6
|
+
def initialize(threshold: SourceMonitor.config.scraping.scrape_recommendation_threshold)
|
|
7
|
+
@threshold = threshold.to_i
|
|
8
|
+
end
|
|
9
|
+
|
|
10
|
+
def call
|
|
11
|
+
return SourceMonitor::Source.none if @threshold <= 0
|
|
12
|
+
|
|
13
|
+
SourceMonitor::Source.active
|
|
14
|
+
.where(scraping_enabled: false)
|
|
15
|
+
.where(id: source_ids_below_threshold)
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
private
|
|
19
|
+
|
|
20
|
+
def source_ids_below_threshold
|
|
21
|
+
SourceMonitor::Item
|
|
22
|
+
.joins(:item_content)
|
|
23
|
+
.where.not(SourceMonitor::ItemContent.table_name => { feed_word_count: nil })
|
|
24
|
+
.group(:source_id)
|
|
25
|
+
.having("AVG(#{SourceMonitor::ItemContent.table_name}.feed_word_count) < ?", @threshold)
|
|
26
|
+
.select(:source_id)
|
|
27
|
+
end
|
|
28
|
+
end
|
|
29
|
+
end
|
|
30
|
+
end
|
|
@@ -37,6 +37,11 @@ module SourceMonitor
|
|
|
37
37
|
|
|
38
38
|
source_ids.size
|
|
39
39
|
end
|
|
40
|
+
rescue StandardError => error
|
|
41
|
+
Rails.logger.warn(
|
|
42
|
+
"[SourceMonitor::Scheduler] Scheduler run failed: #{error.class} - #{error.message}"
|
|
43
|
+
) if defined?(Rails) && Rails.respond_to?(:logger) && Rails.logger
|
|
44
|
+
0
|
|
40
45
|
end
|
|
41
46
|
|
|
42
47
|
private
|
|
@@ -5,11 +5,10 @@ module SourceMonitor
|
|
|
5
5
|
# Presenter for building flash messages from BulkSourceScraper results
|
|
6
6
|
# Extracts complex message formatting logic from the controller
|
|
7
7
|
class BulkResultPresenter
|
|
8
|
-
attr_reader :result
|
|
8
|
+
attr_reader :result
|
|
9
9
|
|
|
10
|
-
def initialize(result
|
|
10
|
+
def initialize(result:)
|
|
11
11
|
@result = result
|
|
12
|
-
@pluralizer = pluralizer
|
|
13
12
|
end
|
|
14
13
|
|
|
15
14
|
def to_flash_payload
|
|
@@ -25,14 +24,18 @@ module SourceMonitor
|
|
|
25
24
|
|
|
26
25
|
private
|
|
27
26
|
|
|
27
|
+
def pluralize(count, word)
|
|
28
|
+
"#{count} #{count == 1 ? word : word.pluralize}"
|
|
29
|
+
end
|
|
30
|
+
|
|
28
31
|
def build_success_payload
|
|
29
32
|
label = BulkSourceScraper.selection_label(result.selection)
|
|
30
|
-
pluralized_enqueued =
|
|
33
|
+
pluralized_enqueued = pluralize(result.enqueued_count, "item")
|
|
31
34
|
|
|
32
35
|
message = "Queued scraping for #{pluralized_enqueued} from the #{label}."
|
|
33
36
|
|
|
34
37
|
if result.already_enqueued_count.positive?
|
|
35
|
-
pluralized_already =
|
|
38
|
+
pluralized_already = pluralize(result.already_enqueued_count, "item")
|
|
36
39
|
message = "#{message} #{pluralized_already.capitalize} already in progress."
|
|
37
40
|
end
|
|
38
41
|
|
|
@@ -44,12 +47,12 @@ module SourceMonitor
|
|
|
44
47
|
parts = []
|
|
45
48
|
|
|
46
49
|
if result.enqueued_count.positive?
|
|
47
|
-
pluralized_enqueued =
|
|
50
|
+
pluralized_enqueued = pluralize(result.enqueued_count, "item")
|
|
48
51
|
parts << "Queued #{pluralized_enqueued} from the #{label}"
|
|
49
52
|
end
|
|
50
53
|
|
|
51
54
|
if result.already_enqueued_count.positive?
|
|
52
|
-
pluralized_already =
|
|
55
|
+
pluralized_already = pluralize(result.already_enqueued_count, "item")
|
|
53
56
|
parts << "#{pluralized_already.capitalize} already in progress"
|
|
54
57
|
end
|
|
55
58
|
|
|
@@ -62,7 +65,7 @@ module SourceMonitor
|
|
|
62
65
|
if other_failures.values.sum.positive?
|
|
63
66
|
skipped = other_failures.map do |status, count|
|
|
64
67
|
label_key = status.to_s.tr("_", " ")
|
|
65
|
-
"#{
|
|
68
|
+
"#{pluralize(count, label_key)}"
|
|
66
69
|
end.join(", ")
|
|
67
70
|
parts << "Skipped #{skipped}"
|
|
68
71
|
end
|