source_monitor 0.11.1 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.claude/commands/rails-audit.md +77 -0
- data/CHANGELOG.md +50 -0
- data/CLAUDE.md +2 -2
- data/Gemfile.lock +7 -20
- data/RAILS_AUDIT.md +424 -0
- data/VERSION +1 -1
- data/app/assets/builds/source_monitor/application.css +4 -24
- data/app/assets/builds/source_monitor/application.js +57 -89
- data/app/assets/builds/source_monitor/application.js.map +4 -4
- data/app/assets/javascripts/source_monitor/application.js +3 -6
- data/app/assets/javascripts/source_monitor/controllers/dropdown_controller.js +6 -86
- data/app/assets/javascripts/source_monitor/controllers/filter_submit_controller.js +13 -0
- data/app/assets/javascripts/source_monitor/controllers/modal_controller.js +56 -0
- data/app/assets/javascripts/source_monitor/controllers/notification_controller.js +3 -13
- data/app/components/source_monitor/application_component.rb +10 -0
- data/app/components/source_monitor/filter_dropdown_component.rb +62 -0
- data/app/components/source_monitor/icon_component.rb +140 -0
- data/app/components/source_monitor/status_badge_component.html.erb +8 -0
- data/app/components/source_monitor/status_badge_component.rb +96 -0
- data/app/controllers/concerns/source_monitor/sanitizes_search_params.rb +4 -0
- data/app/controllers/concerns/source_monitor/set_source.rb +13 -0
- data/app/controllers/source_monitor/application_controller.rb +17 -0
- data/app/controllers/source_monitor/bulk_scrape_enablements_controller.rb +6 -10
- data/app/controllers/source_monitor/dashboard_controller.rb +5 -1
- data/app/controllers/source_monitor/import_history_dismissals_controller.rb +1 -1
- data/app/controllers/source_monitor/import_sessions_controller.rb +30 -9
- data/app/controllers/source_monitor/item_scrapes_controller.rb +70 -0
- data/app/controllers/source_monitor/items_controller.rb +2 -69
- data/app/controllers/source_monitor/source_bulk_scrapes_controller.rb +1 -4
- data/app/controllers/source_monitor/source_favicon_fetches_controller.rb +2 -12
- data/app/controllers/source_monitor/source_fetches_controller.rb +1 -6
- data/app/controllers/source_monitor/source_health_checks_controller.rb +9 -16
- data/app/controllers/source_monitor/source_health_resets_controller.rb +1 -6
- data/app/controllers/source_monitor/source_retries_controller.rb +1 -6
- data/app/controllers/source_monitor/source_scrape_tests_controller.rb +2 -4
- data/app/controllers/source_monitor/source_turbo_responses.rb +1 -3
- data/app/controllers/source_monitor/sources_controller.rb +15 -20
- data/app/helpers/source_monitor/application_helper.rb +15 -31
- data/app/helpers/source_monitor/health_badge_helper.rb +8 -0
- data/app/jobs/source_monitor/download_content_images_job.rb +1 -59
- data/app/jobs/source_monitor/favicon_fetch_job.rb +1 -58
- data/app/jobs/source_monitor/fetch_feed_job.rb +2 -52
- data/app/jobs/source_monitor/import_opml_job.rb +6 -145
- data/app/jobs/source_monitor/import_session_health_check_job.rb +15 -76
- data/app/jobs/source_monitor/item_cleanup_job.rb +5 -0
- data/app/jobs/source_monitor/log_cleanup_job.rb +13 -2
- data/app/jobs/source_monitor/schedule_fetches_job.rb +8 -0
- data/app/jobs/source_monitor/scrape_item_job.rb +6 -52
- data/app/jobs/source_monitor/source_health_check_job.rb +1 -72
- data/app/models/concerns/source_monitor/loggable.rb +12 -0
- data/app/models/source_monitor/fetch_log.rb +0 -8
- data/app/models/source_monitor/health_check_log.rb +0 -8
- data/app/models/source_monitor/import_history.rb +14 -0
- data/app/models/source_monitor/import_session.rb +2 -0
- data/app/models/source_monitor/item.rb +15 -0
- data/app/models/source_monitor/item_content.rb +4 -3
- data/app/models/source_monitor/scrape_log.rb +4 -6
- data/app/models/source_monitor/source.rb +28 -19
- data/app/presenters/source_monitor/base_presenter.rb +19 -0
- data/app/presenters/source_monitor/source_details_presenter.rb +61 -0
- data/app/presenters/source_monitor/sources_filter_presenter.rb +61 -0
- data/app/views/source_monitor/dashboard/_recent_activity.html.erb +3 -3
- data/app/views/source_monitor/dashboard/_stat_card.html.erb +2 -1
- data/app/views/source_monitor/dashboard/_stats.html.erb +5 -7
- data/app/views/source_monitor/items/_details.html.erb +11 -14
- data/app/views/source_monitor/items/index.html.erb +10 -35
- data/app/views/source_monitor/logs/index.html.erb +20 -41
- data/app/views/source_monitor/shared/_form_errors.html.erb +14 -0
- data/app/views/source_monitor/source_scrape_tests/_result.html.erb +1 -29
- data/app/views/source_monitor/source_scrape_tests/_result_content.html.erb +33 -0
- data/app/views/source_monitor/source_scrape_tests/show.html.erb +1 -29
- data/app/views/source_monitor/sources/_bulk_scrape_enable_modal.html.erb +2 -2
- data/app/views/source_monitor/sources/_bulk_scrape_modal.html.erb +7 -5
- data/app/views/source_monitor/sources/_details.html.erb +24 -52
- data/app/views/source_monitor/sources/_health_status_badge.html.erb +4 -6
- data/app/views/source_monitor/sources/_row.html.erb +7 -18
- data/app/views/source_monitor/sources/edit.html.erb +1 -10
- data/app/views/source_monitor/sources/index.html.erb +26 -46
- data/app/views/source_monitor/sources/new.html.erb +1 -10
- data/config/routes.rb +1 -1
- data/db/migrate/20260313120000_add_composite_indexes_to_log_tables.rb +14 -0
- data/db/migrate/20260314120000_align_health_status_default.rb +11 -0
- data/lib/source_monitor/analytics/sources_index_metrics.rb +15 -0
- data/lib/source_monitor/dashboard/queries/recent_activity_query.rb +10 -4
- data/lib/source_monitor/dashboard/turbo_broadcaster.rb +21 -5
- data/lib/source_monitor/favicons/fetcher.rb +86 -0
- data/lib/source_monitor/fetching/cloudflare_bypass.rb +14 -5
- data/lib/source_monitor/fetching/completion/event_publisher.rb +12 -0
- data/lib/source_monitor/fetching/completion/follow_up_handler.rb +15 -2
- data/lib/source_monitor/fetching/completion/retention_handler.rb +11 -3
- data/lib/source_monitor/fetching/feed_fetcher.rb +2 -21
- data/lib/source_monitor/fetching/fetch_runner.rb +12 -3
- data/lib/source_monitor/fetching/retry_orchestrator.rb +102 -0
- data/lib/source_monitor/fetching/stalled_fetch_reconciler.rb +9 -0
- data/lib/source_monitor/health/source_health_check_orchestrator.rb +95 -0
- data/lib/source_monitor/health.rb +1 -0
- data/lib/source_monitor/images/downloader.rb +6 -7
- data/lib/source_monitor/images/processor.rb +98 -0
- data/lib/source_monitor/import_sessions/health_check_updater.rb +95 -0
- data/lib/source_monitor/import_sessions/opml_importer.rb +163 -0
- data/lib/source_monitor/items/item_creator.rb +0 -21
- data/lib/source_monitor/logs/query.rb +20 -0
- data/lib/source_monitor/queries/scrape_candidates_query.rb +30 -0
- data/lib/source_monitor/queries.rb +7 -0
- data/lib/source_monitor/scheduler.rb +5 -0
- data/lib/source_monitor/scraping/bulk_result_presenter.rb +11 -8
- data/lib/source_monitor/scraping/runner.rb +52 -0
- data/lib/source_monitor/scraping/scheduler.rb +5 -0
- data/lib/source_monitor/scraping/state.rb +4 -2
- data/lib/source_monitor/security/parameter_sanitizer.rb +7 -0
- data/lib/source_monitor/version.rb +1 -1
- data/lib/source_monitor.rb +7 -0
- data/source_monitor.gemspec +1 -0
- metadata +47 -1
|
@@ -1,16 +1,7 @@
|
|
|
1
1
|
<div class="mx-auto max-w-2xl py-10">
|
|
2
2
|
<h1 class="text-3xl font-semibold">New Source</h1>
|
|
3
3
|
|
|
4
|
-
|
|
5
|
-
<div class="mt-4 rounded border border-red-300 bg-red-50 p-4">
|
|
6
|
-
<h2 class="font-medium text-red-700">Please fix the following:</h2>
|
|
7
|
-
<ul class="mt-2 list-disc space-y-1 pl-5 text-red-700">
|
|
8
|
-
<% @source.errors.full_messages.each do |message| %>
|
|
9
|
-
<li><%= message %></li>
|
|
10
|
-
<% end %>
|
|
11
|
-
</ul>
|
|
12
|
-
</div>
|
|
13
|
-
<% end %>
|
|
4
|
+
<%= render "source_monitor/shared/form_errors", record: @source %>
|
|
14
5
|
|
|
15
6
|
<div class="mt-6">
|
|
16
7
|
<%= render "form", source: @source %>
|
data/config/routes.rb
CHANGED
|
@@ -17,7 +17,7 @@ SourceMonitor::Engine.routes.draw do
|
|
|
17
17
|
end
|
|
18
18
|
end
|
|
19
19
|
resources :items, only: %i[index show] do
|
|
20
|
-
|
|
20
|
+
resource :scrape, only: :create, controller: "item_scrapes"
|
|
21
21
|
end
|
|
22
22
|
resources :bulk_scrape_enablements, only: :create
|
|
23
23
|
resources :sources do
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
class AddCompositeIndexesToLogTables < ActiveRecord::Migration[8.0]
|
|
4
|
+
def change
|
|
5
|
+
add_index :sourcemon_fetch_logs, [ :source_id, :started_at ],
|
|
6
|
+
name: "index_fetch_logs_on_source_id_and_started_at"
|
|
7
|
+
add_index :sourcemon_scrape_logs, [ :source_id, :started_at ],
|
|
8
|
+
name: "index_scrape_logs_on_source_id_and_started_at"
|
|
9
|
+
add_index :sourcemon_scrape_logs, [ :item_id, :started_at ],
|
|
10
|
+
name: "index_scrape_logs_on_item_id_and_started_at"
|
|
11
|
+
add_index :sourcemon_health_check_logs, [ :source_id, :started_at ],
|
|
12
|
+
name: "index_health_check_logs_on_source_id_and_started_at"
|
|
13
|
+
end
|
|
14
|
+
end
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
class AlignHealthStatusDefault < ActiveRecord::Migration[8.1]
|
|
4
|
+
def up
|
|
5
|
+
change_column_default :sourcemon_sources, :health_status, from: "healthy", to: "working"
|
|
6
|
+
end
|
|
7
|
+
|
|
8
|
+
def down
|
|
9
|
+
change_column_default :sourcemon_sources, :health_status, from: "working", to: "healthy"
|
|
10
|
+
end
|
|
11
|
+
end
|
|
@@ -43,6 +43,21 @@ module SourceMonitor
|
|
|
43
43
|
@item_activity_rates ||= SourceActivityRates.new(scope: result_scope, lookback:, now:).per_source_rates
|
|
44
44
|
end
|
|
45
45
|
|
|
46
|
+
def word_count_averages(source_ids)
|
|
47
|
+
if source_ids.any?
|
|
48
|
+
base = SourceMonitor::ItemContent.joins(:item).where(sourcemon_items: { source_id: source_ids })
|
|
49
|
+
feed = base.where.not(feed_word_count: nil)
|
|
50
|
+
.group("sourcemon_items.source_id")
|
|
51
|
+
.average(:feed_word_count)
|
|
52
|
+
scraped = base.where.not(scraped_word_count: nil)
|
|
53
|
+
.group("sourcemon_items.source_id")
|
|
54
|
+
.average(:scraped_word_count)
|
|
55
|
+
{ feed:, scraped: }
|
|
56
|
+
else
|
|
57
|
+
{ feed: {}, scraped: {} }
|
|
58
|
+
end
|
|
59
|
+
end
|
|
60
|
+
|
|
46
61
|
def fetch_interval_filter
|
|
47
62
|
min = integer_param(search_params["fetch_interval_minutes_gteq"])
|
|
48
63
|
max = integer_param(search_params["fetch_interval_minutes_lt"]) || integer_param(search_params["fetch_interval_minutes_lteq"])
|
|
@@ -43,7 +43,7 @@ module SourceMonitor
|
|
|
43
43
|
end
|
|
44
44
|
|
|
45
45
|
def sanitized_sql
|
|
46
|
-
ActiveRecord::Base.send(:sanitize_sql_array, [ unified_sql_template, limit ])
|
|
46
|
+
ActiveRecord::Base.send(:sanitize_sql_array, [ unified_sql_template, limit, limit, limit, limit ])
|
|
47
47
|
end
|
|
48
48
|
|
|
49
49
|
def unified_sql_template
|
|
@@ -61,11 +61,11 @@ module SourceMonitor
|
|
|
61
61
|
source_id,
|
|
62
62
|
source_feed_url
|
|
63
63
|
FROM (
|
|
64
|
-
#{fetch_log_sql}
|
|
64
|
+
(#{fetch_log_sql})
|
|
65
65
|
UNION ALL
|
|
66
|
-
#{scrape_log_sql}
|
|
66
|
+
(#{scrape_log_sql})
|
|
67
67
|
UNION ALL
|
|
68
|
-
#{item_sql}
|
|
68
|
+
(#{item_sql})
|
|
69
69
|
) AS dashboard_events
|
|
70
70
|
WHERE occurred_at IS NOT NULL
|
|
71
71
|
ORDER BY occurred_at DESC
|
|
@@ -91,6 +91,8 @@ module SourceMonitor
|
|
|
91
91
|
FROM #{SourceMonitor::FetchLog.quoted_table_name}
|
|
92
92
|
LEFT JOIN #{SourceMonitor::Source.quoted_table_name}
|
|
93
93
|
ON #{SourceMonitor::Source.quoted_table_name}.id = #{SourceMonitor::FetchLog.quoted_table_name}.source_id
|
|
94
|
+
ORDER BY #{SourceMonitor::FetchLog.quoted_table_name}.started_at DESC NULLS LAST
|
|
95
|
+
LIMIT ?
|
|
94
96
|
SQL
|
|
95
97
|
end
|
|
96
98
|
|
|
@@ -114,6 +116,8 @@ module SourceMonitor
|
|
|
114
116
|
ON #{SourceMonitor::Source.quoted_table_name}.id = #{SourceMonitor::ScrapeLog.quoted_table_name}.source_id
|
|
115
117
|
LEFT JOIN #{SourceMonitor::Item.quoted_table_name}
|
|
116
118
|
ON #{SourceMonitor::Item.quoted_table_name}.id = #{SourceMonitor::ScrapeLog.quoted_table_name}.item_id
|
|
119
|
+
ORDER BY #{SourceMonitor::ScrapeLog.quoted_table_name}.started_at DESC NULLS LAST
|
|
120
|
+
LIMIT ?
|
|
117
121
|
SQL
|
|
118
122
|
end
|
|
119
123
|
|
|
@@ -135,6 +139,8 @@ module SourceMonitor
|
|
|
135
139
|
FROM #{SourceMonitor::Item.quoted_table_name}
|
|
136
140
|
LEFT JOIN #{SourceMonitor::Source.quoted_table_name}
|
|
137
141
|
ON #{SourceMonitor::Source.quoted_table_name}.id = #{SourceMonitor::Item.quoted_table_name}.source_id
|
|
142
|
+
ORDER BY #{SourceMonitor::Item.quoted_table_name}.created_at DESC NULLS LAST
|
|
143
|
+
LIMIT ?
|
|
138
144
|
SQL
|
|
139
145
|
end
|
|
140
146
|
|
|
@@ -22,17 +22,33 @@ module SourceMonitor
|
|
|
22
22
|
@item_callback ||= lambda { |_event| broadcast_dashboard_updates }
|
|
23
23
|
end
|
|
24
24
|
|
|
25
|
+
STAT_CARDS = [
|
|
26
|
+
{ key: "total_sources", label: "Sources", stat: :total_sources, caption: "Total registered" },
|
|
27
|
+
{ key: "active_sources", label: "Active", stat: :active_sources, caption: "Fetching on schedule" },
|
|
28
|
+
{ key: "failed_sources", label: "Failures", stat: :failed_sources, caption: "Require attention" },
|
|
29
|
+
{ key: "total_items", label: "Items", stat: :total_items, caption: "Stored entries" },
|
|
30
|
+
{ key: "fetches_today", label: "Fetches Today", stat: :fetches_today, caption: "Completed runs" }
|
|
31
|
+
].freeze
|
|
32
|
+
|
|
25
33
|
def broadcast_dashboard_updates
|
|
26
34
|
return unless turbo_streams_available?
|
|
27
35
|
|
|
28
36
|
queries = SourceMonitor::Dashboard::Queries.new
|
|
29
37
|
url_helpers = SourceMonitor::Engine.routes.url_helpers
|
|
38
|
+
stats = queries.stats
|
|
30
39
|
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
40
|
+
STAT_CARDS.each do |card|
|
|
41
|
+
Turbo::StreamsChannel.broadcast_replace_to(
|
|
42
|
+
STREAM_NAME,
|
|
43
|
+
target: "source_monitor_stat_#{card[:key]}",
|
|
44
|
+
html: render_partial("source_monitor/dashboard/stat_card", stat_card: {
|
|
45
|
+
key: card[:key],
|
|
46
|
+
label: card[:label],
|
|
47
|
+
value: stats[card[:stat]],
|
|
48
|
+
caption: card[:caption]
|
|
49
|
+
})
|
|
50
|
+
)
|
|
51
|
+
end
|
|
36
52
|
|
|
37
53
|
Turbo::StreamsChannel.broadcast_replace_to(
|
|
38
54
|
STREAM_NAME,
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SourceMonitor
|
|
4
|
+
module Favicons
|
|
5
|
+
# Coordinates favicon fetching for a source: checks prerequisites
|
|
6
|
+
# (ActiveStorage, config, cooldown), delegates to Discoverer, and
|
|
7
|
+
# handles attachment or failure recording. Extracted from FaviconFetchJob.
|
|
8
|
+
class Fetcher
|
|
9
|
+
TRANSIENT_ERRORS = [
|
|
10
|
+
Timeout::Error, Errno::ETIMEDOUT,
|
|
11
|
+
Faraday::TimeoutError, Faraday::ConnectionFailed,
|
|
12
|
+
Net::OpenTimeout, Net::ReadTimeout
|
|
13
|
+
].freeze
|
|
14
|
+
|
|
15
|
+
def initialize(source)
|
|
16
|
+
@source = source
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
def call
|
|
20
|
+
return unless defined?(ActiveStorage)
|
|
21
|
+
return unless SourceMonitor.config.favicons.enabled?
|
|
22
|
+
return if source.website_url.blank?
|
|
23
|
+
return if source.favicon.attached?
|
|
24
|
+
return if within_cooldown?
|
|
25
|
+
|
|
26
|
+
result = SourceMonitor::Favicons::Discoverer.new(source.website_url).call
|
|
27
|
+
|
|
28
|
+
if result
|
|
29
|
+
attach_favicon(result)
|
|
30
|
+
else
|
|
31
|
+
record_failed_attempt
|
|
32
|
+
end
|
|
33
|
+
rescue ActiveRecord::Deadlocked
|
|
34
|
+
raise
|
|
35
|
+
rescue *TRANSIENT_ERRORS => error
|
|
36
|
+
log_error("Transient error", error)
|
|
37
|
+
raise
|
|
38
|
+
rescue StandardError => error
|
|
39
|
+
record_failed_attempt
|
|
40
|
+
log_error("Failed", error)
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
private
|
|
44
|
+
|
|
45
|
+
attr_reader :source
|
|
46
|
+
|
|
47
|
+
def within_cooldown?
|
|
48
|
+
last_attempt = source.metadata&.dig("favicon_last_attempted_at")
|
|
49
|
+
return false if last_attempt.blank?
|
|
50
|
+
|
|
51
|
+
cooldown_days = SourceMonitor.config.favicons.retry_cooldown_days
|
|
52
|
+
Time.parse(last_attempt) > cooldown_days.days.ago
|
|
53
|
+
rescue ArgumentError, TypeError
|
|
54
|
+
false
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
def attach_favicon(result)
|
|
58
|
+
blob = ActiveStorage::Blob.create_and_upload!(
|
|
59
|
+
io: result.io,
|
|
60
|
+
filename: result.filename,
|
|
61
|
+
content_type: result.content_type
|
|
62
|
+
)
|
|
63
|
+
source.favicon.attach(blob)
|
|
64
|
+
end
|
|
65
|
+
|
|
66
|
+
def record_failed_attempt
|
|
67
|
+
metadata = (source.metadata || {}).merge(
|
|
68
|
+
"favicon_last_attempted_at" => Time.current.iso8601
|
|
69
|
+
)
|
|
70
|
+
source.update_column(:metadata, metadata)
|
|
71
|
+
rescue StandardError
|
|
72
|
+
nil
|
|
73
|
+
end
|
|
74
|
+
|
|
75
|
+
def log_error(prefix, error)
|
|
76
|
+
return unless defined?(Rails) && Rails.respond_to?(:logger) && Rails.logger
|
|
77
|
+
|
|
78
|
+
Rails.logger.warn(
|
|
79
|
+
"[SourceMonitor::Favicons::Fetcher] #{prefix} for source #{source&.id}: #{error.class} - #{error.message}"
|
|
80
|
+
)
|
|
81
|
+
rescue StandardError
|
|
82
|
+
nil
|
|
83
|
+
end
|
|
84
|
+
end
|
|
85
|
+
end
|
|
86
|
+
end
|
|
@@ -12,12 +12,15 @@ module SourceMonitor
|
|
|
12
12
|
|
|
13
13
|
CLOUDFLARE_MARKERS = FeedFetcher::CLOUDFLARE_MARKERS
|
|
14
14
|
SNIFF_LIMIT = FeedFetcher::SNIFF_LIMIT
|
|
15
|
+
DEFAULT_MAX_ATTEMPTS = 2
|
|
16
|
+
BYPASS_TIMEOUT = 10
|
|
15
17
|
|
|
16
|
-
attr_reader :response, :feed_url
|
|
18
|
+
attr_reader :response, :feed_url, :max_attempts
|
|
17
19
|
|
|
18
|
-
def initialize(response:, feed_url:)
|
|
20
|
+
def initialize(response:, feed_url:, max_attempts: DEFAULT_MAX_ATTEMPTS)
|
|
19
21
|
@response = response
|
|
20
22
|
@feed_url = feed_url
|
|
23
|
+
@max_attempts = max_attempts
|
|
21
24
|
end
|
|
22
25
|
|
|
23
26
|
def call
|
|
@@ -36,7 +39,7 @@ module SourceMonitor
|
|
|
36
39
|
end
|
|
37
40
|
|
|
38
41
|
def attempt_ua_rotation
|
|
39
|
-
USER_AGENTS.each do |ua|
|
|
42
|
+
USER_AGENTS.first(max_attempts).each do |ua|
|
|
40
43
|
headers = {
|
|
41
44
|
"User-Agent" => ua,
|
|
42
45
|
"Cache-Control" => "no-cache",
|
|
@@ -50,9 +53,15 @@ module SourceMonitor
|
|
|
50
53
|
end
|
|
51
54
|
|
|
52
55
|
def fetch_with_headers(headers)
|
|
53
|
-
client = SourceMonitor::HTTP.client(
|
|
56
|
+
client = SourceMonitor::HTTP.client(
|
|
57
|
+
headers: headers,
|
|
58
|
+
timeout: BYPASS_TIMEOUT,
|
|
59
|
+
open_timeout: [ BYPASS_TIMEOUT / 2, 5 ].min,
|
|
60
|
+
retry_requests: false
|
|
61
|
+
)
|
|
54
62
|
client.get(feed_url)
|
|
55
|
-
rescue StandardError
|
|
63
|
+
rescue StandardError => e
|
|
64
|
+
Rails.logger.warn("[SourceMonitor] CloudflareBypass request failed for #{feed_url}: #{e.class}: #{e.message}") if defined?(Rails) && Rails.respond_to?(:logger) && Rails.logger
|
|
56
65
|
nil
|
|
57
66
|
end
|
|
58
67
|
|
|
@@ -5,12 +5,24 @@ module SourceMonitor
|
|
|
5
5
|
module Completion
|
|
6
6
|
# Publishes fetch completion events to the configured event dispatcher.
|
|
7
7
|
class EventPublisher
|
|
8
|
+
Result = Struct.new(:status, :error, keyword_init: true) do
|
|
9
|
+
def success?
|
|
10
|
+
status != :failed
|
|
11
|
+
end
|
|
12
|
+
end
|
|
13
|
+
|
|
8
14
|
def initialize(dispatcher: SourceMonitor::Events)
|
|
9
15
|
@dispatcher = dispatcher
|
|
10
16
|
end
|
|
11
17
|
|
|
12
18
|
def call(source:, result:)
|
|
13
19
|
dispatcher.after_fetch_completed(source: source, result: result)
|
|
20
|
+
Result.new(status: :published)
|
|
21
|
+
rescue StandardError => error
|
|
22
|
+
Rails.logger.error(
|
|
23
|
+
"[SourceMonitor::Fetching::Completion::EventPublisher] Event dispatch failed for source #{source.id}: #{error.class} - #{error.message}"
|
|
24
|
+
) if defined?(Rails) && Rails.respond_to?(:logger) && Rails.logger
|
|
25
|
+
Result.new(status: :failed, error: error)
|
|
14
26
|
end
|
|
15
27
|
|
|
16
28
|
private
|
|
@@ -5,25 +5,38 @@ module SourceMonitor
|
|
|
5
5
|
module Completion
|
|
6
6
|
# Enqueues follow-up scraping work for items created during a fetch.
|
|
7
7
|
class FollowUpHandler
|
|
8
|
+
Result = Struct.new(:status, :enqueued_count, :errors, keyword_init: true) do
|
|
9
|
+
def success?
|
|
10
|
+
status != :failed
|
|
11
|
+
end
|
|
12
|
+
end
|
|
13
|
+
|
|
8
14
|
def initialize(enqueuer_class: SourceMonitor::Scraping::Enqueuer, job_class: SourceMonitor::ScrapeItemJob)
|
|
9
15
|
@enqueuer_class = enqueuer_class
|
|
10
16
|
@job_class = job_class
|
|
11
17
|
end
|
|
12
18
|
|
|
13
19
|
def call(source:, result:)
|
|
14
|
-
return unless should_enqueue?(source:, result:)
|
|
20
|
+
return Result.new(status: :skipped, enqueued_count: 0, errors: []) unless should_enqueue?(source:, result:)
|
|
21
|
+
|
|
22
|
+
enqueued = 0
|
|
23
|
+
errors = []
|
|
15
24
|
|
|
16
25
|
Array(result.item_processing&.created_items).each do |item|
|
|
17
26
|
next unless item.present? && item.scraped_at.nil?
|
|
18
27
|
|
|
19
28
|
begin
|
|
20
29
|
enqueuer_class.enqueue(item:, source:, job_class:, reason: :auto)
|
|
30
|
+
enqueued += 1
|
|
21
31
|
rescue StandardError => error
|
|
32
|
+
errors << error
|
|
22
33
|
Rails.logger.error(
|
|
23
|
-
"[SourceMonitor]
|
|
34
|
+
"[SourceMonitor::Fetching::Completion::FollowUpHandler] Failed to enqueue scrape for item #{item.id}: #{error.class}: #{error.message}"
|
|
24
35
|
) if defined?(Rails) && Rails.respond_to?(:logger) && Rails.logger
|
|
25
36
|
end
|
|
26
37
|
end
|
|
38
|
+
|
|
39
|
+
Result.new(status: :applied, enqueued_count: enqueued, errors: errors)
|
|
27
40
|
end
|
|
28
41
|
|
|
29
42
|
private
|
|
@@ -5,20 +5,28 @@ module SourceMonitor
|
|
|
5
5
|
module Completion
|
|
6
6
|
# Applies item retention after a fetch completes.
|
|
7
7
|
class RetentionHandler
|
|
8
|
+
Result = Struct.new(:status, :removed_total, :error, keyword_init: true) do
|
|
9
|
+
def success?
|
|
10
|
+
status != :failed
|
|
11
|
+
end
|
|
12
|
+
end
|
|
13
|
+
|
|
8
14
|
def initialize(pruner: SourceMonitor::Items::RetentionPruner)
|
|
9
15
|
@pruner = pruner
|
|
10
16
|
end
|
|
11
17
|
|
|
12
18
|
def call(source:, result:) # rubocop:disable Lint/UnusedMethodArgument
|
|
13
|
-
pruner.call(
|
|
19
|
+
pruner_result = pruner.call(
|
|
14
20
|
source: source,
|
|
15
21
|
strategy: SourceMonitor.config.retention.strategy
|
|
16
22
|
)
|
|
23
|
+
removed = pruner_result.respond_to?(:removed_total) ? pruner_result.removed_total : 0
|
|
24
|
+
Result.new(status: :applied, removed_total: removed)
|
|
17
25
|
rescue StandardError => error
|
|
18
26
|
Rails.logger.error(
|
|
19
|
-
"[SourceMonitor] Retention pruning failed for source #{source.id}: #{error.class} - #{error.message}"
|
|
27
|
+
"[SourceMonitor::Fetching::Completion::RetentionHandler] Retention pruning failed for source #{source.id}: #{error.class} - #{error.message}"
|
|
20
28
|
)
|
|
21
|
-
|
|
29
|
+
Result.new(status: :failed, removed_total: 0, error: error)
|
|
22
30
|
end
|
|
23
31
|
|
|
24
32
|
private
|
|
@@ -27,13 +27,6 @@ module SourceMonitor
|
|
|
27
27
|
)
|
|
28
28
|
ResponseWrapper = Struct.new(:status, :headers, :body, keyword_init: true)
|
|
29
29
|
|
|
30
|
-
MIN_FETCH_INTERVAL = AdaptiveInterval::MIN_FETCH_INTERVAL
|
|
31
|
-
MAX_FETCH_INTERVAL = AdaptiveInterval::MAX_FETCH_INTERVAL
|
|
32
|
-
INCREASE_FACTOR = AdaptiveInterval::INCREASE_FACTOR
|
|
33
|
-
DECREASE_FACTOR = AdaptiveInterval::DECREASE_FACTOR
|
|
34
|
-
FAILURE_INCREASE_FACTOR = AdaptiveInterval::FAILURE_INCREASE_FACTOR
|
|
35
|
-
JITTER_PERCENT = AdaptiveInterval::JITTER_PERCENT
|
|
36
|
-
|
|
37
30
|
attr_reader :source, :client, :jitter_proc
|
|
38
31
|
|
|
39
32
|
def initialize(source:, client: nil, jitter: nil)
|
|
@@ -328,7 +321,8 @@ module SourceMonitor
|
|
|
328
321
|
|
|
329
322
|
response = perform_request
|
|
330
323
|
handle_response(response, started_at, instrumentation_payload)
|
|
331
|
-
rescue StandardError
|
|
324
|
+
rescue StandardError => e
|
|
325
|
+
Rails.logger.warn("[SourceMonitor] AIA recovery failed for #{source.feed_url}: #{e.class}: #{e.message}") if defined?(Rails) && Rails.respond_to?(:logger) && Rails.logger
|
|
332
326
|
nil
|
|
333
327
|
end
|
|
334
328
|
|
|
@@ -389,19 +383,6 @@ module SourceMonitor
|
|
|
389
383
|
def entry_processor
|
|
390
384
|
@entry_processor ||= EntryProcessor.new(source: source)
|
|
391
385
|
end
|
|
392
|
-
|
|
393
|
-
# Forwarding methods for backward compatibility with tests
|
|
394
|
-
def process_feed_entries(feed) = entry_processor.process_feed_entries(feed)
|
|
395
|
-
def jitter_offset(interval_seconds) = adaptive_interval.jitter_offset(interval_seconds)
|
|
396
|
-
def adjusted_interval_with_jitter(interval_seconds) = adaptive_interval.adjusted_interval_with_jitter(interval_seconds)
|
|
397
|
-
def updated_metadata(feed_signature: nil) = source_updater.updated_metadata(feed_signature: feed_signature)
|
|
398
|
-
def feed_signature_changed?(feed_signature) = source_updater.feed_signature_changed?(feed_signature)
|
|
399
|
-
def configured_seconds(minutes_value, default) = adaptive_interval.configured_seconds(minutes_value, default)
|
|
400
|
-
def configured_positive(value, default) = adaptive_interval.configured_positive(value, default)
|
|
401
|
-
def configured_non_negative(value, default) = adaptive_interval.configured_non_negative(value, default)
|
|
402
|
-
def interval_minutes_for(interval_seconds) = adaptive_interval.interval_minutes_for(interval_seconds)
|
|
403
|
-
def parse_http_time(value) = source_updater.parse_http_time(value)
|
|
404
|
-
def extract_numeric(value) = adaptive_interval.extract_numeric(value)
|
|
405
386
|
end
|
|
406
387
|
end
|
|
407
388
|
end
|
|
@@ -59,13 +59,13 @@ module SourceMonitor
|
|
|
59
59
|
lock.with_lock do
|
|
60
60
|
mark_fetching!
|
|
61
61
|
result = fetcher_class.new(source: source).call
|
|
62
|
-
retention_handler.call(source:, result:)
|
|
63
|
-
follow_up_handler.call(source:, result:)
|
|
62
|
+
log_handler_result("RetentionHandler", retention_handler.call(source:, result:))
|
|
63
|
+
log_handler_result("FollowUpHandler", follow_up_handler.call(source:, result:))
|
|
64
64
|
schedule_retry_if_needed(result)
|
|
65
65
|
mark_complete!(result)
|
|
66
66
|
end
|
|
67
67
|
|
|
68
|
-
event_publisher.call(source:, result:)
|
|
68
|
+
log_handler_result("EventPublisher", event_publisher.call(source:, result:))
|
|
69
69
|
result
|
|
70
70
|
rescue SourceMonitor::Fetching::AdvisoryLock::NotAcquiredError => error
|
|
71
71
|
raise ConcurrencyError, error.message
|
|
@@ -138,6 +138,15 @@ module SourceMonitor
|
|
|
138
138
|
self.class.send(:update_source_state!, source, attrs)
|
|
139
139
|
end
|
|
140
140
|
|
|
141
|
+
def log_handler_result(handler_name, handler_result)
|
|
142
|
+
return unless handler_result.respond_to?(:success?) && !handler_result.success?
|
|
143
|
+
|
|
144
|
+
error_detail = handler_result.respond_to?(:error) && handler_result.error ? ": #{handler_result.error.message}" : ""
|
|
145
|
+
Rails.logger.warn(
|
|
146
|
+
"[SourceMonitor::Fetching::FetchRunner] #{handler_name} failed for source #{source.id}#{error_detail}"
|
|
147
|
+
) if defined?(Rails) && Rails.respond_to?(:logger) && Rails.logger
|
|
148
|
+
end
|
|
149
|
+
|
|
141
150
|
def schedule_retry_if_needed(result)
|
|
142
151
|
decision = result&.retry_decision
|
|
143
152
|
return unless decision&.retry?
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SourceMonitor
|
|
4
|
+
module Fetching
|
|
5
|
+
# Executes retry/circuit-breaker decisions produced by RetryPolicy.
|
|
6
|
+
#
|
|
7
|
+
# Accepts a source, the original fetch error, and a RetryPolicy::Decision,
|
|
8
|
+
# then either enqueues a retry job, opens the circuit, or resets retry state.
|
|
9
|
+
#
|
|
10
|
+
# Returns a Result struct indicating which path was taken.
|
|
11
|
+
class RetryOrchestrator
|
|
12
|
+
Result = Struct.new(:status, :source, :error, :decision, keyword_init: true) do
|
|
13
|
+
def retry_enqueued?
|
|
14
|
+
status == :retry_enqueued
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
def circuit_opened?
|
|
18
|
+
status == :circuit_opened
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
def exhausted?
|
|
22
|
+
status == :exhausted
|
|
23
|
+
end
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
def self.call(source:, error:, decision:, job_class: SourceMonitor::FetchFeedJob, now: Time.current)
|
|
27
|
+
new(source:, error:, decision:, job_class:, now:).call
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
def initialize(source:, error:, decision:, job_class:, now:)
|
|
31
|
+
@source = source
|
|
32
|
+
@error = error
|
|
33
|
+
@decision = decision
|
|
34
|
+
@job_class = job_class
|
|
35
|
+
@now = now
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
def call
|
|
39
|
+
if decision.retry?
|
|
40
|
+
enqueue_retry!
|
|
41
|
+
elsif decision.open_circuit?
|
|
42
|
+
open_circuit!
|
|
43
|
+
else
|
|
44
|
+
reset_retry_state!
|
|
45
|
+
end
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
private
|
|
49
|
+
|
|
50
|
+
attr_reader :source, :error, :decision, :job_class, :now
|
|
51
|
+
|
|
52
|
+
def enqueue_retry!
|
|
53
|
+
retry_at = now + (decision.wait || 0)
|
|
54
|
+
|
|
55
|
+
source.with_lock do
|
|
56
|
+
source.reload
|
|
57
|
+
source.update!(
|
|
58
|
+
fetch_retry_attempt: decision.next_attempt,
|
|
59
|
+
fetch_circuit_opened_at: nil,
|
|
60
|
+
fetch_circuit_until: nil,
|
|
61
|
+
next_fetch_at: retry_at,
|
|
62
|
+
backoff_until: retry_at,
|
|
63
|
+
fetch_status: "queued"
|
|
64
|
+
)
|
|
65
|
+
end
|
|
66
|
+
|
|
67
|
+
job_class.set(wait: decision.wait || 0).perform_later(source.id)
|
|
68
|
+
|
|
69
|
+
Result.new(status: :retry_enqueued, source: source, error: error, decision: decision)
|
|
70
|
+
end
|
|
71
|
+
|
|
72
|
+
def open_circuit!
|
|
73
|
+
source.with_lock do
|
|
74
|
+
source.reload
|
|
75
|
+
source.update!(
|
|
76
|
+
fetch_retry_attempt: 0,
|
|
77
|
+
fetch_circuit_opened_at: now,
|
|
78
|
+
fetch_circuit_until: decision.circuit_until,
|
|
79
|
+
next_fetch_at: decision.circuit_until,
|
|
80
|
+
backoff_until: decision.circuit_until,
|
|
81
|
+
fetch_status: "failed"
|
|
82
|
+
)
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
Result.new(status: :circuit_opened, source: source, error: error, decision: decision)
|
|
86
|
+
end
|
|
87
|
+
|
|
88
|
+
def reset_retry_state!
|
|
89
|
+
source.with_lock do
|
|
90
|
+
source.reload
|
|
91
|
+
source.update!(
|
|
92
|
+
fetch_retry_attempt: 0,
|
|
93
|
+
fetch_circuit_opened_at: nil,
|
|
94
|
+
fetch_circuit_until: nil
|
|
95
|
+
)
|
|
96
|
+
end
|
|
97
|
+
|
|
98
|
+
Result.new(status: :exhausted, source: source, error: error, decision: decision)
|
|
99
|
+
end
|
|
100
|
+
end
|
|
101
|
+
end
|
|
102
|
+
end
|
|
@@ -102,6 +102,15 @@ module SourceMonitor
|
|
|
102
102
|
return ::SolidQueue::Job.none unless jobs_supported?
|
|
103
103
|
|
|
104
104
|
queue_name = SourceMonitor.queue_name(:fetch)
|
|
105
|
+
# SolidQueue stores job arguments as JSON in the `arguments` text column.
|
|
106
|
+
# The format is: {"job_class":"...", "arguments":[source_id, ...], ...}
|
|
107
|
+
# We cast to jsonb and extract the first positional argument to match
|
|
108
|
+
# jobs targeting this source.
|
|
109
|
+
#
|
|
110
|
+
# Tested against: SolidQueue 1.1.x (Rails 8.x). The serialization format
|
|
111
|
+
# is part of ActiveJob's serialize/deserialize contract. If SolidQueue
|
|
112
|
+
# changes its storage format, this query will silently return no matches
|
|
113
|
+
# (safe failure). Re-verify on SolidQueue major version upgrades.
|
|
105
114
|
::SolidQueue::Job.
|
|
106
115
|
where(queue_name: queue_name).
|
|
107
116
|
where("arguments::jsonb -> 'arguments' ->> 0 = ?", source.id.to_s)
|