source_monitor 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +16 -0
- data/.rubocop.yml +12 -0
- data/.ruby-version +1 -0
- data/AGENTS.md +132 -0
- data/CHANGELOG.md +66 -0
- data/CONTRIBUTING.md +31 -0
- data/Gemfile +30 -0
- data/Gemfile.lock +411 -0
- data/MIT-LICENSE +20 -0
- data/README.md +108 -0
- data/Rakefile +8 -0
- data/app/assets/builds/.keep +0 -0
- data/app/assets/config/source_monitor_manifest.js +4 -0
- data/app/assets/images/source_monitor/.keep +0 -0
- data/app/assets/javascripts/source_monitor/application.js +20 -0
- data/app/assets/javascripts/source_monitor/controllers/async_submit_controller.js +36 -0
- data/app/assets/javascripts/source_monitor/controllers/dropdown_controller.js +109 -0
- data/app/assets/javascripts/source_monitor/controllers/modal_controller.js +56 -0
- data/app/assets/javascripts/source_monitor/controllers/notification_controller.js +53 -0
- data/app/assets/javascripts/source_monitor/turbo_actions.js +13 -0
- data/app/assets/stylesheets/source_monitor/application.tailwind.css +13 -0
- data/app/assets/svgs/source_monitor/.keep +0 -0
- data/app/controllers/concerns/.keep +0 -0
- data/app/controllers/concerns/source_monitor/sanitizes_search_params.rb +81 -0
- data/app/controllers/source_monitor/application_controller.rb +62 -0
- data/app/controllers/source_monitor/dashboard_controller.rb +27 -0
- data/app/controllers/source_monitor/fetch_logs_controller.rb +9 -0
- data/app/controllers/source_monitor/health_controller.rb +10 -0
- data/app/controllers/source_monitor/items_controller.rb +116 -0
- data/app/controllers/source_monitor/logs_controller.rb +15 -0
- data/app/controllers/source_monitor/scrape_logs_controller.rb +9 -0
- data/app/controllers/source_monitor/source_bulk_scrapes_controller.rb +35 -0
- data/app/controllers/source_monitor/source_fetches_controller.rb +22 -0
- data/app/controllers/source_monitor/source_health_checks_controller.rb +34 -0
- data/app/controllers/source_monitor/source_health_resets_controller.rb +27 -0
- data/app/controllers/source_monitor/source_retries_controller.rb +22 -0
- data/app/controllers/source_monitor/source_turbo_responses.rb +115 -0
- data/app/controllers/source_monitor/sources_controller.rb +179 -0
- data/app/helpers/source_monitor/application_helper.rb +327 -0
- data/app/jobs/source_monitor/application_job.rb +13 -0
- data/app/jobs/source_monitor/fetch_feed_job.rb +117 -0
- data/app/jobs/source_monitor/item_cleanup_job.rb +48 -0
- data/app/jobs/source_monitor/log_cleanup_job.rb +47 -0
- data/app/jobs/source_monitor/schedule_fetches_job.rb +29 -0
- data/app/jobs/source_monitor/scrape_item_job.rb +47 -0
- data/app/jobs/source_monitor/source_health_check_job.rb +77 -0
- data/app/mailers/source_monitor/application_mailer.rb +17 -0
- data/app/models/concerns/.keep +0 -0
- data/app/models/concerns/source_monitor/loggable.rb +18 -0
- data/app/models/source_monitor/application_record.rb +5 -0
- data/app/models/source_monitor/fetch_log.rb +31 -0
- data/app/models/source_monitor/health_check_log.rb +28 -0
- data/app/models/source_monitor/item.rb +102 -0
- data/app/models/source_monitor/item_content.rb +11 -0
- data/app/models/source_monitor/log_entry.rb +56 -0
- data/app/models/source_monitor/scrape_log.rb +31 -0
- data/app/models/source_monitor/source.rb +115 -0
- data/app/views/layouts/source_monitor/application.html.erb +54 -0
- data/app/views/source_monitor/dashboard/_fetch_schedule.html.erb +90 -0
- data/app/views/source_monitor/dashboard/_job_metrics.html.erb +82 -0
- data/app/views/source_monitor/dashboard/_recent_activity.html.erb +39 -0
- data/app/views/source_monitor/dashboard/_stat_card.html.erb +6 -0
- data/app/views/source_monitor/dashboard/_stats.html.erb +9 -0
- data/app/views/source_monitor/dashboard/index.html.erb +48 -0
- data/app/views/source_monitor/fetch_logs/show.html.erb +90 -0
- data/app/views/source_monitor/items/_details.html.erb +234 -0
- data/app/views/source_monitor/items/_details_wrapper.html.erb +3 -0
- data/app/views/source_monitor/items/index.html.erb +147 -0
- data/app/views/source_monitor/items/show.html.erb +3 -0
- data/app/views/source_monitor/logs/index.html.erb +208 -0
- data/app/views/source_monitor/scrape_logs/show.html.erb +73 -0
- data/app/views/source_monitor/shared/_toast.html.erb +34 -0
- data/app/views/source_monitor/sources/_bulk_scrape_form.html.erb +64 -0
- data/app/views/source_monitor/sources/_bulk_scrape_modal.html.erb +53 -0
- data/app/views/source_monitor/sources/_details.html.erb +302 -0
- data/app/views/source_monitor/sources/_details_wrapper.html.erb +3 -0
- data/app/views/source_monitor/sources/_empty_state_row.html.erb +5 -0
- data/app/views/source_monitor/sources/_fetch_interval_heatmap.html.erb +46 -0
- data/app/views/source_monitor/sources/_form.html.erb +143 -0
- data/app/views/source_monitor/sources/_health_status_badge.html.erb +46 -0
- data/app/views/source_monitor/sources/_row.html.erb +102 -0
- data/app/views/source_monitor/sources/edit.html.erb +28 -0
- data/app/views/source_monitor/sources/index.html.erb +153 -0
- data/app/views/source_monitor/sources/new.html.erb +22 -0
- data/app/views/source_monitor/sources/show.html.erb +3 -0
- data/config/coverage_baseline.json +2010 -0
- data/config/initializers/feedjira.rb +19 -0
- data/config/routes.rb +18 -0
- data/config/tailwind.config.js +17 -0
- data/db/migrate/20241008120000_create_source_monitor_sources.rb +40 -0
- data/db/migrate/20241008121000_create_source_monitor_items.rb +44 -0
- data/db/migrate/20241008122000_create_source_monitor_fetch_logs.rb +32 -0
- data/db/migrate/20241008123000_create_source_monitor_scrape_logs.rb +25 -0
- data/db/migrate/20251008183000_change_fetch_interval_to_minutes.rb +23 -0
- data/db/migrate/20251009090000_create_source_monitor_item_contents.rb +38 -0
- data/db/migrate/20251009103000_add_feed_content_readability_to_sources.rb +5 -0
- data/db/migrate/20251010090000_add_adaptive_fetching_toggle_to_sources.rb +7 -0
- data/db/migrate/20251010123000_add_deleted_at_to_source_monitor_items.rb +8 -0
- data/db/migrate/20251010153000_add_type_to_source_monitor_sources.rb +8 -0
- data/db/migrate/20251010154500_add_fetch_status_to_source_monitor_sources.rb +9 -0
- data/db/migrate/20251010160000_create_solid_cable_messages.rb +16 -0
- data/db/migrate/20251011090000_add_fetch_retry_state_to_sources.rb +14 -0
- data/db/migrate/20251012090000_add_health_fields_to_sources.rb +17 -0
- data/db/migrate/20251012100000_optimize_source_monitor_database_performance.rb +13 -0
- data/db/migrate/20251014064947_add_not_null_constraints_to_items.rb +30 -0
- data/db/migrate/20251014171659_add_performance_indexes.rb +29 -0
- data/db/migrate/20251014172525_add_fetch_status_check_constraint.rb +18 -0
- data/db/migrate/20251015100000_create_source_monitor_log_entries.rb +89 -0
- data/db/migrate/20251022100000_create_source_monitor_health_check_logs.rb +22 -0
- data/db/migrate/20251108120116_refresh_fetch_status_constraint.rb +29 -0
- data/docs/configuration.md +170 -0
- data/docs/deployment.md +63 -0
- data/docs/gh-cli-workflow.md +44 -0
- data/docs/installation.md +144 -0
- data/docs/troubleshooting.md +76 -0
- data/eslint.config.mjs +27 -0
- data/lib/generators/source_monitor/install/install_generator.rb +59 -0
- data/lib/generators/source_monitor/install/templates/source_monitor.rb.tt +155 -0
- data/lib/source_monitor/analytics/source_activity_rates.rb +53 -0
- data/lib/source_monitor/analytics/source_fetch_interval_distribution.rb +57 -0
- data/lib/source_monitor/analytics/sources_index_metrics.rb +92 -0
- data/lib/source_monitor/assets/bundler.rb +49 -0
- data/lib/source_monitor/assets.rb +6 -0
- data/lib/source_monitor/configuration.rb +654 -0
- data/lib/source_monitor/dashboard/queries.rb +356 -0
- data/lib/source_monitor/dashboard/quick_action.rb +7 -0
- data/lib/source_monitor/dashboard/quick_actions_presenter.rb +26 -0
- data/lib/source_monitor/dashboard/recent_activity.rb +30 -0
- data/lib/source_monitor/dashboard/recent_activity_presenter.rb +77 -0
- data/lib/source_monitor/dashboard/turbo_broadcaster.rb +87 -0
- data/lib/source_monitor/dashboard/upcoming_fetch_schedule.rb +126 -0
- data/lib/source_monitor/engine.rb +107 -0
- data/lib/source_monitor/events.rb +110 -0
- data/lib/source_monitor/feedjira_extensions.rb +103 -0
- data/lib/source_monitor/fetching/advisory_lock.rb +54 -0
- data/lib/source_monitor/fetching/completion/event_publisher.rb +22 -0
- data/lib/source_monitor/fetching/completion/follow_up_handler.rb +37 -0
- data/lib/source_monitor/fetching/completion/retention_handler.rb +30 -0
- data/lib/source_monitor/fetching/feed_fetcher.rb +627 -0
- data/lib/source_monitor/fetching/fetch_error.rb +88 -0
- data/lib/source_monitor/fetching/fetch_runner.rb +142 -0
- data/lib/source_monitor/fetching/retry_policy.rb +85 -0
- data/lib/source_monitor/fetching/stalled_fetch_reconciler.rb +146 -0
- data/lib/source_monitor/health/source_health_check.rb +100 -0
- data/lib/source_monitor/health/source_health_monitor.rb +210 -0
- data/lib/source_monitor/health/source_health_reset.rb +68 -0
- data/lib/source_monitor/health.rb +46 -0
- data/lib/source_monitor/http.rb +85 -0
- data/lib/source_monitor/instrumentation.rb +52 -0
- data/lib/source_monitor/items/item_creator.rb +601 -0
- data/lib/source_monitor/items/retention_pruner.rb +146 -0
- data/lib/source_monitor/items/retention_strategies/destroy.rb +26 -0
- data/lib/source_monitor/items/retention_strategies/soft_delete.rb +50 -0
- data/lib/source_monitor/items/retention_strategies.rb +9 -0
- data/lib/source_monitor/jobs/cleanup_options.rb +85 -0
- data/lib/source_monitor/jobs/fetch_failure_subscriber.rb +129 -0
- data/lib/source_monitor/jobs/solid_queue_metrics.rb +199 -0
- data/lib/source_monitor/jobs/visibility.rb +133 -0
- data/lib/source_monitor/logs/entry_sync.rb +69 -0
- data/lib/source_monitor/logs/filter_set.rb +163 -0
- data/lib/source_monitor/logs/query.rb +81 -0
- data/lib/source_monitor/logs/table_presenter.rb +161 -0
- data/lib/source_monitor/metrics.rb +77 -0
- data/lib/source_monitor/model_extensions.rb +109 -0
- data/lib/source_monitor/models/sanitizable.rb +76 -0
- data/lib/source_monitor/models/url_normalizable.rb +84 -0
- data/lib/source_monitor/pagination/paginator.rb +90 -0
- data/lib/source_monitor/realtime/adapter.rb +97 -0
- data/lib/source_monitor/realtime/broadcaster.rb +237 -0
- data/lib/source_monitor/realtime.rb +17 -0
- data/lib/source_monitor/release/changelog.rb +59 -0
- data/lib/source_monitor/release/runner.rb +73 -0
- data/lib/source_monitor/scheduler.rb +82 -0
- data/lib/source_monitor/scrapers/base.rb +105 -0
- data/lib/source_monitor/scrapers/fetchers/http_fetcher.rb +97 -0
- data/lib/source_monitor/scrapers/parsers/readability_parser.rb +101 -0
- data/lib/source_monitor/scrapers/readability.rb +156 -0
- data/lib/source_monitor/scraping/bulk_result_presenter.rb +85 -0
- data/lib/source_monitor/scraping/bulk_source_scraper.rb +233 -0
- data/lib/source_monitor/scraping/enqueuer.rb +125 -0
- data/lib/source_monitor/scraping/item_scraper/adapter_resolver.rb +44 -0
- data/lib/source_monitor/scraping/item_scraper/persistence.rb +189 -0
- data/lib/source_monitor/scraping/item_scraper.rb +84 -0
- data/lib/source_monitor/scraping/scheduler.rb +43 -0
- data/lib/source_monitor/scraping/state.rb +79 -0
- data/lib/source_monitor/security/authentication.rb +85 -0
- data/lib/source_monitor/security/parameter_sanitizer.rb +42 -0
- data/lib/source_monitor/sources/turbo_stream_presenter.rb +54 -0
- data/lib/source_monitor/turbo_streams/stream_responder.rb +95 -0
- data/lib/source_monitor/version.rb +3 -0
- data/lib/source_monitor.rb +149 -0
- data/lib/tasks/recover_stalled_fetches.rake +16 -0
- data/lib/tasks/source_monitor_assets.rake +28 -0
- data/lib/tasks/source_monitor_tasks.rake +29 -0
- data/lib/tasks/test_smoke.rake +12 -0
- data/package-lock.json +3997 -0
- data/package.json +29 -0
- data/postcss.config.js +6 -0
- data/source_monitor.gemspec +46 -0
- data/stylelint.config.js +12 -0
- metadata +469 -0
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SourceMonitor
  # Active Job that runs a feed fetch for a single source through
  # SourceMonitor::Fetching::FetchRunner.
  #
  # When the runner raises a SourceMonitor::Fetching::FetchError the job asks
  # SourceMonitor::Fetching::RetryPolicy for a decision and then either
  # re-enqueues itself, opens the source's fetch circuit, or clears the retry
  # state. In the latter two cases the original error is re-raised.
  class FetchFeedJob < ApplicationJob
    # Delay between attempts when FetchRunner reports a concurrency conflict.
    FETCH_CONCURRENCY_RETRY_WAIT = 30.seconds
    # A job may run this much earlier than the source's next_fetch_at.
    EARLY_EXECUTION_LEEWAY = 30.seconds

    source_monitor_queue :fetch

    discard_on ActiveJob::DeserializationError
    retry_on SourceMonitor::Fetching::FetchRunner::ConcurrencyError,
             wait: FETCH_CONCURRENCY_RETRY_WAIT,
             attempts: 5

    # @param source_id [Integer] id of the SourceMonitor::Source to fetch
    # @param force [Boolean] when true, skip the scheduling checks in #should_run?
    # @return [Object, nil] the FetchRunner result, or nil when the source is
    #   missing or not yet due
    # @raise [SourceMonitor::Fetching::FetchError] when the fetch fails and the
    #   failure is not converted into a scheduled retry
    def perform(source_id, force: false)
      source = SourceMonitor::Source.find_by(id: source_id)
      return unless source

      return unless should_run?(source, force: force)

      SourceMonitor::Fetching::FetchRunner.new(source: source, force: force).run
    rescue SourceMonitor::Fetching::FetchError => error
      handle_transient_error(source, error)
    end

    private

    # Decides whether the fetch should run now. It runs when forced, when the
    # source is already marked "queued" or "fetching", when no next_fetch_at is
    # set, or when next_fetch_at falls within EARLY_EXECUTION_LEEWAY of now.
    def should_run?(source, force:)
      return true if force

      status = source.fetch_status.to_s
      return true if %w[queued fetching].include?(status)

      next_fetch_at = source.next_fetch_at
      return true if next_fetch_at.blank?

      next_fetch_at <= Time.current + EARLY_EXECUTION_LEEWAY
    end

    # Applies the retry policy to a failed fetch.
    #
    # Re-raises +error+ unless the policy asks for a retry. If applying the
    # policy itself fails, that secondary failure is logged and the original
    # +error+ is still the one raised to the caller.
    def handle_transient_error(source, error)
      raise error unless transient_error?(error) && source

      decision = SourceMonitor::Fetching::RetryPolicy.new(source:, error:, now: Time.current).decision
      raise error unless decision

      if decision.retry?
        enqueue_retry!(source, decision)
      elsif decision.open_circuit?
        open_circuit!(source, decision)
        raise error
      else
        reset_retry_state!(source)
        raise error
      end
    rescue StandardError => policy_error
      # The branches above re-raise the original error on purpose. Only a
      # different exception means the retry bookkeeping itself failed, so skip
      # the "failed to schedule retry" log for the deliberate re-raise.
      raise if policy_error.equal?(error)

      log_retry_failure(source, error, policy_error)
      raise error
    end

    # Persists the next retry attempt on the source under a row lock, marks the
    # source as "queued", and re-enqueues this job after the policy's wait.
    def enqueue_retry!(source, decision)
      retry_at = Time.current + (decision.wait || 0)

      source.with_lock do
        source.reload
        source.update!(
          fetch_retry_attempt: decision.next_attempt,
          fetch_circuit_opened_at: nil,
          fetch_circuit_until: nil,
          next_fetch_at: retry_at,
          backoff_until: retry_at,
          fetch_status: "queued"
        )
      end

      retry_job wait: decision.wait || 0
    end

    # Marks the source as "failed", resets the attempt counter and blocks
    # further fetches until decision.circuit_until.
    def open_circuit!(source, decision)
      source.with_lock do
        source.reload
        source.update!(
          fetch_retry_attempt: 0,
          fetch_circuit_opened_at: Time.current,
          fetch_circuit_until: decision.circuit_until,
          next_fetch_at: decision.circuit_until,
          backoff_until: decision.circuit_until,
          fetch_status: "failed"
        )
      end
    end

    # Clears the retry counter and any circuit timestamps on the source.
    def reset_retry_state!(source)
      source.with_lock do
        source.reload
        source.update!(
          fetch_retry_attempt: 0,
          fetch_circuit_opened_at: nil,
          fetch_circuit_until: nil
        )
      end
    end

    # Every FetchError is currently treated as transient.
    def transient_error?(error)
      error.is_a?(SourceMonitor::Fetching::FetchError)
    end

    # Best-effort logging of a failure inside the retry bookkeeping; never raises.
    def log_retry_failure(source, original_error, policy_error)
      return unless defined?(Rails) && Rails.respond_to?(:logger) && Rails.logger

      message = "[SourceMonitor::FetchFeedJob] Failed to schedule retry for source #{source&.id}: " \
                "#{original_error.class}: #{original_error.message} (policy error: #{policy_error.class})"
      Rails.logger.error(message)
    rescue StandardError
      nil
    end
  end
end
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SourceMonitor
  # Active Job that walks a set of sources in batches and hands each one to a
  # retention pruner (SourceMonitor::Items::RetentionPruner unless the options
  # supply :retention_pruner_class).
  class ItemCleanupJob < ApplicationJob
    # Batch size used when the options do not provide one.
    DEFAULT_BATCH_SIZE = 100

    source_monitor_queue :fetch

    # @param options [Object, nil] raw options; normalized through
    #   SourceMonitor::Jobs::CleanupOptions.normalize before use
    def perform(options = nil)
      helper = SourceMonitor::Jobs::CleanupOptions
      options = helper.normalize(options)

      sources = resolve_scope(options)
      per_batch = helper.batch_size(options, default: DEFAULT_BATCH_SIZE)
      now = helper.resolve_time(options[:now])
      strategy = resolve_strategy(options)
      pruner = options[:retention_pruner_class] || SourceMonitor::Items::RetentionPruner

      sources.find_in_batches(batch_size: per_batch) do |group|
        group.each { |source| pruner.call(source:, now:, strategy:) }
      end
    end

    private

    # Starts from options[:source_scope] (or every source) and narrows it to
    # the ids extracted from :source_ids / :source_id when any are present.
    def resolve_scope(options)
      base = options[:source_scope] || SourceMonitor::Source.all
      wanted = SourceMonitor::Jobs::CleanupOptions.extract_ids([ options[:source_ids], options[:source_id] ])

      wanted.any? ? base.where(id: wanted) : base
    end

    # Precedence: an explicit :strategy key, then a boolean-cast :soft_delete
    # flag, then the configured retention strategy.
    def resolve_strategy(options)
      return options[:strategy] if options.key?(:strategy)
      return SourceMonitor.config.retention.strategy unless options.key?(:soft_delete)

      soft = ActiveModel::Type::Boolean.new.cast(options[:soft_delete])
      soft ? :soft_delete : :destroy
    end
  end
end
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SourceMonitor
  # Active Job that deletes fetch logs and scrape logs whose started_at is
  # older than a per-type retention window.
  class LogCleanupJob < ApplicationJob
    # Retention windows (in days) used when the options do not override them.
    DEFAULT_FETCH_LOG_RETENTION_DAYS = 90
    DEFAULT_SCRAPE_LOG_RETENTION_DAYS = 45

    # Number of log rows removed per DELETE statement.
    PRUNE_BATCH_SIZE = 500

    source_monitor_queue :fetch

    # @param options [Object, nil] raw options; normalized through
    #   SourceMonitor::Jobs::CleanupOptions.normalize. Recognized keys:
    #   :now, :fetch_logs_older_than_days, :scrape_logs_older_than_days
    def perform(options = nil)
      options = SourceMonitor::Jobs::CleanupOptions.normalize(options)

      now = SourceMonitor::Jobs::CleanupOptions.resolve_time(options[:now])
      fetch_cutoff = resolve_cutoff(now:, days: options[:fetch_logs_older_than_days], default: DEFAULT_FETCH_LOG_RETENTION_DAYS)
      scrape_cutoff = resolve_cutoff(now:, days: options[:scrape_logs_older_than_days], default: DEFAULT_SCRAPE_LOG_RETENTION_DAYS)

      prune_fetch_logs(fetch_cutoff) if fetch_cutoff
      prune_scrape_logs(scrape_cutoff) if scrape_cutoff
    end

    private

    # Turns a retention window into a cutoff timestamp.
    #
    # @return [Time, nil] nil (meaning "do not prune") when the window resolves
    #   to nil or to a value <= 0
    def resolve_cutoff(now:, days:, default:)
      resolved_days =
        if days.nil?
          default
        else
          SourceMonitor::Jobs::CleanupOptions.integer(days)
        end

      return nil unless resolved_days
      return nil if resolved_days <= 0

      now - resolved_days.days
    end

    def prune_fetch_logs(cutoff)
      prune_logs(SourceMonitor::FetchLog, cutoff)
    end

    def prune_scrape_logs(cutoff)
      prune_logs(SourceMonitor::ScrapeLog, cutoff)
    end

    # Deletes rows of +model+ started before +cutoff+, in batches.
    #
    # delete_all issues a direct SQL DELETE and therefore skips the
    # `has_one :log_entry, dependent: :destroy` declared on the log models, so
    # the polymorphic LogEntry rows are removed explicitly first to avoid
    # leaving orphans behind.
    def prune_logs(model, cutoff)
      stale = model.where(model.arel_table[:started_at].lt(cutoff))

      stale.in_batches(of: PRUNE_BATCH_SIZE) do |batch|
        SourceMonitor::LogEntry
          .where(loggable_type: model.polymorphic_name, loggable_id: batch.select(:id))
          .delete_all
        batch.delete_all
      end
    end
  end
end
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SourceMonitor
  # Active Job that triggers SourceMonitor::Scheduler.run with an optional
  # limit taken from the job options.
  class ScheduleFetchesJob < ApplicationJob
    source_monitor_queue :fetch

    # @param options [Hash, nil] may carry a :limit / "limit" entry; any
    #   non-Hash value is ignored
    def perform(options = nil)
      SourceMonitor::Scheduler.run(limit: extract_limit(options))
    end

    private

    # Reads the limit from the options, falling back to the scheduler's
    # default batch size when none is given.
    def extract_limit(options)
      settings = options.is_a?(Hash) ? options : {}
      settings = settings.symbolize_keys if settings.respond_to?(:symbolize_keys)

      settings[:limit] || SourceMonitor::Scheduler::DEFAULT_BATCH_SIZE
    end
  end
end
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SourceMonitor
  # Active Job that scrapes one item through
  # SourceMonitor::Scraping::ItemScraper and keeps the item's scraping state
  # (SourceMonitor::Scraping::State) in step with the outcome.
  class ScrapeItemJob < ApplicationJob
    source_monitor_queue :scrape

    discard_on ActiveJob::DeserializationError

    # @param item_id [Integer] id of the SourceMonitor::Item to scrape
    # @return [void]
    # @raise [StandardError] any error raised while scraping is re-raised
    #   after the item has been marked as failed
    def perform(item_id)
      log("job:start", item_id: item_id)
      item = SourceMonitor::Item.includes(:source).find_by(id: item_id)
      return unless item

      source = item.source
      unless source&.scraping_enabled?
        log("job:skipped_scraping_disabled", item: item)
        SourceMonitor::Scraping::State.clear_inflight!(item)
        return
      end

      SourceMonitor::Scraping::State.mark_processing!(item)
      SourceMonitor::Scraping::ItemScraper.new(item:, source:).call
      log("job:completed", item: item, status: item.scrape_status)
    rescue StandardError => error
      log("job:error", item: item, error: error.message)
      # item is still nil when the lookup itself failed; guard like the
      # ensure clause does so a secondary error cannot mask the original one.
      SourceMonitor::Scraping::State.mark_failed!(item) if item
      raise
    ensure
      SourceMonitor::Scraping::State.clear_inflight!(item) if item
    end

    private

    # Writes a JSON payload describing the job stage to the Rails logger at
    # info level. Best effort: does nothing without a logger and never raises.
    def log(stage, item: nil, item_id: nil, **extra)
      return unless defined?(Rails) && Rails.respond_to?(:logger) && Rails.logger

      payload = {
        stage: "SourceMonitor::ScrapeItemJob##{stage}",
        item_id: item&.id || item_id,
        source_id: item&.source_id
      }.merge(extra.compact)
      Rails.logger.info("[SourceMonitor::ManualScrape] #{payload.to_json}")
    rescue StandardError
      nil
    end
  end
end
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SourceMonitor
  # Active Job that runs SourceMonitor::Health::SourceHealthCheck for one
  # source and broadcasts the outcome (source refresh plus a toast message)
  # through SourceMonitor::Realtime.
  class SourceHealthCheckJob < ApplicationJob
    source_monitor_queue :fetch

    discard_on ActiveJob::DeserializationError

    # @param source_id [Integer] id of the SourceMonitor::Source to check
    # @return [Object, nil] the health check result; nil when the source does
    #   not exist or an unexpected error was handled
    def perform(source_id)
      source = SourceMonitor::Source.find_by(id: source_id)
      return unless source

      outcome = SourceMonitor::Health::SourceHealthCheck.new(source: source).call
      broadcast_outcome(source, outcome)
      outcome
    rescue StandardError => error
      log_unexpected_error(source_id, error)

      if source
        record_unexpected_failure(source, error)
        broadcast_outcome(source, nil, error)
      end
      nil
    end

    private

    # Logs an unexpected failure when a Rails logger is available.
    def log_unexpected_error(source_id, error)
      return unless defined?(Rails) && Rails.respond_to?(:logger)

      Rails.logger&.error(
        "[SourceMonitor::SourceHealthCheckJob] error for source #{source_id}: #{error.class}: #{error.message}"
      )
    end

    # Stores a failed HealthCheckLog describing the unexpected error.
    # Best effort: any error while saving is swallowed.
    def record_unexpected_failure(source, error)
      SourceMonitor::HealthCheckLog.create!(
        source: source,
        success: false,
        started_at: Time.current,
        completed_at: Time.current,
        duration_ms: 0,
        error_class: error.class.name,
        error_message: error.message
      )
    rescue StandardError
      nil
    end

    # Broadcasts the refreshed source, then a toast when there is a message.
    def broadcast_outcome(source, result, error = nil)
      SourceMonitor::Realtime.broadcast_source(source)

      message, level = toast_payload(source, result, error)
      SourceMonitor::Realtime.broadcast_toast(message:, level:) if message.present?
    end

    # Builds the [message, level] pair for the toast.
    #
    # @return [Array(String, Symbol)] level is :success or :error
    def toast_payload(source, result, error)
      return [ "Health check failed for #{source.name}: #{error.message}", :error ] if error
      return [ "Health check succeeded for #{source.name}.", :success ] if result&.success?

      reason = result&.error&.message
      status = result&.log&.http_status

      parts = [ "Health check failed for #{source.name}" ]
      parts << " (HTTP #{status})" if status.present?
      parts << ": #{reason}" if reason.present?

      [ "#{parts.join}.", :error ]
    end
  end
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SourceMonitor
  if defined?(::ActionMailer::Base)
    # Base mailer for the engine when the host app loads Action Mailer.
    class ApplicationMailer < ::ActionMailer::Base
      layout "mailer"
      default from: "from@example.com"
    end
  else
    # :nocov:
    # Host apps that skip Action Mailer (for example API-only apps) still need
    # this constant to resolve, so define an empty stand-in class rather than
    # loading the framework.
    class ApplicationMailer; end
    # :nocov:
  end
end
|
|
File without changes
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SourceMonitor
  # Concern with the pieces common to the log models that include it:
  # a metadata attribute defaulting to an empty hash, timing validations, and
  # scopes for ordering and filtering by outcome.
  module Loggable
    extend ActiveSupport::Concern

    included do
      # A fresh Hash per record, so instances never share a default object.
      attribute :metadata, default: -> { Hash.new }

      validates :started_at, presence: true
      validates :duration_ms, allow_nil: true, numericality: { greater_than_or_equal_to: 0 }

      # Newest first.
      scope :recent, -> { order(started_at: :desc) }
      scope :successful, -> { where(success: true) }
      scope :failed, -> { where(success: false) }
    end
  end
end
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SourceMonitor
  # Log record for one fetch of a source. Carries per-fetch item counters and
  # is mirrored into a LogEntry after every save.
  class FetchLog < ApplicationRecord
    include SourceMonitor::Loggable

    belongs_to :source, class_name: "SourceMonitor::Source", inverse_of: :fetch_logs
    has_one :log_entry,
            as: :loggable,
            class_name: "SourceMonitor::LogEntry",
            inverse_of: :loggable,
            dependent: :destroy

    # Item counters all start at zero.
    %i[items_created items_updated items_failed].each do |counter|
      attribute counter, :integer, default: 0
    end
    attribute :http_response_headers, default: -> { Hash.new }

    validates :source, presence: true
    validates :items_created, :items_updated, :items_failed,
              numericality: { greater_than_or_equal_to: 0 }

    scope :for_job, ->(job_id) { where(job_id: job_id) }

    SourceMonitor::ModelExtensions.register(self, :fetch_log)

    after_save :sync_log_entry

    private

    # Hands this record to Logs::EntrySync after each save.
    def sync_log_entry
      SourceMonitor::Logs::EntrySync.call(self)
    end
  end
end
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SourceMonitor
  # Log record for one health check of a source. Mirrored into a LogEntry
  # after every save.
  class HealthCheckLog < ApplicationRecord
    include SourceMonitor::Loggable

    belongs_to :source, class_name: "SourceMonitor::Source", inverse_of: :health_check_logs
    has_one :log_entry, as: :loggable, class_name: "SourceMonitor::LogEntry",
                        inverse_of: :loggable, dependent: :destroy

    # A fresh Hash per record.
    attribute :http_response_headers, default: -> { Hash.new }

    validates :source, presence: true

    SourceMonitor::ModelExtensions.register(self, :health_check_log)

    after_save :sync_log_entry

    private

    # Hands this record to Logs::EntrySync after each save.
    def sync_log_entry
      SourceMonitor::Logs::EntrySync.call(self)
    end
  end
end
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "source_monitor/models/url_normalizable"
|
|
4
|
+
|
|
5
|
+
module SourceMonitor
|
|
6
|
+
class Item < ApplicationRecord
|
|
7
|
+
include SourceMonitor::Models::UrlNormalizable
|
|
8
|
+
|
|
9
|
+
belongs_to :source, class_name: "SourceMonitor::Source", inverse_of: :items, counter_cache: true
|
|
10
|
+
has_one :item_content, class_name: "SourceMonitor::ItemContent", inverse_of: :item, dependent: :destroy, autosave: true
|
|
11
|
+
has_many :scrape_logs, class_name: "SourceMonitor::ScrapeLog", inverse_of: :item, dependent: :destroy
|
|
12
|
+
has_many :log_entries, class_name: "SourceMonitor::LogEntry", inverse_of: :item, dependent: :destroy
|
|
13
|
+
|
|
14
|
+
# Explicit scope for active (non-deleted) items - no default_scope to avoid anti-pattern
|
|
15
|
+
scope :active, -> { where(deleted_at: nil) }
|
|
16
|
+
scope :with_deleted, -> { unscope(where: :deleted_at) }
|
|
17
|
+
scope :only_deleted, -> { where.not(deleted_at: nil) }
|
|
18
|
+
|
|
19
|
+
normalizes_urls :url, :canonical_url, :comments_url
|
|
20
|
+
validates_url_format :url, :canonical_url, :comments_url
|
|
21
|
+
|
|
22
|
+
validates :source, presence: true
|
|
23
|
+
validates :guid, presence: true, uniqueness: { scope: :source_id, case_sensitive: false }
|
|
24
|
+
validates :content_fingerprint, uniqueness: { scope: :source_id }, allow_blank: true
|
|
25
|
+
validates :url, presence: true
|
|
26
|
+
|
|
27
|
+
scope :recent, -> { active.order(Arel.sql("published_at DESC NULLS LAST, created_at DESC")) }
|
|
28
|
+
scope :published, -> { active.where.not(published_at: nil) }
|
|
29
|
+
scope :pending_scrape, -> { active.where(scraped_at: nil) }
|
|
30
|
+
scope :failed_scrape, -> { active.where(scrape_status: "failed") }
|
|
31
|
+
|
|
32
|
+
delegate :scraped_html, :scraped_content, to: :item_content, allow_nil: true
|
|
33
|
+
|
|
34
|
+
SourceMonitor::ModelExtensions.register(self, :item)
|
|
35
|
+
|
|
36
|
+
class << self
|
|
37
|
+
def ransackable_attributes(_auth_object = nil)
|
|
38
|
+
%w[title summary url published_at created_at scrape_status]
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
def ransackable_associations(_auth_object = nil)
|
|
42
|
+
%w[source]
|
|
43
|
+
end
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
def scraped_html=(value)
|
|
47
|
+
assign_content_attribute(:scraped_html, value)
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
def scraped_content=(value)
|
|
51
|
+
assign_content_attribute(:scraped_content, value)
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
def deleted?
|
|
55
|
+
deleted_at.present?
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
def soft_delete!(timestamp: Time.current)
|
|
59
|
+
return if deleted?
|
|
60
|
+
|
|
61
|
+
self.class.transaction do
|
|
62
|
+
timestamp = timestamp.in_time_zone if timestamp.respond_to?(:in_time_zone)
|
|
63
|
+
timestamp ||= Time.current
|
|
64
|
+
|
|
65
|
+
update_columns(
|
|
66
|
+
deleted_at: timestamp,
|
|
67
|
+
updated_at: timestamp
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
SourceMonitor::Source.decrement_counter(:items_count, source_id) if source_id
|
|
71
|
+
end
|
|
72
|
+
end
|
|
73
|
+
|
|
74
|
+
private
|
|
75
|
+
|
|
76
|
+
# Scraped fields are held by a separate item_content record rather than on
# the item. Writing one of them goes through three steps:
#   1. make sure an item_content record exists when there is a non-nil value,
#   2. call the matching writer on that record, if there is one,
#   3. discard the record again when both scraped fields are blank.
def assign_content_attribute(attribute, value)
  ensure_item_content_presence(value)

  content = item_content
  writer = "#{attribute}="
  content.public_send(writer, value) unless content.nil?

  cleanup_item_content_if_blank
end
|
|
84
|
+
|
|
85
|
+
# Builds an item_content record when one is needed: nothing happens if a
# record is already present or if the incoming value is nil.
def ensure_item_content_presence(value)
  return if item_content.present?
  return if value.nil?

  build_item_content
end
|
|
90
|
+
|
|
91
|
+
# Discards the item_content record once neither scraped field has a value.
# A persisted record is marked for destruction; an unsaved one is dropped by
# resetting the association.
# NOTE(review): mark_for_destruction only removes the row on save if the
# association autosaves - confirm against the has_one declaration.
def cleanup_item_content_if_blank
  content = item_content
  return unless content

  has_payload = content.scraped_html.present? || content.scraped_content.present?
  return if has_payload

  content.persisted? ? content.mark_for_destruction : association(:item_content).reset
end
|
|
101
|
+
end
|
|
102
|
+
end
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SourceMonitor
  # Companion record of a SourceMonitor::Item. Item delegates its
  # scraped_html / scraped_content readers to this model.
  class ItemContent < ApplicationRecord
    # Saving or destroying the content touches the owning item (touch: true).
    belongs_to :item,
               class_name: "SourceMonitor::Item",
               inverse_of: :item_content,
               touch: true

    validates :item, presence: true

    # Registers this model with the engine's extension registry under
    # :item_content.
    SourceMonitor::ModelExtensions.register(self, :item_content)
  end
end
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SourceMonitor
  # A row in sourcemon_log_entries that points, through the polymorphic
  # `loggable` delegated type, at one concrete log record (fetch, scrape or
  # health check). It always belongs to a source and optionally to an item.
  class LogEntry < ApplicationRecord
    self.table_name = "sourcemon_log_entries"

    delegated_type :loggable, types: %w[SourceMonitor::FetchLog SourceMonitor::ScrapeLog SourceMonitor::HealthCheckLog]

    belongs_to :source, class_name: "SourceMonitor::Source", inverse_of: :log_entries
    belongs_to :item, class_name: "SourceMonitor::Item", inverse_of: :log_entries, optional: true

    validates :started_at, presence: true
    validates :source, presence: true

    # Most recently started entries first.
    scope :recent, -> { order(started_at: :desc) }

    # Registers this model with the engine's extension registry under
    # :log_entry.
    SourceMonitor::ModelExtensions.register(self, :log_entry)

    class << self
      # Column names Ransack is allowed to search and sort on.
      # The auth object is accepted for API compatibility and ignored.
      def ransackable_attributes(_auth_object = nil)
        %w[
          success
          started_at
          http_status
          scraper_adapter
          error_message
          error_class
          loggable_type
        ]
      end

      # Associations Ransack is allowed to traverse.
      # The auth object is accepted for API compatibility and ignored.
      def ransackable_associations(_auth_object = nil)
        %w[source item loggable]
      end
    end

    # The predicates below compare against polymorphic_name because that is
    # the value Active Record writes into a polymorphic *_type column.
    # sti_name is the STI discriminator instead, and differs when
    # store_full_sti_class is disabled or the log class has STI subclasses.

    # True when the entry wraps a SourceMonitor::FetchLog.
    def fetch?
      loggable_type == FetchLog.polymorphic_name
    end

    # True when the entry wraps a SourceMonitor::ScrapeLog.
    def scrape?
      loggable_type == ScrapeLog.polymorphic_name
    end

    # True when the entry wraps a SourceMonitor::HealthCheckLog.
    def health_check?
      loggable_type == HealthCheckLog.polymorphic_name
    end

    # Symbolic kind of the wrapped log: :fetch, :scrape or :health_check.
    # Any entry that is neither a fetch nor a scrape is reported as
    # :health_check, including one whose loggable_type is blank or unknown.
    def log_type
      return :fetch if fetch?
      return :scrape if scrape?

      :health_check
    end
  end
end
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SourceMonitor
  # Log record tied to both an item and a source. Each scrape log owns one
  # polymorphic LogEntry, which is synchronised after every save.
  class ScrapeLog < ApplicationRecord
    include SourceMonitor::Loggable

    belongs_to :item,
               class_name: "SourceMonitor::Item",
               inverse_of: :scrape_logs
    belongs_to :source,
               class_name: "SourceMonitor::Source",
               inverse_of: :scrape_logs
    has_one :log_entry,
            as: :loggable,
            class_name: "SourceMonitor::LogEntry",
            inverse_of: :loggable,
            dependent: :destroy

    validates :item, :source, presence: true
    validates :content_length, numericality: { greater_than_or_equal_to: 0 }, allow_nil: true
    validate :source_matches_item

    # Registers this model with the engine's extension registry under
    # :scrape_log. Kept ahead of the after_save declaration, as in the
    # original ordering.
    SourceMonitor::ModelExtensions.register(self, :scrape_log)

    after_save :sync_log_entry

    private

    # Validation: the log's source must be the source of the item it refers
    # to. Skipped while either association is missing, since the presence
    # validation reports that case.
    def source_matches_item
      return unless item && source
      return if item.source_id == source_id

      errors.add(:source, "must match item source")
    end

    # Hands this record to the log-entry synchroniser after each save.
    def sync_log_entry
      SourceMonitor::Logs::EntrySync.call(self)
    end
  end
end
|