source_monitor 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +16 -0
- data/.rubocop.yml +12 -0
- data/.ruby-version +1 -0
- data/AGENTS.md +132 -0
- data/CHANGELOG.md +66 -0
- data/CONTRIBUTING.md +31 -0
- data/Gemfile +30 -0
- data/Gemfile.lock +411 -0
- data/MIT-LICENSE +20 -0
- data/README.md +108 -0
- data/Rakefile +8 -0
- data/app/assets/builds/.keep +0 -0
- data/app/assets/config/source_monitor_manifest.js +4 -0
- data/app/assets/images/source_monitor/.keep +0 -0
- data/app/assets/javascripts/source_monitor/application.js +20 -0
- data/app/assets/javascripts/source_monitor/controllers/async_submit_controller.js +36 -0
- data/app/assets/javascripts/source_monitor/controllers/dropdown_controller.js +109 -0
- data/app/assets/javascripts/source_monitor/controllers/modal_controller.js +56 -0
- data/app/assets/javascripts/source_monitor/controllers/notification_controller.js +53 -0
- data/app/assets/javascripts/source_monitor/turbo_actions.js +13 -0
- data/app/assets/stylesheets/source_monitor/application.tailwind.css +13 -0
- data/app/assets/svgs/source_monitor/.keep +0 -0
- data/app/controllers/concerns/.keep +0 -0
- data/app/controllers/concerns/source_monitor/sanitizes_search_params.rb +81 -0
- data/app/controllers/source_monitor/application_controller.rb +62 -0
- data/app/controllers/source_monitor/dashboard_controller.rb +27 -0
- data/app/controllers/source_monitor/fetch_logs_controller.rb +9 -0
- data/app/controllers/source_monitor/health_controller.rb +10 -0
- data/app/controllers/source_monitor/items_controller.rb +116 -0
- data/app/controllers/source_monitor/logs_controller.rb +15 -0
- data/app/controllers/source_monitor/scrape_logs_controller.rb +9 -0
- data/app/controllers/source_monitor/source_bulk_scrapes_controller.rb +35 -0
- data/app/controllers/source_monitor/source_fetches_controller.rb +22 -0
- data/app/controllers/source_monitor/source_health_checks_controller.rb +34 -0
- data/app/controllers/source_monitor/source_health_resets_controller.rb +27 -0
- data/app/controllers/source_monitor/source_retries_controller.rb +22 -0
- data/app/controllers/source_monitor/source_turbo_responses.rb +115 -0
- data/app/controllers/source_monitor/sources_controller.rb +179 -0
- data/app/helpers/source_monitor/application_helper.rb +327 -0
- data/app/jobs/source_monitor/application_job.rb +13 -0
- data/app/jobs/source_monitor/fetch_feed_job.rb +117 -0
- data/app/jobs/source_monitor/item_cleanup_job.rb +48 -0
- data/app/jobs/source_monitor/log_cleanup_job.rb +47 -0
- data/app/jobs/source_monitor/schedule_fetches_job.rb +29 -0
- data/app/jobs/source_monitor/scrape_item_job.rb +47 -0
- data/app/jobs/source_monitor/source_health_check_job.rb +77 -0
- data/app/mailers/source_monitor/application_mailer.rb +17 -0
- data/app/models/concerns/.keep +0 -0
- data/app/models/concerns/source_monitor/loggable.rb +18 -0
- data/app/models/source_monitor/application_record.rb +5 -0
- data/app/models/source_monitor/fetch_log.rb +31 -0
- data/app/models/source_monitor/health_check_log.rb +28 -0
- data/app/models/source_monitor/item.rb +102 -0
- data/app/models/source_monitor/item_content.rb +11 -0
- data/app/models/source_monitor/log_entry.rb +56 -0
- data/app/models/source_monitor/scrape_log.rb +31 -0
- data/app/models/source_monitor/source.rb +115 -0
- data/app/views/layouts/source_monitor/application.html.erb +54 -0
- data/app/views/source_monitor/dashboard/_fetch_schedule.html.erb +90 -0
- data/app/views/source_monitor/dashboard/_job_metrics.html.erb +82 -0
- data/app/views/source_monitor/dashboard/_recent_activity.html.erb +39 -0
- data/app/views/source_monitor/dashboard/_stat_card.html.erb +6 -0
- data/app/views/source_monitor/dashboard/_stats.html.erb +9 -0
- data/app/views/source_monitor/dashboard/index.html.erb +48 -0
- data/app/views/source_monitor/fetch_logs/show.html.erb +90 -0
- data/app/views/source_monitor/items/_details.html.erb +234 -0
- data/app/views/source_monitor/items/_details_wrapper.html.erb +3 -0
- data/app/views/source_monitor/items/index.html.erb +147 -0
- data/app/views/source_monitor/items/show.html.erb +3 -0
- data/app/views/source_monitor/logs/index.html.erb +208 -0
- data/app/views/source_monitor/scrape_logs/show.html.erb +73 -0
- data/app/views/source_monitor/shared/_toast.html.erb +34 -0
- data/app/views/source_monitor/sources/_bulk_scrape_form.html.erb +64 -0
- data/app/views/source_monitor/sources/_bulk_scrape_modal.html.erb +53 -0
- data/app/views/source_monitor/sources/_details.html.erb +302 -0
- data/app/views/source_monitor/sources/_details_wrapper.html.erb +3 -0
- data/app/views/source_monitor/sources/_empty_state_row.html.erb +5 -0
- data/app/views/source_monitor/sources/_fetch_interval_heatmap.html.erb +46 -0
- data/app/views/source_monitor/sources/_form.html.erb +143 -0
- data/app/views/source_monitor/sources/_health_status_badge.html.erb +46 -0
- data/app/views/source_monitor/sources/_row.html.erb +102 -0
- data/app/views/source_monitor/sources/edit.html.erb +28 -0
- data/app/views/source_monitor/sources/index.html.erb +153 -0
- data/app/views/source_monitor/sources/new.html.erb +22 -0
- data/app/views/source_monitor/sources/show.html.erb +3 -0
- data/config/coverage_baseline.json +2010 -0
- data/config/initializers/feedjira.rb +19 -0
- data/config/routes.rb +18 -0
- data/config/tailwind.config.js +17 -0
- data/db/migrate/20241008120000_create_source_monitor_sources.rb +40 -0
- data/db/migrate/20241008121000_create_source_monitor_items.rb +44 -0
- data/db/migrate/20241008122000_create_source_monitor_fetch_logs.rb +32 -0
- data/db/migrate/20241008123000_create_source_monitor_scrape_logs.rb +25 -0
- data/db/migrate/20251008183000_change_fetch_interval_to_minutes.rb +23 -0
- data/db/migrate/20251009090000_create_source_monitor_item_contents.rb +38 -0
- data/db/migrate/20251009103000_add_feed_content_readability_to_sources.rb +5 -0
- data/db/migrate/20251010090000_add_adaptive_fetching_toggle_to_sources.rb +7 -0
- data/db/migrate/20251010123000_add_deleted_at_to_source_monitor_items.rb +8 -0
- data/db/migrate/20251010153000_add_type_to_source_monitor_sources.rb +8 -0
- data/db/migrate/20251010154500_add_fetch_status_to_source_monitor_sources.rb +9 -0
- data/db/migrate/20251010160000_create_solid_cable_messages.rb +16 -0
- data/db/migrate/20251011090000_add_fetch_retry_state_to_sources.rb +14 -0
- data/db/migrate/20251012090000_add_health_fields_to_sources.rb +17 -0
- data/db/migrate/20251012100000_optimize_source_monitor_database_performance.rb +13 -0
- data/db/migrate/20251014064947_add_not_null_constraints_to_items.rb +30 -0
- data/db/migrate/20251014171659_add_performance_indexes.rb +29 -0
- data/db/migrate/20251014172525_add_fetch_status_check_constraint.rb +18 -0
- data/db/migrate/20251015100000_create_source_monitor_log_entries.rb +89 -0
- data/db/migrate/20251022100000_create_source_monitor_health_check_logs.rb +22 -0
- data/db/migrate/20251108120116_refresh_fetch_status_constraint.rb +29 -0
- data/docs/configuration.md +170 -0
- data/docs/deployment.md +63 -0
- data/docs/gh-cli-workflow.md +44 -0
- data/docs/installation.md +144 -0
- data/docs/troubleshooting.md +76 -0
- data/eslint.config.mjs +27 -0
- data/lib/generators/source_monitor/install/install_generator.rb +59 -0
- data/lib/generators/source_monitor/install/templates/source_monitor.rb.tt +155 -0
- data/lib/source_monitor/analytics/source_activity_rates.rb +53 -0
- data/lib/source_monitor/analytics/source_fetch_interval_distribution.rb +57 -0
- data/lib/source_monitor/analytics/sources_index_metrics.rb +92 -0
- data/lib/source_monitor/assets/bundler.rb +49 -0
- data/lib/source_monitor/assets.rb +6 -0
- data/lib/source_monitor/configuration.rb +654 -0
- data/lib/source_monitor/dashboard/queries.rb +356 -0
- data/lib/source_monitor/dashboard/quick_action.rb +7 -0
- data/lib/source_monitor/dashboard/quick_actions_presenter.rb +26 -0
- data/lib/source_monitor/dashboard/recent_activity.rb +30 -0
- data/lib/source_monitor/dashboard/recent_activity_presenter.rb +77 -0
- data/lib/source_monitor/dashboard/turbo_broadcaster.rb +87 -0
- data/lib/source_monitor/dashboard/upcoming_fetch_schedule.rb +126 -0
- data/lib/source_monitor/engine.rb +107 -0
- data/lib/source_monitor/events.rb +110 -0
- data/lib/source_monitor/feedjira_extensions.rb +103 -0
- data/lib/source_monitor/fetching/advisory_lock.rb +54 -0
- data/lib/source_monitor/fetching/completion/event_publisher.rb +22 -0
- data/lib/source_monitor/fetching/completion/follow_up_handler.rb +37 -0
- data/lib/source_monitor/fetching/completion/retention_handler.rb +30 -0
- data/lib/source_monitor/fetching/feed_fetcher.rb +627 -0
- data/lib/source_monitor/fetching/fetch_error.rb +88 -0
- data/lib/source_monitor/fetching/fetch_runner.rb +142 -0
- data/lib/source_monitor/fetching/retry_policy.rb +85 -0
- data/lib/source_monitor/fetching/stalled_fetch_reconciler.rb +146 -0
- data/lib/source_monitor/health/source_health_check.rb +100 -0
- data/lib/source_monitor/health/source_health_monitor.rb +210 -0
- data/lib/source_monitor/health/source_health_reset.rb +68 -0
- data/lib/source_monitor/health.rb +46 -0
- data/lib/source_monitor/http.rb +85 -0
- data/lib/source_monitor/instrumentation.rb +52 -0
- data/lib/source_monitor/items/item_creator.rb +601 -0
- data/lib/source_monitor/items/retention_pruner.rb +146 -0
- data/lib/source_monitor/items/retention_strategies/destroy.rb +26 -0
- data/lib/source_monitor/items/retention_strategies/soft_delete.rb +50 -0
- data/lib/source_monitor/items/retention_strategies.rb +9 -0
- data/lib/source_monitor/jobs/cleanup_options.rb +85 -0
- data/lib/source_monitor/jobs/fetch_failure_subscriber.rb +129 -0
- data/lib/source_monitor/jobs/solid_queue_metrics.rb +199 -0
- data/lib/source_monitor/jobs/visibility.rb +133 -0
- data/lib/source_monitor/logs/entry_sync.rb +69 -0
- data/lib/source_monitor/logs/filter_set.rb +163 -0
- data/lib/source_monitor/logs/query.rb +81 -0
- data/lib/source_monitor/logs/table_presenter.rb +161 -0
- data/lib/source_monitor/metrics.rb +77 -0
- data/lib/source_monitor/model_extensions.rb +109 -0
- data/lib/source_monitor/models/sanitizable.rb +76 -0
- data/lib/source_monitor/models/url_normalizable.rb +84 -0
- data/lib/source_monitor/pagination/paginator.rb +90 -0
- data/lib/source_monitor/realtime/adapter.rb +97 -0
- data/lib/source_monitor/realtime/broadcaster.rb +237 -0
- data/lib/source_monitor/realtime.rb +17 -0
- data/lib/source_monitor/release/changelog.rb +59 -0
- data/lib/source_monitor/release/runner.rb +73 -0
- data/lib/source_monitor/scheduler.rb +82 -0
- data/lib/source_monitor/scrapers/base.rb +105 -0
- data/lib/source_monitor/scrapers/fetchers/http_fetcher.rb +97 -0
- data/lib/source_monitor/scrapers/parsers/readability_parser.rb +101 -0
- data/lib/source_monitor/scrapers/readability.rb +156 -0
- data/lib/source_monitor/scraping/bulk_result_presenter.rb +85 -0
- data/lib/source_monitor/scraping/bulk_source_scraper.rb +233 -0
- data/lib/source_monitor/scraping/enqueuer.rb +125 -0
- data/lib/source_monitor/scraping/item_scraper/adapter_resolver.rb +44 -0
- data/lib/source_monitor/scraping/item_scraper/persistence.rb +189 -0
- data/lib/source_monitor/scraping/item_scraper.rb +84 -0
- data/lib/source_monitor/scraping/scheduler.rb +43 -0
- data/lib/source_monitor/scraping/state.rb +79 -0
- data/lib/source_monitor/security/authentication.rb +85 -0
- data/lib/source_monitor/security/parameter_sanitizer.rb +42 -0
- data/lib/source_monitor/sources/turbo_stream_presenter.rb +54 -0
- data/lib/source_monitor/turbo_streams/stream_responder.rb +95 -0
- data/lib/source_monitor/version.rb +3 -0
- data/lib/source_monitor.rb +149 -0
- data/lib/tasks/recover_stalled_fetches.rake +16 -0
- data/lib/tasks/source_monitor_assets.rake +28 -0
- data/lib/tasks/source_monitor_tasks.rake +29 -0
- data/lib/tasks/test_smoke.rake +12 -0
- data/package-lock.json +3997 -0
- data/package.json +29 -0
- data/postcss.config.js +6 -0
- data/source_monitor.gemspec +46 -0
- data/stylelint.config.js +12 -0
- metadata +469 -0
--- /dev/null
+++ data/lib/source_monitor/scraping/bulk_result_presenter.rb
@@ -0,0 +1,85 @@
+# frozen_string_literal: true
+
+module SourceMonitor
+  module Scraping
+    # Presenter for building flash messages from BulkSourceScraper results
+    # Extracts complex message formatting logic from the controller
+    class BulkResultPresenter
+      attr_reader :result, :pluralizer
+
+      def initialize(result:, pluralizer:)
+        @result = result
+        @pluralizer = pluralizer
+      end
+
+      def to_flash_payload
+        case result.status
+        when :success
+          build_success_payload
+        when :partial
+          build_partial_payload
+        else
+          build_error_payload
+        end
+      end
+
+      private
+
+      def build_success_payload
+        label = BulkSourceScraper.selection_label(result.selection)
+        pluralized_enqueued = pluralizer.call(result.enqueued_count, "item")
+
+        message = "Queued scraping for #{pluralized_enqueued} from the #{label}."
+
+        if result.already_enqueued_count.positive?
+          pluralized_already = pluralizer.call(result.already_enqueued_count, "item")
+          message = "#{message} #{pluralized_already.capitalize} already in progress."
+        end
+
+        { flash_key: :notice, message:, level: :success }
+      end
+
+      def build_partial_payload
+        label = BulkSourceScraper.selection_label(result.selection)
+        parts = []
+
+        if result.enqueued_count.positive?
+          pluralized_enqueued = pluralizer.call(result.enqueued_count, "item")
+          parts << "Queued #{pluralized_enqueued} from the #{label}"
+        end
+
+        if result.already_enqueued_count.positive?
+          pluralized_already = pluralizer.call(result.already_enqueued_count, "item")
+          parts << "#{pluralized_already.capitalize} already in progress"
+        end
+
+        if result.rate_limited?
+          limit = SourceMonitor.config.scraping.max_in_flight_per_source
+          parts << "Stopped after reaching the per-source limit#{" of #{limit}" if limit}"
+        end
+
+        other_failures = result.failure_details.except(:rate_limited)
+        if other_failures.values.sum.positive?
+          skipped = other_failures.map do |status, count|
+            label_key = status.to_s.tr("_", " ")
+            "#{pluralizer.call(count, label_key)}"
+          end.join(", ")
+          parts << "Skipped #{skipped}"
+        end
+
+        if parts.empty?
+          parts << "No new scrapes were queued from the #{label}"
+        end
+
+        { flash_key: :notice, message: parts.join(". ") + ".", level: :warning }
+      end
+
+      def build_error_payload
+        message = result.messages.presence&.first ||
+          "No items were queued because nothing matched the selected scope."
+
+        { flash_key: :alert, message:, level: :error }
+      end
+    end
+  end
+end
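The presenter only needs a result object that responds to the fields it reads (status, selection, the counters, failure_details, messages) plus a callable pluralizer. A minimal usage sketch, assuming the host app wraps Rails' pluralize helper in a lambda and feeds it a BulkSourceScraper result (that class is in the next hunk); the controller wiring and variable names here are illustrative, not part of the gem:

result = SourceMonitor::Scraping::BulkSourceScraper.new(
  source: source,               # assumed SourceMonitor::Source record
  selection: params[:selection]
).call

presenter = SourceMonitor::Scraping::BulkResultPresenter.new(
  result: result,
  pluralizer: ->(count, noun) { ActionController::Base.helpers.pluralize(count, noun) }
)

payload = presenter.to_flash_payload
# => e.g. { flash_key: :notice, message: "Queued scraping for 3 items from the current view.", level: :success }
flash[payload[:flash_key]] = payload[:message]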
--- /dev/null
+++ data/lib/source_monitor/scraping/bulk_source_scraper.rb
@@ -0,0 +1,233 @@
+# frozen_string_literal: true
+
+module SourceMonitor
+  module Scraping
+    # Orchestrates bulk scrape enqueues for a source based on a user-selected
+    # scope. Works alongside the single-item enqueuer to ensure we respect
+    # per-source limits and provide actionable feedback for the UI.
+    class BulkSourceScraper
+      SELECTIONS = %i[current unscraped all].freeze
+      SELECTION_LABELS = {
+        current: "current view",
+        unscraped: "unscraped items",
+        all: "all items"
+      }.freeze
+      DEFAULT_PREVIEW_LIMIT = 10
+
+      Result = Struct.new(
+        :status,
+        :selection,
+        :attempted_count,
+        :enqueued_count,
+        :already_enqueued_count,
+        :failure_count,
+        :failure_details,
+        :messages,
+        :rate_limited,
+        keyword_init: true
+      ) do
+        def success?
+          status == :success
+        end
+
+        def partial?
+          status == :partial
+        end
+
+        def error?
+          status == :error
+        end
+
+        def rate_limited?
+          !!rate_limited
+        end
+      end
+
+      def self.selection_label(selection)
+        SELECTION_LABELS[normalize_selection(selection)] || SELECTION_LABELS[:current]
+      end
+
+      def self.selection_counts(source:, preview_items:, preview_limit: 10)
+        preview_collection = Array(preview_items).compact
+        base_scope = SourceMonitor::Item.active.where(source_id: source.id)
+        {
+          current: preview_collection.size.clamp(0, preview_limit.to_i.nonzero? || preview_collection.size),
+          unscraped: base_scope.merge(unscraped_scope).count,
+          all: base_scope.count
+        }
+      end
+
+      def self.normalize_selection(selection)
+        value = selection.is_a?(String) ? selection.strip : selection
+        value = value.to_s.downcase.to_sym if value
+        value if SELECTIONS.include?(value)
+      end
+
+      def initialize(source:, selection:, preview_limit: DEFAULT_PREVIEW_LIMIT, enqueuer: SourceMonitor::Scraping::Enqueuer, config: SourceMonitor.config.scraping)
+        @source = source
+        @selection = self.class.normalize_selection(selection) || :current
+        normalized_limit = preview_limit.respond_to?(:to_i) ? preview_limit.to_i : DEFAULT_PREVIEW_LIMIT
+        @preview_limit = normalized_limit.positive? ? normalized_limit : DEFAULT_PREVIEW_LIMIT
+        @enqueuer = enqueuer
+        @config = config
+      end
+
+      def call
+        return disabled_result unless source.scraping_enabled?
+        return invalid_selection_result unless SELECTIONS.include?(selection)
+
+        items = scoped_items.to_a
+        attempted_count = items.size
+
+        return no_items_result if attempted_count.zero?
+
+        failure_details = Hash.new(0)
+        messages = []
+        enqueued_count = 0
+        already_enqueued_count = 0
+        rate_limited = false
+
+        items.each do |item|
+          enqueue_result = enqueuer.enqueue(item: item, source:, reason: :manual)
+
+          case enqueue_result.status
+          when :enqueued
+            enqueued_count += 1
+          when :already_enqueued
+            already_enqueued_count += 1
+          when :rate_limited
+            failure_details[:rate_limited] += 1
+            messages << enqueue_result.message if enqueue_result.message.present?
+            rate_limited = true
+            break
+          else
+            key = enqueue_result.status || :unknown
+            failure_details[key] += 1
+            messages << enqueue_result.message if enqueue_result.message.present?
+          end
+        end
+
+        failure_count = failure_details.values.sum
+        status = determine_status(enqueued_count:, failure_count:, already_enqueued_count:)
+
+        Result.new(
+          status:,
+          selection:,
+          attempted_count: attempted_count,
+          enqueued_count:,
+          already_enqueued_count:,
+          failure_count:,
+          failure_details: failure_details.freeze,
+          messages: messages.compact.uniq,
+          rate_limited: rate_limited
+        )
+      end
+
+      private
+
+      attr_reader :source, :selection, :preview_limit, :enqueuer, :config
+
+      def scoped_items
+        scope = case selection
+                when :current
+                  base_scope.limit(preview_limit)
+                when :unscraped
+                  base_scope.merge(unscraped_scope)
+                when :all
+                  base_scope
+                else
+                  base_scope.limit(preview_limit)
+                end
+
+        scope = without_inflight(scope)
+        apply_batch_limit(scope)
+      end
+
+      def base_scope
+        SourceMonitor::Item.active.where(source_id: source.id).order(Arel.sql("published_at DESC NULLS LAST, created_at DESC"))
+      end
+
+      def without_inflight(scope)
+        statuses = SourceMonitor::Scraping::State::IN_FLIGHT_STATUSES
+        column = SourceMonitor::Item.arel_table[:scrape_status]
+        scope.where(column.eq(nil).or(column.not_in(statuses)))
+      end
+
+      def self.unscraped_scope
+        item_table = SourceMonitor::Item.arel_table
+        failed_statuses = %w[failed partial]
+        SourceMonitor::Item.active.where(
+          item_table[:scraped_at].eq(nil)
+            .or(item_table[:scrape_status].in(failed_statuses))
+        )
+      end
+
+      def unscraped_scope
+        self.class.unscraped_scope
+      end
+
+      def apply_batch_limit(scope)
+        limit = config.max_bulk_batch_size
+        return scope unless limit
+
+        current_limit = scope.limit_value
+        effective_limit = current_limit ? [ current_limit, limit ].min : limit
+        scope.limit(effective_limit)
+      end
+
+      def determine_status(enqueued_count:, failure_count:, already_enqueued_count:)
+        if enqueued_count.positive? && failure_count.zero?
+          :success
+        elsif enqueued_count.positive?
+          :partial
+        elsif already_enqueued_count.positive?
+          :partial
+        else
+          :error
+        end
+      end
+
+      def disabled_result
+        Result.new(
+          status: :error,
+          selection:,
+          attempted_count: 0,
+          enqueued_count: 0,
+          already_enqueued_count: 0,
+          failure_count: 1,
+          failure_details: { scraping_disabled: 1 },
+          messages: [ "Scraping is disabled for this source." ],
+          rate_limited: false
+        )
+      end
+
+      def invalid_selection_result
+        Result.new(
+          status: :error,
+          selection:,
+          attempted_count: 0,
+          enqueued_count: 0,
+          already_enqueued_count: 0,
+          failure_count: 1,
+          failure_details: { invalid_selection: 1 },
+          messages: [ "Invalid selection for bulk scrape." ],
+          rate_limited: false
+        )
+      end
+
+      def no_items_result
+        Result.new(
+          status: :error,
+          selection:,
+          attempted_count: 0,
+          enqueued_count: 0,
+          already_enqueued_count: 0,
+          failure_count: 1,
+          failure_details: { no_items: 1 },
+          messages: [ "No items match the selected scope." ],
+          rate_limited: false
+        )
+      end
+    end
+  end
+end
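The Result struct above is what the controller and BulkResultPresenter consume. A small, read-only sketch of calling the scraper and inspecting the outcome (assumes `source` is a SourceMonitor::Source record; local variable names are illustrative):

scraper = SourceMonitor::Scraping::BulkSourceScraper.new(source: source, selection: :unscraped)
result  = scraper.call

result.status                 # :success, :partial, or :error
result.enqueued_count         # items handed to the Enqueuer on this run
result.already_enqueued_count # items skipped because a scrape was already in flight
result.failure_details        # e.g. { rate_limited: 1 } once the per-source cap stops the loop
result.rate_limited?          # true when the run ended early at that cap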
--- /dev/null
+++ data/lib/source_monitor/scraping/enqueuer.rb
@@ -0,0 +1,125 @@
+# frozen_string_literal: true
+
+module SourceMonitor
+  module Scraping
+    # Coordinates queuing of scraping jobs while respecting source
+    # configuration and avoiding duplicate enqueues for the same item.
+    class Enqueuer
+      Result = Struct.new(:status, :message, :item, keyword_init: true) do
+        def enqueued?
+          status == :enqueued
+        end
+
+        def already_enqueued?
+          status == :already_enqueued
+        end
+
+        def failure?
+          !enqueued? && !already_enqueued?
+        end
+      end
+
+      attr_reader :item, :source, :job_class, :reason
+
+      def self.enqueue(item:, source: nil, job_class: SourceMonitor::ScrapeItemJob, reason: :manual)
+        new(item:, source:, job_class:, reason:).enqueue
+      end
+
+      def initialize(item:, source: nil, job_class: SourceMonitor::ScrapeItemJob, reason: :manual)
+        @item = item
+        @source = source || item&.source
+        @job_class = job_class
+        @reason = reason.to_sym
+      end
+
+      def enqueue
+        log("enqueue:start", item:, source:, reason: reason)
+        return failure(:missing_item, "Item could not be found.") unless item
+        return failure(:missing_source, "Item must belong to a source.") unless source
+        return failure(:scraping_disabled, "Scraping is disabled for this source.") unless source.scraping_enabled?
+        if auto_reason? && !source.auto_scrape?
+          return failure(:auto_scrape_disabled, "Automatic scraping is disabled for this source.")
+        end
+
+        already_queued = false
+        rate_limited = false
+        rate_limit_info = nil
+
+        item.with_lock do
+          item.reload
+
+          if SourceMonitor::Scraping::State.in_flight?(item.scrape_status)
+            log("enqueue:in_flight", item:, status: item.scrape_status)
+            already_queued = true
+            next
+          end
+
+          exhausted, info = rate_limit_exhausted?
+          if exhausted
+            rate_limited = true
+            rate_limit_info = info
+            next
+          end
+
+          SourceMonitor::Scraping::State.mark_pending!(item, broadcast: false, lock: false)
+        end
+
+        if already_queued
+          log("enqueue:already_enqueued", item:, status: item.scrape_status)
+          return Result.new(status: :already_enqueued, message: "Scrape already in progress for this item.", item: item)
+        end
+
+        if rate_limited
+          message = rate_limit_message(rate_limit_info)
+          log("enqueue:rate_limited", item:, limit: rate_limit_info&.fetch(:limit, nil), in_flight: rate_limit_info&.fetch(:in_flight, nil))
+          return Result.new(status: :rate_limited, message:, item: item)
+        end
+
+        job_class.perform_later(item.id)
+        log("enqueue:job_enqueued", item:, job_class: job_class.name)
+        Result.new(status: :enqueued, message: "Scrape has been enqueued for processing.", item: item)
+      end
+
+      private
+
+      def auto_reason?
+        reason == :auto
+      end
+
+      def failure(status, message)
+        log("enqueue:failure", item:, status:, message:)
+        Result.new(status:, message:, item: item)
+      end
+
+      def log(stage, item:, **extra)
+        return unless defined?(Rails) && Rails.respond_to?(:logger) && Rails.logger
+
+        payload = {
+          stage: "SourceMonitor::Scraping::Enqueuer##{stage}",
+          item_id: item&.id,
+          source_id: source&.id,
+          reason: reason
+        }.merge(extra.compact)
+        Rails.logger.info("[SourceMonitor::ManualScrape] #{payload.to_json}")
+      rescue StandardError
+        nil
+      end
+
+      def rate_limit_exhausted?
+        limit = SourceMonitor.config.scraping.max_in_flight_per_source
+        return [ false, nil ] unless limit
+
+        in_flight = source.items.where(scrape_status: SourceMonitor::Scraping::State::IN_FLIGHT_STATUSES).count
+        [ in_flight >= limit, { limit:, in_flight: in_flight } ]
+      end
+
+      def rate_limit_message(info)
+        return "Scraping queue is full for this source." unless info
+
+        limit = info[:limit]
+        in_flight = info[:in_flight]
+        "Unable to enqueue scrape: scraping queue is full for this source (#{in_flight}/#{limit} jobs in flight)."
+      end
+    end
+  end
+end
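Callers branch on the Result predicates rather than rescuing errors. A hedged sketch of a single-item enqueue (`item` is an assumed SourceMonitor::Item record; the branch bodies are placeholders):

result = SourceMonitor::Scraping::Enqueuer.enqueue(item: item, source: item.source, reason: :manual)

if result.enqueued?
  # ScrapeItemJob.perform_later(item.id) was queued and the item marked pending.
elsif result.already_enqueued?
  # A scrape for this item is already in flight; nothing new was queued.
else
  # result.status is e.g. :scraping_disabled, :auto_scrape_disabled, or :rate_limited,
  # and result.message carries the user-facing explanation.
  Rails.logger.warn(result.message)
end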
--- /dev/null
+++ data/lib/source_monitor/scraping/item_scraper/adapter_resolver.rb
@@ -0,0 +1,44 @@
+# frozen_string_literal: true
+
+module SourceMonitor
+  module Scraping
+    class ItemScraper
+      # Resolves scraper adapter classes based on configuration or engine namespace.
+      class AdapterResolver
+        VALID_NAME_PATTERN = /\A[a-z0-9_]+\z/i.freeze
+
+        def initialize(name:, source:)
+          @name = name.to_s
+          @source = source
+        end
+
+        def resolve!
+          raise_unknown!("No scraper adapter configured for source") if name.blank?
+          raise_unknown!("Invalid scraper adapter: #{name}") unless VALID_NAME_PATTERN.match?(name)
+
+          configured = SourceMonitor.config.scrapers.adapter_for(name)
+          return configured if configured
+
+          constant = resolve_constant
+          return constant if constant <= SourceMonitor::Scrapers::Base
+
+          raise_unknown!("Unknown scraper adapter: #{name}")
+        rescue NameError
+          raise_unknown!("Unknown scraper adapter: #{name}")
+        end
+
+        private
+
+        attr_reader :name, :source
+
+        def resolve_constant
+          SourceMonitor::Scrapers.const_get(name.camelize)
+        end
+
+        def raise_unknown!(message)
+          raise SourceMonitor::Scraping::ItemScraper::UnknownAdapterError, message
+        end
+      end
+    end
+  end
+end
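Resolution tries the configured adapter registry first (SourceMonitor.config.scrapers.adapter_for) and then falls back to a constant lookup under SourceMonitor::Scrapers. A hedged sketch using the bundled readability scraper from the file list above (`source` is an assumed SourceMonitor::Source record):

resolver = SourceMonitor::Scraping::ItemScraper::AdapterResolver.new(
  name: "readability",
  source: source
)

adapter_class = resolver.resolve!
# => SourceMonitor::Scrapers::Readability, assuming it subclasses Scrapers::Base as resolve! requires.
# An unregistered or malformed name raises ItemScraper::UnknownAdapterError instead.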
--- /dev/null
+++ data/lib/source_monitor/scraping/item_scraper/persistence.rb
@@ -0,0 +1,189 @@
+# frozen_string_literal: true
+
+require "active_support/core_ext/object/blank"
+require "active_support/core_ext/object/deep_dup"
+require "active_support/core_ext/hash/indifferent_access"
+
+module SourceMonitor
+  module Scraping
+    class ItemScraper
+      # Persists scrape outcomes to the database and builds a Result object.
+      class Persistence
+        def initialize(item:, source:, adapter_name:)
+          @item = item
+          @source = source
+          @adapter_name = adapter_name
+        end
+
+        def persist_success(adapter_result:, started_at:)
+          completed_at = Time.current
+          success = adapter_result.status.to_s != "failed"
+          status = normalize_status(adapter_result.status, success)
+          metadata = normalize_metadata(adapter_result.metadata)
+          http_status = metadata&.[](:http_status)
+          error_info = success ? {} : extract_error_info(metadata)
+          content_length = adapter_result.html.to_s.presence && adapter_result.html.to_s.bytesize
+
+          log = nil
+          item.class.transaction do
+            apply_item_success(status:, success:, completed_at:, adapter_result:)
+            log = build_log(
+              started_at:,
+              completed_at:,
+              duration_ms: duration_ms(started_at:, completed_at:),
+              success: success,
+              http_status: http_status,
+              content_length: content_length,
+              metadata: metadata,
+              error_class: error_info[:class],
+              error_message: error_info[:message]
+            )
+          end
+
+          SourceMonitor::Scraping::ItemScraper::Result.new(
+            status: status,
+            item: item,
+            log: log,
+            message: message_for(status, success, error_info[:message], metadata)
+          )
+        end
+
+        def persist_failure(error:, started_at:, message_override: nil)
+          raise ArgumentError, "Item does not belong to a source" unless source
+
+          completed_at = Time.current
+          message = message_override.presence || error.message.presence || "Scrape failed"
+          http_status = extract_http_status(error)
+          metadata = failure_metadata(error)
+
+          log = nil
+          item.class.transaction do
+            item.update!(scrape_status: "failed", scraped_at: completed_at)
+            log = build_log(
+              started_at:,
+              completed_at: completed_at,
+              duration_ms: duration_ms(started_at:, completed_at:),
+              success: false,
+              http_status: http_status,
+              content_length: nil,
+              metadata: metadata,
+              error_class: error.class.name,
+              error_message: message
+            )
+          end
+
+          SourceMonitor::Scraping::ItemScraper::Result.new(
+            status: :failed,
+            item: item,
+            log: log,
+            message: "Scrape failed: #{message}",
+            error: error
+          )
+        end
+
+        private
+
+        attr_reader :item, :source, :adapter_name
+
+        def apply_item_success(status:, success:, completed_at:, adapter_result:)
+          attributes = {
+            scrape_status: status.to_s,
+            scraped_at: completed_at
+          }
+
+          if success
+            attributes[:scraped_html] = adapter_result.html
+            attributes[:scraped_content] = adapter_result.content
+          end
+
+          item.update!(attributes)
+        end
+
+        def build_log(started_at:, completed_at:, duration_ms:, success:, http_status:, content_length:, metadata:, error_class:, error_message:)
+          SourceMonitor::ScrapeLog.create!(
+            source: source,
+            item: item,
+            success: success,
+            scraper_adapter: adapter_name,
+            started_at: started_at,
+            completed_at: completed_at,
+            duration_ms: duration_ms,
+            http_status: http_status,
+            content_length: content_length,
+            error_class: error_class,
+            error_message: error_message,
+            metadata: metadata
+          )
+        end
+
+        def normalize_status(raw_status, success)
+          return :success if raw_status.blank? && success
+          return :failed if raw_status.blank?
+
+          raw_status.to_sym
+        end
+
+        def normalize_metadata(metadata)
+          return {} if metadata.blank?
+
+          hash = metadata.respond_to?(:to_h) ? metadata.to_h : metadata
+          hash.deep_dup.with_indifferent_access
+        rescue StandardError
+          {}
+        end
+
+        def extract_error_info(metadata)
+          {
+            class: metadata&.[](:error)&.to_s,
+            message: metadata&.[](:message)&.to_s
+          }.compact
+        end
+
+        def failure_metadata(error)
+          {
+            error: error.class.name,
+            message: error.message,
+            backtrace: Array(error.backtrace).first(5)
+          }.compact
+        end
+
+        def extract_http_status(error)
+          return error.http_status if error.respond_to?(:http_status) && error.http_status.present?
+
+          if error.respond_to?(:response)
+            response = error.response
+            if response.is_a?(Hash)
+              return response[:status] || response["status"]
+            end
+          end
+
+          if error.message && (match = error.message.match(/\b(\d{3})\b/))
+            return match[1].to_i
+          end
+
+          nil
+        end
+
+        def duration_ms(started_at:, completed_at:)
+          return nil unless started_at && completed_at
+
+          ((completed_at - started_at) * 1000).round
+        end
+
+        def message_for(status, success, error_message, metadata)
+          return "Scrape failed: #{error_message}" if !success && error_message.present?
+
+          case status.to_s
+          when "success"
+            strategy = metadata&.[](:extraction_strategy)
+            strategy.present? ? "Scrape completed via #{strategy.to_s.titleize}" : "Scrape completed successfully"
+          when "partial"
+            "Scrape completed with partial content"
+          else
+            success ? "Scrape completed" : "Scrape failed"
+          end
+        end
+      end
+    end
+  end
+end
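A hedged sketch of how the two entry points divide the work around an adapter call. The `adapter_result` duck type (status, html, content, metadata) is inferred from what persist_success reads; the surrounding orchestration and the adapter API shown are illustrative, not necessarily how the gem's ItemScraper wires it:

persistence = SourceMonitor::Scraping::ItemScraper::Persistence.new(
  item: item,                 # assumed SourceMonitor::Item record
  source: item.source,
  adapter_name: "readability"
)

started_at = Time.current
begin
  adapter_result = adapter_class.new(item: item).call   # hypothetical adapter invocation
  result = persistence.persist_success(adapter_result: adapter_result, started_at: started_at)
rescue StandardError => error
  result = persistence.persist_failure(error: error, started_at: started_at)
end

result.message # e.g. "Scrape completed successfully" or "Scrape failed: ..."
# Either path writes a SourceMonitor::ScrapeLog row and updates the item's scrape_status.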