source_monitor 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +16 -0
- data/.rubocop.yml +12 -0
- data/.ruby-version +1 -0
- data/AGENTS.md +132 -0
- data/CHANGELOG.md +66 -0
- data/CONTRIBUTING.md +31 -0
- data/Gemfile +30 -0
- data/Gemfile.lock +411 -0
- data/MIT-LICENSE +20 -0
- data/README.md +108 -0
- data/Rakefile +8 -0
- data/app/assets/builds/.keep +0 -0
- data/app/assets/config/source_monitor_manifest.js +4 -0
- data/app/assets/images/source_monitor/.keep +0 -0
- data/app/assets/javascripts/source_monitor/application.js +20 -0
- data/app/assets/javascripts/source_monitor/controllers/async_submit_controller.js +36 -0
- data/app/assets/javascripts/source_monitor/controllers/dropdown_controller.js +109 -0
- data/app/assets/javascripts/source_monitor/controllers/modal_controller.js +56 -0
- data/app/assets/javascripts/source_monitor/controllers/notification_controller.js +53 -0
- data/app/assets/javascripts/source_monitor/turbo_actions.js +13 -0
- data/app/assets/stylesheets/source_monitor/application.tailwind.css +13 -0
- data/app/assets/svgs/source_monitor/.keep +0 -0
- data/app/controllers/concerns/.keep +0 -0
- data/app/controllers/concerns/source_monitor/sanitizes_search_params.rb +81 -0
- data/app/controllers/source_monitor/application_controller.rb +62 -0
- data/app/controllers/source_monitor/dashboard_controller.rb +27 -0
- data/app/controllers/source_monitor/fetch_logs_controller.rb +9 -0
- data/app/controllers/source_monitor/health_controller.rb +10 -0
- data/app/controllers/source_monitor/items_controller.rb +116 -0
- data/app/controllers/source_monitor/logs_controller.rb +15 -0
- data/app/controllers/source_monitor/scrape_logs_controller.rb +9 -0
- data/app/controllers/source_monitor/source_bulk_scrapes_controller.rb +35 -0
- data/app/controllers/source_monitor/source_fetches_controller.rb +22 -0
- data/app/controllers/source_monitor/source_health_checks_controller.rb +34 -0
- data/app/controllers/source_monitor/source_health_resets_controller.rb +27 -0
- data/app/controllers/source_monitor/source_retries_controller.rb +22 -0
- data/app/controllers/source_monitor/source_turbo_responses.rb +115 -0
- data/app/controllers/source_monitor/sources_controller.rb +179 -0
- data/app/helpers/source_monitor/application_helper.rb +327 -0
- data/app/jobs/source_monitor/application_job.rb +13 -0
- data/app/jobs/source_monitor/fetch_feed_job.rb +117 -0
- data/app/jobs/source_monitor/item_cleanup_job.rb +48 -0
- data/app/jobs/source_monitor/log_cleanup_job.rb +47 -0
- data/app/jobs/source_monitor/schedule_fetches_job.rb +29 -0
- data/app/jobs/source_monitor/scrape_item_job.rb +47 -0
- data/app/jobs/source_monitor/source_health_check_job.rb +77 -0
- data/app/mailers/source_monitor/application_mailer.rb +17 -0
- data/app/models/concerns/.keep +0 -0
- data/app/models/concerns/source_monitor/loggable.rb +18 -0
- data/app/models/source_monitor/application_record.rb +5 -0
- data/app/models/source_monitor/fetch_log.rb +31 -0
- data/app/models/source_monitor/health_check_log.rb +28 -0
- data/app/models/source_monitor/item.rb +102 -0
- data/app/models/source_monitor/item_content.rb +11 -0
- data/app/models/source_monitor/log_entry.rb +56 -0
- data/app/models/source_monitor/scrape_log.rb +31 -0
- data/app/models/source_monitor/source.rb +115 -0
- data/app/views/layouts/source_monitor/application.html.erb +54 -0
- data/app/views/source_monitor/dashboard/_fetch_schedule.html.erb +90 -0
- data/app/views/source_monitor/dashboard/_job_metrics.html.erb +82 -0
- data/app/views/source_monitor/dashboard/_recent_activity.html.erb +39 -0
- data/app/views/source_monitor/dashboard/_stat_card.html.erb +6 -0
- data/app/views/source_monitor/dashboard/_stats.html.erb +9 -0
- data/app/views/source_monitor/dashboard/index.html.erb +48 -0
- data/app/views/source_monitor/fetch_logs/show.html.erb +90 -0
- data/app/views/source_monitor/items/_details.html.erb +234 -0
- data/app/views/source_monitor/items/_details_wrapper.html.erb +3 -0
- data/app/views/source_monitor/items/index.html.erb +147 -0
- data/app/views/source_monitor/items/show.html.erb +3 -0
- data/app/views/source_monitor/logs/index.html.erb +208 -0
- data/app/views/source_monitor/scrape_logs/show.html.erb +73 -0
- data/app/views/source_monitor/shared/_toast.html.erb +34 -0
- data/app/views/source_monitor/sources/_bulk_scrape_form.html.erb +64 -0
- data/app/views/source_monitor/sources/_bulk_scrape_modal.html.erb +53 -0
- data/app/views/source_monitor/sources/_details.html.erb +302 -0
- data/app/views/source_monitor/sources/_details_wrapper.html.erb +3 -0
- data/app/views/source_monitor/sources/_empty_state_row.html.erb +5 -0
- data/app/views/source_monitor/sources/_fetch_interval_heatmap.html.erb +46 -0
- data/app/views/source_monitor/sources/_form.html.erb +143 -0
- data/app/views/source_monitor/sources/_health_status_badge.html.erb +46 -0
- data/app/views/source_monitor/sources/_row.html.erb +102 -0
- data/app/views/source_monitor/sources/edit.html.erb +28 -0
- data/app/views/source_monitor/sources/index.html.erb +153 -0
- data/app/views/source_monitor/sources/new.html.erb +22 -0
- data/app/views/source_monitor/sources/show.html.erb +3 -0
- data/config/coverage_baseline.json +2010 -0
- data/config/initializers/feedjira.rb +19 -0
- data/config/routes.rb +18 -0
- data/config/tailwind.config.js +17 -0
- data/db/migrate/20241008120000_create_source_monitor_sources.rb +40 -0
- data/db/migrate/20241008121000_create_source_monitor_items.rb +44 -0
- data/db/migrate/20241008122000_create_source_monitor_fetch_logs.rb +32 -0
- data/db/migrate/20241008123000_create_source_monitor_scrape_logs.rb +25 -0
- data/db/migrate/20251008183000_change_fetch_interval_to_minutes.rb +23 -0
- data/db/migrate/20251009090000_create_source_monitor_item_contents.rb +38 -0
- data/db/migrate/20251009103000_add_feed_content_readability_to_sources.rb +5 -0
- data/db/migrate/20251010090000_add_adaptive_fetching_toggle_to_sources.rb +7 -0
- data/db/migrate/20251010123000_add_deleted_at_to_source_monitor_items.rb +8 -0
- data/db/migrate/20251010153000_add_type_to_source_monitor_sources.rb +8 -0
- data/db/migrate/20251010154500_add_fetch_status_to_source_monitor_sources.rb +9 -0
- data/db/migrate/20251010160000_create_solid_cable_messages.rb +16 -0
- data/db/migrate/20251011090000_add_fetch_retry_state_to_sources.rb +14 -0
- data/db/migrate/20251012090000_add_health_fields_to_sources.rb +17 -0
- data/db/migrate/20251012100000_optimize_source_monitor_database_performance.rb +13 -0
- data/db/migrate/20251014064947_add_not_null_constraints_to_items.rb +30 -0
- data/db/migrate/20251014171659_add_performance_indexes.rb +29 -0
- data/db/migrate/20251014172525_add_fetch_status_check_constraint.rb +18 -0
- data/db/migrate/20251015100000_create_source_monitor_log_entries.rb +89 -0
- data/db/migrate/20251022100000_create_source_monitor_health_check_logs.rb +22 -0
- data/db/migrate/20251108120116_refresh_fetch_status_constraint.rb +29 -0
- data/docs/configuration.md +170 -0
- data/docs/deployment.md +63 -0
- data/docs/gh-cli-workflow.md +44 -0
- data/docs/installation.md +144 -0
- data/docs/troubleshooting.md +76 -0
- data/eslint.config.mjs +27 -0
- data/lib/generators/source_monitor/install/install_generator.rb +59 -0
- data/lib/generators/source_monitor/install/templates/source_monitor.rb.tt +155 -0
- data/lib/source_monitor/analytics/source_activity_rates.rb +53 -0
- data/lib/source_monitor/analytics/source_fetch_interval_distribution.rb +57 -0
- data/lib/source_monitor/analytics/sources_index_metrics.rb +92 -0
- data/lib/source_monitor/assets/bundler.rb +49 -0
- data/lib/source_monitor/assets.rb +6 -0
- data/lib/source_monitor/configuration.rb +654 -0
- data/lib/source_monitor/dashboard/queries.rb +356 -0
- data/lib/source_monitor/dashboard/quick_action.rb +7 -0
- data/lib/source_monitor/dashboard/quick_actions_presenter.rb +26 -0
- data/lib/source_monitor/dashboard/recent_activity.rb +30 -0
- data/lib/source_monitor/dashboard/recent_activity_presenter.rb +77 -0
- data/lib/source_monitor/dashboard/turbo_broadcaster.rb +87 -0
- data/lib/source_monitor/dashboard/upcoming_fetch_schedule.rb +126 -0
- data/lib/source_monitor/engine.rb +107 -0
- data/lib/source_monitor/events.rb +110 -0
- data/lib/source_monitor/feedjira_extensions.rb +103 -0
- data/lib/source_monitor/fetching/advisory_lock.rb +54 -0
- data/lib/source_monitor/fetching/completion/event_publisher.rb +22 -0
- data/lib/source_monitor/fetching/completion/follow_up_handler.rb +37 -0
- data/lib/source_monitor/fetching/completion/retention_handler.rb +30 -0
- data/lib/source_monitor/fetching/feed_fetcher.rb +627 -0
- data/lib/source_monitor/fetching/fetch_error.rb +88 -0
- data/lib/source_monitor/fetching/fetch_runner.rb +142 -0
- data/lib/source_monitor/fetching/retry_policy.rb +85 -0
- data/lib/source_monitor/fetching/stalled_fetch_reconciler.rb +146 -0
- data/lib/source_monitor/health/source_health_check.rb +100 -0
- data/lib/source_monitor/health/source_health_monitor.rb +210 -0
- data/lib/source_monitor/health/source_health_reset.rb +68 -0
- data/lib/source_monitor/health.rb +46 -0
- data/lib/source_monitor/http.rb +85 -0
- data/lib/source_monitor/instrumentation.rb +52 -0
- data/lib/source_monitor/items/item_creator.rb +601 -0
- data/lib/source_monitor/items/retention_pruner.rb +146 -0
- data/lib/source_monitor/items/retention_strategies/destroy.rb +26 -0
- data/lib/source_monitor/items/retention_strategies/soft_delete.rb +50 -0
- data/lib/source_monitor/items/retention_strategies.rb +9 -0
- data/lib/source_monitor/jobs/cleanup_options.rb +85 -0
- data/lib/source_monitor/jobs/fetch_failure_subscriber.rb +129 -0
- data/lib/source_monitor/jobs/solid_queue_metrics.rb +199 -0
- data/lib/source_monitor/jobs/visibility.rb +133 -0
- data/lib/source_monitor/logs/entry_sync.rb +69 -0
- data/lib/source_monitor/logs/filter_set.rb +163 -0
- data/lib/source_monitor/logs/query.rb +81 -0
- data/lib/source_monitor/logs/table_presenter.rb +161 -0
- data/lib/source_monitor/metrics.rb +77 -0
- data/lib/source_monitor/model_extensions.rb +109 -0
- data/lib/source_monitor/models/sanitizable.rb +76 -0
- data/lib/source_monitor/models/url_normalizable.rb +84 -0
- data/lib/source_monitor/pagination/paginator.rb +90 -0
- data/lib/source_monitor/realtime/adapter.rb +97 -0
- data/lib/source_monitor/realtime/broadcaster.rb +237 -0
- data/lib/source_monitor/realtime.rb +17 -0
- data/lib/source_monitor/release/changelog.rb +59 -0
- data/lib/source_monitor/release/runner.rb +73 -0
- data/lib/source_monitor/scheduler.rb +82 -0
- data/lib/source_monitor/scrapers/base.rb +105 -0
- data/lib/source_monitor/scrapers/fetchers/http_fetcher.rb +97 -0
- data/lib/source_monitor/scrapers/parsers/readability_parser.rb +101 -0
- data/lib/source_monitor/scrapers/readability.rb +156 -0
- data/lib/source_monitor/scraping/bulk_result_presenter.rb +85 -0
- data/lib/source_monitor/scraping/bulk_source_scraper.rb +233 -0
- data/lib/source_monitor/scraping/enqueuer.rb +125 -0
- data/lib/source_monitor/scraping/item_scraper/adapter_resolver.rb +44 -0
- data/lib/source_monitor/scraping/item_scraper/persistence.rb +189 -0
- data/lib/source_monitor/scraping/item_scraper.rb +84 -0
- data/lib/source_monitor/scraping/scheduler.rb +43 -0
- data/lib/source_monitor/scraping/state.rb +79 -0
- data/lib/source_monitor/security/authentication.rb +85 -0
- data/lib/source_monitor/security/parameter_sanitizer.rb +42 -0
- data/lib/source_monitor/sources/turbo_stream_presenter.rb +54 -0
- data/lib/source_monitor/turbo_streams/stream_responder.rb +95 -0
- data/lib/source_monitor/version.rb +3 -0
- data/lib/source_monitor.rb +149 -0
- data/lib/tasks/recover_stalled_fetches.rake +16 -0
- data/lib/tasks/source_monitor_assets.rake +28 -0
- data/lib/tasks/source_monitor_tasks.rake +29 -0
- data/lib/tasks/test_smoke.rake +12 -0
- data/package-lock.json +3997 -0
- data/package.json +29 -0
- data/postcss.config.js +6 -0
- data/source_monitor.gemspec +46 -0
- data/stylelint.config.js +12 -0
- metadata +469 -0
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "pathname"
|
|
4
|
+
|
|
5
|
+
module SourceMonitor
  module Release
    # Reads a Markdown changelog and exposes its newest release entry.
    #
    # The file is split into sections on lines that start with "## ". The
    # first section whose heading is not "## Release Checklist" is treated as
    # the latest entry.
    class Changelog
      MissingEntryError = Class.new(StandardError)

      # Heading of the section that is skipped when looking for the newest entry.
      CHECKLIST_HEADING = "## Release Checklist"

      # @param path [String, Pathname] location of the changelog; defaults to
      #   CHANGELOG.md three directories above this file.
      def initialize(path: default_path)
        @path = Pathname.new(path)
      end

      # Returns the newest changelog entry (heading plus body) with trailing
      # whitespace removed. The result is memoized.
      #
      # @return [String]
      # @raise [MissingEntryError] when no section other than the release
      #   checklist exists
      def latest_entry
        @latest_entry ||= begin
          sections = extract_sections
          heading = sections.keys.find { |key| key != CHECKLIST_HEADING }
          raise MissingEntryError, "Unable to find changelog entry after Release Checklist" unless heading

          # Headings are stored without their line terminator, so put the
          # newline back; otherwise the heading fuses with the first body line
          # (or swallows the blank line that follows it).
          "#{heading}\n#{sections.fetch(heading).join}".rstrip
        end
      end

      # Builds the annotation text for a release: a title line followed by the
      # latest changelog entry.
      #
      # @param version [#to_s] version being released
      # @return [String]
      # @raise [ArgumentError] when version is blank
      def annotation_for(version)
        raise ArgumentError, "version must be provided" if version.to_s.strip.empty?

        [ "SourceMonitor v#{version}", latest_entry ].join("\n\n")
      end

      private

      attr_reader :path

      def default_path
        Pathname.new(__dir__).join("..", "..", "..", "CHANGELOG.md").expand_path
      end

      # Groups the file's lines by their "## " heading. Lines before the first
      # heading are ignored; a repeated heading appends to the existing section.
      #
      # @return [Hash{String => Array<String>}] stripped heading => raw body lines
      def extract_sections
        sections = {}
        current_heading = nil

        File.foreach(path) do |line|
          if line.start_with?("## ")
            current_heading = line.strip
            sections[current_heading] ||= []
            next
          end

          next unless current_heading

          sections[current_heading] << line
        end

        sections
      end
    end
  end
end
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "tempfile"
|
|
4
|
+
require_relative "changelog"
|
|
5
|
+
|
|
6
|
+
module SourceMonitor
  module Release
    # Runs the release steps in order: quality checks, gem build, and creation
    # of an annotated git tag whose message comes from the changelog.
    #
    # Commands are executed through an injectable executor that responds to
    # +run(command, env:)+ and returns a truthy value on success.
    class Runner
      CommandFailure = Class.new(StandardError)

      QUALITY_COMMANDS = [
        [ "bin/rubocop" ],
        [ "bin/brakeman", "--no-pager" ],
        [ "bin/test-coverage" ],
        [ "bin/check-diff-coverage" ]
      ].freeze
      GEM_BUILD_COMMAND = [ "rbenv", "exec", "gem", "build", "source_monitor.gemspec" ].freeze

      # @param version [#to_s] version to tag (used as "v<version>")
      # @param executor [#run] object that runs a command array
      # @param changelog [#annotation_for] source of the tag message
      def initialize(version:, executor: Executor.new, changelog: Changelog.new)
        @version = version
        @executor = executor
        @changelog = changelog
      end

      # Executes every release step, stopping at the first failure.
      #
      # @return [true] when all steps succeed
      # @raise [ArgumentError] when the version is blank
      # @raise [CommandFailure] when any command reports failure
      def call
        validate_version!
        run_commands(QUALITY_COMMANDS)
        run_command(GEM_BUILD_COMMAND)
        create_annotated_tag
        true
      end

      private

      attr_reader :version, :executor, :changelog

      def run_commands(commands)
        commands.each do |command|
          run_command(command)
        end
      end

      # Runs one command and raises when the executor returns a falsy value.
      def run_command(command, env: {})
        success = executor.run(command, env:)
        return if success

        raise CommandFailure, "Command failed: #{command.join(' ')}"
      end

      # Writes the annotation to a temporary file and tags the release with it.
      def create_annotated_tag
        message = changelog.annotation_for(version)

        Tempfile.create([ "feed-monitor-release", ".log" ]) do |file|
          file.write(message)
          file.flush

          # git's default tag cleanup mode ("strip") removes every line that
          # starts with "#", which would delete the Markdown headings of the
          # changelog entry. "whitespace" only trims surrounding blank lines.
          run_command([ "git", "tag", "-a", "v#{version}", "--cleanup=whitespace", "-F", file.path ])
        end
      end

      def validate_version!
        raise ArgumentError, "version must be provided" if version.to_s.strip.empty?
      end

      # Default executor: runs the command with Kernel#system.
      class Executor
        # @return [true, false, nil] the result of Kernel#system
        def run(command, env: {})
          system(env, *command)
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "active_support/core_ext/numeric/time"
|
|
4
|
+
require "source_monitor/fetching/stalled_fetch_reconciler"
|
|
5
|
+
|
|
6
|
+
module SourceMonitor
  # Selects sources that are due for a fetch and enqueues a fetch for each.
  #
  # A run first asks StalledFetchReconciler to recover stalled fetches, then
  # picks active sources whose next_fetch_at is unset or not later than +now+
  # and whose fetch_status makes them eligible, and enqueues them through
  # FetchRunner. The run is instrumented as "source_monitor.scheduler.run".
  class Scheduler
    DEFAULT_BATCH_SIZE = 100
    STALE_QUEUE_TIMEOUT = 10.minutes
    ELIGIBLE_FETCH_STATUSES = %w[idle failed].freeze

    # Convenience entry point: builds a scheduler and runs it once.
    #
    # @param limit [Integer, nil] maximum number of sources to enqueue
    # @param now [Time] reference time for "due" and "stale" comparisons
    # @return [Integer] number of sources enqueued
    def self.run(limit: DEFAULT_BATCH_SIZE, now: Time.current)
      new(limit:, now:).run
    end

    def initialize(limit:, now:)
      @limit = limit
      @now = now
    end

    # Recovers stalled fetches, then enqueues every due source.
    #
    # The instrumentation payload carries :limit, :stalled_recoveries,
    # :stalled_jobs_removed, :enqueued_count and :duration_ms.
    #
    # @return [Integer] number of sources enqueued
    def run
      payload = { limit: limit }
      reconciliation = SourceMonitor::Fetching::StalledFetchReconciler.call(now:, stale_after: STALE_QUEUE_TIMEOUT)
      payload[:stalled_recoveries] = reconciliation.recovered_source_ids.size
      payload[:stalled_jobs_removed] = reconciliation.jobs_removed.size

      ActiveSupport::Notifications.instrument("source_monitor.scheduler.run", payload) do
        started_at = SourceMonitor::Instrumentation.monotonic_time
        due_ids = lock_due_source_ids
        payload[:enqueued_count] = due_ids.size

        due_ids.each { |id| SourceMonitor::Fetching::FetchRunner.enqueue(id) }

        elapsed = SourceMonitor::Instrumentation.monotonic_time - started_at
        payload[:duration_ms] = (elapsed * 1000.0).round(2)

        due_ids.size
      end
    end

    private

    attr_reader :limit, :now

    # Collects the ids of due sources inside a transaction, using
    # "FOR UPDATE SKIP LOCKED" so rows locked by another transaction are skipped.
    #
    # NOTE(review): the transaction ends before the caller enqueues anything,
    # so the row locks only cover the id lookup itself — confirm that is intended.
    def lock_due_source_ids
      collected = []

      SourceMonitor::Source.transaction do
        scope = due_sources_relation
        scope = scope.limit(limit) if limit
        collected = scope.lock("FOR UPDATE SKIP LOCKED").pluck(:id)
      end

      collected
    end

    # Active sources that are both due and in a fetchable status, ordered by
    # next_fetch_at ascending with NULL values first.
    def due_sources_relation
      SourceMonitor::Source
        .active
        .where(due_for_fetch_predicate)
        .where(fetch_status_predicate)
        .order(Arel.sql("next_fetch_at ASC NULLS FIRST"))
    end

    # next_fetch_at IS NULL OR next_fetch_at <= now
    def due_for_fetch_predicate
      next_fetch_at = SourceMonitor::Source.arel_table[:next_fetch_at]
      next_fetch_at.eq(nil).or(next_fetch_at.lteq(now))
    end

    # Status is idle/failed, or "queued"/"fetching" for at least
    # STALE_QUEUE_TIMEOUT (measured on updated_at and last_fetch_started_at
    # respectively).
    def fetch_status_predicate
      sources = SourceMonitor::Source.arel_table
      status = sources[:fetch_status]
      stale_before = now - STALE_QUEUE_TIMEOUT

      status.in(ELIGIBLE_FETCH_STATUSES)
        .or(status.eq("queued").and(sources[:updated_at].lteq(stale_before)))
        .or(status.eq("fetching").and(sources[:last_fetch_started_at].lteq(stale_before)))
    end
  end
end
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "active_support/core_ext/hash/deep_merge"
|
|
4
|
+
require "active_support/hash_with_indifferent_access"
|
|
5
|
+
require "active_support/core_ext/hash/keys"
|
|
6
|
+
|
|
7
|
+
module SourceMonitor
  module Scrapers
    # Abstract parent for the engine's content scrapers.
    #
    # == Adapter Contract
    # A subclass implements #call and returns a Result describing how the
    # scrape attempt went. Each instance is given an item, the source that
    # owns it, and a settings hash built by layering (lowest to highest
    # priority) the adapter's default settings, the source's scrape settings,
    # and the overrides passed for this invocation. Adapters are expected to
    # stay stateless and thread-safe, working through injected collaborators
    # (such as the HTTP client) rather than global configuration.
    #
    # An adapter should:
    # * Route all outbound HTTP through the supplied +http+ client.
    # * Fill Result#html and Result#content when the scrape succeeds.
    # * Report the outcome in :status as :success, :partial, or :failed.
    # * Put extra diagnostics (headers, timings, etc.) into :metadata.
    class Base
      Result = Struct.new(:status, :html, :content, :metadata, keyword_init: true)

      # Builds an adapter instance and immediately runs it.
      def self.call(item:, source:, settings: nil, http: SourceMonitor::HTTP)
        new(item: item, source: source, settings: settings, http: http).call
      end

      # Underscored class name without its namespace and without a trailing
      # "Scraper".
      def self.adapter_name
        name.demodulize.sub(/Scraper\z/, "").underscore
      end

      # Settings applied beneath source-level and per-call settings.
      # Subclasses override this; the base adapter has none.
      def self.default_settings
        {}
      end

      def initialize(item:, source:, settings: nil, http: SourceMonitor::HTTP)
        @item = item
        @source = source
        @http = http
        @settings = build_settings(settings)
      end

      # @raise [NotImplementedError] always; subclasses provide the behaviour
      def call
        raise NotImplementedError, "#{self.class.name} must implement #call"
      end

      protected

      attr_reader :item, :source, :http, :settings

      private

      # Layers defaults, source settings and (when present and hash-like) the
      # overrides, then makes the result readable with string or symbol keys.
      def build_settings(overrides)
        layers = [ self.class.default_settings, source_settings ]
        layers << overrides.to_hash if overrides.present? && overrides.respond_to?(:to_hash)

        merged = layers
          .map { |layer| normalize_settings(layer) }
          .reduce { |lower, higher| lower.deep_merge(higher) }

        deep_indifferent_access(merged)
      end

      # The source's scrape_settings as a hash, or {} when the source is nil
      # or the value is not hash-like.
      def source_settings
        raw = source&.scrape_settings
        raw.respond_to?(:to_hash) ? raw.to_hash : {}
      end

      # Recursively converts hashes (including those nested in arrays) to
      # HashWithIndifferentAccess; other values pass through unchanged.
      def deep_indifferent_access(value)
        if value.is_a?(Hash)
          value.each_with_object(ActiveSupport::HashWithIndifferentAccess.new) do |(name, nested), converted|
            converted[name] = deep_indifferent_access(nested)
          end
        elsif value.is_a?(Array)
          value.map { |entry| deep_indifferent_access(entry) }
        else
          value
        end
      end

      # Recursively stringifies hash keys (including hashes nested in arrays);
      # nil and other values pass through unchanged.
      def normalize_settings(value)
        if value.is_a?(Hash)
          value.each_pair.to_h { |name, nested| [ name.to_s, normalize_settings(nested) ] }
        elsif value.is_a?(Array)
          value.map { |entry| normalize_settings(entry) }
        else
          value
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "active_support/core_ext/object/blank"
|
|
4
|
+
|
|
5
|
+
module SourceMonitor
  module Scrapers
    module Fetchers
      # Performs a single HTTP GET through the injected HTTP client and wraps
      # the outcome (success, non-2xx status, or Faraday error) in a Result.
      class HttpFetcher
        Result = Struct.new(:status, :body, :headers, :http_status, :error, :message, keyword_init: true)

        # A standalone three-digit number in the HTTP status range (100-599).
        # Word boundaries keep it from matching inside longer numbers such as
        # "1000 ms" or port numbers.
        STATUS_CODE_PATTERN = /\b[1-5]\d{2}\b/

        # @param http [#client] factory that returns a connection responding to #get
        def initialize(http: SourceMonitor::HTTP)
          @http = http
        end

        # Fetches +url+ and reports the outcome.
        #
        # @param url [String] address to request
        # @param settings [Hash, nil] optional :headers, :timeout, :open_timeout
        #   and :proxy entries; keys may be strings or symbols
        # @return [Result] :success with body/headers for 2xx responses,
        #   otherwise :failed with error details
        def fetch(url:, settings: nil)
          response = connection(settings).get(url)

          if success_status?(response.status)
            Result.new(status: :success, body: response.body, headers: response.headers, http_status: response.status)
          else
            Result.new(
              status: :failed,
              http_status: response.status,
              error: "http_error",
              message: "Non-success HTTP status"
            )
          end
        rescue Faraday::ClientError => error
          Result.new(
            status: :failed,
            http_status: extract_status(error),
            error: error.class.name,
            message: error.message
          )
        rescue Faraday::Error => error
          Result.new(status: :failed, error: error.class.name, message: error.message)
        end

        private

        attr_reader :http

        # Builds a connection from the normalized settings, falling back to the
        # HTTP module's default timeouts.
        def connection(settings)
          normalized = normalize_settings(settings)
          http.client(
            proxy: normalized[:proxy],
            headers: normalized[:headers],
            timeout: normalized[:timeout] || SourceMonitor::HTTP::DEFAULT_TIMEOUT,
            open_timeout: normalized[:open_timeout] || SourceMonitor::HTTP::DEFAULT_OPEN_TIMEOUT
          )
        end

        # Reduces the settings to the four supported options.
        #
        # Keys are symbolized first: converting a hash subclass with string
        # keys via #to_h yields a plain string-keyed Hash, and symbol lookups
        # on that would silently drop every configured option.
        def normalize_settings(settings)
          return {} unless settings

          settings = settings.to_h if settings.respond_to?(:to_h)
          settings = symbolize_keys(settings)

          {
            headers: (settings[:headers] || {}).to_h,
            timeout: settings[:timeout],
            open_timeout: settings[:open_timeout],
            proxy: settings[:proxy].presence
          }
        end

        # Returns a copy of +hash+ whose top-level keys are symbols where
        # possible; values are left untouched.
        def symbolize_keys(hash)
          return hash unless hash.respond_to?(:each_pair)

          hash.each_pair.to_h do |key, value|
            [ key.respond_to?(:to_sym) ? key.to_sym : key, value ]
          end
        end

        def success_status?(status)
          status.to_i >= 200 && status.to_i < 300
        end

        # Best-effort lookup of an HTTP status for a failed request: the
        # error's response_status, then its response hash, then a status-like
        # number in the error message.
        def extract_status(error)
          candidates = []

          candidates << error.response_status if error.respond_to?(:response_status)

          if error.respond_to?(:response)
            response = error.response
            if response.respond_to?(:[]) && response[:status]
              candidates << response[:status]
            elsif response.is_a?(Hash)
              candidates << response["status"]
            end
          end

          if error.respond_to?(:message) && error.message
            candidates.concat(error.message.scan(STATUS_CODE_PATTERN).map(&:to_i))
          end

          candidates.compact.first
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "readability"
|
|
4
|
+
require "nokolexbor"
|
|
5
|
+
require "active_support/core_ext/object/blank"
|
|
6
|
+
|
|
7
|
+
module SourceMonitor
  module Scrapers
    module Parsers
      # Extracts article content and a title from an HTML string.
      #
      # Content comes from the configured CSS selectors when they match
      # something; otherwise it falls back to the Readability document's
      # content. Any StandardError raised along the way is reported as a
      # :failed Result rather than propagated.
      class ReadabilityParser
        Result = Struct.new(:status, :content, :strategy, :title, :metadata, keyword_init: true)

        # @param html [String] markup to parse
        # @param selectors [Hash, nil] optional :content and :title CSS selectors
        #   (a single selector or a list)
        # @param readability [Hash, nil] options passed to Readability::Document
        # @return [Result] :success when content was found, :partial when it
        #   was not, :failed when an error was raised
        def parse(html:, selectors: nil, readability: nil)
          document = ::Nokolexbor::HTML(html)
          selector_map = normalize_hash(selectors)
          readability_settings = normalize_hash(readability)

          selected_html = extract_with_selectors(document, selector_map[:content])
          used_selectors = selected_html.present?

          readability_doc = build_readability_document(html, readability_settings)
          body = used_selectors ? selected_html : readability_doc.content&.strip

          title = extract_title(document, selector_map[:title], readability_doc)
          details =
            if readability_doc.respond_to?(:content_length)
              { readability_text_length: readability_doc.content_length }
            else
              {}
            end

          Result.new(
            status: body.present? ? :success : :partial,
            content: body.presence,
            strategy: used_selectors ? :selectors : :readability,
            title: title,
            metadata: details.compact
          )
        rescue StandardError => error
          Result.new(
            status: :failed,
            content: nil,
            strategy: :readability,
            title: nil,
            metadata: { error: error.class.name, message: error.message }
          )
        end

        private

        # Returns a symbol-keyed copy of a hash-like value, or {} for nil/false.
        def normalize_hash(value)
          return {} unless value

          pairs = value.respond_to?(:to_h) ? value.to_h : value
          pairs.each_with_object({}) { |(name, setting), symbolized| symbolized[name.to_sym] = setting }
        end

        # Joins the HTML of every node matched by the given selectors with
        # newlines. Returns nil when no selector matches anything.
        def extract_with_selectors(document, selectors)
          matches = Array(selectors)
            .reject(&:blank?)
            .map { |selector| document.css(selector.to_s) }
            .reject(&:empty?)
          return if matches.empty?

          matches.map { |nodes| nodes.map(&:to_html).join("\n") }.join("\n")
        end

        def build_readability_document(html, options)
          ::Readability::Document.new(html, options.transform_keys(&:to_sym))
        end

        # Title lookup order: first selector whose node has text, then the
        # Readability document's title, then the page's <title> element.
        def extract_title(document, selectors, readability_doc)
          selected_text = Array(selectors)
            .lazy
            .reject(&:blank?)
            .filter_map { |selector| document.at_css(selector.to_s)&.text }
            .find(&:present?)
          return selected_text.strip if selected_text

          if readability_doc.respond_to?(:title)
            readable_title = readability_doc.title&.strip
            return readable_title if readable_title.present?
          end

          document.at_css("title")&.text&.strip
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "active_support/core_ext/object/blank"
|
|
4
|
+
|
|
5
|
+
require "source_monitor/scrapers/fetchers/http_fetcher"
|
|
6
|
+
require "source_monitor/scrapers/parsers/readability_parser"
|
|
7
|
+
|
|
8
|
+
module SourceMonitor
  module Scrapers
    # Scraper adapter that downloads an item's page with HttpFetcher and
    # extracts its content with ReadabilityParser.
    class Readability < Base
      DEFAULT_ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
      FETCHER_CLASS = SourceMonitor::Scrapers::Fetchers::HttpFetcher
      PARSER_CLASS = SourceMonitor::Scrapers::Parsers::ReadabilityParser

      # Baseline settings, grouped into :http, :selectors and :readability.
      def self.default_settings
        http_defaults = {
          headers: {
            "Accept" => DEFAULT_ACCEPT,
            "User-Agent" => SourceMonitor::HTTP::DEFAULT_USER_AGENT
          },
          timeout: SourceMonitor::HTTP::DEFAULT_TIMEOUT,
          open_timeout: SourceMonitor::HTTP::DEFAULT_OPEN_TIMEOUT,
          proxy: nil
        }
        readability_defaults = {
          remove_unlikely_candidates: true,
          clean_conditionally: true,
          retry_length: 250,
          min_text_length: 25
        }

        {
          http: http_defaults,
          selectors: { content: nil, title: nil },
          readability: readability_defaults
        }
      end

      # Fetches the item's URL and parses the response.
      #
      # @return [Base::Result] carries the parser's status on success; :failed
      #   when the URL is missing, the fetch or parse fails, or any
      #   StandardError is raised
      def call
        url = preferred_url
        if url.blank?
          failure_result("missing_url", "No URL available for scraping", url:)
        else
          scrape(url)
        end
      rescue StandardError => error
        failure_result(error.class.name, error.message, url: url)
      end

      private

      # Fetch-then-parse pipeline for a known URL.
      def scrape(url)
        fetched = fetcher.fetch(url:, settings: settings[:http])
        return build_fetch_failure(fetched, url) if fetched.status == :failed

        parsed = parser.parse(
          html: fetched.body.to_s,
          selectors: settings[:selectors],
          readability: settings[:readability]
        )
        return build_parser_failure(parsed, fetched, url) if parsed.status == :failed

        Result.new(
          status: parsed.status,
          html: fetched.body,
          content: parsed.content,
          metadata: build_metadata(fetch_result: fetched, parser_result: parsed, url:)
        )
      end

      # The item's canonical URL when present, otherwise its plain URL.
      def preferred_url
        item.canonical_url.presence || item.url
      end

      def fetcher
        @fetcher ||= FETCHER_CLASS.new(http: http)
      end

      def parser
        @parser ||= PARSER_CLASS.new
      end

      def build_fetch_failure(fetch_result, url)
        failure_result(
          fetch_result.error || "fetch_error",
          fetch_result.message || "Failed to fetch URL",
          url: url,
          http_status: fetch_result.http_status
        )
      end

      # Failed Result that keeps the fetched HTML alongside the parser's error.
      def build_parser_failure(parser_result, fetch_result, url)
        parser_details = parser_result.metadata
        failure_details = {
          error: parser_details&.[](:error) || "parser_error",
          message: parser_details&.[](:message) || "Failed to parse content",
          url: url,
          http_status: fetch_result.http_status
        }.compact

        Result.new(status: :failed, html: fetch_result.body, content: nil, metadata: failure_details)
      end

      # Diagnostics stored with a successful scrape; nil entries are dropped.
      def build_metadata(fetch_result:, parser_result:, url:)
        response_headers = fetch_result.headers || {}

        details = {
          url: url,
          http_status: fetch_result.http_status,
          content_type: response_headers["content-type"] || response_headers["Content-Type"],
          extraction_strategy: parser_result.strategy,
          title: parser_result.title,
          settings: deep_duplicate(settings)
        }.compact

        parser_details = parser_result.metadata
        text_length = parser_details && parser_details[:readability_text_length]
        details[:readability_text_length] = text_length if text_length

        details
      end

      def failure_result(error, message, url:, http_status: nil)
        details = {
          error: error,
          message: message,
          url: url,
          http_status: derive_status(message, http_status)
        }

        Result.new(status: :failed, html: nil, content: nil, metadata: details.compact)
      end

      # Uses the explicit status when given; otherwise looks for
      # "status <3 digits>" in the message.
      def derive_status(message, explicit_status)
        return explicit_status if explicit_status
        return unless message

        found = message.match(/status\s+(\d{3})/)
        found && found[1].to_i
      end

      # Rebuilds nested hashes and arrays as plain Hash/Array copies; leaf
      # values are reused, not duplicated.
      def deep_duplicate(value)
        if value.is_a?(Hash)
          value.each_pair.to_h { |name, nested| [ name, deep_duplicate(nested) ] }
        elsif value.is_a?(Array)
          value.map { |entry| deep_duplicate(entry) }
        else
          value
        end
      end
    end
  end
end
|