source_monitor 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +16 -0
  3. data/.rubocop.yml +12 -0
  4. data/.ruby-version +1 -0
  5. data/AGENTS.md +132 -0
  6. data/CHANGELOG.md +66 -0
  7. data/CONTRIBUTING.md +31 -0
  8. data/Gemfile +30 -0
  9. data/Gemfile.lock +411 -0
  10. data/MIT-LICENSE +20 -0
  11. data/README.md +108 -0
  12. data/Rakefile +8 -0
  13. data/app/assets/builds/.keep +0 -0
  14. data/app/assets/config/source_monitor_manifest.js +4 -0
  15. data/app/assets/images/source_monitor/.keep +0 -0
  16. data/app/assets/javascripts/source_monitor/application.js +20 -0
  17. data/app/assets/javascripts/source_monitor/controllers/async_submit_controller.js +36 -0
  18. data/app/assets/javascripts/source_monitor/controllers/dropdown_controller.js +109 -0
  19. data/app/assets/javascripts/source_monitor/controllers/modal_controller.js +56 -0
  20. data/app/assets/javascripts/source_monitor/controllers/notification_controller.js +53 -0
  21. data/app/assets/javascripts/source_monitor/turbo_actions.js +13 -0
  22. data/app/assets/stylesheets/source_monitor/application.tailwind.css +13 -0
  23. data/app/assets/svgs/source_monitor/.keep +0 -0
  24. data/app/controllers/concerns/.keep +0 -0
  25. data/app/controllers/concerns/source_monitor/sanitizes_search_params.rb +81 -0
  26. data/app/controllers/source_monitor/application_controller.rb +62 -0
  27. data/app/controllers/source_monitor/dashboard_controller.rb +27 -0
  28. data/app/controllers/source_monitor/fetch_logs_controller.rb +9 -0
  29. data/app/controllers/source_monitor/health_controller.rb +10 -0
  30. data/app/controllers/source_monitor/items_controller.rb +116 -0
  31. data/app/controllers/source_monitor/logs_controller.rb +15 -0
  32. data/app/controllers/source_monitor/scrape_logs_controller.rb +9 -0
  33. data/app/controllers/source_monitor/source_bulk_scrapes_controller.rb +35 -0
  34. data/app/controllers/source_monitor/source_fetches_controller.rb +22 -0
  35. data/app/controllers/source_monitor/source_health_checks_controller.rb +34 -0
  36. data/app/controllers/source_monitor/source_health_resets_controller.rb +27 -0
  37. data/app/controllers/source_monitor/source_retries_controller.rb +22 -0
  38. data/app/controllers/source_monitor/source_turbo_responses.rb +115 -0
  39. data/app/controllers/source_monitor/sources_controller.rb +179 -0
  40. data/app/helpers/source_monitor/application_helper.rb +327 -0
  41. data/app/jobs/source_monitor/application_job.rb +13 -0
  42. data/app/jobs/source_monitor/fetch_feed_job.rb +117 -0
  43. data/app/jobs/source_monitor/item_cleanup_job.rb +48 -0
  44. data/app/jobs/source_monitor/log_cleanup_job.rb +47 -0
  45. data/app/jobs/source_monitor/schedule_fetches_job.rb +29 -0
  46. data/app/jobs/source_monitor/scrape_item_job.rb +47 -0
  47. data/app/jobs/source_monitor/source_health_check_job.rb +77 -0
  48. data/app/mailers/source_monitor/application_mailer.rb +17 -0
  49. data/app/models/concerns/.keep +0 -0
  50. data/app/models/concerns/source_monitor/loggable.rb +18 -0
  51. data/app/models/source_monitor/application_record.rb +5 -0
  52. data/app/models/source_monitor/fetch_log.rb +31 -0
  53. data/app/models/source_monitor/health_check_log.rb +28 -0
  54. data/app/models/source_monitor/item.rb +102 -0
  55. data/app/models/source_monitor/item_content.rb +11 -0
  56. data/app/models/source_monitor/log_entry.rb +56 -0
  57. data/app/models/source_monitor/scrape_log.rb +31 -0
  58. data/app/models/source_monitor/source.rb +115 -0
  59. data/app/views/layouts/source_monitor/application.html.erb +54 -0
  60. data/app/views/source_monitor/dashboard/_fetch_schedule.html.erb +90 -0
  61. data/app/views/source_monitor/dashboard/_job_metrics.html.erb +82 -0
  62. data/app/views/source_monitor/dashboard/_recent_activity.html.erb +39 -0
  63. data/app/views/source_monitor/dashboard/_stat_card.html.erb +6 -0
  64. data/app/views/source_monitor/dashboard/_stats.html.erb +9 -0
  65. data/app/views/source_monitor/dashboard/index.html.erb +48 -0
  66. data/app/views/source_monitor/fetch_logs/show.html.erb +90 -0
  67. data/app/views/source_monitor/items/_details.html.erb +234 -0
  68. data/app/views/source_monitor/items/_details_wrapper.html.erb +3 -0
  69. data/app/views/source_monitor/items/index.html.erb +147 -0
  70. data/app/views/source_monitor/items/show.html.erb +3 -0
  71. data/app/views/source_monitor/logs/index.html.erb +208 -0
  72. data/app/views/source_monitor/scrape_logs/show.html.erb +73 -0
  73. data/app/views/source_monitor/shared/_toast.html.erb +34 -0
  74. data/app/views/source_monitor/sources/_bulk_scrape_form.html.erb +64 -0
  75. data/app/views/source_monitor/sources/_bulk_scrape_modal.html.erb +53 -0
  76. data/app/views/source_monitor/sources/_details.html.erb +302 -0
  77. data/app/views/source_monitor/sources/_details_wrapper.html.erb +3 -0
  78. data/app/views/source_monitor/sources/_empty_state_row.html.erb +5 -0
  79. data/app/views/source_monitor/sources/_fetch_interval_heatmap.html.erb +46 -0
  80. data/app/views/source_monitor/sources/_form.html.erb +143 -0
  81. data/app/views/source_monitor/sources/_health_status_badge.html.erb +46 -0
  82. data/app/views/source_monitor/sources/_row.html.erb +102 -0
  83. data/app/views/source_monitor/sources/edit.html.erb +28 -0
  84. data/app/views/source_monitor/sources/index.html.erb +153 -0
  85. data/app/views/source_monitor/sources/new.html.erb +22 -0
  86. data/app/views/source_monitor/sources/show.html.erb +3 -0
  87. data/config/coverage_baseline.json +2010 -0
  88. data/config/initializers/feedjira.rb +19 -0
  89. data/config/routes.rb +18 -0
  90. data/config/tailwind.config.js +17 -0
  91. data/db/migrate/20241008120000_create_source_monitor_sources.rb +40 -0
  92. data/db/migrate/20241008121000_create_source_monitor_items.rb +44 -0
  93. data/db/migrate/20241008122000_create_source_monitor_fetch_logs.rb +32 -0
  94. data/db/migrate/20241008123000_create_source_monitor_scrape_logs.rb +25 -0
  95. data/db/migrate/20251008183000_change_fetch_interval_to_minutes.rb +23 -0
  96. data/db/migrate/20251009090000_create_source_monitor_item_contents.rb +38 -0
  97. data/db/migrate/20251009103000_add_feed_content_readability_to_sources.rb +5 -0
  98. data/db/migrate/20251010090000_add_adaptive_fetching_toggle_to_sources.rb +7 -0
  99. data/db/migrate/20251010123000_add_deleted_at_to_source_monitor_items.rb +8 -0
  100. data/db/migrate/20251010153000_add_type_to_source_monitor_sources.rb +8 -0
  101. data/db/migrate/20251010154500_add_fetch_status_to_source_monitor_sources.rb +9 -0
  102. data/db/migrate/20251010160000_create_solid_cable_messages.rb +16 -0
  103. data/db/migrate/20251011090000_add_fetch_retry_state_to_sources.rb +14 -0
  104. data/db/migrate/20251012090000_add_health_fields_to_sources.rb +17 -0
  105. data/db/migrate/20251012100000_optimize_source_monitor_database_performance.rb +13 -0
  106. data/db/migrate/20251014064947_add_not_null_constraints_to_items.rb +30 -0
  107. data/db/migrate/20251014171659_add_performance_indexes.rb +29 -0
  108. data/db/migrate/20251014172525_add_fetch_status_check_constraint.rb +18 -0
  109. data/db/migrate/20251015100000_create_source_monitor_log_entries.rb +89 -0
  110. data/db/migrate/20251022100000_create_source_monitor_health_check_logs.rb +22 -0
  111. data/db/migrate/20251108120116_refresh_fetch_status_constraint.rb +29 -0
  112. data/docs/configuration.md +170 -0
  113. data/docs/deployment.md +63 -0
  114. data/docs/gh-cli-workflow.md +44 -0
  115. data/docs/installation.md +144 -0
  116. data/docs/troubleshooting.md +76 -0
  117. data/eslint.config.mjs +27 -0
  118. data/lib/generators/source_monitor/install/install_generator.rb +59 -0
  119. data/lib/generators/source_monitor/install/templates/source_monitor.rb.tt +155 -0
  120. data/lib/source_monitor/analytics/source_activity_rates.rb +53 -0
  121. data/lib/source_monitor/analytics/source_fetch_interval_distribution.rb +57 -0
  122. data/lib/source_monitor/analytics/sources_index_metrics.rb +92 -0
  123. data/lib/source_monitor/assets/bundler.rb +49 -0
  124. data/lib/source_monitor/assets.rb +6 -0
  125. data/lib/source_monitor/configuration.rb +654 -0
  126. data/lib/source_monitor/dashboard/queries.rb +356 -0
  127. data/lib/source_monitor/dashboard/quick_action.rb +7 -0
  128. data/lib/source_monitor/dashboard/quick_actions_presenter.rb +26 -0
  129. data/lib/source_monitor/dashboard/recent_activity.rb +30 -0
  130. data/lib/source_monitor/dashboard/recent_activity_presenter.rb +77 -0
  131. data/lib/source_monitor/dashboard/turbo_broadcaster.rb +87 -0
  132. data/lib/source_monitor/dashboard/upcoming_fetch_schedule.rb +126 -0
  133. data/lib/source_monitor/engine.rb +107 -0
  134. data/lib/source_monitor/events.rb +110 -0
  135. data/lib/source_monitor/feedjira_extensions.rb +103 -0
  136. data/lib/source_monitor/fetching/advisory_lock.rb +54 -0
  137. data/lib/source_monitor/fetching/completion/event_publisher.rb +22 -0
  138. data/lib/source_monitor/fetching/completion/follow_up_handler.rb +37 -0
  139. data/lib/source_monitor/fetching/completion/retention_handler.rb +30 -0
  140. data/lib/source_monitor/fetching/feed_fetcher.rb +627 -0
  141. data/lib/source_monitor/fetching/fetch_error.rb +88 -0
  142. data/lib/source_monitor/fetching/fetch_runner.rb +142 -0
  143. data/lib/source_monitor/fetching/retry_policy.rb +85 -0
  144. data/lib/source_monitor/fetching/stalled_fetch_reconciler.rb +146 -0
  145. data/lib/source_monitor/health/source_health_check.rb +100 -0
  146. data/lib/source_monitor/health/source_health_monitor.rb +210 -0
  147. data/lib/source_monitor/health/source_health_reset.rb +68 -0
  148. data/lib/source_monitor/health.rb +46 -0
  149. data/lib/source_monitor/http.rb +85 -0
  150. data/lib/source_monitor/instrumentation.rb +52 -0
  151. data/lib/source_monitor/items/item_creator.rb +601 -0
  152. data/lib/source_monitor/items/retention_pruner.rb +146 -0
  153. data/lib/source_monitor/items/retention_strategies/destroy.rb +26 -0
  154. data/lib/source_monitor/items/retention_strategies/soft_delete.rb +50 -0
  155. data/lib/source_monitor/items/retention_strategies.rb +9 -0
  156. data/lib/source_monitor/jobs/cleanup_options.rb +85 -0
  157. data/lib/source_monitor/jobs/fetch_failure_subscriber.rb +129 -0
  158. data/lib/source_monitor/jobs/solid_queue_metrics.rb +199 -0
  159. data/lib/source_monitor/jobs/visibility.rb +133 -0
  160. data/lib/source_monitor/logs/entry_sync.rb +69 -0
  161. data/lib/source_monitor/logs/filter_set.rb +163 -0
  162. data/lib/source_monitor/logs/query.rb +81 -0
  163. data/lib/source_monitor/logs/table_presenter.rb +161 -0
  164. data/lib/source_monitor/metrics.rb +77 -0
  165. data/lib/source_monitor/model_extensions.rb +109 -0
  166. data/lib/source_monitor/models/sanitizable.rb +76 -0
  167. data/lib/source_monitor/models/url_normalizable.rb +84 -0
  168. data/lib/source_monitor/pagination/paginator.rb +90 -0
  169. data/lib/source_monitor/realtime/adapter.rb +97 -0
  170. data/lib/source_monitor/realtime/broadcaster.rb +237 -0
  171. data/lib/source_monitor/realtime.rb +17 -0
  172. data/lib/source_monitor/release/changelog.rb +59 -0
  173. data/lib/source_monitor/release/runner.rb +73 -0
  174. data/lib/source_monitor/scheduler.rb +82 -0
  175. data/lib/source_monitor/scrapers/base.rb +105 -0
  176. data/lib/source_monitor/scrapers/fetchers/http_fetcher.rb +97 -0
  177. data/lib/source_monitor/scrapers/parsers/readability_parser.rb +101 -0
  178. data/lib/source_monitor/scrapers/readability.rb +156 -0
  179. data/lib/source_monitor/scraping/bulk_result_presenter.rb +85 -0
  180. data/lib/source_monitor/scraping/bulk_source_scraper.rb +233 -0
  181. data/lib/source_monitor/scraping/enqueuer.rb +125 -0
  182. data/lib/source_monitor/scraping/item_scraper/adapter_resolver.rb +44 -0
  183. data/lib/source_monitor/scraping/item_scraper/persistence.rb +189 -0
  184. data/lib/source_monitor/scraping/item_scraper.rb +84 -0
  185. data/lib/source_monitor/scraping/scheduler.rb +43 -0
  186. data/lib/source_monitor/scraping/state.rb +79 -0
  187. data/lib/source_monitor/security/authentication.rb +85 -0
  188. data/lib/source_monitor/security/parameter_sanitizer.rb +42 -0
  189. data/lib/source_monitor/sources/turbo_stream_presenter.rb +54 -0
  190. data/lib/source_monitor/turbo_streams/stream_responder.rb +95 -0
  191. data/lib/source_monitor/version.rb +3 -0
  192. data/lib/source_monitor.rb +149 -0
  193. data/lib/tasks/recover_stalled_fetches.rake +16 -0
  194. data/lib/tasks/source_monitor_assets.rake +28 -0
  195. data/lib/tasks/source_monitor_tasks.rake +29 -0
  196. data/lib/tasks/test_smoke.rake +12 -0
  197. data/package-lock.json +3997 -0
  198. data/package.json +29 -0
  199. data/postcss.config.js +6 -0
  200. data/source_monitor.gemspec +46 -0
  201. data/stylelint.config.js +12 -0
  202. metadata +469 -0
@@ -0,0 +1,59 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "pathname"
4
+
5
module SourceMonitor
  module Release
    # Reads a Markdown changelog and exposes the first "## " section that is
    # not the release checklist, for use as release notes.
    class Changelog
      MissingEntryError = Class.new(StandardError)

      # Heading of the section that is skipped when looking for release notes.
      RELEASE_CHECKLIST_HEADING = "## Release Checklist"

      # @param path [String, Pathname] location of the changelog file; defaults
      #   to CHANGELOG.md three directories above this file.
      def initialize(path: default_path)
        @path = Pathname.new(path)
      end

      # Returns the first non-checklist section (heading plus body) with
      # trailing whitespace removed. The result is memoized.
      #
      # @return [String]
      # @raise [MissingEntryError] when no such section exists
      def latest_entry
        @latest_entry ||= begin
          sections = extract_sections
          heading = sections.keys.find { |key| key != RELEASE_CHECKLIST_HEADING }
          raise MissingEntryError, "Unable to find changelog entry after Release Checklist" unless heading

          body = sections.fetch(heading).join
          # The stored heading has had its line ending stripped. Re-insert one
          # unless the body already begins with a line break, otherwise the
          # heading would be fused with the first body line.
          separator = body.empty? || body.start_with?("\n", "\r\n") ? "" : "\n"
          "#{heading}#{separator}#{body}".rstrip
        end
      end

      # Builds the text used to annotate a release: a title line for the
      # version followed by the latest changelog entry.
      #
      # @param version [#to_s]
      # @return [String]
      # @raise [ArgumentError] when version is blank
      def annotation_for(version)
        raise ArgumentError, "version must be provided" if version.to_s.strip.empty?

        [ "SourceMonitor v#{version}", latest_entry ].join("\n\n")
      end

      private

      attr_reader :path

      def default_path
        Pathname.new(__dir__).join("..", "..", "..", "CHANGELOG.md").expand_path
      end

      # Groups the file's lines by their preceding "## " heading. Lines that
      # appear before the first heading are ignored; a repeated heading
      # appends to the lines already collected for it.
      #
      # @return [Hash{String => Array<String>}] stripped heading => raw lines
      def extract_sections
        sections = {}
        current_heading = nil

        File.foreach(path) do |line|
          if line.start_with?("## ")
            current_heading = line.strip
            sections[current_heading] ||= []
            next
          end

          next unless current_heading

          sections[current_heading] << line
        end

        sections
      end
    end
  end
end
@@ -0,0 +1,73 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "tempfile"
4
+ require_relative "changelog"
5
+
6
module SourceMonitor
  module Release
    # Runs the release steps in order: quality commands, gem build, then an
    # annotated git tag whose message is supplied by the changelog object.
    class Runner
      CommandFailure = Class.new(StandardError)

      QUALITY_COMMANDS = [
        [ "bin/rubocop" ],
        [ "bin/brakeman", "--no-pager" ],
        [ "bin/test-coverage" ],
        [ "bin/check-diff-coverage" ]
      ].freeze
      GEM_BUILD_COMMAND = [ "rbenv", "exec", "gem", "build", "source_monitor.gemspec" ].freeze

      # @param version [#to_s] version being released
      # @param executor [#run] object that runs a command array and returns a
      #   truthy value on success
      # @param changelog [#annotation_for] provider of the tag message
      def initialize(version:, executor: Executor.new, changelog: Changelog.new)
        @version = version
        @executor = executor
        @changelog = changelog
      end

      # @return [true] when every step succeeded
      # @raise [ArgumentError] when the version is blank
      # @raise [CommandFailure] when the executor reports a failed command
      def call
        validate_version!
        run_commands(QUALITY_COMMANDS)
        run_command(GEM_BUILD_COMMAND)
        create_annotated_tag
        true
      end

      private

      attr_reader :version, :executor, :changelog

      def run_commands(commands)
        commands.each { |argv| run_command(argv) }
      end

      # Delegates to the executor and raises as soon as it reports failure.
      def run_command(command, env: {})
        raise CommandFailure, "Command failed: #{command.join(' ')}" unless executor.run(command, env: env)
      end

      # Writes the annotation to a temporary file and hands that file to
      # `git tag -F`; the file only exists for the duration of the block.
      def create_annotated_tag
        annotation = changelog.annotation_for(version)

        Tempfile.create([ "feed-monitor-release", ".log" ]) do |message_file|
          message_file.write(annotation)
          message_file.flush
          message_file.rewind

          tag_command = [ "git", "tag", "-a", "v#{version}", "-F", message_file.path ]
          run_command(tag_command)
        end
      end

      def validate_version!
        return unless version.to_s.strip.empty?

        raise ArgumentError, "version must be provided"
      end

      # Default executor: runs the command through Kernel#system.
      class Executor
        def run(command, env: {})
          system(env, *command)
        end
      end
    end
  end
end
@@ -0,0 +1,82 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "active_support/core_ext/numeric/time"
4
+ require "source_monitor/fetching/stalled_fetch_reconciler"
5
+
6
module SourceMonitor
  # Selects sources that are due for a fetch and enqueues a fetch for each,
  # after first asking the stalled-fetch reconciler to run.
  class Scheduler
    DEFAULT_BATCH_SIZE = 100
    STALE_QUEUE_TIMEOUT = 10.minutes
    ELIGIBLE_FETCH_STATUSES = %w[idle failed].freeze

    # Convenience entry point: builds a scheduler and runs it once.
    #
    # @return [Integer] number of sources enqueued
    def self.run(limit: DEFAULT_BATCH_SIZE, now: Time.current)
      new(limit: limit, now: now).run
    end

    # @param limit [Integer, nil] maximum number of sources per run; a falsy
    #   value disables the limit
    # @param now [Time] reference time for "due" and "stale" comparisons
    def initialize(limit:, now:)
      @limit = limit
      @now = now
    end

    # Reconciles stalled fetches, then enqueues due sources inside a
    # "source_monitor.scheduler.run" notification. The payload hash is filled
    # in while the block runs.
    #
    # @return [Integer] number of sources enqueued
    def run
      payload = { limit: limit }
      recovery = SourceMonitor::Fetching::StalledFetchReconciler.call(now: now, stale_after: STALE_QUEUE_TIMEOUT)
      payload[:stalled_recoveries] = recovery.recovered_source_ids.size
      payload[:stalled_jobs_removed] = recovery.jobs_removed.size

      ActiveSupport::Notifications.instrument("source_monitor.scheduler.run", payload) do
        started_at = SourceMonitor::Instrumentation.monotonic_time
        due_ids = lock_due_source_ids
        payload[:enqueued_count] = due_ids.size

        due_ids.each { |due_id| SourceMonitor::Fetching::FetchRunner.enqueue(due_id) }

        elapsed = SourceMonitor::Instrumentation.monotonic_time - started_at
        payload[:duration_ms] = (elapsed * 1000.0).round(2)

        due_ids.size
      end
    end

    private

    attr_reader :limit, :now

    # Plucks the ids of due sources inside a transaction, using a
    # "FOR UPDATE SKIP LOCKED" row lock on the query.
    def lock_due_source_ids
      selected = []

      SourceMonitor::Source.transaction do
        scope = due_sources_relation
        scope = scope.limit(limit) if limit
        selected = scope.lock("FOR UPDATE SKIP LOCKED").pluck(:id)
      end

      selected
    end

    # Active sources that are due and in a schedulable state, ordered by
    # next_fetch_at with NULLs first.
    def due_sources_relation
      SourceMonitor::Source
        .active
        .where(due_for_fetch_predicate)
        .where(fetch_status_predicate)
        .order(Arel.sql("next_fetch_at ASC NULLS FIRST"))
    end

    # next_fetch_at is NULL or not later than +now+.
    def due_for_fetch_predicate
      next_fetch_at = SourceMonitor::Source.arel_table[:next_fetch_at]
      next_fetch_at.eq(nil).or(next_fetch_at.lteq(now))
    end

    # fetch_status is eligible outright, or the source has been "queued" /
    # "fetching" since before the stale cutoff.
    def fetch_status_predicate
      sources = SourceMonitor::Source.arel_table
      status = sources[:fetch_status]

      eligible = status.in(ELIGIBLE_FETCH_STATUSES)
      cutoff = now - STALE_QUEUE_TIMEOUT
      stale_queued = status.eq("queued").and(sources[:updated_at].lteq(cutoff))
      stale_fetching = status.eq("fetching").and(sources[:last_fetch_started_at].lteq(cutoff))

      eligible.or(stale_queued).or(stale_fetching)
    end
  end
end
@@ -0,0 +1,105 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "active_support/core_ext/hash/deep_merge"
4
+ require "active_support/hash_with_indifferent_access"
5
+ require "active_support/core_ext/hash/keys"
6
+
7
module SourceMonitor
  module Scrapers
    # Base class for content scrapers used by the engine.
    #
    # == Adapter Contract
    # Subclasses must implement #call and return a Result object describing the
    # outcome of a scrape attempt. Implementations receive an item, the owning
    # source, and a normalized settings hash that merges default adapter
    # settings, source-level overrides, and per-invocation overrides. All
    # adapters should remain stateless and thread-safe, relying on injected
    # collaborators (e.g. HTTP clients) instead of global configuration.
    #
    # Adapters should:
    # * Perform any outbound HTTP work using the provided +http+ client.
    # * Populate the Result with :html and :content payloads when successful.
    # * Use :status to communicate :success, :partial, or :failed outcomes.
    # * Capture additional diagnostics (headers, timings, etc.) in :metadata.
    class Base
      Result = Struct.new(:status, :html, :content, :metadata, keyword_init: true)

      # Instantiates the adapter and immediately runs it.
      def self.call(item:, source:, settings: nil, http: SourceMonitor::HTTP)
        new(item: item, source: source, settings: settings, http: http).call
      end

      # Underscored class name without namespace or a trailing "Scraper".
      def self.adapter_name
        name.demodulize.sub(/Scraper\z/, "").underscore
      end

      # Adapter-level defaults; subclasses override this.
      def self.default_settings
        {}
      end

      def initialize(item:, source:, settings: nil, http: SourceMonitor::HTTP)
        @item = item
        @source = source
        @http = http
        # Built last because it reads the source's scrape settings.
        @settings = build_settings(settings)
      end

      def call
        raise NotImplementedError, "#{self.class.name} must implement #call"
      end

      protected

      attr_reader :item, :source, :http, :settings

      private

      # Layers defaults, then source settings, then per-call overrides, and
      # returns the result with indifferent access at every hash level.
      def build_settings(overrides)
        defaults = normalize_settings(self.class.default_settings)
        merged = defaults.deep_merge(normalize_settings(source_settings))

        if overrides.present? && overrides.respond_to?(:to_hash)
          merged = merged.deep_merge(normalize_settings(overrides.to_hash))
        end

        deep_indifferent_access(merged)
      end

      # The source's scrape settings as a hash, or {} when unavailable.
      def source_settings
        raw = source&.scrape_settings
        raw.respond_to?(:to_hash) ? raw.to_hash : {}
      end

      # Recursively converts hashes (including those nested in arrays) to
      # ActiveSupport::HashWithIndifferentAccess.
      def deep_indifferent_access(value)
        if value.is_a?(Hash)
          converted = ActiveSupport::HashWithIndifferentAccess.new
          value.each { |key, val| converted[key] = deep_indifferent_access(val) }
          converted
        elsif value.is_a?(Array)
          value.map { |element| deep_indifferent_access(element) }
        else
          value
        end
      end

      # Recursively stringifies hash keys so the layers merge consistently.
      def normalize_settings(value)
        return value if value.nil?

        case value
        when Hash
          value.to_h { |key, val| [ key.to_s, normalize_settings(val) ] }
        when Array
          value.map { |element| normalize_settings(element) }
        else
          value
        end
      end
    end
  end
end
@@ -0,0 +1,97 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "active_support/core_ext/object/blank"
4
+
5
module SourceMonitor
  module Scrapers
    module Fetchers
      # Performs a single GET through the injected HTTP client factory and
      # reports the outcome as a Result instead of raising Faraday errors.
      class HttpFetcher
        Result = Struct.new(:status, :body, :headers, :http_status, :error, :message, keyword_init: true)

        # A standalone three-digit number in the HTTP status range. The
        # lookarounds stop it from matching inside longer numbers such as a
        # port ("8080").
        STATUS_IN_MESSAGE = /(?<!\d)[1-5]\d{2}(?!\d)/

        # @param http [#client] factory that builds a connection from
        #   proxy/headers/timeout/open_timeout keywords
        def initialize(http: SourceMonitor::HTTP)
          @http = http
        end

        # @param url [String] address to GET
        # @param settings [Hash, #to_h, nil] optional :headers, :timeout,
        #   :open_timeout and :proxy; keys may be symbols or strings
        # @return [Result] status :success for a 2xx response, otherwise
        #   :failed with error/message (and http_status when known)
        def fetch(url:, settings: nil)
          response = connection(settings).get(url)

          if success_status?(response.status)
            Result.new(status: :success, body: response.body, headers: response.headers, http_status: response.status)
          else
            Result.new(
              status: :failed,
              http_status: response.status,
              error: "http_error",
              message: "Non-success HTTP status"
            )
          end
        rescue Faraday::ClientError => error
          Result.new(
            status: :failed,
            http_status: extract_status(error),
            error: error.class.name,
            message: error.message
          )
        rescue Faraday::Error => error
          Result.new(status: :failed, error: error.class.name, message: error.message)
        end

        private

        attr_reader :http

        def connection(settings)
          normalized = normalize_settings(settings)
          http.client(
            proxy: normalized[:proxy],
            headers: normalized[:headers],
            timeout: normalized[:timeout] || SourceMonitor::HTTP::DEFAULT_TIMEOUT,
            open_timeout: normalized[:open_timeout] || SourceMonitor::HTTP::DEFAULT_OPEN_TIMEOUT
          )
        end

        # Reduces arbitrary settings to the four options the client accepts.
        def normalize_settings(settings)
          return {} unless settings

          settings = settings.to_h if settings.respond_to?(:to_h)
          {
            headers: (setting_value(settings, :headers) || {}).to_h,
            timeout: setting_value(settings, :timeout),
            open_timeout: setting_value(settings, :open_timeout),
            proxy: setting_value(settings, :proxy).presence
          }
        end

        # Reads +key+ by symbol first and falls back to its string form.
        # Calling #to_h on a Hash subclass returns a plain Hash with the
        # original keys, so settings that arrive with string keys would
        # otherwise be ignored.
        def setting_value(settings, key)
          value = settings[key]
          value.nil? ? settings[key.to_s] : value
        end

        def success_status?(status)
          status.to_i >= 200 && status.to_i < 300
        end

        # Best-effort HTTP status for a failed request. Preference order: the
        # error's response_status, the status stored in its response, and
        # finally a status-looking number found in the message.
        def extract_status(error)
          candidates = []

          candidates << error.response_status if error.respond_to?(:response_status)

          if error.respond_to?(:response)
            response = error.response
            if response.is_a?(Hash)
              candidates << response[:status] << response["status"]
            elsif response.respond_to?(:[])
              status = response[:status]
              candidates << status if status
            end
          end

          if error.respond_to?(:message) && error.message
            error.message.scan(STATUS_IN_MESSAGE) { |number| candidates << number.to_i }
          end

          candidates.compact.first
        end
      end
    end
  end
end
@@ -0,0 +1,101 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "readability"
4
+ require "nokolexbor"
5
+ require "active_support/core_ext/object/blank"
6
+
7
module SourceMonitor
  module Scrapers
    module Parsers
      # Extracts article content and a title from an HTML string, preferring
      # caller-supplied CSS selectors and falling back to Readability.
      class ReadabilityParser
        Result = Struct.new(:status, :content, :strategy, :title, :metadata, keyword_init: true)

        # @param html [String] raw document
        # @param selectors [Hash, #to_h, nil] optional :content / :title CSS
        #   selectors (a single selector or a list)
        # @param readability [Hash, #to_h, nil] options passed to
        #   Readability::Document
        # @return [Result] :success when content was found, :partial when it
        #   was not, :failed (with error details in metadata) on any
        #   StandardError
        def parse(html:, selectors: nil, readability: nil)
          dom = ::Nokolexbor::HTML(html)
          selector_map = normalize_hash(selectors)
          reader_options = normalize_hash(readability)

          extracted = extract_with_selectors(dom, selector_map[:content])
          strategy = extracted.present? ? :selectors : :readability

          # Built even when selectors matched: it also feeds the title
          # lookup and the metadata below.
          reader = build_readability_document(html, reader_options)
          extracted = reader.content&.strip if extracted.blank?

          outcome = extracted.present? ? :success : :partial
          title = extract_title(dom, selector_map[:title], reader)

          details = {}
          details[:readability_text_length] = reader.content_length if reader.respond_to?(:content_length)

          Result.new(
            status: outcome,
            content: extracted.presence,
            strategy: strategy,
            title: title,
            metadata: details.compact
          )
        rescue StandardError => error
          Result.new(
            status: :failed,
            content: nil,
            strategy: :readability,
            title: nil,
            metadata: { error: error.class.name, message: error.message }
          )
        end

        private

        # Returns a plain hash with symbolized top-level keys ({} for nil).
        def normalize_hash(value)
          return {} unless value

          pairs = value.respond_to?(:to_h) ? value.to_h : value
          pairs.each_with_object({}) { |(key, val), symbolized| symbolized[key.to_sym] = val }
        end

        # Joins the HTML of every node matched by each non-blank selector;
        # returns nil when nothing matched.
        def extract_with_selectors(document, selectors)
          fragments = []

          Array(selectors).each do |selector|
            next if selector.blank?

            matches = document.css(selector.to_s)
            next if matches.empty?

            fragments << matches.map(&:to_html).join("\n")
          end

          fragments.empty? ? nil : fragments.join("\n")
        end

        def build_readability_document(html, options)
          symbolized = {}
          options.each { |key, value| symbolized[key.to_sym] = value }

          ::Readability::Document.new(html, symbolized)
        end

        # Title lookup order: first selector with non-blank text, then the
        # Readability title, then the document's <title> element.
        def extract_title(document, selectors, readability_doc)
          Array(selectors).each do |selector|
            next if selector.blank?

            node = document.at_css(selector.to_s)
            return node.text.strip if node&.text.present?
          end

          if readability_doc.respond_to?(:title)
            candidate = readability_doc.title&.strip
            return candidate if candidate.present?
          end

          document.at_css("title")&.text&.strip
        end
      end
    end
  end
end
@@ -0,0 +1,156 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "active_support/core_ext/object/blank"
4
+
5
+ require "source_monitor/scrapers/fetchers/http_fetcher"
6
+ require "source_monitor/scrapers/parsers/readability_parser"
7
+
8
module SourceMonitor
  module Scrapers
    # Scraper adapter that fetches an item's page over HTTP and extracts its
    # content with the Readability parser.
    class Readability < Base
      DEFAULT_ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
      FETCHER_CLASS = SourceMonitor::Scrapers::Fetchers::HttpFetcher
      PARSER_CLASS = SourceMonitor::Scrapers::Parsers::ReadabilityParser

      # Default HTTP, selector and readability settings for this adapter.
      def self.default_settings
        http_defaults = {
          headers: {
            "Accept" => DEFAULT_ACCEPT,
            "User-Agent" => SourceMonitor::HTTP::DEFAULT_USER_AGENT
          },
          timeout: SourceMonitor::HTTP::DEFAULT_TIMEOUT,
          open_timeout: SourceMonitor::HTTP::DEFAULT_OPEN_TIMEOUT,
          proxy: nil
        }
        readability_defaults = {
          remove_unlikely_candidates: true,
          clean_conditionally: true,
          retry_length: 250,
          min_text_length: 25
        }

        {
          http: http_defaults,
          selectors: { content: nil, title: nil },
          readability: readability_defaults
        }
      end

      # Fetches the item's URL and parses the response.
      #
      # @return [Result] the parser's status plus html/content/metadata on
      #   success; a :failed Result when the URL is missing, the fetch or
      #   parse fails, or any StandardError is raised
      def call
        url = preferred_url
        return failure_result("missing_url", "No URL available for scraping", url: url) if url.blank?

        fetched = fetcher.fetch(url: url, settings: settings[:http])
        return build_fetch_failure(fetched, url) if fetched.status == :failed

        parsed = parser.parse(
          html: fetched.body.to_s,
          selectors: settings[:selectors],
          readability: settings[:readability]
        )
        return build_parser_failure(parsed, fetched, url) if parsed.status == :failed

        Result.new(
          status: parsed.status,
          html: fetched.body,
          content: parsed.content,
          metadata: build_metadata(fetch_result: fetched, parser_result: parsed, url: url)
        )
      rescue StandardError => error
        failure_result(error.class.name, error.message, url: url)
      end

      private

      # The item's canonical URL when present, otherwise its plain URL.
      def preferred_url
        item.canonical_url.presence || item.url
      end

      def fetcher
        @fetcher ||= FETCHER_CLASS.new(http: http)
      end

      def parser
        @parser ||= PARSER_CLASS.new
      end

      def build_fetch_failure(fetch_result, url)
        error_code = fetch_result.error || "fetch_error"
        description = fetch_result.message || "Failed to fetch URL"

        failure_result(error_code, description, url: url, http_status: fetch_result.http_status)
      end

      # Failed Result that keeps the fetched HTML alongside the parser's
      # error details.
      def build_parser_failure(parser_result, fetch_result, url)
        details = {
          error: parser_result.metadata&.[](:error) || "parser_error",
          message: parser_result.metadata&.[](:message) || "Failed to parse content",
          url: url,
          http_status: fetch_result.http_status
        }

        Result.new(status: :failed, html: fetch_result.body, content: nil, metadata: details.compact)
      end

      # Metadata for a successful scrape; nil entries are dropped before the
      # optional readability text length is added.
      def build_metadata(fetch_result:, parser_result:, url:)
        response_headers = fetch_result.headers || {}

        details = {
          url: url,
          http_status: fetch_result.http_status,
          content_type: response_headers["content-type"] || response_headers["Content-Type"],
          extraction_strategy: parser_result.strategy,
          title: parser_result.title,
          settings: deep_duplicate(settings)
        }.compact

        parser_details = parser_result.metadata
        if parser_details && parser_details[:readability_text_length]
          details[:readability_text_length] = parser_details[:readability_text_length]
        end

        details
      end

      def failure_result(error, message, url:, http_status: nil)
        details = {
          error: error,
          message: message,
          url: url,
          http_status: derive_status(message, http_status)
        }

        Result.new(status: :failed, html: nil, content: nil, metadata: details.compact)
      end

      # Uses the explicit status when given; otherwise looks for
      # "status NNN" in the message.
      def derive_status(message, explicit_status)
        return explicit_status if explicit_status
        return unless message

        found = message.match(/status\s+(\d{3})/)
        found[1].to_i if found
      end

      # Copies nested hashes and arrays into plain Hash/Array containers;
      # leaf values are reused as-is.
      def deep_duplicate(value)
        if value.is_a?(Hash)
          copy = {}
          value.each { |key, val| copy[key] = deep_duplicate(val) }
          copy
        elsif value.is_a?(Array)
          value.map { |element| deep_duplicate(element) }
        else
          value
        end
      end
    end
  end
end