source_monitor 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202)
  1. checksums.yaml +7 -0
  2. data/.gitignore +16 -0
  3. data/.rubocop.yml +12 -0
  4. data/.ruby-version +1 -0
  5. data/AGENTS.md +132 -0
  6. data/CHANGELOG.md +66 -0
  7. data/CONTRIBUTING.md +31 -0
  8. data/Gemfile +30 -0
  9. data/Gemfile.lock +411 -0
  10. data/MIT-LICENSE +20 -0
  11. data/README.md +108 -0
  12. data/Rakefile +8 -0
  13. data/app/assets/builds/.keep +0 -0
  14. data/app/assets/config/source_monitor_manifest.js +4 -0
  15. data/app/assets/images/source_monitor/.keep +0 -0
  16. data/app/assets/javascripts/source_monitor/application.js +20 -0
  17. data/app/assets/javascripts/source_monitor/controllers/async_submit_controller.js +36 -0
  18. data/app/assets/javascripts/source_monitor/controllers/dropdown_controller.js +109 -0
  19. data/app/assets/javascripts/source_monitor/controllers/modal_controller.js +56 -0
  20. data/app/assets/javascripts/source_monitor/controllers/notification_controller.js +53 -0
  21. data/app/assets/javascripts/source_monitor/turbo_actions.js +13 -0
  22. data/app/assets/stylesheets/source_monitor/application.tailwind.css +13 -0
  23. data/app/assets/svgs/source_monitor/.keep +0 -0
  24. data/app/controllers/concerns/.keep +0 -0
  25. data/app/controllers/concerns/source_monitor/sanitizes_search_params.rb +81 -0
  26. data/app/controllers/source_monitor/application_controller.rb +62 -0
  27. data/app/controllers/source_monitor/dashboard_controller.rb +27 -0
  28. data/app/controllers/source_monitor/fetch_logs_controller.rb +9 -0
  29. data/app/controllers/source_monitor/health_controller.rb +10 -0
  30. data/app/controllers/source_monitor/items_controller.rb +116 -0
  31. data/app/controllers/source_monitor/logs_controller.rb +15 -0
  32. data/app/controllers/source_monitor/scrape_logs_controller.rb +9 -0
  33. data/app/controllers/source_monitor/source_bulk_scrapes_controller.rb +35 -0
  34. data/app/controllers/source_monitor/source_fetches_controller.rb +22 -0
  35. data/app/controllers/source_monitor/source_health_checks_controller.rb +34 -0
  36. data/app/controllers/source_monitor/source_health_resets_controller.rb +27 -0
  37. data/app/controllers/source_monitor/source_retries_controller.rb +22 -0
  38. data/app/controllers/source_monitor/source_turbo_responses.rb +115 -0
  39. data/app/controllers/source_monitor/sources_controller.rb +179 -0
  40. data/app/helpers/source_monitor/application_helper.rb +327 -0
  41. data/app/jobs/source_monitor/application_job.rb +13 -0
  42. data/app/jobs/source_monitor/fetch_feed_job.rb +117 -0
  43. data/app/jobs/source_monitor/item_cleanup_job.rb +48 -0
  44. data/app/jobs/source_monitor/log_cleanup_job.rb +47 -0
  45. data/app/jobs/source_monitor/schedule_fetches_job.rb +29 -0
  46. data/app/jobs/source_monitor/scrape_item_job.rb +47 -0
  47. data/app/jobs/source_monitor/source_health_check_job.rb +77 -0
  48. data/app/mailers/source_monitor/application_mailer.rb +17 -0
  49. data/app/models/concerns/.keep +0 -0
  50. data/app/models/concerns/source_monitor/loggable.rb +18 -0
  51. data/app/models/source_monitor/application_record.rb +5 -0
  52. data/app/models/source_monitor/fetch_log.rb +31 -0
  53. data/app/models/source_monitor/health_check_log.rb +28 -0
  54. data/app/models/source_monitor/item.rb +102 -0
  55. data/app/models/source_monitor/item_content.rb +11 -0
  56. data/app/models/source_monitor/log_entry.rb +56 -0
  57. data/app/models/source_monitor/scrape_log.rb +31 -0
  58. data/app/models/source_monitor/source.rb +115 -0
  59. data/app/views/layouts/source_monitor/application.html.erb +54 -0
  60. data/app/views/source_monitor/dashboard/_fetch_schedule.html.erb +90 -0
  61. data/app/views/source_monitor/dashboard/_job_metrics.html.erb +82 -0
  62. data/app/views/source_monitor/dashboard/_recent_activity.html.erb +39 -0
  63. data/app/views/source_monitor/dashboard/_stat_card.html.erb +6 -0
  64. data/app/views/source_monitor/dashboard/_stats.html.erb +9 -0
  65. data/app/views/source_monitor/dashboard/index.html.erb +48 -0
  66. data/app/views/source_monitor/fetch_logs/show.html.erb +90 -0
  67. data/app/views/source_monitor/items/_details.html.erb +234 -0
  68. data/app/views/source_monitor/items/_details_wrapper.html.erb +3 -0
  69. data/app/views/source_monitor/items/index.html.erb +147 -0
  70. data/app/views/source_monitor/items/show.html.erb +3 -0
  71. data/app/views/source_monitor/logs/index.html.erb +208 -0
  72. data/app/views/source_monitor/scrape_logs/show.html.erb +73 -0
  73. data/app/views/source_monitor/shared/_toast.html.erb +34 -0
  74. data/app/views/source_monitor/sources/_bulk_scrape_form.html.erb +64 -0
  75. data/app/views/source_monitor/sources/_bulk_scrape_modal.html.erb +53 -0
  76. data/app/views/source_monitor/sources/_details.html.erb +302 -0
  77. data/app/views/source_monitor/sources/_details_wrapper.html.erb +3 -0
  78. data/app/views/source_monitor/sources/_empty_state_row.html.erb +5 -0
  79. data/app/views/source_monitor/sources/_fetch_interval_heatmap.html.erb +46 -0
  80. data/app/views/source_monitor/sources/_form.html.erb +143 -0
  81. data/app/views/source_monitor/sources/_health_status_badge.html.erb +46 -0
  82. data/app/views/source_monitor/sources/_row.html.erb +102 -0
  83. data/app/views/source_monitor/sources/edit.html.erb +28 -0
  84. data/app/views/source_monitor/sources/index.html.erb +153 -0
  85. data/app/views/source_monitor/sources/new.html.erb +22 -0
  86. data/app/views/source_monitor/sources/show.html.erb +3 -0
  87. data/config/coverage_baseline.json +2010 -0
  88. data/config/initializers/feedjira.rb +19 -0
  89. data/config/routes.rb +18 -0
  90. data/config/tailwind.config.js +17 -0
  91. data/db/migrate/20241008120000_create_source_monitor_sources.rb +40 -0
  92. data/db/migrate/20241008121000_create_source_monitor_items.rb +44 -0
  93. data/db/migrate/20241008122000_create_source_monitor_fetch_logs.rb +32 -0
  94. data/db/migrate/20241008123000_create_source_monitor_scrape_logs.rb +25 -0
  95. data/db/migrate/20251008183000_change_fetch_interval_to_minutes.rb +23 -0
  96. data/db/migrate/20251009090000_create_source_monitor_item_contents.rb +38 -0
  97. data/db/migrate/20251009103000_add_feed_content_readability_to_sources.rb +5 -0
  98. data/db/migrate/20251010090000_add_adaptive_fetching_toggle_to_sources.rb +7 -0
  99. data/db/migrate/20251010123000_add_deleted_at_to_source_monitor_items.rb +8 -0
  100. data/db/migrate/20251010153000_add_type_to_source_monitor_sources.rb +8 -0
  101. data/db/migrate/20251010154500_add_fetch_status_to_source_monitor_sources.rb +9 -0
  102. data/db/migrate/20251010160000_create_solid_cable_messages.rb +16 -0
  103. data/db/migrate/20251011090000_add_fetch_retry_state_to_sources.rb +14 -0
  104. data/db/migrate/20251012090000_add_health_fields_to_sources.rb +17 -0
  105. data/db/migrate/20251012100000_optimize_source_monitor_database_performance.rb +13 -0
  106. data/db/migrate/20251014064947_add_not_null_constraints_to_items.rb +30 -0
  107. data/db/migrate/20251014171659_add_performance_indexes.rb +29 -0
  108. data/db/migrate/20251014172525_add_fetch_status_check_constraint.rb +18 -0
  109. data/db/migrate/20251015100000_create_source_monitor_log_entries.rb +89 -0
  110. data/db/migrate/20251022100000_create_source_monitor_health_check_logs.rb +22 -0
  111. data/db/migrate/20251108120116_refresh_fetch_status_constraint.rb +29 -0
  112. data/docs/configuration.md +170 -0
  113. data/docs/deployment.md +63 -0
  114. data/docs/gh-cli-workflow.md +44 -0
  115. data/docs/installation.md +144 -0
  116. data/docs/troubleshooting.md +76 -0
  117. data/eslint.config.mjs +27 -0
  118. data/lib/generators/source_monitor/install/install_generator.rb +59 -0
  119. data/lib/generators/source_monitor/install/templates/source_monitor.rb.tt +155 -0
  120. data/lib/source_monitor/analytics/source_activity_rates.rb +53 -0
  121. data/lib/source_monitor/analytics/source_fetch_interval_distribution.rb +57 -0
  122. data/lib/source_monitor/analytics/sources_index_metrics.rb +92 -0
  123. data/lib/source_monitor/assets/bundler.rb +49 -0
  124. data/lib/source_monitor/assets.rb +6 -0
  125. data/lib/source_monitor/configuration.rb +654 -0
  126. data/lib/source_monitor/dashboard/queries.rb +356 -0
  127. data/lib/source_monitor/dashboard/quick_action.rb +7 -0
  128. data/lib/source_monitor/dashboard/quick_actions_presenter.rb +26 -0
  129. data/lib/source_monitor/dashboard/recent_activity.rb +30 -0
  130. data/lib/source_monitor/dashboard/recent_activity_presenter.rb +77 -0
  131. data/lib/source_monitor/dashboard/turbo_broadcaster.rb +87 -0
  132. data/lib/source_monitor/dashboard/upcoming_fetch_schedule.rb +126 -0
  133. data/lib/source_monitor/engine.rb +107 -0
  134. data/lib/source_monitor/events.rb +110 -0
  135. data/lib/source_monitor/feedjira_extensions.rb +103 -0
  136. data/lib/source_monitor/fetching/advisory_lock.rb +54 -0
  137. data/lib/source_monitor/fetching/completion/event_publisher.rb +22 -0
  138. data/lib/source_monitor/fetching/completion/follow_up_handler.rb +37 -0
  139. data/lib/source_monitor/fetching/completion/retention_handler.rb +30 -0
  140. data/lib/source_monitor/fetching/feed_fetcher.rb +627 -0
  141. data/lib/source_monitor/fetching/fetch_error.rb +88 -0
  142. data/lib/source_monitor/fetching/fetch_runner.rb +142 -0
  143. data/lib/source_monitor/fetching/retry_policy.rb +85 -0
  144. data/lib/source_monitor/fetching/stalled_fetch_reconciler.rb +146 -0
  145. data/lib/source_monitor/health/source_health_check.rb +100 -0
  146. data/lib/source_monitor/health/source_health_monitor.rb +210 -0
  147. data/lib/source_monitor/health/source_health_reset.rb +68 -0
  148. data/lib/source_monitor/health.rb +46 -0
  149. data/lib/source_monitor/http.rb +85 -0
  150. data/lib/source_monitor/instrumentation.rb +52 -0
  151. data/lib/source_monitor/items/item_creator.rb +601 -0
  152. data/lib/source_monitor/items/retention_pruner.rb +146 -0
  153. data/lib/source_monitor/items/retention_strategies/destroy.rb +26 -0
  154. data/lib/source_monitor/items/retention_strategies/soft_delete.rb +50 -0
  155. data/lib/source_monitor/items/retention_strategies.rb +9 -0
  156. data/lib/source_monitor/jobs/cleanup_options.rb +85 -0
  157. data/lib/source_monitor/jobs/fetch_failure_subscriber.rb +129 -0
  158. data/lib/source_monitor/jobs/solid_queue_metrics.rb +199 -0
  159. data/lib/source_monitor/jobs/visibility.rb +133 -0
  160. data/lib/source_monitor/logs/entry_sync.rb +69 -0
  161. data/lib/source_monitor/logs/filter_set.rb +163 -0
  162. data/lib/source_monitor/logs/query.rb +81 -0
  163. data/lib/source_monitor/logs/table_presenter.rb +161 -0
  164. data/lib/source_monitor/metrics.rb +77 -0
  165. data/lib/source_monitor/model_extensions.rb +109 -0
  166. data/lib/source_monitor/models/sanitizable.rb +76 -0
  167. data/lib/source_monitor/models/url_normalizable.rb +84 -0
  168. data/lib/source_monitor/pagination/paginator.rb +90 -0
  169. data/lib/source_monitor/realtime/adapter.rb +97 -0
  170. data/lib/source_monitor/realtime/broadcaster.rb +237 -0
  171. data/lib/source_monitor/realtime.rb +17 -0
  172. data/lib/source_monitor/release/changelog.rb +59 -0
  173. data/lib/source_monitor/release/runner.rb +73 -0
  174. data/lib/source_monitor/scheduler.rb +82 -0
  175. data/lib/source_monitor/scrapers/base.rb +105 -0
  176. data/lib/source_monitor/scrapers/fetchers/http_fetcher.rb +97 -0
  177. data/lib/source_monitor/scrapers/parsers/readability_parser.rb +101 -0
  178. data/lib/source_monitor/scrapers/readability.rb +156 -0
  179. data/lib/source_monitor/scraping/bulk_result_presenter.rb +85 -0
  180. data/lib/source_monitor/scraping/bulk_source_scraper.rb +233 -0
  181. data/lib/source_monitor/scraping/enqueuer.rb +125 -0
  182. data/lib/source_monitor/scraping/item_scraper/adapter_resolver.rb +44 -0
  183. data/lib/source_monitor/scraping/item_scraper/persistence.rb +189 -0
  184. data/lib/source_monitor/scraping/item_scraper.rb +84 -0
  185. data/lib/source_monitor/scraping/scheduler.rb +43 -0
  186. data/lib/source_monitor/scraping/state.rb +79 -0
  187. data/lib/source_monitor/security/authentication.rb +85 -0
  188. data/lib/source_monitor/security/parameter_sanitizer.rb +42 -0
  189. data/lib/source_monitor/sources/turbo_stream_presenter.rb +54 -0
  190. data/lib/source_monitor/turbo_streams/stream_responder.rb +95 -0
  191. data/lib/source_monitor/version.rb +3 -0
  192. data/lib/source_monitor.rb +149 -0
  193. data/lib/tasks/recover_stalled_fetches.rake +16 -0
  194. data/lib/tasks/source_monitor_assets.rake +28 -0
  195. data/lib/tasks/source_monitor_tasks.rake +29 -0
  196. data/lib/tasks/test_smoke.rake +12 -0
  197. data/package-lock.json +3997 -0
  198. data/package.json +29 -0
  199. data/postcss.config.js +6 -0
  200. data/source_monitor.gemspec +46 -0
  201. data/stylelint.config.js +12 -0
  202. metadata +469 -0

data/lib/source_monitor/scraping/bulk_result_presenter.rb
@@ -0,0 +1,85 @@
+ # frozen_string_literal: true
+
+ module SourceMonitor
+   module Scraping
+     # Presenter for building flash messages from BulkSourceScraper results
+     # Extracts complex message formatting logic from the controller
+     class BulkResultPresenter
+       attr_reader :result, :pluralizer
+
+       def initialize(result:, pluralizer:)
+         @result = result
+         @pluralizer = pluralizer
+       end
+
+       def to_flash_payload
+         case result.status
+         when :success
+           build_success_payload
+         when :partial
+           build_partial_payload
+         else
+           build_error_payload
+         end
+       end
+
+       private
+
+       def build_success_payload
+         label = BulkSourceScraper.selection_label(result.selection)
+         pluralized_enqueued = pluralizer.call(result.enqueued_count, "item")
+
+         message = "Queued scraping for #{pluralized_enqueued} from the #{label}."
+
+         if result.already_enqueued_count.positive?
+           pluralized_already = pluralizer.call(result.already_enqueued_count, "item")
+           message = "#{message} #{pluralized_already.capitalize} already in progress."
+         end
+
+         { flash_key: :notice, message:, level: :success }
+       end
+
+       def build_partial_payload
+         label = BulkSourceScraper.selection_label(result.selection)
+         parts = []
+
+         if result.enqueued_count.positive?
+           pluralized_enqueued = pluralizer.call(result.enqueued_count, "item")
+           parts << "Queued #{pluralized_enqueued} from the #{label}"
+         end
+
+         if result.already_enqueued_count.positive?
+           pluralized_already = pluralizer.call(result.already_enqueued_count, "item")
+           parts << "#{pluralized_already.capitalize} already in progress"
+         end
+
+         if result.rate_limited?
+           limit = SourceMonitor.config.scraping.max_in_flight_per_source
+           parts << "Stopped after reaching the per-source limit#{" of #{limit}" if limit}"
+         end
+
+         other_failures = result.failure_details.except(:rate_limited)
+         if other_failures.values.sum.positive?
+           skipped = other_failures.map do |status, count|
+             label_key = status.to_s.tr("_", " ")
+             "#{pluralizer.call(count, label_key)}"
+           end.join(", ")
+           parts << "Skipped #{skipped}"
+         end
+
+         if parts.empty?
+           parts << "No new scrapes were queued from the #{label}"
+         end
+
+         { flash_key: :notice, message: parts.join(". ") + ".", level: :warning }
+       end
+
+       def build_error_payload
+         message = result.messages.presence&.first ||
+           "No items were queued because nothing matched the selected scope."
+
+         { flash_key: :alert, message:, level: :error }
+       end
+     end
+   end
+ end
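
For context, here is a minimal sketch of how a controller could feed a BulkSourceScraper result through this presenter. The pluralizer lambda and the flash handling are assumptions for illustration, not code shipped in the gem:

  # Hypothetical controller usage of BulkResultPresenter (illustrative only).
  result = SourceMonitor::Scraping::BulkSourceScraper.new(source: @source, selection: params[:selection]).call
  pluralizer = ->(count, noun) { ActionController::Base.helpers.pluralize(count, noun) }
  payload = SourceMonitor::Scraping::BulkResultPresenter.new(result: result, pluralizer: pluralizer).to_flash_payload
  flash[payload[:flash_key]] = payload[:message]  # payload[:level] could drive toast styling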

data/lib/source_monitor/scraping/bulk_source_scraper.rb
@@ -0,0 +1,233 @@
+ # frozen_string_literal: true
+
+ module SourceMonitor
+   module Scraping
+     # Orchestrates bulk scrape enqueues for a source based on a user-selected
+     # scope. Works alongside the single-item enqueuer to ensure we respect
+     # per-source limits and provide actionable feedback for the UI.
+     class BulkSourceScraper
+       SELECTIONS = %i[current unscraped all].freeze
+       SELECTION_LABELS = {
+         current: "current view",
+         unscraped: "unscraped items",
+         all: "all items"
+       }.freeze
+       DEFAULT_PREVIEW_LIMIT = 10
+
+       Result = Struct.new(
+         :status,
+         :selection,
+         :attempted_count,
+         :enqueued_count,
+         :already_enqueued_count,
+         :failure_count,
+         :failure_details,
+         :messages,
+         :rate_limited,
+         keyword_init: true
+       ) do
+         def success?
+           status == :success
+         end
+
+         def partial?
+           status == :partial
+         end
+
+         def error?
+           status == :error
+         end
+
+         def rate_limited?
+           !!rate_limited
+         end
+       end
+
+       def self.selection_label(selection)
+         SELECTION_LABELS[normalize_selection(selection)] || SELECTION_LABELS[:current]
+       end
+
+       def self.selection_counts(source:, preview_items:, preview_limit: 10)
+         preview_collection = Array(preview_items).compact
+         base_scope = SourceMonitor::Item.active.where(source_id: source.id)
+         {
+           current: preview_collection.size.clamp(0, preview_limit.to_i.nonzero? || preview_collection.size),
+           unscraped: base_scope.merge(unscraped_scope).count,
+           all: base_scope.count
+         }
+       end
+
+       def self.normalize_selection(selection)
+         value = selection.is_a?(String) ? selection.strip : selection
+         value = value.to_s.downcase.to_sym if value
+         value if SELECTIONS.include?(value)
+       end
+
+       def initialize(source:, selection:, preview_limit: DEFAULT_PREVIEW_LIMIT, enqueuer: SourceMonitor::Scraping::Enqueuer, config: SourceMonitor.config.scraping)
+         @source = source
+         @selection = self.class.normalize_selection(selection) || :current
+         normalized_limit = preview_limit.respond_to?(:to_i) ? preview_limit.to_i : DEFAULT_PREVIEW_LIMIT
+         @preview_limit = normalized_limit.positive? ? normalized_limit : DEFAULT_PREVIEW_LIMIT
+         @enqueuer = enqueuer
+         @config = config
+       end
+
+       def call
+         return disabled_result unless source.scraping_enabled?
+         return invalid_selection_result unless SELECTIONS.include?(selection)
+
+         items = scoped_items.to_a
+         attempted_count = items.size
+
+         return no_items_result if attempted_count.zero?
+
+         failure_details = Hash.new(0)
+         messages = []
+         enqueued_count = 0
+         already_enqueued_count = 0
+         rate_limited = false
+
+         items.each do |item|
+           enqueue_result = enqueuer.enqueue(item: item, source:, reason: :manual)
+
+           case enqueue_result.status
+           when :enqueued
+             enqueued_count += 1
+           when :already_enqueued
+             already_enqueued_count += 1
+           when :rate_limited
+             failure_details[:rate_limited] += 1
+             messages << enqueue_result.message if enqueue_result.message.present?
+             rate_limited = true
+             break
+           else
+             key = enqueue_result.status || :unknown
+             failure_details[key] += 1
+             messages << enqueue_result.message if enqueue_result.message.present?
+           end
+         end
+
+         failure_count = failure_details.values.sum
+         status = determine_status(enqueued_count:, failure_count:, already_enqueued_count:)
+
+         Result.new(
+           status:,
+           selection:,
+           attempted_count: attempted_count,
+           enqueued_count:,
+           already_enqueued_count:,
+           failure_count:,
+           failure_details: failure_details.freeze,
+           messages: messages.compact.uniq,
+           rate_limited: rate_limited
+         )
+       end
+
+       private
+
+       attr_reader :source, :selection, :preview_limit, :enqueuer, :config
+
+       def scoped_items
+         scope = case selection
+                 when :current
+                   base_scope.limit(preview_limit)
+                 when :unscraped
+                   base_scope.merge(unscraped_scope)
+                 when :all
+                   base_scope
+                 else
+                   base_scope.limit(preview_limit)
+                 end
+
+         scope = without_inflight(scope)
+         apply_batch_limit(scope)
+       end
+
+       def base_scope
+         SourceMonitor::Item.active.where(source_id: source.id).order(Arel.sql("published_at DESC NULLS LAST, created_at DESC"))
+       end
+
+       def without_inflight(scope)
+         statuses = SourceMonitor::Scraping::State::IN_FLIGHT_STATUSES
+         column = SourceMonitor::Item.arel_table[:scrape_status]
+         scope.where(column.eq(nil).or(column.not_in(statuses)))
+       end
+
+       def self.unscraped_scope
+         item_table = SourceMonitor::Item.arel_table
+         failed_statuses = %w[failed partial]
+         SourceMonitor::Item.active.where(
+           item_table[:scraped_at].eq(nil)
+             .or(item_table[:scrape_status].in(failed_statuses))
+         )
+       end
+
+       def unscraped_scope
+         self.class.unscraped_scope
+       end
+
+       def apply_batch_limit(scope)
+         limit = config.max_bulk_batch_size
+         return scope unless limit
+
+         current_limit = scope.limit_value
+         effective_limit = current_limit ? [ current_limit, limit ].min : limit
+         scope.limit(effective_limit)
+       end
+
+       def determine_status(enqueued_count:, failure_count:, already_enqueued_count:)
+         if enqueued_count.positive? && failure_count.zero?
+           :success
+         elsif enqueued_count.positive?
+           :partial
+         elsif already_enqueued_count.positive?
+           :partial
+         else
+           :error
+         end
+       end
+
+       def disabled_result
+         Result.new(
+           status: :error,
+           selection:,
+           attempted_count: 0,
+           enqueued_count: 0,
+           already_enqueued_count: 0,
+           failure_count: 1,
+           failure_details: { scraping_disabled: 1 },
+           messages: [ "Scraping is disabled for this source." ],
+           rate_limited: false
+         )
+       end
+
+       def invalid_selection_result
+         Result.new(
+           status: :error,
+           selection:,
+           attempted_count: 0,
+           enqueued_count: 0,
+           already_enqueued_count: 0,
+           failure_count: 1,
+           failure_details: { invalid_selection: 1 },
+           messages: [ "Invalid selection for bulk scrape." ],
+           rate_limited: false
+         )
+       end
+
+       def no_items_result
+         Result.new(
+           status: :error,
+           selection:,
+           attempted_count: 0,
+           enqueued_count: 0,
+           already_enqueued_count: 0,
+           failure_count: 1,
+           failure_details: { no_items: 1 },
+           messages: [ "No items match the selected scope." ],
+           rate_limited: false
+         )
+       end
+     end
+   end
+ end
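
A minimal sketch of invoking the bulk scraper and branching on its Result struct; the surrounding setup (how `source` is loaded, where the output goes) is assumed for illustration:

  # Hypothetical invocation of BulkSourceScraper (illustrative only).
  result = SourceMonitor::Scraping::BulkSourceScraper.new(
    source: source,
    selection: :unscraped,   # one of :current, :unscraped, :all
    preview_limit: 10
  ).call

  if result.success?
    puts "Enqueued #{result.enqueued_count} of #{result.attempted_count} items"
  elsif result.partial?
    puts "Partially enqueued; skipped: #{result.failure_details.inspect}"
  else
    puts result.messages.first
  end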

data/lib/source_monitor/scraping/enqueuer.rb
@@ -0,0 +1,125 @@
+ # frozen_string_literal: true
+
+ module SourceMonitor
+   module Scraping
+     # Coordinates queuing of scraping jobs while respecting source
+     # configuration and avoiding duplicate enqueues for the same item.
+     class Enqueuer
+       Result = Struct.new(:status, :message, :item, keyword_init: true) do
+         def enqueued?
+           status == :enqueued
+         end
+
+         def already_enqueued?
+           status == :already_enqueued
+         end
+
+         def failure?
+           !enqueued? && !already_enqueued?
+         end
+       end
+
+       attr_reader :item, :source, :job_class, :reason
+
+       def self.enqueue(item:, source: nil, job_class: SourceMonitor::ScrapeItemJob, reason: :manual)
+         new(item:, source:, job_class:, reason:).enqueue
+       end
+
+       def initialize(item:, source: nil, job_class: SourceMonitor::ScrapeItemJob, reason: :manual)
+         @item = item
+         @source = source || item&.source
+         @job_class = job_class
+         @reason = reason.to_sym
+       end
+
+       def enqueue
+         log("enqueue:start", item:, source:, reason: reason)
+         return failure(:missing_item, "Item could not be found.") unless item
+         return failure(:missing_source, "Item must belong to a source.") unless source
+         return failure(:scraping_disabled, "Scraping is disabled for this source.") unless source.scraping_enabled?
+         if auto_reason? && !source.auto_scrape?
+           return failure(:auto_scrape_disabled, "Automatic scraping is disabled for this source.")
+         end
+
+         already_queued = false
+         rate_limited = false
+         rate_limit_info = nil
+
+         item.with_lock do
+           item.reload
+
+           if SourceMonitor::Scraping::State.in_flight?(item.scrape_status)
+             log("enqueue:in_flight", item:, status: item.scrape_status)
+             already_queued = true
+             next
+           end
+
+           exhausted, info = rate_limit_exhausted?
+           if exhausted
+             rate_limited = true
+             rate_limit_info = info
+             next
+           end
+
+           SourceMonitor::Scraping::State.mark_pending!(item, broadcast: false, lock: false)
+         end
+
+         if already_queued
+           log("enqueue:already_enqueued", item:, status: item.scrape_status)
+           return Result.new(status: :already_enqueued, message: "Scrape already in progress for this item.", item: item)
+         end
+
+         if rate_limited
+           message = rate_limit_message(rate_limit_info)
+           log("enqueue:rate_limited", item:, limit: rate_limit_info&.fetch(:limit, nil), in_flight: rate_limit_info&.fetch(:in_flight, nil))
+           return Result.new(status: :rate_limited, message:, item: item)
+         end
+
+         job_class.perform_later(item.id)
+         log("enqueue:job_enqueued", item:, job_class: job_class.name)
+         Result.new(status: :enqueued, message: "Scrape has been enqueued for processing.", item: item)
+       end
+
+       private
+
+       def auto_reason?
+         reason == :auto
+       end
+
+       def failure(status, message)
+         log("enqueue:failure", item:, status:, message:)
+         Result.new(status:, message:, item: item)
+       end
+
+       def log(stage, item:, **extra)
+         return unless defined?(Rails) && Rails.respond_to?(:logger) && Rails.logger
+
+         payload = {
+           stage: "SourceMonitor::Scraping::Enqueuer##{stage}",
+           item_id: item&.id,
+           source_id: source&.id,
+           reason: reason
+         }.merge(extra.compact)
+         Rails.logger.info("[SourceMonitor::ManualScrape] #{payload.to_json}")
+       rescue StandardError
+         nil
+       end
+
+       def rate_limit_exhausted?
+         limit = SourceMonitor.config.scraping.max_in_flight_per_source
+         return [ false, nil ] unless limit
+
+         in_flight = source.items.where(scrape_status: SourceMonitor::Scraping::State::IN_FLIGHT_STATUSES).count
+         [ in_flight >= limit, { limit:, in_flight: in_flight } ]
+       end
+
+       def rate_limit_message(info)
+         return "Scraping queue is full for this source." unless info
+
+         limit = info[:limit]
+         in_flight = info[:in_flight]
+         "Unable to enqueue scrape: scraping queue is full for this source (#{in_flight}/#{limit} jobs in flight)."
+       end
+     end
+   end
+ end
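
A short sketch of a single-item enqueue and how a caller might handle each Result status; the logging choices here are illustrative assumptions:

  # Hypothetical single-item enqueue via Enqueuer (illustrative only).
  result = SourceMonitor::Scraping::Enqueuer.enqueue(item: item, reason: :manual)

  case result.status
  when :enqueued         then Rails.logger.info(result.message)
  when :already_enqueued then Rails.logger.info("Skipped: #{result.message}")
  when :rate_limited     then Rails.logger.warn(result.message)
  else                        Rails.logger.error("Enqueue failed (#{result.status}): #{result.message}")
  end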

data/lib/source_monitor/scraping/item_scraper/adapter_resolver.rb
@@ -0,0 +1,44 @@
+ # frozen_string_literal: true
+
+ module SourceMonitor
+   module Scraping
+     class ItemScraper
+       # Resolves scraper adapter classes based on configuration or engine namespace.
+       class AdapterResolver
+         VALID_NAME_PATTERN = /\A[a-z0-9_]+\z/i.freeze
+
+         def initialize(name:, source:)
+           @name = name.to_s
+           @source = source
+         end
+
+         def resolve!
+           raise_unknown!("No scraper adapter configured for source") if name.blank?
+           raise_unknown!("Invalid scraper adapter: #{name}") unless VALID_NAME_PATTERN.match?(name)
+
+           configured = SourceMonitor.config.scrapers.adapter_for(name)
+           return configured if configured
+
+           constant = resolve_constant
+           return constant if constant <= SourceMonitor::Scrapers::Base
+
+           raise_unknown!("Unknown scraper adapter: #{name}")
+         rescue NameError
+           raise_unknown!("Unknown scraper adapter: #{name}")
+         end
+
+         private
+
+         attr_reader :name, :source
+
+         def resolve_constant
+           SourceMonitor::Scrapers.const_get(name.camelize)
+         end
+
+         def raise_unknown!(message)
+           raise SourceMonitor::Scraping::ItemScraper::UnknownAdapterError, message
+         end
+       end
+     end
+   end
+ end
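
A minimal sketch of resolving an adapter by name; "readability" is assumed here because the gem ships a SourceMonitor::Scrapers::Readability adapter, and the call site is illustrative rather than taken from the gem:

  # Hypothetical adapter lookup (illustrative only).
  adapter_class = SourceMonitor::Scraping::ItemScraper::AdapterResolver.new(
    name: "readability",
    source: source
  ).resolve!
  # => SourceMonitor::Scrapers::Readability, or raises ItemScraper::UnknownAdapterError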

data/lib/source_monitor/scraping/item_scraper/persistence.rb
@@ -0,0 +1,189 @@
+ # frozen_string_literal: true
+
+ require "active_support/core_ext/object/blank"
+ require "active_support/core_ext/object/deep_dup"
+ require "active_support/core_ext/hash/indifferent_access"
+
+ module SourceMonitor
+   module Scraping
+     class ItemScraper
+       # Persists scrape outcomes to the database and builds a Result object.
+       class Persistence
+         def initialize(item:, source:, adapter_name:)
+           @item = item
+           @source = source
+           @adapter_name = adapter_name
+         end
+
+         def persist_success(adapter_result:, started_at:)
+           completed_at = Time.current
+           success = adapter_result.status.to_s != "failed"
+           status = normalize_status(adapter_result.status, success)
+           metadata = normalize_metadata(adapter_result.metadata)
+           http_status = metadata&.[](:http_status)
+           error_info = success ? {} : extract_error_info(metadata)
+           content_length = adapter_result.html.to_s.presence && adapter_result.html.to_s.bytesize
+
+           log = nil
+           item.class.transaction do
+             apply_item_success(status:, success:, completed_at:, adapter_result:)
+             log = build_log(
+               started_at:,
+               completed_at:,
+               duration_ms: duration_ms(started_at:, completed_at:),
+               success: success,
+               http_status: http_status,
+               content_length: content_length,
+               metadata: metadata,
+               error_class: error_info[:class],
+               error_message: error_info[:message]
+             )
+           end
+
+           SourceMonitor::Scraping::ItemScraper::Result.new(
+             status: status,
+             item: item,
+             log: log,
+             message: message_for(status, success, error_info[:message], metadata)
+           )
+         end
+
+         def persist_failure(error:, started_at:, message_override: nil)
+           raise ArgumentError, "Item does not belong to a source" unless source
+
+           completed_at = Time.current
+           message = message_override.presence || error.message.presence || "Scrape failed"
+           http_status = extract_http_status(error)
+           metadata = failure_metadata(error)
+
+           log = nil
+           item.class.transaction do
+             item.update!(scrape_status: "failed", scraped_at: completed_at)
+             log = build_log(
+               started_at:,
+               completed_at: completed_at,
+               duration_ms: duration_ms(started_at:, completed_at:),
+               success: false,
+               http_status: http_status,
+               content_length: nil,
+               metadata: metadata,
+               error_class: error.class.name,
+               error_message: message
+             )
+           end
+
+           SourceMonitor::Scraping::ItemScraper::Result.new(
+             status: :failed,
+             item: item,
+             log: log,
+             message: "Scrape failed: #{message}",
+             error: error
+           )
+         end
+
+         private
+
+         attr_reader :item, :source, :adapter_name
+
+         def apply_item_success(status:, success:, completed_at:, adapter_result:)
+           attributes = {
+             scrape_status: status.to_s,
+             scraped_at: completed_at
+           }
+
+           if success
+             attributes[:scraped_html] = adapter_result.html
+             attributes[:scraped_content] = adapter_result.content
+           end
+
+           item.update!(attributes)
+         end
+
+         def build_log(started_at:, completed_at:, duration_ms:, success:, http_status:, content_length:, metadata:, error_class:, error_message:)
+           SourceMonitor::ScrapeLog.create!(
+             source: source,
+             item: item,
+             success: success,
+             scraper_adapter: adapter_name,
+             started_at: started_at,
+             completed_at: completed_at,
+             duration_ms: duration_ms,
+             http_status: http_status,
+             content_length: content_length,
+             error_class: error_class,
+             error_message: error_message,
+             metadata: metadata
+           )
+         end
+
+         def normalize_status(raw_status, success)
+           return :success if raw_status.blank? && success
+           return :failed if raw_status.blank?
+
+           raw_status.to_sym
+         end
+
+         def normalize_metadata(metadata)
+           return {} if metadata.blank?
+
+           hash = metadata.respond_to?(:to_h) ? metadata.to_h : metadata
+           hash.deep_dup.with_indifferent_access
+         rescue StandardError
+           {}
+         end
+
+         def extract_error_info(metadata)
+           {
+             class: metadata&.[](:error)&.to_s,
+             message: metadata&.[](:message)&.to_s
+           }.compact
+         end
+
+         def failure_metadata(error)
+           {
+             error: error.class.name,
+             message: error.message,
+             backtrace: Array(error.backtrace).first(5)
+           }.compact
+         end
+
+         def extract_http_status(error)
+           return error.http_status if error.respond_to?(:http_status) && error.http_status.present?
+
+           if error.respond_to?(:response)
+             response = error.response
+             if response.is_a?(Hash)
+               return response[:status] || response["status"]
+             end
+           end
+
+           if error.message && (match = error.message.match(/\b(\d{3})\b/))
+             return match[1].to_i
+           end
+
+           nil
+         end
+
+         def duration_ms(started_at:, completed_at:)
+           return nil unless started_at && completed_at
+
+           ((completed_at - started_at) * 1000).round
+         end
+
+         def message_for(status, success, error_message, metadata)
+           return "Scrape failed: #{error_message}" if !success && error_message.present?
+
+           case status.to_s
+           when "success"
+             strategy = metadata&.[](:extraction_strategy)
+             strategy.present? ? "Scrape completed via #{strategy.to_s.titleize}" : "Scrape completed successfully"
+           when "partial"
+             "Scrape completed with partial content"
+           else
+             success ? "Scrape completed" : "Scrape failed"
+           end
+         end
+       end
+     end
+   end
+ end
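
A rough sketch of how ItemScraper might drive this persistence layer around an adapter call; the adapter API (`adapter.scrape(item)`) and the begin/rescue shape are assumptions for illustration, since the orchestrating code lives elsewhere in the gem:

  # Hypothetical use inside ItemScraper (illustrative only).
  persistence = SourceMonitor::Scraping::ItemScraper::Persistence.new(
    item: item, source: item.source, adapter_name: "readability"
  )

  started_at = Time.current
  begin
    adapter_result = adapter.scrape(item)  # assumed to respond to #status, #html, #content, #metadata
    result = persistence.persist_success(adapter_result: adapter_result, started_at: started_at)
  rescue StandardError => error
    result = persistence.persist_failure(error: error, started_at: started_at)
  end
  result.message  # e.g. "Scrape completed successfully" or "Scrape failed: ..."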