source_monitor 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +16 -0
  3. data/.rubocop.yml +12 -0
  4. data/.ruby-version +1 -0
  5. data/AGENTS.md +132 -0
  6. data/CHANGELOG.md +66 -0
  7. data/CONTRIBUTING.md +31 -0
  8. data/Gemfile +30 -0
  9. data/Gemfile.lock +411 -0
  10. data/MIT-LICENSE +20 -0
  11. data/README.md +108 -0
  12. data/Rakefile +8 -0
  13. data/app/assets/builds/.keep +0 -0
  14. data/app/assets/config/source_monitor_manifest.js +4 -0
  15. data/app/assets/images/source_monitor/.keep +0 -0
  16. data/app/assets/javascripts/source_monitor/application.js +20 -0
  17. data/app/assets/javascripts/source_monitor/controllers/async_submit_controller.js +36 -0
  18. data/app/assets/javascripts/source_monitor/controllers/dropdown_controller.js +109 -0
  19. data/app/assets/javascripts/source_monitor/controllers/modal_controller.js +56 -0
  20. data/app/assets/javascripts/source_monitor/controllers/notification_controller.js +53 -0
  21. data/app/assets/javascripts/source_monitor/turbo_actions.js +13 -0
  22. data/app/assets/stylesheets/source_monitor/application.tailwind.css +13 -0
  23. data/app/assets/svgs/source_monitor/.keep +0 -0
  24. data/app/controllers/concerns/.keep +0 -0
  25. data/app/controllers/concerns/source_monitor/sanitizes_search_params.rb +81 -0
  26. data/app/controllers/source_monitor/application_controller.rb +62 -0
  27. data/app/controllers/source_monitor/dashboard_controller.rb +27 -0
  28. data/app/controllers/source_monitor/fetch_logs_controller.rb +9 -0
  29. data/app/controllers/source_monitor/health_controller.rb +10 -0
  30. data/app/controllers/source_monitor/items_controller.rb +116 -0
  31. data/app/controllers/source_monitor/logs_controller.rb +15 -0
  32. data/app/controllers/source_monitor/scrape_logs_controller.rb +9 -0
  33. data/app/controllers/source_monitor/source_bulk_scrapes_controller.rb +35 -0
  34. data/app/controllers/source_monitor/source_fetches_controller.rb +22 -0
  35. data/app/controllers/source_monitor/source_health_checks_controller.rb +34 -0
  36. data/app/controllers/source_monitor/source_health_resets_controller.rb +27 -0
  37. data/app/controllers/source_monitor/source_retries_controller.rb +22 -0
  38. data/app/controllers/source_monitor/source_turbo_responses.rb +115 -0
  39. data/app/controllers/source_monitor/sources_controller.rb +179 -0
  40. data/app/helpers/source_monitor/application_helper.rb +327 -0
  41. data/app/jobs/source_monitor/application_job.rb +13 -0
  42. data/app/jobs/source_monitor/fetch_feed_job.rb +117 -0
  43. data/app/jobs/source_monitor/item_cleanup_job.rb +48 -0
  44. data/app/jobs/source_monitor/log_cleanup_job.rb +47 -0
  45. data/app/jobs/source_monitor/schedule_fetches_job.rb +29 -0
  46. data/app/jobs/source_monitor/scrape_item_job.rb +47 -0
  47. data/app/jobs/source_monitor/source_health_check_job.rb +77 -0
  48. data/app/mailers/source_monitor/application_mailer.rb +17 -0
  49. data/app/models/concerns/.keep +0 -0
  50. data/app/models/concerns/source_monitor/loggable.rb +18 -0
  51. data/app/models/source_monitor/application_record.rb +5 -0
  52. data/app/models/source_monitor/fetch_log.rb +31 -0
  53. data/app/models/source_monitor/health_check_log.rb +28 -0
  54. data/app/models/source_monitor/item.rb +102 -0
  55. data/app/models/source_monitor/item_content.rb +11 -0
  56. data/app/models/source_monitor/log_entry.rb +56 -0
  57. data/app/models/source_monitor/scrape_log.rb +31 -0
  58. data/app/models/source_monitor/source.rb +115 -0
  59. data/app/views/layouts/source_monitor/application.html.erb +54 -0
  60. data/app/views/source_monitor/dashboard/_fetch_schedule.html.erb +90 -0
  61. data/app/views/source_monitor/dashboard/_job_metrics.html.erb +82 -0
  62. data/app/views/source_monitor/dashboard/_recent_activity.html.erb +39 -0
  63. data/app/views/source_monitor/dashboard/_stat_card.html.erb +6 -0
  64. data/app/views/source_monitor/dashboard/_stats.html.erb +9 -0
  65. data/app/views/source_monitor/dashboard/index.html.erb +48 -0
  66. data/app/views/source_monitor/fetch_logs/show.html.erb +90 -0
  67. data/app/views/source_monitor/items/_details.html.erb +234 -0
  68. data/app/views/source_monitor/items/_details_wrapper.html.erb +3 -0
  69. data/app/views/source_monitor/items/index.html.erb +147 -0
  70. data/app/views/source_monitor/items/show.html.erb +3 -0
  71. data/app/views/source_monitor/logs/index.html.erb +208 -0
  72. data/app/views/source_monitor/scrape_logs/show.html.erb +73 -0
  73. data/app/views/source_monitor/shared/_toast.html.erb +34 -0
  74. data/app/views/source_monitor/sources/_bulk_scrape_form.html.erb +64 -0
  75. data/app/views/source_monitor/sources/_bulk_scrape_modal.html.erb +53 -0
  76. data/app/views/source_monitor/sources/_details.html.erb +302 -0
  77. data/app/views/source_monitor/sources/_details_wrapper.html.erb +3 -0
  78. data/app/views/source_monitor/sources/_empty_state_row.html.erb +5 -0
  79. data/app/views/source_monitor/sources/_fetch_interval_heatmap.html.erb +46 -0
  80. data/app/views/source_monitor/sources/_form.html.erb +143 -0
  81. data/app/views/source_monitor/sources/_health_status_badge.html.erb +46 -0
  82. data/app/views/source_monitor/sources/_row.html.erb +102 -0
  83. data/app/views/source_monitor/sources/edit.html.erb +28 -0
  84. data/app/views/source_monitor/sources/index.html.erb +153 -0
  85. data/app/views/source_monitor/sources/new.html.erb +22 -0
  86. data/app/views/source_monitor/sources/show.html.erb +3 -0
  87. data/config/coverage_baseline.json +2010 -0
  88. data/config/initializers/feedjira.rb +19 -0
  89. data/config/routes.rb +18 -0
  90. data/config/tailwind.config.js +17 -0
  91. data/db/migrate/20241008120000_create_source_monitor_sources.rb +40 -0
  92. data/db/migrate/20241008121000_create_source_monitor_items.rb +44 -0
  93. data/db/migrate/20241008122000_create_source_monitor_fetch_logs.rb +32 -0
  94. data/db/migrate/20241008123000_create_source_monitor_scrape_logs.rb +25 -0
  95. data/db/migrate/20251008183000_change_fetch_interval_to_minutes.rb +23 -0
  96. data/db/migrate/20251009090000_create_source_monitor_item_contents.rb +38 -0
  97. data/db/migrate/20251009103000_add_feed_content_readability_to_sources.rb +5 -0
  98. data/db/migrate/20251010090000_add_adaptive_fetching_toggle_to_sources.rb +7 -0
  99. data/db/migrate/20251010123000_add_deleted_at_to_source_monitor_items.rb +8 -0
  100. data/db/migrate/20251010153000_add_type_to_source_monitor_sources.rb +8 -0
  101. data/db/migrate/20251010154500_add_fetch_status_to_source_monitor_sources.rb +9 -0
  102. data/db/migrate/20251010160000_create_solid_cable_messages.rb +16 -0
  103. data/db/migrate/20251011090000_add_fetch_retry_state_to_sources.rb +14 -0
  104. data/db/migrate/20251012090000_add_health_fields_to_sources.rb +17 -0
  105. data/db/migrate/20251012100000_optimize_source_monitor_database_performance.rb +13 -0
  106. data/db/migrate/20251014064947_add_not_null_constraints_to_items.rb +30 -0
  107. data/db/migrate/20251014171659_add_performance_indexes.rb +29 -0
  108. data/db/migrate/20251014172525_add_fetch_status_check_constraint.rb +18 -0
  109. data/db/migrate/20251015100000_create_source_monitor_log_entries.rb +89 -0
  110. data/db/migrate/20251022100000_create_source_monitor_health_check_logs.rb +22 -0
  111. data/db/migrate/20251108120116_refresh_fetch_status_constraint.rb +29 -0
  112. data/docs/configuration.md +170 -0
  113. data/docs/deployment.md +63 -0
  114. data/docs/gh-cli-workflow.md +44 -0
  115. data/docs/installation.md +144 -0
  116. data/docs/troubleshooting.md +76 -0
  117. data/eslint.config.mjs +27 -0
  118. data/lib/generators/source_monitor/install/install_generator.rb +59 -0
  119. data/lib/generators/source_monitor/install/templates/source_monitor.rb.tt +155 -0
  120. data/lib/source_monitor/analytics/source_activity_rates.rb +53 -0
  121. data/lib/source_monitor/analytics/source_fetch_interval_distribution.rb +57 -0
  122. data/lib/source_monitor/analytics/sources_index_metrics.rb +92 -0
  123. data/lib/source_monitor/assets/bundler.rb +49 -0
  124. data/lib/source_monitor/assets.rb +6 -0
  125. data/lib/source_monitor/configuration.rb +654 -0
  126. data/lib/source_monitor/dashboard/queries.rb +356 -0
  127. data/lib/source_monitor/dashboard/quick_action.rb +7 -0
  128. data/lib/source_monitor/dashboard/quick_actions_presenter.rb +26 -0
  129. data/lib/source_monitor/dashboard/recent_activity.rb +30 -0
  130. data/lib/source_monitor/dashboard/recent_activity_presenter.rb +77 -0
  131. data/lib/source_monitor/dashboard/turbo_broadcaster.rb +87 -0
  132. data/lib/source_monitor/dashboard/upcoming_fetch_schedule.rb +126 -0
  133. data/lib/source_monitor/engine.rb +107 -0
  134. data/lib/source_monitor/events.rb +110 -0
  135. data/lib/source_monitor/feedjira_extensions.rb +103 -0
  136. data/lib/source_monitor/fetching/advisory_lock.rb +54 -0
  137. data/lib/source_monitor/fetching/completion/event_publisher.rb +22 -0
  138. data/lib/source_monitor/fetching/completion/follow_up_handler.rb +37 -0
  139. data/lib/source_monitor/fetching/completion/retention_handler.rb +30 -0
  140. data/lib/source_monitor/fetching/feed_fetcher.rb +627 -0
  141. data/lib/source_monitor/fetching/fetch_error.rb +88 -0
  142. data/lib/source_monitor/fetching/fetch_runner.rb +142 -0
  143. data/lib/source_monitor/fetching/retry_policy.rb +85 -0
  144. data/lib/source_monitor/fetching/stalled_fetch_reconciler.rb +146 -0
  145. data/lib/source_monitor/health/source_health_check.rb +100 -0
  146. data/lib/source_monitor/health/source_health_monitor.rb +210 -0
  147. data/lib/source_monitor/health/source_health_reset.rb +68 -0
  148. data/lib/source_monitor/health.rb +46 -0
  149. data/lib/source_monitor/http.rb +85 -0
  150. data/lib/source_monitor/instrumentation.rb +52 -0
  151. data/lib/source_monitor/items/item_creator.rb +601 -0
  152. data/lib/source_monitor/items/retention_pruner.rb +146 -0
  153. data/lib/source_monitor/items/retention_strategies/destroy.rb +26 -0
  154. data/lib/source_monitor/items/retention_strategies/soft_delete.rb +50 -0
  155. data/lib/source_monitor/items/retention_strategies.rb +9 -0
  156. data/lib/source_monitor/jobs/cleanup_options.rb +85 -0
  157. data/lib/source_monitor/jobs/fetch_failure_subscriber.rb +129 -0
  158. data/lib/source_monitor/jobs/solid_queue_metrics.rb +199 -0
  159. data/lib/source_monitor/jobs/visibility.rb +133 -0
  160. data/lib/source_monitor/logs/entry_sync.rb +69 -0
  161. data/lib/source_monitor/logs/filter_set.rb +163 -0
  162. data/lib/source_monitor/logs/query.rb +81 -0
  163. data/lib/source_monitor/logs/table_presenter.rb +161 -0
  164. data/lib/source_monitor/metrics.rb +77 -0
  165. data/lib/source_monitor/model_extensions.rb +109 -0
  166. data/lib/source_monitor/models/sanitizable.rb +76 -0
  167. data/lib/source_monitor/models/url_normalizable.rb +84 -0
  168. data/lib/source_monitor/pagination/paginator.rb +90 -0
  169. data/lib/source_monitor/realtime/adapter.rb +97 -0
  170. data/lib/source_monitor/realtime/broadcaster.rb +237 -0
  171. data/lib/source_monitor/realtime.rb +17 -0
  172. data/lib/source_monitor/release/changelog.rb +59 -0
  173. data/lib/source_monitor/release/runner.rb +73 -0
  174. data/lib/source_monitor/scheduler.rb +82 -0
  175. data/lib/source_monitor/scrapers/base.rb +105 -0
  176. data/lib/source_monitor/scrapers/fetchers/http_fetcher.rb +97 -0
  177. data/lib/source_monitor/scrapers/parsers/readability_parser.rb +101 -0
  178. data/lib/source_monitor/scrapers/readability.rb +156 -0
  179. data/lib/source_monitor/scraping/bulk_result_presenter.rb +85 -0
  180. data/lib/source_monitor/scraping/bulk_source_scraper.rb +233 -0
  181. data/lib/source_monitor/scraping/enqueuer.rb +125 -0
  182. data/lib/source_monitor/scraping/item_scraper/adapter_resolver.rb +44 -0
  183. data/lib/source_monitor/scraping/item_scraper/persistence.rb +189 -0
  184. data/lib/source_monitor/scraping/item_scraper.rb +84 -0
  185. data/lib/source_monitor/scraping/scheduler.rb +43 -0
  186. data/lib/source_monitor/scraping/state.rb +79 -0
  187. data/lib/source_monitor/security/authentication.rb +85 -0
  188. data/lib/source_monitor/security/parameter_sanitizer.rb +42 -0
  189. data/lib/source_monitor/sources/turbo_stream_presenter.rb +54 -0
  190. data/lib/source_monitor/turbo_streams/stream_responder.rb +95 -0
  191. data/lib/source_monitor/version.rb +3 -0
  192. data/lib/source_monitor.rb +149 -0
  193. data/lib/tasks/recover_stalled_fetches.rake +16 -0
  194. data/lib/tasks/source_monitor_assets.rake +28 -0
  195. data/lib/tasks/source_monitor_tasks.rake +29 -0
  196. data/lib/tasks/test_smoke.rake +12 -0
  197. data/package-lock.json +3997 -0
  198. data/package.json +29 -0
  199. data/postcss.config.js +6 -0
  200. data/source_monitor.gemspec +46 -0
  201. data/stylelint.config.js +12 -0
  202. metadata +469 -0
@@ -0,0 +1,627 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "time"
4
+ require "digest"
5
+ require "source_monitor/http"
6
+ require "source_monitor/fetching/fetch_error"
7
+ require "source_monitor/fetching/retry_policy"
8
+ require "source_monitor/items/item_creator"
9
+
10
+ module SourceMonitor
11
+ module Fetching
12
+ class FeedFetcher
13
# Outcome of one fetch attempt as returned by #call. `status` is one of
# :fetched, :not_modified or :failed (the values assigned in this class).
Result = Struct.new(:status, :feed, :response, :body, :error, :item_processing, :retry_decision, keyword_init: true)
# Tally of feed-entry processing: counters plus the collected items and
# per-entry error hashes.
EntryProcessingResult = Struct.new(
  :created,
  :updated,
  :failed,
  :items,
  :errors,
  :created_items,
  :updated_items,
  keyword_init: true
)
# Minimal response-like value (status/headers/body) built from the response
# hash carried by a Faraday error.
ResponseWrapper = Struct.new(:status, :headers, :body, keyword_init: true)

# Fallback interval bounds, in seconds, used when configuration supplies no
# positive value.
MIN_FETCH_INTERVAL = 5.minutes.to_f
MAX_FETCH_INTERVAL = 24.hours.to_f
# Fallback multipliers applied to the current interval: unchanged content,
# changed content, and failed fetch respectively.
INCREASE_FACTOR = 1.25
DECREASE_FACTOR = 0.75
FAILURE_INCREASE_FACTOR = 1.5
# Fallback jitter, as a fraction of the interval (applied in both directions).
JITTER_PERCENT = 0.1

attr_reader :source, :client, :jitter_proc
34
+
35
# Builds a fetcher bound to a single source.
#
# @param source the record whose feed is fetched and whose state is updated
# @param client optional pre-built HTTP connection; when nil one is built lazily
# @param jitter optional callable; receives the interval in seconds and
#   returns the jitter offset to add to it
def initialize(source:, client: nil, jitter: nil)
  @source, @client, @jitter_proc = source, client, jitter
end
40
+
41
# Runs one fetch attempt for the source and reports it through
# SourceMonitor::Instrumentation (fetch_start / fetch_finish).
#
# Fetch-related failures (FetchError and any other StandardError raised while
# fetching) are converted into a :failed Result via #handle_failure.
#
# The previous implementation ended with `return result` inside `ensure`.
# An explicit return in an ensure clause discards any in-flight exception, so
# Interrupt/SystemExit and errors raised while *recording* a failure were
# swallowed and the caller received nil. The method now returns the value of
# the body or of the matching rescue clause, and lets such exceptions
# propagate.
#
# @return [Result]
def call
  attempt_started_at = Time.current
  instrumentation_payload = base_instrumentation_payload
  started_monotonic = SourceMonitor::Instrumentation.monotonic_time

  SourceMonitor::Instrumentation.fetch_start(instrumentation_payload)

  perform_fetch(attempt_started_at, instrumentation_payload)
rescue FetchError => error
  handle_failure(error, started_at: attempt_started_at, instrumentation_payload:)
rescue StandardError => error
  # Anything unexpected is wrapped so it is logged and retried like other fetch failures.
  fetch_error = UnexpectedResponseError.new(error.message, original_error: error)
  handle_failure(fetch_error, started_at: attempt_started_at, instrumentation_payload:)
ensure
  # No `return` here: the ensure clause must not override the method result
  # or suppress a propagating exception.
  instrumentation_payload[:duration_ms] ||= duration_since(started_monotonic)
  SourceMonitor::Instrumentation.fetch_finish(instrumentation_payload)
end
60
+
61
+ private
62
+
63
# Seed payload for instrumentation events: identifies the source being fetched.
#
# @return [Hash] with :source_id and :feed_url
def base_instrumentation_payload
  { source_id: source.id, feed_url: source.feed_url }
end
69
+
70
# Milliseconds elapsed since +started_monotonic+, measured with the
# instrumentation monotonic clock and rounded to two decimals.
#
# @param started_monotonic [Numeric] earlier reading of Instrumentation.monotonic_time
# @return [Float]
def duration_since(started_monotonic)
  elapsed_seconds = SourceMonitor::Instrumentation.monotonic_time - started_monotonic
  (elapsed_seconds * 1000.0).round(2)
end
73
+
74
# Issues the HTTP request and dispatches on the response, translating Faraday
# exceptions into this library's FetchError hierarchy.
#
# @raise [TimeoutError, ConnectionError, HTTPError, ParsingError, FetchError]
def perform_fetch(started_at, instrumentation_payload)
  handle_response(perform_request, started_at, instrumentation_payload)
rescue TimeoutError, ConnectionError, HTTPError, ParsingError
  # Already one of our own error types: pass it through untouched.
  raise
rescue Faraday::TimeoutError => timeout
  raise TimeoutError.new(timeout.message, original_error: timeout)
rescue Faraday::ConnectionFailed, Faraday::SSLError => connection_failure
  raise ConnectionError.new(connection_failure.message, original_error: connection_failure)
rescue Faraday::ClientError => client_failure
  raise build_http_error_from_faraday(client_failure)
rescue Faraday::Error => faraday_failure
  raise FetchError.new(faraday_failure.message, original_error: faraday_failure)
end
88
+
89
# Performs a GET against the source's feed URL using the memoized connection.
def perform_request
  http = connection
  http.get(source.feed_url)
end
92
+
93
# Memoized HTTP connection: the injected client when one was given, otherwise
# a SourceMonitor::HTTP client built with this source's request headers.
def connection
  return @connection if @connection

  @connection = client || SourceMonitor::HTTP.client(headers: request_headers)
end
96
+
97
# Request headers for the fetch: the source's custom headers (keys stringified)
# plus conditional-GET headers when an ETag / Last-Modified value is stored.
#
# @return [Hash{String => Object}]
def request_headers
  custom = source.custom_headers || {}

  custom.transform_keys(&:to_s).tap do |headers|
    headers["If-None-Match"] = source.etag if source.etag.present?
    headers["If-Modified-Since"] = source.last_modified.httpdate if source.last_modified.present?
  end
end
105
+
106
# Routes a response by status: 200 is processed as a fresh feed, 304 as
# "not modified"; every other status is raised as an HTTPError.
#
# @raise [HTTPError] for any status other than 200 or 304
def handle_response(response, started_at, instrumentation_payload)
  status = response.status

  return handle_success(response, started_at, instrumentation_payload) if status == 200
  return handle_not_modified(response, started_at, instrumentation_payload) if status == 304

  raise HTTPError.new(status: status, response: response)
end
116
+
117
# Handles a 200 response: parses the feed, processes its entries, persists the
# source state and a fetch log, and fills in the instrumentation payload.
#
# Note: the duration is captured before parsing and entry processing, so it
# covers the time up to receiving the response only.
#
# @return [Result] with status :fetched
def handle_success(response, started_at, instrumentation_payload)
  duration_ms = elapsed_ms(started_at)
  body = response.body
  feed = parse_feed(body, response)
  processing = process_feed_entries(feed)
  signature = body_digest(body)

  update_source_for_success(response, duration_ms, feed, signature)
  create_fetch_log(
    response:,
    duration_ms:,
    started_at:,
    feed:,
    body:,
    success: true,
    feed_signature: signature,
    items_created: processing.created,
    items_updated: processing.updated,
    items_failed: processing.failed,
    item_errors: processing.errors
  )

  instrumentation_payload.merge!(success: true, status: :fetched, http_status: response.status)
  instrumentation_payload[:parser] = feed.class.name if feed
  instrumentation_payload.merge!(
    items_created: processing.created,
    items_updated: processing.updated,
    items_failed: processing.failed,
    retry_attempt: 0
  )

  Result.new(status: :fetched, feed:, response:, body:, item_processing: processing)
end
150
+
151
# Handles a 304 response: records the attempt as successful with no item
# changes and returns an empty processing result.
#
# @return [Result] with status :not_modified and a nil body
def handle_not_modified(response, started_at, instrumentation_payload)
  duration_ms = elapsed_ms(started_at)

  update_source_for_not_modified(response, duration_ms)
  create_fetch_log(response:, duration_ms:, started_at:, success: true)

  instrumentation_payload.merge!(
    success: true,
    status: :not_modified,
    http_status: response.status,
    items_created: 0,
    items_updated: 0,
    items_failed: 0,
    retry_attempt: 0
  )

  nothing_processed = EntryProcessingResult.new(
    created: 0,
    updated: 0,
    failed: 0,
    items: [],
    errors: [],
    created_items: [],
    updated_items: []
  )

  Result.new(status: :not_modified, response:, body: nil, item_processing: nothing_processed)
end
185
+
186
# Parses the response body with Feedjira.
#
# @raise [ParsingError] wrapping any StandardError raised by the parser; the
#   response is attached so the failure can be logged with its HTTP details
def parse_feed(body, response)
  Feedjira.parse(body)
rescue StandardError => parse_failure
  raise ParsingError.new(parse_failure.message, response:, original_error: parse_failure)
end
191
+
192
# Persists source state after a successful (200) fetch: clears the error
# fields, stores cache validators, reschedules the next fetch and resets the
# retry/circuit state.
def update_source_for_success(response, duration_ms, feed, feed_signature)
  attributes = {
    last_fetched_at: Time.current,
    last_fetch_duration_ms: duration_ms,
    last_http_status: response.status,
    last_error: nil,
    last_error_at: nil,
    failure_count: 0,
    feed_format: derive_feed_format(feed)
  }

  # Header lookup tries the lowercase name first, then the canonical casing.
  headers = response.headers
  etag = headers["etag"] || headers["ETag"]
  attributes[:etag] = etag if etag

  last_modified_header = headers["last-modified"] || headers["Last-Modified"]
  parsed_time = last_modified_header && parse_http_time(last_modified_header)
  attributes[:last_modified] = parsed_time if parsed_time

  apply_adaptive_interval!(attributes, content_changed: feed_signature_changed?(feed_signature))
  attributes[:metadata] = updated_metadata(feed_signature: feed_signature)
  reset_retry_state!(attributes)
  source.update!(attributes)
end
217
+
218
# Persists source state after a 304 response: same bookkeeping as a success,
# except the feed format is untouched and the content counts as unchanged.
def update_source_for_not_modified(response, duration_ms)
  attributes = {
    last_fetched_at: Time.current,
    last_fetch_duration_ms: duration_ms,
    last_http_status: response.status,
    last_error: nil,
    last_error_at: nil,
    failure_count: 0
  }

  # Header lookup tries the lowercase name first, then the canonical casing.
  headers = response.headers
  etag = headers["etag"] || headers["ETag"]
  attributes[:etag] = etag if etag

  last_modified_header = headers["last-modified"] || headers["Last-Modified"]
  parsed_time = last_modified_header && parse_http_time(last_modified_header)
  attributes[:last_modified] = parsed_time if parsed_time

  apply_adaptive_interval!(attributes, content_changed: false)
  attributes[:metadata] = updated_metadata
  reset_retry_state!(attributes)
  source.update!(attributes)
end
242
+
243
# Persists source state after a failed fetch: records the error, bumps the
# failure counter, reschedules, and applies the retry policy.
#
# @return the retry decision from #apply_retry_strategy! (nil when the policy itself failed)
def update_source_for_failure(error, duration_ms)
  failed_at = Time.current
  attributes = {
    last_fetched_at: failed_at,
    last_fetch_duration_ms: duration_ms,
    last_http_status: error.http_status,
    last_error: error.message,
    last_error_at: failed_at,
    failure_count: source.failure_count.to_i + 1
  }

  apply_adaptive_interval!(attributes, content_changed: false, failure: true)
  attributes[:metadata] = updated_metadata
  retry_decision = apply_retry_strategy!(attributes, error, failed_at)
  source.update!(attributes)
  retry_decision
end
260
+
261
# Writes a "no retry pending, circuit closed" state into +attributes+ (mutated in place).
#
# @return [nil]
def reset_retry_state!(attributes)
  attributes.merge!(
    fetch_retry_attempt: 0,
    fetch_circuit_opened_at: nil,
    fetch_circuit_until: nil
  )
  nil
end
266
+
267
# Asks the RetryPolicy what to do after a failure and writes the resulting
# retry / circuit-breaker fields into +attributes+ (mutated in place).
#
# Three outcomes are handled: open the circuit, schedule a retry, or neither
# (retry counter reset). Any error raised while applying the policy is logged
# (when a Rails logger is available) and swallowed so the failure can still be
# persisted.
#
# @return the policy decision, or nil when applying the policy raised
def apply_retry_strategy!(attributes, error, now)
  decision = SourceMonitor::Fetching::RetryPolicy.new(source:, error:, now:).decision

  if decision.open_circuit?
    attributes[:fetch_retry_attempt] = 0
    attributes[:fetch_circuit_opened_at] = now
    attributes[:fetch_circuit_until] = decision.circuit_until
    # While the circuit is open, nothing is fetched before it closes again.
    attributes[:next_fetch_at] = decision.circuit_until
    attributes[:backoff_until] = decision.circuit_until
  elsif decision.retry?
    attributes[:fetch_retry_attempt] = decision.next_attempt
    attributes[:fetch_circuit_opened_at] = nil
    attributes[:fetch_circuit_until] = nil
    # Skipped only when adaptive fetching is explicitly false (nil still counts as enabled).
    if source.adaptive_fetching_enabled? != false
      retry_at = now + decision.wait
      already_scheduled = attributes[:next_fetch_at]
      # Retry no later than whatever was already scheduled.
      attributes[:next_fetch_at] = [ already_scheduled, retry_at ].compact.min
      attributes[:backoff_until] = retry_at
    end
  else
    attributes[:fetch_retry_attempt] = 0
  end

  decision
rescue StandardError => policy_error
  if defined?(Rails) && Rails.respond_to?(:logger) && Rails.logger
    Rails.logger.error(
      "[SourceMonitor] Failed to apply retry strategy for source #{source.id}: #{policy_error.class} - #{policy_error.message}"
    )
  end
  # Ensure the retry keys are present without overwriting values set before the error.
  attributes[:fetch_retry_attempt] ||= 0
  attributes[:fetch_circuit_opened_at] ||= nil
  attributes[:fetch_circuit_until] ||= nil
  nil
end
300
+
301
# Creates a fetch log row on the source describing this attempt.
#
# +response+, +feed+, +error+ and +body+ may each be nil; the corresponding
# columns are then left nil (headers become an empty hash).
def create_fetch_log(response:, duration_ms:, started_at:, success:, feed: nil, error: nil, body: nil, feed_signature: nil,
                     items_created: 0, items_updated: 0, items_failed: 0, item_errors: [])
  completed_at = started_at + (duration_ms / 1000.0)
  entry_count = feed.entries.size if feed&.respond_to?(:entries)

  source.fetch_logs.create!(
    success:,
    started_at:,
    completed_at:,
    duration_ms:,
    http_status: response&.status,
    http_response_headers: normalized_headers(response&.headers),
    feed_size_bytes: body&.bytesize,
    items_in_feed: entry_count,
    items_created:,
    items_updated:,
    items_failed:,
    error_class: error&.class&.name,
    error_message: error&.message,
    error_backtrace: error_backtrace(error),
    metadata: feed_metadata(feed, error:, feed_signature:, item_errors:)
  )
end
321
+
322
# Derives a feed-format label from the parsed feed's class: the last
# namespace segment of the class name, underscored. Returns nil without a feed.
def derive_feed_format(feed)
  feed ? feed.class.name.split("::").last.underscore : nil
end
327
+
328
# Builds the metadata hash stored on a fetch log. Only keys with something to
# report are included (:parser, :error_code, :feed_signature, :item_errors).
#
# @return [Hash]
def feed_metadata(feed, error: nil, feed_signature: nil, item_errors: [])
  {}.tap do |metadata|
    metadata[:parser] = feed.class.name if feed
    metadata[:error_code] = error.code if error&.respond_to?(:code)
    metadata[:feed_signature] = feed_signature if feed_signature
    metadata[:item_errors] = item_errors if item_errors.present?
  end
end
336
+
337
# Converts response headers into a plain Hash with lowercase String keys.
# Returns an empty hash when no headers are given.
#
# @param headers [#to_h, nil]
# @return [Hash{String => Object}]
def normalized_headers(headers)
  return {} if !headers

  headers.to_h.each_with_object({}) do |(name, value), normalized|
    normalized[name.to_s.downcase] = value
  end
end
342
+
343
# Backtrace text for a fetch error: the first 20 frames of the wrapped
# original exception, newline-joined. Returns nil when there is no error or no
# wrapped exception.
def error_backtrace(error)
  root_cause = error&.original_error
  return if root_cause.nil?

  Array(root_cause.backtrace).first(20).join("\n")
end
348
+
349
# Parses an HTTP-date header value.
#
# @return [Time, nil] nil for a blank value or one Time.httpdate rejects
def parse_http_time(value)
  value.blank? ? nil : Time.httpdate(value)
rescue ArgumentError
  nil
end
356
+
357
# Whole milliseconds between +started_at+ and now (Time.current).
#
# @return [Integer]
def elapsed_ms(started_at)
  elapsed_seconds = Time.current - started_at
  (elapsed_seconds * 1000.0).round
end
360
+
361
# Records a failed fetch: updates the source, writes a fetch log, fills in the
# instrumentation payload and returns a :failed Result carrying the error and
# the retry decision.
#
# @param error [FetchError]
# @return [Result]
def handle_failure(error, started_at:, instrumentation_payload:)
  response = error.response
  body = response&.body
  duration_ms = elapsed_ms(started_at)

  retry_decision = update_source_for_failure(error, duration_ms)
  create_fetch_log(response:, duration_ms:, started_at:, success: false, error:, body:)

  instrumentation_payload.merge!(
    success: false,
    status: :failed,
    error_class: error.class.name,
    error_message: error.message
  )
  instrumentation_payload[:http_status] = error.http_status if error.http_status
  instrumentation_payload[:error_code] = error.code if error.respond_to?(:code)
  instrumentation_payload.merge!(
    items_created: 0,
    items_updated: 0,
    items_failed: 0,
    retry_attempt: retry_decision&.next_attempt || 0
  )

  nothing_processed = EntryProcessingResult.new(
    created: 0,
    updated: 0,
    failed: 0,
    items: [],
    errors: [],
    created_items: [],
    updated_items: []
  )

  Result.new(
    status: :failed,
    response:,
    body:,
    error:,
    retry_decision:,
    item_processing: nothing_processed
  )
end
404
+
405
# Converts a Faraday client error into an HTTPError. The error's response hash
# (possibly nil) is wrapped in a ResponseWrapper; a missing status becomes 0
# on the HTTPError itself.
#
# @return [HTTPError]
def build_http_error_from_faraday(error)
  details = error.response || {}
  wrapped = ResponseWrapper.new(
    status: details[:status],
    headers: details[:headers] || details[:response_headers] || {},
    body: details[:body]
  )

  HTTPError.new(
    status: wrapped.status || 0,
    message: error.message,
    response: wrapped,
    original_error: error
  )
end
418
+
419
# Whether +feed_signature+ differs from the signature stored in the source
# metadata under "last_feed_signature". A blank signature never counts as a change.
def feed_signature_changed?(feed_signature)
  return false if feed_signature.blank?

  previous_signature = (source.metadata || {}).fetch("last_feed_signature", nil)
  previous_signature != feed_signature
end
424
+
425
# Writes scheduling fields into +attributes+ (mutated in place).
#
# With adaptive fetching disabled the next fetch is simply the fixed interval
# (at least one minute) from now. Otherwise the interval is recomputed from
# the outcome, jittered, and never scheduled before an existing backoff.
def apply_adaptive_interval!(attributes, content_changed:, failure: false)
  unless source.adaptive_fetching_enabled?
    fixed_minutes = [ source.fetch_interval_minutes.to_i, 1 ].max
    attributes[:next_fetch_at] = Time.current + fixed_minutes.minutes
    attributes[:backoff_until] = nil
    return
  end

  interval_seconds = compute_next_interval_seconds(content_changed:, failure:)
  scheduled_time = Time.current + adjusted_interval_with_jitter(interval_seconds)
  if source.backoff_until.present?
    # Respect a backoff that extends beyond the freshly computed time.
    scheduled_time = [ scheduled_time, source.backoff_until ].compact.max
  end

  attributes[:fetch_interval_minutes] = interval_minutes_for(interval_seconds)
  attributes[:next_fetch_at] = scheduled_time
  attributes[:backoff_until] = failure ? scheduled_time : nil
end
440
+
441
# Computes the next fetch interval in seconds: the current interval (raised to
# the minimum if needed) multiplied by the factor matching the outcome
# (failure, changed content, or unchanged content), then bounded by the
# minimum and maximum intervals.
#
# @return [Float]
def compute_next_interval_seconds(content_changed:, failure:)
  baseline = [ current_interval_seconds, min_fetch_interval_seconds ].max

  factor =
    if failure
      failure_increase_factor_value
    elsif content_changed
      decrease_factor_value
    else
      increase_factor_value
    end

  candidate = baseline * factor
  candidate = min_fetch_interval_seconds if candidate < min_fetch_interval_seconds
  candidate = max_fetch_interval_seconds if candidate > max_fetch_interval_seconds
  candidate.to_f
end
456
+
457
# The source's configured fetch interval converted from minutes to seconds.
#
# @return [Float]
def current_interval_seconds
  minutes = source.fetch_interval_minutes.to_f
  minutes * 60.0
end
460
+
461
# Converts an interval in seconds to whole minutes (rounded), never below 1.
#
# @return [Integer]
def interval_minutes_for(interval_seconds)
  [ (interval_seconds / 60.0).round, 1 ].max
end
465
+
466
# Minimum fetch interval in seconds: the configured minutes when positive,
# otherwise MIN_FETCH_INTERVAL.
def min_fetch_interval_seconds
  configured_minutes = fetching_config&.min_interval_minutes
  configured_seconds(configured_minutes, MIN_FETCH_INTERVAL)
end
469
+
470
# Maximum fetch interval in seconds: the configured minutes when positive,
# otherwise MAX_FETCH_INTERVAL.
def max_fetch_interval_seconds
  configured_minutes = fetching_config&.max_interval_minutes
  configured_seconds(configured_minutes, MAX_FETCH_INTERVAL)
end
473
+
474
# Multiplier applied when the feed content did not change: the configured
# value when positive, otherwise INCREASE_FACTOR.
def increase_factor_value
  configured_factor = fetching_config&.increase_factor
  configured_positive(configured_factor, INCREASE_FACTOR)
end
477
+
478
# Multiplier applied when the feed content changed: the configured value when
# positive, otherwise DECREASE_FACTOR.
def decrease_factor_value
  configured_factor = fetching_config&.decrease_factor
  configured_positive(configured_factor, DECREASE_FACTOR)
end
481
+
482
# Multiplier applied after a failed fetch: the configured value when positive,
# otherwise FAILURE_INCREASE_FACTOR.
def failure_increase_factor_value
  configured_factor = fetching_config&.failure_increase_factor
  configured_positive(configured_factor, FAILURE_INCREASE_FACTOR)
end
485
+
486
# Jitter as a fraction of the interval: the configured value (negative values
# are treated as 0.0 by #configured_non_negative), otherwise JITTER_PERCENT.
def jitter_percent_value
  configured_jitter = fetching_config&.jitter_percent
  configured_non_negative(configured_jitter, JITTER_PERCENT)
end
489
+
490
# Returns a copy of the source metadata with the
# "dynamic_fetch_interval_seconds" key removed and, when a signature is
# given, "last_feed_signature" set to it. The source's own hash is not mutated.
#
# @return [Hash]
def updated_metadata(feed_signature: nil)
  (source.metadata || {}).dup.tap do |metadata|
    metadata.delete("dynamic_fetch_interval_seconds")
    metadata["last_feed_signature"] = feed_signature if feed_signature.present?
  end
end
496
+
497
# Adds the jitter offset to the interval, never returning less than the
# minimum fetch interval.
#
# @return [Numeric] seconds
def adjusted_interval_with_jitter(interval_seconds)
  jittered = interval_seconds + jitter_offset(interval_seconds)
  floor = min_fetch_interval_seconds
  jittered < floor ? floor : jittered
end
503
+
504
# Offset (in seconds) to add to an interval to spread out fetches.
#
# Uses the injected jitter callable when one was given; otherwise picks a
# random value in [-range, +range) where range = interval * jitter percent.
# Non-positive intervals and non-positive ranges yield 0.
def jitter_offset(interval_seconds)
  if interval_seconds <= 0
    0
  elsif jitter_proc.respond_to?(:call)
    jitter_proc.call(interval_seconds)
  else
    spread = interval_seconds * jitter_percent_value
    spread <= 0 ? 0 : ((rand * 2) - 1) * spread
  end
end
513
+
514
# SHA-256 hex digest of the response body, used as the feed signature.
# Returns nil for a blank body.
def body_digest(body)
  body.blank? ? nil : Digest::SHA256.hexdigest(body)
end
519
+
520
# Creates or updates an item for every entry of the parsed feed.
#
# Each entry is handled independently: a StandardError raised for one entry is
# counted as a failure and recorded via #normalize_item_error, and processing
# continues with the next entry. Entries whose result is not `created?` are
# counted as updated.
#
# @return [EntryProcessingResult] all zero/empty when the feed exposes no entries
def process_feed_entries(feed)
  outcome = EntryProcessingResult.new(
    created: 0,
    updated: 0,
    failed: 0,
    items: [],
    errors: [],
    created_items: [],
    updated_items: []
  )
  return outcome unless feed.respond_to?(:entries)

  Array(feed.entries).each do |entry|
    result = SourceMonitor::Items::ItemCreator.call(source:, entry:)
    SourceMonitor::Events.run_item_processors(source:, entry:, result: result)
    outcome.items << result.item

    if result.created?
      outcome.created += 1
      outcome.created_items << result.item
      SourceMonitor::Events.after_item_created(item: result.item, source:, entry:, result: result)
    else
      outcome.updated += 1
      outcome.updated_items << result.item
    end
  rescue StandardError => error
    outcome.failed += 1
    outcome.errors << normalize_item_error(entry, error)
  end

  outcome.errors = outcome.errors.compact
  outcome
end
568
+
569
# Converts a configured number of minutes into seconds, falling back to
# +default+ (already in seconds) when the value is missing or not positive.
def configured_seconds(minutes_value, default)
  minutes = extract_numeric(minutes_value)
  minutes&.positive? ? minutes * 60.0 : default
end
575
+
576
# Returns the configured number when it is positive, otherwise +default+.
def configured_positive(value, default)
  number = extract_numeric(value)
  number&.positive? ? number : default
end
582
+
583
# Returns the configured number clamped to be non-negative, or +default+ when
# nothing is configured.
#
# An unset (nil) setting is checked up front: numeric coercion turns nil into
# 0.0 (`nil.to_f`), which previously made the default unreachable and silently
# disabled the behaviour this value controls.
#
# @param value configured setting (Numeric, numeric-like, or nil)
# @param default [Numeric] value used when +value+ is nil or not coercible
# @return [Numeric] 0.0 for negative input, otherwise the number or default
def configured_non_negative(value, default)
  return default if value.nil?

  number = extract_numeric(value)
  return default if number.nil?

  number.negative? ? 0.0 : number
end
589
+
590
# Coerces a configuration value to a number.
#
# nil is returned as nil rather than coerced: `nil.respond_to?(:to_f)` is
# true and `nil.to_f` is 0.0, so an unset setting used to look like an
# explicit zero and callers could never fall back to their defaults.
#
# @param value [Object]
# @return [Numeric, nil] the value itself when Numeric, its `to_f` when it
#   offers one, nil for nil, uncoercible objects, or when coercion raises
def extract_numeric(value)
  return nil if value.nil?
  return value if value.is_a?(Numeric)
  return value.to_f if value.respond_to?(:to_f)

  nil
rescue StandardError
  nil
end
598
+
599
# The fetching section of the global SourceMonitor configuration.
def fetching_config
  configuration = SourceMonitor.config
  configuration.fetching
end
602
+
603
# Describes a per-entry processing failure as a plain hash for the fetch log.
# If reading the entry's guid/title itself raises, only the error class and
# message are reported.
#
# @return [Hash]
def normalize_item_error(entry, error)
  identity = { guid: safe_entry_guid(entry), title: safe_entry_title(entry) }
  identity.merge(error_class: error.class.name, error_message: error.message)
rescue StandardError
  { error_class: error.class.name, error_message: error.message }
end
613
+
614
# Identifier of a feed entry: `entry_id` when the entry offers it, else `id`,
# else nil.
def safe_entry_guid(entry)
  return entry.entry_id if entry.respond_to?(:entry_id)

  entry.id if entry.respond_to?(:id)
end
621
+
622
# Title of a feed entry, or nil when the entry has no `title` method.
def safe_entry_title(entry)
  entry.respond_to?(:title) ? entry.title : nil
end
625
+ end
626
+ end
627
+ end
@@ -0,0 +1,88 @@
1
+ # frozen_string_literal: true
2
+
3
module SourceMonitor
  module Fetching
    # Base class for every failure raised while fetching a feed.
    #
    # Carries the underlying exception (+original_error+) and, when available,
    # the HTTP response that accompanied the failure. Each subclass supplies a
    # machine-readable CODE and a default message.
    class FetchError < StandardError
      CODE = "fetch_error"

      attr_reader :original_error, :response

      # @param message [String, nil] falls back to the class's default message
      # @param original_error [Exception, nil] the exception being wrapped
      # @param response [#status, nil] HTTP response associated with the failure
      def initialize(message = nil, original_error: nil, response: nil)
        super(message || default_message)
        @original_error = original_error
        @response = response
      end

      # Machine-readable error code: the CODE constant of the concrete class.
      def code
        self.class::CODE
      end

      # HTTP status of the attached response, or nil without a response.
      def http_status
        response&.status
      end

      protected

      def default_message
        "Fetch error"
      end
    end

    # The request did not complete in time.
    class TimeoutError < FetchError
      CODE = "timeout"

      protected

      def default_message
        "Request timed out"
      end
    end

    # The connection could not be established.
    class ConnectionError < FetchError
      CODE = "connection"

      protected

      def default_message
        "Connection failed"
      end
    end

    # The server answered with a status the fetcher treats as an error.
    class HTTPError < FetchError
      CODE = "http_error"

      attr_reader :status

      # @param status [Integer] HTTP status that triggered the error
      # @param message [String, nil] defaults to "HTTP <status>"
      def initialize(status:, message: nil, response: nil, original_error: nil)
        @status = status
        super(message || "HTTP #{status}", response: response, original_error: original_error)
      end

      # Status of the attached response; when no response (or no response
      # status) is available, falls back to the explicit +status+ this error
      # was built with. Previously the explicit status was ignored, so an
      # HTTPError created without a response reported no HTTP status at all.
      # Non-positive placeholders (such as 0 for "unknown") are not reported.
      def http_status
        super || explicit_status
      end

      protected

      def default_message
        "HTTP #{status}"
      end

      private

      # The constructor-supplied status when it is a real (positive) status code.
      def explicit_status
        status if status.is_a?(Integer) && status.positive?
      end
    end

    # The response body could not be parsed as a feed.
    class ParsingError < FetchError
      CODE = "parsing"

      protected

      def default_message
        "Unable to parse feed"
      end
    end

    # Any other unexpected failure during a fetch.
    class UnexpectedResponseError < FetchError
      CODE = "unexpected_response"

      protected

      def default_message
        "Unexpected response received"
      end
    end
  end
end