source_monitor 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202)
  1. checksums.yaml +7 -0
  2. data/.gitignore +16 -0
  3. data/.rubocop.yml +12 -0
  4. data/.ruby-version +1 -0
  5. data/AGENTS.md +132 -0
  6. data/CHANGELOG.md +66 -0
  7. data/CONTRIBUTING.md +31 -0
  8. data/Gemfile +30 -0
  9. data/Gemfile.lock +411 -0
  10. data/MIT-LICENSE +20 -0
  11. data/README.md +108 -0
  12. data/Rakefile +8 -0
  13. data/app/assets/builds/.keep +0 -0
  14. data/app/assets/config/source_monitor_manifest.js +4 -0
  15. data/app/assets/images/source_monitor/.keep +0 -0
  16. data/app/assets/javascripts/source_monitor/application.js +20 -0
  17. data/app/assets/javascripts/source_monitor/controllers/async_submit_controller.js +36 -0
  18. data/app/assets/javascripts/source_monitor/controllers/dropdown_controller.js +109 -0
  19. data/app/assets/javascripts/source_monitor/controllers/modal_controller.js +56 -0
  20. data/app/assets/javascripts/source_monitor/controllers/notification_controller.js +53 -0
  21. data/app/assets/javascripts/source_monitor/turbo_actions.js +13 -0
  22. data/app/assets/stylesheets/source_monitor/application.tailwind.css +13 -0
  23. data/app/assets/svgs/source_monitor/.keep +0 -0
  24. data/app/controllers/concerns/.keep +0 -0
  25. data/app/controllers/concerns/source_monitor/sanitizes_search_params.rb +81 -0
  26. data/app/controllers/source_monitor/application_controller.rb +62 -0
  27. data/app/controllers/source_monitor/dashboard_controller.rb +27 -0
  28. data/app/controllers/source_monitor/fetch_logs_controller.rb +9 -0
  29. data/app/controllers/source_monitor/health_controller.rb +10 -0
  30. data/app/controllers/source_monitor/items_controller.rb +116 -0
  31. data/app/controllers/source_monitor/logs_controller.rb +15 -0
  32. data/app/controllers/source_monitor/scrape_logs_controller.rb +9 -0
  33. data/app/controllers/source_monitor/source_bulk_scrapes_controller.rb +35 -0
  34. data/app/controllers/source_monitor/source_fetches_controller.rb +22 -0
  35. data/app/controllers/source_monitor/source_health_checks_controller.rb +34 -0
  36. data/app/controllers/source_monitor/source_health_resets_controller.rb +27 -0
  37. data/app/controllers/source_monitor/source_retries_controller.rb +22 -0
  38. data/app/controllers/source_monitor/source_turbo_responses.rb +115 -0
  39. data/app/controllers/source_monitor/sources_controller.rb +179 -0
  40. data/app/helpers/source_monitor/application_helper.rb +327 -0
  41. data/app/jobs/source_monitor/application_job.rb +13 -0
  42. data/app/jobs/source_monitor/fetch_feed_job.rb +117 -0
  43. data/app/jobs/source_monitor/item_cleanup_job.rb +48 -0
  44. data/app/jobs/source_monitor/log_cleanup_job.rb +47 -0
  45. data/app/jobs/source_monitor/schedule_fetches_job.rb +29 -0
  46. data/app/jobs/source_monitor/scrape_item_job.rb +47 -0
  47. data/app/jobs/source_monitor/source_health_check_job.rb +77 -0
  48. data/app/mailers/source_monitor/application_mailer.rb +17 -0
  49. data/app/models/concerns/.keep +0 -0
  50. data/app/models/concerns/source_monitor/loggable.rb +18 -0
  51. data/app/models/source_monitor/application_record.rb +5 -0
  52. data/app/models/source_monitor/fetch_log.rb +31 -0
  53. data/app/models/source_monitor/health_check_log.rb +28 -0
  54. data/app/models/source_monitor/item.rb +102 -0
  55. data/app/models/source_monitor/item_content.rb +11 -0
  56. data/app/models/source_monitor/log_entry.rb +56 -0
  57. data/app/models/source_monitor/scrape_log.rb +31 -0
  58. data/app/models/source_monitor/source.rb +115 -0
  59. data/app/views/layouts/source_monitor/application.html.erb +54 -0
  60. data/app/views/source_monitor/dashboard/_fetch_schedule.html.erb +90 -0
  61. data/app/views/source_monitor/dashboard/_job_metrics.html.erb +82 -0
  62. data/app/views/source_monitor/dashboard/_recent_activity.html.erb +39 -0
  63. data/app/views/source_monitor/dashboard/_stat_card.html.erb +6 -0
  64. data/app/views/source_monitor/dashboard/_stats.html.erb +9 -0
  65. data/app/views/source_monitor/dashboard/index.html.erb +48 -0
  66. data/app/views/source_monitor/fetch_logs/show.html.erb +90 -0
  67. data/app/views/source_monitor/items/_details.html.erb +234 -0
  68. data/app/views/source_monitor/items/_details_wrapper.html.erb +3 -0
  69. data/app/views/source_monitor/items/index.html.erb +147 -0
  70. data/app/views/source_monitor/items/show.html.erb +3 -0
  71. data/app/views/source_monitor/logs/index.html.erb +208 -0
  72. data/app/views/source_monitor/scrape_logs/show.html.erb +73 -0
  73. data/app/views/source_monitor/shared/_toast.html.erb +34 -0
  74. data/app/views/source_monitor/sources/_bulk_scrape_form.html.erb +64 -0
  75. data/app/views/source_monitor/sources/_bulk_scrape_modal.html.erb +53 -0
  76. data/app/views/source_monitor/sources/_details.html.erb +302 -0
  77. data/app/views/source_monitor/sources/_details_wrapper.html.erb +3 -0
  78. data/app/views/source_monitor/sources/_empty_state_row.html.erb +5 -0
  79. data/app/views/source_monitor/sources/_fetch_interval_heatmap.html.erb +46 -0
  80. data/app/views/source_monitor/sources/_form.html.erb +143 -0
  81. data/app/views/source_monitor/sources/_health_status_badge.html.erb +46 -0
  82. data/app/views/source_monitor/sources/_row.html.erb +102 -0
  83. data/app/views/source_monitor/sources/edit.html.erb +28 -0
  84. data/app/views/source_monitor/sources/index.html.erb +153 -0
  85. data/app/views/source_monitor/sources/new.html.erb +22 -0
  86. data/app/views/source_monitor/sources/show.html.erb +3 -0
  87. data/config/coverage_baseline.json +2010 -0
  88. data/config/initializers/feedjira.rb +19 -0
  89. data/config/routes.rb +18 -0
  90. data/config/tailwind.config.js +17 -0
  91. data/db/migrate/20241008120000_create_source_monitor_sources.rb +40 -0
  92. data/db/migrate/20241008121000_create_source_monitor_items.rb +44 -0
  93. data/db/migrate/20241008122000_create_source_monitor_fetch_logs.rb +32 -0
  94. data/db/migrate/20241008123000_create_source_monitor_scrape_logs.rb +25 -0
  95. data/db/migrate/20251008183000_change_fetch_interval_to_minutes.rb +23 -0
  96. data/db/migrate/20251009090000_create_source_monitor_item_contents.rb +38 -0
  97. data/db/migrate/20251009103000_add_feed_content_readability_to_sources.rb +5 -0
  98. data/db/migrate/20251010090000_add_adaptive_fetching_toggle_to_sources.rb +7 -0
  99. data/db/migrate/20251010123000_add_deleted_at_to_source_monitor_items.rb +8 -0
  100. data/db/migrate/20251010153000_add_type_to_source_monitor_sources.rb +8 -0
  101. data/db/migrate/20251010154500_add_fetch_status_to_source_monitor_sources.rb +9 -0
  102. data/db/migrate/20251010160000_create_solid_cable_messages.rb +16 -0
  103. data/db/migrate/20251011090000_add_fetch_retry_state_to_sources.rb +14 -0
  104. data/db/migrate/20251012090000_add_health_fields_to_sources.rb +17 -0
  105. data/db/migrate/20251012100000_optimize_source_monitor_database_performance.rb +13 -0
  106. data/db/migrate/20251014064947_add_not_null_constraints_to_items.rb +30 -0
  107. data/db/migrate/20251014171659_add_performance_indexes.rb +29 -0
  108. data/db/migrate/20251014172525_add_fetch_status_check_constraint.rb +18 -0
  109. data/db/migrate/20251015100000_create_source_monitor_log_entries.rb +89 -0
  110. data/db/migrate/20251022100000_create_source_monitor_health_check_logs.rb +22 -0
  111. data/db/migrate/20251108120116_refresh_fetch_status_constraint.rb +29 -0
  112. data/docs/configuration.md +170 -0
  113. data/docs/deployment.md +63 -0
  114. data/docs/gh-cli-workflow.md +44 -0
  115. data/docs/installation.md +144 -0
  116. data/docs/troubleshooting.md +76 -0
  117. data/eslint.config.mjs +27 -0
  118. data/lib/generators/source_monitor/install/install_generator.rb +59 -0
  119. data/lib/generators/source_monitor/install/templates/source_monitor.rb.tt +155 -0
  120. data/lib/source_monitor/analytics/source_activity_rates.rb +53 -0
  121. data/lib/source_monitor/analytics/source_fetch_interval_distribution.rb +57 -0
  122. data/lib/source_monitor/analytics/sources_index_metrics.rb +92 -0
  123. data/lib/source_monitor/assets/bundler.rb +49 -0
  124. data/lib/source_monitor/assets.rb +6 -0
  125. data/lib/source_monitor/configuration.rb +654 -0
  126. data/lib/source_monitor/dashboard/queries.rb +356 -0
  127. data/lib/source_monitor/dashboard/quick_action.rb +7 -0
  128. data/lib/source_monitor/dashboard/quick_actions_presenter.rb +26 -0
  129. data/lib/source_monitor/dashboard/recent_activity.rb +30 -0
  130. data/lib/source_monitor/dashboard/recent_activity_presenter.rb +77 -0
  131. data/lib/source_monitor/dashboard/turbo_broadcaster.rb +87 -0
  132. data/lib/source_monitor/dashboard/upcoming_fetch_schedule.rb +126 -0
  133. data/lib/source_monitor/engine.rb +107 -0
  134. data/lib/source_monitor/events.rb +110 -0
  135. data/lib/source_monitor/feedjira_extensions.rb +103 -0
  136. data/lib/source_monitor/fetching/advisory_lock.rb +54 -0
  137. data/lib/source_monitor/fetching/completion/event_publisher.rb +22 -0
  138. data/lib/source_monitor/fetching/completion/follow_up_handler.rb +37 -0
  139. data/lib/source_monitor/fetching/completion/retention_handler.rb +30 -0
  140. data/lib/source_monitor/fetching/feed_fetcher.rb +627 -0
  141. data/lib/source_monitor/fetching/fetch_error.rb +88 -0
  142. data/lib/source_monitor/fetching/fetch_runner.rb +142 -0
  143. data/lib/source_monitor/fetching/retry_policy.rb +85 -0
  144. data/lib/source_monitor/fetching/stalled_fetch_reconciler.rb +146 -0
  145. data/lib/source_monitor/health/source_health_check.rb +100 -0
  146. data/lib/source_monitor/health/source_health_monitor.rb +210 -0
  147. data/lib/source_monitor/health/source_health_reset.rb +68 -0
  148. data/lib/source_monitor/health.rb +46 -0
  149. data/lib/source_monitor/http.rb +85 -0
  150. data/lib/source_monitor/instrumentation.rb +52 -0
  151. data/lib/source_monitor/items/item_creator.rb +601 -0
  152. data/lib/source_monitor/items/retention_pruner.rb +146 -0
  153. data/lib/source_monitor/items/retention_strategies/destroy.rb +26 -0
  154. data/lib/source_monitor/items/retention_strategies/soft_delete.rb +50 -0
  155. data/lib/source_monitor/items/retention_strategies.rb +9 -0
  156. data/lib/source_monitor/jobs/cleanup_options.rb +85 -0
  157. data/lib/source_monitor/jobs/fetch_failure_subscriber.rb +129 -0
  158. data/lib/source_monitor/jobs/solid_queue_metrics.rb +199 -0
  159. data/lib/source_monitor/jobs/visibility.rb +133 -0
  160. data/lib/source_monitor/logs/entry_sync.rb +69 -0
  161. data/lib/source_monitor/logs/filter_set.rb +163 -0
  162. data/lib/source_monitor/logs/query.rb +81 -0
  163. data/lib/source_monitor/logs/table_presenter.rb +161 -0
  164. data/lib/source_monitor/metrics.rb +77 -0
  165. data/lib/source_monitor/model_extensions.rb +109 -0
  166. data/lib/source_monitor/models/sanitizable.rb +76 -0
  167. data/lib/source_monitor/models/url_normalizable.rb +84 -0
  168. data/lib/source_monitor/pagination/paginator.rb +90 -0
  169. data/lib/source_monitor/realtime/adapter.rb +97 -0
  170. data/lib/source_monitor/realtime/broadcaster.rb +237 -0
  171. data/lib/source_monitor/realtime.rb +17 -0
  172. data/lib/source_monitor/release/changelog.rb +59 -0
  173. data/lib/source_monitor/release/runner.rb +73 -0
  174. data/lib/source_monitor/scheduler.rb +82 -0
  175. data/lib/source_monitor/scrapers/base.rb +105 -0
  176. data/lib/source_monitor/scrapers/fetchers/http_fetcher.rb +97 -0
  177. data/lib/source_monitor/scrapers/parsers/readability_parser.rb +101 -0
  178. data/lib/source_monitor/scrapers/readability.rb +156 -0
  179. data/lib/source_monitor/scraping/bulk_result_presenter.rb +85 -0
  180. data/lib/source_monitor/scraping/bulk_source_scraper.rb +233 -0
  181. data/lib/source_monitor/scraping/enqueuer.rb +125 -0
  182. data/lib/source_monitor/scraping/item_scraper/adapter_resolver.rb +44 -0
  183. data/lib/source_monitor/scraping/item_scraper/persistence.rb +189 -0
  184. data/lib/source_monitor/scraping/item_scraper.rb +84 -0
  185. data/lib/source_monitor/scraping/scheduler.rb +43 -0
  186. data/lib/source_monitor/scraping/state.rb +79 -0
  187. data/lib/source_monitor/security/authentication.rb +85 -0
  188. data/lib/source_monitor/security/parameter_sanitizer.rb +42 -0
  189. data/lib/source_monitor/sources/turbo_stream_presenter.rb +54 -0
  190. data/lib/source_monitor/turbo_streams/stream_responder.rb +95 -0
  191. data/lib/source_monitor/version.rb +3 -0
  192. data/lib/source_monitor.rb +149 -0
  193. data/lib/tasks/recover_stalled_fetches.rake +16 -0
  194. data/lib/tasks/source_monitor_assets.rake +28 -0
  195. data/lib/tasks/source_monitor_tasks.rake +29 -0
  196. data/lib/tasks/test_smoke.rake +12 -0
  197. data/package-lock.json +3997 -0
  198. data/package.json +29 -0
  199. data/postcss.config.js +6 -0
  200. data/source_monitor.gemspec +46 -0
  201. data/stylelint.config.js +12 -0
  202. metadata +469 -0
@@ -0,0 +1,601 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "digest"
4
+ require "json"
5
+ require "cgi"
6
+ require "active_support/core_ext/object/blank"
7
+ require "active_support/core_ext/time"
8
+ require "source_monitor/instrumentation"
9
+ require "source_monitor/scrapers/readability"
10
+
11
module SourceMonitor
  module Items
    # Creates or updates one persisted item from one parsed feed entry.
    #
    # The creator reads attributes from +entry+ (every reader is probed with
    # +respond_to?+ before it is called, so any object exposing a subset of the
    # Feedjira-style readers works), looks for an existing item on +source+
    # first by GUID and then by content fingerprint, and either updates the
    # match or creates a new record through +source.items+.
    #
    # Usage:
    #
    #   result = SourceMonitor::Items::ItemCreator.call(source: source, entry: entry)
    #   result.created? # => true when a new record was inserted
    class ItemCreator
      # Outcome of a single #call.
      #
      # item       - the record that was created or updated
      # status     - :created or :updated
      # matched_by - :guid or :fingerprint when an existing item was matched,
      #              nil for a freshly created item
      Result = Struct.new(:item, :status, :matched_by, keyword_init: true) do
        def created?
          status == :created
        end

        def updated?
          status == :updated
        end
      end

      # Joins the fingerprint components; NUL keeps "ab" + "c" distinct from "a" + "bc".
      FINGERPRINT_SEPARATOR = "\u0000".freeze
      # Entry readers tried, in order, when looking for the item body.
      CONTENT_METHODS = %i[content content_encoded summary].freeze
      # Entry readers tried, in order, when looking for the publication time.
      TIMESTAMP_METHODS = %i[published updated].freeze
      # Keyword lists are split on runs of commas and/or semicolons.
      KEYWORD_SEPARATORS = /[,;]+/.freeze
      # Key under which the raw entry hash is stored inside the item metadata.
      METADATA_ROOT_KEY = "feedjira_entry".freeze
      # Author node readers tried, in order, to obtain a display value.
      AUTHOR_NODE_READERS = %i[name email uri].freeze

      # Convenience wrapper: builds a creator and runs it.
      #
      # @return [Result]
      def self.call(source:, entry:)
        new(source: source, entry: entry).call
      end

      # @param source [Object] owner of the items; must expose +all_items+,
      #   +items+ and +id+
      # @param entry [Object] parsed feed entry
      def initialize(source:, entry:)
        @source = source
        @entry = entry
      end

      # Persists the entry, updating an existing item when one matches.
      #
      # When the entry carries no GUID of its own, the content fingerprint is
      # stored as the GUID; in that case lookup goes straight to the
      # fingerprint match.
      #
      # @return [Result]
      # @raise whatever +save!+ raises on the underlying record
      def call
        attributes = build_attributes
        raw_guid = attributes[:guid]
        attributes[:guid] = raw_guid.presence || attributes[:content_fingerprint]

        existing_item, matched_by = existing_item_for(attributes, raw_guid_present: raw_guid.present?)

        if existing_item
          updated_item = update_existing_item(existing_item, attributes, matched_by)
          return Result.new(item: updated_item, status: :updated, matched_by: matched_by)
        end

        create_new_item(attributes, raw_guid_present: raw_guid.present?)
      end

      private

      attr_reader :source, :entry

      # Finds an already stored item for the given attributes.
      #
      # @return [Array] +[item, :guid]+, +[item, :fingerprint]+ or +[nil, nil]+
      def existing_item_for(attributes, raw_guid_present:)
        guid = attributes[:guid]
        fingerprint = attributes[:content_fingerprint]

        # Only trust the GUID when the feed supplied one; otherwise it is just
        # a copy of the fingerprint.
        if raw_guid_present
          existing = find_item_by_guid(guid)
          return [ existing, :guid ] if existing
        end

        if fingerprint.present?
          existing = find_item_by_fingerprint(fingerprint)
          return [ existing, :fingerprint ] if existing
        end

        [ nil, nil ]
      end

      # Case-insensitive GUID lookup among the source's items.
      def find_item_by_guid(guid)
        return if guid.blank?

        source.all_items.where("LOWER(guid) = ?", guid.downcase).first
      end

      # Exact fingerprint lookup among the source's items.
      def find_item_by_fingerprint(fingerprint)
        return if fingerprint.blank?

        source.all_items.find_by(content_fingerprint: fingerprint)
      end

      # Reports a duplicate match to the instrumentation layer.
      def instrument_duplicate(item, matched_by)
        return unless matched_by

        SourceMonitor::Instrumentation.item_duplicate(
          source_id: source.id,
          item_id: item.id,
          guid: item.guid,
          content_fingerprint: item.content_fingerprint,
          matched_by: matched_by
        )
      end

      # Applies fresh attributes to a matched item, saves it and instruments
      # the duplicate.
      #
      # @return the saved item
      def update_existing_item(existing_item, attributes, matched_by)
        apply_attributes(existing_item, attributes)
        existing_item.save!
        instrument_duplicate(existing_item, matched_by)
        existing_item
      end

      # Inserts a new item. A uniqueness violation is treated as a concurrent
      # insert of the same entry and is converted into an update.
      #
      # @return [Result]
      def create_new_item(attributes, raw_guid_present:)
        new_item = source.items.new
        apply_attributes(new_item, attributes)
        new_item.save!
        Result.new(item: new_item, status: :created)
      rescue ActiveRecord::RecordNotUnique
        handle_concurrent_duplicate(attributes, raw_guid_present: raw_guid_present)
      end

      # Recovers from a lost insert race by updating the row that won.
      #
      # NOTE(review): assumes the violated unique index is the GUID one whenever
      # the feed supplied a GUID - confirm against the database indexes.
      def handle_concurrent_duplicate(attributes, raw_guid_present:)
        matched_by = raw_guid_present ? :guid : :fingerprint
        existing = find_conflicting_item(attributes, matched_by)
        updated = update_existing_item(existing, attributes, matched_by)
        Result.new(item: updated, status: :updated, matched_by: matched_by)
      end

      # Locates the row that caused the uniqueness violation; the +find_by!+
      # fallback raises when it cannot be found.
      def find_conflicting_item(attributes, matched_by)
        case matched_by
        when :guid
          find_item_by_guid(attributes[:guid]) || source.all_items.find_by!(guid: attributes[:guid])
        else
          fingerprint = attributes[:content_fingerprint]
          find_item_by_fingerprint(fingerprint) || source.all_items.find_by!(content_fingerprint: fingerprint)
        end
      end

      # Assigns attributes to +record+; +metadata+ goes through its own writer
      # and is only written when present. The caller's hash is not mutated.
      def apply_attributes(record, attributes)
        attributes = attributes.dup
        metadata = attributes.delete(:metadata)
        record.assign_attributes(attributes)
        record.metadata = metadata if metadata
      end

      # Optionally runs the feed-supplied HTML through the readability parser.
      #
      # @return [Array] +[content, processing_metadata_or_nil]+
      def process_feed_content(raw_content, title:)
        return [ raw_content, nil ] unless should_process_feed_content?(raw_content)

        parser = feed_content_parser_class.new
        html = wrap_content_for_readability(raw_content, title: title)
        result = parser.parse(html: html, readability: default_feed_readability_options)

        processed_content = result.content.presence || raw_content
        metadata = build_feed_content_metadata(result: result, raw_content: raw_content, processed_content: processed_content)

        [ processed_content, metadata.presence ]
      rescue StandardError => error
        # Deliberate best effort: a parser failure must not lose the item, so
        # the raw content is kept and the failure is recorded in the metadata.
        metadata = {
          "status" => "failed",
          "strategy" => "readability",
          "applied" => false,
          "changed" => false,
          "error_class" => error.class.name,
          "error_message" => error.message
        }
        [ raw_content, metadata ]
      end

      # True when the source opted in and the content looks like HTML.
      def should_process_feed_content?(raw_content)
        source.respond_to?(:feed_content_readability_enabled?) &&
          source.feed_content_readability_enabled? &&
          raw_content.present? &&
          html_fragment?(raw_content)
      end

      def feed_content_parser_class
        SourceMonitor::Scrapers::Parsers::ReadabilityParser
      end

      # Wraps an HTML fragment in a minimal document for the parser. The title
      # is escaped; the content is inserted verbatim because it already is HTML.
      def wrap_content_for_readability(content, title:)
        safe_title = title.present? ? CGI.escapeHTML(title) : "Feed Entry"
        <<~HTML
          <!DOCTYPE html>
          <html>
          <head>
          <meta charset="utf-8">
          <title>#{safe_title}</title>
          </head>
          <body>
          #{content}
          </body>
          </html>
        HTML
      end

      # Independent copy of the scraper's default readability settings, or an
      # empty hash when none are configured.
      def default_feed_readability_options
        default = SourceMonitor::Scrapers::Readability.default_settings[:readability]
        return {} unless default

        deep_copy(default)
      end

      # Summarizes a parser result for storage in the item metadata; nil values
      # are dropped.
      def build_feed_content_metadata(result:, raw_content:, processed_content:)
        metadata = {
          "strategy" => result.strategy&.to_s,
          "status" => result.status&.to_s,
          "applied" => result.content.present?,
          "changed" => processed_content != raw_content
        }

        if result.metadata && result.metadata[:readability_text_length]
          metadata["readability_text_length"] = result.metadata[:readability_text_length]
        end

        metadata["title"] = result.title if result.title.present?
        metadata.compact
      end

      # Cheap heuristic: does the value contain something shaped like an
      # opening tag ("<" followed by a word character)?
      def html_fragment?(value)
        value.to_s.match?(/<\s*\w+/)
      end

      # Recursively duplicates hashes and arrays. Uses +deep_dup+ when the
      # value provides it; values whose +dup+ raises TypeError are returned
      # unchanged.
      def deep_copy(value)
        return value.deep_dup if value.respond_to?(:deep_dup)

        case value
        when Hash
          value.each_with_object(value.class.new) do |(key, nested), copy|
            copy[key] = deep_copy(nested)
          end
        when Array
          value.map { |element| deep_copy(element) }
        else
          value.dup
        end
      rescue TypeError
        value
      end

      # Collects every attribute extracted from the entry into one hash.
      # Keys whose value is nil are removed, so they are never assigned.
      def build_attributes
        url = extract_url
        title = string_or_nil(entry.title) if entry.respond_to?(:title)
        raw_content = extract_content
        content, content_processing_metadata = process_feed_content(raw_content, title: title)
        fingerprint = generate_fingerprint(title, url, content)
        published_at = extract_timestamp
        updated_at_source = extract_updated_timestamp

        metadata = extract_metadata
        if content_processing_metadata.present?
          metadata = metadata.merge("feed_content_processing" => content_processing_metadata)
        end

        {
          guid: extract_guid,
          title: title,
          url: url,
          canonical_url: url,
          author: extract_author,
          authors: extract_authors,
          summary: extract_summary,
          content: content,
          published_at: published_at,
          updated_at_source: updated_at_source,
          categories: extract_categories,
          tags: extract_tags,
          keywords: extract_keywords,
          enclosures: extract_enclosures,
          media_thumbnail_url: extract_media_thumbnail_url,
          media_content: extract_media_content,
          language: extract_language,
          copyright: extract_copyright,
          comments_url: extract_comments_url,
          comments_count: extract_comments_count,
          metadata: metadata,
          content_fingerprint: fingerprint
        }.compact
      end

      # GUID supplied by the feed: +entry_id+ when present, otherwise +id+
      # unless that is merely a copy of the entry URL.
      def extract_guid
        entry_guid = entry.respond_to?(:entry_id) ? string_or_nil(entry.entry_id) : nil
        return entry_guid if entry_guid.present?

        return unless entry.respond_to?(:id)

        entry_id = string_or_nil(entry.id)
        return if entry_id.blank?

        url = extract_url
        return entry_id if url.blank? || entry_id != url

        nil
      end

      # Entry URL: +url+, then the first link node whose rel is absent or
      # "alternate" (falling back to the first node), then the first non-blank
      # value of +links+.
      def extract_url
        if entry.respond_to?(:url)
          primary_url = string_or_nil(entry.url)
          return primary_url if primary_url.present?
        end

        if entry.respond_to?(:link_nodes)
          nodes = Array(entry.link_nodes)
          alternate = nodes.find do |node|
            rel = string_or_nil(node&.rel)&.downcase
            rel.nil? || rel == "alternate"
          end
          alternate ||= nodes.first
          href = string_or_nil(alternate&.href)
          return href if href.present?
        end

        if entry.respond_to?(:links)
          href = Array(entry.links).map { |link| string_or_nil(link) }.find(&:present?)
          return href if href.present?
        end

        nil
      end

      def extract_summary
        return unless entry.respond_to?(:summary)

        string_or_nil(entry.summary)
      end

      # First non-blank value among CONTENT_METHODS.
      def extract_content
        CONTENT_METHODS.each do |method|
          next unless entry.respond_to?(method)

          value = string_or_nil(entry.public_send(method))
          return value if value.present?
        end
        nil
      end

      # First present value among TIMESTAMP_METHODS.
      def extract_timestamp
        TIMESTAMP_METHODS.each do |method|
          next unless entry.respond_to?(method)

          value = entry.public_send(method)
          return value if value.present?
        end
        nil
      end

      def extract_updated_timestamp
        return unless entry.respond_to?(:updated)

        entry.updated.presence
      end

      def extract_author
        string_or_nil(entry.author) if entry.respond_to?(:author)
      end

      # Every distinct author found on the entry, in discovery order:
      # rss authors, dc creators, author nodes, JSON Feed authors, then the
      # primary author.
      def extract_authors
        values = []

        if entry.respond_to?(:rss_authors)
          values.concat(Array(entry.rss_authors).map { |value| string_or_nil(value) })
        end

        if entry.respond_to?(:dc_creators)
          values.concat(Array(entry.dc_creators).map { |value| string_or_nil(value) })
        elsif entry.respond_to?(:dc_creator)
          values << string_or_nil(entry.dc_creator)
        end

        if entry.respond_to?(:author_nodes)
          values.concat(Array(entry.author_nodes).map { |node| author_node_label(node) })
        end

        values.concat(json_authors)

        primary_author = extract_author
        values << primary_author if primary_author.present?

        values.compact.uniq
      end

      # Display value for one author node: its name, else email, else uri.
      # Each reader is probed individually so a node exposing only some of
      # them no longer raises NoMethodError.
      def author_node_label(node)
        AUTHOR_NODE_READERS.each do |reader|
          next unless node.respond_to?(reader)

          value = string_or_nil(node.public_send(reader))
          return value if value
        end
        nil
      end

      # Author names from a JSON Feed payload ("authors" list first, then the
      # singular "author"). Malformed elements (anything that is not a Hash)
      # are skipped instead of raising.
      def json_authors
        payload = json_payload
        return [] unless payload

        authors = payload["authors"]
        authors = [ authors ] if authors.is_a?(Hash) # Array(hash) would split it into pairs
        names = Array(authors).map { |author| string_or_nil(author["name"]) if author.is_a?(Hash) }

        single = payload["author"]
        names << string_or_nil(single["name"]) if single.is_a?(Hash)
        names
      end

      # Categories and tags combined, stripped and de-duplicated.
      def extract_categories
        list = []
        list.concat(Array(entry.categories)) if entry.respond_to?(:categories)
        list.concat(Array(entry.tags)) if entry.respond_to?(:tags)
        list.concat(json_tags)
        sanitize_string_array(list)
      end

      # Tags from the entry and the JSON payload; falls back to the categories
      # when no tag was found.
      def extract_tags
        tags = []
        tags.concat(Array(entry.tags)) if entry.respond_to?(:tags)
        tags.concat(json_tags)

        tags = extract_categories if tags.empty? && entry.respond_to?(:categories)

        sanitize_string_array(tags)
      end

      # Raw "tags" list of a JSON Feed payload, or an empty array.
      def json_tags
        payload = json_payload
        payload ? Array(payload["tags"]) : []
      end

      # Media and iTunes keywords, split on KEYWORD_SEPARATORS.
      def extract_keywords
        keywords = []
        keywords.concat(split_keywords(entry.media_keywords_raw)) if entry.respond_to?(:media_keywords_raw)
        keywords.concat(split_keywords(entry.itunes_keywords_raw)) if entry.respond_to?(:itunes_keywords_raw)
        sanitize_string_array(keywords)
      end

      # Distinct enclosure descriptors from RSS enclosures, Atom enclosure
      # links and JSON Feed attachments, in that order.
      def extract_enclosures
        (rss_enclosures + atom_link_enclosures + json_attachment_enclosures).uniq
      end

      # Enclosures declared through +enclosure_nodes+.
      def rss_enclosures
        return [] unless entry.respond_to?(:enclosure_nodes)

        Array(entry.enclosure_nodes).filter_map do |node|
          url = string_or_nil(node&.url)
          next if url.blank?

          {
            "url" => url,
            "type" => string_or_nil(node&.type),
            "length" => safe_integer(node&.length),
            "source" => "rss_enclosure"
          }.compact
        end
      end

      # Enclosures declared as rel="enclosure" link nodes on an Atom entry.
      def atom_link_enclosures
        return [] unless atom_entry? && entry.respond_to?(:link_nodes)

        Array(entry.link_nodes).filter_map do |link|
          next unless string_or_nil(link&.rel)&.downcase == "enclosure"

          url = string_or_nil(link&.href)
          next if url.blank?

          {
            "url" => url,
            "type" => string_or_nil(link&.type),
            "length" => safe_integer(link&.length),
            "source" => "atom_link"
          }.compact
        end
      end

      # Enclosures declared as JSON Feed attachments; non-Hash elements are
      # skipped.
      def json_attachment_enclosures
        payload = json_payload
        return [] unless payload

        Array(payload["attachments"]).filter_map do |attachment|
          next unless attachment.is_a?(Hash)

          url = string_or_nil(attachment["url"])
          next if url.blank?

          {
            "url" => url,
            "type" => string_or_nil(attachment["mime_type"]),
            "length" => safe_integer(attachment["size_in_bytes"]),
            "duration" => safe_integer(attachment["duration_in_seconds"]),
            "title" => string_or_nil(attachment["title"]),
            "source" => "json_feed_attachment"
          }.compact
        end
      end

      # URL of the first media thumbnail that has one, else the entry image.
      def extract_media_thumbnail_url
        if entry.respond_to?(:media_thumbnail_nodes)
          thumbnail = Array(entry.media_thumbnail_nodes).find { |node| string_or_nil(node&.url).present? }
          return string_or_nil(thumbnail&.url) if thumbnail
        end

        string_or_nil(entry.image) if entry.respond_to?(:image)
      end

      # Distinct media content descriptors; nodes without a URL are skipped.
      def extract_media_content
        return [] unless entry.respond_to?(:media_content_nodes)

        contents = Array(entry.media_content_nodes).filter_map do |node|
          url = string_or_nil(node&.url)
          next if url.blank?

          {
            "url" => url,
            "type" => string_or_nil(node&.type),
            "medium" => string_or_nil(node&.medium),
            "height" => safe_integer(node&.height),
            "width" => safe_integer(node&.width),
            "file_size" => safe_integer(node&.file_size),
            "duration" => safe_integer(node&.duration),
            "expression" => string_or_nil(node&.expression)
          }.compact
        end

        contents.uniq
      end

      def extract_language
        entry_or_json_value(:language, "language")
      end

      def extract_copyright
        entry_or_json_value(:copyright, "copyright")
      end

      # Reads +reader+ from the entry and, when that yields nothing, +json_key+
      # from the JSON Feed payload. A blank reader value no longer hides the
      # JSON value.
      def entry_or_json_value(reader, json_key)
        if entry.respond_to?(reader)
          value = string_or_nil(entry.public_send(reader))
          return value if value.present?
        end

        payload = json_payload
        string_or_nil(payload[json_key]) if payload
      end

      def extract_comments_url
        string_or_nil(entry.comments) if entry.respond_to?(:comments)
      end

      # Comment count from +slash_comments_raw+, else +comments_count+,
      # coerced to an Integer (nil when not a base-10 integer).
      def extract_comments_count
        raw = entry.slash_comments_raw if entry.respond_to?(:slash_comments_raw)
        raw ||= entry.comments_count if entry.respond_to?(:comments_count)
        safe_integer(raw)
      end

      # The entry's +to_h+ in JSON-compatible form, nested under
      # METADATA_ROOT_KEY; an empty hash when nothing usable is available.
      def extract_metadata
        return {} unless entry.respond_to?(:to_h)

        normalized = normalize_metadata(entry.to_h)
        return {} if normalized.blank?

        { METADATA_ROOT_KEY => normalized }
      end

      # SHA-256 hex digest over title, URL and content (nil counts as "").
      def generate_fingerprint(title, url, content)
        Digest::SHA256.hexdigest(
          [
            title.to_s,
            url.to_s,
            content.to_s
          ].join(FINGERPRINT_SEPARATOR)
        )
      end

      # Strips a String and turns blank results into nil; any other value is
      # returned untouched.
      def string_or_nil(value)
        return value unless value.is_a?(String)

        value.strip.presence
      end

      # Stripped, non-nil, de-duplicated copy of +values+.
      def sanitize_string_array(values)
        Array(values).map { |value| string_or_nil(value) }.compact.uniq
      end

      # Splits a raw keyword string into trimmed, non-blank keywords.
      def split_keywords(value)
        return [] if value.nil?

        string = string_or_nil(value)
        return [] if string.blank?

        string.split(KEYWORD_SEPARATORS).map { |keyword| keyword.strip.presence }.compact
      end

      # Base-10 Integer for +value+, or nil when it is nil, blank or not an
      # integer literal. Integers are returned as they are.
      def safe_integer(value)
        return if value.nil?
        return value if value.is_a?(Integer)

        Integer(value.to_s.strip, 10, exception: false)
      end

      # The JSON Feed payload of the entry, or nil when the entry is not a
      # JSON Feed item or carries no Hash payload.
      def json_payload
        return unless json_entry? && entry.respond_to?(:json)

        payload = entry.json
        payload if payload.is_a?(Hash)
      end

      def json_entry?
        defined?(Feedjira::Parser::JSONFeedItem) && entry.is_a?(Feedjira::Parser::JSONFeedItem)
      end

      def atom_entry?
        defined?(Feedjira::Parser::AtomEntry) && entry.is_a?(Feedjira::Parser::AtomEntry)
      end

      # Round-trips +value+ through JSON so only JSON-representable data with
      # string keys remains; an empty hash when it cannot be serialized.
      def normalize_metadata(value)
        JSON.parse(JSON.generate(value))
      rescue JSON::GeneratorError, JSON::ParserError, TypeError
        {}
      end
    end
  end
end