source_monitor 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.claude/agents/rails-concern.md +464 -0
- data/.claude/agents/rails-controller.md +424 -0
- data/.claude/agents/rails-hotwire.md +446 -0
- data/.claude/agents/rails-implement.md +374 -0
- data/.claude/agents/rails-job.md +334 -0
- data/.claude/agents/rails-lint.md +294 -0
- data/.claude/agents/rails-mailer.md +371 -0
- data/.claude/agents/rails-migration.md +449 -0
- data/.claude/agents/rails-model.md +420 -0
- data/.claude/agents/rails-policy.md +443 -0
- data/.claude/agents/rails-presenter.md +427 -0
- data/.claude/agents/rails-query.md +412 -0
- data/.claude/agents/rails-review.md +490 -0
- data/.claude/agents/rails-service.md +458 -0
- data/.claude/agents/rails-state-records.md +465 -0
- data/.claude/agents/rails-tdd.md +314 -0
- data/.claude/agents/rails-test.md +441 -0
- data/.claude/agents/rails-view-component.md +418 -0
- data/.claude/hooks/block-secrets.sh +52 -0
- data/.claude/settings.json +85 -0
- data/.claude/skills/action-cable-patterns/SKILL.md +296 -0
- data/.claude/skills/action-mailer-patterns/SKILL.md +295 -0
- data/.claude/skills/active-storage-setup/SKILL.md +311 -0
- data/.claude/skills/api-versioning/SKILL.md +294 -0
- data/.claude/skills/authentication-flow/SKILL.md +335 -0
- data/.claude/skills/authentication-flow/reference/current.md +248 -0
- data/.claude/skills/authentication-flow/reference/passwordless.md +253 -0
- data/.claude/skills/authentication-flow/reference/sessions.md +201 -0
- data/.claude/skills/authorization-pundit/SKILL.md +462 -0
- data/.claude/skills/caching-strategies/SKILL.md +350 -0
- data/.claude/skills/database-migrations/SKILL.md +354 -0
- data/.claude/skills/form-object-patterns/SKILL.md +399 -0
- data/.claude/skills/hotwire-patterns/SKILL.md +247 -0
- data/.claude/skills/hotwire-patterns/reference/stimulus.md +307 -0
- data/.claude/skills/hotwire-patterns/reference/tailwind-integration.md +112 -0
- data/.claude/skills/hotwire-patterns/reference/turbo-frames.md +158 -0
- data/.claude/skills/hotwire-patterns/reference/turbo-streams.md +218 -0
- data/.claude/skills/i18n-patterns/SKILL.md +320 -0
- data/.claude/skills/install/SKILL.md +367 -0
- data/.claude/skills/performance-optimization/SKILL.md +311 -0
- data/.claude/skills/rails-architecture/SKILL.md +259 -0
- data/.claude/skills/rails-architecture/reference/error-handling.md +333 -0
- data/.claude/skills/rails-architecture/reference/event-tracking.md +142 -0
- data/.claude/skills/rails-architecture/reference/layer-interactions.md +417 -0
- data/.claude/skills/rails-architecture/reference/multi-tenancy.md +152 -0
- data/.claude/skills/rails-architecture/reference/query-patterns.md +342 -0
- data/.claude/skills/rails-architecture/reference/service-patterns.md +286 -0
- data/.claude/skills/rails-architecture/reference/state-records.md +250 -0
- data/.claude/skills/rails-architecture/reference/testing-strategy.md +326 -0
- data/.claude/skills/rails-concern/SKILL.md +399 -0
- data/.claude/skills/rails-controller/SKILL.md +336 -0
- data/.claude/skills/rails-model-generator/SKILL.md +321 -0
- data/.claude/skills/rails-model-generator/reference/validations.md +298 -0
- data/.claude/skills/rails-presenter/SKILL.md +274 -0
- data/.claude/skills/rails-query-object/SKILL.md +289 -0
- data/.claude/skills/rails-service-object/SKILL.md +349 -0
- data/.claude/skills/solid-queue-setup/SKILL.md +307 -0
- data/.claude/skills/tdd-cycle/SKILL.md +359 -0
- data/.claude/skills/viewcomponent-patterns/SKILL.md +333 -0
- data/.gitignore +1 -0
- data/.rubocop.yml +2 -0
- data/.ruby-version +1 -1
- data/.vbw-planning/.notification-log.jsonl +192 -0
- data/.vbw-planning/.session-log.jsonl +871 -0
- data/.vbw-planning/PROJECT.md +51 -0
- data/.vbw-planning/REQUIREMENTS.md +50 -0
- data/.vbw-planning/SHIPPED.md +28 -0
- data/.vbw-planning/codebase/ARCHITECTURE.md +147 -0
- data/.vbw-planning/codebase/CONCERNS.md +99 -0
- data/.vbw-planning/codebase/CONVENTIONS.md +97 -0
- data/.vbw-planning/codebase/DEPENDENCIES.md +100 -0
- data/.vbw-planning/codebase/INDEX.md +86 -0
- data/.vbw-planning/codebase/META.md +42 -0
- data/.vbw-planning/codebase/PATTERNS.md +262 -0
- data/.vbw-planning/codebase/STACK.md +101 -0
- data/.vbw-planning/codebase/STRUCTURE.md +324 -0
- data/.vbw-planning/codebase/TESTING.md +154 -0
- data/.vbw-planning/config.json +12 -0
- data/.vbw-planning/discovery.json +24 -0
- data/.vbw-planning/milestones/default/ROADMAP.md +115 -0
- data/.vbw-planning/milestones/default/STATE.md +83 -0
- data/.vbw-planning/milestones/default/phases/01-coverage-analysis-quick-wins/PLAN-01-SUMMARY.md +56 -0
- data/.vbw-planning/milestones/default/phases/01-coverage-analysis-quick-wins/PLAN-01.md +187 -0
- data/.vbw-planning/milestones/default/phases/01-coverage-analysis-quick-wins/PLAN-02-SUMMARY.md +64 -0
- data/.vbw-planning/milestones/default/phases/01-coverage-analysis-quick-wins/PLAN-02.md +137 -0
- data/.vbw-planning/milestones/default/phases/02-critical-path-test-coverage/PLAN-01-SUMMARY.md +67 -0
- data/.vbw-planning/milestones/default/phases/02-critical-path-test-coverage/PLAN-01.md +142 -0
- data/.vbw-planning/milestones/default/phases/02-critical-path-test-coverage/PLAN-02-SUMMARY.md +64 -0
- data/.vbw-planning/milestones/default/phases/02-critical-path-test-coverage/PLAN-02.md +138 -0
- data/.vbw-planning/milestones/default/phases/02-critical-path-test-coverage/PLAN-03-SUMMARY.md +85 -0
- data/.vbw-planning/milestones/default/phases/02-critical-path-test-coverage/PLAN-03.md +147 -0
- data/.vbw-planning/milestones/default/phases/02-critical-path-test-coverage/PLAN-04-SUMMARY.md +63 -0
- data/.vbw-planning/milestones/default/phases/02-critical-path-test-coverage/PLAN-04.md +129 -0
- data/.vbw-planning/milestones/default/phases/02-critical-path-test-coverage/PLAN-05-SUMMARY.md +74 -0
- data/.vbw-planning/milestones/default/phases/02-critical-path-test-coverage/PLAN-05.md +154 -0
- data/.vbw-planning/milestones/default/phases/03-large-file-refactoring/03-VERIFICATION-wave1.md +303 -0
- data/.vbw-planning/milestones/default/phases/03-large-file-refactoring/03-VERIFICATION.md +510 -0
- data/.vbw-planning/milestones/default/phases/03-large-file-refactoring/PLAN-01-SUMMARY.md +61 -0
- data/.vbw-planning/milestones/default/phases/03-large-file-refactoring/PLAN-01.md +161 -0
- data/.vbw-planning/milestones/default/phases/03-large-file-refactoring/PLAN-02-SUMMARY.md +66 -0
- data/.vbw-planning/milestones/default/phases/03-large-file-refactoring/PLAN-02.md +132 -0
- data/.vbw-planning/milestones/default/phases/03-large-file-refactoring/PLAN-03-SUMMARY.md +59 -0
- data/.vbw-planning/milestones/default/phases/03-large-file-refactoring/PLAN-03.md +171 -0
- data/.vbw-planning/milestones/default/phases/03-large-file-refactoring/PLAN-04-SUMMARY.md +56 -0
- data/.vbw-planning/milestones/default/phases/03-large-file-refactoring/PLAN-04.md +152 -0
- data/.vbw-planning/milestones/default/phases/04-code-quality-conventions-cleanup/04-CONTEXT.md +33 -0
- data/.vbw-planning/milestones/default/phases/04-code-quality-conventions-cleanup/PLAN-01-SUMMARY.md +42 -0
- data/.vbw-planning/milestones/default/phases/04-code-quality-conventions-cleanup/PLAN-01.md +119 -0
- data/.vbw-planning/milestones/default/phases/04-code-quality-conventions-cleanup/PLAN-02-SUMMARY.md +52 -0
- data/.vbw-planning/milestones/default/phases/04-code-quality-conventions-cleanup/PLAN-02.md +195 -0
- data/.vbw-planning/milestones/default/phases/04-code-quality-conventions-cleanup/PLAN-03-SUMMARY.md +79 -0
- data/.vbw-planning/milestones/default/phases/04-code-quality-conventions-cleanup/PLAN-03.md +130 -0
- data/CHANGELOG.md +28 -0
- data/CLAUDE.md +179 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +114 -101
- data/Rakefile +2 -0
- data/app/assets/builds/source_monitor/application.css +2076 -0
- data/app/assets/builds/source_monitor/application.js +2758 -0
- data/app/assets/builds/source_monitor/application.js.map +7 -0
- data/app/controllers/source_monitor/application_controller.rb +2 -0
- data/app/controllers/source_monitor/health_controller.rb +2 -0
- data/app/controllers/source_monitor/import_sessions/bulk_configuration.rb +106 -0
- data/app/controllers/source_monitor/import_sessions/entry_annotation.rb +187 -0
- data/app/controllers/source_monitor/import_sessions/health_check_management.rb +112 -0
- data/app/controllers/source_monitor/import_sessions/opml_parser.rb +130 -0
- data/app/controllers/source_monitor/import_sessions_controller.rb +6 -507
- data/app/controllers/source_monitor/items_controller.rb +2 -0
- data/app/controllers/source_monitor/sources_controller.rb +0 -14
- data/app/helpers/source_monitor/application_helper.rb +4 -112
- data/app/helpers/source_monitor/health_badge_helper.rb +69 -0
- data/app/helpers/source_monitor/table_sort_helper.rb +53 -0
- data/app/jobs/source_monitor/application_job.rb +2 -0
- data/app/models/source_monitor/application_record.rb +2 -0
- data/app/models/source_monitor/log_entry.rb +0 -2
- data/config/coverage_baseline.json +217 -1862
- data/config/routes.rb +2 -0
- data/db/migrate/20251009103000_add_feed_content_readability_to_sources.rb +2 -0
- data/db/migrate/20251014171659_add_performance_indexes.rb +2 -0
- data/db/migrate/20251014172525_add_fetch_status_check_constraint.rb +2 -0
- data/db/migrate/20251108120116_refresh_fetch_status_constraint.rb +2 -0
- data/db/migrate/20260210204022_add_composite_index_to_log_entries.rb +17 -0
- data/lib/source_monitor/assets/bundler.rb +2 -0
- data/lib/source_monitor/assets.rb +2 -0
- data/lib/source_monitor/configuration/authentication_settings.rb +62 -0
- data/lib/source_monitor/configuration/events.rb +60 -0
- data/lib/source_monitor/configuration/fetching_settings.rb +27 -0
- data/lib/source_monitor/configuration/health_settings.rb +27 -0
- data/lib/source_monitor/configuration/http_settings.rb +43 -0
- data/lib/source_monitor/configuration/model_definition.rb +108 -0
- data/lib/source_monitor/configuration/models.rb +36 -0
- data/lib/source_monitor/configuration/realtime_settings.rb +95 -0
- data/lib/source_monitor/configuration/retention_settings.rb +45 -0
- data/lib/source_monitor/configuration/scraper_registry.rb +67 -0
- data/lib/source_monitor/configuration/scraping_settings.rb +39 -0
- data/lib/source_monitor/configuration/validation_definition.rb +32 -0
- data/lib/source_monitor/configuration.rb +12 -579
- data/lib/source_monitor/dashboard/queries/recent_activity_query.rb +138 -0
- data/lib/source_monitor/dashboard/queries/stats_query.rb +71 -0
- data/lib/source_monitor/dashboard/queries.rb +2 -195
- data/lib/source_monitor/engine.rb +2 -0
- data/lib/source_monitor/fetching/feed_fetcher/adaptive_interval.rb +141 -0
- data/lib/source_monitor/fetching/feed_fetcher/entry_processor.rb +89 -0
- data/lib/source_monitor/fetching/feed_fetcher/source_updater.rb +200 -0
- data/lib/source_monitor/fetching/feed_fetcher.rb +37 -379
- data/lib/source_monitor/items/item_creator/content_extractor.rb +113 -0
- data/lib/source_monitor/items/item_creator/entry_parser/media_extraction.rb +96 -0
- data/lib/source_monitor/items/item_creator/entry_parser.rb +294 -0
- data/lib/source_monitor/items/item_creator.rb +28 -455
- data/lib/source_monitor/setup/bundle_installer.rb +2 -0
- data/lib/source_monitor/setup/cli.rb +2 -0
- data/lib/source_monitor/setup/dependency_checker.rb +2 -0
- data/lib/source_monitor/setup/detectors.rb +2 -0
- data/lib/source_monitor/setup/gemfile_editor.rb +2 -0
- data/lib/source_monitor/setup/initializer_patcher.rb +2 -0
- data/lib/source_monitor/setup/install_generator.rb +2 -0
- data/lib/source_monitor/setup/migration_installer.rb +2 -0
- data/lib/source_monitor/setup/node_installer.rb +2 -0
- data/lib/source_monitor/setup/prompter.rb +2 -0
- data/lib/source_monitor/setup/requirements.rb +2 -0
- data/lib/source_monitor/setup/shell_runner.rb +2 -0
- data/lib/source_monitor/setup/verification/action_cable_verifier.rb +2 -0
- data/lib/source_monitor/setup/verification/printer.rb +2 -0
- data/lib/source_monitor/setup/verification/result.rb +2 -0
- data/lib/source_monitor/setup/verification/runner.rb +2 -0
- data/lib/source_monitor/setup/verification/solid_queue_verifier.rb +2 -0
- data/lib/source_monitor/setup/verification/telemetry_logger.rb +2 -0
- data/lib/source_monitor/setup/workflow.rb +2 -0
- data/lib/source_monitor/version.rb +3 -1
- data/lib/source_monitor.rb +140 -58
- data/lib/tasks/source_monitor_assets.rake +2 -0
- data/lib/tasks/source_monitor_setup.rake +2 -0
- data/lib/tasks/source_monitor_tasks.rake +2 -0
- data/source_monitor.gemspec +3 -1
- metadata +144 -4
|
@@ -7,6 +7,8 @@ require "active_support/core_ext/object/blank"
|
|
|
7
7
|
require "active_support/core_ext/time"
|
|
8
8
|
require "source_monitor/instrumentation"
|
|
9
9
|
require "source_monitor/scrapers/readability"
|
|
10
|
+
require "source_monitor/items/item_creator/entry_parser"
|
|
11
|
+
require "source_monitor/items/item_creator/content_extractor"
|
|
10
12
|
|
|
11
13
|
module SourceMonitor
|
|
12
14
|
module Items
|
|
@@ -20,11 +22,13 @@ module SourceMonitor
|
|
|
20
22
|
status == :updated
|
|
21
23
|
end
|
|
22
24
|
end
|
|
25
|
+
|
|
23
26
|
FINGERPRINT_SEPARATOR = "\u0000".freeze
|
|
24
27
|
CONTENT_METHODS = %i[content content_encoded summary].freeze
|
|
25
28
|
TIMESTAMP_METHODS = %i[published updated].freeze
|
|
26
29
|
KEYWORD_SEPARATORS = /[,;]+/.freeze
|
|
27
30
|
METADATA_ROOT_KEY = "feedjira_entry".freeze
|
|
31
|
+
|
|
28
32
|
def self.call(source:, entry:)
|
|
29
33
|
new(source:, entry:).call
|
|
30
34
|
end
|
|
@@ -134,468 +138,37 @@ module SourceMonitor
|
|
|
134
138
|
record.metadata = metadata if metadata
|
|
135
139
|
end
|
|
136
140
|
|
|
137
|
-
def process_feed_content(raw_content, title:)
|
|
138
|
-
return [ raw_content, nil ] unless should_process_feed_content?(raw_content)
|
|
139
|
-
|
|
140
|
-
parser = feed_content_parser_class.new
|
|
141
|
-
html = wrap_content_for_readability(raw_content, title: title)
|
|
142
|
-
result = parser.parse(html: html, readability: default_feed_readability_options)
|
|
143
|
-
|
|
144
|
-
processed_content = result.content.presence || raw_content
|
|
145
|
-
metadata = build_feed_content_metadata(result: result, raw_content: raw_content, processed_content: processed_content)
|
|
146
|
-
|
|
147
|
-
[ processed_content, metadata.presence ]
|
|
148
|
-
rescue StandardError => error
|
|
149
|
-
metadata = {
|
|
150
|
-
"status" => "failed",
|
|
151
|
-
"strategy" => "readability",
|
|
152
|
-
"applied" => false,
|
|
153
|
-
"changed" => false,
|
|
154
|
-
"error_class" => error.class.name,
|
|
155
|
-
"error_message" => error.message
|
|
156
|
-
}
|
|
157
|
-
[ raw_content, metadata ]
|
|
158
|
-
end
|
|
159
|
-
|
|
160
|
-
def should_process_feed_content?(raw_content)
|
|
161
|
-
source.respond_to?(:feed_content_readability_enabled?) &&
|
|
162
|
-
source.feed_content_readability_enabled? &&
|
|
163
|
-
raw_content.present? &&
|
|
164
|
-
html_fragment?(raw_content)
|
|
165
|
-
end
|
|
166
|
-
|
|
167
|
-
def feed_content_parser_class
|
|
168
|
-
SourceMonitor::Scrapers::Parsers::ReadabilityParser
|
|
169
|
-
end
|
|
170
|
-
|
|
171
|
-
def wrap_content_for_readability(content, title:)
|
|
172
|
-
safe_title = title.present? ? CGI.escapeHTML(title) : "Feed Entry"
|
|
173
|
-
<<~HTML
|
|
174
|
-
<!DOCTYPE html>
|
|
175
|
-
<html>
|
|
176
|
-
<head>
|
|
177
|
-
<meta charset="utf-8">
|
|
178
|
-
<title>#{safe_title}</title>
|
|
179
|
-
</head>
|
|
180
|
-
<body>
|
|
181
|
-
#{content}
|
|
182
|
-
</body>
|
|
183
|
-
</html>
|
|
184
|
-
HTML
|
|
185
|
-
end
|
|
186
|
-
|
|
187
|
-
def default_feed_readability_options
|
|
188
|
-
default = SourceMonitor::Scrapers::Readability.default_settings[:readability]
|
|
189
|
-
return {} unless default
|
|
190
|
-
|
|
191
|
-
deep_copy(default)
|
|
192
|
-
end
|
|
193
|
-
|
|
194
|
-
def build_feed_content_metadata(result:, raw_content:, processed_content:)
|
|
195
|
-
metadata = {
|
|
196
|
-
"strategy" => result.strategy&.to_s,
|
|
197
|
-
"status" => result.status&.to_s,
|
|
198
|
-
"applied" => result.content.present?,
|
|
199
|
-
"changed" => processed_content != raw_content
|
|
200
|
-
}
|
|
201
|
-
|
|
202
|
-
if result.metadata && result.metadata[:readability_text_length]
|
|
203
|
-
metadata["readability_text_length"] = result.metadata[:readability_text_length]
|
|
204
|
-
end
|
|
205
|
-
|
|
206
|
-
metadata["title"] = result.title if result.title.present?
|
|
207
|
-
metadata.compact
|
|
208
|
-
end
|
|
209
|
-
|
|
210
|
-
def html_fragment?(value)
|
|
211
|
-
value.to_s.match?(/<\s*\w+/)
|
|
212
|
-
end
|
|
213
|
-
|
|
214
|
-
def deep_copy(value)
|
|
215
|
-
if value.respond_to?(:deep_dup)
|
|
216
|
-
return value.deep_dup
|
|
217
|
-
end
|
|
218
|
-
|
|
219
|
-
case value
|
|
220
|
-
when Hash
|
|
221
|
-
value.each_with_object(value.class.new) do |(key, nested), copy|
|
|
222
|
-
copy[key] = deep_copy(nested)
|
|
223
|
-
end
|
|
224
|
-
when Array
|
|
225
|
-
value.map { |element| deep_copy(element) }
|
|
226
|
-
else
|
|
227
|
-
value.dup
|
|
228
|
-
end
|
|
229
|
-
rescue TypeError
|
|
230
|
-
value
|
|
231
|
-
end
|
|
232
|
-
|
|
233
141
|
def build_attributes
|
|
234
|
-
|
|
235
|
-
title = string_or_nil(entry.title) if entry.respond_to?(:title)
|
|
236
|
-
raw_content = extract_content
|
|
237
|
-
content, content_processing_metadata = process_feed_content(raw_content, title: title)
|
|
238
|
-
fingerprint = generate_fingerprint(title, url, content)
|
|
239
|
-
published_at = extract_timestamp
|
|
240
|
-
updated_at_source = extract_updated_timestamp
|
|
241
|
-
|
|
242
|
-
metadata = extract_metadata
|
|
243
|
-
if content_processing_metadata.present?
|
|
244
|
-
metadata = metadata.merge("feed_content_processing" => content_processing_metadata)
|
|
245
|
-
end
|
|
246
|
-
|
|
247
|
-
{
|
|
248
|
-
guid: extract_guid,
|
|
249
|
-
title: title,
|
|
250
|
-
url: url,
|
|
251
|
-
canonical_url: url,
|
|
252
|
-
author: extract_author,
|
|
253
|
-
authors: extract_authors,
|
|
254
|
-
summary: extract_summary,
|
|
255
|
-
content: content,
|
|
256
|
-
published_at: published_at,
|
|
257
|
-
updated_at_source: updated_at_source,
|
|
258
|
-
categories: extract_categories,
|
|
259
|
-
tags: extract_tags,
|
|
260
|
-
keywords: extract_keywords,
|
|
261
|
-
enclosures: extract_enclosures,
|
|
262
|
-
media_thumbnail_url: extract_media_thumbnail_url,
|
|
263
|
-
media_content: extract_media_content,
|
|
264
|
-
language: extract_language,
|
|
265
|
-
copyright: extract_copyright,
|
|
266
|
-
comments_url: extract_comments_url,
|
|
267
|
-
comments_count: extract_comments_count,
|
|
268
|
-
metadata: metadata,
|
|
269
|
-
content_fingerprint: fingerprint
|
|
270
|
-
}.compact
|
|
142
|
+
entry_parser.parse
|
|
271
143
|
end
|
|
272
144
|
|
|
273
|
-
def extract_guid
|
|
274
|
-
|
|
275
|
-
return entry_guid if entry_guid.present?
|
|
276
|
-
|
|
277
|
-
return unless entry.respond_to?(:id)
|
|
278
|
-
|
|
279
|
-
entry_id = string_or_nil(entry.id)
|
|
280
|
-
return if entry_id.blank?
|
|
281
|
-
|
|
282
|
-
url = extract_url
|
|
283
|
-
return entry_id if url.blank? || entry_id != url
|
|
284
|
-
|
|
285
|
-
nil
|
|
145
|
+
def entry_parser
|
|
146
|
+
@entry_parser ||= EntryParser.new(source: source, entry: entry, content_extractor: content_extractor)
|
|
286
147
|
end
|
|
287
148
|
|
|
288
|
-
def extract_url
|
|
289
|
-
|
|
290
|
-
primary_url = string_or_nil(entry.url)
|
|
291
|
-
return primary_url if primary_url.present?
|
|
292
|
-
end
|
|
293
|
-
|
|
294
|
-
if entry.respond_to?(:link_nodes)
|
|
295
|
-
alternate = Array(entry.link_nodes).find do |node|
|
|
296
|
-
rel = string_or_nil(node&.rel)&.downcase
|
|
297
|
-
rel.nil? || rel == "alternate"
|
|
298
|
-
end
|
|
299
|
-
alternate ||= Array(entry.link_nodes).first
|
|
300
|
-
href = string_or_nil(alternate&.href)
|
|
301
|
-
return href if href.present?
|
|
302
|
-
end
|
|
303
|
-
|
|
304
|
-
if entry.respond_to?(:links)
|
|
305
|
-
href = Array(entry.links).map { |link| string_or_nil(link) }.find(&:present?)
|
|
306
|
-
return href if href.present?
|
|
307
|
-
end
|
|
308
|
-
|
|
309
|
-
nil
|
|
310
|
-
end
|
|
311
|
-
|
|
312
|
-
def extract_summary
|
|
313
|
-
return unless entry.respond_to?(:summary)
|
|
314
|
-
|
|
315
|
-
string_or_nil(entry.summary)
|
|
316
|
-
end
|
|
317
|
-
|
|
318
|
-
def extract_content
|
|
319
|
-
CONTENT_METHODS.each do |method|
|
|
320
|
-
next unless entry.respond_to?(method)
|
|
321
|
-
|
|
322
|
-
value = string_or_nil(entry.public_send(method))
|
|
323
|
-
return value if value.present?
|
|
324
|
-
end
|
|
325
|
-
nil
|
|
326
|
-
end
|
|
327
|
-
|
|
328
|
-
def extract_timestamp
|
|
329
|
-
TIMESTAMP_METHODS.each do |method|
|
|
330
|
-
next unless entry.respond_to?(method)
|
|
331
|
-
|
|
332
|
-
value = entry.public_send(method)
|
|
333
|
-
return value if value.present?
|
|
334
|
-
end
|
|
335
|
-
nil
|
|
336
|
-
end
|
|
337
|
-
|
|
338
|
-
def extract_updated_timestamp
|
|
339
|
-
return entry.updated if entry.respond_to?(:updated) && entry.updated.present?
|
|
340
|
-
|
|
341
|
-
nil
|
|
342
|
-
end
|
|
343
|
-
|
|
344
|
-
def extract_author
|
|
345
|
-
string_or_nil(entry.author) if entry.respond_to?(:author)
|
|
346
|
-
end
|
|
347
|
-
|
|
348
|
-
def extract_authors
|
|
349
|
-
values = []
|
|
350
|
-
|
|
351
|
-
if entry.respond_to?(:rss_authors)
|
|
352
|
-
values.concat(Array(entry.rss_authors).map { |value| string_or_nil(value) })
|
|
353
|
-
end
|
|
354
|
-
|
|
355
|
-
if entry.respond_to?(:dc_creators)
|
|
356
|
-
values.concat(Array(entry.dc_creators).map { |value| string_or_nil(value) })
|
|
357
|
-
elsif entry.respond_to?(:dc_creator)
|
|
358
|
-
values << string_or_nil(entry.dc_creator)
|
|
359
|
-
end
|
|
360
|
-
|
|
361
|
-
if entry.respond_to?(:author_nodes)
|
|
362
|
-
values.concat(
|
|
363
|
-
Array(entry.author_nodes).map do |node|
|
|
364
|
-
next unless node.respond_to?(:name) || node.respond_to?(:email) || node.respond_to?(:uri)
|
|
365
|
-
|
|
366
|
-
string_or_nil(node.name) || string_or_nil(node.email) || string_or_nil(node.uri)
|
|
367
|
-
end
|
|
368
|
-
)
|
|
369
|
-
end
|
|
370
|
-
|
|
371
|
-
if json_entry?
|
|
372
|
-
if entry.respond_to?(:json) && entry.json
|
|
373
|
-
json_authors = Array(entry.json["authors"]).map { |author| string_or_nil(author["name"]) }
|
|
374
|
-
values.concat(json_authors)
|
|
375
|
-
values << string_or_nil(entry.json.dig("author", "name"))
|
|
376
|
-
end
|
|
377
|
-
end
|
|
378
|
-
|
|
379
|
-
primary_author = extract_author
|
|
380
|
-
values << primary_author if primary_author.present?
|
|
381
|
-
|
|
382
|
-
values.compact.uniq
|
|
383
|
-
end
|
|
384
|
-
|
|
385
|
-
def extract_categories
|
|
386
|
-
list = []
|
|
387
|
-
list.concat(Array(entry.categories)) if entry.respond_to?(:categories)
|
|
388
|
-
list.concat(Array(entry.tags)) if entry.respond_to?(:tags)
|
|
389
|
-
if json_entry? && entry.respond_to?(:json) && entry.json
|
|
390
|
-
list.concat(Array(entry.json["tags"]))
|
|
391
|
-
end
|
|
392
|
-
sanitize_string_array(list)
|
|
149
|
+
def content_extractor
|
|
150
|
+
@content_extractor ||= ContentExtractor.new(source: source)
|
|
393
151
|
end
|
|
394
152
|
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
def
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
end
|
|
415
|
-
|
|
416
|
-
def extract_enclosures
|
|
417
|
-
enclosures = []
|
|
418
|
-
|
|
419
|
-
if entry.respond_to?(:enclosure_nodes)
|
|
420
|
-
Array(entry.enclosure_nodes).each do |node|
|
|
421
|
-
url = string_or_nil(node&.url)
|
|
422
|
-
next if url.blank?
|
|
423
|
-
|
|
424
|
-
enclosures << {
|
|
425
|
-
"url" => url,
|
|
426
|
-
"type" => string_or_nil(node&.type),
|
|
427
|
-
"length" => safe_integer(node&.length),
|
|
428
|
-
"source" => "rss_enclosure"
|
|
429
|
-
}.compact
|
|
430
|
-
end
|
|
431
|
-
end
|
|
432
|
-
|
|
433
|
-
if atom_entry? && entry.respond_to?(:link_nodes)
|
|
434
|
-
Array(entry.link_nodes).each do |link|
|
|
435
|
-
next unless string_or_nil(link&.rel)&.downcase == "enclosure"
|
|
436
|
-
|
|
437
|
-
url = string_or_nil(link&.href)
|
|
438
|
-
next if url.blank?
|
|
439
|
-
|
|
440
|
-
enclosures << {
|
|
441
|
-
"url" => url,
|
|
442
|
-
"type" => string_or_nil(link&.type),
|
|
443
|
-
"length" => safe_integer(link&.length),
|
|
444
|
-
"source" => "atom_link"
|
|
445
|
-
}.compact
|
|
446
|
-
end
|
|
447
|
-
end
|
|
448
|
-
|
|
449
|
-
if json_entry? && entry.respond_to?(:json) && entry.json
|
|
450
|
-
Array(entry.json["attachments"]).each do |attachment|
|
|
451
|
-
url = string_or_nil(attachment["url"])
|
|
452
|
-
next if url.blank?
|
|
453
|
-
|
|
454
|
-
enclosures << {
|
|
455
|
-
"url" => url,
|
|
456
|
-
"type" => string_or_nil(attachment["mime_type"]),
|
|
457
|
-
"length" => safe_integer(attachment["size_in_bytes"]),
|
|
458
|
-
"duration" => safe_integer(attachment["duration_in_seconds"]),
|
|
459
|
-
"title" => string_or_nil(attachment["title"]),
|
|
460
|
-
"source" => "json_feed_attachment"
|
|
461
|
-
}.compact
|
|
462
|
-
end
|
|
463
|
-
end
|
|
464
|
-
|
|
465
|
-
enclosures.uniq
|
|
466
|
-
end
|
|
467
|
-
|
|
468
|
-
def extract_media_thumbnail_url
|
|
469
|
-
if entry.respond_to?(:media_thumbnail_nodes)
|
|
470
|
-
thumbnail = Array(entry.media_thumbnail_nodes).find { |node| string_or_nil(node&.url).present? }
|
|
471
|
-
return string_or_nil(thumbnail&.url) if thumbnail
|
|
472
|
-
end
|
|
473
|
-
|
|
474
|
-
string_or_nil(entry.image) if entry.respond_to?(:image)
|
|
475
|
-
end
|
|
476
|
-
|
|
477
|
-
def extract_media_content
|
|
478
|
-
contents = []
|
|
479
|
-
|
|
480
|
-
if entry.respond_to?(:media_content_nodes)
|
|
481
|
-
Array(entry.media_content_nodes).each do |node|
|
|
482
|
-
url = string_or_nil(node&.url)
|
|
483
|
-
next if url.blank?
|
|
484
|
-
|
|
485
|
-
contents << {
|
|
486
|
-
"url" => url,
|
|
487
|
-
"type" => string_or_nil(node&.type),
|
|
488
|
-
"medium" => string_or_nil(node&.medium),
|
|
489
|
-
"height" => safe_integer(node&.height),
|
|
490
|
-
"width" => safe_integer(node&.width),
|
|
491
|
-
"file_size" => safe_integer(node&.file_size),
|
|
492
|
-
"duration" => safe_integer(node&.duration),
|
|
493
|
-
"expression" => string_or_nil(node&.expression)
|
|
494
|
-
}.compact
|
|
495
|
-
end
|
|
496
|
-
end
|
|
497
|
-
|
|
498
|
-
contents.uniq
|
|
499
|
-
end
|
|
500
|
-
|
|
501
|
-
def extract_language
|
|
502
|
-
if entry.respond_to?(:language)
|
|
503
|
-
return string_or_nil(entry.language)
|
|
504
|
-
end
|
|
505
|
-
|
|
506
|
-
if json_entry? && entry.respond_to?(:json) && entry.json
|
|
507
|
-
return string_or_nil(entry.json["language"])
|
|
508
|
-
end
|
|
509
|
-
|
|
510
|
-
nil
|
|
511
|
-
end
|
|
512
|
-
|
|
513
|
-
def extract_copyright
|
|
514
|
-
if entry.respond_to?(:copyright)
|
|
515
|
-
return string_or_nil(entry.copyright)
|
|
516
|
-
end
|
|
517
|
-
|
|
518
|
-
if json_entry? && entry.respond_to?(:json) && entry.json
|
|
519
|
-
return string_or_nil(entry.json["copyright"])
|
|
520
|
-
end
|
|
521
|
-
|
|
522
|
-
nil
|
|
523
|
-
end
|
|
524
|
-
|
|
525
|
-
def extract_comments_url
|
|
526
|
-
string_or_nil(entry.comments) if entry.respond_to?(:comments)
|
|
527
|
-
end
|
|
528
|
-
|
|
529
|
-
def extract_comments_count
|
|
530
|
-
raw = nil
|
|
531
|
-
raw ||= entry.slash_comments_raw if entry.respond_to?(:slash_comments_raw)
|
|
532
|
-
raw ||= entry.comments_count if entry.respond_to?(:comments_count)
|
|
533
|
-
safe_integer(raw)
|
|
534
|
-
end
|
|
535
|
-
|
|
536
|
-
def extract_metadata
|
|
537
|
-
return {} unless entry.respond_to?(:to_h)
|
|
538
|
-
|
|
539
|
-
normalized = normalize_metadata(entry.to_h)
|
|
540
|
-
return {} if normalized.blank?
|
|
541
|
-
|
|
542
|
-
{ METADATA_ROOT_KEY => normalized }
|
|
543
|
-
end
|
|
544
|
-
|
|
545
|
-
def generate_fingerprint(title, url, content)
|
|
546
|
-
Digest::SHA256.hexdigest(
|
|
547
|
-
[
|
|
548
|
-
title.to_s,
|
|
549
|
-
url.to_s,
|
|
550
|
-
content.to_s
|
|
551
|
-
].join(FINGERPRINT_SEPARATOR)
|
|
552
|
-
)
|
|
553
|
-
end
|
|
554
|
-
|
|
555
|
-
def string_or_nil(value)
|
|
556
|
-
return value unless value.is_a?(String)
|
|
557
|
-
|
|
558
|
-
value.strip.presence
|
|
559
|
-
end
|
|
560
|
-
|
|
561
|
-
def sanitize_string_array(values)
|
|
562
|
-
Array(values).map { |value| string_or_nil(value) }.compact.uniq
|
|
563
|
-
end
|
|
564
|
-
|
|
565
|
-
def split_keywords(value)
|
|
566
|
-
return [] if value.nil?
|
|
567
|
-
|
|
568
|
-
string = string_or_nil(value)
|
|
569
|
-
return [] if string.blank?
|
|
570
|
-
|
|
571
|
-
string.split(KEYWORD_SEPARATORS).map { |keyword| keyword.strip.presence }.compact
|
|
572
|
-
end
|
|
573
|
-
|
|
574
|
-
def safe_integer(value)
|
|
575
|
-
return if value.nil?
|
|
576
|
-
return value if value.is_a?(Integer)
|
|
577
|
-
|
|
578
|
-
string = value.to_s.strip
|
|
579
|
-
return if string.blank?
|
|
580
|
-
|
|
581
|
-
Integer(string, 10)
|
|
582
|
-
rescue ArgumentError
|
|
583
|
-
nil
|
|
584
|
-
end
|
|
585
|
-
|
|
586
|
-
def json_entry?
|
|
587
|
-
defined?(Feedjira::Parser::JSONFeedItem) && entry.is_a?(Feedjira::Parser::JSONFeedItem)
|
|
588
|
-
end
|
|
589
|
-
|
|
590
|
-
def atom_entry?
|
|
591
|
-
defined?(Feedjira::Parser::AtomEntry) && entry.is_a?(Feedjira::Parser::AtomEntry)
|
|
592
|
-
end
|
|
593
|
-
|
|
594
|
-
def normalize_metadata(value)
|
|
595
|
-
JSON.parse(JSON.generate(value))
|
|
596
|
-
rescue JSON::GeneratorError, JSON::ParserError, TypeError
|
|
597
|
-
{}
|
|
598
|
-
end
|
|
153
|
+
# Forwarding methods for backward compatibility with tests
|
|
154
|
+
def process_feed_content(raw_content, title:) = content_extractor.process_feed_content(raw_content, title: title)
|
|
155
|
+
def should_process_feed_content?(raw_content) = content_extractor.should_process_feed_content?(raw_content)
|
|
156
|
+
def feed_content_parser_class = content_extractor.feed_content_parser_class
|
|
157
|
+
def wrap_content_for_readability(content, title:) = content_extractor.wrap_content_for_readability(content, title: title)
|
|
158
|
+
def default_feed_readability_options = content_extractor.default_feed_readability_options
|
|
159
|
+
def build_feed_content_metadata(result:, raw_content:, processed_content:)
|
|
160
|
+
content_extractor.build_feed_content_metadata(result: result, raw_content: raw_content, processed_content: processed_content)
|
|
161
|
+
end
|
|
162
|
+
def html_fragment?(value) = content_extractor.html_fragment?(value)
|
|
163
|
+
def deep_copy(value) = content_extractor.deep_copy(value)
|
|
164
|
+
def string_or_nil(value) = entry_parser.string_or_nil(value)
|
|
165
|
+
def sanitize_string_array(values) = entry_parser.sanitize_string_array(values)
|
|
166
|
+
def split_keywords(value) = entry_parser.split_keywords(value)
|
|
167
|
+
def safe_integer(value) = entry_parser.safe_integer(value)
|
|
168
|
+
def json_entry? = entry_parser.json_entry?
|
|
169
|
+
def atom_entry? = entry_parser.atom_entry?
|
|
170
|
+
def normalize_metadata(value) = entry_parser.normalize_metadata(value)
|
|
171
|
+
def generate_fingerprint(title, url, content) = entry_parser.generate_fingerprint(title, url, content)
|
|
599
172
|
end
|
|
600
173
|
end
|
|
601
174
|
end
|