source_monitor 0.13.0 → 0.13.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. checksums.yaml +4 -4
  2. data/.claude/skills/sm-event-handler/SKILL.md +1 -1
  3. data/.claude/skills/sm-event-handler/reference/events-api.md +1 -1
  4. data/CHANGELOG.md +5 -1
  5. data/Gemfile.lock +1 -1
  6. data/README.md +3 -3
  7. data/app/assets/builds/source_monitor/application.css +4 -0
  8. data/app/controllers/source_monitor/bulk_scrape_enablements_controller.rb +1 -1
  9. data/app/controllers/source_monitor/import_sessions/bulk_configuration.rb +3 -1
  10. data/app/controllers/source_monitor/import_sessions_controller.rb +118 -72
  11. data/app/controllers/source_monitor/sources_controller.rb +4 -18
  12. data/app/models/source_monitor/source.rb +1 -1
  13. data/docs/setup.md +2 -2
  14. data/docs/upgrade.md +14 -0
  15. data/lib/source_monitor/analytics/scrape_recommendations.rb +21 -2
  16. data/lib/source_monitor/fetching/feed_fetcher/failure_outcome.rb +85 -0
  17. data/lib/source_monitor/fetching/feed_fetcher/success_outcome.rb +85 -0
  18. data/lib/source_monitor/fetching/feed_fetcher.rb +27 -88
  19. data/lib/source_monitor/fetching/fetch_runner.rb +12 -5
  20. data/lib/source_monitor/import_sessions/wizard.rb +612 -0
  21. data/lib/source_monitor/items/batch_item_creator.rb +7 -6
  22. data/lib/source_monitor/items/item_creator.rb +7 -14
  23. data/lib/source_monitor/items/normalized_entry.rb +61 -0
  24. data/lib/source_monitor/version.rb +1 -1
  25. data/lib/source_monitor.rb +2 -0
  26. metadata +5 -4
  27. data/app/controllers/source_monitor/import_sessions/entry_annotation.rb +0 -187
  28. data/app/controllers/source_monitor/import_sessions/health_check_management.rb +0 -112
  29. data/app/controllers/source_monitor/import_sessions/opml_parser.rb +0 -130
@@ -0,0 +1,612 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "nokogiri"
4
+ require "uri"
5
+ require "active_support/core_ext/object/blank"
6
+ require "source_monitor/import_sessions/entry_normalizer"
7
+
8
module SourceMonitor
  module ImportSessions
    # Coordinates the steps of an OPML import session ("upload", "preview",
    # "health_check", "confirm"). Each +handle_*+ method reads the request
    # params, persists changes on the import session and returns a result
    # struct; each +*_context+ method assembles the data for one step.
    class Wizard
      # Content types accepted outright for an uploaded OPML file.
      ALLOWED_CONTENT_TYPES = %w[text/xml application/xml text/x-opml application/opml].freeze
      # Generic binary content types that are also let through by #validate_upload.
      GENERIC_CONTENT_TYPES = %w[application/octet-stream binary/octet-stream].freeze
      # Param values treated as "true" for the select_all / select_none flags.
      TRUE_PARAM_VALUES = [ true, "true", "1", 1, "on" ].freeze

      # Raised while parsing an upload; #handle_upload converts it into an
      # :invalid UploadResult carrying the error message.
      class UploadError < StandardError; end

      # Outcome of #handle_upload (+status+ is :success or :invalid).
      UploadResult = Struct.new(:status, :errors, :current_step, :preview_context, keyword_init: true)

      # Outcome of #handle_preview (+status+ is :success or :blocked).
      PreviewResult = Struct.new(
        :status,
        :selected_source_ids,
        :valid_ids,
        :current_step,
        :selection_error,
        :preview_context,
        keyword_init: true
      ) do
        def blocked?
          status == :blocked
        end
      end

      # Data assembled by #preview_context for the preview step.
      PreviewContext = Struct.new(
        :filter,
        :page,
        :selected_source_ids,
        :preview_entries,
        :filtered_entries,
        :paginated_entries,
        :has_next_page,
        :has_previous_page,
        keyword_init: true
      )

      # Outcome of #handle_health_check (+status+ is :success or :blocked).
      HealthCheckResult = Struct.new(
        :status,
        :selected_source_ids,
        :current_step,
        :selection_error,
        :health_check_context,
        keyword_init: true
      ) do
        def blocked?
          status == :blocked
        end
      end

      # Data assembled by #health_check_context for the health check step.
      HealthCheckContext = Struct.new(
        :selected_source_ids,
        :health_check_entries,
        :health_check_target_ids,
        :health_progress,
        keyword_init: true
      )

      # Outcome of #handle_confirm (+status+ is :success or :blocked).
      ConfirmResult = Struct.new(
        :status,
        :selected_source_ids,
        :selected_entries,
        :bulk_settings,
        :selection_error,
        :message,
        :history,
        keyword_init: true
      ) do
        def blocked?
          status == :blocked
        end
      end

      # Data assembled by #confirm_context for the confirm step.
      ConfirmContext = Struct.new(:selected_source_ids, :selected_entries, :bulk_settings, keyword_init: true)

      # @param import_session the import session record being advanced
      # @param params request params; reads :opml_file, :filter, :page and the
      #   nested :import_session hash
      # @param current_step [String] the step the request was made from
      # @param now timestamp used for "uploaded_at" and health check start/completion times
      def initialize(import_session:, params:, current_step:, now: Time.current)
        @import_session = import_session
        @params = params
        @current_step = current_step
        @now = now
      end

      # Validates and parses the uploaded OPML file, then stores the file
      # metadata and parsed entries on the import session.
      #
      # @return [UploadResult] :invalid when validation or parsing fails or no
      #   entry has status "valid"; otherwise :success with the preview context
      def handle_upload
        errors = validate_upload
        return UploadResult.new(status: :invalid, errors: errors, current_step: current_step) if errors.any?

        parsed_entries = parse_opml_file(opml_file)
        valid_entries = parsed_entries.select { |entry| entry[:status] == "valid" }

        if valid_entries.empty?
          # The parsed entries are still stored, but the session stays on "upload".
          import_session.update!(
            opml_file_metadata: build_file_metadata,
            parsed_sources: parsed_entries,
            current_step: "upload"
          )

          return UploadResult.new(
            status: :invalid,
            errors: [ "We couldn't find any valid feeds in that OPML file. Check the file and try again." ],
            current_step: "upload"
          )
        end

        next_step = target_step
        import_session.update!(
          opml_file_metadata: build_file_metadata.merge("uploaded_at" => now),
          parsed_sources: parsed_entries,
          current_step: next_step
        )

        UploadResult.new(
          status: :success,
          errors: [],
          current_step: next_step,
          preview_context: preview_context
        )
      rescue UploadError => error
        UploadResult.new(status: :invalid, errors: [ error.message ], current_step: current_step)
      end

      # Persists the source selection made on the preview step (select all,
      # select none, or the explicitly submitted ids) and moves the session to
      # the requested step.
      #
      # @return [PreviewResult] :blocked when leaving the preview step with an
      #   empty selection, otherwise :success
      def handle_preview
        selected_source_ids = Array(import_session.selected_source_ids).map(&:to_s)
        preview_entries = annotated_entries(selected_source_ids)
        selectable_entries = preview_entries.select { |entry| entry[:selectable] }

        valid_ids =
          if select_all_requested?
            ids = selectable_entries.map { |entry| entry[:id] }
            import_session.update_column(:selected_source_ids, ids)
            ids
          elsif select_none_requested?
            import_session.update_column(:selected_source_ids, [])
            []
          else
            # build_selection_from_params already restricts the submitted ids to
            # the selectable ones, so no further filtering is needed here.
            ids = build_selection_from_params(selectable_entries)
            import_session.update!(selected_source_ids: ids)
            ids
          end

        if advancing_from_preview? && valid_ids.empty?
          return PreviewResult.new(
            status: :blocked,
            selected_source_ids: valid_ids,
            valid_ids: valid_ids,
            current_step: current_step,
            selection_error: "Select at least one new source to continue.",
            preview_context: preview_context(selected_source_ids: valid_ids)
          )
        end

        next_step = target_step
        import_session.update_column(:current_step, next_step) if import_session.current_step != next_step

        PreviewResult.new(
          status: :success,
          selected_source_ids: valid_ids,
          valid_ids: valid_ids,
          current_step: next_step,
          preview_context: preview_context(selected_source_ids: valid_ids)
        )
      end

      # Persists the selection made on the health check step and moves the
      # session to the requested step, deactivating health checks when leaving.
      #
      # @return [HealthCheckResult] :blocked when leaving the step with an
      #   empty selection, otherwise :success (the context is only included
      #   when the session stays on "health_check")
      def handle_health_check
        selected_source_ids = health_check_selection_from_params
        import_session.update!(selected_source_ids: selected_source_ids)

        if advancing_from_health_check? && selected_source_ids.blank?
          deactivate_health_checks

          return HealthCheckResult.new(
            status: :blocked,
            selected_source_ids: selected_source_ids,
            current_step: current_step,
            selection_error: "Select at least one source to continue.",
            health_check_context: health_check_context
          )
        end

        next_step = target_step
        deactivate_health_checks if next_step != "health_check"
        import_session.update_column(:current_step, next_step) if import_session.current_step != next_step

        HealthCheckResult.new(
          status: :success,
          selected_source_ids: selected_source_ids,
          current_step: next_step,
          health_check_context: (health_check_context if next_step == "health_check")
        )
      end

      # Creates an ImportHistory record and enqueues ImportOpmlJob for the
      # currently selected entries.
      #
      # @return [ConfirmResult] :blocked when nothing is selected, otherwise
      #   :success with the created history and a user-facing message
      def handle_confirm
        context = confirm_context

        if context.selected_entries.empty?
          return ConfirmResult.new(
            status: :blocked,
            selected_source_ids: context.selected_source_ids,
            selected_entries: context.selected_entries,
            bulk_settings: context.bulk_settings,
            selection_error: "Select at least one source to import."
          )
        end

        history = SourceMonitor::ImportHistory.create!(
          user_id: import_session.user_id,
          bulk_settings: import_session.bulk_settings
        )
        SourceMonitor::ImportOpmlJob.perform_later(import_session.id, history.id)
        import_session.update_column(:current_step, "confirm") if import_session.current_step != "confirm"

        ConfirmResult.new(
          status: :success,
          selected_source_ids: context.selected_source_ids,
          selected_entries: context.selected_entries,
          bulk_settings: context.bulk_settings,
          history: history,
          message: "Import started for #{context.selected_entries.size} sources."
        )
      end

      # Builds the filtered, paginated entry list for the preview step.
      #
      # @param selected_source_ids [Array, nil] ids to mark as selected;
      #   falls back to the ids stored on the import session when nil
      # @return [PreviewContext]
      def preview_context(selected_source_ids: nil)
        filter = permitted_filter(params[:filter]) || "all"
        page = normalize_page_param(params[:page])
        selected_source_ids = Array(selected_source_ids || import_session.selected_source_ids).map(&:to_s)
        preview_entries = annotated_entries(selected_source_ids)

        filtered_entries = filter_entries(preview_entries, filter)
        paginator = SourceMonitor::Pagination::Paginator.new(
          scope: filtered_entries,
          page: page,
          per_page: preview_per_page
        ).paginate

        PreviewContext.new(
          filter: filter,
          page: paginator.page,
          selected_source_ids: selected_source_ids,
          preview_entries: preview_entries,
          filtered_entries: filtered_entries,
          paginated_entries: paginator.records,
          has_next_page: paginator.has_next_page,
          has_previous_page: paginator.has_previous_page
        )
      end

      # Same as #preview_context, but when the session has no stored selection
      # it first selects (and persists) every selectable entry.
      #
      # @return [PreviewContext]
      def preview_context_with_default_selection
        selected_source_ids = Array(import_session.selected_source_ids).map(&:to_s)
        preview_entries = annotated_entries(selected_source_ids)

        if selected_source_ids.blank? && preview_entries.present?
          selected_source_ids = preview_entries.select { |entry| entry[:selectable] }.map { |entry| entry[:id] }
          import_session.update_column(:selected_source_ids, selected_source_ids)
        end

        preview_context(selected_source_ids: selected_source_ids)
      end

      # Starts health checks when needed (only on the "health_check" step) and
      # returns the entries, target ids and progress for that step.
      #
      # @return [HealthCheckContext]
      def health_check_context
        start_health_checks_if_needed

        selected_source_ids = Array(import_session.selected_source_ids).map(&:to_s)
        entries = health_check_entries(selected_source_ids)
        target_ids = health_check_targets

        HealthCheckContext.new(
          selected_source_ids: selected_source_ids,
          health_check_entries: entries,
          health_check_target_ids: target_ids,
          health_progress: health_check_progress(entries)
        )
      end

      # Marks health checks as inactive and stamps the completion time; does
      # nothing when they are not currently active.
      def deactivate_health_checks
        return unless import_session.health_checks_active?

        import_session.update_columns(
          health_checks_active: false,
          health_check_completed_at: now
        )
      end

      # Collects the selected entries and bulk settings for the confirm step.
      #
      # @return [ConfirmContext]
      def confirm_context
        selected_source_ids = Array(import_session.selected_source_ids).map(&:to_s)
        selected_entries = annotated_entries(selected_source_ids)
          .select { |entry| selected_source_ids.include?(entry[:id]) }

        ConfirmContext.new(
          selected_source_ids: selected_source_ids,
          selected_entries: selected_entries,
          bulk_settings: import_session.bulk_settings || {}
        )
      end

      private

      attr_reader :import_session, :params, :current_step, :now

      # Returns a list of user-facing error messages for the uploaded file
      # (empty when the upload passes the presence, size and content type checks).
      def validate_upload
        return [ "Upload an OPML file to continue." ] unless opml_file.present?

        errors = []
        errors << "The uploaded file is empty. Choose another OPML file." if opml_file.size.to_i <= 0

        if opml_file.content_type.present? && !content_type_allowed?(opml_file.content_type) && !generic_content_type?(opml_file.content_type)
          errors << "Upload must be an OPML or XML file."
        end

        errors
      end

      def opml_file
        params[:opml_file]
      end

      # Filename, byte size and content type of the upload, or an empty hash
      # when the upload does not respond to #original_filename.
      def build_file_metadata
        return {} unless opml_file.respond_to?(:original_filename)

        {
          "filename" => opml_file.original_filename,
          "byte_size" => opml_file.size,
          "content_type" => opml_file.content_type
        }
      end

      def content_type_allowed?(content_type)
        ALLOWED_CONTENT_TYPES.include?(content_type)
      end

      def generic_content_type?(content_type)
        GENERIC_CONTENT_TYPES.include?(content_type)
      end

      # Parses the uploaded file as XML and builds one entry hash per
      # <outline> element that carries an xmlUrl attribute (matched
      # case-insensitively). Entry ids use the outline's index among all
      # outlines, including the skipped ones.
      #
      # @raise [UploadError] when the content is blank or cannot be parsed
      def parse_opml_file(file)
        content = file.read
        file.rewind if file.respond_to?(:rewind)

        raise UploadError, "The uploaded file appears to be empty." if content.blank?

        document = Nokogiri::XML(content) { |config| config.strict.nonet }
        raise UploadError, "The uploaded file is not valid XML or OPML." if document.root.nil?

        document.xpath("//outline").each_with_index.filter_map do |outline, index|
          next unless outline.attribute_nodes.any? { |attr| attr.name.casecmp("xmlurl").zero? }

          build_entry(outline, index)
        end
      rescue Nokogiri::XML::SyntaxError => error
        raise UploadError, "We couldn't parse that OPML file: #{error.message}"
      end

      # Builds the entry hash for one outline: status "valid" when the feed URL
      # is present and passes #valid_feed_url?, otherwise a "malformed" entry.
      def build_entry(outline, index)
        feed_url = outline_attribute(outline, "xmlUrl")
        website_url = outline_attribute(outline, "htmlUrl")
        title = outline_attribute(outline, "title") || outline_attribute(outline, "text")

        if feed_url.blank?
          return malformed_entry(index, feed_url, title, website_url, "Missing feed URL")
        end

        unless valid_feed_url?(feed_url)
          return malformed_entry(index, feed_url, title, website_url, "Feed URL must be HTTP or HTTPS")
        end

        {
          id: "outline-#{index}",
          raw_outline_index: index,
          feed_url: feed_url,
          title: title,
          website_url: website_url,
          status: "valid",
          error: nil,
          health_status: nil,
          health_error: nil
        }
      end

      def malformed_entry(index, feed_url, title, website_url, error)
        {
          id: "outline-#{index}",
          raw_outline_index: index,
          feed_url: feed_url.presence,
          title: title,
          website_url: website_url,
          status: "malformed",
          error: error,
          health_status: nil,
          health_error: nil
        }
      end

      # Case-insensitive attribute lookup; returns nil for missing or blank values.
      def outline_attribute(outline, name)
        attribute = outline.attribute_nodes.find { |attr| attr.name.casecmp(name).zero? }
        attribute&.value.to_s.presence
      end

      # True when the URL parses as URI::HTTP (which URI::HTTPS subclasses)
      # and has a host.
      def valid_feed_url?(url)
        parsed = URI.parse(url)
        parsed.is_a?(URI::HTTP) && parsed.host.present?
      rescue URI::InvalidURIError
        false
      end

      # Normalizes the session's parsed entries and adds three flags to each:
      # :duplicate (a Source with the same feed URL, compared case-insensitively,
      # already exists), :selectable (status "valid" and not a duplicate) and
      # :selected (id is in +selected_ids+).
      def annotated_entries(selected_ids)
        entries = Array(import_session.parsed_sources)
        return [] if entries.blank?

        normalized = entries.map { |entry| normalize_entry(entry) }
        existing_urls = existing_feed_url_lookup(normalized)
        # Hash lookups keep the per-entry membership checks constant-time.
        selected_lookup = Array(selected_ids).to_h { |id| [ id, true ] }

        normalized.map do |entry|
          duplicate = entry[:feed_url].present? && existing_urls.key?(entry[:feed_url].downcase)
          entry.merge(
            duplicate: duplicate,
            selectable: entry[:status] == "valid" && !duplicate,
            selected: selected_lookup.key?(entry[:id])
          )
        end
      end

      # Returns a hash keyed by the downcased feed URLs of existing sources
      # that match any of the given entries' feed URLs.
      def existing_feed_url_lookup(entries)
        feed_urls = entries.filter_map { |entry| entry[:feed_url]&.downcase }
        return {} if feed_urls.blank?

        SourceMonitor::Source
          .where("LOWER(feed_url) IN (?)", feed_urls)
          .pluck(:feed_url)
          .to_h { |url| [ url.downcase, true ] }
      end

      def normalize_entry(entry)
        SourceMonitor::ImportSessions::EntryNormalizer.normalize(entry)
      end

      # "new" keeps selectable entries, "existing" keeps duplicates, anything
      # else returns the entries unchanged.
      def filter_entries(entries, filter)
        case filter
        when "new"
          entries.select { |entry| entry[:selectable] }
        when "existing"
          entries.select { |entry| entry[:duplicate] }
        else
          entries
        end
      end

      # Submitted ids (stringified, de-duplicated) restricted to the ids of the
      # selectable entries; empty when no ids were submitted.
      def build_selection_from_params(selectable_entries)
        ids = import_session_params[:selected_source_ids]
        return [] unless ids

        Array(ids).map(&:to_s).uniq & selectable_entries.map { |entry| entry[:id] }
      end

      def advancing_from_preview?
        target_step != "preview"
      end

      # Selection for the health check step: all targets for select_all, none
      # for select_none, the stored selection when no ids were submitted,
      # otherwise the submitted ids restricted to the health check targets.
      def health_check_selection_from_params
        if select_all_requested?
          return health_check_targets.dup
        end

        return [] if select_none_requested?

        ids = import_session_params[:selected_source_ids]
        return Array(import_session.selected_source_ids).map(&:to_s) unless ids

        Array(ids).map(&:to_s).uniq & health_check_targets
      end

      def advancing_from_health_check?
        target_step != "health_check"
      end

      # On the "health_check" step, (re)starts health checks under a row lock
      # when the active targets differ from the current selection, then
      # enqueues one job per selected id after the lock block has finished.
      def start_health_checks_if_needed
        return unless current_step == "health_check"

        jobs_to_enqueue = []

        import_session.with_lock do
          import_session.reload
          selected = Array(import_session.selected_source_ids).map(&:to_s)

          if selected.blank?
            import_session.update_columns(health_checks_active: false, health_check_target_ids: [])
            next
          end

          # Already running for exactly this selection: nothing to restart.
          if import_session.health_checks_active? && import_session.health_check_targets.sort == selected.sort
            next
          end

          import_session.update!(
            parsed_sources: reset_health_results(import_session.parsed_sources, selected),
            health_checks_active: true,
            health_check_target_ids: selected,
            health_check_started_at: now,
            health_check_completed_at: nil
          )

          jobs_to_enqueue = selected
        end

        enqueue_health_check_jobs(import_session, jobs_to_enqueue) if jobs_to_enqueue.any?
      end

      # Returns the entries with health_status "pending" and health_error nil
      # for every entry whose id is in +target_ids+; other entries are returned
      # as plain hashes, unchanged.
      def reset_health_results(entries, target_ids)
        Array(entries).map do |entry|
          entry_hash = entry.to_h
          entry_id = entry_hash["id"] || entry_hash[:id]
          next entry_hash unless target_ids.include?(entry_id.to_s)

          # String-key the entry first so a symbol-keyed :health_status /
          # :health_error cannot survive next to the reset string keys.
          entry_hash.transform_keys(&:to_s).merge("health_status" => "pending", "health_error" => nil)
        end
      end

      def enqueue_health_check_jobs(import_session, target_ids)
        target_ids.each do |target_id|
          SourceMonitor::ImportSessionHealthCheckJob.set(wait: 1.second).perform_later(import_session.id, target_id)
        end
      end

      # Normalized entries limited to the health check targets, each flagged
      # with whether it is currently selected.
      def health_check_entries(selected_ids)
        targets = health_check_targets
        entries = Array(import_session.parsed_sources).map { |entry| normalize_entry(entry) }

        entries.select { |entry| targets.include?(entry[:id]) }.map do |entry|
          entry.merge(selected: selected_ids.include?(entry[:id]))
        end
      end

      # Progress summary hash with :completed, :total, :pending, :active and :done.
      def health_check_progress(entries)
        total = health_check_targets.size
        completed = entries.count { |entry| health_check_complete?(entry) }

        {
          completed: completed,
          total: total,
          pending: [ total - completed, 0 ].max,
          active: import_session.health_checks_active?,
          done: total.positive? && completed >= total
        }
      end

      # An entry counts as complete once its health status is "working" or "failing".
      def health_check_complete?(entry)
        %w[working failing].include?(entry[:health_status].to_s)
      end

      # The session's health check targets, falling back to the stored
      # selection when no targets are set.
      def health_check_targets
        targets = import_session.health_check_targets
        targets = Array(import_session.selected_source_ids).map(&:to_s) if targets.blank?
        targets
      end

      # Coerces the page param to a positive integer; anything non-positive or
      # not convertible becomes 1.
      def normalize_page_param(value)
        number = value.to_i
        number = 1 if number <= 0
        number
      rescue StandardError
        1
      end

      # Returns the filter when it is one of "all", "new" or "existing", else nil.
      def permitted_filter(raw)
        value = raw.to_s.presence
        return unless value

        %w[all new existing].find { |candidate| candidate == value }
      end

      def preview_per_page
        25
      end

      # The requested next step when it is a known step, otherwise the current
      # step, otherwise the import session default.
      def target_step
        permitted_step(import_session_params[:next_step]) || current_step || ImportSession.default_step
      end

      # Returns the step when it appears in ImportSession::STEP_ORDER, else nil.
      def permitted_step(value)
        step = value.to_s.presence
        return unless step

        ImportSession::STEP_ORDER.find { |candidate| candidate == step }
      end

      # Memoized, sanitized view of the nested :import_session params. Strong
      # parameters are applied when the raw value supports #permit; otherwise
      # the raw value is converted with #to_h.
      def import_session_params
        @import_session_params ||= begin
          raw = params[:import_session] || params["import_session"] || {}
          permitted = if raw.respond_to?(:permit)
            raw.permit(:next_step, :select_all, :select_none, selected_source_ids: [])
          else
            raw.to_h
          end

          SourceMonitor::Security::ParameterSanitizer.sanitize(permitted.to_h).with_indifferent_access
        end
      end

      def select_all_requested?
        truthy_import_session_param?(:select_all)
      end

      def select_none_requested?
        truthy_import_session_param?(:select_none)
      end

      def truthy_import_session_param?(key)
        TRUE_PARAM_VALUES.include?(import_session_params[key])
      end
    end
  end
end
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require "source_monitor/items/item_creator"
4
+ require "source_monitor/items/normalized_entry"
4
5
 
5
6
  module SourceMonitor
6
7
  module Items
@@ -32,17 +33,17 @@ module SourceMonitor
32
33
 
33
34
  # Step 1: Pre-parse entries to extract GUIDs and fingerprints for bulk lookup.
34
35
  entry_identifiers = @entries.map do |entry|
35
- parser = ItemCreator::EntryParser.new(
36
+ normalized_entry = NormalizedEntry.new(
36
37
  source: @source,
37
38
  entry: entry,
38
39
  content_extractor: content_extractor
39
40
  )
40
- attrs = parser.parse
41
- raw_guid = attrs[:guid]
42
- normalized_guid = raw_guid.present? ? raw_guid.downcase : nil
43
- guid = normalized_guid.presence || attrs[:content_fingerprint]
44
41
 
45
- { guid: guid, fingerprint: attrs[:content_fingerprint], raw_guid_present: normalized_guid.present? }
42
+ {
43
+ guid: normalized_entry.item_guid,
44
+ fingerprint: normalized_entry.content_fingerprint,
45
+ raw_guid_present: normalized_entry.raw_guid_present?
46
+ }
46
47
  end
47
48
 
48
49
  # Step 2: Batch-fetch existing items by GUID (single query)
@@ -9,6 +9,7 @@ require "source_monitor/instrumentation"
9
9
  require "source_monitor/scrapers/readability"
10
10
  require "source_monitor/items/item_creator/entry_parser"
11
11
  require "source_monitor/items/item_creator/content_extractor"
12
+ require "source_monitor/items/normalized_entry"
12
13
 
13
14
  module SourceMonitor
14
15
  module Items
@@ -49,14 +50,10 @@ module SourceMonitor
49
50
  end
50
51
 
51
52
  def call
52
- attributes = build_attributes
53
- raw_guid = attributes[:guid]
54
- # Normalize GUID to lowercase so the plain btree index on guid is used
55
- # for lookups instead of LOWER(guid) which forces sequential scans.
56
- normalized_guid = raw_guid.present? ? raw_guid.downcase : nil
57
- attributes[:guid] = normalized_guid.presence || attributes[:content_fingerprint]
53
+ normalized_entry = build_normalized_entry
54
+ attributes = normalized_entry.item_attributes
58
55
 
59
- existing_item, matched_by = existing_item_for(attributes, raw_guid_present: normalized_guid.present?)
56
+ existing_item, matched_by = existing_item_for(attributes, raw_guid_present: normalized_entry.raw_guid_present?)
60
57
 
61
58
  if existing_item
62
59
  apply_attributes(existing_item, attributes)
@@ -70,7 +67,7 @@ module SourceMonitor
70
67
  end
71
68
  end
72
69
 
73
- create_new_item(attributes, raw_guid_present: normalized_guid.present?)
70
+ create_new_item(attributes, raw_guid_present: normalized_entry.raw_guid_present?)
74
71
  end
75
72
 
76
73
  private
@@ -193,12 +190,8 @@ module SourceMonitor
193
190
  (record.changed - IGNORED_CHANGE_ATTRIBUTES).any?
194
191
  end
195
192
 
196
- def build_attributes
197
- entry_parser.parse
198
- end
199
-
200
- def entry_parser
201
- @entry_parser ||= EntryParser.new(source: source, entry: entry, content_extractor: content_extractor)
193
+ def build_normalized_entry
194
+ @normalized_entry ||= NormalizedEntry.new(source: source, entry: entry, content_extractor: content_extractor)
202
195
  end
203
196
 
204
197
  def content_extractor