source_monitor 0.13.0 → 0.13.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.claude/skills/sm-event-handler/SKILL.md +1 -1
- data/.claude/skills/sm-event-handler/reference/events-api.md +1 -1
- data/CHANGELOG.md +5 -1
- data/Gemfile.lock +1 -1
- data/README.md +3 -3
- data/app/assets/builds/source_monitor/application.css +4 -0
- data/app/controllers/source_monitor/bulk_scrape_enablements_controller.rb +1 -1
- data/app/controllers/source_monitor/import_sessions/bulk_configuration.rb +3 -1
- data/app/controllers/source_monitor/import_sessions_controller.rb +118 -72
- data/app/controllers/source_monitor/sources_controller.rb +4 -18
- data/app/models/source_monitor/source.rb +1 -1
- data/docs/setup.md +2 -2
- data/docs/upgrade.md +14 -0
- data/lib/source_monitor/analytics/scrape_recommendations.rb +21 -2
- data/lib/source_monitor/fetching/feed_fetcher/failure_outcome.rb +85 -0
- data/lib/source_monitor/fetching/feed_fetcher/success_outcome.rb +85 -0
- data/lib/source_monitor/fetching/feed_fetcher.rb +27 -88
- data/lib/source_monitor/fetching/fetch_runner.rb +12 -5
- data/lib/source_monitor/import_sessions/wizard.rb +612 -0
- data/lib/source_monitor/items/batch_item_creator.rb +7 -6
- data/lib/source_monitor/items/item_creator.rb +7 -14
- data/lib/source_monitor/items/normalized_entry.rb +61 -0
- data/lib/source_monitor/version.rb +1 -1
- data/lib/source_monitor.rb +2 -0
- metadata +5 -4
- data/app/controllers/source_monitor/import_sessions/entry_annotation.rb +0 -187
- data/app/controllers/source_monitor/import_sessions/health_check_management.rb +0 -112
- data/app/controllers/source_monitor/import_sessions/opml_parser.rb +0 -130
|
@@ -0,0 +1,612 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "nokogiri"
|
|
4
|
+
require "uri"
|
|
5
|
+
require "active_support/core_ext/object/blank"
|
|
6
|
+
require "source_monitor/import_sessions/entry_normalizer"
|
|
7
|
+
|
|
8
|
+
module SourceMonitor
|
|
9
|
+
module ImportSessions
|
|
10
|
+
class Wizard
|
|
11
|
+
# MIME types the upload validation accepts outright.
ALLOWED_CONTENT_TYPES = [
  "text/xml",
  "application/xml",
  "text/x-opml",
  "application/opml"
].freeze

# Generic binary MIME types the upload validation also lets through.
GENERIC_CONTENT_TYPES = [
  "application/octet-stream",
  "binary/octet-stream"
].freeze

# Raised while parsing an uploaded file; handle_upload rescues it and
# reports the message back in an UploadResult.
class UploadError < StandardError
end

# Parameter values treated as "true" for the select_all / select_none flags.
TRUE_PARAM_VALUES = [ true, "true", "1", 1, "on" ].freeze
|
|
16
|
+
|
|
17
|
+
# Outcome of handle_upload: a status symbol, a list of error messages, the
# step to show next and (on success) the preview context.
UploadResult = Struct.new(
  :status,
  :errors,
  :current_step,
  :preview_context,
  keyword_init: true
)
|
|
18
|
+
|
|
19
|
+
# Outcome of handle_preview. +status+ is :success or :blocked; a blocked
# result carries a +selection_error+ message.
PreviewResult = Struct.new(
  :status, :selected_source_ids, :valid_ids, :current_step, :selection_error, :preview_context,
  keyword_init: true
) do
  # True when the step could not advance.
  def blocked?
    status == :blocked
  end
end
|
|
32
|
+
|
|
33
|
+
# View data for the preview step: the active filter and page, the selected
# ids, and the full / filtered / paginated entry lists with paging flags.
PreviewContext = Struct.new(
  :filter, :page, :selected_source_ids,
  :preview_entries, :filtered_entries, :paginated_entries,
  :has_next_page, :has_previous_page,
  keyword_init: true
)
|
|
44
|
+
|
|
45
|
+
# Outcome of handle_health_check. +status+ is :success or :blocked; a
# blocked result carries a +selection_error+ message.
HealthCheckResult = Struct.new(
  :status, :selected_source_ids, :current_step, :selection_error, :health_check_context,
  keyword_init: true
) do
  # True when the step could not advance.
  def blocked?
    status == :blocked
  end
end
|
|
57
|
+
|
|
58
|
+
# View data for the health-check step: selected ids, the entries being
# checked, the target ids and a progress summary hash.
HealthCheckContext = Struct.new(
  :selected_source_ids, :health_check_entries, :health_check_target_ids, :health_progress,
  keyword_init: true
)
|
|
65
|
+
|
|
66
|
+
# Outcome of handle_confirm. A :blocked result carries +selection_error+;
# a :success result carries +message+ and the created +history+ record.
ConfirmResult = Struct.new(
  :status, :selected_source_ids, :selected_entries, :bulk_settings,
  :selection_error, :message, :history,
  keyword_init: true
) do
  # True when the import could not be started.
  def blocked?
    status == :blocked
  end
end
|
|
80
|
+
|
|
81
|
+
# View data for the confirm step: the selected ids, their entries and the
# session's bulk settings.
ConfirmContext = Struct.new(
  :selected_source_ids,
  :selected_entries,
  :bulk_settings,
  keyword_init: true
)
|
|
82
|
+
|
|
83
|
+
# Builds a wizard around one import session and the current request.
#
# import_session - the session record the wizard reads and updates
# params         - request parameters (read with [] and, when available, permit)
# current_step   - name of the step currently being handled
# now            - timestamp used for recorded times (defaults to Time.current)
def initialize(import_session:, params:, current_step:, now: Time.current)
  @import_session, @params = import_session, params
  @current_step, @now = current_step, now
end
|
|
89
|
+
|
|
90
|
+
# Handles the upload step: validates the uploaded file, parses it, stores
# the parsed entries and file metadata on the import session, and returns
# an UploadResult.
#
# The result is :invalid when validation fails, when parsing raises
# UploadError, or when no parsed entry has status "valid"; otherwise it is
# :success and carries the preview context for the next screen.
def handle_upload
  validation_errors = validate_upload
  unless validation_errors.empty?
    return UploadResult.new(status: :invalid, errors: validation_errors, current_step: current_step)
  end

  entries = parse_opml_file(opml_file)

  # Nothing importable: keep the session on the upload step but still
  # record what was parsed.
  if entries.none? { |entry| entry[:status] == "valid" }
    import_session.update!(
      opml_file_metadata: build_file_metadata,
      parsed_sources: entries,
      current_step: "upload"
    )

    return UploadResult.new(
      status: :invalid,
      errors: [ "We couldn't find any valid feeds in that OPML file. Check the file and try again." ],
      current_step: "upload"
    )
  end

  upcoming_step = target_step
  metadata = build_file_metadata.merge("uploaded_at" => now)
  import_session.update!(
    opml_file_metadata: metadata,
    parsed_sources: entries,
    current_step: upcoming_step
  )

  UploadResult.new(
    status: :success,
    errors: [],
    current_step: upcoming_step,
    preview_context: preview_context
  )
rescue UploadError => error
  UploadResult.new(status: :invalid, errors: [ error.message ], current_step: current_step)
end
|
|
127
|
+
|
|
128
|
+
# Handles the preview step: works out which selectable entries are now
# selected (select-all, select-none, or the ids submitted in the params),
# persists that selection, and returns a PreviewResult.
#
# The result is :blocked when the request tries to leave the preview step
# with nothing selected; otherwise the session's current step is moved to
# the target step and the result is :success.
def handle_preview
  current_selection = Array(import_session.selected_source_ids).map(&:to_s)
  selectable = annotated_entries(current_selection).select { |entry| entry[:selectable] }

  chosen_ids =
    if select_all_requested?
      selectable.map { |entry| entry[:id] }.tap do |all_ids|
        import_session.update_column(:selected_source_ids, all_ids)
      end
    elsif select_none_requested?
      import_session.update_column(:selected_source_ids, [])
      []
    else
      requested = build_selection_from_params(selectable)
      kept = selectable.index_by { |entry| entry[:id] }.slice(*requested).keys
      import_session.update!(selected_source_ids: kept)
      kept
    end

  if advancing_from_preview? && chosen_ids.empty?
    return PreviewResult.new(
      status: :blocked,
      selected_source_ids: chosen_ids,
      valid_ids: chosen_ids,
      current_step: current_step,
      selection_error: "Select at least one new source to continue.",
      preview_context: preview_context(selected_source_ids: chosen_ids)
    )
  end

  upcoming_step = target_step
  import_session.update_column(:current_step, upcoming_step) unless import_session.current_step == upcoming_step

  PreviewResult.new(
    status: :success,
    selected_source_ids: chosen_ids,
    valid_ids: chosen_ids,
    current_step: upcoming_step,
    preview_context: preview_context(selected_source_ids: chosen_ids)
  )
end
|
|
169
|
+
|
|
170
|
+
# Handles the health-check step: persists the selection derived from the
# params and returns a HealthCheckResult.
#
# The result is :blocked (and health checks are deactivated) when the
# request tries to leave the step with nothing selected. Otherwise health
# checks are deactivated if the target step is a different one, the
# session's current step is updated, and the result is :success. The
# health-check context is only built when staying on this step.
def handle_health_check
  chosen_ids = health_check_selection_from_params
  import_session.update!(selected_source_ids: chosen_ids)

  if advancing_from_health_check? && chosen_ids.blank?
    deactivate_health_checks

    return HealthCheckResult.new(
      status: :blocked,
      selected_source_ids: chosen_ids,
      current_step: current_step,
      selection_error: "Select at least one source to continue.",
      health_check_context: health_check_context
    )
  end

  upcoming_step = target_step
  staying = upcoming_step == "health_check"
  deactivate_health_checks unless staying
  import_session.update_column(:current_step, upcoming_step) unless import_session.current_step == upcoming_step

  HealthCheckResult.new(
    status: :success,
    selected_source_ids: chosen_ids,
    current_step: upcoming_step,
    health_check_context: staying ? health_check_context : nil
  )
end
|
|
197
|
+
|
|
198
|
+
# Handles the confirm step. Returns a :blocked ConfirmResult when no
# entries are selected; otherwise creates an ImportHistory record, enqueues
# ImportOpmlJob for the session, marks the session as on the "confirm"
# step and returns a :success ConfirmResult with the history and a message.
def handle_confirm
  context = confirm_context
  shared = {
    selected_source_ids: context.selected_source_ids,
    selected_entries: context.selected_entries,
    bulk_settings: context.bulk_settings
  }

  if context.selected_entries.empty?
    return ConfirmResult.new(
      status: :blocked,
      **shared,
      selection_error: "Select at least one source to import."
    )
  end

  history = SourceMonitor::ImportHistory.create!(
    user_id: import_session.user_id,
    bulk_settings: import_session.bulk_settings
  )
  SourceMonitor::ImportOpmlJob.perform_later(import_session.id, history.id)
  import_session.update_column(:current_step, "confirm") unless import_session.current_step == "confirm"

  ConfirmResult.new(
    status: :success,
    **shared,
    history: history,
    message: "Import started for #{context.selected_entries.size} sources."
  )
end
|
|
227
|
+
|
|
228
|
+
# Builds the PreviewContext for the current request: annotates the parsed
# entries, applies the requested filter (defaulting to "all") and paginates
# the filtered list.
#
# selected_source_ids - ids to treat as selected; falls back to the ids
#                       stored on the import session when nil
def preview_context(selected_source_ids: nil)
  active_filter = permitted_filter(params[:filter]) || "all"
  requested_page = normalize_page_param(params[:page])
  selected_source_ids = Array(selected_source_ids || import_session.selected_source_ids).map(&:to_s)
  all_entries = annotated_entries(selected_source_ids)
  visible_entries = filter_entries(all_entries, active_filter)

  pagination = SourceMonitor::Pagination::Paginator
    .new(scope: visible_entries, page: requested_page, per_page: preview_per_page)
    .paginate

  PreviewContext.new(
    filter: active_filter,
    page: pagination.page,
    selected_source_ids: selected_source_ids,
    preview_entries: all_entries,
    filtered_entries: visible_entries,
    paginated_entries: pagination.records,
    has_next_page: pagination.has_next_page,
    has_previous_page: pagination.has_previous_page
  )
end
|
|
252
|
+
|
|
253
|
+
# Like preview_context, but when the session has no selection yet and there
# are entries, first selects every selectable entry and stores that
# selection on the session.
def preview_context_with_default_selection
  chosen_ids = Array(import_session.selected_source_ids).map(&:to_s)
  entries = annotated_entries(chosen_ids)

  if chosen_ids.blank? && entries.present?
    chosen_ids = entries.each_with_object([]) do |entry, ids|
      ids << entry[:id] if entry[:selectable]
    end
    import_session.update_column(:selected_source_ids, chosen_ids)
  end

  preview_context(selected_source_ids: chosen_ids)
end
|
|
264
|
+
|
|
265
|
+
# Builds the HealthCheckContext for the current session. Calls
# start_health_checks_if_needed first, so building the context may start
# health checks as a side effect.
def health_check_context
  start_health_checks_if_needed

  chosen_ids = Array(import_session.selected_source_ids).map(&:to_s)
  checked_entries = health_check_entries(chosen_ids)
  targets = health_check_targets
  progress = health_check_progress(checked_entries)

  HealthCheckContext.new(
    selected_source_ids: chosen_ids,
    health_check_entries: checked_entries,
    health_check_target_ids: targets,
    health_progress: progress
  )
end
|
|
279
|
+
|
|
280
|
+
# Marks health checks as finished on the session (inactive, completed at
# +now+). Does nothing when health checks are not currently active.
def deactivate_health_checks
  if import_session.health_checks_active?
    import_session.update_columns(health_checks_active: false, health_check_completed_at: now)
  end
end
|
|
288
|
+
|
|
289
|
+
# Builds the ConfirmContext: the session's selected ids, the annotated
# entries whose id is among them, and the session's bulk settings (an empty
# hash when none are stored).
def confirm_context
  chosen_ids = Array(import_session.selected_source_ids).map(&:to_s)
  chosen_entries = annotated_entries(chosen_ids).select do |entry|
    chosen_ids.include?(entry[:id])
  end
  settings = import_session.bulk_settings || {}

  ConfirmContext.new(
    selected_source_ids: chosen_ids,
    selected_entries: chosen_entries,
    bulk_settings: settings
  )
end
|
|
300
|
+
|
|
301
|
+
private
|
|
302
|
+
|
|
303
|
+
attr_reader :import_session, :params, :current_step, :now
|
|
304
|
+
|
|
305
|
+
# Validates the uploaded file and returns a list of error messages (empty
# when the upload is acceptable). A missing file short-circuits with a
# single message; otherwise the file is checked for a positive size and,
# when a content type is given, for an allowed or generic content type.
def validate_upload
  upload = opml_file
  return [ "Upload an OPML file to continue." ] if upload.blank?

  problems = []
  problems << "The uploaded file is empty. Choose another OPML file." unless upload.size.to_i.positive?

  mime = upload.content_type
  if mime.present? && !(content_type_allowed?(mime) || generic_content_type?(mime))
    problems << "Upload must be an OPML or XML file."
  end

  problems
end
|
|
317
|
+
|
|
318
|
+
# The value stored under :opml_file in the request params (the uploaded
# file, when one was submitted).
def opml_file
  params[:opml_file]
end
|
|
321
|
+
|
|
322
|
+
# Describes the uploaded file as a string-keyed hash (filename, byte size,
# content type). Returns an empty hash when the upload does not respond to
# original_filename.
def build_file_metadata
  upload = opml_file
  return {} unless upload.respond_to?(:original_filename)

  {
    "filename" => upload.original_filename,
    "byte_size" => upload.size,
    "content_type" => upload.content_type
  }
end
|
|
331
|
+
|
|
332
|
+
# Reports whether +content_type+ names one of ALLOWED_CONTENT_TYPES.
#
# Media types are case-insensitive and may carry parameters (for example
# "text/xml; charset=utf-8"), so the value is reduced to its bare,
# lower-cased type/subtype before it is compared. nil and blank values are
# never allowed.
def content_type_allowed?(content_type)
  media_type = content_type.to_s.split(";", 2).first.to_s.strip.downcase
  ALLOWED_CONTENT_TYPES.include?(media_type)
end
|
|
335
|
+
|
|
336
|
+
# Reports whether +content_type+ names one of GENERIC_CONTENT_TYPES.
#
# Media types are case-insensitive and may carry parameters, so the value
# is reduced to its bare, lower-cased type/subtype before it is compared.
# nil and blank values are never generic.
def generic_content_type?(content_type)
  media_type = content_type.to_s.split(";", 2).first.to_s.strip.downcase
  GENERIC_CONTENT_TYPES.include?(media_type)
end
|
|
339
|
+
|
|
340
|
+
# Reads +file+, parses it as XML in strict, no-network mode, and returns
# one entry hash (see build_entry) for every <outline> element that has an
# xmlUrl attribute (attribute name matched case-insensitively). The index
# passed to build_entry counts all <outline> elements, including skipped ones.
#
# Raises UploadError when the content is blank, the document has no root,
# or Nokogiri reports a syntax error.
def parse_opml_file(file)
  xml = file.read
  file.rewind if file.respond_to?(:rewind)

  raise UploadError, "The uploaded file appears to be empty." if xml.blank?

  document = Nokogiri::XML(xml) { |config| config.strict.nonet }
  raise UploadError, "The uploaded file is not valid XML or OPML." if document.root.nil?

  entries = []
  document.xpath("//outline").each_with_index do |outline, index|
    has_feed_attribute = outline.attribute_nodes.any? { |attr| attr.name.casecmp("xmlurl").zero? }
    entries << build_entry(outline, index) if has_feed_attribute
  end
  entries
rescue Nokogiri::XML::SyntaxError => error
  raise UploadError, "We couldn't parse that OPML file: #{error.message}"
end
|
|
357
|
+
|
|
358
|
+
# Turns one <outline> element into an entry hash. The title comes from the
# "title" attribute, falling back to "text". Entries with a blank feed URL
# or one that fails valid_feed_url? are returned via malformed_entry;
# all others get status "valid".
def build_entry(outline, index)
  feed_url = outline_attribute(outline, "xmlUrl")
  website_url = outline_attribute(outline, "htmlUrl")
  title = outline_attribute(outline, "title") || outline_attribute(outline, "text")

  problem =
    if feed_url.blank?
      "Missing feed URL"
    elsif !valid_feed_url?(feed_url)
      "Feed URL must be HTTP or HTTPS"
    end
  return malformed_entry(index, feed_url, title, website_url, problem) if problem

  {
    id: "outline-#{index}",
    raw_outline_index: index,
    feed_url: feed_url,
    title: title,
    website_url: website_url,
    status: "valid",
    error: nil,
    health_status: nil,
    health_error: nil
  }
end
|
|
383
|
+
|
|
384
|
+
# Builds the entry hash for an outline that cannot be imported: status is
# "malformed", +error+ holds the reason, and a blank feed URL is stored as nil.
def malformed_entry(index, feed_url, title, website_url, error)
  stored_feed_url = feed_url.presence

  {
    id: "outline-#{index}",
    raw_outline_index: index,
    feed_url: stored_feed_url,
    title: title,
    website_url: website_url,
    status: "malformed",
    error: error,
    health_status: nil,
    health_error: nil
  }
end
|
|
397
|
+
|
|
398
|
+
# Looks up an attribute on +outline+ by name, ignoring case, and returns
# its value as a string. Returns nil when the attribute is missing or its
# value is blank.
def outline_attribute(outline, name)
  match = outline.attribute_nodes.find { |attr| attr.name.casecmp(name).zero? }
  return nil if match.nil?

  match.value.to_s.presence
end
|
|
402
|
+
|
|
403
|
+
# True when +url+ parses as an HTTP(S) URI with a host. URI::HTTPS is a
# subclass of URI::HTTP, so both schemes pass the class check. Unparseable
# URLs count as invalid.
def valid_feed_url?(url)
  uri = URI.parse(url)
  return false unless uri.is_a?(URI::HTTP)

  uri.host.present?
rescue URI::InvalidURIError
  false
end
|
|
409
|
+
|
|
410
|
+
# Returns the session's parsed entries, normalized and annotated with:
#
#   duplicate  - a Source already exists with the same feed URL (compared
#                case-insensitively)
#   selectable - the entry is "valid" and not a duplicate
#   selected   - the entry's id is in +selected_ids+
#
# selected_ids - ids currently selected (nil is treated as none)
#
# Returns [] when the session has no parsed entries.
def annotated_entries(selected_ids)
  entries = Array(import_session.parsed_sources)
  return [] if entries.blank?

  normalized = entries.map { |entry| normalize_entry(entry) }

  # Hash lookups keep the per-entry membership checks constant-time instead
  # of scanning an array once per entry.
  selected_lookup = (selected_ids || []).to_h { |id| [ id, true ] }

  feed_urls = normalized.filter_map { |entry| entry[:feed_url]&.downcase }.uniq
  existing_urls =
    if feed_urls.present?
      SourceMonitor::Source.where("LOWER(feed_url) IN (?)", feed_urls).pluck(:feed_url)
    else
      []
    end
  existing_lookup = existing_urls.to_h { |url| [ url.downcase, true ] }

  normalized.map do |entry|
    feed_url = entry[:feed_url]
    duplicate = feed_url.present? && existing_lookup.key?(feed_url.downcase)

    entry.merge(
      duplicate: duplicate,
      selectable: entry[:status] == "valid" && !duplicate,
      selected: selected_lookup.key?(entry[:id])
    )
  end
end
|
|
432
|
+
|
|
433
|
+
# Delegates to EntryNormalizer.normalize so stored entries are read through
# a single normalization path.
def normalize_entry(entry)
  SourceMonitor::ImportSessions::EntryNormalizer.normalize(entry)
end
|
|
436
|
+
|
|
437
|
+
# Narrows +entries+ according to +filter+: "new" keeps selectable entries,
# "existing" keeps duplicates, and any other value returns +entries+
# unchanged.
def filter_entries(entries, filter)
  flag = { "new" => :selectable, "existing" => :duplicate }[filter]
  return entries unless flag

  entries.select { |entry| entry[flag] }
end
|
|
447
|
+
|
|
448
|
+
# Returns the submitted selected_source_ids (as unique strings, in
# submission order) restricted to ids of +selectable_entries+. Returns []
# when the params carry no selected_source_ids.
def build_selection_from_params(selectable_entries)
  submitted = import_session_params[:selected_source_ids]
  return [] unless submitted

  permitted_ids = selectable_entries.map { |entry| entry[:id] }
  Array(submitted).map(&:to_s).uniq & permitted_ids
end
|
|
454
|
+
|
|
455
|
+
# True when the target step is anything other than "preview".
def advancing_from_preview?
  !(target_step == "preview")
end
|
|
458
|
+
|
|
459
|
+
# Works out the selection for the health-check step:
#   - select-all  -> a copy of the health-check targets
#   - select-none -> []
#   - ids given   -> the submitted ids (unique strings) that are targets
#   - no ids      -> the selection already stored on the session
def health_check_selection_from_params
  return health_check_targets.dup if select_all_requested?
  return [] if select_none_requested?

  submitted = import_session_params[:selected_source_ids]
  if submitted
    Array(submitted).map(&:to_s).uniq & health_check_targets
  else
    Array(import_session.selected_source_ids).map(&:to_s)
  end
end
|
|
471
|
+
|
|
472
|
+
# True when the target step is anything other than "health_check".
def advancing_from_health_check?
  !(target_step == "health_check")
end
|
|
475
|
+
|
|
476
|
+
# Starts health checks for the session's selected entries when the wizard
# is on the "health_check" step.
#
# Under a row lock on the session:
#   - with nothing selected, health checks are switched off and the target
#     list cleared;
#   - when checks are already active for exactly the selected ids, nothing
#     changes;
#   - otherwise the selected entries' health results are reset, the session
#     is marked active with the new targets, and jobs are enqueued after
#     the lock block finishes.
def start_health_checks_if_needed
  return if current_step != "health_check"

  pending_targets = []

  import_session.with_lock do
    import_session.reload
    selected = Array(import_session.selected_source_ids).map(&:to_s)

    if selected.blank?
      import_session.update_columns(health_checks_active: false, health_check_target_ids: [])
    elsif !(import_session.health_checks_active? && import_session.health_check_targets.sort == selected.sort)
      import_session.update!(
        parsed_sources: reset_health_results(import_session.parsed_sources, selected),
        health_checks_active: true,
        health_check_target_ids: selected,
        health_check_started_at: now,
        health_check_completed_at: nil
      )

      pending_targets = selected
    end
  end

  enqueue_health_check_jobs(import_session, pending_targets) unless pending_targets.empty?
end
|
|
507
|
+
|
|
508
|
+
# Returns +entries+ as hashes, with every entry whose id is in +target_ids+
# reset to health_status "pending" and health_error nil. Entries not
# targeted are returned as they are.
#
# Entries may use string or symbol keys. A targeted entry is converted to
# string keys before the reset values are merged in, so a symbol-keyed
# entry does not end up carrying both :health_status and "health_status".
#
# entries    - array of hash-like entries (nil is treated as empty)
# target_ids - array of id strings to reset
def reset_health_results(entries, target_ids)
  Array(entries).map do |entry|
    entry_hash = entry.to_h
    entry_id = entry_hash["id"] || entry_hash[:id]
    next entry_hash unless target_ids.include?(entry_id.to_s)

    entry_hash
      .transform_keys(&:to_s)
      .merge("health_status" => "pending", "health_error" => nil)
  end
end
|
|
517
|
+
|
|
518
|
+
# Enqueues one ImportSessionHealthCheckJob per target id, each delayed by
# one second.
def enqueue_health_check_jobs(import_session, target_ids)
  session_id_source = import_session

  target_ids.each do |entry_id|
    SourceMonitor::ImportSessionHealthCheckJob
      .set(wait: 1.second)
      .perform_later(session_id_source.id, entry_id)
  end
end
|
|
523
|
+
|
|
524
|
+
# Returns the normalized entries that are health-check targets, each
# annotated with +selected+ (whether its id is in +selected_ids+).
def health_check_entries(selected_ids)
  targets = health_check_targets
  normalized = Array(import_session.parsed_sources).map { |raw| normalize_entry(raw) }

  normalized.each_with_object([]) do |entry, result|
    next unless targets.include?(entry[:id])

    result << entry.merge(selected: selected_ids.include?(entry[:id]))
  end
end
|
|
532
|
+
|
|
533
|
+
# Summarizes health-check progress as a hash: completed / total / pending
# counts, whether checks are active on the session, and whether all
# targets are done. +total+ is the number of health-check targets;
# +pending+ never goes below zero.
def health_check_progress(entries)
  total = health_check_targets.size
  completed = entries.count { |entry| health_check_complete?(entry) }
  remaining = total - completed

  {
    completed: completed,
    total: total,
    pending: remaining.negative? ? 0 : remaining,
    active: import_session.health_checks_active?,
    done: total.positive? && completed >= total
  }
end
|
|
545
|
+
|
|
546
|
+
# True when the entry's :health_status is "working" or "failing"
# (compared as a string).
def health_check_complete?(entry)
  status = entry[:health_status].to_s
  status == "working" || status == "failing"
end
|
|
549
|
+
|
|
550
|
+
# The session's health-check targets, falling back to the session's
# selected ids (as strings) when no targets are stored.
def health_check_targets
  stored = import_session.health_check_targets
  return stored unless stored.blank?

  Array(import_session.selected_source_ids).map(&:to_s)
end
|
|
555
|
+
|
|
556
|
+
# Converts a page parameter to a positive integer. Values that convert to
# zero or less, and values whose conversion raises, yield 1.
def normalize_page_param(value)
  page = value.to_i
  page.positive? ? page : 1
rescue StandardError
  1
end
|
|
563
|
+
|
|
564
|
+
# Returns the filter name when +raw+ (as a string) is exactly one of
# "all", "new" or "existing"; returns nil for blank or unknown values.
def permitted_filter(raw)
  requested = raw.to_s
  return if requested.blank?

  %w[all new existing].find { |candidate| candidate == requested }
end
|
|
570
|
+
|
|
571
|
+
# Page size passed to the paginator for the preview listing.
def preview_per_page
  25
end
|
|
574
|
+
|
|
575
|
+
# The step the request is heading to: the permitted next_step param when
# given, otherwise the current step, otherwise ImportSession.default_step.
def target_step
  requested = permitted_step(import_session_params[:next_step])
  requested || current_step || ImportSession.default_step
end
|
|
578
|
+
|
|
579
|
+
# Returns the matching entry of ImportSession::STEP_ORDER when +value+ (as
# a string) equals one of them; returns nil for blank or unknown values.
def permitted_step(value)
  requested = value.to_s
  return if requested.blank?

  ImportSession::STEP_ORDER.find { |candidate| candidate == requested }
end
|
|
585
|
+
|
|
586
|
+
# The sanitized import_session parameters, memoized per wizard instance.
#
# Reads params[:import_session] (symbol or string key, {} when absent).
# When the raw value supports +permit+, only next_step, select_all,
# select_none and the selected_source_ids array are kept; otherwise it is
# converted with to_h as-is. The result is run through ParameterSanitizer
# and returned with indifferent access.
def import_session_params
  return @import_session_params if @import_session_params

  raw = params[:import_session] || params["import_session"] || {}
  permitted =
    if raw.respond_to?(:permit)
      raw.permit(:next_step, :select_all, :select_none, selected_source_ids: [])
    else
      raw.to_h
    end

  @import_session_params =
    SourceMonitor::Security::ParameterSanitizer.sanitize(permitted.to_h).with_indifferent_access
end
|
|
598
|
+
|
|
599
|
+
# True when the select_all param holds one of TRUE_PARAM_VALUES.
def select_all_requested?
  truthy_import_session_param?(:select_all)
end
|
|
602
|
+
|
|
603
|
+
# True when the select_none param holds one of TRUE_PARAM_VALUES.
def select_none_requested?
  truthy_import_session_param?(:select_none)
end
|
|
606
|
+
|
|
607
|
+
# True when the import_session param stored under +key+ is one of
# TRUE_PARAM_VALUES.
def truthy_import_session_param?(key)
  submitted = import_session_params[key]
  TRUE_PARAM_VALUES.include?(submitted)
end
|
|
610
|
+
end
|
|
611
|
+
end
|
|
612
|
+
end
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require "source_monitor/items/item_creator"
|
|
4
|
+
require "source_monitor/items/normalized_entry"
|
|
4
5
|
|
|
5
6
|
module SourceMonitor
|
|
6
7
|
module Items
|
|
@@ -32,17 +33,17 @@ module SourceMonitor
|
|
|
32
33
|
|
|
33
34
|
# Step 1: Pre-parse entries to extract GUIDs and fingerprints for bulk lookup.
|
|
34
35
|
entry_identifiers = @entries.map do |entry|
|
|
35
|
-
|
|
36
|
+
normalized_entry = NormalizedEntry.new(
|
|
36
37
|
source: @source,
|
|
37
38
|
entry: entry,
|
|
38
39
|
content_extractor: content_extractor
|
|
39
40
|
)
|
|
40
|
-
attrs = parser.parse
|
|
41
|
-
raw_guid = attrs[:guid]
|
|
42
|
-
normalized_guid = raw_guid.present? ? raw_guid.downcase : nil
|
|
43
|
-
guid = normalized_guid.presence || attrs[:content_fingerprint]
|
|
44
41
|
|
|
45
|
-
{
|
|
42
|
+
{
|
|
43
|
+
guid: normalized_entry.item_guid,
|
|
44
|
+
fingerprint: normalized_entry.content_fingerprint,
|
|
45
|
+
raw_guid_present: normalized_entry.raw_guid_present?
|
|
46
|
+
}
|
|
46
47
|
end
|
|
47
48
|
|
|
48
49
|
# Step 2: Batch-fetch existing items by GUID (single query)
|
|
@@ -9,6 +9,7 @@ require "source_monitor/instrumentation"
|
|
|
9
9
|
require "source_monitor/scrapers/readability"
|
|
10
10
|
require "source_monitor/items/item_creator/entry_parser"
|
|
11
11
|
require "source_monitor/items/item_creator/content_extractor"
|
|
12
|
+
require "source_monitor/items/normalized_entry"
|
|
12
13
|
|
|
13
14
|
module SourceMonitor
|
|
14
15
|
module Items
|
|
@@ -49,14 +50,10 @@ module SourceMonitor
|
|
|
49
50
|
end
|
|
50
51
|
|
|
51
52
|
def call
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
# Normalize GUID to lowercase so the plain btree index on guid is used
|
|
55
|
-
# for lookups instead of LOWER(guid) which forces sequential scans.
|
|
56
|
-
normalized_guid = raw_guid.present? ? raw_guid.downcase : nil
|
|
57
|
-
attributes[:guid] = normalized_guid.presence || attributes[:content_fingerprint]
|
|
53
|
+
normalized_entry = build_normalized_entry
|
|
54
|
+
attributes = normalized_entry.item_attributes
|
|
58
55
|
|
|
59
|
-
existing_item, matched_by = existing_item_for(attributes, raw_guid_present:
|
|
56
|
+
existing_item, matched_by = existing_item_for(attributes, raw_guid_present: normalized_entry.raw_guid_present?)
|
|
60
57
|
|
|
61
58
|
if existing_item
|
|
62
59
|
apply_attributes(existing_item, attributes)
|
|
@@ -70,7 +67,7 @@ module SourceMonitor
|
|
|
70
67
|
end
|
|
71
68
|
end
|
|
72
69
|
|
|
73
|
-
create_new_item(attributes, raw_guid_present:
|
|
70
|
+
create_new_item(attributes, raw_guid_present: normalized_entry.raw_guid_present?)
|
|
74
71
|
end
|
|
75
72
|
|
|
76
73
|
private
|
|
@@ -193,12 +190,8 @@ module SourceMonitor
|
|
|
193
190
|
(record.changed - IGNORED_CHANGE_ATTRIBUTES).any?
|
|
194
191
|
end
|
|
195
192
|
|
|
196
|
-
def
|
|
197
|
-
|
|
198
|
-
end
|
|
199
|
-
|
|
200
|
-
def entry_parser
|
|
201
|
-
@entry_parser ||= EntryParser.new(source: source, entry: entry, content_extractor: content_extractor)
|
|
193
|
+
def build_normalized_entry
|
|
194
|
+
@normalized_entry ||= NormalizedEntry.new(source: source, entry: entry, content_extractor: content_extractor)
|
|
202
195
|
end
|
|
203
196
|
|
|
204
197
|
def content_extractor
|