html2rss 0.17.0 → 0.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +90 -639
- data/exe/html2rss +1 -1
- data/html2rss.gemspec +5 -2
- data/lib/html2rss/articles/deduplicator.rb +50 -0
- data/lib/html2rss/auto_source/cleanup.rb +44 -5
- data/lib/html2rss/auto_source/scraper/html.rb +123 -43
- data/lib/html2rss/auto_source/scraper/json_state.rb +457 -0
- data/lib/html2rss/auto_source/scraper/microdata.rb +505 -0
- data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
- data/lib/html2rss/auto_source/scraper/schema/item_list.rb +3 -2
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +6 -4
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +69 -8
- data/lib/html2rss/auto_source/scraper/schema.rb +27 -12
- data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +204 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +88 -79
- data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +311 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +135 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api.rb +186 -0
- data/lib/html2rss/auto_source/scraper.rb +160 -8
- data/lib/html2rss/auto_source.rb +123 -47
- data/lib/html2rss/blocked_surface.rb +65 -0
- data/lib/html2rss/category_extractor.rb +82 -0
- data/lib/html2rss/cli.rb +194 -23
- data/lib/html2rss/config/class_methods.rb +178 -0
- data/lib/html2rss/config/dynamic_params.rb +70 -0
- data/lib/html2rss/config/multiple_feeds_config.rb +51 -0
- data/lib/html2rss/config/request_headers.rb +136 -0
- data/lib/html2rss/config/schema.rb +240 -0
- data/lib/html2rss/config/validator.rb +146 -0
- data/lib/html2rss/config.rb +118 -61
- data/lib/html2rss/error.rb +31 -0
- data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
- data/lib/html2rss/feed_pipeline.rb +127 -0
- data/lib/html2rss/hash_util.rb +101 -0
- data/lib/html2rss/html_extractor/date_extractor.rb +20 -0
- data/lib/html2rss/html_extractor/enclosure_extractor.rb +120 -0
- data/lib/html2rss/html_extractor/image_extractor.rb +58 -0
- data/lib/html2rss/html_extractor.rb +141 -0
- data/lib/html2rss/html_navigator.rb +54 -0
- data/lib/html2rss/json_feed_builder/item.rb +94 -0
- data/lib/html2rss/json_feed_builder.rb +59 -0
- data/lib/html2rss/rendering/audio_renderer.rb +36 -0
- data/lib/html2rss/rendering/description_builder.rb +87 -0
- data/lib/html2rss/rendering/image_renderer.rb +41 -0
- data/lib/html2rss/rendering/media_renderer.rb +37 -0
- data/lib/html2rss/rendering/pdf_renderer.rb +34 -0
- data/lib/html2rss/rendering/video_renderer.rb +36 -0
- data/lib/html2rss/rendering.rb +23 -0
- data/lib/html2rss/request_controls.rb +123 -0
- data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
- data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
- data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
- data/lib/html2rss/request_service/budget.rb +39 -0
- data/lib/html2rss/request_service/context.rb +77 -21
- data/lib/html2rss/request_service/faraday_strategy.rb +137 -5
- data/lib/html2rss/request_service/policy.rb +252 -0
- data/lib/html2rss/request_service/puppet_commander.rb +212 -13
- data/lib/html2rss/request_service/response.rb +51 -3
- data/lib/html2rss/request_service/response_guard.rb +62 -0
- data/lib/html2rss/request_service.rb +50 -15
- data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
- data/lib/html2rss/request_session/runtime_input.rb +71 -0
- data/lib/html2rss/request_session/runtime_policy.rb +83 -0
- data/lib/html2rss/request_session.rb +122 -0
- data/lib/html2rss/rss_builder/article.rb +187 -0
- data/lib/html2rss/rss_builder/channel.rb +105 -11
- data/lib/html2rss/rss_builder/enclosure.rb +62 -0
- data/lib/html2rss/rss_builder/stylesheet.rb +8 -4
- data/lib/html2rss/rss_builder.rb +76 -71
- data/lib/html2rss/selectors/config.rb +123 -0
- data/lib/html2rss/selectors/extractors/attribute.rb +52 -0
- data/lib/html2rss/selectors/extractors/href.rb +55 -0
- data/lib/html2rss/selectors/extractors/html.rb +49 -0
- data/lib/html2rss/selectors/extractors/static.rb +42 -0
- data/lib/html2rss/selectors/extractors/text.rb +47 -0
- data/lib/html2rss/selectors/extractors.rb +53 -0
- data/lib/html2rss/selectors/object_to_xml_converter.rb +62 -0
- data/lib/html2rss/selectors/post_processors/base.rb +80 -0
- data/lib/html2rss/selectors/post_processors/gsub.rb +88 -0
- data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +48 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +44 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +53 -0
- data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +55 -0
- data/lib/html2rss/selectors/post_processors/parse_time.rb +78 -0
- data/lib/html2rss/selectors/post_processors/parse_uri.rb +43 -0
- data/lib/html2rss/selectors/post_processors/sanitize_html.rb +154 -0
- data/lib/html2rss/selectors/post_processors/substring.rb +77 -0
- data/lib/html2rss/selectors/post_processors/template.rb +76 -0
- data/lib/html2rss/selectors/post_processors.rb +48 -0
- data/lib/html2rss/selectors.rb +301 -0
- data/lib/html2rss/url.rb +266 -0
- data/lib/html2rss/version.rb +2 -1
- data/lib/html2rss.rb +67 -71
- data/lib/tasks/config_schema.rake +17 -0
- data/schema/html2rss-config.schema.json +551 -0
- metadata +120 -38
- data/lib/html2rss/attribute_post_processors/base.rb +0 -74
- data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
- data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
- data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
- data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
- data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
- data/lib/html2rss/attribute_post_processors/template.rb +0 -101
- data/lib/html2rss/attribute_post_processors.rb +0 -44
- data/lib/html2rss/auto_source/article.rb +0 -127
- data/lib/html2rss/auto_source/channel.rb +0 -78
- data/lib/html2rss/auto_source/reducer.rb +0 -48
- data/lib/html2rss/auto_source/rss_builder.rb +0 -70
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
- data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
- data/lib/html2rss/config/channel.rb +0 -125
- data/lib/html2rss/config/selectors.rb +0 -103
- data/lib/html2rss/item.rb +0 -186
- data/lib/html2rss/item_extractors/attribute.rb +0 -50
- data/lib/html2rss/item_extractors/href.rb +0 -52
- data/lib/html2rss/item_extractors/html.rb +0 -46
- data/lib/html2rss/item_extractors/static.rb +0 -39
- data/lib/html2rss/item_extractors/text.rb +0 -44
- data/lib/html2rss/item_extractors.rb +0 -88
- data/lib/html2rss/object_to_xml_converter.rb +0 -56
- data/lib/html2rss/rss_builder/item.rb +0 -83
- data/lib/html2rss/utils.rb +0 -113
|
@@ -0,0 +1,457 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
|
|
5
|
+
module Html2rss
|
|
6
|
+
class AutoSource
|
|
7
|
+
module Scraper
|
|
8
|
+
##
|
|
9
|
+
# Scrapes JSON state blobs embedded in script tags such as Next.js, Nuxt,
|
|
10
|
+
# or custom window globals. The scraper searches `<script type="application/json">`
|
|
11
|
+
# tags and well-known JavaScript globals for arrays of article-like hashes
|
|
12
|
+
# and normalises them to a structure compatible with HtmlExtractor.
|
|
13
|
+
class JsonState
|
|
14
|
+
include Enumerable
|
|
15
|
+
|
|
16
|
+
# CSS selector matching script tags whose declared type is JSON.
JSON_SCRIPT_SELECTOR = 'script[type="application/json"]'
# Regexes matching an assignment to a well-known global JavaScript state
# variable on `window`, `self`, or `globalThis`. One pattern is generated per
# global name so every entry has exactly the same shape.
GLOBAL_ASSIGNMENT_PATTERNS = %w[
  __NEXT_DATA__
  __NUXT__
  STATE
  __REDUX_STATE__
  __PRELOADED_STATE__
  __APOLLO_STATE__
  __remixContext
  __sveltekit_data
  GATSBY_STATE
  __ember_meta
  angular
].map { |global| /(?:window|self|globalThis)\.#{global}\s*=\s*/m }.freeze

# Keys probed, in order, for a title-like value.
TITLE_KEYS = [:title, :headline, :name, :text].freeze
# Keys probed, in order, for a URL-like value.
URL_KEYS = [:url, :link, :href, :permalink, :slug, :path, :canonicalUrl, :shortUrl].freeze
# Keys probed, in order, for a description-like value.
DESCRIPTION_KEYS = [:description, :summary, :excerpt, :dek, :subheading].freeze
# Keys probed, in order, for an image-like value.
IMAGE_KEYS = [
  :image, :imageUrl, :thumbnailUrl, :thumbnail, :src, :featuredImage, :coverImage, :heroImage
].freeze
# Keys probed, in order, for a publication timestamp.
PUBLISHED_AT_KEYS = [
  :published_at, :publishedAt, :datePublished, :date, :publicationDate, :pubDate,
  :updatedAt, :updated_at, :createdAt, :created_at
].freeze
# Keys probed, in order, for category-like values.
CATEGORY_KEYS = [:categories, :tags, :section, :sections, :topic, :topics, :channel].freeze
# Keys probed, in order, for an identifier-like value.
ID_KEYS = [:id, :guid, :uuid, :slug, :key].freeze
|
|
48
|
+
|
|
49
|
+
# Scans DOM nodes for JSON payloads containing article data.
#
# Two sources are inspected: `<script type="application/json">` tags, whose
# text is parsed directly, and any `<script>` whose text assigns to one of
# the globals listed in GLOBAL_ASSIGNMENT_PATTERNS. Payloads that are not
# strict JSON get one best-effort coercion pass (quoting bare keys, removing
# trailing commas) before being given up on.
module DocumentScanner
  # One complete double-quoted string literal, escaped characters included.
  STRING_LITERAL = /"(?:\\.|[^"\\])*"/m
  # Either a string literal (to be left untouched) or an unquoted object key.
  STRING_OR_UNQUOTED_KEY = /#{STRING_LITERAL}|(\A\s*|[{,\[]\s*)([A-Za-z_]\w*)(\s*:)/
  # Either a string literal (to be left untouched) or a comma directly before a closing bracket.
  STRING_OR_TRAILING_COMMA = /#{STRING_LITERAL}|,(\s*[\]}])/
  private_constant :STRING_LITERAL, :STRING_OR_UNQUOTED_KEY, :STRING_OR_TRAILING_COMMA

  module_function

  # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
  # @return [Array<Hash, Array>] parsed JSON documents discovered in scripts
  def json_documents(parsed_body)
    script_documents(parsed_body) + assignment_documents(parsed_body)
  end

  # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
  # @return [Array<Hash, Array>] JSON documents extracted from JSON script tags
  def script_documents(parsed_body)
    parsed_body.css(JSON_SCRIPT_SELECTOR).filter_map { parse_json(_1.text) }
  end

  # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
  # @return [Array<Hash, Array>] JSON documents extracted from global assignments
  def assignment_documents(parsed_body)
    parsed_body.css('script').filter_map { parse_assignment(_1.text) }
  end

  # @param text [String] script text that may contain a global assignment
  # @return [Hash, Array, nil] parsed assignment payload when available
  def parse_assignment(text)
    payload = assignment_payload(text)
    parse_json(payload) if payload
  end

  # Finds the first known global assignment (in GLOBAL_ASSIGNMENT_PATTERNS
  # order) and returns the payload that FOLLOWS it. Text preceding the
  # assignment is discarded so an unrelated earlier `{...}` or `[...]` in the
  # same script cannot be mistaken for the assigned value.
  #
  # @param text [String] script text to inspect for known assignment patterns
  # @return [String, nil] extracted JSON-like assignment payload
  def assignment_payload(text)
    trimmed = text.to_s.strip
    return if trimmed.empty?

    GLOBAL_ASSIGNMENT_PATTERNS.each do |pattern|
      match = pattern.match(trimmed)
      return extract_assignment_payload(match.post_match) if match
    end

    nil
  end

  # @param text [String] text potentially containing JSON-like payloads
  # @return [String, nil] the balanced JSON block when one exists, otherwise the text itself
  def extract_assignment_payload(text)
    extract_json_block(text) || text
  end

  # @param text [String] text potentially containing JSON blocks
  # @return [String, nil] extracted JSON block spanning balanced brackets
  def extract_json_block(text)
    start_index = text.index(/[\[{]/)
    return unless start_index

    stop_index = scan_for_json_end(text, start_index)
    text[start_index..stop_index] if stop_index
  end

  # rubocop:disable Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
  # Walks the text from `start_index`, tracking bracket nesting and skipping
  # over double-quoted strings (honouring backslash escapes), until the
  # bracket opened at `start_index` is closed again.
  #
  # @param text [String] text containing a JSON object/array opening token
  # @param start_index [Integer] index where JSON-like content starts
  # @return [Integer, nil] index where the balanced JSON payload ends
  def scan_for_json_end(text, start_index)
    closers = []
    in_string = false
    escaped = false

    # Only the tail is scanned; indices are re-based so they refer to `text`.
    (text[start_index..] || '').each_char.with_index(start_index) do |char, index|
      if in_string
        if escaped then escaped = false
        elsif char == '\\' then escaped = true
        elsif char == '"' then in_string = false
        end
        next
      end

      case char
      when '"' then in_string = true
      when '{' then closers << '}'
      when '[' then closers << ']'
      when '}', ']'
        expected = closers.pop
        return index if expected == char && closers.empty?
      end
    end

    nil
  end
  # rubocop:enable Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity

  # Parses strict JSON first and falls back to JavaScript-literal coercion.
  # Blank payloads (e.g. empty script tags) are skipped up front instead of
  # running, and logging, two parse attempts that cannot succeed.
  #
  # @param payload [String, nil] JSON payload to parse
  # @return [Hash, Array, nil] parsed payload or nil when parsing fails
  def parse_json(payload)
    return unless payload
    return if payload.strip.empty?

    JSON.parse(payload, symbolize_names: true)
  rescue JSON::ParserError => error
    parse_js_object(payload, error)
  end

  # @param payload [String] JavaScript object-literal payload
  # @param _original_error [JSON::ParserError] original JSON parse error
  # @return [Hash, Array, nil] parsed payload after JavaScript coercion
  def parse_js_object(payload, _original_error)
    # Some sites emit JavaScript object literals (unquoted keys, trailing commas).
    # Coerce those payloads into valid JSON so we keep the same parsing pipeline.
    JSON.parse(coerce_javascript_object(payload), symbolize_names: true)
  rescue JSON::ParserError => error
    Html2rss::Log.debug("#{name}: failed to parse coerced JavaScript object (#{error.message})")
    nil
  end

  # @param payload [String] JavaScript object-literal payload
  # @return [String] JSON-compatible payload string
  def coerce_javascript_object(payload)
    # KISS approach: mutate common JS literal quirks instead of a full parser.
    strip_trailing_commas(quote_unquoted_keys(payload))
  end

  # Wraps bare object keys in double quotes. Double-quoted string literals are
  # matched first and returned unchanged, so text such as `"Hello, world: x"`
  # is never mistaken for a key.
  #
  # @param jsonish [String] JSON-like string with potentially unquoted keys
  # @return [String] payload with unquoted object keys quoted
  def quote_unquoted_keys(jsonish)
    jsonish.gsub(STRING_OR_UNQUOTED_KEY) do
      match = Regexp.last_match
      next match[0] unless match[2] # a string literal matched: keep it verbatim

      "#{match[1]}\"#{match[2]}\"#{match[3]}"
    end
  end

  # Removes commas that directly precede `]` or `}`. Double-quoted string
  # literals are matched first and returned unchanged, so a `,]` inside a
  # string value is preserved.
  #
  # @param jsonish [String] JSON-like string with potential trailing commas
  # @return [String] payload without trailing commas before closing tokens
  def strip_trailing_commas(jsonish)
    jsonish.gsub(STRING_OR_TRAILING_COMMA) { Regexp.last_match(1) || Regexp.last_match(0) }
  end
end
|
|
198
|
+
private_constant :DocumentScanner
|
|
199
|
+
|
|
200
|
+
# Retrieves values from heterogeneous objects by probing multiple keys.
#
# Hashes are probed key by key; when none of the keys yields a value the
# nested `:attributes` and `:data` containers are searched the same way.
# Arrays are searched entry by entry. Anything else yields nil.
module ValueFinder
  module_function

  # @param object [Hash, Array] candidate container traversed during key lookup
  # @param keys [Array<Symbol>] keys to probe in order
  # @return [Object, nil] first matching value
  def fetch(object, keys)
    case object
    when Hash then fetch_from_hash(object, keys)
    when Array then fetch_from_array(object, keys)
    end
  end

  # A key whose value is nil (JSON `null`) is treated as absent, so later
  # fallback keys and the nested containers are still consulted instead of
  # the lookup stopping at the first key that merely exists.
  #
  # @param hash [Hash] hash candidate traversed during key lookup
  # @param keys [Array<Symbol>] keys to probe in order
  # @return [Object, nil] first non-nil value from hash or nested metadata
  def fetch_from_hash(hash, keys)
    keys.each do |key|
      value = hash[key]
      return value unless value.nil?
    end

    fetch_nested(hash[:attributes], keys) ||
      fetch_nested(hash[:data], keys)
  end

  # @param array [Array] array whose entries may contain target keys
  # @param keys [Array<Symbol>] keys to probe in order
  # @return [Object, nil] first truthy value found in the array entries
  def fetch_from_array(array, keys)
    array.each do |entry|
      result = fetch(entry, keys)
      return result if result
    end

    nil
  end

  # @param value [Hash, Array, nil] nested value to recurse into
  # @param keys [Array<Symbol>] keys to probe in order
  # @return [Object, nil] matching nested value
  def fetch_nested(value, keys)
    fetch(value, keys) if value
  end
end
|
|
245
|
+
private_constant :ValueFinder
|
|
246
|
+
|
|
247
|
+
# Decides whether a parsed JSON structure contains, at any depth, an array
# whose entries look like articles (a Hash exposing both a title-like and a
# URL-like value).
module CandidateDetector
  module_function

  # @param document [Hash, Array, Object] candidate document node
  # @return [Boolean] true when the node is, or contains, an article-like array
  def candidate_array?(document)
    if document.is_a?(Array)
      array_of_articles?(document) || document.any? { |entry| traversable_candidate?(entry) }
    elsif document.is_a?(Hash)
      document.each_value.any? { |value| candidate_array?(value) }
    else
      false
    end
  end

  # Only containers are worth descending into; scalars never hold articles.
  #
  # @param value [Hash, Array, Object] candidate nested value
  # @return [Boolean] true when the value is a container holding an article-like array
  def traversable_candidate?(value)
    return false unless value.is_a?(Array) || value.is_a?(Hash)

    candidate_array?(value)
  end

  # @param array [Array<Object>] candidate list of entries
  # @return [Boolean] true when at least one Hash entry has both a title and a URL value
  def array_of_articles?(array)
    array.any? do |element|
      element.is_a?(Hash) && title_from(element) && url_from(element)
    end
  end

  # @param object [Hash] article candidate object
  # @return [Object, nil] value found under one of TITLE_KEYS
  def title_from(object)
    ValueFinder.fetch(object, TITLE_KEYS)
  end

  # @param object [Hash] article candidate object
  # @return [Object, nil] value found under one of URL_KEYS
  def url_from(object)
    ValueFinder.fetch(object, URL_KEYS)
  end
end
|
|
295
|
+
private_constant :CandidateDetector
|
|
296
|
+
|
|
297
|
+
# Shapes raw entries into the structure required downstream.
#
# An entry is kept only when it resolves to an article URL and carries at
# least a title or a description; all other fields are optional and omitted
# from the result when absent.
module ArticleNormalizer
  # Keys probed to read the display name of a category object.
  CATEGORY_NAME_KEYS = %i[name title label].freeze
  private_constant :CATEGORY_NAME_KEYS

  module_function

  # rubocop:disable Metrics/MethodLength
  # @param entry [Hash] raw article entry candidate
  # @param base_url [String, Html2rss::Url] base URL for relative link resolution
  # @return [Hash{Symbol => Object}, nil] normalized article hash for downstream extraction
  def normalise(entry, base_url:)
    return unless entry.is_a?(Hash)

    title = string(ValueFinder.fetch(entry, TITLE_KEYS))
    description = string(ValueFinder.fetch(entry, DESCRIPTION_KEYS))
    article_url = resolve_link(entry, keys: URL_KEYS, base_url:,
                                      log_key: 'JsonState: invalid URL encountered')
    return unless article_url
    return if title.nil? && description.nil?

    {
      title:,
      description:,
      url: article_url,
      image: resolve_link(entry, keys: IMAGE_KEYS, base_url:,
                                 log_key: 'JsonState: invalid image URL encountered'),
      published_at: string(ValueFinder.fetch(entry, PUBLISHED_AT_KEYS)),
      categories: categories(entry),
      id: identifier(entry, article_url)
    }.compact
  end
  # rubocop:enable Metrics/MethodLength

  # @param value [Object] candidate scalar value
  # @return [String, nil] stripped string form of the value, or nil when blank
  def string(value)
    trimmed = value.to_s.strip
    trimmed unless trimmed.empty?
  end

  # Looks up a link value; when the value is itself a Hash (e.g. an image
  # object) the same keys are probed once more inside it.
  #
  # @param entry [Hash] raw article entry candidate
  # @param keys [Array<Symbol>] preferred link keys
  # @param base_url [String, Html2rss::Url] base URL for relative link resolution
  # @param log_key [String] structured log message key
  # @return [Html2rss::Url, nil] resolved absolute URL
  def resolve_link(entry, keys:, base_url:, log_key:)
    value = ValueFinder.fetch(entry, keys)
    value = ValueFinder.fetch(value, keys) if value.is_a?(Hash)
    link = string(value)
    return unless link

    Url.from_relative(link, base_url)
  rescue ArgumentError
    Log.debug(log_key, url: link)
    nil
  end

  # @param entry [Hash] raw article entry candidate
  # @return [Array<String>, nil] normalized unique categories
  def categories(entry)
    raw = ValueFinder.fetch(entry, CATEGORY_KEYS)
    names = category_candidates(raw).filter_map { |candidate| category_name(candidate) }.uniq
    names unless names.empty?
  end

  # Normalises the raw category value to a list of candidates.
  #
  # A Hash that carries one of CATEGORY_NAME_KEYS is a single category object
  # (e.g. `{id: 12, name: "World", slug: "world"}`) and is kept whole so its
  # id/slug do not leak out as category names; any other Hash is treated as a
  # keyed collection and contributes its values.
  #
  # @param raw [Array, Hash, String, Object] value found under a category key
  # @return [Array<Object>] category candidates
  def category_candidates(raw)
    case raw
    when Array then raw
    when Hash then CATEGORY_NAME_KEYS.any? { raw.key?(_1) } ? [raw] : raw.values
    when String then [raw]
    else []
    end
  end

  # @param candidate [Hash, Object] category object or scalar
  # @return [String, nil] non-blank category name
  def category_name(candidate)
    candidate = ValueFinder.fetch(candidate, CATEGORY_NAME_KEYS) if candidate.is_a?(Hash)
    string(candidate)
  end

  # @param entry [Hash] raw article entry candidate
  # @param article_url [Html2rss::Url] resolved article URL
  # @return [String] stable article identifier, falling back to the resolved URL
  def identifier(entry, article_url)
    value = ValueFinder.fetch(entry, ID_KEYS)
    value = ValueFinder.fetch(value, ID_KEYS) if value.is_a?(Hash)
    string(value) || article_url.to_s
  end
end
|
|
387
|
+
private_constant :ArticleNormalizer
|
|
388
|
+
|
|
389
|
+
# @return [Symbol] key under which this scraper's options are configured
def self.options_key
  :json_state
end
|
|
391
|
+
|
|
392
|
+
class << self
  # @param parsed_body [Nokogiri::HTML::Document, nil] parsed HTML document
  # @return [Boolean] whether any embedded JSON document contains an article-like array
  def articles?(parsed_body)
    json_documents(parsed_body).any? { CandidateDetector.candidate_array?(_1) }
  end

  # A missing document yields an empty list rather than raising, matching the
  # nil-able parameter contract.
  #
  # @param parsed_body [Nokogiri::HTML::Document, nil] parsed HTML document
  # @return [Array<Hash, Array>] parsed JSON documents discovered in the response body
  def json_documents(parsed_body)
    return [] unless parsed_body

    DocumentScanner.json_documents(parsed_body)
  end
end
|
|
406
|
+
|
|
407
|
+
# Stores the document to scan and the URL used to resolve relative links.
#
# @param parsed_body [Nokogiri::HTML::Document, nil] parsed HTML document
# @param url [String, Html2rss::Url] page URL used to resolve relative links
# @param _opts [Hash] scraper-specific options (accepted and ignored)
# @option _opts [Object] :_reserved reserved for future scraper-specific options
def initialize(parsed_body, url:, **_opts)
  @url = url
  @parsed_body = parsed_body
end
|
|
415
|
+
|
|
416
|
+
attr_reader :parsed_body
|
|
417
|
+
|
|
418
|
+
# Yields every article found in the document's embedded JSON payloads.
# Entries that could not be normalised (nil) are skipped, and a missing
# document (nil `parsed_body`) yields nothing instead of raising.
#
# @yield [Hash{Symbol => Object}] normalized article hash
# @return [Enumerator, void] article enumerator when no block is given
def each
  return enum_for(:each) unless block_given?
  return unless parsed_body

  DocumentScanner.json_documents(parsed_body).each do |document|
    discover_articles(document) { |article| yield article if article }
  end
end
|
|
429
|
+
|
|
430
|
+
private
|
|
431
|
+
|
|
432
|
+
attr_reader :url
|
|
433
|
+
|
|
434
|
+
# Recursively walks a parsed JSON node: arrays are handed to `handle_array`,
# hashes have each container value visited, and scalars are ignored.
#
# @param document [Hash, Array, Object] JSON node to inspect
# @yield [Hash{Symbol => Object}, nil] normalized article (nil when an entry was rejected)
def discover_articles(document, &block)
  if document.is_a?(Array)
    handle_array(document, &block)
  elsif document.is_a?(Hash)
    document.each_value do |value|
      discover_articles(value, &block) if traversable?(value)
    end
  end
end
|
|
440
|
+
|
|
441
|
+
# Processes one array node. When the array looks like a list of articles,
# every entry is normalised and yielded (nil for rejected entries);
# otherwise each container entry is searched recursively.
#
# @param array [Array] JSON array node
# @yield [Hash{Symbol => Object}, nil] normalized article or nil
def handle_array(array, &block)
  unless CandidateDetector.array_of_articles?(array)
    return array.each { |item| discover_articles(item, &block) if traversable?(item) }
  end

  array.each do |candidate|
    yield(ArticleNormalizer.normalise(candidate, base_url: url))
  end
end
|
|
450
|
+
|
|
451
|
+
# @param value [Object] JSON node
# @return [Boolean] true when the node is a container (Array or Hash) worth descending into
def traversable?(value)
  [Array, Hash].any? { |container| value.is_a?(container) }
end
|
|
454
|
+
end
|
|
455
|
+
end
|
|
456
|
+
end
|
|
457
|
+
end
|