html2rss 0.18.0 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. checksums.yaml +4 -4
  2. data/README.md +60 -1
  3. data/lib/html2rss/articles/deduplicator.rb +1 -0
  4. data/lib/html2rss/auto_source/cleanup.rb +11 -0
  5. data/lib/html2rss/auto_source/scraper/html.rb +5 -0
  6. data/lib/html2rss/auto_source/scraper/json_state.rb +96 -16
  7. data/lib/html2rss/auto_source/scraper/microdata.rb +107 -1
  8. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +1 -1
  9. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +1 -0
  10. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -1
  11. data/lib/html2rss/auto_source/scraper/schema/thing.rb +21 -0
  12. data/lib/html2rss/auto_source/scraper/schema.rb +15 -4
  13. data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +5 -0
  14. data/lib/html2rss/auto_source/scraper/semantic_html.rb +4 -0
  15. data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +60 -10
  16. data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +3 -2
  17. data/lib/html2rss/auto_source/scraper/wordpress_api.rb +19 -12
  18. data/lib/html2rss/auto_source/scraper.rb +19 -1
  19. data/lib/html2rss/auto_source.rb +4 -0
  20. data/lib/html2rss/blocked_surface.rb +1 -0
  21. data/lib/html2rss/category_extractor.rb +2 -2
  22. data/lib/html2rss/cli.rb +30 -6
  23. data/lib/html2rss/config/class_methods.rb +24 -35
  24. data/lib/html2rss/config/dynamic_params.rb +6 -4
  25. data/lib/html2rss/config/multiple_feeds_config.rb +3 -2
  26. data/lib/html2rss/config/request_headers.rb +9 -3
  27. data/lib/html2rss/config/schema.rb +33 -1
  28. data/lib/html2rss/config/validator.rb +40 -2
  29. data/lib/html2rss/config.rb +19 -13
  30. data/lib/html2rss/error.rb +25 -0
  31. data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
  32. data/lib/html2rss/feed_pipeline.rb +127 -0
  33. data/lib/html2rss/hash_util.rb +101 -0
  34. data/lib/html2rss/html_extractor/date_extractor.rb +1 -0
  35. data/lib/html2rss/html_extractor/enclosure_extractor.rb +19 -0
  36. data/lib/html2rss/html_extractor/image_extractor.rb +9 -0
  37. data/lib/html2rss/html_extractor.rb +5 -0
  38. data/lib/html2rss/html_navigator.rb +8 -0
  39. data/lib/html2rss/json_feed_builder.rb +1 -0
  40. data/lib/html2rss/rendering/audio_renderer.rb +8 -3
  41. data/lib/html2rss/rendering/description_builder.rb +0 -1
  42. data/lib/html2rss/rendering/image_renderer.rb +17 -7
  43. data/lib/html2rss/rendering/media_renderer.rb +4 -0
  44. data/lib/html2rss/rendering/pdf_renderer.rb +11 -5
  45. data/lib/html2rss/rendering/video_renderer.rb +8 -3
  46. data/lib/html2rss/rendering.rb +11 -2
  47. data/lib/html2rss/request_controls.rb +16 -21
  48. data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
  49. data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
  50. data/lib/html2rss/request_service/context.rb +14 -2
  51. data/lib/html2rss/request_service/faraday_strategy.rb +6 -4
  52. data/lib/html2rss/request_service/policy.rb +4 -0
  53. data/lib/html2rss/request_service/response.rb +9 -1
  54. data/lib/html2rss/request_service.rb +19 -0
  55. data/lib/html2rss/request_session/runtime_input.rb +16 -2
  56. data/lib/html2rss/request_session/runtime_policy.rb +7 -0
  57. data/lib/html2rss/request_session.rb +13 -9
  58. data/lib/html2rss/rss_builder/article.rb +22 -1
  59. data/lib/html2rss/rss_builder/channel.rb +11 -2
  60. data/lib/html2rss/rss_builder/enclosure.rb +15 -1
  61. data/lib/html2rss/rss_builder/stylesheet.rb +4 -0
  62. data/lib/html2rss/rss_builder.rb +4 -0
  63. data/lib/html2rss/selectors/config.rb +1 -0
  64. data/lib/html2rss/selectors/extractors/attribute.rb +2 -0
  65. data/lib/html2rss/selectors/extractors/href.rb +2 -0
  66. data/lib/html2rss/selectors/extractors/html.rb +1 -0
  67. data/lib/html2rss/selectors/extractors/static.rb +2 -1
  68. data/lib/html2rss/selectors/extractors/text.rb +1 -0
  69. data/lib/html2rss/selectors/extractors.rb +2 -1
  70. data/lib/html2rss/selectors/object_to_xml_converter.rb +1 -0
  71. data/lib/html2rss/selectors/post_processors/base.rb +13 -7
  72. data/lib/html2rss/selectors/post_processors/gsub.rb +3 -0
  73. data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +3 -0
  74. data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +9 -0
  75. data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +6 -0
  76. data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +3 -0
  77. data/lib/html2rss/selectors/post_processors/parse_time.rb +5 -0
  78. data/lib/html2rss/selectors/post_processors/parse_uri.rb +3 -0
  79. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +5 -1
  80. data/lib/html2rss/selectors/post_processors/substring.rb +3 -0
  81. data/lib/html2rss/selectors/post_processors/template.rb +3 -0
  82. data/lib/html2rss/selectors/post_processors.rb +5 -0
  83. data/lib/html2rss/selectors.rb +7 -0
  84. data/lib/html2rss/url.rb +27 -23
  85. data/lib/html2rss/version.rb +2 -1
  86. data/lib/html2rss.rb +15 -78
  87. data/schema/html2rss-config.schema.json +83 -1
  88. metadata +7 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3dfb5fe2f11c7ef1948abb13942f89489585bce5f3eb2509807e8b6c8bfbfafb
4
- data.tar.gz: b2bce87ae6e3450faf38fcad8c839e8e9e762737ec51d6d5b6ef4bfc099aa456
3
+ metadata.gz: 695f871fe38ceaa657e684c6016d39d74afee79f32608eb28faded39ae25597b
4
+ data.tar.gz: 8c9f4060d63d56688cb324d821a35a0b80aca1a2ec7c53faae9f980b29212182
5
5
  SHA512:
6
- metadata.gz: c252b80acd7ede0cca3b0c15941a06b8dce703d0bf1835ebabe1be707558a29d72cb175915f39978ae1ee13d9551e7b692589d7d43d956db8fc5d52f52c20453
7
- data.tar.gz: e68d7254ffe2c5ad634b7e67d3023756be8fe954c2c96b1429fea91e76b6da11a332d616731209ea60398565767679aa1fc4ecd38e3159886fe77280074e6ec5
6
+ metadata.gz: 3d72d0f4b1f581c694f5ec4580425a5ee14da73f3cd7a793da5518fa46784ed0eae98f1e98bf3a840d2405f9b7fb94843c28ffe2cd6cba9279ac0e1e083e40cf
7
+ data.tar.gz: 71ca6aad736d10dd3486fd00b136851a8bcabbb840b9e206c56c80c5edbe1840f4556702e68344ef45726939632f780c3b74f31f579fb068a50b497c29f044fc
data/README.md CHANGED
@@ -36,7 +36,7 @@ Please see the [contributing guide](https://html2rss.github.io/get-involved/cont
36
36
  ### Core Components
37
37
 
38
38
  1. **Config** - Loads and validates configuration (YAML/hash)
39
- 2. **RequestService** - Fetches pages using Faraday or Browserless
39
+ 2. **RequestService** - Fetches pages using Faraday, Botasaurus, or Browserless
40
40
  3. **Selectors** - Extracts content via CSS selectors with extractors/post-processors
41
41
  4. **AutoSource** - Auto-detects content using Schema.org, JSON state blobs, semantic HTML, and structural patterns
42
42
  5. **RssBuilder** - Assembles Article objects and renders RSS 2.0
@@ -47,6 +47,65 @@ Please see the [contributing guide](https://html2rss.github.io/get-involved/cont
47
47
  Config -> Request -> Extraction -> Processing -> Building -> Output
48
48
  ```
49
49
 
50
+ ### Request Strategies
51
+
52
+ - `auto` (default): pipeline fallback orchestration (`faraday` -> `botasaurus` -> `browserless`) based on extraction outcome and retry policy.
53
+ - `faraday`: direct HTTP fetch.
54
+ - `botasaurus`: delegates fetching to a Botasaurus scrape API. Requires `BOTASAURUS_SCRAPER_URL` (for example `http://localhost:4010`).
55
+ - `browserless`: remote browser rendering via Browserless (`BROWSERLESS_IO_WEBSOCKET_URL` and token as needed).
56
+
57
+ Auto fallback shares one request budget across all strategy attempts. For pagination-heavy or dynamic pages, increase `request.max_requests` (or `--max-requests`) when retries exhaust the budget.
58
+
59
+ Auto fallback decisions are hidden at the default `LOG_LEVEL=warn`; run with `LOG_LEVEL=info` to include them in CLI output.
60
+
61
+ Supported `request.botasaurus` options:
62
+
63
+ - `navigation_mode` (`auto`, `get`, `google_get`, `google_get_bypass`; default `auto`)
64
+ - `max_retries` (`0..3`; default `2`)
65
+ - `wait_for_selector` (string)
66
+ - `wait_timeout_seconds` (integer)
67
+ - `block_images` (boolean)
68
+ - `block_images_and_css` (boolean)
69
+ - `wait_for_complete_page_load` (boolean)
70
+ - `headless` (boolean, default `false`)
71
+ - `proxy` (string)
72
+ - `user_agent` (string)
73
+ - `window_size` (two-item integer array, for example `[1920, 1080]`)
74
+ - `lang` (string, for example `en-US`)
75
+
76
+ Minimal YAML config example:
77
+
78
+ ```yaml
79
+ channel:
80
+ url: https://example.com
81
+ strategy: botasaurus
82
+ auto_source: {}
83
+ request:
84
+ botasaurus:
85
+ navigation_mode: auto
86
+ max_retries: 2
87
+ headless: false
88
+ ```
89
+
90
+ Example request payload shape:
91
+
92
+ ```json
93
+ {
94
+ "url": "https://example.com",
95
+ "navigation_mode": "auto",
96
+ "max_retries": 2,
97
+ "headless": false
98
+ }
99
+ ```
100
+
101
+ Example usage:
102
+
103
+ ```bash
104
+ BOTASAURUS_SCRAPER_URL=http://localhost:4010 html2rss auto https://example.com --strategy botasaurus
105
+ ```
106
+
107
+ Policy note: html2rss still enforces local request policy preflight and timeout budget. Botasaurus handles browser navigation/rendering internals, so some policy details are delegated to upstream execution.
108
+
50
109
  ### Config schema workflow
51
110
 
52
111
  The config schema is generated from the runtime `dry-validation` contracts and exported for client-side tooling.
@@ -3,6 +3,7 @@
3
3
  require 'set' # rubocop:disable Lint/RedundantRequireStatement
4
4
 
5
5
  module Html2rss
6
+ # Shared helpers that operate on `RssBuilder::Article` collections.
6
7
  module Articles
7
8
  ##
8
9
  # Deduplicates a list of articles while preserving their original order.
@@ -7,14 +7,21 @@ module Html2rss
7
7
  # :reek:MissingSafeMethod { enabled: false }
8
8
  # It applies various strategies to filter and refine the article list.
9
9
  class Cleanup
10
+ # Default cleanup behavior for auto-sourced article lists.
10
11
  DEFAULT_CONFIG = {
11
12
  keep_different_domain: false,
12
13
  min_words_title: 3
13
14
  }.freeze
14
15
 
16
+ # Allowed URL schemes for article filtering.
15
17
  VALID_SCHEMES = %w[http https].to_set.freeze
16
18
 
17
19
  class << self
20
+ # @param articles [Array<Article>] extracted article candidates
21
+ # @param url [Html2rss::Url] feed source URL used for same-host filtering
22
+ # @param keep_different_domain [Boolean] whether to keep off-domain entries
23
+ # @param min_words_title [Integer] minimum word count for title filtering
24
+ # @return [Array<Article>] cleaned article list
18
25
  def call(articles, url:, keep_different_domain:, min_words_title:)
19
26
  Log.debug "Cleanup: start with #{articles.size} articles"
20
27
 
@@ -35,6 +42,7 @@ module Html2rss
35
42
  #
36
43
  # @param articles [Array<Article>] The list of articles to process.
37
44
  # @param key [Symbol] The key to deduplicate by.
45
+ # @return [Array<Article>] the mutated articles array
38
46
  def deduplicate_by!(articles, key)
39
47
  seen = {}
40
48
  articles.reject! do |article|
@@ -47,6 +55,7 @@ module Html2rss
47
55
  # Keeps only articles with HTTP or HTTPS URLs.
48
56
  #
49
57
  # @param articles [Array<Article>] The list of articles to process.
58
+ # @return [Array<Article>] the mutated articles array
50
59
  def keep_only_http_urls!(articles)
51
60
  articles.select! { |article| VALID_SCHEMES.include?(article.url&.scheme) }
52
61
  end
@@ -56,6 +65,7 @@ module Html2rss
56
65
  #
57
66
  # @param articles [Array<Article>] The list of articles to process.
58
67
  # @param base_url [Html2rss::Url] The source URL to compare against.
68
+ # @return [Array<Article>] the mutated articles array
59
69
  def reject_different_domain!(articles, base_url)
60
70
  base_host = base_url.host
61
71
  articles.select! { |article| article.url&.host == base_host }
@@ -66,6 +76,7 @@ module Html2rss
66
76
  #
67
77
  # @param articles [Array<Article>] The list of articles to process.
68
78
  # @param min_words_title [Integer] The minimum number of words in the title.
79
+ # @return [Array<Article>] the mutated articles array
69
80
  def keep_only_with_min_words_title!(articles, min_words_title:)
70
81
  articles.select! do |article|
71
82
  article.title ? word_count_at_least?(article.title, min_words_title) : true
@@ -19,9 +19,12 @@ module Html2rss
19
19
  class Html
20
20
  include Enumerable
21
21
 
22
+ # Elements ignored when traversing potential article containers.
22
23
  TAGS_TO_IGNORE = /(nav|footer|header|svg|script|style)/i
23
24
 
25
+ # Minimum selector frequency required to treat a path as a stable list signal.
24
26
  DEFAULT_MINIMUM_SELECTOR_FREQUENCY = 2
27
+ # Number of most frequent selectors kept for container extraction.
25
28
  DEFAULT_USE_TOP_SELECTORS = 5
26
29
 
27
30
  ##
@@ -53,6 +56,8 @@ module Html2rss
53
56
  # @param url [String] The base URL.
54
57
  # @param extractor [Class] The extractor class to handle article extraction.
55
58
  # @param opts [Hash] Additional options.
59
+ # @option opts [Integer] :minimum_selector_frequency minimum count before a selector is considered stable
60
+ # @option opts [Integer] :use_top_selectors number of top selectors to keep
56
61
  def initialize(parsed_body, url:, extractor: HtmlExtractor, **opts)
57
62
  @parsed_body = parsed_body
58
63
  @url = url
@@ -5,7 +5,7 @@ require 'json'
5
5
  module Html2rss
6
6
  class AutoSource
7
7
  module Scraper
8
- #
8
+ ##
9
9
  # Scrapes JSON state blobs embedded in script tags such as Next.js, Nuxt,
10
10
  # or custom window globals. The scraper searches `<script type="application/json">`
11
11
  # tags and well-known JavaScript globals for arrays of article-like hashes
@@ -13,7 +13,9 @@ module Html2rss
13
13
  class JsonState
14
14
  include Enumerable
15
15
 
16
+ # Selector for JSON-only script tags.
16
17
  JSON_SCRIPT_SELECTOR = 'script[type="application/json"]'
18
+ # Regex patterns for known global JavaScript state assignments.
17
19
  GLOBAL_ASSIGNMENT_PATTERNS = [
18
20
  /(?:window|self|globalThis)\.__NEXT_DATA__\s*=\s*/m,
19
21
  /(?:window|self|globalThis)\.__NUXT__\s*=\s*/m,
@@ -28,36 +30,53 @@ module Html2rss
28
30
  /(?:window|self|globalThis)\.angular\s*=\s*/m
29
31
  ].freeze
30
32
 
31
- TITLE_KEYS = %w[title headline name text].freeze
32
- URL_KEYS = %w[url link href permalink slug path canonicalUrl shortUrl].freeze
33
- DESCRIPTION_KEYS = %w[description summary excerpt dek subheading].freeze
34
- IMAGE_KEYS = %w[image imageUrl thumbnailUrl thumbnail src featuredImage coverImage heroImage].freeze
35
- PUBLISHED_AT_KEYS = %w[published_at publishedAt datePublished date publicationDate pubDate updatedAt updated_at
33
+ # Preferred keys when extracting title-like values from state payloads.
34
+ TITLE_KEYS = %i[title headline name text].freeze
35
+ # Preferred keys when extracting URL-like values from state payloads.
36
+ URL_KEYS = %i[url link href permalink slug path canonicalUrl shortUrl].freeze
37
+ # Preferred keys when extracting description-like values from state payloads.
38
+ DESCRIPTION_KEYS = %i[description summary excerpt dek subheading].freeze
39
+ # Preferred keys when extracting image-like values from state payloads.
40
+ IMAGE_KEYS = %i[image imageUrl thumbnailUrl thumbnail src featuredImage coverImage heroImage].freeze
41
+ # Preferred keys when extracting publication timestamps from state payloads.
42
+ PUBLISHED_AT_KEYS = %i[published_at publishedAt datePublished date publicationDate pubDate updatedAt updated_at
36
43
  createdAt created_at].freeze
37
- CATEGORY_KEYS = %w[categories tags section sections topic topics channel].freeze
38
- ID_KEYS = %w[id guid uuid slug key].freeze
44
+ # Preferred keys when extracting category-like values from state payloads.
45
+ CATEGORY_KEYS = %i[categories tags section sections topic topics channel].freeze
46
+ # Preferred keys when extracting identifier-like values from state payloads.
47
+ ID_KEYS = %i[id guid uuid slug key].freeze
39
48
 
40
49
  # Scans DOM nodes for JSON payloads containing article data.
41
50
  module DocumentScanner
42
51
  module_function
43
52
 
53
+ # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
54
+ # @return [Array<Hash, Array>] parsed JSON documents discovered in scripts
44
55
  def json_documents(parsed_body)
45
56
  script_documents(parsed_body) + assignment_documents(parsed_body)
46
57
  end
47
58
 
59
+ # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
60
+ # @return [Array<Hash, Array>] JSON documents extracted from JSON script tags
48
61
  def script_documents(parsed_body)
49
62
  parsed_body.css(JSON_SCRIPT_SELECTOR).filter_map { parse_json(_1.text) }
50
63
  end
51
64
 
65
+ # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
66
+ # @return [Array<Hash, Array>] JSON documents extracted from global assignments
52
67
  def assignment_documents(parsed_body)
53
68
  parsed_body.css('script').filter_map { parse_assignment(_1.text) }
54
69
  end
55
70
 
71
+ # @param text [String] script text that may contain a global assignment
72
+ # @return [Hash, Array, nil] parsed assignment payload when available
56
73
  def parse_assignment(text)
57
74
  payload = assignment_payload(text)
58
75
  parse_json(payload) if payload
59
76
  end
60
77
 
78
+ # @param text [String] script text to inspect for known assignment patterns
79
+ # @return [String, nil] extracted JSON-like assignment payload
61
80
  def assignment_payload(text)
62
81
  trimmed = text.to_s.strip
63
82
  return if trimmed.empty?
@@ -72,10 +91,14 @@ module Html2rss
72
91
  nil
73
92
  end
74
93
 
94
+ # @param text [String] text potentially containing JSON-like payloads
95
+ # @return [String, nil] normalized assignment payload
75
96
  def extract_assignment_payload(text)
76
97
  extract_json_block(text) || text
77
98
  end
78
99
 
100
+ # @param text [String] text potentially containing JSON blocks
101
+ # @return [String, nil] extracted JSON block spanning balanced brackets
79
102
  def extract_json_block(text)
80
103
  start_index = text.index(/[\[{]/)
81
104
  return unless start_index
@@ -85,6 +108,9 @@ module Html2rss
85
108
  end
86
109
 
87
110
  # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
111
+ # @param text [String] text starting with a JSON object/array opening token
112
+ # @param start_index [Integer] index where JSON-like content starts
113
+ # @return [Integer, nil] index where the balanced JSON payload ends
88
114
  def scan_for_json_end(text, start_index)
89
115
  stack = []
90
116
  in_string = false
@@ -121,6 +147,8 @@ module Html2rss
121
147
  end
122
148
  # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
123
149
 
150
+ # @param payload [String, nil] JSON payload to parse
151
+ # @return [Hash, Array, nil] parsed payload or nil when parsing fails
124
152
  def parse_json(payload)
125
153
  return unless payload
126
154
 
@@ -129,6 +157,9 @@ module Html2rss
129
157
  parse_js_object(payload, error)
130
158
  end
131
159
 
160
+ # @param payload [String] JavaScript object-literal payload
161
+ # @param _original_error [JSON::ParserError] original JSON parse error
162
+ # @return [Hash, Array, nil] parsed payload after JavaScript coercion
132
163
  def parse_js_object(payload, _original_error)
133
164
  coerced = coerce_javascript_object(payload)
134
165
  return unless coerced
@@ -141,6 +172,8 @@ module Html2rss
141
172
  nil
142
173
  end
143
174
 
175
+ # @param payload [String] JavaScript object-literal payload
176
+ # @return [String] JSON-compatible payload string
144
177
  def coerce_javascript_object(payload)
145
178
  string = payload.dup
146
179
 
@@ -148,12 +181,16 @@ module Html2rss
148
181
  strip_trailing_commas(quote_unquoted_keys(string))
149
182
  end
150
183
 
184
+ # @param jsonish [String] JSON-like string with potentially unquoted keys
185
+ # @return [String] payload with unquoted object keys quoted
151
186
  def quote_unquoted_keys(jsonish)
152
187
  jsonish.gsub(/(\A\s*|[{,\[]\s*)([A-Za-z_]\w*)(\s*:)/) do
153
188
  "#{Regexp.last_match(1)}\"#{Regexp.last_match(2)}\"#{Regexp.last_match(3)}"
154
189
  end
155
190
  end
156
191
 
192
+ # @param jsonish [String] JSON-like string with potential trailing commas
193
+ # @return [String] payload without trailing commas before closing tokens
157
194
  def strip_trailing_commas(jsonish)
158
195
  jsonish.gsub(/,(\s*[\]}])/, '\1')
159
196
  end
@@ -164,6 +201,9 @@ module Html2rss
164
201
  module ValueFinder
165
202
  module_function
166
203
 
204
+ # @param object [Hash, Array] candidate container traversed during key lookup
205
+ # @param keys [Array<Symbol>] keys to probe in order
206
+ # @return [Object, nil] first matching value
167
207
  def fetch(object, keys)
168
208
  case object
169
209
  when Hash then fetch_from_hash(object, keys)
@@ -171,19 +211,21 @@ module Html2rss
171
211
  end
172
212
  end
173
213
 
214
+ # @param hash [Hash] hash candidate traversed during key lookup
215
+ # @param keys [Array<Symbol>] keys to probe in order
216
+ # @return [Object, nil] first matching value from hash or nested metadata
174
217
  def fetch_from_hash(hash, keys)
175
218
  keys.each do |key|
176
- string_key = key.to_s
177
- return hash[string_key] if hash.key?(string_key)
178
-
179
- symbol_key = string_key.to_sym
180
- return hash[symbol_key] if hash.key?(symbol_key)
219
+ return hash[key] if hash.key?(key)
181
220
  end
182
221
 
183
- fetch_nested(hash[:attributes] || hash['attributes'], keys) ||
184
- fetch_nested(hash[:data] || hash['data'], keys)
222
+ fetch_nested(hash[:attributes], keys) ||
223
+ fetch_nested(hash[:data], keys)
185
224
  end
186
225
 
226
+ # @param array [Array] array whose entries may contain target keys
227
+ # @param keys [Array<Symbol>] keys to probe in order
228
+ # @return [Object, nil] first matching value from array entries
187
229
  def fetch_from_array(array, keys)
188
230
  array.each do |entry|
189
231
  result = fetch(entry, keys)
@@ -193,6 +235,9 @@ module Html2rss
193
235
  nil
194
236
  end
195
237
 
238
+ # @param value [Hash, Array, nil] nested value to recurse into
239
+ # @param keys [Array<Symbol>] keys to probe in order
240
+ # @return [Object, nil] matching nested value
196
241
  def fetch_nested(value, keys)
197
242
  fetch(value, keys) if value
198
243
  end
@@ -203,6 +248,8 @@ module Html2rss
203
248
  module CandidateDetector
204
249
  module_function
205
250
 
251
+ # @param document [Hash, Array, Object] candidate document node
252
+ # @return [Boolean] whether the node contains article-like arrays
206
253
  def candidate_array?(document)
207
254
  case document
208
255
  when Array
@@ -214,6 +261,8 @@ module Html2rss
214
261
  end
215
262
  end
216
263
 
264
+ # @param value [Hash, Array, Object] candidate nested value
265
+ # @return [Boolean] whether nested value should be traversed for article candidates
217
266
  def traversable_candidate?(value)
218
267
  case value
219
268
  when Array, Hash then candidate_array?(value)
@@ -221,6 +270,8 @@ module Html2rss
221
270
  end
222
271
  end
223
272
 
273
+ # @param array [Array<Object>] candidate list of entries
274
+ # @return [Boolean] whether array includes hash entries with title and URL fields
224
275
  def array_of_articles?(array)
225
276
  array.any? do |element|
226
277
  next unless element.is_a?(Hash)
@@ -229,10 +280,14 @@ module Html2rss
229
280
  end
230
281
  end
231
282
 
283
+ # @param object [Hash] article candidate object
284
+ # @return [Object, nil] detected title-like value
232
285
  def title_from(object)
233
286
  ValueFinder.fetch(object, TITLE_KEYS)
234
287
  end
235
288
 
289
+ # @param object [Hash] article candidate object
290
+ # @return [Object, nil] detected URL-like value
236
291
  def url_from(object)
237
292
  ValueFinder.fetch(object, URL_KEYS)
238
293
  end
@@ -244,6 +299,9 @@ module Html2rss
244
299
  module_function
245
300
 
246
301
  # rubocop:disable Metrics/MethodLength
302
+ # @param entry [Hash] raw article entry candidate
303
+ # @param base_url [String, Html2rss::Url] base URL for relative link resolution
304
+ # @return [Hash{Symbol => Object}, nil] normalized article hash for downstream extraction
247
305
  def normalise(entry, base_url:)
248
306
  return unless entry.is_a?(Hash)
249
307
 
@@ -267,11 +325,18 @@ module Html2rss
267
325
  end
268
326
  # rubocop:enable Metrics/MethodLength
269
327
 
328
+ # @param value [Object] candidate scalar value
329
+ # @return [String, nil] normalized non-empty string value
270
330
  def string(value)
271
331
  trimmed = value.to_s.strip
272
332
  trimmed unless trimmed.empty?
273
333
  end
274
334
 
335
+ # @param entry [Hash] raw article entry candidate
336
+ # @param keys [Array<Symbol>] preferred link keys
337
+ # @param base_url [String, Html2rss::Url] base URL for relative link resolution
338
+ # @param log_key [String] structured log message key
339
+ # @return [Html2rss::Url, nil] resolved absolute URL
275
340
  def resolve_link(entry, keys:, base_url:, log_key:)
276
341
  value = ValueFinder.fetch(entry, keys)
277
342
  value = ValueFinder.fetch(value, keys) if value.is_a?(Hash)
@@ -285,6 +350,8 @@ module Html2rss
285
350
  end
286
351
 
287
352
  # rubocop:disable Metrics/MethodLength
353
+ # @param entry [Hash] raw article entry candidate
354
+ # @return [Array<String>, nil] normalized unique categories
288
355
  def categories(entry)
289
356
  raw = ValueFinder.fetch(entry, CATEGORY_KEYS)
290
357
  names = case raw
@@ -297,7 +364,7 @@ module Html2rss
297
364
  result = names.flat_map do |value|
298
365
  case value
299
366
  when Hash
300
- string(ValueFinder.fetch(value, %w[name title label]))
367
+ string(ValueFinder.fetch(value, %i[name title label]))
301
368
  else
302
369
  string(value)
303
370
  end
@@ -308,6 +375,9 @@ module Html2rss
308
375
  end
309
376
  # rubocop:enable Metrics/MethodLength
310
377
 
378
+ # @param entry [Hash] raw article entry candidate
379
+ # @param article_url [Html2rss::Url] resolved article URL
380
+ # @return [String] stable article identifier, falling back to the resolved URL
311
381
  def identifier(entry, article_url)
312
382
  value = ValueFinder.fetch(entry, ID_KEYS)
313
383
  value = ValueFinder.fetch(value, ID_KEYS) if value.is_a?(Hash)
@@ -316,20 +386,28 @@ module Html2rss
316
386
  end
317
387
  private_constant :ArticleNormalizer
318
388
 
389
+ # @return [Symbol] scraper config key
319
390
  def self.options_key = :json_state
320
391
 
321
392
  class << self
393
+ # @param parsed_body [Nokogiri::HTML::Document, nil] parsed HTML document
322
394
  def articles?(parsed_body)
323
395
  return false unless parsed_body
324
396
 
325
397
  DocumentScanner.json_documents(parsed_body).any? { CandidateDetector.candidate_array?(_1) }
326
398
  end
327
399
 
400
+ # @param parsed_body [Nokogiri::HTML::Document, nil] parsed HTML document
401
+ # @return [Array<Hash, Array>] parsed JSON documents discovered in the response body
328
402
  def json_documents(parsed_body)
329
403
  DocumentScanner.json_documents(parsed_body)
330
404
  end
331
405
  end
332
406
 
407
+ # @param parsed_body [Nokogiri::HTML::Document, nil] parsed HTML document
408
+ # @param url [String, Html2rss::Url] page URL used to resolve relative links
409
+ # @param _opts [Hash] scraper-specific options
410
+ # @option _opts [Object] :_reserved reserved for future scraper-specific options
333
411
  def initialize(parsed_body, url:, **_opts)
334
412
  @parsed_body = parsed_body
335
413
  @url = url
@@ -337,6 +415,8 @@ module Html2rss
337
415
 
338
416
  attr_reader :parsed_body
339
417
 
418
+ # @yield [Hash{Symbol => Object}] normalized article hash
419
+ # @return [Enumerator, void] article enumerator when no block is given
340
420
  def each
341
421
  return enum_for(:each) unless block_given?
342
422