html2rss 0.18.0 → 0.19.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +60 -1
- data/lib/html2rss/articles/deduplicator.rb +1 -0
- data/lib/html2rss/auto_source/cleanup.rb +11 -0
- data/lib/html2rss/auto_source/scraper/html.rb +5 -0
- data/lib/html2rss/auto_source/scraper/json_state.rb +96 -16
- data/lib/html2rss/auto_source/scraper/microdata.rb +107 -1
- data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +1 -1
- data/lib/html2rss/auto_source/scraper/schema/item_list.rb +1 -0
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -1
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +21 -0
- data/lib/html2rss/auto_source/scraper/schema.rb +15 -4
- data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +5 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +4 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +60 -10
- data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +3 -2
- data/lib/html2rss/auto_source/scraper/wordpress_api.rb +19 -12
- data/lib/html2rss/auto_source/scraper.rb +19 -1
- data/lib/html2rss/auto_source.rb +4 -0
- data/lib/html2rss/blocked_surface.rb +1 -0
- data/lib/html2rss/category_extractor.rb +2 -2
- data/lib/html2rss/cli.rb +30 -6
- data/lib/html2rss/config/class_methods.rb +24 -35
- data/lib/html2rss/config/dynamic_params.rb +6 -4
- data/lib/html2rss/config/multiple_feeds_config.rb +3 -2
- data/lib/html2rss/config/request_headers.rb +9 -3
- data/lib/html2rss/config/schema.rb +33 -1
- data/lib/html2rss/config/validator.rb +40 -2
- data/lib/html2rss/config.rb +19 -13
- data/lib/html2rss/error.rb +25 -0
- data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
- data/lib/html2rss/feed_pipeline.rb +127 -0
- data/lib/html2rss/hash_util.rb +101 -0
- data/lib/html2rss/html_extractor/date_extractor.rb +1 -0
- data/lib/html2rss/html_extractor/enclosure_extractor.rb +19 -0
- data/lib/html2rss/html_extractor/image_extractor.rb +9 -0
- data/lib/html2rss/html_extractor.rb +5 -0
- data/lib/html2rss/html_navigator.rb +8 -0
- data/lib/html2rss/json_feed_builder.rb +1 -0
- data/lib/html2rss/rendering/audio_renderer.rb +8 -3
- data/lib/html2rss/rendering/description_builder.rb +0 -1
- data/lib/html2rss/rendering/image_renderer.rb +17 -7
- data/lib/html2rss/rendering/media_renderer.rb +4 -0
- data/lib/html2rss/rendering/pdf_renderer.rb +11 -5
- data/lib/html2rss/rendering/video_renderer.rb +8 -3
- data/lib/html2rss/rendering.rb +11 -2
- data/lib/html2rss/request_controls.rb +16 -21
- data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
- data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
- data/lib/html2rss/request_service/context.rb +14 -2
- data/lib/html2rss/request_service/faraday_strategy.rb +6 -4
- data/lib/html2rss/request_service/policy.rb +4 -0
- data/lib/html2rss/request_service/response.rb +9 -1
- data/lib/html2rss/request_service.rb +19 -0
- data/lib/html2rss/request_session/runtime_input.rb +16 -2
- data/lib/html2rss/request_session/runtime_policy.rb +7 -0
- data/lib/html2rss/request_session.rb +13 -9
- data/lib/html2rss/rss_builder/article.rb +22 -1
- data/lib/html2rss/rss_builder/channel.rb +11 -2
- data/lib/html2rss/rss_builder/enclosure.rb +15 -1
- data/lib/html2rss/rss_builder/stylesheet.rb +4 -0
- data/lib/html2rss/rss_builder.rb +4 -0
- data/lib/html2rss/selectors/config.rb +1 -0
- data/lib/html2rss/selectors/extractors/attribute.rb +2 -0
- data/lib/html2rss/selectors/extractors/href.rb +2 -0
- data/lib/html2rss/selectors/extractors/html.rb +1 -0
- data/lib/html2rss/selectors/extractors/static.rb +2 -1
- data/lib/html2rss/selectors/extractors/text.rb +1 -0
- data/lib/html2rss/selectors/extractors.rb +2 -1
- data/lib/html2rss/selectors/object_to_xml_converter.rb +1 -0
- data/lib/html2rss/selectors/post_processors/base.rb +13 -7
- data/lib/html2rss/selectors/post_processors/gsub.rb +3 -0
- data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +3 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +9 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +6 -0
- data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +3 -0
- data/lib/html2rss/selectors/post_processors/parse_time.rb +5 -0
- data/lib/html2rss/selectors/post_processors/parse_uri.rb +3 -0
- data/lib/html2rss/selectors/post_processors/sanitize_html.rb +5 -1
- data/lib/html2rss/selectors/post_processors/substring.rb +3 -0
- data/lib/html2rss/selectors/post_processors/template.rb +3 -0
- data/lib/html2rss/selectors/post_processors.rb +5 -0
- data/lib/html2rss/selectors.rb +7 -0
- data/lib/html2rss/url.rb +27 -23
- data/lib/html2rss/version.rb +2 -1
- data/lib/html2rss.rb +15 -78
- data/schema/html2rss-config.schema.json +83 -1
- metadata +7 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 69268fde80ddaa21f5ca3588de51f63182909714956af3ed8b1ee11a47075dc8
|
|
4
|
+
data.tar.gz: 045dfb3fec6cebfa8c7d066acd12c056dbf01766bbe1c292642d6d4d9db72055
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: de88861fd21375da62549cbed418f5f1550e7adf8e9c6ea98cfce9331944067bc8aa43eacdbdfe3ab6380764e485031f1d6bb3b456e0bf486340864328a5abc8
|
|
7
|
+
data.tar.gz: 6f65cd2e7dc555c35cb456bff595184331f3df07dafe10c50d6d334102237b0f98a5971c6033d8e56c4504254726ca0eca9757c22b1a697c2a47b8b76681945a
|
data/README.md
CHANGED
|
@@ -36,7 +36,7 @@ Please see the [contributing guide](https://html2rss.github.io/get-involved/cont
|
|
|
36
36
|
### Core Components
|
|
37
37
|
|
|
38
38
|
1. **Config** - Loads and validates configuration (YAML/hash)
|
|
39
|
-
2. **RequestService** - Fetches pages using Faraday or Browserless
|
|
39
|
+
2. **RequestService** - Fetches pages using Faraday, Botasaurus, or Browserless
|
|
40
40
|
3. **Selectors** - Extracts content via CSS selectors with extractors/post-processors
|
|
41
41
|
4. **AutoSource** - Auto-detects content using Schema.org, JSON state blobs, semantic HTML, and structural patterns
|
|
42
42
|
5. **RssBuilder** - Assembles Article objects and renders RSS 2.0
|
|
@@ -47,6 +47,65 @@ Please see the [contributing guide](https://html2rss.github.io/get-involved/cont
|
|
|
47
47
|
Config -> Request -> Extraction -> Processing -> Building -> Output
|
|
48
48
|
```
|
|
49
49
|
|
|
50
|
+
### Request Strategies
|
|
51
|
+
|
|
52
|
+
- `auto` (default): pipeline fallback orchestration (`faraday` -> `botasaurus` -> `browserless`) based on extraction outcome and retry policy.
|
|
53
|
+
- `faraday`: direct HTTP fetch.
|
|
54
|
+
- `botasaurus`: delegates fetching to a Botasaurus scrape API. Requires `BOTASAURUS_SCRAPER_URL` (for example `http://localhost:4010`).
|
|
55
|
+
- `browserless`: remote browser rendering via Browserless (`BROWSERLESS_IO_WEBSOCKET_URL` and token as needed).
|
|
56
|
+
|
|
57
|
+
Auto fallback shares one request budget across all strategy attempts. For pagination-heavy or dynamic pages, increase `request.max_requests` (or `--max-requests`) when retries exhaust the budget.
|
|
58
|
+
|
|
59
|
+
Auto fallback decisions are hidden at the default `LOG_LEVEL=warn`; run with `LOG_LEVEL=info` to include them in CLI output.
|
|
60
|
+
|
|
61
|
+
Supported `request.botasaurus` options:
|
|
62
|
+
|
|
63
|
+
- `navigation_mode` (`auto`, `get`, `google_get`, `google_get_bypass`; default `auto`)
|
|
64
|
+
- `max_retries` (`0..3`; default `2`)
|
|
65
|
+
- `wait_for_selector` (string)
|
|
66
|
+
- `wait_timeout_seconds` (integer)
|
|
67
|
+
- `block_images` (boolean)
|
|
68
|
+
- `block_images_and_css` (boolean)
|
|
69
|
+
- `wait_for_complete_page_load` (boolean)
|
|
70
|
+
- `headless` (boolean, default `false`)
|
|
71
|
+
- `proxy` (string)
|
|
72
|
+
- `user_agent` (string)
|
|
73
|
+
- `window_size` (two-item integer array, for example `[1920, 1080]`)
|
|
74
|
+
- `lang` (string, for example `en-US`)
|
|
75
|
+
|
|
76
|
+
Minimal YAML config example:
|
|
77
|
+
|
|
78
|
+
```yaml
|
|
79
|
+
channel:
|
|
80
|
+
url: https://example.com
|
|
81
|
+
strategy: botasaurus
|
|
82
|
+
auto_source: {}
|
|
83
|
+
request:
|
|
84
|
+
botasaurus:
|
|
85
|
+
navigation_mode: auto
|
|
86
|
+
max_retries: 2
|
|
87
|
+
headless: false
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
Example request payload shape:
|
|
91
|
+
|
|
92
|
+
```json
|
|
93
|
+
{
|
|
94
|
+
"url": "https://example.com",
|
|
95
|
+
"navigation_mode": "auto",
|
|
96
|
+
"max_retries": 2,
|
|
97
|
+
"headless": false
|
|
98
|
+
}
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
Example usage:
|
|
102
|
+
|
|
103
|
+
```bash
|
|
104
|
+
BOTASAURUS_SCRAPER_URL=http://localhost:4010 html2rss auto https://example.com --strategy botasaurus
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
Policy note: html2rss still enforces local request policy preflight and timeout budget. Botasaurus handles browser navigation/rendering internals, so some policy details are delegated to upstream execution.
|
|
108
|
+
|
|
50
109
|
### Config schema workflow
|
|
51
110
|
|
|
52
111
|
The config schema is generated from the runtime `dry-validation` contracts and exported for client-side tooling.
|
|
@@ -7,14 +7,21 @@ module Html2rss
|
|
|
7
7
|
# :reek:MissingSafeMethod { enabled: false }
|
|
8
8
|
# It applies various strategies to filter and refine the article list.
|
|
9
9
|
class Cleanup
|
|
10
|
+
# Default cleanup behavior for auto-sourced article lists.
|
|
10
11
|
DEFAULT_CONFIG = {
|
|
11
12
|
keep_different_domain: false,
|
|
12
13
|
min_words_title: 3
|
|
13
14
|
}.freeze
|
|
14
15
|
|
|
16
|
+
# Allowed URL schemes for article filtering.
|
|
15
17
|
VALID_SCHEMES = %w[http https].to_set.freeze
|
|
16
18
|
|
|
17
19
|
class << self
|
|
20
|
+
# @param articles [Array<Article>] extracted article candidates
|
|
21
|
+
# @param url [Html2rss::Url] feed source URL used for same-host filtering
|
|
22
|
+
# @param keep_different_domain [Boolean] whether to keep off-domain entries
|
|
23
|
+
# @param min_words_title [Integer] minimum word count for title filtering
|
|
24
|
+
# @return [Array<Article>] cleaned article list
|
|
18
25
|
def call(articles, url:, keep_different_domain:, min_words_title:)
|
|
19
26
|
Log.debug "Cleanup: start with #{articles.size} articles"
|
|
20
27
|
|
|
@@ -35,6 +42,7 @@ module Html2rss
|
|
|
35
42
|
#
|
|
36
43
|
# @param articles [Array<Article>] The list of articles to process.
|
|
37
44
|
# @param key [Symbol] The key to deduplicate by.
|
|
45
|
+
# @return [Array<Article>] the mutated articles array
|
|
38
46
|
def deduplicate_by!(articles, key)
|
|
39
47
|
seen = {}
|
|
40
48
|
articles.reject! do |article|
|
|
@@ -47,6 +55,7 @@ module Html2rss
|
|
|
47
55
|
# Keeps only articles with HTTP or HTTPS URLs.
|
|
48
56
|
#
|
|
49
57
|
# @param articles [Array<Article>] The list of articles to process.
|
|
58
|
+
# @return [Array<Article>] the mutated articles array
|
|
50
59
|
def keep_only_http_urls!(articles)
|
|
51
60
|
articles.select! { |article| VALID_SCHEMES.include?(article.url&.scheme) }
|
|
52
61
|
end
|
|
@@ -56,6 +65,7 @@ module Html2rss
|
|
|
56
65
|
#
|
|
57
66
|
# @param articles [Array<Article>] The list of articles to process.
|
|
58
67
|
# @param base_url [Html2rss::Url] The source URL to compare against.
|
|
68
|
+
# @return [Array<Article>] the mutated articles array
|
|
59
69
|
def reject_different_domain!(articles, base_url)
|
|
60
70
|
base_host = base_url.host
|
|
61
71
|
articles.select! { |article| article.url&.host == base_host }
|
|
@@ -66,6 +76,7 @@ module Html2rss
|
|
|
66
76
|
#
|
|
67
77
|
# @param articles [Array<Article>] The list of articles to process.
|
|
68
78
|
# @param min_words_title [Integer] The minimum number of words in the title.
|
|
79
|
+
# @return [Array<Article>] the mutated articles array
|
|
69
80
|
def keep_only_with_min_words_title!(articles, min_words_title:)
|
|
70
81
|
articles.select! do |article|
|
|
71
82
|
article.title ? word_count_at_least?(article.title, min_words_title) : true
|
|
@@ -19,9 +19,12 @@ module Html2rss
|
|
|
19
19
|
class Html
|
|
20
20
|
include Enumerable
|
|
21
21
|
|
|
22
|
+
# Elements ignored when traversing potential article containers.
|
|
22
23
|
TAGS_TO_IGNORE = /(nav|footer|header|svg|script|style)/i
|
|
23
24
|
|
|
25
|
+
# Minimum selector frequency required to treat a path as a stable list signal.
|
|
24
26
|
DEFAULT_MINIMUM_SELECTOR_FREQUENCY = 2
|
|
27
|
+
# Number of most frequent selectors kept for container extraction.
|
|
25
28
|
DEFAULT_USE_TOP_SELECTORS = 5
|
|
26
29
|
|
|
27
30
|
##
|
|
@@ -53,6 +56,8 @@ module Html2rss
|
|
|
53
56
|
# @param url [String] The base URL.
|
|
54
57
|
# @param extractor [Class] The extractor class to handle article extraction.
|
|
55
58
|
# @param opts [Hash] Additional options.
|
|
59
|
+
# @option opts [Integer] :minimum_selector_frequency minimum count before a selector is considered stable
|
|
60
|
+
# @option opts [Integer] :use_top_selectors number of top selectors to keep
|
|
56
61
|
def initialize(parsed_body, url:, extractor: HtmlExtractor, **opts)
|
|
57
62
|
@parsed_body = parsed_body
|
|
58
63
|
@url = url
|
|
@@ -5,7 +5,7 @@ require 'json'
|
|
|
5
5
|
module Html2rss
|
|
6
6
|
class AutoSource
|
|
7
7
|
module Scraper
|
|
8
|
-
|
|
8
|
+
##
|
|
9
9
|
# Scrapes JSON state blobs embedded in script tags such as Next.js, Nuxt,
|
|
10
10
|
# or custom window globals. The scraper searches `<script type="application/json">`
|
|
11
11
|
# tags and well-known JavaScript globals for arrays of article-like hashes
|
|
@@ -13,7 +13,9 @@ module Html2rss
|
|
|
13
13
|
class JsonState
|
|
14
14
|
include Enumerable
|
|
15
15
|
|
|
16
|
+
# Selector for JSON-only script tags.
|
|
16
17
|
JSON_SCRIPT_SELECTOR = 'script[type="application/json"]'
|
|
18
|
+
# Regex patterns for known global JavaScript state assignments.
|
|
17
19
|
GLOBAL_ASSIGNMENT_PATTERNS = [
|
|
18
20
|
/(?:window|self|globalThis)\.__NEXT_DATA__\s*=\s*/m,
|
|
19
21
|
/(?:window|self|globalThis)\.__NUXT__\s*=\s*/m,
|
|
@@ -28,36 +30,53 @@ module Html2rss
|
|
|
28
30
|
/(?:window|self|globalThis)\.angular\s*=\s*/m
|
|
29
31
|
].freeze
|
|
30
32
|
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
33
|
+
# Preferred keys when extracting title-like values from state payloads.
|
|
34
|
+
TITLE_KEYS = %i[title headline name text].freeze
|
|
35
|
+
# Preferred keys when extracting URL-like values from state payloads.
|
|
36
|
+
URL_KEYS = %i[url link href permalink slug path canonicalUrl shortUrl].freeze
|
|
37
|
+
# Preferred keys when extracting description-like values from state payloads.
|
|
38
|
+
DESCRIPTION_KEYS = %i[description summary excerpt dek subheading].freeze
|
|
39
|
+
# Preferred keys when extracting image-like values from state payloads.
|
|
40
|
+
IMAGE_KEYS = %i[image imageUrl thumbnailUrl thumbnail src featuredImage coverImage heroImage].freeze
|
|
41
|
+
# Preferred keys when extracting publication timestamps from state payloads.
|
|
42
|
+
PUBLISHED_AT_KEYS = %i[published_at publishedAt datePublished date publicationDate pubDate updatedAt updated_at
|
|
36
43
|
createdAt created_at].freeze
|
|
37
|
-
|
|
38
|
-
|
|
44
|
+
# Preferred keys when extracting category-like values from state payloads.
|
|
45
|
+
CATEGORY_KEYS = %i[categories tags section sections topic topics channel].freeze
|
|
46
|
+
# Preferred keys when extracting identifier-like values from state payloads.
|
|
47
|
+
ID_KEYS = %i[id guid uuid slug key].freeze
|
|
39
48
|
|
|
40
49
|
# Scans DOM nodes for JSON payloads containing article data.
|
|
41
50
|
module DocumentScanner
|
|
42
51
|
module_function
|
|
43
52
|
|
|
53
|
+
# @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
|
|
54
|
+
# @return [Array<Hash, Array>] parsed JSON documents discovered in scripts
|
|
44
55
|
def json_documents(parsed_body)
|
|
45
56
|
script_documents(parsed_body) + assignment_documents(parsed_body)
|
|
46
57
|
end
|
|
47
58
|
|
|
59
|
+
# @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
|
|
60
|
+
# @return [Array<Hash, Array>] JSON documents extracted from JSON script tags
|
|
48
61
|
def script_documents(parsed_body)
|
|
49
62
|
parsed_body.css(JSON_SCRIPT_SELECTOR).filter_map { parse_json(_1.text) }
|
|
50
63
|
end
|
|
51
64
|
|
|
65
|
+
# @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
|
|
66
|
+
# @return [Array<Hash, Array>] JSON documents extracted from global assignments
|
|
52
67
|
def assignment_documents(parsed_body)
|
|
53
68
|
parsed_body.css('script').filter_map { parse_assignment(_1.text) }
|
|
54
69
|
end
|
|
55
70
|
|
|
71
|
+
# @param text [String] script text that may contain a global assignment
|
|
72
|
+
# @return [Hash, Array, nil] parsed assignment payload when available
|
|
56
73
|
def parse_assignment(text)
|
|
57
74
|
payload = assignment_payload(text)
|
|
58
75
|
parse_json(payload) if payload
|
|
59
76
|
end
|
|
60
77
|
|
|
78
|
+
# @param text [String] script text to inspect for known assignment patterns
|
|
79
|
+
# @return [String, nil] extracted JSON-like assignment payload
|
|
61
80
|
def assignment_payload(text)
|
|
62
81
|
trimmed = text.to_s.strip
|
|
63
82
|
return if trimmed.empty?
|
|
@@ -72,10 +91,14 @@ module Html2rss
|
|
|
72
91
|
nil
|
|
73
92
|
end
|
|
74
93
|
|
|
94
|
+
# @param text [String] text potentially containing JSON-like payloads
|
|
95
|
+
# @return [String, nil] normalized assignment payload
|
|
75
96
|
def extract_assignment_payload(text)
|
|
76
97
|
extract_json_block(text) || text
|
|
77
98
|
end
|
|
78
99
|
|
|
100
|
+
# @param text [String] text potentially containing JSON blocks
|
|
101
|
+
# @return [String, nil] extracted JSON block spanning balanced brackets
|
|
79
102
|
def extract_json_block(text)
|
|
80
103
|
start_index = text.index(/[\[{]/)
|
|
81
104
|
return unless start_index
|
|
@@ -85,6 +108,9 @@ module Html2rss
|
|
|
85
108
|
end
|
|
86
109
|
|
|
87
110
|
# rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
|
|
111
|
+
# @param text [String] text containing a JSON object/array opening token at start_index
|
|
112
|
+
# @param start_index [Integer] index where JSON-like content starts
|
|
113
|
+
# @return [Integer, nil] index where the balanced JSON payload ends
|
|
88
114
|
def scan_for_json_end(text, start_index)
|
|
89
115
|
stack = []
|
|
90
116
|
in_string = false
|
|
@@ -121,6 +147,8 @@ module Html2rss
|
|
|
121
147
|
end
|
|
122
148
|
# rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
|
|
123
149
|
|
|
150
|
+
# @param payload [String, nil] JSON payload to parse
|
|
151
|
+
# @return [Hash, Array, nil] parsed payload or nil when parsing fails
|
|
124
152
|
def parse_json(payload)
|
|
125
153
|
return unless payload
|
|
126
154
|
|
|
@@ -129,6 +157,9 @@ module Html2rss
|
|
|
129
157
|
parse_js_object(payload, error)
|
|
130
158
|
end
|
|
131
159
|
|
|
160
|
+
# @param payload [String] JavaScript object-literal payload
|
|
161
|
+
# @param _original_error [JSON::ParserError] original JSON parse error
|
|
162
|
+
# @return [Hash, Array, nil] parsed payload after JavaScript coercion
|
|
132
163
|
def parse_js_object(payload, _original_error)
|
|
133
164
|
coerced = coerce_javascript_object(payload)
|
|
134
165
|
return unless coerced
|
|
@@ -141,6 +172,8 @@ module Html2rss
|
|
|
141
172
|
nil
|
|
142
173
|
end
|
|
143
174
|
|
|
175
|
+
# @param payload [String] JavaScript object-literal payload
|
|
176
|
+
# @return [String] JSON-compatible payload string
|
|
144
177
|
def coerce_javascript_object(payload)
|
|
145
178
|
string = payload.dup
|
|
146
179
|
|
|
@@ -148,12 +181,16 @@ module Html2rss
|
|
|
148
181
|
strip_trailing_commas(quote_unquoted_keys(string))
|
|
149
182
|
end
|
|
150
183
|
|
|
184
|
+
# @param jsonish [String] JSON-like string with potentially unquoted keys
|
|
185
|
+
# @return [String] payload with unquoted object keys quoted
|
|
151
186
|
def quote_unquoted_keys(jsonish)
|
|
152
187
|
jsonish.gsub(/(\A\s*|[{,\[]\s*)([A-Za-z_]\w*)(\s*:)/) do
|
|
153
188
|
"#{Regexp.last_match(1)}\"#{Regexp.last_match(2)}\"#{Regexp.last_match(3)}"
|
|
154
189
|
end
|
|
155
190
|
end
|
|
156
191
|
|
|
192
|
+
# @param jsonish [String] JSON-like string with potential trailing commas
|
|
193
|
+
# @return [String] payload without trailing commas before closing tokens
|
|
157
194
|
def strip_trailing_commas(jsonish)
|
|
158
195
|
jsonish.gsub(/,(\s*[\]}])/, '\1')
|
|
159
196
|
end
|
|
@@ -164,6 +201,9 @@ module Html2rss
|
|
|
164
201
|
module ValueFinder
|
|
165
202
|
module_function
|
|
166
203
|
|
|
204
|
+
# @param object [Hash, Array] candidate container traversed during key lookup
|
|
205
|
+
# @param keys [Array<Symbol>] keys to probe in order
|
|
206
|
+
# @return [Object, nil] first matching value
|
|
167
207
|
def fetch(object, keys)
|
|
168
208
|
case object
|
|
169
209
|
when Hash then fetch_from_hash(object, keys)
|
|
@@ -171,19 +211,21 @@ module Html2rss
|
|
|
171
211
|
end
|
|
172
212
|
end
|
|
173
213
|
|
|
214
|
+
# @param hash [Hash] hash candidate traversed during key lookup
|
|
215
|
+
# @param keys [Array<Symbol>] keys to probe in order
|
|
216
|
+
# @return [Object, nil] first matching value from hash or nested metadata
|
|
174
217
|
def fetch_from_hash(hash, keys)
|
|
175
218
|
keys.each do |key|
|
|
176
|
-
|
|
177
|
-
return hash[string_key] if hash.key?(string_key)
|
|
178
|
-
|
|
179
|
-
symbol_key = string_key.to_sym
|
|
180
|
-
return hash[symbol_key] if hash.key?(symbol_key)
|
|
219
|
+
return hash[key] if hash.key?(key)
|
|
181
220
|
end
|
|
182
221
|
|
|
183
|
-
fetch_nested(hash[:attributes]
|
|
184
|
-
fetch_nested(hash[:data]
|
|
222
|
+
fetch_nested(hash[:attributes], keys) ||
|
|
223
|
+
fetch_nested(hash[:data], keys)
|
|
185
224
|
end
|
|
186
225
|
|
|
226
|
+
# @param array [Array] array whose entries may contain target keys
|
|
227
|
+
# @param keys [Array<Symbol>] keys to probe in order
|
|
228
|
+
# @return [Object, nil] first matching value from array entries
|
|
187
229
|
def fetch_from_array(array, keys)
|
|
188
230
|
array.each do |entry|
|
|
189
231
|
result = fetch(entry, keys)
|
|
@@ -193,6 +235,9 @@ module Html2rss
|
|
|
193
235
|
nil
|
|
194
236
|
end
|
|
195
237
|
|
|
238
|
+
# @param value [Hash, Array, nil] nested value to recurse into
|
|
239
|
+
# @param keys [Array<Symbol>] keys to probe in order
|
|
240
|
+
# @return [Object, nil] matching nested value
|
|
196
241
|
def fetch_nested(value, keys)
|
|
197
242
|
fetch(value, keys) if value
|
|
198
243
|
end
|
|
@@ -203,6 +248,8 @@ module Html2rss
|
|
|
203
248
|
module CandidateDetector
|
|
204
249
|
module_function
|
|
205
250
|
|
|
251
|
+
# @param document [Hash, Array, Object] candidate document node
|
|
252
|
+
# @return [Boolean] whether the node contains article-like arrays
|
|
206
253
|
def candidate_array?(document)
|
|
207
254
|
case document
|
|
208
255
|
when Array
|
|
@@ -214,6 +261,8 @@ module Html2rss
|
|
|
214
261
|
end
|
|
215
262
|
end
|
|
216
263
|
|
|
264
|
+
# @param value [Hash, Array, Object] candidate nested value
|
|
265
|
+
# @return [Boolean] whether nested value should be traversed for article candidates
|
|
217
266
|
def traversable_candidate?(value)
|
|
218
267
|
case value
|
|
219
268
|
when Array, Hash then candidate_array?(value)
|
|
@@ -221,6 +270,8 @@ module Html2rss
|
|
|
221
270
|
end
|
|
222
271
|
end
|
|
223
272
|
|
|
273
|
+
# @param array [Array<Object>] candidate list of entries
|
|
274
|
+
# @return [Boolean] whether array includes hash entries with title and URL fields
|
|
224
275
|
def array_of_articles?(array)
|
|
225
276
|
array.any? do |element|
|
|
226
277
|
next unless element.is_a?(Hash)
|
|
@@ -229,10 +280,14 @@ module Html2rss
|
|
|
229
280
|
end
|
|
230
281
|
end
|
|
231
282
|
|
|
283
|
+
# @param object [Hash] article candidate object
|
|
284
|
+
# @return [Object, nil] detected title-like value
|
|
232
285
|
def title_from(object)
|
|
233
286
|
ValueFinder.fetch(object, TITLE_KEYS)
|
|
234
287
|
end
|
|
235
288
|
|
|
289
|
+
# @param object [Hash] article candidate object
|
|
290
|
+
# @return [Object, nil] detected URL-like value
|
|
236
291
|
def url_from(object)
|
|
237
292
|
ValueFinder.fetch(object, URL_KEYS)
|
|
238
293
|
end
|
|
@@ -244,6 +299,9 @@ module Html2rss
|
|
|
244
299
|
module_function
|
|
245
300
|
|
|
246
301
|
# rubocop:disable Metrics/MethodLength
|
|
302
|
+
# @param entry [Hash] raw article entry candidate
|
|
303
|
+
# @param base_url [String, Html2rss::Url] base URL for relative link resolution
|
|
304
|
+
# @return [Hash{Symbol => Object}, nil] normalized article hash for downstream extraction
|
|
247
305
|
def normalise(entry, base_url:)
|
|
248
306
|
return unless entry.is_a?(Hash)
|
|
249
307
|
|
|
@@ -267,11 +325,18 @@ module Html2rss
|
|
|
267
325
|
end
|
|
268
326
|
# rubocop:enable Metrics/MethodLength
|
|
269
327
|
|
|
328
|
+
# @param value [Object] candidate scalar value
|
|
329
|
+
# @return [String, nil] normalized non-empty string value
|
|
270
330
|
def string(value)
|
|
271
331
|
trimmed = value.to_s.strip
|
|
272
332
|
trimmed unless trimmed.empty?
|
|
273
333
|
end
|
|
274
334
|
|
|
335
|
+
# @param entry [Hash] raw article entry candidate
|
|
336
|
+
# @param keys [Array<Symbol>] preferred link keys
|
|
337
|
+
# @param base_url [String, Html2rss::Url] base URL for relative link resolution
|
|
338
|
+
# @param log_key [String] structured log message key
|
|
339
|
+
# @return [Html2rss::Url, nil] resolved absolute URL
|
|
275
340
|
def resolve_link(entry, keys:, base_url:, log_key:)
|
|
276
341
|
value = ValueFinder.fetch(entry, keys)
|
|
277
342
|
value = ValueFinder.fetch(value, keys) if value.is_a?(Hash)
|
|
@@ -285,6 +350,8 @@ module Html2rss
|
|
|
285
350
|
end
|
|
286
351
|
|
|
287
352
|
# rubocop:disable Metrics/MethodLength
|
|
353
|
+
# @param entry [Hash] raw article entry candidate
|
|
354
|
+
# @return [Array<String>, nil] normalized unique categories
|
|
288
355
|
def categories(entry)
|
|
289
356
|
raw = ValueFinder.fetch(entry, CATEGORY_KEYS)
|
|
290
357
|
names = case raw
|
|
@@ -297,7 +364,7 @@ module Html2rss
|
|
|
297
364
|
result = names.flat_map do |value|
|
|
298
365
|
case value
|
|
299
366
|
when Hash
|
|
300
|
-
string(ValueFinder.fetch(value, %
|
|
367
|
+
string(ValueFinder.fetch(value, %i[name title label]))
|
|
301
368
|
else
|
|
302
369
|
string(value)
|
|
303
370
|
end
|
|
@@ -308,6 +375,9 @@ module Html2rss
|
|
|
308
375
|
end
|
|
309
376
|
# rubocop:enable Metrics/MethodLength
|
|
310
377
|
|
|
378
|
+
# @param entry [Hash] raw article entry candidate
|
|
379
|
+
# @param article_url [Html2rss::Url] resolved article URL
|
|
380
|
+
# @return [String] stable article identifier, falling back to the resolved URL
|
|
311
381
|
def identifier(entry, article_url)
|
|
312
382
|
value = ValueFinder.fetch(entry, ID_KEYS)
|
|
313
383
|
value = ValueFinder.fetch(value, ID_KEYS) if value.is_a?(Hash)
|
|
@@ -316,20 +386,28 @@ module Html2rss
|
|
|
316
386
|
end
|
|
317
387
|
private_constant :ArticleNormalizer
|
|
318
388
|
|
|
389
|
+
# @return [Symbol] scraper config key
|
|
319
390
|
def self.options_key = :json_state
|
|
320
391
|
|
|
321
392
|
class << self
|
|
393
|
+
# @param parsed_body [Nokogiri::HTML::Document, nil] parsed HTML document
|
|
322
394
|
def articles?(parsed_body)
|
|
323
395
|
return false unless parsed_body
|
|
324
396
|
|
|
325
397
|
DocumentScanner.json_documents(parsed_body).any? { CandidateDetector.candidate_array?(_1) }
|
|
326
398
|
end
|
|
327
399
|
|
|
400
|
+
# @param parsed_body [Nokogiri::HTML::Document, nil] parsed HTML document
|
|
401
|
+
# @return [Array<Hash, Array>] parsed JSON documents discovered in the response body
|
|
328
402
|
def json_documents(parsed_body)
|
|
329
403
|
DocumentScanner.json_documents(parsed_body)
|
|
330
404
|
end
|
|
331
405
|
end
|
|
332
406
|
|
|
407
|
+
# @param parsed_body [Nokogiri::HTML::Document, nil] parsed HTML document
|
|
408
|
+
# @param url [String, Html2rss::Url] page URL used to resolve relative links
|
|
409
|
+
# @param _opts [Hash] scraper-specific options
|
|
410
|
+
# @option _opts [Object] :_reserved reserved for future scraper-specific options
|
|
333
411
|
def initialize(parsed_body, url:, **_opts)
|
|
334
412
|
@parsed_body = parsed_body
|
|
335
413
|
@url = url
|
|
@@ -337,6 +415,8 @@ module Html2rss
|
|
|
337
415
|
|
|
338
416
|
attr_reader :parsed_body
|
|
339
417
|
|
|
418
|
+
# @yield [Hash{Symbol => Object}] normalized article hash
|
|
419
|
+
# @return [Enumerator, void] article enumerator when no block is given
|
|
340
420
|
def each
|
|
341
421
|
return enum_for(:each) unless block_given?
|
|
342
422
|
|