html2rss 0.18.0 → 0.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +60 -1
- data/lib/html2rss/articles/deduplicator.rb +1 -0
- data/lib/html2rss/auto_source/cleanup.rb +11 -0
- data/lib/html2rss/auto_source/scraper/html.rb +5 -0
- data/lib/html2rss/auto_source/scraper/json_state.rb +96 -16
- data/lib/html2rss/auto_source/scraper/microdata.rb +107 -1
- data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +1 -1
- data/lib/html2rss/auto_source/scraper/schema/item_list.rb +1 -0
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -1
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +21 -0
- data/lib/html2rss/auto_source/scraper/schema.rb +15 -4
- data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +5 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +4 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +60 -10
- data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +3 -2
- data/lib/html2rss/auto_source/scraper/wordpress_api.rb +19 -12
- data/lib/html2rss/auto_source/scraper.rb +19 -1
- data/lib/html2rss/auto_source.rb +4 -0
- data/lib/html2rss/blocked_surface.rb +1 -0
- data/lib/html2rss/category_extractor.rb +2 -2
- data/lib/html2rss/cli.rb +30 -6
- data/lib/html2rss/config/class_methods.rb +24 -35
- data/lib/html2rss/config/dynamic_params.rb +6 -4
- data/lib/html2rss/config/multiple_feeds_config.rb +3 -2
- data/lib/html2rss/config/request_headers.rb +9 -3
- data/lib/html2rss/config/schema.rb +33 -1
- data/lib/html2rss/config/validator.rb +40 -2
- data/lib/html2rss/config.rb +19 -13
- data/lib/html2rss/error.rb +25 -0
- data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
- data/lib/html2rss/feed_pipeline.rb +127 -0
- data/lib/html2rss/hash_util.rb +101 -0
- data/lib/html2rss/html_extractor/date_extractor.rb +1 -0
- data/lib/html2rss/html_extractor/enclosure_extractor.rb +19 -0
- data/lib/html2rss/html_extractor/image_extractor.rb +9 -0
- data/lib/html2rss/html_extractor.rb +5 -0
- data/lib/html2rss/html_navigator.rb +8 -0
- data/lib/html2rss/json_feed_builder.rb +1 -0
- data/lib/html2rss/rendering/audio_renderer.rb +8 -3
- data/lib/html2rss/rendering/description_builder.rb +0 -1
- data/lib/html2rss/rendering/image_renderer.rb +17 -7
- data/lib/html2rss/rendering/media_renderer.rb +4 -0
- data/lib/html2rss/rendering/pdf_renderer.rb +11 -5
- data/lib/html2rss/rendering/video_renderer.rb +8 -3
- data/lib/html2rss/rendering.rb +11 -2
- data/lib/html2rss/request_controls.rb +16 -21
- data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
- data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
- data/lib/html2rss/request_service/context.rb +14 -2
- data/lib/html2rss/request_service/faraday_strategy.rb +6 -4
- data/lib/html2rss/request_service/policy.rb +4 -0
- data/lib/html2rss/request_service/response.rb +9 -1
- data/lib/html2rss/request_service.rb +19 -0
- data/lib/html2rss/request_session/runtime_input.rb +16 -2
- data/lib/html2rss/request_session/runtime_policy.rb +7 -0
- data/lib/html2rss/request_session.rb +13 -9
- data/lib/html2rss/rss_builder/article.rb +22 -1
- data/lib/html2rss/rss_builder/channel.rb +11 -2
- data/lib/html2rss/rss_builder/enclosure.rb +15 -1
- data/lib/html2rss/rss_builder/stylesheet.rb +4 -0
- data/lib/html2rss/rss_builder.rb +4 -0
- data/lib/html2rss/selectors/config.rb +1 -0
- data/lib/html2rss/selectors/extractors/attribute.rb +2 -0
- data/lib/html2rss/selectors/extractors/href.rb +2 -0
- data/lib/html2rss/selectors/extractors/html.rb +1 -0
- data/lib/html2rss/selectors/extractors/static.rb +2 -1
- data/lib/html2rss/selectors/extractors/text.rb +1 -0
- data/lib/html2rss/selectors/extractors.rb +2 -1
- data/lib/html2rss/selectors/object_to_xml_converter.rb +1 -0
- data/lib/html2rss/selectors/post_processors/base.rb +13 -7
- data/lib/html2rss/selectors/post_processors/gsub.rb +3 -0
- data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +3 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +9 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +6 -0
- data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +3 -0
- data/lib/html2rss/selectors/post_processors/parse_time.rb +5 -0
- data/lib/html2rss/selectors/post_processors/parse_uri.rb +3 -0
- data/lib/html2rss/selectors/post_processors/sanitize_html.rb +5 -1
- data/lib/html2rss/selectors/post_processors/substring.rb +3 -0
- data/lib/html2rss/selectors/post_processors/template.rb +3 -0
- data/lib/html2rss/selectors/post_processors.rb +5 -0
- data/lib/html2rss/selectors.rb +7 -0
- data/lib/html2rss/url.rb +27 -23
- data/lib/html2rss/version.rb +2 -1
- data/lib/html2rss.rb +15 -78
- data/schema/html2rss-config.schema.json +83 -1
- metadata +7 -2
data/lib/html2rss.rb
CHANGED
|
@@ -26,7 +26,7 @@ module Html2rss
|
|
|
26
26
|
#
|
|
27
27
|
# @param file [String] path to the YAML file
|
|
28
28
|
# @param feed_name [String, nil] optional feed name inside a multi-feed config
|
|
29
|
-
# @return [Hash
|
|
29
|
+
# @return [Hash{Symbol => Object}] loaded configuration hash
|
|
30
30
|
def self.config_from_yaml_file(file, feed_name = nil)
|
|
31
31
|
Config.load_yaml(file, feed_name)
|
|
32
32
|
end
|
|
@@ -34,23 +34,19 @@ module Html2rss
|
|
|
34
34
|
##
|
|
35
35
|
# Returns an RSS object generated from the provided configuration.
|
|
36
36
|
#
|
|
37
|
-
# @param raw_config [Hash
|
|
37
|
+
# @param raw_config [Hash{Symbol => Object}] feed configuration
|
|
38
38
|
# @return [RSS::Rss] generated RSS feed
|
|
39
39
|
def self.feed(raw_config)
|
|
40
|
-
|
|
41
|
-
build_rss_feed(response:, config:, articles:)
|
|
42
|
-
end
|
|
40
|
+
FeedPipeline.new(raw_config).to_rss
|
|
43
41
|
end
|
|
44
42
|
|
|
45
43
|
##
|
|
46
44
|
# Returns a JSONFeed 1.1 hash generated from the provided configuration.
|
|
47
45
|
#
|
|
48
|
-
# @param raw_config [Hash
|
|
46
|
+
# @param raw_config [Hash{Symbol => Object}] feed configuration
|
|
49
47
|
# @return [Hash] JSONFeed-compliant hash
|
|
50
48
|
def self.json_feed(raw_config)
|
|
51
|
-
|
|
52
|
-
build_json_feed(response:, config:, articles:)
|
|
53
|
-
end
|
|
49
|
+
FeedPipeline.new(raw_config).to_json_feed
|
|
54
50
|
end
|
|
55
51
|
|
|
56
52
|
##
|
|
@@ -62,7 +58,7 @@ module Html2rss
|
|
|
62
58
|
# @param max_redirects [Integer, nil] optional redirect limit override
|
|
63
59
|
# @param max_requests [Integer, nil] optional request budget override
|
|
64
60
|
# @return [RSS::Rss] generated RSS feed
|
|
65
|
-
def self.auto_source(url, strategy: :faraday, items_selector: nil, max_redirects: nil, max_requests: nil)
|
|
61
|
+
def self.auto_source(url, strategy: :auto, items_selector: nil, max_redirects: nil, max_requests: nil)
|
|
66
62
|
feed(build_auto_source_config(url:, strategy:, items_selector:, max_redirects:, max_requests:))
|
|
67
63
|
end
|
|
68
64
|
|
|
@@ -75,80 +71,13 @@ module Html2rss
|
|
|
75
71
|
# @param max_redirects [Integer, nil] optional redirect limit override
|
|
76
72
|
# @param max_requests [Integer, nil] optional request budget override
|
|
77
73
|
# @return [Hash] JSONFeed-compliant hash
|
|
78
|
-
def self.auto_json_feed(url, strategy: :faraday, items_selector: nil, max_redirects: nil, max_requests: nil)
|
|
74
|
+
def self.auto_json_feed(url, strategy: :auto, items_selector: nil, max_redirects: nil, max_requests: nil)
|
|
79
75
|
json_feed(build_auto_source_config(url:, strategy:, items_selector:, max_redirects:, max_requests:))
|
|
80
76
|
end
|
|
81
77
|
|
|
82
78
|
class << self
|
|
83
79
|
private
|
|
84
80
|
|
|
85
|
-
def run_pipeline(raw_config)
|
|
86
|
-
# 1. Normalize and validate the user-facing feed config.
|
|
87
|
-
config = Config.from_hash(raw_config, params: raw_config[:params])
|
|
88
|
-
runtime_input = RequestSession::RuntimeInput.from_config(config)
|
|
89
|
-
|
|
90
|
-
# 2. Fetch the initial page using a shared request session.
|
|
91
|
-
request_session = RequestSession.from_runtime_input(runtime_input)
|
|
92
|
-
response = request_session.fetch_initial_response
|
|
93
|
-
|
|
94
|
-
# 3. Collect articles from configured selectors and auto-source scrapers.
|
|
95
|
-
articles = Articles::Deduplicator.new(
|
|
96
|
-
collect_articles(response:, config:, request_session:)
|
|
97
|
-
).call
|
|
98
|
-
|
|
99
|
-
# 4. Render the final output format chosen by the public entrypoint.
|
|
100
|
-
yield response:, config:, articles:
|
|
101
|
-
end
|
|
102
|
-
|
|
103
|
-
def collect_articles(response:, config:, request_session:)
|
|
104
|
-
selector_articles(response:, config:, request_session:) +
|
|
105
|
-
auto_source_articles(response:, config:, request_session:)
|
|
106
|
-
end
|
|
107
|
-
|
|
108
|
-
def selector_articles(response:, config:, request_session:) # rubocop:disable Metrics/MethodLength
|
|
109
|
-
return [] unless (selectors = config.selectors)
|
|
110
|
-
|
|
111
|
-
page_responses = if (max_pages = selectors.dig(:items, :pagination, :max_pages))
|
|
112
|
-
RequestSession::RelNextPager.new(
|
|
113
|
-
session: request_session,
|
|
114
|
-
initial_response: response,
|
|
115
|
-
max_pages:
|
|
116
|
-
).to_a
|
|
117
|
-
else
|
|
118
|
-
[response]
|
|
119
|
-
end
|
|
120
|
-
|
|
121
|
-
page_responses.flat_map do |page_response|
|
|
122
|
-
Selectors.new(page_response, selectors:, time_zone: config.time_zone).articles
|
|
123
|
-
end
|
|
124
|
-
end
|
|
125
|
-
|
|
126
|
-
def auto_source_articles(response:, config:, request_session:)
|
|
127
|
-
return [] unless (auto_source = config.auto_source)
|
|
128
|
-
|
|
129
|
-
AutoSource.new(response, auto_source, request_session:).articles
|
|
130
|
-
end
|
|
131
|
-
|
|
132
|
-
def build_rss_feed(response:, config:, articles:)
|
|
133
|
-
channel = RssBuilder::Channel.new(response, overrides: config.channel)
|
|
134
|
-
|
|
135
|
-
RssBuilder.new(channel:, articles:, stylesheets: config.stylesheets).call
|
|
136
|
-
end
|
|
137
|
-
|
|
138
|
-
def build_json_feed(response:, config:, articles:)
|
|
139
|
-
channel = RssBuilder::Channel.new(response, overrides: config.channel)
|
|
140
|
-
|
|
141
|
-
JsonFeedBuilder.new(channel:, articles:).call
|
|
142
|
-
end
|
|
143
|
-
|
|
144
|
-
def explicit_request_control_keys(strategy:, max_redirects:, max_requests:)
|
|
145
|
-
keys = []
|
|
146
|
-
keys << :strategy unless strategy == :faraday
|
|
147
|
-
keys << :max_redirects unless max_redirects.nil?
|
|
148
|
-
keys << :max_requests unless max_requests.nil?
|
|
149
|
-
keys
|
|
150
|
-
end
|
|
151
|
-
|
|
152
81
|
def build_auto_source_config(url:, strategy:, items_selector:, max_redirects:, max_requests:)
|
|
153
82
|
Config.auto_source_config(
|
|
154
83
|
url:,
|
|
@@ -165,6 +94,14 @@ module Html2rss
|
|
|
165
94
|
explicit_keys: explicit_request_control_keys(strategy:, max_redirects:, max_requests:)
|
|
166
95
|
)
|
|
167
96
|
end
|
|
97
|
+
|
|
98
|
+
def explicit_request_control_keys(strategy:, max_redirects:, max_requests:)
|
|
99
|
+
keys = []
|
|
100
|
+
keys << :strategy unless strategy.nil? || strategy == Config.default_strategy_name
|
|
101
|
+
keys << :max_redirects unless max_redirects.nil?
|
|
102
|
+
keys << :max_requests unless max_requests.nil?
|
|
103
|
+
keys
|
|
104
|
+
end
|
|
168
105
|
end
|
|
169
106
|
end
|
|
170
107
|
|
|
data/schema/html2rss-config.schema.json
CHANGED
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
"type": "object",
|
|
4
4
|
"properties": {
|
|
5
5
|
"strategy": {
|
|
6
|
+
"type": "string",
|
|
6
7
|
"not": {
|
|
7
8
|
"type": "null"
|
|
8
9
|
}
|
|
@@ -445,13 +446,94 @@
|
|
|
445
446
|
}
|
|
446
447
|
},
|
|
447
448
|
"required": []
|
|
449
|
+
},
|
|
450
|
+
"botasaurus": {
|
|
451
|
+
"type": "object",
|
|
452
|
+
"properties": {
|
|
453
|
+
"navigation_mode": {
|
|
454
|
+
"type": "string",
|
|
455
|
+
"minLength": 1,
|
|
456
|
+
"enum": [
|
|
457
|
+
"auto",
|
|
458
|
+
"get",
|
|
459
|
+
"google_get",
|
|
460
|
+
"google_get_bypass"
|
|
461
|
+
]
|
|
462
|
+
},
|
|
463
|
+
"max_retries": {
|
|
464
|
+
"type": "integer",
|
|
465
|
+
"not": {
|
|
466
|
+
"type": "null"
|
|
467
|
+
},
|
|
468
|
+
"minimum": 0,
|
|
469
|
+
"maximum": 3
|
|
470
|
+
},
|
|
471
|
+
"wait_for_selector": {
|
|
472
|
+
"type": [
|
|
473
|
+
"null",
|
|
474
|
+
"string"
|
|
475
|
+
]
|
|
476
|
+
},
|
|
477
|
+
"wait_timeout_seconds": {
|
|
478
|
+
"type": "integer",
|
|
479
|
+
"not": {
|
|
480
|
+
"type": "null"
|
|
481
|
+
},
|
|
482
|
+
"exclusiveMinimum": 0
|
|
483
|
+
},
|
|
484
|
+
"block_images": {
|
|
485
|
+
"type": "boolean",
|
|
486
|
+
"not": {
|
|
487
|
+
"type": "null"
|
|
488
|
+
}
|
|
489
|
+
},
|
|
490
|
+
"block_images_and_css": {
|
|
491
|
+
"type": "boolean",
|
|
492
|
+
"not": {
|
|
493
|
+
"type": "null"
|
|
494
|
+
}
|
|
495
|
+
},
|
|
496
|
+
"wait_for_complete_page_load": {
|
|
497
|
+
"type": "boolean",
|
|
498
|
+
"not": {
|
|
499
|
+
"type": "null"
|
|
500
|
+
}
|
|
501
|
+
},
|
|
502
|
+
"headless": {
|
|
503
|
+
"type": "boolean",
|
|
504
|
+
"not": {
|
|
505
|
+
"type": "null"
|
|
506
|
+
}
|
|
507
|
+
},
|
|
508
|
+
"proxy": {
|
|
509
|
+
"type": "string",
|
|
510
|
+
"minLength": 1
|
|
511
|
+
},
|
|
512
|
+
"user_agent": {
|
|
513
|
+
"type": "string",
|
|
514
|
+
"minLength": 1
|
|
515
|
+
},
|
|
516
|
+
"window_size": {
|
|
517
|
+
"type": "array",
|
|
518
|
+
"items": {
|
|
519
|
+
"minLength": 2,
|
|
520
|
+
"maxLength": 2,
|
|
521
|
+
"type": "integer",
|
|
522
|
+
"exclusiveMinimum": 0
|
|
523
|
+
}
|
|
524
|
+
},
|
|
525
|
+
"lang": {
|
|
526
|
+
"type": "string",
|
|
527
|
+
"minLength": 1
|
|
528
|
+
}
|
|
529
|
+
},
|
|
530
|
+
"required": []
|
|
448
531
|
}
|
|
449
532
|
},
|
|
450
533
|
"required": []
|
|
451
534
|
}
|
|
452
535
|
},
|
|
453
536
|
"required": [
|
|
454
|
-
"strategy",
|
|
455
537
|
"channel"
|
|
456
538
|
],
|
|
457
539
|
"anyOf": [
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: html2rss
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.18.0
|
|
4
|
+
version: 0.19.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Gil Desmarais
|
|
@@ -315,6 +315,9 @@ files:
|
|
|
315
315
|
- lib/html2rss/config/schema.rb
|
|
316
316
|
- lib/html2rss/config/validator.rb
|
|
317
317
|
- lib/html2rss/error.rb
|
|
318
|
+
- lib/html2rss/feed_pipeline.rb
|
|
319
|
+
- lib/html2rss/feed_pipeline/auto_fallback.rb
|
|
320
|
+
- lib/html2rss/hash_util.rb
|
|
318
321
|
- lib/html2rss/html_extractor.rb
|
|
319
322
|
- lib/html2rss/html_extractor/date_extractor.rb
|
|
320
323
|
- lib/html2rss/html_extractor/enclosure_extractor.rb
|
|
@@ -331,6 +334,8 @@ files:
|
|
|
331
334
|
- lib/html2rss/rendering/video_renderer.rb
|
|
332
335
|
- lib/html2rss/request_controls.rb
|
|
333
336
|
- lib/html2rss/request_service.rb
|
|
337
|
+
- lib/html2rss/request_service/botasaurus_contract.rb
|
|
338
|
+
- lib/html2rss/request_service/botasaurus_strategy.rb
|
|
334
339
|
- lib/html2rss/request_service/browserless_strategy.rb
|
|
335
340
|
- lib/html2rss/request_service/budget.rb
|
|
336
341
|
- lib/html2rss/request_service/context.rb
|
|
@@ -379,7 +384,7 @@ licenses:
|
|
|
379
384
|
- MIT
|
|
380
385
|
metadata:
|
|
381
386
|
allowed_push_host: https://rubygems.org
|
|
382
|
-
changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.18.0
|
|
387
|
+
changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.19.0
|
|
383
388
|
rubygems_mfa_required: 'true'
|
|
384
389
|
rdoc_options: []
|
|
385
390
|
require_paths:
|