html2rss 0.18.0 → 0.19.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +60 -1
  3. data/lib/html2rss/articles/deduplicator.rb +1 -0
  4. data/lib/html2rss/auto_source/cleanup.rb +11 -0
  5. data/lib/html2rss/auto_source/scraper/html.rb +5 -0
  6. data/lib/html2rss/auto_source/scraper/json_state.rb +96 -16
  7. data/lib/html2rss/auto_source/scraper/microdata.rb +107 -1
  8. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +1 -1
  9. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +1 -0
  10. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -1
  11. data/lib/html2rss/auto_source/scraper/schema/thing.rb +21 -0
  12. data/lib/html2rss/auto_source/scraper/schema.rb +15 -4
  13. data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +5 -0
  14. data/lib/html2rss/auto_source/scraper/semantic_html.rb +4 -0
  15. data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +60 -10
  16. data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +3 -2
  17. data/lib/html2rss/auto_source/scraper/wordpress_api.rb +19 -12
  18. data/lib/html2rss/auto_source/scraper.rb +19 -1
  19. data/lib/html2rss/auto_source.rb +4 -0
  20. data/lib/html2rss/blocked_surface.rb +1 -0
  21. data/lib/html2rss/category_extractor.rb +2 -2
  22. data/lib/html2rss/cli.rb +30 -6
  23. data/lib/html2rss/config/class_methods.rb +24 -35
  24. data/lib/html2rss/config/dynamic_params.rb +6 -4
  25. data/lib/html2rss/config/multiple_feeds_config.rb +3 -2
  26. data/lib/html2rss/config/request_headers.rb +9 -3
  27. data/lib/html2rss/config/schema.rb +33 -1
  28. data/lib/html2rss/config/validator.rb +40 -2
  29. data/lib/html2rss/config.rb +19 -13
  30. data/lib/html2rss/error.rb +25 -0
  31. data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
  32. data/lib/html2rss/feed_pipeline.rb +127 -0
  33. data/lib/html2rss/hash_util.rb +101 -0
  34. data/lib/html2rss/html_extractor/date_extractor.rb +1 -0
  35. data/lib/html2rss/html_extractor/enclosure_extractor.rb +19 -0
  36. data/lib/html2rss/html_extractor/image_extractor.rb +9 -0
  37. data/lib/html2rss/html_extractor.rb +5 -0
  38. data/lib/html2rss/html_navigator.rb +8 -0
  39. data/lib/html2rss/json_feed_builder.rb +1 -0
  40. data/lib/html2rss/rendering/audio_renderer.rb +8 -3
  41. data/lib/html2rss/rendering/description_builder.rb +0 -1
  42. data/lib/html2rss/rendering/image_renderer.rb +17 -7
  43. data/lib/html2rss/rendering/media_renderer.rb +4 -0
  44. data/lib/html2rss/rendering/pdf_renderer.rb +11 -5
  45. data/lib/html2rss/rendering/video_renderer.rb +8 -3
  46. data/lib/html2rss/rendering.rb +11 -2
  47. data/lib/html2rss/request_controls.rb +16 -21
  48. data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
  49. data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
  50. data/lib/html2rss/request_service/context.rb +14 -2
  51. data/lib/html2rss/request_service/faraday_strategy.rb +6 -4
  52. data/lib/html2rss/request_service/policy.rb +4 -0
  53. data/lib/html2rss/request_service/response.rb +9 -1
  54. data/lib/html2rss/request_service.rb +19 -0
  55. data/lib/html2rss/request_session/runtime_input.rb +16 -2
  56. data/lib/html2rss/request_session/runtime_policy.rb +7 -0
  57. data/lib/html2rss/request_session.rb +13 -9
  58. data/lib/html2rss/rss_builder/article.rb +22 -1
  59. data/lib/html2rss/rss_builder/channel.rb +11 -2
  60. data/lib/html2rss/rss_builder/enclosure.rb +15 -1
  61. data/lib/html2rss/rss_builder/stylesheet.rb +4 -0
  62. data/lib/html2rss/rss_builder.rb +4 -0
  63. data/lib/html2rss/selectors/config.rb +1 -0
  64. data/lib/html2rss/selectors/extractors/attribute.rb +2 -0
  65. data/lib/html2rss/selectors/extractors/href.rb +2 -0
  66. data/lib/html2rss/selectors/extractors/html.rb +1 -0
  67. data/lib/html2rss/selectors/extractors/static.rb +2 -1
  68. data/lib/html2rss/selectors/extractors/text.rb +1 -0
  69. data/lib/html2rss/selectors/extractors.rb +2 -1
  70. data/lib/html2rss/selectors/object_to_xml_converter.rb +1 -0
  71. data/lib/html2rss/selectors/post_processors/base.rb +13 -7
  72. data/lib/html2rss/selectors/post_processors/gsub.rb +3 -0
  73. data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +3 -0
  74. data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +9 -0
  75. data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +6 -0
  76. data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +3 -0
  77. data/lib/html2rss/selectors/post_processors/parse_time.rb +5 -0
  78. data/lib/html2rss/selectors/post_processors/parse_uri.rb +3 -0
  79. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +5 -1
  80. data/lib/html2rss/selectors/post_processors/substring.rb +3 -0
  81. data/lib/html2rss/selectors/post_processors/template.rb +3 -0
  82. data/lib/html2rss/selectors/post_processors.rb +5 -0
  83. data/lib/html2rss/selectors.rb +7 -0
  84. data/lib/html2rss/url.rb +27 -23
  85. data/lib/html2rss/version.rb +2 -1
  86. data/lib/html2rss.rb +15 -78
  87. data/schema/html2rss-config.schema.json +83 -1
  88. metadata +7 -2
data/lib/html2rss.rb CHANGED
@@ -26,7 +26,7 @@ module Html2rss
26
26
  #
27
27
  # @param file [String] path to the YAML file
28
28
  # @param feed_name [String, nil] optional feed name inside a multi-feed config
29
- # @return [Hash<Symbol, Object>] loaded configuration hash
29
+ # @return [Hash{Symbol => Object}] loaded configuration hash
30
30
  def self.config_from_yaml_file(file, feed_name = nil)
31
31
  Config.load_yaml(file, feed_name)
32
32
  end
@@ -34,23 +34,19 @@ module Html2rss
34
34
  ##
35
35
  # Returns an RSS object generated from the provided configuration.
36
36
  #
37
- # @param raw_config [Hash<Symbol, Object>] feed configuration
37
+ # @param raw_config [Hash{Symbol => Object}] feed configuration
38
38
  # @return [RSS::Rss] generated RSS feed
39
39
  def self.feed(raw_config)
40
- run_pipeline(raw_config) do |response:, config:, articles:|
41
- build_rss_feed(response:, config:, articles:)
42
- end
40
+ FeedPipeline.new(raw_config).to_rss
43
41
  end
44
42
 
45
43
  ##
46
44
  # Returns a JSONFeed 1.1 hash generated from the provided configuration.
47
45
  #
48
- # @param raw_config [Hash<Symbol, Object>] feed configuration
46
+ # @param raw_config [Hash{Symbol => Object}] feed configuration
49
47
  # @return [Hash] JSONFeed-compliant hash
50
48
  def self.json_feed(raw_config)
51
- run_pipeline(raw_config) do |response:, config:, articles:|
52
- build_json_feed(response:, config:, articles:)
53
- end
49
+ FeedPipeline.new(raw_config).to_json_feed
54
50
  end
55
51
 
56
52
  ##
@@ -62,7 +58,7 @@ module Html2rss
62
58
  # @param max_redirects [Integer, nil] optional redirect limit override
63
59
  # @param max_requests [Integer, nil] optional request budget override
64
60
  # @return [RSS::Rss] generated RSS feed
65
- def self.auto_source(url, strategy: :faraday, items_selector: nil, max_redirects: nil, max_requests: nil)
61
+ def self.auto_source(url, strategy: :auto, items_selector: nil, max_redirects: nil, max_requests: nil)
66
62
  feed(build_auto_source_config(url:, strategy:, items_selector:, max_redirects:, max_requests:))
67
63
  end
68
64
 
@@ -75,80 +71,13 @@ module Html2rss
75
71
  # @param max_redirects [Integer, nil] optional redirect limit override
76
72
  # @param max_requests [Integer, nil] optional request budget override
77
73
  # @return [Hash] JSONFeed-compliant hash
78
- def self.auto_json_feed(url, strategy: :faraday, items_selector: nil, max_redirects: nil, max_requests: nil)
74
+ def self.auto_json_feed(url, strategy: :auto, items_selector: nil, max_redirects: nil, max_requests: nil)
79
75
  json_feed(build_auto_source_config(url:, strategy:, items_selector:, max_redirects:, max_requests:))
80
76
  end
81
77
 
82
78
  class << self
83
79
  private
84
80
 
85
- def run_pipeline(raw_config)
86
- # 1. Normalize and validate the user-facing feed config.
87
- config = Config.from_hash(raw_config, params: raw_config[:params])
88
- runtime_input = RequestSession::RuntimeInput.from_config(config)
89
-
90
- # 2. Fetch the initial page using a shared request session.
91
- request_session = RequestSession.from_runtime_input(runtime_input)
92
- response = request_session.fetch_initial_response
93
-
94
- # 3. Collect articles from configured selectors and auto-source scrapers.
95
- articles = Articles::Deduplicator.new(
96
- collect_articles(response:, config:, request_session:)
97
- ).call
98
-
99
- # 4. Render the final output format chosen by the public entrypoint.
100
- yield response:, config:, articles:
101
- end
102
-
103
- def collect_articles(response:, config:, request_session:)
104
- selector_articles(response:, config:, request_session:) +
105
- auto_source_articles(response:, config:, request_session:)
106
- end
107
-
108
- def selector_articles(response:, config:, request_session:) # rubocop:disable Metrics/MethodLength
109
- return [] unless (selectors = config.selectors)
110
-
111
- page_responses = if (max_pages = selectors.dig(:items, :pagination, :max_pages))
112
- RequestSession::RelNextPager.new(
113
- session: request_session,
114
- initial_response: response,
115
- max_pages:
116
- ).to_a
117
- else
118
- [response]
119
- end
120
-
121
- page_responses.flat_map do |page_response|
122
- Selectors.new(page_response, selectors:, time_zone: config.time_zone).articles
123
- end
124
- end
125
-
126
- def auto_source_articles(response:, config:, request_session:)
127
- return [] unless (auto_source = config.auto_source)
128
-
129
- AutoSource.new(response, auto_source, request_session:).articles
130
- end
131
-
132
- def build_rss_feed(response:, config:, articles:)
133
- channel = RssBuilder::Channel.new(response, overrides: config.channel)
134
-
135
- RssBuilder.new(channel:, articles:, stylesheets: config.stylesheets).call
136
- end
137
-
138
- def build_json_feed(response:, config:, articles:)
139
- channel = RssBuilder::Channel.new(response, overrides: config.channel)
140
-
141
- JsonFeedBuilder.new(channel:, articles:).call
142
- end
143
-
144
- def explicit_request_control_keys(strategy:, max_redirects:, max_requests:)
145
- keys = []
146
- keys << :strategy unless strategy == :faraday
147
- keys << :max_redirects unless max_redirects.nil?
148
- keys << :max_requests unless max_requests.nil?
149
- keys
150
- end
151
-
152
81
  def build_auto_source_config(url:, strategy:, items_selector:, max_redirects:, max_requests:)
153
82
  Config.auto_source_config(
154
83
  url:,
@@ -165,6 +94,14 @@ module Html2rss
165
94
  explicit_keys: explicit_request_control_keys(strategy:, max_redirects:, max_requests:)
166
95
  )
167
96
  end
97
+
98
+ def explicit_request_control_keys(strategy:, max_redirects:, max_requests:)
99
+ keys = []
100
+ keys << :strategy unless strategy.nil? || strategy == Config.default_strategy_name
101
+ keys << :max_redirects unless max_redirects.nil?
102
+ keys << :max_requests unless max_requests.nil?
103
+ keys
104
+ end
168
105
  end
169
106
  end
170
107
 
@@ -3,6 +3,7 @@
3
3
  "type": "object",
4
4
  "properties": {
5
5
  "strategy": {
6
+ "type": "string",
6
7
  "not": {
7
8
  "type": "null"
8
9
  }
@@ -445,13 +446,94 @@
445
446
  }
446
447
  },
447
448
  "required": []
449
+ },
450
+ "botasaurus": {
451
+ "type": "object",
452
+ "properties": {
453
+ "navigation_mode": {
454
+ "type": "string",
455
+ "minLength": 1,
456
+ "enum": [
457
+ "auto",
458
+ "get",
459
+ "google_get",
460
+ "google_get_bypass"
461
+ ]
462
+ },
463
+ "max_retries": {
464
+ "type": "integer",
465
+ "not": {
466
+ "type": "null"
467
+ },
468
+ "minimum": 0,
469
+ "maximum": 3
470
+ },
471
+ "wait_for_selector": {
472
+ "type": [
473
+ "null",
474
+ "string"
475
+ ]
476
+ },
477
+ "wait_timeout_seconds": {
478
+ "type": "integer",
479
+ "not": {
480
+ "type": "null"
481
+ },
482
+ "exclusiveMinimum": 0
483
+ },
484
+ "block_images": {
485
+ "type": "boolean",
486
+ "not": {
487
+ "type": "null"
488
+ }
489
+ },
490
+ "block_images_and_css": {
491
+ "type": "boolean",
492
+ "not": {
493
+ "type": "null"
494
+ }
495
+ },
496
+ "wait_for_complete_page_load": {
497
+ "type": "boolean",
498
+ "not": {
499
+ "type": "null"
500
+ }
501
+ },
502
+ "headless": {
503
+ "type": "boolean",
504
+ "not": {
505
+ "type": "null"
506
+ }
507
+ },
508
+ "proxy": {
509
+ "type": "string",
510
+ "minLength": 1
511
+ },
512
+ "user_agent": {
513
+ "type": "string",
514
+ "minLength": 1
515
+ },
516
+ "window_size": {
517
+ "type": "array",
518
+ "items": {
519
+ "minLength": 2,
520
+ "maxLength": 2,
521
+ "type": "integer",
522
+ "exclusiveMinimum": 0
523
+ }
524
+ },
525
+ "lang": {
526
+ "type": "string",
527
+ "minLength": 1
528
+ }
529
+ },
530
+ "required": []
448
531
  }
449
532
  },
450
533
  "required": []
451
534
  }
452
535
  },
453
536
  "required": [
454
- "strategy",
455
537
  "channel"
456
538
  ],
457
539
  "anyOf": [
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html2rss
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.18.0
4
+ version: 0.19.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gil Desmarais
@@ -315,6 +315,9 @@ files:
315
315
  - lib/html2rss/config/schema.rb
316
316
  - lib/html2rss/config/validator.rb
317
317
  - lib/html2rss/error.rb
318
+ - lib/html2rss/feed_pipeline.rb
319
+ - lib/html2rss/feed_pipeline/auto_fallback.rb
320
+ - lib/html2rss/hash_util.rb
318
321
  - lib/html2rss/html_extractor.rb
319
322
  - lib/html2rss/html_extractor/date_extractor.rb
320
323
  - lib/html2rss/html_extractor/enclosure_extractor.rb
@@ -331,6 +334,8 @@ files:
331
334
  - lib/html2rss/rendering/video_renderer.rb
332
335
  - lib/html2rss/request_controls.rb
333
336
  - lib/html2rss/request_service.rb
337
+ - lib/html2rss/request_service/botasaurus_contract.rb
338
+ - lib/html2rss/request_service/botasaurus_strategy.rb
334
339
  - lib/html2rss/request_service/browserless_strategy.rb
335
340
  - lib/html2rss/request_service/budget.rb
336
341
  - lib/html2rss/request_service/context.rb
@@ -379,7 +384,7 @@ licenses:
379
384
  - MIT
380
385
  metadata:
381
386
  allowed_push_host: https://rubygems.org
382
- changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.18.0
387
+ changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.19.1
383
388
  rubygems_mfa_required: 'true'
384
389
  rdoc_options: []
385
390
  require_paths: