html2rss 0.17.0 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. checksums.yaml +4 -4
  2. data/README.md +90 -639
  3. data/exe/html2rss +1 -1
  4. data/html2rss.gemspec +5 -2
  5. data/lib/html2rss/articles/deduplicator.rb +50 -0
  6. data/lib/html2rss/auto_source/cleanup.rb +44 -5
  7. data/lib/html2rss/auto_source/scraper/html.rb +123 -43
  8. data/lib/html2rss/auto_source/scraper/json_state.rb +457 -0
  9. data/lib/html2rss/auto_source/scraper/microdata.rb +505 -0
  10. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
  11. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +3 -2
  12. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +6 -4
  13. data/lib/html2rss/auto_source/scraper/schema/thing.rb +69 -8
  14. data/lib/html2rss/auto_source/scraper/schema.rb +27 -12
  15. data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +204 -0
  16. data/lib/html2rss/auto_source/scraper/semantic_html.rb +88 -79
  17. data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +311 -0
  18. data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +135 -0
  19. data/lib/html2rss/auto_source/scraper/wordpress_api.rb +186 -0
  20. data/lib/html2rss/auto_source/scraper.rb +160 -8
  21. data/lib/html2rss/auto_source.rb +123 -47
  22. data/lib/html2rss/blocked_surface.rb +65 -0
  23. data/lib/html2rss/category_extractor.rb +82 -0
  24. data/lib/html2rss/cli.rb +194 -23
  25. data/lib/html2rss/config/class_methods.rb +178 -0
  26. data/lib/html2rss/config/dynamic_params.rb +70 -0
  27. data/lib/html2rss/config/multiple_feeds_config.rb +51 -0
  28. data/lib/html2rss/config/request_headers.rb +136 -0
  29. data/lib/html2rss/config/schema.rb +240 -0
  30. data/lib/html2rss/config/validator.rb +146 -0
  31. data/lib/html2rss/config.rb +118 -61
  32. data/lib/html2rss/error.rb +31 -0
  33. data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
  34. data/lib/html2rss/feed_pipeline.rb +127 -0
  35. data/lib/html2rss/hash_util.rb +101 -0
  36. data/lib/html2rss/html_extractor/date_extractor.rb +20 -0
  37. data/lib/html2rss/html_extractor/enclosure_extractor.rb +120 -0
  38. data/lib/html2rss/html_extractor/image_extractor.rb +58 -0
  39. data/lib/html2rss/html_extractor.rb +141 -0
  40. data/lib/html2rss/html_navigator.rb +54 -0
  41. data/lib/html2rss/json_feed_builder/item.rb +94 -0
  42. data/lib/html2rss/json_feed_builder.rb +59 -0
  43. data/lib/html2rss/rendering/audio_renderer.rb +36 -0
  44. data/lib/html2rss/rendering/description_builder.rb +87 -0
  45. data/lib/html2rss/rendering/image_renderer.rb +41 -0
  46. data/lib/html2rss/rendering/media_renderer.rb +37 -0
  47. data/lib/html2rss/rendering/pdf_renderer.rb +34 -0
  48. data/lib/html2rss/rendering/video_renderer.rb +36 -0
  49. data/lib/html2rss/rendering.rb +23 -0
  50. data/lib/html2rss/request_controls.rb +123 -0
  51. data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
  52. data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
  53. data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
  54. data/lib/html2rss/request_service/budget.rb +39 -0
  55. data/lib/html2rss/request_service/context.rb +77 -21
  56. data/lib/html2rss/request_service/faraday_strategy.rb +137 -5
  57. data/lib/html2rss/request_service/policy.rb +252 -0
  58. data/lib/html2rss/request_service/puppet_commander.rb +212 -13
  59. data/lib/html2rss/request_service/response.rb +51 -3
  60. data/lib/html2rss/request_service/response_guard.rb +62 -0
  61. data/lib/html2rss/request_service.rb +50 -15
  62. data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
  63. data/lib/html2rss/request_session/runtime_input.rb +71 -0
  64. data/lib/html2rss/request_session/runtime_policy.rb +83 -0
  65. data/lib/html2rss/request_session.rb +122 -0
  66. data/lib/html2rss/rss_builder/article.rb +187 -0
  67. data/lib/html2rss/rss_builder/channel.rb +105 -11
  68. data/lib/html2rss/rss_builder/enclosure.rb +62 -0
  69. data/lib/html2rss/rss_builder/stylesheet.rb +8 -4
  70. data/lib/html2rss/rss_builder.rb +76 -71
  71. data/lib/html2rss/selectors/config.rb +123 -0
  72. data/lib/html2rss/selectors/extractors/attribute.rb +52 -0
  73. data/lib/html2rss/selectors/extractors/href.rb +55 -0
  74. data/lib/html2rss/selectors/extractors/html.rb +49 -0
  75. data/lib/html2rss/selectors/extractors/static.rb +42 -0
  76. data/lib/html2rss/selectors/extractors/text.rb +47 -0
  77. data/lib/html2rss/selectors/extractors.rb +53 -0
  78. data/lib/html2rss/selectors/object_to_xml_converter.rb +62 -0
  79. data/lib/html2rss/selectors/post_processors/base.rb +80 -0
  80. data/lib/html2rss/selectors/post_processors/gsub.rb +88 -0
  81. data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +48 -0
  82. data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +44 -0
  83. data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +53 -0
  84. data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +55 -0
  85. data/lib/html2rss/selectors/post_processors/parse_time.rb +78 -0
  86. data/lib/html2rss/selectors/post_processors/parse_uri.rb +43 -0
  87. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +154 -0
  88. data/lib/html2rss/selectors/post_processors/substring.rb +77 -0
  89. data/lib/html2rss/selectors/post_processors/template.rb +76 -0
  90. data/lib/html2rss/selectors/post_processors.rb +48 -0
  91. data/lib/html2rss/selectors.rb +301 -0
  92. data/lib/html2rss/url.rb +266 -0
  93. data/lib/html2rss/version.rb +2 -1
  94. data/lib/html2rss.rb +67 -71
  95. data/lib/tasks/config_schema.rake +17 -0
  96. data/schema/html2rss-config.schema.json +551 -0
  97. metadata +120 -38
  98. data/lib/html2rss/attribute_post_processors/base.rb +0 -74
  99. data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
  100. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
  101. data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
  102. data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
  103. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
  104. data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
  105. data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
  106. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
  107. data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
  108. data/lib/html2rss/attribute_post_processors/template.rb +0 -101
  109. data/lib/html2rss/attribute_post_processors.rb +0 -44
  110. data/lib/html2rss/auto_source/article.rb +0 -127
  111. data/lib/html2rss/auto_source/channel.rb +0 -78
  112. data/lib/html2rss/auto_source/reducer.rb +0 -48
  113. data/lib/html2rss/auto_source/rss_builder.rb +0 -70
  114. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
  115. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
  116. data/lib/html2rss/config/channel.rb +0 -125
  117. data/lib/html2rss/config/selectors.rb +0 -103
  118. data/lib/html2rss/item.rb +0 -186
  119. data/lib/html2rss/item_extractors/attribute.rb +0 -50
  120. data/lib/html2rss/item_extractors/href.rb +0 -52
  121. data/lib/html2rss/item_extractors/html.rb +0 -46
  122. data/lib/html2rss/item_extractors/static.rb +0 -39
  123. data/lib/html2rss/item_extractors/text.rb +0 -44
  124. data/lib/html2rss/item_extractors.rb +0 -88
  125. data/lib/html2rss/object_to_xml_converter.rb +0 -56
  126. data/lib/html2rss/rss_builder/item.rb +0 -83
  127. data/lib/html2rss/utils.rb +0 -113
data/lib/html2rss.rb CHANGED
@@ -3,11 +3,10 @@
3
3
  require 'zeitwerk'
4
4
 
5
5
  loader = Zeitwerk::Loader.for_gem
6
+ loader.inflector.inflect('cli' => 'CLI')
6
7
  loader.setup
7
8
 
8
- require 'addressable'
9
9
  require 'logger'
10
- require 'yaml'
11
10
 
12
11
  ##
13
12
  # The Html2rss namespace.
@@ -23,90 +22,87 @@ module Html2rss
23
22
  end
24
23
 
25
24
  ##
26
- # The Html2rss::Error base class.
27
- class Error < StandardError; end
28
-
29
- ##
30
- # Key for the feeds configuration in the YAML file.
31
- CONFIG_KEY_FEEDS = :feeds
32
-
33
- ##
34
- # Returns an RSS object generated from the provided YAML file configuration.
35
- #
36
- # Example:
37
- #
38
- # feed = Html2rss.feed_from_yaml_config(File.join(['spec', 'config.test.yml']), 'nuxt-releases')
39
- # # => #<RSS::Rss:0x00007fb2f6331228
25
+ # Loads a feed configuration from YAML.
40
26
  #
41
- # @param file [String] Path to the YAML file.
42
- # @param name [String, Symbol, nil] Name of the feed in the YAML file.
43
- # @param global_config [Hash] Global options (e.g., HTTP headers).
44
- # @param params [Hash] Dynamic parameters for the feed configuration.
45
- # @return [RSS::Rss] RSS object generated from the configuration.
46
- def self.feed_from_yaml_config(file, name = nil, global_config: {}, params: {})
47
- yaml = YAML.safe_load_file(file, symbolize_names: true)
48
- feeds = yaml[CONFIG_KEY_FEEDS] || {}
49
-
50
- feed_config = find_feed_config(yaml, feeds, name, global_config)
51
-
52
- feed(Config.new(feed_config, global_config, params))
27
+ # @param file [String] path to the YAML file
28
+ # @param feed_name [String, nil] optional feed name inside a multi-feed config
29
+ # @return [Hash{Symbol => Object}] loaded configuration hash
30
+ def self.config_from_yaml_file(file, feed_name = nil)
31
+ Config.load_yaml(file, feed_name)
53
32
  end
54
33
 
55
34
  ##
56
35
  # Returns an RSS object generated from the provided configuration.
57
36
  #
58
- # Example:
59
- #
60
- # feed = Html2rss.feed(
61
- # channel: { name: 'StackOverflow: Hot Network Questions', url: 'https://stackoverflow.com' },
62
- # selectors: {
63
- # items: { selector: '#hot-network-questions > ul > li' },
64
- # title: { selector: 'a' },
65
- # link: { selector: 'a', extractor: 'href' }
66
- # }
67
- # )
68
- # # => #<RSS::Rss:0x00007fb2f48d14a0 ...>
69
- #
70
- # @param config [Hash<Symbol, Object>, Html2rss::Config] Feed configuration.
71
- # @return [RSS::Rss] RSS object generated from the configuration.
72
- def self.feed(config)
73
- config = Config.new(config) unless config.is_a?(Config)
74
- RssBuilder.build(config)
37
+ # @param raw_config [Hash{Symbol => Object}] feed configuration
38
+ # @return [RSS::Rss] generated RSS feed
39
+ def self.feed(raw_config)
40
+ FeedPipeline.new(raw_config).to_rss
75
41
  end
76
42
 
77
43
  ##
78
- # Builds the feed configuration based on the provided parameters.
44
+ # Returns a JSONFeed 1.1 hash generated from the provided configuration.
79
45
  #
80
- # @param yaml [Hash] Parsed YAML content.
81
- # @param feeds [Hash] Feeds from the YAML content.
82
- # @param feed_name [String, Symbol, nil] Name of the feed in the YAML file.
83
- # @param global_config [Hash] Global options (e.g., HTTP headers).
84
- # @return [Hash] Feed configuration.
85
- def self.find_feed_config(yaml, feeds, feed_name, global_config)
86
- return yaml unless feed_name
87
-
88
- feed_name = feed_name.to_sym
89
- if feeds.key?(feed_name)
90
- global_config.merge!(yaml.reject { |key| key == CONFIG_KEY_FEEDS })
91
- feeds[feed_name]
92
- else
93
- yaml
94
- end
46
+ # @param raw_config [Hash{Symbol => Object}] feed configuration
47
+ # @return [Hash] JSONFeed-compliant hash
48
+ def self.json_feed(raw_config)
49
+ FeedPipeline.new(raw_config).to_json_feed
95
50
  end
96
51
 
97
52
  ##
98
53
  # Scrapes the provided URL and returns an RSS object.
99
- # No need for a "feed config".
100
54
  #
101
- # @param url [String] the URL to automatically source the feed from
102
- # @param strategy [Symbol] the request strategy to use
103
- # @return [RSS::Rss]
104
- def self.auto_source(url, strategy: :faraday)
105
- ctx = RequestService::Context.new(url:, headers: {})
106
- response = RequestService.execute(ctx, strategy:)
107
-
108
- Html2rss::AutoSource.new(ctx.url, body: response.body, headers: response.headers).build
55
+ # @param url [String] source page URL
56
+ # @param strategy [Symbol] request strategy to use
57
+ # @param items_selector [String, nil] optional selector hint for item extraction
58
+ # @param max_redirects [Integer, nil] optional redirect limit override
59
+ # @param max_requests [Integer, nil] optional request budget override
60
+ # @return [RSS::Rss] generated RSS feed
61
+ def self.auto_source(url, strategy: :auto, items_selector: nil, max_redirects: nil, max_requests: nil)
62
+ feed(build_auto_source_config(url:, strategy:, items_selector:, max_redirects:, max_requests:))
109
63
  end
110
64
 
111
- private_class_method :find_feed_config
65
+ ##
66
+ # Scrapes the provided URL and returns a JSONFeed 1.1 hash.
67
+ #
68
+ # @param url [String] source page URL
69
+ # @param strategy [Symbol] request strategy to use
70
+ # @param items_selector [String, nil] optional selector hint for item extraction
71
+ # @param max_redirects [Integer, nil] optional redirect limit override
72
+ # @param max_requests [Integer, nil] optional request budget override
73
+ # @return [Hash] JSONFeed-compliant hash
74
+ def self.auto_json_feed(url, strategy: :auto, items_selector: nil, max_redirects: nil, max_requests: nil)
75
+ json_feed(build_auto_source_config(url:, strategy:, items_selector:, max_redirects:, max_requests:))
76
+ end
77
+
78
+ class << self
79
+ private
80
+
81
+ def build_auto_source_config(url:, strategy:, items_selector:, max_redirects:, max_requests:)
82
+ Config.auto_source_config(
83
+ url:,
84
+ items_selector:,
85
+ request_controls: shortcut_request_controls(strategy:, max_redirects:, max_requests:)
86
+ )
87
+ end
88
+
89
+ def shortcut_request_controls(strategy:, max_redirects:, max_requests:)
90
+ RequestControls.new(
91
+ strategy:,
92
+ max_redirects:,
93
+ max_requests:,
94
+ explicit_keys: explicit_request_control_keys(strategy:, max_redirects:, max_requests:)
95
+ )
96
+ end
97
+
98
+ def explicit_request_control_keys(strategy:, max_redirects:, max_requests:)
99
+ keys = []
100
+ keys << :strategy unless strategy.nil? || strategy == Config.default_strategy_name
101
+ keys << :max_redirects unless max_redirects.nil?
102
+ keys << :max_requests unless max_requests.nil?
103
+ keys
104
+ end
105
+ end
112
106
  end
107
+
108
+ loader.eager_load
data/lib/tasks/config_schema.rake ADDED
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+ require 'fileutils'
5
+ require_relative '../html2rss'
6
+
7
+ namespace :config do
8
+ desc 'Generate config JSON schema'
9
+ task :schema do
10
+ destination = Html2rss::Config.schema_path
11
+
12
+ FileUtils.mkdir_p(File.dirname(destination))
13
+ File.write(destination, "#{Html2rss::Config.json_schema_json}\n")
14
+
15
+ puts "Generated config schema at #{destination}"
16
+ end
17
+ end