html2rss 0.17.0 → 0.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +90 -639
- data/exe/html2rss +1 -1
- data/html2rss.gemspec +5 -2
- data/lib/html2rss/articles/deduplicator.rb +50 -0
- data/lib/html2rss/auto_source/cleanup.rb +44 -5
- data/lib/html2rss/auto_source/scraper/html.rb +123 -43
- data/lib/html2rss/auto_source/scraper/json_state.rb +457 -0
- data/lib/html2rss/auto_source/scraper/microdata.rb +505 -0
- data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
- data/lib/html2rss/auto_source/scraper/schema/item_list.rb +3 -2
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +6 -4
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +69 -8
- data/lib/html2rss/auto_source/scraper/schema.rb +27 -12
- data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +204 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +88 -79
- data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +311 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +135 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api.rb +186 -0
- data/lib/html2rss/auto_source/scraper.rb +160 -8
- data/lib/html2rss/auto_source.rb +123 -47
- data/lib/html2rss/blocked_surface.rb +65 -0
- data/lib/html2rss/category_extractor.rb +82 -0
- data/lib/html2rss/cli.rb +194 -23
- data/lib/html2rss/config/class_methods.rb +178 -0
- data/lib/html2rss/config/dynamic_params.rb +70 -0
- data/lib/html2rss/config/multiple_feeds_config.rb +51 -0
- data/lib/html2rss/config/request_headers.rb +136 -0
- data/lib/html2rss/config/schema.rb +240 -0
- data/lib/html2rss/config/validator.rb +146 -0
- data/lib/html2rss/config.rb +118 -61
- data/lib/html2rss/error.rb +31 -0
- data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
- data/lib/html2rss/feed_pipeline.rb +127 -0
- data/lib/html2rss/hash_util.rb +101 -0
- data/lib/html2rss/html_extractor/date_extractor.rb +20 -0
- data/lib/html2rss/html_extractor/enclosure_extractor.rb +120 -0
- data/lib/html2rss/html_extractor/image_extractor.rb +58 -0
- data/lib/html2rss/html_extractor.rb +141 -0
- data/lib/html2rss/html_navigator.rb +54 -0
- data/lib/html2rss/json_feed_builder/item.rb +94 -0
- data/lib/html2rss/json_feed_builder.rb +59 -0
- data/lib/html2rss/rendering/audio_renderer.rb +36 -0
- data/lib/html2rss/rendering/description_builder.rb +87 -0
- data/lib/html2rss/rendering/image_renderer.rb +41 -0
- data/lib/html2rss/rendering/media_renderer.rb +37 -0
- data/lib/html2rss/rendering/pdf_renderer.rb +34 -0
- data/lib/html2rss/rendering/video_renderer.rb +36 -0
- data/lib/html2rss/rendering.rb +23 -0
- data/lib/html2rss/request_controls.rb +123 -0
- data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
- data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
- data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
- data/lib/html2rss/request_service/budget.rb +39 -0
- data/lib/html2rss/request_service/context.rb +77 -21
- data/lib/html2rss/request_service/faraday_strategy.rb +137 -5
- data/lib/html2rss/request_service/policy.rb +252 -0
- data/lib/html2rss/request_service/puppet_commander.rb +212 -13
- data/lib/html2rss/request_service/response.rb +51 -3
- data/lib/html2rss/request_service/response_guard.rb +62 -0
- data/lib/html2rss/request_service.rb +50 -15
- data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
- data/lib/html2rss/request_session/runtime_input.rb +71 -0
- data/lib/html2rss/request_session/runtime_policy.rb +83 -0
- data/lib/html2rss/request_session.rb +122 -0
- data/lib/html2rss/rss_builder/article.rb +187 -0
- data/lib/html2rss/rss_builder/channel.rb +105 -11
- data/lib/html2rss/rss_builder/enclosure.rb +62 -0
- data/lib/html2rss/rss_builder/stylesheet.rb +8 -4
- data/lib/html2rss/rss_builder.rb +76 -71
- data/lib/html2rss/selectors/config.rb +123 -0
- data/lib/html2rss/selectors/extractors/attribute.rb +52 -0
- data/lib/html2rss/selectors/extractors/href.rb +55 -0
- data/lib/html2rss/selectors/extractors/html.rb +49 -0
- data/lib/html2rss/selectors/extractors/static.rb +42 -0
- data/lib/html2rss/selectors/extractors/text.rb +47 -0
- data/lib/html2rss/selectors/extractors.rb +53 -0
- data/lib/html2rss/selectors/object_to_xml_converter.rb +62 -0
- data/lib/html2rss/selectors/post_processors/base.rb +80 -0
- data/lib/html2rss/selectors/post_processors/gsub.rb +88 -0
- data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +48 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +44 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +53 -0
- data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +55 -0
- data/lib/html2rss/selectors/post_processors/parse_time.rb +78 -0
- data/lib/html2rss/selectors/post_processors/parse_uri.rb +43 -0
- data/lib/html2rss/selectors/post_processors/sanitize_html.rb +154 -0
- data/lib/html2rss/selectors/post_processors/substring.rb +77 -0
- data/lib/html2rss/selectors/post_processors/template.rb +76 -0
- data/lib/html2rss/selectors/post_processors.rb +48 -0
- data/lib/html2rss/selectors.rb +301 -0
- data/lib/html2rss/url.rb +266 -0
- data/lib/html2rss/version.rb +2 -1
- data/lib/html2rss.rb +67 -71
- data/lib/tasks/config_schema.rake +17 -0
- data/schema/html2rss-config.schema.json +551 -0
- metadata +120 -38
- data/lib/html2rss/attribute_post_processors/base.rb +0 -74
- data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
- data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
- data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
- data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
- data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
- data/lib/html2rss/attribute_post_processors/template.rb +0 -101
- data/lib/html2rss/attribute_post_processors.rb +0 -44
- data/lib/html2rss/auto_source/article.rb +0 -127
- data/lib/html2rss/auto_source/channel.rb +0 -78
- data/lib/html2rss/auto_source/reducer.rb +0 -48
- data/lib/html2rss/auto_source/rss_builder.rb +0 -70
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
- data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
- data/lib/html2rss/config/channel.rb +0 -125
- data/lib/html2rss/config/selectors.rb +0 -103
- data/lib/html2rss/item.rb +0 -186
- data/lib/html2rss/item_extractors/attribute.rb +0 -50
- data/lib/html2rss/item_extractors/href.rb +0 -52
- data/lib/html2rss/item_extractors/html.rb +0 -46
- data/lib/html2rss/item_extractors/static.rb +0 -39
- data/lib/html2rss/item_extractors/text.rb +0 -44
- data/lib/html2rss/item_extractors.rb +0 -88
- data/lib/html2rss/object_to_xml_converter.rb +0 -56
- data/lib/html2rss/rss_builder/item.rb +0 -83
- data/lib/html2rss/utils.rb +0 -113
data/lib/html2rss.rb
CHANGED
|
@@ -3,11 +3,10 @@
|
|
|
3
3
|
require 'zeitwerk'
|
|
4
4
|
|
|
5
5
|
loader = Zeitwerk::Loader.for_gem
|
|
6
|
+
loader.inflector.inflect('cli' => 'CLI')
|
|
6
7
|
loader.setup
|
|
7
8
|
|
|
8
|
-
require 'addressable'
|
|
9
9
|
require 'logger'
|
|
10
|
-
require 'yaml'
|
|
11
10
|
|
|
12
11
|
##
|
|
13
12
|
# The Html2rss namespace.
|
|
@@ -23,90 +22,87 @@ module Html2rss
|
|
|
23
22
|
end
|
|
24
23
|
|
|
25
24
|
##
|
|
26
|
-
#
|
|
27
|
-
class Error < StandardError; end
|
|
28
|
-
|
|
29
|
-
##
|
|
30
|
-
# Key for the feeds configuration in the YAML file.
|
|
31
|
-
CONFIG_KEY_FEEDS = :feeds
|
|
32
|
-
|
|
33
|
-
##
|
|
34
|
-
# Returns an RSS object generated from the provided YAML file configuration.
|
|
35
|
-
#
|
|
36
|
-
# Example:
|
|
37
|
-
#
|
|
38
|
-
# feed = Html2rss.feed_from_yaml_config(File.join(['spec', 'config.test.yml']), 'nuxt-releases')
|
|
39
|
-
# # => #<RSS::Rss:0x00007fb2f6331228
|
|
25
|
+
# Loads a feed configuration from YAML.
|
|
40
26
|
#
|
|
41
|
-
# @param file [String]
|
|
42
|
-
# @param
|
|
43
|
-
# @
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
def self.feed_from_yaml_config(file, name = nil, global_config: {}, params: {})
|
|
47
|
-
yaml = YAML.safe_load_file(file, symbolize_names: true)
|
|
48
|
-
feeds = yaml[CONFIG_KEY_FEEDS] || {}
|
|
49
|
-
|
|
50
|
-
feed_config = find_feed_config(yaml, feeds, name, global_config)
|
|
51
|
-
|
|
52
|
-
feed(Config.new(feed_config, global_config, params))
|
|
27
|
+
# @param file [String] path to the YAML file
|
|
28
|
+
# @param feed_name [String, nil] optional feed name inside a multi-feed config
|
|
29
|
+
# @return [Hash{Symbol => Object}] loaded configuration hash
|
|
30
|
+
def self.config_from_yaml_file(file, feed_name = nil)
|
|
31
|
+
Config.load_yaml(file, feed_name)
|
|
53
32
|
end
|
|
54
33
|
|
|
55
34
|
##
|
|
56
35
|
# Returns an RSS object generated from the provided configuration.
|
|
57
36
|
#
|
|
58
|
-
#
|
|
59
|
-
#
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
# selectors: {
|
|
63
|
-
# items: { selector: '#hot-network-questions > ul > li' },
|
|
64
|
-
# title: { selector: 'a' },
|
|
65
|
-
# link: { selector: 'a', extractor: 'href' }
|
|
66
|
-
# }
|
|
67
|
-
# )
|
|
68
|
-
# # => #<RSS::Rss:0x00007fb2f48d14a0 ...>
|
|
69
|
-
#
|
|
70
|
-
# @param config [Hash<Symbol, Object>, Html2rss::Config] Feed configuration.
|
|
71
|
-
# @return [RSS::Rss] RSS object generated from the configuration.
|
|
72
|
-
def self.feed(config)
|
|
73
|
-
config = Config.new(config) unless config.is_a?(Config)
|
|
74
|
-
RssBuilder.build(config)
|
|
37
|
+
# @param raw_config [Hash{Symbol => Object}] feed configuration
|
|
38
|
+
# @return [RSS::Rss] generated RSS feed
|
|
39
|
+
def self.feed(raw_config)
|
|
40
|
+
FeedPipeline.new(raw_config).to_rss
|
|
75
41
|
end
|
|
76
42
|
|
|
77
43
|
##
|
|
78
|
-
#
|
|
44
|
+
# Returns a JSONFeed 1.1 hash generated from the provided configuration.
|
|
79
45
|
#
|
|
80
|
-
# @param
|
|
81
|
-
# @
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
# @return [Hash] Feed configuration.
|
|
85
|
-
def self.find_feed_config(yaml, feeds, feed_name, global_config)
|
|
86
|
-
return yaml unless feed_name
|
|
87
|
-
|
|
88
|
-
feed_name = feed_name.to_sym
|
|
89
|
-
if feeds.key?(feed_name)
|
|
90
|
-
global_config.merge!(yaml.reject { |key| key == CONFIG_KEY_FEEDS })
|
|
91
|
-
feeds[feed_name]
|
|
92
|
-
else
|
|
93
|
-
yaml
|
|
94
|
-
end
|
|
46
|
+
# @param raw_config [Hash{Symbol => Object}] feed configuration
|
|
47
|
+
# @return [Hash] JSONFeed-compliant hash
|
|
48
|
+
def self.json_feed(raw_config)
|
|
49
|
+
FeedPipeline.new(raw_config).to_json_feed
|
|
95
50
|
end
|
|
96
51
|
|
|
97
52
|
##
|
|
98
53
|
# Scrapes the provided URL and returns an RSS object.
|
|
99
|
-
# No need for a "feed config".
|
|
100
54
|
#
|
|
101
|
-
# @param url [String]
|
|
102
|
-
# @param strategy [Symbol]
|
|
103
|
-
# @
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
55
|
+
# @param url [String] source page URL
|
|
56
|
+
# @param strategy [Symbol] request strategy to use
|
|
57
|
+
# @param items_selector [String, nil] optional selector hint for item extraction
|
|
58
|
+
# @param max_redirects [Integer, nil] optional redirect limit override
|
|
59
|
+
# @param max_requests [Integer, nil] optional request budget override
|
|
60
|
+
# @return [RSS::Rss] generated RSS feed
|
|
61
|
+
def self.auto_source(url, strategy: :auto, items_selector: nil, max_redirects: nil, max_requests: nil)
|
|
62
|
+
feed(build_auto_source_config(url:, strategy:, items_selector:, max_redirects:, max_requests:))
|
|
109
63
|
end
|
|
110
64
|
|
|
111
|
-
|
|
65
|
+
##
|
|
66
|
+
# Scrapes the provided URL and returns a JSONFeed 1.1 hash.
|
|
67
|
+
#
|
|
68
|
+
# @param url [String] source page URL
|
|
69
|
+
# @param strategy [Symbol] request strategy to use
|
|
70
|
+
# @param items_selector [String, nil] optional selector hint for item extraction
|
|
71
|
+
# @param max_redirects [Integer, nil] optional redirect limit override
|
|
72
|
+
# @param max_requests [Integer, nil] optional request budget override
|
|
73
|
+
# @return [Hash] JSONFeed-compliant hash
|
|
74
|
+
def self.auto_json_feed(url, strategy: :auto, items_selector: nil, max_redirects: nil, max_requests: nil)
|
|
75
|
+
json_feed(build_auto_source_config(url:, strategy:, items_selector:, max_redirects:, max_requests:))
|
|
76
|
+
end
|
|
77
|
+
|
|
78
|
+
class << self
|
|
79
|
+
private
|
|
80
|
+
|
|
81
|
+
def build_auto_source_config(url:, strategy:, items_selector:, max_redirects:, max_requests:)
|
|
82
|
+
Config.auto_source_config(
|
|
83
|
+
url:,
|
|
84
|
+
items_selector:,
|
|
85
|
+
request_controls: shortcut_request_controls(strategy:, max_redirects:, max_requests:)
|
|
86
|
+
)
|
|
87
|
+
end
|
|
88
|
+
|
|
89
|
+
def shortcut_request_controls(strategy:, max_redirects:, max_requests:)
|
|
90
|
+
RequestControls.new(
|
|
91
|
+
strategy:,
|
|
92
|
+
max_redirects:,
|
|
93
|
+
max_requests:,
|
|
94
|
+
explicit_keys: explicit_request_control_keys(strategy:, max_redirects:, max_requests:)
|
|
95
|
+
)
|
|
96
|
+
end
|
|
97
|
+
|
|
98
|
+
def explicit_request_control_keys(strategy:, max_redirects:, max_requests:)
|
|
99
|
+
keys = []
|
|
100
|
+
keys << :strategy unless strategy.nil? || strategy == Config.default_strategy_name
|
|
101
|
+
keys << :max_redirects unless max_redirects.nil?
|
|
102
|
+
keys << :max_requests unless max_requests.nil?
|
|
103
|
+
keys
|
|
104
|
+
end
|
|
105
|
+
end
|
|
112
106
|
end
|
|
107
|
+
|
|
108
|
+
loader.eager_load
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
require 'fileutils'
|
|
5
|
+
require_relative '../html2rss'
|
|
6
|
+
|
|
7
|
+
namespace :config do
|
|
8
|
+
desc 'Generate config JSON schema'
|
|
9
|
+
task :schema do
|
|
10
|
+
destination = Html2rss::Config.schema_path
|
|
11
|
+
|
|
12
|
+
FileUtils.mkdir_p(File.dirname(destination))
|
|
13
|
+
File.write(destination, "#{Html2rss::Config.json_schema_json}\n")
|
|
14
|
+
|
|
15
|
+
puts "Generated config schema at #{destination}"
|
|
16
|
+
end
|
|
17
|
+
end
|