html2rss 0.21.0 → 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8168109d2cc60920d8a18b6b99970a5558e43163ad5cd11cb3d3f0d944d46943
4
- data.tar.gz: 833a936f89f9ce31c0b4fb0036020c7962a4ac77e0dfa72f1134a0bae8bea4c4
3
+ metadata.gz: 9ff7cdc4e25f3abc6da000e4f672d6832fb6027e49885aecc0cad38329c5e6ae
4
+ data.tar.gz: e42c216e328bb2c56971dd58871f023f15e672563e27741c56c6cf7fe4cb322a
5
5
  SHA512:
6
- metadata.gz: 734f286a486d49c86ab7baf48d157cdee9d988fdc8b693ac7d79bf3c64c661fcd54538d5e94dc19bdc8a6f3021168c1ecac2d8e34417f56879392d71600c7340
7
- data.tar.gz: f008a767b452557cff1b45b1abb0eccb26f38d839417e95d21b8cf74f4546f9143b067e2d11f9fc00f5955c3f145f8933a1b1d4912ad25756c890280d4bb1a37
6
+ metadata.gz: e52812f947561b9a52537f1b28c530f63e116194642d13ff526fac1ad32f02d7ea6ff8ca9b5ee16e2d7f686e1babec1908ca51399ba42ef9461ed3dbe0d02117
7
+ data.tar.gz: 4754495a5947aca6de71846d1c88128d9fc1826e1014fd00806f58e7cd1e1575dc3bd2380ced3aeecd0cd6d51e0366ddee7a1f7db0220750e13f66645de2edde
@@ -0,0 +1,185 @@
# frozen_string_literal: true

module Html2rss
  class AutoSource
    module Scraper
      class Html
        ##
        # ClassClustering clusters DOM elements on anchorless pages by class lists and scores
        # candidate groups to find the best list of content cards/articles.
        #
        # Pipeline (see #call): group nodes by normalized class list, drop groups that merely
        # wrap other groups, resolve parent/child groups of equal size, then pick the
        # highest-scoring remaining group.
        # rubocop:disable Metrics/ClassLength
        class ClassClustering
          # Node tags considered layout containers
          LAYOUT_TAG_NAMES = Set['div', 'section', 'article'].freeze
          # HTML/layout tags excluded from candidate nodes
          EXCLUDED_TAGS = Set['html', 'body', 'nav', 'footer', 'header', 'svg', 'script', 'style'].freeze
          # Numeric dates such as 2024-01-31 or 2024/01/31
          NUMERIC_DATE_PATTERN = %r{\b\d{4}[-/]\d{2}[-/]\d{2}\b}
          # English month names, abbreviated ("Jan", "Sept") or spelled out ("January").
          # The spelled-out forms must be listed explicitly: `\bJan\b` cannot match "January".
          MONTH_NAME_PATTERN = /
            \b(?:
              Jan(?:uary)? | Feb(?:ruary)? | Mar(?:ch)? | Apr(?:il)? | May | June? | July? |
              Aug(?:ust)? | Sep(?:t(?:ember)?)? | Oct(?:ober)? | Nov(?:ember)? | Dec(?:ember)?
            )\b
          /ix
          private_constant :NUMERIC_DATE_PATTERN, :MONTH_NAME_PATTERN

          class << self
            ##
            # Clusters elements in parsed_body and returns the best set of content card nodes.
            #
            # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
            # @param minimum_selector_frequency [Integer] minimum frequency for class groups
            # @return [Array<Nokogiri::XML::Node>] candidate nodes of the top-scoring class group
            def call(parsed_body, minimum_selector_frequency:)
              new(parsed_body, minimum_selector_frequency:).call
            end
          end

          # @param parsed_body [Nokogiri::HTML::Document]
          # @param minimum_selector_frequency [Integer]
          def initialize(parsed_body, minimum_selector_frequency:)
            @parsed_body = parsed_body
            @minimum_frequency = minimum_selector_frequency
            # Per-node caches keyed by object identity, so equal-looking nodes never collide.
            @text_words = {}.compare_by_identity
            @has_date = {}.compare_by_identity
          end

          # @return [Array<Nokogiri::XML::Node>] nodes of the best group, or [] when none qualifies
          def call
            candidate_groups = collect_candidate_groups
            return [] if candidate_groups.empty?

            non_containers = filter_containers(candidate_groups)
            final_groups = filter_1_to_1_overlap(non_containers)

            select_best_group(final_groups)
          end

          private

          # Groups every node carrying a class attribute by its normalized class list and
          # keeps the groups that occur at least @minimum_frequency times.
          def collect_candidate_groups
            class_groups = Hash.new { |h, k| h[k] = [] }
            cache = {}.compare_by_identity

            @parsed_body.css('[class]').each { |node| add_node_to_groups(node, class_groups, cache) }

            class_groups.select { |_, nodes| nodes.size >= @minimum_frequency }
          end

          # Adds node to its class group unless its tag is excluded or it sits on an ignored path.
          def add_node_to_groups(node, class_groups, cache)
            return if EXCLUDED_TAGS.include?(node.name) || HtmlExtractor.ignored_container_path?(node, cache)

            cls = normalize_class(node['class'])
            class_groups[cls] << node unless cls.empty?
          end

          # Returns the class list as a sorted, single-space separated string so that
          # "b a" and "a  b" land in the same group; '' for a blank attribute.
          def normalize_class(class_attr)
            class_str = class_attr.to_s.strip
            return '' if class_str.empty?

            # Bypass split/sort/join allocation for single-class lists
            if class_str.include?(' ')
              class_str.split(/\s+/).sort.join(' ')
            else
              class_str
            end
          end

          # Discard group A if any node of A contains > 1 node of another group B
          def filter_containers(groups)
            groups.reject do |cls_a, nodes_a|
              groups.any? { |cls_b, nodes_b| cls_a != cls_b && container_of?(nodes_a, nodes_b) }
            end
          end

          # True when B is made of layout tags (judged by its first node) and some node of A
          # is an ancestor of more than one node of B.
          def container_of?(nodes_a, nodes_b)
            return false unless LAYOUT_TAG_NAMES.include?(nodes_b.first.name)

            nodes_a.any? do |node_a|
              nodes_b.count { |node_b| node_a != node_b && node_b.ancestors.include?(node_a) } > 1
            end
          end

          # If group A contains group B, and they have the same size:
          # - If B (the descendant) contains >= 80% of A's words, AND B's tag is div/section/article,
          #   B is the actual content card. Discard A.
          # - Otherwise, B is a sub-element (header, metadata line, button). Discard B.
          def filter_1_to_1_overlap(groups)
            discarded = Set.new
            groups.each_key do |cls_a|
              groups.each_key do |cls_b|
                next if cls_a == cls_b || discarded.include?(cls_a) || discarded.include?(cls_b)

                resolve_1_to_1_overlap(cls_a, cls_b, groups, discarded)
              end
            end

            groups.except(*discarded)
          end

          # Marks either cls_a or cls_b as discarded when every node of B is nested inside
          # the node of A at the same position; does nothing otherwise.
          def resolve_1_to_1_overlap(cls_a, cls_b, groups, discarded)
            nodes_a = groups[cls_a]
            nodes_b = groups[cls_b]
            return if nodes_a.size != nodes_b.size
            return unless nodes_a.zip(nodes_b).all? { |a, b| a != b && b.ancestors.include?(a) }

            discarded << (keep_descendant?(nodes_a, nodes_b) ? cls_a : cls_b)
          end

          # True when the nested group B carries at least 80% of A's words and is a layout tag.
          def keep_descendant?(nodes_a, nodes_b)
            avg_words(nodes_b) >= 0.8 * avg_words(nodes_a) &&
              LAYOUT_TAG_NAMES.include?(nodes_b.first.name)
          end

          # Returns the nodes of the highest-scoring group; the first group wins a tie and
          # groups rejected by #score_group (score -1) are never selected.
          def select_best_group(groups)
            best_nodes = []
            best_score = -1

            groups.each_value do |nodes|
              score = score_group(nodes)
              next unless score > best_score

              best_nodes = nodes
              best_score = score
            end

            best_nodes
          end

          # Scores a group: size plus a word-count bonus, plus flat bonuses for headings,
          # <time>/[datetime] markup and date-like text. Returns -1 for groups averaging
          # fewer than 5 words per node.
          def score_group(nodes)
            avg_w = avg_words(nodes)
            return -1 if avg_w < 5

            score = nodes.size + (avg_w / 5.0)
            score += 20 if nodes_heading?(nodes)
            score += 20 if nodes_time?(nodes)
            score += 40 if nodes_date?(nodes)
            score
          end

          # True when any node contains a heading tag or a bold/semibold utility class.
          def nodes_heading?(nodes)
            heading_selector = HtmlExtractor::HEADING_TAGS.join(',')
            nodes.any? do |n|
              n.at_css(heading_selector) ||
                n.at_css('.font-bold, .font-semibold')
            end
          end

          # True when any node contains a <time> element or a [datetime] attribute.
          def nodes_time?(nodes)
            nodes.any? { |n| n.at_css('time, [datetime]') }
          end

          # True when the visible text of any node looks like it contains a date.
          def nodes_date?(nodes)
            nodes.any? { |n| date?(n) }
          end

          # Average number of words per node, as a Float.
          def avg_words(nodes)
            nodes.sum { |n| text_words(n) } / nodes.size.to_f
          end

          # Memoized count of alphanumeric runs in the node's visible text.
          def text_words(node)
            @text_words[node] ||= HtmlExtractor.extract_visible_text(node).to_s.scan(/\p{Alnum}+/).size
          end

          # Memoized check for date-like visible text. Uses an explicit key check instead of
          # `||=` so that a negative result is cached as well and the text is extracted only once.
          def date?(node)
            return @has_date[node] if @has_date.key?(node)

            text = HtmlExtractor.extract_visible_text(node).to_s
            @has_date[node] = text.match?(NUMERIC_DATE_PATTERN) || text.match?(MONTH_NAME_PATTERN)
          end
          # rubocop:enable Metrics/ClassLength
        end
      end
    end
  end
end
@@ -62,6 +62,7 @@ module Html2rss
62
62
  @url = url
63
63
  @extractor = extractor
64
64
  @opts = opts
65
+ @fallback_anchorless = opts.fetch(:fallback_anchorless, false)
65
66
  @link_heuristics = LinkHeuristics.new(url)
66
67
  @ignored_cache = {}.compare_by_identity
67
68
  end
@@ -105,8 +106,19 @@ module Html2rss
105
106
  private
106
107
 
107
108
  def articles
108
- @articles ||= each_article_tag.filter_map do |article_tag, selected_anchor|
109
- extract_article(article_tag, selected_anchor:)
109
+ @articles ||= begin
110
+ extracted = each_article_tag.filter_map do |article_tag, selected_anchor|
111
+ extract_article(article_tag, selected_anchor:)
112
+ end
113
+
114
+ extracted += find_anchorless_articles if @fallback_anchorless
115
+ extracted
116
+ end
117
+ end
118
+
119
+ def find_anchorless_articles
120
+ ClassClustering.call(parsed_body, minimum_selector_frequency:).map do |node|
121
+ @extractor.new(node, base_url: @url, selected_anchor: nil, fallback_anchorless: true).call
110
122
  end
111
123
  end
112
124
 
@@ -305,7 +305,7 @@ module Html2rss
305
305
  # rubocop:disable Metrics/MethodLength
306
306
  # @param entry [Hash] raw article entry candidate
307
307
  # @param base_url [String, Html2rss::Url] base URL for relative link resolution
308
- # @return [Hash{Symbol => Object}, nil] normalized article hash for downstream extraction
308
+ # @return [Hash{Symbol => Object}, nil] normalized article hash for downstream extraction
309
309
  def normalise(entry, base_url:)
310
310
  return unless entry.is_a?(Hash)
311
311
 
@@ -92,7 +92,7 @@ module Html2rss
92
92
  attr_reader :parsed_body, :url
93
93
 
94
94
  # @param root [Nokogiri::XML::Element] supported Microdata root node
95
- # @return [Hash{Symbol => Object}, nil] normalized article hash
95
+ # @return [Hash{Symbol => Object}, nil] normalized article hash
96
96
  def article_from(root)
97
97
  schema_object = SchemaObjectBuilder.call(root)
98
98
  return unless schema_object
@@ -378,7 +378,7 @@ module Html2rss
378
378
  extend ValueNormalizer
379
379
 
380
380
  # @param root [Nokogiri::XML::Element] supported microdata root node
381
- # @return [Hash{Symbol => Object}, nil] compact schema-like object
381
+ # @return [Hash{Symbol => Object}, nil] compact schema-like object
382
382
  def call(root)
383
383
  type = Microdata.supported_type_name(root)
384
384
  return unless type
@@ -37,7 +37,8 @@ module Html2rss
37
37
  html: {
38
38
  enabled: true,
39
39
  minimum_selector_frequency: Scraper::Html::DEFAULT_MINIMUM_SELECTOR_FREQUENCY,
40
- use_top_selectors: Scraper::Html::DEFAULT_USE_TOP_SELECTORS
40
+ use_top_selectors: Scraper::Html::DEFAULT_USE_TOP_SELECTORS,
41
+ fallback_anchorless: true
41
42
  }
42
43
  },
43
44
  cleanup: Cleanup::DEFAULT_CONFIG
@@ -63,6 +64,7 @@ module Html2rss
63
64
  optional(:enabled).filled(:bool)
64
65
  optional(:minimum_selector_frequency).filled(:integer, gt?: 0)
65
66
  optional(:use_top_selectors).filled(:integer, gt?: 0)
67
+ optional(:fallback_anchorless).filled(:bool)
66
68
  end
67
69
  end.freeze
68
70
  private_constant :SCRAPER_CONFIG
data/lib/html2rss/cli.rb CHANGED
@@ -48,6 +48,9 @@ module Html2rss
48
48
  method_option :max_requests,
49
49
  type: :numeric,
50
50
  desc: 'Maximum requests to allow for this feed build'
51
+ method_option :input,
52
+ type: :string,
53
+ desc: 'Local HTML file path to read input from'
51
54
  # @param yaml_file [String] path to YAML config
52
55
  # @param feed_name [String, nil] optional named feed in multi-feed config
53
56
  # @return [void]
@@ -55,6 +58,7 @@ module Html2rss
55
58
  config = Html2rss.config_from_yaml_file(yaml_file, feed_name)
56
59
  config[:params] = options[:params] || {}
57
60
  apply_runtime_request_overrides!(config)
61
+ apply_local_file_input!(config, options[:input]) if options[:input]
58
62
 
59
63
  puts(execute_feed { Html2rss.feed(config) })
60
64
  end
@@ -76,20 +80,17 @@ module Html2rss
76
80
  method_option :max_requests,
77
81
  type: :numeric,
78
82
  desc: 'Maximum requests to allow for this feed build'
79
- # @param url [String] source page URL for auto discovery
83
+ method_option :input,
84
+ type: :string,
85
+ desc: 'Local HTML file path to read input from'
86
+ # @param url [String, nil] source page URL for auto discovery
80
87
  # @return [void]
81
- def auto(url) # rubocop:disable Metrics/MethodLength
88
+ def auto(url = nil)
82
89
  format = options.fetch(:format, 'rss')
83
- source_method = format == 'jsonfeed' ? Html2rss.method(:auto_json_feed) : Html2rss.method(:auto_source)
90
+ strategy, local_file_path, url = prepare_auto_inputs(url, options[:input])
84
91
 
85
92
  result = execute_feed do
86
- source_method.call(
87
- url,
88
- strategy: current_strategy,
89
- items_selector: options[:items_selector],
90
- max_redirects: options[:max_redirects],
91
- max_requests: options[:max_requests]
92
- )
93
+ source_call(url, strategy, local_file_path, format == 'jsonfeed')
93
94
  end
94
95
 
95
96
  puts(format == 'jsonfeed' ? JSON.pretty_generate(result) : result)
@@ -159,6 +160,33 @@ module Html2rss
159
160
  config.delete(:request) if request_config.empty?
160
161
  end
161
162
 
163
+ def apply_local_file_input!(config, input_path)
164
+ file_path = check_file_exists!(input_path)
165
+ config[:strategy] = :local_file
166
+ config[:request] = (config[:request] || {}).merge(local_file_path: file_path)
167
+
168
+ return unless config.dig(:channel, :url).to_s.empty?
169
+
170
+ config[:channel] = (config[:channel] || {}).merge(
171
+ url: detect_base_url!(file_path, 'Please specify a channel.url in the config.')
172
+ )
173
+ end
174
+
175
+ def prepare_auto_inputs(url, input_option)
176
+ if input_option.nil?
177
+ raise Thor::Error, 'A URL is required unless --input is specified' unless url
178
+
179
+ return [current_strategy, nil, url]
180
+ end
181
+
182
+ file_path = check_file_exists!(input_option)
183
+ detected_url = url || detect_base_url!(
184
+ file_path, 'Please specify a URL: html2rss auto [URL] --input <file>'
185
+ )
186
+
187
+ [:local_file, file_path, detected_url]
188
+ end
189
+
162
190
  def request_controls
163
191
  Html2rss::RequestControls.new(
164
192
  strategy: options[:strategy]&.to_sym,
@@ -213,5 +241,28 @@ module Html2rss
213
241
  Html2rss::NoFeedItemsExtracted => error
214
242
  raise Thor::Error, error.message
215
243
  end
244
+
245
+ def source_call(url, strategy, local_file_path, is_json)
246
+ method = is_json ? Html2rss.method(:auto_json_feed) : Html2rss.method(:auto_source)
247
+ method.call(
248
+ url,
249
+ strategy:,
250
+ items_selector: options[:items_selector],
251
+ max_redirects: options[:max_redirects],
252
+ max_requests: options[:max_requests],
253
+ local_file_path:
254
+ )
255
+ end
256
+
257
+ def check_file_exists!(path)
258
+ File.expand_path(path).tap do |file_path|
259
+ raise Thor::Error, "Input file does not exist: #{path}" unless File.exist?(file_path)
260
+ end
261
+ end
262
+
263
+ def detect_base_url!(file_path, error_hint)
264
+ Html2rss::Url.extract_from_html(File.read(file_path))&.to_s ||
265
+ raise(Thor::Error, "Could not auto-detect a base URL from HTML metadata. #{error_hint}")
266
+ end
216
267
  end
217
268
  end
@@ -29,7 +29,7 @@ module Html2rss
29
29
  # Validates a configuration hash with the runtime validator.
30
30
  #
31
31
  # @param config [Hash{Symbol => Object}] the configuration hash
32
- # @param params [Hash{Symbol => Object}, Hash{String => Object}, nil] dynamic parameters for string formatting
32
+ # @param params [Hash{Symbol => Object}, Hash{String => Object}, nil] dynamic parameters for string formatting
33
33
  # @return [Dry::Validation::Result] validation result after defaults and deprecations are applied
34
34
  def validate(config, params: UNSET)
35
35
  prepared_config = prepare_for_validation(resolve_effective_config(config, params:))
@@ -56,7 +56,7 @@ module Html2rss
56
56
  # @param file [String] the YAML file to load
57
57
  # @param feed_name [String, nil] optional feed name for multi-feed files
58
58
  # @param multiple_feeds_key [Symbol] key under which multiple feeds are defined
59
- # @param params [Hash{Symbol => Object}, Hash{String => Object}, nil] dynamic parameters for string formatting
59
+ # @param params [Hash{Symbol => Object}, Hash{String => Object}, nil] dynamic parameters for string formatting
60
60
  # @return [Dry::Validation::Result] validation result after defaults and deprecations are applied
61
61
  def validate_yaml(file, feed_name = nil, multiple_feeds_key: MultipleFeedsConfig::CONFIG_KEY_FEEDS, params: UNSET)
62
62
  validate(load_yaml(file, feed_name, multiple_feeds_key:), params:)
@@ -99,7 +99,7 @@ module Html2rss
99
99
  # and returns a new configuration object.
100
100
  #
101
101
  # @param config [Hash{Symbol => Object}] the configuration hash.
102
- # @param params [Hash{Symbol => Object}, Hash{String => Object}, nil] dynamic parameters for string formatting.
102
+ # @param params [Hash{Symbol => Object}, Hash{String => Object}, nil] dynamic parameters for string formatting.
103
103
  # @return [Html2rss::Config] the configuration object.
104
104
  def from_hash(config, params: UNSET)
105
105
  new(resolve_effective_config(config, params:))
@@ -83,6 +83,7 @@ module Html2rss
83
83
  optional(:total_timeout_seconds).filled(:integer, gt?: 0)
84
84
  optional(:browserless).hash(BrowserlessRequestConfig)
85
85
  optional(:botasaurus).hash(BotasaurusRequestConfig)
86
+ optional(:local_file_path).filled(:string)
86
87
  end
87
88
 
88
89
  params do
@@ -69,9 +69,9 @@ module Html2rss
69
69
  # @return [Hash{Symbol => Object}] request envelope configuration
70
70
  def request = config[:request]
71
71
 
72
- # @return [Hash{Symbol => Object}, nil] selectors configuration
72
+ # @return [Hash{Symbol => Object}, nil] selectors configuration
73
73
  def selectors = config[:selectors]
74
- # @return [Hash{Symbol => Object}, nil] auto-source configuration
74
+ # @return [Hash{Symbol => Object}, nil] auto-source configuration
75
75
  def auto_source = config[:auto_source]
76
76
 
77
77
  private
@@ -0,0 +1,50 @@
# frozen_string_literal: true

module Html2rss
  class HtmlExtractor
    ##
    # HeadingExtractor picks the node that best represents the headline of a container:
    # a real heading tag when one exists, otherwise (opt-in) an emphasised or
    # title-classed element.
    class HeadingExtractor
      # Heading tags used to prioritize title extraction.
      HEADING_TAGS = HtmlExtractor::HEADING_TAGS
      # CSS selector matching any of the heading tags.
      HEADING_SELECTOR = HEADING_TAGS.join(',').freeze
      # CSS selector for heading stand-ins used by the anchorless fallback.
      FALLBACK_SELECTOR = 'strong, b, [class*="title"], [class*="font-bold"], [class*="font-semibold"]'
      private_constant :HEADING_SELECTOR, :FALLBACK_SELECTOR

      class << self
        ##
        # @param article_tag [Nokogiri::XML::Element] container node
        # @param fallback_anchorless [Boolean] whether to use fallback search
        # @param selected_anchor [Nokogiri::XML::Node, nil] anchor element
        # @return [Nokogiri::XML::Node, nil] the heading node, if found
        def call(article_tag, fallback_anchorless:, selected_anchor:)
          headings = article_tag.css(HEADING_SELECTOR)
          return select_best_heading(headings) if headings.any?

          # The fallback only applies to anchorless containers that opted in.
          fallback_heading(article_tag) if fallback_anchorless && selected_anchor.nil?
        end

        private

        # Among the highest-ranked heading level present ("h1" sorts before "h2"),
        # returns the heading with the longest extracted text; the first one wins a tie.
        def select_best_heading(tags)
          top_level = tags.map(&:name).min

          tags.select { |candidate| candidate.name == top_level }
              .max_by { |candidate| TextExtractor.call(candidate)&.size.to_i }
        end

        # Returns the first fallback candidate whose extracted text is not blank.
        def fallback_heading(article_tag)
          article_tag.css(FALLBACK_SELECTOR).find do |candidate|
            !TextExtractor.call(candidate).to_s.strip.empty?
          end
        end
      end
    end
  end
end
@@ -0,0 +1,67 @@
# frozen_string_literal: true

require 'zlib'

module Html2rss
  class HtmlExtractor
    ##
    # IdGenerator determines the unique ID for an article container node.
    #
    # Sources are tried in this order: an id attribute on the container or a descendant,
    # the URL path/query (only when an anchor was selected), a slug of the heading text,
    # and finally (only with fallback_anchorless) a checksum of the container text.
    class IdGenerator
      class << self
        ##
        # @param article_tag [Nokogiri::XML::Element] container node
        # @param heading [Nokogiri::XML::Node, nil] heading node
        # @param url [Html2rss::Url, nil] absolute article URL
        # @param selected_anchor [Nokogiri::XML::Node, nil] anchor element
        # @param fallback_anchorless [Boolean] whether to use fallback hashing
        # @return [String, nil] the generated ID, if any
        def call(article_tag, heading:, url:, selected_anchor:, fallback_anchorless:)
          id_from_dom = parse_id_from_dom(article_tag, url, selected_anchor)
          return id_from_dom if id_from_dom

          heading_text = resolve_heading_text(article_tag, heading, fallback_anchorless)
          slug = generate_slug(heading_text) if heading_text
          return slug if slug

          # Reached when there is no heading text OR the heading yields an empty slug
          # (e.g. a title without ASCII letters or digits); hash the content instead of
          # returning no ID at all.
          generate_content_hash(article_tag) if fallback_anchorless
        end

        private

        # First non-empty value among the container id, the first descendant id and,
        # when an anchor was selected, the URL path and query.
        def parse_id_from_dom(article_tag, url, selected_anchor)
          candidates = [article_tag['id'], article_tag.at_css('[id]')&.attr('id')]
          candidates += [url&.path, url&.query] if selected_anchor
          candidates.compact.reject(&:empty?).first
        end

        # Heading text when present and not blank; otherwise the first non-blank text node
        # of the container (only with fallback_anchorless).
        def resolve_heading_text(article_tag, heading, fallback_anchorless)
          text = heading ? TextExtractor.call(heading) : nil
          if text.nil? || text.strip.empty?
            fallback_text_node_content(article_tag, fallback_anchorless)
          else
            text
          end
        end

        # Stripped content of the first non-blank text node, or nil.
        def fallback_text_node_content(article_tag, fallback_anchorless)
          return unless fallback_anchorless

          article_tag.xpath('.//text()').find { |t| !t.text.strip.empty? }&.text&.strip
        end

        # Lowercases text and joins its ASCII alphanumeric runs with single dashes.
        # Returns nil when nothing is left.
        def generate_slug(text)
          slug = text.downcase.gsub(/[^a-z0-9]+/, '-').delete_prefix('-').delete_suffix('-')
          slug unless slug.empty?
        end

        # Base-36 CRC32 of the container's extracted text, or nil for blank content.
        def generate_content_hash(article_tag)
          text = TextExtractor.call(article_tag).to_s.strip
          Zlib.crc32(text).to_s(36) unless text.empty?
        end
      end
    end
  end
end
@@ -0,0 +1,77 @@
# frozen_string_literal: true

module Html2rss
  class HtmlExtractor
    ##
    # TextExtractor extracts visible text from DOM elements, preserving lists
    # and block spacing while sanitizing white spaces.
    class TextExtractor
      # HTML block elements that trigger line breaks or special formatting.
      # NOTE(review): childless nodes with empty text (such as `br`) are dropped before the
      # block check, so listing `br` here currently has no effect — confirm the intent.
      BLOCK_TAGS = %w[p div li ul ol h1 h2 h3 h4 h5 h6 tr br].to_set.freeze
      # Tags ignored when extracting visible text content.
      INVISIBLE_CONTENT_TAGS = %w[svg script noscript style template].to_set.freeze

      class << self
        ##
        # @param tag [Nokogiri::XML::Node] the node from which to extract visible text
        # @param separator [String] separator used to join text fragments (default is a space)
        # @param exclude_nodes [Array<Nokogiri::XML::Node>, nil] nodes to exclude from extraction
        # @return [String, nil] the concatenated visible text, or nil if none is found
        def call(tag, separator: ' ', exclude_nodes: nil)
          # A bare text node has no children to walk; return its collapsed content directly.
          return tag.text.gsub(/\s+/, ' ').strip if tag.respond_to?(:text?) && tag.text?

          parts = iterate_children(tag, separator, exclude_nodes)
          return if parts.empty?

          parts.join.squeeze(' ').strip
        end

        private

        # Walks the direct children and returns an array alternating text fragments and
        # the separators between them. Excluded, invisible and empty children are skipped.
        def iterate_children(tag, separator, exclude_nodes)
          last_was_block = false
          tag.children.each_with_object([]) do |child, parts|
            next if exclude_nodes&.include?(child) || !visible_child?(child)

            text, block = process_child_node(child, separator, exclude_nodes)
            next if text.empty?

            append_separator!(parts, separator, block, last_was_block)
            parts << text
            last_was_block = block
          end
        end

        # Returns [text, block?] for a child; list items are prefixed with "- ".
        def process_child_node(child, separator, exclude_nodes)
          child_text = get_child_text(child, separator, exclude_nodes)
          return ['', false] if child_text.empty?

          child_text = "- #{child_text}" if child.name == 'li'
          [child_text, BLOCK_TAGS.include?(child.name)]
        end

        # Leaf nodes contribute their whitespace-collapsed text; other nodes are extracted
        # recursively with the same separator and exclusions.
        def get_child_text(child, separator, exclude_nodes)
          if child.children.empty?
            child.text.to_s.gsub(/\s+/, ' ').strip
          else
            call(child, separator:, exclude_nodes:).to_s.strip
          end
        end

        # Appends the joiner that precedes the next fragment: a line break (or the custom
        # separator) around block elements, a single space between inline fragments.
        def append_separator!(parts, separator, is_block, last_was_block)
          return if parts.empty?

          parts << if is_block || last_was_block
                     (separator == ' ' ? "\n" : separator)
                   else
                     ' '
                   end
        end

        # A child is visible unless it is an invisible-content tag, an in-page "#..." link,
        # or a comment node. Comment nodes are childless and expose their content through
        # #text, so without this check markup comments would end up in the visible text.
        def visible_child?(node)
          return false if node.respond_to?(:comment?) && node.comment?

          !INVISIBLE_CONTENT_TAGS.include?(node.name) &&
            !(node.name == 'a' && node['href']&.start_with?('#'))
        end
      end
    end
  end
end
@@ -4,13 +4,11 @@ module Html2rss
4
4
  ##
5
5
  # HtmlExtractor is responsible for extracting details (headline, url, images, etc.)
6
6
  # from an article_tag.
7
- class HtmlExtractor # rubocop:disable Metrics/ClassLength
8
- # Tags ignored when extracting visible text content from article containers.
9
- INVISIBLE_CONTENT_TAGS = %w[svg script noscript style template].to_set.freeze
7
+ # rubocop:disable Metrics/ClassLength
8
+ class HtmlExtractor
10
9
  # Heading tags used to prioritize title extraction.
11
10
  HEADING_TAGS = %w[h1 h2 h3 h4 h5 h6].freeze
12
- # Selector used to derive non-headline description nodes.
13
- NON_HEADLINE_SELECTOR = (HEADING_TAGS.map { |tag| ":not(#{tag})" } + INVISIBLE_CONTENT_TAGS.to_a).freeze
11
+
14
12
  # Element tags that indicate ignored DOM chrome when found in a container path.
15
13
  IGNORED_CONTAINER_TAGS = %w[nav footer header svg script style].to_set.freeze
16
14
 
@@ -26,20 +24,14 @@ module Html2rss
26
24
  class << self
27
25
  ##
28
26
  # Extracts visible text from a given node and its children.
27
+ # Delegates to TextExtractor.
29
28
  #
30
29
  # @param tag [Nokogiri::XML::Node] the node from which to extract visible text
31
30
  # @param separator [String] separator used to join text fragments (default is a space)
31
+ # @param exclude_nodes [Array<Nokogiri::XML::Node>, nil] nodes to exclude from extraction
32
32
  # @return [String, nil] the concatenated visible text, or nil if none is found
33
- def extract_visible_text(tag, separator: ' ')
34
- parts = tag.children.filter_map do |child|
35
- next unless visible_child?(child)
36
-
37
- raw_text = child.children.empty? ? child.text : extract_visible_text(child)
38
- text = raw_text&.strip
39
- text unless text.to_s.empty?
40
- end
41
-
42
- parts.join(separator).squeeze(' ').strip unless parts.empty?
33
+ def extract_visible_text(tag, separator: ' ', exclude_nodes: nil)
34
+ TextExtractor.call(tag, separator:, exclude_nodes:)
43
35
  end
44
36
 
45
37
  ##
@@ -74,23 +66,20 @@ module Html2rss
74
66
  end
75
67
  false
76
68
  end
77
-
78
- def visible_child?(node)
79
- !INVISIBLE_CONTENT_TAGS.include?(node.name) &&
80
- !(node.name == 'a' && node['href']&.start_with?('#'))
81
- end
82
69
  end
83
70
 
84
71
  ##
85
72
  # @param article_tag [Nokogiri::XML::Node] article-like container to extract from
86
73
  # @param base_url [String, Html2rss::Url] base url used to resolve relative links
87
74
  # @param selected_anchor [Nokogiri::XML::Node, nil] explicit primary anchor for the container
88
- def initialize(article_tag, base_url:, selected_anchor:)
75
+ # @param fallback_anchorless [Boolean] whether to fall back to anchorless extraction
76
+ def initialize(article_tag, base_url:, selected_anchor:, fallback_anchorless: false)
89
77
  raise ArgumentError, 'article_tag is required' unless article_tag
90
78
 
91
79
  @article_tag = article_tag
92
80
  @base_url = base_url
93
81
  @selected_anchor = selected_anchor
82
+ @fallback_anchorless = fallback_anchorless
94
83
  end
95
84
 
96
85
  # @return [Hash{Symbol => Object}] extracted article attributes
@@ -115,54 +104,62 @@ module Html2rss
115
104
  @extract_url ||= begin
116
105
  href = selected_anchor&.[]('href').to_s
117
106
 
118
- Url.from_relative(href.split('#').first.strip, base_url) unless href.empty?
107
+ if href.empty?
108
+ anchorless_url_fallback
109
+ else
110
+ Url.from_relative(href.split('#').first.strip, base_url)
111
+ end
119
112
  end
120
113
  end
121
114
 
122
- def extract_title
123
- title_source = heading || selected_anchor
124
- self.class.extract_visible_text(title_source) if title_source
115
+ def anchorless_url_fallback
116
+ return unless @fallback_anchorless
117
+
118
+ id = generate_id
119
+ Url.from_relative("##{id}", base_url) if id
125
120
  end
126
121
 
127
- def heading
128
- @heading ||= begin
129
- tags = article_tag.css(HEADING_TAGS.join(','))
130
- tags.any? ? select_best_heading(tags) : nil
122
+ def extract_title
123
+ title_source = heading || selected_anchor
124
+ if title_source
125
+ self.class.extract_visible_text(title_source)
126
+ else
127
+ fallback_anchorless_title
131
128
  end
132
129
  end
133
130
 
134
- def select_best_heading(tags)
135
- min_tag_name = tags.map(&:name).min
136
- best_tag = nil
137
- max_size = -1
138
-
139
- tags.each do |tag|
140
- next if tag.name != min_tag_name
131
+ def fallback_anchorless_title
132
+ return unless @fallback_anchorless && selected_anchor.nil?
141
133
 
142
- size = self.class.extract_visible_text(tag)&.size.to_i
143
- (best_tag = tag) && (max_size = size) if size > max_size
144
- end
134
+ text_node = article_tag.xpath('.//text()').find { |t| !t.text.strip.empty? }
135
+ text_node&.text&.strip
136
+ end
145
137
 
146
- best_tag
138
+ def heading
139
+ @heading ||= HeadingExtractor.call(
140
+ article_tag,
141
+ fallback_anchorless: @fallback_anchorless,
142
+ selected_anchor:
143
+ )
147
144
  end
148
145
 
149
146
  def extract_description
150
- text = self.class.extract_visible_text(article_tag.css(NON_HEADLINE_SELECTOR), separator: '<br>')
151
- return text if text && !text.empty?
152
-
153
- description = self.class.extract_visible_text(article_tag)
154
- return nil if description.nil? || description.strip.empty?
147
+ exclude = [heading, selected_anchor].compact.to_set
148
+ description = self.class.extract_visible_text(article_tag, exclude_nodes: exclude)
149
+ return if description.nil?
155
150
 
156
- description.strip
151
+ desc = description.strip
152
+ desc.empty? ? nil : desc
157
153
  end
158
154
 
159
155
  def generate_id
160
- [
161
- article_tag['id'],
162
- article_tag.at_css('[id]')&.attr('id'),
163
- extract_url&.path,
164
- extract_url&.query
165
- ].compact.reject(&:empty?).first
156
+ @generate_id ||= IdGenerator.call(
157
+ article_tag,
158
+ heading:,
159
+ url: (selected_anchor ? extract_url : nil),
160
+ selected_anchor:,
161
+ fallback_anchorless: @fallback_anchorless
162
+ )
166
163
  end
167
164
 
168
165
  def extract_image = ImageExtractor.call(article_tag, base_url:)
@@ -170,4 +167,5 @@ module Html2rss
170
167
  def extract_enclosures = EnclosureExtractor.call(article_tag, base_url)
171
168
  def extract_categories = CategoryExtractor.call(article_tag)
172
169
  end
170
+ # rubocop:enable Metrics/ClassLength
173
171
  end
@@ -4,6 +4,8 @@ module Html2rss
4
4
  # Namespace for HTML rendering logic, used to generate rich content such as
5
5
  # images, audio, video, or embedded documents for feed descriptions.
6
6
  #
7
+ # @see Html2rss::Rendering::DescriptionBuilder
8
+ #
7
9
  # @example
8
10
  # Html2rss::Rendering::ImageRenderer.new(
9
11
  # url: "https://example.com/image.jpg",
@@ -16,8 +18,6 @@ module Html2rss
16
18
  # image: "https://example.com/image.jpg",
17
19
  # title: "Example"
18
20
  # )
19
- #
20
- # @see Html2rss::Rendering::DescriptionBuilder
21
21
  module Rendering
22
22
  end
23
23
  end
@@ -0,0 +1,29 @@
# frozen_string_literal: true

module Html2rss
  class RequestService
    ##
    # Strategy to read a local HTML file.
    #
    # Instead of performing a request, the file named by the request's
    # :local_file_path is read and wrapped in a Response for ctx.url.
    class LocalFileStrategy < Strategy
      # Content type reported for every local file.
      CONTENT_TYPE = 'text/html; charset=utf-8'

      ##
      # Executes the local file read.
      #
      # @return [Response] the mock response wrapped around the file contents
      # @raise [ArgumentError] if the local file path is missing
      # @raise [Errno::ENOENT] if the file does not exist
      def execute
        file_path = ctx.request[:local_file_path]
        raise ArgumentError, 'Local file path is required for local_file strategy' unless file_path
        raise Errno::ENOENT, "File not found: #{file_path}" unless File.exist?(file_path)

        # Read as UTF-8 explicitly so the body's encoding matches the charset declared in
        # the headers instead of depending on the process-wide default external encoding.
        body = File.read(file_path, encoding: Encoding::UTF_8)
        Response.new(
          body:,
          headers: { 'content-type' => CONTENT_TYPE },
          url: ctx.url,
          status: 200
        )
      end
    end
  end
end
@@ -97,6 +97,10 @@ module Html2rss
97
97
 
98
98
  attr_reader :ctx, :browser, :skip_request_resources, :referer, :latest_navigation_response, :main_frame
99
99
 
100
+ ##
101
+ # Re-raises a deferred navigation error when one was captured.
102
+ #
103
+ # @raise [Html2rss::Error] when a navigation request or response validation failed
100
104
  def raise_navigation_error_if_any
101
105
  raise @navigation_error if @navigation_error
102
106
  end
@@ -57,7 +57,8 @@ module Html2rss
57
57
  @strategies = {
58
58
  faraday: FaradayStrategy,
59
59
  botasaurus: BotasaurusStrategy,
60
- browserless: BrowserlessStrategy
60
+ browserless: BrowserlessStrategy,
61
+ local_file: LocalFileStrategy
61
62
  }
62
63
  @default_strategy_name = :faraday
63
64
  end
@@ -25,7 +25,7 @@ module Html2rss
25
25
  # during post processing with {PostProcessors::ParseTime}.
26
26
  class Attribute
27
27
  # The available options for the attribute extractor.
28
- Options = Struct.new('AttributeOptions', :selector, :attribute, keyword_init: true) # rubocop:disable Style/RedundantStructKeywordInit
28
+ Options = Struct.new('AttributeOptions', :selector, :attribute, keyword_init: true)
29
29
 
30
30
  ##
31
31
  # Initializes the Attribute extractor.
@@ -25,7 +25,7 @@ module Html2rss
25
25
  # 'http://blog-without-a-feed.example.com/posts/latest-findings'
26
26
  class Href
27
27
  # The available options for the href (attribute) extractor.
28
- Options = Struct.new('HrefOptions', :selector, :channel, keyword_init: true) # rubocop:disable Style/RedundantStructKeywordInit
28
+ Options = Struct.new('HrefOptions', :selector, :channel, keyword_init: true)
29
29
 
30
30
  ##
31
31
  # Initializes the Href extractor.
@@ -24,7 +24,7 @@ module Html2rss
24
24
  # {PostProcessors::SanitizeHtml}.
25
25
  class Html
26
26
  # The available options for the html extractor.
27
- Options = Struct.new('HtmlOptions', :selector, keyword_init: true) # rubocop:disable Style/RedundantStructKeywordInit
27
+ Options = Struct.new('HtmlOptions', :selector, keyword_init: true)
28
28
 
29
29
  ##
30
30
  # Initializes the Html extractor.
@@ -17,7 +17,7 @@ module Html2rss
17
17
  # 'Foobar'
18
18
  class Static
19
19
  # The available option for the static extractor.
20
- Options = Struct.new('StaticOptions', :static, keyword_init: true) # rubocop:disable Style/RedundantStructKeywordInit
20
+ Options = Struct.new('StaticOptions', :static, keyword_init: true)
21
21
 
22
22
  ##
23
23
  # Initializes the Static extractor.
@@ -22,7 +22,7 @@ module Html2rss
22
22
  # 'Lorem ipsum dolor ...'
23
23
  class Text
24
24
  # The available options for the text extractor.
25
- Options = Struct.new('TextOptions', :selector, keyword_init: true) # rubocop:disable Style/RedundantStructKeywordInit
25
+ Options = Struct.new('TextOptions', :selector, keyword_init: true)
26
26
 
27
27
  ##
28
28
  # Initializes the Text extractor.
@@ -128,8 +128,15 @@ module Html2rss
##
# Sanitizes the value while keeping the line breaks of the input.
#
# Newlines are swapped for a marker before sanitizing so they survive the
# whitespace collapsing. Afterwards each marker (plus surrounding blanks)
# becomes one newline, and runs of three or more newlines shrink to two.
#
# @return [String, nil] the sanitized HTML, or nil when nothing is left
def get
  marked = value.to_s.gsub("\n", ' __NEWLINE_PLACEHOLDER__ ')
  sanitized = Sanitize.fragment(marked, self.class.sanitize_config(channel_url)).to_s

  result = sanitized
           .gsub(/\s+/, ' ')
           .gsub(/[ \t\r]*__NEWLINE_PLACEHOLDER__[ \t\r]*/, "\n")
           .gsub(/\n{3,}/, "\n\n")
           .strip

  result unless result.empty?
end
@@ -18,7 +18,7 @@ module Html2rss
18
18
  include Enumerable
19
19
 
20
20
  # A context instance passed to item extractors and post-processors.
21
- Context = Struct.new('Context', :options, :item, :config, :scraper, keyword_init: true) # rubocop:disable Style/RedundantStructKeywordInit
21
+ Context = Struct.new('Context', :options, :item, :config, :scraper, keyword_init: true)
22
22
 
23
23
  # Default selectors options merged into user configuration.
24
24
  DEFAULT_CONFIG = { items: { enhance: true } }.freeze
data/lib/html2rss/url.rb CHANGED
@@ -2,6 +2,7 @@
2
2
 
3
3
  require 'addressable/uri'
4
4
  require 'cgi'
5
+ require 'nokogiri'
5
6
 
6
7
  module Html2rss
7
8
  ##
@@ -55,10 +56,9 @@ module Html2rss
55
56
  # @return [Url, nil] the sanitized URL, or nil if no valid URL found
def self.sanitize(raw_url)
  # String#[] with a Regexp yields the first matching substring, or nil when nothing matches.
  candidate = raw_url.to_s[%r{(?:(?:https?|ftp|mailto)://|mailto:)[^\s<>"]+}]
  return if candidate.nil?

  new(Addressable::URI.parse(candidate.strip).normalize)
end
63
63
 
64
64
  ##
@@ -70,10 +70,9 @@ module Html2rss
def self.from_absolute(url_string)
  return url_string if url_string.is_a?(self)

  parsed = new(Addressable::URI.parse(url_string.to_s.strip).normalize)
  raise ArgumentError, 'URL must be absolute' unless parsed.absolute?

  parsed
rescue Addressable::URI::InvalidURIError
  # Unparseable input is reported the same way as a relative URL.
  raise ArgumentError, 'URL must be absolute'
end
@@ -92,14 +91,28 @@ module Html2rss
92
91
  # Url.for_channel('/relative/path')
93
92
  # # => raises ArgumentError: "URL must be absolute"
def self.for_channel(url_string)
  candidate = url_string.to_s.strip
  return if candidate.empty?

  # Build the absolute URL first, then let the channel validation raise if it is unsuitable.
  channel_url = from_absolute(candidate)
  validate_channel_url(channel_url)
  channel_url
end
99
+
##
# Looks for a base URL in well-known HTML metadata tags.
#
# The candidates are tried in order: canonical link, Open Graph URL,
# Twitter URL, then the <base> element. Empty values and values that
# +from_absolute+ rejects with an ArgumentError are skipped.
#
# @param html [String] raw HTML content
# @return [Url, nil] the first absolute URL found, or nil if none is found
def self.extract_from_html(html)
  document = Nokogiri::HTML(html)
  sources = [
    ['link[rel="canonical"]', 'href'],
    ['meta[property="og:url"]', 'content'],
    ['meta[name="twitter:url"]', 'content'],
    ['base[href]', 'href']
  ]

  sources.each do |selector, attribute|
    element = document.at_css(selector)
    candidate = (element && element[attribute]).to_s.strip
    next if candidate.empty?

    return from_absolute(candidate)
  rescue ArgumentError
    next
  end

  nil
end
104
117
 
105
118
  ##
@@ -4,6 +4,6 @@
4
4
  # The Html2rss namespace.
5
5
  module Html2rss
6
6
  # Current application version.
7
- VERSION = '0.21.0'
7
+ VERSION = '0.22.0'
8
8
  public_constant :VERSION
9
9
  end
data/lib/html2rss.rb CHANGED
@@ -52,6 +52,8 @@ module Html2rss
52
52
  FeedPipeline.new(raw_config).to_json_feed
53
53
  end
54
54
 
55
+ # rubocop:disable Metrics/ParameterLists
56
+
55
57
  ##
56
58
  # Scrapes the provided URL and returns an RSS object.
57
59
  #
@@ -60,9 +62,15 @@ module Html2rss
60
62
  # @param items_selector [String, nil] optional selector hint for item extraction
61
63
  # @param max_redirects [Integer, nil] optional redirect limit override
62
64
  # @param max_requests [Integer, nil] optional request budget override
65
+ # @param local_file_path [String, nil] optional local HTML file path
63
66
  # @return [RSS::Rss] generated RSS feed
def self.auto_source(url,
                     strategy: :auto,
                     items_selector: nil,
                     max_redirects: nil,
                     max_requests: nil,
                     local_file_path: nil)
  # Assemble the auto-source configuration first, then render it as an RSS feed.
  config = build_auto_source_config(
    url:, strategy:, items_selector:, max_redirects:, max_requests:, local_file_path:
  )
  feed(config)
end
67
75
 
68
76
  ##
@@ -73,11 +81,20 @@ module Html2rss
73
81
  # @param items_selector [String, nil] optional selector hint for item extraction
74
82
  # @param max_redirects [Integer, nil] optional redirect limit override
75
83
  # @param max_requests [Integer, nil] optional request budget override
84
+ # @param local_file_path [String, nil] optional local HTML file path
76
85
  # @return [Hash] JSONFeed-compliant hash
def self.auto_json_feed(url,
                        strategy: :auto,
                        items_selector: nil,
                        max_redirects: nil,
                        max_requests: nil,
                        local_file_path: nil)
  # Assemble the auto-source configuration first, then render it as a JSON feed.
  config = build_auto_source_config(
    url:, strategy:, items_selector:, max_redirects:, max_requests:, local_file_path:
  )
  json_feed(config)
end
80
95
 
96
+ # rubocop:enable Metrics/ParameterLists
97
+
81
98
  # rubocop:disable ThreadSafety/ClassInstanceVariable
82
99
  class << self
83
100
  ##
@@ -125,12 +142,17 @@ module Html2rss
125
142
  class << self
126
143
  private
127
144
 
# Builds the auto-source config and, when given, records the local file path
# under the config's +:request+ key.
def build_auto_source_config(url:, strategy:, items_selector:, max_redirects:, max_requests:, local_file_path: nil) # rubocop:disable Metrics/ParameterLists
  request_controls = shortcut_request_controls(strategy:, max_redirects:, max_requests:)

  Config.auto_source_config(url:, items_selector:, request_controls:).tap do |config|
    next unless local_file_path

    # Create the :request section on demand before storing the path.
    (config[:request] ||= {})[:local_file_path] = local_file_path
  end
end
135
157
 
136
158
  def shortcut_request_controls(strategy:, max_redirects:, max_requests:)
@@ -179,6 +179,12 @@
179
179
  "type": "null"
180
180
  },
181
181
  "exclusiveMinimum": 0
182
+ },
183
+ "fallback_anchorless": {
184
+ "type": "boolean",
185
+ "not": {
186
+ "type": "null"
187
+ }
182
188
  }
183
189
  },
184
190
  "required": []
@@ -227,7 +233,8 @@
227
233
  "html": {
228
234
  "enabled": true,
229
235
  "minimum_selector_frequency": 2,
230
- "use_top_selectors": 5
236
+ "use_top_selectors": 5,
237
+ "fallback_anchorless": true
231
238
  }
232
239
  },
233
240
  "cleanup": {
@@ -535,6 +542,10 @@
535
542
  }
536
543
  },
537
544
  "required": []
545
+ },
546
+ "local_file_path": {
547
+ "type": "string",
548
+ "minLength": 1
538
549
  }
539
550
  },
540
551
  "required": []
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html2rss
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.21.0
4
+ version: 0.22.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gil Desmarais
@@ -278,6 +278,7 @@ files:
278
278
  - lib/html2rss/auto_source/cleanup.rb
279
279
  - lib/html2rss/auto_source/scraper.rb
280
280
  - lib/html2rss/auto_source/scraper/html.rb
281
+ - lib/html2rss/auto_source/scraper/html/class_clustering.rb
281
282
  - lib/html2rss/auto_source/scraper/json_state.rb
282
283
  - lib/html2rss/auto_source/scraper/link_heuristics.rb
283
284
  - lib/html2rss/auto_source/scraper/microdata.rb
@@ -310,10 +311,13 @@ files:
310
311
  - lib/html2rss/html_extractor.rb
311
312
  - lib/html2rss/html_extractor/date_extractor.rb
312
313
  - lib/html2rss/html_extractor/enclosure_extractor.rb
314
+ - lib/html2rss/html_extractor/heading_extractor.rb
315
+ - lib/html2rss/html_extractor/id_generator.rb
313
316
  - lib/html2rss/html_extractor/image_extractor.rb
314
317
  - lib/html2rss/html_extractor/list_candidates.rb
315
318
  - lib/html2rss/html_extractor/semantic_anchor_candidates.rb
316
319
  - lib/html2rss/html_extractor/semantic_containers.rb
320
+ - lib/html2rss/html_extractor/text_extractor.rb
317
321
  - lib/html2rss/html_navigator.rb
318
322
  - lib/html2rss/json_feed_builder.rb
319
323
  - lib/html2rss/json_feed_builder/item.rb
@@ -332,6 +336,7 @@ files:
332
336
  - lib/html2rss/request_service/budget.rb
333
337
  - lib/html2rss/request_service/context.rb
334
338
  - lib/html2rss/request_service/faraday_strategy.rb
339
+ - lib/html2rss/request_service/local_file_strategy.rb
335
340
  - lib/html2rss/request_service/policy.rb
336
341
  - lib/html2rss/request_service/puppet_commander.rb
337
342
  - lib/html2rss/request_service/response.rb
@@ -376,7 +381,7 @@ licenses:
376
381
  - MIT
377
382
  metadata:
378
383
  allowed_push_host: https://rubygems.org
379
- changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.21.0
384
+ changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.22.0
380
385
  rubygems_mfa_required: 'true'
381
386
  rdoc_options: []
382
387
  require_paths: