html2rss 0.17.0 → 0.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +48 -656
- data/exe/html2rss +1 -1
- data/html2rss.gemspec +5 -2
- data/lib/html2rss/articles/deduplicator.rb +49 -0
- data/lib/html2rss/auto_source/cleanup.rb +33 -5
- data/lib/html2rss/auto_source/scraper/html.rb +118 -43
- data/lib/html2rss/auto_source/scraper/json_state.rb +377 -0
- data/lib/html2rss/auto_source/scraper/microdata.rb +399 -0
- data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
- data/lib/html2rss/auto_source/scraper/schema/item_list.rb +2 -2
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -3
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +48 -8
- data/lib/html2rss/auto_source/scraper/schema.rb +12 -8
- data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +199 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +84 -79
- data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +261 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +134 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api.rb +179 -0
- data/lib/html2rss/auto_source/scraper.rb +142 -8
- data/lib/html2rss/auto_source.rb +119 -47
- data/lib/html2rss/blocked_surface.rb +64 -0
- data/lib/html2rss/category_extractor.rb +82 -0
- data/lib/html2rss/cli.rb +170 -23
- data/lib/html2rss/config/class_methods.rb +189 -0
- data/lib/html2rss/config/dynamic_params.rb +68 -0
- data/lib/html2rss/config/multiple_feeds_config.rb +50 -0
- data/lib/html2rss/config/request_headers.rb +130 -0
- data/lib/html2rss/config/schema.rb +208 -0
- data/lib/html2rss/config/validator.rb +108 -0
- data/lib/html2rss/config.rb +112 -61
- data/lib/html2rss/error.rb +6 -0
- data/lib/html2rss/html_extractor/date_extractor.rb +19 -0
- data/lib/html2rss/html_extractor/enclosure_extractor.rb +101 -0
- data/lib/html2rss/html_extractor/image_extractor.rb +49 -0
- data/lib/html2rss/html_extractor.rb +136 -0
- data/lib/html2rss/html_navigator.rb +46 -0
- data/lib/html2rss/json_feed_builder/item.rb +94 -0
- data/lib/html2rss/json_feed_builder.rb +58 -0
- data/lib/html2rss/rendering/audio_renderer.rb +31 -0
- data/lib/html2rss/rendering/description_builder.rb +88 -0
- data/lib/html2rss/rendering/image_renderer.rb +31 -0
- data/lib/html2rss/rendering/media_renderer.rb +33 -0
- data/lib/html2rss/rendering/pdf_renderer.rb +28 -0
- data/lib/html2rss/rendering/video_renderer.rb +31 -0
- data/lib/html2rss/rendering.rb +14 -0
- data/lib/html2rss/request_controls.rb +128 -0
- data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
- data/lib/html2rss/request_service/budget.rb +39 -0
- data/lib/html2rss/request_service/context.rb +64 -20
- data/lib/html2rss/request_service/faraday_strategy.rb +135 -5
- data/lib/html2rss/request_service/policy.rb +248 -0
- data/lib/html2rss/request_service/puppet_commander.rb +212 -13
- data/lib/html2rss/request_service/response.rb +42 -2
- data/lib/html2rss/request_service/response_guard.rb +62 -0
- data/lib/html2rss/request_service.rb +31 -15
- data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
- data/lib/html2rss/request_session/runtime_input.rb +57 -0
- data/lib/html2rss/request_session/runtime_policy.rb +76 -0
- data/lib/html2rss/request_session.rb +118 -0
- data/lib/html2rss/rss_builder/article.rb +166 -0
- data/lib/html2rss/rss_builder/channel.rb +96 -11
- data/lib/html2rss/rss_builder/enclosure.rb +48 -0
- data/lib/html2rss/rss_builder/stylesheet.rb +4 -4
- data/lib/html2rss/rss_builder.rb +72 -71
- data/lib/html2rss/selectors/config.rb +122 -0
- data/lib/html2rss/selectors/extractors/attribute.rb +50 -0
- data/lib/html2rss/selectors/extractors/href.rb +53 -0
- data/lib/html2rss/selectors/extractors/html.rb +48 -0
- data/lib/html2rss/selectors/extractors/static.rb +41 -0
- data/lib/html2rss/selectors/extractors/text.rb +46 -0
- data/lib/html2rss/selectors/extractors.rb +52 -0
- data/lib/html2rss/selectors/object_to_xml_converter.rb +61 -0
- data/lib/html2rss/selectors/post_processors/base.rb +74 -0
- data/lib/html2rss/selectors/post_processors/gsub.rb +85 -0
- data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +45 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +35 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +47 -0
- data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +52 -0
- data/lib/html2rss/selectors/post_processors/parse_time.rb +73 -0
- data/lib/html2rss/selectors/post_processors/parse_uri.rb +40 -0
- data/lib/html2rss/selectors/post_processors/sanitize_html.rb +150 -0
- data/lib/html2rss/selectors/post_processors/substring.rb +74 -0
- data/lib/html2rss/selectors/post_processors/template.rb +73 -0
- data/lib/html2rss/selectors/post_processors.rb +43 -0
- data/lib/html2rss/selectors.rb +294 -0
- data/lib/html2rss/url.rb +262 -0
- data/lib/html2rss/version.rb +1 -1
- data/lib/html2rss.rb +129 -70
- data/lib/tasks/config_schema.rake +17 -0
- data/schema/html2rss-config.schema.json +469 -0
- metadata +115 -38
- data/lib/html2rss/attribute_post_processors/base.rb +0 -74
- data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
- data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
- data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
- data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
- data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
- data/lib/html2rss/attribute_post_processors/template.rb +0 -101
- data/lib/html2rss/attribute_post_processors.rb +0 -44
- data/lib/html2rss/auto_source/article.rb +0 -127
- data/lib/html2rss/auto_source/channel.rb +0 -78
- data/lib/html2rss/auto_source/reducer.rb +0 -48
- data/lib/html2rss/auto_source/rss_builder.rb +0 -70
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
- data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
- data/lib/html2rss/config/channel.rb +0 -125
- data/lib/html2rss/config/selectors.rb +0 -103
- data/lib/html2rss/item.rb +0 -186
- data/lib/html2rss/item_extractors/attribute.rb +0 -50
- data/lib/html2rss/item_extractors/href.rb +0 -52
- data/lib/html2rss/item_extractors/html.rb +0 -46
- data/lib/html2rss/item_extractors/static.rb +0 -39
- data/lib/html2rss/item_extractors/text.rb +0 -44
- data/lib/html2rss/item_extractors.rb +0 -88
- data/lib/html2rss/object_to_xml_converter.rb +0 -56
- data/lib/html2rss/rss_builder/item.rb +0 -83
- data/lib/html2rss/utils.rb +0 -113
data/exe/html2rss
CHANGED
data/html2rss.gemspec
CHANGED
|
@@ -26,15 +26,18 @@ Gem::Specification.new do |spec|
|
|
|
26
26
|
end
|
|
27
27
|
|
|
28
28
|
spec.files = `git ls-files -z`.split("\x0").select do |f|
|
|
29
|
-
f.match(%r{^(lib/|exe/|README.md|LICENSE|html2rss.gemspec)})
|
|
29
|
+
f.match(%r{^(lib/|exe/|schema/|README.md|LICENSE|html2rss.gemspec)})
|
|
30
30
|
end
|
|
31
31
|
spec.bindir = 'exe'
|
|
32
32
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
|
33
33
|
spec.require_paths = ['lib']
|
|
34
34
|
|
|
35
35
|
spec.add_dependency 'addressable', '~> 2.7'
|
|
36
|
+
spec.add_dependency 'brotli'
|
|
37
|
+
spec.add_dependency 'dry-validation'
|
|
36
38
|
spec.add_dependency 'faraday', '> 2.0.1', '< 3.0'
|
|
37
39
|
spec.add_dependency 'faraday-follow_redirects'
|
|
40
|
+
spec.add_dependency 'faraday-gzip', '~> 3'
|
|
38
41
|
spec.add_dependency 'kramdown'
|
|
39
42
|
spec.add_dependency 'mime-types', '> 3.0'
|
|
40
43
|
spec.add_dependency 'nokogiri', '>= 1.10', '< 2.0'
|
|
@@ -43,7 +46,7 @@ Gem::Specification.new do |spec|
|
|
|
43
46
|
spec.add_dependency 'regexp_parser'
|
|
44
47
|
spec.add_dependency 'reverse_markdown', '~> 3.0'
|
|
45
48
|
spec.add_dependency 'rss'
|
|
46
|
-
spec.add_dependency 'sanitize'
|
|
49
|
+
spec.add_dependency 'sanitize'
|
|
47
50
|
spec.add_dependency 'thor'
|
|
48
51
|
spec.add_dependency 'tzinfo'
|
|
49
52
|
spec.add_dependency 'zeitwerk'
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'set' # rubocop:disable Lint/RedundantRequireStatement
|
|
4
|
+
|
|
5
|
+
module Html2rss
|
|
6
|
+
module Articles
|
|
7
|
+
##
|
|
8
|
+
# Deduplicates a list of articles while preserving their original order.
|
|
9
|
+
#
|
|
10
|
+
# The deduplicator prefers each article's URL (combined with its ID when
|
|
11
|
+
# available) to determine uniqueness. When no URL is present, it falls
|
|
12
|
+
# back to the article ID, then to the GUID enriched with title and
|
|
13
|
+
# description metadata. If none of these identifiers are available it
|
|
14
|
+
# defaults to the article object's hash to preserve the original entry.
|
|
15
|
+
class Deduplicator
|
|
16
|
+
##
|
|
17
|
+
# @param articles [Array<Html2rss::RssBuilder::Article>]
|
|
18
|
+
# @raise [ArgumentError] if articles are not provided
|
|
19
|
+
def initialize(articles)
|
|
20
|
+
raise ArgumentError, 'articles must be provided' unless articles
|
|
21
|
+
|
|
22
|
+
@articles = articles
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
##
|
|
26
|
+
# Returns the list of unique articles, preserving the order of the
|
|
27
|
+
# original collection and keeping the first occurrence of a duplicate.
|
|
28
|
+
# @return [Array<Html2rss::RssBuilder::Article>]
|
|
29
|
+
def call
|
|
30
|
+
seen = Set.new
|
|
31
|
+
|
|
32
|
+
articles.filter do |article|
|
|
33
|
+
fingerprint = deduplication_fingerprint_for(article) || article.hash
|
|
34
|
+
seen.add?(fingerprint)
|
|
35
|
+
end
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
private
|
|
39
|
+
|
|
40
|
+
attr_reader :articles
|
|
41
|
+
|
|
42
|
+
def deduplication_fingerprint_for(article)
|
|
43
|
+
return unless article.respond_to?(:deduplication_fingerprint)
|
|
44
|
+
|
|
45
|
+
article.deduplication_fingerprint
|
|
46
|
+
end
|
|
47
|
+
end
|
|
48
|
+
end
|
|
49
|
+
end
|
|
@@ -7,8 +7,15 @@ module Html2rss
|
|
|
7
7
|
# :reek:MissingSafeMethod { enabled: false }
|
|
8
8
|
# It applies various strategies to filter and refine the article list.
|
|
9
9
|
class Cleanup
|
|
10
|
+
DEFAULT_CONFIG = {
|
|
11
|
+
keep_different_domain: false,
|
|
12
|
+
min_words_title: 3
|
|
13
|
+
}.freeze
|
|
14
|
+
|
|
15
|
+
VALID_SCHEMES = %w[http https].to_set.freeze
|
|
16
|
+
|
|
10
17
|
class << self
|
|
11
|
-
def call(articles, url:, keep_different_domain:
|
|
18
|
+
def call(articles, url:, keep_different_domain:, min_words_title:)
|
|
12
19
|
Log.debug "Cleanup: start with #{articles.size} articles"
|
|
13
20
|
|
|
14
21
|
articles.select!(&:valid?)
|
|
@@ -17,13 +24,12 @@ module Html2rss
|
|
|
17
24
|
|
|
18
25
|
keep_only_http_urls!(articles)
|
|
19
26
|
reject_different_domain!(articles, url) unless keep_different_domain
|
|
27
|
+
keep_only_with_min_words_title!(articles, min_words_title:)
|
|
20
28
|
|
|
21
29
|
Log.debug "Cleanup: end with #{articles.size} articles"
|
|
22
30
|
articles
|
|
23
31
|
end
|
|
24
32
|
|
|
25
|
-
private
|
|
26
|
-
|
|
27
33
|
##
|
|
28
34
|
# Deduplicates articles by a given key.
|
|
29
35
|
#
|
|
@@ -42,18 +48,40 @@ module Html2rss
|
|
|
42
48
|
#
|
|
43
49
|
# @param articles [Array<Article>] The list of articles to process.
|
|
44
50
|
def keep_only_http_urls!(articles)
|
|
45
|
-
articles.select! { |article|
|
|
51
|
+
articles.select! { |article| VALID_SCHEMES.include?(article.url&.scheme) }
|
|
46
52
|
end
|
|
47
53
|
|
|
48
54
|
##
|
|
49
55
|
# Rejects articles that have a URL not on the same domain as the source.
|
|
50
56
|
#
|
|
51
57
|
# @param articles [Array<Article>] The list of articles to process.
|
|
52
|
-
# @param base_url [
|
|
58
|
+
# @param base_url [Html2rss::Url] The source URL to compare against.
|
|
53
59
|
def reject_different_domain!(articles, base_url)
|
|
54
60
|
base_host = base_url.host
|
|
55
61
|
articles.select! { |article| article.url&.host == base_host }
|
|
56
62
|
end
|
|
63
|
+
|
|
64
|
+
##
|
|
65
|
+
# Keeps only articles with a title that is present and has at least `min_words_title` words.
|
|
66
|
+
#
|
|
67
|
+
# @param articles [Array<Article>] The list of articles to process.
|
|
68
|
+
# @param min_words_title [Integer] The minimum number of words in the title.
|
|
69
|
+
def keep_only_with_min_words_title!(articles, min_words_title:)
|
|
70
|
+
articles.select! do |article|
|
|
71
|
+
article.title ? word_count_at_least?(article.title, min_words_title) : true
|
|
72
|
+
end
|
|
73
|
+
end
|
|
74
|
+
|
|
75
|
+
private
|
|
76
|
+
|
|
77
|
+
def word_count_at_least?(str, min_words)
|
|
78
|
+
count = 0
|
|
79
|
+
str.to_s.scan(/\p{Alnum}+/) do
|
|
80
|
+
count += 1
|
|
81
|
+
return true if count >= min_words
|
|
82
|
+
end
|
|
83
|
+
false
|
|
84
|
+
end
|
|
57
85
|
end
|
|
58
86
|
end
|
|
59
87
|
end
|
|
@@ -6,34 +6,58 @@ module Html2rss
|
|
|
6
6
|
class AutoSource
|
|
7
7
|
module Scraper
|
|
8
8
|
##
|
|
9
|
-
# Scrapes
|
|
10
|
-
#
|
|
9
|
+
# Scrapes article-like blocks from plain HTML by looking for repeated link
|
|
10
|
+
# structures when richer structured data is unavailable.
|
|
11
|
+
#
|
|
12
|
+
# The approach is intentionally heuristic:
|
|
13
|
+
# 1. collect repeated anchor paths
|
|
14
|
+
# 2. walk upward to a shared container shape
|
|
15
|
+
# 3. extract the best anchor found inside each container
|
|
16
|
+
#
|
|
17
|
+
# This scraper is broader and noisier than `SemanticHtml`, so it acts as a
|
|
18
|
+
# fallback for pages without stronger semantic signals.
|
|
11
19
|
class Html
|
|
12
20
|
include Enumerable
|
|
13
21
|
|
|
14
|
-
TAGS_TO_IGNORE = /(nav|footer|header)/i
|
|
22
|
+
TAGS_TO_IGNORE = /(nav|footer|header|svg|script|style)/i
|
|
15
23
|
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
end
|
|
24
|
+
DEFAULT_MINIMUM_SELECTOR_FREQUENCY = 2
|
|
25
|
+
DEFAULT_USE_TOP_SELECTORS = 5
|
|
19
26
|
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
27
|
+
##
|
|
28
|
+
# @return [Symbol] config key used to enable or configure this scraper
|
|
29
|
+
def self.options_key = :html
|
|
23
30
|
|
|
24
|
-
|
|
31
|
+
##
|
|
32
|
+
# Probes whether the document appears to contain repeated anchor
|
|
33
|
+
# structures that this fallback scraper can cluster into article-like
|
|
34
|
+
# containers.
|
|
35
|
+
#
|
|
36
|
+
# @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
|
|
37
|
+
# @return [Boolean] true when the scraper can likely extract articles
|
|
38
|
+
def self.articles?(parsed_body)
|
|
39
|
+
new(parsed_body, url: '').any?
|
|
25
40
|
end
|
|
26
41
|
|
|
27
42
|
##
|
|
28
43
|
# Simplify an XPath selector by removing the index notation.
|
|
44
|
+
# This keeps repeated anchor paths comparable across sibling blocks.
|
|
45
|
+
#
|
|
46
|
+
# @param xpath [String] original XPath
|
|
47
|
+
# @return [String] XPath without positional indexes
|
|
29
48
|
def self.simplify_xpath(xpath)
|
|
30
49
|
xpath.gsub(/\[\d+\]/, '')
|
|
31
50
|
end
|
|
32
51
|
|
|
33
|
-
|
|
52
|
+
# @param parsed_body [Nokogiri::HTML::Document] The parsed HTML document.
|
|
53
|
+
# @param url [String] The base URL.
|
|
54
|
+
# @param extractor [Class] The extractor class to handle article extraction.
|
|
55
|
+
# @param opts [Hash] Additional options.
|
|
56
|
+
def initialize(parsed_body, url:, extractor: HtmlExtractor, **opts)
|
|
34
57
|
@parsed_body = parsed_body
|
|
35
58
|
@url = url
|
|
36
|
-
@
|
|
59
|
+
@extractor = extractor
|
|
60
|
+
@opts = opts
|
|
37
61
|
end
|
|
38
62
|
|
|
39
63
|
attr_reader :parsed_body
|
|
@@ -44,51 +68,102 @@ module Html2rss
|
|
|
44
68
|
def each
|
|
45
69
|
return enum_for(:each) unless block_given?
|
|
46
70
|
|
|
47
|
-
|
|
71
|
+
each_article_tag do |article_tag|
|
|
72
|
+
article_hash = extract_article(article_tag)
|
|
73
|
+
yield article_hash if article_hash
|
|
74
|
+
end
|
|
75
|
+
end
|
|
48
76
|
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
77
|
+
##
|
|
78
|
+
# Decides whether a traversed node has reached a useful article-like
|
|
79
|
+
# boundary for the generic HTML scraper.
|
|
80
|
+
#
|
|
81
|
+
# The predicate prefers containers that add surrounding link context,
|
|
82
|
+
# which helps the scraper move from a leaf anchor toward a repeated
|
|
83
|
+
# teaser/card wrapper.
|
|
84
|
+
#
|
|
85
|
+
# @param node [Nokogiri::XML::Node] candidate boundary node
|
|
86
|
+
# @return [Boolean] true when the node is a good extraction boundary
|
|
87
|
+
def article_tag_condition?(node)
|
|
88
|
+
# Ignore tags that are below a tag which is in TAGS_TO_IGNORE.
|
|
89
|
+
return false if node.path.match?(TAGS_TO_IGNORE)
|
|
90
|
+
return true if %w[body html].include?(node.name)
|
|
91
|
+
return false unless (parent = node.parent)
|
|
52
92
|
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
93
|
+
anchor_count(parent) > anchor_count(node)
|
|
94
|
+
end
|
|
95
|
+
|
|
96
|
+
private
|
|
97
|
+
|
|
98
|
+
##
|
|
99
|
+
# Find relevant anchors in root.
|
|
100
|
+
# @return [Set<String>] The set of XPath selectors
|
|
101
|
+
def selectors
|
|
102
|
+
@selectors ||= Hash.new(0).tap do |selectors|
|
|
103
|
+
each_relevant_anchor { |node| increment_selector_count(selectors, node) }
|
|
57
104
|
end
|
|
58
105
|
end
|
|
59
106
|
|
|
60
107
|
##
|
|
61
|
-
#
|
|
62
|
-
# @
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
108
|
+
# Filter the frequent selectors by the minimum_selector_frequency and use_top_selectors.
|
|
109
|
+
# @return [Array<String>] The filtered selectors
|
|
110
|
+
def filtered_selectors
|
|
111
|
+
selectors.select { |_selector, count| count >= minimum_selector_frequency }
|
|
112
|
+
.max_by(use_top_selectors, &:last)
|
|
113
|
+
.map(&:first)
|
|
114
|
+
end
|
|
115
|
+
|
|
116
|
+
def minimum_selector_frequency = @opts[:minimum_selector_frequency] || DEFAULT_MINIMUM_SELECTOR_FREQUENCY
|
|
117
|
+
def use_top_selectors = @opts[:use_top_selectors] || DEFAULT_USE_TOP_SELECTORS
|
|
118
|
+
|
|
119
|
+
def anchor_count(node)
|
|
120
|
+
@anchor_counts ||= {}
|
|
121
|
+
@anchor_counts[node.path] ||= node.name == 'a' ? 1 : node.css('a').size
|
|
122
|
+
end
|
|
123
|
+
|
|
124
|
+
def each_relevant_anchor
|
|
125
|
+
return enum_for(:each_relevant_anchor) unless block_given?
|
|
71
126
|
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
.to_set
|
|
127
|
+
traversal_root&.traverse do |node|
|
|
128
|
+
yield node if relevant_anchor?(node)
|
|
75
129
|
end
|
|
76
130
|
end
|
|
77
131
|
|
|
78
|
-
def
|
|
79
|
-
|
|
80
|
-
|
|
132
|
+
def relevant_anchor?(node)
|
|
133
|
+
node.element? && node.name == 'a' && !String(node['href']).empty?
|
|
134
|
+
end
|
|
81
135
|
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
136
|
+
def increment_selector_count(selectors, node)
|
|
137
|
+
path = self.class.simplify_xpath(node.path)
|
|
138
|
+
selectors[path] += 1 unless path.match?(TAGS_TO_IGNORE)
|
|
139
|
+
end
|
|
86
140
|
|
|
87
|
-
|
|
141
|
+
def traversal_root
|
|
142
|
+
parsed_body.at_css('body, html') || parsed_body.root
|
|
143
|
+
end
|
|
144
|
+
|
|
145
|
+
def each_article_tag
|
|
146
|
+
return enum_for(:each_article_tag) unless block_given?
|
|
147
|
+
|
|
148
|
+
filtered_selectors.each do |selector|
|
|
149
|
+
parsed_body.xpath(selector).each do |selected_tag|
|
|
150
|
+
article_tag = article_tag_for(selected_tag)
|
|
151
|
+
yield article_tag if article_tag
|
|
152
|
+
end
|
|
153
|
+
end
|
|
154
|
+
end
|
|
155
|
+
|
|
156
|
+
def article_tag_for(selected_tag)
|
|
157
|
+
return if selected_tag.path.match?(Html::TAGS_TO_IGNORE)
|
|
158
|
+
|
|
159
|
+
HtmlNavigator.parent_until_condition(selected_tag, method(:article_tag_condition?))
|
|
160
|
+
end
|
|
88
161
|
|
|
89
|
-
|
|
162
|
+
def extract_article(article_tag)
|
|
163
|
+
selected_anchor = HtmlExtractor.main_anchor_for(article_tag)
|
|
164
|
+
return unless selected_anchor
|
|
90
165
|
|
|
91
|
-
|
|
166
|
+
@extractor.new(article_tag, base_url: @url, selected_anchor:).call
|
|
92
167
|
end
|
|
93
168
|
end
|
|
94
169
|
end
|