html2rss 0.17.0 → 0.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +90 -639
- data/exe/html2rss +1 -1
- data/html2rss.gemspec +5 -2
- data/lib/html2rss/articles/deduplicator.rb +50 -0
- data/lib/html2rss/auto_source/cleanup.rb +44 -5
- data/lib/html2rss/auto_source/scraper/html.rb +123 -43
- data/lib/html2rss/auto_source/scraper/json_state.rb +457 -0
- data/lib/html2rss/auto_source/scraper/microdata.rb +505 -0
- data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
- data/lib/html2rss/auto_source/scraper/schema/item_list.rb +3 -2
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +6 -4
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +69 -8
- data/lib/html2rss/auto_source/scraper/schema.rb +27 -12
- data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +204 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +88 -79
- data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +311 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +135 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api.rb +186 -0
- data/lib/html2rss/auto_source/scraper.rb +160 -8
- data/lib/html2rss/auto_source.rb +123 -47
- data/lib/html2rss/blocked_surface.rb +65 -0
- data/lib/html2rss/category_extractor.rb +82 -0
- data/lib/html2rss/cli.rb +194 -23
- data/lib/html2rss/config/class_methods.rb +178 -0
- data/lib/html2rss/config/dynamic_params.rb +70 -0
- data/lib/html2rss/config/multiple_feeds_config.rb +51 -0
- data/lib/html2rss/config/request_headers.rb +136 -0
- data/lib/html2rss/config/schema.rb +240 -0
- data/lib/html2rss/config/validator.rb +146 -0
- data/lib/html2rss/config.rb +118 -61
- data/lib/html2rss/error.rb +31 -0
- data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
- data/lib/html2rss/feed_pipeline.rb +127 -0
- data/lib/html2rss/hash_util.rb +101 -0
- data/lib/html2rss/html_extractor/date_extractor.rb +20 -0
- data/lib/html2rss/html_extractor/enclosure_extractor.rb +120 -0
- data/lib/html2rss/html_extractor/image_extractor.rb +58 -0
- data/lib/html2rss/html_extractor.rb +141 -0
- data/lib/html2rss/html_navigator.rb +54 -0
- data/lib/html2rss/json_feed_builder/item.rb +94 -0
- data/lib/html2rss/json_feed_builder.rb +59 -0
- data/lib/html2rss/rendering/audio_renderer.rb +36 -0
- data/lib/html2rss/rendering/description_builder.rb +87 -0
- data/lib/html2rss/rendering/image_renderer.rb +41 -0
- data/lib/html2rss/rendering/media_renderer.rb +37 -0
- data/lib/html2rss/rendering/pdf_renderer.rb +34 -0
- data/lib/html2rss/rendering/video_renderer.rb +36 -0
- data/lib/html2rss/rendering.rb +23 -0
- data/lib/html2rss/request_controls.rb +123 -0
- data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
- data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
- data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
- data/lib/html2rss/request_service/budget.rb +39 -0
- data/lib/html2rss/request_service/context.rb +77 -21
- data/lib/html2rss/request_service/faraday_strategy.rb +137 -5
- data/lib/html2rss/request_service/policy.rb +252 -0
- data/lib/html2rss/request_service/puppet_commander.rb +212 -13
- data/lib/html2rss/request_service/response.rb +51 -3
- data/lib/html2rss/request_service/response_guard.rb +62 -0
- data/lib/html2rss/request_service.rb +50 -15
- data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
- data/lib/html2rss/request_session/runtime_input.rb +71 -0
- data/lib/html2rss/request_session/runtime_policy.rb +83 -0
- data/lib/html2rss/request_session.rb +122 -0
- data/lib/html2rss/rss_builder/article.rb +187 -0
- data/lib/html2rss/rss_builder/channel.rb +105 -11
- data/lib/html2rss/rss_builder/enclosure.rb +62 -0
- data/lib/html2rss/rss_builder/stylesheet.rb +8 -4
- data/lib/html2rss/rss_builder.rb +76 -71
- data/lib/html2rss/selectors/config.rb +123 -0
- data/lib/html2rss/selectors/extractors/attribute.rb +52 -0
- data/lib/html2rss/selectors/extractors/href.rb +55 -0
- data/lib/html2rss/selectors/extractors/html.rb +49 -0
- data/lib/html2rss/selectors/extractors/static.rb +42 -0
- data/lib/html2rss/selectors/extractors/text.rb +47 -0
- data/lib/html2rss/selectors/extractors.rb +53 -0
- data/lib/html2rss/selectors/object_to_xml_converter.rb +62 -0
- data/lib/html2rss/selectors/post_processors/base.rb +80 -0
- data/lib/html2rss/selectors/post_processors/gsub.rb +88 -0
- data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +48 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +44 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +53 -0
- data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +55 -0
- data/lib/html2rss/selectors/post_processors/parse_time.rb +78 -0
- data/lib/html2rss/selectors/post_processors/parse_uri.rb +43 -0
- data/lib/html2rss/selectors/post_processors/sanitize_html.rb +154 -0
- data/lib/html2rss/selectors/post_processors/substring.rb +77 -0
- data/lib/html2rss/selectors/post_processors/template.rb +76 -0
- data/lib/html2rss/selectors/post_processors.rb +48 -0
- data/lib/html2rss/selectors.rb +301 -0
- data/lib/html2rss/url.rb +266 -0
- data/lib/html2rss/version.rb +2 -1
- data/lib/html2rss.rb +67 -71
- data/lib/tasks/config_schema.rake +17 -0
- data/schema/html2rss-config.schema.json +551 -0
- metadata +120 -38
- data/lib/html2rss/attribute_post_processors/base.rb +0 -74
- data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
- data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
- data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
- data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
- data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
- data/lib/html2rss/attribute_post_processors/template.rb +0 -101
- data/lib/html2rss/attribute_post_processors.rb +0 -44
- data/lib/html2rss/auto_source/article.rb +0 -127
- data/lib/html2rss/auto_source/channel.rb +0 -78
- data/lib/html2rss/auto_source/reducer.rb +0 -48
- data/lib/html2rss/auto_source/rss_builder.rb +0 -70
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
- data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
- data/lib/html2rss/config/channel.rb +0 -125
- data/lib/html2rss/config/selectors.rb +0 -103
- data/lib/html2rss/item.rb +0 -186
- data/lib/html2rss/item_extractors/attribute.rb +0 -50
- data/lib/html2rss/item_extractors/href.rb +0 -52
- data/lib/html2rss/item_extractors/html.rb +0 -46
- data/lib/html2rss/item_extractors/static.rb +0 -39
- data/lib/html2rss/item_extractors/text.rb +0 -44
- data/lib/html2rss/item_extractors.rb +0 -88
- data/lib/html2rss/object_to_xml_converter.rb +0 -56
- data/lib/html2rss/rss_builder/item.rb +0 -83
- data/lib/html2rss/utils.rb +0 -113
|
@@ -11,6 +11,7 @@ module Html2rss
|
|
|
11
11
|
#
|
|
12
12
|
# @see https://schema.org/Thing
|
|
13
13
|
class Thing
|
|
14
|
+
# Supported Schema.org `@type` values mapped to article extraction.
|
|
14
15
|
SUPPORTED_TYPES = %w[
|
|
15
16
|
AdvertiserContentArticle
|
|
16
17
|
AnalysisNewsArticle
|
|
@@ -32,11 +33,14 @@ module Html2rss
|
|
|
32
33
|
TechArticle
|
|
33
34
|
].to_set.freeze
|
|
34
35
|
|
|
35
|
-
|
|
36
|
+
# Attributes exposed by `#call` in generated article hashes.
|
|
37
|
+
DEFAULT_ATTRIBUTES = %i[id title description url image published_at categories].freeze
|
|
36
38
|
|
|
39
|
+
# Keeps the parsed schema object and the page URL after passing the latter
# through `normalized_base_url`.
#
# @param schema_object [Hash{Symbol => Object}] parsed schema.org object
# @param url [String, Html2rss::Url, nil] base URL used for relative normalization
def initialize(schema_object, url:)
  @base_url = normalized_base_url(url)
  @schema_object = schema_object
end
|
|
41
45
|
|
|
42
46
|
# @return [Hash] the scraped article hash with DEFAULT_ATTRIBUTES
|
|
@@ -46,24 +50,27 @@ module Html2rss
|
|
|
46
50
|
end
|
|
47
51
|
end
|
|
48
52
|
|
|
53
|
+
# Returns the identifier of this schema object, computing it at most once.
#
# The `@id` value is normalized against the object's own URL (or the page
# URL when the object has none). When that yields nothing, the path of the
# object's URL is used. An empty result becomes nil.
#
# The outcome is memoized even when it is nil, so repeated calls do not
# re-run the URL normalization.
#
# @return [String, nil] stable schema object identifier
def id
  return @id if defined?(@id)

  candidate = normalized_id(schema_object[:@id], reference_url: url || base_url) || url&.path.to_s

  # Assign unconditionally so the `defined?` guard also short-circuits for nil.
  @id = candidate.empty? ? nil : candidate
end
|
|
58
63
|
|
|
64
|
+
# Reads the `:title` entry of the schema object.
#
# @return [String, nil] article title
def title
  schema_object[:title]
end
|
|
60
66
|
|
|
67
|
+
# Picks the longest of the schema object's descriptive text fields.
#
# `:articleBody` is the schema.org property name for an article's body.
# The `:schema_object_body` key is not a schema.org property and is kept only
# so objects that already carry it behave as before.
#
# @return [String, nil] longest available description field
def description
  schema_object.values_at(:description, :articleBody, :schema_object_body, :abstract)
               .max_by { |candidate| candidate.to_s.size }
end
|
|
65
72
|
|
|
66
|
-
# @return [
|
|
73
|
+
# @return [Html2rss::Url, nil] the URL of the schema object
|
|
67
74
|
def url
|
|
68
75
|
url = schema_object[:url]
|
|
69
76
|
if url.to_s.empty?
|
|
@@ -71,21 +78,29 @@ module Html2rss
|
|
|
71
78
|
return
|
|
72
79
|
end
|
|
73
80
|
|
|
74
|
-
|
|
81
|
+
Url.from_relative(url, base_url || url)
|
|
75
82
|
end
|
|
76
83
|
|
|
84
|
+
# Resolves the first image candidate against the base URL; without a base URL
# the candidate itself is used as the reference.
#
# @return [Html2rss::Url, nil] normalized article image URL
def image
  first_image = image_urls.first
  return unless first_image

  Url.from_relative(first_image, base_url || first_image)
end
|
|
82
90
|
|
|
91
|
+
# Reads the `:datePublished` entry of the schema object.
#
# @return [String, nil] published-at timestamp string
def published_at
  schema_object[:datePublished]
end
|
|
84
93
|
|
|
85
|
-
|
|
94
|
+
# Runs `CategoryExtractor` on the schema object once and caches the outcome,
# including a nil outcome.
#
# @return [Array<String>, nil] extracted category labels
def categories
  @categories = CategoryExtractor.call(schema_object) unless defined?(@categories)
  @categories
end
|
|
100
|
+
|
|
101
|
+
attr_reader :schema_object, :base_url
|
|
88
102
|
|
|
103
|
+
# @return [Array<String>] normalized image URL candidates
|
|
89
104
|
def image_urls
|
|
90
105
|
schema_object.values_at(:image, :thumbnailUrl).filter_map do |object|
|
|
91
106
|
next unless object
|
|
@@ -97,6 +112,52 @@ module Html2rss
|
|
|
97
112
|
end
|
|
98
113
|
end
|
|
99
114
|
end
|
|
115
|
+
|
|
116
|
+
# Turns a raw `@id` into an identifier string.
#
# Blank values give nil. When the identifier resolves to the same host as
# `reference_url`, it is reduced via `normalized_id_value`; otherwise, or when
# URL parsing raises ArgumentError, the raw text is returned unchanged.
#
# @param value [String, Symbol, nil] candidate schema identifier
# @param reference_url [Html2rss::Url, nil] URL used for same-origin normalization
# @return [String, nil] normalized identifier value
def normalized_id(value, reference_url:)
  text = value.to_s
  return if text.empty?

  begin
    candidate = normalized_id_url(text, reference_url: reference_url)
    same_host = reference_url && candidate.host == reference_url.host
    same_host ? normalized_id_value(candidate) : text
  rescue ArgumentError
    text
  end
end
|
|
130
|
+
|
|
131
|
+
# Parses identifier text into a URL. Text starting with "/" is resolved
# relative to `reference_url` (or to itself when no reference is given);
# anything else is parsed as an absolute URL.
#
# @param text [String] raw identifier text
# @param reference_url [Html2rss::Url, nil] URL used to resolve relative IDs
# @return [Html2rss::Url] normalized identifier URL
def normalized_id_url(text, reference_url:)
  return Url.from_absolute(text) unless text.start_with?('/')

  Url.from_relative(text, reference_url || text)
end
|
|
141
|
+
|
|
142
|
+
# Reduces an identifier URL to its path, appending the query string only when
# the path is empty or just "/". With an empty path and no query, the query
# value itself (nil or empty) is returned.
#
# @param url [Html2rss::Url] normalized identifier URL
# @return [String, nil] path/query portion used as stable ID
def normalized_id_value(url)
  path = url.path.to_s
  query = url.query
  root_like = path.empty? || path == '/'

  return "#{path}?#{query}" if root_like && !query.to_s.empty?

  path.empty? ? query : path
end
|
|
151
|
+
|
|
152
|
+
# Parses the page URL as an absolute URL. Blank input, or input for which
# `Url.from_absolute` raises ArgumentError, gives nil.
#
# @param url [String, Html2rss::Url, nil] candidate page URL
# @return [Html2rss::Url, nil] normalized absolute URL for schema resolution
def normalized_base_url(url)
  url.to_s.strip.empty? ? nil : Url.from_absolute(url)
rescue ArgumentError
  nil
end
|
|
100
161
|
end
|
|
101
162
|
end
|
|
102
163
|
end
|
|
@@ -8,24 +8,31 @@ module Html2rss
|
|
|
8
8
|
module Scraper
|
|
9
9
|
##
|
|
10
10
|
# Scrapes articles from Schema.org objects, by looking for the objects in:
|
|
11
|
-
|
|
12
11
|
# <script type="application/ld+json"> "schema" tags.
|
|
13
12
|
#
|
|
14
|
-
#
|
|
15
|
-
#
|
|
16
|
-
# 2. https://developers.google.com/search/docs/appearance/structured-data/article#microdata
|
|
13
|
+
# @see https://schema.org/docs/full.html
|
|
14
|
+
# @see https://developers.google.com/search/docs/appearance/structured-data/article#microdata
|
|
17
15
|
class Schema
|
|
18
16
|
include Enumerable
|
|
19
17
|
|
|
18
|
+
# Selector for JSON-LD script tags containing Schema.org objects.
|
|
20
19
|
TAG_SELECTOR = 'script[type="application/ld+json"]'
|
|
21
20
|
|
|
21
|
+
# @return [Symbol] scraper config key
def self.options_key
  :schema
end
|
|
23
|
+
|
|
22
24
|
class << self
|
|
25
|
+
# Checks every JSON-LD script tag in the document for a supported schema type.
#
# @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
# @return [Boolean] whether the page includes supported schema types
def articles?(parsed_body)
  script_tags = parsed_body.css(TAG_SELECTOR)
  script_tags.any? { |tag| supported_schema_type?(tag) }
end
|
|
30
|
+
|
|
31
|
+
# Tells whether a JSON-LD script tag mentions a supported `@type`.
#
# The tag text is read once and scanned in a single pass for `"@type": "…"`
# pairs; each captured type name is then looked up among the supported types.
# This avoids re-reading the text and building a new regexp for every
# supported type.
#
# @param script [Nokogiri::XML::Element] schema JSON-LD script tag
# @return [Boolean] whether the tag references a supported schema type
def supported_schema_type?(script)
  supported_types = Thing::SUPPORTED_TYPES | ItemList::SUPPORTED_TYPES

  script.text.scan(/"@type"\s*:\s*"([^"]*)"/).any? { |(type)| supported_types.include?(type) }
end
|
|
30
37
|
|
|
31
38
|
##
|
|
@@ -49,11 +56,14 @@ module Html2rss
|
|
|
49
56
|
end
|
|
50
57
|
end
|
|
51
58
|
|
|
59
|
+
# Converts the lookup done by `scraper_for_schema_object` into a strict boolean.
#
# @param object [Hash{Symbol => Object}] schema candidate object
# @return [Boolean] whether an extractor exists for the candidate object
def supported_schema_object?(object)
  return true if scraper_for_schema_object(object)

  false
end
|
|
55
64
|
|
|
56
65
|
##
|
|
66
|
+
# @param schema_object [Hash{Symbol => Object}] schema object with an @type key
|
|
57
67
|
# @return [Scraper::Schema::Thing, Scraper::Schema::ItemList, nil] a class responding to `#call`
|
|
58
68
|
def scraper_for_schema_object(schema_object)
|
|
59
69
|
type = schema_object[:@type]
|
|
@@ -63,7 +73,7 @@ module Html2rss
|
|
|
63
73
|
elsif ItemList::SUPPORTED_TYPES.member?(type)
|
|
64
74
|
ItemList
|
|
65
75
|
else
|
|
66
|
-
Log.
|
|
76
|
+
Log.debug("#{name}: unsupported schema object @type=#{type.inspect}")
|
|
67
77
|
nil
|
|
68
78
|
end
|
|
69
79
|
end
|
|
@@ -73,14 +83,19 @@ module Html2rss
|
|
|
73
83
|
# Parses the text of a script tag as JSON with symbolized keys. A parse
# failure is logged as a warning and an empty array is returned instead.
#
# @param script_tag [#text] script tag whose text holds the JSON document
# @return [Hash, Array] parsed JSON, or `[]` when the text is not valid JSON
def parse_script_tag(script_tag)
  raw_json = script_tag.text
  JSON.parse(raw_json, symbolize_names: true)
rescue JSON::ParserError => error
  Log.warn("#{name}: failed to parse JSON", error: error.message)
  []
end
|
|
79
89
|
end
|
|
80
90
|
|
|
81
|
-
|
|
91
|
+
# Stores the document, its URL and any extra keyword options unchanged.
#
# @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
# @param url [String, Html2rss::Url] base page URL
# @param opts [Hash] scraper-specific options
# @option opts [Object] :_reserved reserved for future scraper-specific options
def initialize(parsed_body, url:, **opts)
  @parsed_body, @url, @opts = parsed_body, url, opts
end
|
|
85
100
|
|
|
86
101
|
##
|
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
  class AutoSource
    module Scraper
      class SemanticHtml
        ##
        # Selects the best content-like anchor from a semantic container.
        #
        # The selector turns raw DOM anchors into ranked facts so semantic
        # scraping can reason about link intent instead of DOM order. It favors
        # heading-aligned article links and suppresses utility links, duplicate
        # destinations, and weak textless affordances.
        class AnchorSelector # rubocop:disable Metrics/ClassLength
          # Everything recorded about one eligible anchor, including its ranking score.
          AnchorFacts = Data.define(
            :anchor,
            :text,
            :url,
            :destination,
            :segments,
            :meaningful_text,
            :content_like_destination,
            :heading_anchor,
            :heading_text_match,
            :score
          )

          # Comma-separated heading selector used for heading/anchor matching.
          HEADING_SELECTOR = HtmlExtractor::HEADING_TAGS.join(',').freeze
          # Path segments that usually represent utility navigation rather than article content.
          UTILITY_PATH_SEGMENTS = %w[
            about account author category comment comments contact feedback help
            login newsletter profile register search settings share signup subscribe
            topic topics view-all archive archives
            feed feeds
            recommended
            for-you
            preference preferences
            notification notifications
            privacy terms
            cookie cookies
            logout
            user users
          ].to_set.freeze
          # Path segments that signal content-like destinations.
          CONTENT_PATH_SEGMENTS = %w[
            article articles news post posts story stories update updates
          ].to_set.freeze
          # Ancestor tags that usually indicate navigation/utility regions.
          UTILITY_LANDMARK_TAGS = %w[nav aside footer menu].freeze

          # @param base_url [String, Html2rss::Url] page URL used to normalize href destinations
          def initialize(base_url)
            @base_url = base_url
          end

          ##
          # Chooses the single anchor that best represents the story contained
          # in a semantic block.
          #
          # Ranking is scoped to one container at a time. That keeps the logic
          # local, makes duplicate links to the same destination collapse into
          # one candidate, and avoids page-wide heuristics leaking across cards.
          #
          # @param container [Nokogiri::XML::Element] semantic container being evaluated
          # @return [Nokogiri::XML::Element, nil] selected primary anchor or nil when none qualify
          def primary_anchor_for(container)
            facts_for(container).max_by(&:score)&.anchor
          end

          private

          attr_reader :base_url

          # Builds facts for every eligible anchor in the container, keeping
          # only the highest-scoring one per destination.
          #
          # @param container [Nokogiri::XML::Element] semantic container being evaluated
          # @return [Array<AnchorFacts>] one entry per distinct destination
          def facts_for(container)
            heading = heading_for(container)
            heading_text = visible_text(heading)

            container.css(HtmlExtractor::MAIN_ANCHOR_SELECTOR).each_with_object({}) do |anchor, best_by_destination|
              next if anchor.path.match?(Html::TAGS_TO_IGNORE)

              facts = build_facts(anchor, heading, heading_text)
              next unless facts

              keep_stronger_fact(best_by_destination, facts)
            end.values
          end

          # Evaluates one anchor and returns its facts, or nil when the anchor
          # has no usable destination, is ineligible, or is neither inside the
          # heading nor content-like.
          #
          # @param anchor [Nokogiri::XML::Element] candidate anchor
          # @param heading [Nokogiri::XML::Element, nil] first heading of the container
          # @param heading_text [String] visible text of that heading ('' when absent)
          # @return [AnchorFacts, nil]
          def build_facts(anchor, heading, heading_text) # rubocop:disable Metrics/MethodLength
            # Resolve the destination first: an anchor without a usable href is
            # dropped before any text or ancestor work is done for it.
            url = normalized_destination(anchor)
            return unless url

            text = visible_text(anchor)
            meaningful_text = meaningful_text?(text)
            ancestors = anchor.ancestors.to_a
            segments = url.path_segments
            return if ineligible_anchor?(anchor, ancestors, text, meaningful_text, segments)

            content_like_destination = content_like_destination?(segments)
            heading_anchor = heading_anchor?(ancestors, heading)
            heading_text_match = heading_text_match?(heading_text, text, meaningful_text)
            return unless heading_anchor || content_like_anchor?(meaningful_text, content_like_destination)

            AnchorFacts.new(
              anchor:,
              text:,
              url:,
              destination: url.to_s,
              segments:,
              meaningful_text:,
              content_like_destination:,
              heading_anchor:,
              heading_text_match:,
              score: score_anchor(meaningful_text, content_like_destination, heading_anchor, heading_text_match)
            )
          end

          # An anchor is ineligible when its destination or text looks like
          # utility navigation, when it is an image-only link, or when it sits
          # inside a utility landmark element.
          #
          # @return [Boolean, Object] truthy when the anchor must be skipped
          def ineligible_anchor?(anchor, ancestors, text, meaningful_text, segments)
            utility_destination?(segments) ||
              utility_text?(text) ||
              icon_only_anchor?(anchor, meaningful_text) ||
              utility_landmark_anchor?(ancestors)
          end

          # Stores `facts` under its destination unless an entry with an equal
          # or higher score is already present (the earlier anchor wins ties).
          #
          # @param best_by_destination [Hash{String => AnchorFacts}] accumulator, mutated in place
          # @param facts [AnchorFacts] newly built facts
          def keep_stronger_fact(best_by_destination, facts)
            current = best_by_destination[facts.destination]
            return best_by_destination[facts.destination] = facts unless current
            return if current.score >= facts.score

            best_by_destination[facts.destination] = facts
          end

          # @return [Boolean] whether either content signal is present
          def content_like_anchor?(meaningful_text, content_like_destination)
            meaningful_text || content_like_destination
          end

          # Sums the weights of the signals that are present: 100 for sitting
          # inside the heading, 20 for matching the heading text, 10 each for
          # meaningful text and a content-like destination.
          #
          # @return [Integer] ranking score
          def score_anchor(meaningful_text, content_like_destination, heading_anchor, heading_text_match)
            score = 0
            score += 100 if heading_anchor
            score += 20 if heading_text_match
            score += 10 if meaningful_text
            score += 10 if content_like_destination
            score
          end

          # @return [Boolean, nil] whether the heading is among the anchor's ancestors
          def heading_anchor?(ancestors, heading)
            heading && ancestors.include?(heading)
          end

          # @return [Boolean] whether both texts are meaningful and identical
          def heading_text_match?(heading_text, text, meaningful_text)
            meaningful_text && meaningful_text?(heading_text) && heading_text == text
          end

          # @return [Nokogiri::XML::Element, nil] first heading element in the container
          def heading_for(container)
            container.at_css(HEADING_SELECTOR)
          end

          # @return [Object, false, nil] truthy when the anchor has no meaningful text but wraps an img/svg
          def icon_only_anchor?(anchor, meaningful_text)
            !meaningful_text && anchor.at_css('img, svg')
          end

          # @return [Boolean] whether the path is empty or contains a utility segment
          def utility_destination?(segments)
            segments.empty? || segments.any? { |segment| UTILITY_PATH_SEGMENTS.include?(segment) }
          end

          # @return [Boolean] whether any segment is a known content segment or starts with a digit
          def content_like_destination?(segments)
            segments.any? do |segment|
              CONTENT_PATH_SEGMENTS.include?(segment) || segment.match?(/\A\d[\w-]*\z/)
            end
          end

          # Strips the fragment from the anchor's href and resolves the rest
          # against the base URL.
          #
          # @return [Html2rss::Url, nil] nil when the href is blank or raises ArgumentError
          def normalized_destination(anchor)
            href = anchor['href'].to_s.split('#').first.to_s.strip
            return if href.empty?

            Html2rss::Url.from_relative(href, base_url)
          rescue ArgumentError
            nil
          end

          # @return [Boolean] whether the text contains at least one letter or digit
          def meaningful_text?(text)
            # `match?` stops at the first hit and allocates nothing, unlike
            # collecting every alphanumeric run with `scan`.
            text.match?(/\p{Alnum}/)
          end

          # @return [Boolean] whether the text starts with a known utility label
          def utility_text?(text)
            text.match?(
              /\A(about|contact|log in|login|sign up|signup|share|comments?|view all|recommended for you|subscribe)\b/i
            )
          end

          # @return [Boolean] whether any ancestor is a utility landmark element
          def utility_landmark_anchor?(ancestors)
            ancestors.any? { |node| UTILITY_LANDMARK_TAGS.include?(node.name) }
          end

          # @return [String] stripped visible text of the node, '' when the node is nil
          def visible_text(node)
            return '' unless node

            HtmlExtractor.extract_visible_text(node).to_s.strip
          end
        end
      end
    end
  end
end
|
|
@@ -1,115 +1,124 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
require 'parallel'
|
|
3
|
+
require_relative 'semantic_html/anchor_selector'
|
|
5
4
|
|
|
6
5
|
module Html2rss
|
|
7
6
|
class AutoSource
|
|
8
7
|
module Scraper
|
|
9
8
|
##
|
|
10
|
-
# Scrapes
|
|
11
|
-
#
|
|
9
|
+
# Scrapes semantic containers by choosing one primary content link per
|
|
10
|
+
# block before extraction.
|
|
12
11
|
#
|
|
13
|
-
#
|
|
14
|
-
# 1.
|
|
12
|
+
# This scraper is intentionally container-first:
|
|
13
|
+
# 1. collect candidate semantic containers once
|
|
14
|
+
# 2. select the strongest content-like anchor within each container
|
|
15
|
+
# 3. extract fields from the container while honoring that anchor choice
|
|
16
|
+
#
|
|
17
|
+
# The result is lower recall on weak-signal blocks, but much better link
|
|
18
|
+
# quality on modern teaser cards that mix headlines, utility links, and
|
|
19
|
+
# duplicate image overlays.
|
|
15
20
|
class SemanticHtml
|
|
16
21
|
include Enumerable
|
|
17
22
|
|
|
23
|
+
# Container plus selected anchor chosen for extraction.
|
|
24
|
+
Entry = Data.define(:container, :selected_anchor)
|
|
25
|
+
|
|
26
|
+
# Candidate semantic container selectors used to locate extractable blocks.
|
|
27
|
+
CONTAINER_SELECTORS = [
|
|
28
|
+
'article:not(:has(article))',
|
|
29
|
+
'section:not(:has(section))',
|
|
30
|
+
'li:not(:has(li))',
|
|
31
|
+
'tr:not(:has(tr))',
|
|
32
|
+
'div:not(:has(div))'
|
|
33
|
+
].freeze
|
|
34
|
+
|
|
18
35
|
##
|
|
19
|
-
#
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
'article :not(article) a[href]',
|
|
25
|
-
'article a[href]'
|
|
26
|
-
],
|
|
27
|
-
'li' => [
|
|
28
|
-
'ul > li :not(li) a[href]',
|
|
29
|
-
'ol > li :not(li) a[href]'
|
|
30
|
-
]
|
|
31
|
-
}.freeze
|
|
32
|
-
|
|
33
|
-
# Check if the parsed_body contains articles
|
|
34
|
-
# @param parsed_body [Nokogiri::HTML::Document] The parsed HTML document
|
|
35
|
-
# @return [Boolean] True if articles are found, otherwise false.
|
|
36
|
+
# @return [Symbol] config key used to enable or configure this scraper
def self.options_key
  :semantic_html
end
|
|
38
|
+
|
|
39
|
+
# Builds a throwaway scraper (with a placeholder URL) and asks it whether
# anything is extractable. A missing document gives false.
#
# @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
# @return [Boolean] true when at least one semantic container has an eligible anchor
def self.articles?(parsed_body)
  parsed_body ? new(parsed_body, url: 'https://example.com').extractable? : false
end
|
|
46
|
+
|
|
47
|
+
# Stores the document, URL and extractor class, and builds the anchor selector
# for the given URL. Extra keyword options are accepted and ignored.
#
# @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
# @param url [String, Html2rss::Url] base url
# @param extractor [Class] extractor class used for article extraction
# @param _opts [Hash] scraper-specific options
# @option _opts [Object] :_reserved reserved for future scraper-specific options
def initialize(parsed_body, url:, extractor: HtmlExtractor, **_opts)
  @parsed_body, @url, @extractor = parsed_body, url, extractor
  @anchor_selector = AnchorSelector.new(url)
end
|
|
44
58
|
|
|
45
|
-
|
|
46
|
-
# @param current_tag [Nokogiri::XML::Node] The current tag to start searching from
|
|
47
|
-
# @param tag_name [String] The tag name to search for
|
|
48
|
-
# @param stop_tag [String] The tag name to stop searching at
|
|
49
|
-
# @return [Nokogiri::XML::Node] The found ancestor tag or the current tag if matched
|
|
50
|
-
def self.find_tag_in_ancestors(current_tag, tag_name, stop_tag: 'html')
|
|
51
|
-
return current_tag if current_tag.name == tag_name
|
|
59
|
+
attr_reader :parsed_body
|
|
52
60
|
|
|
53
|
-
|
|
61
|
+
##
# Runs the extractor on every extractable entry and yields each non-nil
# article hash. Without a block, an enumerator over the same sequence is
# returned.
#
# The entries come from the memoized `extractable_entries` list.
#
# @yieldparam article_hash [Hash] extracted article hash
# @return [Enumerator<Hash>]
def each
  return enum_for(:each) unless block_given?

  extractable_entries.each do |entry|
    extraction = @extractor.new(
      entry.container,
      base_url: @url,
      selected_anchor: entry.selected_anchor
    )
    article_hash = extraction.call
    next unless article_hash

    yield article_hash
  end
end
|
|
61
83
|
|
|
62
|
-
|
|
63
|
-
#
|
|
64
|
-
#
|
|
65
|
-
#
|
|
66
|
-
|
|
67
|
-
|
|
84
|
+
##
# Tells whether anchor selection left at least one entry to extract.
#
# @return [Boolean] true when at least one candidate container yields a primary anchor
def extractable?
  entries = extractable_entries
  entries.any?
end
|
|
69
92
|
|
|
70
|
-
|
|
71
|
-
# @param current_tag [Nokogiri::XML::Node] The current tag to start searching from
|
|
72
|
-
# @param selector [String] The CSS selector to search for
|
|
73
|
-
# @return [Nokogiri::XML::Node, nil] The closest matching tag or nil if not found
|
|
74
|
-
def self.find_closest_selector_upwards(current_tag, selector:)
|
|
75
|
-
while current_tag
|
|
76
|
-
found = current_tag.at_css(selector)
|
|
77
|
-
return found if found
|
|
78
|
-
|
|
79
|
-
return nil unless current_tag.respond_to?(:parent)
|
|
93
|
+
protected
|
|
80
94
|
|
|
81
|
-
|
|
82
|
-
|
|
95
|
+
# Memoized result of `collect_candidate_containers`.
#
# @return [Array] candidate container nodes
def candidate_containers
  return @candidate_containers if @candidate_containers

  @candidate_containers = collect_candidate_containers
end
|
|
84
98
|
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
def self.anchor_tag_selector_pairs
|
|
88
|
-
ANCHOR_TAG_SELECTORS.flat_map do |tag_name, selectors|
|
|
89
|
-
selectors.map { |selector| [tag_name, selector] }
|
|
90
|
-
end
|
|
99
|
+
# Delegates anchor selection for one container to the anchor selector built
# in the constructor.
#
# @param container [Object] container node handed to the selector
# @return [Object, nil] whatever the selector returns for the container
def primary_anchor_for(container)
  selector = @anchor_selector
  selector.primary_anchor_for(container)
end
|
|
92
102
|
|
|
93
|
-
def
|
|
94
|
-
@
|
|
95
|
-
|
|
96
|
-
|
|
103
|
+
# Pairs each candidate container with its primary anchor, dropping containers
# for which no anchor was selected. The list is memoized.
#
# @return [Array<Entry>] container/anchor pairs in container order
def extractable_entries
  @extractable_entries ||= candidate_containers.each_with_object([]) do |container, entries|
    anchor = primary_anchor_for(container)
    entries << Entry.new(container: container, selected_anchor: anchor) if anchor
  end
end
|
|
99
111
|
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
# @return [Enumerator] Enumerator for the scraped articles
|
|
103
|
-
def each
|
|
104
|
-
return enum_for(:each) unless block_given?
|
|
112
|
+
# Gathers the nodes matched by each container selector, in selector order.
# Nodes whose path matches `Html::TAGS_TO_IGNORE` are skipped, and a node
# object already collected is not added again (compared by identity).
#
# @return [Array] collected container nodes
def collect_candidate_containers
  containers = []
  visited = {}.compare_by_identity

  CONTAINER_SELECTORS.each do |selector|
    parsed_body.css(selector).each do |node|
      next if node.path.match?(Html::TAGS_TO_IGNORE) || visited.key?(node)

      visited[node] = true
      containers << node
    end
  end

  containers
end
|