html2rss 0.20.1 → 0.21.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/html2rss.gemspec +1 -2
- data/lib/html2rss/auto_source/scraper/html.rb +61 -16
- data/lib/html2rss/auto_source/scraper/json_state.rb +40 -27
- data/lib/html2rss/auto_source/scraper/link_heuristics.rb +85 -131
- data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +74 -28
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -2
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +31 -60
- data/lib/html2rss/auto_source/scraper/schema.rb +8 -2
- data/lib/html2rss/auto_source/scraper/semantic_html/deduplicator.rb +4 -18
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +55 -11
- data/lib/html2rss/auto_source/scraper.rb +0 -3
- data/lib/html2rss/auto_source.rb +2 -11
- data/lib/html2rss/category_extractor.rb +54 -20
- data/lib/html2rss/html_extractor/enclosure_extractor.rb +60 -89
- data/lib/html2rss/html_extractor/list_candidates.rb +2 -8
- data/lib/html2rss/html_extractor/semantic_anchor_candidates.rb +29 -12
- data/lib/html2rss/html_extractor/semantic_containers.rb +9 -35
- data/lib/html2rss/html_extractor.rb +51 -30
- data/lib/html2rss/rendering/description_builder.rb +3 -3
- data/lib/html2rss/rss_builder/article.rb +44 -23
- data/lib/html2rss/rss_builder/enclosure.rb +4 -2
- data/lib/html2rss/selectors/post_processors/sanitize_html.rb +25 -36
- data/lib/html2rss/selectors/post_processors/substring.rb +11 -18
- data/lib/html2rss/selectors/post_processors/template.rb +3 -2
- data/lib/html2rss/selectors.rb +18 -4
- data/lib/html2rss/url.rb +4 -3
- data/lib/html2rss/version.rb +1 -1
- metadata +3 -17
|
@@ -13,11 +13,10 @@ module Html2rss
|
|
|
13
13
|
# @param schema_object [Hash] The schema object
|
|
14
14
|
# @return [Array<String>] Array of category strings
|
|
15
15
|
def self.call(schema_object)
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
(field_categories | about_categories).to_a
|
|
16
|
+
Set.new.tap do |categories|
|
|
17
|
+
extract_field_categories!(categories, schema_object)
|
|
18
|
+
extract_about_categories!(categories, schema_object)
|
|
19
|
+
end.to_a
|
|
21
20
|
end
|
|
22
21
|
|
|
23
22
|
##
|
|
@@ -26,10 +25,18 @@ module Html2rss
|
|
|
26
25
|
# @param schema_object [Hash] The schema object
|
|
27
26
|
# @return [Set<String>] Set of category strings
|
|
28
27
|
def self.extract_field_categories(schema_object)
|
|
29
|
-
Set.new.tap
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
28
|
+
Set.new.tap { |categories| extract_field_categories!(categories, schema_object) }
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
##
|
|
32
|
+
# Extracts categories from keywords, categories, and tags fields.
|
|
33
|
+
#
|
|
34
|
+
# @param categories [Set<String>] Accumulator set
|
|
35
|
+
# @param schema_object [Hash] The schema object
|
|
36
|
+
# @return [void]
|
|
37
|
+
def self.extract_field_categories!(categories, schema_object)
|
|
38
|
+
%i[keywords categories tags].each do |field|
|
|
39
|
+
extract_field_value!(categories, schema_object[field])
|
|
33
40
|
end
|
|
34
41
|
end
|
|
35
42
|
|
|
@@ -39,15 +46,23 @@ module Html2rss
|
|
|
39
46
|
# @param schema_object [Hash] The schema object
|
|
40
47
|
# @return [Set<String>] Set of category strings
|
|
41
48
|
def self.extract_about_categories(schema_object)
|
|
49
|
+
Set.new.tap { |categories| extract_about_categories!(categories, schema_object) }
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
##
|
|
53
|
+
# Extracts categories from the about field.
|
|
54
|
+
#
|
|
55
|
+
# @param categories [Set<String>] Accumulator set
|
|
56
|
+
# @param schema_object [Hash] The schema object
|
|
57
|
+
# @return [void]
|
|
58
|
+
def self.extract_about_categories!(categories, schema_object)
|
|
42
59
|
about = schema_object[:about]
|
|
43
|
-
return
|
|
60
|
+
return unless about
|
|
44
61
|
|
|
45
62
|
if about.is_a?(Array)
|
|
46
|
-
extract_about_array(about)
|
|
63
|
+
extract_about_array!(categories, about)
|
|
47
64
|
elsif about.is_a?(String)
|
|
48
|
-
extract_string_categories(about)
|
|
49
|
-
else
|
|
50
|
-
Set.new
|
|
65
|
+
extract_string_categories!(categories, about)
|
|
51
66
|
end
|
|
52
67
|
end
|
|
53
68
|
|
|
@@ -58,15 +73,25 @@ module Html2rss
|
|
|
58
73
|
# @param field [String] The field name
|
|
59
74
|
# @return [Set<String>] Set of category strings
|
|
60
75
|
def self.extract_field_value(schema_object, field)
|
|
61
|
-
|
|
62
|
-
|
|
76
|
+
Set.new.tap { |categories| extract_field_value!(categories, schema_object[field.to_sym]) }
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
##
|
|
80
|
+
# Extracts categories from a single field value.
|
|
81
|
+
#
|
|
82
|
+
# @param categories [Set<String>] Accumulator set
|
|
83
|
+
# @param value [Object] The field value
|
|
84
|
+
# @return [void]
|
|
85
|
+
def self.extract_field_value!(categories, value)
|
|
86
|
+
return unless value
|
|
63
87
|
|
|
64
88
|
if value.is_a?(Array)
|
|
65
|
-
|
|
89
|
+
value.each do |item|
|
|
90
|
+
s = item.to_s
|
|
91
|
+
categories.add(s) unless s.empty?
|
|
92
|
+
end
|
|
66
93
|
elsif value.is_a?(String)
|
|
67
|
-
extract_string_categories(value)
|
|
68
|
-
else
|
|
69
|
-
Set.new
|
|
94
|
+
extract_string_categories!(categories, value)
|
|
70
95
|
end
|
|
71
96
|
end
|
|
72
97
|
|
|
@@ -76,13 +101,21 @@ module Html2rss
|
|
|
76
101
|
# @param about [Array] The about array
|
|
77
102
|
# @return [Set<String>] Set of category strings
|
|
78
103
|
def self.extract_about_array(about)
|
|
79
|
-
Set.new.tap
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
104
|
+
Set.new.tap { |categories| extract_about_array!(categories, about) }
|
|
105
|
+
end
|
|
106
|
+
|
|
107
|
+
##
|
|
108
|
+
# Extracts categories from an about array.
|
|
109
|
+
#
|
|
110
|
+
# @param categories [Set<String>] Accumulator set
|
|
111
|
+
# @param about [Array] The about array
|
|
112
|
+
# @return [void]
|
|
113
|
+
def self.extract_about_array!(categories, about)
|
|
114
|
+
about.each do |item|
|
|
115
|
+
if item.is_a?(Hash) && item[:name]
|
|
116
|
+
categories.add(item[:name].to_s)
|
|
117
|
+
elsif item.is_a?(String)
|
|
118
|
+
categories.add(item)
|
|
86
119
|
end
|
|
87
120
|
end
|
|
88
121
|
end
|
|
@@ -93,7 +126,20 @@ module Html2rss
|
|
|
93
126
|
# @param string [String] source string that may contain category delimiters
|
|
94
127
|
# @return [Set<String>] Set of category strings
|
|
95
128
|
def self.extract_string_categories(string)
|
|
96
|
-
Set.new(string
|
|
129
|
+
Set.new.tap { |categories| extract_string_categories!(categories, string) }
|
|
130
|
+
end
|
|
131
|
+
|
|
132
|
+
##
|
|
133
|
+
# Extracts categories from a string by splitting on separators.
|
|
134
|
+
#
|
|
135
|
+
# @param categories [Set<String>] Accumulator set
|
|
136
|
+
# @param string [String] source string that may contain category delimiters
|
|
137
|
+
# @return [void]
|
|
138
|
+
def self.extract_string_categories!(categories, string)
|
|
139
|
+
string.split(/[,;|]/).each do |part|
|
|
140
|
+
s = part.strip
|
|
141
|
+
categories.add(s) unless s.empty?
|
|
142
|
+
end
|
|
97
143
|
end
|
|
98
144
|
end
|
|
99
145
|
end
|
|
@@ -16,9 +16,10 @@ module Html2rss
|
|
|
16
16
|
|
|
17
17
|
# @return [Html2rss::Url, nil]
|
|
18
18
|
def url
|
|
19
|
-
url
|
|
19
|
+
return @url if defined?(@url)
|
|
20
20
|
|
|
21
|
-
|
|
21
|
+
item_url = schema_object.dig(:item, :url)
|
|
22
|
+
@url = item_url ? Url.from_relative(item_url, base_url || item_url) : super
|
|
22
23
|
end
|
|
23
24
|
end
|
|
24
25
|
end
|
|
@@ -13,24 +13,10 @@ module Html2rss
|
|
|
13
13
|
class Thing
|
|
14
14
|
# Supported Schema.org `@type` values mapped to article extraction.
|
|
15
15
|
SUPPORTED_TYPES = %w[
|
|
16
|
-
AdvertiserContentArticle
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
AskPublicNewsArticle
|
|
21
|
-
BackgroundNewsArticle
|
|
22
|
-
BlogPosting
|
|
23
|
-
DiscussionForumPosting
|
|
24
|
-
LiveBlogPosting
|
|
25
|
-
NewsArticle
|
|
26
|
-
OpinionNewsArticle
|
|
27
|
-
Report
|
|
28
|
-
ReportageNewsArticle
|
|
29
|
-
ReviewNewsArticle
|
|
30
|
-
SatiricalArticle
|
|
31
|
-
ScholarlyArticle
|
|
32
|
-
SocialMediaPosting
|
|
33
|
-
TechArticle
|
|
16
|
+
AdvertiserContentArticle AnalysisNewsArticle APIReference Article
|
|
17
|
+
AskPublicNewsArticle BackgroundNewsArticle BlogPosting DiscussionForumPosting
|
|
18
|
+
LiveBlogPosting NewsArticle OpinionNewsArticle Report ReportageNewsArticle
|
|
19
|
+
ReviewNewsArticle SatiricalArticle ScholarlyArticle SocialMediaPosting TechArticle
|
|
34
20
|
].to_set.freeze
|
|
35
21
|
|
|
36
22
|
# Attributes exposed by `#call` in generated article hashes.
|
|
@@ -44,21 +30,14 @@ module Html2rss
|
|
|
44
30
|
end
|
|
45
31
|
|
|
46
32
|
# @return [Hash] the scraped article hash with DEFAULT_ATTRIBUTES
|
|
47
|
-
def call
|
|
48
|
-
DEFAULT_ATTRIBUTES.to_h do |attribute|
|
|
49
|
-
[attribute, public_send(attribute)]
|
|
50
|
-
end
|
|
51
|
-
end
|
|
33
|
+
def call = DEFAULT_ATTRIBUTES.to_h { [_1, public_send(_1)] }
|
|
52
34
|
|
|
53
35
|
# @return [String, nil] stable schema object identifier
|
|
54
36
|
def id
|
|
55
37
|
return @id if defined?(@id)
|
|
56
38
|
|
|
57
39
|
id = normalized_id(schema_object[:@id], reference_url: url || base_url) || url&.path.to_s
|
|
58
|
-
|
|
59
|
-
return if id.empty?
|
|
60
|
-
|
|
61
|
-
@id = id
|
|
40
|
+
@id = id.to_s.empty? ? nil : id
|
|
62
41
|
end
|
|
63
42
|
|
|
64
43
|
# @return [String, nil] article title
|
|
@@ -66,26 +45,28 @@ module Html2rss
|
|
|
66
45
|
|
|
67
46
|
# @return [String, nil] longest available description field
|
|
68
47
|
def description
|
|
69
|
-
schema_object.values_at(:description, :schema_object_body, :abstract)
|
|
70
|
-
.max_by { |string| string.to_s.size }
|
|
48
|
+
schema_object.values_at(:description, :schema_object_body, :abstract).max_by { _1.to_s.size }
|
|
71
49
|
end
|
|
72
50
|
|
|
73
51
|
# @return [Html2rss::Url, nil] the URL of the schema object
|
|
74
52
|
def url
|
|
53
|
+
return @url if defined?(@url)
|
|
54
|
+
|
|
75
55
|
url = schema_object[:url]
|
|
76
56
|
if url.to_s.empty?
|
|
77
57
|
Log.debug("Schema#Thing.url: no url in schema_object: #{schema_object.inspect}")
|
|
78
|
-
return
|
|
58
|
+
return @url = nil
|
|
79
59
|
end
|
|
80
60
|
|
|
81
|
-
Url.from_relative(url, base_url || url)
|
|
61
|
+
@url = Url.from_relative(url, base_url || url)
|
|
82
62
|
end
|
|
83
63
|
|
|
84
64
|
# @return [Html2rss::Url, nil] normalized article image URL
|
|
85
65
|
def image
|
|
86
|
-
if (
|
|
87
|
-
|
|
88
|
-
|
|
66
|
+
return @image if defined?(@image)
|
|
67
|
+
|
|
68
|
+
img_url = image_urls.first
|
|
69
|
+
@image = img_url ? Url.from_relative(img_url, base_url || img_url) : nil
|
|
89
70
|
end
|
|
90
71
|
|
|
91
72
|
# @return [String, nil] published-at timestamp string
|
|
@@ -93,24 +74,23 @@ module Html2rss
|
|
|
93
74
|
|
|
94
75
|
# @return [Array<String>, nil] extracted category labels
|
|
95
76
|
def categories
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
@categories = CategoryExtractor.call(schema_object)
|
|
77
|
+
@categories ||= CategoryExtractor.call(schema_object)
|
|
99
78
|
end
|
|
100
79
|
|
|
101
80
|
attr_reader :schema_object, :base_url
|
|
102
81
|
|
|
103
82
|
# @return [Array<String>] normalized image URL candidates
|
|
104
83
|
def image_urls
|
|
105
|
-
schema_object.values_at(:image, :thumbnailUrl).filter_map
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
84
|
+
@image_urls ||= schema_object.values_at(:image, :thumbnailUrl).filter_map { image_url_from(_1) }
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
private
|
|
88
|
+
|
|
89
|
+
def image_url_from(obj)
|
|
90
|
+
return obj if obj.is_a?(String)
|
|
91
|
+
return unless obj.is_a?(Hash) && obj[:@type] == 'ImageObject'
|
|
92
|
+
|
|
93
|
+
obj[:url] || obj[:contentUrl]
|
|
114
94
|
end
|
|
115
95
|
|
|
116
96
|
# @param value [String, Symbol, nil] candidate schema identifier
|
|
@@ -120,10 +100,8 @@ module Html2rss
|
|
|
120
100
|
text = value.to_s
|
|
121
101
|
return if text.empty?
|
|
122
102
|
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
normalized_id_value(normalized_url)
|
|
103
|
+
norm_url = normalized_id_url(text, reference_url:)
|
|
104
|
+
reference_url && norm_url.host == reference_url.host ? normalized_id_value(norm_url) : text
|
|
127
105
|
rescue ArgumentError
|
|
128
106
|
text
|
|
129
107
|
end
|
|
@@ -132,11 +110,7 @@ module Html2rss
|
|
|
132
110
|
# @param reference_url [Html2rss::Url, nil] URL used to resolve relative IDs
|
|
133
111
|
# @return [Html2rss::Url] normalized identifier URL
|
|
134
112
|
def normalized_id_url(text, reference_url:)
|
|
135
|
-
|
|
136
|
-
Url.from_relative(text, reference_url || text)
|
|
137
|
-
else
|
|
138
|
-
Url.from_absolute(text)
|
|
139
|
-
end
|
|
113
|
+
text.start_with?('/') ? Url.from_relative(text, reference_url || text) : Url.from_absolute(text)
|
|
140
114
|
end
|
|
141
115
|
|
|
142
116
|
# @param url [Html2rss::Url] normalized identifier URL
|
|
@@ -144,17 +118,14 @@ module Html2rss
|
|
|
144
118
|
def normalized_id_value(url)
|
|
145
119
|
path = url.path.to_s
|
|
146
120
|
return "#{path}?#{url.query}" if (path.empty? || path == '/') && !url.query.to_s.empty?
|
|
147
|
-
return path unless path.empty?
|
|
148
121
|
|
|
149
|
-
url.query
|
|
122
|
+
path.empty? ? url.query : path
|
|
150
123
|
end
|
|
151
124
|
|
|
152
125
|
# @param url [String, Html2rss::Url, nil] candidate page URL
|
|
153
126
|
# @return [Html2rss::Url, nil] normalized absolute URL for schema resolution
|
|
154
127
|
def normalized_base_url(url)
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
Url.from_absolute(url)
|
|
128
|
+
Url.from_absolute(url) unless url.to_s.strip.empty?
|
|
158
129
|
rescue ArgumentError
|
|
159
130
|
nil
|
|
160
131
|
end
|
|
@@ -18,6 +18,13 @@ module Html2rss
|
|
|
18
18
|
# Selector for JSON-LD script tags containing Schema.org objects.
|
|
19
19
|
TAG_SELECTOR = 'script[type="application/ld+json"]'
|
|
20
20
|
|
|
21
|
+
# Pre-compiled regex union for supported schema types.
|
|
22
|
+
# Performs a single pass over script tag text instead of multiple regex matches.
|
|
23
|
+
SUPPORTED_TYPES_RE = begin
|
|
24
|
+
types = Thing::SUPPORTED_TYPES | ItemList::SUPPORTED_TYPES
|
|
25
|
+
/"@type"\s*:\s*"(?:#{Regexp.union(types.to_a).source})"/
|
|
26
|
+
end.freeze
|
|
27
|
+
|
|
21
28
|
# @return [Symbol] scraper config key
|
|
22
29
|
def self.options_key = :schema
|
|
23
30
|
|
|
@@ -31,8 +38,7 @@ module Html2rss
|
|
|
31
38
|
# @param script [Nokogiri::XML::Element] schema JSON-LD script tag
|
|
32
39
|
# @return [Boolean] whether the tag references a supported schema type
|
|
33
40
|
def supported_schema_type?(script)
|
|
34
|
-
|
|
35
|
-
supported_types.any? { |type| script.text.match?(/"@type"\s*:\s*"#{Regexp.escape(type)}"/) }
|
|
41
|
+
script.text.match?(SUPPORTED_TYPES_RE)
|
|
36
42
|
end
|
|
37
43
|
|
|
38
44
|
##
|
|
@@ -22,8 +22,7 @@ module Html2rss
|
|
|
22
22
|
# @return [Array<Entry>] deduplicated list of scraper entries
|
|
23
23
|
def call(entries)
|
|
24
24
|
destination_groups(entries).filter_map do |group|
|
|
25
|
-
|
|
26
|
-
collapsed_group.reduce do |best, entry|
|
|
25
|
+
group.reduce do |best, entry|
|
|
27
26
|
stronger_entry?(entry, best) ? entry : best
|
|
28
27
|
end
|
|
29
28
|
end
|
|
@@ -67,21 +66,6 @@ module Html2rss
|
|
|
67
66
|
|
|
68
67
|
def destination_groups(entries) = entries.group_by { entry_destination(_1) }.values
|
|
69
68
|
|
|
70
|
-
def collapse_nested_destination_group(entries)
|
|
71
|
-
return entries if entries.size <= 1
|
|
72
|
-
|
|
73
|
-
entries.reject do |entry|
|
|
74
|
-
entries.any? do |other|
|
|
75
|
-
next if entry.equal?(other)
|
|
76
|
-
next unless nested_container_pair?(entry.container, other.container)
|
|
77
|
-
|
|
78
|
-
stronger_entry?(other, entry)
|
|
79
|
-
end
|
|
80
|
-
end
|
|
81
|
-
end
|
|
82
|
-
|
|
83
|
-
def nested_container_pair?(left, right) = left.ancestors.include?(right) || right.ancestors.include?(left)
|
|
84
|
-
|
|
85
69
|
def entry_destination(entry) = entry.destination_facts&.destination || article_for(entry)&.[](:url)&.to_s
|
|
86
70
|
|
|
87
71
|
def payload_richness_signature(article)
|
|
@@ -94,7 +78,9 @@ module Html2rss
|
|
|
94
78
|
]
|
|
95
79
|
end
|
|
96
80
|
|
|
97
|
-
def word_count(text)
|
|
81
|
+
def word_count(text)
|
|
82
|
+
(@word_counts ||= {})[text] ||= text.to_s.scan(/\p{Alnum}+/).size
|
|
83
|
+
end
|
|
98
84
|
end
|
|
99
85
|
end
|
|
100
86
|
end
|
|
@@ -21,6 +21,18 @@ module Html2rss
|
|
|
21
21
|
class SemanticHtml # rubocop:disable Metrics/ClassLength
|
|
22
22
|
include Enumerable
|
|
23
23
|
|
|
24
|
+
# Regexp to match content-related tokens.
|
|
25
|
+
CONTENT_REGEXP = begin
|
|
26
|
+
words = LinkHeuristics::PathClassifier::SEGMENT_SETS.fetch(:content)
|
|
27
|
+
/(?:^|\s|[-_])(#{Regexp.union(words.to_a).source})(?:\s|[-_]|$)/i
|
|
28
|
+
end.freeze
|
|
29
|
+
|
|
30
|
+
# Regexp to match junk/utility-related tokens.
|
|
31
|
+
JUNK_REGEXP = begin
|
|
32
|
+
words = LinkHeuristics::PathClassifier::SEGMENT_SETS.fetch(:utility)
|
|
33
|
+
/(?:^|\s|[-_])(#{Regexp.union(words.to_a).source})(?:\s|[-_]|$)/i
|
|
34
|
+
end.freeze
|
|
35
|
+
|
|
24
36
|
# Container plus selected anchor, scoring metadata, and extracted article.
|
|
25
37
|
Entry = Data.define(
|
|
26
38
|
:container,
|
|
@@ -218,47 +230,79 @@ module Html2rss
|
|
|
218
230
|
weak_article_candidate)
|
|
219
231
|
end
|
|
220
232
|
|
|
233
|
+
##
|
|
234
|
+
# @param container [Nokogiri::XML::Node]
|
|
235
|
+
# @return [Boolean]
|
|
221
236
|
def publish_marker?(container)
|
|
222
|
-
|
|
237
|
+
(@publish_markers ||= {}.compare_by_identity)[container] ||=
|
|
238
|
+
!!container.at_css('time, [datetime], [itemprop="datePublished"], [itemprop="dateModified"]')
|
|
223
239
|
end
|
|
224
240
|
|
|
241
|
+
##
|
|
242
|
+
# @param container [Nokogiri::XML::Node]
|
|
243
|
+
# @param publish_signal [Boolean]
|
|
244
|
+
# @param descriptive_signal [Boolean]
|
|
245
|
+
# @param content_signal [Boolean]
|
|
246
|
+
# @return [Integer]
|
|
225
247
|
def article_signal_count(container, publish_signal:, descriptive_signal:, content_signal:)
|
|
226
248
|
[article_container?(container), publish_signal, descriptive_signal, content_signal].count(&:itself)
|
|
227
249
|
end
|
|
228
250
|
|
|
251
|
+
##
|
|
252
|
+
# @param container [Nokogiri::XML::Node]
|
|
253
|
+
# @return [Boolean]
|
|
229
254
|
def article_container?(container) = container.name == 'article'
|
|
230
255
|
|
|
231
256
|
def descriptive_context?(container_text, title)
|
|
232
257
|
snippet = container_text.to_s.sub(/\A#{Regexp.escape(title.to_s)}/i, '')
|
|
233
|
-
|
|
258
|
+
# Only check for existence of enough words if snippet is long enough to have them
|
|
259
|
+
snippet.length > 30 && word_count(snippet) >= 8
|
|
234
260
|
end
|
|
235
261
|
|
|
236
|
-
|
|
262
|
+
##
|
|
263
|
+
# @param container [Nokogiri::XML::Node]
|
|
264
|
+
# @return [Nokogiri::XML::Node, nil]
|
|
265
|
+
def heading_for(container)
|
|
266
|
+
(@headings ||= {}.compare_by_identity)[container] ||= container.at_css(AnchorSelector::HEADING_SELECTOR)
|
|
267
|
+
end
|
|
237
268
|
|
|
238
|
-
def normalized_destination(anchor)
|
|
269
|
+
def normalized_destination(anchor)
|
|
270
|
+
(@normalized_destinations ||= {}.compare_by_identity)[anchor] ||= @link_heuristics.destination_facts(anchor)
|
|
271
|
+
end
|
|
239
272
|
|
|
240
273
|
def visible_text(node)
|
|
241
274
|
return '' unless node
|
|
242
275
|
|
|
243
|
-
HtmlExtractor.extract_visible_text(node).to_s.strip
|
|
276
|
+
(@visible_texts ||= {}.compare_by_identity)[node] ||= HtmlExtractor.extract_visible_text(node).to_s.strip
|
|
244
277
|
end
|
|
245
278
|
|
|
279
|
+
##
|
|
280
|
+
# @param container [Nokogiri::XML::Node]
|
|
281
|
+
# @param selected_anchor [Nokogiri::XML::Node]
|
|
282
|
+
# @return [String]
|
|
246
283
|
def entry_title(container, selected_anchor) = visible_text(heading_for(container) || selected_anchor)
|
|
247
284
|
|
|
248
|
-
|
|
285
|
+
##
|
|
286
|
+
# @param text [String, #to_s]
|
|
287
|
+
# @return [Integer]
|
|
288
|
+
def word_count(text)
|
|
289
|
+
(@word_counts ||= {})[text] ||= begin
|
|
290
|
+
count = 0
|
|
291
|
+
text.to_s.scan(/\p{Alnum}+/) { count += 1 }
|
|
292
|
+
count
|
|
293
|
+
end
|
|
294
|
+
end
|
|
249
295
|
|
|
250
296
|
def container_tokens(container)
|
|
251
|
-
|
|
252
|
-
id = container['id'].to_s
|
|
253
|
-
(classes << id).flat_map { |str| str.downcase.split(/[-_]+/) }.reject(&:empty?)
|
|
297
|
+
(@container_tokens ||= {}.compare_by_identity)[container] ||= "#{container['class']} #{container['id']}"
|
|
254
298
|
end
|
|
255
299
|
|
|
256
300
|
def content_tokens?(tokens)
|
|
257
|
-
|
|
301
|
+
tokens.match?(CONTENT_REGEXP)
|
|
258
302
|
end
|
|
259
303
|
|
|
260
304
|
def junk_tokens?(tokens)
|
|
261
|
-
|
|
305
|
+
tokens.match?(JUNK_REGEXP)
|
|
262
306
|
end
|
|
263
307
|
|
|
264
308
|
def stable_rank(entries)
|
|
@@ -11,9 +11,6 @@ module Html2rss
|
|
|
11
11
|
# Detection is intentionally shallow for most scrapers, but instance-based
|
|
12
12
|
# matching is available for scrapers that need to carry expensive selection
|
|
13
13
|
# state forward into extraction.
|
|
14
|
-
# Scrapers run in parallel threads, so implementations must avoid shared
|
|
15
|
-
# mutable state and degrade by returning no articles when a follow-up would
|
|
16
|
-
# be unsafe or unsupported.
|
|
17
14
|
module Scraper
|
|
18
15
|
# Root markers indicating likely app-shell/client-rendered surfaces.
|
|
19
16
|
APP_SHELL_ROOT_SELECTORS = '#app, #root, #__next, [data-reactroot], [ng-app], [id*="app-shell"]'
|
data/lib/html2rss/auto_source.rb
CHANGED
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
require 'parallel'
|
|
4
3
|
require 'dry-validation'
|
|
5
4
|
|
|
6
5
|
module Html2rss
|
|
@@ -121,11 +120,8 @@ module Html2rss
|
|
|
121
120
|
scraper_instances = Scraper.instances_for(parsed_body, url:, request_session:, opts: @opts[:scraper])
|
|
122
121
|
return [] if scraper_instances.empty?
|
|
123
122
|
|
|
124
|
-
# Scrapers are
|
|
125
|
-
|
|
126
|
-
# concurrency-safe from the scraper side, and return no articles when a
|
|
127
|
-
# follow-up would be unsafe or unsupported.
|
|
128
|
-
articles = Parallel.flat_map(scraper_instances, in_threads: thread_count_for(scraper_instances)) do |instance|
|
|
123
|
+
# Scrapers are run sequentially.
|
|
124
|
+
articles = scraper_instances.flat_map do |instance|
|
|
129
125
|
run_scraper(instance)
|
|
130
126
|
end
|
|
131
127
|
Cleanup.call(articles, url:, **cleanup_options)
|
|
@@ -140,10 +136,5 @@ module Html2rss
|
|
|
140
136
|
def cleanup_options
|
|
141
137
|
@opts.fetch(:cleanup, {})
|
|
142
138
|
end
|
|
143
|
-
|
|
144
|
-
def thread_count_for(scrapers)
|
|
145
|
-
count = [scrapers.size, Parallel.processor_count].min
|
|
146
|
-
count.zero? ? 1 : count
|
|
147
|
-
end
|
|
148
139
|
end
|
|
149
140
|
end
|
|
@@ -8,8 +8,10 @@ module Html2rss
|
|
|
8
8
|
# Common category-related terms to look for in class names
|
|
9
9
|
CATEGORY_TERMS = %w[category tag topic section label theme subject].freeze
|
|
10
10
|
|
|
11
|
-
# CSS selectors to find elements with category-related class names
|
|
12
|
-
CATEGORY_SELECTORS = CATEGORY_TERMS.
|
|
11
|
+
# CSS selectors to find elements with category-related class names or data attributes
|
|
12
|
+
CATEGORY_SELECTORS = CATEGORY_TERMS.flat_map do |term|
|
|
13
|
+
["[class*=\"#{term}\"]", "[data-#{term}]", "[#{term}]"]
|
|
14
|
+
end.freeze
|
|
13
15
|
|
|
14
16
|
# Regex pattern for matching category-related attribute names
|
|
15
17
|
CATEGORY_ATTR_PATTERN = /#{CATEGORY_TERMS.join('|')}/i
|
|
@@ -36,12 +38,12 @@ module Html2rss
|
|
|
36
38
|
# @return [Set<String>] Set of category strings
|
|
37
39
|
def self.extract_all_categories(article_tag)
|
|
38
40
|
Set.new.tap do |categories|
|
|
39
|
-
article_tag.css('
|
|
41
|
+
article_tag.css(CATEGORY_SELECTORS.join(',')).each do |element|
|
|
40
42
|
# Extract text categories from elements with category-related class names
|
|
41
|
-
|
|
43
|
+
extract_text_categories!(categories, element) if element['class']&.match?(CATEGORY_ATTR_PATTERN)
|
|
42
44
|
|
|
43
45
|
# Extract data categories from all elements
|
|
44
|
-
|
|
46
|
+
extract_element_data_categories!(categories, element)
|
|
45
47
|
end
|
|
46
48
|
end
|
|
47
49
|
end
|
|
@@ -49,34 +51,66 @@ module Html2rss
|
|
|
49
51
|
##
|
|
50
52
|
# Extracts categories from data attributes of a single element.
|
|
51
53
|
#
|
|
54
|
+
# @param categories [Set<String>] Accumulator set
|
|
52
55
|
# @param element [Nokogiri::XML::Element] metadata element that may contain category links
|
|
53
|
-
# @return [
|
|
54
|
-
def self.extract_element_data_categories(element)
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
next unless attr.name.match?(CATEGORY_ATTR_PATTERN)
|
|
56
|
+
# @return [void]
|
|
57
|
+
def self.extract_element_data_categories!(categories, element)
|
|
58
|
+
element.attributes.each_value do |attr|
|
|
59
|
+
next unless attr.name.match?(CATEGORY_ATTR_PATTERN)
|
|
58
60
|
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
end
|
|
61
|
+
value = attr.value&.strip
|
|
62
|
+
categories.add(value) if value && !value.empty?
|
|
62
63
|
end
|
|
63
64
|
end
|
|
64
65
|
|
|
65
66
|
##
|
|
66
67
|
# Extracts text-based categories from elements, splitting content into discrete values.
|
|
67
68
|
#
|
|
69
|
+
# @param categories [Set<String>] Accumulator set
|
|
68
70
|
# @param element [Nokogiri::XML::Element] metadata element whose text may contain delimiters
|
|
69
|
-
# @return [
|
|
70
|
-
def self.extract_text_categories(element)
|
|
71
|
-
|
|
72
|
-
|
|
71
|
+
# @return [void]
|
|
72
|
+
def self.extract_text_categories!(categories, element)
|
|
73
|
+
if element.name == 'a'
|
|
74
|
+
add_text_to_categories!(categories, element)
|
|
75
|
+
return
|
|
73
76
|
end
|
|
74
|
-
return Set.new(anchor_values.reject(&:empty?)) if anchor_values.any?
|
|
75
77
|
|
|
78
|
+
anchors = element.css('a')
|
|
79
|
+
|
|
80
|
+
if anchors.any?
|
|
81
|
+
anchors.each { |node| add_text_to_categories!(categories, node) }
|
|
82
|
+
else
|
|
83
|
+
extract_split_text_categories!(categories, element)
|
|
84
|
+
end
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
##
|
|
88
|
+
# Adds the visible text of the given element to the categories set.
|
|
89
|
+
#
|
|
90
|
+
# @param categories [Set<String>] Accumulator set
|
|
91
|
+
# @param element [Nokogiri::XML::Element] The element to extract text from
|
|
92
|
+
# @return [void]
|
|
93
|
+
def self.add_text_to_categories!(categories, element)
|
|
94
|
+
text = HtmlExtractor.extract_visible_text(element)
|
|
95
|
+
categories.add(text) if text && !text.empty?
|
|
96
|
+
end
|
|
97
|
+
|
|
98
|
+
##
|
|
99
|
+
# Extracts categories from the element's text by splitting on newlines.
|
|
100
|
+
#
|
|
101
|
+
# @param categories [Set<String>] Accumulator set
|
|
102
|
+
# @param element [Nokogiri::XML::Element] The element to extract text from
|
|
103
|
+
# @return [void]
|
|
104
|
+
def self.extract_split_text_categories!(categories, element)
|
|
76
105
|
text = HtmlExtractor.extract_visible_text(element)
|
|
77
|
-
return
|
|
106
|
+
return unless text
|
|
78
107
|
|
|
79
|
-
|
|
108
|
+
text.split(/\n+/).each do |line|
|
|
109
|
+
line = line.strip
|
|
110
|
+
categories.add(line) unless line.empty?
|
|
111
|
+
end
|
|
80
112
|
end
|
|
113
|
+
|
|
114
|
+
private_class_method :add_text_to_categories!, :extract_split_text_categories!
|
|
81
115
|
end
|
|
82
116
|
end
|