coelacanth 0.3.9 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -5
- data/Gemfile +3 -3
- data/README.md +128 -55
- data/lib/coelacanth/client/ferrum.rb +6 -2
- data/lib/coelacanth/dom.rb +3 -2
- data/lib/coelacanth/extractor/fallback_probe.rb +34 -0
- data/lib/coelacanth/extractor/heuristic_probe.rb +175 -0
- data/lib/coelacanth/extractor/image_collector.rb +19 -0
- data/lib/coelacanth/extractor/listing_collector.rb +270 -0
- data/lib/coelacanth/extractor/markdown_renderer.rb +128 -0
- data/lib/coelacanth/extractor/metadata_probe.rb +121 -0
- data/lib/coelacanth/extractor/normalizer.rb +47 -0
- data/lib/coelacanth/extractor/utilities.rb +145 -0
- data/lib/coelacanth/extractor/weak_ml_probe.rb +136 -0
- data/lib/coelacanth/extractor.rb +67 -0
- data/lib/coelacanth/version.rb +1 -1
- data/lib/coelacanth.rb +8 -1
- metadata +11 -2
- data/Gemfile.lock +0 -103

data/lib/coelacanth/extractor/listing_collector.rb
@@ -0,0 +1,270 @@
# frozen_string_literal: true

require_relative "utilities"

module Coelacanth
  class Extractor
    # Identifies sidebar or inline news listings and returns link arrays.
    class ListingCollector
      CANDIDATE_SELECTOR = "aside, section, div, ul, ol, dl".freeze
      MIN_ITEMS = 3
      MIN_TITLE_LENGTH = 2

      def call(document:, base_url: nil, primary_node: nil)
        candidates = collect_candidates(document, base_url, primary_node)

        candidates
          .sort_by { |candidate| -candidate[:score] }
          .reject { |candidate| candidate[:score] < minimum_score }
          .first(3)
          .map { |candidate| format_candidate(candidate) }
      end

      private

      def collect_candidates(document, base_url, primary_node)
        document.css(CANDIDATE_SELECTOR).filter_map do |node|
          next if skip_node?(node, primary_node)

          items = extract_items(node, base_url)
          next if items.length < MIN_ITEMS

          heading = heading_for(node)
          score = score_node(node, items, heading)
          next if score < minimum_score

          { node: node, items: items, heading: heading, score: score }
        end
      end

      def skip_node?(node, primary_node)
        return true if nested_listing_container?(node)

        return false unless primary_node
        return false unless primary_node.respond_to?(:name)
        return false if %w[body html].include?(primary_node.name)

        node == primary_node ||
          ancestor?(node, primary_node) ||
          ancestor?(primary_node, node)
      end

      def nested_listing_container?(node)
        Utilities.ancestors(node).any? do |ancestor|
          Utilities.element?(ancestor) && LISTING_CONTAINER_TAGS.include?(ancestor.name)
        end
      end

      LISTING_CONTAINER_TAGS = %w[aside section div ul ol dl].freeze

      def ancestor?(node, candidate)
        Utilities.ancestors(node).any? { |ancestor| ancestor == candidate }
      end

      def extract_items(node, base_url)
        item_nodes = candidate_children(node)
        return [] if item_nodes.empty?

        item_nodes.filter_map do |child|
          next unless contains_link?(child)

          anchor = primary_anchor(child)
          next unless anchor

          title = normalize_text(anchor.text)
          next if title.length < MIN_TITLE_LENGTH

          href = anchor["href"].to_s.strip
          next if href.empty?

          url = base_url ? Utilities.absolute_url(base_url, href) : href
          url ||= href

          snippet = build_snippet(child, title)

          item = { title: title, url: url }
          item[:snippet] = snippet unless snippet.nil? || snippet.empty?
          item
        end.uniq { |item| [item[:title], item[:url]] }
      end

      def candidate_children(node)
        direct_children = Utilities.element_children(node)
        return [] if direct_children.empty?

        anchor_children = direct_children.select { |child| contains_link?(child) }
        return anchor_children if anchor_children.length >= MIN_ITEMS

        groups = %w[li article div section p dd]

        groups.each do |tag|
          grouped = direct_children.select { |child| child.name == tag }
          return grouped if grouped.length >= MIN_ITEMS
        end

        list_container = direct_children.find { |child| %w[ul ol dl].include?(child.name) }
        return Utilities.element_children(list_container) if list_container

        []
      end

      def contains_link?(node)
        node.css("a[href]").any?
      end

      def primary_anchor(node)
        anchors = node.css("a[href]")
        anchors.max_by { |anchor| normalize_text(anchor.text).length }
      end

      def normalize_text(text)
        text.to_s.gsub(/[\r\n\t]/, " ").squeeze(" ").strip
      end

      def build_snippet(node, title)
        snippet_from_node_text(node, title) || metadata_context(node, title)
      end

      def snippet_from_node_text(node, title)
        text = normalize_text(node.text)
        snippet = text.sub(title, "").strip
        snippet.empty? ? nil : truncate(snippet)
      end

      def metadata_context(node, title)
        candidate = time_text(node) || preceding_metadata(node)
        return nil if candidate.nil?

        candidate = candidate.sub(title, "").strip
        candidate.empty? ? nil : truncate(candidate)
      end

      def time_text(node)
        node.css("time").filter_map { |time| normalize_text(time.text) }.find { |text| !text.empty? }
      end

      def preceding_metadata(node)
        previous = Utilities.previous_element(node)
        3.times do
          break unless previous

          text = normalize_text(previous.text)
          return text unless text.empty?

          previous = Utilities.previous_element(previous)
        end

        nil
      end

      def truncate(text)
        return text if text.length <= 120

        text[0...117] + "..."
      end

      def heading_for(node)
        if (heading = node.at_css("h1, h2, h3, h4"))
          return normalize_text(heading.text)
        end

        previous = Utilities.previous_element(node)
        3.times do
          break unless previous

          return normalize_text(previous.text) if previous.name =~ /h[1-6]/
          previous = Utilities.previous_element(previous)
        end

        nil
      end

      def score_node(node, items, heading)
        structure_score = structural_score(node)
        heading_score = heading ? 45 : 0
        item_score = items.length * 40
        density_score = Utilities.link_density(node) * 90
        adjacency_score = sibling_sequence_bonus(node)
        depth_penalty = Utilities.depth(node) * 5
        length_penalty = long_text_penalty(node)

        structure_score + heading_score + item_score + density_score + adjacency_score - depth_penalty - length_penalty
      end

      def structural_score(node)
        children = candidate_children(node)
        return 0 if children.empty?

        dominant_tag, dominant_children = children.group_by(&:name).max_by { |_, nodes| nodes.length }
        dominant_count = dominant_children.length

        uniform_bonus = dominant_count == children.length ? 60 : 20
        list_bonus = %w[ul ol dl].include?(node.name) ? 90 : 0
        list_bonus += 45 if dominant_tag && %w[li dd].include?(dominant_tag)

        distribution_bonus = distribution_consistency_bonus(children)

        dominant_count * 12 + uniform_bonus + list_bonus + distribution_bonus
      end

      def distribution_consistency_bonus(children)
        return 0 if children.length < MIN_ITEMS

        lengths = children.map { |child| Utilities.text_length(child) }
        average = lengths.sum.to_f / lengths.length
        variance = lengths.map { |len| (len - average).abs }

        variance.max <= 120 ? 40 : 10
      end

      def sibling_sequence_bonus(node)
        siblings = Utilities.sibling_elements(node)
        return 0 if siblings.empty?

        index = siblings.index(node)
        return 0 unless index

        forward = 0
        while (candidate = siblings[index + forward + 1]) && similar_structure?(node, candidate)
          forward += 1
        end

        backward = 0
        while index - backward - 1 >= 0 && (candidate = siblings[index - backward - 1]) && similar_structure?(node, candidate)
          backward += 1
        end

        (forward + backward) * 15
      end

      def similar_structure?(node, other)
        return false unless other

        node_children = candidate_children(node)
        other_children = candidate_children(other)
        return false if node_children.empty? || other_children.empty?

        node_children.first.name == other_children.first.name && node_children.length == other_children.length
      end

      def long_text_penalty(node)
        children = candidate_children(node)
        return 0 if children.empty?

        overlong = children.count { |child| Utilities.text_length(child) > 280 }
        overlong * 30
      end

      def minimum_score
        180
      end

      def format_candidate(candidate)
        {
          heading: candidate[:heading],
          items: candidate[:items]
        }
      end
    end
  end
end
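
For orientation, here is a minimal usage sketch of ListingCollector#call. It is not part of the release: the sample HTML and the expected result are invented, and it assumes the gem's entry point (lib/coelacanth.rb, also touched in this release) loads the extractor classes so that `require "coelacanth"` is enough. The aside below should clear the 180-point cutoff enforced by minimum_score.

    require "oga"
    require "coelacanth"

    html = <<~HTML
      <aside>
        <h2>Latest news</h2>
        <ul>
          <li><a href="/a">First headline</a></li>
          <li><a href="/b">Second headline</a></li>
          <li><a href="/c">Third headline</a></li>
        </ul>
      </aside>
    HTML

    listings = Coelacanth::Extractor::ListingCollector.new.call(
      document: Oga.parse_html(html),
      base_url: "https://example.com/"
    )
    # Roughly: [{ heading: "Latest news",
    #             items: [{ title: "First headline", url: "https://example.com/a" }, ...] }]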

data/lib/coelacanth/extractor/markdown_renderer.rb
@@ -0,0 +1,128 @@
# frozen_string_literal: true

module Coelacanth
  class Extractor
    # Converts a DOM node into a lightweight Markdown representation.
    class MarkdownRenderer
      def self.render(node)
        new(node).render
      end

      def initialize(node)
        @node = node
      end

      def render
        return "" unless @node

        lines = traverse(@node)
        lines.compact.join("\n").gsub(/\n{3,}/, "\n\n")
      end

      private

      def traverse(node, depth = 0)
        return if node.nil?

        if document_node?(node)
          node.children.flat_map { |child| traverse(child, depth) }
        elsif element_node?(node)
          render_element(node, depth)
        elsif text_node?(node)
          text = node.text.to_s.strip
          text.empty? ? nil : text
        end
      end

      def render_element(node, depth)
        case node.name
        when "p"
          [node.children.flat_map { |child| traverse(child, depth) }.join(" "), ""]
        when "br"
          "\n"
        when "h1", "h2", "h3", "h4", "h5", "h6"
          level = node.name.delete_prefix("h").to_i
          heading = "#" * level + " " + inline_children(node, depth)
          [heading, ""]
        when "ul"
          element_children(node).flat_map { |child| render_list_item(child, depth, "-") } + [""]
        when "ol"
          element_children(node).each_with_index.flat_map do |child, index|
            render_list_item(child, depth, "#{index + 1}.")
          end + [""]
        when "li"
          ["- #{inline_children(node, depth)}"]
        when "strong", "b"
          "**#{inline_children(node, depth)}**"
        when "em", "i"
          "*#{inline_children(node, depth)}*"
        when "blockquote"
          quote = node.children.flat_map { |child| traverse(child, depth + 1) }.compact
          quote.map { |line| "> #{line}" } + [""]
        when "pre", "code"
          content = node.text
          ["```", content.rstrip, "```", ""]
        when "img"
          alt = node["alt"].to_s.strip
          src = node["src"].to_s.strip
          ["![#{alt}](#{src})", ""]
        else
          node.children.flat_map { |child| traverse(child, depth) }
        end
      end

      def inline_children(node, depth)
        node.children.flat_map { |child| traverse(child, depth) }.join(" ").squeeze(" ").strip
      end

      def render_list_item(node, depth, marker)
        text = inline_children(node, depth)
        return [] if text.empty?

        ["#{marker} #{text}"]
      end

      def document_node?(node)
        return false unless node

        (defined?(::Oga::XML::Document) && node.is_a?(::Oga::XML::Document)) ||
          (node.respond_to?(:document?) && node.document?) ||
          (node.respond_to?(:type) && node.type == :document)
      end

      def element_node?(node)
        return false unless node

        if node.respond_to?(:element?)
          node.element?
        elsif defined?(::Oga::XML::Element) && node.is_a?(::Oga::XML::Element)
          true
        elsif node.respond_to?(:type)
          node.type == :element
        else
          false
        end
      end

      def text_node?(node)
        return false unless node

        if node.respond_to?(:text?)
          node.text?
        elsif defined?(::Oga::XML::Text) && node.is_a?(::Oga::XML::Text)
          true
        elsif node.respond_to?(:type)
          node.type == :text
        else
          false
        end
      end

      def element_children(node)
        return [] unless node.respond_to?(:children)

        node.children.select { |child| element_node?(child) }
      end
    end
  end
end
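
A small sketch, again not part of the diff, of what MarkdownRenderer.render produces for a simple fragment. The input HTML is invented and the expected string is an approximation derived from the rendering rules above.

    require "oga"
    require "coelacanth"

    article = Oga.parse_html(
      "<article><h2>Title</h2><p>Hello <strong>world</strong></p></article>"
    ).at_css("article")

    Coelacanth::Extractor::MarkdownRenderer.render(article)
    # Expected to be roughly: "## Title\n\nHello **world**\n"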

data/lib/coelacanth/extractor/metadata_probe.rb
@@ -0,0 +1,121 @@
# frozen_string_literal: true

require "json"
require "oga"

require_relative "utilities"

module Coelacanth
  class Extractor
    # Attempts to pull article metadata such as JSON-LD and OpenGraph tags.
    class MetadataProbe
      ARTICLE_TYPES = %w[Article NewsArticle BlogPosting ReportageNewsArticle LiveBlogPosting].freeze

      Result = Struct.new(
        :title,
        :node,
        :published_at,
        :byline,
        :source_tag,
        :confidence,
        keyword_init: true
      )

      def call(doc:, url: nil)
        from_jsonld(doc, url) || from_semantic_nodes(doc)
      end

      private

      def from_jsonld(doc, url)
        doc.css("script[type='application/ld+json']").each do |script|
          next if script.text.strip.empty?

          begin
            payload = JSON.parse(script.text)
          rescue JSON::ParserError
            next
          end

          candidates = payload.is_a?(Array) ? payload : [payload]
          candidates.each do |candidate|
            next unless article_type?(candidate)

            body = candidate["articleBody"].to_s.strip
            next if body.empty?

            node = Oga.parse_html("<article>#{body}</article>").at_css("article")
            return Result.new(
              title: candidate["headline"] || candidate["name"],
              node: node,
              published_at: Utilities.parse_time(candidate["datePublished"] || candidate["dateCreated"]),
              byline: extract_author(candidate["author"]),
              source_tag: :jsonld,
              confidence: 0.9
            )
          end
        end
        nil
      end

      def from_semantic_nodes(doc)
        node = doc.at_css("main, article, [role='main'], [itemprop='articleBody']")
        return if node.nil?

        Result.new(
          title: title_from_meta(doc),
          node: node,
          published_at: published_at_from_meta(doc),
          byline: byline_from_meta(doc),
          source_tag: :semantic,
          confidence: 0.82
        )
      end

      def article_type?(candidate)
        type = candidate["@type"]
        Array(type).any? { |value| ARTICLE_TYPES.include?(value) }
      end

      def extract_author(author)
        case author
        when String
          author
        when Hash
          author["name"]
        when Array
          author.map { |item| extract_author(item) }.compact.join(", ")
        end
      end

      def title_from_meta(doc)
        Utilities.meta_content(
          doc,
          "meta[property='og:title']",
          "meta[name='twitter:title']",
          "meta[name='title']"
        ) || doc.at_css("title")&.text&.strip
      end

      def published_at_from_meta(doc)
        Utilities.parse_time(
          Utilities.meta_content(
            doc,
            "meta[property='article:published_time']",
            "meta[name='pubdate']",
            "meta[name='publish_date']",
            "meta[name='date']"
          )
        )
      end

      def byline_from_meta(doc)
        Utilities.meta_content(
          doc,
          "meta[name='author']",
          "meta[property='article:author']"
        )
      end
    end
  end
end
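
As an illustration only, a sketch of MetadataProbe#call against a page that embeds JSON-LD. The sample document is invented; the accessors come from the Result struct defined above, and loading via `require "coelacanth"` is assumed.

    require "oga"
    require "coelacanth"

    html = <<~HTML
      <html><head>
        <script type="application/ld+json">
          {"@type": "NewsArticle", "headline": "Example", "datePublished": "2024-05-01T09:00:00Z",
           "author": {"name": "Jane Doe"}, "articleBody": "Body text"}
        </script>
      </head><body></body></html>
    HTML

    result = Coelacanth::Extractor::MetadataProbe.new.call(doc: Oga.parse_html(html))
    result.title       # => "Example"
    result.byline      # => "Jane Doe"
    result.source_tag  # => :jsonld
    result.confidence  # => 0.9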

data/lib/coelacanth/extractor/normalizer.rb
@@ -0,0 +1,47 @@
# frozen_string_literal: true

require "oga"

require_relative "utilities"

module Coelacanth
  class Extractor
    # Sanitizes HTML and prepares an Oga document.
    class Normalizer
      REMOVABLE_SELECTORS = %w[style noscript iframe form nav].freeze

      def call(html:, base_url: nil)
        document = Oga.parse_html(html)
        remove_noise(document)
        normalize_images(document, base_url)
        document
      end

      private

      def remove_noise(document)
        REMOVABLE_SELECTORS.each do |selector|
          document.css(selector).each(&:remove)
        end

        document.css("script").each do |node|
          next if node["type"].to_s.strip.casecmp("application/ld+json").zero?

          node.remove
        end
      end

      def normalize_images(document, base_url)
        return unless base_url

        document.css("img").each do |image|
          src = image["src"].to_s.strip
          next if src.empty?

          absolute = Utilities.absolute_url(base_url, src)
          image.set("src", absolute) if absolute
        end
      end
    end
  end
end
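
A hypothetical before/after for Normalizer#call, showing the noise removal and image URL rewriting; the input markup is invented and the expected results follow from the selectors above.

    require "coelacanth"

    html = '<html><body><nav>menu</nav><script>track()</script>' \
           '<article><img src="/pic.png"><p>Text</p></article></body></html>'

    doc = Coelacanth::Extractor::Normalizer.new.call(html: html, base_url: "https://example.com/post")
    doc.css("nav").empty?     # => true (nav is in REMOVABLE_SELECTORS)
    doc.css("script").empty?  # => true (only application/ld+json scripts survive)
    # The img src should now read "https://example.com/pic.png".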

data/lib/coelacanth/extractor/utilities.rb
@@ -0,0 +1,145 @@
# frozen_string_literal: true

require "time"
require "uri"

module Coelacanth
  class Extractor
    # Shared helpers for the extractor pipeline.
    module Utilities
      PUNCTUATION = %w[。 、 . ・ . , ! ? : ; ; :]

      module_function

      def text_length(node)
        node&.text&.strip&.length.to_i
      end

      def link_text_length(node)
        return 0 unless node

        node.css("a").sum { |anchor| anchor.text.strip.length }
      end

      def punctuation_density(node)
        length = text_length(node)
        return 0.0 if length.zero?

        count = node.text.chars.count { |char| PUNCTUATION.include?(char) }
        count.to_f / length
      end

      def link_density(node)
        length = text_length(node)
        return 0.0 if length.zero?

        link_text_length(node).to_f / length
      end

      def depth(node)
        ancestors(node).length
      end

      def ancestors(node)
        return [] unless node

        if node.respond_to?(:ancestors)
          Array(node.ancestors)
        else
          collect_ancestors(node)
        end
      end

      def collect_ancestors(node)
        ancestors = []
        current = node

        while current.respond_to?(:parent) && (current = current.parent)
          ancestors << current
        end

        ancestors
      end

      def element?(node)
        return false unless node

        if node.respond_to?(:element?)
          node.element?
        elsif defined?(::Oga::XML::Element) && node.is_a?(::Oga::XML::Element)
          true
        elsif node.respond_to?(:type)
          node.type == :element
        else
          false
        end
      end

      def element_children(node)
        return [] unless node.respond_to?(:children)

        node.children.select { |child| element?(child) }
      end

      def sibling_elements(node)
        parent = node.respond_to?(:parent) ? node.parent : nil
        return [] unless parent

        element_children(parent)
      end

      def previous_element(node)
        siblings = sibling_elements(node)
        index = siblings.index(node)
        return unless index && index.positive?

        siblings[index - 1]
      end

      def next_element(node)
        siblings = sibling_elements(node)
        index = siblings.index(node)
        return unless index

        siblings[index + 1]
      end

      def class_id_tokens(node)
        tokens = []
        tokens.concat(split_tokens(node[:class])) if node[:class]
        tokens.concat(split_tokens(node[:id])) if node[:id]
        tokens
      end

      def split_tokens(value)
        value.to_s.split(/[\s_-]+/).map(&:downcase)
      end

      def meta_content(doc, *selectors)
        selectors.each do |selector|
          if (node = doc.at_css(selector))
            return node["content"].to_s.strip unless node["content"].to_s.strip.empty?
          end
        end
        nil
      end

      def parse_time(value)
        return if value.nil? || value.empty?

        Time.parse(value)
      rescue ArgumentError
        nil
      end

      def absolute_url(base_url, path)
        return if path.nil? || path.empty?
        return path if path =~ /^https?:/i

        URI.join(base_url, path).to_s
      rescue URI::Error
        path
      end
    end
  end
end
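
Finally, a quick sketch of the standalone Utilities helpers. The fragment and the numbers in the comments are illustrative assumptions, not output shipped with the gem.

    require "oga"
    require "coelacanth"

    helpers = Coelacanth::Extractor::Utilities
    node = Oga.parse_html('<li><a href="/x">Read more</a> - 2 min</li>').at_css("li")

    helpers.link_density(node)                          # ~0.53 (9 link chars of 17 total)
    helpers.absolute_url("https://example.com/", "/x")  # => "https://example.com/x"
    helpers.parse_time("2024-05-01")                    # => Time instance, nil when unparseable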