coelacanth 0.3.9 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,270 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "utilities"
4
+
5
module Coelacanth
  class Extractor
    # Identifies sidebar or inline news listings and returns link arrays.
    #
    # Every element matching CANDIDATE_SELECTOR is inspected. A container
    # qualifies when it yields at least MIN_ITEMS linked items and its score
    # reaches +minimum_score+; the best-scoring containers are returned.
    class ListingCollector
      # Tag names that may act as a listing container.
      LISTING_CONTAINER_TAGS = %w[aside section div ul ol dl].freeze
      # CSS selector built from the same tag list so the two cannot drift apart.
      CANDIDATE_SELECTOR = LISTING_CONTAINER_TAGS.join(", ").freeze
      # Minimum number of linked items a container must provide.
      MIN_ITEMS = 3
      # Minimum length of a normalised link title.
      MIN_TITLE_LENGTH = 2
      # Maximum number of listings returned by #call.
      MAX_LISTINGS = 3
      # Maximum snippet length, including the trailing ellipsis.
      SNIPPET_LIMIT = 120
      # Matches a complete heading tag name (h1..h6) and nothing else.
      HEADING_TAG = /\Ah[1-6]\z/.freeze

      # Collects the best-scoring listings of a document.
      #
      # @param document [#css] parsed document to search
      # @param base_url [String, nil] when given, item hrefs are resolved against it
      # @param primary_node [Object, nil] main content node; containers that are
      #   this node, inside it, or around it are skipped (unless it is body/html)
      # @return [Array<Hash>] up to MAX_LISTINGS hashes with :heading and :items,
      #   highest score first; equal scores keep document order
      def call(document:, base_url: nil, primary_node: nil)
        # collect_candidates already drops everything below minimum_score, so
        # only ordering and truncation remain. The position is part of the sort
        # key because sort_by alone is not stable.
        collect_candidates(document, base_url, primary_node)
          .each_with_index
          .sort_by { |candidate, position| [-candidate[:score], position] }
          .first(MAX_LISTINGS)
          .map { |candidate, _position| format_candidate(candidate) }
      end

      private

      # Builds one candidate hash (:node, :items, :heading, :score) per
      # qualifying container.
      def collect_candidates(document, base_url, primary_node)
        document.css(CANDIDATE_SELECTOR).filter_map do |node|
          next if skip_node?(node, primary_node)

          items = extract_items(node, base_url)
          next if items.length < MIN_ITEMS

          heading = heading_for(node)
          score = score_node(node, items, heading)
          next if score < minimum_score

          { node: node, items: items, heading: heading, score: score }
        end
      end

      # True when the node must not be considered: it is nested inside another
      # listing container, or it overlaps the primary content node.
      def skip_node?(node, primary_node)
        return true if nested_listing_container?(node)

        return false unless primary_node
        return false unless primary_node.respond_to?(:name)
        # body/html contain everything, so overlap with them is meaningless.
        return false if %w[body html].include?(primary_node.name)

        node == primary_node ||
          ancestor?(node, primary_node) ||
          ancestor?(primary_node, node)
      end

      # True when any ancestor element of the node is itself a container tag.
      def nested_listing_container?(node)
        Utilities.ancestors(node).any? do |ancestor|
          Utilities.element?(ancestor) && LISTING_CONTAINER_TAGS.include?(ancestor.name)
        end
      end

      # True when +candidate+ appears among the ancestors of +node+.
      def ancestor?(node, candidate)
        Utilities.ancestors(node).any? { |ancestor| ancestor == candidate }
      end

      # Turns the container's item nodes into { title:, url:, snippet: } hashes,
      # de-duplicated on title and url.
      def extract_items(node, base_url)
        item_nodes = candidate_children(node)
        return [] if item_nodes.empty?

        item_nodes.filter_map do |child|
          next unless contains_link?(child)

          anchor = primary_anchor(child)
          next unless anchor

          title = normalize_text(anchor.text)
          next if title.length < MIN_TITLE_LENGTH

          href = anchor["href"].to_s.strip
          next if href.empty?

          url = base_url ? Utilities.absolute_url(base_url, href) : href
          url ||= href

          snippet = build_snippet(child, title)

          item = { title: title, url: url }
          item[:snippet] = snippet unless snippet.nil? || snippet.empty?
          item
        end.uniq { |item| [item[:title], item[:url]] }
      end

      # Chooses the nodes that represent individual items of a container:
      # direct children containing links, else a run of same-tag children,
      # else the children of the first nested ul/ol/dl.
      def candidate_children(node)
        direct_children = Utilities.element_children(node)
        return [] if direct_children.empty?

        anchor_children = direct_children.select { |child| contains_link?(child) }
        return anchor_children if anchor_children.length >= MIN_ITEMS

        %w[li article div section p dd].each do |tag|
          grouped = direct_children.select { |child| child.name == tag }
          return grouped if grouped.length >= MIN_ITEMS
        end

        list_container = direct_children.find { |child| %w[ul ol dl].include?(child.name) }
        return Utilities.element_children(list_container) if list_container

        []
      end

      def contains_link?(node)
        node.css("a[href]").any?
      end

      # The anchor with the longest normalised text inside the node.
      def primary_anchor(node)
        node.css("a[href]").max_by { |anchor| normalize_text(anchor.text).length }
      end

      # Replaces CR/LF/TAB with spaces, collapses runs of spaces and trims.
      def normalize_text(text)
        text.to_s.gsub(/[\r\n\t]/, " ").squeeze(" ").strip
      end

      # Item text minus the title, falling back to nearby metadata.
      def build_snippet(node, title)
        snippet_from_node_text(node, title) || metadata_context(node, title)
      end

      def snippet_from_node_text(node, title)
        remainder = normalize_text(node.text).sub(title, "").strip
        remainder.empty? ? nil : truncate(remainder)
      end

      # Uses the first non-empty <time> text or a preceding sibling's text.
      def metadata_context(node, title)
        candidate = time_text(node) || preceding_metadata(node)
        return nil if candidate.nil?

        candidate = candidate.sub(title, "").strip
        candidate.empty? ? nil : truncate(candidate)
      end

      def time_text(node)
        node.css("time").map { |time| normalize_text(time.text) }.find { |text| !text.empty? }
      end

      # Text of the nearest of up to three preceding sibling elements that has any.
      def preceding_metadata(node)
        previous = Utilities.previous_element(node)
        3.times do
          break unless previous

          text = normalize_text(previous.text)
          return text unless text.empty?

          previous = Utilities.previous_element(previous)
        end

        nil
      end

      # Caps text at SNIPPET_LIMIT characters, ending with "..." when cut.
      def truncate(text)
        return text if text.length <= SNIPPET_LIMIT

        "#{text[0...(SNIPPET_LIMIT - 3)]}..."
      end

      # Heading inside the container (h1-h4), else a heading element among the
      # three preceding siblings.
      def heading_for(node)
        if (heading = node.at_css("h1, h2, h3, h4"))
          return normalize_text(heading.text)
        end

        previous = Utilities.previous_element(node)
        3.times do
          break unless previous

          # Anchored match: an unanchored /h[1-6]/ would also accept element
          # names that merely contain such a sequence (e.g. "graph2-widget").
          return normalize_text(previous.text) if HEADING_TAG.match?(previous.name)

          previous = Utilities.previous_element(previous)
        end

        nil
      end

      # Combines structure, heading, item count, link density and sibling
      # repetition, minus penalties for DOM depth and overlong items.
      def score_node(node, items, heading)
        structure_score = structural_score(node)
        heading_score = heading ? 45 : 0
        item_score = items.length * 40
        density_score = Utilities.link_density(node) * 90
        adjacency_score = sibling_sequence_bonus(node)
        depth_penalty = Utilities.depth(node) * 5
        length_penalty = long_text_penalty(node)

        structure_score + heading_score + item_score + density_score + adjacency_score - depth_penalty - length_penalty
      end

      # Rewards containers whose items share one tag, list containers and
      # li/dd items, plus a bonus for evenly sized items.
      def structural_score(node)
        children = candidate_children(node)
        return 0 if children.empty?

        dominant_tag, dominant_children = children.group_by(&:name).max_by { |_, nodes| nodes.length }
        dominant_count = dominant_children.length

        uniform_bonus = dominant_count == children.length ? 60 : 20
        list_bonus = %w[ul ol dl].include?(node.name) ? 90 : 0
        list_bonus += 45 if dominant_tag && %w[li dd].include?(dominant_tag)

        distribution_bonus = distribution_consistency_bonus(children)

        dominant_count * 12 + uniform_bonus + list_bonus + distribution_bonus
      end

      # 40 when no item's text length deviates from the mean by more than 120
      # characters, otherwise 10 (0 for fewer than MIN_ITEMS children).
      def distribution_consistency_bonus(children)
        return 0 if children.length < MIN_ITEMS

        lengths = children.map { |child| Utilities.text_length(child) }
        average = lengths.sum.to_f / lengths.length
        deviations = lengths.map { |len| (len - average).abs }

        deviations.max <= 120 ? 40 : 10
      end

      # 15 points per adjacent sibling (in either direction) that is
      # structured like the node.
      def sibling_sequence_bonus(node)
        siblings = Utilities.sibling_elements(node)
        return 0 if siblings.empty?

        index = siblings.index(node)
        return 0 unless index

        forward = 0
        while (candidate = siblings[index + forward + 1]) && similar_structure?(node, candidate)
          forward += 1
        end

        backward = 0
        # The explicit bound keeps a negative index from wrapping to the end.
        while index - backward - 1 >= 0 && (candidate = siblings[index - backward - 1]) && similar_structure?(node, candidate)
          backward += 1
        end

        (forward + backward) * 15
      end

      # Same first item tag and same number of items.
      def similar_structure?(node, other)
        return false unless other

        node_children = candidate_children(node)
        other_children = candidate_children(other)
        return false if node_children.empty? || other_children.empty?

        node_children.first.name == other_children.first.name && node_children.length == other_children.length
      end

      # 30 points per item whose text is longer than 280 characters.
      def long_text_penalty(node)
        children = candidate_children(node)
        return 0 if children.empty?

        overlong = children.count { |child| Utilities.text_length(child) > 280 }
        overlong * 30
      end

      # Lowest score a container needs to be reported.
      def minimum_score
        180
      end

      # Public shape of a listing: heading plus items.
      def format_candidate(candidate)
        {
          heading: candidate[:heading],
          items: candidate[:items]
        }
      end
    end
  end
end
@@ -0,0 +1,128 @@
1
+ # frozen_string_literal: true
2
+
3
module Coelacanth
  class Extractor
    # Converts a DOM node into a lightweight Markdown representation.
    #
    # Handles paragraphs, line breaks, headings, lists, emphasis, block quotes,
    # pre/code blocks and images; any other element is rendered as its children.
    class MarkdownRenderer
      # Renders +node+ without requiring the caller to build an instance.
      #
      # @param node [Object, nil] document, element or text node
      # @return [String] Markdown text ("" when node is nil)
      def self.render(node)
        new(node).render
      end

      # @param node [Object, nil] root node to render
      def initialize(node)
        @node = node
      end

      # @return [String] Markdown with runs of three or more newlines collapsed
      #   to a single blank line
      def render
        return "" unless @node

        # traverse yields an Array for block content, a String for inline or
        # text roots, and nil for unsupported nodes; normalise all three.
        lines = Array(traverse(@node)).flatten.compact
        lines.join("\n").gsub(/\n{3,}/, "\n\n")
      end

      private

      # Dispatches on node kind.
      #
      # @return [Array, String, nil] lines for documents and block elements, a
      #   string for inline elements and non-blank text, nil otherwise
      def traverse(node, depth = 0)
        return if node.nil?

        if document_node?(node)
          node.children.flat_map { |child| traverse(child, depth) }
        elsif element_node?(node)
          render_element(node, depth)
        elsif text_node?(node)
          text = node.text.to_s.strip
          text.empty? ? nil : text
        end
      end

      # Renders a single element according to its tag name. Block elements end
      # with an empty string so that a blank line follows them.
      def render_element(node, depth)
        case node.name
        when "p"
          # Same whitespace handling as headings and list items; whitespace-only
          # text nodes no longer leave double spaces behind.
          [inline_children(node, depth), ""]
        when "br"
          "\n"
        when "h1", "h2", "h3", "h4", "h5", "h6"
          level = node.name.delete_prefix("h").to_i
          ["#{'#' * level} #{inline_children(node, depth)}", ""]
        when "ul"
          element_children(node).flat_map { |child| render_list_item(child, depth, "-") } + [""]
        when "ol"
          render_ordered_list(node, depth)
        when "li"
          # Only reached for an <li> outside ul/ol.
          ["- #{inline_children(node, depth)}"]
        when "strong", "b"
          "**#{inline_children(node, depth)}**"
        when "em", "i"
          "*#{inline_children(node, depth)}*"
        when "blockquote"
          quote = node.children.flat_map { |child| traverse(child, depth + 1) }.compact
          quote.map { |line| "> #{line}" } + [""]
        when "pre", "code"
          content = node.text
          ["```", content.rstrip, "```", ""]
        when "img"
          alt = node["alt"].to_s.strip
          src = node["src"].to_s.strip
          ["![#{alt}](#{src})", ""]
        else
          node.children.flat_map { |child| traverse(child, depth) }
        end
      end

      # Numbers only the items that actually render, so skipped empty items do
      # not leave gaps in the sequence.
      def render_ordered_list(node, depth)
        texts = element_children(node)
                .map { |child| inline_children(child, depth) }
                .reject(&:empty?)

        texts.each_with_index.map { |text, index| "#{index + 1}. #{text}" } + [""]
      end

      # Renders the children on one line, separated by single spaces.
      def inline_children(node, depth)
        node.children.flat_map { |child| traverse(child, depth) }.join(" ").squeeze(" ").strip
      end

      # One "<marker> text" line, or nothing for an item without text.
      def render_list_item(node, depth, marker)
        text = inline_children(node, depth)
        return [] if text.empty?

        ["#{marker} #{text}"]
      end

      # True for an Oga document (when Oga is loaded) or any object reporting
      # itself as a document via #document? or #type.
      def document_node?(node)
        return false unless node

        (defined?(::Oga::XML::Document) && node.is_a?(::Oga::XML::Document)) ||
          (node.respond_to?(:document?) && node.document?) ||
          (node.respond_to?(:type) && node.type == :document)
      end

      # Element check; prefers the node's own #element? predicate.
      def element_node?(node)
        return false unless node

        if node.respond_to?(:element?)
          node.element?
        elsif defined?(::Oga::XML::Element) && node.is_a?(::Oga::XML::Element)
          true
        elsif node.respond_to?(:type)
          node.type == :element
        else
          false
        end
      end

      # Text-node check; prefers the node's own #text? predicate.
      def text_node?(node)
        return false unless node

        if node.respond_to?(:text?)
          node.text?
        elsif defined?(::Oga::XML::Text) && node.is_a?(::Oga::XML::Text)
          true
        elsif node.respond_to?(:type)
          node.type == :text
        else
          false
        end
      end

      # Child nodes that are elements ([] when the node has no #children).
      def element_children(node)
        return [] unless node.respond_to?(:children)

        node.children.select { |child| element_node?(child) }
      end
    end
  end
end
@@ -0,0 +1,121 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+ require "oga"
5
+
6
+ require_relative "utilities"
7
+
8
module Coelacanth
  class Extractor
    # Attempts to pull article metadata from JSON-LD blocks, falling back to
    # semantic container elements combined with <meta> tags.
    class MetadataProbe
      ARTICLE_TYPES = %w[Article NewsArticle BlogPosting ReportageNewsArticle LiveBlogPosting].freeze

      Result = Struct.new(
        :title,
        :node,
        :published_at,
        :byline,
        :source_tag,
        :confidence,
        keyword_init: true
      )

      # @param doc [#css, #at_css] parsed document
      # @param url [String, nil] page URL (currently not used by either strategy)
      # @return [Result, nil] JSON-LD result when an article body is present,
      #   else a semantic-node result, else nil
      def call(doc:, url: nil)
        from_jsonld(doc, url) || from_semantic_nodes(doc)
      end

      private

      # Returns a Result for the first JSON-LD article that carries a non-empty
      # articleBody. Unparseable scripts and non-object entries are skipped.
      def from_jsonld(doc, _url)
        doc.css("script[type='application/ld+json']").each do |script|
          jsonld_candidates(parse_json(script.text)).each do |candidate|
            next unless article_type?(candidate)

            body = candidate["articleBody"].to_s.strip
            next if body.empty?

            return jsonld_result(candidate, body)
          end
        end
        nil
      end

      # Parses script text, returning nil for blank or malformed JSON.
      def parse_json(text)
        source = text.to_s
        return nil if source.strip.empty?

        JSON.parse(source)
      rescue JSON::ParserError
        nil
      end

      # Flattens a parsed payload into the list of JSON objects worth
      # inspecting: the object itself, array members, and anything nested
      # under "@graph". Scalars and nil yield nothing.
      def jsonld_candidates(payload)
        case payload
        when Array
          payload.flat_map { |entry| jsonld_candidates(entry) }
        when Hash
          [payload] + jsonld_candidates(payload["@graph"])
        else
          []
        end
      end

      # Wraps the article body in an <article> element and fills in the
      # metadata fields available on the candidate.
      def jsonld_result(candidate, body)
        node = Oga.parse_html("<article>#{body}</article>").at_css("article")
        Result.new(
          title: candidate["headline"] || candidate["name"],
          node: node,
          published_at: Utilities.parse_time(first_text(candidate["datePublished"], candidate["dateCreated"])),
          byline: extract_author(candidate["author"]),
          source_tag: :jsonld,
          confidence: 0.9
        )
      end

      # First argument that is a non-blank String, or nil. Keeps non-string
      # JSON values (numbers, objects) away from the time parser.
      def first_text(*values)
        values.find { |value| value.is_a?(String) && !value.strip.empty? }
      end

      # Uses the first main/article-like element together with <meta> data.
      def from_semantic_nodes(doc)
        node = doc.at_css("main, article, [role='main'], [itemprop='articleBody']")
        return if node.nil?

        Result.new(
          title: title_from_meta(doc),
          node: node,
          published_at: published_at_from_meta(doc),
          byline: byline_from_meta(doc),
          source_tag: :semantic,
          confidence: 0.82
        )
      end

      # True when the candidate is a JSON object whose "@type" (string or
      # array) names one of ARTICLE_TYPES.
      def article_type?(candidate)
        return false unless candidate.is_a?(Hash)

        Array(candidate["@type"]).any? { |value| ARTICLE_TYPES.include?(value) }
      end

      # Accepts the author as a string, an object with "name", or an array of
      # either. Returns nil when no name can be found.
      def extract_author(author)
        case author
        when String
          author
        when Hash
          extract_author(author["name"])
        when Array
          names = author.filter_map { |item| extract_author(item) }
          names.empty? ? nil : names.join(", ")
        end
      end

      def title_from_meta(doc)
        Utilities.meta_content(
          doc,
          "meta[property='og:title']",
          "meta[name='twitter:title']",
          "meta[name='title']"
        ) || doc.at_css("title")&.text&.strip
      end

      def published_at_from_meta(doc)
        Utilities.parse_time(
          Utilities.meta_content(
            doc,
            "meta[property='article:published_time']",
            "meta[name='pubdate']",
            "meta[name='publish_date']",
            "meta[name='date']"
          )
        )
      end

      def byline_from_meta(doc)
        Utilities.meta_content(
          doc,
          "meta[name='author']",
          "meta[property='article:author']"
        )
      end
    end
  end
end
@@ -0,0 +1,47 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "oga"
4
+
5
+ require_relative "utilities"
6
+
7
module Coelacanth
  class Extractor
    # Parses raw HTML with Oga and strips elements that only add noise.
    class Normalizer
      REMOVABLE_SELECTORS = %w[style noscript iframe form nav].freeze

      # @param html [String] raw markup
      # @param base_url [String, nil] when present, image sources are made absolute
      # @return [Object] the parsed, cleaned document
      def call(html:, base_url: nil)
        Oga.parse_html(html).tap do |document|
          remove_noise(document)
          normalize_images(document, base_url)
        end
      end

      private

      # Drops every element matching REMOVABLE_SELECTORS, then every <script>
      # except JSON-LD blocks.
      def remove_noise(document)
        REMOVABLE_SELECTORS.each { |selector| document.css(selector).each(&:remove) }

        document.css("script").each do |script|
          script.remove unless json_ld_script?(script)
        end
      end

      # Case-insensitive match on the trimmed type attribute.
      def json_ld_script?(script)
        script["type"].to_s.strip.casecmp("application/ld+json").zero?
      end

      # Rewrites each non-empty img src through Utilities.absolute_url.
      def normalize_images(document, base_url)
        if base_url
          document.css("img").each do |img|
            source = img["src"].to_s.strip
            unless source.empty?
              resolved = Utilities.absolute_url(base_url, source)
              img.set("src", resolved) if resolved
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,145 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "time"
4
+ require "uri"
5
+
6
module Coelacanth
  class Extractor
    # Shared helpers for the extractor pipeline. All methods are stateless and
    # callable as Utilities.method_name.
    module Utilities
      # Characters counted by punctuation_density.
      # NOTE(review): ".", ":" and ";" each appear twice in this list — possibly
      # full-width variants that were normalised somewhere; confirm against the
      # released gem.
      PUNCTUATION = %w[。 、 . ・ . , ! ? : ; ; :].freeze

      module_function

      # Length of the node's stripped text (0 for nil nodes or nil text).
      def text_length(node)
        node&.text&.strip&.length.to_i
      end

      # Combined stripped text length of all <a> elements under the node.
      def link_text_length(node)
        return 0 unless node

        node.css("a").sum { |anchor| anchor.text.strip.length }
      end

      # Share of PUNCTUATION characters relative to the stripped text length.
      def punctuation_density(node)
        length = text_length(node)
        return 0.0 if length.zero?

        count = node.text.chars.count { |char| PUNCTUATION.include?(char) }
        count.to_f / length
      end

      # Share of the node's text that sits inside links (0.0 for empty text).
      def link_density(node)
        length = text_length(node)
        return 0.0 if length.zero?

        link_text_length(node).to_f / length
      end

      # Number of ancestors of the node.
      def depth(node)
        ancestors(node).length
      end

      # Ancestors via the node's own #ancestors when available, otherwise by
      # walking #parent.
      def ancestors(node)
        return [] unless node

        if node.respond_to?(:ancestors)
          Array(node.ancestors)
        else
          collect_ancestors(node)
        end
      end

      # Follows #parent until it is missing or returns nil; nearest first.
      def collect_ancestors(node)
        ancestors = []
        current = node

        while current.respond_to?(:parent) && (current = current.parent)
          ancestors << current
        end

        ancestors
      end

      # Element check; prefers the node's own #element? predicate.
      def element?(node)
        return false unless node

        if node.respond_to?(:element?)
          node.element?
        elsif defined?(::Oga::XML::Element) && node.is_a?(::Oga::XML::Element)
          true
        elsif node.respond_to?(:type)
          node.type == :element
        else
          false
        end
      end

      # Child nodes that are elements ([] when the node has no #children).
      def element_children(node)
        return [] unless node.respond_to?(:children)

        node.children.select { |child| element?(child) }
      end

      # Element children of the node's parent, the node itself included.
      def sibling_elements(node)
        parent = node.respond_to?(:parent) ? node.parent : nil
        return [] unless parent

        element_children(parent)
      end

      # Element immediately before the node, or nil.
      def previous_element(node)
        siblings = sibling_elements(node)
        index = siblings.index(node)
        # Guard index 0: siblings[-1] would wrap around to the last element.
        return unless index&.positive?

        siblings[index - 1]
      end

      # Element immediately after the node, or nil.
      def next_element(node)
        siblings = sibling_elements(node)
        index = siblings.index(node)
        return unless index

        siblings[index + 1]
      end

      # Lower-cased tokens from the node's class and id attributes.
      def class_id_tokens(node)
        tokens = []
        tokens.concat(split_tokens(node[:class])) if node[:class]
        tokens.concat(split_tokens(node[:id])) if node[:id]
        tokens
      end

      # Splits on whitespace, underscores and hyphens and lower-cases the parts.
      def split_tokens(value)
        value.to_s.split(/[\s_-]+/).map(&:downcase)
      end

      # Stripped content attribute of the first selector whose first match has
      # a non-blank content value; nil when none does.
      def meta_content(doc, *selectors)
        selectors.each do |selector|
          node = doc.at_css(selector)
          next unless node

          content = node["content"].to_s.strip
          return content unless content.empty?
        end
        nil
      end

      # Parses a timestamp string.
      #
      # @param value [String, Time, nil] raw value; anything that is neither a
      #   String nor a Time (e.g. numbers or objects from JSON) yields nil
      # @return [Time, nil] parsed time, or nil for blank or unparseable input
      def parse_time(value)
        return value if value.is_a?(Time)
        return unless value.is_a?(String)

        text = value.strip
        return if text.empty?

        Time.parse(text)
      rescue ArgumentError
        nil
      end

      # Resolves +path+ against +base_url+.
      #
      # @param base_url [String, URI, nil] base location
      # @param path [String, nil] absolute or relative reference
      # @return [String, nil] nil for a blank path; the path unchanged when it
      #   is already http(s), when no base is available, or when joining fails;
      #   otherwise the joined URL
      def absolute_url(base_url, path)
        return if path.nil? || path.empty?
        # \A anchors to the start of the string; ^ would match after any newline.
        return path if path.match?(/\Ahttps?:/i)
        return path if base_url.nil? || base_url.to_s.empty?

        URI.join(base_url, path).to_s
      rescue URI::Error, ArgumentError
        path
      end
    end
  end
end