coelacanth 0.3.10 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,19 @@
1
+ # frozen_string_literal: true
2
+
3
module Coelacanth
  class Extractor
    # Gathers src/alt metadata for every <img> beneath the extracted DOM
    # node, dropping entries whose src attribute is blank.
    class ImageCollector
      # node may be nil (nothing extracted). Returns an array of
      # { src:, alt: } hashes with whitespace-trimmed string values.
      def call(node)
        return [] if node.nil?

        node.css("img").each_with_object([]) do |img, collected|
          src = img["src"].to_s.strip
          # Images without a usable source are worthless downstream.
          next if src.empty?

          collected << { src: src, alt: img["alt"].to_s.strip }
        end
      end
    end
  end
end
@@ -0,0 +1,108 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "utilities"
4
+
5
module Coelacanth
  class Extractor
    # Extracts structured listings from Markdown content.
    #
    # A "listing" is a run of consecutive list items, optionally preceded
    # by a heading, with at least MIN_ITEMS entries. Each item carries a
    # title and — when written as a Markdown link — a url plus an optional
    # trailing snippet.
    class MarkdownListingCollector
      LIST_ITEM_PATTERN = /\A(?:[-*+]|\d+\.)\s+/.freeze
      HEADING_PATTERN = /\A#+\s*/.freeze
      MIN_ITEMS = 3
      MIN_TITLE_LENGTH = 2

      # markdown: raw Markdown text; base_url: optional base used to
      # absolutize relative link targets. Returns an array of
      # { heading:, items: } hashes.
      def call(markdown:, base_url: nil)
        source = markdown.to_s
        return [] if source.strip.empty?

        collected = []
        open_heading = nil
        open_items = nil
        next_heading = nil

        # Closes the listing being accumulated, keeping it only when it
        # reached the minimum item count.
        flush = lambda do
          unless open_items.nil?
            collected << { heading: open_heading, items: open_items } if open_items.length >= MIN_ITEMS
            open_items = nil
          end
        end

        source.each_line do |raw|
          line = raw.strip

          if line.empty?
            # A blank line ends the current listing but keeps a pending
            # heading alive for the next one.
            flush.call
          elsif heading_line?(line)
            flush.call
            next_heading = normalize_heading(line)
          elsif list_item_line?(line)
            if open_items.nil?
              open_heading = next_heading
              open_items = []
            end
            next_heading = nil

            item = build_item(line, base_url)
            open_items << item if item
          else
            # Ordinary prose breaks both the listing and the pending heading.
            flush.call
            next_heading = nil
          end
        end

        flush.call
        collected
      end

      private

      def heading_line?(line)
        line.start_with?("#") && line.match?(HEADING_PATTERN)
      end

      def list_item_line?(line)
        line.match?(LIST_ITEM_PATTERN)
      end

      def normalize_heading(line)
        line.sub(HEADING_PATTERN, "").strip
      end

      # Builds one item hash from a list line, or nil when the line carries
      # no usable title.
      def build_item(line, base_url)
        body = line.sub(LIST_ITEM_PATTERN, "").strip
        return if body.empty?

        match = body.match(/\A\[([^\]]+)\]\(([^\)]+)\)(.*)\z/)
        return plain_item(body) if match.nil?

        title = match[1].to_s.strip
        return if title.length < MIN_TITLE_LENGTH

        href = match[2].to_s.strip
        item = { title: title, url: Utilities.absolute_url(base_url, href) || href }

        snippet = normalize_snippet(match[3].to_s.strip)
        item[:snippet] = snippet unless snippet.nil? || snippet.empty?
        item
      end

      # Item for a list line with no Markdown link.
      def plain_item(title)
        return if title.length < MIN_TITLE_LENGTH

        { title: title }
      end

      # Strips a leading dash/colon separator; nil for blank text.
      def normalize_snippet(text)
        cleaned = text.to_s.sub(/\A[-–—:]\s*/, "").strip
        cleaned.empty? ? nil : cleaned
      end
    end
  end
end
@@ -0,0 +1,132 @@
1
+ # frozen_string_literal: true
2
+
3
module Coelacanth
  class Extractor
    # Converts a DOM node into a lightweight Markdown representation.
    #
    # Handles a small subset of HTML (paragraphs, headings, lists, links,
    # emphasis, blockquotes, code, images); unknown elements are transparent
    # and only their children are rendered.
    class MarkdownRenderer
      # Convenience wrapper: render the given node in one call.
      def self.render(node)
        new(node).render
      end

      def initialize(node)
        @node = node
      end

      # Renders the node tree to a Markdown string. Returns "" for a nil
      # node; collapses runs of 3+ newlines down to one blank line.
      def render
        return "" unless @node

        lines = traverse(@node)
        lines.compact.join("\n").gsub(/\n{3,}/, "\n\n")
      end

      private

      # Dispatches on node kind. Returns a String, an Array of lines, or
      # nil (whitespace-only text / unrecognized node kinds).
      def traverse(node, depth = 0)
        return if node.nil?

        if document_node?(node)
          node.children.flat_map { |child| traverse(child, depth) }
        elsif element_node?(node)
          render_element(node, depth)
        elsif text_node?(node)
          text = node.text.to_s.strip
          text.empty? ? nil : text
        end
      end

      # Maps one element to its Markdown form. Block-level results append a
      # trailing "" entry so the joined output gets a blank separator line.
      def render_element(node, depth)
        case node.name
        when "p"
          [node.children.flat_map { |child| traverse(child, depth) }.join(" "), ""]
        when "br"
          "\n"
        when "h1", "h2", "h3", "h4", "h5", "h6"
          level = node.name.delete_prefix("h").to_i
          heading = "#" * level + " " + inline_children(node, depth)
          [heading, ""]
        when "ul"
          element_children(node).flat_map { |child| render_list_item(child, depth, "-") } + [""]
        when "ol"
          element_children(node).each_with_index.flat_map do |child, index|
            render_list_item(child, depth, "#{index + 1}.")
          end + [""]
        when "li"
          # Fallback for an <li> reached outside a ul/ol parent.
          ["- #{inline_children(node, depth)}"]
        when "a"
          href = node["href"].to_s.strip
          text = inline_children(node, depth)
          href.empty? ? text : "[#{text}](#{href})"
        when "strong", "b"
          "**#{inline_children(node, depth)}**"
        when "em", "i"
          "*#{inline_children(node, depth)}*"
        when "blockquote"
          quote = node.children.flat_map { |child| traverse(child, depth + 1) }.compact
          quote.map { |line| "> #{line}" } + [""]
        when "pre", "code"
          # NOTE(review): inline <code> is also rendered as a fenced block —
          # confirm this flattening is intended.
          content = node.text
          ["```", content.rstrip, "```", ""]
        when "img"
          alt = node["alt"].to_s.strip
          src = node["src"].to_s.strip
          ["![#{alt}](#{src})", ""]
        else
          # Unknown elements are transparent: render their children only.
          node.children.flat_map { |child| traverse(child, depth) }
        end
      end

      # Joins the rendered children into a single single-spaced line.
      def inline_children(node, depth)
        node.children.flat_map { |child| traverse(child, depth) }.join(" ").squeeze(" ").strip
      end

      # One "<marker> text" line per item; empty items are skipped.
      def render_list_item(node, depth, marker)
        text = inline_children(node, depth)
        return [] if text.empty?

        ["#{marker} #{text}"]
      end

      # The duck-typed checks below probe several node APIs so the renderer
      # works with Oga documents as well as simple stand-in objects.
      def document_node?(node)
        return false unless node

        (defined?(::Oga::XML::Document) && node.is_a?(::Oga::XML::Document)) ||
          (node.respond_to?(:document?) && node.document?) ||
          (node.respond_to?(:type) && node.type == :document)
      end

      def element_node?(node)
        return false unless node

        if node.respond_to?(:element?)
          node.element?
        elsif defined?(::Oga::XML::Element) && node.is_a?(::Oga::XML::Element)
          true
        elsif node.respond_to?(:type)
          node.type == :element
        else
          false
        end
      end

      def text_node?(node)
        return false unless node

        if node.respond_to?(:text?)
          node.text?
        elsif defined?(::Oga::XML::Text) && node.is_a?(::Oga::XML::Text)
          true
        elsif node.respond_to?(:type)
          node.type == :text
        else
          false
        end
      end

      # Child nodes that are elements (text/comment children excluded).
      def element_children(node)
        return [] unless node.respond_to?(:children)

        node.children.select { |child| element_node?(child) }
      end
    end
  end
end
@@ -0,0 +1,121 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+ require "oga"
5
+
6
+ require_relative "utilities"
7
+
8
module Coelacanth
  class Extractor
    # Attempts to pull article metadata such as JSON-LD and OpenGraph tags.
    class MetadataProbe
      ARTICLE_TYPES = %w[Article NewsArticle BlogPosting ReportageNewsArticle LiveBlogPosting].freeze

      Result = Struct.new(
        :title,
        :node,
        :published_at,
        :byline,
        :source_tag,
        :confidence,
        keyword_init: true
      )

      # Probes the parsed document for article metadata. JSON-LD wins over
      # semantic-node detection; returns nil when neither matches.
      def call(doc:, url: nil)
        from_jsonld(doc, url) || from_semantic_nodes(doc)
      end

      private

      # Scans every JSON-LD <script> for an article-typed object carrying a
      # non-empty articleBody; returns a high-confidence Result or nil.
      def from_jsonld(doc, url)
        doc.css("script[type='application/ld+json']").each do |script|
          next if script.text.strip.empty?

          begin
            payload = JSON.parse(script.text)
          rescue JSON::ParserError
            next
          end

          jsonld_candidates(payload).each do |candidate|
            next unless article_type?(candidate)

            body = candidate["articleBody"].to_s.strip
            next if body.empty?

            # Wrap the body so the extractor gets a single root node.
            node = Oga.parse_html("<article>#{body}</article>").at_css("article")
            return Result.new(
              title: candidate["headline"] || candidate["name"],
              node: node,
              published_at: Utilities.parse_time(candidate["datePublished"] || candidate["dateCreated"]),
              byline: extract_author(candidate["author"]),
              source_tag: :jsonld,
              confidence: 0.9
            )
          end
        end
        nil
      end

      # Normalizes a parsed JSON-LD payload into a flat list of Hash
      # candidates. Handles three shapes: a single object, a top-level
      # array, and objects nesting their entities under "@graph". Non-Hash
      # entries (strings, numbers) are dropped so indexing into a candidate
      # can never raise NoMethodError.
      def jsonld_candidates(payload)
        list = payload.is_a?(Array) ? payload : [payload]
        list.flat_map do |entry|
          next [] unless entry.is_a?(Hash)

          graph = entry["@graph"]
          graph.is_a?(Array) ? [entry] + graph.select { |item| item.is_a?(Hash) } : [entry]
        end
      end

      # Fallback: grab the first semantic container and fill metadata from
      # <meta> tags; slightly lower confidence than JSON-LD.
      def from_semantic_nodes(doc)
        node = doc.at_css("main, article, [role='main'], [itemprop='articleBody']")
        return if node.nil?

        Result.new(
          title: title_from_meta(doc),
          node: node,
          published_at: published_at_from_meta(doc),
          byline: byline_from_meta(doc),
          source_tag: :semantic,
          confidence: 0.82
        )
      end

      # "@type" may be a single string or an array of types.
      def article_type?(candidate)
        type = candidate["@type"]
        Array(type).any? { |value| ARTICLE_TYPES.include?(value) }
      end

      # JSON-LD authors come as a plain name, a Person object, or an array
      # of either; nil when the shape is unrecognized.
      def extract_author(author)
        case author
        when String
          author
        when Hash
          author["name"]
        when Array
          author.map { |item| extract_author(item) }.compact.join(", ")
        end
      end

      def title_from_meta(doc)
        Utilities.meta_content(
          doc,
          "meta[property='og:title']",
          "meta[name='twitter:title']",
          "meta[name='title']"
        ) || doc.at_css("title")&.text&.strip
      end

      def published_at_from_meta(doc)
        Utilities.parse_time(
          Utilities.meta_content(
            doc,
            "meta[property='article:published_time']",
            "meta[name='pubdate']",
            "meta[name='publish_date']",
            "meta[name='date']"
          )
        )
      end

      def byline_from_meta(doc)
        Utilities.meta_content(
          doc,
          "meta[name='author']",
          "meta[property='article:author']"
        )
      end
    end
  end
end
@@ -0,0 +1,47 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "oga"
4
+
5
+ require_relative "utilities"
6
+
7
module Coelacanth
  class Extractor
    # Sanitizes HTML and prepares an Oga document.
    class Normalizer
      REMOVABLE_SELECTORS = %w[style noscript iframe form nav].freeze

      # Parses html into an Oga document, strips noisy elements, and
      # rewrites relative image sources against base_url when given.
      def call(html:, base_url: nil)
        document = Oga.parse_html(html)
        remove_noise(document)
        normalize_images(document, base_url)
        document
      end

      private

      # Drops layout/script noise while keeping JSON-LD payloads intact
      # for the metadata probe.
      def remove_noise(document)
        REMOVABLE_SELECTORS.each { |selector| document.css(selector).each(&:remove) }

        document.css("script").each do |script|
          script.remove unless jsonld_script?(script)
        end
      end

      # True when the script tag declares the application/ld+json type
      # (case-insensitive, ignoring surrounding whitespace).
      def jsonld_script?(script)
        script["type"].to_s.strip.casecmp("application/ld+json").zero?
      end

      # Rewrites each img src to an absolute URL; no-op without base_url.
      def normalize_images(document, base_url)
        return unless base_url

        document.css("img").each do |img|
          relative = img["src"].to_s.strip
          next if relative.empty?

          resolved = Utilities.absolute_url(base_url, relative)
          img.set("src", resolved) if resolved
        end
      end
    end
  end
end
@@ -0,0 +1,145 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "time"
4
+ require "uri"
5
+
6
module Coelacanth
  class Extractor
    # Shared helpers for the extractor pipeline.
    module Utilities
      # Sentence punctuation (CJK and ASCII variants) used for density scoring.
      PUNCTUATION = %w[。 、 . ・ . , ! ? : ; ; :].freeze

      module_function

      # Length of the node's stripped text; 0 for nil nodes.
      def text_length(node)
        node&.text&.strip&.length.to_i
      end

      # Total stripped text length across all anchor descendants.
      def link_text_length(node)
        return 0 unless node

        node.css("a").sum { |anchor| anchor.text.strip.length }
      end

      # Fraction of the node's characters that are punctuation; 0.0 for
      # empty nodes.
      def punctuation_density(node)
        length = text_length(node)
        return 0.0 if length.zero?

        count = node.text.chars.count { |char| PUNCTUATION.include?(char) }
        count.to_f / length
      end

      # Fraction of the node's text that sits inside links; 0.0 for empty
      # nodes.
      def link_density(node)
        length = text_length(node)
        return 0.0 if length.zero?

        link_text_length(node).to_f / length
      end

      # Number of ancestors above the node (document depth).
      def depth(node)
        ancestors(node).length
      end

      def ancestors(node)
        return [] unless node

        if node.respond_to?(:ancestors)
          Array(node.ancestors)
        else
          collect_ancestors(node)
        end
      end

      # Fallback ancestor walk for node types without #ancestors.
      def collect_ancestors(node)
        ancestors = []
        current = node

        while current.respond_to?(:parent) && (current = current.parent)
          ancestors << current
        end

        ancestors
      end

      # True when the node is an element; probes several node APIs so it
      # works with Oga nodes and simple stand-ins alike.
      def element?(node)
        return false unless node

        if node.respond_to?(:element?)
          node.element?
        elsif defined?(::Oga::XML::Element) && node.is_a?(::Oga::XML::Element)
          true
        elsif node.respond_to?(:type)
          node.type == :element
        else
          false
        end
      end

      # Child nodes that are elements (text/comment children excluded).
      def element_children(node)
        return [] unless node.respond_to?(:children)

        node.children.select { |child| element?(child) }
      end

      # All element siblings of the node, including the node itself.
      def sibling_elements(node)
        parent = node.respond_to?(:parent) ? node.parent : nil
        return [] unless parent

        element_children(parent)
      end

      def previous_element(node)
        siblings = sibling_elements(node)
        index = siblings.index(node)
        return unless index && index.positive?

        siblings[index - 1]
      end

      def next_element(node)
        siblings = sibling_elements(node)
        index = siblings.index(node)
        return unless index

        siblings[index + 1]
      end

      # Lowercased tokens drawn from the node's class and id attributes.
      def class_id_tokens(node)
        tokens = []
        tokens.concat(split_tokens(node[:class])) if node[:class]
        tokens.concat(split_tokens(node[:id])) if node[:id]
        tokens
      end

      def split_tokens(value)
        value.to_s.split(/[\s_-]+/).map(&:downcase)
      end

      # First non-empty content attribute among the given selectors, or nil.
      def meta_content(doc, *selectors)
        selectors.each do |selector|
          node = doc.at_css(selector)
          next unless node

          content = node["content"].to_s.strip
          return content unless content.empty?
        end
        nil
      end

      # Parses a timestamp-ish value into a Time, or nil when blank or
      # unparseable. Coerces non-string input (JSON-LD sometimes carries
      # unexpected types) instead of raising.
      def parse_time(value)
        text = value.to_s.strip
        return if text.empty?

        Time.parse(text)
      rescue ArgumentError
        nil
      end

      # Resolves path against base_url. Returns path unchanged when it is
      # already absolute, nil when path is blank or base_url is missing,
      # and the raw path when joining fails.
      def absolute_url(base_url, path)
        return if path.nil? || path.empty?
        # \A anchors the whole string; ^ would also match after a newline.
        return path if path.match?(/\Ahttps?:/i)
        # Without a base there is nothing to join against; URI.join(nil, ...)
        # raises ArgumentError, which the URI::Error rescue would not catch.
        return if base_url.nil? || base_url.to_s.empty?

        URI.join(base_url, path).to_s
      rescue URI::Error
        path
      end
    end
  end
end
@@ -0,0 +1,136 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "oga"
4
+
5
+ require_relative "utilities"
6
+
7
module Coelacanth
  class Extractor
    # Lightweight probabilistic scorer that emulates a learned classifier
    # using hand-tuned heuristics.
    class WeakMlProbe
      Result = Struct.new(
        :title,
        :node,
        :published_at,
        :byline,
        :source_tag,
        :confidence,
        keyword_init: true
      )

      BLOCK_SELECTOR = "article, main, section, div".freeze

      # Per-token weights applied to class/id tokens; negatives mark chrome.
      TOKEN_WEIGHTS = {
        "content" => 1.1,
        "article" => 1.0,
        "body" => 0.9,
        "post" => 0.8,
        "entry" => 0.75,
        "text" => 0.6,
        "story" => 0.6,
        "blog" => 0.5,
        "share" => -1.0,
        "nav" => -1.3,
        "footer" => -1.2,
        "header" => -1.1,
        "related" => -0.8
      }.freeze

      # Linear-model coefficients; :bias is the intercept.
      FEATURE_WEIGHTS = {
        bias: -1.2,
        text_length: 0.002,
        link_density: -2.6,
        punctuation_density: 1.8,
        depth: -0.12,
        token_score: 1.6
      }.freeze

      # Scores every candidate block and wraps the most probable one in a
      # Result; returns nil when nothing clears the probability floor.
      def call(doc:, url: nil)
        scored = doc.css(BLOCK_SELECTOR).each_with_object([]) do |node, acc|
          candidate = evaluate(node)
          acc << candidate if candidate
        end
        return if scored.empty?

        winner = scored.max_by { |candidate| candidate[:probability] }
        return if winner[:probability] < 0.45

        Result.new(
          title: title_from_meta(doc),
          node: winner[:node],
          published_at: published_at_from_meta(doc),
          byline: byline_from_meta(doc),
          source_tag: :ml,
          confidence: winner[:probability].clamp(0.0, 0.9)
        )
      end

      private

      # Builds the feature vector for one node and converts its linear
      # score into a probability; nil for blocks with too little text.
      def evaluate(node)
        length = Utilities.text_length(node)
        return if length < 60

        features = {
          text_length: length,
          link_density: Utilities.link_density(node),
          punctuation_density: Utilities.punctuation_density(node),
          depth: Utilities.depth(node),
          token_score: token_score(node)
        }

        { node: node, probability: logistic(linear_combination(features)) }
      end

      # Sums the class/id token weights; unknown tokens contribute 0.
      def token_score(node)
        Utilities.class_id_tokens(node).sum { |token| TOKEN_WEIGHTS.fetch(token, 0.0) }
      end

      # Weighted sum of the features plus the bias term. Iterates in
      # FEATURE_WEIGHTS insertion order so float accumulation matches the
      # original explicit expression exactly.
      def linear_combination(features)
        FEATURE_WEIGHTS.reduce(0.0) do |total, (name, weight)|
          name == :bias ? total + weight : total + weight * features[name]
        end
      end

      # Standard sigmoid squashing a raw score into (0, 1).
      def logistic(score)
        1.0 / (1.0 + Math.exp(-score))
      end

      # The meta-tag fallbacks below mirror MetadataProbe so both probes
      # report the same document-level metadata.
      def title_from_meta(doc)
        meta = Utilities.meta_content(
          doc,
          "meta[property='og:title']",
          "meta[name='twitter:title']",
          "meta[name='title']"
        )
        meta || doc.at_css("title")&.text&.strip
      end

      def published_at_from_meta(doc)
        raw = Utilities.meta_content(
          doc,
          "meta[property='article:published_time']",
          "meta[name='pubdate']",
          "meta[name='publish_date']",
          "meta[name='date']"
        )
        Utilities.parse_time(raw)
      end

      def byline_from_meta(doc)
        Utilities.meta_content(
          doc,
          "meta[name='author']",
          "meta[property='article:author']"
        )
      end
    end
  end
end