coelacanth 0.3.10 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.env.example +5 -0
- data/CHANGELOG.md +3 -8
- data/Gemfile +1 -1
- data/README.md +146 -52
- data/compose.yml +5 -2
- data/config/coelacanth.yml +5 -3
- data/lib/coelacanth/client/ferrum.rb +12 -2
- data/lib/coelacanth/client/screenshot_one.rb +31 -9
- data/lib/coelacanth/configure.rb +6 -1
- data/lib/coelacanth/dom.rb +8 -2
- data/lib/coelacanth/extractor/fallback_probe.rb +34 -0
- data/lib/coelacanth/extractor/heuristic_probe.rb +175 -0
- data/lib/coelacanth/extractor/image_collector.rb +19 -0
- data/lib/coelacanth/extractor/markdown_listing_collector.rb +108 -0
- data/lib/coelacanth/extractor/markdown_renderer.rb +132 -0
- data/lib/coelacanth/extractor/metadata_probe.rb +121 -0
- data/lib/coelacanth/extractor/normalizer.rb +47 -0
- data/lib/coelacanth/extractor/utilities.rb +145 -0
- data/lib/coelacanth/extractor/weak_ml_probe.rb +136 -0
- data/lib/coelacanth/extractor.rb +67 -0
- data/lib/coelacanth/http.rb +72 -0
- data/lib/coelacanth/redirect.rb +6 -1
- data/lib/coelacanth/robots.rb +150 -0
- data/lib/coelacanth/version.rb +1 -1
- data/lib/coelacanth.rb +16 -1
- metadata +14 -2
- data/Gemfile.lock +0 -103
data/lib/coelacanth/extractor/image_collector.rb
@@ -0,0 +1,19 @@
+# frozen_string_literal: true
+
+module Coelacanth
+  class Extractor
+    # Collects image metadata from the extracted DOM node.
+    class ImageCollector
+      def call(node)
+        return [] unless node
+
+        node.css("img").map do |image|
+          {
+            src: image["src"].to_s.strip,
+            alt: image["alt"].to_s.strip
+          }
+        end.reject { |entry| entry[:src].empty? }
+      end
+    end
+  end
+end
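The collector is duck-typed over whatever DOM node the pipeline hands it. Below is a minimal usage sketch against an Oga fragment; the HTML, require path, and expected result are illustrative rather than taken from the gem's documentation, and it assumes the Hash-style attribute reads the collector performs work against the installed oga version.

```ruby
# Minimal usage sketch; the fragment and expected result are illustrative.
require "oga"
require "coelacanth/extractor/image_collector"

html = <<~HTML
  <article>
    <img src="/images/fish.png" alt="A coelacanth">
    <img src="" alt="dropped: entries with a blank src are rejected">
  </article>
HTML

node = Oga.parse_html(html).at_css("article")
images = Coelacanth::Extractor::ImageCollector.new.call(node)
# Expected shape: [{ src: "/images/fish.png", alt: "A coelacanth" }]
p images
```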
data/lib/coelacanth/extractor/markdown_listing_collector.rb
@@ -0,0 +1,108 @@
+# frozen_string_literal: true
+
+require_relative "utilities"
+
+module Coelacanth
+  class Extractor
+    # Extracts structured listings from Markdown content.
+    class MarkdownListingCollector
+      LIST_ITEM_PATTERN = /\A(?:[-*+]|\d+\.)\s+/.freeze
+      HEADING_PATTERN = /\A#+\s*/.freeze
+      MIN_ITEMS = 3
+      MIN_TITLE_LENGTH = 2
+
+      def call(markdown:, base_url: nil)
+        return [] if markdown.to_s.strip.empty?
+
+        listings = []
+        current = nil
+        pending_heading = nil
+
+        finalize_current = lambda do
+          next unless current
+
+          if current[:items].length >= MIN_ITEMS
+            listings << { heading: current[:heading], items: current[:items] }
+          end
+
+          current = nil
+        end
+
+        markdown.each_line do |line|
+          stripped = line.strip
+
+          if stripped.empty?
+            finalize_current.call
+            next
+          end
+
+          if heading_line?(stripped)
+            finalize_current.call
+            pending_heading = normalize_heading(stripped)
+            next
+          end
+
+          if list_item_line?(stripped)
+            current ||= { heading: pending_heading, items: [] }
+            pending_heading = nil
+
+            if (item = build_item(stripped, base_url))
+              current[:items] << item
+            end
+          else
+            finalize_current.call
+            pending_heading = nil
+          end
+        end
+
+        finalize_current.call
+
+        listings
+      end
+
+      private
+
+      def heading_line?(line)
+        line.start_with?("#") && line.match?(HEADING_PATTERN)
+      end
+
+      def list_item_line?(line)
+        line.match?(LIST_ITEM_PATTERN)
+      end
+
+      def normalize_heading(line)
+        line.sub(HEADING_PATTERN, "").strip
+      end
+
+      def build_item(line, base_url)
+        content = line.sub(LIST_ITEM_PATTERN, "").strip
+        return if content.empty?
+
+        if (match = content.match(/\A\[([^\]]+)\]\(([^\)]+)\)(.*)\z/))
+          title = match[1].to_s.strip
+          href = match[2].to_s.strip
+          trailing = match[3].to_s.strip
+
+          return if title.length < MIN_TITLE_LENGTH
+
+          url = Utilities.absolute_url(base_url, href) || href
+          item = { title: title, url: url }
+
+          snippet = normalize_snippet(trailing)
+          item[:snippet] = snippet unless snippet.nil? || snippet.empty?
+          item
+        else
+          title = content
+          return if title.length < MIN_TITLE_LENGTH
+
+          { title: title }
+        end
+      end
+
+      def normalize_snippet(text)
+        stripped = text.to_s.sub(/\A[-–—:]\s*/, "").strip
+        stripped.empty? ? nil : stripped
+      end
+    end
+  end
+end
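Because the listing collector works on plain Markdown strings, it can be exercised without any DOM at all. A minimal sketch, with an illustrative Markdown sample and base_url:

```ruby
# Minimal usage sketch; the Markdown sample and base_url are illustrative.
require "coelacanth/extractor/markdown_listing_collector"

markdown = <<~MD
  ## Latest posts

  - [First post](/posts/1) - a short intro
  - [Second post](/posts/2)
  - [Third post](/posts/3)
MD

listings = Coelacanth::Extractor::MarkdownListingCollector.new.call(
  markdown: markdown,
  base_url: "https://example.com"
)
# One group is returned because it has at least MIN_ITEMS (3) entries:
# heading "Latest posts", items with titles, relative hrefs resolved to
# absolute URLs such as "https://example.com/posts/1", and the trailing
# "- a short intro" normalized into the first item's :snippet.
p listings
```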
data/lib/coelacanth/extractor/markdown_renderer.rb
@@ -0,0 +1,132 @@
+# frozen_string_literal: true
+
+module Coelacanth
+  class Extractor
+    # Converts a DOM node into a lightweight Markdown representation.
+    class MarkdownRenderer
+      def self.render(node)
+        new(node).render
+      end
+
+      def initialize(node)
+        @node = node
+      end
+
+      def render
+        return "" unless @node
+
+        lines = traverse(@node)
+        lines.compact.join("\n").gsub(/\n{3,}/, "\n\n")
+      end
+
+      private
+
+      def traverse(node, depth = 0)
+        return if node.nil?
+
+        if document_node?(node)
+          node.children.flat_map { |child| traverse(child, depth) }
+        elsif element_node?(node)
+          render_element(node, depth)
+        elsif text_node?(node)
+          text = node.text.to_s.strip
+          text.empty? ? nil : text
+        end
+      end
+
+      def render_element(node, depth)
+        case node.name
+        when "p"
+          [node.children.flat_map { |child| traverse(child, depth) }.join(" "), ""]
+        when "br"
+          "\n"
+        when "h1", "h2", "h3", "h4", "h5", "h6"
+          level = node.name.delete_prefix("h").to_i
+          heading = "#" * level + " " + inline_children(node, depth)
+          [heading, ""]
+        when "ul"
+          element_children(node).flat_map { |child| render_list_item(child, depth, "-") } + [""]
+        when "ol"
+          element_children(node).each_with_index.flat_map do |child, index|
+            render_list_item(child, depth, "#{index + 1}.")
+          end + [""]
+        when "li"
+          ["- #{inline_children(node, depth)}"]
+        when "a"
+          href = node["href"].to_s.strip
+          text = inline_children(node, depth)
+          href.empty? ? text : "[#{text}](#{href})"
+        when "strong", "b"
+          "**#{inline_children(node, depth)}**"
+        when "em", "i"
+          "*#{inline_children(node, depth)}*"
+        when "blockquote"
+          quote = node.children.flat_map { |child| traverse(child, depth + 1) }.compact
+          quote.map { |line| "> #{line}" } + [""]
+        when "pre", "code"
+          content = node.text
+          ["```", content.rstrip, "```", ""]
+        when "img"
+          alt = node["alt"].to_s.strip
+          src = node["src"].to_s.strip
+          ["", ""]
+        else
+          node.children.flat_map { |child| traverse(child, depth) }
+        end
+      end
+
+      def inline_children(node, depth)
+        node.children.flat_map { |child| traverse(child, depth) }.join(" ").squeeze(" ").strip
+      end
+
+      def render_list_item(node, depth, marker)
+        text = inline_children(node, depth)
+        return [] if text.empty?
+
+        ["#{marker} #{text}"]
+      end
+
+      def document_node?(node)
+        return false unless node
+
+        (defined?(::Oga::XML::Document) && node.is_a?(::Oga::XML::Document)) ||
+          (node.respond_to?(:document?) && node.document?) ||
+          (node.respond_to?(:type) && node.type == :document)
+      end
+
+      def element_node?(node)
+        return false unless node
+
+        if node.respond_to?(:element?)
+          node.element?
+        elsif defined?(::Oga::XML::Element) && node.is_a?(::Oga::XML::Element)
+          true
+        elsif node.respond_to?(:type)
+          node.type == :element
+        else
+          false
+        end
+      end
+
+      def text_node?(node)
+        return false unless node
+
+        if node.respond_to?(:text?)
+          node.text?
+        elsif defined?(::Oga::XML::Text) && node.is_a?(::Oga::XML::Text)
+          true
+        elsif node.respond_to?(:type)
+          node.type == :text
+        else
+          false
+        end
+      end
+
+      def element_children(node)
+        return [] unless node.respond_to?(:children)
+
+        node.children.select { |child| element_node?(child) }
+      end
+    end
+  end
+end
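To get a feel for the renderer's output, parse a small fragment with Oga and pass the node to MarkdownRenderer.render. The sketch below uses an illustrative fragment limited to headings, emphasis, and lists:

```ruby
# Minimal usage sketch; the fragment is illustrative.
require "oga"
require "coelacanth/extractor/markdown_renderer"

node = Oga.parse_html(<<~HTML).at_css("article")
  <article>
    <h2>Release notes</h2>
    <p>Version <strong>0.4.1</strong> adds an <em>extractor</em> pipeline.</p>
    <ul>
      <li>New probes</li>
      <li>robots.txt support</li>
    </ul>
  </article>
HTML

puts Coelacanth::Extractor::MarkdownRenderer.render(node)
# Prints roughly:
#   ## Release notes
#
#   Version **0.4.1** adds an *extractor* pipeline.
#
#   - New probes
#   - robots.txt support
```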
data/lib/coelacanth/extractor/metadata_probe.rb
@@ -0,0 +1,121 @@
+# frozen_string_literal: true
+
+require "json"
+require "oga"
+
+require_relative "utilities"
+
+module Coelacanth
+  class Extractor
+    # Attempts to pull article metadata such as JSON-LD and OpenGraph tags.
+    class MetadataProbe
+      ARTICLE_TYPES = %w[Article NewsArticle BlogPosting ReportageNewsArticle LiveBlogPosting].freeze
+
+      Result = Struct.new(
+        :title,
+        :node,
+        :published_at,
+        :byline,
+        :source_tag,
+        :confidence,
+        keyword_init: true
+      )
+
+      def call(doc:, url: nil)
+        from_jsonld(doc, url) || from_semantic_nodes(doc)
+      end
+
+      private
+
+      def from_jsonld(doc, url)
+        doc.css("script[type='application/ld+json']").each do |script|
+          next if script.text.strip.empty?
+
+          begin
+            payload = JSON.parse(script.text)
+          rescue JSON::ParserError
+            next
+          end
+
+          candidates = payload.is_a?(Array) ? payload : [payload]
+          candidates.each do |candidate|
+            next unless article_type?(candidate)
+
+            body = candidate["articleBody"].to_s.strip
+            next if body.empty?
+
+            node = Oga.parse_html("<article>#{body}</article>").at_css("article")
+            return Result.new(
+              title: candidate["headline"] || candidate["name"],
+              node: node,
+              published_at: Utilities.parse_time(candidate["datePublished"] || candidate["dateCreated"]),
+              byline: extract_author(candidate["author"]),
+              source_tag: :jsonld,
+              confidence: 0.9
+            )
+          end
+        end
+        nil
+      end
+
+      def from_semantic_nodes(doc)
+        node = doc.at_css("main, article, [role='main'], [itemprop='articleBody']")
+        return if node.nil?
+
+        Result.new(
+          title: title_from_meta(doc),
+          node: node,
+          published_at: published_at_from_meta(doc),
+          byline: byline_from_meta(doc),
+          source_tag: :semantic,
+          confidence: 0.82
+        )
+      end
+
+      def article_type?(candidate)
+        type = candidate["@type"]
+        Array(type).any? { |value| ARTICLE_TYPES.include?(value) }
+      end
+
+      def extract_author(author)
+        case author
+        when String
+          author
+        when Hash
+          author["name"]
+        when Array
+          author.map { |item| extract_author(item) }.compact.join(", ")
+        end
+      end
+
+      def title_from_meta(doc)
+        Utilities.meta_content(
+          doc,
+          "meta[property='og:title']",
+          "meta[name='twitter:title']",
+          "meta[name='title']"
+        ) || doc.at_css("title")&.text&.strip
+      end
+
+      def published_at_from_meta(doc)
+        Utilities.parse_time(
+          Utilities.meta_content(
+            doc,
+            "meta[property='article:published_time']",
+            "meta[name='pubdate']",
+            "meta[name='publish_date']",
+            "meta[name='date']"
+          )
+        )
+      end
+
+      def byline_from_meta(doc)
+        Utilities.meta_content(
+          doc,
+          "meta[name='author']",
+          "meta[property='article:author']"
+        )
+      end
+    end
+  end
+end
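When a page ships JSON-LD with an articleBody, the probe answers from that and never reaches the semantic fallback. A minimal sketch with an illustrative document:

```ruby
# Minimal usage sketch; the document and metadata values are illustrative.
require "oga"
require "coelacanth/extractor/metadata_probe"

doc = Oga.parse_html(<<~HTML)
  <html><head>
    <script type="application/ld+json">
      {"@type": "BlogPosting",
       "headline": "Living fossils",
       "datePublished": "2024-05-01",
       "author": {"name": "A. Smith"},
       "articleBody": "Coelacanths were rediscovered off South Africa in 1938."}
    </script>
  </head><body></body></html>
HTML

result = Coelacanth::Extractor::MetadataProbe.new.call(doc: doc)
result.title       # => "Living fossils"
result.byline      # => "A. Smith"
result.source_tag  # => :jsonld
result.confidence  # => 0.9
result.node.text   # => the articleBody, reparsed inside a synthetic <article> node
```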
data/lib/coelacanth/extractor/normalizer.rb
@@ -0,0 +1,47 @@
+# frozen_string_literal: true
+
+require "oga"
+
+require_relative "utilities"
+
+module Coelacanth
+  class Extractor
+    # Sanitizes HTML and prepares an Oga document.
+    class Normalizer
+      REMOVABLE_SELECTORS = %w[style noscript iframe form nav].freeze
+
+      def call(html:, base_url: nil)
+        document = Oga.parse_html(html)
+        remove_noise(document)
+        normalize_images(document, base_url)
+        document
+      end
+
+      private
+
+      def remove_noise(document)
+        REMOVABLE_SELECTORS.each do |selector|
+          document.css(selector).each(&:remove)
+        end
+
+        document.css("script").each do |node|
+          next if node["type"].to_s.strip.casecmp("application/ld+json").zero?
+
+          node.remove
+        end
+      end
+
+      def normalize_images(document, base_url)
+        return unless base_url
+
+        document.css("img").each do |image|
+          src = image["src"].to_s.strip
+          next if src.empty?
+
+          absolute = Utilities.absolute_url(base_url, src)
+          image.set("src", absolute) if absolute
+        end
+      end
+    end
+  end
+end
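A quick before/after check shows what the normalizer strips. A minimal sketch; the markup and base_url are illustrative:

```ruby
# Minimal usage sketch; the markup and base_url are illustrative.
require "coelacanth/extractor/normalizer"

html = <<~HTML
  <html><body>
    <nav><a href="/">Home</a></nav>
    <article><p>Actual article text.</p></article>
    <form action="/subscribe"><input type="email"></form>
  </body></html>
HTML

doc = Coelacanth::Extractor::Normalizer.new.call(
  html: html,
  base_url: "https://example.com/post"
)
puts doc.css("nav").length          # => 0, nav is in REMOVABLE_SELECTORS
puts doc.css("form").length         # => 0, so is form
puts doc.at_css("article p").text   # => "Actual article text."
# base_url is used to rewrite relative <img src> values to absolute URLs;
# non-JSON-LD <script> tags are also dropped.
```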
data/lib/coelacanth/extractor/utilities.rb
@@ -0,0 +1,145 @@
+# frozen_string_literal: true
+
+require "time"
+require "uri"
+
+module Coelacanth
+  class Extractor
+    # Shared helpers for the extractor pipeline.
+    module Utilities
+      PUNCTUATION = %w[。 、 . ・ . , ! ? : ; ; :]
+
+      module_function
+
+      def text_length(node)
+        node&.text&.strip&.length.to_i
+      end
+
+      def link_text_length(node)
+        return 0 unless node
+
+        node.css("a").sum { |anchor| anchor.text.strip.length }
+      end
+
+      def punctuation_density(node)
+        length = text_length(node)
+        return 0.0 if length.zero?
+
+        count = node.text.chars.count { |char| PUNCTUATION.include?(char) }
+        count.to_f / length
+      end
+
+      def link_density(node)
+        length = text_length(node)
+        return 0.0 if length.zero?
+
+        link_text_length(node).to_f / length
+      end
+
+      def depth(node)
+        ancestors(node).length
+      end
+
+      def ancestors(node)
+        return [] unless node
+
+        if node.respond_to?(:ancestors)
+          Array(node.ancestors)
+        else
+          collect_ancestors(node)
+        end
+      end
+
+      def collect_ancestors(node)
+        ancestors = []
+        current = node
+
+        while current.respond_to?(:parent) && (current = current.parent)
+          ancestors << current
+        end
+
+        ancestors
+      end
+
+      def element?(node)
+        return false unless node
+
+        if node.respond_to?(:element?)
+          node.element?
+        elsif defined?(::Oga::XML::Element) && node.is_a?(::Oga::XML::Element)
+          true
+        elsif node.respond_to?(:type)
+          node.type == :element
+        else
+          false
+        end
+      end
+
+      def element_children(node)
+        return [] unless node.respond_to?(:children)
+
+        node.children.select { |child| element?(child) }
+      end
+
+      def sibling_elements(node)
+        parent = node.respond_to?(:parent) ? node.parent : nil
+        return [] unless parent
+
+        element_children(parent)
+      end
+
+      def previous_element(node)
+        siblings = sibling_elements(node)
+        index = siblings.index(node)
+        return unless index && index.positive?
+
+        siblings[index - 1]
+      end
+
+      def next_element(node)
+        siblings = sibling_elements(node)
+        index = siblings.index(node)
+        return unless index
+
+        siblings[index + 1]
+      end
+
+      def class_id_tokens(node)
+        tokens = []
+        tokens.concat(split_tokens(node[:class])) if node[:class]
+        tokens.concat(split_tokens(node[:id])) if node[:id]
+        tokens
+      end
+
+      def split_tokens(value)
+        value.to_s.split(/[\s_-]+/).map(&:downcase)
+      end
+
+      def meta_content(doc, *selectors)
+        selectors.each do |selector|
+          if (node = doc.at_css(selector))
+            return node["content"].to_s.strip unless node["content"].to_s.strip.empty?
+          end
+        end
+        nil
+      end
+
+      def parse_time(value)
+        return if value.nil? || value.empty?
+
+        Time.parse(value)
+      rescue ArgumentError
+        nil
+      end
+
+      def absolute_url(base_url, path)
+        return if path.nil? || path.empty?
+        return path if path =~ /^https?:/i
+
+        URI.join(base_url, path).to_s
+      rescue URI::Error
+        path
+      end
+    end
+  end
+end
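The helpers are exposed as module functions, so they can be called directly. A minimal sketch with illustrative inputs and approximate results in the comments:

```ruby
# Minimal usage sketch; inputs and commented results are illustrative.
require "oga"
require "coelacanth/extractor/utilities"

utils = Coelacanth::Extractor::Utilities
node = Oga.parse_html("<div>Short teaser. <a href='/more'>Read more</a></div>").at_css("div")

utils.text_length(node)                                # => 23 (stripped text length)
utils.link_density(node).round(2)                      # => 0.39 (9 of 23 chars sit in links)
utils.absolute_url("https://example.com/a/", "b.html") # => "https://example.com/a/b.html"
utils.parse_time("2024-05-01T10:00:00Z")               # => 2024-05-01 10:00:00 UTC
utils.parse_time("not a date")                         # => nil (ArgumentError is rescued)
utils.split_tokens("post-body main_content")           # => ["post", "body", "main", "content"]
```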
data/lib/coelacanth/extractor/weak_ml_probe.rb
@@ -0,0 +1,136 @@
+# frozen_string_literal: true
+
+require "oga"
+
+require_relative "utilities"
+
+module Coelacanth
+  class Extractor
+    # Lightweight probabilistic scorer that emulates a learned classifier using heuristics.
+    class WeakMlProbe
+      Result = Struct.new(
+        :title,
+        :node,
+        :published_at,
+        :byline,
+        :source_tag,
+        :confidence,
+        keyword_init: true
+      )
+
+      BLOCK_SELECTOR = "article, main, section, div".freeze
+      TOKEN_WEIGHTS = {
+        "content" => 1.1,
+        "article" => 1.0,
+        "body" => 0.9,
+        "post" => 0.8,
+        "entry" => 0.75,
+        "text" => 0.6,
+        "story" => 0.6,
+        "blog" => 0.5,
+        "share" => -1.0,
+        "nav" => -1.3,
+        "footer" => -1.2,
+        "header" => -1.1,
+        "related" => -0.8
+      }.freeze
+
+      FEATURE_WEIGHTS = {
+        bias: -1.2,
+        text_length: 0.002,
+        link_density: -2.6,
+        punctuation_density: 1.8,
+        depth: -0.12,
+        token_score: 1.6
+      }.freeze
+
+      def call(doc:, url: nil)
+        candidates = doc.css(BLOCK_SELECTOR).map do |node|
+          evaluate(node)
+        end.compact
+
+        return if candidates.empty?
+
+        best = candidates.max_by { |candidate| candidate[:probability] }
+        return if best[:probability] < 0.45
+
+        Result.new(
+          title: title_from_meta(doc),
+          node: best[:node],
+          published_at: published_at_from_meta(doc),
+          byline: byline_from_meta(doc),
+          source_tag: :ml,
+          confidence: best[:probability].clamp(0.0, 0.9)
+        )
+      end
+
+      private
+
+      def evaluate(node)
+        text_length = Utilities.text_length(node)
+        return if text_length < 60
+
+        features = {
+          text_length: text_length,
+          link_density: Utilities.link_density(node),
+          punctuation_density: Utilities.punctuation_density(node),
+          depth: Utilities.depth(node),
+          token_score: token_score(node)
+        }
+
+        score = linear_combination(features)
+        probability = logistic(score)
+
+        { node: node, probability: probability }
+      end
+
+      def token_score(node)
+        Utilities.class_id_tokens(node).sum do |token|
+          TOKEN_WEIGHTS.fetch(token, 0.0)
+        end
+      end
+
+      def linear_combination(features)
+        FEATURE_WEIGHTS[:bias] +
+          FEATURE_WEIGHTS[:text_length] * features[:text_length] +
+          FEATURE_WEIGHTS[:link_density] * features[:link_density] +
+          FEATURE_WEIGHTS[:punctuation_density] * features[:punctuation_density] +
+          FEATURE_WEIGHTS[:depth] * features[:depth] +
+          FEATURE_WEIGHTS[:token_score] * features[:token_score]
+      end
+
+      def logistic(score)
+        1.0 / (1.0 + Math.exp(-score))
+      end
+
+      def title_from_meta(doc)
+        Utilities.meta_content(
+          doc,
+          "meta[property='og:title']",
+          "meta[name='twitter:title']",
+          "meta[name='title']"
+        ) || doc.at_css("title")&.text&.strip
+      end
+
+      def published_at_from_meta(doc)
+        Utilities.parse_time(
+          Utilities.meta_content(
+            doc,
+            "meta[property='article:published_time']",
+            "meta[name='pubdate']",
+            "meta[name='publish_date']",
+            "meta[name='date']"
+          )
+        )
+      end
+
+      def byline_from_meta(doc)
+        Utilities.meta_content(
+          doc,
+          "meta[name='author']",
+          "meta[property='article:author']"
+        )
+      end
+    end
+  end
+end
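To make the scoring concrete, the same linear-plus-logistic arithmetic can be reproduced in isolation with the FEATURE_WEIGHTS above. The feature values in this sketch are hypothetical, chosen only to illustrate how a content-like block clears the 0.45 cutoff:

```ruby
# Worked example of the probe's arithmetic, using FEATURE_WEIGHTS from the diff.
# The feature values below are hypothetical, purely for illustration.
weights = {
  bias: -1.2, text_length: 0.002, link_density: -2.6,
  punctuation_density: 1.8, depth: -0.12, token_score: 1.6
}

features = {
  text_length: 900,           # characters of stripped text in the block
  link_density: 0.05,         # 5% of that text sits inside <a> tags
  punctuation_density: 0.04,  # sentence-style punctuation per character
  depth: 4,                   # nesting depth of the block in the DOM
  token_score: 1.1            # e.g. class="content" matches TOKEN_WEIGHTS["content"]
}

score = weights[:bias] + features.sum { |name, value| weights[name] * value }
probability = 1.0 / (1.0 + Math.exp(-score))

puts score.round(3)        # ≈ 1.822
puts probability.round(3)  # ≈ 0.861, above the 0.45 cutoff and below the 0.9 confidence cap
```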