coelacanth 0.3.9 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -5
- data/Gemfile +3 -3
- data/README.md +128 -55
- data/lib/coelacanth/client/ferrum.rb +6 -2
- data/lib/coelacanth/dom.rb +3 -2
- data/lib/coelacanth/extractor/fallback_probe.rb +34 -0
- data/lib/coelacanth/extractor/heuristic_probe.rb +175 -0
- data/lib/coelacanth/extractor/image_collector.rb +19 -0
- data/lib/coelacanth/extractor/listing_collector.rb +270 -0
- data/lib/coelacanth/extractor/markdown_renderer.rb +128 -0
- data/lib/coelacanth/extractor/metadata_probe.rb +121 -0
- data/lib/coelacanth/extractor/normalizer.rb +47 -0
- data/lib/coelacanth/extractor/utilities.rb +145 -0
- data/lib/coelacanth/extractor/weak_ml_probe.rb +136 -0
- data/lib/coelacanth/extractor.rb +67 -0
- data/lib/coelacanth/version.rb +1 -1
- data/lib/coelacanth.rb +8 -1
- metadata +11 -2
- data/Gemfile.lock +0 -103
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "oga"
|
|
4
|
+
|
|
5
|
+
require_relative "utilities"
|
|
6
|
+
|
|
7
|
+
module Coelacanth
  class Extractor
    # Lightweight probabilistic scorer that emulates a learned classifier using heuristics.
    #
    # Every element matched by BLOCK_SELECTOR is scored with a hand-weighted
    # logistic model over five features (text length, link density,
    # punctuation density, depth and a class/id token score). The feature
    # values themselves are computed by the Utilities helpers. The block with
    # the highest probability is reported, provided it reaches MIN_PROBABILITY.
    class WeakMlProbe
      # Value object returned by #call.
      Result = Struct.new(
        :title,
        :node,
        :published_at,
        :byline,
        :source_tag,
        :confidence,
        keyword_init: true
      )

      # CSS selector for the elements considered as content candidates.
      BLOCK_SELECTOR = "article, main, section, div".freeze

      # Per-token contribution to the token score. Tokens come from
      # Utilities.class_id_tokens; tokens not listed here contribute 0.0.
      TOKEN_WEIGHTS = {
        "content" => 1.1,
        "article" => 1.0,
        "body" => 0.9,
        "post" => 0.8,
        "entry" => 0.75,
        "text" => 0.6,
        "story" => 0.6,
        "blog" => 0.5,
        "share" => -1.0,
        "nav" => -1.3,
        "footer" => -1.2,
        "header" => -1.1,
        "related" => -0.8
      }.freeze

      # Coefficients of the linear model that feeds the logistic function.
      FEATURE_WEIGHTS = {
        bias: -1.2,
        text_length: 0.002,
        link_density: -2.6,
        punctuation_density: 1.8,
        depth: -0.12,
        token_score: 1.6
      }.freeze

      # Candidates whose text length is below this value are not scored.
      MIN_TEXT_LENGTH = 60
      # The best candidate is rejected when its probability is below this value.
      MIN_PROBABILITY = 0.45
      # Upper bound applied to the confidence reported in the Result.
      MAX_CONFIDENCE = 0.9

      # Scores every candidate block in +doc+ and returns the most probable one.
      #
      # @param doc [#css, #at_css] parsed document to inspect
      # @param url [String, nil] not used by this method
      # @return [Result, nil] nil when no candidate qualifies or the best
      #   probability is below MIN_PROBABILITY
      def call(doc:, url: nil)
        candidates = doc.css(BLOCK_SELECTOR).filter_map { |node| evaluate(node) }

        # max_by yields nil for an empty list, which covers the "no candidates" case.
        best = candidates.max_by { |candidate| candidate[:probability] }
        return if best.nil? || best[:probability] < MIN_PROBABILITY

        Result.new(
          title: title_from_meta(doc),
          node: best[:node],
          published_at: published_at_from_meta(doc),
          byline: byline_from_meta(doc),
          source_tag: :ml,
          confidence: best[:probability].clamp(0.0, MAX_CONFIDENCE)
        )
      end

      private

      # Builds the feature vector for +node+ and converts it to a probability.
      #
      # @return [Hash, nil] +{ node:, probability: }+, or nil when the node's
      #   text length is below MIN_TEXT_LENGTH
      def evaluate(node)
        text_length = Utilities.text_length(node)
        return if text_length < MIN_TEXT_LENGTH

        features = {
          text_length: text_length,
          link_density: Utilities.link_density(node),
          punctuation_density: Utilities.punctuation_density(node),
          depth: Utilities.depth(node),
          token_score: token_score(node)
        }

        { node: node, probability: logistic(linear_combination(features)) }
      end

      # Sums the TOKEN_WEIGHTS entries for the node's class/id tokens.
      def token_score(node)
        Utilities.class_id_tokens(node).sum do |token|
          TOKEN_WEIGHTS.fetch(token, 0.0)
        end
      end

      # Weighted sum of the features plus the bias term.
      def linear_combination(features)
        FEATURE_WEIGHTS[:bias] +
          FEATURE_WEIGHTS[:text_length] * features[:text_length] +
          FEATURE_WEIGHTS[:link_density] * features[:link_density] +
          FEATURE_WEIGHTS[:punctuation_density] * features[:punctuation_density] +
          FEATURE_WEIGHTS[:depth] * features[:depth] +
          FEATURE_WEIGHTS[:token_score] * features[:token_score]
      end

      # Standard logistic (sigmoid) function.
      def logistic(score)
        1.0 / (1.0 + Math.exp(-score))
      end

      # Title from the listed meta tags, falling back to the stripped <title> text.
      def title_from_meta(doc)
        Utilities.meta_content(
          doc,
          "meta[property='og:title']",
          "meta[name='twitter:title']",
          "meta[name='title']"
        ) || doc.at_css("title")&.text&.strip
      end

      # Publication time taken from the listed meta tags and passed through
      # Utilities.parse_time.
      def published_at_from_meta(doc)
        Utilities.parse_time(
          Utilities.meta_content(
            doc,
            "meta[property='article:published_time']",
            "meta[name='pubdate']",
            "meta[name='publish_date']",
            "meta[name='date']"
          )
        )
      end

      # Author taken from the listed meta tags.
      def byline_from_meta(doc)
        Utilities.meta_content(
          doc,
          "meta[name='author']",
          "meta[property='article:author']"
        )
      end
    end
  end
end
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "extractor/normalizer"
|
|
4
|
+
require_relative "extractor/metadata_probe"
|
|
5
|
+
require_relative "extractor/heuristic_probe"
|
|
6
|
+
require_relative "extractor/weak_ml_probe"
|
|
7
|
+
require_relative "extractor/fallback_probe"
|
|
8
|
+
require_relative "extractor/markdown_renderer"
|
|
9
|
+
require_relative "extractor/image_collector"
|
|
10
|
+
require_relative "extractor/listing_collector"
|
|
11
|
+
|
|
12
|
+
module Coelacanth
  # High-level API for extracting articles without site-specific selectors.
  #
  # The HTML is normalized once and then offered to a sequence of probes. The
  # first probe whose result reaches that probe's confidence threshold wins;
  # if none does, the whole document is returned with source +:none+.
  class Extractor
    # Value object used for the "no probe matched" result; it has the same
    # members the pipeline reads from probe results.
    PipelineResult = Struct.new(
      :title,
      :node,
      :published_at,
      :byline,
      :source_tag,
      :confidence,
      keyword_init: true
    )

    # Runs the extraction pipeline.
    #
    # @param html [String] raw HTML of the page
    # @param url [String, nil] page URL, forwarded to the normalizer, the
    #   probes and the listing collector
    # @return [Hash] with keys :title, :body_markdown, :body_markdown_list,
    #   :images, :published_at, :byline, :source, :confidence and :listings
    def call(html:, url: nil)
      document = Normalizer.new.call(html: html, base_url: url)

      # Probes are instantiated one at a time so that later probes are never
      # built when an earlier one already produced an acceptable result.
      probe_pipeline.each do |probe_class, threshold|
        result = probe_class.new.call(doc: document, url: url)
        next unless result

        return build_response(result, document: document, url: url) if result.confidence.to_f >= threshold
      end

      build_response(
        PipelineResult.new(node: document, source_tag: :none, confidence: 0.0),
        document: document,
        url: url
      )
    end

    private

    # Probe classes in the order they are tried, each paired with the minimum
    # confidence its result must reach to be accepted.
    def probe_pipeline
      [
        [MetadataProbe, 0.85],
        [HeuristicProbe, 0.75],
        [WeakMlProbe, 0.70],
        [FallbackProbe, 0.0]
      ]
    end

    # Assembles the public response hash from a probe result.
    #
    # The rendered markdown is additionally split on blank lines into a list
    # of stripped, non-empty segments.
    def build_response(result, document:, url:)
      node = result.node
      body_markdown = MarkdownRenderer.render(node)
      body_markdown_list = body_markdown.to_s.split(/\n{2,}/).map(&:strip).reject(&:empty?)

      {
        title: result.title,
        body_markdown: body_markdown,
        body_markdown_list: body_markdown_list,
        images: ImageCollector.new.call(node),
        published_at: result.published_at,
        byline: result.byline,
        source: result.source_tag,
        confidence: result.confidence,
        listings: ListingCollector.new.call(document: document, base_url: url, primary_node: node)
      }
    end
  end
end
|
data/lib/coelacanth/version.rb
CHANGED
data/lib/coelacanth.rb
CHANGED
|
@@ -6,6 +6,7 @@ require_relative "coelacanth/client/base"
|
|
|
6
6
|
require_relative "coelacanth/client/ferrum"
|
|
7
7
|
require_relative "coelacanth/client/screenshot_one"
|
|
8
8
|
require_relative "coelacanth/dom"
|
|
9
|
+
require_relative "coelacanth/extractor"
|
|
9
10
|
require_relative "coelacanth/redirect"
|
|
10
11
|
require_relative "coelacanth/validator"
|
|
11
12
|
require_relative "coelacanth/version"
|
|
@@ -20,9 +21,15 @@ module Coelacanth
|
|
|
20
21
|
client_class = config.read("client") == "screenshot_one" ? Client::ScreenshotOne : Client::Ferrum
|
|
21
22
|
@client = client_class.new(url)
|
|
22
23
|
regular_url = Redirect.new.resolve_redirect(url)
|
|
24
|
+
response = Net::HTTP.get_response(URI.parse(regular_url))
|
|
25
|
+
html = response.body.to_s
|
|
26
|
+
html = html.force_encoding(Encoding::UTF_8)
|
|
27
|
+
html = html.encode(Encoding::UTF_8, invalid: :replace, undef: :replace)
|
|
28
|
+
extractor_result = Extractor.new.call(html: html, url: regular_url)
|
|
23
29
|
{
|
|
24
|
-
dom: Dom.new.oga(regular_url),
|
|
30
|
+
dom: Dom.new.oga(regular_url, html: html),
|
|
25
31
|
screenshot: @client.get_screenshot,
|
|
32
|
+
extraction: extractor_result,
|
|
26
33
|
}
|
|
27
34
|
end
|
|
28
35
|
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: coelacanth
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.3.9
|
|
4
|
+
version: 0.4.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Yusuke
|
|
@@ -24,7 +24,6 @@ files:
|
|
|
24
24
|
- CODE_OF_CONDUCT.md
|
|
25
25
|
- Dockerfile
|
|
26
26
|
- Gemfile
|
|
27
|
-
- Gemfile.lock
|
|
28
27
|
- LICENSE.txt
|
|
29
28
|
- README.md
|
|
30
29
|
- Rakefile
|
|
@@ -36,6 +35,16 @@ files:
|
|
|
36
35
|
- lib/coelacanth/client/screenshot_one.rb
|
|
37
36
|
- lib/coelacanth/configure.rb
|
|
38
37
|
- lib/coelacanth/dom.rb
|
|
38
|
+
- lib/coelacanth/extractor.rb
|
|
39
|
+
- lib/coelacanth/extractor/fallback_probe.rb
|
|
40
|
+
- lib/coelacanth/extractor/heuristic_probe.rb
|
|
41
|
+
- lib/coelacanth/extractor/image_collector.rb
|
|
42
|
+
- lib/coelacanth/extractor/listing_collector.rb
|
|
43
|
+
- lib/coelacanth/extractor/markdown_renderer.rb
|
|
44
|
+
- lib/coelacanth/extractor/metadata_probe.rb
|
|
45
|
+
- lib/coelacanth/extractor/normalizer.rb
|
|
46
|
+
- lib/coelacanth/extractor/utilities.rb
|
|
47
|
+
- lib/coelacanth/extractor/weak_ml_probe.rb
|
|
39
48
|
- lib/coelacanth/redirect.rb
|
|
40
49
|
- lib/coelacanth/validator.rb
|
|
41
50
|
- lib/coelacanth/version.rb
|
data/Gemfile.lock
DELETED
|
@@ -1,103 +0,0 @@
|
|
|
1
|
-
PATH
|
|
2
|
-
remote: .
|
|
3
|
-
specs:
|
|
4
|
-
coelacanth (0.3.5)
|
|
5
|
-
|
|
6
|
-
GEM
|
|
7
|
-
remote: https://rubygems.org/
|
|
8
|
-
specs:
|
|
9
|
-
addressable (2.8.7)
|
|
10
|
-
public_suffix (>= 2.0.2, < 7.0)
|
|
11
|
-
ansi (1.5.0)
|
|
12
|
-
ast (2.4.3)
|
|
13
|
-
base64 (0.2.0)
|
|
14
|
-
bigdecimal (3.1.9)
|
|
15
|
-
concurrent-ruby (1.3.5)
|
|
16
|
-
crack (1.0.0)
|
|
17
|
-
bigdecimal
|
|
18
|
-
rexml
|
|
19
|
-
diff-lcs (1.6.1)
|
|
20
|
-
ferrum (0.17.1)
|
|
21
|
-
addressable (~> 2.5)
|
|
22
|
-
base64 (~> 0.2)
|
|
23
|
-
concurrent-ruby (~> 1.1)
|
|
24
|
-
webrick (~> 1.7)
|
|
25
|
-
websocket-driver (~> 0.7)
|
|
26
|
-
hashdiff (1.1.2)
|
|
27
|
-
json (2.12.2)
|
|
28
|
-
language_server-protocol (3.17.0.5)
|
|
29
|
-
lint_roller (1.1.0)
|
|
30
|
-
oga (3.4)
|
|
31
|
-
ast
|
|
32
|
-
ruby-ll (~> 2.1)
|
|
33
|
-
parallel (1.27.0)
|
|
34
|
-
parser (3.3.8.0)
|
|
35
|
-
ast (~> 2.4.1)
|
|
36
|
-
racc
|
|
37
|
-
prism (1.4.0)
|
|
38
|
-
public_suffix (6.0.2)
|
|
39
|
-
racc (1.8.1)
|
|
40
|
-
rainbow (3.1.1)
|
|
41
|
-
rake (13.2.1)
|
|
42
|
-
regexp_parser (2.10.0)
|
|
43
|
-
rexml (3.4.1)
|
|
44
|
-
rspec (3.13.0)
|
|
45
|
-
rspec-core (~> 3.13.0)
|
|
46
|
-
rspec-expectations (~> 3.13.0)
|
|
47
|
-
rspec-mocks (~> 3.13.0)
|
|
48
|
-
rspec-core (3.13.3)
|
|
49
|
-
rspec-support (~> 3.13.0)
|
|
50
|
-
rspec-expectations (3.13.3)
|
|
51
|
-
diff-lcs (>= 1.2.0, < 2.0)
|
|
52
|
-
rspec-support (~> 3.13.0)
|
|
53
|
-
rspec-mocks (3.13.2)
|
|
54
|
-
diff-lcs (>= 1.2.0, < 2.0)
|
|
55
|
-
rspec-support (~> 3.13.0)
|
|
56
|
-
rspec-support (3.13.2)
|
|
57
|
-
rubocop (1.75.7)
|
|
58
|
-
json (~> 2.3)
|
|
59
|
-
language_server-protocol (~> 3.17.0.2)
|
|
60
|
-
lint_roller (~> 1.1.0)
|
|
61
|
-
parallel (~> 1.10)
|
|
62
|
-
parser (>= 3.3.0.2)
|
|
63
|
-
rainbow (>= 2.2.2, < 4.0)
|
|
64
|
-
regexp_parser (>= 2.9.3, < 3.0)
|
|
65
|
-
rubocop-ast (>= 1.44.0, < 2.0)
|
|
66
|
-
ruby-progressbar (~> 1.7)
|
|
67
|
-
unicode-display_width (>= 2.4.0, < 4.0)
|
|
68
|
-
rubocop-ast (1.44.1)
|
|
69
|
-
parser (>= 3.3.7.2)
|
|
70
|
-
prism (~> 1.4)
|
|
71
|
-
ruby-ll (2.1.3)
|
|
72
|
-
ansi
|
|
73
|
-
ast
|
|
74
|
-
ruby-progressbar (1.13.0)
|
|
75
|
-
unicode-display_width (3.1.4)
|
|
76
|
-
unicode-emoji (~> 4.0, >= 4.0.4)
|
|
77
|
-
unicode-emoji (4.0.4)
|
|
78
|
-
webmock (3.25.1)
|
|
79
|
-
addressable (>= 2.8.0)
|
|
80
|
-
crack (>= 0.3.2)
|
|
81
|
-
hashdiff (>= 0.4.0, < 2.0.0)
|
|
82
|
-
webrick (1.9.1)
|
|
83
|
-
websocket-driver (0.7.7)
|
|
84
|
-
base64
|
|
85
|
-
websocket-extensions (>= 0.1.0)
|
|
86
|
-
websocket-extensions (0.1.5)
|
|
87
|
-
|
|
88
|
-
PLATFORMS
|
|
89
|
-
ruby
|
|
90
|
-
x86_64-linux
|
|
91
|
-
|
|
92
|
-
DEPENDENCIES
|
|
93
|
-
base64 (~> 0.2.0)
|
|
94
|
-
coelacanth!
|
|
95
|
-
ferrum (~> 0.16)
|
|
96
|
-
oga (~> 3.4)
|
|
97
|
-
rake (~> 13.2)
|
|
98
|
-
rspec (~> 3.0)
|
|
99
|
-
rubocop (~> 1.75)
|
|
100
|
-
webmock (~> 3.25)
|
|
101
|
-
|
|
102
|
-
BUNDLED WITH
|
|
103
|
-
2.6.7
|