coelacanth 0.3.9 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,136 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "oga"
4
+
5
+ require_relative "utilities"
6
+
7
+ module Coelacanth
8
+ class Extractor
9
+ # Lightweight probabilistic scorer that emulates a learned classifier using heuristics.
10
+ class WeakMlProbe
11
+ Result = Struct.new(
12
+ :title,
13
+ :node,
14
+ :published_at,
15
+ :byline,
16
+ :source_tag,
17
+ :confidence,
18
+ keyword_init: true
19
+ )
20
+
21
+ BLOCK_SELECTOR = "article, main, section, div".freeze
22
+ TOKEN_WEIGHTS = {
23
+ "content" => 1.1,
24
+ "article" => 1.0,
25
+ "body" => 0.9,
26
+ "post" => 0.8,
27
+ "entry" => 0.75,
28
+ "text" => 0.6,
29
+ "story" => 0.6,
30
+ "blog" => 0.5,
31
+ "share" => -1.0,
32
+ "nav" => -1.3,
33
+ "footer" => -1.2,
34
+ "header" => -1.1,
35
+ "related" => -0.8
36
+ }.freeze
37
+
38
+ FEATURE_WEIGHTS = {
39
+ bias: -1.2,
40
+ text_length: 0.002,
41
+ link_density: -2.6,
42
+ punctuation_density: 1.8,
43
+ depth: -0.12,
44
+ token_score: 1.6
45
+ }.freeze
46
+
47
+ def call(doc:, url: nil)
48
+ candidates = doc.css(BLOCK_SELECTOR).map do |node|
49
+ evaluate(node)
50
+ end.compact
51
+
52
+ return if candidates.empty?
53
+
54
+ best = candidates.max_by { |candidate| candidate[:probability] }
55
+ return if best[:probability] < 0.45
56
+
57
+ Result.new(
58
+ title: title_from_meta(doc),
59
+ node: best[:node],
60
+ published_at: published_at_from_meta(doc),
61
+ byline: byline_from_meta(doc),
62
+ source_tag: :ml,
63
+ confidence: best[:probability].clamp(0.0, 0.9)
64
+ )
65
+ end
66
+
67
+ private
68
+
69
+ def evaluate(node)
70
+ text_length = Utilities.text_length(node)
71
+ return if text_length < 60
72
+
73
+ features = {
74
+ text_length: text_length,
75
+ link_density: Utilities.link_density(node),
76
+ punctuation_density: Utilities.punctuation_density(node),
77
+ depth: Utilities.depth(node),
78
+ token_score: token_score(node)
79
+ }
80
+
81
+ score = linear_combination(features)
82
+ probability = logistic(score)
83
+
84
+ { node: node, probability: probability }
85
+ end
86
+
87
+ def token_score(node)
88
+ Utilities.class_id_tokens(node).sum do |token|
89
+ TOKEN_WEIGHTS.fetch(token, 0.0)
90
+ end
91
+ end
92
+
93
+ def linear_combination(features)
94
+ FEATURE_WEIGHTS[:bias] +
95
+ FEATURE_WEIGHTS[:text_length] * features[:text_length] +
96
+ FEATURE_WEIGHTS[:link_density] * features[:link_density] +
97
+ FEATURE_WEIGHTS[:punctuation_density] * features[:punctuation_density] +
98
+ FEATURE_WEIGHTS[:depth] * features[:depth] +
99
+ FEATURE_WEIGHTS[:token_score] * features[:token_score]
100
+ end
101
+
102
+ def logistic(score)
103
+ 1.0 / (1.0 + Math.exp(-score))
104
+ end
105
+
106
+ def title_from_meta(doc)
107
+ Utilities.meta_content(
108
+ doc,
109
+ "meta[property='og:title']",
110
+ "meta[name='twitter:title']",
111
+ "meta[name='title']"
112
+ ) || doc.at_css("title")&.text&.strip
113
+ end
114
+
115
+ def published_at_from_meta(doc)
116
+ Utilities.parse_time(
117
+ Utilities.meta_content(
118
+ doc,
119
+ "meta[property='article:published_time']",
120
+ "meta[name='pubdate']",
121
+ "meta[name='publish_date']",
122
+ "meta[name='date']"
123
+ )
124
+ )
125
+ end
126
+
127
+ def byline_from_meta(doc)
128
+ Utilities.meta_content(
129
+ doc,
130
+ "meta[name='author']",
131
+ "meta[property='article:author']"
132
+ )
133
+ end
134
+ end
135
+ end
136
+ end
@@ -0,0 +1,67 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "extractor/normalizer"
4
+ require_relative "extractor/metadata_probe"
5
+ require_relative "extractor/heuristic_probe"
6
+ require_relative "extractor/weak_ml_probe"
7
+ require_relative "extractor/fallback_probe"
8
+ require_relative "extractor/markdown_renderer"
9
+ require_relative "extractor/image_collector"
10
+ require_relative "extractor/listing_collector"
11
+
12
+ module Coelacanth
13
+ # High-level API for extracting articles without site-specific selectors.
14
+ class Extractor
15
+ PipelineResult = Struct.new(
16
+ :title,
17
+ :node,
18
+ :published_at,
19
+ :byline,
20
+ :source_tag,
21
+ :confidence,
22
+ keyword_init: true
23
+ )
24
+
25
+ def call(html:, url: nil)
26
+ document = Normalizer.new.call(html: html, base_url: url)
27
+
28
+ [
29
+ [MetadataProbe.new, 0.85],
30
+ [HeuristicProbe.new, 0.75],
31
+ [WeakMlProbe.new, 0.70],
32
+ [FallbackProbe.new, 0.0]
33
+ ].each do |probe, threshold|
34
+ result = probe.call(doc: document, url: url)
35
+ next unless result
36
+
37
+ return build_response(result, document:, url:) if result.confidence.to_f >= threshold
38
+ end
39
+
40
+ build_response(
41
+ PipelineResult.new(node: document, source_tag: :none, confidence: 0.0),
42
+ document: document,
43
+ url: url
44
+ )
45
+ end
46
+
47
+ private
48
+
49
+ def build_response(result, document:, url:)
50
+ node = result.node
51
+ body_markdown = MarkdownRenderer.render(node)
52
+ body_markdown_list = body_markdown.to_s.split(/\n{2,}/).map { |segment| segment.strip }.reject(&:empty?)
53
+
54
+ {
55
+ title: result.title,
56
+ body_markdown: body_markdown,
57
+ body_markdown_list: body_markdown_list,
58
+ images: ImageCollector.new.call(node),
59
+ published_at: result.published_at,
60
+ byline: result.byline,
61
+ source: result.source_tag,
62
+ confidence: result.confidence,
63
+ listings: ListingCollector.new.call(document: document, base_url: url, primary_node: node)
64
+ }
65
+ end
66
+ end
67
+ end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Coelacanth
4
- VERSION = "0.3.9"
4
+ VERSION = "0.4.0"
5
5
  end
data/lib/coelacanth.rb CHANGED
@@ -6,6 +6,7 @@ require_relative "coelacanth/client/base"
6
6
  require_relative "coelacanth/client/ferrum"
7
7
  require_relative "coelacanth/client/screenshot_one"
8
8
  require_relative "coelacanth/dom"
9
+ require_relative "coelacanth/extractor"
9
10
  require_relative "coelacanth/redirect"
10
11
  require_relative "coelacanth/validator"
11
12
  require_relative "coelacanth/version"
@@ -20,9 +21,15 @@ module Coelacanth
20
21
  client_class = config.read("client") == "screenshot_one" ? Client::ScreenshotOne : Client::Ferrum
21
22
  @client = client_class.new(url)
22
23
  regular_url = Redirect.new.resolve_redirect(url)
24
+ response = Net::HTTP.get_response(URI.parse(regular_url))
25
+ html = response.body.to_s
26
+ html = html.force_encoding(Encoding::UTF_8)
27
+ html = html.encode(Encoding::UTF_8, invalid: :replace, undef: :replace)
28
+ extractor_result = Extractor.new.call(html: html, url: regular_url)
23
29
  {
24
- dom: Dom.new.oga(regular_url),
30
+ dom: Dom.new.oga(regular_url, html: html),
25
31
  screenshot: @client.get_screenshot,
32
+ extraction: extractor_result,
26
33
  }
27
34
  end
28
35
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: coelacanth
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.9
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yusuke
@@ -24,7 +24,6 @@ files:
24
24
  - CODE_OF_CONDUCT.md
25
25
  - Dockerfile
26
26
  - Gemfile
27
- - Gemfile.lock
28
27
  - LICENSE.txt
29
28
  - README.md
30
29
  - Rakefile
@@ -36,6 +35,16 @@ files:
36
35
  - lib/coelacanth/client/screenshot_one.rb
37
36
  - lib/coelacanth/configure.rb
38
37
  - lib/coelacanth/dom.rb
38
+ - lib/coelacanth/extractor.rb
39
+ - lib/coelacanth/extractor/fallback_probe.rb
40
+ - lib/coelacanth/extractor/heuristic_probe.rb
41
+ - lib/coelacanth/extractor/image_collector.rb
42
+ - lib/coelacanth/extractor/listing_collector.rb
43
+ - lib/coelacanth/extractor/markdown_renderer.rb
44
+ - lib/coelacanth/extractor/metadata_probe.rb
45
+ - lib/coelacanth/extractor/normalizer.rb
46
+ - lib/coelacanth/extractor/utilities.rb
47
+ - lib/coelacanth/extractor/weak_ml_probe.rb
39
48
  - lib/coelacanth/redirect.rb
40
49
  - lib/coelacanth/validator.rb
41
50
  - lib/coelacanth/version.rb
data/Gemfile.lock DELETED
@@ -1,103 +0,0 @@
1
- PATH
2
- remote: .
3
- specs:
4
- coelacanth (0.3.5)
5
-
6
- GEM
7
- remote: https://rubygems.org/
8
- specs:
9
- addressable (2.8.7)
10
- public_suffix (>= 2.0.2, < 7.0)
11
- ansi (1.5.0)
12
- ast (2.4.3)
13
- base64 (0.2.0)
14
- bigdecimal (3.1.9)
15
- concurrent-ruby (1.3.5)
16
- crack (1.0.0)
17
- bigdecimal
18
- rexml
19
- diff-lcs (1.6.1)
20
- ferrum (0.17.1)
21
- addressable (~> 2.5)
22
- base64 (~> 0.2)
23
- concurrent-ruby (~> 1.1)
24
- webrick (~> 1.7)
25
- websocket-driver (~> 0.7)
26
- hashdiff (1.1.2)
27
- json (2.12.2)
28
- language_server-protocol (3.17.0.5)
29
- lint_roller (1.1.0)
30
- oga (3.4)
31
- ast
32
- ruby-ll (~> 2.1)
33
- parallel (1.27.0)
34
- parser (3.3.8.0)
35
- ast (~> 2.4.1)
36
- racc
37
- prism (1.4.0)
38
- public_suffix (6.0.2)
39
- racc (1.8.1)
40
- rainbow (3.1.1)
41
- rake (13.2.1)
42
- regexp_parser (2.10.0)
43
- rexml (3.4.1)
44
- rspec (3.13.0)
45
- rspec-core (~> 3.13.0)
46
- rspec-expectations (~> 3.13.0)
47
- rspec-mocks (~> 3.13.0)
48
- rspec-core (3.13.3)
49
- rspec-support (~> 3.13.0)
50
- rspec-expectations (3.13.3)
51
- diff-lcs (>= 1.2.0, < 2.0)
52
- rspec-support (~> 3.13.0)
53
- rspec-mocks (3.13.2)
54
- diff-lcs (>= 1.2.0, < 2.0)
55
- rspec-support (~> 3.13.0)
56
- rspec-support (3.13.2)
57
- rubocop (1.75.7)
58
- json (~> 2.3)
59
- language_server-protocol (~> 3.17.0.2)
60
- lint_roller (~> 1.1.0)
61
- parallel (~> 1.10)
62
- parser (>= 3.3.0.2)
63
- rainbow (>= 2.2.2, < 4.0)
64
- regexp_parser (>= 2.9.3, < 3.0)
65
- rubocop-ast (>= 1.44.0, < 2.0)
66
- ruby-progressbar (~> 1.7)
67
- unicode-display_width (>= 2.4.0, < 4.0)
68
- rubocop-ast (1.44.1)
69
- parser (>= 3.3.7.2)
70
- prism (~> 1.4)
71
- ruby-ll (2.1.3)
72
- ansi
73
- ast
74
- ruby-progressbar (1.13.0)
75
- unicode-display_width (3.1.4)
76
- unicode-emoji (~> 4.0, >= 4.0.4)
77
- unicode-emoji (4.0.4)
78
- webmock (3.25.1)
79
- addressable (>= 2.8.0)
80
- crack (>= 0.3.2)
81
- hashdiff (>= 0.4.0, < 2.0.0)
82
- webrick (1.9.1)
83
- websocket-driver (0.7.7)
84
- base64
85
- websocket-extensions (>= 0.1.0)
86
- websocket-extensions (0.1.5)
87
-
88
- PLATFORMS
89
- ruby
90
- x86_64-linux
91
-
92
- DEPENDENCIES
93
- base64 (~> 0.2.0)
94
- coelacanth!
95
- ferrum (~> 0.16)
96
- oga (~> 3.4)
97
- rake (~> 13.2)
98
- rspec (~> 3.0)
99
- rubocop (~> 1.75)
100
- webmock (~> 3.25)
101
-
102
- BUNDLED WITH
103
- 2.6.7