coelacanth 0.3.10 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,67 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "extractor/normalizer"
4
+ require_relative "extractor/metadata_probe"
5
+ require_relative "extractor/heuristic_probe"
6
+ require_relative "extractor/weak_ml_probe"
7
+ require_relative "extractor/fallback_probe"
8
+ require_relative "extractor/markdown_renderer"
9
+ require_relative "extractor/image_collector"
10
+ require_relative "extractor/markdown_listing_collector"
11
+
12
module Coelacanth
  # Article extraction that needs no site-specific selectors: the HTML is
  # normalized once and then handed to a fixed sequence of probes; the first
  # probe whose result is confident enough wins.
  class Extractor
    # Fields read from a probe result when the response is assembled. #call
    # also builds one of these itself when no probe result is accepted.
    PipelineResult = Struct.new(
      :title,
      :node,
      :published_at,
      :byline,
      :source_tag,
      :confidence,
      keyword_init: true
    )

    # Runs the probe pipeline over +html+.
    #
    # @param html [String] raw markup, passed to the normalizer
    # @param url [String, nil] forwarded to the normalizer (as base_url), to
    #   every probe and to the listing collector
    # @return [Hash] the response hash produced by #build_response
    def call(html:, url: nil)
      document = Normalizer.new.call(html: html, base_url: url)

      # Each entry pairs a probe with the minimum confidence its result needs
      # in order to be accepted. All probes are instantiated up front.
      stages = [
        [MetadataProbe.new, 0.85],
        [HeuristicProbe.new, 0.75],
        [WeakMlProbe.new, 0.70],
        [FallbackProbe.new, 0.0]
      ]

      stages.each do |probe, minimum|
        candidate = probe.call(doc: document, url: url)
        next unless candidate && candidate.confidence.to_f >= minimum

        return build_response(candidate, document: document, url: url)
      end

      # Nothing was accepted: respond with the whole document, tagged :none.
      placeholder = PipelineResult.new(node: document, source_tag: :none, confidence: 0.0)
      build_response(placeholder, document: document, url: url)
    end

    private

    # Turns an accepted result into the public response hash.
    #
    # The node is rendered to Markdown once; the rendered text is also split
    # on runs of two or more newlines into a list of stripped, non-empty
    # segments and passed to the listing collector.
    def build_response(result, document:, url:)
      content = result.node
      markdown = MarkdownRenderer.render(content)
      segments = markdown.to_s.split(/\n{2,}/).map(&:strip).reject(&:empty?)

      {
        title: result.title,
        body_markdown: markdown,
        body_markdown_list: segments,
        images: ImageCollector.new.call(content),
        published_at: result.published_at,
        byline: result.byline,
        source: result.source_tag,
        confidence: result.confidence,
        listings: MarkdownListingCollector.new.call(markdown: markdown, base_url: url)
      }
    end
  end
end
@@ -0,0 +1,72 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "net/http"
4
+ require "open-uri"
5
+ require "timeout"
6
+
7
+ require_relative "robots"
8
+
9
module Coelacanth
  # Both error classes raised by Coelacanth::HTTP are declared here so this
  # file is usable when it is required on its own. The guards pass
  # inherit = false so that only a constant defined directly on Coelacanth
  # counts; an unrelated top-level ::TimeoutError must not suppress ours.
  class TimeoutError < StandardError; end unless const_defined?(:TimeoutError, false)
  class RobotsDisallowedError < StandardError; end unless const_defined?(:RobotsDisallowedError, false)

  # GET helper built on Net::HTTP that checks robots.txt first, applies
  # open/read timeouts and retries a bounded number of times on timeout.
  module HTTP
    # Timeouts handed to Net::HTTP.start (seconds).
    DEFAULT_OPEN_TIMEOUT = 5
    DEFAULT_READ_TIMEOUT = 10
    # Number of additional attempts made after the first one times out.
    MAX_RETRIES = 2

    # Object passed as the +io+ argument of OpenURI::HTTPError by
    # #raise_http_error; +string+/+to_s+ expose the response body as a String.
    ErrorResponse = Struct.new(:status, :meta, :base_uri, :body, keyword_init: true) do
      def string
        body.to_s
      end

      alias to_s string
    end

    module_function

    # Performs a GET after verifying that robots.txt permits it.
    #
    # @param uri [URI::Generic] target of the request
    # @param open_timeout [Numeric] connection timeout
    # @param read_timeout [Numeric] read timeout
    # @param retries [Integer] extra attempts allowed after a timeout
    # @return [Net::HTTPResponse]
    # @raise [Coelacanth::RobotsDisallowedError] when robots.txt forbids +uri+
    # @raise [Coelacanth::TimeoutError] when every attempt timed out
    def get_response(uri, open_timeout: DEFAULT_OPEN_TIMEOUT, read_timeout: DEFAULT_READ_TIMEOUT, retries: MAX_RETRIES)
      ensure_allowed!(uri)
      raw_get_response(uri, open_timeout: open_timeout, read_timeout: read_timeout, retries: retries)
    end

    # Performs a GET without consulting robots.txt.
    #
    # The request is attempted at most <tt>retries + 1</tt> times; only
    # timeouts are retried, any other exception propagates immediately.
    #
    # NOTE(review): no User-Agent header is set on the request, while
    # #ensure_allowed! evaluates robots.txt for Coelacanth::Robots.user_agent.
    # Confirm whether the header is meant to carry that token.
    #
    # @return [Net::HTTPResponse]
    # @raise [Coelacanth::TimeoutError] when every attempt timed out
    def raw_get_response(uri, open_timeout: DEFAULT_OPEN_TIMEOUT, read_timeout: DEFAULT_READ_TIMEOUT, retries: MAX_RETRIES)
      attempts = 0
      begin
        attempts += 1
        request = Net::HTTP::Get.new(uri)
        Net::HTTP.start(
          uri.host,
          uri.port,
          use_ssl: uri.scheme == "https",
          open_timeout: open_timeout,
          read_timeout: read_timeout
        ) { |http| http.request(request) }
      rescue Net::OpenTimeout, Net::ReadTimeout, Timeout::Error => e
        retry if attempts <= retries

        raise Coelacanth::TimeoutError, "GET #{uri} timed out after #{attempts} attempts: #{e.message}"
      end
    end

    # Raises unless Coelacanth::Robots reports +uri+ as allowed.
    #
    # @raise [Coelacanth::RobotsDisallowedError]
    def ensure_allowed!(uri)
      return if Coelacanth::Robots.allowed?(uri)

      raise Coelacanth::RobotsDisallowedError,
            "Access to #{uri} is disallowed by robots.txt for user-agent '#{Coelacanth::Robots.user_agent}'"
    end

    # Wraps +response+ in an OpenURI::HTTPError and raises it. The error's io
    # is an ErrorResponse carrying the status pair, headers, URI and body.
    #
    # @raise [OpenURI::HTTPError] always
    def raise_http_error(uri, response)
      message = format("%s %s for GET %s", response.code, response.message, uri)
      io = ErrorResponse.new(
        status: [response.code, response.message],
        meta: response.each_header.to_h,
        base_uri: uri,
        body: response.body
      )

      raise OpenURI::HTTPError.new(message, io)
    end
  end
end
@@ -2,6 +2,7 @@
2
2
 
3
3
  require "ferrum"
4
4
  require "oga"
5
+ require_relative "http"
5
6
 
6
7
  module Coelacanth
7
8
  # Coelacanth::Redirect
@@ -11,11 +12,15 @@ module Coelacanth
11
12
  raise Coelacanth::DeepRedirectError, "Too many redirect" if limit.zero?
12
13
  raise Coelacanth::RedirectError, "Url or location is nil" if @url.nil?
13
14
 
14
- response = Net::HTTP.get_response(URI.parse(@url))
15
+ response = Coelacanth::HTTP.get_response(URI.parse(@url))
15
16
  @status_code = response.code
16
17
  @origin_response = response
17
18
 
18
19
  handle_response(@origin_response, limit)
20
+ rescue Coelacanth::TimeoutError
21
+ @status_code = nil
22
+ @origin_response = nil
23
+ @url
19
24
  end
20
25
 
21
26
  private
@@ -0,0 +1,150 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "uri"
4
+
5
module Coelacanth
  # Fetches, caches, parses and evaluates robots.txt rules.
  #
  # Rules are cached per scheme/host/port for the lifetime of the process
  # (see .clear_cache!). Any failure to obtain robots.txt is treated as
  # "no rules", which makes every path allowed.
  module Robots
    # User-agent used when COELACANTH_HTTP_USER_AGENT is not set.
    DEFAULT_USER_AGENT = "CoelacanthBot"
    # One Allow/Disallow line: +type+ is :allow or :disallow, +pattern+ the
    # path pattern with a leading slash, +regex+ its compiled form and
    # +length+ the pattern length used to pick the most specific rule.
    RULE_STRUCT = Struct.new(:type, :pattern, :regex, :length, keyword_init: true)

    module_function

    # Tells whether +uri+ may be fetched by +user_agent+.
    #
    # The group whose name equals the normalized agent is used; when it is
    # missing or empty the "*" group is used instead.
    #
    # @param uri [URI::Generic]
    # @param user_agent [String] defaults to .user_agent
    # @return [Boolean]
    def allowed?(uri, user_agent: user_agent())
      rules = rules_for(uri)
      return true if rules.empty?

      # fetch rather than []: the parsed hash has a default proc, and a plain
      # lookup would insert an empty group into the cached rules every time.
      agent_rules = rules.fetch(normalize_agent(user_agent), nil)
      agent_rules = rules.fetch("*", nil) if agent_rules.nil? || agent_rules.empty?

      return true if agent_rules.nil? || agent_rules.empty?

      evaluate(agent_rules, normalize_path(uri))
    end

    # @return [String] COELACANTH_HTTP_USER_AGENT or DEFAULT_USER_AGENT
    def user_agent
      ENV.fetch("COELACANTH_HTTP_USER_AGENT", DEFAULT_USER_AGENT)
    end

    # Returns the cached rules for the origin of +uri+, fetching on first use.
    def rules_for(uri)
      robots_cache[cache_key(uri)] ||= fetch_rules(uri)
    end

    # Empties the rules cache.
    def clear_cache!
      robots_cache.clear
    end

    # @return [Hash{String => Hash}] origin key => parsed rules
    def robots_cache
      @robots_cache ||= {}
    end

    # Downloads and parses robots.txt for the origin of +uri+.
    #
    # Best effort by design: a non-success response or any StandardError
    # (timeouts included) yields an empty rule set.
    def fetch_rules(uri)
      response = Coelacanth::HTTP.raw_get_response(robots_uri_for(uri))
      return {} unless response.is_a?(Net::HTTPSuccess)

      parse_robots(response.body.to_s)
    rescue StandardError
      {}
    end

    # Builds the /robots.txt URI for the origin of +uri+, omitting the port
    # when it is the scheme's default.
    def robots_uri_for(uri)
      klass = uri.scheme == "https" ? URI::HTTPS : URI::HTTP
      port = uri.port
      port = nil if port == default_port_for(uri.scheme)

      klass.build(host: uri.host, path: "/robots.txt", port: port)
    end

    # Parses a robots.txt body into a hash of normalized agent name => rules.
    #
    # Consecutive User-agent lines form one group; a User-agent line that
    # follows any other directive starts a new group, and so does a blank
    # line. Comment-only lines are skipped and do NOT end the current group.
    # Allow/Disallow lines seen with no current group are filed under "*";
    # Allow/Disallow lines with an empty value are ignored.
    #
    # @param body [String]
    # @return [Hash{String => Array<RULE_STRUCT>}]
    def parse_robots(body)
      rules = Hash.new { |hash, key| hash[key] = [] }
      current_agents = []
      last_directive = nil

      body.each_line do |line|
        # A comment-only line would look blank once its comment is stripped
        # and would wrongly reset the group, so skip it before sanitizing.
        next if line.lstrip.start_with?("#")

        sanitized = sanitize_line(line)
        if sanitized.empty?
          current_agents = []
          last_directive = nil
          next
        end

        field, value = sanitized.split(":", 2)
        next if value.nil?

        field = field.strip.downcase
        value = value.strip

        case field
        when "user-agent"
          current_agents = [] unless last_directive == :user_agent
          agent = normalize_agent(value)
          current_agents << agent unless current_agents.include?(agent)
          last_directive = :user_agent
        when "allow", "disallow"
          last_directive = field.to_sym
          next if value.empty?

          current_agents = ["*"] if current_agents.empty?
          rule = build_rule(type: last_directive, value: value)
          current_agents.each { |name| rules[name] << rule }
        else
          # Unknown directives still count as "not a User-agent line".
          last_directive = field.to_sym
        end
      end

      rules
    end

    # Drops everything from the first "#" on and trims whitespace.
    def sanitize_line(line)
      line.split("#", 2).first.to_s.strip
    end

    # Compiles one rule. The value gets a leading slash if it lacks one; in
    # the regex "*" matches any run of characters and "$" anchors the end of
    # the path, and the match is anchored at the start of the path.
    def build_rule(type:, value:)
      pattern = value.start_with?("/") ? value : "/#{value}"
      # Hash replacements are inserted literally, so "\\z" needs no care
      # about backslash handling in gsub replacement strings.
      escaped = Regexp.escape(pattern).gsub(/\\[*$]/, "\\*" => ".*", "\\$" => "\\z")
      regex = Regexp.new("\\A#{escaped}")
      RULE_STRUCT.new(type: type, pattern: pattern, regex: regex, length: pattern.length)
    end

    # Applies +rules+ to +path+: among matching rules the longest pattern
    # wins, and an Allow wins a tie with a Disallow. No match means allowed.
    def evaluate(rules, path)
      matches = rules.select { |rule| rule.regex.match?(path) }
      return true if matches.empty?

      longest_allow = matches.select { |rule| rule.type == :allow }.max_by(&:length)
      longest_disallow = matches.select { |rule| rule.type == :disallow }.max_by(&:length)

      return true if longest_disallow.nil?
      return true if longest_allow && longest_allow.length >= longest_disallow.length

      false
    end

    # Path (defaulting to "/") plus "?query" when a query is present.
    def normalize_path(uri)
      path = uri.path
      path = "/" if path.nil? || path.empty?
      query = uri.query
      return path if query.nil? || query.empty?

      "#{path}?#{query}"
    end

    # Agent names are compared trimmed and lower-cased.
    def normalize_agent(agent)
      agent.to_s.strip.downcase
    end

    # Cache key "scheme://host[:port]"; the port appears only when it is not
    # the scheme's default.
    def cache_key(uri)
      port = uri.port
      default_port = default_port_for(uri.scheme)
      port_part = port && port != default_port ? ":#{port}" : ""
      "#{uri.scheme}://#{uri.host}#{port_part}"
    end

    # Default port for "https"; every other scheme gets the HTTP default.
    def default_port_for(scheme)
      scheme == "https" ? URI::HTTPS.default_port : URI::HTTP.default_port
    end
  end
end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Coelacanth
4
- VERSION = "0.3.10"
4
+ VERSION = "0.4.1"
5
5
  end
data/lib/coelacanth.rb CHANGED
@@ -6,6 +6,8 @@ require_relative "coelacanth/client/base"
6
6
  require_relative "coelacanth/client/ferrum"
7
7
  require_relative "coelacanth/client/screenshot_one"
8
8
  require_relative "coelacanth/dom"
9
+ require_relative "coelacanth/extractor"
10
+ require_relative "coelacanth/http"
9
11
  require_relative "coelacanth/redirect"
10
12
  require_relative "coelacanth/validator"
11
13
  require_relative "coelacanth/version"
@@ -15,14 +17,27 @@ module Coelacanth
15
17
  class Error < StandardError; end
16
18
  class RedirectError < StandardError; end
17
19
  class DeepRedirectError < StandardError; end
20
+ class TimeoutError < StandardError; end
21
+ class RobotsDisallowedError < StandardError; end
18
22
 
19
23
  def self.analyze(url)
20
24
  client_class = config.read("client") == "screenshot_one" ? Client::ScreenshotOne : Client::Ferrum
21
25
  @client = client_class.new(url)
22
26
  regular_url = Redirect.new.resolve_redirect(url)
27
+ response = begin
28
+ Coelacanth::HTTP.get_response(URI.parse(regular_url))
29
+ rescue Coelacanth::TimeoutError
30
+ nil
31
+ end
32
+ html = response&.body.to_s
33
+ html = html.dup
34
+ html = html.force_encoding(Encoding::UTF_8)
35
+ html = html.encode(Encoding::UTF_8, invalid: :replace, undef: :replace)
36
+ extractor_result = Extractor.new.call(html: html, url: regular_url)
23
37
  {
24
- dom: Dom.new.oga(regular_url),
38
+ dom: Dom.new.oga(regular_url, html: html),
25
39
  screenshot: @client.get_screenshot,
40
+ extraction: extractor_result,
26
41
  }
27
42
  end
28
43
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: coelacanth
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.10
4
+ version: 0.4.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yusuke
@@ -18,13 +18,13 @@ executables: []
18
18
  extensions: []
19
19
  extra_rdoc_files: []
20
20
  files:
21
+ - ".env.example"
21
22
  - ".rspec"
22
23
  - ".rubocop.yml"
23
24
  - CHANGELOG.md
24
25
  - CODE_OF_CONDUCT.md
25
26
  - Dockerfile
26
27
  - Gemfile
27
- - Gemfile.lock
28
28
  - LICENSE.txt
29
29
  - README.md
30
30
  - Rakefile
@@ -36,7 +36,19 @@ files:
36
36
  - lib/coelacanth/client/screenshot_one.rb
37
37
  - lib/coelacanth/configure.rb
38
38
  - lib/coelacanth/dom.rb
39
+ - lib/coelacanth/extractor.rb
40
+ - lib/coelacanth/extractor/fallback_probe.rb
41
+ - lib/coelacanth/extractor/heuristic_probe.rb
42
+ - lib/coelacanth/extractor/image_collector.rb
43
+ - lib/coelacanth/extractor/markdown_listing_collector.rb
44
+ - lib/coelacanth/extractor/markdown_renderer.rb
45
+ - lib/coelacanth/extractor/metadata_probe.rb
46
+ - lib/coelacanth/extractor/normalizer.rb
47
+ - lib/coelacanth/extractor/utilities.rb
48
+ - lib/coelacanth/extractor/weak_ml_probe.rb
49
+ - lib/coelacanth/http.rb
39
50
  - lib/coelacanth/redirect.rb
51
+ - lib/coelacanth/robots.rb
40
52
  - lib/coelacanth/validator.rb
41
53
  - lib/coelacanth/version.rb
42
54
  homepage: https://github.com/slidict/coelacanth
data/Gemfile.lock DELETED
@@ -1,103 +0,0 @@
1
- PATH
2
- remote: .
3
- specs:
4
- coelacanth (0.3.5)
5
-
6
- GEM
7
- remote: https://rubygems.org/
8
- specs:
9
- addressable (2.8.7)
10
- public_suffix (>= 2.0.2, < 7.0)
11
- ansi (1.5.0)
12
- ast (2.4.3)
13
- base64 (0.3.0)
14
- bigdecimal (3.1.9)
15
- concurrent-ruby (1.3.5)
16
- crack (1.0.0)
17
- bigdecimal
18
- rexml
19
- diff-lcs (1.6.1)
20
- ferrum (0.17.1)
21
- addressable (~> 2.5)
22
- base64 (~> 0.2)
23
- concurrent-ruby (~> 1.1)
24
- webrick (~> 1.7)
25
- websocket-driver (~> 0.7)
26
- hashdiff (1.1.2)
27
- json (2.12.2)
28
- language_server-protocol (3.17.0.5)
29
- lint_roller (1.1.0)
30
- oga (3.4)
31
- ast
32
- ruby-ll (~> 2.1)
33
- parallel (1.27.0)
34
- parser (3.3.8.0)
35
- ast (~> 2.4.1)
36
- racc
37
- prism (1.4.0)
38
- public_suffix (6.0.2)
39
- racc (1.8.1)
40
- rainbow (3.1.1)
41
- rake (13.3.0)
42
- regexp_parser (2.10.0)
43
- rexml (3.4.1)
44
- rspec (3.13.0)
45
- rspec-core (~> 3.13.0)
46
- rspec-expectations (~> 3.13.0)
47
- rspec-mocks (~> 3.13.0)
48
- rspec-core (3.13.3)
49
- rspec-support (~> 3.13.0)
50
- rspec-expectations (3.13.3)
51
- diff-lcs (>= 1.2.0, < 2.0)
52
- rspec-support (~> 3.13.0)
53
- rspec-mocks (3.13.2)
54
- diff-lcs (>= 1.2.0, < 2.0)
55
- rspec-support (~> 3.13.0)
56
- rspec-support (3.13.2)
57
- rubocop (1.76.1)
58
- json (~> 2.3)
59
- language_server-protocol (~> 3.17.0.2)
60
- lint_roller (~> 1.1.0)
61
- parallel (~> 1.10)
62
- parser (>= 3.3.0.2)
63
- rainbow (>= 2.2.2, < 4.0)
64
- regexp_parser (>= 2.9.3, < 3.0)
65
- rubocop-ast (>= 1.45.0, < 2.0)
66
- ruby-progressbar (~> 1.7)
67
- unicode-display_width (>= 2.4.0, < 4.0)
68
- rubocop-ast (1.45.1)
69
- parser (>= 3.3.7.2)
70
- prism (~> 1.4)
71
- ruby-ll (2.1.3)
72
- ansi
73
- ast
74
- ruby-progressbar (1.13.0)
75
- unicode-display_width (3.1.4)
76
- unicode-emoji (~> 4.0, >= 4.0.4)
77
- unicode-emoji (4.0.4)
78
- webmock (3.25.1)
79
- addressable (>= 2.8.0)
80
- crack (>= 0.3.2)
81
- hashdiff (>= 0.4.0, < 2.0.0)
82
- webrick (1.9.1)
83
- websocket-driver (0.7.7)
84
- base64
85
- websocket-extensions (>= 0.1.0)
86
- websocket-extensions (0.1.5)
87
-
88
- PLATFORMS
89
- ruby
90
- x86_64-linux
91
-
92
- DEPENDENCIES
93
- base64 (~> 0.3.0)
94
- coelacanth!
95
- ferrum (~> 0.16)
96
- oga (~> 3.4)
97
- rake (~> 13.3)
98
- rspec (~> 3.0)
99
- rubocop (~> 1.76)
100
- webmock (~> 3.25)
101
-
102
- BUNDLED WITH
103
- 2.6.7