coelacanth 0.4.0 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.env.example +5 -0
- data/CHANGELOG.md +3 -14
- data/README.md +26 -5
- data/compose.yml +5 -2
- data/config/coelacanth.yml +5 -3
- data/lib/coelacanth/client/ferrum.rb +12 -2
- data/lib/coelacanth/client/screenshot_one.rb +31 -9
- data/lib/coelacanth/configure.rb +6 -1
- data/lib/coelacanth/dom.rb +7 -2
- data/lib/coelacanth/extractor/markdown_listing_collector.rb +108 -0
- data/lib/coelacanth/extractor/markdown_renderer.rb +4 -0
- data/lib/coelacanth/extractor.rb +2 -2
- data/lib/coelacanth/http.rb +72 -0
- data/lib/coelacanth/redirect.rb +6 -1
- data/lib/coelacanth/robots.rb +150 -0
- data/lib/coelacanth/version.rb +1 -1
- data/lib/coelacanth.rb +10 -2
- metadata +5 -2
- data/lib/coelacanth/extractor/listing_collector.rb +0 -270
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 9097f36247caad8f0764313b306398e3707290eca25d5cfffc05f61e97784884
|
|
4
|
+
data.tar.gz: 19e093800bcb9ae663e0f36a1ad18d15472ff2e76f34f6ddf65c396440aea2e0
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 32c30afcf5316814e42f2ed2f5b94efe49daecdd78b7acdf1d10d64ed2c8253a86a0a0be3b66ed1824cfbefb32c5654f3abf0fb08c41411d736281f480375400
|
|
7
|
+
data.tar.gz: 4e6683d596d50535dd13df26d2c6c4339fe543013d37250c8e55f7605dc441aa3ed888fcf46f9550540583e3234bd52667cd28d1b9e098a18e3eb5faae354bed
|
data/.env.example
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
# Copy this file to .env and fill in the values to configure Coelacanth.
|
|
2
|
+
# Optional: only set when the remote browser requires authentication.
|
|
3
|
+
COELACANTH_REMOTE_CLIENT_AUTHORIZATION=
|
|
4
|
+
COELACANTH_REMOTE_CLIENT_USER_AGENT="Coelacanth Chrome Extension"
|
|
5
|
+
COELACANTH_SCREENSHOT_ONE_API_KEY="your_screenshot_one_api_key_here"
|
data/CHANGELOG.md
CHANGED
|
@@ -4,19 +4,8 @@ All notable changes to this project will be documented in this file.
|
|
|
4
4
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
5
5
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
6
6
|
|
|
7
|
-
## [v0.4.
|
|
8
|
-
### :sparkles: New Features
|
|
9
|
-
- [`d13c21e`](https://github.com/slidict/coelacanth/commit/d13c21e106e7155ba7d545570341da37caa0b4b3) - support structural listings for digital.go.jp *(commit by [@yubele](https://github.com/yubele))*
|
|
10
|
-
- [`19fb4d7`](https://github.com/slidict/coelacanth/commit/19fb4d7a29350bb063c6bb76a89c46d7d001fc3e) - expose body markdown blocks *(commit by [@yubele](https://github.com/yubele))*
|
|
11
|
-
|
|
7
|
+
## [v0.4.1] - 2025-11-03
|
|
12
8
|
### :wrench: Chores
|
|
13
|
-
- [`
|
|
14
|
-
- [`388658f`](https://github.com/slidict/coelacanth/commit/388658fcaa94f09123c67ab62dd69d207d03ee7a) - **deps**: Bump rubocop from 1.77.0 to 1.81.6 *(commit by [@dependabot[bot]](https://github.com/apps/dependabot))*
|
|
15
|
-
- [`be62dbb`](https://github.com/slidict/coelacanth/commit/be62dbbf77fc9a324f29d684df167cbedcea7b8f) - translate listing samples to English *(commit by [@yubele](https://github.com/yubele))*
|
|
16
|
-
- [`41bf358`](https://github.com/slidict/coelacanth/commit/41bf358eb955fb75b467719750a19f9d07a2cba1) - require ruby 3.4 or newer *(commit by [@yubele](https://github.com/yubele))*
|
|
17
|
-
- [`cff304d`](https://github.com/slidict/coelacanth/commit/cff304d255952ad6be18e796c6649b72272bc23e) - **deps-dev**: Bump rexml in the bundler group across 1 directory *(commit by [@dependabot[bot]](https://github.com/apps/dependabot))*
|
|
18
|
-
- [`962c267`](https://github.com/slidict/coelacanth/commit/962c267ab8f301e1375b67fc32a3cbc54effc232) - Bump version to 0.4.0 *(commit by [@yubele](https://github.com/yubele))*
|
|
19
|
-
- [`bf3633c`](https://github.com/slidict/coelacanth/commit/bf3633c82a61684238485606300bfa0891d247eb) - Delete Gemfile.lock *(commit by [@yubele](https://github.com/yubele))*
|
|
20
|
-
- [`62f2a0e`](https://github.com/slidict/coelacanth/commit/62f2a0ecb80fb5d77f704e6c48d1d8f4c7670818) - Add Gemfile.lock to .gitignore *(commit by [@yubele](https://github.com/yubele))*
|
|
9
|
+
- [`41e89b7`](https://github.com/slidict/coelacanth/commit/41e89b799573f6cfaf0a12e7abc5c260f1905aec) - Bump version from 0.4.0 to 0.4.1 *(commit by [@yubele](https://github.com/yubele))*
|
|
21
10
|
|
|
22
|
-
[v0.4.
|
|
11
|
+
[v0.4.1]: https://github.com/slidict/coelacanth/compare/v0.4.0...v0.4.1
|
data/README.md
CHANGED
|
@@ -102,8 +102,8 @@ Coelacanth ships with a multi-stage extractor that tries increasingly involved p
|
|
|
102
102
|
4. **FallbackProbe** acts as a safety net by following AMP/print links or summarizing the whole document when the previous
|
|
103
103
|
probes fail.
|
|
104
104
|
|
|
105
|
-
|
|
106
|
-
|
|
105
|
+
Markdown-based listings are generated from the extracted body so lists such as "Latest news" blocks can be stored alongside the
|
|
106
|
+
article without scanning the rest of the page layout.
|
|
107
107
|
|
|
108
108
|
## Configuration
|
|
109
109
|
Runtime configuration is stored in `config/coelacanth.yml`. Environments inherit from the `development` section by default.
|
|
@@ -115,10 +115,12 @@ development:
|
|
|
115
115
|
ws_url: "ws://chrome:3000/chrome"
|
|
116
116
|
timeout: 10
|
|
117
117
|
headers:
|
|
118
|
-
|
|
119
|
-
|
|
118
|
+
<% if (auth = ENV["COELACANTH_REMOTE_CLIENT_AUTHORIZATION"]).to_s.strip != "" %>
|
|
119
|
+
Authorization: "<%= auth %>"
|
|
120
|
+
<% end %>
|
|
121
|
+
User-Agent: "<%= ENV.fetch("COELACANTH_REMOTE_CLIENT_USER_AGENT", "Coelacanth Chrome Extension") %>"
|
|
120
122
|
screenshot_one:
|
|
121
|
-
key: "your_screenshot_one_api_key_here"
|
|
123
|
+
key: "<%= ENV.fetch("COELACANTH_SCREENSHOT_ONE_API_KEY", "your_screenshot_one_api_key_here") %>"
|
|
122
124
|
```
|
|
123
125
|
|
|
124
126
|
- **Ferrum client** – Requires a running Chrome instance that exposes the DevTools protocol via WebSocket. Configure the URL,
|
|
@@ -127,6 +129,25 @@ development:
|
|
|
127
129
|
- Configuration is environment-aware: set `RAILS_ENV`/`RACK_ENV` or use Rails' built-in environment handling when the gem is
|
|
128
130
|
used inside a Rails project.
|
|
129
131
|
|
|
132
|
+
### Environment variables
|
|
133
|
+
|
|
134
|
+
Configuration values that would otherwise contain credentials are loaded from environment variables. Set the following
|
|
135
|
+
variables in your shell (or `dotenv` file) before running the gem:
|
|
136
|
+
|
|
137
|
+
```bash
|
|
138
|
+
# Optional: only set when the remote browser requires authentication.
|
|
139
|
+
export COELACANTH_REMOTE_CLIENT_AUTHORIZATION="Bearer <token>"
|
|
140
|
+
|
|
141
|
+
export COELACANTH_REMOTE_CLIENT_USER_AGENT="Coelacanth Chrome Extension"
|
|
142
|
+
export COELACANTH_SCREENSHOT_ONE_API_KEY="your_screenshot_one_api_key_here"
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
If `COELACANTH_REMOTE_CLIENT_AUTHORIZATION` is omitted or left blank, the `Authorization` header is not injected into the
|
|
146
|
+
remote browser session.
|
|
147
|
+
|
|
148
|
+
When using Docker Compose, you can create a `.env` file or export the variables in your environment so the `app` service picks
|
|
149
|
+
them up automatically.
|
|
150
|
+
|
|
130
151
|
If you are working inside Docker, make sure the `UID` environment variable matches your host user by exporting it in your shell
|
|
131
152
|
startup file:
|
|
132
153
|
|
data/compose.yml
CHANGED
|
@@ -3,8 +3,11 @@ networks:
|
|
|
3
3
|
driver: bridge
|
|
4
4
|
services:
|
|
5
5
|
app:
|
|
6
|
-
environment:
|
|
7
|
-
- UID=${UID}
|
|
6
|
+
environment:
|
|
7
|
+
- UID=${UID}
|
|
8
|
+
- COELACANTH_REMOTE_CLIENT_AUTHORIZATION=${COELACANTH_REMOTE_CLIENT_AUTHORIZATION:-}
|
|
9
|
+
- COELACANTH_REMOTE_CLIENT_USER_AGENT=${COELACANTH_REMOTE_CLIENT_USER_AGENT:-}
|
|
10
|
+
- COELACANTH_SCREENSHOT_ONE_API_KEY=${COELACANTH_SCREENSHOT_ONE_API_KEY:-}
|
|
8
11
|
tty: true
|
|
9
12
|
stdin_open: true
|
|
10
13
|
build:
|
data/config/coelacanth.yml
CHANGED
|
@@ -4,10 +4,12 @@ development: &development
|
|
|
4
4
|
ws_url: "ws://chrome:3000/chrome"
|
|
5
5
|
timeout: 10 # seconds
|
|
6
6
|
headers:
|
|
7
|
-
|
|
8
|
-
|
|
7
|
+
<% if (auth = ENV["COELACANTH_REMOTE_CLIENT_AUTHORIZATION"]).to_s.strip != "" %>
|
|
8
|
+
Authorization: "<%= auth %>"
|
|
9
|
+
<% end %>
|
|
10
|
+
User-Agent: "<%= ENV.fetch("COELACANTH_REMOTE_CLIENT_USER_AGENT", "Coelacanth Chrome Extension") %>"
|
|
9
11
|
screenshot_one:
|
|
10
|
-
key: "your_screenshot_one_api_key_here"
|
|
12
|
+
key: "<%= ENV.fetch("COELACANTH_SCREENSHOT_ONE_API_KEY", "your_screenshot_one_api_key_here") %>"
|
|
11
13
|
test:
|
|
12
14
|
<<: *development
|
|
13
15
|
production:
|
|
@@ -16,7 +16,7 @@ module Coelacanth::Client
|
|
|
16
16
|
body = remote_client.body
|
|
17
17
|
body
|
|
18
18
|
rescue => e
|
|
19
|
-
raise
|
|
19
|
+
raise sanitized_remote_client_error(e)
|
|
20
20
|
end
|
|
21
21
|
|
|
22
22
|
def get_screenshot
|
|
@@ -26,11 +26,21 @@ module Coelacanth::Client
|
|
|
26
26
|
File.read(tempfile.path)
|
|
27
27
|
rescue => e
|
|
28
28
|
tempfile.close
|
|
29
|
-
raise
|
|
29
|
+
raise sanitized_remote_client_error(e)
|
|
30
30
|
end
|
|
31
31
|
|
|
32
32
|
private
|
|
33
33
|
|
|
34
|
+
def sanitized_remote_client_error(error)
|
|
35
|
+
"#{error.class}: #{error.message} RemoteClient: #{sanitized_remote_client_identifier}"
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
def sanitized_remote_client_identifier
|
|
39
|
+
return "nil" unless @remote_client
|
|
40
|
+
|
|
41
|
+
"#{@remote_client.class.name}(object_id=#{@remote_client.object_id})"
|
|
42
|
+
end
|
|
43
|
+
|
|
34
44
|
def remote_client
|
|
35
45
|
return @remote_client if @remote_client
|
|
36
46
|
|
|
@@ -1,19 +1,29 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
require "open-uri"
|
|
4
3
|
require "ferrum"
|
|
4
|
+
require_relative "ferrum"
|
|
5
|
+
require_relative "../http"
|
|
5
6
|
|
|
6
7
|
module Coelacanth::Client
|
|
7
8
|
# Coelacanth::Client
|
|
8
9
|
class ScreenshotOne < Coelacanth::Client::Base
|
|
9
10
|
def get_response
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
11
|
+
uri = URI.parse(@url)
|
|
12
|
+
response = Coelacanth::HTTP.get_response(
|
|
13
|
+
uri,
|
|
14
|
+
open_timeout: Coelacanth::HTTP::DEFAULT_OPEN_TIMEOUT,
|
|
15
|
+
read_timeout: Coelacanth::HTTP::DEFAULT_READ_TIMEOUT
|
|
16
|
+
)
|
|
17
|
+
@origin_response = response
|
|
18
|
+
@status_code = response.code.to_i
|
|
19
|
+
|
|
20
|
+
return response.body if response.is_a?(Net::HTTPSuccess)
|
|
21
|
+
|
|
22
|
+
Coelacanth::HTTP.raise_http_error(uri, response)
|
|
23
|
+
rescue Coelacanth::TimeoutError
|
|
24
|
+
fallback_response = fallback_client.get_response
|
|
25
|
+
@status_code = fallback_client.instance_variable_get(:@status_code)
|
|
26
|
+
fallback_response
|
|
17
27
|
end
|
|
18
28
|
|
|
19
29
|
def get_screenshot
|
|
@@ -34,10 +44,22 @@ module Coelacanth::Client
|
|
|
34
44
|
}
|
|
35
45
|
uri.query = URI.encode_www_form(params)
|
|
36
46
|
|
|
37
|
-
response =
|
|
47
|
+
response = Coelacanth::HTTP.get_response(
|
|
48
|
+
uri,
|
|
49
|
+
open_timeout: Coelacanth::HTTP::DEFAULT_OPEN_TIMEOUT,
|
|
50
|
+
read_timeout: 30
|
|
51
|
+
)
|
|
38
52
|
raise "Failed to fetch screenshot: #{response.code}" unless response.is_a?(Net::HTTPSuccess)
|
|
39
53
|
|
|
40
54
|
response.body
|
|
55
|
+
rescue Coelacanth::TimeoutError
|
|
56
|
+
fallback_client.get_screenshot
|
|
57
|
+
end
|
|
58
|
+
|
|
59
|
+
private
|
|
60
|
+
|
|
61
|
+
def fallback_client
|
|
62
|
+
@fallback_client ||= Coelacanth::Client::Ferrum.new(@url, @config)
|
|
41
63
|
end
|
|
42
64
|
end
|
|
43
65
|
end
|
data/lib/coelacanth/configure.rb
CHANGED
|
@@ -15,7 +15,12 @@ module Coelacanth
|
|
|
15
15
|
end
|
|
16
16
|
|
|
17
17
|
def yaml
|
|
18
|
-
@yaml ||= YAML.
|
|
18
|
+
@yaml ||= YAML.safe_load(
|
|
19
|
+
ERB.new(File.read(file)).result,
|
|
20
|
+
permitted_classes: [],
|
|
21
|
+
permitted_symbols: [],
|
|
22
|
+
aliases: true
|
|
23
|
+
)[env]
|
|
19
24
|
end
|
|
20
25
|
|
|
21
26
|
private
|
data/lib/coelacanth/dom.rb
CHANGED
|
@@ -1,13 +1,18 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require "oga"
|
|
4
|
+
require_relative "http"
|
|
4
5
|
|
|
5
6
|
module Coelacanth
|
|
6
7
|
# Coelacanth::Dom
|
|
7
8
|
class Dom
|
|
8
9
|
def oga(url, html: nil)
|
|
9
|
-
html ||=
|
|
10
|
-
|
|
10
|
+
html ||= begin
|
|
11
|
+
Coelacanth::HTTP.get_response(URI.parse(url)).body
|
|
12
|
+
rescue Coelacanth::TimeoutError
|
|
13
|
+
""
|
|
14
|
+
end
|
|
15
|
+
Oga.parse_xml(html.to_s)
|
|
11
16
|
end
|
|
12
17
|
end
|
|
13
18
|
end
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "utilities"
|
|
4
|
+
|
|
5
|
+
module Coelacanth
  class Extractor
    # Extracts structured listings from Markdown content.
    #
    # A listing is a run of consecutive list-item lines (`- `, `* `, `+ ` or
    # `1. `). A run ends at a blank line, a heading, or any other non-list
    # line. Runs with fewer than MIN_ITEMS usable items are discarded. The most
    # recent heading that has not been followed by non-list text is attached
    # to the run as its heading.
    class MarkdownListingCollector
      LIST_ITEM_PATTERN = /\A(?:[-*+]|\d+\.)\s+/.freeze
      HEADING_PATTERN = /\A#+\s*/.freeze
      # `[title](href)trailing` where the href may contain one level of
      # balanced parentheses, e.g. `https://example.com/wiki/Foo_(bar)`.
      LINK_PATTERN = /\A\[([^\]]+)\]\(((?:[^()]|\([^()]*\))+)\)(.*)\z/.freeze
      # Previous, simpler form: href ends at the first `)`. Kept as a fallback
      # so links with unbalanced parentheses are still recognised as before.
      LEGACY_LINK_PATTERN = /\A\[([^\]]+)\]\(([^\)]+)\)(.*)\z/.freeze
      MIN_ITEMS = 3
      MIN_TITLE_LENGTH = 2

      # @param markdown [#to_s] Markdown text to scan; nil/blank yields [].
      # @param base_url [String, nil] passed to Utilities.absolute_url to
      #   resolve each link href.
      # @return [Array<Hash>] one hash per listing: `{ heading:, items: }`,
      #   where each item is `{ title: }` or `{ title:, url: }` with an
      #   optional `:snippet`.
      def call(markdown:, base_url: nil)
        text = markdown.to_s
        return [] if text.strip.empty?

        listings = []
        current = nil
        pending_heading = nil

        # Closes the run being collected, keeping it only when it is long enough.
        finalize_current = lambda do
          if current && current[:items].length >= MIN_ITEMS
            listings << { heading: current[:heading], items: current[:items] }
          end
          current = nil
        end

        text.each_line do |line|
          stripped = line.strip

          if stripped.empty?
            # A blank line ends the run but keeps the pending heading, so
            # "## Title", blank line, list still gets its heading.
            finalize_current.call
          elsif heading_line?(stripped)
            finalize_current.call
            pending_heading = normalize_heading(stripped)
          elsif list_item_line?(stripped)
            current ||= { heading: pending_heading, items: [] }
            pending_heading = nil

            item = build_item(stripped, base_url)
            current[:items] << item if item
          else
            # Ordinary text separates the heading from any later list.
            finalize_current.call
            pending_heading = nil
          end
        end

        finalize_current.call
        listings
      end

      private

      # HEADING_PATTERN is anchored on a leading "#", so no extra prefix check is needed.
      def heading_line?(line)
        line.match?(HEADING_PATTERN)
      end

      def list_item_line?(line)
        line.match?(LIST_ITEM_PATTERN)
      end

      # Strips the leading "#" markers from a heading line.
      def normalize_heading(line)
        line.sub(HEADING_PATTERN, "").strip
      end

      # Builds one item hash from a list-item line.
      #
      # @return [Hash, nil] nil when the item has no content or its title is
      #   shorter than MIN_TITLE_LENGTH.
      def build_item(line, base_url)
        content = line.sub(LIST_ITEM_PATTERN, "").strip
        return if content.empty?

        match = content.match(LINK_PATTERN) || content.match(LEGACY_LINK_PATTERN)

        unless match
          return if content.length < MIN_TITLE_LENGTH

          return { title: content }
        end

        title = match[1].to_s.strip
        href = match[2].to_s.strip
        return if title.length < MIN_TITLE_LENGTH

        item = { title: title, url: Utilities.absolute_url(base_url, href) || href }

        snippet = normalize_snippet(match[3].to_s.strip)
        item[:snippet] = snippet unless snippet.nil? || snippet.empty?
        item
      end

      # Drops a leading separator (hyphen, en/em dash or colon) from the text
      # that follows a link; returns nil when nothing is left.
      def normalize_snippet(text)
        stripped = text.to_s.sub(/\A[-–—:]\s*/, "").strip
        stripped.empty? ? nil : stripped
      end
    end
  end
end
|
|
@@ -52,6 +52,10 @@ module Coelacanth
|
|
|
52
52
|
end + [""]
|
|
53
53
|
when "li"
|
|
54
54
|
["- #{inline_children(node, depth)}"]
|
|
55
|
+
when "a"
|
|
56
|
+
href = node["href"].to_s.strip
|
|
57
|
+
text = inline_children(node, depth)
|
|
58
|
+
href.empty? ? text : "[#{text}](#{href})"
|
|
55
59
|
when "strong", "b"
|
|
56
60
|
"**#{inline_children(node, depth)}**"
|
|
57
61
|
when "em", "i"
|
data/lib/coelacanth/extractor.rb
CHANGED
|
@@ -7,7 +7,7 @@ require_relative "extractor/weak_ml_probe"
|
|
|
7
7
|
require_relative "extractor/fallback_probe"
|
|
8
8
|
require_relative "extractor/markdown_renderer"
|
|
9
9
|
require_relative "extractor/image_collector"
|
|
10
|
-
require_relative "extractor/
|
|
10
|
+
require_relative "extractor/markdown_listing_collector"
|
|
11
11
|
|
|
12
12
|
module Coelacanth
|
|
13
13
|
# High-level API for extracting articles without site-specific selectors.
|
|
@@ -60,7 +60,7 @@ module Coelacanth
|
|
|
60
60
|
byline: result.byline,
|
|
61
61
|
source: result.source_tag,
|
|
62
62
|
confidence: result.confidence,
|
|
63
|
-
listings:
|
|
63
|
+
listings: MarkdownListingCollector.new.call(markdown: body_markdown, base_url: url)
|
|
64
64
|
}
|
|
65
65
|
end
|
|
66
66
|
end
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "net/http"
|
|
4
|
+
require "open-uri"
|
|
5
|
+
require "timeout"
|
|
6
|
+
|
|
7
|
+
require_relative "robots"
|
|
8
|
+
|
|
9
|
+
module Coelacanth
  class TimeoutError < StandardError; end unless const_defined?(:TimeoutError)

  # Thin wrapper around Net::HTTP for GET requests. It adds a robots.txt
  # check, bounded retries on timeouts, and a helper that raises
  # OpenURI::HTTPError for non-success responses.
  module HTTP
    DEFAULT_OPEN_TIMEOUT = 5
    DEFAULT_READ_TIMEOUT = 10
    MAX_RETRIES = 2

    # Stand-in for the IO object that OpenURI::HTTPError carries; it exposes
    # the status, headers, base URI and body of the failed response.
    ErrorResponse = Struct.new(:status, :meta, :base_uri, :body, keyword_init: true) do
      def string
        body.to_s
      end

      alias to_s string
    end

    module_function

    # Performs a GET after confirming robots.txt allows it.
    #
    # @param uri [URI::Generic] target URI
    # @param open_timeout [Numeric] connection timeout in seconds
    # @param read_timeout [Numeric] read timeout in seconds
    # @param retries [Integer] extra attempts made after a timeout
    # @return [Net::HTTPResponse]
    # @raise [Coelacanth::RobotsDisallowedError] when robots.txt forbids the URI
    # @raise [Coelacanth::TimeoutError] when every attempt timed out
    def get_response(uri, open_timeout: DEFAULT_OPEN_TIMEOUT, read_timeout: DEFAULT_READ_TIMEOUT, retries: MAX_RETRIES)
      ensure_allowed!(uri)
      raw_get_response(uri, open_timeout: open_timeout, read_timeout: read_timeout, retries: retries)
    end

    # Performs a GET without consulting robots.txt (used to fetch robots.txt
    # itself). Timeouts are retried up to `retries` times, i.e. at most
    # `retries + 1` attempts in total.
    #
    # @return [Net::HTTPResponse]
    # @raise [Coelacanth::TimeoutError] when every attempt timed out
    def raw_get_response(uri, open_timeout: DEFAULT_OPEN_TIMEOUT, read_timeout: DEFAULT_READ_TIMEOUT, retries: MAX_RETRIES)
      attempts = 0
      begin
        attempts += 1
        request = Net::HTTP::Get.new(uri)
        # Identify with the same user-agent that ensure_allowed! checks
        # against robots.txt; otherwise Net::HTTP's default would be sent and
        # the permission check would not describe the actual client.
        agent = Coelacanth::Robots.user_agent.to_s
        request["User-Agent"] = agent unless agent.strip.empty?

        Net::HTTP.start(
          uri.host,
          uri.port,
          use_ssl: uri.scheme == "https",
          open_timeout: open_timeout,
          read_timeout: read_timeout
        ) { |http| http.request(request) }
      rescue Net::OpenTimeout, Net::ReadTimeout, Timeout::Error => e
        retry if attempts <= retries

        raise Coelacanth::TimeoutError, "GET #{uri} timed out after #{attempts} attempts: #{e.message}"
      end
    end

    # @raise [Coelacanth::RobotsDisallowedError] unless Coelacanth::Robots allows the URI
    def ensure_allowed!(uri)
      return if Coelacanth::Robots.allowed?(uri)

      raise Coelacanth::RobotsDisallowedError,
            "Access to #{uri} is disallowed by robots.txt for user-agent '#{Coelacanth::Robots.user_agent}'"
    end

    # Converts a non-success response into an OpenURI::HTTPError whose `io`
    # is an ErrorResponse built from that response.
    #
    # @raise [OpenURI::HTTPError] always
    def raise_http_error(uri, response)
      message = format("%s %s for GET %s", response.code, response.message, uri)
      io = ErrorResponse.new(
        status: [response.code, response.message],
        meta: response.each_header.to_h,
        base_uri: uri,
        body: response.body
      )

      raise OpenURI::HTTPError.new(message, io)
    end
  end
end
|
data/lib/coelacanth/redirect.rb
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
require "ferrum"
|
|
4
4
|
require "oga"
|
|
5
|
+
require_relative "http"
|
|
5
6
|
|
|
6
7
|
module Coelacanth
|
|
7
8
|
# Coelacanth::Redirect
|
|
@@ -11,11 +12,15 @@ module Coelacanth
|
|
|
11
12
|
raise Coelacanth::DeepRedirectError, "Too many redirect" if limit.zero?
|
|
12
13
|
raise Coelacanth::RedirectError, "Url or location is nil" if @url.nil?
|
|
13
14
|
|
|
14
|
-
response =
|
|
15
|
+
response = Coelacanth::HTTP.get_response(URI.parse(@url))
|
|
15
16
|
@status_code = response.code
|
|
16
17
|
@origin_response = response
|
|
17
18
|
|
|
18
19
|
handle_response(@origin_response, limit)
|
|
20
|
+
rescue Coelacanth::TimeoutError
|
|
21
|
+
@status_code = nil
|
|
22
|
+
@origin_response = nil
|
|
23
|
+
@url
|
|
19
24
|
end
|
|
20
25
|
|
|
21
26
|
private
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "uri"
|
|
4
|
+
|
|
5
|
+
module Coelacanth
  # Minimal robots.txt support: fetches /robots.txt once per origin, parses
  # User-agent / Allow / Disallow groups and answers whether a URI may be
  # fetched. Parsed rules are cached in memory per scheme+host+port until
  # clear_cache! is called.
  module Robots
    DEFAULT_USER_AGENT = "CoelacanthBot"
    RULE_STRUCT = Struct.new(:type, :pattern, :regex, :length, keyword_init: true)

    module_function

    # @param uri [URI::Generic] URI about to be requested
    # @param user_agent [String] agent name to look up (case-insensitive)
    # @return [Boolean] true when the URI may be fetched
    def allowed?(uri, user_agent: user_agent())
      rules = rules_for(uri)
      return true if rules.empty?

      # A group that names our agent takes precedence even when it holds no
      # rules (e.g. only an empty "Disallow:", which means "allow all");
      # the "*" group is used only when no such group exists.
      agent_rules = rules.fetch(normalize_agent(user_agent)) { rules.fetch("*", nil) }
      return true if agent_rules.nil? || agent_rules.empty?

      evaluate(agent_rules, normalize_path(uri))
    end

    # Agent name used for robots.txt matching; overridable via the
    # COELACANTH_HTTP_USER_AGENT environment variable.
    def user_agent
      ENV.fetch("COELACANTH_HTTP_USER_AGENT", DEFAULT_USER_AGENT)
    end

    # Returns the (cached) parsed rules for the origin of `uri`.
    def rules_for(uri)
      robots_cache[cache_key(uri)] ||= fetch_rules(uri)
    end

    def clear_cache!
      robots_cache.clear
    end

    def robots_cache
      @robots_cache ||= {}
    end

    # Downloads and parses robots.txt for the origin of `uri`.
    # Best effort: a non-success response or any error (including
    # Coelacanth::TimeoutError, which is a StandardError) yields no rules,
    # which callers treat as "everything allowed".
    def fetch_rules(uri)
      response = Coelacanth::HTTP.raw_get_response(robots_uri_for(uri))
      return {} unless response.is_a?(Net::HTTPSuccess)

      parse_robots(response.body.to_s)
    rescue StandardError
      {}
    end

    # Builds the robots.txt URI for the same scheme, host and port as `uri`.
    def robots_uri_for(uri)
      klass = uri.scheme == "https" ? URI::HTTPS : URI::HTTP
      port = uri.port
      port = nil if port == default_port_for(uri.scheme)

      klass.build(host: uri.host, path: "/robots.txt", port: port)
    end

    # Parses a robots.txt body.
    #
    # @param body [String]
    # @return [Hash{String => Array<RULE_STRUCT>}] rules keyed by lower-cased
    #   agent name; every agent named in a User-agent line has an entry, even
    #   when its group contributes no rules.
    def parse_robots(body)
      rules = {}
      current_agents = []
      last_directive = nil

      body.each_line do |line|
        # A comment-only line is not a group separator; skip it so the rules
        # that follow stay attached to the current User-agent group.
        next if line.lstrip.start_with?("#")

        sanitized = sanitize_line(line)
        if sanitized.empty?
          current_agents = []
          last_directive = nil
          next
        end

        field, value = sanitized.split(":", 2)
        next if value.nil?

        field = field.strip.downcase
        value = value.strip

        case field
        when "user-agent"
          # Consecutive User-agent lines share one group; any other directive
          # in between starts a new group.
          current_agents = [] unless last_directive == :user_agent
          agent = normalize_agent(value)
          current_agents << agent unless current_agents.include?(agent)
          rules[agent] ||= []
          last_directive = :user_agent
        when "allow", "disallow"
          last_directive = field.to_sym
          next if value.empty?

          # Rules that appear before any User-agent line apply to everyone.
          current_agents = ["*"] if current_agents.empty?
          rule = build_rule(type: last_directive, value: value)
          current_agents.each { |agent| (rules[agent] ||= []) << rule }
        else
          last_directive = field.to_sym
        end
      end

      rules
    end

    # Removes a trailing "# comment" and surrounding whitespace.
    def sanitize_line(line)
      line.split("#", 2).first.to_s.strip
    end

    # Compiles one Allow/Disallow value into a rule. "*" matches any run of
    # characters; a "$" at the very end anchors the match to the end of the
    # path. A "$" elsewhere is matched literally.
    def build_rule(type:, value:)
      pattern = value.start_with?("/") ? value : "/#{value}"
      anchored = pattern.end_with?("$")
      body = anchored ? pattern.delete_suffix("$") : pattern

      source = "\\A" + Regexp.escape(body).gsub("\\*", ".*")
      source += "\\z" if anchored

      RULE_STRUCT.new(type: type, pattern: pattern, regex: Regexp.new(source), length: pattern.length)
    end

    # Longest matching pattern wins; an Allow wins a tie with a Disallow.
    def evaluate(rules, path)
      matches = rules.select { |rule| rule.regex.match?(path) }
      return true if matches.empty?

      longest_allow = matches.select { |rule| rule.type == :allow }.max_by(&:length)
      longest_disallow = matches.select { |rule| rule.type == :disallow }.max_by(&:length)

      return true if longest_disallow.nil?
      return true if longest_allow && longest_allow.length >= longest_disallow.length

      false
    end

    # Path (defaulting to "/") plus "?query" when a query string is present.
    def normalize_path(uri)
      path = uri.path
      path = "/" if path.nil? || path.empty?
      query = uri.query
      return path if query.nil? || query.empty?

      "#{path}?#{query}"
    end

    def normalize_agent(agent)
      agent.to_s.strip.downcase
    end

    # Cache key: scheme://host, with the port appended only when non-default.
    def cache_key(uri)
      port = uri.port
      default_port = default_port_for(uri.scheme)
      port_part = port && port != default_port ? ":#{port}" : ""
      "#{uri.scheme}://#{uri.host}#{port_part}"
    end

    def default_port_for(scheme)
      scheme == "https" ? URI::HTTPS.default_port : URI::HTTP.default_port
    end
  end
end
|
data/lib/coelacanth/version.rb
CHANGED
data/lib/coelacanth.rb
CHANGED
|
@@ -7,6 +7,7 @@ require_relative "coelacanth/client/ferrum"
|
|
|
7
7
|
require_relative "coelacanth/client/screenshot_one"
|
|
8
8
|
require_relative "coelacanth/dom"
|
|
9
9
|
require_relative "coelacanth/extractor"
|
|
10
|
+
require_relative "coelacanth/http"
|
|
10
11
|
require_relative "coelacanth/redirect"
|
|
11
12
|
require_relative "coelacanth/validator"
|
|
12
13
|
require_relative "coelacanth/version"
|
|
@@ -16,13 +17,20 @@ module Coelacanth
|
|
|
16
17
|
class Error < StandardError; end
|
|
17
18
|
class RedirectError < StandardError; end
|
|
18
19
|
class DeepRedirectError < StandardError; end
|
|
20
|
+
class TimeoutError < StandardError; end
|
|
21
|
+
class RobotsDisallowedError < StandardError; end
|
|
19
22
|
|
|
20
23
|
def self.analyze(url)
|
|
21
24
|
client_class = config.read("client") == "screenshot_one" ? Client::ScreenshotOne : Client::Ferrum
|
|
22
25
|
@client = client_class.new(url)
|
|
23
26
|
regular_url = Redirect.new.resolve_redirect(url)
|
|
24
|
-
response =
|
|
25
|
-
|
|
27
|
+
response = begin
|
|
28
|
+
Coelacanth::HTTP.get_response(URI.parse(regular_url))
|
|
29
|
+
rescue Coelacanth::TimeoutError
|
|
30
|
+
nil
|
|
31
|
+
end
|
|
32
|
+
html = response&.body.to_s
|
|
33
|
+
html = html.dup
|
|
26
34
|
html = html.force_encoding(Encoding::UTF_8)
|
|
27
35
|
html = html.encode(Encoding::UTF_8, invalid: :replace, undef: :replace)
|
|
28
36
|
extractor_result = Extractor.new.call(html: html, url: regular_url)
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: coelacanth
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.4.
|
|
4
|
+
version: 0.4.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Yusuke
|
|
@@ -18,6 +18,7 @@ executables: []
|
|
|
18
18
|
extensions: []
|
|
19
19
|
extra_rdoc_files: []
|
|
20
20
|
files:
|
|
21
|
+
- ".env.example"
|
|
21
22
|
- ".rspec"
|
|
22
23
|
- ".rubocop.yml"
|
|
23
24
|
- CHANGELOG.md
|
|
@@ -39,13 +40,15 @@ files:
|
|
|
39
40
|
- lib/coelacanth/extractor/fallback_probe.rb
|
|
40
41
|
- lib/coelacanth/extractor/heuristic_probe.rb
|
|
41
42
|
- lib/coelacanth/extractor/image_collector.rb
|
|
42
|
-
- lib/coelacanth/extractor/
|
|
43
|
+
- lib/coelacanth/extractor/markdown_listing_collector.rb
|
|
43
44
|
- lib/coelacanth/extractor/markdown_renderer.rb
|
|
44
45
|
- lib/coelacanth/extractor/metadata_probe.rb
|
|
45
46
|
- lib/coelacanth/extractor/normalizer.rb
|
|
46
47
|
- lib/coelacanth/extractor/utilities.rb
|
|
47
48
|
- lib/coelacanth/extractor/weak_ml_probe.rb
|
|
49
|
+
- lib/coelacanth/http.rb
|
|
48
50
|
- lib/coelacanth/redirect.rb
|
|
51
|
+
- lib/coelacanth/robots.rb
|
|
49
52
|
- lib/coelacanth/validator.rb
|
|
50
53
|
- lib/coelacanth/version.rb
|
|
51
54
|
homepage: https://github.com/slidict/coelacanth
|
|
@@ -1,270 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require_relative "utilities"
|
|
4
|
-
|
|
5
|
-
module Coelacanth
|
|
6
|
-
class Extractor
|
|
7
|
-
# Identifies sidebar or inline news listings and returns link arrays.
|
|
8
|
-
class ListingCollector
|
|
9
|
-
CANDIDATE_SELECTOR = "aside, section, div, ul, ol, dl".freeze
|
|
10
|
-
MIN_ITEMS = 3
|
|
11
|
-
MIN_TITLE_LENGTH = 2
|
|
12
|
-
|
|
13
|
-
def call(document:, base_url: nil, primary_node: nil)
|
|
14
|
-
candidates = collect_candidates(document, base_url, primary_node)
|
|
15
|
-
|
|
16
|
-
candidates
|
|
17
|
-
.sort_by { |candidate| -candidate[:score] }
|
|
18
|
-
.reject { |candidate| candidate[:score] < minimum_score }
|
|
19
|
-
.first(3)
|
|
20
|
-
.map { |candidate| format_candidate(candidate) }
|
|
21
|
-
end
|
|
22
|
-
|
|
23
|
-
private
|
|
24
|
-
|
|
25
|
-
def collect_candidates(document, base_url, primary_node)
|
|
26
|
-
document.css(CANDIDATE_SELECTOR).filter_map do |node|
|
|
27
|
-
next if skip_node?(node, primary_node)
|
|
28
|
-
|
|
29
|
-
items = extract_items(node, base_url)
|
|
30
|
-
next if items.length < MIN_ITEMS
|
|
31
|
-
|
|
32
|
-
heading = heading_for(node)
|
|
33
|
-
score = score_node(node, items, heading)
|
|
34
|
-
next if score < minimum_score
|
|
35
|
-
|
|
36
|
-
{ node: node, items: items, heading: heading, score: score }
|
|
37
|
-
end
|
|
38
|
-
end
|
|
39
|
-
|
|
40
|
-
def skip_node?(node, primary_node)
|
|
41
|
-
return true if nested_listing_container?(node)
|
|
42
|
-
|
|
43
|
-
return false unless primary_node
|
|
44
|
-
return false unless primary_node.respond_to?(:name)
|
|
45
|
-
return false if %w[body html].include?(primary_node.name)
|
|
46
|
-
|
|
47
|
-
node == primary_node ||
|
|
48
|
-
ancestor?(node, primary_node) ||
|
|
49
|
-
ancestor?(primary_node, node)
|
|
50
|
-
end
|
|
51
|
-
|
|
52
|
-
def nested_listing_container?(node)
|
|
53
|
-
Utilities.ancestors(node).any? do |ancestor|
|
|
54
|
-
Utilities.element?(ancestor) && LISTING_CONTAINER_TAGS.include?(ancestor.name)
|
|
55
|
-
end
|
|
56
|
-
end
|
|
57
|
-
|
|
58
|
-
LISTING_CONTAINER_TAGS = %w[aside section div ul ol dl].freeze
|
|
59
|
-
|
|
60
|
-
def ancestor?(node, candidate)
|
|
61
|
-
Utilities.ancestors(node).any? { |ancestor| ancestor == candidate }
|
|
62
|
-
end
|
|
63
|
-
|
|
64
|
-
def extract_items(node, base_url)
|
|
65
|
-
item_nodes = candidate_children(node)
|
|
66
|
-
return [] if item_nodes.empty?
|
|
67
|
-
|
|
68
|
-
item_nodes.filter_map do |child|
|
|
69
|
-
next unless contains_link?(child)
|
|
70
|
-
|
|
71
|
-
anchor = primary_anchor(child)
|
|
72
|
-
next unless anchor
|
|
73
|
-
|
|
74
|
-
title = normalize_text(anchor.text)
|
|
75
|
-
next if title.length < MIN_TITLE_LENGTH
|
|
76
|
-
|
|
77
|
-
href = anchor["href"].to_s.strip
|
|
78
|
-
next if href.empty?
|
|
79
|
-
|
|
80
|
-
url = base_url ? Utilities.absolute_url(base_url, href) : href
|
|
81
|
-
url ||= href
|
|
82
|
-
|
|
83
|
-
snippet = build_snippet(child, title)
|
|
84
|
-
|
|
85
|
-
item = { title: title, url: url }
|
|
86
|
-
item[:snippet] = snippet unless snippet.nil? || snippet.empty?
|
|
87
|
-
item
|
|
88
|
-
end.uniq { |item| [item[:title], item[:url]] }
|
|
89
|
-
end
|
|
90
|
-
|
|
91
|
-
def candidate_children(node)
|
|
92
|
-
direct_children = Utilities.element_children(node)
|
|
93
|
-
return [] if direct_children.empty?
|
|
94
|
-
|
|
95
|
-
anchor_children = direct_children.select { |child| contains_link?(child) }
|
|
96
|
-
return anchor_children if anchor_children.length >= MIN_ITEMS
|
|
97
|
-
|
|
98
|
-
groups = %w[li article div section p dd]
|
|
99
|
-
|
|
100
|
-
groups.each do |tag|
|
|
101
|
-
grouped = direct_children.select { |child| child.name == tag }
|
|
102
|
-
return grouped if grouped.length >= MIN_ITEMS
|
|
103
|
-
end
|
|
104
|
-
|
|
105
|
-
list_container = direct_children.find { |child| %w[ul ol dl].include?(child.name) }
|
|
106
|
-
return Utilities.element_children(list_container) if list_container
|
|
107
|
-
|
|
108
|
-
[]
|
|
109
|
-
end
|
|
110
|
-
|
|
111
|
-
def contains_link?(node)
|
|
112
|
-
node.css("a[href]").any?
|
|
113
|
-
end
|
|
114
|
-
|
|
115
|
-
def primary_anchor(node)
|
|
116
|
-
anchors = node.css("a[href]")
|
|
117
|
-
anchors.max_by { |anchor| normalize_text(anchor.text).length }
|
|
118
|
-
end
|
|
119
|
-
|
|
120
|
-
def normalize_text(text)
|
|
121
|
-
text.to_s.gsub(/[\r\n\t]/, " ").squeeze(" ").strip
|
|
122
|
-
end
|
|
123
|
-
|
|
124
|
-
def build_snippet(node, title)
|
|
125
|
-
snippet_from_node_text(node, title) || metadata_context(node, title)
|
|
126
|
-
end
|
|
127
|
-
|
|
128
|
-
def snippet_from_node_text(node, title)
|
|
129
|
-
text = normalize_text(node.text)
|
|
130
|
-
snippet = text.sub(title, "").strip
|
|
131
|
-
snippet.empty? ? nil : truncate(snippet)
|
|
132
|
-
end
|
|
133
|
-
|
|
134
|
-
def metadata_context(node, title)
|
|
135
|
-
candidate = time_text(node) || preceding_metadata(node)
|
|
136
|
-
return nil if candidate.nil?
|
|
137
|
-
|
|
138
|
-
candidate = candidate.sub(title, "").strip
|
|
139
|
-
candidate.empty? ? nil : truncate(candidate)
|
|
140
|
-
end
|
|
141
|
-
|
|
142
|
-
def time_text(node)
|
|
143
|
-
node.css("time").filter_map { |time| normalize_text(time.text) }.find { |text| !text.empty? }
|
|
144
|
-
end
|
|
145
|
-
|
|
146
|
-
def preceding_metadata(node)
|
|
147
|
-
previous = Utilities.previous_element(node)
|
|
148
|
-
3.times do
|
|
149
|
-
break unless previous
|
|
150
|
-
|
|
151
|
-
text = normalize_text(previous.text)
|
|
152
|
-
return text unless text.empty?
|
|
153
|
-
|
|
154
|
-
previous = Utilities.previous_element(previous)
|
|
155
|
-
end
|
|
156
|
-
|
|
157
|
-
nil
|
|
158
|
-
end
|
|
159
|
-
|
|
160
|
-
def truncate(text)
|
|
161
|
-
return text if text.length <= 120
|
|
162
|
-
|
|
163
|
-
text[0...117] + "..."
|
|
164
|
-
end
|
|
165
|
-
|
|
166
|
-
def heading_for(node)
|
|
167
|
-
if (heading = node.at_css("h1, h2, h3, h4"))
|
|
168
|
-
return normalize_text(heading.text)
|
|
169
|
-
end
|
|
170
|
-
|
|
171
|
-
previous = Utilities.previous_element(node)
|
|
172
|
-
3.times do
|
|
173
|
-
break unless previous
|
|
174
|
-
|
|
175
|
-
return normalize_text(previous.text) if previous.name =~ /h[1-6]/
|
|
176
|
-
previous = Utilities.previous_element(previous)
|
|
177
|
-
end
|
|
178
|
-
|
|
179
|
-
nil
|
|
180
|
-
end
|
|
181
|
-
|
|
182
|
-
def score_node(node, items, heading)
|
|
183
|
-
structure_score = structural_score(node)
|
|
184
|
-
heading_score = heading ? 45 : 0
|
|
185
|
-
item_score = items.length * 40
|
|
186
|
-
density_score = Utilities.link_density(node) * 90
|
|
187
|
-
adjacency_score = sibling_sequence_bonus(node)
|
|
188
|
-
depth_penalty = Utilities.depth(node) * 5
|
|
189
|
-
length_penalty = long_text_penalty(node)
|
|
190
|
-
|
|
191
|
-
structure_score + heading_score + item_score + density_score + adjacency_score - depth_penalty - length_penalty
|
|
192
|
-
end
|
|
193
|
-
|
|
194
|
-
def structural_score(node)
|
|
195
|
-
children = candidate_children(node)
|
|
196
|
-
return 0 if children.empty?
|
|
197
|
-
|
|
198
|
-
dominant_tag, dominant_children = children.group_by(&:name).max_by { |_, nodes| nodes.length }
|
|
199
|
-
dominant_count = dominant_children.length
|
|
200
|
-
|
|
201
|
-
uniform_bonus = dominant_count == children.length ? 60 : 20
|
|
202
|
-
list_bonus = %w[ul ol dl].include?(node.name) ? 90 : 0
|
|
203
|
-
list_bonus += 45 if dominant_tag && %w[li dd].include?(dominant_tag)
|
|
204
|
-
|
|
205
|
-
distribution_bonus = distribution_consistency_bonus(children)
|
|
206
|
-
|
|
207
|
-
dominant_count * 12 + uniform_bonus + list_bonus + distribution_bonus
|
|
208
|
-
end
|
|
209
|
-
|
|
210
|
-
def distribution_consistency_bonus(children)
|
|
211
|
-
return 0 if children.length < MIN_ITEMS
|
|
212
|
-
|
|
213
|
-
lengths = children.map { |child| Utilities.text_length(child) }
|
|
214
|
-
average = lengths.sum.to_f / lengths.length
|
|
215
|
-
variance = lengths.map { |len| (len - average).abs }
|
|
216
|
-
|
|
217
|
-
variance.max <= 120 ? 40 : 10
|
|
218
|
-
end
|
|
219
|
-
|
|
220
|
-
def sibling_sequence_bonus(node)
|
|
221
|
-
siblings = Utilities.sibling_elements(node)
|
|
222
|
-
return 0 if siblings.empty?
|
|
223
|
-
|
|
224
|
-
index = siblings.index(node)
|
|
225
|
-
return 0 unless index
|
|
226
|
-
|
|
227
|
-
forward = 0
|
|
228
|
-
while (candidate = siblings[index + forward + 1]) && similar_structure?(node, candidate)
|
|
229
|
-
forward += 1
|
|
230
|
-
end
|
|
231
|
-
|
|
232
|
-
backward = 0
|
|
233
|
-
while index - backward - 1 >= 0 && (candidate = siblings[index - backward - 1]) && similar_structure?(node, candidate)
|
|
234
|
-
backward += 1
|
|
235
|
-
end
|
|
236
|
-
|
|
237
|
-
(forward + backward) * 15
|
|
238
|
-
end
|
|
239
|
-
|
|
240
|
-
def similar_structure?(node, other)
|
|
241
|
-
return false unless other
|
|
242
|
-
|
|
243
|
-
node_children = candidate_children(node)
|
|
244
|
-
other_children = candidate_children(other)
|
|
245
|
-
return false if node_children.empty? || other_children.empty?
|
|
246
|
-
|
|
247
|
-
node_children.first.name == other_children.first.name && node_children.length == other_children.length
|
|
248
|
-
end
|
|
249
|
-
|
|
250
|
-
def long_text_penalty(node)
|
|
251
|
-
children = candidate_children(node)
|
|
252
|
-
return 0 if children.empty?
|
|
253
|
-
|
|
254
|
-
overlong = children.count { |child| Utilities.text_length(child) > 280 }
|
|
255
|
-
overlong * 30
|
|
256
|
-
end
|
|
257
|
-
|
|
258
|
-
def minimum_score
|
|
259
|
-
180
|
|
260
|
-
end
|
|
261
|
-
|
|
262
|
-
def format_candidate(candidate)
|
|
263
|
-
{
|
|
264
|
-
heading: candidate[:heading],
|
|
265
|
-
items: candidate[:items]
|
|
266
|
-
}
|
|
267
|
-
end
|
|
268
|
-
end
|
|
269
|
-
end
|
|
270
|
-
end
|