coelacanth 0.5.0 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -5
- data/README.md +32 -1
- data/compose.yml +29 -20
- data/config/coelacanth.yml +8 -1
- data/lib/coelacanth/client/ferrum.rb +2 -2
- data/lib/coelacanth/client/gotenberg.rb +89 -0
- data/lib/coelacanth/extractor/morphological_analyzer.rb +4 -0
- data/lib/coelacanth/version.rb +1 -1
- data/lib/coelacanth.rb +19 -1
- metadata +2 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 94493e66b4aae51d1790f03eb89c95d2ec1861b9d150c1001bba7554a1b37ab7
|
|
4
|
+
data.tar.gz: acdd939524543ec62740e153774c44f5fcdee11d30a9bd5ca4e6bcabe2f65398
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 05b8789bd0bf8a23cb678b9f17349a403c0f1e7017903dea14ae13dbea1df1204a3a5cdd184c70e86e0d7366dce67cabbbd528846c2c498953088c1309830421
|
|
7
|
+
data.tar.gz: 047d1acabd38e890579947bd7311178ab0197089acd0185d5dbfa0628bbe6a5ad003c330e786c2d82d44fbada5b3ed13882178b50cba86c74ed2e2e12a527199
|
data/CHANGELOG.md
CHANGED
|
@@ -4,9 +4,11 @@ All notable changes to this project will be documented in this file.
|
|
|
4
4
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
5
5
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
6
6
|
|
|
7
|
-
## [v0.
|
|
8
|
-
### :
|
|
9
|
-
- [`
|
|
10
|
-
- [`2a566ad`](https://github.com/slidict/coelacanth/commit/2a566adeaaa5b813fded4b9ebd8ce8d90d43ee7c) - add morphological analysis for body markdown *(commit by [@yubele](https://github.com/yubele))*
|
|
7
|
+
## [v0.6.1] - 2026-06-15
|
|
8
|
+
### :bug: Bug Fixes
|
|
9
|
+
- [`f387ef9`](https://github.com/slidict/coelacanth/commit/f387ef997e0c383458a4fd0384f74883fcb63f37) - Update gem-push.yml *(commit by [@abechan1](https://github.com/abechan1))*
|
|
11
10
|
|
|
12
|
-
|
|
11
|
+
### :wrench: Chores
|
|
12
|
+
- [`4ad8456`](https://github.com/slidict/coelacanth/commit/4ad8456deba3aea4e6eea8836d6bdf728fbbdb70) - Bump version to 0.6.1 *(commit by [@abechan1](https://github.com/abechan1))*
|
|
13
|
+
|
|
14
|
+
[v0.6.1]: https://github.com/slidict/coelacanth/compare/v0.6.0...v0.6.1
|
data/README.md
CHANGED
|
@@ -84,6 +84,20 @@ result[:screenshot] # => PNG screenshot as a binary string
|
|
|
84
84
|
result[:response] # => HTTP status, headers, and final URL
|
|
85
85
|
```
|
|
86
86
|
|
|
87
|
+
# Plain-text morphology
|
|
88
|
+
|
|
89
|
+
You can run the morphological analyzer without fetching a page by passing plain
|
|
90
|
+
text:
|
|
91
|
+
|
|
92
|
+
```ruby
|
|
93
|
+
Coelacanth.morphological_analysis("これはテストです。 Testing morphology twice.")
|
|
94
|
+
# => [
|
|
95
|
+
# { token: "testing morphology twice", score: 1.23, count: 2 },
|
|
96
|
+
# { token: "テスト", score: 1.02, count: 1 },
|
|
97
|
+
# ...
|
|
98
|
+
# ]
|
|
99
|
+
```
|
|
100
|
+
|
|
87
101
|
The returned hash includes:
|
|
88
102
|
|
|
89
103
|
- `:extraction` – output from `Coelacanth::Extractor`, including title, Markdown body (`body_markdown`,
|
|
@@ -129,7 +143,7 @@ Runtime configuration is stored in `config/coelacanth.yml`. Environments inherit
|
|
|
129
143
|
|
|
130
144
|
```yaml
|
|
131
145
|
development:
|
|
132
|
-
client: "ferrum" # Options: "ferrum", "screenshot_one"
|
|
146
|
+
client: "ferrum" # Options: "ferrum", "screenshot_one", "gotenberg"
|
|
133
147
|
remote_client:
|
|
134
148
|
ws_url: "ws://chrome:3000/chrome"
|
|
135
149
|
timeout: 10
|
|
@@ -141,6 +155,13 @@ development:
|
|
|
141
155
|
User-Agent: "<%= ENV.fetch("COELACANTH_REMOTE_CLIENT_USER_AGENT", "Coelacanth Chrome Extension") %>"
|
|
142
156
|
screenshot_one:
|
|
143
157
|
key: "<%= ENV.fetch("COELACANTH_SCREENSHOT_ONE_API_KEY", "your_screenshot_one_api_key_here") %>"
|
|
158
|
+
gotenberg:
|
|
159
|
+
url: "<%= ENV.fetch("COELACANTH_GOTENBERG_URL", "http://gotenberg:3000") %>"
|
|
160
|
+
open_timeout: 5
|
|
161
|
+
read_timeout: 30
|
|
162
|
+
wait_delay: "<%= ENV.fetch("COELACANTH_GOTENBERG_WAIT_DELAY", "") %>"
|
|
163
|
+
user_agent: "<%= ENV.fetch("COELACANTH_GOTENBERG_USER_AGENT", "") %>"
|
|
164
|
+
extra_http_headers:
|
|
144
165
|
youtube:
|
|
145
166
|
api_key: "<%= ENV.fetch("COELACANTH_YOUTUBE_API_KEY", "") %>"
|
|
146
167
|
morphology:
|
|
@@ -157,6 +178,9 @@ development:
|
|
|
157
178
|
- **Ferrum client** – Requires a running Chrome instance that exposes the DevTools protocol via WebSocket. Configure the URL,
|
|
158
179
|
timeout, the network idle timeout, and any headers to inject.
|
|
159
180
|
- **ScreenshotOne client** – Supply an API key to offload screenshot capture to [ScreenshotOne](https://screenshotone.com/).
|
|
181
|
+
- **Gotenberg client** – Set `client: "gotenberg"` to capture screenshots through Gotenberg's Chromium URL screenshot
|
|
182
|
+
endpoint. Configure `gotenberg.url`, request timeouts, an optional `wait_delay`, an optional screenshot `user_agent`,
|
|
183
|
+
and optional `extra_http_headers`.
|
|
160
184
|
- **Eyecatch image extraction** – Representative images are discovered automatically by checking Open Graph/Twitter metadata,
|
|
161
185
|
Schema.org JSON-LD payloads, and high-signal `<img>` elements (hero/cover images, large dimensions, etc.). No manual XPath
|
|
162
186
|
maintenance is required.
|
|
@@ -193,6 +217,9 @@ export COELACANTH_REMOTE_CLIENT_AUTHORIZATION="Bearer <token>"
|
|
|
193
217
|
|
|
194
218
|
export COELACANTH_REMOTE_CLIENT_USER_AGENT="Coelacanth Chrome Extension"
|
|
195
219
|
export COELACANTH_SCREENSHOT_ONE_API_KEY="your_screenshot_one_api_key_here"
|
|
220
|
+
export COELACANTH_GOTENBERG_URL="http://gotenberg:3000"
|
|
221
|
+
export COELACANTH_GOTENBERG_WAIT_DELAY="2s"
|
|
222
|
+
export COELACANTH_GOTENBERG_USER_AGENT="Coelacanth Chrome Extension"
|
|
196
223
|
export COELACANTH_YOUTUBE_API_KEY="your_youtube_data_api_key"
|
|
197
224
|
```
|
|
198
225
|
|
|
@@ -216,6 +243,10 @@ YouTube, so non-video pages continue to behave as before.
|
|
|
216
243
|
When using Docker Compose, you can create a `.env` file or export the variables in your environment so the `app` service picks
|
|
217
244
|
them up automatically.
|
|
218
245
|
|
|
246
|
+
Docker Compose also starts a `gotenberg` service and passes `COELACANTH_GOTENBERG_URL` to the `app` service by default.
|
|
247
|
+
To try that path locally, set `client: "gotenberg"` in `config/coelacanth.yml` or export the equivalent environment-specific
|
|
248
|
+
configuration before running the app container.
|
|
249
|
+
|
|
219
250
|
If you are working inside Docker, make sure the `UID` environment variable matches your host user by exporting it in your shell
|
|
220
251
|
startup file:
|
|
221
252
|
|
data/compose.yml
CHANGED
|
@@ -1,25 +1,34 @@
|
|
|
1
|
-
networks:
|
|
2
|
-
app-tier:
|
|
3
|
-
driver: bridge
|
|
4
|
-
services:
|
|
5
|
-
app:
|
|
1
|
+
networks:
|
|
2
|
+
app-tier:
|
|
3
|
+
driver: bridge
|
|
4
|
+
services:
|
|
5
|
+
app:
|
|
6
6
|
environment:
|
|
7
7
|
- UID=${UID}
|
|
8
8
|
- COELACANTH_REMOTE_CLIENT_AUTHORIZATION=${COELACANTH_REMOTE_CLIENT_AUTHORIZATION:-}
|
|
9
9
|
- COELACANTH_REMOTE_CLIENT_USER_AGENT=${COELACANTH_REMOTE_CLIENT_USER_AGENT:-}
|
|
10
10
|
- COELACANTH_SCREENSHOT_ONE_API_KEY=${COELACANTH_SCREENSHOT_ONE_API_KEY:-}
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
11
|
+
- COELACANTH_GOTENBERG_URL=${COELACANTH_GOTENBERG_URL:-http://gotenberg:3000}
|
|
12
|
+
- COELACANTH_GOTENBERG_WAIT_DELAY=${COELACANTH_GOTENBERG_WAIT_DELAY:-}
|
|
13
|
+
- COELACANTH_GOTENBERG_USER_AGENT=${COELACANTH_GOTENBERG_USER_AGENT:-}
|
|
14
|
+
tty: true
|
|
15
|
+
stdin_open: true
|
|
16
|
+
build:
|
|
17
|
+
context: .
|
|
18
|
+
dockerfile: Dockerfile
|
|
19
|
+
volumes:
|
|
20
|
+
- ./:/app:cached
|
|
21
|
+
working_dir: /app
|
|
22
|
+
command: bash
|
|
23
|
+
depends_on:
|
|
24
|
+
- gotenberg
|
|
25
|
+
networks:
|
|
26
|
+
- app-tier
|
|
27
|
+
chrome:
|
|
28
|
+
image: browserless/chrome:latest
|
|
29
|
+
networks:
|
|
30
|
+
- app-tier
|
|
31
|
+
gotenberg:
|
|
32
|
+
image: gotenberg/gotenberg:8
|
|
33
|
+
networks:
|
|
34
|
+
- app-tier
|
data/config/coelacanth.yml
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
development: &development
|
|
2
|
-
client: "ferrum" # Options: "ferrum", "screenshot_one"
|
|
2
|
+
client: "ferrum" # Options: "ferrum", "screenshot_one", "gotenberg"
|
|
3
3
|
remote_client:
|
|
4
4
|
ws_url: "ws://chrome:3000/chrome"
|
|
5
5
|
timeout: 10 # seconds
|
|
@@ -11,6 +11,13 @@ development: &development
|
|
|
11
11
|
User-Agent: "<%= ENV.fetch("COELACANTH_REMOTE_CLIENT_USER_AGENT", "Coelacanth Chrome Extension") %>"
|
|
12
12
|
screenshot_one:
|
|
13
13
|
key: "<%= ENV.fetch("COELACANTH_SCREENSHOT_ONE_API_KEY", "your_screenshot_one_api_key_here") %>"
|
|
14
|
+
gotenberg:
|
|
15
|
+
url: "<%= ENV.fetch("COELACANTH_GOTENBERG_URL", "http://gotenberg:3000") %>"
|
|
16
|
+
open_timeout: 5
|
|
17
|
+
read_timeout: 30
|
|
18
|
+
wait_delay: "<%= ENV.fetch("COELACANTH_GOTENBERG_WAIT_DELAY", "") %>"
|
|
19
|
+
user_agent: "<%= ENV.fetch("COELACANTH_GOTENBERG_USER_AGENT", "") %>"
|
|
20
|
+
extra_http_headers:
|
|
14
21
|
youtube:
|
|
15
22
|
api_key: "<%= ENV.fetch("COELACANTH_YOUTUBE_API_KEY", "") %>"
|
|
16
23
|
morphology:
|
|
data/lib/coelacanth/client/gotenberg.rb
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "json"
|
|
4
|
+
require "net/http"
|
|
5
|
+
require "uri"
|
|
6
|
+
|
|
7
|
+
require_relative "../http"
|
|
8
|
+
|
|
9
|
+
module Coelacanth::Client
|
|
10
|
+
# Client for capturing screenshots through a Gotenberg Chromium service.
|
|
11
|
+
class Gotenberg < Coelacanth::Client::Base
|
|
12
|
+
SCREENSHOT_URL_PATH = "/forms/chromium/screenshot/url"
|
|
13
|
+
|
|
14
|
+
def get_response
|
|
15
|
+
uri = URI.parse(@url)
|
|
16
|
+
response = Coelacanth::HTTP.get_response(
|
|
17
|
+
uri,
|
|
18
|
+
open_timeout: Coelacanth::HTTP::DEFAULT_OPEN_TIMEOUT,
|
|
19
|
+
read_timeout: Coelacanth::HTTP::DEFAULT_READ_TIMEOUT
|
|
20
|
+
)
|
|
21
|
+
@origin_response = response
|
|
22
|
+
@status_code = response.code.to_i
|
|
23
|
+
|
|
24
|
+
return response.body if response.is_a?(Net::HTTPSuccess)
|
|
25
|
+
|
|
26
|
+
Coelacanth::HTTP.raise_http_error(uri, response)
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
def get_screenshot
|
|
30
|
+
response = Net::HTTP.start(
|
|
31
|
+
endpoint_uri.host,
|
|
32
|
+
endpoint_uri.port,
|
|
33
|
+
use_ssl: endpoint_uri.scheme == "https",
|
|
34
|
+
open_timeout: open_timeout,
|
|
35
|
+
read_timeout: read_timeout
|
|
36
|
+
) do |http|
|
|
37
|
+
http.request(screenshot_request)
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
return response.body if response.is_a?(Net::HTTPSuccess)
|
|
41
|
+
|
|
42
|
+
raise "Failed to fetch screenshot from Gotenberg: #{response.code} #{response.message}"
|
|
43
|
+
rescue Net::OpenTimeout, Net::ReadTimeout, Timeout::Error => e
|
|
44
|
+
raise Coelacanth::TimeoutError, "Gotenberg screenshot request timed out: #{e.message}"
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
private
|
|
48
|
+
|
|
49
|
+
def screenshot_request
|
|
50
|
+
request = Net::HTTP::Post.new(endpoint_uri)
|
|
51
|
+
request.set_form(gotenberg_form_fields, "multipart/form-data")
|
|
52
|
+
request
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
def gotenberg_form_fields
|
|
56
|
+
fields = [["url", @url]]
|
|
57
|
+
wait_delay = @config.read("gotenberg.wait_delay")
|
|
58
|
+
user_agent = @config.read("gotenberg.user_agent")
|
|
59
|
+
extra_http_headers = @config.read("gotenberg.extra_http_headers")
|
|
60
|
+
|
|
61
|
+
fields << ["waitDelay", wait_delay] if present?(wait_delay)
|
|
62
|
+
fields << ["userAgent", user_agent] if present?(user_agent)
|
|
63
|
+
if extra_http_headers && extra_http_headers.any?
|
|
64
|
+
fields << ["extraHttpHeaders", extra_http_headers.to_json]
|
|
65
|
+
end
|
|
66
|
+
fields
|
|
67
|
+
end
|
|
68
|
+
|
|
69
|
+
def endpoint_uri
|
|
70
|
+
@endpoint_uri ||= URI.join(base_url, SCREENSHOT_URL_PATH)
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
def base_url
|
|
74
|
+
@config.read("gotenberg.url") || "http://localhost:3000"
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
def open_timeout
|
|
78
|
+
@config.read("gotenberg.open_timeout") || Coelacanth::HTTP::DEFAULT_OPEN_TIMEOUT
|
|
79
|
+
end
|
|
80
|
+
|
|
81
|
+
def read_timeout
|
|
82
|
+
@config.read("gotenberg.read_timeout") || 30
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
def present?(value)
|
|
86
|
+
!value.nil? && value.to_s.strip != ""
|
|
87
|
+
end
|
|
88
|
+
end
|
|
89
|
+
end
|
data/lib/coelacanth/version.rb
CHANGED
data/lib/coelacanth.rb
CHANGED
|
@@ -4,6 +4,7 @@ require "net/http"
|
|
|
4
4
|
require_relative "coelacanth/configure"
|
|
5
5
|
require_relative "coelacanth/client/base"
|
|
6
6
|
require_relative "coelacanth/client/ferrum"
|
|
7
|
+
require_relative "coelacanth/client/gotenberg"
|
|
7
8
|
require_relative "coelacanth/client/screenshot_one"
|
|
8
9
|
require_relative "coelacanth/dom"
|
|
9
10
|
require_relative "coelacanth/extractor"
|
|
@@ -21,7 +22,7 @@ module Coelacanth
|
|
|
21
22
|
class RobotsDisallowedError < StandardError; end
|
|
22
23
|
|
|
23
24
|
def self.analyze(url)
|
|
24
|
-
client_class = config.read("client")
|
|
25
|
+
client_class = client_class_for(config.read("client"))
|
|
25
26
|
@client = client_class.new(url)
|
|
26
27
|
regular_url = Redirect.new.resolve_redirect(url)
|
|
27
28
|
response = begin
|
|
@@ -47,7 +48,24 @@ module Coelacanth
|
|
|
47
48
|
}
|
|
48
49
|
end
|
|
49
50
|
|
|
51
|
+
def self.client_class_for(client_name)
|
|
52
|
+
case client_name
|
|
53
|
+
when "screenshot_one"
|
|
54
|
+
Client::ScreenshotOne
|
|
55
|
+
when "gotenberg"
|
|
56
|
+
Client::Gotenberg
|
|
57
|
+
else
|
|
58
|
+
Client::Ferrum
|
|
59
|
+
end
|
|
60
|
+
end
|
|
61
|
+
|
|
50
62
|
def self.config
|
|
51
63
|
@config ||= Configure.new
|
|
52
64
|
end
|
|
65
|
+
|
|
66
|
+
def self.morphological_analysis(text, title: nil)
|
|
67
|
+
Extractor::MorphologicalAnalyzer
|
|
68
|
+
.new(config: config)
|
|
69
|
+
.call_text(text, title: title)
|
|
70
|
+
end
|
|
53
71
|
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: coelacanth
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.5.0
|
|
4
|
+
version: 0.6.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Yusuke
|
|
@@ -33,6 +33,7 @@ files:
|
|
|
33
33
|
- lib/coelacanth.rb
|
|
34
34
|
- lib/coelacanth/client/base.rb
|
|
35
35
|
- lib/coelacanth/client/ferrum.rb
|
|
36
|
+
- lib/coelacanth/client/gotenberg.rb
|
|
36
37
|
- lib/coelacanth/client/screenshot_one.rb
|
|
37
38
|
- lib/coelacanth/configure.rb
|
|
38
39
|
- lib/coelacanth/dom.rb
|