coelacanth 0.5.1 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -4
- data/README.md +19 -2
- data/compose.yml +29 -20
- data/config/coelacanth.yml +8 -1
- data/lib/coelacanth/client/ferrum.rb +2 -2
- data/lib/coelacanth/client/gotenberg.rb +89 -0
- data/lib/coelacanth/version.rb +1 -1
- data/lib/coelacanth.rb +13 -1
- metadata +2 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 94493e66b4aae51d1790f03eb89c95d2ec1861b9d150c1001bba7554a1b37ab7
|
|
4
|
+
data.tar.gz: acdd939524543ec62740e153774c44f5fcdee11d30a9bd5ca4e6bcabe2f65398
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 05b8789bd0bf8a23cb678b9f17349a403c0f1e7017903dea14ae13dbea1df1204a3a5cdd184c70e86e0d7366dce67cabbbd528846c2c498953088c1309830421
|
|
7
|
+
data.tar.gz: 047d1acabd38e890579947bd7311178ab0197089acd0185d5dbfa0628bbe6a5ad003c330e786c2d82d44fbada5b3ed13882178b50cba86c74ed2e2e12a527199
|
data/CHANGELOG.md
CHANGED
|
@@ -4,8 +4,11 @@ All notable changes to this project will be documented in this file.
|
|
|
4
4
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
5
5
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
6
6
|
|
|
7
|
-
## [v0.
|
|
8
|
-
### :
|
|
9
|
-
- [`
|
|
7
|
+
## [v0.6.1] - 2026-06-15
|
|
8
|
+
### :bug: Bug Fixes
|
|
9
|
+
- [`f387ef9`](https://github.com/slidict/coelacanth/commit/f387ef997e0c383458a4fd0384f74883fcb63f37) - Update gem-push.yml *(commit by [@abechan1](https://github.com/abechan1))*
|
|
10
10
|
|
|
11
|
-
|
|
11
|
+
### :wrench: Chores
|
|
12
|
+
- [`4ad8456`](https://github.com/slidict/coelacanth/commit/4ad8456deba3aea4e6eea8836d6bdf728fbbdb70) - Bump version to 0.6.1 *(commit by [@abechan1](https://github.com/abechan1))*
|
|
13
|
+
|
|
14
|
+
[v0.6.1]: https://github.com/slidict/coelacanth/compare/v0.6.0...v0.6.1
|
data/README.md
CHANGED
|
@@ -82,6 +82,7 @@ result[:extraction] # => article metadata and body markdown
|
|
|
82
82
|
result[:dom] # => Oga DOM representation for downstream processing
|
|
83
83
|
result[:screenshot] # => PNG screenshot as a binary string
|
|
84
84
|
result[:response] # => HTTP status, headers, and final URL
|
|
85
|
+
```
|
|
85
86
|
|
|
86
87
|
# Plain-text morphology
|
|
87
88
|
|
|
@@ -96,7 +97,6 @@ Coelacanth.morphological_analysis("これはテストです。 Testing morpholog
|
|
|
96
97
|
# ...
|
|
97
98
|
# ]
|
|
98
99
|
```
|
|
99
|
-
```
|
|
100
100
|
|
|
101
101
|
The returned hash includes:
|
|
102
102
|
|
|
@@ -143,7 +143,7 @@ Runtime configuration is stored in `config/coelacanth.yml`. Environments inherit
|
|
|
143
143
|
|
|
144
144
|
```yaml
|
|
145
145
|
development:
|
|
146
|
-
client: "ferrum" # Options: "ferrum", "screenshot_one"
|
|
146
|
+
client: "ferrum" # Options: "ferrum", "screenshot_one", "gotenberg"
|
|
147
147
|
remote_client:
|
|
148
148
|
ws_url: "ws://chrome:3000/chrome"
|
|
149
149
|
timeout: 10
|
|
@@ -155,6 +155,13 @@ development:
|
|
|
155
155
|
User-Agent: "<%= ENV.fetch("COELACANTH_REMOTE_CLIENT_USER_AGENT", "Coelacanth Chrome Extension") %>"
|
|
156
156
|
screenshot_one:
|
|
157
157
|
key: "<%= ENV.fetch("COELACANTH_SCREENSHOT_ONE_API_KEY", "your_screenshot_one_api_key_here") %>"
|
|
158
|
+
gotenberg:
|
|
159
|
+
url: "<%= ENV.fetch("COELACANTH_GOTENBERG_URL", "http://gotenberg:3000") %>"
|
|
160
|
+
open_timeout: 5
|
|
161
|
+
read_timeout: 30
|
|
162
|
+
wait_delay: "<%= ENV.fetch("COELACANTH_GOTENBERG_WAIT_DELAY", "") %>"
|
|
163
|
+
user_agent: "<%= ENV.fetch("COELACANTH_GOTENBERG_USER_AGENT", "") %>"
|
|
164
|
+
extra_http_headers:
|
|
158
165
|
youtube:
|
|
159
166
|
api_key: "<%= ENV.fetch("COELACANTH_YOUTUBE_API_KEY", "") %>"
|
|
160
167
|
morphology:
|
|
@@ -171,6 +178,9 @@ development:
|
|
|
171
178
|
- **Ferrum client** – Requires a running Chrome instance that exposes the DevTools protocol via WebSocket. Configure the URL,
|
|
172
179
|
timeout, the network idle timeout, and any headers to inject.
|
|
173
180
|
- **ScreenshotOne client** – Supply an API key to offload screenshot capture to [ScreenshotOne](https://screenshotone.com/).
|
|
181
|
+
- **Gotenberg client** – Set `client: "gotenberg"` to capture screenshots through Gotenberg's Chromium URL screenshot
|
|
182
|
+
endpoint. Configure `gotenberg.url`, request timeouts, an optional `wait_delay`, an optional screenshot `user_agent`,
|
|
183
|
+
and optional `extra_http_headers`.
|
|
174
184
|
- **Eyecatch image extraction** – Representative images are discovered automatically by checking Open Graph/Twitter metadata,
|
|
175
185
|
Schema.org JSON-LD payloads, and high-signal `<img>` elements (hero/cover images, large dimensions, etc.). No manual XPath
|
|
176
186
|
maintenance is required.
|
|
@@ -207,6 +217,9 @@ export COELACANTH_REMOTE_CLIENT_AUTHORIZATION="Bearer <token>"
|
|
|
207
217
|
|
|
208
218
|
export COELACANTH_REMOTE_CLIENT_USER_AGENT="Coelacanth Chrome Extension"
|
|
209
219
|
export COELACANTH_SCREENSHOT_ONE_API_KEY="your_screenshot_one_api_key_here"
|
|
220
|
+
export COELACANTH_GOTENBERG_URL="http://gotenberg:3000"
|
|
221
|
+
export COELACANTH_GOTENBERG_WAIT_DELAY="2s"
|
|
222
|
+
export COELACANTH_GOTENBERG_USER_AGENT="Coelacanth Chrome Extension"
|
|
210
223
|
export COELACANTH_YOUTUBE_API_KEY="your_youtube_data_api_key"
|
|
211
224
|
```
|
|
212
225
|
|
|
@@ -230,6 +243,10 @@ YouTube, so non-video pages continue to behave as before.
|
|
|
230
243
|
When using Docker Compose, you can create a `.env` file or export the variables in your environment so the `app` service picks
|
|
231
244
|
them up automatically.
|
|
232
245
|
|
|
246
|
+
Docker Compose also starts a `gotenberg` service and passes `COELACANTH_GOTENBERG_URL` to the `app` service by default.
|
|
247
|
+
To try that path locally, set `client: "gotenberg"` in `config/coelacanth.yml` or export the equivalent environment-specific
|
|
248
|
+
configuration before running the app container.
|
|
249
|
+
|
|
233
250
|
If you are working inside Docker, make sure the `UID` environment variable matches your host user by exporting it in your shell
|
|
234
251
|
startup file:
|
|
235
252
|
|
data/compose.yml
CHANGED
|
@@ -1,25 +1,34 @@
|
|
|
1
|
-
networks:
|
|
2
|
-
app-tier:
|
|
3
|
-
driver: bridge
|
|
4
|
-
services:
|
|
5
|
-
app:
|
|
1
|
+
networks:
|
|
2
|
+
app-tier:
|
|
3
|
+
driver: bridge
|
|
4
|
+
services:
|
|
5
|
+
app:
|
|
6
6
|
environment:
|
|
7
7
|
- UID=${UID}
|
|
8
8
|
- COELACANTH_REMOTE_CLIENT_AUTHORIZATION=${COELACANTH_REMOTE_CLIENT_AUTHORIZATION:-}
|
|
9
9
|
- COELACANTH_REMOTE_CLIENT_USER_AGENT=${COELACANTH_REMOTE_CLIENT_USER_AGENT:-}
|
|
10
10
|
- COELACANTH_SCREENSHOT_ONE_API_KEY=${COELACANTH_SCREENSHOT_ONE_API_KEY:-}
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
11
|
+
- COELACANTH_GOTENBERG_URL=${COELACANTH_GOTENBERG_URL:-http://gotenberg:3000}
|
|
12
|
+
- COELACANTH_GOTENBERG_WAIT_DELAY=${COELACANTH_GOTENBERG_WAIT_DELAY:-}
|
|
13
|
+
- COELACANTH_GOTENBERG_USER_AGENT=${COELACANTH_GOTENBERG_USER_AGENT:-}
|
|
14
|
+
tty: true
|
|
15
|
+
stdin_open: true
|
|
16
|
+
build:
|
|
17
|
+
context: .
|
|
18
|
+
dockerfile: Dockerfile
|
|
19
|
+
volumes:
|
|
20
|
+
- ./:/app:cached
|
|
21
|
+
working_dir: /app
|
|
22
|
+
command: bash
|
|
23
|
+
depends_on:
|
|
24
|
+
- gotenberg
|
|
25
|
+
networks:
|
|
26
|
+
- app-tier
|
|
27
|
+
chrome:
|
|
28
|
+
image: browserless/chrome:latest
|
|
29
|
+
networks:
|
|
30
|
+
- app-tier
|
|
31
|
+
gotenberg:
|
|
32
|
+
image: gotenberg/gotenberg:8
|
|
33
|
+
networks:
|
|
34
|
+
- app-tier
|
data/config/coelacanth.yml
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
development: &development
|
|
2
|
-
client: "ferrum" # Options: "ferrum", "screenshot_one"
|
|
2
|
+
client: "ferrum" # Options: "ferrum", "screenshot_one", "gotenberg"
|
|
3
3
|
remote_client:
|
|
4
4
|
ws_url: "ws://chrome:3000/chrome"
|
|
5
5
|
timeout: 10 # seconds
|
|
@@ -11,6 +11,13 @@ development: &development
|
|
|
11
11
|
User-Agent: "<%= ENV.fetch("COELACANTH_REMOTE_CLIENT_USER_AGENT", "Coelacanth Chrome Extension") %>"
|
|
12
12
|
screenshot_one:
|
|
13
13
|
key: "<%= ENV.fetch("COELACANTH_SCREENSHOT_ONE_API_KEY", "your_screenshot_one_api_key_here") %>"
|
|
14
|
+
gotenberg:
|
|
15
|
+
url: "<%= ENV.fetch("COELACANTH_GOTENBERG_URL", "http://gotenberg:3000") %>"
|
|
16
|
+
open_timeout: 5
|
|
17
|
+
read_timeout: 30
|
|
18
|
+
wait_delay: "<%= ENV.fetch("COELACANTH_GOTENBERG_WAIT_DELAY", "") %>"
|
|
19
|
+
user_agent: "<%= ENV.fetch("COELACANTH_GOTENBERG_USER_AGENT", "") %>"
|
|
20
|
+
extra_http_headers:
|
|
14
21
|
youtube:
|
|
15
22
|
api_key: "<%= ENV.fetch("COELACANTH_YOUTUBE_API_KEY", "") %>"
|
|
16
23
|
morphology:
|
data/lib/coelacanth/client/gotenberg.rb
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "json"
|
|
4
|
+
require "net/http"
|
|
5
|
+
require "uri"
|
|
6
|
+
|
|
7
|
+
require_relative "../http"
|
|
8
|
+
|
|
9
|
+
module Coelacanth::Client
|
|
10
|
+
# Client for capturing screenshots through a Gotenberg Chromium service.
|
|
11
|
+
class Gotenberg < Coelacanth::Client::Base
|
|
12
|
+
SCREENSHOT_URL_PATH = "/forms/chromium/screenshot/url"
|
|
13
|
+
|
|
14
|
+
def get_response
|
|
15
|
+
uri = URI.parse(@url)
|
|
16
|
+
response = Coelacanth::HTTP.get_response(
|
|
17
|
+
uri,
|
|
18
|
+
open_timeout: Coelacanth::HTTP::DEFAULT_OPEN_TIMEOUT,
|
|
19
|
+
read_timeout: Coelacanth::HTTP::DEFAULT_READ_TIMEOUT
|
|
20
|
+
)
|
|
21
|
+
@origin_response = response
|
|
22
|
+
@status_code = response.code.to_i
|
|
23
|
+
|
|
24
|
+
return response.body if response.is_a?(Net::HTTPSuccess)
|
|
25
|
+
|
|
26
|
+
Coelacanth::HTTP.raise_http_error(uri, response)
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
def get_screenshot
|
|
30
|
+
response = Net::HTTP.start(
|
|
31
|
+
endpoint_uri.host,
|
|
32
|
+
endpoint_uri.port,
|
|
33
|
+
use_ssl: endpoint_uri.scheme == "https",
|
|
34
|
+
open_timeout: open_timeout,
|
|
35
|
+
read_timeout: read_timeout
|
|
36
|
+
) do |http|
|
|
37
|
+
http.request(screenshot_request)
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
return response.body if response.is_a?(Net::HTTPSuccess)
|
|
41
|
+
|
|
42
|
+
raise "Failed to fetch screenshot from Gotenberg: #{response.code} #{response.message}"
|
|
43
|
+
rescue Net::OpenTimeout, Net::ReadTimeout, Timeout::Error => e
|
|
44
|
+
raise Coelacanth::TimeoutError, "Gotenberg screenshot request timed out: #{e.message}"
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
private
|
|
48
|
+
|
|
49
|
+
def screenshot_request
|
|
50
|
+
request = Net::HTTP::Post.new(endpoint_uri)
|
|
51
|
+
request.set_form(gotenberg_form_fields, "multipart/form-data")
|
|
52
|
+
request
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
def gotenberg_form_fields
|
|
56
|
+
fields = [["url", @url]]
|
|
57
|
+
wait_delay = @config.read("gotenberg.wait_delay")
|
|
58
|
+
user_agent = @config.read("gotenberg.user_agent")
|
|
59
|
+
extra_http_headers = @config.read("gotenberg.extra_http_headers")
|
|
60
|
+
|
|
61
|
+
fields << ["waitDelay", wait_delay] if present?(wait_delay)
|
|
62
|
+
fields << ["userAgent", user_agent] if present?(user_agent)
|
|
63
|
+
if extra_http_headers && extra_http_headers.any?
|
|
64
|
+
fields << ["extraHttpHeaders", extra_http_headers.to_json]
|
|
65
|
+
end
|
|
66
|
+
fields
|
|
67
|
+
end
|
|
68
|
+
|
|
69
|
+
def endpoint_uri
|
|
70
|
+
@endpoint_uri ||= URI.join(base_url, SCREENSHOT_URL_PATH)
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
def base_url
|
|
74
|
+
@config.read("gotenberg.url") || "http://localhost:3000"
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
def open_timeout
|
|
78
|
+
@config.read("gotenberg.open_timeout") || Coelacanth::HTTP::DEFAULT_OPEN_TIMEOUT
|
|
79
|
+
end
|
|
80
|
+
|
|
81
|
+
def read_timeout
|
|
82
|
+
@config.read("gotenberg.read_timeout") || 30
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
def present?(value)
|
|
86
|
+
!value.nil? && value.to_s.strip != ""
|
|
87
|
+
end
|
|
88
|
+
end
|
|
89
|
+
end
|
data/lib/coelacanth/version.rb
CHANGED
data/lib/coelacanth.rb
CHANGED
|
@@ -4,6 +4,7 @@ require "net/http"
|
|
|
4
4
|
require_relative "coelacanth/configure"
|
|
5
5
|
require_relative "coelacanth/client/base"
|
|
6
6
|
require_relative "coelacanth/client/ferrum"
|
|
7
|
+
require_relative "coelacanth/client/gotenberg"
|
|
7
8
|
require_relative "coelacanth/client/screenshot_one"
|
|
8
9
|
require_relative "coelacanth/dom"
|
|
9
10
|
require_relative "coelacanth/extractor"
|
|
@@ -21,7 +22,7 @@ module Coelacanth
|
|
|
21
22
|
class RobotsDisallowedError < StandardError; end
|
|
22
23
|
|
|
23
24
|
def self.analyze(url)
|
|
24
|
-
client_class = config.read("client")
|
|
25
|
+
client_class = client_class_for(config.read("client"))
|
|
25
26
|
@client = client_class.new(url)
|
|
26
27
|
regular_url = Redirect.new.resolve_redirect(url)
|
|
27
28
|
response = begin
|
|
@@ -47,6 +48,17 @@ module Coelacanth
|
|
|
47
48
|
}
|
|
48
49
|
end
|
|
49
50
|
|
|
51
|
+
def self.client_class_for(client_name)
|
|
52
|
+
case client_name
|
|
53
|
+
when "screenshot_one"
|
|
54
|
+
Client::ScreenshotOne
|
|
55
|
+
when "gotenberg"
|
|
56
|
+
Client::Gotenberg
|
|
57
|
+
else
|
|
58
|
+
Client::Ferrum
|
|
59
|
+
end
|
|
60
|
+
end
|
|
61
|
+
|
|
50
62
|
def self.config
|
|
51
63
|
@config ||= Configure.new
|
|
52
64
|
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: coelacanth
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.5.1
|
|
4
|
+
version: 0.6.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Yusuke
|
|
@@ -33,6 +33,7 @@ files:
|
|
|
33
33
|
- lib/coelacanth.rb
|
|
34
34
|
- lib/coelacanth/client/base.rb
|
|
35
35
|
- lib/coelacanth/client/ferrum.rb
|
|
36
|
+
- lib/coelacanth/client/gotenberg.rb
|
|
36
37
|
- lib/coelacanth/client/screenshot_one.rb
|
|
37
38
|
- lib/coelacanth/configure.rb
|
|
38
39
|
- lib/coelacanth/dom.rb
|