coelacanth 0.5.1 → 0.6.1

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f6e9c7c1d9049b351076d0238e2b5747144b73244e3f0c37d2ce69fb625b823a
4
- data.tar.gz: d5ef6a678e876b1fabfffd68426785ba6bb4174178e323adbe9593bf9c6ff57a
3
+ metadata.gz: 94493e66b4aae51d1790f03eb89c95d2ec1861b9d150c1001bba7554a1b37ab7
4
+ data.tar.gz: acdd939524543ec62740e153774c44f5fcdee11d30a9bd5ca4e6bcabe2f65398
5
5
  SHA512:
6
- metadata.gz: 5f9c343b0ddb3a58a78af386715c8c99d905e2161905684438972fc177c8943d2c9a51ebadeffb9393162d83e914436ea134e5c78a3affe8db379f3a65fc3a65
7
- data.tar.gz: cc6fda69757f79b5a53de36f0ead0315cf1598a4cc372f4e8d2c71cd021c1cf7370aacb583111cdaf0e5bcd711eb445c2d7c673705578217599bd94e7b1fc2ec
6
+ metadata.gz: 05b8789bd0bf8a23cb678b9f17349a403c0f1e7017903dea14ae13dbea1df1204a3a5cdd184c70e86e0d7366dce67cabbbd528846c2c498953088c1309830421
7
+ data.tar.gz: 047d1acabd38e890579947bd7311178ab0197089acd0185d5dbfa0628bbe6a5ad003c330e786c2d82d44fbada5b3ed13882178b50cba86c74ed2e2e12a527199
data/CHANGELOG.md CHANGED
@@ -4,8 +4,11 @@ All notable changes to this project will be documented in this file.
4
4
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
5
5
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
6
6
 
7
- ## [v0.5.1] - 2025-11-08
8
- ### :sparkles: New Features
9
- - [`726419e`](https://github.com/slidict/coelacanth/commit/726419e6b5fcb6b887d3881532328fdab25e3f0d) - expose standalone morphological analysis *(commit by [@yubele](https://github.com/yubele))*
7
+ ## [v0.6.1] - 2026-06-15
8
+ ### :bug: Bug Fixes
9
+ - [`f387ef9`](https://github.com/slidict/coelacanth/commit/f387ef997e0c383458a4fd0384f74883fcb63f37) - Update gem-push.yml *(commit by [@abechan1](https://github.com/abechan1))*
10
10
 
11
- [v0.5.1]: https://github.com/slidict/coelacanth/compare/v0.5.0...v0.5.1
11
+ ### :wrench: Chores
12
+ - [`4ad8456`](https://github.com/slidict/coelacanth/commit/4ad8456deba3aea4e6eea8836d6bdf728fbbdb70) - Bump version to 0.6.1 *(commit by [@abechan1](https://github.com/abechan1))*
13
+
14
+ [v0.6.1]: https://github.com/slidict/coelacanth/compare/v0.6.0...v0.6.1
data/README.md CHANGED
@@ -82,6 +82,7 @@ result[:extraction] # => article metadata and body markdown
82
82
  result[:dom] # => Oga DOM representation for downstream processing
83
83
  result[:screenshot] # => PNG screenshot as a binary string
84
84
  result[:response] # => HTTP status, headers, and final URL
85
+ ```
85
86
 
86
87
  # Plain-text morphology
87
88
 
@@ -96,7 +97,6 @@ Coelacanth.morphological_analysis("これはテストです。 Testing morpholog
96
97
  # ...
97
98
  # ]
98
99
  ```
99
- ```
100
100
 
101
101
  The returned hash includes:
102
102
 
@@ -143,7 +143,7 @@ Runtime configuration is stored in `config/coelacanth.yml`. Environments inherit
143
143
 
144
144
  ```yaml
145
145
  development:
146
- client: "ferrum" # Options: "ferrum", "screenshot_one"
146
+ client: "ferrum" # Options: "ferrum", "screenshot_one", "gotenberg"
147
147
  remote_client:
148
148
  ws_url: "ws://chrome:3000/chrome"
149
149
  timeout: 10
@@ -155,6 +155,13 @@ development:
155
155
  User-Agent: "<%= ENV.fetch("COELACANTH_REMOTE_CLIENT_USER_AGENT", "Coelacanth Chrome Extension") %>"
156
156
  screenshot_one:
157
157
  key: "<%= ENV.fetch("COELACANTH_SCREENSHOT_ONE_API_KEY", "your_screenshot_one_api_key_here") %>"
158
+ gotenberg:
159
+ url: "<%= ENV.fetch("COELACANTH_GOTENBERG_URL", "http://gotenberg:3000") %>"
160
+ open_timeout: 5
161
+ read_timeout: 30
162
+ wait_delay: "<%= ENV.fetch("COELACANTH_GOTENBERG_WAIT_DELAY", "") %>"
163
+ user_agent: "<%= ENV.fetch("COELACANTH_GOTENBERG_USER_AGENT", "") %>"
164
+ extra_http_headers:
158
165
  youtube:
159
166
  api_key: "<%= ENV.fetch("COELACANTH_YOUTUBE_API_KEY", "") %>"
160
167
  morphology:
@@ -171,6 +178,9 @@ development:
171
178
  - **Ferrum client** – Requires a running Chrome instance that exposes the DevTools protocol via WebSocket. Configure the URL,
172
179
  timeout, the network idle timeout, and any headers to inject.
173
180
  - **ScreenshotOne client** – Supply an API key to offload screenshot capture to [ScreenshotOne](https://screenshotone.com/).
181
+ - **Gotenberg client** – Set `client: "gotenberg"` to capture screenshots through Gotenberg's Chromium URL screenshot
182
+ endpoint. Configure `gotenberg.url`, request timeouts, an optional `wait_delay`, an optional screenshot `user_agent`,
183
+ and optional `extra_http_headers`.
174
184
  - **Eyecatch image extraction** – Representative images are discovered automatically by checking Open Graph/Twitter metadata,
175
185
  Schema.org JSON-LD payloads, and high-signal `<img>` elements (hero/cover images, large dimensions, etc.). No manual XPath
176
186
  maintenance is required.
@@ -207,6 +217,9 @@ export COELACANTH_REMOTE_CLIENT_AUTHORIZATION="Bearer <token>"
207
217
 
208
218
  export COELACANTH_REMOTE_CLIENT_USER_AGENT="Coelacanth Chrome Extension"
209
219
  export COELACANTH_SCREENSHOT_ONE_API_KEY="your_screenshot_one_api_key_here"
220
+ export COELACANTH_GOTENBERG_URL="http://gotenberg:3000"
221
+ export COELACANTH_GOTENBERG_WAIT_DELAY="2s"
222
+ export COELACANTH_GOTENBERG_USER_AGENT="Coelacanth Chrome Extension"
210
223
  export COELACANTH_YOUTUBE_API_KEY="your_youtube_data_api_key"
211
224
  ```
212
225
 
@@ -230,6 +243,10 @@ YouTube, so non-video pages continue to behave as before.
230
243
  When using Docker Compose, you can create a `.env` file or export the variables in your environment so the `app` service picks
231
244
  them up automatically.
232
245
 
246
+ Docker Compose also starts a `gotenberg` service and passes `COELACANTH_GOTENBERG_URL` to the `app` service by default.
247
+ To try that path locally, set `client: "gotenberg"` in `config/coelacanth.yml` or export the equivalent environment-specific
248
+ configuration before running the app container.
249
+
233
250
  If you are working inside Docker, make sure the `UID` environment variable matches your host user by exporting it in your shell
234
251
  startup file:
235
252
 
data/compose.yml CHANGED
@@ -1,25 +1,34 @@
1
- networks:
2
- app-tier:
3
- driver: bridge
4
- services:
5
- app:
1
+ networks:
2
+ app-tier:
3
+ driver: bridge
4
+ services:
5
+ app:
6
6
  environment:
7
7
  - UID=${UID}
8
8
  - COELACANTH_REMOTE_CLIENT_AUTHORIZATION=${COELACANTH_REMOTE_CLIENT_AUTHORIZATION:-}
9
9
  - COELACANTH_REMOTE_CLIENT_USER_AGENT=${COELACANTH_REMOTE_CLIENT_USER_AGENT:-}
10
10
  - COELACANTH_SCREENSHOT_ONE_API_KEY=${COELACANTH_SCREENSHOT_ONE_API_KEY:-}
11
- tty: true
12
- stdin_open: true
13
- build:
14
- context: .
15
- dockerfile: Dockerfile
16
- volumes:
17
- - ./:/app:cached
18
- working_dir: /app
19
- command: bash
20
- networks:
21
- - app-tier
22
- chrome:
23
- image: browserless/chrome:latest
24
- networks:
25
- - app-tier
11
+ - COELACANTH_GOTENBERG_URL=${COELACANTH_GOTENBERG_URL:-http://gotenberg:3000}
12
+ - COELACANTH_GOTENBERG_WAIT_DELAY=${COELACANTH_GOTENBERG_WAIT_DELAY:-}
13
+ - COELACANTH_GOTENBERG_USER_AGENT=${COELACANTH_GOTENBERG_USER_AGENT:-}
14
+ tty: true
15
+ stdin_open: true
16
+ build:
17
+ context: .
18
+ dockerfile: Dockerfile
19
+ volumes:
20
+ - ./:/app:cached
21
+ working_dir: /app
22
+ command: bash
23
+ depends_on:
24
+ - gotenberg
25
+ networks:
26
+ - app-tier
27
+ chrome:
28
+ image: browserless/chrome:latest
29
+ networks:
30
+ - app-tier
31
+ gotenberg:
32
+ image: gotenberg/gotenberg:8
33
+ networks:
34
+ - app-tier
@@ -1,5 +1,5 @@
1
1
  development: &development
2
- client: "ferrum" # Options: "ferrum", "screenshot_one"
2
+ client: "ferrum" # Options: "ferrum", "screenshot_one", "gotenberg"
3
3
  remote_client:
4
4
  ws_url: "ws://chrome:3000/chrome"
5
5
  timeout: 10 # seconds
@@ -11,6 +11,13 @@ development: &development
11
11
  User-Agent: "<%= ENV.fetch("COELACANTH_REMOTE_CLIENT_USER_AGENT", "Coelacanth Chrome Extension") %>"
12
12
  screenshot_one:
13
13
  key: "<%= ENV.fetch("COELACANTH_SCREENSHOT_ONE_API_KEY", "your_screenshot_one_api_key_here") %>"
14
+ gotenberg:
15
+ url: "<%= ENV.fetch("COELACANTH_GOTENBERG_URL", "http://gotenberg:3000") %>"
16
+ open_timeout: 5
17
+ read_timeout: 30
18
+ wait_delay: "<%= ENV.fetch("COELACANTH_GOTENBERG_WAIT_DELAY", "") %>"
19
+ user_agent: "<%= ENV.fetch("COELACANTH_GOTENBERG_USER_AGENT", "") %>"
20
+ extra_http_headers:
14
21
  youtube:
15
22
  api_key: "<%= ENV.fetch("COELACANTH_YOUTUBE_API_KEY", "") %>"
16
23
  morphology:
data/lib/coelacanth/client/ferrum.rb CHANGED
@@ -5,8 +5,8 @@ require "ferrum"
5
5
  module Coelacanth::Client
6
6
  # Coelacanth::Client
7
7
  class Ferrum < Coelacanth::Client::Base
8
- def initialize(url)
9
- super(url)
8
+ def initialize(url, config = Coelacanth.config)
9
+ super(url, config)
10
10
  remote_client.goto(url)
11
11
  end
12
12
 
data/lib/coelacanth/client/gotenberg.rb ADDED
@@ -0,0 +1,89 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+ require "net/http"
5
+ require "uri"
6
+
7
+ require_relative "../http"
8
+
9
+ module Coelacanth::Client
10
+ # Client for capturing screenshots through a Gotenberg Chromium service.
11
+ class Gotenberg < Coelacanth::Client::Base
12
+ SCREENSHOT_URL_PATH = "/forms/chromium/screenshot/url"
13
+
14
+ def get_response
15
+ uri = URI.parse(@url)
16
+ response = Coelacanth::HTTP.get_response(
17
+ uri,
18
+ open_timeout: Coelacanth::HTTP::DEFAULT_OPEN_TIMEOUT,
19
+ read_timeout: Coelacanth::HTTP::DEFAULT_READ_TIMEOUT
20
+ )
21
+ @origin_response = response
22
+ @status_code = response.code.to_i
23
+
24
+ return response.body if response.is_a?(Net::HTTPSuccess)
25
+
26
+ Coelacanth::HTTP.raise_http_error(uri, response)
27
+ end
28
+
29
+ def get_screenshot
30
+ response = Net::HTTP.start(
31
+ endpoint_uri.host,
32
+ endpoint_uri.port,
33
+ use_ssl: endpoint_uri.scheme == "https",
34
+ open_timeout: open_timeout,
35
+ read_timeout: read_timeout
36
+ ) do |http|
37
+ http.request(screenshot_request)
38
+ end
39
+
40
+ return response.body if response.is_a?(Net::HTTPSuccess)
41
+
42
+ raise "Failed to fetch screenshot from Gotenberg: #{response.code} #{response.message}"
43
+ rescue Net::OpenTimeout, Net::ReadTimeout, Timeout::Error => e
44
+ raise Coelacanth::TimeoutError, "Gotenberg screenshot request timed out: #{e.message}"
45
+ end
46
+
47
+ private
48
+
49
+ def screenshot_request
50
+ request = Net::HTTP::Post.new(endpoint_uri)
51
+ request.set_form(gotenberg_form_fields, "multipart/form-data")
52
+ request
53
+ end
54
+
55
+ def gotenberg_form_fields
56
+ fields = [["url", @url]]
57
+ wait_delay = @config.read("gotenberg.wait_delay")
58
+ user_agent = @config.read("gotenberg.user_agent")
59
+ extra_http_headers = @config.read("gotenberg.extra_http_headers")
60
+
61
+ fields << ["waitDelay", wait_delay] if present?(wait_delay)
62
+ fields << ["userAgent", user_agent] if present?(user_agent)
63
+ if extra_http_headers && extra_http_headers.any?
64
+ fields << ["extraHttpHeaders", extra_http_headers.to_json]
65
+ end
66
+ fields
67
+ end
68
+
69
+ def endpoint_uri
70
+ @endpoint_uri ||= URI.join(base_url, SCREENSHOT_URL_PATH)
71
+ end
72
+
73
+ def base_url
74
+ @config.read("gotenberg.url") || "http://localhost:3000"
75
+ end
76
+
77
+ def open_timeout
78
+ @config.read("gotenberg.open_timeout") || Coelacanth::HTTP::DEFAULT_OPEN_TIMEOUT
79
+ end
80
+
81
+ def read_timeout
82
+ @config.read("gotenberg.read_timeout") || 30
83
+ end
84
+
85
+ def present?(value)
86
+ !value.nil? && value.to_s.strip != ""
87
+ end
88
+ end
89
+ end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Coelacanth
4
- VERSION = "0.5.1"
4
+ VERSION = "0.6.1"
5
5
  end
data/lib/coelacanth.rb CHANGED
@@ -4,6 +4,7 @@ require "net/http"
4
4
  require_relative "coelacanth/configure"
5
5
  require_relative "coelacanth/client/base"
6
6
  require_relative "coelacanth/client/ferrum"
7
+ require_relative "coelacanth/client/gotenberg"
7
8
  require_relative "coelacanth/client/screenshot_one"
8
9
  require_relative "coelacanth/dom"
9
10
  require_relative "coelacanth/extractor"
@@ -21,7 +22,7 @@ module Coelacanth
21
22
  class RobotsDisallowedError < StandardError; end
22
23
 
23
24
  def self.analyze(url)
24
- client_class = config.read("client") == "screenshot_one" ? Client::ScreenshotOne : Client::Ferrum
25
+ client_class = client_class_for(config.read("client"))
25
26
  @client = client_class.new(url)
26
27
  regular_url = Redirect.new.resolve_redirect(url)
27
28
  response = begin
@@ -47,6 +48,17 @@ module Coelacanth
47
48
  }
48
49
  end
49
50
 
51
+ def self.client_class_for(client_name)
52
+ case client_name
53
+ when "screenshot_one"
54
+ Client::ScreenshotOne
55
+ when "gotenberg"
56
+ Client::Gotenberg
57
+ else
58
+ Client::Ferrum
59
+ end
60
+ end
61
+
50
62
  def self.config
51
63
  @config ||= Configure.new
52
64
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: coelacanth
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.1
4
+ version: 0.6.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yusuke
@@ -33,6 +33,7 @@ files:
33
33
  - lib/coelacanth.rb
34
34
  - lib/coelacanth/client/base.rb
35
35
  - lib/coelacanth/client/ferrum.rb
36
+ - lib/coelacanth/client/gotenberg.rb
36
37
  - lib/coelacanth/client/screenshot_one.rb
37
38
  - lib/coelacanth/configure.rb
38
39
  - lib/coelacanth/dom.rb