coelacanth 0.4.3 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -4
- data/README.md +69 -2
- data/config/coelacanth.yml +21 -0
- data/lib/coelacanth/extractor/eyecatch_image_extractor.rb +384 -0
- data/lib/coelacanth/extractor/morphological_analyzer.rb +552 -0
- data/lib/coelacanth/extractor/preprocessor.rb +166 -0
- data/lib/coelacanth/extractor.rb +41 -6
- data/lib/coelacanth/http.rb +28 -2
- data/lib/coelacanth/version.rb +1 -1
- data/lib/coelacanth.rb +7 -1
- metadata +4 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 06a629b2865e5c4be5508a92637b2824bce0922b2de1209cee7c8f358ea8b438
|
|
4
|
+
data.tar.gz: 43eac188f8c3d27e975753ff459c444d9ca49dc6e83aa8d067dca75b3223db87
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 4dc3c36802dce0be0e9deb9debdeccb5840bafa44a2613e6a59a270242b16f7f977d44b66a1e47472b9edf5a2a4026d057ffc91a490a776118dd493490e5ca9f
|
|
7
|
+
data.tar.gz: 3e558a85ab45b8f738be4c993413c279c7cb46bca7e44399e8fd40d1aa5e764b90865bbc51c7b38ad62be7501605e4f9d6edd3c0e8885ce8fe81830fda36d362
|
data/CHANGELOG.md
CHANGED
|
@@ -4,8 +4,9 @@ All notable changes to this project will be documented in this file.
|
|
|
4
4
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
5
5
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
6
6
|
|
|
7
|
-
## [v0.
|
|
8
|
-
### :
|
|
9
|
-
- [`
|
|
7
|
+
## [v0.5.0] - 2025-11-08
|
|
8
|
+
### :sparkles: New Features
|
|
9
|
+
- [`d34ef32`](https://github.com/slidict/coelacanth/commit/d34ef32dbb969f7ef86dce6cd587c44a848ee32d) - add YouTube preprocessing support *(commit by [@yubele](https://github.com/yubele))*
|
|
10
|
+
- [`2a566ad`](https://github.com/slidict/coelacanth/commit/2a566adeaaa5b813fded4b9ebd8ce8d90d43ee7c) - add morphological analysis for body markdown *(commit by [@yubele](https://github.com/yubele))*
|
|
10
11
|
|
|
11
|
-
[v0.
|
|
12
|
+
[v0.5.0]: https://github.com/slidict/coelacanth/compare/v0.4.3...v0.5.0
|
data/README.md
CHANGED
|
@@ -81,14 +81,33 @@ result = Coelacanth.analyze("https://example.com/article")
|
|
|
81
81
|
result[:extraction] # => article metadata and body markdown
|
|
82
82
|
result[:dom] # => Oga DOM representation for downstream processing
|
|
83
83
|
result[:screenshot] # => PNG screenshot as a binary string
|
|
84
|
+
result[:response] # => HTTP status, headers, and final URL
|
|
84
85
|
```
|
|
85
86
|
|
|
86
87
|
The returned hash includes:
|
|
87
88
|
|
|
88
|
-
- `:extraction` – output from `Coelacanth::Extractor`, including title, Markdown body (`body_markdown
|
|
89
|
-
`body_markdown_list
|
|
89
|
+
- `:extraction` – output from `Coelacanth::Extractor`, including title, Markdown body (`body_markdown`,
|
|
90
|
+
`body_markdown_list`, and scored morphemes in `body_morphemes`), the normalized plain-text body (`body_text`),
|
|
91
|
+
images, listings, published date, detected site name, and the probe source and confidence score. The extractor also echoes the
|
|
92
|
+
HTTP metadata it received via `response_metadata` for downstream consumers that only operate on the extraction payload.
|
|
90
93
|
- `:dom` – a parsed Oga DOM if you need to traverse the document manually.
|
|
91
94
|
- `:screenshot` – raw PNG data that you can persist or feed to other systems.
|
|
95
|
+
- `:response` – HTTP metadata captured during the initial fetch.
|
|
96
|
+
|
|
97
|
+
### Response and extraction metadata
|
|
98
|
+
|
|
99
|
+
The `:response` key exposes a hash with the following keys:
|
|
100
|
+
|
|
101
|
+
- `:status_code` – Numeric HTTP status (e.g., `200`).
|
|
102
|
+
- `:headers` – A lowercase header hash as returned by `Net::HTTP#each_header`.
|
|
103
|
+
- `:final_url` – The URL that was ultimately fetched after resolving redirects.
|
|
104
|
+
|
|
105
|
+
Within the extraction payload (`result[:extraction]`), the following additional metadata is available:
|
|
106
|
+
|
|
107
|
+
- `:site_name` – Site or application name inferred from Open Graph/Twitter meta tags or the document `<title>`.
|
|
108
|
+
- `:body_text` – Plain-text body with collapsed whitespace, suitable for search indexing or summarization.
|
|
109
|
+
- `:response_metadata` – Mirrors the top-level `:response` hash so downstream processing can access HTTP metadata without
|
|
110
|
+
carrying the entire analysis result.
|
|
92
111
|
|
|
93
112
|
## Extractor pipeline
|
|
94
113
|
Coelacanth ships with a multi-stage extractor that tries increasingly involved probes until one meets its confidence target:
|
|
@@ -122,14 +141,47 @@ development:
|
|
|
122
141
|
User-Agent: "<%= ENV.fetch("COELACANTH_REMOTE_CLIENT_USER_AGENT", "Coelacanth Chrome Extension") %>"
|
|
123
142
|
screenshot_one:
|
|
124
143
|
key: "<%= ENV.fetch("COELACANTH_SCREENSHOT_ONE_API_KEY", "your_screenshot_one_api_key_here") %>"
|
|
144
|
+
youtube:
|
|
145
|
+
api_key: "<%= ENV.fetch("COELACANTH_YOUTUBE_API_KEY", "") %>"
|
|
146
|
+
morphology:
|
|
147
|
+
latin_joiners:
|
|
148
|
+
- ","
|
|
149
|
+
japanese_hiragana_suffixes:
|
|
150
|
+
- "ら"
|
|
151
|
+
- "の"
|
|
152
|
+
- "え"
|
|
153
|
+
japanese_category_breaks:
|
|
154
|
+
- "katakana_to_kanji"
|
|
125
155
|
```
|
|
126
156
|
|
|
127
157
|
- **Ferrum client** – Requires a running Chrome instance that exposes the DevTools protocol via WebSocket. Configure the URL,
|
|
128
158
|
timeout, the network idle timeout, and any headers to inject.
|
|
129
159
|
- **ScreenshotOne client** – Supply an API key to offload screenshot capture to [ScreenshotOne](https://screenshotone.com/).
|
|
160
|
+
- **Eyecatch image extraction** – Representative images are discovered automatically by checking Open Graph/Twitter metadata,
|
|
161
|
+
Schema.org JSON-LD payloads, and high-signal `<img>` elements (hero/cover images, large dimensions, etc.). No manual XPath
|
|
162
|
+
maintenance is required.
|
|
163
|
+
- **YouTube Data API** – Set an API key to turn YouTube watch URLs into structured articles using the video description and
|
|
164
|
+
thumbnail for downstream processing.
|
|
130
165
|
- Configuration is environment-aware: set `RAILS_ENV`/`RACK_ENV` or use Rails' built-in environment handling when the gem is
|
|
131
166
|
used inside a Rails project.
|
|
132
167
|
|
|
168
|
+
#### Morphological analyzer tuning
|
|
169
|
+
|
|
170
|
+
The terms returned in `body_morphemes` can be tuned per deployment by configuring the optional `morphology` section:
|
|
171
|
+
|
|
172
|
+
- `morphology.latin_joiners` — An array of characters that should be treated as connectors between Latin tokens. The default
|
|
173
|
+
value includes a comma so numbers such as `7,000` stay intact instead of being split into separate terms.
|
|
174
|
+
- `morphology.japanese_hiragana_suffixes` — A whitelist of Hiragana tokens that are allowed to extend Kanji sequences. By
|
|
175
|
+
default we keep common nominal suffixes such as `ら`, `の`, and the trailing `え` in `訴え` while preventing particles like `に`
|
|
176
|
+
from merging with the preceding noun. Provide your own list or set the value to `null`/`~` to allow any Hiragana suffix.
|
|
177
|
+
- `morphology.japanese_category_breaks` — An array of transitions (e.g., `katakana_to_kanji`) that should stop Japanese token
|
|
178
|
+
sequences. This is useful when you want Katakana loanwords such as `タワマン` to stand alone instead of being merged with the
|
|
179
|
+
Kanji terms that follow them.
|
|
180
|
+
|
|
181
|
+
Representative images are downloaded into a temporary directory using the built-in HTTP client. The extractor returns both the
|
|
182
|
+
resolved URL and the local file path via `extraction[:eyecatch_image]`. Remember to move or delete the file once you have
|
|
183
|
+
persisted it—temporary directories are not automatically cleaned up for long-running processes.
|
|
184
|
+
|
|
133
185
|
### Environment variables
|
|
134
186
|
|
|
135
187
|
Configuration values that would otherwise contain credentials are loaded from environment variables. Set the following
|
|
@@ -141,11 +193,26 @@ export COELACANTH_REMOTE_CLIENT_AUTHORIZATION="Bearer <token>"
|
|
|
141
193
|
|
|
142
194
|
export COELACANTH_REMOTE_CLIENT_USER_AGENT="Coelacanth Chrome Extension"
|
|
143
195
|
export COELACANTH_SCREENSHOT_ONE_API_KEY="your_screenshot_one_api_key_here"
|
|
196
|
+
export COELACANTH_YOUTUBE_API_KEY="your_youtube_data_api_key"
|
|
144
197
|
```
|
|
145
198
|
|
|
146
199
|
If `COELACANTH_REMOTE_CLIENT_AUTHORIZATION` is omitted or left blank, the `Authorization` header is not injected into the
|
|
147
200
|
remote browser session.
|
|
148
201
|
|
|
202
|
+
### YouTube Data API integration
|
|
203
|
+
|
|
204
|
+
With `COELACANTH_YOUTUBE_API_KEY` configured (or `youtube.api_key` populated directly in `config/coelacanth.yml`),
|
|
205
|
+
`Coelacanth::Extractor` runs a preprocessor that recognizes standard YouTube watch URLs (`youtube.com`, `youtu.be`,
|
|
206
|
+
`m.youtube.com`, etc.). The preprocessor fetches the video snippet from the YouTube Data API and builds an article-like HTML
|
|
207
|
+
document that contains:
|
|
208
|
+
|
|
209
|
+
- The video title and publish timestamp as structured metadata (JSON-LD and Open Graph).
|
|
210
|
+
- The full description rendered as Markdown-friendly paragraphs.
|
|
211
|
+
- The highest available thumbnail, passed to the eye-catch/image collector pipeline.
|
|
212
|
+
|
|
213
|
+
If the API key is missing or the API request fails, the extractor falls back to the original HTML that was fetched from
|
|
214
|
+
YouTube, so non-video pages continue to behave as before.
|
|
215
|
+
|
|
149
216
|
When using Docker Compose, you can create a `.env` file or export the variables in your environment so the `app` service picks
|
|
150
217
|
them up automatically.
|
|
151
218
|
|
data/config/coelacanth.yml
CHANGED
|
@@ -11,6 +11,27 @@ development: &development
|
|
|
11
11
|
User-Agent: "<%= ENV.fetch("COELACANTH_REMOTE_CLIENT_USER_AGENT", "Coelacanth Chrome Extension") %>"
|
|
12
12
|
screenshot_one:
|
|
13
13
|
key: "<%= ENV.fetch("COELACANTH_SCREENSHOT_ONE_API_KEY", "your_screenshot_one_api_key_here") %>"
|
|
14
|
+
youtube:
|
|
15
|
+
api_key: "<%= ENV.fetch("COELACANTH_YOUTUBE_API_KEY", "") %>"
|
|
16
|
+
morphology:
|
|
17
|
+
# Example configuration:
|
|
18
|
+
# latin_joiners:
|
|
19
|
+
# - "'"
|
|
20
|
+
# - "-"
|
|
21
|
+
# japanese_hiragana_suffixes:
|
|
22
|
+
# - "さん"
|
|
23
|
+
# - "ちゃん"
|
|
24
|
+
# japanese_category_breaks:
|
|
25
|
+
# - "kanji_to_katakana"
|
|
26
|
+
# - "katakana_to_kanji"
|
|
27
|
+
latin_joiners:
|
|
28
|
+
- ","
|
|
29
|
+
japanese_hiragana_suffixes:
|
|
30
|
+
- "ら"
|
|
31
|
+
- "の"
|
|
32
|
+
- "え"
|
|
33
|
+
japanese_category_breaks:
|
|
34
|
+
- "katakana_to_kanji"
|
|
14
35
|
test:
|
|
15
36
|
<<: *development
|
|
16
37
|
production:
|
|
@@ -0,0 +1,384 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "json"
|
|
4
|
+
require "set"
|
|
5
|
+
require "tmpdir"
|
|
6
|
+
require "uri"
|
|
7
|
+
|
|
8
|
+
require_relative "utilities"
|
|
9
|
+
require_relative "../http"
|
|
10
|
+
|
|
11
|
+
module Coelacanth
|
|
12
|
+
class Extractor
|
|
13
|
+
# Finds and downloads the representative image for a document.
|
|
14
|
+
class EyecatchImageExtractor
|
|
15
|
+
Result = Struct.new(:url, :path, keyword_init: true)
|
|
16
|
+
|
|
17
|
+
POSITIVE_KEYWORDS = %w[eyecatch hero main featured cover headline banner article primary lead].freeze
|
|
18
|
+
NEGATIVE_KEYWORDS = %w[avatar icon logo emoji badge button profile author comment footer nav thumbnail thumb ad sponsor].freeze
|
|
19
|
+
|
|
20
|
+
METADATA_SOURCES = [
|
|
21
|
+
{ selector: "meta[property='og:image:secure_url']", attribute: "content", score: 140 },
|
|
22
|
+
{ selector: "meta[property='og:image:url']", attribute: "content", score: 135 },
|
|
23
|
+
{ selector: "meta[property='og:image']", attribute: "content", score: 130 },
|
|
24
|
+
{ selector: "meta[name='twitter:image:src']", attribute: "content", score: 125 },
|
|
25
|
+
{ selector: "meta[name='twitter:image']", attribute: "content", score: 120 },
|
|
26
|
+
{ selector: "meta[itemprop='image']", attribute: "content", score: 110 },
|
|
27
|
+
{ selector: "meta[name='thumbnail']", attribute: "content", score: 100 },
|
|
28
|
+
{ selector: "link[rel='image_src']", attribute: "href", score: 95 }
|
|
29
|
+
].freeze
|
|
30
|
+
|
|
31
|
+
JSON_LD_IMAGE_KEYS = %w[image imageUrl imageURL thumbnail thumbnailUrl thumbnailURL contentUrl contentURL].freeze
|
|
32
|
+
|
|
33
|
+
LAZY_SOURCE_ATTRIBUTES = %w[data-src data-original data-lazy-src data-lazy data-url data-image data-preview src].freeze
|
|
34
|
+
|
|
35
|
+
# Builds an extractor around an HTTP client.
#
# @param http_client [#get_response] object used to fetch the chosen image;
#   defaults to Coelacanth::HTTP.
def initialize(http_client: Coelacanth::HTTP)
  @http_client = http_client
end
|
|
38
|
+
|
|
39
|
+
# Locates the representative image of +doc+ and downloads it.
#
# @param doc [Object, nil] parsed document; nothing is done when it is nil/false.
# @param base_url [String, nil] base used to resolve relative image URLs.
# @return [Result, nil] whatever #download returns, or nil when there is no
#   document or no image URL could be located.
def call(doc:, base_url: nil)
  located = doc && locate_image_url(doc, base_url)
  located ? download(located) : nil
end
|
|
47
|
+
|
|
48
|
+
private
|
|
49
|
+
|
|
50
|
+
attr_reader :http_client
|
|
51
|
+
|
|
52
|
+
# Gathers candidates from metadata tags, JSON-LD payloads and <img> elements
# (in that order) and returns the URL of the best-scoring one.
#
# @return [String, nil] the winning URL, or nil when no candidate exists.
def locate_image_url(doc, base_url)
  pool = metadata_candidates(doc, base_url) +
         structured_data_candidates(doc, base_url) +
         document_image_candidates(doc, base_url)

  winner = best_candidate(pool)
  winner ? winner[:url] : nil
end
|
|
61
|
+
|
|
62
|
+
# Builds candidates from the <meta>/<link> selectors in METADATA_SOURCES.
# Each candidate carries the fixed score configured for its selector.
#
# @return [Array<Hash>] hashes with :url, :score and :origin (:metadata).
def metadata_candidates(doc, base_url)
  METADATA_SOURCES.each_with_object([]) do |source, found|
    doc.css(source[:selector]).each do |node|
      raw = node[source[:attribute]].to_s.strip
      next if raw.empty?

      resolved = absolutize(base_url, raw)
      found << { url: resolved, score: source[:score], origin: :metadata } if resolved
    end
  end
end
|
|
79
|
+
|
|
80
|
+
# Builds candidates from every JSON-LD <script> block in the document.
# All structured-data candidates share the fixed score 105.
#
# The inner iteration uses filter_map so that a URL which cannot be resolved
# is dropped. With flat_map the `next` would leave a nil in the result, and
# best_candidate indexes every entry with [:url].
#
# @return [Array<Hash>] hashes with :url, :score and :origin (:structured_data).
def structured_data_candidates(doc, base_url)
  doc.css("script[type='application/ld+json']").flat_map do |script|
    parse_structured_data(script).filter_map do |value|
      url = absolutize(base_url, value)
      next unless url

      { url: url, score: 105, origin: :structured_data }
    end
  end
end
|
|
94
|
+
|
|
95
|
+
# Parses one JSON-LD <script> node and returns the image URLs found in it.
# Blank payloads and payloads that are not valid JSON yield an empty array.
#
# @param script [#text] node whose text is the JSON-LD payload.
# @return [Array] image URL values extracted from the payload.
def parse_structured_data(script)
  raw_json = script.text.to_s.strip

  if raw_json.empty?
    []
  else
    decoded = JSON.parse(raw_json)
    Array(extract_images_from_jsonld(decoded))
  end
rescue JSON::ParserError
  []
end
|
|
103
|
+
|
|
104
|
+
# Recursively collects image URLs from a decoded JSON-LD value.
#
# A String is only ever reached as the value of one of JSON_LD_IMAGE_KEYS
# (directly or inside an array under such a key) or as the top-level payload,
# so it is accepted whenever it passes valid_image_url?. Hashes are handled by
# jsonld_hash_images; every other type yields no URLs.
#
# @param data [Object] decoded JSON (String, Array, Hash, or anything else).
# @return [Array<String>] image URLs in discovery order (may contain duplicates).
def extract_images_from_jsonld(data)
  case data
  when String
    valid_image_url?(data) ? [data] : []
  when Array
    data.flat_map { |value| extract_images_from_jsonld(value) }
  when Hash
    jsonld_hash_images(data)
  else
    []
  end
end

# Collects image URLs from one JSON-LD object: first the recognised image
# keys, then the object's own "url" when it is an ImageObject, then any
# nested objects stored under other keys.
def jsonld_hash_images(data)
  urls = JSON_LD_IMAGE_KEYS.flat_map { |key| extract_images_from_jsonld(data[key]) }

  own_url = data["url"]
  # Only a String that passes validation is accepted; data: URIs and
  # non-String values are not usable image locations.
  if data["@type"].to_s.casecmp?("ImageObject") && own_url.is_a?(String) && valid_image_url?(own_url)
    urls << own_url
  end

  data.each do |key, value|
    # Image keys were fully traversed above; visiting them again would only
    # produce duplicates.
    next if JSON_LD_IMAGE_KEYS.include?(key)

    urls.concat(nested_jsonld_images(value))
  end

  urls
end

# Descends into nested objects only. Bare strings under non-image keys
# (e.g. "keywords", "sameAs", "@type" lists) are not image URLs and are
# ignored here.
def nested_jsonld_images(value)
  case value
  when Hash then extract_images_from_jsonld(value)
  when Array then value.flat_map { |item| nested_jsonld_images(item) }
  else []
  end
end
|
|
136
|
+
|
|
137
|
+
# Builds candidates from every <img> element in the document. Each source of
# an image starts at 60 points and is adjusted by its srcset descriptor bonus
# and by the node-level heuristics in score_for_image_node.
#
# @return [Array<Hash>] hashes with :url, :score and :origin (:document).
def document_image_candidates(doc, base_url)
  doc.css("img").each_with_object([]) do |img, collected|
    sources_for(img).each do |entry|
      resolved = absolutize(base_url, entry[:url])
      next unless resolved

      total = 60 + descriptor_bonus(entry[:weight]) + score_for_image_node(img, resolved)
      collected << { url: resolved, score: total, origin: :document }
    end
  end
end
|
|
155
|
+
|
|
156
|
+
# Lists every distinct image source advertised for an <img> node: its
# plain/lazy-load attributes, its srcset attributes and, when the node sits
# inside a <picture>, the same information from the sibling <source> elements.
#
# @param node [Object] element responding to #[] and #parent.
# @return [Array<Hash>] entries of the form { url:, weight: }, in discovery
#   order, each URL appearing once. weight is nil for plain attributes.
def sources_for(node)
  seen = Set.new
  entries = []

  collect_image_sources(node, LAZY_SOURCE_ATTRIBUTES, seen, entries)

  if node.parent&.name == "picture"
    node.parent.css("source").each do |source|
      collect_image_sources(source, %w[src data-src], seen, entries)
    end
  end

  entries
end

# Appends the not-yet-seen sources of one element to +entries+: first the
# given single-URL attributes, then its srcset/data-srcset candidates.
def collect_image_sources(element, url_attributes, seen, entries)
  url_attributes.each do |attribute|
    # Strip before storing so padded attribute values de-duplicate against
    # their clean form and resolve like the srcset entries do.
    value = element[attribute].to_s.strip
    next unless valid_image_url?(value)

    entries << { url: value, weight: nil } if seen.add?(value)
  end

  %w[srcset data-srcset].each do |attribute|
    parse_srcset(element[attribute]).each do |entry|
      entries << entry if seen.add?(entry[:url])
    end
  end
end
|
|
201
|
+
|
|
202
|
+
# Splits a srcset attribute into { url:, weight: } entries. Candidates whose
# URL is blank or fails valid_image_url? are skipped.
#
# NOTE(review): candidates are separated with a plain split(","), so a URL
# that itself contains a comma is cut into pieces.
#
# @param srcset [String, nil]
# @return [Array<Hash>]
def parse_srcset(srcset)
  return [] if srcset.to_s.strip.empty?

  srcset.split(",").each_with_object([]) do |chunk, parsed|
    location, descriptor = chunk.strip.split
    location = location.to_s.strip
    parsed << { url: location, weight: descriptor_weight(descriptor) } if valid_image_url?(location)
  end
end
|
|
214
|
+
|
|
215
|
+
# Converts a srcset descriptor into an integer weight.
#
# Density descriptors ("2x") are scaled by 1000; every other descriptor
# ("800w", "600h", or anything else) contributes its leading integer.
#
# @param descriptor [String, nil]
# @return [Integer, nil] nil when the descriptor is missing or empty.
def descriptor_weight(descriptor)
  return if descriptor.to_s.empty?

  descriptor.end_with?("x") ? (descriptor.to_f * 1000).to_i : descriptor.to_i
end
|
|
228
|
+
|
|
229
|
+
# Maps a srcset weight onto a score bonus: larger advertised sizes earn more.
#
# Weights below 400 (including negative values, which a malformed descriptor
# such as "-100w" produces) earn nothing.
#
# @param weight [Integer, nil]
# @return [Integer] 0, 8, 15 or 22.
def descriptor_bonus(weight)
  return 0 unless weight

  if weight < 400
    0
  elsif weight < 800
    8
  elsif weight < 1200
    15
  else
    22
  end
end
|
|
240
|
+
|
|
241
|
+
# Heuristic score adjustment for one <img> node and its resolved URL.
#
# Signals, in the order they are summed:
# - class/id tokens matching POSITIVE_KEYWORDS (+25 each) or NEGATIVE_KEYWORDS (-30 each)
# - keyword hits in the alt text (+12 / -18) and in the URL (+8 / -16)
# - declared size: width >= 700 (+18), height >= 400 (+12), width or height <= 64 (-20 each)
# - ancestors: <figure> (+12), <article> (+8), <footer>/<aside>/<nav> (-18)
#
# Width falls back to the largest srcset descriptor; height falls back to width.
#
# @return [Integer] the adjustment (may be negative).
def score_for_image_node(node, url)
  tokens = Utilities.class_id_tokens(node).map(&:downcase)
  alt_text = node["alt"].to_s.downcase
  location = url.downcase

  width = dimension_from(node["width"], node["data-width"]) ||
          descriptor_dimension(node["srcset"]) ||
          descriptor_dimension(node["data-srcset"])
  height = dimension_from(node["height"], node["data-height"]) || width

  ancestor_names = Utilities.ancestors(node).filter_map do |ancestor|
    ancestor.name if ancestor.respond_to?(:name)
  end

  [
    tokens.count { |token| POSITIVE_KEYWORDS.include?(token) } * 25,
    tokens.count { |token| NEGATIVE_KEYWORDS.include?(token) } * -30,
    keyword_score(alt_text, 12),
    -keyword_score(alt_text, 18, NEGATIVE_KEYWORDS),
    keyword_score(location, 8),
    -keyword_score(location, 16, NEGATIVE_KEYWORDS),
    (width && width >= 700 ? 18 : 0),
    (height && height >= 400 ? 12 : 0),
    (width && width <= 64 ? -20 : 0),
    (height && height <= 64 ? -20 : 0),
    (ancestor_names.include?("figure") ? 12 : 0),
    (ancestor_names.include?("article") ? 8 : 0),
    (ancestor_names.any? { |name| %w[footer aside nav].include?(name) } ? -18 : 0)
  ].sum
end
|
|
271
|
+
|
|
272
|
+
# Scores +text+ by how many of +keywords+ occur in it, +value+ points each.
#
# NOTE(review): matching is by substring (String#include?), so a short
# keyword such as "ad" also matches inside longer words like "uploads".
#
# @return [Integer] 0 for empty text.
def keyword_score(text, value, keywords = POSITIVE_KEYWORDS)
  return 0 if text.empty?

  hits = keywords.select { |keyword| text.include?(keyword) }
  hits.size * value
end
|
|
277
|
+
|
|
278
|
+
# Returns the first run of digits found in the given attribute values, as an
# Integer. nil values are skipped; values without digits are passed over.
#
# @return [Integer, nil] nil when no value contains a digit.
def dimension_from(*values)
  with_digits = values.compact.map(&:to_s).find { |text| text.match?(/[0-9]/) }
  with_digits && with_digits[/[0-9]+/].to_i
end
|
|
285
|
+
|
|
286
|
+
# Returns the largest descriptor weight advertised in a srcset, used as a
# stand-in dimension when the node declares none.
#
# @return [Integer, nil] nil when the srcset yields no entries or the
#   largest entry has no descriptor.
def descriptor_dimension(srcset)
  parse_srcset(srcset).map { |entry| entry[:weight] }.max_by(&:to_i)
end
|
|
290
|
+
|
|
291
|
+
# Tells whether +value+ is usable as an image location: non-blank and not a
# data: or javascript: URI.
#
# @return [Boolean]
def valid_image_url?(value)
  candidate = value.to_s.strip
  !candidate.empty? && !candidate.match?(/\A(?:data|javascript):/i)
end
|
|
298
|
+
|
|
299
|
+
# Picks the highest-scoring candidate. Candidates without a :url are
# ignored; when the same URL appears several times only its best-scoring
# occurrence competes. Ties keep the earliest candidate.
#
# @param candidates [Array<Hash>] hashes with :url and :score.
# @return [Hash, nil] nil when there is nothing to choose from.
def best_candidate(candidates)
  strongest_per_url = candidates
    .select { |candidate| candidate[:url] }
    .group_by { |candidate| candidate[:url] }
    .map { |_url, group| group.max_by { |candidate| candidate[:score] } }

  strongest_per_url.max_by { |candidate| candidate[:score] }
end
|
|
313
|
+
|
|
314
|
+
# Resolves +value+ against +base_url+ via Utilities.absolute_url.
#
# Without a base URL the value is returned untouched, and if resolution
# raises a URI error the original value is returned as a best effort.
#
# @return [String, nil] nil when +value+ is nil or empty.
def absolutize(base_url, value)
  return nil if value.nil? || value.empty?
  return value unless base_url

  Utilities.absolute_url(base_url, value)
rescue URI::Error
  value
end
|
|
325
|
+
|
|
326
|
+
# Fetches +url+ with the configured HTTP client and stores the body in a
# fresh temporary directory.
#
# This is deliberately best-effort: any StandardError yields nil. When the
# failure happens after the temporary directory was created, the directory
# (and any partially written file) is removed so failed downloads do not
# accumulate empty directories.
#
# @param url [String]
# @return [Result, nil] the URL and local file path, or nil when the
#   response is not successful, the body is empty, or anything raises.
def download(url)
  response = http_client.get_response(URI.parse(url))
  return unless http_success?(response)

  body = response.body.to_s
  return if body.empty?

  directory = Dir.mktmpdir("coelacanth-eyecatch-")
  file_path = File.join(directory, filename_for(url, response))
  File.binwrite(file_path, body)

  Result.new(url: url, path: file_path)
rescue StandardError
  # directory/file_path are nil here when the failure happened before they
  # were assigned.
  discard_download(directory, file_path)
  nil
end

# Removes what a failed download left behind. Errors during cleanup are
# ignored because the caller is already on its failure path.
def discard_download(directory, file_path)
  return unless directory

  File.delete(file_path) if file_path && File.exist?(file_path)
  Dir.rmdir(directory)
rescue SystemCallError
  nil
end
|
|
341
|
+
|
|
342
|
+
# Tells whether +response+ carries a 2xx status code. Objects that do not
# expose #code are treated as failures.
#
# @return [Boolean]
def http_success?(response)
  response.respond_to?(:code) && response.code.to_i.between?(200, 299)
end
|
|
347
|
+
|
|
348
|
+
# Chooses the local file name for a downloaded image.
#
# The last path segment of the URL is used when it carries a file extension.
# Otherwise (no segment, no extension, or an unparsable URL) the name is
# "eyecatch" plus an extension derived from the response content type.
#
# @param url [String]
# @param response [Object] passed to extension_for_content_type.
# @return [String]
def filename_for(url, response)
  basename = File.basename(URI.parse(url).path.to_s)
  # File.extname is empty for "", "." and extension-less names alike, so a
  # single check covers every fallback case.
  return basename unless File.extname(basename).empty?

  "eyecatch#{extension_for_content_type(response)}"
rescue URI::Error
  "eyecatch#{extension_for_content_type(response)}"
end
|
|
363
|
+
|
|
364
|
+
# Maps the response content type onto a file extension.
#
# The content type is read from #content_type when available, otherwise from
# the "content-type" header via #[]. Parameters after ";" are dropped, and
# the media type is trimmed and lower-cased before matching, because media
# types are case-insensitive and raw header values may carry whitespace.
#
# @return [String] ".jpg", ".png", ".gif", ".webp", ".svg", or ".bin" for
#   anything unrecognised or missing.
def extension_for_content_type(response)
  raw = if response.respond_to?(:content_type)
    response.content_type
  elsif response.respond_to?(:[])
    response["content-type"]
  end
  media_type = raw.to_s.split(";").first.to_s.strip.downcase

  case media_type
  when "image/jpeg", "image/jpg" then ".jpg"
  when "image/png" then ".png"
  when "image/gif" then ".gif"
  when "image/webp" then ".webp"
  when "image/svg+xml" then ".svg"
  else
    ".bin"
  end
end
|
|
382
|
+
end
|
|
383
|
+
end
|
|
384
|
+
end
|