coelacanth 0.3.10 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -7
- data/Gemfile +1 -1
- data/README.md +128 -55
- data/lib/coelacanth/dom.rb +3 -2
- data/lib/coelacanth/extractor/fallback_probe.rb +34 -0
- data/lib/coelacanth/extractor/heuristic_probe.rb +175 -0
- data/lib/coelacanth/extractor/image_collector.rb +19 -0
- data/lib/coelacanth/extractor/listing_collector.rb +270 -0
- data/lib/coelacanth/extractor/markdown_renderer.rb +128 -0
- data/lib/coelacanth/extractor/metadata_probe.rb +121 -0
- data/lib/coelacanth/extractor/normalizer.rb +47 -0
- data/lib/coelacanth/extractor/utilities.rb +145 -0
- data/lib/coelacanth/extractor/weak_ml_probe.rb +136 -0
- data/lib/coelacanth/extractor.rb +67 -0
- data/lib/coelacanth/version.rb +1 -1
- data/lib/coelacanth.rb +8 -1
- metadata +11 -2
- data/Gemfile.lock +0 -103
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 3f8a15ca435355182b77131ffa50b2c93b08693c3ccf0d26b5010e1f7f9df4fd
|
|
4
|
+
data.tar.gz: 34f198f28014ed558f14ef5599f9275579054a1ec9ebeb6b503c60555329bcf2
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: aa7cdd1ac14d7104604114885facaffc6dc5e4d8b91911cb1ae26b34aa2550010bd1658f3c832a6e1dc4a0a059113317c7242bc187e02a396cc17398570407e0
|
|
7
|
+
data.tar.gz: 8f1c4c988ec19a862409e06fbe297529d058a78dc77f328959ab5274fdd372ef65d8e92b38ba76ef08b55ad7d5abeb75d585685f411e223ce9fc60515430ccc7
|
data/CHANGELOG.md
CHANGED
|
@@ -4,13 +4,19 @@ All notable changes to this project will be documented in this file.
|
|
|
4
4
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
5
5
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
6
6
|
|
|
7
|
-
## [v0.
|
|
8
|
-
### :
|
|
9
|
-
- [`
|
|
7
|
+
## [v0.4.0] - 2025-11-02
|
|
8
|
+
### :sparkles: New Features
|
|
9
|
+
- [`d13c21e`](https://github.com/slidict/coelacanth/commit/d13c21e106e7155ba7d545570341da37caa0b4b3) - support structural listings for digital.go.jp *(commit by [@yubele](https://github.com/yubele))*
|
|
10
|
+
- [`19fb4d7`](https://github.com/slidict/coelacanth/commit/19fb4d7a29350bb063c6bb76a89c46d7d001fc3e) - expose body markdown blocks *(commit by [@yubele](https://github.com/yubele))*
|
|
10
11
|
|
|
11
12
|
### :wrench: Chores
|
|
12
|
-
- [`
|
|
13
|
-
- [`
|
|
14
|
-
- [`
|
|
13
|
+
- [`513d522`](https://github.com/slidict/coelacanth/commit/513d5226147b7be72c43817d0ca417a2ea355d31) - **deps**: Bump rubocop from 1.76.1 to 1.77.0 *(commit by [@dependabot[bot]](https://github.com/apps/dependabot))*
|
|
14
|
+
- [`388658f`](https://github.com/slidict/coelacanth/commit/388658fcaa94f09123c67ab62dd69d207d03ee7a) - **deps**: Bump rubocop from 1.77.0 to 1.81.6 *(commit by [@dependabot[bot]](https://github.com/apps/dependabot))*
|
|
15
|
+
- [`be62dbb`](https://github.com/slidict/coelacanth/commit/be62dbbf77fc9a324f29d684df167cbedcea7b8f) - translate listing samples to English *(commit by [@yubele](https://github.com/yubele))*
|
|
16
|
+
- [`41bf358`](https://github.com/slidict/coelacanth/commit/41bf358eb955fb75b467719750a19f9d07a2cba1) - require ruby 3.4 or newer *(commit by [@yubele](https://github.com/yubele))*
|
|
17
|
+
- [`cff304d`](https://github.com/slidict/coelacanth/commit/cff304d255952ad6be18e796c6649b72272bc23e) - **deps-dev**: Bump rexml in the bundler group across 1 directory *(commit by [@dependabot[bot]](https://github.com/apps/dependabot))*
|
|
18
|
+
- [`962c267`](https://github.com/slidict/coelacanth/commit/962c267ab8f301e1375b67fc32a3cbc54effc232) - Bump version to 0.4.0 *(commit by [@yubele](https://github.com/yubele))*
|
|
19
|
+
- [`bf3633c`](https://github.com/slidict/coelacanth/commit/bf3633c82a61684238485606300bfa0891d247eb) - Delete Gemfile.lock *(commit by [@yubele](https://github.com/yubele))*
|
|
20
|
+
- [`62f2a0e`](https://github.com/slidict/coelacanth/commit/62f2a0ecb80fb5d77f704e6c48d1d8f4c7670818) - Add Gemfile.lock to .gitignore *(commit by [@yubele](https://github.com/yubele))*
|
|
15
21
|
|
|
16
|
-
[v0.
|
|
22
|
+
[v0.4.0]: https://github.com/slidict/coelacanth/compare/v0.3.10...v0.4.0
|
data/Gemfile
CHANGED
data/README.md
CHANGED
|
@@ -1,94 +1,167 @@
|
|
|
1
|
-
#
|
|
1
|
+
# Coelacanth
|
|
2
2
|
|
|
3
3
|
[Gem Version](https://badge.fury.io/rb/coelacanth)
|
|
4
4
|
[Build Status](https://github.com/slidict/coelacanth/actions)
|
|
5
5
|
[License: MIT](https://opensource.org/licenses/MIT)
|
|
6
6
|
|
|
7
|
-
|
|
7
|
+
Coelacanth is a Ruby gem for extracting high-quality article content, metadata, and screenshots from arbitrary web pages. It is
|
|
8
|
+
built to power content ingestion pipelines that have to withstand layout experiments, CMS redesigns, and inconsistent markup
|
|
9
|
+
while remaining easy to extend.
|
|
10
|
+
|
|
11
|
+
It is the successor to [`web_stat`](https://rubygems.org/gems/web_stat) and continues the same goal of reliable article
|
|
12
|
+
extraction under the `slidict` umbrella. Compared to [`web_stat`](https://github.com/slidict/web_stat/), the gem has been
|
|
13
|
+
re-architected with a modern extractor pipeline, built-in screenshot capture, and a clearer configuration story so you can drop
|
|
14
|
+
it into contemporary ingestion stacks without bespoke glue code.
|
|
15
|
+
|
|
16
|
+
## Table of contents
|
|
17
|
+
- [Features](#features)
|
|
18
|
+
- [Requirements](#requirements)
|
|
19
|
+
- [Installation](#installation)
|
|
20
|
+
- [Quick start](#quick-start)
|
|
21
|
+
- [Extractor pipeline](#extractor-pipeline)
|
|
22
|
+
- [Configuration](#configuration)
|
|
23
|
+
- [Development workflow](#development-workflow)
|
|
24
|
+
- [Testing](#testing)
|
|
25
|
+
- [Contributing](#contributing)
|
|
26
|
+
- [License](#license)
|
|
8
27
|
|
|
9
|
-
##
|
|
10
|
-
|
|
11
|
-
|
|
28
|
+
## Features
|
|
29
|
+
- **Layout-resilient extraction** – Multi-stage extractor falls back from structured metadata to heuristics and lightweight
|
|
30
|
+
machine learning so you continue to get clean article bodies even when markup drifts.
|
|
31
|
+
- **UTF-8 normalization** – HTML responses are normalized into UTF-8 before parsing to play nicely with Japanese and other
|
|
32
|
+
multi-byte sources.
|
|
33
|
+
- **Screenshot capture** – Fetches full-page PNGs via a configurable browser client so you can archive visual context alongside
|
|
34
|
+
the extracted text.
|
|
35
|
+
- **Redirect resolution** – Follows HTTP redirects and long redirect chains to guarantee the extractor works on the final
|
|
36
|
+
landing page.
|
|
37
|
+
- **Configurable HTTP headers** – Inject custom headers (user agent, authorization, etc.) into the remote browser session for
|
|
38
|
+
authenticated or geo-targeted crawling.
|
|
39
|
+
|
|
40
|
+
### What's new compared to web_stat?
|
|
41
|
+
|
|
42
|
+
- **Multi-stage pipeline** – `web_stat` relied on a single-pass heuristic extractor, whereas Coelacanth layers metadata,
|
|
43
|
+
heuristic, and optional ML probes that graduate based on confidence thresholds.
|
|
44
|
+
- **First-class screenshots** – Capture full-page PNGs alongside the extracted text without writing a separate headless browser
|
|
45
|
+
integration.
|
|
46
|
+
- **Environment-aware configuration** – Manage remote browser credentials, HTTP headers, and client selection through
|
|
47
|
+
`config/coelacanth.yml` instead of hand-tuned initializer code.
|
|
48
|
+
- **Markdown-first output** – Get both Markdown and raw DOM representations from `Coelacanth.analyze` so you can publish the
|
|
49
|
+
same payload to static-site builders, CMS importers, or downstream summarizers.
|
|
50
|
+
|
|
51
|
+
## Requirements
|
|
52
|
+
- Ruby **3.4 or newer**
|
|
53
|
+
- [Bundler](https://bundler.io/) for dependency management
|
|
54
|
+
- A remote Chrome-compatible WebSocket endpoint when using the default Ferrum client (see [Configuration](#configuration))
|
|
12
55
|
|
|
56
|
+
## Installation
|
|
57
|
+
Add the gem to your application:
|
|
13
58
|
|
|
14
59
|
```ruby
|
|
15
|
-
gem
|
|
60
|
+
gem "coelacanth"
|
|
16
61
|
```
|
|
17
62
|
|
|
18
|
-
|
|
63
|
+
Install the dependencies:
|
|
19
64
|
|
|
20
65
|
```bash
|
|
21
|
-
|
|
66
|
+
bundle install
|
|
22
67
|
```
|
|
23
68
|
|
|
24
|
-
Or install
|
|
69
|
+
Or install the gem directly:
|
|
25
70
|
|
|
26
71
|
```bash
|
|
27
|
-
|
|
72
|
+
gem install coelacanth
|
|
28
73
|
```
|
|
29
74
|
|
|
30
|
-
|
|
75
|
+
## Quick start
|
|
76
|
+
```ruby
|
|
77
|
+
require "coelacanth"
|
|
31
78
|
|
|
32
|
-
|
|
79
|
+
result = Coelacanth.analyze("https://example.com/article")
|
|
33
80
|
|
|
34
|
-
|
|
35
|
-
|
|
81
|
+
result[:extraction] # => article metadata and body markdown
|
|
82
|
+
result[:dom] # => Oga DOM representation for downstream processing
|
|
83
|
+
result[:screenshot] # => PNG screenshot as a binary string
|
|
36
84
|
```
|
|
37
85
|
|
|
38
|
-
|
|
86
|
+
The returned hash includes:
|
|
87
|
+
|
|
88
|
+
- `:extraction` – output from `Coelacanth::Extractor`, including title, Markdown body (`body_markdown` and
|
|
89
|
+
`body_markdown_list`), images, listings, published date, and the probe source and confidence score.
|
|
90
|
+
- `:dom` – a parsed Oga DOM if you need to traverse the document manually.
|
|
91
|
+
- `:screenshot` – raw PNG data that you can persist or feed to other systems.
|
|
92
|
+
|
|
93
|
+
## Extractor pipeline
|
|
94
|
+
Coelacanth ships with a multi-stage extractor that tries increasingly involved probes until one meets its confidence target:
|
|
95
|
+
|
|
96
|
+
1. **MetadataProbe** (threshold `0.85`) pulls `schema.org` JSON-LD, Open Graph, Twitter Cards, or semantic containers such as
|
|
97
|
+
`<main>`/`<article>` when available.
|
|
98
|
+
2. **HeuristicProbe** (threshold `0.75`) scores block-level nodes using text length, link density, punctuation density, DOM
|
|
99
|
+
depth, and sibling variance, then greedily attaches surrounding headers and media.
|
|
100
|
+
3. **WeakMlProbe** (threshold `0.70`) optionally boosts accuracy with a lightweight classifier that combines heuristic features
|
|
101
|
+
with class and id tokens (e.g., `article-body`, `post`, `content`).
|
|
102
|
+
4. **FallbackProbe** acts as a safety net by following AMP/print links or summarizing the whole document when the previous
|
|
103
|
+
probes fail.
|
|
104
|
+
|
|
105
|
+
A `ListingCollector` scans nearby layout regions to surface structured listings (e.g., sidebar "latest news" blocks) so you can
|
|
106
|
+
store related links alongside the article.
|
|
107
|
+
|
|
108
|
+
## Configuration
|
|
109
|
+
Runtime configuration is stored in `config/coelacanth.yml`. Environments inherit from the `development` section by default.
|
|
110
|
+
|
|
111
|
+
```yaml
|
|
112
|
+
development:
|
|
113
|
+
client: "ferrum" # Options: "ferrum", "screenshot_one"
|
|
114
|
+
remote_client:
|
|
115
|
+
ws_url: "ws://chrome:3000/chrome"
|
|
116
|
+
timeout: 10
|
|
117
|
+
headers:
|
|
118
|
+
Authorization: "Bearer 1234567890"
|
|
119
|
+
User-Agent: "Coelacanth Chrome Extension"
|
|
120
|
+
screenshot_one:
|
|
121
|
+
key: "your_screenshot_one_api_key_here"
|
|
122
|
+
```
|
|
39
123
|
|
|
40
|
-
|
|
124
|
+
- **Ferrum client** – Requires a running Chrome instance that exposes the DevTools protocol via WebSocket. Configure the URL,
|
|
125
|
+
timeout, and any headers to inject.
|
|
126
|
+
- **ScreenshotOne client** – Supply an API key to offload screenshot capture to [ScreenshotOne](https://screenshotone.com/).
|
|
127
|
+
- Configuration is environment-aware: set `RAILS_ENV`/`RACK_ENV` or use Rails' built-in environment handling when the gem is
|
|
128
|
+
used inside a Rails project.
|
|
41
129
|
|
|
42
|
-
|
|
43
|
-
|
|
130
|
+
If you are working inside Docker, make sure the `UID` environment variable matches your host user by exporting it in your shell
|
|
131
|
+
startup file:
|
|
44
132
|
|
|
45
|
-
```
|
|
46
|
-
|
|
133
|
+
```bash
|
|
134
|
+
export UID
|
|
47
135
|
```
|
|
48
136
|
|
|
49
|
-
|
|
137
|
+
## Development workflow
|
|
138
|
+
Clone the repository and install dependencies:
|
|
50
139
|
|
|
51
|
-
```
|
|
52
|
-
|
|
53
|
-
|
|
140
|
+
```bash
|
|
141
|
+
git clone https://github.com/slidict/coelacanth.git
|
|
142
|
+
cd coelacanth
|
|
143
|
+
bundle install
|
|
54
144
|
```
|
|
55
145
|
|
|
56
|
-
|
|
146
|
+
You can open an interactive console with the gem loaded via:
|
|
57
147
|
|
|
148
|
+
```bash
|
|
149
|
+
bin/console
|
|
58
150
|
```
|
|
59
|
-
$ bundle exec rspec
|
|
60
|
-
```
|
|
61
|
-
|
|
62
|
-
## Features
|
|
63
|
-
- Get dom by oga
|
|
64
|
-
- Get screenshot
|
|
65
|
-
|
|
66
|
-
## Commit Message Guidelines
|
|
67
|
-
|
|
68
|
-
To ensure consistency and facilitate automatic updates to the `CHANGELOG.md`, please follow the [Conventional Commits](https://www.conventionalcommits.org/) specification when creating commit messages. This helps maintain a clear and structured commit history.
|
|
69
151
|
|
|
70
|
-
|
|
152
|
+
## Testing
|
|
153
|
+
Run the test suite with RSpec:
|
|
71
154
|
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
- `fix: resolve issue with URL redirection`
|
|
76
|
-
- `docs: update README with usage instructions`
|
|
77
|
-
- `chore: update dependencies`
|
|
78
|
-
- `build: update build configuration`
|
|
79
|
-
- `ci: update CI pipeline`
|
|
80
|
-
- `style: fix code style issues`
|
|
81
|
-
- `refactor: refactor code for better readability`
|
|
82
|
-
- `perf: improve performance of data processing`
|
|
83
|
-
- `test: add new tests for URL parsing module`
|
|
84
|
-
|
|
85
|
-
By following these guidelines, you help ensure that our project's commit history is easy to navigate and that versioning and release notes are generated correctly.
|
|
155
|
+
```bash
|
|
156
|
+
bundle exec rspec
|
|
157
|
+
```
|
|
86
158
|
|
|
87
159
|
## Contributing
|
|
88
|
-
Bug reports and pull requests are welcome on GitHub at
|
|
160
|
+
Bug reports and pull requests are welcome on GitHub at
|
|
161
|
+
[https://github.com/slidict/coelacanth](https://github.com/slidict/coelacanth). Please follow the
|
|
162
|
+
[Conventional Commits](https://www.conventionalcommits.org/) specification so we can keep the changelog automation healthy.
|
|
89
163
|
|
|
90
|
-
|
|
91
|
-
The gem is available as open-source under the terms of the MIT License.
|
|
164
|
+
By participating in this project you agree to abide by the [Contributor Covenant](CODE_OF_CONDUCT.md).
|
|
92
165
|
|
|
93
|
-
##
|
|
94
|
-
|
|
166
|
+
## License
|
|
167
|
+
Coelacanth is available as open source under the terms of the [MIT License](LICENSE.txt).
|
data/lib/coelacanth/dom.rb
CHANGED
|
@@ -5,8 +5,9 @@ require "oga"
|
|
|
5
5
|
module Coelacanth
  # Coelacanth::Dom
  #
  # Produces an Oga document for a page. The markup is downloaded with
  # Net::HTTP only when the caller did not hand it in.
  class Dom
    # @param url [String] address requested via Net::HTTP when +html+ is absent
    # @param html [String, nil] already-fetched markup; skips the HTTP request when given
    # @return the object returned by Oga.parse_xml for the markup
    #
    # NOTE(review): the markup goes through Oga's XML parser even though the
    # input is presumably HTML — confirm this is intentional.
    def oga(url, html: nil)
      markup = html || Net::HTTP.get_response(URI.parse(url)).body
      Oga.parse_xml(markup)
    end
  end
end
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "oga"
|
|
4
|
+
|
|
5
|
+
require_relative "utilities"
|
|
6
|
+
|
|
7
|
+
module Coelacanth
  class Extractor
    # Last-resort probe used when all other probes fail: it returns the
    # document's <body> (or the document itself when there is none) with a
    # fixed, low confidence.
    class FallbackProbe
      Result = Struct.new(
        :title, :node, :published_at, :byline, :source_tag, :confidence,
        keyword_init: true
      )

      # @param doc document object responding to +at_css+
      # @param url [String, nil] accepted but not read by this probe
      # @return [Result] tagged +:fallback+ with confidence 0.35; +published_at+
      #   and +byline+ are always nil
      def call(doc:, url: nil)
        content_root = doc.at_css("body") || doc
        title_element = doc.at_css("title")

        outcome = Result.new(source_tag: :fallback, confidence: 0.35)
        outcome.title = title_element&.text&.strip
        outcome.node = content_root
        outcome
      end
    end
  end
end
|
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "oga"
|
|
4
|
+
|
|
5
|
+
require_relative "utilities"
|
|
6
|
+
|
|
7
|
+
module Coelacanth
  class Extractor
    # Scores DOM nodes based on simple heuristics to locate the primary article body.
    class HeuristicProbe
      Result = Struct.new(
        :title,
        :node,
        :published_at,
        :byline,
        :source_tag,
        :confidence,
        keyword_init: true
      )

      # Elements considered as article-body candidates.
      BLOCK_SELECTOR = "article, main, section, div".freeze
      # Score bonus per tag name; tags not listed contribute 0.
      TAG_WEIGHTS = Hash.new(0).merge(
        "article" => 80,
        "main" => 60,
        "section" => 30,
        "div" => 10
      ).freeze
      # class/id tokens that lower (-60 each) or raise (+40 each) a candidate's score.
      NEGATIVE_TOKENS = %w[nav footer header sidebar related share menu].freeze
      POSITIVE_TOKENS = %w[content article body post entry text].freeze
      # Sibling tags that #expand attaches around the winning block.
      EXPANSION_TAGS = %w[h1 h2 h3 h4 h5 h6 img blockquote p ul ol figure].freeze

      # Picks the highest-scoring block in +doc+.
      #
      # @param doc document responding to +css+ and +at_css+
      # @param url [String, nil] accepted but not read by this probe
      # @return [Result, nil] nil when no block qualifies or the best score is
      #   below +minimum_score+
      def call(doc:, url: nil)
        candidates = doc.css(BLOCK_SELECTOR).filter_map { |node| score_candidate(node) }
        return if candidates.empty?

        best = candidates.max_by { |candidate| candidate[:score] }
        return if best[:score] < minimum_score

        Result.new(
          title: title_from_meta(doc),
          node: expand(best[:node]),
          published_at: published_at_from_meta(doc),
          byline: byline_from_meta(doc),
          source_tag: :heuristic,
          confidence: confidence(best[:score])
        )
      end

      private

      # Returns +{ node:, score: }+ for +node+, or nil when its text length is
      # under 80. Text length, punctuation density, tag weight, class/id tokens
      # and sibling spread add to the score; link density and depth subtract.
      def score_candidate(node)
        text_length = Utilities.text_length(node)
        return if text_length < 80

        link_density = Utilities.link_density(node)
        punct_density = Utilities.punctuation_density(node)
        tag_weight = TAG_WEIGHTS[node.name]
        class_weight = class_score(node)
        depth_penalty = Utilities.depth(node) * 4
        sibling_bonus = sibling_variance(node)

        score = (
          text_length * 0.35 +
          punct_density * 280 -
          link_density * 160 +
          tag_weight +
          class_weight +
          sibling_bonus -
          depth_penalty
        )

        { node: node, score: score }
      end

      # Lowest score a candidate may have and still be returned by #call.
      def minimum_score
        95
      end

      # +40 for every positive class/id token, -60 for every negative one.
      def class_score(node)
        tokens = Utilities.class_id_tokens(node)
        score = tokens.count { |token| POSITIVE_TOKENS.include?(token) } * 40
        score -= tokens.count { |token| NEGATIVE_TOKENS.include?(token) } * 60
        score
      end

      # A quarter of the standard deviation of text lengths across the parent's
      # element children (despite the name, the square root of the variance is
      # what gets scaled). 0 when there is no parent or fewer than two children.
      def sibling_variance(node)
        parent = node.parent
        return 0 unless parent

        siblings = Utilities.element_children(parent)
        return 0 if siblings.length < 2

        lengths = siblings.map { |sibling| Utilities.text_length(sibling) }
        mean = lengths.sum.to_f / lengths.length
        variance = lengths.map { |length| (length - mean)**2 }.sum.to_f / lengths.length
        Math.sqrt(variance) * 0.25
      end

      # Wraps +node+ together with the contiguous run of expansion-eligible
      # siblings on either side, in document order. A node without a parent is
      # returned untouched.
      def expand(node)
        return node unless node.parent

        # Preceding siblings are collected nearest-first, so flip them back.
        before = neighboring_nodes(node, -1).reverse
        after = neighboring_nodes(node, 1)

        wrap_fragment(before + [node] + after)
      end

      # Walks siblings away from +node+ (backwards when +direction+ is negative,
      # forwards otherwise) and stops at the first one that is missing or not
      # eligible for expansion.
      def neighboring_nodes(node, direction)
        siblings = []
        current = node
        loop do
          current = if direction.negative?
                      Utilities.previous_element(current)
                    else
                      Utilities.next_element(current)
                    end
          break unless current
          break unless include_in_expansion?(current)

          siblings << current
        end
        siblings
      end

      def include_in_expansion?(node)
        EXPANSION_TAGS.include?(node.name)
      end

      # Builds a fresh <article> element holding +nodes+.
      # NOTE(review): appending to +container.children+ presumably re-parents
      # the nodes away from the source document — confirm later pipeline stages
      # do not rely on their original parents.
      def wrap_fragment(nodes)
        container = Oga::XML::Element.new(name: "article")
        nodes.each { |node| container.children << node }
        container
      end

      # Logistic curve centred on a score of 100 (scale 12), capped at 0.95.
      # Non-positive scores map to 0.0.
      def confidence(score)
        return 0.0 if score.to_f <= 0.0

        value = 1.0 / (1.0 + Math.exp(-(score - 100) / 12.0))
        value.clamp(0.0, 0.95)
      end

      # First available of og:title, twitter:title, meta title, then <title>.
      def title_from_meta(doc)
        Utilities.meta_content(
          doc,
          "meta[property='og:title']",
          "meta[name='twitter:title']",
          "meta[name='title']"
        ) || doc.at_css("title")&.text&.strip
      end

      # Passes the first matching publication-date meta value to Utilities.parse_time.
      def published_at_from_meta(doc)
        Utilities.parse_time(
          Utilities.meta_content(
            doc,
            "meta[property='article:published_time']",
            "meta[name='pubdate']",
            "meta[name='publish_date']",
            "meta[name='date']"
          )
        )
      end

      # Author taken from the author / article:author meta tags.
      def byline_from_meta(doc)
        Utilities.meta_content(
          doc,
          "meta[name='author']",
          "meta[property='article:author']"
        )
      end
    end
  end
end
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Coelacanth
  class Extractor
    # Collects image metadata from the extracted DOM node.
    class ImageCollector
      # @param node [#css, nil] extracted node; nil yields an empty list
      # @return [Array<Hash>] one +{ src:, alt: }+ hash per <img> under +node+,
      #   in the order +node.css+ returns them, with surrounding whitespace
      #   stripped; images whose +src+ is missing or blank are left out
      def call(node)
        return [] unless node

        # Single pass: build the entry and drop blank sources in one step.
        node.css("img").filter_map do |image|
          src = image["src"].to_s.strip
          next if src.empty?

          { src: src, alt: image["alt"].to_s.strip }
        end
      end
    end
  end
end
|