rubycrawl 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +81 -32
- data/lib/rubycrawl/browser/extraction.rb +34 -12
- data/lib/rubycrawl/browser/readability.js +2786 -0
- data/lib/rubycrawl/browser.rb +1 -1
- data/lib/rubycrawl/robots_parser.rb +86 -0
- data/lib/rubycrawl/site_crawler.rb +15 -1
- data/lib/rubycrawl/tasks/install.rake +6 -5
- data/lib/rubycrawl/version.rb +1 -1
- data/lib/rubycrawl.rb +9 -7
- metadata +4 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: c38e6b7b377a04d6baec4756a7bdf749580e5391d42483b9f6f7e50ee0cbd25f
|
|
4
|
+
data.tar.gz: 8323d9dbe93915b2f81fb6adbd6056b0007ef3ac58a828feb3492d14e02b7423
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 2905355938f1f18c747c83bdcc1360f88c887026d8b2242c00a87727cb32ab9954927a24ea12975d8c89a1f0e358ffab22ed458b2cac1d8a9acfb9537bb03eca
|
|
7
|
+
data.tar.gz: 556b1d58707d72698a8e537dc41e8a0b4d47656b501b1cbdcff9db1532180d4d28b4f443f505789454da381efa52adf722c501eb85212564809b6571d504ee55
|
data/README.md
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
# RubyCrawl 🎭
|
|
2
2
|
|
|
3
3
|
[Gem Version](https://rubygems.org/gems/rubycrawl)
|
|
4
|
+
[CI](https://github.com/craft-wise/rubycrawl/actions/workflows/ci.yml)
|
|
4
5
|
[License: MIT](https://opensource.org/licenses/MIT)
|
|
5
6
|
[Ruby](https://www.ruby-lang.org/)
|
|
6
7
|
|
|
@@ -16,6 +17,7 @@ RubyCrawl provides **accurate, JavaScript-enabled web scraping** using a pure Ru
|
|
|
16
17
|
- ✅ **Production-ready** — Auto-retry, error handling, resource optimization
|
|
17
18
|
- ✅ **Multi-page crawling** — BFS algorithm with smart URL deduplication
|
|
18
19
|
- ✅ **Rails-friendly** — Generators, initializers, and ActiveJob integration
|
|
20
|
+
- ✅ **Readability-powered** — Mozilla Readability.js for article-quality extraction, heuristic fallback for all other pages
|
|
19
21
|
|
|
20
22
|
```ruby
|
|
21
23
|
# One line to crawl any JavaScript-heavy site
|
|
@@ -35,7 +37,7 @@ result.metadata # Title, description, OG tags, etc.
|
|
|
35
37
|
- **Simple API**: Clean Ruby interface — zero Ferrum or CDP knowledge required
|
|
36
38
|
- **Resource optimization**: Built-in resource blocking for 2-3x faster crawls
|
|
37
39
|
- **Auto-managed browsers**: Lazy Chrome singleton, isolated page per crawl
|
|
38
|
-
- **Content extraction**:
|
|
40
|
+
- **Content extraction**: Mozilla Readability.js (primary) + link-density heuristic (fallback) — article-quality `clean_html`, `clean_text`, `clean_markdown`, links, metadata
|
|
39
41
|
- **Multi-page crawling**: BFS crawler with configurable depth limits and URL deduplication
|
|
40
42
|
- **Smart URL handling**: Automatic normalization, tracking parameter removal, same-host filtering
|
|
41
43
|
- **Rails integration**: First-class Rails support with generators and initializers
|
|
@@ -102,14 +104,15 @@ require "rubycrawl"
|
|
|
102
104
|
result = RubyCrawl.crawl("https://example.com")
|
|
103
105
|
|
|
104
106
|
# Access extracted content
|
|
105
|
-
result.final_url
|
|
106
|
-
result.clean_text
|
|
107
|
-
result.clean_html
|
|
108
|
-
result.raw_text
|
|
109
|
-
result.html
|
|
110
|
-
result.links
|
|
111
|
-
result.metadata
|
|
112
|
-
result.
|
|
107
|
+
result.final_url # Final URL after redirects
|
|
108
|
+
result.clean_text # Noise-stripped plain text (no nav/footer/ads)
|
|
109
|
+
result.clean_html # Noise-stripped HTML (same noise removed as clean_text)
|
|
110
|
+
result.raw_text # Full body.innerText (unfiltered)
|
|
111
|
+
result.html # Full raw HTML content
|
|
112
|
+
result.links # Extracted links with url, text, title, rel
|
|
113
|
+
result.metadata # Title, description, OG tags, etc.
|
|
114
|
+
result.metadata['extractor'] # "readability" or "heuristic" — which extractor ran
|
|
115
|
+
result.clean_markdown # Markdown converted from clean_html (lazy — first access only)
|
|
113
116
|
```
|
|
114
117
|
|
|
115
118
|
## Use Cases
|
|
@@ -187,13 +190,38 @@ puts "Indexed #{pages_crawled} pages"
|
|
|
187
190
|
|
|
188
191
|
#### Multi-Page Options
|
|
189
192
|
|
|
190
|
-
| Option
|
|
191
|
-
|
|
|
192
|
-
| `max_pages`
|
|
193
|
-
| `max_depth`
|
|
194
|
-
| `same_host_only`
|
|
195
|
-
| `wait_until`
|
|
196
|
-
| `block_resources`
|
|
193
|
+
| Option | Default | Description |
|
|
194
|
+
| ---------------------- | --------- | --------------------------------------------------- |
|
|
195
|
+
| `max_pages` | 50 | Maximum number of pages to crawl |
|
|
196
|
+
| `max_depth` | 3 | Maximum link depth from start URL |
|
|
197
|
+
| `same_host_only` | true | Only follow links on the same domain |
|
|
198
|
+
| `wait_until` | inherited | Page load strategy |
|
|
199
|
+
| `block_resources` | inherited | Block images/fonts/CSS |
|
|
200
|
+
| `respect_robots_txt` | false | Honour robots.txt rules and auto-sleep `Crawl-delay`|
|
|
201
|
+
|
|
202
|
+
#### robots.txt Support
|
|
203
|
+
|
|
204
|
+
When `respect_robots_txt: true`, RubyCrawl fetches `robots.txt` once at the start of the crawl and:
|
|
205
|
+
|
|
206
|
+
- Skips any URL disallowed for `User-agent: *`
|
|
207
|
+
- Automatically sleeps for the `Crawl-delay` specified in robots.txt between pages
|
|
208
|
+
|
|
209
|
+
```ruby
|
|
210
|
+
RubyCrawl.crawl_site("https://example.com",
|
|
211
|
+
respect_robots_txt: true,
|
|
212
|
+
max_pages: 100
|
|
213
|
+
) do |page|
|
|
214
|
+
puts page.url
|
|
215
|
+
end
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
Or enable globally:
|
|
219
|
+
|
|
220
|
+
```ruby
|
|
221
|
+
RubyCrawl.configure(respect_robots_txt: true)
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
If robots.txt is unreachable or missing, crawling proceeds normally (fail open).
|
|
197
225
|
|
|
198
226
|
#### Page Result Object
|
|
199
227
|
|
|
@@ -245,11 +273,12 @@ result = RubyCrawl.crawl(
|
|
|
245
273
|
|
|
246
274
|
| Option | Values | Default | Description |
|
|
247
275
|
| ----------------- | ----------------------------------------------------------- | ------- | --------------------------------------------------- |
|
|
248
|
-
| `wait_until`
|
|
249
|
-
| `block_resources`
|
|
250
|
-
| `max_attempts`
|
|
251
|
-
| `timeout`
|
|
252
|
-
| `headless`
|
|
276
|
+
| `wait_until` | `"load"`, `"domcontentloaded"`, `"networkidle"`, `"commit"` | `nil` | When to consider page loaded (nil = Ferrum default) |
|
|
277
|
+
| `block_resources` | `true`, `false` | `nil` | Block images, fonts, CSS, media for faster crawls |
|
|
278
|
+
| `max_attempts` | Integer | `3` | Total number of attempts (including the first) |
|
|
279
|
+
| `timeout` | Integer (seconds) | `30` | Browser navigation timeout |
|
|
280
|
+
| `headless` | `true`, `false` | `true` | Run Chrome headlessly |
|
|
281
|
+
| `respect_robots_txt` | `true`, `false` | `false` | Honour robots.txt rules and auto-sleep Crawl-delay |
|
|
253
282
|
|
|
254
283
|
**Wait strategies explained:**
|
|
255
284
|
|
|
@@ -318,7 +347,8 @@ result.metadata
|
|
|
318
347
|
# "twitter_image" => "https://...",
|
|
319
348
|
# "canonical" => "https://...",
|
|
320
349
|
# "lang" => "en",
|
|
321
|
-
# "charset" => "UTF-8"
|
|
350
|
+
# "charset" => "UTF-8",
|
|
351
|
+
# "extractor" => "readability" # or "heuristic"
|
|
322
352
|
# }
|
|
323
353
|
```
|
|
324
354
|
|
|
@@ -473,25 +503,45 @@ RubyCrawl uses a single-process architecture:
|
|
|
473
503
|
```
|
|
474
504
|
RubyCrawl (public API)
|
|
475
505
|
↓
|
|
476
|
-
Browser (lib/rubycrawl/browser.rb)
|
|
506
|
+
Browser (lib/rubycrawl/browser.rb) ← Ferrum wrapper
|
|
477
507
|
↓
|
|
478
|
-
Ferrum::Browser
|
|
508
|
+
Ferrum::Browser ← Chrome DevTools Protocol (pure Ruby)
|
|
479
509
|
↓
|
|
480
|
-
Chromium
|
|
510
|
+
Chromium ← headless browser
|
|
511
|
+
↓
|
|
512
|
+
Readability.js → heuristic fallback ← content extraction (inside browser)
|
|
481
513
|
```
|
|
482
514
|
|
|
483
515
|
- Chrome launches once lazily and is reused across all crawls
|
|
484
516
|
- Each crawl gets an isolated page context (own cookies/storage)
|
|
485
|
-
-
|
|
517
|
+
- Content extraction runs inside the browser via `page.evaluate()`:
|
|
518
|
+
- **Primary**: Mozilla Readability.js — article-quality extraction for blogs, docs, news
|
|
519
|
+
- **Fallback**: link-density heuristic — covers marketing pages, homepages, SPAs
|
|
520
|
+
- `result.metadata['extractor']` tells you which path was used (`"readability"` or `"heuristic"`)
|
|
486
521
|
- No separate processes, no HTTP boundary, no Node.js
|
|
487
522
|
|
|
488
523
|
## Performance
|
|
489
524
|
|
|
490
525
|
- **Resource blocking**: Set `block_resources: true` (default: `nil`) to skip images/fonts/CSS for 2-3x faster crawls
|
|
491
526
|
- **Wait strategy**: Use `wait_until: "load"` for static sites, `"networkidle"` for SPAs
|
|
492
|
-
- **Concurrency**: Use background jobs (Sidekiq, GoodJob, etc.) for parallel crawling
|
|
493
527
|
- **Browser reuse**: The first crawl is slower (~2s) due to Chrome launch; subsequent crawls are much faster (~200-500ms)
|
|
494
528
|
|
|
529
|
+
### Parallelism
|
|
530
|
+
|
|
531
|
+
RubyCrawl does not support parallel page loading within a single process — Ferrum uses one Chrome instance and concurrent access is not thread-safe.
|
|
532
|
+
|
|
533
|
+
The recommended pattern is **job-level parallelism**: each background job gets its own `RubyCrawl` instance and Chrome process, with natural rate limiting via your job queue's concurrency setting:
|
|
534
|
+
|
|
535
|
+
```ruby
|
|
536
|
+
# Enqueue independent crawls — each job runs its own Chrome
|
|
537
|
+
urls.each { |url| CrawlJob.perform_later(url) }
|
|
538
|
+
|
|
539
|
+
# Control concurrency via your queue worker config (Sidekiq, GoodJob, etc.)
|
|
540
|
+
# e.g. Sidekiq concurrency: 3 → 3 Chrome processes crawling in parallel
|
|
541
|
+
```
|
|
542
|
+
|
|
543
|
+
This also works naturally with `respect_robots_txt: true` — each job respects Crawl-delay independently.
|
|
544
|
+
|
|
495
545
|
## Development
|
|
496
546
|
|
|
497
547
|
```bash
|
|
@@ -499,12 +549,9 @@ git clone git@github.com:craft-wise/rubycrawl.git
|
|
|
499
549
|
cd rubycrawl
|
|
500
550
|
bin/setup
|
|
501
551
|
|
|
502
|
-
# Run
|
|
552
|
+
# Run all tests (Chrome required — installed as a gem dependency)
|
|
503
553
|
bundle exec rspec
|
|
504
554
|
|
|
505
|
-
# Run integration tests (requires Chrome)
|
|
506
|
-
INTEGRATION=1 bundle exec rspec
|
|
507
|
-
|
|
508
555
|
# Manual testing
|
|
509
556
|
bin/console
|
|
510
557
|
> RubyCrawl.crawl("https://example.com")
|
|
@@ -528,7 +575,9 @@ The gem is available as open source under the terms of the [MIT License](LICENSE
|
|
|
528
575
|
|
|
529
576
|
Built with [Ferrum](https://github.com/rubycdp/ferrum) — pure Ruby Chrome DevTools Protocol client.
|
|
530
577
|
|
|
531
|
-
|
|
578
|
+
Content extraction powered by [Mozilla Readability.js](https://github.com/mozilla/readability) — the algorithm behind Firefox Reader View.
|
|
579
|
+
|
|
580
|
+
Markdown conversion powered by [reverse_markdown](https://github.com/xijo/reverse_markdown) for GitHub-flavored output.
|
|
532
581
|
|
|
533
582
|
## Support
|
|
534
583
|
|
|
data/lib/rubycrawl/browser/extraction.rb
CHANGED
|
@@ -3,13 +3,10 @@
|
|
|
3
3
|
class RubyCrawl
|
|
4
4
|
class Browser
|
|
5
5
|
# JavaScript extraction constants, evaluated inside Chromium via page.evaluate().
|
|
6
|
-
#
|
|
7
|
-
#
|
|
8
|
-
#
|
|
6
|
+
# All constants are IIFEs — Ferrum's page.evaluate() evaluates an expression,
|
|
7
|
+
# it does NOT call function definitions. Wrapping as (() => { ... })() ensures
|
|
8
|
+
# the function is immediately invoked and its return value is captured.
|
|
9
9
|
module Extraction
|
|
10
|
-
# All constants are IIFEs — Ferrum's page.evaluate() evaluates an expression,
|
|
11
|
-
# it does NOT call function definitions. Wrapping as (() => { ... })() ensures
|
|
12
|
-
# the function is immediately invoked and its return value is captured.
|
|
13
10
|
EXTRACT_METADATA_JS = <<~JS
|
|
14
11
|
(() => {
|
|
15
12
|
const getMeta = (name) => {
|
|
@@ -54,8 +51,7 @@ class RubyCrawl
|
|
|
54
51
|
(() => (document.body?.innerText || "").trim())()
|
|
55
52
|
JS
|
|
56
53
|
|
|
57
|
-
# Semantic noise selectors —
|
|
58
|
-
# Interpolated directly into EXTRACT_CONTENT_JS as a string literal.
|
|
54
|
+
# Semantic noise selectors — used by the heuristic fallback.
|
|
59
55
|
NOISE_SELECTORS = [
|
|
60
56
|
'nav', 'header', 'footer', 'aside',
|
|
61
57
|
'[role="navigation"]', '[role="banner"]', '[role="contentinfo"]',
|
|
@@ -64,11 +60,37 @@ class RubyCrawl
|
|
|
64
60
|
'script', 'style', 'noscript', 'iframe'
|
|
65
61
|
].join(', ').freeze
|
|
66
62
|
|
|
67
|
-
#
|
|
68
|
-
#
|
|
69
|
-
#
|
|
63
|
+
# Mozilla Readability.js v0.6.0 — vendored source, read once at load time.
|
|
64
|
+
# Embedded inside EXTRACT_CONTENT_JS's outer IIFE so Readability is defined
|
|
65
|
+
# and used within the same Runtime.evaluate expression (Ferrum evaluates a
|
|
66
|
+
# single expression — separate evaluate calls have separate scopes).
|
|
67
|
+
READABILITY_JS = File.read(File.join(__dir__, 'readability.js')).freeze
|
|
68
|
+
|
|
69
|
+
# Extracts clean article HTML using Mozilla Readability (primary) with a
|
|
70
|
+
# link-density heuristic as fallback when Readability returns no content.
|
|
71
|
+
# Everything is wrapped in one outer IIFE so page.evaluate gets a single
|
|
72
|
+
# expression and Readability is in scope for the extraction logic.
|
|
73
|
+
# DOM mutations from the fallback path are reversed after extraction.
|
|
70
74
|
EXTRACT_CONTENT_JS = <<~JS.freeze
|
|
71
75
|
(() => {
|
|
76
|
+
// Mozilla Readability.js v0.6.0 — defined in this IIFE's scope.
|
|
77
|
+
#{READABILITY_JS}
|
|
78
|
+
|
|
79
|
+
// Primary: Mozilla Readability — article-quality extraction.
|
|
80
|
+
let readabilityDebug = null;
|
|
81
|
+
try {
|
|
82
|
+
const docClone = document.cloneNode(true);
|
|
83
|
+
const reader = new Readability(docClone, { charThreshold: 100 });
|
|
84
|
+
const article = reader.parse();
|
|
85
|
+
if (article && article.textContent && article.textContent.trim().length > 200) {
|
|
86
|
+
return { cleanHtml: article.content, extractor: "readability" };
|
|
87
|
+
}
|
|
88
|
+
readabilityDebug = article ? `returned ${article.textContent?.trim().length ?? 0} text chars (below threshold)` : "returned null (no article detected)";
|
|
89
|
+
} catch (e) {
|
|
90
|
+
readabilityDebug = `error: ${e.message}`;
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
// Fallback: link-density heuristic (works on nav-heavy / non-article pages).
|
|
72
94
|
const noiseSelectors = #{NOISE_SELECTORS.to_json};
|
|
73
95
|
function linkDensity(el) {
|
|
74
96
|
const total = (el.innerText || "").trim().length;
|
|
@@ -98,7 +120,7 @@ class RubyCrawl
|
|
|
98
120
|
}
|
|
99
121
|
const cleanHtml = document.body.innerHTML;
|
|
100
122
|
removed.reverse().forEach(({ el, parent, next }) => parent.insertBefore(el, next));
|
|
101
|
-
return { cleanHtml };
|
|
123
|
+
return { cleanHtml, extractor: "heuristic", debug: readabilityDebug };
|
|
102
124
|
})()
|
|
103
125
|
JS
|
|
104
126
|
end
|