rubycrawl 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9fb671708a756ff233448c5d18ac0bd815eb44ecf8d0a8c893fcf8107f95c182
4
- data.tar.gz: 66898cb3f978494123441044319105e027f670557cd61430a184e0e1d105a3ed
3
+ metadata.gz: c38e6b7b377a04d6baec4756a7bdf749580e5391d42483b9f6f7e50ee0cbd25f
4
+ data.tar.gz: 8323d9dbe93915b2f81fb6adbd6056b0007ef3ac58a828feb3492d14e02b7423
5
5
  SHA512:
6
- metadata.gz: d715984a47719f7c022c512bf136c7bd01050b67ea1dba44f9ea2e24b93196525f46c31e95402c18ba2afbceba02a734f8f214ee686522260557589fca7fea01
7
- data.tar.gz: 621e0c5ee326f2757c5459ca763bffd24bfa70baa9c33d820003d7bae0da0ce20529e1bff95136893b846fd80efc70129873ee87ccdb7c2bef5926879d01e6ba
6
+ metadata.gz: 2905355938f1f18c747c83bdcc1360f88c887026d8b2242c00a87727cb32ab9954927a24ea12975d8c89a1f0e358ffab22ed458b2cac1d8a9acfb9537bb03eca
7
+ data.tar.gz: 556b1d58707d72698a8e537dc41e8a0b4d47656b501b1cbdcff9db1532180d4d28b4f443f505789454da381efa52adf722c501eb85212564809b6571d504ee55
data/README.md CHANGED
@@ -1,6 +1,7 @@
1
1
  # RubyCrawl 🎭
2
2
 
3
3
  [![Gem Version](https://badge.fury.io/rb/rubycrawl.svg)](https://rubygems.org/gems/rubycrawl)
4
+ [![CI](https://github.com/craft-wise/rubycrawl/actions/workflows/ci.yml/badge.svg)](https://github.com/craft-wise/rubycrawl/actions/workflows/ci.yml)
4
5
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
5
6
  [![Ruby](https://img.shields.io/badge/ruby-%3E%3D%203.0-red.svg)](https://www.ruby-lang.org/)
6
7
 
@@ -16,6 +17,7 @@ RubyCrawl provides **accurate, JavaScript-enabled web scraping** using a pure Ru
16
17
  - ✅ **Production-ready** — Auto-retry, error handling, resource optimization
17
18
  - ✅ **Multi-page crawling** — BFS algorithm with smart URL deduplication
18
19
  - ✅ **Rails-friendly** — Generators, initializers, and ActiveJob integration
20
+ - ✅ **Readability-powered** — Mozilla Readability.js for article-quality extraction, heuristic fallback for all other pages
19
21
 
20
22
  ```ruby
21
23
  # One line to crawl any JavaScript-heavy site
@@ -35,7 +37,7 @@ result.metadata # Title, description, OG tags, etc.
35
37
  - **Simple API**: Clean Ruby interface — zero Ferrum or CDP knowledge required
36
38
  - **Resource optimization**: Built-in resource blocking for 2-3x faster crawls
37
39
  - **Auto-managed browsers**: Lazy Chrome singleton, isolated page per crawl
38
- - **Content extraction**: HTML, plain text, clean HTML, Markdown (lazy), links, metadata
40
+ - **Content extraction**: Mozilla Readability.js (primary) + link-density heuristic (fallback) — article-quality `clean_html`, `clean_text`, `clean_markdown`, links, metadata
39
41
  - **Multi-page crawling**: BFS crawler with configurable depth limits and URL deduplication
40
42
  - **Smart URL handling**: Automatic normalization, tracking parameter removal, same-host filtering
41
43
  - **Rails integration**: First-class Rails support with generators and initializers
@@ -102,14 +104,15 @@ require "rubycrawl"
102
104
  result = RubyCrawl.crawl("https://example.com")
103
105
 
104
106
  # Access extracted content
105
- result.final_url # Final URL after redirects
106
- result.clean_text # Noise-stripped plain text (no nav/footer/ads)
107
- result.clean_html # Noise-stripped HTML (same noise removed as clean_text)
108
- result.raw_text # Full body.innerText (unfiltered)
109
- result.html # Full raw HTML content
110
- result.links # Extracted links with url, text, title, rel
111
- result.metadata # Title, description, OG tags, etc.
112
- result.clean_markdown # Markdown converted from clean_html (lazy first access only)
107
+ result.final_url # Final URL after redirects
108
+ result.clean_text # Noise-stripped plain text (no nav/footer/ads)
109
+ result.clean_html # Noise-stripped HTML (same noise removed as clean_text)
110
+ result.raw_text # Full body.innerText (unfiltered)
111
+ result.html # Full raw HTML content
112
+ result.links # Extracted links with url, text, title, rel
113
+ result.metadata # Title, description, OG tags, etc.
114
+ result.metadata['extractor'] # "readability" or "heuristic" — which extractor ran
115
+ result.clean_markdown # Markdown converted from clean_html (lazy — first access only)
113
116
  ```
114
117
 
115
118
  ## Use Cases
@@ -187,13 +190,38 @@ puts "Indexed #{pages_crawled} pages"
187
190
 
188
191
  #### Multi-Page Options
189
192
 
190
- | Option | Default | Description |
191
- | ----------------- | --------- | ------------------------------------ |
192
- | `max_pages` | 50 | Maximum number of pages to crawl |
193
- | `max_depth` | 3 | Maximum link depth from start URL |
194
- | `same_host_only` | true | Only follow links on the same domain |
195
- | `wait_until` | inherited | Page load strategy |
196
- | `block_resources` | inherited | Block images/fonts/CSS |
193
+ | Option | Default | Description |
194
+ | ---------------------- | --------- | --------------------------------------------------- |
195
+ | `max_pages` | 50 | Maximum number of pages to crawl |
196
+ | `max_depth` | 3 | Maximum link depth from start URL |
197
+ | `same_host_only` | true | Only follow links on the same domain |
198
+ | `wait_until` | inherited | Page load strategy |
199
+ | `block_resources` | inherited | Block images/fonts/CSS |
200
+ | `respect_robots_txt`   | false     | Honour robots.txt rules and auto-sleep `Crawl-delay` |
201
+
202
+ #### robots.txt Support
203
+
204
+ When `respect_robots_txt: true`, RubyCrawl fetches `robots.txt` once at the start of the crawl and:
205
+
206
+ - Skips any URL disallowed for `User-agent: *`
207
+ - Automatically sleeps the `Crawl-delay` specified in robots.txt between pages
208
+
209
+ ```ruby
210
+ RubyCrawl.crawl_site("https://example.com",
211
+ respect_robots_txt: true,
212
+ max_pages: 100
213
+ ) do |page|
214
+ puts page.url
215
+ end
216
+ ```
217
+
218
+ Or enable globally:
219
+
220
+ ```ruby
221
+ RubyCrawl.configure(respect_robots_txt: true)
222
+ ```
223
+
224
+ If robots.txt is unreachable or missing, crawling proceeds normally (fail open).
197
225
 
198
226
  #### Page Result Object
199
227
 
@@ -245,11 +273,12 @@ result = RubyCrawl.crawl(
245
273
 
246
274
  | Option | Values | Default | Description |
247
275
  | ----------------- | ----------------------------------------------------------- | ------- | --------------------------------------------------- |
248
- | `wait_until` | `"load"`, `"domcontentloaded"`, `"networkidle"`, `"commit"` | `nil` | When to consider page loaded (nil = Ferrum default) |
249
- | `block_resources` | `true`, `false` | `nil` | Block images, fonts, CSS, media for faster crawls |
250
- | `max_attempts` | Integer | `3` | Total number of attempts (including the first) |
251
- | `timeout` | Integer (seconds) | `30` | Browser navigation timeout |
252
- | `headless` | `true`, `false` | `true` | Run Chrome headlessly |
276
+ | `wait_until` | `"load"`, `"domcontentloaded"`, `"networkidle"`, `"commit"` | `nil` | When to consider page loaded (nil = Ferrum default) |
277
+ | `block_resources` | `true`, `false` | `nil` | Block images, fonts, CSS, media for faster crawls |
278
+ | `max_attempts` | Integer | `3` | Total number of attempts (including the first) |
279
+ | `timeout` | Integer (seconds) | `30` | Browser navigation timeout |
280
+ | `headless` | `true`, `false` | `true` | Run Chrome headlessly |
281
+ | `respect_robots_txt` | `true`, `false` | `false` | Honour robots.txt rules and auto-sleep Crawl-delay |
253
282
 
254
283
  **Wait strategies explained:**
255
284
 
@@ -318,7 +347,8 @@ result.metadata
318
347
  # "twitter_image" => "https://...",
319
348
  # "canonical" => "https://...",
320
349
  # "lang" => "en",
321
- # "charset" => "UTF-8"
350
+ # "charset" => "UTF-8",
351
+ # "extractor" => "readability" # or "heuristic"
322
352
  # }
323
353
  ```
324
354
 
@@ -473,25 +503,45 @@ RubyCrawl uses a single-process architecture:
473
503
  ```
474
504
  RubyCrawl (public API)
475
505
 
476
- Browser (lib/rubycrawl/browser.rb) ← Ferrum wrapper
506
+ Browser (lib/rubycrawl/browser.rb) ← Ferrum wrapper
477
507
 
478
- Ferrum::Browser ← Chrome DevTools Protocol (pure Ruby)
508
+ Ferrum::Browser ← Chrome DevTools Protocol (pure Ruby)
479
509
 
480
- Chromium ← headless browser
510
+ Chromium ← headless browser
511
+
512
+ Readability.js → heuristic fallback ← content extraction (inside browser)
481
513
  ```
482
514
 
483
515
  - Chrome launches once lazily and is reused across all crawls
484
516
  - Each crawl gets an isolated page context (own cookies/storage)
485
- - JS extraction runs inside the browser via `page.evaluate()`
517
+ - Content extraction runs inside the browser via `page.evaluate()`:
518
+ - **Primary**: Mozilla Readability.js — article-quality extraction for blogs, docs, news
519
+ - **Fallback**: link-density heuristic — covers marketing pages, homepages, SPAs
520
+ - `result.metadata['extractor']` tells you which path was used (`"readability"` or `"heuristic"`)
486
521
  - No separate processes, no HTTP boundary, no Node.js
487
522
 
488
523
  ## Performance
489
524
 
490
525
  - **Resource blocking**: Keep `block_resources: true` (default: nil) to skip images/fonts/CSS for 2-3x faster crawls
491
526
  - **Wait strategy**: Use `wait_until: "load"` for static sites, `"networkidle"` for SPAs
492
- - **Concurrency**: Use background jobs (Sidekiq, GoodJob, etc.) for parallel crawling
493
527
  - **Browser reuse**: The first crawl is slower (~2s) due to Chrome launch; subsequent crawls are much faster (~200-500ms)
494
528
 
529
+ ### Parallelism
530
+
531
+ RubyCrawl does not support parallel page loading within a single process — Ferrum uses one Chrome instance and concurrent access is not thread-safe.
532
+
533
+ The recommended pattern is **job-level parallelism**: each background job gets its own `RubyCrawl` instance and Chrome process, with natural rate limiting via your job queue's concurrency setting:
534
+
535
+ ```ruby
536
+ # Enqueue independent crawls — each job runs its own Chrome
537
+ urls.each { |url| CrawlJob.perform_later(url) }
538
+
539
+ # Control concurrency via your queue worker config (Sidekiq, GoodJob, etc.)
540
+ # e.g. Sidekiq concurrency: 3 → 3 Chrome processes crawling in parallel
541
+ ```
542
+
543
+ This also works naturally with `respect_robots_txt: true` — each job respects Crawl-delay independently.
544
+
495
545
  ## Development
496
546
 
497
547
  ```bash
@@ -499,12 +549,9 @@ git clone git@github.com:craft-wise/rubycrawl.git
499
549
  cd rubycrawl
500
550
  bin/setup
501
551
 
502
- # Run unit tests (no browser required)
552
+ # Run all tests (Chrome required — installed as a gem dependency)
503
553
  bundle exec rspec
504
554
 
505
- # Run integration tests (requires Chrome)
506
- INTEGRATION=1 bundle exec rspec
507
-
508
555
  # Manual testing
509
556
  bin/console
510
557
  > RubyCrawl.crawl("https://example.com")
@@ -528,7 +575,9 @@ The gem is available as open source under the terms of the [MIT License](LICENSE
528
575
 
529
576
  Built with [Ferrum](https://github.com/rubycdp/ferrum) — pure Ruby Chrome DevTools Protocol client.
530
577
 
531
- Powered by [reverse_markdown](https://github.com/xijo/reverse_markdown) for GitHub-flavored Markdown conversion.
578
+ Content extraction powered by [Mozilla Readability.js](https://github.com/mozilla/readability) — the algorithm behind Firefox Reader View.
579
+
580
+ Markdown conversion powered by [reverse_markdown](https://github.com/xijo/reverse_markdown) for GitHub-flavored output.
532
581
 
533
582
  ## Support
534
583
 
@@ -3,13 +3,10 @@
3
3
  class RubyCrawl
4
4
  class Browser
5
5
  # JavaScript extraction constants, evaluated inside Chromium via page.evaluate().
6
- # Ported verbatim from node/src/index.js — logic is unchanged.
7
- # NOISE_SELECTORS is interpolated directly into EXTRACT_CONTENT_JS (no need to
8
- # pass as a JS argument as the Node version did).
6
+ # All constants are IIFEs — Ferrum's page.evaluate() evaluates an expression,
7
+ # it does NOT call function definitions. Wrapping as (() => { ... })() ensures
8
+ # the function is immediately invoked and its return value is captured.
9
9
  module Extraction
10
- # All constants are IIFEs — Ferrum's page.evaluate() evaluates an expression,
11
- # it does NOT call function definitions. Wrapping as (() => { ... })() ensures
12
- # the function is immediately invoked and its return value is captured.
13
10
  EXTRACT_METADATA_JS = <<~JS
14
11
  (() => {
15
12
  const getMeta = (name) => {
@@ -54,8 +51,7 @@ class RubyCrawl
54
51
  (() => (document.body?.innerText || "").trim())()
55
52
  JS
56
53
 
57
- # Semantic noise selectors — covers standard HTML5 elements and ARIA roles.
58
- # Interpolated directly into EXTRACT_CONTENT_JS as a string literal.
54
+ # Semantic noise selectors — used by the heuristic fallback.
59
55
  NOISE_SELECTORS = [
60
56
  'nav', 'header', 'footer', 'aside',
61
57
  '[role="navigation"]', '[role="banner"]', '[role="contentinfo"]',
@@ -64,11 +60,37 @@ class RubyCrawl
64
60
  'script', 'style', 'noscript', 'iframe'
65
61
  ].join(', ').freeze
66
62
 
67
- # Removes semantic noise (nav/header/footer/aside + ARIA roles) and high
68
- # link-density containers, then returns both clean plain text and clean HTML.
69
- # DOM mutations are reversed after extraction so the page is unchanged.
63
+ # Mozilla Readability.js v0.6.0 vendored source, read once at load time.
64
+ # Embedded inside EXTRACT_CONTENT_JS's outer IIFE so Readability is defined
65
+ # and used within the same Runtime.evaluate expression (Ferrum evaluates a
66
+ # single expression — separate evaluate calls have separate scopes).
67
+ READABILITY_JS = File.read(File.join(__dir__, 'readability.js')).freeze
68
+
69
+ # Extracts clean article HTML using Mozilla Readability (primary) with a
70
+ # link-density heuristic as fallback when Readability returns no content.
71
+ # Everything is wrapped in one outer IIFE so page.evaluate gets a single
72
+ # expression and Readability is in scope for the extraction logic.
73
+ # DOM mutations from the fallback path are reversed after extraction.
70
74
  EXTRACT_CONTENT_JS = <<~JS.freeze
71
75
  (() => {
76
+ // Mozilla Readability.js v0.6.0 — defined in this IIFE's scope.
77
+ #{READABILITY_JS}
78
+
79
+ // Primary: Mozilla Readability — article-quality extraction.
80
+ let readabilityDebug = null;
81
+ try {
82
+ const docClone = document.cloneNode(true);
83
+ const reader = new Readability(docClone, { charThreshold: 100 });
84
+ const article = reader.parse();
85
+ if (article && article.textContent && article.textContent.trim().length > 200) {
86
+ return { cleanHtml: article.content, extractor: "readability" };
87
+ }
88
+ readabilityDebug = article ? `returned ${article.textContent?.trim().length ?? 0} text chars (below threshold)` : "returned null (no article detected)";
89
+ } catch (e) {
90
+ readabilityDebug = `error: ${e.message}`;
91
+ }
92
+
93
+ // Fallback: link-density heuristic (works on nav-heavy / non-article pages).
72
94
  const noiseSelectors = #{NOISE_SELECTORS.to_json};
73
95
  function linkDensity(el) {
74
96
  const total = (el.innerText || "").trim().length;
@@ -98,7 +120,7 @@ class RubyCrawl
98
120
  }
99
121
  const cleanHtml = document.body.innerHTML;
100
122
  removed.reverse().forEach(({ el, parent, next }) => parent.insertBefore(el, next));
101
- return { cleanHtml };
123
+ return { cleanHtml, extractor: "heuristic", debug: readabilityDebug };
102
124
  })()
103
125
  JS
104
126
  end