scraply 2.0.0 → 2.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +7 -3
- package/readme.md +149 -55
- package/src/config/browser.js +37 -0
- package/src/config/defaults.js +47 -11
- package/src/config/load.js +57 -1
- package/src/core/errors.js +23 -0
- package/src/core/queue.js +83 -11
- package/src/core/retry.js +34 -26
- package/src/crawler.js +265 -76
- package/src/extract/extract.js +17 -3
- package/src/extract/links.js +4 -4
- package/src/extract/parse.js +35 -0
- package/src/extract/sitemap.js +35 -0
- package/src/fetchers/browserFetcher.js +18 -12
- package/src/fetchers/httpFetcher.js +40 -3
- package/src/index.d.ts +285 -0
- package/src/index.js +48 -7
- package/src/output/writers.js +14 -5
package/package.json
CHANGED
|
@@ -1,11 +1,16 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "scraply",
|
|
3
3
|
"description": "A simple, configurable and functional content scraper",
|
|
4
|
-
"version": "2.0.
|
|
4
|
+
"version": "2.0.2",
|
|
5
5
|
"main": "src/index.js",
|
|
6
|
+
"types": "./src/index.d.ts",
|
|
6
7
|
"type": "module",
|
|
7
8
|
"exports": {
|
|
8
|
-
".":
|
|
9
|
+
".": {
|
|
10
|
+
"types": "./src/index.d.ts",
|
|
11
|
+
"import": "./src/index.js",
|
|
12
|
+
"default": "./src/index.js"
|
|
13
|
+
}
|
|
9
14
|
},
|
|
10
15
|
"files": [
|
|
11
16
|
"src"
|
|
@@ -14,7 +19,6 @@
|
|
|
14
19
|
"node": ">=18"
|
|
15
20
|
},
|
|
16
21
|
"scripts": {
|
|
17
|
-
"start": "node .",
|
|
18
22
|
"dev": "node src/dev.js"
|
|
19
23
|
},
|
|
20
24
|
"keywords": [
|
package/readme.md
CHANGED
|
@@ -32,26 +32,79 @@ await scraply({
|
|
|
32
32
|
This crawls `example.com`, extracts the readable text of every allowed page, and writes the results to `dataset/formatted/example.json`.
|
|
33
33
|
|
|
34
34
|
## How Scraply works
|
|
35
|
-
1. The crawl is seeded from `startUrls
|
|
35
|
+
1. The crawl is seeded from `startUrls` (and optionally a [sitemap](#sitemap-seeding)).
|
|
36
36
|
2. Each page is fetched, its links are discovered and filtered (`include` / `exclude`), and new links are queued.
|
|
37
|
-
3. The
|
|
37
|
+
3. The body is processed [based on its `Content-Type`](#content-type-aware-extraction) and saved under `dataset/crawled/` as `{ url, content, crawledAt, hash }` (`crawledAt` is an ISO timestamp; `hash` is the SHA-256 of `content`, handy for change detection). JSON responses additionally carry the parsed value on `data`.
|
|
38
38
|
4. When the queue drains, all crawled pages are routed by URL into the files defined in `output.routes` and written to `dataset/formatted/`.
|
|
39
39
|
|
|
40
|
+
Each queue entry ends in one of three terminal states: **crawled** (saved), **skipped** (disallowed `Content-Type`), or **error** (fetch failed). The three are tracked separately so stats stay meaningful.
|
|
41
|
+
|
|
42
|
+
### Content-type-aware extraction
|
|
43
|
+
The body is handled according to its `Content-Type`:
|
|
44
|
+
|
|
45
|
+
- **HTML** — links are discovered, then readable text is extracted. Use `extract.root` to allow-list the container(s) to read from (e.g. `'main'`), and `extract.removeSelectors` to strip noise. When `root` matches nothing it falls back to `extract.rootFallback` (default `<body>`).
|
|
46
|
+
- **JSON** — parsed and stored as pretty-printed `content`, with the parsed value also exposed on `record.data` (set `extract.json: false` to keep the raw text instead). The `extract` hook still runs (with `$` as `null`).
|
|
47
|
+
- **Other text** — stored as-is.
|
|
48
|
+
|
|
49
|
+
Only responses whose `Content-Type` matches `allowedContentTypes` are processed; everything else is skipped (and fires the `skip` hook).
|
|
50
|
+
|
|
51
|
+
```js
|
|
52
|
+
await scraply({
|
|
53
|
+
startUrls: ['https://docs.example.com'],
|
|
54
|
+
allowedContentTypes: ['text/html', 'application/json'],
|
|
55
|
+
extract: { root: 'main' }
|
|
56
|
+
});
|
|
57
|
+
```
|
|
58
|
+
|
|
40
59
|
### Persistence and resuming
|
|
41
60
|
The queue and crawled pages are checkpointed to disk in `dataset/`. If a run is interrupted (or rate-limited), progress is saved and the next run resumes exactly where it left off without re-crawling finished URLs. When every URL has been processed, Scraply starts a fresh crawl (set `crawl.resetOnComplete: false` to keep the finished queue instead).
|
|
42
61
|
|
|
43
|
-
|
|
44
|
-
|
|
62
|
+
- Re-attempt **failed** URLs on the next run with `crawl.retryErrors: true` (or call `requeueErrors()` and crawl again).
|
|
63
|
+
- Re-attempt **skipped** URLs with `crawl.retrySkipped: true` (or `requeueSkipped()`) — handy after widening `allowedContentTypes` or changing `sites`.
|
|
64
|
+
|
|
65
|
+
### Concurrency and limits
|
|
66
|
+
Pages are crawled with a worker pool (`crawl.concurrency`). Requests to the same host are spaced by `crawl.delay` for politeness, while different hosts run in parallel. `crawl.maxDepth` bounds link depth and `crawl.maxPages` caps the total number of successfully crawled pages (counted across resumes).
|
|
45
67
|
|
|
46
68
|
### Rate limiting
|
|
47
|
-
On HTTP `429`, Scraply
|
|
69
|
+
On HTTP `429`, Scraply waits (honoring `retry-after` / `x-ratelimit-reset`) and retries — independently of the normal `retry` budget. This is the default (`rateLimit.exitOnLimit: false`). Set `rateLimit.exitOnLimit: true` to instead abort the crawl by throwing a `RateLimitError` (carrying `error.code = rateLimit.exitCode`); because the queue is persistent, a later run resumes where it stopped. Scraply never calls `process.exit` on your behalf — catch the error and decide what to do (e.g. `process.exit(error.code)` from a CLI).
|
|
70
|
+
|
|
71
|
+
```js
|
|
72
|
+
import { scraply, RateLimitError } from 'scraply';
|
|
73
|
+
|
|
74
|
+
try {
|
|
75
|
+
await scraply({ startUrls: ['https://example.com'], rateLimit: { exitOnLimit: true } });
|
|
76
|
+
} catch (error) {
|
|
77
|
+
if (error instanceof RateLimitError) process.exit(error.code);
|
|
78
|
+
throw error;
|
|
79
|
+
}
|
|
80
|
+
```
|
|
48
81
|
|
|
49
82
|
## Fetchers
|
|
50
83
|
`fetcher` selects the backend:
|
|
51
|
-
- `'http'` (default): fast static fetching with the native `fetch`.
|
|
84
|
+
- `'http'` (default): fast static fetching with the native `fetch`. Redirects are followed up to `request.maxRedirects`, and response bodies larger than `request.maxContentLength` (default 20 MB, `0` disables) are rejected before they are buffered.
|
|
52
85
|
- `'browser'`: full JavaScript rendering via Puppeteer (`puppeteer-cluster`).
|
|
53
86
|
- a custom object implementing the `Fetcher` interface (`{ name, fetch, init?, close? }`), so backends like Playwright or a remote CDP browser can be plugged in without changing the crawler.
|
|
54
87
|
|
|
88
|
+
Both built-in fetchers send `request.userAgent` and any extra `request.headers` (e.g. `Authorization`, `Accept-Language`, `Cookie`) with every request.
|
|
89
|
+
|
|
90
|
+
### Browser fetcher options
|
|
91
|
+
The `browser` block applies only when `fetcher: 'browser'`. Both options are validated at config load time. See [`src/config/defaults.js`](src/config/defaults.js) for defaults.
|
|
92
|
+
|
|
93
|
+
- **`browser.waitUntil`** — passed to Puppeteer `page.goto`. Default `'load'`. Use `'networkidle2'` for SPAs that inject links or content after the initial load (Vue/React sites). Increase `request.timeout` when using slower modes.
|
|
94
|
+
- **`browser.blockResources`** — Puppeteer resource types to abort during fetch (`'image'`, `'stylesheet'`, `'font'`, `'media'`). Default `['image', 'font', 'media']`. Stylesheets are excluded by default because many SPAs need CSS before content renders. Pass `[]` to disable resource blocking entirely.
|
|
95
|
+
|
|
96
|
+
```js
|
|
97
|
+
await scraply({
|
|
98
|
+
startUrls: ['https://spa.example.com/products'],
|
|
99
|
+
fetcher: 'browser',
|
|
100
|
+
browser: {
|
|
101
|
+
waitUntil: 'networkidle2',
|
|
102
|
+
blockResources: ['image', 'font', 'media']
|
|
103
|
+
},
|
|
104
|
+
request: { timeout: 60000 }
|
|
105
|
+
});
|
|
106
|
+
```
|
|
107
|
+
|
|
55
108
|
## Programmatic API
|
|
56
109
|
`createCrawler(config)` returns an instance exposing each stage, plus lifecycle hooks:
|
|
57
110
|
|
|
@@ -66,74 +119,112 @@ crawler.on('page', (record) => console.log('crawled', record.url));
|
|
|
66
119
|
// Veto links before they are queued.
|
|
67
120
|
crawler.on('shouldEnqueue', (url) => !url.includes('/admin'));
|
|
68
121
|
|
|
69
|
-
// Transform the stored record.
|
|
122
|
+
// Transform the stored record (runs before it is persisted, so changes are saved).
|
|
70
123
|
crawler.on('transform', (record) => ({ ...record, length: record.content.length }));
|
|
71
124
|
|
|
72
125
|
await crawler.run();
|
|
73
126
|
```
|
|
74
127
|
|
|
75
|
-
Instance methods: `run()`, `crawl()`, `fetch(url)`, `extract(html, url)`, `enqueue(urls, opts)`, `format(records?)`, `stop()`, `on(event, fn)`.
|
|
128
|
+
Instance methods: `run()`, `crawl()`, `fetch(url)`, `extract(html, url)`, `enqueue(urls, opts)`, `format(records?)`, `requeueErrors()`, `requeueSkipped()`, `stop()`, `on(event, fn)`.
|
|
129
|
+
|
|
130
|
+
`format()` reads crawled pages from `dataset/crawled/` via the persisted queue. You can call it alone to re-route output after a crawl — no need to fetch pages again.
|
|
131
|
+
|
|
132
|
+
### Hooks
|
|
133
|
+
Register with `crawler.on(name, fn)` (returns an unsubscribe function). Async handlers are awaited. **Reduce** hooks may return a replacement value; **emit** hooks are side-effect only.
|
|
134
|
+
|
|
135
|
+
| Hook | Type | Arguments | Notes |
|
|
136
|
+
|---|---|---|---|
|
|
137
|
+
| `response` | emit | `(result, entry)` | Raw `{ data, status, headers }`, before the content-type gate. |
|
|
138
|
+
| `skip` | emit | `(entry, { reason, status, result })` | A response was skipped (e.g. disallowed `Content-Type`). |
|
|
139
|
+
| `shouldEnqueue` | reduce | `(allow, url, referrer)` | Return `false` to veto a URL. |
|
|
140
|
+
| `links` | reduce | `(links, $, entry, result)` | Add/replace discovered links before they are enqueued. `$` is `null` for non-HTML — useful to pull URLs out of a JSON API response. |
|
|
141
|
+
| `extract` | reduce | `(content, $, entry, result)` | Replace the extracted content. `$` is `null` for non-HTML. `result` gives raw access to the body/headers. |
|
|
142
|
+
| `transform` | reduce | `(record, entry, result)` | Replace the record **before** it is saved and formatted. |
|
|
143
|
+
| `page` | emit | `(record, entry, result)` | Fires after the record is persisted. |
|
|
144
|
+
| `error` | emit | `(error, entry)` | A fetch/process failed. |
|
|
76
145
|
|
|
77
|
-
|
|
146
|
+
Standalone exports for advanced use: `runCrawlers`, `RateLimitError`, `normalizeUrl`, `matchesPattern`, `matchesAnyPattern`, `extractText`, `discoverLinks`, `classifyContentType`, `parseJson`, `parseSitemap`, `routeRecord`, `writeRecords`, `formatRecords`, `loadConfig`, `DEFAULT_CONFIG`, `resolveFetcher`, `createHttpFetcher`, `createBrowserFetcher`, `assertBrowserConfig`, `BROWSER_WAIT_UNTIL`, `BROWSER_BLOCKABLE_RESOURCES`.
|
|
78
147
|
|
|
79
|
-
|
|
148
|
+
### Running multiple crawlers
|
|
149
|
+
Scraply never calls `process.exit`, so several crawlers can share one process. `runCrawlers` accepts config objects or crawler instances and runs them sequentially (or `concurrency` at a time):
|
|
150
|
+
|
|
151
|
+
```js
|
|
152
|
+
import { runCrawlers } from 'scraply';
|
|
153
|
+
|
|
154
|
+
await runCrawlers([mainConfig, academyConfig]); // sequential
|
|
155
|
+
await runCrawlers([a, b, c], { concurrency: 2 }); // two at a time
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
By default each crawler installs a SIGINT/SIGTERM handler for a graceful stop (a second signal forces quit). Set `signals: false` in a config when embedding Scraply so it never touches process signals.
|
|
80
159
|
|
|
81
160
|
## Configuration
|
|
82
|
-
All options are optional except `startUrls`. Durations are milliseconds.
|
|
161
|
+
All options are optional except `startUrls`. Pass a partial object to `scraply()` or `createCrawler()` — it is [deep-merged](src/config/load.js) over the defaults. Durations are in milliseconds.
|
|
162
|
+
|
|
163
|
+
**Full default values and inline comments:** [`src/config/defaults.js`](src/config/defaults.js)
|
|
83
164
|
|
|
84
165
|
```js
|
|
85
|
-
{
|
|
86
|
-
startUrls: ['https://crawler-test.com/'],
|
|
87
|
-
include: [], // URL prefixes or RegExp; defaults to startUrls
|
|
88
|
-
exclude: [/\.(zip|png|js|css|...)$/i],
|
|
89
|
-
allowedContentTypes: ['text/html'],
|
|
90
|
-
fetcher: 'http', // 'http' | 'browser' | Fetcher instance
|
|
91
|
-
logLevel: 'info', // 'silent' | 'error' | 'warn' | 'info' | 'debug'
|
|
92
|
-
|
|
93
|
-
storage: { dir: 'dataset' },
|
|
94
|
-
|
|
95
|
-
request: {
|
|
96
|
-
timeout: 10000,
|
|
97
|
-
maxRedirects: 5,
|
|
98
|
-
maxContentLength: 20 * 1024 * 1024,
|
|
99
|
-
userAgent: 'Mozilla/5.0 (compatible; Scraply/2.0; +https://www.npmjs.com/package/scraply)'
|
|
100
|
-
},
|
|
166
|
+
import { DEFAULT_CONFIG, loadConfig } from 'scraply';
|
|
101
167
|
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
168
|
+
// Inspect or extend the defaults programmatically.
|
|
169
|
+
const config = loadConfig({
|
|
170
|
+
...DEFAULT_CONFIG,
|
|
171
|
+
startUrls: ['https://example.com']
|
|
172
|
+
});
|
|
173
|
+
```
|
|
107
174
|
|
|
108
|
-
|
|
109
|
-
fallbackDelay: 60000,
|
|
110
|
-
exitOnLimit: true,
|
|
111
|
-
exitCode: 10
|
|
112
|
-
},
|
|
175
|
+
Top-level keys: `startUrls`, `include`, `exclude`, `allowedContentTypes`, `sites`, `fetcher`, `browser`, `logLevel`, `signals`, `storage`, `request`, `retry`, `rateLimit`, `crawl`, `extract`, `output`.
|
|
113
176
|
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
delay: 200, // per-host spacing
|
|
117
|
-
maxDepth: Infinity,
|
|
118
|
-
resetOnComplete: true
|
|
119
|
-
},
|
|
177
|
+
### Extending list options
|
|
178
|
+
List fields — `include`, `exclude`, `allowedContentTypes`, `extract.removeSelectors`, `output.exclude` — accept either an array (which **replaces** the default) or a directive object that **combines** with Scraply's defaults:
|
|
120
179
|
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
180
|
+
```js
|
|
181
|
+
extract: {
|
|
182
|
+
// Keep all of Scraply's default removeSelectors AND add your own.
|
|
183
|
+
removeSelectors: { extend: ['.cookie-banner', '#promo'] }
|
|
184
|
+
}
|
|
185
|
+
// Also supported: { prepend: [...] } and { replace: [...] }.
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
### Per-site overrides
|
|
189
|
+
`sites` lets one crawl apply different rules per origin or path. Each entry has a `match` (URL prefix / RegExp, or an array of them) plus the fields to override: **`allowedContentTypes`** and **`extract`** (`root`, `rootFallback`, `json`, `removeSelectors`). The most specific match wins and is merged over the top-level config — so a single crawler can handle several origins with one queue. A site's `extract.removeSelectors` accepts the same `{ extend }` / `{ replace }` directives, resolved against the top-level list.
|
|
124
190
|
|
|
191
|
+
```js
|
|
192
|
+
await scraply({
|
|
193
|
+
startUrls: ['https://example.com', 'https://docs.example.com'],
|
|
194
|
+
include: ['https://example.com', 'https://docs.example.com'],
|
|
125
195
|
output: {
|
|
126
|
-
format: 'json', // 'json' | 'jsonl' | 'lines'
|
|
127
|
-
exclude: [],
|
|
128
196
|
routes: {
|
|
129
|
-
'https://
|
|
197
|
+
'https://example.com': { '*': 'site.json' },
|
|
198
|
+
'https://docs.example.com': { '*': 'docs.json' } // routes are origin-aware in one config
|
|
130
199
|
}
|
|
131
|
-
}
|
|
132
|
-
|
|
200
|
+
},
|
|
201
|
+
sites: [
|
|
202
|
+
{
|
|
203
|
+
match: 'https://docs.example.com',
|
|
204
|
+
allowedContentTypes: ['text/html', 'application/json'],
|
|
205
|
+
extract: { root: 'main', removeSelectors: { extend: ['.sidebar'] } }
|
|
206
|
+
}
|
|
207
|
+
]
|
|
208
|
+
});
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
**Scope:** `sites` overrides only `allowedContentTypes` and `extract`. `request`, `retry`, `crawl`, and `fetcher` are per-instance (one fetcher/pool per crawler), and `storage.dir` is shared (one queue + one `dataset/`). To keep **separate datasets** per origin, run one crawler each via [`runCrawlers`](#running-multiple-crawlers) with different `storage.dir`s.
|
|
212
|
+
|
|
213
|
+
### Sitemap seeding
|
|
214
|
+
Set `crawl.sitemap: true` to seed the crawl from `<origin>/sitemap.xml` for each start URL, or pass an explicit array of sitemap URLs. Sitemap indexes are followed automatically, and discovered URLs still pass through `include` / `exclude`.
|
|
215
|
+
|
|
216
|
+
```js
|
|
217
|
+
await scraply({ startUrls: ['https://example.com'], crawl: { sitemap: true } });
|
|
218
|
+
await scraply({ startUrls: ['https://example.com'], crawl: { sitemap: ['https://example.com/sitemap_index.xml'] } });
|
|
133
219
|
```
|
|
134
220
|
|
|
135
221
|
### Output routing
|
|
136
|
-
`output.routes`
|
|
222
|
+
`output.routes` is a two-level map:
|
|
223
|
+
|
|
224
|
+
1. **Outer keys** — URL prefix (usually `https://origin`, or `https://origin/path`) matched against the full crawled URL.
|
|
225
|
+
2. **Inner keys** — pathname segments joined with `/`, **without a leading slash**, matched from the longest suffix upward. Use `'*'` as fallback within that prefix.
|
|
226
|
+
|
|
227
|
+
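Inner keys are case-sensitive and must match the URL pathname segments exactly, written without a leading slash (e.g. for the path `/Products/sports-watches` use `Products/sports-watches`; `/products/sports-watches` fails on both the leading slash and the letter case).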
|
|
137
228
|
|
|
138
229
|
```js
|
|
139
230
|
output: {
|
|
@@ -142,13 +233,16 @@ output: {
|
|
|
142
233
|
'guide': 'guides.json',
|
|
143
234
|
'*': 'docs.json'
|
|
144
235
|
},
|
|
145
|
-
'https://example.com': { '*': '
|
|
236
|
+
'https://example.com/products/sports-watches': { '*': 'watches.json' }
|
|
146
237
|
}
|
|
147
238
|
}
|
|
148
239
|
```
|
|
149
240
|
|
|
241
|
+
## TypeScript
|
|
242
|
+
Scraply ships type declarations (`src/index.d.ts`), so configuration, hooks, and the crawler instance are fully typed in both TypeScript and JS (via editor IntelliSense) — no `@types` package needed.
|
|
243
|
+
|
|
150
244
|
## GitHub Actions
|
|
151
|
-
Because crawls are persistent
|
|
245
|
+
Because crawls are persistent, Scraply works well on a schedule. Commit the `dataset/` directory between runs, and each scheduled run continues the crawl. To stop a run early on rate limits and resume later, set `rateLimit.exitOnLimit: true` and exit with the thrown `RateLimitError.code`.
|
|
152
246
|
|
|
153
247
|
## Migrating from 1.x
|
|
154
248
|
The configuration is now camelCase and grouped, and the entry point is `src/index.js`.
|
|
@@ -167,4 +261,4 @@ The configuration is now camelCase and grouped, and the entry point is `src/inde
|
|
|
167
261
|
- `DATA_FORMATTER.CATEGORISED_PATHS` -> `output.routes`
|
|
168
262
|
- `DATA_FORMATTER.EXCLUDED_PATTERNS` -> `output.exclude`
|
|
169
263
|
|
|
170
|
-
New in 2.0: `crawl.concurrency`, `crawl.maxDepth`, `crawl.resetOnComplete`, `output.format`, pluggable `fetcher`, and lifecycle hooks. Formatted output is now real JSON by default (1.x wrote `url content` text lines).
|
|
264
|
+
New in 2.0: `crawl.concurrency`, `crawl.maxDepth`, `crawl.resetOnComplete`, `output.format`, `browser.waitUntil`, `browser.blockResources`, pluggable `fetcher`, and lifecycle hooks. Formatted output is now real JSON by default (1.x wrote `url content` text lines).
|
package/src/config/browser.js
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
/**
 * Values accepted for `browser.waitUntil`.
 * @type {readonly ['load', 'domcontentloaded', 'networkidle0', 'networkidle2']}
 */
export const BROWSER_WAIT_UNTIL = Object.freeze(['load', 'domcontentloaded', 'networkidle0', 'networkidle2']);

/** Puppeteer resource types Scraply may block to speed up browser fetches. */
export const BROWSER_BLOCKABLE_RESOURCES = Object.freeze(['image', 'stylesheet', 'font', 'media']);

/** Default blocked types. Stylesheets are excluded — many SPAs need CSS before content renders. */
export const DEFAULT_BROWSER_BLOCK_RESOURCES = Object.freeze(['image', 'font', 'media']);

/**
 * Validates the `browser` config block and throws on the first problem found.
 *
 * Checks run in a fixed order: `waitUntil` must be one of BROWSER_WAIT_UNTIL,
 * `blockResources` must be an array, and every entry of that array must be
 * one of BROWSER_BLOCKABLE_RESOURCES. An empty array is valid.
 *
 * @param {import('../index.js').BrowserConfig} browser
 * @returns {void}
 * @throws {Error} When `waitUntil` or `blockResources` is invalid.
 */
export const assertBrowserConfig = (browser) => {
  const waitUntil = browser?.waitUntil;
  const waitUntilIsKnown = BROWSER_WAIT_UNTIL.includes(waitUntil);
  if (!waitUntilIsKnown) {
    const allowed = BROWSER_WAIT_UNTIL.join(', ');
    throw new Error(`Invalid browser.waitUntil: ${String(waitUntil)}. Expected one of: ${allowed}`);
  }

  const resources = browser?.blockResources;
  if (Array.isArray(resources) === false) {
    throw new Error('Invalid browser.blockResources: expected an array.');
  }

  // findIndex (rather than find) so an offending `undefined` entry is still
  // distinguishable from "nothing wrong".
  const firstBad = resources.findIndex((type) => !BROWSER_BLOCKABLE_RESOURCES.includes(type));
  if (firstBad !== -1) {
    const allowed = BROWSER_BLOCKABLE_RESOURCES.join(', ');
    throw new Error(
      `Invalid browser.blockResources entry: ${String(resources[firstBad])}. Expected one of: ${allowed}`
    );
  }
};
|
package/src/config/defaults.js
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
|
+
import { DEFAULT_BROWSER_BLOCK_RESOURCES } from './browser.js';
|
|
2
|
+
|
|
1
3
|
/**
|
|
2
|
-
* Default Scraply configuration. Every value here can be overridden by the
|
|
3
|
-
* object passed to `createCrawler()` / `scraply()`. Durations are in milliseconds.
|
|
4
|
+
* Default Scraply configuration. Every value here can be overridden by the object passed to `createCrawler()` / `scraply()`. Durations are in milliseconds.
|
|
4
5
|
*
|
|
5
6
|
* @type {import('../index.js').ScraplyConfig}
|
|
6
7
|
*/
|
|
@@ -8,9 +9,7 @@ export const DEFAULT_CONFIG = {
|
|
|
8
9
|
// URLs the crawl is seeded with.
|
|
9
10
|
startUrls: ['https://crawler-test.com/'],
|
|
10
11
|
|
|
11
|
-
// Which discovered links are allowed into the queue. Each entry is either an
|
|
12
|
-
// absolute URL prefix (e.g. 'https://site.com/blog') or a RegExp. Empty means
|
|
13
|
-
// "default to startUrls".
|
|
12
|
+
// Which discovered links are allowed into the queue. Each entry is either an absolute URL prefix (e.g. 'https://site.com/blog') or a RegExp. Empty means "default to startUrls".
|
|
14
13
|
include: [],
|
|
15
14
|
|
|
16
15
|
// Links matching any of these (string prefix or RegExp) are never queued.
|
|
@@ -21,21 +20,41 @@ export const DEFAULT_CONFIG = {
|
|
|
21
20
|
// Only responses whose Content-Type includes one of these are parsed.
|
|
22
21
|
allowedContentTypes: ['text/html'],
|
|
23
22
|
|
|
23
|
+
// Per-origin/route overrides. Each entry: { match, allowedContentTypes?, extract? }.
|
|
24
|
+
// `match` is a URL prefix / RegExp (or an array of them); the most specific
|
|
25
|
+
// match wins and its fields override the top-level config for matching URLs.
|
|
26
|
+
sites: [],
|
|
27
|
+
|
|
24
28
|
// 'http' (native fetch), 'browser' (Puppeteer) or a custom Fetcher instance.
|
|
25
29
|
fetcher: 'http',
|
|
26
30
|
|
|
31
|
+
// Options for the built-in Puppeteer fetcher (`fetcher: 'browser'`).
|
|
32
|
+
browser: {
|
|
33
|
+
// When page.goto considers navigation finished. Use 'networkidle2' for SPAs that inject links/content after load (e.g. Vue/React apps).
|
|
34
|
+
waitUntil: 'load',
|
|
35
|
+
|
|
36
|
+
// Resource types to abort during fetch (speeds up crawls). Stylesheets are omitted by default because many SPAs need CSS before content renders.
|
|
37
|
+
blockResources: [...DEFAULT_BROWSER_BLOCK_RESOURCES]
|
|
38
|
+
},
|
|
39
|
+
|
|
27
40
|
// 'silent' | 'error' | 'warn' | 'info' | 'debug'
|
|
28
41
|
logLevel: 'info',
|
|
29
42
|
|
|
43
|
+
// Install SIGINT/SIGTERM handlers for a graceful stop (first signal finishes
|
|
44
|
+
// in-flight work and flushes; a second forces quit). Set false when embedding
|
|
45
|
+
// Scraply so it never touches process signals.
|
|
46
|
+
signals: true,
|
|
47
|
+
|
|
30
48
|
storage: {
|
|
31
49
|
dir: 'dataset'
|
|
32
50
|
},
|
|
33
51
|
|
|
34
52
|
request: {
|
|
35
|
-
timeout: 10000,
|
|
36
|
-
maxRedirects: 5,
|
|
37
|
-
maxContentLength: 20 * 1024 * 1024,
|
|
38
|
-
userAgent: 'Mozilla/5.0 (compatible; Scraply/2.0; +https://www.npmjs.com/package/scraply)'
|
|
53
|
+
timeout: 10000, // per-request budget (aborts the fetch, including body read)
|
|
54
|
+
maxRedirects: 5, // redirect hops the HTTP fetcher follows before giving up
|
|
55
|
+
maxContentLength: 20 * 1024 * 1024, // hard cap on the response body (bytes); 0 disables it
|
|
56
|
+
userAgent: 'Mozilla/5.0 (compatible; Scraply/2.0; +https://www.npmjs.com/package/scraply)',
|
|
57
|
+
headers: {} // extra request headers (auth, Accept-Language, cookies, ...) sent by every fetcher
|
|
39
58
|
},
|
|
40
59
|
|
|
41
60
|
retry: {
|
|
@@ -46,7 +65,10 @@ export const DEFAULT_CONFIG = {
|
|
|
46
65
|
|
|
47
66
|
rateLimit: {
|
|
48
67
|
fallbackDelay: 60000,
|
|
49
|
-
|
|
68
|
+
// false: wait (honoring retry-after / x-ratelimit-reset) and retry until the
|
|
69
|
+
// host relents. true: abort the crawl with a RateLimitError carrying
|
|
70
|
+
// `exitCode` so a scheduler can resume it later (the queue is persistent).
|
|
71
|
+
exitOnLimit: false,
|
|
50
72
|
exitCode: 10
|
|
51
73
|
},
|
|
52
74
|
|
|
@@ -54,10 +76,24 @@ export const DEFAULT_CONFIG = {
|
|
|
54
76
|
concurrency: 5,
|
|
55
77
|
delay: 200, // minimum spacing between requests to the same host
|
|
56
78
|
maxDepth: Infinity,
|
|
57
|
-
|
|
79
|
+
maxPages: Infinity, // hard cap on successfully crawled pages (counts across resumes)
|
|
80
|
+
resetOnComplete: true,
|
|
81
|
+
retryErrors: false, // re-queue previously errored URLs on resume so they are retried
|
|
82
|
+
retrySkipped: false, // re-queue previously skipped URLs on resume (e.g. after widening allowedContentTypes)
|
|
83
|
+
sitemap: false // true -> seed <origin>/sitemap.xml per start URL; or pass an array of sitemap URLs
|
|
58
84
|
},
|
|
59
85
|
|
|
60
86
|
extract: {
|
|
87
|
+
// Allow-list the container(s) to extract text from: a selector, an array of
|
|
88
|
+
// selectors, or null for the whole <body>. Falls back to `rootFallback`
|
|
89
|
+
// when the selector matches nothing.
|
|
90
|
+
root: null,
|
|
91
|
+
rootFallback: 'body',
|
|
92
|
+
|
|
93
|
+
// true -> JSON responses are parsed and stored as pretty-printed `content`
|
|
94
|
+
// (with the parsed value on `record.data`). false -> store the raw body text.
|
|
95
|
+
json: true,
|
|
96
|
+
|
|
61
97
|
removeSelectors: [
|
|
62
98
|
'script',
|
|
63
99
|
'noscript',
|
package/src/config/load.js
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
import path from 'node:path';
|
|
2
2
|
import { DEFAULT_CONFIG } from './defaults.js';
|
|
3
|
+
import { assertBrowserConfig } from './browser.js';
|
|
4
|
+
import { normalizeUrl } from '../url/normalize.js';
|
|
3
5
|
|
|
4
6
|
const isPlainObject = (value) =>
|
|
5
7
|
value !== null && typeof value === 'object' && !Array.isArray(value) && !(value instanceof RegExp);
|
|
@@ -18,6 +20,27 @@ const deepMerge = (target, source) => {
|
|
|
18
20
|
return merged;
|
|
19
21
|
};
|
|
20
22
|
|
|
23
|
+
/**
 * Resolves a list option that may be given as a plain array or as a
 * directive object, and always returns a fresh array.
 *
 * - Array: replaces the defaults.
 * - `{ replace: [...] }`: replaces the defaults (wins over other directives).
 * - `{ prepend: [...] }`: entries placed before the defaults.
 * - `{ extend: [...] }` / `{ append: [...] }`: entries placed after the
 *   defaults. When both are given, `extend` entries come first, then `append`.
 * - Anything else (undefined, null, primitives): the defaults.
 *
 * The result is always a copy, so mutating a resolved list never leaks into
 * the caller's array or into the shared defaults.
 *
 * @param {unknown} value User-supplied list or directive object.
 * @param {unknown[]} defaults List the directives are combined with. A
 *   non-array value is treated as an empty list.
 * @returns {unknown[]} A new array with the resolved entries.
 */
const resolveList = (value, defaults) => {
  const base = Array.isArray(defaults) ? defaults : [];
  const toList = (candidate) => (Array.isArray(candidate) ? candidate : []);

  if (Array.isArray(value)) return [...value];

  if (value !== null && typeof value === 'object') {
    if (Array.isArray(value.replace)) return [...value.replace];

    // `extend` and `append` are aliases; honor both instead of silently
    // dropping `append` when `extend` is also present.
    return [...toList(value.prepend), ...base, ...toList(value.extend), ...toList(value.append)];
  }

  return [...base];
};
|
|
43
|
+
|
|
21
44
|
/**
|
|
22
45
|
* Merges a user config over the defaults and derives the storage paths.
|
|
23
46
|
* @param {import('../index.js').ScraplyConfig} [userConfig]
|
|
@@ -26,14 +49,47 @@ const deepMerge = (target, source) => {
|
|
|
26
49
|
export const loadConfig = (userConfig = {}) => {
|
|
27
50
|
const config = deepMerge(DEFAULT_CONFIG, userConfig);
|
|
28
51
|
|
|
52
|
+
// List fields accept { extend } / { prepend } / { replace } directives so a
|
|
53
|
+
// user can add to Scraply's defaults instead of replacing them wholesale.
|
|
54
|
+
config.exclude = resolveList(config.exclude, DEFAULT_CONFIG.exclude);
|
|
55
|
+
config.include = resolveList(config.include, []);
|
|
56
|
+
config.allowedContentTypes = resolveList(config.allowedContentTypes, DEFAULT_CONFIG.allowedContentTypes);
|
|
57
|
+
config.extract.removeSelectors = resolveList(config.extract.removeSelectors, DEFAULT_CONFIG.extract.removeSelectors);
|
|
58
|
+
config.output.exclude = resolveList(config.output.exclude, DEFAULT_CONFIG.output.exclude);
|
|
59
|
+
|
|
60
|
+
// Normalize per-site overrides: `match` becomes an array of patterns, and a
|
|
61
|
+
// site's `extract.removeSelectors` honors the same { extend } / { replace }
|
|
62
|
+
// directives — resolved against the (already-resolved) top-level list, so a
|
|
63
|
+
// site can add to the base instead of silently passing an object downstream.
|
|
64
|
+
config.sites = (config.sites ?? []).map((site) => {
|
|
65
|
+
const normalized = {
|
|
66
|
+
...site,
|
|
67
|
+
match: Array.isArray(site.match) ? site.match : [site.match]
|
|
68
|
+
};
|
|
69
|
+
|
|
70
|
+
if (normalized.extract?.removeSelectors !== undefined) {
|
|
71
|
+
normalized.extract = {
|
|
72
|
+
...normalized.extract,
|
|
73
|
+
removeSelectors: resolveList(normalized.extract.removeSelectors, config.extract.removeSelectors)
|
|
74
|
+
};
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
return normalized;
|
|
78
|
+
});
|
|
79
|
+
|
|
29
80
|
const { dir } = config.storage;
|
|
30
81
|
config.storage.queuePath = path.posix.join(dir, 'queue.json');
|
|
31
82
|
config.storage.crawledDir = path.posix.join(dir, 'crawled');
|
|
32
83
|
config.storage.formattedDir = path.posix.join(dir, 'formatted');
|
|
33
84
|
|
|
85
|
+
// When no include rules are given, fall back to the start URLs — normalized so
|
|
86
|
+
// they match the normalized links the crawler actually discovers (forced
|
|
87
|
+
// HTTPS, no "www.", no trailing slash).
|
|
34
88
|
if (!config.include?.length) {
|
|
35
|
-
config.include =
|
|
89
|
+
config.include = config.startUrls.map(normalizeUrl);
|
|
36
90
|
}
|
|
37
91
|
|
|
92
|
+
assertBrowserConfig(config.browser);
|
|
93
|
+
|
|
38
94
|
return config;
|
|
39
95
|
};
|
package/src/core/errors.js
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
/**
 * Thrown when a host rate-limits the crawl (HTTP 429) and
 * `rateLimit.exitOnLimit` is true. Instead of killing the host process, Scraply
 * aborts the current crawl with this error so the caller can decide what to do
 * (e.g. exit with `error.code` from a CLI, or schedule a later resume — the
 * persistent queue means crawling continues where it stopped).
 *
 * Instance fields set by the constructor:
 * - `name`: always `'RateLimitError'`.
 * - `code`: numeric code supplied by the caller (default 10).
 * - `headers`: the headers object supplied by the caller (default `{}`).
 * - `response`: `{ status: 429, headers }`, sharing the same headers object.
 * - `cause`: present only when a cause was supplied.
 */
export class RateLimitError extends Error {
  /**
   * @param {string} [message]
   * @param {{ code?: number, headers?: Record<string, string>, cause?: unknown }} [options]
   */
  constructor(message = 'Rate limited', { code = 10, headers = {}, cause } = {}) {
    // Hand the cause to the native Error constructor so it is installed the
    // standard way (own, non-enumerable). Pass no options at all when there is
    // no cause, so `'cause' in error` stays false in that case.
    super(message, cause === undefined ? undefined : { cause });

    // A destructuring default only covers `undefined`; guard against `null`
    // so `error.response.headers[...]` lookups never hit a null object.
    const safeHeaders = headers ?? {};

    this.name = 'RateLimitError';
    this.code = code;
    this.headers = safeHeaders;
    // Mirror the shape fetchers attach so existing `error.response.status`
    // checks keep working.
    this.response = { status: 429, headers: safeHeaders };
  }
}
|