scraply 2.0.1 → 2.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +7 -2
- package/readme.md +112 -10
- package/src/config/defaults.js +27 -2
- package/src/config/load.js +49 -0
- package/src/core/errors.js +23 -0
- package/src/core/queue.js +29 -11
- package/src/core/retry.js +11 -7
- package/src/crawler.js +215 -56
- package/src/extract/extract.js +17 -3
- package/src/extract/parse.js +35 -0
- package/src/extract/sitemap.js +35 -0
- package/src/index.d.ts +285 -0
- package/src/index.js +37 -6
- package/src/output/writers.js +14 -5
package/package.json
CHANGED
|
@@ -1,11 +1,16 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "scraply",
|
|
3
3
|
"description": "A simple, configurable and functional content scraper",
|
|
4
|
-
"version": "2.0.
|
|
4
|
+
"version": "2.0.2",
|
|
5
5
|
"main": "src/index.js",
|
|
6
|
+
"types": "./src/index.d.ts",
|
|
6
7
|
"type": "module",
|
|
7
8
|
"exports": {
|
|
8
|
-
".":
|
|
9
|
+
".": {
|
|
10
|
+
"types": "./src/index.d.ts",
|
|
11
|
+
"import": "./src/index.js",
|
|
12
|
+
"default": "./src/index.js"
|
|
13
|
+
}
|
|
9
14
|
},
|
|
10
15
|
"files": [
|
|
11
16
|
"src"
|
package/readme.md
CHANGED
|
@@ -32,21 +32,52 @@ await scraply({
|
|
|
32
32
|
This crawls `example.com`, extracts the readable text of every allowed page, and writes the results to `dataset/formatted/example.json`.
|
|
33
33
|
|
|
34
34
|
## How Scraply works
|
|
35
|
-
1. The crawl is seeded from `startUrls
|
|
35
|
+
1. The crawl is seeded from `startUrls` (and optionally a [sitemap](#sitemap-seeding)).
|
|
36
36
|
2. Each page is fetched, its links are discovered and filtered (`include` / `exclude`), and new links are queued.
|
|
37
|
-
3. The
|
|
37
|
+
3. The body is processed [based on its `Content-Type`](#content-type-aware-extraction) and saved under `dataset/crawled/` as `{ url, content, crawledAt, hash }` (`crawledAt` is an ISO timestamp; `hash` is the SHA-256 of `content`, handy for change detection). JSON responses additionally carry the parsed value on `data`.
|
|
38
38
|
4. When the queue drains, all crawled pages are routed by URL into the files defined in `output.routes` and written to `dataset/formatted/`.
|
|
39
39
|
|
|
40
40
|
Each queue entry ends in one of three terminal states: **crawled** (saved), **skipped** (disallowed `Content-Type`), or **error** (fetch failed). The three are tracked separately so stats stay meaningful.
|
|
41
41
|
|
|
42
|
+
### Content-type-aware extraction
|
|
43
|
+
The body is handled according to its `Content-Type`:
|
|
44
|
+
|
|
45
|
+
- **HTML** — links are discovered, then readable text is extracted. Use `extract.root` to allow-list the container(s) to read from (e.g. `'main'`), and `extract.removeSelectors` to strip noise. When `root` matches nothing it falls back to `extract.rootFallback` (default `<body>`).
|
|
46
|
+
- **JSON** — parsed and stored as pretty-printed `content`, with the parsed value also exposed on `record.data` (set `extract.json: false` to keep the raw text instead). The `extract` hook still runs (with `$` as `null`).
|
|
47
|
+
- **Other text** — stored as-is.
|
|
48
|
+
|
|
49
|
+
Only responses whose `Content-Type` matches `allowedContentTypes` are processed; everything else is skipped (and fires the `skip` hook).
|
|
50
|
+
|
|
51
|
+
```js
|
|
52
|
+
await scraply({
|
|
53
|
+
startUrls: ['https://docs.example.com'],
|
|
54
|
+
allowedContentTypes: ['text/html', 'application/json'],
|
|
55
|
+
extract: { root: 'main' }
|
|
56
|
+
});
|
|
57
|
+
```
|
|
58
|
+
|
|
42
59
|
### Persistence and resuming
|
|
43
|
-
The queue and crawled pages are checkpointed to disk in `dataset/`. If a run is interrupted (or rate-limited), progress is saved and the next run resumes exactly where it left off without re-crawling finished URLs. When every URL has been processed, Scraply starts a fresh crawl (set `crawl.resetOnComplete: false` to keep the finished queue instead).
|
|
60
|
+
The queue and crawled pages are checkpointed to disk in `dataset/`. If a run is interrupted (or rate-limited), progress is saved and the next run resumes exactly where it left off without re-crawling finished URLs. When every URL has been processed, Scraply starts a fresh crawl (set `crawl.resetOnComplete: false` to keep the finished queue instead).
|
|
61
|
+
|
|
62
|
+
- Re-attempt **failed** URLs on the next run with `crawl.retryErrors: true` (or call `requeueErrors()` and crawl again).
|
|
63
|
+
- Re-attempt **skipped** URLs with `crawl.retrySkipped: true` (or `requeueSkipped()`) — handy after widening `allowedContentTypes` or changing `sites`.
|
|
44
64
|
|
|
45
65
|
### Concurrency and limits
|
|
46
66
|
Pages are crawled with a worker pool (`crawl.concurrency`). Requests to the same host are spaced by `crawl.delay` for politeness, while different hosts run in parallel. `crawl.maxDepth` bounds link depth and `crawl.maxPages` caps the total number of successfully crawled pages (counted across resumes).
|
|
47
67
|
|
|
48
68
|
### Rate limiting
|
|
49
|
-
On HTTP `429`, Scraply
|
|
69
|
+
On HTTP `429`, Scraply waits (honoring `retry-after` / `x-ratelimit-reset`) and retries — independently of the normal `retry` budget. This is the default (`rateLimit.exitOnLimit: false`). Set `rateLimit.exitOnLimit: true` to instead abort the crawl by throwing a `RateLimitError` (carrying `error.code = rateLimit.exitCode`); because the queue is persistent, a later run resumes where it stopped. Scraply never calls `process.exit` on your behalf — catch the error and decide what to do (e.g. `process.exit(error.code)` from a CLI).
|
|
70
|
+
|
|
71
|
+
```js
|
|
72
|
+
import { scraply, RateLimitError } from 'scraply';
|
|
73
|
+
|
|
74
|
+
try {
|
|
75
|
+
await scraply({ startUrls: ['https://example.com'], rateLimit: { exitOnLimit: true } });
|
|
76
|
+
} catch (error) {
|
|
77
|
+
if (error instanceof RateLimitError) process.exit(error.code);
|
|
78
|
+
throw error;
|
|
79
|
+
}
|
|
80
|
+
```
|
|
50
81
|
|
|
51
82
|
## Fetchers
|
|
52
83
|
`fetcher` selects the backend:
|
|
@@ -88,19 +119,43 @@ crawler.on('page', (record) => console.log('crawled', record.url));
|
|
|
88
119
|
// Veto links before they are queued.
|
|
89
120
|
crawler.on('shouldEnqueue', (allow, url) => allow && !url.includes('/admin'));
|
|
90
121
|
|
|
91
|
-
// Transform the stored record.
|
|
122
|
+
// Transform the stored record (runs before it is persisted, so changes are saved).
|
|
92
123
|
crawler.on('transform', (record) => ({ ...record, length: record.content.length }));
|
|
93
124
|
|
|
94
125
|
await crawler.run();
|
|
95
126
|
```
|
|
96
127
|
|
|
97
|
-
Instance methods: `run()`, `crawl()`, `fetch(url)`, `extract(html, url)`, `enqueue(urls, opts)`, `format(records?)`, `requeueErrors()`, `stop()`, `on(event, fn)`.
|
|
128
|
+
Instance methods: `run()`, `crawl()`, `fetch(url)`, `extract(html, url)`, `enqueue(urls, opts)`, `format(records?)`, `requeueErrors()`, `requeueSkipped()`, `stop()`, `on(event, fn)`.
|
|
98
129
|
|
|
99
130
|
`format()` reads crawled pages from `dataset/crawled/` via the persisted queue. You can call it alone to re-route output after a crawl — no need to fetch pages again.
|
|
100
131
|
|
|
101
|
-
Hooks
|
|
132
|
+
### Hooks
|
|
133
|
+
Register with `crawler.on(name, fn)` (returns an unsubscribe function). Async handlers are awaited. **Reduce** hooks may return a replacement value; **emit** hooks are side-effect only.
|
|
102
134
|
|
|
103
|
-
|
|
135
|
+
| Hook | Type | Arguments | Notes |
|
|
136
|
+
|---|---|---|---|
|
|
137
|
+
| `response` | emit | `(result, entry)` | Raw `{ data, status, headers }`, before the content-type gate. |
|
|
138
|
+
| `skip` | emit | `(entry, { reason, status, result })` | A response was skipped (e.g. disallowed `Content-Type`). |
|
|
139
|
+
| `shouldEnqueue` | reduce | `(allow, url, referrer)` | Return `false` to veto a URL. |
|
|
140
|
+
| `links` | reduce | `(links, $, entry, result)` | Add/replace discovered links before they are enqueued. `$` is `null` for non-HTML — useful to pull URLs out of a JSON API response. |
|
|
141
|
+
| `extract` | reduce | `(content, $, entry, result)` | Replace the extracted content. `$` is `null` for non-HTML. `result` gives raw access to the body/headers. |
|
|
142
|
+
| `transform` | reduce | `(record, entry, result)` | Replace the record **before** it is saved and formatted. |
|
|
143
|
+
| `page` | emit | `(record, entry, result)` | Fires after the record is persisted. |
|
|
144
|
+
| `error` | emit | `(error, entry)` | A fetch/process failed. |
|
|
145
|
+
|
|
146
|
+
Standalone exports for advanced use: `runCrawlers`, `RateLimitError`, `normalizeUrl`, `matchesPattern`, `matchesAnyPattern`, `extractText`, `discoverLinks`, `classifyContentType`, `parseJson`, `parseSitemap`, `routeRecord`, `writeRecords`, `formatRecords`, `loadConfig`, `DEFAULT_CONFIG`, `resolveFetcher`, `createHttpFetcher`, `createBrowserFetcher`, `assertBrowserConfig`, `BROWSER_WAIT_UNTIL`, `BROWSER_BLOCKABLE_RESOURCES`.
|
|
147
|
+
|
|
148
|
+
### Running multiple crawlers
|
|
149
|
+
Scraply never calls `process.exit`, so several crawlers can share one process. `runCrawlers` accepts config objects or crawler instances and runs them sequentially (or `concurrency` at a time):
|
|
150
|
+
|
|
151
|
+
```js
|
|
152
|
+
import { runCrawlers } from 'scraply';
|
|
153
|
+
|
|
154
|
+
await runCrawlers([mainConfig, academyConfig]); // sequential
|
|
155
|
+
await runCrawlers([a, b, c], { concurrency: 2 }); // two at a time
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
By default each crawler installs a SIGINT/SIGTERM handler for a graceful stop (a second signal forces quit). Set `signals: false` in a config when embedding Scraply so it never touches process signals.
|
|
104
159
|
|
|
105
160
|
## Configuration
|
|
106
161
|
All options are optional except `startUrls`. Pass a partial object to `scraply()` or `createCrawler()` — it is [deep-merged](src/config/load.js) over the defaults. Durations are in milliseconds.
|
|
@@ -117,7 +172,51 @@ const config = loadConfig({
|
|
|
117
172
|
});
|
|
118
173
|
```
|
|
119
174
|
|
|
120
|
-
Top-level keys: `startUrls`, `include`, `exclude`, `allowedContentTypes`, `fetcher`, `browser`, `logLevel`, `storage`, `request`, `retry`, `rateLimit`, `crawl`, `extract`, `output`.
|
|
175
|
+
Top-level keys: `startUrls`, `include`, `exclude`, `allowedContentTypes`, `sites`, `fetcher`, `browser`, `logLevel`, `signals`, `storage`, `request`, `retry`, `rateLimit`, `crawl`, `extract`, `output`.
|
|
176
|
+
|
|
177
|
+
### Extending list options
|
|
178
|
+
List fields — `include`, `exclude`, `allowedContentTypes`, `extract.removeSelectors`, `output.exclude` — accept either an array (which **replaces** the default) or a directive object that **combines** with Scraply's defaults:
|
|
179
|
+
|
|
180
|
+
```js
|
|
181
|
+
extract: {
|
|
182
|
+
// Keep all of Scraply's default removeSelectors AND add your own.
|
|
183
|
+
removeSelectors: { extend: ['.cookie-banner', '#promo'] }
|
|
184
|
+
}
|
|
185
|
+
// Also supported: { prepend: [...] } and { replace: [...] }.
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
### Per-site overrides
|
|
189
|
+
`sites` lets one crawl apply different rules per origin or path. Each entry has a `match` (URL prefix / RegExp, or an array of them) plus the fields to override: **`allowedContentTypes`** and **`extract`** (`root`, `rootFallback`, `json`, `removeSelectors`). The most specific match wins and is merged over the top-level config — so a single crawler can handle several origins with one queue. A site's `extract.removeSelectors` accepts the same `{ extend }` / `{ replace }` directives, resolved against the top-level list.
|
|
190
|
+
|
|
191
|
+
```js
|
|
192
|
+
await scraply({
|
|
193
|
+
startUrls: ['https://example.com', 'https://docs.example.com'],
|
|
194
|
+
include: ['https://example.com', 'https://docs.example.com'],
|
|
195
|
+
output: {
|
|
196
|
+
routes: {
|
|
197
|
+
'https://example.com': { '*': 'site.json' },
|
|
198
|
+
'https://docs.example.com': { '*': 'docs.json' } // routes are origin-aware in one config
|
|
199
|
+
}
|
|
200
|
+
},
|
|
201
|
+
sites: [
|
|
202
|
+
{
|
|
203
|
+
match: 'https://docs.example.com',
|
|
204
|
+
allowedContentTypes: ['text/html', 'application/json'],
|
|
205
|
+
extract: { root: 'main', removeSelectors: { extend: ['.sidebar'] } }
|
|
206
|
+
}
|
|
207
|
+
]
|
|
208
|
+
});
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
**Scope:** `sites` overrides only `allowedContentTypes` and `extract`. `request`, `retry`, `crawl`, and `fetcher` are per-instance (one fetcher/pool per crawler), and `storage.dir` is shared (one queue + one `dataset/`). To keep **separate datasets** per origin, run one crawler each via [`runCrawlers`](#running-multiple-crawlers) with different `storage.dir`s.
|
|
212
|
+
|
|
213
|
+
### Sitemap seeding
|
|
214
|
+
Set `crawl.sitemap: true` to seed the crawl from `<origin>/sitemap.xml` for each start URL, or pass an explicit array of sitemap URLs. Sitemap indexes are followed automatically, and discovered URLs still pass through `include` / `exclude`.
|
|
215
|
+
|
|
216
|
+
```js
|
|
217
|
+
await scraply({ startUrls: ['https://example.com'], crawl: { sitemap: true } });
|
|
218
|
+
await scraply({ startUrls: ['https://example.com'], crawl: { sitemap: ['https://example.com/sitemap_index.xml'] } });
|
|
219
|
+
```
|
|
121
220
|
|
|
122
221
|
### Output routing
|
|
123
222
|
`output.routes` is a two-level map:
|
|
@@ -139,8 +238,11 @@ output: {
|
|
|
139
238
|
}
|
|
140
239
|
```
|
|
141
240
|
|
|
241
|
+
## TypeScript
|
|
242
|
+
Scraply ships type declarations (`src/index.d.ts`), so configuration, hooks, and the crawler instance are fully typed in both TypeScript and JS (via editor IntelliSense) — no `@types` package needed.
|
|
243
|
+
|
|
142
244
|
## GitHub Actions
|
|
143
|
-
Because crawls are persistent
|
|
245
|
+
Because crawls are persistent, Scraply works well on a schedule. Commit the `dataset/` directory between runs, and each scheduled run continues the crawl. To stop a run early on rate limits and resume later, set `rateLimit.exitOnLimit: true` and exit the process with the `code` of the thrown `RateLimitError`.
|
|
144
246
|
|
|
145
247
|
## Migrating from 1.x
|
|
146
248
|
The configuration is now camelCase and grouped, and the entry point is `src/index.js`.
|
package/src/config/defaults.js
CHANGED
|
@@ -20,6 +20,11 @@ export const DEFAULT_CONFIG = {
|
|
|
20
20
|
// Only responses whose Content-Type includes one of these are parsed.
|
|
21
21
|
allowedContentTypes: ['text/html'],
|
|
22
22
|
|
|
23
|
+
// Per-origin/route overrides. Each entry: { match, allowedContentTypes?, extract? }.
|
|
24
|
+
// `match` is a URL prefix / RegExp (or an array of them); the most specific
|
|
25
|
+
// match wins and its fields override the top-level config for matching URLs.
|
|
26
|
+
sites: [],
|
|
27
|
+
|
|
23
28
|
// 'http' (native fetch), 'browser' (Puppeteer) or a custom Fetcher instance.
|
|
24
29
|
fetcher: 'http',
|
|
25
30
|
|
|
@@ -35,6 +40,11 @@ export const DEFAULT_CONFIG = {
|
|
|
35
40
|
// 'silent' | 'error' | 'warn' | 'info' | 'debug'
|
|
36
41
|
logLevel: 'info',
|
|
37
42
|
|
|
43
|
+
// Install SIGINT/SIGTERM handlers for a graceful stop (first signal finishes
|
|
44
|
+
// in-flight work and flushes; a second forces quit). Set false when embedding
|
|
45
|
+
// Scraply so it never touches process signals.
|
|
46
|
+
signals: true,
|
|
47
|
+
|
|
38
48
|
storage: {
|
|
39
49
|
dir: 'dataset'
|
|
40
50
|
},
|
|
@@ -55,7 +65,10 @@ export const DEFAULT_CONFIG = {
|
|
|
55
65
|
|
|
56
66
|
rateLimit: {
|
|
57
67
|
fallbackDelay: 60000,
|
|
58
|
-
|
|
68
|
+
// false: wait (honoring retry-after / x-ratelimit-reset) and retry until the
|
|
69
|
+
// host relents. true: abort the crawl with a RateLimitError carrying
|
|
70
|
+
// `exitCode` so a scheduler can resume it later (the queue is persistent).
|
|
71
|
+
exitOnLimit: false,
|
|
59
72
|
exitCode: 10
|
|
60
73
|
},
|
|
61
74
|
|
|
@@ -65,10 +78,22 @@ export const DEFAULT_CONFIG = {
|
|
|
65
78
|
maxDepth: Infinity,
|
|
66
79
|
maxPages: Infinity, // hard cap on successfully crawled pages (counts across resumes)
|
|
67
80
|
resetOnComplete: true,
|
|
68
|
-
retryErrors: false // re-queue previously errored URLs on resume so they are retried
|
|
81
|
+
retryErrors: false, // re-queue previously errored URLs on resume so they are retried
|
|
82
|
+
retrySkipped: false, // re-queue previously skipped URLs on resume (e.g. after widening allowedContentTypes)
|
|
83
|
+
sitemap: false // true -> seed <origin>/sitemap.xml per start URL; or pass an array of sitemap URLs
|
|
69
84
|
},
|
|
70
85
|
|
|
71
86
|
extract: {
|
|
87
|
+
// Allow-list the container(s) to extract text from: a selector, an array of
|
|
88
|
+
// selectors, or null for the whole <body>. Falls back to `rootFallback`
|
|
89
|
+
// when the selector matches nothing.
|
|
90
|
+
root: null,
|
|
91
|
+
rootFallback: 'body',
|
|
92
|
+
|
|
93
|
+
// true -> JSON responses are parsed and stored as pretty-printed `content`
|
|
94
|
+
// (with the parsed value on `record.data`). false -> store the raw body text.
|
|
95
|
+
json: true,
|
|
96
|
+
|
|
72
97
|
removeSelectors: [
|
|
73
98
|
'script',
|
|
74
99
|
'noscript',
|
package/src/config/load.js
CHANGED
|
@@ -20,6 +20,27 @@ const deepMerge = (target, source) => {
|
|
|
20
20
|
return merged;
|
|
21
21
|
};
|
|
22
22
|
|
|
23
|
+
/**
 * Resolves a list option that may be given as a plain array or as a directive
 * object, combining directives with `defaults` so users can add to a built-in
 * list (e.g. `removeSelectors`) without losing Scraply's entries.
 *
 * - `[...]` or `{ replace: [...] }` -> used as-is; `defaults` is ignored.
 * - `{ prepend: [...] }` -> entries placed before `defaults`.
 * - `{ extend: [...] }` / `{ append: [...] }` -> entries placed after
 *   `defaults`; when both keys are present both are kept (`extend` first).
 * - anything else (`undefined`, `null`, a primitive) -> a copy of `defaults`.
 *
 * @param {unknown} value - user-supplied list or directive object
 * @param {unknown[]} [defaults] - base list the directives are combined with
 * @returns {unknown[]} the resolved list; never the `defaults` array itself
 */
const resolveList = (value, defaults = []) => {
  if (Array.isArray(value)) return value;

  // Tolerate a missing/non-array base so the spreads below cannot throw.
  const base = Array.isArray(defaults) ? defaults : [];

  // Hand back a fresh array rather than `defaults` itself, so a later mutation
  // of the resolved list cannot leak into the shared base list.
  if (value === null || typeof value !== 'object') return [...base];

  if (Array.isArray(value.replace)) return value.replace;

  // A directive key that is absent or not an array contributes nothing.
  const pick = (list) => (Array.isArray(list) ? list : []);

  return [...pick(value.prepend), ...base, ...pick(value.extend), ...pick(value.append)];
};
|
|
43
|
+
|
|
23
44
|
/**
|
|
24
45
|
* Merges a user config over the defaults and derives the storage paths.
|
|
25
46
|
* @param {import('../index.js').ScraplyConfig} [userConfig]
|
|
@@ -28,6 +49,34 @@ const deepMerge = (target, source) => {
|
|
|
28
49
|
export const loadConfig = (userConfig = {}) => {
|
|
29
50
|
const config = deepMerge(DEFAULT_CONFIG, userConfig);
|
|
30
51
|
|
|
52
|
+
// List fields accept { extend } / { prepend } / { replace } directives so a
|
|
53
|
+
// user can add to Scraply's defaults instead of replacing them wholesale.
|
|
54
|
+
config.exclude = resolveList(config.exclude, DEFAULT_CONFIG.exclude);
|
|
55
|
+
config.include = resolveList(config.include, []);
|
|
56
|
+
config.allowedContentTypes = resolveList(config.allowedContentTypes, DEFAULT_CONFIG.allowedContentTypes);
|
|
57
|
+
config.extract.removeSelectors = resolveList(config.extract.removeSelectors, DEFAULT_CONFIG.extract.removeSelectors);
|
|
58
|
+
config.output.exclude = resolveList(config.output.exclude, DEFAULT_CONFIG.output.exclude);
|
|
59
|
+
|
|
60
|
+
// Normalize per-site overrides: `match` becomes an array of patterns, and a
|
|
61
|
+
// site's `extract.removeSelectors` honors the same { extend } / { replace }
|
|
62
|
+
// directives — resolved against the (already-resolved) top-level list, so a
|
|
63
|
+
// site can add to the base instead of silently passing an object downstream.
|
|
64
|
+
config.sites = (config.sites ?? []).map((site) => {
|
|
65
|
+
const normalized = {
|
|
66
|
+
...site,
|
|
67
|
+
match: Array.isArray(site.match) ? site.match : [site.match]
|
|
68
|
+
};
|
|
69
|
+
|
|
70
|
+
if (normalized.extract?.removeSelectors !== undefined) {
|
|
71
|
+
normalized.extract = {
|
|
72
|
+
...normalized.extract,
|
|
73
|
+
removeSelectors: resolveList(normalized.extract.removeSelectors, config.extract.removeSelectors)
|
|
74
|
+
};
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
return normalized;
|
|
78
|
+
});
|
|
79
|
+
|
|
31
80
|
const { dir } = config.storage;
|
|
32
81
|
config.storage.queuePath = path.posix.join(dir, 'queue.json');
|
|
33
82
|
config.storage.crawledDir = path.posix.join(dir, 'crawled');
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
/**
 * Thrown when a host rate-limits the crawl (HTTP 429) and
 * `rateLimit.exitOnLimit` is true. Instead of killing the host process, Scraply
 * aborts the current crawl with this error so the caller can decide what to do
 * (e.g. exit with `error.code` from a CLI, or schedule a later resume — the
 * persistent queue means crawling continues where it stopped).
 */
export class RateLimitError extends Error {
  /**
   * @param {string} [message] - human-readable description
   * @param {{ code?: number, headers?: Record<string, string> | null, cause?: unknown }} [options]
   *   `code` is exposed as `error.code` (default 10), `headers` are the
   *   rate-limit response headers, and `cause` is the underlying error, if any.
   */
  constructor(message = 'Rate limited', { code = 10, headers = {}, cause } = {}) {
    // Forward `cause` through the standard Error options bag. Pass nothing when
    // no cause was supplied so the instance gets no `cause` property at all.
    super(message, cause === undefined ? undefined : { cause });
    this.name = 'RateLimitError';
    this.code = code;
    // The destructuring default only covers `undefined`; normalize an explicit
    // `null` as well so header lookups on the error never hit a null object.
    this.headers = headers ?? {};
    // Mirror the shape fetchers attach so existing `error.response.status`
    // checks keep working.
    this.response = { status: 429, headers: this.headers };
  }
}
|
package/src/core/queue.js
CHANGED
|
@@ -112,26 +112,44 @@ export class QueueManager {
|
|
|
112
112
|
}
|
|
113
113
|
|
|
114
114
|
/**
|
|
115
|
-
*
|
|
116
|
-
*
|
|
117
|
-
*
|
|
115
|
+
* Returns matching terminal entries to the pending set so the next crawl
|
|
116
|
+
* retries them. Persists immediately so a fresh `load()` (e.g. at the start of
|
|
117
|
+
* `crawl()`) sees the requeued entries.
|
|
118
|
+
* @param {(entry: QueueEntry) => boolean} match
|
|
118
119
|
* @returns {number} how many entries were requeued
|
|
119
120
|
*/
|
|
120
|
-
|
|
121
|
+
_requeue(match) {
|
|
121
122
|
let count = 0;
|
|
122
123
|
for (const entry of this.entries) {
|
|
123
|
-
if (entry
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
124
|
+
if (!match(entry)) continue;
|
|
125
|
+
|
|
126
|
+
if (entry.error !== null) this._errors -= 1;
|
|
127
|
+
if (entry.skipped !== null) this._skipped -= 1;
|
|
128
|
+
|
|
129
|
+
entry.error = null;
|
|
130
|
+
entry.skipped = null;
|
|
131
|
+
entry.status = null;
|
|
132
|
+
this._pending.push(entry);
|
|
133
|
+
count += 1;
|
|
130
134
|
}
|
|
131
135
|
if (count > 0) this.flush();
|
|
132
136
|
return count;
|
|
133
137
|
}
|
|
134
138
|
|
|
139
|
+
/** Re-queues every errored entry for retry. @returns {number} */
|
|
140
|
+
requeueErrors() {
|
|
141
|
+
return this._requeue((entry) => entry.error !== null);
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
/**
|
|
145
|
+
* Re-queues every skipped entry for another attempt. Useful after widening
|
|
146
|
+
* `allowedContentTypes` (or changing `sites`) so previously skipped URLs are
|
|
147
|
+
* reconsidered. @returns {number}
|
|
148
|
+
*/
|
|
149
|
+
requeueSkipped() {
|
|
150
|
+
return this._requeue((entry) => entry.skipped !== null);
|
|
151
|
+
}
|
|
152
|
+
|
|
135
153
|
isAllProcessed() {
|
|
136
154
|
return this.entries.length > 0 && this.pendingCount() === 0;
|
|
137
155
|
}
|
package/src/core/retry.js
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { delay } from '../util/delay.js';
|
|
2
|
+
import { RateLimitError } from './errors.js';
|
|
2
3
|
|
|
3
4
|
/** Derives how long to wait (ms) from rate-limit headers, falling back to a default. */
|
|
4
5
|
const computeWait = (headers = {}, fallback) => {
|
|
@@ -24,12 +25,12 @@ const computeWait = (headers = {}, fallback) => {
|
|
|
24
25
|
*
|
|
25
26
|
* Rate limiting (HTTP 429) is handled independently of the normal retry budget:
|
|
26
27
|
* when `rateLimit.exitOnLimit` is false the runner waits (honoring `retry-after`
|
|
27
|
-
* / `x-ratelimit-reset`) and retries until the host relents; otherwise it
|
|
28
|
-
*
|
|
28
|
+
* / `x-ratelimit-reset`) and retries until the host relents; otherwise it throws
|
|
29
|
+
* a `RateLimitError` so the crawl aborts cleanly and can be resumed later.
|
|
29
30
|
*
|
|
30
|
-
* @param {{ config: import('../index.js').ResolvedConfig, logger: any
|
|
31
|
+
* @param {{ config: import('../index.js').ResolvedConfig, logger: any }} deps
|
|
31
32
|
*/
|
|
32
|
-
export const createRetryRunner = ({ config, logger
|
|
33
|
+
export const createRetryRunner = ({ config, logger }) => {
|
|
33
34
|
const { retry, rateLimit } = config;
|
|
34
35
|
|
|
35
36
|
const run = async (fn) => {
|
|
@@ -43,9 +44,12 @@ export const createRetryRunner = ({ config, logger, onRateLimitExit }) => {
|
|
|
43
44
|
|
|
44
45
|
if (status === 429) {
|
|
45
46
|
if (rateLimit.exitOnLimit) {
|
|
46
|
-
logger.warn(`
|
|
47
|
-
|
|
48
|
-
|
|
47
|
+
logger.warn(`Rate limited. Aborting crawl (exitOnLimit) with code ${rateLimit.exitCode}.`);
|
|
48
|
+
throw new RateLimitError('Rate limited', {
|
|
49
|
+
code: rateLimit.exitCode,
|
|
50
|
+
headers: error.response.headers,
|
|
51
|
+
cause: error
|
|
52
|
+
});
|
|
49
53
|
}
|
|
50
54
|
|
|
51
55
|
const wait = computeWait(error.response.headers, rateLimit.fallbackDelay);
|