scraply 2.0.0 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -2
- package/readme.md +50 -58
- package/src/config/browser.js +37 -0
- package/src/config/defaults.js +21 -10
- package/src/config/load.js +8 -1
- package/src/core/queue.js +65 -11
- package/src/core/retry.js +28 -24
- package/src/crawler.js +75 -45
- package/src/extract/links.js +4 -4
- package/src/fetchers/browserFetcher.js +18 -12
- package/src/fetchers/httpFetcher.js +40 -3
- package/src/index.js +11 -1
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "scraply",
|
|
3
3
|
"description": "A simple, configurable and functional content scraper",
|
|
4
|
-
"version": "2.0.0",
|
|
4
|
+
"version": "2.0.1",
|
|
5
5
|
"main": "src/index.js",
|
|
6
6
|
"type": "module",
|
|
7
7
|
"exports": {
|
|
@@ -14,7 +14,6 @@
|
|
|
14
14
|
"node": ">=18"
|
|
15
15
|
},
|
|
16
16
|
"scripts": {
|
|
17
|
-
"start": "node .",
|
|
18
17
|
"dev": "node src/dev.js"
|
|
19
18
|
},
|
|
20
19
|
"keywords": [
|
package/readme.md
CHANGED
|
@@ -34,24 +34,46 @@ This crawls `example.com`, extracts the readable text of every allowed page, and
|
|
|
34
34
|
## How Scraply works
|
|
35
35
|
1. The crawl is seeded from `startUrls`.
|
|
36
36
|
2. Each page is fetched, its links are discovered and filtered (`include` / `exclude`), and new links are queued.
|
|
37
|
-
3. The page text is extracted (configurable element removal) and saved under `dataset/crawled
|
|
37
|
+
3. The page text is extracted (configurable element removal) and saved under `dataset/crawled/` as `{ url, content, crawledAt, hash }` (`crawledAt` is an ISO timestamp; `hash` is the SHA-256 of `content`, handy for change detection).
|
|
38
38
|
4. When the queue drains, all crawled pages are routed by URL into the files defined in `output.routes` and written to `dataset/formatted/`.
|
|
39
39
|
|
|
40
|
+
Each queue entry ends in one of three terminal states: **crawled** (saved), **skipped** (disallowed `Content-Type`), or **error** (fetch failed). The three are tracked separately so stats stay meaningful.
|
|
41
|
+
|
|
40
42
|
### Persistence and resuming
|
|
41
|
-
The queue and crawled pages are checkpointed to disk in `dataset/`. If a run is interrupted (or rate-limited), progress is saved and the next run resumes exactly where it left off without re-crawling finished URLs. When every URL has been processed, Scraply starts a fresh crawl (set `crawl.resetOnComplete: false` to keep the finished queue instead).
|
|
43
|
+
The queue and crawled pages are checkpointed to disk in `dataset/`. If a run is interrupted (or rate-limited), progress is saved and the next run resumes exactly where it left off without re-crawling finished URLs. When every URL has been processed, Scraply starts a fresh crawl (set `crawl.resetOnComplete: false` to keep the finished queue instead). To re-attempt failed URLs on the next run, set `crawl.retryErrors: true` (or call `requeueErrors()` and crawl again).
|
|
42
44
|
|
|
43
|
-
### Concurrency and
|
|
44
|
-
Pages are crawled with a worker pool (`crawl.concurrency`). Requests to the same host are spaced by `crawl.delay` for politeness, while different hosts run in parallel.
|
|
45
|
+
### Concurrency and limits
|
|
46
|
+
Pages are crawled with a worker pool (`crawl.concurrency`). Requests to the same host are spaced by `crawl.delay` for politeness, while different hosts run in parallel. `crawl.maxDepth` bounds link depth and `crawl.maxPages` caps the total number of successfully crawled pages (counted across resumes).
|
|
45
47
|
|
|
46
48
|
### Rate limiting
|
|
47
|
-
On HTTP `429`, Scraply either exits immediately with `rateLimit.exitCode` (default) so a scheduler can retry later, or waits (honoring `retry-after` / `x-ratelimit-reset`) and
|
|
49
|
+
On HTTP `429`, Scraply either exits immediately with `rateLimit.exitCode` (default) so a scheduler can retry later, or waits (honoring `retry-after` / `x-ratelimit-reset`) and retries — independently of the normal `retry` budget — when `rateLimit.exitOnLimit` is `false`.
|
|
48
50
|
|
|
49
51
|
## Fetchers
|
|
50
52
|
`fetcher` selects the backend:
|
|
51
|
-
- `'http'` (default): fast static fetching with the native `fetch`.
|
|
53
|
+
- `'http'` (default): fast static fetching with the native `fetch`. Redirects are followed up to `request.maxRedirects`, and response bodies larger than `request.maxContentLength` (default 20 MB, `0` disables) are rejected before they are buffered.
|
|
52
54
|
- `'browser'`: full JavaScript rendering via Puppeteer (`puppeteer-cluster`).
|
|
53
55
|
- a custom object implementing the `Fetcher` interface (`{ name, fetch, init?, close? }`), so backends like Playwright or a remote CDP browser can be plugged in without changing the crawler.
|
|
54
56
|
|
|
57
|
+
Both built-in fetchers send `request.userAgent` and any extra `request.headers` (e.g. `Authorization`, `Accept-Language`, `Cookie`) with every request.
|
|
58
|
+
|
|
59
|
+
### Browser fetcher options
|
|
60
|
+
The `browser` block applies only when `fetcher: 'browser'`. Both options are validated at config load time. See [`src/config/defaults.js`](src/config/defaults.js) for defaults.
|
|
61
|
+
|
|
62
|
+
- **`browser.waitUntil`** — passed to Puppeteer `page.goto`. Default `'load'`. Use `'networkidle2'` for SPAs that inject links or content after the initial load (Vue/React sites). Increase `request.timeout` when using slower modes.
|
|
63
|
+
- **`browser.blockResources`** — Puppeteer resource types to abort during fetch (`'image'`, `'stylesheet'`, `'font'`, `'media'`). Default `['image', 'font', 'media']`. Stylesheets are excluded by default because many SPAs need CSS before content renders. Pass `[]` to disable resource blocking entirely.
|
|
64
|
+
|
|
65
|
+
```js
|
|
66
|
+
await scraply({
|
|
67
|
+
startUrls: ['https://spa.example.com/products'],
|
|
68
|
+
fetcher: 'browser',
|
|
69
|
+
browser: {
|
|
70
|
+
waitUntil: 'networkidle2',
|
|
71
|
+
blockResources: ['image', 'font', 'media']
|
|
72
|
+
},
|
|
73
|
+
request: { timeout: 60000 }
|
|
74
|
+
});
|
|
75
|
+
```
|
|
76
|
+
|
|
55
77
|
## Programmatic API
|
|
56
78
|
`createCrawler(config)` returns an instance exposing each stage, plus lifecycle hooks:
|
|
57
79
|
|
|
@@ -72,68 +94,38 @@ crawler.on('transform', (record) => ({ ...record, length: record.content.length
|
|
|
72
94
|
await crawler.run();
|
|
73
95
|
```
|
|
74
96
|
|
|
75
|
-
Instance methods: `run()`, `crawl()`, `fetch(url)`, `extract(html, url)`, `enqueue(urls, opts)`, `format(records?)`, `stop()`, `on(event, fn)`.
|
|
97
|
+
Instance methods: `run()`, `crawl()`, `fetch(url)`, `extract(html, url)`, `enqueue(urls, opts)`, `format(records?)`, `requeueErrors()`, `stop()`, `on(event, fn)`.
|
|
98
|
+
|
|
99
|
+
`format()` reads crawled pages from `dataset/crawled/` via the persisted queue. You can call it alone to re-route output after a crawl — no need to fetch pages again.
|
|
76
100
|
|
|
77
101
|
Hooks: `response`, `extract`, `shouldEnqueue`, `transform`, `page`, `error`.
|
|
78
102
|
|
|
79
|
-
Standalone exports for advanced use: `normalizeUrl`, `matchesPattern`, `matchesAnyPattern`, `extractText`, `discoverLinks`, `routeRecord`, `writeRecords`, `formatRecords`, `loadConfig`, `DEFAULT_CONFIG`, `resolveFetcher`, `createHttpFetcher`, `createBrowserFetcher`.
|
|
103
|
+
Standalone exports for advanced use: `normalizeUrl`, `matchesPattern`, `matchesAnyPattern`, `extractText`, `discoverLinks`, `routeRecord`, `writeRecords`, `formatRecords`, `loadConfig`, `DEFAULT_CONFIG`, `resolveFetcher`, `createHttpFetcher`, `createBrowserFetcher`, `assertBrowserConfig`, `BROWSER_WAIT_UNTIL`, `BROWSER_BLOCKABLE_RESOURCES`.
|
|
80
104
|
|
|
81
105
|
## Configuration
|
|
82
|
-
All options are optional except `startUrls`. Durations are milliseconds.
|
|
106
|
+
All options are optional except `startUrls`. Pass a partial object to `scraply()` or `createCrawler()` — it is [deep-merged](src/config/load.js) over the defaults. Durations are in milliseconds.
|
|
83
107
|
|
|
84
|
-
|
|
85
|
-
{
|
|
86
|
-
startUrls: ['https://crawler-test.com/'],
|
|
87
|
-
include: [], // URL prefixes or RegExp; defaults to startUrls
|
|
88
|
-
exclude: [/\.(zip|png|js|css|...)$/i],
|
|
89
|
-
allowedContentTypes: ['text/html'],
|
|
90
|
-
fetcher: 'http', // 'http' | 'browser' | Fetcher instance
|
|
91
|
-
logLevel: 'info', // 'silent' | 'error' | 'warn' | 'info' | 'debug'
|
|
92
|
-
|
|
93
|
-
storage: { dir: 'dataset' },
|
|
94
|
-
|
|
95
|
-
request: {
|
|
96
|
-
timeout: 10000,
|
|
97
|
-
maxRedirects: 5,
|
|
98
|
-
maxContentLength: 20 * 1024 * 1024,
|
|
99
|
-
userAgent: 'Mozilla/5.0 (compatible; Scraply/2.0; +https://www.npmjs.com/package/scraply)'
|
|
100
|
-
},
|
|
108
|
+
**Full default values and inline comments:** [`src/config/defaults.js`](src/config/defaults.js)
|
|
101
109
|
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
statusCodes: [408, 500, 502, 503, 504],
|
|
105
|
-
delay: 1000
|
|
106
|
-
},
|
|
110
|
+
```js
|
|
111
|
+
import { DEFAULT_CONFIG, loadConfig } from 'scraply';
|
|
107
112
|
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
+
// Inspect or extend the defaults programmatically.
|
|
114
|
+
const config = loadConfig({
|
|
115
|
+
...DEFAULT_CONFIG,
|
|
116
|
+
startUrls: ['https://example.com']
|
|
117
|
+
});
|
|
118
|
+
```
|
|
113
119
|
|
|
114
|
-
|
|
115
|
-
concurrency: 5,
|
|
116
|
-
delay: 200, // per-host spacing
|
|
117
|
-
maxDepth: Infinity,
|
|
118
|
-
resetOnComplete: true
|
|
119
|
-
},
|
|
120
|
+
Top-level keys: `startUrls`, `include`, `exclude`, `allowedContentTypes`, `fetcher`, `browser`, `logLevel`, `storage`, `request`, `retry`, `rateLimit`, `crawl`, `extract`, `output`.
|
|
120
121
|
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
},
|
|
122
|
+
### Output routing
|
|
123
|
+
`output.routes` is a two-level map:
|
|
124
124
|
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
exclude: [],
|
|
128
|
-
routes: {
|
|
129
|
-
'https://crawler-test.com': { '*': 'general.json' }
|
|
130
|
-
}
|
|
131
|
-
}
|
|
132
|
-
}
|
|
133
|
-
```
|
|
125
|
+
1. **Outer keys** — URL prefix (usually `https://origin`, or `https://origin/path`) matched against the full crawled URL.
|
|
126
|
+
2. **Inner keys** — pathname segments joined with `/`, **without a leading slash**, matched from the longest suffix upward. Use `'*'` as fallback within that prefix.
|
|
134
127
|
|
|
135
|
-
|
|
136
|
-
`output.routes` maps a URL prefix to `{ pathKey: filename, '*': fallback }`. The most specific matching prefix wins, then the most specific path key, then `'*'`. For example:
|
|
128
|
+
Inner keys are case-sensitive and must match the URL pathname exactly (e.g. `Products/sports-watches`, not `/products/sports-watches`).
|
|
137
129
|
|
|
138
130
|
```js
|
|
139
131
|
output: {
|
|
@@ -142,7 +134,7 @@ output: {
|
|
|
142
134
|
'guide': 'guides.json',
|
|
143
135
|
'*': 'docs.json'
|
|
144
136
|
},
|
|
145
|
-
'https://example.com': { '*': '
|
|
137
|
+
'https://example.com/products/sports-watches': { '*': 'watches.json' }
|
|
146
138
|
}
|
|
147
139
|
}
|
|
148
140
|
```
|
|
@@ -167,4 +159,4 @@ The configuration is now camelCase and grouped, and the entry point is `src/inde
|
|
|
167
159
|
- `DATA_FORMATTER.CATEGORISED_PATHS` -> `output.routes`
|
|
168
160
|
- `DATA_FORMATTER.EXCLUDED_PATTERNS` -> `output.exclude`
|
|
169
161
|
|
|
170
|
-
New in 2.0: `crawl.concurrency`, `crawl.maxDepth`, `crawl.resetOnComplete`, `output.format`, pluggable `fetcher`, and lifecycle hooks. Formatted output is now real JSON by default (1.x wrote `url content` text lines).
|
|
162
|
+
New in 2.0: `crawl.concurrency`, `crawl.maxDepth`, `crawl.resetOnComplete`, `output.format`, `browser.waitUntil`, `browser.blockResources`, pluggable `fetcher`, and lifecycle hooks. Formatted output is now real JSON by default (1.x wrote `url content` text lines).
|
package/src/config/browser.js
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
/** @type {readonly ['load', 'domcontentloaded', 'networkidle0', 'networkidle2']} */
|
|
2
|
+
export const BROWSER_WAIT_UNTIL = Object.freeze([
|
|
3
|
+
'load',
|
|
4
|
+
'domcontentloaded',
|
|
5
|
+
'networkidle0',
|
|
6
|
+
'networkidle2'
|
|
7
|
+
]);
|
|
8
|
+
|
|
9
|
+
/** Puppeteer resource types Scraply may block to speed up browser fetches. */
|
|
10
|
+
export const BROWSER_BLOCKABLE_RESOURCES = Object.freeze(['image', 'stylesheet', 'font', 'media']);
|
|
11
|
+
|
|
12
|
+
/** Default blocked types. Stylesheets are excluded — many SPAs need CSS before content renders. */
|
|
13
|
+
export const DEFAULT_BROWSER_BLOCK_RESOURCES = Object.freeze(['image', 'font', 'media']);
|
|
14
|
+
|
|
15
|
+
/**
|
|
16
|
+
* @param {import('../index.js').BrowserConfig} browser
|
|
17
|
+
*/
|
|
18
|
+
export const assertBrowserConfig = (browser) => {
|
|
19
|
+
if (!BROWSER_WAIT_UNTIL.includes(browser?.waitUntil)) {
|
|
20
|
+
throw new Error(
|
|
21
|
+
`Invalid browser.waitUntil: ${String(browser?.waitUntil)}. Expected one of: ${BROWSER_WAIT_UNTIL.join(', ')}`
|
|
22
|
+
);
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
const blockResources = browser?.blockResources;
|
|
26
|
+
if (!Array.isArray(blockResources)) {
|
|
27
|
+
throw new Error('Invalid browser.blockResources: expected an array.');
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
for (const type of blockResources) {
|
|
31
|
+
if (!BROWSER_BLOCKABLE_RESOURCES.includes(type)) {
|
|
32
|
+
throw new Error(
|
|
33
|
+
`Invalid browser.blockResources entry: ${String(type)}. Expected one of: ${BROWSER_BLOCKABLE_RESOURCES.join(', ')}`
|
|
34
|
+
);
|
|
35
|
+
}
|
|
36
|
+
}
|
|
37
|
+
};
|
package/src/config/defaults.js
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
|
+
import { DEFAULT_BROWSER_BLOCK_RESOURCES } from './browser.js';
|
|
2
|
+
|
|
1
3
|
/**
|
|
2
|
-
* Default Scraply configuration. Every value here can be overridden by the
|
|
3
|
-
* object passed to `createCrawler()` / `scraply()`. Durations are in milliseconds.
|
|
4
|
+
* Default Scraply configuration. Every value here can be overridden by the object passed to `createCrawler()` / `scraply()`. Durations are in milliseconds.
|
|
4
5
|
*
|
|
5
6
|
* @type {import('../index.js').ScraplyConfig}
|
|
6
7
|
*/
|
|
@@ -8,9 +9,7 @@ export const DEFAULT_CONFIG = {
|
|
|
8
9
|
// URLs the crawl is seeded with.
|
|
9
10
|
startUrls: ['https://crawler-test.com/'],
|
|
10
11
|
|
|
11
|
-
// Which discovered links are allowed into the queue. Each entry is either an
|
|
12
|
-
// absolute URL prefix (e.g. 'https://site.com/blog') or a RegExp. Empty means
|
|
13
|
-
// "default to startUrls".
|
|
12
|
+
// Which discovered links are allowed into the queue. Each entry is either an absolute URL prefix (e.g. 'https://site.com/blog') or a RegExp. Empty means "default to startUrls".
|
|
14
13
|
include: [],
|
|
15
14
|
|
|
16
15
|
// Links matching any of these (string prefix or RegExp) are never queued.
|
|
@@ -24,6 +23,15 @@ export const DEFAULT_CONFIG = {
|
|
|
24
23
|
// 'http' (native fetch), 'browser' (Puppeteer) or a custom Fetcher instance.
|
|
25
24
|
fetcher: 'http',
|
|
26
25
|
|
|
26
|
+
// Options for the built-in Puppeteer fetcher (`fetcher: 'browser'`).
|
|
27
|
+
browser: {
|
|
28
|
+
// When page.goto considers navigation finished. Use 'networkidle2' for SPAs that inject links/content after load (e.g. Vue/React apps).
|
|
29
|
+
waitUntil: 'load',
|
|
30
|
+
|
|
31
|
+
// Resource types to abort during fetch (speeds up crawls). Stylesheets are omitted by default because many SPAs need CSS before content renders.
|
|
32
|
+
blockResources: [...DEFAULT_BROWSER_BLOCK_RESOURCES]
|
|
33
|
+
},
|
|
34
|
+
|
|
27
35
|
// 'silent' | 'error' | 'warn' | 'info' | 'debug'
|
|
28
36
|
logLevel: 'info',
|
|
29
37
|
|
|
@@ -32,10 +40,11 @@ export const DEFAULT_CONFIG = {
|
|
|
32
40
|
},
|
|
33
41
|
|
|
34
42
|
request: {
|
|
35
|
-
timeout: 10000,
|
|
36
|
-
maxRedirects: 5,
|
|
37
|
-
maxContentLength: 20 * 1024 * 1024,
|
|
38
|
-
userAgent: 'Mozilla/5.0 (compatible; Scraply/2.0; +https://www.npmjs.com/package/scraply)'
|
|
43
|
+
timeout: 10000, // per-request budget (aborts the fetch, including body read)
|
|
44
|
+
maxRedirects: 5, // redirect hops the HTTP fetcher follows before giving up
|
|
45
|
+
maxContentLength: 20 * 1024 * 1024, // hard cap on the response body (bytes); 0 disables it
|
|
46
|
+
userAgent: 'Mozilla/5.0 (compatible; Scraply/2.0; +https://www.npmjs.com/package/scraply)',
|
|
47
|
+
headers: {} // extra request headers (auth, Accept-Language, cookies, ...) sent by every fetcher
|
|
39
48
|
},
|
|
40
49
|
|
|
41
50
|
retry: {
|
|
@@ -54,7 +63,9 @@ export const DEFAULT_CONFIG = {
|
|
|
54
63
|
concurrency: 5,
|
|
55
64
|
delay: 200, // minimum spacing between requests to the same host
|
|
56
65
|
maxDepth: Infinity,
|
|
57
|
-
|
|
66
|
+
maxPages: Infinity, // hard cap on successfully crawled pages (counts across resumes)
|
|
67
|
+
resetOnComplete: true,
|
|
68
|
+
retryErrors: false // re-queue previously errored URLs on resume so they are retried
|
|
58
69
|
},
|
|
59
70
|
|
|
60
71
|
extract: {
|
package/src/config/load.js
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
import path from 'node:path';
|
|
2
2
|
import { DEFAULT_CONFIG } from './defaults.js';
|
|
3
|
+
import { assertBrowserConfig } from './browser.js';
|
|
4
|
+
import { normalizeUrl } from '../url/normalize.js';
|
|
3
5
|
|
|
4
6
|
const isPlainObject = (value) =>
|
|
5
7
|
value !== null && typeof value === 'object' && !Array.isArray(value) && !(value instanceof RegExp);
|
|
@@ -31,9 +33,14 @@ export const loadConfig = (userConfig = {}) => {
|
|
|
31
33
|
config.storage.crawledDir = path.posix.join(dir, 'crawled');
|
|
32
34
|
config.storage.formattedDir = path.posix.join(dir, 'formatted');
|
|
33
35
|
|
|
36
|
+
// When no include rules are given, fall back to the start URLs — normalized so
|
|
37
|
+
// they match the normalized links the crawler actually discovers (forced
|
|
38
|
+
// HTTPS, no "www.", no trailing slash).
|
|
34
39
|
if (!config.include?.length) {
|
|
35
|
-
config.include =
|
|
40
|
+
config.include = config.startUrls.map(normalizeUrl);
|
|
36
41
|
}
|
|
37
42
|
|
|
43
|
+
assertBrowserConfig(config.browser);
|
|
44
|
+
|
|
38
45
|
return config;
|
|
39
46
|
};
|
package/src/core/queue.js
CHANGED
|
@@ -3,19 +3,21 @@ import { loadJSON, saveJSON, deletePath } from '../storage/files.js';
|
|
|
3
3
|
/**
|
|
4
4
|
* @typedef {Object} QueueEntry
|
|
5
5
|
* @property {string} url
|
|
6
|
-
* @property {string|null} file -
|
|
6
|
+
* @property {string|null} file - filename of the saved crawled record (relative to crawledDir), or null
|
|
7
7
|
* @property {number|null} status - last HTTP status
|
|
8
8
|
* @property {string|null} error - error message, or null
|
|
9
|
+
* @property {string|null} skipped - reason the page was skipped (e.g. content-type), or null
|
|
9
10
|
* @property {string|null} referrer - URL this entry was discovered on
|
|
10
11
|
* @property {number} depth
|
|
11
12
|
*/
|
|
12
13
|
|
|
13
|
-
const isProcessed = (entry) => entry.file !== null || entry.error !== null;
|
|
14
|
+
const isProcessed = (entry) => entry.file !== null || entry.error !== null || entry.skipped !== null;
|
|
14
15
|
|
|
15
16
|
/**
|
|
16
17
|
* Owns the crawl queue: dedup, depth limiting, status tracking and durable
|
|
17
|
-
* checkpointing.
|
|
18
|
-
*
|
|
18
|
+
* checkpointing. Status totals are tracked incrementally (O(1) reads) and
|
|
19
|
+
* persistence is debounced so a high-concurrency crawl does not rewrite the
|
|
20
|
+
* queue file on every single URL.
|
|
19
21
|
*/
|
|
20
22
|
export class QueueManager {
|
|
21
23
|
/** @param {{ config: import('../index.js').ResolvedConfig, logger: any }} deps */
|
|
@@ -32,16 +34,30 @@ export class QueueManager {
|
|
|
32
34
|
/** @type {QueueEntry[]} */
|
|
33
35
|
this._pending = [];
|
|
34
36
|
this._cursor = 0;
|
|
37
|
+
this._crawled = 0;
|
|
38
|
+
this._errors = 0;
|
|
39
|
+
this._skipped = 0;
|
|
35
40
|
this._dirty = false;
|
|
36
41
|
this._timer = null;
|
|
37
42
|
this._persistInterval = 1000;
|
|
38
43
|
}
|
|
39
44
|
|
|
40
|
-
/** Loads any previously persisted queue and rebuilds the in-memory indexes. */
|
|
45
|
+
/** Loads any previously persisted queue and rebuilds the in-memory indexes and totals. */
|
|
41
46
|
load() {
|
|
42
47
|
this.entries = loadJSON(this.path, []) ?? [];
|
|
43
48
|
this.index = new Set(this.entries.map((entry) => entry.url));
|
|
44
|
-
this._pending =
|
|
49
|
+
this._pending = [];
|
|
50
|
+
this._crawled = 0;
|
|
51
|
+
this._errors = 0;
|
|
52
|
+
this._skipped = 0;
|
|
53
|
+
|
|
54
|
+
for (const entry of this.entries) {
|
|
55
|
+
if (entry.file !== null) this._crawled += 1;
|
|
56
|
+
else if (entry.error !== null) this._errors += 1;
|
|
57
|
+
else if (entry.skipped !== null) this._skipped += 1;
|
|
58
|
+
else this._pending.push(entry);
|
|
59
|
+
}
|
|
60
|
+
|
|
45
61
|
this._cursor = 0;
|
|
46
62
|
return this.entries;
|
|
47
63
|
}
|
|
@@ -59,7 +75,7 @@ export class QueueManager {
|
|
|
59
75
|
add(url, { depth = 0, referrer = null } = {}) {
|
|
60
76
|
if (this.index.has(url) || depth > this.maxDepth) return false;
|
|
61
77
|
|
|
62
|
-
const entry = { url, file: null, status: null, error: null, referrer, depth };
|
|
78
|
+
const entry = { url, file: null, status: null, error: null, skipped: null, referrer, depth };
|
|
63
79
|
this.index.add(url);
|
|
64
80
|
this.entries.push(entry);
|
|
65
81
|
this._pending.push(entry);
|
|
@@ -76,29 +92,64 @@ export class QueueManager {
|
|
|
76
92
|
entry.file = file;
|
|
77
93
|
entry.status = status;
|
|
78
94
|
entry.error = null;
|
|
95
|
+
entry.skipped = null;
|
|
96
|
+
this._crawled += 1;
|
|
79
97
|
this._markDirty();
|
|
80
98
|
}
|
|
81
99
|
|
|
82
100
|
markError(entry, { error, status }) {
|
|
83
101
|
entry.error = error;
|
|
84
102
|
entry.status = status ?? null;
|
|
103
|
+
this._errors += 1;
|
|
104
|
+
this._markDirty();
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
markSkipped(entry, { reason, status }) {
|
|
108
|
+
entry.skipped = reason;
|
|
109
|
+
entry.status = status ?? null;
|
|
110
|
+
this._skipped += 1;
|
|
85
111
|
this._markDirty();
|
|
86
112
|
}
|
|
87
113
|
|
|
114
|
+
/**
|
|
115
|
+
* Clears the error on every failed entry and returns it to the pending set so
|
|
116
|
+
* the next crawl retries it. Persists immediately so a fresh `load()` (e.g. at
|
|
117
|
+
* the start of `crawl()`) sees the requeued entries.
|
|
118
|
+
* @returns {number} how many entries were requeued
|
|
119
|
+
*/
|
|
120
|
+
requeueErrors() {
|
|
121
|
+
let count = 0;
|
|
122
|
+
for (const entry of this.entries) {
|
|
123
|
+
if (entry.error !== null) {
|
|
124
|
+
entry.error = null;
|
|
125
|
+
entry.status = null;
|
|
126
|
+
this._pending.push(entry);
|
|
127
|
+
this._errors -= 1;
|
|
128
|
+
count += 1;
|
|
129
|
+
}
|
|
130
|
+
}
|
|
131
|
+
if (count > 0) this.flush();
|
|
132
|
+
return count;
|
|
133
|
+
}
|
|
134
|
+
|
|
88
135
|
isAllProcessed() {
|
|
89
|
-
return this.entries.length > 0 && this.
|
|
136
|
+
return this.entries.length > 0 && this.pendingCount() === 0;
|
|
90
137
|
}
|
|
91
138
|
|
|
92
139
|
pendingCount() {
|
|
93
|
-
return this.entries.
|
|
140
|
+
return this.entries.length - this._crawled - this._errors - this._skipped;
|
|
94
141
|
}
|
|
95
142
|
|
|
96
143
|
crawledCount() {
|
|
97
|
-
return this.
|
|
144
|
+
return this._crawled;
|
|
98
145
|
}
|
|
99
146
|
|
|
100
147
|
errorCount() {
|
|
101
|
-
return this.
|
|
148
|
+
return this._errors;
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
skippedCount() {
|
|
152
|
+
return this._skipped;
|
|
102
153
|
}
|
|
103
154
|
|
|
104
155
|
/** Clears in-memory state and removes the persisted queue file. */
|
|
@@ -107,6 +158,9 @@ export class QueueManager {
|
|
|
107
158
|
this.index = new Set();
|
|
108
159
|
this._pending = [];
|
|
109
160
|
this._cursor = 0;
|
|
161
|
+
this._crawled = 0;
|
|
162
|
+
this._errors = 0;
|
|
163
|
+
this._skipped = 0;
|
|
110
164
|
this._dirty = false;
|
|
111
165
|
deletePath(this.path);
|
|
112
166
|
}
|
package/src/core/retry.js
CHANGED
|
@@ -22,46 +22,50 @@ const computeWait = (headers = {}, fallback) => {
|
|
|
22
22
|
* Wraps a fetch operation with retry and rate-limit handling shared by every
|
|
23
23
|
* fetcher backend.
|
|
24
24
|
*
|
|
25
|
+
* Rate limiting (HTTP 429) is handled independently of the normal retry budget:
|
|
26
|
+
* when `rateLimit.exitOnLimit` is false the runner waits (honoring `retry-after`
|
|
27
|
+
* / `x-ratelimit-reset`) and retries until the host relents; otherwise it
|
|
28
|
+
* triggers a clean exit so a scheduler can resume the crawl later.
|
|
29
|
+
*
|
|
25
30
|
* @param {{ config: import('../index.js').ResolvedConfig, logger: any, onRateLimitExit: (code: number) => void }} deps
|
|
26
31
|
*/
|
|
27
32
|
export const createRetryRunner = ({ config, logger, onRateLimitExit }) => {
|
|
28
33
|
const { retry, rateLimit } = config;
|
|
29
34
|
|
|
30
|
-
const shouldRetry = async (error) => {
|
|
31
|
-
const status = error?.response?.status;
|
|
32
|
-
if (status === undefined) return true; // network/transport error
|
|
33
|
-
|
|
34
|
-
if (status === 429) {
|
|
35
|
-
if (rateLimit.exitOnLimit) return false; // run() handles the exit
|
|
36
|
-
const wait = computeWait(error.response.headers, rateLimit.fallbackDelay);
|
|
37
|
-
logger.warn(`Rate limited. Waiting ${Math.round(wait / 1000)}s before retrying...`);
|
|
38
|
-
await delay(wait);
|
|
39
|
-
return true;
|
|
40
|
-
}
|
|
41
|
-
|
|
42
|
-
return retry.statusCodes.includes(status);
|
|
43
|
-
};
|
|
44
|
-
|
|
45
35
|
const run = async (fn) => {
|
|
46
|
-
|
|
36
|
+
let attempt = 0;
|
|
37
|
+
|
|
38
|
+
for (;;) {
|
|
47
39
|
try {
|
|
48
40
|
return await fn();
|
|
49
41
|
} catch (error) {
|
|
50
|
-
const
|
|
51
|
-
|
|
52
|
-
|
|
42
|
+
const status = error?.response?.status;
|
|
43
|
+
|
|
44
|
+
if (status === 429) {
|
|
45
|
+
if (rateLimit.exitOnLimit) {
|
|
46
|
+
logger.warn(`Force exiting with code ${rateLimit.exitCode} (rate limited).`);
|
|
47
|
+
onRateLimitExit(rateLimit.exitCode);
|
|
48
|
+
throw error;
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
const wait = computeWait(error.response.headers, rateLimit.fallbackDelay);
|
|
52
|
+
logger.warn(`Rate limited. Waiting ${Math.round(wait / 1000)}s before retrying...`);
|
|
53
|
+
await delay(wait);
|
|
54
|
+
continue; // rate-limit waits never consume the retry budget
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
const retriable = status === undefined || retry.statusCodes.includes(status);
|
|
58
|
+
if (retriable && attempt < retry.max) {
|
|
59
|
+
attempt += 1;
|
|
60
|
+
logger.info(`Retry ${attempt}/${retry.max} -> ${error.message}`);
|
|
53
61
|
if (retry.delay > 0) await delay(retry.delay);
|
|
54
62
|
continue;
|
|
55
63
|
}
|
|
56
64
|
|
|
57
|
-
if (error?.response?.status === 429) {
|
|
58
|
-
logger.warn(`Force exiting with code ${rateLimit.exitCode} (rate limited).`);
|
|
59
|
-
onRateLimitExit(rateLimit.exitCode);
|
|
60
|
-
}
|
|
61
65
|
throw error;
|
|
62
66
|
}
|
|
63
67
|
}
|
|
64
68
|
};
|
|
65
69
|
|
|
66
|
-
return { run
|
|
70
|
+
return { run };
|
|
67
71
|
};
|
package/src/crawler.js
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import path from 'node:path';
|
|
2
|
+
import { createHash } from 'node:crypto';
|
|
2
3
|
import * as cheerio from 'cheerio';
|
|
3
4
|
|
|
4
5
|
import { loadConfig } from './config/load.js';
|
|
@@ -15,22 +16,12 @@ import { resolveFetcher } from './fetchers/index.js';
|
|
|
15
16
|
import { formatRecords } from './output/writers.js';
|
|
16
17
|
import { loadJSON, saveJSON, deletePath, deleteUntracked } from './storage/files.js';
|
|
17
18
|
|
|
18
|
-
const getHeader = (headers, name) => {
|
|
19
|
-
if (!headers) return undefined;
|
|
20
|
-
if (headers[name] !== undefined) return headers[name];
|
|
21
|
-
const lower = name.toLowerCase();
|
|
22
|
-
for (const key of Object.keys(headers)) {
|
|
23
|
-
if (key.toLowerCase() === lower) return headers[key];
|
|
24
|
-
}
|
|
25
|
-
return undefined;
|
|
26
|
-
};
|
|
27
|
-
|
|
28
19
|
const toHtml = (data) => (typeof data === 'string' ? data : Buffer.from(data).toString('utf8'));
|
|
29
20
|
|
|
21
|
+
const sha256 = (text) => createHash('sha256').update(text).digest('hex');
|
|
22
|
+
|
|
30
23
|
/**
|
|
31
|
-
* Creates a crawler instance. Every stage is exposed as a method so callers can
|
|
32
|
-
* run the whole pipeline (`run`) or drive individual stages and add their own
|
|
33
|
-
* logic via hooks.
|
|
24
|
+
* Creates a crawler instance. Every stage is exposed as a method so callers can run the whole pipeline (`run`) or drive individual stages and add their own logic via hooks.
|
|
34
25
|
*
|
|
35
26
|
* @param {import('./index.js').ScraplyConfig} [userConfig]
|
|
36
27
|
*/
|
|
@@ -41,6 +32,11 @@ export const createCrawler = (userConfig = {}) => {
|
|
|
41
32
|
const queue = new QueueManager({ config, logger });
|
|
42
33
|
const fetcher = resolveFetcher({ config, logger });
|
|
43
34
|
|
|
35
|
+
// Normalized once so the start URLs match discovered (normalized) links and
|
|
36
|
+
// can be looked up in O(1) during filtering.
|
|
37
|
+
const startUrls = config.startUrls.map(normalizeUrl);
|
|
38
|
+
const startUrlSet = new Set(startUrls);
|
|
39
|
+
|
|
44
40
|
let stopped = false;
|
|
45
41
|
let initialized = false;
|
|
46
42
|
let datasetCounter = 0;
|
|
@@ -76,9 +72,14 @@ export const createCrawler = (userConfig = {}) => {
|
|
|
76
72
|
queue.load();
|
|
77
73
|
datasetCounter = computeDatasetCounter();
|
|
78
74
|
|
|
75
|
+
if (config.crawl.retryErrors) {
|
|
76
|
+
const requeued = queue.requeueErrors();
|
|
77
|
+
if (requeued > 0) logger.info(`Re-queued ${requeued} previously errored URL(s) for retry.`);
|
|
78
|
+
}
|
|
79
|
+
|
|
79
80
|
if (queue.entries.length === 0) {
|
|
80
|
-
logger.info(`Starting fresh with ${
|
|
81
|
-
queue.seed(
|
|
81
|
+
logger.info(`Starting fresh with ${startUrls.length} start URL(s).`);
|
|
82
|
+
queue.seed(startUrls);
|
|
82
83
|
return;
|
|
83
84
|
}
|
|
84
85
|
|
|
@@ -88,7 +89,7 @@ export const createCrawler = (userConfig = {}) => {
|
|
|
88
89
|
queue.reset();
|
|
89
90
|
deletePath(config.storage.crawledDir);
|
|
90
91
|
datasetCounter = 0;
|
|
91
|
-
queue.seed(
|
|
92
|
+
queue.seed(startUrls);
|
|
92
93
|
} else {
|
|
93
94
|
logger.info('All URLs already processed (resetOnComplete is false). Nothing to do.');
|
|
94
95
|
}
|
|
@@ -100,22 +101,22 @@ export const createCrawler = (userConfig = {}) => {
|
|
|
100
101
|
|
|
101
102
|
// --- stage methods ---
|
|
102
103
|
|
|
103
|
-
|
|
104
|
+
// Fetches a single URL (with retry/rate-limit policy) and returns the raw result.
|
|
104
105
|
const fetchUrl = (url) => retryRunner.run(() => fetcher.fetch(normalizeUrl(url)));
|
|
105
106
|
|
|
106
|
-
|
|
107
|
+
// Extracts readable text from HTML.
|
|
107
108
|
const extract = (html, url = null) => ({
|
|
108
109
|
url,
|
|
109
110
|
content: extractText(html, { removeSelectors: config.extract.removeSelectors })
|
|
110
111
|
});
|
|
111
112
|
|
|
112
113
|
const shouldCrawl = (url) => {
|
|
113
|
-
if (
|
|
114
|
+
if (startUrlSet.has(url)) return true;
|
|
114
115
|
if (matchesAnyPattern(url, config.exclude)) return false;
|
|
115
116
|
return matchesAnyPattern(url, config.include);
|
|
116
117
|
};
|
|
117
118
|
|
|
118
|
-
|
|
119
|
+
// Filters + normalizes URLs and adds the survivors to the queue.
|
|
119
120
|
const enqueue = async (urls, { depth = 0, referrer = null } = {}) => {
|
|
120
121
|
const list = Array.isArray(urls) ? urls : [urls];
|
|
121
122
|
let added = 0;
|
|
@@ -137,15 +138,17 @@ export const createCrawler = (userConfig = {}) => {
|
|
|
137
138
|
return added;
|
|
138
139
|
};
|
|
139
140
|
|
|
141
|
+
// Persists a crawled record and returns its filename (relative to crawledDir).
|
|
142
|
+
// Only the bare name is stored in the queue so datasets stay portable.
|
|
140
143
|
const saveDataset = (record) => {
|
|
141
144
|
datasetCounter += 1;
|
|
142
|
-
const
|
|
143
|
-
saveJSON(
|
|
144
|
-
return
|
|
145
|
+
const file = `${datasetCounter}.json`;
|
|
146
|
+
saveJSON(path.posix.join(config.storage.crawledDir, file), record);
|
|
147
|
+
return file;
|
|
145
148
|
};
|
|
146
149
|
|
|
147
150
|
const processOne = async (entry) => {
|
|
148
|
-
if (entry.file || entry.error) return;
|
|
151
|
+
if (entry.file || entry.error || entry.skipped) return;
|
|
149
152
|
|
|
150
153
|
processedCount += 1;
|
|
151
154
|
logger.info(`- ${processedCount}/${queue.entries.length} -> ${entry.url}`);
|
|
@@ -154,9 +157,10 @@ export const createCrawler = (userConfig = {}) => {
|
|
|
154
157
|
const result = await retryRunner.run(() => fetcher.fetch(entry.url));
|
|
155
158
|
await hooks.emit('response', result, entry);
|
|
156
159
|
|
|
157
|
-
|
|
160
|
+
// Fetchers return lowercased header keys (see Fetcher interface).
|
|
161
|
+
const contentType = result.headers?.['content-type'];
|
|
158
162
|
if (!contentType || !config.allowedContentTypes.some((type) => contentType.includes(type))) {
|
|
159
|
-
queue.
|
|
163
|
+
queue.markSkipped(entry, { reason: `content-type: ${contentType ?? 'none'}`, status: result.status });
|
|
160
164
|
return;
|
|
161
165
|
}
|
|
162
166
|
|
|
@@ -168,25 +172,41 @@ export const createCrawler = (userConfig = {}) => {
|
|
|
168
172
|
let content = extractText($, { removeSelectors: config.extract.removeSelectors });
|
|
169
173
|
content = await hooks.reduce('extract', content, $, entry);
|
|
170
174
|
|
|
171
|
-
const
|
|
175
|
+
const record = {
|
|
176
|
+
url: entry.url,
|
|
177
|
+
content,
|
|
178
|
+
crawledAt: new Date().toISOString(),
|
|
179
|
+
hash: sha256(content)
|
|
180
|
+
};
|
|
181
|
+
|
|
182
|
+
const file = saveDataset(record);
|
|
172
183
|
queue.markDone(entry, { file, status: result.status });
|
|
173
184
|
|
|
174
|
-
const
|
|
175
|
-
await hooks.emit('page',
|
|
185
|
+
const transformed = await hooks.reduce('transform', record, entry);
|
|
186
|
+
await hooks.emit('page', transformed, entry);
|
|
176
187
|
} catch (error) {
|
|
177
|
-
|
|
188
|
+
// A 429 only reaches here when rateLimit.exitOnLimit is true and the
|
|
189
|
+
// process is already exiting; leave the entry pending so the next run
|
|
190
|
+
// retries it instead of recording a permanent error.
|
|
191
|
+
if (error.response?.status !== 429) {
|
|
192
|
+
queue.markError(entry, { error: error.message, status: error.response?.status });
|
|
193
|
+
}
|
|
178
194
|
await hooks.emit('error', error, entry);
|
|
179
195
|
logger.error(`Failed to fetch ${entry.url} -> ${error.message}`);
|
|
180
196
|
}
|
|
181
197
|
};
|
|
182
198
|
|
|
183
199
|
const logBanner = () => {
|
|
200
|
+
const browserLine =
|
|
201
|
+
fetcher.name === 'browser' ? `\n - Browser waitUntil: ${config.browser.waitUntil}` : '';
|
|
202
|
+
|
|
184
203
|
logger.info(`STARTING SCRAPLY CRAWLER...
|
|
185
204
|
- Start URLs: ${config.startUrls.join(', ')}
|
|
186
|
-
- Fetcher: ${fetcher.name}
|
|
205
|
+
- Fetcher: ${fetcher.name}${browserLine}
|
|
187
206
|
- Concurrency: ${config.crawl.concurrency}
|
|
188
207
|
- Per-host delay: ${config.crawl.delay}ms
|
|
189
208
|
- Max depth: ${config.crawl.maxDepth}
|
|
209
|
+
- Max pages: ${config.crawl.maxPages}
|
|
190
210
|
- Allowed content types: ${config.allowedContentTypes.join(', ')}
|
|
191
211
|
- Output format: ${config.output.format}
|
|
192
212
|
`);
|
|
@@ -208,48 +228,52 @@ export const createCrawler = (userConfig = {}) => {
|
|
|
208
228
|
process.once('SIGTERM', handler);
|
|
209
229
|
};
|
|
210
230
|
|
|
211
|
-
|
|
231
|
+
// Crawls until the queue is drained (or `stop()` is called).
|
|
212
232
|
const crawl = async () => {
|
|
213
233
|
init();
|
|
214
234
|
logBanner();
|
|
215
235
|
registerSignals();
|
|
216
236
|
|
|
217
237
|
if (fetcher.init) await fetcher.init();
|
|
218
|
-
processedCount = queue.crawledCount() + queue.errorCount();
|
|
238
|
+
processedCount = queue.crawledCount() + queue.errorCount() + queue.skippedCount();
|
|
219
239
|
|
|
220
240
|
await runPipeline({
|
|
221
241
|
queue,
|
|
222
242
|
concurrency: config.crawl.concurrency,
|
|
223
243
|
perHostDelay: config.crawl.delay,
|
|
224
244
|
processOne,
|
|
225
|
-
isStopped: () => stopped
|
|
245
|
+
isStopped: () => stopped || queue.crawledCount() >= config.crawl.maxPages
|
|
226
246
|
});
|
|
227
247
|
|
|
228
248
|
queue.flush();
|
|
249
|
+
|
|
250
|
+
if (config.crawl.maxPages !== Infinity && queue.crawledCount() >= config.crawl.maxPages) {
|
|
251
|
+
logger.info(`Reached maxPages limit (${config.crawl.maxPages}).`);
|
|
252
|
+
}
|
|
253
|
+
|
|
229
254
|
logger.info(
|
|
230
|
-
`Crawling completed! ${queue.crawledCount()}
|
|
231
|
-
|
|
255
|
+
`Crawling completed! ${queue.crawledCount()} crawled, ${queue.skippedCount()} skipped, ` +
|
|
256
|
+
`${queue.errorCount()} errors, ${queue.pendingCount()} pending (of ${queue.entries.length} total).`
|
|
232
257
|
);
|
|
233
258
|
};
|
|
234
259
|
|
|
235
|
-
|
|
260
|
+
// Re-reads crawled pages from disk so resumed runs include earlier sessions.
|
|
236
261
|
const collectRecords = () => {
|
|
237
262
|
const records = [];
|
|
238
263
|
for (const entry of queue.entries) {
|
|
239
|
-
if (!entry.file
|
|
240
|
-
const data = loadJSON(entry.file, null);
|
|
264
|
+
if (!entry.file) continue;
|
|
265
|
+
const data = loadJSON(path.posix.join(config.storage.crawledDir, entry.file), null);
|
|
241
266
|
if (data) records.push({ url: entry.url, content: data.content });
|
|
242
267
|
}
|
|
243
268
|
return records;
|
|
244
269
|
};
|
|
245
270
|
|
|
246
|
-
|
|
247
|
-
* Routes records to their output files and writes them. Defaults to every
|
|
248
|
-
* successfully crawled page; pass an explicit array to format custom records.
|
|
249
|
-
*/
|
|
271
|
+
// Routes records to their output files and writes them. Defaults to every successfully crawled page; pass an explicit array to format custom records. When reading from disk, reloads `dataset/queue.json` first so this can run without calling `crawl()` (e.g. after changing `output.routes`).
|
|
250
272
|
const format = async (records = null) => {
|
|
251
273
|
logger.info('Formatting data...');
|
|
252
274
|
|
|
275
|
+
if (records === null) queue.load();
|
|
276
|
+
|
|
253
277
|
const collected = records ?? collectRecords();
|
|
254
278
|
const groups = formatRecords(collected, {
|
|
255
279
|
output: config.output,
|
|
@@ -269,7 +293,7 @@ export const createCrawler = (userConfig = {}) => {
|
|
|
269
293
|
return groups;
|
|
270
294
|
};
|
|
271
295
|
|
|
272
|
-
|
|
296
|
+
// Full pipeline: init -> crawl -> format, with guaranteed cleanup.
|
|
273
297
|
const run = async () => {
|
|
274
298
|
try {
|
|
275
299
|
await crawl();
|
|
@@ -292,11 +316,17 @@ export const createCrawler = (userConfig = {}) => {
|
|
|
292
316
|
crawl,
|
|
293
317
|
format,
|
|
294
318
|
run,
|
|
319
|
+
// Clears errored entries and returns them to the queue so a later crawl()
|
|
320
|
+
// retries them. Persists immediately; returns how many were requeued.
|
|
321
|
+
requeueErrors: () => {
|
|
322
|
+
if (queue.entries.length === 0) queue.load();
|
|
323
|
+
return queue.requeueErrors();
|
|
324
|
+
},
|
|
295
325
|
stop: () => {
|
|
296
326
|
stopped = true;
|
|
297
327
|
}
|
|
298
328
|
};
|
|
299
329
|
};
|
|
300
330
|
|
|
301
|
-
|
|
331
|
+
// One-call convenience wrapper: create a crawler and run the full pipeline.
|
|
302
332
|
export const scraply = (userConfig = {}) => createCrawler(userConfig).run();
|
package/src/extract/links.js
CHANGED
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
import { URL } from 'node:url';
|
|
2
|
-
import { normalizeUrl } from '../url/normalize.js';
|
|
3
2
|
|
|
4
3
|
const NON_NAVIGATIONAL = /^(mailto:|tel:|javascript:|data:)/i;
|
|
5
4
|
|
|
6
5
|
/**
|
|
7
|
-
* Collects unique,
|
|
8
|
-
*
|
|
6
|
+
* Collects unique, absolute links from anchor tags in a document, resolving
|
|
7
|
+
* relative hrefs against `baseUrl`. Normalization and include/exclude filtering
|
|
8
|
+
* are the crawler's job (`enqueue`), so links are only resolved here.
|
|
9
9
|
*
|
|
10
10
|
* @param {import('cheerio').CheerioAPI} $
|
|
11
11
|
* @param {string} baseUrl - used to resolve relative hrefs
|
|
@@ -19,7 +19,7 @@ export const discoverLinks = ($, baseUrl) => {
|
|
|
19
19
|
if (!href || href.startsWith('#') || NON_NAVIGATIONAL.test(href)) return;
|
|
20
20
|
|
|
21
21
|
try {
|
|
22
|
-
links.add(
|
|
22
|
+
links.add(new URL(href, baseUrl).href);
|
|
23
23
|
} catch {
|
|
24
24
|
// Ignore malformed hrefs.
|
|
25
25
|
}
|
|
@@ -1,18 +1,18 @@
|
|
|
1
1
|
import { Cluster } from 'puppeteer-cluster';
|
|
2
2
|
|
|
3
|
-
const BLOCKED_RESOURCES = new Set(['image', 'stylesheet', 'font', 'media']);
|
|
4
|
-
|
|
5
3
|
/**
|
|
6
|
-
* Puppeteer-cluster backend for JavaScript-rendered pages. `page.goto` already
|
|
7
|
-
* follows redirects and returns the final response, so no manual redirect
|
|
8
|
-
* handling is needed.
|
|
4
|
+
* Puppeteer-cluster backend for JavaScript-rendered pages. `page.goto` already follows redirects and returns the final response, so no manual redirect handling is needed. The `browser` config is validated once in `loadConfig`, so no re-validation is needed here.
|
|
9
5
|
*
|
|
10
6
|
* @param {import('./types.js').FetcherDeps} deps
|
|
11
7
|
* @returns {import('./types.js').Fetcher}
|
|
12
8
|
*/
|
|
13
9
|
export const createBrowserFetcher = ({ config, logger }) => {
|
|
14
|
-
const { request, crawl } = config;
|
|
10
|
+
const { request, crawl, browser } = config;
|
|
15
11
|
const timeout = Math.max(request.timeout, 5000);
|
|
12
|
+
|
|
13
|
+
const { waitUntil, blockResources } = browser;
|
|
14
|
+
const blockedResources = new Set(blockResources);
|
|
15
|
+
|
|
16
16
|
let cluster = null;
|
|
17
17
|
|
|
18
18
|
const init = async () => {
|
|
@@ -31,13 +31,19 @@ export const createBrowserFetcher = ({ config, logger }) => {
|
|
|
31
31
|
|
|
32
32
|
await cluster.task(async ({ page, data: url }) => {
|
|
33
33
|
await page.setUserAgent(request.userAgent);
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
34
|
+
if (Object.keys(request.headers).length > 0) {
|
|
35
|
+
await page.setExtraHTTPHeaders(request.headers);
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
if (blockedResources.size > 0) {
|
|
39
|
+
await page.setRequestInterception(true);
|
|
40
|
+
page.on('request', (req) => {
|
|
41
|
+
if (blockedResources.has(req.resourceType())) req.abort();
|
|
42
|
+
else req.continue();
|
|
43
|
+
});
|
|
44
|
+
}
|
|
39
45
|
|
|
40
|
-
const response = await page.goto(url, { timeout, waitUntil
|
|
46
|
+
const response = await page.goto(url, { timeout, waitUntil });
|
|
41
47
|
const data = await page.content();
|
|
42
48
|
|
|
43
49
|
return {
|
|
@@ -3,9 +3,45 @@ const lowercaseHeaders = (headers) => Object.fromEntries(headers.entries());
|
|
|
3
3
|
const httpError = (message, status, headers = {}) =>
|
|
4
4
|
Object.assign(new Error(message), { response: { status, headers } });
|
|
5
5
|
|
|
6
|
+
/**
|
|
7
|
+
* Reads a response body as text while enforcing a byte cap (`maxBytes <= 0`
|
|
8
|
+
* disables it). Rejects early on a declared `Content-Length`, and otherwise
|
|
9
|
+
* streams the body so an oversized chunked response is aborted instead of being
|
|
10
|
+
* buffered whole.
|
|
11
|
+
*/
|
|
12
|
+
const readBodyWithLimit = async (response, maxBytes, headers) => {
|
|
13
|
+
if (maxBytes > 0) {
|
|
14
|
+
const declared = Number(response.headers.get('content-length'));
|
|
15
|
+
if (Number.isFinite(declared) && declared > maxBytes) {
|
|
16
|
+
throw httpError(`Response too large: ${declared} bytes (max ${maxBytes})`, 413, headers);
|
|
17
|
+
}
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
if (maxBytes <= 0 || !response.body) return response.text();
|
|
21
|
+
|
|
22
|
+
const reader = response.body.getReader();
|
|
23
|
+
const chunks = [];
|
|
24
|
+
let total = 0;
|
|
25
|
+
|
|
26
|
+
for (;;) {
|
|
27
|
+
const { done, value } = await reader.read();
|
|
28
|
+
if (done) break;
|
|
29
|
+
|
|
30
|
+
total += value.byteLength;
|
|
31
|
+
if (total > maxBytes) {
|
|
32
|
+
await reader.cancel();
|
|
33
|
+
throw httpError(`Response exceeded max size of ${maxBytes} bytes`, 413, headers);
|
|
34
|
+
}
|
|
35
|
+
chunks.push(Buffer.from(value));
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
return Buffer.concat(chunks).toString('utf8');
|
|
39
|
+
};
|
|
40
|
+
|
|
6
41
|
/**
|
|
7
42
|
* Native-fetch based backend. Follows redirects manually so the redirect budget
|
|
8
|
-
* is enforced,
|
|
43
|
+
* is enforced, times out via AbortController, and caps the body at
|
|
44
|
+
* `request.maxContentLength`.
|
|
9
45
|
*
|
|
10
46
|
* @param {import('./types.js').FetcherDeps} deps
|
|
11
47
|
* @returns {import('./types.js').Fetcher}
|
|
@@ -21,12 +57,13 @@ export const createHttpFetcher = ({ config }) => {
|
|
|
21
57
|
const response = await fetch(url, {
|
|
22
58
|
signal: controller.signal,
|
|
23
59
|
redirect: 'manual',
|
|
24
|
-
headers: { 'User-Agent': request.userAgent }
|
|
60
|
+
headers: { 'User-Agent': request.userAgent, ...request.headers }
|
|
25
61
|
});
|
|
26
62
|
|
|
27
63
|
const headers = lowercaseHeaders(response.headers);
|
|
28
64
|
|
|
29
65
|
if (response.status >= 300 && response.status < 400) {
|
|
66
|
+
await response.body?.cancel();
|
|
30
67
|
const location = response.headers.get('location');
|
|
31
68
|
if (!location) throw httpError('Redirect without location header', response.status, headers);
|
|
32
69
|
if (redirectsLeft <= 0) throw httpError('Max redirects reached', response.status, headers);
|
|
@@ -35,7 +72,7 @@ export const createHttpFetcher = ({ config }) => {
|
|
|
35
72
|
|
|
36
73
|
if (!response.ok) throw httpError(`Invalid status code: ${response.status}`, response.status, headers);
|
|
37
74
|
|
|
38
|
-
const data = await response.
|
|
75
|
+
const data = await readBodyWithLimit(response, request.maxContentLength, headers);
|
|
39
76
|
return { data, status: response.status, headers };
|
|
40
77
|
} catch (error) {
|
|
41
78
|
if (error.name === 'AbortError') {
|
package/src/index.js
CHANGED
|
@@ -4,8 +4,9 @@
|
|
|
4
4
|
* @typedef {Object} RequestConfig
|
|
5
5
|
* @property {number} timeout
|
|
6
6
|
* @property {number} maxRedirects
|
|
7
|
-
* @property {number} maxContentLength
|
|
7
|
+
* @property {number} maxContentLength - hard cap on the response body in bytes; 0 disables it
|
|
8
8
|
* @property {string} userAgent
|
|
9
|
+
* @property {Record<string, string>} headers - extra request headers sent by every fetcher
|
|
9
10
|
*
|
|
10
11
|
* @typedef {Object} RetryConfig
|
|
11
12
|
* @property {number} max
|
|
@@ -21,7 +22,13 @@
|
|
|
21
22
|
* @property {number} concurrency
|
|
22
23
|
* @property {number} delay - minimum spacing (ms) between requests to the same host
|
|
23
24
|
* @property {number} maxDepth
|
|
25
|
+
* @property {number} maxPages - hard cap on successfully crawled pages (counts across resumes)
|
|
24
26
|
* @property {boolean} resetOnComplete
|
|
27
|
+
* @property {boolean} retryErrors - re-queue previously errored URLs on resume
|
|
28
|
+
*
|
|
29
|
+
* @typedef {Object} BrowserConfig
|
|
30
|
+
* @property {'load'|'domcontentloaded'|'networkidle0'|'networkidle2'} waitUntil
|
|
31
|
+
* @property {Array<'image'|'stylesheet'|'font'|'media'>} blockResources
|
|
25
32
|
*
|
|
26
33
|
* @typedef {Object} OutputConfig
|
|
27
34
|
* @property {'json'|'jsonl'|'lines'} format
|
|
@@ -34,6 +41,7 @@
|
|
|
34
41
|
* @property {Array<string|RegExp>} [exclude]
|
|
35
42
|
* @property {string[]} [allowedContentTypes]
|
|
36
43
|
* @property {'http'|'browser'|import('./fetchers/types.js').Fetcher} [fetcher]
|
|
44
|
+
* @property {Partial<BrowserConfig>} [browser]
|
|
37
45
|
* @property {'silent'|'error'|'warn'|'info'|'debug'} [logLevel]
|
|
38
46
|
* @property {{ dir?: string }} [storage]
|
|
39
47
|
* @property {Partial<RequestConfig>} [request]
|
|
@@ -44,6 +52,7 @@
|
|
|
44
52
|
* @property {Partial<OutputConfig>} [output]
|
|
45
53
|
*
|
|
46
54
|
* @typedef {Required<ScraplyConfig> & {
|
|
55
|
+
* browser: BrowserConfig,
|
|
47
56
|
* storage: { dir: string, queuePath: string, crawledDir: string, formattedDir: string }
|
|
48
57
|
* }} ResolvedConfig
|
|
49
58
|
*/
|
|
@@ -54,6 +63,7 @@ export { createCrawler, scraply } from './crawler.js';
|
|
|
54
63
|
// Config
|
|
55
64
|
export { loadConfig } from './config/load.js';
|
|
56
65
|
export { DEFAULT_CONFIG } from './config/defaults.js';
|
|
66
|
+
export { assertBrowserConfig, BROWSER_WAIT_UNTIL, BROWSER_BLOCKABLE_RESOURCES } from './config/browser.js';
|
|
57
67
|
|
|
58
68
|
// Standalone building blocks (usable without a crawler instance)
|
|
59
69
|
export { normalizeUrl } from './url/normalize.js';
|