scraply 1.0.24 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +27 -9
- package/readme.md +152 -76
- package/src/config/defaults.js +97 -0
- package/src/config/load.js +39 -0
- package/src/core/pipeline.js +61 -0
- package/src/core/queue.js +131 -0
- package/src/core/retry.js +67 -0
- package/src/crawler.js +302 -0
- package/src/extract/extract.js +40 -0
- package/src/extract/links.js +29 -0
- package/src/fetchers/browserFetcher.js +77 -0
- package/src/fetchers/httpFetcher.js +54 -0
- package/src/fetchers/index.js +29 -0
- package/src/fetchers/types.js +31 -0
- package/src/index.js +67 -0
- package/src/output/router.js +39 -0
- package/src/output/writers.js +48 -0
- package/src/storage/files.js +48 -0
- package/src/url/normalize.js +21 -0
- package/src/url/patterns.js +57 -0
- package/src/util/delay.js +1 -0
- package/src/util/hooks.js +34 -0
- package/src/util/logger.js +20 -0
- package/.github/workflows/npm-publish.yml +0 -28
- package/src/defaultConfig.js +0 -67
- package/src/loadConfig.js +0 -29
- package/src/scraply.js +0 -125
- package/src/utils/crawl/browser/helper.js +0 -141
- package/src/utils/crawl/cleanHTML.js +0 -35
- package/src/utils/crawl/delay.js +0 -1
- package/src/utils/crawl/fileOperations.js +0 -51
- package/src/utils/crawl/url/fetch.js +0 -66
- package/src/utils/crawl/url/handlers.js +0 -75
- package/src/utils/crawl/url/normalize.js +0 -14
- package/src/utils/crawl/url/processor.js +0 -52
- package/src/utils/format/formatData.js +0 -54
package/package.json
CHANGED
|
@@ -1,26 +1,44 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "scraply",
|
|
3
3
|
"description": "A simple, configurable and functional content scraper",
|
|
4
|
-
"version": "
|
|
5
|
-
"main": "src/
|
|
4
|
+
"version": "2.0.0",
|
|
5
|
+
"main": "src/index.js",
|
|
6
6
|
"type": "module",
|
|
7
|
+
"exports": {
|
|
8
|
+
".": "./src/index.js"
|
|
9
|
+
},
|
|
10
|
+
"files": [
|
|
11
|
+
"src"
|
|
12
|
+
],
|
|
13
|
+
"engines": {
|
|
14
|
+
"node": ">=18"
|
|
15
|
+
},
|
|
7
16
|
"scripts": {
|
|
8
|
-
"start": "node ."
|
|
17
|
+
"start": "node .",
|
|
18
|
+
"dev": "node src/dev.js"
|
|
9
19
|
},
|
|
10
20
|
"keywords": [
|
|
11
21
|
"crawler",
|
|
12
|
-
"scraper"
|
|
22
|
+
"scraper",
|
|
23
|
+
"web-scraping",
|
|
24
|
+
"puppeteer",
|
|
25
|
+
"cheerio"
|
|
13
26
|
],
|
|
14
27
|
"author": "Pau Serrat Gutiérrez",
|
|
28
|
+
"repository": {
|
|
29
|
+
"type": "git",
|
|
30
|
+
"url": "git+https://github.com/pauserratgutierrez/scraply.git"
|
|
31
|
+
},
|
|
15
32
|
"dependencies": {
|
|
16
|
-
"
|
|
17
|
-
"
|
|
18
|
-
"
|
|
19
|
-
"puppeteer": "^24.2.0",
|
|
20
|
-
"puppeteer-cluster": "^0.24.0"
|
|
33
|
+
"cheerio": "1.2.0",
|
|
34
|
+
"puppeteer": "25.1.0",
|
|
35
|
+
"puppeteer-cluster": "0.25.0"
|
|
21
36
|
},
|
|
22
37
|
"publishConfig": {
|
|
23
38
|
"registry": "https://registry.npmjs.org/",
|
|
24
39
|
"access": "public"
|
|
40
|
+
},
|
|
41
|
+
"allowScripts": {
|
|
42
|
+
"puppeteer@25.1.0": true
|
|
25
43
|
}
|
|
26
44
|
}
|
package/readme.md
CHANGED
|
@@ -1,94 +1,170 @@
|
|
|
1
1
|
# Scraply
|
|
2
|
-
Scraply is a customizable and efficient web crawler and data scraper for Node.js, designed to handle various web crawling needs with ease. You can define the URLs to crawl, configure patterns to include/exclude, and format the output data in JSON. Scraply is built to be flexible, with user-configurable settings and dynamic paths.
|
|
3
2
|
|
|
4
|
-
|
|
5
|
-
NPM Package: [Scraply's NPM](https://www.npmjs.com/package/scraply)
|
|
3
|
+
Scraply is a customizable, modular web crawler and content scraper for Node.js. Define the URLs to crawl, control which links are followed, choose how pages are fetched (plain HTTP or a real browser), and route the extracted text into JSON files. Crawls are persistent and resumable, so they are well suited to long-running or scheduled jobs.
|
|
6
4
|
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
5
|
+
Bug reports and development: [Scraply on GitHub](https://github.com/pauserratgutierrez/scraply)
|
|
6
|
+
NPM package: [Scraply on NPM](https://www.npmjs.com/package/scraply)
|
|
7
|
+
|
|
8
|
+
> Scraply 2.0 is a ground-up rewrite with a new configuration shape and public API. See [Migrating from 1.x](#migrating-from-1x).
|
|
9
|
+
|
|
10
|
+
## Requirements
|
|
11
|
+
- Node.js >= 18 (uses the built-in `fetch`).
|
|
10
12
|
|
|
11
|
-
##
|
|
12
|
-
|
|
13
|
+
## Installation
|
|
14
|
+
```
|
|
15
|
+
npm install scraply
|
|
13
16
|
```
|
|
17
|
+
|
|
18
|
+
## Quick start
|
|
19
|
+
```js
|
|
14
20
|
import { scraply } from 'scraply';
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
21
|
+
|
|
22
|
+
await scraply({
|
|
23
|
+
startUrls: ['https://example.com'],
|
|
24
|
+
output: {
|
|
25
|
+
routes: {
|
|
26
|
+
'https://example.com': { '*': 'example.json' }
|
|
27
|
+
}
|
|
18
28
|
}
|
|
19
29
|
});
|
|
20
30
|
```
|
|
21
31
|
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
32
|
+
This crawls `example.com`, extracts the readable text of every allowed page, and writes the results to `dataset/formatted/example.json`.
|
|
33
|
+
|
|
34
|
+
## How Scraply works
|
|
35
|
+
1. The crawl is seeded from `startUrls`.
|
|
36
|
+
2. Each page is fetched, its links are discovered and filtered (`include` / `exclude`), and new links are queued.
|
|
37
|
+
3. The page text is extracted (configurable element removal) and saved under `dataset/crawled/`.
|
|
38
|
+
4. When the queue drains, all crawled pages are routed by URL into the files defined in `output.routes` and written to `dataset/formatted/`.
|
|
39
|
+
|
|
40
|
+
### Persistence and resuming
|
|
41
|
+
The queue and crawled pages are checkpointed to disk in `dataset/`. If a run is interrupted (or rate-limited), progress is saved and the next run resumes exactly where it left off without re-crawling finished URLs. When every URL has been processed, Scraply starts a fresh crawl (set `crawl.resetOnComplete: false` to keep the finished queue instead).
|
|
42
|
+
|
|
43
|
+
### Concurrency and politeness
|
|
44
|
+
Pages are crawled with a worker pool (`crawl.concurrency`). Requests to the same host are spaced by `crawl.delay` for politeness, while different hosts run in parallel.
|
|
25
45
|
|
|
26
|
-
###
|
|
27
|
-
Scraply
|
|
46
|
+
### Rate limiting
|
|
47
|
+
On HTTP `429`, Scraply either exits immediately with `rateLimit.exitCode` (default) so a scheduler can retry later, or waits (honoring `retry-after` / `x-ratelimit-reset`) and continues when `rateLimit.exitOnLimit` is `false`.
|
|
28
48
|
|
|
29
|
-
|
|
49
|
+
## Fetchers
|
|
50
|
+
`fetcher` selects the backend:
|
|
51
|
+
- `'http'` (default): fast static fetching with the native `fetch`.
|
|
52
|
+
- `'browser'`: full JavaScript rendering via Puppeteer (`puppeteer-cluster`).
|
|
53
|
+
- a custom object implementing the `Fetcher` interface (`{ name, fetch, init?, close? }`), so backends like Playwright or a remote CDP browser can be plugged in without changing the crawler.
|
|
30
54
|
|
|
31
|
-
|
|
32
|
-
|
|
55
|
+
## Programmatic API
|
|
56
|
+
`createCrawler(config)` returns an instance exposing each stage, plus lifecycle hooks:
|
|
33
57
|
|
|
34
|
-
|
|
35
|
-
|
|
58
|
+
```js
|
|
59
|
+
import { createCrawler } from 'scraply';
|
|
60
|
+
|
|
61
|
+
const crawler = createCrawler({ startUrls: ['https://example.com'] });
|
|
62
|
+
|
|
63
|
+
// React to every crawled page as it happens.
|
|
64
|
+
crawler.on('page', (record) => console.log('crawled', record.url));
|
|
65
|
+
|
|
66
|
+
// Veto links before they are queued.
|
|
67
|
+
crawler.on('shouldEnqueue', (url) => !url.includes('/admin'));
|
|
68
|
+
|
|
69
|
+
// Transform the stored record.
|
|
70
|
+
crawler.on('transform', (record) => ({ ...record, length: record.content.length }));
|
|
71
|
+
|
|
72
|
+
await crawler.run();
|
|
36
73
|
```
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
],
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
'
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
74
|
+
|
|
75
|
+
Instance methods: `run()`, `crawl()`, `fetch(url)`, `extract(html, url)`, `enqueue(urls, opts)`, `format(records?)`, `stop()`, `on(event, fn)`.
|
|
76
|
+
|
|
77
|
+
Hooks: `response`, `extract`, `shouldEnqueue`, `transform`, `page`, `error`.
|
|
78
|
+
|
|
79
|
+
Standalone exports for advanced use: `normalizeUrl`, `matchesPattern`, `matchesAnyPattern`, `extractText`, `discoverLinks`, `routeRecord`, `writeRecords`, `formatRecords`, `loadConfig`, `DEFAULT_CONFIG`, `resolveFetcher`, `createHttpFetcher`, `createBrowserFetcher`.
|
|
80
|
+
|
|
81
|
+
## Configuration
|
|
82
|
+
All options are optional except `startUrls`. Durations are milliseconds.
|
|
83
|
+
|
|
84
|
+
```js
|
|
85
|
+
{
|
|
86
|
+
startUrls: ['https://crawler-test.com/'],
|
|
87
|
+
include: [], // URL prefixes or RegExp; defaults to startUrls
|
|
88
|
+
exclude: [/\.(zip|png|js|css|...)$/i],
|
|
89
|
+
allowedContentTypes: ['text/html'],
|
|
90
|
+
fetcher: 'http', // 'http' | 'browser' | Fetcher instance
|
|
91
|
+
logLevel: 'info', // 'silent' | 'error' | 'warn' | 'info' | 'debug'
|
|
92
|
+
|
|
93
|
+
storage: { dir: 'dataset' },
|
|
94
|
+
|
|
95
|
+
request: {
|
|
96
|
+
timeout: 10000,
|
|
97
|
+
maxRedirects: 5,
|
|
98
|
+
maxContentLength: 20 * 1024 * 1024,
|
|
99
|
+
userAgent: 'Mozilla/5.0 (compatible; Scraply/2.0; +https://www.npmjs.com/package/scraply)'
|
|
100
|
+
},
|
|
101
|
+
|
|
102
|
+
retry: {
|
|
103
|
+
max: 1,
|
|
104
|
+
statusCodes: [408, 500, 502, 503, 504],
|
|
105
|
+
delay: 1000
|
|
106
|
+
},
|
|
107
|
+
|
|
108
|
+
rateLimit: {
|
|
109
|
+
fallbackDelay: 60000,
|
|
110
|
+
exitOnLimit: true,
|
|
111
|
+
exitCode: 10
|
|
112
|
+
},
|
|
113
|
+
|
|
114
|
+
crawl: {
|
|
115
|
+
concurrency: 5,
|
|
116
|
+
delay: 200, // per-host spacing
|
|
117
|
+
maxDepth: Infinity,
|
|
118
|
+
resetOnComplete: true
|
|
119
|
+
},
|
|
120
|
+
|
|
121
|
+
extract: {
|
|
122
|
+
removeSelectors: ['script', 'style', 'nav', 'header', 'footer', '...']
|
|
123
|
+
},
|
|
124
|
+
|
|
125
|
+
output: {
|
|
126
|
+
format: 'json', // 'json' | 'jsonl' | 'lines'
|
|
127
|
+
exclude: [],
|
|
128
|
+
routes: {
|
|
129
|
+
'https://crawler-test.com': { '*': 'general.json' }
|
|
130
|
+
}
|
|
131
|
+
}
|
|
132
|
+
}
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
### Output routing
|
|
136
|
+
`output.routes` maps a URL prefix to `{ pathKey: filename, '*': fallback }`. The most specific matching prefix wins, then the most specific path key, then `'*'`. For example:
|
|
137
|
+
|
|
138
|
+
```js
|
|
139
|
+
output: {
|
|
140
|
+
routes: {
|
|
141
|
+
'https://docs.example.com': {
|
|
142
|
+
'guide': 'guides.json',
|
|
143
|
+
'*': 'docs.json'
|
|
91
144
|
},
|
|
145
|
+
'https://example.com': { '*': 'main.json' }
|
|
92
146
|
}
|
|
93
147
|
}
|
|
94
|
-
```
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
## GitHub Actions
|
|
151
|
+
Because crawls are persistent and exit cleanly on rate limits, Scraply works well on a schedule. Commit the `dataset/` directory between runs, and each scheduled run continues the crawl.
|
|
152
|
+
|
|
153
|
+
## Migrating from 1.x
|
|
154
|
+
The configuration is now camelCase and grouped, and the entry point is `src/index.js`.
|
|
155
|
+
|
|
156
|
+
- `MAIN_DIR` -> `storage.dir`
|
|
157
|
+
- `CRAWLER.INITIAL_URLS` -> `startUrls`
|
|
158
|
+
- `CRAWLER.INCLUDE_URLS` -> `include`
|
|
159
|
+
- `CRAWLER.EXCLUDE_PATTERNS` -> `exclude`
|
|
160
|
+
- `CRAWLER.ALLOWED_CONTENT_TYPES` -> `allowedContentTypes`
|
|
161
|
+
- `CRAWLER.DOM_ELEMENTS_REMOVE` -> `extract.removeSelectors`
|
|
162
|
+
- `CRAWLER.DYNAMIC_CRAWLING: true` -> `fetcher: 'browser'`
|
|
163
|
+
- `REQUEST_TIMEOUT` / `MAX_REDIRECTS` / `MAX_CONTENT_LENGTH` -> `request.*`
|
|
164
|
+
- `MAX_RETRIES` / `RETRY_STATUS_CODES` / `CRAWL_ERROR_RETRY_DELAY_MS` -> `retry.{max,statusCodes,delay}`
|
|
165
|
+
- `CRAWL_RATE_LIMIT_FALLBACK_DELAY_MS` / `EXIT_ON_RATE_LIMIT` / `EXIT_CODE_RATE_LIMIT` -> `rateLimit.*`
|
|
166
|
+
- `CRAWL_DELAY_MS` -> `crawl.delay`
|
|
167
|
+
- `DATA_FORMATTER.CATEGORISED_PATHS` -> `output.routes`
|
|
168
|
+
- `DATA_FORMATTER.EXCLUDED_PATTERNS` -> `output.exclude`
|
|
169
|
+
|
|
170
|
+
New in 2.0: `crawl.concurrency`, `crawl.maxDepth`, `crawl.resetOnComplete`, `output.format`, pluggable `fetcher`, and lifecycle hooks. Formatted output is now real JSON by default (1.x wrote `url content` text lines).
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Default Scraply configuration. Every value here can be overridden by the
|
|
3
|
+
* object passed to `createCrawler()` / `scraply()`. Durations are in milliseconds.
|
|
4
|
+
*
|
|
5
|
+
* @type {import('../index.js').ScraplyConfig}
|
|
6
|
+
*/
|
|
7
|
+
export const DEFAULT_CONFIG = {
|
|
8
|
+
// URLs the crawl is seeded with.
|
|
9
|
+
startUrls: ['https://crawler-test.com/'],
|
|
10
|
+
|
|
11
|
+
// Which discovered links are allowed into the queue. Each entry is either an
|
|
12
|
+
// absolute URL prefix (e.g. 'https://site.com/blog') or a RegExp. Empty means
|
|
13
|
+
// "default to startUrls".
|
|
14
|
+
include: [],
|
|
15
|
+
|
|
16
|
+
// Links matching any of these (string prefix or RegExp) are never queued.
|
|
17
|
+
exclude: [
|
|
18
|
+
/\.(zip|rar|webp|png|jpg|jpeg|gif|mp3|mp4|pdf|css|js|svg|ico|eot|ttf|woff|woff2|otf|webm|ogg|wav|flac|m4a|mkv|mov|avi|wmv|flv|swf|exe|msi|dmg|iso|bin)$/i
|
|
19
|
+
],
|
|
20
|
+
|
|
21
|
+
// Only responses whose Content-Type includes one of these are parsed.
|
|
22
|
+
allowedContentTypes: ['text/html'],
|
|
23
|
+
|
|
24
|
+
// 'http' (native fetch), 'browser' (Puppeteer) or a custom Fetcher instance.
|
|
25
|
+
fetcher: 'http',
|
|
26
|
+
|
|
27
|
+
// 'silent' | 'error' | 'warn' | 'info' | 'debug'
|
|
28
|
+
logLevel: 'info',
|
|
29
|
+
|
|
30
|
+
storage: {
|
|
31
|
+
dir: 'dataset'
|
|
32
|
+
},
|
|
33
|
+
|
|
34
|
+
request: {
|
|
35
|
+
timeout: 10000,
|
|
36
|
+
maxRedirects: 5,
|
|
37
|
+
maxContentLength: 20 * 1024 * 1024,
|
|
38
|
+
userAgent: 'Mozilla/5.0 (compatible; Scraply/2.0; +https://www.npmjs.com/package/scraply)'
|
|
39
|
+
},
|
|
40
|
+
|
|
41
|
+
retry: {
|
|
42
|
+
max: 1,
|
|
43
|
+
statusCodes: [408, 500, 502, 503, 504],
|
|
44
|
+
delay: 1000
|
|
45
|
+
},
|
|
46
|
+
|
|
47
|
+
rateLimit: {
|
|
48
|
+
fallbackDelay: 60000,
|
|
49
|
+
exitOnLimit: true,
|
|
50
|
+
exitCode: 10
|
|
51
|
+
},
|
|
52
|
+
|
|
53
|
+
crawl: {
|
|
54
|
+
concurrency: 5,
|
|
55
|
+
delay: 200, // minimum spacing between requests to the same host
|
|
56
|
+
maxDepth: Infinity,
|
|
57
|
+
resetOnComplete: true
|
|
58
|
+
},
|
|
59
|
+
|
|
60
|
+
extract: {
|
|
61
|
+
removeSelectors: [
|
|
62
|
+
'script',
|
|
63
|
+
'noscript',
|
|
64
|
+
'style',
|
|
65
|
+
'meta',
|
|
66
|
+
'link',
|
|
67
|
+
'svg',
|
|
68
|
+
'path',
|
|
69
|
+
'img',
|
|
70
|
+
'input',
|
|
71
|
+
'textarea',
|
|
72
|
+
'embed',
|
|
73
|
+
'object',
|
|
74
|
+
'iframe',
|
|
75
|
+
'nav',
|
|
76
|
+
'header',
|
|
77
|
+
'footer',
|
|
78
|
+
'aside',
|
|
79
|
+
'button',
|
|
80
|
+
'[aria-modal]',
|
|
81
|
+
'[role="dialog"]',
|
|
82
|
+
'[role="alert"]',
|
|
83
|
+
'[role="banner"]',
|
|
84
|
+
'[role="form"]',
|
|
85
|
+
'[role="navigation"]',
|
|
86
|
+
'[role="search"]'
|
|
87
|
+
]
|
|
88
|
+
},
|
|
89
|
+
|
|
90
|
+
output: {
|
|
91
|
+
format: 'json', // 'json' | 'jsonl' | 'lines'
|
|
92
|
+
exclude: [],
|
|
93
|
+
routes: {
|
|
94
|
+
'https://crawler-test.com': { '*': 'general.json' }
|
|
95
|
+
}
|
|
96
|
+
}
|
|
97
|
+
};
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
import path from 'node:path';
|
|
2
|
+
import { DEFAULT_CONFIG } from './defaults.js';
|
|
3
|
+
|
|
4
|
+
/**
 * Returns true for any non-null object that is neither an array nor a RegExp.
 * `deepMerge` uses it to decide whether two values are merged key by key or
 * whether the source value simply replaces the target value.
 */
const isPlainObject = (value) =>
  value !== null && typeof value === 'object' && !Array.isArray(value) && !(value instanceof RegExp);

/**
 * Returns a copy of `value` that shares no mutable container with it. Arrays
 * and literal-style objects (prototype `Object.prototype` or `null`) are copied
 * recursively; primitives, RegExps, functions and class instances are returned
 * unchanged because copying them key by key would strip their prototype.
 */
const detach = (value) => {
  if (Array.isArray(value)) return value.map(detach);
  if (value === null || typeof value !== 'object') return value;

  const proto = Object.getPrototypeOf(value);
  if (proto !== Object.prototype && proto !== null) return value;

  return Object.fromEntries(Object.entries(value).map(([key, inner]) => [key, detach(inner)]));
};

/**
 * Recursively merges `source` over `target` and returns a new object.
 *
 * - When both sides hold a plain object for a key, the two are merged recursively.
 * - Otherwise a defined source value replaces the target value (arrays and
 *   RegExps are replaced wholesale, not merged).
 * - `undefined` source values are ignored, so the target value is kept.
 *
 * The result never aliases `target`: nested objects and arrays taken from
 * `target` are copied, so mutating the merged result cannot leak into the
 * object that served as defaults. Values taken from `source` are stored by
 * reference.
 *
 * @param {Object} target - base values (not modified)
 * @param {Object} source - overrides (not modified)
 * @returns {Object} the merged object
 */
const deepMerge = (target, source) => {
  const merged = { ...target };

  // Detach every inherited value first; overridden keys are replaced below.
  for (const key of Object.keys(merged)) {
    merged[key] = detach(merged[key]);
  }

  for (const [key, value] of Object.entries(source)) {
    if (isPlainObject(value) && isPlainObject(target[key])) {
      merged[key] = deepMerge(target[key], value);
    } else if (value !== undefined) {
      merged[key] = value;
    }
  }

  return merged;
};
|
|
20
|
+
|
|
21
|
+
/**
 * Merges a user config over the defaults and derives the storage paths.
 *
 * The derived `queuePath`, `crawledDir` and `formattedDir` are joined onto
 * `storage.dir` with POSIX separators. When `include` is empty or missing it
 * falls back to a copy of `startUrls`.
 *
 * @param {import('../index.js').ScraplyConfig} [userConfig] - overrides; `undefined` and `null` both mean "use the defaults"
 * @returns {import('../index.js').ResolvedConfig}
 */
export const loadConfig = (userConfig = {}) => {
  // A default parameter only covers `undefined`; guard `null` explicitly so
  // the merge does not throw on it.
  const config = deepMerge(DEFAULT_CONFIG, userConfig ?? {});

  const { dir } = config.storage;

  // Build a fresh storage object instead of assigning onto `config.storage`:
  // when the caller passes no `storage` override, the merged value can be the
  // very object held by DEFAULT_CONFIG, and writing to it would mutate the
  // exported defaults.
  config.storage = {
    ...config.storage,
    queuePath: path.posix.join(dir, 'queue.json'),
    crawledDir: path.posix.join(dir, 'crawled'),
    formattedDir: path.posix.join(dir, 'formatted')
  };

  if (!config.include?.length) {
    config.include = [...config.startUrls];
  }

  return config;
};
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
import { URL } from 'node:url';
|
|
2
|
+
import { delay } from '../util/delay.js';
|
|
3
|
+
|
|
4
|
+
/**
 * Drains the queue with a fixed-size worker pool. Requests to the same host are
 * spaced by `perHostDelay` for politeness, while different hosts run in parallel.
 * Workers stop when the queue is drained and nothing is in flight, or when
 * `isStopped()` becomes true.
 *
 * If `processOne` rejects, the returned promise rejects with that error; the
 * remaining workers are not cancelled by this function.
 *
 * @param {Object} deps
 * @param {import('./queue.js').QueueManager} deps.queue
 * @param {number} deps.concurrency - number of workers; fractional values are
 *   floored, and anything missing, non-finite or below 1 falls back to one worker
 * @param {number} deps.perHostDelay - minimum spacing in milliseconds between
 *   requests to the same host; values <= 0 disable the spacing
 * @param {(entry: import('./queue.js').QueueEntry) => Promise<void>} deps.processOne
 * @param {() => boolean} deps.isStopped
 * @returns {Promise<void>}
 */
export const runPipeline = async ({ queue, concurrency, perHostDelay, processOne, isStopped }) => {
  // host -> timestamp (ms) at which the most recently scheduled request for that host starts
  const lastHostAt = new Map();
  let active = 0;

  // `Array.from({ length })` yields zero workers for NaN/undefined (the crawl
  // would silently do nothing) and throws for Infinity, so normalise first.
  const requested = Number(concurrency);
  const workerCount = Number.isFinite(requested) ? Math.max(Math.floor(requested), 1) : 1;

  const respectHostDelay = async (url) => {
    if (perHostDelay <= 0) return;

    let host;
    try {
      host = new URL(url).host;
    } catch {
      // Unparseable URL: no host to throttle on, let processOne deal with it.
      return;
    }

    const now = Date.now();
    // Reserve the next free slot for this host before waiting, so workers
    // that arrive concurrently queue up behind each other.
    const scheduled = Math.max(now, (lastHostAt.get(host) ?? 0) + perHostDelay);
    lastHostAt.set(host, scheduled);

    const wait = scheduled - now;
    if (wait > 0) await delay(wait);
  };

  const worker = async () => {
    while (!isStopped()) {
      const entry = queue.claimNext();

      if (!entry) {
        if (active === 0) return; // queue drained and nothing can enqueue more
        // Another worker is still in flight and may enqueue links; poll again shortly.
        await delay(25);
        continue;
      }

      active++;
      try {
        await respectHostDelay(entry.url);
        await processOne(entry);
      } finally {
        active--;
      }
    }
  };

  const workers = Array.from({ length: workerCount }, () => worker());
  await Promise.all(workers);
};
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
import { loadJSON, saveJSON, deletePath } from '../storage/files.js';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* @typedef {Object} QueueEntry
|
|
5
|
+
* @property {string} url
|
|
6
|
+
* @property {string|null} file - path to the saved crawled file, or null
|
|
7
|
+
* @property {number|null} status - last HTTP status
|
|
8
|
+
* @property {string|null} error - error message, or null
|
|
9
|
+
* @property {string|null} referrer - URL this entry was discovered on
|
|
10
|
+
* @property {number} depth
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
/** An entry counts as processed once it has either a saved file or an error recorded. */
const isProcessed = (entry) => entry.file !== null || entry.error !== null;

/** A persisted record is usable only if it is an object carrying a string `url`. */
const isUsableEntry = (entry) =>
  entry !== null && typeof entry === 'object' && typeof entry.url === 'string';

/**
 * Owns the crawl queue: dedup, depth limiting, status tracking and durable
 * checkpointing. Persistence is debounced so a high-concurrency crawl does not
 * rewrite the queue file on every single URL.
 */
export class QueueManager {
  /** @param {{ config: import('../index.js').ResolvedConfig, logger: any }} deps */
  constructor({ config, logger }) {
    this.config = config;
    this.logger = logger;
    this.path = config.storage.queuePath;
    this.maxDepth = config.crawl.maxDepth;

    /** @type {QueueEntry[]} every known entry, processed or not */
    this.entries = [];
    /** @type {Set<string>} URLs of all known entries, used for dedup */
    this.index = new Set();
    /** @type {QueueEntry[]} entries that were unprocessed when loaded, plus everything added since */
    this._pending = [];
    // Position of the next entry in `_pending` that `claimNext` hands out.
    this._cursor = 0;
    this._dirty = false;
    this._timer = null;
    // Debounce window in milliseconds between a change and the write to disk.
    this._persistInterval = 1000;
  }

  /**
   * Loads any previously persisted queue and rebuilds the in-memory indexes.
   * @returns {QueueEntry[]} the loaded entries
   */
  load() {
    return this._hydrate(loadJSON(this.path, []));
  }

  /**
   * Installs `raw` as the queue contents and rebuilds the dedup index, the
   * pending list and the cursor. Anything that is not an array is treated as
   * an empty queue, and records without a string `url` are dropped, so a
   * damaged queue file cannot crash the crawl on startup.
   *
   * @param {unknown} raw - parsed content of the queue file
   * @returns {QueueEntry[]} the installed entries
   */
  _hydrate(raw) {
    const list = Array.isArray(raw) ? raw : [];
    const usable = list.filter(isUsableEntry);

    if ((raw != null && !Array.isArray(raw)) || usable.length !== list.length) {
      // NOTE(review): assumes the logger exposes `warn`; optional calls keep this safe if it does not.
      this.logger?.warn?.(`Ignoring malformed queue data in ${this.path}`);
    }

    this.entries = usable;
    this.index = new Set(usable.map((entry) => entry.url));
    this._pending = usable.filter((entry) => !isProcessed(entry));
    this._cursor = 0;
    return this.entries;
  }

  /**
   * Adds each start URL at depth 0 and writes the queue to disk immediately.
   * URLs that are already known are skipped by `add`; existing entries are
   * kept, so call `reset()` first when a completely fresh queue is wanted.
   *
   * @param {Iterable<string>} urls
   */
  seed(urls) {
    for (const url of urls) this.add(url, { depth: 0, referrer: null });
    this.flush();
  }

  /**
   * Adds a URL if it is new and within the depth limit.
   * @param {string} url
   * @param {{ depth?: number, referrer?: string|null }} [meta]
   * @returns {boolean} whether the URL was added
   */
  add(url, { depth = 0, referrer = null } = {}) {
    if (this.index.has(url) || depth > this.maxDepth) return false;

    const entry = { url, file: null, status: null, error: null, referrer, depth };
    this.index.add(url);
    this.entries.push(entry);
    this._pending.push(entry);
    this._markDirty();
    return true;
  }

  /** Returns the next unprocessed entry, or null when the queue is drained. */
  claimNext() {
    return this._cursor < this._pending.length ? this._pending[this._cursor++] : null;
  }

  /** Records a successful crawl on `entry` and clears any previous error. */
  markDone(entry, { file, status }) {
    entry.file = file;
    entry.status = status;
    entry.error = null;
    this._markDirty();
  }

  /** Records a failure on `entry`; a missing status is stored as null. */
  markError(entry, { error, status }) {
    entry.error = error;
    entry.status = status ?? null;
    this._markDirty();
  }

  /** True when the queue is non-empty and every entry has a file or an error. */
  isAllProcessed() {
    return this.entries.length > 0 && this.entries.every(isProcessed);
  }

  /** Number of entries with neither a file nor an error (includes claimed, in-flight ones). */
  pendingCount() {
    return this.entries.filter((entry) => !isProcessed(entry)).length;
  }

  /** Number of entries that have a saved file. */
  crawledCount() {
    return this.entries.filter((entry) => entry.file !== null).length;
  }

  /** Number of entries that have an error recorded. */
  errorCount() {
    return this.entries.filter((entry) => entry.error !== null).length;
  }

  /** Clears in-memory state and removes the persisted queue file. */
  reset() {
    // Nothing is left to persist, so a pending debounced write is cancelled too.
    if (this._timer) {
      clearTimeout(this._timer);
      this._timer = null;
    }
    this.entries = [];
    this.index = new Set();
    this._pending = [];
    this._cursor = 0;
    this._dirty = false;
    deletePath(this.path);
  }

  /** Flags unsaved changes and schedules one debounced flush if none is pending. */
  _markDirty() {
    this._dirty = true;
    if (this._timer) return;
    this._timer = setTimeout(() => this.flush(), this._persistInterval);
    // Do not let the checkpoint timer alone keep the process alive.
    if (typeof this._timer.unref === 'function') this._timer.unref();
  }

  /** Writes the queue to disk if it has unsaved changes. */
  flush() {
    if (this._timer) {
      clearTimeout(this._timer);
      this._timer = null;
    }
    if (!this._dirty) return;
    saveJSON(this.path, this.entries);
    this._dirty = false;
  }
}
|