scraply 1.0.25 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +26 -9
- package/readme.md +144 -76
- package/src/config/browser.js +37 -0
- package/src/config/defaults.js +108 -0
- package/src/config/load.js +46 -0
- package/src/core/pipeline.js +61 -0
- package/src/core/queue.js +185 -0
- package/src/core/retry.js +71 -0
- package/src/crawler.js +332 -0
- package/src/extract/extract.js +40 -0
- package/src/extract/links.js +29 -0
- package/src/fetchers/browserFetcher.js +83 -0
- package/src/fetchers/httpFetcher.js +91 -0
- package/src/fetchers/index.js +29 -0
- package/src/fetchers/types.js +31 -0
- package/src/index.js +77 -0
- package/src/output/router.js +39 -0
- package/src/output/writers.js +48 -0
- package/src/storage/files.js +48 -0
- package/src/url/normalize.js +21 -0
- package/src/url/patterns.js +57 -0
- package/src/util/delay.js +1 -0
- package/src/util/hooks.js +34 -0
- package/src/util/logger.js +20 -0
- package/.github/workflows/npm-publish.yml +0 -28
- package/src/defaultConfig.js +0 -67
- package/src/loadConfig.js +0 -29
- package/src/scraply.js +0 -125
- package/src/utils/crawl/browser/helper.js +0 -143
- package/src/utils/crawl/cleanHTML.js +0 -35
- package/src/utils/crawl/delay.js +0 -1
- package/src/utils/crawl/fileOperations.js +0 -51
- package/src/utils/crawl/url/fetch.js +0 -66
- package/src/utils/crawl/url/handlers.js +0 -75
- package/src/utils/crawl/url/normalize.js +0 -14
- package/src/utils/crawl/url/processor.js +0 -52
- package/src/utils/format/formatData.js +0 -54
package/package.json
CHANGED
|
@@ -1,26 +1,43 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "scraply",
|
|
3
3
|
"description": "A simple, configurable and functional content scraper",
|
|
4
|
-
"version": "
|
|
5
|
-
"main": "src/
|
|
4
|
+
"version": "2.0.1",
|
|
5
|
+
"main": "src/index.js",
|
|
6
6
|
"type": "module",
|
|
7
|
+
"exports": {
|
|
8
|
+
".": "./src/index.js"
|
|
9
|
+
},
|
|
10
|
+
"files": [
|
|
11
|
+
"src"
|
|
12
|
+
],
|
|
13
|
+
"engines": {
|
|
14
|
+
"node": ">=18"
|
|
15
|
+
},
|
|
7
16
|
"scripts": {
|
|
8
|
-
"
|
|
17
|
+
"dev": "node src/dev.js"
|
|
9
18
|
},
|
|
10
19
|
"keywords": [
|
|
11
20
|
"crawler",
|
|
12
|
-
"scraper"
|
|
21
|
+
"scraper",
|
|
22
|
+
"web-scraping",
|
|
23
|
+
"puppeteer",
|
|
24
|
+
"cheerio"
|
|
13
25
|
],
|
|
14
26
|
"author": "Pau Serrat Gutiérrez",
|
|
27
|
+
"repository": {
|
|
28
|
+
"type": "git",
|
|
29
|
+
"url": "git+https://github.com/pauserratgutierrez/scraply.git"
|
|
30
|
+
},
|
|
15
31
|
"dependencies": {
|
|
16
|
-
"
|
|
17
|
-
"
|
|
18
|
-
"
|
|
19
|
-
"puppeteer": "^24.2.0",
|
|
20
|
-
"puppeteer-cluster": "^0.24.0"
|
|
32
|
+
"cheerio": "1.2.0",
|
|
33
|
+
"puppeteer": "25.1.0",
|
|
34
|
+
"puppeteer-cluster": "0.25.0"
|
|
21
35
|
},
|
|
22
36
|
"publishConfig": {
|
|
23
37
|
"registry": "https://registry.npmjs.org/",
|
|
24
38
|
"access": "public"
|
|
39
|
+
},
|
|
40
|
+
"allowScripts": {
|
|
41
|
+
"puppeteer@25.1.0": true
|
|
25
42
|
}
|
|
26
43
|
}
|
package/readme.md
CHANGED
|
@@ -1,94 +1,162 @@
|
|
|
1
1
|
# Scraply
|
|
2
|
-
Scraply is a customizable and efficient web crawler and data scraper for Node.js, designed to handle various web crawling needs with ease. You can define the URLs to crawl, configure patterns to include/exclude, and format the output data in JSON. Scraply is built to be flexible, with user-configurable settings and dynamic paths.
|
|
3
2
|
|
|
4
|
-
|
|
5
|
-
NPM Package: [Scraply's NPM](https://www.npmjs.com/package/scraply)
|
|
3
|
+
Scraply is a customizable, modular web crawler and content scraper for Node.js. Define the URLs to crawl, control which links are followed, choose how pages are fetched (plain HTTP or a real browser), and route the extracted text into JSON files. Crawls are persistent and resumable, so they are well suited to long-running or scheduled jobs.
|
|
6
4
|
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
5
|
+
Bug reports and development: [Scraply on GitHub](https://github.com/pauserratgutierrez/scraply)
|
|
6
|
+
NPM package: [Scraply on NPM](https://www.npmjs.com/package/scraply)
|
|
7
|
+
|
|
8
|
+
> Scraply 2.0 is a ground-up rewrite with a new configuration shape and public API. See [Migrating from 1.x](#migrating-from-1x).
|
|
9
|
+
|
|
10
|
+
## Requirements
|
|
11
|
+
- Node.js >= 18 (uses the built-in `fetch`).
|
|
10
12
|
|
|
11
|
-
##
|
|
12
|
-
|
|
13
|
+
## Installation
|
|
14
|
+
```
|
|
15
|
+
npm install scraply
|
|
13
16
|
```
|
|
17
|
+
|
|
18
|
+
## Quick start
|
|
19
|
+
```js
|
|
14
20
|
import { scraply } from 'scraply';
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
21
|
+
|
|
22
|
+
await scraply({
|
|
23
|
+
startUrls: ['https://example.com'],
|
|
24
|
+
output: {
|
|
25
|
+
routes: {
|
|
26
|
+
'https://example.com': { '*': 'example.json' }
|
|
27
|
+
}
|
|
18
28
|
}
|
|
19
29
|
});
|
|
20
30
|
```
|
|
21
31
|
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
32
|
+
This crawls `example.com`, extracts the readable text of every allowed page, and writes the results to `dataset/formatted/example.json`.
|
|
33
|
+
|
|
34
|
+
## How Scraply works
|
|
35
|
+
1. The crawl is seeded from `startUrls`.
|
|
36
|
+
2. Each page is fetched, its links are discovered and filtered (`include` / `exclude`), and new links are queued.
|
|
37
|
+
3. The page text is extracted (configurable element removal) and saved under `dataset/crawled/` as `{ url, content, crawledAt, hash }` (`crawledAt` is an ISO timestamp; `hash` is the SHA-256 of `content`, handy for change detection).
|
|
38
|
+
4. When the queue drains, all crawled pages are routed by URL into the files defined in `output.routes` and written to `dataset/formatted/`.
|
|
39
|
+
|
|
40
|
+
Each queue entry ends in one of three terminal states: **crawled** (saved), **skipped** (disallowed `Content-Type`), or **error** (fetch failed). The three are tracked separately so stats stay meaningful.
|
|
41
|
+
|
|
42
|
+
### Persistence and resuming
|
|
43
|
+
The queue and crawled pages are checkpointed to disk in `dataset/`. If a run is interrupted (or rate-limited), progress is saved and the next run resumes exactly where it left off without re-crawling finished URLs. When every URL has been processed, Scraply starts a fresh crawl (set `crawl.resetOnComplete: false` to keep the finished queue instead). To re-attempt failed URLs on the next run, set `crawl.retryErrors: true` (or call `requeueErrors()` and crawl again).
|
|
44
|
+
|
|
45
|
+
### Concurrency and limits
|
|
46
|
+
Pages are crawled with a worker pool (`crawl.concurrency`). Requests to the same host are spaced by `crawl.delay` for politeness, while different hosts run in parallel. `crawl.maxDepth` bounds link depth and `crawl.maxPages` caps the total number of successfully crawled pages (counted across resumes).
|
|
47
|
+
|
|
48
|
+
### Rate limiting
|
|
49
|
+
On HTTP `429`, Scraply either exits immediately with `rateLimit.exitCode` (default) so a scheduler can retry later, or waits (honoring `retry-after` / `x-ratelimit-reset`) and retries — independently of the normal `retry` budget — when `rateLimit.exitOnLimit` is `false`.
|
|
50
|
+
|
|
51
|
+
## Fetchers
|
|
52
|
+
`fetcher` selects the backend:
|
|
53
|
+
- `'http'` (default): fast static fetching with the native `fetch`. Redirects are followed up to `request.maxRedirects`, and response bodies larger than `request.maxContentLength` (default 20 MB, `0` disables) are rejected before they are buffered.
|
|
54
|
+
- `'browser'`: full JavaScript rendering via Puppeteer (`puppeteer-cluster`).
|
|
55
|
+
- a custom object implementing the `Fetcher` interface (`{ name, fetch, init?, close? }`), so backends like Playwright or a remote CDP browser can be plugged in without changing the crawler.
|
|
56
|
+
|
|
57
|
+
Both built-in fetchers send `request.userAgent` and any extra `request.headers` (e.g. `Authorization`, `Accept-Language`, `Cookie`) with every request.
|
|
58
|
+
|
|
59
|
+
### Browser fetcher options
|
|
60
|
+
The `browser` block applies only when `fetcher: 'browser'`. Both options are validated at config load time. See [`src/config/defaults.js`](src/config/defaults.js) for defaults.
|
|
61
|
+
|
|
62
|
+
- **`browser.waitUntil`** — passed to Puppeteer `page.goto`. Default `'load'`. Use `'networkidle2'` for SPAs that inject links or content after the initial load (Vue/React sites). Increase `request.timeout` when using slower modes.
|
|
63
|
+
- **`browser.blockResources`** — Puppeteer resource types to abort during fetch (`'image'`, `'stylesheet'`, `'font'`, `'media'`). Default `['image', 'font', 'media']`. Stylesheets are excluded by default because many SPAs need CSS before content renders. Pass `[]` to disable resource blocking entirely.
|
|
64
|
+
|
|
65
|
+
```js
|
|
66
|
+
await scraply({
|
|
67
|
+
startUrls: ['https://spa.example.com/products'],
|
|
68
|
+
fetcher: 'browser',
|
|
69
|
+
browser: {
|
|
70
|
+
waitUntil: 'networkidle2',
|
|
71
|
+
blockResources: ['image', 'font', 'media']
|
|
72
|
+
},
|
|
73
|
+
request: { timeout: 60000 }
|
|
74
|
+
});
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## Programmatic API
|
|
78
|
+
`createCrawler(config)` returns an instance exposing each stage, plus lifecycle hooks:
|
|
79
|
+
|
|
80
|
+
```js
|
|
81
|
+
import { createCrawler } from 'scraply';
|
|
82
|
+
|
|
83
|
+
const crawler = createCrawler({ startUrls: ['https://example.com'] });
|
|
84
|
+
|
|
85
|
+
// React to every crawled page as it happens.
|
|
86
|
+
crawler.on('page', (record) => console.log('crawled', record.url));
|
|
87
|
+
|
|
88
|
+
// Veto links before they are queued.
|
|
89
|
+
crawler.on('shouldEnqueue', (url) => !url.includes('/admin'));
|
|
90
|
+
|
|
91
|
+
// Transform the stored record.
|
|
92
|
+
crawler.on('transform', (record) => ({ ...record, length: record.content.length }));
|
|
93
|
+
|
|
94
|
+
await crawler.run();
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
Instance methods: `run()`, `crawl()`, `fetch(url)`, `extract(html, url)`, `enqueue(urls, opts)`, `format(records?)`, `requeueErrors()`, `stop()`, `on(event, fn)`.
|
|
98
|
+
|
|
99
|
+
`format()` reads crawled pages from `dataset/crawled/` via the persisted queue. You can call it alone to re-route output after a crawl — no need to fetch pages again.
|
|
100
|
+
|
|
101
|
+
Hooks: `response`, `extract`, `shouldEnqueue`, `transform`, `page`, `error`.
|
|
25
102
|
|
|
26
|
-
|
|
27
|
-
Scraply is designed to handle rate-limiting gracefully. If the crawler encounters rate-limited responses (e.g., status code `429`), it stops processing further requests and saves everything in the queue. Once restarted, it resumes the crawling process from where it stopped.
|
|
103
|
+
Standalone exports for advanced use: `normalizeUrl`, `matchesPattern`, `matchesAnyPattern`, `extractText`, `discoverLinks`, `routeRecord`, `writeRecords`, `formatRecords`, `loadConfig`, `DEFAULT_CONFIG`, `resolveFetcher`, `createHttpFetcher`, `createBrowserFetcher`, `assertBrowserConfig`, `BROWSER_WAIT_UNTIL`, `BROWSER_BLOCKABLE_RESOURCES`.
|
|
28
104
|
|
|
29
|
-
|
|
105
|
+
## Configuration
|
|
106
|
+
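All options have defaults, so every key is technically optional; in practice you will always set `startUrls`, because the built-in default points at the `https://crawler-test.com/` test site. Pass a partial object to `scraply()` or `createCrawler()` — it is [deep-merged](src/config/load.js) over the defaults. Durations are in milliseconds.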
|
|
30
107
|
|
|
31
|
-
|
|
32
|
-
Scraply can be easily integrated into a GitHub Action workflow for continuous, long-running crawling tasks. You can set it up to crawl for a set duration or number of URLs, persistently saving the progress, and then resuming where it left off on the next run.
|
|
108
|
+
**Full default values and inline comments:** [`src/config/defaults.js`](src/config/defaults.js)
|
|
33
109
|
|
|
34
|
-
|
|
35
|
-
|
|
110
|
+
```js
|
|
111
|
+
import { DEFAULT_CONFIG, loadConfig } from 'scraply';
|
|
112
|
+
|
|
113
|
+
// Inspect or extend the defaults programmatically.
|
|
114
|
+
const config = loadConfig({
|
|
115
|
+
...DEFAULT_CONFIG,
|
|
116
|
+
startUrls: ['https://example.com']
|
|
117
|
+
});
|
|
36
118
|
```
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
'script',
|
|
55
|
-
'noscript',
|
|
56
|
-
'style',
|
|
57
|
-
'meta',
|
|
58
|
-
'link',
|
|
59
|
-
'svg',
|
|
60
|
-
'path',
|
|
61
|
-
'img',
|
|
62
|
-
'input',
|
|
63
|
-
'textarea',
|
|
64
|
-
'embed',
|
|
65
|
-
'object',
|
|
66
|
-
'iframe',
|
|
67
|
-
'nav',
|
|
68
|
-
'header',
|
|
69
|
-
'footer',
|
|
70
|
-
'aside',
|
|
71
|
-
'button'
|
|
72
|
-
],
|
|
73
|
-
RETRY_STATUS_CODES: [408, 500, 502, 503, 504],
|
|
74
|
-
REQUEST_TIMEOUT: 3000,
|
|
75
|
-
MAX_REDIRECTS: 2,
|
|
76
|
-
MAX_CONTENT_LENGTH: 20 * 1024 * 1024, // 20MB
|
|
77
|
-
MAX_RETRIES: 1,
|
|
78
|
-
CRAWL_DELAY_MS: 200,
|
|
79
|
-
CRAWL_ERROR_RETRY_DELAY_MS: 1000,
|
|
80
|
-
CRAWL_RATE_LIMIT_FALLBACK_DELAY_MS: 60000,
|
|
81
|
-
EXIT_ON_RATE_LIMIT: true, // If true, forces exit instantly. If false, only exits after retries (if still 429)
|
|
82
|
-
EXIT_CODE_RATE_LIMIT: 10
|
|
83
|
-
},
|
|
84
|
-
|
|
85
|
-
DATA_FORMATTER: {
|
|
86
|
-
EXCLUDED_PATTERNS: [],
|
|
87
|
-
CATEGORISED_PATHS: {
|
|
88
|
-
'https://crawler-test.com': {
|
|
89
|
-
'mobile': 'mobile.json',
|
|
90
|
-
'*': 'general.json'
|
|
119
|
+
|
|
120
|
+
Top-level keys: `startUrls`, `include`, `exclude`, `allowedContentTypes`, `fetcher`, `browser`, `logLevel`, `storage`, `request`, `retry`, `rateLimit`, `crawl`, `extract`, `output`.
|
|
121
|
+
|
|
122
|
+
### Output routing
|
|
123
|
+
`output.routes` is a two-level map:
|
|
124
|
+
|
|
125
|
+
1. **Outer keys** — URL prefix (usually `https://origin`, or `https://origin/path`) matched against the full crawled URL.
|
|
126
|
+
2. **Inner keys** — pathname segments joined with `/`, **without a leading slash**, matched from the longest suffix upward. Use `'*'` as fallback within that prefix.
|
|
127
|
+
|
|
128
|
+
Inner keys are case-sensitive and must match the URL pathname exactly (e.g. `Products/sports-watches`, not `/products/sports-watches`).
|
|
129
|
+
|
|
130
|
+
```js
|
|
131
|
+
output: {
|
|
132
|
+
routes: {
|
|
133
|
+
'https://docs.example.com': {
|
|
134
|
+
'guide': 'guides.json',
|
|
135
|
+
'*': 'docs.json'
|
|
91
136
|
},
|
|
137
|
+
'https://example.com/products/sports-watches': { '*': 'watches.json' }
|
|
92
138
|
}
|
|
93
139
|
}
|
|
94
|
-
```
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
## GitHub Actions
|
|
143
|
+
Because crawls are persistent and exit cleanly on rate limits, Scraply works well on a schedule. Commit the `dataset/` directory between runs, and each scheduled run continues the crawl.
|
|
144
|
+
|
|
145
|
+
## Migrating from 1.x
|
|
146
|
+
The configuration is now camelCase and grouped, and the entry point is `src/index.js`.
|
|
147
|
+
|
|
148
|
+
- `MAIN_DIR` -> `storage.dir`
|
|
149
|
+
- `CRAWLER.INITIAL_URLS` -> `startUrls`
|
|
150
|
+
- `CRAWLER.INCLUDE_URLS` -> `include`
|
|
151
|
+
- `CRAWLER.EXCLUDE_PATTERNS` -> `exclude`
|
|
152
|
+
- `CRAWLER.ALLOWED_CONTENT_TYPES` -> `allowedContentTypes`
|
|
153
|
+
- `CRAWLER.DOM_ELEMENTS_REMOVE` -> `extract.removeSelectors`
|
|
154
|
+
- `CRAWLER.DYNAMIC_CRAWLING: true` -> `fetcher: 'browser'`
|
|
155
|
+
- `REQUEST_TIMEOUT` / `MAX_REDIRECTS` / `MAX_CONTENT_LENGTH` -> `request.*`
|
|
156
|
+
- `MAX_RETRIES` / `RETRY_STATUS_CODES` / `CRAWL_ERROR_RETRY_DELAY_MS` -> `retry.{max,statusCodes,delay}`
|
|
157
|
+
- `CRAWL_RATE_LIMIT_FALLBACK_DELAY_MS` / `EXIT_ON_RATE_LIMIT` / `EXIT_CODE_RATE_LIMIT` -> `rateLimit.*`
|
|
158
|
+
- `CRAWL_DELAY_MS` -> `crawl.delay`
|
|
159
|
+
- `DATA_FORMATTER.CATEGORISED_PATHS` -> `output.routes`
|
|
160
|
+
- `DATA_FORMATTER.EXCLUDED_PATTERNS` -> `output.exclude`
|
|
161
|
+
|
|
162
|
+
New in 2.0: `crawl.concurrency`, `crawl.maxDepth`, `crawl.resetOnComplete`, `output.format`, `browser.waitUntil`, `browser.blockResources`, pluggable `fetcher`, and lifecycle hooks. Formatted output is now real JSON by default (1.x wrote `url content` text lines).
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
/**
 * Accepted values for `browser.waitUntil`.
 * @type {readonly ['load', 'domcontentloaded', 'networkidle0', 'networkidle2']}
 */
export const BROWSER_WAIT_UNTIL = Object.freeze(['load', 'domcontentloaded', 'networkidle0', 'networkidle2']);

/** Resource types that `browser.blockResources` is allowed to contain. */
export const BROWSER_BLOCKABLE_RESOURCES = Object.freeze([
  'image',
  'stylesheet',
  'font',
  'media'
]);

/** Resource types blocked out of the box; 'stylesheet' is deliberately left out of this list. */
export const DEFAULT_BROWSER_BLOCK_RESOURCES = Object.freeze([
  'image',
  'font',
  'media'
]);

/**
 * Validates the `browser` config block and throws on the first problem found.
 *
 * Checks, in order: `waitUntil` is one of BROWSER_WAIT_UNTIL, `blockResources`
 * is an array, and every entry of it is one of BROWSER_BLOCKABLE_RESOURCES.
 *
 * @param {import('../index.js').BrowserConfig} browser
 * @throws {Error} when any of the checks above fails
 */
export const assertBrowserConfig = (browser) => {
  const waitUntil = browser?.waitUntil;
  const waitUntilIsKnown = BROWSER_WAIT_UNTIL.includes(waitUntil);

  if (!waitUntilIsKnown) {
    const allowed = BROWSER_WAIT_UNTIL.join(', ');
    throw new Error(`Invalid browser.waitUntil: ${String(waitUntil)}. Expected one of: ${allowed}`);
  }

  const requested = browser?.blockResources;
  if (Array.isArray(requested) === false) {
    throw new Error('Invalid browser.blockResources: expected an array.');
  }

  // Locate by index (not value) so an `undefined` entry is still reported.
  const offendingIndex = requested.findIndex((entry) => !BROWSER_BLOCKABLE_RESOURCES.includes(entry));
  if (offendingIndex !== -1) {
    const allowed = BROWSER_BLOCKABLE_RESOURCES.join(', ');
    throw new Error(
      `Invalid browser.blockResources entry: ${String(requested[offendingIndex])}. Expected one of: ${allowed}`
    );
  }
};
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
import { DEFAULT_BROWSER_BLOCK_RESOURCES } from './browser.js';
|
|
2
|
+
|
|
3
|
+
// Default `exclude` rule: links whose URL ends in one of these file extensions are never queued.
const EXCLUDED_EXTENSIONS_PATTERN =
  /\.(zip|rar|webp|png|jpg|jpeg|gif|mp3|mp4|pdf|css|js|svg|ico|eot|ttf|woff|woff2|otf|webm|ogg|wav|flac|m4a|mkv|mov|avi|wmv|flv|swf|exe|msi|dmg|iso|bin)$/i;

// Default `extract.removeSelectors`: elements removed from a page before its text is extracted.
const DEFAULT_REMOVE_SELECTORS = [
  'script',
  'noscript',
  'style',
  'meta',
  'link',
  'svg',
  'path',
  'img',
  'input',
  'textarea',
  'embed',
  'object',
  'iframe',
  'nav',
  'header',
  'footer',
  'aside',
  'button',
  '[aria-modal]',
  '[role="dialog"]',
  '[role="alert"]',
  '[role="banner"]',
  '[role="form"]',
  '[role="navigation"]',
  '[role="search"]'
];

/**
 * Baseline Scraply configuration.
 *
 * The object given to `createCrawler()` / `scraply()` is merged over these
 * values, so any key can be overridden. All durations are milliseconds.
 *
 * @type {import('../index.js').ScraplyConfig}
 */
export const DEFAULT_CONFIG = {
  // Seed URLs for the crawl.
  startUrls: ['https://crawler-test.com/'],

  // Allow-list for discovered links: absolute URL prefixes (e.g. 'https://site.com/blog')
  // and/or RegExps. Left empty, the start URLs are used instead.
  include: [],

  // Deny-list for discovered links (string prefix or RegExp); a match is never queued.
  exclude: [EXCLUDED_EXTENSIONS_PATTERN],

  // A response is parsed only when its Content-Type includes one of these.
  allowedContentTypes: ['text/html'],

  // Backend: 'http' (native fetch), 'browser' (Puppeteer) or a custom Fetcher instance.
  fetcher: 'http',

  // Settings for the built-in Puppeteer backend (`fetcher: 'browser'`).
  browser: {
    // Point at which page.goto treats navigation as done. 'networkidle2' suits
    // SPAs (Vue/React apps) that add links/content after the load event.
    waitUntil: 'load',

    // Resource types aborted while fetching, to speed crawls up. Stylesheets are
    // left out by default since many SPAs need CSS before their content renders.
    blockResources: [...DEFAULT_BROWSER_BLOCK_RESOURCES]
  },

  // One of: 'silent' | 'error' | 'warn' | 'info' | 'debug'
  logLevel: 'info',

  storage: {
    dir: 'dataset'
  },

  request: {
    // Per-request budget; aborts the fetch, body read included.
    timeout: 10000,
    // Redirect hops the HTTP fetcher follows before giving up.
    maxRedirects: 5,
    // Hard cap on the response body, in bytes; 0 turns the cap off.
    maxContentLength: 20 * 1024 * 1024,
    userAgent: 'Mozilla/5.0 (compatible; Scraply/2.0; +https://www.npmjs.com/package/scraply)',
    // Additional headers (auth, Accept-Language, cookies, ...) sent by every fetcher.
    headers: {}
  },

  retry: {
    max: 1,
    statusCodes: [408, 500, 502, 503, 504],
    delay: 1000
  },

  rateLimit: {
    fallbackDelay: 60000,
    exitOnLimit: true,
    exitCode: 10
  },

  crawl: {
    concurrency: 5,
    // Minimum spacing between two requests to the same host.
    delay: 200,
    maxDepth: Infinity,
    // Hard cap on successfully crawled pages; the count carries across resumes.
    maxPages: Infinity,
    resetOnComplete: true,
    // When true, previously errored URLs are re-queued on resume so they get retried.
    retryErrors: false
  },

  extract: {
    removeSelectors: DEFAULT_REMOVE_SELECTORS
  },

  output: {
    // One of: 'json' | 'jsonl' | 'lines'
    format: 'json',
    exclude: [],
    routes: {
      'https://crawler-test.com': { '*': 'general.json' }
    }
  }
};
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
import path from 'node:path';
|
|
2
|
+
import { DEFAULT_CONFIG } from './defaults.js';
|
|
3
|
+
import { assertBrowserConfig } from './browser.js';
|
|
4
|
+
import { normalizeUrl } from '../url/normalize.js';
|
|
5
|
+
|
|
6
|
+
/**
 * True only for dictionary-like objects: object literals and
 * `Object.create(null)` maps. Arrays, RegExps and class instances (Headers,
 * Map, Date, custom fetchers built from a class, ...) are NOT plain, so the
 * merge assigns them as-is instead of flattening them through Object.entries.
 *
 * @param {unknown} value
 * @returns {boolean}
 */
const isPlainObject = (value) => {
  if (value === null || typeof value !== 'object') return false;

  const proto = Object.getPrototypeOf(value);
  return proto === Object.prototype || proto === null;
};

/**
 * Recursively copies arrays and plain objects so the result shares no mutable
 * container with the input. Everything else (primitives, RegExps, functions,
 * class instances) is returned by reference.
 *
 * @param {unknown} value
 * @returns {unknown}
 */
const cloneDefaults = (value) => {
  if (Array.isArray(value)) return value.map(cloneDefaults);

  if (isPlainObject(value)) {
    return Object.fromEntries(Object.entries(value).map(([key, inner]) => [key, cloneDefaults(inner)]));
  }

  return value;
};

/**
 * Merges `source` over `target` and returns a new object; neither input is
 * modified.
 *
 * - Plain objects present on both sides are merged recursively.
 * - Any other defined `source` value (arrays included) replaces the target
 *   value and is kept by reference, so caller-owned objects keep their identity.
 * - `undefined` in `source` means "not provided": the target value is kept.
 * - Values carried over from `target` are deep-copied, so mutating the result
 *   can never write through to the defaults object.
 * - An own `__proto__` key in `source` is ignored; assigning it would replace
 *   the prototype of the merged object.
 *
 * @param {Record<string, unknown>} target - base values (e.g. the defaults)
 * @param {Record<string, unknown>} source - overrides
 * @returns {Record<string, unknown>}
 */
const deepMerge = (target, source) => {
  const merged = { ...target };

  // Detach every container inherited from target.
  for (const [key, value] of Object.entries(target)) {
    merged[key] = cloneDefaults(value);
  }

  for (const [key, value] of Object.entries(source)) {
    if (key === '__proto__') continue;

    if (isPlainObject(value) && isPlainObject(target[key])) {
      merged[key] = deepMerge(target[key], value);
    } else if (value !== undefined) {
      merged[key] = value;
    }
  }

  return merged;
};
|
|
22
|
+
|
|
23
|
+
/**
 * Merges a user config over DEFAULT_CONFIG, derives the storage paths from
 * `storage.dir`, fills in `include` when it is empty, and validates the
 * `browser` block.
 *
 * DEFAULT_CONFIG itself is never modified: the derived storage paths are
 * written to a fresh `storage` object rather than onto the merged one, which
 * may still be the defaults' own object when the caller passes no `storage`.
 *
 * @param {import('../index.js').ScraplyConfig} [userConfig] - partial overrides; `null`/`undefined` mean "defaults only"
 * @returns {import('../index.js').ResolvedConfig}
 * @throws {Error} when `browser.waitUntil` or `browser.blockResources` is invalid (raised by assertBrowserConfig)
 */
export const loadConfig = (userConfig = {}) => {
  // `?? {}` also covers an explicit null, which the default parameter does not.
  const config = deepMerge(DEFAULT_CONFIG, userConfig ?? {});

  const { dir } = config.storage;
  config.storage = {
    ...config.storage,
    queuePath: path.posix.join(dir, 'queue.json'),
    crawledDir: path.posix.join(dir, 'crawled'),
    formattedDir: path.posix.join(dir, 'formatted')
  };

  // When no include rules are given, fall back to the start URLs — normalized so
  // they match the normalized links the crawler actually discovers (forced
  // HTTPS, no "www.", no trailing slash).
  if (!config.include?.length) {
    // Wrap the call so Array#map's (index, array) arguments never reach normalizeUrl.
    config.include = config.startUrls.map((url) => normalizeUrl(url));
  }

  assertBrowserConfig(config.browser);

  return config;
};
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
import { URL } from 'node:url';
|
|
2
|
+
import { delay } from '../util/delay.js';
|
|
3
|
+
|
|
4
|
+
/**
 * Drains the queue with a fixed-size worker pool. Requests to the same host are
 * spaced by `perHostDelay` for politeness, while different hosts run in parallel.
 * Workers stop when the queue is drained and nothing is in flight, or when
 * `isStopped()` becomes true.
 *
 * `concurrency` is coerced to a whole number of at least 1. Missing, NaN or
 * non-finite values fall back to a single worker instead of starting none or
 * throwing while building the pool. `perHostDelay` is coerced to a number;
 * anything that is not a positive number disables the spacing.
 *
 * @param {Object} deps
 * @param {import('./queue.js').QueueManager} deps.queue - only `claimNext()` is used; a falsy return means "nothing to claim right now"
 * @param {number} deps.concurrency - number of parallel workers
 * @param {number} deps.perHostDelay - minimum milliseconds between requests to one host; <= 0 disables
 * @param {(entry: import('./queue.js').QueueEntry) => Promise<void>} deps.processOne
 * @param {() => boolean} deps.isStopped
 * @returns {Promise<void>} resolves once every worker has exited; rejects if `processOne` rejects
 */
export const runPipeline = async ({ queue, concurrency, perHostDelay, processOne, isStopped }) => {
  // host -> start time reserved for the most recently scheduled request to it
  const lastHostAt = new Map();
  // number of entries currently being processed (claimed but not finished)
  let active = 0;

  const requestedWorkers = Number(concurrency);
  const poolSize = Number.isFinite(requestedWorkers) ? Math.max(Math.floor(requestedWorkers), 1) : 1;

  // Numeric coercion keeps a string such as '200' from being concatenated
  // onto the timestamp below.
  const hostDelay = Number(perHostDelay);

  const respectHostDelay = async (url) => {
    if (!(hostDelay > 0)) return;

    let host;
    try {
      host = new URL(url).host;
    } catch {
      // Unparseable URL: there is no host to throttle, so skip the spacing.
      return;
    }

    // Reserve the slot before waiting so concurrent workers targeting the same
    // host queue up behind each other instead of all firing at once.
    const now = Date.now();
    const scheduled = Math.max(now, (lastHostAt.get(host) ?? 0) + hostDelay);
    lastHostAt.set(host, scheduled);

    const wait = scheduled - now;
    if (wait > 0) await delay(wait);
  };

  const worker = async () => {
    while (!isStopped()) {
      const entry = queue.claimNext();

      if (!entry) {
        if (active === 0) return; // queue drained and nothing can enqueue more
        // Another worker is still busy and may enqueue new links; poll again shortly.
        await delay(25);
        continue;
      }

      active++;
      try {
        await respectHostDelay(entry.url);
        await processOne(entry);
      } finally {
        active--;
      }
    }
  };

  const workers = Array.from({ length: poolSize }, () => worker());
  await Promise.all(workers);
};
|