scraply 2.0.0 → 2.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +7 -3
- package/readme.md +149 -55
- package/src/config/browser.js +37 -0
- package/src/config/defaults.js +47 -11
- package/src/config/load.js +57 -1
- package/src/core/errors.js +23 -0
- package/src/core/queue.js +83 -11
- package/src/core/retry.js +34 -26
- package/src/crawler.js +265 -76
- package/src/extract/extract.js +17 -3
- package/src/extract/links.js +4 -4
- package/src/extract/parse.js +35 -0
- package/src/extract/sitemap.js +35 -0
- package/src/fetchers/browserFetcher.js +18 -12
- package/src/fetchers/httpFetcher.js +40 -3
- package/src/index.d.ts +285 -0
- package/src/index.js +48 -7
- package/src/output/writers.js +14 -5
package/src/core/queue.js
CHANGED
|
@@ -3,19 +3,21 @@ import { loadJSON, saveJSON, deletePath } from '../storage/files.js';
|
|
|
3
3
|
/**
|
|
4
4
|
* @typedef {Object} QueueEntry
|
|
5
5
|
* @property {string} url
|
|
6
|
-
* @property {string|null} file -
|
|
6
|
+
* @property {string|null} file - filename of the saved crawled record (relative to crawledDir), or null
|
|
7
7
|
* @property {number|null} status - last HTTP status
|
|
8
8
|
* @property {string|null} error - error message, or null
|
|
9
|
+
* @property {string|null} skipped - reason the page was skipped (e.g. content-type), or null
|
|
9
10
|
* @property {string|null} referrer - URL this entry was discovered on
|
|
10
11
|
* @property {number} depth
|
|
11
12
|
*/
|
|
12
13
|
|
|
13
|
-
const isProcessed = (entry) => entry.file !== null || entry.error !== null;
|
|
14
|
+
/**
 * Tells whether a queue entry has reached a terminal state (crawled, errored or skipped).
 *
 * Loose `!= null` checks are deliberate: a record persisted before the `skipped`
 * field existed has no such key, and `undefined !== null` would wrongly report
 * that still-pending record as processed.
 *
 * @param {QueueEntry} entry
 * @returns {boolean}
 */
const isProcessed = (entry) => entry.file != null || entry.error != null || entry.skipped != null;
|
|
14
15
|
|
|
15
16
|
/**
|
|
16
17
|
* Owns the crawl queue: dedup, depth limiting, status tracking and durable
|
|
17
|
-
* checkpointing.
|
|
18
|
-
*
|
|
18
|
+
* checkpointing. Status totals are tracked incrementally (O(1) reads) and
|
|
19
|
+
* persistence is debounced so a high-concurrency crawl does not rewrite the
|
|
20
|
+
* queue file on every single URL.
|
|
19
21
|
*/
|
|
20
22
|
export class QueueManager {
|
|
21
23
|
/** @param {{ config: import('../index.js').ResolvedConfig, logger: any }} deps */
|
|
@@ -32,16 +34,30 @@ export class QueueManager {
|
|
|
32
34
|
/** @type {QueueEntry[]} */
|
|
33
35
|
this._pending = [];
|
|
34
36
|
this._cursor = 0;
|
|
37
|
+
this._crawled = 0;
|
|
38
|
+
this._errors = 0;
|
|
39
|
+
this._skipped = 0;
|
|
35
40
|
this._dirty = false;
|
|
36
41
|
this._timer = null;
|
|
37
42
|
this._persistInterval = 1000;
|
|
38
43
|
}
|
|
39
44
|
|
|
40
|
-
/** Loads any previously persisted queue and rebuilds the in-memory indexes. */
|
|
45
|
+
/** Loads any previously persisted queue and rebuilds the in-memory indexes and totals. */
|
|
41
46
|
load() {
|
|
42
47
|
this.entries = loadJSON(this.path, []) ?? [];
|
|
43
48
|
this.index = new Set(this.entries.map((entry) => entry.url));
|
|
44
|
-
this._pending =
|
|
49
|
+
this._pending = [];
|
|
50
|
+
this._crawled = 0;
|
|
51
|
+
this._errors = 0;
|
|
52
|
+
this._skipped = 0;
|
|
53
|
+
|
|
54
|
+
for (const entry of this.entries) {
|
|
55
|
+
if (entry.file !== null) this._crawled += 1;
|
|
56
|
+
else if (entry.error !== null) this._errors += 1;
|
|
57
|
+
else if (entry.skipped !== null) this._skipped += 1;
|
|
58
|
+
else this._pending.push(entry);
|
|
59
|
+
}
|
|
60
|
+
|
|
45
61
|
this._cursor = 0;
|
|
46
62
|
return this.entries;
|
|
47
63
|
}
|
|
@@ -59,7 +75,7 @@ export class QueueManager {
|
|
|
59
75
|
add(url, { depth = 0, referrer = null } = {}) {
|
|
60
76
|
if (this.index.has(url) || depth > this.maxDepth) return false;
|
|
61
77
|
|
|
62
|
-
const entry = { url, file: null, status: null, error: null, referrer, depth };
|
|
78
|
+
const entry = { url, file: null, status: null, error: null, skipped: null, referrer, depth };
|
|
63
79
|
this.index.add(url);
|
|
64
80
|
this.entries.push(entry);
|
|
65
81
|
this._pending.push(entry);
|
|
@@ -76,29 +92,82 @@ export class QueueManager {
|
|
|
76
92
|
entry.file = file;
|
|
77
93
|
entry.status = status;
|
|
78
94
|
entry.error = null;
|
|
95
|
+
entry.skipped = null;
|
|
96
|
+
this._crawled += 1;
|
|
79
97
|
this._markDirty();
|
|
80
98
|
}
|
|
81
99
|
|
|
82
100
|
/**
 * Records a failed attempt on `entry`: stores the error message and the last
 * HTTP status (null when none was provided), bumps the error total and flags
 * the queue as changed through `_markDirty()`.
 *
 * @param {QueueEntry} entry
 * @param {{ error: string, status?: number|null }} outcome
 */
markError(entry, { error, status }) {
  Object.assign(entry, { error, status: status ?? null });
  this._errors += 1;
  this._markDirty();
}
|
|
106
|
+
|
|
107
|
+
markSkipped(entry, { reason, status }) {
|
|
108
|
+
entry.skipped = reason;
|
|
109
|
+
entry.status = status ?? null;
|
|
110
|
+
this._skipped += 1;
|
|
85
111
|
this._markDirty();
|
|
86
112
|
}
|
|
87
113
|
|
|
114
|
+
/**
|
|
115
|
+
* Returns matching terminal entries to the pending set so the next crawl
|
|
116
|
+
* retries them. Persists immediately so a fresh `load()` (e.g. at the start of
|
|
117
|
+
* `crawl()`) sees the requeued entries.
|
|
118
|
+
* @param {(entry: QueueEntry) => boolean} match
|
|
119
|
+
* @returns {number} how many entries were requeued
|
|
120
|
+
*/
|
|
121
|
+
_requeue(match) {
|
|
122
|
+
let count = 0;
|
|
123
|
+
for (const entry of this.entries) {
|
|
124
|
+
if (!match(entry)) continue;
|
|
125
|
+
|
|
126
|
+
if (entry.error !== null) this._errors -= 1;
|
|
127
|
+
if (entry.skipped !== null) this._skipped -= 1;
|
|
128
|
+
|
|
129
|
+
entry.error = null;
|
|
130
|
+
entry.skipped = null;
|
|
131
|
+
entry.status = null;
|
|
132
|
+
this._pending.push(entry);
|
|
133
|
+
count += 1;
|
|
134
|
+
}
|
|
135
|
+
if (count > 0) this.flush();
|
|
136
|
+
return count;
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
/** Re-queues every errored entry for retry. @returns {number} */
|
|
140
|
+
requeueErrors() {
|
|
141
|
+
return this._requeue((entry) => entry.error !== null);
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
/**
|
|
145
|
+
* Re-queues every skipped entry for another attempt. Useful after widening
|
|
146
|
+
* `allowedContentTypes` (or changing `sites`) so previously skipped URLs are
|
|
147
|
+
* reconsidered. @returns {number}
|
|
148
|
+
*/
|
|
149
|
+
requeueSkipped() {
|
|
150
|
+
return this._requeue((entry) => entry.skipped !== null);
|
|
151
|
+
}
|
|
152
|
+
|
|
88
153
|
isAllProcessed() {
|
|
89
|
-
return this.entries.length > 0 && this.
|
|
154
|
+
return this.entries.length > 0 && this.pendingCount() === 0;
|
|
90
155
|
}
|
|
91
156
|
|
|
92
157
|
pendingCount() {
|
|
93
|
-
return this.entries.
|
|
158
|
+
return this.entries.length - this._crawled - this._errors - this._skipped;
|
|
94
159
|
}
|
|
95
160
|
|
|
96
161
|
crawledCount() {
|
|
97
|
-
return this.
|
|
162
|
+
return this._crawled;
|
|
98
163
|
}
|
|
99
164
|
|
|
100
165
|
errorCount() {
|
|
101
|
-
return this.
|
|
166
|
+
return this._errors;
|
|
167
|
+
}
|
|
168
|
+
|
|
169
|
+
skippedCount() {
|
|
170
|
+
return this._skipped;
|
|
102
171
|
}
|
|
103
172
|
|
|
104
173
|
/** Clears in-memory state and removes the persisted queue file. */
|
|
@@ -107,6 +176,9 @@ export class QueueManager {
|
|
|
107
176
|
this.index = new Set();
|
|
108
177
|
this._pending = [];
|
|
109
178
|
this._cursor = 0;
|
|
179
|
+
this._crawled = 0;
|
|
180
|
+
this._errors = 0;
|
|
181
|
+
this._skipped = 0;
|
|
110
182
|
this._dirty = false;
|
|
111
183
|
deletePath(this.path);
|
|
112
184
|
}
|
package/src/core/retry.js
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { delay } from '../util/delay.js';
|
|
2
|
+
import { RateLimitError } from './errors.js';
|
|
2
3
|
|
|
3
4
|
/** Derives how long to wait (ms) from rate-limit headers, falling back to a default. */
|
|
4
5
|
const computeWait = (headers = {}, fallback) => {
|
|
@@ -22,46 +23,53 @@ const computeWait = (headers = {}, fallback) => {
|
|
|
22
23
|
* Wraps a fetch operation with retry and rate-limit handling shared by every
|
|
23
24
|
* fetcher backend.
|
|
24
25
|
*
|
|
25
|
-
*
|
|
26
|
+
* Rate limiting (HTTP 429) is handled independently of the normal retry budget:
|
|
27
|
+
* when `rateLimit.exitOnLimit` is false the runner waits (honoring `retry-after`
|
|
28
|
+
* / `x-ratelimit-reset`) and retries until the host relents; otherwise it throws
|
|
29
|
+
* a `RateLimitError` so the crawl aborts cleanly and can be resumed later.
|
|
30
|
+
*
|
|
31
|
+
* @param {{ config: import('../index.js').ResolvedConfig, logger: any }} deps
|
|
26
32
|
*/
|
|
27
|
-
export const createRetryRunner = ({ config, logger
|
|
33
|
+
export const createRetryRunner = ({ config, logger }) => {
  const { retry, rateLimit } = config;

  /**
   * Calls `fn` until it resolves, a non-retriable failure occurs, or the retry
   * budget (`retry.max`) is spent.
   *
   * @param {() => Promise<any>} fn - the fetch operation to attempt
   * @returns {Promise<any>} whatever `fn` resolves with
   * @throws {RateLimitError} on HTTP 429 when `rateLimit.exitOnLimit` is set
   * @throws the last failure of `fn` once it is not (or no longer) retriable
   */
  const run = async (fn) => {
    let attempt = 0;

    for (;;) {
      try {
        return await fn();
      } catch (error) {
        // A missing status means the failure carried no HTTP response.
        const status = error?.response?.status;

        if (status === 429) {
          if (rateLimit.exitOnLimit) {
            logger.warn(`Rate limited. Aborting crawl (exitOnLimit) with code ${rateLimit.exitCode}.`);
            throw new RateLimitError('Rate limited', {
              code: rateLimit.exitCode,
              headers: error.response.headers,
              cause: error
            });
          }

          const wait = computeWait(error.response.headers, rateLimit.fallbackDelay);
          logger.warn(`Rate limited. Waiting ${Math.round(wait / 1000)}s before retrying...`);
          await delay(wait);
          continue; // rate-limit waits never consume the retry budget
        }

        const retriable = status === undefined || retry.statusCodes.includes(status);
        if (retriable && attempt < retry.max) {
          attempt += 1;
          // The rejection value is not guaranteed to be an Error (it may even
          // be null/undefined); a guarded read keeps the log line from raising
          // a TypeError that would mask the real failure.
          logger.info(`Retry ${attempt}/${retry.max} -> ${error?.message}`);
          if (retry.delay > 0) await delay(retry.delay);
          continue;
        }

        throw error;
      }
    }
  };

  return { run };
};
|