scraply 2.0.0 → 2.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +7 -3
- package/readme.md +149 -55
- package/src/config/browser.js +37 -0
- package/src/config/defaults.js +47 -11
- package/src/config/load.js +57 -1
- package/src/core/errors.js +23 -0
- package/src/core/queue.js +83 -11
- package/src/core/retry.js +34 -26
- package/src/crawler.js +265 -76
- package/src/extract/extract.js +17 -3
- package/src/extract/links.js +4 -4
- package/src/extract/parse.js +35 -0
- package/src/extract/sitemap.js +35 -0
- package/src/fetchers/browserFetcher.js +18 -12
- package/src/fetchers/httpFetcher.js +40 -3
- package/src/index.d.ts +285 -0
- package/src/index.js +48 -7
- package/src/output/writers.js +14 -5
package/src/crawler.js
CHANGED
|
@@ -1,36 +1,28 @@
|
|
|
1
1
|
import path from 'node:path';
|
|
2
|
+
import { createHash } from 'node:crypto';
|
|
2
3
|
import * as cheerio from 'cheerio';
|
|
3
4
|
|
|
4
5
|
import { loadConfig } from './config/load.js';
|
|
5
6
|
import { createLogger } from './util/logger.js';
|
|
6
7
|
import { createHooks } from './util/hooks.js';
|
|
7
8
|
import { normalizeUrl } from './url/normalize.js';
|
|
8
|
-
import { matchesAnyPattern } from './url/patterns.js';
|
|
9
|
+
import { matchesPattern, matchesAnyPattern } from './url/patterns.js';
|
|
9
10
|
import { discoverLinks } from './extract/links.js';
|
|
10
11
|
import { extractText } from './extract/extract.js';
|
|
12
|
+
import { classifyContentType, parseJson, toText } from './extract/parse.js';
|
|
13
|
+
import { parseSitemap } from './extract/sitemap.js';
|
|
11
14
|
import { QueueManager } from './core/queue.js';
|
|
12
15
|
import { runPipeline } from './core/pipeline.js';
|
|
13
16
|
import { createRetryRunner } from './core/retry.js';
|
|
17
|
+
import { RateLimitError } from './core/errors.js';
|
|
14
18
|
import { resolveFetcher } from './fetchers/index.js';
|
|
15
19
|
import { formatRecords } from './output/writers.js';
|
|
16
20
|
import { loadJSON, saveJSON, deletePath, deleteUntracked } from './storage/files.js';
|
|
17
21
|
|
|
18
|
-
const
|
|
19
|
-
if (!headers) return undefined;
|
|
20
|
-
if (headers[name] !== undefined) return headers[name];
|
|
21
|
-
const lower = name.toLowerCase();
|
|
22
|
-
for (const key of Object.keys(headers)) {
|
|
23
|
-
if (key.toLowerCase() === lower) return headers[key];
|
|
24
|
-
}
|
|
25
|
-
return undefined;
|
|
26
|
-
};
|
|
27
|
-
|
|
28
|
-
const toHtml = (data) => (typeof data === 'string' ? data : Buffer.from(data).toString('utf8'));
|
|
22
|
+
// Returns the hex-encoded SHA-256 digest of `text`.
const sha256 = (text) => {
  const hasher = createHash('sha256');
  hasher.update(text);
  return hasher.digest('hex');
};
|
|
29
23
|
|
|
30
24
|
/**
|
|
31
|
-
* Creates a crawler instance. Every stage is exposed as a method so callers can
|
|
32
|
-
* run the whole pipeline (`run`) or drive individual stages and add their own
|
|
33
|
-
* logic via hooks.
|
|
25
|
+
* Creates a crawler instance. Every stage is exposed as a method so callers can run the whole pipeline (`run`) or drive individual stages and add their own logic via hooks.
|
|
34
26
|
*
|
|
35
27
|
* @param {import('./index.js').ScraplyConfig} [userConfig]
|
|
36
28
|
*/
|
|
@@ -41,22 +33,55 @@ export const createCrawler = (userConfig = {}) => {
|
|
|
41
33
|
const queue = new QueueManager({ config, logger });
|
|
42
34
|
const fetcher = resolveFetcher({ config, logger });
|
|
43
35
|
|
|
36
|
+
// Normalized once so the start URLs match discovered (normalized) links and
|
|
37
|
+
// can be looked up in O(1) during filtering.
|
|
38
|
+
const startUrls = config.startUrls.map(normalizeUrl);
|
|
39
|
+
const startUrlSet = new Set(startUrls);
|
|
40
|
+
|
|
44
41
|
let stopped = false;
|
|
45
42
|
let initialized = false;
|
|
46
43
|
let datasetCounter = 0;
|
|
47
44
|
let processedCount = 0;
|
|
48
45
|
let signalsRegistered = false;
|
|
46
|
+
let signalHandler = null;
|
|
47
|
+
/** @type {RateLimitError|null} Set when a 429 aborts the crawl; rethrown after the pool drains. */
|
|
48
|
+
let rateLimitError = null;
|
|
49
49
|
|
|
50
50
|
const closeFetcher = async () => {
|
|
51
51
|
if (fetcher.close) await fetcher.close();
|
|
52
52
|
};
|
|
53
53
|
|
|
54
|
-
const
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
54
|
+
const retryRunner = createRetryRunner({ config, logger });
|
|
55
|
+
|
|
56
|
+
// Picks the per-URL settings. Among all `sites` entries with a pattern that
// matches `url`, the entry owning the longest pattern (by string length) wins;
// on a tie the earliest one is kept. Its `allowedContentTypes` replaces the
// top-level list when present, and its `extract` keys are layered over the
// top-level `extract`. With no sites, or no match, the top-level values are
// returned as-is.
const resolveEntryConfig = (url) => {
  const topLevel = () => ({
    allowedContentTypes: config.allowedContentTypes,
    extract: config.extract
  });

  if (!config.sites.length) return topLevel();

  let winner = null;
  let winnerLength = -1;

  for (const site of config.sites) {
    for (const pattern of site.match) {
      if (matchesPattern(url, pattern)) {
        // String() leaves string patterns untouched and stringifies anything else.
        const patternLength = String(pattern).length;
        if (patternLength > winnerLength) {
          winner = site;
          winnerLength = patternLength;
        }
      }
    }
  }

  if (!winner) return topLevel();

  const allowedContentTypes = winner.allowedContentTypes ?? config.allowedContentTypes;
  // Spreading null/undefined adds nothing, so a site without `extract` yields a
  // plain copy of the top-level rules.
  const extract = { ...config.extract, ...winner.extract };
  return { allowedContentTypes, extract };
};
|
|
60
85
|
|
|
61
86
|
// --- queue lifecycle ---
|
|
62
87
|
|
|
@@ -76,9 +101,19 @@ export const createCrawler = (userConfig = {}) => {
|
|
|
76
101
|
queue.load();
|
|
77
102
|
datasetCounter = computeDatasetCounter();
|
|
78
103
|
|
|
104
|
+
if (config.crawl.retryErrors) {
|
|
105
|
+
const requeued = queue.requeueErrors();
|
|
106
|
+
if (requeued > 0) logger.info(`Re-queued ${requeued} previously errored URL(s) for retry.`);
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
if (config.crawl.retrySkipped) {
|
|
110
|
+
const requeued = queue.requeueSkipped();
|
|
111
|
+
if (requeued > 0) logger.info(`Re-queued ${requeued} previously skipped URL(s) for retry.`);
|
|
112
|
+
}
|
|
113
|
+
|
|
79
114
|
if (queue.entries.length === 0) {
|
|
80
|
-
logger.info(`Starting fresh with ${
|
|
81
|
-
queue.seed(
|
|
115
|
+
logger.info(`Starting fresh with ${startUrls.length} start URL(s).`);
|
|
116
|
+
queue.seed(startUrls);
|
|
82
117
|
return;
|
|
83
118
|
}
|
|
84
119
|
|
|
@@ -88,7 +123,7 @@ export const createCrawler = (userConfig = {}) => {
|
|
|
88
123
|
queue.reset();
|
|
89
124
|
deletePath(config.storage.crawledDir);
|
|
90
125
|
datasetCounter = 0;
|
|
91
|
-
queue.seed(
|
|
126
|
+
queue.seed(startUrls);
|
|
92
127
|
} else {
|
|
93
128
|
logger.info('All URLs already processed (resetOnComplete is false). Nothing to do.');
|
|
94
129
|
}
|
|
@@ -100,22 +135,23 @@ export const createCrawler = (userConfig = {}) => {
|
|
|
100
135
|
|
|
101
136
|
// --- stage methods ---
|
|
102
137
|
|
|
103
|
-
|
|
138
|
+
// Fetches a single URL (with retry/rate-limit policy) and returns the raw result.
|
|
104
139
|
const fetchUrl = (url) => retryRunner.run(() => fetcher.fetch(normalizeUrl(url)));
|
|
105
140
|
|
|
106
|
-
|
|
141
|
+
// Extracts readable text from HTML. When a URL is supplied, the matching
|
|
142
|
+
// per-site extract rules apply; otherwise the global extract config is used.
|
|
107
143
|
const extract = (html, url = null) => ({
|
|
108
144
|
url,
|
|
109
|
-
content: extractText(html,
|
|
145
|
+
content: extractText(html, url ? resolveEntryConfig(url).extract : config.extract)
|
|
110
146
|
});
|
|
111
147
|
|
|
112
148
|
const shouldCrawl = (url) => {
|
|
113
|
-
if (
|
|
149
|
+
if (startUrlSet.has(url)) return true;
|
|
114
150
|
if (matchesAnyPattern(url, config.exclude)) return false;
|
|
115
151
|
return matchesAnyPattern(url, config.include);
|
|
116
152
|
};
|
|
117
153
|
|
|
118
|
-
|
|
154
|
+
// Filters + normalizes URLs and adds the survivors to the queue.
|
|
119
155
|
const enqueue = async (urls, { depth = 0, referrer = null } = {}) => {
|
|
120
156
|
const list = Array.isArray(urls) ? urls : [urls];
|
|
121
157
|
let added = 0;
|
|
@@ -137,15 +173,17 @@ export const createCrawler = (userConfig = {}) => {
|
|
|
137
173
|
return added;
|
|
138
174
|
};
|
|
139
175
|
|
|
176
|
+
// Persists a crawled record and returns its filename (relative to crawledDir).
|
|
177
|
+
// Only the bare name is stored in the queue so datasets stay portable.
|
|
140
178
|
const saveDataset = (record) => {
|
|
141
179
|
datasetCounter += 1;
|
|
142
|
-
const
|
|
143
|
-
saveJSON(
|
|
144
|
-
return
|
|
180
|
+
const file = `${datasetCounter}.json`;
|
|
181
|
+
saveJSON(path.posix.join(config.storage.crawledDir, file), record);
|
|
182
|
+
return file;
|
|
145
183
|
};
|
|
146
184
|
|
|
147
185
|
const processOne = async (entry) => {
|
|
148
|
-
if (entry.file || entry.error) return;
|
|
186
|
+
if (entry.file || entry.error || entry.skipped) return;
|
|
149
187
|
|
|
150
188
|
processedCount += 1;
|
|
151
189
|
logger.info(`- ${processedCount}/${queue.entries.length} -> ${entry.url}`);
|
|
@@ -154,26 +192,68 @@ export const createCrawler = (userConfig = {}) => {
|
|
|
154
192
|
const result = await retryRunner.run(() => fetcher.fetch(entry.url));
|
|
155
193
|
await hooks.emit('response', result, entry);
|
|
156
194
|
|
|
157
|
-
const
|
|
158
|
-
|
|
159
|
-
|
|
195
|
+
const effective = resolveEntryConfig(entry.url);
|
|
196
|
+
|
|
197
|
+
// Fetchers return lowercased header keys (see Fetcher interface).
|
|
198
|
+
const contentType = result.headers?.['content-type'];
|
|
199
|
+
if (!contentType || !effective.allowedContentTypes.some((type) => contentType.includes(type))) {
|
|
200
|
+
const reason = `content-type: ${contentType ?? 'none'}`;
|
|
201
|
+
queue.markSkipped(entry, { reason, status: result.status });
|
|
202
|
+
await hooks.emit('skip', entry, { reason, status: result.status, result });
|
|
160
203
|
return;
|
|
161
204
|
}
|
|
162
205
|
|
|
163
|
-
const
|
|
206
|
+
const kind = classifyContentType(contentType);
|
|
207
|
+
let $ = null;
|
|
208
|
+
let content = '';
|
|
209
|
+
let data = null;
|
|
210
|
+
|
|
211
|
+
if (kind === 'html') {
|
|
212
|
+
$ = cheerio.load(toText(result.data));
|
|
213
|
+
|
|
214
|
+
// Discover links from the full DOM before extraction strips elements.
|
|
215
|
+
const links = await hooks.reduce('links', discoverLinks($, entry.url), $, entry, result);
|
|
216
|
+
await enqueue(links, { depth: entry.depth + 1, referrer: entry.url });
|
|
217
|
+
|
|
218
|
+
content = extractText($, effective.extract);
|
|
219
|
+
} else if (kind === 'json' && effective.extract.json !== false) {
|
|
220
|
+
const parsed = parseJson(result.data);
|
|
221
|
+
data = parsed.data;
|
|
222
|
+
content = parsed.content;
|
|
223
|
+
|
|
224
|
+
const links = await hooks.reduce('links', [], $, entry, result);
|
|
225
|
+
if (links?.length) await enqueue(links, { depth: entry.depth + 1, referrer: entry.url });
|
|
226
|
+
} else {
|
|
227
|
+
content = toText(result.data);
|
|
228
|
+
|
|
229
|
+
const links = await hooks.reduce('links', [], $, entry, result);
|
|
230
|
+
if (links?.length) await enqueue(links, { depth: entry.depth + 1, referrer: entry.url });
|
|
231
|
+
}
|
|
232
|
+
|
|
233
|
+
content = await hooks.reduce('extract', content, $, entry, result);
|
|
164
234
|
|
|
165
|
-
|
|
166
|
-
|
|
235
|
+
let record = { url: entry.url, content, crawledAt: new Date().toISOString() };
|
|
236
|
+
if (data !== null) record.data = data;
|
|
167
237
|
|
|
168
|
-
|
|
169
|
-
|
|
238
|
+
// Transform runs BEFORE the record is persisted so its result is what gets
|
|
239
|
+
// saved to disk and later picked up by format().
|
|
240
|
+
record = await hooks.reduce('transform', record, entry, result);
|
|
241
|
+
record.hash = sha256(record.content ?? '');
|
|
170
242
|
|
|
171
|
-
const file = saveDataset(
|
|
243
|
+
const file = saveDataset(record);
|
|
172
244
|
queue.markDone(entry, { file, status: result.status });
|
|
173
245
|
|
|
174
|
-
|
|
175
|
-
await hooks.emit('page', record, entry);
|
|
246
|
+
await hooks.emit('page', record, entry, result);
|
|
176
247
|
} catch (error) {
|
|
248
|
+
// A 429 with exitOnLimit aborts the whole crawl: stash the error, stop the
|
|
249
|
+
// pool and leave the entry pending so the next run retries it.
|
|
250
|
+
if (error instanceof RateLimitError) {
|
|
251
|
+
rateLimitError = error;
|
|
252
|
+
stopped = true;
|
|
253
|
+
queue.flush();
|
|
254
|
+
return;
|
|
255
|
+
}
|
|
256
|
+
|
|
177
257
|
queue.markError(entry, { error: error.message, status: error.response?.status });
|
|
178
258
|
await hooks.emit('error', error, entry);
|
|
179
259
|
logger.error(`Failed to fetch ${entry.url} -> ${error.message}`);
|
|
@@ -181,75 +261,140 @@ export const createCrawler = (userConfig = {}) => {
|
|
|
181
261
|
};
|
|
182
262
|
|
|
183
263
|
const logBanner = () => {
|
|
264
|
+
const browserLine =
|
|
265
|
+
fetcher.name === 'browser' ? `\n - Browser waitUntil: ${config.browser.waitUntil}` : '';
|
|
266
|
+
|
|
184
267
|
logger.info(`STARTING SCRAPLY CRAWLER...
|
|
185
268
|
- Start URLs: ${config.startUrls.join(', ')}
|
|
186
|
-
- Fetcher: ${fetcher.name}
|
|
269
|
+
- Fetcher: ${fetcher.name}${browserLine}
|
|
187
270
|
- Concurrency: ${config.crawl.concurrency}
|
|
188
271
|
- Per-host delay: ${config.crawl.delay}ms
|
|
189
272
|
- Max depth: ${config.crawl.maxDepth}
|
|
273
|
+
- Max pages: ${config.crawl.maxPages}
|
|
190
274
|
- Allowed content types: ${config.allowedContentTypes.join(', ')}
|
|
191
275
|
- Output format: ${config.output.format}
|
|
192
276
|
`);
|
|
193
277
|
};
|
|
194
278
|
|
|
195
279
|
const registerSignals = () => {
|
|
196
|
-
if (signalsRegistered) return;
|
|
280
|
+
if (!config.signals || signalsRegistered) return;
|
|
197
281
|
signalsRegistered = true;
|
|
198
282
|
|
|
199
|
-
|
|
200
|
-
|
|
283
|
+
let forcing = false;
|
|
284
|
+
signalHandler = () => {
|
|
285
|
+
if (forcing) {
|
|
286
|
+
logger.warn('Received second termination signal. Forcing quit.');
|
|
287
|
+
process.exit(1);
|
|
288
|
+
}
|
|
289
|
+
forcing = true;
|
|
290
|
+
logger.warn('Received termination signal. Finishing in-flight work... (signal again to force quit)');
|
|
201
291
|
stopped = true;
|
|
202
292
|
queue.flush();
|
|
203
|
-
await closeFetcher();
|
|
204
|
-
process.exit(0);
|
|
205
293
|
};
|
|
206
294
|
|
|
207
|
-
process.
|
|
208
|
-
process.
|
|
295
|
+
process.on('SIGINT', signalHandler);
|
|
296
|
+
process.on('SIGTERM', signalHandler);
|
|
209
297
|
};
|
|
210
298
|
|
|
211
|
-
|
|
299
|
+
// Detaches the current signal handler from SIGINT and SIGTERM and clears the
// registration state so registerSignals() can install a handler again.
// Does nothing when no handler is installed.
const unregisterSignals = () => {
  if (signalHandler) {
    for (const signal of ['SIGINT', 'SIGTERM']) {
      process.off(signal, signalHandler);
    }
    signalHandler = null;
    signalsRegistered = false;
  }
};
|
|
306
|
+
|
|
307
|
+
// Seeds the queue from sitemap(s) when `crawl.sitemap` is set. An array value is
// used as the list of sitemap URLs; any other truthy value means
// `/sitemap.xml` resolved against every start URL. Sitemap indexes are followed
// recursively (nesting deeper than 5 levels is ignored, and each sitemap URL is
// fetched at most once). Page URLs go through enqueue(), so its filtering
// applies to them as well.
const seedSitemaps = async () => {
  const sitemapSetting = config.crawl.sitemap;
  if (!sitemapSetting) return;

  let roots;
  if (Array.isArray(sitemapSetting)) {
    roots = sitemapSetting;
  } else {
    roots = startUrls.map((startUrl) => new URL('/sitemap.xml', startUrl).href);
  }

  const visited = new Set();
  let seededCount = 0;

  const visit = async (sitemapUrl, level) => {
    if (level > 5) return;
    if (visited.has(sitemapUrl)) return;
    visited.add(sitemapUrl);

    try {
      const response = await retryRunner.run(() => fetcher.fetch(sitemapUrl));
      const parsed = parseSitemap(toText(response.data));
      seededCount += await enqueue(parsed.urls, { depth: 0, referrer: sitemapUrl });
      for (const child of parsed.sitemaps) {
        await visit(child, level + 1);
      }
    } catch (error) {
      // A rate-limit error is not a per-sitemap failure: let it reach the caller.
      if (error instanceof RateLimitError) throw error;
      logger.warn(`Sitemap fetch failed (${sitemapUrl}) -> ${error.message}`);
    }
  };

  for (const root of roots) {
    await visit(root, 0);
  }

  if (seededCount > 0) logger.info(`Seeded ${seededCount} URL(s) from sitemap(s).`);
};
|
|
339
|
+
|
|
340
|
+
// Crawls until the queue is drained (or `stop()` is called).
|
|
212
341
|
const crawl = async () => {
|
|
213
342
|
init();
|
|
214
343
|
logBanner();
|
|
215
344
|
registerSignals();
|
|
345
|
+
rateLimitError = null;
|
|
216
346
|
|
|
217
|
-
|
|
218
|
-
|
|
347
|
+
try {
|
|
348
|
+
if (fetcher.init) await fetcher.init();
|
|
349
|
+
await seedSitemaps();
|
|
350
|
+
processedCount = queue.crawledCount() + queue.errorCount() + queue.skippedCount();
|
|
351
|
+
|
|
352
|
+
await runPipeline({
|
|
353
|
+
queue,
|
|
354
|
+
concurrency: config.crawl.concurrency,
|
|
355
|
+
perHostDelay: config.crawl.delay,
|
|
356
|
+
processOne,
|
|
357
|
+
isStopped: () => stopped || queue.crawledCount() >= config.crawl.maxPages
|
|
358
|
+
});
|
|
219
359
|
|
|
220
|
-
|
|
221
|
-
queue,
|
|
222
|
-
concurrency: config.crawl.concurrency,
|
|
223
|
-
perHostDelay: config.crawl.delay,
|
|
224
|
-
processOne,
|
|
225
|
-
isStopped: () => stopped
|
|
226
|
-
});
|
|
360
|
+
queue.flush();
|
|
227
361
|
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
362
|
+
// A rate-limit abort surfaces here so run() can clean up (flush + close)
|
|
363
|
+
// before the error propagates to the caller.
|
|
364
|
+
if (rateLimitError) throw rateLimitError;
|
|
365
|
+
|
|
366
|
+
if (config.crawl.maxPages !== Infinity && queue.crawledCount() >= config.crawl.maxPages) {
|
|
367
|
+
logger.info(`Reached maxPages limit (${config.crawl.maxPages}).`);
|
|
368
|
+
}
|
|
369
|
+
|
|
370
|
+
logger.info(
|
|
371
|
+
`Crawling completed! ${queue.crawledCount()} crawled, ${queue.skippedCount()} skipped, ` +
|
|
372
|
+
`${queue.errorCount()} errors, ${queue.pendingCount()} pending (of ${queue.entries.length} total).`
|
|
373
|
+
);
|
|
374
|
+
} finally {
|
|
375
|
+
unregisterSignals();
|
|
376
|
+
}
|
|
233
377
|
};
|
|
234
378
|
|
|
235
|
-
|
|
379
|
+
// Re-reads crawled pages from disk so resumed runs include earlier sessions.
|
|
380
|
+
// The full saved record is returned (including any `transform` additions and
|
|
381
|
+
// `data` for JSON sources); the output writer decides what to serialize.
|
|
236
382
|
const collectRecords = () => {
|
|
237
383
|
const records = [];
|
|
238
384
|
for (const entry of queue.entries) {
|
|
239
|
-
if (!entry.file
|
|
240
|
-
const
|
|
241
|
-
if (
|
|
385
|
+
if (!entry.file) continue;
|
|
386
|
+
const record = loadJSON(path.posix.join(config.storage.crawledDir, entry.file), null);
|
|
387
|
+
if (record) records.push(record);
|
|
242
388
|
}
|
|
243
389
|
return records;
|
|
244
390
|
};
|
|
245
391
|
|
|
246
|
-
|
|
247
|
-
* Routes records to their output files and writes them. Defaults to every
|
|
248
|
-
* successfully crawled page; pass an explicit array to format custom records.
|
|
249
|
-
*/
|
|
392
|
+
// Routes records to their output files and writes them. Defaults to every successfully crawled page; pass an explicit array to format custom records. When reading from disk, reloads `dataset/queue.json` first so this can run without calling `crawl()` (e.g. after changing `output.routes`).
|
|
250
393
|
const format = async (records = null) => {
|
|
251
394
|
logger.info('Formatting data...');
|
|
252
395
|
|
|
396
|
+
if (records === null) queue.load();
|
|
397
|
+
|
|
253
398
|
const collected = records ?? collectRecords();
|
|
254
399
|
const groups = formatRecords(collected, {
|
|
255
400
|
output: config.output,
|
|
@@ -269,7 +414,7 @@ export const createCrawler = (userConfig = {}) => {
|
|
|
269
414
|
return groups;
|
|
270
415
|
};
|
|
271
416
|
|
|
272
|
-
|
|
417
|
+
// Full pipeline: init -> crawl -> format, with guaranteed cleanup.
|
|
273
418
|
const run = async () => {
|
|
274
419
|
try {
|
|
275
420
|
await crawl();
|
|
@@ -292,11 +437,55 @@ export const createCrawler = (userConfig = {}) => {
|
|
|
292
437
|
crawl,
|
|
293
438
|
format,
|
|
294
439
|
run,
|
|
440
|
+
// Clears errored entries and returns them to the queue so a later crawl()
|
|
441
|
+
// retries them. Persists immediately; returns how many were requeued.
|
|
442
|
+
requeueErrors: () => {
|
|
443
|
+
if (queue.entries.length === 0) queue.load();
|
|
444
|
+
return queue.requeueErrors();
|
|
445
|
+
},
|
|
446
|
+
// Same as requeueErrors() but for skipped entries (e.g. after widening
|
|
447
|
+
// allowedContentTypes or changing sites).
|
|
448
|
+
requeueSkipped: () => {
|
|
449
|
+
if (queue.entries.length === 0) queue.load();
|
|
450
|
+
return queue.requeueSkipped();
|
|
451
|
+
},
|
|
295
452
|
stop: () => {
|
|
296
453
|
stopped = true;
|
|
297
454
|
}
|
|
298
455
|
};
|
|
299
456
|
};
|
|
300
457
|
|
|
301
|
-
|
|
458
|
+
// One-call convenience wrapper: create a crawler and run the full pipeline.
|
|
302
459
|
export const scraply = (userConfig = {}) => createCrawler(userConfig).run();
|
|
460
|
+
|
|
461
|
+
/**
 * Runs multiple crawlers in one process. Accepts crawler instances (anything
 * with a `run()` method) or plain config objects, which are turned into
 * crawlers via `createCrawler`. A fixed pool of workers pulls crawlers in input
 * order; each worker awaits one `run()` before taking the next.
 *
 * If any `run()` rejects, the returned promise rejects with that error; runs
 * already started by other workers are not cancelled.
 *
 * @param {Array<import('./index.js').ScraplyConfig | ReturnType<typeof createCrawler>>} items
 * @param {{ concurrency?: number }} [options] - how many crawlers run at once
 *   (default 1 = sequential). Values below 1 and non-numeric values fall back
 *   to 1; values above the number of crawlers are capped.
 * @returns {Promise<Array<import('./core/queue.js').QueueEntry[]>>} the value each crawler's `run()` resolved with, in input order
 */
export const runCrawlers = async (items, { concurrency = 1 } = {}) => {
  const instances = items.map((item) =>
    item && typeof item.run === 'function' ? item : createCrawler(item)
  );

  const results = new Array(instances.length);
  let cursor = 0;

  // Each worker claims the next unclaimed index synchronously, so no two
  // workers ever run the same crawler.
  const worker = async () => {
    while (cursor < instances.length) {
      const index = cursor++;
      results[index] = await instances[index].run();
    }
  };

  // A NaN pool size would make Array.from() create zero workers and silently
  // skip every crawler, so anything that is not a number degrades to sequential.
  const requested = Math.floor(Number(concurrency));
  const poolSize = Number.isNaN(requested)
    ? 1
    : Math.max(1, Math.min(requested, instances.length || 1));

  await Promise.all(Array.from({ length: poolSize }, () => worker()));

  return results;
};
|
package/src/extract/extract.js
CHANGED
|
@@ -20,18 +20,32 @@ const collectText = ($, element) => {
|
|
|
20
20
|
* Extracts readable text from an HTML document. Cheerio decodes HTML entities
|
|
21
21
|
* for us, so no separate decoder dependency is needed.
|
|
22
22
|
*
|
|
23
|
+
* `root` allow-lists the container(s) to read from (a selector or array of
|
|
24
|
+
* selectors); when it matches nothing — or is null — extraction falls back to
|
|
25
|
+
* `rootFallback` (default `<body>`). `removeSelectors` then strips noise from
|
|
26
|
+
* within the chosen root.
|
|
27
|
+
*
|
|
23
28
|
* @param {string|import('cheerio').CheerioAPI} input - raw HTML or a loaded Cheerio instance
|
|
24
|
-
* @param {{ removeSelectors?: string[] }} [options]
|
|
29
|
+
* @param {{ removeSelectors?: string[], root?: string|string[]|null, rootFallback?: string }} [options]
|
|
25
30
|
* @returns {string}
|
|
26
31
|
*/
|
|
27
32
|
export const extractText = (input, options = {}) => {
|
|
28
|
-
const { removeSelectors = [] } = options;
|
|
33
|
+
const { removeSelectors = [], root = null, rootFallback = 'body' } = options;
|
|
29
34
|
const $ = typeof input === 'string' ? cheerio.load(input) : input;
|
|
30
35
|
|
|
31
36
|
if (removeSelectors.length) $(removeSelectors.join(',')).remove();
|
|
32
37
|
$('*').contents().filter((_, node) => node.type === 'comment').remove();
|
|
33
38
|
|
|
34
|
-
|
|
39
|
+
const rootSelector = Array.isArray(root) ? root.join(',') : root;
|
|
40
|
+
let $root = rootSelector ? $(rootSelector) : $(rootFallback || 'body');
|
|
41
|
+
if ($root.length === 0) $root = $(rootFallback || 'body');
|
|
42
|
+
|
|
43
|
+
let text = '';
|
|
44
|
+
$root.each((_, element) => {
|
|
45
|
+
text += `${collectText($, $(element))} `;
|
|
46
|
+
});
|
|
47
|
+
|
|
48
|
+
return text
|
|
35
49
|
.replace(/\n/g, ' ')
|
|
36
50
|
.replace(/\\['"\\]/g, (match) => match.slice(1))
|
|
37
51
|
.replace(WHITESPACE_CHARS, ' ')
|
package/src/extract/links.js
CHANGED
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
import { URL } from 'node:url';
|
|
2
|
-
import { normalizeUrl } from '../url/normalize.js';
|
|
3
2
|
|
|
4
3
|
const NON_NAVIGATIONAL = /^(mailto:|tel:|javascript:|data:)/i;
|
|
5
4
|
|
|
6
5
|
/**
|
|
7
|
-
* Collects unique,
|
|
8
|
-
*
|
|
6
|
+
* Collects unique, absolute links from anchor tags in a document, resolving
|
|
7
|
+
* relative hrefs against `baseUrl`. Normalization and include/exclude filtering
|
|
8
|
+
* are the crawler's job (`enqueue`), so links are only resolved here.
|
|
9
9
|
*
|
|
10
10
|
* @param {import('cheerio').CheerioAPI} $
|
|
11
11
|
* @param {string} baseUrl - used to resolve relative hrefs
|
|
@@ -19,7 +19,7 @@ export const discoverLinks = ($, baseUrl) => {
|
|
|
19
19
|
if (!href || href.startsWith('#') || NON_NAVIGATIONAL.test(href)) return;
|
|
20
20
|
|
|
21
21
|
try {
|
|
22
|
-
links.add(
|
|
22
|
+
links.add(new URL(href, baseUrl).href);
|
|
23
23
|
} catch {
|
|
24
24
|
// Ignore malformed hrefs.
|
|
25
25
|
}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
/**
 * Coerces a fetcher body to a UTF-8 string.
 *
 * - strings are returned unchanged;
 * - `null` / `undefined` (a response without a body) become `''`;
 * - typed arrays, Buffers and DataViews are decoded from the exact byte range
 *   they view, so multi-byte element types are not truncated to one byte each;
 * - anything else (ArrayBuffer, byte arrays, ...) goes through `Buffer.from`.
 *
 * @param {string|ArrayBuffer|ArrayBufferView|null|undefined} data
 * @returns {string}
 */
export const toText = (data) => {
  if (typeof data === 'string') return data;
  if (data == null) return '';

  const bytes = ArrayBuffer.isView(data)
    ? Buffer.from(data.buffer, data.byteOffset, data.byteLength)
    : Buffer.from(data);

  return bytes.toString('utf8');
};
|
|
3
|
+
|
|
4
|
+
/**
 * Maps a Content-Type header value to the body kind Scraply handles.
 * The check is a case-insensitive substring match: "json" is tested first,
 * then "html" (which also covers application/xhtml+xml); any other value,
 * including a missing one, is classified as plain text.
 *
 * @param {string} [contentType]
 * @returns {'html'|'json'|'text'}
 */
export const classifyContentType = (contentType = '') => {
  const normalized = String(contentType).toLowerCase();

  // Order matters: a value mentioning both keywords is treated as JSON.
  for (const kind of ['json', 'html']) {
    if (normalized.includes(kind)) return kind;
  }

  return 'text';
};
|
|
18
|
+
|
|
19
|
+
/**
 * Parses a JSON body. Returns the parsed value plus a pretty-printed
 * (2-space) string for the record `content`. When the body is not valid JSON,
 * `data` is `null` and `content` is the raw decoded text, so a mislabeled
 * response is never lost.
 *
 * A leading byte-order mark (U+FEFF) is skipped before parsing: `JSON.parse`
 * rejects it, which would otherwise make a valid BOM-prefixed document look
 * like invalid JSON.
 *
 * @param {string|ArrayBuffer} data
 * @returns {{ data: unknown, content: string }}
 */
export const parseJson = (data) => {
  const text = toText(data);
  const body = text.charCodeAt(0) === 0xfeff ? text.slice(1) : text;

  try {
    const parsed = JSON.parse(body);
    return { data: parsed, content: JSON.stringify(parsed, null, 2) };
  } catch {
    // Deliberate best-effort: keep the untouched text instead of failing the page.
    return { data: null, content: text };
  }
};
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import * as cheerio from 'cheerio';
|
|
2
|
+
|
|
3
|
+
/**
 * Reads an XML sitemap or sitemap index. `<loc>` values directly under
 * `<sitemap>` are reported as nested `sitemaps`, and those directly under
 * `<url>` as page `urls`, so a caller can tell indexes and pages apart.
 * When neither form yields anything, every `<loc>` in the document is treated
 * as a page URL. Values are trimmed and empty ones are dropped.
 *
 * @param {string} xml
 * @returns {{ sitemaps: string[], urls: string[] }}
 */
export const parseSitemap = (xml) => {
  const $ = cheerio.load(xml, { xmlMode: true });

  // Trimmed, non-empty text of every element matching `selector`, in document order.
  const locationsOf = (selector) =>
    $(selector)
      .toArray()
      .map((element) => $(element).text().trim())
      .filter((value) => value);

  const sitemaps = locationsOf('sitemap > loc');
  let urls = locationsOf('url > loc');

  // Fallback for sitemaps that omit the standard wrapping elements.
  if (sitemaps.length === 0 && urls.length === 0) {
    urls = locationsOf('loc');
  }

  return { sitemaps, urls };
};
|