scraply 2.0.1 → 2.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +7 -2
- package/readme.md +112 -10
- package/src/config/defaults.js +27 -2
- package/src/config/load.js +49 -0
- package/src/core/errors.js +23 -0
- package/src/core/queue.js +29 -11
- package/src/core/retry.js +11 -7
- package/src/crawler.js +215 -56
- package/src/extract/extract.js +17 -3
- package/src/extract/parse.js +35 -0
- package/src/extract/sitemap.js +35 -0
- package/src/index.d.ts +285 -0
- package/src/index.js +37 -6
- package/src/output/writers.js +14 -5
package/src/crawler.js
CHANGED
|
@@ -6,18 +6,19 @@ import { loadConfig } from './config/load.js';
|
|
|
6
6
|
import { createLogger } from './util/logger.js';
|
|
7
7
|
import { createHooks } from './util/hooks.js';
|
|
8
8
|
import { normalizeUrl } from './url/normalize.js';
|
|
9
|
-
import { matchesAnyPattern } from './url/patterns.js';
|
|
9
|
+
import { matchesPattern, matchesAnyPattern } from './url/patterns.js';
|
|
10
10
|
import { discoverLinks } from './extract/links.js';
|
|
11
11
|
import { extractText } from './extract/extract.js';
|
|
12
|
+
import { classifyContentType, parseJson, toText } from './extract/parse.js';
|
|
13
|
+
import { parseSitemap } from './extract/sitemap.js';
|
|
12
14
|
import { QueueManager } from './core/queue.js';
|
|
13
15
|
import { runPipeline } from './core/pipeline.js';
|
|
14
16
|
import { createRetryRunner } from './core/retry.js';
|
|
17
|
+
import { RateLimitError } from './core/errors.js';
|
|
15
18
|
import { resolveFetcher } from './fetchers/index.js';
|
|
16
19
|
import { formatRecords } from './output/writers.js';
|
|
17
20
|
import { loadJSON, saveJSON, deletePath, deleteUntracked } from './storage/files.js';
|
|
18
21
|
|
|
19
|
-
const toHtml = (data) => (typeof data === 'string' ? data : Buffer.from(data).toString('utf8'));
|
|
20
|
-
|
|
21
22
|
// Returns the hex-encoded SHA-256 digest of `text`.
const sha256 = (text) => {
  const hasher = createHash('sha256');
  hasher.update(text);
  return hasher.digest('hex');
};
|
|
22
23
|
|
|
23
24
|
/**
|
|
@@ -42,17 +43,45 @@ export const createCrawler = (userConfig = {}) => {
|
|
|
42
43
|
let datasetCounter = 0;
|
|
43
44
|
let processedCount = 0;
|
|
44
45
|
let signalsRegistered = false;
|
|
46
|
+
let signalHandler = null;
|
|
47
|
+
/** @type {RateLimitError|null} Set when a 429 aborts the crawl; rethrown after the pool drains. */
|
|
48
|
+
let rateLimitError = null;
|
|
45
49
|
|
|
46
50
|
// Closes the active fetcher when it exposes a close() hook; otherwise a no-op.
const closeFetcher = async () => {
  if (!fetcher.close) return;
  await fetcher.close();
};
|
|
49
53
|
|
|
50
|
-
const
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
+
const retryRunner = createRetryRunner({ config, logger });
|
|
55
|
+
|
|
56
|
+
// Works out which settings apply to `url`. The `sites` entry whose matching
// pattern is the longest (i.e. most specific) wins; on a tie the earliest entry
// is kept. Without any match the top-level `allowedContentTypes` / `extract`
// are returned untouched. A matching site's `extract` is shallow-merged over
// the global one, and its `allowedContentTypes` replaces the global list.
const resolveEntryConfig = (url) => {
  const globalRules = () => ({
    allowedContentTypes: config.allowedContentTypes,
    extract: config.extract
  });

  if (!config.sites.length) return globalRules();

  let winner = null;
  let winnerScore = -1;

  for (const site of config.sites) {
    for (const pattern of site.match) {
      // Pattern length (string form for non-string patterns) is the specificity score.
      const score = matchesPattern(url, pattern) ? String(pattern).length : -1;
      if (score > winnerScore) {
        winnerScore = score;
        winner = site;
      }
    }
  }

  if (!winner) return globalRules();

  const siteExtract = winner.extract ?? {};
  return {
    allowedContentTypes: winner.allowedContentTypes ?? config.allowedContentTypes,
    extract: { ...config.extract, ...siteExtract }
  };
};
|
|
56
85
|
|
|
57
86
|
// --- queue lifecycle ---
|
|
58
87
|
|
|
@@ -77,6 +106,11 @@ export const createCrawler = (userConfig = {}) => {
|
|
|
77
106
|
if (requeued > 0) logger.info(`Re-queued ${requeued} previously errored URL(s) for retry.`);
|
|
78
107
|
}
|
|
79
108
|
|
|
109
|
+
if (config.crawl.retrySkipped) {
|
|
110
|
+
const requeued = queue.requeueSkipped();
|
|
111
|
+
if (requeued > 0) logger.info(`Re-queued ${requeued} previously skipped URL(s) for retry.`);
|
|
112
|
+
}
|
|
113
|
+
|
|
80
114
|
if (queue.entries.length === 0) {
|
|
81
115
|
logger.info(`Starting fresh with ${startUrls.length} start URL(s).`);
|
|
82
116
|
queue.seed(startUrls);
|
|
@@ -104,10 +138,11 @@ export const createCrawler = (userConfig = {}) => {
|
|
|
104
138
|
// Fetches one URL through the retry runner (retry/rate-limit policy) and hands
// back the fetcher's raw result. The URL is normalized on every attempt.
const fetchUrl = (url) => {
  const attempt = () => fetcher.fetch(normalizeUrl(url));
  return retryRunner.run(attempt);
};
|
|
106
140
|
|
|
107
|
-
// Extracts readable text from HTML.
|
|
141
|
+
// Turns HTML into readable text. With a URL, the extract rules resolved for
// that URL (per-site overrides included) are used; without one, the global
// extract config applies.
const extract = (html, url = null) => {
  const rules = url ? resolveEntryConfig(url).extract : config.extract;
  return { url, content: extractText(html, rules) };
};
|
|
112
147
|
|
|
113
148
|
const shouldCrawl = (url) => {
|
|
@@ -157,40 +192,69 @@ export const createCrawler = (userConfig = {}) => {
|
|
|
157
192
|
const result = await retryRunner.run(() => fetcher.fetch(entry.url));
|
|
158
193
|
await hooks.emit('response', result, entry);
|
|
159
194
|
|
|
195
|
+
const effective = resolveEntryConfig(entry.url);
|
|
196
|
+
|
|
160
197
|
// Fetchers return lowercased header keys (see Fetcher interface).
|
|
161
198
|
const contentType = result.headers?.['content-type'];
|
|
162
|
-
if (!contentType || !
|
|
163
|
-
|
|
199
|
+
if (!contentType || !effective.allowedContentTypes.some((type) => contentType.includes(type))) {
|
|
200
|
+
const reason = `content-type: ${contentType ?? 'none'}`;
|
|
201
|
+
queue.markSkipped(entry, { reason, status: result.status });
|
|
202
|
+
await hooks.emit('skip', entry, { reason, status: result.status, result });
|
|
164
203
|
return;
|
|
165
204
|
}
|
|
166
205
|
|
|
167
|
-
const
|
|
206
|
+
const kind = classifyContentType(contentType);
|
|
207
|
+
let $ = null;
|
|
208
|
+
let content = '';
|
|
209
|
+
let data = null;
|
|
168
210
|
|
|
169
|
-
|
|
170
|
-
|
|
211
|
+
if (kind === 'html') {
|
|
212
|
+
$ = cheerio.load(toText(result.data));
|
|
171
213
|
|
|
172
|
-
|
|
173
|
-
|
|
214
|
+
// Discover links from the full DOM before extraction strips elements.
|
|
215
|
+
const links = await hooks.reduce('links', discoverLinks($, entry.url), $, entry, result);
|
|
216
|
+
await enqueue(links, { depth: entry.depth + 1, referrer: entry.url });
|
|
174
217
|
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
218
|
+
content = extractText($, effective.extract);
|
|
219
|
+
} else if (kind === 'json' && effective.extract.json !== false) {
|
|
220
|
+
const parsed = parseJson(result.data);
|
|
221
|
+
data = parsed.data;
|
|
222
|
+
content = parsed.content;
|
|
223
|
+
|
|
224
|
+
const links = await hooks.reduce('links', [], $, entry, result);
|
|
225
|
+
if (links?.length) await enqueue(links, { depth: entry.depth + 1, referrer: entry.url });
|
|
226
|
+
} else {
|
|
227
|
+
content = toText(result.data);
|
|
228
|
+
|
|
229
|
+
const links = await hooks.reduce('links', [], $, entry, result);
|
|
230
|
+
if (links?.length) await enqueue(links, { depth: entry.depth + 1, referrer: entry.url });
|
|
231
|
+
}
|
|
232
|
+
|
|
233
|
+
content = await hooks.reduce('extract', content, $, entry, result);
|
|
234
|
+
|
|
235
|
+
let record = { url: entry.url, content, crawledAt: new Date().toISOString() };
|
|
236
|
+
if (data !== null) record.data = data;
|
|
237
|
+
|
|
238
|
+
// Transform runs BEFORE the record is persisted so its result is what gets
|
|
239
|
+
// saved to disk and later picked up by format().
|
|
240
|
+
record = await hooks.reduce('transform', record, entry, result);
|
|
241
|
+
record.hash = sha256(record.content ?? '');
|
|
181
242
|
|
|
182
243
|
const file = saveDataset(record);
|
|
183
244
|
queue.markDone(entry, { file, status: result.status });
|
|
184
245
|
|
|
185
|
-
|
|
186
|
-
await hooks.emit('page', transformed, entry);
|
|
246
|
+
await hooks.emit('page', record, entry, result);
|
|
187
247
|
} catch (error) {
|
|
188
|
-
// A 429
|
|
189
|
-
//
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
248
|
+
// A 429 with exitOnLimit aborts the whole crawl: stash the error, stop the
|
|
249
|
+
// pool and leave the entry pending so the next run retries it.
|
|
250
|
+
if (error instanceof RateLimitError) {
|
|
251
|
+
rateLimitError = error;
|
|
252
|
+
stopped = true;
|
|
253
|
+
queue.flush();
|
|
254
|
+
return;
|
|
193
255
|
}
|
|
256
|
+
|
|
257
|
+
queue.markError(entry, { error: error.message, status: error.response?.status });
|
|
194
258
|
await hooks.emit('error', error, entry);
|
|
195
259
|
logger.error(`Failed to fetch ${entry.url} -> ${error.message}`);
|
|
196
260
|
}
|
|
@@ -213,19 +277,64 @@ export const createCrawler = (userConfig = {}) => {
|
|
|
213
277
|
};
|
|
214
278
|
|
|
215
279
|
// Installs one shared SIGINT/SIGTERM handler, unless `config.signals` is off or
// a handler is already installed. The first signal asks for a graceful stop
// (sets `stopped` and flushes the queue); a repeated signal force-quits the
// process with exit code 1.
const registerSignals = () => {
  if (!config.signals || signalsRegistered) return;
  signalsRegistered = true;

  let stopRequested = false;

  const forceQuit = () => {
    logger.warn('Received second termination signal. Forcing quit.');
    process.exit(1);
  };

  const requestGracefulStop = () => {
    stopRequested = true;
    logger.warn('Received termination signal. Finishing in-flight work... (signal again to force quit)');
    stopped = true;
    queue.flush();
  };

  signalHandler = () => {
    if (stopRequested) forceQuit();
    requestGracefulStop();
  };

  for (const signal of ['SIGINT', 'SIGTERM']) {
    process.on(signal, signalHandler);
  }
};
|
|
298
|
+
|
|
299
|
+
// Detaches the handler installed for SIGINT/SIGTERM and resets the registration
// state so a later run can install a fresh one. Does nothing when no handler is
// currently installed.
const unregisterSignals = () => {
  if (signalHandler) {
    for (const signal of ['SIGINT', 'SIGTERM']) {
      process.off(signal, signalHandler);
    }
    signalHandler = null;
    signalsRegistered = false;
  }
};
|
|
306
|
+
|
|
307
|
+
// Seeds URLs from sitemap(s) when crawl.sitemap is enabled. An array value is
// used as the list of sitemap URLs; any other truthy value means "try
// /sitemap.xml on each start URL's origin". Recurses into sitemap indexes
// (bounded by MAX_SITEMAP_DEPTH, each sitemap fetched at most once) and routes
// discovered URLs through enqueue() so include/exclude rules still apply.
//
// A stop request (termination signal or stop()) is honoured between fetches, so
// a graceful shutdown does not have to wait for every remaining sitemap.
// A RateLimitError is rethrown to the caller; any other failure is logged as a
// warning and that sitemap is skipped.
const seedSitemaps = async () => {
  const MAX_SITEMAP_DEPTH = 5;

  const cfg = config.crawl.sitemap;
  if (!cfg) return;

  const roots = Array.isArray(cfg)
    ? cfg
    : startUrls.map((url) => new URL('/sitemap.xml', url).href);

  const seen = new Set();
  let added = 0;

  const visit = async (url, depth) => {
    if (stopped || depth > MAX_SITEMAP_DEPTH || seen.has(url)) return;
    seen.add(url);

    try {
      const result = await retryRunner.run(() => fetcher.fetch(url));
      const { sitemaps, urls } = parseSitemap(toText(result.data));
      // Await first, then accumulate, so `added` is never read before the await.
      const enqueued = await enqueue(urls, { depth: 0, referrer: url });
      added += enqueued;
      for (const nested of sitemaps) await visit(nested, depth + 1);
    } catch (error) {
      if (error instanceof RateLimitError) throw error;
      logger.warn(`Sitemap fetch failed (${url}) -> ${error.message}`);
    }
  };

  for (const url of roots) await visit(url, 0);
  if (added > 0) logger.info(`Seeded ${added} URL(s) from sitemap(s).`);
};
|
|
230
339
|
|
|
231
340
|
// Crawls until the queue is drained (or `stop()` is called).
|
|
@@ -233,37 +342,49 @@ export const createCrawler = (userConfig = {}) => {
|
|
|
233
342
|
init();
|
|
234
343
|
logBanner();
|
|
235
344
|
registerSignals();
|
|
345
|
+
rateLimitError = null;
|
|
236
346
|
|
|
237
|
-
|
|
238
|
-
|
|
347
|
+
try {
|
|
348
|
+
if (fetcher.init) await fetcher.init();
|
|
349
|
+
await seedSitemaps();
|
|
350
|
+
processedCount = queue.crawledCount() + queue.errorCount() + queue.skippedCount();
|
|
351
|
+
|
|
352
|
+
await runPipeline({
|
|
353
|
+
queue,
|
|
354
|
+
concurrency: config.crawl.concurrency,
|
|
355
|
+
perHostDelay: config.crawl.delay,
|
|
356
|
+
processOne,
|
|
357
|
+
isStopped: () => stopped || queue.crawledCount() >= config.crawl.maxPages
|
|
358
|
+
});
|
|
239
359
|
|
|
240
|
-
|
|
241
|
-
queue,
|
|
242
|
-
concurrency: config.crawl.concurrency,
|
|
243
|
-
perHostDelay: config.crawl.delay,
|
|
244
|
-
processOne,
|
|
245
|
-
isStopped: () => stopped || queue.crawledCount() >= config.crawl.maxPages
|
|
246
|
-
});
|
|
360
|
+
queue.flush();
|
|
247
361
|
|
|
248
|
-
|
|
362
|
+
// A rate-limit abort surfaces here so run() can clean up (flush + close)
|
|
363
|
+
// before the error propagates to the caller.
|
|
364
|
+
if (rateLimitError) throw rateLimitError;
|
|
249
365
|
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
366
|
+
if (config.crawl.maxPages !== Infinity && queue.crawledCount() >= config.crawl.maxPages) {
|
|
367
|
+
logger.info(`Reached maxPages limit (${config.crawl.maxPages}).`);
|
|
368
|
+
}
|
|
253
369
|
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
370
|
+
logger.info(
|
|
371
|
+
`Crawling completed! ${queue.crawledCount()} crawled, ${queue.skippedCount()} skipped, ` +
|
|
372
|
+
`${queue.errorCount()} errors, ${queue.pendingCount()} pending (of ${queue.entries.length} total).`
|
|
373
|
+
);
|
|
374
|
+
} finally {
|
|
375
|
+
unregisterSignals();
|
|
376
|
+
}
|
|
258
377
|
};
|
|
259
378
|
|
|
260
379
|
// Loads the saved record of every queue entry that has a `file`, reading from
// storage.crawledDir, so resumed runs also include pages from earlier sessions.
// Records are returned exactly as stored (with any `transform` additions and
// `data` for JSON sources) — the output writer decides what to serialize.
// Entries whose file cannot be loaded are left out.
const collectRecords = () =>
  [...queue.entries].flatMap((entry) => {
    if (!entry.file) return [];
    const location = path.posix.join(config.storage.crawledDir, entry.file);
    const saved = loadJSON(location, null);
    return saved ? [saved] : [];
  });
|
|
@@ -322,6 +443,12 @@ export const createCrawler = (userConfig = {}) => {
|
|
|
322
443
|
if (queue.entries.length === 0) queue.load();
|
|
323
444
|
return queue.requeueErrors();
|
|
324
445
|
},
|
|
446
|
+
// Same as requeueErrors() but for skipped entries (e.g. after widening
|
|
447
|
+
// allowedContentTypes or changing sites).
|
|
448
|
+
requeueSkipped: () => {
|
|
449
|
+
if (queue.entries.length === 0) queue.load();
|
|
450
|
+
return queue.requeueSkipped();
|
|
451
|
+
},
|
|
325
452
|
stop: () => {
|
|
326
453
|
stopped = true;
|
|
327
454
|
}
|
|
@@ -330,3 +457,35 @@ export const createCrawler = (userConfig = {}) => {
|
|
|
330
457
|
|
|
331
458
|
// Convenience entry point: builds a crawler from `userConfig` and immediately
// runs its full pipeline, returning whatever run() returns.
export const scraply = (userConfig = {}) => {
  const crawler = createCrawler(userConfig);
  return crawler.run();
};
|
|
460
|
+
|
|
461
|
+
/**
 * Runs multiple crawlers in one process. Accepts crawler instances or plain
 * config objects (which are turned into crawlers). Because the crawler no longer
 * calls `process.exit`, several crawlers can safely share one process — set
 * `signals: false` in each config (or rely on the per-instance graceful stop).
 *
 * Failure handling: when a crawler's `run()` fails, no further crawlers are
 * started, crawlers that are already running are allowed to finish, and the
 * first error is then rethrown. This matches what sequential mode already did
 * and avoids crawlers being launched in the background after the returned
 * promise has rejected.
 *
 * @param {Array<import('./index.js').ScraplyConfig | ReturnType<typeof createCrawler>>} items
 * @param {{ concurrency?: number }} [options] - how many crawlers run at once (default 1 = sequential).
 *   Values that are not a number fall back to 1; fractions are rounded down.
 * @returns {Promise<Array<import('./core/queue.js').QueueEntry[]>>} each crawler's final queue entries, in input order
 * @throws the first error raised by any crawler's `run()`
 */
export const runCrawlers = async (items, { concurrency = 1 } = {}) => {
  const instances = items.map((item) =>
    item && typeof item.run === 'function' ? item : createCrawler(item)
  );

  const results = new Array(instances.length);
  let cursor = 0;
  let failed = false;
  let firstError;

  const worker = async () => {
    while (!failed && cursor < instances.length) {
      const index = cursor++;
      try {
        results[index] = await instances[index].run();
      } catch (error) {
        // Remember only the first failure; it also stops every worker from
        // picking up more crawlers.
        if (!failed) {
          failed = true;
          firstError = error;
        }
      }
    }
  };

  // A NaN pool size would create zero workers and silently run nothing.
  const requested = Number(concurrency);
  const limit = Number.isNaN(requested) ? 1 : Math.floor(requested);
  const poolSize = Math.max(1, Math.min(limit, instances.length || 1));

  await Promise.all(Array.from({ length: poolSize }, () => worker()));

  if (failed) throw firstError;
  return results;
};
|
package/src/extract/extract.js
CHANGED
|
@@ -20,18 +20,32 @@ const collectText = ($, element) => {
|
|
|
20
20
|
* Extracts readable text from an HTML document. Cheerio decodes HTML entities
|
|
21
21
|
* for us, so no separate decoder dependency is needed.
|
|
22
22
|
*
|
|
23
|
+
* `root` allow-lists the container(s) to read from (a selector or array of
|
|
24
|
+
* selectors); when it matches nothing — or is null — extraction falls back to
|
|
25
|
+
* `rootFallback` (default `<body>`). `removeSelectors` then strips noise from
|
|
26
|
+
* within the chosen root.
|
|
27
|
+
*
|
|
23
28
|
* @param {string|import('cheerio').CheerioAPI} input - raw HTML or a loaded Cheerio instance
|
|
24
|
-
* @param {{ removeSelectors?: string[] }} [options]
|
|
29
|
+
* @param {{ removeSelectors?: string[], root?: string|string[]|null, rootFallback?: string }} [options]
|
|
25
30
|
* @returns {string}
|
|
26
31
|
*/
|
|
27
32
|
export const extractText = (input, options = {}) => {
|
|
28
|
-
const { removeSelectors = [] } = options;
|
|
33
|
+
const { removeSelectors = [], root = null, rootFallback = 'body' } = options;
|
|
29
34
|
const $ = typeof input === 'string' ? cheerio.load(input) : input;
|
|
30
35
|
|
|
31
36
|
if (removeSelectors.length) $(removeSelectors.join(',')).remove();
|
|
32
37
|
$('*').contents().filter((_, node) => node.type === 'comment').remove();
|
|
33
38
|
|
|
34
|
-
|
|
39
|
+
const rootSelector = Array.isArray(root) ? root.join(',') : root;
|
|
40
|
+
let $root = rootSelector ? $(rootSelector) : $(rootFallback || 'body');
|
|
41
|
+
if ($root.length === 0) $root = $(rootFallback || 'body');
|
|
42
|
+
|
|
43
|
+
let text = '';
|
|
44
|
+
$root.each((_, element) => {
|
|
45
|
+
text += `${collectText($, $(element))} `;
|
|
46
|
+
});
|
|
47
|
+
|
|
48
|
+
return text
|
|
35
49
|
.replace(/\n/g, ' ')
|
|
36
50
|
.replace(/\\['"\\]/g, (match) => match.slice(1))
|
|
37
51
|
.replace(WHITESPACE_CHARS, ' ')
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
/**
 * Coerces a fetcher body to a UTF-8 string.
 *
 * - strings are returned unchanged;
 * - `null` / `undefined` (a response without a body) become `''` instead of
 *   making `Buffer.from` throw;
 * - typed arrays, Node Buffers and DataViews are decoded from exactly the byte
 *   window they cover, so non-byte views are not truncated or dropped;
 * - anything else (ArrayBuffer, array of bytes, ...) goes through `Buffer.from`.
 *
 * @param {string|ArrayBuffer|ArrayBufferView|number[]|null|undefined} data
 * @returns {string}
 */
export const toText = (data) => {
  if (typeof data === 'string') return data;
  if (data == null) return '';

  const bytes = ArrayBuffer.isView(data)
    ? Buffer.from(data.buffer, data.byteOffset, data.byteLength)
    : Buffer.from(data);

  return bytes.toString('utf8');
};
|
|
3
|
+
|
|
4
|
+
/**
 * Maps a Content-Type header value onto the body kinds Scraply handles.
 * The check is a case-insensitive substring test, tried in this order:
 * "json" first, then "html" (which also covers application/xhtml+xml).
 * Whatever matches neither is handled as raw text.
 *
 * @param {string} [contentType]
 * @returns {'html'|'json'|'text'}
 */
export const classifyContentType = (contentType = '') => {
  const normalized = String(contentType).toLowerCase();

  // Order matters: a value containing both markers is classified as JSON.
  for (const kind of ['json', 'html']) {
    if (normalized.includes(kind)) return kind;
  }

  return 'text';
};
|
|
18
|
+
|
|
19
|
+
/**
 * Parses a JSON body. Returns the parsed value plus a pretty-printed string for
 * the record `content`. Falls back to the raw text (with `data: null`) when the
 * body is not valid JSON, so a mislabeled response is never lost.
 *
 * A leading byte-order mark (U+FEFF) is ignored for parsing: `JSON.parse`
 * rejects it, which would otherwise turn valid JSON served with a BOM into an
 * unparsed text record. The fallback `content` keeps the body untouched.
 *
 * @param {string|ArrayBuffer} data
 * @returns {{ data: unknown, content: string }}
 */
export const parseJson = (data) => {
  const text = toText(data);
  const source = text.charCodeAt(0) === 0xfeff ? text.slice(1) : text;

  try {
    const parsed = JSON.parse(source);
    return { data: parsed, content: JSON.stringify(parsed, null, 2) };
  } catch {
    return { data: null, content: text };
  }
};
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import * as cheerio from 'cheerio';
|
|
2
|
+
|
|
3
|
+
/**
 * Reads an XML sitemap or sitemap index. Child sitemap locations
 * (`<sitemap><loc>`) and page locations (`<url><loc>`) are reported in separate
 * lists so the caller can descend into indexes before enqueuing pages. When
 * neither form yields anything, every `<loc>` in the document is treated as a
 * page URL. Locations are trimmed and empty ones are dropped.
 *
 * @param {string} xml
 * @returns {{ sitemaps: string[], urls: string[] }}
 */
export const parseSitemap = (xml) => {
  const $ = cheerio.load(xml, { xmlMode: true });

  // Collects the trimmed, non-empty text of every element matching `selector`.
  const collect = (selector) => {
    const found = [];
    $(selector).each((_, el) => {
      const value = $(el).text().trim();
      if (value) found.push(value);
    });
    return found;
  };

  const sitemaps = collect('sitemap > loc');
  const urls = collect('url > loc');

  // Fallback for sitemaps that omit the standard wrapping elements.
  if (sitemaps.length === 0 && urls.length === 0) {
    return { sitemaps, urls: collect('loc') };
  }

  return { sitemaps, urls };
};
|