scraply 2.0.0 → 2.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/crawler.js CHANGED
@@ -1,36 +1,28 @@
1
1
  import path from 'node:path';
2
+ import { createHash } from 'node:crypto';
2
3
  import * as cheerio from 'cheerio';
3
4
 
4
5
  import { loadConfig } from './config/load.js';
5
6
  import { createLogger } from './util/logger.js';
6
7
  import { createHooks } from './util/hooks.js';
7
8
  import { normalizeUrl } from './url/normalize.js';
8
- import { matchesAnyPattern } from './url/patterns.js';
9
+ import { matchesPattern, matchesAnyPattern } from './url/patterns.js';
9
10
  import { discoverLinks } from './extract/links.js';
10
11
  import { extractText } from './extract/extract.js';
12
+ import { classifyContentType, parseJson, toText } from './extract/parse.js';
13
+ import { parseSitemap } from './extract/sitemap.js';
11
14
  import { QueueManager } from './core/queue.js';
12
15
  import { runPipeline } from './core/pipeline.js';
13
16
  import { createRetryRunner } from './core/retry.js';
17
+ import { RateLimitError } from './core/errors.js';
14
18
  import { resolveFetcher } from './fetchers/index.js';
15
19
  import { formatRecords } from './output/writers.js';
16
20
  import { loadJSON, saveJSON, deletePath, deleteUntracked } from './storage/files.js';
17
21
 
18
- const getHeader = (headers, name) => {
19
- if (!headers) return undefined;
20
- if (headers[name] !== undefined) return headers[name];
21
- const lower = name.toLowerCase();
22
- for (const key of Object.keys(headers)) {
23
- if (key.toLowerCase() === lower) return headers[key];
24
- }
25
- return undefined;
26
- };
27
-
28
- const toHtml = (data) => (typeof data === 'string' ? data : Buffer.from(data).toString('utf8'));
22
+ const sha256 = (text) => createHash('sha256').update(text).digest('hex');
29
23
 
30
24
  /**
31
- * Creates a crawler instance. Every stage is exposed as a method so callers can
32
- * run the whole pipeline (`run`) or drive individual stages and add their own
33
- * logic via hooks.
25
+ * Creates a crawler instance. Every stage is exposed as a method so callers can run the whole pipeline (`run`) or drive individual stages and add their own logic via hooks.
34
26
  *
35
27
  * @param {import('./index.js').ScraplyConfig} [userConfig]
36
28
  */
@@ -41,22 +33,55 @@ export const createCrawler = (userConfig = {}) => {
41
33
  const queue = new QueueManager({ config, logger });
42
34
  const fetcher = resolveFetcher({ config, logger });
43
35
 
36
+ // Normalized once so the start URLs match discovered (normalized) links and
37
+ // can be looked up in O(1) during filtering.
38
+ const startUrls = config.startUrls.map(normalizeUrl);
39
+ const startUrlSet = new Set(startUrls);
40
+
44
41
  let stopped = false;
45
42
  let initialized = false;
46
43
  let datasetCounter = 0;
47
44
  let processedCount = 0;
48
45
  let signalsRegistered = false;
46
+ let signalHandler = null;
47
+ /** @type {RateLimitError|null} Set when a 429 aborts the crawl; rethrown after the pool drains. */
48
+ let rateLimitError = null;
49
49
 
50
50
  const closeFetcher = async () => {
51
51
  if (fetcher.close) await fetcher.close();
52
52
  };
53
53
 
54
- const onRateLimitExit = (code) => {
55
- queue.flush();
56
- closeFetcher().finally(() => process.exit(code));
57
- };
54
+ const retryRunner = createRetryRunner({ config, logger });
55
+
56
+ // Resolves the effective per-URL config, applying the most specific matching
57
+ // `sites` entry over the top-level `allowedContentTypes` / `extract`.
58
+ const resolveEntryConfig = (url) => {
59
+ if (!config.sites.length) {
60
+ return { allowedContentTypes: config.allowedContentTypes, extract: config.extract };
61
+ }
62
+
63
+ let best = null;
64
+ let bestLen = -1;
65
+ for (const site of config.sites) {
66
+ for (const pattern of site.match) {
67
+ if (!matchesPattern(url, pattern)) continue;
68
+ const len = typeof pattern === 'string' ? pattern.length : String(pattern).length;
69
+ if (len > bestLen) {
70
+ bestLen = len;
71
+ best = site;
72
+ }
73
+ }
74
+ }
75
+
76
+ if (!best) {
77
+ return { allowedContentTypes: config.allowedContentTypes, extract: config.extract };
78
+ }
58
79
 
59
- const retryRunner = createRetryRunner({ config, logger, onRateLimitExit });
80
+ return {
81
+ allowedContentTypes: best.allowedContentTypes ?? config.allowedContentTypes,
82
+ extract: { ...config.extract, ...(best.extract ?? {}) }
83
+ };
84
+ };
60
85
 
61
86
  // --- queue lifecycle ---
62
87
 
@@ -76,9 +101,19 @@ export const createCrawler = (userConfig = {}) => {
76
101
  queue.load();
77
102
  datasetCounter = computeDatasetCounter();
78
103
 
104
+ if (config.crawl.retryErrors) {
105
+ const requeued = queue.requeueErrors();
106
+ if (requeued > 0) logger.info(`Re-queued ${requeued} previously errored URL(s) for retry.`);
107
+ }
108
+
109
+ if (config.crawl.retrySkipped) {
110
+ const requeued = queue.requeueSkipped();
111
+ if (requeued > 0) logger.info(`Re-queued ${requeued} previously skipped URL(s) for retry.`);
112
+ }
113
+
79
114
  if (queue.entries.length === 0) {
80
- logger.info(`Starting fresh with ${config.startUrls.length} start URL(s).`);
81
- queue.seed(config.startUrls.map(normalizeUrl));
115
+ logger.info(`Starting fresh with ${startUrls.length} start URL(s).`);
116
+ queue.seed(startUrls);
82
117
  return;
83
118
  }
84
119
 
@@ -88,7 +123,7 @@ export const createCrawler = (userConfig = {}) => {
88
123
  queue.reset();
89
124
  deletePath(config.storage.crawledDir);
90
125
  datasetCounter = 0;
91
- queue.seed(config.startUrls.map(normalizeUrl));
126
+ queue.seed(startUrls);
92
127
  } else {
93
128
  logger.info('All URLs already processed (resetOnComplete is false). Nothing to do.');
94
129
  }
@@ -100,22 +135,23 @@ export const createCrawler = (userConfig = {}) => {
100
135
 
101
136
  // --- stage methods ---
102
137
 
103
- /** Fetches a single URL (with retry/rate-limit policy) and returns the raw result. */
138
+ // Fetches a single URL (with retry/rate-limit policy) and returns the raw result.
104
139
  const fetchUrl = (url) => retryRunner.run(() => fetcher.fetch(normalizeUrl(url)));
105
140
 
106
- /** Extracts readable text from HTML. */
141
+ // Extracts readable text from HTML. When a URL is supplied, the matching
142
+ // per-site extract rules apply; otherwise the global extract config is used.
107
143
  const extract = (html, url = null) => ({
108
144
  url,
109
- content: extractText(html, { removeSelectors: config.extract.removeSelectors })
145
+ content: extractText(html, url ? resolveEntryConfig(url).extract : config.extract)
110
146
  });
111
147
 
112
148
  const shouldCrawl = (url) => {
113
- if (config.startUrls.some((start) => normalizeUrl(start) === url)) return true;
149
+ if (startUrlSet.has(url)) return true;
114
150
  if (matchesAnyPattern(url, config.exclude)) return false;
115
151
  return matchesAnyPattern(url, config.include);
116
152
  };
117
153
 
118
- /** Filters + normalizes URLs and adds the survivors to the queue. */
154
+ // Filters + normalizes URLs and adds the survivors to the queue.
119
155
  const enqueue = async (urls, { depth = 0, referrer = null } = {}) => {
120
156
  const list = Array.isArray(urls) ? urls : [urls];
121
157
  let added = 0;
@@ -137,15 +173,17 @@ export const createCrawler = (userConfig = {}) => {
137
173
  return added;
138
174
  };
139
175
 
176
+ // Persists a crawled record and returns its filename (relative to crawledDir).
177
+ // Only the bare name is stored in the queue so datasets stay portable.
140
178
  const saveDataset = (record) => {
141
179
  datasetCounter += 1;
142
- const filePath = path.posix.join(config.storage.crawledDir, `${datasetCounter}.json`);
143
- saveJSON(filePath, record);
144
- return filePath;
180
+ const file = `${datasetCounter}.json`;
181
+ saveJSON(path.posix.join(config.storage.crawledDir, file), record);
182
+ return file;
145
183
  };
146
184
 
147
185
  const processOne = async (entry) => {
148
- if (entry.file || entry.error) return;
186
+ if (entry.file || entry.error || entry.skipped) return;
149
187
 
150
188
  processedCount += 1;
151
189
  logger.info(`- ${processedCount}/${queue.entries.length} -> ${entry.url}`);
@@ -154,26 +192,68 @@ export const createCrawler = (userConfig = {}) => {
154
192
  const result = await retryRunner.run(() => fetcher.fetch(entry.url));
155
193
  await hooks.emit('response', result, entry);
156
194
 
157
- const contentType = getHeader(result.headers, 'content-type');
158
- if (!contentType || !config.allowedContentTypes.some((type) => contentType.includes(type))) {
159
- queue.markError(entry, { error: `Skipped content-type: ${contentType ?? 'none'}`, status: result.status });
195
+ const effective = resolveEntryConfig(entry.url);
196
+
197
+ // Fetchers return lowercased header keys (see Fetcher interface).
198
+ const contentType = result.headers?.['content-type'];
199
+ if (!contentType || !effective.allowedContentTypes.some((type) => contentType.includes(type))) {
200
+ const reason = `content-type: ${contentType ?? 'none'}`;
201
+ queue.markSkipped(entry, { reason, status: result.status });
202
+ await hooks.emit('skip', entry, { reason, status: result.status, result });
160
203
  return;
161
204
  }
162
205
 
163
- const $ = cheerio.load(toHtml(result.data));
206
+ const kind = classifyContentType(contentType);
207
+ let $ = null;
208
+ let content = '';
209
+ let data = null;
210
+
211
+ if (kind === 'html') {
212
+ $ = cheerio.load(toText(result.data));
213
+
214
+ // Discover links from the full DOM before extraction strips elements.
215
+ const links = await hooks.reduce('links', discoverLinks($, entry.url), $, entry, result);
216
+ await enqueue(links, { depth: entry.depth + 1, referrer: entry.url });
217
+
218
+ content = extractText($, effective.extract);
219
+ } else if (kind === 'json' && effective.extract.json !== false) {
220
+ const parsed = parseJson(result.data);
221
+ data = parsed.data;
222
+ content = parsed.content;
223
+
224
+ const links = await hooks.reduce('links', [], $, entry, result);
225
+ if (links?.length) await enqueue(links, { depth: entry.depth + 1, referrer: entry.url });
226
+ } else {
227
+ content = toText(result.data);
228
+
229
+ const links = await hooks.reduce('links', [], $, entry, result);
230
+ if (links?.length) await enqueue(links, { depth: entry.depth + 1, referrer: entry.url });
231
+ }
232
+
233
+ content = await hooks.reduce('extract', content, $, entry, result);
164
234
 
165
- // Discover links from the full DOM before extraction strips elements.
166
- await enqueue(discoverLinks($, entry.url), { depth: entry.depth + 1, referrer: entry.url });
235
+ let record = { url: entry.url, content, crawledAt: new Date().toISOString() };
236
+ if (data !== null) record.data = data;
167
237
 
168
- let content = extractText($, { removeSelectors: config.extract.removeSelectors });
169
- content = await hooks.reduce('extract', content, $, entry);
238
+ // Transform runs BEFORE the record is persisted so its result is what gets
239
+ // saved to disk and later picked up by format().
240
+ record = await hooks.reduce('transform', record, entry, result);
241
+ record.hash = sha256(record.content ?? '');
170
242
 
171
- const file = saveDataset({ url: entry.url, content });
243
+ const file = saveDataset(record);
172
244
  queue.markDone(entry, { file, status: result.status });
173
245
 
174
- const record = await hooks.reduce('transform', { url: entry.url, content }, entry);
175
- await hooks.emit('page', record, entry);
246
+ await hooks.emit('page', record, entry, result);
176
247
  } catch (error) {
248
+ // A 429 with exitOnLimit aborts the whole crawl: stash the error, stop the
249
+ // pool and leave the entry pending so the next run retries it.
250
+ if (error instanceof RateLimitError) {
251
+ rateLimitError = error;
252
+ stopped = true;
253
+ queue.flush();
254
+ return;
255
+ }
256
+
177
257
  queue.markError(entry, { error: error.message, status: error.response?.status });
178
258
  await hooks.emit('error', error, entry);
179
259
  logger.error(`Failed to fetch ${entry.url} -> ${error.message}`);
@@ -181,75 +261,140 @@ export const createCrawler = (userConfig = {}) => {
181
261
  };
182
262
 
183
263
  const logBanner = () => {
264
+ const browserLine =
265
+ fetcher.name === 'browser' ? `\n - Browser waitUntil: ${config.browser.waitUntil}` : '';
266
+
184
267
  logger.info(`STARTING SCRAPLY CRAWLER...
185
268
  - Start URLs: ${config.startUrls.join(', ')}
186
- - Fetcher: ${fetcher.name}
269
+ - Fetcher: ${fetcher.name}${browserLine}
187
270
  - Concurrency: ${config.crawl.concurrency}
188
271
  - Per-host delay: ${config.crawl.delay}ms
189
272
  - Max depth: ${config.crawl.maxDepth}
273
+ - Max pages: ${config.crawl.maxPages}
190
274
  - Allowed content types: ${config.allowedContentTypes.join(', ')}
191
275
  - Output format: ${config.output.format}
192
276
  `);
193
277
  };
194
278
 
195
279
  const registerSignals = () => {
196
- if (signalsRegistered) return;
280
+ if (!config.signals || signalsRegistered) return;
197
281
  signalsRegistered = true;
198
282
 
199
- const handler = async () => {
200
- logger.warn('Received termination signal. Saving progress...');
283
+ let forcing = false;
284
+ signalHandler = () => {
285
+ if (forcing) {
286
+ logger.warn('Received second termination signal. Forcing quit.');
287
+ process.exit(1);
288
+ }
289
+ forcing = true;
290
+ logger.warn('Received termination signal. Finishing in-flight work... (signal again to force quit)');
201
291
  stopped = true;
202
292
  queue.flush();
203
- await closeFetcher();
204
- process.exit(0);
205
293
  };
206
294
 
207
- process.once('SIGINT', handler);
208
- process.once('SIGTERM', handler);
295
+ process.on('SIGINT', signalHandler);
296
+ process.on('SIGTERM', signalHandler);
209
297
  };
210
298
 
211
- /** Crawls until the queue is drained (or `stop()` is called). */
299
+ const unregisterSignals = () => {
300
+ if (!signalHandler) return;
301
+ process.off('SIGINT', signalHandler);
302
+ process.off('SIGTERM', signalHandler);
303
+ signalHandler = null;
304
+ signalsRegistered = false;
305
+ };
306
+
307
+ // Seeds URLs from sitemap(s) when crawl.sitemap is enabled. Recurses into
308
+ // sitemap indexes (bounded) and routes discovered URLs through enqueue() so
309
+ // include/exclude rules still apply.
310
+ const seedSitemaps = async () => {
311
+ const cfg = config.crawl.sitemap;
312
+ if (!cfg) return;
313
+
314
+ const roots = Array.isArray(cfg)
315
+ ? cfg
316
+ : startUrls.map((url) => new URL('/sitemap.xml', url).href);
317
+
318
+ const seen = new Set();
319
+ let added = 0;
320
+
321
+ const visit = async (url, depth) => {
322
+ if (depth > 5 || seen.has(url)) return;
323
+ seen.add(url);
324
+
325
+ try {
326
+ const result = await retryRunner.run(() => fetcher.fetch(url));
327
+ const { sitemaps, urls } = parseSitemap(toText(result.data));
328
+ added += await enqueue(urls, { depth: 0, referrer: url });
329
+ for (const nested of sitemaps) await visit(nested, depth + 1);
330
+ } catch (error) {
331
+ if (error instanceof RateLimitError) throw error;
332
+ logger.warn(`Sitemap fetch failed (${url}) -> ${error.message}`);
333
+ }
334
+ };
335
+
336
+ for (const url of roots) await visit(url, 0);
337
+ if (added > 0) logger.info(`Seeded ${added} URL(s) from sitemap(s).`);
338
+ };
339
+
340
+ // Crawls until the queue is drained (or `stop()` is called).
212
341
  const crawl = async () => {
213
342
  init();
214
343
  logBanner();
215
344
  registerSignals();
345
+ rateLimitError = null;
216
346
 
217
- if (fetcher.init) await fetcher.init();
218
- processedCount = queue.crawledCount() + queue.errorCount();
347
+ try {
348
+ if (fetcher.init) await fetcher.init();
349
+ await seedSitemaps();
350
+ processedCount = queue.crawledCount() + queue.errorCount() + queue.skippedCount();
351
+
352
+ await runPipeline({
353
+ queue,
354
+ concurrency: config.crawl.concurrency,
355
+ perHostDelay: config.crawl.delay,
356
+ processOne,
357
+ isStopped: () => stopped || queue.crawledCount() >= config.crawl.maxPages
358
+ });
219
359
 
220
- await runPipeline({
221
- queue,
222
- concurrency: config.crawl.concurrency,
223
- perHostDelay: config.crawl.delay,
224
- processOne,
225
- isStopped: () => stopped
226
- });
360
+ queue.flush();
227
361
 
228
- queue.flush();
229
- logger.info(
230
- `Crawling completed! ${queue.crawledCount()} of ${queue.entries.length} ` +
231
- `(${queue.entries.length - queue.crawledCount()} not crawled, ${queue.errorCount()} errors)`
232
- );
362
+ // A rate-limit abort surfaces here so run() can clean up (flush + close)
363
+ // before the error propagates to the caller.
364
+ if (rateLimitError) throw rateLimitError;
365
+
366
+ if (config.crawl.maxPages !== Infinity && queue.crawledCount() >= config.crawl.maxPages) {
367
+ logger.info(`Reached maxPages limit (${config.crawl.maxPages}).`);
368
+ }
369
+
370
+ logger.info(
371
+ `Crawling completed! ${queue.crawledCount()} crawled, ${queue.skippedCount()} skipped, ` +
372
+ `${queue.errorCount()} errors, ${queue.pendingCount()} pending (of ${queue.entries.length} total).`
373
+ );
374
+ } finally {
375
+ unregisterSignals();
376
+ }
233
377
  };
234
378
 
235
- /** Re-reads crawled pages from disk so resumed runs include earlier sessions. */
379
+ // Re-reads crawled pages from disk so resumed runs include earlier sessions.
380
+ // The full saved record is returned (including any `transform` additions and
381
+ // `data` for JSON sources); the output writer decides what to serialize.
236
382
  const collectRecords = () => {
237
383
  const records = [];
238
384
  for (const entry of queue.entries) {
239
- if (!entry.file || entry.error) continue;
240
- const data = loadJSON(entry.file, null);
241
- if (data) records.push({ url: entry.url, content: data.content });
385
+ if (!entry.file) continue;
386
+ const record = loadJSON(path.posix.join(config.storage.crawledDir, entry.file), null);
387
+ if (record) records.push(record);
242
388
  }
243
389
  return records;
244
390
  };
245
391
 
246
- /**
247
- * Routes records to their output files and writes them. Defaults to every
248
- * successfully crawled page; pass an explicit array to format custom records.
249
- */
392
+ // Routes records to their output files and writes them. Defaults to every successfully crawled page; pass an explicit array to format custom records. When reading from disk, reloads `dataset/queue.json` first so this can run without calling `crawl()` (e.g. after changing `output.routes`).
250
393
  const format = async (records = null) => {
251
394
  logger.info('Formatting data...');
252
395
 
396
+ if (records === null) queue.load();
397
+
253
398
  const collected = records ?? collectRecords();
254
399
  const groups = formatRecords(collected, {
255
400
  output: config.output,
@@ -269,7 +414,7 @@ export const createCrawler = (userConfig = {}) => {
269
414
  return groups;
270
415
  };
271
416
 
272
- /** Full pipeline: init -> crawl -> format, with guaranteed cleanup. */
417
+ // Full pipeline: init -> crawl -> format, with guaranteed cleanup.
273
418
  const run = async () => {
274
419
  try {
275
420
  await crawl();
@@ -292,11 +437,55 @@ export const createCrawler = (userConfig = {}) => {
292
437
  crawl,
293
438
  format,
294
439
  run,
440
+ // Clears errored entries and returns them to the queue so a later crawl()
441
+ // retries them. Persists immediately; returns how many were requeued.
442
+ requeueErrors: () => {
443
+ if (queue.entries.length === 0) queue.load();
444
+ return queue.requeueErrors();
445
+ },
446
+ // Same as requeueErrors() but for skipped entries (e.g. after widening
447
+ // allowedContentTypes or changing sites).
448
+ requeueSkipped: () => {
449
+ if (queue.entries.length === 0) queue.load();
450
+ return queue.requeueSkipped();
451
+ },
295
452
  stop: () => {
296
453
  stopped = true;
297
454
  }
298
455
  };
299
456
  };
300
457
 
301
- /** One-call convenience wrapper: create a crawler and run the full pipeline. */
458
+ // One-call convenience wrapper: create a crawler and run the full pipeline.
302
459
  export const scraply = (userConfig = {}) => createCrawler(userConfig).run();
460
+
461
+ /**
462
+ * Runs multiple crawlers in one process. Accepts crawler instances or plain
463
+ * config objects (which are turned into crawlers). Because the crawler no longer
464
+ * calls `process.exit` except when a second termination signal forces a quit, several
465
+ * crawlers can safely share one process — set `signals: false` in each config (or rely on the per-instance graceful stop).
466
+ *
467
+ * @param {Array<import('./index.js').ScraplyConfig | ReturnType<typeof createCrawler>>} items
468
+ * @param {{ concurrency?: number }} [options] - how many crawlers run at once (default 1 = sequential)
469
+ * @returns {Promise<Array<import('./core/queue.js').QueueEntry[]>>} each crawler's final queue entries, in input order
470
+ */
471
+ export const runCrawlers = async (items, { concurrency = 1 } = {}) => {
472
+ const instances = items.map((item) =>
473
+ item && typeof item.run === 'function' ? item : createCrawler(item)
474
+ );
475
+
476
+ const results = new Array(instances.length);
477
+ let cursor = 0;
478
+
479
+ const worker = async () => {
480
+ for (;;) {
481
+ const index = cursor++;
482
+ if (index >= instances.length) return;
483
+ results[index] = await instances[index].run();
484
+ }
485
+ };
486
+
487
+ const poolSize = Math.max(1, Math.min(concurrency, instances.length || 1));
488
+ await Promise.all(Array.from({ length: poolSize }, () => worker()));
489
+
490
+ return results;
491
+ };
@@ -20,18 +20,32 @@ const collectText = ($, element) => {
20
20
  * Extracts readable text from an HTML document. Cheerio decodes HTML entities
21
21
  * for us, so no separate decoder dependency is needed.
22
22
  *
23
+ * `root` allow-lists the container(s) to read from (a selector or array of
24
+ * selectors); when it matches nothing — or is null — extraction falls back to
25
+ * `rootFallback` (default `<body>`). `removeSelectors` are stripped from the
26
+ * whole document before the root is resolved, not just from within the root.
27
+ *
23
28
  * @param {string|import('cheerio').CheerioAPI} input - raw HTML or a loaded Cheerio instance
24
- * @param {{ removeSelectors?: string[] }} [options]
29
+ * @param {{ removeSelectors?: string[], root?: string|string[]|null, rootFallback?: string }} [options]
25
30
  * @returns {string}
26
31
  */
27
32
  export const extractText = (input, options = {}) => {
28
- const { removeSelectors = [] } = options;
33
+ const { removeSelectors = [], root = null, rootFallback = 'body' } = options;
29
34
  const $ = typeof input === 'string' ? cheerio.load(input) : input;
30
35
 
31
36
  if (removeSelectors.length) $(removeSelectors.join(',')).remove();
32
37
  $('*').contents().filter((_, node) => node.type === 'comment').remove();
33
38
 
34
- return collectText($, $('body'))
39
+ const rootSelector = Array.isArray(root) ? root.join(',') : root;
40
+ let $root = rootSelector ? $(rootSelector) : $(rootFallback || 'body');
41
+ if ($root.length === 0) $root = $(rootFallback || 'body');
42
+
43
+ let text = '';
44
+ $root.each((_, element) => {
45
+ text += `${collectText($, $(element))} `;
46
+ });
47
+
48
+ return text
35
49
  .replace(/\n/g, ' ')
36
50
  .replace(/\\['"\\]/g, (match) => match.slice(1))
37
51
  .replace(WHITESPACE_CHARS, ' ')
@@ -1,11 +1,11 @@
1
1
  import { URL } from 'node:url';
2
- import { normalizeUrl } from '../url/normalize.js';
3
2
 
4
3
  const NON_NAVIGATIONAL = /^(mailto:|tel:|javascript:|data:)/i;
5
4
 
6
5
  /**
7
- * Collects unique, normalized links from anchor tags in a document. No
8
- * include/exclude filtering happens here; that is the crawler's job.
6
+ * Collects unique, absolute links from anchor tags in a document, resolving
7
+ * relative hrefs against `baseUrl`. Normalization and include/exclude filtering
8
+ * are the crawler's job (`enqueue`), so links are only resolved here.
9
9
  *
10
10
  * @param {import('cheerio').CheerioAPI} $
11
11
  * @param {string} baseUrl - used to resolve relative hrefs
@@ -19,7 +19,7 @@ export const discoverLinks = ($, baseUrl) => {
19
19
  if (!href || href.startsWith('#') || NON_NAVIGATIONAL.test(href)) return;
20
20
 
21
21
  try {
22
- links.add(normalizeUrl(new URL(href, baseUrl).toString()));
22
+ links.add(new URL(href, baseUrl).href);
23
23
  } catch {
24
24
  // Ignore malformed hrefs.
25
25
  }
@@ -0,0 +1,35 @@
1
+ /** Coerces a fetcher body (string or binary) to a UTF-8 string. */
2
+ export const toText = (data) => (typeof data === 'string' ? data : Buffer.from(data).toString('utf8'));
3
+
4
+ /**
5
+ * Buckets a Content-Type into the kind of body Scraply knows how to handle.
6
+ * Anything containing "json" is JSON, anything containing "html" (incl.
7
+ * application/xhtml+xml) is HTML, everything else is treated as raw text.
8
+ *
9
+ * @param {string} [contentType]
10
+ * @returns {'html'|'json'|'text'}
11
+ */
12
+ export const classifyContentType = (contentType = '') => {
13
+ const value = String(contentType).toLowerCase();
14
+ if (value.includes('json')) return 'json';
15
+ if (value.includes('html')) return 'html';
16
+ return 'text';
17
+ };
18
+
19
+ /**
20
+ * Parses a JSON body. Returns the parsed value plus a pretty-printed string for
21
+ * the record `content`. Falls back to the raw text when the body is not valid
22
+ * JSON (so a mislabeled response is never lost).
23
+ *
24
+ * @param {string|ArrayBuffer} data
25
+ * @returns {{ data: unknown, content: string }}
26
+ */
27
+ export const parseJson = (data) => {
28
+ const text = toText(data);
29
+ try {
30
+ const parsed = JSON.parse(text);
31
+ return { data: parsed, content: JSON.stringify(parsed, null, 2) };
32
+ } catch {
33
+ return { data: null, content: text };
34
+ }
35
+ };
@@ -0,0 +1,35 @@
1
+ import * as cheerio from 'cheerio';
2
+
3
+ /**
4
+ * Parses an XML sitemap or sitemap index. Returns nested `sitemaps` (from a
5
+ * `<sitemapindex>`) and page `urls` (from a `<urlset>`) separately so the
6
+ * crawler can enqueue pages and recurse into nested indexes independently.
7
+ *
8
+ * @param {string} xml
9
+ * @returns {{ sitemaps: string[], urls: string[] }}
10
+ */
11
+ export const parseSitemap = (xml) => {
12
+ const $ = cheerio.load(xml, { xmlMode: true });
13
+ const sitemaps = [];
14
+ const urls = [];
15
+
16
+ $('sitemap > loc').each((_, el) => {
17
+ const value = $(el).text().trim();
18
+ if (value) sitemaps.push(value);
19
+ });
20
+
21
+ $('url > loc').each((_, el) => {
22
+ const value = $(el).text().trim();
23
+ if (value) urls.push(value);
24
+ });
25
+
26
+ // Fallback for sitemaps that omit the standard wrapping elements.
27
+ if (sitemaps.length === 0 && urls.length === 0) {
28
+ $('loc').each((_, el) => {
29
+ const value = $(el).text().trim();
30
+ if (value) urls.push(value);
31
+ });
32
+ }
33
+
34
+ return { sitemaps, urls };
35
+ };