scraply 2.0.1 → 2.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/crawler.js CHANGED
@@ -6,18 +6,19 @@ import { loadConfig } from './config/load.js';
6
6
  import { createLogger } from './util/logger.js';
7
7
  import { createHooks } from './util/hooks.js';
8
8
  import { normalizeUrl } from './url/normalize.js';
9
- import { matchesAnyPattern } from './url/patterns.js';
9
+ import { matchesPattern, matchesAnyPattern } from './url/patterns.js';
10
10
  import { discoverLinks } from './extract/links.js';
11
11
  import { extractText } from './extract/extract.js';
12
+ import { classifyContentType, parseJson, toText } from './extract/parse.js';
13
+ import { parseSitemap } from './extract/sitemap.js';
12
14
  import { QueueManager } from './core/queue.js';
13
15
  import { runPipeline } from './core/pipeline.js';
14
16
  import { createRetryRunner } from './core/retry.js';
17
+ import { RateLimitError } from './core/errors.js';
15
18
  import { resolveFetcher } from './fetchers/index.js';
16
19
  import { formatRecords } from './output/writers.js';
17
20
  import { loadJSON, saveJSON, deletePath, deleteUntracked } from './storage/files.js';
18
21
 
19
- const toHtml = (data) => (typeof data === 'string' ? data : Buffer.from(data).toString('utf8'));
20
-
21
22
  const sha256 = (text) => createHash('sha256').update(text).digest('hex');
22
23
 
23
24
  /**
@@ -42,17 +43,45 @@ export const createCrawler = (userConfig = {}) => {
42
43
  let datasetCounter = 0;
43
44
  let processedCount = 0;
44
45
  let signalsRegistered = false;
46
+ let signalHandler = null;
47
+ /** @type {RateLimitError|null} Set when a 429 aborts the crawl; rethrown after the pool drains. */
48
+ let rateLimitError = null;
45
49
 
46
50
  const closeFetcher = async () => {
47
51
  if (fetcher.close) await fetcher.close();
48
52
  };
49
53
 
50
- const onRateLimitExit = (code) => {
51
- queue.flush();
52
- closeFetcher().finally(() => process.exit(code));
53
- };
54
+ const retryRunner = createRetryRunner({ config, logger });
55
+
56
// Computes the effective { allowedContentTypes, extract } for one URL.
// Among all `sites` entries with a pattern matching the URL, the entry owning
// the longest matching pattern (by string length) wins; on a tie the earliest
// one is kept. Without a match the top-level settings are returned as-is.
const resolveEntryConfig = (url) => {
  const globalConfig = () => ({
    allowedContentTypes: config.allowedContentTypes,
    extract: config.extract
  });

  if (!config.sites.length) return globalConfig();

  let winner = null;
  let winnerLength = -1;
  for (const site of config.sites) {
    for (const pattern of site.match) {
      if (matchesPattern(url, pattern)) {
        // String() covers both string patterns and non-string ones (e.g. RegExp).
        const patternLength = String(pattern).length;
        if (patternLength > winnerLength) {
          winner = site;
          winnerLength = patternLength;
        }
      }
    }
  }

  if (!winner) return globalConfig();

  // Site-level extract keys are layered over the global extract config.
  return {
    allowedContentTypes: winner.allowedContentTypes ?? config.allowedContentTypes,
    extract: { ...config.extract, ...winner.extract }
  };
};
56
85
 
57
86
  // --- queue lifecycle ---
58
87
 
@@ -77,6 +106,11 @@ export const createCrawler = (userConfig = {}) => {
77
106
  if (requeued > 0) logger.info(`Re-queued ${requeued} previously errored URL(s) for retry.`);
78
107
  }
79
108
 
109
+ if (config.crawl.retrySkipped) {
110
+ const requeued = queue.requeueSkipped();
111
+ if (requeued > 0) logger.info(`Re-queued ${requeued} previously skipped URL(s) for retry.`);
112
+ }
113
+
80
114
  if (queue.entries.length === 0) {
81
115
  logger.info(`Starting fresh with ${startUrls.length} start URL(s).`);
82
116
  queue.seed(startUrls);
@@ -104,10 +138,11 @@ export const createCrawler = (userConfig = {}) => {
104
138
  // Fetches a single URL (with retry/rate-limit policy) and returns the raw result.
105
139
  const fetchUrl = (url) => retryRunner.run(() => fetcher.fetch(normalizeUrl(url)));
106
140
 
107
- // Extracts readable text from HTML.
141
// Extracts readable text from HTML and pairs it with the URL it came from.
// With a URL, the extract rules resolved for that URL are used; without one
// (url = null) the global extract config applies.
const extract = (html, url = null) => {
  const extractOptions = url ? resolveEntryConfig(url).extract : config.extract;
  return { url, content: extractText(html, extractOptions) };
};
112
147
 
113
148
  const shouldCrawl = (url) => {
@@ -157,40 +192,69 @@ export const createCrawler = (userConfig = {}) => {
157
192
  const result = await retryRunner.run(() => fetcher.fetch(entry.url));
158
193
  await hooks.emit('response', result, entry);
159
194
 
195
+ const effective = resolveEntryConfig(entry.url);
196
+
160
197
  // Fetchers return lowercased header keys (see Fetcher interface).
161
198
  const contentType = result.headers?.['content-type'];
162
- if (!contentType || !config.allowedContentTypes.some((type) => contentType.includes(type))) {
163
- queue.markSkipped(entry, { reason: `content-type: ${contentType ?? 'none'}`, status: result.status });
199
+ if (!contentType || !effective.allowedContentTypes.some((type) => contentType.includes(type))) {
200
+ const reason = `content-type: ${contentType ?? 'none'}`;
201
+ queue.markSkipped(entry, { reason, status: result.status });
202
+ await hooks.emit('skip', entry, { reason, status: result.status, result });
164
203
  return;
165
204
  }
166
205
 
167
- const $ = cheerio.load(toHtml(result.data));
206
+ const kind = classifyContentType(contentType);
207
+ let $ = null;
208
+ let content = '';
209
+ let data = null;
168
210
 
169
- // Discover links from the full DOM before extraction strips elements.
170
- await enqueue(discoverLinks($, entry.url), { depth: entry.depth + 1, referrer: entry.url });
211
+ if (kind === 'html') {
212
+ $ = cheerio.load(toText(result.data));
171
213
 
172
- let content = extractText($, { removeSelectors: config.extract.removeSelectors });
173
- content = await hooks.reduce('extract', content, $, entry);
214
+ // Discover links from the full DOM before extraction strips elements.
215
+ const links = await hooks.reduce('links', discoverLinks($, entry.url), $, entry, result);
216
+ await enqueue(links, { depth: entry.depth + 1, referrer: entry.url });
174
217
 
175
- const record = {
176
- url: entry.url,
177
- content,
178
- crawledAt: new Date().toISOString(),
179
- hash: sha256(content)
180
- };
218
+ content = extractText($, effective.extract);
219
+ } else if (kind === 'json' && effective.extract.json !== false) {
220
+ const parsed = parseJson(result.data);
221
+ data = parsed.data;
222
+ content = parsed.content;
223
+
224
+ const links = await hooks.reduce('links', [], $, entry, result);
225
+ if (links?.length) await enqueue(links, { depth: entry.depth + 1, referrer: entry.url });
226
+ } else {
227
+ content = toText(result.data);
228
+
229
+ const links = await hooks.reduce('links', [], $, entry, result);
230
+ if (links?.length) await enqueue(links, { depth: entry.depth + 1, referrer: entry.url });
231
+ }
232
+
233
+ content = await hooks.reduce('extract', content, $, entry, result);
234
+
235
+ let record = { url: entry.url, content, crawledAt: new Date().toISOString() };
236
+ if (data !== null) record.data = data;
237
+
238
+ // Transform runs BEFORE the record is persisted so its result is what gets
239
+ // saved to disk and later picked up by format().
240
+ record = await hooks.reduce('transform', record, entry, result);
241
+ record.hash = sha256(record.content ?? '');
181
242
 
182
243
  const file = saveDataset(record);
183
244
  queue.markDone(entry, { file, status: result.status });
184
245
 
185
- const transformed = await hooks.reduce('transform', record, entry);
186
- await hooks.emit('page', transformed, entry);
246
+ await hooks.emit('page', record, entry, result);
187
247
  } catch (error) {
188
- // A 429 only reaches here when rateLimit.exitOnLimit is true and the
189
- // process is already exiting; leave the entry pending so the next run
190
- // retries it instead of recording a permanent error.
191
- if (error.response?.status !== 429) {
192
- queue.markError(entry, { error: error.message, status: error.response?.status });
248
+ // A 429 with exitOnLimit aborts the whole crawl: stash the error, stop the
249
+ // pool and leave the entry pending so the next run retries it.
250
+ if (error instanceof RateLimitError) {
251
+ rateLimitError = error;
252
+ stopped = true;
253
+ queue.flush();
254
+ return;
193
255
  }
256
+
257
+ queue.markError(entry, { error: error.message, status: error.response?.status });
194
258
  await hooks.emit('error', error, entry);
195
259
  logger.error(`Failed to fetch ${entry.url} -> ${error.message}`);
196
260
  }
@@ -213,19 +277,64 @@ export const createCrawler = (userConfig = {}) => {
213
277
  };
214
278
 
215
279
  const registerSignals = () => {
216
- if (signalsRegistered) return;
280
+ if (!config.signals || signalsRegistered) return;
217
281
  signalsRegistered = true;
218
282
 
219
- const handler = async () => {
220
- logger.warn('Received termination signal. Saving progress...');
283
+ let forcing = false;
284
+ signalHandler = () => {
285
+ if (forcing) {
286
+ logger.warn('Received second termination signal. Forcing quit.');
287
+ process.exit(1);
288
+ }
289
+ forcing = true;
290
+ logger.warn('Received termination signal. Finishing in-flight work... (signal again to force quit)');
221
291
  stopped = true;
222
292
  queue.flush();
223
- await closeFetcher();
224
- process.exit(0);
225
293
  };
226
294
 
227
- process.once('SIGINT', handler);
228
- process.once('SIGTERM', handler);
295
+ process.on('SIGINT', signalHandler);
296
+ process.on('SIGTERM', signalHandler);
297
+ };
298
+
299
// Detaches the SIGINT/SIGTERM handler installed earlier (if any) and resets
// the registration state so a later run can install a fresh one.
const unregisterSignals = () => {
  const handler = signalHandler;
  if (!handler) return;

  for (const signal of ['SIGINT', 'SIGTERM']) {
    process.off(signal, handler);
  }

  signalHandler = null;
  signalsRegistered = false;
};
306
+
307
// Seeds the queue from sitemap(s) when `crawl.sitemap` is enabled. An array
// value is used as the list of sitemap URLs; any other truthy value derives
// `/sitemap.xml` from each start URL. Sitemap indexes are followed depth-first
// (nesting levels above 5 are ignored, each sitemap URL is fetched once) and
// page URLs go through enqueue() so include/exclude rules still apply.
// A RateLimitError propagates; any other failure is logged and skipped.
const seedSitemaps = async () => {
  const sitemapSetting = config.crawl.sitemap;
  if (!sitemapSetting) return;

  const MAX_INDEX_DEPTH = 5;
  const rootUrls = Array.isArray(sitemapSetting)
    ? sitemapSetting
    : startUrls.map((startUrl) => new URL('/sitemap.xml', startUrl).href);

  const visited = new Set();
  // NOTE(review): assumes enqueue() resolves to the number of URLs it added — confirm.
  let seededCount = 0;

  for (const rootUrl of rootUrls) {
    // Explicit stack; children are pushed in reverse so they pop in document
    // order, giving the same pre-order walk as a recursive visit.
    const pending = [{ url: rootUrl, depth: 0 }];

    while (pending.length > 0) {
      const { url, depth } = pending.pop();
      if (depth > MAX_INDEX_DEPTH || visited.has(url)) continue;
      visited.add(url);

      try {
        const response = await retryRunner.run(() => fetcher.fetch(url));
        const parsed = parseSitemap(toText(response.data));
        seededCount += await enqueue(parsed.urls, { depth: 0, referrer: url });

        for (let i = parsed.sitemaps.length - 1; i >= 0; i -= 1) {
          pending.push({ url: parsed.sitemaps[i], depth: depth + 1 });
        }
      } catch (error) {
        if (error instanceof RateLimitError) throw error;
        logger.warn(`Sitemap fetch failed (${url}) -> ${error.message}`);
      }
    }
  }

  if (seededCount > 0) logger.info(`Seeded ${seededCount} URL(s) from sitemap(s).`);
};
230
339
 
231
340
  // Crawls until the queue is drained (or `stop()` is called).
@@ -233,37 +342,49 @@ export const createCrawler = (userConfig = {}) => {
233
342
  init();
234
343
  logBanner();
235
344
  registerSignals();
345
+ rateLimitError = null;
236
346
 
237
- if (fetcher.init) await fetcher.init();
238
- processedCount = queue.crawledCount() + queue.errorCount() + queue.skippedCount();
347
+ try {
348
+ if (fetcher.init) await fetcher.init();
349
+ await seedSitemaps();
350
+ processedCount = queue.crawledCount() + queue.errorCount() + queue.skippedCount();
351
+
352
+ await runPipeline({
353
+ queue,
354
+ concurrency: config.crawl.concurrency,
355
+ perHostDelay: config.crawl.delay,
356
+ processOne,
357
+ isStopped: () => stopped || queue.crawledCount() >= config.crawl.maxPages
358
+ });
239
359
 
240
- await runPipeline({
241
- queue,
242
- concurrency: config.crawl.concurrency,
243
- perHostDelay: config.crawl.delay,
244
- processOne,
245
- isStopped: () => stopped || queue.crawledCount() >= config.crawl.maxPages
246
- });
360
+ queue.flush();
247
361
 
248
- queue.flush();
362
+ // A rate-limit abort surfaces here so run() can clean up (flush + close)
363
+ // before the error propagates to the caller.
364
+ if (rateLimitError) throw rateLimitError;
249
365
 
250
- if (config.crawl.maxPages !== Infinity && queue.crawledCount() >= config.crawl.maxPages) {
251
- logger.info(`Reached maxPages limit (${config.crawl.maxPages}).`);
252
- }
366
+ if (config.crawl.maxPages !== Infinity && queue.crawledCount() >= config.crawl.maxPages) {
367
+ logger.info(`Reached maxPages limit (${config.crawl.maxPages}).`);
368
+ }
253
369
 
254
- logger.info(
255
- `Crawling completed! ${queue.crawledCount()} crawled, ${queue.skippedCount()} skipped, ` +
256
- `${queue.errorCount()} errors, ${queue.pendingCount()} pending (of ${queue.entries.length} total).`
257
- );
370
+ logger.info(
371
+ `Crawling completed! ${queue.crawledCount()} crawled, ${queue.skippedCount()} skipped, ` +
372
+ `${queue.errorCount()} errors, ${queue.pendingCount()} pending (of ${queue.entries.length} total).`
373
+ );
374
+ } finally {
375
+ unregisterSignals();
376
+ }
258
377
  };
259
378
 
260
379
  // Re-reads crawled pages from disk so resumed runs include earlier sessions.
380
+ // The full saved record is returned (including any `transform` additions and
381
+ // `data` for JSON sources); the output writer decides what to serialize.
261
382
  const collectRecords = () => {
262
383
  const records = [];
263
384
  for (const entry of queue.entries) {
264
385
  if (!entry.file) continue;
265
- const data = loadJSON(path.posix.join(config.storage.crawledDir, entry.file), null);
266
- if (data) records.push({ url: entry.url, content: data.content });
386
+ const record = loadJSON(path.posix.join(config.storage.crawledDir, entry.file), null);
387
+ if (record) records.push(record);
267
388
  }
268
389
  return records;
269
390
  };
@@ -322,6 +443,12 @@ export const createCrawler = (userConfig = {}) => {
322
443
  if (queue.entries.length === 0) queue.load();
323
444
  return queue.requeueErrors();
324
445
  },
446
+ // Same as requeueErrors() but for skipped entries (e.g. after widening
447
+ // allowedContentTypes or changing sites).
448
+ requeueSkipped: () => {
449
+ if (queue.entries.length === 0) queue.load();
450
+ return queue.requeueSkipped();
451
+ },
325
452
  stop: () => {
326
453
  stopped = true;
327
454
  }
@@ -330,3 +457,35 @@ export const createCrawler = (userConfig = {}) => {
330
457
 
331
458
  // One-call convenience wrapper: create a crawler and run the full pipeline.
332
459
  export const scraply = (userConfig = {}) => createCrawler(userConfig).run();
460
+
461
/**
 * Runs multiple crawlers in one process. Accepts crawler instances or plain
 * config objects (which are turned into crawlers). Because the crawler no longer
 * calls `process.exit`, several crawlers can safely share one process — set
 * `signals: false` in each config (or rely on the per-instance graceful stop).
 *
 * If a crawler's `run()` fails, no further crawlers are started, crawlers that
 * are already running are awaited, and the first error is then rethrown — so
 * no crawler is left running after this function settles.
 *
 * @param {Array<import('./index.js').ScraplyConfig | ReturnType<typeof createCrawler>>} items
 * @param {{ concurrency?: number }} [options] - how many crawlers run at once
 *   (default 1 = sequential). Clamped to `1..items.length`; a value that is not
 *   a number falls back to 1.
 * @returns {Promise<Array<import('./core/queue.js').QueueEntry[]>>} the value
 *   each crawler's `run()` resolved with, in input order.
 *   NOTE(review): documented upstream as each crawler's final queue entries —
 *   confirm that `run()` actually resolves with them.
 * @throws the first error raised by any crawler's `run()`
 */
export const runCrawlers = async (items, { concurrency = 1 } = {}) => {
  const instances = items.map((item) =>
    item && typeof item.run === 'function' ? item : createCrawler(item)
  );

  // A non-numeric concurrency coerces to NaN; Array.from({ length: NaN }) would
  // then create zero workers and nothing would run. Fall back to sequential.
  const requested = Math.floor(Number(concurrency));
  const poolSize = Number.isNaN(requested)
    ? 1
    : Math.max(1, Math.min(requested, instances.length));

  const results = new Array(instances.length);
  let cursor = 0;
  let failed = false;
  let firstError;

  // Each worker pulls the next unclaimed index until the list is exhausted or
  // any worker has failed. Workers never reject; failures are recorded instead
  // so the pool can drain before the error is surfaced.
  const worker = async () => {
    while (!failed) {
      const index = cursor;
      cursor += 1;
      if (index >= instances.length) return;

      try {
        results[index] = await instances[index].run();
      } catch (error) {
        if (!failed) {
          failed = true;
          firstError = error;
        }
        return;
      }
    }
  };

  await Promise.all(Array.from({ length: poolSize }, () => worker()));

  if (failed) throw firstError;
  return results;
};
@@ -20,18 +20,32 @@ const collectText = ($, element) => {
20
20
  * Extracts readable text from an HTML document. Cheerio decodes HTML entities
21
21
  * for us, so no separate decoder dependency is needed.
22
22
  *
23
+ * `root` allow-lists the container(s) to read from (a selector or array of
24
+ * selectors); when it matches nothing — or is null — extraction falls back to
25
+ * `rootFallback` (default `<body>`). `removeSelectors` then strips noise from
26
+ * within the chosen root.
27
+ *
23
28
  * @param {string|import('cheerio').CheerioAPI} input - raw HTML or a loaded Cheerio instance
24
- * @param {{ removeSelectors?: string[] }} [options]
29
+ * @param {{ removeSelectors?: string[], root?: string|string[]|null, rootFallback?: string }} [options]
25
30
  * @returns {string}
26
31
  */
27
32
  export const extractText = (input, options = {}) => {
28
- const { removeSelectors = [] } = options;
33
+ const { removeSelectors = [], root = null, rootFallback = 'body' } = options;
29
34
  const $ = typeof input === 'string' ? cheerio.load(input) : input;
30
35
 
31
36
  if (removeSelectors.length) $(removeSelectors.join(',')).remove();
32
37
  $('*').contents().filter((_, node) => node.type === 'comment').remove();
33
38
 
34
- return collectText($, $('body'))
39
+ const rootSelector = Array.isArray(root) ? root.join(',') : root;
40
+ let $root = rootSelector ? $(rootSelector) : $(rootFallback || 'body');
41
+ if ($root.length === 0) $root = $(rootFallback || 'body');
42
+
43
+ let text = '';
44
+ $root.each((_, element) => {
45
+ text += `${collectText($, $(element))} `;
46
+ });
47
+
48
+ return text
35
49
  .replace(/\n/g, ' ')
36
50
  .replace(/\\['"\\]/g, (match) => match.slice(1))
37
51
  .replace(WHITESPACE_CHARS, ' ')
@@ -0,0 +1,35 @@
1
/**
 * Coerces a fetcher body to a string: strings pass through untouched, anything
 * else is wrapped in a Buffer and decoded as UTF-8.
 */
export const toText = (data) => {
  if (typeof data === 'string') return data;
  return Buffer.from(data).toString('utf8');
};
3
+
4
/**
 * Maps a Content-Type header value to the body kind Scraply handles. The check
 * is a case-insensitive substring match: "json" wins first, then "html" (which
 * also covers application/xhtml+xml); everything else is raw text.
 *
 * @param {string} [contentType]
 * @returns {'html'|'json'|'text'}
 */
export const classifyContentType = (contentType = '') => {
  const normalized = String(contentType).toLowerCase();
  // Order matters: a value containing both markers is classified as JSON.
  for (const kind of ['json', 'html']) {
    if (normalized.includes(kind)) return kind;
  }
  return 'text';
};
18
+
19
/**
 * Parses a JSON body. Returns the parsed value plus a pretty-printed string for
 * the record `content`. Falls back to the raw text when the body is not valid
 * JSON (so a mislabeled response is never lost).
 *
 * A leading byte order mark (U+FEFF) is ignored for parsing: `JSON.parse`
 * rejects it, which would otherwise push valid JSON into the raw-text fallback.
 * The fallback `content` is still the unmodified text.
 *
 * @param {string|ArrayBuffer} data
 * @returns {{ data: unknown, content: string }} `data` is null when the body
 *   could not be parsed
 */
export const parseJson = (data) => {
  const text = toText(data);
  const body = text.charCodeAt(0) === 0xfeff ? text.slice(1) : text;
  try {
    const parsed = JSON.parse(body);
    return { data: parsed, content: JSON.stringify(parsed, null, 2) };
  } catch {
    return { data: null, content: text };
  }
};
@@ -0,0 +1,35 @@
1
+ import * as cheerio from 'cheerio';
2
+
3
/**
 * Parses an XML sitemap or sitemap index. `sitemaps` holds the `<loc>` values
 * found directly under `<sitemap>` elements and `urls` those directly under
 * `<url>` elements, kept apart so the crawler can recurse into indexes before
 * enqueuing pages. When neither form yields anything, every `<loc>` in the
 * document is treated as a page URL.
 *
 * @param {string} xml
 * @returns {{ sitemaps: string[], urls: string[] }}
 */
export const parseSitemap = (xml) => {
  const $ = cheerio.load(xml, { xmlMode: true });

  // Trimmed, non-empty text of every element matching `selector`, in document order.
  const readLocs = (selector) => {
    const values = [];
    $(selector).each((_, el) => {
      const value = $(el).text().trim();
      if (value) values.push(value);
    });
    return values;
  };

  const sitemaps = readLocs('sitemap > loc');
  let urls = readLocs('url > loc');

  // Fallback for sitemaps that omit the standard wrapping elements.
  if (sitemaps.length === 0 && urls.length === 0) {
    urls = readLocs('loc');
  }

  return { sitemaps, urls };
};