npm - scraply - Versions diffs - 2.0.0 → 2.0.2 - Mend

scraply 2.0.0 → 2.0.2

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

package/package.json +7 -3
package/readme.md +149 -55
package/src/config/browser.js +37 -0
package/src/config/defaults.js +47 -11
package/src/config/load.js +57 -1
package/src/core/errors.js +23 -0
package/src/core/queue.js +83 -11
package/src/core/retry.js +34 -26
package/src/crawler.js +265 -76
package/src/extract/extract.js +17 -3
package/src/extract/links.js +4 -4
package/src/extract/parse.js +35 -0
package/src/extract/sitemap.js +35 -0
package/src/fetchers/browserFetcher.js +18 -12
package/src/fetchers/httpFetcher.js +40 -3
package/src/index.d.ts +285 -0
package/src/index.js +48 -7
package/src/output/writers.js +14 -5

package/src/crawler.js CHANGED Viewed

@@ -1,36 +1,28 @@
 import path from 'node:path';
+import { createHash } from 'node:crypto';
 import * as cheerio from 'cheerio';
 import { loadConfig } from './config/load.js';
 import { createLogger } from './util/logger.js';
 import { createHooks } from './util/hooks.js';
 import { normalizeUrl } from './url/normalize.js';
-import { matchesAnyPattern } from './url/patterns.js';
+import { matchesPattern, matchesAnyPattern } from './url/patterns.js';
 import { discoverLinks } from './extract/links.js';
 import { extractText } from './extract/extract.js';
+import { classifyContentType, parseJson, toText } from './extract/parse.js';
+import { parseSitemap } from './extract/sitemap.js';
 import { QueueManager } from './core/queue.js';
 import { runPipeline } from './core/pipeline.js';
 import { createRetryRunner } from './core/retry.js';
+import { RateLimitError } from './core/errors.js';
 import { resolveFetcher } from './fetchers/index.js';
 import { formatRecords } from './output/writers.js';
 import { loadJSON, saveJSON, deletePath, deleteUntracked } from './storage/files.js';
-const getHeader = (headers, name) => {
-  if (!headers) return undefined;
-  if (headers[name] !== undefined) return headers[name];
-  const lower = name.toLowerCase();
-  for (const key of Object.keys(headers)) {
-    if (key.toLowerCase() === lower) return headers[key];
-  }
-  return undefined;
-};
-const toHtml = (data) => (typeof data === 'string' ? data : Buffer.from(data).toString('utf8'));
+const sha256 = (text) => createHash('sha256').update(text).digest('hex'); // Hex-encoded SHA-256 digest of `text`.
 /**
- * Creates a crawler instance. Every stage is exposed as a method so callers can
- * run the whole pipeline (`run`) or drive individual stages and add their own
- * logic via hooks.
+ * Creates a crawler instance. Every stage is exposed as a method so callers can run the whole pipeline (`run`) or drive individual stages and add their own logic via hooks.
  *
  * @param {import('./index.js').ScraplyConfig} [userConfig]
  */
@@ -41,22 +33,55 @@ export const createCrawler = (userConfig = {}) => {
   const queue = new QueueManager({ config, logger });
   const fetcher = resolveFetcher({ config, logger });
+  // Normalized once so the start URLs match discovered (normalized) links and
+  // can be looked up in O(1) during filtering.
+  const startUrls = config.startUrls.map(normalizeUrl);
+  const startUrlSet = new Set(startUrls);
   let stopped = false;
   let initialized = false;
   let datasetCounter = 0;
   let processedCount = 0;
   let signalsRegistered = false;
+  let signalHandler = null;
+  /** @type {RateLimitError|null} Set when a 429 aborts the crawl; rethrown after the pool drains. */
+  let rateLimitError = null;
   const closeFetcher = async () => {
     if (fetcher.close) await fetcher.close();
   };
-  const onRateLimitExit = (code) => {
-    queue.flush();
-    closeFetcher().finally(() => process.exit(code));
-  };
+  const retryRunner = createRetryRunner({ config, logger });
+  // Resolves the effective per-URL config, applying the most specific matching
+  // `sites` entry over the top-level `allowedContentTypes` / `extract`.
+  const resolveEntryConfig = (url) => {
+    if (!config.sites.length) {
+      return { allowedContentTypes: config.allowedContentTypes, extract: config.extract };
+    }
+    let best = null;
+    let bestLen = -1;
+    for (const site of config.sites) {
+      for (const pattern of site.match) {
+        if (!matchesPattern(url, pattern)) continue;
+        const len = typeof pattern === 'string' ? pattern.length : String(pattern).length;
+        if (len > bestLen) {
+          bestLen = len;
+          best = site;
+        }
+      }
+    }
+    if (!best) {
+      return { allowedContentTypes: config.allowedContentTypes, extract: config.extract };
+    }
-  const retryRunner = createRetryRunner({ config, logger, onRateLimitExit });
+    return {
+      allowedContentTypes: best.allowedContentTypes ?? config.allowedContentTypes,
+      extract: { ...config.extract, ...(best.extract ?? {}) }
+    };
+  };
   // --- queue lifecycle ---
@@ -76,9 +101,19 @@ export const createCrawler = (userConfig = {}) => {
     queue.load();
     datasetCounter = computeDatasetCounter();
+    if (config.crawl.retryErrors) {
+      const requeued = queue.requeueErrors();
+      if (requeued > 0) logger.info(`Re-queued ${requeued} previously errored URL(s) for retry.`);
+    }
+    if (config.crawl.retrySkipped) {
+      const requeued = queue.requeueSkipped();
+      if (requeued > 0) logger.info(`Re-queued ${requeued} previously skipped URL(s) for retry.`);
+    }
     if (queue.entries.length === 0) {
-      logger.info(`Starting fresh with ${config.startUrls.length} start URL(s).`);
-      queue.seed(config.startUrls.map(normalizeUrl));
+      logger.info(`Starting fresh with ${startUrls.length} start URL(s).`);
+      queue.seed(startUrls);
       return;
     }
@@ -88,7 +123,7 @@ export const createCrawler = (userConfig = {}) => {
         queue.reset();
         deletePath(config.storage.crawledDir);
         datasetCounter = 0;
-        queue.seed(config.startUrls.map(normalizeUrl));
+        queue.seed(startUrls);
       } else {
         logger.info('All URLs already processed (resetOnComplete is false). Nothing to do.');
       }
@@ -100,22 +135,23 @@ export const createCrawler = (userConfig = {}) => {
   // --- stage methods ---
-  /** Fetches a single URL (with retry/rate-limit policy) and returns the raw result. */
+  // Fetches a single URL (with retry/rate-limit policy) and returns the raw result.
   const fetchUrl = (url) => retryRunner.run(() => fetcher.fetch(normalizeUrl(url)));
-  /** Extracts readable text from HTML. */
+  // Extracts readable text from HTML. When a URL is supplied, the matching
+  // per-site extract rules apply; otherwise the global extract config is used.
   const extract = (html, url = null) => ({
     url,
-    content: extractText(html, { removeSelectors: config.extract.removeSelectors })
+    content: extractText(html, url ? resolveEntryConfig(url).extract : config.extract)
   });
   const shouldCrawl = (url) => {
-    if (config.startUrls.some((start) => normalizeUrl(start) === url)) return true;
+    if (startUrlSet.has(url)) return true;
     if (matchesAnyPattern(url, config.exclude)) return false;
     return matchesAnyPattern(url, config.include);
   };
-  /** Filters + normalizes URLs and adds the survivors to the queue. */
+  // Filters + normalizes URLs and adds the survivors to the queue.
   const enqueue = async (urls, { depth = 0, referrer = null } = {}) => {
     const list = Array.isArray(urls) ? urls : [urls];
     let added = 0;
@@ -137,15 +173,17 @@ export const createCrawler = (userConfig = {}) => {
     return added;
   };
+  // Persists a crawled record and returns its filename (relative to crawledDir).
+  // Only the bare name is stored in the queue so datasets stay portable.
   const saveDataset = (record) => {
     datasetCounter += 1;
-    const filePath = path.posix.join(config.storage.crawledDir, `${datasetCounter}.json`);
-    saveJSON(filePath, record);
-    return filePath;
+    const file = `${datasetCounter}.json`;
+    saveJSON(path.posix.join(config.storage.crawledDir, file), record);
+    return file;
   };
   const processOne = async (entry) => {
-    if (entry.file || entry.error) return;
+    if (entry.file || entry.error || entry.skipped) return;
     processedCount += 1;
     logger.info(`- ${processedCount}/${queue.entries.length} -> ${entry.url}`);
@@ -154,26 +192,68 @@ export const createCrawler = (userConfig = {}) => {
       const result = await retryRunner.run(() => fetcher.fetch(entry.url));
       await hooks.emit('response', result, entry);
-      const contentType = getHeader(result.headers, 'content-type');
-      if (!contentType || !config.allowedContentTypes.some((type) => contentType.includes(type))) {
-        queue.markError(entry, { error: `Skipped content-type: ${contentType ?? 'none'}`, status: result.status });
+      const effective = resolveEntryConfig(entry.url);
+      // Fetchers return lowercased header keys (see Fetcher interface).
+      const contentType = result.headers?.['content-type'];
+      if (!contentType || !effective.allowedContentTypes.some((type) => contentType.includes(type))) {
+        const reason = `content-type: ${contentType ?? 'none'}`;
+        queue.markSkipped(entry, { reason, status: result.status });
+        await hooks.emit('skip', entry, { reason, status: result.status, result });
         return;
       }
-      const $ = cheerio.load(toHtml(result.data));
+      const kind = classifyContentType(contentType);
+      let $ = null;
+      let content = '';
+      let data = null;
+      if (kind === 'html') {
+        $ = cheerio.load(toText(result.data));
+        // Discover links from the full DOM before extraction strips elements.
+        const links = await hooks.reduce('links', discoverLinks($, entry.url), $, entry, result);
+        await enqueue(links, { depth: entry.depth + 1, referrer: entry.url });
+        content = extractText($, effective.extract);
+      } else if (kind === 'json' && effective.extract.json !== false) {
+        const parsed = parseJson(result.data);
+        data = parsed.data;
+        content = parsed.content;
+        const links = await hooks.reduce('links', [], $, entry, result);
+        if (links?.length) await enqueue(links, { depth: entry.depth + 1, referrer: entry.url });
+      } else {
+        content = toText(result.data);
+        const links = await hooks.reduce('links', [], $, entry, result);
+        if (links?.length) await enqueue(links, { depth: entry.depth + 1, referrer: entry.url });
+      }
+      content = await hooks.reduce('extract', content, $, entry, result);
-      // Discover links from the full DOM before extraction strips elements.
-      await enqueue(discoverLinks($, entry.url), { depth: entry.depth + 1, referrer: entry.url });
+      let record = { url: entry.url, content, crawledAt: new Date().toISOString() };
+      if (data !== null) record.data = data;
-      let content = extractText($, { removeSelectors: config.extract.removeSelectors });
-      content = await hooks.reduce('extract', content, $, entry);
+      // Transform runs BEFORE the record is persisted so its result is what gets
+      // saved to disk and later picked up by format().
+      record = await hooks.reduce('transform', record, entry, result);
+      record.hash = sha256(record.content ?? '');
-      const file = saveDataset({ url: entry.url, content });
+      const file = saveDataset(record);
       queue.markDone(entry, { file, status: result.status });
-      const record = await hooks.reduce('transform', { url: entry.url, content }, entry);
-      await hooks.emit('page', record, entry);
+      await hooks.emit('page', record, entry, result);
     } catch (error) {
+      // A 429 with exitOnLimit aborts the whole crawl: stash the error, stop the
+      // pool and leave the entry pending so the next run retries it.
+      if (error instanceof RateLimitError) {
+        rateLimitError = error;
+        stopped = true;
+        queue.flush();
+        return;
+      }
       queue.markError(entry, { error: error.message, status: error.response?.status });
       await hooks.emit('error', error, entry);
       logger.error(`Failed to fetch ${entry.url} -> ${error.message}`);
@@ -181,75 +261,140 @@ export const createCrawler = (userConfig = {}) => {
   };
   const logBanner = () => {
+    const browserLine =
+      fetcher.name === 'browser' ? `\n  - Browser waitUntil: ${config.browser.waitUntil}` : '';
     logger.info(`STARTING SCRAPLY CRAWLER...
   - Start URLs: ${config.startUrls.join(', ')}
-  - Fetcher: ${fetcher.name}
+  - Fetcher: ${fetcher.name}${browserLine}
   - Concurrency: ${config.crawl.concurrency}
   - Per-host delay: ${config.crawl.delay}ms
   - Max depth: ${config.crawl.maxDepth}
+  - Max pages: ${config.crawl.maxPages}
   - Allowed content types: ${config.allowedContentTypes.join(', ')}
   - Output format: ${config.output.format}
 `);
   };
   const registerSignals = () => {
-    if (signalsRegistered) return;
+    if (!config.signals || signalsRegistered) return;
     signalsRegistered = true;
-    const handler = async () => {
-      logger.warn('Received termination signal. Saving progress...');
+    let forcing = false;
+    signalHandler = () => {
+      if (forcing) {
+        logger.warn('Received second termination signal. Forcing quit.');
+        process.exit(1);
+      }
+      forcing = true;
+      logger.warn('Received termination signal. Finishing in-flight work... (signal again to force quit)');
       stopped = true;
       queue.flush();
-      await closeFetcher();
-      process.exit(0);
     };
-    process.once('SIGINT', handler);
-    process.once('SIGTERM', handler);
+    process.on('SIGINT', signalHandler);
+    process.on('SIGTERM', signalHandler);
   };
-  /** Crawls until the queue is drained (or `stop()` is called). */
+  const unregisterSignals = () => {
+    if (!signalHandler) return;
+    process.off('SIGINT', signalHandler);
+    process.off('SIGTERM', signalHandler);
+    signalHandler = null;
+    signalsRegistered = false;
+  };
+  // Seeds URLs from sitemap(s) when crawl.sitemap is enabled. Recurses into
+  // sitemap indexes (bounded) and routes discovered URLs through enqueue() so
+  // include/exclude rules still apply.
+  const seedSitemaps = async () => {
+    const cfg = config.crawl.sitemap;
+    if (!cfg) return;
+    const roots = Array.isArray(cfg)
+      ? cfg
+      : startUrls.map((url) => new URL('/sitemap.xml', url).href);
+    const seen = new Set();
+    let added = 0;
+    const visit = async (url, depth) => {
+      if (depth > 5 || seen.has(url)) return;
+      seen.add(url);
+      try {
+        const result = await retryRunner.run(() => fetcher.fetch(url));
+        const { sitemaps, urls } = parseSitemap(toText(result.data));
+        added += await enqueue(urls, { depth: 0, referrer: url });
+        for (const nested of sitemaps) await visit(nested, depth + 1);
+      } catch (error) {
+        if (error instanceof RateLimitError) throw error;
+        logger.warn(`Sitemap fetch failed (${url}) -> ${error.message}`);
+      }
+    };
+    for (const url of roots) await visit(url, 0);
+    if (added > 0) logger.info(`Seeded ${added} URL(s) from sitemap(s).`);
+  };
+  // Crawls until the queue is drained (or `stop()` is called).
   const crawl = async () => {
     init();
     logBanner();
     registerSignals();
+    rateLimitError = null;
-    if (fetcher.init) await fetcher.init();
-    processedCount = queue.crawledCount() + queue.errorCount();
+    try {
+      if (fetcher.init) await fetcher.init();
+      await seedSitemaps();
+      processedCount = queue.crawledCount() + queue.errorCount() + queue.skippedCount();
+      await runPipeline({
+        queue,
+        concurrency: config.crawl.concurrency,
+        perHostDelay: config.crawl.delay,
+        processOne,
+        isStopped: () => stopped || queue.crawledCount() >= config.crawl.maxPages
+      });
-    await runPipeline({
-      queue,
-      concurrency: config.crawl.concurrency,
-      perHostDelay: config.crawl.delay,
-      processOne,
-      isStopped: () => stopped
-    });
+      queue.flush();
-    queue.flush();
-    logger.info(
-      `Crawling completed! ${queue.crawledCount()} of ${queue.entries.length} ` +
-        `(${queue.entries.length - queue.crawledCount()} not crawled, ${queue.errorCount()} errors)`
-    );
+      // A rate-limit abort surfaces here so run() can clean up (flush + close)
+      // before the error propagates to the caller.
+      if (rateLimitError) throw rateLimitError;
+      if (config.crawl.maxPages !== Infinity && queue.crawledCount() >= config.crawl.maxPages) {
+        logger.info(`Reached maxPages limit (${config.crawl.maxPages}).`);
+      }
+      logger.info(
+        `Crawling completed! ${queue.crawledCount()} crawled, ${queue.skippedCount()} skipped, ` +
+          `${queue.errorCount()} errors, ${queue.pendingCount()} pending (of ${queue.entries.length} total).`
+      );
+    } finally {
+      unregisterSignals();
+    }
   };
-  /** Re-reads crawled pages from disk so resumed runs include earlier sessions. */
+  // Re-reads crawled pages from disk so resumed runs include earlier sessions.
+  // The full saved record is returned (including any `transform` additions and
+  // `data` for JSON sources); the output writer decides what to serialize.
   const collectRecords = () => {
     const records = [];
     for (const entry of queue.entries) {
-      if (!entry.file || entry.error) continue;
-      const data = loadJSON(entry.file, null);
-      if (data) records.push({ url: entry.url, content: data.content });
+      if (!entry.file) continue;
+      const record = loadJSON(path.posix.join(config.storage.crawledDir, entry.file), null);
+      if (record) records.push(record);
     }
     return records;
   };
-  /**
-   * Routes records to their output files and writes them. Defaults to every
-   * successfully crawled page; pass an explicit array to format custom records.
-   */
+  // Routes records to their output files and writes them. Defaults to every successfully crawled page; pass an explicit array to format custom records. When reading from disk, reloads `dataset/queue.json` first so this can run without calling `crawl()` (e.g. after changing `output.routes`).
   const format = async (records = null) => {
     logger.info('Formatting data...');
+    if (records === null) queue.load();
     const collected = records ?? collectRecords();
     const groups = formatRecords(collected, {
       output: config.output,
@@ -269,7 +414,7 @@ export const createCrawler = (userConfig = {}) => {
     return groups;
   };
-  /** Full pipeline: init -> crawl -> format, with guaranteed cleanup. */
+  // Full pipeline: init -> crawl -> format, with guaranteed cleanup.
   const run = async () => {
     try {
       await crawl();
@@ -292,11 +437,55 @@ export const createCrawler = (userConfig = {}) => {
     crawl,
     format,
     run,
+    // Clears errored entries and returns them to the queue so a later crawl()
+    // retries them. Persists immediately; returns how many were requeued.
+    requeueErrors: () => {
+      if (queue.entries.length === 0) queue.load();
+      return queue.requeueErrors();
+    },
+    // Same as requeueErrors() but for skipped entries (e.g. after widening
+    // allowedContentTypes or changing sites).
+    requeueSkipped: () => {
+      if (queue.entries.length === 0) queue.load();
+      return queue.requeueSkipped();
+    },
     stop: () => {
       stopped = true;
     }
   };
 };
-/** One-call convenience wrapper: create a crawler and run the full pipeline. */
+// One-call convenience wrapper: create a crawler and run the full pipeline.
 export const scraply = (userConfig = {}) => createCrawler(userConfig).run();
+/**
+ * Runs multiple crawlers in one process. An item that has a `run()` method is
+ * used as-is; anything else is passed to `createCrawler()` as a config object.
+ * A fixed pool of workers claims items in input order, so at most `concurrency`
+ * crawlers are running at any time.
+ *
+ * When several crawlers share one process, set `signals: false` in each config
+ * (or rely on the per-instance graceful stop). NOTE(review): with signals left
+ * enabled, the crawler's handler still calls `process.exit(1)` on a second
+ * SIGINT/SIGTERM, which ends every crawler in the process.
+ *
+ * The returned promise rejects as soon as any `run()` rejects (Promise.all
+ * semantics); the other workers are not cancelled and keep claiming items.
+ *
+ * @param {Array<import('./index.js').ScraplyConfig | ReturnType<typeof createCrawler>>} items
+ * @param {{ concurrency?: number }} [options] - how many crawlers run at once (default 1 = sequential); clamped to 1..items.length
+ * @returns {Promise<Array<import('./core/queue.js').QueueEntry[]>>} the value each crawler's `run()` resolved with (per the declared type, its final queue entries), in input order
+ */
+export const runCrawlers = async (items, { concurrency = 1 } = {}) => {
+  const instances = items.map((item) =>
+    item && typeof item.run === 'function' ? item : createCrawler(item)
+  );
+  const results = new Array(instances.length); // Pre-sized so each result lands at its input index regardless of finish order.
+  let cursor = 0; // Next unclaimed index, shared by all workers.
+  const worker = async () => {
+    for (;;) {
+      const index = cursor++; // Claimed synchronously (no await in between), so two workers never get the same index.
+      if (index >= instances.length) return;
+      results[index] = await instances[index].run();
+    }
+  };
+  const poolSize = Math.max(1, Math.min(concurrency, instances.length || 1)); // `|| 1` keeps a single (immediately returning) worker for an empty list. NOTE(review): a NaN concurrency yields NaN here and therefore zero workers.
+  await Promise.all(Array.from({ length: poolSize }, () => worker()));
+  return results;
+};

package/src/extract/extract.js CHANGED Viewed

@@ -20,18 +20,32 @@ const collectText = ($, element) => {
  * Extracts readable text from an HTML document. Cheerio decodes HTML entities
  * for us, so no separate decoder dependency is needed.
  *
+ * `root` allow-lists the container(s) to read from (a selector or array of
+ * selectors); when it matches nothing — or is null — extraction falls back to
+ * `rootFallback` (default `<body>`). `removeSelectors` then strips noise from
+ * within the chosen root.
+ *
  * @param {string|import('cheerio').CheerioAPI} input - raw HTML or a loaded Cheerio instance
- * @param {{ removeSelectors?: string[] }} [options]
+ * @param {{ removeSelectors?: string[], root?: string|string[]|null, rootFallback?: string }} [options]
  * @returns {string}
  */
 export const extractText = (input, options = {}) => {
-  const { removeSelectors = [] } = options;
+  const { removeSelectors = [], root = null, rootFallback = 'body' } = options;
   const $ = typeof input === 'string' ? cheerio.load(input) : input;
   if (removeSelectors.length) $(removeSelectors.join(',')).remove();
   $('*').contents().filter((_, node) => node.type === 'comment').remove();
-  return collectText($, $('body'))
+  const rootSelector = Array.isArray(root) ? root.join(',') : root;
+  let $root = rootSelector ? $(rootSelector) : $(rootFallback || 'body');
+  if ($root.length === 0) $root = $(rootFallback || 'body');
+  let text = '';
+  $root.each((_, element) => {
+    text += `${collectText($, $(element))} `;
+  });
+  return text
     .replace(/\n/g, ' ')
     .replace(/\\['"\\]/g, (match) => match.slice(1))
     .replace(WHITESPACE_CHARS, ' ')

package/src/extract/links.js CHANGED Viewed

@@ -1,11 +1,11 @@
 import { URL } from 'node:url';
-import { normalizeUrl } from '../url/normalize.js';
 const NON_NAVIGATIONAL = /^(mailto:|tel:|javascript:|data:)/i;
 /**
- * Collects unique, normalized links from anchor tags in a document. No
- * include/exclude filtering happens here; that is the crawler's job.
+ * Collects unique, absolute links from anchor tags in a document, resolving
+ * relative hrefs against `baseUrl`. Normalization and include/exclude filtering
+ * are the crawler's job (`enqueue`), so links are only resolved here.
  *
  * @param {import('cheerio').CheerioAPI} $
  * @param {string} baseUrl - used to resolve relative hrefs
@@ -19,7 +19,7 @@ export const discoverLinks = ($, baseUrl) => {
     if (!href || href.startsWith('#') || NON_NAVIGATIONAL.test(href)) return;
     try {
-      links.add(normalizeUrl(new URL(href, baseUrl).toString()));
+      links.add(new URL(href, baseUrl).href);
     } catch {
       // Ignore malformed hrefs.
     }

package/src/extract/parse.js ADDED Viewed

@@ -0,0 +1,35 @@
+/**
+ * Coerces a fetcher body (string or binary) to a UTF-8 string. Strings are
+ * returned unchanged; any other value is wrapped with `Buffer.from()` and
+ * decoded as UTF-8.
+ */
+export const toText = (data) => (typeof data === 'string' ? data : Buffer.from(data).toString('utf8'));
+/**
+ * Buckets a Content-Type into the kind of body Scraply knows how to handle.
+ * Anything containing "json" is JSON, anything containing "html" (incl.
+ * application/xhtml+xml) is HTML, everything else is treated as raw text.
+ * Matching is a case-insensitive substring test, so parameters such as
+ * `; charset=utf-8` do not affect the result. A missing value defaults to ''
+ * and is classified as 'text'.
+ *
+ * @param {string} [contentType]
+ * @returns {'html'|'json'|'text'}
+ */
+export const classifyContentType = (contentType = '') => {
+  const value = String(contentType).toLowerCase(); // String() also accepts non-string header values (e.g. arrays).
+  if (value.includes('json')) return 'json'; // Checked first: a type containing both "json" and "html" is treated as JSON.
+  if (value.includes('html')) return 'html';
+  return 'text'; // Everything else, including XML and plain text.
+};
+/**
+ * Parses a JSON body. Returns the parsed value plus a pretty-printed string for
+ * the record `content`. Falls back to the raw text when the body is not valid
+ * JSON (so a mislabeled response is never lost).
+ *
+ * Note that `data: null` is ambiguous: it is returned both when parsing fails
+ * and when the body is the valid JSON literal `null` (in which case `content`
+ * is the string "null").
+ *
+ * @param {string|ArrayBuffer} data - response body; decoded with `toText()` before parsing
+ * @returns {{ data: unknown, content: string }} parsed value (or null) and the text to store as content
+ */
+export const parseJson = (data) => {
+  const text = toText(data);
+  try {
+    const parsed = JSON.parse(text);
+    return { data: parsed, content: JSON.stringify(parsed, null, 2) }; // Re-serialized with 2-space indentation, not the original formatting.
+  } catch {
+    return { data: null, content: text }; // Deliberate best-effort: keep the undecodable body as plain text instead of throwing.
+  }
+};

package/src/extract/sitemap.js ADDED Viewed

@@ -0,0 +1,35 @@
+import * as cheerio from 'cheerio';
+/**
+ * Parses an XML sitemap or sitemap index. Returns nested `sitemaps` (from a
+ * `<sitemapindex>`) and page `urls` (from a `<urlset>`) separately so the
+ * caller can handle nested sitemaps and page URLs differently.
+ *
+ * Values are trimmed and empty ones dropped. They are returned in document
+ * order without de-duplication or URL validation.
+ *
+ * @param {string} xml
+ * @returns {{ sitemaps: string[], urls: string[] }}
+ */
+export const parseSitemap = (xml) => {
+  const $ = cheerio.load(xml, { xmlMode: true }); // Parse as XML rather than HTML.
+  const sitemaps = [];
+  const urls = [];
+  $('sitemap > loc').each((_, el) => { // <loc> elements that are direct children of <sitemap> (index entries).
+    const value = $(el).text().trim();
+    if (value) sitemaps.push(value);
+  });
+  $('url > loc').each((_, el) => { // <loc> elements that are direct children of <url> (page entries).
+    const value = $(el).text().trim();
+    if (value) urls.push(value);
+  });
+  // Fallback for sitemaps that omit the standard wrapping elements.
+  if (sitemaps.length === 0 && urls.length === 0) { // Only when neither standard form matched; every <loc> is then treated as a page URL.
+    $('loc').each((_, el) => {
+      const value = $(el).text().trim();
+      if (value) urls.push(value);
+    });
+  }
+  return { sitemaps, urls };
+};