npm - scraply - Versions diffs - 2.0.1 → 2.0.2 - Mend

scraply 2.0.1 → 2.0.2

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (14) hide show

package/src/crawler.js CHANGED Viewed

@@ -6,18 +6,19 @@ import { loadConfig } from './config/load.js';
 import { createLogger } from './util/logger.js';
 import { createHooks } from './util/hooks.js';
 import { normalizeUrl } from './url/normalize.js';
-import { matchesAnyPattern } from './url/patterns.js';
+import { matchesPattern, matchesAnyPattern } from './url/patterns.js';
 import { discoverLinks } from './extract/links.js';
 import { extractText } from './extract/extract.js';
+import { classifyContentType, parseJson, toText } from './extract/parse.js';
+import { parseSitemap } from './extract/sitemap.js';
 import { QueueManager } from './core/queue.js';
 import { runPipeline } from './core/pipeline.js';
 import { createRetryRunner } from './core/retry.js';
+import { RateLimitError } from './core/errors.js';
 import { resolveFetcher } from './fetchers/index.js';
 import { formatRecords } from './output/writers.js';
 import { loadJSON, saveJSON, deletePath, deleteUntracked } from './storage/files.js';
-const toHtml = (data) => (typeof data === 'string' ? data : Buffer.from(data).toString('utf8'));
 const sha256 = (text) => createHash('sha256').update(text).digest('hex');
 /**
@@ -42,17 +43,45 @@ export const createCrawler = (userConfig = {}) => {
   let datasetCounter = 0;
   let processedCount = 0;
   let signalsRegistered = false;
+  let signalHandler = null;
+  /** @type {RateLimitError|null} Set when a 429 aborts the crawl; rethrown after the pool drains. */
+  let rateLimitError = null;
   const closeFetcher = async () => {
     if (fetcher.close) await fetcher.close();
   };
-  const onRateLimitExit = (code) => {
-    queue.flush();
-    closeFetcher().finally(() => process.exit(code));
-  };
+  const retryRunner = createRetryRunner({ config, logger });
+  // Resolves the effective per-URL config, applying the most specific matching
+  // `sites` entry over the top-level `allowedContentTypes` / `extract`.
+  const resolveEntryConfig = (url) => {
+    if (!config.sites.length) { // no per-site entries configured: use the top-level settings as-is
+      return { allowedContentTypes: config.allowedContentTypes, extract: config.extract };
+    }
+    let best = null; // site entry owning the longest matching pattern found so far
+    let bestLen = -1; // -1 so that any matching pattern, even an empty one, wins the first comparison
+    for (const site of config.sites) {
+      for (const pattern of site.match) {
+        if (!matchesPattern(url, pattern)) continue;
+        const len = typeof pattern === 'string' ? pattern.length : String(pattern).length; // "most specific" = longest pattern text; non-strings are measured by their String() form
+        if (len > bestLen) { // strict comparison: on equal lengths the earlier match is kept
+          bestLen = len;
+          best = site;
+        }
+      }
+    }
-  const retryRunner = createRetryRunner({ config, logger, onRateLimitExit });
+    if (!best) { // no site pattern matched this URL: fall back to the top-level settings
+      return { allowedContentTypes: config.allowedContentTypes, extract: config.extract };
+    }
+    return {
+      allowedContentTypes: best.allowedContentTypes ?? config.allowedContentTypes, // a site list replaces the top-level list; it is not merged with it
+      extract: { ...config.extract, ...(best.extract ?? {}) } // shallow merge: keys set on the site override the top-level keys
+    };
+  };
   // --- queue lifecycle ---
@@ -77,6 +106,11 @@ export const createCrawler = (userConfig = {}) => {
       if (requeued > 0) logger.info(`Re-queued ${requeued} previously errored URL(s) for retry.`);
     }
+    if (config.crawl.retrySkipped) {
+      const requeued = queue.requeueSkipped();
+      if (requeued > 0) logger.info(`Re-queued ${requeued} previously skipped URL(s) for retry.`);
+    }
     if (queue.entries.length === 0) {
       logger.info(`Starting fresh with ${startUrls.length} start URL(s).`);
       queue.seed(startUrls);
@@ -104,10 +138,11 @@ export const createCrawler = (userConfig = {}) => {
   // Fetches a single URL (with retry/rate-limit policy) and returns the raw result.
   const fetchUrl = (url) => retryRunner.run(() => fetcher.fetch(normalizeUrl(url)));
-  // Extracts readable text from HTML.
+  // Extracts readable text from HTML. When a URL is supplied, the matching
+  // per-site extract rules apply; otherwise the global extract config is used.
   const extract = (html, url = null) => ({
     url,
-    content: extractText(html, { removeSelectors: config.extract.removeSelectors })
+    content: extractText(html, url ? resolveEntryConfig(url).extract : config.extract)
   });
   const shouldCrawl = (url) => {
@@ -157,40 +192,69 @@ export const createCrawler = (userConfig = {}) => {
       const result = await retryRunner.run(() => fetcher.fetch(entry.url));
       await hooks.emit('response', result, entry);
+      const effective = resolveEntryConfig(entry.url);
       // Fetchers return lowercased header keys (see Fetcher interface).
       const contentType = result.headers?.['content-type'];
-      if (!contentType || !config.allowedContentTypes.some((type) => contentType.includes(type))) {
-        queue.markSkipped(entry, { reason: `content-type: ${contentType ?? 'none'}`, status: result.status });
+      if (!contentType || !effective.allowedContentTypes.some((type) => contentType.includes(type))) {
+        const reason = `content-type: ${contentType ?? 'none'}`;
+        queue.markSkipped(entry, { reason, status: result.status });
+        await hooks.emit('skip', entry, { reason, status: result.status, result });
         return;
       }
-      const $ = cheerio.load(toHtml(result.data));
+      const kind = classifyContentType(contentType);
+      let $ = null;
+      let content = '';
+      let data = null;
-      // Discover links from the full DOM before extraction strips elements.
-      await enqueue(discoverLinks($, entry.url), { depth: entry.depth + 1, referrer: entry.url });
+      if (kind === 'html') {
+        $ = cheerio.load(toText(result.data));
-      let content = extractText($, { removeSelectors: config.extract.removeSelectors });
-      content = await hooks.reduce('extract', content, $, entry);
+        // Discover links from the full DOM before extraction strips elements.
+        const links = await hooks.reduce('links', discoverLinks($, entry.url), $, entry, result);
+        await enqueue(links, { depth: entry.depth + 1, referrer: entry.url });
-      const record = {
-        url: entry.url,
-        content,
-        crawledAt: new Date().toISOString(),
-        hash: sha256(content)
-      };
+        content = extractText($, effective.extract);
+      } else if (kind === 'json' && effective.extract.json !== false) {
+        const parsed = parseJson(result.data);
+        data = parsed.data;
+        content = parsed.content;
+        const links = await hooks.reduce('links', [], $, entry, result);
+        if (links?.length) await enqueue(links, { depth: entry.depth + 1, referrer: entry.url });
+      } else {
+        content = toText(result.data);
+        const links = await hooks.reduce('links', [], $, entry, result);
+        if (links?.length) await enqueue(links, { depth: entry.depth + 1, referrer: entry.url });
+      }
+      content = await hooks.reduce('extract', content, $, entry, result);
+      let record = { url: entry.url, content, crawledAt: new Date().toISOString() };
+      if (data !== null) record.data = data;
+      // Transform runs BEFORE the record is persisted so its result is what gets
+      // saved to disk and later picked up by format().
+      record = await hooks.reduce('transform', record, entry, result);
+      record.hash = sha256(record.content ?? '');
       const file = saveDataset(record);
       queue.markDone(entry, { file, status: result.status });
-      const transformed = await hooks.reduce('transform', record, entry);
-      await hooks.emit('page', transformed, entry);
+      await hooks.emit('page', record, entry, result);
     } catch (error) {
-      // A 429 only reaches here when rateLimit.exitOnLimit is true and the
-      // process is already exiting; leave the entry pending so the next run
-      // retries it instead of recording a permanent error.
-      if (error.response?.status !== 429) {
-        queue.markError(entry, { error: error.message, status: error.response?.status });
+      // A 429 with exitOnLimit aborts the whole crawl: stash the error, stop the
+      // pool and leave the entry pending so the next run retries it.
+      if (error instanceof RateLimitError) {
+        rateLimitError = error;
+        stopped = true;
+        queue.flush();
+        return;
       }
+      queue.markError(entry, { error: error.message, status: error.response?.status });
       await hooks.emit('error', error, entry);
       logger.error(`Failed to fetch ${entry.url} -> ${error.message}`);
     }
@@ -213,19 +277,64 @@ export const createCrawler = (userConfig = {}) => {
   };
   const registerSignals = () => {
-    if (signalsRegistered) return;
+    if (!config.signals || signalsRegistered) return;
     signalsRegistered = true;
-    const handler = async () => {
-      logger.warn('Received termination signal. Saving progress...');
+    let forcing = false;
+    signalHandler = () => {
+      if (forcing) {
+        logger.warn('Received second termination signal. Forcing quit.');
+        process.exit(1);
+      }
+      forcing = true;
+      logger.warn('Received termination signal. Finishing in-flight work... (signal again to force quit)');
       stopped = true;
       queue.flush();
-      await closeFetcher();
-      process.exit(0);
     };
-    process.once('SIGINT', handler);
-    process.once('SIGTERM', handler);
+    process.on('SIGINT', signalHandler);
+    process.on('SIGTERM', signalHandler);
+  };
+  const unregisterSignals = () => { // Detaches the SIGINT/SIGTERM listener installed by registerSignals(), if any.
+    if (!signalHandler) return; // nothing registered (never installed, or already removed)
+    process.off('SIGINT', signalHandler);
+    process.off('SIGTERM', signalHandler);
+    signalHandler = null;
+    signalsRegistered = false; // clears the guard checked by registerSignals() so it can install a handler again
+  };
+  // Seeds URLs from sitemap(s) when crawl.sitemap is enabled. Recurses into
+  // sitemap indexes (bounded) and routes discovered URLs through enqueue() so
+  // include/exclude rules still apply.
+  const seedSitemaps = async () => {
+    const cfg = config.crawl.sitemap;
+    if (!cfg) return; // falsy crawl.sitemap: sitemap seeding is disabled
+    const roots = Array.isArray(cfg)
+      ? cfg // an array is used verbatim as the list of sitemap URLs
+      : startUrls.map((url) => new URL('/sitemap.xml', url).href); // otherwise: /sitemap.xml at the origin of each start URL
+    const seen = new Set(); // sitemap URLs already visited, so none is fetched twice (also breaks cycles)
+    let added = 0; // running total of enqueue() results; assumes enqueue() resolves to a count — TODO confirm
+    const visit = async (url, depth) => {
+      if (depth > 5 || seen.has(url)) return; // nesting bound: depths 0 through 5 are fetched
+      seen.add(url);
+      try {
+        const result = await retryRunner.run(() => fetcher.fetch(url));
+        const { sitemaps, urls } = parseSitemap(toText(result.data));
+        added += await enqueue(urls, { depth: 0, referrer: url }); // sitemap pages are enqueued at crawl depth 0
+        for (const nested of sitemaps) await visit(nested, depth + 1); // nested sitemaps are walked one at a time, in order
+      } catch (error) {
+        if (error instanceof RateLimitError) throw error; // rate-limit errors propagate to the caller
+        logger.warn(`Sitemap fetch failed (${url}) -> ${error.message}`); // any other failure is logged and this sitemap is skipped
+      }
+    };
+    for (const url of roots) await visit(url, 0);
+    if (added > 0) logger.info(`Seeded ${added} URL(s) from sitemap(s).`);
   };
   // Crawls until the queue is drained (or `stop()` is called).
@@ -233,37 +342,49 @@ export const createCrawler = (userConfig = {}) => {
     init();
     logBanner();
     registerSignals();
+    rateLimitError = null;
-    if (fetcher.init) await fetcher.init();
-    processedCount = queue.crawledCount() + queue.errorCount() + queue.skippedCount();
+    try {
+      if (fetcher.init) await fetcher.init();
+      await seedSitemaps();
+      processedCount = queue.crawledCount() + queue.errorCount() + queue.skippedCount();
+      await runPipeline({
+        queue,
+        concurrency: config.crawl.concurrency,
+        perHostDelay: config.crawl.delay,
+        processOne,
+        isStopped: () => stopped || queue.crawledCount() >= config.crawl.maxPages
+      });
-    await runPipeline({
-      queue,
-      concurrency: config.crawl.concurrency,
-      perHostDelay: config.crawl.delay,
-      processOne,
-      isStopped: () => stopped || queue.crawledCount() >= config.crawl.maxPages
-    });
+      queue.flush();
-    queue.flush();
+      // A rate-limit abort surfaces here so run() can clean up (flush + close)
+      // before the error propagates to the caller.
+      if (rateLimitError) throw rateLimitError;
-    if (config.crawl.maxPages !== Infinity && queue.crawledCount() >= config.crawl.maxPages) {
-      logger.info(`Reached maxPages limit (${config.crawl.maxPages}).`);
-    }
+      if (config.crawl.maxPages !== Infinity && queue.crawledCount() >= config.crawl.maxPages) {
+        logger.info(`Reached maxPages limit (${config.crawl.maxPages}).`);
+      }
-    logger.info(
-      `Crawling completed! ${queue.crawledCount()} crawled, ${queue.skippedCount()} skipped, ` +
-        `${queue.errorCount()} errors, ${queue.pendingCount()} pending (of ${queue.entries.length} total).`
-    );
+      logger.info(
+        `Crawling completed! ${queue.crawledCount()} crawled, ${queue.skippedCount()} skipped, ` +
+          `${queue.errorCount()} errors, ${queue.pendingCount()} pending (of ${queue.entries.length} total).`
+      );
+    } finally {
+      unregisterSignals();
+    }
   };
   // Re-reads crawled pages from disk so resumed runs include earlier sessions.
+  // The full saved record is returned (including any `transform` additions and
+  // `data` for JSON sources); the output writer decides what to serialize.
   const collectRecords = () => {
     const records = [];
     for (const entry of queue.entries) {
       if (!entry.file) continue;
-      const data = loadJSON(path.posix.join(config.storage.crawledDir, entry.file), null);
-      if (data) records.push({ url: entry.url, content: data.content });
+      const record = loadJSON(path.posix.join(config.storage.crawledDir, entry.file), null);
+      if (record) records.push(record);
     }
     return records;
   };
@@ -322,6 +443,12 @@ export const createCrawler = (userConfig = {}) => {
       if (queue.entries.length === 0) queue.load();
       return queue.requeueErrors();
     },
+    // Same as requeueErrors() but for skipped entries (e.g. after widening
+    // allowedContentTypes or changing sites).
+    requeueSkipped: () => {
+      if (queue.entries.length === 0) queue.load();
+      return queue.requeueSkipped();
+    },
     stop: () => {
       stopped = true;
     }
@@ -330,3 +457,35 @@ export const createCrawler = (userConfig = {}) => {
 // One-call convenience wrapper: create a crawler and run the full pipeline.
 export const scraply = (userConfig = {}) => createCrawler(userConfig).run();
+/**
+ * Runs multiple crawlers in one process. Accepts crawler instances or plain
+ * config objects (which are turned into crawlers). Because the crawler no longer
+ * calls `process.exit`, several crawlers can safely share one process — set
+ * `signals: false` in each config (or rely on the per-instance graceful stop).
+ *
+ * Scheduling: a pool of workers pulls the next crawler index from a shared
+ * cursor, so at most `concurrency` crawlers run at once and each result is
+ * stored at its input index. If any `run()` rejects, the returned promise
+ * rejects (via `Promise.all`); the other workers are not cancelled and keep
+ * pulling crawlers.
+ *
+ * @param {Array<import('./index.js').ScraplyConfig | ReturnType<typeof createCrawler>>} items
+ * @param {{ concurrency?: number }} [options] - how many crawlers run at once (default 1 = sequential)
+ * @returns {Promise<Array<import('./core/queue.js').QueueEntry[]>>} each crawler's final queue entries, in input order
+ */
+export const runCrawlers = async (items, { concurrency = 1 } = {}) => {
+  const instances = items.map((item) =>
+    item && typeof item.run === 'function' ? item : createCrawler(item) // anything exposing a callable run() is used as-is
+  );
+  const results = new Array(instances.length);
+  let cursor = 0; // index of the next crawler to start, shared by all workers
+  const worker = async () => {
+    for (;;) {
+      const index = cursor++; // claimed synchronously (no await in between), so two workers never take the same index
+      if (index >= instances.length) return;
+      results[index] = await instances[index].run();
+    }
+  };
+  const poolSize = Math.max(1, Math.min(concurrency, instances.length || 1)); // clamp to 1..instances.length; an empty input still starts one worker, which returns immediately
+  await Promise.all(Array.from({ length: poolSize }, () => worker()));
+  return results;
+};

package/src/extract/extract.js CHANGED Viewed

@@ -20,18 +20,32 @@ const collectText = ($, element) => {
  * Extracts readable text from an HTML document. Cheerio decodes HTML entities
  * for us, so no separate decoder dependency is needed.
  *
+ * `root` allow-lists the container(s) to read from (a selector or array of
+ * selectors); when it matches nothing — or is null — extraction falls back to
+ * `rootFallback` (default `<body>`). `removeSelectors` then strips noise from
+ * within the chosen root.
+ *
  * @param {string|import('cheerio').CheerioAPI} input - raw HTML or a loaded Cheerio instance
- * @param {{ removeSelectors?: string[] }} [options]
+ * @param {{ removeSelectors?: string[], root?: string|string[]|null, rootFallback?: string }} [options]
  * @returns {string}
  */
 export const extractText = (input, options = {}) => {
-  const { removeSelectors = [] } = options;
+  const { removeSelectors = [], root = null, rootFallback = 'body' } = options;
   const $ = typeof input === 'string' ? cheerio.load(input) : input;
   if (removeSelectors.length) $(removeSelectors.join(',')).remove();
   $('*').contents().filter((_, node) => node.type === 'comment').remove();
-  return collectText($, $('body'))
+  const rootSelector = Array.isArray(root) ? root.join(',') : root;
+  let $root = rootSelector ? $(rootSelector) : $(rootFallback || 'body');
+  if ($root.length === 0) $root = $(rootFallback || 'body');
+  let text = '';
+  $root.each((_, element) => {
+    text += `${collectText($, $(element))} `;
+  });
+  return text
     .replace(/\n/g, ' ')
     .replace(/\\['"\\]/g, (match) => match.slice(1))
     .replace(WHITESPACE_CHARS, ' ')

package/src/extract/parse.js ADDED Viewed

@@ -0,0 +1,35 @@
+/**
+ * Coerces a fetcher body (string or binary) to a UTF-8 string.
+ * Strings are returned unchanged; any other value is passed to `Buffer.from()`
+ * and decoded as UTF-8.
+ */
+export const toText = (data) => (typeof data === 'string' ? data : Buffer.from(data).toString('utf8'));
+/**
+ * Buckets a Content-Type into the kind of body Scraply knows how to handle.
+ * Anything containing "json" is JSON, anything containing "html" (incl.
+ * application/xhtml+xml) is HTML, everything else is treated as raw text.
+ *
+ * Matching is a case-insensitive substring test on the whole header value.
+ * "json" is tested first, so a value containing both words is classified as
+ * JSON. A missing value defaults to the empty string and yields 'text'.
+ *
+ * @param {string} [contentType]
+ * @returns {'html'|'json'|'text'}
+ */
+export const classifyContentType = (contentType = '') => {
+  const value = String(contentType).toLowerCase(); // String() also tolerates non-string input such as null
+  if (value.includes('json')) return 'json';
+  if (value.includes('html')) return 'html';
+  return 'text';
+};
+/**
+ * Parses a JSON body. Returns the parsed value plus a pretty-printed string for
+ * the record `content`. Falls back to the raw text when the body is not valid
+ * JSON (so a mislabeled response is never lost).
+ *
+ * On success `content` is the parsed value re-serialized with 2-space
+ * indentation, not the original body text. On failure `data` is `null`; a body
+ * that is the JSON literal `null` also yields `data: null`, so the two cases
+ * can only be told apart through `content`.
+ *
+ * @param {string|ArrayBuffer} data
+ * @returns {{ data: unknown, content: string }}
+ */
+export const parseJson = (data) => {
+  const text = toText(data);
+  try {
+    const parsed = JSON.parse(text);
+    return { data: parsed, content: JSON.stringify(parsed, null, 2) };
+  } catch { // invalid JSON: keep the decoded body as content and report no data
+    return { data: null, content: text };
+  }
+};

package/src/extract/sitemap.js ADDED Viewed

@@ -0,0 +1,35 @@
+import * as cheerio from 'cheerio';
+/**
+ * Parses an XML sitemap or sitemap index. Returns nested `sitemaps` (from a
+ * `<sitemapindex>`) and page `urls` (from a `<urlset>`) separately so the
+ * crawler can recurse into indexes before enqueuing pages.
+ *
+ * The document is loaded with cheerio in `xmlMode`. Only `<loc>` elements that
+ * are direct children of `<sitemap>` or `<url>` are read; the enclosing root
+ * element is not itself checked. Values are trimmed and empty ones dropped.
+ * When neither selector yields anything, every `<loc>` in the document is
+ * returned as a page URL.
+ *
+ * @param {string} xml
+ * @returns {{ sitemaps: string[], urls: string[] }}
+ */
+export const parseSitemap = (xml) => {
+  const $ = cheerio.load(xml, { xmlMode: true });
+  const sitemaps = [];
+  const urls = [];
+  $('sitemap > loc').each((_, el) => { // entries of a sitemap index
+    const value = $(el).text().trim();
+    if (value) sitemaps.push(value);
+  });
+  $('url > loc').each((_, el) => { // page entries of a URL set
+    const value = $(el).text().trim();
+    if (value) urls.push(value);
+  });
+  // Fallback for sitemaps that omit the standard wrapping elements.
+  if (sitemaps.length === 0 && urls.length === 0) {
+    $('loc').each((_, el) => { // any <loc>, at any nesting level, counts as a page URL here
+      const value = $(el).text().trim();
+      if (value) urls.push(value);
+    });
+  }
+  return { sitemaps, urls };
+};