npm - scraply - Versions diffs - 2.0.0 → 2.0.2 - Mend

scraply 2.0.0 → 2.0.2

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (18)

package/package.json +7 -3
package/readme.md +149 -55
package/src/config/browser.js +37 -0
package/src/config/defaults.js +47 -11
package/src/config/load.js +57 -1
package/src/core/errors.js +23 -0
package/src/core/queue.js +83 -11
package/src/core/retry.js +34 -26
package/src/crawler.js +265 -76
package/src/extract/extract.js +17 -3
package/src/extract/links.js +4 -4
package/src/extract/parse.js +35 -0
package/src/extract/sitemap.js +35 -0
package/src/fetchers/browserFetcher.js +18 -12
package/src/fetchers/httpFetcher.js +40 -3
package/src/index.d.ts +285 -0
package/src/index.js +48 -7
package/src/output/writers.js +14 -5

package/src/core/queue.js CHANGED Viewed

@@ -3,19 +3,21 @@ import { loadJSON, saveJSON, deletePath } from '../storage/files.js';
 /**
  * @typedef {Object} QueueEntry
  * @property {string} url
- * @property {string|null} file     - path to the saved crawled file, or null
+ * @property {string|null} file     - filename of the saved crawled record (relative to crawledDir), or null
  * @property {number|null} status   - last HTTP status
  * @property {string|null} error    - error message, or null
+ * @property {string|null} skipped  - reason the page was skipped (e.g. content-type), or null
  * @property {string|null} referrer - URL this entry was discovered on
  * @property {number} depth
  */
-const isProcessed = (entry) => entry.file !== null || entry.error !== null;
+const isProcessed = (entry) => entry.file !== null || entry.error !== null || entry.skipped !== null;
 /**
  * Owns the crawl queue: dedup, depth limiting, status tracking and durable
- * checkpointing. Persistence is debounced so a high-concurrency crawl does not
- * rewrite the queue file on every single URL.
+ * checkpointing. Status totals are tracked incrementally (O(1) reads) and
+ * persistence is debounced so a high-concurrency crawl does not rewrite the
+ * queue file on every single URL.
  */
 export class QueueManager {
   /** @param {{ config: import('../index.js').ResolvedConfig, logger: any }} deps */
@@ -32,16 +34,30 @@ export class QueueManager {
     /** @type {QueueEntry[]} */
     this._pending = [];
     this._cursor = 0;
+    this._crawled = 0;
+    this._errors = 0;
+    this._skipped = 0;
     this._dirty = false;
     this._timer = null;
     this._persistInterval = 1000;
   }
-  /** Loads any previously persisted queue and rebuilds the in-memory indexes. */
+  /** Loads any previously persisted queue and rebuilds the in-memory indexes and totals. */
   load() {
     this.entries = loadJSON(this.path, []) ?? [];
     this.index = new Set(this.entries.map((entry) => entry.url));
-    this._pending = this.entries.filter((entry) => !isProcessed(entry));
+    this._pending = [];
+    this._crawled = 0;
+    this._errors = 0;
+    this._skipped = 0;
+    for (const entry of this.entries) {
+      if (entry.file !== null) this._crawled += 1;
+      else if (entry.error !== null) this._errors += 1;
+      else if (entry.skipped !== null) this._skipped += 1;
+      else this._pending.push(entry);
+    }
     this._cursor = 0;
     return this.entries;
   }
@@ -59,7 +75,7 @@ export class QueueManager {
   add(url, { depth = 0, referrer = null } = {}) {
     if (this.index.has(url) || depth > this.maxDepth) return false;
-    const entry = { url, file: null, status: null, error: null, referrer, depth };
+    const entry = { url, file: null, status: null, error: null, skipped: null, referrer, depth };
     this.index.add(url);
     this.entries.push(entry);
     this._pending.push(entry);
@@ -76,29 +92,82 @@ export class QueueManager {
     entry.file = file;
     entry.status = status;
     entry.error = null;
+    entry.skipped = null;
+    this._crawled += 1;
     this._markDirty();
   }
   markError(entry, { error, status }) {
     entry.error = error;
     entry.status = status ?? null;
+    this._errors += 1;
+    this._markDirty();
+  }
+  markSkipped(entry, { reason, status }) {
+    entry.skipped = reason;
+    entry.status = status ?? null;
+    this._skipped += 1;
     this._markDirty();
   }
+  /**
+   * Returns matching terminal entries to the pending set so the next crawl
+   * retries them. Persists immediately so a fresh `load()` (e.g. at the start of
+   * `crawl()`) sees the requeued entries.
+   * @param {(entry: QueueEntry) => boolean} match
+   * @returns {number} how many entries were requeued
+   */
+  _requeue(match) {
+    let count = 0;
+    for (const entry of this.entries) {
+      if (!match(entry)) continue;
+      if (entry.error !== null) this._errors -= 1;
+      if (entry.skipped !== null) this._skipped -= 1;
+      entry.error = null;
+      entry.skipped = null;
+      entry.status = null;
+      this._pending.push(entry);
+      count += 1;
+    }
+    if (count > 0) this.flush();
+    return count;
+  }
+  /** Re-queues every errored entry for retry. @returns {number} */
+  requeueErrors() {
+    return this._requeue((entry) => entry.error !== null);
+  }
+  /**
+   * Re-queues every skipped entry for another attempt. Useful after widening
+   * `allowedContentTypes` (or changing `sites`) so previously skipped URLs are
+   * reconsidered. @returns {number}
+   */
+  requeueSkipped() {
+    return this._requeue((entry) => entry.skipped !== null);
+  }
   isAllProcessed() {
-    return this.entries.length > 0 && this.entries.every(isProcessed);
+    return this.entries.length > 0 && this.pendingCount() === 0;
   }
   pendingCount() {
-    return this.entries.filter((entry) => !isProcessed(entry)).length;
+    return this.entries.length - this._crawled - this._errors - this._skipped;
   }
   crawledCount() {
-    return this.entries.filter((entry) => entry.file !== null).length;
+    return this._crawled;
   }
   errorCount() {
-    return this.entries.filter((entry) => entry.error !== null).length;
+    return this._errors;
+  }
+  skippedCount() {
+    return this._skipped;
   }
   /** Clears in-memory state and removes the persisted queue file. */
@@ -107,6 +176,9 @@ export class QueueManager {
     this.index = new Set();
     this._pending = [];
     this._cursor = 0;
+    this._crawled = 0;
+    this._errors = 0;
+    this._skipped = 0;
     this._dirty = false;
     deletePath(this.path);
   }

package/src/core/retry.js CHANGED Viewed

@@ -1,4 +1,5 @@
 import { delay } from '../util/delay.js';
+import { RateLimitError } from './errors.js';
 /** Derives how long to wait (ms) from rate-limit headers, falling back to a default. */
 const computeWait = (headers = {}, fallback) => {
@@ -22,46 +23,53 @@ const computeWait = (headers = {}, fallback) => {
  * Wraps a fetch operation with retry and rate-limit handling shared by every
  * fetcher backend.
  *
- * @param {{ config: import('../index.js').ResolvedConfig, logger: any, onRateLimitExit: (code: number) => void }} deps
+ * Rate limiting (HTTP 429) is handled independently of the normal retry budget:
+ * when `rateLimit.exitOnLimit` is false the runner waits (honoring `retry-after`
+ * / `x-ratelimit-reset`) and retries until the host relents; otherwise it throws
+ * a `RateLimitError` so the crawl aborts cleanly and can be resumed later.
+ *
+ * @param {{ config: import('../index.js').ResolvedConfig, logger: any }} deps
  */
-export const createRetryRunner = ({ config, logger, onRateLimitExit }) => {
+export const createRetryRunner = ({ config, logger }) => {
   const { retry, rateLimit } = config;
-  const shouldRetry = async (error) => {
-    const status = error?.response?.status;
-    if (status === undefined) return true; // network/transport error
-    if (status === 429) {
-      if (rateLimit.exitOnLimit) return false; // run() handles the exit
-      const wait = computeWait(error.response.headers, rateLimit.fallbackDelay);
-      logger.warn(`Rate limited. Waiting ${Math.round(wait / 1000)}s before retrying...`);
-      await delay(wait);
-      return true;
-    }
-    return retry.statusCodes.includes(status);
-  };
   const run = async (fn) => {
-    for (let attempt = 0; ; attempt++) {
+    let attempt = 0;
+    for (;;) {
       try {
         return await fn();
       } catch (error) {
-        const canRetry = attempt < retry.max && (await shouldRetry(error));
-        if (canRetry) {
-          logger.info(`Retry ${attempt + 1}/${retry.max} -> ${error.message}`);
+        const status = error?.response?.status;
+        if (status === 429) {
+          if (rateLimit.exitOnLimit) {
+            logger.warn(`Rate limited. Aborting crawl (exitOnLimit) with code ${rateLimit.exitCode}.`);
+            throw new RateLimitError('Rate limited', {
+              code: rateLimit.exitCode,
+              headers: error.response.headers,
+              cause: error
+            });
+          }
+          const wait = computeWait(error.response.headers, rateLimit.fallbackDelay);
+          logger.warn(`Rate limited. Waiting ${Math.round(wait / 1000)}s before retrying...`);
+          await delay(wait);
+          continue; // rate-limit waits never consume the retry budget
+        }
+        const retriable = status === undefined || retry.statusCodes.includes(status);
+        if (retriable && attempt < retry.max) {
+          attempt += 1;
+          logger.info(`Retry ${attempt}/${retry.max} -> ${error.message}`);
           if (retry.delay > 0) await delay(retry.delay);
           continue;
         }
-        if (error?.response?.status === 429) {
-          logger.warn(`Force exiting with code ${rateLimit.exitCode} (rate limited).`);
-          onRateLimitExit(rateLimit.exitCode);
-        }
         throw error;
       }
     }
   };
-  return { run, shouldRetry };
+  return { run };
 };