@nitpicker/crawler 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. package/CHANGELOG.md +8 -0
  2. package/LICENSE +191 -0
  3. package/README.md +13 -0
  4. package/lib/archive/archive-accessor.d.ts +107 -0
  5. package/lib/archive/archive-accessor.js +264 -0
  6. package/lib/archive/archive.d.ts +174 -0
  7. package/lib/archive/archive.js +331 -0
  8. package/lib/archive/database.d.ts +207 -0
  9. package/lib/archive/database.js +972 -0
  10. package/lib/archive/debug.d.ts +8 -0
  11. package/lib/archive/debug.js +9 -0
  12. package/lib/archive/filesystem/append-text.d.ts +9 -0
  13. package/lib/archive/filesystem/append-text.js +14 -0
  14. package/lib/archive/filesystem/copy-dir-sync.d.ts +6 -0
  15. package/lib/archive/filesystem/copy-dir-sync.js +9 -0
  16. package/lib/archive/filesystem/copy-dir.d.ts +7 -0
  17. package/lib/archive/filesystem/copy-dir.js +13 -0
  18. package/lib/archive/filesystem/exists.d.ts +6 -0
  19. package/lib/archive/filesystem/exists.js +9 -0
  20. package/lib/archive/filesystem/get-file-list.d.ts +8 -0
  21. package/lib/archive/filesystem/get-file-list.js +12 -0
  22. package/lib/archive/filesystem/index.d.ts +17 -0
  23. package/lib/archive/filesystem/index.js +17 -0
  24. package/lib/archive/filesystem/is-dir.d.ts +6 -0
  25. package/lib/archive/filesystem/is-dir.js +10 -0
  26. package/lib/archive/filesystem/mkdir.d.ts +8 -0
  27. package/lib/archive/filesystem/mkdir.js +15 -0
  28. package/lib/archive/filesystem/output-json.d.ts +9 -0
  29. package/lib/archive/filesystem/output-json.js +14 -0
  30. package/lib/archive/filesystem/output-text.d.ts +11 -0
  31. package/lib/archive/filesystem/output-text.js +32 -0
  32. package/lib/archive/filesystem/read-json.d.ts +7 -0
  33. package/lib/archive/filesystem/read-json.js +11 -0
  34. package/lib/archive/filesystem/read-text.d.ts +6 -0
  35. package/lib/archive/filesystem/read-text.js +10 -0
  36. package/lib/archive/filesystem/readline.d.ts +11 -0
  37. package/lib/archive/filesystem/readline.js +26 -0
  38. package/lib/archive/filesystem/remove.d.ts +5 -0
  39. package/lib/archive/filesystem/remove.js +10 -0
  40. package/lib/archive/filesystem/rename.d.ts +11 -0
  41. package/lib/archive/filesystem/rename.js +18 -0
  42. package/lib/archive/filesystem/tar.d.ts +11 -0
  43. package/lib/archive/filesystem/tar.js +22 -0
  44. package/lib/archive/filesystem/untar.d.ts +20 -0
  45. package/lib/archive/filesystem/untar.js +24 -0
  46. package/lib/archive/filesystem/utils.d.ts +109 -0
  47. package/lib/archive/filesystem/utils.js +185 -0
  48. package/lib/archive/filesystem/zip.d.ts +29 -0
  49. package/lib/archive/filesystem/zip.js +53 -0
  50. package/lib/archive/index.d.ts +6 -0
  51. package/lib/archive/index.js +11 -0
  52. package/lib/archive/page.d.ts +263 -0
  53. package/lib/archive/page.js +316 -0
  54. package/lib/archive/resource.d.ts +46 -0
  55. package/lib/archive/resource.js +62 -0
  56. package/lib/archive/safe-path.d.ts +9 -0
  57. package/lib/archive/safe-path.js +17 -0
  58. package/lib/archive/types.d.ts +210 -0
  59. package/lib/archive/types.js +1 -0
  60. package/lib/crawler/clear-destination-cache.d.ts +5 -0
  61. package/lib/crawler/clear-destination-cache.js +8 -0
  62. package/lib/crawler/crawler.d.ts +73 -0
  63. package/lib/crawler/crawler.js +748 -0
  64. package/lib/crawler/decompose-url.d.ts +25 -0
  65. package/lib/crawler/decompose-url.js +71 -0
  66. package/lib/crawler/destination-cache.d.ts +7 -0
  67. package/lib/crawler/destination-cache.js +6 -0
  68. package/lib/crawler/detect-pagination-pattern.d.ts +16 -0
  69. package/lib/crawler/detect-pagination-pattern.js +61 -0
  70. package/lib/crawler/fetch-destination.d.ts +38 -0
  71. package/lib/crawler/fetch-destination.js +208 -0
  72. package/lib/crawler/fetch-robots-txt.d.ts +42 -0
  73. package/lib/crawler/fetch-robots-txt.js +44 -0
  74. package/lib/crawler/find-best-matching-scope.d.ts +12 -0
  75. package/lib/crawler/find-best-matching-scope.js +46 -0
  76. package/lib/crawler/generate-predicted-urls.d.ts +13 -0
  77. package/lib/crawler/generate-predicted-urls.js +27 -0
  78. package/lib/crawler/handle-ignore-and-skip.d.ts +16 -0
  79. package/lib/crawler/handle-ignore-and-skip.js +19 -0
  80. package/lib/crawler/handle-resource-response.d.ts +13 -0
  81. package/lib/crawler/handle-resource-response.js +16 -0
  82. package/lib/crawler/handle-scrape-end.d.ts +24 -0
  83. package/lib/crawler/handle-scrape-end.js +82 -0
  84. package/lib/crawler/handle-scrape-error.d.ts +37 -0
  85. package/lib/crawler/handle-scrape-error.js +38 -0
  86. package/lib/crawler/index.d.ts +2 -0
  87. package/lib/crawler/index.js +2 -0
  88. package/lib/crawler/inject-scope-auth.d.ts +11 -0
  89. package/lib/crawler/inject-scope-auth.js +21 -0
  90. package/lib/crawler/is-external-url.d.ts +11 -0
  91. package/lib/crawler/is-external-url.js +12 -0
  92. package/lib/crawler/is-in-any-lower-layer.d.ts +13 -0
  93. package/lib/crawler/is-in-any-lower-layer.js +15 -0
  94. package/lib/crawler/link-list.d.ts +112 -0
  95. package/lib/crawler/link-list.js +248 -0
  96. package/lib/crawler/link-to-page-data.d.ts +14 -0
  97. package/lib/crawler/link-to-page-data.js +32 -0
  98. package/lib/crawler/net-timeout-error.d.ts +9 -0
  99. package/lib/crawler/net-timeout-error.js +11 -0
  100. package/lib/crawler/network.d.ts +30 -0
  101. package/lib/crawler/network.js +226 -0
  102. package/lib/crawler/protocol-agnostic-key.d.ts +9 -0
  103. package/lib/crawler/protocol-agnostic-key.js +11 -0
  104. package/lib/crawler/reconstruct-url.d.ts +10 -0
  105. package/lib/crawler/reconstruct-url.js +28 -0
  106. package/lib/crawler/result-handler.d.ts +118 -0
  107. package/lib/crawler/result-handler.js +153 -0
  108. package/lib/crawler/robots-checker.d.ts +26 -0
  109. package/lib/crawler/robots-checker.js +62 -0
  110. package/lib/crawler/should-discard-predicted.d.ts +14 -0
  111. package/lib/crawler/should-discard-predicted.js +31 -0
  112. package/lib/crawler/should-skip-url.d.ts +23 -0
  113. package/lib/crawler/should-skip-url.js +15 -0
  114. package/lib/crawler/speculative-pagination.d.ts +52 -0
  115. package/lib/crawler/speculative-pagination.js +215 -0
  116. package/lib/crawler/types.d.ts +119 -0
  117. package/lib/crawler/types.js +1 -0
  118. package/lib/crawler/url-filter.d.ts +56 -0
  119. package/lib/crawler/url-filter.js +110 -0
  120. package/lib/crawler-orchestrator.d.ts +142 -0
  121. package/lib/crawler-orchestrator.js +309 -0
  122. package/lib/debug.d.ts +8 -0
  123. package/lib/debug.js +9 -0
  124. package/lib/index.d.ts +16 -0
  125. package/lib/index.js +18 -0
  126. package/lib/qzilla.d.ts +136 -0
  127. package/lib/qzilla.js +292 -0
  128. package/lib/types.d.ts +27 -0
  129. package/lib/types.js +1 -0
  130. package/lib/utils/array/each-splitted.d.ts +10 -0
  131. package/lib/utils/array/each-splitted.js +14 -0
  132. package/lib/utils/array/index.d.ts +1 -0
  133. package/lib/utils/array/index.js +1 -0
  134. package/lib/utils/async/index.d.ts +1 -0
  135. package/lib/utils/async/index.js +1 -0
  136. package/lib/utils/debug.d.ts +5 -0
  137. package/lib/utils/debug.js +5 -0
  138. package/lib/utils/error/dom-evaluation-error.d.ts +7 -0
  139. package/lib/utils/error/dom-evaluation-error.js +7 -0
  140. package/lib/utils/error/error-emitter.d.ts +18 -0
  141. package/lib/utils/error/error-emitter.js +29 -0
  142. package/lib/utils/error/index.d.ts +3 -0
  143. package/lib/utils/error/index.js +2 -0
  144. package/lib/utils/event-emitter/index.d.ts +6 -0
  145. package/lib/utils/event-emitter/index.js +6 -0
  146. package/lib/utils/index.d.ts +5 -0
  147. package/lib/utils/index.js +5 -0
  148. package/lib/utils/network/index.d.ts +1 -0
  149. package/lib/utils/network/index.js +1 -0
  150. package/lib/utils/object/clean-object.d.ts +8 -0
  151. package/lib/utils/object/clean-object.js +13 -0
  152. package/lib/utils/object/index.d.ts +1 -0
  153. package/lib/utils/object/index.js +1 -0
  154. package/lib/utils/path/index.d.ts +1 -0
  155. package/lib/utils/path/index.js +1 -0
  156. package/lib/utils/path/safe-filepath.d.ts +7 -0
  157. package/lib/utils/path/safe-filepath.js +12 -0
  158. package/lib/utils/regexp/index.d.ts +1 -0
  159. package/lib/utils/regexp/index.js +1 -0
  160. package/lib/utils/retryable/index.d.ts +2 -0
  161. package/lib/utils/retryable/index.js +1 -0
  162. package/lib/utils/sort/index.d.ts +14 -0
  163. package/lib/utils/sort/index.js +61 -0
  164. package/lib/utils/sort/remove-matches.d.ts +9 -0
  165. package/lib/utils/sort/remove-matches.js +23 -0
  166. package/lib/utils/types/index.d.ts +1 -0
  167. package/lib/utils/types/index.js +1 -0
  168. package/lib/utils/types/types.d.ts +46 -0
  169. package/lib/utils/types/types.js +1 -0
  170. package/lib/utils/url/index.d.ts +5 -0
  171. package/lib/utils/url/index.js +5 -0
  172. package/lib/utils/url/is-lower-layer.d.ts +15 -0
  173. package/lib/utils/url/is-lower-layer.js +55 -0
  174. package/lib/utils/url/parse-url.d.ts +11 -0
  175. package/lib/utils/url/parse-url.js +20 -0
  176. package/lib/utils/url/path-match.d.ts +11 -0
  177. package/lib/utils/url/path-match.js +18 -0
  178. package/lib/utils/url/sort-url.d.ts +10 -0
  179. package/lib/utils/url/sort-url.js +24 -0
  180. package/lib/utils/url/url-partial-match.d.ts +11 -0
  181. package/lib/utils/url/url-partial-match.js +32 -0
  182. package/package.json +49 -0
  183. package/src/archive/__mock__/.gitignore +3 -0
  184. package/src/archive/__mock__/mock.sqlite +0 -0
  185. package/src/archive/archive-accessor.ts +337 -0
  186. package/src/archive/archive.ts +408 -0
  187. package/src/archive/database.spec.ts +469 -0
  188. package/src/archive/database.ts +1059 -0
  189. package/src/archive/debug.ts +10 -0
  190. package/src/archive/filesystem/append-text.spec.ts +26 -0
  191. package/src/archive/filesystem/append-text.ts +16 -0
  192. package/src/archive/filesystem/copy-dir-sync.spec.ts +27 -0
  193. package/src/archive/filesystem/copy-dir-sync.ts +10 -0
  194. package/src/archive/filesystem/copy-dir.spec.ts +33 -0
  195. package/src/archive/filesystem/copy-dir.ts +14 -0
  196. package/src/archive/filesystem/exists.spec.ts +33 -0
  197. package/src/archive/filesystem/exists.ts +10 -0
  198. package/src/archive/filesystem/get-file-list.spec.ts +37 -0
  199. package/src/archive/filesystem/get-file-list.ts +13 -0
  200. package/src/archive/filesystem/index.ts +17 -0
  201. package/src/archive/filesystem/is-dir.spec.ts +29 -0
  202. package/src/archive/filesystem/is-dir.ts +11 -0
  203. package/src/archive/filesystem/mkdir.spec.ts +37 -0
  204. package/src/archive/filesystem/mkdir.ts +16 -0
  205. package/src/archive/filesystem/output-json.spec.ts +34 -0
  206. package/src/archive/filesystem/output-json.ts +16 -0
  207. package/src/archive/filesystem/output-text.spec.ts +31 -0
  208. package/src/archive/filesystem/output-text.ts +35 -0
  209. package/src/archive/filesystem/read-json.spec.ts +26 -0
  210. package/src/archive/filesystem/read-json.ts +12 -0
  211. package/src/archive/filesystem/read-text.spec.ts +25 -0
  212. package/src/archive/filesystem/read-text.ts +11 -0
  213. package/src/archive/filesystem/readline.spec.ts +29 -0
  214. package/src/archive/filesystem/readline.ts +30 -0
  215. package/src/archive/filesystem/remove.spec.ts +34 -0
  216. package/src/archive/filesystem/remove.ts +11 -0
  217. package/src/archive/filesystem/rename.spec.ts +46 -0
  218. package/src/archive/filesystem/rename.ts +21 -0
  219. package/src/archive/filesystem/tar.spec.ts +33 -0
  220. package/src/archive/filesystem/tar.ts +27 -0
  221. package/src/archive/filesystem/untar.spec.ts +34 -0
  222. package/src/archive/filesystem/untar.ts +36 -0
  223. package/src/archive/index.ts +13 -0
  224. package/src/archive/page.spec.ts +368 -0
  225. package/src/archive/page.ts +420 -0
  226. package/src/archive/resource.spec.ts +101 -0
  227. package/src/archive/resource.ts +73 -0
  228. package/src/archive/safe-path.spec.ts +44 -0
  229. package/src/archive/safe-path.ts +18 -0
  230. package/src/archive/types.ts +227 -0
  231. package/src/crawler/clear-destination-cache.spec.ts +20 -0
  232. package/src/crawler/clear-destination-cache.ts +9 -0
  233. package/src/crawler/crawler.ts +873 -0
  234. package/src/crawler/decompose-url.spec.ts +48 -0
  235. package/src/crawler/decompose-url.ts +90 -0
  236. package/src/crawler/destination-cache.spec.ts +23 -0
  237. package/src/crawler/destination-cache.ts +8 -0
  238. package/src/crawler/detect-pagination-pattern.spec.ts +169 -0
  239. package/src/crawler/detect-pagination-pattern.ts +66 -0
  240. package/src/crawler/fetch-destination.ts +257 -0
  241. package/src/crawler/fetch-robots-txt.spec.ts +83 -0
  242. package/src/crawler/fetch-robots-txt.ts +91 -0
  243. package/src/crawler/find-best-matching-scope.spec.ts +39 -0
  244. package/src/crawler/find-best-matching-scope.ts +57 -0
  245. package/src/crawler/generate-predicted-urls.spec.ts +42 -0
  246. package/src/crawler/generate-predicted-urls.ts +34 -0
  247. package/src/crawler/handle-ignore-and-skip.spec.ts +66 -0
  248. package/src/crawler/handle-ignore-and-skip.ts +30 -0
  249. package/src/crawler/handle-resource-response.spec.ts +45 -0
  250. package/src/crawler/handle-resource-response.ts +21 -0
  251. package/src/crawler/handle-scrape-end.spec.ts +109 -0
  252. package/src/crawler/handle-scrape-end.ts +115 -0
  253. package/src/crawler/handle-scrape-error.spec.ts +105 -0
  254. package/src/crawler/handle-scrape-error.ts +58 -0
  255. package/src/crawler/index.ts +2 -0
  256. package/src/crawler/inject-scope-auth.spec.ts +36 -0
  257. package/src/crawler/inject-scope-auth.ts +27 -0
  258. package/src/crawler/is-external-url.spec.ts +31 -0
  259. package/src/crawler/is-external-url.ts +17 -0
  260. package/src/crawler/is-in-any-lower-layer.spec.ts +31 -0
  261. package/src/crawler/is-in-any-lower-layer.ts +22 -0
  262. package/src/crawler/link-list.spec.ts +355 -0
  263. package/src/crawler/link-list.ts +275 -0
  264. package/src/crawler/link-to-page-data.spec.ts +133 -0
  265. package/src/crawler/link-to-page-data.ts +34 -0
  266. package/src/crawler/net-timeout-error.spec.ts +25 -0
  267. package/src/crawler/net-timeout-error.ts +11 -0
  268. package/src/crawler/protocol-agnostic-key.spec.ts +40 -0
  269. package/src/crawler/protocol-agnostic-key.ts +11 -0
  270. package/src/crawler/reconstruct-url.spec.ts +37 -0
  271. package/src/crawler/reconstruct-url.ts +37 -0
  272. package/src/crawler/robots-checker.spec.ts +104 -0
  273. package/src/crawler/robots-checker.ts +73 -0
  274. package/src/crawler/should-discard-predicted.spec.ts +125 -0
  275. package/src/crawler/should-discard-predicted.ts +33 -0
  276. package/src/crawler/should-skip-url.spec.ts +77 -0
  277. package/src/crawler/should-skip-url.ts +37 -0
  278. package/src/crawler/types.ts +146 -0
  279. package/src/crawler-orchestrator.ts +401 -0
  280. package/src/debug.ts +10 -0
  281. package/src/index.ts +25 -0
  282. package/src/types.ts +30 -0
  283. package/src/utils/array/each-splitted.spec.ts +38 -0
  284. package/src/utils/array/each-splitted.ts +19 -0
  285. package/src/utils/array/index.ts +1 -0
  286. package/src/utils/debug.ts +6 -0
  287. package/src/utils/error/dom-evaluation-error.spec.ts +20 -0
  288. package/src/utils/error/dom-evaluation-error.ts +6 -0
  289. package/src/utils/error/error-emitter.spec.ts +78 -0
  290. package/src/utils/error/error-emitter.ts +44 -0
  291. package/src/utils/error/index.ts +3 -0
  292. package/src/utils/index.ts +5 -0
  293. package/src/utils/object/clean-object.spec.ts +24 -0
  294. package/src/utils/object/clean-object.ts +13 -0
  295. package/src/utils/object/index.ts +1 -0
  296. package/src/utils/types/index.ts +1 -0
  297. package/src/utils/types/types.ts +65 -0
  298. package/tsconfig.json +11 -0
  299. package/tsconfig.tsbuildinfo +1 -0
@@ -0,0 +1,25 @@
1
+ /**
2
+ * Intermediate representation of a URL split into comparable tokens.
3
+ * Used by pagination detection to identify which token changed between two URLs.
4
+ */
5
+ export interface DecomposedUrl {
6
+ /** Hostname including port (e.g. `"example.com:8080"`). */
7
+ host: string;
8
+ /** Path segments split by `/` (e.g. `["page", "2"]` for `/page/2`). */
9
+ pathSegments: string[];
10
+ /** Sorted query parameter keys. */
11
+ queryKeys: string[];
12
+ /** Query parameter values sorted by their corresponding key. */
13
+ queryValues: string[];
14
+ /** Protocol prefix (e.g. `"https:"`) or empty string if protocol-agnostic. */
15
+ protocol: string;
16
+ }
17
+ /**
18
+ * Decomposes a URL string into its constituent tokens for comparison.
19
+ * Handles both full URLs (`https://host/path?q=v`) and protocol-agnostic
20
+ * URLs (`//host/path?q=v`). Query parameters are sorted by key for
21
+ * consistent comparison.
22
+ * @param url - The URL string to decompose
23
+ * @returns The decomposed URL, or `null` if the format is invalid
24
+ */
25
+ export declare function decomposeUrl(url: string): DecomposedUrl | null;
@@ -0,0 +1,71 @@
1
+ /**
2
+ * Decomposes a URL string into its constituent tokens for comparison.
3
+ * Handles both full URLs (`https://host/path?q=v`) and protocol-agnostic
4
+ * URLs (`//host/path?q=v`). Query parameters are sorted by key for
5
+ * consistent comparison.
6
+ * @param url - The URL string to decompose
7
+ * @returns The decomposed URL, or `null` if the format is invalid
8
+ */
9
+ export function decomposeUrl(url) {
10
+ // URL format: //host/path?query or //host?query (protocol-agnostic)
11
+ // Also handle protocol://host/path?query
12
+ let work = url;
13
+ let protocol = '';
14
+ // Strip protocol
15
+ const protoMatch = /^(https?:)?\/\//.exec(work);
16
+ if (!protoMatch)
17
+ return null;
18
+ protocol = protoMatch[1] ?? '';
19
+ work = work.slice(protoMatch[0].length);
20
+ // Split host from rest
21
+ const slashIdx = work.indexOf('/');
22
+ const qmarkIdx = work.indexOf('?');
23
+ let host;
24
+ let pathPart;
25
+ let queryPart;
26
+ if (slashIdx === -1 && qmarkIdx === -1) {
27
+ host = work;
28
+ pathPart = '';
29
+ queryPart = '';
30
+ }
31
+ else if (slashIdx === -1) {
32
+ host = work.slice(0, qmarkIdx);
33
+ pathPart = '';
34
+ queryPart = work.slice(qmarkIdx + 1);
35
+ }
36
+ else {
37
+ host = work.slice(0, slashIdx);
38
+ const pathAndQuery = work.slice(slashIdx + 1);
39
+ const pq = pathAndQuery.indexOf('?');
40
+ if (pq === -1) {
41
+ pathPart = pathAndQuery;
42
+ queryPart = '';
43
+ }
44
+ else {
45
+ pathPart = pathAndQuery.slice(0, pq);
46
+ queryPart = pathAndQuery.slice(pq + 1);
47
+ }
48
+ }
49
+ const pathSegments = pathPart ? pathPart.split('/') : [];
50
+ // Parse query into sorted key-value pairs
51
+ const queryPairs = [];
52
+ if (queryPart) {
53
+ for (const pair of queryPart.split('&')) {
54
+ const eqIdx = pair.indexOf('=');
55
+ if (eqIdx === -1) {
56
+ queryPairs.push([pair, '']);
57
+ }
58
+ else {
59
+ queryPairs.push([pair.slice(0, eqIdx), pair.slice(eqIdx + 1)]);
60
+ }
61
+ }
62
+ }
63
+ queryPairs.sort((a, b) => a[0].localeCompare(b[0]));
64
+ return {
65
+ host,
66
+ pathSegments,
67
+ queryKeys: queryPairs.map(([k]) => k),
68
+ queryValues: queryPairs.map(([, v]) => v),
69
+ protocol,
70
+ };
71
+ }
@@ -0,0 +1,7 @@
1
+ import type { PageData } from '@d-zero/beholder';
2
+ /**
3
+ * In-memory cache of HEAD request results keyed by URL (without hash).
4
+ * Stores either the successful {@link PageData} or the {@link Error} to avoid
5
+ * repeated requests to the same destination.
6
+ */
7
+ export declare const destinationCache: Map<string, Error | PageData>;
@@ -0,0 +1,6 @@
1
+ /**
2
+ * In-memory cache of HEAD request results keyed by URL (without hash).
3
+ * Stores either the successful {@link PageData} or the {@link Error} to avoid
4
+ * repeated requests to the same destination.
5
+ */
6
+ export const destinationCache = new Map();
@@ -0,0 +1,16 @@
1
+ import type { PaginationPattern } from './types.js';
2
+ /**
3
+ * Compares two consecutive URL strings and detects a single-token numeric
4
+ * pagination pattern (e.g. `/page/1` → `/page/2`, or `?p=1` → `?p=2`).
5
+ *
6
+ * The algorithm decomposes each URL into tokens (path segments + sorted query values),
7
+ * then checks that exactly one token differs and both values are integers with a
8
+ * positive step. Returns `null` when no pattern is detected.
9
+ *
10
+ * WHY single-token constraint: Multi-token differences (e.g. both path and query
11
+ * changing) indicate different routes rather than pagination, so they are rejected.
12
+ * @param prevUrl - The previously pushed URL (protocol-agnostic, without hash/auth)
13
+ * @param currentUrl - The newly discovered URL
14
+ * @returns The detected pattern, or `null` if no pagination pattern was found
15
+ */
16
+ export declare function detectPaginationPattern(prevUrl: string, currentUrl: string): PaginationPattern | null;
@@ -0,0 +1,61 @@
1
+ import { decomposeUrl } from './decompose-url.js';
2
+ /**
3
+ * Compares two consecutive URL strings and detects a single-token numeric
4
+ * pagination pattern (e.g. `/page/1` → `/page/2`, or `?p=1` → `?p=2`).
5
+ *
6
+ * The algorithm decomposes each URL into tokens (path segments + sorted query values),
7
+ * then checks that exactly one token differs and both values are integers with a
8
+ * positive step. Returns `null` when no pattern is detected.
9
+ *
10
+ * WHY single-token constraint: Multi-token differences (e.g. both path and query
11
+ * changing) indicate different routes rather than pagination, so they are rejected.
12
+ * @param prevUrl - The previously pushed URL (protocol-agnostic, without hash/auth)
13
+ * @param currentUrl - The newly discovered URL
14
+ * @returns The detected pattern, or `null` if no pagination pattern was found
15
+ */
16
+ export function detectPaginationPattern(prevUrl, currentUrl) {
17
+ const prev = decomposeUrl(prevUrl);
18
+ const curr = decomposeUrl(currentUrl);
19
+ if (!prev || !curr)
20
+ return null;
21
+ // Host (including port) must match
22
+ if (prev.host !== curr.host)
23
+ return null;
24
+ // Path segment count must match
25
+ if (prev.pathSegments.length !== curr.pathSegments.length)
26
+ return null;
27
+ // Query key sets must match in count and identity
28
+ if (prev.queryKeys.length !== curr.queryKeys.length)
29
+ return null;
30
+ for (let i = 0; i < prev.queryKeys.length; i++) {
31
+ if (prev.queryKeys[i] !== curr.queryKeys[i])
32
+ return null;
33
+ }
34
+ // Build combined token arrays: path segments + query values (sorted by key)
35
+ const prevTokens = [...prev.pathSegments, ...prev.queryValues];
36
+ const currTokens = [...curr.pathSegments, ...curr.queryValues];
37
+ let diffIndex = -1;
38
+ for (const [i, prevToken] of prevTokens.entries()) {
39
+ if (prevToken !== currTokens[i]) {
40
+ if (diffIndex !== -1)
41
+ return null; // more than one difference
42
+ diffIndex = i;
43
+ }
44
+ }
45
+ if (diffIndex === -1)
46
+ return null; // identical URLs
47
+ const prevNum = Number(prevTokens[diffIndex]);
48
+ const currNum = Number(currTokens[diffIndex]);
49
+ if (!Number.isFinite(prevNum) || !Number.isFinite(currNum))
50
+ return null;
51
+ if (!Number.isInteger(prevNum) || !Number.isInteger(currNum))
52
+ return null;
53
+ const step = currNum - prevNum;
54
+ if (step <= 0)
55
+ return null;
56
+ return {
57
+ tokenIndex: diffIndex,
58
+ step,
59
+ currentNumber: currNum,
60
+ };
61
+ }
@@ -0,0 +1,38 @@
1
+ import type { PageData } from '@d-zero/beholder';
2
+ import type { ExURL } from '@d-zero/shared/parse-url';
3
+ /**
4
+ * Parameters for {@link fetchDestination}.
5
+ */
6
+ export interface FetchDestinationParams {
7
+ /** The extended URL to fetch. */
8
+ readonly url: ExURL;
9
+ /** Whether the URL is external to the crawl scope. */
10
+ readonly isExternal: boolean;
11
+ /** The HTTP method to use. Defaults to `"HEAD"`. */
12
+ readonly method?: string;
13
+ /** Additional options. */
14
+ readonly options?: {
15
+ /**
16
+ * When set, forces a GET request and reads up to this many bytes from
17
+ * the response body to extract an HTML `<title>` tag.
18
+ */
19
+ titleBytesLimit?: number;
20
+ };
21
+ /** User-Agent string to send with the request. */
22
+ readonly userAgent?: string;
23
+ }
24
+ /**
25
+ * Fetches the destination metadata for a URL using an HTTP HEAD request (or GET as fallback).
26
+ *
27
+ * Results are cached in memory so that repeated calls for the same URL
28
+ * (without hash) return immediately. The request races against a 10-second
29
+ * timeout; if the server does not respond in time, a {@link NetTimeoutError} is thrown.
30
+ *
31
+ * If the server returns 405 (Method Not Allowed), 501 (Not Implemented), or 503
32
+ * (Service Unavailable) for a HEAD request, the function automatically retries with GET.
33
+ * @param params - Parameters containing URL, external flag, method, options, and optional User-Agent.
34
+ * @returns The page metadata obtained from the HTTP response.
35
+ * @throws {NetTimeoutError} If the request exceeds the 10-second timeout.
36
+ * @throws {Error} If the HTTP request fails for any other reason.
37
+ */
38
+ export declare function fetchDestination(params: FetchDestinationParams): Promise<PageData>;
@@ -0,0 +1,208 @@
1
+ import { delay } from '@d-zero/shared/delay';
2
+ import redirects from 'follow-redirects';
3
+ import { destinationCache } from './destination-cache.js';
4
+ import NetTimeoutError from './net-timeout-error.js';
5
+ /**
6
+ * Fetches the destination metadata for a URL using an HTTP HEAD request (or GET as fallback).
7
+ *
8
+ * Results are cached in memory so that repeated calls for the same URL
9
+ * (without hash) return immediately. The request races against a 10-second
10
+ * timeout; if the server does not respond in time, a {@link NetTimeoutError} is thrown.
11
+ *
12
+ * If the server returns 405 (Method Not Allowed), 501 (Not Implemented), or 503
13
+ * (Service Unavailable) for a HEAD request, the function automatically retries with GET.
14
+ * @param params - Parameters containing URL, external flag, method, options, and optional User-Agent.
15
+ * @returns The page metadata obtained from the HTTP response.
16
+ * @throws {NetTimeoutError} If the request exceeds the 10-second timeout.
17
+ * @throws {Error} If the HTTP request fails for any other reason.
18
+ */
19
+ export async function fetchDestination(params) {
20
+ const { url, isExternal, method = 'HEAD', options, userAgent } = params;
21
+ const titleBytesLimit = options?.titleBytesLimit;
22
+ const cacheKey = titleBytesLimit == null ? url.withoutHash : `${url.withoutHash}:title`;
23
+ if (destinationCache.has(cacheKey)) {
24
+ const cache = destinationCache.get(cacheKey);
25
+ if (cache instanceof Error) {
26
+ throw cache;
27
+ }
28
+ return cache;
29
+ }
30
+ const effectiveMethod = titleBytesLimit == null ? method : 'GET';
31
+ const result = await Promise.race([
32
+ _fetchHead(url, isExternal, effectiveMethod, titleBytesLimit, userAgent).catch((error) => (error instanceof Error ? error : new Error(String(error)))),
33
+ (async () => {
34
+ await delay(10 * 1000);
35
+ return new NetTimeoutError(url.href);
36
+ })(),
37
+ ]);
38
+ destinationCache.set(cacheKey, result);
39
+ if (result instanceof Error) {
40
+ throw result;
41
+ }
42
+ return result;
43
+ }
44
+ /**
45
+ * Performs the actual HTTP request to retrieve page metadata.
46
+ *
47
+ * Handles both HTTP and HTTPS protocols via `follow-redirects`, tracks redirect chains,
48
+ * and falls back to GET on certain status codes (405, 501, 503).
49
+ * @param url - The extended URL to request.
50
+ * @param isExternal - Whether the URL is external to the crawl scope.
51
+ * @param method - The HTTP method (`"HEAD"` or `"GET"`).
52
+ * @param titleBytesLimit - When set, reads up to this many bytes from the response body
53
+ * to extract a `<title>` tag, then destroys the connection.
54
+ * @param userAgent - Optional User-Agent string to send with the request.
55
+ * @returns A promise resolving to {@link PageData} with response metadata.
56
+ */
57
+ async function _fetchHead(url, isExternal, method, titleBytesLimit, userAgent) {
58
+ return new Promise((resolve, reject) => {
59
+ const hostHeader = url.port ? `${url.hostname}:${url.port}` : url.hostname;
60
+ const request = {
61
+ protocol: url.protocol,
62
+ hostname: url.hostname,
63
+ port: url.port || undefined,
64
+ path: url.pathname,
65
+ method,
66
+ headers: {
67
+ host: hostHeader,
68
+ ...(userAgent ? { 'User-Agent': userAgent } : {}),
69
+ Connection: 'keep-alive',
70
+ Pragma: 'no-cache',
71
+ 'Cache-Control': 'no-cache',
72
+ 'Upgrade-Insecure-Requests': 1,
73
+ Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
74
+ 'Accept-Encoding': 'gzip, deflate',
75
+ 'Accept-Language': 'ja,en;q=0.9,zh;q=0.8,en-US;q=0.7,pl;q=0.6,de;q=0.5,zh-CN;q=0.4,zh-TW;q=0.3,th;q=0.2,ko;q=0.1,fr;q=0.1',
76
+ // Range: url.extname?.toLowerCase() === 'pdf' ? 'bytes=0-0' : undefined,
77
+ },
78
+ };
79
+ if (url.username && url.password) {
80
+ request.auth = `${url.username}:${url.password}`;
81
+ }
82
+ let req;
83
+ let destroyed = false;
84
+ const response = (res) => {
85
+ const chunks = [];
86
+ let totalBytes = 0;
87
+ let settled = false;
88
+ const buildPageData = (title) => {
89
+ const redirectPaths = res.redirects.map((r) => r.url);
90
+ const _contentLength = Number.parseInt(res.headers['content-length'] || '');
91
+ const contentLength = Number.isFinite(_contentLength) ? _contentLength : null;
92
+ return {
93
+ url,
94
+ isTarget: !isExternal,
95
+ isExternal,
96
+ redirectPaths,
97
+ status: res.statusCode || 0,
98
+ statusText: res.statusMessage || '',
99
+ contentType: res.headers['content-type']?.split(';')[0] || null,
100
+ contentLength,
101
+ responseHeaders: res.headers,
102
+ meta: { title },
103
+ imageList: [],
104
+ anchorList: [],
105
+ html: '',
106
+ isSkipped: false,
107
+ };
108
+ };
109
+ if (titleBytesLimit == null) {
110
+ res.on('data', () => { });
111
+ res.on('end', async () => {
112
+ let rep = buildPageData('');
113
+ if (rep.status === 405) {
114
+ if (method === 'GET') {
115
+ reject(new Error(`Method Not Allowed: ${url.href} ${rep.statusText}`));
116
+ return;
117
+ }
118
+ try {
119
+ rep = await fetchDestination({ url, isExternal, method: 'GET' });
120
+ }
121
+ catch (error) {
122
+ reject(error);
123
+ return;
124
+ }
125
+ }
126
+ if (rep.status === 501) {
127
+ if (method === 'GET') {
128
+ reject(new Error(`Method Not Implemented: ${url.href} ${rep.statusText}`));
129
+ return;
130
+ }
131
+ await delay(5 * 1000);
132
+ try {
133
+ rep = await fetchDestination({ url, isExternal, method: 'GET' });
134
+ }
135
+ catch (error) {
136
+ reject(error);
137
+ return;
138
+ }
139
+ }
140
+ if (rep.status === 503) {
141
+ if (method === 'GET') {
142
+ reject(new Error(`Retrying failed: ${url.href} ${rep.statusText}`));
143
+ return;
144
+ }
145
+ await delay(5 * 1000);
146
+ try {
147
+ rep = await fetchDestination({ url, isExternal, method: 'GET' });
148
+ }
149
+ catch (error) {
150
+ reject(error);
151
+ return;
152
+ }
153
+ }
154
+ resolve(rep);
155
+ });
156
+ }
157
+ else {
158
+ res.on('data', (chunk) => {
159
+ if (settled)
160
+ return;
161
+ chunks.push(chunk);
162
+ totalBytes += chunk.length;
163
+ // Check for title in accumulated data so far
164
+ const body = Buffer.concat(chunks).toString('utf8');
165
+ const titleMatch = /<title[^>]*>([\s\S]*?)<\/title>/i.exec(body);
166
+ if (titleMatch) {
167
+ settled = true;
168
+ const title = titleMatch[1]?.trim() ?? '';
169
+ resolve(buildPageData(title));
170
+ destroyed = true;
171
+ req.destroy();
172
+ return;
173
+ }
174
+ // Reached byte limit without finding title
175
+ if (totalBytes >= titleBytesLimit) {
176
+ settled = true;
177
+ resolve(buildPageData(''));
178
+ destroyed = true;
179
+ req.destroy();
180
+ }
181
+ });
182
+ res.on('end', () => {
183
+ if (settled)
184
+ return;
185
+ settled = true;
186
+ // Stream ended before limit — try to extract title from what we have
187
+ const body = Buffer.concat(chunks).toString('utf8');
188
+ const titleMatch = /<title[^>]*>([\s\S]*?)<\/title>/i.exec(body);
189
+ const title = titleMatch?.[1]?.trim() ?? '';
190
+ resolve(buildPageData(title));
191
+ });
192
+ }
193
+ };
194
+ if (url.protocol === 'https:') {
195
+ req = redirects.https.request(request, response);
196
+ }
197
+ else {
198
+ req = redirects.http.request(request, response);
199
+ }
200
+ req.on('error', (error) => {
201
+ // Ignore errors caused by intentional req.destroy()
202
+ if (destroyed)
203
+ return;
204
+ reject(error);
205
+ });
206
+ req.end();
207
+ });
208
+ }
@@ -0,0 +1,42 @@
1
/**
 * Result of parsing a robots.txt file.
 *
 * Mirrors the subset of the `robots-parser` package's API that this
 * crawler uses; instances are produced by {@link fetchRobotsTxt}.
 */
interface RobotsResult {
    /**
     * Check if a URL is allowed for a given user-agent.
     * @param url - The URL to check.
     * @param ua - The user-agent string to match against.
     * @returns `true` if allowed, `false` if disallowed, `undefined` if no matching rule.
     */
    isAllowed(url: string, ua?: string): boolean | undefined;
    /**
     * Check if a URL is disallowed for a given user-agent.
     * @param url - The URL to check.
     * @param ua - The user-agent string to match against.
     * @returns `true` if disallowed, `false` if allowed, `undefined` if no matching rule.
     */
    isDisallowed(url: string, ua?: string): boolean | undefined;
    /**
     * Get the crawl delay for a given user-agent.
     * @param ua - The user-agent string to match against.
     * @returns The crawl delay in seconds, or `undefined` if not specified.
     */
    getCrawlDelay(ua?: string): number | undefined;
    /**
     * Get the sitemaps listed in robots.txt.
     * @returns An array of sitemap URLs.
     */
    getSitemaps(): string[];
}
31
/**
 * Fetches and parses the robots.txt file for a given origin URL.
 *
 * Sends an HTTP(S) GET request to `{origin}/robots.txt` and parses the
 * response using `robots-parser`. Returns `null` if the server returns
 * a non-200 status code, if the request fails, or if it times out
 * (the implementation aborts after 10 seconds).
 * @param origin - The origin URL (e.g., `https://example.com`).
 * @param userAgent - Optional User-Agent string to send with the request.
 * @returns A parsed RobotsResult instance, or `null` if robots.txt is unavailable.
 */
export declare function fetchRobotsTxt(origin: string, userAgent?: string): Promise<RobotsResult | null>;
export {};
@@ -0,0 +1,44 @@
1
+ import { createRequire } from 'node:module';
2
+ import redirects from 'follow-redirects';
3
+ const require = createRequire(import.meta.url);
4
+ const robotsParser = require('robots-parser');
5
/**
 * Fetches and parses the robots.txt file for a given origin URL.
 *
 * Issues an HTTP(S) GET to `{origin}/robots.txt` (following redirects via
 * `follow-redirects`) and hands the body to `robots-parser`. Resolves with
 * `null` — never rejects — when the server answers with a non-200 status,
 * the request errors, or the 10-second timeout fires.
 * @param origin - The origin URL (e.g., `https://example.com`).
 * @param userAgent - Optional User-Agent string to send with the request.
 * @returns A parsed RobotsResult instance, or `null` if robots.txt is unavailable.
 */
export async function fetchRobotsTxt(origin, userAgent) {
    const robotsUrl = `${origin}/robots.txt`;
    return new Promise((resolve) => {
        // Pick the transport matching the URL scheme.
        const client = robotsUrl.startsWith('https') ? redirects.https : redirects.http;
        const requestOptions = {
            headers: userAgent ? { 'User-Agent': userAgent } : {},
            timeout: 10_000,
        };
        const req = client.get(robotsUrl, requestOptions, (res) => {
            if (res.statusCode !== 200) {
                // Drain the response so the socket can be reused, then give up.
                res.resume();
                resolve(null);
                return;
            }
            const buffers = [];
            res.on('data', (buf) => {
                buffers.push(buf);
            });
            res.on('end', () => {
                const body = Buffer.concat(buffers).toString('utf8');
                resolve(robotsParser(robotsUrl, body));
            });
            res.on('error', () => resolve(null));
        });
        req.on('error', () => resolve(null));
        req.on('timeout', () => {
            req.destroy();
            resolve(null);
        });
    });
}
@@ -0,0 +1,12 @@
1
+ import type { ExURL } from '@d-zero/shared/parse-url';
2
/**
 * Find the scope URL with the deepest matching path for a given URL.
 *
 * Among all scope URLs sharing the same hostname, returns the one whose
 * path segments are a prefix of the target URL's path segments and which
 * has the greatest depth (on equal depth, the earlier entry in `scopes`
 * wins). Returns `null` if no scope URL matches.
 * @param url - The parsed URL to match against scope URLs.
 * @param scopes - The list of scope URLs to search.
 * @returns The best-matching scope URL, or `null` if none match.
 */
export declare function findBestMatchingScope(url: ExURL, scopes: readonly ExURL[]): ExURL | null;
@@ -0,0 +1,46 @@
1
/**
 * Find the scope URL with the deepest matching path for a given URL.
 *
 * Among all scope URLs sharing the same hostname, returns the one whose
 * path segments are a prefix of the target URL's path segments and which
 * has the greatest depth (on equal depth, the earlier entry wins).
 * Returns `null` if no scope URL matches.
 * @param url - The parsed URL to match against scope URLs.
 * @param scopes - The list of scope URLs to search.
 * @returns The best-matching scope URL, or `null` if none match.
 */
export function findBestMatchingScope(url, scopes) {
    let best = null;
    let bestDepth = -1;
    for (const candidate of scopes) {
        if (candidate.hostname !== url.hostname) {
            continue;
        }
        if (hasPathPrefix(url.paths, candidate.paths) && candidate.depth > bestDepth) {
            best = candidate;
            bestDepth = candidate.depth;
        }
    }
    return best;
}
/**
 * Check whether a target path is equal to or is a descendant of a base path.
 *
 * The target matches when every segment of the base path appears at the
 * same position at the beginning of the target path.
 * @param targetPaths - The path segments of the URL being checked.
 * @param basePaths - The path segments of the scope URL to match against.
 * @returns `true` if the target path starts with or equals the base path.
 */
function hasPathPrefix(targetPaths, basePaths) {
    if (basePaths.length > targetPaths.length) {
        return false;
    }
    return basePaths.every((segment, index) => targetPaths[index] === segment);
}
@@ -0,0 +1,13 @@
1
+ import type { PaginationPattern } from './types.js';
2
/**
 * Generates predicted URLs by extrapolating the detected pagination pattern.
 *
 * Starting from `currentUrl`, applies the pattern's step `count` times to produce
 * future page URLs (e.g. if step=1 and currentNumber=2, generates page 3, 4, ...).
 * These URLs are pushed into the crawl queue and discarded later if they 404.
 * Returns an empty array when `count` is zero or negative, or when
 * `currentUrl` cannot be decomposed.
 * @param pattern - The detected pagination pattern from `detectPaginationPattern()`
 * @param currentUrl - The URL to extrapolate from (protocol-agnostic, without hash/auth)
 * @param count - Number of predicted URLs to generate (typically equals concurrency)
 * @returns Array of predicted URL strings
 */
export declare function generatePredictedUrls(pattern: PaginationPattern, currentUrl: string, count: number): string[];
@@ -0,0 +1,27 @@
1
+ import { decomposeUrl } from './decompose-url.js';
2
+ import { reconstructUrl } from './reconstruct-url.js';
3
/**
 * Generates predicted URLs by extrapolating the detected pagination pattern.
 *
 * Starting from `currentUrl`, applies the pattern's step `count` times to produce
 * future page URLs (e.g. if step=1 and currentNumber=2, generates page 3, 4, ...).
 * These URLs are pushed into the crawl queue and discarded later if they 404.
 * @param pattern - The detected pagination pattern from `detectPaginationPattern()`
 * @param currentUrl - The URL to extrapolate from (protocol-agnostic, without hash/auth)
 * @param count - Number of predicted URLs to generate (typically equals concurrency)
 * @returns Array of predicted URL strings
 */
export function generatePredictedUrls(pattern, currentUrl, count) {
    if (count <= 0) {
        return [];
    }
    const parts = decomposeUrl(currentUrl);
    if (!parts) {
        // URL shape not recognized — nothing to extrapolate from.
        return [];
    }
    // Build count successor URLs: current + step, current + 2*step, ...
    return Array.from({ length: count }, (_, offset) => {
        const pageNumber = pattern.currentNumber + pattern.step * (offset + 1);
        return reconstructUrl(parts, pattern.tokenIndex, String(pageNumber));
    });
}
@@ -0,0 +1,16 @@
1
+ import type LinkList from './link-list.js';
2
+ import type { CrawlerOptions } from './types.js';
3
+ import type { Link } from '../utils/index.js';
4
+ import type { ExURL } from '@d-zero/shared/parse-url';
5
/**
 * Handle a URL that was ignored or skipped during scraping.
 *
 * Marks the URL as done in the link list without any page data,
 * effectively recording that it was encountered but not scraped.
 * A debug log entry is emitted only when the URL was actually in the queue.
 * @param url - The URL that was skipped.
 * @param linkList - The link list managing the crawl queue.
 * @param scope - Map of hostnames to their scope URLs.
 * @param options - Crawler configuration options.
 * @returns The constructed {@link Link} object, or `null` if the URL was not in the queue.
 */
export declare function handleIgnoreAndSkip(url: ExURL, linkList: LinkList, scope: ReadonlyMap<string, readonly ExURL[]>, options: CrawlerOptions): Link | null;
@@ -0,0 +1,19 @@
1
+ import { crawlerLog } from '../debug.js';
2
+ /**
3
+ * Handle a URL that was ignored or skipped during scraping.
4
+ *
5
+ * Marks the URL as done in the link list without any page data,
6
+ * effectively recording that it was encountered but not scraped.
7
+ * @param url - The URL that was skipped.
8
+ * @param linkList - The link list managing the crawl queue.
9
+ * @param scope - Map of hostnames to their scope URLs.
10
+ * @param options - Crawler configuration options.
11
+ * @returns The constructed {@link Link} object, or `null` if the URL was not in the queue.
12
+ */
13
+ export function handleIgnoreAndSkip(url, linkList, scope, options) {
14
+ const updated = linkList.done(url, scope, {}, options);
15
+ if (updated) {
16
+ crawlerLog('Skipped URL: %s', url.href);
17
+ }
18
+ return updated;
19
+ }