@nitpicker/crawler 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. package/CHANGELOG.md +8 -0
  2. package/LICENSE +191 -0
  3. package/README.md +13 -0
  4. package/lib/archive/archive-accessor.d.ts +107 -0
  5. package/lib/archive/archive-accessor.js +264 -0
  6. package/lib/archive/archive.d.ts +174 -0
  7. package/lib/archive/archive.js +331 -0
  8. package/lib/archive/database.d.ts +207 -0
  9. package/lib/archive/database.js +972 -0
  10. package/lib/archive/debug.d.ts +8 -0
  11. package/lib/archive/debug.js +9 -0
  12. package/lib/archive/filesystem/append-text.d.ts +9 -0
  13. package/lib/archive/filesystem/append-text.js +14 -0
  14. package/lib/archive/filesystem/copy-dir-sync.d.ts +6 -0
  15. package/lib/archive/filesystem/copy-dir-sync.js +9 -0
  16. package/lib/archive/filesystem/copy-dir.d.ts +7 -0
  17. package/lib/archive/filesystem/copy-dir.js +13 -0
  18. package/lib/archive/filesystem/exists.d.ts +6 -0
  19. package/lib/archive/filesystem/exists.js +9 -0
  20. package/lib/archive/filesystem/get-file-list.d.ts +8 -0
  21. package/lib/archive/filesystem/get-file-list.js +12 -0
  22. package/lib/archive/filesystem/index.d.ts +17 -0
  23. package/lib/archive/filesystem/index.js +17 -0
  24. package/lib/archive/filesystem/is-dir.d.ts +6 -0
  25. package/lib/archive/filesystem/is-dir.js +10 -0
  26. package/lib/archive/filesystem/mkdir.d.ts +8 -0
  27. package/lib/archive/filesystem/mkdir.js +15 -0
  28. package/lib/archive/filesystem/output-json.d.ts +9 -0
  29. package/lib/archive/filesystem/output-json.js +14 -0
  30. package/lib/archive/filesystem/output-text.d.ts +11 -0
  31. package/lib/archive/filesystem/output-text.js +32 -0
  32. package/lib/archive/filesystem/read-json.d.ts +7 -0
  33. package/lib/archive/filesystem/read-json.js +11 -0
  34. package/lib/archive/filesystem/read-text.d.ts +6 -0
  35. package/lib/archive/filesystem/read-text.js +10 -0
  36. package/lib/archive/filesystem/readline.d.ts +11 -0
  37. package/lib/archive/filesystem/readline.js +26 -0
  38. package/lib/archive/filesystem/remove.d.ts +5 -0
  39. package/lib/archive/filesystem/remove.js +10 -0
  40. package/lib/archive/filesystem/rename.d.ts +11 -0
  41. package/lib/archive/filesystem/rename.js +18 -0
  42. package/lib/archive/filesystem/tar.d.ts +11 -0
  43. package/lib/archive/filesystem/tar.js +22 -0
  44. package/lib/archive/filesystem/untar.d.ts +20 -0
  45. package/lib/archive/filesystem/untar.js +24 -0
  46. package/lib/archive/filesystem/utils.d.ts +109 -0
  47. package/lib/archive/filesystem/utils.js +185 -0
  48. package/lib/archive/filesystem/zip.d.ts +29 -0
  49. package/lib/archive/filesystem/zip.js +53 -0
  50. package/lib/archive/index.d.ts +6 -0
  51. package/lib/archive/index.js +11 -0
  52. package/lib/archive/page.d.ts +263 -0
  53. package/lib/archive/page.js +316 -0
  54. package/lib/archive/resource.d.ts +46 -0
  55. package/lib/archive/resource.js +62 -0
  56. package/lib/archive/safe-path.d.ts +9 -0
  57. package/lib/archive/safe-path.js +17 -0
  58. package/lib/archive/types.d.ts +210 -0
  59. package/lib/archive/types.js +1 -0
  60. package/lib/crawler/clear-destination-cache.d.ts +5 -0
  61. package/lib/crawler/clear-destination-cache.js +8 -0
  62. package/lib/crawler/crawler.d.ts +73 -0
  63. package/lib/crawler/crawler.js +748 -0
  64. package/lib/crawler/decompose-url.d.ts +25 -0
  65. package/lib/crawler/decompose-url.js +71 -0
  66. package/lib/crawler/destination-cache.d.ts +7 -0
  67. package/lib/crawler/destination-cache.js +6 -0
  68. package/lib/crawler/detect-pagination-pattern.d.ts +16 -0
  69. package/lib/crawler/detect-pagination-pattern.js +61 -0
  70. package/lib/crawler/fetch-destination.d.ts +38 -0
  71. package/lib/crawler/fetch-destination.js +208 -0
  72. package/lib/crawler/fetch-robots-txt.d.ts +42 -0
  73. package/lib/crawler/fetch-robots-txt.js +44 -0
  74. package/lib/crawler/find-best-matching-scope.d.ts +12 -0
  75. package/lib/crawler/find-best-matching-scope.js +46 -0
  76. package/lib/crawler/generate-predicted-urls.d.ts +13 -0
  77. package/lib/crawler/generate-predicted-urls.js +27 -0
  78. package/lib/crawler/handle-ignore-and-skip.d.ts +16 -0
  79. package/lib/crawler/handle-ignore-and-skip.js +19 -0
  80. package/lib/crawler/handle-resource-response.d.ts +13 -0
  81. package/lib/crawler/handle-resource-response.js +16 -0
  82. package/lib/crawler/handle-scrape-end.d.ts +24 -0
  83. package/lib/crawler/handle-scrape-end.js +82 -0
  84. package/lib/crawler/handle-scrape-error.d.ts +37 -0
  85. package/lib/crawler/handle-scrape-error.js +38 -0
  86. package/lib/crawler/index.d.ts +2 -0
  87. package/lib/crawler/index.js +2 -0
  88. package/lib/crawler/inject-scope-auth.d.ts +11 -0
  89. package/lib/crawler/inject-scope-auth.js +21 -0
  90. package/lib/crawler/is-external-url.d.ts +11 -0
  91. package/lib/crawler/is-external-url.js +12 -0
  92. package/lib/crawler/is-in-any-lower-layer.d.ts +13 -0
  93. package/lib/crawler/is-in-any-lower-layer.js +15 -0
  94. package/lib/crawler/link-list.d.ts +112 -0
  95. package/lib/crawler/link-list.js +248 -0
  96. package/lib/crawler/link-to-page-data.d.ts +14 -0
  97. package/lib/crawler/link-to-page-data.js +32 -0
  98. package/lib/crawler/net-timeout-error.d.ts +9 -0
  99. package/lib/crawler/net-timeout-error.js +11 -0
  100. package/lib/crawler/network.d.ts +30 -0
  101. package/lib/crawler/network.js +226 -0
  102. package/lib/crawler/protocol-agnostic-key.d.ts +9 -0
  103. package/lib/crawler/protocol-agnostic-key.js +11 -0
  104. package/lib/crawler/reconstruct-url.d.ts +10 -0
  105. package/lib/crawler/reconstruct-url.js +28 -0
  106. package/lib/crawler/result-handler.d.ts +118 -0
  107. package/lib/crawler/result-handler.js +153 -0
  108. package/lib/crawler/robots-checker.d.ts +26 -0
  109. package/lib/crawler/robots-checker.js +62 -0
  110. package/lib/crawler/should-discard-predicted.d.ts +14 -0
  111. package/lib/crawler/should-discard-predicted.js +31 -0
  112. package/lib/crawler/should-skip-url.d.ts +23 -0
  113. package/lib/crawler/should-skip-url.js +15 -0
  114. package/lib/crawler/speculative-pagination.d.ts +52 -0
  115. package/lib/crawler/speculative-pagination.js +215 -0
  116. package/lib/crawler/types.d.ts +119 -0
  117. package/lib/crawler/types.js +1 -0
  118. package/lib/crawler/url-filter.d.ts +56 -0
  119. package/lib/crawler/url-filter.js +110 -0
  120. package/lib/crawler-orchestrator.d.ts +142 -0
  121. package/lib/crawler-orchestrator.js +309 -0
  122. package/lib/debug.d.ts +8 -0
  123. package/lib/debug.js +9 -0
  124. package/lib/index.d.ts +16 -0
  125. package/lib/index.js +18 -0
  126. package/lib/qzilla.d.ts +136 -0
  127. package/lib/qzilla.js +292 -0
  128. package/lib/types.d.ts +27 -0
  129. package/lib/types.js +1 -0
  130. package/lib/utils/array/each-splitted.d.ts +10 -0
  131. package/lib/utils/array/each-splitted.js +14 -0
  132. package/lib/utils/array/index.d.ts +1 -0
  133. package/lib/utils/array/index.js +1 -0
  134. package/lib/utils/async/index.d.ts +1 -0
  135. package/lib/utils/async/index.js +1 -0
  136. package/lib/utils/debug.d.ts +5 -0
  137. package/lib/utils/debug.js +5 -0
  138. package/lib/utils/error/dom-evaluation-error.d.ts +7 -0
  139. package/lib/utils/error/dom-evaluation-error.js +7 -0
  140. package/lib/utils/error/error-emitter.d.ts +18 -0
  141. package/lib/utils/error/error-emitter.js +29 -0
  142. package/lib/utils/error/index.d.ts +3 -0
  143. package/lib/utils/error/index.js +2 -0
  144. package/lib/utils/event-emitter/index.d.ts +6 -0
  145. package/lib/utils/event-emitter/index.js +6 -0
  146. package/lib/utils/index.d.ts +5 -0
  147. package/lib/utils/index.js +5 -0
  148. package/lib/utils/network/index.d.ts +1 -0
  149. package/lib/utils/network/index.js +1 -0
  150. package/lib/utils/object/clean-object.d.ts +8 -0
  151. package/lib/utils/object/clean-object.js +13 -0
  152. package/lib/utils/object/index.d.ts +1 -0
  153. package/lib/utils/object/index.js +1 -0
  154. package/lib/utils/path/index.d.ts +1 -0
  155. package/lib/utils/path/index.js +1 -0
  156. package/lib/utils/path/safe-filepath.d.ts +7 -0
  157. package/lib/utils/path/safe-filepath.js +12 -0
  158. package/lib/utils/regexp/index.d.ts +1 -0
  159. package/lib/utils/regexp/index.js +1 -0
  160. package/lib/utils/retryable/index.d.ts +2 -0
  161. package/lib/utils/retryable/index.js +1 -0
  162. package/lib/utils/sort/index.d.ts +14 -0
  163. package/lib/utils/sort/index.js +61 -0
  164. package/lib/utils/sort/remove-matches.d.ts +9 -0
  165. package/lib/utils/sort/remove-matches.js +23 -0
  166. package/lib/utils/types/index.d.ts +1 -0
  167. package/lib/utils/types/index.js +1 -0
  168. package/lib/utils/types/types.d.ts +46 -0
  169. package/lib/utils/types/types.js +1 -0
  170. package/lib/utils/url/index.d.ts +5 -0
  171. package/lib/utils/url/index.js +5 -0
  172. package/lib/utils/url/is-lower-layer.d.ts +15 -0
  173. package/lib/utils/url/is-lower-layer.js +55 -0
  174. package/lib/utils/url/parse-url.d.ts +11 -0
  175. package/lib/utils/url/parse-url.js +20 -0
  176. package/lib/utils/url/path-match.d.ts +11 -0
  177. package/lib/utils/url/path-match.js +18 -0
  178. package/lib/utils/url/sort-url.d.ts +10 -0
  179. package/lib/utils/url/sort-url.js +24 -0
  180. package/lib/utils/url/url-partial-match.d.ts +11 -0
  181. package/lib/utils/url/url-partial-match.js +32 -0
  182. package/package.json +49 -0
  183. package/src/archive/__mock__/.gitignore +3 -0
  184. package/src/archive/__mock__/mock.sqlite +0 -0
  185. package/src/archive/archive-accessor.ts +337 -0
  186. package/src/archive/archive.ts +408 -0
  187. package/src/archive/database.spec.ts +469 -0
  188. package/src/archive/database.ts +1059 -0
  189. package/src/archive/debug.ts +10 -0
  190. package/src/archive/filesystem/append-text.spec.ts +26 -0
  191. package/src/archive/filesystem/append-text.ts +16 -0
  192. package/src/archive/filesystem/copy-dir-sync.spec.ts +27 -0
  193. package/src/archive/filesystem/copy-dir-sync.ts +10 -0
  194. package/src/archive/filesystem/copy-dir.spec.ts +33 -0
  195. package/src/archive/filesystem/copy-dir.ts +14 -0
  196. package/src/archive/filesystem/exists.spec.ts +33 -0
  197. package/src/archive/filesystem/exists.ts +10 -0
  198. package/src/archive/filesystem/get-file-list.spec.ts +37 -0
  199. package/src/archive/filesystem/get-file-list.ts +13 -0
  200. package/src/archive/filesystem/index.ts +17 -0
  201. package/src/archive/filesystem/is-dir.spec.ts +29 -0
  202. package/src/archive/filesystem/is-dir.ts +11 -0
  203. package/src/archive/filesystem/mkdir.spec.ts +37 -0
  204. package/src/archive/filesystem/mkdir.ts +16 -0
  205. package/src/archive/filesystem/output-json.spec.ts +34 -0
  206. package/src/archive/filesystem/output-json.ts +16 -0
  207. package/src/archive/filesystem/output-text.spec.ts +31 -0
  208. package/src/archive/filesystem/output-text.ts +35 -0
  209. package/src/archive/filesystem/read-json.spec.ts +26 -0
  210. package/src/archive/filesystem/read-json.ts +12 -0
  211. package/src/archive/filesystem/read-text.spec.ts +25 -0
  212. package/src/archive/filesystem/read-text.ts +11 -0
  213. package/src/archive/filesystem/readline.spec.ts +29 -0
  214. package/src/archive/filesystem/readline.ts +30 -0
  215. package/src/archive/filesystem/remove.spec.ts +34 -0
  216. package/src/archive/filesystem/remove.ts +11 -0
  217. package/src/archive/filesystem/rename.spec.ts +46 -0
  218. package/src/archive/filesystem/rename.ts +21 -0
  219. package/src/archive/filesystem/tar.spec.ts +33 -0
  220. package/src/archive/filesystem/tar.ts +27 -0
  221. package/src/archive/filesystem/untar.spec.ts +34 -0
  222. package/src/archive/filesystem/untar.ts +36 -0
  223. package/src/archive/index.ts +13 -0
  224. package/src/archive/page.spec.ts +368 -0
  225. package/src/archive/page.ts +420 -0
  226. package/src/archive/resource.spec.ts +101 -0
  227. package/src/archive/resource.ts +73 -0
  228. package/src/archive/safe-path.spec.ts +44 -0
  229. package/src/archive/safe-path.ts +18 -0
  230. package/src/archive/types.ts +227 -0
  231. package/src/crawler/clear-destination-cache.spec.ts +20 -0
  232. package/src/crawler/clear-destination-cache.ts +9 -0
  233. package/src/crawler/crawler.ts +873 -0
  234. package/src/crawler/decompose-url.spec.ts +48 -0
  235. package/src/crawler/decompose-url.ts +90 -0
  236. package/src/crawler/destination-cache.spec.ts +23 -0
  237. package/src/crawler/destination-cache.ts +8 -0
  238. package/src/crawler/detect-pagination-pattern.spec.ts +169 -0
  239. package/src/crawler/detect-pagination-pattern.ts +66 -0
  240. package/src/crawler/fetch-destination.ts +257 -0
  241. package/src/crawler/fetch-robots-txt.spec.ts +83 -0
  242. package/src/crawler/fetch-robots-txt.ts +91 -0
  243. package/src/crawler/find-best-matching-scope.spec.ts +39 -0
  244. package/src/crawler/find-best-matching-scope.ts +57 -0
  245. package/src/crawler/generate-predicted-urls.spec.ts +42 -0
  246. package/src/crawler/generate-predicted-urls.ts +34 -0
  247. package/src/crawler/handle-ignore-and-skip.spec.ts +66 -0
  248. package/src/crawler/handle-ignore-and-skip.ts +30 -0
  249. package/src/crawler/handle-resource-response.spec.ts +45 -0
  250. package/src/crawler/handle-resource-response.ts +21 -0
  251. package/src/crawler/handle-scrape-end.spec.ts +109 -0
  252. package/src/crawler/handle-scrape-end.ts +115 -0
  253. package/src/crawler/handle-scrape-error.spec.ts +105 -0
  254. package/src/crawler/handle-scrape-error.ts +58 -0
  255. package/src/crawler/index.ts +2 -0
  256. package/src/crawler/inject-scope-auth.spec.ts +36 -0
  257. package/src/crawler/inject-scope-auth.ts +27 -0
  258. package/src/crawler/is-external-url.spec.ts +31 -0
  259. package/src/crawler/is-external-url.ts +17 -0
  260. package/src/crawler/is-in-any-lower-layer.spec.ts +31 -0
  261. package/src/crawler/is-in-any-lower-layer.ts +22 -0
  262. package/src/crawler/link-list.spec.ts +355 -0
  263. package/src/crawler/link-list.ts +275 -0
  264. package/src/crawler/link-to-page-data.spec.ts +133 -0
  265. package/src/crawler/link-to-page-data.ts +34 -0
  266. package/src/crawler/net-timeout-error.spec.ts +25 -0
  267. package/src/crawler/net-timeout-error.ts +11 -0
  268. package/src/crawler/protocol-agnostic-key.spec.ts +40 -0
  269. package/src/crawler/protocol-agnostic-key.ts +11 -0
  270. package/src/crawler/reconstruct-url.spec.ts +37 -0
  271. package/src/crawler/reconstruct-url.ts +37 -0
  272. package/src/crawler/robots-checker.spec.ts +104 -0
  273. package/src/crawler/robots-checker.ts +73 -0
  274. package/src/crawler/should-discard-predicted.spec.ts +125 -0
  275. package/src/crawler/should-discard-predicted.ts +33 -0
  276. package/src/crawler/should-skip-url.spec.ts +77 -0
  277. package/src/crawler/should-skip-url.ts +37 -0
  278. package/src/crawler/types.ts +146 -0
  279. package/src/crawler-orchestrator.ts +401 -0
  280. package/src/debug.ts +10 -0
  281. package/src/index.ts +25 -0
  282. package/src/types.ts +30 -0
  283. package/src/utils/array/each-splitted.spec.ts +38 -0
  284. package/src/utils/array/each-splitted.ts +19 -0
  285. package/src/utils/array/index.ts +1 -0
  286. package/src/utils/debug.ts +6 -0
  287. package/src/utils/error/dom-evaluation-error.spec.ts +20 -0
  288. package/src/utils/error/dom-evaluation-error.ts +6 -0
  289. package/src/utils/error/error-emitter.spec.ts +78 -0
  290. package/src/utils/error/error-emitter.ts +44 -0
  291. package/src/utils/error/index.ts +3 -0
  292. package/src/utils/index.ts +5 -0
  293. package/src/utils/object/clean-object.spec.ts +24 -0
  294. package/src/utils/object/clean-object.ts +13 -0
  295. package/src/utils/object/index.ts +1 -0
  296. package/src/utils/types/index.ts +1 -0
  297. package/src/utils/types/types.ts +65 -0
  298. package/tsconfig.json +11 -0
  299. package/tsconfig.tsbuildinfo +1 -0
@@ -0,0 +1,118 @@
1
import type LinkList from './link-list.js';
import type { Link, PageData, Resource } from '../utils/index.js';
import type { ExURL, ParseURLOptions } from '@d-zero/shared/parse-url';
/**
 * Configuration options that control crawler behavior.
 *
 * Used by the result handler functions to determine how to process
 * scrape results, which URLs to follow, and how to handle external links.
 * @see {@link ./crawler.ts | Crawler} for the main consumer of this type
 * @see {@link ../crawler-orchestrator.ts | CrawlerOrchestrator} for factory methods that build these options
 */
export type CrawlerOptions = {
    /** Delay in milliseconds between page requests. */
    interval: number;
    /** Maximum number of concurrent scraping processes. 0 uses the default. */
    parallels: number;
    /** Whether to recursively follow discovered links within the scope. */
    recursive: boolean;
    /** Whether the crawl was started from a pre-defined URL list. */
    fromList: boolean;
    /** Whether to capture image resources during scraping. */
    isGettingImages: boolean;
    /** Path to the Chromium/Chrome executable, or `null` for the bundled version. */
    executablePath: string | null;
    /** Whether to fetch and scrape external (out-of-scope) pages. */
    fetchExternal: boolean;
    /** List of scope URL strings that define the crawl boundary. */
    scope: string[];
    /** Glob patterns for URLs to exclude from crawling. */
    excludes: string[];
    /** Keywords that trigger page exclusion when found in content. */
    excludeKeywords: string[];
    /** URL prefixes to exclude from crawling (merged defaults + user additions). */
    excludeUrls: readonly string[];
    /** Maximum directory depth for crawling avoidance heuristics. */
    depthOnAvoid: number;
    /** Maximum number of retry attempts per URL on scrape failure. */
    retry: number;
    /** Whether to enable verbose logging. */
    verbose: boolean;
} & Required<Pick<ParseURLOptions, 'disableQueries'>>;
/**
 * Process the result of a successful page scrape.
 *
 * Extracts anchors from the page (unless in title-only mode), enqueues
 * newly discovered URLs via the `addUrl` callback, and marks the URL
 * as done in the link list.
 * @param result - The scraped page data.
 * @param linkList - The link list managing the crawl queue.
 * @param scope - Map of hostnames to their scope URLs.
 * @param options - Crawler configuration options.
 * @param addUrl - Callback to enqueue a newly discovered URL. Accepts optional
 * `{ titleOnly: true }` to request metadata-only scraping.
 * @returns An object containing the constructed link and whether the page is external.
 */
export declare function handleScrapeEnd(result: PageData, linkList: LinkList, scope: ReadonlyMap<string, readonly ExURL[]>, options: CrawlerOptions, addUrl: (url: ExURL, opts?: {
    titleOnly?: true;
}) => void): {
    link: Link | null;
    isExternal: boolean;
};
/**
 * Handle a URL that was ignored or skipped during scraping.
 *
 * Marks the URL as done in the link list without any page data,
 * effectively recording that it was encountered but not scraped.
 * @param url - The URL that was skipped.
 * @param linkList - The link list managing the crawl queue.
 * @param scope - Map of hostnames to their scope URLs.
 * @param options - Crawler configuration options.
 * @returns The constructed {@link Link} object, or `null` if the URL was not in the queue.
 */
export declare function handleIgnoreAndSkip(url: ExURL, linkList: LinkList, scope: ReadonlyMap<string, readonly ExURL[]>, options: CrawlerOptions): Link | null;
/**
 * Track a network resource response and determine if it is newly discovered.
 *
 * Checks whether the resource URL has already been seen. If it is new,
 * adds it to the known resources set.
 * @param resource - The captured network resource data.
 * @param resources - The set of already-known resource URLs (without hash).
 * @returns An object with `isNew` indicating whether this resource was seen for the first time.
 */
export declare function handleResourceResponse(resource: Resource, resources: Set<string>): {
    isNew: boolean;
};
/**
 * Handle an error that occurred during page scraping.
 *
 * Marks the URL as done and creates a fallback {@link PageData} from the
 * link, regardless of whether the error caused a shutdown. This ensures
 * that errored URLs are recorded in the DB (`status = -1, scraped = 1`)
 * and not re-queued on resume.
 * @param payload - The error payload from the scraper.
 * @param payload.url - The URL being scraped when the error occurred, or `null`.
 * @param payload.error - The error details including name, message, and optional stack.
 * @param payload.error.name
 * @param payload.error.message
 * @param payload.error.stack
 * @param payload.shutdown - Whether the error caused the scraper process to shut down.
 * @param payload.pid - The process ID of the scraper, or `undefined`.
 * @param linkList - The link list managing the crawl queue.
 * @param scope - Map of hostnames to their scope URLs.
 * @param options - Crawler configuration options.
 * @returns An object with the link and an optional fallback PageData result.
 */
export declare function handleScrapeError(payload: {
    url: ExURL | null;
    error: {
        name: string;
        message: string;
        stack?: string;
    };
    shutdown: boolean;
    pid: number | undefined;
}, linkList: LinkList, scope: ReadonlyMap<string, readonly ExURL[]>, options: CrawlerOptions): {
    link: Link | null;
    result?: PageData;
};
@@ -0,0 +1,153 @@
1
+ import { crawlerErrorLog, crawlerLog } from '../debug.js';
2
+ import { linkToPageData } from './link-to-page-data.js';
3
+ import { injectScopeAuth } from './inject-scope-auth.js';
4
+ import { isExternalUrl } from './is-external-url.js';
5
+ import { isInAnyLowerLayer } from './is-in-any-lower-layer.js';
6
/**
 * Process the result of a successful page scrape.
 *
 * Follows the page's anchors (unless the page was queued title-only),
 * marks the URL as completed in the link list, and logs a summary of
 * the scrape outcome.
 * @param result - The scraped page data.
 * @param linkList - The link list managing the crawl queue.
 * @param scope - Map of hostnames to their scope URLs.
 * @param options - Crawler configuration options.
 * @param addUrl - Callback to enqueue a newly discovered URL. Accepts optional
 * `{ titleOnly: true }` to request metadata-only scraping.
 * @returns An object containing the constructed link and whether the page is external.
 */
export function handleScrapeEnd(result, linkList, scope, options, addUrl) {
    // Title-only pages were enqueued for metadata capture only; their
    // anchors must not be followed.
    if (!linkList.isTitleOnly(result.url.withoutHash)) {
        processAnchors(result.anchorList, scope, options, addUrl);
    }
    const doneLink = linkList.done(result.url, scope, { page: result }, options);
    crawlerLog('Scrape end URL: %s', result.url.href);
    crawlerLog('Scrape end Status: %d', result.status);
    crawlerLog('Scrape end Type: %s', result.contentType);
    if (!result.isExternal) {
        crawlerLog('Scrape end Anchors: %d URLs', result.anchorList.length);
    }
    return { link: doneLink, isExternal: result.isExternal };
}
36
/**
 * Handle a URL that was ignored or skipped during scraping.
 *
 * Records the URL as done in the link list without attaching any page
 * data, so it is remembered as encountered-but-not-scraped.
 * @param url - The URL that was skipped.
 * @param linkList - The link list managing the crawl queue.
 * @param scope - Map of hostnames to their scope URLs.
 * @param options - Crawler configuration options.
 * @returns The constructed {@link Link} object, or `null` if the URL was not in the queue.
 */
export function handleIgnoreAndSkip(url, linkList, scope, options) {
    const link = linkList.done(url, scope, {}, options);
    // Only log when the URL was actually tracked in the queue.
    if (!link) {
        return link;
    }
    crawlerLog('Skipped URL: %s', url.href);
    return link;
}
54
/**
 * Track a network resource response and determine if it is newly discovered.
 *
 * Uses the hash-stripped URL as the identity key: the first sighting is
 * recorded into the set, later sightings are reported as already known.
 * @param resource - The captured network resource data.
 * @param resources - The set of already-known resource URLs (without hash).
 * @returns An object with `isNew` indicating whether this resource was seen for the first time.
 */
export function handleResourceResponse(resource, resources) {
    const key = resource.url.withoutHash;
    if (resources.has(key)) {
        return { isNew: false };
    }
    resources.add(key);
    return { isNew: true };
}
70
/**
 * Handle an error that occurred during page scraping.
 *
 * Marks the URL as done and creates a fallback {@link PageData} from the
 * link, regardless of whether the error caused a shutdown. This ensures
 * that errored URLs are recorded in the DB (`status = -1, scraped = 1`)
 * and not re-queued on resume.
 * @param payload - The error payload from the scraper.
 * @param payload.url - The URL being scraped when the error occurred, or `null`.
 * @param payload.error - The error details including name, message, and optional stack.
 * @param payload.shutdown - Whether the error caused the scraper process to shut down.
 * @param payload.pid - The process ID of the scraper, or `undefined`.
 * @param linkList - The link list managing the crawl queue.
 * @param scope - Map of hostnames to their scope URLs.
 * @param options - Crawler configuration options.
 * @returns An object with the link and an optional fallback PageData result.
 */
export function handleScrapeError(payload, linkList, scope, options) {
    const { url, error, shutdown, pid } = payload;
    let link = null;
    let result;
    // The URL may be null when the scraper died before a target was assigned.
    const doneLink = url ? linkList.done(url, scope, { error }, options) : null;
    if (doneLink) {
        link = doneLink;
        // Synthesize a PageData so the error is persisted and not retried on resume.
        result = linkToPageData(doneLink);
    }
    crawlerErrorLog('From %d(%s)', pid, url?.href ?? 'UNKNOWN_URL');
    crawlerErrorLog('Then shutdown?: %s', shutdown ? 'Yes' : 'No');
    crawlerErrorLog('%O', error);
    return { link, result };
}
106
/**
 * Process anchor elements extracted from a scraped page and enqueue new URLs.
 *
 * For each anchor:
 * 1. Determines if it is external (outside the crawl scope)
 * 2. Injects authentication credentials from matching scope URLs
 * 3. Reconstructs the `withoutHash` URL with injected auth
 * 4. In recursive mode: enqueues internal lower-layer URLs for full scraping,
 *    and external URLs for title-only scraping (if `fetchExternal` is enabled)
 * 5. In non-recursive mode: enqueues all URLs for title-only scraping
 * @param anchors - The list of anchor data extracted from the page.
 * @param scope - Map of hostnames to their scope URLs.
 * @param options - Crawler configuration options.
 * @param addUrl - Callback to enqueue a newly discovered URL. Accepts optional
 * `{ titleOnly: true }` to request metadata-only scraping.
 */
function processAnchors(anchors, scope, options, addUrl) {
    for (const anchor of anchors) {
        const isExternal = isExternalUrl(anchor.href, scope);
        // Mutates the anchor in place so downstream consumers see the flag.
        anchor.isExternal = isExternal;
        // Only internal URLs missing a full user:pass pair get scope
        // credentials injected; external hosts never receive them.
        if (!isExternal && (!anchor.href.username || !anchor.href.password)) {
            injectScopeAuth(anchor.href, scope);
            // Rebuild `withoutHash` so it carries the injected credentials.
            const auth = anchor.href.username && anchor.href.password
                ? `${anchor.href.username}:${anchor.href.password}@`
                : '';
            const host = anchor.href.hostname + (anchor.href.port ? `:${anchor.href.port}` : '');
            const newSearch = anchor.href.query ? `?${anchor.href.query}` : '';
            // NOTE(review): when there is no dirname but a query is present,
            // the rebuilt URL becomes `proto//host/?query` (slash inserted
            // before the query). Presumably this matches parse-url's canonical
            // `withoutHash` form — confirm against ExURL's own serialization.
            const body = anchor.href.dirname
                ? `${anchor.href.paths.join('/')}${newSearch}`
                : newSearch
                    ? `${newSearch}`
                    : '';
            const withoutHash = `${anchor.href.protocol}//${auth}${host}${body ? `/${body}` : ''}`;
            anchor.href.withoutHash = withoutHash;
        }
        if (options.recursive) {
            // Recursive mode: fully scrape in-scope lower-layer URLs…
            const scopes = scope.get(anchor.href.hostname);
            if (scopes && isInAnyLowerLayer(anchor.href, scopes, options)) {
                addUrl(anchor.href);
            }
            // …and optionally capture external pages title-only.
            else if (isExternal && options.fetchExternal) {
                addUrl(anchor.href, { titleOnly: true });
            }
            continue;
        }
        // Non-recursive mode: every discovered URL is metadata-only.
        addUrl(anchor.href, { titleOnly: true });
    }
}
@@ -0,0 +1,26 @@
1
import type { ExURL } from '@d-zero/shared/parse-url';
/**
 * Checks whether a URL is allowed by the site's robots.txt rules.
 *
 * Caches robots.txt per origin so each origin is fetched at most once.
 * When disabled (i.e., `ignoreRobots` mode), all URLs are allowed.
 */
export declare class RobotsChecker {
    #private;
    /**
     * Create a new RobotsChecker.
     * @param userAgent - User-Agent string for rule matching and fetching robots.txt.
     * @param enabled - Whether robots.txt checking is enabled. When `false`, {@link isAllowed} always returns `true`.
     */
    constructor(userAgent: string, enabled: boolean);
    /**
     * Check whether the given URL is allowed by the site's robots.txt.
     *
     * Fetches and caches robots.txt per origin on first access.
     * Returns `true` if robots.txt checking is disabled, if no robots.txt
     * exists, or if the URL is explicitly allowed.
     * Non-HTTP(S) URLs are always allowed.
     * @param url - The URL to check.
     * @returns `true` if the URL is allowed, `false` if blocked.
     */
    isAllowed(url: ExURL): Promise<boolean>;
}
@@ -0,0 +1,62 @@
1
+ import { crawlerLog } from '../debug.js';
2
+ import { fetchRobotsTxt } from './fetch-robots-txt.js';
3
/**
 * Derives the origin string from an ExURL (e.g., `https://example.com:8080`).
 * @param url - The extended URL.
 * @returns The origin string.
 */
function getOrigin(url) {
    const portSuffix = url.port ? `:${url.port}` : '';
    return `${url.protocol}//${url.hostname}${portSuffix}`;
}
11
/**
 * Checks whether a URL is allowed by the site's robots.txt rules.
 *
 * Caches robots.txt per origin so each origin is fetched at most once.
 * When disabled (i.e., `ignoreRobots` mode), all URLs are allowed.
 */
export class RobotsChecker {
    /**
     * Cache of robots.txt fetches per origin, keyed by origin string.
     * The in-flight promise is cached (rather than the resolved value) so
     * that concurrent `isAllowed` calls for the same origin share a single
     * fetch — the previous resolve-then-cache flow let parallel calls each
     * fire their own request before the first one populated the cache.
     * The promise resolves to `null` when no robots.txt exists or the
     * fetch failed.
     */
    #cache = new Map();
    /** When `false`, robots.txt checking is disabled and all URLs are allowed. */
    #enabled;
    /** User-Agent string used for robots.txt rule matching and HTTP requests. */
    #userAgent;
    /**
     * Create a new RobotsChecker.
     * @param userAgent - User-Agent string for rule matching and fetching robots.txt.
     * @param enabled - Whether robots.txt checking is enabled. When `false`, {@link isAllowed} always returns `true`.
     */
    constructor(userAgent, enabled) {
        this.#userAgent = userAgent;
        this.#enabled = enabled;
    }
    /**
     * Check whether the given URL is allowed by the site's robots.txt.
     *
     * Fetches and caches robots.txt per origin on first access.
     * Returns `true` if robots.txt checking is disabled, if no robots.txt
     * exists, or if the URL is explicitly allowed.
     * @param url - The URL to check.
     * @returns `true` if the URL is allowed, `false` if blocked.
     */
    async isAllowed(url) {
        if (!this.#enabled) {
            return true;
        }
        // robots.txt only governs HTTP(S) resources.
        if (!url.isHTTP) {
            return true;
        }
        const origin = getOrigin(url);
        let pending = this.#cache.get(origin);
        if (!pending) {
            crawlerLog('Fetching robots.txt for %s', origin);
            pending = fetchRobotsTxt(origin, this.#userAgent);
            this.#cache.set(origin, pending);
            // fetchRobotsTxt is expected to resolve to `null` on failure; if it
            // ever rejects, evict the entry so a later call can retry instead
            // of caching the rejection forever.
            pending.catch(() => this.#cache.delete(origin));
        }
        const robot = await pending;
        if (!robot) {
            return true;
        }
        // The parser may return `undefined` for URLs not covered by any rule;
        // anything other than an explicit `false` counts as allowed.
        const allowed = robot.isAllowed(url.href, this.#userAgent);
        return allowed !== false;
    }
}
@@ -0,0 +1,14 @@
1
+ import type { ScrapeResult } from '@d-zero/beholder';
2
/**
 * Determines whether a predicted URL's scrape result should be discarded.
 *
 * Predicted URLs are pre-emptively pushed into the crawl queue before
 * knowing if they exist. This function filters out invalid results:
 * - `error` type → discard (server unreachable, timeout, etc.)
 * - `skipped` type → discard (matched exclusion rule)
 * - `success` with missing page data → discard (nothing to archive)
 * - `success` with HTTP error status (4xx/5xx) → discard
 * - `success` with 2xx/3xx → keep
 * Unknown result types are discarded as well.
 * @param result - The scrape result for the predicted URL
 * @returns `true` if the result should be discarded (not saved to archive)
 */
export declare function shouldDiscardPredicted(result: ScrapeResult): boolean;
@@ -0,0 +1,31 @@
1
+ import { isError } from '@d-zero/beholder';
2
/**
 * Determines whether a predicted URL's scrape result should be discarded.
 *
 * Predicted URLs are pre-emptively pushed into the crawl queue before
 * knowing if they exist. This function filters out invalid results:
 * - `error` type → discard (server unreachable, timeout, etc.)
 * - `skipped` type → discard (matched exclusion rule)
 * - `success` with HTTP error status (4xx/5xx) → discard
 * - `success` with 2xx/3xx → keep
 * @param result - The scrape result for the predicted URL
 * @returns `true` if the result should be discarded (not saved to archive)
 */
export function shouldDiscardPredicted(result) {
    // Anything other than a successful scrape ('error', 'skipped', or an
    // unrecognized future type) is discarded outright.
    if (result.type !== 'success') {
        return true;
    }
    const page = result.pageData;
    // A success without page data carries nothing worth archiving.
    if (!page) {
        return true;
    }
    // Keep the result only when the HTTP status is not an error (4xx/5xx).
    return isError(page.status);
}
@@ -0,0 +1,23 @@
1
+ import type { ExURL, ParseURLOptions } from '@d-zero/shared/parse-url';
2
/**
 * Parameters for {@link shouldSkipUrl}.
 */
export interface ShouldSkipUrlParams {
    /** The parsed URL to check. */
    readonly url: ExURL;
    /** Array of glob patterns for URLs to exclude. */
    readonly excludes: readonly string[];
    /** Array of URL prefixes to exclude (matched via `startsWith` on a protocol-agnostic key, so `http://` and `https://` are treated the same). */
    readonly excludeUrls: readonly string[];
    /** URL parsing options used for pattern matching. */
    readonly options: ParseURLOptions;
}
15
/**
 * Determine whether a URL should be skipped during crawling.
 *
 * A URL is skipped if it matches any user-defined exclude glob pattern
 * or starts with any of the excluded URL prefixes. Prefix matching is
 * protocol-agnostic (`http://` and `https://` compare equal).
 * @param params - Parameters containing the URL, exclude patterns, and options.
 * @returns `true` if the URL should be skipped.
 */
export declare function shouldSkipUrl(params: ShouldSkipUrlParams): boolean;
@@ -0,0 +1,15 @@
1
+ import { pathMatch } from '@d-zero/shared/path-match';
2
+ import { protocolAgnosticKey } from './protocol-agnostic-key.js';
3
+ /**
4
+ * Determine whether a URL should be skipped during crawling.
5
+ *
6
+ * A URL is skipped if it matches any user-defined exclude glob pattern
7
+ * or starts with any of the excluded URL prefixes.
8
+ * @param params - Parameters containing the URL, exclude patterns, and options.
9
+ * @returns `true` if the URL should be skipped.
10
+ */
11
+ export function shouldSkipUrl(params) {
12
+ const { url, excludes, excludeUrls, options } = params;
13
+ return (excludes.some((excludeGlobPattern) => pathMatch(url, excludeGlobPattern, options)) ||
14
+ excludeUrls.some((prefix) => protocolAgnosticKey(url.href).startsWith(protocolAgnosticKey(prefix))));
15
+ }
@@ -0,0 +1,52 @@
1
+ import type { ScrapeResult } from '@nitpicker/beholder';
2
/**
 * Describes a detected pagination pattern between two consecutive URLs.
 */
export interface PaginationPattern {
    /** Index within the combined token array (path segments + query values) where the numeric difference was found. */
    tokenIndex: number;
    /** The numeric increment between the two URLs (always > 0). */
    step: number;
    /** The number found at `tokenIndex` in the "current" (more recent) URL. */
    currentNumber: number;
}
13
/**
 * Compares two consecutive URL strings and detects a single-token numeric
 * pagination pattern (e.g. `/page/1` → `/page/2`, or `?p=1` → `?p=2`).
 *
 * The algorithm decomposes each URL into tokens (path segments + sorted query values),
 * then checks that exactly one token differs and both values are integers with a
 * positive step. Returns `null` when no pattern is detected.
 *
 * WHY single-token constraint: Multi-token differences (e.g. both path and query
 * changing) indicate different routes rather than pagination, so they are rejected.
 * @param prevUrl - The previously pushed URL (protocol-agnostic, without hash/auth)
 * @param currentUrl - The newly discovered URL
 * @returns The detected pattern, or `null` if no pagination pattern was found
 */
export declare function detectPaginationPattern(prevUrl: string, currentUrl: string): PaginationPattern | null;
28
/**
 * Generates speculative URLs by extrapolating the detected pagination pattern.
 *
 * Starting from `currentUrl`, applies the pattern's step `count` times to produce
 * future page URLs (e.g. if step=1 and currentNumber=2, generates page 3, 4, ...).
 * These URLs are pushed into the crawl queue and discarded later if they 404
 * (see {@link shouldDiscardSpeculative}).
 * @param pattern - The detected pagination pattern from {@link detectPaginationPattern}
 * @param currentUrl - The URL to extrapolate from (protocol-agnostic, without hash/auth)
 * @param count - Number of speculative URLs to generate (typically equals concurrency)
 * @returns Array of speculative URL strings
 */
export declare function generateSpeculativeUrls(pattern: PaginationPattern, currentUrl: string, count: number): string[];
40
/**
 * Determines whether a speculative URL's scrape result should be discarded.
 *
 * Speculative URLs are pre-emptively pushed into the crawl queue before
 * knowing if they exist. This function filters out invalid results:
 * - `error` type → discard (server unreachable, timeout, etc.)
 * - `ignoreAndSkip` type → discard (matched exclusion rule)
 * - `scrapeEnd` with HTTP error status (4xx/5xx) → discard
 * - `scrapeEnd` with 2xx/3xx → keep
 *
 * NOTE(review): these result-type tags (`ignoreAndSkip`, `scrapeEnd`) differ
 * from the `skipped`/`success` tags documented for `shouldDiscardPredicted`,
 * and this file imports `ScrapeResult` from `@nitpicker/beholder` while the
 * predicted-URL module imports from `@d-zero/beholder` — confirm the tag
 * names match the installed beholder version.
 * @param result - The scrape result for the speculative URL
 * @returns `true` if the result should be discarded (not saved to archive)
 */
export declare function shouldDiscardSpeculative(result: ScrapeResult): boolean;