@nitpicker/crawler 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +8 -0
- package/LICENSE +191 -0
- package/README.md +13 -0
- package/lib/archive/archive-accessor.d.ts +107 -0
- package/lib/archive/archive-accessor.js +264 -0
- package/lib/archive/archive.d.ts +174 -0
- package/lib/archive/archive.js +331 -0
- package/lib/archive/database.d.ts +207 -0
- package/lib/archive/database.js +972 -0
- package/lib/archive/debug.d.ts +8 -0
- package/lib/archive/debug.js +9 -0
- package/lib/archive/filesystem/append-text.d.ts +9 -0
- package/lib/archive/filesystem/append-text.js +14 -0
- package/lib/archive/filesystem/copy-dir-sync.d.ts +6 -0
- package/lib/archive/filesystem/copy-dir-sync.js +9 -0
- package/lib/archive/filesystem/copy-dir.d.ts +7 -0
- package/lib/archive/filesystem/copy-dir.js +13 -0
- package/lib/archive/filesystem/exists.d.ts +6 -0
- package/lib/archive/filesystem/exists.js +9 -0
- package/lib/archive/filesystem/get-file-list.d.ts +8 -0
- package/lib/archive/filesystem/get-file-list.js +12 -0
- package/lib/archive/filesystem/index.d.ts +17 -0
- package/lib/archive/filesystem/index.js +17 -0
- package/lib/archive/filesystem/is-dir.d.ts +6 -0
- package/lib/archive/filesystem/is-dir.js +10 -0
- package/lib/archive/filesystem/mkdir.d.ts +8 -0
- package/lib/archive/filesystem/mkdir.js +15 -0
- package/lib/archive/filesystem/output-json.d.ts +9 -0
- package/lib/archive/filesystem/output-json.js +14 -0
- package/lib/archive/filesystem/output-text.d.ts +11 -0
- package/lib/archive/filesystem/output-text.js +32 -0
- package/lib/archive/filesystem/read-json.d.ts +7 -0
- package/lib/archive/filesystem/read-json.js +11 -0
- package/lib/archive/filesystem/read-text.d.ts +6 -0
- package/lib/archive/filesystem/read-text.js +10 -0
- package/lib/archive/filesystem/readline.d.ts +11 -0
- package/lib/archive/filesystem/readline.js +26 -0
- package/lib/archive/filesystem/remove.d.ts +5 -0
- package/lib/archive/filesystem/remove.js +10 -0
- package/lib/archive/filesystem/rename.d.ts +11 -0
- package/lib/archive/filesystem/rename.js +18 -0
- package/lib/archive/filesystem/tar.d.ts +11 -0
- package/lib/archive/filesystem/tar.js +22 -0
- package/lib/archive/filesystem/untar.d.ts +20 -0
- package/lib/archive/filesystem/untar.js +24 -0
- package/lib/archive/filesystem/utils.d.ts +109 -0
- package/lib/archive/filesystem/utils.js +185 -0
- package/lib/archive/filesystem/zip.d.ts +29 -0
- package/lib/archive/filesystem/zip.js +53 -0
- package/lib/archive/index.d.ts +6 -0
- package/lib/archive/index.js +11 -0
- package/lib/archive/page.d.ts +263 -0
- package/lib/archive/page.js +316 -0
- package/lib/archive/resource.d.ts +46 -0
- package/lib/archive/resource.js +62 -0
- package/lib/archive/safe-path.d.ts +9 -0
- package/lib/archive/safe-path.js +17 -0
- package/lib/archive/types.d.ts +210 -0
- package/lib/archive/types.js +1 -0
- package/lib/crawler/clear-destination-cache.d.ts +5 -0
- package/lib/crawler/clear-destination-cache.js +8 -0
- package/lib/crawler/crawler.d.ts +73 -0
- package/lib/crawler/crawler.js +748 -0
- package/lib/crawler/decompose-url.d.ts +25 -0
- package/lib/crawler/decompose-url.js +71 -0
- package/lib/crawler/destination-cache.d.ts +7 -0
- package/lib/crawler/destination-cache.js +6 -0
- package/lib/crawler/detect-pagination-pattern.d.ts +16 -0
- package/lib/crawler/detect-pagination-pattern.js +61 -0
- package/lib/crawler/fetch-destination.d.ts +38 -0
- package/lib/crawler/fetch-destination.js +208 -0
- package/lib/crawler/fetch-robots-txt.d.ts +42 -0
- package/lib/crawler/fetch-robots-txt.js +44 -0
- package/lib/crawler/find-best-matching-scope.d.ts +12 -0
- package/lib/crawler/find-best-matching-scope.js +46 -0
- package/lib/crawler/generate-predicted-urls.d.ts +13 -0
- package/lib/crawler/generate-predicted-urls.js +27 -0
- package/lib/crawler/handle-ignore-and-skip.d.ts +16 -0
- package/lib/crawler/handle-ignore-and-skip.js +19 -0
- package/lib/crawler/handle-resource-response.d.ts +13 -0
- package/lib/crawler/handle-resource-response.js +16 -0
- package/lib/crawler/handle-scrape-end.d.ts +24 -0
- package/lib/crawler/handle-scrape-end.js +82 -0
- package/lib/crawler/handle-scrape-error.d.ts +37 -0
- package/lib/crawler/handle-scrape-error.js +38 -0
- package/lib/crawler/index.d.ts +2 -0
- package/lib/crawler/index.js +2 -0
- package/lib/crawler/inject-scope-auth.d.ts +11 -0
- package/lib/crawler/inject-scope-auth.js +21 -0
- package/lib/crawler/is-external-url.d.ts +11 -0
- package/lib/crawler/is-external-url.js +12 -0
- package/lib/crawler/is-in-any-lower-layer.d.ts +13 -0
- package/lib/crawler/is-in-any-lower-layer.js +15 -0
- package/lib/crawler/link-list.d.ts +112 -0
- package/lib/crawler/link-list.js +248 -0
- package/lib/crawler/link-to-page-data.d.ts +14 -0
- package/lib/crawler/link-to-page-data.js +32 -0
- package/lib/crawler/net-timeout-error.d.ts +9 -0
- package/lib/crawler/net-timeout-error.js +11 -0
- package/lib/crawler/network.d.ts +30 -0
- package/lib/crawler/network.js +226 -0
- package/lib/crawler/protocol-agnostic-key.d.ts +9 -0
- package/lib/crawler/protocol-agnostic-key.js +11 -0
- package/lib/crawler/reconstruct-url.d.ts +10 -0
- package/lib/crawler/reconstruct-url.js +28 -0
- package/lib/crawler/result-handler.d.ts +118 -0
- package/lib/crawler/result-handler.js +153 -0
- package/lib/crawler/robots-checker.d.ts +26 -0
- package/lib/crawler/robots-checker.js +62 -0
- package/lib/crawler/should-discard-predicted.d.ts +14 -0
- package/lib/crawler/should-discard-predicted.js +31 -0
- package/lib/crawler/should-skip-url.d.ts +23 -0
- package/lib/crawler/should-skip-url.js +15 -0
- package/lib/crawler/speculative-pagination.d.ts +52 -0
- package/lib/crawler/speculative-pagination.js +215 -0
- package/lib/crawler/types.d.ts +119 -0
- package/lib/crawler/types.js +1 -0
- package/lib/crawler/url-filter.d.ts +56 -0
- package/lib/crawler/url-filter.js +110 -0
- package/lib/crawler-orchestrator.d.ts +142 -0
- package/lib/crawler-orchestrator.js +309 -0
- package/lib/debug.d.ts +8 -0
- package/lib/debug.js +9 -0
- package/lib/index.d.ts +16 -0
- package/lib/index.js +18 -0
- package/lib/qzilla.d.ts +136 -0
- package/lib/qzilla.js +292 -0
- package/lib/types.d.ts +27 -0
- package/lib/types.js +1 -0
- package/lib/utils/array/each-splitted.d.ts +10 -0
- package/lib/utils/array/each-splitted.js +14 -0
- package/lib/utils/array/index.d.ts +1 -0
- package/lib/utils/array/index.js +1 -0
- package/lib/utils/async/index.d.ts +1 -0
- package/lib/utils/async/index.js +1 -0
- package/lib/utils/debug.d.ts +5 -0
- package/lib/utils/debug.js +5 -0
- package/lib/utils/error/dom-evaluation-error.d.ts +7 -0
- package/lib/utils/error/dom-evaluation-error.js +7 -0
- package/lib/utils/error/error-emitter.d.ts +18 -0
- package/lib/utils/error/error-emitter.js +29 -0
- package/lib/utils/error/index.d.ts +3 -0
- package/lib/utils/error/index.js +2 -0
- package/lib/utils/event-emitter/index.d.ts +6 -0
- package/lib/utils/event-emitter/index.js +6 -0
- package/lib/utils/index.d.ts +5 -0
- package/lib/utils/index.js +5 -0
- package/lib/utils/network/index.d.ts +1 -0
- package/lib/utils/network/index.js +1 -0
- package/lib/utils/object/clean-object.d.ts +8 -0
- package/lib/utils/object/clean-object.js +13 -0
- package/lib/utils/object/index.d.ts +1 -0
- package/lib/utils/object/index.js +1 -0
- package/lib/utils/path/index.d.ts +1 -0
- package/lib/utils/path/index.js +1 -0
- package/lib/utils/path/safe-filepath.d.ts +7 -0
- package/lib/utils/path/safe-filepath.js +12 -0
- package/lib/utils/regexp/index.d.ts +1 -0
- package/lib/utils/regexp/index.js +1 -0
- package/lib/utils/retryable/index.d.ts +2 -0
- package/lib/utils/retryable/index.js +1 -0
- package/lib/utils/sort/index.d.ts +14 -0
- package/lib/utils/sort/index.js +61 -0
- package/lib/utils/sort/remove-matches.d.ts +9 -0
- package/lib/utils/sort/remove-matches.js +23 -0
- package/lib/utils/types/index.d.ts +1 -0
- package/lib/utils/types/index.js +1 -0
- package/lib/utils/types/types.d.ts +46 -0
- package/lib/utils/types/types.js +1 -0
- package/lib/utils/url/index.d.ts +5 -0
- package/lib/utils/url/index.js +5 -0
- package/lib/utils/url/is-lower-layer.d.ts +15 -0
- package/lib/utils/url/is-lower-layer.js +55 -0
- package/lib/utils/url/parse-url.d.ts +11 -0
- package/lib/utils/url/parse-url.js +20 -0
- package/lib/utils/url/path-match.d.ts +11 -0
- package/lib/utils/url/path-match.js +18 -0
- package/lib/utils/url/sort-url.d.ts +10 -0
- package/lib/utils/url/sort-url.js +24 -0
- package/lib/utils/url/url-partial-match.d.ts +11 -0
- package/lib/utils/url/url-partial-match.js +32 -0
- package/package.json +49 -0
- package/src/archive/__mock__/.gitignore +3 -0
- package/src/archive/__mock__/mock.sqlite +0 -0
- package/src/archive/archive-accessor.ts +337 -0
- package/src/archive/archive.ts +408 -0
- package/src/archive/database.spec.ts +469 -0
- package/src/archive/database.ts +1059 -0
- package/src/archive/debug.ts +10 -0
- package/src/archive/filesystem/append-text.spec.ts +26 -0
- package/src/archive/filesystem/append-text.ts +16 -0
- package/src/archive/filesystem/copy-dir-sync.spec.ts +27 -0
- package/src/archive/filesystem/copy-dir-sync.ts +10 -0
- package/src/archive/filesystem/copy-dir.spec.ts +33 -0
- package/src/archive/filesystem/copy-dir.ts +14 -0
- package/src/archive/filesystem/exists.spec.ts +33 -0
- package/src/archive/filesystem/exists.ts +10 -0
- package/src/archive/filesystem/get-file-list.spec.ts +37 -0
- package/src/archive/filesystem/get-file-list.ts +13 -0
- package/src/archive/filesystem/index.ts +17 -0
- package/src/archive/filesystem/is-dir.spec.ts +29 -0
- package/src/archive/filesystem/is-dir.ts +11 -0
- package/src/archive/filesystem/mkdir.spec.ts +37 -0
- package/src/archive/filesystem/mkdir.ts +16 -0
- package/src/archive/filesystem/output-json.spec.ts +34 -0
- package/src/archive/filesystem/output-json.ts +16 -0
- package/src/archive/filesystem/output-text.spec.ts +31 -0
- package/src/archive/filesystem/output-text.ts +35 -0
- package/src/archive/filesystem/read-json.spec.ts +26 -0
- package/src/archive/filesystem/read-json.ts +12 -0
- package/src/archive/filesystem/read-text.spec.ts +25 -0
- package/src/archive/filesystem/read-text.ts +11 -0
- package/src/archive/filesystem/readline.spec.ts +29 -0
- package/src/archive/filesystem/readline.ts +30 -0
- package/src/archive/filesystem/remove.spec.ts +34 -0
- package/src/archive/filesystem/remove.ts +11 -0
- package/src/archive/filesystem/rename.spec.ts +46 -0
- package/src/archive/filesystem/rename.ts +21 -0
- package/src/archive/filesystem/tar.spec.ts +33 -0
- package/src/archive/filesystem/tar.ts +27 -0
- package/src/archive/filesystem/untar.spec.ts +34 -0
- package/src/archive/filesystem/untar.ts +36 -0
- package/src/archive/index.ts +13 -0
- package/src/archive/page.spec.ts +368 -0
- package/src/archive/page.ts +420 -0
- package/src/archive/resource.spec.ts +101 -0
- package/src/archive/resource.ts +73 -0
- package/src/archive/safe-path.spec.ts +44 -0
- package/src/archive/safe-path.ts +18 -0
- package/src/archive/types.ts +227 -0
- package/src/crawler/clear-destination-cache.spec.ts +20 -0
- package/src/crawler/clear-destination-cache.ts +9 -0
- package/src/crawler/crawler.ts +873 -0
- package/src/crawler/decompose-url.spec.ts +48 -0
- package/src/crawler/decompose-url.ts +90 -0
- package/src/crawler/destination-cache.spec.ts +23 -0
- package/src/crawler/destination-cache.ts +8 -0
- package/src/crawler/detect-pagination-pattern.spec.ts +169 -0
- package/src/crawler/detect-pagination-pattern.ts +66 -0
- package/src/crawler/fetch-destination.ts +257 -0
- package/src/crawler/fetch-robots-txt.spec.ts +83 -0
- package/src/crawler/fetch-robots-txt.ts +91 -0
- package/src/crawler/find-best-matching-scope.spec.ts +39 -0
- package/src/crawler/find-best-matching-scope.ts +57 -0
- package/src/crawler/generate-predicted-urls.spec.ts +42 -0
- package/src/crawler/generate-predicted-urls.ts +34 -0
- package/src/crawler/handle-ignore-and-skip.spec.ts +66 -0
- package/src/crawler/handle-ignore-and-skip.ts +30 -0
- package/src/crawler/handle-resource-response.spec.ts +45 -0
- package/src/crawler/handle-resource-response.ts +21 -0
- package/src/crawler/handle-scrape-end.spec.ts +109 -0
- package/src/crawler/handle-scrape-end.ts +115 -0
- package/src/crawler/handle-scrape-error.spec.ts +105 -0
- package/src/crawler/handle-scrape-error.ts +58 -0
- package/src/crawler/index.ts +2 -0
- package/src/crawler/inject-scope-auth.spec.ts +36 -0
- package/src/crawler/inject-scope-auth.ts +27 -0
- package/src/crawler/is-external-url.spec.ts +31 -0
- package/src/crawler/is-external-url.ts +17 -0
- package/src/crawler/is-in-any-lower-layer.spec.ts +31 -0
- package/src/crawler/is-in-any-lower-layer.ts +22 -0
- package/src/crawler/link-list.spec.ts +355 -0
- package/src/crawler/link-list.ts +275 -0
- package/src/crawler/link-to-page-data.spec.ts +133 -0
- package/src/crawler/link-to-page-data.ts +34 -0
- package/src/crawler/net-timeout-error.spec.ts +25 -0
- package/src/crawler/net-timeout-error.ts +11 -0
- package/src/crawler/protocol-agnostic-key.spec.ts +40 -0
- package/src/crawler/protocol-agnostic-key.ts +11 -0
- package/src/crawler/reconstruct-url.spec.ts +37 -0
- package/src/crawler/reconstruct-url.ts +37 -0
- package/src/crawler/robots-checker.spec.ts +104 -0
- package/src/crawler/robots-checker.ts +73 -0
- package/src/crawler/should-discard-predicted.spec.ts +125 -0
- package/src/crawler/should-discard-predicted.ts +33 -0
- package/src/crawler/should-skip-url.spec.ts +77 -0
- package/src/crawler/should-skip-url.ts +37 -0
- package/src/crawler/types.ts +146 -0
- package/src/crawler-orchestrator.ts +401 -0
- package/src/debug.ts +10 -0
- package/src/index.ts +25 -0
- package/src/types.ts +30 -0
- package/src/utils/array/each-splitted.spec.ts +38 -0
- package/src/utils/array/each-splitted.ts +19 -0
- package/src/utils/array/index.ts +1 -0
- package/src/utils/debug.ts +6 -0
- package/src/utils/error/dom-evaluation-error.spec.ts +20 -0
- package/src/utils/error/dom-evaluation-error.ts +6 -0
- package/src/utils/error/error-emitter.spec.ts +78 -0
- package/src/utils/error/error-emitter.ts +44 -0
- package/src/utils/error/index.ts +3 -0
- package/src/utils/index.ts +5 -0
- package/src/utils/object/clean-object.spec.ts +24 -0
- package/src/utils/object/clean-object.ts +13 -0
- package/src/utils/object/index.ts +1 -0
- package/src/utils/types/index.ts +1 -0
- package/src/utils/types/types.ts +65 -0
- package/tsconfig.json +11 -0
- package/tsconfig.tsbuildinfo +1 -0
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import type { Resource } from '../utils/index.js';
/**
 * Track a network resource response and determine if it is newly discovered.
 *
 * Checks whether the resource URL has already been seen. If it is new,
 * adds it to the known resources set.
 *
 * Note: the caller-supplied `resources` set is mutated in place when the
 * resource is new.
 * @param resource - The captured network resource data.
 * @param resources - The set of already-known resource URLs (without hash).
 * @returns An object with `isNew` indicating whether this resource was seen for the first time.
 */
export declare function handleResourceResponse(resource: Resource, resources: Set<string>): {
    isNew: boolean;
};
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/**
 * Track a network resource response and determine if it is newly discovered.
 *
 * The hash-stripped URL is the deduplication key: a URL not yet present in
 * `resources` is recorded there and reported as new; an already-known URL
 * leaves the set untouched.
 * @param resource - The captured network resource data.
 * @param resources - The set of already-known resource URLs (without hash).
 * @returns An object with `isNew` indicating whether this resource was seen for the first time.
 */
export function handleResourceResponse(resource, resources) {
    const key = resource.url.withoutHash;
    if (resources.has(key)) {
        return { isNew: false };
    }
    resources.add(key);
    return { isNew: true };
}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import type LinkList from './link-list.js';
import type { CrawlerOptions } from './types.js';
import type { Link, PageData } from '../utils/index.js';
import type { ExURL } from '@d-zero/shared/parse-url';
/**
 * Process the result of a successful page scrape.
 *
 * Extracts anchors from the page (unless in metadata-only mode), enqueues
 * newly discovered URLs via the `addUrl` callback, and marks the URL
 * as done in the link list.
 *
 * The returned `link` is `null` when `linkList.done` reports the URL was
 * not in the queue.
 * @param result - The scraped page data.
 * @param linkList - The link list managing the crawl queue.
 * @param scope - Map of hostnames to their scope URLs.
 * @param options - Crawler configuration options.
 * @param addUrl - Callback to enqueue a newly discovered URL. Accepts optional
 * `{ metadataOnly: true }` to request metadata-only scraping.
 * @returns An object containing the constructed link and whether the page is external.
 */
export declare function handleScrapeEnd(result: PageData, linkList: LinkList, scope: ReadonlyMap<string, readonly ExURL[]>, options: CrawlerOptions, addUrl: (url: ExURL, opts?: {
    metadataOnly?: true;
}) => void): {
    link: Link | null;
    isExternal: boolean;
};
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
import { crawlerLog } from '../debug.js';
|
|
2
|
+
import { injectScopeAuth } from './inject-scope-auth.js';
|
|
3
|
+
import { isExternalUrl } from './is-external-url.js';
|
|
4
|
+
import { isInAnyLowerLayer } from './is-in-any-lower-layer.js';
|
|
5
|
+
/**
 * Process the result of a successful page scrape.
 *
 * Unless the page was queued for metadata-only scraping, its anchors are
 * handed to `processAnchors`, which enqueues newly discovered URLs via the
 * `addUrl` callback. The page URL is then marked as done in the link list
 * and the outcome is logged.
 * @param result - The scraped page data.
 * @param linkList - The link list managing the crawl queue.
 * @param scope - Map of hostnames to their scope URLs.
 * @param options - Crawler configuration options.
 * @param addUrl - Callback to enqueue a newly discovered URL. Accepts optional
 * `{ metadataOnly: true }` to request metadata-only scraping.
 * @returns An object containing the constructed link and whether the page is external.
 */
export function handleScrapeEnd(result, linkList, scope, options, addUrl) {
    const pageUrl = result.url;
    // Metadata-only pages are scraped for their metadata alone; their
    // anchors must not feed the crawl queue.
    if (!linkList.isMetadataOnly(pageUrl.withoutHash)) {
        processAnchors(result.anchorList, scope, options, addUrl);
    }
    const link = linkList.done(pageUrl, scope, { page: result }, options);
    crawlerLog('Scrape end URL: %s', pageUrl.href);
    crawlerLog('Scrape end Status: %d', result.status);
    crawlerLog('Scrape end Type: %s', result.contentType);
    const { isExternal } = result;
    if (!isExternal) {
        crawlerLog('Scrape end Anchors: %d URLs', result.anchorList.length);
    }
    return { link, isExternal };
}
|
|
35
|
+
/**
 * Process anchor elements extracted from a scraped page and enqueue new URLs.
 *
 * For each anchor:
 * 1. Determines if it is external (outside the crawl scope)
 * 2. Injects authentication credentials from matching scope URLs
 * 3. Reconstructs the `withoutHash` URL with injected auth
 * 4. In recursive mode: enqueues internal lower-layer URLs for full scraping,
 *    and external URLs for metadata-only scraping (if `fetchExternal` is enabled)
 * 5. In non-recursive mode: enqueues all URLs for metadata-only scraping
 * @param anchors - The list of anchor data extracted from the page.
 * @param scope - Map of hostnames to their scope URLs.
 * @param options - Crawler configuration options.
 * @param addUrl - Callback to enqueue a newly discovered URL. Accepts optional
 * `{ metadataOnly: true }` to request metadata-only scraping.
 */
function processAnchors(anchors, scope, options, addUrl) {
    for (const anchor of anchors) {
        const isExternal = isExternalUrl(anchor.href, scope);
        // Mutates the anchor in place so downstream consumers see the
        // internal/external classification.
        anchor.isExternal = isExternal;
        // Only internal URLs that are missing credentials receive scope
        // auth; `withoutHash` is then rebuilt to carry those credentials.
        if (!isExternal && (!anchor.href.username || !anchor.href.password)) {
            injectScopeAuth(anchor.href, scope);
            // `user:pass@` prefix — only when BOTH parts are present.
            const auth = anchor.href.username && anchor.href.password
                ? `${anchor.href.username}:${anchor.href.password}@`
                : '';
            const host = anchor.href.hostname + (anchor.href.port ? `:${anchor.href.port}` : '');
            const newSearch = anchor.href.query ? `?${anchor.href.query}` : '';
            // Path-and-query part: joined path segments plus query when the
            // URL has a dirname, otherwise just the query (or nothing).
            const body = anchor.href.dirname
                ? `${anchor.href.paths.join('/')}${newSearch}`
                : newSearch
                    ? `${newSearch}`
                    : '';
            const withoutHash = `${anchor.href.protocol}//${auth}${host}${body ? `/${body}` : ''}`;
            anchor.href.withoutHash = withoutHash;
        }
        if (options.recursive) {
            const scopes = scope.get(anchor.href.hostname);
            // Internal URLs at or below a scope path get a full scrape…
            if (scopes && isInAnyLowerLayer(anchor.href, scopes, options)) {
                addUrl(anchor.href);
            }
            // …while external URLs are fetched metadata-only, and only when
            // the crawler is configured to follow them.
            else if (isExternal && options.fetchExternal) {
                addUrl(anchor.href, { metadataOnly: true });
            }
            continue;
        }
        // Non-recursive crawl: every discovered URL is metadata-only.
        addUrl(anchor.href, { metadataOnly: true });
    }
}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import type LinkList from './link-list.js';
import type { CrawlerOptions } from './types.js';
import type { Link, PageData } from '../utils/index.js';
import type { ExURL } from '@d-zero/shared/parse-url';
/**
 * Handle an error that occurred during page scraping.
 *
 * Marks the URL as done and creates a fallback {@link PageData} from the
 * link, regardless of whether the error caused a shutdown. This ensures
 * that errored URLs are recorded in the DB (`status = -1, scraped = 1`)
 * and not re-queued on resume.
 *
 * `result` is only present when the errored URL was tracked in the link
 * list and could be converted via `linkToPageData`.
 * @param payload - The error payload from the scraper.
 * @param payload.url - The URL being scraped when the error occurred, or `null`.
 * @param payload.error - The error details including name, message, and optional stack.
 * @param payload.error.name
 * @param payload.error.message
 * @param payload.error.stack
 * @param payload.shutdown - Whether the error caused the scraper process to shut down.
 * @param payload.pid - The process ID of the scraper, or `undefined`.
 * @param linkList - The link list managing the crawl queue.
 * @param scope - Map of hostnames to their scope URLs.
 * @param options - Crawler configuration options.
 * @returns An object with the link and an optional fallback PageData result.
 */
export declare function handleScrapeError(payload: {
    url: ExURL | null;
    error: {
        name: string;
        message: string;
        stack?: string;
    };
    shutdown: boolean;
    pid: number | undefined;
}, linkList: LinkList, scope: ReadonlyMap<string, readonly ExURL[]>, options: CrawlerOptions): {
    link: Link | null;
    result?: PageData;
};
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
import { crawlerErrorLog } from '../debug.js';
|
|
2
|
+
import { linkToPageData } from './link-to-page-data.js';
|
|
3
|
+
/**
 * Handle an error that occurred during page scraping.
 *
 * When the failing URL is known, it is marked done in the link list and a
 * fallback {@link PageData} is derived from the resulting link — whether or
 * not the error caused a shutdown — so errored URLs end up recorded in the
 * DB (`status = -1, scraped = 1`) and are not re-queued on resume.
 * @param payload - The error payload from the scraper.
 * @param payload.url - The URL being scraped when the error occurred, or `null`.
 * @param payload.error - The error details including name, message, and optional stack.
 * @param payload.error.name
 * @param payload.error.message
 * @param payload.error.stack
 * @param payload.shutdown - Whether the error caused the scraper process to shut down.
 * @param payload.pid - The process ID of the scraper, or `undefined`.
 * @param linkList - The link list managing the crawl queue.
 * @param scope - Map of hostnames to their scope URLs.
 * @param options - Crawler configuration options.
 * @returns An object with the link and an optional fallback PageData result.
 */
export function handleScrapeError(payload, linkList, scope, options) {
    const { url, error, shutdown, pid } = payload;
    let link = null;
    let result;
    // Only URLs the queue actually knows about can be marked done and
    // converted into a fallback page record.
    const done = url ? linkList.done(url, scope, { error }, options) : null;
    if (done) {
        link = done;
        result = linkToPageData(done);
    }
    crawlerErrorLog('From %d(%s)', pid, url?.href ?? 'UNKNOWN_URL');
    crawlerErrorLog('Then shutdown?: %s', shutdown ? 'Yes' : 'No');
    crawlerErrorLog('%O', error);
    return { link, result };
}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import type { ExURL } from '@d-zero/shared/parse-url';
/**
 * Inject authentication credentials from a matching scope URL into the target URL.
 *
 * Finds the best-matching scope URL (deepest path match) for the given URL's
 * hostname and copies its `username` and `password` properties. This mutates
 * the `url` parameter in place.
 *
 * No-op when the hostname has no registered scope URLs or none of them match.
 * @param url - The parsed URL to receive authentication credentials (mutated in place).
 * @param scope - Map of hostnames to their scope URLs.
 */
export declare function injectScopeAuth(url: ExURL, scope: ReadonlyMap<string, readonly ExURL[]>): void;
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import { findBestMatchingScope } from './find-best-matching-scope.js';
|
|
2
|
+
/**
 * Inject authentication credentials from a matching scope URL into the target URL.
 *
 * Looks up the scope URLs registered for the URL's hostname, picks the best
 * match via `findBestMatchingScope`, and copies its `username` and
 * `password` onto `url`. The `url` argument is mutated in place; nothing
 * happens when the hostname is unknown or no scope matches.
 * @param url - The parsed URL to receive authentication credentials (mutated in place).
 * @param scope - Map of hostnames to their scope URLs.
 */
export function injectScopeAuth(url, scope) {
    const candidates = scope.get(url.hostname);
    if (!candidates) {
        return;
    }
    const match = findBestMatchingScope(url, candidates);
    if (!match) {
        return;
    }
    url.username = match.username;
    url.password = match.password;
}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import type { ExURL } from '@d-zero/shared/parse-url';
/**
 * Determine whether a URL is external to the crawl scope.
 *
 * A URL is considered external if its hostname does not appear
 * as a key in the scope map. The lookup is an exact `Map.has`
 * comparison — no subdomain or wildcard matching.
 * @param url - The parsed URL to check.
 * @param scope - Map of hostnames to their scope URLs.
 * @returns `true` if the URL is outside the crawl scope.
 */
export declare function isExternalUrl(url: ExURL, scope: ReadonlyMap<string, readonly ExURL[]>): boolean;
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
/**
 * Determine whether a URL is external to the crawl scope.
 *
 * Internal hostnames are exactly the keys of the scope map, so a URL is
 * external precisely when its hostname is absent from that map.
 * @param url - The parsed URL to check.
 * @param scope - Map of hostnames to their scope URLs.
 * @returns `true` if the URL is outside the crawl scope.
 */
export function isExternalUrl(url, scope) {
    const isInScope = scope.has(url.hostname);
    return !isInScope;
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import type { ExURL, ParseURLOptions } from '@d-zero/shared/parse-url';
/**
 * Check whether a URL is in a lower layer (subdirectory) of any scope URL.
 *
 * Tests the URL against each scope URL using the `isLowerLayer` utility,
 * which checks if the URL's path is at the same level or deeper than
 * the scope URL's path.
 *
 * Returns `false` for an empty `scopes` list.
 * @param url - The parsed URL to check.
 * @param scopes - The list of scope URLs to test against.
 * @param options - URL parsing options used for layer comparison.
 * @returns `true` if the URL is in a lower layer of at least one scope URL.
 */
export declare function isInAnyLowerLayer(url: ExURL, scopes: readonly ExURL[], options: ParseURLOptions): boolean;
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import { isLowerLayer } from '@d-zero/shared/is-lower-layer';
|
|
2
|
+
/**
 * Check whether a URL is in a lower layer (subdirectory) of any scope URL.
 *
 * Delegates to the shared `isLowerLayer` helper for each scope URL and
 * succeeds as soon as one of them matches; an empty scope list yields
 * `false`.
 * @param url - The parsed URL to check.
 * @param scopes - The list of scope URLs to test against.
 * @param options - URL parsing options used for layer comparison.
 * @returns `true` if the URL is in a lower layer of at least one scope URL.
 */
export function isInAnyLowerLayer(url, scopes, options) {
    for (const candidate of scopes) {
        if (isLowerLayer(url.href, candidate.href, options)) {
            return true;
        }
    }
    return false;
}
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
import type { Link, PageData } from '../utils/index.js';
|
|
2
|
+
import type { ExURL, ParseURLOptions } from '@d-zero/shared/parse-url';
|
|
3
|
+
/**
 * Manages the queue of URLs discovered during crawling.
 *
 * Tracks URLs across three states: pending (queued but not started),
 * in-progress (currently being scraped), and done (scraping completed).
 * Provides deduplication based on `withoutHashAndAuth` normalization
 * and tracks page completion counts for progress reporting.
 */
export default class LinkList {
    /* Compiler-emitted marker: this class has private (`#`) members whose
       shapes are intentionally hidden from the public declaration. */
    #private;
    /**
     * The number of successfully completed internal HTML pages.
     *
     * Only counts pages that are internal, in a lower layer, use HTTP(S),
     * have no error status, and have `text/html` content type.
     */
    get completePages(): number;
    /**
     * Add a URL to the pending queue if it has not been seen before.
     *
     * Deduplication is based on the URL's `withoutHashAndAuth` representation.
     * If the URL is already pending, in progress, or done, this is a no-op.
     * @param linkUrl - The parsed URL to add to the queue.
     * @param options - Optional flags for the URL.
     * @param options.metadataOnly - If `true`, marks this URL for title-only scraping
     * (metadata extraction without full page processing).
     * @param options.predicted - If `true`, marks this URL as a predicted pagination guess
     * that should be discarded if it returns a 4xx/5xx status.
     */
    add(linkUrl: ExURL, options?: {
        // Literal `true` (not boolean): flags may only be set, never explicitly `false`.
        metadataOnly?: true;
        predicted?: true;
    }): void;
    /**
     * Mark a URL as completed and record its scrape result.
     *
     * Moves the URL from pending/progress to done, constructs a {@link Link}
     * object with scope and layer information, and increments the page counter
     * if the result qualifies as a valid HTML page.
     * @param url - The URL that has been scraped.
     * @param scope - The current scope map (hostname to scope URLs).
     * @param resource - The scrape result containing page data and/or error information.
     * @param resource.page - The scraped page data, if the scrape succeeded.
     * @param resource.error - The error object, if the scrape failed.
     * @param options - URL parsing options (e.g., `disableQueries`).
     * @returns The constructed {@link Link} object, or `null` if the URL was not in the queue.
     */
    done(url: ExURL, scope: ReadonlyMap<string, readonly ExURL[]>, resource: {
        page?: PageData;
        error?: Error;
    }, options: ParseURLOptions): Link | null;
    /**
     * Get the current pending and in-progress URL lists.
     * @returns An object containing arrays of pending and in-progress URL strings.
     */
    getLinks(): {
        /** URLs queued but not yet started. */
        pending: string[];
        /** URLs currently being scraped. */
        progress: string[];
    };
    /**
     * Get a summary of crawl progress counts.
     * @returns An object with total/completed counts for both all links and pages only.
     */
    getPageCount(): {
        /** Total number of discovered links (pending + progress + done). */
        totalLinks: number;
        /** Number of links that have been fully processed. */
        completedLinks: number;
        /** Total number of discovered pages (pending + progress + completed pages). */
        totalPages: number;
        /** Number of pages that have been successfully scraped. */
        completedPages: number;
    };
    /**
     * Check whether a URL is flagged for title-only scraping.
     *
     * Title-only scraping extracts only the page title and basic metadata,
     * without processing anchors or capturing the full HTML.
     * @param urlWithoutHashAndAuth - The normalized URL string (without hash and auth) to check.
     * @returns `true` if the URL should be scraped in title-only mode.
     */
    isMetadataOnly(urlWithoutHashAndAuth: string): boolean;
    /**
     * Check whether a URL was added as a predicted pagination URL.
     * @param urlWithoutHashAndAuth - The normalized URL string (without hash and auth) to check.
     * @returns `true` if the URL was added with the predicted flag.
     */
    isPredicted(urlWithoutHashAndAuth: string): boolean;
    /**
     * Transition a URL from the pending state to the in-progress state.
     *
     * This should be called when scraping of the URL actually begins.
     * If the URL is not in the pending set, this is a no-op.
     * @param url - The URL that is now being actively scraped.
     */
    progress(url: ExURL): void;
    /**
     * Restore the link list state from a previous crawl session.
     *
     * Re-adds pending URLs to the queue and marks previously done URLs
     * as completed, enabling the crawler to resume from where it left off.
     * @param pending - URLs that were pending in the previous session.
     * @param done - URLs that were already completed in the previous session.
     * @param options - URL parsing options for re-parsing the pending URLs.
     * @returns The parsed pending URLs that were successfully added to the queue.
     */
    resume(pending: string[], done: string[], options: ParseURLOptions): ExURL[];
}
|