@nitpicker/crawler 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +8 -0
- package/LICENSE +191 -0
- package/README.md +13 -0
- package/lib/archive/archive-accessor.d.ts +107 -0
- package/lib/archive/archive-accessor.js +264 -0
- package/lib/archive/archive.d.ts +174 -0
- package/lib/archive/archive.js +331 -0
- package/lib/archive/database.d.ts +207 -0
- package/lib/archive/database.js +972 -0
- package/lib/archive/debug.d.ts +8 -0
- package/lib/archive/debug.js +9 -0
- package/lib/archive/filesystem/append-text.d.ts +9 -0
- package/lib/archive/filesystem/append-text.js +14 -0
- package/lib/archive/filesystem/copy-dir-sync.d.ts +6 -0
- package/lib/archive/filesystem/copy-dir-sync.js +9 -0
- package/lib/archive/filesystem/copy-dir.d.ts +7 -0
- package/lib/archive/filesystem/copy-dir.js +13 -0
- package/lib/archive/filesystem/exists.d.ts +6 -0
- package/lib/archive/filesystem/exists.js +9 -0
- package/lib/archive/filesystem/get-file-list.d.ts +8 -0
- package/lib/archive/filesystem/get-file-list.js +12 -0
- package/lib/archive/filesystem/index.d.ts +17 -0
- package/lib/archive/filesystem/index.js +17 -0
- package/lib/archive/filesystem/is-dir.d.ts +6 -0
- package/lib/archive/filesystem/is-dir.js +10 -0
- package/lib/archive/filesystem/mkdir.d.ts +8 -0
- package/lib/archive/filesystem/mkdir.js +15 -0
- package/lib/archive/filesystem/output-json.d.ts +9 -0
- package/lib/archive/filesystem/output-json.js +14 -0
- package/lib/archive/filesystem/output-text.d.ts +11 -0
- package/lib/archive/filesystem/output-text.js +32 -0
- package/lib/archive/filesystem/read-json.d.ts +7 -0
- package/lib/archive/filesystem/read-json.js +11 -0
- package/lib/archive/filesystem/read-text.d.ts +6 -0
- package/lib/archive/filesystem/read-text.js +10 -0
- package/lib/archive/filesystem/readline.d.ts +11 -0
- package/lib/archive/filesystem/readline.js +26 -0
- package/lib/archive/filesystem/remove.d.ts +5 -0
- package/lib/archive/filesystem/remove.js +10 -0
- package/lib/archive/filesystem/rename.d.ts +11 -0
- package/lib/archive/filesystem/rename.js +18 -0
- package/lib/archive/filesystem/tar.d.ts +11 -0
- package/lib/archive/filesystem/tar.js +22 -0
- package/lib/archive/filesystem/untar.d.ts +20 -0
- package/lib/archive/filesystem/untar.js +24 -0
- package/lib/archive/filesystem/utils.d.ts +109 -0
- package/lib/archive/filesystem/utils.js +185 -0
- package/lib/archive/filesystem/zip.d.ts +29 -0
- package/lib/archive/filesystem/zip.js +53 -0
- package/lib/archive/index.d.ts +6 -0
- package/lib/archive/index.js +11 -0
- package/lib/archive/page.d.ts +263 -0
- package/lib/archive/page.js +316 -0
- package/lib/archive/resource.d.ts +46 -0
- package/lib/archive/resource.js +62 -0
- package/lib/archive/safe-path.d.ts +9 -0
- package/lib/archive/safe-path.js +17 -0
- package/lib/archive/types.d.ts +210 -0
- package/lib/archive/types.js +1 -0
- package/lib/crawler/clear-destination-cache.d.ts +5 -0
- package/lib/crawler/clear-destination-cache.js +8 -0
- package/lib/crawler/crawler.d.ts +73 -0
- package/lib/crawler/crawler.js +748 -0
- package/lib/crawler/decompose-url.d.ts +25 -0
- package/lib/crawler/decompose-url.js +71 -0
- package/lib/crawler/destination-cache.d.ts +7 -0
- package/lib/crawler/destination-cache.js +6 -0
- package/lib/crawler/detect-pagination-pattern.d.ts +16 -0
- package/lib/crawler/detect-pagination-pattern.js +61 -0
- package/lib/crawler/fetch-destination.d.ts +38 -0
- package/lib/crawler/fetch-destination.js +208 -0
- package/lib/crawler/fetch-robots-txt.d.ts +42 -0
- package/lib/crawler/fetch-robots-txt.js +44 -0
- package/lib/crawler/find-best-matching-scope.d.ts +12 -0
- package/lib/crawler/find-best-matching-scope.js +46 -0
- package/lib/crawler/generate-predicted-urls.d.ts +13 -0
- package/lib/crawler/generate-predicted-urls.js +27 -0
- package/lib/crawler/handle-ignore-and-skip.d.ts +16 -0
- package/lib/crawler/handle-ignore-and-skip.js +19 -0
- package/lib/crawler/handle-resource-response.d.ts +13 -0
- package/lib/crawler/handle-resource-response.js +16 -0
- package/lib/crawler/handle-scrape-end.d.ts +24 -0
- package/lib/crawler/handle-scrape-end.js +82 -0
- package/lib/crawler/handle-scrape-error.d.ts +37 -0
- package/lib/crawler/handle-scrape-error.js +38 -0
- package/lib/crawler/index.d.ts +2 -0
- package/lib/crawler/index.js +2 -0
- package/lib/crawler/inject-scope-auth.d.ts +11 -0
- package/lib/crawler/inject-scope-auth.js +21 -0
- package/lib/crawler/is-external-url.d.ts +11 -0
- package/lib/crawler/is-external-url.js +12 -0
- package/lib/crawler/is-in-any-lower-layer.d.ts +13 -0
- package/lib/crawler/is-in-any-lower-layer.js +15 -0
- package/lib/crawler/link-list.d.ts +112 -0
- package/lib/crawler/link-list.js +248 -0
- package/lib/crawler/link-to-page-data.d.ts +14 -0
- package/lib/crawler/link-to-page-data.js +32 -0
- package/lib/crawler/net-timeout-error.d.ts +9 -0
- package/lib/crawler/net-timeout-error.js +11 -0
- package/lib/crawler/network.d.ts +30 -0
- package/lib/crawler/network.js +226 -0
- package/lib/crawler/protocol-agnostic-key.d.ts +9 -0
- package/lib/crawler/protocol-agnostic-key.js +11 -0
- package/lib/crawler/reconstruct-url.d.ts +10 -0
- package/lib/crawler/reconstruct-url.js +28 -0
- package/lib/crawler/result-handler.d.ts +118 -0
- package/lib/crawler/result-handler.js +153 -0
- package/lib/crawler/robots-checker.d.ts +26 -0
- package/lib/crawler/robots-checker.js +62 -0
- package/lib/crawler/should-discard-predicted.d.ts +14 -0
- package/lib/crawler/should-discard-predicted.js +31 -0
- package/lib/crawler/should-skip-url.d.ts +23 -0
- package/lib/crawler/should-skip-url.js +15 -0
- package/lib/crawler/speculative-pagination.d.ts +52 -0
- package/lib/crawler/speculative-pagination.js +215 -0
- package/lib/crawler/types.d.ts +119 -0
- package/lib/crawler/types.js +1 -0
- package/lib/crawler/url-filter.d.ts +56 -0
- package/lib/crawler/url-filter.js +110 -0
- package/lib/crawler-orchestrator.d.ts +142 -0
- package/lib/crawler-orchestrator.js +309 -0
- package/lib/debug.d.ts +8 -0
- package/lib/debug.js +9 -0
- package/lib/index.d.ts +16 -0
- package/lib/index.js +18 -0
- package/lib/qzilla.d.ts +136 -0
- package/lib/qzilla.js +292 -0
- package/lib/types.d.ts +27 -0
- package/lib/types.js +1 -0
- package/lib/utils/array/each-splitted.d.ts +10 -0
- package/lib/utils/array/each-splitted.js +14 -0
- package/lib/utils/array/index.d.ts +1 -0
- package/lib/utils/array/index.js +1 -0
- package/lib/utils/async/index.d.ts +1 -0
- package/lib/utils/async/index.js +1 -0
- package/lib/utils/debug.d.ts +5 -0
- package/lib/utils/debug.js +5 -0
- package/lib/utils/error/dom-evaluation-error.d.ts +7 -0
- package/lib/utils/error/dom-evaluation-error.js +7 -0
- package/lib/utils/error/error-emitter.d.ts +18 -0
- package/lib/utils/error/error-emitter.js +29 -0
- package/lib/utils/error/index.d.ts +3 -0
- package/lib/utils/error/index.js +2 -0
- package/lib/utils/event-emitter/index.d.ts +6 -0
- package/lib/utils/event-emitter/index.js +6 -0
- package/lib/utils/index.d.ts +5 -0
- package/lib/utils/index.js +5 -0
- package/lib/utils/network/index.d.ts +1 -0
- package/lib/utils/network/index.js +1 -0
- package/lib/utils/object/clean-object.d.ts +8 -0
- package/lib/utils/object/clean-object.js +13 -0
- package/lib/utils/object/index.d.ts +1 -0
- package/lib/utils/object/index.js +1 -0
- package/lib/utils/path/index.d.ts +1 -0
- package/lib/utils/path/index.js +1 -0
- package/lib/utils/path/safe-filepath.d.ts +7 -0
- package/lib/utils/path/safe-filepath.js +12 -0
- package/lib/utils/regexp/index.d.ts +1 -0
- package/lib/utils/regexp/index.js +1 -0
- package/lib/utils/retryable/index.d.ts +2 -0
- package/lib/utils/retryable/index.js +1 -0
- package/lib/utils/sort/index.d.ts +14 -0
- package/lib/utils/sort/index.js +61 -0
- package/lib/utils/sort/remove-matches.d.ts +9 -0
- package/lib/utils/sort/remove-matches.js +23 -0
- package/lib/utils/types/index.d.ts +1 -0
- package/lib/utils/types/index.js +1 -0
- package/lib/utils/types/types.d.ts +46 -0
- package/lib/utils/types/types.js +1 -0
- package/lib/utils/url/index.d.ts +5 -0
- package/lib/utils/url/index.js +5 -0
- package/lib/utils/url/is-lower-layer.d.ts +15 -0
- package/lib/utils/url/is-lower-layer.js +55 -0
- package/lib/utils/url/parse-url.d.ts +11 -0
- package/lib/utils/url/parse-url.js +20 -0
- package/lib/utils/url/path-match.d.ts +11 -0
- package/lib/utils/url/path-match.js +18 -0
- package/lib/utils/url/sort-url.d.ts +10 -0
- package/lib/utils/url/sort-url.js +24 -0
- package/lib/utils/url/url-partial-match.d.ts +11 -0
- package/lib/utils/url/url-partial-match.js +32 -0
- package/package.json +49 -0
- package/src/archive/__mock__/.gitignore +3 -0
- package/src/archive/__mock__/mock.sqlite +0 -0
- package/src/archive/archive-accessor.ts +337 -0
- package/src/archive/archive.ts +408 -0
- package/src/archive/database.spec.ts +469 -0
- package/src/archive/database.ts +1059 -0
- package/src/archive/debug.ts +10 -0
- package/src/archive/filesystem/append-text.spec.ts +26 -0
- package/src/archive/filesystem/append-text.ts +16 -0
- package/src/archive/filesystem/copy-dir-sync.spec.ts +27 -0
- package/src/archive/filesystem/copy-dir-sync.ts +10 -0
- package/src/archive/filesystem/copy-dir.spec.ts +33 -0
- package/src/archive/filesystem/copy-dir.ts +14 -0
- package/src/archive/filesystem/exists.spec.ts +33 -0
- package/src/archive/filesystem/exists.ts +10 -0
- package/src/archive/filesystem/get-file-list.spec.ts +37 -0
- package/src/archive/filesystem/get-file-list.ts +13 -0
- package/src/archive/filesystem/index.ts +17 -0
- package/src/archive/filesystem/is-dir.spec.ts +29 -0
- package/src/archive/filesystem/is-dir.ts +11 -0
- package/src/archive/filesystem/mkdir.spec.ts +37 -0
- package/src/archive/filesystem/mkdir.ts +16 -0
- package/src/archive/filesystem/output-json.spec.ts +34 -0
- package/src/archive/filesystem/output-json.ts +16 -0
- package/src/archive/filesystem/output-text.spec.ts +31 -0
- package/src/archive/filesystem/output-text.ts +35 -0
- package/src/archive/filesystem/read-json.spec.ts +26 -0
- package/src/archive/filesystem/read-json.ts +12 -0
- package/src/archive/filesystem/read-text.spec.ts +25 -0
- package/src/archive/filesystem/read-text.ts +11 -0
- package/src/archive/filesystem/readline.spec.ts +29 -0
- package/src/archive/filesystem/readline.ts +30 -0
- package/src/archive/filesystem/remove.spec.ts +34 -0
- package/src/archive/filesystem/remove.ts +11 -0
- package/src/archive/filesystem/rename.spec.ts +46 -0
- package/src/archive/filesystem/rename.ts +21 -0
- package/src/archive/filesystem/tar.spec.ts +33 -0
- package/src/archive/filesystem/tar.ts +27 -0
- package/src/archive/filesystem/untar.spec.ts +34 -0
- package/src/archive/filesystem/untar.ts +36 -0
- package/src/archive/index.ts +13 -0
- package/src/archive/page.spec.ts +368 -0
- package/src/archive/page.ts +420 -0
- package/src/archive/resource.spec.ts +101 -0
- package/src/archive/resource.ts +73 -0
- package/src/archive/safe-path.spec.ts +44 -0
- package/src/archive/safe-path.ts +18 -0
- package/src/archive/types.ts +227 -0
- package/src/crawler/clear-destination-cache.spec.ts +20 -0
- package/src/crawler/clear-destination-cache.ts +9 -0
- package/src/crawler/crawler.ts +873 -0
- package/src/crawler/decompose-url.spec.ts +48 -0
- package/src/crawler/decompose-url.ts +90 -0
- package/src/crawler/destination-cache.spec.ts +23 -0
- package/src/crawler/destination-cache.ts +8 -0
- package/src/crawler/detect-pagination-pattern.spec.ts +169 -0
- package/src/crawler/detect-pagination-pattern.ts +66 -0
- package/src/crawler/fetch-destination.ts +257 -0
- package/src/crawler/fetch-robots-txt.spec.ts +83 -0
- package/src/crawler/fetch-robots-txt.ts +91 -0
- package/src/crawler/find-best-matching-scope.spec.ts +39 -0
- package/src/crawler/find-best-matching-scope.ts +57 -0
- package/src/crawler/generate-predicted-urls.spec.ts +42 -0
- package/src/crawler/generate-predicted-urls.ts +34 -0
- package/src/crawler/handle-ignore-and-skip.spec.ts +66 -0
- package/src/crawler/handle-ignore-and-skip.ts +30 -0
- package/src/crawler/handle-resource-response.spec.ts +45 -0
- package/src/crawler/handle-resource-response.ts +21 -0
- package/src/crawler/handle-scrape-end.spec.ts +109 -0
- package/src/crawler/handle-scrape-end.ts +115 -0
- package/src/crawler/handle-scrape-error.spec.ts +105 -0
- package/src/crawler/handle-scrape-error.ts +58 -0
- package/src/crawler/index.ts +2 -0
- package/src/crawler/inject-scope-auth.spec.ts +36 -0
- package/src/crawler/inject-scope-auth.ts +27 -0
- package/src/crawler/is-external-url.spec.ts +31 -0
- package/src/crawler/is-external-url.ts +17 -0
- package/src/crawler/is-in-any-lower-layer.spec.ts +31 -0
- package/src/crawler/is-in-any-lower-layer.ts +22 -0
- package/src/crawler/link-list.spec.ts +355 -0
- package/src/crawler/link-list.ts +275 -0
- package/src/crawler/link-to-page-data.spec.ts +133 -0
- package/src/crawler/link-to-page-data.ts +34 -0
- package/src/crawler/net-timeout-error.spec.ts +25 -0
- package/src/crawler/net-timeout-error.ts +11 -0
- package/src/crawler/protocol-agnostic-key.spec.ts +40 -0
- package/src/crawler/protocol-agnostic-key.ts +11 -0
- package/src/crawler/reconstruct-url.spec.ts +37 -0
- package/src/crawler/reconstruct-url.ts +37 -0
- package/src/crawler/robots-checker.spec.ts +104 -0
- package/src/crawler/robots-checker.ts +73 -0
- package/src/crawler/should-discard-predicted.spec.ts +125 -0
- package/src/crawler/should-discard-predicted.ts +33 -0
- package/src/crawler/should-skip-url.spec.ts +77 -0
- package/src/crawler/should-skip-url.ts +37 -0
- package/src/crawler/types.ts +146 -0
- package/src/crawler-orchestrator.ts +401 -0
- package/src/debug.ts +10 -0
- package/src/index.ts +25 -0
- package/src/types.ts +30 -0
- package/src/utils/array/each-splitted.spec.ts +38 -0
- package/src/utils/array/each-splitted.ts +19 -0
- package/src/utils/array/index.ts +1 -0
- package/src/utils/debug.ts +6 -0
- package/src/utils/error/dom-evaluation-error.spec.ts +20 -0
- package/src/utils/error/dom-evaluation-error.ts +6 -0
- package/src/utils/error/error-emitter.spec.ts +78 -0
- package/src/utils/error/error-emitter.ts +44 -0
- package/src/utils/error/index.ts +3 -0
- package/src/utils/index.ts +5 -0
- package/src/utils/object/clean-object.spec.ts +24 -0
- package/src/utils/object/clean-object.ts +13 -0
- package/src/utils/object/index.ts +1 -0
- package/src/utils/types/index.ts +1 -0
- package/src/utils/types/types.ts +65 -0
- package/tsconfig.json +11 -0
- package/tsconfig.tsbuildinfo +1 -0
|
@@ -0,0 +1,248 @@
|
|
|
1
|
+
import { isError } from '@d-zero/beholder';
|
|
2
|
+
import { isLowerLayer } from '@d-zero/shared/is-lower-layer';
|
|
3
|
+
import { tryParseUrl as parseUrl } from '@d-zero/shared/parse-url';
|
|
4
|
+
import { protocolAgnosticKey } from './protocol-agnostic-key.js';
|
|
5
|
+
/**
 * Manages the queue of URLs discovered during crawling.
 *
 * Tracks URLs across three states: pending (queued but not started),
 * in-progress (currently being scraped), and done (scraping completed).
 * Provides deduplication based on `withoutHashAndAuth` normalization
 * and tracks page completion counts for progress reporting.
 */
export default class LinkList {
    // Count of finished internal HTML pages (see `isPage` below).
    #pageTally = 0;
    // Keys whose scraping has completed.
    #finished = new Set();
    // Keys flagged for title-only (metadata) scraping.
    #titleOnly = new Set();
    // Keys queued but not yet started.
    #queued = new Set();
    // Keys added as speculative pagination guesses.
    #guessed = new Set();
    // Keys currently being scraped.
    #active = new Set();
    /**
     * The number of successfully completed internal HTML pages.
     *
     * Only counts pages that are internal, in a lower layer, use HTTP(S),
     * have no error status, and have `text/html` content type.
     */
    get completePages() {
        return this.#pageTally;
    }
    /**
     * Add a URL to the pending queue if it has not been seen before.
     *
     * Deduplication is based on the URL's `withoutHashAndAuth` representation.
     * If the URL is already pending, in progress, or done, this is a no-op.
     * @param linkUrl - The parsed URL to add to the queue.
     * @param options - Optional flags for the URL.
     * @param options.metadataOnly - If `true`, marks this URL for title-only scraping
     * (metadata extraction without full page processing).
     * @param options.predicted - If `true`, marks this URL as a predicted pagination guess
     * that should be discarded if it returns a 4xx/5xx status.
     */
    add(linkUrl, options) {
        const key = protocolAgnosticKey(linkUrl.withoutHashAndAuth);
        const alreadySeen =
            this.#queued.has(key) || this.#active.has(key) || this.#finished.has(key);
        if (alreadySeen) {
            return;
        }
        this.#queued.add(key);
        if (options?.metadataOnly) {
            this.#titleOnly.add(key);
        }
        if (options?.predicted) {
            this.#guessed.add(key);
        }
    }
    /**
     * Mark a URL as completed and record its scrape result.
     *
     * Moves the URL from pending/progress to done, constructs a {@link Link}
     * object with scope and layer information, and increments the page counter
     * if the result qualifies as a valid HTML page.
     * @param url - The URL that has been scraped.
     * @param scope - The current scope map (hostname to scope URLs).
     * @param resource - The scrape result containing page data and/or error information.
     * @param resource.page - The scraped page data, if the scrape succeeded.
     * @param resource.error - The error object, if the scrape failed.
     * @param options - URL parsing options (e.g., `disableQueries`).
     * @returns The constructed {@link Link} object, or `null` if the URL was not in the queue.
     */
    done(url, scope, resource, options) {
        const key = protocolAgnosticKey(url.withoutHashAndAuth);
        const known = this.#queued.has(key) || this.#active.has(key);
        if (!known) {
            return null;
        }
        this.#queued.delete(key);
        this.#active.delete(key);
        const linkUrl = parseUrl(url, options);
        if (!linkUrl) {
            return null;
        }
        const sameScopes = scope.get(linkUrl.hostname);
        const link = {
            url: linkUrl,
            isLowerLayer:
                sameScopes?.some((scopeUrl) => isLowerLayer(linkUrl.href, scopeUrl.href, options)) ??
                false,
            // No scope entry for this hostname means the URL is external.
            isExternal: !sameScopes,
        };
        // Every key marked done by this call: the URL itself plus any redirect hops.
        const finishedKeys = new Set([key]);
        if (resource.page) {
            const page = resource.page;
            link.dest = {
                redirectPaths: page.redirectPaths,
                status: page.status,
                statusText: page.statusText,
                contentType: page.contentType,
                contentLength: page.contentLength,
                responseHeaders: page.responseHeaders,
                title: page.meta.title,
            };
            for (const redirected of page.redirectPaths) {
                finishedKeys.add(protocolAgnosticKey(redirected));
            }
        }
        if (resource.error?.message.includes('ERR_NAME_NOT_RESOLVED')) {
            // DNS resolution failed: synthesize an error destination with sentinel status -1.
            link.dest = {
                redirectPaths: [],
                status: -1,
                statusText: resource.error.message,
                contentType: null,
                contentLength: null,
                responseHeaders: {},
            };
        }
        const countsAsPage = isPage(link);
        for (const finishedKey of finishedKeys) {
            this.#finished.add(finishedKey);
            if (countsAsPage) {
                this.#pageTally += 1;
            }
        }
        return link;
    }
    /**
     * Get the current pending and in-progress URL lists.
     * @returns An object containing arrays of pending and in-progress URL strings.
     */
    getLinks() {
        return {
            /** URLs queued but not yet started. */
            pending: Array.from(this.#queued),
            /** URLs currently being scraped. */
            progress: Array.from(this.#active),
        };
    }
    /**
     * Get a summary of crawl progress counts.
     * @returns An object with total/completed counts for both all links and pages only.
     */
    getPageCount() {
        const { pending, progress } = this.getLinks();
        const unfinishedCount = pending.length + progress.length;
        return {
            /** Total number of discovered links (pending + progress + done). */
            totalLinks: unfinishedCount + this.#finished.size,
            /** Number of links that have been fully processed. */
            completedLinks: this.#finished.size,
            /** Total number of discovered pages (pending + progress + completed pages). */
            totalPages: unfinishedCount + this.#pageTally,
            /** Number of pages that have been successfully scraped. */
            completedPages: this.#pageTally,
        };
    }
    /**
     * Check whether a URL is flagged for title-only scraping.
     *
     * Title-only scraping extracts only the page title and basic metadata,
     * without processing anchors or capturing the full HTML.
     * @param urlWithoutHashAndAuth - The normalized URL string (without hash and auth) to check.
     * @returns `true` if the URL should be scraped in title-only mode.
     */
    isMetadataOnly(urlWithoutHashAndAuth) {
        return this.#titleOnly.has(protocolAgnosticKey(urlWithoutHashAndAuth));
    }
    /**
     * Check whether a URL was added as a predicted pagination URL.
     * @param urlWithoutHashAndAuth - The normalized URL string (without hash and auth) to check.
     * @returns `true` if the URL was added with the predicted flag.
     */
    isPredicted(urlWithoutHashAndAuth) {
        return this.#guessed.has(protocolAgnosticKey(urlWithoutHashAndAuth));
    }
    /**
     * Transition a URL from the pending state to the in-progress state.
     *
     * This should be called when scraping of the URL actually begins.
     * If the URL is not in the pending set, this is a no-op.
     * @param url - The URL that is now being actively scraped.
     */
    progress(url) {
        const key = protocolAgnosticKey(url.withoutHashAndAuth);
        // Set#delete returns false when the key was absent, so a URL that is
        // not pending is left untouched (same no-op behavior as before).
        if (this.#queued.delete(key)) {
            this.#active.add(key);
        }
    }
    /**
     * Restore the link list state from a previous crawl session.
     *
     * Re-adds pending URLs to the queue and marks previously done URLs
     * as completed, enabling the crawler to resume from where it left off.
     * @param pending - URLs that were pending in the previous session.
     * @param done - URLs that were already completed in the previous session.
     * @param options - URL parsing options for re-parsing the pending URLs.
     * @returns The parsed pending URLs that were successfully added to the queue.
     */
    resume(pending, done, options) {
        for (const finishedUrl of done) {
            this.#finished.add(protocolAgnosticKey(finishedUrl));
        }
        const requeued = [];
        for (const pendingUrl of pending) {
            const parsed = parseUrl(pendingUrl, options);
            if (parsed) {
                this.add(parsed);
                requeued.push(parsed);
            }
        }
        return requeued;
    }
}
|
|
216
|
+
/**
 * Determine whether a link represents a valid internal HTML page.
 *
 * A link qualifies as a "page" only when all of the following hold:
 * - it is internal (not external),
 * - it sits in a lower layer of the scope,
 * - its protocol is HTTP or HTTPS,
 * - destination data exists and carries a non-error status,
 * - the content type is `text/html`.
 * @param link - The link to evaluate.
 * @returns `true` if the link represents a valid internal HTML page.
 */
function isPage(link) {
    if (link.isExternal || !link.isLowerLayer) {
        return false;
    }
    if (!/^https?:$/.test(link.url.protocol)) {
        return false;
    }
    const dest = link.dest;
    if (!dest || isError(dest.status)) {
        return false;
    }
    return dest.contentType === 'text/html';
}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import type { Link, PageData } from '../utils/index.js';
/**
 * Convert a {@link Link} object into a {@link PageData} structure.
 *
 * Creates a minimal PageData from the link's destination metadata. This is used
 * when a full scrape is not performed (e.g., for external pages when
 * `fetchExternal` is disabled, or when a scrape error produces a fallback result).
 *
 * Missing destination fields are filled with sensible defaults (e.g., status -1
 * for unknown, empty arrays for anchors/images, empty string for HTML).
 * Because no scrape took place, `anchorList`, `imageList`, and `html` are
 * always returned empty, and `isSkipped` is always `false`.
 * @param link - The link to convert, containing URL and optional destination metadata.
 * @returns A PageData object populated from the link's available data.
 */
export declare function linkToPageData(link: Link): PageData;
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
/**
 * Convert a {@link Link} object into a {@link PageData} structure.
 *
 * Creates a minimal PageData from the link's destination metadata. This is used
 * when a full scrape is not performed (e.g., for external pages when
 * `fetchExternal` is disabled, or when a scrape error produces a fallback result).
 *
 * Missing destination fields are filled with sensible defaults (e.g., status -1
 * for unknown, empty arrays for anchors/images, empty string for HTML).
 * @param link - The link to convert, containing URL and optional destination metadata.
 * @returns A PageData object populated from the link's available data.
 */
export function linkToPageData(link) {
    const dest = link.dest;
    return {
        url: link.url,
        // `??` (not `||`) so legitimate falsy values survive: a zero
        // Content-Length or an empty HTTP/2 reason phrase must not be
        // replaced by the "unknown" fallbacks.
        redirectPaths: dest?.redirectPaths ?? [],
        isTarget: !link.isExternal,
        isExternal: link.isExternal,
        status: dest?.status ?? -1,
        statusText: dest?.statusText ?? 'UnknownError',
        contentType: dest?.contentType ?? null,
        contentLength: dest?.contentLength ?? null,
        responseHeaders: dest?.responseHeaders ?? null,
        meta: {
            title: dest?.title ?? '',
        },
        // No scrape was performed, so anchor/image/html data is empty.
        anchorList: [],
        imageList: [],
        html: '',
        isSkipped: false,
    };
}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
/**
 * Error thrown when a network request (typically an HTTP HEAD check)
 * exceeds the allowed timeout duration. Used by `fetchDestination`
 * to signal that the destination server did not respond in time.
 * @remarks The message is `"Timeout: <url>"` when a URL is given,
 * or `"Timeout"` otherwise; `name` is always `"NetTimeoutError"`.
 */
export default class NetTimeoutError extends Error {
    /**
     * @param url - The URL whose request timed out; included in the error message when provided.
     */
    constructor(url?: string);
    /** Always `"NetTimeoutError"`. */
    name: string;
}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/**
 * Error raised when a network request (typically an HTTP HEAD check)
 * does not finish within the allowed timeout window. `fetchDestination`
 * throws this to signal that the destination server failed to answer in time.
 */
export default class NetTimeoutError extends Error {
    name = 'NetTimeoutError';
    /**
     * @param url - The URL whose request timed out; included in the message when provided.
     */
    constructor(url) {
        const message = url ? `Timeout: ${url}` : 'Timeout';
        super(message);
    }
}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import type { ExURL } from '@d-zero/shared/parse-url';
import type { PageData } from '@nitpicker/beholder';
/**
 * Clears the in-memory cache of HTTP request results.
 * Should be called between crawl sessions to prevent memory leaks.
 * @remarks The cache also retains errors (which are rethrown on cache hits),
 * so clearing it additionally allows previously failed URLs to be retried.
 */
export declare function clearDestinationCache(): void;
/**
 * Fetches the destination metadata for a URL using an HTTP HEAD request (or GET as fallback).
 *
 * Results are cached in memory so that repeated calls for the same URL
 * (without hash) return immediately. The request races against a 10-second
 * timeout; if the server does not respond in time, a {@link NetTimeoutError} is thrown.
 *
 * If the server returns 405 (Method Not Allowed), 501 (Not Implemented), or 503
 * (Service Unavailable) for a HEAD request, the function automatically retries with GET.
 * @param url - The extended URL to fetch.
 * @param isExternal - Whether the URL is external to the crawl scope.
 * @param method - The HTTP method to use. Defaults to `"HEAD"`.
 * @param options - Additional options.
 * @param options.titleBytesLimit - When set, forces a GET request and reads up to this many
 * bytes from the response body to extract an HTML `<title>` tag. The connection is
 * destroyed as soon as the limit is reached or a title is found.
 * @returns The page metadata obtained from the HTTP response.
 * @throws {NetTimeoutError} If the request exceeds the 10-second timeout.
 * @throws {Error} If the HTTP request fails for any other reason.
 * @remarks Failures are cached as well: a later call for the same cache key
 * rethrows the stored error instead of issuing a new request.
 */
export declare function fetchDestination(url: ExURL, isExternal: boolean, method?: string, options?: {
    titleBytesLimit?: number;
}): Promise<PageData>;
|
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
import { delay } from '@d-zero/shared/delay';
|
|
2
|
+
import redirects from 'follow-redirects';
|
|
3
|
+
import NetTimeoutError from './net-timeout-error.js';
|
|
4
|
+
/**
 * In-memory cache of HEAD request results keyed by URL (without hash;
 * a `:title` suffix is appended when `titleBytesLimit` is in effect).
 * Stores either the successful {@link PageData} or the {@link Error} so
 * repeated requests to the same destination are answered from memory —
 * cached errors are rethrown on subsequent lookups.
 */
const cacheMap = new Map();
/**
 * Clears the in-memory cache of HTTP request results.
 * Should be called between crawl sessions to prevent memory leaks,
 * and to allow URLs whose failures were cached to be retried.
 */
export function clearDestinationCache() {
    cacheMap.clear();
}
|
|
17
|
+
/**
 * Fetches the destination metadata for a URL using an HTTP HEAD request (or GET as fallback).
 *
 * Results are cached in memory so that repeated calls for the same URL
 * (without hash) return immediately; errors are cached too, so a failed URL
 * rethrows the same error on later calls. The request races against a
 * 10-second timeout; if the server does not respond in time, a
 * {@link NetTimeoutError} is thrown.
 *
 * If the server returns 405 (Method Not Allowed), 501 (Not Implemented), or 503
 * (Service Unavailable) for a HEAD request, the function automatically retries with GET.
 * @param url - The extended URL to fetch.
 * @param isExternal - Whether the URL is external to the crawl scope.
 * @param method - The HTTP method to use. Defaults to `"HEAD"`.
 * @param options - Additional options.
 * @param options.titleBytesLimit - When set, forces a GET request and reads up to this many
 * bytes from the response body to extract an HTML `<title>` tag. The connection is
 * destroyed as soon as the limit is reached or a title is found.
 * @returns The page metadata obtained from the HTTP response.
 * @throws {NetTimeoutError} If the request exceeds the 10-second timeout.
 * @throws {Error} If the HTTP request fails for any other reason.
 */
export async function fetchDestination(url, isExternal, method = 'HEAD', options) {
    const titleBytesLimit = options?.titleBytesLimit;
    // Title-extraction results carry extra data, so they get a separate cache slot.
    const cacheKey = titleBytesLimit == null ? url.withoutHash : `${url.withoutHash}:title`;
    if (cacheMap.has(cacheKey)) {
        const cache = cacheMap.get(cacheKey);
        if (cache instanceof Error) {
            throw cache;
        }
        return cache;
    }
    // Title extraction needs a response body, so it always uses GET.
    const effectiveMethod = titleBytesLimit == null ? method : 'GET';
    // Race the request against a CANCELABLE 10-second timeout. The previous
    // implementation awaited an uncancelable delay, so every request left a
    // live timer running for the full 10 seconds even after the fetch
    // settled, holding the event loop open and accumulating timers at scale.
    let timeoutId;
    const timeout = new Promise((resolve) => {
        timeoutId = setTimeout(() => resolve(new NetTimeoutError(url.href)), 10 * 1000);
    });
    const result = await Promise.race([
        // Normalize rejections into Error values so the race always resolves.
        _fetchHead(url, isExternal, effectiveMethod, titleBytesLimit).catch((error) => (error instanceof Error ? error : new Error(String(error)))),
        timeout,
    ]).finally(() => {
        clearTimeout(timeoutId);
    });
    // Cache the outcome (success or failure) before deciding whether to throw.
    cacheMap.set(cacheKey, result);
    if (result instanceof Error) {
        throw result;
    }
    return result;
}
|
|
61
|
+
/**
 * Performs the actual HTTP request to retrieve page metadata.
 *
 * Handles both HTTP and HTTPS protocols via `follow-redirects`, tracks redirect chains,
 * and falls back to GET on certain status codes (405, 501, 503).
 * @param url - The extended URL to request.
 * @param isExternal - Whether the URL is external to the crawl scope.
 * @param method - The HTTP method (`"HEAD"` or `"GET"`).
 * @param titleBytesLimit - When set, reads up to this many bytes from the response body
 * to extract a `<title>` tag, then destroys the connection.
 * @returns A promise resolving to {@link PageData} with response metadata.
 */
async function _fetchHead(url, isExternal, method, titleBytesLimit) {
    return new Promise((resolve, reject) => {
        // Explicit Host header, including the port when one is present.
        const hostHeader = url.port ? `${url.hostname}:${url.port}` : url.hostname;
        const request = {
            protocol: url.protocol,
            hostname: url.hostname,
            port: url.port || undefined,
            // NOTE(review): only `pathname` is sent — any query string on the
            // URL is not part of the request path. Confirm this is intentional.
            path: url.pathname,
            method,
            // Browser-like headers so the server responds as it would to a real client.
            headers: {
                host: hostHeader,
                Connection: 'keep-alive',
                Pragma: 'no-cache',
                'Cache-Control': 'no-cache',
                'Upgrade-Insecure-Requests': 1,
                Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
                'Accept-Encoding': 'gzip, deflate',
                'Accept-Language': 'ja,en;q=0.9,zh;q=0.8,en-US;q=0.7,pl;q=0.6,de;q=0.5,zh-CN;q=0.4,zh-TW;q=0.3,th;q=0.2,ko;q=0.1,fr;q=0.1',
                // Range: url.extname?.toLowerCase() === 'pdf' ? 'bytes=0-0' : undefined,
            },
        };
        // Pass along basic-auth credentials embedded in the URL, if any.
        if (url.username && url.password) {
            request.auth = `${url.username}:${url.password}`;
        }
        let req;
        // Set just before an intentional req.destroy() so the resulting socket
        // error can be told apart from a genuine network failure (see 'error' handler).
        let destroyed = false;
        const response = (res) => {
            // Accumulated body chunks (only used in title-extraction mode).
            const chunks = [];
            let totalBytes = 0;
            // Guards against resolving more than once across 'data'/'end' events.
            let settled = false;
            // Builds the PageData result from the final (post-redirect) response.
            const buildPageData = (title) => {
                const redirectPaths = res.redirects.map((r) => r.url);
                // Normalize a missing or non-numeric Content-Length header to null.
                const _contentLength = Number.parseInt(res.headers['content-length'] || '');
                const contentLength = Number.isFinite(_contentLength) ? _contentLength : null;
                return {
                    url,
                    isTarget: !isExternal,
                    isExternal,
                    redirectPaths,
                    status: res.statusCode || 0,
                    statusText: res.statusMessage || '',
                    // Strip parameters, e.g. "text/html; charset=utf-8" -> "text/html".
                    contentType: res.headers['content-type']?.split(';')[0] || null,
                    contentLength,
                    responseHeaders: res.headers,
                    meta: { title },
                    imageList: [],
                    anchorList: [],
                    html: '',
                    isSkipped: false,
                };
            };
            if (titleBytesLimit == null) {
                // Metadata-only mode: drain the body (no-op) so 'end' fires.
                res.on('data', () => { });
                res.on('end', async () => {
                    let rep = buildPageData('');
                    // 405 Method Not Allowed: retry once with GET via the cached
                    // entry point (unless this request was already a GET).
                    if (rep.status === 405) {
                        if (method === 'GET') {
                            reject(new Error(`Method Not Allowed: ${url.href} ${rep.statusText}`));
                            return;
                        }
                        try {
                            rep = await fetchDestination(url, isExternal, 'GET');
                        }
                        catch (error) {
                            reject(error);
                            return;
                        }
                    }
                    // 501 Not Implemented: back off 5 seconds, then retry with GET.
                    if (rep.status === 501) {
                        if (method === 'GET') {
                            reject(new Error(`Method Not Implemented: ${url.href} ${rep.statusText}`));
                            return;
                        }
                        await delay(5 * 1000);
                        try {
                            rep = await fetchDestination(url, isExternal, 'GET');
                        }
                        catch (error) {
                            reject(error);
                            return;
                        }
                    }
                    // 503 Service Unavailable: back off 5 seconds, then retry with GET.
                    if (rep.status === 503) {
                        if (method === 'GET') {
                            reject(new Error(`Retrying failed: ${url.href} ${rep.statusText}`));
                            return;
                        }
                        await delay(5 * 1000);
                        try {
                            rep = await fetchDestination(url, isExternal, 'GET');
                        }
                        catch (error) {
                            reject(error);
                            return;
                        }
                    }
                    resolve(rep);
                });
            }
            else {
                // Title-extraction mode: stream the body until a <title> is found
                // or titleBytesLimit bytes have been read, then destroy the socket.
                res.on('data', (chunk) => {
                    if (settled)
                        return;
                    chunks.push(chunk);
                    totalBytes += chunk.length;
                    // Check for title in accumulated data so far
                    const body = Buffer.concat(chunks).toString('utf8');
                    const titleMatch = /<title[^>]*>([\s\S]*?)<\/title>/i.exec(body);
                    if (titleMatch) {
                        settled = true;
                        const title = titleMatch[1]?.trim() ?? '';
                        resolve(buildPageData(title));
                        destroyed = true;
                        req.destroy();
                        return;
                    }
                    // Reached byte limit without finding title
                    if (totalBytes >= titleBytesLimit) {
                        settled = true;
                        resolve(buildPageData(''));
                        destroyed = true;
                        req.destroy();
                    }
                });
                res.on('end', () => {
                    if (settled)
                        return;
                    settled = true;
                    // Stream ended before limit — try to extract title from what we have
                    const body = Buffer.concat(chunks).toString('utf8');
                    const titleMatch = /<title[^>]*>([\s\S]*?)<\/title>/i.exec(body);
                    const title = titleMatch?.[1]?.trim() ?? '';
                    resolve(buildPageData(title));
                });
            }
        };
        if (url.protocol === 'https:') {
            // Self-signed / invalid certificates are tolerated during crawling.
            req = redirects.https.request({
                ...request,
                rejectUnauthorized: false,
            }, response);
        }
        else {
            req = redirects.http.request(request, response);
        }
        req.on('error', (error) => {
            // Ignore errors caused by intentional req.destroy()
            if (destroyed)
                return;
            reject(error);
        });
        req.end();
    });
}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
/**
 * Returns a URL string with the protocol prefix (`http:` / `https:`) stripped.
 *
 * Used as a deduplication key so that HTTP and HTTPS variants of the
 * same URL are treated as identical during crawling.
 *
 * @remarks
 * Only a leading, lowercase `http:` or `https:` is removed; strings with
 * any other scheme are returned unchanged.
 * @param url - A URL string (e.g. `"https://example.com/page"`)
 * @returns The URL without its protocol prefix (e.g. `"//example.com/page"`)
 */
export declare function protocolAgnosticKey(url: string): string;
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/**
 * Strips the scheme (`http:` or `https:`) from the front of a URL string.
 *
 * The remainder (e.g. `"//example.com/page"`) serves as a deduplication key,
 * so the HTTP and HTTPS variants of one URL collapse to the same entry
 * during crawling. Strings starting with any other scheme come back unchanged.
 * @param url - A URL string (e.g. `"https://example.com/page"`)
 * @returns The URL without its protocol prefix (e.g. `"//example.com/page"`)
 */
export function protocolAgnosticKey(url) {
    for (const scheme of ['https:', 'http:']) {
        if (url.startsWith(scheme)) {
            return url.slice(scheme.length);
        }
    }
    return url;
}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import type { DecomposedUrl } from './decompose-url.js';
|
|
2
|
+
/**
 * Reconstructs a URL string from a decomposed representation with one
 * token replaced at the specified index.
 *
 * Tokens are indexed with path segments first, then query values: an index
 * below `pathSegments.length` replaces a path segment; any higher index
 * replaces the query value at `tokenIndex - pathSegments.length`.
 * @param decomposed - The decomposed URL to reconstruct
 * @param tokenIndex - Index in the combined token array (path segments + query values)
 * @param newValue - The replacement value for the token at `tokenIndex`
 * @returns The reconstructed URL string
 */
export declare function reconstructUrl(decomposed: DecomposedUrl, tokenIndex: number, newValue: string): string;
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
/**
 * Rebuilds a URL string from its decomposed parts, substituting a single
 * token identified by `tokenIndex`.
 *
 * The token index spans path segments first, then query values: indices
 * below `pathSegments.length` address a path segment, the rest address
 * the query value at `tokenIndex - pathSegments.length`.
 * @param decomposed - The decomposed URL to reconstruct
 * @param tokenIndex - Index in the combined token array (path segments + query values)
 * @param newValue - The replacement value for the token at `tokenIndex`
 * @returns The reconstructed URL string
 */
export function reconstructUrl(decomposed, tokenIndex, newValue) {
    const { host, pathSegments, queryKeys, queryValues, protocol } = decomposed;
    // Work on copies so the caller's decomposed object is never mutated.
    const segments = pathSegments.slice();
    const values = queryValues.slice();
    const queryIndex = tokenIndex - segments.length;
    if (queryIndex < 0) {
        segments[tokenIndex] = newValue;
    }
    else {
        values[queryIndex] = newValue;
    }
    const pathPart = segments.length > 0 ? `/${segments.join('/')}` : '';
    let queryPart = '';
    if (queryKeys.length > 0) {
        const pairs = queryKeys.map((key, i) => `${encodeURIComponent(key)}=${encodeURIComponent(values[i] ?? '')}`);
        queryPart = `?${pairs.join('&')}`;
    }
    return `${protocol}//${host}${pathPart}${queryPart}`;
}
|