@nitpicker/crawler 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +8 -0
- package/LICENSE +191 -0
- package/README.md +13 -0
- package/lib/archive/archive-accessor.d.ts +107 -0
- package/lib/archive/archive-accessor.js +264 -0
- package/lib/archive/archive.d.ts +174 -0
- package/lib/archive/archive.js +331 -0
- package/lib/archive/database.d.ts +207 -0
- package/lib/archive/database.js +972 -0
- package/lib/archive/debug.d.ts +8 -0
- package/lib/archive/debug.js +9 -0
- package/lib/archive/filesystem/append-text.d.ts +9 -0
- package/lib/archive/filesystem/append-text.js +14 -0
- package/lib/archive/filesystem/copy-dir-sync.d.ts +6 -0
- package/lib/archive/filesystem/copy-dir-sync.js +9 -0
- package/lib/archive/filesystem/copy-dir.d.ts +7 -0
- package/lib/archive/filesystem/copy-dir.js +13 -0
- package/lib/archive/filesystem/exists.d.ts +6 -0
- package/lib/archive/filesystem/exists.js +9 -0
- package/lib/archive/filesystem/get-file-list.d.ts +8 -0
- package/lib/archive/filesystem/get-file-list.js +12 -0
- package/lib/archive/filesystem/index.d.ts +17 -0
- package/lib/archive/filesystem/index.js +17 -0
- package/lib/archive/filesystem/is-dir.d.ts +6 -0
- package/lib/archive/filesystem/is-dir.js +10 -0
- package/lib/archive/filesystem/mkdir.d.ts +8 -0
- package/lib/archive/filesystem/mkdir.js +15 -0
- package/lib/archive/filesystem/output-json.d.ts +9 -0
- package/lib/archive/filesystem/output-json.js +14 -0
- package/lib/archive/filesystem/output-text.d.ts +11 -0
- package/lib/archive/filesystem/output-text.js +32 -0
- package/lib/archive/filesystem/read-json.d.ts +7 -0
- package/lib/archive/filesystem/read-json.js +11 -0
- package/lib/archive/filesystem/read-text.d.ts +6 -0
- package/lib/archive/filesystem/read-text.js +10 -0
- package/lib/archive/filesystem/readline.d.ts +11 -0
- package/lib/archive/filesystem/readline.js +26 -0
- package/lib/archive/filesystem/remove.d.ts +5 -0
- package/lib/archive/filesystem/remove.js +10 -0
- package/lib/archive/filesystem/rename.d.ts +11 -0
- package/lib/archive/filesystem/rename.js +18 -0
- package/lib/archive/filesystem/tar.d.ts +11 -0
- package/lib/archive/filesystem/tar.js +22 -0
- package/lib/archive/filesystem/untar.d.ts +20 -0
- package/lib/archive/filesystem/untar.js +24 -0
- package/lib/archive/filesystem/utils.d.ts +109 -0
- package/lib/archive/filesystem/utils.js +185 -0
- package/lib/archive/filesystem/zip.d.ts +29 -0
- package/lib/archive/filesystem/zip.js +53 -0
- package/lib/archive/index.d.ts +6 -0
- package/lib/archive/index.js +11 -0
- package/lib/archive/page.d.ts +263 -0
- package/lib/archive/page.js +316 -0
- package/lib/archive/resource.d.ts +46 -0
- package/lib/archive/resource.js +62 -0
- package/lib/archive/safe-path.d.ts +9 -0
- package/lib/archive/safe-path.js +17 -0
- package/lib/archive/types.d.ts +210 -0
- package/lib/archive/types.js +1 -0
- package/lib/crawler/clear-destination-cache.d.ts +5 -0
- package/lib/crawler/clear-destination-cache.js +8 -0
- package/lib/crawler/crawler.d.ts +73 -0
- package/lib/crawler/crawler.js +748 -0
- package/lib/crawler/decompose-url.d.ts +25 -0
- package/lib/crawler/decompose-url.js +71 -0
- package/lib/crawler/destination-cache.d.ts +7 -0
- package/lib/crawler/destination-cache.js +6 -0
- package/lib/crawler/detect-pagination-pattern.d.ts +16 -0
- package/lib/crawler/detect-pagination-pattern.js +61 -0
- package/lib/crawler/fetch-destination.d.ts +38 -0
- package/lib/crawler/fetch-destination.js +208 -0
- package/lib/crawler/fetch-robots-txt.d.ts +42 -0
- package/lib/crawler/fetch-robots-txt.js +44 -0
- package/lib/crawler/find-best-matching-scope.d.ts +12 -0
- package/lib/crawler/find-best-matching-scope.js +46 -0
- package/lib/crawler/generate-predicted-urls.d.ts +13 -0
- package/lib/crawler/generate-predicted-urls.js +27 -0
- package/lib/crawler/handle-ignore-and-skip.d.ts +16 -0
- package/lib/crawler/handle-ignore-and-skip.js +19 -0
- package/lib/crawler/handle-resource-response.d.ts +13 -0
- package/lib/crawler/handle-resource-response.js +16 -0
- package/lib/crawler/handle-scrape-end.d.ts +24 -0
- package/lib/crawler/handle-scrape-end.js +82 -0
- package/lib/crawler/handle-scrape-error.d.ts +37 -0
- package/lib/crawler/handle-scrape-error.js +38 -0
- package/lib/crawler/index.d.ts +2 -0
- package/lib/crawler/index.js +2 -0
- package/lib/crawler/inject-scope-auth.d.ts +11 -0
- package/lib/crawler/inject-scope-auth.js +21 -0
- package/lib/crawler/is-external-url.d.ts +11 -0
- package/lib/crawler/is-external-url.js +12 -0
- package/lib/crawler/is-in-any-lower-layer.d.ts +13 -0
- package/lib/crawler/is-in-any-lower-layer.js +15 -0
- package/lib/crawler/link-list.d.ts +112 -0
- package/lib/crawler/link-list.js +248 -0
- package/lib/crawler/link-to-page-data.d.ts +14 -0
- package/lib/crawler/link-to-page-data.js +32 -0
- package/lib/crawler/net-timeout-error.d.ts +9 -0
- package/lib/crawler/net-timeout-error.js +11 -0
- package/lib/crawler/network.d.ts +30 -0
- package/lib/crawler/network.js +226 -0
- package/lib/crawler/protocol-agnostic-key.d.ts +9 -0
- package/lib/crawler/protocol-agnostic-key.js +11 -0
- package/lib/crawler/reconstruct-url.d.ts +10 -0
- package/lib/crawler/reconstruct-url.js +28 -0
- package/lib/crawler/result-handler.d.ts +118 -0
- package/lib/crawler/result-handler.js +153 -0
- package/lib/crawler/robots-checker.d.ts +26 -0
- package/lib/crawler/robots-checker.js +62 -0
- package/lib/crawler/should-discard-predicted.d.ts +14 -0
- package/lib/crawler/should-discard-predicted.js +31 -0
- package/lib/crawler/should-skip-url.d.ts +23 -0
- package/lib/crawler/should-skip-url.js +15 -0
- package/lib/crawler/speculative-pagination.d.ts +52 -0
- package/lib/crawler/speculative-pagination.js +215 -0
- package/lib/crawler/types.d.ts +119 -0
- package/lib/crawler/types.js +1 -0
- package/lib/crawler/url-filter.d.ts +56 -0
- package/lib/crawler/url-filter.js +110 -0
- package/lib/crawler-orchestrator.d.ts +142 -0
- package/lib/crawler-orchestrator.js +309 -0
- package/lib/debug.d.ts +8 -0
- package/lib/debug.js +9 -0
- package/lib/index.d.ts +16 -0
- package/lib/index.js +18 -0
- package/lib/qzilla.d.ts +136 -0
- package/lib/qzilla.js +292 -0
- package/lib/types.d.ts +27 -0
- package/lib/types.js +1 -0
- package/lib/utils/array/each-splitted.d.ts +10 -0
- package/lib/utils/array/each-splitted.js +14 -0
- package/lib/utils/array/index.d.ts +1 -0
- package/lib/utils/array/index.js +1 -0
- package/lib/utils/async/index.d.ts +1 -0
- package/lib/utils/async/index.js +1 -0
- package/lib/utils/debug.d.ts +5 -0
- package/lib/utils/debug.js +5 -0
- package/lib/utils/error/dom-evaluation-error.d.ts +7 -0
- package/lib/utils/error/dom-evaluation-error.js +7 -0
- package/lib/utils/error/error-emitter.d.ts +18 -0
- package/lib/utils/error/error-emitter.js +29 -0
- package/lib/utils/error/index.d.ts +3 -0
- package/lib/utils/error/index.js +2 -0
- package/lib/utils/event-emitter/index.d.ts +6 -0
- package/lib/utils/event-emitter/index.js +6 -0
- package/lib/utils/index.d.ts +5 -0
- package/lib/utils/index.js +5 -0
- package/lib/utils/network/index.d.ts +1 -0
- package/lib/utils/network/index.js +1 -0
- package/lib/utils/object/clean-object.d.ts +8 -0
- package/lib/utils/object/clean-object.js +13 -0
- package/lib/utils/object/index.d.ts +1 -0
- package/lib/utils/object/index.js +1 -0
- package/lib/utils/path/index.d.ts +1 -0
- package/lib/utils/path/index.js +1 -0
- package/lib/utils/path/safe-filepath.d.ts +7 -0
- package/lib/utils/path/safe-filepath.js +12 -0
- package/lib/utils/regexp/index.d.ts +1 -0
- package/lib/utils/regexp/index.js +1 -0
- package/lib/utils/retryable/index.d.ts +2 -0
- package/lib/utils/retryable/index.js +1 -0
- package/lib/utils/sort/index.d.ts +14 -0
- package/lib/utils/sort/index.js +61 -0
- package/lib/utils/sort/remove-matches.d.ts +9 -0
- package/lib/utils/sort/remove-matches.js +23 -0
- package/lib/utils/types/index.d.ts +1 -0
- package/lib/utils/types/index.js +1 -0
- package/lib/utils/types/types.d.ts +46 -0
- package/lib/utils/types/types.js +1 -0
- package/lib/utils/url/index.d.ts +5 -0
- package/lib/utils/url/index.js +5 -0
- package/lib/utils/url/is-lower-layer.d.ts +15 -0
- package/lib/utils/url/is-lower-layer.js +55 -0
- package/lib/utils/url/parse-url.d.ts +11 -0
- package/lib/utils/url/parse-url.js +20 -0
- package/lib/utils/url/path-match.d.ts +11 -0
- package/lib/utils/url/path-match.js +18 -0
- package/lib/utils/url/sort-url.d.ts +10 -0
- package/lib/utils/url/sort-url.js +24 -0
- package/lib/utils/url/url-partial-match.d.ts +11 -0
- package/lib/utils/url/url-partial-match.js +32 -0
- package/package.json +49 -0
- package/src/archive/__mock__/.gitignore +3 -0
- package/src/archive/__mock__/mock.sqlite +0 -0
- package/src/archive/archive-accessor.ts +337 -0
- package/src/archive/archive.ts +408 -0
- package/src/archive/database.spec.ts +469 -0
- package/src/archive/database.ts +1059 -0
- package/src/archive/debug.ts +10 -0
- package/src/archive/filesystem/append-text.spec.ts +26 -0
- package/src/archive/filesystem/append-text.ts +16 -0
- package/src/archive/filesystem/copy-dir-sync.spec.ts +27 -0
- package/src/archive/filesystem/copy-dir-sync.ts +10 -0
- package/src/archive/filesystem/copy-dir.spec.ts +33 -0
- package/src/archive/filesystem/copy-dir.ts +14 -0
- package/src/archive/filesystem/exists.spec.ts +33 -0
- package/src/archive/filesystem/exists.ts +10 -0
- package/src/archive/filesystem/get-file-list.spec.ts +37 -0
- package/src/archive/filesystem/get-file-list.ts +13 -0
- package/src/archive/filesystem/index.ts +17 -0
- package/src/archive/filesystem/is-dir.spec.ts +29 -0
- package/src/archive/filesystem/is-dir.ts +11 -0
- package/src/archive/filesystem/mkdir.spec.ts +37 -0
- package/src/archive/filesystem/mkdir.ts +16 -0
- package/src/archive/filesystem/output-json.spec.ts +34 -0
- package/src/archive/filesystem/output-json.ts +16 -0
- package/src/archive/filesystem/output-text.spec.ts +31 -0
- package/src/archive/filesystem/output-text.ts +35 -0
- package/src/archive/filesystem/read-json.spec.ts +26 -0
- package/src/archive/filesystem/read-json.ts +12 -0
- package/src/archive/filesystem/read-text.spec.ts +25 -0
- package/src/archive/filesystem/read-text.ts +11 -0
- package/src/archive/filesystem/readline.spec.ts +29 -0
- package/src/archive/filesystem/readline.ts +30 -0
- package/src/archive/filesystem/remove.spec.ts +34 -0
- package/src/archive/filesystem/remove.ts +11 -0
- package/src/archive/filesystem/rename.spec.ts +46 -0
- package/src/archive/filesystem/rename.ts +21 -0
- package/src/archive/filesystem/tar.spec.ts +33 -0
- package/src/archive/filesystem/tar.ts +27 -0
- package/src/archive/filesystem/untar.spec.ts +34 -0
- package/src/archive/filesystem/untar.ts +36 -0
- package/src/archive/index.ts +13 -0
- package/src/archive/page.spec.ts +368 -0
- package/src/archive/page.ts +420 -0
- package/src/archive/resource.spec.ts +101 -0
- package/src/archive/resource.ts +73 -0
- package/src/archive/safe-path.spec.ts +44 -0
- package/src/archive/safe-path.ts +18 -0
- package/src/archive/types.ts +227 -0
- package/src/crawler/clear-destination-cache.spec.ts +20 -0
- package/src/crawler/clear-destination-cache.ts +9 -0
- package/src/crawler/crawler.ts +873 -0
- package/src/crawler/decompose-url.spec.ts +48 -0
- package/src/crawler/decompose-url.ts +90 -0
- package/src/crawler/destination-cache.spec.ts +23 -0
- package/src/crawler/destination-cache.ts +8 -0
- package/src/crawler/detect-pagination-pattern.spec.ts +169 -0
- package/src/crawler/detect-pagination-pattern.ts +66 -0
- package/src/crawler/fetch-destination.ts +257 -0
- package/src/crawler/fetch-robots-txt.spec.ts +83 -0
- package/src/crawler/fetch-robots-txt.ts +91 -0
- package/src/crawler/find-best-matching-scope.spec.ts +39 -0
- package/src/crawler/find-best-matching-scope.ts +57 -0
- package/src/crawler/generate-predicted-urls.spec.ts +42 -0
- package/src/crawler/generate-predicted-urls.ts +34 -0
- package/src/crawler/handle-ignore-and-skip.spec.ts +66 -0
- package/src/crawler/handle-ignore-and-skip.ts +30 -0
- package/src/crawler/handle-resource-response.spec.ts +45 -0
- package/src/crawler/handle-resource-response.ts +21 -0
- package/src/crawler/handle-scrape-end.spec.ts +109 -0
- package/src/crawler/handle-scrape-end.ts +115 -0
- package/src/crawler/handle-scrape-error.spec.ts +105 -0
- package/src/crawler/handle-scrape-error.ts +58 -0
- package/src/crawler/index.ts +2 -0
- package/src/crawler/inject-scope-auth.spec.ts +36 -0
- package/src/crawler/inject-scope-auth.ts +27 -0
- package/src/crawler/is-external-url.spec.ts +31 -0
- package/src/crawler/is-external-url.ts +17 -0
- package/src/crawler/is-in-any-lower-layer.spec.ts +31 -0
- package/src/crawler/is-in-any-lower-layer.ts +22 -0
- package/src/crawler/link-list.spec.ts +355 -0
- package/src/crawler/link-list.ts +275 -0
- package/src/crawler/link-to-page-data.spec.ts +133 -0
- package/src/crawler/link-to-page-data.ts +34 -0
- package/src/crawler/net-timeout-error.spec.ts +25 -0
- package/src/crawler/net-timeout-error.ts +11 -0
- package/src/crawler/protocol-agnostic-key.spec.ts +40 -0
- package/src/crawler/protocol-agnostic-key.ts +11 -0
- package/src/crawler/reconstruct-url.spec.ts +37 -0
- package/src/crawler/reconstruct-url.ts +37 -0
- package/src/crawler/robots-checker.spec.ts +104 -0
- package/src/crawler/robots-checker.ts +73 -0
- package/src/crawler/should-discard-predicted.spec.ts +125 -0
- package/src/crawler/should-discard-predicted.ts +33 -0
- package/src/crawler/should-skip-url.spec.ts +77 -0
- package/src/crawler/should-skip-url.ts +37 -0
- package/src/crawler/types.ts +146 -0
- package/src/crawler-orchestrator.ts +401 -0
- package/src/debug.ts +10 -0
- package/src/index.ts +25 -0
- package/src/types.ts +30 -0
- package/src/utils/array/each-splitted.spec.ts +38 -0
- package/src/utils/array/each-splitted.ts +19 -0
- package/src/utils/array/index.ts +1 -0
- package/src/utils/debug.ts +6 -0
- package/src/utils/error/dom-evaluation-error.spec.ts +20 -0
- package/src/utils/error/dom-evaluation-error.ts +6 -0
- package/src/utils/error/error-emitter.spec.ts +78 -0
- package/src/utils/error/error-emitter.ts +44 -0
- package/src/utils/error/index.ts +3 -0
- package/src/utils/index.ts +5 -0
- package/src/utils/object/clean-object.spec.ts +24 -0
- package/src/utils/object/clean-object.ts +13 -0
- package/src/utils/object/index.ts +1 -0
- package/src/utils/types/index.ts +1 -0
- package/src/utils/types/types.ts +65 -0
- package/tsconfig.json +11 -0
- package/tsconfig.tsbuildinfo +1 -0
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
import { isError } from '@nitpicker/beholder';
|
|
2
|
+
/**
 * Compares two consecutive URL strings and detects a single-token numeric
 * pagination pattern (e.g. `/page/1` → `/page/2`, or `?p=1` → `?p=2`).
 *
 * The algorithm decomposes each URL into tokens (path segments + sorted query values),
 * then checks that exactly one token differs and both values are plain decimal
 * integers with a positive step. Returns `null` when no pattern is detected.
 *
 * WHY single-token constraint: Multi-token differences (e.g. both path and query
 * changing) indicate different routes rather than pagination, so they are rejected.
 * @param prevUrl - The previously pushed URL (protocol-agnostic, without hash/auth)
 * @param currentUrl - The newly discovered URL
 * @returns The detected pattern, or `null` if no pagination pattern was found
 */
export function detectPaginationPattern(prevUrl, currentUrl) {
    const prev = decomposeUrl(prevUrl);
    const curr = decomposeUrl(currentUrl);
    if (!prev || !curr)
        return null;
    // Host (including port) must match
    if (prev.host !== curr.host)
        return null;
    // Path segment count must match
    if (prev.pathSegments.length !== curr.pathSegments.length)
        return null;
    // Query key sets must match in count and identity
    if (prev.queryKeys.length !== curr.queryKeys.length)
        return null;
    for (let i = 0; i < prev.queryKeys.length; i++) {
        if (prev.queryKeys[i] !== curr.queryKeys[i])
            return null;
    }
    // Build combined token arrays: path segments + query values (sorted by key)
    const prevTokens = [...prev.pathSegments, ...prev.queryValues];
    const currTokens = [...curr.pathSegments, ...curr.queryValues];
    let diffIndex = -1;
    for (const [i, prevToken] of prevTokens.entries()) {
        if (prevToken !== currTokens[i]) {
            if (diffIndex !== -1)
                return null; // more than one difference
            diffIndex = i;
        }
    }
    if (diffIndex === -1)
        return null; // identical URLs
    // FIX: require plain decimal-digit tokens. `Number('')` is 0 and `Number`
    // also accepts hex ('0x10') and exponent ('1e3') forms, which previously let
    // empty segments or exotic tokens masquerade as page numbers (e.g.
    // `/page/` → `/page/1` was detected as a step-1 pattern). Digit-only tokens
    // are always finite integers, so the old isFinite/isInteger checks are implied.
    const DECIMAL_TOKEN = /^\d+$/;
    if (!DECIMAL_TOKEN.test(prevTokens[diffIndex]) || !DECIMAL_TOKEN.test(currTokens[diffIndex]))
        return null;
    const prevNum = Number(prevTokens[diffIndex]);
    const currNum = Number(currTokens[diffIndex]);
    const step = currNum - prevNum;
    // Pagination only moves forward; zero/negative steps are not extrapolated.
    if (step <= 0)
        return null;
    return {
        tokenIndex: diffIndex,
        step,
        currentNumber: currNum,
    };
}
|
|
62
|
+
/**
 * Generates speculative URLs by extrapolating a detected pagination pattern.
 *
 * Starting from `currentUrl`, the pattern's step is applied `count` times to
 * produce future page URLs (e.g. step=1, currentNumber=2 → page 3, 4, ...).
 * These URLs enter the crawl queue and are discarded later if they 404.
 * @param pattern - The detected pagination pattern from {@link detectPaginationPattern}
 * @param currentUrl - The URL to extrapolate from (protocol-agnostic, without hash/auth)
 * @param count - Number of speculative URLs to generate (typically equals concurrency)
 * @returns Array of speculative URL strings
 */
export function generateSpeculativeUrls(pattern, currentUrl, count) {
    // A non-positive count or an unparsable URL both yield nothing.
    const decomposed = count > 0 ? decomposeUrl(currentUrl) : null;
    if (!decomposed)
        return [];
    return Array.from({ length: count }, (_, offset) => {
        const pageNumber = pattern.currentNumber + pattern.step * (offset + 1);
        return reconstructUrl(decomposed, pattern.tokenIndex, String(pageNumber));
    });
}
|
|
87
|
+
/**
 * Determines whether a speculative URL's scrape result should be discarded.
 *
 * Speculative URLs are pre-emptively pushed into the crawl queue before
 * knowing if they exist. This function filters out invalid results:
 * - `error` type → discard (server unreachable, timeout, etc.)
 * - `ignoreAndSkip` type → discard (matched exclusion rule)
 * - `scrapeEnd` with HTTP error status (4xx/5xx) → discard
 * - `scrapeEnd` with 2xx/3xx → keep
 * @param result - The scrape result for the speculative URL
 * @returns `true` if the result should be discarded (not saved to archive)
 */
export function shouldDiscardSpeculative(result) {
    // Only a completed scrape with page data can ever be kept.
    if (result.type !== 'scrapeEnd') {
        // 'error', 'ignoreAndSkip', and any unrecognized type are all discarded.
        return true;
    }
    const page = result.pageData;
    if (!page) {
        return true;
    }
    // Keep only responses whose status is not an HTTP error.
    return isError(page.status);
}
|
|
117
|
+
/**
 * Decomposes a URL string into its constituent tokens for comparison.
 * Handles both full URLs (`https://host/path?q=v`) and protocol-agnostic
 * URLs (`//host/path?q=v`). Query parameters are sorted by key for
 * consistent comparison.
 * @param url - The URL string to decompose
 * @returns The decomposed URL, or `null` if the format is invalid
 */
function decomposeUrl(url) {
    // URL format: //host/path?query or //host?query (protocol-agnostic)
    // Also handle protocol://host/path?query
    let work = url;
    // Strip protocol; group 1 is 'http:'/'https:' or undefined for '//host…'.
    const protoMatch = /^(https?:)?\/\//.exec(work);
    if (!protoMatch)
        return null;
    const protocol = protoMatch[1] ?? '';
    work = work.slice(protoMatch[0].length);
    // Split host from rest at whichever delimiter appears FIRST.
    const slashIdx = work.indexOf('/');
    const qmarkIdx = work.indexOf('?');
    let host;
    let pathPart;
    let queryPart;
    // FIX: when '?' precedes the first '/' (e.g. `//host?next=/a/b`), the '/'
    // belongs to the query value. The previous code split on slashIdx whenever
    // one existed, corrupting the host into `host?next=`.
    if (qmarkIdx !== -1 && (slashIdx === -1 || qmarkIdx < slashIdx)) {
        // Host followed directly by a query (no path).
        host = work.slice(0, qmarkIdx);
        pathPart = '';
        queryPart = work.slice(qmarkIdx + 1);
    }
    else if (slashIdx !== -1) {
        // Host followed by a path, optionally with a query.
        host = work.slice(0, slashIdx);
        const pathAndQuery = work.slice(slashIdx + 1);
        const pq = pathAndQuery.indexOf('?');
        if (pq === -1) {
            pathPart = pathAndQuery;
            queryPart = '';
        }
        else {
            pathPart = pathAndQuery.slice(0, pq);
            queryPart = pathAndQuery.slice(pq + 1);
        }
    }
    else {
        // Bare host: no path, no query.
        host = work;
        pathPart = '';
        queryPart = '';
    }
    const pathSegments = pathPart ? pathPart.split('/') : [];
    // Parse query into key-value pairs; sorted by key below so that
    // `?a=1&b=2` and `?b=2&a=1` compare equal.
    const queryPairs = [];
    if (queryPart) {
        for (const pair of queryPart.split('&')) {
            const eqIdx = pair.indexOf('=');
            if (eqIdx === -1) {
                // Valueless parameter (`?flag`) keeps an empty value.
                queryPairs.push([pair, '']);
            }
            else {
                queryPairs.push([pair.slice(0, eqIdx), pair.slice(eqIdx + 1)]);
            }
        }
    }
    queryPairs.sort((a, b) => a[0].localeCompare(b[0]));
    return {
        host,
        pathSegments,
        queryKeys: queryPairs.map(([k]) => k),
        queryValues: queryPairs.map(([, v]) => v),
        protocol,
    };
}
|
|
188
|
+
/**
 * Reconstructs a URL string from a decomposed representation with one
 * token replaced at the specified index.
 * @param decomposed - The decomposed URL to reconstruct
 * @param tokenIndex - Index in the combined token array (path segments + query values)
 * @param newValue - The replacement value for the token at `tokenIndex`
 * @returns The reconstructed URL string
 */
function reconstructUrl(decomposed, tokenIndex, newValue) {
    const { host, pathSegments, queryKeys, queryValues, protocol } = decomposed;
    // Copy before replacing so the caller's decomposed object is untouched.
    const segments = [...pathSegments];
    const values = [...queryValues];
    if (tokenIndex < segments.length) {
        // Token lives in the path portion.
        segments[tokenIndex] = newValue;
    }
    else {
        // Token lives in the query-value portion; shift past the path segments.
        values[tokenIndex - segments.length] = newValue;
    }
    const path = segments.length > 0 ? `/${segments.join('/')}` : '';
    const query = queryKeys.length > 0
        ? `?${queryKeys.map((key, i) => `${key}=${values[i]}`).join('&')}`
        : '';
    return `${protocol}//${host}${path}${query}`;
}
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
import type { PageData, CrawlerError, Resource } from '../utils/index.js';
|
|
2
|
+
import type { ChangePhaseEvent } from '@d-zero/beholder';
|
|
3
|
+
import type { ParseURLOptions } from '@d-zero/shared/parse-url';
|
|
4
|
+
/**
 * Configuration options that control crawler behavior.
 *
 * Used by the result handler functions to determine how to process
 * scrape results, which URLs to follow, and how to handle external links.
 *
 * Extends `ParseURLOptions` with `disableQueries` made required, so URL
 * parsing throughout the crawler agrees on query handling.
 * @see {@link ./crawler.ts | Crawler} for the main consumer of this type
 * @see {@link ../crawler-orchestrator.ts | CrawlerOrchestrator} for factory methods that build these options
 */
export interface CrawlerOptions extends Required<Pick<ParseURLOptions, 'disableQueries'>> {
    /** Delay in milliseconds between page requests. */
    interval: number;
    /** Maximum number of concurrent scraping processes. 0 uses the default. */
    parallels: number;
    /** Whether to recursively follow discovered links within the scope. */
    recursive: boolean;
    /** Whether the crawl was started from a pre-defined URL list. */
    fromList: boolean;
    /** Whether to capture image resources during scraping. */
    captureImages: boolean;
    /** Path to the Chromium/Chrome executable, or `null` for the bundled version. */
    executablePath: string | null;
    /** Whether to fetch and scrape external (out-of-scope) pages. */
    fetchExternal: boolean;
    /** List of scope URL strings that define the crawl boundary. */
    scope: string[];
    /** Glob patterns for URLs to exclude from crawling. */
    excludes: string[];
    /** Keywords that trigger page exclusion when found in content. */
    excludeKeywords: string[];
    /** URL prefixes to exclude from crawling (merged defaults + user additions). */
    excludeUrls: readonly string[];
    /** Maximum directory depth for excluded paths. */
    maxExcludedDepth: number;
    /** Maximum number of retry attempts per URL on scrape failure. */
    retry: number;
    /** Whether to enable verbose logging. */
    verbose: boolean;
    /** User-Agent string sent with HTTP requests. */
    userAgent: string;
    /** Whether to ignore robots.txt restrictions. */
    ignoreRobots: boolean;
}
|
|
46
|
+
/**
 * Describes a detected pagination pattern between two consecutive URLs.
 *
 * Produced by `detectPaginationPattern` and consumed by
 * `generateSpeculativeUrls` to extrapolate future page URLs.
 */
export interface PaginationPattern {
    /** Index within the combined token array (path segments + query values) where the numeric difference was found. */
    tokenIndex: number;
    /** The numeric increment between consecutive pages (always > 0). */
    step: number;
    /** The number found at `tokenIndex` in the "current" URL; extrapolation starts from here. */
    currentNumber: number;
}
|
|
57
|
+
/**
 * Event map for the `Crawler` class.
 *
 * Each key represents an event name and its value is the payload type
 * passed to listeners subscribed via `on()` or `once()`.
 */
export interface CrawlerEventTypes {
    /**
     * Emitted when a page within the crawl scope has been successfully scraped.
     */
    page: {
        /** The scraped page data including HTML, metadata, anchors, and images. */
        result: PageData;
    };
    /**
     * Emitted when an external page (outside the crawl scope) has been scraped.
     */
    externalPage: {
        /** The scraped page data for the external page. */
        result: PageData;
    };
    /**
     * Emitted when a URL is skipped due to exclusion rules, robots.txt restrictions,
     * or external fetch being disabled.
     */
    skip: {
        /** The URL that was skipped. */
        url: string;
        /** The reason the URL was skipped (e.g., "excluded", "blocked by robots.txt", or a JSON description). */
        reason: string;
        /** Whether the skipped URL is external to the crawl scope. */
        isExternal: boolean;
    };
    /**
     * Emitted when a network resource (CSS, JS, image, etc.) is captured during page scraping.
     */
    response: {
        /** The captured resource data including URL, status, content type, and headers. */
        resource: Resource;
    };
    /**
     * Emitted to record the relationship between a page and a resource it references.
     */
    responseReferrers: {
        /** The URL of the page that references the resource. */
        url: string;
        /** The URL of the referenced resource (without hash). */
        src: string;
    };
    /**
     * Emitted when the entire crawl process has completed or been aborted.
     * The payload carries no fixed shape; consumers should not rely on specific keys.
     */
    crawlEnd: Record<string, unknown>;
    /**
     * Emitted when an error occurs during crawling.
     */
    error: CrawlerError;
    /**
     * Emitted when the scraper transitions between phases of the page scraping lifecycle
     * (e.g., scrapeStart, headRequest, openPage, success).
     */
    changePhase: ChangePhaseEvent;
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
// Intentionally empty export: this file has no runtime values (its source is
// type-only), and `export {}` ensures it is still treated as an ES module.
export {};
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import type { ExURL, ParseURLOptions } from '@d-zero/shared/parse-url';
|
|
2
|
+
/**
 * Determine whether a URL should be skipped during crawling.
 *
 * A URL is skipped if it matches any user-defined exclude glob pattern
 * or starts with any of the excluded URL prefixes.
 * @param url - The parsed URL to check.
 * @param excludes - Array of glob patterns for URLs to exclude.
 * @param excludeUrls - Array of URL prefixes to exclude (matched via `startsWith`).
 * @param options - URL parsing options used for pattern matching.
 * @returns `true` if the URL should be skipped.
 */
export declare function shouldSkipUrl(url: ExURL, excludes: readonly string[], excludeUrls: readonly string[], options: ParseURLOptions): boolean;
/**
 * Determine whether a URL is external to the crawl scope.
 *
 * A URL is considered external if its hostname does not appear
 * as a key in the scope map.
 * @param url - The parsed URL to check.
 * @param scope - Map of hostnames to their scope URLs.
 * @returns `true` if the URL is outside the crawl scope.
 */
export declare function isExternalUrl(url: ExURL, scope: ReadonlyMap<string, readonly ExURL[]>): boolean;
/**
 * Inject authentication credentials from a matching scope URL into the target URL.
 *
 * Finds the best-matching scope URL (deepest path match) for the given URL's
 * hostname and copies its `username` and `password` properties. NOTE: this
 * mutates the `url` parameter in place and returns nothing.
 * @param url - The parsed URL to receive authentication credentials (mutated in place).
 * @param scope - Map of hostnames to their scope URLs.
 */
export declare function injectScopeAuth(url: ExURL, scope: ReadonlyMap<string, readonly ExURL[]>): void;
/**
 * Find the scope URL with the deepest matching path for a given URL.
 *
 * Among all scope URLs sharing the same hostname, returns the one whose
 * path segments are a prefix of the target URL's path segments and which
 * has the greatest depth. Returns `null` if no scope URL matches.
 * @param url - The parsed URL to match against scope URLs.
 * @param scopes - The list of scope URLs to search.
 * @returns The best-matching scope URL, or `null` if none match.
 */
export declare function findBestMatchingScope(url: ExURL, scopes: readonly ExURL[]): ExURL | null;
/**
 * Check whether a URL is in a lower layer (subdirectory) of any scope URL.
 *
 * Tests the URL against each scope URL using the `isLowerLayer` utility,
 * which checks if the URL's path is at the same level or deeper than
 * the scope URL's path.
 * @param url - The parsed URL to check.
 * @param scopes - The list of scope URLs to test against.
 * @param options - URL parsing options used for layer comparison.
 * @returns `true` if the URL is in a lower layer of at least one scope URL.
 */
export declare function isInAnyLowerLayer(url: ExURL, scopes: readonly ExURL[], options: ParseURLOptions): boolean;
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
import { isLowerLayer } from '@d-zero/shared/is-lower-layer';
|
|
2
|
+
import { pathMatch } from '@d-zero/shared/path-match';
|
|
3
|
+
import { protocolAgnosticKey } from './protocol-agnostic-key.js';
|
|
4
|
+
/**
 * Determine whether a URL should be skipped during crawling.
 *
 * A URL is skipped if it matches any user-defined exclude glob pattern
 * or starts with any of the excluded URL prefixes (compared in a
 * protocol-agnostic way, so `http:`/`https:` variants match alike).
 * @param url - The parsed URL to check.
 * @param excludes - Array of glob patterns for URLs to exclude.
 * @param excludeUrls - Array of URL prefixes to exclude (matched via `startsWith`).
 * @param options - URL parsing options used for pattern matching.
 * @returns `true` if the URL should be skipped.
 */
export function shouldSkipUrl(url, excludes, excludeUrls, options) {
    // First check the user-supplied glob patterns.
    const matchesGlob = excludes.some((pattern) => pathMatch(url, pattern, options));
    if (matchesGlob) {
        return true;
    }
    // Then check the prefix-based exclusion list.
    return excludeUrls.some((prefix) => protocolAgnosticKey(url.href).startsWith(protocolAgnosticKey(prefix)));
}
|
|
19
|
+
/**
 * Determine whether a URL is external to the crawl scope.
 *
 * A URL is considered external if its hostname does not appear
 * as a key in the scope map.
 * @param url - The parsed URL to check.
 * @param scope - Map of hostnames to their scope URLs.
 * @returns `true` if the URL is outside the crawl scope.
 */
export function isExternalUrl(url, scope) {
    const { hostname } = url;
    return scope.has(hostname) === false;
}
|
|
31
|
+
/**
 * Inject authentication credentials from a matching scope URL into the target URL.
 *
 * Finds the best-matching scope URL (deepest path match) for the given URL's
 * hostname and copies its `username` and `password` properties. This mutates
 * the `url` parameter in place; it is a no-op when the hostname has no
 * scope entry or no scope URL matches.
 * @param url - The parsed URL to receive authentication credentials (mutated in place).
 * @param scope - Map of hostnames to their scope URLs.
 */
export function injectScopeAuth(url, scope) {
    const candidates = scope.get(url.hostname);
    if (!candidates) {
        return;
    }
    const matched = findBestMatchingScope(url, candidates);
    if (!matched) {
        return;
    }
    url.username = matched.username;
    url.password = matched.password;
}
|
|
51
|
+
/**
 * Find the scope URL with the deepest matching path for a given URL.
 *
 * Among all scope URLs sharing the same hostname, returns the one whose
 * path segments are a prefix of the target URL's path segments and which
 * has the greatest depth. On equal depth the earliest candidate wins.
 * Returns `null` if no scope URL matches.
 * @param url - The parsed URL to match against scope URLs.
 * @param scopes - The list of scope URLs to search.
 * @returns The best-matching scope URL, or `null` if none match.
 */
export function findBestMatchingScope(url, scopes) {
    // Keep only scopes on the same host whose path is a prefix of the URL's path.
    const candidates = scopes.filter(
        (scope) => scope.hostname === url.hostname && isPathMatch(url.paths, scope.paths),
    );
    // Linear scan for the deepest candidate; strict '>' keeps the first on ties,
    // matching the original selection order.
    let best = null;
    for (const candidate of candidates) {
        if (best === null || candidate.depth > best.depth) {
            best = candidate;
        }
    }
    return best;
}
|
|
76
|
+
/**
 * Check whether a target path is equal to or is a descendant of a base path.
 *
 * The target matches when every segment of the base path appears at the
 * same position at the start of the target path (i.e. the base path is a
 * segment-wise prefix of the target path).
 * @param targetPaths - The path segments of the URL being checked.
 * @param basePaths - The path segments of the scope URL to match against.
 * @returns `true` if the target path starts with or equals the base path.
 */
function isPathMatch(targetPaths, basePaths) {
    return (
        basePaths.length <= targetPaths.length &&
        basePaths.every((segment, index) => targetPaths[index] === segment)
    );
}
|
|
97
|
+
/**
 * Check whether a URL is in a lower layer (subdirectory) of any scope URL.
 *
 * Tests the URL against each scope URL using the `isLowerLayer` utility,
 * which checks if the URL's path is at the same level or deeper than
 * the scope URL's path. Short-circuits on the first match.
 * @param url - The parsed URL to check.
 * @param scopes - The list of scope URLs to test against.
 * @param options - URL parsing options used for layer comparison.
 * @returns `true` if the URL is in a lower layer of at least one scope URL.
 */
export function isInAnyLowerLayer(url, scopes, options) {
    for (const scope of scopes) {
        if (isLowerLayer(url.href, scope.href, options)) {
            return true;
        }
    }
    return false;
}
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
import type { Config } from './archive/types.js';
import type { CrawlEvent } from './types.js';
import type { ExURL } from '@d-zero/shared/parse-url';
import { TypedAwaitEventEmitter as EventEmitter } from '@d-zero/shared/typed-await-event-emitter';
import Archive from './archive/archive.js';
/**
 * Default list of external URL prefixes excluded from crawling.
 * Includes social media sharing endpoints that are commonly linked
 * but provide no useful crawl data.
 */
export declare const DEFAULT_EXCLUDED_EXTERNAL_URLS: string[];
/**
 * Configuration options for the CrawlerOrchestrator.
 *
 * Extends the archive {@link Config} with additional runtime settings
 * such as working directory, browser executable path, and output options.
 */
interface CrawlConfig extends Config {
    /** The working directory for output files. Defaults to `process.cwd()`. */
    cwd: string;
    /** Path to a Chromium/Chrome executable for Puppeteer. */
    executablePath: string;
    /** Output file path for the archive. */
    filePath: string;
    /** Whether to capture image resources during crawling. */
    image: boolean;
    /** File-size threshold (in bytes) above which images are excluded. */
    imageFileSizeThreshold: number;
    /** Delay in milliseconds between each page request. */
    interval: number;
    /** Whether the input is a pre-defined URL list (non-recursive mode). */
    list: boolean;
    /** Maximum number of retry attempts per URL on scrape failure. */
    retry: number;
    /** Whether to enable verbose logging output. */
    verbose: boolean;
    /** Custom User-Agent string for HTTP requests. */
    userAgent: string;
    /** Whether to ignore robots.txt restrictions. */
    ignoreRobots: boolean;
}
/**
 * Callback invoked after the CrawlerOrchestrator instance is fully initialized
 * but before crawling begins.
 * @param orchestrator - The initialized CrawlerOrchestrator instance.
 * @param config - The resolved archive configuration.
 */
type CrawlInitializedCallback = (orchestrator: CrawlerOrchestrator, config: Config) => void | Promise<void>;
/**
 * The main entry point for Nitpicker web crawling and archiving.
 *
 * CrawlerOrchestrator orchestrates the full lifecycle of a crawl session: it creates an archive,
 * configures a {@link Crawler}, processes discovered pages and resources, and
 * writes the final archive file. It emits events defined by {@link CrawlEvent}.
 *
 * Instances are created via the static factory methods {@link CrawlerOrchestrator.crawling}
 * or {@link CrawlerOrchestrator.resume}; the constructor is private.
 * @example
 * ```ts
 * const orchestrator = await CrawlerOrchestrator.crawling(['https://example.com'], { recursive: true });
 * await orchestrator.write();
 * ```
 */
export declare class CrawlerOrchestrator extends EventEmitter<CrawlEvent> {
    #private;
    /**
     * The underlying archive instance used for storing crawl results.
     */
    get archive(): Archive;
    private constructor();
    /**
     * Abort the current crawl and archive operations.
     *
     * Delegates to the archive's abort method, which stops all in-progress
     * database writes and cleans up temporary resources. Returns nothing.
     */
    abort(): void;
    /**
     * Execute the crawl for the given list of URLs.
     *
     * Sets up event listeners on the crawler, starts crawling, and resolves
     * when the crawl completes. Discovered pages, external pages, skipped pages,
     * and resources are forwarded to the archive for storage.
     * @param list - The list of parsed URLs to crawl. The first URL is used as the root.
     * @returns A promise that resolves when crawling is complete.
     * @throws {Error} If the URL list is empty.
     */
    crawling(list: ExURL[]): Promise<void>;
    /**
     * Kill any zombie Chromium processes that were not properly cleaned up.
     *
     * Retrieves the list of undead process IDs from the crawler and sends
     * a SIGTERM signal to each one. Chromium is intentionally sent SIGTERM
     * (not SIGKILL) to avoid leaving zombie processes.
     */
    garbageCollect(): void;
    /**
     * Retrieve the list of process IDs for Chromium instances that are
     * still running after crawling has ended.
     *
     * NOTE(review): the declared return type `never[]` means the array can
     * never contain an element — presumably emitted from an implementation
     * that returns a bare `[]` literal. `number[]` would match the documented
     * contract; confirm against the implementation before relying on this.
     * @returns An array of process IDs that should be terminated.
     */
    getUndeadPid(): never[];
    /**
     * Write the archive to its configured file path.
     *
     * Emits `writeFileStart` before writing and `writeFileEnd` after
     * the write completes successfully.
     */
    write(): Promise<void>;
    /**
     * Create a new CrawlerOrchestrator instance and start crawling the given URLs.
     *
     * This is the primary factory method for starting a fresh crawl. It:
     * 1. Parses and sorts the input URLs
     * 2. Creates an archive file
     * 3. Saves the crawl configuration
     * 4. Runs the optional initialized callback
     * 5. Executes the crawl
     * 6. Sorts the archived URLs in natural order
     * @param url - One or more URL strings to crawl.
     * @param options - Optional configuration overrides for the crawl session.
     * @param initializedCallback - Optional callback invoked after initialization but before crawling starts.
     * @returns A promise that resolves to the CrawlerOrchestrator instance after crawling completes.
     * @throws {Error} If the URL list is empty or contains no valid URLs.
     */
    static crawling(url: string[], options?: Partial<CrawlConfig>, initializedCallback?: CrawlInitializedCallback): Promise<CrawlerOrchestrator>;
    /**
     * Resume a previously interrupted crawl from an existing archive file.
     *
     * Restores the crawl state (pending URLs, scraped URLs, and resources)
     * from the archive, merges any option overrides, and continues crawling
     * from where it left off.
     * @param stubPath - Path to the existing archive file to resume from.
     * @param options - Optional configuration overrides to apply on top of the archived config.
     * @param initializedCallback - Optional callback invoked after initialization but before crawling resumes.
     * @returns A promise that resolves to the CrawlerOrchestrator instance after crawling completes.
     * @throws {Error} If the archived URL is invalid.
     */
    static resume(stubPath: string, options?: Partial<CrawlConfig>, initializedCallback?: CrawlInitializedCallback): Promise<CrawlerOrchestrator>;
}
export {};
|