@nitpicker/crawler 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +8 -0
- package/LICENSE +191 -0
- package/README.md +13 -0
- package/lib/archive/archive-accessor.d.ts +107 -0
- package/lib/archive/archive-accessor.js +264 -0
- package/lib/archive/archive.d.ts +174 -0
- package/lib/archive/archive.js +331 -0
- package/lib/archive/database.d.ts +207 -0
- package/lib/archive/database.js +972 -0
- package/lib/archive/debug.d.ts +8 -0
- package/lib/archive/debug.js +9 -0
- package/lib/archive/filesystem/append-text.d.ts +9 -0
- package/lib/archive/filesystem/append-text.js +14 -0
- package/lib/archive/filesystem/copy-dir-sync.d.ts +6 -0
- package/lib/archive/filesystem/copy-dir-sync.js +9 -0
- package/lib/archive/filesystem/copy-dir.d.ts +7 -0
- package/lib/archive/filesystem/copy-dir.js +13 -0
- package/lib/archive/filesystem/exists.d.ts +6 -0
- package/lib/archive/filesystem/exists.js +9 -0
- package/lib/archive/filesystem/get-file-list.d.ts +8 -0
- package/lib/archive/filesystem/get-file-list.js +12 -0
- package/lib/archive/filesystem/index.d.ts +17 -0
- package/lib/archive/filesystem/index.js +17 -0
- package/lib/archive/filesystem/is-dir.d.ts +6 -0
- package/lib/archive/filesystem/is-dir.js +10 -0
- package/lib/archive/filesystem/mkdir.d.ts +8 -0
- package/lib/archive/filesystem/mkdir.js +15 -0
- package/lib/archive/filesystem/output-json.d.ts +9 -0
- package/lib/archive/filesystem/output-json.js +14 -0
- package/lib/archive/filesystem/output-text.d.ts +11 -0
- package/lib/archive/filesystem/output-text.js +32 -0
- package/lib/archive/filesystem/read-json.d.ts +7 -0
- package/lib/archive/filesystem/read-json.js +11 -0
- package/lib/archive/filesystem/read-text.d.ts +6 -0
- package/lib/archive/filesystem/read-text.js +10 -0
- package/lib/archive/filesystem/readline.d.ts +11 -0
- package/lib/archive/filesystem/readline.js +26 -0
- package/lib/archive/filesystem/remove.d.ts +5 -0
- package/lib/archive/filesystem/remove.js +10 -0
- package/lib/archive/filesystem/rename.d.ts +11 -0
- package/lib/archive/filesystem/rename.js +18 -0
- package/lib/archive/filesystem/tar.d.ts +11 -0
- package/lib/archive/filesystem/tar.js +22 -0
- package/lib/archive/filesystem/untar.d.ts +20 -0
- package/lib/archive/filesystem/untar.js +24 -0
- package/lib/archive/filesystem/utils.d.ts +109 -0
- package/lib/archive/filesystem/utils.js +185 -0
- package/lib/archive/filesystem/zip.d.ts +29 -0
- package/lib/archive/filesystem/zip.js +53 -0
- package/lib/archive/index.d.ts +6 -0
- package/lib/archive/index.js +11 -0
- package/lib/archive/page.d.ts +263 -0
- package/lib/archive/page.js +316 -0
- package/lib/archive/resource.d.ts +46 -0
- package/lib/archive/resource.js +62 -0
- package/lib/archive/safe-path.d.ts +9 -0
- package/lib/archive/safe-path.js +17 -0
- package/lib/archive/types.d.ts +210 -0
- package/lib/archive/types.js +1 -0
- package/lib/crawler/clear-destination-cache.d.ts +5 -0
- package/lib/crawler/clear-destination-cache.js +8 -0
- package/lib/crawler/crawler.d.ts +73 -0
- package/lib/crawler/crawler.js +748 -0
- package/lib/crawler/decompose-url.d.ts +25 -0
- package/lib/crawler/decompose-url.js +71 -0
- package/lib/crawler/destination-cache.d.ts +7 -0
- package/lib/crawler/destination-cache.js +6 -0
- package/lib/crawler/detect-pagination-pattern.d.ts +16 -0
- package/lib/crawler/detect-pagination-pattern.js +61 -0
- package/lib/crawler/fetch-destination.d.ts +38 -0
- package/lib/crawler/fetch-destination.js +208 -0
- package/lib/crawler/fetch-robots-txt.d.ts +42 -0
- package/lib/crawler/fetch-robots-txt.js +44 -0
- package/lib/crawler/find-best-matching-scope.d.ts +12 -0
- package/lib/crawler/find-best-matching-scope.js +46 -0
- package/lib/crawler/generate-predicted-urls.d.ts +13 -0
- package/lib/crawler/generate-predicted-urls.js +27 -0
- package/lib/crawler/handle-ignore-and-skip.d.ts +16 -0
- package/lib/crawler/handle-ignore-and-skip.js +19 -0
- package/lib/crawler/handle-resource-response.d.ts +13 -0
- package/lib/crawler/handle-resource-response.js +16 -0
- package/lib/crawler/handle-scrape-end.d.ts +24 -0
- package/lib/crawler/handle-scrape-end.js +82 -0
- package/lib/crawler/handle-scrape-error.d.ts +37 -0
- package/lib/crawler/handle-scrape-error.js +38 -0
- package/lib/crawler/index.d.ts +2 -0
- package/lib/crawler/index.js +2 -0
- package/lib/crawler/inject-scope-auth.d.ts +11 -0
- package/lib/crawler/inject-scope-auth.js +21 -0
- package/lib/crawler/is-external-url.d.ts +11 -0
- package/lib/crawler/is-external-url.js +12 -0
- package/lib/crawler/is-in-any-lower-layer.d.ts +13 -0
- package/lib/crawler/is-in-any-lower-layer.js +15 -0
- package/lib/crawler/link-list.d.ts +112 -0
- package/lib/crawler/link-list.js +248 -0
- package/lib/crawler/link-to-page-data.d.ts +14 -0
- package/lib/crawler/link-to-page-data.js +32 -0
- package/lib/crawler/net-timeout-error.d.ts +9 -0
- package/lib/crawler/net-timeout-error.js +11 -0
- package/lib/crawler/network.d.ts +30 -0
- package/lib/crawler/network.js +226 -0
- package/lib/crawler/protocol-agnostic-key.d.ts +9 -0
- package/lib/crawler/protocol-agnostic-key.js +11 -0
- package/lib/crawler/reconstruct-url.d.ts +10 -0
- package/lib/crawler/reconstruct-url.js +28 -0
- package/lib/crawler/result-handler.d.ts +118 -0
- package/lib/crawler/result-handler.js +153 -0
- package/lib/crawler/robots-checker.d.ts +26 -0
- package/lib/crawler/robots-checker.js +62 -0
- package/lib/crawler/should-discard-predicted.d.ts +14 -0
- package/lib/crawler/should-discard-predicted.js +31 -0
- package/lib/crawler/should-skip-url.d.ts +23 -0
- package/lib/crawler/should-skip-url.js +15 -0
- package/lib/crawler/speculative-pagination.d.ts +52 -0
- package/lib/crawler/speculative-pagination.js +215 -0
- package/lib/crawler/types.d.ts +119 -0
- package/lib/crawler/types.js +1 -0
- package/lib/crawler/url-filter.d.ts +56 -0
- package/lib/crawler/url-filter.js +110 -0
- package/lib/crawler-orchestrator.d.ts +142 -0
- package/lib/crawler-orchestrator.js +309 -0
- package/lib/debug.d.ts +8 -0
- package/lib/debug.js +9 -0
- package/lib/index.d.ts +16 -0
- package/lib/index.js +18 -0
- package/lib/qzilla.d.ts +136 -0
- package/lib/qzilla.js +292 -0
- package/lib/types.d.ts +27 -0
- package/lib/types.js +1 -0
- package/lib/utils/array/each-splitted.d.ts +10 -0
- package/lib/utils/array/each-splitted.js +14 -0
- package/lib/utils/array/index.d.ts +1 -0
- package/lib/utils/array/index.js +1 -0
- package/lib/utils/async/index.d.ts +1 -0
- package/lib/utils/async/index.js +1 -0
- package/lib/utils/debug.d.ts +5 -0
- package/lib/utils/debug.js +5 -0
- package/lib/utils/error/dom-evaluation-error.d.ts +7 -0
- package/lib/utils/error/dom-evaluation-error.js +7 -0
- package/lib/utils/error/error-emitter.d.ts +18 -0
- package/lib/utils/error/error-emitter.js +29 -0
- package/lib/utils/error/index.d.ts +3 -0
- package/lib/utils/error/index.js +2 -0
- package/lib/utils/event-emitter/index.d.ts +6 -0
- package/lib/utils/event-emitter/index.js +6 -0
- package/lib/utils/index.d.ts +5 -0
- package/lib/utils/index.js +5 -0
- package/lib/utils/network/index.d.ts +1 -0
- package/lib/utils/network/index.js +1 -0
- package/lib/utils/object/clean-object.d.ts +8 -0
- package/lib/utils/object/clean-object.js +13 -0
- package/lib/utils/object/index.d.ts +1 -0
- package/lib/utils/object/index.js +1 -0
- package/lib/utils/path/index.d.ts +1 -0
- package/lib/utils/path/index.js +1 -0
- package/lib/utils/path/safe-filepath.d.ts +7 -0
- package/lib/utils/path/safe-filepath.js +12 -0
- package/lib/utils/regexp/index.d.ts +1 -0
- package/lib/utils/regexp/index.js +1 -0
- package/lib/utils/retryable/index.d.ts +2 -0
- package/lib/utils/retryable/index.js +1 -0
- package/lib/utils/sort/index.d.ts +14 -0
- package/lib/utils/sort/index.js +61 -0
- package/lib/utils/sort/remove-matches.d.ts +9 -0
- package/lib/utils/sort/remove-matches.js +23 -0
- package/lib/utils/types/index.d.ts +1 -0
- package/lib/utils/types/index.js +1 -0
- package/lib/utils/types/types.d.ts +46 -0
- package/lib/utils/types/types.js +1 -0
- package/lib/utils/url/index.d.ts +5 -0
- package/lib/utils/url/index.js +5 -0
- package/lib/utils/url/is-lower-layer.d.ts +15 -0
- package/lib/utils/url/is-lower-layer.js +55 -0
- package/lib/utils/url/parse-url.d.ts +11 -0
- package/lib/utils/url/parse-url.js +20 -0
- package/lib/utils/url/path-match.d.ts +11 -0
- package/lib/utils/url/path-match.js +18 -0
- package/lib/utils/url/sort-url.d.ts +10 -0
- package/lib/utils/url/sort-url.js +24 -0
- package/lib/utils/url/url-partial-match.d.ts +11 -0
- package/lib/utils/url/url-partial-match.js +32 -0
- package/package.json +49 -0
- package/src/archive/__mock__/.gitignore +3 -0
- package/src/archive/__mock__/mock.sqlite +0 -0
- package/src/archive/archive-accessor.ts +337 -0
- package/src/archive/archive.ts +408 -0
- package/src/archive/database.spec.ts +469 -0
- package/src/archive/database.ts +1059 -0
- package/src/archive/debug.ts +10 -0
- package/src/archive/filesystem/append-text.spec.ts +26 -0
- package/src/archive/filesystem/append-text.ts +16 -0
- package/src/archive/filesystem/copy-dir-sync.spec.ts +27 -0
- package/src/archive/filesystem/copy-dir-sync.ts +10 -0
- package/src/archive/filesystem/copy-dir.spec.ts +33 -0
- package/src/archive/filesystem/copy-dir.ts +14 -0
- package/src/archive/filesystem/exists.spec.ts +33 -0
- package/src/archive/filesystem/exists.ts +10 -0
- package/src/archive/filesystem/get-file-list.spec.ts +37 -0
- package/src/archive/filesystem/get-file-list.ts +13 -0
- package/src/archive/filesystem/index.ts +17 -0
- package/src/archive/filesystem/is-dir.spec.ts +29 -0
- package/src/archive/filesystem/is-dir.ts +11 -0
- package/src/archive/filesystem/mkdir.spec.ts +37 -0
- package/src/archive/filesystem/mkdir.ts +16 -0
- package/src/archive/filesystem/output-json.spec.ts +34 -0
- package/src/archive/filesystem/output-json.ts +16 -0
- package/src/archive/filesystem/output-text.spec.ts +31 -0
- package/src/archive/filesystem/output-text.ts +35 -0
- package/src/archive/filesystem/read-json.spec.ts +26 -0
- package/src/archive/filesystem/read-json.ts +12 -0
- package/src/archive/filesystem/read-text.spec.ts +25 -0
- package/src/archive/filesystem/read-text.ts +11 -0
- package/src/archive/filesystem/readline.spec.ts +29 -0
- package/src/archive/filesystem/readline.ts +30 -0
- package/src/archive/filesystem/remove.spec.ts +34 -0
- package/src/archive/filesystem/remove.ts +11 -0
- package/src/archive/filesystem/rename.spec.ts +46 -0
- package/src/archive/filesystem/rename.ts +21 -0
- package/src/archive/filesystem/tar.spec.ts +33 -0
- package/src/archive/filesystem/tar.ts +27 -0
- package/src/archive/filesystem/untar.spec.ts +34 -0
- package/src/archive/filesystem/untar.ts +36 -0
- package/src/archive/index.ts +13 -0
- package/src/archive/page.spec.ts +368 -0
- package/src/archive/page.ts +420 -0
- package/src/archive/resource.spec.ts +101 -0
- package/src/archive/resource.ts +73 -0
- package/src/archive/safe-path.spec.ts +44 -0
- package/src/archive/safe-path.ts +18 -0
- package/src/archive/types.ts +227 -0
- package/src/crawler/clear-destination-cache.spec.ts +20 -0
- package/src/crawler/clear-destination-cache.ts +9 -0
- package/src/crawler/crawler.ts +873 -0
- package/src/crawler/decompose-url.spec.ts +48 -0
- package/src/crawler/decompose-url.ts +90 -0
- package/src/crawler/destination-cache.spec.ts +23 -0
- package/src/crawler/destination-cache.ts +8 -0
- package/src/crawler/detect-pagination-pattern.spec.ts +169 -0
- package/src/crawler/detect-pagination-pattern.ts +66 -0
- package/src/crawler/fetch-destination.ts +257 -0
- package/src/crawler/fetch-robots-txt.spec.ts +83 -0
- package/src/crawler/fetch-robots-txt.ts +91 -0
- package/src/crawler/find-best-matching-scope.spec.ts +39 -0
- package/src/crawler/find-best-matching-scope.ts +57 -0
- package/src/crawler/generate-predicted-urls.spec.ts +42 -0
- package/src/crawler/generate-predicted-urls.ts +34 -0
- package/src/crawler/handle-ignore-and-skip.spec.ts +66 -0
- package/src/crawler/handle-ignore-and-skip.ts +30 -0
- package/src/crawler/handle-resource-response.spec.ts +45 -0
- package/src/crawler/handle-resource-response.ts +21 -0
- package/src/crawler/handle-scrape-end.spec.ts +109 -0
- package/src/crawler/handle-scrape-end.ts +115 -0
- package/src/crawler/handle-scrape-error.spec.ts +105 -0
- package/src/crawler/handle-scrape-error.ts +58 -0
- package/src/crawler/index.ts +2 -0
- package/src/crawler/inject-scope-auth.spec.ts +36 -0
- package/src/crawler/inject-scope-auth.ts +27 -0
- package/src/crawler/is-external-url.spec.ts +31 -0
- package/src/crawler/is-external-url.ts +17 -0
- package/src/crawler/is-in-any-lower-layer.spec.ts +31 -0
- package/src/crawler/is-in-any-lower-layer.ts +22 -0
- package/src/crawler/link-list.spec.ts +355 -0
- package/src/crawler/link-list.ts +275 -0
- package/src/crawler/link-to-page-data.spec.ts +133 -0
- package/src/crawler/link-to-page-data.ts +34 -0
- package/src/crawler/net-timeout-error.spec.ts +25 -0
- package/src/crawler/net-timeout-error.ts +11 -0
- package/src/crawler/protocol-agnostic-key.spec.ts +40 -0
- package/src/crawler/protocol-agnostic-key.ts +11 -0
- package/src/crawler/reconstruct-url.spec.ts +37 -0
- package/src/crawler/reconstruct-url.ts +37 -0
- package/src/crawler/robots-checker.spec.ts +104 -0
- package/src/crawler/robots-checker.ts +73 -0
- package/src/crawler/should-discard-predicted.spec.ts +125 -0
- package/src/crawler/should-discard-predicted.ts +33 -0
- package/src/crawler/should-skip-url.spec.ts +77 -0
- package/src/crawler/should-skip-url.ts +37 -0
- package/src/crawler/types.ts +146 -0
- package/src/crawler-orchestrator.ts +401 -0
- package/src/debug.ts +10 -0
- package/src/index.ts +25 -0
- package/src/types.ts +30 -0
- package/src/utils/array/each-splitted.spec.ts +38 -0
- package/src/utils/array/each-splitted.ts +19 -0
- package/src/utils/array/index.ts +1 -0
- package/src/utils/debug.ts +6 -0
- package/src/utils/error/dom-evaluation-error.spec.ts +20 -0
- package/src/utils/error/dom-evaluation-error.ts +6 -0
- package/src/utils/error/error-emitter.spec.ts +78 -0
- package/src/utils/error/error-emitter.ts +44 -0
- package/src/utils/error/index.ts +3 -0
- package/src/utils/index.ts +5 -0
- package/src/utils/object/clean-object.spec.ts +24 -0
- package/src/utils/object/clean-object.ts +13 -0
- package/src/utils/object/index.ts +1 -0
- package/src/utils/types/index.ts +1 -0
- package/src/utils/types/types.ts +65 -0
- package/tsconfig.json +11 -0
- package/tsconfig.tsbuildinfo +1 -0
|
@@ -0,0 +1,309 @@
|
|
|
1
|
+
import { tryParseUrl as parseUrl } from '@d-zero/shared/parse-url';
|
|
2
|
+
import { sortUrl } from '@d-zero/shared/sort-url';
|
|
3
|
+
import { TypedAwaitEventEmitter as EventEmitter } from '@d-zero/shared/typed-await-event-emitter';
|
|
4
|
+
import pkg from '../package.json' with { type: 'json' };
|
|
5
|
+
import Archive from './archive/archive.js';
|
|
6
|
+
import { clearDestinationCache, Crawler } from './crawler/index.js';
|
|
7
|
+
import { crawlerLog, log } from './debug.js';
|
|
8
|
+
import { cleanObject } from './utils/index.js';
|
|
9
|
+
/**
 * Default list of external URL prefixes excluded from crawling.
 * Includes social media sharing endpoints that are commonly linked
 * but provide no useful crawl data.
 *
 * Entries are matched as URL prefixes; order is part of the public
 * value and must not be changed.
 */
export const DEFAULT_EXCLUDED_EXTERNAL_URLS = [
    // LINE share / auth endpoints
    'https://social-plugins.line.me',
    'https://access.line.me',
    'https://lineit.line.me',
    'https://line.me',
    // Google+ (defunct) share endpoint
    'https://plus.google.com',
    // Twitter / X
    'https://twitter.com',
    'https://x.com',
    // Facebook share dialogs (several historical URL shapes)
    'https://www.facebook.com/share.php',
    'https://www.facebook.com/share/',
    'https://www.facebook.com/sharer/',
    'https://www.facebook.com/share_channel/',
    // Google search / services root
    'https://www.google.com',
];
|
|
28
|
+
/**
 * The main entry point for Nitpicker web crawling and archiving.
 *
 * CrawlerOrchestrator orchestrates the full lifecycle of a crawl session: it creates an archive,
 * configures a {@link Crawler}, processes discovered pages and resources, and
 * writes the final archive file. It emits events defined by {@link CrawlEvent}.
 *
 * Instances are created via the static factory methods {@link CrawlerOrchestrator.crawling}
 * or {@link CrawlerOrchestrator.resume}; the constructor is private.
 * @example
 * ```ts
 * const orchestrator = await CrawlerOrchestrator.crawling(['https://example.com'], { recursive: true });
 * await orchestrator.write();
 * ```
 */
export class CrawlerOrchestrator extends EventEmitter {
    /** The archive instance for persisting crawl results to SQLite + tar. */
    #archive;
    /** The crawler engine that discovers and scrapes pages. */
    #crawler;
    /** Whether the crawl was started from a pre-defined URL list (non-recursive mode). */
    #fromList;
    /**
     * The underlying archive instance used for storing crawl results.
     */
    get archive() {
        return this.#archive;
    }
    // eslint-disable-next-line no-restricted-syntax
    constructor(archive, options) {
        super();
        this.#fromList = !!options?.list;
        this.#archive = archive;
        // Abort the crawler and re-emit when the archive layer fails.
        // NOTE: #crawler is assigned later in this constructor; this listener
        // only fires asynchronously, after assignment has completed.
        this.#archive.on('error', (e) => {
            this.#crawler.abort();
            void this.emit('error', {
                pid: process.pid,
                isMainProcess: true,
                url: null,
                // Normalize non-Error throwables so consumers always get an Error.
                error: e instanceof Error ? e : new Error(String(e)),
            });
        });
        const defaultUserAgent = `Nitpicker/${pkg.version}`;
        this.#crawler = new Crawler({
            interval: options?.interval || 0,
            parallels: options?.parallels || 0,
            captureImages: options?.image,
            executablePath: options?.executablePath || null,
            fetchExternal: options?.fetchExternal ?? true,
            recursive: options?.recursive ?? true,
            scope: options?.scope ?? [],
            excludes: normalizeToArray(options?.excludes),
            excludeKeywords: normalizeToArray(options?.excludeKeywords),
            // Built-in social-media exclusions always apply; user exclusions are appended.
            excludeUrls: [
                ...DEFAULT_EXCLUDED_EXTERNAL_URLS,
                ...normalizeToArray(options?.excludeUrls),
            ],
            maxExcludedDepth: options?.maxExcludedDepth || 10,
            retry: options?.retry ?? 3,
            disableQueries: options?.disableQueries,
            verbose: options?.verbose ?? false,
            userAgent: options?.userAgent || defaultUserAgent,
            ignoreRobots: options?.ignoreRobots ?? false,
        });
    }
    /**
     * Abort the current crawl and archive operations.
     *
     * Delegates to the archive's abort method, which stops all in-progress
     * database writes and cleans up temporary resources.
     * @returns The result of the archive abort operation.
     */
    abort() {
        return this.#archive.abort();
    }
    /**
     * Execute the crawl for the given list of URLs.
     *
     * Sets up event listeners on the crawler, starts crawling, and resolves
     * when the crawl completes. Discovered pages, external pages, skipped pages,
     * and resources are forwarded to the archive for storage.
     *
     * NOTE(review): listeners are registered on every call; this method appears
     * to be intended for a single invocation per instance — confirm before reusing.
     * @param list - The list of parsed URLs to crawl. The first URL is used as the root.
     * @returns A promise that resolves when crawling is complete.
     * @throws {Error} If the URL list is empty.
     */
    async crawling(list) {
        const root = list[0];
        if (!root) {
            throw new Error('URL is empty');
        }
        return new Promise((resolve, reject) => {
            // Crawl-level errors are recorded in the archive and re-emitted,
            // but do not reject the promise; only archive-write failures do.
            this.#crawler.on('error', (error) => {
                crawlerLog('On error: %O', error);
                void this.#archive.addError(error);
                void this.emit('error', error);
            });
            this.#crawler.on('page', async ({ result }) => {
                await this.#archive.setPage(result).catch((error) => reject(error));
            });
            this.#crawler.on('externalPage', ({ result }) => {
                this.#archive.setExternalPage(result).catch((error) => reject(error));
            });
            this.#crawler.on('skip', ({ url, reason, isExternal }) => {
                this.#archive
                    .setSkippedPage(url, reason, isExternal)
                    .catch((error) => reject(error));
            });
            this.#crawler.on('response', ({ resource }) => {
                this.#archive.setResources(resource).catch((error) => reject(error));
            });
            this.#crawler.on('responseReferrers', (resource) => {
                this.#archive.setResourcesReferrers(resource).catch((error) => reject(error));
            });
            this.#crawler.on('crawlEnd', () => {
                resolve();
            });
            // List mode crawls every URL; recursive mode crawls from the root only.
            if (this.#fromList) {
                this.#crawler.startMultiple(list);
            }
            else {
                this.#crawler.start(root);
            }
        });
    }
    /**
     * Kill any zombie Chromium processes that were not properly cleaned up.
     *
     * Retrieves the list of undead process IDs from the crawler and sends
     * a SIGTERM signal to each one. Chromium is intentionally sent SIGTERM
     * (not SIGKILL) to avoid leaving zombie processes.
     */
    garbageCollect() {
        const pidList = this.getUndeadPid();
        log('Undead PIDs: %O', pidList);
        for (const pid of pidList) {
            try {
                log('Garbage collect: kill PID:%d', pid);
                // Chromium becomes a zombie process if SIGKILL signal.
                // process.kill with no signal argument sends SIGTERM.
                process.kill(pid);
            }
            catch (error) {
                // A failed kill (e.g. the process already exited) is logged, not fatal.
                log('Garbage collect: Failed killing PID:%d %O', pid, error);
            }
        }
    }
    /**
     * Retrieve the list of process IDs for Chromium instances that are
     * still running after crawling has ended.
     * @returns An array of process IDs that should be terminated.
     */
    getUndeadPid() {
        return this.#crawler.getUndeadPid();
    }
    /**
     * Write the archive to its configured file path.
     *
     * Emits `writeFileStart` before writing and `writeFileEnd` after
     * the write completes successfully.
     */
    async write() {
        void this.emit('writeFileStart', { filePath: this.#archive.filePath });
        await this.#archive.write();
        void this.emit('writeFileEnd', { filePath: this.#archive.filePath });
    }
    /**
     * Create a new CrawlerOrchestrator instance and start crawling the given URLs.
     *
     * This is the primary factory method for starting a fresh crawl. It:
     * 1. Parses and sorts the input URLs
     * 2. Creates an archive file
     * 3. Saves the crawl configuration
     * 4. Runs the optional initialized callback
     * 5. Executes the crawl
     * 6. Sorts the archived URLs in natural order
     * @param url - One or more URL strings to crawl.
     * @param options - Optional configuration overrides for the crawl session.
     * @param initializedCallback - Optional callback invoked after initialization but before crawling starts.
     * @returns A promise that resolves to the CrawlerOrchestrator instance after crawling completes.
     * @throws {Error} If the URL list is empty or contains no valid URLs.
     */
    static async crawling(url, options, initializedCallback) {
        const list = sortUrl(url, options);
        const urlParsed = list[0];
        if (!urlParsed) {
            throw new Error('URL is empty');
        }
        // Archive file is named after the root host plus a timestamp.
        const fileName = `${urlParsed.hostname}-${Archive.timestamp()}`;
        const cwd = options?.cwd ?? process.cwd();
        const filePath = Archive.joinPath(cwd, `${fileName}.${Archive.FILE_EXTENSION}`);
        const disableQueries = options?.disableQueries || false;
        const defaultUserAgent = `Nitpicker/${pkg.version}`;
        const archive = await Archive.create({ filePath, cwd, disableQueries });
        // Persist the effective configuration so the crawl can later be resumed.
        await archive.setConfig({
            version: pkg.version,
            name: fileName,
            baseUrl: urlParsed.withoutHash,
            recursive: options?.recursive ?? true,
            fetchExternal: options?.fetchExternal ?? true,
            image: options?.image ?? true,
            interval: options?.interval || 0,
            parallels: options?.parallels || 0,
            scope: options?.scope ?? [],
            // @ts-expect-error TODO: Fix CLI arguments
            excludes: normalizeToArray(options?.exclude),
            // @ts-expect-error TODO: Fix CLI arguments
            excludeKeywords: normalizeToArray(options?.excludeKeyword),
            excludeUrls: [
                ...DEFAULT_EXCLUDED_EXTERNAL_URLS,
                // @ts-expect-error TODO: Fix CLI arguments
                ...normalizeToArray(options?.excludeUrl),
            ],
            maxExcludedDepth: options?.maxExcludedDepth || 10,
            retry: options?.retry ?? 3,
            fromList: !!options?.list,
            disableQueries,
            userAgent: options?.userAgent || defaultUserAgent,
            ignoreRobots: options?.ignoreRobots ?? false,
        });
        const orchestrator = new CrawlerOrchestrator(archive, options);
        const config = await archive.getConfig();
        if (initializedCallback) {
            await initializedCallback(orchestrator, config);
        }
        log('Start crawling');
        log('URL %O', list.map((url) => url.href));
        log('Config %O', config);
        await orchestrator.crawling(list);
        log('Crawling completed');
        clearDestinationCache();
        log('Set order natural URL sort');
        await archive.setUrlOrder();
        log('Sorting done');
        return orchestrator;
    }
    /**
     * Resume a previously interrupted crawl from an existing archive file.
     *
     * Restores the crawl state (pending URLs, scraped URLs, and resources)
     * from the archive, merges any option overrides, and continues crawling
     * from where it left off.
     * @param stubPath - Path to the existing archive file to resume from.
     * @param options - Optional configuration overrides to apply on top of the archived config.
     * @param initializedCallback - Optional callback invoked after initialization but before crawling resumes.
     * @returns A promise that resolves to the CrawlerOrchestrator instance after crawling completes.
     * @throws {Error} If the archived URL is invalid.
     */
    static async resume(stubPath, options, initializedCallback) {
        const archive = await Archive.resume(stubPath);
        const archivedConfig = await archive.getConfig();
        // Caller-supplied options win over the archived config; cleanObject
        // drops empty values so they do not clobber archived settings.
        const config = {
            ...archivedConfig,
            ...cleanObject(options),
        };
        const orchestrator = new CrawlerOrchestrator(archive, config);
        const _url = await archive.getUrl();
        const url = parseUrl(_url, config);
        if (!url) {
            throw new Error(`URL (${_url}) is invalid`);
        }
        const { scraped, pending } = await archive.getCrawlingState();
        const resources = await archive.getResourceUrlList();
        orchestrator.#crawler.resume(pending, scraped, resources);
        if (initializedCallback) {
            await initializedCallback(orchestrator, config);
        }
        log('Start resuming');
        log('Data %s', stubPath);
        log('URL %s', url.href);
        log('Config %O', config);
        await orchestrator.crawling([url]);
        return orchestrator;
    }
}
|
|
301
|
+
/**
 * Coerce an optional single-or-array parameter into a guaranteed array.
 *
 * Arrays pass through unchanged; a truthy scalar is wrapped in a
 * one-element array; null, undefined, and other falsy values yield
 * an empty array.
 * @param param - The parameter to normalize.
 * @returns An array containing the parameter value(s), or an empty array if absent.
 */
function normalizeToArray(param) {
    if (Array.isArray(param)) {
        return param;
    }
    if (!param) {
        return [];
    }
    return [param];
}
|
package/lib/debug.d.ts
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
/** Debug logger for the core package. Namespace: `Nitpicker`. */
export declare const log: import("debug").Debugger;
/** Debug logger for the crawler module. Namespace: `Nitpicker:Crawler`. */
export declare const crawlerLog: import("debug").Debugger;
/** Debug logger for the dealer integration. Namespace: `Nitpicker:Crawler:Deal`. */
export declare const dealLog: import("debug").Debugger;
/** Debug logger for crawler errors. Namespace: `Nitpicker:Crawler:Error`. */
export declare const crawlerErrorLog: import("debug").Debugger;
|
package/lib/debug.js
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
// Namespaced debug loggers for this package, derived from the shared
// root logger via debug's `.extend()` (which appends `:<name>`).
import { log as globalLog } from './utils/debug.js';
/** Debug logger for the core package. Namespace: `Nitpicker`. */
export const log = globalLog;
/** Debug logger for the crawler module. Namespace: `Nitpicker:Crawler`. */
export const crawlerLog = log.extend('Crawler');
/** Debug logger for the dealer integration. Namespace: `Nitpicker:Crawler:Deal`. */
export const dealLog = crawlerLog.extend('Deal');
/** Debug logger for crawler errors. Namespace: `Nitpicker:Crawler:Error`. */
export const crawlerErrorLog = crawlerLog.extend('Error');
|
package/lib/index.d.ts
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/**
 * @module @nitpicker/crawler
 *
 * Core module of Nitpicker that provides the main crawling engine,
 * utility functions, type definitions, and archive storage layer.
 */
// Utilities
export * from './utils/index.js';
// Archive layer
export { ArchiveAccessor } from './archive/archive-accessor.js';
export type { Redirect, Referrer, Anchor, StaticPageData } from './archive/page.js';
export { default as Page } from './archive/page.js';
export { default as ArchiveResource } from './archive/resource.js';
export * from './archive/types.js';
export { default as Archive } from './archive/archive.js';
// Crawler core
export { DEFAULT_EXCLUDED_EXTERNAL_URLS, CrawlerOrchestrator, } from './crawler-orchestrator.js';
export * from './types.js';
export * from './crawler/types.js';
|
package/lib/index.js
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
/**
 * @module @nitpicker/crawler
 *
 * Core module of Nitpicker that provides the main crawling engine,
 * utility functions, type definitions, and archive storage layer.
 */
// Types + Utils (formerly the separate @nitpicker/types + utils packages)
export * from './utils/index.js';
// Archive
export { ArchiveAccessor } from './archive/archive-accessor.js';
export { default as Page } from './archive/page.js';
export { default as ArchiveResource } from './archive/resource.js';
export * from './archive/types.js';
export { default as Archive } from './archive/archive.js';
// Core
export { DEFAULT_EXCLUDED_EXTERNAL_URLS, CrawlerOrchestrator, } from './crawler-orchestrator.js';
export * from './types.js';
export * from './crawler/types.js';
|
package/lib/qzilla.d.ts
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
import type { QzillaEvent } from './types.js';
|
|
2
|
+
import type { Config } from './archive/types.js';
|
|
3
|
+
import Archive from './archive/archive.js';
|
|
4
|
+
import { EventEmitter } from './utils/index.js';
|
|
5
|
+
import type { ExURL } from './utils/index.js';
|
|
6
|
+
/**
|
|
7
|
+
* Default list of external URL prefixes excluded from crawling.
|
|
8
|
+
* Includes social media sharing endpoints that are commonly linked
|
|
9
|
+
* but provide no useful crawl data.
|
|
10
|
+
*/
|
|
11
|
+
export declare const DEFAULT_EXCLUDED_EXTERNAL_URLS: string[];
|
|
12
|
+
/**
|
|
13
|
+
* Configuration options for the Qzilla crawler.
|
|
14
|
+
*
|
|
15
|
+
* Extends the archive {@link Config} with additional runtime settings
|
|
16
|
+
* such as working directory, browser executable path, and output options.
|
|
17
|
+
*/
|
|
18
|
+
type QzillaConfig = {
|
|
19
|
+
/** The working directory for output files. Defaults to `process.cwd()`. */
|
|
20
|
+
cwd: string;
|
|
21
|
+
/** Path to a Chromium/Chrome executable for Puppeteer. */
|
|
22
|
+
executablePath: string;
|
|
23
|
+
/** Output file path for the archive. */
|
|
24
|
+
filePath: string;
|
|
25
|
+
/** Whether to capture image resources during crawling. */
|
|
26
|
+
image: boolean;
|
|
27
|
+
/** File-size threshold (in bytes) above which images are excluded. */
|
|
28
|
+
imageFileSizeThreshold: number;
|
|
29
|
+
/** Delay in milliseconds between each page request. */
|
|
30
|
+
interval: number;
|
|
31
|
+
/** Whether the input is a pre-defined URL list (non-recursive mode). */
|
|
32
|
+
list: boolean;
|
|
33
|
+
/** Whether to enable verbose logging output. */
|
|
34
|
+
verbose: boolean;
|
|
35
|
+
} & Config;
|
|
36
|
+
/**
|
|
37
|
+
* Callback invoked after the Qzilla instance is fully initialized
|
|
38
|
+
* but before crawling begins.
|
|
39
|
+
* @param qzilla - The initialized Qzilla instance.
|
|
40
|
+
* @param config - The resolved archive configuration.
|
|
41
|
+
*/
|
|
42
|
+
type QzillaInitializedCallback = (qzilla: Qzilla, config: Config) => void | Promise<void>;
|
|
43
|
+
/**
|
|
44
|
+
* The main entry point for Qzilla web crawling and archiving.
|
|
45
|
+
*
|
|
46
|
+
* Qzilla orchestrates the full lifecycle of a crawl session: it creates an archive,
|
|
47
|
+
* configures a {@link Crawler}, processes discovered pages and resources, and
|
|
48
|
+
* writes the final archive file. It emits events defined by {@link QzillaEvent}.
|
|
49
|
+
*
|
|
50
|
+
* Instances are created via the static factory methods {@link Qzilla.crawling}
|
|
51
|
+
* or {@link Qzilla.resume}; the constructor is private.
|
|
52
|
+
* @example
|
|
53
|
+
* ```ts
|
|
54
|
+
* const qzilla = await Qzilla.crawling(['https://example.com'], { recursive: true });
|
|
55
|
+
* await qzilla.write();
|
|
56
|
+
* ```
|
|
57
|
+
*/
|
|
58
|
+
export declare class Qzilla extends EventEmitter<QzillaEvent> {
|
|
59
|
+
#private;
|
|
60
|
+
/**
|
|
61
|
+
* The underlying archive instance used for storing crawl results.
|
|
62
|
+
*/
|
|
63
|
+
get archive(): Archive;
|
|
64
|
+
private constructor();
|
|
65
|
+
/**
|
|
66
|
+
* Abort the current crawl and archive operations.
|
|
67
|
+
*
|
|
68
|
+
* Delegates to the archive's abort method, which stops all in-progress
|
|
69
|
+
* database writes and cleans up temporary resources.
|
|
70
|
+
* @returns The result of the archive abort operation.
|
|
71
|
+
*/
|
|
72
|
+
abort(): void;
|
|
73
|
+
/**
|
|
74
|
+
* Execute the crawl for the given list of URLs.
|
|
75
|
+
*
|
|
76
|
+
* Sets up event listeners on the crawler, starts crawling, and resolves
|
|
77
|
+
* when the crawl completes. Discovered pages, external pages, skipped pages,
|
|
78
|
+
* and resources are forwarded to the archive for storage.
|
|
79
|
+
* @param list - The list of parsed URLs to crawl. The first URL is used as the root.
|
|
80
|
+
* @returns A promise that resolves when crawling is complete.
|
|
81
|
+
* @throws {Error} If the URL list is empty.
|
|
82
|
+
*/
|
|
83
|
+
crawling(list: ExURL[]): Promise<void>;
|
|
84
|
+
/**
|
|
85
|
+
* Kill any zombie Chromium processes that were not properly cleaned up.
|
|
86
|
+
*
|
|
87
|
+
* Retrieves the list of undead process IDs from the crawler and sends
|
|
88
|
+
* a SIGTERM signal to each one. Chromium is intentionally sent SIGTERM
|
|
89
|
+
* (not SIGKILL) to avoid leaving zombie processes.
|
|
90
|
+
*/
|
|
91
|
+
garbageCollect(): void;
|
|
92
|
+
/**
|
|
93
|
+
* Retrieve the list of process IDs for Chromium instances that are
|
|
94
|
+
* still running after crawling has ended.
|
|
95
|
+
* @returns An array of process IDs that should be terminated.
|
|
96
|
+
*/
|
|
97
|
+
getUndeadPid(): never[];
|
|
98
|
+
/**
|
|
99
|
+
* Write the archive to its configured file path.
|
|
100
|
+
*
|
|
101
|
+
* Emits `writeFileStart` before writing and `writeFileEnd` after
|
|
102
|
+
* the write completes successfully.
|
|
103
|
+
*/
|
|
104
|
+
write(): Promise<void>;
|
|
105
|
+
/**
|
|
106
|
+
* Create a new Qzilla instance and start crawling the given URLs.
|
|
107
|
+
*
|
|
108
|
+
* This is the primary factory method for starting a fresh crawl. It:
|
|
109
|
+
* 1. Parses and sorts the input URLs
|
|
110
|
+
* 2. Creates an archive file
|
|
111
|
+
* 3. Saves the crawl configuration
|
|
112
|
+
* 4. Runs the optional initialized callback
|
|
113
|
+
* 5. Executes the crawl
|
|
114
|
+
* 6. Sorts the archived URLs in natural order
|
|
115
|
+
* @param url - One or more URL strings to crawl.
|
|
116
|
+
* @param options - Optional configuration overrides for the crawl session.
|
|
117
|
+
* @param initializedCallback - Optional callback invoked after initialization but before crawling starts.
|
|
118
|
+
* @returns A promise that resolves to the Qzilla instance after crawling completes.
|
|
119
|
+
* @throws {Error} If the URL list is empty or contains no valid URLs.
|
|
120
|
+
*/
|
|
121
|
+
static crawling(url: string[], options?: Partial<QzillaConfig>, initializedCallback?: QzillaInitializedCallback): Promise<Qzilla>;
|
|
122
|
+
/**
|
|
123
|
+
* Resume a previously interrupted crawl from an existing archive file.
|
|
124
|
+
*
|
|
125
|
+
* Restores the crawl state (pending URLs, scraped URLs, and resources)
|
|
126
|
+
* from the archive, merges any option overrides, and continues crawling
|
|
127
|
+
* from where it left off.
|
|
128
|
+
* @param stubPath - Path to the existing archive file to resume from.
|
|
129
|
+
* @param options - Optional configuration overrides to apply on top of the archived config.
|
|
130
|
+
* @param initializedCallback - Optional callback invoked after initialization but before crawling resumes.
|
|
131
|
+
* @returns A promise that resolves to the Qzilla instance after crawling completes.
|
|
132
|
+
* @throws {Error} If the archived URL is invalid.
|
|
133
|
+
*/
|
|
134
|
+
static resume(stubPath: string, options?: Partial<QzillaConfig>, initializedCallback?: QzillaInitializedCallback): Promise<Qzilla>;
|
|
135
|
+
}
|
|
136
|
+
export {};
|