@nitpicker/crawler 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +8 -0
- package/LICENSE +191 -0
- package/README.md +13 -0
- package/lib/archive/archive-accessor.d.ts +107 -0
- package/lib/archive/archive-accessor.js +264 -0
- package/lib/archive/archive.d.ts +174 -0
- package/lib/archive/archive.js +331 -0
- package/lib/archive/database.d.ts +207 -0
- package/lib/archive/database.js +972 -0
- package/lib/archive/debug.d.ts +8 -0
- package/lib/archive/debug.js +9 -0
- package/lib/archive/filesystem/append-text.d.ts +9 -0
- package/lib/archive/filesystem/append-text.js +14 -0
- package/lib/archive/filesystem/copy-dir-sync.d.ts +6 -0
- package/lib/archive/filesystem/copy-dir-sync.js +9 -0
- package/lib/archive/filesystem/copy-dir.d.ts +7 -0
- package/lib/archive/filesystem/copy-dir.js +13 -0
- package/lib/archive/filesystem/exists.d.ts +6 -0
- package/lib/archive/filesystem/exists.js +9 -0
- package/lib/archive/filesystem/get-file-list.d.ts +8 -0
- package/lib/archive/filesystem/get-file-list.js +12 -0
- package/lib/archive/filesystem/index.d.ts +17 -0
- package/lib/archive/filesystem/index.js +17 -0
- package/lib/archive/filesystem/is-dir.d.ts +6 -0
- package/lib/archive/filesystem/is-dir.js +10 -0
- package/lib/archive/filesystem/mkdir.d.ts +8 -0
- package/lib/archive/filesystem/mkdir.js +15 -0
- package/lib/archive/filesystem/output-json.d.ts +9 -0
- package/lib/archive/filesystem/output-json.js +14 -0
- package/lib/archive/filesystem/output-text.d.ts +11 -0
- package/lib/archive/filesystem/output-text.js +32 -0
- package/lib/archive/filesystem/read-json.d.ts +7 -0
- package/lib/archive/filesystem/read-json.js +11 -0
- package/lib/archive/filesystem/read-text.d.ts +6 -0
- package/lib/archive/filesystem/read-text.js +10 -0
- package/lib/archive/filesystem/readline.d.ts +11 -0
- package/lib/archive/filesystem/readline.js +26 -0
- package/lib/archive/filesystem/remove.d.ts +5 -0
- package/lib/archive/filesystem/remove.js +10 -0
- package/lib/archive/filesystem/rename.d.ts +11 -0
- package/lib/archive/filesystem/rename.js +18 -0
- package/lib/archive/filesystem/tar.d.ts +11 -0
- package/lib/archive/filesystem/tar.js +22 -0
- package/lib/archive/filesystem/untar.d.ts +20 -0
- package/lib/archive/filesystem/untar.js +24 -0
- package/lib/archive/filesystem/utils.d.ts +109 -0
- package/lib/archive/filesystem/utils.js +185 -0
- package/lib/archive/filesystem/zip.d.ts +29 -0
- package/lib/archive/filesystem/zip.js +53 -0
- package/lib/archive/index.d.ts +6 -0
- package/lib/archive/index.js +11 -0
- package/lib/archive/page.d.ts +263 -0
- package/lib/archive/page.js +316 -0
- package/lib/archive/resource.d.ts +46 -0
- package/lib/archive/resource.js +62 -0
- package/lib/archive/safe-path.d.ts +9 -0
- package/lib/archive/safe-path.js +17 -0
- package/lib/archive/types.d.ts +210 -0
- package/lib/archive/types.js +1 -0
- package/lib/crawler/clear-destination-cache.d.ts +5 -0
- package/lib/crawler/clear-destination-cache.js +8 -0
- package/lib/crawler/crawler.d.ts +73 -0
- package/lib/crawler/crawler.js +748 -0
- package/lib/crawler/decompose-url.d.ts +25 -0
- package/lib/crawler/decompose-url.js +71 -0
- package/lib/crawler/destination-cache.d.ts +7 -0
- package/lib/crawler/destination-cache.js +6 -0
- package/lib/crawler/detect-pagination-pattern.d.ts +16 -0
- package/lib/crawler/detect-pagination-pattern.js +61 -0
- package/lib/crawler/fetch-destination.d.ts +38 -0
- package/lib/crawler/fetch-destination.js +208 -0
- package/lib/crawler/fetch-robots-txt.d.ts +42 -0
- package/lib/crawler/fetch-robots-txt.js +44 -0
- package/lib/crawler/find-best-matching-scope.d.ts +12 -0
- package/lib/crawler/find-best-matching-scope.js +46 -0
- package/lib/crawler/generate-predicted-urls.d.ts +13 -0
- package/lib/crawler/generate-predicted-urls.js +27 -0
- package/lib/crawler/handle-ignore-and-skip.d.ts +16 -0
- package/lib/crawler/handle-ignore-and-skip.js +19 -0
- package/lib/crawler/handle-resource-response.d.ts +13 -0
- package/lib/crawler/handle-resource-response.js +16 -0
- package/lib/crawler/handle-scrape-end.d.ts +24 -0
- package/lib/crawler/handle-scrape-end.js +82 -0
- package/lib/crawler/handle-scrape-error.d.ts +37 -0
- package/lib/crawler/handle-scrape-error.js +38 -0
- package/lib/crawler/index.d.ts +2 -0
- package/lib/crawler/index.js +2 -0
- package/lib/crawler/inject-scope-auth.d.ts +11 -0
- package/lib/crawler/inject-scope-auth.js +21 -0
- package/lib/crawler/is-external-url.d.ts +11 -0
- package/lib/crawler/is-external-url.js +12 -0
- package/lib/crawler/is-in-any-lower-layer.d.ts +13 -0
- package/lib/crawler/is-in-any-lower-layer.js +15 -0
- package/lib/crawler/link-list.d.ts +112 -0
- package/lib/crawler/link-list.js +248 -0
- package/lib/crawler/link-to-page-data.d.ts +14 -0
- package/lib/crawler/link-to-page-data.js +32 -0
- package/lib/crawler/net-timeout-error.d.ts +9 -0
- package/lib/crawler/net-timeout-error.js +11 -0
- package/lib/crawler/network.d.ts +30 -0
- package/lib/crawler/network.js +226 -0
- package/lib/crawler/protocol-agnostic-key.d.ts +9 -0
- package/lib/crawler/protocol-agnostic-key.js +11 -0
- package/lib/crawler/reconstruct-url.d.ts +10 -0
- package/lib/crawler/reconstruct-url.js +28 -0
- package/lib/crawler/result-handler.d.ts +118 -0
- package/lib/crawler/result-handler.js +153 -0
- package/lib/crawler/robots-checker.d.ts +26 -0
- package/lib/crawler/robots-checker.js +62 -0
- package/lib/crawler/should-discard-predicted.d.ts +14 -0
- package/lib/crawler/should-discard-predicted.js +31 -0
- package/lib/crawler/should-skip-url.d.ts +23 -0
- package/lib/crawler/should-skip-url.js +15 -0
- package/lib/crawler/speculative-pagination.d.ts +52 -0
- package/lib/crawler/speculative-pagination.js +215 -0
- package/lib/crawler/types.d.ts +119 -0
- package/lib/crawler/types.js +1 -0
- package/lib/crawler/url-filter.d.ts +56 -0
- package/lib/crawler/url-filter.js +110 -0
- package/lib/crawler-orchestrator.d.ts +142 -0
- package/lib/crawler-orchestrator.js +309 -0
- package/lib/debug.d.ts +8 -0
- package/lib/debug.js +9 -0
- package/lib/index.d.ts +16 -0
- package/lib/index.js +18 -0
- package/lib/qzilla.d.ts +136 -0
- package/lib/qzilla.js +292 -0
- package/lib/types.d.ts +27 -0
- package/lib/types.js +1 -0
- package/lib/utils/array/each-splitted.d.ts +10 -0
- package/lib/utils/array/each-splitted.js +14 -0
- package/lib/utils/array/index.d.ts +1 -0
- package/lib/utils/array/index.js +1 -0
- package/lib/utils/async/index.d.ts +1 -0
- package/lib/utils/async/index.js +1 -0
- package/lib/utils/debug.d.ts +5 -0
- package/lib/utils/debug.js +5 -0
- package/lib/utils/error/dom-evaluation-error.d.ts +7 -0
- package/lib/utils/error/dom-evaluation-error.js +7 -0
- package/lib/utils/error/error-emitter.d.ts +18 -0
- package/lib/utils/error/error-emitter.js +29 -0
- package/lib/utils/error/index.d.ts +3 -0
- package/lib/utils/error/index.js +2 -0
- package/lib/utils/event-emitter/index.d.ts +6 -0
- package/lib/utils/event-emitter/index.js +6 -0
- package/lib/utils/index.d.ts +5 -0
- package/lib/utils/index.js +5 -0
- package/lib/utils/network/index.d.ts +1 -0
- package/lib/utils/network/index.js +1 -0
- package/lib/utils/object/clean-object.d.ts +8 -0
- package/lib/utils/object/clean-object.js +13 -0
- package/lib/utils/object/index.d.ts +1 -0
- package/lib/utils/object/index.js +1 -0
- package/lib/utils/path/index.d.ts +1 -0
- package/lib/utils/path/index.js +1 -0
- package/lib/utils/path/safe-filepath.d.ts +7 -0
- package/lib/utils/path/safe-filepath.js +12 -0
- package/lib/utils/regexp/index.d.ts +1 -0
- package/lib/utils/regexp/index.js +1 -0
- package/lib/utils/retryable/index.d.ts +2 -0
- package/lib/utils/retryable/index.js +1 -0
- package/lib/utils/sort/index.d.ts +14 -0
- package/lib/utils/sort/index.js +61 -0
- package/lib/utils/sort/remove-matches.d.ts +9 -0
- package/lib/utils/sort/remove-matches.js +23 -0
- package/lib/utils/types/index.d.ts +1 -0
- package/lib/utils/types/index.js +1 -0
- package/lib/utils/types/types.d.ts +46 -0
- package/lib/utils/types/types.js +1 -0
- package/lib/utils/url/index.d.ts +5 -0
- package/lib/utils/url/index.js +5 -0
- package/lib/utils/url/is-lower-layer.d.ts +15 -0
- package/lib/utils/url/is-lower-layer.js +55 -0
- package/lib/utils/url/parse-url.d.ts +11 -0
- package/lib/utils/url/parse-url.js +20 -0
- package/lib/utils/url/path-match.d.ts +11 -0
- package/lib/utils/url/path-match.js +18 -0
- package/lib/utils/url/sort-url.d.ts +10 -0
- package/lib/utils/url/sort-url.js +24 -0
- package/lib/utils/url/url-partial-match.d.ts +11 -0
- package/lib/utils/url/url-partial-match.js +32 -0
- package/package.json +49 -0
- package/src/archive/__mock__/.gitignore +3 -0
- package/src/archive/__mock__/mock.sqlite +0 -0
- package/src/archive/archive-accessor.ts +337 -0
- package/src/archive/archive.ts +408 -0
- package/src/archive/database.spec.ts +469 -0
- package/src/archive/database.ts +1059 -0
- package/src/archive/debug.ts +10 -0
- package/src/archive/filesystem/append-text.spec.ts +26 -0
- package/src/archive/filesystem/append-text.ts +16 -0
- package/src/archive/filesystem/copy-dir-sync.spec.ts +27 -0
- package/src/archive/filesystem/copy-dir-sync.ts +10 -0
- package/src/archive/filesystem/copy-dir.spec.ts +33 -0
- package/src/archive/filesystem/copy-dir.ts +14 -0
- package/src/archive/filesystem/exists.spec.ts +33 -0
- package/src/archive/filesystem/exists.ts +10 -0
- package/src/archive/filesystem/get-file-list.spec.ts +37 -0
- package/src/archive/filesystem/get-file-list.ts +13 -0
- package/src/archive/filesystem/index.ts +17 -0
- package/src/archive/filesystem/is-dir.spec.ts +29 -0
- package/src/archive/filesystem/is-dir.ts +11 -0
- package/src/archive/filesystem/mkdir.spec.ts +37 -0
- package/src/archive/filesystem/mkdir.ts +16 -0
- package/src/archive/filesystem/output-json.spec.ts +34 -0
- package/src/archive/filesystem/output-json.ts +16 -0
- package/src/archive/filesystem/output-text.spec.ts +31 -0
- package/src/archive/filesystem/output-text.ts +35 -0
- package/src/archive/filesystem/read-json.spec.ts +26 -0
- package/src/archive/filesystem/read-json.ts +12 -0
- package/src/archive/filesystem/read-text.spec.ts +25 -0
- package/src/archive/filesystem/read-text.ts +11 -0
- package/src/archive/filesystem/readline.spec.ts +29 -0
- package/src/archive/filesystem/readline.ts +30 -0
- package/src/archive/filesystem/remove.spec.ts +34 -0
- package/src/archive/filesystem/remove.ts +11 -0
- package/src/archive/filesystem/rename.spec.ts +46 -0
- package/src/archive/filesystem/rename.ts +21 -0
- package/src/archive/filesystem/tar.spec.ts +33 -0
- package/src/archive/filesystem/tar.ts +27 -0
- package/src/archive/filesystem/untar.spec.ts +34 -0
- package/src/archive/filesystem/untar.ts +36 -0
- package/src/archive/index.ts +13 -0
- package/src/archive/page.spec.ts +368 -0
- package/src/archive/page.ts +420 -0
- package/src/archive/resource.spec.ts +101 -0
- package/src/archive/resource.ts +73 -0
- package/src/archive/safe-path.spec.ts +44 -0
- package/src/archive/safe-path.ts +18 -0
- package/src/archive/types.ts +227 -0
- package/src/crawler/clear-destination-cache.spec.ts +20 -0
- package/src/crawler/clear-destination-cache.ts +9 -0
- package/src/crawler/crawler.ts +873 -0
- package/src/crawler/decompose-url.spec.ts +48 -0
- package/src/crawler/decompose-url.ts +90 -0
- package/src/crawler/destination-cache.spec.ts +23 -0
- package/src/crawler/destination-cache.ts +8 -0
- package/src/crawler/detect-pagination-pattern.spec.ts +169 -0
- package/src/crawler/detect-pagination-pattern.ts +66 -0
- package/src/crawler/fetch-destination.ts +257 -0
- package/src/crawler/fetch-robots-txt.spec.ts +83 -0
- package/src/crawler/fetch-robots-txt.ts +91 -0
- package/src/crawler/find-best-matching-scope.spec.ts +39 -0
- package/src/crawler/find-best-matching-scope.ts +57 -0
- package/src/crawler/generate-predicted-urls.spec.ts +42 -0
- package/src/crawler/generate-predicted-urls.ts +34 -0
- package/src/crawler/handle-ignore-and-skip.spec.ts +66 -0
- package/src/crawler/handle-ignore-and-skip.ts +30 -0
- package/src/crawler/handle-resource-response.spec.ts +45 -0
- package/src/crawler/handle-resource-response.ts +21 -0
- package/src/crawler/handle-scrape-end.spec.ts +109 -0
- package/src/crawler/handle-scrape-end.ts +115 -0
- package/src/crawler/handle-scrape-error.spec.ts +105 -0
- package/src/crawler/handle-scrape-error.ts +58 -0
- package/src/crawler/index.ts +2 -0
- package/src/crawler/inject-scope-auth.spec.ts +36 -0
- package/src/crawler/inject-scope-auth.ts +27 -0
- package/src/crawler/is-external-url.spec.ts +31 -0
- package/src/crawler/is-external-url.ts +17 -0
- package/src/crawler/is-in-any-lower-layer.spec.ts +31 -0
- package/src/crawler/is-in-any-lower-layer.ts +22 -0
- package/src/crawler/link-list.spec.ts +355 -0
- package/src/crawler/link-list.ts +275 -0
- package/src/crawler/link-to-page-data.spec.ts +133 -0
- package/src/crawler/link-to-page-data.ts +34 -0
- package/src/crawler/net-timeout-error.spec.ts +25 -0
- package/src/crawler/net-timeout-error.ts +11 -0
- package/src/crawler/protocol-agnostic-key.spec.ts +40 -0
- package/src/crawler/protocol-agnostic-key.ts +11 -0
- package/src/crawler/reconstruct-url.spec.ts +37 -0
- package/src/crawler/reconstruct-url.ts +37 -0
- package/src/crawler/robots-checker.spec.ts +104 -0
- package/src/crawler/robots-checker.ts +73 -0
- package/src/crawler/should-discard-predicted.spec.ts +125 -0
- package/src/crawler/should-discard-predicted.ts +33 -0
- package/src/crawler/should-skip-url.spec.ts +77 -0
- package/src/crawler/should-skip-url.ts +37 -0
- package/src/crawler/types.ts +146 -0
- package/src/crawler-orchestrator.ts +401 -0
- package/src/debug.ts +10 -0
- package/src/index.ts +25 -0
- package/src/types.ts +30 -0
- package/src/utils/array/each-splitted.spec.ts +38 -0
- package/src/utils/array/each-splitted.ts +19 -0
- package/src/utils/array/index.ts +1 -0
- package/src/utils/debug.ts +6 -0
- package/src/utils/error/dom-evaluation-error.spec.ts +20 -0
- package/src/utils/error/dom-evaluation-error.ts +6 -0
- package/src/utils/error/error-emitter.spec.ts +78 -0
- package/src/utils/error/error-emitter.ts +44 -0
- package/src/utils/error/index.ts +3 -0
- package/src/utils/index.ts +5 -0
- package/src/utils/object/clean-object.spec.ts +24 -0
- package/src/utils/object/clean-object.ts +13 -0
- package/src/utils/object/index.ts +1 -0
- package/src/utils/types/index.ts +1 -0
- package/src/utils/types/types.ts +65 -0
- package/tsconfig.json +11 -0
- package/tsconfig.tsbuildinfo +1 -0
package/lib/qzilla.js
ADDED
|
@@ -0,0 +1,292 @@
|
|
|
1
|
+
import Archive from './archive/archive.js';
|
|
2
|
+
import { cleanObject, EventEmitter, parseUrl, sortUrl } from './utils/index.js';
|
|
3
|
+
import pkg from '../package.json' with { type: 'json' };
|
|
4
|
+
import { Crawler } from './crawler/index.js';
|
|
5
|
+
import { crawlerLog, log } from './debug.js';
|
|
6
|
+
/**
 * Default list of external URL prefixes excluded from crawling.
 *
 * These are social-media sharing/auth endpoints that sites commonly link to
 * but that yield no useful crawl data. Callers may append their own entries
 * via the `excludeUrls` option; this list is always spread in first.
 */
export const DEFAULT_EXCLUDED_EXTERNAL_URLS = [
    // LINE sharing and authentication endpoints
    'https://social-plugins.line.me',
    'https://access.line.me',
    'https://lineit.line.me',
    'https://line.me',
    // Google+ (defunct) share links
    'https://plus.google.com',
    // Twitter / X intent links
    'https://twitter.com',
    'https://x.com',
    // Facebook share dialogs (several URL shapes in the wild)
    'https://www.facebook.com/share.php',
    'https://www.facebook.com/share/',
    'https://www.facebook.com/sharer/',
    'https://www.facebook.com/share_channel/',
    // Google search / service root
    'https://www.google.com',
];
|
|
25
|
+
/**
 * The main entry point for Qzilla web crawling and archiving.
 *
 * Qzilla orchestrates the full lifecycle of a crawl session: it creates an archive,
 * configures a {@link Crawler}, processes discovered pages and resources, and
 * writes the final archive file. It emits events defined by {@link QzillaEvent}.
 *
 * Instances are created via the static factory methods {@link Qzilla.crawling}
 * or {@link Qzilla.resume}; the constructor is private.
 * @example
 * ```ts
 * const qzilla = await Qzilla.crawling(['https://example.com'], { recursive: true });
 * await qzilla.write();
 * ```
 */
export class Qzilla extends EventEmitter {
    #archive;
    #crawler;
    #fromList;
    /**
     * The underlying archive instance used for storing crawl results.
     */
    get archive() {
        return this.#archive;
    }
    // eslint-disable-next-line no-restricted-syntax
    constructor(archive, options) {
        super();
        this.#fromList = !!options?.list;
        this.#archive = archive;
        // A fatal archive error makes further crawling pointless:
        // stop the crawler first, then rethrow so the failure propagates.
        this.#archive.on('error', (e) => {
            this.#crawler.abort();
            throw e;
        });
        this.#crawler = new Crawler({
            interval: options?.interval || 0,
            parallels: options?.parallels || 0,
            isGettingImages: options?.image,
            executablePath: options?.executablePath || null,
            fetchExternal: options?.fetchExternal ?? true,
            recursive: options?.recursive ?? true,
            scope: options?.scope ?? [],
            excludes: optMultiParam(options?.excludes),
            excludeKeywords: optMultiParam(options?.excludeKeywords),
            excludeUrls: [
                ...DEFAULT_EXCLUDED_EXTERNAL_URLS,
                ...optMultiParam(options?.excludeUrls),
            ],
            depthOnAvoid: options?.depthOnAvoid || 10,
            disableQueries: options?.disableQueries,
            screenshot: Archive.joinPath(archive.tmpDir, 'screenshots'),
            verbose: options?.verbose ?? false,
        });
    }
    /**
     * Abort the current crawl and archive operations.
     *
     * Delegates to the archive's abort method, which stops all in-progress
     * database writes and cleans up temporary resources.
     * @returns The result of the archive abort operation.
     */
    abort() {
        return this.#archive.abort();
    }
    /**
     * Execute the crawl for the given list of URLs.
     *
     * Sets up event listeners on the crawler, starts crawling, and resolves
     * when the crawl completes. Discovered pages, external pages, skipped pages,
     * and resources are forwarded to the archive for storage.
     * @param list - The list of parsed URLs to crawl. The first URL is used as the root.
     * @returns A promise that resolves when crawling is complete.
     * @throws {Error} If the URL list is empty.
     */
    async crawling(list) {
        const root = list[0];
        if (!root) {
            throw new Error('URL is empty');
        }
        return new Promise((resolve, reject) => {
            // Crawler errors are recorded in the archive and re-emitted on this
            // instance; they do not reject the crawl promise.
            this.#crawler.on('error', (error) => {
                crawlerLog('On error: %O', error);
                void this.#archive.addError(error);
                void this.emit('error', error);
            });
            this.#crawler.on('page', async ({ result }) => {
                // const pageId =
                await this.#archive.setPage(result).catch((error) => reject(error));
                // await this.#crawler.screenshot(pageId, Archive.joinPath(this.#archive.tmpDir, 'screenshots'));
            });
            this.#crawler.on('externalPage', ({ result }) => {
                this.#archive.setExternalPage(result).catch((error) => reject(error));
            });
            this.#crawler.on('skip', ({ url, reason, isExternal }) => {
                this.#archive
                    .setSkippedPage(url, reason, isExternal)
                    .catch((error) => reject(error));
            });
            this.#crawler.on('response', ({ resource }) => {
                this.#archive.setResources(resource).catch((error) => reject(error));
            });
            this.#crawler.on('responseReferrers', (resource) => {
                this.#archive.setResourcesReferrers(resource).catch((error) => reject(error));
            });
            this.#crawler.on('crawlEnd', () => {
                resolve();
            });
            if (this.#fromList) {
                this.#crawler.startMultiple(list);
            }
            else {
                this.#crawler.start(root);
            }
        });
    }
    /**
     * Kill any zombie Chromium processes that were not properly cleaned up.
     *
     * Retrieves the list of undead process IDs from the crawler and sends
     * a SIGTERM signal to each one. Chromium is intentionally sent SIGTERM
     * (not SIGKILL) to avoid leaving zombie processes.
     */
    garbageCollect() {
        const pidList = this.getUndeadPid();
        log('Undead PIDs: %O', pidList);
        for (const pid of pidList) {
            try {
                log('Garbage collect: kill PID:%d', pid);
                // Chromium becomes a zombie process if SIGKILL signal.
                // SIGTERM is Node's default; spelled out here to make the intent explicit.
                process.kill(pid, 'SIGTERM');
            }
            catch (error) {
                // The process may already have exited; log and continue with the rest.
                log('Garbage collect: Failed killing PID:%d %O', pid, error);
            }
        }
    }
    /**
     * Retrieve the list of process IDs for Chromium instances that are
     * still running after crawling has ended.
     * @returns An array of process IDs that should be terminated.
     */
    getUndeadPid() {
        return this.#crawler.getUndeadPid();
    }
    /**
     * Write the archive to its configured file path.
     *
     * Emits `writeFileStart` before writing and `writeFileEnd` after
     * the write completes successfully.
     */
    async write() {
        void this.emit('writeFileStart', { filePath: this.#archive.filePath });
        await this.#archive.write();
        void this.emit('writeFileEnd', { filePath: this.#archive.filePath });
    }
    /**
     * Create a new Qzilla instance and start crawling the given URLs.
     *
     * This is the primary factory method for starting a fresh crawl. It:
     * 1. Parses and sorts the input URLs
     * 2. Creates an archive file
     * 3. Saves the crawl configuration
     * 4. Runs the optional initialized callback
     * 5. Executes the crawl
     * 6. Sorts the archived URLs in natural order
     * @param url - One or more URL strings to crawl.
     * @param options - Optional configuration overrides for the crawl session.
     * @param initializedCallback - Optional callback invoked after initialization but before crawling starts.
     * @returns A promise that resolves to the Qzilla instance after crawling completes.
     * @throws {Error} If the URL list is empty or contains no valid URLs.
     */
    static async crawling(url, options, initializedCallback) {
        const list = sortUrl(url, options);
        const urlParsed = list[0];
        if (!urlParsed) {
            throw new Error('URL is empty');
        }
        const fileName = `${urlParsed.hostname}-${Archive.timestamp()}`;
        const cwd = options?.cwd ?? process.cwd();
        const filePath = Archive.joinPath(cwd, `${fileName}.${Archive.FILE_EXTENSION}`);
        const disableQueries = options?.disableQueries || false;
        const archive = await Archive.create({ filePath, cwd, disableQueries });
        await archive.setConfig({
            version: pkg.version,
            name: fileName,
            baseUrl: urlParsed.withoutHash,
            recursive: options?.recursive ?? true,
            fetchExternal: options?.fetchExternal ?? true,
            image: options?.image ?? true,
            interval: options?.interval || 0,
            parallels: options?.parallels || 0,
            scope: options?.scope ?? [],
            // @ts-ignore TODO: Fix CLI arguments
            excludes: optMultiParam(options?.exclude),
            // @ts-ignore TODO: Fix CLI arguments
            excludeKeywords: optMultiParam(options?.excludeKeyword),
            excludeUrls: [
                ...DEFAULT_EXCLUDED_EXTERNAL_URLS,
                // @ts-ignore TODO: Fix CLI arguments
                ...optMultiParam(options?.excludeUrl),
            ],
            depthOnAvoid: options?.depthOnAvoid || 10,
            fromList: !!options?.list,
            disableQueries,
        });
        const qzilla = new Qzilla(archive, options);
        const config = await archive.getConfig();
        if (initializedCallback) {
            await initializedCallback(qzilla, config);
        }
        log('Start crawling');
        log('URL %O', list.map((url) => url.href));
        log('Config %O', config);
        await qzilla.crawling(list);
        // Fixed log-message typo: was 'Crawling complated'.
        log('Crawling completed');
        log('Set order natural URL sort');
        await archive.setUrlOrder();
        log('Sorting done');
        return qzilla;
    }
    /**
     * Resume a previously interrupted crawl from an existing archive file.
     *
     * Restores the crawl state (pending URLs, scraped URLs, and resources)
     * from the archive, merges any option overrides, and continues crawling
     * from where it left off.
     * @param stubPath - Path to the existing archive file to resume from.
     * @param options - Optional configuration overrides to apply on top of the archived config.
     * @param initializedCallback - Optional callback invoked after initialization but before crawling resumes.
     * @returns A promise that resolves to the Qzilla instance after crawling completes.
     * @throws {Error} If the archived URL is invalid.
     */
    static async resume(stubPath, options, initializedCallback) {
        const archive = await Archive.resume(stubPath);
        const archivedConfig = await archive.getConfig();
        // Caller-supplied options win over the archived config; cleanObject
        // drops empty overrides so they do not clobber stored values.
        const config = {
            ...archivedConfig,
            ...cleanObject(options),
        };
        const qzilla = new Qzilla(archive, config);
        const _url = await archive.getUrl();
        const url = parseUrl(_url, config);
        if (!url) {
            throw new Error(`URL (${_url}) is invalid`);
        }
        const { scraped, pending } = await archive.getCrawlingState();
        const resources = await archive.getResourceUrlList();
        qzilla.#crawler.resume(pending, scraped, resources);
        if (initializedCallback) {
            await initializedCallback(qzilla, config);
        }
        log('Start resuming');
        log('Data %s', stubPath);
        log('URL %s', url.href);
        log('Config %O', config);
        await qzilla.crawling([url]);
        return qzilla;
    }
}
|
|
284
|
+
/**
 * Normalize an optional parameter that may be a single value, an array,
 * null, or undefined into a guaranteed array.
 * @param param - The parameter to normalize.
 * @returns An array containing the parameter value(s), or an empty array if absent.
 */
function optMultiParam(param) {
    if (Array.isArray(param)) {
        return param;
    }
    if (param) {
        return [param];
    }
    // Falsy scalars (undefined, null, '', 0, false) normalize to an empty list.
    return [];
}
|
package/lib/types.d.ts
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
import type { CrawlerError } from './utils/index.js';
|
|
2
|
+
/**
 * Event map for the `CrawlerOrchestrator` class.
 *
 * Each key represents an event name and its value is the payload type
 * passed to listeners subscribed via `on()` or `once()`.
 *
 * NOTE(review): `Qzilla#write` in `lib/qzilla.js` emits `writeFileStart`,
 * `writeFileEnd`, and `error` with these exact payload shapes — confirm
 * whether this map is shared between both classes.
 */
export interface CrawlEvent {
    /**
     * Emitted when the archive file write operation begins.
     */
    writeFileStart: {
        /** Absolute path of the archive file being written. */
        filePath: string;
    };
    /**
     * Emitted when the archive file write operation completes.
     */
    writeFileEnd: {
        /** Absolute path of the archive file that was written. */
        filePath: string;
    };
    /**
     * Emitted when an error occurs during crawling or archiving.
     */
    error: CrawlerError;
}
|
package/lib/types.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
 * Splits an array into chunks of the specified size and executes a callback
 * on each chunk in parallel using `Promise.all`.
 *
 * If any chunk callback rejects, the returned promise rejects (fail-fast
 * `Promise.all` semantics); remaining chunk callbacks still run to completion.
 * @template T - The element type of the array.
 * @param a - The array to split into chunks.
 * @param count - The maximum number of elements per chunk.
 * @param callback - A function to invoke on each chunk. May be synchronous or asynchronous.
 * @returns A promise that resolves when all chunk callbacks have completed.
 */
export declare function eachSplitted<T>(a: T[], count: number, callback: (items: T[]) => void | Promise<void>): Promise<void>;
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import { splitArray } from '@d-zero/shared/split-array';
|
|
2
|
+
/**
 * Splits an array into chunks of the specified size and executes a callback
 * on each chunk in parallel using `Promise.all`.
 * @template T - The element type of the array.
 * @param a - The array to split into chunks.
 * @param count - The maximum number of elements per chunk.
 * @param callback - A function to invoke on each chunk. May be synchronous or asynchronous.
 * @returns A promise that resolves when all chunk callbacks have completed.
 */
export async function eachSplitted(a, count, callback) {
    const chunks = splitArray(a, count);
    // Kick off every chunk callback eagerly, then wait for all of them.
    const pending = [];
    for (const chunk of chunks) {
        pending.push(callback(chunk));
    }
    await Promise.all(pending);
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export { eachSplitted } from './each-splitted.js';
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export { eachSplitted } from './each-splitted.js';
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export { delay } from '@d-zero/shared/delay';
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export { delay } from '@d-zero/shared/delay';
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
import debug from 'debug';
|
|
2
|
+
/** Root debug logger for the Nitpicker application. Namespace: `Nitpicker`. */
export declare const globalLog: debug.Debugger;
/**
 * Debug logger for the utils package. Namespace: `Nitpicker:Utils`.
 * Output is enabled at runtime via the `DEBUG` environment variable
 * (standard `debug` package behavior).
 */
export declare const log: debug.Debugger;
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
import debug from 'debug';
|
|
2
|
+
/** Root debug logger for the Nitpicker application. Namespace: `Nitpicker`. */
export const globalLog = debug('Nitpicker');
/**
 * Debug logger for the utils package. Namespace: `Nitpicker:Utils`.
 * Enabled at runtime via the `DEBUG` environment variable, e.g.
 * `DEBUG=Nitpicker:*` (standard `debug` package behavior).
 */
export const log = globalLog.extend('Utils');
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
/**
 * Error thrown when DOM evaluation (e.g., running scripts within a browser page context)
 * fails. This typically occurs during page scraping when JavaScript execution
 * in the browser context encounters an error.
 *
 * NOTE(review): the implementation does not override `Error.prototype.name`,
 * so instances report `name === 'Error'` — distinguish by `instanceof` instead.
 */
export declare class DOMEvaluationError extends Error {
}
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
/**
 * Error thrown when DOM evaluation (e.g., running scripts within a browser page context)
 * fails. This typically occurs during page scraping when JavaScript execution
 * in the browser context encounters an error.
 */
export class DOMEvaluationError extends Error {
    // Fix: without an explicit name, instances report the generic "Error",
    // so logs and `err.name`-based filters cannot distinguish DOM-evaluation
    // failures from any other Error. Constructor arguments are still
    // forwarded unchanged via the inherited Error constructor.
    name = 'DOMEvaluationError';
}
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import type { TypedAwaitEventEmitter as EventEmitter } from '@d-zero/shared/typed-await-event-emitter';
/**
 * Event payload type for error events emitted by classes using the {@link ErrorEmitter} decorator.
 * @template E - The specific error type, defaults to `Error`.
 */
export type ErrorEvent<E extends Error = Error> = {
    /** The error instance that was caught. */
    error: E;
};
// NOTE(review): the loose `Function` and `Promise<any>` types below mirror the
// emitted implementation; tightening them must happen in the source package.
/**
 * A class method decorator factory that wraps the decorated method with error handling.
 * When the method throws an `Error`, it emits an `'error'` event on the class instance
 * (which must extend {@link EventEmitter}) with the caught error, then re-throws the error.
 * @template C - The class type, which must be an EventEmitter capable of emitting error events.
 * @template E - The error event type, defaults to {@link ErrorEvent}.
 * @returns A decorator function that wraps the target method with error-emitting behavior.
 */
export declare function ErrorEmitter<C extends EventEmitter<E>, E extends ErrorEvent = ErrorEvent>(): (method: Function, context: ClassMethodDecoratorContext) => (this: C, ...args: unknown[]) => Promise<any>;
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import { log } from '../debug.js';
const errorLog = log.extend('ErrorEmitter');
/**
 * A class method decorator factory that wraps the decorated method with error handling.
 * When the method throws an `Error`, it emits an `'error'` event on the class instance
 * (which must extend {@link EventEmitter}) with the caught error, then re-throws the error.
 * @template C - The class type, which must be an EventEmitter capable of emitting error events.
 * @template E - The error event type, defaults to {@link ErrorEvent}.
 * @returns A decorator function that wraps the target method with error-emitting behavior.
 */
export function ErrorEmitter() {
    // eslint-disable-next-line @typescript-eslint/no-unsafe-function-type
    return (method, context) => {
        return async function (...args) {
            // Best-effort "ClassName.methodName" label for the debug log line;
            // falls back to the constructor or the instance itself when the
            // constructor has no usable name.
            const owner = String(this.constructor?.name || this.constructor || this);
            const label = `${owner}.${String(context.name)}`;
            try {
                return await method.apply(this, args);
            }
            catch (error) {
                // Only genuine Error instances are logged and broadcast as an
                // 'error' event; any other thrown value is re-thrown untouched.
                if (error instanceof Error) {
                    errorLog('%s: %O', label, error);
                    void this.emit('error', error);
                }
                // Always re-throw so callers still observe the failure.
                throw error;
            }
        };
    };
}
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
/**
 * A typed, async-capable event emitter re-exported from `@d-zero/shared`.
 * Provides type-safe `emit`, `on`, and `off` methods where event names
 * and their payload types are enforced at compile time.
 * Exposed here under the shorter local alias `EventEmitter`.
 */
export { TypedAwaitEventEmitter as EventEmitter } from '@d-zero/shared/typed-await-event-emitter';
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
/**
 * A typed, async-capable event emitter re-exported from `@d-zero/shared`.
 * Provides type-safe `emit`, `on`, and `off` methods where event names
 * and their payload types are enforced at compile time.
 * Exposed here under the shorter local alias `EventEmitter`.
 */
export { TypedAwaitEventEmitter as EventEmitter } from '@d-zero/shared/typed-await-event-emitter';
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
// Re-export of `isError` from @nitpicker/beholder (presumably an Error type
// guard — confirm against the beholder package).
export { isError } from '@nitpicker/beholder';
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
// Re-export of `isError` from @nitpicker/beholder (presumably an Error type
// guard — confirm against the beholder package).
export { isError } from '@nitpicker/beholder';
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
/**
 * Creates a shallow copy of an object with all `undefined`-valued properties removed.
 * If the input is falsy (e.g., `undefined` or `null`), returns an empty object.
 * Note: `null`-valued properties are kept — only `undefined` is stripped.
 * @template T - The type of the input object.
 * @param obj - The object to clean. If falsy, an empty `Partial<T>` is returned.
 * @returns A new object containing only the properties whose values are not `undefined`.
 * @example
 * cleanObject({ a: 1, b: undefined }); // { a: 1 }
 */
export declare function cleanObject<T extends Record<string, unknown>>(obj?: T): Partial<T>;
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
 * Creates a shallow copy of an object with all `undefined`-valued properties removed.
 * If the input is falsy (e.g., `undefined` or `null`), returns an empty object.
 * Note: `null`-valued properties are kept — only `undefined` is stripped.
 * @template T - The type of the input object.
 * @param obj - The object to clean. If falsy, an empty `Partial<T>` is returned.
 * @returns A new object containing only the properties whose values are not `undefined`.
 */
export function cleanObject(obj) {
    const cleaned = {};
    if (!obj) {
        return cleaned;
    }
    // Copy own enumerable entries, skipping those whose value is undefined;
    // insertion order of the remaining keys is preserved.
    for (const [key, value] of Object.entries(obj)) {
        if (value !== undefined) {
            cleaned[key] = value;
        }
    }
    return cleaned;
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
// Declaration barrel for the object utilities (currently only `cleanObject`).
export * from './clean-object.js';
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
// Barrel for the object utilities (currently only `cleanObject`).
export * from './clean-object.js';
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
// Re-export of the filename sanitizer from the shared workspace package.
export { safeFilePath } from '@d-zero/shared/safe-filepath';
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
// Re-export of the filename sanitizer from the shared workspace package.
export { safeFilePath } from '@d-zero/shared/safe-filepath';
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
/**
 * Sanitizes a file path by decoding URI-encoded characters and replacing
 * any characters that are unsafe for use in filenames with underscores.
 * @param filePath - The raw file path string (possibly URI-encoded) to sanitize.
 * @returns A sanitized file path string that is safe for use as a filename.
 */
export declare function safeFilePath(filePath: string): string;
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import sanitize from 'sanitize-filename';
/**
 * Sanitizes a file path by decoding URI-encoded characters and replacing
 * any characters that are unsafe for use in filenames with underscores.
 * @param filePath - The raw file path string (possibly URI-encoded) to sanitize.
 * @returns A sanitized file path string that is safe for use as a filename.
 */
export function safeFilePath(filePath) {
    // Fix: decodeURI throws a URIError on malformed percent-sequences
    // (e.g. a lone "%" or "%E0%A4%A"). Fall back to the raw string so
    // sanitization still yields a usable filename instead of crashing.
    let decoded;
    try {
        decoded = decodeURI(filePath);
    }
    catch {
        decoded = filePath;
    }
    return sanitize(decoded, {
        replacement: '_',
    });
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
// Re-export of the string-to-RegExp converter from the shared workspace package.
export { strToRegex } from '@d-zero/shared/str-to-regex';
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
// Re-export of the string-to-RegExp converter from the shared workspace package.
export { strToRegex } from '@d-zero/shared/str-to-regex';
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
// Re-export of the shared `retry` helper under the local alias `retryable`.
export { retry as retryable } from '@d-zero/shared/retry';
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import type { ExURL, ParseURLOptions } from '@d-zero/shared/parse-url';
/**
 * Compares two URLs using a natural sorting algorithm. The comparison order is:
 * hostname, path directories, basename (with index files prioritized), file extension,
 * query string, hash, protocol, and finally the original URL string as a tiebreaker.
 * Numeric segments within path components are compared numerically rather than
 * lexicographically.
 * Also returns `0` when either input is a string that fails to parse.
 * @param url1 - The first URL string or ExURL to compare.
 * @param url2 - The second URL string or ExURL to compare.
 * @param options - Optional URL parsing options.
 * @returns `0` if the URLs are equal, `-1` if url1 should come before url2,
 * or `1` if url1 should come after url2.
 */
export declare function naturalURLSort(url1: string | ExURL, url2: string | ExURL, options?: ParseURLOptions): 0 | -1 | 1;
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
import { alphabeticalComparator } from '@d-zero/shared/sort/alphabetical';
import { dirComparator } from '@d-zero/shared/sort/dir';
import { numericalComparator } from '@d-zero/shared/sort/numerical';
import { parseUrl } from '../url/index.js';
/**
 * Compares two URLs using a natural sorting algorithm. The comparison order is:
 * hostname, path directories, basename (with index files prioritized), file extension,
 * query string, hash, protocol, and finally the original URL string as a tiebreaker.
 * Numeric segments within path components are compared numerically rather than
 * lexicographically.
 * @param url1 - The first URL string or ExURL to compare.
 * @param url2 - The second URL string or ExURL to compare.
 * @param options - Optional URL parsing options.
 * @returns `0` if the URLs are equal, `-1` if url1 should come before url2,
 * or `1` if url1 should come after url2.
 */
export function naturalURLSort(url1, url2, options) {
    const a = typeof url1 === 'string' ? parseUrl(url1, options) : url1;
    const b = typeof url2 === 'string' ? parseUrl(url2, options) : url2;
    // An unparseable URL cannot be ordered; treat the pair as equal.
    if (!a || !b) {
        return 0;
    }
    // Identical resolved URLs: fall back to the raw input strings.
    if (a.href === b.href) {
        return alphabeticalComparator(a._originUrlString, b._originUrlString);
    }
    // Ordered comparison chain; thunks keep evaluation lazy so each
    // comparator only runs when every earlier one reported a tie (0).
    const steps = [
        () => alphabeticalComparator(a.hostname, b.hostname),
        () => dirComparator(a.paths, b.paths),
        () => {
            if (a.basename === b.basename) {
                return 0;
            }
            // Index files sort ahead of any sibling basename.
            if (a.isIndex) {
                return -1;
            }
            if (b.isIndex) {
                return 1;
            }
            return numericalComparator(a.basename, b.basename);
        },
        () => numericalComparator(a.extname, b.extname),
        () => numericalComparator(a.query, b.query),
        () => numericalComparator(a.hash, b.hash),
        () => alphabeticalComparator(a.protocol, b.protocol),
    ];
    for (const step of steps) {
        const result = step();
        if (result) {
            return result;
        }
    }
    // Final tiebreaker: the original input strings.
    return numericalComparator(a._originUrlString, b._originUrlString);
}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
/**
 * Strips the common leading prefix from two strings (case-insensitive comparison).
 * Returns a tuple of the remaining suffixes after the shared prefix is removed.
 * If the strings are identical (ignoring case), returns `['', '']`.
 * NOTE(review): implementation not visible here — confirm whether the returned
 * suffixes preserve the original casing of their inputs.
 * @param t1 - The first string.
 * @param t2 - The second string.
 * @returns A tuple of the two strings with their common leading characters removed.
 */
export declare function removeMatches(t1: string, t2: string): [string, string];
|