@nitpicker/crawler 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +8 -0
- package/LICENSE +191 -0
- package/README.md +13 -0
- package/lib/archive/archive-accessor.d.ts +107 -0
- package/lib/archive/archive-accessor.js +264 -0
- package/lib/archive/archive.d.ts +174 -0
- package/lib/archive/archive.js +331 -0
- package/lib/archive/database.d.ts +207 -0
- package/lib/archive/database.js +972 -0
- package/lib/archive/debug.d.ts +8 -0
- package/lib/archive/debug.js +9 -0
- package/lib/archive/filesystem/append-text.d.ts +9 -0
- package/lib/archive/filesystem/append-text.js +14 -0
- package/lib/archive/filesystem/copy-dir-sync.d.ts +6 -0
- package/lib/archive/filesystem/copy-dir-sync.js +9 -0
- package/lib/archive/filesystem/copy-dir.d.ts +7 -0
- package/lib/archive/filesystem/copy-dir.js +13 -0
- package/lib/archive/filesystem/exists.d.ts +6 -0
- package/lib/archive/filesystem/exists.js +9 -0
- package/lib/archive/filesystem/get-file-list.d.ts +8 -0
- package/lib/archive/filesystem/get-file-list.js +12 -0
- package/lib/archive/filesystem/index.d.ts +17 -0
- package/lib/archive/filesystem/index.js +17 -0
- package/lib/archive/filesystem/is-dir.d.ts +6 -0
- package/lib/archive/filesystem/is-dir.js +10 -0
- package/lib/archive/filesystem/mkdir.d.ts +8 -0
- package/lib/archive/filesystem/mkdir.js +15 -0
- package/lib/archive/filesystem/output-json.d.ts +9 -0
- package/lib/archive/filesystem/output-json.js +14 -0
- package/lib/archive/filesystem/output-text.d.ts +11 -0
- package/lib/archive/filesystem/output-text.js +32 -0
- package/lib/archive/filesystem/read-json.d.ts +7 -0
- package/lib/archive/filesystem/read-json.js +11 -0
- package/lib/archive/filesystem/read-text.d.ts +6 -0
- package/lib/archive/filesystem/read-text.js +10 -0
- package/lib/archive/filesystem/readline.d.ts +11 -0
- package/lib/archive/filesystem/readline.js +26 -0
- package/lib/archive/filesystem/remove.d.ts +5 -0
- package/lib/archive/filesystem/remove.js +10 -0
- package/lib/archive/filesystem/rename.d.ts +11 -0
- package/lib/archive/filesystem/rename.js +18 -0
- package/lib/archive/filesystem/tar.d.ts +11 -0
- package/lib/archive/filesystem/tar.js +22 -0
- package/lib/archive/filesystem/untar.d.ts +20 -0
- package/lib/archive/filesystem/untar.js +24 -0
- package/lib/archive/filesystem/utils.d.ts +109 -0
- package/lib/archive/filesystem/utils.js +185 -0
- package/lib/archive/filesystem/zip.d.ts +29 -0
- package/lib/archive/filesystem/zip.js +53 -0
- package/lib/archive/index.d.ts +6 -0
- package/lib/archive/index.js +11 -0
- package/lib/archive/page.d.ts +263 -0
- package/lib/archive/page.js +316 -0
- package/lib/archive/resource.d.ts +46 -0
- package/lib/archive/resource.js +62 -0
- package/lib/archive/safe-path.d.ts +9 -0
- package/lib/archive/safe-path.js +17 -0
- package/lib/archive/types.d.ts +210 -0
- package/lib/archive/types.js +1 -0
- package/lib/crawler/clear-destination-cache.d.ts +5 -0
- package/lib/crawler/clear-destination-cache.js +8 -0
- package/lib/crawler/crawler.d.ts +73 -0
- package/lib/crawler/crawler.js +748 -0
- package/lib/crawler/decompose-url.d.ts +25 -0
- package/lib/crawler/decompose-url.js +71 -0
- package/lib/crawler/destination-cache.d.ts +7 -0
- package/lib/crawler/destination-cache.js +6 -0
- package/lib/crawler/detect-pagination-pattern.d.ts +16 -0
- package/lib/crawler/detect-pagination-pattern.js +61 -0
- package/lib/crawler/fetch-destination.d.ts +38 -0
- package/lib/crawler/fetch-destination.js +208 -0
- package/lib/crawler/fetch-robots-txt.d.ts +42 -0
- package/lib/crawler/fetch-robots-txt.js +44 -0
- package/lib/crawler/find-best-matching-scope.d.ts +12 -0
- package/lib/crawler/find-best-matching-scope.js +46 -0
- package/lib/crawler/generate-predicted-urls.d.ts +13 -0
- package/lib/crawler/generate-predicted-urls.js +27 -0
- package/lib/crawler/handle-ignore-and-skip.d.ts +16 -0
- package/lib/crawler/handle-ignore-and-skip.js +19 -0
- package/lib/crawler/handle-resource-response.d.ts +13 -0
- package/lib/crawler/handle-resource-response.js +16 -0
- package/lib/crawler/handle-scrape-end.d.ts +24 -0
- package/lib/crawler/handle-scrape-end.js +82 -0
- package/lib/crawler/handle-scrape-error.d.ts +37 -0
- package/lib/crawler/handle-scrape-error.js +38 -0
- package/lib/crawler/index.d.ts +2 -0
- package/lib/crawler/index.js +2 -0
- package/lib/crawler/inject-scope-auth.d.ts +11 -0
- package/lib/crawler/inject-scope-auth.js +21 -0
- package/lib/crawler/is-external-url.d.ts +11 -0
- package/lib/crawler/is-external-url.js +12 -0
- package/lib/crawler/is-in-any-lower-layer.d.ts +13 -0
- package/lib/crawler/is-in-any-lower-layer.js +15 -0
- package/lib/crawler/link-list.d.ts +112 -0
- package/lib/crawler/link-list.js +248 -0
- package/lib/crawler/link-to-page-data.d.ts +14 -0
- package/lib/crawler/link-to-page-data.js +32 -0
- package/lib/crawler/net-timeout-error.d.ts +9 -0
- package/lib/crawler/net-timeout-error.js +11 -0
- package/lib/crawler/network.d.ts +30 -0
- package/lib/crawler/network.js +226 -0
- package/lib/crawler/protocol-agnostic-key.d.ts +9 -0
- package/lib/crawler/protocol-agnostic-key.js +11 -0
- package/lib/crawler/reconstruct-url.d.ts +10 -0
- package/lib/crawler/reconstruct-url.js +28 -0
- package/lib/crawler/result-handler.d.ts +118 -0
- package/lib/crawler/result-handler.js +153 -0
- package/lib/crawler/robots-checker.d.ts +26 -0
- package/lib/crawler/robots-checker.js +62 -0
- package/lib/crawler/should-discard-predicted.d.ts +14 -0
- package/lib/crawler/should-discard-predicted.js +31 -0
- package/lib/crawler/should-skip-url.d.ts +23 -0
- package/lib/crawler/should-skip-url.js +15 -0
- package/lib/crawler/speculative-pagination.d.ts +52 -0
- package/lib/crawler/speculative-pagination.js +215 -0
- package/lib/crawler/types.d.ts +119 -0
- package/lib/crawler/types.js +1 -0
- package/lib/crawler/url-filter.d.ts +56 -0
- package/lib/crawler/url-filter.js +110 -0
- package/lib/crawler-orchestrator.d.ts +142 -0
- package/lib/crawler-orchestrator.js +309 -0
- package/lib/debug.d.ts +8 -0
- package/lib/debug.js +9 -0
- package/lib/index.d.ts +16 -0
- package/lib/index.js +18 -0
- package/lib/qzilla.d.ts +136 -0
- package/lib/qzilla.js +292 -0
- package/lib/types.d.ts +27 -0
- package/lib/types.js +1 -0
- package/lib/utils/array/each-splitted.d.ts +10 -0
- package/lib/utils/array/each-splitted.js +14 -0
- package/lib/utils/array/index.d.ts +1 -0
- package/lib/utils/array/index.js +1 -0
- package/lib/utils/async/index.d.ts +1 -0
- package/lib/utils/async/index.js +1 -0
- package/lib/utils/debug.d.ts +5 -0
- package/lib/utils/debug.js +5 -0
- package/lib/utils/error/dom-evaluation-error.d.ts +7 -0
- package/lib/utils/error/dom-evaluation-error.js +7 -0
- package/lib/utils/error/error-emitter.d.ts +18 -0
- package/lib/utils/error/error-emitter.js +29 -0
- package/lib/utils/error/index.d.ts +3 -0
- package/lib/utils/error/index.js +2 -0
- package/lib/utils/event-emitter/index.d.ts +6 -0
- package/lib/utils/event-emitter/index.js +6 -0
- package/lib/utils/index.d.ts +5 -0
- package/lib/utils/index.js +5 -0
- package/lib/utils/network/index.d.ts +1 -0
- package/lib/utils/network/index.js +1 -0
- package/lib/utils/object/clean-object.d.ts +8 -0
- package/lib/utils/object/clean-object.js +13 -0
- package/lib/utils/object/index.d.ts +1 -0
- package/lib/utils/object/index.js +1 -0
- package/lib/utils/path/index.d.ts +1 -0
- package/lib/utils/path/index.js +1 -0
- package/lib/utils/path/safe-filepath.d.ts +7 -0
- package/lib/utils/path/safe-filepath.js +12 -0
- package/lib/utils/regexp/index.d.ts +1 -0
- package/lib/utils/regexp/index.js +1 -0
- package/lib/utils/retryable/index.d.ts +2 -0
- package/lib/utils/retryable/index.js +1 -0
- package/lib/utils/sort/index.d.ts +14 -0
- package/lib/utils/sort/index.js +61 -0
- package/lib/utils/sort/remove-matches.d.ts +9 -0
- package/lib/utils/sort/remove-matches.js +23 -0
- package/lib/utils/types/index.d.ts +1 -0
- package/lib/utils/types/index.js +1 -0
- package/lib/utils/types/types.d.ts +46 -0
- package/lib/utils/types/types.js +1 -0
- package/lib/utils/url/index.d.ts +5 -0
- package/lib/utils/url/index.js +5 -0
- package/lib/utils/url/is-lower-layer.d.ts +15 -0
- package/lib/utils/url/is-lower-layer.js +55 -0
- package/lib/utils/url/parse-url.d.ts +11 -0
- package/lib/utils/url/parse-url.js +20 -0
- package/lib/utils/url/path-match.d.ts +11 -0
- package/lib/utils/url/path-match.js +18 -0
- package/lib/utils/url/sort-url.d.ts +10 -0
- package/lib/utils/url/sort-url.js +24 -0
- package/lib/utils/url/url-partial-match.d.ts +11 -0
- package/lib/utils/url/url-partial-match.js +32 -0
- package/package.json +49 -0
- package/src/archive/__mock__/.gitignore +3 -0
- package/src/archive/__mock__/mock.sqlite +0 -0
- package/src/archive/archive-accessor.ts +337 -0
- package/src/archive/archive.ts +408 -0
- package/src/archive/database.spec.ts +469 -0
- package/src/archive/database.ts +1059 -0
- package/src/archive/debug.ts +10 -0
- package/src/archive/filesystem/append-text.spec.ts +26 -0
- package/src/archive/filesystem/append-text.ts +16 -0
- package/src/archive/filesystem/copy-dir-sync.spec.ts +27 -0
- package/src/archive/filesystem/copy-dir-sync.ts +10 -0
- package/src/archive/filesystem/copy-dir.spec.ts +33 -0
- package/src/archive/filesystem/copy-dir.ts +14 -0
- package/src/archive/filesystem/exists.spec.ts +33 -0
- package/src/archive/filesystem/exists.ts +10 -0
- package/src/archive/filesystem/get-file-list.spec.ts +37 -0
- package/src/archive/filesystem/get-file-list.ts +13 -0
- package/src/archive/filesystem/index.ts +17 -0
- package/src/archive/filesystem/is-dir.spec.ts +29 -0
- package/src/archive/filesystem/is-dir.ts +11 -0
- package/src/archive/filesystem/mkdir.spec.ts +37 -0
- package/src/archive/filesystem/mkdir.ts +16 -0
- package/src/archive/filesystem/output-json.spec.ts +34 -0
- package/src/archive/filesystem/output-json.ts +16 -0
- package/src/archive/filesystem/output-text.spec.ts +31 -0
- package/src/archive/filesystem/output-text.ts +35 -0
- package/src/archive/filesystem/read-json.spec.ts +26 -0
- package/src/archive/filesystem/read-json.ts +12 -0
- package/src/archive/filesystem/read-text.spec.ts +25 -0
- package/src/archive/filesystem/read-text.ts +11 -0
- package/src/archive/filesystem/readline.spec.ts +29 -0
- package/src/archive/filesystem/readline.ts +30 -0
- package/src/archive/filesystem/remove.spec.ts +34 -0
- package/src/archive/filesystem/remove.ts +11 -0
- package/src/archive/filesystem/rename.spec.ts +46 -0
- package/src/archive/filesystem/rename.ts +21 -0
- package/src/archive/filesystem/tar.spec.ts +33 -0
- package/src/archive/filesystem/tar.ts +27 -0
- package/src/archive/filesystem/untar.spec.ts +34 -0
- package/src/archive/filesystem/untar.ts +36 -0
- package/src/archive/index.ts +13 -0
- package/src/archive/page.spec.ts +368 -0
- package/src/archive/page.ts +420 -0
- package/src/archive/resource.spec.ts +101 -0
- package/src/archive/resource.ts +73 -0
- package/src/archive/safe-path.spec.ts +44 -0
- package/src/archive/safe-path.ts +18 -0
- package/src/archive/types.ts +227 -0
- package/src/crawler/clear-destination-cache.spec.ts +20 -0
- package/src/crawler/clear-destination-cache.ts +9 -0
- package/src/crawler/crawler.ts +873 -0
- package/src/crawler/decompose-url.spec.ts +48 -0
- package/src/crawler/decompose-url.ts +90 -0
- package/src/crawler/destination-cache.spec.ts +23 -0
- package/src/crawler/destination-cache.ts +8 -0
- package/src/crawler/detect-pagination-pattern.spec.ts +169 -0
- package/src/crawler/detect-pagination-pattern.ts +66 -0
- package/src/crawler/fetch-destination.ts +257 -0
- package/src/crawler/fetch-robots-txt.spec.ts +83 -0
- package/src/crawler/fetch-robots-txt.ts +91 -0
- package/src/crawler/find-best-matching-scope.spec.ts +39 -0
- package/src/crawler/find-best-matching-scope.ts +57 -0
- package/src/crawler/generate-predicted-urls.spec.ts +42 -0
- package/src/crawler/generate-predicted-urls.ts +34 -0
- package/src/crawler/handle-ignore-and-skip.spec.ts +66 -0
- package/src/crawler/handle-ignore-and-skip.ts +30 -0
- package/src/crawler/handle-resource-response.spec.ts +45 -0
- package/src/crawler/handle-resource-response.ts +21 -0
- package/src/crawler/handle-scrape-end.spec.ts +109 -0
- package/src/crawler/handle-scrape-end.ts +115 -0
- package/src/crawler/handle-scrape-error.spec.ts +105 -0
- package/src/crawler/handle-scrape-error.ts +58 -0
- package/src/crawler/index.ts +2 -0
- package/src/crawler/inject-scope-auth.spec.ts +36 -0
- package/src/crawler/inject-scope-auth.ts +27 -0
- package/src/crawler/is-external-url.spec.ts +31 -0
- package/src/crawler/is-external-url.ts +17 -0
- package/src/crawler/is-in-any-lower-layer.spec.ts +31 -0
- package/src/crawler/is-in-any-lower-layer.ts +22 -0
- package/src/crawler/link-list.spec.ts +355 -0
- package/src/crawler/link-list.ts +275 -0
- package/src/crawler/link-to-page-data.spec.ts +133 -0
- package/src/crawler/link-to-page-data.ts +34 -0
- package/src/crawler/net-timeout-error.spec.ts +25 -0
- package/src/crawler/net-timeout-error.ts +11 -0
- package/src/crawler/protocol-agnostic-key.spec.ts +40 -0
- package/src/crawler/protocol-agnostic-key.ts +11 -0
- package/src/crawler/reconstruct-url.spec.ts +37 -0
- package/src/crawler/reconstruct-url.ts +37 -0
- package/src/crawler/robots-checker.spec.ts +104 -0
- package/src/crawler/robots-checker.ts +73 -0
- package/src/crawler/should-discard-predicted.spec.ts +125 -0
- package/src/crawler/should-discard-predicted.ts +33 -0
- package/src/crawler/should-skip-url.spec.ts +77 -0
- package/src/crawler/should-skip-url.ts +37 -0
- package/src/crawler/types.ts +146 -0
- package/src/crawler-orchestrator.ts +401 -0
- package/src/debug.ts +10 -0
- package/src/index.ts +25 -0
- package/src/types.ts +30 -0
- package/src/utils/array/each-splitted.spec.ts +38 -0
- package/src/utils/array/each-splitted.ts +19 -0
- package/src/utils/array/index.ts +1 -0
- package/src/utils/debug.ts +6 -0
- package/src/utils/error/dom-evaluation-error.spec.ts +20 -0
- package/src/utils/error/dom-evaluation-error.ts +6 -0
- package/src/utils/error/error-emitter.spec.ts +78 -0
- package/src/utils/error/error-emitter.ts +44 -0
- package/src/utils/error/index.ts +3 -0
- package/src/utils/index.ts +5 -0
- package/src/utils/object/clean-object.spec.ts +24 -0
- package/src/utils/object/clean-object.ts +13 -0
- package/src/utils/object/index.ts +1 -0
- package/src/utils/types/index.ts +1 -0
- package/src/utils/types/types.ts +65 -0
- package/tsconfig.json +11 -0
- package/tsconfig.tsbuildinfo +1 -0
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
import { createReadStream, existsSync, promises as fs, mkdirSync } from 'node:fs';
|
|
2
|
+
import path from 'node:path';
|
|
3
|
+
import Readline from 'node:readline';
|
|
4
|
+
import fsx from 'fs-extra';
|
|
5
|
+
/**
 * Serializes a value as JSON and writes it to a file.
 *
 * The parent directory is created first via `mkdir`. The JSON text is
 * pretty-printed with an indentation of 2 and written as UTF-8.
 * @param filePath - Path of the JSON file to write (absolute or relative).
 * @param data - The value handed to `JSON.stringify`.
 */
export async function outputJSON(filePath, data) {
	// Directory creation happens before serialization, so the directory exists
	// even if `JSON.stringify` throws.
	mkdir(filePath);
	const serialized = JSON.stringify(data, null, 2);
	await fs.writeFile(filePath, serialized, { encoding: 'utf8' });
}
|
|
17
|
+
/**
 * Loads a file as UTF-8 text and parses it as JSON.
 * @template T - The type the caller expects the parsed content to have. Defaults to `unknown`.
 * @param filePath - Path of the JSON file to read (absolute or relative).
 * @returns The value produced by `JSON.parse`; no validation is performed.
 */
export async function readJSON(filePath) {
	const text = await fs.readFile(filePath, { encoding: 'utf8' });
	const parsed = JSON.parse(text);
	return parsed;
}
|
|
27
|
+
/** Counter used to generate unique fallback file names for over-long paths. */
let filePathTooLongCount = 0;
/**
 * Writes text data to a file at the specified path.
 *
 * Creates parent directories if they do not exist.
 * If the write fails with `ENAMETOOLONG`, the data is saved in the same
 * directory under an auto-generated short name (keeping the original
 * extension), and an accompanying `.meta.txt` file records the original
 * file path.
 *
 * Any other write error is rethrown so that callers never mistake a failed
 * write for a successful one. Errors raised while writing the fallback files
 * are propagated as well; the fallback is attempted only once.
 * @param filePath - The absolute or relative path to the text file to write.
 * @param data - The text content to write to the file.
 */
export async function outputText(filePath, data) {
	mkdir(filePath);
	try {
		await fs.writeFile(filePath, data, { encoding: 'utf8' });
		return;
	} catch (error) {
		const isNameTooLong = error instanceof Error && 'code' in error && error.code === 'ENAMETOOLONG';
		if (!isNameTooLong) {
			// Permission errors, a full disk, etc. must reach the caller.
			throw error;
		}
	}
	// eslint-disable-next-line no-console
	console.error(`File path too long: ${filePath}`);
	const dir = path.dirname(filePath);
	const altFileName = `__file_path_too_long_${(filePathTooLongCount++).toString().padStart(4, '0')}`;
	const ext = path.extname(filePath);
	const altFilePath = path.resolve(dir, `${altFileName}${ext}`);
	// eslint-disable-next-line no-console
	console.error(`Try to save to: ${altFilePath}`);
	const altMetaFilePath = path.resolve(dir, `${altFileName}.meta.txt`);
	// The parent directory already exists (created above), so write directly
	// instead of recursing; a second failure surfaces instead of looping.
	await fs.writeFile(altFilePath, data, { encoding: 'utf8' });
	await fs.writeFile(altMetaFilePath, `Original file path: ${filePath}`, { encoding: 'utf8' });
}
|
|
56
|
+
/**
 * Appends a line of text to a file.
 *
 * The parent directory is created first via `mkdir`. The text is written
 * as UTF-8 with a leading newline character, so each call starts a new line.
 * @param filePath - Path of the file to append to (absolute or relative).
 * @param data - The text to append.
 */
export async function appendText(filePath, data) {
	mkdir(filePath);
	const chunk = `\n${data}`;
	await fs.appendFile(filePath, chunk, { encoding: 'utf8' });
}
|
|
68
|
+
/**
 * Loads the whole content of a file as a UTF-8 string.
 * @param filePath - Path of the text file to read (absolute or relative).
 * @returns The decoded file content.
 */
export async function readText(filePath) {
	const readOptions = { encoding: 'utf8' };
	const text = await fs.readFile(filePath, readOptions);
	return text;
}
|
|
77
|
+
/**
 * Copies a directory tree to another location using fs-extra's `copy`.
 *
 * Failures are not thrown: any error from the copy is converted into a
 * `false` return value.
 * @param from - The source directory path.
 * @param to - The destination directory path.
 * @returns `true` when the copy completed, `false` when it failed.
 */
export async function copyDir(from, to) {
	try {
		await fsx.copy(from, to);
	} catch {
		return false;
	}
	return true;
}
|
|
89
|
+
/**
 * Blocking variant of the directory copy, delegating to fs-extra's `copySync`.
 *
 * Unlike `copyDir`, errors are not caught here and propagate to the caller.
 * @param from - The source directory path.
 * @param to - The destination directory path.
 */
export function copyDirSync(from, to) {
	fsx.copySync(from, to);
}
|
|
97
|
+
/**
 * Tells whether the given path refers to a directory.
 *
 * The path is inspected with fs-extra's `stat`; any error raised by that
 * call (presumably including a non-existent path — confirm against callers)
 * is propagated rather than turned into `false`.
 * @param dirPath - The path to inspect.
 * @returns `true` if the stat result reports a directory, `false` otherwise.
 */
export async function isDir(dirPath) {
	const info = await fsx.stat(dirPath);
	const directory = info.isDirectory();
	return directory;
}
|
|
106
|
+
/**
 * Deletes the file or directory at the given path, including any contents.
 *
 * Uses `fs.rm` with `recursive: true` only, so errors from `fs.rm` are propagated.
 * @param dirPath - The path of the file or directory to delete.
 */
export async function remove(dirPath) {
	const rmOptions = { recursive: true };
	await fs.rm(dirPath, rmOptions);
}
|
|
115
|
+
/**
 * Moves a file or directory to a new path.
 *
 * When `override` is `true` and something already exists at `newPath`,
 * that destination is removed (recursively) before the move is performed.
 * When `override` is `false`, the destination is not checked or removed.
 * @param oldPath - The current path of the file or directory.
 * @param newPath - The path to move it to.
 * @param override - Whether to clear an existing destination first. Defaults to `false`.
 */
export async function rename(oldPath, newPath, override = false) {
	// The existence check only runs when overriding was requested.
	const mustClearDestination = override && exists(newPath);
	if (mustClearDestination) {
		await remove(newPath);
	}
	await fs.rename(oldPath, newPath);
}
|
|
130
|
+
/**
 * Returns the names of the entries in a directory, optionally filtered.
 *
 * Entries come from fs-extra's `readdir`. When a filter is given, each name
 * is tested with `String.prototype.match` and kept only if it matches.
 * @param dirPath - The directory to list.
 * @param filter - Optional RegExp or string pattern applied to each entry name.
 * @returns The matching entry names, or every entry name when no filter is given.
 */
export async function getFileList(dirPath, filter) {
	const entries = await fsx.readdir(dirPath);
	if (!filter) {
		return entries;
	}
	const matched = [];
	for (const entryName of entries) {
		if (entryName.match(filter)) {
			matched.push(entryName);
		}
	}
	return matched;
}
|
|
141
|
+
/**
 * Reads a file line by line and invokes the callback for each line.
 *
 * The callback may return a Promise for asynchronous processing.
 * All callback results are collected and awaited via `Promise.all` before returning.
 *
 * Errors are reported through the returned promise: a failure of the
 * underlying read stream (e.g. the file cannot be opened), an error emitted
 * by the readline interface, or a callback that throws synchronously all
 * cause a rejection instead of an unhandled `error` event.
 * @param filePath - The path to the file to read line by line.
 * @param callback - A function invoked for each line of the file.
 * May return a Promise for asynchronous operations.
 * @returns A promise that resolves with the callback results, in line order,
 * once the file has been read and all callbacks have completed.
 */
export async function readline(filePath, callback) {
	const stream = createReadStream(filePath);
	const rLine = Readline.createInterface(stream);
	const promiseBuffer = [];
	await new Promise((resolve, reject) => {
		let failed = false;
		const fail = (error) => {
			if (failed) {
				return;
			}
			failed = true;
			// Reject first: closing the interface emits 'close', which would
			// otherwise settle the promise as resolved.
			reject(error);
			// Results collected so far will never be awaited; mark them handled
			// so a later rejection does not surface as an unhandled rejection.
			for (const pending of promiseBuffer) {
				Promise.resolve(pending).catch(() => {});
			}
			rLine.close();
			stream.destroy();
		};
		stream.on('error', fail);
		rLine.on('error', fail);
		rLine.on('line', (line) => {
			if (failed) {
				return;
			}
			try {
				promiseBuffer.push(callback(line));
			} catch (error) {
				fail(error);
			}
		});
		rLine.on('close', () => {
			resolve();
		});
	});
	return Promise.all(promiseBuffer);
}
|
|
165
|
+
/**
 * Makes sure the directory that would contain the given file exists.
 *
 * The directory portion of `filePath` is taken from `path.parse`. If nothing
 * exists at that location, it is created synchronously and recursively with
 * mode `0o755`; otherwise nothing is done.
 * @param filePath - The file path whose containing directory should exist.
 */
export function mkdir(filePath) {
	const parentDir = path.parse(filePath).dir;
	if (existsSync(parentDir)) {
		return;
	}
	mkdirSync(path.resolve(parentDir), { recursive: true, mode: 0o755 });
}
|
|
178
|
+
/**
 * Synchronously reports whether anything exists at the given path.
 *
 * Thin wrapper around `existsSync` from `node:fs`.
 * @param filePath - The path to probe.
 * @returns `true` if a file or directory exists at the path, `false` otherwise.
 */
export function exists(filePath) {
	const found = existsSync(filePath);
	return found;
}
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
/**
 * Compresses the contents of a directory into a ZIP file.
 *
 * Entries from `targetDir` are stored at the root of the archive; the
 * directory itself is not added as a wrapping folder.
 * @param outputfilePath - Destination path of the ZIP file.
 * @param targetDir - Directory whose contents are added to the archive.
 * @returns A promise that settles once the ZIP file has been written:
 * fulfilled on success, rejected if writing the file fails.
 */
export declare function zip(outputfilePath: string, targetDir: string): Promise<void>;
|
|
12
|
+
/**
 * Unpacks a ZIP file into a directory.
 * @param zipFilePath - Path of the ZIP file to unpack.
 * @param targetDir - Directory that receives the extracted contents.
 * @returns A promise that is fulfilled when extraction has finished,
 * or rejected if extraction reports an error.
 */
export declare function unzip(zipFilePath: string, targetDir: string): Promise<void>;
|
|
20
|
+
/**
 * Opens a ZIP file for inspection without unpacking it.
 *
 * Useful for looking at an archive's entries before deciding whether,
 * or what, to extract.
 * @param zipFilePath - Path of the ZIP file to open.
 * @returns A directory object describing the archive, from which entries
 * can be listed or extracted individually.
 */
export declare function extractZip(zipFilePath: string): Promise<any>;
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
import fs from 'node:fs';
|
|
2
|
+
import archiver from 'archiver';
|
|
3
|
+
import unzipper from 'unzipper';
|
|
4
|
+
/**
 * Creates a ZIP archive from a directory and writes it to a file.
 *
 * All files and subdirectories within `targetDir` are added to the archive
 * at the root level (no wrapping directory).
 *
 * Completion and error listeners are registered before any data flows, so a
 * write-stream failure that happens while the archive is still being
 * produced (e.g. the output file cannot be opened) rejects the returned
 * promise instead of being raised as an unhandled `error` event.
 * @param outputfilePath - The file path where the ZIP archive will be written.
 * @param targetDir - The directory whose contents will be compressed into the ZIP archive.
 * @returns A promise that resolves when the ZIP file has been fully written,
 * or rejects with an `Error` if the write stream or the archiver fails.
 */
export async function zip(outputfilePath, targetDir) {
	const output = fs.createWriteStream(outputfilePath);
	const archive = archiver('zip');
	// Attach listeners up front; attaching them after `finalize()` could miss
	// events emitted in the meantime.
	const written = new Promise((resolve, reject) => {
		output.on('finish', () => resolve());
		output.on('error', () => reject(new Error(`Failed to save file "${outputfilePath}" from "${targetDir}"`)));
		archive.on('error', (error) => reject(error));
	});
	archive.pipe(output);
	archive.directory(targetDir, false);
	// Wait for both: the archiver to finish producing data and the output
	// stream to flush it. A failure on either side rejects immediately.
	await Promise.all([archive.finalize(), written]);
}
|
|
25
|
+
/**
 * Extracts a ZIP archive to a target directory.
 *
 * Completion is signalled by the extractor's `close` event, which unzipper
 * documents as being emitted once the contents have been fully written to
 * disk. Errors from reading the ZIP file itself are also reported through
 * the returned promise, since `pipe()` does not forward source errors.
 * @param zipFilePath - The path to the ZIP file to extract.
 * @param targetDir - The directory where the ZIP contents will be extracted to.
 * @returns A promise that resolves when extraction is complete,
 * or rejects if an error occurs while reading or extracting.
 */
export async function unzip(zipFilePath, targetDir) {
	const source = fs.createReadStream(zipFilePath);
	const extract = source.pipe(unzipper.Extract({
		path: targetDir,
	}));
	return new Promise((resolve, reject) => {
		source.on('error', (err) => reject(err));
		extract.on('error', (err) => reject(err));
		// 'finish' only means the input was consumed; 'close' means the
		// extracted files have been written.
		extract.on('close', () => resolve());
	});
}
|
|
41
|
+
/**
 * Opens a ZIP file through `unzipper.Open.file` without unpacking it.
 *
 * Handy for examining what an archive contains before running a full
 * extraction.
 * @param zipFilePath - Path of the ZIP file to open.
 * @returns The directory object produced by unzipper for the archive,
 * which can be used to list or selectively extract entries.
 */
export async function extractZip(zipFilePath) {
	return unzipper.Open.file(zipFilePath);
}
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
export * from './archive-accessor.js';
|
|
2
|
+
export type { Redirect, Referrer, Anchor, StaticPageData } from './page.js';
|
|
3
|
+
export { default as Page } from './page.js';
|
|
4
|
+
export { default as Resource } from './resource.js';
|
|
5
|
+
export * from './types.js';
|
|
6
|
+
export { default } from './archive.js';
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
// Archive storage and retrieval layer for Nitpicker crawl data.
|
|
2
|
+
//
|
|
3
|
+
// This package provides the `Archive` class for creating, reading, and writing
|
|
4
|
+
// `.nitpicker` archive files that store crawl results in a SQLite database along with
|
|
5
|
+
// optional HTML snapshots. It also exports the `ArchiveAccessor` for read-only
|
|
6
|
+
// access, `Page` and `Resource` model classes, and all related types.
|
|
7
|
+
export * from './archive-accessor.js';
|
|
8
|
+
export { default as Page } from './page.js';
|
|
9
|
+
export { default as Resource } from './resource.js';
|
|
10
|
+
export * from './types.js';
|
|
11
|
+
export { default } from './archive.js';
|
|
@@ -0,0 +1,263 @@
|
|
|
1
|
+
import type { ArchiveAccessor } from './archive-accessor.js';
|
|
2
|
+
import type { DB_Anchor, DB_Page, DB_Redirect, DB_Referrer } from './types.js';
|
|
3
|
+
/**
 * A single crawled page as stored in the archive.
 *
 * Exposes the page's metadata (title, HTTP status, SEO tags, and so on),
 * its relationships (anchors, referrers, redirects), and its HTML snapshot.
 * Instances are produced by {@link ArchiveAccessor.getPages} and
 * {@link ArchiveAccessor.getPagesWithRefs}.
 */
export default class Page {
    #private;
    /**
     * URLs that redirect to this page, each paired with the ID of its source page.
     * Empty when nothing redirects here.
     */
    readonly redirectFrom: Redirect[];
    /**
     * URL taken from the `<link rel="alternate">` tag; null when the tag is absent.
     */
    get alternate(): string | null;
    /**
     * URL taken from the `<link rel="canonical">` tag; null when the tag is absent.
     */
    get canonical(): string | null;
    /**
     * Size of the HTTP response in bytes; null when it is not known.
     */
    get contentLength(): number | null;
    /**
     * MIME type of the HTTP response (for example `"text/html"`); null when it is not known.
     */
    get contentType(): string | null;
    /**
     * Content of the meta description; null when the page has none.
     */
    get description(): string | null;
    /**
     * True when the page lives on an external domain, i.e. outside the crawl scope.
     */
    get isExternal(): boolean;
    /**
     * True when the crawler skipped this page.
     */
    get isSkipped(): boolean;
    /**
     * True when the page was a crawl target rather than something discovered incidentally.
     */
    get isTarget(): boolean;
    /**
     * Why the crawler skipped this page; null when it was not skipped.
     */
    get skipReason(): string | null;
    /**
     * Content of the meta keywords; null when the page has none.
     */
    get keywords(): string | null;
    /**
     * Value of the `lang` attribute on the HTML element; null when it is absent.
     */
    get lang(): string | null;
    /**
     * True when the `noarchive` robots directive is set.
     */
    get noarchive(): boolean;
    /**
     * True when the `nofollow` robots directive is set.
     */
    get nofollow(): boolean;
    /**
     * True when the `noindex` robots directive is set.
     */
    get noindex(): boolean;
    /**
     * Open Graph description (`og:description`); null when absent.
     */
    get og_description(): string | null;
    /**
     * Open Graph image URL (`og:image`); null when absent.
     */
    get og_image(): string | null;
    /**
     * Open Graph site name (`og:site_name`); null when absent.
     */
    get og_site_name(): string | null;
    /**
     * Open Graph title (`og:title`); null when absent.
     */
    get og_title(): string | null;
    /**
     * Open Graph type (`og:type`); null when absent.
     */
    get og_type(): string | null;
    /**
     * Open Graph URL (`og:url`); null when absent.
     */
    get og_url(): string | null;
    /**
     * HTTP response headers parsed into a key-value record.
     * An empty object is returned when the headers cannot be parsed.
     */
    get responseHeaders(): Record<string, string>;
    /**
     * HTTP response status code; null while the page has not been fetched.
     */
    get status(): number | null;
    /**
     * HTTP response status text (for example `"OK"` or `"Not Found"`); null while not fetched.
     */
    get statusText(): string | null;
    /**
     * Title from the page's `<title>` element.
     * An empty string is returned when no title is set.
     */
    get title(): string;
    /**
     * Twitter Card type (`twitter:card`); null when absent.
     */
    get twitter_card(): string | null;
    /**
     * This page's URL, parsed into an ExURL object.
     * Query-string handling follows the `disableQueries` option.
     */
    get url(): import("@d-zero/shared/parse-url").ExURL;
    /**
     * Builds a Page from a stored row and, optionally, pre-loaded relationship rows.
     * @param archive - ArchiveAccessor used to lazy-load relationships.
     * @param raw - Raw database row describing this page.
     * @param rawRedirects - Redirect records loaded up front; leave undefined to load lazily.
     * @param rawAnchors - Anchor records loaded up front; leave undefined to load lazily.
     * @param rawReferrers - Referrer records loaded up front; leave undefined to load lazily.
     * @param disableQueries - Whether query strings are stripped from the URL.
     */
    constructor(archive: ArchiveAccessor, raw: DB_Page, rawRedirects?: DB_Redirect[], rawAnchors?: DB_Anchor[], rawReferrers?: DB_Referrer[], disableQueries?: boolean);
    /**
     * Fetches the outgoing links (anchors) found on this page.
     * Pre-loaded data is used when present; otherwise the database is queried.
     * @returns The {@link Anchor} objects for the links on this page.
     */
    getAnchors(): Promise<Anchor[]>;
    /**
     * Loads this page's HTML snapshot from the archive.
     * @returns The HTML as a string, or null when no snapshot was saved.
     */
    getHtml(): Promise<string | null>;
    /**
     * Fetches the incoming links (referrers) that point at this page.
     * Pre-loaded data is used when present; otherwise the database is queried.
     * @returns The {@link Referrer} objects for pages linking to this page.
     */
    getReferrers(): Promise<Referrer[]>;
    /**
     * Fetches every request referrer for this page straight from the database.
     * In contrast to {@link getReferrers}, pre-loaded data is never used here.
     * @returns The {@link Referrer} objects.
     */
    getRequests(): Promise<Referrer[]>;
    /**
     * Tells whether this is an internal HTML page: not external, with a `text/html` content type.
     * @returns `true` for an internal HTML page, `false` otherwise.
     */
    isInternalPage(): boolean;
    /**
     * Tells whether this entry is an HTML page, i.e. its content type is `text/html`.
     * @returns `true` when the content type is `text/html`, `false` otherwise.
     */
    isPage(): boolean;
    /**
     * Converts the page into a plain JSON object,
     * with anchors and referrers already resolved.
     * @returns A plain object holding all page metadata and relationships.
     */
    toJSON(): Promise<{
        url: string;
        title: string;
        status: number | null;
        statusText: string | null;
        contentType: string | null;
        contentLength: number | null;
        responseHeaders: Record<string, string>;
        isExternal: boolean;
        isSkipped: boolean;
        skipReason: string | null;
        isTarget: boolean;
        lang: string | null;
        description: string | null;
        keywords: string | null;
        noindex: boolean;
        nofollow: boolean;
        noarchive: boolean;
        canonical: string | null;
        alternate: string | null;
        twitter_card: string | null;
        og_site_name: string | null;
        og_url: string | null;
        og_title: string | null;
        og_description: string | null;
        og_type: string | null;
        og_image: string | null;
        redirectFrom: Redirect[];
        isPage: boolean;
        isInternalPage: boolean;
        getAnchors: Anchor[];
        getReferrers: Referrer[];
    }>;
}
|
|
208
|
+
/**
 * Helper type: yields the value a Promise-like resolves to,
 * and leaves any other type unchanged.
 */
type PromiseType<T> = T extends PromiseLike<infer Resolved> ? Resolved : T;
|
|
212
|
+
/**
 * Serialized (static) form of a Page: the object that {@link Page.toJSON} resolves to.
 */
export type StaticPageData = PromiseType<ReturnType<Page['toJSON']>>;
|
|
216
|
+
/**
 * An incoming link: a page that links to another page.
 */
export interface Referrer {
    /** URL of the page doing the referring. */
    url: string;
    /** URL the referral passes through (can differ because of redirects). */
    through: string;
    /** Page ID that corresponds to the through URL. */
    throughId: number;
    /** Fragment (hash) of the referring link; null when there is none. */
    hash: string | null;
    /** Text content of the referring anchor element. */
    textContent: string;
}
|
|
231
|
+
/**
 * An outgoing link: an anchor element found on a page.
 */
export interface Anchor {
    /** Resolved destination URL of the anchor. */
    url: string;
    /** Original value of the anchor element's href attribute. */
    href: string;
    /** True when the anchor points to an external domain. */
    isExternal: boolean;
    /** Title attribute of the anchor element; null when it is absent. */
    title: string | null;
    /** HTTP status code of the linked page; null while not yet fetched. */
    status: number | null;
    /** HTTP status text of the linked page; null while not yet fetched. */
    statusText: string | null;
    /** Content type of the linked page; null while not yet fetched. */
    contentType: string | null;
    /** Fragment (hash) part of the link; null when there is none. */
    hash: string | null;
    /** Text content of the anchor element; null when it is empty. */
    textContent: string | null;
}
|
|
254
|
+
/**
 * A page that redirects to this page.
 */
export interface Redirect {
    /** URL of the page the redirect starts from. */
    url: string;
    /** Database ID of the page the redirect starts from. */
    pageId: number;
}
|
|
263
|
+
export {};
|