@nitpicker/crawler 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. package/CHANGELOG.md +8 -0
  2. package/LICENSE +191 -0
  3. package/README.md +13 -0
  4. package/lib/archive/archive-accessor.d.ts +107 -0
  5. package/lib/archive/archive-accessor.js +264 -0
  6. package/lib/archive/archive.d.ts +174 -0
  7. package/lib/archive/archive.js +331 -0
  8. package/lib/archive/database.d.ts +207 -0
  9. package/lib/archive/database.js +972 -0
  10. package/lib/archive/debug.d.ts +8 -0
  11. package/lib/archive/debug.js +9 -0
  12. package/lib/archive/filesystem/append-text.d.ts +9 -0
  13. package/lib/archive/filesystem/append-text.js +14 -0
  14. package/lib/archive/filesystem/copy-dir-sync.d.ts +6 -0
  15. package/lib/archive/filesystem/copy-dir-sync.js +9 -0
  16. package/lib/archive/filesystem/copy-dir.d.ts +7 -0
  17. package/lib/archive/filesystem/copy-dir.js +13 -0
  18. package/lib/archive/filesystem/exists.d.ts +6 -0
  19. package/lib/archive/filesystem/exists.js +9 -0
  20. package/lib/archive/filesystem/get-file-list.d.ts +8 -0
  21. package/lib/archive/filesystem/get-file-list.js +12 -0
  22. package/lib/archive/filesystem/index.d.ts +17 -0
  23. package/lib/archive/filesystem/index.js +17 -0
  24. package/lib/archive/filesystem/is-dir.d.ts +6 -0
  25. package/lib/archive/filesystem/is-dir.js +10 -0
  26. package/lib/archive/filesystem/mkdir.d.ts +8 -0
  27. package/lib/archive/filesystem/mkdir.js +15 -0
  28. package/lib/archive/filesystem/output-json.d.ts +9 -0
  29. package/lib/archive/filesystem/output-json.js +14 -0
  30. package/lib/archive/filesystem/output-text.d.ts +11 -0
  31. package/lib/archive/filesystem/output-text.js +32 -0
  32. package/lib/archive/filesystem/read-json.d.ts +7 -0
  33. package/lib/archive/filesystem/read-json.js +11 -0
  34. package/lib/archive/filesystem/read-text.d.ts +6 -0
  35. package/lib/archive/filesystem/read-text.js +10 -0
  36. package/lib/archive/filesystem/readline.d.ts +11 -0
  37. package/lib/archive/filesystem/readline.js +26 -0
  38. package/lib/archive/filesystem/remove.d.ts +5 -0
  39. package/lib/archive/filesystem/remove.js +10 -0
  40. package/lib/archive/filesystem/rename.d.ts +11 -0
  41. package/lib/archive/filesystem/rename.js +18 -0
  42. package/lib/archive/filesystem/tar.d.ts +11 -0
  43. package/lib/archive/filesystem/tar.js +22 -0
  44. package/lib/archive/filesystem/untar.d.ts +20 -0
  45. package/lib/archive/filesystem/untar.js +24 -0
  46. package/lib/archive/filesystem/utils.d.ts +109 -0
  47. package/lib/archive/filesystem/utils.js +185 -0
  48. package/lib/archive/filesystem/zip.d.ts +29 -0
  49. package/lib/archive/filesystem/zip.js +53 -0
  50. package/lib/archive/index.d.ts +6 -0
  51. package/lib/archive/index.js +11 -0
  52. package/lib/archive/page.d.ts +263 -0
  53. package/lib/archive/page.js +316 -0
  54. package/lib/archive/resource.d.ts +46 -0
  55. package/lib/archive/resource.js +62 -0
  56. package/lib/archive/safe-path.d.ts +9 -0
  57. package/lib/archive/safe-path.js +17 -0
  58. package/lib/archive/types.d.ts +210 -0
  59. package/lib/archive/types.js +1 -0
  60. package/lib/crawler/clear-destination-cache.d.ts +5 -0
  61. package/lib/crawler/clear-destination-cache.js +8 -0
  62. package/lib/crawler/crawler.d.ts +73 -0
  63. package/lib/crawler/crawler.js +748 -0
  64. package/lib/crawler/decompose-url.d.ts +25 -0
  65. package/lib/crawler/decompose-url.js +71 -0
  66. package/lib/crawler/destination-cache.d.ts +7 -0
  67. package/lib/crawler/destination-cache.js +6 -0
  68. package/lib/crawler/detect-pagination-pattern.d.ts +16 -0
  69. package/lib/crawler/detect-pagination-pattern.js +61 -0
  70. package/lib/crawler/fetch-destination.d.ts +38 -0
  71. package/lib/crawler/fetch-destination.js +208 -0
  72. package/lib/crawler/fetch-robots-txt.d.ts +42 -0
  73. package/lib/crawler/fetch-robots-txt.js +44 -0
  74. package/lib/crawler/find-best-matching-scope.d.ts +12 -0
  75. package/lib/crawler/find-best-matching-scope.js +46 -0
  76. package/lib/crawler/generate-predicted-urls.d.ts +13 -0
  77. package/lib/crawler/generate-predicted-urls.js +27 -0
  78. package/lib/crawler/handle-ignore-and-skip.d.ts +16 -0
  79. package/lib/crawler/handle-ignore-and-skip.js +19 -0
  80. package/lib/crawler/handle-resource-response.d.ts +13 -0
  81. package/lib/crawler/handle-resource-response.js +16 -0
  82. package/lib/crawler/handle-scrape-end.d.ts +24 -0
  83. package/lib/crawler/handle-scrape-end.js +82 -0
  84. package/lib/crawler/handle-scrape-error.d.ts +37 -0
  85. package/lib/crawler/handle-scrape-error.js +38 -0
  86. package/lib/crawler/index.d.ts +2 -0
  87. package/lib/crawler/index.js +2 -0
  88. package/lib/crawler/inject-scope-auth.d.ts +11 -0
  89. package/lib/crawler/inject-scope-auth.js +21 -0
  90. package/lib/crawler/is-external-url.d.ts +11 -0
  91. package/lib/crawler/is-external-url.js +12 -0
  92. package/lib/crawler/is-in-any-lower-layer.d.ts +13 -0
  93. package/lib/crawler/is-in-any-lower-layer.js +15 -0
  94. package/lib/crawler/link-list.d.ts +112 -0
  95. package/lib/crawler/link-list.js +248 -0
  96. package/lib/crawler/link-to-page-data.d.ts +14 -0
  97. package/lib/crawler/link-to-page-data.js +32 -0
  98. package/lib/crawler/net-timeout-error.d.ts +9 -0
  99. package/lib/crawler/net-timeout-error.js +11 -0
  100. package/lib/crawler/network.d.ts +30 -0
  101. package/lib/crawler/network.js +226 -0
  102. package/lib/crawler/protocol-agnostic-key.d.ts +9 -0
  103. package/lib/crawler/protocol-agnostic-key.js +11 -0
  104. package/lib/crawler/reconstruct-url.d.ts +10 -0
  105. package/lib/crawler/reconstruct-url.js +28 -0
  106. package/lib/crawler/result-handler.d.ts +118 -0
  107. package/lib/crawler/result-handler.js +153 -0
  108. package/lib/crawler/robots-checker.d.ts +26 -0
  109. package/lib/crawler/robots-checker.js +62 -0
  110. package/lib/crawler/should-discard-predicted.d.ts +14 -0
  111. package/lib/crawler/should-discard-predicted.js +31 -0
  112. package/lib/crawler/should-skip-url.d.ts +23 -0
  113. package/lib/crawler/should-skip-url.js +15 -0
  114. package/lib/crawler/speculative-pagination.d.ts +52 -0
  115. package/lib/crawler/speculative-pagination.js +215 -0
  116. package/lib/crawler/types.d.ts +119 -0
  117. package/lib/crawler/types.js +1 -0
  118. package/lib/crawler/url-filter.d.ts +56 -0
  119. package/lib/crawler/url-filter.js +110 -0
  120. package/lib/crawler-orchestrator.d.ts +142 -0
  121. package/lib/crawler-orchestrator.js +309 -0
  122. package/lib/debug.d.ts +8 -0
  123. package/lib/debug.js +9 -0
  124. package/lib/index.d.ts +16 -0
  125. package/lib/index.js +18 -0
  126. package/lib/qzilla.d.ts +136 -0
  127. package/lib/qzilla.js +292 -0
  128. package/lib/types.d.ts +27 -0
  129. package/lib/types.js +1 -0
  130. package/lib/utils/array/each-splitted.d.ts +10 -0
  131. package/lib/utils/array/each-splitted.js +14 -0
  132. package/lib/utils/array/index.d.ts +1 -0
  133. package/lib/utils/array/index.js +1 -0
  134. package/lib/utils/async/index.d.ts +1 -0
  135. package/lib/utils/async/index.js +1 -0
  136. package/lib/utils/debug.d.ts +5 -0
  137. package/lib/utils/debug.js +5 -0
  138. package/lib/utils/error/dom-evaluation-error.d.ts +7 -0
  139. package/lib/utils/error/dom-evaluation-error.js +7 -0
  140. package/lib/utils/error/error-emitter.d.ts +18 -0
  141. package/lib/utils/error/error-emitter.js +29 -0
  142. package/lib/utils/error/index.d.ts +3 -0
  143. package/lib/utils/error/index.js +2 -0
  144. package/lib/utils/event-emitter/index.d.ts +6 -0
  145. package/lib/utils/event-emitter/index.js +6 -0
  146. package/lib/utils/index.d.ts +5 -0
  147. package/lib/utils/index.js +5 -0
  148. package/lib/utils/network/index.d.ts +1 -0
  149. package/lib/utils/network/index.js +1 -0
  150. package/lib/utils/object/clean-object.d.ts +8 -0
  151. package/lib/utils/object/clean-object.js +13 -0
  152. package/lib/utils/object/index.d.ts +1 -0
  153. package/lib/utils/object/index.js +1 -0
  154. package/lib/utils/path/index.d.ts +1 -0
  155. package/lib/utils/path/index.js +1 -0
  156. package/lib/utils/path/safe-filepath.d.ts +7 -0
  157. package/lib/utils/path/safe-filepath.js +12 -0
  158. package/lib/utils/regexp/index.d.ts +1 -0
  159. package/lib/utils/regexp/index.js +1 -0
  160. package/lib/utils/retryable/index.d.ts +2 -0
  161. package/lib/utils/retryable/index.js +1 -0
  162. package/lib/utils/sort/index.d.ts +14 -0
  163. package/lib/utils/sort/index.js +61 -0
  164. package/lib/utils/sort/remove-matches.d.ts +9 -0
  165. package/lib/utils/sort/remove-matches.js +23 -0
  166. package/lib/utils/types/index.d.ts +1 -0
  167. package/lib/utils/types/index.js +1 -0
  168. package/lib/utils/types/types.d.ts +46 -0
  169. package/lib/utils/types/types.js +1 -0
  170. package/lib/utils/url/index.d.ts +5 -0
  171. package/lib/utils/url/index.js +5 -0
  172. package/lib/utils/url/is-lower-layer.d.ts +15 -0
  173. package/lib/utils/url/is-lower-layer.js +55 -0
  174. package/lib/utils/url/parse-url.d.ts +11 -0
  175. package/lib/utils/url/parse-url.js +20 -0
  176. package/lib/utils/url/path-match.d.ts +11 -0
  177. package/lib/utils/url/path-match.js +18 -0
  178. package/lib/utils/url/sort-url.d.ts +10 -0
  179. package/lib/utils/url/sort-url.js +24 -0
  180. package/lib/utils/url/url-partial-match.d.ts +11 -0
  181. package/lib/utils/url/url-partial-match.js +32 -0
  182. package/package.json +49 -0
  183. package/src/archive/__mock__/.gitignore +3 -0
  184. package/src/archive/__mock__/mock.sqlite +0 -0
  185. package/src/archive/archive-accessor.ts +337 -0
  186. package/src/archive/archive.ts +408 -0
  187. package/src/archive/database.spec.ts +469 -0
  188. package/src/archive/database.ts +1059 -0
  189. package/src/archive/debug.ts +10 -0
  190. package/src/archive/filesystem/append-text.spec.ts +26 -0
  191. package/src/archive/filesystem/append-text.ts +16 -0
  192. package/src/archive/filesystem/copy-dir-sync.spec.ts +27 -0
  193. package/src/archive/filesystem/copy-dir-sync.ts +10 -0
  194. package/src/archive/filesystem/copy-dir.spec.ts +33 -0
  195. package/src/archive/filesystem/copy-dir.ts +14 -0
  196. package/src/archive/filesystem/exists.spec.ts +33 -0
  197. package/src/archive/filesystem/exists.ts +10 -0
  198. package/src/archive/filesystem/get-file-list.spec.ts +37 -0
  199. package/src/archive/filesystem/get-file-list.ts +13 -0
  200. package/src/archive/filesystem/index.ts +17 -0
  201. package/src/archive/filesystem/is-dir.spec.ts +29 -0
  202. package/src/archive/filesystem/is-dir.ts +11 -0
  203. package/src/archive/filesystem/mkdir.spec.ts +37 -0
  204. package/src/archive/filesystem/mkdir.ts +16 -0
  205. package/src/archive/filesystem/output-json.spec.ts +34 -0
  206. package/src/archive/filesystem/output-json.ts +16 -0
  207. package/src/archive/filesystem/output-text.spec.ts +31 -0
  208. package/src/archive/filesystem/output-text.ts +35 -0
  209. package/src/archive/filesystem/read-json.spec.ts +26 -0
  210. package/src/archive/filesystem/read-json.ts +12 -0
  211. package/src/archive/filesystem/read-text.spec.ts +25 -0
  212. package/src/archive/filesystem/read-text.ts +11 -0
  213. package/src/archive/filesystem/readline.spec.ts +29 -0
  214. package/src/archive/filesystem/readline.ts +30 -0
  215. package/src/archive/filesystem/remove.spec.ts +34 -0
  216. package/src/archive/filesystem/remove.ts +11 -0
  217. package/src/archive/filesystem/rename.spec.ts +46 -0
  218. package/src/archive/filesystem/rename.ts +21 -0
  219. package/src/archive/filesystem/tar.spec.ts +33 -0
  220. package/src/archive/filesystem/tar.ts +27 -0
  221. package/src/archive/filesystem/untar.spec.ts +34 -0
  222. package/src/archive/filesystem/untar.ts +36 -0
  223. package/src/archive/index.ts +13 -0
  224. package/src/archive/page.spec.ts +368 -0
  225. package/src/archive/page.ts +420 -0
  226. package/src/archive/resource.spec.ts +101 -0
  227. package/src/archive/resource.ts +73 -0
  228. package/src/archive/safe-path.spec.ts +44 -0
  229. package/src/archive/safe-path.ts +18 -0
  230. package/src/archive/types.ts +227 -0
  231. package/src/crawler/clear-destination-cache.spec.ts +20 -0
  232. package/src/crawler/clear-destination-cache.ts +9 -0
  233. package/src/crawler/crawler.ts +873 -0
  234. package/src/crawler/decompose-url.spec.ts +48 -0
  235. package/src/crawler/decompose-url.ts +90 -0
  236. package/src/crawler/destination-cache.spec.ts +23 -0
  237. package/src/crawler/destination-cache.ts +8 -0
  238. package/src/crawler/detect-pagination-pattern.spec.ts +169 -0
  239. package/src/crawler/detect-pagination-pattern.ts +66 -0
  240. package/src/crawler/fetch-destination.ts +257 -0
  241. package/src/crawler/fetch-robots-txt.spec.ts +83 -0
  242. package/src/crawler/fetch-robots-txt.ts +91 -0
  243. package/src/crawler/find-best-matching-scope.spec.ts +39 -0
  244. package/src/crawler/find-best-matching-scope.ts +57 -0
  245. package/src/crawler/generate-predicted-urls.spec.ts +42 -0
  246. package/src/crawler/generate-predicted-urls.ts +34 -0
  247. package/src/crawler/handle-ignore-and-skip.spec.ts +66 -0
  248. package/src/crawler/handle-ignore-and-skip.ts +30 -0
  249. package/src/crawler/handle-resource-response.spec.ts +45 -0
  250. package/src/crawler/handle-resource-response.ts +21 -0
  251. package/src/crawler/handle-scrape-end.spec.ts +109 -0
  252. package/src/crawler/handle-scrape-end.ts +115 -0
  253. package/src/crawler/handle-scrape-error.spec.ts +105 -0
  254. package/src/crawler/handle-scrape-error.ts +58 -0
  255. package/src/crawler/index.ts +2 -0
  256. package/src/crawler/inject-scope-auth.spec.ts +36 -0
  257. package/src/crawler/inject-scope-auth.ts +27 -0
  258. package/src/crawler/is-external-url.spec.ts +31 -0
  259. package/src/crawler/is-external-url.ts +17 -0
  260. package/src/crawler/is-in-any-lower-layer.spec.ts +31 -0
  261. package/src/crawler/is-in-any-lower-layer.ts +22 -0
  262. package/src/crawler/link-list.spec.ts +355 -0
  263. package/src/crawler/link-list.ts +275 -0
  264. package/src/crawler/link-to-page-data.spec.ts +133 -0
  265. package/src/crawler/link-to-page-data.ts +34 -0
  266. package/src/crawler/net-timeout-error.spec.ts +25 -0
  267. package/src/crawler/net-timeout-error.ts +11 -0
  268. package/src/crawler/protocol-agnostic-key.spec.ts +40 -0
  269. package/src/crawler/protocol-agnostic-key.ts +11 -0
  270. package/src/crawler/reconstruct-url.spec.ts +37 -0
  271. package/src/crawler/reconstruct-url.ts +37 -0
  272. package/src/crawler/robots-checker.spec.ts +104 -0
  273. package/src/crawler/robots-checker.ts +73 -0
  274. package/src/crawler/should-discard-predicted.spec.ts +125 -0
  275. package/src/crawler/should-discard-predicted.ts +33 -0
  276. package/src/crawler/should-skip-url.spec.ts +77 -0
  277. package/src/crawler/should-skip-url.ts +37 -0
  278. package/src/crawler/types.ts +146 -0
  279. package/src/crawler-orchestrator.ts +401 -0
  280. package/src/debug.ts +10 -0
  281. package/src/index.ts +25 -0
  282. package/src/types.ts +30 -0
  283. package/src/utils/array/each-splitted.spec.ts +38 -0
  284. package/src/utils/array/each-splitted.ts +19 -0
  285. package/src/utils/array/index.ts +1 -0
  286. package/src/utils/debug.ts +6 -0
  287. package/src/utils/error/dom-evaluation-error.spec.ts +20 -0
  288. package/src/utils/error/dom-evaluation-error.ts +6 -0
  289. package/src/utils/error/error-emitter.spec.ts +78 -0
  290. package/src/utils/error/error-emitter.ts +44 -0
  291. package/src/utils/error/index.ts +3 -0
  292. package/src/utils/index.ts +5 -0
  293. package/src/utils/object/clean-object.spec.ts +24 -0
  294. package/src/utils/object/clean-object.ts +13 -0
  295. package/src/utils/object/index.ts +1 -0
  296. package/src/utils/types/index.ts +1 -0
  297. package/src/utils/types/types.ts +65 -0
  298. package/tsconfig.json +11 -0
  299. package/tsconfig.tsbuildinfo +1 -0
@@ -0,0 +1,174 @@
1
+ import type { Config } from './types.js';
2
+ import type { PageData, CrawlerError, Resource } from '../utils/index.js';
3
+ import type { ParseURLOptions } from '@d-zero/shared/parse-url';
4
+ import { ArchiveAccessor } from './archive-accessor.js';
5
+ /**
6
+ * Main archive class for creating, opening, resuming, and writing Nitpicker archive files (`.nitpicker`).
7
+ *
8
+ * An Archive wraps a SQLite database and optional HTML snapshots into a compressed
9
+ * tar archive. It extends {@link ArchiveAccessor} to provide read access to stored data.
10
+ *
11
+ * Use the static factory methods ({@link Archive.create}, {@link Archive.open},
12
+ * {@link Archive.resume}) to obtain instances; {@link Archive.connect} yields a plain {@link ArchiveAccessor}.
13
+ * The constructor is private.
14
+ */
15
+ export default class Archive extends ArchiveAccessor {
16
+ #private;
17
+ /**
18
+ * The absolute file path of the archive (`.nitpicker` file).
19
+ */
20
+ get filePath(): string;
21
+ private constructor();
22
+ /**
23
+ * @deprecated This method is no longer functional; the implementation has an empty body.
24
+ */
25
+ abort(): void;
26
+ /**
27
+ * Appends an error entry to the archive's error log file (`error.log` in the temporary working directory).
28
+ * @param error - The crawler error object containing process and URL information.
29
+ */
30
+ addError(error: CrawlerError): Promise<void>;
31
+ /**
32
+ * Closes the archive. If the archive file does not yet exist on disk,
33
+ * it writes the archive first. Otherwise, if the temporary directory still exists,
34
+ * it is removed. In both cases `destroy()` is then called on the database.
35
+ */
36
+ close(): Promise<void>;
37
+ /**
38
+ * Retrieves the crawl configuration stored in the archive database.
39
+ * @returns The configuration object.
40
+ */
41
+ getConfig(): Promise<Config>;
42
+ /**
43
+ * Retrieves the current crawling state, including lists of scraped and pending URLs.
44
+ * @returns An object with `scraped` and `pending` URL arrays.
45
+ */
46
+ getCrawlingState(): Promise<{
47
+ scraped: string[];
48
+ pending: string[];
49
+ }>;
50
+ /**
51
+ * Retrieves the base URL of the crawl session from the archive database.
52
+ * @returns The base URL string (note: the declared type is `Promise<any>`, not `Promise<string>`).
53
+ */
54
+ getUrl(): Promise<any>;
55
+ /**
56
+ * Stores the crawl configuration into the archive database; resolves with a `Config[]`.
57
+ * @param config - The configuration object to store.
58
+ */
59
+ setConfig(config: Config): Promise<Config[]>;
60
+ /**
61
+ * Stores an external page's data in the archive database without saving a snapshot.
62
+ * @param pageInfo - The page data to store.
63
+ */
64
+ setExternalPage(pageInfo: PageData): Promise<void>;
65
+ /**
66
+ * Stores a crawled page's data in the archive database and optionally saves an HTML snapshot.
67
+ * @param pageInfo - The page data to store.
68
+ * @returns The database ID of the stored page.
69
+ */
70
+ setPage(pageInfo: PageData): Promise<number>;
71
+ /**
72
+ * Stores a sub-resource (CSS, JS, image, etc.) in the archive database.
73
+ * @param resource - The resource data to store.
74
+ */
75
+ setResources(resource: Resource): Promise<void>;
76
+ /**
77
+ * Stores the referrer relationship between a resource and the page that references it.
78
+ * @param params - An object containing `url` (the page URL) and `src` (the resource URL).
79
+ * @param params.url - The URL of the page that references the resource.
80
+ * @param params.src - The URL of the referenced resource.
81
+ */
82
+ setResourcesReferrers({ url, src }: {
83
+ url: string;
84
+ src: string;
85
+ }): Promise<void>;
86
+ /**
87
+ * Marks a page as skipped in the archive database with the given reason.
88
+ * @param url - The URL of the page to mark as skipped.
89
+ * @param reason - The reason the page was skipped.
90
+ * @param isExternal - Whether the page is on an external domain. Defaults to `false`.
91
+ */
92
+ setSkippedPage(url: string, reason: string, isExternal?: boolean): Promise<void>;
93
+ /**
94
+ * Assigns natural URL sort order values to all pages in the database
95
+ * that do not yet have an `order` field set.
96
+ */
97
+ setUrlOrder(): Promise<void>;
98
+ /**
99
+ * Writes the archive to disk as a compressed `.nitpicker` file.
100
+ *
101
+ * This method zips the HTML snapshot directory (when present) and removes it, checkpoints the database,
102
+ * renames the temporary working directory to the archive path minus its extension, and tars it into the archive file.
103
+ * The renamed directory is removed after writing.
104
+ */
105
+ write(): Promise<void>;
106
+ /** The file extension for Nitpicker archive files (without the leading dot). */
107
+ static FILE_EXTENSION: string;
108
+ /** The directory name used for storing HTML snapshots within the archive. */
109
+ static readonly SNAPSHOT_HTML_DIR = "snapshot-html";
110
+ /** The filename of the SQLite database within the archive. */
111
+ static readonly SQLITE_DB_FILE_NAME = "db.sqlite";
112
+ /** The prefix used for temporary working directories during archive operations. */
113
+ static TMP_DIR_PREFIX: string;
114
+ /**
115
+ * Opens a read-only connection to an existing archive's database.
116
+ * Returns an {@link ArchiveAccessor} that provides query methods
117
+ * without the ability to modify or write the archive.
118
+ * @param tmpDir - The path to the temporary directory containing the database.
119
+ * @param namespace - An optional namespace for scoping data access within the archive. May be omitted or `null`.
120
+ * @returns An ArchiveAccessor instance for querying the archive data.
121
+ */
122
+ static connect(tmpDir: string, namespace?: string | null): Promise<ArchiveAccessor>;
123
+ /**
124
+ * Creates a new archive at the specified file path.
125
+ * Initializes a temporary working directory and a fresh SQLite database.
126
+ * @param options - Options including the file path and optional working directory.
127
+ * @returns A new Archive instance ready for writing crawl data.
128
+ */
129
+ static create(options: ArchiveOptions & ParseURLOptions): Promise<Archive>;
130
+ /**
131
+ * Resolves path segments into an absolute path (the implementation uses `path.resolve`).
132
+ * @param pathes - The path segments to resolve.
133
+ * @returns The resolved absolute path.
134
+ */
135
+ static joinPath(...pathes: string[]): string;
136
+ /**
137
+ * Opens an existing archive file (`.nitpicker`) by extracting it to a temporary directory.
138
+ * @param options - Options including the file path, optional working directory,
139
+ * and whether to extract plugin data.
140
+ * @returns An Archive instance with the extracted data loaded.
141
+ */
142
+ static open(options: ArchiveOptions & ArchiveOpenOptions): Promise<Archive>;
143
+ /**
144
+ * Resumes an archive from an existing temporary directory
145
+ * (e.g., after an interrupted crawl session).
146
+ * @param targetPath - The path to the temporary directory to resume from.
147
+ * @returns An Archive instance reconnected to the existing data.
148
+ * @throws {Error} If the specified path is not a directory.
149
+ */
150
+ static resume(targetPath: string): Promise<Archive>;
151
+ /**
152
+ * Generates a timestamp string in the format `YYYYMMDDHHmmssSSS`
153
+ * suitable for use in file names.
154
+ * @returns A formatted timestamp string.
155
+ */
156
+ static timestamp(): string;
157
+ }
158
+ /**
159
+ * Options for creating or opening an archive (accepted by `Archive.create` and `Archive.open`).
160
+ */
161
+ type ArchiveOptions = {
162
+ /** The file path for the archive (`.nitpicker` file). */
163
+ filePath: string;
164
+ /** The working directory. Defaults to `process.cwd()`. `Archive.create` resolves its temporary directory against it. */
165
+ cwd?: string;
166
+ };
167
+ /**
168
+ * Additional options for opening an existing archive with `Archive.open`.
169
+ */
170
+ type ArchiveOpenOptions = {
171
+ /** When true, extracts all files including plugin data. When false, only extracts the database and snapshots. */
172
+ openPluginData?: boolean;
173
+ };
174
+ export {};
@@ -0,0 +1,331 @@
1
+ import path from 'node:path';
2
+ import { ArchiveAccessor } from './archive-accessor.js';
3
+ import { Database } from './database.js';
4
+ import { dbLog, log, saveLog } from './debug.js';
5
+ import { appendText, exists, isDir, outputText, remove, rename, tar, untar, zip, } from './filesystem/index.js';
6
+ /**
7
+ * Main archive class for creating, opening, resuming, and writing Nitpicker archive files (`.nitpicker`).
8
+ *
9
+ * An Archive wraps a SQLite database and optional HTML snapshots into a compressed
10
+ * tar archive. It extends {@link ArchiveAccessor} to provide read access to stored data.
11
+ *
12
+ * Use the static factory methods ({@link Archive.create}, {@link Archive.open},
13
+ * {@link Archive.resume}, {@link Archive.connect}) to obtain instances.
14
+ * The constructor is private.
15
+ */
16
+ export default class Archive extends ArchiveAccessor {
17
+ /** The SQLite database instance for reading and writing crawl data. */
18
+ #db;
19
+ /** Absolute path to the output `.nitpicker` archive file. */
20
+ #filePath;
21
+ /** Absolute path to the HTML snapshot directory within the temporary working directory. */
22
+ #snapshotDir;
23
+ /** Absolute path to the temporary working directory containing the SQLite DB and snapshots. */
24
+ #tmpDir;
25
+ /**
26
+ * The absolute file path of the archive (`.nitpicker` file), exactly as stored by the constructor.
27
+ */
28
+ get filePath() {
29
+ return this.#filePath;
30
+ }
31
+ // eslint-disable-next-line no-restricted-syntax
32
+ constructor(filePath, tmpDir, db) {
33
+ super(tmpDir, db, ''); // third argument is an empty string here; `connect` passes its `namespace` in this position
34
+ this.#filePath = filePath;
35
+ this.#tmpDir = tmpDir;
36
+ this.#snapshotDir = path.resolve(this.#tmpDir, Archive.SNAPSHOT_HTML_DIR); // snapshot directory sits directly under the temporary directory
37
+ this.#db = db;
38
+ log('create instance: %O', {
39
+ filePath,
40
+ tmpDir,
41
+ snapshotDir: this.#snapshotDir,
42
+ });
43
+ this.#db.on('error', (e) => { // forward database 'error' events as 'error' events of this archive
44
+ void this.emit('error', e);
45
+ });
46
+ }
47
+ /**
48
+ * @deprecated This method is no longer functional; its body is empty, so calling it does nothing.
49
+ */
50
+ abort() { }
51
+ /**
52
+ * Appends an error entry to `error.log` inside the temporary working directory.
53
+ * @param error - The crawler error object; its `pid`, `isMainProcess`, `url` and `error` (the stack, else the value itself) are written.
54
+ */
55
+ async addError(error) {
56
+ const logFile = path.resolve(this.#tmpDir, 'error.log');
57
+ await appendText(logFile, `[${error.pid}(${error.isMainProcess ? 'main' : 'sub'})] ${error.url} ${error.error.stack ?? error.error}`);
58
+ }
59
+ /**
60
+ * Closes the archive. If the archive file does not yet exist on disk,
61
+ * it writes the archive first. Otherwise, if the temporary directory still exists,
62
+ * it is removed. In both cases `destroy()` is then called on the database.
63
+ */
64
+ async close() {
65
+ log('Closing');
66
+ if (!exists(this.#filePath)) { // NOTE(review): `exists` is used without `await`; assumes it is synchronous — confirm
67
+ log("Save the file because it doesn't exist");
68
+ await this.write();
69
+ }
70
+ else if (exists(this.#tmpDir)) {
71
+ log('Remove temporary dir');
72
+ await remove(this.#tmpDir);
73
+ }
74
+ await this.#db.destroy();
75
+ log('Closing done');
76
+ }
77
+ /**
78
+ * Retrieves the crawl configuration stored in the archive database.
79
+ * @returns The configuration object, as returned by the database's `getConfig()`.
80
+ */
81
+ async getConfig() {
82
+ return this.#db.getConfig();
83
+ }
84
+ /**
85
+ * Retrieves the current crawling state, including lists of scraped and pending URLs.
86
+ * @returns An object with `scraped` and `pending` URL arrays, as returned by the database's `getCrawlingState()`.
87
+ */
88
+ async getCrawlingState() {
89
+ return this.#db.getCrawlingState();
90
+ }
91
+ /**
92
+ * Retrieves the base URL of the crawl session from the archive database (via the database's `getBaseUrl()`).
93
+ * @returns The base URL string.
94
+ */
95
+ async getUrl() {
96
+ return this.#db.getBaseUrl();
97
+ }
98
+ /**
99
+ * Stores the crawl configuration into the archive database and resolves with the result of the database's `setConfig()`.
100
+ * @param config - The configuration object to store (also written to the `dbLog` debug output).
101
+ */
102
+ async setConfig(config) {
103
+ dbLog('Set config: %O', config);
104
+ return this.#db.setConfig(config);
105
+ }
106
+ /**
107
+ * Stores an external page's data in the archive database without saving a snapshot.
108
+ * @param pageInfo - The page data to store.
109
+ */
110
+ async setExternalPage(pageInfo) {
111
+ dbLog('Set external page: %s', pageInfo.url.href);
112
+ await this.#db.updatePage(pageInfo, null, false); // `null` / `false` sit where `setPage` passes the snapshot dir and `pageInfo.isTarget`
113
+ }
114
+ /**
115
+ * Stores a crawled page's data in the archive database; a snapshot is written only when `updatePage` returns a truthy `html`.
116
+ * @param pageInfo - The page data to store.
117
+ * @returns The database ID of the stored page.
118
+ */
119
+ async setPage(pageInfo) {
120
+ dbLog('Set page: %s', pageInfo.url.href);
121
+ const { html, pageId } = await this.#db.updatePage(pageInfo, this.#snapshotDir, pageInfo.isTarget);
122
+ const snapshotTask = [];
123
+ if (html) {
124
+ snapshotTask.push(outputText(html, pageInfo.html)); // presumably `html` is the snapshot file path and `pageInfo.html` the markup — confirm against updatePage/outputText
125
+ }
126
+ await Promise.all(snapshotTask);
127
+ return pageId;
128
+ }
129
+ /**
130
+ * Stores a sub-resource (CSS, JS, image, etc.) in the archive database via the database's `insertResource()`.
131
+ * @param resource - The resource data to store; `resource.url.href` is written to the `dbLog` debug output.
132
+ */
133
+ async setResources(resource) {
134
+ dbLog('Set resource: %s', resource.url.href);
135
+ await this.#db.insertResource(resource);
136
+ }
137
+ /**
138
+ * Stores the referrer relationship between a resource and the page that references it.
139
+ * @param params - An object containing `url` (the page URL) and `src` (the resource URL).
140
+ * @param params.url - The URL of the page that references the resource.
141
+ * @param params.src - The URL of the referenced resource.
142
+ */
143
+ async setResourcesReferrers({ url, src }) {
144
+ dbLog("Set resource's referrers: %s on %s", src, url);
145
+ await this.#db.insertResourceReferrers(src, url); // argument order is resource URL first, then page URL
146
+ }
147
+ /**
148
+ * Marks a page as skipped in the archive database with the given reason (via the database's `setSkippedPage()`).
149
+ * @param url - The URL of the page to mark as skipped.
150
+ * @param reason - The reason the page was skipped.
151
+ * @param isExternal - Whether the page is on an external domain. Defaults to `false`.
152
+ */
153
+ async setSkippedPage(url, reason, isExternal = false) {
154
+ dbLog('Set skipped page: %s', url);
155
+ await this.#db.setSkippedPage(url, reason, isExternal);
156
+ }
157
+ /**
158
+ * Assigns natural URL sort order values to all pages in the database
159
+ * that do not yet have an `order` field set. The work itself is delegated to the database's `setUrlOrder()`.
160
+ */
161
+ async setUrlOrder() {
162
+ dbLog("Pages didn't have `order` field. So set URL order.");
163
+ await this.#db.setUrlOrder();
164
+ }
165
+ /**
166
+ * Writes the archive to disk as a compressed `.nitpicker` file.
167
+ *
168
+ * This method zips the HTML snapshot directory (when present) and removes it, checkpoints the database,
169
+ * renames the temporary working directory to the archive path minus its extension, and tars it into the archive file.
170
+ * The renamed directory is removed after writing.
171
+ */
172
+ async write() {
173
+ saveLog('Starts: %s', this.#filePath);
174
+ const snapshotZip = `${this.#snapshotDir}.zip`;
175
+ if (exists(this.#snapshotDir)) {
176
+ if (!exists(snapshotZip)) { // an existing zip is kept as-is; the snapshot dir is still removed below
177
+ saveLog('Zips snapshot dir: %s', this.#snapshotDir);
178
+ await zip(snapshotZip, this.#snapshotDir);
179
+ }
180
+ saveLog('Remove snapshot dir: %s', this.#snapshotDir);
181
+ await remove(this.#snapshotDir);
182
+ }
183
+ await this.#db.checkpoint();
184
+ const filePathWithoutExt = path.resolve(path.dirname(this.#filePath), path.basename(this.#filePath, path.extname(this.#filePath)));
185
+ saveLog('Rename temporary dir: %s to %s', this.#tmpDir, filePathWithoutExt);
186
+ await rename(this.#tmpDir, filePathWithoutExt, true); // NOTE(review): third argument `true` presumably allows overwriting — confirm against rename()
187
+ saveLog('Zip temporary dir to file: %s to %s', filePathWithoutExt, this.#filePath);
188
+ await tar(filePathWithoutExt, this.#filePath);
189
+ saveLog('Remove temporary dir: %s', filePathWithoutExt);
190
+ await remove(filePathWithoutExt);
191
+ saveLog('Done: %s', this.#filePath);
192
+ }
193
+ /** The file extension for Nitpicker archive files (without the leading dot). */
194
+ static FILE_EXTENSION = 'nitpicker';
195
+ /** The directory name used for storing HTML snapshots within the archive. */
196
+ static SNAPSHOT_HTML_DIR = 'snapshot-html';
197
+ /** The filename of the SQLite database within the archive. */
198
+ static SQLITE_DB_FILE_NAME = 'db.sqlite';
199
+ /** The prefix used for temporary working directories during archive operations. */
200
+ static TMP_DIR_PREFIX = '._nitpicker-';
201
+ /**
202
+ * Opens a read-only connection to an existing archive's database.
203
+ * Returns an {@link ArchiveAccessor} (not an `Archive`) that provides query methods
204
+ * without the ability to modify or write the archive.
205
+ * @param tmpDir - The path to the temporary directory containing the database.
206
+ * @param namespace - An optional namespace for scoping data access within the archive. Defaults to `null`.
207
+ * @returns An ArchiveAccessor instance for querying the archive data.
208
+ */
209
+ static async connect(tmpDir, namespace = null) {
210
+ const db = await Archive.#connectDB(tmpDir);
211
+ const archive = new ArchiveAccessor(tmpDir, db, namespace);
212
+ return archive;
213
+ }
214
+ /**
215
+ * Creates a new archive at the specified file path.
216
+ * Initializes a temporary working directory and a fresh SQLite database.
217
+ * @param options - Options including the file path and optional working directory.
218
+ * @returns A new Archive instance ready for writing crawl data.
219
+ */
220
+ static async create(options) {
221
+ const { filePath } = options;
222
+ const cwd = options.cwd ?? process.cwd();
223
+ log('Create: %O', {
224
+ filePath,
225
+ cwd,
226
+ });
227
+ const fileName = path.basename(filePath, path.extname(filePath));
228
+ const tmpDir = path.resolve(cwd, Archive.TMP_DIR_PREFIX + fileName);
229
+ return await Archive.#init(filePath, tmpDir);
230
+ }
231
+ /**
232
+ * Joins path segments into an absolute path.
233
+ * @param pathes - The path segments to join.
234
+ * @returns The resolved absolute path.
235
+ */
236
+ static joinPath(...pathes) {
237
+ return path.resolve(...pathes);
238
+ }
239
+ /**
240
+ * Opens an existing archive file (`.nitpicker`) by extracting it to a temporary directory.
241
+ * @param options - Options including the file path, optional working directory,
242
+ * and whether to extract plugin data.
243
+ * @returns An Archive instance with the extracted data loaded.
244
+ */
245
+ static async open(options) {
246
+ const { filePath, openPluginData } = options;
247
+ const cwd = options.cwd ?? process.cwd();
248
+ log('Open: %O', {
249
+ filePath,
250
+ cwd,
251
+ openPluginData,
252
+ });
253
+ const fileName = path.basename(filePath, path.extname(filePath));
254
+ const tmpDir = path.resolve(cwd, Archive.TMP_DIR_PREFIX + fileName);
255
+ const openFiles = [];
256
+ if (!openPluginData) {
257
+ const relDdPath = path.join(fileName, Archive.SQLITE_DB_FILE_NAME);
258
+ const relSnapshotPath = path.join(fileName, Archive.SNAPSHOT_HTML_DIR + '.zip');
259
+ openFiles.push(relDdPath, relSnapshotPath);
260
+ }
261
+ log('Unzip file: %s (%O)', filePath, openFiles);
262
+ await untar(filePath, {
263
+ cwd,
264
+ fileList: openFiles.length > 0 ? openFiles : undefined,
265
+ });
266
+ const extractedDir = path.resolve(cwd, fileName);
267
+ log('Move directory: %s to %s', extractedDir, tmpDir);
268
+ await rename(extractedDir, tmpDir, true);
269
+ return await Archive.#init(filePath, tmpDir);
270
+ }
271
+ /**
272
+ * Resumes an archive from an existing temporary directory
273
+ * (e.g., after an interrupted crawl session).
274
+ * @param targetPath - The path to the temporary directory to resume from.
275
+ * @returns An Archive instance reconnected to the existing data.
276
+ * @throws {Error} If the specified path is not a directory.
277
+ */
278
+ static async resume(targetPath) {
279
+ log('Resume: %s', targetPath);
280
+ if (await isDir(targetPath)) {
281
+ const tmpDir = targetPath;
282
+ const db = await Archive.#connectDB(tmpDir);
283
+ const name = (await db.getName()) ||
284
+ path.basename(targetPath).replace(Archive.TMP_DIR_PREFIX, '');
285
+ const filePath = path.resolve(process.cwd(), name + '.' + Archive.FILE_EXTENSION);
286
+ return await Archive.#init(filePath, tmpDir);
287
+ }
288
+ throw new Error('The specified path is not a directory. Please ensure the path points to a valid directory.');
289
+ }
290
+ /**
291
+ * Generates a timestamp string in the format `YYYYMMDDHHmmssSSS`
292
+ * suitable for use in file names.
293
+ * @returns A formatted timestamp string.
294
+ */
295
+ static timestamp() {
296
+ const now = new Date();
297
+ const year = now.getFullYear().toString();
298
+ const month = (now.getMonth() + 1).toLocaleString('en-US', {
299
+ minimumIntegerDigits: 2,
300
+ });
301
+ const date = now.getDate().toLocaleString('en-US', { minimumIntegerDigits: 2 });
302
+ const hours = now.getHours().toLocaleString('en-US', { minimumIntegerDigits: 2 });
303
+ const minutes = now.getMinutes().toLocaleString('en-US', { minimumIntegerDigits: 2 });
304
+ const seconds = now.getSeconds().toLocaleString('en-US', { minimumIntegerDigits: 2 });
305
+ const ms = now.getMilliseconds().toLocaleString('en-US', { minimumIntegerDigits: 3 });
306
+ return year + month + date + hours + minutes + seconds + ms;
307
+ }
308
+ /**
309
+ * Connects to (or creates) the SQLite database in the given directory.
310
+ * @param tmpDir - Directory containing `db.sqlite`
311
+ */
312
+ static async #connectDB(tmpDir) {
313
+ const dbPath = path.resolve(tmpDir, Archive.SQLITE_DB_FILE_NAME);
314
+ dbLog('connects database: %s', dbPath);
315
+ return await Database.connect({
316
+ type: 'sqlite3',
317
+ workingDir: tmpDir,
318
+ filename: dbPath,
319
+ });
320
+ }
321
+ /**
322
+ * Initializes an Archive instance by connecting to the database.
323
+ * @param filePath - Output `.nitpicker` file path
324
+ * @param tmpDir - Temporary working directory path
325
+ */
326
+ static async #init(filePath, tmpDir) {
327
+ const db = await Archive.#connectDB(tmpDir);
328
+ const archive = new Archive(filePath, tmpDir, db);
329
+ return archive;
330
+ }
331
+ }