@nitpicker/crawler 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. package/CHANGELOG.md +8 -0
  2. package/LICENSE +191 -0
  3. package/README.md +13 -0
  4. package/lib/archive/archive-accessor.d.ts +107 -0
  5. package/lib/archive/archive-accessor.js +264 -0
  6. package/lib/archive/archive.d.ts +174 -0
  7. package/lib/archive/archive.js +331 -0
  8. package/lib/archive/database.d.ts +207 -0
  9. package/lib/archive/database.js +972 -0
  10. package/lib/archive/debug.d.ts +8 -0
  11. package/lib/archive/debug.js +9 -0
  12. package/lib/archive/filesystem/append-text.d.ts +9 -0
  13. package/lib/archive/filesystem/append-text.js +14 -0
  14. package/lib/archive/filesystem/copy-dir-sync.d.ts +6 -0
  15. package/lib/archive/filesystem/copy-dir-sync.js +9 -0
  16. package/lib/archive/filesystem/copy-dir.d.ts +7 -0
  17. package/lib/archive/filesystem/copy-dir.js +13 -0
  18. package/lib/archive/filesystem/exists.d.ts +6 -0
  19. package/lib/archive/filesystem/exists.js +9 -0
  20. package/lib/archive/filesystem/get-file-list.d.ts +8 -0
  21. package/lib/archive/filesystem/get-file-list.js +12 -0
  22. package/lib/archive/filesystem/index.d.ts +17 -0
  23. package/lib/archive/filesystem/index.js +17 -0
  24. package/lib/archive/filesystem/is-dir.d.ts +6 -0
  25. package/lib/archive/filesystem/is-dir.js +10 -0
  26. package/lib/archive/filesystem/mkdir.d.ts +8 -0
  27. package/lib/archive/filesystem/mkdir.js +15 -0
  28. package/lib/archive/filesystem/output-json.d.ts +9 -0
  29. package/lib/archive/filesystem/output-json.js +14 -0
  30. package/lib/archive/filesystem/output-text.d.ts +11 -0
  31. package/lib/archive/filesystem/output-text.js +32 -0
  32. package/lib/archive/filesystem/read-json.d.ts +7 -0
  33. package/lib/archive/filesystem/read-json.js +11 -0
  34. package/lib/archive/filesystem/read-text.d.ts +6 -0
  35. package/lib/archive/filesystem/read-text.js +10 -0
  36. package/lib/archive/filesystem/readline.d.ts +11 -0
  37. package/lib/archive/filesystem/readline.js +26 -0
  38. package/lib/archive/filesystem/remove.d.ts +5 -0
  39. package/lib/archive/filesystem/remove.js +10 -0
  40. package/lib/archive/filesystem/rename.d.ts +11 -0
  41. package/lib/archive/filesystem/rename.js +18 -0
  42. package/lib/archive/filesystem/tar.d.ts +11 -0
  43. package/lib/archive/filesystem/tar.js +22 -0
  44. package/lib/archive/filesystem/untar.d.ts +20 -0
  45. package/lib/archive/filesystem/untar.js +24 -0
  46. package/lib/archive/filesystem/utils.d.ts +109 -0
  47. package/lib/archive/filesystem/utils.js +185 -0
  48. package/lib/archive/filesystem/zip.d.ts +29 -0
  49. package/lib/archive/filesystem/zip.js +53 -0
  50. package/lib/archive/index.d.ts +6 -0
  51. package/lib/archive/index.js +11 -0
  52. package/lib/archive/page.d.ts +263 -0
  53. package/lib/archive/page.js +316 -0
  54. package/lib/archive/resource.d.ts +46 -0
  55. package/lib/archive/resource.js +62 -0
  56. package/lib/archive/safe-path.d.ts +9 -0
  57. package/lib/archive/safe-path.js +17 -0
  58. package/lib/archive/types.d.ts +210 -0
  59. package/lib/archive/types.js +1 -0
  60. package/lib/crawler/clear-destination-cache.d.ts +5 -0
  61. package/lib/crawler/clear-destination-cache.js +8 -0
  62. package/lib/crawler/crawler.d.ts +73 -0
  63. package/lib/crawler/crawler.js +748 -0
  64. package/lib/crawler/decompose-url.d.ts +25 -0
  65. package/lib/crawler/decompose-url.js +71 -0
  66. package/lib/crawler/destination-cache.d.ts +7 -0
  67. package/lib/crawler/destination-cache.js +6 -0
  68. package/lib/crawler/detect-pagination-pattern.d.ts +16 -0
  69. package/lib/crawler/detect-pagination-pattern.js +61 -0
  70. package/lib/crawler/fetch-destination.d.ts +38 -0
  71. package/lib/crawler/fetch-destination.js +208 -0
  72. package/lib/crawler/fetch-robots-txt.d.ts +42 -0
  73. package/lib/crawler/fetch-robots-txt.js +44 -0
  74. package/lib/crawler/find-best-matching-scope.d.ts +12 -0
  75. package/lib/crawler/find-best-matching-scope.js +46 -0
  76. package/lib/crawler/generate-predicted-urls.d.ts +13 -0
  77. package/lib/crawler/generate-predicted-urls.js +27 -0
  78. package/lib/crawler/handle-ignore-and-skip.d.ts +16 -0
  79. package/lib/crawler/handle-ignore-and-skip.js +19 -0
  80. package/lib/crawler/handle-resource-response.d.ts +13 -0
  81. package/lib/crawler/handle-resource-response.js +16 -0
  82. package/lib/crawler/handle-scrape-end.d.ts +24 -0
  83. package/lib/crawler/handle-scrape-end.js +82 -0
  84. package/lib/crawler/handle-scrape-error.d.ts +37 -0
  85. package/lib/crawler/handle-scrape-error.js +38 -0
  86. package/lib/crawler/index.d.ts +2 -0
  87. package/lib/crawler/index.js +2 -0
  88. package/lib/crawler/inject-scope-auth.d.ts +11 -0
  89. package/lib/crawler/inject-scope-auth.js +21 -0
  90. package/lib/crawler/is-external-url.d.ts +11 -0
  91. package/lib/crawler/is-external-url.js +12 -0
  92. package/lib/crawler/is-in-any-lower-layer.d.ts +13 -0
  93. package/lib/crawler/is-in-any-lower-layer.js +15 -0
  94. package/lib/crawler/link-list.d.ts +112 -0
  95. package/lib/crawler/link-list.js +248 -0
  96. package/lib/crawler/link-to-page-data.d.ts +14 -0
  97. package/lib/crawler/link-to-page-data.js +32 -0
  98. package/lib/crawler/net-timeout-error.d.ts +9 -0
  99. package/lib/crawler/net-timeout-error.js +11 -0
  100. package/lib/crawler/network.d.ts +30 -0
  101. package/lib/crawler/network.js +226 -0
  102. package/lib/crawler/protocol-agnostic-key.d.ts +9 -0
  103. package/lib/crawler/protocol-agnostic-key.js +11 -0
  104. package/lib/crawler/reconstruct-url.d.ts +10 -0
  105. package/lib/crawler/reconstruct-url.js +28 -0
  106. package/lib/crawler/result-handler.d.ts +118 -0
  107. package/lib/crawler/result-handler.js +153 -0
  108. package/lib/crawler/robots-checker.d.ts +26 -0
  109. package/lib/crawler/robots-checker.js +62 -0
  110. package/lib/crawler/should-discard-predicted.d.ts +14 -0
  111. package/lib/crawler/should-discard-predicted.js +31 -0
  112. package/lib/crawler/should-skip-url.d.ts +23 -0
  113. package/lib/crawler/should-skip-url.js +15 -0
  114. package/lib/crawler/speculative-pagination.d.ts +52 -0
  115. package/lib/crawler/speculative-pagination.js +215 -0
  116. package/lib/crawler/types.d.ts +119 -0
  117. package/lib/crawler/types.js +1 -0
  118. package/lib/crawler/url-filter.d.ts +56 -0
  119. package/lib/crawler/url-filter.js +110 -0
  120. package/lib/crawler-orchestrator.d.ts +142 -0
  121. package/lib/crawler-orchestrator.js +309 -0
  122. package/lib/debug.d.ts +8 -0
  123. package/lib/debug.js +9 -0
  124. package/lib/index.d.ts +16 -0
  125. package/lib/index.js +18 -0
  126. package/lib/qzilla.d.ts +136 -0
  127. package/lib/qzilla.js +292 -0
  128. package/lib/types.d.ts +27 -0
  129. package/lib/types.js +1 -0
  130. package/lib/utils/array/each-splitted.d.ts +10 -0
  131. package/lib/utils/array/each-splitted.js +14 -0
  132. package/lib/utils/array/index.d.ts +1 -0
  133. package/lib/utils/array/index.js +1 -0
  134. package/lib/utils/async/index.d.ts +1 -0
  135. package/lib/utils/async/index.js +1 -0
  136. package/lib/utils/debug.d.ts +5 -0
  137. package/lib/utils/debug.js +5 -0
  138. package/lib/utils/error/dom-evaluation-error.d.ts +7 -0
  139. package/lib/utils/error/dom-evaluation-error.js +7 -0
  140. package/lib/utils/error/error-emitter.d.ts +18 -0
  141. package/lib/utils/error/error-emitter.js +29 -0
  142. package/lib/utils/error/index.d.ts +3 -0
  143. package/lib/utils/error/index.js +2 -0
  144. package/lib/utils/event-emitter/index.d.ts +6 -0
  145. package/lib/utils/event-emitter/index.js +6 -0
  146. package/lib/utils/index.d.ts +5 -0
  147. package/lib/utils/index.js +5 -0
  148. package/lib/utils/network/index.d.ts +1 -0
  149. package/lib/utils/network/index.js +1 -0
  150. package/lib/utils/object/clean-object.d.ts +8 -0
  151. package/lib/utils/object/clean-object.js +13 -0
  152. package/lib/utils/object/index.d.ts +1 -0
  153. package/lib/utils/object/index.js +1 -0
  154. package/lib/utils/path/index.d.ts +1 -0
  155. package/lib/utils/path/index.js +1 -0
  156. package/lib/utils/path/safe-filepath.d.ts +7 -0
  157. package/lib/utils/path/safe-filepath.js +12 -0
  158. package/lib/utils/regexp/index.d.ts +1 -0
  159. package/lib/utils/regexp/index.js +1 -0
  160. package/lib/utils/retryable/index.d.ts +2 -0
  161. package/lib/utils/retryable/index.js +1 -0
  162. package/lib/utils/sort/index.d.ts +14 -0
  163. package/lib/utils/sort/index.js +61 -0
  164. package/lib/utils/sort/remove-matches.d.ts +9 -0
  165. package/lib/utils/sort/remove-matches.js +23 -0
  166. package/lib/utils/types/index.d.ts +1 -0
  167. package/lib/utils/types/index.js +1 -0
  168. package/lib/utils/types/types.d.ts +46 -0
  169. package/lib/utils/types/types.js +1 -0
  170. package/lib/utils/url/index.d.ts +5 -0
  171. package/lib/utils/url/index.js +5 -0
  172. package/lib/utils/url/is-lower-layer.d.ts +15 -0
  173. package/lib/utils/url/is-lower-layer.js +55 -0
  174. package/lib/utils/url/parse-url.d.ts +11 -0
  175. package/lib/utils/url/parse-url.js +20 -0
  176. package/lib/utils/url/path-match.d.ts +11 -0
  177. package/lib/utils/url/path-match.js +18 -0
  178. package/lib/utils/url/sort-url.d.ts +10 -0
  179. package/lib/utils/url/sort-url.js +24 -0
  180. package/lib/utils/url/url-partial-match.d.ts +11 -0
  181. package/lib/utils/url/url-partial-match.js +32 -0
  182. package/package.json +49 -0
  183. package/src/archive/__mock__/.gitignore +3 -0
  184. package/src/archive/__mock__/mock.sqlite +0 -0
  185. package/src/archive/archive-accessor.ts +337 -0
  186. package/src/archive/archive.ts +408 -0
  187. package/src/archive/database.spec.ts +469 -0
  188. package/src/archive/database.ts +1059 -0
  189. package/src/archive/debug.ts +10 -0
  190. package/src/archive/filesystem/append-text.spec.ts +26 -0
  191. package/src/archive/filesystem/append-text.ts +16 -0
  192. package/src/archive/filesystem/copy-dir-sync.spec.ts +27 -0
  193. package/src/archive/filesystem/copy-dir-sync.ts +10 -0
  194. package/src/archive/filesystem/copy-dir.spec.ts +33 -0
  195. package/src/archive/filesystem/copy-dir.ts +14 -0
  196. package/src/archive/filesystem/exists.spec.ts +33 -0
  197. package/src/archive/filesystem/exists.ts +10 -0
  198. package/src/archive/filesystem/get-file-list.spec.ts +37 -0
  199. package/src/archive/filesystem/get-file-list.ts +13 -0
  200. package/src/archive/filesystem/index.ts +17 -0
  201. package/src/archive/filesystem/is-dir.spec.ts +29 -0
  202. package/src/archive/filesystem/is-dir.ts +11 -0
  203. package/src/archive/filesystem/mkdir.spec.ts +37 -0
  204. package/src/archive/filesystem/mkdir.ts +16 -0
  205. package/src/archive/filesystem/output-json.spec.ts +34 -0
  206. package/src/archive/filesystem/output-json.ts +16 -0
  207. package/src/archive/filesystem/output-text.spec.ts +31 -0
  208. package/src/archive/filesystem/output-text.ts +35 -0
  209. package/src/archive/filesystem/read-json.spec.ts +26 -0
  210. package/src/archive/filesystem/read-json.ts +12 -0
  211. package/src/archive/filesystem/read-text.spec.ts +25 -0
  212. package/src/archive/filesystem/read-text.ts +11 -0
  213. package/src/archive/filesystem/readline.spec.ts +29 -0
  214. package/src/archive/filesystem/readline.ts +30 -0
  215. package/src/archive/filesystem/remove.spec.ts +34 -0
  216. package/src/archive/filesystem/remove.ts +11 -0
  217. package/src/archive/filesystem/rename.spec.ts +46 -0
  218. package/src/archive/filesystem/rename.ts +21 -0
  219. package/src/archive/filesystem/tar.spec.ts +33 -0
  220. package/src/archive/filesystem/tar.ts +27 -0
  221. package/src/archive/filesystem/untar.spec.ts +34 -0
  222. package/src/archive/filesystem/untar.ts +36 -0
  223. package/src/archive/index.ts +13 -0
  224. package/src/archive/page.spec.ts +368 -0
  225. package/src/archive/page.ts +420 -0
  226. package/src/archive/resource.spec.ts +101 -0
  227. package/src/archive/resource.ts +73 -0
  228. package/src/archive/safe-path.spec.ts +44 -0
  229. package/src/archive/safe-path.ts +18 -0
  230. package/src/archive/types.ts +227 -0
  231. package/src/crawler/clear-destination-cache.spec.ts +20 -0
  232. package/src/crawler/clear-destination-cache.ts +9 -0
  233. package/src/crawler/crawler.ts +873 -0
  234. package/src/crawler/decompose-url.spec.ts +48 -0
  235. package/src/crawler/decompose-url.ts +90 -0
  236. package/src/crawler/destination-cache.spec.ts +23 -0
  237. package/src/crawler/destination-cache.ts +8 -0
  238. package/src/crawler/detect-pagination-pattern.spec.ts +169 -0
  239. package/src/crawler/detect-pagination-pattern.ts +66 -0
  240. package/src/crawler/fetch-destination.ts +257 -0
  241. package/src/crawler/fetch-robots-txt.spec.ts +83 -0
  242. package/src/crawler/fetch-robots-txt.ts +91 -0
  243. package/src/crawler/find-best-matching-scope.spec.ts +39 -0
  244. package/src/crawler/find-best-matching-scope.ts +57 -0
  245. package/src/crawler/generate-predicted-urls.spec.ts +42 -0
  246. package/src/crawler/generate-predicted-urls.ts +34 -0
  247. package/src/crawler/handle-ignore-and-skip.spec.ts +66 -0
  248. package/src/crawler/handle-ignore-and-skip.ts +30 -0
  249. package/src/crawler/handle-resource-response.spec.ts +45 -0
  250. package/src/crawler/handle-resource-response.ts +21 -0
  251. package/src/crawler/handle-scrape-end.spec.ts +109 -0
  252. package/src/crawler/handle-scrape-end.ts +115 -0
  253. package/src/crawler/handle-scrape-error.spec.ts +105 -0
  254. package/src/crawler/handle-scrape-error.ts +58 -0
  255. package/src/crawler/index.ts +2 -0
  256. package/src/crawler/inject-scope-auth.spec.ts +36 -0
  257. package/src/crawler/inject-scope-auth.ts +27 -0
  258. package/src/crawler/is-external-url.spec.ts +31 -0
  259. package/src/crawler/is-external-url.ts +17 -0
  260. package/src/crawler/is-in-any-lower-layer.spec.ts +31 -0
  261. package/src/crawler/is-in-any-lower-layer.ts +22 -0
  262. package/src/crawler/link-list.spec.ts +355 -0
  263. package/src/crawler/link-list.ts +275 -0
  264. package/src/crawler/link-to-page-data.spec.ts +133 -0
  265. package/src/crawler/link-to-page-data.ts +34 -0
  266. package/src/crawler/net-timeout-error.spec.ts +25 -0
  267. package/src/crawler/net-timeout-error.ts +11 -0
  268. package/src/crawler/protocol-agnostic-key.spec.ts +40 -0
  269. package/src/crawler/protocol-agnostic-key.ts +11 -0
  270. package/src/crawler/reconstruct-url.spec.ts +37 -0
  271. package/src/crawler/reconstruct-url.ts +37 -0
  272. package/src/crawler/robots-checker.spec.ts +104 -0
  273. package/src/crawler/robots-checker.ts +73 -0
  274. package/src/crawler/should-discard-predicted.spec.ts +125 -0
  275. package/src/crawler/should-discard-predicted.ts +33 -0
  276. package/src/crawler/should-skip-url.spec.ts +77 -0
  277. package/src/crawler/should-skip-url.ts +37 -0
  278. package/src/crawler/types.ts +146 -0
  279. package/src/crawler-orchestrator.ts +401 -0
  280. package/src/debug.ts +10 -0
  281. package/src/index.ts +25 -0
  282. package/src/types.ts +30 -0
  283. package/src/utils/array/each-splitted.spec.ts +38 -0
  284. package/src/utils/array/each-splitted.ts +19 -0
  285. package/src/utils/array/index.ts +1 -0
  286. package/src/utils/debug.ts +6 -0
  287. package/src/utils/error/dom-evaluation-error.spec.ts +20 -0
  288. package/src/utils/error/dom-evaluation-error.ts +6 -0
  289. package/src/utils/error/error-emitter.spec.ts +78 -0
  290. package/src/utils/error/error-emitter.ts +44 -0
  291. package/src/utils/error/index.ts +3 -0
  292. package/src/utils/index.ts +5 -0
  293. package/src/utils/object/clean-object.spec.ts +24 -0
  294. package/src/utils/object/clean-object.ts +13 -0
  295. package/src/utils/object/index.ts +1 -0
  296. package/src/utils/types/index.ts +1 -0
  297. package/src/utils/types/types.ts +65 -0
  298. package/tsconfig.json +11 -0
  299. package/tsconfig.tsbuildinfo +1 -0
@@ -0,0 +1,972 @@
1
/**
 * Decorator runtime helper: invokes every function in `initializers`, in
 * order, with `thisArg` as the receiver.
 *
 * When a third argument is passed, the value is threaded through the chain
 * (each initializer receives the previous result) and the final value is
 * returned. When it is omitted, initializers are called without arguments and
 * the helper returns `undefined`.
 */
var __runInitializers = (this && this.__runInitializers) || function (thisArg, initializers, value) {
    // `arguments.length` (not `value !== undefined`) decides the mode, so an
    // explicit `undefined` third argument still threads the value through.
    const threadValue = arguments.length > 2;
    let current = value;
    let index = 0;
    while (index < initializers.length) {
        const initializer = initializers[index];
        if (threadValue) {
            current = initializer.call(thisArg, current);
        }
        else {
            initializer.call(thisArg);
        }
        index += 1;
    }
    return threadValue ? current : void 0;
};
8
/**
 * Decorator runtime helper: applies `decorators` (last to first) to one class
 * element and installs the resulting descriptor.
 *
 * - `ctor` / `descriptorIn`: when no descriptor is supplied and `ctor` is
 *   given, the element's own descriptor is looked up on `ctor` (static) or
 *   `ctor.prototype` and re-defined there once all decorators have run.
 * - `contextIn`: template for the context object handed to each decorator; a
 *   fresh shallow copy (with a fresh `access` object) is built per decorator.
 * - `initializers`: receives field initializers and accessor `init` functions
 *   (prepended with `unshift`).
 * - `extraInitializers`: receives functions registered via
 *   `context.addInitializer`.
 *
 * Throws `TypeError` when a decorator returns a value of the wrong type, or
 * when `addInitializer` is called after decoration has completed.
 */
var __esDecorate = (this && this.__esDecorate) || function (ctor, descriptorIn, decorators, contextIn, initializers, extraInitializers) {
    // Passes `undefined` and functions through; anything else is rejected.
    function requireFunction(candidate) {
        if (candidate !== void 0 && typeof candidate !== "function") {
            throw new TypeError("Function expected");
        }
        return candidate;
    }
    const kind = contextIn.kind;
    // Descriptor slot that holds the decorated function for this element kind.
    let slot;
    if (kind === "getter") {
        slot = "get";
    }
    else if (kind === "setter") {
        slot = "set";
    }
    else {
        slot = "value";
    }
    let target = null;
    if (!descriptorIn && ctor) {
        target = contextIn["static"] ? ctor : ctor.prototype;
    }
    let descriptor = descriptorIn;
    if (!descriptor) {
        descriptor = target ? Object.getOwnPropertyDescriptor(target, contextIn.name) : {};
    }
    let finished = false;
    for (let position = decorators.length - 1; position >= 0; position -= 1) {
        const context = {};
        for (const prop in contextIn) {
            context[prop] = prop === "access" ? {} : contextIn[prop];
        }
        for (const accessProp in contextIn.access) {
            context.access[accessProp] = contextIn.access[accessProp];
        }
        context.addInitializer = function (f) {
            if (finished) {
                throw new TypeError("Cannot add initializers after decoration has completed");
            }
            extraInitializers.push(requireFunction(f || null));
        };
        // Read the decorator before building its input, then call it as a
        // plain function so it runs without a receiver.
        const decorate = decorators[position];
        const input = kind === "accessor"
            ? { get: descriptor.get, set: descriptor.set }
            : descriptor[slot];
        const result = decorate(input, context);
        if (kind === "accessor") {
            if (result === void 0) {
                continue;
            }
            if (result === null || typeof result !== "object") {
                throw new TypeError("Object expected");
            }
            const getter = requireFunction(result.get);
            if (getter) {
                descriptor.get = getter;
            }
            const setter = requireFunction(result.set);
            if (setter) {
                descriptor.set = setter;
            }
            const init = requireFunction(result.init);
            if (init) {
                initializers.unshift(init);
            }
            continue;
        }
        const replacement = requireFunction(result);
        if (!replacement) {
            continue;
        }
        if (kind === "field") {
            initializers.unshift(replacement);
        }
        else {
            descriptor[slot] = replacement;
        }
    }
    if (target) {
        Object.defineProperty(target, contextIn.name, descriptor);
    }
    finished = true;
};
35
+ import path from 'node:path';
36
+ import { tryParseUrl as parseUrl } from '@d-zero/shared/parse-url';
37
+ import { retry } from '@d-zero/shared/retry';
38
+ import { pathComparator } from '@d-zero/shared/sort/path';
39
+ import { TypedAwaitEventEmitter as EventEmitter } from '@d-zero/shared/typed-await-event-emitter';
40
+ import knex from 'knex';
41
+ import { ErrorEmitter, eachSplitted } from '../utils/index.js';
42
+ import { dbLog } from './debug.js';
43
+ import { mkdir } from './filesystem/index.js';
44
/**
 * Retry policy shared by every `retry(...)` decorator applied in this module.
 * NOTE(review): `interval` is presumably in milliseconds — confirm against
 * `@d-zero/shared/retry`.
 */
const retrySetting = { interval: 300, retries: 3 };
48
+ /**
49
+ * Low-level database abstraction layer for the archive's SQLite database.
50
+ *
51
+ * Manages the `pages`, `anchors`, `images`, `resources`, and `resources-referrers`
52
+ * tables. All public methods that perform database queries use the `@retryable`
53
+ * decorator for automatic retry on transient failures, and `@ErrorEmitter` to
54
+ * propagate errors as events.
55
+ *
56
+ * Use the static {@link Database.connect} factory method to create instances.
57
+ * The constructor is private.
58
+ */
59
+ let Database = (() => {
60
+ let _classSuper = EventEmitter;
61
+ let _instanceExtraInitializers = [];
62
+ let _getAnchorsOnPage_decorators;
63
+ let _getBaseUrl_decorators;
64
+ let _getConfig_decorators;
65
+ let _getCrawlingState_decorators;
66
+ let _getHtmlPathOnPage_decorators;
67
+ let _getName_decorators;
68
+ let _getPageCount_decorators;
69
+ let _getPages_decorators;
70
+ let _getPagesWithRels_decorators;
71
+ let _getRedirectsForPages_decorators;
72
+ let _getReferrersOfPage_decorators;
73
+ let _getReferrersOfResource_decorators;
74
+ let _getResources_decorators;
75
+ let _getResourceUrlList_decorators;
76
+ let _insertResource_decorators;
77
+ let _insertResourceReferrers_decorators;
78
+ let _setConfig_decorators;
79
+ let _setSkippedPage_decorators;
80
+ let _updatePage_decorators;
81
+ return class Database extends _classSuper {
82
+ static {
83
+ const _metadata = typeof Symbol === "function" && Symbol.metadata ? Object.create(_classSuper[Symbol.metadata] ?? null) : void 0;
84
+ _getAnchorsOnPage_decorators = [ErrorEmitter(), retry(retrySetting)];
85
+ _getBaseUrl_decorators = [ErrorEmitter(), retry(retrySetting)];
86
+ _getConfig_decorators = [ErrorEmitter(), retry(retrySetting)];
87
+ _getCrawlingState_decorators = [ErrorEmitter(), retry(retrySetting)];
88
+ _getHtmlPathOnPage_decorators = [ErrorEmitter(), retry(retrySetting)];
89
+ _getName_decorators = [ErrorEmitter(), retry(retrySetting)];
90
+ _getPageCount_decorators = [ErrorEmitter(), retry(retrySetting)];
91
+ _getPages_decorators = [ErrorEmitter(), retry(retrySetting)];
92
+ _getPagesWithRels_decorators = [ErrorEmitter(), retry(retrySetting)];
93
+ _getRedirectsForPages_decorators = [ErrorEmitter(), retry(retrySetting)];
94
+ _getReferrersOfPage_decorators = [ErrorEmitter(), retry(retrySetting)];
95
+ _getReferrersOfResource_decorators = [ErrorEmitter(), retry(retrySetting)];
96
+ _getResources_decorators = [ErrorEmitter(), retry(retrySetting)];
97
+ _getResourceUrlList_decorators = [ErrorEmitter(), retry(retrySetting)];
98
+ _insertResource_decorators = [ErrorEmitter(), retry(retrySetting)];
99
+ _insertResourceReferrers_decorators = [ErrorEmitter(), retry(retrySetting)];
100
+ _setConfig_decorators = [ErrorEmitter(), retry(retrySetting)];
101
+ _setSkippedPage_decorators = [ErrorEmitter(), retry(retrySetting)];
102
+ _updatePage_decorators = [ErrorEmitter(), retry(retrySetting)];
103
+ __esDecorate(this, null, _getAnchorsOnPage_decorators, { kind: "method", name: "getAnchorsOnPage", static: false, private: false, access: { has: obj => "getAnchorsOnPage" in obj, get: obj => obj.getAnchorsOnPage }, metadata: _metadata }, null, _instanceExtraInitializers);
104
+ __esDecorate(this, null, _getBaseUrl_decorators, { kind: "method", name: "getBaseUrl", static: false, private: false, access: { has: obj => "getBaseUrl" in obj, get: obj => obj.getBaseUrl }, metadata: _metadata }, null, _instanceExtraInitializers);
105
+ __esDecorate(this, null, _getConfig_decorators, { kind: "method", name: "getConfig", static: false, private: false, access: { has: obj => "getConfig" in obj, get: obj => obj.getConfig }, metadata: _metadata }, null, _instanceExtraInitializers);
106
+ __esDecorate(this, null, _getCrawlingState_decorators, { kind: "method", name: "getCrawlingState", static: false, private: false, access: { has: obj => "getCrawlingState" in obj, get: obj => obj.getCrawlingState }, metadata: _metadata }, null, _instanceExtraInitializers);
107
+ __esDecorate(this, null, _getHtmlPathOnPage_decorators, { kind: "method", name: "getHtmlPathOnPage", static: false, private: false, access: { has: obj => "getHtmlPathOnPage" in obj, get: obj => obj.getHtmlPathOnPage }, metadata: _metadata }, null, _instanceExtraInitializers);
108
+ __esDecorate(this, null, _getName_decorators, { kind: "method", name: "getName", static: false, private: false, access: { has: obj => "getName" in obj, get: obj => obj.getName }, metadata: _metadata }, null, _instanceExtraInitializers);
109
+ __esDecorate(this, null, _getPageCount_decorators, { kind: "method", name: "getPageCount", static: false, private: false, access: { has: obj => "getPageCount" in obj, get: obj => obj.getPageCount }, metadata: _metadata }, null, _instanceExtraInitializers);
110
+ __esDecorate(this, null, _getPages_decorators, { kind: "method", name: "getPages", static: false, private: false, access: { has: obj => "getPages" in obj, get: obj => obj.getPages }, metadata: _metadata }, null, _instanceExtraInitializers);
111
+ __esDecorate(this, null, _getPagesWithRels_decorators, { kind: "method", name: "getPagesWithRels", static: false, private: false, access: { has: obj => "getPagesWithRels" in obj, get: obj => obj.getPagesWithRels }, metadata: _metadata }, null, _instanceExtraInitializers);
112
+ __esDecorate(this, null, _getRedirectsForPages_decorators, { kind: "method", name: "getRedirectsForPages", static: false, private: false, access: { has: obj => "getRedirectsForPages" in obj, get: obj => obj.getRedirectsForPages }, metadata: _metadata }, null, _instanceExtraInitializers);
113
+ __esDecorate(this, null, _getReferrersOfPage_decorators, { kind: "method", name: "getReferrersOfPage", static: false, private: false, access: { has: obj => "getReferrersOfPage" in obj, get: obj => obj.getReferrersOfPage }, metadata: _metadata }, null, _instanceExtraInitializers);
114
+ __esDecorate(this, null, _getReferrersOfResource_decorators, { kind: "method", name: "getReferrersOfResource", static: false, private: false, access: { has: obj => "getReferrersOfResource" in obj, get: obj => obj.getReferrersOfResource }, metadata: _metadata }, null, _instanceExtraInitializers);
115
+ __esDecorate(this, null, _getResources_decorators, { kind: "method", name: "getResources", static: false, private: false, access: { has: obj => "getResources" in obj, get: obj => obj.getResources }, metadata: _metadata }, null, _instanceExtraInitializers);
116
+ __esDecorate(this, null, _getResourceUrlList_decorators, { kind: "method", name: "getResourceUrlList", static: false, private: false, access: { has: obj => "getResourceUrlList" in obj, get: obj => obj.getResourceUrlList }, metadata: _metadata }, null, _instanceExtraInitializers);
117
+ __esDecorate(this, null, _insertResource_decorators, { kind: "method", name: "insertResource", static: false, private: false, access: { has: obj => "insertResource" in obj, get: obj => obj.insertResource }, metadata: _metadata }, null, _instanceExtraInitializers);
118
+ __esDecorate(this, null, _insertResourceReferrers_decorators, { kind: "method", name: "insertResourceReferrers", static: false, private: false, access: { has: obj => "insertResourceReferrers" in obj, get: obj => obj.insertResourceReferrers }, metadata: _metadata }, null, _instanceExtraInitializers);
119
+ __esDecorate(this, null, _setConfig_decorators, { kind: "method", name: "setConfig", static: false, private: false, access: { has: obj => "setConfig" in obj, get: obj => obj.setConfig }, metadata: _metadata }, null, _instanceExtraInitializers);
120
+ __esDecorate(this, null, _setSkippedPage_decorators, { kind: "method", name: "setSkippedPage", static: false, private: false, access: { has: obj => "setSkippedPage" in obj, get: obj => obj.setSkippedPage }, metadata: _metadata }, null, _instanceExtraInitializers);
121
+ __esDecorate(this, null, _updatePage_decorators, { kind: "method", name: "updatePage", static: false, private: false, access: { has: obj => "updatePage" in obj, get: obj => obj.updatePage }, metadata: _metadata }, null, _instanceExtraInitializers);
122
+ if (_metadata) Object.defineProperty(this, Symbol.metadata, { enumerable: true, configurable: true, writable: true, value: _metadata });
123
+ }
124
+ /** The Knex query builder instance connected to the SQLite database. */
125
+ #instance = __runInitializers(this, _instanceExtraInitializers);
126
+ /** Absolute path to the working directory, used for resolving relative snapshot paths. */
127
+ #workingDir;
128
+ // eslint-disable-next-line no-restricted-syntax
129
+ constructor(options) {
130
+ super();
131
+ this.#workingDir = options.workingDir;
132
+ switch (options.type) {
133
+ case 'sqlite3': {
134
+ this.#instance = knex({
135
+ client: options.type,
136
+ connection: {
137
+ filename: options.filename,
138
+ },
139
+ useNullAsDefault: true,
140
+ pool: {
141
+ acquireTimeoutMillis: 600_000,
142
+ },
143
+ });
144
+ break;
145
+ }
146
+ case 'mysql': {
147
+ throw new Error("Don't support MySQL yet.");
148
+ }
149
+ }
150
+ }
151
+ /**
152
+ * Adds the `order` column to the `pages` table for URL sort ordering.
153
+ * @deprecated Since v0.1.x. The column is now created during table initialization.
154
+ * @returns The result of the schema alteration.
155
+ */
156
+ async addOrderField() {
157
+ return await this.#instance.schema.table('pages', (t) => {
158
+ t.integer('order').unsigned().nullable().defaultTo(null);
159
+ });
160
+ }
161
+ /**
162
+ * Forces a WAL checkpoint, writing all pending WAL data back to the main database file.
163
+ * Uses TRUNCATE mode to reset the WAL file to zero bytes after checkpointing.
164
+ * This ensures the database is fully self-contained in `db.sqlite` before archiving.
165
+ */
166
+ async checkpoint() {
167
+ await this.#instance.raw('PRAGMA wal_checkpoint(TRUNCATE)');
168
+ }
169
+ async destroy() {
170
+ await this.#instance.destroy();
171
+ }
172
+ /**
173
+ * Retrieves all anchors (outgoing links) on a specific page.
174
+ * Joins the `anchors` table with the `pages` table to resolve link destinations.
175
+ * @param pageId - The database ID of the page whose anchors to retrieve.
176
+ * @returns An array of anchor records with resolved URL, title, status, and content type.
177
+ */
178
+ async getAnchorsOnPage(pageId) {
179
+ const res = await this.#instance
180
+ .select('pages.url', 'pages.title', 'pages.status', 'pages.statusText', 'pages.contentType', 'anchors.hash', 'anchors.textContent')
181
+ .from('anchors')
182
+ .join('pages', 'anchors.hrefId', '=', 'pages.id')
183
+ .where('anchors.pageId', pageId);
184
+ return res;
185
+ }
186
+ /**
187
+ * Retrieves the base URL of the crawl session from the `info` table.
188
+ * @returns The base URL string.
189
+ * @throws {Error} If no base URL is found in the database.
190
+ */
191
+ async getBaseUrl() {
192
+ const selected = await this.#instance.select('baseUrl').from('info');
193
+ if (!selected[0]) {
194
+ throw new Error('No baseUrl');
195
+ }
196
+ const [{ baseUrl }] = selected;
197
+ return baseUrl || '';
198
+ }
199
+ /**
200
+ * Retrieves the full crawl configuration from the `info` table.
201
+ * Deserializes JSON-encoded fields (`excludes`, `excludeKeywords`, `scope`).
202
+ * @returns The parsed {@link Config} object.
203
+ * @throws {Error} If no configuration is found in the database.
204
+ */
205
+ async getConfig() {
206
+ const [config] = await this.#instance.select('*').from('info');
207
+ if (!config) {
208
+ throw new Error('No config');
209
+ }
210
+ const opt = {
211
+ ...config,
212
+ excludes: getJSON(config.excludes, []),
213
+ excludeKeywords: getJSON(config.excludeKeywords, []),
214
+ excludeUrls: getJSON(config.excludeUrls, []),
215
+ scope: getJSON(config.scope, []),
216
+ retry: config.retry ?? 3,
217
+ };
218
+ // @ts-expect-error
219
+ delete opt.id;
220
+ dbLog('Table `info`: %O => %O', config, opt);
221
+ return opt;
222
+ }
223
+ /**
224
+ * Retrieves the current crawling state by listing scraped and pending URLs.
225
+ * @returns An object with `scraped` (completed URLs) and `pending` (remaining URLs) arrays.
226
+ */
227
+ async getCrawlingState() {
228
+ const ex = (r) => r.url;
229
+ const $scraped = await this.#instance
230
+ .select('url')
231
+ .from('pages')
232
+ .where('scraped', 1);
233
+ const scraped = $scraped.map(ex);
234
+ const $pending = await this.#instance
235
+ .select('url')
236
+ .from('pages')
237
+ .where('scraped', 0);
238
+ const pending = $pending.map(ex);
239
+ return {
240
+ scraped,
241
+ pending,
242
+ };
243
+ }
244
+ /**
245
+ * Retrieves the HTML snapshot file path for a specific page.
246
+ * @param pageId - The database ID of the page.
247
+ * @returns The relative file path to the HTML snapshot, or null if not saved.
248
+ */
249
+ async getHtmlPathOnPage(pageId) {
250
+ return await this.#instance.transaction(async (trx) => {
251
+ const [{ html }] = await trx
252
+ .select('html')
253
+ .from('pages')
254
+ .where('id', pageId);
255
+ return html || null;
256
+ });
257
+ }
258
+ /**
259
+ * Retrieves the crawl session name from the `info` table.
260
+ * @returns The name string.
261
+ * @throws {Error} If no name is found in the database.
262
+ */
263
+ async getName() {
264
+ const selected = await this.#instance.select('name').from('info');
265
+ if (!selected[0]) {
266
+ throw new Error('No name');
267
+ }
268
+ const [{ name }] = selected;
269
+ return name;
270
+ }
271
+ /**
272
+ * Counts the total number of pages in the database.
273
+ * @returns The total page count.
274
+ * @throws {Error} If the count query fails.
275
+ */
276
+ async getPageCount() {
277
+ const selected = await this.#instance.count('id').from('pages');
278
+ if (!selected[0]) {
279
+ throw new Error('No count');
280
+ }
281
+ // @ts-expect-error
282
+ const count = selected[0]['count(`id`)'];
283
+ dbLog('Number of pages: %d', count);
284
+ return count;
285
+ }
286
+ /**
287
+ * Retrieves pages from the database with optional filtering, pagination via offset and limit.
288
+ * @param filter - An optional {@link PageFilter} to narrow results by content type and origin.
289
+ * @param offset - The number of rows to skip. Defaults to `0`.
290
+ * @param limit - The maximum number of rows to return. Defaults to `100000`.
291
+ * @returns An array of raw {@link DB_Page} rows.
292
+ */
293
+ async getPages(filter, offset = 0, limit = 100_000) {
294
+ const q = this.#instance.select('*').from('pages');
295
+ switch (filter) {
296
+ case 'page': {
297
+ return q
298
+ .where({
299
+ contentType: 'text/html',
300
+ isTarget: 1,
301
+ })
302
+ .limit(limit)
303
+ .offset(offset);
304
+ }
305
+ case 'page-included-no-target': {
306
+ return q
307
+ .where({
308
+ contentType: 'text/html',
309
+ })
310
+ .limit(limit)
311
+ .offset(offset);
312
+ }
313
+ case 'external-page': {
314
+ return q
315
+ .where({
316
+ contentType: 'text/html',
317
+ isExternal: 1,
318
+ })
319
+ .limit(limit)
320
+ .offset(offset);
321
+ }
322
+ case 'internal-page': {
323
+ return q
324
+ .where({
325
+ contentType: 'text/html',
326
+ isExternal: 0,
327
+ })
328
+ .limit(limit)
329
+ .offset(offset);
330
+ }
331
+ case 'no-page': {
332
+ return q
333
+ .whereNull('contentType')
334
+ .orWhereNot({
335
+ contentType: 'text/html',
336
+ })
337
+ .limit(limit)
338
+ .offset(offset);
339
+ }
340
+ case 'external-no-page': {
341
+ return q
342
+ .where((qb) => {
343
+ qb.whereNull('contentType').orWhereNot({
344
+ contentType: 'text/html',
345
+ });
346
+ })
347
+ .andWhere({
348
+ isExternal: 1,
349
+ })
350
+ .limit(limit)
351
+ .offset(offset);
352
+ }
353
+ case 'internal-no-page': {
354
+ return q
355
+ .where((qb) => {
356
+ qb.whereNull('contentType').orWhereNot({
357
+ contentType: 'text/html',
358
+ });
359
+ })
360
+ .andWhere({
361
+ isExternal: 0,
362
+ })
363
+ .limit(limit)
364
+ .offset(offset);
365
+ }
366
+ }
367
+ return q.limit(limit).offset(offset);
368
+ }
369
+ /**
370
+ * Retrieves pages along with their related redirect, anchor, and referrer data.
371
+ * Results are ordered by the natural URL sort order. Only non-redirected pages are returned.
372
+ * @param offset - The number of rows to skip.
373
+ * @param limit - The maximum number of pages to return.
374
+ * @returns An object containing `pages`, `redirects`, `anchors`, and `referrers` arrays.
375
+ */
376
+ async getPagesWithRels(offset, limit) {
377
+ await this.addOrderField().catch((error) => error);
378
+ await this.setUrlOrder();
379
+ dbLog('Get Pages');
380
+ const pages = await this.#instance
381
+ .select('*')
382
+ .from('pages')
383
+ .orderByRaw('`order` ASC NULLS LAST')
384
+ .whereNull('redirectDestId')
385
+ .limit(limit)
386
+ .offset(offset);
387
+ // When empty
388
+ if (pages.length === 0) {
389
+ return {
390
+ pages: [],
391
+ redirects: [],
392
+ referrers: [],
393
+ anchors: [],
394
+ };
395
+ }
396
+ dbLog('Get Pages: Redirects');
397
+ const redirects = await this.#instance
398
+ .with('limitedPages', limitedPageIds(limit, offset))
399
+ .with('redirect', redirectTable(false))
400
+ .select('id as pageId', 'from', 'fromId')
401
+ .from('redirect')
402
+ // Filter
403
+ .join('limitedPages', 'redirect.toId', '=', 'limitedPages.id')
404
+ // Sort
405
+ .orderBy('id', 'asc');
406
+ dbLog('Get Pages: Anchors');
407
+ const anchors = await this.#instance
408
+ .with('limitedPages', limitedPageIds(limit, offset))
409
+ .with('redirect', redirectTable())
410
+ .select('limitedPages.id as pageId', 'href.url', 'redirect.from as href', 'href.isExternal', 'href.title', 'href.status', 'href.statusText', 'href.contentType', 'anchors.hash', 'anchors.textContent')
411
+ .from('anchors')
412
+ // Filters
413
+ .join('limitedPages', 'anchors.pageId', '=', 'limitedPages.id')
414
+ // Resolves redirect
415
+ .join('redirect', 'anchors.hrefId', '=', 'redirect.fromId')
416
+ // Target
417
+ .join('pages as href', 'redirect.toId', '=', 'href.id')
418
+ // Sort
419
+ .orderBy('anchors.id', 'asc');
420
+ dbLog('Get Pages: Referrers');
421
+ const referrers = await this.#instance
422
+ .with('limitedPages', limitedPageIds(limit, offset))
423
+ .with('redirect', redirectTable())
424
+ .select('redirect.toId as pageId', 'referrer.url', 'redirect.from as through', 'redirect.fromId as throughId', 'anchors.hash', 'anchors.textContent')
425
+ .from('anchors')
426
+ // Resolves redirect
427
+ .join('redirect', 'anchors.hrefId', '=', 'redirect.fromId')
428
+ // Referrer
429
+ .join('pages as referrer', 'anchors.pageId', '=', 'referrer.id')
430
+ // Filters
431
+ .join('limitedPages', 'redirect.toId', '=', 'limitedPages.id')
432
+ // Sort
433
+ .orderBy('anchors.id', 'asc');
434
+ dbLog('Get Pages: Done');
435
+ return {
436
+ pages,
437
+ redirects,
438
+ anchors,
439
+ referrers,
440
+ };
441
+ }
442
+ /**
443
+ * Retrieves redirect sources for the given page IDs in bulk.
444
+ * @param pageIds - The database IDs of the destination pages.
445
+ * @returns An array of {@link DB_Redirect} records mapping destination pages to their redirect sources.
446
+ */
447
+ async getRedirectsForPages(pageIds) {
448
+ if (pageIds.length === 0)
449
+ return [];
450
+ return this.#instance
451
+ .select('redirectDestId as pageId', 'url as from', 'id as fromId')
452
+ .from('pages')
453
+ .whereIn('redirectDestId', pageIds);
454
+ }
455
+ /**
456
+ * Retrieves pages that link to a specific page (incoming links / referrers).
457
+ * @param pageId - The database ID of the target page.
458
+ * @returns An array of referrer records with URL, hash, and text content.
459
+ */
460
+ async getReferrersOfPage(pageId) {
461
+ const res = await this.#instance
462
+ .select('pages.url', 'anchors.hash', 'anchors.textContent')
463
+ .from('anchors')
464
+ .join('pages', 'anchors.pageId', '=', 'pages.id')
465
+ .where('anchors.hrefId', pageId);
466
+ return res;
467
+ }
468
+ /**
469
+ * Retrieves the page URLs that reference a specific resource.
470
+ * @param id - The database ID of the resource.
471
+ * @returns An array of page URL strings that reference the resource.
472
+ */
473
+ async getReferrersOfResource(id) {
474
+ const res = await this.#instance
475
+ .select('pages.url')
476
+ .from('resources-referrers')
477
+ .join('resources', 'resources.id', '=', 'resources-referrers.resourceId')
478
+ .join('pages', 'pages.id', '=', 'resources-referrers.pageId')
479
+ .where('resources.id', id);
480
+ return res.map((r) => r.url);
481
+ }
482
+ /**
483
+ * Retrieves all sub-resources from the `resources` table.
484
+ * @returns An array of raw {@link DB_Resource} rows.
485
+ */
486
+ async getResources() {
487
+ return this.#instance.select('*').from('resources');
488
+ }
489
+ /**
490
+ * Retrieves a flat list of all resource URLs from the `resources` table.
491
+ * @returns An array of resource URL strings.
492
+ */
493
+ async getResourceUrlList() {
494
+ const res = await this.#instance.select('url').from('resources');
495
+ return res.map((r) => r.url);
496
+ }
497
+ /**
498
+ * Inserts a sub-resource into the `resources` table.
499
+ * Ignores duplicate URLs (uses `ON CONFLICT IGNORE`).
500
+ * @param resource - The resource data to insert.
501
+ */
502
+ async insertResource(resource) {
503
+ await this.#instance
504
+ .from('resources')
505
+ .insert({
506
+ url: resource.url.href,
507
+ isExternal: resource.isExternal ? 1 : 0,
508
+ status: resource.status,
509
+ statusText: resource.statusText,
510
+ contentType: resource.contentType,
511
+ contentLength: resource.contentLength,
512
+ compress: resource.compress || 0,
513
+ cdn: resource.cdn || 0,
514
+ responseHeaders: JSON.stringify(resource.headers),
515
+ })
516
+ .onConflict('url')
517
+ .ignore();
518
+ }
519
+ /**
520
+ * Inserts a referrer relationship between a resource and a page into the
521
+ * `resources-referrers` table. Silently skips if the resource is not found.
522
+ * @param src - The URL of the resource.
523
+ * @param pageUrl - The URL of the page that references the resource.
524
+ */
525
+ async insertResourceReferrers(src, pageUrl) {
526
+ const selected = await this.#instance
527
+ .select('id')
528
+ .from('resources')
529
+ .where('url', src);
530
+ if (!selected[0]) {
531
+ // Ignore when the resource is not found
532
+ return;
533
+ }
534
+ const [{ id: resourceId }] = selected;
535
+ const pageId = await this.#getIdByUrl(pageUrl);
536
+ await this.#instance('resources-referrers').insert({
537
+ resourceId,
538
+ pageId,
539
+ });
540
+ }
541
+ /**
542
+ * Stores the crawl configuration in the `info` table.
543
+ * Serializes array fields (`excludes`, `excludeKeywords`, `scope`) as JSON strings.
544
+ * @param config - The {@link Config} object to store.
545
+ */
546
+ async setConfig(config) {
547
+ return this.#instance.from('info').insert({
548
+ ...config,
549
+ // @ts-expect-error
550
+ excludes: JSON.stringify(config.excludes),
551
+ // @ts-expect-error
552
+ excludeKeywords: JSON.stringify(config.excludeKeywords),
553
+ // @ts-expect-error
554
+ excludeUrls: JSON.stringify(config.excludeUrls),
555
+ // @ts-expect-error
556
+ scope: JSON.stringify(config.scope),
557
+ });
558
+ }
559
+ /**
560
+ * Marks a page as skipped in the database with the given reason.
561
+ * Creates the page row if it does not already exist.
562
+ * @param url - The URL of the skipped page.
563
+ * @param reason - The reason the page was skipped.
564
+ * @param isExternal - Whether the page is on an external domain. Defaults to `false`.
565
+ */
566
+ async setSkippedPage(url, reason, isExternal = false) {
567
+ const pageId = await this.#getIdByUrl(url, isExternal ? 1 : 0);
568
+ await this.#instance('pages')
569
+ .where('id', pageId)
570
+ .update({
571
+ scraped: 1,
572
+ isExternal: isExternal ? 1 : 0,
573
+ isSkipped: 1,
574
+ skipReason: reason,
575
+ });
576
+ }
577
+ /**
578
+ * Assigns natural URL sort order values to all internal pages.
579
+ * Pages are sorted using {@link pathComparator} and assigned sequential order numbers.
580
+ */
581
+ async setUrlOrder() {
582
+ dbLog('Set URL Order');
583
+ const res = await this.#instance
584
+ .select('id', 'url')
585
+ .from('pages')
586
+ .where('isExternal', '=', 0);
587
+ const sorted = res.toSorted((a, b) => pathComparator(a.url, b.url));
588
+ // Batch update using chunked CASE statements to avoid N+1 queries
589
+ const BATCH_SIZE = 500;
590
+ for (let i = 0; i < sorted.length; i += BATCH_SIZE) {
591
+ const batch = sorted.slice(i, i + BATCH_SIZE);
592
+ const ids = batch.map((row) => row.id);
593
+ const bindings = [];
594
+ const cases = batch
595
+ .map((row, j) => {
596
+ bindings.push(row.id, i + j + 1);
597
+ return 'WHEN ? THEN ?';
598
+ })
599
+ .join(' ');
600
+ const placeholders = ids.map(() => '?').join(',');
601
+ await this.#instance.raw(`UPDATE pages SET \`order\` = CASE id ${cases} END WHERE id IN (${placeholders})`, [...bindings, ...ids]);
602
+ }
603
+ }
604
+ /**
605
+ * Inserts or updates a crawled page in the database, including its redirect chain,
606
+ * anchors, and images. Optionally creates an HTML snapshot file path entry.
607
+ * @param page - The page data to store.
608
+ * @param snapshotDir - The directory for saving HTML snapshots, or null to skip snapshots.
609
+ * @param isTarget - Whether this page is a crawl target.
610
+ * @returns An object with the optional `html` snapshot file path and the page's database `pageId`.
611
+ */
612
+ async updatePage(page, snapshotDir, isTarget) {
613
+ let destUrl = page.url.withoutHashAndAuth;
614
+ const redirectPaths = [...page.redirectPaths];
615
+ if (redirectPaths.length > 0) {
616
+ destUrl = redirectPaths.pop();
617
+ redirectPaths.unshift(page.url.withoutHashAndAuth);
618
+ }
619
+ const destUrlObject = parseUrl(destUrl);
620
+ if (!destUrlObject) {
621
+ throw new Error(`Failed to parse URL: ${destUrl}`);
622
+ }
623
+ return await this.#instance.transaction(async (trx) => {
624
+ const pageId = await this.#insertPage({
625
+ ...page,
626
+ url: destUrlObject,
627
+ }, isTarget, trx);
628
+ for (const redirect of redirectPaths) {
629
+ dbLog('Set redirected url: %s -> %s', redirect, destUrl);
630
+ const redirectId = await this.#getIdByUrl(redirect, undefined, trx);
631
+ await trx('pages')
632
+ .where('id', redirectId)
633
+ .update({
634
+ scraped: 1,
635
+ redirectDestId: pageId,
636
+ isExternal: page.isExternal ? 1 : 0,
637
+ });
638
+ }
639
+ let snapshot = { pageId };
640
+ if (isTarget && snapshotDir) {
641
+ snapshot = await this.#updateSnapshotPath(pageId, snapshotDir, trx);
642
+ }
643
+ const anchors = await Promise.all(page.anchorList.map(async (anchor) => {
644
+ const hrefId = await this.#getIdByUrl(anchor.href.withoutHashAndAuth, anchor.isExternal ? 1 : 0, trx);
645
+ return {
646
+ pageId,
647
+ hrefId,
648
+ hash: anchor.href.hash,
649
+ textContent: anchor.textContent,
650
+ };
651
+ }));
652
+ dbLog('Insert anchors.length: %d', anchors.length);
653
+ if (anchors.length > 0) {
654
+ await eachSplitted(anchors, 100, async (_anchors) => {
655
+ await trx('anchors').insert(_anchors);
656
+ });
657
+ }
658
+ const images = page.imageList.map((image) => ({
659
+ pageId,
660
+ ...image,
661
+ }));
662
+ dbLog('Insert images.length: %d', images.length);
663
+ if (images.length > 0) {
664
+ await eachSplitted(images, 100, async (_images) => {
665
+ await trx('images').insert(_images);
666
+ });
667
+ }
668
+ return snapshot;
669
+ });
670
+ }
671
+ /**
672
+ * Returns the database ID for a URL, creating a new page row if needed.
673
+ * Uses `ON CONFLICT IGNORE` to handle race conditions in concurrent inserts.
674
+ * @param url
675
+ * @param isExternal
676
+ * @param trx
677
+ */
678
+ async #getIdByUrl(url, isExternal, trx) {
679
+ const qb = trx ?? this.#instance;
680
+ const [record] = await qb.select('id').from('pages').where('url', url);
681
+ // Must use `?` because it may be `undefined`
682
+ const pageId = record?.id ?? Number.NaN;
683
+ if (Number.isFinite(pageId)) {
684
+ return pageId;
685
+ }
686
+ const insertedRows = await qb('pages')
687
+ .insert({
688
+ url,
689
+ scraped: 0,
690
+ isTarget: 0,
691
+ ...(isExternal != null && { isExternal }),
692
+ })
693
+ .onConflict('url')
694
+ .ignore();
695
+ const [insertedId] = insertedRows;
696
+ if (!insertedId) {
697
+ // onConflict.ignore() returns 0 on race condition — re-select
698
+ const [existing] = await qb.select('id').from('pages').where('url', url);
699
+ if (existing?.id) {
700
+ return existing.id;
701
+ }
702
+ throw new Error(`Failed to insert a new page: ${url}`);
703
+ }
704
+ return insertedId;
705
+ }
706
+ /**
707
+ * Initializes the database schema if tables do not exist.
708
+ * Enables WAL journal mode and foreign keys, then creates all tables
709
+ * (`info`, `pages`, `anchors`, `images`, `resources`, `resources-referrers`).
710
+ */
711
+ async #init() {
712
+ const isExists = await this.#instance.schema.hasTable('info');
713
+ if (isExists) {
714
+ return;
715
+ }
716
+ // Enable WAL mode and foreign keys for better performance and data integrity
717
+ await this.#instance.raw('PRAGMA journal_mode = WAL');
718
+ await this.#instance.raw('PRAGMA foreign_keys = ON');
719
+ await this.#instance.schema
720
+ .createTable('info', (t) => {
721
+ t.increments('id');
722
+ t.string('version');
723
+ t.string('name');
724
+ t.string('baseUrl');
725
+ t.boolean('recursive');
726
+ t.boolean('useSubprocess');
727
+ t.integer('interval');
728
+ t.boolean('image');
729
+ t.boolean('fetchExternal');
730
+ t.integer('parallels');
731
+ t.json('scope');
732
+ t.json('excludes');
733
+ t.json('excludeKeywords');
734
+ t.json('excludeUrls');
735
+ t.integer('maxExcludedDepth');
736
+ t.integer('retry');
737
+ t.boolean('fromList');
738
+ t.boolean('disableQueries');
739
+ })
740
+ .createTable('pages', (t) => {
741
+ t.increments('id');
742
+ t.string('url', 8190).notNullable().unique();
743
+ t.integer('redirectDestId').unsigned().references('pages.id').defaultTo(null);
744
+ t.boolean('scraped').notNullable();
745
+ t.boolean('isTarget').notNullable();
746
+ t.boolean('isExternal');
747
+ t.integer('status');
748
+ t.string('statusText');
749
+ t.string('contentType').nullable();
750
+ t.integer('contentLength').unsigned().nullable();
751
+ t.json('responseHeaders').nullable();
752
+ t.string('lang');
753
+ t.string('title');
754
+ t.string('description');
755
+ t.string('keywords');
756
+ t.boolean('noindex');
757
+ t.boolean('nofollow');
758
+ t.boolean('noarchive');
759
+ t.string('canonical');
760
+ t.string('alternate');
761
+ t.string('og_type');
762
+ t.string('og_title');
763
+ t.string('og_site_name');
764
+ t.string('og_description');
765
+ t.string('og_url');
766
+ t.string('og_image');
767
+ t.string('twitter_card');
768
+ t.string('html');
769
+ t.boolean('isSkipped');
770
+ t.string('skipReason');
771
+ t.integer('order').unsigned().nullable();
772
+ t.index('isExternal');
773
+ t.index('contentType');
774
+ t.index('scraped');
775
+ t.index('redirectDestId');
776
+ t.index('order');
777
+ })
778
+ .createTable('anchors', (t) => {
779
+ t.increments('id');
780
+ t.integer('pageId').notNullable().unsigned().references('pages.id');
781
+ t.integer('hrefId').notNullable().unsigned().references('pages.id');
782
+ t.string('hash');
783
+ t.string('textContent').nullable();
784
+ t.index('pageId');
785
+ t.index('hrefId');
786
+ })
787
+ .createTable('images', (t) => {
788
+ t.increments('id');
789
+ t.integer('pageId').notNullable().unsigned().references('pages.id');
790
+ t.string('src', 8190);
791
+ t.string('currentSrc', 8190);
792
+ t.string('alt');
793
+ t.float('width').unsigned().notNullable();
794
+ t.float('height').unsigned().notNullable();
795
+ t.integer('naturalWidth').unsigned().notNullable();
796
+ t.integer('naturalHeight').unsigned().notNullable();
797
+ t.boolean('isLazy');
798
+ t.integer('viewportWidth').unsigned().notNullable();
799
+ t.string('sourceCode');
800
+ t.index('pageId');
801
+ })
802
+ .createTable('resources', (t) => {
803
+ t.increments('id');
804
+ t.string('url', 8190).notNullable().unique();
805
+ t.boolean('isExternal');
806
+ t.integer('status');
807
+ t.string('statusText');
808
+ t.string('contentType').nullable();
809
+ t.integer('contentLength').unsigned().nullable();
810
+ t.string('compress').nullable();
811
+ t.string('cdn').nullable();
812
+ t.json('responseHeaders').nullable();
813
+ })
814
+ .createTable('resources-referrers', (t) => {
815
+ t.increments('id');
816
+ t.integer('resourceId').notNullable().unsigned().references('resources.id');
817
+ t.integer('pageId').notNullable().unsigned().references('pages.id');
818
+ t.unique(['resourceId', 'pageId']);
819
+ t.index('resourceId');
820
+ t.index('pageId');
821
+ });
822
+ }
823
+ /**
824
+ * Upserts page data into the `pages` table (inserts if new, updates if existing).
825
+ * @param page
826
+ * @param isTarget
827
+ * @param trx
828
+ */
829
+ async #insertPage(page, isTarget, trx) {
830
+ const qb = trx ?? this.#instance;
831
+ const pageId = await this.#getIdByUrl(page.url.withoutHashAndAuth, undefined, trx);
832
+ await qb('pages')
833
+ .where('id', pageId)
834
+ .update({
835
+ scraped: true,
836
+ isTarget,
837
+ isExternal: page.isExternal,
838
+ status: page.status,
839
+ statusText: page.statusText,
840
+ contentType: page.contentType,
841
+ contentLength: page.contentLength,
842
+ responseHeaders: JSON.stringify(page.responseHeaders),
843
+ lang: page.meta.lang,
844
+ title: page.meta.title,
845
+ description: page.meta.description,
846
+ keywords: page.meta.keywords,
847
+ noindex: page.meta.noindex,
848
+ nofollow: page.meta.nofollow,
849
+ noarchive: page.meta.noarchive,
850
+ canonical: page.meta.canonical,
851
+ alternate: page.meta.alternate,
852
+ og_type: page.meta['og:type'],
853
+ og_title: page.meta['og:title'],
854
+ og_site_name: page.meta['og:site_name'],
855
+ og_description: page.meta['og:description'],
856
+ og_url: page.meta['og:url'],
857
+ og_image: page.meta['og:image'],
858
+ twitter_card: page.meta['twitter:card'],
859
+ isSkipped: page.isSkipped,
860
+ });
861
+ return pageId;
862
+ }
863
+ /**
864
+ * Assigns and persists the HTML snapshot file path for a page.
865
+ * @param pageId
866
+ * @param snapshotDir
867
+ * @param trx
868
+ */
869
+ async #updateSnapshotPath(pageId, snapshotDir, trx) {
870
+ const qb = trx ?? this.#instance;
871
+ const snapshotHtmlPath = path.resolve(snapshotDir, `${pageId}.html`);
872
+ const snapshotRelHtmlPath = path.relative(this.#workingDir, snapshotHtmlPath);
873
+ await qb('pages').where('id', pageId).update({
874
+ html: snapshotRelHtmlPath,
875
+ });
876
+ return {
877
+ html: snapshotHtmlPath,
878
+ pageId,
879
+ };
880
+ }
881
+ /**
882
+ * Creates and initializes a new Database instance.
883
+ * Creates the parent directory for the database file if needed,
884
+ * establishes the connection, and initializes tables if they do not exist.
885
+ * @param options - The database connection options specifying the type and file path.
886
+ * @returns A fully initialized Database instance.
887
+ */
888
+ static async connect(options) {
889
+ switch (options.type) {
890
+ case 'sqlite3': {
891
+ mkdir(options.filename);
892
+ break;
893
+ }
894
+ }
895
+ const db = new Database(options);
896
+ await db.#init();
897
+ return db;
898
+ }
899
+ };
900
+ })();
901
+ export { Database };
902
+ // ----- ----- ----- ----- -----
903
+ //
904
+ // Common Queries
905
+ //
906
+ // ----- ----- ----- ----- -----
907
/**
 * Returns a callback for Knex's `with()` that builds the sub-select of page IDs
 * for one pagination window: non-redirected pages ordered by the `order`
 * column (nulls last).
 *
 * The callback is synchronous on purpose. Knex query builders are thenables, so
 * awaiting the builder here would run the sub-select as a separate query each
 * time the outer statement is compiled, and a failure of that stray query
 * would surface as an unhandled rejection.
 * @param limit - The maximum number of page IDs to return.
 * @param offset - The number of page IDs to skip before returning results.
 * @returns A function that configures the query builder it receives.
 */
function limitedPageIds(limit, offset) {
    return (qb) => {
        // Only describe the query; the outer statement executes it as a CTE.
        qb.select('id')
            .from('pages')
            .orderByRaw('`order` ASC NULLS LAST')
            .whereNull('redirectDestId')
            .limit(limit)
            .offset(offset);
    };
}
924
/**
 * Returns a callback for Knex's `with()` that builds a redirect lookup table
 * with the columns `fromId`, `from`, `to`, and `toId`.
 * Each page that has a `redirectDestId` is joined to its destination row.
 * When `includeNull` is true, pages without a redirect are added through a
 * UNION as rows that map to themselves.
 *
 * The callbacks are synchronous on purpose. Knex query builders are thenables,
 * so awaiting them here would run the sub-selects as separate queries each time
 * the outer statement is compiled, and a failure of such a stray query would
 * surface as an unhandled rejection.
 * @param includeNull - Whether to include non-redirected pages in the result. Defaults to `true`.
 * @returns A function that configures the query builder it receives.
 */
function redirectTable(includeNull = true) {
    return (qb) => {
        const redirected = qb
            .select('A.id as fromId', 'A.url as from', 'B.url as to', 'B.id as toId')
            .from('pages as A')
            .join('pages as B', (j) => {
                j.on('A.redirectDestId', '=', 'B.id').andOnNotNull('A.redirectDestId');
            });
        if (includeNull) {
            // Pages that were not redirected resolve to themselves.
            redirected.union((self) => {
                self
                    .select('A.id as fromId', 'A.url as from', 'A.url as to', 'A.id as toId')
                    .from('pages as A')
                    .whereNull('A.redirectDestId');
            });
        }
    };
}
947
+ // ----- ----- ----- ----- -----
948
+ //
949
+ // Utils
950
+ //
951
+ // ----- ----- ----- ----- -----
952
/**
 * Parses a JSON string and returns the result, substituting a fallback when
 * there is nothing usable.
 * @param data - The value to parse. Anything that is not a string yields the fallback.
 * @param fallback - Returned when `data` is not a string, is not valid JSON,
 *   or parses to a falsy value (`null`, `0`, `false`, `""`).
 * @returns The parsed JSON value, or the fallback.
 */
function getJSON(data, fallback) {
    if (typeof data !== 'string') {
        return fallback;
    }
    let parsed;
    try {
        parsed = JSON.parse(data);
    }
    catch {
        // Malformed JSON is treated the same as missing data.
        return fallback;
    }
    return parsed ? parsed : fallback;
}