@nitpicker/crawler 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. package/CHANGELOG.md +8 -0
  2. package/LICENSE +191 -0
  3. package/README.md +13 -0
  4. package/lib/archive/archive-accessor.d.ts +107 -0
  5. package/lib/archive/archive-accessor.js +264 -0
  6. package/lib/archive/archive.d.ts +174 -0
  7. package/lib/archive/archive.js +331 -0
  8. package/lib/archive/database.d.ts +207 -0
  9. package/lib/archive/database.js +972 -0
  10. package/lib/archive/debug.d.ts +8 -0
  11. package/lib/archive/debug.js +9 -0
  12. package/lib/archive/filesystem/append-text.d.ts +9 -0
  13. package/lib/archive/filesystem/append-text.js +14 -0
  14. package/lib/archive/filesystem/copy-dir-sync.d.ts +6 -0
  15. package/lib/archive/filesystem/copy-dir-sync.js +9 -0
  16. package/lib/archive/filesystem/copy-dir.d.ts +7 -0
  17. package/lib/archive/filesystem/copy-dir.js +13 -0
  18. package/lib/archive/filesystem/exists.d.ts +6 -0
  19. package/lib/archive/filesystem/exists.js +9 -0
  20. package/lib/archive/filesystem/get-file-list.d.ts +8 -0
  21. package/lib/archive/filesystem/get-file-list.js +12 -0
  22. package/lib/archive/filesystem/index.d.ts +17 -0
  23. package/lib/archive/filesystem/index.js +17 -0
  24. package/lib/archive/filesystem/is-dir.d.ts +6 -0
  25. package/lib/archive/filesystem/is-dir.js +10 -0
  26. package/lib/archive/filesystem/mkdir.d.ts +8 -0
  27. package/lib/archive/filesystem/mkdir.js +15 -0
  28. package/lib/archive/filesystem/output-json.d.ts +9 -0
  29. package/lib/archive/filesystem/output-json.js +14 -0
  30. package/lib/archive/filesystem/output-text.d.ts +11 -0
  31. package/lib/archive/filesystem/output-text.js +32 -0
  32. package/lib/archive/filesystem/read-json.d.ts +7 -0
  33. package/lib/archive/filesystem/read-json.js +11 -0
  34. package/lib/archive/filesystem/read-text.d.ts +6 -0
  35. package/lib/archive/filesystem/read-text.js +10 -0
  36. package/lib/archive/filesystem/readline.d.ts +11 -0
  37. package/lib/archive/filesystem/readline.js +26 -0
  38. package/lib/archive/filesystem/remove.d.ts +5 -0
  39. package/lib/archive/filesystem/remove.js +10 -0
  40. package/lib/archive/filesystem/rename.d.ts +11 -0
  41. package/lib/archive/filesystem/rename.js +18 -0
  42. package/lib/archive/filesystem/tar.d.ts +11 -0
  43. package/lib/archive/filesystem/tar.js +22 -0
  44. package/lib/archive/filesystem/untar.d.ts +20 -0
  45. package/lib/archive/filesystem/untar.js +24 -0
  46. package/lib/archive/filesystem/utils.d.ts +109 -0
  47. package/lib/archive/filesystem/utils.js +185 -0
  48. package/lib/archive/filesystem/zip.d.ts +29 -0
  49. package/lib/archive/filesystem/zip.js +53 -0
  50. package/lib/archive/index.d.ts +6 -0
  51. package/lib/archive/index.js +11 -0
  52. package/lib/archive/page.d.ts +263 -0
  53. package/lib/archive/page.js +316 -0
  54. package/lib/archive/resource.d.ts +46 -0
  55. package/lib/archive/resource.js +62 -0
  56. package/lib/archive/safe-path.d.ts +9 -0
  57. package/lib/archive/safe-path.js +17 -0
  58. package/lib/archive/types.d.ts +210 -0
  59. package/lib/archive/types.js +1 -0
  60. package/lib/crawler/clear-destination-cache.d.ts +5 -0
  61. package/lib/crawler/clear-destination-cache.js +8 -0
  62. package/lib/crawler/crawler.d.ts +73 -0
  63. package/lib/crawler/crawler.js +748 -0
  64. package/lib/crawler/decompose-url.d.ts +25 -0
  65. package/lib/crawler/decompose-url.js +71 -0
  66. package/lib/crawler/destination-cache.d.ts +7 -0
  67. package/lib/crawler/destination-cache.js +6 -0
  68. package/lib/crawler/detect-pagination-pattern.d.ts +16 -0
  69. package/lib/crawler/detect-pagination-pattern.js +61 -0
  70. package/lib/crawler/fetch-destination.d.ts +38 -0
  71. package/lib/crawler/fetch-destination.js +208 -0
  72. package/lib/crawler/fetch-robots-txt.d.ts +42 -0
  73. package/lib/crawler/fetch-robots-txt.js +44 -0
  74. package/lib/crawler/find-best-matching-scope.d.ts +12 -0
  75. package/lib/crawler/find-best-matching-scope.js +46 -0
  76. package/lib/crawler/generate-predicted-urls.d.ts +13 -0
  77. package/lib/crawler/generate-predicted-urls.js +27 -0
  78. package/lib/crawler/handle-ignore-and-skip.d.ts +16 -0
  79. package/lib/crawler/handle-ignore-and-skip.js +19 -0
  80. package/lib/crawler/handle-resource-response.d.ts +13 -0
  81. package/lib/crawler/handle-resource-response.js +16 -0
  82. package/lib/crawler/handle-scrape-end.d.ts +24 -0
  83. package/lib/crawler/handle-scrape-end.js +82 -0
  84. package/lib/crawler/handle-scrape-error.d.ts +37 -0
  85. package/lib/crawler/handle-scrape-error.js +38 -0
  86. package/lib/crawler/index.d.ts +2 -0
  87. package/lib/crawler/index.js +2 -0
  88. package/lib/crawler/inject-scope-auth.d.ts +11 -0
  89. package/lib/crawler/inject-scope-auth.js +21 -0
  90. package/lib/crawler/is-external-url.d.ts +11 -0
  91. package/lib/crawler/is-external-url.js +12 -0
  92. package/lib/crawler/is-in-any-lower-layer.d.ts +13 -0
  93. package/lib/crawler/is-in-any-lower-layer.js +15 -0
  94. package/lib/crawler/link-list.d.ts +112 -0
  95. package/lib/crawler/link-list.js +248 -0
  96. package/lib/crawler/link-to-page-data.d.ts +14 -0
  97. package/lib/crawler/link-to-page-data.js +32 -0
  98. package/lib/crawler/net-timeout-error.d.ts +9 -0
  99. package/lib/crawler/net-timeout-error.js +11 -0
  100. package/lib/crawler/network.d.ts +30 -0
  101. package/lib/crawler/network.js +226 -0
  102. package/lib/crawler/protocol-agnostic-key.d.ts +9 -0
  103. package/lib/crawler/protocol-agnostic-key.js +11 -0
  104. package/lib/crawler/reconstruct-url.d.ts +10 -0
  105. package/lib/crawler/reconstruct-url.js +28 -0
  106. package/lib/crawler/result-handler.d.ts +118 -0
  107. package/lib/crawler/result-handler.js +153 -0
  108. package/lib/crawler/robots-checker.d.ts +26 -0
  109. package/lib/crawler/robots-checker.js +62 -0
  110. package/lib/crawler/should-discard-predicted.d.ts +14 -0
  111. package/lib/crawler/should-discard-predicted.js +31 -0
  112. package/lib/crawler/should-skip-url.d.ts +23 -0
  113. package/lib/crawler/should-skip-url.js +15 -0
  114. package/lib/crawler/speculative-pagination.d.ts +52 -0
  115. package/lib/crawler/speculative-pagination.js +215 -0
  116. package/lib/crawler/types.d.ts +119 -0
  117. package/lib/crawler/types.js +1 -0
  118. package/lib/crawler/url-filter.d.ts +56 -0
  119. package/lib/crawler/url-filter.js +110 -0
  120. package/lib/crawler-orchestrator.d.ts +142 -0
  121. package/lib/crawler-orchestrator.js +309 -0
  122. package/lib/debug.d.ts +8 -0
  123. package/lib/debug.js +9 -0
  124. package/lib/index.d.ts +16 -0
  125. package/lib/index.js +18 -0
  126. package/lib/qzilla.d.ts +136 -0
  127. package/lib/qzilla.js +292 -0
  128. package/lib/types.d.ts +27 -0
  129. package/lib/types.js +1 -0
  130. package/lib/utils/array/each-splitted.d.ts +10 -0
  131. package/lib/utils/array/each-splitted.js +14 -0
  132. package/lib/utils/array/index.d.ts +1 -0
  133. package/lib/utils/array/index.js +1 -0
  134. package/lib/utils/async/index.d.ts +1 -0
  135. package/lib/utils/async/index.js +1 -0
  136. package/lib/utils/debug.d.ts +5 -0
  137. package/lib/utils/debug.js +5 -0
  138. package/lib/utils/error/dom-evaluation-error.d.ts +7 -0
  139. package/lib/utils/error/dom-evaluation-error.js +7 -0
  140. package/lib/utils/error/error-emitter.d.ts +18 -0
  141. package/lib/utils/error/error-emitter.js +29 -0
  142. package/lib/utils/error/index.d.ts +3 -0
  143. package/lib/utils/error/index.js +2 -0
  144. package/lib/utils/event-emitter/index.d.ts +6 -0
  145. package/lib/utils/event-emitter/index.js +6 -0
  146. package/lib/utils/index.d.ts +5 -0
  147. package/lib/utils/index.js +5 -0
  148. package/lib/utils/network/index.d.ts +1 -0
  149. package/lib/utils/network/index.js +1 -0
  150. package/lib/utils/object/clean-object.d.ts +8 -0
  151. package/lib/utils/object/clean-object.js +13 -0
  152. package/lib/utils/object/index.d.ts +1 -0
  153. package/lib/utils/object/index.js +1 -0
  154. package/lib/utils/path/index.d.ts +1 -0
  155. package/lib/utils/path/index.js +1 -0
  156. package/lib/utils/path/safe-filepath.d.ts +7 -0
  157. package/lib/utils/path/safe-filepath.js +12 -0
  158. package/lib/utils/regexp/index.d.ts +1 -0
  159. package/lib/utils/regexp/index.js +1 -0
  160. package/lib/utils/retryable/index.d.ts +2 -0
  161. package/lib/utils/retryable/index.js +1 -0
  162. package/lib/utils/sort/index.d.ts +14 -0
  163. package/lib/utils/sort/index.js +61 -0
  164. package/lib/utils/sort/remove-matches.d.ts +9 -0
  165. package/lib/utils/sort/remove-matches.js +23 -0
  166. package/lib/utils/types/index.d.ts +1 -0
  167. package/lib/utils/types/index.js +1 -0
  168. package/lib/utils/types/types.d.ts +46 -0
  169. package/lib/utils/types/types.js +1 -0
  170. package/lib/utils/url/index.d.ts +5 -0
  171. package/lib/utils/url/index.js +5 -0
  172. package/lib/utils/url/is-lower-layer.d.ts +15 -0
  173. package/lib/utils/url/is-lower-layer.js +55 -0
  174. package/lib/utils/url/parse-url.d.ts +11 -0
  175. package/lib/utils/url/parse-url.js +20 -0
  176. package/lib/utils/url/path-match.d.ts +11 -0
  177. package/lib/utils/url/path-match.js +18 -0
  178. package/lib/utils/url/sort-url.d.ts +10 -0
  179. package/lib/utils/url/sort-url.js +24 -0
  180. package/lib/utils/url/url-partial-match.d.ts +11 -0
  181. package/lib/utils/url/url-partial-match.js +32 -0
  182. package/package.json +49 -0
  183. package/src/archive/__mock__/.gitignore +3 -0
  184. package/src/archive/__mock__/mock.sqlite +0 -0
  185. package/src/archive/archive-accessor.ts +337 -0
  186. package/src/archive/archive.ts +408 -0
  187. package/src/archive/database.spec.ts +469 -0
  188. package/src/archive/database.ts +1059 -0
  189. package/src/archive/debug.ts +10 -0
  190. package/src/archive/filesystem/append-text.spec.ts +26 -0
  191. package/src/archive/filesystem/append-text.ts +16 -0
  192. package/src/archive/filesystem/copy-dir-sync.spec.ts +27 -0
  193. package/src/archive/filesystem/copy-dir-sync.ts +10 -0
  194. package/src/archive/filesystem/copy-dir.spec.ts +33 -0
  195. package/src/archive/filesystem/copy-dir.ts +14 -0
  196. package/src/archive/filesystem/exists.spec.ts +33 -0
  197. package/src/archive/filesystem/exists.ts +10 -0
  198. package/src/archive/filesystem/get-file-list.spec.ts +37 -0
  199. package/src/archive/filesystem/get-file-list.ts +13 -0
  200. package/src/archive/filesystem/index.ts +17 -0
  201. package/src/archive/filesystem/is-dir.spec.ts +29 -0
  202. package/src/archive/filesystem/is-dir.ts +11 -0
  203. package/src/archive/filesystem/mkdir.spec.ts +37 -0
  204. package/src/archive/filesystem/mkdir.ts +16 -0
  205. package/src/archive/filesystem/output-json.spec.ts +34 -0
  206. package/src/archive/filesystem/output-json.ts +16 -0
  207. package/src/archive/filesystem/output-text.spec.ts +31 -0
  208. package/src/archive/filesystem/output-text.ts +35 -0
  209. package/src/archive/filesystem/read-json.spec.ts +26 -0
  210. package/src/archive/filesystem/read-json.ts +12 -0
  211. package/src/archive/filesystem/read-text.spec.ts +25 -0
  212. package/src/archive/filesystem/read-text.ts +11 -0
  213. package/src/archive/filesystem/readline.spec.ts +29 -0
  214. package/src/archive/filesystem/readline.ts +30 -0
  215. package/src/archive/filesystem/remove.spec.ts +34 -0
  216. package/src/archive/filesystem/remove.ts +11 -0
  217. package/src/archive/filesystem/rename.spec.ts +46 -0
  218. package/src/archive/filesystem/rename.ts +21 -0
  219. package/src/archive/filesystem/tar.spec.ts +33 -0
  220. package/src/archive/filesystem/tar.ts +27 -0
  221. package/src/archive/filesystem/untar.spec.ts +34 -0
  222. package/src/archive/filesystem/untar.ts +36 -0
  223. package/src/archive/index.ts +13 -0
  224. package/src/archive/page.spec.ts +368 -0
  225. package/src/archive/page.ts +420 -0
  226. package/src/archive/resource.spec.ts +101 -0
  227. package/src/archive/resource.ts +73 -0
  228. package/src/archive/safe-path.spec.ts +44 -0
  229. package/src/archive/safe-path.ts +18 -0
  230. package/src/archive/types.ts +227 -0
  231. package/src/crawler/clear-destination-cache.spec.ts +20 -0
  232. package/src/crawler/clear-destination-cache.ts +9 -0
  233. package/src/crawler/crawler.ts +873 -0
  234. package/src/crawler/decompose-url.spec.ts +48 -0
  235. package/src/crawler/decompose-url.ts +90 -0
  236. package/src/crawler/destination-cache.spec.ts +23 -0
  237. package/src/crawler/destination-cache.ts +8 -0
  238. package/src/crawler/detect-pagination-pattern.spec.ts +169 -0
  239. package/src/crawler/detect-pagination-pattern.ts +66 -0
  240. package/src/crawler/fetch-destination.ts +257 -0
  241. package/src/crawler/fetch-robots-txt.spec.ts +83 -0
  242. package/src/crawler/fetch-robots-txt.ts +91 -0
  243. package/src/crawler/find-best-matching-scope.spec.ts +39 -0
  244. package/src/crawler/find-best-matching-scope.ts +57 -0
  245. package/src/crawler/generate-predicted-urls.spec.ts +42 -0
  246. package/src/crawler/generate-predicted-urls.ts +34 -0
  247. package/src/crawler/handle-ignore-and-skip.spec.ts +66 -0
  248. package/src/crawler/handle-ignore-and-skip.ts +30 -0
  249. package/src/crawler/handle-resource-response.spec.ts +45 -0
  250. package/src/crawler/handle-resource-response.ts +21 -0
  251. package/src/crawler/handle-scrape-end.spec.ts +109 -0
  252. package/src/crawler/handle-scrape-end.ts +115 -0
  253. package/src/crawler/handle-scrape-error.spec.ts +105 -0
  254. package/src/crawler/handle-scrape-error.ts +58 -0
  255. package/src/crawler/index.ts +2 -0
  256. package/src/crawler/inject-scope-auth.spec.ts +36 -0
  257. package/src/crawler/inject-scope-auth.ts +27 -0
  258. package/src/crawler/is-external-url.spec.ts +31 -0
  259. package/src/crawler/is-external-url.ts +17 -0
  260. package/src/crawler/is-in-any-lower-layer.spec.ts +31 -0
  261. package/src/crawler/is-in-any-lower-layer.ts +22 -0
  262. package/src/crawler/link-list.spec.ts +355 -0
  263. package/src/crawler/link-list.ts +275 -0
  264. package/src/crawler/link-to-page-data.spec.ts +133 -0
  265. package/src/crawler/link-to-page-data.ts +34 -0
  266. package/src/crawler/net-timeout-error.spec.ts +25 -0
  267. package/src/crawler/net-timeout-error.ts +11 -0
  268. package/src/crawler/protocol-agnostic-key.spec.ts +40 -0
  269. package/src/crawler/protocol-agnostic-key.ts +11 -0
  270. package/src/crawler/reconstruct-url.spec.ts +37 -0
  271. package/src/crawler/reconstruct-url.ts +37 -0
  272. package/src/crawler/robots-checker.spec.ts +104 -0
  273. package/src/crawler/robots-checker.ts +73 -0
  274. package/src/crawler/should-discard-predicted.spec.ts +125 -0
  275. package/src/crawler/should-discard-predicted.ts +33 -0
  276. package/src/crawler/should-skip-url.spec.ts +77 -0
  277. package/src/crawler/should-skip-url.ts +37 -0
  278. package/src/crawler/types.ts +146 -0
  279. package/src/crawler-orchestrator.ts +401 -0
  280. package/src/debug.ts +10 -0
  281. package/src/index.ts +25 -0
  282. package/src/types.ts +30 -0
  283. package/src/utils/array/each-splitted.spec.ts +38 -0
  284. package/src/utils/array/each-splitted.ts +19 -0
  285. package/src/utils/array/index.ts +1 -0
  286. package/src/utils/debug.ts +6 -0
  287. package/src/utils/error/dom-evaluation-error.spec.ts +20 -0
  288. package/src/utils/error/dom-evaluation-error.ts +6 -0
  289. package/src/utils/error/error-emitter.spec.ts +78 -0
  290. package/src/utils/error/error-emitter.ts +44 -0
  291. package/src/utils/error/index.ts +3 -0
  292. package/src/utils/index.ts +5 -0
  293. package/src/utils/object/clean-object.spec.ts +24 -0
  294. package/src/utils/object/clean-object.ts +13 -0
  295. package/src/utils/object/index.ts +1 -0
  296. package/src/utils/types/index.ts +1 -0
  297. package/src/utils/types/types.ts +65 -0
  298. package/tsconfig.json +11 -0
  299. package/tsconfig.tsbuildinfo +1 -0
@@ -0,0 +1,316 @@
1
+ import { tryParseUrl as parseUrl } from '@d-zero/shared/parse-url';
2
+ /**
3
+ * Represents a crawled page stored in the archive.
4
+ *
5
+ * Provides access to the page's metadata (title, status, SEO tags, etc.),
6
+ * its relationships (anchors, referrers, redirects), and its HTML snapshot.
7
+ * Instances are created by {@link ArchiveAccessor.getPages} or
8
+ * {@link ArchiveAccessor.getPagesWithRefs}.
9
+ */
10
+ export default class Page {
11
+ /**
12
+ * An array of URLs that redirect to this page.
13
+ * Each entry contains the source URL and its page ID.
14
+ * Returns an empty array if no redirects exist.
15
+ */
16
+ redirectFrom;
17
+ #archive;
18
+ #disableQueries;
19
+ #raw;
20
+ #rawAnchors;
21
+ #rawReferrers;
22
+ /**
23
+ * The alternate URL from the `<link rel="alternate">` tag, or null if not present.
24
+ */
25
+ get alternate() {
26
+ return this.#raw.alternate;
27
+ }
28
+ /**
29
+ * The canonical URL from the `<link rel="canonical">` tag, or null if not present.
30
+ */
31
+ get canonical() {
32
+ return this.#raw.canonical;
33
+ }
34
+ /**
35
+ * The content length of the HTTP response in bytes, or null if unknown.
36
+ */
37
+ get contentLength() {
38
+ return this.#raw.contentLength;
39
+ }
40
+ /**
41
+ * The MIME content type of the HTTP response (e.g., `"text/html"`), or null if unknown.
42
+ */
43
+ get contentType() {
44
+ return this.#raw.contentType;
45
+ }
46
+ /**
47
+ * The meta description content, or null if not present.
48
+ */
49
+ get description() {
50
+ return this.#raw.description;
51
+ }
52
+ /**
53
+ * Whether this page is on an external domain (outside the crawl scope).
54
+ */
55
+ get isExternal() {
56
+ return !!this.#raw.isExternal;
57
+ }
58
+ /**
59
+ * Whether this page was skipped during crawling.
60
+ */
61
+ get isSkipped() {
62
+ return !!this.#raw.isSkipped;
63
+ }
64
+ /**
65
+ * Whether this page was a crawl target (as opposed to being discovered incidentally).
66
+ */
67
+ get isTarget() {
68
+ return !!this.#raw.isTarget;
69
+ }
70
+ /**
71
+ * The reason this page was skipped during crawling, or null if it was not skipped.
72
+ */
73
+ get skipReason() {
74
+ return this.#raw.skipReason;
75
+ }
76
+ /**
77
+ * The meta keywords content, or null if not present.
78
+ */
79
+ get keywords() {
80
+ return this.#raw.keywords;
81
+ }
82
+ /**
83
+ * The `lang` attribute value from the HTML element, or null if not present.
84
+ */
85
+ get lang() {
86
+ return this.#raw.lang;
87
+ }
88
+ /**
89
+ * Whether the noarchive robots directive is set.
90
+ */
91
+ get noarchive() {
92
+ return !!this.#raw.noarchive;
93
+ }
94
+ /**
95
+ * Whether the nofollow robots directive is set.
96
+ */
97
+ get nofollow() {
98
+ return !!this.#raw.nofollow;
99
+ }
100
+ /**
101
+ * Whether the noindex robots directive is set.
102
+ */
103
+ get noindex() {
104
+ return !!this.#raw.noindex;
105
+ }
106
+ /**
107
+ * The Open Graph description (`og:description`), or null if not present.
108
+ */
109
+ get og_description() {
110
+ return this.#raw.og_description;
111
+ }
112
+ /**
113
+ * The Open Graph image URL (`og:image`), or null if not present.
114
+ */
115
+ get og_image() {
116
+ return this.#raw.og_image;
117
+ }
118
+ /**
119
+ * The Open Graph site name (`og:site_name`), or null if not present.
120
+ */
121
+ get og_site_name() {
122
+ return this.#raw.og_site_name;
123
+ }
124
+ /**
125
+ * The Open Graph title (`og:title`), or null if not present.
126
+ */
127
+ get og_title() {
128
+ return this.#raw.og_title;
129
+ }
130
+ /**
131
+ * The Open Graph type (`og:type`), or null if not present.
132
+ */
133
+ get og_type() {
134
+ return this.#raw.og_type;
135
+ }
136
+ /**
137
+ * The Open Graph URL (`og:url`), or null if not present.
138
+ */
139
+ get og_url() {
140
+ return this.#raw.og_url;
141
+ }
142
+ /**
143
+ * The parsed HTTP response headers as a key-value record.
144
+ * Returns an empty object if headers cannot be parsed.
145
+ */
146
+ get responseHeaders() {
147
+ try {
148
+ return JSON.parse(this.#raw.responseHeaders);
149
+ }
150
+ catch {
151
+ return {};
152
+ }
153
+ }
154
+ /**
155
+ * The HTTP response status code, or null if the page has not been fetched.
156
+ */
157
+ get status() {
158
+ return this.#raw.status;
159
+ }
160
+ /**
161
+ * The HTTP response status text (e.g., `"OK"`, `"Not Found"`), or null if not fetched.
162
+ */
163
+ get statusText() {
164
+ return this.#raw.statusText;
165
+ }
166
+ /**
167
+ * The page title from the `<title>` element.
168
+ * Returns an empty string if no title is set.
169
+ */
170
+ get title() {
171
+ return this.#raw.title || '';
172
+ }
173
+ /**
174
+ * The Twitter Card type (`twitter:card`), or null if not present.
175
+ */
176
+ get twitter_card() {
177
+ return this.#raw.twitter_card;
178
+ }
179
+ /**
180
+ * The parsed URL of this page as an ExURL object.
181
+ * Respects the `disableQueries` option for query string handling.
182
+ */
183
+ get url() {
184
+ return parseUrl(this.#raw.url, {
185
+ disableQueries: this.#disableQueries,
186
+ });
187
+ }
188
+ /**
189
+ * Creates a new Page instance.
190
+ * @param archive - The ArchiveAccessor used for lazy-loading relationships.
191
+ * @param raw - The raw database row for this page.
192
+ * @param rawRedirects - Pre-loaded redirect records, or undefined for lazy loading.
193
+ * @param rawAnchors - Pre-loaded anchor records, or undefined for lazy loading.
194
+ * @param rawReferrers - Pre-loaded referrer records, or undefined for lazy loading.
195
+ * @param disableQueries - Whether to strip query strings from the URL.
196
+ */
197
+ constructor(archive, raw, rawRedirects, rawAnchors, rawReferrers, disableQueries) {
198
+ this.#archive = archive;
199
+ this.#raw = raw;
200
+ this.redirectFrom = (rawRedirects || []).map((r) => ({
201
+ url: r.from,
202
+ pageId: r.fromId,
203
+ }));
204
+ this.#rawAnchors = rawAnchors || null;
205
+ this.#rawReferrers = rawReferrers || null;
206
+ this.#disableQueries = disableQueries ?? false;
207
+ }
208
+ /**
209
+ * Retrieves the anchors (outgoing links) found on this page.
210
+ * Uses pre-loaded data if available, otherwise queries the database.
211
+ * @returns An array of {@link Anchor} objects representing the links on this page.
212
+ */
213
+ async getAnchors() {
214
+ if (this.#rawAnchors) {
215
+ return this.#rawAnchors.map((a) => ({
216
+ url: a.url,
217
+ href: a.href,
218
+ isExternal: !!a.isExternal,
219
+ title: a.title,
220
+ status: a.status,
221
+ statusText: a.statusText,
222
+ contentType: a.contentType,
223
+ hash: a.hash,
224
+ textContent: a.textContent,
225
+ }));
226
+ }
227
+ return this.#archive.getAnchorsOnPage(this.#raw.id);
228
+ }
229
+ /**
230
+ * Reads the HTML snapshot content of this page from the archive.
231
+ * @returns The HTML content as a string, or null if no snapshot was saved.
232
+ */
233
+ async getHtml() {
234
+ return this.#archive.getHtmlOfPage(this.#raw.html);
235
+ }
236
+ /**
237
+ * Retrieves the referrers (incoming links) pointing to this page.
238
+ * Uses pre-loaded data if available, otherwise queries the database.
239
+ * @returns An array of {@link Referrer} objects representing pages that link to this page.
240
+ */
241
+ async getReferrers() {
242
+ if (this.#rawReferrers) {
243
+ return this.#rawReferrers.map((r) => ({
244
+ url: r.url,
245
+ through: r.through,
246
+ throughId: r.throughId,
247
+ hash: r.hash,
248
+ textContent: r.textContent || '',
249
+ }));
250
+ }
251
+ return this.#archive.getReferrersOfPage(this.#raw.id);
252
+ }
253
+ /**
254
+ * Retrieves all request referrers for this page directly from the database.
255
+ * Unlike {@link getReferrers}, this always queries the database and does not use pre-loaded data.
256
+ * @returns An array of {@link Referrer} objects.
257
+ */
258
+ async getRequests() {
259
+ return this.#archive.getReferrersOfPage(this.#raw.id);
260
+ }
261
+ /**
262
+ * Checks whether this page is an internal HTML page (not external and has `text/html` content type).
263
+ * @returns `true` if this is an internal HTML page, `false` otherwise.
264
+ */
265
+ isInternalPage() {
266
+ return this.isPage() && !this.isExternal;
267
+ }
268
+ /**
269
+ * Checks whether this entry represents an HTML page (content type is `text/html`).
270
+ * @returns `true` if the content type is `text/html`, `false` otherwise.
271
+ */
272
+ isPage() {
273
+ const type = this.contentType || '';
274
+ return type.toLowerCase().trim() === 'text/html';
275
+ }
276
+ /**
277
+ * Serializes the page data to a plain JSON object,
278
+ * including resolved anchors and referrers.
279
+ * @returns A plain object containing all page metadata and relationships.
280
+ */
281
+ async toJSON() {
282
+ return {
283
+ url: this.url.href,
284
+ title: this.title,
285
+ status: this.status,
286
+ statusText: this.statusText,
287
+ contentType: this.contentType,
288
+ contentLength: this.contentLength,
289
+ responseHeaders: this.responseHeaders,
290
+ isExternal: this.isExternal,
291
+ isSkipped: this.isSkipped,
292
+ skipReason: this.skipReason,
293
+ isTarget: this.isTarget,
294
+ lang: this.lang,
295
+ description: this.description,
296
+ keywords: this.keywords,
297
+ noindex: this.noindex,
298
+ nofollow: this.nofollow,
299
+ noarchive: this.noarchive,
300
+ canonical: this.canonical,
301
+ alternate: this.alternate,
302
+ twitter_card: this.twitter_card,
303
+ og_site_name: this.og_site_name,
304
+ og_url: this.og_url,
305
+ og_title: this.og_title,
306
+ og_description: this.og_description,
307
+ og_type: this.og_type,
308
+ og_image: this.og_image,
309
+ redirectFrom: this.redirectFrom,
310
+ isPage: this.isPage(),
311
+ isInternalPage: this.isInternalPage(),
312
+ getAnchors: await this.getAnchors(),
313
+ getReferrers: await this.getReferrers(),
314
+ };
315
+ }
316
+ }
@@ -0,0 +1,46 @@
1
+ import type { ArchiveAccessor } from './archive-accessor.js';
2
+ import type { DB_Resource } from './types.js';
3
+ /**
4
+ * Represents a sub-resource (CSS, JS, image, font, etc.) stored in the archive.
5
+ *
6
+ * Provides read-only access (every property is getter-only) to the resource's HTTP metadata and referrer information.
7
+ * Instances are created by {@link ArchiveAccessor.getResources}.
8
+ */
9
+ export default class Resource {
10
+ #private;
11
+ /**
12
+ * The content length of the resource in bytes, or null if unknown.
13
+ */
14
+ get contentLength(): number | null;
15
+ /**
16
+ * The MIME content type of the resource (e.g., `"text/css"`, `"application/javascript"`), or null if unknown.
17
+ */
18
+ get contentType(): string | null;
19
+ /**
20
+ * Whether this resource is hosted on an external domain.
21
+ */
22
+ get isExternal(): boolean;
23
+ /**
24
+ * The HTTP response status code, or null if not yet fetched.
25
+ */
26
+ get status(): number | null;
27
+ /**
28
+ * The HTTP response status text (e.g., `"OK"`, `"Not Found"`), or null if not yet fetched.
29
+ */
30
+ get statusText(): string | null;
31
+ /**
32
+ * The URL of the resource, as a plain string.
33
+ */
34
+ get url(): string;
35
+ /**
36
+ * Creates a new Resource instance.
37
+ * @param archive - The ArchiveAccessor used for querying referrer data.
38
+ * @param raw - The raw database row for this resource.
39
+ */
40
+ constructor(archive: ArchiveAccessor, raw: DB_Resource);
41
+ /**
42
+ * Retrieves the page URLs that reference this resource.
43
+ * @returns A promise resolving to an array of page URL strings that include or reference this resource.
44
+ */
45
+ getReferrers(): Promise<string[]>;
46
+ }
@@ -0,0 +1,62 @@
1
+ /**
2
+ * Represents a sub-resource (CSS, JS, image, font, etc.) stored in the archive.
3
+ *
4
+ * Provides access to the resource's HTTP metadata and referrer information.
5
+ * Instances are created by {@link ArchiveAccessor.getResources}.
6
+ */
7
+ export default class Resource {
8
+ #archive;
9
+ #raw;
10
+ /**
11
+ * The content length of the resource in bytes, or null if unknown.
12
+ */
13
+ get contentLength() {
14
+ return this.#raw.contentLength;
15
+ }
16
+ /**
17
+ * The MIME content type of the resource (e.g., `"text/css"`, `"application/javascript"`), or null if unknown.
18
+ */
19
+ get contentType() {
20
+ return this.#raw.contentType;
21
+ }
22
+ /**
23
+ * Whether this resource is hosted on an external domain.
24
+ */
25
+ get isExternal() {
26
+ return !!this.#raw.isExternal;
27
+ }
28
+ /**
29
+ * The HTTP response status code, or null if not yet fetched.
30
+ */
31
+ get status() {
32
+ return this.#raw.status;
33
+ }
34
+ /**
35
+ * The HTTP response status text (e.g., `"OK"`, `"Not Found"`), or null if not yet fetched.
36
+ */
37
+ get statusText() {
38
+ return this.#raw.statusText;
39
+ }
40
+ /**
41
+ * The URL of the resource.
42
+ */
43
+ get url() {
44
+ return this.#raw.url;
45
+ }
46
+ /**
47
+ * Creates a new Resource instance.
48
+ * @param archive - The ArchiveAccessor used for querying referrer data.
49
+ * @param raw - The raw database row for this resource.
50
+ */
51
+ constructor(archive, raw) {
52
+ this.#archive = archive;
53
+ this.#raw = raw;
54
+ }
55
+ /**
56
+ * Retrieves the page URLs that reference this resource.
57
+ * @returns An array of page URL strings that include or reference this resource.
58
+ */
59
+ async getReferrers() {
60
+ return this.#archive.getReferrersOfResource(this.#raw.id);
61
+ }
62
+ }
@@ -0,0 +1,9 @@
1
+ /**
2
+ * Resolves and validates a file path to prevent path traversal attacks.
3
+ * Ensures the resolved path stays within the specified base directory.
4
+ * @param base - The base directory that all paths must stay within.
5
+ * @param segments - Zero or more path segments to resolve relative to the base.
6
+ * @returns The resolved absolute path.
7
+ * @throws {Error} If the resolved path escapes the base directory.
8
+ */
9
+ export declare function safePath(base: string, ...segments: string[]): string;
@@ -0,0 +1,17 @@
1
+ import path from 'node:path';
2
/**
 * Resolves and validates a file path to prevent path traversal attacks.
 * Ensures the resolved path is the base directory itself or lies inside it.
 *
 * Containment is decided from the path of the result relative to the base,
 * not from a string prefix, so a base that is a filesystem root (which
 * already ends in a separator) accepts its children.
 * @param base - The base directory that all paths must stay within.
 * @param segments - Path segments to resolve relative to the base.
 * @returns The resolved absolute path.
 * @throws {Error} If the resolved path escapes the base directory.
 */
export function safePath(base, ...segments) {
	const resolvedBase = path.resolve(base);
	const resolved = path.resolve(base, ...segments);
	const relative = path.relative(resolvedBase, resolved);
	// The target is outside the base when the relative path climbs upward
	// ('..' or '../…') or is itself absolute (e.g. another drive on Windows).
	// A child merely named '..foo' is inside the base and must pass.
	const escapesBase = relative === '..' ||
		relative.startsWith(`..${path.sep}`) ||
		path.isAbsolute(relative);
	if (escapesBase) {
		throw new Error(`Path traversal detected: ${segments.join('/')}`);
	}
	return resolved;
}
@@ -0,0 +1,210 @@
1
+ import type { ParseURLOptions } from '@d-zero/shared/parse-url';
2
/**
 * Map of database-related events that the Database and ArchiveAccessor classes emit.
 */
export interface DatabaseEvent {
    /** Error raised while a database operation was running. */
    error: Error;
}
9
/**
 * Crawl settings persisted in the `info` table of the archive database.
 * Captures every crawling option that was in effect for the crawl session.
 */
export interface Config extends Required<Pick<ParseURLOptions, 'disableQueries'>> {
    /** URL at which the crawl starts. */
    baseUrl: string;
    /** Upper bound on directory depth for excluded paths. */
    maxExcludedDepth: number;
    /** URL patterns that define the crawl scope. */
    scope: string[];
    /** Keywords that cause a page to be excluded from crawling. */
    excludeKeywords: string[];
    /** URL patterns excluded from crawling. */
    excludes: string[];
    /** URL prefixes excluded from crawling. */
    excludeUrls: string[];
    /** `true` when external (off-site) pages are fetched as well. */
    fetchExternal: boolean;
    /** `true` when the crawl was seeded from a URL list instead of recursive discovery. */
    fromList: boolean;
    /** `true` when image data is collected during crawling. */
    image: boolean;
    /** Delay between requests, in milliseconds. */
    interval: number;
    /** Name that identifies this crawl session. */
    name: string;
    /** How many crawling processes run in parallel. */
    parallels: number;
    /** `true` when links are followed recursively. */
    recursive: boolean;
    /** Maximum retry attempts per URL after a scrape failure. */
    retry: number;
    /** Nitpicker version that produced this archive. */
    version: string;
    /** User-Agent string sent with HTTP requests. */
    userAgent: string;
    /** `true` when robots.txt restrictions were ignored during crawling. */
    ignoreRobots: boolean;
}
49
/**
 * Selector used when querying pages from the database.
 *
 * - `'page'` - HTML pages that are crawl targets
 * - `'page-included-no-target'` - Every HTML page, non-target pages included
 * - `'external-page'` - HTML pages hosted on external domains
 * - `'internal-page'` - HTML pages hosted on the crawled domain
 * - `'no-page'` - Non-HTML resources (e.g., images, PDFs)
 * - `'external-no-page'` - Non-HTML resources that are external
 * - `'internal-no-page'` - Non-HTML resources that are internal
 */
export type PageFilter =
    | 'page'
    | 'page-included-no-target'
    | 'external-page'
    | 'internal-page'
    | 'no-page'
    | 'external-no-page'
    | 'internal-no-page';
61
/**
 * Shape of a raw row in the `pages` table, i.e. one crawled page.
 */
export interface DB_Page {
    /** Primary key (auto-incremented). */
    id: number;
    /** Canonical URL of the page. */
    url: string;
    /** Foreign key of the page this one redirects to; null when there is no redirect. */
    redirectDestId: number | null;
    /** 1 once the page has been scraped, 0 while it is still pending. */
    scraped: 0 | 1;
    /** 1 for a crawl target, 0 for a page discovered incidentally. */
    isTarget: 0 | 1;
    /** 1 for a page on an external domain, 0 for an internal page. */
    isExternal: 0 | 1;
    /** HTTP status code of the response; null until fetched. */
    status: number | null;
    /** HTTP status text of the response (e.g., "OK", "Not Found"); null until fetched. */
    statusText: string | null;
    /** MIME content type of the response (e.g., "text/html"); null when unknown. */
    contentType: string | null;
    /** Content length in bytes; null when unknown. */
    contentLength: number | null;
    /** HTTP response headers, serialized as JSON. */
    responseHeaders: string;
    /** Value of the HTML element's `lang` attribute; null when absent. */
    lang: string | null;
    /** Page title taken from the `<title>` element; null when absent. */
    title: string | null;
    /** Content of the meta description; null when absent. */
    description: string | null;
    /** Content of the meta keywords; null when absent. */
    keywords: string | null;
    /** Flag for the noindex robots directive (SQLite INTEGER 0/1). */
    noindex: number | null;
    /** Flag for the nofollow robots directive (SQLite INTEGER 0/1). */
    nofollow: number | null;
    /** Flag for the noarchive robots directive (SQLite INTEGER 0/1). */
    noarchive: number | null;
    /** URL from `<link rel="canonical">`; null when absent. */
    canonical: string | null;
    /** URL from `<link rel="alternate">`; null when absent. */
    alternate: string | null;
    /** Open Graph type (`og:type`); null when absent. */
    og_type: string | null;
    /** Open Graph title (`og:title`); null when absent. */
    og_title: string | null;
    /** Open Graph site name (`og:site_name`); null when absent. */
    og_site_name: string | null;
    /** Open Graph description (`og:description`); null when absent. */
    og_description: string | null;
    /** Open Graph URL (`og:url`); null when absent. */
    og_url: string | null;
    /** Open Graph image URL (`og:image`); null when absent. */
    og_image: string | null;
    /** Twitter Card type (`twitter:card`); null when absent. */
    twitter_card: string | null;
    /** Network logs captured while scraping, serialized as JSON; null when not collected. */
    networkLogs: string | null;
    /** Relative path of the saved HTML snapshot file; null when nothing was saved. */
    html: string | null;
    /** 1 when the page was skipped during crawling, 0 when it was processed normally. */
    isSkipped: 0 | 1;
    /** Why the page was skipped; null when it was not skipped. */
    skipReason: string | null;
    /** Index in the natural URL sort order; null until assigned. */
    order: number | null;
}
130
/**
 * Shape of a raw row describing a redirect relationship:
 * a source page paired with the destination it redirects to.
 */
export interface DB_Redirect {
    /** ID of the page reached after the redirect. */
    pageId: number;
    /** URL the redirect originated from. */
    from: string;
    /** Page ID of the source URL that was redirected. */
    fromId: number;
}
142
/**
 * Shape of a raw row for an anchor (link) found on a page.
 * Merges columns of the `anchors` table with those of the linked `pages` row.
 */
export interface DB_Anchor {
    /** ID of the page on which the anchor appears. */
    pageId: number;
    /** Resolved destination URL of the anchor. */
    url: string;
    /** Original value of the anchor element's href attribute. */
    href: string;
    /** 1 when the anchor targets an external domain, 0 when it is internal. */
    isExternal: 0 | 1;
    /** Title attribute of the anchor element; null when absent. */
    title: string | null;
    /** HTTP status code of the linked page; null until fetched. */
    status: number | null;
    /** HTTP status text of the linked page; null until fetched. */
    statusText: string | null;
    /** Content type of the linked page; null until fetched. */
    contentType: string | null;
    /** Fragment (hash) part of the link URL; null when absent. */
    hash: string | null;
    /** Text content of the anchor element; null when empty. */
    textContent: string | null;
}
168
/**
 * Shape of a raw row describing a referrer relationship:
 * which page links to which other page, possibly by way of redirects.
 */
export interface DB_Referrer {
    /** ID of the page that is being referred to. */
    pageId: number;
    /** URL of the page doing the referring. */
    url: string;
    /** URL the referral passes through (can differ from `url` because of redirects). */
    through: string;
    /** Page ID belonging to the through URL. */
    throughId: number;
    /** Fragment (hash) of the referring link; null when absent. */
    hash: string | null;
    /** Text content of the referring anchor element; null when empty. */
    textContent: string | null;
}
186
/**
 * Shape of a raw row in the `resources` table, i.e. one sub-resource (CSS, JS, image, etc.).
 */
export interface DB_Resource {
    /** Primary key (auto-incremented). */
    id: number;
    /** URL of the resource. */
    url: string;
    /** 1 when the resource is hosted on an external domain, 0 when it is internal. */
    isExternal: 0 | 1;
    /** HTTP status code of the response; null until fetched. */
    status: number | null;
    /** HTTP status text of the response; null until fetched. */
    statusText: string | null;
    /** MIME content type of the resource; null when unknown. */
    contentType: string | null;
    /** Content length in bytes; null when unknown. */
    contentLength: number | null;
    /** Compression encoding (e.g., "gzip", "br"); 0 when not compressed. */
    compress: string | 0;
    /** Identifier of the CDN provider; 0 when not served from a CDN. */
    cdn: string | 0;
    /** HTTP response headers serialized as JSON; null when unavailable. */
    responseHeaders: string | null;
}
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,5 @@
1
/**
 * Empties the in-memory cache that holds HTTP request results.
 * Call it between crawl sessions so the cache does not leak memory.
 */
export declare function clearDestinationCache(): void;
@@ -0,0 +1,8 @@
1
+ import { destinationCache } from './destination-cache.js';
2
/**
 * Empties the shared `destinationCache`, the in-memory cache of HTTP request results.
 * Call it between crawl sessions so the cache does not leak memory.
 */
export function clearDestinationCache() {
    destinationCache.clear();
}