@nitpicker/crawler 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. package/CHANGELOG.md +8 -0
  2. package/LICENSE +191 -0
  3. package/README.md +13 -0
  4. package/lib/archive/archive-accessor.d.ts +107 -0
  5. package/lib/archive/archive-accessor.js +264 -0
  6. package/lib/archive/archive.d.ts +174 -0
  7. package/lib/archive/archive.js +331 -0
  8. package/lib/archive/database.d.ts +207 -0
  9. package/lib/archive/database.js +972 -0
  10. package/lib/archive/debug.d.ts +8 -0
  11. package/lib/archive/debug.js +9 -0
  12. package/lib/archive/filesystem/append-text.d.ts +9 -0
  13. package/lib/archive/filesystem/append-text.js +14 -0
  14. package/lib/archive/filesystem/copy-dir-sync.d.ts +6 -0
  15. package/lib/archive/filesystem/copy-dir-sync.js +9 -0
  16. package/lib/archive/filesystem/copy-dir.d.ts +7 -0
  17. package/lib/archive/filesystem/copy-dir.js +13 -0
  18. package/lib/archive/filesystem/exists.d.ts +6 -0
  19. package/lib/archive/filesystem/exists.js +9 -0
  20. package/lib/archive/filesystem/get-file-list.d.ts +8 -0
  21. package/lib/archive/filesystem/get-file-list.js +12 -0
  22. package/lib/archive/filesystem/index.d.ts +17 -0
  23. package/lib/archive/filesystem/index.js +17 -0
  24. package/lib/archive/filesystem/is-dir.d.ts +6 -0
  25. package/lib/archive/filesystem/is-dir.js +10 -0
  26. package/lib/archive/filesystem/mkdir.d.ts +8 -0
  27. package/lib/archive/filesystem/mkdir.js +15 -0
  28. package/lib/archive/filesystem/output-json.d.ts +9 -0
  29. package/lib/archive/filesystem/output-json.js +14 -0
  30. package/lib/archive/filesystem/output-text.d.ts +11 -0
  31. package/lib/archive/filesystem/output-text.js +32 -0
  32. package/lib/archive/filesystem/read-json.d.ts +7 -0
  33. package/lib/archive/filesystem/read-json.js +11 -0
  34. package/lib/archive/filesystem/read-text.d.ts +6 -0
  35. package/lib/archive/filesystem/read-text.js +10 -0
  36. package/lib/archive/filesystem/readline.d.ts +11 -0
  37. package/lib/archive/filesystem/readline.js +26 -0
  38. package/lib/archive/filesystem/remove.d.ts +5 -0
  39. package/lib/archive/filesystem/remove.js +10 -0
  40. package/lib/archive/filesystem/rename.d.ts +11 -0
  41. package/lib/archive/filesystem/rename.js +18 -0
  42. package/lib/archive/filesystem/tar.d.ts +11 -0
  43. package/lib/archive/filesystem/tar.js +22 -0
  44. package/lib/archive/filesystem/untar.d.ts +20 -0
  45. package/lib/archive/filesystem/untar.js +24 -0
  46. package/lib/archive/filesystem/utils.d.ts +109 -0
  47. package/lib/archive/filesystem/utils.js +185 -0
  48. package/lib/archive/filesystem/zip.d.ts +29 -0
  49. package/lib/archive/filesystem/zip.js +53 -0
  50. package/lib/archive/index.d.ts +6 -0
  51. package/lib/archive/index.js +11 -0
  52. package/lib/archive/page.d.ts +263 -0
  53. package/lib/archive/page.js +316 -0
  54. package/lib/archive/resource.d.ts +46 -0
  55. package/lib/archive/resource.js +62 -0
  56. package/lib/archive/safe-path.d.ts +9 -0
  57. package/lib/archive/safe-path.js +17 -0
  58. package/lib/archive/types.d.ts +210 -0
  59. package/lib/archive/types.js +1 -0
  60. package/lib/crawler/clear-destination-cache.d.ts +5 -0
  61. package/lib/crawler/clear-destination-cache.js +8 -0
  62. package/lib/crawler/crawler.d.ts +73 -0
  63. package/lib/crawler/crawler.js +748 -0
  64. package/lib/crawler/decompose-url.d.ts +25 -0
  65. package/lib/crawler/decompose-url.js +71 -0
  66. package/lib/crawler/destination-cache.d.ts +7 -0
  67. package/lib/crawler/destination-cache.js +6 -0
  68. package/lib/crawler/detect-pagination-pattern.d.ts +16 -0
  69. package/lib/crawler/detect-pagination-pattern.js +61 -0
  70. package/lib/crawler/fetch-destination.d.ts +38 -0
  71. package/lib/crawler/fetch-destination.js +208 -0
  72. package/lib/crawler/fetch-robots-txt.d.ts +42 -0
  73. package/lib/crawler/fetch-robots-txt.js +44 -0
  74. package/lib/crawler/find-best-matching-scope.d.ts +12 -0
  75. package/lib/crawler/find-best-matching-scope.js +46 -0
  76. package/lib/crawler/generate-predicted-urls.d.ts +13 -0
  77. package/lib/crawler/generate-predicted-urls.js +27 -0
  78. package/lib/crawler/handle-ignore-and-skip.d.ts +16 -0
  79. package/lib/crawler/handle-ignore-and-skip.js +19 -0
  80. package/lib/crawler/handle-resource-response.d.ts +13 -0
  81. package/lib/crawler/handle-resource-response.js +16 -0
  82. package/lib/crawler/handle-scrape-end.d.ts +24 -0
  83. package/lib/crawler/handle-scrape-end.js +82 -0
  84. package/lib/crawler/handle-scrape-error.d.ts +37 -0
  85. package/lib/crawler/handle-scrape-error.js +38 -0
  86. package/lib/crawler/index.d.ts +2 -0
  87. package/lib/crawler/index.js +2 -0
  88. package/lib/crawler/inject-scope-auth.d.ts +11 -0
  89. package/lib/crawler/inject-scope-auth.js +21 -0
  90. package/lib/crawler/is-external-url.d.ts +11 -0
  91. package/lib/crawler/is-external-url.js +12 -0
  92. package/lib/crawler/is-in-any-lower-layer.d.ts +13 -0
  93. package/lib/crawler/is-in-any-lower-layer.js +15 -0
  94. package/lib/crawler/link-list.d.ts +112 -0
  95. package/lib/crawler/link-list.js +248 -0
  96. package/lib/crawler/link-to-page-data.d.ts +14 -0
  97. package/lib/crawler/link-to-page-data.js +32 -0
  98. package/lib/crawler/net-timeout-error.d.ts +9 -0
  99. package/lib/crawler/net-timeout-error.js +11 -0
  100. package/lib/crawler/network.d.ts +30 -0
  101. package/lib/crawler/network.js +226 -0
  102. package/lib/crawler/protocol-agnostic-key.d.ts +9 -0
  103. package/lib/crawler/protocol-agnostic-key.js +11 -0
  104. package/lib/crawler/reconstruct-url.d.ts +10 -0
  105. package/lib/crawler/reconstruct-url.js +28 -0
  106. package/lib/crawler/result-handler.d.ts +118 -0
  107. package/lib/crawler/result-handler.js +153 -0
  108. package/lib/crawler/robots-checker.d.ts +26 -0
  109. package/lib/crawler/robots-checker.js +62 -0
  110. package/lib/crawler/should-discard-predicted.d.ts +14 -0
  111. package/lib/crawler/should-discard-predicted.js +31 -0
  112. package/lib/crawler/should-skip-url.d.ts +23 -0
  113. package/lib/crawler/should-skip-url.js +15 -0
  114. package/lib/crawler/speculative-pagination.d.ts +52 -0
  115. package/lib/crawler/speculative-pagination.js +215 -0
  116. package/lib/crawler/types.d.ts +119 -0
  117. package/lib/crawler/types.js +1 -0
  118. package/lib/crawler/url-filter.d.ts +56 -0
  119. package/lib/crawler/url-filter.js +110 -0
  120. package/lib/crawler-orchestrator.d.ts +142 -0
  121. package/lib/crawler-orchestrator.js +309 -0
  122. package/lib/debug.d.ts +8 -0
  123. package/lib/debug.js +9 -0
  124. package/lib/index.d.ts +16 -0
  125. package/lib/index.js +18 -0
  126. package/lib/qzilla.d.ts +136 -0
  127. package/lib/qzilla.js +292 -0
  128. package/lib/types.d.ts +27 -0
  129. package/lib/types.js +1 -0
  130. package/lib/utils/array/each-splitted.d.ts +10 -0
  131. package/lib/utils/array/each-splitted.js +14 -0
  132. package/lib/utils/array/index.d.ts +1 -0
  133. package/lib/utils/array/index.js +1 -0
  134. package/lib/utils/async/index.d.ts +1 -0
  135. package/lib/utils/async/index.js +1 -0
  136. package/lib/utils/debug.d.ts +5 -0
  137. package/lib/utils/debug.js +5 -0
  138. package/lib/utils/error/dom-evaluation-error.d.ts +7 -0
  139. package/lib/utils/error/dom-evaluation-error.js +7 -0
  140. package/lib/utils/error/error-emitter.d.ts +18 -0
  141. package/lib/utils/error/error-emitter.js +29 -0
  142. package/lib/utils/error/index.d.ts +3 -0
  143. package/lib/utils/error/index.js +2 -0
  144. package/lib/utils/event-emitter/index.d.ts +6 -0
  145. package/lib/utils/event-emitter/index.js +6 -0
  146. package/lib/utils/index.d.ts +5 -0
  147. package/lib/utils/index.js +5 -0
  148. package/lib/utils/network/index.d.ts +1 -0
  149. package/lib/utils/network/index.js +1 -0
  150. package/lib/utils/object/clean-object.d.ts +8 -0
  151. package/lib/utils/object/clean-object.js +13 -0
  152. package/lib/utils/object/index.d.ts +1 -0
  153. package/lib/utils/object/index.js +1 -0
  154. package/lib/utils/path/index.d.ts +1 -0
  155. package/lib/utils/path/index.js +1 -0
  156. package/lib/utils/path/safe-filepath.d.ts +7 -0
  157. package/lib/utils/path/safe-filepath.js +12 -0
  158. package/lib/utils/regexp/index.d.ts +1 -0
  159. package/lib/utils/regexp/index.js +1 -0
  160. package/lib/utils/retryable/index.d.ts +2 -0
  161. package/lib/utils/retryable/index.js +1 -0
  162. package/lib/utils/sort/index.d.ts +14 -0
  163. package/lib/utils/sort/index.js +61 -0
  164. package/lib/utils/sort/remove-matches.d.ts +9 -0
  165. package/lib/utils/sort/remove-matches.js +23 -0
  166. package/lib/utils/types/index.d.ts +1 -0
  167. package/lib/utils/types/index.js +1 -0
  168. package/lib/utils/types/types.d.ts +46 -0
  169. package/lib/utils/types/types.js +1 -0
  170. package/lib/utils/url/index.d.ts +5 -0
  171. package/lib/utils/url/index.js +5 -0
  172. package/lib/utils/url/is-lower-layer.d.ts +15 -0
  173. package/lib/utils/url/is-lower-layer.js +55 -0
  174. package/lib/utils/url/parse-url.d.ts +11 -0
  175. package/lib/utils/url/parse-url.js +20 -0
  176. package/lib/utils/url/path-match.d.ts +11 -0
  177. package/lib/utils/url/path-match.js +18 -0
  178. package/lib/utils/url/sort-url.d.ts +10 -0
  179. package/lib/utils/url/sort-url.js +24 -0
  180. package/lib/utils/url/url-partial-match.d.ts +11 -0
  181. package/lib/utils/url/url-partial-match.js +32 -0
  182. package/package.json +49 -0
  183. package/src/archive/__mock__/.gitignore +3 -0
  184. package/src/archive/__mock__/mock.sqlite +0 -0
  185. package/src/archive/archive-accessor.ts +337 -0
  186. package/src/archive/archive.ts +408 -0
  187. package/src/archive/database.spec.ts +469 -0
  188. package/src/archive/database.ts +1059 -0
  189. package/src/archive/debug.ts +10 -0
  190. package/src/archive/filesystem/append-text.spec.ts +26 -0
  191. package/src/archive/filesystem/append-text.ts +16 -0
  192. package/src/archive/filesystem/copy-dir-sync.spec.ts +27 -0
  193. package/src/archive/filesystem/copy-dir-sync.ts +10 -0
  194. package/src/archive/filesystem/copy-dir.spec.ts +33 -0
  195. package/src/archive/filesystem/copy-dir.ts +14 -0
  196. package/src/archive/filesystem/exists.spec.ts +33 -0
  197. package/src/archive/filesystem/exists.ts +10 -0
  198. package/src/archive/filesystem/get-file-list.spec.ts +37 -0
  199. package/src/archive/filesystem/get-file-list.ts +13 -0
  200. package/src/archive/filesystem/index.ts +17 -0
  201. package/src/archive/filesystem/is-dir.spec.ts +29 -0
  202. package/src/archive/filesystem/is-dir.ts +11 -0
  203. package/src/archive/filesystem/mkdir.spec.ts +37 -0
  204. package/src/archive/filesystem/mkdir.ts +16 -0
  205. package/src/archive/filesystem/output-json.spec.ts +34 -0
  206. package/src/archive/filesystem/output-json.ts +16 -0
  207. package/src/archive/filesystem/output-text.spec.ts +31 -0
  208. package/src/archive/filesystem/output-text.ts +35 -0
  209. package/src/archive/filesystem/read-json.spec.ts +26 -0
  210. package/src/archive/filesystem/read-json.ts +12 -0
  211. package/src/archive/filesystem/read-text.spec.ts +25 -0
  212. package/src/archive/filesystem/read-text.ts +11 -0
  213. package/src/archive/filesystem/readline.spec.ts +29 -0
  214. package/src/archive/filesystem/readline.ts +30 -0
  215. package/src/archive/filesystem/remove.spec.ts +34 -0
  216. package/src/archive/filesystem/remove.ts +11 -0
  217. package/src/archive/filesystem/rename.spec.ts +46 -0
  218. package/src/archive/filesystem/rename.ts +21 -0
  219. package/src/archive/filesystem/tar.spec.ts +33 -0
  220. package/src/archive/filesystem/tar.ts +27 -0
  221. package/src/archive/filesystem/untar.spec.ts +34 -0
  222. package/src/archive/filesystem/untar.ts +36 -0
  223. package/src/archive/index.ts +13 -0
  224. package/src/archive/page.spec.ts +368 -0
  225. package/src/archive/page.ts +420 -0
  226. package/src/archive/resource.spec.ts +101 -0
  227. package/src/archive/resource.ts +73 -0
  228. package/src/archive/safe-path.spec.ts +44 -0
  229. package/src/archive/safe-path.ts +18 -0
  230. package/src/archive/types.ts +227 -0
  231. package/src/crawler/clear-destination-cache.spec.ts +20 -0
  232. package/src/crawler/clear-destination-cache.ts +9 -0
  233. package/src/crawler/crawler.ts +873 -0
  234. package/src/crawler/decompose-url.spec.ts +48 -0
  235. package/src/crawler/decompose-url.ts +90 -0
  236. package/src/crawler/destination-cache.spec.ts +23 -0
  237. package/src/crawler/destination-cache.ts +8 -0
  238. package/src/crawler/detect-pagination-pattern.spec.ts +169 -0
  239. package/src/crawler/detect-pagination-pattern.ts +66 -0
  240. package/src/crawler/fetch-destination.ts +257 -0
  241. package/src/crawler/fetch-robots-txt.spec.ts +83 -0
  242. package/src/crawler/fetch-robots-txt.ts +91 -0
  243. package/src/crawler/find-best-matching-scope.spec.ts +39 -0
  244. package/src/crawler/find-best-matching-scope.ts +57 -0
  245. package/src/crawler/generate-predicted-urls.spec.ts +42 -0
  246. package/src/crawler/generate-predicted-urls.ts +34 -0
  247. package/src/crawler/handle-ignore-and-skip.spec.ts +66 -0
  248. package/src/crawler/handle-ignore-and-skip.ts +30 -0
  249. package/src/crawler/handle-resource-response.spec.ts +45 -0
  250. package/src/crawler/handle-resource-response.ts +21 -0
  251. package/src/crawler/handle-scrape-end.spec.ts +109 -0
  252. package/src/crawler/handle-scrape-end.ts +115 -0
  253. package/src/crawler/handle-scrape-error.spec.ts +105 -0
  254. package/src/crawler/handle-scrape-error.ts +58 -0
  255. package/src/crawler/index.ts +2 -0
  256. package/src/crawler/inject-scope-auth.spec.ts +36 -0
  257. package/src/crawler/inject-scope-auth.ts +27 -0
  258. package/src/crawler/is-external-url.spec.ts +31 -0
  259. package/src/crawler/is-external-url.ts +17 -0
  260. package/src/crawler/is-in-any-lower-layer.spec.ts +31 -0
  261. package/src/crawler/is-in-any-lower-layer.ts +22 -0
  262. package/src/crawler/link-list.spec.ts +355 -0
  263. package/src/crawler/link-list.ts +275 -0
  264. package/src/crawler/link-to-page-data.spec.ts +133 -0
  265. package/src/crawler/link-to-page-data.ts +34 -0
  266. package/src/crawler/net-timeout-error.spec.ts +25 -0
  267. package/src/crawler/net-timeout-error.ts +11 -0
  268. package/src/crawler/protocol-agnostic-key.spec.ts +40 -0
  269. package/src/crawler/protocol-agnostic-key.ts +11 -0
  270. package/src/crawler/reconstruct-url.spec.ts +37 -0
  271. package/src/crawler/reconstruct-url.ts +37 -0
  272. package/src/crawler/robots-checker.spec.ts +104 -0
  273. package/src/crawler/robots-checker.ts +73 -0
  274. package/src/crawler/should-discard-predicted.spec.ts +125 -0
  275. package/src/crawler/should-discard-predicted.ts +33 -0
  276. package/src/crawler/should-skip-url.spec.ts +77 -0
  277. package/src/crawler/should-skip-url.ts +37 -0
  278. package/src/crawler/types.ts +146 -0
  279. package/src/crawler-orchestrator.ts +401 -0
  280. package/src/debug.ts +10 -0
  281. package/src/index.ts +25 -0
  282. package/src/types.ts +30 -0
  283. package/src/utils/array/each-splitted.spec.ts +38 -0
  284. package/src/utils/array/each-splitted.ts +19 -0
  285. package/src/utils/array/index.ts +1 -0
  286. package/src/utils/debug.ts +6 -0
  287. package/src/utils/error/dom-evaluation-error.spec.ts +20 -0
  288. package/src/utils/error/dom-evaluation-error.ts +6 -0
  289. package/src/utils/error/error-emitter.spec.ts +78 -0
  290. package/src/utils/error/error-emitter.ts +44 -0
  291. package/src/utils/error/index.ts +3 -0
  292. package/src/utils/index.ts +5 -0
  293. package/src/utils/object/clean-object.spec.ts +24 -0
  294. package/src/utils/object/clean-object.ts +13 -0
  295. package/src/utils/object/index.ts +1 -0
  296. package/src/utils/types/index.ts +1 -0
  297. package/src/utils/types/types.ts +65 -0
  298. package/tsconfig.json +11 -0
  299. package/tsconfig.tsbuildinfo +1 -0
@@ -0,0 +1,23 @@
/**
 * Strips the common leading prefix from two strings (case-insensitive comparison).
 *
 * Both inputs are lower-cased before they are compared, so the returned remainders
 * are lower-cased as well. The comparison walks Unicode code points (not UTF-16
 * code units), so a surrogate pair is never split in the middle.
 * If the strings are identical (ignoring case), returns `['', '']`.
 * @param t1 - The first string.
 * @param t2 - The second string.
 * @returns A tuple of the two lower-cased strings with their common leading characters removed.
 */
export function removeMatches(t1, t2) {
	const a1 = [...t1.toLowerCase()];
	const a2 = [...t2.toLowerCase()];
	// Bound the scan by the lower-cased code-point arrays themselves: lower-casing
	// can change a string's length, so the inputs' original `.length` is not a
	// reliable iteration count.
	const limit = Math.min(a1.length, a2.length);
	let shared = 0;
	while (shared < limit && a1[shared] === a2[shared]) {
		shared++;
	}
	// Slice once instead of shifting element by element.
	return [a1.slice(shared).join(''), a2.slice(shared).join('')];
}
@@ -0,0 +1 @@
1
+ export * from './types.js';
@@ -0,0 +1 @@
1
+ export * from './types.js';
@@ -0,0 +1,46 @@
1
+ import type { ExURL } from '@d-zero/shared/parse-url';
2
+ export type { PageData, ImageElement, SkippedPageData, Resource, AnchorData, Meta, NetworkLog, } from '@d-zero/beholder';
3
+ export type { ExURL } from '@d-zero/shared/parse-url';
4
+ export type { CompressType } from '@d-zero/shared/detect-compress';
5
+ export type { CDNType } from '@d-zero/shared/detect-cdn';
6
+ /**
7
+ * Represents a discovered link during crawling, with its metadata from the HEAD request.
8
+ */
9
+ export interface Link {
10
+ /** The parsed URL of the link. */
11
+ url: ExURL;
12
+ /** Whether this link points to an external domain. */
13
+ isExternal: boolean;
14
+ /** Whether this link is in a lower layer (subdirectory) of a scope URL. */
15
+ isLowerLayer: boolean;
16
+ /** Destination data from the HEAD request, present only if the link was fetched. */
17
+ dest?: {
18
+ /** Chain of redirect URLs traversed. */
19
+ redirectPaths: string[];
20
+ /** HTTP status code of the final response. */
21
+ status: number;
22
+ /** HTTP status text of the final response. */
23
+ statusText: string;
24
+ /** The Content-Type header value, or `null` if unavailable. */
25
+ contentType: string | null;
26
+ /** The Content-Length header value in bytes, or `null` if unavailable. */
27
+ contentLength: number | null;
28
+ /** Raw HTTP response headers, or `null` if unavailable. */
29
+ responseHeaders: Record<string, string | string[] | undefined> | null;
30
+ /** The page title, if available from a title-only scrape. */
31
+ title?: string;
32
+ };
33
+ }
34
+ /**
35
+ * Payload of an error event emitted during crawling or scraping, identifying the process and URL involved.
36
+ */
37
+ export interface CrawlerError {
38
+ /** The process ID where the error occurred. */
39
+ pid: number;
40
+ /** Whether the error occurred in the main process (as opposed to a sub-process). */
41
+ isMainProcess: boolean;
42
+ /** The URL being processed when the error occurred, or `null` if not applicable. */
43
+ url: string | null;
44
+ /** The error object. */
45
+ error: Error;
46
+ }
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,5 @@
1
+ export { tryParseUrl as parseUrl } from '@d-zero/shared/parse-url';
2
+ export { isLowerLayer } from '@d-zero/shared/is-lower-layer';
3
+ export { pathMatch } from '@d-zero/shared/path-match';
4
+ export { urlPartialMatch } from '@d-zero/shared/url-partial-match';
5
+ export { sortUrl } from '@d-zero/shared/sort-url';
@@ -0,0 +1,5 @@
1
+ export { tryParseUrl as parseUrl } from '@d-zero/shared/parse-url';
2
+ export { isLowerLayer } from '@d-zero/shared/is-lower-layer';
3
+ export { pathMatch } from '@d-zero/shared/path-match';
4
+ export { urlPartialMatch } from '@d-zero/shared/url-partial-match';
5
+ export { sortUrl } from '@d-zero/shared/sort-url';
@@ -0,0 +1,15 @@
1
+ import type { ExURL, ParseURLOptions } from '@d-zero/shared/parse-url';
2
+ /**
3
+ * Determines whether the target URL is at the same level or a deeper (lower) layer
4
+ * in the path hierarchy relative to the base URL. Both URLs must share the same hostname.
5
+ *
6
+ * For example, if the base is `https://example.com/docs/`, then
7
+ * `https://example.com/docs/getting-started` is considered a lower layer,
8
+ * while `https://example.com/about` is not.
9
+ * @param target - The target URL string or ExURL to check.
10
+ * @param base - The base URL string or ExURL to compare against.
11
+ * @param options - Optional URL parsing options.
12
+ * @returns `true` if the target URL is at the same level or deeper than the base URL
13
+ * within the same hostname; `false` otherwise (including when either URL fails to parse).
14
+ */
15
+ export declare function isLowerLayer(target: string | ExURL, base: string | ExURL, options?: ParseURLOptions): boolean;
@@ -0,0 +1,55 @@
1
+ import { parseUrl } from './parse-url.js';
/**
 * Determines whether the target URL is at the same level or a deeper (lower) layer
 * in the path hierarchy relative to the base URL. Both URLs must share the same hostname.
 *
 * For example, if the base is `https://example.com/docs/`, then
 * `https://example.com/docs/getting-started` is considered a lower layer,
 * while `https://example.com/about` is not.
 *
 * The `paths` arrays of the given URLs are only read, never modified, so the same
 * ExURL object can be checked repeatedly with a stable result.
 * @param target - The target URL string or ExURL to check.
 * @param base - The base URL string or ExURL to compare against.
 * @param options - Optional URL parsing options (used only for string inputs).
 * @returns `true` if the target URL is at the same level or deeper than the base URL
 *   within the same hostname; `false` otherwise, including when either URL cannot be parsed.
 */
export function isLowerLayer(target, base, options) {
	const a = typeof target === 'string' ? parseUrl(target, options) : target;
	const b = typeof base === 'string' ? parseUrl(base, options) : base;
	if (!a || !b) {
		return false;
	}
	if (a.href === b.href) {
		return true;
	}
	if (a.hostname !== b.hostname) {
		return false;
	}
	const aPaths = a.paths;
	const bPaths = b.paths;
	// A single empty segment is treated as "no path". Optional chaining keeps a
	// missing `paths` from throwing, so the null checks below are reachable.
	const aPathIsEmpty = aPaths?.length === 1 && aPaths[0] === '';
	const bPathIsEmpty = bPaths?.length === 1 && bPaths[0] === '';
	if (aPathIsEmpty && bPathIsEmpty) {
		return true;
	}
	if (aPaths == null && bPaths == null) {
		return true;
	}
	if (aPaths && bPaths == null) {
		return true;
	}
	if (!aPaths || !bPaths) {
		return false;
	}
	if (aPaths.length < bPaths.length) {
		return false;
	}
	// Walk the segments by index instead of shift(), which would empty the
	// caller's arrays. The target is at least as long as the base here.
	for (let i = 0; i < aPaths.length; i++) {
		const i1 = aPaths[i];
		const i2 = bPaths[i];
		// The base ran out (or ends in an empty segment) while the target continues.
		if (i1 && !i2) {
			return true;
		}
		if (i1 !== i2) {
			return false;
		}
	}
	// Every segment matched with nothing left over, yet the hrefs differ:
	// this case is not treated as a lower layer.
	return false;
}
@@ -0,0 +1,11 @@
1
+ import type { ExURL, ParseURLOptions } from '@d-zero/shared/parse-url';
2
+ /**
3
+ * Parses a URL string into an ExURL object, filtering out non-HTTP URLs
4
+ * that lack a hostname and protocol. If the input is already an ExURL object,
5
+ * it is returned as-is without re-parsing.
6
+ * @param url - The URL string to parse, or an already-parsed ExURL object.
7
+ * @param options - Optional parsing options forwarded to the underlying parser.
8
+ * @returns The parsed ExURL object, or `null` if the URL is not a valid HTTP URL
9
+ * and has no hostname or protocol.
10
+ */
11
+ export declare function parseUrl(url: string | ExURL, options?: ParseURLOptions): ExURL | null;
@@ -0,0 +1,20 @@
1
+ import { parseUrl as sharedParseUrl } from '@d-zero/shared/parse-url';
/**
 * Normalizes a URL input into an ExURL object.
 *
 * Non-string inputs are assumed to be parsed already and are handed back untouched.
 * Strings are parsed with the shared parser; the result is discarded only when it
 * is not HTTP and carries neither a hostname nor a protocol.
 * @param url - The URL string to parse, or an already-parsed ExURL object.
 * @param options - Optional parsing options forwarded to the underlying parser.
 * @returns The ExURL object, or `null` when a parsed string is not HTTP and has
 *   neither a hostname nor a protocol.
 */
export function parseUrl(url, options) {
	const needsParsing = typeof url === 'string';
	if (!needsParsing) {
		return url;
	}
	const parsed = sharedParseUrl(url, options);
	// Any one of these markers is enough to keep the result.
	if (parsed.isHTTP || parsed.hostname || parsed.protocol) {
		return parsed;
	}
	return null;
}
@@ -0,0 +1,11 @@
1
+ import type { ExURL, ParseURLOptions } from '@d-zero/shared/parse-url';
2
+ /**
3
+ * Tests whether the pathname portion of a URL matches a given glob pattern.
4
+ * Uses micromatch for glob matching.
5
+ * @param targetPath - The URL string or ExURL whose pathname will be tested.
6
+ * @param pattern - The glob pattern to match against the pathname (e.g., `"/docs/**"`).
7
+ * @param options - Optional URL parsing options.
8
+ * @returns `true` if the URL's pathname matches the pattern; `false` otherwise
9
+ * (also returns `false` if the URL cannot be parsed).
10
+ */
11
+ export declare function pathMatch(targetPath: string | ExURL, pattern: string, options?: ParseURLOptions): boolean;
@@ -0,0 +1,18 @@
1
+ import micromatch from 'micromatch';
2
+ import { parseUrl } from './parse-url.js';
/**
 * Checks a URL's pathname against a glob pattern using micromatch.
 *
 * A missing pathname is matched as the empty string, and an empty pattern is
 * replaced by `"/"` before matching.
 * @param targetPath - The URL string or ExURL whose pathname will be tested.
 * @param pattern - The glob pattern to match against the pathname (e.g., `"/docs/**"`).
 * @param options - Optional URL parsing options (used only for string inputs).
 * @returns `true` if the URL's pathname matches the pattern; `false` otherwise
 *   (also returns `false` if the URL cannot be parsed).
 */
export function pathMatch(targetPath, pattern, options) {
	let url = targetPath;
	if (typeof targetPath === 'string') {
		url = parseUrl(targetPath, options);
	}
	if (!url) {
		return false;
	}
	const pathname = url.pathname || '';
	const glob = pattern || '/';
	return micromatch.isMatch(pathname, glob);
}
@@ -0,0 +1,10 @@
1
+ import type { ExURL, ParseURLOptions } from '@d-zero/shared/parse-url';
2
+ /**
3
+ * Parses, deduplicates, and sorts a list of URL strings using natural URL sorting.
4
+ * Duplicate URLs (by normalized href) are removed before sorting.
5
+ * @param list - An array of URL strings to sort.
6
+ * @param options - Optional URL parsing options.
7
+ * @returns A sorted array of ExURL objects with duplicates removed, ordered by
8
+ * natural URL sort (hostname, path hierarchy, basename, extension, query, hash).
9
+ */
10
+ export declare function sortUrl(list: string[], options?: ParseURLOptions): ExURL[];
@@ -0,0 +1,24 @@
1
+ import { pathComparator } from '@d-zero/shared/sort/path';
2
+ import { parseUrl } from './parse-url.js';
/**
 * Parses, deduplicates, and sorts a list of URL strings using natural URL sorting.
 * Duplicate URLs (by normalized href) are removed before sorting, and entries that
 * cannot be parsed are dropped.
 * @param list - An array of URL strings to sort.
 * @param options - Optional URL parsing options.
 * @returns A new sorted array of ExURL objects with duplicates removed, ordered by
 *   `pathComparator` applied to each entry's `href`.
 */
export function sortUrl(list, options) {
	// Raw inputs already handled, so an exact repeat is never parsed twice.
	const seenInputs = new Set();
	// Parsed URLs keyed by normalized href; a later entry with the same href
	// replaces the earlier one.
	const byHref = new Map();
	for (const url of list) {
		// Skip before parsing when the input was seen verbatim, or when it is
		// itself a normalized href that is already stored.
		if (seenInputs.has(url) || byHref.has(url)) {
			continue;
		}
		seenInputs.add(url);
		const parsedUrl = parseUrl(url, options);
		if (!parsedUrl) {
			continue;
		}
		byHref.set(parsedUrl.href, parsedUrl);
	}
	// Sorting a fresh copy of the values leaves the map untouched.
	return [...byHref.values()].sort((a, b) => pathComparator(a.href, b.href));
}
@@ -0,0 +1,11 @@
1
+ import type { ExURL } from '@d-zero/shared/parse-url';
2
+ /**
3
+ * Checks whether a URL partially matches a pattern URL. The match succeeds if both the
4
+ * hostname (case-insensitive) and the pathname match. The pathname matches if it is exactly
5
+ * equal to the pattern's pathname or if it falls under the pattern's pathname as a sub-path.
6
+ * @param url - The URL string or ExURL to test.
7
+ * @param pattern - The pattern URL string to match against (must include hostname and path).
8
+ * @returns `true` if the URL's hostname matches the pattern's hostname and the URL's pathname
9
+ * is equal to or nested under the pattern's pathname; `false` otherwise.
10
+ */
11
+ export declare function urlPartialMatch(url: string | ExURL, pattern: string): boolean;
@@ -0,0 +1,32 @@
1
+ import { parseUrl } from './parse-url.js';
2
+ import { pathMatch } from './path-match.js';
/**
 * Tests whether a URL falls under a pattern URL.
 *
 * Hostnames are compared case-insensitively. The pathname qualifies when it equals
 * the pattern's pathname exactly, or when it matches the glob built by appending
 * `/**` and `/*` to the pattern's pathname.
 * @param url - The URL string or ExURL to test.
 * @param pattern - The pattern URL string to match against (must include hostname and path).
 * @returns `true` if the hostnames match and the URL's pathname is equal to or nested
 *   under the pattern's pathname; `false` otherwise, including when either argument
 *   cannot be parsed.
 */
export function urlPartialMatch(url, pattern) {
	const target = parseUrl(url);
	// The pattern is only parsed once the target is known to be usable.
	const patternUrl = target ? parseUrl(pattern) : null;
	if (!target || !patternUrl) {
		return false;
	}
	const patternHost = patternUrl.hostname.toLowerCase();
	const targetHost = target.hostname.toLowerCase();
	if (patternHost !== targetHost) {
		return false;
	}
	const basePath = patternUrl.pathname;
	if (basePath === target.pathname) {
		return true;
	}
	// NOTE(review): a pattern pathname ending in "/" produces "//" in this glob;
	// confirm the matcher handles that as intended.
	const subPathGlob = `${basePath}/**/*`;
	return Boolean(pathMatch(url, subPathGlob));
}
package/package.json ADDED
@@ -0,0 +1,49 @@
1
+ {
2
+ "name": "@nitpicker/crawler",
3
+ "version": "0.4.1",
4
+ "description": "Web crawler engine with headless browser rendering and archive storage",
5
+ "author": "D-ZERO",
6
+ "license": "Apache-2.0",
7
+ "repository": {
8
+ "type": "git",
9
+ "url": "https://github.com/d-zero-dev/nitpicker.git",
10
+ "directory": "packages/@nitpicker/crawler"
11
+ },
12
+ "publishConfig": {
13
+ "access": "public"
14
+ },
15
+ "type": "module",
16
+ "exports": {
17
+ ".": {
18
+ "import": "./lib/index.js",
19
+ "types": "./lib/index.d.ts"
20
+ }
21
+ },
22
+ "scripts": {
23
+ "build": "tsc",
24
+ "clean": "tsc --build --clean"
25
+ },
26
+ "dependencies": {
27
+ "@d-zero/beholder": "2.0.0",
28
+ "@d-zero/dealer": "1.6.3",
29
+ "@d-zero/fs": "0.2.2",
30
+ "@d-zero/shared": "0.20.0",
31
+ "ansi-colors": "4.1.3",
32
+ "debug": "4.4.3",
33
+ "follow-redirects": "1.15.11",
34
+ "fs-extra": "11.3.3",
35
+ "knex": "3.1.0",
36
+ "puppeteer": "24.37.5",
37
+ "robots-parser": "3.0.1",
38
+ "sqlite3": "5.1.7",
39
+ "tar": "7.5.9"
40
+ },
41
+ "devDependencies": {
42
+ "@types/debug": "4.1.12",
43
+ "@types/follow-redirects": "1.14.4",
44
+ "@types/fs-extra": "11.0.4",
45
+ "@types/tar": "7.0.87",
46
+ "@types/unzipper": "0.10.11"
47
+ },
48
+ "gitHead": "32b83ee38eba7dfd237adb1b41f69e049e8d4ceb"
49
+ }
@@ -0,0 +1,3 @@
1
+ *
2
+ !.gitignore
3
+ !mock.sqlite
Binary file