@nitpicker/crawler 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. package/CHANGELOG.md +8 -0
  2. package/LICENSE +191 -0
  3. package/README.md +13 -0
  4. package/lib/archive/archive-accessor.d.ts +107 -0
  5. package/lib/archive/archive-accessor.js +264 -0
  6. package/lib/archive/archive.d.ts +174 -0
  7. package/lib/archive/archive.js +331 -0
  8. package/lib/archive/database.d.ts +207 -0
  9. package/lib/archive/database.js +972 -0
  10. package/lib/archive/debug.d.ts +8 -0
  11. package/lib/archive/debug.js +9 -0
  12. package/lib/archive/filesystem/append-text.d.ts +9 -0
  13. package/lib/archive/filesystem/append-text.js +14 -0
  14. package/lib/archive/filesystem/copy-dir-sync.d.ts +6 -0
  15. package/lib/archive/filesystem/copy-dir-sync.js +9 -0
  16. package/lib/archive/filesystem/copy-dir.d.ts +7 -0
  17. package/lib/archive/filesystem/copy-dir.js +13 -0
  18. package/lib/archive/filesystem/exists.d.ts +6 -0
  19. package/lib/archive/filesystem/exists.js +9 -0
  20. package/lib/archive/filesystem/get-file-list.d.ts +8 -0
  21. package/lib/archive/filesystem/get-file-list.js +12 -0
  22. package/lib/archive/filesystem/index.d.ts +17 -0
  23. package/lib/archive/filesystem/index.js +17 -0
  24. package/lib/archive/filesystem/is-dir.d.ts +6 -0
  25. package/lib/archive/filesystem/is-dir.js +10 -0
  26. package/lib/archive/filesystem/mkdir.d.ts +8 -0
  27. package/lib/archive/filesystem/mkdir.js +15 -0
  28. package/lib/archive/filesystem/output-json.d.ts +9 -0
  29. package/lib/archive/filesystem/output-json.js +14 -0
  30. package/lib/archive/filesystem/output-text.d.ts +11 -0
  31. package/lib/archive/filesystem/output-text.js +32 -0
  32. package/lib/archive/filesystem/read-json.d.ts +7 -0
  33. package/lib/archive/filesystem/read-json.js +11 -0
  34. package/lib/archive/filesystem/read-text.d.ts +6 -0
  35. package/lib/archive/filesystem/read-text.js +10 -0
  36. package/lib/archive/filesystem/readline.d.ts +11 -0
  37. package/lib/archive/filesystem/readline.js +26 -0
  38. package/lib/archive/filesystem/remove.d.ts +5 -0
  39. package/lib/archive/filesystem/remove.js +10 -0
  40. package/lib/archive/filesystem/rename.d.ts +11 -0
  41. package/lib/archive/filesystem/rename.js +18 -0
  42. package/lib/archive/filesystem/tar.d.ts +11 -0
  43. package/lib/archive/filesystem/tar.js +22 -0
  44. package/lib/archive/filesystem/untar.d.ts +20 -0
  45. package/lib/archive/filesystem/untar.js +24 -0
  46. package/lib/archive/filesystem/utils.d.ts +109 -0
  47. package/lib/archive/filesystem/utils.js +185 -0
  48. package/lib/archive/filesystem/zip.d.ts +29 -0
  49. package/lib/archive/filesystem/zip.js +53 -0
  50. package/lib/archive/index.d.ts +6 -0
  51. package/lib/archive/index.js +11 -0
  52. package/lib/archive/page.d.ts +263 -0
  53. package/lib/archive/page.js +316 -0
  54. package/lib/archive/resource.d.ts +46 -0
  55. package/lib/archive/resource.js +62 -0
  56. package/lib/archive/safe-path.d.ts +9 -0
  57. package/lib/archive/safe-path.js +17 -0
  58. package/lib/archive/types.d.ts +210 -0
  59. package/lib/archive/types.js +1 -0
  60. package/lib/crawler/clear-destination-cache.d.ts +5 -0
  61. package/lib/crawler/clear-destination-cache.js +8 -0
  62. package/lib/crawler/crawler.d.ts +73 -0
  63. package/lib/crawler/crawler.js +748 -0
  64. package/lib/crawler/decompose-url.d.ts +25 -0
  65. package/lib/crawler/decompose-url.js +71 -0
  66. package/lib/crawler/destination-cache.d.ts +7 -0
  67. package/lib/crawler/destination-cache.js +6 -0
  68. package/lib/crawler/detect-pagination-pattern.d.ts +16 -0
  69. package/lib/crawler/detect-pagination-pattern.js +61 -0
  70. package/lib/crawler/fetch-destination.d.ts +38 -0
  71. package/lib/crawler/fetch-destination.js +208 -0
  72. package/lib/crawler/fetch-robots-txt.d.ts +42 -0
  73. package/lib/crawler/fetch-robots-txt.js +44 -0
  74. package/lib/crawler/find-best-matching-scope.d.ts +12 -0
  75. package/lib/crawler/find-best-matching-scope.js +46 -0
  76. package/lib/crawler/generate-predicted-urls.d.ts +13 -0
  77. package/lib/crawler/generate-predicted-urls.js +27 -0
  78. package/lib/crawler/handle-ignore-and-skip.d.ts +16 -0
  79. package/lib/crawler/handle-ignore-and-skip.js +19 -0
  80. package/lib/crawler/handle-resource-response.d.ts +13 -0
  81. package/lib/crawler/handle-resource-response.js +16 -0
  82. package/lib/crawler/handle-scrape-end.d.ts +24 -0
  83. package/lib/crawler/handle-scrape-end.js +82 -0
  84. package/lib/crawler/handle-scrape-error.d.ts +37 -0
  85. package/lib/crawler/handle-scrape-error.js +38 -0
  86. package/lib/crawler/index.d.ts +2 -0
  87. package/lib/crawler/index.js +2 -0
  88. package/lib/crawler/inject-scope-auth.d.ts +11 -0
  89. package/lib/crawler/inject-scope-auth.js +21 -0
  90. package/lib/crawler/is-external-url.d.ts +11 -0
  91. package/lib/crawler/is-external-url.js +12 -0
  92. package/lib/crawler/is-in-any-lower-layer.d.ts +13 -0
  93. package/lib/crawler/is-in-any-lower-layer.js +15 -0
  94. package/lib/crawler/link-list.d.ts +112 -0
  95. package/lib/crawler/link-list.js +248 -0
  96. package/lib/crawler/link-to-page-data.d.ts +14 -0
  97. package/lib/crawler/link-to-page-data.js +32 -0
  98. package/lib/crawler/net-timeout-error.d.ts +9 -0
  99. package/lib/crawler/net-timeout-error.js +11 -0
  100. package/lib/crawler/network.d.ts +30 -0
  101. package/lib/crawler/network.js +226 -0
  102. package/lib/crawler/protocol-agnostic-key.d.ts +9 -0
  103. package/lib/crawler/protocol-agnostic-key.js +11 -0
  104. package/lib/crawler/reconstruct-url.d.ts +10 -0
  105. package/lib/crawler/reconstruct-url.js +28 -0
  106. package/lib/crawler/result-handler.d.ts +118 -0
  107. package/lib/crawler/result-handler.js +153 -0
  108. package/lib/crawler/robots-checker.d.ts +26 -0
  109. package/lib/crawler/robots-checker.js +62 -0
  110. package/lib/crawler/should-discard-predicted.d.ts +14 -0
  111. package/lib/crawler/should-discard-predicted.js +31 -0
  112. package/lib/crawler/should-skip-url.d.ts +23 -0
  113. package/lib/crawler/should-skip-url.js +15 -0
  114. package/lib/crawler/speculative-pagination.d.ts +52 -0
  115. package/lib/crawler/speculative-pagination.js +215 -0
  116. package/lib/crawler/types.d.ts +119 -0
  117. package/lib/crawler/types.js +1 -0
  118. package/lib/crawler/url-filter.d.ts +56 -0
  119. package/lib/crawler/url-filter.js +110 -0
  120. package/lib/crawler-orchestrator.d.ts +142 -0
  121. package/lib/crawler-orchestrator.js +309 -0
  122. package/lib/debug.d.ts +8 -0
  123. package/lib/debug.js +9 -0
  124. package/lib/index.d.ts +16 -0
  125. package/lib/index.js +18 -0
  126. package/lib/qzilla.d.ts +136 -0
  127. package/lib/qzilla.js +292 -0
  128. package/lib/types.d.ts +27 -0
  129. package/lib/types.js +1 -0
  130. package/lib/utils/array/each-splitted.d.ts +10 -0
  131. package/lib/utils/array/each-splitted.js +14 -0
  132. package/lib/utils/array/index.d.ts +1 -0
  133. package/lib/utils/array/index.js +1 -0
  134. package/lib/utils/async/index.d.ts +1 -0
  135. package/lib/utils/async/index.js +1 -0
  136. package/lib/utils/debug.d.ts +5 -0
  137. package/lib/utils/debug.js +5 -0
  138. package/lib/utils/error/dom-evaluation-error.d.ts +7 -0
  139. package/lib/utils/error/dom-evaluation-error.js +7 -0
  140. package/lib/utils/error/error-emitter.d.ts +18 -0
  141. package/lib/utils/error/error-emitter.js +29 -0
  142. package/lib/utils/error/index.d.ts +3 -0
  143. package/lib/utils/error/index.js +2 -0
  144. package/lib/utils/event-emitter/index.d.ts +6 -0
  145. package/lib/utils/event-emitter/index.js +6 -0
  146. package/lib/utils/index.d.ts +5 -0
  147. package/lib/utils/index.js +5 -0
  148. package/lib/utils/network/index.d.ts +1 -0
  149. package/lib/utils/network/index.js +1 -0
  150. package/lib/utils/object/clean-object.d.ts +8 -0
  151. package/lib/utils/object/clean-object.js +13 -0
  152. package/lib/utils/object/index.d.ts +1 -0
  153. package/lib/utils/object/index.js +1 -0
  154. package/lib/utils/path/index.d.ts +1 -0
  155. package/lib/utils/path/index.js +1 -0
  156. package/lib/utils/path/safe-filepath.d.ts +7 -0
  157. package/lib/utils/path/safe-filepath.js +12 -0
  158. package/lib/utils/regexp/index.d.ts +1 -0
  159. package/lib/utils/regexp/index.js +1 -0
  160. package/lib/utils/retryable/index.d.ts +2 -0
  161. package/lib/utils/retryable/index.js +1 -0
  162. package/lib/utils/sort/index.d.ts +14 -0
  163. package/lib/utils/sort/index.js +61 -0
  164. package/lib/utils/sort/remove-matches.d.ts +9 -0
  165. package/lib/utils/sort/remove-matches.js +23 -0
  166. package/lib/utils/types/index.d.ts +1 -0
  167. package/lib/utils/types/index.js +1 -0
  168. package/lib/utils/types/types.d.ts +46 -0
  169. package/lib/utils/types/types.js +1 -0
  170. package/lib/utils/url/index.d.ts +5 -0
  171. package/lib/utils/url/index.js +5 -0
  172. package/lib/utils/url/is-lower-layer.d.ts +15 -0
  173. package/lib/utils/url/is-lower-layer.js +55 -0
  174. package/lib/utils/url/parse-url.d.ts +11 -0
  175. package/lib/utils/url/parse-url.js +20 -0
  176. package/lib/utils/url/path-match.d.ts +11 -0
  177. package/lib/utils/url/path-match.js +18 -0
  178. package/lib/utils/url/sort-url.d.ts +10 -0
  179. package/lib/utils/url/sort-url.js +24 -0
  180. package/lib/utils/url/url-partial-match.d.ts +11 -0
  181. package/lib/utils/url/url-partial-match.js +32 -0
  182. package/package.json +49 -0
  183. package/src/archive/__mock__/.gitignore +3 -0
  184. package/src/archive/__mock__/mock.sqlite +0 -0
  185. package/src/archive/archive-accessor.ts +337 -0
  186. package/src/archive/archive.ts +408 -0
  187. package/src/archive/database.spec.ts +469 -0
  188. package/src/archive/database.ts +1059 -0
  189. package/src/archive/debug.ts +10 -0
  190. package/src/archive/filesystem/append-text.spec.ts +26 -0
  191. package/src/archive/filesystem/append-text.ts +16 -0
  192. package/src/archive/filesystem/copy-dir-sync.spec.ts +27 -0
  193. package/src/archive/filesystem/copy-dir-sync.ts +10 -0
  194. package/src/archive/filesystem/copy-dir.spec.ts +33 -0
  195. package/src/archive/filesystem/copy-dir.ts +14 -0
  196. package/src/archive/filesystem/exists.spec.ts +33 -0
  197. package/src/archive/filesystem/exists.ts +10 -0
  198. package/src/archive/filesystem/get-file-list.spec.ts +37 -0
  199. package/src/archive/filesystem/get-file-list.ts +13 -0
  200. package/src/archive/filesystem/index.ts +17 -0
  201. package/src/archive/filesystem/is-dir.spec.ts +29 -0
  202. package/src/archive/filesystem/is-dir.ts +11 -0
  203. package/src/archive/filesystem/mkdir.spec.ts +37 -0
  204. package/src/archive/filesystem/mkdir.ts +16 -0
  205. package/src/archive/filesystem/output-json.spec.ts +34 -0
  206. package/src/archive/filesystem/output-json.ts +16 -0
  207. package/src/archive/filesystem/output-text.spec.ts +31 -0
  208. package/src/archive/filesystem/output-text.ts +35 -0
  209. package/src/archive/filesystem/read-json.spec.ts +26 -0
  210. package/src/archive/filesystem/read-json.ts +12 -0
  211. package/src/archive/filesystem/read-text.spec.ts +25 -0
  212. package/src/archive/filesystem/read-text.ts +11 -0
  213. package/src/archive/filesystem/readline.spec.ts +29 -0
  214. package/src/archive/filesystem/readline.ts +30 -0
  215. package/src/archive/filesystem/remove.spec.ts +34 -0
  216. package/src/archive/filesystem/remove.ts +11 -0
  217. package/src/archive/filesystem/rename.spec.ts +46 -0
  218. package/src/archive/filesystem/rename.ts +21 -0
  219. package/src/archive/filesystem/tar.spec.ts +33 -0
  220. package/src/archive/filesystem/tar.ts +27 -0
  221. package/src/archive/filesystem/untar.spec.ts +34 -0
  222. package/src/archive/filesystem/untar.ts +36 -0
  223. package/src/archive/index.ts +13 -0
  224. package/src/archive/page.spec.ts +368 -0
  225. package/src/archive/page.ts +420 -0
  226. package/src/archive/resource.spec.ts +101 -0
  227. package/src/archive/resource.ts +73 -0
  228. package/src/archive/safe-path.spec.ts +44 -0
  229. package/src/archive/safe-path.ts +18 -0
  230. package/src/archive/types.ts +227 -0
  231. package/src/crawler/clear-destination-cache.spec.ts +20 -0
  232. package/src/crawler/clear-destination-cache.ts +9 -0
  233. package/src/crawler/crawler.ts +873 -0
  234. package/src/crawler/decompose-url.spec.ts +48 -0
  235. package/src/crawler/decompose-url.ts +90 -0
  236. package/src/crawler/destination-cache.spec.ts +23 -0
  237. package/src/crawler/destination-cache.ts +8 -0
  238. package/src/crawler/detect-pagination-pattern.spec.ts +169 -0
  239. package/src/crawler/detect-pagination-pattern.ts +66 -0
  240. package/src/crawler/fetch-destination.ts +257 -0
  241. package/src/crawler/fetch-robots-txt.spec.ts +83 -0
  242. package/src/crawler/fetch-robots-txt.ts +91 -0
  243. package/src/crawler/find-best-matching-scope.spec.ts +39 -0
  244. package/src/crawler/find-best-matching-scope.ts +57 -0
  245. package/src/crawler/generate-predicted-urls.spec.ts +42 -0
  246. package/src/crawler/generate-predicted-urls.ts +34 -0
  247. package/src/crawler/handle-ignore-and-skip.spec.ts +66 -0
  248. package/src/crawler/handle-ignore-and-skip.ts +30 -0
  249. package/src/crawler/handle-resource-response.spec.ts +45 -0
  250. package/src/crawler/handle-resource-response.ts +21 -0
  251. package/src/crawler/handle-scrape-end.spec.ts +109 -0
  252. package/src/crawler/handle-scrape-end.ts +115 -0
  253. package/src/crawler/handle-scrape-error.spec.ts +105 -0
  254. package/src/crawler/handle-scrape-error.ts +58 -0
  255. package/src/crawler/index.ts +2 -0
  256. package/src/crawler/inject-scope-auth.spec.ts +36 -0
  257. package/src/crawler/inject-scope-auth.ts +27 -0
  258. package/src/crawler/is-external-url.spec.ts +31 -0
  259. package/src/crawler/is-external-url.ts +17 -0
  260. package/src/crawler/is-in-any-lower-layer.spec.ts +31 -0
  261. package/src/crawler/is-in-any-lower-layer.ts +22 -0
  262. package/src/crawler/link-list.spec.ts +355 -0
  263. package/src/crawler/link-list.ts +275 -0
  264. package/src/crawler/link-to-page-data.spec.ts +133 -0
  265. package/src/crawler/link-to-page-data.ts +34 -0
  266. package/src/crawler/net-timeout-error.spec.ts +25 -0
  267. package/src/crawler/net-timeout-error.ts +11 -0
  268. package/src/crawler/protocol-agnostic-key.spec.ts +40 -0
  269. package/src/crawler/protocol-agnostic-key.ts +11 -0
  270. package/src/crawler/reconstruct-url.spec.ts +37 -0
  271. package/src/crawler/reconstruct-url.ts +37 -0
  272. package/src/crawler/robots-checker.spec.ts +104 -0
  273. package/src/crawler/robots-checker.ts +73 -0
  274. package/src/crawler/should-discard-predicted.spec.ts +125 -0
  275. package/src/crawler/should-discard-predicted.ts +33 -0
  276. package/src/crawler/should-skip-url.spec.ts +77 -0
  277. package/src/crawler/should-skip-url.ts +37 -0
  278. package/src/crawler/types.ts +146 -0
  279. package/src/crawler-orchestrator.ts +401 -0
  280. package/src/debug.ts +10 -0
  281. package/src/index.ts +25 -0
  282. package/src/types.ts +30 -0
  283. package/src/utils/array/each-splitted.spec.ts +38 -0
  284. package/src/utils/array/each-splitted.ts +19 -0
  285. package/src/utils/array/index.ts +1 -0
  286. package/src/utils/debug.ts +6 -0
  287. package/src/utils/error/dom-evaluation-error.spec.ts +20 -0
  288. package/src/utils/error/dom-evaluation-error.ts +6 -0
  289. package/src/utils/error/error-emitter.spec.ts +78 -0
  290. package/src/utils/error/error-emitter.ts +44 -0
  291. package/src/utils/error/index.ts +3 -0
  292. package/src/utils/index.ts +5 -0
  293. package/src/utils/object/clean-object.spec.ts +24 -0
  294. package/src/utils/object/clean-object.ts +13 -0
  295. package/src/utils/object/index.ts +1 -0
  296. package/src/utils/types/index.ts +1 -0
  297. package/src/utils/types/types.ts +65 -0
  298. package/tsconfig.json +11 -0
  299. package/tsconfig.tsbuildinfo +1 -0
@@ -0,0 +1,25 @@
1
+ /**
2
+ * Intermediate representation of a URL split into comparable tokens.
3
+ * Used by pagination detection to identify which token changed between two URLs.
4
+ */
5
+ export interface DecomposedUrl {
6
+ /** Hostname including port (e.g. `"example.com:8080"`). */
7
+ host: string;
8
+ /** Path segments split by `/` (e.g. `["page", "2"]` for `/page/2`). */
9
+ pathSegments: string[];
10
+ /** Sorted query parameter keys. */
11
+ queryKeys: string[];
12
+ /** Query parameter values sorted by their corresponding key. */
13
+ queryValues: string[];
14
+ /** Protocol prefix (e.g. `"https:"`) or empty string if protocol-agnostic. */
15
+ protocol: string;
16
+ }
17
+ /**
18
+ * Decomposes a URL string into its constituent tokens for comparison.
19
+ * Handles both full URLs (`https://host/path?q=v`) and protocol-agnostic
20
+ * URLs (`//host/path?q=v`). Query parameters are sorted by key for
21
+ * consistent comparison.
22
+ * @param url - The URL string to decompose
23
+ * @returns The decomposed URL, or `null` if the format is invalid
24
+ */
25
+ export declare function decomposeUrl(url: string): DecomposedUrl | null;
@@ -0,0 +1,71 @@
1
+ /**
2
+ * Decomposes a URL string into its constituent tokens for comparison.
3
+ * Handles both full URLs (`https://host/path?q=v`) and protocol-agnostic
4
+ * URLs (`//host/path?q=v`). Query parameters are sorted by key for
5
+ * consistent comparison.
6
+ * @param url - The URL string to decompose
7
+ * @returns The decomposed URL, or `null` if the format is invalid
8
+ */
9
+ export function decomposeUrl(url) {
10
+ // URL format: //host/path?query or //host?query (protocol-agnostic)
11
+ // Also handle protocol://host/path?query
12
+ let work = url;
13
+ let protocol = '';
14
+ // Strip protocol
15
+ const protoMatch = /^(https?:)?\/\//.exec(work);
16
+ if (!protoMatch)
17
+ return null;
18
+ protocol = protoMatch[1] ?? '';
19
+ work = work.slice(protoMatch[0].length);
20
+ // Split host from rest
21
+ const slashIdx = work.indexOf('/');
22
+ const qmarkIdx = work.indexOf('?');
23
+ let host;
24
+ let pathPart;
25
+ let queryPart;
26
+ if (slashIdx === -1 && qmarkIdx === -1) {
27
+ host = work;
28
+ pathPart = '';
29
+ queryPart = '';
30
+ }
31
+ else if (slashIdx === -1) {
32
+ host = work.slice(0, qmarkIdx);
33
+ pathPart = '';
34
+ queryPart = work.slice(qmarkIdx + 1);
35
+ }
36
+ else {
37
+ host = work.slice(0, slashIdx);
38
+ const pathAndQuery = work.slice(slashIdx + 1);
39
+ const pq = pathAndQuery.indexOf('?');
40
+ if (pq === -1) {
41
+ pathPart = pathAndQuery;
42
+ queryPart = '';
43
+ }
44
+ else {
45
+ pathPart = pathAndQuery.slice(0, pq);
46
+ queryPart = pathAndQuery.slice(pq + 1);
47
+ }
48
+ }
49
+ const pathSegments = pathPart ? pathPart.split('/') : [];
50
+ // Parse query into sorted key-value pairs
51
+ const queryPairs = [];
52
+ if (queryPart) {
53
+ for (const pair of queryPart.split('&')) {
54
+ const eqIdx = pair.indexOf('=');
55
+ if (eqIdx === -1) {
56
+ queryPairs.push([pair, '']);
57
+ }
58
+ else {
59
+ queryPairs.push([pair.slice(0, eqIdx), pair.slice(eqIdx + 1)]);
60
+ }
61
+ }
62
+ }
63
+ queryPairs.sort((a, b) => a[0].localeCompare(b[0]));
64
+ return {
65
+ host,
66
+ pathSegments,
67
+ queryKeys: queryPairs.map(([k]) => k),
68
+ queryValues: queryPairs.map(([, v]) => v),
69
+ protocol,
70
+ };
71
+ }
@@ -0,0 +1,7 @@
1
+ import type { PageData } from '@d-zero/beholder';
2
+ /**
3
+ * In-memory cache of HEAD request results keyed by URL (without hash).
4
+ * Stores either the successful {@link PageData} or the {@link Error} to avoid
5
+ * repeated requests to the same destination.
6
+ */
7
+ export declare const destinationCache: Map<string, Error | PageData>;
@@ -0,0 +1,6 @@
1
+ /**
2
+ * In-memory cache of HEAD request results keyed by URL (without hash).
3
+ * Stores either the successful {@link PageData} or the {@link Error} to avoid
4
+ * repeated requests to the same destination.
5
+ */
6
+ export const destinationCache = new Map();
@@ -0,0 +1,16 @@
1
+ import type { PaginationPattern } from './types.js';
2
+ /**
3
+ * Compares two consecutive URL strings and detects a single-token numeric
4
+ * pagination pattern (e.g. `/page/1` → `/page/2`, or `?p=1` → `?p=2`).
5
+ *
6
+ * The algorithm decomposes each URL into tokens (path segments + sorted query values),
7
+ * then checks that exactly one token differs and both values are integers with a
8
+ * positive step. Returns `null` when no pattern is detected.
9
+ *
10
+ * WHY single-token constraint: Multi-token differences (e.g. both path and query
11
+ * changing) indicate different routes rather than pagination, so they are rejected.
12
+ * @param prevUrl - The previously pushed URL (protocol-agnostic, without hash/auth)
13
+ * @param currentUrl - The newly discovered URL
14
+ * @returns The detected pattern, or `null` if no pagination pattern was found
15
+ */
16
+ export declare function detectPaginationPattern(prevUrl: string, currentUrl: string): PaginationPattern | null;
@@ -0,0 +1,61 @@
1
+ import { decomposeUrl } from './decompose-url.js';
2
+ /**
3
+ * Compares two consecutive URL strings and detects a single-token numeric
4
+ * pagination pattern (e.g. `/page/1` → `/page/2`, or `?p=1` → `?p=2`).
5
+ *
6
+ * The algorithm decomposes each URL into tokens (path segments + sorted query values),
7
+ * then checks that exactly one token differs and both values are integers with a
8
+ * positive step. Returns `null` when no pattern is detected.
9
+ *
10
+ * WHY single-token constraint: Multi-token differences (e.g. both path and query
11
+ * changing) indicate different routes rather than pagination, so they are rejected.
12
+ * @param prevUrl - The previously pushed URL (protocol-agnostic, without hash/auth)
13
+ * @param currentUrl - The newly discovered URL
14
+ * @returns The detected pattern, or `null` if no pagination pattern was found
15
+ */
16
+ export function detectPaginationPattern(prevUrl, currentUrl) {
17
+ const prev = decomposeUrl(prevUrl);
18
+ const curr = decomposeUrl(currentUrl);
19
+ if (!prev || !curr)
20
+ return null;
21
+ // Host (including port) must match
22
+ if (prev.host !== curr.host)
23
+ return null;
24
+ // Path segment count must match
25
+ if (prev.pathSegments.length !== curr.pathSegments.length)
26
+ return null;
27
+ // Query key sets must match in count and identity
28
+ if (prev.queryKeys.length !== curr.queryKeys.length)
29
+ return null;
30
+ for (let i = 0; i < prev.queryKeys.length; i++) {
31
+ if (prev.queryKeys[i] !== curr.queryKeys[i])
32
+ return null;
33
+ }
34
+ // Build combined token arrays: path segments + query values (sorted by key)
35
+ const prevTokens = [...prev.pathSegments, ...prev.queryValues];
36
+ const currTokens = [...curr.pathSegments, ...curr.queryValues];
37
+ let diffIndex = -1;
38
+ for (const [i, prevToken] of prevTokens.entries()) {
39
+ if (prevToken !== currTokens[i]) {
40
+ if (diffIndex !== -1)
41
+ return null; // more than one difference
42
+ diffIndex = i;
43
+ }
44
+ }
45
+ if (diffIndex === -1)
46
+ return null; // identical URLs
47
+ const prevNum = Number(prevTokens[diffIndex]);
48
+ const currNum = Number(currTokens[diffIndex]);
49
+ if (!Number.isFinite(prevNum) || !Number.isFinite(currNum))
50
+ return null;
51
+ if (!Number.isInteger(prevNum) || !Number.isInteger(currNum))
52
+ return null;
53
+ const step = currNum - prevNum;
54
+ if (step <= 0)
55
+ return null;
56
+ return {
57
+ tokenIndex: diffIndex,
58
+ step,
59
+ currentNumber: currNum,
60
+ };
61
+ }
@@ -0,0 +1,38 @@
1
+ import type { PageData } from '@d-zero/beholder';
2
+ import type { ExURL } from '@d-zero/shared/parse-url';
3
+ /**
4
+ * Parameters for {@link fetchDestination}.
5
+ */
6
+ export interface FetchDestinationParams {
7
+ /** The extended URL to fetch. */
8
+ readonly url: ExURL;
9
+ /** Whether the URL is external to the crawl scope. */
10
+ readonly isExternal: boolean;
11
+ /** The HTTP method to use. Defaults to `"HEAD"`. */
12
+ readonly method?: string;
13
+ /** Additional options. */
14
+ readonly options?: {
15
+ /**
16
+ * When set, forces a GET request and reads up to this many bytes from
17
+ * the response body to extract an HTML `<title>` tag.
18
+ */
19
+ titleBytesLimit?: number;
20
+ };
21
+ /** User-Agent string to send with the request. */
22
+ readonly userAgent?: string;
23
+ }
24
+ /**
25
+ * Fetches the destination metadata for a URL using an HTTP HEAD request (or GET as fallback).
26
+ *
27
+ * Results are cached in memory so that repeated calls for the same URL
28
+ * (without hash) return immediately. The request races against a 10-second
29
+ * timeout; if the server does not respond in time, a {@link NetTimeoutError} is thrown.
30
+ *
31
+ * If the server returns 405 (Method Not Allowed), 501 (Not Implemented), or 503
32
+ * (Service Unavailable) for a HEAD request, the function automatically retries with GET.
33
+ * @param params - Parameters containing URL, external flag, method, options, and optional User-Agent.
34
+ * @returns The page metadata obtained from the HTTP response.
35
+ * @throws {NetTimeoutError} If the request exceeds the 10-second timeout.
36
+ * @throws {Error} If the HTTP request fails for any other reason.
37
+ */
38
+ export declare function fetchDestination(params: FetchDestinationParams): Promise<PageData>;
@@ -0,0 +1,208 @@
1
+ import { delay } from '@d-zero/shared/delay';
2
+ import redirects from 'follow-redirects';
3
+ import { destinationCache } from './destination-cache.js';
4
+ import NetTimeoutError from './net-timeout-error.js';
5
+ /**
6
+ * Fetches the destination metadata for a URL using an HTTP HEAD request (or GET as fallback).
7
+ *
8
+ * Results are cached in memory so that repeated calls for the same URL
9
+ * (without hash) return immediately. The request races against a 10-second
10
+ * timeout; if the server does not respond in time, a {@link NetTimeoutError} is thrown.
11
+ *
12
+ * If the server returns 405 (Method Not Allowed), 501 (Not Implemented), or 503
13
+ * (Service Unavailable) for a HEAD request, the function automatically retries with GET.
14
+ * @param params - Parameters containing URL, external flag, method, options, and optional User-Agent.
15
+ * @returns The page metadata obtained from the HTTP response.
16
+ * @throws {NetTimeoutError} If the request exceeds the 10-second timeout.
17
+ * @throws {Error} If the HTTP request fails for any other reason.
18
+ */
19
+ export async function fetchDestination(params) {
20
+ const { url, isExternal, method = 'HEAD', options, userAgent } = params;
21
+ const titleBytesLimit = options?.titleBytesLimit;
22
+ const cacheKey = titleBytesLimit == null ? url.withoutHash : `${url.withoutHash}:title`;
23
+ if (destinationCache.has(cacheKey)) {
24
+ const cache = destinationCache.get(cacheKey);
25
+ if (cache instanceof Error) {
26
+ throw cache;
27
+ }
28
+ return cache;
29
+ }
30
+ const effectiveMethod = titleBytesLimit == null ? method : 'GET';
31
+ const result = await Promise.race([
32
+ _fetchHead(url, isExternal, effectiveMethod, titleBytesLimit, userAgent).catch((error) => (error instanceof Error ? error : new Error(String(error)))),
33
+ (async () => {
34
+ await delay(10 * 1000);
35
+ return new NetTimeoutError(url.href);
36
+ })(),
37
+ ]);
38
+ destinationCache.set(cacheKey, result);
39
+ if (result instanceof Error) {
40
+ throw result;
41
+ }
42
+ return result;
43
+ }
44
+ /**
45
+ * Performs the actual HTTP request to retrieve page metadata.
46
+ *
47
+ * Handles both HTTP and HTTPS protocols via `follow-redirects`, tracks redirect chains,
48
+ * and falls back to GET on certain status codes (405, 501, 503).
49
+ * @param url - The extended URL to request.
50
+ * @param isExternal - Whether the URL is external to the crawl scope.
51
+ * @param method - The HTTP method (`"HEAD"` or `"GET"`).
52
+ * @param titleBytesLimit - When set, reads up to this many bytes from the response body
53
+ * to extract a `<title>` tag, then destroys the connection.
54
+ * @param userAgent - Optional User-Agent string to send with the request.
55
+ * @returns A promise resolving to {@link PageData} with response metadata.
56
+ */
57
+ async function _fetchHead(url, isExternal, method, titleBytesLimit, userAgent) {
58
+ return new Promise((resolve, reject) => {
59
+ const hostHeader = url.port ? `${url.hostname}:${url.port}` : url.hostname;
60
+ const request = {
61
+ protocol: url.protocol,
62
+ hostname: url.hostname,
63
+ port: url.port || undefined,
64
+ path: url.pathname,
65
+ method,
66
+ headers: {
67
+ host: hostHeader,
68
+ ...(userAgent ? { 'User-Agent': userAgent } : {}),
69
+ Connection: 'keep-alive',
70
+ Pragma: 'no-cache',
71
+ 'Cache-Control': 'no-cache',
72
+ 'Upgrade-Insecure-Requests': 1,
73
+ Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
74
+ 'Accept-Encoding': 'gzip, deflate',
75
+ 'Accept-Language': 'ja,en;q=0.9,zh;q=0.8,en-US;q=0.7,pl;q=0.6,de;q=0.5,zh-CN;q=0.4,zh-TW;q=0.3,th;q=0.2,ko;q=0.1,fr;q=0.1',
76
+ // Range: url.extname?.toLowerCase() === 'pdf' ? 'bytes=0-0' : undefined,
77
+ },
78
+ };
79
+ if (url.username && url.password) {
80
+ request.auth = `${url.username}:${url.password}`;
81
+ }
82
+ let req;
83
+ let destroyed = false;
84
+ const response = (res) => {
85
+ const chunks = [];
86
+ let totalBytes = 0;
87
+ let settled = false;
88
+ const buildPageData = (title) => {
89
+ const redirectPaths = res.redirects.map((r) => r.url);
90
+ const _contentLength = Number.parseInt(res.headers['content-length'] || '');
91
+ const contentLength = Number.isFinite(_contentLength) ? _contentLength : null;
92
+ return {
93
+ url,
94
+ isTarget: !isExternal,
95
+ isExternal,
96
+ redirectPaths,
97
+ status: res.statusCode || 0,
98
+ statusText: res.statusMessage || '',
99
+ contentType: res.headers['content-type']?.split(';')[0] || null,
100
+ contentLength,
101
+ responseHeaders: res.headers,
102
+ meta: { title },
103
+ imageList: [],
104
+ anchorList: [],
105
+ html: '',
106
+ isSkipped: false,
107
+ };
108
+ };
109
+ if (titleBytesLimit == null) {
110
+ res.on('data', () => { });
111
+ res.on('end', async () => {
112
+ let rep = buildPageData('');
113
+ if (rep.status === 405) {
114
+ if (method === 'GET') {
115
+ reject(new Error(`Method Not Allowed: ${url.href} ${rep.statusText}`));
116
+ return;
117
+ }
118
+ try {
119
+ rep = await fetchDestination({ url, isExternal, method: 'GET' });
120
+ }
121
+ catch (error) {
122
+ reject(error);
123
+ return;
124
+ }
125
+ }
126
+ if (rep.status === 501) {
127
+ if (method === 'GET') {
128
+ reject(new Error(`Method Not Implemented: ${url.href} ${rep.statusText}`));
129
+ return;
130
+ }
131
+ await delay(5 * 1000);
132
+ try {
133
+ rep = await fetchDestination({ url, isExternal, method: 'GET' });
134
+ }
135
+ catch (error) {
136
+ reject(error);
137
+ return;
138
+ }
139
+ }
140
+ if (rep.status === 503) {
141
+ if (method === 'GET') {
142
+ reject(new Error(`Retrying failed: ${url.href} ${rep.statusText}`));
143
+ return;
144
+ }
145
+ await delay(5 * 1000);
146
+ try {
147
+ rep = await fetchDestination({ url, isExternal, method: 'GET' });
148
+ }
149
+ catch (error) {
150
+ reject(error);
151
+ return;
152
+ }
153
+ }
154
+ resolve(rep);
155
+ });
156
+ }
157
+ else {
158
+ res.on('data', (chunk) => {
159
+ if (settled)
160
+ return;
161
+ chunks.push(chunk);
162
+ totalBytes += chunk.length;
163
+ // Check for title in accumulated data so far
164
+ const body = Buffer.concat(chunks).toString('utf8');
165
+ const titleMatch = /<title[^>]*>([\s\S]*?)<\/title>/i.exec(body);
166
+ if (titleMatch) {
167
+ settled = true;
168
+ const title = titleMatch[1]?.trim() ?? '';
169
+ resolve(buildPageData(title));
170
+ destroyed = true;
171
+ req.destroy();
172
+ return;
173
+ }
174
+ // Reached byte limit without finding title
175
+ if (totalBytes >= titleBytesLimit) {
176
+ settled = true;
177
+ resolve(buildPageData(''));
178
+ destroyed = true;
179
+ req.destroy();
180
+ }
181
+ });
182
+ res.on('end', () => {
183
+ if (settled)
184
+ return;
185
+ settled = true;
186
+ // Stream ended before limit — try to extract title from what we have
187
+ const body = Buffer.concat(chunks).toString('utf8');
188
+ const titleMatch = /<title[^>]*>([\s\S]*?)<\/title>/i.exec(body);
189
+ const title = titleMatch?.[1]?.trim() ?? '';
190
+ resolve(buildPageData(title));
191
+ });
192
+ }
193
+ };
194
+ if (url.protocol === 'https:') {
195
+ req = redirects.https.request(request, response);
196
+ }
197
+ else {
198
+ req = redirects.http.request(request, response);
199
+ }
200
+ req.on('error', (error) => {
201
+ // Ignore errors caused by intentional req.destroy()
202
+ if (destroyed)
203
+ return;
204
+ reject(error);
205
+ });
206
+ req.end();
207
+ });
208
+ }
@@ -0,0 +1,42 @@
1
/**
 * Result of parsing a robots.txt file.
 *
 * Mirrors the subset of the `robots-parser` package's API that this
 * crawler uses; instances are produced by {@link fetchRobotsTxt}.
 */
interface RobotsResult {
    /**
     * Check if a URL is allowed for a given user-agent.
     * @param url - The URL to check.
     * @param ua - The user-agent string to match against.
     * @returns `true` if allowed, `false` if disallowed, `undefined` if no matching rule.
     */
    isAllowed(url: string, ua?: string): boolean | undefined;
    /**
     * Check if a URL is disallowed for a given user-agent.
     * @param url - The URL to check.
     * @param ua - The user-agent string to match against.
     * @returns `true` if disallowed, `false` if allowed, `undefined` if no matching rule.
     */
    isDisallowed(url: string, ua?: string): boolean | undefined;
    /**
     * Get the crawl delay for a given user-agent.
     * @param ua - The user-agent string to match against.
     * @returns The crawl delay in seconds, or `undefined` if not specified.
     */
    getCrawlDelay(ua?: string): number | undefined;
    /**
     * Get the sitemaps listed in robots.txt.
     * @returns An array of sitemap URLs.
     */
    getSitemaps(): string[];
}
31
/**
 * Fetches and parses the robots.txt file for a given origin URL.
 *
 * Sends an HTTP(S) GET request to `{origin}/robots.txt` and parses the
 * response using `robots-parser`. Returns `null` if the server returns
 * a non-200 status code, if the request fails, or if it times out
 * (the implementation aborts after 10 seconds).
 * @param origin - The origin URL (e.g., `https://example.com`).
 * @param userAgent - Optional User-Agent string to send with the request.
 * @returns A parsed RobotsResult instance, or `null` if robots.txt is unavailable.
 */
export declare function fetchRobotsTxt(origin: string, userAgent?: string): Promise<RobotsResult | null>;
export {};
@@ -0,0 +1,44 @@
1
+ import { createRequire } from 'node:module';
2
+ import redirects from 'follow-redirects';
3
+ const require = createRequire(import.meta.url);
4
+ const robotsParser = require('robots-parser');
5
/**
 * Fetches and parses the robots.txt file for a given origin URL.
 *
 * Issues an HTTP(S) GET to `{origin}/robots.txt` (following redirects via
 * `follow-redirects`) and hands the body to `robots-parser`. Resolves with
 * `null` — never rejects — when the server answers with a non-200 status,
 * the request errors, or the 10-second timeout fires.
 * @param origin - The origin URL (e.g., `https://example.com`).
 * @param userAgent - Optional User-Agent string to send with the request.
 * @returns A parsed RobotsResult instance, or `null` if robots.txt is unavailable.
 */
export async function fetchRobotsTxt(origin, userAgent) {
    const robotsUrl = `${origin}/robots.txt`;
    return new Promise((resolve) => {
        // Pick the transport matching the URL scheme.
        const client = robotsUrl.startsWith('https') ? redirects.https : redirects.http;
        const requestOptions = {
            headers: userAgent ? { 'User-Agent': userAgent } : {},
            timeout: 10_000,
        };
        const req = client.get(robotsUrl, requestOptions, (res) => {
            if (res.statusCode !== 200) {
                // Drain the response so the socket can be reused, then give up.
                res.resume();
                resolve(null);
                return;
            }
            const buffers = [];
            res.on('data', (buf) => {
                buffers.push(buf);
            });
            res.on('end', () => {
                const body = Buffer.concat(buffers).toString('utf8');
                resolve(robotsParser(robotsUrl, body));
            });
            res.on('error', () => resolve(null));
        });
        req.on('error', () => resolve(null));
        req.on('timeout', () => {
            req.destroy();
            resolve(null);
        });
    });
}
@@ -0,0 +1,12 @@
1
+ import type { ExURL } from '@d-zero/shared/parse-url';
2
/**
 * Find the scope URL with the deepest matching path for a given URL.
 *
 * Among all scope URLs sharing the same hostname, returns the one whose
 * path segments are a prefix of the target URL's path segments and which
 * has the greatest depth (on equal depth, the earlier entry in `scopes`
 * wins). Returns `null` if no scope URL matches.
 * @param url - The parsed URL to match against scope URLs.
 * @param scopes - The list of scope URLs to search.
 * @returns The best-matching scope URL, or `null` if none match.
 */
export declare function findBestMatchingScope(url: ExURL, scopes: readonly ExURL[]): ExURL | null;
@@ -0,0 +1,46 @@
1
/**
 * Find the scope URL with the deepest matching path for a given URL.
 *
 * Among all scope URLs sharing the same hostname, returns the one whose
 * path segments are a prefix of the target URL's path segments and which
 * has the greatest depth (on equal depth, the earlier entry wins).
 * Returns `null` if no scope URL matches.
 * @param url - The parsed URL to match against scope URLs.
 * @param scopes - The list of scope URLs to search.
 * @returns The best-matching scope URL, or `null` if none match.
 */
export function findBestMatchingScope(url, scopes) {
    let best = null;
    let bestDepth = -1;
    for (const candidate of scopes) {
        if (candidate.hostname !== url.hostname) {
            continue;
        }
        if (hasPathPrefix(url.paths, candidate.paths) && candidate.depth > bestDepth) {
            best = candidate;
            bestDepth = candidate.depth;
        }
    }
    return best;
}
/**
 * Check whether a target path is equal to or is a descendant of a base path.
 *
 * The target matches when every segment of the base path appears at the
 * same position at the beginning of the target path.
 * @param targetPaths - The path segments of the URL being checked.
 * @param basePaths - The path segments of the scope URL to match against.
 * @returns `true` if the target path starts with or equals the base path.
 */
function hasPathPrefix(targetPaths, basePaths) {
    if (basePaths.length > targetPaths.length) {
        return false;
    }
    return basePaths.every((segment, index) => targetPaths[index] === segment);
}
@@ -0,0 +1,13 @@
1
+ import type { PaginationPattern } from './types.js';
2
/**
 * Generates predicted URLs by extrapolating the detected pagination pattern.
 *
 * Starting from `currentUrl`, applies the pattern's step `count` times to produce
 * future page URLs (e.g. if step=1 and currentNumber=2, generates page 3, 4, ...).
 * These URLs are pushed into the crawl queue and discarded later if they 404.
 * Returns an empty array when `count` is zero or negative, or when
 * `currentUrl` cannot be decomposed.
 * @param pattern - The detected pagination pattern from `detectPaginationPattern()`
 * @param currentUrl - The URL to extrapolate from (protocol-agnostic, without hash/auth)
 * @param count - Number of predicted URLs to generate (typically equals concurrency)
 * @returns Array of predicted URL strings
 */
export declare function generatePredictedUrls(pattern: PaginationPattern, currentUrl: string, count: number): string[];
@@ -0,0 +1,27 @@
1
+ import { decomposeUrl } from './decompose-url.js';
2
+ import { reconstructUrl } from './reconstruct-url.js';
3
/**
 * Generates predicted URLs by extrapolating the detected pagination pattern.
 *
 * Starting from `currentUrl`, applies the pattern's step `count` times to produce
 * future page URLs (e.g. if step=1 and currentNumber=2, generates page 3, 4, ...).
 * These URLs are pushed into the crawl queue and discarded later if they 404.
 * @param pattern - The detected pagination pattern from `detectPaginationPattern()`
 * @param currentUrl - The URL to extrapolate from (protocol-agnostic, without hash/auth)
 * @param count - Number of predicted URLs to generate (typically equals concurrency)
 * @returns Array of predicted URL strings
 */
export function generatePredictedUrls(pattern, currentUrl, count) {
    if (count <= 0) {
        return [];
    }
    const parts = decomposeUrl(currentUrl);
    if (!parts) {
        // URL shape not recognized — nothing to extrapolate from.
        return [];
    }
    // Build count successor URLs: current + step, current + 2*step, ...
    return Array.from({ length: count }, (_, offset) => {
        const pageNumber = pattern.currentNumber + pattern.step * (offset + 1);
        return reconstructUrl(parts, pattern.tokenIndex, String(pageNumber));
    });
}
@@ -0,0 +1,16 @@
1
+ import type LinkList from './link-list.js';
2
+ import type { CrawlerOptions } from './types.js';
3
+ import type { Link } from '../utils/index.js';
4
+ import type { ExURL } from '@d-zero/shared/parse-url';
5
/**
 * Handle a URL that was ignored or skipped during scraping.
 *
 * Marks the URL as done in the link list without any page data,
 * effectively recording that it was encountered but not scraped.
 * A debug log entry is emitted only when the URL was actually in the queue.
 * @param url - The URL that was skipped.
 * @param linkList - The link list managing the crawl queue.
 * @param scope - Map of hostnames to their scope URLs.
 * @param options - Crawler configuration options.
 * @returns The constructed {@link Link} object, or `null` if the URL was not in the queue.
 */
export declare function handleIgnoreAndSkip(url: ExURL, linkList: LinkList, scope: ReadonlyMap<string, readonly ExURL[]>, options: CrawlerOptions): Link | null;
@@ -0,0 +1,19 @@
1
+ import { crawlerLog } from '../debug.js';
2
+ /**
3
+ * Handle a URL that was ignored or skipped during scraping.
4
+ *
5
+ * Marks the URL as done in the link list without any page data,
6
+ * effectively recording that it was encountered but not scraped.
7
+ * @param url - The URL that was skipped.
8
+ * @param linkList - The link list managing the crawl queue.
9
+ * @param scope - Map of hostnames to their scope URLs.
10
+ * @param options - Crawler configuration options.
11
+ * @returns The constructed {@link Link} object, or `null` if the URL was not in the queue.
12
+ */
13
+ export function handleIgnoreAndSkip(url, linkList, scope, options) {
14
+ const updated = linkList.done(url, scope, {}, options);
15
+ if (updated) {
16
+ crawlerLog('Skipped URL: %s', url.href);
17
+ }
18
+ return updated;
19
+ }