@nitpicker/crawler 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. package/CHANGELOG.md +8 -0
  2. package/LICENSE +191 -0
  3. package/README.md +13 -0
  4. package/lib/archive/archive-accessor.d.ts +107 -0
  5. package/lib/archive/archive-accessor.js +264 -0
  6. package/lib/archive/archive.d.ts +174 -0
  7. package/lib/archive/archive.js +331 -0
  8. package/lib/archive/database.d.ts +207 -0
  9. package/lib/archive/database.js +972 -0
  10. package/lib/archive/debug.d.ts +8 -0
  11. package/lib/archive/debug.js +9 -0
  12. package/lib/archive/filesystem/append-text.d.ts +9 -0
  13. package/lib/archive/filesystem/append-text.js +14 -0
  14. package/lib/archive/filesystem/copy-dir-sync.d.ts +6 -0
  15. package/lib/archive/filesystem/copy-dir-sync.js +9 -0
  16. package/lib/archive/filesystem/copy-dir.d.ts +7 -0
  17. package/lib/archive/filesystem/copy-dir.js +13 -0
  18. package/lib/archive/filesystem/exists.d.ts +6 -0
  19. package/lib/archive/filesystem/exists.js +9 -0
  20. package/lib/archive/filesystem/get-file-list.d.ts +8 -0
  21. package/lib/archive/filesystem/get-file-list.js +12 -0
  22. package/lib/archive/filesystem/index.d.ts +17 -0
  23. package/lib/archive/filesystem/index.js +17 -0
  24. package/lib/archive/filesystem/is-dir.d.ts +6 -0
  25. package/lib/archive/filesystem/is-dir.js +10 -0
  26. package/lib/archive/filesystem/mkdir.d.ts +8 -0
  27. package/lib/archive/filesystem/mkdir.js +15 -0
  28. package/lib/archive/filesystem/output-json.d.ts +9 -0
  29. package/lib/archive/filesystem/output-json.js +14 -0
  30. package/lib/archive/filesystem/output-text.d.ts +11 -0
  31. package/lib/archive/filesystem/output-text.js +32 -0
  32. package/lib/archive/filesystem/read-json.d.ts +7 -0
  33. package/lib/archive/filesystem/read-json.js +11 -0
  34. package/lib/archive/filesystem/read-text.d.ts +6 -0
  35. package/lib/archive/filesystem/read-text.js +10 -0
  36. package/lib/archive/filesystem/readline.d.ts +11 -0
  37. package/lib/archive/filesystem/readline.js +26 -0
  38. package/lib/archive/filesystem/remove.d.ts +5 -0
  39. package/lib/archive/filesystem/remove.js +10 -0
  40. package/lib/archive/filesystem/rename.d.ts +11 -0
  41. package/lib/archive/filesystem/rename.js +18 -0
  42. package/lib/archive/filesystem/tar.d.ts +11 -0
  43. package/lib/archive/filesystem/tar.js +22 -0
  44. package/lib/archive/filesystem/untar.d.ts +20 -0
  45. package/lib/archive/filesystem/untar.js +24 -0
  46. package/lib/archive/filesystem/utils.d.ts +109 -0
  47. package/lib/archive/filesystem/utils.js +185 -0
  48. package/lib/archive/filesystem/zip.d.ts +29 -0
  49. package/lib/archive/filesystem/zip.js +53 -0
  50. package/lib/archive/index.d.ts +6 -0
  51. package/lib/archive/index.js +11 -0
  52. package/lib/archive/page.d.ts +263 -0
  53. package/lib/archive/page.js +316 -0
  54. package/lib/archive/resource.d.ts +46 -0
  55. package/lib/archive/resource.js +62 -0
  56. package/lib/archive/safe-path.d.ts +9 -0
  57. package/lib/archive/safe-path.js +17 -0
  58. package/lib/archive/types.d.ts +210 -0
  59. package/lib/archive/types.js +1 -0
  60. package/lib/crawler/clear-destination-cache.d.ts +5 -0
  61. package/lib/crawler/clear-destination-cache.js +8 -0
  62. package/lib/crawler/crawler.d.ts +73 -0
  63. package/lib/crawler/crawler.js +748 -0
  64. package/lib/crawler/decompose-url.d.ts +25 -0
  65. package/lib/crawler/decompose-url.js +71 -0
  66. package/lib/crawler/destination-cache.d.ts +7 -0
  67. package/lib/crawler/destination-cache.js +6 -0
  68. package/lib/crawler/detect-pagination-pattern.d.ts +16 -0
  69. package/lib/crawler/detect-pagination-pattern.js +61 -0
  70. package/lib/crawler/fetch-destination.d.ts +38 -0
  71. package/lib/crawler/fetch-destination.js +208 -0
  72. package/lib/crawler/fetch-robots-txt.d.ts +42 -0
  73. package/lib/crawler/fetch-robots-txt.js +44 -0
  74. package/lib/crawler/find-best-matching-scope.d.ts +12 -0
  75. package/lib/crawler/find-best-matching-scope.js +46 -0
  76. package/lib/crawler/generate-predicted-urls.d.ts +13 -0
  77. package/lib/crawler/generate-predicted-urls.js +27 -0
  78. package/lib/crawler/handle-ignore-and-skip.d.ts +16 -0
  79. package/lib/crawler/handle-ignore-and-skip.js +19 -0
  80. package/lib/crawler/handle-resource-response.d.ts +13 -0
  81. package/lib/crawler/handle-resource-response.js +16 -0
  82. package/lib/crawler/handle-scrape-end.d.ts +24 -0
  83. package/lib/crawler/handle-scrape-end.js +82 -0
  84. package/lib/crawler/handle-scrape-error.d.ts +37 -0
  85. package/lib/crawler/handle-scrape-error.js +38 -0
  86. package/lib/crawler/index.d.ts +2 -0
  87. package/lib/crawler/index.js +2 -0
  88. package/lib/crawler/inject-scope-auth.d.ts +11 -0
  89. package/lib/crawler/inject-scope-auth.js +21 -0
  90. package/lib/crawler/is-external-url.d.ts +11 -0
  91. package/lib/crawler/is-external-url.js +12 -0
  92. package/lib/crawler/is-in-any-lower-layer.d.ts +13 -0
  93. package/lib/crawler/is-in-any-lower-layer.js +15 -0
  94. package/lib/crawler/link-list.d.ts +112 -0
  95. package/lib/crawler/link-list.js +248 -0
  96. package/lib/crawler/link-to-page-data.d.ts +14 -0
  97. package/lib/crawler/link-to-page-data.js +32 -0
  98. package/lib/crawler/net-timeout-error.d.ts +9 -0
  99. package/lib/crawler/net-timeout-error.js +11 -0
  100. package/lib/crawler/network.d.ts +30 -0
  101. package/lib/crawler/network.js +226 -0
  102. package/lib/crawler/protocol-agnostic-key.d.ts +9 -0
  103. package/lib/crawler/protocol-agnostic-key.js +11 -0
  104. package/lib/crawler/reconstruct-url.d.ts +10 -0
  105. package/lib/crawler/reconstruct-url.js +28 -0
  106. package/lib/crawler/result-handler.d.ts +118 -0
  107. package/lib/crawler/result-handler.js +153 -0
  108. package/lib/crawler/robots-checker.d.ts +26 -0
  109. package/lib/crawler/robots-checker.js +62 -0
  110. package/lib/crawler/should-discard-predicted.d.ts +14 -0
  111. package/lib/crawler/should-discard-predicted.js +31 -0
  112. package/lib/crawler/should-skip-url.d.ts +23 -0
  113. package/lib/crawler/should-skip-url.js +15 -0
  114. package/lib/crawler/speculative-pagination.d.ts +52 -0
  115. package/lib/crawler/speculative-pagination.js +215 -0
  116. package/lib/crawler/types.d.ts +119 -0
  117. package/lib/crawler/types.js +1 -0
  118. package/lib/crawler/url-filter.d.ts +56 -0
  119. package/lib/crawler/url-filter.js +110 -0
  120. package/lib/crawler-orchestrator.d.ts +142 -0
  121. package/lib/crawler-orchestrator.js +309 -0
  122. package/lib/debug.d.ts +8 -0
  123. package/lib/debug.js +9 -0
  124. package/lib/index.d.ts +16 -0
  125. package/lib/index.js +18 -0
  126. package/lib/qzilla.d.ts +136 -0
  127. package/lib/qzilla.js +292 -0
  128. package/lib/types.d.ts +27 -0
  129. package/lib/types.js +1 -0
  130. package/lib/utils/array/each-splitted.d.ts +10 -0
  131. package/lib/utils/array/each-splitted.js +14 -0
  132. package/lib/utils/array/index.d.ts +1 -0
  133. package/lib/utils/array/index.js +1 -0
  134. package/lib/utils/async/index.d.ts +1 -0
  135. package/lib/utils/async/index.js +1 -0
  136. package/lib/utils/debug.d.ts +5 -0
  137. package/lib/utils/debug.js +5 -0
  138. package/lib/utils/error/dom-evaluation-error.d.ts +7 -0
  139. package/lib/utils/error/dom-evaluation-error.js +7 -0
  140. package/lib/utils/error/error-emitter.d.ts +18 -0
  141. package/lib/utils/error/error-emitter.js +29 -0
  142. package/lib/utils/error/index.d.ts +3 -0
  143. package/lib/utils/error/index.js +2 -0
  144. package/lib/utils/event-emitter/index.d.ts +6 -0
  145. package/lib/utils/event-emitter/index.js +6 -0
  146. package/lib/utils/index.d.ts +5 -0
  147. package/lib/utils/index.js +5 -0
  148. package/lib/utils/network/index.d.ts +1 -0
  149. package/lib/utils/network/index.js +1 -0
  150. package/lib/utils/object/clean-object.d.ts +8 -0
  151. package/lib/utils/object/clean-object.js +13 -0
  152. package/lib/utils/object/index.d.ts +1 -0
  153. package/lib/utils/object/index.js +1 -0
  154. package/lib/utils/path/index.d.ts +1 -0
  155. package/lib/utils/path/index.js +1 -0
  156. package/lib/utils/path/safe-filepath.d.ts +7 -0
  157. package/lib/utils/path/safe-filepath.js +12 -0
  158. package/lib/utils/regexp/index.d.ts +1 -0
  159. package/lib/utils/regexp/index.js +1 -0
  160. package/lib/utils/retryable/index.d.ts +2 -0
  161. package/lib/utils/retryable/index.js +1 -0
  162. package/lib/utils/sort/index.d.ts +14 -0
  163. package/lib/utils/sort/index.js +61 -0
  164. package/lib/utils/sort/remove-matches.d.ts +9 -0
  165. package/lib/utils/sort/remove-matches.js +23 -0
  166. package/lib/utils/types/index.d.ts +1 -0
  167. package/lib/utils/types/index.js +1 -0
  168. package/lib/utils/types/types.d.ts +46 -0
  169. package/lib/utils/types/types.js +1 -0
  170. package/lib/utils/url/index.d.ts +5 -0
  171. package/lib/utils/url/index.js +5 -0
  172. package/lib/utils/url/is-lower-layer.d.ts +15 -0
  173. package/lib/utils/url/is-lower-layer.js +55 -0
  174. package/lib/utils/url/parse-url.d.ts +11 -0
  175. package/lib/utils/url/parse-url.js +20 -0
  176. package/lib/utils/url/path-match.d.ts +11 -0
  177. package/lib/utils/url/path-match.js +18 -0
  178. package/lib/utils/url/sort-url.d.ts +10 -0
  179. package/lib/utils/url/sort-url.js +24 -0
  180. package/lib/utils/url/url-partial-match.d.ts +11 -0
  181. package/lib/utils/url/url-partial-match.js +32 -0
  182. package/package.json +49 -0
  183. package/src/archive/__mock__/.gitignore +3 -0
  184. package/src/archive/__mock__/mock.sqlite +0 -0
  185. package/src/archive/archive-accessor.ts +337 -0
  186. package/src/archive/archive.ts +408 -0
  187. package/src/archive/database.spec.ts +469 -0
  188. package/src/archive/database.ts +1059 -0
  189. package/src/archive/debug.ts +10 -0
  190. package/src/archive/filesystem/append-text.spec.ts +26 -0
  191. package/src/archive/filesystem/append-text.ts +16 -0
  192. package/src/archive/filesystem/copy-dir-sync.spec.ts +27 -0
  193. package/src/archive/filesystem/copy-dir-sync.ts +10 -0
  194. package/src/archive/filesystem/copy-dir.spec.ts +33 -0
  195. package/src/archive/filesystem/copy-dir.ts +14 -0
  196. package/src/archive/filesystem/exists.spec.ts +33 -0
  197. package/src/archive/filesystem/exists.ts +10 -0
  198. package/src/archive/filesystem/get-file-list.spec.ts +37 -0
  199. package/src/archive/filesystem/get-file-list.ts +13 -0
  200. package/src/archive/filesystem/index.ts +17 -0
  201. package/src/archive/filesystem/is-dir.spec.ts +29 -0
  202. package/src/archive/filesystem/is-dir.ts +11 -0
  203. package/src/archive/filesystem/mkdir.spec.ts +37 -0
  204. package/src/archive/filesystem/mkdir.ts +16 -0
  205. package/src/archive/filesystem/output-json.spec.ts +34 -0
  206. package/src/archive/filesystem/output-json.ts +16 -0
  207. package/src/archive/filesystem/output-text.spec.ts +31 -0
  208. package/src/archive/filesystem/output-text.ts +35 -0
  209. package/src/archive/filesystem/read-json.spec.ts +26 -0
  210. package/src/archive/filesystem/read-json.ts +12 -0
  211. package/src/archive/filesystem/read-text.spec.ts +25 -0
  212. package/src/archive/filesystem/read-text.ts +11 -0
  213. package/src/archive/filesystem/readline.spec.ts +29 -0
  214. package/src/archive/filesystem/readline.ts +30 -0
  215. package/src/archive/filesystem/remove.spec.ts +34 -0
  216. package/src/archive/filesystem/remove.ts +11 -0
  217. package/src/archive/filesystem/rename.spec.ts +46 -0
  218. package/src/archive/filesystem/rename.ts +21 -0
  219. package/src/archive/filesystem/tar.spec.ts +33 -0
  220. package/src/archive/filesystem/tar.ts +27 -0
  221. package/src/archive/filesystem/untar.spec.ts +34 -0
  222. package/src/archive/filesystem/untar.ts +36 -0
  223. package/src/archive/index.ts +13 -0
  224. package/src/archive/page.spec.ts +368 -0
  225. package/src/archive/page.ts +420 -0
  226. package/src/archive/resource.spec.ts +101 -0
  227. package/src/archive/resource.ts +73 -0
  228. package/src/archive/safe-path.spec.ts +44 -0
  229. package/src/archive/safe-path.ts +18 -0
  230. package/src/archive/types.ts +227 -0
  231. package/src/crawler/clear-destination-cache.spec.ts +20 -0
  232. package/src/crawler/clear-destination-cache.ts +9 -0
  233. package/src/crawler/crawler.ts +873 -0
  234. package/src/crawler/decompose-url.spec.ts +48 -0
  235. package/src/crawler/decompose-url.ts +90 -0
  236. package/src/crawler/destination-cache.spec.ts +23 -0
  237. package/src/crawler/destination-cache.ts +8 -0
  238. package/src/crawler/detect-pagination-pattern.spec.ts +169 -0
  239. package/src/crawler/detect-pagination-pattern.ts +66 -0
  240. package/src/crawler/fetch-destination.ts +257 -0
  241. package/src/crawler/fetch-robots-txt.spec.ts +83 -0
  242. package/src/crawler/fetch-robots-txt.ts +91 -0
  243. package/src/crawler/find-best-matching-scope.spec.ts +39 -0
  244. package/src/crawler/find-best-matching-scope.ts +57 -0
  245. package/src/crawler/generate-predicted-urls.spec.ts +42 -0
  246. package/src/crawler/generate-predicted-urls.ts +34 -0
  247. package/src/crawler/handle-ignore-and-skip.spec.ts +66 -0
  248. package/src/crawler/handle-ignore-and-skip.ts +30 -0
  249. package/src/crawler/handle-resource-response.spec.ts +45 -0
  250. package/src/crawler/handle-resource-response.ts +21 -0
  251. package/src/crawler/handle-scrape-end.spec.ts +109 -0
  252. package/src/crawler/handle-scrape-end.ts +115 -0
  253. package/src/crawler/handle-scrape-error.spec.ts +105 -0
  254. package/src/crawler/handle-scrape-error.ts +58 -0
  255. package/src/crawler/index.ts +2 -0
  256. package/src/crawler/inject-scope-auth.spec.ts +36 -0
  257. package/src/crawler/inject-scope-auth.ts +27 -0
  258. package/src/crawler/is-external-url.spec.ts +31 -0
  259. package/src/crawler/is-external-url.ts +17 -0
  260. package/src/crawler/is-in-any-lower-layer.spec.ts +31 -0
  261. package/src/crawler/is-in-any-lower-layer.ts +22 -0
  262. package/src/crawler/link-list.spec.ts +355 -0
  263. package/src/crawler/link-list.ts +275 -0
  264. package/src/crawler/link-to-page-data.spec.ts +133 -0
  265. package/src/crawler/link-to-page-data.ts +34 -0
  266. package/src/crawler/net-timeout-error.spec.ts +25 -0
  267. package/src/crawler/net-timeout-error.ts +11 -0
  268. package/src/crawler/protocol-agnostic-key.spec.ts +40 -0
  269. package/src/crawler/protocol-agnostic-key.ts +11 -0
  270. package/src/crawler/reconstruct-url.spec.ts +37 -0
  271. package/src/crawler/reconstruct-url.ts +37 -0
  272. package/src/crawler/robots-checker.spec.ts +104 -0
  273. package/src/crawler/robots-checker.ts +73 -0
  274. package/src/crawler/should-discard-predicted.spec.ts +125 -0
  275. package/src/crawler/should-discard-predicted.ts +33 -0
  276. package/src/crawler/should-skip-url.spec.ts +77 -0
  277. package/src/crawler/should-skip-url.ts +37 -0
  278. package/src/crawler/types.ts +146 -0
  279. package/src/crawler-orchestrator.ts +401 -0
  280. package/src/debug.ts +10 -0
  281. package/src/index.ts +25 -0
  282. package/src/types.ts +30 -0
  283. package/src/utils/array/each-splitted.spec.ts +38 -0
  284. package/src/utils/array/each-splitted.ts +19 -0
  285. package/src/utils/array/index.ts +1 -0
  286. package/src/utils/debug.ts +6 -0
  287. package/src/utils/error/dom-evaluation-error.spec.ts +20 -0
  288. package/src/utils/error/dom-evaluation-error.ts +6 -0
  289. package/src/utils/error/error-emitter.spec.ts +78 -0
  290. package/src/utils/error/error-emitter.ts +44 -0
  291. package/src/utils/error/index.ts +3 -0
  292. package/src/utils/index.ts +5 -0
  293. package/src/utils/object/clean-object.spec.ts +24 -0
  294. package/src/utils/object/clean-object.ts +13 -0
  295. package/src/utils/object/index.ts +1 -0
  296. package/src/utils/types/index.ts +1 -0
  297. package/src/utils/types/types.ts +65 -0
  298. package/tsconfig.json +11 -0
  299. package/tsconfig.tsbuildinfo +1 -0
@@ -0,0 +1,118 @@
1
import type LinkList from './link-list.js';
import type { Link, PageData, Resource } from '../utils/index.js';
import type { ExURL, ParseURLOptions } from '@d-zero/shared/parse-url';
/**
 * Configuration options that control crawler behavior.
 *
 * Used by the result handler functions to determine how to process
 * scrape results, which URLs to follow, and how to handle external links.
 * @see {@link ./crawler.ts | Crawler} for the main consumer of this type
 * @see {@link ../crawler-orchestrator.ts | CrawlerOrchestrator} for factory methods that build these options
 */
export type CrawlerOptions = {
    /** Delay in milliseconds between page requests. */
    interval: number;
    /** Maximum number of concurrent scraping processes. 0 uses the default. */
    parallels: number;
    /** Whether to recursively follow discovered links within the scope. */
    recursive: boolean;
    /** Whether the crawl was started from a pre-defined URL list. */
    fromList: boolean;
    /** Whether to capture image resources during scraping. */
    isGettingImages: boolean;
    /** Path to the Chromium/Chrome executable, or `null` for the bundled version. */
    executablePath: string | null;
    /** Whether to fetch and scrape external (out-of-scope) pages. */
    fetchExternal: boolean;
    /** List of scope URL strings that define the crawl boundary. */
    scope: string[];
    /** Glob patterns for URLs to exclude from crawling. */
    excludes: string[];
    /** Keywords that trigger page exclusion when found in content. */
    excludeKeywords: string[];
    /** URL prefixes to exclude from crawling (merged defaults + user additions). */
    excludeUrls: readonly string[];
    /** Maximum directory depth for crawling avoidance heuristics. */
    depthOnAvoid: number;
    /** Maximum number of retry attempts per URL on scrape failure. */
    retry: number;
    /** Whether to enable verbose logging. */
    verbose: boolean;
} & Required<Pick<ParseURLOptions, 'disableQueries'>>;
/**
 * Process the result of a successful page scrape.
 *
 * Extracts anchors from the page (unless in title-only mode), enqueues
 * newly discovered URLs via the `addUrl` callback, and marks the URL
 * as done in the link list.
 * @param result - The scraped page data.
 * @param linkList - The link list managing the crawl queue.
 * @param scope - Map of hostnames to their scope URLs.
 * @param options - Crawler configuration options.
 * @param addUrl - Callback to enqueue a newly discovered URL. Accepts optional
 * `{ titleOnly: true }` to request metadata-only scraping.
 * @returns An object containing the constructed link and whether the page is external.
 */
export declare function handleScrapeEnd(result: PageData, linkList: LinkList, scope: ReadonlyMap<string, readonly ExURL[]>, options: CrawlerOptions, addUrl: (url: ExURL, opts?: {
    titleOnly?: true;
}) => void): {
    link: Link | null;
    isExternal: boolean;
};
/**
 * Handle a URL that was ignored or skipped during scraping.
 *
 * Marks the URL as done in the link list without any page data,
 * effectively recording that it was encountered but not scraped.
 * @param url - The URL that was skipped.
 * @param linkList - The link list managing the crawl queue.
 * @param scope - Map of hostnames to their scope URLs.
 * @param options - Crawler configuration options.
 * @returns The constructed {@link Link} object, or `null` if the URL was not in the queue.
 */
export declare function handleIgnoreAndSkip(url: ExURL, linkList: LinkList, scope: ReadonlyMap<string, readonly ExURL[]>, options: CrawlerOptions): Link | null;
/**
 * Track a network resource response and determine if it is newly discovered.
 *
 * Checks whether the resource URL has already been seen. If it is new,
 * adds it to the known resources set.
 * @param resource - The captured network resource data.
 * @param resources - The set of already-known resource URLs (without hash).
 * @returns An object with `isNew` indicating whether this resource was seen for the first time.
 */
export declare function handleResourceResponse(resource: Resource, resources: Set<string>): {
    isNew: boolean;
};
/**
 * Handle an error that occurred during page scraping.
 *
 * Marks the URL as done and creates a fallback {@link PageData} from the
 * link, regardless of whether the error caused a shutdown. This ensures
 * that errored URLs are recorded in the DB (`status = -1, scraped = 1`)
 * and not re-queued on resume.
 * @param payload - The error payload from the scraper.
 * @param payload.url - The URL being scraped when the error occurred, or `null`.
 * @param payload.error - The error details including name, message, and optional stack.
 * @param payload.error.name
 * @param payload.error.message
 * @param payload.error.stack
 * @param payload.shutdown - Whether the error caused the scraper process to shut down.
 * @param payload.pid - The process ID of the scraper, or `undefined`.
 * @param linkList - The link list managing the crawl queue.
 * @param scope - Map of hostnames to their scope URLs.
 * @param options - Crawler configuration options.
 * @returns An object with the link and an optional fallback PageData result.
 */
export declare function handleScrapeError(payload: {
    url: ExURL | null;
    error: {
        name: string;
        message: string;
        stack?: string;
    };
    shutdown: boolean;
    pid: number | undefined;
}, linkList: LinkList, scope: ReadonlyMap<string, readonly ExURL[]>, options: CrawlerOptions): {
    link: Link | null;
    result?: PageData;
};
@@ -0,0 +1,153 @@
1
+ import { crawlerErrorLog, crawlerLog } from '../debug.js';
2
+ import { linkToPageData } from './link-to-page-data.js';
3
+ import { injectScopeAuth } from './inject-scope-auth.js';
4
+ import { isExternalUrl } from './is-external-url.js';
5
+ import { isInAnyLowerLayer } from './is-in-any-lower-layer.js';
6
/**
 * Process the result of a successful page scrape.
 *
 * Follows the page's anchors (unless the page was queued title-only),
 * marks the URL as completed in the link list, and logs a summary of
 * the scrape outcome.
 * @param result - The scraped page data.
 * @param linkList - The link list managing the crawl queue.
 * @param scope - Map of hostnames to their scope URLs.
 * @param options - Crawler configuration options.
 * @param addUrl - Callback to enqueue a newly discovered URL. Accepts optional
 * `{ titleOnly: true }` to request metadata-only scraping.
 * @returns An object containing the constructed link and whether the page is external.
 */
export function handleScrapeEnd(result, linkList, scope, options, addUrl) {
    // Title-only pages were enqueued for metadata capture only; their
    // anchors must not be followed.
    if (!linkList.isTitleOnly(result.url.withoutHash)) {
        processAnchors(result.anchorList, scope, options, addUrl);
    }
    const doneLink = linkList.done(result.url, scope, { page: result }, options);
    crawlerLog('Scrape end URL: %s', result.url.href);
    crawlerLog('Scrape end Status: %d', result.status);
    crawlerLog('Scrape end Type: %s', result.contentType);
    if (!result.isExternal) {
        crawlerLog('Scrape end Anchors: %d URLs', result.anchorList.length);
    }
    return { link: doneLink, isExternal: result.isExternal };
}
36
/**
 * Handle a URL that was ignored or skipped during scraping.
 *
 * Records the URL as done in the link list without attaching any page
 * data, so it is remembered as encountered-but-not-scraped.
 * @param url - The URL that was skipped.
 * @param linkList - The link list managing the crawl queue.
 * @param scope - Map of hostnames to their scope URLs.
 * @param options - Crawler configuration options.
 * @returns The constructed {@link Link} object, or `null` if the URL was not in the queue.
 */
export function handleIgnoreAndSkip(url, linkList, scope, options) {
    const link = linkList.done(url, scope, {}, options);
    // Only log when the URL was actually tracked in the queue.
    if (!link) {
        return link;
    }
    crawlerLog('Skipped URL: %s', url.href);
    return link;
}
54
/**
 * Track a network resource response and determine if it is newly discovered.
 *
 * Uses the hash-stripped URL as the identity key: the first sighting is
 * recorded into the set, later sightings are reported as already known.
 * @param resource - The captured network resource data.
 * @param resources - The set of already-known resource URLs (without hash).
 * @returns An object with `isNew` indicating whether this resource was seen for the first time.
 */
export function handleResourceResponse(resource, resources) {
    const key = resource.url.withoutHash;
    if (resources.has(key)) {
        return { isNew: false };
    }
    resources.add(key);
    return { isNew: true };
}
70
/**
 * Handle an error that occurred during page scraping.
 *
 * Marks the URL as done and creates a fallback {@link PageData} from the
 * link, regardless of whether the error caused a shutdown. This ensures
 * that errored URLs are recorded in the DB (`status = -1, scraped = 1`)
 * and not re-queued on resume.
 * @param payload - The error payload from the scraper.
 * @param payload.url - The URL being scraped when the error occurred, or `null`.
 * @param payload.error - The error details including name, message, and optional stack.
 * @param payload.shutdown - Whether the error caused the scraper process to shut down.
 * @param payload.pid - The process ID of the scraper, or `undefined`.
 * @param linkList - The link list managing the crawl queue.
 * @param scope - Map of hostnames to their scope URLs.
 * @param options - Crawler configuration options.
 * @returns An object with the link and an optional fallback PageData result.
 */
export function handleScrapeError(payload, linkList, scope, options) {
    const { url, error, shutdown, pid } = payload;
    let link = null;
    let result;
    // The URL may be null when the scraper died before a target was assigned.
    const doneLink = url ? linkList.done(url, scope, { error }, options) : null;
    if (doneLink) {
        link = doneLink;
        // Synthesize a PageData so the error is persisted and not retried on resume.
        result = linkToPageData(doneLink);
    }
    crawlerErrorLog('From %d(%s)', pid, url?.href ?? 'UNKNOWN_URL');
    crawlerErrorLog('Then shutdown?: %s', shutdown ? 'Yes' : 'No');
    crawlerErrorLog('%O', error);
    return { link, result };
}
106
/**
 * Process anchor elements extracted from a scraped page and enqueue new URLs.
 *
 * For each anchor:
 * 1. Determines if it is external (outside the crawl scope)
 * 2. Injects authentication credentials from matching scope URLs
 * 3. Reconstructs the `withoutHash` URL with injected auth
 * 4. In recursive mode: enqueues internal lower-layer URLs for full scraping,
 *    and external URLs for title-only scraping (if `fetchExternal` is enabled)
 * 5. In non-recursive mode: enqueues all URLs for title-only scraping
 * @param anchors - The list of anchor data extracted from the page.
 * @param scope - Map of hostnames to their scope URLs.
 * @param options - Crawler configuration options.
 * @param addUrl - Callback to enqueue a newly discovered URL. Accepts optional
 * `{ titleOnly: true }` to request metadata-only scraping.
 */
function processAnchors(anchors, scope, options, addUrl) {
    for (const anchor of anchors) {
        const isExternal = isExternalUrl(anchor.href, scope);
        // Mutates the anchor in place so downstream consumers see the flag.
        anchor.isExternal = isExternal;
        // Only internal URLs missing a full user:pass pair get scope
        // credentials injected; external hosts never receive them.
        if (!isExternal && (!anchor.href.username || !anchor.href.password)) {
            injectScopeAuth(anchor.href, scope);
            // Rebuild `withoutHash` so it carries the injected credentials.
            const auth = anchor.href.username && anchor.href.password
                ? `${anchor.href.username}:${anchor.href.password}@`
                : '';
            const host = anchor.href.hostname + (anchor.href.port ? `:${anchor.href.port}` : '');
            const newSearch = anchor.href.query ? `?${anchor.href.query}` : '';
            // NOTE(review): when there is no dirname but a query is present,
            // the rebuilt URL becomes `proto//host/?query` (slash inserted
            // before the query). Presumably this matches parse-url's canonical
            // `withoutHash` form — confirm against ExURL's own serialization.
            const body = anchor.href.dirname
                ? `${anchor.href.paths.join('/')}${newSearch}`
                : newSearch
                    ? `${newSearch}`
                    : '';
            const withoutHash = `${anchor.href.protocol}//${auth}${host}${body ? `/${body}` : ''}`;
            anchor.href.withoutHash = withoutHash;
        }
        if (options.recursive) {
            // Recursive mode: fully scrape in-scope lower-layer URLs…
            const scopes = scope.get(anchor.href.hostname);
            if (scopes && isInAnyLowerLayer(anchor.href, scopes, options)) {
                addUrl(anchor.href);
            }
            // …and optionally capture external pages title-only.
            else if (isExternal && options.fetchExternal) {
                addUrl(anchor.href, { titleOnly: true });
            }
            continue;
        }
        // Non-recursive mode: every discovered URL is metadata-only.
        addUrl(anchor.href, { titleOnly: true });
    }
}
@@ -0,0 +1,26 @@
1
import type { ExURL } from '@d-zero/shared/parse-url';
/**
 * Checks whether a URL is allowed by the site's robots.txt rules.
 *
 * Caches robots.txt per origin so each origin is fetched at most once.
 * When disabled (i.e., `ignoreRobots` mode), all URLs are allowed.
 */
export declare class RobotsChecker {
    #private;
    /**
     * Create a new RobotsChecker.
     * @param userAgent - User-Agent string for rule matching and fetching robots.txt.
     * @param enabled - Whether robots.txt checking is enabled. When `false`, {@link isAllowed} always returns `true`.
     */
    constructor(userAgent: string, enabled: boolean);
    /**
     * Check whether the given URL is allowed by the site's robots.txt.
     *
     * Fetches and caches robots.txt per origin on first access.
     * Returns `true` if robots.txt checking is disabled, if no robots.txt
     * exists, or if the URL is explicitly allowed.
     * Non-HTTP(S) URLs are always allowed.
     * @param url - The URL to check.
     * @returns `true` if the URL is allowed, `false` if blocked.
     */
    isAllowed(url: ExURL): Promise<boolean>;
}
@@ -0,0 +1,62 @@
1
+ import { crawlerLog } from '../debug.js';
2
+ import { fetchRobotsTxt } from './fetch-robots-txt.js';
3
/**
 * Derives the origin string from an ExURL (e.g., `https://example.com:8080`).
 * @param url - The extended URL.
 * @returns The origin string.
 */
function getOrigin(url) {
    const portSuffix = url.port ? `:${url.port}` : '';
    return `${url.protocol}//${url.hostname}${portSuffix}`;
}
11
/**
 * Checks whether a URL is allowed by the site's robots.txt rules.
 *
 * Caches robots.txt per origin so each origin is fetched at most once.
 * When disabled (i.e., `ignoreRobots` mode), all URLs are allowed.
 */
export class RobotsChecker {
    /**
     * Cache of robots.txt fetches per origin, keyed by origin string.
     * The in-flight promise is cached (rather than the resolved value) so
     * that concurrent `isAllowed` calls for the same origin share a single
     * fetch — the previous resolve-then-cache flow let parallel calls each
     * fire their own request before the first one populated the cache.
     * The promise resolves to `null` when no robots.txt exists or the
     * fetch failed.
     */
    #cache = new Map();
    /** When `false`, robots.txt checking is disabled and all URLs are allowed. */
    #enabled;
    /** User-Agent string used for robots.txt rule matching and HTTP requests. */
    #userAgent;
    /**
     * Create a new RobotsChecker.
     * @param userAgent - User-Agent string for rule matching and fetching robots.txt.
     * @param enabled - Whether robots.txt checking is enabled. When `false`, {@link isAllowed} always returns `true`.
     */
    constructor(userAgent, enabled) {
        this.#userAgent = userAgent;
        this.#enabled = enabled;
    }
    /**
     * Check whether the given URL is allowed by the site's robots.txt.
     *
     * Fetches and caches robots.txt per origin on first access.
     * Returns `true` if robots.txt checking is disabled, if no robots.txt
     * exists, or if the URL is explicitly allowed.
     * @param url - The URL to check.
     * @returns `true` if the URL is allowed, `false` if blocked.
     */
    async isAllowed(url) {
        if (!this.#enabled) {
            return true;
        }
        // robots.txt only governs HTTP(S) resources.
        if (!url.isHTTP) {
            return true;
        }
        const origin = getOrigin(url);
        let pending = this.#cache.get(origin);
        if (!pending) {
            crawlerLog('Fetching robots.txt for %s', origin);
            pending = fetchRobotsTxt(origin, this.#userAgent);
            this.#cache.set(origin, pending);
            // fetchRobotsTxt is expected to resolve to `null` on failure; if it
            // ever rejects, evict the entry so a later call can retry instead
            // of caching the rejection forever.
            pending.catch(() => this.#cache.delete(origin));
        }
        const robot = await pending;
        if (!robot) {
            return true;
        }
        // The parser may return `undefined` for URLs not covered by any rule;
        // anything other than an explicit `false` counts as allowed.
        const allowed = robot.isAllowed(url.href, this.#userAgent);
        return allowed !== false;
    }
}
@@ -0,0 +1,14 @@
1
+ import type { ScrapeResult } from '@d-zero/beholder';
2
/**
 * Determines whether a predicted URL's scrape result should be discarded.
 *
 * Predicted URLs are pre-emptively pushed into the crawl queue before
 * knowing if they exist. This function filters out invalid results:
 * - `error` type → discard (server unreachable, timeout, etc.)
 * - `skipped` type → discard (matched exclusion rule)
 * - `success` with missing page data → discard (nothing to archive)
 * - `success` with HTTP error status (4xx/5xx) → discard
 * - `success` with 2xx/3xx → keep
 * Unknown result types are discarded as well.
 * @param result - The scrape result for the predicted URL
 * @returns `true` if the result should be discarded (not saved to archive)
 */
export declare function shouldDiscardPredicted(result: ScrapeResult): boolean;
@@ -0,0 +1,31 @@
1
+ import { isError } from '@d-zero/beholder';
2
/**
 * Determines whether a predicted URL's scrape result should be discarded.
 *
 * Predicted URLs are pre-emptively pushed into the crawl queue before
 * knowing if they exist. This function filters out invalid results:
 * - `error` type → discard (server unreachable, timeout, etc.)
 * - `skipped` type → discard (matched exclusion rule)
 * - `success` with HTTP error status (4xx/5xx) → discard
 * - `success` with 2xx/3xx → keep
 * @param result - The scrape result for the predicted URL
 * @returns `true` if the result should be discarded (not saved to archive)
 */
export function shouldDiscardPredicted(result) {
    // Anything other than a successful scrape ('error', 'skipped', or an
    // unrecognized future type) is discarded outright.
    if (result.type !== 'success') {
        return true;
    }
    const page = result.pageData;
    // A success without page data carries nothing worth archiving.
    if (!page) {
        return true;
    }
    // Keep the result only when the HTTP status is not an error (4xx/5xx).
    return isError(page.status);
}
@@ -0,0 +1,23 @@
1
+ import type { ExURL, ParseURLOptions } from '@d-zero/shared/parse-url';
2
/**
 * Parameters for {@link shouldSkipUrl}.
 */
export interface ShouldSkipUrlParams {
    /** The parsed URL to check. */
    readonly url: ExURL;
    /** Array of glob patterns for URLs to exclude. */
    readonly excludes: readonly string[];
    /** Array of URL prefixes to exclude (matched via `startsWith` on a protocol-agnostic key, so `http://` and `https://` are treated the same). */
    readonly excludeUrls: readonly string[];
    /** URL parsing options used for pattern matching. */
    readonly options: ParseURLOptions;
}
15
/**
 * Determine whether a URL should be skipped during crawling.
 *
 * A URL is skipped if it matches any user-defined exclude glob pattern
 * or starts with any of the excluded URL prefixes. Prefix matching is
 * protocol-agnostic (`http://` and `https://` compare equal).
 * @param params - Parameters containing the URL, exclude patterns, and options.
 * @returns `true` if the URL should be skipped.
 */
export declare function shouldSkipUrl(params: ShouldSkipUrlParams): boolean;
@@ -0,0 +1,15 @@
1
+ import { pathMatch } from '@d-zero/shared/path-match';
2
+ import { protocolAgnosticKey } from './protocol-agnostic-key.js';
3
+ /**
4
+ * Determine whether a URL should be skipped during crawling.
5
+ *
6
+ * A URL is skipped if it matches any user-defined exclude glob pattern
7
+ * or starts with any of the excluded URL prefixes.
8
+ * @param params - Parameters containing the URL, exclude patterns, and options.
9
+ * @returns `true` if the URL should be skipped.
10
+ */
11
+ export function shouldSkipUrl(params) {
12
+ const { url, excludes, excludeUrls, options } = params;
13
+ return (excludes.some((excludeGlobPattern) => pathMatch(url, excludeGlobPattern, options)) ||
14
+ excludeUrls.some((prefix) => protocolAgnosticKey(url.href).startsWith(protocolAgnosticKey(prefix))));
15
+ }
@@ -0,0 +1,52 @@
1
+ import type { ScrapeResult } from '@nitpicker/beholder';
2
/**
 * Describes a detected pagination pattern between two consecutive URLs.
 */
export interface PaginationPattern {
    /** Index within the combined token array (path segments + query values) where the numeric difference was found. */
    tokenIndex: number;
    /** The numeric increment between the two URLs (always > 0). */
    step: number;
    /** The number found at `tokenIndex` in the "current" (more recent) URL. */
    currentNumber: number;
}
13
/**
 * Compares two consecutive URL strings and detects a single-token numeric
 * pagination pattern (e.g. `/page/1` → `/page/2`, or `?p=1` → `?p=2`).
 *
 * The algorithm decomposes each URL into tokens (path segments + sorted query values),
 * then checks that exactly one token differs and both values are integers with a
 * positive step. Returns `null` when no pattern is detected.
 *
 * WHY single-token constraint: Multi-token differences (e.g. both path and query
 * changing) indicate different routes rather than pagination, so they are rejected.
 * @param prevUrl - The previously pushed URL (protocol-agnostic, without hash/auth)
 * @param currentUrl - The newly discovered URL
 * @returns The detected pattern, or `null` if no pagination pattern was found
 */
export declare function detectPaginationPattern(prevUrl: string, currentUrl: string): PaginationPattern | null;
28
/**
 * Generates speculative URLs by extrapolating the detected pagination pattern.
 *
 * Starting from `currentUrl`, applies the pattern's step `count` times to produce
 * future page URLs (e.g. if step=1 and currentNumber=2, generates page 3, 4, ...).
 * These URLs are pushed into the crawl queue and discarded later if they 404
 * (see {@link shouldDiscardSpeculative}).
 * @param pattern - The detected pagination pattern from {@link detectPaginationPattern}
 * @param currentUrl - The URL to extrapolate from (protocol-agnostic, without hash/auth)
 * @param count - Number of speculative URLs to generate (typically equals concurrency)
 * @returns Array of speculative URL strings
 */
export declare function generateSpeculativeUrls(pattern: PaginationPattern, currentUrl: string, count: number): string[];
40
/**
 * Determines whether a speculative URL's scrape result should be discarded.
 *
 * Speculative URLs are pre-emptively pushed into the crawl queue before
 * knowing if they exist. This function filters out invalid results:
 * - `error` type → discard (server unreachable, timeout, etc.)
 * - `ignoreAndSkip` type → discard (matched exclusion rule)
 * - `scrapeEnd` with HTTP error status (4xx/5xx) → discard
 * - `scrapeEnd` with 2xx/3xx → keep
 *
 * NOTE(review): these result-type tags (`ignoreAndSkip`, `scrapeEnd`) differ
 * from the `skipped`/`success` tags documented for `shouldDiscardPredicted`,
 * and this file imports `ScrapeResult` from `@nitpicker/beholder` while the
 * predicted-URL module imports from `@d-zero/beholder` — confirm the tag
 * names match the installed beholder version.
 * @param result - The scrape result for the speculative URL
 * @returns `true` if the result should be discarded (not saved to archive)
 */
export declare function shouldDiscardSpeculative(result: ScrapeResult): boolean;