@nitpicker/crawler 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. package/CHANGELOG.md +8 -0
  2. package/LICENSE +191 -0
  3. package/README.md +13 -0
  4. package/lib/archive/archive-accessor.d.ts +107 -0
  5. package/lib/archive/archive-accessor.js +264 -0
  6. package/lib/archive/archive.d.ts +174 -0
  7. package/lib/archive/archive.js +331 -0
  8. package/lib/archive/database.d.ts +207 -0
  9. package/lib/archive/database.js +972 -0
  10. package/lib/archive/debug.d.ts +8 -0
  11. package/lib/archive/debug.js +9 -0
  12. package/lib/archive/filesystem/append-text.d.ts +9 -0
  13. package/lib/archive/filesystem/append-text.js +14 -0
  14. package/lib/archive/filesystem/copy-dir-sync.d.ts +6 -0
  15. package/lib/archive/filesystem/copy-dir-sync.js +9 -0
  16. package/lib/archive/filesystem/copy-dir.d.ts +7 -0
  17. package/lib/archive/filesystem/copy-dir.js +13 -0
  18. package/lib/archive/filesystem/exists.d.ts +6 -0
  19. package/lib/archive/filesystem/exists.js +9 -0
  20. package/lib/archive/filesystem/get-file-list.d.ts +8 -0
  21. package/lib/archive/filesystem/get-file-list.js +12 -0
  22. package/lib/archive/filesystem/index.d.ts +17 -0
  23. package/lib/archive/filesystem/index.js +17 -0
  24. package/lib/archive/filesystem/is-dir.d.ts +6 -0
  25. package/lib/archive/filesystem/is-dir.js +10 -0
  26. package/lib/archive/filesystem/mkdir.d.ts +8 -0
  27. package/lib/archive/filesystem/mkdir.js +15 -0
  28. package/lib/archive/filesystem/output-json.d.ts +9 -0
  29. package/lib/archive/filesystem/output-json.js +14 -0
  30. package/lib/archive/filesystem/output-text.d.ts +11 -0
  31. package/lib/archive/filesystem/output-text.js +32 -0
  32. package/lib/archive/filesystem/read-json.d.ts +7 -0
  33. package/lib/archive/filesystem/read-json.js +11 -0
  34. package/lib/archive/filesystem/read-text.d.ts +6 -0
  35. package/lib/archive/filesystem/read-text.js +10 -0
  36. package/lib/archive/filesystem/readline.d.ts +11 -0
  37. package/lib/archive/filesystem/readline.js +26 -0
  38. package/lib/archive/filesystem/remove.d.ts +5 -0
  39. package/lib/archive/filesystem/remove.js +10 -0
  40. package/lib/archive/filesystem/rename.d.ts +11 -0
  41. package/lib/archive/filesystem/rename.js +18 -0
  42. package/lib/archive/filesystem/tar.d.ts +11 -0
  43. package/lib/archive/filesystem/tar.js +22 -0
  44. package/lib/archive/filesystem/untar.d.ts +20 -0
  45. package/lib/archive/filesystem/untar.js +24 -0
  46. package/lib/archive/filesystem/utils.d.ts +109 -0
  47. package/lib/archive/filesystem/utils.js +185 -0
  48. package/lib/archive/filesystem/zip.d.ts +29 -0
  49. package/lib/archive/filesystem/zip.js +53 -0
  50. package/lib/archive/index.d.ts +6 -0
  51. package/lib/archive/index.js +11 -0
  52. package/lib/archive/page.d.ts +263 -0
  53. package/lib/archive/page.js +316 -0
  54. package/lib/archive/resource.d.ts +46 -0
  55. package/lib/archive/resource.js +62 -0
  56. package/lib/archive/safe-path.d.ts +9 -0
  57. package/lib/archive/safe-path.js +17 -0
  58. package/lib/archive/types.d.ts +210 -0
  59. package/lib/archive/types.js +1 -0
  60. package/lib/crawler/clear-destination-cache.d.ts +5 -0
  61. package/lib/crawler/clear-destination-cache.js +8 -0
  62. package/lib/crawler/crawler.d.ts +73 -0
  63. package/lib/crawler/crawler.js +748 -0
  64. package/lib/crawler/decompose-url.d.ts +25 -0
  65. package/lib/crawler/decompose-url.js +71 -0
  66. package/lib/crawler/destination-cache.d.ts +7 -0
  67. package/lib/crawler/destination-cache.js +6 -0
  68. package/lib/crawler/detect-pagination-pattern.d.ts +16 -0
  69. package/lib/crawler/detect-pagination-pattern.js +61 -0
  70. package/lib/crawler/fetch-destination.d.ts +38 -0
  71. package/lib/crawler/fetch-destination.js +208 -0
  72. package/lib/crawler/fetch-robots-txt.d.ts +42 -0
  73. package/lib/crawler/fetch-robots-txt.js +44 -0
  74. package/lib/crawler/find-best-matching-scope.d.ts +12 -0
  75. package/lib/crawler/find-best-matching-scope.js +46 -0
  76. package/lib/crawler/generate-predicted-urls.d.ts +13 -0
  77. package/lib/crawler/generate-predicted-urls.js +27 -0
  78. package/lib/crawler/handle-ignore-and-skip.d.ts +16 -0
  79. package/lib/crawler/handle-ignore-and-skip.js +19 -0
  80. package/lib/crawler/handle-resource-response.d.ts +13 -0
  81. package/lib/crawler/handle-resource-response.js +16 -0
  82. package/lib/crawler/handle-scrape-end.d.ts +24 -0
  83. package/lib/crawler/handle-scrape-end.js +82 -0
  84. package/lib/crawler/handle-scrape-error.d.ts +37 -0
  85. package/lib/crawler/handle-scrape-error.js +38 -0
  86. package/lib/crawler/index.d.ts +2 -0
  87. package/lib/crawler/index.js +2 -0
  88. package/lib/crawler/inject-scope-auth.d.ts +11 -0
  89. package/lib/crawler/inject-scope-auth.js +21 -0
  90. package/lib/crawler/is-external-url.d.ts +11 -0
  91. package/lib/crawler/is-external-url.js +12 -0
  92. package/lib/crawler/is-in-any-lower-layer.d.ts +13 -0
  93. package/lib/crawler/is-in-any-lower-layer.js +15 -0
  94. package/lib/crawler/link-list.d.ts +112 -0
  95. package/lib/crawler/link-list.js +248 -0
  96. package/lib/crawler/link-to-page-data.d.ts +14 -0
  97. package/lib/crawler/link-to-page-data.js +32 -0
  98. package/lib/crawler/net-timeout-error.d.ts +9 -0
  99. package/lib/crawler/net-timeout-error.js +11 -0
  100. package/lib/crawler/network.d.ts +30 -0
  101. package/lib/crawler/network.js +226 -0
  102. package/lib/crawler/protocol-agnostic-key.d.ts +9 -0
  103. package/lib/crawler/protocol-agnostic-key.js +11 -0
  104. package/lib/crawler/reconstruct-url.d.ts +10 -0
  105. package/lib/crawler/reconstruct-url.js +28 -0
  106. package/lib/crawler/result-handler.d.ts +118 -0
  107. package/lib/crawler/result-handler.js +153 -0
  108. package/lib/crawler/robots-checker.d.ts +26 -0
  109. package/lib/crawler/robots-checker.js +62 -0
  110. package/lib/crawler/should-discard-predicted.d.ts +14 -0
  111. package/lib/crawler/should-discard-predicted.js +31 -0
  112. package/lib/crawler/should-skip-url.d.ts +23 -0
  113. package/lib/crawler/should-skip-url.js +15 -0
  114. package/lib/crawler/speculative-pagination.d.ts +52 -0
  115. package/lib/crawler/speculative-pagination.js +215 -0
  116. package/lib/crawler/types.d.ts +119 -0
  117. package/lib/crawler/types.js +1 -0
  118. package/lib/crawler/url-filter.d.ts +56 -0
  119. package/lib/crawler/url-filter.js +110 -0
  120. package/lib/crawler-orchestrator.d.ts +142 -0
  121. package/lib/crawler-orchestrator.js +309 -0
  122. package/lib/debug.d.ts +8 -0
  123. package/lib/debug.js +9 -0
  124. package/lib/index.d.ts +16 -0
  125. package/lib/index.js +18 -0
  126. package/lib/qzilla.d.ts +136 -0
  127. package/lib/qzilla.js +292 -0
  128. package/lib/types.d.ts +27 -0
  129. package/lib/types.js +1 -0
  130. package/lib/utils/array/each-splitted.d.ts +10 -0
  131. package/lib/utils/array/each-splitted.js +14 -0
  132. package/lib/utils/array/index.d.ts +1 -0
  133. package/lib/utils/array/index.js +1 -0
  134. package/lib/utils/async/index.d.ts +1 -0
  135. package/lib/utils/async/index.js +1 -0
  136. package/lib/utils/debug.d.ts +5 -0
  137. package/lib/utils/debug.js +5 -0
  138. package/lib/utils/error/dom-evaluation-error.d.ts +7 -0
  139. package/lib/utils/error/dom-evaluation-error.js +7 -0
  140. package/lib/utils/error/error-emitter.d.ts +18 -0
  141. package/lib/utils/error/error-emitter.js +29 -0
  142. package/lib/utils/error/index.d.ts +3 -0
  143. package/lib/utils/error/index.js +2 -0
  144. package/lib/utils/event-emitter/index.d.ts +6 -0
  145. package/lib/utils/event-emitter/index.js +6 -0
  146. package/lib/utils/index.d.ts +5 -0
  147. package/lib/utils/index.js +5 -0
  148. package/lib/utils/network/index.d.ts +1 -0
  149. package/lib/utils/network/index.js +1 -0
  150. package/lib/utils/object/clean-object.d.ts +8 -0
  151. package/lib/utils/object/clean-object.js +13 -0
  152. package/lib/utils/object/index.d.ts +1 -0
  153. package/lib/utils/object/index.js +1 -0
  154. package/lib/utils/path/index.d.ts +1 -0
  155. package/lib/utils/path/index.js +1 -0
  156. package/lib/utils/path/safe-filepath.d.ts +7 -0
  157. package/lib/utils/path/safe-filepath.js +12 -0
  158. package/lib/utils/regexp/index.d.ts +1 -0
  159. package/lib/utils/regexp/index.js +1 -0
  160. package/lib/utils/retryable/index.d.ts +2 -0
  161. package/lib/utils/retryable/index.js +1 -0
  162. package/lib/utils/sort/index.d.ts +14 -0
  163. package/lib/utils/sort/index.js +61 -0
  164. package/lib/utils/sort/remove-matches.d.ts +9 -0
  165. package/lib/utils/sort/remove-matches.js +23 -0
  166. package/lib/utils/types/index.d.ts +1 -0
  167. package/lib/utils/types/index.js +1 -0
  168. package/lib/utils/types/types.d.ts +46 -0
  169. package/lib/utils/types/types.js +1 -0
  170. package/lib/utils/url/index.d.ts +5 -0
  171. package/lib/utils/url/index.js +5 -0
  172. package/lib/utils/url/is-lower-layer.d.ts +15 -0
  173. package/lib/utils/url/is-lower-layer.js +55 -0
  174. package/lib/utils/url/parse-url.d.ts +11 -0
  175. package/lib/utils/url/parse-url.js +20 -0
  176. package/lib/utils/url/path-match.d.ts +11 -0
  177. package/lib/utils/url/path-match.js +18 -0
  178. package/lib/utils/url/sort-url.d.ts +10 -0
  179. package/lib/utils/url/sort-url.js +24 -0
  180. package/lib/utils/url/url-partial-match.d.ts +11 -0
  181. package/lib/utils/url/url-partial-match.js +32 -0
  182. package/package.json +49 -0
  183. package/src/archive/__mock__/.gitignore +3 -0
  184. package/src/archive/__mock__/mock.sqlite +0 -0
  185. package/src/archive/archive-accessor.ts +337 -0
  186. package/src/archive/archive.ts +408 -0
  187. package/src/archive/database.spec.ts +469 -0
  188. package/src/archive/database.ts +1059 -0
  189. package/src/archive/debug.ts +10 -0
  190. package/src/archive/filesystem/append-text.spec.ts +26 -0
  191. package/src/archive/filesystem/append-text.ts +16 -0
  192. package/src/archive/filesystem/copy-dir-sync.spec.ts +27 -0
  193. package/src/archive/filesystem/copy-dir-sync.ts +10 -0
  194. package/src/archive/filesystem/copy-dir.spec.ts +33 -0
  195. package/src/archive/filesystem/copy-dir.ts +14 -0
  196. package/src/archive/filesystem/exists.spec.ts +33 -0
  197. package/src/archive/filesystem/exists.ts +10 -0
  198. package/src/archive/filesystem/get-file-list.spec.ts +37 -0
  199. package/src/archive/filesystem/get-file-list.ts +13 -0
  200. package/src/archive/filesystem/index.ts +17 -0
  201. package/src/archive/filesystem/is-dir.spec.ts +29 -0
  202. package/src/archive/filesystem/is-dir.ts +11 -0
  203. package/src/archive/filesystem/mkdir.spec.ts +37 -0
  204. package/src/archive/filesystem/mkdir.ts +16 -0
  205. package/src/archive/filesystem/output-json.spec.ts +34 -0
  206. package/src/archive/filesystem/output-json.ts +16 -0
  207. package/src/archive/filesystem/output-text.spec.ts +31 -0
  208. package/src/archive/filesystem/output-text.ts +35 -0
  209. package/src/archive/filesystem/read-json.spec.ts +26 -0
  210. package/src/archive/filesystem/read-json.ts +12 -0
  211. package/src/archive/filesystem/read-text.spec.ts +25 -0
  212. package/src/archive/filesystem/read-text.ts +11 -0
  213. package/src/archive/filesystem/readline.spec.ts +29 -0
  214. package/src/archive/filesystem/readline.ts +30 -0
  215. package/src/archive/filesystem/remove.spec.ts +34 -0
  216. package/src/archive/filesystem/remove.ts +11 -0
  217. package/src/archive/filesystem/rename.spec.ts +46 -0
  218. package/src/archive/filesystem/rename.ts +21 -0
  219. package/src/archive/filesystem/tar.spec.ts +33 -0
  220. package/src/archive/filesystem/tar.ts +27 -0
  221. package/src/archive/filesystem/untar.spec.ts +34 -0
  222. package/src/archive/filesystem/untar.ts +36 -0
  223. package/src/archive/index.ts +13 -0
  224. package/src/archive/page.spec.ts +368 -0
  225. package/src/archive/page.ts +420 -0
  226. package/src/archive/resource.spec.ts +101 -0
  227. package/src/archive/resource.ts +73 -0
  228. package/src/archive/safe-path.spec.ts +44 -0
  229. package/src/archive/safe-path.ts +18 -0
  230. package/src/archive/types.ts +227 -0
  231. package/src/crawler/clear-destination-cache.spec.ts +20 -0
  232. package/src/crawler/clear-destination-cache.ts +9 -0
  233. package/src/crawler/crawler.ts +873 -0
  234. package/src/crawler/decompose-url.spec.ts +48 -0
  235. package/src/crawler/decompose-url.ts +90 -0
  236. package/src/crawler/destination-cache.spec.ts +23 -0
  237. package/src/crawler/destination-cache.ts +8 -0
  238. package/src/crawler/detect-pagination-pattern.spec.ts +169 -0
  239. package/src/crawler/detect-pagination-pattern.ts +66 -0
  240. package/src/crawler/fetch-destination.ts +257 -0
  241. package/src/crawler/fetch-robots-txt.spec.ts +83 -0
  242. package/src/crawler/fetch-robots-txt.ts +91 -0
  243. package/src/crawler/find-best-matching-scope.spec.ts +39 -0
  244. package/src/crawler/find-best-matching-scope.ts +57 -0
  245. package/src/crawler/generate-predicted-urls.spec.ts +42 -0
  246. package/src/crawler/generate-predicted-urls.ts +34 -0
  247. package/src/crawler/handle-ignore-and-skip.spec.ts +66 -0
  248. package/src/crawler/handle-ignore-and-skip.ts +30 -0
  249. package/src/crawler/handle-resource-response.spec.ts +45 -0
  250. package/src/crawler/handle-resource-response.ts +21 -0
  251. package/src/crawler/handle-scrape-end.spec.ts +109 -0
  252. package/src/crawler/handle-scrape-end.ts +115 -0
  253. package/src/crawler/handle-scrape-error.spec.ts +105 -0
  254. package/src/crawler/handle-scrape-error.ts +58 -0
  255. package/src/crawler/index.ts +2 -0
  256. package/src/crawler/inject-scope-auth.spec.ts +36 -0
  257. package/src/crawler/inject-scope-auth.ts +27 -0
  258. package/src/crawler/is-external-url.spec.ts +31 -0
  259. package/src/crawler/is-external-url.ts +17 -0
  260. package/src/crawler/is-in-any-lower-layer.spec.ts +31 -0
  261. package/src/crawler/is-in-any-lower-layer.ts +22 -0
  262. package/src/crawler/link-list.spec.ts +355 -0
  263. package/src/crawler/link-list.ts +275 -0
  264. package/src/crawler/link-to-page-data.spec.ts +133 -0
  265. package/src/crawler/link-to-page-data.ts +34 -0
  266. package/src/crawler/net-timeout-error.spec.ts +25 -0
  267. package/src/crawler/net-timeout-error.ts +11 -0
  268. package/src/crawler/protocol-agnostic-key.spec.ts +40 -0
  269. package/src/crawler/protocol-agnostic-key.ts +11 -0
  270. package/src/crawler/reconstruct-url.spec.ts +37 -0
  271. package/src/crawler/reconstruct-url.ts +37 -0
  272. package/src/crawler/robots-checker.spec.ts +104 -0
  273. package/src/crawler/robots-checker.ts +73 -0
  274. package/src/crawler/should-discard-predicted.spec.ts +125 -0
  275. package/src/crawler/should-discard-predicted.ts +33 -0
  276. package/src/crawler/should-skip-url.spec.ts +77 -0
  277. package/src/crawler/should-skip-url.ts +37 -0
  278. package/src/crawler/types.ts +146 -0
  279. package/src/crawler-orchestrator.ts +401 -0
  280. package/src/debug.ts +10 -0
  281. package/src/index.ts +25 -0
  282. package/src/types.ts +30 -0
  283. package/src/utils/array/each-splitted.spec.ts +38 -0
  284. package/src/utils/array/each-splitted.ts +19 -0
  285. package/src/utils/array/index.ts +1 -0
  286. package/src/utils/debug.ts +6 -0
  287. package/src/utils/error/dom-evaluation-error.spec.ts +20 -0
  288. package/src/utils/error/dom-evaluation-error.ts +6 -0
  289. package/src/utils/error/error-emitter.spec.ts +78 -0
  290. package/src/utils/error/error-emitter.ts +44 -0
  291. package/src/utils/error/index.ts +3 -0
  292. package/src/utils/index.ts +5 -0
  293. package/src/utils/object/clean-object.spec.ts +24 -0
  294. package/src/utils/object/clean-object.ts +13 -0
  295. package/src/utils/object/index.ts +1 -0
  296. package/src/utils/types/index.ts +1 -0
  297. package/src/utils/types/types.ts +65 -0
  298. package/tsconfig.json +11 -0
  299. package/tsconfig.tsbuildinfo +1 -0
@@ -0,0 +1,748 @@
1
+ var _a;
2
+ import { existsSync } from 'node:fs';
3
+ import path from 'node:path';
4
+ import Scraper from '@d-zero/beholder';
5
+ import { deal } from '@d-zero/dealer';
6
+ import { tryParseUrl as parseUrl } from '@d-zero/shared/parse-url';
7
+ import { retryCall } from '@d-zero/shared/retry';
8
+ import { TypedAwaitEventEmitter as EventEmitter } from '@d-zero/shared/typed-await-event-emitter';
9
+ import c from 'ansi-colors';
10
+ import pkg from '../../package.json' with { type: 'json' };
11
+ import { crawlerLog } from '../debug.js';
12
+ import { detectPaginationPattern } from './detect-pagination-pattern.js';
13
+ import { fetchDestination } from './fetch-destination.js';
14
+ import { generatePredictedUrls } from './generate-predicted-urls.js';
15
+ import { handleIgnoreAndSkip } from './handle-ignore-and-skip.js';
16
+ import { handleResourceResponse } from './handle-resource-response.js';
17
+ import { handleScrapeEnd } from './handle-scrape-end.js';
18
+ import { handleScrapeError } from './handle-scrape-error.js';
19
+ import { injectScopeAuth } from './inject-scope-auth.js';
20
+ import { isExternalUrl } from './is-external-url.js';
21
+ import LinkList from './link-list.js';
22
+ import { linkToPageData } from './link-to-page-data.js';
23
+ import { protocolAgnosticKey } from './protocol-agnostic-key.js';
24
+ import { RobotsChecker } from './robots-checker.js';
25
+ import { shouldDiscardPredicted } from './should-discard-predicted.js';
26
+ import { shouldSkipUrl } from './should-skip-url.js';
27
+ /**
28
+ * The core crawler engine that discovers and scrapes web pages.
29
+ *
30
+ * The Crawler manages the crawl queue, uses the dealer pattern for concurrent
31
+ * page scraping via `@d-zero/beholder`, handles scrape results, and emits
32
+ * events defined by {@link CrawlerEventTypes}. It supports recursive crawling
33
+ * within a defined scope, external page fetching, URL exclusion, and resumable crawls.
34
+ *
35
+ * Crawling is performed concurrently using the dealer pattern, with
36
+ * configurable parallelism up to {@link Crawler.MAX_PROCESS_LENGTH}.
37
+ */
38
+ class Crawler extends EventEmitter {
39
+ /** Flag set by `abort()` to signal in-progress tasks to exit early. */
40
+ #aborted = false;
41
+ /** Tracks discovered URLs, their scrape status, and deduplication. */
42
+ #linkList = new LinkList();
43
+ /** Merged crawler configuration (user overrides + defaults). */
44
+ #options;
45
+ /** Set of resource URLs (without hash) already captured, for deduplication. */
46
+ #resources = new Set();
47
+ /** URLs restored from a previous session that still need to be scraped. */
48
+ #resumedPending = [];
49
+ /** URLs already scraped in a previous session, used to populate the `seen` set in {@link #runDeal}. */
50
+ #resumedScraped = [];
51
+ /** Checker for robots.txt compliance. */
52
+ #robotsChecker;
53
+ /** Maps hostnames to their scope URLs. Defines the crawl boundary for internal/external classification. */
54
+ #scope = new Map();
55
+ /**
56
+ * Create a new Crawler instance.
57
+ * @param options - Configuration options for crawling behavior. All fields have
58
+ * sensible defaults if omitted.
59
+ */
60
+ constructor(options) {
61
+ super();
62
+ this.#options = {
63
+ interval: options?.interval || 0,
64
+ parallels: options?.parallels || 0,
65
+ recursive: options?.recursive ?? true,
66
+ fromList: false,
67
+ captureImages: options?.captureImages ?? true,
68
+ executablePath: options?.executablePath ?? null,
69
+ fetchExternal: options?.fetchExternal ?? true,
70
+ scope: options?.scope ?? [],
71
+ excludes: options?.excludes || [],
72
+ excludeKeywords: options?.excludeKeywords || [],
73
+ excludeUrls: options?.excludeUrls || [],
74
+ maxExcludedDepth: options?.maxExcludedDepth || 10,
75
+ retry: options?.retry ?? 3,
76
+ disableQueries: options?.disableQueries ?? false,
77
+ verbose: options?.verbose ?? false,
78
+ userAgent: options?.userAgent || `Nitpicker/${pkg.version}`,
79
+ ignoreRobots: options?.ignoreRobots ?? false,
80
+ };
81
+ this.#robotsChecker = new RobotsChecker(this.#options.userAgent, !this.#options.ignoreRobots);
82
+ for (const urlStr of this.#options.scope) {
83
+ const url = parseUrl(urlStr, this.#options);
84
+ if (url) {
85
+ const existing = this.#scope.get(url.hostname) || [];
86
+ this.#scope.set(url.hostname, [...existing, url]);
87
+ }
88
+ }
89
+ }
90
+ /**
91
+ * Abort the current crawl operation.
92
+ *
93
+ * Sets the aborted flag and immediately emits a `crawlEnd` event.
94
+ * In-progress scrape tasks will check the flag and exit early.
95
+ */
96
+ abort() {
97
+ this.#aborted = true;
98
+ void this.emit('crawlEnd', {});
99
+ }
100
+ /**
101
+ * Retrieve the list of Chromium process IDs that are still running.
102
+ *
103
+ * In the current architecture, process cleanup is handled by the dealer,
104
+ * so this always returns an empty array.
105
+ * @returns An empty array (reserved for future use).
106
+ */
107
+ getUndeadPid() {
108
+ return [];
109
+ }
110
+ /**
111
+ * Restore crawl state from a previous session for resumable crawling.
112
+ *
113
+ * Repopulates the link list with pending and already-scraped URLs,
114
+ * and restores the set of known resource URLs to avoid duplicates.
115
+ * @param pending - URLs that were pending (not yet scraped) in the previous session.
116
+ * @param scraped - URLs that were already scraped in the previous session.
117
+ * @param resources - Resource URLs that were already captured in the previous session.
118
+ */
119
+ resume(pending, scraped, resources) {
120
+ this.#resumedPending = this.#linkList.resume(pending, scraped, this.#options);
121
+ this.#resumedScraped = scraped;
122
+ for (const resource of resources) {
123
+ this.#resources.add(resource);
124
+ }
125
+ }
126
+ /**
127
+ * Start crawling from a single root URL.
128
+ *
129
+ * Adds the root URL to the scope (if not already present) and the link list,
130
+ * then begins the deal-based concurrent crawl. Discovered child pages are
131
+ * automatically added to the queue when recursive mode is enabled.
132
+ * @param url - The root URL to begin crawling from.
133
+ */
134
+ start(url) {
135
+ const existing = this.#scope.get(url.hostname) || [];
136
+ if (!existing.some((u) => u.href === url.href)) {
137
+ this.#scope.set(url.hostname, [...existing, url]);
138
+ }
139
+ this.#linkList.add(url);
140
+ const isResuming = this.#resumedScraped.length > 0;
141
+ const initialUrls = isResuming ? this.#resumedPending : [url];
142
+ const resumeOffset = this.#resumedScraped.length;
143
+ if (initialUrls.length === 0) {
144
+ crawlerLog('Crawl End (nothing to resume)');
145
+ void this.emit('crawlEnd', {});
146
+ return;
147
+ }
148
+ void this.#runDeal(initialUrls, resumeOffset).catch((error) => {
149
+ crawlerLog('runDeal error: %O', error);
150
+ void this.emit('error', {
151
+ pid: process.pid,
152
+ isMainProcess: true,
153
+ url: url.href,
154
+ error: error instanceof Error ? error : new Error(String(error)),
155
+ });
156
+ void this.emit('crawlEnd', {});
157
+ });
158
+ }
159
+ /**
160
+ * Start crawling a pre-defined list of URLs in non-recursive mode.
161
+ *
162
+ * Each URL in the list is added to the scope and the link list. Recursive
163
+ * crawling is disabled; only the provided URLs will be scraped.
164
+ * @param pageList - The list of URLs to crawl. Must contain at least one URL.
165
+ * @throws {Error} If the page list is empty.
166
+ */
167
+ startMultiple(pageList) {
168
+ if (!pageList[0]) {
169
+ throw new Error('pageList is empty');
170
+ }
171
+ const scopeMap = new Map();
172
+ for (const pageUrl of pageList) {
173
+ const existing = this.#scope.get(pageUrl.hostname) || [];
174
+ const existingHrefs = scopeMap.get(pageUrl.hostname) || new Set(existing.map((u) => u.href));
175
+ if (!existingHrefs.has(pageUrl.href)) {
176
+ this.#scope.set(pageUrl.hostname, [...existing, pageUrl]);
177
+ existingHrefs.add(pageUrl.href);
178
+ }
179
+ scopeMap.set(pageUrl.hostname, existingHrefs);
180
+ this.#linkList.add(pageUrl);
181
+ }
182
+ this.#options.recursive = false;
183
+ this.#options.fromList = true;
184
+ void this.#runDeal(pageList).catch((error) => {
185
+ crawlerLog('runDeal error: %O', error);
186
+ void this.emit('error', {
187
+ pid: process.pid,
188
+ isMainProcess: true,
189
+ url: pageList[0].href,
190
+ error: error instanceof Error ? error : new Error(String(error)),
191
+ });
192
+ void this.emit('crawlEnd', {});
193
+ });
194
+ }
195
+ /**
196
+ * Processes captured sub-resources from a page scrape, deduplicates them,
197
+ * and emits `response` / `responseReferrers` events for new resources.
198
+ * @param resources - Sub-resource entries captured during the page load
199
+ */
200
+ #handleResources(resources) {
201
+ for (const { resource, pageUrl } of resources) {
202
+ const { isNew } = handleResourceResponse(resource, this.#resources);
203
+ if (isNew) {
204
+ void this.emit('response', {
205
+ resource: resource,
206
+ });
207
+ }
208
+ void this.emit('responseReferrers', {
209
+ url: pageUrl,
210
+ src: resource.url.withoutHash,
211
+ });
212
+ }
213
+ }
214
+ /**
215
+ * Dispatches a scrape result to the appropriate handler based on its type.
216
+ *
217
+ * - `success`: Processes anchors, enqueues new URLs, triggers predicted
218
+ * pagination detection, and emits `page` / `externalPage` events.
219
+ * - `skipped`: Marks the URL as done and emits `skip`.
220
+ * - `error`: Creates a fallback PageData, marks as done, and emits `error`.
221
+ * @param result - The scrape result from beholder
222
+ * @param url - The URL that was scraped
223
+ * @param push - Dealer's push callback to enqueue newly discovered URLs
224
+ * @param paginationState - Mutable state for predicted pagination cascade prevention
225
+ * @param paginationState.lastPushedUrl
226
+ * @param paginationState.lastPushedWasPredicted
227
+ * @param concurrency - Current concurrency level, used to determine predicted URL count
228
+ */
229
+ #handleResult(result, url, push, paginationState, concurrency) {
230
+ switch (result.type) {
231
+ case 'success': {
232
+ if (!result.pageData)
233
+ break;
234
+ handleScrapeEnd(result.pageData, this.#linkList, this.#scope, this.#options, (newUrl, opts) => {
235
+ this.#linkList.add(newUrl, opts);
236
+ void push(newUrl);
237
+ // Predicted pagination detection
238
+ if (!paginationState || !concurrency)
239
+ return;
240
+ // metadataOnly / external: update tracking but skip pattern detection
241
+ if (opts?.metadataOnly || isExternalUrl(newUrl, this.#scope)) {
242
+ paginationState.lastPushedUrl = newUrl.withoutHashAndAuth;
243
+ paginationState.lastPushedWasPredicted = false;
244
+ return;
245
+ }
246
+ // Skip comparison when last push was predicted (cascade prevention)
247
+ if (paginationState.lastPushedUrl &&
248
+ !paginationState.lastPushedWasPredicted) {
249
+ const pattern = detectPaginationPattern(paginationState.lastPushedUrl, newUrl.withoutHashAndAuth);
250
+ if (pattern) {
251
+ const urls = generatePredictedUrls(pattern, newUrl.withoutHashAndAuth, concurrency);
252
+ for (const specUrlStr of urls) {
253
+ const specUrl = parseUrl(specUrlStr, this.#options);
254
+ if (specUrl) {
255
+ this.#linkList.add(specUrl, { predicted: true });
256
+ void push(specUrl);
257
+ }
258
+ }
259
+ paginationState.lastPushedUrl = newUrl.withoutHashAndAuth;
260
+ paginationState.lastPushedWasPredicted = true;
261
+ return;
262
+ }
263
+ }
264
+ paginationState.lastPushedUrl = newUrl.withoutHashAndAuth;
265
+ paginationState.lastPushedWasPredicted = false;
266
+ });
267
+ if (result.pageData.isExternal) {
268
+ void this.emit('externalPage', { result: result.pageData });
269
+ }
270
+ else {
271
+ void this.emit('page', { result: result.pageData });
272
+ }
273
+ break;
274
+ }
275
+ case 'skipped': {
276
+ if (!result.ignored)
277
+ break;
278
+ handleIgnoreAndSkip(result.ignored.url, this.#linkList, this.#scope, this.#options);
279
+ void this.emit('skip', {
280
+ url: result.ignored.url.href,
281
+ reason: JSON.stringify(result.ignored),
282
+ isExternal: isExternalUrl(result.ignored.url, this.#scope),
283
+ });
284
+ break;
285
+ }
286
+ case 'error': {
287
+ if (!result.error)
288
+ break;
289
+ const error = new Error(result.error.message);
290
+ error.name = result.error.name;
291
+ error.stack = result.error.stack;
292
+ const { result: pageResult } = handleScrapeError({
293
+ url,
294
+ error,
295
+ shutdown: result.error.shutdown,
296
+ pid: undefined,
297
+ }, this.#linkList, this.#scope, this.#options);
298
+ if (pageResult) {
299
+ if (pageResult.isExternal) {
300
+ void this.emit('externalPage', { result: pageResult });
301
+ }
302
+ else {
303
+ void this.emit('page', { result: pageResult });
304
+ }
305
+ }
306
+ void this.emit('error', {
307
+ pid: process.pid,
308
+ isMainProcess: true,
309
+ url: url.href,
310
+ error,
311
+ });
312
+ break;
313
+ }
314
+ }
315
+ }
316
+ /**
317
+ * Launches a fresh Puppeteer browser, runs the beholder scraper, and cleans up.
318
+ *
319
+ * WHY per-URL browser: Each URL gets its own browser instance to ensure
320
+ * complete isolation (cookies, cache, service workers). The browser is always
321
+ * closed in the `finally` block, even on error.
322
+ * @param url - Target URL to scrape
323
+ * @param update - Callback for progress messages
324
+ * @param isExternal - Whether the URL is external to the crawl scope
325
+ * @param metadataOnly - When true, only extract title metadata
326
+ * @param headCheckResult - Optional HEAD result to pass to the scraper, avoiding a redundant request
327
+ * @returns The scrape result from beholder
328
+ */
329
+ async #launchBrowserAndScrape(url, update, isExternal, metadataOnly, headCheckResult) {
330
+ update('Launching browser%dots%');
331
+ if (this.#options.executablePath) {
332
+ const execPath = path.resolve(this.#options.executablePath);
333
+ if (!existsSync(execPath)) {
334
+ throw new Error(`Executable path does not exist: ${execPath}`);
335
+ }
336
+ }
337
+ const puppeteer = await import('puppeteer');
338
+ const browser = await puppeteer.launch({
339
+ headless: true,
340
+ ...(this.#options.executablePath
341
+ ? { executablePath: this.#options.executablePath }
342
+ : {}),
343
+ });
344
+ try {
345
+ update('Creating page%dots%');
346
+ const page = await browser.newPage();
347
+ await page.setUserAgent(this.#options.userAgent);
348
+ const scraper = new Scraper();
349
+ scraper.on('changePhase', (e) => {
350
+ const msg = formatPhaseLog(e);
351
+ if (msg) {
352
+ update(msg);
353
+ }
354
+ void this.emit('changePhase', e);
355
+ });
356
+ const result = await scraper.scrapeStart(page, url, {
357
+ isExternal,
358
+ captureImages: !isExternal && this.#options.captureImages,
359
+ excludeKeywords: this.#options.excludeKeywords,
360
+ disableQueries: this.#options.disableQueries,
361
+ metadataOnly,
362
+ retries: this.#options.retry,
363
+ headCheckResult,
364
+ });
365
+ update('Closing browser%dots%');
366
+ return result;
367
+ }
368
+ catch (error) {
369
+ return {
370
+ type: 'error',
371
+ resources: [],
372
+ error: {
373
+ name: error instanceof Error ? error.name : 'Error',
374
+ message: error instanceof Error ? error.message : String(error),
375
+ stack: error instanceof Error ? error.stack : undefined,
376
+ shutdown: true,
377
+ },
378
+ };
379
+ }
380
+ finally {
381
+ await browser.close().catch(() => { });
382
+ }
383
+ }
384
+ /**
385
+ * Runs the deal-based concurrent crawl loop.
386
+ *
387
+ * WHY deal(): The `@d-zero/dealer` pattern provides concurrent item processing
388
+ * with a dynamic queue — new URLs discovered during scraping are pushed via the
389
+ * `push` callback and automatically scheduled. The `onPush` deduplication ensures
390
+ * each URL is processed at most once (protocol-agnostic comparison).
391
+ * @param initialUrls - Starting URLs to seed the deal queue
392
+ * @param resumeOffset - Number of URLs already scraped in a previous session,
393
+ * added to the progress counter for accurate display
394
+ */
395
+ async #runDeal(initialUrls, resumeOffset = 0) {
396
+ const seen = new Set(initialUrls.map((u) => protocolAgnosticKey(u.withoutHashAndAuth)));
397
+ // Add scraped URLs to seen to prevent re-processing during resume
398
+ for (const url of this.#resumedScraped) {
399
+ seen.add(protocolAgnosticKey(url));
400
+ }
401
+ // external URL の追跡(target は deal の total/done から導出)
402
+ const externalUrls = new Set();
403
+ const externalDoneUrls = new Set();
404
+ // 初期 URL を分類(onPush を通らないため)
405
+ for (const url of initialUrls) {
406
+ if (isExternalUrl(url, this.#scope)) {
407
+ externalUrls.add(protocolAgnosticKey(url.withoutHashAndAuth));
408
+ }
409
+ }
410
+ const concurrency = this.#options.parallels
411
+ ? Math.max(this.#options.parallels, 1)
412
+ : _a.MAX_PROCESS_LENGTH;
413
+ // Predicted pagination state
414
+ const paginationState = {
415
+ lastPushedUrl: null,
416
+ lastPushedWasPredicted: false,
417
+ };
418
+ await deal(initialUrls, (url, update, _index, setLineHeader, push) => {
419
+ const isExternal = isExternalUrl(url, this.#scope);
420
+ const urlText = isExternal ? c.dim(url.href) : c.cyan(url.href);
421
+ setLineHeader(`%braille% ${urlText}: `);
422
+ injectScopeAuth(url, this.#scope);
423
+ this.#linkList.add(url);
424
+ this.#linkList.progress(url);
425
+ return async () => {
426
+ if (this.#aborted)
427
+ return;
428
+ const log = createTimedUpdate(update, this.#options.verbose);
429
+ try {
430
+ const robotsAllowed = await this.#robotsChecker.isAllowed(url);
431
+ if (!robotsAllowed) {
432
+ handleIgnoreAndSkip(url, this.#linkList, this.#scope, this.#options);
433
+ void this.emit('skip', {
434
+ url: url.href,
435
+ reason: 'blocked by robots.txt',
436
+ isExternal,
437
+ });
438
+ log(c.gray('Blocked by robots.txt'));
439
+ return;
440
+ }
441
+ const isSkip = shouldSkipUrl({
442
+ url,
443
+ excludes: this.#options.excludes,
444
+ excludeUrls: this.#options.excludeUrls,
445
+ options: this.#options,
446
+ });
447
+ if (isSkip) {
448
+ handleIgnoreAndSkip(url, this.#linkList, this.#scope, this.#options);
449
+ void this.emit('skip', { url: url.href, reason: 'excluded', isExternal });
450
+ log(c.gray('Skipped'));
451
+ return;
452
+ }
453
+ if (!this.#options.fetchExternal && isExternal) {
454
+ const pageData = linkToPageData({
455
+ url,
456
+ isExternal,
457
+ isLowerLayer: false,
458
+ });
459
+ this.#linkList.done(url, this.#scope, { page: pageData }, this.#options);
460
+ void this.emit('externalPage', { result: pageData });
461
+ log(c.dim('External (skip fetch)'));
462
+ return;
463
+ }
464
+ const metadataOnly = this.#linkList.isMetadataOnly(url.withoutHash);
465
+ const isPredicted = this.#linkList.isPredicted(url.withoutHashAndAuth);
466
+ log('Scraping%dots%');
467
+ const result = await this.#scrapePage(url, log, metadataOnly);
468
+ // Discard predicted URLs that failed (404, error, etc.)
469
+ if (isPredicted && shouldDiscardPredicted(result)) {
470
+ handleIgnoreAndSkip(url, this.#linkList, this.#scope, this.#options);
471
+ log(c.dim('Predicted (discarded)'));
472
+ return;
473
+ }
474
+ log('Saving results%dots%');
475
+ this.#handleResult(result, url, push, paginationState, concurrency);
476
+ this.#handleResources(result.resources);
477
+ log(formatResultSummary(result));
478
+ }
479
+ finally {
480
+ if (isExternal) {
481
+ externalDoneUrls.add(protocolAgnosticKey(url.withoutHashAndAuth));
482
+ }
483
+ }
484
+ };
485
+ }, {
486
+ limit: concurrency,
487
+ interval: this.#options.interval,
488
+ verbose: this.#options.verbose || !process.stdout.isTTY,
489
+ header: (_progress, done, total, limit) => {
490
+ const allDone = done + resumeOffset;
491
+ const allTotal = total + resumeOffset;
492
+ const extTotal = externalUrls.size;
493
+ const extDone = externalDoneUrls.size;
494
+ const pct = allTotal > 0 ? Math.round((allDone / allTotal) * 100) : 0;
495
+ return (c.bold(`Crawling: ${allDone - extDone}/${allTotal - extTotal}`) +
496
+ c.dim(`(${extDone}/${extTotal})`) +
497
+ c.bold(` (${pct}%) [${limit} parallel]`));
498
+ },
499
+ onPush: (url) => {
500
+ const key = protocolAgnosticKey(url.withoutHashAndAuth);
501
+ if (seen.has(key))
502
+ return false;
503
+ seen.add(key);
504
+ if (isExternalUrl(url, this.#scope)) {
505
+ externalUrls.add(key);
506
+ }
507
+ return true;
508
+ },
509
+ });
510
+ crawlerLog('Crawl End');
511
+ void this.emit('crawlEnd', {});
512
+ }
513
+ /**
514
+ * Orchestrates the full scrape pipeline for a single URL.
515
+ *
516
+ * Flow:
517
+ * 1. Non-HTTP protocols → delegate directly to browser scraper
518
+ * 2. HEAD pre-flight → check availability and content type
519
+ * 3. Title-only mode → extract `<title>` via partial GET, skip browser
520
+ * 4. Non-HTML content → return HEAD result, skip browser
521
+ * 5. HTML content → launch browser with preflight result
522
+ * @param url - Target URL to scrape
523
+ * @param update - Callback for progress messages
524
+ * @param metadataOnly - When true, only extract title metadata without full browser scraping
525
+ * @returns The scrape result
526
+ */
527
+ async #scrapePage(url, update, metadataOnly) {
528
+ const isExternal = isExternalUrl(url, this.#scope);
529
+ // Non-HTTP protocols (mailto:, tel:, etc.) — let the scraper handle early return
530
+ if (!url.isHTTP) {
531
+ return this.#launchBrowserAndScrape(url, update, isExternal, metadataOnly);
532
+ }
533
+ // Pre-flight: lightweight HEAD request to check server availability
534
+ update('HEAD request%dots%');
535
+ let headCheckResult;
536
+ try {
537
+ headCheckResult = await this.#sendHeadRequest(url, isExternal, update);
538
+ }
539
+ catch (error) {
540
+ // Server unreachable — skip browser launch entirely
541
+ update(c.red('Unreachable'));
542
+ return {
543
+ type: 'error',
544
+ resources: [],
545
+ error: {
546
+ name: error instanceof Error ? error.name : 'Error',
547
+ message: error instanceof Error ? error.message : String(error),
548
+ stack: error instanceof Error ? error.stack : undefined,
549
+ shutdown: false,
550
+ },
551
+ };
552
+ }
553
+ // Title-only mode — extract <title> via partial GET for HTML, skip browser
554
+ if (metadataOnly) {
555
+ if (headCheckResult.contentType === null ||
556
+ headCheckResult.contentType === 'text/html') {
557
+ update('Fetching title%dots%');
558
+ try {
559
+ const titleResult = await fetchDestination({
560
+ url,
561
+ isExternal,
562
+ method: 'GET',
563
+ options: { titleBytesLimit: 16_384 },
564
+ userAgent: this.#options.userAgent,
565
+ });
566
+ return {
567
+ type: 'success',
568
+ pageData: { ...titleResult, isTarget: false },
569
+ resources: [],
570
+ };
571
+ }
572
+ catch (error) {
573
+ crawlerLog('Title GET failed for %s: %O', url.href, error);
574
+ }
575
+ }
576
+ return {
577
+ type: 'success',
578
+ pageData: { ...headCheckResult, isTarget: false },
579
+ resources: [],
580
+ };
581
+ }
582
+ // Non-HTML content — skip browser
583
+ if (headCheckResult.contentType !== null &&
584
+ headCheckResult.contentType !== 'text/html') {
585
+ return {
586
+ type: 'success',
587
+ pageData: headCheckResult,
588
+ resources: [],
589
+ };
590
+ }
591
+ // HTML or unknown content type — launch browser with preflight result
592
+ return this.#launchBrowserAndScrape(url, update, isExternal, metadataOnly, headCheckResult);
593
+ }
594
+ /**
595
+ * Performs a pre-flight HTTP HEAD request with retry logic.
596
+ *
597
+ * WHY pre-flight: Avoids launching a browser for URLs that are unreachable,
598
+ * non-HTML, or return error status codes. This saves significant time and
599
+ * resources compared to launching Puppeteer for every URL.
600
+ * @param url - Target URL to check
601
+ * @param isExternal - Whether the URL is external to the crawl scope
602
+ * @param update - Callback for progress messages shown in the dealer display
603
+ * @returns Lightweight page data from the HEAD response
604
+ */
605
+ async #sendHeadRequest(url, isExternal, update) {
606
+ return retryCall(() => fetchDestination({ url, isExternal, userAgent: this.#options.userAgent }), {
607
+ retries: this.#options.retry,
608
+ label: 'HEAD request',
609
+ onWait: (determinedInterval, retryCount, label, error) => {
610
+ update(`${label}: ${error.message} — %countdown(${determinedInterval},fetchHead_${retryCount},s)%s (retry #${retryCount + 1})`);
611
+ },
612
+ onGiveUp: (retryCount, error, label) => {
613
+ update(c.red(`${label}: gave up after ${retryCount} retries — ${error.message}`));
614
+ },
615
+ });
616
+ }
617
/**
 * The default maximum number of concurrent scraping processes.
 *
 * Used when `parallels` is not specified or is set to 0
 * (a falsy `parallels` falls back to this value in `#runDeal`).
 */
static MAX_PROCESS_LENGTH = 10;
623
+ }
624
+ _a = Crawler;
625
+ export default Crawler;
626
/**
 * Colorize an HTTP status code string for terminal display.
 *
 * - 2xx (and 1xx): green
 * - 3xx: yellow
 * - 4xx/5xx: red
 * - Unknown: no color
 * @param status - The HTTP status code, or `undefined` if unknown.
 * @returns A colorized "Done (status)" string.
 */
function colorStatus(status) {
    const label = `Done (${status ?? '?'})`;
    if (!status) {
        return label;
    }
    if (status >= 400) {
        return c.red(label);
    }
    return status >= 300 ? c.yellow(label) : c.green(label);
}
646
/**
 * Maps a beholder phase event to a human-readable log message for the dealer display.
 * Returns `null` for phases that should not produce visible output (e.g. scrapeStart/End).
 * @param e - The phase change event from beholder
 * @returns A formatted message string, or `null` to suppress output
 */
function formatPhaseLog(e) {
    // Phases that are intentionally silent.
    if (e.name === 'scrapeStart' || e.name === 'scrapeEnd') {
        return null;
    }
    // Phases with a fixed progress message.
    const fixedMessages = {
        headRequest: 'HEAD request%dots%',
        getHTML: 'Getting HTML%dots%',
        waitNetworkIdle: 'Waiting for network idle%dots%',
        getAnchors: 'Extracting anchors%dots%',
        getMeta: 'Extracting meta%dots%',
        extractImages: 'Fetching images%dots%',
    };
    if (Object.hasOwn(fixedMessages, e.name)) {
        return fixedMessages[e.name];
    }
    // Phases with special colorization.
    if (e.name === 'loadDOMContent') {
        return c.dim('DOM loaded');
    }
    if (e.name === 'retryExhausted') {
        return c.red(e.message);
    }
    if (e.name === 'pageSkipped') {
        return c.yellow(`Skipped: ${e.message}`);
    }
    // Phases whose own message is shown verbatim; anything else shows the phase name.
    const passthrough = new Set([
        'openPage',
        'setViewport',
        'scrollToBottom',
        'waitImageLoad',
        'retryWait',
        'getImages',
    ]);
    return passthrough.has(e.name) ? e.message : e.name;
}
702
/**
 * Wraps an update callback to append elapsed time between calls (e.g. `+42ms`).
 * Only active when verbose mode is enabled; otherwise returns the original callback.
 * @param update - The original dealer update callback
 * @param verbose - Whether verbose mode is enabled
 * @returns A wrapped update callback that appends timing information
 */
function createTimedUpdate(update, verbose) {
    if (!verbose) {
        // Non-verbose: hand back the original callback untouched.
        return update;
    }
    let lastTick = Date.now();
    return (message) => {
        const tick = Date.now();
        const elapsed = tick - lastTick;
        lastTick = tick;
        update(`${message} ${c.dim(`+${elapsed}ms`)}`);
    };
}
720
/**
 * Formats a one-line summary of a scrape result for the dealer display.
 * Shows HTTP status (colorized), anchor/image/resource counts for target pages.
 * @param result - The scrape result to summarize
 * @returns A colorized summary string
 */
function formatResultSummary(result) {
    if (result.type === 'skipped') {
        return c.gray('Skipped');
    }
    if (result.type === 'error') {
        return c.red('Error');
    }
    if (result.type !== 'success') {
        // Unknown result types fall back to showing the raw type name.
        return result.type;
    }
    const status = colorStatus(result.pageData?.status);
    if (!result.pageData?.isTarget) {
        // Non-target pages (external, metadata-only) show only the status.
        return status;
    }
    const anchors = result.pageData.anchorList.length;
    const images = result.pageData.imageList.length;
    const resources = result.resources.length;
    return `${status} ${c.cyan(`\u{1F517} ${anchors}`)} ${c.magenta(`\u{1F5BC}\u{FE0F} ${images}`)} ${c.dim(`\u{1F4E6} ${resources}`)}`;
}