@nitpicker/crawler 0.4.2 → 0.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. package/lib/archive/archive-accessor.d.ts +6 -1
  2. package/lib/archive/archive-accessor.js +7 -0
  3. package/lib/archive/database.js +2 -1
  4. package/package.json +5 -2
  5. package/CHANGELOG.md +0 -16
  6. package/src/archive/__mock__/.gitignore +0 -3
  7. package/src/archive/__mock__/mock.sqlite +0 -0
  8. package/src/archive/archive-accessor.ts +0 -337
  9. package/src/archive/archive.ts +0 -408
  10. package/src/archive/database.spec.ts +0 -469
  11. package/src/archive/database.ts +0 -1059
  12. package/src/archive/debug.ts +0 -10
  13. package/src/archive/filesystem/append-text.spec.ts +0 -26
  14. package/src/archive/filesystem/append-text.ts +0 -16
  15. package/src/archive/filesystem/copy-dir-sync.spec.ts +0 -27
  16. package/src/archive/filesystem/copy-dir-sync.ts +0 -10
  17. package/src/archive/filesystem/copy-dir.spec.ts +0 -33
  18. package/src/archive/filesystem/copy-dir.ts +0 -14
  19. package/src/archive/filesystem/exists.spec.ts +0 -33
  20. package/src/archive/filesystem/exists.ts +0 -10
  21. package/src/archive/filesystem/get-file-list.spec.ts +0 -37
  22. package/src/archive/filesystem/get-file-list.ts +0 -13
  23. package/src/archive/filesystem/index.ts +0 -17
  24. package/src/archive/filesystem/is-dir.spec.ts +0 -29
  25. package/src/archive/filesystem/is-dir.ts +0 -11
  26. package/src/archive/filesystem/mkdir.spec.ts +0 -37
  27. package/src/archive/filesystem/mkdir.ts +0 -16
  28. package/src/archive/filesystem/output-json.spec.ts +0 -34
  29. package/src/archive/filesystem/output-json.ts +0 -16
  30. package/src/archive/filesystem/output-text.spec.ts +0 -31
  31. package/src/archive/filesystem/output-text.ts +0 -35
  32. package/src/archive/filesystem/read-json.spec.ts +0 -26
  33. package/src/archive/filesystem/read-json.ts +0 -12
  34. package/src/archive/filesystem/read-text.spec.ts +0 -25
  35. package/src/archive/filesystem/read-text.ts +0 -11
  36. package/src/archive/filesystem/readline.spec.ts +0 -29
  37. package/src/archive/filesystem/readline.ts +0 -30
  38. package/src/archive/filesystem/remove.spec.ts +0 -34
  39. package/src/archive/filesystem/remove.ts +0 -11
  40. package/src/archive/filesystem/rename.spec.ts +0 -46
  41. package/src/archive/filesystem/rename.ts +0 -21
  42. package/src/archive/filesystem/tar.spec.ts +0 -33
  43. package/src/archive/filesystem/tar.ts +0 -27
  44. package/src/archive/filesystem/untar.spec.ts +0 -34
  45. package/src/archive/filesystem/untar.ts +0 -36
  46. package/src/archive/index.ts +0 -13
  47. package/src/archive/page.spec.ts +0 -368
  48. package/src/archive/page.ts +0 -420
  49. package/src/archive/resource.spec.ts +0 -101
  50. package/src/archive/resource.ts +0 -73
  51. package/src/archive/safe-path.spec.ts +0 -44
  52. package/src/archive/safe-path.ts +0 -18
  53. package/src/archive/types.ts +0 -227
  54. package/src/crawler/clear-destination-cache.spec.ts +0 -20
  55. package/src/crawler/clear-destination-cache.ts +0 -9
  56. package/src/crawler/crawler.ts +0 -873
  57. package/src/crawler/decompose-url.spec.ts +0 -48
  58. package/src/crawler/decompose-url.ts +0 -90
  59. package/src/crawler/destination-cache.spec.ts +0 -23
  60. package/src/crawler/destination-cache.ts +0 -8
  61. package/src/crawler/detect-pagination-pattern.spec.ts +0 -169
  62. package/src/crawler/detect-pagination-pattern.ts +0 -66
  63. package/src/crawler/fetch-destination.ts +0 -257
  64. package/src/crawler/fetch-robots-txt.spec.ts +0 -83
  65. package/src/crawler/fetch-robots-txt.ts +0 -91
  66. package/src/crawler/find-best-matching-scope.spec.ts +0 -39
  67. package/src/crawler/find-best-matching-scope.ts +0 -57
  68. package/src/crawler/generate-predicted-urls.spec.ts +0 -42
  69. package/src/crawler/generate-predicted-urls.ts +0 -34
  70. package/src/crawler/handle-ignore-and-skip.spec.ts +0 -66
  71. package/src/crawler/handle-ignore-and-skip.ts +0 -30
  72. package/src/crawler/handle-resource-response.spec.ts +0 -45
  73. package/src/crawler/handle-resource-response.ts +0 -21
  74. package/src/crawler/handle-scrape-end.spec.ts +0 -109
  75. package/src/crawler/handle-scrape-end.ts +0 -115
  76. package/src/crawler/handle-scrape-error.spec.ts +0 -105
  77. package/src/crawler/handle-scrape-error.ts +0 -58
  78. package/src/crawler/index.ts +0 -2
  79. package/src/crawler/inject-scope-auth.spec.ts +0 -36
  80. package/src/crawler/inject-scope-auth.ts +0 -27
  81. package/src/crawler/is-external-url.spec.ts +0 -31
  82. package/src/crawler/is-external-url.ts +0 -17
  83. package/src/crawler/is-in-any-lower-layer.spec.ts +0 -31
  84. package/src/crawler/is-in-any-lower-layer.ts +0 -22
  85. package/src/crawler/link-list.spec.ts +0 -355
  86. package/src/crawler/link-list.ts +0 -275
  87. package/src/crawler/link-to-page-data.spec.ts +0 -133
  88. package/src/crawler/link-to-page-data.ts +0 -34
  89. package/src/crawler/net-timeout-error.spec.ts +0 -25
  90. package/src/crawler/net-timeout-error.ts +0 -11
  91. package/src/crawler/protocol-agnostic-key.spec.ts +0 -40
  92. package/src/crawler/protocol-agnostic-key.ts +0 -11
  93. package/src/crawler/reconstruct-url.spec.ts +0 -37
  94. package/src/crawler/reconstruct-url.ts +0 -37
  95. package/src/crawler/robots-checker.spec.ts +0 -104
  96. package/src/crawler/robots-checker.ts +0 -73
  97. package/src/crawler/should-discard-predicted.spec.ts +0 -125
  98. package/src/crawler/should-discard-predicted.ts +0 -33
  99. package/src/crawler/should-skip-url.spec.ts +0 -77
  100. package/src/crawler/should-skip-url.ts +0 -37
  101. package/src/crawler/types.ts +0 -146
  102. package/src/crawler-orchestrator.ts +0 -401
  103. package/src/debug.ts +0 -10
  104. package/src/index.ts +0 -25
  105. package/src/types.ts +0 -30
  106. package/src/utils/array/each-splitted.spec.ts +0 -38
  107. package/src/utils/array/each-splitted.ts +0 -19
  108. package/src/utils/array/index.ts +0 -1
  109. package/src/utils/debug.ts +0 -6
  110. package/src/utils/error/dom-evaluation-error.spec.ts +0 -20
  111. package/src/utils/error/dom-evaluation-error.ts +0 -6
  112. package/src/utils/error/error-emitter.spec.ts +0 -78
  113. package/src/utils/error/error-emitter.ts +0 -44
  114. package/src/utils/error/index.ts +0 -3
  115. package/src/utils/index.ts +0 -5
  116. package/src/utils/object/clean-object.spec.ts +0 -24
  117. package/src/utils/object/clean-object.ts +0 -13
  118. package/src/utils/object/index.ts +0 -1
  119. package/src/utils/types/index.ts +0 -1
  120. package/src/utils/types/types.ts +0 -65
  121. package/tsconfig.json +0 -11
  122. package/tsconfig.tsbuildinfo +0 -1
@@ -1,873 +0,0 @@
1
- import type { CrawlerEventTypes, CrawlerOptions } from './types.js';
2
- import type {
3
- ChangePhaseEvent,
4
- PageData,
5
- ResourceEntry,
6
- ScrapeResult,
7
- } from '@d-zero/beholder';
8
- import type { ExURL } from '@d-zero/shared/parse-url';
9
-
10
- import { existsSync } from 'node:fs';
11
- import path from 'node:path';
12
-
13
- import Scraper from '@d-zero/beholder';
14
- import { deal } from '@d-zero/dealer';
15
- import { tryParseUrl as parseUrl } from '@d-zero/shared/parse-url';
16
- import { retryCall } from '@d-zero/shared/retry';
17
- import { TypedAwaitEventEmitter as EventEmitter } from '@d-zero/shared/typed-await-event-emitter';
18
- import c from 'ansi-colors';
19
-
20
- import pkg from '../../package.json' with { type: 'json' };
21
- import { crawlerLog } from '../debug.js';
22
-
23
- import { detectPaginationPattern } from './detect-pagination-pattern.js';
24
- import { fetchDestination } from './fetch-destination.js';
25
- import { generatePredictedUrls } from './generate-predicted-urls.js';
26
- import { handleIgnoreAndSkip } from './handle-ignore-and-skip.js';
27
- import { handleResourceResponse } from './handle-resource-response.js';
28
- import { handleScrapeEnd } from './handle-scrape-end.js';
29
- import { handleScrapeError } from './handle-scrape-error.js';
30
- import { injectScopeAuth } from './inject-scope-auth.js';
31
- import { isExternalUrl } from './is-external-url.js';
32
- import LinkList from './link-list.js';
33
- import { linkToPageData } from './link-to-page-data.js';
34
- import { protocolAgnosticKey } from './protocol-agnostic-key.js';
35
- import { RobotsChecker } from './robots-checker.js';
36
- import { shouldDiscardPredicted } from './should-discard-predicted.js';
37
- import { shouldSkipUrl } from './should-skip-url.js';
38
-
39
- export type { CrawlerOptions } from './types.js';
40
-
41
- /**
42
- * The core crawler engine that discovers and scrapes web pages.
43
- *
44
- * The Crawler manages the crawl queue, uses the dealer pattern for concurrent
45
- * page scraping via `@d-zero/beholder`, handles scrape results, and emits
46
- * events defined by {@link CrawlerEventTypes}. It supports recursive crawling
47
- * within a defined scope, external page fetching, URL exclusion, and resumable crawls.
48
- *
49
- * Crawling is performed concurrently using the dealer pattern. Parallelism is
50
- * configurable and defaults to {@link Crawler.MAX_PROCESS_LENGTH} when `parallels` is unset or 0.
51
- */
52
- export default class Crawler extends EventEmitter<CrawlerEventTypes> {
53
- /** Flag set by `abort()` to signal in-progress tasks to exit early. */
54
- #aborted = false;
55
- /** Tracks discovered URLs, their scrape status, and deduplication. */
56
- readonly #linkList = new LinkList();
57
- /** Merged crawler configuration (user overrides + defaults). */
58
- readonly #options: CrawlerOptions;
59
- /** Set of resource URLs (without hash) already captured, for deduplication. */
60
- readonly #resources = new Set<string>();
61
- /** URLs restored from a previous session that still need to be scraped. */
62
- #resumedPending: ExURL[] = [];
63
- /** URLs already scraped in a previous session, used to populate the `seen` set in {@link #runDeal}. */
64
- #resumedScraped: string[] = [];
65
- /** Checker for robots.txt compliance. */
66
- readonly #robotsChecker: RobotsChecker;
67
-
68
- /** Maps hostnames to their scope URLs. Defines the crawl boundary for internal/external classification. */
69
- readonly #scope = new Map<string /* hostname */, ExURL[]>();
70
-
71
/**
 * Build a crawler, substituting a default for every option the caller omitted.
 *
 * Besides merging options, this prepares the robots.txt checker and indexes
 * every parseable scope URL under its hostname.
 * @param options - Partial configuration; any missing field falls back to its default.
 */
constructor(options?: Partial<CrawlerOptions>) {
  super();
  const given: Partial<CrawlerOptions> = options ?? {};

  // `||` is used where a falsy value (0, '', empty) should also fall back to the
  // default; `??` is used where an explicit `false`/`0` must be respected.
  this.#options = {
    interval: given.interval || 0,
    parallels: given.parallels || 0,
    recursive: given.recursive ?? true,
    fromList: false,
    captureImages: given.captureImages ?? true,
    executablePath: given.executablePath ?? null,
    fetchExternal: given.fetchExternal ?? true,
    scope: given.scope ?? [],
    excludes: given.excludes || [],
    excludeKeywords: given.excludeKeywords || [],
    excludeUrls: given.excludeUrls || [],
    maxExcludedDepth: given.maxExcludedDepth || 10,
    retry: given.retry ?? 3,
    disableQueries: given.disableQueries ?? false,
    verbose: given.verbose ?? false,
    userAgent: given.userAgent || `Nitpicker/${pkg.version}`,
    ignoreRobots: given.ignoreRobots ?? false,
  };

  // The checker is enabled unless the caller asked to ignore robots.txt.
  const { userAgent, ignoreRobots } = this.#options;
  this.#robotsChecker = new RobotsChecker(userAgent, !ignoreRobots);

  // Group scope URLs by hostname; entries that fail to parse are dropped.
  for (const rawScope of this.#options.scope) {
    const parsed = parseUrl(rawScope, this.#options);
    if (!parsed) {
      continue;
    }
    const sameHost = this.#scope.get(parsed.hostname) ?? [];
    this.#scope.set(parsed.hostname, [...sameHost, parsed]);
  }
}
111
-
112
/**
 * Abort the current crawl operation.
 *
 * Sets the aborted flag and immediately emits a `crawlEnd` event.
 * Queued scrape tasks check the flag when they start and exit early.
 *
 * The call is idempotent: once the crawler has been aborted, further calls
 * do nothing, so `crawlEnd` is not emitted again by repeated aborts.
 */
abort() {
  if (this.#aborted) {
    return;
  }
  this.#aborted = true;
  void this.emit('crawlEnd', {});
}
122
-
123
/**
 * Report Chromium process IDs that are still alive.
 *
 * This implementation tracks no processes and therefore always hands back
 * an empty list.
 * @returns An empty array.
 */
getUndeadPid() {
  const survivors: never[] = [];
  return survivors;
}
133
-
134
/**
 * Load the state of an earlier session so the crawl can continue from it.
 *
 * The link list is asked to restore both URL sets and returns the URLs that
 * still need scraping; those are kept as the seed for the next run. The
 * scraped list is stored as given, and every known resource URL is
 * registered so it is not treated as new again.
 * @param pending - URLs that had not been scraped when the previous session stopped.
 * @param scraped - URLs that the previous session had already scraped.
 * @param resources - Resource URLs that the previous session had already captured.
 */
resume(pending: string[], scraped: string[], resources: string[]) {
  const stillPending = this.#linkList.resume(pending, scraped, this.#options);
  this.#resumedPending = stillPending;
  this.#resumedScraped = scraped;
  resources.forEach((resourceUrl) => {
    this.#resources.add(resourceUrl);
  });
}
150
-
151
/**
 * Begin crawling from one root URL.
 *
 * The root is registered in the scope map (unless an entry with the same
 * href is already there) and in the link list. When a previous session was
 * restored with at least one scraped URL, the restored pending URLs seed the
 * run instead of the root. If there is nothing to seed the run with,
 * `crawlEnd` is emitted right away.
 *
 * The crawl itself runs in the background; a failure is reported through an
 * `error` event followed by `crawlEnd`.
 * @param url - The root URL to begin crawling from.
 */
start(url: ExURL) {
  const scopedToHost = this.#scope.get(url.hostname) ?? [];
  const alreadyScoped = scopedToHost.some((entry) => entry.href === url.href);
  if (!alreadyScoped) {
    this.#scope.set(url.hostname, [...scopedToHost, url]);
  }
  this.#linkList.add(url);

  // A non-zero count of previously scraped URLs is what marks a resumed run.
  const resumeOffset = this.#resumedScraped.length;
  const initialUrls = resumeOffset > 0 ? this.#resumedPending : [url];

  if (initialUrls.length === 0) {
    crawlerLog('Crawl End (nothing to resume)');
    void this.emit('crawlEnd', {});
    return;
  }

  const crawl = this.#runDeal(initialUrls, resumeOffset);
  void crawl.catch((error) => {
    crawlerLog('runDeal error: %O', error);
    const cause = error instanceof Error ? error : new Error(String(error));
    void this.emit('error', {
      pid: process.pid,
      isMainProcess: true,
      url: url.href,
      error: cause,
    });
    void this.emit('crawlEnd', {});
  });
}
187
-
188
/**
 * Start crawling a pre-defined list of URLs in non-recursive mode.
 *
 * Each URL in the list is added to the scope (once per distinct href) and to
 * the link list. The `recursive` option is switched off and `fromList` is
 * switched on before the crawl starts, so only the provided URLs are seeds.
 *
 * The crawl itself runs in the background; a failure is reported through an
 * `error` event (attributed to the first URL of the list) followed by `crawlEnd`.
 * @param pageList - The list of URLs to crawl. Must contain at least one URL.
 * @throws {Error} If the page list is empty.
 */
startMultiple(pageList: ExURL[]) {
  // Capture the first URL now: the error handler below runs later, and the
  // array may have been mutated by then, so re-reading `pageList[0]` there
  // is not safe.
  const firstPage = pageList[0];
  if (!firstPage) {
    throw new Error('pageList is empty');
  }

  // Per-hostname cache of hrefs already present in the scope, so membership
  // checks stay O(1) for long lists.
  const scopeMap = new Map<string, Set<string>>();
  for (const pageUrl of pageList) {
    const existing = this.#scope.get(pageUrl.hostname) || [];
    const existingHrefs =
      scopeMap.get(pageUrl.hostname) || new Set(existing.map((u) => u.href));

    if (!existingHrefs.has(pageUrl.href)) {
      this.#scope.set(pageUrl.hostname, [...existing, pageUrl]);
      existingHrefs.add(pageUrl.href);
    }

    scopeMap.set(pageUrl.hostname, existingHrefs);
    this.#linkList.add(pageUrl);
  }

  this.#options.recursive = false;
  this.#options.fromList = true;

  void this.#runDeal(pageList).catch((error) => {
    crawlerLog('runDeal error: %O', error);
    void this.emit('error', {
      pid: process.pid,
      isMainProcess: true,
      url: firstPage.href,
      error: error instanceof Error ? error : new Error(String(error)),
    });
    void this.emit('crawlEnd', {});
  });
}
228
-
229
/**
 * Report the sub-resources captured while a page was loading.
 *
 * Each entry is passed to `handleResourceResponse` together with the set of
 * known resources; a `response` event is emitted only when that helper
 * reports the resource as new. A `responseReferrers` event linking the page
 * to the resource is emitted for every entry, new or not.
 * @param resources - Sub-resource entries captured during the page load
 */
#handleResources(resources: ResourceEntry[]) {
  for (const entry of resources) {
    const captured = entry.resource as CrawlerEventTypes['response']['resource'];
    const { isNew } = handleResourceResponse(captured, this.#resources);

    if (isNew) {
      void this.emit('response', { resource: captured });
    }

    void this.emit('responseReferrers', {
      url: entry.pageUrl,
      src: entry.resource.url.withoutHash,
    });
  }
}
251
/**
 * Route a scrape result to the handler that matches its `type`.
 *
 * - `success`: hands the page data to `handleScrapeEnd`; every URL it reports
 *   is added to the link list and pushed to the dealer, optionally followed
 *   by predicted pagination URLs. Then `page` or `externalPage` is emitted.
 * - `skipped`: runs `handleIgnoreAndSkip` for the ignored URL and emits `skip`.
 * - `error`: rebuilds an `Error`, passes it to `handleScrapeError`, emits
 *   `page` / `externalPage` if that yields page data, then emits `error`.
 *
 * A result whose payload (`pageData`, `ignored`, `error`) is missing is ignored.
 * @param result - The scrape result from beholder
 * @param url - The URL that was scraped
 * @param push - Dealer's push callback to enqueue newly discovered URLs
 * @param paginationState - Mutable state for predicted pagination cascade prevention;
 *   pagination prediction is skipped entirely when omitted
 * @param paginationState.lastPushedUrl - The most recently pushed discovered URL
 *   (without hash and auth), or `null` if none has been pushed yet
 * @param paginationState.lastPushedWasPredicted - Whether the most recent push also
 *   generated predicted URLs; the next push is then not compared against it
 * @param concurrency - Current concurrency level, passed to the predicted-URL
 *   generator; pagination prediction is skipped when it is omitted or 0
 */
#handleResult(
  result: ScrapeResult,
  url: ExURL,
  push: (...items: ExURL[]) => Promise<void>,
  paginationState?: { lastPushedUrl: string | null; lastPushedWasPredicted: boolean },
  concurrency?: number,
) {
  switch (result.type) {
    case 'success': {
      const { pageData } = result;
      if (!pageData) break;

      handleScrapeEnd(
        pageData,
        this.#linkList,
        this.#scope,
        this.#options,
        (newUrl, opts) => {
          this.#linkList.add(newUrl, opts);
          void push(newUrl);

          // Everything below is predicted-pagination bookkeeping.
          if (!paginationState || !concurrency) return;
          const state = paginationState;
          const parallel = concurrency;

          // Records this URL as the latest push and whether it spawned predictions.
          const track = (spawnedPredictions: boolean) => {
            state.lastPushedUrl = newUrl.withoutHashAndAuth;
            state.lastPushedWasPredicted = spawnedPredictions;
          };

          // Metadata-only and external URLs are tracked but never used to
          // detect a pattern.
          if (opts?.metadataOnly || isExternalUrl(newUrl, this.#scope)) {
            track(false);
            return;
          }

          // Cascade prevention: no comparison right after a predicted push.
          const previousHref = state.lastPushedUrl;
          const pattern =
            previousHref && !state.lastPushedWasPredicted
              ? detectPaginationPattern(previousHref, newUrl.withoutHashAndAuth)
              : null;

          if (pattern) {
            const predictedHrefs = generatePredictedUrls(
              pattern,
              newUrl.withoutHashAndAuth,
              parallel,
            );
            for (const predictedHref of predictedHrefs) {
              const predictedUrl = parseUrl(predictedHref, this.#options);
              if (!predictedUrl) continue;
              this.#linkList.add(predictedUrl, { predicted: true });
              void push(predictedUrl);
            }
            track(true);
            return;
          }

          track(false);
        },
      );

      if (pageData.isExternal) {
        void this.emit('externalPage', { result: pageData });
      } else {
        void this.emit('page', { result: pageData });
      }
      break;
    }

    case 'skipped': {
      const { ignored } = result;
      if (!ignored) break;

      handleIgnoreAndSkip(ignored.url, this.#linkList, this.#scope, this.#options);
      void this.emit('skip', {
        url: ignored.url.href,
        reason: JSON.stringify(ignored),
        isExternal: isExternalUrl(ignored.url, this.#scope),
      });
      break;
    }

    case 'error': {
      const failure = result.error;
      if (!failure) break;

      // Rebuild an Error instance from the plain error fields of the result.
      const scrapeError = new Error(failure.message);
      scrapeError.name = failure.name;
      scrapeError.stack = failure.stack;

      const handled = handleScrapeError(
        {
          url,
          error: scrapeError,
          shutdown: failure.shutdown,
          pid: undefined,
        },
        this.#linkList,
        this.#scope,
        this.#options,
      );

      const fallbackPage = handled.result;
      if (fallbackPage) {
        if (fallbackPage.isExternal) {
          void this.emit('externalPage', { result: fallbackPage });
        } else {
          void this.emit('page', { result: fallbackPage });
        }
      }

      void this.emit('error', {
        pid: process.pid,
        isMainProcess: true,
        url: url.href,
        error: scrapeError,
      });
      break;
    }
  }
}
382
/**
 * Start a new Puppeteer browser, scrape one URL with beholder, and close the browser.
 *
 * A browser is launched on every call and closed in `finally`, whether the
 * scrape succeeded or threw. Errors raised after the launch are converted
 * into an `error` result with `shutdown: true`. A missing executable and a
 * failed launch are NOT converted; they propagate to the caller.
 * @param url - Target URL to scrape
 * @param update - Callback for progress messages
 * @param isExternal - Whether the URL is external to the crawl scope
 * @param metadataOnly - Forwarded to the scraper as its `metadataOnly` option
 * @param headCheckResult - Optional HEAD result forwarded to the scraper
 * @returns The scrape result from beholder, or an `error` result if scraping threw
 * @throws {Error} If `executablePath` is configured but does not exist on disk
 */
async #launchBrowserAndScrape(
  url: ExURL,
  update: (log: string) => void,
  isExternal: boolean,
  metadataOnly: boolean,
  headCheckResult?: PageData,
): Promise<ScrapeResult> {
  update('Launching browser%dots%');

  // Fail with a clear message before launching when the configured binary is missing.
  const { executablePath } = this.#options;
  if (executablePath) {
    const resolved = path.resolve(executablePath);
    if (!existsSync(resolved)) {
      throw new Error(`Executable path does not exist: ${resolved}`);
    }
  }

  const puppeteer = await import('puppeteer');
  const browser = await puppeteer.launch(
    executablePath ? { headless: true, executablePath } : { headless: true },
  );

  try {
    update('Creating page%dots%');
    const page = await browser.newPage();
    await page.setUserAgent(this.#options.userAgent);

    const scraper = new Scraper();
    // Show phase changes in the progress line and re-emit them to listeners.
    scraper.on('changePhase', (phaseEvent) => {
      const phaseMessage = formatPhaseLog(phaseEvent);
      if (phaseMessage) {
        update(phaseMessage);
      }
      void this.emit('changePhase', phaseEvent);
    });

    const { captureImages, excludeKeywords, disableQueries, retry } = this.#options;
    const scraped = await scraper.scrapeStart(page, url, {
      isExternal,
      // Images are never captured for external pages.
      captureImages: !isExternal && captureImages,
      excludeKeywords,
      disableQueries,
      metadataOnly,
      retries: retry,
      headCheckResult,
    });

    update('Closing browser%dots%');
    return scraped;
  } catch (error) {
    if (error instanceof Error) {
      return {
        type: 'error',
        resources: [],
        error: {
          name: error.name,
          message: error.message,
          stack: error.stack,
          shutdown: true,
        },
      };
    }
    // Non-Error throwables are stringified and reported under a generic name.
    return {
      type: 'error',
      resources: [],
      error: {
        name: 'Error',
        message: String(error),
        stack: undefined,
        shutdown: true,
      },
    };
  } finally {
    // A failure while closing is deliberately ignored.
    await browser.close().catch(() => {});
  }
}
458
/**
 * Runs the deal-based concurrent crawl loop.
 *
 * The initial URLs are handed to `deal()`. URLs discovered while scraping are
 * enqueued through the `push` callback; `onPush` rejects any URL whose
 * protocol-agnostic key has already been seen, including URLs scraped in a
 * resumed session.
 *
 * For each URL, in order: robots.txt check, exclusion check, optional
 * "external without fetch" shortcut, scrape, discard of failed predicted
 * URLs, then result and resource handling.
 *
 * When the queue is drained, `crawlEnd` is emitted, unless the crawl was
 * aborted. In that case `abort()` has already emitted `crawlEnd`, and
 * emitting it again here would notify listeners twice.
 * @param initialUrls - Starting URLs to seed the deal queue
 * @param resumeOffset - Number of URLs already scraped in a previous session,
 *   added to the progress counter for accurate display
 */
async #runDeal(initialUrls: ExURL[], resumeOffset = 0) {
  const seen = new Set<string>(
    initialUrls.map((u) => protocolAgnosticKey(u.withoutHashAndAuth)),
  );

  // Add scraped URLs to seen to prevent re-processing during resume
  for (const url of this.#resumedScraped) {
    seen.add(protocolAgnosticKey(url));
  }

  // Track external URLs; the internal counts shown in the header are derived
  // by subtracting these from deal's total/done.
  const externalUrls = new Set<string>();
  const externalDoneUrls = new Set<string>();

  // Classify the initial URLs here, because they do not pass through onPush.
  for (const url of initialUrls) {
    if (isExternalUrl(url, this.#scope)) {
      externalUrls.add(protocolAgnosticKey(url.withoutHashAndAuth));
    }
  }

  // `parallels` of 0 selects the default; any other value is clamped to at least 1.
  const concurrency = this.#options.parallels
    ? Math.max(this.#options.parallels, 1)
    : Crawler.MAX_PROCESS_LENGTH;

  // Predicted pagination state
  const paginationState = {
    lastPushedUrl: null as string | null,
    lastPushedWasPredicted: false,
  };

  await deal(
    initialUrls,
    (url, update, _index, setLineHeader, push) => {
      const isExternal = isExternalUrl(url, this.#scope);
      const urlText = isExternal ? c.dim(url.href) : c.cyan(url.href);
      setLineHeader(`%braille% ${urlText}: `);
      injectScopeAuth(url, this.#scope);
      this.#linkList.add(url);
      this.#linkList.progress(url);

      return async () => {
        // Tasks that start after abort() exit without doing any work.
        if (this.#aborted) return;
        const log = createTimedUpdate(update, this.#options.verbose);

        try {
          const robotsAllowed = await this.#robotsChecker.isAllowed(url);
          if (!robotsAllowed) {
            handleIgnoreAndSkip(url, this.#linkList, this.#scope, this.#options);
            void this.emit('skip', {
              url: url.href,
              reason: 'blocked by robots.txt',
              isExternal,
            });
            log(c.gray('Blocked by robots.txt'));
            return;
          }

          const isSkip = shouldSkipUrl({
            url,
            excludes: this.#options.excludes,
            excludeUrls: this.#options.excludeUrls,
            options: this.#options,
          });

          if (isSkip) {
            handleIgnoreAndSkip(url, this.#linkList, this.#scope, this.#options);
            void this.emit('skip', { url: url.href, reason: 'excluded', isExternal });
            log(c.gray('Skipped'));
            return;
          }

          // External pages are recorded from the link alone when fetching
          // external pages is disabled.
          if (!this.#options.fetchExternal && isExternal) {
            const pageData = linkToPageData({
              url,
              isExternal,
              isLowerLayer: false,
            });
            this.#linkList.done(url, this.#scope, { page: pageData }, this.#options);
            void this.emit('externalPage', { result: pageData });
            log(c.dim('External (skip fetch)'));
            return;
          }

          const metadataOnly = this.#linkList.isMetadataOnly(url.withoutHash);
          const isPredicted = this.#linkList.isPredicted(url.withoutHashAndAuth);

          log('Scraping%dots%');
          const result = await this.#scrapePage(url, log, metadataOnly);

          // Discard predicted URLs that failed (404, error, etc.)
          if (isPredicted && shouldDiscardPredicted(result)) {
            handleIgnoreAndSkip(url, this.#linkList, this.#scope, this.#options);
            log(c.dim('Predicted (discarded)'));
            return;
          }

          log('Saving results%dots%');
          this.#handleResult(result, url, push, paginationState, concurrency);
          this.#handleResources(result.resources);
          log(formatResultSummary(result));
        } finally {
          // Count the external URL as done on every exit path of the try block.
          if (isExternal) {
            externalDoneUrls.add(protocolAgnosticKey(url.withoutHashAndAuth));
          }
        }
      };
    },
    {
      limit: concurrency,
      interval: this.#options.interval,
      verbose: this.#options.verbose || !process.stdout.isTTY,
      // Header shows internal done/total, external done/total (dimmed),
      // overall percentage, and the parallel limit.
      header: (_progress, done, total, limit) => {
        const allDone = done + resumeOffset;
        const allTotal = total + resumeOffset;
        const extTotal = externalUrls.size;
        const extDone = externalDoneUrls.size;
        const pct = allTotal > 0 ? Math.round((allDone / allTotal) * 100) : 0;
        return (
          c.bold(`Crawling: ${allDone - extDone}/${allTotal - extTotal}`) +
          c.dim(`(${extDone}/${extTotal})`) +
          c.bold(` (${pct}%) [${limit} parallel]`)
        );
      },
      // Returning false rejects a URL that has already been queued or scraped.
      onPush: (url) => {
        const key = protocolAgnosticKey(url.withoutHashAndAuth);
        if (seen.has(key)) return false;
        seen.add(key);
        if (isExternalUrl(url, this.#scope)) {
          externalUrls.add(key);
        }
        return true;
      },
    },
  );

  crawlerLog('Crawl End');
  // abort() has already emitted `crawlEnd`; do not emit it a second time.
  if (!this.#aborted) {
    void this.emit('crawlEnd', {});
  }
}
608
/**
 * Decide how a single URL is scraped and run the cheapest sufficient path.
 *
 * 1. Non-HTTP URL: hand straight to the browser scraper.
 * 2. HEAD pre-flight: if it throws, return an `error` result
 *    (`shutdown: false`) without launching a browser.
 * 3. `metadataOnly`: for HTML or unknown content types, try a size-limited GET
 *    to obtain the title; if that fails or the content is not HTML, fall back
 *    to the HEAD data. Either way the page is marked `isTarget: false`.
 * 4. Non-HTML content: return the HEAD data as-is.
 * 5. HTML or unknown content type: launch the browser, passing the HEAD data along.
 * @param url - Target URL to scrape
 * @param update - Callback for progress messages
 * @param metadataOnly - When true, skip the browser and collect metadata only
 * @returns The scrape result
 */
async #scrapePage(
  url: ExURL,
  update: (log: string) => void,
  metadataOnly: boolean,
): Promise<ScrapeResult> {
  const isExternal = isExternalUrl(url, this.#scope);

  // mailto:, tel:, etc.: the scraper itself deals with these.
  if (!url.isHTTP) {
    return this.#launchBrowserAndScrape(url, update, isExternal, metadataOnly);
  }

  update('HEAD request%dots%');
  let preflight: PageData;
  try {
    preflight = await this.#sendHeadRequest(url, isExternal, update);
  } catch (error) {
    // The server could not be reached, so no browser is launched.
    update(c.red('Unreachable'));
    if (error instanceof Error) {
      return {
        type: 'error',
        resources: [],
        error: {
          name: error.name,
          message: error.message,
          stack: error.stack,
          shutdown: false,
        },
      };
    }
    return {
      type: 'error',
      resources: [],
      error: {
        name: 'Error',
        message: String(error),
        stack: undefined,
        shutdown: false,
      },
    };
  }

  // A missing content type is treated the same as HTML.
  const htmlOrUnknown =
    preflight.contentType === null || preflight.contentType === 'text/html';

  if (metadataOnly) {
    if (htmlOrUnknown) {
      update('Fetching title%dots%');
      try {
        const titled = await fetchDestination({
          url,
          isExternal,
          method: 'GET',
          options: { titleBytesLimit: 16_384 },
          userAgent: this.#options.userAgent,
        });
        return {
          type: 'success',
          pageData: { ...titled, isTarget: false },
          resources: [],
        };
      } catch (error) {
        // A failed title fetch only costs the title; the HEAD data is used below.
        crawlerLog('Title GET failed for %s: %O', url.href, error);
      }
    }
    return {
      type: 'success',
      pageData: { ...preflight, isTarget: false },
      resources: [],
    };
  }

  if (!htmlOrUnknown) {
    return {
      type: 'success',
      pageData: preflight,
      resources: [],
    };
  }

  return this.#launchBrowserAndScrape(
    url,
    update,
    isExternal,
    metadataOnly,
    preflight,
  );
}
706
/**
 * Probes the target URL with a lightweight request before any browser is launched,
 * retrying on failure.
 *
 * WHY pre-flight: launching Puppeteer is expensive. Checking reachability and
 * content type up front lets the caller skip the browser for URLs that are
 * unreachable, non-HTML, or answer with an error status.
 *
 * NOTE(review): no `method` is passed to `fetchDestination` here; this presumably
 * defaults to HEAD (the label below says so) — confirm against `fetchDestination`.
 * @param url - Target URL to check
 * @param isExternal - Whether the URL is external to the crawl scope
 * @param update - Callback receiving progress messages (retry countdown, give-up notice)
 * @returns Page data resolved by `fetchDestination` for the probe request
 */
async #sendHeadRequest(
	url: ExURL,
	isExternal: boolean,
	update: (msg: string) => void,
): Promise<PageData> {
	// The user agent is read inside the closure so every attempt sees the current option value.
	const probe = () =>
		fetchDestination({ url, isExternal, userAgent: this.#options.userAgent });

	return retryCall(probe, {
		retries: this.#options.retry,
		label: 'HEAD request',
		// Report the failure together with a countdown placeholder until the next attempt.
		onWait: (interval, attempt, label, error) => {
			const countdown = `%countdown(${interval},fetchHead_${attempt},s)%s`;
			update(`${label}: ${error.message} — ${countdown} (retry #${attempt + 1})`);
		},
		// All retries consumed: surface the final error in red.
		onGiveUp: (attempts, error, label) => {
			const notice = `${label}: gave up after ${attempts} retries — ${error.message}`;
			update(c.red(notice));
		},
	});
}
740
-
741
- /**
742
- * Default upper limit on the number of scraping processes that run concurrently.
743
- *
744
- * Used as the fallback when `parallels` is not specified or is set to 0.
745
- */
746
- static readonly MAX_PROCESS_LENGTH = 10;
747
- }
748
-
749
/**
 * Builds the "Done (<status>)" label shown when a scrape finishes and tints it
 * according to the HTTP status range.
 *
 * - below 300: green
 * - 300 to 399: yellow
 * - 400 and above: red
 * - missing or falsy status: returned uncolored (`?` is shown when the status is absent)
 * @param status - The HTTP status code, or `undefined` if unknown.
 * @returns The "Done (status)" label, colorized when a status is available.
 */
function colorStatus(status: number | undefined) {
	const label = `Done (${status ?? '?'})`;

	// Nothing to classify: leave the label plain.
	if (!status) {
		return label;
	}

	// Classify from the highest range downwards.
	if (status >= 400) {
		return c.red(label);
	}
	if (status >= 300) {
		return c.yellow(label);
	}
	return c.green(label);
}
766
-
767
/**
 * Translates a beholder phase event into the line shown in the dealer display.
 *
 * Phases fall into four groups: silent ones (`null`), phases with a fixed label,
 * phases that forward the event's own message (optionally colorized), and
 * anything unrecognized, which is shown by its phase name.
 * @param e - The phase change event from beholder
 * @returns The message to display, or `null` when the phase should produce no output
 */
function formatPhaseLog(e: ChangePhaseEvent): string | null {
	switch (e.name) {
		// Start/end markers are not shown.
		case 'scrapeStart':
		case 'scrapeEnd':
			return null;

		// Fixed labels.
		case 'headRequest':
			return 'HEAD request%dots%';
		case 'loadDOMContent':
			return c.dim('DOM loaded');
		case 'getHTML':
			return 'Getting HTML%dots%';
		case 'waitNetworkIdle':
			return 'Waiting for network idle%dots%';
		case 'getAnchors':
			return 'Extracting anchors%dots%';
		case 'getMeta':
			return 'Extracting meta%dots%';
		case 'extractImages':
			return 'Fetching images%dots%';

		// The event already carries the text to show.
		case 'openPage':
		case 'setViewport':
		case 'scrollToBottom':
		case 'waitImageLoad':
		case 'retryWait':
		case 'getImages':
			return e.message;

		// Event text with emphasis.
		case 'retryExhausted':
			return c.red(e.message);
		case 'pageSkipped':
			return c.yellow(`Skipped: ${e.message}`);

		// Unrecognized phase: fall back to its name.
		default:
			return e.name;
	}
}
823
-
824
/**
 * Wraps an update callback so each message is suffixed with the time elapsed
 * since the previous call (e.g. `+42ms`).
 *
 * When `verbose` is falsy the original callback is returned untouched, so
 * non-verbose runs pay no overhead and keep callback identity.
 *
 * The elapsed time is measured with `Date.now()`, which is wall-clock time and
 * may jump backwards (e.g. on a system clock adjustment). The delta is
 * therefore clamped at 0 so the suffix never renders as `+-12ms`.
 * @param update - The original dealer update callback
 * @param verbose - Whether verbose mode is enabled
 * @returns The original callback, or a wrapper that appends timing information
 */
function createTimedUpdate(
	update: (msg: string) => void,
	verbose?: boolean,
): (msg: string) => void {
	if (!verbose) {
		return update;
	}

	// Timestamp of the previous call; the first delta is measured from wrapper creation.
	let prev = Date.now();

	return (msg: string) => {
		const now = Date.now();
		// Clamp: a backwards clock step must not produce a negative duration.
		const delta = Math.max(0, now - prev);
		// Always resync to the latest reading, even after a backwards step.
		prev = now;
		update(`${msg} ${c.dim(`+${delta}ms`)}`);
	};
}
844
-
845
/**
 * Produces the one-line outcome shown in the dealer display for a finished scrape.
 *
 * - `skipped` and `error` results get a fixed, colorized word.
 * - `success` results show the colorized status label; when the page data is
 *   flagged `isTarget`, anchor, image and resource counts are appended.
 * - Any other result type is shown as its raw `type` value.
 * @param result - The scrape result to summarize
 * @returns A colorized summary string
 */
function formatResultSummary(result: ScrapeResult): string {
	if (result.type === 'skipped') {
		return c.gray('Skipped');
	}
	if (result.type === 'error') {
		return c.red('Error');
	}
	if (result.type !== 'success') {
		return result.type;
	}

	const page = result.pageData;
	const statusLabel = colorStatus(page?.status);

	// Without target page data only the status is reported.
	if (!page?.isTarget) {
		return statusLabel;
	}

	const anchorPart = c.cyan(`\u{1F517} ${page.anchorList.length}`);
	const imagePart = c.magenta(`\u{1F5BC}\u{FE0F} ${page.imageList.length}`);
	const resourcePart = c.dim(`\u{1F4E6} ${result.resources.length}`);
	return [statusLabel, anchorPart, imagePart, resourcePart].join(' ');
}