@d-zero/beholder 0.1.29 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. package/CHANGELOG.md +11 -0
  2. package/README.md +172 -477
  3. package/dist/debug.d.ts +4 -1
  4. package/dist/debug.js +5 -2
  5. package/dist/dom-evaluation.d.ts +72 -14
  6. package/dist/dom-evaluation.js +169 -43
  7. package/dist/index.d.ts +20 -3
  8. package/dist/index.js +15 -3
  9. package/dist/is-error.d.ts +8 -0
  10. package/dist/is-error.js +10 -0
  11. package/dist/keyword-check.d.ts +5 -3
  12. package/dist/keyword-check.js +5 -3
  13. package/dist/parse-url.d.ts +14 -0
  14. package/dist/parse-url.js +23 -0
  15. package/dist/scraper.d.ts +39 -13
  16. package/dist/scraper.js +300 -263
  17. package/dist/types.d.ts +286 -214
  18. package/dist/types.js +6 -0
  19. package/package.json +7 -10
  20. package/src/debug.ts +5 -2
  21. package/src/dom-evaluation.ts +195 -65
  22. package/src/index.ts +27 -3
  23. package/src/is-error.spec.ts +33 -0
  24. package/src/is-error.ts +10 -0
  25. package/src/keyword-check.spec.ts +45 -4
  26. package/src/keyword-check.ts +5 -3
  27. package/src/parse-url.spec.ts +35 -0
  28. package/src/parse-url.ts +26 -0
  29. package/src/scraper.ts +338 -300
  30. package/src/types.ts +345 -258
  31. package/tsconfig.tsbuildinfo +1 -1
  32. package/dist/events.d.ts +0 -32
  33. package/dist/events.js +0 -15
  34. package/dist/fetch-destination.d.ts +0 -8
  35. package/dist/fetch-destination.js +0 -145
  36. package/dist/net-timeout-error.d.ts +0 -3
  37. package/dist/net-timeout-error.js +0 -3
  38. package/dist/sub-process-runner.d.ts +0 -12
  39. package/dist/sub-process-runner.js +0 -180
  40. package/dist/sub-process.d.ts +0 -1
  41. package/dist/sub-process.js +0 -67
  42. package/dist/utils.d.ts +0 -16
  43. package/dist/utils.js +0 -69
  44. package/src/events.ts +0 -21
  45. package/src/fetch-destination.ts +0 -173
  46. package/src/net-timeout-error.ts +0 -3
  47. package/src/sub-process-runner.ts +0 -220
  48. package/src/sub-process.ts +0 -86
  49. package/src/utils.ts +0 -89
package/src/scraper.ts CHANGED
@@ -1,130 +1,119 @@
1
1
  import type {
2
- ScrapeEventTypes,
2
+ ChangePhaseEvent,
3
+ ResourceEntry,
4
+ ScraperEventTypes,
5
+ ScraperOptions,
6
+ ScrapeResult,
7
+ ExURL,
3
8
  ImageElement,
4
9
  NetworkLog,
5
10
  PageData,
6
11
  ParseURLOptions,
7
12
  Resource,
8
13
  SkippedPageData,
9
- ExURL,
10
14
  } from './types.js';
11
- import type { Browser, Page } from 'puppeteer';
15
+ import type { PageScanPhase } from '@d-zero/puppeteer-page-scan';
16
+ import type { Page } from 'puppeteer';
12
17
 
13
- import { beforePageScan } from '@d-zero/puppeteer-page-scan';
14
- import { parseUrl } from '@d-zero/shared/parse-url';
15
- import { retry } from '@d-zero/shared/retry';
16
- import { TypedAwaitEventEmitter } from '@d-zero/shared/typed-await-event-emitter';
17
- import { launch } from 'puppeteer';
18
+ import { beforePageScan, devicePresets } from '@d-zero/puppeteer-page-scan';
19
+ import { detectCDN } from '@d-zero/shared/detect-cdn';
20
+ import { detectCompress } from '@d-zero/shared/detect-compress';
21
+ import { retry as retryable } from '@d-zero/shared/retry';
22
+ import { TypedAwaitEventEmitter as EventEmitter } from '@d-zero/shared/typed-await-event-emitter';
18
23
 
19
24
  import { resourceLog, scraperLog } from './debug.js';
20
25
  import { getAnchorList, getImageList, getMeta } from './dom-evaluation.js';
21
- import { fetchDestination } from './fetch-destination.js';
26
+ import { isError } from './is-error.js';
22
27
  import { keywordCheck } from './keyword-check.js';
23
- import { detectCDN, detectCompress, isError } from './utils.js';
28
+ import { parseUrl } from './parse-url.js';
24
29
 
25
30
  const pid = `${process.pid}`;
26
31
  const log = scraperLog.extend(pid);
27
32
  const rLog = resourceLog.extend(pid);
28
33
 
29
- const LAUNCH_BROWSER_TIMEOUT = 1000 * 30;
30
-
31
- export type ScraperOptions = {
32
- isExternal: boolean;
33
- isGettingImages: boolean;
34
- excludeKeywords: string[];
35
- executablePath: string | null;
36
- isTitleOnly: boolean;
37
- screenshot: string | null;
38
- } & ParseURLOptions;
39
-
40
- export default class Scraper extends TypedAwaitEventEmitter<ScrapeEventTypes> {
41
- #browser: Browser | null = null;
42
- #url: ExURL | null = null;
43
-
44
- async destroy(isExternal: boolean) {
45
- log('Scraper destroys self');
46
- if (!this.#url) {
47
- throw new Error('The instance is already destroyed.');
48
- }
49
- if (!this.#browser) {
50
- void this.emit('destroyed', {
51
- pid: process.pid,
52
- });
53
- void this.emit('changePhase', {
54
- pid: process.pid,
55
- name: 'destroyed',
56
- url: this.#url,
57
- isExternal,
58
- message: '',
59
- });
60
- return;
61
- }
62
- while (!this.#browser.isConnected()) {
63
- log('Browser closes all pages');
64
- const pages = await this.#browser.pages();
65
- for (const page of pages) {
66
- page.removeAllListeners();
67
- if (!page.isClosed) {
68
- await page.close();
69
- }
70
- }
71
- log('Browser closes self');
72
- await this.#browser.close();
73
- log('Browser disconnects');
74
- await this.#browser.disconnect();
75
- }
76
- log('Scraper discards browser');
77
- this.#browser = null;
78
- void this.emit('destroyed', {
79
- pid: process.pid,
80
- });
81
- void this.emit('changePhase', {
82
- pid: process.pid,
83
- name: 'destroyed',
84
- url: this.#url,
85
- isExternal,
86
- message: '',
87
- });
88
- }
89
-
90
- async scrapeStart(url: ExURL, options?: Partial<ScraperOptions>, isSkip = false) {
34
+ /**
35
+ * Page-level scraper that extracts data from a single browser page.
36
+ *
37
+ * The scraper returns results as values from `scrapeStart()` rather than
38
+ * emitting them as events. Only streaming events (changePhase, resourceResponse)
39
+ * are emitted for progress monitoring.
40
+ *
41
+ * The Puppeteer `Page` object is injected externally, and page lifecycle
42
+ * (including `page.close()`) is managed by the caller.
43
+ * @example
44
+ * ```ts
45
+ * const scraper = new Scraper();
46
+ * scraper.on('changePhase', (e) => console.log(e.name));
47
+ * const result = await scraper.scrapeStart(page, url, { isExternal: false });
48
+ * ```
49
+ */
50
+ // eslint-disable-next-line unicorn/prefer-event-target -- TypedAwaitEventEmitter is a project-specific typed wrapper, not Node.js EventEmitter
51
+ export default class Scraper extends EventEmitter<ScraperEventTypes> {
52
+ /** Number of retries for `@retryable`-decorated methods. Set per-scrape from options. */
53
+ retries?: number;
54
+
55
+ /**
56
+ * Begins the scraping process for a given URL on the provided Puppeteer page.
57
+ *
58
+ * Returns a `ScrapeResult` containing the outcome:
59
+ * - `type: "success"` with `pageData` on success
60
+ * - `type: "skipped"` with `ignored` details when the page is excluded
61
+ * - `type: "error"` with `error` details when scraping fails
62
+ *
63
+ * Sub-resources are collected via the `resourceResponse` event and
64
+ * included in the returned `ScrapeResult.resources`.
65
+ * @param page - The Puppeteer page instance to use for navigation and DOM evaluation.
66
+ * @param url - The extended URL to scrape.
67
+ * @param options - Optional scraper configuration overriding defaults.
68
+ * @param isSkip - When `true`, the page is immediately skipped without any network requests.
69
+ * @returns The scrape result containing the outcome and captured resources.
70
+ */
71
+ async scrapeStart(
72
+ page: Page,
73
+ url: ExURL,
74
+ options?: Partial<ScraperOptions>,
75
+ isSkip = false,
76
+ ): Promise<ScrapeResult> {
77
+ this.retries = options?.retries;
91
78
  const isExternal = options?.isExternal ?? false;
92
- const isGettingImages = options?.isGettingImages ?? true;
79
+ const captureImages = options?.captureImages ?? true;
93
80
  const excludeKeywords = options?.excludeKeywords ?? [];
94
- const executablePath = options?.executablePath ?? null;
95
- const isTitleOnly = options?.isTitleOnly ?? false;
81
+ const metadataOnly = options?.metadataOnly ?? false;
82
+ const imageLoadTimeout = options?.imageLoadTimeout ?? 5000;
83
+ const resources: ResourceEntry[] = [];
96
84
 
97
- this.#url = url;
98
85
  void this.emit('changePhase', {
99
86
  pid: process.pid,
100
87
  name: 'scrapeStart',
101
- url: this.#url,
88
+ url,
102
89
  isExternal,
103
90
  message: '',
104
91
  });
105
92
 
93
+ // Path-excluded: return SkippedPageData
106
94
  if (isSkip) {
107
- void this.emit('ignoreAndSkip', {
108
- pid: process.pid,
109
- url: this.#url,
110
- reason: {
111
- matchedText: this.#url.pathname || '',
112
- excludeKeywords,
113
- },
114
- });
115
95
  void this.emit('changePhase', {
116
96
  pid: process.pid,
117
- name: 'ignoreAndSkip',
118
- url: this.#url,
97
+ name: 'pageSkipped',
98
+ url,
119
99
  isExternal,
120
100
  message: 'Matched: excluded path',
121
101
  });
122
- return;
102
+ return {
103
+ type: 'skipped',
104
+ resources,
105
+ ignored: {
106
+ url,
107
+ matchedText: url.pathname || '',
108
+ excludeKeywords,
109
+ },
110
+ };
123
111
  }
124
112
 
125
- if (!this.#url.isHTTP) {
113
+ // Non-HTTP protocol: return minimal PageData
114
+ if (!url.isHTTP) {
126
115
  const result: PageData = {
127
- url: this.#url,
116
+ url,
128
117
  isTarget: false,
129
118
  isExternal,
130
119
  redirectPaths: [],
@@ -142,69 +131,44 @@ export default class Scraper extends TypedAwaitEventEmitter<ScrapeEventTypes> {
142
131
  isSkipped: false,
143
132
  };
144
133
 
145
- void this.emit('scrapeEnd', {
146
- pid: process.pid,
147
- url: this.#url,
148
- timestamp: Date.now(),
149
- result,
150
- });
151
-
152
134
  void this.emit('changePhase', {
153
135
  pid: process.pid,
154
136
  name: 'scrapeEnd',
155
- url: this.#url,
137
+ url,
156
138
  isExternal,
157
139
  message: '',
158
140
  });
159
- return;
141
+ return { type: 'success', pageData: result, resources };
160
142
  }
161
143
 
162
- void this.emit('changePhase', {
163
- pid: process.pid,
164
- name: 'touchHead',
165
- url: this.#url,
166
- isExternal,
167
- message: '',
168
- });
144
+ let headResult: PageData | SkippedPageData | null = options?.headCheckResult ?? null;
169
145
 
170
- let result: PageData | SkippedPageData | Error | null = await this.#fetchHead(
171
- url,
172
- isExternal,
173
- );
174
-
175
- if (result instanceof Error) {
176
- log('Error(FETCH_HEAD): %s', url.href);
177
- void this.emit('error', {
146
+ if (headResult && metadataOnly) {
147
+ void this.emit('changePhase', {
178
148
  pid: process.pid,
179
- url: this.#url,
180
- shutdown: false,
181
- error: result,
149
+ name: 'scrapeEnd',
150
+ url,
151
+ isExternal,
152
+ message: '',
182
153
  });
183
- result = null;
184
- }
185
-
186
- if (result && isTitleOnly) {
187
- void this.emit('scrapeEnd', {
188
- pid: process.pid,
189
- url: this.#url,
190
- timestamp: Date.now(),
191
- result: {
192
- ...result,
154
+ return {
155
+ type: 'success',
156
+ pageData: {
157
+ ...headResult,
193
158
  isTarget: false,
194
159
  },
195
- });
196
- return;
160
+ resources,
161
+ };
197
162
  }
198
163
 
199
- if (result === null || result.contentType === 'text/html') {
200
- const headlessMode: true | 'shell' = url.isSecure ? true : 'shell';
201
- const page = await this.#createPage(isExternal, executablePath, headlessMode);
202
-
203
- result = await this.#fetchData(
164
+ if (headResult === null || headResult.contentType === 'text/html') {
165
+ const fetchResult = await this.#fetchData(
204
166
  page,
205
167
  url,
206
168
  isExternal,
207
- isGettingImages,
169
+ captureImages,
170
+ imageLoadTimeout,
171
+ resources,
208
172
  options,
209
173
  ).catch((error) => {
210
174
  if (error instanceof Error) {
@@ -213,28 +177,29 @@ export default class Scraper extends TypedAwaitEventEmitter<ScrapeEventTypes> {
213
177
  return new Error(error);
214
178
  });
215
179
 
216
- if (result instanceof Error) {
180
+ if (fetchResult instanceof Error) {
217
181
  log('Error(FETCH_DATA): %s', url.href);
218
- void this.emit('error', {
219
- pid: process.pid,
220
- url: this.#url,
221
- shutdown: true,
222
- error: result,
223
- });
224
- await this.destroy(isExternal);
225
- return;
182
+ page.removeAllListeners();
183
+ return {
184
+ type: 'error',
185
+ resources,
186
+ error: {
187
+ name: fetchResult.name,
188
+ message: fetchResult.message,
189
+ stack: fetchResult.stack,
190
+ shutdown: true,
191
+ },
192
+ };
226
193
  }
227
194
 
228
195
  page.removeAllListeners();
229
- if (!page.isClosed) {
230
- await page.close();
231
- }
196
+ headResult = fetchResult;
232
197
 
233
- if (!result.isSkipped) {
234
- const checkedKeyword = keywordCheck(result.html, excludeKeywords);
198
+ if (!headResult.isSkipped) {
199
+ const checkedKeyword = keywordCheck(headResult.html, excludeKeywords);
235
200
 
236
201
  if (checkedKeyword) {
237
- result = {
202
+ headResult = {
238
203
  url,
239
204
  isSkipped: true,
240
205
  matched: {
@@ -246,136 +211,135 @@ export default class Scraper extends TypedAwaitEventEmitter<ScrapeEventTypes> {
246
211
  }
247
212
  }
248
213
 
249
- if (result.isSkipped) {
250
- if (result.matched.type === 'path') {
251
- return;
214
+ if (headResult.isSkipped) {
215
+ if (headResult.matched.type === 'path') {
216
+ return {
217
+ type: 'skipped',
218
+ resources,
219
+ ignored: {
220
+ url,
221
+ matchedText: url.pathname || '',
222
+ excludeKeywords,
223
+ },
224
+ };
252
225
  }
253
- void this.emit('ignoreAndSkip', {
254
- pid: process.pid,
255
- url: this.#url,
256
- reason: {
257
- matchedText: result.matched.text,
258
- excludeKeywords,
259
- },
260
- });
261
226
  void this.emit('changePhase', {
262
227
  pid: process.pid,
263
- name: 'ignoreAndSkip',
264
- url: this.#url,
228
+ name: 'pageSkipped',
229
+ url,
265
230
  isExternal,
266
- message: `Matched: "${result.matched.text}"`,
231
+ message: `Matched: "${headResult.matched.text}"`,
267
232
  });
268
- return;
233
+ return {
234
+ type: 'skipped',
235
+ resources,
236
+ ignored: {
237
+ url,
238
+ matchedText: headResult.matched.text,
239
+ excludeKeywords,
240
+ },
241
+ };
269
242
  }
270
243
  }
271
244
 
272
- void this.emit('scrapeEnd', {
273
- pid: process.pid,
274
- url: this.#url,
275
- timestamp: Date.now(),
276
- result,
277
- });
278
-
279
245
  void this.emit('changePhase', {
280
246
  pid: process.pid,
281
247
  name: 'scrapeEnd',
282
- url: this.#url,
248
+ url,
283
249
  isExternal,
284
250
  message: '',
285
251
  });
286
252
 
287
- return result;
253
+ return { type: 'success', pageData: headResult, resources };
288
254
  }
289
255
 
290
- @retry()
291
- async #bootBrowser(
256
+ /**
257
+ * Creates a callback for `@d-zero/puppeteer-page-scan`'s `beforePageScan` listener.
258
+ *
259
+ * WHY a separate factory: The listener must capture `isExternal` for phase events
260
+ * while conforming to the `beforePageScan` listener signature.
261
+ * Currently only handles the `scroll` phase to report scroll progress.
262
+ * @param isExternal - Whether the current page is external to the crawl scope
263
+ * @returns A listener function compatible with `beforePageScan`'s `listener` option
264
+ */
265
+ #createPageScanListener(
292
266
  isExternal: boolean,
293
- executablePath: string | null,
294
- headless: boolean | 'shell',
295
- ) {
296
- if (!this.#browser) {
267
+ ): (phase: keyof PageScanPhase, data: PageScanPhase[keyof PageScanPhase]) => void {
268
+ return (phase, data) => {
269
+ switch (phase) {
270
+ case 'scroll': {
271
+ const d = data as PageScanPhase['scroll'];
272
+ const scrollMsg = Number.isNaN(d.scrollHeight)
273
+ ? `%propeller% ${d.message}`
274
+ : `%propeller% ${d.scrollY}px/${d.scrollHeight}px (${Math.round((d.scrollY / d.scrollHeight) * 100)}%) ${d.message}`;
275
+ void this.emit('changePhase', {
276
+ pid: process.pid,
277
+ name: 'scrollToBottom',
278
+ url: null,
279
+ isExternal,
280
+ message: scrollMsg,
281
+ } satisfies ChangePhaseEvent);
282
+ break;
283
+ }
284
+ }
285
+ };
286
+ }
287
+ /**
288
+ * Navigates the page to the target URL and extracts full page data.
289
+ *
290
+ * WHY retryable with 3-min timeout: Page navigation can fail due to transient
291
+ * network issues or slow-loading pages. The decorator retries automatically,
292
+ * emitting `retryWait` / `retryExhausted` phase events for progress monitoring.
293
+ *
294
+ * Flow:
295
+ * 1. Register request/response listeners to capture sub-resources (internal pages only)
296
+ * 2. Navigate to URL via `page.goto()` and track redirect chain
297
+ * 3. Wait for DOM content and network idle
298
+ * 4. Extract anchors, meta, and optionally images
299
+ * 5. Check for keyword exclusion in HTML content
300
+ * @param page - Puppeteer page instance
301
+ * @param url - Target URL to navigate to
302
+ * @param isExternal - Whether the URL is external to the crawl scope
303
+ * @param captureImages - Whether to run the image extraction pipeline
304
+ * @param imageLoadTimeout - Timeout (ms) for waiting lazy-loaded images to complete
305
+ * @param resources - Mutable array to collect captured sub-resources into
306
+ * @param options - Additional scraper options (e.g. `disableQueries`, `navigationTimeout`)
307
+ * @returns Full page data or skipped page data if an exclusion rule matched
308
+ */
309
+ @retryable({
310
+ timeout: 3 * 60 * 1000,
311
+ onWait(this: Scraper, determinedInterval, retryCount, methodName, error) {
297
312
  void this.emit('changePhase', {
298
313
  pid: process.pid,
299
- name: 'launchBrowser',
300
- url: this.#url,
301
- isExternal,
302
- message: executablePath || '(executablePath is default)',
314
+ name: 'retryWait',
315
+ url: null,
316
+ isExternal: false,
317
+ message: `${methodName}: ${error.message} — %countdown(${determinedInterval},${methodName}_${retryCount},s)%s (retry #${retryCount + 1})`,
303
318
  });
304
-
305
- const browser = await launch({
306
- headless,
307
- timeout: LAUNCH_BROWSER_TIMEOUT,
308
- executablePath: executablePath ?? undefined,
309
- args: [
310
- // TODO: Optional lang
311
- '--lang=ja',
312
- '--no-zygote',
313
- '--ignore-certificate-errors',
314
- ],
315
- }).catch((error) => {
316
- if (error instanceof Error) {
317
- return error;
318
- }
319
- throw error;
319
+ },
320
+ onGiveUp(this: Scraper, retryCount, error, methodName) {
321
+ void this.emit('changePhase', {
322
+ pid: process.pid,
323
+ name: 'retryExhausted',
324
+ url: null,
325
+ isExternal: false,
326
+ message: `${methodName}: gave up after ${retryCount} retries — ${error.message}`,
320
327
  });
321
-
322
- if (browser instanceof Error) {
323
- void this.emit('error', {
324
- pid: process.pid,
325
- url: this.#url!,
326
- shutdown: false,
327
- error: browser,
328
- });
329
- throw browser;
330
- }
331
-
332
- this.#browser = browser;
333
- } else if (!this.#browser.isConnected()) {
334
- await this.#browser.close();
335
- }
336
-
337
- return this.#browser;
338
- }
339
-
340
- @retry()
341
- async #createPage(
342
- isExternal: boolean,
343
- executablePath: string | null,
344
- headless: boolean | 'shell',
345
- ) {
346
- const browser = await this.#bootBrowser(isExternal, executablePath, headless);
347
-
348
- void this.emit('changePhase', {
349
- pid: process.pid,
350
- name: 'newPage',
351
- url: this.#url,
352
- isExternal,
353
- message: '',
354
- });
355
-
356
- const page = await browser.newPage();
357
- page.setDefaultNavigationTimeout(0);
358
- await page.setUserAgent(
359
- 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
360
- );
361
- await page.setExtraHTTPHeaders({
362
- // TODO: Optional lang
363
- 'Accept-Language': 'ja-JP',
364
- });
365
-
366
- return page;
367
- }
368
-
369
- @retry({
370
- timeout: 1 * 60 * 1000, // 1sec,
328
+ },
371
329
  })
372
330
  async #fetchData(
373
331
  page: Page,
374
332
  url: ExURL,
375
333
  isExternal: boolean,
376
- isGettingImages: boolean,
377
- options?: ParseURLOptions,
334
+ captureImages: boolean,
335
+ imageLoadTimeout: number,
336
+ resources: ResourceEntry[],
337
+ options?: Partial<ScraperOptions>,
378
338
  ): Promise<PageData | SkippedPageData> {
339
+ const parseOpts: ParseURLOptions | undefined =
340
+ options?.disableQueries == null
341
+ ? undefined
342
+ : { disableQueries: options.disableQueries };
379
343
  const networkLogs: Record<string, NetworkLog> = {};
380
344
 
381
345
  page.on('dialog', async (dialog) => {
@@ -390,7 +354,7 @@ export default class Scraper extends TypedAwaitEventEmitter<ScrapeEventTypes> {
390
354
 
391
355
  if (!isExternal) {
392
356
  page.on('request', (request) => {
393
- const url = parseUrl(request.url(), options);
357
+ const url = parseUrl(request.url(), parseOpts)!;
394
358
  networkLogs[request.url()] = {
395
359
  url,
396
360
  status: null,
@@ -407,7 +371,7 @@ export default class Scraper extends TypedAwaitEventEmitter<ScrapeEventTypes> {
407
371
 
408
372
  const uniqueRes = new Set<string>();
409
373
  page.on('response', (response) => {
410
- const resURL = parseUrl(response.url(), options);
374
+ const resURL = parseUrl(response.url(), parseOpts)!;
411
375
 
412
376
  if (uniqueRes.has(resURL.withoutHash)) {
413
377
  return;
@@ -453,6 +417,11 @@ export default class Scraper extends TypedAwaitEventEmitter<ScrapeEventTypes> {
453
417
  };
454
418
 
455
419
  rLog('Fetched: %s', resURL.href);
420
+
421
+ // Collect resource into the results array
422
+ resources.push({ log, resource: referredLink, pageUrl: url.withoutHash });
423
+
424
+ // Also emit for streaming consumers
456
425
  void this.emit('resourceResponse', {
457
426
  pid: process.pid,
458
427
  url,
@@ -462,12 +431,14 @@ export default class Scraper extends TypedAwaitEventEmitter<ScrapeEventTypes> {
462
431
  });
463
432
  }
464
433
 
434
+ const navigationTimeout = options?.navigationTimeout ?? 60_000;
435
+
465
436
  void this.emit('changePhase', {
466
437
  pid: process.pid,
467
438
  name: 'openPage',
468
- url: this.#url,
439
+ url,
469
440
  isExternal,
470
- message: '',
441
+ message: `%countdown(${navigationTimeout},openPage_${url.withoutHash},s)%s`,
471
442
  });
472
443
 
473
444
  if (url.username && url.password) {
@@ -476,19 +447,24 @@ export default class Scraper extends TypedAwaitEventEmitter<ScrapeEventTypes> {
476
447
  });
477
448
  }
478
449
 
479
- const res = await page.goto(url.withoutHashAndAuth);
450
+ const res = await page.goto(url.withoutHashAndAuth, { timeout: navigationTimeout });
480
451
 
481
452
  if (!res) {
482
453
  throw new Error('The method Page.goto returned null');
483
454
  }
484
455
 
485
- const destUrl = parseUrl(page.url(), options);
486
- const redirectPaths = res
487
- .request()
488
- .redirectChain()
489
- .map((req) => req.url());
490
- if (destUrl.withoutHash !== url.withoutHash) {
491
- redirectPaths.push(destUrl.withoutHash);
456
+ const destUrl = parseUrl(page.url(), parseOpts)!;
457
+ const redirectPaths = new Set<string>();
458
+
459
+ if (url.withoutHash !== destUrl.withoutHash) {
460
+ const redirectChain = res
461
+ .request()
462
+ .redirectChain()
463
+ .map((req) => req.url());
464
+ for (const redirectPath of redirectChain) {
465
+ redirectPaths.add(redirectPath);
466
+ }
467
+ redirectPaths.add(destUrl.withoutHash);
492
468
  }
493
469
 
494
470
  if (destUrl.hostname !== url.hostname) {
@@ -507,7 +483,7 @@ export default class Scraper extends TypedAwaitEventEmitter<ScrapeEventTypes> {
507
483
  url,
508
484
  isTarget: false,
509
485
  isExternal,
510
- redirectPaths,
486
+ redirectPaths: [...redirectPaths],
511
487
  status,
512
488
  statusText,
513
489
  contentType,
@@ -526,7 +502,7 @@ export default class Scraper extends TypedAwaitEventEmitter<ScrapeEventTypes> {
526
502
  void this.emit('changePhase', {
527
503
  pid: process.pid,
528
504
  name: 'loadDOMContent',
529
- url: this.#url,
505
+ url,
530
506
  isExternal,
531
507
  message: '',
532
508
  });
@@ -538,7 +514,7 @@ export default class Scraper extends TypedAwaitEventEmitter<ScrapeEventTypes> {
538
514
  void this.emit('changePhase', {
539
515
  pid: process.pid,
540
516
  name: 'getHTML',
541
- url: this.#url,
517
+ url,
542
518
  isExternal,
543
519
  message: '',
544
520
  });
@@ -556,7 +532,7 @@ export default class Scraper extends TypedAwaitEventEmitter<ScrapeEventTypes> {
556
532
  url,
557
533
  isTarget: false,
558
534
  isExternal,
559
- redirectPaths,
535
+ redirectPaths: [...redirectPaths],
560
536
  status,
561
537
  statusText,
562
538
  contentType,
@@ -574,8 +550,8 @@ export default class Scraper extends TypedAwaitEventEmitter<ScrapeEventTypes> {
574
550
 
575
551
  void this.emit('changePhase', {
576
552
  pid: process.pid,
577
- name: 'waitNetworkIdleZero',
578
- url: this.#url,
553
+ name: 'waitNetworkIdle',
554
+ url,
579
555
  isExternal,
580
556
  message: '',
581
557
  });
@@ -587,28 +563,44 @@ export default class Scraper extends TypedAwaitEventEmitter<ScrapeEventTypes> {
587
563
  void this.emit('changePhase', {
588
564
  pid: process.pid,
589
565
  name: 'getAnchors',
590
- url: this.#url,
566
+ url,
591
567
  isExternal,
592
568
  message: '',
593
569
  });
594
- const anchorList = await getAnchorList(page, options);
570
+ const anchorList = await getAnchorList(page, parseOpts);
595
571
 
596
572
  void this.emit('changePhase', {
597
573
  pid: process.pid,
598
574
  name: 'getMeta',
599
- url: this.#url,
575
+ url,
600
576
  isExternal,
601
577
  message: '',
602
578
  });
603
579
  const meta = await getMeta(page);
604
580
 
605
- const imageList = isGettingImages ? await this.#fetchImages(page, isExternal) : [];
581
+ const imageList = captureImages
582
+ ? await (async () => {
583
+ void this.emit('changePhase', {
584
+ pid: process.pid,
585
+ name: 'extractImages',
586
+ url,
587
+ isExternal,
588
+ message: '',
589
+ });
590
+ return this.#fetchImages(
591
+ page,
592
+ url.withoutHashAndAuth,
593
+ isExternal,
594
+ imageLoadTimeout,
595
+ );
596
+ })()
597
+ : [];
606
598
 
607
599
  return {
608
600
  url,
609
601
  isTarget: true,
610
602
  isExternal,
611
- redirectPaths,
603
+ redirectPaths: [...redirectPaths],
612
604
  status,
613
605
  statusText,
614
606
  contentType,
@@ -621,50 +613,96 @@ export default class Scraper extends TypedAwaitEventEmitter<ScrapeEventTypes> {
621
613
  isSkipped: false,
622
614
  };
623
615
  }
624
-
625
- @retry()
626
- async #fetchHead(url: ExURL, isExternal: boolean) {
627
- return await fetchDestination(url, isExternal);
628
- }
629
-
630
- @retry({
631
- timeout: 5 * 60 * 1000, // 5sec
616
+ /**
617
+ * Extracts image data from the page across multiple device presets.
618
+ *
619
+ * WHY multiple device presets: Images may differ between desktop and mobile
620
+ * due to responsive `<picture>` / `srcset`. Capturing both `desktop-compact`
621
+ * and `mobile-small` viewports reveals responsive image issues.
622
+ *
623
+ * WHY retryable with 5-min timeout and `fallback: []`: Image extraction is
624
+ * best-effort. If all retries fail, an empty array is returned rather than
625
+ * failing the entire page scrape.
626
+ * @param page - Puppeteer page instance
627
+ * @param url - The page URL string (without hash and auth)
628
+ * @param isExternal - Whether the page is external
629
+ * @param imageLoadTimeout - Timeout (ms) for waiting images to complete loading
630
+ * @returns Array of image elements from all device presets
631
+ */
632
+ @retryable({
633
+ timeout: 5 * 60 * 1000,
632
634
  fallback: [],
635
+ onWait(this: Scraper, determinedInterval, retryCount, methodName, error) {
636
+ void this.emit('changePhase', {
637
+ pid: process.pid,
638
+ name: 'retryWait',
639
+ url: null,
640
+ isExternal: false,
641
+ message: `${methodName}: ${error.message} — %countdown(${determinedInterval},${methodName}_${retryCount},s)%s (retry #${retryCount + 1} / images)`,
642
+ });
643
+ },
644
+ onGiveUp(this: Scraper, retryCount, error, methodName) {
645
+ void this.emit('changePhase', {
646
+ pid: process.pid,
647
+ name: 'retryExhausted',
648
+ url: null,
649
+ isExternal: false,
650
+ message: `${methodName}: gave up after ${retryCount} retries — ${error.message}`,
651
+ });
652
+ },
633
653
  })
634
- async #fetchImages(page: Page, isExternal: boolean): Promise<ImageElement[]> {
635
- const url = this.#url!.withoutHashAndAuth;
636
- const imageList: ImageElement[] = [];
637
-
638
- const devices: { name: string; width: number; resolution?: number }[] = [
639
- { name: 'desktop', width: 1280 },
640
- { name: 'mobile', width: 320, resolution: 2 },
654
+ async #fetchImages(
655
+ page: Page,
656
+ url: string,
657
+ isExternal: boolean,
658
+ imageLoadTimeout: number,
659
+ ): Promise<ImageElement[]> {
660
+ const listener = this.#createPageScanListener(isExternal);
661
+ const devices: { key: string; preset: { width: number; resolution?: number } }[] = [
662
+ { key: 'desktop-compact', preset: devicePresets['desktop-compact'] },
663
+ { key: 'mobile-small', preset: devicePresets['mobile-small'] },
641
664
  ];
665
+ const imageList: ImageElement[] = [];
642
666
 
643
- for (const device of devices) {
667
+ for (const { key, preset } of devices) {
644
668
  void this.emit('changePhase', {
645
669
  pid: process.pid,
646
670
  name: 'setViewport',
647
- url: this.#url,
671
+ url: null,
648
672
  isExternal,
649
- message: device.name,
673
+ message: `📷 ${key} ↔️ ${preset.width}px`,
650
674
  });
651
675
 
652
676
  await beforePageScan(page, url, {
653
- name: device.name,
654
- width: device.width,
655
- resolution: device.resolution,
677
+ name: key,
678
+ width: preset.width,
679
+ resolution: preset.resolution,
680
+ listener,
656
681
  timeout: 5000,
657
682
  });
658
683
 
659
684
  void this.emit('changePhase', {
660
685
  pid: process.pid,
661
- name: 'getImages',
662
- url: this.#url,
686
+ name: 'waitImageLoad',
687
+ url: null,
663
688
  isExternal,
664
- message: device.name,
689
+ message: `📷 ${key}: Waiting for images%dots%`,
665
690
  });
666
691
 
667
- const images = await getImageList(page, device.width);
692
+ await page
693
+ .waitForFunction(() => [...document.images].every((img) => img.complete), {
694
+ timeout: imageLoadTimeout,
695
+ })
696
+ .catch(() => {});
697
+
698
+ void this.emit('changePhase', {
699
+ pid: process.pid,
700
+ name: 'getImages',
701
+ url: null,
702
+ isExternal,
703
+ message: `📸 ${key}: Extracting images%dots%`,
704
+ });
705
+ const images = await getImageList(page, preset.width);
668
706
  imageList.push(...images);
669
707
  }
670
708