@d-zero/beholder 2.0.1 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,712 @@
1
/**
 * Decorator runtime helper: runs every initializer with `thisArg` as `this`.
 *
 * When a third argument is supplied, it is threaded through the chain (each
 * initializer receives the previous result) and the final value is returned.
 * Without it, initializers are invoked for their side effects only and the
 * helper returns `undefined`.
 * @param thisArg - Receiver for every initializer call
 * @param initializers - Functions to run, in array order
 * @param value - Optional seed value threaded through the initializers
 * @returns The final threaded value, or `undefined` when no seed was passed
 */
var __runInitializers = (this && this.__runInitializers) || function (thisArg, initializers, value) {
	// `arguments` (not a default parameter) distinguishes "no seed" from an explicit `undefined` seed.
	const threadsValue = arguments.length > 2;
	let current = value;
	for (const initializer of initializers) {
		if (threadsValue) {
			current = initializer.call(thisArg, current);
		}
		else {
			initializer.call(thisArg);
		}
	}
	return threadsValue ? current : undefined;
};
8
/**
 * Decorator runtime helper: applies `decorators` (last to first) to one class element.
 *
 * @param ctor - Class constructor; used to locate the element when no descriptor is given
 * @param descriptorIn - Explicit descriptor to decorate (e.g. for private elements), or null
 * @param decorators - Decorator functions for this element
 * @param contextIn - Decorator context template (`kind`, `name`, `static`, `access`, ...)
 * @param initializers - Receives field/accessor `init` functions (prepended)
 * @param extraInitializers - Receives functions registered through `context.addInitializer`
 */
var __esDecorate = (this && this.__esDecorate) || function (ctor, descriptorIn, decorators, contextIn, initializers, extraInitializers) {
	// Passes `undefined` and functions through; anything else is rejected.
	const requireCallable = (candidate) => {
		if (candidate !== undefined && typeof candidate !== "function") {
			throw new TypeError("Function expected");
		}
		return candidate;
	};
	const kind = contextIn.kind;
	// Descriptor slot that holds the decorated function for this element kind.
	let slot = "value";
	if (kind === "getter") {
		slot = "get";
	}
	else if (kind === "setter") {
		slot = "set";
	}
	// Only look the element up on the class when the caller did not hand us a descriptor.
	let target = null;
	if (!descriptorIn && ctor) {
		target = contextIn["static"] ? ctor : ctor.prototype;
	}
	const descriptor = descriptorIn || (target ? Object.getOwnPropertyDescriptor(target, contextIn.name) : {});
	let sealed = false;
	for (let index = decorators.length - 1; index >= 0; index--) {
		// Every decorator gets its own shallow copy of the context (with a fresh `access` object).
		const context = {};
		for (const prop in contextIn) {
			context[prop] = prop === "access" ? {} : contextIn[prop];
		}
		for (const prop in contextIn.access) {
			context.access[prop] = contextIn.access[prop];
		}
		context.addInitializer = function (f) {
			if (sealed) {
				throw new TypeError("Cannot add initializers after decoration has completed");
			}
			extraInitializers.push(requireCallable(f || null));
		};
		const decorate = decorators[index];
		const subject = kind === "accessor"
			? { get: descriptor.get, set: descriptor.set }
			: descriptor[slot];
		const result = decorate(subject, context);
		if (kind === "accessor") {
			if (result === undefined) {
				continue;
			}
			if (result === null || typeof result !== "object") {
				throw new TypeError("Object expected");
			}
			const getter = requireCallable(result.get);
			if (getter) {
				descriptor.get = getter;
			}
			const setter = requireCallable(result.set);
			if (setter) {
				descriptor.set = setter;
			}
			const init = requireCallable(result.init);
			if (init) {
				initializers.unshift(init);
			}
			continue;
		}
		const replacement = requireCallable(result);
		if (!replacement) {
			continue;
		}
		if (kind === "field") {
			initializers.unshift(replacement);
		}
		else {
			descriptor[slot] = replacement;
		}
	}
	if (target) {
		Object.defineProperty(target, contextIn.name, descriptor);
	}
	// From here on, late `addInitializer` calls from captured contexts must fail.
	sealed = true;
};
35
/**
 * Decorator runtime helper: overwrites `f.name`.
 *
 * Symbol names are rendered as `[description]` (or an empty string when the
 * symbol has no description); an optional `prefix` is prepended, separated by a space.
 * @param f - Function whose `name` property is redefined
 * @param name - New name (string or symbol)
 * @param prefix - Optional prefix such as `get` / `set`
 * @returns The same function `f`
 */
var __setFunctionName = (this && this.__setFunctionName) || function (f, name, prefix) {
	let label = name;
	if (typeof label === "symbol") {
		label = label.description ? `[${label.description}]` : "";
	}
	const value = prefix ? `${prefix} ${label}` : label;
	return Object.defineProperty(f, "name", { configurable: true, value });
};
39
+ import { beforePageScan, devicePresets } from '@d-zero/puppeteer-page-scan';
40
+ import { detectCDN } from '@d-zero/shared/detect-cdn';
41
+ import { detectCompress } from '@d-zero/shared/detect-compress';
42
+ import { retry as retryable } from '@d-zero/shared/retry';
43
+ import { TypedAwaitEventEmitter as EventEmitter } from '@d-zero/shared/typed-await-event-emitter';
44
+ import { resourceLog, scraperLog } from './debug.js';
45
+ import { getAnchorList, getImageList, getMeta } from './dom-evaluation.js';
46
+ import { isError } from './is-error.js';
47
+ import { keywordCheck } from './keyword-check.js';
48
+ import { findDisconnectionFailures } from './network-disconnection.js';
49
+ import { parseUrl } from './parse-url.js';
50
// Debug loggers namespaced by the current process id, derived from the
// `scraperLog` / `resourceLog` loggers exported by ./debug.js.
const pid = String(process.pid);
const log = scraperLog.extend(pid);
const rLog = resourceLog.extend(pid);
53
/**
 * Page-level scraper (compiled decorator output).
 *
 * `scrapeStart()` returns the scrape outcome as a value; `changePhase` and
 * `resourceResponse` events are emitted for progress monitoring. The private
 * methods `#fetchData` and `#fetchImages` are wrapped by the `retry` decorator
 * imported from `@d-zero/shared/retry`, which is why they are materialised
 * through descriptors in the static block below.
 */
let Scraper = (() => {
	let _classSuper = EventEmitter;
	let _instanceExtraInitializers = [];
	let _private_fetchData_decorators;
	let _private_fetchData_descriptor;
	let _private_fetchImages_decorators;
	let _private_fetchImages_descriptor;
	/**
	 * Builds the `onWait` / `onGiveUp` hooks shared by both retry decorators.
	 *
	 * The hooks are method-shorthand functions (not arrows) because they read
	 * `this.emit`; NOTE(review): this assumes the retry decorator invokes them
	 * with the decorated instance as `this` — confirm against @d-zero/shared/retry.
	 * @param waitSuffix - Text appended inside the "(retry #n…)" part of the wait message
	 * @returns An object with `onWait` and `onGiveUp` hooks
	 */
	const createRetryPhaseHooks = (waitSuffix = '') => ({
		onWait(determinedInterval, retryCount, methodName, error) {
			void this.emit('changePhase', {
				pid: process.pid,
				name: 'retryWait',
				url: null,
				isExternal: false,
				message: `${methodName}: ${error.message} — %countdown(${determinedInterval},${methodName}_${retryCount},s)%s (retry #${retryCount + 1}${waitSuffix})`,
			});
		},
		onGiveUp(retryCount, error, methodName) {
			void this.emit('changePhase', {
				pid: process.pid,
				name: 'retryExhausted',
				url: null,
				isExternal: false,
				message: `${methodName}: gave up after ${retryCount} retries — ${error.message}`,
			});
		},
	});
	return class Scraper extends _classSuper {
		static {
			const _metadata = typeof Symbol === "function" && Symbol.metadata ? Object.create(_classSuper[Symbol.metadata] ?? null) : void 0;
			_private_fetchData_decorators = [retryable({
					timeout: 3 * 60 * 1000,
					...createRetryPhaseHooks(''),
				})];
			_private_fetchImages_decorators = [retryable({
					timeout: 5 * 60 * 1000,
					fallback: [],
					...createRetryPhaseHooks(' / images'),
				})];
			__esDecorate(this, _private_fetchData_descriptor = { value: __setFunctionName(async function (page, url, isExternal, captureImages, imageLoadTimeout, resources, failedRequests, options) {
					const parseOpts = options?.disableQueries == null
						? undefined
						: { disableQueries: options.disableQueries };
					// Request records keyed by the parsed, hash-less URL. The response
					// handler looks records up by the same key, so both sides must
					// normalise the URL identically.
					const networkLogs = new Map();
					// Clear stale state from previous retries (@retryable may re-invoke this method
					// with the same page and mutable arrays, so we must reset to avoid accumulation)
					this.#cleanupPageListeners();
					failedRequests.length = 0;
					resources.length = 0;
					// Define named listeners so they can be individually removed on retry/cleanup
					const onDialog = async (dialog) => {
						log(`Appear ${dialog.type()} dialog: ${dialog.message()}`);
						try {
							await dialog.accept();
							// Only report acceptance when accept() actually succeeded.
							log(`Accept ${dialog.type()} dialog`);
						}
						catch (error) {
							log(`Error: ${error}`);
						}
					};
					page.on('dialog', onDialog);
					let onRequest = null;
					let onResponse = null;
					let onRequestFailed = null;
					// Sub-resources are only tracked for internal pages.
					if (!isExternal) {
						onRequest = (request) => {
							const requestUrl = parseUrl(request.url(), parseOpts);
							networkLogs.set(requestUrl.withoutHash, {
								url: requestUrl,
								status: null,
								contentLength: 0,
								contentType: '',
								isError: false,
								request: {
									ts: Date.now(),
									headers: request.headers(),
									method: request.method(),
								},
							});
						};
						const uniqueRes = new Set();
						onResponse = (response) => {
							const resURL = parseUrl(response.url(), parseOpts);
							// Report each sub-resource once, and never the page itself.
							if (uniqueRes.has(resURL.withoutHash)) {
								return;
							}
							if (resURL.withoutHash === url.withoutHash) {
								return;
							}
							uniqueRes.add(resURL.withoutHash);
							const headers = response.headers();
							const status = response.status();
							const statusText = response.statusText();
							const contentType = headers['content-type']?.split(';')[0] || null;
							const contentLength = Number.parseInt(headers['content-length'] ?? '', 10) || null;
							// May be undefined when no matching request was recorded;
							// spreading undefined simply contributes no properties.
							const requestRecord = networkLogs.get(resURL.withoutHash);
							const networkLog = {
								...requestRecord,
								response: {
									ts: Date.now(),
									status,
									statusText,
									fromCache: response.fromCache(),
									headers,
								},
								status,
								isError: isError(status),
								contentType: contentType || '',
								contentLength: contentLength || 0,
							};
							const referredLink = {
								url: resURL,
								isExternal: resURL.hostname !== url.hostname,
								isError: networkLog.isError,
								status,
								statusText,
								contentType,
								contentLength,
								compress: detectCompress(headers),
								cdn: detectCDN(headers),
								headers: headers,
							};
							rLog('Fetched: %s', resURL.href);
							// Collect resource into the results array
							resources.push({ log: networkLog, resource: referredLink, pageUrl: url.withoutHash });
							// Also emit for streaming consumers
							void this.emit('resourceResponse', {
								pid: process.pid,
								url,
								log: networkLog,
								resource: referredLink,
							});
						};
						onRequestFailed = (request) => {
							const errorText = request.failure()?.errorText ?? 'Unknown error';
							rLog('Request failed: %s (%s)', request.url(), errorText);
							failedRequests.push({ url: request.url(), errorText });
						};
						page.on('request', onRequest);
						page.on('response', onResponse);
						page.on('requestfailed', onRequestFailed);
					}
					// Store cleanup function for retry/post-fetch removal
					this.#pageListenerCleanup = () => {
						page.off('dialog', onDialog);
						if (onRequest) {
							page.off('request', onRequest);
						}
						if (onResponse) {
							page.off('response', onResponse);
						}
						if (onRequestFailed) {
							page.off('requestfailed', onRequestFailed);
						}
					};
					const navigationTimeout = options?.navigationTimeout ?? 60_000;
					void this.emit('changePhase', {
						pid: process.pid,
						name: 'openPage',
						url,
						isExternal,
						message: `%countdown(${navigationTimeout},openPage_${url.withoutHash},s)%s`,
					});
					if (url.username && url.password) {
						// NOTE(review): extra HTTP headers stay set on the page after this call,
						// so the Authorization header may accompany later requests made through
						// the same page — confirm this is acceptable for the callers.
						await page.setExtraHTTPHeaders({
							Authorization: `Basic ${Buffer.from(`${url.username}:${url.password}`).toString('base64')}`,
						});
					}
					const res = await page.goto(url.withoutHashAndAuth, { timeout: navigationTimeout });
					if (!res) {
						throw new Error('The method Page.goto returned null');
					}
					const destUrl = parseUrl(page.url(), parseOpts);
					const redirectPaths = new Set();
					if (url.withoutHash !== destUrl.withoutHash) {
						for (const redirected of res.request().redirectChain()) {
							redirectPaths.add(redirected.url());
						}
						redirectPaths.add(destUrl.withoutHash);
					}
					// A redirect to another host turns the page into an external one.
					if (destUrl.hostname !== url.hostname) {
						isExternal = true;
					}
					const status = res.status();
					const statusText = res.statusText();
					const responseHeaders = res.headers();
					const contentType = responseHeaders['content-type']?.split(';')[0] || null;
					const parsedContentLength = Number.parseInt(responseHeaders['content-length'] ?? '', 10);
					const contentLength = Number.isFinite(parsedContentLength) ? parsedContentLength : null;
					// Assembles the page-data record; every return path shares the response fields.
					const buildPageData = ({ isTarget, meta, html, imageList = [], anchorList = [] }) => ({
						url,
						isTarget,
						isExternal,
						redirectPaths: [...redirectPaths],
						status,
						statusText,
						contentType,
						contentLength,
						responseHeaders,
						meta,
						imageList,
						anchorList,
						html,
						isSkipped: false,
					});
					// Non-HTML responses carry no DOM to inspect.
					if (contentType !== 'text/html') {
						return buildPageData({ isTarget: false, meta: { title: '' }, html: '' });
					}
					void this.emit('changePhase', {
						pid: process.pid,
						name: 'loadDOMContent',
						url,
						isExternal,
						message: '',
					});
					// Best-effort wait: a timeout here is deliberately ignored.
					await page
						.waitForNavigation({ waitUntil: 'domcontentloaded', timeout: 5000 })
						.catch(() => { });
					void this.emit('changePhase', {
						pid: process.pid,
						name: 'getHTML',
						url,
						isExternal,
						message: '',
					});
					const { title, html } = await page.evaluate(() => {
						/* global document */
						return {
							title: document.title,
							html: document.documentElement.outerHTML,
						};
					});
					// External pages: title and HTML only, no anchors/meta/images.
					if (isExternal) {
						return buildPageData({ isTarget: false, meta: { title }, html });
					}
					void this.emit('changePhase', {
						pid: process.pid,
						name: 'waitNetworkIdle',
						url,
						isExternal,
						message: '',
					});
					// Best-effort wait: a timeout here is deliberately ignored.
					await page
						.waitForNavigation({ waitUntil: 'networkidle0', timeout: 5000 })
						.catch(() => { });
					// Check for network disconnection errors in failed requests
					const disconnectionFailures = findDisconnectionFailures(failedRequests);
					if (disconnectionFailures.length > 0) {
						const errorSummary = disconnectionFailures
							.map((r) => `${r.url} (${r.errorText})`)
							.join(', ');
						// Throwing hands control back to the retry decorator.
						throw new Error(`Network disconnection detected during page load: ${errorSummary}`);
					}
					void this.emit('changePhase', {
						pid: process.pid,
						name: 'getAnchors',
						url,
						isExternal,
						message: '',
					});
					const anchorList = await getAnchorList(page, parseOpts);
					void this.emit('changePhase', {
						pid: process.pid,
						name: 'getMeta',
						url,
						isExternal,
						message: '',
					});
					const meta = await getMeta(page);
					let imageList = [];
					if (captureImages) {
						void this.emit('changePhase', {
							pid: process.pid,
							name: 'extractImages',
							url,
							isExternal,
							message: '',
						});
						imageList = await this.#fetchImages(page, url.withoutHashAndAuth, isExternal, imageLoadTimeout);
					}
					return buildPageData({ isTarget: true, meta, html, imageList, anchorList });
				}, "#fetchData") }, _private_fetchData_decorators, { kind: "method", name: "#fetchData", static: false, private: true, access: { has: obj => #fetchData in obj, get: obj => obj.#fetchData }, metadata: _metadata }, null, _instanceExtraInitializers);
			__esDecorate(this, _private_fetchImages_descriptor = { value: __setFunctionName(async function (page, url, isExternal, imageLoadTimeout) {
					const listener = this.#createPageScanListener(isExternal);
					const devices = ['desktop-compact', 'mobile-small'].map((key) => ({
						key,
						preset: devicePresets[key],
					}));
					const imageList = [];
					// Presets are processed one after another because they share the same page.
					for (const { key, preset } of devices) {
						void this.emit('changePhase', {
							pid: process.pid,
							name: 'setViewport',
							url: null,
							isExternal,
							message: `📷 ${key} ↔️ ${preset.width}px`,
						});
						await beforePageScan(page, url, {
							name: key,
							width: preset.width,
							resolution: preset.resolution,
							listener,
							timeout: 5000,
						});
						void this.emit('changePhase', {
							pid: process.pid,
							name: 'waitImageLoad',
							url: null,
							isExternal,
							message: `📷 ${key}: Waiting for images%dots%`,
						});
						// Best-effort: images still loading after the timeout are extracted as-is.
						await page
							.waitForFunction(() => [...document.images].every((img) => img.complete), {
							timeout: imageLoadTimeout,
						})
							.catch(() => { });
						void this.emit('changePhase', {
							pid: process.pid,
							name: 'getImages',
							url: null,
							isExternal,
							message: `📸 ${key}: Extracting images%dots%`,
						});
						const images = await getImageList(page, preset.width);
						imageList.push(...images);
					}
					return imageList;
				}, "#fetchImages") }, _private_fetchImages_decorators, { kind: "method", name: "#fetchImages", static: false, private: true, access: { has: obj => #fetchImages in obj, get: obj => obj.#fetchImages }, metadata: _metadata }, null, _instanceExtraInitializers);
			if (_metadata) Object.defineProperty(this, Symbol.metadata, { enumerable: true, configurable: true, writable: true, value: _metadata });
		}
		/** Number of retries for `@retryable`-decorated methods. Set per-scrape from options. */
		retries = __runInitializers(this, _instanceExtraInitializers);
		/** Cleanup function to remove page listeners registered by `#fetchData`. */
		#pageListenerCleanup = null;
		/**
		 * Begins the scraping process for a given URL on the provided Puppeteer page.
		 *
		 * Returns a `ScrapeResult` containing the outcome:
		 * - `type: "success"` with `pageData` on success
		 * - `type: "skipped"` with `ignored` details when the page is excluded
		 * - `type: "error"` with `error` details when scraping fails
		 *
		 * Sub-resources are collected via the `resourceResponse` event and
		 * included in the returned `ScrapeResult.resources`.
		 * @param page - The Puppeteer page instance to use for navigation and DOM evaluation.
		 * @param url - The extended URL to scrape.
		 * @param options - Optional scraper configuration overriding defaults.
		 * @param isSkip - When `true`, the page is immediately skipped without any network requests.
		 * @returns The scrape result containing the outcome and captured resources.
		 */
		async scrapeStart(page, url, options, isSkip = false) {
			this.retries = options?.retries;
			const isExternal = options?.isExternal ?? false;
			const captureImages = options?.captureImages ?? true;
			const excludeKeywords = options?.excludeKeywords ?? [];
			const metadataOnly = options?.metadataOnly ?? false;
			const imageLoadTimeout = options?.imageLoadTimeout ?? 5000;
			const resources = [];
			const failedRequests = [];
			// Emits a `changePhase` event for this URL.
			const emitPhase = (name, message = '') => {
				void this.emit('changePhase', {
					pid: process.pid,
					name,
					url,
					isExternal,
					message,
				});
			};
			emitPhase('scrapeStart');
			// Path-excluded: return SkippedPageData
			if (isSkip) {
				emitPhase('pageSkipped', 'Matched: excluded path');
				return {
					type: 'skipped',
					resources,
					ignored: {
						url,
						matchedText: url.pathname || '',
						excludeKeywords,
					},
				};
			}
			// Non-HTTP protocol: return minimal PageData
			if (!url.isHTTP) {
				const result = {
					url,
					isTarget: false,
					isExternal,
					redirectPaths: [],
					status: -1,
					statusText: '__THIS_IS_NOT_HTTP_PROTOCOL__',
					contentType: null,
					contentLength: null,
					responseHeaders: {},
					meta: {
						title: '',
					},
					imageList: [],
					anchorList: [],
					html: '',
					isSkipped: false,
				};
				emitPhase('scrapeEnd');
				return { type: 'success', pageData: result, resources };
			}
			let headResult = options?.headCheckResult ?? null;
			// Metadata-only mode: the supplied HEAD result is the answer, no navigation.
			if (headResult && metadataOnly) {
				emitPhase('scrapeEnd');
				return {
					type: 'success',
					pageData: {
						...headResult,
						isTarget: false,
					},
					resources,
				};
			}
			// Navigate only when nothing is known yet or the HEAD result says HTML.
			if (headResult === null || headResult.contentType === 'text/html') {
				let fetchResult;
				try {
					fetchResult = await this.#fetchData(page, url, isExternal, captureImages, imageLoadTimeout, resources, failedRequests, options);
				}
				catch (error) {
					// Normalise non-Error rejections; String() also copes with symbols.
					const failure = error instanceof Error
						? error
						: new Error(String(error), { cause: error });
					log('Error(FETCH_DATA): %s', url.href);
					return {
						type: 'error',
						resources,
						failedRequests: failedRequests.length > 0 ? failedRequests : undefined,
						error: {
							name: failure.name,
							message: failure.message,
							stack: failure.stack,
							shutdown: true,
						},
					};
				}
				finally {
					// Listeners must come off the page on success and failure alike.
					this.#cleanupPageListeners();
				}
				headResult = fetchResult;
				if (!headResult.isSkipped) {
					const checkedKeyword = keywordCheck(headResult.html, excludeKeywords);
					if (checkedKeyword) {
						headResult = {
							url,
							isSkipped: true,
							matched: {
								type: 'keyword',
								text: checkedKeyword,
								excludeKeywords,
							},
						};
					}
				}
				if (headResult.isSkipped) {
					if (headResult.matched.type === 'path') {
						return {
							type: 'skipped',
							resources,
							ignored: {
								url,
								matchedText: url.pathname || '',
								excludeKeywords,
							},
						};
					}
					emitPhase('pageSkipped', `Matched: "${headResult.matched.text}"`);
					return {
						type: 'skipped',
						resources,
						ignored: {
							url,
							matchedText: headResult.matched.text,
							excludeKeywords,
						},
					};
				}
			}
			emitPhase('scrapeEnd');
			return {
				type: 'success',
				pageData: headResult,
				resources,
				failedRequests: failedRequests.length > 0 ? failedRequests : undefined,
			};
		}
		/**
		 * Removes the page listeners registered by the latest `#fetchData` run, if any.
		 * Safe to call repeatedly: the stored cleanup function is cleared after use.
		 */
		#cleanupPageListeners() {
			const cleanup = this.#pageListenerCleanup;
			if (!cleanup) {
				return;
			}
			cleanup();
			this.#pageListenerCleanup = null;
		}
		/**
		 * Creates a callback for `@d-zero/puppeteer-page-scan`'s `beforePageScan` listener.
		 *
		 * WHY a separate factory: The listener must capture `isExternal` for phase events
		 * while conforming to the `beforePageScan` listener signature.
		 * Currently only handles the `scroll` phase to report scroll progress.
		 * @param isExternal - Whether the current page is external to the crawl scope
		 * @returns A listener function compatible with `beforePageScan`'s `listener` option
		 */
		#createPageScanListener(isExternal) {
			return (phase, data) => {
				if (phase !== 'scroll') {
					return;
				}
				const { scrollY, scrollHeight, message } = data;
				// A percentage only makes sense for a finite, positive height; otherwise
				// (NaN, 0, missing) fall back to the bare message instead of "NaN%"/"Infinity%".
				const hasProgress = Number.isFinite(scrollHeight) && scrollHeight > 0;
				const scrollMsg = hasProgress
					? `%propeller% ${scrollY}px/${scrollHeight}px (${Math.round((scrollY / scrollHeight) * 100)}%) ${message}`
					: `%propeller% ${message}`;
				void this.emit('changePhase', {
					pid: process.pid,
					name: 'scrollToBottom',
					url: null,
					isExternal,
					message: scrollMsg,
				});
			};
		}
		/**
		 * Navigates the page to the target URL and extracts full page data.
		 *
		 * WHY retryable with 3-min timeout: Page navigation can fail due to transient
		 * network issues or slow-loading pages. The decorator retries automatically,
		 * emitting `retryWait` / `retryExhausted` phase events for progress monitoring.
		 *
		 * Flow:
		 * 1. Register request/response/requestfailed listeners to capture sub-resources (internal pages only)
		 * 2. Navigate to URL via `page.goto()` and track redirect chain
		 * 3. Wait for DOM content and network idle
		 * 4. Check for network disconnection errors and throw to trigger retry
		 * 5. Extract anchors, meta, and optionally images
		 *
		 * Keyword exclusion is NOT performed here; `scrapeStart` applies it to the
		 * returned HTML.
		 * @param page - Puppeteer page instance
		 * @param url - Target URL to navigate to
		 * @param isExternal - Whether the URL is external to the crawl scope
		 * @param captureImages - Whether to run the image extraction pipeline
		 * @param imageLoadTimeout - Timeout (ms) for waiting lazy-loaded images to complete
		 * @param resources - Mutable array to collect captured sub-resources into
		 * @param failedRequests - Mutable array to collect failed sub-resource requests into
		 * @param options - Additional scraper options (e.g. `disableQueries`, `navigationTimeout`)
		 * @returns Page data (`isSkipped: false`); `isTarget` is `true` only for internal HTML pages
		 */
		get #fetchData() { return _private_fetchData_descriptor.value; }
		/**
		 * Extracts image data from the page across multiple device presets.
		 *
		 * WHY multiple device presets: Images may differ between desktop and mobile
		 * due to responsive `<picture>` / `srcset`. Capturing both `desktop-compact`
		 * and `mobile-small` viewports reveals responsive image issues.
		 *
		 * WHY retryable with 5-min timeout and `fallback: []`: Image extraction is
		 * best-effort. If all retries fail, an empty array is returned rather than
		 * failing the entire page scrape.
		 * @param page - Puppeteer page instance
		 * @param url - The page URL string (without hash and auth)
		 * @param isExternal - Whether the page is external
		 * @param imageLoadTimeout - Timeout (ms) for waiting images to complete loading
		 * @returns Array of image elements from all device presets
		 */
		get #fetchImages() { return _private_fetchImages_descriptor.value; }
	};
})();
695
+ /**
696
+ * Page-level scraper that extracts data from a single browser page.
697
+ *
698
+ * The scraper returns results as values from `scrapeStart()` rather than
699
+ * emitting them as events. Only streaming events (changePhase, resourceResponse)
700
+ * are emitted for progress monitoring.
701
+ *
702
+ * The Puppeteer `Page` object is injected externally, and page lifecycle
703
+ * (including `page.close()`) is managed by the caller.
704
+ * @example
705
+ * ```ts
706
+ * const scraper = new Scraper();
707
+ * scraper.on('changePhase', (e) => console.log(e.name));
708
+ * const result = await scraper.scrapeStart(page, url, { isExternal: false });
709
+ * ```
710
+ */
711
+ // eslint-disable-next-line unicorn/prefer-event-target -- TypedAwaitEventEmitter is a project-specific typed wrapper, not Node.js EventEmitter
712
+ export default Scraper;