@d-zero/beholder 2.0.1 → 2.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,730 @@
/**
 * TypeScript-emitted decorator helper: runs every initializer with `thisArg`
 * as its receiver.
 *
 * When a third argument is supplied (even `undefined`), each initializer
 * receives the value returned by the previous one and the final value is
 * returned. When it is omitted, the initializers are called without
 * arguments and `undefined` is returned.
 * @param thisArg - Receiver (`this`) for every initializer call.
 * @param initializers - Functions to run, in order.
 * @param value - Optional seed value threaded through the initializers.
 * @returns The threaded value, or `undefined` when no seed was supplied.
 */
const __runInitializers = (this && this.__runInitializers) || function (thisArg, initializers, value) {
  // `arguments.length` (not `value !== undefined`) decides the mode, so an
  // explicit `undefined` seed still threads the value through.
  const useValue = arguments.length > 2;
  for (const initializer of initializers) {
    value = useValue ? initializer.call(thisArg, value) : initializer.call(thisArg);
  }
  return useValue ? value : void 0;
};
/**
 * TypeScript-emitted helper that applies standard decorators to one class element.
 *
 * Decorators run from last to first. Each receives the current element value
 * (or a `{ get, set }` pair for accessors) and a fresh copy of `contextIn`
 * whose `access` object is also copied. A returned function replaces the
 * element (or is queued as a field initializer); an accessor decorator may
 * return an object with `get` / `set` / `init`.
 * @param ctor - Class constructor, or `null` when `descriptorIn` is supplied.
 * @param descriptorIn - Descriptor to mutate in place, or `null` to read it from the class.
 * @param decorators - Decorator functions for this element.
 * @param contextIn - Decorator context template (`kind`, `name`, `static`, `access`, ...).
 * @param initializers - Receives field/accessor `init` functions (prepended).
 * @param extraInitializers - Receives functions registered through `context.addInitializer`.
 * @throws {TypeError} When a decorator returns a value of the wrong type, or when
 *   `addInitializer` is called after decoration has completed.
 */
const __esDecorate = (this && this.__esDecorate) || function (ctor, descriptorIn, decorators, contextIn, initializers, extraInitializers) {
  // `undefined` means "no replacement"; anything else must be callable.
  const accept = (f) => {
    if (f !== void 0 && typeof f !== "function") {
      throw new TypeError("Function expected");
    }
    return f;
  };

  const { kind } = contextIn;
  const key = kind === "getter" ? "get" : kind === "setter" ? "set" : "value";

  // Only look the descriptor up on the class when the caller did not pass one.
  let target = null;
  if (!descriptorIn && ctor) {
    target = contextIn["static"] ? ctor : ctor.prototype;
  }
  const descriptor = descriptorIn || (target ? Object.getOwnPropertyDescriptor(target, contextIn.name) : {});

  let done = false;
  for (let i = decorators.length - 1; i >= 0; i--) {
    // Every decorator gets its own context object so one cannot tamper with another's.
    const context = {};
    for (const p in contextIn) {
      context[p] = p === "access" ? {} : contextIn[p];
    }
    for (const p in contextIn.access) {
      context.access[p] = contextIn.access[p];
    }
    context.addInitializer = (f) => {
      if (done) {
        throw new TypeError("Cannot add initializers after decoration has completed");
      }
      // `f || null` turns every falsy argument into a "Function expected" error.
      extraInitializers.push(accept(f || null));
    };

    // Pulled into a local so the decorator is invoked without a receiver.
    const decorate = decorators[i];
    const result = decorate(kind === "accessor" ? { get: descriptor.get, set: descriptor.set } : descriptor[key], context);

    if (kind === "accessor") {
      if (result === void 0) {
        continue;
      }
      if (result === null || typeof result !== "object") {
        throw new TypeError("Object expected");
      }
      const getter = accept(result.get);
      if (getter) {
        descriptor.get = getter;
      }
      const setter = accept(result.set);
      if (setter) {
        descriptor.set = setter;
      }
      const init = accept(result.init);
      if (init) {
        initializers.unshift(init);
      }
    } else {
      const replacement = accept(result);
      if (replacement) {
        if (kind === "field") {
          initializers.unshift(replacement);
        } else {
          descriptor[key] = replacement;
        }
      }
    }
  }

  if (target) {
    Object.defineProperty(target, contextIn.name, descriptor);
  }
  done = true;
};
/**
 * TypeScript-emitted helper that overrides a function's `name` property.
 *
 * Symbol names are rendered as `[description]` (or an empty string when the
 * symbol has no description); an optional prefix such as `get` is prepended
 * with a single space.
 * @param f - Function whose `name` is redefined (mutated in place).
 * @param name - New name, as a string or symbol.
 * @param prefix - Optional prefix placed before the name.
 * @returns The same function `f`.
 */
const __setFunctionName = (this && this.__setFunctionName) || function (f, name, prefix) {
  let label = name;
  if (typeof name === "symbol") {
    label = name.description ? `[${name.description}]` : "";
  }
  return Object.defineProperty(f, "name", {
    configurable: true,
    value: prefix ? `${prefix} ${label}` : label,
  });
};
39
+ import { beforePageScan, devicePresets } from '@d-zero/puppeteer-page-scan';
40
+ import { detectCDN } from '@d-zero/shared/detect-cdn';
41
+ import { detectCompress } from '@d-zero/shared/detect-compress';
42
+ import { retry as retryable } from '@d-zero/shared/retry';
43
+ import { TypedAwaitEventEmitter as EventEmitter } from '@d-zero/shared/typed-await-event-emitter';
44
+ import { resourceLog, scraperLog } from './debug.js';
45
+ import { getAnchorList, getImageList, getMeta } from './dom-evaluation.js';
46
+ import { isError } from './is-error.js';
47
+ import { keywordCheck } from './keyword-check.js';
48
+ import { findDisconnectionFailures } from './network-disconnection.js';
49
+ import { parseUrl } from './parse-url.js';
// Both loggers are extended with the current process id (as a string);
// presumably this tags debug output per worker process — confirm against ./debug.js.
const pid = String(process.pid);
const log = scraperLog.extend(pid);
const rLog = resourceLog.extend(pid);
/**
 * Page-level scraper that extracts data from a single browser page.
 *
 * The scraper returns results as values from `scrapeStart()` rather than
 * emitting them as events. Only streaming events (changePhase, resourceResponse)
 * are emitted for progress monitoring.
 *
 * The Puppeteer `Page` object is injected externally, and page lifecycle
 * (including `page.close()`) is managed by the caller.
 *
 * NOTE: this is the lowered form of a class whose `#fetchData` and
 * `#fetchImages` methods carry `@retryable` decorators. The decorated
 * functions live in the static block and are exposed through private getters.
 * @example
 * ```ts
 * const scraper = new Scraper();
 * scraper.on('changePhase', (e) => console.log(e.name));
 * const result = await scraper.scrapeStart(page, url, { isExternal: false });
 * ```
 */
const Scraper = (() => {
  const _classSuper = EventEmitter;
  const _instanceExtraInitializers = [];
  let _private_fetchData_decorators;
  let _private_fetchData_descriptor;
  let _private_fetchImages_decorators;
  let _private_fetchImages_descriptor;
  return class Scraper extends _classSuper {
    static {
      const _metadata = typeof Symbol === "function" && Symbol.metadata ? Object.create(_classSuper[Symbol.metadata] ?? null) : void 0;

      /**
       * Builds the `onWait` / `onGiveUp` callbacks shared by both retry decorators.
       * They are method-shorthand functions (not arrows) because they read `this`
       * to emit `changePhase`; presumably `retryable` calls them with the Scraper
       * instance as receiver — confirm against @d-zero/shared/retry.
       * `retryNoteSuffix` is appended inside the "(retry #N…)" note.
       */
      const createRetryPhaseHandlers = (retryNoteSuffix) => ({
        onWait(determinedInterval, retryCount, methodName, error) {
          void this.emit('changePhase', {
            pid: process.pid,
            name: 'retryWait',
            url: null,
            isExternal: false,
            message: `${methodName}: ${error.message} — %countdown(${determinedInterval},${methodName}_${retryCount},s)%s (retry #${retryCount + 1}${retryNoteSuffix})`,
          });
        },
        onGiveUp(retryCount, error, methodName) {
          void this.emit('changePhase', {
            pid: process.pid,
            name: 'retryExhausted',
            url: null,
            isExternal: false,
            message: `${methodName}: gave up after ${retryCount} retries — ${error.message}`,
          });
        },
      });

      _private_fetchData_decorators = [
        retryable({
          timeout: 3 * 60 * 1000,
          ...createRetryPhaseHandlers(''),
        }),
      ];
      _private_fetchImages_decorators = [
        retryable({
          timeout: 5 * 60 * 1000,
          fallback: [],
          ...createRetryPhaseHandlers(' / images'),
        }),
      ];

      __esDecorate(
        this,
        (_private_fetchData_descriptor = {
          value: __setFunctionName(async function (page, url, isExternal, captureImages, imageLoadTimeout, resources, failedRequests, options) {
            const parseOpts = options?.disableQueries == null ? undefined : { disableQueries: options.disableQueries };

            // Request records keyed by the normalized URL (`withoutHash`), the same
            // key the response listener uses to look them up.
            const networkLogs = new Map();

            // Clear stale state from previous retries (@retryable may re-invoke this method
            // with the same page and mutable arrays, so we must reset to avoid accumulation)
            this.#cleanupPageListeners();
            failedRequests.length = 0;
            resources.length = 0;

            // Named listeners so they can be individually removed on retry/cleanup
            const onDialog = async (dialog) => {
              log(`Appear ${dialog.type()} dialog: ${dialog.message()}`);
              try {
                await dialog.accept();
                // Logged only after `accept()` resolved, so a failure is not reported as accepted.
                log(`Accept ${dialog.type()} dialog`);
              } catch (error) {
                log(`Error: ${error}`);
              }
            };
            page.on('dialog', onDialog);

            let onRequest = null;
            let onResponse = null;
            let onRequestFailed = null;
            // Sub-resource tracking is only done for internal pages.
            if (!isExternal) {
              onRequest = (request) => {
                const requestUrl = parseUrl(request.url(), parseOpts);
                networkLogs.set(requestUrl.withoutHash, {
                  url: requestUrl,
                  status: null,
                  contentLength: 0,
                  contentType: '',
                  isError: false,
                  request: {
                    ts: Date.now(),
                    headers: request.headers(),
                    method: request.method(),
                  },
                });
              };

              const seenResponses = new Set();
              onResponse = (response) => {
                const resURL = parseUrl(response.url(), parseOpts);
                // Report each sub-resource once, and never the page document itself.
                if (seenResponses.has(resURL.withoutHash) || resURL.withoutHash === url.withoutHash) {
                  return;
                }
                seenResponses.add(resURL.withoutHash);

                const headers = response.headers();
                const status = response.status();
                const statusText = response.statusText();
                const contentType = headers['content-type']?.split(';')[0] || null;
                // `Number.isFinite` (not `|| null`) so an explicit `content-length: 0` is kept.
                const parsedLength = Number.parseInt(headers['content-length'] ?? '', 10);
                const contentLength = Number.isFinite(parsedLength) ? parsedLength : null;

                // Spreading `undefined` is a no-op, so a response without a recorded
                // request still produces a log entry.
                const networkLog = {
                  ...networkLogs.get(resURL.withoutHash),
                  response: {
                    ts: Date.now(),
                    status,
                    statusText,
                    fromCache: response.fromCache(),
                    headers,
                  },
                  status,
                  isError: isError(status),
                  contentType: contentType || '',
                  contentLength: contentLength ?? 0,
                };
                const referredLink = {
                  url: resURL,
                  isExternal: resURL.hostname !== url.hostname,
                  isError: networkLog.isError,
                  status,
                  statusText,
                  contentType,
                  contentLength,
                  compress: detectCompress(headers),
                  cdn: detectCDN(headers),
                  headers,
                };
                rLog('Fetched: %s', resURL.href);
                // Collect resource into the results array
                resources.push({ log: networkLog, resource: referredLink, pageUrl: url.withoutHash });
                // Also emit for streaming consumers
                void this.emit('resourceResponse', {
                  pid: process.pid,
                  url,
                  log: networkLog,
                  resource: referredLink,
                });
              };

              onRequestFailed = (request) => {
                const errorText = request.failure()?.errorText ?? 'Unknown error';
                rLog('Request failed: %s (%s)', request.url(), errorText);
                failedRequests.push({ url: request.url(), errorText });
              };

              page.on('request', onRequest);
              page.on('response', onResponse);
              page.on('requestfailed', onRequestFailed);
            }

            // Store cleanup function for retry/post-fetch removal
            this.#pageListenerCleanup = () => {
              page.off('dialog', onDialog);
              if (onRequest) {
                page.off('request', onRequest);
              }
              if (onResponse) {
                page.off('response', onResponse);
              }
              if (onRequestFailed) {
                page.off('requestfailed', onRequestFailed);
              }
            };

            const navigationTimeout = options?.navigationTimeout ?? 60_000;
            this.#emitPhase('openPage', url, isExternal, `%countdown(${navigationTimeout},openPage_${url.withoutHash},s)%s`);

            if (url.username && url.password) {
              // NOTE(review): extra HTTP headers stay on the page after this call, so the
              // credentials are attached to later requests made through it as well.
              // Consider `page.authenticate()` or clearing the header afterwards — confirm
              // the intended scope with the caller that owns the page.
              await page.setExtraHTTPHeaders({
                Authorization: `Basic ${Buffer.from(`${url.username}:${url.password}`).toString('base64')}`,
              });
            }

            const res = await page.goto(url.withoutHashAndAuth, { timeout: navigationTimeout });
            if (!res) {
              throw new Error('The method Page.goto returned null');
            }

            const destUrl = parseUrl(page.url(), parseOpts);
            const redirectPaths = new Set();
            if (url.withoutHash !== destUrl.withoutHash) {
              for (const redirectedRequest of res.request().redirectChain()) {
                redirectPaths.add(redirectedRequest.url());
              }
              redirectPaths.add(destUrl.withoutHash);
            }

            // A redirect to another host turns the page into an external one.
            const isExternalPage = destUrl.hostname === url.hostname ? isExternal : true;

            const status = res.status();
            const statusText = res.statusText();
            const responseHeaders = res.headers();
            const contentType = responseHeaders['content-type']?.split(';')[0] || null;
            const parsedContentLength = Number.parseInt(responseHeaders['content-length'] ?? '', 10);
            const contentLength = Number.isFinite(parsedContentLength) ? parsedContentLength : null;

            // Shared shape of every page-data result returned below.
            const toPageData = (isTarget, details) => ({
              url,
              isTarget,
              isExternal: isExternalPage,
              redirectPaths: [...redirectPaths],
              status,
              statusText,
              contentType,
              contentLength,
              responseHeaders,
              ...details,
              isSkipped: false,
            });

            // Non-HTML documents are reported without any DOM extraction.
            if (contentType !== 'text/html') {
              return toPageData(false, {
                meta: {
                  title: '',
                },
                imageList: [],
                anchorList: [],
                html: '',
              });
            }

            this.#emitPhase('loadDOMContent', url, isExternalPage);
            // Best-effort wait: a timeout here is expected when the event already fired.
            await page.waitForNavigation({ waitUntil: 'domcontentloaded', timeout: 5000 }).catch(() => { });

            this.#emitPhase('getHTML', url, isExternalPage);
            const { title, html } = await page.evaluate(() => {
              /* global document */
              return {
                title: document.title,
                html: document.documentElement.outerHTML,
              };
            });

            // External pages: title and HTML only, no anchors/meta/images.
            if (isExternalPage) {
              return toPageData(false, {
                meta: {
                  title,
                },
                imageList: [],
                anchorList: [],
                html,
              });
            }

            this.#emitPhase('waitNetworkIdle', url, isExternalPage);
            // Best-effort wait, same reasoning as above.
            await page.waitForNavigation({ waitUntil: 'networkidle0', timeout: 5000 }).catch(() => { });

            // Check for network disconnection errors in failed requests;
            // throwing here lets the retry decorator re-run the whole fetch.
            const disconnectionFailures = findDisconnectionFailures(failedRequests);
            if (disconnectionFailures.length > 0) {
              const errorSummary = disconnectionFailures.map((r) => `${r.url} (${r.errorText})`).join(', ');
              throw new Error(`Network disconnection detected during page load: ${errorSummary}`);
            }

            this.#emitPhase('getAnchors', url, isExternalPage);
            const anchorList = await getAnchorList(page, parseOpts);

            this.#emitPhase('getMeta', url, isExternalPage);
            const meta = await getMeta(page);

            let imageList = [];
            if (captureImages) {
              this.#emitPhase('extractImages', url, isExternalPage);
              imageList = await this.#fetchImages(page, url.withoutHashAndAuth, isExternalPage, imageLoadTimeout);
            }

            return toPageData(true, { meta, anchorList, imageList, html });
          }, "#fetchData"),
        }),
        _private_fetchData_decorators,
        { kind: "method", name: "#fetchData", static: false, private: true, access: { has: (obj) => #fetchData in obj, get: (obj) => obj.#fetchData }, metadata: _metadata },
        null,
        _instanceExtraInitializers,
      );

      __esDecorate(
        this,
        (_private_fetchImages_descriptor = {
          value: __setFunctionName(async function (page, url, isExternal, imageLoadTimeout) {
            const listener = this.#createPageScanListener(isExternal);
            const devices = ['desktop-compact', 'mobile-small'].map((key) => ({ key, preset: devicePresets[key] }));
            const imageList = [];
            for (const { key, preset } of devices) {
              // Per-device try/catch: a failure in one viewport must not discard the other.
              try {
                this.#emitPhase('setViewport', null, isExternal, `📷 ${key} ↔️ ${preset.width}px`);
                await beforePageScan(page, url, {
                  name: key,
                  width: preset.width,
                  resolution: preset.resolution,
                  listener,
                  timeout: 5000,
                });

                this.#emitPhase('waitImageLoad', null, isExternal, `📷 ${key}: Waiting for images%dots%`);
                // Best-effort: extraction continues even when some images never finish loading.
                await page
                  .waitForFunction(() => [...document.images].every((img) => img.complete), {
                    timeout: imageLoadTimeout,
                  })
                  .catch(() => { });

                this.#emitPhase('getImages', null, isExternal, `📸 ${key}: Extracting images%dots%`);
                const images = await getImageList(page, preset.width);
                imageList.push(...images);
              } catch (error) {
                const errorMessage = error instanceof Error ? error.message : String(error);
                log('Error(FETCH_IMAGES/%s): %s', key, errorMessage);
                // Reports the page's real `isExternal` flag, like every other event of this method.
                this.#emitPhase('retryExhausted', null, isExternal, `📷 ${key}: skipped — ${errorMessage}`);
              }
            }
            return imageList;
          }, "#fetchImages"),
        }),
        _private_fetchImages_decorators,
        { kind: "method", name: "#fetchImages", static: false, private: true, access: { has: (obj) => #fetchImages in obj, get: (obj) => obj.#fetchImages }, metadata: _metadata },
        null,
        _instanceExtraInitializers,
      );

      if (_metadata) {
        Object.defineProperty(this, Symbol.metadata, { enumerable: true, configurable: true, writable: true, value: _metadata });
      }
    }

    /**
     * Number of retries for `@retryable`-decorated methods. Set per-scrape from options.
     * The initializer also runs the extra initializers registered by the decorators.
     */
    retries = __runInitializers(this, _instanceExtraInitializers);

    /** Cleanup function to remove page listeners registered by `#fetchData`. */
    #pageListenerCleanup = null;

    /**
     * Begins the scraping process for a given URL on the provided Puppeteer page.
     *
     * Returns a `ScrapeResult` containing the outcome:
     * - `type: "success"` with `pageData` on success
     * - `type: "skipped"` with `ignored` details when the page is excluded
     * - `type: "error"` with `error` details when scraping fails
     *
     * Sub-resources captured while the page loads are pushed into
     * `ScrapeResult.resources` and also emitted as `resourceResponse` events.
     * The keyword-exclusion check on the fetched HTML happens here, after
     * `#fetchData` has returned.
     * @param page - The Puppeteer page instance to use for navigation and DOM evaluation.
     * @param url - The extended URL to scrape.
     * @param options - Optional scraper configuration overriding defaults.
     * @param isSkip - When `true`, the page is immediately skipped without any network requests.
     * @returns The scrape result containing the outcome and captured resources.
     */
    async scrapeStart(page, url, options, isSkip = false) {
      this.retries = options?.retries;
      const isExternal = options?.isExternal ?? false;
      const captureImages = options?.captureImages ?? true;
      const excludeKeywords = options?.excludeKeywords ?? [];
      const metadataOnly = options?.metadataOnly ?? false;
      const imageLoadTimeout = options?.imageLoadTimeout ?? 5000;
      const resources = [];
      const failedRequests = [];

      this.#emitPhase('scrapeStart', url, isExternal);

      // Path-excluded: return SkippedPageData
      if (isSkip) {
        this.#emitPhase('pageSkipped', url, isExternal, 'Matched: excluded path');
        return {
          type: 'skipped',
          resources,
          ignored: {
            url,
            matchedText: url.pathname || '',
            excludeKeywords,
          },
        };
      }

      // Non-HTTP protocol: return minimal PageData
      if (!url.isHTTP) {
        const result = {
          url,
          isTarget: false,
          isExternal,
          redirectPaths: [],
          status: -1,
          statusText: '__THIS_IS_NOT_HTTP_PROTOCOL__',
          contentType: null,
          contentLength: null,
          responseHeaders: {},
          meta: {
            title: '',
          },
          imageList: [],
          anchorList: [],
          html: '',
          isSkipped: false,
        };
        this.#emitPhase('scrapeEnd', url, isExternal);
        return { type: 'success', pageData: result, resources };
      }

      let headResult = options?.headCheckResult ?? null;

      // Metadata-only mode: a supplied HEAD result is returned without opening the page.
      if (headResult && metadataOnly) {
        this.#emitPhase('scrapeEnd', url, isExternal);
        return {
          type: 'success',
          pageData: {
            ...headResult,
            isTarget: false,
          },
          resources,
        };
      }

      // Open the page unless a HEAD result already shows it is not HTML.
      if (headResult === null || headResult.contentType === 'text/html') {
        let fetchResult;
        try {
          fetchResult = await this.#fetchData(page, url, isExternal, captureImages, imageLoadTimeout, resources, failedRequests, options);
        } catch (error) {
          // Normalize non-Error rejections so the error payload below is always well-formed.
          fetchResult = error instanceof Error ? error : new Error(String(error), { cause: error });
        } finally {
          // Listeners must not outlive the fetch, whether it succeeded or failed.
          this.#cleanupPageListeners();
        }

        if (fetchResult instanceof Error) {
          log('Error(FETCH_DATA): %s', url.href);
          return {
            type: 'error',
            resources,
            failedRequests: failedRequests.length > 0 ? failedRequests : undefined,
            error: {
              name: fetchResult.name,
              message: fetchResult.message,
              stack: fetchResult.stack,
              shutdown: true,
            },
          };
        }

        headResult = fetchResult;

        if (!headResult.isSkipped) {
          const checkedKeyword = keywordCheck(headResult.html, excludeKeywords);
          if (checkedKeyword) {
            headResult = {
              url,
              isSkipped: true,
              matched: {
                type: 'keyword',
                text: checkedKeyword,
                excludeKeywords,
              },
            };
          }
        }

        if (headResult.isSkipped) {
          if (headResult.matched.type === 'path') {
            return {
              type: 'skipped',
              resources,
              ignored: {
                url,
                matchedText: url.pathname || '',
                excludeKeywords,
              },
            };
          }
          this.#emitPhase('pageSkipped', url, isExternal, `Matched: "${headResult.matched.text}"`);
          return {
            type: 'skipped',
            resources,
            ignored: {
              url,
              matchedText: headResult.matched.text,
              excludeKeywords,
            },
          };
        }
      }

      this.#emitPhase('scrapeEnd', url, isExternal);
      return {
        type: 'success',
        pageData: headResult,
        resources,
        failedRequests: failedRequests.length > 0 ? failedRequests : undefined,
      };
    }

    /**
     * Emits a `changePhase` event without awaiting it (fire-and-forget).
     * @param name - Phase name
     * @param url - URL the phase relates to, or `null`
     * @param isExternal - Whether the page is external to the crawl scope
     * @param message - Optional progress message
     */
    #emitPhase(name, url, isExternal, message = '') {
      void this.emit('changePhase', {
        pid: process.pid,
        name,
        url,
        isExternal,
        message,
      });
    }

    /** Removes the page listeners registered by `#fetchData`, if any, and forgets them. */
    #cleanupPageListeners() {
      if (this.#pageListenerCleanup) {
        this.#pageListenerCleanup();
        this.#pageListenerCleanup = null;
      }
    }

    /**
     * Creates a callback for `@d-zero/puppeteer-page-scan`'s `beforePageScan` listener.
     *
     * WHY a separate factory: The listener must capture `isExternal` for phase events
     * while conforming to the `beforePageScan` listener signature.
     * Currently only handles the `scroll` phase to report scroll progress.
     * @param isExternal - Whether the current page is external to the crawl scope
     * @returns A listener function compatible with `beforePageScan`'s `listener` option
     */
    #createPageScanListener(isExternal) {
      return (phase, data) => {
        if (phase !== 'scroll') {
          return;
        }
        const { scrollY, scrollHeight, message } = data;
        // Only print a percentage when the height can be divided by; a zero,
        // missing or NaN height would otherwise render as "NaN%" / "Infinity%".
        const hasMeasurableHeight = Number.isFinite(scrollHeight) && scrollHeight > 0;
        const scrollMsg = hasMeasurableHeight
          ? `%propeller% ${scrollY}px/${scrollHeight}px (${Math.round((scrollY / scrollHeight) * 100)}%) ${message}`
          : `%propeller% ${message}`;
        this.#emitPhase('scrollToBottom', null, isExternal, scrollMsg);
      };
    }

    /**
     * Navigates the page to the target URL and extracts full page data.
     *
     * WHY retryable with 3-min timeout: Page navigation can fail due to transient
     * network issues or slow-loading pages. The decorator retries automatically,
     * emitting `retryWait` / `retryExhausted` phase events for progress monitoring.
     *
     * Flow:
     * 1. Register request/response/requestfailed listeners to capture sub-resources (internal pages only)
     * 2. Navigate to URL via `page.goto()` and track redirect chain
     * 3. Return early for non-HTML responses and for external pages
     * 4. Wait for DOM content and network idle
     * 5. Check for network disconnection errors and throw to trigger retry
     * 6. Extract anchors, meta, and optionally images
     *
     * Keyword exclusion is NOT checked here; `scrapeStart` does that on the returned HTML.
     * @param page - Puppeteer page instance
     * @param url - Target URL to navigate to
     * @param isExternal - Whether the URL is external to the crawl scope
     * @param captureImages - Whether to run the image extraction pipeline
     * @param imageLoadTimeout - Timeout (ms) for waiting lazy-loaded images to complete
     * @param resources - Mutable array to collect captured sub-resources into
     * @param failedRequests - Mutable array to collect failed sub-resource requests into
     * @param options - Additional scraper options (e.g. `disableQueries`, `navigationTimeout`)
     * @returns Page data for the navigated URL (`isSkipped` is always `false`)
     */
    get #fetchData() {
      return _private_fetchData_descriptor.value;
    }

    /**
     * Extracts image data from the page across multiple device presets.
     *
     * WHY multiple device presets: Images may differ between desktop and mobile
     * due to responsive `<picture>` / `srcset`. Capturing both `desktop-compact`
     * and `mobile-small` viewports reveals responsive image issues.
     *
     * WHY per-device try-catch: Some pages (e.g. those using fullpage.js or
     * scroll-jacking libraries) destroy the execution context when the viewport
     * changes and triggers a reload. Isolating each device preset allows partial
     * results — if one viewport fails, the other can still succeed.
     *
     * WHY retryable with 5-min timeout and `fallback: []`: Image extraction is
     * best-effort. If all retries fail, an empty array is returned rather than
     * failing the entire page scrape.
     * @param page - Puppeteer page instance
     * @param url - The page URL string (without hash and auth)
     * @param isExternal - Whether the page is external
     * @param imageLoadTimeout - Timeout (ms) for waiting images to complete loading
     * @returns Array of image elements from all device presets (may be partial if some viewports failed)
     */
    get #fetchImages() {
      return _private_fetchImages_descriptor.value;
    }
  };
})();
// eslint-disable-next-line unicorn/prefer-event-target -- TypedAwaitEventEmitter is a project-specific typed wrapper, not Node.js EventEmitter
export default Scraper;