npm - @d-zero/beholder - Versions diffs - 0.1.29 → 2.0.0 - Mend

@d-zero/beholder 0.1.29 → 2.0.0

This diff shows the differences between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions exactly as they appear in their respective public registries.

Files changed (49) hide show

package/CHANGELOG.md +11 -0
package/README.md +172 -477
package/dist/debug.d.ts +4 -1
package/dist/debug.js +5 -2
package/dist/dom-evaluation.d.ts +72 -14
package/dist/dom-evaluation.js +169 -43
package/dist/index.d.ts +20 -3
package/dist/index.js +15 -3
package/dist/is-error.d.ts +8 -0
package/dist/is-error.js +10 -0
package/dist/keyword-check.d.ts +5 -3
package/dist/keyword-check.js +5 -3
package/dist/parse-url.d.ts +14 -0
package/dist/parse-url.js +23 -0
package/dist/scraper.d.ts +39 -13
package/dist/scraper.js +300 -263
package/dist/types.d.ts +286 -214
package/dist/types.js +6 -0
package/package.json +7 -10
package/src/debug.ts +5 -2
package/src/dom-evaluation.ts +195 -65
package/src/index.ts +27 -3
package/src/is-error.spec.ts +33 -0
package/src/is-error.ts +10 -0
package/src/keyword-check.spec.ts +45 -4
package/src/keyword-check.ts +5 -3
package/src/parse-url.spec.ts +35 -0
package/src/parse-url.ts +26 -0
package/src/scraper.ts +338 -300
package/src/types.ts +345 -258
package/tsconfig.tsbuildinfo +1 -1
package/dist/events.d.ts +0 -32
package/dist/events.js +0 -15
package/dist/fetch-destination.d.ts +0 -8
package/dist/fetch-destination.js +0 -145
package/dist/net-timeout-error.d.ts +0 -3
package/dist/net-timeout-error.js +0 -3
package/dist/sub-process-runner.d.ts +0 -12
package/dist/sub-process-runner.js +0 -180
package/dist/sub-process.d.ts +0 -1
package/dist/sub-process.js +0 -67
package/dist/utils.d.ts +0 -16
package/dist/utils.js +0 -69
package/src/events.ts +0 -21
package/src/fetch-destination.ts +0 -173
package/src/net-timeout-error.ts +0 -3
package/src/sub-process-runner.ts +0 -220
package/src/sub-process.ts +0 -86
package/src/utils.ts +0 -89

package/dist/scraper.js CHANGED Viewed

@@ -36,106 +36,76 @@ var __setFunctionName = (this && this.__setFunctionName) || function (f, name, p
     if (typeof name === "symbol") name = name.description ? "[".concat(name.description, "]") : "";
     return Object.defineProperty(f, "name", { configurable: true, value: prefix ? "".concat(prefix, " ", name) : name });
 };
-import { beforePageScan } from '@d-zero/puppeteer-page-scan';
-import { parseUrl } from '@d-zero/shared/parse-url';
-import { retry } from '@d-zero/shared/retry';
-import { TypedAwaitEventEmitter } from '@d-zero/shared/typed-await-event-emitter';
-import { launch } from 'puppeteer';
+import { beforePageScan, devicePresets } from '@d-zero/puppeteer-page-scan';
+import { detectCDN } from '@d-zero/shared/detect-cdn';
+import { detectCompress } from '@d-zero/shared/detect-compress';
+import { retry as retryable } from '@d-zero/shared/retry';
+import { TypedAwaitEventEmitter as EventEmitter } from '@d-zero/shared/typed-await-event-emitter';
 import { resourceLog, scraperLog } from './debug.js';
 import { getAnchorList, getImageList, getMeta } from './dom-evaluation.js';
-import { fetchDestination } from './fetch-destination.js';
+import { isError } from './is-error.js';
 import { keywordCheck } from './keyword-check.js';
-import { detectCDN, detectCompress, isError } from './utils.js';
+import { parseUrl } from './parse-url.js';
 const pid = `${process.pid}`;
 const log = scraperLog.extend(pid);
 const rLog = resourceLog.extend(pid);
-const LAUNCH_BROWSER_TIMEOUT = 1000 * 30;
 let Scraper = (() => {
-    let _classSuper = TypedAwaitEventEmitter;
+    let _classSuper = EventEmitter;
     let _instanceExtraInitializers = [];
-    let _private_bootBrowser_decorators;
-    let _private_bootBrowser_descriptor;
-    let _private_createPage_decorators;
-    let _private_createPage_descriptor;
     let _private_fetchData_decorators;
     let _private_fetchData_descriptor;
-    let _private_fetchHead_decorators;
-    let _private_fetchHead_descriptor;
     let _private_fetchImages_decorators;
     let _private_fetchImages_descriptor;
     return class Scraper extends _classSuper {
         static {
             const _metadata = typeof Symbol === "function" && Symbol.metadata ? Object.create(_classSuper[Symbol.metadata] ?? null) : void 0;
-            _private_bootBrowser_decorators = [retry()];
-            _private_createPage_decorators = [retry()];
-            _private_fetchData_decorators = [retry({
-                    timeout: 1 * 60 * 1000, // 1sec,
+            _private_fetchData_decorators = [retryable({
+                    timeout: 3 * 60 * 1000,
+                    onWait(determinedInterval, retryCount, methodName, error) {
+                        void this.emit('changePhase', {
+                            pid: process.pid,
+                            name: 'retryWait',
+                            url: null,
+                            isExternal: false,
+                            message: `${methodName}: ${error.message} — %countdown(${determinedInterval},${methodName}_${retryCount},s)%s (retry #${retryCount + 1})`,
+                        });
+                    },
+                    onGiveUp(retryCount, error, methodName) {
+                        void this.emit('changePhase', {
+                            pid: process.pid,
+                            name: 'retryExhausted',
+                            url: null,
+                            isExternal: false,
+                            message: `${methodName}: gave up after ${retryCount} retries — ${error.message}`,
+                        });
+                    },
                 })];
-            _private_fetchHead_decorators = [retry()];
-            _private_fetchImages_decorators = [retry({
-                    timeout: 5 * 60 * 1000, // 5sec
+            _private_fetchImages_decorators = [retryable({
+                    timeout: 5 * 60 * 1000,
                     fallback: [],
-                })];
-            __esDecorate(this, _private_bootBrowser_descriptor = { value: __setFunctionName(async function (isExternal, executablePath, headless) {
-                    if (!this.#browser) {
+                    onWait(determinedInterval, retryCount, methodName, error) {
                         void this.emit('changePhase', {
                             pid: process.pid,
-                            name: 'launchBrowser',
-                            url: this.#url,
-                            isExternal,
-                            message: executablePath || '(executablePath is default)',
+                            name: 'retryWait',
+                            url: null,
+                            isExternal: false,
+                            message: `${methodName}: ${error.message} — %countdown(${determinedInterval},${methodName}_${retryCount},s)%s (retry #${retryCount + 1} / images)`,
                         });
-                        const browser = await launch({
-                            headless,
-                            timeout: LAUNCH_BROWSER_TIMEOUT,
-                            executablePath: executablePath ?? undefined,
-                            args: [
-                                // TODO: Optional lang
-                                '--lang=ja',
-                                '--no-zygote',
-                                '--ignore-certificate-errors',
-                            ],
-                        }).catch((error) => {
-                            if (error instanceof Error) {
-                                return error;
-                            }
-                            throw error;
+                    },
+                    onGiveUp(retryCount, error, methodName) {
+                        void this.emit('changePhase', {
+                            pid: process.pid,
+                            name: 'retryExhausted',
+                            url: null,
+                            isExternal: false,
+                            message: `${methodName}: gave up after ${retryCount} retries — ${error.message}`,
                         });
-                        if (browser instanceof Error) {
-                            void this.emit('error', {
-                                pid: process.pid,
-                                url: this.#url,
-                                shutdown: false,
-                                error: browser,
-                            });
-                            throw browser;
-                        }
-                        this.#browser = browser;
-                    }
-                    else if (!this.#browser.isConnected()) {
-                        await this.#browser.close();
-                    }
-                    return this.#browser;
-                }, "#bootBrowser") }, _private_bootBrowser_decorators, { kind: "method", name: "#bootBrowser", static: false, private: true, access: { has: obj => #bootBrowser in obj, get: obj => obj.#bootBrowser }, metadata: _metadata }, null, _instanceExtraInitializers);
-            __esDecorate(this, _private_createPage_descriptor = { value: __setFunctionName(async function (isExternal, executablePath, headless) {
-                    const browser = await this.#bootBrowser(isExternal, executablePath, headless);
-                    void this.emit('changePhase', {
-                        pid: process.pid,
-                        name: 'newPage',
-                        url: this.#url,
-                        isExternal,
-                        message: '',
-                    });
-                    const page = await browser.newPage();
-                    page.setDefaultNavigationTimeout(0);
-                    await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36');
-                    await page.setExtraHTTPHeaders({
-                        // TODO: Optional lang
-                        'Accept-Language': 'ja-JP',
-                    });
-                    return page;
-                }, "#createPage") }, _private_createPage_decorators, { kind: "method", name: "#createPage", static: false, private: true, access: { has: obj => #createPage in obj, get: obj => obj.#createPage }, metadata: _metadata }, null, _instanceExtraInitializers);
-            __esDecorate(this, _private_fetchData_descriptor = { value: __setFunctionName(async function (page, url, isExternal, isGettingImages, options) {
+                    },
+                })];
+            __esDecorate(this, _private_fetchData_descriptor = { value: __setFunctionName(async function (page, url, isExternal, captureImages, imageLoadTimeout, resources, options) {
+                    const parseOpts = options?.disableQueries == null
+                        ? undefined
+                        : { disableQueries: options.disableQueries };
                     const networkLogs = {};
                     page.on('dialog', async (dialog) => {
                         log(`Appear ${dialog.type()} dialog: ${dialog.message()}`);
@@ -149,7 +119,7 @@ let Scraper = (() => {
                     });
                     if (!isExternal) {
                         page.on('request', (request) => {
-                            const url = parseUrl(request.url(), options);
+                            const url = parseUrl(request.url(), parseOpts);
                             networkLogs[request.url()] = {
                                 url,
                                 status: null,
@@ -165,7 +135,7 @@ let Scraper = (() => {
                         });
                         const uniqueRes = new Set();
                         page.on('response', (response) => {
-                            const resURL = parseUrl(response.url(), options);
+                            const resURL = parseUrl(response.url(), parseOpts);
                             if (uniqueRes.has(resURL.withoutHash)) {
                                 return;
                             }
@@ -206,6 +176,9 @@ let Scraper = (() => {
                                 headers: headers,
                             };
                             rLog('Fetched: %s', resURL.href);
+                            // Collect resource into the results array
+                            resources.push({ log, resource: referredLink, pageUrl: url.withoutHash });
+                            // Also emit for streaming consumers
                             void this.emit('resourceResponse', {
                                 pid: process.pid,
                                 url,
@@ -214,29 +187,34 @@ let Scraper = (() => {
                             });
                         });
                     }
+                    const navigationTimeout = options?.navigationTimeout ?? 60_000;
                     void this.emit('changePhase', {
                         pid: process.pid,
                         name: 'openPage',
-                        url: this.#url,
+                        url,
                         isExternal,
-                        message: '',
+                        message: `%countdown(${navigationTimeout},openPage_${url.withoutHash},s)%s`,
                     });
                     if (url.username && url.password) {
                         await page.setExtraHTTPHeaders({
                             Authorization: `Basic ${Buffer.from(`${url.username}:${url.password}`).toString('base64')}`,
                         });
                     }
-                    const res = await page.goto(url.withoutHashAndAuth);
+                    const res = await page.goto(url.withoutHashAndAuth, { timeout: navigationTimeout });
                     if (!res) {
                         throw new Error('The method Page.goto returned null');
                     }
-                    const destUrl = parseUrl(page.url(), options);
-                    const redirectPaths = res
-                        .request()
-                        .redirectChain()
-                        .map((req) => req.url());
-                    if (destUrl.withoutHash !== url.withoutHash) {
-                        redirectPaths.push(destUrl.withoutHash);
+                    const destUrl = parseUrl(page.url(), parseOpts);
+                    const redirectPaths = new Set();
+                    if (url.withoutHash !== destUrl.withoutHash) {
+                        const redirectChain = res
+                            .request()
+                            .redirectChain()
+                            .map((req) => req.url());
+                        for (const redirectPath of redirectChain) {
+                            redirectPaths.add(redirectPath);
+                        }
+                        redirectPaths.add(destUrl.withoutHash);
                     }
                     if (destUrl.hostname !== url.hostname) {
                         isExternal = true;
@@ -252,7 +230,7 @@ let Scraper = (() => {
                             url,
                             isTarget: false,
                             isExternal,
-                            redirectPaths,
+                            redirectPaths: [...redirectPaths],
                             status,
                             statusText,
                             contentType,
@@ -270,7 +248,7 @@ let Scraper = (() => {
                     void this.emit('changePhase', {
                         pid: process.pid,
                         name: 'loadDOMContent',
-                        url: this.#url,
+                        url,
                         isExternal,
                         message: '',
                     });
@@ -280,7 +258,7 @@ let Scraper = (() => {
                     void this.emit('changePhase', {
                         pid: process.pid,
                         name: 'getHTML',
-                        url: this.#url,
+                        url,
                         isExternal,
                         message: '',
                     });
@@ -296,7 +274,7 @@ let Scraper = (() => {
                             url,
                             isTarget: false,
                             isExternal,
-                            redirectPaths,
+                            redirectPaths: [...redirectPaths],
                             status,
                             statusText,
                             contentType,
@@ -313,8 +291,8 @@ let Scraper = (() => {
                     }
                     void this.emit('changePhase', {
                         pid: process.pid,
-                        name: 'waitNetworkIdleZero',
-                        url: this.#url,
+                        name: 'waitNetworkIdle',
+                        url,
                         isExternal,
                         message: '',
                     });
@@ -324,25 +302,36 @@ let Scraper = (() => {
                     void this.emit('changePhase', {
                         pid: process.pid,
                         name: 'getAnchors',
-                        url: this.#url,
+                        url,
                         isExternal,
                         message: '',
                     });
-                    const anchorList = await getAnchorList(page, options);
+                    const anchorList = await getAnchorList(page, parseOpts);
                     void this.emit('changePhase', {
                         pid: process.pid,
                         name: 'getMeta',
-                        url: this.#url,
+                        url,
                         isExternal,
                         message: '',
                     });
                     const meta = await getMeta(page);
-                    const imageList = isGettingImages ? await this.#fetchImages(page, isExternal) : [];
+                    const imageList = captureImages
+                        ? await (async () => {
+                            void this.emit('changePhase', {
+                                pid: process.pid,
+                                name: 'extractImages',
+                                url,
+                                isExternal,
+                                message: '',
+                            });
+                            return this.#fetchImages(page, url.withoutHashAndAuth, isExternal, imageLoadTimeout);
+                        })()
+                        : [];
                     return {
                         url,
                         isTarget: true,
                         isExternal,
-                        redirectPaths,
+                        redirectPaths: [...redirectPaths],
                         status,
                         statusText,
                         contentType,
@@ -355,126 +344,110 @@ let Scraper = (() => {
                         isSkipped: false,
                     };
                 }, "#fetchData") }, _private_fetchData_decorators, { kind: "method", name: "#fetchData", static: false, private: true, access: { has: obj => #fetchData in obj, get: obj => obj.#fetchData }, metadata: _metadata }, null, _instanceExtraInitializers);
-            __esDecorate(this, _private_fetchHead_descriptor = { value: __setFunctionName(async function (url, isExternal) {
-                    return await fetchDestination(url, isExternal);
-                }, "#fetchHead") }, _private_fetchHead_decorators, { kind: "method", name: "#fetchHead", static: false, private: true, access: { has: obj => #fetchHead in obj, get: obj => obj.#fetchHead }, metadata: _metadata }, null, _instanceExtraInitializers);
-            __esDecorate(this, _private_fetchImages_descriptor = { value: __setFunctionName(async function (page, isExternal) {
-                    const url = this.#url.withoutHashAndAuth;
-                    const imageList = [];
+            __esDecorate(this, _private_fetchImages_descriptor = { value: __setFunctionName(async function (page, url, isExternal, imageLoadTimeout) {
+                    const listener = this.#createPageScanListener(isExternal);
                     const devices = [
-                        { name: 'desktop', width: 1280 },
-                        { name: 'mobile', width: 320, resolution: 2 },
+                        { key: 'desktop-compact', preset: devicePresets['desktop-compact'] },
+                        { key: 'mobile-small', preset: devicePresets['mobile-small'] },
                     ];
-                    for (const device of devices) {
+                    const imageList = [];
+                    for (const { key, preset } of devices) {
                         void this.emit('changePhase', {
                             pid: process.pid,
                             name: 'setViewport',
-                            url: this.#url,
+                            url: null,
                             isExternal,
-                            message: device.name,
+                            message: `📷 ${key} ↔️ ${preset.width}px`,
                         });
                         await beforePageScan(page, url, {
-                            name: device.name,
-                            width: device.width,
-                            resolution: device.resolution,
+                            name: key,
+                            width: preset.width,
+                            resolution: preset.resolution,
+                            listener,
                             timeout: 5000,
                         });
+                        void this.emit('changePhase', {
+                            pid: process.pid,
+                            name: 'waitImageLoad',
+                            url: null,
+                            isExternal,
+                            message: `📷 ${key}: Waiting for images%dots%`,
+                        });
+                        await page
+                            .waitForFunction(() => [...document.images].every((img) => img.complete), {
+                            timeout: imageLoadTimeout,
+                        })
+                            .catch(() => { });
                         void this.emit('changePhase', {
                             pid: process.pid,
                             name: 'getImages',
-                            url: this.#url,
+                            url: null,
                             isExternal,
-                            message: device.name,
+                            message: `📸 ${key}: Extracting images%dots%`,
                         });
-                        const images = await getImageList(page, device.width);
+                        const images = await getImageList(page, preset.width);
                         imageList.push(...images);
                     }
                     return imageList;
                 }, "#fetchImages") }, _private_fetchImages_decorators, { kind: "method", name: "#fetchImages", static: false, private: true, access: { has: obj => #fetchImages in obj, get: obj => obj.#fetchImages }, metadata: _metadata }, null, _instanceExtraInitializers);
             if (_metadata) Object.defineProperty(this, Symbol.metadata, { enumerable: true, configurable: true, writable: true, value: _metadata });
         }
-        #browser = (__runInitializers(this, _instanceExtraInitializers), null);
-        #url = null;
-        async destroy(isExternal) {
-            log('Scraper destroys self');
-            if (!this.#url) {
-                throw new Error('The instance is already destroyed.');
-            }
-            if (!this.#browser) {
-                void this.emit('destroyed', {
-                    pid: process.pid,
-                });
-                void this.emit('changePhase', {
-                    pid: process.pid,
-                    name: 'destroyed',
-                    url: this.#url,
-                    isExternal,
-                    message: '',
-                });
-                return;
-            }
-            while (!this.#browser.isConnected()) {
-                log('Browser closes all pages');
-                const pages = await this.#browser.pages();
-                for (const page of pages) {
-                    page.removeAllListeners();
-                    if (!page.isClosed) {
-                        await page.close();
-                    }
-                }
-                log('Browser closes self');
-                await this.#browser.close();
-                log('Browser disconnects');
-                await this.#browser.disconnect();
-            }
-            log('Scraper discards browser');
-            this.#browser = null;
-            void this.emit('destroyed', {
-                pid: process.pid,
-            });
-            void this.emit('changePhase', {
-                pid: process.pid,
-                name: 'destroyed',
-                url: this.#url,
-                isExternal,
-                message: '',
-            });
-        }
-        async scrapeStart(url, options, isSkip = false) {
+        /** Number of retries for `@retryable`-decorated methods. Set per-scrape from options. */
+        retries = __runInitializers(this, _instanceExtraInitializers);
+        /**
+         * Begins the scraping process for a given URL on the provided Puppeteer page.
+         *
+         * Returns a `ScrapeResult` containing the outcome:
+         * - `type: "success"` with `pageData` on success
+         * - `type: "skipped"` with `ignored` details when the page is excluded
+         * - `type: "error"` with `error` details when scraping fails
+         *
+         * Sub-resources are collected via the `resourceResponse` event and
+         * included in the returned `ScrapeResult.resources`.
+         * @param page - The Puppeteer page instance to use for navigation and DOM evaluation.
+         * @param url - The extended URL to scrape.
+         * @param options - Optional scraper configuration overriding defaults.
+         * @param isSkip - When `true`, the page is immediately skipped without any network requests.
+         * @returns The scrape result containing the outcome and captured resources.
+         */
+        async scrapeStart(page, url, options, isSkip = false) {
+            this.retries = options?.retries;
             const isExternal = options?.isExternal ?? false;
-            const isGettingImages = options?.isGettingImages ?? true;
+            const captureImages = options?.captureImages ?? true;
             const excludeKeywords = options?.excludeKeywords ?? [];
-            const executablePath = options?.executablePath ?? null;
-            const isTitleOnly = options?.isTitleOnly ?? false;
-            this.#url = url;
+            const metadataOnly = options?.metadataOnly ?? false;
+            const imageLoadTimeout = options?.imageLoadTimeout ?? 5000;
+            const resources = [];
             void this.emit('changePhase', {
                 pid: process.pid,
                 name: 'scrapeStart',
-                url: this.#url,
+                url,
                 isExternal,
                 message: '',
             });
+            // Path-excluded: return SkippedPageData
             if (isSkip) {
-                void this.emit('ignoreAndSkip', {
-                    pid: process.pid,
-                    url: this.#url,
-                    reason: {
-                        matchedText: this.#url.pathname || '',
-                        excludeKeywords,
-                    },
-                });
                 void this.emit('changePhase', {
                     pid: process.pid,
-                    name: 'ignoreAndSkip',
-                    url: this.#url,
+                    name: 'pageSkipped',
+                    url,
                     isExternal,
                     message: 'Matched: excluded path',
                 });
-                return;
+                return {
+                    type: 'skipped',
+                    resources,
+                    ignored: {
+                        url,
+                        matchedText: url.pathname || '',
+                        excludeKeywords,
+                    },
+                };
             }
-            if (!this.#url.isHTTP) {
+            // Non-HTTP protocol: return minimal PageData
+            if (!url.isHTTP) {
                 const result = {
-                    url: this.#url,
+                    url,
                     isTarget: false,
                     isExternal,
                     redirectPaths: [],
@@ -491,79 +464,60 @@ let Scraper = (() => {
                     html: '',
                     isSkipped: false,
                 };
-                void this.emit('scrapeEnd', {
-                    pid: process.pid,
-                    url: this.#url,
-                    timestamp: Date.now(),
-                    result,
-                });
                 void this.emit('changePhase', {
                     pid: process.pid,
                     name: 'scrapeEnd',
-                    url: this.#url,
+                    url,
                     isExternal,
                     message: '',
                 });
-                return;
+                return { type: 'success', pageData: result, resources };
             }
-            void this.emit('changePhase', {
-                pid: process.pid,
-                name: 'touchHead',
-                url: this.#url,
-                isExternal,
-                message: '',
-            });
-            let result = await this.#fetchHead(url, isExternal);
-            if (result instanceof Error) {
-                log('Error(FETCH_HEAD): %s', url.href);
-                void this.emit('error', {
+            let headResult = options?.headCheckResult ?? null;
+            if (headResult && metadataOnly) {
+                void this.emit('changePhase', {
                     pid: process.pid,
-                    url: this.#url,
-                    shutdown: false,
-                    error: result,
+                    name: 'scrapeEnd',
+                    url,
+                    isExternal,
+                    message: '',
                 });
-                result = null;
-            }
-            if (result && isTitleOnly) {
-                void this.emit('scrapeEnd', {
-                    pid: process.pid,
-                    url: this.#url,
-                    timestamp: Date.now(),
-                    result: {
-                        ...result,
+                return {
+                    type: 'success',
+                    pageData: {
+                        ...headResult,
                         isTarget: false,
                     },
-                });
-                return;
+                    resources,
+                };
             }
-            if (result === null || result.contentType === 'text/html') {
-                const headlessMode = url.isSecure ? true : 'shell';
-                const page = await this.#createPage(isExternal, executablePath, headlessMode);
-                result = await this.#fetchData(page, url, isExternal, isGettingImages, options).catch((error) => {
+            if (headResult === null || headResult.contentType === 'text/html') {
+                const fetchResult = await this.#fetchData(page, url, isExternal, captureImages, imageLoadTimeout, resources, options).catch((error) => {
                     if (error instanceof Error) {
                         return error;
                     }
                     return new Error(error);
                 });
-                if (result instanceof Error) {
+                if (fetchResult instanceof Error) {
                     log('Error(FETCH_DATA): %s', url.href);
-                    void this.emit('error', {
-                        pid: process.pid,
-                        url: this.#url,
-                        shutdown: true,
-                        error: result,
-                    });
-                    await this.destroy(isExternal);
-                    return;
+                    page.removeAllListeners();
+                    return {
+                        type: 'error',
+                        resources,
+                        error: {
+                            name: fetchResult.name,
+                            message: fetchResult.message,
+                            stack: fetchResult.stack,
+                            shutdown: true,
+                        },
+                    };
                 }
                 page.removeAllListeners();
-                if (!page.isClosed) {
-                    await page.close();
-                }
-                if (!result.isSkipped) {
-                    const checkedKeyword = keywordCheck(result.html, excludeKeywords);
+                headResult = fetchResult;
+                if (!headResult.isSkipped) {
+                    const checkedKeyword = keywordCheck(headResult.html, excludeKeywords);
                     if (checkedKeyword) {
-                        result = {
+                        headResult = {
                             url,
                             isSkipped: true,
                             matched: {
@@ -574,48 +528,131 @@ let Scraper = (() => {
                         };
                     }
                 }
-                if (result.isSkipped) {
-                    if (result.matched.type === 'path') {
-                        return;
+                if (headResult.isSkipped) {
+                    if (headResult.matched.type === 'path') {
+                        return {
+                            type: 'skipped',
+                            resources,
+                            ignored: {
+                                url,
+                                matchedText: url.pathname || '',
+                                excludeKeywords,
+                            },
+                        };
                     }
-                    void this.emit('ignoreAndSkip', {
-                        pid: process.pid,
-                        url: this.#url,
-                        reason: {
-                            matchedText: result.matched.text,
-                            excludeKeywords,
-                        },
-                    });
                     void this.emit('changePhase', {
                         pid: process.pid,
-                        name: 'ignoreAndSkip',
-                        url: this.#url,
+                        name: 'pageSkipped',
+                        url,
                         isExternal,
-                        message: `Matched: "${result.matched.text}"`,
+                        message: `Matched: "${headResult.matched.text}"`,
                     });
-                    return;
+                    return {
+                        type: 'skipped',
+                        resources,
+                        ignored: {
+                            url,
+                            matchedText: headResult.matched.text,
+                            excludeKeywords,
+                        },
+                    };
                 }
             }
-            void this.emit('scrapeEnd', {
-                pid: process.pid,
-                url: this.#url,
-                timestamp: Date.now(),
-                result,
-            });
             void this.emit('changePhase', {
                 pid: process.pid,
                 name: 'scrapeEnd',
-                url: this.#url,
+                url,
                 isExternal,
                 message: '',
             });
-            return result;
+            return { type: 'success', pageData: headResult, resources };
+        }
+        /**
+         * Creates a callback for `@d-zero/puppeteer-page-scan`'s `beforePageScan` listener.
+         *
+         * WHY a separate factory: The listener must capture `isExternal` for phase events
+         * while conforming to the `beforePageScan` listener signature.
+         * Currently only handles the `scroll` phase to report scroll progress.
+         * @param isExternal - Whether the current page is external to the crawl scope
+         * @returns A listener function compatible with `beforePageScan`'s `listener` option
+         */
+        #createPageScanListener(isExternal) {
+            return (phase, data) => {
+                switch (phase) {
+                    case 'scroll': {
+                        const d = data;
+                        const scrollMsg = Number.isNaN(d.scrollHeight)
+                            ? `%propeller% ${d.message}`
+                            : `%propeller% ${d.scrollY}px/${d.scrollHeight}px (${Math.round((d.scrollY / d.scrollHeight) * 100)}%) ${d.message}`;
+                        void this.emit('changePhase', {
+                            pid: process.pid,
+                            name: 'scrollToBottom',
+                            url: null,
+                            isExternal,
+                            message: scrollMsg,
+                        });
+                        break;
+                    }
+                }
+            };
         }
-        get #bootBrowser() { return _private_bootBrowser_descriptor.value; }
-        get #createPage() { return _private_createPage_descriptor.value; }
+        /**
+         * Navigates the page to the target URL and extracts full page data.
+         *
+         * WHY retryable with 3-min timeout: Page navigation can fail due to transient
+         * network issues or slow-loading pages. The decorator retries automatically,
+         * emitting `retryWait` / `retryExhausted` phase events for progress monitoring.
+         *
+         * Flow:
+         * 1. Register request/response listeners to capture sub-resources (internal pages only)
+         * 2. Navigate to URL via `page.goto()` and track redirect chain
+         * 3. Wait for DOM content and network idle
+         * 4. Extract anchors, meta, and optionally images
+         * 5. Check for keyword exclusion in HTML content
+         * @param page - Puppeteer page instance
+         * @param url - Target URL to navigate to
+         * @param isExternal - Whether the URL is external to the crawl scope
+         * @param captureImages - Whether to run the image extraction pipeline
+         * @param imageLoadTimeout - Timeout (ms) for waiting lazy-loaded images to complete
+         * @param resources - Mutable array to collect captured sub-resources into
+         * @param options - Additional scraper options (e.g. `disableQueries`, `navigationTimeout`)
+         * @returns Full page data or skipped page data if an exclusion rule matched
+         */
         get #fetchData() { return _private_fetchData_descriptor.value; }
-        get #fetchHead() { return _private_fetchHead_descriptor.value; }
+        /**
+         * Extracts image data from the page across multiple device presets.
+         *
+         * WHY multiple device presets: Images may differ between desktop and mobile
+         * due to responsive `<picture>` / `srcset`. Capturing both `desktop-compact`
+         * and `mobile-small` viewports reveals responsive image issues.
+         *
+         * WHY retryable with 5-min timeout and `fallback: []`: Image extraction is
+         * best-effort. If all retries fail, an empty array is returned rather than
+         * failing the entire page scrape.
+         * @param page - Puppeteer page instance
+         * @param url - The page URL string (without hash and auth)
+         * @param isExternal - Whether the page is external
+         * @param imageLoadTimeout - Timeout (ms) for waiting images to complete loading
+         * @returns Array of image elements from all device presets
+         */
         get #fetchImages() { return _private_fetchImages_descriptor.value; }
     };
 })();
+/**
+ * Page-level scraper that extracts data from a single browser page.
+ *
+ * The scraper returns results as values from `scrapeStart()` rather than
+ * emitting them as events. Only streaming events (changePhase, resourceResponse)
+ * are emitted for progress monitoring.
+ *
+ * The Puppeteer `Page` object is injected externally, and page lifecycle
+ * (including `page.close()`) is managed by the caller.
+ * @example
+ * ```ts
+ * const scraper = new Scraper();
+ * scraper.on('changePhase', (e) => console.log(e.name));
+ * const result = await scraper.scrapeStart(page, url, { isExternal: false });
+ * ```
+ */
+// eslint-disable-next-line unicorn/prefer-event-target -- TypedAwaitEventEmitter is a project-specific typed wrapper, not Node.js EventEmitter
 export default Scraper;