npm - @d-zero/beholder - Versions diffs - 2.1.5 → 2.1.6 - Mend

@d-zero/beholder 2.1.5 → 2.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12)

package/CHANGELOG.md +6 -0
package/README.md +9 -276
package/dist/dom-evaluation.d.ts +40 -50
package/dist/dom-evaluation.js +105 -160
package/dist/scraper.js +8 -6
package/dist/types.d.ts +6 -0
package/package.json +3 -3
package/src/dom-evaluation.spec.ts +293 -0
package/src/dom-evaluation.ts +148 -190
package/src/scraper.ts +14 -4
package/src/types.ts +6 -0
package/tsconfig.tsbuildinfo +1 -1

package/dist/dom-evaluation.js CHANGED Viewed

@@ -3,28 +3,43 @@
  *
  * These functions are called by {@link ./scraper.ts | Scraper.#fetchData} to extract
  * anchors, images, and meta information after page navigation completes.
+ *
+ * WHY timeouts everywhere: A page whose main thread is blocked (heavy JS, autoplay
+ * video players, infinite loops) makes every CDP round-trip hang. `getMeta` and
+ * `getImageList` therefore collect all data in a single `page.evaluate` and wrap it
+ * in {@link raceWithTimeout} so a blocked thread is abandoned after a bounded budget
+ * instead of accumulating per-property timeouts up to the caller's global timeout.
+ * Note that `page.evaluate` itself runs on the page's main thread and has no built-in
+ * timeout, so the surrounding race is what actually bounds the hang.
  * @see {@link ./types.ts} for the data types returned by these functions
  */
+import { raceWithTimeout } from '@d-zero/shared/race-with-timeout';
 import { domDetailsLog, domLog } from './debug.js';
 import { parseUrl } from './parse-url.js';
 const pid = `${process.pid}`;
 const log = domLog.extend(pid);
 const dLog = domDetailsLog.extend(pid);
+/**
+ * Default timeout (ms) applied to DOM evaluation operations when the caller does not
+ * specify one. Bounds how long a single `page.evaluate` / property read may hang on a
+ * page whose main thread is unresponsive.
+ */
+export const DEFAULT_DOM_EVALUATION_TIMEOUT = 30_000;
 /**
  * Retrieves a DOM property value from a Puppeteer element handle with a timeout.
  *
- * Races the actual property retrieval against a 10-second timeout.
+ * Races the actual property retrieval against a timeout via {@link raceWithTimeout},
+ * which clears the loser-side timer so it cannot keep the event loop alive.
  * If the property cannot be read or the timeout expires, the fallback value is returned.
  * @template T - The expected type of the property value.
  * @param params - Parameters containing the element, property name, and fallback.
- * @returns The property value, or the fallback if retrieval fails.
+ * @param timeout - Timeout in ms before falling back. Defaults to {@link DEFAULT_DOM_EVALUATION_TIMEOUT}.
+ * @returns The property value, or the fallback if retrieval fails or times out.
  */
-export async function getProp(params) {
+export async function getProp(params, timeout = DEFAULT_DOM_EVALUATION_TIMEOUT) {
     const { $el, propName, fallback } = params;
-    return Promise.race([
-        _getProp($el, propName, fallback),
-        new Promise((res) => setTimeout(() => res(fallback), 10 * 1000)),
-    ]);
+    const { result, timeout: timedOut } = await raceWithTimeout(() => _getProp($el, propName, fallback), timeout);
+    return timedOut ? fallback : result;
 }
 /**
  * Internal implementation of property retrieval without timeout.
@@ -47,76 +62,48 @@ async function _getProp($el, propName, fallback) {
         return fallback;
     }
 }
-/**
- * Retrieves a DOM property value from the first element matching a CSS selector.
- *
- * Combines `page.$()` with {@link getProp} for convenient single-element lookups.
- * @template T - The expected type of the property value.
- * @param params - Parameters containing the page, selector, property name, and fallback.
- * @returns The property value, or the fallback if the element is not found or retrieval fails.
- */
-export async function getPropBySelector(params) {
-    const { page, selector, propName, fallback } = params;
-    const $el = await page.$(selector);
-    if (!$el) {
-        return fallback;
-    }
-    return getProp({ $el, propName, fallback });
-}
 /**
  * Extracts all `<img>` elements from the page and returns their properties.
  *
- * For each image, collects the `src`, `currentSrc`, `alt`, bounding box dimensions,
- * natural dimensions, lazy-loading status, and the outer HTML source code.
+ * Collects every image's `src`, `currentSrc`, `alt`, layout dimensions,
+ * natural dimensions, lazy-loading status, and outer HTML in a single
+ * `page.evaluate` call, wrapped in {@link raceWithTimeout}. On timeout (an
+ * unresponsive page) an empty array is returned rather than hanging.
  * @param page - The Puppeteer page to extract images from.
  * @param viewportWidth - The current viewport width in pixels, recorded alongside each image entry.
+ * @param timeout - Timeout in ms for the evaluation. Defaults to {@link DEFAULT_DOM_EVALUATION_TIMEOUT}.
  * @returns An array of {@link ImageElement} objects describing each image on the page.
  */
-export async function getImageList(page, viewportWidth) {
+export async function getImageList(page, viewportWidth, timeout = DEFAULT_DOM_EVALUATION_TIMEOUT) {
     log('Getting images (Viewport: %dpx)', viewportWidth);
-    const $images = await page.$$('img');
-    const imageList = [];
-    for (const $image of $images) {
-        const boundingBox = await $image.boundingBox();
-        const width = boundingBox?.width || 0;
-        const height = boundingBox?.height || 0;
-        const src = await getProp({ $el: $image, propName: 'src', fallback: '' });
-        const currentSrc = await getProp({
-            $el: $image,
-            propName: 'currentSrc',
-            fallback: '',
-        });
-        const alt = await getProp({ $el: $image, propName: 'alt', fallback: '' });
-        const naturalWidth = await getProp({
-            $el: $image,
-            propName: 'naturalWidth',
-            fallback: 0,
-        });
-        const naturalHeight = await getProp({
-            $el: $image,
-            propName: 'naturalHeight',
-            fallback: 0,
-        });
-        const loading = await getProp({ $el: $image, propName: 'loading', fallback: '' });
-        const sourceCode = await getProp({
-            $el: $image,
-            propName: 'outerHTML',
-            fallback: '',
-        });
-        const isLazy = loading.toLowerCase().trim() === 'lazy';
-        imageList.push({
-            src,
-            currentSrc,
-            alt,
-            width,
-            height,
-            naturalWidth,
-            naturalHeight,
-            isLazy,
-            viewportWidth,
-            sourceCode,
+    const { result, timeout: timedOut } = await raceWithTimeout(() => page
+        .evaluate(() => {
+        /* global document */
+        return [...document.images].map((img) => {
+            const rect = img.getBoundingClientRect();
+            return {
+                src: img.src,
+                currentSrc: img.currentSrc,
+                alt: img.alt,
+                width: rect.width,
+                height: rect.height,
+                naturalWidth: img.naturalWidth,
+                naturalHeight: img.naturalHeight,
+                loading: img.loading,
+                sourceCode: img.outerHTML,
+            };
         });
+    })
+        .catch(() => null), timeout);
+    if (timedOut || result == null) {
+        log('Image extraction timed out or failed (Viewport: %dpx); returning []', viewportWidth);
+        return [];
     }
+    const imageList = result.map(({ loading, ...img }) => ({
+        ...img,
+        isLazy: loading.toLowerCase().trim() === 'lazy',
+        viewportWidth,
+    }));
     log('Got %d images (Viewport: %dpx)', imageList.length, viewportWidth);
     dLog('Images are: %O', imageList.map((i) => i.src));
     return imageList;
@@ -127,27 +114,29 @@ export async function getImageList(page, viewportWidth) {
  * For each anchor, resolves the `href` to an `ExURL` via `parseUrl`, retrieves
  * the accessible name (from the accessibility tree, falling back to `textContent`),
  * and filters out non-HTTP links.
+ *
+ * WHY this keeps per-element CDP calls (unlike {@link getMeta} / {@link getImageList}):
+ * the accessible name comes from Chrome's computed accessibility tree
+ * (`page.accessibility.snapshot`), which is a CDP-only feature unavailable to in-page
+ * DOM APIs. Each {@link getProp} read is still bounded by `timeout`.
  * @param page - The Puppeteer page to extract anchors from.
  * @param options - Optional URL parsing options (e.g., `disableQueries`).
+ * @param timeout - Timeout in ms per property read. Defaults to {@link DEFAULT_DOM_EVALUATION_TIMEOUT}.
  * @returns An array of {@link AnchorData} objects for all HTTP(S) links found on the page.
  */
-export async function getAnchorList(page, options) {
+export async function getAnchorList(page, options, timeout = DEFAULT_DOM_EVALUATION_TIMEOUT) {
     log('Getting anchors');
     const $anchors = await page.$$('a[href], area[href]');
     const anchorList = [];
     for (const $anchor of $anchors) {
-        const $href = await getProp({ $el: $anchor, propName: 'href', fallback: '' });
+        const $href = await getProp({ $el: $anchor, propName: 'href', fallback: '' }, timeout);
         const hrefVal = $href.toString();
         const href = parseUrl(hrefVal, options);
         if (!href || !href.isHTTP) {
             continue;
         }
         const axNode = await page.accessibility.snapshot({ root: $anchor });
-        const textContent = await getProp({
-            $el: $anchor,
-            propName: 'textContent',
-            fallback: '',
-        });
+        const textContent = await getProp({ $el: $anchor, propName: 'textContent', fallback: '' }, timeout);
         const accessibleName = axNode ? axNode.name || '' : textContent.trim();
         const link = {
             href,
@@ -162,7 +151,11 @@ export async function getAnchorList(page, options) {
 /**
  * Extracts comprehensive meta information from the page's `<head>`.
  *
- * Collects the following metadata:
+ * Collects all metadata in a single `page.evaluate` call (14 CDP round-trips
+ * collapsed into 1) wrapped in {@link raceWithTimeout}. On timeout (an unresponsive
+ * page) a minimal `{ title: '' }` is returned rather than hanging.
+ *
+ * Collected metadata:
  * - `title` - The document title.
  * - `lang` - The `lang` attribute of the `<html>` element.
  * - `description` - The `<meta name="description">` content.
@@ -173,99 +166,51 @@ export async function getAnchorList(page, options) {
  * - Open Graph tags: `og:type`, `og:title`, `og:site_name`, `og:description`, `og:url`, `og:image`.
  * - `twitter:card` - The Twitter Card type.
  * @param page - The Puppeteer page to extract meta information from.
+ * @param timeout - Timeout in ms for the evaluation. Defaults to {@link DEFAULT_DOM_EVALUATION_TIMEOUT}.
  * @returns An object containing all extracted meta properties.
  */
-export async function getMeta(page) {
+export async function getMeta(page, timeout = DEFAULT_DOM_EVALUATION_TIMEOUT) {
     log('Getting Meta');
-    const robotsVal = await getPropBySelector({
-        page,
-        selector: 'meta[name="robots"]',
-        propName: 'content',
-        fallback: '',
-    });
+    const { result, timeout: timedOut } = await raceWithTimeout(() => page
+        .evaluate(() => {
+        /* global document, HTMLMetaElement, HTMLLinkElement */
+        const content = (selector) => {
+            const el = document.querySelector(selector);
+            return el instanceof HTMLMetaElement ? el.content : '';
+        };
+        const linkHref = (selector) => {
+            const el = document.querySelector(selector);
+            return el instanceof HTMLLinkElement ? el.href : '';
+        };
+        return {
+            title: document.title,
+            lang: document.documentElement.lang,
+            description: content('meta[name="description"]'),
+            keywords: content('meta[name="keywords"]'),
+            robots: content('meta[name="robots"]'),
+            canonical: linkHref('link[rel="canonical"]'),
+            alternate: linkHref('link[rel="alternate"]'),
+            'og:type': content('meta[property="og:type"]'),
+            'og:title': content('meta[property="og:title"]'),
+            'og:site_name': content('meta[property="og:site_name"]'),
+            'og:description': content('meta[property="og:description"]'),
+            'og:url': content('meta[property="og:url"]'),
+            'og:image': content('meta[property="og:image"]'),
+            'twitter:card': content('meta[name="twitter:card"]'),
+        };
+    })
+        .catch(() => null), timeout);
+    if (timedOut || result == null) {
+        log('Meta extraction timed out or failed; returning fallback');
+        return { title: '' };
+    }
+    const { robots: robotsVal, ...rest } = result;
     const robots = new Set(robotsVal.split(',').map((robot) => robot.trim().toLowerCase()));
     const meta = {
-        title: await getPropBySelector({
-            page,
-            selector: 'title',
-            propName: 'textContent',
-            fallback: '',
-        }),
-        lang: await getPropBySelector({
-            page,
-            selector: 'html',
-            propName: 'lang',
-            fallback: '',
-        }),
-        description: await getPropBySelector({
-            page,
-            selector: 'meta[name="description"]',
-            propName: 'content',
-            fallback: '',
-        }),
-        keywords: await getPropBySelector({
-            page,
-            selector: 'meta[name="keywords"]',
-            propName: 'content',
-            fallback: '',
-        }),
+        ...rest,
         noindex: robots.has('noindex'),
         nofollow: robots.has('nofollow'),
         noarchive: robots.has('noarchive'),
-        canonical: await getPropBySelector({
-            page,
-            selector: 'link[rel="canonical"]',
-            propName: 'href',
-            fallback: '',
-        }),
-        alternate: await getPropBySelector({
-            page,
-            selector: 'link[rel="alternate"]',
-            propName: 'href',
-            fallback: '',
-        }),
-        'og:type': await getPropBySelector({
-            page,
-            selector: 'meta[property="og:type"]',
-            propName: 'content',
-            fallback: '',
-        }),
-        'og:title': await getPropBySelector({
-            page,
-            selector: 'meta[property="og:title"]',
-            propName: 'content',
-            fallback: '',
-        }),
-        'og:site_name': await getPropBySelector({
-            page,
-            selector: 'meta[property="og:site_name"]',
-            propName: 'content',
-            fallback: '',
-        }),
-        'og:description': await getPropBySelector({
-            page,
-            selector: 'meta[property="og:description"]',
-            propName: 'content',
-            fallback: '',
-        }),
-        'og:url': await getPropBySelector({
-            page,
-            selector: 'meta[property="og:url"]',
-            propName: 'content',
-            fallback: '',
-        }),
-        'og:image': await getPropBySelector({
-            page,
-            selector: 'meta[property="og:image"]',
-            propName: 'content',
-            fallback: '',
-        }),
-        'twitter:card': await getPropBySelector({
-            page,
-            selector: 'meta[name="twitter:card"]',
-            propName: 'content',
-            fallback: '',
-        }),
     };
     log('Got meta');
     dLog('Meta data are: %O', meta);

package/dist/scraper.js CHANGED Viewed

@@ -42,7 +42,7 @@ import { detectCompress } from '@d-zero/shared/detect-compress';
 import { retry as retryable } from '@d-zero/shared/retry';
 import { TypedAwaitEventEmitter as EventEmitter } from '@d-zero/shared/typed-await-event-emitter';
 import { resourceLog, scraperLog } from './debug.js';
-import { getAnchorList, getImageList, getMeta } from './dom-evaluation.js';
+import { DEFAULT_DOM_EVALUATION_TIMEOUT, getAnchorList, getImageList, getMeta, } from './dom-evaluation.js';
 import { isError } from './is-error.js';
 import { keywordCheck } from './keyword-check.js';
 import { findDisconnectionFailures } from './network-disconnection.js';
@@ -107,6 +107,7 @@ let Scraper = (() => {
                     const parseOpts = options?.disableQueries == null
                         ? undefined
                         : { disableQueries: options.disableQueries };
+                    const domEvaluationTimeout = options?.domEvaluationTimeout ?? DEFAULT_DOM_EVALUATION_TIMEOUT;
                     const networkLogs = {};
                     // Clear stale state from previous retries (@retryable may re-invoke this method
                     // with the same page and mutable arrays, so we must reset to avoid accumulation)
@@ -343,7 +344,7 @@ let Scraper = (() => {
                         isExternal,
                         message: '',
                     });
-                    const anchorList = await getAnchorList(page, parseOpts);
+                    const anchorList = await getAnchorList(page, parseOpts, domEvaluationTimeout);
                     void this.emit('changePhase', {
                         pid: process.pid,
                         name: 'getMeta',
@@ -351,7 +352,7 @@ let Scraper = (() => {
                         isExternal,
                         message: '',
                     });
-                    const meta = await getMeta(page);
+                    const meta = await getMeta(page, domEvaluationTimeout);
                     const imageList = captureImages
                         ? await (async () => {
                             void this.emit('changePhase', {
@@ -361,7 +362,7 @@ let Scraper = (() => {
                                 isExternal,
                                 message: '',
                             });
-                            return this.#fetchImages(page, url.withoutHashAndAuth, isExternal, imageLoadTimeout);
+                            return this.#fetchImages(page, url.withoutHashAndAuth, isExternal, imageLoadTimeout, domEvaluationTimeout);
                         })()
                         : [];
                     return {
@@ -381,7 +382,7 @@ let Scraper = (() => {
                         isSkipped: false,
                     };
                 }, "#fetchData") }, _private_fetchData_decorators, { kind: "method", name: "#fetchData", static: false, private: true, access: { has: obj => #fetchData in obj, get: obj => obj.#fetchData }, metadata: _metadata }, null, _instanceExtraInitializers);
-            __esDecorate(this, _private_fetchImages_descriptor = { value: __setFunctionName(async function (page, url, isExternal, imageLoadTimeout) {
+            __esDecorate(this, _private_fetchImages_descriptor = { value: __setFunctionName(async function (page, url, isExternal, imageLoadTimeout, domEvaluationTimeout) {
                     const listener = this.#createPageScanListener(isExternal);
                     const devices = [
                         { key: 'desktop-compact', preset: devicePresets['desktop-compact'] },
@@ -423,7 +424,7 @@ let Scraper = (() => {
                                 isExternal,
                                 message: `📸 ${key}: Extracting images%dots%`,
                             });
-                            const images = await getImageList(page, preset.width);
+                            const images = await getImageList(page, preset.width, domEvaluationTimeout);
                             imageList.push(...images);
                         }
                         catch (error) {
@@ -705,6 +706,7 @@ let Scraper = (() => {
          * @param url - The page URL string (without hash and auth)
          * @param isExternal - Whether the page is external
          * @param imageLoadTimeout - Timeout (ms) for waiting images to complete loading
+         * @param domEvaluationTimeout - Timeout (ms) for the in-page image extraction `page.evaluate`
          * @returns Array of image elements from all device presets (may be partial if some viewports failed)
          */
         get #fetchImages() { return _private_fetchImages_descriptor.value; }

package/dist/types.d.ts CHANGED Viewed

@@ -345,4 +345,10 @@ export type ScraperOptions = {
     headCheckResult?: PageData;
     /** Timeout (ms) for page.goto(). Default: 60_000 (60s). */
     navigationTimeout?: number;
+    /**
+     * Timeout (ms) for DOM evaluation operations (meta/image/anchor extraction).
+     * Applied per operation (each meta/image evaluation, each anchor property read), so it
+     * bounds a single hang on an unresponsive main thread, not the total. Default: 30_000 (30s).
+     */
+    domEvaluationTimeout?: number;
 };

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
 	"name": "@d-zero/beholder",
-	"version": "2.1.5",
+	"version": "2.1.6",
 	"description": "Page-level scraper for web crawling and auditing",
 	"author": "D-ZERO",
 	"license": "MIT",
@@ -20,7 +20,7 @@
 		"clean": "tsc --build --clean"
 	},
 	"dependencies": {
-		"@d-zero/puppeteer-page-scan": "4.5.0",
+		"@d-zero/puppeteer-page-scan": "4.5.1",
 		"@d-zero/shared": "0.22.0",
 		"debug": "4.4.3",
 		"puppeteer": "24.37.5"
@@ -33,5 +33,5 @@
 		"url": "https://github.com/d-zero-dev/tools.git",
 		"directory": "packages/@d-zero/beholder"
 	},
-	"gitHead": "2d24e08c0cb516b7ea9d07a4301eb991193cca11"
+	"gitHead": "25b4043dcd70cf3490ddcefd76a88b22c60f7712"
 }