@d-zero/beholder 2.1.5 → 2.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,28 +3,43 @@
3
3
  *
4
4
  * These functions are called by {@link ./scraper.ts | Scraper.#fetchData} to extract
5
5
  * anchors, images, and meta information after page navigation completes.
6
+ *
7
+ * WHY timeouts everywhere: A page whose main thread is blocked (heavy JS, autoplay
8
+ * video players, infinite loops) makes every CDP round-trip hang. `getMeta` and
9
+ * `getImageList` therefore collect all data in a single `page.evaluate` and wrap it
10
+ * in {@link raceWithTimeout} so a blocked thread is abandoned after a bounded budget
11
+ * instead of accumulating per-property timeouts up to the caller's global timeout.
12
+ * Note that `page.evaluate` itself runs on the page's main thread and has no built-in
13
+ * timeout, so the surrounding race is what actually bounds the hang.
6
14
  * @see {@link ./types.ts} for the data types returned by these functions
7
15
  */
16
+ import { raceWithTimeout } from '@d-zero/shared/race-with-timeout';
8
17
  import { domDetailsLog, domLog } from './debug.js';
9
18
  import { parseUrl } from './parse-url.js';
10
19
  const pid = `${process.pid}`;
11
20
  const log = domLog.extend(pid);
12
21
  const dLog = domDetailsLog.extend(pid);
22
+ /**
23
+ * Default timeout (ms) applied to DOM evaluation operations when the caller does not
24
+ * specify one. Bounds how long a single `page.evaluate` / property read may hang on a
25
+ * page whose main thread is unresponsive.
26
+ */
27
+ export const DEFAULT_DOM_EVALUATION_TIMEOUT = 30_000;
13
28
  /**
14
29
  * Retrieves a DOM property value from a Puppeteer element handle with a timeout.
15
30
  *
16
- * Races the actual property retrieval against a 10-second timeout.
31
+ * Races the actual property retrieval against a timeout via {@link raceWithTimeout},
32
+ * which clears the loser-side timer so it cannot keep the event loop alive.
17
33
  * If the property cannot be read or the timeout expires, the fallback value is returned.
18
34
  * @template T - The expected type of the property value.
19
35
  * @param params - Parameters containing the element, property name, and fallback.
20
- * @returns The property value, or the fallback if retrieval fails.
36
+ * @param timeout - Timeout in ms before falling back. Defaults to {@link DEFAULT_DOM_EVALUATION_TIMEOUT}.
37
+ * @returns The property value, or the fallback if retrieval fails or times out.
21
38
  */
22
- export async function getProp(params) {
39
+ export async function getProp(params, timeout = DEFAULT_DOM_EVALUATION_TIMEOUT) {
23
40
  const { $el, propName, fallback } = params;
24
- return Promise.race([
25
- _getProp($el, propName, fallback),
26
- new Promise((res) => setTimeout(() => res(fallback), 10 * 1000)),
27
- ]);
41
+ const { result, timeout: timedOut } = await raceWithTimeout(() => _getProp($el, propName, fallback), timeout);
42
+ return timedOut ? fallback : result;
28
43
  }
29
44
  /**
30
45
  * Internal implementation of property retrieval without timeout.
@@ -47,76 +62,48 @@ async function _getProp($el, propName, fallback) {
47
62
  return fallback;
48
63
  }
49
64
  }
50
- /**
51
- * Retrieves a DOM property value from the first element matching a CSS selector.
52
- *
53
- * Combines `page.$()` with {@link getProp} for convenient single-element lookups.
54
- * @template T - The expected type of the property value.
55
- * @param params - Parameters containing the page, selector, property name, and fallback.
56
- * @returns The property value, or the fallback if the element is not found or retrieval fails.
57
- */
58
- export async function getPropBySelector(params) {
59
- const { page, selector, propName, fallback } = params;
60
- const $el = await page.$(selector);
61
- if (!$el) {
62
- return fallback;
63
- }
64
- return getProp({ $el, propName, fallback });
65
- }
66
65
  /**
67
66
  * Extracts all `<img>` elements from the page and returns their properties.
68
67
  *
69
- * For each image, collects the `src`, `currentSrc`, `alt`, bounding box dimensions,
70
- * natural dimensions, lazy-loading status, and the outer HTML source code.
68
+ * Collects every image's `src`, `currentSrc`, `alt`, layout dimensions,
69
+ * natural dimensions, lazy-loading status, and outer HTML in a single
70
+ * `page.evaluate` call, wrapped in {@link raceWithTimeout}. On timeout (an
71
+ * unresponsive page) an empty array is returned rather than hanging.
71
72
  * @param page - The Puppeteer page to extract images from.
72
73
  * @param viewportWidth - The current viewport width in pixels, recorded alongside each image entry.
74
+ * @param timeout - Timeout in ms for the evaluation. Defaults to {@link DEFAULT_DOM_EVALUATION_TIMEOUT}.
73
75
  * @returns An array of {@link ImageElement} objects describing each image on the page.
74
76
  */
75
- export async function getImageList(page, viewportWidth) {
77
+ export async function getImageList(page, viewportWidth, timeout = DEFAULT_DOM_EVALUATION_TIMEOUT) {
76
78
  log('Getting images (Viewport: %dpx)', viewportWidth);
77
- const $images = await page.$$('img');
78
- const imageList = [];
79
- for (const $image of $images) {
80
- const boundingBox = await $image.boundingBox();
81
- const width = boundingBox?.width || 0;
82
- const height = boundingBox?.height || 0;
83
- const src = await getProp({ $el: $image, propName: 'src', fallback: '' });
84
- const currentSrc = await getProp({
85
- $el: $image,
86
- propName: 'currentSrc',
87
- fallback: '',
88
- });
89
- const alt = await getProp({ $el: $image, propName: 'alt', fallback: '' });
90
- const naturalWidth = await getProp({
91
- $el: $image,
92
- propName: 'naturalWidth',
93
- fallback: 0,
94
- });
95
- const naturalHeight = await getProp({
96
- $el: $image,
97
- propName: 'naturalHeight',
98
- fallback: 0,
99
- });
100
- const loading = await getProp({ $el: $image, propName: 'loading', fallback: '' });
101
- const sourceCode = await getProp({
102
- $el: $image,
103
- propName: 'outerHTML',
104
- fallback: '',
105
- });
106
- const isLazy = loading.toLowerCase().trim() === 'lazy';
107
- imageList.push({
108
- src,
109
- currentSrc,
110
- alt,
111
- width,
112
- height,
113
- naturalWidth,
114
- naturalHeight,
115
- isLazy,
116
- viewportWidth,
117
- sourceCode,
79
+ const { result, timeout: timedOut } = await raceWithTimeout(() => page
80
+ .evaluate(() => {
81
+ /* global document */
82
+ return [...document.images].map((img) => {
83
+ const rect = img.getBoundingClientRect();
84
+ return {
85
+ src: img.src,
86
+ currentSrc: img.currentSrc,
87
+ alt: img.alt,
88
+ width: rect.width,
89
+ height: rect.height,
90
+ naturalWidth: img.naturalWidth,
91
+ naturalHeight: img.naturalHeight,
92
+ loading: img.loading,
93
+ sourceCode: img.outerHTML,
94
+ };
118
95
  });
96
+ })
97
+ .catch(() => null), timeout);
98
+ if (timedOut || result == null) {
99
+ log('Image extraction timed out or failed (Viewport: %dpx); returning []', viewportWidth);
100
+ return [];
119
101
  }
102
+ const imageList = result.map(({ loading, ...img }) => ({
103
+ ...img,
104
+ isLazy: loading.toLowerCase().trim() === 'lazy',
105
+ viewportWidth,
106
+ }));
120
107
  log('Got %d images (Viewport: %dpx)', imageList.length, viewportWidth);
121
108
  dLog('Images are: %O', imageList.map((i) => i.src));
122
109
  return imageList;
@@ -127,27 +114,29 @@ export async function getImageList(page, viewportWidth) {
127
114
  * For each anchor, resolves the `href` to an `ExURL` via `parseUrl`, retrieves
128
115
  * the accessible name (from the accessibility tree, falling back to `textContent`),
129
116
  * and filters out non-HTTP links.
117
+ *
118
+ * WHY this keeps per-element CDP calls (unlike {@link getMeta} / {@link getImageList}):
119
+ * the accessible name comes from Chrome's computed accessibility tree
120
+ * (`page.accessibility.snapshot`), which is a CDP-only feature unavailable to in-page
121
+ * DOM APIs. Each {@link getProp} read is still bounded by `timeout`.
130
122
  * @param page - The Puppeteer page to extract anchors from.
131
123
  * @param options - Optional URL parsing options (e.g., `disableQueries`).
124
+ * @param timeout - Timeout in ms per property read. Defaults to {@link DEFAULT_DOM_EVALUATION_TIMEOUT}.
132
125
  * @returns An array of {@link AnchorData} objects for all HTTP(S) links found on the page.
133
126
  */
134
- export async function getAnchorList(page, options) {
127
+ export async function getAnchorList(page, options, timeout = DEFAULT_DOM_EVALUATION_TIMEOUT) {
135
128
  log('Getting anchors');
136
129
  const $anchors = await page.$$('a[href], area[href]');
137
130
  const anchorList = [];
138
131
  for (const $anchor of $anchors) {
139
- const $href = await getProp({ $el: $anchor, propName: 'href', fallback: '' });
132
+ const $href = await getProp({ $el: $anchor, propName: 'href', fallback: '' }, timeout);
140
133
  const hrefVal = $href.toString();
141
134
  const href = parseUrl(hrefVal, options);
142
135
  if (!href || !href.isHTTP) {
143
136
  continue;
144
137
  }
145
138
  const axNode = await page.accessibility.snapshot({ root: $anchor });
146
- const textContent = await getProp({
147
- $el: $anchor,
148
- propName: 'textContent',
149
- fallback: '',
150
- });
139
+ const textContent = await getProp({ $el: $anchor, propName: 'textContent', fallback: '' }, timeout);
151
140
  const accessibleName = axNode ? axNode.name || '' : textContent.trim();
152
141
  const link = {
153
142
  href,
@@ -162,7 +151,11 @@ export async function getAnchorList(page, options) {
162
151
  /**
163
152
  * Extracts comprehensive meta information from the page's `<head>`.
164
153
  *
165
- * Collects the following metadata:
154
+ * Collects all metadata in a single `page.evaluate` call (14 CDP round-trips
155
+ * collapsed into 1) wrapped in {@link raceWithTimeout}. On timeout (an unresponsive
156
+ * page) a minimal `{ title: '' }` is returned rather than hanging.
157
+ *
158
+ * Collected metadata:
166
159
  * - `title` - The document title.
167
160
  * - `lang` - The `lang` attribute of the `<html>` element.
168
161
  * - `description` - The `<meta name="description">` content.
@@ -173,99 +166,51 @@ export async function getAnchorList(page, options) {
173
166
  * - Open Graph tags: `og:type`, `og:title`, `og:site_name`, `og:description`, `og:url`, `og:image`.
174
167
  * - `twitter:card` - The Twitter Card type.
175
168
  * @param page - The Puppeteer page to extract meta information from.
169
+ * @param timeout - Timeout in ms for the evaluation. Defaults to {@link DEFAULT_DOM_EVALUATION_TIMEOUT}.
176
170
  * @returns An object containing all extracted meta properties.
177
171
  */
178
- export async function getMeta(page) {
172
+ export async function getMeta(page, timeout = DEFAULT_DOM_EVALUATION_TIMEOUT) {
179
173
  log('Getting Meta');
180
- const robotsVal = await getPropBySelector({
181
- page,
182
- selector: 'meta[name="robots"]',
183
- propName: 'content',
184
- fallback: '',
185
- });
174
+ const { result, timeout: timedOut } = await raceWithTimeout(() => page
175
+ .evaluate(() => {
176
+ /* global document, HTMLMetaElement, HTMLLinkElement */
177
+ const content = (selector) => {
178
+ const el = document.querySelector(selector);
179
+ return el instanceof HTMLMetaElement ? el.content : '';
180
+ };
181
+ const linkHref = (selector) => {
182
+ const el = document.querySelector(selector);
183
+ return el instanceof HTMLLinkElement ? el.href : '';
184
+ };
185
+ return {
186
+ title: document.title,
187
+ lang: document.documentElement.lang,
188
+ description: content('meta[name="description"]'),
189
+ keywords: content('meta[name="keywords"]'),
190
+ robots: content('meta[name="robots"]'),
191
+ canonical: linkHref('link[rel="canonical"]'),
192
+ alternate: linkHref('link[rel="alternate"]'),
193
+ 'og:type': content('meta[property="og:type"]'),
194
+ 'og:title': content('meta[property="og:title"]'),
195
+ 'og:site_name': content('meta[property="og:site_name"]'),
196
+ 'og:description': content('meta[property="og:description"]'),
197
+ 'og:url': content('meta[property="og:url"]'),
198
+ 'og:image': content('meta[property="og:image"]'),
199
+ 'twitter:card': content('meta[name="twitter:card"]'),
200
+ };
201
+ })
202
+ .catch(() => null), timeout);
203
+ if (timedOut || result == null) {
204
+ log('Meta extraction timed out or failed; returning fallback');
205
+ return { title: '' };
206
+ }
207
+ const { robots: robotsVal, ...rest } = result;
186
208
  const robots = new Set(robotsVal.split(',').map((robot) => robot.trim().toLowerCase()));
187
209
  const meta = {
188
- title: await getPropBySelector({
189
- page,
190
- selector: 'title',
191
- propName: 'textContent',
192
- fallback: '',
193
- }),
194
- lang: await getPropBySelector({
195
- page,
196
- selector: 'html',
197
- propName: 'lang',
198
- fallback: '',
199
- }),
200
- description: await getPropBySelector({
201
- page,
202
- selector: 'meta[name="description"]',
203
- propName: 'content',
204
- fallback: '',
205
- }),
206
- keywords: await getPropBySelector({
207
- page,
208
- selector: 'meta[name="keywords"]',
209
- propName: 'content',
210
- fallback: '',
211
- }),
210
+ ...rest,
212
211
  noindex: robots.has('noindex'),
213
212
  nofollow: robots.has('nofollow'),
214
213
  noarchive: robots.has('noarchive'),
215
- canonical: await getPropBySelector({
216
- page,
217
- selector: 'link[rel="canonical"]',
218
- propName: 'href',
219
- fallback: '',
220
- }),
221
- alternate: await getPropBySelector({
222
- page,
223
- selector: 'link[rel="alternate"]',
224
- propName: 'href',
225
- fallback: '',
226
- }),
227
- 'og:type': await getPropBySelector({
228
- page,
229
- selector: 'meta[property="og:type"]',
230
- propName: 'content',
231
- fallback: '',
232
- }),
233
- 'og:title': await getPropBySelector({
234
- page,
235
- selector: 'meta[property="og:title"]',
236
- propName: 'content',
237
- fallback: '',
238
- }),
239
- 'og:site_name': await getPropBySelector({
240
- page,
241
- selector: 'meta[property="og:site_name"]',
242
- propName: 'content',
243
- fallback: '',
244
- }),
245
- 'og:description': await getPropBySelector({
246
- page,
247
- selector: 'meta[property="og:description"]',
248
- propName: 'content',
249
- fallback: '',
250
- }),
251
- 'og:url': await getPropBySelector({
252
- page,
253
- selector: 'meta[property="og:url"]',
254
- propName: 'content',
255
- fallback: '',
256
- }),
257
- 'og:image': await getPropBySelector({
258
- page,
259
- selector: 'meta[property="og:image"]',
260
- propName: 'content',
261
- fallback: '',
262
- }),
263
- 'twitter:card': await getPropBySelector({
264
- page,
265
- selector: 'meta[name="twitter:card"]',
266
- propName: 'content',
267
- fallback: '',
268
- }),
269
214
  };
270
215
  log('Got meta');
271
216
  dLog('Meta data are: %O', meta);
package/dist/scraper.js CHANGED
@@ -42,7 +42,7 @@ import { detectCompress } from '@d-zero/shared/detect-compress';
42
42
  import { retry as retryable } from '@d-zero/shared/retry';
43
43
  import { TypedAwaitEventEmitter as EventEmitter } from '@d-zero/shared/typed-await-event-emitter';
44
44
  import { resourceLog, scraperLog } from './debug.js';
45
- import { getAnchorList, getImageList, getMeta } from './dom-evaluation.js';
45
+ import { DEFAULT_DOM_EVALUATION_TIMEOUT, getAnchorList, getImageList, getMeta, } from './dom-evaluation.js';
46
46
  import { isError } from './is-error.js';
47
47
  import { keywordCheck } from './keyword-check.js';
48
48
  import { findDisconnectionFailures } from './network-disconnection.js';
@@ -107,6 +107,7 @@ let Scraper = (() => {
107
107
  const parseOpts = options?.disableQueries == null
108
108
  ? undefined
109
109
  : { disableQueries: options.disableQueries };
110
+ const domEvaluationTimeout = options?.domEvaluationTimeout ?? DEFAULT_DOM_EVALUATION_TIMEOUT;
110
111
  const networkLogs = {};
111
112
  // Clear stale state from previous retries (@retryable may re-invoke this method
112
113
  // with the same page and mutable arrays, so we must reset to avoid accumulation)
@@ -343,7 +344,7 @@ let Scraper = (() => {
343
344
  isExternal,
344
345
  message: '',
345
346
  });
346
- const anchorList = await getAnchorList(page, parseOpts);
347
+ const anchorList = await getAnchorList(page, parseOpts, domEvaluationTimeout);
347
348
  void this.emit('changePhase', {
348
349
  pid: process.pid,
349
350
  name: 'getMeta',
@@ -351,7 +352,7 @@ let Scraper = (() => {
351
352
  isExternal,
352
353
  message: '',
353
354
  });
354
- const meta = await getMeta(page);
355
+ const meta = await getMeta(page, domEvaluationTimeout);
355
356
  const imageList = captureImages
356
357
  ? await (async () => {
357
358
  void this.emit('changePhase', {
@@ -361,7 +362,7 @@ let Scraper = (() => {
361
362
  isExternal,
362
363
  message: '',
363
364
  });
364
- return this.#fetchImages(page, url.withoutHashAndAuth, isExternal, imageLoadTimeout);
365
+ return this.#fetchImages(page, url.withoutHashAndAuth, isExternal, imageLoadTimeout, domEvaluationTimeout);
365
366
  })()
366
367
  : [];
367
368
  return {
@@ -381,7 +382,7 @@ let Scraper = (() => {
381
382
  isSkipped: false,
382
383
  };
383
384
  }, "#fetchData") }, _private_fetchData_decorators, { kind: "method", name: "#fetchData", static: false, private: true, access: { has: obj => #fetchData in obj, get: obj => obj.#fetchData }, metadata: _metadata }, null, _instanceExtraInitializers);
384
- __esDecorate(this, _private_fetchImages_descriptor = { value: __setFunctionName(async function (page, url, isExternal, imageLoadTimeout) {
385
+ __esDecorate(this, _private_fetchImages_descriptor = { value: __setFunctionName(async function (page, url, isExternal, imageLoadTimeout, domEvaluationTimeout) {
385
386
  const listener = this.#createPageScanListener(isExternal);
386
387
  const devices = [
387
388
  { key: 'desktop-compact', preset: devicePresets['desktop-compact'] },
@@ -423,7 +424,7 @@ let Scraper = (() => {
423
424
  isExternal,
424
425
  message: `📸 ${key}: Extracting images%dots%`,
425
426
  });
426
- const images = await getImageList(page, preset.width);
427
+ const images = await getImageList(page, preset.width, domEvaluationTimeout);
427
428
  imageList.push(...images);
428
429
  }
429
430
  catch (error) {
@@ -705,6 +706,7 @@ let Scraper = (() => {
705
706
  * @param url - The page URL string (without hash and auth)
706
707
  * @param isExternal - Whether the page is external
707
708
  * @param imageLoadTimeout - Timeout (ms) for waiting images to complete loading
709
+ * @param domEvaluationTimeout - Timeout (ms) for the in-page image extraction `page.evaluate`
708
710
  * @returns Array of image elements from all device presets (may be partial if some viewports failed)
709
711
  */
710
712
  get #fetchImages() { return _private_fetchImages_descriptor.value; }
package/dist/types.d.ts CHANGED
@@ -345,4 +345,10 @@ export type ScraperOptions = {
345
345
  headCheckResult?: PageData;
346
346
  /** Timeout (ms) for page.goto(). Default: 60_000 (60s). */
347
347
  navigationTimeout?: number;
348
+ /**
349
+ * Timeout (ms) for DOM evaluation operations (meta/image/anchor extraction).
350
+ * Bounds how long extraction may hang on a page with an unresponsive main thread.
351
+ * Default: 30_000 (30s).
352
+ */
353
+ domEvaluationTimeout?: number;
348
354
  };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@d-zero/beholder",
3
- "version": "2.1.5",
3
+ "version": "2.1.6",
4
4
  "description": "Page-level scraper for web crawling and auditing",
5
5
  "author": "D-ZERO",
6
6
  "license": "MIT",
@@ -20,7 +20,7 @@
20
20
  "clean": "tsc --build --clean"
21
21
  },
22
22
  "dependencies": {
23
- "@d-zero/puppeteer-page-scan": "4.5.0",
23
+ "@d-zero/puppeteer-page-scan": "4.5.1",
24
24
  "@d-zero/shared": "0.22.0",
25
25
  "debug": "4.4.3",
26
26
  "puppeteer": "24.37.5"
@@ -33,5 +33,5 @@
33
33
  "url": "https://github.com/d-zero-dev/tools.git",
34
34
  "directory": "packages/@d-zero/beholder"
35
35
  },
36
- "gitHead": "2d24e08c0cb516b7ea9d07a4301eb991193cca11"
36
+ "gitHead": "25b4043dcd70cf3490ddcefd76a88b22c60f7712"
37
37
  }