@d-zero/beholder 2.1.5 → 2.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +6 -0
- package/README.md +9 -276
- package/dist/dom-evaluation.d.ts +40 -50
- package/dist/dom-evaluation.js +105 -160
- package/dist/scraper.js +8 -6
- package/dist/types.d.ts +6 -0
- package/package.json +3 -3
- package/src/dom-evaluation.spec.ts +293 -0
- package/src/dom-evaluation.ts +148 -190
- package/src/scraper.ts +14 -4
- package/src/types.ts +6 -0
- package/tsconfig.tsbuildinfo +1 -1
package/dist/dom-evaluation.js
CHANGED
|
@@ -3,28 +3,43 @@
|
|
|
3
3
|
*
|
|
4
4
|
* These functions are called by {@link ./scraper.ts | Scraper.#fetchData} to extract
|
|
5
5
|
* anchors, images, and meta information after page navigation completes.
|
|
6
|
+
*
|
|
7
|
+
* WHY timeouts everywhere: A page whose main thread is blocked (heavy JS, autoplay
|
|
8
|
+
* video players, infinite loops) makes every CDP round-trip hang. `getMeta` and
|
|
9
|
+
* `getImageList` therefore collect all data in a single `page.evaluate` and wrap it
|
|
10
|
+
* in {@link raceWithTimeout} so a blocked thread is abandoned after a bounded budget
|
|
11
|
+
* instead of accumulating per-property timeouts up to the caller's global timeout.
|
|
12
|
+
* Note that `page.evaluate` itself runs on the page's main thread and has no built-in
|
|
13
|
+
* timeout, so the surrounding race is what actually bounds the hang.
|
|
6
14
|
* @see {@link ./types.ts} for the data types returned by these functions
|
|
7
15
|
*/
|
|
16
|
+
import { raceWithTimeout } from '@d-zero/shared/race-with-timeout';
|
|
8
17
|
import { domDetailsLog, domLog } from './debug.js';
|
|
9
18
|
import { parseUrl } from './parse-url.js';
|
|
10
19
|
const pid = `${process.pid}`;
|
|
11
20
|
const log = domLog.extend(pid);
|
|
12
21
|
const dLog = domDetailsLog.extend(pid);
|
|
22
|
+
/**
|
|
23
|
+
* Default timeout (ms) applied to DOM evaluation operations when the caller does not
|
|
24
|
+
* specify one. Bounds how long a single `page.evaluate` / property read may hang on a
|
|
25
|
+
* page whose main thread is unresponsive.
|
|
26
|
+
*/
|
|
27
|
+
export const DEFAULT_DOM_EVALUATION_TIMEOUT = 30_000;
|
|
13
28
|
/**
|
|
14
29
|
* Retrieves a DOM property value from a Puppeteer element handle with a timeout.
|
|
15
30
|
*
|
|
16
|
-
* Races the actual property retrieval against a 10-second timeout.
|
|
31
|
+
* Races the actual property retrieval against a timeout via {@link raceWithTimeout},
|
|
32
|
+
* which clears the loser-side timer so it cannot keep the event loop alive.
|
|
17
33
|
* If the property cannot be read or the timeout expires, the fallback value is returned.
|
|
18
34
|
* @template T - The expected type of the property value.
|
|
19
35
|
* @param params - Parameters containing the element, property name, and fallback.
|
|
20
|
-
* @returns The property value, or the fallback if retrieval fails or times out.
|
|
36
|
+
* @param timeout - Timeout in ms before falling back. Defaults to {@link DEFAULT_DOM_EVALUATION_TIMEOUT}.
|
|
37
|
+
* @returns The property value, or the fallback if retrieval fails or times out.
|
|
21
38
|
*/
|
|
22
|
-
export async function getProp(params) {
|
|
39
|
+
export async function getProp(params, timeout = DEFAULT_DOM_EVALUATION_TIMEOUT) {
|
|
23
40
|
const { $el, propName, fallback } = params;
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
new Promise((res) => setTimeout(() => res(fallback), 10 * 1000)),
|
|
27
|
-
]);
|
|
41
|
+
const { result, timeout: timedOut } = await raceWithTimeout(() => _getProp($el, propName, fallback), timeout);
|
|
42
|
+
return timedOut ? fallback : result;
|
|
28
43
|
}
|
|
29
44
|
/**
|
|
30
45
|
* Internal implementation of property retrieval without timeout.
|
|
@@ -47,76 +62,48 @@ async function _getProp($el, propName, fallback) {
|
|
|
47
62
|
return fallback;
|
|
48
63
|
}
|
|
49
64
|
}
|
|
50
|
-
/**
|
|
51
|
-
* Retrieves a DOM property value from the first element matching a CSS selector.
|
|
52
|
-
*
|
|
53
|
-
* Combines `page.$()` with {@link getProp} for convenient single-element lookups.
|
|
54
|
-
* @template T - The expected type of the property value.
|
|
55
|
-
* @param params - Parameters containing the page, selector, property name, and fallback.
|
|
56
|
-
* @returns The property value, or the fallback if the element is not found or retrieval fails.
|
|
57
|
-
*/
|
|
58
|
-
export async function getPropBySelector(params) {
|
|
59
|
-
const { page, selector, propName, fallback } = params;
|
|
60
|
-
const $el = await page.$(selector);
|
|
61
|
-
if (!$el) {
|
|
62
|
-
return fallback;
|
|
63
|
-
}
|
|
64
|
-
return getProp({ $el, propName, fallback });
|
|
65
|
-
}
|
|
66
65
|
/**
|
|
67
66
|
* Extracts all `<img>` elements from the page and returns their properties.
|
|
68
67
|
*
|
|
69
|
-
*
|
|
70
|
-
* natural dimensions, lazy-loading status, and
|
|
68
|
+
* Collects every image's `src`, `currentSrc`, `alt`, layout dimensions,
|
|
69
|
+
* natural dimensions, lazy-loading status, and outer HTML in a single
|
|
70
|
+
* `page.evaluate` call, wrapped in {@link raceWithTimeout}. On timeout (an
|
|
71
|
+
* unresponsive page) an empty array is returned rather than hanging.
|
|
71
72
|
* @param page - The Puppeteer page to extract images from.
|
|
72
73
|
* @param viewportWidth - The current viewport width in pixels, recorded alongside each image entry.
|
|
74
|
+
* @param timeout - Timeout in ms for the evaluation. Defaults to {@link DEFAULT_DOM_EVALUATION_TIMEOUT}.
|
|
73
75
|
* @returns An array of {@link ImageElement} objects describing each image on the page.
|
|
74
76
|
*/
|
|
75
|
-
export async function getImageList(page, viewportWidth) {
|
|
77
|
+
export async function getImageList(page, viewportWidth, timeout = DEFAULT_DOM_EVALUATION_TIMEOUT) {
|
|
76
78
|
log('Getting images (Viewport: %dpx)', viewportWidth);
|
|
77
|
-
const
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
fallback: 0,
|
|
94
|
-
});
|
|
95
|
-
const naturalHeight = await getProp({
|
|
96
|
-
$el: $image,
|
|
97
|
-
propName: 'naturalHeight',
|
|
98
|
-
fallback: 0,
|
|
99
|
-
});
|
|
100
|
-
const loading = await getProp({ $el: $image, propName: 'loading', fallback: '' });
|
|
101
|
-
const sourceCode = await getProp({
|
|
102
|
-
$el: $image,
|
|
103
|
-
propName: 'outerHTML',
|
|
104
|
-
fallback: '',
|
|
105
|
-
});
|
|
106
|
-
const isLazy = loading.toLowerCase().trim() === 'lazy';
|
|
107
|
-
imageList.push({
|
|
108
|
-
src,
|
|
109
|
-
currentSrc,
|
|
110
|
-
alt,
|
|
111
|
-
width,
|
|
112
|
-
height,
|
|
113
|
-
naturalWidth,
|
|
114
|
-
naturalHeight,
|
|
115
|
-
isLazy,
|
|
116
|
-
viewportWidth,
|
|
117
|
-
sourceCode,
|
|
79
|
+
const { result, timeout: timedOut } = await raceWithTimeout(() => page
|
|
80
|
+
.evaluate(() => {
|
|
81
|
+
/* global document */
|
|
82
|
+
return [...document.images].map((img) => {
|
|
83
|
+
const rect = img.getBoundingClientRect();
|
|
84
|
+
return {
|
|
85
|
+
src: img.src,
|
|
86
|
+
currentSrc: img.currentSrc,
|
|
87
|
+
alt: img.alt,
|
|
88
|
+
width: rect.width,
|
|
89
|
+
height: rect.height,
|
|
90
|
+
naturalWidth: img.naturalWidth,
|
|
91
|
+
naturalHeight: img.naturalHeight,
|
|
92
|
+
loading: img.loading,
|
|
93
|
+
sourceCode: img.outerHTML,
|
|
94
|
+
};
|
|
118
95
|
});
|
|
96
|
+
})
|
|
97
|
+
.catch(() => null), timeout);
|
|
98
|
+
if (timedOut || result == null) {
|
|
99
|
+
log('Image extraction timed out or failed (Viewport: %dpx); returning []', viewportWidth);
|
|
100
|
+
return [];
|
|
119
101
|
}
|
|
102
|
+
const imageList = result.map(({ loading, ...img }) => ({
|
|
103
|
+
...img,
|
|
104
|
+
isLazy: loading.toLowerCase().trim() === 'lazy',
|
|
105
|
+
viewportWidth,
|
|
106
|
+
}));
|
|
120
107
|
log('Got %d images (Viewport: %dpx)', imageList.length, viewportWidth);
|
|
121
108
|
dLog('Images are: %O', imageList.map((i) => i.src));
|
|
122
109
|
return imageList;
|
|
@@ -127,27 +114,29 @@ export async function getImageList(page, viewportWidth) {
|
|
|
127
114
|
* For each anchor, resolves the `href` to an `ExURL` via `parseUrl`, retrieves
|
|
128
115
|
* the accessible name (from the accessibility tree, falling back to `textContent`),
|
|
129
116
|
* and filters out non-HTTP links.
|
|
117
|
+
*
|
|
118
|
+
* WHY this keeps per-element CDP calls (unlike {@link getMeta} / {@link getImageList}):
|
|
119
|
+
* the accessible name comes from Chrome's computed accessibility tree
|
|
120
|
+
* (`page.accessibility.snapshot`), which is a CDP-only feature unavailable to in-page
|
|
121
|
+
* DOM APIs. Each {@link getProp} read is still bounded by `timeout`.
|
|
130
122
|
* @param page - The Puppeteer page to extract anchors from.
|
|
131
123
|
* @param options - Optional URL parsing options (e.g., `disableQueries`).
|
|
124
|
+
* @param timeout - Timeout in ms per property read. Defaults to {@link DEFAULT_DOM_EVALUATION_TIMEOUT}.
|
|
132
125
|
* @returns An array of {@link AnchorData} objects for all HTTP(S) links found on the page.
|
|
133
126
|
*/
|
|
134
|
-
export async function getAnchorList(page, options) {
|
|
127
|
+
export async function getAnchorList(page, options, timeout = DEFAULT_DOM_EVALUATION_TIMEOUT) {
|
|
135
128
|
log('Getting anchors');
|
|
136
129
|
const $anchors = await page.$$('a[href], area[href]');
|
|
137
130
|
const anchorList = [];
|
|
138
131
|
for (const $anchor of $anchors) {
|
|
139
|
-
const $href = await getProp({ $el: $anchor, propName: 'href', fallback: '' });
|
|
132
|
+
const $href = await getProp({ $el: $anchor, propName: 'href', fallback: '' }, timeout);
|
|
140
133
|
const hrefVal = $href.toString();
|
|
141
134
|
const href = parseUrl(hrefVal, options);
|
|
142
135
|
if (!href || !href.isHTTP) {
|
|
143
136
|
continue;
|
|
144
137
|
}
|
|
145
138
|
const axNode = await page.accessibility.snapshot({ root: $anchor });
|
|
146
|
-
const textContent = await getProp({
|
|
147
|
-
$el: $anchor,
|
|
148
|
-
propName: 'textContent',
|
|
149
|
-
fallback: '',
|
|
150
|
-
});
|
|
139
|
+
const textContent = await getProp({ $el: $anchor, propName: 'textContent', fallback: '' }, timeout);
|
|
151
140
|
const accessibleName = axNode ? axNode.name || '' : textContent.trim();
|
|
152
141
|
const link = {
|
|
153
142
|
href,
|
|
@@ -162,7 +151,11 @@ export async function getAnchorList(page, options) {
|
|
|
162
151
|
/**
|
|
163
152
|
* Extracts comprehensive meta information from the page's `<head>`.
|
|
164
153
|
*
|
|
165
|
-
* Collects
|
|
154
|
+
* Collects all metadata in a single `page.evaluate` call (14 CDP round-trips
|
|
155
|
+
* collapsed into 1) wrapped in {@link raceWithTimeout}. On timeout (an unresponsive
|
|
156
|
+
* page) a minimal `{ title: '' }` is returned rather than hanging.
|
|
157
|
+
*
|
|
158
|
+
* Collected metadata:
|
|
166
159
|
* - `title` - The document title.
|
|
167
160
|
* - `lang` - The `lang` attribute of the `<html>` element.
|
|
168
161
|
* - `description` - The `<meta name="description">` content.
|
|
@@ -173,99 +166,51 @@ export async function getAnchorList(page, options) {
|
|
|
173
166
|
* - Open Graph tags: `og:type`, `og:title`, `og:site_name`, `og:description`, `og:url`, `og:image`.
|
|
174
167
|
* - `twitter:card` - The Twitter Card type.
|
|
175
168
|
* @param page - The Puppeteer page to extract meta information from.
|
|
169
|
+
* @param timeout - Timeout in ms for the evaluation. Defaults to {@link DEFAULT_DOM_EVALUATION_TIMEOUT}.
|
|
176
170
|
* @returns An object containing all extracted meta properties.
|
|
177
171
|
*/
|
|
178
|
-
export async function getMeta(page) {
|
|
172
|
+
export async function getMeta(page, timeout = DEFAULT_DOM_EVALUATION_TIMEOUT) {
|
|
179
173
|
log('Getting Meta');
|
|
180
|
-
const
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
174
|
+
const { result, timeout: timedOut } = await raceWithTimeout(() => page
|
|
175
|
+
.evaluate(() => {
|
|
176
|
+
/* global document, HTMLMetaElement, HTMLLinkElement */
|
|
177
|
+
const content = (selector) => {
|
|
178
|
+
const el = document.querySelector(selector);
|
|
179
|
+
return el instanceof HTMLMetaElement ? el.content : '';
|
|
180
|
+
};
|
|
181
|
+
const linkHref = (selector) => {
|
|
182
|
+
const el = document.querySelector(selector);
|
|
183
|
+
return el instanceof HTMLLinkElement ? el.href : '';
|
|
184
|
+
};
|
|
185
|
+
return {
|
|
186
|
+
title: document.title,
|
|
187
|
+
lang: document.documentElement.lang,
|
|
188
|
+
description: content('meta[name="description"]'),
|
|
189
|
+
keywords: content('meta[name="keywords"]'),
|
|
190
|
+
robots: content('meta[name="robots"]'),
|
|
191
|
+
canonical: linkHref('link[rel="canonical"]'),
|
|
192
|
+
alternate: linkHref('link[rel="alternate"]'),
|
|
193
|
+
'og:type': content('meta[property="og:type"]'),
|
|
194
|
+
'og:title': content('meta[property="og:title"]'),
|
|
195
|
+
'og:site_name': content('meta[property="og:site_name"]'),
|
|
196
|
+
'og:description': content('meta[property="og:description"]'),
|
|
197
|
+
'og:url': content('meta[property="og:url"]'),
|
|
198
|
+
'og:image': content('meta[property="og:image"]'),
|
|
199
|
+
'twitter:card': content('meta[name="twitter:card"]'),
|
|
200
|
+
};
|
|
201
|
+
})
|
|
202
|
+
.catch(() => null), timeout);
|
|
203
|
+
if (timedOut || result == null) {
|
|
204
|
+
log('Meta extraction timed out or failed; returning fallback');
|
|
205
|
+
return { title: '' };
|
|
206
|
+
}
|
|
207
|
+
const { robots: robotsVal, ...rest } = result;
|
|
186
208
|
const robots = new Set(robotsVal.split(',').map((robot) => robot.trim().toLowerCase()));
|
|
187
209
|
const meta = {
|
|
188
|
-
|
|
189
|
-
page,
|
|
190
|
-
selector: 'title',
|
|
191
|
-
propName: 'textContent',
|
|
192
|
-
fallback: '',
|
|
193
|
-
}),
|
|
194
|
-
lang: await getPropBySelector({
|
|
195
|
-
page,
|
|
196
|
-
selector: 'html',
|
|
197
|
-
propName: 'lang',
|
|
198
|
-
fallback: '',
|
|
199
|
-
}),
|
|
200
|
-
description: await getPropBySelector({
|
|
201
|
-
page,
|
|
202
|
-
selector: 'meta[name="description"]',
|
|
203
|
-
propName: 'content',
|
|
204
|
-
fallback: '',
|
|
205
|
-
}),
|
|
206
|
-
keywords: await getPropBySelector({
|
|
207
|
-
page,
|
|
208
|
-
selector: 'meta[name="keywords"]',
|
|
209
|
-
propName: 'content',
|
|
210
|
-
fallback: '',
|
|
211
|
-
}),
|
|
210
|
+
...rest,
|
|
212
211
|
noindex: robots.has('noindex'),
|
|
213
212
|
nofollow: robots.has('nofollow'),
|
|
214
213
|
noarchive: robots.has('noarchive'),
|
|
215
|
-
canonical: await getPropBySelector({
|
|
216
|
-
page,
|
|
217
|
-
selector: 'link[rel="canonical"]',
|
|
218
|
-
propName: 'href',
|
|
219
|
-
fallback: '',
|
|
220
|
-
}),
|
|
221
|
-
alternate: await getPropBySelector({
|
|
222
|
-
page,
|
|
223
|
-
selector: 'link[rel="alternate"]',
|
|
224
|
-
propName: 'href',
|
|
225
|
-
fallback: '',
|
|
226
|
-
}),
|
|
227
|
-
'og:type': await getPropBySelector({
|
|
228
|
-
page,
|
|
229
|
-
selector: 'meta[property="og:type"]',
|
|
230
|
-
propName: 'content',
|
|
231
|
-
fallback: '',
|
|
232
|
-
}),
|
|
233
|
-
'og:title': await getPropBySelector({
|
|
234
|
-
page,
|
|
235
|
-
selector: 'meta[property="og:title"]',
|
|
236
|
-
propName: 'content',
|
|
237
|
-
fallback: '',
|
|
238
|
-
}),
|
|
239
|
-
'og:site_name': await getPropBySelector({
|
|
240
|
-
page,
|
|
241
|
-
selector: 'meta[property="og:site_name"]',
|
|
242
|
-
propName: 'content',
|
|
243
|
-
fallback: '',
|
|
244
|
-
}),
|
|
245
|
-
'og:description': await getPropBySelector({
|
|
246
|
-
page,
|
|
247
|
-
selector: 'meta[property="og:description"]',
|
|
248
|
-
propName: 'content',
|
|
249
|
-
fallback: '',
|
|
250
|
-
}),
|
|
251
|
-
'og:url': await getPropBySelector({
|
|
252
|
-
page,
|
|
253
|
-
selector: 'meta[property="og:url"]',
|
|
254
|
-
propName: 'content',
|
|
255
|
-
fallback: '',
|
|
256
|
-
}),
|
|
257
|
-
'og:image': await getPropBySelector({
|
|
258
|
-
page,
|
|
259
|
-
selector: 'meta[property="og:image"]',
|
|
260
|
-
propName: 'content',
|
|
261
|
-
fallback: '',
|
|
262
|
-
}),
|
|
263
|
-
'twitter:card': await getPropBySelector({
|
|
264
|
-
page,
|
|
265
|
-
selector: 'meta[name="twitter:card"]',
|
|
266
|
-
propName: 'content',
|
|
267
|
-
fallback: '',
|
|
268
|
-
}),
|
|
269
214
|
};
|
|
270
215
|
log('Got meta');
|
|
271
216
|
dLog('Meta data are: %O', meta);
|
package/dist/scraper.js
CHANGED
|
@@ -42,7 +42,7 @@ import { detectCompress } from '@d-zero/shared/detect-compress';
|
|
|
42
42
|
import { retry as retryable } from '@d-zero/shared/retry';
|
|
43
43
|
import { TypedAwaitEventEmitter as EventEmitter } from '@d-zero/shared/typed-await-event-emitter';
|
|
44
44
|
import { resourceLog, scraperLog } from './debug.js';
|
|
45
|
-
import { getAnchorList, getImageList, getMeta } from './dom-evaluation.js';
|
|
45
|
+
import { DEFAULT_DOM_EVALUATION_TIMEOUT, getAnchorList, getImageList, getMeta, } from './dom-evaluation.js';
|
|
46
46
|
import { isError } from './is-error.js';
|
|
47
47
|
import { keywordCheck } from './keyword-check.js';
|
|
48
48
|
import { findDisconnectionFailures } from './network-disconnection.js';
|
|
@@ -107,6 +107,7 @@ let Scraper = (() => {
|
|
|
107
107
|
const parseOpts = options?.disableQueries == null
|
|
108
108
|
? undefined
|
|
109
109
|
: { disableQueries: options.disableQueries };
|
|
110
|
+
const domEvaluationTimeout = options?.domEvaluationTimeout ?? DEFAULT_DOM_EVALUATION_TIMEOUT;
|
|
110
111
|
const networkLogs = {};
|
|
111
112
|
// Clear stale state from previous retries (@retryable may re-invoke this method
|
|
112
113
|
// with the same page and mutable arrays, so we must reset to avoid accumulation)
|
|
@@ -343,7 +344,7 @@ let Scraper = (() => {
|
|
|
343
344
|
isExternal,
|
|
344
345
|
message: '',
|
|
345
346
|
});
|
|
346
|
-
const anchorList = await getAnchorList(page, parseOpts);
|
|
347
|
+
const anchorList = await getAnchorList(page, parseOpts, domEvaluationTimeout);
|
|
347
348
|
void this.emit('changePhase', {
|
|
348
349
|
pid: process.pid,
|
|
349
350
|
name: 'getMeta',
|
|
@@ -351,7 +352,7 @@ let Scraper = (() => {
|
|
|
351
352
|
isExternal,
|
|
352
353
|
message: '',
|
|
353
354
|
});
|
|
354
|
-
const meta = await getMeta(page);
|
|
355
|
+
const meta = await getMeta(page, domEvaluationTimeout);
|
|
355
356
|
const imageList = captureImages
|
|
356
357
|
? await (async () => {
|
|
357
358
|
void this.emit('changePhase', {
|
|
@@ -361,7 +362,7 @@ let Scraper = (() => {
|
|
|
361
362
|
isExternal,
|
|
362
363
|
message: '',
|
|
363
364
|
});
|
|
364
|
-
return this.#fetchImages(page, url.withoutHashAndAuth, isExternal, imageLoadTimeout);
|
|
365
|
+
return this.#fetchImages(page, url.withoutHashAndAuth, isExternal, imageLoadTimeout, domEvaluationTimeout);
|
|
365
366
|
})()
|
|
366
367
|
: [];
|
|
367
368
|
return {
|
|
@@ -381,7 +382,7 @@ let Scraper = (() => {
|
|
|
381
382
|
isSkipped: false,
|
|
382
383
|
};
|
|
383
384
|
}, "#fetchData") }, _private_fetchData_decorators, { kind: "method", name: "#fetchData", static: false, private: true, access: { has: obj => #fetchData in obj, get: obj => obj.#fetchData }, metadata: _metadata }, null, _instanceExtraInitializers);
|
|
384
|
-
__esDecorate(this, _private_fetchImages_descriptor = { value: __setFunctionName(async function (page, url, isExternal, imageLoadTimeout) {
|
|
385
|
+
__esDecorate(this, _private_fetchImages_descriptor = { value: __setFunctionName(async function (page, url, isExternal, imageLoadTimeout, domEvaluationTimeout) {
|
|
385
386
|
const listener = this.#createPageScanListener(isExternal);
|
|
386
387
|
const devices = [
|
|
387
388
|
{ key: 'desktop-compact', preset: devicePresets['desktop-compact'] },
|
|
@@ -423,7 +424,7 @@ let Scraper = (() => {
|
|
|
423
424
|
isExternal,
|
|
424
425
|
message: `📸 ${key}: Extracting images%dots%`,
|
|
425
426
|
});
|
|
426
|
-
const images = await getImageList(page, preset.width);
|
|
427
|
+
const images = await getImageList(page, preset.width, domEvaluationTimeout);
|
|
427
428
|
imageList.push(...images);
|
|
428
429
|
}
|
|
429
430
|
catch (error) {
|
|
@@ -705,6 +706,7 @@ let Scraper = (() => {
|
|
|
705
706
|
* @param url - The page URL string (without hash and auth)
|
|
706
707
|
* @param isExternal - Whether the page is external
|
|
707
708
|
* @param imageLoadTimeout - Timeout (ms) for waiting images to complete loading
|
|
709
|
+
* @param domEvaluationTimeout - Timeout (ms) for the in-page image extraction `page.evaluate`
|
|
708
710
|
* @returns Array of image elements from all device presets (may be partial if some viewports failed)
|
|
709
711
|
*/
|
|
710
712
|
get #fetchImages() { return _private_fetchImages_descriptor.value; }
|
package/dist/types.d.ts
CHANGED
|
@@ -345,4 +345,10 @@ export type ScraperOptions = {
|
|
|
345
345
|
headCheckResult?: PageData;
|
|
346
346
|
/** Timeout (ms) for page.goto(). Default: 60_000 (60s). */
|
|
347
347
|
navigationTimeout?: number;
|
|
348
|
+
/**
|
|
349
|
+
* Timeout (ms) for DOM evaluation operations (meta/image/anchor extraction).
|
|
350
|
+
* Bounds how long extraction may hang on a page with an unresponsive main thread.
|
|
351
|
+
* Default: 30_000 (30s).
|
|
352
|
+
*/
|
|
353
|
+
domEvaluationTimeout?: number;
|
|
348
354
|
};
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@d-zero/beholder",
|
|
3
|
-
"version": "2.1.5",
|
|
3
|
+
"version": "2.1.6",
|
|
4
4
|
"description": "Page-level scraper for web crawling and auditing",
|
|
5
5
|
"author": "D-ZERO",
|
|
6
6
|
"license": "MIT",
|
|
@@ -20,7 +20,7 @@
|
|
|
20
20
|
"clean": "tsc --build --clean"
|
|
21
21
|
},
|
|
22
22
|
"dependencies": {
|
|
23
|
-
"@d-zero/puppeteer-page-scan": "4.5.
|
|
23
|
+
"@d-zero/puppeteer-page-scan": "4.5.1",
|
|
24
24
|
"@d-zero/shared": "0.22.0",
|
|
25
25
|
"debug": "4.4.3",
|
|
26
26
|
"puppeteer": "24.37.5"
|
|
@@ -33,5 +33,5 @@
|
|
|
33
33
|
"url": "https://github.com/d-zero-dev/tools.git",
|
|
34
34
|
"directory": "packages/@d-zero/beholder"
|
|
35
35
|
},
|
|
36
|
-
"gitHead": "
|
|
36
|
+
"gitHead": "25b4043dcd70cf3490ddcefd76a88b22c60f7712"
|
|
37
37
|
}
|