@d-zero/beholder 2.1.5 → 2.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +6 -0
- package/README.md +9 -276
- package/dist/dom-evaluation.d.ts +40 -50
- package/dist/dom-evaluation.js +105 -160
- package/dist/scraper.js +8 -6
- package/dist/types.d.ts +6 -0
- package/package.json +3 -3
- package/src/dom-evaluation.spec.ts +293 -0
- package/src/dom-evaluation.ts +148 -190
- package/src/scraper.ts +14 -4
- package/src/types.ts +6 -0
- package/tsconfig.tsbuildinfo +1 -1
package/src/dom-evaluation.ts
CHANGED
|
@@ -3,12 +3,22 @@
|
|
|
3
3
|
*
|
|
4
4
|
* These functions are called by {@link ./scraper.ts | Scraper.#fetchData} to extract
|
|
5
5
|
* anchors, images, and meta information after page navigation completes.
|
|
6
|
+
*
|
|
7
|
+
* WHY timeouts everywhere: A page whose main thread is blocked (heavy JS, autoplay
|
|
8
|
+
* video players, infinite loops) makes every CDP round-trip hang. `getMeta` and
|
|
9
|
+
* `getImageList` therefore collect all data in a single `page.evaluate` and wrap it
|
|
10
|
+
* in {@link raceWithTimeout} so a blocked thread is abandoned after a bounded budget
|
|
11
|
+
* instead of accumulating per-property timeouts up to the caller's global timeout.
|
|
12
|
+
* Note that `page.evaluate` itself runs on the page's main thread and has no built-in
|
|
13
|
+
* timeout, so the surrounding race is what actually bounds the hang.
|
|
6
14
|
* @see {@link ./types.ts} for the data types returned by these functions
|
|
7
15
|
*/
|
|
8
16
|
|
|
9
|
-
import type { AnchorData, ImageElement, ParseURLOptions } from './types.js';
|
|
17
|
+
import type { AnchorData, ImageElement, Meta, ParseURLOptions } from './types.js';
|
|
10
18
|
import type { ElementHandle, Page } from 'puppeteer';
|
|
11
19
|
|
|
20
|
+
import { raceWithTimeout } from '@d-zero/shared/race-with-timeout';
|
|
21
|
+
|
|
12
22
|
import { domDetailsLog, domLog } from './debug.js';
|
|
13
23
|
import { parseUrl } from './parse-url.js';
|
|
14
24
|
|
|
@@ -16,6 +26,13 @@ const pid = `${process.pid}`;
|
|
|
16
26
|
const log = domLog.extend(pid);
|
|
17
27
|
const dLog = domDetailsLog.extend(pid);
|
|
18
28
|
|
|
29
|
+
/**
|
|
30
|
+
* Default timeout (ms) applied to DOM evaluation operations when the caller does not
|
|
31
|
+
* specify one. Bounds how long a single `page.evaluate` / property read may hang on a
|
|
32
|
+
* page whose main thread is unresponsive.
|
|
33
|
+
*/
|
|
34
|
+
export const DEFAULT_DOM_EVALUATION_TIMEOUT = 30_000;
|
|
35
|
+
|
|
19
36
|
/**
|
|
20
37
|
* Parameters for {@link getProp}.
|
|
21
38
|
* @template T - The expected type of the property value.
|
|
@@ -32,18 +49,24 @@ export interface GetPropParams<T> {
|
|
|
32
49
|
/**
|
|
33
50
|
* Retrieves a DOM property value from a Puppeteer element handle with a timeout.
|
|
34
51
|
*
|
|
35
|
-
* Races the actual property retrieval against a
|
|
52
|
+
* Races the actual property retrieval against a timeout via {@link raceWithTimeout},
|
|
53
|
+
* which clears the loser-side timer so it cannot keep the event loop alive.
|
|
36
54
|
* If the property cannot be read or the timeout expires, the fallback value is returned.
|
|
37
55
|
* @template T - The expected type of the property value.
|
|
38
56
|
* @param params - Parameters containing the element, property name, and fallback.
|
|
39
|
-
* @
|
|
57
|
+
* @param timeout - Timeout in ms before falling back. Defaults to {@link DEFAULT_DOM_EVALUATION_TIMEOUT}.
|
|
58
|
+
* @returns The property value, or the fallback if retrieval fails or times out.
|
|
40
59
|
*/
|
|
41
|
-
export async function getProp<T>(
|
|
60
|
+
export async function getProp<T>(
|
|
61
|
+
params: GetPropParams<T>,
|
|
62
|
+
timeout: number = DEFAULT_DOM_EVALUATION_TIMEOUT,
|
|
63
|
+
): Promise<T> {
|
|
42
64
|
const { $el, propName, fallback } = params;
|
|
43
|
-
|
|
44
|
-
_getProp($el, propName, fallback),
|
|
45
|
-
|
|
46
|
-
|
|
65
|
+
const { result, timeout: timedOut } = await raceWithTimeout(
|
|
66
|
+
() => _getProp($el, propName, fallback),
|
|
67
|
+
timeout,
|
|
68
|
+
);
|
|
69
|
+
return timedOut ? fallback : result;
|
|
47
70
|
}
|
|
48
71
|
|
|
49
72
|
/**
|
|
@@ -54,7 +77,11 @@ export async function getProp<T>(params: GetPropParams<T>) {
|
|
|
54
77
|
* @param fallback - The default value on failure.
|
|
55
78
|
* @returns The property value cast to `T`, or the fallback.
|
|
56
79
|
*/
|
|
57
|
-
async function _getProp<T>(
|
|
80
|
+
async function _getProp<T>(
|
|
81
|
+
$el: ElementHandle<Element>,
|
|
82
|
+
propName: string,
|
|
83
|
+
fallback: T,
|
|
84
|
+
): Promise<T> {
|
|
58
85
|
try {
|
|
59
86
|
const prop = await $el.getProperty(propName);
|
|
60
87
|
if (!prop) {
|
|
@@ -67,109 +94,63 @@ async function _getProp<T>($el: ElementHandle<Element>, propName: string, fallba
|
|
|
67
94
|
}
|
|
68
95
|
}
|
|
69
96
|
|
|
70
|
-
/**
|
|
71
|
-
* Parameters for {@link getPropBySelector}.
|
|
72
|
-
* @template T - The expected type of the property value.
|
|
73
|
-
*/
|
|
74
|
-
export interface GetPropBySelectorParams<T> {
|
|
75
|
-
/** The Puppeteer page to query. */
|
|
76
|
-
readonly page: Page;
|
|
77
|
-
/** A CSS selector to find the target element. */
|
|
78
|
-
readonly selector: string;
|
|
79
|
-
/** The DOM property name to read from the matched element. */
|
|
80
|
-
readonly propName: string;
|
|
81
|
-
/** The default value if no element matches or the property cannot be read. */
|
|
82
|
-
readonly fallback: T;
|
|
83
|
-
}
|
|
84
|
-
|
|
85
|
-
/**
|
|
86
|
-
* Retrieves a DOM property value from the first element matching a CSS selector.
|
|
87
|
-
*
|
|
88
|
-
* Combines `page.$()` with {@link getProp} for convenient single-element lookups.
|
|
89
|
-
* @template T - The expected type of the property value.
|
|
90
|
-
* @param params - Parameters containing the page, selector, property name, and fallback.
|
|
91
|
-
* @returns The property value, or the fallback if the element is not found or retrieval fails.
|
|
92
|
-
*/
|
|
93
|
-
export async function getPropBySelector<T>(params: GetPropBySelectorParams<T>) {
|
|
94
|
-
const { page, selector, propName, fallback } = params;
|
|
95
|
-
const $el = await page.$(selector);
|
|
96
|
-
if (!$el) {
|
|
97
|
-
return fallback;
|
|
98
|
-
}
|
|
99
|
-
|
|
100
|
-
return getProp({ $el, propName, fallback });
|
|
101
|
-
}
|
|
102
|
-
|
|
103
97
|
/**
|
|
104
98
|
* Extracts all `<img>` elements from the page and returns their properties.
|
|
105
99
|
*
|
|
106
|
-
*
|
|
107
|
-
* natural dimensions, lazy-loading status, and
|
|
100
|
+
* Collects every image's `src`, `currentSrc`, `alt`, layout dimensions,
|
|
101
|
+
* natural dimensions, lazy-loading status, and outer HTML in a single
|
|
102
|
+
* `page.evaluate` call, wrapped in {@link raceWithTimeout}. On timeout (an
|
|
103
|
+
* unresponsive page) an empty array is returned rather than hanging.
|
|
108
104
|
* @param page - The Puppeteer page to extract images from.
|
|
109
105
|
* @param viewportWidth - The current viewport width in pixels, recorded alongside each image entry.
|
|
106
|
+
* @param timeout - Timeout in ms for the evaluation. Defaults to {@link DEFAULT_DOM_EVALUATION_TIMEOUT}.
|
|
110
107
|
* @returns An array of {@link ImageElement} objects describing each image on the page.
|
|
111
108
|
*/
|
|
112
109
|
export async function getImageList(
|
|
113
110
|
page: Page,
|
|
114
111
|
viewportWidth: number,
|
|
112
|
+
timeout: number = DEFAULT_DOM_EVALUATION_TIMEOUT,
|
|
115
113
|
): Promise<ImageElement[]> {
|
|
116
114
|
log('Getting images (Viewport: %dpx)', viewportWidth);
|
|
117
115
|
|
|
118
|
-
const
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
fallback: 0,
|
|
146
|
-
});
|
|
147
|
-
const naturalHeight = await getProp({
|
|
148
|
-
$el: $image,
|
|
149
|
-
propName: 'naturalHeight',
|
|
150
|
-
fallback: 0,
|
|
151
|
-
});
|
|
152
|
-
const loading = await getProp({ $el: $image, propName: 'loading', fallback: '' });
|
|
153
|
-
const sourceCode = await getProp({
|
|
154
|
-
$el: $image,
|
|
155
|
-
propName: 'outerHTML',
|
|
156
|
-
fallback: '',
|
|
157
|
-
});
|
|
158
|
-
const isLazy = loading.toLowerCase().trim() === 'lazy';
|
|
159
|
-
imageList.push({
|
|
160
|
-
src,
|
|
161
|
-
currentSrc,
|
|
162
|
-
alt,
|
|
163
|
-
width,
|
|
164
|
-
height,
|
|
165
|
-
naturalWidth,
|
|
166
|
-
naturalHeight,
|
|
167
|
-
isLazy,
|
|
116
|
+
const { result, timeout: timedOut } = await raceWithTimeout(
|
|
117
|
+
() =>
|
|
118
|
+
page
|
|
119
|
+
.evaluate(() => {
|
|
120
|
+
/* global document */
|
|
121
|
+
return [...document.images].map((img) => {
|
|
122
|
+
const rect = img.getBoundingClientRect();
|
|
123
|
+
return {
|
|
124
|
+
src: img.src,
|
|
125
|
+
currentSrc: img.currentSrc,
|
|
126
|
+
alt: img.alt,
|
|
127
|
+
width: rect.width,
|
|
128
|
+
height: rect.height,
|
|
129
|
+
naturalWidth: img.naturalWidth,
|
|
130
|
+
naturalHeight: img.naturalHeight,
|
|
131
|
+
loading: img.loading,
|
|
132
|
+
sourceCode: img.outerHTML,
|
|
133
|
+
};
|
|
134
|
+
});
|
|
135
|
+
})
|
|
136
|
+
.catch(() => null),
|
|
137
|
+
timeout,
|
|
138
|
+
);
|
|
139
|
+
|
|
140
|
+
if (timedOut || result == null) {
|
|
141
|
+
log(
|
|
142
|
+
'Image extraction timed out or failed (Viewport: %dpx); returning []',
|
|
168
143
|
viewportWidth,
|
|
169
|
-
|
|
170
|
-
|
|
144
|
+
);
|
|
145
|
+
return [];
|
|
171
146
|
}
|
|
172
147
|
|
|
148
|
+
const imageList: ImageElement[] = result.map(({ loading, ...img }) => ({
|
|
149
|
+
...img,
|
|
150
|
+
isLazy: loading.toLowerCase().trim() === 'lazy',
|
|
151
|
+
viewportWidth,
|
|
152
|
+
}));
|
|
153
|
+
|
|
173
154
|
log('Got %d images (Viewport: %dpx)', imageList.length, viewportWidth);
|
|
174
155
|
dLog(
|
|
175
156
|
'Images are: %O',
|
|
@@ -184,29 +165,41 @@ export async function getImageList(
|
|
|
184
165
|
* For each anchor, resolves the `href` to an `ExURL` via `parseUrl`, retrieves
|
|
185
166
|
* the accessible name (from the accessibility tree, falling back to `textContent`),
|
|
186
167
|
* and filters out non-HTTP links.
|
|
168
|
+
*
|
|
169
|
+
* WHY this keeps per-element CDP calls (unlike {@link getMeta} / {@link getImageList}):
|
|
170
|
+
* the accessible name comes from Chrome's computed accessibility tree
|
|
171
|
+
* (`page.accessibility.snapshot`), which is a CDP-only feature unavailable to in-page
|
|
172
|
+
* DOM APIs. Each {@link getProp} read is still bounded by `timeout`.
|
|
187
173
|
* @param page - The Puppeteer page to extract anchors from.
|
|
188
174
|
* @param options - Optional URL parsing options (e.g., `disableQueries`).
|
|
175
|
+
* @param timeout - Timeout in ms per property read. Defaults to {@link DEFAULT_DOM_EVALUATION_TIMEOUT}.
|
|
189
176
|
* @returns An array of {@link AnchorData} objects for all HTTP(S) links found on the page.
|
|
190
177
|
*/
|
|
191
|
-
export async function getAnchorList(
|
|
178
|
+
export async function getAnchorList(
|
|
179
|
+
page: Page,
|
|
180
|
+
options?: ParseURLOptions,
|
|
181
|
+
timeout: number = DEFAULT_DOM_EVALUATION_TIMEOUT,
|
|
182
|
+
) {
|
|
192
183
|
log('Getting anchors');
|
|
193
184
|
|
|
194
185
|
const $anchors = await page.$$('a[href], area[href]');
|
|
195
186
|
const anchorList: AnchorData[] = [];
|
|
196
187
|
|
|
197
188
|
for (const $anchor of $anchors) {
|
|
198
|
-
const $href = await getProp(
|
|
189
|
+
const $href = await getProp(
|
|
190
|
+
{ $el: $anchor, propName: 'href', fallback: '' },
|
|
191
|
+
timeout,
|
|
192
|
+
);
|
|
199
193
|
const hrefVal = $href.toString();
|
|
200
194
|
const href = parseUrl(hrefVal, options);
|
|
201
195
|
if (!href || !href.isHTTP) {
|
|
202
196
|
continue;
|
|
203
197
|
}
|
|
204
198
|
const axNode = await page.accessibility.snapshot({ root: $anchor });
|
|
205
|
-
const textContent = await getProp(
|
|
206
|
-
$el: $anchor,
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
});
|
|
199
|
+
const textContent = await getProp(
|
|
200
|
+
{ $el: $anchor, propName: 'textContent', fallback: '' },
|
|
201
|
+
timeout,
|
|
202
|
+
);
|
|
210
203
|
const accessibleName = axNode ? axNode.name || '' : textContent.trim();
|
|
211
204
|
const link: AnchorData = {
|
|
212
205
|
href,
|
|
@@ -226,7 +219,11 @@ export async function getAnchorList(page: Page, options?: ParseURLOptions) {
|
|
|
226
219
|
/**
|
|
227
220
|
* Extracts comprehensive meta information from the page's `<head>`.
|
|
228
221
|
*
|
|
229
|
-
* Collects
|
|
222
|
+
* Collects all metadata in a single `page.evaluate` call (14 CDP round-trips
|
|
223
|
+
* collapsed into 1) wrapped in {@link raceWithTimeout}. On timeout (an unresponsive
|
|
224
|
+
* page) a minimal `{ title: '' }` is returned rather than hanging.
|
|
225
|
+
*
|
|
226
|
+
* Collected metadata:
|
|
230
227
|
* - `title` - The document title.
|
|
231
228
|
* - `lang` - The `lang` attribute of the `<html>` element.
|
|
232
229
|
* - `description` - The `<meta name="description">` content.
|
|
@@ -237,100 +234,61 @@ export async function getAnchorList(page: Page, options?: ParseURLOptions) {
|
|
|
237
234
|
* - Open Graph tags: `og:type`, `og:title`, `og:site_name`, `og:description`, `og:url`, `og:image`.
|
|
238
235
|
* - `twitter:card` - The Twitter Card type.
|
|
239
236
|
* @param page - The Puppeteer page to extract meta information from.
|
|
237
|
+
* @param timeout - Timeout in ms for the evaluation. Defaults to {@link DEFAULT_DOM_EVALUATION_TIMEOUT}.
|
|
240
238
|
* @returns An object containing all extracted meta properties.
|
|
241
239
|
*/
|
|
242
|
-
export async function getMeta(
|
|
240
|
+
export async function getMeta(
|
|
241
|
+
page: Page,
|
|
242
|
+
timeout: number = DEFAULT_DOM_EVALUATION_TIMEOUT,
|
|
243
|
+
): Promise<Meta> {
|
|
243
244
|
log('Getting Meta');
|
|
244
245
|
|
|
245
|
-
const
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
246
|
+
const { result, timeout: timedOut } = await raceWithTimeout(
|
|
247
|
+
() =>
|
|
248
|
+
page
|
|
249
|
+
.evaluate(() => {
|
|
250
|
+
/* global document, HTMLMetaElement, HTMLLinkElement */
|
|
251
|
+
const content = (selector: string): string => {
|
|
252
|
+
const el = document.querySelector(selector);
|
|
253
|
+
return el instanceof HTMLMetaElement ? el.content : '';
|
|
254
|
+
};
|
|
255
|
+
const linkHref = (selector: string): string => {
|
|
256
|
+
const el = document.querySelector(selector);
|
|
257
|
+
return el instanceof HTMLLinkElement ? el.href : '';
|
|
258
|
+
};
|
|
259
|
+
return {
|
|
260
|
+
title: document.title,
|
|
261
|
+
lang: document.documentElement.lang,
|
|
262
|
+
description: content('meta[name="description"]'),
|
|
263
|
+
keywords: content('meta[name="keywords"]'),
|
|
264
|
+
robots: content('meta[name="robots"]'),
|
|
265
|
+
canonical: linkHref('link[rel="canonical"]'),
|
|
266
|
+
alternate: linkHref('link[rel="alternate"]'),
|
|
267
|
+
'og:type': content('meta[property="og:type"]'),
|
|
268
|
+
'og:title': content('meta[property="og:title"]'),
|
|
269
|
+
'og:site_name': content('meta[property="og:site_name"]'),
|
|
270
|
+
'og:description': content('meta[property="og:description"]'),
|
|
271
|
+
'og:url': content('meta[property="og:url"]'),
|
|
272
|
+
'og:image': content('meta[property="og:image"]'),
|
|
273
|
+
'twitter:card': content('meta[name="twitter:card"]'),
|
|
274
|
+
};
|
|
275
|
+
})
|
|
276
|
+
.catch(() => null),
|
|
277
|
+
timeout,
|
|
278
|
+
);
|
|
279
|
+
|
|
280
|
+
if (timedOut || result == null) {
|
|
281
|
+
log('Meta extraction timed out or failed; returning fallback');
|
|
282
|
+
return { title: '' };
|
|
283
|
+
}
|
|
284
|
+
|
|
285
|
+
const { robots: robotsVal, ...rest } = result;
|
|
251
286
|
const robots = new Set(robotsVal.split(',').map((robot) => robot.trim().toLowerCase()));
|
|
252
|
-
const meta = {
|
|
253
|
-
|
|
254
|
-
page,
|
|
255
|
-
selector: 'title',
|
|
256
|
-
propName: 'textContent',
|
|
257
|
-
fallback: '',
|
|
258
|
-
}),
|
|
259
|
-
lang: await getPropBySelector({
|
|
260
|
-
page,
|
|
261
|
-
selector: 'html',
|
|
262
|
-
propName: 'lang',
|
|
263
|
-
fallback: '',
|
|
264
|
-
}),
|
|
265
|
-
description: await getPropBySelector({
|
|
266
|
-
page,
|
|
267
|
-
selector: 'meta[name="description"]',
|
|
268
|
-
propName: 'content',
|
|
269
|
-
fallback: '',
|
|
270
|
-
}),
|
|
271
|
-
keywords: await getPropBySelector({
|
|
272
|
-
page,
|
|
273
|
-
selector: 'meta[name="keywords"]',
|
|
274
|
-
propName: 'content',
|
|
275
|
-
fallback: '',
|
|
276
|
-
}),
|
|
287
|
+
const meta: Meta = {
|
|
288
|
+
...rest,
|
|
277
289
|
noindex: robots.has('noindex'),
|
|
278
290
|
nofollow: robots.has('nofollow'),
|
|
279
291
|
noarchive: robots.has('noarchive'),
|
|
280
|
-
canonical: await getPropBySelector({
|
|
281
|
-
page,
|
|
282
|
-
selector: 'link[rel="canonical"]',
|
|
283
|
-
propName: 'href',
|
|
284
|
-
fallback: '',
|
|
285
|
-
}),
|
|
286
|
-
alternate: await getPropBySelector({
|
|
287
|
-
page,
|
|
288
|
-
selector: 'link[rel="alternate"]',
|
|
289
|
-
propName: 'href',
|
|
290
|
-
fallback: '',
|
|
291
|
-
}),
|
|
292
|
-
'og:type': await getPropBySelector({
|
|
293
|
-
page,
|
|
294
|
-
selector: 'meta[property="og:type"]',
|
|
295
|
-
propName: 'content',
|
|
296
|
-
fallback: '',
|
|
297
|
-
}),
|
|
298
|
-
'og:title': await getPropBySelector({
|
|
299
|
-
page,
|
|
300
|
-
selector: 'meta[property="og:title"]',
|
|
301
|
-
propName: 'content',
|
|
302
|
-
fallback: '',
|
|
303
|
-
}),
|
|
304
|
-
'og:site_name': await getPropBySelector({
|
|
305
|
-
page,
|
|
306
|
-
selector: 'meta[property="og:site_name"]',
|
|
307
|
-
propName: 'content',
|
|
308
|
-
fallback: '',
|
|
309
|
-
}),
|
|
310
|
-
'og:description': await getPropBySelector({
|
|
311
|
-
page,
|
|
312
|
-
selector: 'meta[property="og:description"]',
|
|
313
|
-
propName: 'content',
|
|
314
|
-
fallback: '',
|
|
315
|
-
}),
|
|
316
|
-
'og:url': await getPropBySelector({
|
|
317
|
-
page,
|
|
318
|
-
selector: 'meta[property="og:url"]',
|
|
319
|
-
propName: 'content',
|
|
320
|
-
fallback: '',
|
|
321
|
-
}),
|
|
322
|
-
'og:image': await getPropBySelector({
|
|
323
|
-
page,
|
|
324
|
-
selector: 'meta[property="og:image"]',
|
|
325
|
-
propName: 'content',
|
|
326
|
-
fallback: '',
|
|
327
|
-
}),
|
|
328
|
-
'twitter:card': await getPropBySelector({
|
|
329
|
-
page,
|
|
330
|
-
selector: 'meta[name="twitter:card"]',
|
|
331
|
-
propName: 'content',
|
|
332
|
-
fallback: '',
|
|
333
|
-
}),
|
|
334
292
|
};
|
|
335
293
|
|
|
336
294
|
log('Got meta');
|
package/src/scraper.ts
CHANGED
|
@@ -22,7 +22,12 @@ import { retry as retryable } from '@d-zero/shared/retry';
|
|
|
22
22
|
import { TypedAwaitEventEmitter as EventEmitter } from '@d-zero/shared/typed-await-event-emitter';
|
|
23
23
|
|
|
24
24
|
import { resourceLog, scraperLog } from './debug.js';
|
|
25
|
-
import {
|
|
25
|
+
import {
|
|
26
|
+
DEFAULT_DOM_EVALUATION_TIMEOUT,
|
|
27
|
+
getAnchorList,
|
|
28
|
+
getImageList,
|
|
29
|
+
getMeta,
|
|
30
|
+
} from './dom-evaluation.js';
|
|
26
31
|
import { isError } from './is-error.js';
|
|
27
32
|
import { keywordCheck } from './keyword-check.js';
|
|
28
33
|
import { findDisconnectionFailures } from './network-disconnection.js';
|
|
@@ -360,6 +365,8 @@ export default class Scraper extends EventEmitter<ScraperEventTypes> {
|
|
|
360
365
|
options?.disableQueries == null
|
|
361
366
|
? undefined
|
|
362
367
|
: { disableQueries: options.disableQueries };
|
|
368
|
+
const domEvaluationTimeout =
|
|
369
|
+
options?.domEvaluationTimeout ?? DEFAULT_DOM_EVALUATION_TIMEOUT;
|
|
363
370
|
const networkLogs: Record<string, NetworkLog> = {};
|
|
364
371
|
|
|
365
372
|
// Clear stale state from previous retries (@retryable may re-invoke this method
|
|
@@ -626,7 +633,7 @@ export default class Scraper extends EventEmitter<ScraperEventTypes> {
|
|
|
626
633
|
isExternal,
|
|
627
634
|
message: '',
|
|
628
635
|
});
|
|
629
|
-
const anchorList = await getAnchorList(page, parseOpts);
|
|
636
|
+
const anchorList = await getAnchorList(page, parseOpts, domEvaluationTimeout);
|
|
630
637
|
|
|
631
638
|
void this.emit('changePhase', {
|
|
632
639
|
pid: process.pid,
|
|
@@ -635,7 +642,7 @@ export default class Scraper extends EventEmitter<ScraperEventTypes> {
|
|
|
635
642
|
isExternal,
|
|
636
643
|
message: '',
|
|
637
644
|
});
|
|
638
|
-
const meta = await getMeta(page);
|
|
645
|
+
const meta = await getMeta(page, domEvaluationTimeout);
|
|
639
646
|
|
|
640
647
|
const imageList = captureImages
|
|
641
648
|
? await (async () => {
|
|
@@ -651,6 +658,7 @@ export default class Scraper extends EventEmitter<ScraperEventTypes> {
|
|
|
651
658
|
url.withoutHashAndAuth,
|
|
652
659
|
isExternal,
|
|
653
660
|
imageLoadTimeout,
|
|
661
|
+
domEvaluationTimeout,
|
|
654
662
|
);
|
|
655
663
|
})()
|
|
656
664
|
: [];
|
|
@@ -691,6 +699,7 @@ export default class Scraper extends EventEmitter<ScraperEventTypes> {
|
|
|
691
699
|
* @param url - The page URL string (without hash and auth)
|
|
692
700
|
* @param isExternal - Whether the page is external
|
|
693
701
|
* @param imageLoadTimeout - Timeout (ms) for waiting images to complete loading
|
|
702
|
+
* @param domEvaluationTimeout - Timeout (ms) for the in-page image extraction `page.evaluate`
|
|
694
703
|
* @returns Array of image elements from all device presets (may be partial if some viewports failed)
|
|
695
704
|
*/
|
|
696
705
|
@retryable({
|
|
@@ -720,6 +729,7 @@ export default class Scraper extends EventEmitter<ScraperEventTypes> {
|
|
|
720
729
|
url: string,
|
|
721
730
|
isExternal: boolean,
|
|
722
731
|
imageLoadTimeout: number,
|
|
732
|
+
domEvaluationTimeout: number,
|
|
723
733
|
): Promise<ImageElement[]> {
|
|
724
734
|
const listener = this.#createPageScanListener(isExternal);
|
|
725
735
|
const devices: { key: string; preset: { width: number; resolution?: number } }[] = [
|
|
@@ -767,7 +777,7 @@ export default class Scraper extends EventEmitter<ScraperEventTypes> {
|
|
|
767
777
|
isExternal,
|
|
768
778
|
message: `📸 ${key}: Extracting images%dots%`,
|
|
769
779
|
});
|
|
770
|
-
const images = await getImageList(page, preset.width);
|
|
780
|
+
const images = await getImageList(page, preset.width, domEvaluationTimeout);
|
|
771
781
|
imageList.push(...images);
|
|
772
782
|
} catch (error) {
|
|
773
783
|
const errorMessage = error instanceof Error ? error.message : String(error);
|
package/src/types.ts
CHANGED
|
@@ -427,4 +427,10 @@ export type ScraperOptions = {
|
|
|
427
427
|
headCheckResult?: PageData;
|
|
428
428
|
/** Timeout (ms) for page.goto(). Default: 60_000 (60s). */
|
|
429
429
|
navigationTimeout?: number;
|
|
430
|
+
/**
|
|
431
|
+
* Timeout (ms) for DOM evaluation operations (meta/image/anchor extraction).
|
|
432
|
+
* Bounds how long extraction may hang on a page with an unresponsive main thread.
|
|
433
|
+
* Default: 30_000 (30s).
|
|
434
|
+
*/
|
|
435
|
+
domEvaluationTimeout?: number;
|
|
430
436
|
};
|