@d-zero/beholder 2.1.6 → 3.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +44 -0
- package/README.md +26 -0
- package/dist/dom-evaluation.d.ts +72 -24
- package/dist/dom-evaluation.js +310 -84
- package/dist/extract-meta.d.ts +98 -0
- package/dist/extract-meta.js +75 -0
- package/dist/index.d.ts +3 -1
- package/dist/index.js +1 -0
- package/dist/meta/classify.d.ts +52 -0
- package/dist/meta/classify.js +731 -0
- package/dist/meta/collect-head.d.ts +63 -0
- package/dist/meta/collect-head.js +223 -0
- package/dist/meta/id-extractors.d.ts +40 -0
- package/dist/meta/id-extractors.js +196 -0
- package/dist/meta/keys.d.ts +41 -0
- package/dist/meta/keys.js +507 -0
- package/dist/meta/parsers.d.ts +74 -0
- package/dist/meta/parsers.js +293 -0
- package/dist/meta/tag-detection.d.ts +59 -0
- package/dist/meta/tag-detection.js +120 -0
- package/dist/meta/types.d.ts +874 -0
- package/dist/meta/types.js +12 -0
- package/dist/scraper.js +15 -13
- package/dist/types.d.ts +3 -38
- package/package.json +8 -5
- package/src/dom-evaluation.spec.ts +301 -73
- package/src/dom-evaluation.ts +417 -88
- package/src/extract-meta.spec.ts +247 -0
- package/src/extract-meta.ts +121 -0
- package/src/index.ts +45 -0
- package/src/meta/classify.spec.ts +281 -0
- package/src/meta/classify.ts +810 -0
- package/src/meta/collect-head.ts +247 -0
- package/src/meta/id-extractors.spec.ts +69 -0
- package/src/meta/id-extractors.ts +206 -0
- package/src/meta/keys.ts +568 -0
- package/src/meta/parsers.spec.ts +178 -0
- package/src/meta/parsers.ts +304 -0
- package/src/meta/simple-wappalyzer.d.ts +37 -0
- package/src/meta/tag-detection.spec.ts +134 -0
- package/src/meta/tag-detection.ts +161 -0
- package/src/meta/types.ts +949 -0
- package/src/scraper.ts +19 -13
- package/src/types.ts +49 -55
- package/tsconfig.tsbuildinfo +1 -1
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Type definitions for the `Meta` data extracted from a page's `<head>` and full document.
|
|
3
|
+
*
|
|
4
|
+
* Structure follows the reference table in `frontmatter-keys.md`, with one dot-path
|
|
5
|
+
* field per category. Optional fields are absent when not detected on the page.
|
|
6
|
+
* Array fields are required and default to `[]` so consumers can iterate without
|
|
7
|
+
* null-checks.
|
|
8
|
+
* @see {@link ./classify.ts} for the function that builds `Meta` from raw head entries
|
|
9
|
+
* @see {@link ./parsers.ts} for the value normalizers used by `classify`
|
|
10
|
+
* @module
|
|
11
|
+
*/
|
|
12
|
+
export {};
|
package/dist/scraper.js
CHANGED
|
@@ -45,6 +45,7 @@ import { resourceLog, scraperLog } from './debug.js';
|
|
|
45
45
|
import { DEFAULT_DOM_EVALUATION_TIMEOUT, getAnchorList, getImageList, getMeta, } from './dom-evaluation.js';
|
|
46
46
|
import { isError } from './is-error.js';
|
|
47
47
|
import { keywordCheck } from './keyword-check.js';
|
|
48
|
+
import { emptyMeta } from './meta/classify.js';
|
|
48
49
|
import { findDisconnectionFailures } from './network-disconnection.js';
|
|
49
50
|
import { parseUrl } from './parse-url.js';
|
|
50
51
|
const pid = `${process.pid}`;
|
|
@@ -266,9 +267,7 @@ let Scraper = (() => {
|
|
|
266
267
|
contentType,
|
|
267
268
|
contentLength,
|
|
268
269
|
responseHeaders,
|
|
269
|
-
meta:
|
|
270
|
-
title: '',
|
|
271
|
-
},
|
|
270
|
+
meta: emptyMeta(),
|
|
272
271
|
imageList: [],
|
|
273
272
|
anchorList: [],
|
|
274
273
|
html: '',
|
|
@@ -300,6 +299,8 @@ let Scraper = (() => {
|
|
|
300
299
|
};
|
|
301
300
|
});
|
|
302
301
|
if (isExternal) {
|
|
302
|
+
const externalMeta = emptyMeta();
|
|
303
|
+
externalMeta.title = title;
|
|
303
304
|
return {
|
|
304
305
|
url,
|
|
305
306
|
isTarget: false,
|
|
@@ -310,9 +311,7 @@ let Scraper = (() => {
|
|
|
310
311
|
contentType,
|
|
311
312
|
contentLength,
|
|
312
313
|
responseHeaders,
|
|
313
|
-
meta:
|
|
314
|
-
title,
|
|
315
|
-
},
|
|
314
|
+
meta: externalMeta,
|
|
316
315
|
imageList: [],
|
|
317
316
|
anchorList: [],
|
|
318
317
|
html,
|
|
@@ -342,7 +341,7 @@ let Scraper = (() => {
|
|
|
342
341
|
name: 'getAnchors',
|
|
343
342
|
url,
|
|
344
343
|
isExternal,
|
|
345
|
-
message:
|
|
344
|
+
message: `%countdown(${domEvaluationTimeout},getAnchors_${url.withoutHash},s)%s`,
|
|
346
345
|
});
|
|
347
346
|
const anchorList = await getAnchorList(page, parseOpts, domEvaluationTimeout);
|
|
348
347
|
void this.emit('changePhase', {
|
|
@@ -350,9 +349,14 @@ let Scraper = (() => {
|
|
|
350
349
|
name: 'getMeta',
|
|
351
350
|
url,
|
|
352
351
|
isExternal,
|
|
353
|
-
message:
|
|
352
|
+
message: `%countdown(${domEvaluationTimeout},getMeta_${url.withoutHash},s)%s`,
|
|
354
353
|
});
|
|
355
|
-
const meta = await getMeta(page,
|
|
354
|
+
const meta = await getMeta(page, {
|
|
355
|
+
url: url.withoutHashAndAuth,
|
|
356
|
+
html,
|
|
357
|
+
statusCode: status,
|
|
358
|
+
headers: responseHeaders ?? undefined,
|
|
359
|
+
}, domEvaluationTimeout);
|
|
356
360
|
const imageList = captureImages
|
|
357
361
|
? await (async () => {
|
|
358
362
|
void this.emit('changePhase', {
|
|
@@ -360,7 +364,7 @@ let Scraper = (() => {
|
|
|
360
364
|
name: 'extractImages',
|
|
361
365
|
url,
|
|
362
366
|
isExternal,
|
|
363
|
-
message:
|
|
367
|
+
message: `%countdown(${domEvaluationTimeout},extractImages_${url.withoutHash},s)%s`,
|
|
364
368
|
});
|
|
365
369
|
return this.#fetchImages(page, url.withoutHashAndAuth, isExternal, imageLoadTimeout, domEvaluationTimeout);
|
|
366
370
|
})()
|
|
@@ -510,9 +514,7 @@ let Scraper = (() => {
|
|
|
510
514
|
contentType: null,
|
|
511
515
|
contentLength: null,
|
|
512
516
|
responseHeaders: {},
|
|
513
|
-
meta:
|
|
514
|
-
title: '',
|
|
515
|
-
},
|
|
517
|
+
meta: emptyMeta(),
|
|
516
518
|
imageList: [],
|
|
517
519
|
anchorList: [],
|
|
518
520
|
html: '',
|
package/dist/types.d.ts
CHANGED
|
@@ -7,6 +7,8 @@
|
|
|
7
7
|
export type { ExURL, ParseURLOptions } from '@d-zero/shared/parse-url';
|
|
8
8
|
export type { CompressType } from '@d-zero/shared/detect-compress';
|
|
9
9
|
export type { CDNType } from '@d-zero/shared/detect-cdn';
|
|
10
|
+
export type { Meta, OpenGraphMeta, OgArticleMeta, OgBookMeta, OgProfileMeta, OgMusicMeta, OgVideoNsMeta, TwitterMeta, FbMeta, FediverseMeta, AppleMeta, MsApplicationMeta, VerificationMeta, GoogleMeta, GeoMeta, CitationMeta, RdfaMeta, MicrodataMeta, AmpMeta, LegacyMeta, MobileMeta, MicroformatsMeta, PinterestMeta, SlackMeta, LinkedInMeta, ExperimentalMeta, WikiMeta, LinkMeta, LinkEntry, JsonLdEntry, OthersBucket, ScriptEntry, IframeEntry, TagsMeta, TagDetail, TagEntry, TagSource, ViewportMeta, RobotsMeta, ReferrerMeta, FormatDetectionMeta, HttpEquivMeta, HttpEquivRefresh, RawHeadEntry, } from './meta/types.js';
|
|
11
|
+
import type { Meta } from './meta/types.js';
|
|
10
12
|
import type { CDNType } from '@d-zero/shared/detect-cdn';
|
|
11
13
|
import type { CompressType } from '@d-zero/shared/detect-compress';
|
|
12
14
|
import type { ExURL } from '@d-zero/shared/parse-url';
|
|
@@ -134,43 +136,6 @@ export type AnchorData = {
|
|
|
134
136
|
*/
|
|
135
137
|
isExternal?: boolean;
|
|
136
138
|
};
|
|
137
|
-
/**
|
|
138
|
-
* Metadata extracted from a page's `<head>` element.
|
|
139
|
-
*/
|
|
140
|
-
export type Meta = {
|
|
141
|
-
/** The `lang` attribute of the `<html>` element. */
|
|
142
|
-
lang?: string;
|
|
143
|
-
/** The text content of the `<title>` element. */
|
|
144
|
-
title: string;
|
|
145
|
-
/** The `content` attribute of `<meta name="description">`. */
|
|
146
|
-
description?: string;
|
|
147
|
-
/** The `content` attribute of `<meta name="keywords">`. */
|
|
148
|
-
keywords?: string;
|
|
149
|
-
/** Whether `noindex` is present in the robots meta tag. */
|
|
150
|
-
noindex?: boolean;
|
|
151
|
-
/** Whether `nofollow` is present in the robots meta tag. */
|
|
152
|
-
nofollow?: boolean;
|
|
153
|
-
/** Whether `noarchive` is present in the robots meta tag. */
|
|
154
|
-
noarchive?: boolean;
|
|
155
|
-
/** The canonical URL from `<link rel="canonical">`. */
|
|
156
|
-
canonical?: string;
|
|
157
|
-
/** The alternate URL from `<link rel="alternate">`. */
|
|
158
|
-
alternate?: string;
|
|
159
|
-
/** The Open Graph type (`og:type`). */
|
|
160
|
-
'og:type'?: string;
|
|
161
|
-
/** The Open Graph title (`og:title`). */
|
|
162
|
-
'og:title'?: string;
|
|
163
|
-
/** The Open Graph site name (`og:site_name`). */
|
|
164
|
-
'og:site_name'?: string;
|
|
165
|
-
/** The Open Graph description (`og:description`). */
|
|
166
|
-
'og:description'?: string;
|
|
167
|
-
/** The Open Graph URL (`og:url`). */
|
|
168
|
-
'og:url'?: string;
|
|
169
|
-
/** The Open Graph image URL (`og:image`). */
|
|
170
|
-
'og:image'?: string;
|
|
171
|
-
/** The Twitter Card type (`twitter:card`). */
|
|
172
|
-
'twitter:card'?: string;
|
|
173
|
-
};
|
|
174
139
|
/**
|
|
175
140
|
* A network request/response log entry captured during page scraping via Puppeteer.
|
|
176
141
|
*/
|
|
@@ -348,7 +313,7 @@ export type ScraperOptions = {
|
|
|
348
313
|
/**
|
|
349
314
|
* Timeout (ms) for DOM evaluation operations (meta/image/anchor extraction).
|
|
350
315
|
* Bounds how long extraction may hang on a page with an unresponsive main thread.
|
|
351
|
-
* Default:
|
|
316
|
+
* Default: 180_000 (180s, aligned with the upstream retryable timeout).
|
|
352
317
|
*/
|
|
353
318
|
domEvaluationTimeout?: number;
|
|
354
319
|
};
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@d-zero/beholder",
|
|
3
|
-
"version": "
|
|
3
|
+
"version": "3.1.0",
|
|
4
4
|
"description": "Page-level scraper for web crawling and auditing",
|
|
5
5
|
"author": "D-ZERO",
|
|
6
6
|
"license": "MIT",
|
|
@@ -20,18 +20,21 @@
|
|
|
20
20
|
"clean": "tsc --build --clean"
|
|
21
21
|
},
|
|
22
22
|
"dependencies": {
|
|
23
|
-
"@d-zero/puppeteer-page-scan": "4.5.
|
|
23
|
+
"@d-zero/puppeteer-page-scan": "4.5.2",
|
|
24
24
|
"@d-zero/shared": "0.22.0",
|
|
25
25
|
"debug": "4.4.3",
|
|
26
|
-
"puppeteer": "24.37.5"
|
|
26
|
+
"puppeteer": "24.37.5",
|
|
27
|
+
"simple-wappalyzer": "1.1.99"
|
|
27
28
|
},
|
|
28
29
|
"devDependencies": {
|
|
29
|
-
"@types/debug": "4.1.12"
|
|
30
|
+
"@types/debug": "4.1.12",
|
|
31
|
+
"@types/jsdom": "28.0.3",
|
|
32
|
+
"jsdom": "29.1.1"
|
|
30
33
|
},
|
|
31
34
|
"repository": {
|
|
32
35
|
"type": "git",
|
|
33
36
|
"url": "https://github.com/d-zero-dev/tools.git",
|
|
34
37
|
"directory": "packages/@d-zero/beholder"
|
|
35
38
|
},
|
|
36
|
-
"gitHead": "
|
|
39
|
+
"gitHead": "e69344a9d4d45b0ec0ee942f920b84bbd0fb77ae"
|
|
37
40
|
}
|
|
@@ -1,5 +1,8 @@
|
|
|
1
1
|
import type { ElementHandle, Page } from 'puppeteer';
|
|
2
2
|
|
|
3
|
+
import { readFileSync } from 'node:fs';
|
|
4
|
+
import { createRequire } from 'node:module';
|
|
5
|
+
|
|
3
6
|
import { afterEach, describe, expect, it, vi } from 'vitest';
|
|
4
7
|
|
|
5
8
|
import {
|
|
@@ -9,6 +12,7 @@ import {
|
|
|
9
12
|
getMeta,
|
|
10
13
|
getProp,
|
|
11
14
|
} from './dom-evaluation.js';
|
|
15
|
+
import { emptyMeta } from './meta/classify.js';
|
|
12
16
|
|
|
13
17
|
afterEach(() => {
|
|
14
18
|
vi.useRealTimers();
|
|
@@ -38,68 +42,30 @@ function mockElementHandle(value: unknown): ElementHandle<Element> {
|
|
|
38
42
|
}
|
|
39
43
|
|
|
40
44
|
describe('getMeta', () => {
|
|
41
|
-
it('
|
|
42
|
-
const page = mockPageEvaluate({
|
|
43
|
-
title: 'Example',
|
|
44
|
-
lang: 'ja',
|
|
45
|
-
description: 'desc',
|
|
46
|
-
keywords: 'a,b',
|
|
47
|
-
robots: 'noindex, NOFOLLOW',
|
|
48
|
-
canonical: 'https://example.com/',
|
|
49
|
-
alternate: 'https://example.com/en',
|
|
50
|
-
'og:type': 'website',
|
|
51
|
-
'og:title': 'OG Title',
|
|
52
|
-
'og:site_name': 'Site',
|
|
53
|
-
'og:description': 'OG desc',
|
|
54
|
-
'og:url': 'https://example.com/',
|
|
55
|
-
'og:image': 'https://example.com/img.png',
|
|
56
|
-
'twitter:card': 'summary',
|
|
57
|
-
});
|
|
58
|
-
|
|
59
|
-
const meta = await getMeta(page);
|
|
60
|
-
|
|
61
|
-
expect(meta).toStrictEqual({
|
|
62
|
-
title: 'Example',
|
|
63
|
-
lang: 'ja',
|
|
64
|
-
description: 'desc',
|
|
65
|
-
keywords: 'a,b',
|
|
66
|
-
noindex: true,
|
|
67
|
-
nofollow: true,
|
|
68
|
-
noarchive: false,
|
|
69
|
-
canonical: 'https://example.com/',
|
|
70
|
-
alternate: 'https://example.com/en',
|
|
71
|
-
'og:type': 'website',
|
|
72
|
-
'og:title': 'OG Title',
|
|
73
|
-
'og:site_name': 'Site',
|
|
74
|
-
'og:description': 'OG desc',
|
|
75
|
-
'og:url': 'https://example.com/',
|
|
76
|
-
'og:image': 'https://example.com/img.png',
|
|
77
|
-
'twitter:card': 'summary',
|
|
78
|
-
});
|
|
79
|
-
});
|
|
80
|
-
|
|
81
|
-
it('returns a minimal fallback when evaluation rejects', async () => {
|
|
45
|
+
it('returns emptyMeta() when page.evaluate rejects', async () => {
|
|
82
46
|
const page = {
|
|
83
47
|
evaluate: () => Promise.reject(new Error('execution context destroyed')),
|
|
48
|
+
content: () => Promise.resolve('<html></html>'),
|
|
84
49
|
} as unknown as Page;
|
|
85
50
|
|
|
86
|
-
const meta = await getMeta(page);
|
|
51
|
+
const meta = await getMeta(page, { url: 'https://example.com/' });
|
|
87
52
|
|
|
88
|
-
expect(meta).
|
|
53
|
+
expect(meta).toEqual(emptyMeta());
|
|
89
54
|
});
|
|
90
55
|
|
|
91
|
-
it('returns
|
|
56
|
+
it('returns emptyMeta() when the main thread is unresponsive (timeout)', async () => {
|
|
92
57
|
vi.useFakeTimers();
|
|
93
58
|
const page = {
|
|
94
59
|
// Never resolves — simulates a blocked main thread.
|
|
95
60
|
evaluate: () => new Promise(() => {}),
|
|
61
|
+
content: () => new Promise(() => {}),
|
|
96
62
|
} as unknown as Page;
|
|
97
63
|
|
|
98
|
-
const promise = getMeta(page, 5000);
|
|
64
|
+
const promise = getMeta(page, { url: 'https://example.com/' }, 5000);
|
|
99
65
|
await vi.advanceTimersByTimeAsync(5000);
|
|
100
66
|
const meta = await promise;
|
|
101
67
|
|
|
102
|
-
expect(meta).
|
|
68
|
+
expect(meta).toEqual(emptyMeta());
|
|
103
69
|
expect(vi.getTimerCount()).toBe(0);
|
|
104
70
|
});
|
|
105
71
|
});
|
|
@@ -231,15 +197,76 @@ describe('getProp', () => {
|
|
|
231
197
|
});
|
|
232
198
|
});
|
|
233
199
|
|
|
200
|
+
/**
|
|
201
|
+
* Builds an anchor element handle whose `remoteObject().objectId` and per-property
|
|
202
|
+
* reads can be customized for the new Strategy F implementation.
|
|
203
|
+
* @param objectId The remote object id used to map this handle back to an AX node.
|
|
204
|
+
* @param props Property values returned by `getProperty(propName).jsonValue()`.
|
|
205
|
+
*/
|
|
206
|
+
function mockAnchorHandle(
|
|
207
|
+
objectId: string,
|
|
208
|
+
props: Record<string, unknown>,
|
|
209
|
+
): ElementHandle<Element> {
|
|
210
|
+
return {
|
|
211
|
+
remoteObject: () => ({ objectId }),
|
|
212
|
+
getProperty: (propName: string) =>
|
|
213
|
+
Promise.resolve({
|
|
214
|
+
jsonValue: () => Promise.resolve(props[propName] ?? ''),
|
|
215
|
+
}),
|
|
216
|
+
} as unknown as ElementHandle<Element>;
|
|
217
|
+
}
|
|
218
|
+
|
|
219
|
+
/**
|
|
220
|
+
* Builds a page mock for the new `getAnchorList` implementation, wiring up
|
|
221
|
+
* `_client()` to return a stub CDP session whose `send(method)` is dispatched
|
|
222
|
+
* by `axNodes`/`describeNodes` (matched by `objectId`).
|
|
223
|
+
* @param args - Mock configuration.
|
|
224
|
+
* @param args.anchors - Anchor element handles to be returned by `page.$$()`.
|
|
225
|
+
* @param args.axNodes - Raw AX nodes returned by `Accessibility.getFullAXTree`.
|
|
226
|
+
* @param args.describeNodes - Map from `objectId` → `backendNodeId` for `DOM.describeNode`.
|
|
227
|
+
* @param args.getFullAXTree - Optional override for `Accessibility.getFullAXTree` (e.g., simulate rejection).
|
|
228
|
+
* @param args.describeNode - Optional override for `DOM.describeNode` (e.g., simulate rejection).
|
|
229
|
+
*/
|
|
230
|
+
function mockPageForAnchors(args: {
|
|
231
|
+
anchors: ElementHandle<Element>[];
|
|
232
|
+
axNodes?: Array<{
|
|
233
|
+
backendDOMNodeId?: number;
|
|
234
|
+
ignored?: boolean;
|
|
235
|
+
name?: { value?: unknown };
|
|
236
|
+
}>;
|
|
237
|
+
describeNodes?: Record<string, number | undefined>;
|
|
238
|
+
getFullAXTree?: () => Promise<unknown>;
|
|
239
|
+
describeNode?: (params: { objectId: string }) => Promise<unknown>;
|
|
240
|
+
}): Page {
|
|
241
|
+
const { anchors, axNodes = [], describeNodes = {}, getFullAXTree, describeNode } = args;
|
|
242
|
+
const client = {
|
|
243
|
+
send: (method: string, params?: { objectId?: string }) => {
|
|
244
|
+
if (method === 'Accessibility.getFullAXTree') {
|
|
245
|
+
return getFullAXTree ? getFullAXTree() : Promise.resolve({ nodes: axNodes });
|
|
246
|
+
}
|
|
247
|
+
if (method === 'DOM.describeNode') {
|
|
248
|
+
if (describeNode) return describeNode({ objectId: params?.objectId ?? '' });
|
|
249
|
+
const backendNodeId =
|
|
250
|
+
params?.objectId == null ? undefined : describeNodes[params.objectId];
|
|
251
|
+
return Promise.resolve({ node: { backendNodeId } });
|
|
252
|
+
}
|
|
253
|
+
return Promise.reject(new Error(`unexpected CDP method: ${method}`));
|
|
254
|
+
},
|
|
255
|
+
};
|
|
256
|
+
return {
|
|
257
|
+
$$: () => Promise.resolve(anchors),
|
|
258
|
+
_client: () => client,
|
|
259
|
+
} as unknown as Page;
|
|
260
|
+
}
|
|
261
|
+
|
|
234
262
|
describe('getAnchorList', () => {
|
|
235
|
-
it('resolves the href and
|
|
236
|
-
const $anchor =
|
|
237
|
-
const page = {
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
} as unknown as Page;
|
|
263
|
+
it('resolves the href and uses the accessible name from the AX tree', async () => {
|
|
264
|
+
const $anchor = mockAnchorHandle('obj-1', { href: 'https://example.com/page' });
|
|
265
|
+
const page = mockPageForAnchors({
|
|
266
|
+
anchors: [$anchor],
|
|
267
|
+
axNodes: [{ backendDOMNodeId: 42, name: { value: 'Accessible Name' } }],
|
|
268
|
+
describeNodes: { 'obj-1': 42 },
|
|
269
|
+
});
|
|
243
270
|
|
|
244
271
|
const anchors = await getAnchorList(page);
|
|
245
272
|
|
|
@@ -248,22 +275,84 @@ describe('getAnchorList', () => {
|
|
|
248
275
|
expect(anchors[0]?.href.href).toBe('https://example.com/page');
|
|
249
276
|
});
|
|
250
277
|
|
|
251
|
-
it('
|
|
278
|
+
it('uses an empty AX name as-is without falling back to textContent', async () => {
|
|
279
|
+
// Mirrors the old `axNode.name || ''` behavior: when the AX tree DOES contain
|
|
280
|
+
// the anchor (so it's not "missing from the tree") but its computed name is
|
|
281
|
+
// empty, we keep the empty string — no textContent fallback.
|
|
282
|
+
const textContent = vi.fn();
|
|
252
283
|
const $anchor = {
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
284
|
+
remoteObject: () => ({ objectId: 'obj-1' }),
|
|
285
|
+
getProperty: (propName: string) => {
|
|
286
|
+
if (propName === 'href') {
|
|
287
|
+
return Promise.resolve({
|
|
288
|
+
jsonValue: () => Promise.resolve('https://example.com/page'),
|
|
289
|
+
});
|
|
290
|
+
}
|
|
291
|
+
textContent();
|
|
292
|
+
return Promise.resolve({ jsonValue: () => Promise.resolve('text fallback') });
|
|
293
|
+
},
|
|
260
294
|
} as unknown as ElementHandle<Element>;
|
|
261
|
-
const page = {
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
295
|
+
const page = mockPageForAnchors({
|
|
296
|
+
anchors: [$anchor],
|
|
297
|
+
axNodes: [{ backendDOMNodeId: 42, name: { value: '' } }],
|
|
298
|
+
describeNodes: { 'obj-1': 42 },
|
|
299
|
+
});
|
|
300
|
+
|
|
301
|
+
const anchors = await getAnchorList(page);
|
|
302
|
+
|
|
303
|
+
expect(anchors).toHaveLength(1);
|
|
304
|
+
expect(anchors[0]?.textContent).toBe('');
|
|
305
|
+
expect(textContent).not.toHaveBeenCalled();
|
|
306
|
+
});
|
|
307
|
+
|
|
308
|
+
it('falls back to textContent for ignored AX nodes (aria-hidden / display:none anchors)', async () => {
|
|
309
|
+
// Mirrors puppeteer's high-level snapshot({root}) with interestingOnly:true,
|
|
310
|
+
// which returns null for ignored nodes — old code then used textContent.
|
|
311
|
+
const $anchor = mockAnchorHandle('obj-1', {
|
|
312
|
+
href: 'https://example.com/page',
|
|
313
|
+
textContent: 'Visible text',
|
|
314
|
+
});
|
|
315
|
+
const page = mockPageForAnchors({
|
|
316
|
+
anchors: [$anchor],
|
|
317
|
+
axNodes: [{ backendDOMNodeId: 42, ignored: true, name: { value: '' } }],
|
|
318
|
+
describeNodes: { 'obj-1': 42 },
|
|
319
|
+
});
|
|
320
|
+
|
|
321
|
+
const anchors = await getAnchorList(page);
|
|
322
|
+
|
|
323
|
+
expect(anchors).toHaveLength(1);
|
|
324
|
+
expect(anchors[0]?.textContent).toBe('Visible text');
|
|
325
|
+
});
|
|
326
|
+
|
|
327
|
+
it('drops a single anchor whose handle throws (detached) without rejecting the whole list', async () => {
|
|
328
|
+
const $detached = {
|
|
329
|
+
remoteObject: () => {
|
|
330
|
+
throw new Error('Handle is detached');
|
|
265
331
|
},
|
|
266
|
-
} as unknown as
|
|
332
|
+
} as unknown as ElementHandle<Element>;
|
|
333
|
+
const $good = mockAnchorHandle('obj-1', { href: 'https://example.com/page' });
|
|
334
|
+
const page = mockPageForAnchors({
|
|
335
|
+
anchors: [$detached, $good],
|
|
336
|
+
axNodes: [{ backendDOMNodeId: 42, name: { value: 'Name' } }],
|
|
337
|
+
describeNodes: { 'obj-1': 42 },
|
|
338
|
+
});
|
|
339
|
+
|
|
340
|
+
const anchors = await getAnchorList(page);
|
|
341
|
+
|
|
342
|
+
expect(anchors).toHaveLength(1);
|
|
343
|
+
expect(anchors[0]?.href.href).toBe('https://example.com/page');
|
|
344
|
+
});
|
|
345
|
+
|
|
346
|
+
it('falls back to trimmed textContent when the anchor is not represented in the AX tree', async () => {
|
|
347
|
+
const $anchor = mockAnchorHandle('obj-1', {
|
|
348
|
+
href: 'https://example.com/page',
|
|
349
|
+
textContent: ' Link text ',
|
|
350
|
+
});
|
|
351
|
+
const page = mockPageForAnchors({
|
|
352
|
+
anchors: [$anchor],
|
|
353
|
+
axNodes: [], // anchor's backendNodeId not present
|
|
354
|
+
describeNodes: { 'obj-1': 99 },
|
|
355
|
+
});
|
|
267
356
|
|
|
268
357
|
const anchors = await getAnchorList(page);
|
|
269
358
|
|
|
@@ -271,23 +360,162 @@ describe('getAnchorList', () => {
|
|
|
271
360
|
expect(anchors[0]?.textContent).toBe('Link text');
|
|
272
361
|
});
|
|
273
362
|
|
|
363
|
+
it('falls back to textContent when the AX tree response is malformed (no `nodes` field)', async () => {
|
|
364
|
+
// Defensive: an unexpected CDP shape must not throw or pollute the map.
|
|
365
|
+
const $anchor = mockAnchorHandle('obj-1', {
|
|
366
|
+
href: 'https://example.com/page',
|
|
367
|
+
textContent: 'Plain text',
|
|
368
|
+
});
|
|
369
|
+
const page = mockPageForAnchors({
|
|
370
|
+
anchors: [$anchor],
|
|
371
|
+
getFullAXTree: () => Promise.resolve({}),
|
|
372
|
+
describeNodes: { 'obj-1': 1 },
|
|
373
|
+
});
|
|
374
|
+
|
|
375
|
+
const anchors = await getAnchorList(page);
|
|
376
|
+
|
|
377
|
+
expect(anchors).toHaveLength(1);
|
|
378
|
+
expect(anchors[0]?.textContent).toBe('Plain text');
|
|
379
|
+
});
|
|
380
|
+
|
|
381
|
+
it('falls back to textContent when DOM.describeNode response is malformed (no `node` field)', async () => {
|
|
382
|
+
// Defensive: an unexpected CDP shape must not throw inside Promise.all.
|
|
383
|
+
const $anchor = mockAnchorHandle('obj-1', {
|
|
384
|
+
href: 'https://example.com/page',
|
|
385
|
+
textContent: 'Plain text',
|
|
386
|
+
});
|
|
387
|
+
const page = mockPageForAnchors({
|
|
388
|
+
anchors: [$anchor],
|
|
389
|
+
axNodes: [{ backendDOMNodeId: 1, name: { value: 'AX Name' } }],
|
|
390
|
+
describeNode: () => Promise.resolve({}),
|
|
391
|
+
});
|
|
392
|
+
|
|
393
|
+
const anchors = await getAnchorList(page);
|
|
394
|
+
|
|
395
|
+
expect(anchors).toHaveLength(1);
|
|
396
|
+
expect(anchors[0]?.textContent).toBe('Plain text');
|
|
397
|
+
});
|
|
398
|
+
|
|
399
|
+
it('falls back to textContent for every anchor when the AX tree fetch rejects', async () => {
|
|
400
|
+
const $anchor = mockAnchorHandle('obj-1', {
|
|
401
|
+
href: 'https://example.com/page',
|
|
402
|
+
textContent: 'Plain text',
|
|
403
|
+
});
|
|
404
|
+
const page = mockPageForAnchors({
|
|
405
|
+
anchors: [$anchor],
|
|
406
|
+
getFullAXTree: () => Promise.reject(new Error('CDP unavailable')),
|
|
407
|
+
describeNodes: { 'obj-1': 1 },
|
|
408
|
+
});
|
|
409
|
+
|
|
410
|
+
const anchors = await getAnchorList(page);
|
|
411
|
+
|
|
412
|
+
expect(anchors).toHaveLength(1);
|
|
413
|
+
expect(anchors[0]?.textContent).toBe('Plain text');
|
|
414
|
+
});
|
|
415
|
+
|
|
416
|
+
it('falls back to textContent when DOM.describeNode rejects for an anchor', async () => {
|
|
417
|
+
const $anchor = mockAnchorHandle('obj-1', {
|
|
418
|
+
href: 'https://example.com/page',
|
|
419
|
+
textContent: 'Plain text',
|
|
420
|
+
});
|
|
421
|
+
const page = mockPageForAnchors({
|
|
422
|
+
anchors: [$anchor],
|
|
423
|
+
axNodes: [{ backendDOMNodeId: 1, name: { value: 'AX Name' } }],
|
|
424
|
+
describeNode: () => Promise.reject(new Error('detached')),
|
|
425
|
+
});
|
|
426
|
+
|
|
427
|
+
const anchors = await getAnchorList(page);
|
|
428
|
+
|
|
429
|
+
expect(anchors).toHaveLength(1);
|
|
430
|
+
expect(anchors[0]?.textContent).toBe('Plain text');
|
|
431
|
+
});
|
|
432
|
+
|
|
433
|
+
it('returns partial results when the overall operation exceeds the timeout', async () => {
|
|
434
|
+
vi.useFakeTimers();
|
|
435
|
+
const $fast = mockAnchorHandle('obj-fast', { href: 'https://example.com/fast' });
|
|
436
|
+
const $slow = {
|
|
437
|
+
remoteObject: () => ({ objectId: 'obj-slow' }),
|
|
438
|
+
getProperty: () => new Promise(() => {}), // never resolves
|
|
439
|
+
} as unknown as ElementHandle<Element>;
|
|
440
|
+
const page = mockPageForAnchors({
|
|
441
|
+
anchors: [$fast, $slow],
|
|
442
|
+
axNodes: [{ backendDOMNodeId: 1, name: { value: 'Fast' } }],
|
|
443
|
+
describeNodes: { 'obj-fast': 1, 'obj-slow': 2 },
|
|
444
|
+
});
|
|
445
|
+
|
|
446
|
+
const promise = getAnchorList(page, undefined, 5000);
|
|
447
|
+
await vi.advanceTimersByTimeAsync(5000);
|
|
448
|
+
const anchors = await promise;
|
|
449
|
+
|
|
450
|
+
// The fast anchor was collected before the overall race tripped; the slow
|
|
451
|
+
// one was abandoned.
|
|
452
|
+
expect(anchors).toHaveLength(1);
|
|
453
|
+
expect(anchors[0]?.href.href).toBe('https://example.com/fast');
|
|
454
|
+
});
|
|
455
|
+
|
|
274
456
|
it('skips non-HTTP links', async () => {
|
|
275
|
-
const $anchor =
|
|
457
|
+
const $anchor = mockAnchorHandle('obj-1', { href: 'javascript:void(0)' });
|
|
458
|
+
const page = mockPageForAnchors({
|
|
459
|
+
anchors: [$anchor],
|
|
460
|
+
axNodes: [{ backendDOMNodeId: 1, name: { value: 'JS link' } }],
|
|
461
|
+
describeNodes: { 'obj-1': 1 },
|
|
462
|
+
});
|
|
463
|
+
|
|
464
|
+
const anchors = await getAnchorList(page);
|
|
465
|
+
|
|
466
|
+
expect(anchors).toStrictEqual([]);
|
|
467
|
+
});
|
|
468
|
+
|
|
469
|
+
it("falls back to textContent for every anchor when puppeteer's internal CDP session is unavailable", async () => {
|
|
470
|
+
const $anchor = mockAnchorHandle('obj-1', {
|
|
471
|
+
href: 'https://example.com/page',
|
|
472
|
+
textContent: ' Plain text ',
|
|
473
|
+
});
|
|
474
|
+
// Page mock without `_client()`: simulates puppeteer wrappers that hide the
|
|
475
|
+
// internal session — the function must still produce anchor data, just
|
|
476
|
+
// without AX names.
|
|
276
477
|
const page = {
|
|
277
478
|
$$: () => Promise.resolve([$anchor]),
|
|
278
|
-
accessibility: {
|
|
279
|
-
snapshot: () => Promise.resolve(null),
|
|
280
|
-
},
|
|
281
479
|
} as unknown as Page;
|
|
282
480
|
|
|
283
481
|
const anchors = await getAnchorList(page);
|
|
284
482
|
|
|
483
|
+
expect(anchors).toHaveLength(1);
|
|
484
|
+
expect(anchors[0]?.textContent).toBe('Plain text');
|
|
485
|
+
});
|
|
486
|
+
|
|
487
|
+
it('returns an empty array when the page has no anchors', async () => {
|
|
488
|
+
const page = mockPageForAnchors({ anchors: [] });
|
|
489
|
+
|
|
490
|
+
const anchors = await getAnchorList(page);
|
|
491
|
+
|
|
285
492
|
expect(anchors).toStrictEqual([]);
|
|
286
493
|
});
|
|
287
494
|
});
|
|
288
495
|
|
|
289
496
|
describe('DEFAULT_DOM_EVALUATION_TIMEOUT', () => {
|
|
290
|
-
it('defaults to
|
|
291
|
-
expect(DEFAULT_DOM_EVALUATION_TIMEOUT).toBe(
|
|
497
|
+
it('defaults to 180 seconds', () => {
|
|
498
|
+
expect(DEFAULT_DOM_EVALUATION_TIMEOUT).toBe(180_000);
|
|
499
|
+
});
|
|
500
|
+
});
|
|
501
|
+
|
|
502
|
+
/**
|
|
503
|
+
* Tripwire: `getAnchorList` reads `(page as any)._client()` to reuse puppeteer's
|
|
504
|
+
* internal CDP session. Unit tests mock that method directly, so a silent
|
|
505
|
+
* removal/rename in a future puppeteer release would not be caught by the
|
|
506
|
+
* functional tests — the production path would just fall back to
|
|
507
|
+
* textContent-only mode without anyone noticing.
|
|
508
|
+
*
|
|
509
|
+
* This block inspects the actual installed puppeteer-core source to assert the
|
|
510
|
+
* `_client()` method still exists. If puppeteer drops or renames it, this test
|
|
511
|
+
* fails and forces a maintainer to update `getInternalCDPClient` instead of
|
|
512
|
+
* silently degrading.
|
|
513
|
+
*/
|
|
514
|
+
describe('puppeteer internal API tripwire', () => {
|
|
515
|
+
it('puppeteer-core CDP Page still defines _client()', () => {
|
|
516
|
+
const require = createRequire(import.meta.url);
|
|
517
|
+
const cdpPagePath = require.resolve('puppeteer-core/lib/cjs/puppeteer/cdp/Page.js');
|
|
518
|
+
const src = readFileSync(cdpPagePath, 'utf8');
|
|
519
|
+
expect(src).toMatch(/_client\s*\(\s*\)\s*\{/);
|
|
292
520
|
});
|
|
293
521
|
});
|