@d-zero/beholder 2.1.6 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. package/CHANGELOG.md +44 -0
  2. package/README.md +26 -0
  3. package/dist/dom-evaluation.d.ts +72 -24
  4. package/dist/dom-evaluation.js +310 -84
  5. package/dist/extract-meta.d.ts +98 -0
  6. package/dist/extract-meta.js +75 -0
  7. package/dist/index.d.ts +3 -1
  8. package/dist/index.js +1 -0
  9. package/dist/meta/classify.d.ts +52 -0
  10. package/dist/meta/classify.js +731 -0
  11. package/dist/meta/collect-head.d.ts +63 -0
  12. package/dist/meta/collect-head.js +223 -0
  13. package/dist/meta/id-extractors.d.ts +40 -0
  14. package/dist/meta/id-extractors.js +196 -0
  15. package/dist/meta/keys.d.ts +41 -0
  16. package/dist/meta/keys.js +507 -0
  17. package/dist/meta/parsers.d.ts +74 -0
  18. package/dist/meta/parsers.js +293 -0
  19. package/dist/meta/tag-detection.d.ts +59 -0
  20. package/dist/meta/tag-detection.js +120 -0
  21. package/dist/meta/types.d.ts +874 -0
  22. package/dist/meta/types.js +12 -0
  23. package/dist/scraper.js +15 -13
  24. package/dist/types.d.ts +3 -38
  25. package/package.json +8 -5
  26. package/src/dom-evaluation.spec.ts +301 -73
  27. package/src/dom-evaluation.ts +417 -88
  28. package/src/extract-meta.spec.ts +247 -0
  29. package/src/extract-meta.ts +121 -0
  30. package/src/index.ts +45 -0
  31. package/src/meta/classify.spec.ts +281 -0
  32. package/src/meta/classify.ts +810 -0
  33. package/src/meta/collect-head.ts +247 -0
  34. package/src/meta/id-extractors.spec.ts +69 -0
  35. package/src/meta/id-extractors.ts +206 -0
  36. package/src/meta/keys.ts +568 -0
  37. package/src/meta/parsers.spec.ts +178 -0
  38. package/src/meta/parsers.ts +304 -0
  39. package/src/meta/simple-wappalyzer.d.ts +37 -0
  40. package/src/meta/tag-detection.spec.ts +134 -0
  41. package/src/meta/tag-detection.ts +161 -0
  42. package/src/meta/types.ts +949 -0
  43. package/src/scraper.ts +19 -13
  44. package/src/types.ts +49 -55
  45. package/tsconfig.tsbuildinfo +1 -1
@@ -0,0 +1,12 @@
1
+ /**
2
+ * Type definitions for the `Meta` data extracted from a page's `<head>` and full document.
3
+ *
4
+ * Structure follows the reference table in `frontmatter-keys.md`, with one dot-path
5
+ * field per category. Optional fields are absent when not detected on the page.
6
+ * Array fields are required and default to `[]` so consumers can iterate without
7
+ * null-checks.
8
+ * @see {@link ./classify.ts} for the function that builds `Meta` from raw head entries
9
+ * @see {@link ./parsers.ts} for the value normalizers used by `classify`
10
+ * @module
11
+ */
12
+ export {};
package/dist/scraper.js CHANGED
@@ -45,6 +45,7 @@ import { resourceLog, scraperLog } from './debug.js';
45
45
  import { DEFAULT_DOM_EVALUATION_TIMEOUT, getAnchorList, getImageList, getMeta, } from './dom-evaluation.js';
46
46
  import { isError } from './is-error.js';
47
47
  import { keywordCheck } from './keyword-check.js';
48
+ import { emptyMeta } from './meta/classify.js';
48
49
  import { findDisconnectionFailures } from './network-disconnection.js';
49
50
  import { parseUrl } from './parse-url.js';
50
51
  const pid = `${process.pid}`;
@@ -266,9 +267,7 @@ let Scraper = (() => {
266
267
  contentType,
267
268
  contentLength,
268
269
  responseHeaders,
269
- meta: {
270
- title: '',
271
- },
270
+ meta: emptyMeta(),
272
271
  imageList: [],
273
272
  anchorList: [],
274
273
  html: '',
@@ -300,6 +299,8 @@ let Scraper = (() => {
300
299
  };
301
300
  });
302
301
  if (isExternal) {
302
+ const externalMeta = emptyMeta();
303
+ externalMeta.title = title;
303
304
  return {
304
305
  url,
305
306
  isTarget: false,
@@ -310,9 +311,7 @@ let Scraper = (() => {
310
311
  contentType,
311
312
  contentLength,
312
313
  responseHeaders,
313
- meta: {
314
- title,
315
- },
314
+ meta: externalMeta,
316
315
  imageList: [],
317
316
  anchorList: [],
318
317
  html,
@@ -342,7 +341,7 @@ let Scraper = (() => {
342
341
  name: 'getAnchors',
343
342
  url,
344
343
  isExternal,
345
- message: '',
344
+ message: `%countdown(${domEvaluationTimeout},getAnchors_${url.withoutHash},s)%s`,
346
345
  });
347
346
  const anchorList = await getAnchorList(page, parseOpts, domEvaluationTimeout);
348
347
  void this.emit('changePhase', {
@@ -350,9 +349,14 @@ let Scraper = (() => {
350
349
  name: 'getMeta',
351
350
  url,
352
351
  isExternal,
353
- message: '',
352
+ message: `%countdown(${domEvaluationTimeout},getMeta_${url.withoutHash},s)%s`,
354
353
  });
355
- const meta = await getMeta(page, domEvaluationTimeout);
354
+ const meta = await getMeta(page, {
355
+ url: url.withoutHashAndAuth,
356
+ html,
357
+ statusCode: status,
358
+ headers: responseHeaders ?? undefined,
359
+ }, domEvaluationTimeout);
356
360
  const imageList = captureImages
357
361
  ? await (async () => {
358
362
  void this.emit('changePhase', {
@@ -360,7 +364,7 @@ let Scraper = (() => {
360
364
  name: 'extractImages',
361
365
  url,
362
366
  isExternal,
363
- message: '',
367
+ message: `%countdown(${domEvaluationTimeout},extractImages_${url.withoutHash},s)%s`,
364
368
  });
365
369
  return this.#fetchImages(page, url.withoutHashAndAuth, isExternal, imageLoadTimeout, domEvaluationTimeout);
366
370
  })()
@@ -510,9 +514,7 @@ let Scraper = (() => {
510
514
  contentType: null,
511
515
  contentLength: null,
512
516
  responseHeaders: {},
513
- meta: {
514
- title: '',
515
- },
517
+ meta: emptyMeta(),
516
518
  imageList: [],
517
519
  anchorList: [],
518
520
  html: '',
package/dist/types.d.ts CHANGED
@@ -7,6 +7,8 @@
7
7
  export type { ExURL, ParseURLOptions } from '@d-zero/shared/parse-url';
8
8
  export type { CompressType } from '@d-zero/shared/detect-compress';
9
9
  export type { CDNType } from '@d-zero/shared/detect-cdn';
10
+ export type { Meta, OpenGraphMeta, OgArticleMeta, OgBookMeta, OgProfileMeta, OgMusicMeta, OgVideoNsMeta, TwitterMeta, FbMeta, FediverseMeta, AppleMeta, MsApplicationMeta, VerificationMeta, GoogleMeta, GeoMeta, CitationMeta, RdfaMeta, MicrodataMeta, AmpMeta, LegacyMeta, MobileMeta, MicroformatsMeta, PinterestMeta, SlackMeta, LinkedInMeta, ExperimentalMeta, WikiMeta, LinkMeta, LinkEntry, JsonLdEntry, OthersBucket, ScriptEntry, IframeEntry, TagsMeta, TagDetail, TagEntry, TagSource, ViewportMeta, RobotsMeta, ReferrerMeta, FormatDetectionMeta, HttpEquivMeta, HttpEquivRefresh, RawHeadEntry, } from './meta/types.js';
11
+ import type { Meta } from './meta/types.js';
10
12
  import type { CDNType } from '@d-zero/shared/detect-cdn';
11
13
  import type { CompressType } from '@d-zero/shared/detect-compress';
12
14
  import type { ExURL } from '@d-zero/shared/parse-url';
@@ -134,43 +136,6 @@ export type AnchorData = {
134
136
  */
135
137
  isExternal?: boolean;
136
138
  };
137
- /**
138
- * Metadata extracted from a page's `<head>` element.
139
- */
140
- export type Meta = {
141
- /** The `lang` attribute of the `<html>` element. */
142
- lang?: string;
143
- /** The text content of the `<title>` element. */
144
- title: string;
145
- /** The `content` attribute of `<meta name="description">`. */
146
- description?: string;
147
- /** The `content` attribute of `<meta name="keywords">`. */
148
- keywords?: string;
149
- /** Whether `noindex` is present in the robots meta tag. */
150
- noindex?: boolean;
151
- /** Whether `nofollow` is present in the robots meta tag. */
152
- nofollow?: boolean;
153
- /** Whether `noarchive` is present in the robots meta tag. */
154
- noarchive?: boolean;
155
- /** The canonical URL from `<link rel="canonical">`. */
156
- canonical?: string;
157
- /** The alternate URL from `<link rel="alternate">`. */
158
- alternate?: string;
159
- /** The Open Graph type (`og:type`). */
160
- 'og:type'?: string;
161
- /** The Open Graph title (`og:title`). */
162
- 'og:title'?: string;
163
- /** The Open Graph site name (`og:site_name`). */
164
- 'og:site_name'?: string;
165
- /** The Open Graph description (`og:description`). */
166
- 'og:description'?: string;
167
- /** The Open Graph URL (`og:url`). */
168
- 'og:url'?: string;
169
- /** The Open Graph image URL (`og:image`). */
170
- 'og:image'?: string;
171
- /** The Twitter Card type (`twitter:card`). */
172
- 'twitter:card'?: string;
173
- };
174
139
  /**
175
140
  * A network request/response log entry captured during page scraping via Puppeteer.
176
141
  */
@@ -348,7 +313,7 @@ export type ScraperOptions = {
348
313
  /**
349
314
  * Timeout (ms) for DOM evaluation operations (meta/image/anchor extraction).
350
315
  * Bounds how long extraction may hang on a page with an unresponsive main thread.
351
- * Default: 30_000 (30s).
316
+ * Default: 180_000 (180s, aligned with the upstream retryable timeout).
352
317
  */
353
318
  domEvaluationTimeout?: number;
354
319
  };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@d-zero/beholder",
3
- "version": "2.1.6",
3
+ "version": "3.1.0",
4
4
  "description": "Page-level scraper for web crawling and auditing",
5
5
  "author": "D-ZERO",
6
6
  "license": "MIT",
@@ -20,18 +20,21 @@
20
20
  "clean": "tsc --build --clean"
21
21
  },
22
22
  "dependencies": {
23
- "@d-zero/puppeteer-page-scan": "4.5.1",
23
+ "@d-zero/puppeteer-page-scan": "4.5.2",
24
24
  "@d-zero/shared": "0.22.0",
25
25
  "debug": "4.4.3",
26
- "puppeteer": "24.37.5"
26
+ "puppeteer": "24.37.5",
27
+ "simple-wappalyzer": "1.1.99"
27
28
  },
28
29
  "devDependencies": {
29
- "@types/debug": "4.1.12"
30
+ "@types/debug": "4.1.12",
31
+ "@types/jsdom": "28.0.3",
32
+ "jsdom": "29.1.1"
30
33
  },
31
34
  "repository": {
32
35
  "type": "git",
33
36
  "url": "https://github.com/d-zero-dev/tools.git",
34
37
  "directory": "packages/@d-zero/beholder"
35
38
  },
36
- "gitHead": "25b4043dcd70cf3490ddcefd76a88b22c60f7712"
39
+ "gitHead": "e69344a9d4d45b0ec0ee942f920b84bbd0fb77ae"
37
40
  }
@@ -1,5 +1,8 @@
1
1
  import type { ElementHandle, Page } from 'puppeteer';
2
2
 
3
+ import { readFileSync } from 'node:fs';
4
+ import { createRequire } from 'node:module';
5
+
3
6
  import { afterEach, describe, expect, it, vi } from 'vitest';
4
7
 
5
8
  import {
@@ -9,6 +12,7 @@ import {
9
12
  getMeta,
10
13
  getProp,
11
14
  } from './dom-evaluation.js';
15
+ import { emptyMeta } from './meta/classify.js';
12
16
 
13
17
  afterEach(() => {
14
18
  vi.useRealTimers();
@@ -38,68 +42,30 @@ function mockElementHandle(value: unknown): ElementHandle<Element> {
38
42
  }
39
43
 
40
44
  describe('getMeta', () => {
41
- it('maps raw evaluation result into a Meta object and parses robots directives', async () => {
42
- const page = mockPageEvaluate({
43
- title: 'Example',
44
- lang: 'ja',
45
- description: 'desc',
46
- keywords: 'a,b',
47
- robots: 'noindex, NOFOLLOW',
48
- canonical: 'https://example.com/',
49
- alternate: 'https://example.com/en',
50
- 'og:type': 'website',
51
- 'og:title': 'OG Title',
52
- 'og:site_name': 'Site',
53
- 'og:description': 'OG desc',
54
- 'og:url': 'https://example.com/',
55
- 'og:image': 'https://example.com/img.png',
56
- 'twitter:card': 'summary',
57
- });
58
-
59
- const meta = await getMeta(page);
60
-
61
- expect(meta).toStrictEqual({
62
- title: 'Example',
63
- lang: 'ja',
64
- description: 'desc',
65
- keywords: 'a,b',
66
- noindex: true,
67
- nofollow: true,
68
- noarchive: false,
69
- canonical: 'https://example.com/',
70
- alternate: 'https://example.com/en',
71
- 'og:type': 'website',
72
- 'og:title': 'OG Title',
73
- 'og:site_name': 'Site',
74
- 'og:description': 'OG desc',
75
- 'og:url': 'https://example.com/',
76
- 'og:image': 'https://example.com/img.png',
77
- 'twitter:card': 'summary',
78
- });
79
- });
80
-
81
- it('returns a minimal fallback when evaluation rejects', async () => {
45
+ it('returns emptyMeta() when page.evaluate rejects', async () => {
82
46
  const page = {
83
47
  evaluate: () => Promise.reject(new Error('execution context destroyed')),
48
+ content: () => Promise.resolve('<html></html>'),
84
49
  } as unknown as Page;
85
50
 
86
- const meta = await getMeta(page);
51
+ const meta = await getMeta(page, { url: 'https://example.com/' });
87
52
 
88
- expect(meta).toStrictEqual({ title: '' });
53
+ expect(meta).toEqual(emptyMeta());
89
54
  });
90
55
 
91
- it('returns a minimal fallback when the main thread is unresponsive (timeout)', async () => {
56
+ it('returns emptyMeta() when the main thread is unresponsive (timeout)', async () => {
92
57
  vi.useFakeTimers();
93
58
  const page = {
94
59
  // Never resolves — simulates a blocked main thread.
95
60
  evaluate: () => new Promise(() => {}),
61
+ content: () => new Promise(() => {}),
96
62
  } as unknown as Page;
97
63
 
98
- const promise = getMeta(page, 5000);
64
+ const promise = getMeta(page, { url: 'https://example.com/' }, 5000);
99
65
  await vi.advanceTimersByTimeAsync(5000);
100
66
  const meta = await promise;
101
67
 
102
- expect(meta).toStrictEqual({ title: '' });
68
+ expect(meta).toEqual(emptyMeta());
103
69
  expect(vi.getTimerCount()).toBe(0);
104
70
  });
105
71
  });
@@ -231,15 +197,76 @@ describe('getProp', () => {
231
197
  });
232
198
  });
233
199
 
200
+ /**
201
+ * Builds an anchor element handle whose `remoteObject().objectId` and per-property
202
+ * reads can be customized for the new Strategy F implementation.
203
+ * @param objectId The remote object id used to map this handle back to an AX node.
204
+ * @param props Property values returned by `getProperty(propName).jsonValue()`.
205
+ */
206
+ function mockAnchorHandle(
207
+ objectId: string,
208
+ props: Record<string, unknown>,
209
+ ): ElementHandle<Element> {
210
+ return {
211
+ remoteObject: () => ({ objectId }),
212
+ getProperty: (propName: string) =>
213
+ Promise.resolve({
214
+ jsonValue: () => Promise.resolve(props[propName] ?? ''),
215
+ }),
216
+ } as unknown as ElementHandle<Element>;
217
+ }
218
+
219
+ /**
220
+ * Builds a page mock for the new `getAnchorList` implementation, wiring up
221
+ * `_client()` to return a stub CDP session whose `send(method)` is dispatched
222
+ * by `axNodes`/`describeNodes` (matched by `objectId`).
223
+ * @param args - Mock configuration.
224
+ * @param args.anchors - Anchor element handles to be returned by `page.$$()`.
225
+ * @param args.axNodes - Raw AX nodes returned by `Accessibility.getFullAXTree`.
226
+ * @param args.describeNodes - Map from `objectId` → `backendNodeId` for `DOM.describeNode`.
227
+ * @param args.getFullAXTree - Optional override for `Accessibility.getFullAXTree` (e.g., simulate rejection).
228
+ * @param args.describeNode - Optional override for `DOM.describeNode` (e.g., simulate rejection).
229
+ */
230
+ function mockPageForAnchors(args: {
231
+ anchors: ElementHandle<Element>[];
232
+ axNodes?: Array<{
233
+ backendDOMNodeId?: number;
234
+ ignored?: boolean;
235
+ name?: { value?: unknown };
236
+ }>;
237
+ describeNodes?: Record<string, number | undefined>;
238
+ getFullAXTree?: () => Promise<unknown>;
239
+ describeNode?: (params: { objectId: string }) => Promise<unknown>;
240
+ }): Page {
241
+ const { anchors, axNodes = [], describeNodes = {}, getFullAXTree, describeNode } = args;
242
+ const client = {
243
+ send: (method: string, params?: { objectId?: string }) => {
244
+ if (method === 'Accessibility.getFullAXTree') {
245
+ return getFullAXTree ? getFullAXTree() : Promise.resolve({ nodes: axNodes });
246
+ }
247
+ if (method === 'DOM.describeNode') {
248
+ if (describeNode) return describeNode({ objectId: params?.objectId ?? '' });
249
+ const backendNodeId =
250
+ params?.objectId == null ? undefined : describeNodes[params.objectId];
251
+ return Promise.resolve({ node: { backendNodeId } });
252
+ }
253
+ return Promise.reject(new Error(`unexpected CDP method: ${method}`));
254
+ },
255
+ };
256
+ return {
257
+ $$: () => Promise.resolve(anchors),
258
+ _client: () => client,
259
+ } as unknown as Page;
260
+ }
261
+
234
262
  describe('getAnchorList', () => {
235
- it('resolves the href and prefers the accessible name from the accessibility tree', async () => {
236
- const $anchor = mockElementHandle('https://example.com/page');
237
- const page = {
238
- $$: () => Promise.resolve([$anchor]),
239
- accessibility: {
240
- snapshot: () => Promise.resolve({ name: 'Accessible Name' }),
241
- },
242
- } as unknown as Page;
263
+ it('resolves the href and uses the accessible name from the AX tree', async () => {
264
+ const $anchor = mockAnchorHandle('obj-1', { href: 'https://example.com/page' });
265
+ const page = mockPageForAnchors({
266
+ anchors: [$anchor],
267
+ axNodes: [{ backendDOMNodeId: 42, name: { value: 'Accessible Name' } }],
268
+ describeNodes: { 'obj-1': 42 },
269
+ });
243
270
 
244
271
  const anchors = await getAnchorList(page);
245
272
 
@@ -248,22 +275,84 @@ describe('getAnchorList', () => {
248
275
  expect(anchors[0]?.href.href).toBe('https://example.com/page');
249
276
  });
250
277
 
251
- it('falls back to trimmed textContent when the accessibility tree has no node', async () => {
278
+ it('uses an empty AX name as-is without falling back to textContent', async () => {
279
+ // Mirrors the old `axNode.name || ''` behavior: when the AX tree DOES contain
280
+ // the anchor (so it's not "missing from the tree") but its computed name is
281
+ // empty, we keep the empty string — no textContent fallback.
282
+ const textContent = vi.fn();
252
283
  const $anchor = {
253
- getProperty: vi
254
- .fn()
255
- // First getProp call reads `href`, second reads `textContent`.
256
- .mockResolvedValueOnce({
257
- jsonValue: () => Promise.resolve('https://example.com/page'),
258
- })
259
- .mockResolvedValueOnce({ jsonValue: () => Promise.resolve(' Link text ') }),
284
+ remoteObject: () => ({ objectId: 'obj-1' }),
285
+ getProperty: (propName: string) => {
286
+ if (propName === 'href') {
287
+ return Promise.resolve({
288
+ jsonValue: () => Promise.resolve('https://example.com/page'),
289
+ });
290
+ }
291
+ textContent();
292
+ return Promise.resolve({ jsonValue: () => Promise.resolve('text fallback') });
293
+ },
260
294
  } as unknown as ElementHandle<Element>;
261
- const page = {
262
- $$: () => Promise.resolve([$anchor]),
263
- accessibility: {
264
- snapshot: () => Promise.resolve(null),
295
+ const page = mockPageForAnchors({
296
+ anchors: [$anchor],
297
+ axNodes: [{ backendDOMNodeId: 42, name: { value: '' } }],
298
+ describeNodes: { 'obj-1': 42 },
299
+ });
300
+
301
+ const anchors = await getAnchorList(page);
302
+
303
+ expect(anchors).toHaveLength(1);
304
+ expect(anchors[0]?.textContent).toBe('');
305
+ expect(textContent).not.toHaveBeenCalled();
306
+ });
307
+
308
+ it('falls back to textContent for ignored AX nodes (aria-hidden / display:none anchors)', async () => {
309
+ // Mirrors puppeteer's high-level snapshot({root}) with interestingOnly:true,
310
+ // which returns null for ignored nodes — old code then used textContent.
311
+ const $anchor = mockAnchorHandle('obj-1', {
312
+ href: 'https://example.com/page',
313
+ textContent: 'Visible text',
314
+ });
315
+ const page = mockPageForAnchors({
316
+ anchors: [$anchor],
317
+ axNodes: [{ backendDOMNodeId: 42, ignored: true, name: { value: '' } }],
318
+ describeNodes: { 'obj-1': 42 },
319
+ });
320
+
321
+ const anchors = await getAnchorList(page);
322
+
323
+ expect(anchors).toHaveLength(1);
324
+ expect(anchors[0]?.textContent).toBe('Visible text');
325
+ });
326
+
327
+ it('drops a single anchor whose handle throws (detached) without rejecting the whole list', async () => {
328
+ const $detached = {
329
+ remoteObject: () => {
330
+ throw new Error('Handle is detached');
265
331
  },
266
- } as unknown as Page;
332
+ } as unknown as ElementHandle<Element>;
333
+ const $good = mockAnchorHandle('obj-1', { href: 'https://example.com/page' });
334
+ const page = mockPageForAnchors({
335
+ anchors: [$detached, $good],
336
+ axNodes: [{ backendDOMNodeId: 42, name: { value: 'Name' } }],
337
+ describeNodes: { 'obj-1': 42 },
338
+ });
339
+
340
+ const anchors = await getAnchorList(page);
341
+
342
+ expect(anchors).toHaveLength(1);
343
+ expect(anchors[0]?.href.href).toBe('https://example.com/page');
344
+ });
345
+
346
+ it('falls back to trimmed textContent when the anchor is not represented in the AX tree', async () => {
347
+ const $anchor = mockAnchorHandle('obj-1', {
348
+ href: 'https://example.com/page',
349
+ textContent: ' Link text ',
350
+ });
351
+ const page = mockPageForAnchors({
352
+ anchors: [$anchor],
353
+ axNodes: [], // anchor's backendNodeId not present
354
+ describeNodes: { 'obj-1': 99 },
355
+ });
267
356
 
268
357
  const anchors = await getAnchorList(page);
269
358
 
@@ -271,23 +360,162 @@ describe('getAnchorList', () => {
271
360
  expect(anchors[0]?.textContent).toBe('Link text');
272
361
  });
273
362
 
363
+ it('falls back to textContent when the AX tree response is malformed (no `nodes` field)', async () => {
364
+ // Defensive: an unexpected CDP shape must not throw or pollute the map.
365
+ const $anchor = mockAnchorHandle('obj-1', {
366
+ href: 'https://example.com/page',
367
+ textContent: 'Plain text',
368
+ });
369
+ const page = mockPageForAnchors({
370
+ anchors: [$anchor],
371
+ getFullAXTree: () => Promise.resolve({}),
372
+ describeNodes: { 'obj-1': 1 },
373
+ });
374
+
375
+ const anchors = await getAnchorList(page);
376
+
377
+ expect(anchors).toHaveLength(1);
378
+ expect(anchors[0]?.textContent).toBe('Plain text');
379
+ });
380
+
381
+ it('falls back to textContent when DOM.describeNode response is malformed (no `node` field)', async () => {
382
+ // Defensive: an unexpected CDP shape must not throw inside Promise.all.
383
+ const $anchor = mockAnchorHandle('obj-1', {
384
+ href: 'https://example.com/page',
385
+ textContent: 'Plain text',
386
+ });
387
+ const page = mockPageForAnchors({
388
+ anchors: [$anchor],
389
+ axNodes: [{ backendDOMNodeId: 1, name: { value: 'AX Name' } }],
390
+ describeNode: () => Promise.resolve({}),
391
+ });
392
+
393
+ const anchors = await getAnchorList(page);
394
+
395
+ expect(anchors).toHaveLength(1);
396
+ expect(anchors[0]?.textContent).toBe('Plain text');
397
+ });
398
+
399
+ it('falls back to textContent for every anchor when the AX tree fetch rejects', async () => {
400
+ const $anchor = mockAnchorHandle('obj-1', {
401
+ href: 'https://example.com/page',
402
+ textContent: 'Plain text',
403
+ });
404
+ const page = mockPageForAnchors({
405
+ anchors: [$anchor],
406
+ getFullAXTree: () => Promise.reject(new Error('CDP unavailable')),
407
+ describeNodes: { 'obj-1': 1 },
408
+ });
409
+
410
+ const anchors = await getAnchorList(page);
411
+
412
+ expect(anchors).toHaveLength(1);
413
+ expect(anchors[0]?.textContent).toBe('Plain text');
414
+ });
415
+
416
+ it('falls back to textContent when DOM.describeNode rejects for an anchor', async () => {
417
+ const $anchor = mockAnchorHandle('obj-1', {
418
+ href: 'https://example.com/page',
419
+ textContent: 'Plain text',
420
+ });
421
+ const page = mockPageForAnchors({
422
+ anchors: [$anchor],
423
+ axNodes: [{ backendDOMNodeId: 1, name: { value: 'AX Name' } }],
424
+ describeNode: () => Promise.reject(new Error('detached')),
425
+ });
426
+
427
+ const anchors = await getAnchorList(page);
428
+
429
+ expect(anchors).toHaveLength(1);
430
+ expect(anchors[0]?.textContent).toBe('Plain text');
431
+ });
432
+
433
+ it('returns partial results when the overall operation exceeds the timeout', async () => {
434
+ vi.useFakeTimers();
435
+ const $fast = mockAnchorHandle('obj-fast', { href: 'https://example.com/fast' });
436
+ const $slow = {
437
+ remoteObject: () => ({ objectId: 'obj-slow' }),
438
+ getProperty: () => new Promise(() => {}), // never resolves
439
+ } as unknown as ElementHandle<Element>;
440
+ const page = mockPageForAnchors({
441
+ anchors: [$fast, $slow],
442
+ axNodes: [{ backendDOMNodeId: 1, name: { value: 'Fast' } }],
443
+ describeNodes: { 'obj-fast': 1, 'obj-slow': 2 },
444
+ });
445
+
446
+ const promise = getAnchorList(page, undefined, 5000);
447
+ await vi.advanceTimersByTimeAsync(5000);
448
+ const anchors = await promise;
449
+
450
+ // The fast anchor was collected before the overall race tripped; the slow
451
+ // one was abandoned.
452
+ expect(anchors).toHaveLength(1);
453
+ expect(anchors[0]?.href.href).toBe('https://example.com/fast');
454
+ });
455
+
274
456
  it('skips non-HTTP links', async () => {
275
- const $anchor = mockElementHandle('javascript:void(0)');
457
+ const $anchor = mockAnchorHandle('obj-1', { href: 'javascript:void(0)' });
458
+ const page = mockPageForAnchors({
459
+ anchors: [$anchor],
460
+ axNodes: [{ backendDOMNodeId: 1, name: { value: 'JS link' } }],
461
+ describeNodes: { 'obj-1': 1 },
462
+ });
463
+
464
+ const anchors = await getAnchorList(page);
465
+
466
+ expect(anchors).toStrictEqual([]);
467
+ });
468
+
469
+ it("falls back to textContent for every anchor when puppeteer's internal CDP session is unavailable", async () => {
470
+ const $anchor = mockAnchorHandle('obj-1', {
471
+ href: 'https://example.com/page',
472
+ textContent: ' Plain text ',
473
+ });
474
+ // Page mock without `_client()`: simulates puppeteer wrappers that hide the
475
+ // internal session — the function must still produce anchor data, just
476
+ // without AX names.
276
477
  const page = {
277
478
  $$: () => Promise.resolve([$anchor]),
278
- accessibility: {
279
- snapshot: () => Promise.resolve(null),
280
- },
281
479
  } as unknown as Page;
282
480
 
283
481
  const anchors = await getAnchorList(page);
284
482
 
483
+ expect(anchors).toHaveLength(1);
484
+ expect(anchors[0]?.textContent).toBe('Plain text');
485
+ });
486
+
487
+ it('returns an empty array when the page has no anchors', async () => {
488
+ const page = mockPageForAnchors({ anchors: [] });
489
+
490
+ const anchors = await getAnchorList(page);
491
+
285
492
  expect(anchors).toStrictEqual([]);
286
493
  });
287
494
  });
288
495
 
289
496
  describe('DEFAULT_DOM_EVALUATION_TIMEOUT', () => {
290
- it('defaults to 30 seconds', () => {
291
- expect(DEFAULT_DOM_EVALUATION_TIMEOUT).toBe(30_000);
497
+ it('defaults to 180 seconds', () => {
498
+ expect(DEFAULT_DOM_EVALUATION_TIMEOUT).toBe(180_000);
499
+ });
500
+ });
501
+
502
+ /**
503
+ * Tripwire: `getAnchorList` reads `(page as any)._client()` to reuse puppeteer's
504
+ * internal CDP session. Unit tests mock that method directly, so a silent
505
+ * removal/rename in a future puppeteer release would not be caught by the
506
+ * functional tests — the production path would just fall back to
507
+ * textContent-only mode without anyone noticing.
508
+ *
509
+ * This block inspects the actual installed puppeteer-core source to assert the
510
+ * `_client()` method still exists. If puppeteer drops or renames it, this test
511
+ * fails and forces a maintainer to update `getInternalCDPClient` instead of
512
+ * silently degrading.
513
+ */
514
+ describe('puppeteer internal API tripwire', () => {
515
+ it('puppeteer-core CDP Page still defines _client()', () => {
516
+ const require = createRequire(import.meta.url);
517
+ const cdpPagePath = require.resolve('puppeteer-core/lib/cjs/puppeteer/cdp/Page.js');
518
+ const src = readFileSync(cdpPagePath, 'utf8');
519
+ expect(src).toMatch(/_client\s*\(\s*\)\s*\{/);
292
520
  });
293
521
  });