npm - @d-zero/beholder - Versions diffs - 2.1.6 → 3.1.0 - Mend

@d-zero/beholder 2.1.6 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45)

package/CHANGELOG.md +44 -0
package/README.md +26 -0
package/dist/dom-evaluation.d.ts +72 -24
package/dist/dom-evaluation.js +310 -84
package/dist/extract-meta.d.ts +98 -0
package/dist/extract-meta.js +75 -0
package/dist/index.d.ts +3 -1
package/dist/index.js +1 -0
package/dist/meta/classify.d.ts +52 -0
package/dist/meta/classify.js +731 -0
package/dist/meta/collect-head.d.ts +63 -0
package/dist/meta/collect-head.js +223 -0
package/dist/meta/id-extractors.d.ts +40 -0
package/dist/meta/id-extractors.js +196 -0
package/dist/meta/keys.d.ts +41 -0
package/dist/meta/keys.js +507 -0
package/dist/meta/parsers.d.ts +74 -0
package/dist/meta/parsers.js +293 -0
package/dist/meta/tag-detection.d.ts +59 -0
package/dist/meta/tag-detection.js +120 -0
package/dist/meta/types.d.ts +874 -0
package/dist/meta/types.js +12 -0
package/dist/scraper.js +15 -13
package/dist/types.d.ts +3 -38
package/package.json +8 -5
package/src/dom-evaluation.spec.ts +301 -73
package/src/dom-evaluation.ts +417 -88
package/src/extract-meta.spec.ts +247 -0
package/src/extract-meta.ts +121 -0
package/src/index.ts +45 -0
package/src/meta/classify.spec.ts +281 -0
package/src/meta/classify.ts +810 -0
package/src/meta/collect-head.ts +247 -0
package/src/meta/id-extractors.spec.ts +69 -0
package/src/meta/id-extractors.ts +206 -0
package/src/meta/keys.ts +568 -0
package/src/meta/parsers.spec.ts +178 -0
package/src/meta/parsers.ts +304 -0
package/src/meta/simple-wappalyzer.d.ts +37 -0
package/src/meta/tag-detection.spec.ts +134 -0
package/src/meta/tag-detection.ts +161 -0
package/src/meta/types.ts +949 -0
package/src/scraper.ts +19 -13
package/src/types.ts +49 -55
package/tsconfig.tsbuildinfo +1 -1

package/dist/meta/types.js ADDED

@@ -0,0 +1,12 @@
+/**
+ * Type definitions for the `Meta` data extracted from a page's `<head>` and full document.
+ *
+ * Structure follows the reference table in `frontmatter-keys.md`, with one dot-path
+ * field per category. Optional fields are absent when not detected on the page.
+ * Array fields are required and default to `[]` so consumers can iterate without
+ * null-checks.
+ * @see {@link ./classify.ts} for the function that builds `Meta` from raw head entries
+ * @see {@link ./parsers.ts} for the value normalizers used by `classify`
+ * @module
+ */
+export {};

package/dist/scraper.js CHANGED

@@ -45,6 +45,7 @@ import { resourceLog, scraperLog } from './debug.js';
 import { DEFAULT_DOM_EVALUATION_TIMEOUT, getAnchorList, getImageList, getMeta, } from './dom-evaluation.js';
 import { isError } from './is-error.js';
 import { keywordCheck } from './keyword-check.js';
+import { emptyMeta } from './meta/classify.js';
 import { findDisconnectionFailures } from './network-disconnection.js';
 import { parseUrl } from './parse-url.js';
 const pid = `${process.pid}`;
@@ -266,9 +267,7 @@ let Scraper = (() => {
                             contentType,
                             contentLength,
                             responseHeaders,
-                            meta: {
-                                title: '',
-                            },
+                            meta: emptyMeta(),
                             imageList: [],
                             anchorList: [],
                             html: '',
@@ -300,6 +299,8 @@ let Scraper = (() => {
                         };
                     });
                     if (isExternal) {
+                        const externalMeta = emptyMeta();
+                        externalMeta.title = title;
                         return {
                             url,
                             isTarget: false,
@@ -310,9 +311,7 @@ let Scraper = (() => {
                             contentType,
                             contentLength,
                             responseHeaders,
-                            meta: {
-                                title,
-                            },
+                            meta: externalMeta,
                             imageList: [],
                             anchorList: [],
                             html,
@@ -342,7 +341,7 @@ let Scraper = (() => {
                         name: 'getAnchors',
                         url,
                         isExternal,
-                        message: '',
+                        message: `%countdown(${domEvaluationTimeout},getAnchors_${url.withoutHash},s)%s`,
                     });
                     const anchorList = await getAnchorList(page, parseOpts, domEvaluationTimeout);
                     void this.emit('changePhase', {
@@ -350,9 +349,14 @@ let Scraper = (() => {
                         name: 'getMeta',
                         url,
                         isExternal,
-                        message: '',
+                        message: `%countdown(${domEvaluationTimeout},getMeta_${url.withoutHash},s)%s`,
                     });
-                    const meta = await getMeta(page, domEvaluationTimeout);
+                    const meta = await getMeta(page, {
+                        url: url.withoutHashAndAuth,
+                        html,
+                        statusCode: status,
+                        headers: responseHeaders ?? undefined,
+                    }, domEvaluationTimeout);
                     const imageList = captureImages
                         ? await (async () => {
                             void this.emit('changePhase', {
@@ -360,7 +364,7 @@ let Scraper = (() => {
                                 name: 'extractImages',
                                 url,
                                 isExternal,
-                                message: '',
+                                message: `%countdown(${domEvaluationTimeout},extractImages_${url.withoutHash},s)%s`,
                             });
                             return this.#fetchImages(page, url.withoutHashAndAuth, isExternal, imageLoadTimeout, domEvaluationTimeout);
                         })()
@@ -510,9 +514,7 @@ let Scraper = (() => {
                     contentType: null,
                     contentLength: null,
                     responseHeaders: {},
-                    meta: {
-                        title: '',
-                    },
+                    meta: emptyMeta(),
                     imageList: [],
                     anchorList: [],
                     html: '',

package/dist/types.d.ts CHANGED

@@ -7,6 +7,8 @@
 export type { ExURL, ParseURLOptions } from '@d-zero/shared/parse-url';
 export type { CompressType } from '@d-zero/shared/detect-compress';
 export type { CDNType } from '@d-zero/shared/detect-cdn';
+export type { Meta, OpenGraphMeta, OgArticleMeta, OgBookMeta, OgProfileMeta, OgMusicMeta, OgVideoNsMeta, TwitterMeta, FbMeta, FediverseMeta, AppleMeta, MsApplicationMeta, VerificationMeta, GoogleMeta, GeoMeta, CitationMeta, RdfaMeta, MicrodataMeta, AmpMeta, LegacyMeta, MobileMeta, MicroformatsMeta, PinterestMeta, SlackMeta, LinkedInMeta, ExperimentalMeta, WikiMeta, LinkMeta, LinkEntry, JsonLdEntry, OthersBucket, ScriptEntry, IframeEntry, TagsMeta, TagDetail, TagEntry, TagSource, ViewportMeta, RobotsMeta, ReferrerMeta, FormatDetectionMeta, HttpEquivMeta, HttpEquivRefresh, RawHeadEntry, } from './meta/types.js';
+import type { Meta } from './meta/types.js';
 import type { CDNType } from '@d-zero/shared/detect-cdn';
 import type { CompressType } from '@d-zero/shared/detect-compress';
 import type { ExURL } from '@d-zero/shared/parse-url';
@@ -134,43 +136,6 @@ export type AnchorData = {
      */
     isExternal?: boolean;
 };
-/**
- * Metadata extracted from a page's `<head>` element.
- */
-export type Meta = {
-    /** The `lang` attribute of the `<html>` element. */
-    lang?: string;
-    /** The text content of the `<title>` element. */
-    title: string;
-    /** The `content` attribute of `<meta name="description">`. */
-    description?: string;
-    /** The `content` attribute of `<meta name="keywords">`. */
-    keywords?: string;
-    /** Whether `noindex` is present in the robots meta tag. */
-    noindex?: boolean;
-    /** Whether `nofollow` is present in the robots meta tag. */
-    nofollow?: boolean;
-    /** Whether `noarchive` is present in the robots meta tag. */
-    noarchive?: boolean;
-    /** The canonical URL from `<link rel="canonical">`. */
-    canonical?: string;
-    /** The alternate URL from `<link rel="alternate">`. */
-    alternate?: string;
-    /** The Open Graph type (`og:type`). */
-    'og:type'?: string;
-    /** The Open Graph title (`og:title`). */
-    'og:title'?: string;
-    /** The Open Graph site name (`og:site_name`). */
-    'og:site_name'?: string;
-    /** The Open Graph description (`og:description`). */
-    'og:description'?: string;
-    /** The Open Graph URL (`og:url`). */
-    'og:url'?: string;
-    /** The Open Graph image URL (`og:image`). */
-    'og:image'?: string;
-    /** The Twitter Card type (`twitter:card`). */
-    'twitter:card'?: string;
-};
 /**
  * A network request/response log entry captured during page scraping via Puppeteer.
  */
@@ -348,7 +313,7 @@ export type ScraperOptions = {
     /**
      * Timeout (ms) for DOM evaluation operations (meta/image/anchor extraction).
      * Bounds how long extraction may hang on a page with an unresponsive main thread.
-     * Default: 30_000 (30s).
+     * Default: 180_000 (180s, aligned with the upstream retryable timeout).
      */
     domEvaluationTimeout?: number;
 };

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
 	"name": "@d-zero/beholder",
-	"version": "2.1.6",
+	"version": "3.1.0",
 	"description": "Page-level scraper for web crawling and auditing",
 	"author": "D-ZERO",
 	"license": "MIT",
@@ -20,18 +20,21 @@
 		"clean": "tsc --build --clean"
 	},
 	"dependencies": {
-		"@d-zero/puppeteer-page-scan": "4.5.1",
+		"@d-zero/puppeteer-page-scan": "4.5.2",
 		"@d-zero/shared": "0.22.0",
 		"debug": "4.4.3",
-		"puppeteer": "24.37.5"
+		"puppeteer": "24.37.5",
+		"simple-wappalyzer": "1.1.99"
 	},
 	"devDependencies": {
-		"@types/debug": "4.1.12"
+		"@types/debug": "4.1.12",
+		"@types/jsdom": "28.0.3",
+		"jsdom": "29.1.1"
 	},
 	"repository": {
 		"type": "git",
 		"url": "https://github.com/d-zero-dev/tools.git",
 		"directory": "packages/@d-zero/beholder"
 	},
-	"gitHead": "25b4043dcd70cf3490ddcefd76a88b22c60f7712"
+	"gitHead": "e69344a9d4d45b0ec0ee942f920b84bbd0fb77ae"
 }

package/src/dom-evaluation.spec.ts CHANGED

@@ -1,5 +1,8 @@
 import type { ElementHandle, Page } from 'puppeteer';
+import { readFileSync } from 'node:fs';
+import { createRequire } from 'node:module';
 import { afterEach, describe, expect, it, vi } from 'vitest';
 import {
@@ -9,6 +12,7 @@ import {
 	getMeta,
 	getProp,
 } from './dom-evaluation.js';
+import { emptyMeta } from './meta/classify.js';
 afterEach(() => {
 	vi.useRealTimers();
@@ -38,68 +42,30 @@ function mockElementHandle(value: unknown): ElementHandle<Element> {
 }
 describe('getMeta', () => {
-	it('maps raw evaluation result into a Meta object and parses robots directives', async () => {
-		const page = mockPageEvaluate({
-			title: 'Example',
-			lang: 'ja',
-			description: 'desc',
-			keywords: 'a,b',
-			robots: 'noindex, NOFOLLOW',
-			canonical: 'https://example.com/',
-			alternate: 'https://example.com/en',
-			'og:type': 'website',
-			'og:title': 'OG Title',
-			'og:site_name': 'Site',
-			'og:description': 'OG desc',
-			'og:url': 'https://example.com/',
-			'og:image': 'https://example.com/img.png',
-			'twitter:card': 'summary',
-		});
-		const meta = await getMeta(page);
-		expect(meta).toStrictEqual({
-			title: 'Example',
-			lang: 'ja',
-			description: 'desc',
-			keywords: 'a,b',
-			noindex: true,
-			nofollow: true,
-			noarchive: false,
-			canonical: 'https://example.com/',
-			alternate: 'https://example.com/en',
-			'og:type': 'website',
-			'og:title': 'OG Title',
-			'og:site_name': 'Site',
-			'og:description': 'OG desc',
-			'og:url': 'https://example.com/',
-			'og:image': 'https://example.com/img.png',
-			'twitter:card': 'summary',
-		});
-	});
-	it('returns a minimal fallback when evaluation rejects', async () => {
+	it('returns emptyMeta() when page.evaluate rejects', async () => {
 		const page = {
 			evaluate: () => Promise.reject(new Error('execution context destroyed')),
+			content: () => Promise.resolve('<html></html>'),
 		} as unknown as Page;
-		const meta = await getMeta(page);
+		const meta = await getMeta(page, { url: 'https://example.com/' });
-		expect(meta).toStrictEqual({ title: '' });
+		expect(meta).toEqual(emptyMeta());
 	});
-	it('returns a minimal fallback when the main thread is unresponsive (timeout)', async () => {
+	it('returns emptyMeta() when the main thread is unresponsive (timeout)', async () => {
 		vi.useFakeTimers();
 		const page = {
 			// Never resolves — simulates a blocked main thread.
 			evaluate: () => new Promise(() => {}),
+			content: () => new Promise(() => {}),
 		} as unknown as Page;
-		const promise = getMeta(page, 5000);
+		const promise = getMeta(page, { url: 'https://example.com/' }, 5000);
 		await vi.advanceTimersByTimeAsync(5000);
 		const meta = await promise;
-		expect(meta).toStrictEqual({ title: '' });
+		expect(meta).toEqual(emptyMeta());
 		expect(vi.getTimerCount()).toBe(0);
 	});
 });
@@ -231,15 +197,76 @@ describe('getProp', () => {
 	});
 });
+/**
+ * Builds an anchor element handle whose `remoteObject().objectId` and per-property
+ * reads can be customized for the new Strategy F implementation.
+ * @param objectId The remote object id used to map this handle back to an AX node.
+ * @param props Property values returned by `getProperty(propName).jsonValue()`.
+ */
+function mockAnchorHandle(
+	objectId: string,
+	props: Record<string, unknown>,
+): ElementHandle<Element> {
+	return {
+		remoteObject: () => ({ objectId }),
+		getProperty: (propName: string) =>
+			Promise.resolve({
+				jsonValue: () => Promise.resolve(props[propName] ?? ''),
+			}),
+	} as unknown as ElementHandle<Element>;
+}
+/**
+ * Builds a page mock for the new `getAnchorList` implementation, wiring up
+ * `_client()` to return a stub CDP session whose `send(method)` is dispatched
+ * by `axNodes`/`describeNodes` (matched by `objectId`).
+ * @param args - Mock configuration.
+ * @param args.anchors - Anchor element handles to be returned by `page.$$()`.
+ * @param args.axNodes - Raw AX nodes returned by `Accessibility.getFullAXTree`.
+ * @param args.describeNodes - Map from `objectId` → `backendNodeId` for `DOM.describeNode`.
+ * @param args.getFullAXTree - Optional override for `Accessibility.getFullAXTree` (e.g., simulate rejection).
+ * @param args.describeNode - Optional override for `DOM.describeNode` (e.g., simulate rejection).
+ */
+function mockPageForAnchors(args: {
+	anchors: ElementHandle<Element>[];
+	axNodes?: Array<{
+		backendDOMNodeId?: number;
+		ignored?: boolean;
+		name?: { value?: unknown };
+	}>;
+	describeNodes?: Record<string, number | undefined>;
+	getFullAXTree?: () => Promise<unknown>;
+	describeNode?: (params: { objectId: string }) => Promise<unknown>;
+}): Page {
+	const { anchors, axNodes = [], describeNodes = {}, getFullAXTree, describeNode } = args;
+	const client = {
+		send: (method: string, params?: { objectId?: string }) => {
+			if (method === 'Accessibility.getFullAXTree') {
+				return getFullAXTree ? getFullAXTree() : Promise.resolve({ nodes: axNodes });
+			}
+			if (method === 'DOM.describeNode') {
+				if (describeNode) return describeNode({ objectId: params?.objectId ?? '' });
+				const backendNodeId =
+					params?.objectId == null ? undefined : describeNodes[params.objectId];
+				return Promise.resolve({ node: { backendNodeId } });
+			}
+			return Promise.reject(new Error(`unexpected CDP method: ${method}`));
+		},
+	};
+	return {
+		$$: () => Promise.resolve(anchors),
+		_client: () => client,
+	} as unknown as Page;
+}
 describe('getAnchorList', () => {
-	it('resolves the href and prefers the accessible name from the accessibility tree', async () => {
-		const $anchor = mockElementHandle('https://example.com/page');
-		const page = {
-			$$: () => Promise.resolve([$anchor]),
-			accessibility: {
-				snapshot: () => Promise.resolve({ name: 'Accessible Name' }),
-			},
-		} as unknown as Page;
+	it('resolves the href and uses the accessible name from the AX tree', async () => {
+		const $anchor = mockAnchorHandle('obj-1', { href: 'https://example.com/page' });
+		const page = mockPageForAnchors({
+			anchors: [$anchor],
+			axNodes: [{ backendDOMNodeId: 42, name: { value: 'Accessible Name' } }],
+			describeNodes: { 'obj-1': 42 },
+		});
 		const anchors = await getAnchorList(page);
@@ -248,22 +275,84 @@ describe('getAnchorList', () => {
 		expect(anchors[0]?.href.href).toBe('https://example.com/page');
 	});
-	it('falls back to trimmed textContent when the accessibility tree has no node', async () => {
+	it('uses an empty AX name as-is without falling back to textContent', async () => {
+		// Mirrors the old `axNode.name || ''` behavior: when the AX tree DOES contain
+		// the anchor (so it's not "missing from the tree") but its computed name is
+		// empty, we keep the empty string — no textContent fallback.
+		const textContent = vi.fn();
 		const $anchor = {
-			getProperty: vi
-				.fn()
-				// First getProp call reads `href`, second reads `textContent`.
-				.mockResolvedValueOnce({
-					jsonValue: () => Promise.resolve('https://example.com/page'),
-				})
-				.mockResolvedValueOnce({ jsonValue: () => Promise.resolve('  Link text  ') }),
+			remoteObject: () => ({ objectId: 'obj-1' }),
+			getProperty: (propName: string) => {
+				if (propName === 'href') {
+					return Promise.resolve({
+						jsonValue: () => Promise.resolve('https://example.com/page'),
+					});
+				}
+				textContent();
+				return Promise.resolve({ jsonValue: () => Promise.resolve('text fallback') });
+			},
 		} as unknown as ElementHandle<Element>;
-		const page = {
-			$$: () => Promise.resolve([$anchor]),
-			accessibility: {
-				snapshot: () => Promise.resolve(null),
+		const page = mockPageForAnchors({
+			anchors: [$anchor],
+			axNodes: [{ backendDOMNodeId: 42, name: { value: '' } }],
+			describeNodes: { 'obj-1': 42 },
+		});
+		const anchors = await getAnchorList(page);
+		expect(anchors).toHaveLength(1);
+		expect(anchors[0]?.textContent).toBe('');
+		expect(textContent).not.toHaveBeenCalled();
+	});
+	it('falls back to textContent for ignored AX nodes (aria-hidden / display:none anchors)', async () => {
+		// Mirrors puppeteer's high-level snapshot({root}) with interestingOnly:true,
+		// which returns null for ignored nodes — old code then used textContent.
+		const $anchor = mockAnchorHandle('obj-1', {
+			href: 'https://example.com/page',
+			textContent: 'Visible text',
+		});
+		const page = mockPageForAnchors({
+			anchors: [$anchor],
+			axNodes: [{ backendDOMNodeId: 42, ignored: true, name: { value: '' } }],
+			describeNodes: { 'obj-1': 42 },
+		});
+		const anchors = await getAnchorList(page);
+		expect(anchors).toHaveLength(1);
+		expect(anchors[0]?.textContent).toBe('Visible text');
+	});
+	it('drops a single anchor whose handle throws (detached) without rejecting the whole list', async () => {
+		const $detached = {
+			remoteObject: () => {
+				throw new Error('Handle is detached');
 			},
-		} as unknown as Page;
+		} as unknown as ElementHandle<Element>;
+		const $good = mockAnchorHandle('obj-1', { href: 'https://example.com/page' });
+		const page = mockPageForAnchors({
+			anchors: [$detached, $good],
+			axNodes: [{ backendDOMNodeId: 42, name: { value: 'Name' } }],
+			describeNodes: { 'obj-1': 42 },
+		});
+		const anchors = await getAnchorList(page);
+		expect(anchors).toHaveLength(1);
+		expect(anchors[0]?.href.href).toBe('https://example.com/page');
+	});
+	it('falls back to trimmed textContent when the anchor is not represented in the AX tree', async () => {
+		const $anchor = mockAnchorHandle('obj-1', {
+			href: 'https://example.com/page',
+			textContent: '  Link text  ',
+		});
+		const page = mockPageForAnchors({
+			anchors: [$anchor],
+			axNodes: [], // anchor's backendNodeId not present
+			describeNodes: { 'obj-1': 99 },
+		});
 		const anchors = await getAnchorList(page);
@@ -271,23 +360,162 @@ describe('getAnchorList', () => {
 		expect(anchors[0]?.textContent).toBe('Link text');
 	});
+	it('falls back to textContent when the AX tree response is malformed (no `nodes` field)', async () => {
+		// Defensive: an unexpected CDP shape must not throw or pollute the map.
+		const $anchor = mockAnchorHandle('obj-1', {
+			href: 'https://example.com/page',
+			textContent: 'Plain text',
+		});
+		const page = mockPageForAnchors({
+			anchors: [$anchor],
+			getFullAXTree: () => Promise.resolve({}),
+			describeNodes: { 'obj-1': 1 },
+		});
+		const anchors = await getAnchorList(page);
+		expect(anchors).toHaveLength(1);
+		expect(anchors[0]?.textContent).toBe('Plain text');
+	});
+	it('falls back to textContent when DOM.describeNode response is malformed (no `node` field)', async () => {
+		// Defensive: an unexpected CDP shape must not throw inside Promise.all.
+		const $anchor = mockAnchorHandle('obj-1', {
+			href: 'https://example.com/page',
+			textContent: 'Plain text',
+		});
+		const page = mockPageForAnchors({
+			anchors: [$anchor],
+			axNodes: [{ backendDOMNodeId: 1, name: { value: 'AX Name' } }],
+			describeNode: () => Promise.resolve({}),
+		});
+		const anchors = await getAnchorList(page);
+		expect(anchors).toHaveLength(1);
+		expect(anchors[0]?.textContent).toBe('Plain text');
+	});
+	it('falls back to textContent for every anchor when the AX tree fetch rejects', async () => {
+		const $anchor = mockAnchorHandle('obj-1', {
+			href: 'https://example.com/page',
+			textContent: 'Plain text',
+		});
+		const page = mockPageForAnchors({
+			anchors: [$anchor],
+			getFullAXTree: () => Promise.reject(new Error('CDP unavailable')),
+			describeNodes: { 'obj-1': 1 },
+		});
+		const anchors = await getAnchorList(page);
+		expect(anchors).toHaveLength(1);
+		expect(anchors[0]?.textContent).toBe('Plain text');
+	});
+	it('falls back to textContent when DOM.describeNode rejects for an anchor', async () => {
+		const $anchor = mockAnchorHandle('obj-1', {
+			href: 'https://example.com/page',
+			textContent: 'Plain text',
+		});
+		const page = mockPageForAnchors({
+			anchors: [$anchor],
+			axNodes: [{ backendDOMNodeId: 1, name: { value: 'AX Name' } }],
+			describeNode: () => Promise.reject(new Error('detached')),
+		});
+		const anchors = await getAnchorList(page);
+		expect(anchors).toHaveLength(1);
+		expect(anchors[0]?.textContent).toBe('Plain text');
+	});
+	it('returns partial results when the overall operation exceeds the timeout', async () => {
+		vi.useFakeTimers();
+		const $fast = mockAnchorHandle('obj-fast', { href: 'https://example.com/fast' });
+		const $slow = {
+			remoteObject: () => ({ objectId: 'obj-slow' }),
+			getProperty: () => new Promise(() => {}), // never resolves
+		} as unknown as ElementHandle<Element>;
+		const page = mockPageForAnchors({
+			anchors: [$fast, $slow],
+			axNodes: [{ backendDOMNodeId: 1, name: { value: 'Fast' } }],
+			describeNodes: { 'obj-fast': 1, 'obj-slow': 2 },
+		});
+		const promise = getAnchorList(page, undefined, 5000);
+		await vi.advanceTimersByTimeAsync(5000);
+		const anchors = await promise;
+		// The fast anchor was collected before the overall race tripped; the slow
+		// one was abandoned.
+		expect(anchors).toHaveLength(1);
+		expect(anchors[0]?.href.href).toBe('https://example.com/fast');
+	});
 	it('skips non-HTTP links', async () => {
-		const $anchor = mockElementHandle('javascript:void(0)');
+		const $anchor = mockAnchorHandle('obj-1', { href: 'javascript:void(0)' });
+		const page = mockPageForAnchors({
+			anchors: [$anchor],
+			axNodes: [{ backendDOMNodeId: 1, name: { value: 'JS link' } }],
+			describeNodes: { 'obj-1': 1 },
+		});
+		const anchors = await getAnchorList(page);
+		expect(anchors).toStrictEqual([]);
+	});
+	it("falls back to textContent for every anchor when puppeteer's internal CDP session is unavailable", async () => {
+		const $anchor = mockAnchorHandle('obj-1', {
+			href: 'https://example.com/page',
+			textContent: '  Plain text  ',
+		});
+		// Page mock without `_client()`: simulates puppeteer wrappers that hide the
+		// internal session — the function must still produce anchor data, just
+		// without AX names.
 		const page = {
 			$$: () => Promise.resolve([$anchor]),
-			accessibility: {
-				snapshot: () => Promise.resolve(null),
-			},
 		} as unknown as Page;
 		const anchors = await getAnchorList(page);
+		expect(anchors).toHaveLength(1);
+		expect(anchors[0]?.textContent).toBe('Plain text');
+	});
+	it('returns an empty array when the page has no anchors', async () => {
+		const page = mockPageForAnchors({ anchors: [] });
+		const anchors = await getAnchorList(page);
 		expect(anchors).toStrictEqual([]);
 	});
 });
 describe('DEFAULT_DOM_EVALUATION_TIMEOUT', () => {
-	it('defaults to 30 seconds', () => {
-		expect(DEFAULT_DOM_EVALUATION_TIMEOUT).toBe(30_000);
+	it('defaults to 180 seconds', () => {
+		expect(DEFAULT_DOM_EVALUATION_TIMEOUT).toBe(180_000);
+	});
+});
+/**
+ * Tripwire: `getAnchorList` reads `(page as any)._client()` to reuse puppeteer's
+ * internal CDP session. Unit tests mock that method directly, so a silent
+ * removal/rename in a future puppeteer release would not be caught by the
+ * functional tests — the production path would just fall back to
+ * textContent-only mode without anyone noticing.
+ *
+ * This block inspects the actual installed puppeteer-core source to assert the
+ * `_client()` method still exists. If puppeteer drops or renames it, this test
+ * fails and forces a maintainer to update `getInternalCDPClient` instead of
+ * silently degrading.
+ */
+describe('puppeteer internal API tripwire', () => {
+	it('puppeteer-core CDP Page still defines _client()', () => {
+		const require = createRequire(import.meta.url);
+		const cdpPagePath = require.resolve('puppeteer-core/lib/cjs/puppeteer/cdp/Page.js');
+		const src = readFileSync(cdpPagePath, 'utf8');
+		expect(src).toMatch(/_client\s*\(\s*\)\s*\{/);
 	});
 });