@d-zero/beholder 2.1.6 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. package/CHANGELOG.md +44 -0
  2. package/README.md +26 -0
  3. package/dist/dom-evaluation.d.ts +72 -24
  4. package/dist/dom-evaluation.js +310 -84
  5. package/dist/extract-meta.d.ts +98 -0
  6. package/dist/extract-meta.js +75 -0
  7. package/dist/index.d.ts +3 -1
  8. package/dist/index.js +1 -0
  9. package/dist/meta/classify.d.ts +52 -0
  10. package/dist/meta/classify.js +731 -0
  11. package/dist/meta/collect-head.d.ts +63 -0
  12. package/dist/meta/collect-head.js +223 -0
  13. package/dist/meta/id-extractors.d.ts +40 -0
  14. package/dist/meta/id-extractors.js +196 -0
  15. package/dist/meta/keys.d.ts +41 -0
  16. package/dist/meta/keys.js +507 -0
  17. package/dist/meta/parsers.d.ts +74 -0
  18. package/dist/meta/parsers.js +293 -0
  19. package/dist/meta/tag-detection.d.ts +59 -0
  20. package/dist/meta/tag-detection.js +120 -0
  21. package/dist/meta/types.d.ts +874 -0
  22. package/dist/meta/types.js +12 -0
  23. package/dist/scraper.js +15 -13
  24. package/dist/types.d.ts +3 -38
  25. package/package.json +8 -5
  26. package/src/dom-evaluation.spec.ts +301 -73
  27. package/src/dom-evaluation.ts +417 -88
  28. package/src/extract-meta.spec.ts +247 -0
  29. package/src/extract-meta.ts +121 -0
  30. package/src/index.ts +45 -0
  31. package/src/meta/classify.spec.ts +281 -0
  32. package/src/meta/classify.ts +810 -0
  33. package/src/meta/collect-head.ts +247 -0
  34. package/src/meta/id-extractors.spec.ts +69 -0
  35. package/src/meta/id-extractors.ts +206 -0
  36. package/src/meta/keys.ts +568 -0
  37. package/src/meta/parsers.spec.ts +178 -0
  38. package/src/meta/parsers.ts +304 -0
  39. package/src/meta/simple-wappalyzer.d.ts +37 -0
  40. package/src/meta/tag-detection.spec.ts +134 -0
  41. package/src/meta/tag-detection.ts +161 -0
  42. package/src/meta/types.ts +949 -0
  43. package/src/scraper.ts +19 -13
  44. package/src/types.ts +49 -55
  45. package/tsconfig.tsbuildinfo +1 -1
package/CHANGELOG.md CHANGED
@@ -3,6 +3,50 @@
3
3
  All notable changes to this project will be documented in this file.
4
4
  See [Conventional Commits](https://conventionalcommits.org) for commit guidelines.
5
5
 
6
+ # [3.1.0](https://github.com/d-zero-dev/tools/compare/@d-zero/beholder@3.0.0...@d-zero/beholder@3.1.0) (2026-06-17)
7
+
8
+ ### Features
9
+
10
+ - **beholder:** expose extractMetaFromDocument for jsdom-backed meta extraction ([a56e21c](https://github.com/d-zero-dev/tools/commit/a56e21c17dcc1e542595a596074c5d8e659c1168))
11
+
12
+ # [3.0.0](https://github.com/d-zero-dev/tools/compare/@d-zero/beholder@2.1.6...@d-zero/beholder@3.0.0) (2026-06-16)
13
+
14
+ ### Bug Fixes
15
+
16
+ - **beholder:** warn loudly and tripwire-test puppeteer Page.\_client() coverage ([97a07ea](https://github.com/d-zero-dev/tools/commit/97a07ea273e90d50bfede1d68f594ddee9c33268))
17
+
18
+ - feat(beholder)!: expand meta extraction with frontmatter-keys schema and Wappalyzer tag detection ([6ee7861](https://github.com/d-zero-dev/tools/commit/6ee78617aac3fe3d5c022ccfd0df265de0c5310b))
19
+
20
+ ### Features
21
+
22
+ - **beholder:** rewrite getAnchorList with single AX tree + parallel describeNode ([#876](https://github.com/d-zero-dev/tools/issues/876)) ([7e5b089](https://github.com/d-zero-dev/tools/commit/7e5b089695bd1e605d63c6faef2e8bf927bd861f))
23
+
24
+ ### BREAKING CHANGES
25
+
26
+ - `Meta` is restructured from flat keys (`noindex`, `canonical`,
27
+ `'og:type'`, `'twitter:card'`, ...) into a nested shape backed by
28
+ `frontmatter-keys.md`. New required fields: `title`, `jsonLd`,
29
+ `speculationRules`, `originTrial`, `tags`, `others`. `getMeta(page)` now takes
30
+ a context object `getMeta(page, { url, html?, statusCode?, headers? }, timeout?)`.
31
+ Old top-level shortcuts (`canonical`, `alternate`, `noindex`, `nofollow`,
32
+ `noarchive`, `'og:*'`, `'twitter:card'`) are removed; values move to
33
+ `meta.link.canonical`, `meta.robots.*`, `meta.og.*`, `meta.twitter.*` etc.
34
+
35
+ Changes:
36
+
37
+ - New `src/meta/` module: `types.ts`, `keys.ts`, `parsers.ts`, `classify.ts`,
38
+ `id-extractors.ts`, `tag-detection.ts`, plus ambient `simple-wappalyzer.d.ts`
39
+ - Browser-side `collectHead()` serializes every `<meta>`, `<link>`, structured-data
40
+ `<script>`, `<base>`, `<iframe>` plus a curated set of `window` globals into
41
+ `RawHeadEntry[]`; Node-side `classify()` maps these to typed Meta fields
42
+ - `simple-wappalyzer` (MIT) added as a dependency for technology detection;
43
+ detected providers run through `id-extractors.ts` for real ID extraction
44
+ (GA4, GTM, UA, FB Pixel, Hotjar, Clarity, ...)
45
+ - Unknown markup is preserved under `Meta.others` (meta/property/httpEquiv/
46
+ itemprop/link/script/iframe buckets) so nothing is silently dropped
47
+ - Tests: parsers/classify/id-extractors/tag-detection units + getMeta
48
+ error/timeout fallback
49
+
6
50
  ## [2.1.6](https://github.com/d-zero-dev/tools/compare/@d-zero/beholder@2.1.5...@d-zero/beholder@2.1.6) (2026-06-15)
7
51
 
8
52
  ### Bug Fixes
package/README.md CHANGED
@@ -32,3 +32,29 @@ if (result.type === 'success') {
32
32
  ```
33
33
 
34
34
  設計判断(イベントではなく戻り値で返す理由、`page` のライフサイクル責務、リトライ機構など)は `src/scraper.ts` の JSDoc を参照。
35
+
36
+ ## DOM 文字列からメタ抽出(Puppeteer なし)
37
+
38
+ HTML 文字列を jsdom などでパースしてから `Meta` を取り出したい場合、`extractMetaFromDocument` を使う。`Scraper` が内部で呼ぶ `collectHead → detectTags → classify` パイプラインと同じ実装を再利用するため、戻り値の `Meta` 形状は `scrapeStart` と同一。DOM ライブラリ(jsdom 等)はユーザランドの責務。
39
+
40
+ ```ts
41
+ import { extractMetaFromDocument } from '@d-zero/beholder';
42
+ import { JSDOM } from 'jsdom';
43
+
44
+ const url = 'https://example.com/';
45
+ const html = await (await fetch(url)).text();
46
+ const dom = new JSDOM(html, { url });
47
+
48
+ // `as unknown as Window` は jsdom の `DOMWindow` 型が lib.dom の `Window` と
49
+ // 構造的に完全一致しないための型キャスト。ランタイムでは互換。
50
+ const meta = await extractMetaFromDocument(dom.window as unknown as Window, {
51
+ url,
52
+ html,
53
+ });
54
+
55
+ console.log(meta.title);
56
+ console.log(meta.og?.image);
57
+ console.log(meta.tags.entries);
58
+ ```
59
+
60
+ `context.html` を省略すると `window.document.documentElement.outerHTML` がフォールバックされる。ただし Wappalyzer の HTML パターンはスクリプト実行前の生 HTML に合わせて作られているので、可能なら取得直後の HTML 文字列を明示的に渡す方が検出が安定する。
package/dist/dom-evaluation.d.ts CHANGED
@@ -19,8 +19,12 @@ import type { ElementHandle, Page } from 'puppeteer';
19
19
  * Default timeout (ms) applied to DOM evaluation operations when the caller does not
20
20
  * specify one. Bounds how long a single `page.evaluate` / property read may hang on a
21
21
  * page whose main thread is unresponsive.
22
+ *
23
+ * WHY 180s: Aligned with the upstream `Scraper#fetchData` retryable timeout (3 min) so
24
+ * a single phase does not exceed the retry budget while still tolerating large pages
25
+ * (e.g., 1000+ anchors) and slow main threads.
22
26
  */
23
- export declare const DEFAULT_DOM_EVALUATION_TIMEOUT = 30000;
27
+ export declare const DEFAULT_DOM_EVALUATION_TIMEOUT = 180000;
24
28
  /**
25
29
  * Parameters for {@link getProp}.
26
30
  * @template T - The expected type of the property value.
@@ -65,35 +69,79 @@ export declare function getImageList(page: Page, viewportWidth: number, timeout?
65
69
  * the accessible name (from the accessibility tree, falling back to `textContent`),
66
70
  * and filters out non-HTTP links.
67
71
  *
68
- * WHY this keeps per-element CDP calls (unlike {@link getMeta} / {@link getImageList}):
69
- * the accessible name comes from Chrome's computed accessibility tree
70
- * (`page.accessibility.snapshot`), which is a CDP-only feature unavailable to in-page
71
- * DOM APIs. Each {@link getProp} read is still bounded by `timeout`.
72
+ * WHY Strategy F (single AX-tree fetch + parallel `DOM.describeNode`): the old
73
+ * implementation called `page.accessibility.snapshot({ root })` per anchor, which
74
+ * triggers a CDP round-trip *and* a Chrome-side AX subtree computation (~42ms
75
+ * each). On a page with 1181 anchors that compounded to ~53s. By fetching the
76
+ * full AX tree once and using `DOM.describeNode` in parallel to map element
77
+ * handles back to AX nodes by `backendDOMNodeId`, the same data is collected in
78
+ * ~150ms on the same page — a ~350× speed-up while preserving the original
79
+ * accessible-name semantics. See issue #876 for measurements.
80
+ *
81
+ * WHY the whole operation is wrapped in `raceWithTimeout`: even with bounded
82
+ * per-CDP-call timeouts, a degenerate page (blocked main thread, thousands of
83
+ * anchors, runaway describeNode latency) could chain enough sub-timeouts to
84
+ * exceed the caller's `timeout` budget. The outer race guarantees the function
85
+ * returns within `timeout`, surfacing whatever anchors were collected so far so
86
+ * the upstream scrape phase can continue rather than tripping a retryable retry.
72
87
  * @param page - The Puppeteer page to extract anchors from.
73
88
  * @param options - Optional URL parsing options (e.g., `disableQueries`).
74
- * @param timeout - Timeout in ms per property read. Defaults to {@link DEFAULT_DOM_EVALUATION_TIMEOUT}.
89
+ * @param timeout - Total time budget in ms for the whole extraction. Defaults to {@link DEFAULT_DOM_EVALUATION_TIMEOUT}.
75
90
  * @returns An array of {@link AnchorData} objects for all HTTP(S) links found on the page.
76
91
  */
77
92
  export declare function getAnchorList(page: Page, options?: ParseURLOptions, timeout?: number): Promise<AnchorData[]>;
78
93
  /**
79
- * Extracts comprehensive meta information from the page's `<head>`.
94
+ * Required context for {@link getMeta}. Provided by the scraper from data it
95
+ * already has on hand (URL it navigated to, response status/headers it received).
96
+ *
97
+ * `html` is optional: when omitted, `getMeta` falls back to `page.content()`
98
+ * to obtain the rendered HTML for the third-party tag detection pass.
99
+ */
100
+ export type GetMetaContext = {
101
+ /** The fully resolved URL of the page (after redirects). */
102
+ readonly url: string;
103
+ /** Rendered HTML. Falls back to `page.content()` when omitted. */
104
+ readonly html?: string;
105
+ /** Response status code, surfaced to the Wappalyzer driver. */
106
+ readonly statusCode?: number;
107
+ /** Response headers; case is preserved by the caller, lowercased internally. */
108
+ readonly headers?: Record<string, string | string[] | undefined>;
109
+ /**
110
+ * When `true`, the returned `Meta` includes `_raw: RawHeadEntry[]` for
111
+ * debugging. Default `false` to keep the serialized payload small.
112
+ */
113
+ readonly includeRaw?: boolean;
114
+ };
115
+ /**
116
+ * Extracts comprehensive metadata from the page.
80
117
  *
81
- * Collects all metadata in a single `page.evaluate` call (14 CDP round-trips
82
- * collapsed into 1) wrapped in {@link raceWithTimeout}. On timeout (an unresponsive
83
- * page) a minimal `{ title: '' }` is returned rather than hanging.
118
+ * Two passes are combined (the head collection overlaps only the HTML fetch; tag detection runs before `classify()`):
119
+ * 1. Browser-side `collectHead()` serializes every `<meta>`, `<link>`,
120
+ * relevant `<script>`, `<base>`, `<noscript>`/`<iframe>` and a curated
121
+ * set of `window` globals into a `RawHeadEntry[]`. Node-side `classify()`
122
+ * then maps those entries to typed `Meta` fields using the lookup tables
123
+ * in `./meta/keys.ts`, with unknown entries preserved in `Meta.others`.
124
+ * 2. `detectTags()` runs `simple-wappalyzer` over the page HTML to produce
125
+ * `Meta.tags` (technology detection + real-ID extraction).
84
126
  *
85
- * Collected metadata:
86
- * - `title` - The document title.
87
- * - `lang` - The `lang` attribute of the `<html>` element.
88
- * - `description` - The `<meta name="description">` content.
89
- * - `keywords` - The `<meta name="keywords">` content.
90
- * - `noindex` / `nofollow` / `noarchive` - Parsed from the `<meta name="robots">` directives.
91
- * - `canonical` - The `<link rel="canonical">` content.
92
- * - `alternate` - The `<link rel="alternate">` content.
93
- * - Open Graph tags: `og:type`, `og:title`, `og:site_name`, `og:description`, `og:url`, `og:image`.
94
- * - `twitter:card` - The Twitter Card type.
95
- * @param page - The Puppeteer page to extract meta information from.
96
- * @param timeout - Timeout in ms for the evaluation. Defaults to {@link DEFAULT_DOM_EVALUATION_TIMEOUT}.
97
- * @returns An object containing all extracted meta properties.
127
+ * The whole call is wrapped in `raceWithTimeout`. On timeout an empty `Meta`
128
+ * (with `title: ''` and empty required arrays/objects) is returned.
129
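+ * @param page - The Puppeteer page to extract metadata from.
130
+ * @param context - Page URL plus optional HTML, status code, headers and `includeRaw` flag; see {@link GetMetaContext}.
131
+ * @param timeout - Time budget in ms for the whole extraction. Defaults to {@link DEFAULT_DOM_EVALUATION_TIMEOUT}.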
132
+ * @example
133
+ * ```ts
134
+ * const meta = await getMeta(page, {
135
+ * url: 'https://example.com/',
136
+ * html: await page.content(),
137
+ * statusCode: response.status,
138
+ * headers: response.headers,
139
+ * });
140
+ * console.log(meta.title); // <title> text
141
+ * console.log(meta.og?.image); // og:image[] array
142
+ * console.log(meta.robots?.noindex); // parsed robots
143
+ * console.log(meta.tags.detected.Analytics); // Wappalyzer hits
144
+ * console.log(meta.tags.entries.find(e => e.provider === 'Google Analytics')?.id);
145
+ * ```
98
146
  */
99
- export declare function getMeta(page: Page, timeout?: number): Promise<Meta>;
147
+ export declare function getMeta(page: Page, context: GetMetaContext, timeout?: number): Promise<Meta>;
package/dist/dom-evaluation.js CHANGED
@@ -15,6 +15,9 @@
15
15
  */
16
16
  import { raceWithTimeout } from '@d-zero/shared/race-with-timeout';
17
17
  import { domDetailsLog, domLog } from './debug.js';
18
+ import { classify, emptyMeta } from './meta/classify.js';
19
+ import { WINDOW_GLOBALS_TO_CHECK, collectHeadFromDocument } from './meta/collect-head.js';
20
+ import { detectTags } from './meta/tag-detection.js';
18
21
  import { parseUrl } from './parse-url.js';
19
22
  const pid = `${process.pid}`;
20
23
  const log = domLog.extend(pid);
@@ -23,8 +26,12 @@ const dLog = domDetailsLog.extend(pid);
23
26
  * Default timeout (ms) applied to DOM evaluation operations when the caller does not
24
27
  * specify one. Bounds how long a single `page.evaluate` / property read may hang on a
25
28
  * page whose main thread is unresponsive.
29
+ *
30
+ * WHY 180s: Aligned with the upstream `Scraper#fetchData` retryable timeout (3 min) so
31
+ * a single phase does not exceed the retry budget while still tolerating large pages
32
+ * (e.g., 1000+ anchors) and slow main threads.
26
33
  */
27
- export const DEFAULT_DOM_EVALUATION_TIMEOUT = 30_000;
34
+ export const DEFAULT_DOM_EVALUATION_TIMEOUT = 180_000;
28
35
  /**
29
36
  * Retrieves a DOM property value from a Puppeteer element handle with a timeout.
30
37
  *
@@ -108,6 +115,114 @@ export async function getImageList(page, viewportWidth, timeout = DEFAULT_DOM_EV
108
115
  dLog('Images are: %O', imageList.map((i) => i.src));
109
116
  return imageList;
110
117
  }
118
+ /**
119
+ * One-shot warning latch: only the first time `_client()` is missing in a
120
+ * process do we log the degradation. Subsequent calls stay silent to avoid
121
+ * spamming logs while every page in a crawl re-enters the fallback path.
122
+ */
123
+ let warnedAboutMissingClient = false;
124
+ /**
125
+ * Returns puppeteer's internal CDP session for the page, or `null` if it is
126
+ * unreachable (e.g., test mocks, puppeteer wrappers that hide the internal API,
127
+ * or a future puppeteer release that renames `_client`).
128
+ *
129
+ * WHY a warning log: callers transparently fall back to textContent-only mode
130
+ * when this returns `null`, which masks a silent perf regression if a
131
+ * puppeteer update removes `_client`. The warning makes the degraded state
132
+ * observable in production logs so a maintainer can patch the access path.
133
+ *
134
+ * Callers fall back to a textContent-only path when this returns `null`.
135
+ * @param page - The Puppeteer page.
136
+ */
137
+ function getInternalCDPClient(page) {
138
+ try {
139
+ const client = page._client?.();
140
+ if (!client) {
141
+ if (!warnedAboutMissingClient) {
142
+ warnedAboutMissingClient = true;
143
+ log('WARN: puppeteer Page._client() returned no session — getAnchorList ' +
144
+ 'falls back to textContent-only mode. Verify the installed puppeteer ' +
145
+ 'version still exposes the internal _client() accessor.');
146
+ }
147
+ return null;
148
+ }
149
+ return client;
150
+ }
151
+ catch (error) {
152
+ if (!warnedAboutMissingClient) {
153
+ warnedAboutMissingClient = true;
154
+ log('WARN: puppeteer Page._client() threw — getAnchorList falls back to ' +
155
+ 'textContent-only mode. Error: %O', error);
156
+ }
157
+ return null;
158
+ }
159
+ }
160
+ /**
161
+ * Fetches the full accessibility tree once and builds a `backendDOMNodeId → accessibleName`
162
+ * map covering every AX node that exposes a backend DOM id.
163
+ *
164
+ * WHY include every non-ignored node (not just `role === 'link'`): the original
165
+ * `page.accessibility.snapshot({ root })` returned whatever AX node represented
166
+ * the anchor — including anchors whose computed role was overridden via ARIA
167
+ * (e.g., `<a role="button">`). Mapping every non-ignored node preserves that.
168
+ *
169
+ * WHY skip `ignored === true`: puppeteer's high-level snapshot uses
170
+ * `interestingOnly: true` by default and returns `null` for ignored nodes
171
+ * (aria-hidden, display:none, visibility:hidden). The old code then fell back
172
+ * to `textContent.trim()`. Including ignored nodes here would short-circuit
173
+ * that fallback with the AX tree's empty name and silently drop link text.
174
+ *
175
+ * On timeout or CDP failure, an empty map is returned so callers transparently
176
+ * fall back to `textContent.trim()` for every anchor.
177
+ * @param client - The CDP session attached to the page.
178
+ * @param timeout - Maximum time to wait for the AX tree fetch.
179
+ */
180
+ async function buildAccessibleNameMap(client, timeout) {
181
+ const { result, timeout: timedOut } = await raceWithTimeout(() => client
182
+ .send('Accessibility.getFullAXTree')
183
+ .then((res) => res)
184
+ .catch((error) => {
185
+ log('Accessibility.getFullAXTree failed: %O', error);
186
+ return null;
187
+ }), timeout);
188
+ const map = new Map();
189
+ if (timedOut) {
190
+ log('Accessibility.getFullAXTree timed out after %dms', timeout);
191
+ return map;
192
+ }
193
+ if (!result?.nodes) {
194
+ return map;
195
+ }
196
+ for (const node of result.nodes) {
197
+ if (node.backendDOMNodeId == null || node.ignored === true) {
198
+ continue;
199
+ }
200
+ const name = typeof node.name?.value === 'string' ? node.name.value : '';
201
+ map.set(node.backendDOMNodeId, name);
202
+ }
203
+ return map;
204
+ }
205
+ /**
206
+ * Resolves a CDP backend node id for a given element handle.
207
+ *
208
+ * Wrapped in {@link raceWithTimeout} so a single hung `DOM.describeNode` cannot
209
+ * stall the outer `Promise.all` over every anchor on the page.
210
+ * @param client - The CDP session attached to the page (must be the same session
211
+ * that owns the handle's `objectId`).
212
+ * @param objectId - The remote object id of the element handle.
213
+ * @param timeout - Maximum time to wait for the describeNode call.
214
+ * @returns The backend node id, or `null` if unavailable / timed out / failed.
215
+ */
216
+ async function resolveBackendNodeId(client, objectId, timeout) {
217
+ const { result, timeout: timedOut } = await raceWithTimeout(() => client
218
+ .send('DOM.describeNode', { objectId })
219
+ .then((res) => res)
220
+ .catch(() => null), timeout);
221
+ if (timedOut || !result) {
222
+ return null;
223
+ }
224
+ return result.node?.backendNodeId ?? null;
225
+ }
111
226
  /**
112
227
  * Extracts all anchor (`<a>` and `<area>`) elements with `href` attributes from the page.
113
228
  *
@@ -115,104 +230,215 @@ export async function getImageList(page, viewportWidth, timeout = DEFAULT_DOM_EV
115
230
  * the accessible name (from the accessibility tree, falling back to `textContent`),
116
231
  * and filters out non-HTTP links.
117
232
  *
118
- * WHY this keeps per-element CDP calls (unlike {@link getMeta} / {@link getImageList}):
119
- * the accessible name comes from Chrome's computed accessibility tree
120
- * (`page.accessibility.snapshot`), which is a CDP-only feature unavailable to in-page
121
- * DOM APIs. Each {@link getProp} read is still bounded by `timeout`.
233
+ * WHY Strategy F (single AX-tree fetch + parallel `DOM.describeNode`): the old
234
+ * implementation called `page.accessibility.snapshot({ root })` per anchor, which
235
+ * triggers a CDP round-trip *and* a Chrome-side AX subtree computation (~42ms
236
+ * each). On a page with 1181 anchors that compounded to ~53s. By fetching the
237
+ * full AX tree once and using `DOM.describeNode` in parallel to map element
238
+ * handles back to AX nodes by `backendDOMNodeId`, the same data is collected in
239
+ * ~150ms on the same page — a ~350× speed-up while preserving the original
240
+ * accessible-name semantics. See issue #876 for measurements.
241
+ *
242
+ * WHY the whole operation is wrapped in `raceWithTimeout`: even with bounded
243
+ * per-CDP-call timeouts, a degenerate page (blocked main thread, thousands of
244
+ * anchors, runaway describeNode latency) could chain enough sub-timeouts to
245
+ * exceed the caller's `timeout` budget. The outer race guarantees the function
246
+ * returns within `timeout`, surfacing whatever anchors were collected so far so
247
+ * the upstream scrape phase can continue rather than tripping a retryable retry.
122
248
  * @param page - The Puppeteer page to extract anchors from.
123
249
  * @param options - Optional URL parsing options (e.g., `disableQueries`).
124
- * @param timeout - Timeout in ms per property read. Defaults to {@link DEFAULT_DOM_EVALUATION_TIMEOUT}.
250
+ * @param timeout - Total time budget in ms for the whole extraction. Defaults to {@link DEFAULT_DOM_EVALUATION_TIMEOUT}.
125
251
  * @returns An array of {@link AnchorData} objects for all HTTP(S) links found on the page.
126
252
  */
127
253
  export async function getAnchorList(page, options, timeout = DEFAULT_DOM_EVALUATION_TIMEOUT) {
128
254
  log('Getting anchors');
129
255
  const $anchors = await page.$$('a[href], area[href]');
130
- const anchorList = [];
131
- for (const $anchor of $anchors) {
132
- const $href = await getProp({ $el: $anchor, propName: 'href', fallback: '' }, timeout);
133
- const hrefVal = $href.toString();
134
- const href = parseUrl(hrefVal, options);
256
+ if ($anchors.length === 0) {
257
+ log('Got 0 anchors');
258
+ return [];
259
+ }
260
+ const collected = [];
261
+ let axHits = 0;
262
+ let textFallbacks = 0;
263
+ // Set after the overall race trips so in-flight `resolveAnchor` calls can
264
+ // short-circuit instead of continuing to consume CDP capacity and pushing
265
+ // late entries into the already-returned `collected` array.
266
+ let cancelled = false;
267
+ const work = async () => {
268
+ const client = getInternalCDPClient(page);
269
+ if (cancelled)
270
+ return;
271
+ const nameByBackendId = client
272
+ ? await buildAccessibleNameMap(client, timeout)
273
+ : new Map();
274
+ if (cancelled)
275
+ return;
276
+ await Promise.all($anchors.map(async ($anchor) => {
277
+ if (cancelled)
278
+ return;
279
+ const resolved = await resolveAnchor($anchor, client, nameByBackendId, options, timeout);
280
+ if (cancelled || !resolved) {
281
+ return;
282
+ }
283
+ if (resolved.source === 'ax') {
284
+ axHits++;
285
+ }
286
+ else {
287
+ textFallbacks++;
288
+ }
289
+ collected.push(resolved.anchor);
290
+ }));
291
+ };
292
+ const { timeout: timedOut } = await raceWithTimeout(work, timeout);
293
+ cancelled = true;
294
+ if (timedOut) {
295
+ log('getAnchorList timed out after %dms; returning %d anchors collected so far', timeout, collected.length);
296
+ }
297
+ // Snapshot so post-return mutations from any in-flight Promise.all callback
298
+ // (already gated by `cancelled`, but not synchronously cancellable) cannot
299
+ // alter the array the caller now holds.
300
+ const result = [...collected];
301
+ log('Got %d anchors (%d via AX, %d via textContent)', result.length, axHits, textFallbacks);
302
+ dLog('Anchors are: %O', result.map((a) => a.href.href));
303
+ return result;
304
+ }
305
+ /**
306
+ * Resolves a single anchor handle into an {@link AnchorData} entry, or `null`
307
+ * if the anchor's href is not an HTTP(S) URL.
308
+ *
309
+ * Fires `getProp(href)` and `DOM.describeNode` in parallel, then looks up the
310
+ * accessible name from the pre-built AX map. If the anchor is not represented
311
+ * in the AX map (or CDP is unavailable), falls back to a lazy `textContent`
312
+ * fetch — only paying the extra CDP round-trip when actually needed.
313
+ * @param $anchor - The Puppeteer element handle for an anchor element.
314
+ * @param client - The shared CDP session, or `null` if unavailable.
315
+ * @param nameByBackendId - Map from `backendDOMNodeId` to accessible name.
316
+ * @param options - URL parsing options.
317
+ * @param timeout - Per-CDP-call timeout in ms.
318
+ * @returns The resolved anchor along with the name source, or `null` when the
319
+ * anchor's href is not crawlable.
320
+ */
321
+ async function resolveAnchor($anchor, client, nameByBackendId, options, timeout) {
322
+ try {
323
+ const objectId = $anchor.remoteObject().objectId;
324
+ const [hrefVal, backendNodeId] = await Promise.all([
325
+ getProp({ $el: $anchor, propName: 'href', fallback: '' }, timeout),
326
+ client && objectId != null
327
+ ? resolveBackendNodeId(client, objectId, timeout)
328
+ : Promise.resolve(null),
329
+ ]);
330
+ const href = parseUrl(hrefVal.toString(), options);
135
331
  if (!href || !href.isHTTP) {
136
- continue;
332
+ return null;
333
+ }
334
+ const axName = backendNodeId == null ? undefined : nameByBackendId.get(backendNodeId);
335
+ if (axName !== undefined) {
336
+ return { anchor: { href, textContent: axName }, source: 'ax' };
137
337
  }
138
- const axNode = await page.accessibility.snapshot({ root: $anchor });
139
338
  const textContent = await getProp({ $el: $anchor, propName: 'textContent', fallback: '' }, timeout);
140
- const accessibleName = axNode ? axNode.name || '' : textContent.trim();
141
- const link = {
142
- href,
143
- textContent: accessibleName,
144
- };
145
- anchorList.push(link);
146
- }
147
- log('Got %d anchors', anchorList.length);
148
- dLog('Anchors are: %O', anchorList.map((a) => a.href.href));
149
- return anchorList;
339
+ return { anchor: { href, textContent: textContent.trim() }, source: 'text' };
340
+ }
341
+ catch (error) {
342
+ // `remoteObject()` (and other synchronous handle accesses) can throw when
343
+ // the handle is disposed (page navigated mid-extraction). Drop just this
344
+ // anchor rather than poisoning the Promise.all over every other anchor.
345
+ dLog('resolveAnchor failed for an anchor: %O', error);
346
+ return null;
347
+ }
150
348
  }
151
349
  /**
152
- * Extracts comprehensive meta information from the page's `<head>`.
153
- *
154
- * Collects all metadata in a single `page.evaluate` call (14 CDP round-trips
155
- * collapsed into 1) wrapped in {@link raceWithTimeout}. On timeout (an unresponsive
156
- * page) a minimal `{ title: '' }` is returned rather than hanging.
157
- *
158
- * Collected metadata:
159
- * - `title` - The document title.
160
- * - `lang` - The `lang` attribute of the `<html>` element.
161
- * - `description` - The `<meta name="description">` content.
162
- * - `keywords` - The `<meta name="keywords">` content.
163
- * - `noindex` / `nofollow` / `noarchive` - Parsed from the `<meta name="robots">` directives.
164
- * - `canonical` - The `<link rel="canonical">` content.
165
- * - `alternate` - The `<link rel="alternate">` content.
166
- * - Open Graph tags: `og:type`, `og:title`, `og:site_name`, `og:description`, `og:url`, `og:image`.
167
- * - `twitter:card` - The Twitter Card type.
168
- * @param page - The Puppeteer page to extract meta information from.
169
- * @param timeout - Timeout in ms for the evaluation. Defaults to {@link DEFAULT_DOM_EVALUATION_TIMEOUT}.
170
- * @returns An object containing all extracted meta properties.
350
+ * Extracts comprehensive metadata from the page.
351
+ *
352
+ * Two passes are combined (the head collection overlaps only the HTML fetch; tag detection runs before `classify()`):
353
+ * 1. Browser-side `collectHead()` serializes every `<meta>`, `<link>`,
354
+ * relevant `<script>`, `<base>`, `<noscript>`/`<iframe>` and a curated
355
+ * set of `window` globals into a `RawHeadEntry[]`. Node-side `classify()`
356
+ * then maps those entries to typed `Meta` fields using the lookup tables
357
+ * in `./meta/keys.ts`, with unknown entries preserved in `Meta.others`.
358
+ * 2. `detectTags()` runs `simple-wappalyzer` over the page HTML to produce
359
+ * `Meta.tags` (technology detection + real-ID extraction).
360
+ *
361
+ * The whole call is wrapped in `raceWithTimeout`. On timeout an empty `Meta`
362
+ * (with `title: ''` and empty required arrays/objects) is returned.
363
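+ * @param page - The Puppeteer page to extract metadata from.
364
+ * @param context - Page URL plus optional HTML, status code, headers and `includeRaw` flag; see {@link GetMetaContext}.
365
+ * @param timeout - Time budget in ms for the whole extraction. Defaults to {@link DEFAULT_DOM_EVALUATION_TIMEOUT}.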
366
+ * @example
367
+ * ```ts
368
+ * const meta = await getMeta(page, {
369
+ * url: 'https://example.com/',
370
+ * html: await page.content(),
371
+ * statusCode: response.status,
372
+ * headers: response.headers,
373
+ * });
374
+ * console.log(meta.title); // <title> text
375
+ * console.log(meta.og?.image); // og:image[] array
376
+ * console.log(meta.robots?.noindex); // parsed robots
377
+ * console.log(meta.tags.detected.Analytics); // Wappalyzer hits
378
+ * console.log(meta.tags.entries.find(e => e.provider === 'Google Analytics')?.id);
379
+ * ```
171
380
  */
172
- export async function getMeta(page, timeout = DEFAULT_DOM_EVALUATION_TIMEOUT) {
381
+ export async function getMeta(page, context, timeout = DEFAULT_DOM_EVALUATION_TIMEOUT) {
173
382
  log('Getting Meta');
174
- const { result, timeout: timedOut } = await raceWithTimeout(() => page
175
- .evaluate(() => {
176
- /* global document, HTMLMetaElement, HTMLLinkElement */
177
- const content = (selector) => {
178
- const el = document.querySelector(selector);
179
- return el instanceof HTMLMetaElement ? el.content : '';
180
- };
181
- const linkHref = (selector) => {
182
- const el = document.querySelector(selector);
183
- return el instanceof HTMLLinkElement ? el.href : '';
184
- };
185
- return {
186
- title: document.title,
187
- lang: document.documentElement.lang,
188
- description: content('meta[name="description"]'),
189
- keywords: content('meta[name="keywords"]'),
190
- robots: content('meta[name="robots"]'),
191
- canonical: linkHref('link[rel="canonical"]'),
192
- alternate: linkHref('link[rel="alternate"]'),
193
- 'og:type': content('meta[property="og:type"]'),
194
- 'og:title': content('meta[property="og:title"]'),
195
- 'og:site_name': content('meta[property="og:site_name"]'),
196
- 'og:description': content('meta[property="og:description"]'),
197
- 'og:url': content('meta[property="og:url"]'),
198
- 'og:image': content('meta[property="og:image"]'),
199
- 'twitter:card': content('meta[name="twitter:card"]'),
200
- };
201
- })
202
- .catch(() => null), timeout);
383
+ const { result, timeout: timedOut } = await raceWithTimeout(() => runGetMeta(page, context), timeout);
203
384
  if (timedOut || result == null) {
204
385
  log('Meta extraction timed out or failed; returning fallback');
205
- return { title: '' };
206
- }
207
- const { robots: robotsVal, ...rest } = result;
208
- const robots = new Set(robotsVal.split(',').map((robot) => robot.trim().toLowerCase()));
209
- const meta = {
210
- ...rest,
211
- noindex: robots.has('noindex'),
212
- nofollow: robots.has('nofollow'),
213
- noarchive: robots.has('noarchive'),
214
- };
386
+ return emptyMeta();
387
+ }
215
388
  log('Got meta');
216
- dLog('Meta data are: %O', meta);
217
- return meta;
389
+ dLog('Meta data are: %O', result);
390
+ return result;
391
+ }
392
+ /**
393
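+ * Implements `getMeta` without the timeout guard: collects head entries and HTML, detects tags, then classifies; resolves to `null` if any step throws.
394
+ * @param page - The Puppeteer page to inspect.
395
+ * @param context - The `getMeta` context (`url`, optional `html`, `statusCode`, `headers`, `includeRaw`).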
396
+ */
397
+ async function runGetMeta(page, context) {
398
+ try {
399
+ const rawPromise = collectHeadOnPage(page);
400
+ const htmlPromise = context.html === undefined
401
+ ? page.content().catch(() => '')
402
+ : Promise.resolve(context.html);
403
+ const [raw, html] = await Promise.all([rawPromise, htmlPromise]);
404
+ const tags = await detectTags({
405
+ url: context.url,
406
+ html,
407
+ ...(context.statusCode === undefined ? {} : { statusCode: context.statusCode }),
408
+ ...(context.headers === undefined ? {} : { headers: context.headers }),
409
+ });
410
+ return classify(raw, {
411
+ tags,
412
+ ...(context.includeRaw ? { includeRaw: true } : {}),
413
+ });
414
+ }
415
+ catch (error) {
416
+ log('runGetMeta failed: %O', error);
417
+ return null;
418
+ }
419
+ }
420
+ /**
421
+ * Collects raw `<head>` entries from a Puppeteer page by injecting
422
+ * {@link collectHeadFromDocument} into the page realm.
423
+ *
424
+ * WHY string-eval instead of `page.evaluate(fn, args)`: the shared
425
+ * implementation is imported into this module's scope (`collectHeadFromDocument`), and a
426
+ * `page.evaluate(() => collectHeadFromDocument(window, …))` wrapper cannot
427
+ * reach that module-scope binding inside the page realm — only the wrapper's
428
+ * own source crosses the CDP boundary. Serializing the implementation via
429
+ * `Function.prototype.toString` and invoking it through
430
+ * `page.evaluate(string)` is what keeps the Puppeteer path and the
431
+ * jsdom path on one source of truth.
432
+ *
433
+ * The same {@link collectHeadFromDocument} function is also exposed via
434
+ * {@link ../extract-meta.ts | extractMetaFromDocument} for jsdom/Node callers,
435
+ * so the two paths cannot drift apart.
436
+ * @param page - The Puppeteer page whose document will be inspected.
437
+ */
438
+ async function collectHeadOnPage(page) {
439
+ const fnSource = collectHeadFromDocument.toString();
440
+ const globalsLiteral = JSON.stringify(WINDOW_GLOBALS_TO_CHECK);
441
+ const expr = `(${fnSource})(window, ${globalsLiteral})`;
442
+ const raw = await page.evaluate(expr).catch(() => []);
443
+ return raw;
218
444
  }