@d-zero/beholder 2.1.5 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,19 +3,43 @@
3
3
  *
4
4
  * These functions are called by {@link ./scraper.ts | Scraper.#fetchData} to extract
5
5
  * anchors, images, and meta information after page navigation completes.
6
+ *
7
+ * WHY timeouts everywhere: A page whose main thread is blocked (heavy JS, autoplay
8
+ * video players, infinite loops) makes every CDP round-trip hang. `getMeta` and
9
+ * `getImageList` therefore collect all data in a single `page.evaluate` and wrap it
10
+ * in {@link raceWithTimeout} so a blocked thread is abandoned after a bounded budget
11
+ * instead of accumulating per-property timeouts up to the caller's global timeout.
12
+ * Note that `page.evaluate` itself runs on the page's main thread and has no built-in
13
+ * timeout, so the surrounding race is what actually bounds the hang.
6
14
  * @see {@link ./types.ts} for the data types returned by these functions
7
15
  */
8
16
 
9
- import type { AnchorData, ImageElement, ParseURLOptions } from './types.js';
10
- import type { ElementHandle, Page } from 'puppeteer';
17
+ import type { RawHeadEntry } from './meta/types.js';
18
+ import type { AnchorData, ImageElement, Meta, ParseURLOptions } from './types.js';
19
+ import type { CDPSession, ElementHandle, Page } from 'puppeteer';
20
+
21
+ import { raceWithTimeout } from '@d-zero/shared/race-with-timeout';
11
22
 
12
23
  import { domDetailsLog, domLog } from './debug.js';
24
+ import { classify, emptyMeta } from './meta/classify.js';
25
+ import { detectTags } from './meta/tag-detection.js';
13
26
  import { parseUrl } from './parse-url.js';
14
27
 
15
28
  const pid = `${process.pid}`;
16
29
  const log = domLog.extend(pid);
17
30
  const dLog = domDetailsLog.extend(pid);
18
31
 
32
+ /**
33
+ * Default timeout (ms) applied to DOM evaluation operations when the caller does not
34
+ * specify one. Bounds how long a single `page.evaluate` / property read may hang on a
35
+ * page whose main thread is unresponsive.
36
+ *
37
+ * WHY 180s: Aligned with the upstream `Scraper#fetchData` retryable timeout (3 min) so
38
+ * a single phase does not exceed the retry budget while still tolerating large pages
39
+ * (e.g., 1000+ anchors) and slow main threads.
40
+ */
41
+ export const DEFAULT_DOM_EVALUATION_TIMEOUT = 180_000;
42
+
19
43
  /**
20
44
  * Parameters for {@link getProp}.
21
45
  * @template T - The expected type of the property value.
@@ -32,18 +56,24 @@ export interface GetPropParams<T> {
32
56
  /**
33
57
  * Retrieves a DOM property value from a Puppeteer element handle with a timeout.
34
58
  *
35
- * Races the actual property retrieval against a 10-second timeout.
59
+ * Races the actual property retrieval against a timeout via {@link raceWithTimeout},
60
+ * which clears the loser-side timer so it cannot keep the event loop alive.
36
61
  * If the property cannot be read or the timeout expires, the fallback value is returned.
37
62
  * @template T - The expected type of the property value.
38
63
  * @param params - Parameters containing the element, property name, and fallback.
39
- * @returns The property value, or the fallback if retrieval fails.
64
+ * @param timeout - Timeout in ms before falling back. Defaults to {@link DEFAULT_DOM_EVALUATION_TIMEOUT}.
65
+ * @returns The property value, or the fallback if retrieval fails or times out.
40
66
  */
41
- export async function getProp<T>(params: GetPropParams<T>) {
67
+ export async function getProp<T>(
68
+ params: GetPropParams<T>,
69
+ timeout: number = DEFAULT_DOM_EVALUATION_TIMEOUT,
70
+ ): Promise<T> {
42
71
  const { $el, propName, fallback } = params;
43
- return Promise.race([
44
- _getProp($el, propName, fallback),
45
- new Promise<T>((res) => setTimeout(() => res(fallback), 10 * 1000)),
46
- ]);
72
+ const { result, timeout: timedOut } = await raceWithTimeout(
73
+ () => _getProp($el, propName, fallback),
74
+ timeout,
75
+ );
76
+ return timedOut ? fallback : result;
47
77
  }
48
78
 
49
79
  /**
@@ -54,7 +84,11 @@ export async function getProp<T>(params: GetPropParams<T>) {
54
84
  * @param fallback - The default value on failure.
55
85
  * @returns The property value cast to `T`, or the fallback.
56
86
  */
57
- async function _getProp<T>($el: ElementHandle<Element>, propName: string, fallback: T) {
87
+ async function _getProp<T>(
88
+ $el: ElementHandle<Element>,
89
+ propName: string,
90
+ fallback: T,
91
+ ): Promise<T> {
58
92
  try {
59
93
  const prop = await $el.getProperty(propName);
60
94
  if (!prop) {
@@ -67,109 +101,63 @@ async function _getProp<T>($el: ElementHandle<Element>, propName: string, fallba
67
101
  }
68
102
  }
69
103
 
70
- /**
71
- * Parameters for {@link getPropBySelector}.
72
- * @template T - The expected type of the property value.
73
- */
74
- export interface GetPropBySelectorParams<T> {
75
- /** The Puppeteer page to query. */
76
- readonly page: Page;
77
- /** A CSS selector to find the target element. */
78
- readonly selector: string;
79
- /** The DOM property name to read from the matched element. */
80
- readonly propName: string;
81
- /** The default value if no element matches or the property cannot be read. */
82
- readonly fallback: T;
83
- }
84
-
85
- /**
86
- * Retrieves a DOM property value from the first element matching a CSS selector.
87
- *
88
- * Combines `page.$()` with {@link getProp} for convenient single-element lookups.
89
- * @template T - The expected type of the property value.
90
- * @param params - Parameters containing the page, selector, property name, and fallback.
91
- * @returns The property value, or the fallback if the element is not found or retrieval fails.
92
- */
93
- export async function getPropBySelector<T>(params: GetPropBySelectorParams<T>) {
94
- const { page, selector, propName, fallback } = params;
95
- const $el = await page.$(selector);
96
- if (!$el) {
97
- return fallback;
98
- }
99
-
100
- return getProp({ $el, propName, fallback });
101
- }
102
-
103
104
  /**
104
105
  * Extracts all `<img>` elements from the page and returns their properties.
105
106
  *
106
- * For each image, collects the `src`, `currentSrc`, `alt`, bounding box dimensions,
107
- * natural dimensions, lazy-loading status, and the outer HTML source code.
107
+ * Collects every image's `src`, `currentSrc`, `alt`, layout dimensions,
108
+ * natural dimensions, lazy-loading status, and outer HTML in a single
109
+ * `page.evaluate` call, wrapped in {@link raceWithTimeout}. On timeout (an
110
+ * unresponsive page) or evaluation failure, an empty array is returned rather than hanging.
108
111
  * @param page - The Puppeteer page to extract images from.
109
112
  * @param viewportWidth - The current viewport width in pixels, recorded alongside each image entry.
113
+ * @param timeout - Timeout in ms for the evaluation. Defaults to {@link DEFAULT_DOM_EVALUATION_TIMEOUT}.
110
114
  * @returns An array of {@link ImageElement} objects describing each image on the page.
111
115
  */
112
116
  export async function getImageList(
113
117
  page: Page,
114
118
  viewportWidth: number,
119
+ timeout: number = DEFAULT_DOM_EVALUATION_TIMEOUT,
115
120
  ): Promise<ImageElement[]> {
116
121
  log('Getting images (Viewport: %dpx)', viewportWidth);
117
122
 
118
- const $images = await page.$$('img');
119
- const imageList: {
120
- src: string;
121
- currentSrc: string;
122
- alt: string;
123
- width: number;
124
- height: number;
125
- naturalWidth: number;
126
- naturalHeight: number;
127
- isLazy: boolean;
128
- viewportWidth: number;
129
- sourceCode: string;
130
- }[] = [];
131
- for (const $image of $images) {
132
- const boundingBox = await $image.boundingBox();
133
- const width = boundingBox?.width || 0;
134
- const height = boundingBox?.height || 0;
135
- const src = await getProp({ $el: $image, propName: 'src', fallback: '' });
136
- const currentSrc = await getProp({
137
- $el: $image,
138
- propName: 'currentSrc',
139
- fallback: '',
140
- });
141
- const alt = await getProp({ $el: $image, propName: 'alt', fallback: '' });
142
- const naturalWidth = await getProp({
143
- $el: $image,
144
- propName: 'naturalWidth',
145
- fallback: 0,
146
- });
147
- const naturalHeight = await getProp({
148
- $el: $image,
149
- propName: 'naturalHeight',
150
- fallback: 0,
151
- });
152
- const loading = await getProp({ $el: $image, propName: 'loading', fallback: '' });
153
- const sourceCode = await getProp({
154
- $el: $image,
155
- propName: 'outerHTML',
156
- fallback: '',
157
- });
158
- const isLazy = loading.toLowerCase().trim() === 'lazy';
159
- imageList.push({
160
- src,
161
- currentSrc,
162
- alt,
163
- width,
164
- height,
165
- naturalWidth,
166
- naturalHeight,
167
- isLazy,
123
+ const { result, timeout: timedOut } = await raceWithTimeout(
124
+ () =>
125
+ page
126
+ .evaluate(() => {
127
+ /* global document */
128
+ return [...document.images].map((img) => {
129
+ const rect = img.getBoundingClientRect();
130
+ return {
131
+ src: img.src,
132
+ currentSrc: img.currentSrc,
133
+ alt: img.alt,
134
+ width: rect.width,
135
+ height: rect.height,
136
+ naturalWidth: img.naturalWidth,
137
+ naturalHeight: img.naturalHeight,
138
+ loading: img.loading,
139
+ sourceCode: img.outerHTML,
140
+ };
141
+ });
142
+ })
143
+ .catch(() => null),
144
+ timeout,
145
+ );
146
+
147
+ if (timedOut || result == null) {
148
+ log(
149
+ 'Image extraction timed out or failed (Viewport: %dpx); returning []',
168
150
  viewportWidth,
169
- sourceCode,
170
- });
151
+ );
152
+ return [];
171
153
  }
172
154
 
155
+ const imageList: ImageElement[] = result.map(({ loading, ...img }) => ({
156
+ ...img,
157
+ isLazy: loading.toLowerCase().trim() === 'lazy',
158
+ viewportWidth,
159
+ }));
160
+
173
161
  log('Got %d images (Viewport: %dpx)', imageList.length, viewportWidth);
174
162
  dLog(
175
163
  'Images are: %O',
@@ -178,162 +166,602 @@ export async function getImageList(
178
166
  return imageList;
179
167
  }
180
168
 
169
+ /**
170
+ * Page-like shape exposing puppeteer's internal CDP session.
171
+ *
172
+ * WHY private `_client()` instead of `page.createCDPSession()`: the `objectId`
173
+ * returned by {@link ElementHandle.remoteObject} is scoped to the page's primary
174
+ * session. A fresh session created via `createCDPSession()` cannot resolve those
175
+ * `objectId` values when calling `DOM.describeNode`, so we must reuse the same
176
+ * session puppeteer uses internally.
177
+ */
178
+ interface PageWithInternalClient {
179
+ _client(): CDPSession;
180
+ }
181
+
182
+ /** Minimal shape of a CDP `Accessibility.AXValue` we read from. */
183
+ interface AXValueLike {
184
+ readonly value?: unknown;
185
+ }
186
+
187
+ /** Minimal shape of a CDP `Accessibility.AXNode` we read from. */
188
+ interface AXNodeLike {
189
+ readonly backendDOMNodeId?: number;
190
+ readonly ignored?: boolean;
191
+ readonly name?: AXValueLike;
192
+ }
193
+
194
+ interface GetFullAXTreeResponse {
195
+ readonly nodes: readonly AXNodeLike[];
196
+ }
197
+
198
+ interface DescribeNodeResponse {
199
+ readonly node: { readonly backendNodeId?: number };
200
+ }
201
+
202
+ /**
203
+ * One-shot warning latch: only the first time `_client()` is missing in a
204
+ * process do we log the degradation. Subsequent calls stay silent to avoid
205
+ * spamming logs while every page in a crawl re-enters the fallback path.
206
+ */
207
+ let warnedAboutMissingClient = false;
208
+
209
+ /**
210
+ * Returns puppeteer's internal CDP session for the page, or `null` if it is
211
+ * unreachable (e.g., test mocks, puppeteer wrappers that hide the internal API,
212
+ * or a future puppeteer release that renames `_client`).
213
+ *
214
+ * WHY a warning log: callers transparently fall back to textContent-only mode
215
+ * when this returns `null`, which masks a silent perf regression if a
216
+ * puppeteer update removes `_client`. The warning makes the degraded state
217
+ * observable in production logs so a maintainer can patch the access path.
218
+ *
219
+ * Callers fall back to a textContent-only path when this returns `null`.
220
+ * @param page - The Puppeteer page.
221
+ */
222
+ function getInternalCDPClient(page: Page): CDPSession | null {
223
+ try {
224
+ const client = (page as unknown as Partial<PageWithInternalClient>)._client?.();
225
+ if (!client) {
226
+ if (!warnedAboutMissingClient) {
227
+ warnedAboutMissingClient = true;
228
+ log(
229
+ 'WARN: puppeteer Page._client() returned no session — getAnchorList ' +
230
+ 'falls back to textContent-only mode. Verify the installed puppeteer ' +
231
+ 'version still exposes the internal _client() accessor.',
232
+ );
233
+ }
234
+ return null;
235
+ }
236
+ return client;
237
+ } catch (error) {
238
+ if (!warnedAboutMissingClient) {
239
+ warnedAboutMissingClient = true;
240
+ log(
241
+ 'WARN: puppeteer Page._client() threw — getAnchorList falls back to ' +
242
+ 'textContent-only mode. Error: %O',
243
+ error,
244
+ );
245
+ }
246
+ return null;
247
+ }
248
+ }
249
+
250
+ /**
251
+ * Fetches the full accessibility tree once and builds a `backendDOMNodeId → accessibleName`
252
+ * map covering every AX node that exposes a backend DOM id.
253
+ *
254
+ * WHY include every non-ignored node (not just `role === 'link'`): the original
255
+ * `page.accessibility.snapshot({ root })` returned whatever AX node represented
256
+ * the anchor — including anchors whose computed role was overridden via ARIA
257
+ * (e.g., `<a role="button">`). Mapping every non-ignored node preserves that.
258
+ *
259
+ * WHY skip `ignored === true`: puppeteer's high-level snapshot uses
260
+ * `interestingOnly: true` by default and returns `null` for ignored nodes
261
+ * (aria-hidden, display:none, visibility:hidden). The old code then fell back
262
+ * to `textContent.trim()`. Including ignored nodes here would short-circuit
263
+ * that fallback with the AX tree's empty name and silently drop link text.
264
+ *
265
+ * On timeout or CDP failure, an empty map is returned so callers transparently
266
+ * fall back to `textContent.trim()` for every anchor.
267
+ * @param client - The CDP session attached to the page.
268
+ * @param timeout - Maximum time to wait for the AX tree fetch.
269
+ */
270
+ async function buildAccessibleNameMap(
271
+ client: CDPSession,
272
+ timeout: number,
273
+ ): Promise<Map<number, string>> {
274
+ const { result, timeout: timedOut } = await raceWithTimeout(
275
+ () =>
276
+ client
277
+ .send('Accessibility.getFullAXTree')
278
+ .then((res) => res as unknown as GetFullAXTreeResponse)
279
+ .catch((error: unknown) => {
280
+ log('Accessibility.getFullAXTree failed: %O', error);
281
+ return null;
282
+ }),
283
+ timeout,
284
+ );
285
+ const map = new Map<number, string>();
286
+ if (timedOut) {
287
+ log('Accessibility.getFullAXTree timed out after %dms', timeout);
288
+ return map;
289
+ }
290
+ if (!result?.nodes) {
291
+ return map;
292
+ }
293
+ for (const node of result.nodes) {
294
+ if (node.backendDOMNodeId == null || node.ignored === true) {
295
+ continue;
296
+ }
297
+ const name = typeof node.name?.value === 'string' ? node.name.value : '';
298
+ map.set(node.backendDOMNodeId, name);
299
+ }
300
+ return map;
301
+ }
302
+
303
+ /**
304
+ * Resolves a CDP backend node id for a given element handle.
305
+ *
306
+ * Wrapped in {@link raceWithTimeout} so a single hung `DOM.describeNode` cannot
307
+ * stall the outer `Promise.all` over every anchor on the page.
308
+ * @param client - The CDP session attached to the page (must be the same session
309
+ * that owns the handle's `objectId`).
310
+ * @param objectId - The remote object id of the element handle.
311
+ * @param timeout - Maximum time to wait for the describeNode call.
312
+ * @returns The backend node id, or `null` if unavailable / timed out / failed.
313
+ */
314
+ async function resolveBackendNodeId(
315
+ client: CDPSession,
316
+ objectId: string,
317
+ timeout: number,
318
+ ): Promise<number | null> {
319
+ const { result, timeout: timedOut } = await raceWithTimeout(
320
+ () =>
321
+ client
322
+ .send('DOM.describeNode', { objectId })
323
+ .then((res) => res as unknown as DescribeNodeResponse)
324
+ .catch(() => null),
325
+ timeout,
326
+ );
327
+ if (timedOut || !result) {
328
+ return null;
329
+ }
330
+ return result.node?.backendNodeId ?? null;
331
+ }
332
+
181
333
  /**
182
334
  * Extracts all anchor (`<a>` and `<area>`) elements with `href` attributes from the page.
183
335
  *
184
336
  * For each anchor, resolves the `href` to an `ExURL` via `parseUrl`, retrieves
185
337
  * the accessible name (from the accessibility tree, falling back to `textContent`),
186
338
  * and filters out non-HTTP links.
339
+ *
340
+ * WHY Strategy F (single AX-tree fetch + parallel `DOM.describeNode`): the old
341
+ * implementation called `page.accessibility.snapshot({ root })` per anchor, which
342
+ * triggers a CDP round-trip *and* a Chrome-side AX subtree computation (~42ms
343
+ * each). On a page with 1181 anchors that compounded to ~53s. By fetching the
344
+ * full AX tree once and using `DOM.describeNode` in parallel to map element
345
+ * handles back to AX nodes by `backendDOMNodeId`, the same data is collected in
346
+ * ~150ms on the same page — a ~350× speed-up while preserving the original
347
+ * accessible-name semantics. See issue #876 for measurements.
348
+ *
349
+ * WHY the whole operation is wrapped in `raceWithTimeout`: even with bounded
350
+ * per-CDP-call timeouts, a degenerate page (blocked main thread, thousands of
351
+ * anchors, runaway describeNode latency) could chain enough sub-timeouts to
352
+ * exceed the caller's `timeout` budget. After the initial `page.$$` query, the outer race guarantees the function
353
+ * returns within `timeout`, surfacing whatever anchors were collected so far so
354
+ * the upstream scrape phase can continue rather than triggering a retry.
187
355
  * @param page - The Puppeteer page to extract anchors from.
188
356
  * @param options - Optional URL parsing options (e.g., `disableQueries`).
357
+ * @param timeout - Total time budget in ms for the whole extraction. Defaults to {@link DEFAULT_DOM_EVALUATION_TIMEOUT}.
189
358
  * @returns An array of {@link AnchorData} objects for all HTTP(S) links found on the page.
190
359
  */
191
- export async function getAnchorList(page: Page, options?: ParseURLOptions) {
360
+ export async function getAnchorList(
361
+ page: Page,
362
+ options?: ParseURLOptions,
363
+ timeout: number = DEFAULT_DOM_EVALUATION_TIMEOUT,
364
+ ): Promise<AnchorData[]> {
192
365
  log('Getting anchors');
193
366
 
194
367
  const $anchors = await page.$$('a[href], area[href]');
195
- const anchorList: AnchorData[] = [];
368
+ if ($anchors.length === 0) {
369
+ log('Got 0 anchors');
370
+ return [];
371
+ }
196
372
 
197
- for (const $anchor of $anchors) {
198
- const $href = await getProp({ $el: $anchor, propName: 'href', fallback: '' });
199
- const hrefVal = $href.toString();
200
- const href = parseUrl(hrefVal, options);
201
- if (!href || !href.isHTTP) {
202
- continue;
203
- }
204
- const axNode = await page.accessibility.snapshot({ root: $anchor });
205
- const textContent = await getProp({
206
- $el: $anchor,
207
- propName: 'textContent',
208
- fallback: '',
209
- });
210
- const accessibleName = axNode ? axNode.name || '' : textContent.trim();
211
- const link: AnchorData = {
212
- href,
213
- textContent: accessibleName,
214
- };
215
- anchorList.push(link);
373
+ const collected: AnchorData[] = [];
374
+ let axHits = 0;
375
+ let textFallbacks = 0;
376
+ // Set after the overall race trips so in-flight `resolveAnchor` calls can
377
+ // short-circuit instead of continuing to consume CDP capacity and pushing
378
+ // late entries into the already-returned `collected` array.
379
+ let cancelled = false;
380
+
381
+ const work = async () => {
382
+ const client = getInternalCDPClient(page);
383
+ if (cancelled) return;
384
+ const nameByBackendId = client
385
+ ? await buildAccessibleNameMap(client, timeout)
386
+ : new Map<number, string>();
387
+ if (cancelled) return;
388
+
389
+ await Promise.all(
390
+ $anchors.map(async ($anchor) => {
391
+ if (cancelled) return;
392
+ const resolved = await resolveAnchor(
393
+ $anchor,
394
+ client,
395
+ nameByBackendId,
396
+ options,
397
+ timeout,
398
+ );
399
+ if (cancelled || !resolved) {
400
+ return;
401
+ }
402
+ if (resolved.source === 'ax') {
403
+ axHits++;
404
+ } else {
405
+ textFallbacks++;
406
+ }
407
+ collected.push(resolved.anchor);
408
+ }),
409
+ );
410
+ };
411
+
412
+ const { timeout: timedOut } = await raceWithTimeout(work, timeout);
413
+ cancelled = true;
414
+ if (timedOut) {
415
+ log(
416
+ 'getAnchorList timed out after %dms; returning %d anchors collected so far',
417
+ timeout,
418
+ collected.length,
419
+ );
216
420
  }
217
421
 
218
- log('Got %d anchors', anchorList.length);
422
+ // Snapshot so post-return mutations from any in-flight Promise.all callback
423
+ // (already gated by `cancelled`, but not synchronously cancellable) cannot
424
+ // alter the array the caller now holds.
425
+ const result = [...collected];
426
+ log(
427
+ 'Got %d anchors (%d via AX, %d via textContent)',
428
+ result.length,
429
+ axHits,
430
+ textFallbacks,
431
+ );
219
432
  dLog(
220
433
  'Anchors are: %O',
221
- anchorList.map((a) => a.href.href),
434
+ result.map((a) => a.href.href),
222
435
  );
223
- return anchorList;
436
+ return result;
437
+ }
438
+
439
+ /**
440
+ * Resolves a single anchor handle into an {@link AnchorData} entry, or `null`
441
+ * if the anchor's href is not an HTTP(S) URL or an error is thrown while resolving it.
442
+ *
443
+ * Fires `getProp(href)` and `DOM.describeNode` in parallel, then looks up the
444
+ * accessible name from the pre-built AX map. If the anchor is not represented
445
+ * in the AX map (or CDP is unavailable), falls back to a lazy `textContent`
446
+ * fetch — only paying the extra CDP round-trip when actually needed.
447
+ * @param $anchor - The Puppeteer element handle for an anchor element.
448
+ * @param client - The shared CDP session, or `null` if unavailable.
449
+ * @param nameByBackendId - Map from `backendDOMNodeId` to accessible name.
450
+ * @param options - URL parsing options.
451
+ * @param timeout - Per-CDP-call timeout in ms.
452
+ * @returns The resolved anchor along with the name source, or `null` when the
453
+ * anchor's href is not crawlable.
454
+ */
455
+ async function resolveAnchor(
456
+ $anchor: ElementHandle<Element>,
457
+ client: CDPSession | null,
458
+ nameByBackendId: ReadonlyMap<number, string>,
459
+ options: ParseURLOptions | undefined,
460
+ timeout: number,
461
+ ): Promise<{ anchor: AnchorData; source: 'ax' | 'text' } | null> {
462
+ try {
463
+ const objectId = $anchor.remoteObject().objectId;
464
+ const [hrefVal, backendNodeId] = await Promise.all([
465
+ getProp({ $el: $anchor, propName: 'href', fallback: '' }, timeout),
466
+ client && objectId != null
467
+ ? resolveBackendNodeId(client, objectId, timeout)
468
+ : Promise.resolve(null),
469
+ ]);
470
+
471
+ const href = parseUrl(hrefVal.toString(), options);
472
+ if (!href || !href.isHTTP) {
473
+ return null;
474
+ }
475
+
476
+ const axName = backendNodeId == null ? undefined : nameByBackendId.get(backendNodeId);
477
+ if (axName !== undefined) {
478
+ return { anchor: { href, textContent: axName }, source: 'ax' };
479
+ }
480
+
481
+ const textContent = await getProp(
482
+ { $el: $anchor, propName: 'textContent', fallback: '' },
483
+ timeout,
484
+ );
485
+ return { anchor: { href, textContent: textContent.trim() }, source: 'text' };
486
+ } catch (error) {
487
+ // `remoteObject()` (and other synchronous handle accesses) can throw when
488
+ // the handle is disposed (page navigated mid-extraction). Drop just this
489
+ // anchor rather than poisoning the Promise.all over every other anchor.
490
+ dLog('resolveAnchor failed for an anchor: %O', error);
491
+ return null;
492
+ }
224
493
  }
225
494
 
226
495
  /**
227
- * Extracts comprehensive meta information from the page's `<head>`.
496
+ * Required context for {@link getMeta}. Provided by the scraper from data it
497
+ * already has on hand (URL it navigated to, response status/headers it received).
228
498
  *
229
- * Collects the following metadata:
230
- * - `title` - The document title.
231
- * - `lang` - The `lang` attribute of the `<html>` element.
232
- * - `description` - The `<meta name="description">` content.
233
- * - `keywords` - The `<meta name="keywords">` content.
234
- * - `noindex` / `nofollow` / `noarchive` - Parsed from the `<meta name="robots">` directives.
235
- * - `canonical` - The `<link rel="canonical">` content.
236
- * - `alternate` - The `<link rel="alternate">` content.
237
- * - Open Graph tags: `og:type`, `og:title`, `og:site_name`, `og:description`, `og:url`, `og:image`.
238
- * - `twitter:card` - The Twitter Card type.
239
- * @param page - The Puppeteer page to extract meta information from.
240
- * @returns An object containing all extracted meta properties.
499
+ * `html` is optional: when omitted, `getMeta` falls back to `page.content()`
500
+ * to obtain the rendered HTML for the third-party tag detection pass.
241
501
  */
242
- export async function getMeta(page: Page) {
502
+ export type GetMetaContext = {
503
+ /** The fully resolved URL of the page (after redirects). */
504
+ readonly url: string;
505
+ /** Rendered HTML. Falls back to `page.content()` when omitted. */
506
+ readonly html?: string;
507
+ /** Response status code, surfaced to the Wappalyzer driver. */
508
+ readonly statusCode?: number;
509
+ /** Response headers; case is preserved by the caller, lowercased internally. */
510
+ readonly headers?: Record<string, string | string[] | undefined>;
511
+ /**
512
+ * When `true`, the returned `Meta` includes `_raw: RawHeadEntry[]` for
513
+ * debugging. Default `false` to keep the serialized payload small.
514
+ */
515
+ readonly includeRaw?: boolean;
516
+ };
517
+
518
+ const WINDOW_GLOBALS_TO_CHECK: readonly string[] = [
519
+ 'dataLayer',
520
+ 'gtag',
521
+ 'ga',
522
+ '_gaq',
523
+ 'fbq',
524
+ '_fbq',
525
+ 'clarity',
526
+ '_hjSettings',
527
+ '_hjid',
528
+ 'twq',
529
+ 'ttq',
530
+ '_linkedin_partner_id',
531
+ 'pintrk',
532
+ 'amplitude',
533
+ 'mixpanel',
534
+ 'analytics',
535
+ 'heap',
536
+ 'posthog',
537
+ 'plausible',
538
+ 'fathom',
539
+ '_paq',
540
+ 's_account',
541
+ 's',
542
+ 'ym',
543
+ 'UET',
544
+ 'optimizely',
545
+ '_hsq',
546
+ 'Sentry',
547
+ 'Intercom',
548
+ 'intercomSettings',
549
+ 'drift',
550
+ 'Tawk_API',
551
+ 'zE',
552
+ 'OneTrust',
553
+ 'Cookiebot',
554
+ 'Stripe',
555
+ 'grecaptcha',
556
+ ];
557
+
558
+ /**
559
+ * Extracts comprehensive metadata from the page.
560
+ *
561
+ * Two passes happen (head collection and the HTML fetch run concurrently; tag detection then runs on that HTML):
562
+ * 1. Browser-side `collectHead()` serializes every `<meta>`, `<link>`,
563
+ * relevant `<script>`, `<base>`, `<noscript>`/`<iframe>` and a curated
564
+ * set of `window` globals into a `RawHeadEntry[]`. Node-side `classify()`
565
+ * then maps those entries to typed `Meta` fields using the lookup tables
566
+ * in `./meta/keys.ts`, with unknown entries preserved in `Meta.others`.
567
+ * 2. `detectTags()` runs `simple-wappalyzer` over the page HTML to produce
568
+ * `Meta.tags` (technology detection + real-ID extraction).
569
+ *
570
+ * The whole call is wrapped in `raceWithTimeout`. On timeout or failure an empty `Meta`
571
+ * (with `title: ''` and empty required arrays/objects) is returned.
572
+ * @param page
573
+ * @param context
574
+ * @param timeout
575
+ * @example
576
+ * ```ts
577
+ * const meta = await getMeta(page, {
578
+ * url: 'https://example.com/',
579
+ * html: await page.content(),
580
+ * statusCode: response.status,
581
+ * headers: response.headers,
582
+ * });
583
+ * console.log(meta.title); // <title> text
584
+ * console.log(meta.og?.image); // og:image[] array
585
+ * console.log(meta.robots?.noindex); // parsed robots
586
+ * console.log(meta.tags.detected.Analytics); // Wappalyzer hits
587
+ * console.log(meta.tags.entries.find(e => e.provider === 'Google Analytics')?.id);
588
+ * ```
589
+ */
590
+ export async function getMeta(
591
+ page: Page,
592
+ context: GetMetaContext,
593
+ timeout: number = DEFAULT_DOM_EVALUATION_TIMEOUT,
594
+ ): Promise<Meta> {
243
595
  log('Getting Meta');
244
596
 
245
- const robotsVal = await getPropBySelector({
246
- page,
247
- selector: 'meta[name="robots"]',
248
- propName: 'content',
249
- fallback: '',
250
- });
251
- const robots = new Set(robotsVal.split(',').map((robot) => robot.trim().toLowerCase()));
252
- const meta = {
253
- title: await getPropBySelector({
254
- page,
255
- selector: 'title',
256
- propName: 'textContent',
257
- fallback: '',
258
- }),
259
- lang: await getPropBySelector({
260
- page,
261
- selector: 'html',
262
- propName: 'lang',
263
- fallback: '',
264
- }),
265
- description: await getPropBySelector({
266
- page,
267
- selector: 'meta[name="description"]',
268
- propName: 'content',
269
- fallback: '',
270
- }),
271
- keywords: await getPropBySelector({
272
- page,
273
- selector: 'meta[name="keywords"]',
274
- propName: 'content',
275
- fallback: '',
276
- }),
277
- noindex: robots.has('noindex'),
278
- nofollow: robots.has('nofollow'),
279
- noarchive: robots.has('noarchive'),
280
- canonical: await getPropBySelector({
281
- page,
282
- selector: 'link[rel="canonical"]',
283
- propName: 'href',
284
- fallback: '',
285
- }),
286
- alternate: await getPropBySelector({
287
- page,
288
- selector: 'link[rel="alternate"]',
289
- propName: 'href',
290
- fallback: '',
291
- }),
292
- 'og:type': await getPropBySelector({
293
- page,
294
- selector: 'meta[property="og:type"]',
295
- propName: 'content',
296
- fallback: '',
297
- }),
298
- 'og:title': await getPropBySelector({
299
- page,
300
- selector: 'meta[property="og:title"]',
301
- propName: 'content',
302
- fallback: '',
303
- }),
304
- 'og:site_name': await getPropBySelector({
305
- page,
306
- selector: 'meta[property="og:site_name"]',
307
- propName: 'content',
308
- fallback: '',
309
- }),
310
- 'og:description': await getPropBySelector({
311
- page,
312
- selector: 'meta[property="og:description"]',
313
- propName: 'content',
314
- fallback: '',
315
- }),
316
- 'og:url': await getPropBySelector({
317
- page,
318
- selector: 'meta[property="og:url"]',
319
- propName: 'content',
320
- fallback: '',
321
- }),
322
- 'og:image': await getPropBySelector({
323
- page,
324
- selector: 'meta[property="og:image"]',
325
- propName: 'content',
326
- fallback: '',
327
- }),
328
- 'twitter:card': await getPropBySelector({
329
- page,
330
- selector: 'meta[name="twitter:card"]',
331
- propName: 'content',
332
- fallback: '',
333
- }),
334
- };
597
+ const { result, timeout: timedOut } = await raceWithTimeout(
598
+ () => runGetMeta(page, context),
599
+ timeout,
600
+ );
601
+
602
+ if (timedOut || result == null) {
603
+ log('Meta extraction timed out or failed; returning fallback');
604
+ return emptyMeta();
605
+ }
335
606
 
336
607
  log('Got meta');
337
- dLog('Meta data are: %O', meta);
338
- return meta;
608
+ dLog('Meta data are: %O', result);
609
+ return result;
610
+ }
611
+
612
+ /**
613
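+ * Collects the raw head entries and page HTML, runs tag detection on that HTML, then classifies the result; returns `null` if any step throws.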
614
+ * @param page
615
+ * @param context
616
+ */
617
+ async function runGetMeta(page: Page, context: GetMetaContext): Promise<Meta | null> {
618
+ try {
619
+ const rawPromise = collectHeadOnPage(page);
620
+ const htmlPromise: Promise<string> =
621
+ context.html === undefined
622
+ ? page.content().catch(() => '')
623
+ : Promise.resolve(context.html);
624
+ const [raw, html] = await Promise.all([rawPromise, htmlPromise]);
625
+ const tags = await detectTags({
626
+ url: context.url,
627
+ html,
628
+ ...(context.statusCode === undefined ? {} : { statusCode: context.statusCode }),
629
+ ...(context.headers === undefined ? {} : { headers: context.headers }),
630
+ });
631
+ return classify(raw, {
632
+ tags,
633
+ ...(context.includeRaw ? { includeRaw: true } : {}),
634
+ });
635
+ } catch (error) {
636
+ log('runGetMeta failed: %O', error);
637
+ return null;
638
+ }
639
+ }
640
+
641
+ /**
642
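+ * Gathers raw entries (`html`, `title`, `base`, `meta`, `link`, structured-data `script`, `iframe`, and any present `window-global` names) in a single `page.evaluate` call. Resolves to an empty array if the evaluate call rejects.
643
+ * @param page - Puppeteer page whose document is inspected.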
644
+ */
645
+ async function collectHeadOnPage(page: Page): Promise<RawHeadEntry[]> {
646
+ const raw = await page
647
+ .evaluate((knownGlobals: readonly string[]) => {
648
+ /* global document, HTMLLinkElement, HTMLMetaElement, HTMLBaseElement,
649
+ HTMLScriptElement, HTMLIFrameElement */
650
+ type Out = unknown;
651
+ const entries: Out[] = [];
652
+
653
+ const html = document.documentElement;
654
+ entries.push(
655
+ {
656
+ kind: 'html',
657
+ lang: html.lang || undefined,
658
+ dir: html.dir || undefined,
659
+ xmlns: html.getAttribute('xmlns') ?? undefined,
660
+ prefix: html.getAttribute('prefix') ?? undefined,
661
+ vocab: html.getAttribute('vocab') ?? undefined,
662
+ typeOf: html.getAttribute('typeof') ?? undefined,
663
+ itemscope: html.hasAttribute('itemscope') || undefined,
664
+ itemtype: html.getAttribute('itemtype') ?? undefined,
665
+ amp: html.hasAttribute('amp') || undefined,
666
+ lightning: html.hasAttribute('⚡') || undefined,
667
+ },
668
+ { kind: 'title', content: document.title },
669
+ );
670
+
671
+ for (const base of document.querySelectorAll('base')) {
672
+ if (!(base instanceof HTMLBaseElement)) continue;
673
+ entries.push({
674
+ kind: 'base',
675
+ href: base.getAttribute('href') ?? undefined,
676
+ target: base.getAttribute('target') ?? undefined,
677
+ });
678
+ }
679
+
680
+ for (const meta of document.querySelectorAll('meta')) {
681
+ if (!(meta instanceof HTMLMetaElement)) continue;
682
+ const name = meta.getAttribute('name');
683
+ const property = meta.getAttribute('property');
684
+ const httpEquiv = meta.getAttribute('http-equiv');
685
+ const itemprop = meta.getAttribute('itemprop');
686
+ const charset = meta.getAttribute('charset');
687
+ const content = meta.getAttribute('content');
688
+ const media = meta.getAttribute('media');
689
+ entries.push({
690
+ kind: 'meta',
691
+ name: name ? name.toLowerCase() : undefined,
692
+ property: property ? property.toLowerCase() : undefined,
693
+ httpEquiv: httpEquiv ? httpEquiv.toLowerCase() : undefined,
694
+ itemprop: itemprop ?? undefined,
695
+ charset: charset ?? undefined,
696
+ content: content ?? undefined,
697
+ media: media ?? undefined,
698
+ });
699
+ }
700
+
701
+ for (const link of document.querySelectorAll('link[href]')) {
702
+ if (!(link instanceof HTMLLinkElement)) continue;
703
+ const relRaw = link.getAttribute('rel') ?? '';
704
+ const rel = relRaw.toLowerCase().split(/\s+/u).filter(Boolean);
705
+ entries.push({
706
+ kind: 'link',
707
+ rel,
708
+ href: link.getAttribute('href') ?? '',
709
+ type: link.getAttribute('type') ?? undefined,
710
+ media: link.getAttribute('media') ?? undefined,
711
+ sizes: link.getAttribute('sizes') ?? undefined,
712
+ title: link.getAttribute('title') ?? undefined,
713
+ hreflang: link.getAttribute('hreflang') ?? undefined,
714
+ as: link.getAttribute('as') ?? undefined,
715
+ crossorigin: link.getAttribute('crossorigin') ?? undefined,
716
+ color: link.getAttribute('color') ?? undefined,
717
+ blocking: link.getAttribute('blocking') ?? undefined,
718
+ imagesrcset: link.getAttribute('imagesrcset') ?? undefined,
719
+ });
720
+ }
721
+
722
+ const STRUCTURED_TYPES = new Set([
723
+ 'application/ld+json',
724
+ 'speculationrules',
725
+ 'application/json+oembed',
726
+ 'application/xml+oembed',
727
+ ]);
728
+ for (const script of document.querySelectorAll('script[type]')) {
729
+ if (!(script instanceof HTMLScriptElement)) continue;
730
+ const scriptType = (script.getAttribute('type') ?? '').toLowerCase();
731
+ if (!STRUCTURED_TYPES.has(scriptType)) continue;
732
+ const src = script.getAttribute('src') ?? undefined;
733
+ const text = script.textContent ?? '';
734
+ const inHead = !!script.closest('head');
735
+ const inNoscript = !!script.closest('noscript');
736
+ const location = inHead ? 'head' : inNoscript ? 'noscript' : 'body';
737
+ entries.push({
738
+ kind: 'script',
739
+ scriptType,
740
+ content: text || undefined,
741
+ src,
742
+ location,
743
+ });
744
+ }
745
+
746
+ for (const iframe of document.querySelectorAll('iframe[src]')) {
747
+ if (!(iframe instanceof HTMLIFrameElement)) continue;
748
+ const src = iframe.getAttribute('src') ?? '';
749
+ if (!src) continue;
750
+ const inHead = !!iframe.closest('head');
751
+ const inNoscript = !!iframe.closest('noscript');
752
+ const location = inHead ? 'head' : inNoscript ? 'noscript' : 'body';
753
+ entries.push({ kind: 'iframe', src, location });
754
+ }
755
+
756
+ const win = window as unknown as Record<string, unknown>;
757
+ const presentGlobals = knownGlobals.filter((name) => win[name] !== undefined);
758
+ if (presentGlobals.length > 0) {
759
+ entries.push({ kind: 'window-global', names: presentGlobals });
760
+ }
761
+
762
+ return entries;
763
+ }, WINDOW_GLOBALS_TO_CHECK)
764
+ .catch(() => [] as unknown[]);
765
+
766
+ return raw as RawHeadEntry[];
339
767
  }