@d-zero/beholder 2.1.6 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. package/CHANGELOG.md +44 -0
  2. package/README.md +26 -0
  3. package/dist/dom-evaluation.d.ts +72 -24
  4. package/dist/dom-evaluation.js +310 -84
  5. package/dist/extract-meta.d.ts +98 -0
  6. package/dist/extract-meta.js +75 -0
  7. package/dist/index.d.ts +3 -1
  8. package/dist/index.js +1 -0
  9. package/dist/meta/classify.d.ts +52 -0
  10. package/dist/meta/classify.js +731 -0
  11. package/dist/meta/collect-head.d.ts +63 -0
  12. package/dist/meta/collect-head.js +223 -0
  13. package/dist/meta/id-extractors.d.ts +40 -0
  14. package/dist/meta/id-extractors.js +196 -0
  15. package/dist/meta/keys.d.ts +41 -0
  16. package/dist/meta/keys.js +507 -0
  17. package/dist/meta/parsers.d.ts +74 -0
  18. package/dist/meta/parsers.js +293 -0
  19. package/dist/meta/tag-detection.d.ts +59 -0
  20. package/dist/meta/tag-detection.js +120 -0
  21. package/dist/meta/types.d.ts +874 -0
  22. package/dist/meta/types.js +12 -0
  23. package/dist/scraper.js +15 -13
  24. package/dist/types.d.ts +3 -38
  25. package/package.json +8 -5
  26. package/src/dom-evaluation.spec.ts +301 -73
  27. package/src/dom-evaluation.ts +417 -88
  28. package/src/extract-meta.spec.ts +247 -0
  29. package/src/extract-meta.ts +121 -0
  30. package/src/index.ts +45 -0
  31. package/src/meta/classify.spec.ts +281 -0
  32. package/src/meta/classify.ts +810 -0
  33. package/src/meta/collect-head.ts +247 -0
  34. package/src/meta/id-extractors.spec.ts +69 -0
  35. package/src/meta/id-extractors.ts +206 -0
  36. package/src/meta/keys.ts +568 -0
  37. package/src/meta/parsers.spec.ts +178 -0
  38. package/src/meta/parsers.ts +304 -0
  39. package/src/meta/simple-wappalyzer.d.ts +37 -0
  40. package/src/meta/tag-detection.spec.ts +134 -0
  41. package/src/meta/tag-detection.ts +161 -0
  42. package/src/meta/types.ts +949 -0
  43. package/src/scraper.ts +19 -13
  44. package/src/types.ts +49 -55
  45. package/tsconfig.tsbuildinfo +1 -1
@@ -14,12 +14,16 @@
14
14
  * @see {@link ./types.ts} for the data types returned by these functions
15
15
  */
16
16
 
17
+ import type { RawHeadEntry } from './meta/types.js';
17
18
  import type { AnchorData, ImageElement, Meta, ParseURLOptions } from './types.js';
18
- import type { ElementHandle, Page } from 'puppeteer';
19
+ import type { CDPSession, ElementHandle, Page } from 'puppeteer';
19
20
 
20
21
  import { raceWithTimeout } from '@d-zero/shared/race-with-timeout';
21
22
 
22
23
  import { domDetailsLog, domLog } from './debug.js';
24
+ import { classify, emptyMeta } from './meta/classify.js';
25
+ import { WINDOW_GLOBALS_TO_CHECK, collectHeadFromDocument } from './meta/collect-head.js';
26
+ import { detectTags } from './meta/tag-detection.js';
23
27
  import { parseUrl } from './parse-url.js';
24
28
 
25
29
  const pid = `${process.pid}`;
@@ -30,8 +34,12 @@ const dLog = domDetailsLog.extend(pid);
30
34
  * Default timeout (ms) applied to DOM evaluation operations when the caller does not
31
35
  * specify one. Bounds how long a single `page.evaluate` / property read may hang on a
32
36
  * page whose main thread is unresponsive.
37
+ *
38
+ * WHY 180s: Aligned with the upstream `Scraper#fetchData` retryable timeout (3 min) so
39
+ * a single phase does not exceed the retry budget while still tolerating large pages
40
+ * (e.g., 1000+ anchors) and slow main threads.
33
41
  */
34
- export const DEFAULT_DOM_EVALUATION_TIMEOUT = 30_000;
42
+ export const DEFAULT_DOM_EVALUATION_TIMEOUT = 180_000;
35
43
 
36
44
  /**
37
45
  * Parameters for {@link getProp}.
@@ -159,6 +167,170 @@ export async function getImageList(
159
167
  return imageList;
160
168
  }
161
169
 
170
+ /**
171
+ * Page-like shape exposing puppeteer's internal CDP session.
172
+ *
173
+ * WHY private `_client()` instead of `page.createCDPSession()`: the `objectId`
174
+ * returned by {@link ElementHandle.remoteObject} is scoped to the page's primary
175
+ * session. A fresh session created via `createCDPSession()` cannot resolve those
176
+ * `objectId` values when calling `DOM.describeNode`, so we must reuse the same
177
+ * session puppeteer uses internally.
178
+ */
179
+ interface PageWithInternalClient {
180
+ _client(): CDPSession;
181
+ }
182
+
183
+ /** Minimal shape of a CDP `Accessibility.AXValue` we read from. */
184
+ interface AXValueLike {
185
+ readonly value?: unknown;
186
+ }
187
+
188
+ /** Minimal shape of a CDP `Accessibility.AXNode` we read from. */
189
+ interface AXNodeLike {
190
+ readonly backendDOMNodeId?: number;
191
+ readonly ignored?: boolean;
192
+ readonly name?: AXValueLike;
193
+ }
194
+
195
+ interface GetFullAXTreeResponse {
196
+ readonly nodes: readonly AXNodeLike[];
197
+ }
198
+
199
+ interface DescribeNodeResponse {
200
+ readonly node: { readonly backendNodeId?: number };
201
+ }
202
+
203
+ /**
204
+ * One-shot warning latch: only the first time `_client()` is missing in a
205
+ * process do we log the degradation. Subsequent calls stay silent to avoid
206
+ * spamming logs while every page in a crawl re-enters the fallback path.
207
+ */
208
+ let warnedAboutMissingClient = false;
209
+
210
+ /**
211
+ * Returns puppeteer's internal CDP session for the page, or `null` if it is
212
+ * unreachable (e.g., test mocks, puppeteer wrappers that hide the internal API,
213
+ * or a future puppeteer release that renames `_client`).
214
+ *
215
+ * WHY a warning log: callers transparently fall back to textContent-only mode
216
+ * when this returns `null`, which masks a silent perf regression if a
217
+ * puppeteer update removes `_client`. The warning makes the degraded state
218
+ * observable in production logs so a maintainer can patch the access path.
219
+ *
220
+ * Callers fall back to a textContent-only path when this returns `null`.
221
+ * @param page - The Puppeteer page.
222
+ */
223
+ function getInternalCDPClient(page: Page): CDPSession | null {
224
+ try {
225
+ const client = (page as unknown as Partial<PageWithInternalClient>)._client?.();
226
+ if (!client) {
227
+ if (!warnedAboutMissingClient) {
228
+ warnedAboutMissingClient = true;
229
+ log(
230
+ 'WARN: puppeteer Page._client() returned no session — getAnchorList ' +
231
+ 'falls back to textContent-only mode. Verify the installed puppeteer ' +
232
+ 'version still exposes the internal _client() accessor.',
233
+ );
234
+ }
235
+ return null;
236
+ }
237
+ return client;
238
+ } catch (error) {
239
+ if (!warnedAboutMissingClient) {
240
+ warnedAboutMissingClient = true;
241
+ log(
242
+ 'WARN: puppeteer Page._client() threw — getAnchorList falls back to ' +
243
+ 'textContent-only mode. Error: %O',
244
+ error,
245
+ );
246
+ }
247
+ return null;
248
+ }
249
+ }
250
+
251
+ /**
252
+ * Fetches the full accessibility tree once and builds a `backendDOMNodeId → accessibleName`
253
+ * map covering every non-ignored AX node that exposes a backend DOM id.
254
+ *
255
+ * WHY include every non-ignored node (not just `role === 'link'`): the original
256
+ * `page.accessibility.snapshot({ root })` returned whatever AX node represented
257
+ * the anchor — including anchors whose computed role was overridden via ARIA
258
+ * (e.g., `<a role="button">`). Mapping every non-ignored node preserves that.
259
+ *
260
+ * WHY skip `ignored === true`: puppeteer's high-level snapshot uses
261
+ * `interestingOnly: true` by default and returns `null` for ignored nodes
262
+ * (aria-hidden, display:none, visibility:hidden). The old code then fell back
263
+ * to `textContent.trim()`. Including ignored nodes here would short-circuit
264
+ * that fallback with the AX tree's empty name and silently drop link text.
265
+ *
266
+ * On timeout or CDP failure, an empty map is returned so callers transparently
267
+ * fall back to `textContent.trim()` for every anchor.
268
+ * @param client - The CDP session attached to the page.
269
+ * @param timeout - Maximum time to wait for the AX tree fetch.
270
+ */
271
+ async function buildAccessibleNameMap(
272
+ client: CDPSession,
273
+ timeout: number,
274
+ ): Promise<Map<number, string>> {
275
+ const { result, timeout: timedOut } = await raceWithTimeout(
276
+ () =>
277
+ client
278
+ .send('Accessibility.getFullAXTree')
279
+ .then((res) => res as unknown as GetFullAXTreeResponse)
280
+ .catch((error: unknown) => {
281
+ log('Accessibility.getFullAXTree failed: %O', error);
282
+ return null;
283
+ }),
284
+ timeout,
285
+ );
286
+ const map = new Map<number, string>();
287
+ if (timedOut) {
288
+ log('Accessibility.getFullAXTree timed out after %dms', timeout);
289
+ return map;
290
+ }
291
+ if (!result?.nodes) {
292
+ return map;
293
+ }
294
+ for (const node of result.nodes) {
295
+ if (node.backendDOMNodeId == null || node.ignored === true) {
296
+ continue;
297
+ }
298
+ const name = typeof node.name?.value === 'string' ? node.name.value : '';
299
+ map.set(node.backendDOMNodeId, name);
300
+ }
301
+ return map;
302
+ }
303
+
304
+ /**
305
+ * Resolves a CDP backend node id for a given element handle.
306
+ *
307
+ * Wrapped in {@link raceWithTimeout} so a single hung `DOM.describeNode` cannot
308
+ * stall the outer `Promise.all` over every anchor on the page.
309
+ * @param client - The CDP session attached to the page (must be the same session
310
+ * that owns the handle's `objectId`).
311
+ * @param objectId - The remote object id of the element handle.
312
+ * @param timeout - Maximum time to wait for the describeNode call.
313
+ * @returns The backend node id, or `null` if unavailable / timed out / failed.
314
+ */
315
+ async function resolveBackendNodeId(
316
+ client: CDPSession,
317
+ objectId: string,
318
+ timeout: number,
319
+ ): Promise<number | null> {
320
+ const { result, timeout: timedOut } = await raceWithTimeout(
321
+ () =>
322
+ client
323
+ .send('DOM.describeNode', { objectId })
324
+ .then((res) => res as unknown as DescribeNodeResponse)
325
+ .catch(() => null),
326
+ timeout,
327
+ );
328
+ if (timedOut || !result) {
329
+ return null;
330
+ }
331
+ return result.node?.backendNodeId ?? null;
332
+ }
333
+
162
334
  /**
163
335
  * Extracts all anchor (`<a>` and `<area>`) elements with `href` attributes from the page.
164
336
  *
@@ -166,132 +338,289 @@ export async function getImageList(
166
338
  * the accessible name (from the accessibility tree, falling back to `textContent`),
167
339
  * and filters out non-HTTP links.
168
340
  *
169
- * WHY this keeps per-element CDP calls (unlike {@link getMeta} / {@link getImageList}):
170
- * the accessible name comes from Chrome's computed accessibility tree
171
- * (`page.accessibility.snapshot`), which is a CDP-only feature unavailable to in-page
172
- * DOM APIs. Each {@link getProp} read is still bounded by `timeout`.
341
+ * WHY Strategy F (single AX-tree fetch + parallel `DOM.describeNode`): the old
342
+ * implementation called `page.accessibility.snapshot({ root })` per anchor, which
343
+ * triggers a CDP round-trip *and* a Chrome-side AX subtree computation (~42ms
344
+ * each). On a page with 1181 anchors that compounded to ~53s. By fetching the
345
+ * full AX tree once and using `DOM.describeNode` in parallel to map element
346
+ * handles back to AX nodes by `backendDOMNodeId`, the same data is collected in
347
+ * ~150ms on the same page — a ~350× speed-up while preserving the original
348
+ * accessible-name semantics. See issue #876 for measurements.
349
+ *
350
+ * WHY the whole operation is wrapped in `raceWithTimeout`: even with bounded
351
+ * per-CDP-call timeouts, a degenerate page (blocked main thread, thousands of
352
+ * anchors, runaway describeNode latency) could chain enough sub-timeouts to
353
+ * exceed the caller's `timeout` budget. The outer race guarantees the function
354
+ * returns within `timeout`, surfacing whatever anchors were collected so far so
355
+ * the upstream scrape phase can continue rather than tripping the retryable timeout and forcing a retry.
173
356
  * @param page - The Puppeteer page to extract anchors from.
174
357
  * @param options - Optional URL parsing options (e.g., `disableQueries`).
175
- * @param timeout - Timeout in ms per property read. Defaults to {@link DEFAULT_DOM_EVALUATION_TIMEOUT}.
358
+ * @param timeout - Total time budget in ms for the whole extraction. Defaults to {@link DEFAULT_DOM_EVALUATION_TIMEOUT}.
176
359
  * @returns An array of {@link AnchorData} objects for all HTTP(S) links found on the page.
177
360
  */
178
361
  export async function getAnchorList(
179
362
  page: Page,
180
363
  options?: ParseURLOptions,
181
364
  timeout: number = DEFAULT_DOM_EVALUATION_TIMEOUT,
182
- ) {
365
+ ): Promise<AnchorData[]> {
183
366
  log('Getting anchors');
184
367
 
185
368
  const $anchors = await page.$$('a[href], area[href]');
186
- const anchorList: AnchorData[] = [];
369
+ if ($anchors.length === 0) {
370
+ log('Got 0 anchors');
371
+ return [];
372
+ }
373
+
374
+ const collected: AnchorData[] = [];
375
+ let axHits = 0;
376
+ let textFallbacks = 0;
377
+ // Set after the overall race trips so in-flight `resolveAnchor` calls can
378
+ // short-circuit instead of continuing to consume CDP capacity and pushing
379
+ // late entries into the already-returned `collected` array.
380
+ let cancelled = false;
381
+
382
+ const work = async () => {
383
+ const client = getInternalCDPClient(page);
384
+ if (cancelled) return;
385
+ const nameByBackendId = client
386
+ ? await buildAccessibleNameMap(client, timeout)
387
+ : new Map<number, string>();
388
+ if (cancelled) return;
389
+
390
+ await Promise.all(
391
+ $anchors.map(async ($anchor) => {
392
+ if (cancelled) return;
393
+ const resolved = await resolveAnchor(
394
+ $anchor,
395
+ client,
396
+ nameByBackendId,
397
+ options,
398
+ timeout,
399
+ );
400
+ if (cancelled || !resolved) {
401
+ return;
402
+ }
403
+ if (resolved.source === 'ax') {
404
+ axHits++;
405
+ } else {
406
+ textFallbacks++;
407
+ }
408
+ collected.push(resolved.anchor);
409
+ }),
410
+ );
411
+ };
187
412
 
188
- for (const $anchor of $anchors) {
189
- const $href = await getProp(
190
- { $el: $anchor, propName: 'href', fallback: '' },
413
+ const { timeout: timedOut } = await raceWithTimeout(work, timeout);
414
+ cancelled = true;
415
+ if (timedOut) {
416
+ log(
417
+ 'getAnchorList timed out after %dms; returning %d anchors collected so far',
191
418
  timeout,
419
+ collected.length,
192
420
  );
193
- const hrefVal = $href.toString();
194
- const href = parseUrl(hrefVal, options);
421
+ }
422
+
423
+ // Snapshot so post-return mutations from any in-flight Promise.all callback
424
+ // (already gated by `cancelled`, but not synchronously cancellable) cannot
425
+ // alter the array the caller now holds.
426
+ const result = [...collected];
427
+ log(
428
+ 'Got %d anchors (%d via AX, %d via textContent)',
429
+ result.length,
430
+ axHits,
431
+ textFallbacks,
432
+ );
433
+ dLog(
434
+ 'Anchors are: %O',
435
+ result.map((a) => a.href.href),
436
+ );
437
+ return result;
438
+ }
439
+
440
+ /**
441
+ * Resolves a single anchor handle into an {@link AnchorData} entry, or `null`
442
+ * if the anchor's href is not an HTTP(S) URL.
443
+ *
444
+ * Fires `getProp(href)` and `DOM.describeNode` in parallel, then looks up the
445
+ * accessible name from the pre-built AX map. If the anchor is not represented
446
+ * in the AX map (or CDP is unavailable), falls back to a lazy `textContent`
447
+ * fetch — only paying the extra CDP round-trip when actually needed.
448
+ * @param $anchor - The Puppeteer element handle for an anchor element.
449
+ * @param client - The shared CDP session, or `null` if unavailable.
450
+ * @param nameByBackendId - Map from `backendDOMNodeId` to accessible name.
451
+ * @param options - URL parsing options.
452
+ * @param timeout - Per-CDP-call timeout in ms.
453
+ * @returns The resolved anchor along with the name source, or `null` when the
454
+ * anchor's href is not crawlable or resolving the handle throws.
455
+ */
456
+ async function resolveAnchor(
457
+ $anchor: ElementHandle<Element>,
458
+ client: CDPSession | null,
459
+ nameByBackendId: ReadonlyMap<number, string>,
460
+ options: ParseURLOptions | undefined,
461
+ timeout: number,
462
+ ): Promise<{ anchor: AnchorData; source: 'ax' | 'text' } | null> {
463
+ try {
464
+ const objectId = $anchor.remoteObject().objectId;
465
+ const [hrefVal, backendNodeId] = await Promise.all([
466
+ getProp({ $el: $anchor, propName: 'href', fallback: '' }, timeout),
467
+ client && objectId != null
468
+ ? resolveBackendNodeId(client, objectId, timeout)
469
+ : Promise.resolve(null),
470
+ ]);
471
+
472
+ const href = parseUrl(hrefVal.toString(), options);
195
473
  if (!href || !href.isHTTP) {
196
- continue;
474
+ return null;
475
+ }
476
+
477
+ const axName = backendNodeId == null ? undefined : nameByBackendId.get(backendNodeId);
478
+ if (axName !== undefined) {
479
+ return { anchor: { href, textContent: axName }, source: 'ax' };
197
480
  }
198
- const axNode = await page.accessibility.snapshot({ root: $anchor });
481
+
199
482
  const textContent = await getProp(
200
483
  { $el: $anchor, propName: 'textContent', fallback: '' },
201
484
  timeout,
202
485
  );
203
- const accessibleName = axNode ? axNode.name || '' : textContent.trim();
204
- const link: AnchorData = {
205
- href,
206
- textContent: accessibleName,
207
- };
208
- anchorList.push(link);
486
+ return { anchor: { href, textContent: textContent.trim() }, source: 'text' };
487
+ } catch (error) {
488
+ // `remoteObject()` (and other synchronous handle accesses) can throw when
489
+ // the handle is disposed (page navigated mid-extraction). Drop just this
490
+ // anchor rather than poisoning the Promise.all over every other anchor.
491
+ dLog('resolveAnchor failed for an anchor: %O', error);
492
+ return null;
209
493
  }
210
-
211
- log('Got %d anchors', anchorList.length);
212
- dLog(
213
- 'Anchors are: %O',
214
- anchorList.map((a) => a.href.href),
215
- );
216
- return anchorList;
217
494
  }
218
495
 
219
496
  /**
220
- * Extracts comprehensive meta information from the page's `<head>`.
497
+ * Required context for {@link getMeta}. Provided by the scraper from data it
498
+ * already has on hand (URL it navigated to, response status/headers it received).
221
499
  *
222
- * Collects all metadata in a single `page.evaluate` call (14 CDP round-trips
223
- * collapsed into 1) wrapped in {@link raceWithTimeout}. On timeout (an unresponsive
224
- * page) a minimal `{ title: '' }` is returned rather than hanging.
500
+ * `html` is optional: when omitted, `getMeta` falls back to `page.content()`
501
+ * to obtain the rendered HTML for the third-party tag detection pass.
502
+ */
503
+ export type GetMetaContext = {
504
+ /** The fully resolved URL of the page (after redirects). */
505
+ readonly url: string;
506
+ /** Rendered HTML. Falls back to `page.content()` when omitted. */
507
+ readonly html?: string;
508
+ /** Response status code, surfaced to the Wappalyzer driver. */
509
+ readonly statusCode?: number;
510
+ /** Response headers; case is preserved by the caller, lowercased internally. */
511
+ readonly headers?: Record<string, string | string[] | undefined>;
512
+ /**
513
+ * When `true`, the returned `Meta` includes `_raw: RawHeadEntry[]` for
514
+ * debugging. Default `false` to keep the serialized payload small.
515
+ */
516
+ readonly includeRaw?: boolean;
517
+ };
518
+
519
+ /**
520
+ * Extracts comprehensive metadata from the page.
225
521
  *
226
- * Collected metadata:
227
- * - `title` - The document title.
228
- * - `lang` - The `lang` attribute of the `<html>` element.
229
- * - `description` - The `<meta name="description">` content.
230
- * - `keywords` - The `<meta name="keywords">` content.
231
- * - `noindex` / `nofollow` / `noarchive` - Parsed from the `<meta name="robots">` directives.
232
- * - `canonical` - The `<link rel="canonical">` content.
233
- * - `alternate` - The `<link rel="alternate">` content.
234
- * - Open Graph tags: `og:type`, `og:title`, `og:site_name`, `og:description`, `og:url`, `og:image`.
235
- * - `twitter:card` - The Twitter Card type.
236
- * @param page - The Puppeteer page to extract meta information from.
237
- * @param timeout - Timeout in ms for the evaluation. Defaults to {@link DEFAULT_DOM_EVALUATION_TIMEOUT}.
238
- * @returns An object containing all extracted meta properties.
522
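+ * Two passes contribute to the result (the head collection and HTML fetch run in parallel; tag detection and classification follow):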
523
+ * 1. Browser-side `collectHead()` serializes every `<meta>`, `<link>`,
524
+ * relevant `<script>`, `<base>`, `<noscript>`/`<iframe>` and a curated
525
+ * set of `window` globals into a `RawHeadEntry[]`. Node-side `classify()`
526
+ * then maps those entries to typed `Meta` fields using the lookup tables
527
+ * in `./meta/keys.ts`, with unknown entries preserved in `Meta.others`.
528
+ * 2. `detectTags()` runs `simple-wappalyzer` over the page HTML to produce
529
+ * `Meta.tags` (technology detection + real-ID extraction).
530
+ *
531
+ * The whole call is wrapped in `raceWithTimeout`. On timeout or failure an empty `Meta`
532
+ * (with `title: ''` and empty required arrays/objects) is returned.
533
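+ * @param page - The Puppeteer page to extract metadata from.
534
+ * @param context - Page URL plus optional HTML, status code, headers and `includeRaw` flag (see {@link GetMetaContext}).
535
+ * @param timeout - Time budget in ms for the whole extraction. Defaults to {@link DEFAULT_DOM_EVALUATION_TIMEOUT}.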
536
+ * @example
537
+ * ```ts
538
+ * const meta = await getMeta(page, {
539
+ * url: 'https://example.com/',
540
+ * html: await page.content(),
541
+ * statusCode: response.status,
542
+ * headers: response.headers,
543
+ * });
544
+ * console.log(meta.title); // <title> text
545
+ * console.log(meta.og?.image); // og:image[] array
546
+ * console.log(meta.robots?.noindex); // parsed robots
547
+ * console.log(meta.tags.detected.Analytics); // Wappalyzer hits
548
+ * console.log(meta.tags.entries.find(e => e.provider === 'Google Analytics')?.id);
549
+ * ```
239
550
  */
240
551
  export async function getMeta(
241
552
  page: Page,
553
+ context: GetMetaContext,
242
554
  timeout: number = DEFAULT_DOM_EVALUATION_TIMEOUT,
243
555
  ): Promise<Meta> {
244
556
  log('Getting Meta');
245
557
 
246
558
  const { result, timeout: timedOut } = await raceWithTimeout(
247
- () =>
248
- page
249
- .evaluate(() => {
250
- /* global document, HTMLMetaElement, HTMLLinkElement */
251
- const content = (selector: string): string => {
252
- const el = document.querySelector(selector);
253
- return el instanceof HTMLMetaElement ? el.content : '';
254
- };
255
- const linkHref = (selector: string): string => {
256
- const el = document.querySelector(selector);
257
- return el instanceof HTMLLinkElement ? el.href : '';
258
- };
259
- return {
260
- title: document.title,
261
- lang: document.documentElement.lang,
262
- description: content('meta[name="description"]'),
263
- keywords: content('meta[name="keywords"]'),
264
- robots: content('meta[name="robots"]'),
265
- canonical: linkHref('link[rel="canonical"]'),
266
- alternate: linkHref('link[rel="alternate"]'),
267
- 'og:type': content('meta[property="og:type"]'),
268
- 'og:title': content('meta[property="og:title"]'),
269
- 'og:site_name': content('meta[property="og:site_name"]'),
270
- 'og:description': content('meta[property="og:description"]'),
271
- 'og:url': content('meta[property="og:url"]'),
272
- 'og:image': content('meta[property="og:image"]'),
273
- 'twitter:card': content('meta[name="twitter:card"]'),
274
- };
275
- })
276
- .catch(() => null),
559
+ () => runGetMeta(page, context),
277
560
  timeout,
278
561
  );
279
562
 
280
563
  if (timedOut || result == null) {
281
564
  log('Meta extraction timed out or failed; returning fallback');
282
- return { title: '' };
565
+ return emptyMeta();
283
566
  }
284
567
 
285
- const { robots: robotsVal, ...rest } = result;
286
- const robots = new Set(robotsVal.split(',').map((robot) => robot.trim().toLowerCase()));
287
- const meta: Meta = {
288
- ...rest,
289
- noindex: robots.has('noindex'),
290
- nofollow: robots.has('nofollow'),
291
- noarchive: robots.has('noarchive'),
292
- };
293
-
294
568
  log('Got meta');
295
- dLog('Meta data are: %O', meta);
296
- return meta;
569
+ dLog('Meta data are: %O', result);
570
+ return result;
571
+ }
572
+
573
+ /**
574
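+ * Collects the raw head entries and page HTML, runs tag detection, and classifies the result into a `Meta`. Returns `null` if any step throws.
575
+ * @param page - The Puppeteer page to inspect.
576
+ * @param context - URL, optional HTML and response details forwarded to tag detection.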
577
+ */
578
+ async function runGetMeta(page: Page, context: GetMetaContext): Promise<Meta | null> {
579
+ try {
580
+ const rawPromise = collectHeadOnPage(page);
581
+ const htmlPromise: Promise<string> =
582
+ context.html === undefined
583
+ ? page.content().catch(() => '')
584
+ : Promise.resolve(context.html);
585
+ const [raw, html] = await Promise.all([rawPromise, htmlPromise]);
586
+ const tags = await detectTags({
587
+ url: context.url,
588
+ html,
589
+ ...(context.statusCode === undefined ? {} : { statusCode: context.statusCode }),
590
+ ...(context.headers === undefined ? {} : { headers: context.headers }),
591
+ });
592
+ return classify(raw, {
593
+ tags,
594
+ ...(context.includeRaw ? { includeRaw: true } : {}),
595
+ });
596
+ } catch (error) {
597
+ log('runGetMeta failed: %O', error);
598
+ return null;
599
+ }
600
+ }
601
+
602
+ /**
603
+ * Collects raw `<head>` entries from a Puppeteer page by injecting
604
+ * {@link collectHeadFromDocument} into the page realm.
605
+ *
606
+ * WHY string-eval instead of `page.evaluate(fn, args)`: the shared
607
+ * implementation lives in `./meta/collect-head.js` (`collectHeadFromDocument`), and a
608
+ * `page.evaluate(() => collectHeadFromDocument(window, …))` wrapper cannot
609
+ * reach that module-scope binding inside the page realm — only the wrapper's
610
+ * own source crosses the CDP boundary. Serializing the implementation via
611
+ * `Function.prototype.toString` and invoking it through
612
+ * `page.evaluate(string)` is what keeps the Puppeteer path and the
613
+ * jsdom path on one source of truth.
614
+ *
615
+ * The same {@link collectHeadFromDocument} function is also exposed via
616
+ * {@link ../extract-meta.ts | extractMetaFromDocument} for jsdom/Node callers,
617
+ * so the two paths cannot drift apart.
618
+ * @param page - The Puppeteer page whose document will be inspected.
619
+ */
620
+ async function collectHeadOnPage(page: Page): Promise<RawHeadEntry[]> {
621
+ const fnSource = collectHeadFromDocument.toString();
622
+ const globalsLiteral = JSON.stringify(WINDOW_GLOBALS_TO_CHECK);
623
+ const expr = `(${fnSource})(window, ${globalsLiteral})`;
624
+ const raw = await page.evaluate(expr).catch(() => [] as unknown[]);
625
+ return raw as RawHeadEntry[];
297
626
  }