@d-zero/beholder 2.1.6 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -15,6 +15,8 @@
15
15
  */
16
16
  import { raceWithTimeout } from '@d-zero/shared/race-with-timeout';
17
17
  import { domDetailsLog, domLog } from './debug.js';
18
+ import { classify, emptyMeta } from './meta/classify.js';
19
+ import { detectTags } from './meta/tag-detection.js';
18
20
  import { parseUrl } from './parse-url.js';
19
21
  const pid = `${process.pid}`;
20
22
  const log = domLog.extend(pid);
@@ -23,8 +25,12 @@ const dLog = domDetailsLog.extend(pid);
23
25
  * Default timeout (ms) applied to DOM evaluation operations when the caller does not
24
26
  * specify one. Bounds how long a single `page.evaluate` / property read may hang on a
25
27
  * page whose main thread is unresponsive.
28
+ *
29
+ * WHY 180s: Aligned with the upstream `Scraper#fetchData` retryable timeout (3 min) so
30
+ * a single phase does not exceed the retry budget while still tolerating large pages
31
+ * (e.g., 1000+ anchors) and slow main threads.
26
32
  */
27
- export const DEFAULT_DOM_EVALUATION_TIMEOUT = 30_000;
33
+ export const DEFAULT_DOM_EVALUATION_TIMEOUT = 180_000;
28
34
  /**
29
35
  * Retrieves a DOM property value from a Puppeteer element handle with a timeout.
30
36
  *
@@ -108,6 +114,114 @@ export async function getImageList(page, viewportWidth, timeout = DEFAULT_DOM_EV
108
114
  dLog('Images are: %O', imageList.map((i) => i.src));
109
115
  return imageList;
110
116
  }
117
+ /**
118
+ * One-shot warning latch: only the first time `_client()` is missing in a
119
+ * process do we log the degradation. Subsequent calls stay silent to avoid
120
+ * spamming logs while every page in a crawl re-enters the fallback path.
121
+ */
122
+ let warnedAboutMissingClient = false;
123
+ /**
124
+ * Returns puppeteer's internal CDP session for the page, or `null` if it is
125
+ * unreachable (e.g., test mocks, puppeteer wrappers that hide the internal API,
126
+ * or a future puppeteer release that renames `_client`).
127
+ *
128
+ * WHY a warning log: callers transparently fall back to textContent-only mode
129
+ * when this returns `null`, which masks a silent perf regression if a
130
+ * puppeteer update removes `_client`. The warning makes the degraded state
131
+ * observable in production logs so a maintainer can patch the access path.
132
+ *
133
+ * Callers fall back to a textContent-only path when this returns `null`.
134
+ * @param page - The Puppeteer page.
135
+ */
136
+ function getInternalCDPClient(page) {
137
+ try {
138
+ const client = page._client?.();
139
+ if (!client) {
140
+ if (!warnedAboutMissingClient) {
141
+ warnedAboutMissingClient = true;
142
+ log('WARN: puppeteer Page._client() returned no session — getAnchorList ' +
143
+ 'falls back to textContent-only mode. Verify the installed puppeteer ' +
144
+ 'version still exposes the internal _client() accessor.');
145
+ }
146
+ return null;
147
+ }
148
+ return client;
149
+ }
150
+ catch (error) {
151
+ if (!warnedAboutMissingClient) {
152
+ warnedAboutMissingClient = true;
153
+ log('WARN: puppeteer Page._client() threw — getAnchorList falls back to ' +
154
+ 'textContent-only mode. Error: %O', error);
155
+ }
156
+ return null;
157
+ }
158
+ }
159
+ /**
160
+ * Fetches the full accessibility tree once and builds a `backendDOMNodeId → accessibleName`
161
+ * map covering every non-ignored AX node that exposes a backend DOM id.
162
+ *
163
+ * WHY include every non-ignored node (not just `role === 'link'`): the original
164
+ * `page.accessibility.snapshot({ root })` returned whatever AX node represented
165
+ * the anchor — including anchors whose computed role was overridden via ARIA
166
+ * (e.g., `<a role="button">`). Mapping every non-ignored node preserves that.
167
+ *
168
+ * WHY skip `ignored === true`: puppeteer's high-level snapshot uses
169
+ * `interestingOnly: true` by default and returns `null` for ignored nodes
170
+ * (aria-hidden, display:none, visibility:hidden). The old code then fell back
171
+ * to `textContent.trim()`. Including ignored nodes here would short-circuit
172
+ * that fallback with the AX tree's empty name and silently drop link text.
173
+ *
174
+ * On timeout or CDP failure, an empty map is returned so callers transparently
175
+ * fall back to `textContent.trim()` for every anchor.
176
+ * @param client - The CDP session attached to the page.
177
+ * @param timeout - Maximum time to wait for the AX tree fetch.
178
+ */
179
+ async function buildAccessibleNameMap(client, timeout) {
180
+ const { result, timeout: timedOut } = await raceWithTimeout(() => client
181
+ .send('Accessibility.getFullAXTree')
182
+ .then((res) => res)
183
+ .catch((error) => {
184
+ log('Accessibility.getFullAXTree failed: %O', error);
185
+ return null;
186
+ }), timeout);
187
+ const map = new Map();
188
+ if (timedOut) {
189
+ log('Accessibility.getFullAXTree timed out after %dms', timeout);
190
+ return map;
191
+ }
192
+ if (!result?.nodes) {
193
+ return map;
194
+ }
195
+ for (const node of result.nodes) {
196
+ if (node.backendDOMNodeId == null || node.ignored === true) {
197
+ continue;
198
+ }
199
+ const name = typeof node.name?.value === 'string' ? node.name.value : '';
200
+ map.set(node.backendDOMNodeId, name);
201
+ }
202
+ return map;
203
+ }
204
+ /**
205
+ * Resolves a CDP backend node id for a given element handle.
206
+ *
207
+ * Wrapped in {@link raceWithTimeout} so a single hung `DOM.describeNode` cannot
208
+ * stall the outer `Promise.all` over every anchor on the page.
209
+ * @param client - The CDP session attached to the page (must be the same session
210
+ * that owns the handle's `objectId`).
211
+ * @param objectId - The remote object id of the element handle.
212
+ * @param timeout - Maximum time to wait for the describeNode call.
213
+ * @returns The backend node id, or `null` if unavailable / timed out / failed.
214
+ */
215
+ async function resolveBackendNodeId(client, objectId, timeout) {
216
+ const { result, timeout: timedOut } = await raceWithTimeout(() => client
217
+ .send('DOM.describeNode', { objectId })
218
+ .then((res) => res)
219
+ .catch(() => null), timeout);
220
+ if (timedOut || !result) {
221
+ return null;
222
+ }
223
+ return result.node?.backendNodeId ?? null;
224
+ }
111
225
  /**
112
226
  * Extracts all anchor (`<a>` and `<area>`) elements with `href` attributes from the page.
113
227
  *
@@ -115,104 +229,348 @@ export async function getImageList(page, viewportWidth, timeout = DEFAULT_DOM_EV
115
229
  * the accessible name (from the accessibility tree, falling back to `textContent`),
116
230
  * and filters out non-HTTP links.
117
231
  *
118
- * WHY this keeps per-element CDP calls (unlike {@link getMeta} / {@link getImageList}):
119
- * the accessible name comes from Chrome's computed accessibility tree
120
- * (`page.accessibility.snapshot`), which is a CDP-only feature unavailable to in-page
121
- * DOM APIs. Each {@link getProp} read is still bounded by `timeout`.
232
+ * WHY Strategy F (single AX-tree fetch + parallel `DOM.describeNode`): the old
233
+ * implementation called `page.accessibility.snapshot({ root })` per anchor, which
234
+ * triggers a CDP round-trip *and* a Chrome-side AX subtree computation (~42ms
235
+ * each). On a page with 1181 anchors that compounded to ~53s. By fetching the
236
+ * full AX tree once and using `DOM.describeNode` in parallel to map element
237
+ * handles back to AX nodes by `backendDOMNodeId`, the same data is collected in
238
+ * ~150ms on the same page — a ~350× speed-up while preserving the original
239
+ * accessible-name semantics. See issue #876 for measurements.
240
+ *
241
+ * WHY the whole operation is wrapped in `raceWithTimeout`: even with bounded
242
+ * per-CDP-call timeouts, a degenerate page (blocked main thread, thousands of
243
+ * anchors, runaway describeNode latency) could chain enough sub-timeouts to
244
+ * exceed the caller's `timeout` budget. The outer race guarantees the function
245
+ * returns within `timeout` (measured after the initial `page.$$` query, which is not raced), surfacing whatever anchors were collected so far so
245
+ * the upstream scrape phase can continue rather than tripping its retryable timeout.
122
247
  * @param page - The Puppeteer page to extract anchors from.
123
248
  * @param options - Optional URL parsing options (e.g., `disableQueries`).
124
- * @param timeout - Timeout in ms per property read. Defaults to {@link DEFAULT_DOM_EVALUATION_TIMEOUT}.
249
+ * @param timeout - Total time budget in ms for the whole extraction. Defaults to {@link DEFAULT_DOM_EVALUATION_TIMEOUT}.
125
250
  * @returns An array of {@link AnchorData} objects for all HTTP(S) links found on the page.
126
251
  */
127
252
  export async function getAnchorList(page, options, timeout = DEFAULT_DOM_EVALUATION_TIMEOUT) {
128
253
  log('Getting anchors');
129
254
  const $anchors = await page.$$('a[href], area[href]');
130
- const anchorList = [];
131
- for (const $anchor of $anchors) {
132
- const $href = await getProp({ $el: $anchor, propName: 'href', fallback: '' }, timeout);
133
- const hrefVal = $href.toString();
134
- const href = parseUrl(hrefVal, options);
255
+ if ($anchors.length === 0) {
256
+ log('Got 0 anchors');
257
+ return [];
258
+ }
259
+ const collected = [];
260
+ let axHits = 0;
261
+ let textFallbacks = 0;
262
+ // Set after the overall race trips so in-flight `resolveAnchor` calls can
263
+ // short-circuit instead of continuing to consume CDP capacity and pushing
264
+ // late entries into the already-returned `collected` array.
265
+ let cancelled = false;
266
+ const work = async () => {
267
+ const client = getInternalCDPClient(page);
268
+ if (cancelled)
269
+ return;
270
+ const nameByBackendId = client
271
+ ? await buildAccessibleNameMap(client, timeout)
272
+ : new Map();
273
+ if (cancelled)
274
+ return;
275
+ await Promise.all($anchors.map(async ($anchor) => {
276
+ if (cancelled)
277
+ return;
278
+ const resolved = await resolveAnchor($anchor, client, nameByBackendId, options, timeout);
279
+ if (cancelled || !resolved) {
280
+ return;
281
+ }
282
+ if (resolved.source === 'ax') {
283
+ axHits++;
284
+ }
285
+ else {
286
+ textFallbacks++;
287
+ }
288
+ collected.push(resolved.anchor);
289
+ }));
290
+ };
291
+ const { timeout: timedOut } = await raceWithTimeout(work, timeout);
292
+ cancelled = true;
293
+ if (timedOut) {
294
+ log('getAnchorList timed out after %dms; returning %d anchors collected so far', timeout, collected.length);
295
+ }
296
+ // Snapshot so post-return mutations from any in-flight Promise.all callback
297
+ // (already gated by `cancelled`, but not synchronously cancellable) cannot
298
+ // alter the array the caller now holds.
299
+ const result = [...collected];
300
+ log('Got %d anchors (%d via AX, %d via textContent)', result.length, axHits, textFallbacks);
301
+ dLog('Anchors are: %O', result.map((a) => a.href.href));
302
+ return result;
303
+ }
304
+ /**
305
+ * Resolves a single anchor handle into an {@link AnchorData} entry, or `null`
306
+ * if the anchor's href is not an HTTP(S) URL.
307
+ *
308
+ * Fires `getProp(href)` and `DOM.describeNode` in parallel, then looks up the
309
+ * accessible name from the pre-built AX map. If the anchor is not represented
310
+ * in the AX map (or CDP is unavailable), falls back to a lazy `textContent`
311
+ * fetch — only paying the extra CDP round-trip when actually needed.
312
+ * @param $anchor - The Puppeteer element handle for an anchor element.
313
+ * @param client - The shared CDP session, or `null` if unavailable.
314
+ * @param nameByBackendId - Map from `backendDOMNodeId` to accessible name.
315
+ * @param options - URL parsing options.
316
+ * @param timeout - Per-CDP-call timeout in ms.
317
+ * @returns The resolved anchor along with the name source, or `null` when the
318
+ * anchor's href is not crawlable.
319
+ */
320
+ async function resolveAnchor($anchor, client, nameByBackendId, options, timeout) {
321
+ try {
322
+ const objectId = $anchor.remoteObject().objectId;
323
+ const [hrefVal, backendNodeId] = await Promise.all([
324
+ getProp({ $el: $anchor, propName: 'href', fallback: '' }, timeout),
325
+ client && objectId != null
326
+ ? resolveBackendNodeId(client, objectId, timeout)
327
+ : Promise.resolve(null),
328
+ ]);
329
+ const href = parseUrl(hrefVal.toString(), options);
135
330
  if (!href || !href.isHTTP) {
136
- continue;
331
+ return null;
332
+ }
333
+ const axName = backendNodeId == null ? undefined : nameByBackendId.get(backendNodeId);
334
+ if (axName !== undefined) {
335
+ return { anchor: { href, textContent: axName }, source: 'ax' };
137
336
  }
138
- const axNode = await page.accessibility.snapshot({ root: $anchor });
139
337
  const textContent = await getProp({ $el: $anchor, propName: 'textContent', fallback: '' }, timeout);
140
- const accessibleName = axNode ? axNode.name || '' : textContent.trim();
141
- const link = {
142
- href,
143
- textContent: accessibleName,
144
- };
145
- anchorList.push(link);
146
- }
147
- log('Got %d anchors', anchorList.length);
148
- dLog('Anchors are: %O', anchorList.map((a) => a.href.href));
149
- return anchorList;
338
+ return { anchor: { href, textContent: textContent.trim() }, source: 'text' };
339
+ }
340
+ catch (error) {
341
+ // `remoteObject()` (and other synchronous handle accesses) can throw when
342
+ // the handle is disposed (page navigated mid-extraction). Drop just this
343
+ // anchor rather than poisoning the Promise.all over every other anchor.
344
+ dLog('resolveAnchor failed for an anchor: %O', error);
345
+ return null;
346
+ }
150
347
  }
348
+ const WINDOW_GLOBALS_TO_CHECK = [
349
+ 'dataLayer',
350
+ 'gtag',
351
+ 'ga',
352
+ '_gaq',
353
+ 'fbq',
354
+ '_fbq',
355
+ 'clarity',
356
+ '_hjSettings',
357
+ '_hjid',
358
+ 'twq',
359
+ 'ttq',
360
+ '_linkedin_partner_id',
361
+ 'pintrk',
362
+ 'amplitude',
363
+ 'mixpanel',
364
+ 'analytics',
365
+ 'heap',
366
+ 'posthog',
367
+ 'plausible',
368
+ 'fathom',
369
+ '_paq',
370
+ 's_account',
371
+ 's',
372
+ 'ym',
373
+ 'UET',
374
+ 'optimizely',
375
+ '_hsq',
376
+ 'Sentry',
377
+ 'Intercom',
378
+ 'intercomSettings',
379
+ 'drift',
380
+ 'Tawk_API',
381
+ 'zE',
382
+ 'OneTrust',
383
+ 'Cookiebot',
384
+ 'Stripe',
385
+ 'grecaptcha',
386
+ ];
151
387
  /**
152
- * Extracts comprehensive meta information from the page's `<head>`.
153
- *
154
- * Collects all metadata in a single `page.evaluate` call (14 CDP round-trips
155
- * collapsed into 1) wrapped in {@link raceWithTimeout}. On timeout (an unresponsive
156
- * page) a minimal `{ title: '' }` is returned rather than hanging.
157
- *
158
- * Collected metadata:
159
- * - `title` - The document title.
160
- * - `lang` - The `lang` attribute of the `<html>` element.
161
- * - `description` - The `<meta name="description">` content.
162
- * - `keywords` - The `<meta name="keywords">` content.
163
- * - `noindex` / `nofollow` / `noarchive` - Parsed from the `<meta name="robots">` directives.
164
- * - `canonical` - The `<link rel="canonical">` content.
165
- * - `alternate` - The `<link rel="alternate">` content.
166
- * - Open Graph tags: `og:type`, `og:title`, `og:site_name`, `og:description`, `og:url`, `og:image`.
167
- * - `twitter:card` - The Twitter Card type.
168
- * @param page - The Puppeteer page to extract meta information from.
169
- * @param timeout - Timeout in ms for the evaluation. Defaults to {@link DEFAULT_DOM_EVALUATION_TIMEOUT}.
170
- * @returns An object containing all extracted meta properties.
388
+ * Extracts comprehensive metadata from the page.
389
+ *
390
+ * Two passes run back to back (only the head collection and the HTML fetch overlap):
391
+ * 1. Browser-side `collectHead()` serializes every `<meta>`, `<link>`,
392
+ * relevant `<script>`, `<base>`, `<noscript>`/`<iframe>` and a curated
393
+ * set of `window` globals into a `RawHeadEntry[]`. Node-side `classify()`
394
+ * then maps those entries to typed `Meta` fields using the lookup tables
395
+ * in `./meta/keys.ts`, with unknown entries preserved in `Meta.others`.
396
+ * 2. `detectTags()` runs `simple-wappalyzer` over the page HTML to produce
397
+ * `Meta.tags` (technology detection + real-ID extraction).
398
+ *
399
+ * The whole call is wrapped in `raceWithTimeout`. On timeout or failure an empty `Meta`
400
+ * (with `title: ''` and empty required arrays/objects) is returned.
401
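+ * @param page - The Puppeteer page to extract metadata from.
402
+ * @param context - Page context: `url`, plus optional `html` (fetched via `page.content()` when omitted), `statusCode`, `headers` and `includeRaw`.
403
+ * @param timeout - Timeout in ms for the whole extraction. Defaults to {@link DEFAULT_DOM_EVALUATION_TIMEOUT}.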
404
+ * @example
405
+ * ```ts
406
+ * const meta = await getMeta(page, {
407
+ * url: 'https://example.com/',
408
+ * html: await page.content(),
409
+ * statusCode: response.status,
410
+ * headers: response.headers,
411
+ * });
412
+ * console.log(meta.title); // <title> text
413
+ * console.log(meta.og?.image); // og:image[] array
414
+ * console.log(meta.robots?.noindex); // parsed robots
415
+ * console.log(meta.tags.detected.Analytics); // Wappalyzer hits
416
+ * console.log(meta.tags.entries.find(e => e.provider === 'Google Analytics')?.id);
417
+ * ```
171
418
  */
172
- export async function getMeta(page, timeout = DEFAULT_DOM_EVALUATION_TIMEOUT) {
419
+ export async function getMeta(page, context, timeout = DEFAULT_DOM_EVALUATION_TIMEOUT) {
173
420
  log('Getting Meta');
174
- const { result, timeout: timedOut } = await raceWithTimeout(() => page
175
- .evaluate(() => {
176
- /* global document, HTMLMetaElement, HTMLLinkElement */
177
- const content = (selector) => {
178
- const el = document.querySelector(selector);
179
- return el instanceof HTMLMetaElement ? el.content : '';
180
- };
181
- const linkHref = (selector) => {
182
- const el = document.querySelector(selector);
183
- return el instanceof HTMLLinkElement ? el.href : '';
184
- };
185
- return {
186
- title: document.title,
187
- lang: document.documentElement.lang,
188
- description: content('meta[name="description"]'),
189
- keywords: content('meta[name="keywords"]'),
190
- robots: content('meta[name="robots"]'),
191
- canonical: linkHref('link[rel="canonical"]'),
192
- alternate: linkHref('link[rel="alternate"]'),
193
- 'og:type': content('meta[property="og:type"]'),
194
- 'og:title': content('meta[property="og:title"]'),
195
- 'og:site_name': content('meta[property="og:site_name"]'),
196
- 'og:description': content('meta[property="og:description"]'),
197
- 'og:url': content('meta[property="og:url"]'),
198
- 'og:image': content('meta[property="og:image"]'),
199
- 'twitter:card': content('meta[name="twitter:card"]'),
200
- };
201
- })
202
- .catch(() => null), timeout);
421
+ const { result, timeout: timedOut } = await raceWithTimeout(() => runGetMeta(page, context), timeout);
203
422
  if (timedOut || result == null) {
204
423
  log('Meta extraction timed out or failed; returning fallback');
205
- return { title: '' };
206
- }
207
- const { robots: robotsVal, ...rest } = result;
208
- const robots = new Set(robotsVal.split(',').map((robot) => robot.trim().toLowerCase()));
209
- const meta = {
210
- ...rest,
211
- noindex: robots.has('noindex'),
212
- nofollow: robots.has('nofollow'),
213
- noarchive: robots.has('noarchive'),
214
- };
424
+ return emptyMeta();
425
+ }
215
426
  log('Got meta');
216
- dLog('Meta data are: %O', meta);
217
- return meta;
427
+ dLog('Meta data are: %O', result);
428
+ return result;
429
+ }
430
+ /**
431
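+ * Runs the metadata pipeline: collects raw head entries and the page HTML in parallel, then runs `detectTags` and `classify`. Resolves to `null` if any step throws.
432
+ * @param page - The Puppeteer page to read from.
433
+ * @param context - Supplies `url` and, optionally, `html`, `statusCode`, `headers` and `includeRaw`.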
434
+ */
435
+ async function runGetMeta(page, context) {
436
+ try {
437
+ const rawPromise = collectHeadOnPage(page);
438
+ const htmlPromise = context.html === undefined
439
+ ? page.content().catch(() => '')
440
+ : Promise.resolve(context.html);
441
+ const [raw, html] = await Promise.all([rawPromise, htmlPromise]);
442
+ const tags = await detectTags({
443
+ url: context.url,
444
+ html,
445
+ ...(context.statusCode === undefined ? {} : { statusCode: context.statusCode }),
446
+ ...(context.headers === undefined ? {} : { headers: context.headers }),
447
+ });
448
+ return classify(raw, {
449
+ tags,
450
+ ...(context.includeRaw ? { includeRaw: true } : {}),
451
+ });
452
+ }
453
+ catch (error) {
454
+ log('runGetMeta failed: %O', error);
455
+ return null;
456
+ }
457
+ }
458
+ /**
459
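+ * Serializes the `<html>` attributes, the title, and every `<base>`, `<meta>`, `<link href>`, structured-data `<script>`, `<iframe src>` and present known `window` global in a single `page.evaluate`; resolves to `[]` if the evaluation fails.
460
+ * @param page - The Puppeteer page to evaluate in.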
461
+ */
462
+ async function collectHeadOnPage(page) {
463
+ const raw = await page
464
+ .evaluate((knownGlobals) => {
465
+ const entries = [];
466
+ const html = document.documentElement;
467
+ entries.push({
468
+ kind: 'html',
469
+ lang: html.lang || undefined,
470
+ dir: html.dir || undefined,
471
+ xmlns: html.getAttribute('xmlns') ?? undefined,
472
+ prefix: html.getAttribute('prefix') ?? undefined,
473
+ vocab: html.getAttribute('vocab') ?? undefined,
474
+ typeOf: html.getAttribute('typeof') ?? undefined,
475
+ itemscope: html.hasAttribute('itemscope') || undefined,
476
+ itemtype: html.getAttribute('itemtype') ?? undefined,
477
+ amp: html.hasAttribute('amp') || undefined,
478
+ lightning: html.hasAttribute('⚡') || undefined,
479
+ }, { kind: 'title', content: document.title });
480
+ for (const base of document.querySelectorAll('base')) {
481
+ if (!(base instanceof HTMLBaseElement))
482
+ continue;
483
+ entries.push({
484
+ kind: 'base',
485
+ href: base.getAttribute('href') ?? undefined,
486
+ target: base.getAttribute('target') ?? undefined,
487
+ });
488
+ }
489
+ for (const meta of document.querySelectorAll('meta')) {
490
+ if (!(meta instanceof HTMLMetaElement))
491
+ continue;
492
+ const name = meta.getAttribute('name');
493
+ const property = meta.getAttribute('property');
494
+ const httpEquiv = meta.getAttribute('http-equiv');
495
+ const itemprop = meta.getAttribute('itemprop');
496
+ const charset = meta.getAttribute('charset');
497
+ const content = meta.getAttribute('content');
498
+ const media = meta.getAttribute('media');
499
+ entries.push({
500
+ kind: 'meta',
501
+ name: name ? name.toLowerCase() : undefined,
502
+ property: property ? property.toLowerCase() : undefined,
503
+ httpEquiv: httpEquiv ? httpEquiv.toLowerCase() : undefined,
504
+ itemprop: itemprop ?? undefined,
505
+ charset: charset ?? undefined,
506
+ content: content ?? undefined,
507
+ media: media ?? undefined,
508
+ });
509
+ }
510
+ for (const link of document.querySelectorAll('link[href]')) {
511
+ if (!(link instanceof HTMLLinkElement))
512
+ continue;
513
+ const relRaw = link.getAttribute('rel') ?? '';
514
+ const rel = relRaw.toLowerCase().split(/\s+/u).filter(Boolean);
515
+ entries.push({
516
+ kind: 'link',
517
+ rel,
518
+ href: link.getAttribute('href') ?? '',
519
+ type: link.getAttribute('type') ?? undefined,
520
+ media: link.getAttribute('media') ?? undefined,
521
+ sizes: link.getAttribute('sizes') ?? undefined,
522
+ title: link.getAttribute('title') ?? undefined,
523
+ hreflang: link.getAttribute('hreflang') ?? undefined,
524
+ as: link.getAttribute('as') ?? undefined,
525
+ crossorigin: link.getAttribute('crossorigin') ?? undefined,
526
+ color: link.getAttribute('color') ?? undefined,
527
+ blocking: link.getAttribute('blocking') ?? undefined,
528
+ imagesrcset: link.getAttribute('imagesrcset') ?? undefined,
529
+ });
530
+ }
531
+ const STRUCTURED_TYPES = new Set([
532
+ 'application/ld+json',
533
+ 'speculationrules',
534
+ 'application/json+oembed',
535
+ 'application/xml+oembed',
536
+ ]);
537
+ for (const script of document.querySelectorAll('script[type]')) {
538
+ if (!(script instanceof HTMLScriptElement))
539
+ continue;
540
+ const scriptType = (script.getAttribute('type') ?? '').toLowerCase();
541
+ if (!STRUCTURED_TYPES.has(scriptType))
542
+ continue;
543
+ const src = script.getAttribute('src') ?? undefined;
544
+ const text = script.textContent ?? '';
545
+ const inHead = !!script.closest('head');
546
+ const inNoscript = !!script.closest('noscript');
547
+ const location = inHead ? 'head' : inNoscript ? 'noscript' : 'body';
548
+ entries.push({
549
+ kind: 'script',
550
+ scriptType,
551
+ content: text || undefined,
552
+ src,
553
+ location,
554
+ });
555
+ }
556
+ for (const iframe of document.querySelectorAll('iframe[src]')) {
557
+ if (!(iframe instanceof HTMLIFrameElement))
558
+ continue;
559
+ const src = iframe.getAttribute('src') ?? '';
560
+ if (!src)
561
+ continue;
562
+ const inHead = !!iframe.closest('head');
563
+ const inNoscript = !!iframe.closest('noscript');
564
+ const location = inHead ? 'head' : inNoscript ? 'noscript' : 'body';
565
+ entries.push({ kind: 'iframe', src, location });
566
+ }
567
+ const win = window;
568
+ const presentGlobals = knownGlobals.filter((name) => win[name] !== undefined);
569
+ if (presentGlobals.length > 0) {
570
+ entries.push({ kind: 'window-global', names: presentGlobals });
571
+ }
572
+ return entries;
573
+ }, WINDOW_GLOBALS_TO_CHECK)
574
+ .catch(() => []);
575
+ return raw;
218
576
  }
package/dist/index.d.ts CHANGED
@@ -18,4 +18,4 @@ export { detectCDN } from '@d-zero/shared/detect-cdn';
18
18
  export type { CDNType } from '@d-zero/shared/detect-cdn';
19
19
  export type { ScrapeResult, ResourceEntry, PageData } from './types.js';
20
20
  export type { ScraperOptions, ChangePhaseEvent, ScraperEventTypes } from './types.js';
21
- export type { Resource, AnchorData, Meta, ImageElement, SkippedPageData, NetworkLog, } from './types.js';
21
+ export type { Resource, AnchorData, Meta, ImageElement, SkippedPageData, NetworkLog, OpenGraphMeta, OgArticleMeta, OgBookMeta, OgProfileMeta, OgMusicMeta, OgVideoNsMeta, TwitterMeta, FbMeta, FediverseMeta, AppleMeta, MsApplicationMeta, VerificationMeta, GoogleMeta, GeoMeta, CitationMeta, RdfaMeta, MicrodataMeta, AmpMeta, LegacyMeta, MobileMeta, MicroformatsMeta, PinterestMeta, SlackMeta, LinkedInMeta, ExperimentalMeta, WikiMeta, LinkMeta, LinkEntry, JsonLdEntry, OthersBucket, ScriptEntry, IframeEntry, TagsMeta, TagDetail, TagEntry, TagSource, ViewportMeta, RobotsMeta, ReferrerMeta, FormatDetectionMeta, HttpEquivMeta, HttpEquivRefresh, RawHeadEntry, } from './types.js';
@@ -0,0 +1,52 @@
1
+ /**
2
+ * Pure-function classifier that turns `RawHeadEntry[]` (collected on the browser
3
+ * side by `collectHead`) into a typed `Meta` object.
4
+ *
5
+ * The classifier is the **only place** where dot-paths from `keys.ts` get
6
+ * resolved against the `Meta` shape. Parsers (viewport/robots/refresh/etc.)
7
+ * are dispatched on the fly for the few entries that need value normalization.
8
+ *
9
+ * Unknown entries (names/properties/rels not in the lookup tables) are
10
+ * preserved in {@link Meta.others} so consumers never lose information.
11
+ * @module
12
+ */
13
+ import type { Meta, RawHeadEntry, TagsMeta } from './types.js';
14
+ /**
15
+ * Options for {@link classify}.
16
+ */
17
+ export type ClassifyOptions = {
18
+ /**
19
+ * When `true`, copies the input `raw` entries onto `Meta._raw` for debugging.
20
+ * Default `false` to keep the serialized `Meta` small.
21
+ */
22
+ readonly includeRaw?: boolean;
23
+ /**
24
+ * Pre-computed `TagsMeta` from `tag-detection.ts`. When omitted, an empty
25
+ * `TagsMeta` (with `detected: {}` and `entries: []`) is used.
26
+ */
27
+ readonly tags?: TagsMeta;
28
+ };
29
+ /**
30
+ * Builds the empty `Meta` skeleton with all required fields initialized.
31
+ */
32
+ /** Returns a fresh `Meta` skeleton with all required fields initialized. */
33
+ export declare function emptyMeta(): Meta;
34
+ /**
35
+ * Writes `value` to `target` along `dotPath`. Intermediate objects are created
36
+ * on demand. When `multi` is `true`, the leaf is treated as an array and `value`
37
+ * is appended; otherwise the first assignment wins (subsequent calls are no-ops).
38
+ *
39
+ * Exported for the unit tests in `classify.spec.ts`.
40
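+ * @param target - The object to write into; intermediate objects are created on demand.
41
+ * @param dotPath - The dot-path locating the leaf inside `target`.
42
+ * @param value - The value to assign (or append when `multi` is `true`).
43
+ * @param multi - When `true`, the leaf is an array and `value` is appended; otherwise the first assignment wins.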
44
+ */
45
+ export declare function setByPath(target: Record<string, unknown>, dotPath: string, value: unknown, multi: boolean): void;
46
+ /**
47
+ * Top-level classifier. Takes a list of raw entries collected from the page
48
+ * and produces a populated `Meta`.
49
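+ * @param raw - The raw head entries collected from the page.
50
+ * @param options - Optional {@link ClassifyOptions} (`includeRaw`, `tags`).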
51
+ */
52
+ export declare function classify(raw: readonly RawHeadEntry[], options?: ClassifyOptions): Meta;