@d-zero/beholder 3.0.0 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -3,6 +3,12 @@
3
3
  All notable changes to this project will be documented in this file.
4
4
  See [Conventional Commits](https://conventionalcommits.org) for commit guidelines.
5
5
 
6
+ # [3.1.0](https://github.com/d-zero-dev/tools/compare/@d-zero/beholder@3.0.0...@d-zero/beholder@3.1.0) (2026-06-17)
7
+
8
+ ### Features
9
+
10
+ - **beholder:** expose extractMetaFromDocument for jsdom-backed meta extraction ([a56e21c](https://github.com/d-zero-dev/tools/commit/a56e21c17dcc1e542595a596074c5d8e659c1168))
11
+
6
12
  # [3.0.0](https://github.com/d-zero-dev/tools/compare/@d-zero/beholder@2.1.6...@d-zero/beholder@3.0.0) (2026-06-16)
7
13
 
8
14
  ### Bug Fixes
package/README.md CHANGED
@@ -32,3 +32,29 @@ if (result.type === 'success') {
32
32
  ```
33
33
 
34
34
  設計判断(イベントではなく戻り値で返す理由、`page` のライフサイクル責務、リトライ機構など)は `src/scraper.ts` の JSDoc を参照。
35
+
36
+ ## DOM 文字列からメタ抽出(Puppeteer なし)
37
+
38
+ HTML 文字列を jsdom などでパースしてから `Meta` を取り出したい場合、`extractMetaFromDocument` を使う。`Scraper` が内部で呼ぶ `collectHead → detectTags → classify` パイプラインと同じ実装を再利用するため、戻り値の `Meta` 形状は `scrapeStart` と同一。DOM ライブラリ(jsdom 等)はユーザランドの責務。
39
+
40
+ ```ts
41
+ import { extractMetaFromDocument } from '@d-zero/beholder';
42
+ import { JSDOM } from 'jsdom';
43
+
44
+ const url = 'https://example.com/';
45
+ const html = await (await fetch(url)).text();
46
+ const dom = new JSDOM(html, { url });
47
+
48
+ // `as unknown as Window` は jsdom の `DOMWindow` 型が lib.dom の `Window` と
49
+ // 構造的に完全一致しないための型キャスト。ランタイムでは互換。
50
+ const meta = await extractMetaFromDocument(dom.window as unknown as Window, {
51
+ url,
52
+ html,
53
+ });
54
+
55
+ console.log(meta.title);
56
+ console.log(meta.og?.image);
57
+ console.log(meta.tags.entries);
58
+ ```
59
+
60
+ `context.html` を省略すると `window.document.documentElement.outerHTML` がフォールバックとして使われる。ただし Wappalyzer の HTML パターンはスクリプト実行前の生 HTML に合わせて作られているので、可能なら取得直後の HTML 文字列を明示的に渡す方が検出が安定する。
@@ -16,6 +16,7 @@
16
16
  import { raceWithTimeout } from '@d-zero/shared/race-with-timeout';
17
17
  import { domDetailsLog, domLog } from './debug.js';
18
18
  import { classify, emptyMeta } from './meta/classify.js';
19
+ import { WINDOW_GLOBALS_TO_CHECK, collectHeadFromDocument } from './meta/collect-head.js';
19
20
  import { detectTags } from './meta/tag-detection.js';
20
21
  import { parseUrl } from './parse-url.js';
21
22
  const pid = `${process.pid}`;
@@ -345,45 +346,6 @@ async function resolveAnchor($anchor, client, nameByBackendId, options, timeout)
345
346
  return null;
346
347
  }
347
348
  }
348
- const WINDOW_GLOBALS_TO_CHECK = [
349
- 'dataLayer',
350
- 'gtag',
351
- 'ga',
352
- '_gaq',
353
- 'fbq',
354
- '_fbq',
355
- 'clarity',
356
- '_hjSettings',
357
- '_hjid',
358
- 'twq',
359
- 'ttq',
360
- '_linkedin_partner_id',
361
- 'pintrk',
362
- 'amplitude',
363
- 'mixpanel',
364
- 'analytics',
365
- 'heap',
366
- 'posthog',
367
- 'plausible',
368
- 'fathom',
369
- '_paq',
370
- 's_account',
371
- 's',
372
- 'ym',
373
- 'UET',
374
- 'optimizely',
375
- '_hsq',
376
- 'Sentry',
377
- 'Intercom',
378
- 'intercomSettings',
379
- 'drift',
380
- 'Tawk_API',
381
- 'zE',
382
- 'OneTrust',
383
- 'Cookiebot',
384
- 'Stripe',
385
- 'grecaptcha',
386
- ];
387
349
  /**
388
350
  * Extracts comprehensive metadata from the page.
389
351
  *
@@ -456,121 +418,27 @@ async function runGetMeta(page, context) {
456
418
  }
457
419
  }
458
420
  /**
421
+ * Collects raw `<head>` entries from a Puppeteer page by injecting
422
+ * {@link collectHeadFromDocument} into the page realm.
459
423
  *
460
- * @param page
424
+ * WHY string-eval instead of `page.evaluate(fn, args)`: the shared
425
+ * implementation lives in `./meta/collect-head.js` (`collectHeadFromDocument`), and a
426
+ * `page.evaluate(() => collectHeadFromDocument(window, …))` wrapper cannot
427
+ * reach that module-scope binding inside the page realm — only the wrapper's
428
+ * own source crosses the CDP boundary. Serializing the implementation via
429
+ * `Function.prototype.toString` and invoking it through
430
+ * `page.evaluate(string)` is what keeps the Puppeteer path and the
431
+ * jsdom path on one source of truth.
432
+ *
433
+ * The same {@link collectHeadFromDocument} function is also exposed via
434
+ * {@link ../extract-meta.ts | extractMetaFromDocument} for jsdom/Node callers,
435
+ * so the two paths cannot drift apart.
436
+ * @param page - The Puppeteer page whose document will be inspected.
461
437
  */
462
438
  async function collectHeadOnPage(page) {
463
- const raw = await page
464
- .evaluate((knownGlobals) => {
465
- const entries = [];
466
- const html = document.documentElement;
467
- entries.push({
468
- kind: 'html',
469
- lang: html.lang || undefined,
470
- dir: html.dir || undefined,
471
- xmlns: html.getAttribute('xmlns') ?? undefined,
472
- prefix: html.getAttribute('prefix') ?? undefined,
473
- vocab: html.getAttribute('vocab') ?? undefined,
474
- typeOf: html.getAttribute('typeof') ?? undefined,
475
- itemscope: html.hasAttribute('itemscope') || undefined,
476
- itemtype: html.getAttribute('itemtype') ?? undefined,
477
- amp: html.hasAttribute('amp') || undefined,
478
- lightning: html.hasAttribute('⚡') || undefined,
479
- }, { kind: 'title', content: document.title });
480
- for (const base of document.querySelectorAll('base')) {
481
- if (!(base instanceof HTMLBaseElement))
482
- continue;
483
- entries.push({
484
- kind: 'base',
485
- href: base.getAttribute('href') ?? undefined,
486
- target: base.getAttribute('target') ?? undefined,
487
- });
488
- }
489
- for (const meta of document.querySelectorAll('meta')) {
490
- if (!(meta instanceof HTMLMetaElement))
491
- continue;
492
- const name = meta.getAttribute('name');
493
- const property = meta.getAttribute('property');
494
- const httpEquiv = meta.getAttribute('http-equiv');
495
- const itemprop = meta.getAttribute('itemprop');
496
- const charset = meta.getAttribute('charset');
497
- const content = meta.getAttribute('content');
498
- const media = meta.getAttribute('media');
499
- entries.push({
500
- kind: 'meta',
501
- name: name ? name.toLowerCase() : undefined,
502
- property: property ? property.toLowerCase() : undefined,
503
- httpEquiv: httpEquiv ? httpEquiv.toLowerCase() : undefined,
504
- itemprop: itemprop ?? undefined,
505
- charset: charset ?? undefined,
506
- content: content ?? undefined,
507
- media: media ?? undefined,
508
- });
509
- }
510
- for (const link of document.querySelectorAll('link[href]')) {
511
- if (!(link instanceof HTMLLinkElement))
512
- continue;
513
- const relRaw = link.getAttribute('rel') ?? '';
514
- const rel = relRaw.toLowerCase().split(/\s+/u).filter(Boolean);
515
- entries.push({
516
- kind: 'link',
517
- rel,
518
- href: link.getAttribute('href') ?? '',
519
- type: link.getAttribute('type') ?? undefined,
520
- media: link.getAttribute('media') ?? undefined,
521
- sizes: link.getAttribute('sizes') ?? undefined,
522
- title: link.getAttribute('title') ?? undefined,
523
- hreflang: link.getAttribute('hreflang') ?? undefined,
524
- as: link.getAttribute('as') ?? undefined,
525
- crossorigin: link.getAttribute('crossorigin') ?? undefined,
526
- color: link.getAttribute('color') ?? undefined,
527
- blocking: link.getAttribute('blocking') ?? undefined,
528
- imagesrcset: link.getAttribute('imagesrcset') ?? undefined,
529
- });
530
- }
531
- const STRUCTURED_TYPES = new Set([
532
- 'application/ld+json',
533
- 'speculationrules',
534
- 'application/json+oembed',
535
- 'application/xml+oembed',
536
- ]);
537
- for (const script of document.querySelectorAll('script[type]')) {
538
- if (!(script instanceof HTMLScriptElement))
539
- continue;
540
- const scriptType = (script.getAttribute('type') ?? '').toLowerCase();
541
- if (!STRUCTURED_TYPES.has(scriptType))
542
- continue;
543
- const src = script.getAttribute('src') ?? undefined;
544
- const text = script.textContent ?? '';
545
- const inHead = !!script.closest('head');
546
- const inNoscript = !!script.closest('noscript');
547
- const location = inHead ? 'head' : inNoscript ? 'noscript' : 'body';
548
- entries.push({
549
- kind: 'script',
550
- scriptType,
551
- content: text || undefined,
552
- src,
553
- location,
554
- });
555
- }
556
- for (const iframe of document.querySelectorAll('iframe[src]')) {
557
- if (!(iframe instanceof HTMLIFrameElement))
558
- continue;
559
- const src = iframe.getAttribute('src') ?? '';
560
- if (!src)
561
- continue;
562
- const inHead = !!iframe.closest('head');
563
- const inNoscript = !!iframe.closest('noscript');
564
- const location = inHead ? 'head' : inNoscript ? 'noscript' : 'body';
565
- entries.push({ kind: 'iframe', src, location });
566
- }
567
- const win = window;
568
- const presentGlobals = knownGlobals.filter((name) => win[name] !== undefined);
569
- if (presentGlobals.length > 0) {
570
- entries.push({ kind: 'window-global', names: presentGlobals });
571
- }
572
- return entries;
573
- }, WINDOW_GLOBALS_TO_CHECK)
574
- .catch(() => []);
439
+ const fnSource = collectHeadFromDocument.toString();
440
+ const globalsLiteral = JSON.stringify(WINDOW_GLOBALS_TO_CHECK);
441
+ const expr = `(${fnSource})(window, ${globalsLiteral})`;
442
+ const raw = await page.evaluate(expr).catch(() => []);
575
443
  return raw;
576
444
  }
@@ -0,0 +1,98 @@
1
+ /**
2
+ * Public, Puppeteer-free entry point for extracting {@link Meta} from an
3
+ * already-parsed DOM (e.g. jsdom).
4
+ *
5
+ * WHY this exists alongside `Scraper.scrapeStart()` / `getMeta(page, …)`:
6
+ * callers who already have an HTML string (from `fetch`, a fixture, an
7
+ * archive) should not be forced to spin up Chromium just to read a few `<meta>`
8
+ * tags. This module reuses the same `collectHead → detectTags → classify`
9
+ * pipeline as the Puppeteer path — the `Meta` shape returned here is
10
+ * identical to what `Scraper` produces, so downstream consumers do not branch
11
+ * on the source.
12
+ *
13
+ * See {@link extractMetaFromDocument} for the usage example.
14
+ * @module
15
+ */
16
+ import type { Meta } from './types.js';
17
+ /**
18
+ * Inputs for {@link extractMetaFromDocument}.
19
+ *
20
+ * `url`/`statusCode`/`headers` mirror the inputs to the underlying
21
+ * `simple-wappalyzer` driver. They are not consumed by the DOM-walk side of
22
+ * the pipeline.
23
+ *
24
+ * `html` is optional: when omitted, `document.documentElement.outerHTML` is
25
+ * read off the passed window — matching the fallback `getMeta(page, …)` does
26
+ * via `page.content()`.
27
+ */
28
+ export type ExtractMetaContext = {
29
+ /** The fully resolved URL of the page (used by Wappalyzer + AMP fields). */
30
+ readonly url: string;
31
+ /**
32
+ * HTML source used for technology detection. Defaults to
33
+ * `window.document.documentElement.outerHTML` when omitted.
34
+ *
35
+ * WHY allow override: callers that fetched the raw HTML string from the
36
+ * network already have the *pre-script-execution* markup, which is what
37
+ * Wappalyzer's HTML patterns are tuned for. The serialized DOM from
38
+ * `outerHTML` reflects whatever scripts have already mutated; provide the
39
+ * raw string to get more stable detections.
40
+ */
41
+ readonly html?: string;
42
+ /** HTTP status code, surfaced to the Wappalyzer driver. */
43
+ readonly statusCode?: number;
44
+ /**
45
+ * Response headers; case is preserved by the caller, lowercased internally
46
+ * by `detectTags`.
47
+ */
48
+ readonly headers?: Record<string, string | string[] | undefined>;
49
+ /**
50
+ * When `true`, the returned `Meta` includes `_raw: RawHeadEntry[]` for
51
+ * debugging. Default `false` to keep the serialized payload small.
52
+ */
53
+ readonly includeRaw?: boolean;
54
+ };
55
+ /**
56
+ * Extracts a `Meta` object from a DOM provided by the caller.
57
+ *
58
+ * Pipeline:
59
+ *
60
+ * 1. {@link collectHeadFromDocument} walks `window.document` and returns a
61
+ * serializable `RawHeadEntry[]`.
62
+ * 2. {@link detectTags} runs `simple-wappalyzer` over the HTML + headers to
63
+ * detect third-party technologies.
64
+ * 3. {@link classify} folds the two signals together into a typed `Meta`.
65
+ *
66
+ * Step (1) is synchronous and runs first; step (2) is awaited next. The two
67
+ * are independent in principle, but the current shape is sequential — keeping
68
+ * it that way avoids forcing the synchronous DOM walk into a microtask just to
69
+ * gain a few milliseconds of overlap with the Wappalyzer call.
70
+ * @param window - The window whose `document` will be walked. jsdom's
71
+ * `dom.window` works; pass any object satisfying the `Window`
72
+ * type. The function never mutates the document.
73
+ * @param context - URL / HTML / headers / status code context. See
74
+ * {@link ExtractMetaContext}.
75
+ * @returns The extracted `Meta` (always defined; empty fields stay empty).
76
+ * @example
77
+ * ```ts
78
+ * import { JSDOM } from 'jsdom';
79
+ * import { extractMetaFromDocument } from '@d-zero/beholder';
80
+ *
81
+ * const url = 'https://example.com/';
82
+ * const html = await (await fetch(url)).text();
83
+ * const dom = new JSDOM(html, { url });
84
+ *
85
+ * // The `as unknown as Window` cast is needed because jsdom's `DOMWindow` is
86
+ * // not structurally identical to lib.dom's `Window` (a few rare globals
87
+ * // differ), but the runtime shape is compatible for this function's needs.
88
+ * const meta = await extractMetaFromDocument(dom.window as unknown as Window, {
89
+ * url,
90
+ * html,
91
+ * });
92
+ *
93
+ * meta.title; // <title>
94
+ * meta.og?.image; // og:image[]
95
+ * meta.tags.entries; // Wappalyzer detections + extracted IDs
96
+ * ```
97
+ */
98
+ export declare function extractMetaFromDocument(window: Window, context: ExtractMetaContext): Promise<Meta>;
@@ -0,0 +1,75 @@
1
+ /**
2
+ * Public, Puppeteer-free entry point for extracting {@link Meta} from an
3
+ * already-parsed DOM (e.g. jsdom).
4
+ *
5
+ * WHY this exists alongside `Scraper.scrapeStart()` / `getMeta(page, …)`:
6
+ * callers who already have an HTML string (from `fetch`, a fixture, an
7
+ * archive) should not be forced to spin up Chromium just to read a few `<meta>`
8
+ * tags. This module reuses the same `collectHead → detectTags → classify`
9
+ * pipeline as the Puppeteer path — the `Meta` shape returned here is
10
+ * identical to what `Scraper` produces, so downstream consumers do not branch
11
+ * on the source.
12
+ *
13
+ * See {@link extractMetaFromDocument} for the usage example.
14
+ * @module
15
+ */
16
+ import { classify } from './meta/classify.js';
17
+ import { collectHeadFromDocument, WINDOW_GLOBALS_TO_CHECK } from './meta/collect-head.js';
18
+ import { detectTags } from './meta/tag-detection.js';
19
+ /**
20
+ * Extracts a `Meta` object from a DOM provided by the caller.
21
+ *
22
+ * Pipeline:
23
+ *
24
+ * 1. {@link collectHeadFromDocument} walks `window.document` and returns a
25
+ * serializable `RawHeadEntry[]`.
26
+ * 2. {@link detectTags} runs `simple-wappalyzer` over the HTML + headers to
27
+ * detect third-party technologies.
28
+ * 3. {@link classify} folds the two signals together into a typed `Meta`.
29
+ *
30
+ * Step (1) is synchronous and runs first; step (2) is awaited next. The two
31
+ * are independent in principle, but the current shape is sequential — keeping
32
+ * it that way avoids forcing the synchronous DOM walk into a microtask just to
33
+ * gain a few milliseconds of overlap with the Wappalyzer call.
34
+ * @param window - The window whose `document` will be walked. jsdom's
35
+ * `dom.window` works; pass any object satisfying the `Window`
36
+ * type. The function never mutates the document.
37
+ * @param context - URL / HTML / headers / status code context. See
38
+ * {@link ExtractMetaContext}.
39
+ * @returns The extracted `Meta` (always defined; empty fields stay empty).
40
+ * @example
41
+ * ```ts
42
+ * import { JSDOM } from 'jsdom';
43
+ * import { extractMetaFromDocument } from '@d-zero/beholder';
44
+ *
45
+ * const url = 'https://example.com/';
46
+ * const html = await (await fetch(url)).text();
47
+ * const dom = new JSDOM(html, { url });
48
+ *
49
+ * // The `as unknown as Window` cast is needed because jsdom's `DOMWindow` is
50
+ * // not structurally identical to lib.dom's `Window` (a few rare globals
51
+ * // differ), but the runtime shape is compatible for this function's needs.
52
+ * const meta = await extractMetaFromDocument(dom.window as unknown as Window, {
53
+ * url,
54
+ * html,
55
+ * });
56
+ *
57
+ * meta.title; // <title>
58
+ * meta.og?.image; // og:image[]
59
+ * meta.tags.entries; // Wappalyzer detections + extracted IDs
60
+ * ```
61
+ */
62
+ export async function extractMetaFromDocument(window, context) {
63
+ const raw = collectHeadFromDocument(window, WINDOW_GLOBALS_TO_CHECK);
64
+ const html = context.html ?? window.document.documentElement.outerHTML;
65
+ const tags = await detectTags({
66
+ url: context.url,
67
+ html,
68
+ ...(context.statusCode === undefined ? {} : { statusCode: context.statusCode }),
69
+ ...(context.headers === undefined ? {} : { headers: context.headers }),
70
+ });
71
+ return classify(raw, {
72
+ tags,
73
+ ...(context.includeRaw ? { includeRaw: true } : {}),
74
+ });
75
+ }
package/dist/index.d.ts CHANGED
@@ -12,6 +12,8 @@
12
12
  */
13
13
  export { default as default } from './scraper.js';
14
14
  export { isError } from './is-error.js';
15
+ export { extractMetaFromDocument } from './extract-meta.js';
16
+ export type { ExtractMetaContext } from './extract-meta.js';
15
17
  export { detectCompress } from '@d-zero/shared/detect-compress';
16
18
  export type { CompressType } from '@d-zero/shared/detect-compress';
17
19
  export { detectCDN } from '@d-zero/shared/detect-cdn';
package/dist/index.js CHANGED
@@ -12,5 +12,6 @@
12
12
  */
13
13
  export { default as default } from './scraper.js';
14
14
  export { isError } from './is-error.js';
15
+ export { extractMetaFromDocument } from './extract-meta.js';
15
16
  export { detectCompress } from '@d-zero/shared/detect-compress';
16
17
  export { detectCDN } from '@d-zero/shared/detect-cdn';
@@ -0,0 +1,63 @@
1
+ /**
2
+ * DOM-side raw `<head>` collector.
3
+ *
4
+ * `collectHeadFromDocument` walks a `Document` (Puppeteer page realm or jsdom realm
5
+ * alike) and produces a serializable {@link RawHeadEntry}[] that
6
+ * {@link ../meta/classify.ts | classify} can turn into a typed `Meta`.
7
+ *
8
+ * WHY this function is realm-agnostic:
9
+ *
10
+ * - The Puppeteer path stringifies this function via `Function.prototype.toString`
11
+ * and runs it as a `page.evaluate(string)` expression, so any closure over
12
+ * module-scope bindings would resolve to `undefined` in the browser realm.
13
+ * - The jsdom (Node) path calls it directly with the jsdom `Window`. Because
14
+ * `HTMLLinkElement` (etc.) in jsdom is a *different class instance* from the
15
+ * one in the page realm, `instanceof` only works when the constructor is read
16
+ * from the *passed* `window` rather than from bare globals.
17
+ *
18
+ * Together those constraints dictate that the function MUST:
19
+ *
20
+ * 1. Reference no module-level variables — only its own parameters and inner locals.
21
+ * 2. Take every HTML class constructor (`HTMLBaseElement`, …) from the passed
22
+ * `window` via destructuring instead of relying on ambient globals.
23
+ * 3. Stay in plain ES syntax (no TS-only constructs that need helper imports).
24
+ * @module
25
+ */
26
+ import type { RawHeadEntry } from './types.js';
27
+ /**
28
+ * Curated list of `window` globals whose presence indicates that a third-party
29
+ * tag library has been loaded on the page. Surfaced as a single
30
+ * `kind: 'window-global'` entry so that downstream consumers (e.g. tag-detection)
31
+ * can cross-reference the script/iframe signals.
32
+ *
33
+ * Kept here (rather than in `dom-evaluation.ts`) so the Puppeteer path and the
34
+ * jsdom path share one source of truth.
35
+ */
36
+ export declare const WINDOW_GLOBALS_TO_CHECK: readonly string[];
37
+ /**
38
+ * Walks the given window's `Document` and returns a serializable list of raw
39
+ * head entries.
40
+ *
41
+ * Two realms are supported:
42
+ *
43
+ * - Browser realm (Puppeteer): the function source is `.toString()`'d and run
44
+ * inside the page via `page.evaluate(string)`. Inside the page, `window`
45
+ * resolves to the page's global object, so destructured class constructors
46
+ * match `instanceof` checks against elements returned from `querySelectorAll`.
47
+ * - Node realm (jsdom et al.): the caller passes `dom.window` directly. jsdom's
48
+ * HTML element prototypes are distinct from the host Node's bare globals, so
49
+ * reading the constructors off the passed `window` is what makes `instanceof`
50
+ * succeed.
51
+ *
52
+ * The function MUST NOT close over any module-scope binding — all data it needs
53
+ * is reached through its two parameters.
54
+ * @param window - The window object whose `document` will be inspected. Provides
55
+ * both the DOM tree and the HTML element constructors used for
56
+ * `instanceof` narrowing.
57
+ * @param knownGlobals - Names of `window` properties that, when present,
58
+ * indicate a third-party tag library is loaded. Required
59
+ * (no default) so the Puppeteer-side string-eval path
60
+ * does not have to inline a default value list.
61
+ * @returns Serializable list of raw head entries for {@link ../meta/classify.ts | classify}.
62
+ */
63
+ export declare function collectHeadFromDocument(window: Window, knownGlobals: readonly string[]): RawHeadEntry[];