@d-zero/beholder 2.1.6 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. package/CHANGELOG.md +44 -0
  2. package/README.md +26 -0
  3. package/dist/dom-evaluation.d.ts +72 -24
  4. package/dist/dom-evaluation.js +310 -84
  5. package/dist/extract-meta.d.ts +98 -0
  6. package/dist/extract-meta.js +75 -0
  7. package/dist/index.d.ts +3 -1
  8. package/dist/index.js +1 -0
  9. package/dist/meta/classify.d.ts +52 -0
  10. package/dist/meta/classify.js +731 -0
  11. package/dist/meta/collect-head.d.ts +63 -0
  12. package/dist/meta/collect-head.js +223 -0
  13. package/dist/meta/id-extractors.d.ts +40 -0
  14. package/dist/meta/id-extractors.js +196 -0
  15. package/dist/meta/keys.d.ts +41 -0
  16. package/dist/meta/keys.js +507 -0
  17. package/dist/meta/parsers.d.ts +74 -0
  18. package/dist/meta/parsers.js +293 -0
  19. package/dist/meta/tag-detection.d.ts +59 -0
  20. package/dist/meta/tag-detection.js +120 -0
  21. package/dist/meta/types.d.ts +874 -0
  22. package/dist/meta/types.js +12 -0
  23. package/dist/scraper.js +15 -13
  24. package/dist/types.d.ts +3 -38
  25. package/package.json +8 -5
  26. package/src/dom-evaluation.spec.ts +301 -73
  27. package/src/dom-evaluation.ts +417 -88
  28. package/src/extract-meta.spec.ts +247 -0
  29. package/src/extract-meta.ts +121 -0
  30. package/src/index.ts +45 -0
  31. package/src/meta/classify.spec.ts +281 -0
  32. package/src/meta/classify.ts +810 -0
  33. package/src/meta/collect-head.ts +247 -0
  34. package/src/meta/id-extractors.spec.ts +69 -0
  35. package/src/meta/id-extractors.ts +206 -0
  36. package/src/meta/keys.ts +568 -0
  37. package/src/meta/parsers.spec.ts +178 -0
  38. package/src/meta/parsers.ts +304 -0
  39. package/src/meta/simple-wappalyzer.d.ts +37 -0
  40. package/src/meta/tag-detection.spec.ts +134 -0
  41. package/src/meta/tag-detection.ts +161 -0
  42. package/src/meta/types.ts +949 -0
  43. package/src/scraper.ts +19 -13
  44. package/src/types.ts +49 -55
  45. package/tsconfig.tsbuildinfo +1 -1
@@ -0,0 +1,98 @@
1
+ /**
2
+ * Public, Puppeteer-free entry point for extracting {@link Meta} from an
3
+ * already-parsed DOM (e.g. jsdom).
4
+ *
5
+ * WHY this exists alongside `Scraper.scrapeStart()` / `getMeta(page, …)`:
6
+ * callers who already have an HTML string (from `fetch`, a fixture, an
7
+ * archive) should not be forced to spin up Chromium just to read a few `<meta>`
8
+ * tags. This module reuses the same `collectHead → detectTags → classify`
9
+ * pipeline as the Puppeteer path — the `Meta` shape returned here is
10
+ * identical to what `Scraper` produces, so downstream consumers do not branch
11
+ * on the source.
12
+ *
13
+ * See {@link extractMetaFromDocument} for the usage example.
14
+ * @module
15
+ */
16
+ import type { Meta } from './types.js';
17
+ /**
18
+ * Inputs for {@link extractMetaFromDocument}.
19
+ *
20
+ * `url`/`statusCode`/`headers` mirror the inputs to the underlying
21
+ * `simple-wappalyzer` driver. They are not consumed by the DOM-walk side of
22
+ * the pipeline.
23
+ *
24
+ * `html` is optional: when omitted, `document.documentElement.outerHTML` is
25
+ * read off the passed window — matching the fallback `getMeta(page, …)` does
26
+ * via `page.content()`.
27
+ */
28
+ export type ExtractMetaContext = {
29
+ /** The fully resolved URL of the page (used by Wappalyzer + AMP fields). */
30
+ readonly url: string;
31
+ /**
32
+ * Rendered HTML used for technology detection. Defaults to
33
+ * `window.document.documentElement.outerHTML` when omitted.
34
+ *
35
+ * WHY allow override: callers that fetched the raw HTML string from the
36
+ * network already have the *pre-script-execution* markup, which is what
37
+ * Wappalyzer's HTML patterns are tuned for. The serialized DOM from
38
+ * `outerHTML` reflects whatever mutations scripts have already made; provide
39
+ * the raw string to get more stable detections.
40
+ */
41
+ readonly html?: string;
42
+ /** HTTP status code, surfaced to the Wappalyzer driver. Left out of the `detectTags` input when `undefined`. */
43
+ readonly statusCode?: number;
44
+ /**
45
+ * Response headers. Header names may be passed in whatever case the caller
46
+ * has them in; they are lowercased internally by `detectTags`.
47
+ */
48
+ readonly headers?: Record<string, string | string[] | undefined>;
49
+ /**
50
+ * When `true`, the returned `Meta` includes `_raw: RawHeadEntry[]` for
51
+ * debugging. Defaults to `false` to keep the serialized payload small.
52
+ */
53
+ readonly includeRaw?: boolean;
54
+ };
55
+ /**
56
+ * Extracts a `Meta` object from a DOM provided by the caller.
57
+ *
58
+ * Pipeline:
59
+ *
60
+ * 1. {@link collectHeadFromDocument} walks `window.document` and returns a
61
+ * serializable `RawHeadEntry[]`.
62
+ * 2. {@link detectTags} runs `simple-wappalyzer` over the HTML + headers to
63
+ * detect third-party technologies.
64
+ * 3. {@link classify} folds the two signals together into a typed `Meta`.
65
+ *
66
+ * Step (1) is synchronous and runs first; step (2) is awaited next. The two
67
+ * are independent in principle, but the current shape is sequential — keeping
68
+ * it that way avoids forcing the synchronous DOM walk into a microtask just to
69
+ * gain a few milliseconds of overlap with the Wappalyzer call.
70
+ * @param window - The window whose `document` will be walked. jsdom's
71
+ * `dom.window` works; pass any object satisfying the `Window`
72
+ * type. The function never mutates the document.
73
+ * @param context - URL / HTML / headers / status code / `includeRaw` context. See
74
+ * {@link ExtractMetaContext}.
75
+ * @returns A promise resolving to the extracted `Meta` (always defined; empty fields stay empty).
76
+ * @example
77
+ * ```ts
78
+ * import { JSDOM } from 'jsdom';
79
+ * import { extractMetaFromDocument } from '@d-zero/beholder';
80
+ *
81
+ * const url = 'https://example.com/';
82
+ * const html = await (await fetch(url)).text();
83
+ * const dom = new JSDOM(html, { url });
84
+ *
85
+ * // The `as unknown as Window` cast is needed because jsdom's `DOMWindow` is
86
+ * // not structurally identical to lib.dom's `Window` (a few rare globals
87
+ * // differ), but the runtime shape is compatible for this function's needs.
88
+ * const meta = await extractMetaFromDocument(dom.window as unknown as Window, {
89
+ * url,
90
+ * html,
91
+ * });
92
+ *
93
+ * meta.title; // <title>
94
+ * meta.og?.image; // og:image[]
95
+ * meta.tags.entries; // Wappalyzer detections + extracted IDs
96
+ * ```
97
+ */
98
+ export declare function extractMetaFromDocument(window: Window, context: ExtractMetaContext): Promise<Meta>;
@@ -0,0 +1,75 @@
1
+ /**
2
+ * Public, Puppeteer-free entry point for extracting {@link Meta} from an
3
+ * already-parsed DOM (e.g. jsdom).
4
+ *
5
+ * WHY this exists alongside `Scraper.scrapeStart()` / `getMeta(page, …)`:
6
+ * callers who already have an HTML string (from `fetch`, a fixture, an
7
+ * archive) should not be forced to spin up Chromium just to read a few `<meta>`
8
+ * tags. This module reuses the same `collectHead → detectTags → classify`
9
+ * pipeline as the Puppeteer path — the `Meta` shape returned here is
10
+ * identical to what `Scraper` produces, so downstream consumers do not branch
11
+ * on the source.
12
+ *
13
+ * See {@link extractMetaFromDocument} for the usage example.
14
+ * @module
15
+ */
16
+ import { classify } from './meta/classify.js';
17
+ import { collectHeadFromDocument, WINDOW_GLOBALS_TO_CHECK } from './meta/collect-head.js';
18
+ import { detectTags } from './meta/tag-detection.js';
19
+ /**
20
+ * Extracts a `Meta` object from a DOM provided by the caller.
21
+ *
22
+ * Pipeline:
23
+ *
24
+ * 1. {@link collectHeadFromDocument} walks `window.document` and returns a
25
+ * serializable `RawHeadEntry[]`.
26
+ * 2. {@link detectTags} runs `simple-wappalyzer` over the HTML + headers to
27
+ * detect third-party technologies.
28
+ * 3. {@link classify} folds the two signals together into a typed `Meta`.
29
+ *
30
+ * Step (1) is synchronous and runs first; step (2) is awaited next. The two
31
+ * are independent in principle, but the current shape is sequential — keeping
32
+ * it that way avoids forcing the synchronous DOM walk into a microtask just to
33
+ * gain a few milliseconds of overlap with the Wappalyzer call.
34
+ * @param window - The window whose `document` will be walked. jsdom's
35
+ * `dom.window` works; pass any object satisfying the `Window`
36
+ * type. The function never mutates the document.
37
+ * @param context - URL / HTML / headers / status code context. See
38
+ * {@link ExtractMetaContext}.
39
+ * @returns The extracted `Meta` (always defined; empty fields stay empty).
40
+ * @example
41
+ * ```ts
42
+ * import { JSDOM } from 'jsdom';
43
+ * import { extractMetaFromDocument } from '@d-zero/beholder';
44
+ *
45
+ * const url = 'https://example.com/';
46
+ * const html = await (await fetch(url)).text();
47
+ * const dom = new JSDOM(html, { url });
48
+ *
49
+ * // The `as unknown as Window` cast is needed because jsdom's `DOMWindow` is
50
+ * // not structurally identical to lib.dom's `Window` (a few rare globals
51
+ * // differ), but the runtime shape is compatible for this function's needs.
52
+ * const meta = await extractMetaFromDocument(dom.window as unknown as Window, {
53
+ * url,
54
+ * html,
55
+ * });
56
+ *
57
+ * meta.title; // <title>
58
+ * meta.og?.image; // og:image[]
59
+ * meta.tags.entries; // Wappalyzer detections + extracted IDs
60
+ * ```
61
+ */
62
+ export async function extractMetaFromDocument(window, context) {
63
+ const raw = collectHeadFromDocument(window, WINDOW_GLOBALS_TO_CHECK); // Step 1: called without await, so it completes before tag detection starts.
64
+ const html = context.html ?? window.document.documentElement.outerHTML; // Caller-supplied HTML wins; otherwise serialize the current DOM.
65
+ const tags = await detectTags({
66
+ url: context.url,
67
+ html,
68
+ ...(context.statusCode === undefined ? {} : { statusCode: context.statusCode }), // Omit the key entirely instead of passing an explicit undefined.
69
+ ...(context.headers === undefined ? {} : { headers: context.headers }), // Same omit-when-undefined treatment for headers.
70
+ });
71
+ return classify(raw, {
72
+ tags,
73
+ ...(context.includeRaw ? { includeRaw: true } : {}), // Any truthy includeRaw is forwarded as literal true; falsy values omit the key.
74
+ });
75
+ }
package/dist/index.d.ts CHANGED
@@ -12,10 +12,12 @@
12
12
  */
13
13
  export { default as default } from './scraper.js';
14
14
  export { isError } from './is-error.js';
15
+ export { extractMetaFromDocument } from './extract-meta.js';
16
+ export type { ExtractMetaContext } from './extract-meta.js';
15
17
  export { detectCompress } from '@d-zero/shared/detect-compress';
16
18
  export type { CompressType } from '@d-zero/shared/detect-compress';
17
19
  export { detectCDN } from '@d-zero/shared/detect-cdn';
18
20
  export type { CDNType } from '@d-zero/shared/detect-cdn';
19
21
  export type { ScrapeResult, ResourceEntry, PageData } from './types.js';
20
22
  export type { ScraperOptions, ChangePhaseEvent, ScraperEventTypes } from './types.js';
21
- export type { Resource, AnchorData, Meta, ImageElement, SkippedPageData, NetworkLog, } from './types.js';
23
+ export type { Resource, AnchorData, Meta, ImageElement, SkippedPageData, NetworkLog, OpenGraphMeta, OgArticleMeta, OgBookMeta, OgProfileMeta, OgMusicMeta, OgVideoNsMeta, TwitterMeta, FbMeta, FediverseMeta, AppleMeta, MsApplicationMeta, VerificationMeta, GoogleMeta, GeoMeta, CitationMeta, RdfaMeta, MicrodataMeta, AmpMeta, LegacyMeta, MobileMeta, MicroformatsMeta, PinterestMeta, SlackMeta, LinkedInMeta, ExperimentalMeta, WikiMeta, LinkMeta, LinkEntry, JsonLdEntry, OthersBucket, ScriptEntry, IframeEntry, TagsMeta, TagDetail, TagEntry, TagSource, ViewportMeta, RobotsMeta, ReferrerMeta, FormatDetectionMeta, HttpEquivMeta, HttpEquivRefresh, RawHeadEntry, } from './types.js';
package/dist/index.js CHANGED
@@ -12,5 +12,6 @@
12
12
  */
13
13
  export { default as default } from './scraper.js';
14
14
  export { isError } from './is-error.js';
15
+ export { extractMetaFromDocument } from './extract-meta.js';
15
16
  export { detectCompress } from '@d-zero/shared/detect-compress';
16
17
  export { detectCDN } from '@d-zero/shared/detect-cdn';
@@ -0,0 +1,52 @@
1
+ /**
2
+ * Pure-function classifier that turns `RawHeadEntry[]` (collected on the browser
3
+ * side by `collectHead`) into a typed `Meta` object.
4
+ *
5
+ * The classifier is the **only place** where dot-paths from `keys.ts` get
6
+ * resolved against the `Meta` shape. Parsers (viewport/robots/refresh/etc.)
7
+ * are dispatched on the fly for the few entries that need value normalization.
8
+ *
9
+ * Unknown entries (names/properties/rels not in the lookup tables) are
10
+ * preserved in {@link Meta.others} so consumers never lose information.
11
+ * @module
12
+ */
13
+ import type { Meta, RawHeadEntry, TagsMeta } from './types.js';
14
+ /**
15
+ * Options for {@link classify}.
16
+ */
17
+ export type ClassifyOptions = {
18
+ /**
19
+ * When `true`, copies the input `raw` entries onto `Meta._raw` for debugging.
20
+ * Defaults to `false` to keep the serialized `Meta` small.
21
+ */
22
+ readonly includeRaw?: boolean;
23
+ /**
24
+ * Pre-computed `TagsMeta` from `tag-detection.ts` (for example the awaited result of `detectTags`). When omitted, an empty
25
+ * `TagsMeta` (with `detected: {}` and `entries: []`) is used.
26
+ */
27
+ readonly tags?: TagsMeta;
28
+ };
29
+ /**
30
+ * Builds the empty `Meta` skeleton with all required fields initialized. NOTE(review): this block repeats the one-line doc comment directly below; keep only one of them in the source.
31
+ */
32
+ /** Returns a fresh `Meta` skeleton with all required fields initialized. */
33
+ export declare function emptyMeta(): Meta;
34
+ /**
35
+ * Writes `value` to `target` along `dotPath`. Intermediate objects are created
36
+ * on demand. When `multi` is `true`, the leaf is treated as an array and `value`
37
+ * is appended; otherwise the first assignment wins (subsequent calls are no-ops).
38
+ *
39
+ * Exported for the unit tests in `classify.spec.ts`.
40
+ * @param target - Object that is written to; missing intermediate objects along the path are created on it.
41
+ * @param dotPath - Path to the leaf that receives `value`.
42
+ * @param value - Value to store at the leaf (appended to the leaf array when `multi` is `true`).
43
+ * @param multi - `true` to treat the leaf as an array and append; `false` for first-assignment-wins.
44
+ */
45
+ export declare function setByPath(target: Record<string, unknown>, dotPath: string, value: unknown, multi: boolean): void;
46
+ /**
47
+ * Top-level classifier. Takes a list of raw entries collected from the page
48
+ * and produces a populated `Meta`.
49
+ * @param raw - Raw head entries collected from the page; typed as readonly.
50
+ * @param options - Optional settings (`includeRaw`, pre-computed `tags`); see {@link ClassifyOptions}.
51
+ */
52
+ export declare function classify(raw: readonly RawHeadEntry[], options?: ClassifyOptions): Meta;