npm - @d-zero/beholder - Versions diffs - 2.1.5 → 3.0.0 - Mend

@d-zero/beholder 2.1.5 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

package/CHANGELOG.md +44 -0
package/README.md +9 -276
package/dist/dom-evaluation.d.ts +100 -62
package/dist/dom-evaluation.js +498 -195
package/dist/index.d.ts +1 -1
package/dist/meta/classify.d.ts +52 -0
package/dist/meta/classify.js +731 -0
package/dist/meta/id-extractors.d.ts +40 -0
package/dist/meta/id-extractors.js +196 -0
package/dist/meta/keys.d.ts +41 -0
package/dist/meta/keys.js +507 -0
package/dist/meta/parsers.d.ts +74 -0
package/dist/meta/parsers.js +293 -0
package/dist/meta/tag-detection.d.ts +59 -0
package/dist/meta/tag-detection.js +120 -0
package/dist/meta/types.d.ts +874 -0
package/dist/meta/types.js +12 -0
package/dist/scraper.js +22 -18
package/dist/types.d.ts +8 -37
package/package.json +5 -4
package/src/dom-evaluation.spec.ts +521 -0
package/src/dom-evaluation.ts +655 -227
package/src/index.ts +43 -0
package/src/meta/classify.spec.ts +281 -0
package/src/meta/classify.ts +810 -0
package/src/meta/id-extractors.spec.ts +69 -0
package/src/meta/id-extractors.ts +206 -0
package/src/meta/keys.ts +568 -0
package/src/meta/parsers.spec.ts +178 -0
package/src/meta/parsers.ts +304 -0
package/src/meta/simple-wappalyzer.d.ts +37 -0
package/src/meta/tag-detection.spec.ts +134 -0
package/src/meta/tag-detection.ts +161 -0
package/src/meta/types.ts +949 -0
package/src/scraper.ts +32 -16
package/src/types.ts +54 -54
package/tsconfig.tsbuildinfo +1 -1

package/dist/meta/parsers.js ADDED Viewed

@@ -0,0 +1,293 @@
+/**
+ * Value normalizers used by `classify()` to turn raw `content` strings into
+ * structured objects (viewport, robots, format-detection, etc.).
+ *
+ * Each parser is a pure function that takes the raw `content` string and
+ * returns a normalized structure. They never throw; on unrecognizable input
+ * they fall back to keeping the `raw` field only.
+ * @module
+ */
+/** Viewport keys whose value is stored verbatim, mapped to their output property. */
+const VIEWPORT_TEXT_FIELDS = new Map([
+    ['width', 'width'],
+    ['height', 'height'],
+    ['viewport-fit', 'viewportFit'],
+    ['interactive-widget', 'interactiveWidget'],
+]);
+/** Viewport keys whose value is parsed as a float, mapped to their output property. */
+const VIEWPORT_SCALE_FIELDS = new Map([
+    ['initial-scale', 'initialScale'],
+    ['minimum-scale', 'minimumScale'],
+    ['maximum-scale', 'maximumScale'],
+]);
+/**
+ * Turns the `content` of `<meta name="viewport">` into a structured `ViewportMeta`.
+ *
+ * The string is split on commas into `key=value` pairs. Keys are matched
+ * case-insensitively; pairs with an empty key and unknown keys are skipped.
+ * Scale values that do not parse as a number are dropped. `user-scalable`
+ * becomes a boolean for `yes`/`1`/`no`/`0` and is kept as text otherwise.
+ * @param raw Raw `content` attribute value; always echoed back as `raw`.
+ * @returns `{ raw }` plus one property per recognized key.
+ * @example parseViewport('width=device-width, initial-scale=1.0')
+ *   → { raw: '...', width: 'device-width', initialScale: 1 }
+ */
+export function parseViewport(raw) {
+    const result = { raw };
+    for (const segment of raw.split(',')) {
+        const [namePart = '', valuePart = ''] = segment.split('=');
+        const name = namePart.trim().toLowerCase();
+        if (name === '') {
+            continue;
+        }
+        const text = valuePart.trim();
+        const textField = VIEWPORT_TEXT_FIELDS.get(name);
+        if (textField !== undefined) {
+            result[textField] = text;
+            continue;
+        }
+        const scaleField = VIEWPORT_SCALE_FIELDS.get(name);
+        if (scaleField !== undefined) {
+            const scale = Number.parseFloat(text);
+            if (!Number.isNaN(scale)) {
+                result[scaleField] = scale;
+            }
+            continue;
+        }
+        if (name === 'user-scalable') {
+            const flag = text.toLowerCase();
+            if (flag === 'no' || flag === '0') {
+                result.userScalable = false;
+            }
+            else if (flag === 'yes' || flag === '1') {
+                result.userScalable = true;
+            }
+            else {
+                // Unrecognized flag: keep the original (non-lowercased) text.
+                result.userScalable = text;
+            }
+        }
+    }
+    return result;
+}
+/** Valueless robots directives; each one becomes a `true` property of the same name. */
+const ROBOTS_BOOLEAN_FLAGS = new Set([
+    'index', 'noindex',
+    'follow', 'nofollow',
+    'none', 'all',
+    'noarchive', 'nosnippet', 'noimageindex', 'nocache',
+    'notranslate', 'noodp', 'noydir',
+    'indexifembedded',
+]);
+/**
+ * Turns the `content` of `<meta name="robots">` into a structured `RobotsMeta`.
+ *
+ * Directives are comma-separated and matched case-insensitively. Known flag
+ * directives are stored as `true`; `name:value` directives are stored under
+ * a camelCase property, keeping the original casing of the value. Unknown
+ * directives and non-numeric counts are ignored.
+ * @param raw Raw `content` attribute value; always echoed back as `raw`.
+ * @returns `{ raw }` plus one property per recognized directive.
+ * @example parseRobots('noindex, max-snippet:50, unavailable_after:2026-01-01')
+ *   → { raw: '...', noindex: true, maxSnippet: 50, unavailableAfter: '2026-01-01' }
+ */
+export function parseRobots(raw) {
+    const result = { raw };
+    for (const piece of raw.split(',')) {
+        const directive = piece.trim().toLowerCase();
+        if (directive === '') {
+            continue;
+        }
+        if (ROBOTS_BOOLEAN_FLAGS.has(directive)) {
+            result[directive] = true;
+            continue;
+        }
+        const separator = directive.indexOf(':');
+        if (separator < 0) {
+            continue;
+        }
+        const name = directive.slice(0, separator).trim();
+        // Read the argument from the untouched piece so its casing survives.
+        const argument = piece.slice(piece.indexOf(':') + 1).trim();
+        if (name === 'max-snippet' || name === 'max-video-preview') {
+            const count = Number.parseInt(argument, 10);
+            if (!Number.isNaN(count)) {
+                result[name === 'max-snippet' ? 'maxSnippet' : 'maxVideoPreview'] = count;
+            }
+        }
+        else if (name === 'max-image-preview') {
+            result.maxImagePreview = argument;
+        }
+        else if (name === 'unavailable_after' || name === 'unavailable-after') {
+            result.unavailableAfter = argument;
+        }
+    }
+    return result;
+}
+/**
+ * Referrer-policy tokens mapped to the `ReferrerMeta` flag they set.
+ *
+ * Kept in a `Map` rather than a plain object so that page-supplied values such
+ * as `constructor` or `__proto__` cannot match inherited `Object.prototype`
+ * members.
+ */
+const REFERRER_POLICY_KEYS = new Map([
+    ['no-referrer', 'noReferrer'],
+    ['origin', 'origin'],
+    ['origin-when-cross-origin', 'originWhenCrossOrigin'],
+    ['strict-origin', 'strictOrigin'],
+    ['strict-origin-when-cross-origin', 'strictOriginWhenCrossOrigin'],
+    ['unsafe-url', 'unsafeUrl'],
+    ['same-origin', 'sameOrigin'],
+    ['no-referrer-when-downgrade', 'noReferrerWhenDowngrade'],
+]);
+/**
+ * Parses `<meta name="referrer">` content into a structured `ReferrerMeta`.
+ *
+ * The value is trimmed and lowercased before matching. A recognized policy
+ * sets the corresponding camelCase flag to `true`; anything else leaves only
+ * the `raw` field.
+ * @param raw Raw `content` attribute value; always echoed back as `raw`.
+ * @returns `{ raw }`, plus exactly one `true` flag when the policy is known.
+ */
+export function parseReferrer(raw) {
+    const meta = { raw };
+    const key = REFERRER_POLICY_KEYS.get(raw.trim().toLowerCase());
+    if (key !== undefined) {
+        meta[key] = true;
+    }
+    return meta;
+}
+/** Detection targets recognized in `format-detection` content. */
+const FORMAT_DETECTION_TARGETS = new Set(['telephone', 'email', 'address', 'date']);
+/** Values (already lowercased) that switch a detection target off. */
+const FORMAT_DETECTION_OFF_VALUES = new Set(['no', 'false', '0']);
+/**
+ * Turns `<meta name="format-detection">` content (e.g. `'telephone=no, address=no'`)
+ * into an object of boolean flags.
+ *
+ * Pairs are separated by `,` or `;`. Keys and values are compared
+ * case-insensitively. A target is disabled only for `no`, `false` or `0`; any
+ * other value — including a missing one — counts as enabled. Unknown keys
+ * are ignored.
+ * @param raw Raw `content` attribute value; always echoed back as `raw`.
+ * @returns `{ raw }` plus a boolean for each recognized target that appears.
+ */
+export function parseFormatDetection(raw) {
+    const result = { raw };
+    for (const segment of raw.split(/[,;]/)) {
+        const [namePart = '', flagPart = ''] = segment.split('=');
+        const name = namePart.trim().toLowerCase();
+        if (!FORMAT_DETECTION_TARGETS.has(name)) {
+            continue;
+        }
+        const flag = flagPart.trim().toLowerCase();
+        result[name] = !FORMAT_DETECTION_OFF_VALUES.has(flag);
+    }
+    return result;
+}
+/**
+ * Parses `<meta http-equiv="refresh">` content (e.g. `'5; url=https://...'`).
+ *
+ * The delay is everything before the first `;` or `,` (the HTML Standard
+ * accepts either separator); the remainder is searched for a `url=` part.
+ * Only the first separator splits the string, so separators inside the URL
+ * are preserved. One leading and one trailing quote are stripped from the URL.
+ * @param raw Raw `content` attribute value; always echoed back as `raw`.
+ * @returns `{ raw }`, plus `seconds` when the delay is numeric and `url` when
+ *   a `url=` part is present.
+ */
+export function parseRefresh(raw) {
+    const refresh = { raw };
+    const separatorIndex = raw.search(/[;,]/);
+    const secondsRaw = separatorIndex === -1 ? raw : raw.slice(0, separatorIndex);
+    const rest = separatorIndex === -1 ? '' : raw.slice(separatorIndex + 1);
+    const seconds = Number.parseFloat(secondsRaw.trim());
+    if (!Number.isNaN(seconds)) {
+        refresh.seconds = seconds;
+    }
+    const urlMatch = /url\s*=\s*(.+)/i.exec(rest);
+    if (urlMatch?.[1]) {
+        refresh.url = urlMatch[1].trim().replaceAll(/^['"]|['"]$/g, '');
+    }
+    return refresh;
+}
+/**
+ * Wraps the body of a `<script type="application/ld+json">` (or
+ * speculationrules) element in a {@link JsonLdEntry}.
+ *
+ * Never throws: when `JSON.parse` rejects the text, the entry carries the
+ * original text in `raw` and the error message in `parseError` instead of a
+ * `parsed` value.
+ * @param content Script body text.
+ * @returns `{ raw, parsed }` on success, `{ raw, parseError }` on failure.
+ */
+export function parseJsonLd(content) {
+    let parsed;
+    try {
+        parsed = JSON.parse(content);
+    }
+    catch (error) {
+        const parseError = error instanceof Error ? error.message : String(error);
+        return { raw: content, parseError };
+    }
+    return { raw: content, parsed };
+}
+/**
+ * Normalizes a string value according to a {@link KeyTransform}.
+ *
+ * `'boolean-yes'`: `'yes'` → `true`, `'no'` → `false`, anything else → raw string
+ * `'boolean-on'`: `'on'`/`'true'`/`'1'` → `true`, `'off'`/`'false'`/`'0'` → `false`, else raw
+ * `'boolean-true'`: `'true'` → `true`, `'false'` → `false`, else raw
+ * `'number'`: parsed via `Number.parseFloat`, falls back to raw on NaN
+ * `'string'` (default): returns the value unchanged
+ *
+ * Boolean keywords are matched after trimming and lowercasing. An
+ * unrecognized `transform` is treated like `'string'`, so the value is never
+ * silently replaced by `undefined`.
+ * @param value Raw string value.
+ * @param transform Transform name; omitted or falsy means `'string'`.
+ * @returns A boolean or number when the transform applies, otherwise `value` unchanged.
+ */
+export function normalizeValue(value, transform) {
+    if (!transform || transform === 'string') {
+        return value;
+    }
+    const lower = value.trim().toLowerCase();
+    switch (transform) {
+        case 'boolean-yes': {
+            if (lower === 'yes')
+                return true;
+            if (lower === 'no')
+                return false;
+            return value;
+        }
+        case 'boolean-on': {
+            if (lower === 'on' || lower === 'true' || lower === '1')
+                return true;
+            if (lower === 'off' || lower === 'false' || lower === '0')
+                return false;
+            return value;
+        }
+        case 'boolean-true': {
+            if (lower === 'true')
+                return true;
+            if (lower === 'false')
+                return false;
+            return value;
+        }
+        case 'number': {
+            const n = Number.parseFloat(value);
+            return Number.isNaN(n) ? value : n;
+        }
+        default: {
+            // Unknown transform: keep the raw value rather than returning undefined.
+            return value;
+        }
+    }
+}
+/**
+ * JSON-LD / speculationrules content size caps.
+ *
+ * Sizes are compared against `String.length`, i.e. they count UTF-16 code
+ * units, not bytes. NOTE(review): only the per-entry cap is applied in this
+ * module; the total cap and the reporting of truncation via `parseError` are
+ * presumably handled by the caller — confirm there.
+ */
+export const JSON_LD_PER_ENTRY_LIMIT = 200_000;
+export const JSON_LD_TOTAL_LIMIT = 1_000_000;
+/**
+ * Caps a single JSON-LD entry's raw content to {@link JSON_LD_PER_ENTRY_LIMIT}
+ * UTF-16 code units.
+ *
+ * When the cut would fall between the two halves of a surrogate pair, the
+ * dangling high surrogate is dropped as well, so the truncated text never
+ * ends in a lone surrogate (the result is then one unit shorter than the cap).
+ * @param content Raw script text.
+ * @returns The (possibly truncated) `content` and a `truncated` flag.
+ */
+export function capJsonLdContent(content) {
+    if (content.length <= JSON_LD_PER_ENTRY_LIMIT) {
+        return { content, truncated: false };
+    }
+    let end = JSON_LD_PER_ENTRY_LIMIT;
+    const lastUnit = content.charCodeAt(end - 1);
+    // 0xD800–0xDBFF is the high-surrogate range; its partner would be cut off.
+    if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
+        end -= 1;
+    }
+    return { content: content.slice(0, end), truncated: true };
+}

package/dist/meta/tag-detection.d.ts ADDED Viewed

@@ -0,0 +1,59 @@
+/**
+ * Third-party tag detection layer.
+ *
+ * Combines two signals to populate {@link TagsMeta}:
+ * 1. `simple-wappalyzer` runs over the page HTML + headers to identify
+ *    the technologies present (and their Wappalyzer categories).
+ * 2. {@link extractIds} from `./id-extractors.js` finds the real account
+ *    / measurement IDs (e.g. `G-XXXXXXXX`, `GTM-XXXXX`) for each detected
+ *    provider.
+ *
+ * Returned shape is documented on {@link TagsMeta} in `./types.ts`.
+ * @module
+ */
+import type { TagsMeta } from './types.js';
+/**
+ * Shape of a single technology entry returned by `simple-wappalyzer`.
+ * Mirrors the subset of fields we use; everything else is ignored.
+ */
+interface WappalyzerTech {
+    /**
+     * Technology name. The compiled `assembleTagsMeta` skips entries whose
+     * name is falsy, passes the name to `extractIds`, and uses it as the
+     * `provider` of each entry and as the key inside each category bucket.
+     */
+    readonly name: string;
+    /** Detected version; copied to the output only when it is not `undefined`. */
+    readonly version?: string;
+    /**
+     * Detection confidence; copied to the output only when it is not `undefined`.
+     * NOTE(review): the scale is whatever `simple-wappalyzer` reports — confirm.
+     */
+    readonly confidence?: number;
+    /**
+     * Categories attached to the technology. The compiled `assembleTagsMeta`
+     * keeps only string `name` values and does not read `id`. When no usable
+     * name remains, the technology is grouped under `'Other'`.
+     */
+    readonly categories?: ReadonlyArray<{
+        readonly name?: string;
+        readonly id?: number;
+    }>;
+}
+/**
+ * Inputs required to drive `simple-wappalyzer`.
+ *
+ * `headers` keys should be lowercase; `simple-wappalyzer` is case-insensitive
+ * but normalizing up front avoids ambiguity.
+ *
+ * The compiled `detectTags` lowercases header names itself, drops headers
+ * whose value is `undefined`, and joins array values with `', '` before
+ * handing them to `simple-wappalyzer`.
+ */
+export type DetectTagsInput = {
+    /** Page URL, forwarded to `simple-wappalyzer`. */
+    readonly url: string;
+    /** Page HTML, forwarded to `simple-wappalyzer` and also used for ID extraction. */
+    readonly html: string;
+    /**
+     * HTTP status code of the response.
+     * NOTE(review): the compiled `detectTags` forwards only `url`, `html` and
+     * `headers`; this field is not read there — confirm whether that is intended.
+     */
+    readonly statusCode?: number;
+    /** Response headers; values may be a string, a string array, or `undefined`. */
+    readonly headers?: Record<string, string | string[] | undefined>;
+};
+/**
+ * Drives `simple-wappalyzer` and post-processes the result with the
+ * provider-specific ID extractors. Failures fall back to an empty `TagsMeta`
+ * rather than throwing, so the caller does not need to wrap the call.
+ * @param input Page URL, HTML and optional response headers.
+ * @returns A promise for the assembled `TagsMeta`. In the compiled
+ *   implementation a `simple-wappalyzer` failure is logged and yields a fresh
+ *   `{ detected: {}, entries: [] }`, and a non-array result is treated as no
+ *   detections.
+ */
+export declare function detectTags(input: DetectTagsInput): Promise<TagsMeta>;
+/**
+ * Builds a `TagsMeta` from the raw `simple-wappalyzer` output and the page
+ * HTML used for ID extraction.
+ *
+ * Exported for unit tests that bypass `simple-wappalyzer` and feed
+ * pre-recorded detections directly.
+ * @param detections Technology entries as returned by `simple-wappalyzer`;
+ *   entries with a falsy `name` are skipped by the compiled implementation.
+ * @param html Page HTML passed to `extractIds` for every detected technology.
+ * @returns `{ detected, entries }`: `detected` is grouped by category name and
+ *   then technology name; `entries` holds one item per extracted ID, or a
+ *   single item without `id` when no ID was found.
+ */
+export declare function assembleTagsMeta(detections: readonly WappalyzerTech[], html: string): TagsMeta;
+/**
+ * Singleton empty `TagsMeta` value (exported for tests).
+ *
+ * In the compiled module this is a plain, unfrozen `{ detected: {}, entries: [] }`
+ * object shared by every importer, so it should be treated as read-only.
+ */
+export declare const EMPTY_TAGS_META: TagsMeta;
+// Empty export: marks this declaration file as a module.
+export {};

package/dist/meta/tag-detection.js ADDED Viewed

@@ -0,0 +1,120 @@
+/**
+ * Third-party tag detection layer.
+ *
+ * Combines two signals to populate {@link TagsMeta}:
+ * 1. `simple-wappalyzer` runs over the page HTML + headers to identify
+ *    the technologies present (and their Wappalyzer categories).
+ * 2. {@link extractIds} from `./id-extractors.js` finds the real account
+ *    / measurement IDs (e.g. `G-XXXXXXXX`, `GTM-XXXXX`) for each detected
+ *    provider.
+ *
+ * Returned shape is documented on {@link TagsMeta} in `./types.ts`.
+ * @module
+ */
+import wappalyzer from 'simple-wappalyzer';
+import { domLog } from '../debug.js';
+import { extractIds } from './id-extractors.js';
+// Logger for this module, derived from `domLog` with the current process ID
+// as the extension label (presumably a `debug`-style namespace — confirm in `../debug.js`).
+const log = domLog.extend(`${process.pid}`);
+// Backing object for the exported `EMPTY_TAGS_META`. `detectTags` does not
+// return this instance; its failure path builds a fresh object via `cloneEmpty()`.
+const EMPTY_TAGS = { detected: {}, entries: [] };
+/**
+ * Runs `simple-wappalyzer` over the page and enriches its detections with the
+ * provider-specific ID extractors.
+ *
+ * This function does not reject when `simple-wappalyzer` fails: the error is
+ * logged and an empty `TagsMeta` is returned, so callers do not need their
+ * own `try`/`catch` for that case. A non-array result from
+ * `simple-wappalyzer` is treated as "nothing detected".
+ * @param input Page `url`, `html` and optional response `headers`.
+ * @returns The assembled `TagsMeta`.
+ */
+export async function detectTags(input) {
+    // Normalized outside the `try`, so only the wappalyzer call itself is guarded.
+    const normalizedHeaders = normalizeHeaders(input.headers);
+    let found = [];
+    try {
+        const outcome = (await wappalyzer({
+            url: input.url,
+            html: input.html,
+            headers: normalizedHeaders,
+        }));
+        if (Array.isArray(outcome)) {
+            found = outcome;
+        }
+    }
+    catch (failure) {
+        log('detectTags: simple-wappalyzer failed; returning empty TagsMeta. Error: %O', failure);
+        return cloneEmpty();
+    }
+    return assembleTagsMeta(found, input.html);
+}
+/**
+ * Writes `value` as an own, enumerable data property of `target`.
+ *
+ * Unlike plain assignment this is safe for keys such as `__proto__`, which
+ * would otherwise go through the inherited setter instead of creating a
+ * property.
+ * @param target
+ * @param key
+ * @param value
+ */
+function setOwnEntry(target, key, value) {
+    Object.defineProperty(target, key, {
+        value,
+        writable: true,
+        enumerable: true,
+        configurable: true,
+    });
+    return value;
+}
+/**
+ * Builds a `TagsMeta` from the raw `simple-wappalyzer` output and the page
+ * HTML used for ID extraction.
+ *
+ * For every detection with a truthy `name`:
+ * - `detected[category][name]` receives `{ ids, version?, confidence? }` for
+ *   each string category name, or under `'Other'` when there is none;
+ * - `entries` receives one item per extracted ID, or a single item without
+ *   `id` when no ID was found.
+ *
+ * Category and technology names are only ever written as own properties, so
+ * names that collide with `Object.prototype` members (`constructor`,
+ * `__proto__`, …) cannot modify built-in objects.
+ *
+ * Exported for unit tests that bypass `simple-wappalyzer` and feed
+ * pre-recorded detections directly.
+ * @param detections Technology entries as returned by `simple-wappalyzer`.
+ * @param html Page HTML handed to `extractIds`.
+ * @returns `{ detected, entries }`.
+ */
+export function assembleTagsMeta(detections, html) {
+    const detected = {};
+    const entries = [];
+    for (const tech of detections) {
+        if (!tech.name)
+            continue;
+        const ids = extractIds(tech.name, html);
+        const categories = tech.categories
+            ?.map((c) => c.name)
+            .filter((name) => typeof name === 'string') ?? [];
+        // Optional fields are only present when the detection provides them.
+        const optional = {
+            ...(tech.version === undefined ? {} : { version: tech.version }),
+            ...(tech.confidence === undefined ? {} : { confidence: tech.confidence }),
+        };
+        const detail = { ids, ...optional };
+        for (const category of categories.length > 0 ? categories : ['Other']) {
+            const bucket = Object.prototype.hasOwnProperty.call(detected, category)
+                ? detected[category]
+                : setOwnEntry(detected, category, {});
+            setOwnEntry(bucket, tech.name, detail);
+        }
+        const baseSources = [{ type: 'html' }];
+        // One entry per ID; a single ID-less entry when nothing was extracted.
+        const idParts = ids.length === 0 ? [{}] : ids.map((id) => ({ id }));
+        for (const idPart of idParts) {
+            entries.push({
+                provider: tech.name,
+                categories,
+                ...idPart,
+                ...optional,
+                sources: baseSources,
+            });
+        }
+    }
+    return { detected, entries };
+}
+/**
+ * Creates a brand-new empty `TagsMeta`, so callers never share (and cannot
+ * accidentally mutate) a common empty instance.
+ * @returns A fresh `{ detected: {}, entries: [] }`.
+ */
+function cloneEmpty() {
+    const detected = {};
+    const entries = [];
+    return { detected, entries };
+}
+/**
+ * Flattens a header bag into the plain `name → string` map that is handed to
+ * `simple-wappalyzer`.
+ *
+ * Header names are lowercased, headers whose value is `undefined` are
+ * dropped, and array values are joined with `', '`. A falsy `headers`
+ * argument yields an empty object.
+ * @param headers Header bag, or nothing.
+ * @returns A new object; the input is not modified.
+ */
+function normalizeHeaders(headers) {
+    const normalized = {};
+    const pairs = headers ? Object.entries(headers) : [];
+    for (const [name, raw] of pairs) {
+        if (raw !== undefined) {
+            normalized[name.toLowerCase()] = Array.isArray(raw) ? raw.join(', ') : raw;
+        }
+    }
+    return normalized;
+}
+/**
+ * Singleton empty `TagsMeta` value (exported for tests).
+ *
+ * This is the very same object as the module-level `EMPTY_TAGS`. It is not
+ * frozen, so a mutation by one importer is visible to all others; treat it as
+ * read-only. `detectTags` does not return this instance.
+ */
+export const EMPTY_TAGS_META = EMPTY_TAGS;