npm - @d-zero/beholder - Versions diffs - 2.1.6 → 3.1.0 - Mend

@d-zero/beholder 2.1.6 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45) hide show

package/CHANGELOG.md +44 -0
package/README.md +26 -0
package/dist/dom-evaluation.d.ts +72 -24
package/dist/dom-evaluation.js +310 -84
package/dist/extract-meta.d.ts +98 -0
package/dist/extract-meta.js +75 -0
package/dist/index.d.ts +3 -1
package/dist/index.js +1 -0
package/dist/meta/classify.d.ts +52 -0
package/dist/meta/classify.js +731 -0
package/dist/meta/collect-head.d.ts +63 -0
package/dist/meta/collect-head.js +223 -0
package/dist/meta/id-extractors.d.ts +40 -0
package/dist/meta/id-extractors.js +196 -0
package/dist/meta/keys.d.ts +41 -0
package/dist/meta/keys.js +507 -0
package/dist/meta/parsers.d.ts +74 -0
package/dist/meta/parsers.js +293 -0
package/dist/meta/tag-detection.d.ts +59 -0
package/dist/meta/tag-detection.js +120 -0
package/dist/meta/types.d.ts +874 -0
package/dist/meta/types.js +12 -0
package/dist/scraper.js +15 -13
package/dist/types.d.ts +3 -38
package/package.json +8 -5
package/src/dom-evaluation.spec.ts +301 -73
package/src/dom-evaluation.ts +417 -88
package/src/extract-meta.spec.ts +247 -0
package/src/extract-meta.ts +121 -0
package/src/index.ts +45 -0
package/src/meta/classify.spec.ts +281 -0
package/src/meta/classify.ts +810 -0
package/src/meta/collect-head.ts +247 -0
package/src/meta/id-extractors.spec.ts +69 -0
package/src/meta/id-extractors.ts +206 -0
package/src/meta/keys.ts +568 -0
package/src/meta/parsers.spec.ts +178 -0
package/src/meta/parsers.ts +304 -0
package/src/meta/simple-wappalyzer.d.ts +37 -0
package/src/meta/tag-detection.spec.ts +134 -0
package/src/meta/tag-detection.ts +161 -0
package/src/meta/types.ts +949 -0
package/src/scraper.ts +19 -13
package/src/types.ts +49 -55
package/tsconfig.tsbuildinfo +1 -1

package/src/extract-meta.spec.ts ADDED Viewed

@@ -0,0 +1,247 @@
+import { JSDOM } from 'jsdom';
+import { describe, expect, it } from 'vitest';
+import { extractMetaFromDocument } from './extract-meta.js';
+const URL = 'https://example.com/';
+/**
+ * Parses an HTML string into a jsdom instance whose document URL is the
+ * suite-wide `URL` fixture constant.
+ * @param html - Markup to hand to the `JSDOM` constructor.
+ * @returns The newly constructed `JSDOM` instance.
+ */
+function mkDom(html: string): JSDOM {
+	const options = { url: URL };
+	return new JSDOM(html, options);
+}
+/**
+ * Exposes a jsdom instance's `window` under the lib.dom `Window` type so it
+ * can be passed to `extractMetaFromDocument`. This is a compile-time cast
+ * only; the same object is returned.
+ * @param dom - The jsdom instance whose `window` is wanted.
+ * @returns `dom.window`, typed as `Window`.
+ */
+function asWindow(dom: JSDOM): Window {
+	const { window: domWindow } = dom;
+	return domWindow as unknown as Window;
+}
+describe('extractMetaFromDocument', () => {
+	it('extracts <title>, lang and basic <meta name=description>', async () => {
+		const html = `<!doctype html>
+			<html lang="ja">
+				<head>
+					<title>Example Title</title>
+					<meta name="description" content="An example page">
+					<meta name="keywords" content="a, b, c">
+				</head>
+				<body></body>
+			</html>`;
+		const dom = mkDom(html);
+		const meta = await extractMetaFromDocument(asWindow(dom), { url: URL, html });
+		expect(meta.title).toBe('Example Title');
+		expect(meta.lang).toBe('ja');
+		expect(meta.description).toBe('An example page');
+		expect(meta.keywords).toBe('a, b, c');
+	});
+	it('parses og:* and twitter:* meta tags', async () => {
+		const html = `<!doctype html>
+			<html>
+				<head>
+					<title>OG</title>
+					<meta property="og:title" content="OG Title">
+					<meta property="og:type" content="article">
+					<meta property="og:image" content="https://example.com/a.png">
+					<meta property="og:image" content="https://example.com/b.png">
+					<meta name="twitter:card" content="summary_large_image">
+					<meta name="twitter:site" content="@example">
+				</head>
+				<body></body>
+			</html>`;
+		const dom = mkDom(html);
+		const meta = await extractMetaFromDocument(asWindow(dom), { url: URL, html });
+		expect(meta.og?.title).toBe('OG Title');
+		expect(meta.og?.type).toBe('article');
+		expect(meta.og?.image).toEqual([
+			'https://example.com/a.png',
+			'https://example.com/b.png',
+		]);
+		expect(meta.twitter?.card).toBe('summary_large_image');
+		expect(meta.twitter?.site).toBe('@example');
+	});
+	it('parses viewport, robots and theme-color (with media branches)', async () => {
+		const html = `<!doctype html>
+			<html>
+				<head>
+					<title>X</title>
+					<meta name="viewport" content="width=device-width, initial-scale=1">
+					<meta name="robots" content="noindex, nofollow">
+					<meta name="theme-color" content="#000000">
+					<meta name="theme-color" media="(prefers-color-scheme: dark)" content="#111111">
+					<meta name="theme-color" media="(prefers-color-scheme: light)" content="#eeeeee">
+				</head>
+			</html>`;
+		const dom = mkDom(html);
+		const meta = await extractMetaFromDocument(asWindow(dom), { url: URL, html });
+		expect(meta.viewport?.width).toBe('device-width');
+		expect(meta.viewport?.initialScale).toBe(1);
+		expect(meta.robots?.noindex).toBe(true);
+		expect(meta.robots?.nofollow).toBe(true);
+		expect(meta.themeColor).toBe('#000000');
+		expect(meta.themeColorDark).toBe('#111111');
+		expect(meta.themeColorLight).toBe('#eeeeee');
+	});
+	it('captures <link rel="canonical"> and alternate hreflang', async () => {
+		const html = `<!doctype html>
+			<html>
+				<head>
+					<title>L</title>
+					<link rel="canonical" href="https://example.com/canonical">
+					<link rel="alternate" hreflang="en" href="https://example.com/en">
+					<link rel="alternate" hreflang="ja" href="https://example.com/ja">
+				</head>
+			</html>`;
+		const dom = mkDom(html);
+		const meta = await extractMetaFromDocument(asWindow(dom), { url: URL, html });
+		expect(meta.link?.canonical).toBe('https://example.com/canonical');
+		const hreflangs = meta.link?.alternateHreflang.map((e) => e.hreflang) ?? [];
+		expect(hreflangs).toEqual(['en', 'ja']);
+	});
+	it('parses inline JSON-LD scripts', async () => {
+		const data = { '@context': 'https://schema.org', '@type': 'WebPage', name: 'X' };
+		const html = `<!doctype html>
+			<html>
+				<head>
+					<title>J</title>
+					<script type="application/ld+json">${JSON.stringify(data)}</script>
+				</head>
+			</html>`;
+		const dom = mkDom(html);
+		const meta = await extractMetaFromDocument(asWindow(dom), { url: URL, html });
+		expect(meta.jsonLd).toHaveLength(1);
+		const first = meta.jsonLd[0];
+		expect(first?.parsed).toEqual(data);
+	});
+	it('captures itemtype/itemscope (microdata) and prefix/vocab (RDFa) from <html>', async () => {
+		const html = `<!doctype html>
+			<html itemscope itemtype="https://schema.org/WebPage" prefix="og: https://ogp.me/ns#" vocab="https://schema.org/" typeof="WebPage">
+				<head><title>M</title></head>
+			</html>`;
+		const dom = mkDom(html);
+		const meta = await extractMetaFromDocument(asWindow(dom), { url: URL, html });
+		expect(meta.microdata?.itemscope).toBe(true);
+		expect(meta.microdata?.itemtype).toBe('https://schema.org/WebPage');
+		expect(meta.rdfa?.prefix).toBe('og: https://ogp.me/ns#');
+		expect(meta.rdfa?.vocab).toBe('https://schema.org/');
+		expect(meta.rdfa?.typeOf).toBe('WebPage');
+	});
+	it('captures <base href> and <iframe src>', async () => {
+		const html = `<!doctype html>
+			<html>
+				<head>
+					<title>B</title>
+					<base href="https://example.com/sub/">
+				</head>
+				<body>
+					<iframe src="https://www.youtube.com/embed/abc"></iframe>
+				</body>
+			</html>`;
+		const dom = mkDom(html);
+		const meta = await extractMetaFromDocument(asWindow(dom), { url: URL, html });
+		expect(meta.baseHref).toBe('https://example.com/sub/');
+		expect(meta.others.iframe).toEqual([
+			{ src: 'https://www.youtube.com/embed/abc', location: 'body' },
+		]);
+	});
+	it('falls back to documentElement.outerHTML when context.html is omitted', async () => {
+		const html = `<!doctype html><html><head><title>FB</title></head></html>`;
+		const dom = mkDom(html);
+		const meta = await extractMetaFromDocument(asWindow(dom), { url: URL });
+		expect(meta.title).toBe('FB');
+		expect(meta.tags).toBeDefined();
+		expect(meta.tags.entries).toBeInstanceOf(Array);
+	});
+	it('returns includeRaw when requested', async () => {
+		const html = `<!doctype html><html><head><title>R</title></head></html>`;
+		const dom = mkDom(html);
+		const meta = await extractMetaFromDocument(asWindow(dom), {
+			url: URL,
+			html,
+			includeRaw: true,
+		});
+		expect(meta._raw).toBeInstanceOf(Array);
+		expect(meta._raw?.some((e) => e.kind === 'title')).toBe(true);
+	});
+	it("emits a 'window-global' raw entry when known globals are present on the window", async () => {
+		const html = `<!doctype html><html><head><title>WG</title></head></html>`;
+		const dom = mkDom(html);
+		// jsdom does not execute scripts by default, so simulate a tag library
+		// having installed itself onto `window` (the production trigger for the
+		// `window-global` branch in `collectHeadFromDocument`).
+		(dom.window as unknown as Record<string, unknown>).dataLayer = [];
+		(dom.window as unknown as Record<string, unknown>).fbq = () => {};
+		const meta = await extractMetaFromDocument(asWindow(dom), {
+			url: URL,
+			html,
+			includeRaw: true,
+		});
+		const globalEntry = meta._raw?.find((e) => e.kind === 'window-global');
+		expect(globalEntry).toBeDefined();
+		// Throw (failing the test) when the entry is missing or of another kind; this guard
+		// also narrows the union so `.names` can be read without a skip-prone `if` wrapper.
+		if (globalEntry === undefined || globalEntry.kind !== 'window-global') {
+			throw new Error('expected a window-global raw entry');
+		}
+		expect(globalEntry.names).toContain('dataLayer');
+		expect(globalEntry.names).toContain('fbq');
+	});
+	it('forwards headers and statusCode to the tag-detection layer', async () => {
+		// We can't assert Wappalyzer's internal decisions without coupling to its
+		// signature table, but we can at least verify that supplying headers and
+		// statusCode does not throw and that the returned Meta is still well-formed.
+		const html = `<!doctype html><html><head><title>H</title></head></html>`;
+		const dom = mkDom(html);
+		const meta = await extractMetaFromDocument(asWindow(dom), {
+			url: URL,
+			html,
+			headers: {
+				'content-type': 'text/html; charset=utf-8',
+				'x-powered-by': 'Express',
+			},
+			statusCode: 200,
+		});
+		expect(meta.title).toBe('H');
+		expect(Array.isArray(meta.tags.entries)).toBe(true);
+	});
+	it('records parseError for malformed inline JSON-LD', async () => {
+		const html = `<!doctype html>
+			<html>
+				<head>
+					<title>JE</title>
+					<script type="application/ld+json">{ this is not valid json</script>
+				</head>
+			</html>`;
+		const dom = mkDom(html);
+		const meta = await extractMetaFromDocument(asWindow(dom), { url: URL, html });
+		expect(meta.jsonLd).toHaveLength(1);
+		const first = meta.jsonLd[0];
+		expect(first?.parsed).toBeUndefined();
+		expect(typeof first?.parseError).toBe('string');
+	});
+});

package/src/extract-meta.ts ADDED Viewed

@@ -0,0 +1,121 @@
+/**
+ * Public, Puppeteer-free entry point for extracting {@link Meta} from an
+ * already-parsed DOM (e.g. jsdom).
+ *
+ * WHY this exists alongside `Scraper.scrapeStart()` / `getMeta(page, …)`:
+ * callers who already have an HTML string (from `fetch`, a fixture, an
+ * archive) should not be forced to spin up Chromium just to read a few `<meta>`
+ * tags. This module reuses the same `collectHead → detectTags → classify`
+ * pipeline as the Puppeteer path — the `Meta` shape returned here is
+ * identical to what `Scraper` produces, so downstream consumers do not branch
+ * on the source.
+ *
+ * See {@link extractMetaFromDocument} for the usage example.
+ * @module
+ */
+import type { Meta } from './types.js';
+import { classify } from './meta/classify.js';
+import { collectHeadFromDocument, WINDOW_GLOBALS_TO_CHECK } from './meta/collect-head.js';
+import { detectTags } from './meta/tag-detection.js';
+/**
+ * Inputs for {@link extractMetaFromDocument}.
+ *
+ * `url`/`statusCode`/`headers` mirror the inputs to the underlying
+ * `simple-wappalyzer` driver. They are not consumed by the DOM-walk side of
+ * the pipeline: `extractMetaFromDocument` forwards them to `detectTags` only.
+ *
+ * `html` is optional: when omitted, `document.documentElement.outerHTML` is
+ * read off the passed window — matching the fallback `getMeta(page, …)` does
+ * via `page.content()`.
+ *
+ * All fields are `readonly`; the context object is never written to.
+ */
+export type ExtractMetaContext = {
+	/**
+	 * The fully resolved URL of the page (used by Wappalyzer + AMP fields).
+	 *
+	 * NOTE(review): within `extractMetaFromDocument` this value is only passed
+	 * to `detectTags`; the AMP usage is not visible from this module — confirm
+	 * against the tag-detection / classify layers.
+	 */
+	readonly url: string;
+	/**
+	 * Rendered HTML used for technology detection. Defaults to
+	 * `window.document.documentElement.outerHTML` when omitted.
+	 *
+	 * WHY allow override: callers that fetched the raw HTML string from the
+	 * network already have the *pre-script-execution* markup, which is what
+	 * Wappalyzer's HTML patterns are tuned for. The serialized DOM from
+	 * `outerHTML` reflects whatever scripts have already mutated; provide the
+	 * raw string to get more stable detections.
+	 */
+	readonly html?: string;
+	/**
+	 * HTTP status code, surfaced to the Wappalyzer driver. When `undefined`,
+	 * the key is left out of the `detectTags` input entirely rather than
+	 * being passed as `undefined`.
+	 */
+	readonly statusCode?: number;
+	/**
+	 * Response headers; case is preserved by the caller, lowercased internally
+	 * by `detectTags`. When `undefined`, the key is left out of the
+	 * `detectTags` input entirely.
+	 */
+	readonly headers?: Record<string, string | string[] | undefined>;
+	/**
+	 * When `true`, the returned `Meta` includes `_raw: RawHeadEntry[]` for
+	 * debugging. Default `false` to keep the serialized payload small.
+	 * A falsy value means no `includeRaw` key is passed to `classify` at all.
+	 */
+	readonly includeRaw?: boolean;
+};
+/**
+ * Builds a `Meta` object from a caller-supplied DOM, without Puppeteer.
+ *
+ * The work happens in three stages:
+ *
+ * 1. {@link collectHeadFromDocument} is called synchronously with the window
+ *    and `WINDOW_GLOBALS_TO_CHECK`, yielding the raw head entries.
+ * 2. {@link detectTags} is awaited with the URL, the HTML and — only when the
+ *    caller supplied them — the status code and headers (`simple-wappalyzer`
+ *    based technology detection).
+ * 3. {@link classify} merges the raw entries and the detected tags into the
+ *    final `Meta`.
+ *
+ * Stages (1) and (2) do not depend on each other, but they intentionally run
+ * one after the other: the DOM walk is synchronous, so there is little to
+ * gain from overlapping it with the asynchronous Wappalyzer call.
+ *
+ * The HTML given to stage (2) is `context.html` when provided, otherwise
+ * `window.document.documentElement.outerHTML`.
+ * @param window - The window whose `document` is inspected. jsdom's
+ *                 `dom.window` works; any object satisfying the `Window` type
+ *                 is accepted. The document is only read here, and
+ *                 `collectHeadFromDocument` is expected not to mutate it.
+ * @param context - URL / HTML / headers / status code / `includeRaw` inputs.
+ *                  See {@link ExtractMetaContext}.
+ * @returns The extracted `Meta` (always defined; empty fields stay empty).
+ * @example
+ * ```ts
+ * import { JSDOM } from 'jsdom';
+ * import { extractMetaFromDocument } from '@d-zero/beholder';
+ *
+ * const url = 'https://example.com/';
+ * const html = await (await fetch(url)).text();
+ * const dom = new JSDOM(html, { url });
+ *
+ * // jsdom's `DOMWindow` is not structurally identical to lib.dom's `Window`
+ * // (a few rare globals differ), hence the `as unknown as Window` cast; the
+ * // runtime shape is compatible for this function's needs.
+ * const meta = await extractMetaFromDocument(dom.window as unknown as Window, {
+ *   url,
+ *   html,
+ * });
+ *
+ * meta.title;         // <title>
+ * meta.og?.image;     // og:image[]
+ * meta.tags.entries;  // Wappalyzer detections + extracted IDs
+ * ```
+ */
+export async function extractMetaFromDocument(
+	window: Window,
+	context: ExtractMetaContext,
+): Promise<Meta> {
+	const { url, statusCode, headers, includeRaw } = context;
+	// Stage 1: synchronous DOM walk.
+	const headEntries = collectHeadFromDocument(window, WINDOW_GLOBALS_TO_CHECK);
+	// Prefer the caller's markup; otherwise serialize the current DOM.
+	const markup = context.html ?? window.document.documentElement.outerHTML;
+	// Stage 2: optional keys are spread in only when defined, so no key is
+	// ever passed with an explicit `undefined` value.
+	const detected = await detectTags({
+		url,
+		html: markup,
+		...(statusCode === undefined ? {} : { statusCode }),
+		...(headers === undefined ? {} : { headers }),
+	});
+	// Stage 3: fold both signals into the typed result.
+	return classify(headEntries, {
+		tags: detected,
+		...(includeRaw ? { includeRaw: true } : {}),
+	});
+}

package/src/index.ts CHANGED Viewed

@@ -12,6 +12,8 @@
  */
 export { default as default } from './scraper.js';
 export { isError } from './is-error.js';
+export { extractMetaFromDocument } from './extract-meta.js';
+export type { ExtractMetaContext } from './extract-meta.js';
 export { detectCompress } from '@d-zero/shared/detect-compress';
 export type { CompressType } from '@d-zero/shared/detect-compress';
 export { detectCDN } from '@d-zero/shared/detect-cdn';
@@ -25,4 +27,47 @@ export type {
 	ImageElement,
 	SkippedPageData,
 	NetworkLog,
+	OpenGraphMeta,
+	OgArticleMeta,
+	OgBookMeta,
+	OgProfileMeta,
+	OgMusicMeta,
+	OgVideoNsMeta,
+	TwitterMeta,
+	FbMeta,
+	FediverseMeta,
+	AppleMeta,
+	MsApplicationMeta,
+	VerificationMeta,
+	GoogleMeta,
+	GeoMeta,
+	CitationMeta,
+	RdfaMeta,
+	MicrodataMeta,
+	AmpMeta,
+	LegacyMeta,
+	MobileMeta,
+	MicroformatsMeta,
+	PinterestMeta,
+	SlackMeta,
+	LinkedInMeta,
+	ExperimentalMeta,
+	WikiMeta,
+	LinkMeta,
+	LinkEntry,
+	JsonLdEntry,
+	OthersBucket,
+	ScriptEntry,
+	IframeEntry,
+	TagsMeta,
+	TagDetail,
+	TagEntry,
+	TagSource,
+	ViewportMeta,
+	RobotsMeta,
+	ReferrerMeta,
+	FormatDetectionMeta,
+	HttpEquivMeta,
+	HttpEquivRefresh,
+	RawHeadEntry,
 } from './types.js';