langtell 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -4,20 +4,21 @@
4
4
 
5
5
  `langtell` infers the language of short strings — titles, snippets, headlines —
6
6
  by **fusing evidence from many signals** into a single weighted verdict with a
7
- confidence score and an auditable trail. It reads the *tells*: the script and
7
+ confidence score and an auditable trail. It reads the _tells_: the script and
8
8
  distinctive letters of the text, the `<html lang>` / `og:locale` / meta tags of
9
9
  the page it came from, the HTTP `Content-Language` header, and — optionally —
10
10
  heavier statistical engines like [franc](https://github.com/wooorm/franc) or the
11
11
  on-device Chrome AI language detector.
12
12
 
13
13
  It is **not** another trigram detector competing with franc/cld3/tinyld. Those
14
- answer *"what language is this body of text?"* from the characters alone.
15
- `langtell` answers *"what language is this **title**, given the page, transport,
16
- and source it arrived in?"* — and shows its work.
14
+ answer _"what language is this body of text?"_ from the characters alone.
15
+ `langtell` answers _"what language is this **title**, given the page, transport,
16
+ and source it arrived in?"_ — and shows its work.
17
17
 
18
- > **Status:** design preview. The API below is the committed design; the
19
- > implementation is in progress. This `0.0.x` release reserves the name and
20
- > documents the design — it has no runtime yet.
18
+ > **Status:** early. The core detector (candidate-relative script/letter
19
+ > scoring, the BCP-47-aware fuser with the context-vs-script guard, and the
20
+ > opt-in franc and Chrome AI engines) is implemented and tested. The API below
21
+ > reflects the committed design.
21
22
 
22
23
  ## Why
23
24
 
@@ -26,7 +27,7 @@ and source it arrived in?"* — and shows its work.
26
27
  out-of-band metadata that a pure text detector never sees.
27
28
  - **Auditable, not magic.** Every verdict carries the list of signals that
28
29
  produced it (`evidence[]`), each with its kind, language, confidence, and raw
29
- value — so you can debug *why* a title was classified the way it was.
30
+ value — so you can debug _why_ a title was classified the way it was.
30
31
  - **Pay only for what you use.** The zero-dependency core (script + HTML + header
31
32
  signals) is fully synchronous. Heavy engines (franc's trigram tables, the
32
33
  browser detector) live behind their own subpaths and only enter your bundle —
@@ -36,41 +37,59 @@ and source it arrived in?"* — and shows its work.
36
37
 
37
38
  ```ts
38
39
  import { compile } from "langtell";
40
+ import { uk, ru, en } from "langtell/profiles"; // ready-made roster data
39
41
 
40
42
  // compile() does the per-roster setup once; call the returned fn many times.
41
- const detect = compile({ candidates: [UK, RU, EN] });
43
+ const detect = compile({ candidates: [uk, ru, en] });
42
44
 
43
45
  const result = detect({
44
46
  text: "Їжак Сонік",
45
- html, // optional: <html lang>, og:locale, meta content-language
47
+ html, // optional: <html lang>, og:locale, meta content-language
46
48
  responseHeaders, // optional: HTTP Content-Language
47
49
  });
48
50
  // → { language: "uk", confidence: 0.9x, evidence: [{ kind: "title-script", ... }, ...] }
49
51
  ```
50
52
 
51
- Add a heavy engine — it stays behind its own import door, and the return type
52
- becomes `Promise` automatically because the engine is async:
53
+ Add the franc engine — it stays behind its own import door so its trigram tables
54
+ never reach a bundle that doesn't use it. franc runs in-process and
55
+ synchronously, so `detect` stays synchronous:
53
56
 
54
57
  ```ts
55
- import { compile } from "langtell";
56
- import { francEngine } from "langtell/franc";
58
+ import { compile } from "langtell";
59
+ import { uk, ru, en } from "langtell/profiles";
60
+ import { createFrancEngine } from "langtell/franc";
61
+
62
+ const candidates = [uk, ru, en];
63
+ const detect = compile({ candidates, engines: [createFrancEngine(candidates)] });
64
+ const result = detect({ text, html, responseHeaders });
65
+ ```
66
+
67
+ Register the on-device Chrome AI engine and the return type becomes `Promise`
68
+ automatically, because that engine is async:
69
+
70
+ ```ts
71
+ import { compile } from "langtell";
72
+ import { uk, ru, en } from "langtell/profiles";
73
+ import { chromeAiEngine } from "langtell/chrome-ai";
57
74
 
58
- const detect = compile({ candidates: [UK, RU, EN], engines: [francEngine] });
59
- const result = await detect({ text, html, responseHeaders });
75
+ const detect = compile({ candidates: [uk, ru, en], engines: [chromeAiEngine] });
76
+ const result = await detect({ text }); // Promise<Classification>
60
77
  ```
61
78
 
62
79
  ## API at a glance
63
80
 
64
- | Export | Role |
65
- | --- | --- |
66
- | `compile(config)` | Build a configured `detect` function (does the precompute once). |
67
- | `detect(input)` | The compiled detector. Sync or `Promise`, by config — see below. |
68
- | `evidenceFromText(text)` | Producer: script + distinctive-letter signals. Zero-dep, sync. |
69
- | `evidenceFromHtml(html)` | Producer: `<html lang>`, meta content-language, `og:locale`. Zero-dep, sync. |
70
- | `evidenceFromHeaders(h)` | Producer: HTTP `Content-Language`. Zero-dep, sync. |
71
- | `fuse(evidence, opts?)` | Weighted blend + "context never overrides clear script" guard. |
72
- | `langtell/franc` | Opt-in franc engine (pulls trigram tables). |
73
- | `langtell/chrome-ai` | Opt-in on-device Chrome AI engine (browser). |
81
+ | Export | Role |
82
+ | ------------------------------------- | ------------------------------------------------------------------------------- |
83
+ | `compile(config)` | Build a configured `detect` function (does the precompute once). |
84
+ | `detect(input)` | The compiled detector. Sync or `Promise`, by config — see below. |
85
+ | `evidenceFromText(text, candidates?)` | Producer: roster-relative script + distinctive-letter signals. Zero-dep, sync. |
86
+ | `evidenceFromHtml(html)` | Producer: `<html lang>`, meta content-language, `og:locale`. Zero-dep, sync. |
87
+ | `evidenceFromHeaders(h)` | Producer: HTTP `Content-Language`. Zero-dep, sync. |
88
+ | `normalizeBCP47(tag)` | Normalize a BCP-47 tag/alias to a canonical code (`uk-UA`/`ua` → `uk`). |
89
+ | `fuse(evidence, opts?)` | Weighted blend + "context never overrides clear script" guard. |
90
+ | `langtell/profiles` | Ready-made `LanguageProfile` data (uk/ru/be/bg/en). Opt-in (carries word data). |
91
+ | `langtell/franc` | Opt-in franc engine (pulls trigram tables). Sync. |
92
+ | `langtell/chrome-ai` | Opt-in on-device Chrome AI engine (browser). Async. |
74
93
 
75
94
  `detect` returns a plain `Classification` when every registered source is
76
95
  synchronous, and `Promise<Classification>` the moment an async engine is in the
@@ -0,0 +1,29 @@
1
+ import { A as AsyncSource } from './types-D4Ux-xA6.js';
2
+
3
+ /**
4
+ * `langtell/chrome-ai` — the opt-in on-device engine wrapping the browser's
5
+ * `LanguageDetector` API (Gemini Nano on Chrome 138+ / Edge). Lives behind its
6
+ * own subpath; the zero-dependency core never imports it.
7
+ *
8
+ * Opportunistic: it never triggers a model download. Availability (per Chrome
9
+ * docs):
10
+ * - `available` — model loaded, ready. We're available.
11
+ * - `downloadable` — could be fetched on demand. Treated as unavailable so we
12
+ * never initiate a download the user hasn't consented to.
13
+ * - `downloading` — same reasoning; wait for the model to land.
14
+ * - `unavailable` — Chrome's flat-out no. Skip.
15
+ *
16
+ * Emits `kind: "chrome-ai"` evidence with the model's own confidence. An
17
+ * `AsyncSource`: registering it flips the compiled `detect` to `Promise`-typed.
18
+ */
19
+
20
+ /**
21
+ * Build a chrome-ai {@link AsyncSource}. State (availability + session) is
22
+ * cached per instance: once availability is confirmed it is not re-probed for
23
+ * the instance's lifetime, and the detector session is created once and reused.
24
+ */
25
+ declare function createChromeAiEngine(): AsyncSource;
26
+ /** A ready-to-register chrome-ai engine instance. */
27
+ declare const chromeAiEngine: AsyncSource;
28
+
29
+ export { chromeAiEngine, createChromeAiEngine };
@@ -0,0 +1,71 @@
1
+ // src/chrome-ai.ts
2
// Default cap on how much of the input text is handed to the detector
// (used as the fallback for `ctx.maxChars`).
const DEFAULT_MAX_CHARS = 2000;

// Minimum confidence of the top result; anything lower makes the engine abstain.
const CONFIDENCE_THRESHOLD = 0.6;

/**
 * Look up the `LanguageDetector` global.
 *
 * @returns {object|null} The global when it is defined, otherwise `null`.
 */
function getApi() {
  const { LanguageDetector: detectorApi } = globalThis;
  if (detectorApi === undefined || detectorApi === null) return null;
  return detectorApi;
}
8
/**
 * Build a chrome-ai async source on top of the `LanguageDetector` global.
 *
 * State is kept per instance:
 * - Availability is cached only once it is final: `true` after the API reports
 *   `"available"`, `false` when the API is missing or reports `"unavailable"`.
 *   The transient `"downloadable"` / `"downloading"` states answer `false`
 *   without being cached, so the engine becomes usable once the model lands.
 * - The detector session is created once. Creation is shared between
 *   concurrent `detect()` calls, and a failed creation is forgotten so a later
 *   call can retry.
 *
 * @returns {{id: string, sync: boolean, inputs: string[], isAvailable: Function, detect: Function}}
 *   The source object; `detect` resolves to zero or one `kind: "chrome-ai"`
 *   evidence item.
 */
function createChromeAiEngine() {
  // null = not settled yet; true/false = final answer for this instance.
  let cachedAvailability = null;
  // Promise of the detector session, shared by every caller.
  let sessionPromise = null;

  async function checkAvailability() {
    if (cachedAvailability !== null) return cachedAvailability;
    const api = getApi();
    if (!api) {
      cachedAvailability = false;
      return false;
    }
    const state = await api.availability();
    if (state === "available") {
      cachedAvailability = true;
      return true;
    }
    // Only a flat "unavailable" is final. "downloadable"/"downloading" can
    // still turn into "available", so they must be re-probed next time.
    if (state === "unavailable") cachedAvailability = false;
    return false;
  }

  function getSession() {
    if (sessionPromise !== null) return sessionPromise;
    const pending = (async () => {
      const api = getApi();
      if (!api) throw new Error("chrome-ai: LanguageDetector API missing");
      return api.create();
    })();
    sessionPromise = pending;
    // Drop a failed attempt so the next detect() can try again; callers still
    // observe the rejection through the promise they were handed.
    pending.catch(() => {
      if (sessionPromise === pending) sessionPromise = null;
    });
    return pending;
  }

  return {
    id: "chrome-ai",
    sync: false,
    inputs: ["text"],
    isAvailable() {
      if (cachedAvailability !== null) return cachedAvailability;
      // No API at all: answer synchronously, without a promise round-trip.
      if (!getApi()) {
        cachedAvailability = false;
        return false;
      }
      return checkAvailability();
    },
    async detect(input, ctx = {}) {
      const text = input.text;
      // `== null` also covers a null text (e.g. a missing DOM textContent).
      if (text == null || text.trim().length === 0) return [];
      const session = await getSession();
      const sample = text.slice(0, ctx.maxChars ?? DEFAULT_MAX_CHARS);
      const results = await session.detect(sample);
      const top = results[0];
      if (!top || top.confidence < CONFIDENCE_THRESHOLD) return [];
      // "und" means the detector could not determine a language; that is an
      // abstention, not a verdict.
      if (top.detectedLanguage === "und") return [];
      return [
        {
          kind: "chrome-ai",
          language: top.detectedLanguage,
          confidence: clamp01(top.confidence),
          source: "chrome-ai",
          value: top.detectedLanguage,
        },
      ];
    },
  };
}
61
+ var chromeAiEngine = createChromeAiEngine();
62
/**
 * Clamp a number into the closed interval [0, 1].
 *
 * @param {number} value - Candidate confidence.
 * @returns {number} `0` for non-finite or negative input, `1` for anything
 *   above one, otherwise `value` unchanged.
 */
function clamp01(value) {
  if (!Number.isFinite(value) || value < 0) return 0;
  return value > 1 ? 1 : value;
}
68
+
69
+ export { chromeAiEngine, createChromeAiEngine };
70
+ //# sourceMappingURL=chrome-ai.js.map
71
+ //# sourceMappingURL=chrome-ai.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/chrome-ai.ts"],"names":[],"mappings":";AAkBA,IAAM,iBAAA,GAAoB,GAAA;AAG1B,IAAM,oBAAA,GAAuB,GAAA;AAkB7B,SAAS,MAAA,GAAqC;AAC5C,EAAA,MAAM,SAAA,GAAY,UAAA;AAClB,EAAA,OAAO,UAAU,gBAAA,IAAoB,IAAA;AACvC;AAOO,SAAS,oBAAA,GAAoC;AAClD,EAAA,IAAI,kBAAA,GAAqC,IAAA;AACzC,EAAA,IAAI,aAAA,GAAgD,IAAA;AAEpD,EAAA,eAAe,iBAAA,GAAsC;AACnD,IAAA,IAAI,kBAAA,KAAuB,MAAM,OAAO,kBAAA;AACxC,IAAA,MAAM,MAAM,MAAA,EAAO;AACnB,IAAA,IAAI,CAAC,GAAA,EAAK;AACR,MAAA,kBAAA,GAAqB,KAAA;AACrB,MAAA,OAAO,KAAA;AAAA,IACT;AACA,IAAA,MAAM,KAAA,GAAQ,MAAM,GAAA,CAAI,YAAA,EAAa;AACrC,IAAA,kBAAA,GAAqB,KAAA,KAAU,WAAA;AAC/B,IAAA,OAAO,kBAAA;AAAA,EACT;AAEA,EAAA,eAAe,UAAA,GAA+C;AAC5D,IAAA,IAAI,eAAe,OAAO,aAAA;AAC1B,IAAA,MAAM,MAAM,MAAA,EAAO;AACnB,IAAA,IAAI,CAAC,GAAA,EAAK,MAAM,IAAI,MAAM,yCAAyC,CAAA;AACnE,IAAA,aAAA,GAAgB,MAAM,IAAI,MAAA,EAAO;AACjC,IAAA,OAAO,aAAA;AAAA,EACT;AAEA,EAAA,OAAO;AAAA,IACL,EAAA,EAAI,WAAA;AAAA,IACJ,IAAA,EAAM,KAAA;AAAA,IACN,MAAA,EAAQ,CAAC,MAAM,CAAA;AAAA,IACf,WAAA,GAA0C;AACxC,MAAA,IAAI,kBAAA,KAAuB,MAAM,OAAO,kBAAA;AAGxC,MAAA,IAAI,CAAC,QAAO,EAAG;AACb,QAAA,kBAAA,GAAqB,KAAA;AACrB,QAAA,OAAO,KAAA;AAAA,MACT;AACA,MAAA,OAAO,iBAAA,EAAkB;AAAA,IAC3B,CAAA;AAAA,IACA,MAAM,MAAA,CAAO,KAAA,EAAO,GAAA,GAAqB,EAAC,EAAgC;AACxE,MAAA,MAAM,OAAO,KAAA,CAAM,IAAA;AACnB,MAAA,IAAI,IAAA,KAAS,UAAa,IAAA,CAAK,IAAA,GAAO,MAAA,KAAW,CAAA,SAAU,EAAC;AAC5D,MAAA,MAAM,OAAA,GAAU,MAAM,UAAA,EAAW;AACjC,MAAA,MAAM,SAAS,IAAA,CAAK,KAAA,CAAM,CAAA,EAAG,GAAA,CAAI,YAAY,iBAAiB,CAAA;AAC9D,MAAA,MAAM,OAAA,GAAU,MAAM,OAAA,CAAQ,MAAA,CAAO,MAAM,CAAA;AAC3C,MAAA,MAAM,GAAA,GAAM,QAAQ,CAAC,CAAA;AACrB,MAAA,IAAI,CAAC,GAAA,IAAO,GAAA,CAAI,UAAA,GAAa,oBAAA,SAA6B,EAAC;AAC3D,MAAA,OAAO;AAAA,QACL;AAAA,UACE,IAAA,EAAM,WAAA;AAAA,UACN,UAAU,GAAA,CAAI,gBAAA;AAAA,UACd,UAAA,EAAY,OAAA,CAAQ,GAAA,CAAI,UAAU,CAAA;AAAA,UAClC,MAAA,EAAQ,WAAA;AAAA,UACR,OAAO,GAAA,CAAI;AAAA;AACb,OACF;AAAA,IACF;AAAA,GACF;AACF;AAGO,IAAM,iBAA8B,oBAAA;AAE3C,SAAS,QAAQ,KAAA,EAAuB;AACtC,EAAA,IAAI,CAAC,MAAA,CAAO,QAAA,CAAS,KAAK,GAAG,OAAO,CAAA;AACpC,EAAA,IAAI,KAAA,GAAQ,GAAG,OAAO,CAAA;AACtB,EAAA,IAAI,KAAA,GAAQ
,GAAG,OAAO,CAAA;AACtB,EAAA,OAAO,KAAA;AACT","file":"chrome-ai.js","sourcesContent":["/**\n * `langtell/chrome-ai` — the opt-in on-device engine wrapping the browser's\n * `LanguageDetector` API (Gemini Nano on Chrome 138+ / Edge). Lives behind its\n * own subpath; the zero-dependency core never imports it.\n *\n * Opportunistic: it never triggers a model download. Availability (per Chrome\n * docs):\n * - `available` — model loaded, ready. We're available.\n * - `downloadable` — could be fetched on demand. Treated as unavailable so we\n * never initiate a download the user hasn't consented to.\n * - `downloading` — same reasoning; wait for the model to land.\n * - `unavailable` — Chrome's flat-out no. Skip.\n *\n * Emits `kind: \"chrome-ai\"` evidence with the model's own confidence. An\n * `AsyncSource`: registering it flips the compiled `detect` to `Promise`-typed.\n */\nimport type { AsyncSource, DetectContext, LanguageEvidence } from \"./types.js\";\n\nconst DEFAULT_MAX_CHARS = 2000;\n/** Minimum top-language confidence to count as a confident detection; below\n * this we abstain rather than risk a thin-plurality misclassification. */\nconst CONFIDENCE_THRESHOLD = 0.6;\n\ntype AvailabilityState = \"available\" | \"downloadable\" | \"downloading\" | \"unavailable\";\n\ninterface LanguageDetectorResult {\n detectedLanguage: string;\n confidence: number;\n}\n\ninterface LanguageDetectorSession {\n detect(text: string): Promise<LanguageDetectorResult[]>;\n}\n\ninterface LanguageDetectorApi {\n availability(): Promise<AvailabilityState>;\n create(): Promise<LanguageDetectorSession>;\n}\n\nfunction getApi(): LanguageDetectorApi | null {\n const globalRef = globalThis as unknown as { LanguageDetector?: LanguageDetectorApi };\n return globalRef.LanguageDetector ?? null;\n}\n\n/**\n * Build a chrome-ai {@link AsyncSource}. 
State (availability + session) is\n * cached per instance: once availability is confirmed it is not re-probed for\n * the instance's lifetime, and the detector session is created once and reused.\n */\nexport function createChromeAiEngine(): AsyncSource {\n let cachedAvailability: boolean | null = null;\n let cachedSession: LanguageDetectorSession | null = null;\n\n async function checkAvailability(): Promise<boolean> {\n if (cachedAvailability !== null) return cachedAvailability;\n const api = getApi();\n if (!api) {\n cachedAvailability = false;\n return false;\n }\n const state = await api.availability();\n cachedAvailability = state === \"available\";\n return cachedAvailability;\n }\n\n async function getSession(): Promise<LanguageDetectorSession> {\n if (cachedSession) return cachedSession;\n const api = getApi();\n if (!api) throw new Error(\"chrome-ai: LanguageDetector API missing\");\n cachedSession = await api.create();\n return cachedSession;\n }\n\n return {\n id: \"chrome-ai\",\n sync: false,\n inputs: [\"text\"],\n isAvailable(): boolean | Promise<boolean> {\n if (cachedAvailability !== null) return cachedAvailability;\n // No API at all — return false synchronously so the common (non-Chrome)\n // case skips without a promise round-trip.\n if (!getApi()) {\n cachedAvailability = false;\n return false;\n }\n return checkAvailability();\n },\n async detect(input, ctx: DetectContext = {}): Promise<LanguageEvidence[]> {\n const text = input.text;\n if (text === undefined || text.trim().length === 0) return [];\n const session = await getSession();\n const sample = text.slice(0, ctx.maxChars ?? 
DEFAULT_MAX_CHARS);\n const results = await session.detect(sample);\n const top = results[0];\n if (!top || top.confidence < CONFIDENCE_THRESHOLD) return [];\n return [\n {\n kind: \"chrome-ai\",\n language: top.detectedLanguage,\n confidence: clamp01(top.confidence),\n source: \"chrome-ai\",\n value: top.detectedLanguage,\n },\n ];\n },\n };\n}\n\n/** A ready-to-register chrome-ai engine instance. */\nexport const chromeAiEngine: AsyncSource = createChromeAiEngine();\n\nfunction clamp01(value: number): number {\n if (!Number.isFinite(value)) return 0;\n if (value < 0) return 0;\n if (value > 1) return 1;\n return value;\n}\n"]}
@@ -0,0 +1,36 @@
1
+ import { primarySubtag } from './chunk-OVSPOZ5J.js';
2
+
3
+ // src/headers.ts
4
/**
 * Producer: the HTTP `Content-Language` response header.
 *
 * @param {Headers|Object|null|undefined} headers - A `Headers` instance or a
 *   plain header record. A nullish bag produces no evidence.
 * @returns {Array<object>} Zero or one `kind: "http-content-language"` evidence
 *   item with a fixed confidence of 0.8. Empty when `primarySubtag` returns
 *   `null` for the header value.
 */
function evidenceFromHeaders(headers) {
  // `== null` covers undefined and null; a null bag used to fall through to
  // Object.entries(null) and throw.
  if (headers == null) return [];

  const value = getHeader(headers, "content-language");
  const lang = primarySubtag(value);
  if (lang === null) return [];

  return [
    {
      kind: "http-content-language",
      language: lang,
      confidence: 0.8,
      source: "http-content-language",
      value: value ?? "",
    },
  ];
}
19
/**
 * Read one header from either a `Headers` instance or a plain record.
 *
 * Plain-record keys are compared after lower-casing them, so `name` is
 * expected to be lower-case already. Array values are joined with ",". A
 * missing or nullish value yields `undefined`.
 *
 * @param {Headers|Object} headers - Header bag to read from.
 * @param {string} name - Lower-case header name.
 * @returns {string|undefined} The header value, if any.
 */
function getHeader(headers, name) {
  if (isHeaders(headers)) {
    const found = headers.get(name);
    return found === null || found === undefined ? undefined : found;
  }

  // Only the first matching key counts, exactly like a scan that returns early.
  const hit = Object.entries(headers).find(([key]) => key.toLowerCase() === name);
  if (hit === undefined) return undefined;

  const [, raw] = hit;
  if (Array.isArray(raw)) return raw.join(",");
  return raw ?? undefined;
}
30
/**
 * Tell whether `headers` is an instance of the global `Headers` class.
 * Always `false` in environments where `Headers` is not defined.
 *
 * @param {*} headers - Value to test.
 * @returns {boolean}
 */
function isHeaders(headers) {
  if (typeof Headers === "undefined") return false;
  return headers instanceof Headers;
}
33
+
34
+ export { evidenceFromHeaders };
35
+ //# sourceMappingURL=chunk-3LDE35U2.js.map
36
+ //# sourceMappingURL=chunk-3LDE35U2.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/headers.ts"],"names":[],"mappings":";;;AAIO,SAAS,oBAAoB,OAAA,EAAoD;AACtF,EAAA,IAAI,OAAA,KAAY,MAAA,EAAW,OAAO,EAAC;AAEnC,EAAA,MAAM,KAAA,GAAQ,SAAA,CAAU,OAAA,EAAS,kBAAkB,CAAA;AACnD,EAAA,MAAM,IAAA,GAAO,cAAc,KAAK,CAAA;AAChC,EAAA,IAAI,IAAA,KAAS,IAAA,EAAM,OAAO,EAAC;AAE3B,EAAA,OAAO;AAAA,IACL;AAAA,MACE,IAAA,EAAM,uBAAA;AAAA,MACN,QAAA,EAAU,IAAA;AAAA,MACV,UAAA,EAAY,GAAA;AAAA,MACZ,MAAA,EAAQ,uBAAA;AAAA,MACR,OAAO,KAAA,IAAS;AAAA;AAClB,GACF;AACF;AAEA,SAAS,SAAA,CAAU,SAAoB,IAAA,EAAkC;AACvE,EAAA,IAAI,SAAA,CAAU,OAAO,CAAA,EAAG;AACtB,IAAA,OAAO,OAAA,CAAQ,GAAA,CAAI,IAAI,CAAA,IAAK,MAAA;AAAA,EAC9B;AACA,EAAA,KAAA,MAAW,CAAC,GAAA,EAAK,KAAK,KAAK,MAAA,CAAO,OAAA,CAAQ,OAAO,CAAA,EAAG;AAClD,IAAA,IAAI,GAAA,CAAI,WAAA,EAAY,KAAM,IAAA,EAAM;AAChC,IAAA,IAAI,MAAM,OAAA,CAAQ,KAAK,GAAG,OAAO,KAAA,CAAM,KAAK,GAAG,CAAA;AAC/C,IAAA,OAAO,KAAA,IAAS,MAAA;AAAA,EAClB;AACA,EAAA,OAAO,MAAA;AACT;AAEA,SAAS,UAAU,OAAA,EAAwC;AACzD,EAAA,OAAO,OAAO,OAAA,KAAY,WAAA,IAAe,OAAA,YAAmB,OAAA;AAC9D","file":"chunk-3LDE35U2.js","sourcesContent":["import type { HeaderBag, LanguageEvidence } from \"./types.js\";\nimport { primarySubtag } from \"./internal/bcp47.js\";\n\n/** Producer: the HTTP `Content-Language` response header. */\nexport function evidenceFromHeaders(headers: HeaderBag | undefined): LanguageEvidence[] {\n if (headers === undefined) return [];\n\n const value = getHeader(headers, \"content-language\");\n const lang = primarySubtag(value);\n if (lang === null) return [];\n\n return [\n {\n kind: \"http-content-language\",\n language: lang,\n confidence: 0.8,\n source: \"http-content-language\",\n value: value ?? \"\",\n },\n ];\n}\n\nfunction getHeader(headers: HeaderBag, name: string): string | undefined {\n if (isHeaders(headers)) {\n return headers.get(name) ?? undefined;\n }\n for (const [key, value] of Object.entries(headers)) {\n if (key.toLowerCase() !== name) continue;\n if (Array.isArray(value)) return value.join(\",\");\n return value ?? 
undefined;\n }\n return undefined;\n}\n\nfunction isHeaders(headers: HeaderBag): headers is Headers {\n return typeof Headers !== \"undefined\" && headers instanceof Headers;\n}\n"]}
@@ -0,0 +1,35 @@
1
+ import { classifyBySnippet } from './chunk-RFR5I7P7.js';
2
+
3
+ // src/text.ts
4
/**
 * Producer: candidate-relative evidence derived from the title text.
 *
 * Delegates to `classifyBySnippet(text, candidates, rung3)` and turns its
 * verdict into at most one `kind: "title-script"` evidence item, with the
 * confidence computed by `marginToConfidence(verdict.margin, verdict.rung)`.
 * Abstains (returns `[]`) when the text is nullish or blank, when there are no
 * candidates, or when the verdict's language is `"unknown"`.
 *
 * @param {string|null|undefined} text - The title or snippet.
 * @param {ReadonlyArray<object>|null} [candidates] - Roster of language profiles.
 * @param {Function} [rung3] - Optional resolver forwarded to the classifier.
 * @returns {Array<object>} Zero or one evidence item; `value` is an excerpt of
 *   the trimmed text, at most 80 UTF-16 code units long.
 */
function evidenceFromText(text, candidates, rung3) {
  // `== null` covers undefined and null; null text or candidates used to throw.
  if (text == null) return [];
  if (candidates == null || candidates.length === 0) return [];

  const trimmed = text.trim();
  if (trimmed.length === 0) return [];

  const verdict = classifyBySnippet(text, candidates, rung3);
  if (verdict.language === "unknown") return [];

  const MAX_EXCERPT = 80;
  let excerpt = trimmed.slice(0, MAX_EXCERPT);
  if (trimmed.length > MAX_EXCERPT) {
    // The cut may land inside a surrogate pair. Drop the dangling high
    // surrogate so the excerpt stays well-formed (a lone surrogate makes
    // encodeURIComponent throw).
    const last = excerpt.charCodeAt(excerpt.length - 1);
    if (last >= 0xd800 && last <= 0xdbff) excerpt = excerpt.slice(0, -1);
  }

  return [
    {
      kind: "title-script",
      language: verdict.language,
      confidence: marginToConfidence(verdict.margin, verdict.rung),
      source: "title-script",
      value: excerpt,
    },
  ];
}
19
/**
 * Map the classifier's lead (`margin`) to a confidence value.
 *
 * - Rung 3: the margin is limited to [0, 1] and scaled into 0.4 .. 0.75.
 * - Any other rung: the margin is limited to [1, 4] and scaled into
 *   0.6875 .. 0.95, so a lead of 4 or more saturates.
 *
 * @param {number} margin - Winner's lead over the runner-up.
 * @param {number} rung - Which classifier rung decided.
 * @returns {number} Confidence after `clamp01`.
 */
function marginToConfidence(margin, rung) {
  let base;
  let share;
  if (rung === 3) {
    base = 0.4;
    share = Math.min(Math.max(margin, 0), 1);
  } else {
    base = 0.6;
    share = Math.min(Math.max(margin, 1), 4) / 4;
  }
  return clamp01(base + share * 0.35);
}
26
/**
 * Restrict a number to the closed interval [0, 1].
 *
 * @param {number} value - Number to restrict.
 * @returns {number} `0` when `value` is not a finite number or is negative,
 *   `1` when it exceeds one, and `value` itself otherwise.
 */
function clamp01(value) {
  const usable = Number.isFinite(value) && !(value < 0);
  if (!usable) return 0;
  return value > 1 ? 1 : value;
}
32
+
33
+ export { evidenceFromText };
34
+ //# sourceMappingURL=chunk-3SO2WI75.js.map
35
+ //# sourceMappingURL=chunk-3SO2WI75.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/text.ts"],"names":[],"mappings":";;;AAmBO,SAAS,gBAAA,CACd,IAAA,EACA,UAAA,EACA,KAAA,EACoB;AACpB,EAAA,IAAI,IAAA,KAAS,UAAa,IAAA,CAAK,IAAA,GAAO,MAAA,KAAW,CAAA,SAAU,EAAC;AAC5D,EAAA,IAAI,eAAe,MAAA,IAAa,UAAA,CAAW,MAAA,KAAW,CAAA,SAAU,EAAC;AAEjE,EAAA,MAAM,OAAA,GAAU,iBAAA,CAAkB,IAAA,EAAM,UAAA,EAAY,KAAK,CAAA;AACzD,EAAA,IAAI,OAAA,CAAQ,QAAA,KAAa,SAAA,EAAW,OAAO,EAAC;AAE5C,EAAA,OAAO;AAAA,IACL;AAAA,MACE,IAAA,EAAM,cAAA;AAAA,MACN,UAAU,OAAA,CAAQ,QAAA;AAAA,MAClB,UAAA,EAAY,kBAAA,CAAmB,OAAA,CAAQ,MAAA,EAAQ,QAAQ,IAAI,CAAA;AAAA,MAC3D,MAAA,EAAQ,cAAA;AAAA,MACR,OAAO,IAAA,CAAK,IAAA,EAAK,CAAE,KAAA,CAAM,GAAG,EAAE;AAAA;AAChC,GACF;AACF;AAWA,SAAS,kBAAA,CAAmB,QAAgB,IAAA,EAAoB;AAC9D,EAAA,IAAI,SAAS,CAAA,EAAG;AAEd,IAAA,OAAO,OAAA,CAAQ,GAAA,GAAM,IAAA,CAAK,GAAA,CAAI,IAAA,CAAK,GAAA,CAAI,MAAA,EAAQ,CAAC,CAAA,EAAG,CAAC,CAAA,GAAI,IAAI,CAAA;AAAA,EAC9D;AACA,EAAA,MAAM,IAAA,GAAO,IAAA,CAAK,GAAA,CAAI,MAAA,EAAQ,CAAC,CAAA;AAC/B,EAAA,OAAO,OAAA,CAAQ,MAAO,IAAA,CAAK,GAAA,CAAI,MAAM,CAAC,CAAA,GAAI,IAAK,IAAI,CAAA;AACrD;AAEA,SAAS,QAAQ,KAAA,EAAuB;AACtC,EAAA,IAAI,CAAC,MAAA,CAAO,QAAA,CAAS,KAAK,GAAG,OAAO,CAAA;AACpC,EAAA,IAAI,KAAA,GAAQ,GAAG,OAAO,CAAA;AACtB,EAAA,IAAI,KAAA,GAAQ,GAAG,OAAO,CAAA;AACtB,EAAA,OAAO,KAAA;AACT","file":"chunk-3SO2WI75.js","sourcesContent":["import type { LanguageEvidence, LanguageProfile } from \"./types.js\";\nimport { classifyBySnippet, type Rung, type Rung3Resolver } from \"./internal/classify.js\";\n\n/**\n * Producer: candidate-relative script + lexical signals from the title text.\n *\n * Wraps the ported snippet classifier ({@link classifyBySnippet}): noise strip →\n * dominant-script scope → distinctive letters (rung 1) → function words (2a) →\n * frequent words (2b). The `candidates` roster makes scoring roster-relative —\n * `і` decides Ukrainian only when Russian is also a candidate. Sync and\n * zero-dependency; the optional franc rung is injected via `rung3`.\n *\n * Emits at most one `kind: \"title-script\"` evidence item. 
The classifier's\n * integer `margin` (the winner's lead over the runner-up) maps to a 0..1\n * `confidence`: a verdict at all means the dominant script and the deciding rung\n * agreed, so the floor is high; a wider lead nudges it up. With no candidates\n * (or no usable distinctive signal) it abstains — emitting nothing rather than a\n * coarse \"unknown\", since the roster decides relevance.\n */\nexport function evidenceFromText(\n text: string | undefined,\n candidates?: readonly LanguageProfile[],\n rung3?: Rung3Resolver,\n): LanguageEvidence[] {\n if (text === undefined || text.trim().length === 0) return [];\n if (candidates === undefined || candidates.length === 0) return [];\n\n const verdict = classifyBySnippet(text, candidates, rung3);\n if (verdict.language === \"unknown\") return [];\n\n return [\n {\n kind: \"title-script\",\n language: verdict.language,\n confidence: marginToConfidence(verdict.margin, verdict.rung),\n source: \"title-script\",\n value: text.trim().slice(0, 80),\n },\n ];\n}\n\n/**\n * Map the classifier's per-rung lead to a 0..1 confidence.\n *\n * Rungs 1–2 carry an integer count of distinctive items (≥1). A verdict already\n * means script + rung agreed, so the floor is high (0.6) and each extra\n * distinctive item adds up to a 0.35 bonus, saturating by a lead of 4. Rung 3\n * (franc) carries franc's own 0..1 score-gap, which is weaker evidence, so it is\n * scaled into a 0.4..0.75 band.\n */\nfunction marginToConfidence(margin: number, rung: Rung): number {\n if (rung === 3) {\n // franc score-gap is already 0..1; weaker than the distinctive rungs.\n return clamp01(0.4 + Math.min(Math.max(margin, 0), 1) * 0.35);\n }\n const lead = Math.max(margin, 1);\n return clamp01(0.6 + (Math.min(lead, 4) / 4) * 0.35);\n}\n\nfunction clamp01(value: number): number {\n if (!Number.isFinite(value)) return 0;\n if (value < 0) return 0;\n if (value > 1) return 1;\n return value;\n}\n"]}
@@ -0,0 +1,27 @@
1
+ import { normalizeBCP47 } from './chunk-OVSPOZ5J.js';
2
+
3
+ // src/html.ts
4
/**
 * Producer: language declarations found in an HTML string's metadata.
 *
 * Extracts up to three raw tags by regex (no DOM parse) and hands each to
 * `pushTag`, which decides whether it becomes evidence:
 * - `<html lang>`                           → `html-lang` (0.7)
 * - `<meta http-equiv="content-language">`  → `meta-content-language` (0.6)
 * - `<meta property="og:locale">`           → `meta-og-locale` (0.6)
 *
 * @param {string|null|undefined} html - Markup to scan; nullish or blank → `[]`.
 * @returns {Array<object>} Evidence items in the order listed above.
 */
function evidenceFromHtml(html) {
  // `== null` covers undefined and null; null html used to throw on .trim().
  if (html == null || html.trim().length === 0) return [];
  const out = [];

  // The attribute name must follow whitespace or a closing quote, so
  // `data-lang=` / `xml:lang=` are not mistaken for `lang=`. The lazy prefix
  // takes the first real `lang`, and whitespace around `=` is accepted.
  const htmlLang = /<html\b[^>]*?[\s"']lang\s*=\s*["']?([^"'\s>]+)/i.exec(html)?.[1];
  pushTag(out, "html-lang", 0.7, htmlLang);

  // <meta http-equiv="content-language" content="uk"> (attribute order varies).
  const metaContentLang =
    /<meta\b[^>]*\bhttp-equiv=["']?content-language["']?[^>]*\bcontent=["']?([^"'\s>]+)/i.exec(
      html,
    )?.[1] ??
    /<meta\b[^>]*\bcontent=["']?([^"'\s>]+)["']?[^>]*\bhttp-equiv=["']?content-language/i.exec(
      html,
    )?.[1];
  pushTag(out, "meta-content-language", 0.6, metaContentLang);

  // <meta property="og:locale" content="uk_UA"> (attribute order varies).
  // The lookahead ends the property name right after `og:locale`, so
  // `og:locale:alternate` (the page's other locales) is not read as the
  // page's own locale.
  const ogLocale =
    /<meta\b[^>]*\bproperty=["']?og:locale(?=["'\s/>])["']?[^>]*\bcontent=["']?([^"'\s>]+)/i.exec(
      html,
    )?.[1] ??
    /<meta\b[^>]*\bcontent=["']?([^"'\s>]+)["']?[^>]*\bproperty=["']?og:locale(?=["'\s/>])/i.exec(
      html,
    )?.[1];
  pushTag(out, "meta-og-locale", 0.6, ogLocale);

  return out;
}
19
/**
 * Append one evidence item for a raw language tag, unless `normalizeBCP47`
 * returns `null` for it.
 *
 * @param {Array<object>} out - Evidence list, mutated in place.
 * @param {string} kind - Evidence kind; also used as the item's `source`.
 * @param {number} confidence - Confidence assigned to the item.
 * @param {string|undefined} raw - Tag as extracted from the markup.
 * @returns {void}
 */
function pushTag(out, kind, confidence, raw) {
  const language = normalizeBCP47(raw);
  if (language !== null) {
    const value = raw ?? "";
    out.push({ kind, language, confidence, source: kind, value });
  }
}
24
+
25
+ export { evidenceFromHtml };
26
+ //# sourceMappingURL=chunk-KI4MAI3N.js.map
27
+ //# sourceMappingURL=chunk-KI4MAI3N.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/html.ts"],"names":[],"mappings":";;;AAeO,SAAS,iBAAiB,IAAA,EAA8C;AAC7E,EAAA,IAAI,IAAA,KAAS,UAAa,IAAA,CAAK,IAAA,GAAO,MAAA,KAAW,CAAA,SAAU,EAAC;AAE5D,EAAA,MAAM,MAA0B,EAAC;AAEjC,EAAA,MAAM,QAAA,GAAW,sCAAA,CAAuC,IAAA,CAAK,IAAI,IAAI,CAAC,CAAA;AACtE,EAAA,OAAA,CAAQ,GAAA,EAAK,WAAA,EAAa,GAAA,EAAK,QAAQ,CAAA;AAGvC,EAAA,MAAM,kBACJ,qFAAA,CAAsF,IAAA;AAAA,IACpF;AAAA,GACF,GAAI,CAAC,CAAA,IACL,qFAAA,CAAsF,IAAA;AAAA,IACpF;AAAA,MACE,CAAC,CAAA;AACP,EAAA,OAAA,CAAQ,GAAA,EAAK,uBAAA,EAAyB,GAAA,EAAK,eAAe,CAAA;AAG1D,EAAA,MAAM,QAAA,GACJ,4EAAA,CAA6E,IAAA,CAAK,IAAI,CAAA,GAAI,CAAC,CAAA,IAC3F,4EAAA,CAA6E,IAAA,CAAK,IAAI,CAAA,GAAI,CAAC,CAAA;AAC7F,EAAA,OAAA,CAAQ,GAAA,EAAK,gBAAA,EAAkB,GAAA,EAAK,QAAQ,CAAA;AAE5C,EAAA,OAAO,GAAA;AACT;AAEA,SAAS,OAAA,CACP,GAAA,EACA,IAAA,EACA,UAAA,EACA,GAAA,EACM;AACN,EAAA,MAAM,IAAA,GAAO,eAAe,GAAG,CAAA;AAC/B,EAAA,IAAI,SAAS,IAAA,EAAM;AACnB,EAAA,GAAA,CAAI,IAAA,CAAK,EAAE,IAAA,EAAM,QAAA,EAAU,IAAA,EAAM,UAAA,EAAY,MAAA,EAAQ,IAAA,EAAM,KAAA,EAAO,GAAA,IAAO,EAAA,EAAI,CAAA;AAC/E","file":"chunk-KI4MAI3N.js","sourcesContent":["import type { LanguageEvidence } from \"./types.js\";\nimport { normalizeBCP47 } from \"./internal/bcp47.js\";\n\n/**\n * Producer: language clues from an HTML string's metadata.\n *\n * Reads three independent declarations, each emitted as its own evidence item\n * (the fuser weighs them):\n * - `<html lang>` → `html-lang`\n * - `<meta http-equiv=\"content-language\">` → `meta-content-language`\n * - `<meta property=\"og:locale\">` → `meta-og-locale`\n *\n * All tags are BCP-47-normalized (`uk-UA` → `uk`, `en_US` → `en`). 
Sync and\n * zero-dependency — regex extraction only, never a DOM parse.\n */\nexport function evidenceFromHtml(html: string | undefined): LanguageEvidence[] {\n if (html === undefined || html.trim().length === 0) return [];\n\n const out: LanguageEvidence[] = [];\n\n const htmlLang = /<html\\b[^>]*\\blang=[\"']?([^\"'\\s>]+)/i.exec(html)?.[1];\n pushTag(out, \"html-lang\", 0.7, htmlLang);\n\n // <meta http-equiv=\"content-language\" content=\"uk\"> (attribute order varies).\n const metaContentLang =\n /<meta\\b[^>]*\\bhttp-equiv=[\"']?content-language[\"']?[^>]*\\bcontent=[\"']?([^\"'\\s>]+)/i.exec(\n html,\n )?.[1] ??\n /<meta\\b[^>]*\\bcontent=[\"']?([^\"'\\s>]+)[\"']?[^>]*\\bhttp-equiv=[\"']?content-language/i.exec(\n html,\n )?.[1];\n pushTag(out, \"meta-content-language\", 0.6, metaContentLang);\n\n // <meta property=\"og:locale\" content=\"uk_UA\"> (attribute order varies).\n const ogLocale =\n /<meta\\b[^>]*\\bproperty=[\"']?og:locale[\"']?[^>]*\\bcontent=[\"']?([^\"'\\s>]+)/i.exec(html)?.[1] ??\n /<meta\\b[^>]*\\bcontent=[\"']?([^\"'\\s>]+)[\"']?[^>]*\\bproperty=[\"']?og:locale/i.exec(html)?.[1];\n pushTag(out, \"meta-og-locale\", 0.6, ogLocale);\n\n return out;\n}\n\nfunction pushTag(\n out: LanguageEvidence[],\n kind: \"html-lang\" | \"meta-content-language\" | \"meta-og-locale\",\n confidence: number,\n raw: string | undefined,\n): void {\n const lang = normalizeBCP47(raw);\n if (lang === null) return;\n out.push({ kind, language: lang, confidence, source: kind, value: raw ?? \"\" });\n}\n"]}
@@ -0,0 +1,115 @@
1
+ // src/internal/bcp47.ts
2
/**
 * Alias table: every accepted spelling (ISO 639-1/639-2 codes, the common
 * `ua` slug, and localized language-picker phrases) mapped to its canonical
 * two-letter code. All keys are lowercase.
 *
 * The table is built on a null-prototype object so that a lookup with an
 * arbitrary input string (`ALIASES[userText]`) can only ever hit a real alias
 * and never an inherited `Object.prototype` member such as `constructor`.
 */
var ALIASES = Object.assign(Object.create(null), {
  // Ukrainian
  ua: "uk",
  uk: "uk",
  \u0443\u043A\u0440: "uk",
  \u0443\u043A\u0440\u0430\u0457\u043D\u0441\u044C\u043A\u0430: "uk",
  \u0443\u043A\u0440\u0430\u0457\u043D\u0441\u044C\u043A\u043E\u044E: "uk",
  "\u0443\u043A\u0440\u0430\u0457\u043D\u0441\u044C\u043A\u0430 \u043C\u043E\u0432\u0430": "uk",
  "\u043D\u0430 \u0443\u043A\u0440\u0430\u0457\u043D\u0441\u044C\u043A\u0456\u0439": "uk",
  "\u0443\u043A\u0440\u0430\u0457\u043D\u0441\u044C\u043A\u043E\u044E \u043C\u043E\u0432\u043E\u044E": "uk",
  ukrainian: "uk",
  "in ukrainian": "uk",
  // Russian
  ru: "ru",
  rus: "ru",
  \u0440\u0443\u0441: "ru",
  \u0440\u0443\u0441\u0441\u043A\u0438\u0439: "ru",
  "\u043F\u043E-\u0440\u0443\u0441\u0441\u043A\u0438": "ru",
  "\u043F\u043E \u0440\u0443\u0441\u0441\u043A\u0438": "ru",
  "\u0440\u0443\u0441\u0441\u043A\u0438\u0439 \u044F\u0437\u044B\u043A": "ru",
  "\u043D\u0430 \u0440\u0443\u0441\u0441\u043A\u043E\u043C": "ru",
  russian: "ru",
  "in russian": "ru",
  \u0440\u043E\u0441\u0456\u0439\u0441\u044C\u043A\u0430: "ru",
  "\u0440\u043E\u0441\u0456\u0439\u0441\u044C\u043A\u0430 \u043C\u043E\u0432\u0430": "ru",
  "\u043F\u043E-\u0440\u043E\u0441\u0456\u0439\u0441\u044C\u043A\u0438": "ru",
  "\u043F\u043E \u0440\u043E\u0441\u0456\u0439\u0441\u044C\u043A\u0438": "ru",
  // Belarusian
  be: "be",
  bel: "be",
  \u0431\u0435\u043B\u0430\u0440\u0443\u0441\u043A\u0430\u044F: "be",
  "\u0431\u0435\u043B\u0430\u0440\u0443\u0441\u043A\u0430\u044F \u043C\u043E\u0432\u0430": "be",
  belarusian: "be",
  "in belarusian": "be",
  // Bulgarian
  bg: "bg",
  bul: "bg",
  \u0431\u044A\u043B\u0433\u0430\u0440\u0441\u043A\u0438: "bg",
  "\u0431\u044A\u043B\u0433\u0430\u0440\u0441\u043A\u0438 \u0435\u0437\u0438\u043A": "bg",
  bulgarian: "bg",
  "in bulgarian": "bg",
  // English
  en: "en",
  eng: "en",
  english: "en",
  "in english": "en",
  \u0430\u043D\u0433\u043B\u0456\u0439\u0441\u044C\u043A\u0430: "en",
  \u0430\u043D\u0433\u043B\u0438\u0439\u0441\u043A\u0438\u0439: "en",
  // Polish
  pl: "pl",
  pol: "pl",
  polski: "pl",
  "po polsku": "pl",
  polish: "pl",
  \u043F\u043E\u043B\u044C\u0441\u044C\u043A\u0430: "pl",
  // German
  de: "de",
  deu: "de",
  ger: "de",
  deutsch: "de",
  "auf deutsch": "de",
  german: "de",
  \u043D\u0456\u043C\u0435\u0446\u044C\u043A\u0430: "de",
  // French
  fr: "fr",
  fra: "fr",
  fran\u00E7ais: "fr",
  francais: "fr",
  "en fran\xE7ais": "fr",
  french: "fr",
  \u0444\u0440\u0430\u043D\u0446\u0443\u0437\u044C\u043A\u0430: "fr",
  // Spanish
  es: "es",
  spa: "es",
  espa\u00F1ol: "es",
  espanol: "es",
  "en espa\xF1ol": "es",
  spanish: "es",
  \u0456\u0441\u043F\u0430\u043D\u0441\u044C\u043A\u0430: "es",
  // Italian
  it: "it",
  ita: "it",
  italiano: "it",
  "in italiano": "it",
  italian: "it",
  \u0456\u0442\u0430\u043B\u0456\u0439\u0441\u044C\u043A\u0430: "it"
});
89
/**
 * Strict, exact-match alias lookup.
 *
 * The input is trimmed and lowercased, then looked up as-is in `ALIASES`;
 * there is no hyphen/underscore splitting, so `"en-US"` does not resolve here.
 *
 * @param {string | undefined | null} input Candidate code or phrase.
 * @returns {string | null} The canonical code from `ALIASES`, or `null` when
 *   the input is nullish, blank, or not an alias.
 */
function normalizeLanguageCode(input) {
  if (input === void 0 || input === null) return null;
  const key = input.trim().toLowerCase();
  if (key.length === 0) return null;
  // Own-property check: a bare `ALIASES[key]` would also match inherited
  // members (e.g. "constructor") and hand back a function instead of a code.
  return Object.prototype.hasOwnProperty.call(ALIASES, key) ? ALIASES[key] : null;
}
95
/**
 * BCP-47-aware normalization of a single language tag.
 *
 * The input is trimmed, lowercased, and `_` is rewritten to `-`. The whole
 * string is tried against `ALIASES` first; failing that, everything from the
 * first `-` onward is dropped and the remaining primary subtag is tried. When
 * the primary subtag is not an alias either, it is returned as-is.
 *
 * @param {string | undefined | null} input Tag such as `en-US` or `uk_UA`.
 * @returns {string | null} A language code string, or `null` when the input
 *   is nullish, blank, or has an empty primary subtag (e.g. `"-x"`).
 */
function normalizeBCP47(input) {
  if (input === void 0 || input === null) return null;
  const cleaned = input.trim().toLowerCase().replace(/_/g, "-");
  if (cleaned.length === 0) return null;
  // Own-property checks only: a bare `ALIASES[key]` would also match
  // inherited members (e.g. "constructor") and return a non-string value.
  const hasOwn = Object.prototype.hasOwnProperty;
  if (hasOwn.call(ALIASES, cleaned)) return ALIASES[cleaned];
  const head = cleaned.split("-")[0];
  if (head === void 0 || head.length === 0) return null;
  return hasOwn.call(ALIASES, head) ? ALIASES[head] : head;
}
105
/**
 * Pull the first language tag out of a header-style value and normalize it.
 *
 * Only the text before the first `,` is considered, and within that only the
 * text before the first `;` (which drops a `;q=…` weight). The remaining tag
 * is handed to `normalizeBCP47`.
 *
 * @param {string | undefined | null} value e.g. `"en-US,en;q=0.9"`.
 * @returns {string | null} Whatever `normalizeBCP47` returns for the leading
 *   tag, or `null` when the value is nullish or its first entry is blank.
 */
function primarySubtag(value) {
  if (value == null) return null;
  const leadingEntry = value.split(",", 1)[0].trim();
  if (leadingEntry === "") return null;
  const [tag] = leadingEntry.split(";", 1);
  return normalizeBCP47(tag.trim());
}
112
+
113
+ export { normalizeBCP47, normalizeLanguageCode, primarySubtag };
114
+ //# sourceMappingURL=chunk-OVSPOZ5J.js.map
115
+ //# sourceMappingURL=chunk-OVSPOZ5J.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/internal/bcp47.ts"],"names":[],"mappings":";AAuBA,IAAM,OAAA,GAAkC;AAAA;AAAA,EAEtC,EAAA,EAAI,IAAA;AAAA,EACJ,EAAA,EAAI,IAAA;AAAA,EACJ,kBAAA,EAAK,IAAA;AAAA,EACL,4DAAA,EAAY,IAAA;AAAA,EACZ,kEAAA,EAAa,IAAA;AAAA,EACb,uFAAA,EAAmB,IAAA;AAAA,EACnB,iFAAA,EAAkB,IAAA;AAAA,EAClB,mGAAA,EAAqB,IAAA;AAAA,EACrB,SAAA,EAAW,IAAA;AAAA,EACX,cAAA,EAAgB,IAAA;AAAA;AAAA,EAGhB,EAAA,EAAI,IAAA;AAAA,EACJ,GAAA,EAAK,IAAA;AAAA,EACL,kBAAA,EAAK,IAAA;AAAA,EACL,0CAAA,EAAS,IAAA;AAAA,EACT,mDAAA,EAAa,IAAA;AAAA,EACb,mDAAA,EAAa,IAAA;AAAA,EACb,qEAAA,EAAgB,IAAA;AAAA,EAChB,yDAAA,EAAc,IAAA;AAAA,EACd,OAAA,EAAS,IAAA;AAAA,EACT,YAAA,EAAc,IAAA;AAAA,EACd,sDAAA,EAAW,IAAA;AAAA,EACX,iFAAA,EAAkB,IAAA;AAAA,EAClB,qEAAA,EAAgB,IAAA;AAAA,EAChB,qEAAA,EAAgB,IAAA;AAAA;AAAA,EAGhB,EAAA,EAAI,IAAA;AAAA,EACJ,GAAA,EAAK,IAAA;AAAA,EACL,4DAAA,EAAY,IAAA;AAAA,EACZ,uFAAA,EAAmB,IAAA;AAAA,EACnB,UAAA,EAAY,IAAA;AAAA,EACZ,eAAA,EAAiB,IAAA;AAAA;AAAA,EAGjB,EAAA,EAAI,IAAA;AAAA,EACJ,GAAA,EAAK,IAAA;AAAA,EACL,sDAAA,EAAW,IAAA;AAAA,EACX,iFAAA,EAAkB,IAAA;AAAA,EAClB,SAAA,EAAW,IAAA;AAAA,EACX,cAAA,EAAgB,IAAA;AAAA;AAAA,EAGhB,EAAA,EAAI,IAAA;AAAA,EACJ,GAAA,EAAK,IAAA;AAAA,EACL,OAAA,EAAS,IAAA;AAAA,EACT,YAAA,EAAc,IAAA;AAAA,EACd,4DAAA,EAAY,IAAA;AAAA,EACZ,4DAAA,EAAY,IAAA;AAAA;AAAA,EAGZ,EAAA,EAAI,IAAA;AAAA,EACJ,GAAA,EAAK,IAAA;AAAA,EACL,MAAA,EAAQ,IAAA;AAAA,EACR,WAAA,EAAa,IAAA;AAAA,EACb,MAAA,EAAQ,IAAA;AAAA,EACR,gDAAA,EAAU,IAAA;AAAA;AAAA,EAGV,EAAA,EAAI,IAAA;AAAA,EACJ,GAAA,EAAK,IAAA;AAAA,EACL,GAAA,EAAK,IAAA;AAAA,EACL,OAAA,EAAS,IAAA;AAAA,EACT,aAAA,EAAe,IAAA;AAAA,EACf,MAAA,EAAQ,IAAA;AAAA,EACR,gDAAA,EAAU,IAAA;AAAA;AAAA,EAGV,EAAA,EAAI,IAAA;AAAA,EACJ,GAAA,EAAK,IAAA;AAAA,EACL,aAAA,EAAU,IAAA;AAAA,EACV,QAAA,EAAU,IAAA;AAAA,EACV,gBAAA,EAAe,IAAA;AAAA,EACf,MAAA,EAAQ,IAAA;AAAA,EACR,4DAAA,EAAY,IAAA;AAAA;AAAA,EAGZ,EAAA,EAAI,IAAA;AAAA,EACJ,GAAA,EAAK,IAAA;AAAA,EACL,YAAA,EAAS,IAAA;AAAA,EACT,OAAA,EAAS,IAAA;AAAA,EACT,eAAA,EAAc,IAAA;AAAA,EACd,OAAA,EAAS,IAAA;AAAA,EACT,sDAAA,EAAW,IAAA;AAAA;AAAA,EAGX,EAAA,EAAI,IAAA;AAAA,EACJ,GAAA,EAAK,IAAA;AAAA,EACL,QAAA,EAAU,IAA
A;AAAA,EACV,aAAA,EAAe,IAAA;AAAA,EACf,OAAA,EAAS,IAAA;AAAA,EACT,4DAAA,EAAY;AACd,CAAA;AASO,SAAS,sBAAsB,KAAA,EAAiD;AACrF,EAAA,IAAI,KAAA,KAAU,MAAA,IAAa,KAAA,KAAU,IAAA,EAAM,OAAO,IAAA;AAClD,EAAA,MAAM,OAAA,GAAU,KAAA,CAAM,IAAA,EAAK,CAAE,WAAA,EAAY;AACzC,EAAA,IAAI,OAAA,CAAQ,MAAA,KAAW,CAAA,EAAG,OAAO,IAAA;AACjC,EAAA,OAAO,OAAA,CAAQ,OAAO,CAAA,IAAK,IAAA;AAC7B;AAYO,SAAS,eAAe,KAAA,EAAiD;AAC9E,EAAA,IAAI,KAAA,KAAU,MAAA,IAAa,KAAA,KAAU,IAAA,EAAM,OAAO,IAAA;AAClD,EAAA,MAAM,OAAA,GAAU,MAAM,IAAA,EAAK,CAAE,aAAY,CAAE,OAAA,CAAQ,MAAM,GAAG,CAAA;AAC5D,EAAA,IAAI,OAAA,CAAQ,MAAA,KAAW,CAAA,EAAG,OAAO,IAAA;AACjC,EAAA,MAAM,MAAA,GAAS,QAAQ,OAAO,CAAA;AAC9B,EAAA,IAAI,MAAA,KAAW,QAAW,OAAO,MAAA;AACjC,EAAA,MAAM,IAAA,GAAO,OAAA,CAAQ,KAAA,CAAM,GAAG,EAAE,CAAC,CAAA;AACjC,EAAA,IAAI,IAAA,KAAS,MAAA,IAAa,IAAA,CAAK,MAAA,KAAW,GAAG,OAAO,IAAA;AACpD,EAAA,OAAO,OAAA,CAAQ,IAAI,CAAA,IAAK,IAAA;AAC1B;AAWO,SAAS,cAAc,KAAA,EAAiD;AAC7E,EAAA,IAAI,KAAA,KAAU,MAAA,IAAa,KAAA,KAAU,IAAA,EAAM,OAAO,IAAA;AAClD,EAAA,MAAM,QAAQ,KAAA,CAAM,KAAA,CAAM,GAAG,CAAA,CAAE,CAAC,GAAG,IAAA,EAAK;AACxC,EAAA,IAAI,KAAA,KAAU,MAAA,IAAa,KAAA,CAAM,MAAA,KAAW,GAAG,OAAO,IAAA;AAEtD,EAAA,MAAM,MAAM,KAAA,CAAM,KAAA,CAAM,GAAG,CAAA,CAAE,CAAC,GAAG,IAAA,EAAK;AACtC,EAAA,OAAO,eAAe,GAAG,CAAA;AAC3B","file":"chunk-OVSPOZ5J.js","sourcesContent":["/**\n * BCP-47 / language-code normalization.\n *\n * Two entry points with deliberately different strictness:\n * - {@link normalizeBCP47} — for inputs documented to be BCP-47 (`<html lang>`,\n * hreflang, `Content-Language`): try the full string, then strip a\n * region/script suffix (`en-US` → `en`, `zh_CN` → `zh`).\n * - {@link normalizeLanguageCode} — strict exact-match only, for free-text\n * contexts (URL slugs, link text) where a hyphen split could be a coincidence.\n *\n * Both resolve aliases that appear in the wild (`ua` → `uk`, `rus` → `ru`,\n * localized picker phrases) to a canonical ISO 639-1 code.\n */\n\n/**\n * Aliases mapped to canonical ISO 639-1 codes. 
Keys are lowercased.\n *\n * Ukrainian is the load-bearing case: most sites use `ua` in URLs even though\n * the ISO code is `uk`. Both are accepted on input; `uk` is always output.\n *\n * Includes localized phrases users see in language pickers (`українською`,\n * `по-русски`, `in english`, …).\n */\nconst ALIASES: Record<string, string> = {\n // Ukrainian\n ua: \"uk\",\n uk: \"uk\",\n укр: \"uk\",\n українська: \"uk\",\n українською: \"uk\",\n \"українська мова\": \"uk\",\n \"на українській\": \"uk\",\n \"українською мовою\": \"uk\",\n ukrainian: \"uk\",\n \"in ukrainian\": \"uk\",\n\n // Russian\n ru: \"ru\",\n rus: \"ru\",\n рус: \"ru\",\n русский: \"ru\",\n \"по-русски\": \"ru\",\n \"по русски\": \"ru\",\n \"русский язык\": \"ru\",\n \"на русском\": \"ru\",\n russian: \"ru\",\n \"in russian\": \"ru\",\n російська: \"ru\",\n \"російська мова\": \"ru\",\n \"по-російськи\": \"ru\",\n \"по російськи\": \"ru\",\n\n // Belarusian\n be: \"be\",\n bel: \"be\",\n беларуская: \"be\",\n \"беларуская мова\": \"be\",\n belarusian: \"be\",\n \"in belarusian\": \"be\",\n\n // Bulgarian\n bg: \"bg\",\n bul: \"bg\",\n български: \"bg\",\n \"български език\": \"bg\",\n bulgarian: \"bg\",\n \"in bulgarian\": \"bg\",\n\n // English\n en: \"en\",\n eng: \"en\",\n english: \"en\",\n \"in english\": \"en\",\n англійська: \"en\",\n английский: \"en\",\n\n // Polish\n pl: \"pl\",\n pol: \"pl\",\n polski: \"pl\",\n \"po polsku\": \"pl\",\n polish: \"pl\",\n польська: \"pl\",\n\n // German\n de: \"de\",\n deu: \"de\",\n ger: \"de\",\n deutsch: \"de\",\n \"auf deutsch\": \"de\",\n german: \"de\",\n німецька: \"de\",\n\n // French\n fr: \"fr\",\n fra: \"fr\",\n français: \"fr\",\n francais: \"fr\",\n \"en français\": \"fr\",\n french: \"fr\",\n французька: \"fr\",\n\n // Spanish\n es: \"es\",\n spa: \"es\",\n español: \"es\",\n espanol: \"es\",\n \"en español\": \"es\",\n spanish: \"es\",\n іспанська: \"es\",\n\n // Italian\n it: \"it\",\n ita: \"it\",\n italiano: \"it\",\n \"in 
italiano\": \"it\",\n italian: \"it\",\n італійська: \"it\",\n};\n\n/**\n * Strict, exact-match lookup. Returns `null` for unknown inputs and does NOT\n * fall back to a hyphen prefix. Use anywhere a hyphen split could be a\n * coincidence — URL path segments (`/ru-return-warranty`), title attrs, link\n * text. The phrase aliases (`по-русски`, `in english`) are in the table\n * directly, so exact lookup still finds them.\n */\nexport function normalizeLanguageCode(input: string | undefined | null): string | null {\n if (input === undefined || input === null) return null;\n const cleaned = input.trim().toLowerCase();\n if (cleaned.length === 0) return null;\n return ALIASES[cleaned] ?? null;\n}\n\n/**\n * BCP-47-aware normalization: try the full string first, then strip a\n * region/script suffix (`en-US` → `en`, `zh_CN` → `zh`). Use ONLY for inputs\n * documented to be BCP-47 — `hreflang`, `<html lang>`, `Content-Language`,\n * `data-lang`/`data-locale` — never for free-text URL slugs.\n *\n * Falls back to the raw primary subtag when no alias matches, so a code outside\n * the alias table (e.g. `pt-BR` → `pt`) still resolves to its language. The\n * roster decides relevance downstream.\n */\nexport function normalizeBCP47(input: string | undefined | null): string | null {\n if (input === undefined || input === null) return null;\n const cleaned = input.trim().toLowerCase().replace(/_/g, \"-\");\n if (cleaned.length === 0) return null;\n const direct = ALIASES[cleaned];\n if (direct !== undefined) return direct;\n const head = cleaned.split(\"-\")[0];\n if (head === undefined || head.length === 0) return null;\n return ALIASES[head] ?? head;\n}\n\n/**\n * Extract the primary subtag from a BCP-47-ish value, lowercased, then resolve\n * it through the alias table (`ua` → `uk`). Handles `Accept-Language`-style\n * comma lists (`en-US,en;q=0.9` → `en`). 
Returns `null` for empty/nullish.\n *\n * This is the header/HTML extraction helper: it tolerates the messy shapes those\n * sources carry (comma lists, `q` weights) where {@link normalizeBCP47} expects\n * a single tag.\n */\nexport function primarySubtag(value: string | undefined | null): string | null {\n if (value === undefined || value === null) return null;\n const first = value.split(\",\")[0]?.trim();\n if (first === undefined || first.length === 0) return null;\n // Drop a `;q=…` weight if present.\n const tag = first.split(\";\")[0]?.trim();\n return normalizeBCP47(tag);\n}\n"]}
@@ -0,0 +1,123 @@
1
+ // src/internal/classify.ts
2
// Verdict used when no language can be determined.
var UNKNOWN = {
  language: "unknown",
  margin: 0,
  rung: null
};

// Single-character script probes (non-global, so `.test` keeps no state).
var CYRILLIC_RE = /\p{Script=Cyrillic}/u;
var LATIN_RE = /\p{Script=Latin}/u;

// Spans blanked out before script/letter analysis, applied in this order.
var NOISE_PATTERNS = [
  // full URLs
  /\bhttps?:\/\/\S+/gi,
  // www.… without a scheme
  /\bwww\.\S+/gi,
  // bare domains (example.com/path)
  /\b[a-z0-9-]+(?:\.[a-z0-9-]+)+(?:\/\S*)?/gi,
  // @handles and #hashtags
  /[@#][\p{L}\p{N}_]+/gu
];
15
+ function stripNoise(text) {
16
+ let out = text;
17
+ for (const re of NOISE_PATTERNS) out = out.replace(re, " ");
18
+ return out;
19
+ }
20
/**
 * Decide whether a snippet is mostly Cyrillic or mostly Latin.
 *
 * The text is passed through `stripNoise` first, then each character is
 * counted as Cyrillic, Latin, or neither. Ties go to Cyrillic.
 *
 * @param {string} text Snippet to inspect.
 * @returns {"cyrillic" | "latin" | null} `null` when the text contains no
 *   Cyrillic or Latin characters at all.
 */
function dominantScript(text) {
  const counts = { cyrillic: 0, latin: 0 };
  for (const ch of stripNoise(text)) {
    if (CYRILLIC_RE.test(ch)) {
      counts.cyrillic += 1;
      continue;
    }
    if (LATIN_RE.test(ch)) counts.latin += 1;
  }
  if (counts.cyrillic === 0 && counts.latin === 0) return null;
  return counts.latin > counts.cyrillic ? "latin" : "cyrillic";
}
30
/**
 * Report the script of a language profile, judged by the first character of
 * `profile.alphabet` that is either Cyrillic or Latin.
 *
 * @param {{ alphabet: Iterable<string> }} profile Language profile.
 * @returns {"cyrillic" | "latin" | null} `null` when no character of the
 *   alphabet belongs to either script.
 */
function profileScript(profile) {
  for (const letter of profile.alphabet) {
    const script = CYRILLIC_RE.test(letter)
      ? "cyrillic"
      : LATIN_RE.test(letter)
        ? "latin"
        : null;
    if (script !== null) return script;
  }
  return null;
}
37
/**
 * Narrow the candidate profiles to those written in the snippet's dominant
 * script, keeping only the first profile for each `code`.
 *
 * @param {string} text Snippet whose dominant script sets the scope.
 * @param {Iterable<{ code: string }>} candidates Language profiles, in
 *   priority order; the returned array preserves that order.
 * @returns {Array<{ code: string }>} Matching profiles (the same object
 *   references), or `[]` when the snippet has no dominant script.
 */
function scopeCandidates(text, candidates) {
  const script = dominantScript(text);
  if (script === null) return [];
  const acceptedCodes = new Set();
  return [...candidates].filter((candidate) => {
    if (profileScript(candidate) !== script) return false;
    if (acceptedCodes.has(candidate.code)) return false;
    acceptedCodes.add(candidate.code);
    return true;
  });
}
49
/**
 * Split text into lowercase word tokens, where a token is a maximal run of
 * Unicode letters (`\p{L}`). Digits, punctuation and whitespace separate
 * tokens and are discarded.
 *
 * @param {string} text Text to split.
 * @returns {string[]} Tokens in order of appearance; `[]` when there are none.
 */
function tokenize(text) {
  const words = [];
  for (const hit of text.toLowerCase().matchAll(/\p{L}+/gu)) {
    words.push(hit[0]);
  }
  return words;
}
52
/**
 * Count, per language code, how many items belong to that code exclusively.
 *
 * An item scores a point for a code only when exactly one membership entry's
 * `set` contains it; items found in several sets (or none) score nothing.
 *
 * @param {Iterable<string>} items Characters or tokens to classify.
 * @param {Array<{ code: string, set: Set<string> }>} membership One entry per
 *   candidate.
 * @returns {Map<string, number>} Score per code, every code starting at 0, in
 *   `membership` order.
 */
function tally(items, membership) {
  const scores = new Map();
  for (const { code } of membership) scores.set(code, 0);

  for (const item of items) {
    const holders = membership.filter((entry) => entry.set.has(item));
    if (holders.length !== 1) continue;
    const [{ code }] = holders;
    if (code === null) continue;
    scores.set(code, (scores.get(code) ?? 0) + 1);
  }
  return scores;
}
71
/**
 * Pick the clear winner from a score table.
 *
 * The winner is the entry with the strictly highest score. Its margin is that
 * score minus the runner-up's score, with a missing or negative runner-up
 * counted as 0. A result is only returned when the top score is at least 1
 * and the margin is at least 1, so ties yield `null`.
 *
 * @param {Iterable<[string, number]>} scores `[code, score]` pairs.
 * @returns {{ code: string, margin: number } | null}
 */
function leader(scores) {
  let top = { code: null, score: -1 };
  let runnerUp = -1;

  for (const entry of scores) {
    const candidateCode = entry[0];
    const candidateScore = entry[1];
    if (candidateScore > top.score) {
      // The previous best becomes the runner-up.
      runnerUp = top.score;
      top = { code: candidateCode, score: candidateScore };
    } else if (candidateScore > runnerUp) {
      runnerUp = candidateScore;
    }
  }

  if (top.code === null || top.score < 1) return null;
  const margin = top.score - Math.max(runnerUp, 0);
  return margin >= 1 ? { code: top.code, margin } : null;
}
88
/**
 * Build the membership table consumed by `tally`: for each candidate, its
 * `code` plus a `Set` of whatever `pick` extracts from it.
 *
 * @param {Array<{ code: string }>} candidates Language profiles.
 * @param {(candidate: object) => Iterable<string>} pick Selects the items
 *   (letters, words, …) that identify the candidate.
 * @returns {Array<{ code: string, set: Set<string> }>}
 */
function membershipFor(candidates, pick) {
  const toEntry = (candidate) => {
    const { code } = candidate;
    const set = new Set(pick(candidate));
    return { code, set };
  };
  return candidates.map((candidate) => toEntry(candidate));
}
91
/**
 * Rung 1: classify by characters.
 *
 * Each candidate is represented by the characters of its `alphabet` plus its
 * optional `marks`; the lowercased text is tallied against those sets and the
 * clear leader (if any) wins.
 *
 * @param {string} text Snippet to classify.
 * @param {Array<{ code: string, alphabet: string, marks?: string }>} scoped
 *   Candidates already narrowed to the snippet's script.
 * @returns {{ language: string, margin: number, rung: 1 } | null} `null` when
 *   `leader` finds no clear winner.
 */
function letterRung(text, scoped) {
  const lowered = text.toLowerCase();
  const letterSets = membershipFor(
    scoped,
    (profile) => profile.alphabet + (profile.marks ?? "")
  );
  const winner = leader(tally(lowered, letterSets));
  if (!winner) return null;
  return { language: winner.code, margin: winner.margin, rung: 1 };
}
100
/**
 * Word-level rung: classify by whole tokens.
 *
 * Each candidate is represented by its `words[tier]` list (an empty list when
 * the profile has no words or no such tier); the tokens are tallied against
 * those sets and the clear leader (if any) wins.
 *
 * @param {string[]} tokens Tokens to classify.
 * @param {Array<{ code: string, words?: Record<string, string[]> }>} scoped
 *   Candidates already narrowed to the snippet's script.
 * @param {string} tier Key into each profile's `words`.
 * @param {*} rung Label copied verbatim into the result's `rung` field.
 * @returns {{ language: string, margin: number, rung: * } | null} `null` when
 *   `leader` finds no clear winner.
 */
function wordRung(tokens, scoped, tier, rung) {
  const wordSets = membershipFor(scoped, (profile) => profile.words?.[tier] ?? []);
  const winner = leader(tally(tokens, wordSets));
  if (!winner) return null;
  const { code: language, margin } = winner;
  return { language, margin, rung };
}
109
/**
 * Classify a snippet against a roster of candidate profiles by walking the
 * rungs in order and returning the first verdict produced:
 *   1. `letterRung` on the noise-stripped text,
 *   2. `wordRung` with the "function" tier (rung "2a"),
 *   3. `wordRung` with the "frequent" tier (rung "2b"),
 *   4. the optional `rung3` callback.
 *
 * The "unknown" verdict is returned as a fresh copy of `UNKNOWN` on every
 * call, so a caller that mutates its result cannot alter what later calls
 * return.
 *
 * @param {string} text Snippet to classify.
 * @param {Array<object>} candidates Candidate language profiles.
 * @param {(cleaned: string, scoped: Array<object>) => object | null | undefined} [rung3]
 *   Optional last-resort classifier; receives the noise-stripped text and the
 *   script-scoped candidates. A nullish return falls through to "unknown".
 * @returns {{ language: string, margin: number, rung: * }} A rung verdict, or
 *   the "unknown" verdict when the text is empty, there are no candidates, no
 *   candidate matches the text's script, there are no tokens, or no rung
 *   finds a winner.
 */
function classifyBySnippet(text, candidates, rung3) {
  if (!text || candidates.length === 0) return { ...UNKNOWN };

  const cleaned = stripNoise(text);
  const scoped = scopeCandidates(cleaned, candidates);
  if (scoped.length === 0) return { ...UNKNOWN };

  const byLetter = letterRung(cleaned, scoped);
  if (byLetter) return byLetter;

  const tokens = tokenize(cleaned);
  if (tokens.length === 0) return { ...UNKNOWN };

  return (
    wordRung(tokens, scoped, "function", "2a") ??
    wordRung(tokens, scoped, "frequent", "2b") ??
    rung3?.(cleaned, scoped) ??
    { ...UNKNOWN }
  );
}
120
+
121
+ export { classifyBySnippet, scopeCandidates };
122
+ //# sourceMappingURL=chunk-RFR5I7P7.js.map
123
+ //# sourceMappingURL=chunk-RFR5I7P7.js.map