langtell 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,5 @@
1
1
  import { normalizeBCP47 } from './chunk-OVSPOZ5J.js';
2
+ import { scriptOfProfile } from './chunk-U34Z3ZSV.js';
2
3
 
3
4
  // src/fuse.ts
4
5
  var DEFAULT_KIND_WEIGHT = {
@@ -19,7 +20,7 @@ var MIN_MARGIN = 0.12;
19
20
  function fuse(evidence, options = {}) {
20
21
  const weights = options.weights ?? {};
21
22
  const normalized = normalizeEvidence(evidence, options.candidates);
22
- const scoring = options.nonDiscriminatingScript === "unknown" ? normalized.filter((item) => !isNeutralized(item, normalized)) : normalized;
23
+ const scoring = options.nonDiscriminatingScript === "unknown" ? filterForUnknownMode(normalized, options.candidates) : normalized;
23
24
  const scores = /* @__PURE__ */ new Map();
24
25
  for (const item of scoring) {
25
26
  if (item.language === "unknown") continue;
@@ -53,6 +54,36 @@ function normalizeEvidence(evidence, _candidates) {
53
54
  return { ...item, language: normalized };
54
55
  });
55
56
  }
57
+ function filterForUnknownMode(normalized, candidates) {
58
+ const surviving = normalized.filter((item) => !isNeutralized(item, normalized));
59
+ const titleScript = nonDiscriminatingTitleScript(normalized, candidates);
60
+ if (titleScript === null) return surviving;
61
+ const scriptOf = scriptByCode(candidates ?? []);
62
+ return surviving.filter((item) => {
63
+ if (SCRIPT_KINDS.has(item.kind) || item.language === "unknown") return true;
64
+ const itemScript = scriptOf.get(item.language);
65
+ return itemScript === void 0 || itemScript === titleScript;
66
+ });
67
+ }
68
+ function nonDiscriminatingTitleScript(normalized, candidates) {
69
+ if (candidates === void 0) return null;
70
+ const scriptOf = scriptByCode(candidates);
71
+ for (const item of normalized) {
72
+ if (isNeutralized(item, normalized)) {
73
+ const script = scriptOf.get(item.language);
74
+ if (script !== void 0) return script;
75
+ }
76
+ }
77
+ return null;
78
+ }
79
+ function scriptByCode(candidates) {
80
+ const map = /* @__PURE__ */ new Map();
81
+ for (const c of candidates) {
82
+ const script = scriptOfProfile(c);
83
+ if (script !== null) map.set(c.code, script);
84
+ }
85
+ return map;
86
+ }
56
87
  function isNeutralized(item, all) {
57
88
  if (item.discriminating !== false || !SCRIPT_KINDS.has(item.kind)) return false;
58
89
  return !all.some(
@@ -105,5 +136,5 @@ function clamp01(value) {
105
136
  }
106
137
 
107
138
  export { fuse };
108
- //# sourceMappingURL=chunk-7G3MEXWK.js.map
109
- //# sourceMappingURL=chunk-7G3MEXWK.js.map
139
+ //# sourceMappingURL=chunk-G44HHVK5.js.map
140
+ //# sourceMappingURL=chunk-G44HHVK5.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/fuse.ts"],"names":[],"mappings":";;;;AA2BA,IAAM,mBAAA,GAA8C;AAAA,EAClD,cAAA,EAAgB,CAAA;AAAA,EAChB,iBAAA,EAAmB,CAAA;AAAA,EACnB,WAAA,EAAa,CAAA;AAAA,EACb,cAAA,EAAgB,GAAA;AAAA,EAChB,KAAA,EAAO,GAAA;AAAA,EACP,uBAAA,EAAyB,GAAA;AAAA,EACzB,uBAAA,EAAyB,IAAA;AAAA,EACzB,gBAAA,EAAkB,IAAA;AAAA,EAClB,WAAA,EAAa;AACf,CAAA;AAKA,IAAM,+BAAe,IAAI,GAAA,CAAY,CAAC,cAAA,EAAgB,OAAA,EAAS,WAAW,CAAC,CAAA;AAI3E,IAAM,uBAAA,GAA0B,GAAA;AAEhC,IAAM,iBAAA,GAAoB,IAAA;AAC1B,IAAM,UAAA,GAAa,IAAA;AAcZ,SAAS,IAAA,CACd,QAAA,EACA,OAAA,GAAuB,EAAC,EACR;AAChB,EAAA,MAAM,OAAA,GAAU,OAAA,CAAQ,OAAA,IAAW,EAAC;AACpC,EAAA,MAAM,UAAA,GAAa,iBAAA,CAAkB,QAAA,EAAU,OAAA,CAAQ,UAAU,CAAA;AASjE,EAAA,MAAM,OAAA,GACJ,QAAQ,uBAAA,KAA4B,SAAA,GAChC,qBAAqB,UAAA,EAAY,OAAA,CAAQ,UAAU,CAAA,GACnD,UAAA;AAEN,EAAA,MAAM,MAAA,uBAAa,GAAA,EAAoB;AACvC,EAAA,KAAA,MAAW,QAAQ,OAAA,EAAS;AAC1B,IAAA,IAAI,IAAA,CAAK,aAAa,SAAA,EAAW;AACjC,IAAA,MAAM,MAAA,GACJ,OAAA,CAAQ,IAAA,CAAK,MAAM,CAAA,IAAK,OAAA,CAAQ,IAAA,CAAK,IAAI,CAAA,IAAK,mBAAA,CAAoB,IAAA,CAAK,IAAI,CAAA,IAAK,GAAA;AAClF,IAAA,MAAA,CAAO,GAAA,CAAI,IAAA,CAAK,QAAA,EAAA,CAAW,MAAA,CAAO,GAAA,CAAI,IAAA,CAAK,QAAQ,CAAA,IAAK,CAAA,IAAK,OAAA,CAAQ,IAAA,CAAK,UAAU,IAAI,MAAM,CAAA;AAAA,EAChG;AAGA,EAAA,MAAM,MAAA,GAAS,wBAAwB,OAAO,CAAA;AAE9C,EAAA,MAAM,EAAE,IAAA,EAAM,SAAA,EAAW,aAAY,GAAI,MAAA,CAAO,QAAQ,MAAM,CAAA;AAE9D,EAAA,IAAI,SAAS,IAAA,IAAQ,SAAA,GAAY,iBAAA,IAAqB,SAAA,GAAY,cAAc,UAAA,EAAY;AAG1F,IAAA,IAAI,MAAA,KAAW,IAAA,IAAQ,MAAA,CAAO,GAAA,CAAI,MAAM,CAAA,EAAG;AACzC,MAAA,MAAM,KAAA,GAAQ,MAAA,CAAO,GAAA,CAAI,MAAM,CAAA,IAAK,CAAA;AACpC,MAAA,OAAO;AAAA,QACL,QAAA,EAAU,MAAA;AAAA,QACV,UAAA,EAAY,OAAA,CAAQ,KAAA,IAAS,KAAA,GAAQ,IAAA,CAAK,CAAA;AAAA,QAC1C,QAAA,EAAU,CAAC,GAAG,UAAU;AAAA,OAC1B;AAAA,IACF;AACA,IAAA,OAAO,EAAE,QAAA,EAAU,SAAA,EAAW,UAAA,EAAY,OAAA,CAAQ,SAAS,CAAA,EAAG,QAAA,EAAU,CAAC,GAAG,UAAU,CAAA,EAAE;AAAA,EAC1F;AAEA,EAAA,OAAO;AAAA,IACL,QAAA,EAAU,IAAA;AAAA,IACV,UAAA,EAAY,OAAA,CAAQ,SAAA,IAAa,SAAA,GAAY,cAAc,IAAA,CAAK,CAAA;AAAA,IAChE,QAAA,EAAU,CAAC,GAAG,UAAU;AAAA,GAC1B;AACF;AAYA,SAAS,iBAAA,CACP,UACA,WAAA,EACoB;AACpB,EAAA,OAAO,QAAA,CAAS,GAAA,CAAI,CAAC,IAAA,KAAS;AAC5B,IAAA,IAAI,IAAA,CAAK,QAAA,KAAa,SAAA,EAAW,OAAO,IAAA;AACxC,IAAA,MAAM,UAAA,GAAa,cAAA,CAAe,IAAA,CAAK,QAAQ,KAAK,IAAA,CAAK,QAAA;AACzD,IAAA,IAAI,UAAA,KAAe,IAAA,CAAK,QAAA,EAAU,OAAO,IAAA;AACzC,IAAA,OAAO,EAAE,GAAG,IAAA,EAAM,QAAA,EAAU,UAAA,EAAW;AAAA,EACzC,CAAC,CAAA;AACH;AAoBA,SAAS,oBAAA,CACP,YACA,UAAA,EACoB;AACpB,EAAA,MAAM,SAAA,GAAY,WAAW,MAAA,CAAO,CAAC,SAAS,CAAC,aAAA,CAAc,IAAA,EAAM,UAAU,CAAC,CAAA;AAE9E,EAAA,MAAM,WAAA,GAAc,4BAAA,CAA6B,UAAA,EAAY,UAAU,CAAA;AACvE,EAAA,IAAI,WAAA,KAAgB,MAAM,OAAO,SAAA;AAEjC,EAAA,MAAM,QAAA,GAAW,YAAA,CAAa,UAAA,IAAc,EAAE,CAAA;AAC9C,EAAA,OAAO,SAAA,CAAU,MAAA,CAAO,CAAC,IAAA,KAAS;AAGhC,IAAA,IAAI,YAAA,CAAa,IAAI,IAAA,CAAK,IAAI,KAAK,IAAA,CAAK,QAAA,KAAa,WAAW,OAAO,IAAA;AACvE,IAAA,MAAM,UAAA,GAAa,QAAA,CAAS,GAAA,CAAI,IAAA,CAAK,QAAQ,CAAA;AAC7C,IAAA,OAAO,UAAA,KAAe,UAAa,UAAA,KAAe,WAAA;AAAA,EACpD,CAAC,CAAA;AACH;AAKA,SAAS,4BAAA,CACP,YACA,UAAA,EACmB;AACnB,EAAA,IAAI,UAAA,KAAe,QAAW,OAAO,IAAA;AACrC,EAAA,MAAM,QAAA,GAAW,aAAa,UAAU,CAAA;AACxC,EAAA,KAAA,MAAW,QAAQ,UAAA,EAAY;AAC7B,IAAA,IAAI,aAAA,CAAc,IAAA,EAAM,UAAU,CAAA,EAAG;AACnC,MAAA,MAAM,MAAA,GAAS,QAAA,CAAS,GAAA,CAAI,IAAA,CAAK,QAAQ,CAAA;AACzC,MAAA,IAAI,MAAA,KAAW,QAAW,OAAO,MAAA;AAAA,IACnC;AAAA,EACF;AACA,EAAA,OAAO,IAAA;AACT;AAIA,SAAS,aAAa,UAAA,EAAiE;AACrF,EAAA,MAAM,GAAA,uBAAU,GAAA,EAAwB;AACxC,EAAA,KAAA,MAAW,KAAK,UAAA,EAAY;AAC1B,IAAA,MAAM,MAAA,GAAS,gBAAgB,CAAC,CAAA;AAChC,IAAA,IAAI,WAAW,IAAA,EAAM,GAAA,CAAI,GAAA,CAAI,CAAA,CAAE,MAAM,MAAM,CAAA;AAAA,EAC7C;AACA,EAAA,OAAO,GAAA;AACT;AAUA,SAAS,aAAA,CAAc,MAAwB,GAAA,EAA2C;AACxF,EAAA,IAAI,IAAA,CAAK,mBAAmB,KAAA,IAAS,CAAC,aAAa,GAAA,CAAI,IAAA,CAAK,IAAI,CAAA,EAAG,OAAO,KAAA;AAC1E,EAAA,OAAO,CAAC,GAAA,CAAI,IAAA;AAAA,IACV,CAAC,KAAA,KACC,KAAA,CAAM,QAAA,KAAa,IAAA,CAAK,QAAA,IACxB,KAAA,CAAM,QAAA,KAAa,SAAA,IACnB,CAAC,YAAA,CAAa,GAAA,CAAI,MAAM,IAAI;AAAA,GAChC;AACF;AAKA,SAAS,wBAAwB,QAAA,EAAsD;AACrF,EAAA,IAAI,IAAA,GAAsB,IAAA;AAC1B,EAAA,IAAI,cAAA,GAAiB,CAAA;AACrB,EAAA,KAAA,MAAW,QAAQ,QAAA,EAAU;AAC3B,IAAA,IAAI,IAAA,CAAK,aAAa,SAAA,IAAa,CAAC,aAAa,GAAA,CAAI,IAAA,CAAK,IAAI,CAAA,EAAG;AACjE,IAAA,MAAM,CAAA,GAAI,OAAA,CAAQ,IAAA,CAAK,UAAU,CAAA;AACjC,IAAA,IAAI,IAAI,uBAAA,EAAyB;AACjC,IAAA,IAAI,IAAI,cAAA,EAAgB;AACtB,MAAA,cAAA,GAAiB,CAAA;AACjB,MAAA,IAAA,GAAO,IAAA,CAAK,QAAA;AAAA,IACd,CAAA,MAAA,IAAW,CAAA,KAAM,cAAA,IAAkB,IAAA,CAAK,aAAa,IAAA,EAAM;AAEzD,MAAA,IAAA,GAAO,IAAA;AAAA,IACT;AAAA,EACF;AACA,EAAA,OAAO,IAAA;AACT;AAQA,SAAS,MAAA,CACP,QACA,MAAA,EACiE;AACjE,EAAA,IAAI,IAAA,GAAsB,IAAA;AAC1B,EAAA,IAAI,SAAA,GAAY,CAAA;AAChB,EAAA,IAAI,WAAA,GAAc,CAAA;AAClB,EAAA,MAAM,cAAc,MAAA,KAAW,IAAA,GAAQ,OAAO,GAAA,CAAI,MAAM,KAAK,CAAA,GAAK,CAAA;AAElE,EAAA,KAAA,MAAW,CAAC,QAAA,EAAU,GAAG,CAAA,IAAK,MAAA,EAAQ;AAEpC,IAAA,MAAM,KAAA,GAAQ,WAAW,IAAA,IAAQ,QAAA,KAAa,SAAS,IAAA,CAAK,GAAA,CAAI,GAAA,EAAK,WAAW,CAAA,GAAI,GAAA;AACpF,IAAA,IAAI,QAAQ,SAAA,EAAW;AACrB,MAAA,WAAA,GAAc,SAAA;AACd,MAAA,SAAA,GAAY,KAAA;AACZ,MAAA,IAAA,GAAO,QAAA;AAAA,IACT,CAAA,MAAA,IAAW,QAAQ,WAAA,EAAa;AAC9B,MAAA,WAAA,GAAc,KAAA;AAAA,IAChB;AAAA,EACF;AAEA,EAAA,IAAI,WAAW,IAAA,IAAQ,IAAA,KAAS,UAAU,SAAA,KAAc,WAAA,IAAe,cAAc,CAAA,EAAG;AACtF,IAAA,WAAA,GAAc,SAAA;AACd,IAAA,IAAA,GAAO,MAAA;AACP,IAAA,SAAA,GAAY,WAAA;AAAA,EACd;AACA,EAAA,OAAO,EAAE,IAAA,EAAM,SAAA,EAAW,WAAA,EAAY;AACxC;AAEA,SAAS,QAAQ,KAAA,EAAuB;AACtC,EAAA,IAAI,CAAC,MAAA,CAAO,QAAA,CAAS,KAAK,GAAG,OAAO,CAAA;AACpC,EAAA,IAAI,KAAA,GAAQ,GAAG,OAAO,CAAA;AACtB,EAAA,IAAI,KAAA,GAAQ,GAAG,OAAO,CAAA;AACtB,EAAA,OAAO,KAAA;AACT","file":"chunk-G44HHVK5.js","sourcesContent":["import type {\n Classification,\n LanguageEvidence,\n LanguageProfile,\n NonDiscriminatingScript,\n Weights,\n} from \"./types.js\";\nimport { normalizeBCP47 } from \"./internal/bcp47.js\";\nimport { scriptOfProfile, type ScriptName } from \"./internal/classify.js\";\n\nexport interface FuseOptions {\n weights?: Weights;\n /** The candidate roster. When present, incoming evidence tags are normalized\n * into it (`uk-UA` → `uk`, `ua` → `uk`) so context signals (page/header\n * locale) land on the same code the text rungs use. */\n candidates?: readonly LanguageProfile[];\n /** How to resolve a *non-discriminating* script read (one flagged\n * `discriminating: false` — its winning script owned by ≤1 roster candidate).\n * Default `\"candidate\"` keeps current behavior; `\"unknown\"` drops such a read\n * unless non-script evidence corroborates the same language. See\n * {@link NonDiscriminatingScript}. */\n nonDiscriminatingScript?: NonDiscriminatingScript;\n}\n\n/** Default per-kind weights. Clear lexical signal (script, explicit locale)\n * outweighs contextual signal (page tags, headers). Callers override per\n * `source` id or `kind` via {@link FuseOptions.weights}. */\nconst DEFAULT_KIND_WEIGHT: Record<string, number> = {\n \"title-script\": 1,\n \"explicit-locale\": 1,\n \"chrome-ai\": 1,\n \"source-prior\": 0.7,\n franc: 0.7,\n \"http-content-language\": 0.6,\n \"meta-content-language\": 0.55,\n \"meta-og-locale\": 0.55,\n \"html-lang\": 0.5,\n};\n\n/** Evidence kinds that constitute *clear script evidence* — a verdict the text\n * classifier or an on-device model reached by actually reading the string. The\n * guard below forbids weaker page/header *context* from flipping these. */\nconst SCRIPT_KINDS = new Set<string>([\"title-script\", \"franc\", \"chrome-ai\"]);\n\n/** A script verdict this confident is treated as settled — context may add to it\n * but must not flip the winner to a different language. */\nconst SCRIPT_CONFIDENCE_FLOOR = 0.6;\n\nconst MIN_WINNING_SCORE = 0.35;\nconst MIN_MARGIN = 0.12;\n\n/**\n * Combine evidence into a single weighted verdict with an audit trail.\n *\n * Three steps:\n * 1. Normalize each item's language tag into the candidate roster (BCP-47:\n * `uk-UA`/`ua` → `uk`) so text, page, and header signals agree on a code.\n * 2. Weighted argmax over languages (caller weights override per `source`/`kind`).\n * 3. Apply the guard **context must never override clear script evidence**: when\n * the text classifier (or an on-device model) confidently read one language,\n * weaker page/header context for a *different* language cannot win — a\n * Ukrainian page chrome does not make a Latin/English title Ukrainian.\n */\nexport function fuse(\n evidence: readonly LanguageEvidence[],\n options: FuseOptions = {},\n): Classification {\n const weights = options.weights ?? {};\n const normalized = normalizeEvidence(evidence, options.candidates);\n\n // Under `\"unknown\"`, a non-discriminating script read scores nothing on its own\n // — it's dropped from the tally and the pin below — but stays in the trail. AND\n // context written in a *different script* than that title is dropped too: a\n // foreign-script title's language is never named by page/transport context in\n // another script (a Latin title on a Ukrainian page is a foreign title in a\n // Ukrainian UI, not a Ukrainian title). The full `normalized` set is still\n // returned as evidence.\n const scoring =\n options.nonDiscriminatingScript === \"unknown\"\n ? filterForUnknownMode(normalized, options.candidates)\n : normalized;\n\n const scores = new Map<string, number>();\n for (const item of scoring) {\n if (item.language === \"unknown\") continue;\n const weight =\n weights[item.source] ?? weights[item.kind] ?? DEFAULT_KIND_WEIGHT[item.kind] ?? 0.5;\n scores.set(item.language, (scores.get(item.language) ?? 0) + clamp01(item.confidence) * weight);\n }\n\n // The context-vs-script guard: a confident script read pins the winner.\n const pinned = confidentScriptLanguage(scoring);\n\n const { best, bestScore, secondScore } = argmax(scores, pinned);\n\n if (best === null || bestScore < MIN_WINNING_SCORE || bestScore - secondScore < MIN_MARGIN) {\n // A pinned script language still wins even on a thin margin — clear script\n // evidence is never demoted to \"unknown\" by competing context.\n if (pinned !== null && scores.has(pinned)) {\n const score = scores.get(pinned) ?? 0;\n return {\n language: pinned,\n confidence: clamp01(score / (score + 0.15)),\n evidence: [...normalized],\n };\n }\n return { language: \"unknown\", confidence: clamp01(bestScore), evidence: [...normalized] };\n }\n\n return {\n language: best,\n confidence: clamp01(bestScore / (bestScore + secondScore + 0.15)),\n evidence: [...normalized],\n };\n}\n\n/** Normalize each item's tag into the roster's code space (BCP-47-aware). Items\n * already `\"unknown\"` pass through untouched. Tags are BCP-47-normalized\n * (`en-US` → `en`, `ua` → `uk`) so text, page, and header signals land on the\n * same code. The normalized code is kept even when it falls outside the roster —\n * argmax simply won't favor an out-of-roster context tag, but it stays in the\n * audit trail.\n *\n * The roster is accepted (and reserved) so a future revision can fold roster\n * aliasing in without a signature change; today BCP-47 normalization alone\n * reconciles the codes the producers emit. */\nfunction normalizeEvidence(\n evidence: readonly LanguageEvidence[],\n _candidates: readonly LanguageProfile[] | undefined,\n): LanguageEvidence[] {\n return evidence.map((item) => {\n if (item.language === \"unknown\") return item;\n const normalized = normalizeBCP47(item.language) ?? item.language;\n if (normalized === item.language) return item;\n return { ...item, language: normalized };\n });\n}\n\n/**\n * The scoring set under `nonDiscriminatingScript: \"unknown\"`. Two cuts:\n *\n * 1. Drop every *neutralized* non-discriminating script read (see\n * {@link isNeutralized}) — it names a language only by being the lone\n * candidate in its script, with nothing corroborating it.\n * 2. Drop context (page/transport) evidence whose language is in a **different\n * script** than such a neutralized title. A foreign-script title's language\n * is not the page's language: a Latin title on a `lang=\"uk\"` page must not\n * resolve to `uk`. Same-script context (an explicit `en` `Content-Language`\n * for a Latin title) survives and may still name — or, among same-script\n * candidates, disambiguate — the title.\n *\n * The second cut needs each language's script, which is derived from the\n * candidate roster's alphabets. When `candidates` is absent the scripts can't be\n * derived, so the cut is skipped and behavior falls back to cut 1 alone (the\n * 0.3.0 behavior) — never throwing.\n */\nfunction filterForUnknownMode(\n normalized: readonly LanguageEvidence[],\n candidates: readonly LanguageProfile[] | undefined,\n): LanguageEvidence[] {\n const surviving = normalized.filter((item) => !isNeutralized(item, normalized));\n\n const titleScript = nonDiscriminatingTitleScript(normalized, candidates);\n if (titleScript === null) return surviving;\n\n const scriptOf = scriptByCode(candidates ?? []);\n return surviving.filter((item) => {\n // Keep the script reads themselves and anything whose script we can't place;\n // only cross-script *context* in a known, different script is excluded.\n if (SCRIPT_KINDS.has(item.kind) || item.language === \"unknown\") return true;\n const itemScript = scriptOf.get(item.language);\n return itemScript === undefined || itemScript === titleScript;\n });\n}\n\n/** The script of the title under `\"unknown\"` mode, or `null` when there is no\n * neutralized non-discriminating script read to anchor on (so no cross-script\n * cut applies) or the roster can't place that read's language. */\nfunction nonDiscriminatingTitleScript(\n normalized: readonly LanguageEvidence[],\n candidates: readonly LanguageProfile[] | undefined,\n): ScriptName | null {\n if (candidates === undefined) return null;\n const scriptOf = scriptByCode(candidates);\n for (const item of normalized) {\n if (isNeutralized(item, normalized)) {\n const script = scriptOf.get(item.language);\n if (script !== undefined) return script;\n }\n }\n return null;\n}\n\n/** Map each roster code to the script of its alphabet (Cyrillic/Latin). Codes\n * whose alphabet carries no Cyrillic/Latin letter are omitted. */\nfunction scriptByCode(candidates: readonly LanguageProfile[]): Map<string, ScriptName> {\n const map = new Map<string, ScriptName>();\n for (const c of candidates) {\n const script = scriptOfProfile(c);\n if (script !== null) map.set(c.code, script);\n }\n return map;\n}\n\n/**\n * Whether a non-discriminating script read should score nothing (mode\n * `\"unknown\"`). True when `item` is a script kind flagged `discriminating:\n * false` (its winning script is owned by ≤1 roster candidate) AND no *non-script*\n * evidence corroborates its language. Corroboration must come from context kinds\n * (page tags, headers): two lone-candidate script reads agreeing is still two\n * defaults, not real evidence — so script kinds never corroborate one another.\n */\nfunction isNeutralized(item: LanguageEvidence, all: readonly LanguageEvidence[]): boolean {\n if (item.discriminating !== false || !SCRIPT_KINDS.has(item.kind)) return false;\n return !all.some(\n (other) =>\n other.language === item.language &&\n other.language !== \"unknown\" &&\n !SCRIPT_KINDS.has(other.kind),\n );\n}\n\n/** The language of a *clear script* read confident enough to pin the verdict, or\n * `null` when none qualifies. When two script reads disagree, the higher-\n * confidence one pins (a tie leaves nothing pinned — argmax decides normally). */\nfunction confidentScriptLanguage(evidence: readonly LanguageEvidence[]): string | null {\n let best: string | null = null;\n let bestConfidence = 0;\n for (const item of evidence) {\n if (item.language === \"unknown\" || !SCRIPT_KINDS.has(item.kind)) continue;\n const c = clamp01(item.confidence);\n if (c < SCRIPT_CONFIDENCE_FLOOR) continue;\n if (c > bestConfidence) {\n bestConfidence = c;\n best = item.language;\n } else if (c === bestConfidence && item.language !== best) {\n // Two equally-confident script reads for different languages — ambiguous.\n best = null;\n }\n }\n return best;\n}\n\n/**\n * Weighted argmax. When `pinned` is set (a confident script language), any\n * *other* language's score may only come from context kinds; that score is\n * capped so it can never exceed the pinned language. This enforces the guard\n * without discarding the context from the audit trail.\n */\nfunction argmax(\n scores: Map<string, number>,\n pinned: string | null,\n): { best: string | null; bestScore: number; secondScore: number } {\n let best: string | null = null;\n let bestScore = 0;\n let secondScore = 0;\n const pinnedScore = pinned !== null ? (scores.get(pinned) ?? 0) : 0;\n\n for (const [language, raw] of scores) {\n // Guard: a non-pinned language cannot out-score the pinned one.\n const score = pinned !== null && language !== pinned ? Math.min(raw, pinnedScore) : raw;\n if (score > bestScore) {\n secondScore = bestScore;\n bestScore = score;\n best = language;\n } else if (score > secondScore) {\n secondScore = score;\n }\n }\n // On a pinned tie (pinned capped equal to a context language), prefer pinned.\n if (pinned !== null && best !== pinned && bestScore === pinnedScore && pinnedScore > 0) {\n secondScore = bestScore;\n best = pinned;\n bestScore = pinnedScore;\n }\n return { best, bestScore, secondScore };\n}\n\nfunction clamp01(value: number): number {\n if (!Number.isFinite(value)) return 0;\n if (value < 0) return 0;\n if (value > 1) return 1;\n return value;\n}\n"]}
@@ -1,4 +1,4 @@
1
- import { classifyBySnippet } from './chunk-NCGZPEDA.js';
1
+ import { classifyBySnippet } from './chunk-U34Z3ZSV.js';
2
2
 
3
3
  // src/text.ts
4
4
  function evidenceFromText(text, candidates, rung3) {
@@ -31,5 +31,5 @@ function clamp01(value) {
31
31
  }
32
32
 
33
33
  export { evidenceFromText };
34
- //# sourceMappingURL=chunk-PT7R2BRQ.js.map
35
- //# sourceMappingURL=chunk-PT7R2BRQ.js.map
34
+ //# sourceMappingURL=chunk-KCK3XWCJ.js.map
35
+ //# sourceMappingURL=chunk-KCK3XWCJ.js.map
@@ -1 +1 @@
1
- {"version":3,"sources":["../src/text.ts"],"names":[],"mappings":";;;AAmBO,SAAS,gBAAA,CACd,IAAA,EACA,UAAA,EACA,KAAA,EACoB;AACpB,EAAA,IAAI,IAAA,KAAS,UAAa,IAAA,CAAK,IAAA,GAAO,MAAA,KAAW,CAAA,SAAU,EAAC;AAC5D,EAAA,IAAI,eAAe,MAAA,IAAa,UAAA,CAAW,MAAA,KAAW,CAAA,SAAU,EAAC;AAEjE,EAAA,MAAM,OAAA,GAAU,iBAAA,CAAkB,IAAA,EAAM,UAAA,EAAY,KAAK,CAAA;AACzD,EAAA,IAAI,OAAA,CAAQ,QAAA,KAAa,SAAA,EAAW,OAAO,EAAC;AAE5C,EAAA,MAAM,IAAA,GAAyB;AAAA,IAC7B,IAAA,EAAM,cAAA;AAAA,IACN,UAAU,OAAA,CAAQ,QAAA;AAAA,IAClB,UAAA,EAAY,kBAAA,CAAmB,OAAA,CAAQ,MAAA,EAAQ,QAAQ,IAAI,CAAA;AAAA,IAC3D,MAAA,EAAQ,cAAA;AAAA,IACR,OAAO,IAAA,CAAK,IAAA,EAAK,CAAE,KAAA,CAAM,GAAG,EAAE;AAAA,GAChC;AAIA,EAAA,IAAI,CAAC,OAAA,CAAQ,cAAA,EAAgB,IAAA,CAAK,cAAA,GAAiB,KAAA;AACnD,EAAA,OAAO,CAAC,IAAI,CAAA;AACd;AAWA,SAAS,kBAAA,CAAmB,QAAgB,IAAA,EAAoB;AAC9D,EAAA,IAAI,SAAS,CAAA,EAAG;AAEd,IAAA,OAAO,OAAA,CAAQ,GAAA,GAAM,IAAA,CAAK,GAAA,CAAI,IAAA,CAAK,GAAA,CAAI,MAAA,EAAQ,CAAC,CAAA,EAAG,CAAC,CAAA,GAAI,IAAI,CAAA;AAAA,EAC9D;AACA,EAAA,MAAM,IAAA,GAAO,IAAA,CAAK,GAAA,CAAI,MAAA,EAAQ,CAAC,CAAA;AAC/B,EAAA,OAAO,OAAA,CAAQ,MAAO,IAAA,CAAK,GAAA,CAAI,MAAM,CAAC,CAAA,GAAI,IAAK,IAAI,CAAA;AACrD;AAEA,SAAS,QAAQ,KAAA,EAAuB;AACtC,EAAA,IAAI,CAAC,MAAA,CAAO,QAAA,CAAS,KAAK,GAAG,OAAO,CAAA;AACpC,EAAA,IAAI,KAAA,GAAQ,GAAG,OAAO,CAAA;AACtB,EAAA,IAAI,KAAA,GAAQ,GAAG,OAAO,CAAA;AACtB,EAAA,OAAO,KAAA;AACT","file":"chunk-PT7R2BRQ.js","sourcesContent":["import type { LanguageEvidence, LanguageProfile } from \"./types.js\";\nimport { classifyBySnippet, type Rung, type Rung3Resolver } from \"./internal/classify.js\";\n\n/**\n * Producer: candidate-relative script + lexical signals from the title text.\n *\n * Wraps the ported snippet classifier ({@link classifyBySnippet}): noise strip →\n * dominant-script scope → distinctive letters (rung 1) → function words (2a) →\n * frequent words (2b). The `candidates` roster makes scoring roster-relative —\n * `і` decides Ukrainian only when Russian is also a candidate. Sync and\n * zero-dependency; the optional franc rung is injected via `rung3`.\n *\n * Emits at most one `kind: \"title-script\"` evidence item. The classifier's\n * integer `margin` (the winner's lead over the runner-up) maps to a 0..1\n * `confidence`: a verdict at all means the dominant script and the deciding rung\n * agreed, so the floor is high; a wider lead nudges it up. With no candidates\n * (or no usable distinctive signal) it abstains — emitting nothing rather than a\n * coarse \"unknown\", since the roster decides relevance.\n */\nexport function evidenceFromText(\n text: string | undefined,\n candidates?: readonly LanguageProfile[],\n rung3?: Rung3Resolver,\n): LanguageEvidence[] {\n if (text === undefined || text.trim().length === 0) return [];\n if (candidates === undefined || candidates.length === 0) return [];\n\n const verdict = classifyBySnippet(text, candidates, rung3);\n if (verdict.language === \"unknown\") return [];\n\n const item: LanguageEvidence = {\n kind: \"title-script\",\n language: verdict.language,\n confidence: marginToConfidence(verdict.margin, verdict.rung),\n source: \"title-script\",\n value: text.trim().slice(0, 80),\n };\n // Surface only the meaningful negative: the script was owned by ≤1 candidate,\n // so it didn't choose between candidates. The discriminating case stays narrow\n // (flag omitted). `fuse({ nonDiscriminatingScript: \"unknown\" })` reads this.\n if (!verdict.discriminating) item.discriminating = false;\n return [item];\n}\n\n/**\n * Map the classifier's per-rung lead to a 0..1 confidence.\n *\n * Rungs 1–2 carry an integer count of distinctive items (≥1). A verdict already\n * means script + rung agreed, so the floor is high (0.6) and each extra\n * distinctive item adds up to a 0.35 bonus, saturating by a lead of 4. Rung 3\n * (franc) carries franc's own 0..1 score-gap, which is weaker evidence, so it is\n * scaled into a 0.4..0.75 band.\n */\nfunction marginToConfidence(margin: number, rung: Rung): number {\n if (rung === 3) {\n // franc score-gap is already 0..1; weaker than the distinctive rungs.\n return clamp01(0.4 + Math.min(Math.max(margin, 0), 1) * 0.35);\n }\n const lead = Math.max(margin, 1);\n return clamp01(0.6 + (Math.min(lead, 4) / 4) * 0.35);\n}\n\nfunction clamp01(value: number): number {\n if (!Number.isFinite(value)) return 0;\n if (value < 0) return 0;\n if (value > 1) return 1;\n return value;\n}\n"]}
1
+ {"version":3,"sources":["../src/text.ts"],"names":[],"mappings":";;;AAmBO,SAAS,gBAAA,CACd,IAAA,EACA,UAAA,EACA,KAAA,EACoB;AACpB,EAAA,IAAI,IAAA,KAAS,UAAa,IAAA,CAAK,IAAA,GAAO,MAAA,KAAW,CAAA,SAAU,EAAC;AAC5D,EAAA,IAAI,eAAe,MAAA,IAAa,UAAA,CAAW,MAAA,KAAW,CAAA,SAAU,EAAC;AAEjE,EAAA,MAAM,OAAA,GAAU,iBAAA,CAAkB,IAAA,EAAM,UAAA,EAAY,KAAK,CAAA;AACzD,EAAA,IAAI,OAAA,CAAQ,QAAA,KAAa,SAAA,EAAW,OAAO,EAAC;AAE5C,EAAA,MAAM,IAAA,GAAyB;AAAA,IAC7B,IAAA,EAAM,cAAA;AAAA,IACN,UAAU,OAAA,CAAQ,QAAA;AAAA,IAClB,UAAA,EAAY,kBAAA,CAAmB,OAAA,CAAQ,MAAA,EAAQ,QAAQ,IAAI,CAAA;AAAA,IAC3D,MAAA,EAAQ,cAAA;AAAA,IACR,OAAO,IAAA,CAAK,IAAA,EAAK,CAAE,KAAA,CAAM,GAAG,EAAE;AAAA,GAChC;AAIA,EAAA,IAAI,CAAC,OAAA,CAAQ,cAAA,EAAgB,IAAA,CAAK,cAAA,GAAiB,KAAA;AACnD,EAAA,OAAO,CAAC,IAAI,CAAA;AACd;AAWA,SAAS,kBAAA,CAAmB,QAAgB,IAAA,EAAoB;AAC9D,EAAA,IAAI,SAAS,CAAA,EAAG;AAEd,IAAA,OAAO,OAAA,CAAQ,GAAA,GAAM,IAAA,CAAK,GAAA,CAAI,IAAA,CAAK,GAAA,CAAI,MAAA,EAAQ,CAAC,CAAA,EAAG,CAAC,CAAA,GAAI,IAAI,CAAA;AAAA,EAC9D;AACA,EAAA,MAAM,IAAA,GAAO,IAAA,CAAK,GAAA,CAAI,MAAA,EAAQ,CAAC,CAAA;AAC/B,EAAA,OAAO,OAAA,CAAQ,MAAO,IAAA,CAAK,GAAA,CAAI,MAAM,CAAC,CAAA,GAAI,IAAK,IAAI,CAAA;AACrD;AAEA,SAAS,QAAQ,KAAA,EAAuB;AACtC,EAAA,IAAI,CAAC,MAAA,CAAO,QAAA,CAAS,KAAK,GAAG,OAAO,CAAA;AACpC,EAAA,IAAI,KAAA,GAAQ,GAAG,OAAO,CAAA;AACtB,EAAA,IAAI,KAAA,GAAQ,GAAG,OAAO,CAAA;AACtB,EAAA,OAAO,KAAA;AACT","file":"chunk-KCK3XWCJ.js","sourcesContent":["import type { LanguageEvidence, LanguageProfile } from \"./types.js\";\nimport { classifyBySnippet, type Rung, type Rung3Resolver } from \"./internal/classify.js\";\n\n/**\n * Producer: candidate-relative script + lexical signals from the title text.\n *\n * Wraps the ported snippet classifier ({@link classifyBySnippet}): noise strip →\n * dominant-script scope → distinctive letters (rung 1) → function words (2a) →\n * frequent words (2b). The `candidates` roster makes scoring roster-relative —\n * `і` decides Ukrainian only when Russian is also a candidate. Sync and\n * zero-dependency; the optional franc rung is injected via `rung3`.\n *\n * Emits at most one `kind: \"title-script\"` evidence item. The classifier's\n * integer `margin` (the winner's lead over the runner-up) maps to a 0..1\n * `confidence`: a verdict at all means the dominant script and the deciding rung\n * agreed, so the floor is high; a wider lead nudges it up. With no candidates\n * (or no usable distinctive signal) it abstains — emitting nothing rather than a\n * coarse \"unknown\", since the roster decides relevance.\n */\nexport function evidenceFromText(\n text: string | undefined,\n candidates?: readonly LanguageProfile[],\n rung3?: Rung3Resolver,\n): LanguageEvidence[] {\n if (text === undefined || text.trim().length === 0) return [];\n if (candidates === undefined || candidates.length === 0) return [];\n\n const verdict = classifyBySnippet(text, candidates, rung3);\n if (verdict.language === \"unknown\") return [];\n\n const item: LanguageEvidence = {\n kind: \"title-script\",\n language: verdict.language,\n confidence: marginToConfidence(verdict.margin, verdict.rung),\n source: \"title-script\",\n value: text.trim().slice(0, 80),\n };\n // Surface only the meaningful negative: the script was owned by ≤1 candidate,\n // so it didn't choose between candidates. The discriminating case stays narrow\n // (flag omitted). `fuse({ nonDiscriminatingScript: \"unknown\" })` reads this.\n if (!verdict.discriminating) item.discriminating = false;\n return [item];\n}\n\n/**\n * Map the classifier's per-rung lead to a 0..1 confidence.\n *\n * Rungs 1–2 carry an integer count of distinctive items (≥1). A verdict already\n * means script + rung agreed, so the floor is high (0.6) and each extra\n * distinctive item adds up to a 0.35 bonus, saturating by a lead of 4. Rung 3\n * (franc) carries franc's own 0..1 score-gap, which is weaker evidence, so it is\n * scaled into a 0.4..0.75 band.\n */\nfunction marginToConfidence(margin: number, rung: Rung): number {\n if (rung === 3) {\n // franc score-gap is already 0..1; weaker than the distinctive rungs.\n return clamp01(0.4 + Math.min(Math.max(margin, 0), 1) * 0.35);\n }\n const lead = Math.max(margin, 1);\n return clamp01(0.6 + (Math.min(lead, 4) / 4) * 0.35);\n}\n\nfunction clamp01(value: number): number {\n if (!Number.isFinite(value)) return 0;\n if (value < 0) return 0;\n if (value > 1) return 1;\n return value;\n}\n"]}
@@ -33,7 +33,7 @@ function dominantScript(text) {
33
33
  if (cyr === 0 && lat === 0) return null;
34
34
  return cyr >= lat ? "cyrillic" : "latin";
35
35
  }
36
- function profileScript(profile) {
36
+ function scriptOfProfile(profile) {
37
37
  for (const ch of profile.alphabet) {
38
38
  if (CYRILLIC_RE.test(ch)) return "cyrillic";
39
39
  if (LATIN_RE.test(ch)) return "latin";
@@ -46,7 +46,7 @@ function scopeCandidates(text, candidates) {
46
46
  const seen = /* @__PURE__ */ new Set();
47
47
  const scoped = [];
48
48
  for (const c of candidates) {
49
- if (profileScript(c) !== script || seen.has(c.code)) continue;
49
+ if (scriptOfProfile(c) !== script || seen.has(c.code)) continue;
50
50
  seen.add(c.code);
51
51
  scoped.push(c);
52
52
  }
@@ -126,6 +126,6 @@ function classifyBySnippet(text, candidates, rung3) {
126
126
  return byWord ? { ...byWord, discriminating } : UNKNOWN;
127
127
  }
128
128
 
129
- export { FRANC_RUNG, classifyBySnippet, scopeCandidates };
130
- //# sourceMappingURL=chunk-NCGZPEDA.js.map
131
- //# sourceMappingURL=chunk-NCGZPEDA.js.map
129
+ export { FRANC_RUNG, classifyBySnippet, scopeCandidates, scriptOfProfile };
130
+ //# sourceMappingURL=chunk-U34Z3ZSV.js.map
131
+ //# sourceMappingURL=chunk-U34Z3ZSV.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/internal/classify.ts"],"names":[],"mappings":";AA0BO,IAAM,UAAA,GAAa;AA0B1B,IAAM,OAAA,GAA0B;AAAA,EAC9B,QAAA,EAAU,SAAA;AAAA,EACV,MAAA,EAAQ,CAAA;AAAA,EACR,IAAA,EAAM,IAAA;AAAA,EACN,cAAA,EAAgB;AAClB,CAAA;AAmBA,IAAM,WAAA,GAAc,sBAAA;AACpB,IAAM,QAAA,GAAW,mBAAA;AAsBjB,IAAM,cAAA,GAAoC;AAAA,EACxC,oBAAA;AAAA;AAAA,EACA,cAAA;AAAA;AAAA,EACA,2CAAA;AAAA;AAAA,EACA;AAAA;AACF,CAAA;AAIO,SAAS,WAAW,IAAA,EAAsB;AAC/C,EAAA,IAAI,GAAA,GAAM,IAAA;AACV,EAAA,KAAA,MAAW,MAAM,cAAA,EAAgB,GAAA,GAAM,GAAA,CAAI,OAAA,CAAQ,IAAI,GAAG,CAAA;AAC1D,EAAA,OAAO,GAAA;AACT;AAKA,SAAS,eAAe,IAAA,EAA2C;AACjE,EAAA,IAAI,GAAA,GAAM,CAAA;AACV,EAAA,IAAI,GAAA,GAAM,CAAA;AACV,EAAA,KAAA,MAAW,EAAA,IAAM,UAAA,CAAW,IAAI,CAAA,EAAG;AACjC,IAAA,IAAI,WAAA,CAAY,IAAA,CAAK,EAAE,CAAA,EAAG,GAAA,IAAO,CAAA;AAAA,SAAA,IACxB,QAAA,CAAS,IAAA,CAAK,EAAE,CAAA,EAAG,GAAA,IAAO,CAAA;AAAA,EACrC;AACA,EAAA,IAAI,GAAA,KAAQ,CAAA,IAAK,GAAA,KAAQ,CAAA,EAAG,OAAO,IAAA;AACnC,EAAA,OAAO,GAAA,IAAO,MAAM,UAAA,GAAa,OAAA;AACnC;AAMO,SAAS,gBAAgB,OAAA,EAA6C;AAC3E,EAAA,KAAA,MAAW,EAAA,IAAM,QAAQ,QAAA,EAAU;AACjC,IAAA,IAAI,WAAA,CAAY,IAAA,CAAK,EAAE,CAAA,EAAG,OAAO,UAAA;AACjC,IAAA,IAAI,QAAA,CAAS,IAAA,CAAK,EAAE,CAAA,EAAG,OAAO,OAAA;AAAA,EAChC;AACA,EAAA,OAAO,IAAA;AACT;AAMO,SAAS,eAAA,CACd,MACA,UAAA,EACK;AACL,EAAA,MAAM,MAAA,GAAS,eAAe,IAAI,CAAA;AAClC,EAAA,IAAI,MAAA,KAAW,IAAA,EAAM,OAAO,EAAC;AAI7B,EAAA,MAAM,IAAA,uBAAW,GAAA,EAAY;AAC7B,EAAA,MAAM,SAAc,EAAC;AACrB,EAAA,KAAA,MAAW,KAAK,UAAA,EAAY;AAC1B,IAAA,IAAI,eAAA,CAAgB,CAAC,CAAA,KAAM,MAAA,IAAU,KAAK,GAAA,CAAI,CAAA,CAAE,IAAI,CAAA,EAAG;AACvD,IAAA,IAAA,CAAK,GAAA,CAAI,EAAE,IAAI,CAAA;AACf,IAAA,MAAA,CAAO,KAAK,CAAC,CAAA;AAAA,EACf;AACA,EAAA,OAAO,MAAA;AACT;AA8BA,SAAS,SAAS,IAAA,EAAwB;AACxC,EAAA,OAAO,KAAK,WAAA,EAAY,CAAE,KAAA,CAAM,UAAU,KAAK,EAAC;AAClD;AAOA,SAAS,KAAA,CAAM,OAAyB,UAAA,EAAwD;AAC9F,EAAA,MAAM,MAAA,GAAS,IAAI,GAAA,CAAoB,UAAA,CAAW,GAAA,CAAI,CAAC,CAAA,KAAM,CAAC,CAAA,CAAE,IAAA,EAAM,CAAC,CAAC,CAAC,CAAA;AACzE,EAAA,KAAA,MAAW,QAAQ,KAAA,EAAO;AACxB,IAAA,IAAI,KAAA,GAAuB,IAAA;AAC3B,IAAA,IAAI,MAAA,GAAS,CAAA;AACb,IAAA,KAAA,MAAW,KAAK,UAAA,EAAY;AAC1B,MAAA,IAAI,CAAA,CAAE,GAAA,CAAI,GAAA,CAAI,IAAI,CAAA,EAAG;AACnB,QAAA,MAAA,IAAU,CAAA;AACV,QAAA,IAAI,SAAS,CAAA,EAAG;AACd,UAAA,KAAA,GAAQ,IAAA;AACR,UAAA;AAAA,QACF;AACA,QAAA,KAAA,GAAQ,CAAA,CAAE,IAAA;AAAA,MACZ;AAAA,IACF;AACA,IAAA,IAAI,KAAA,KAAU,IAAA,EAAM,MAAA,CAAO,GAAA,CAAI,KAAA,EAAA,CAAQ,OAAO,GAAA,CAAI,KAAK,CAAA,IAAK,CAAA,IAAK,CAAC,CAAA;AAAA,EACpE;AACA,EAAA,OAAO,MAAA;AACT;AAGA,SAAS,OAAO,MAAA,EAAsE;AACpF,EAAA,IAAI,GAAA,GAAM,EAAA;AACV,EAAA,IAAI,MAAA,GAAS,EAAA;AACb,EAAA,IAAI,IAAA,GAAsB,IAAA;AAC1B,EAAA,KAAA,MAAW,CAAC,CAAA,EAAG,KAAK,CAAA,IAAK,MAAA,EAAQ;AAC/B,IAAA,IAAI,QAAQ,GAAA,EAAK;AACf,MAAA,MAAA,GAAS,GAAA;AACT,MAAA,GAAA,GAAM,KAAA;AACN,MAAA,IAAA,GAAO,CAAA;AAAA,IACT,CAAA,MAAA,IAAW,QAAQ,MAAA,EAAQ;AACzB,MAAA,MAAA,GAAS,KAAA;AAAA,IACX;AAAA,EACF;AACA,EAAA,IAAI,IAAA,KAAS,IAAA,IAAQ,GAAA,GAAM,CAAA,EAAG,OAAO,IAAA;AACrC,EAAA,MAAM,MAAA,GAAS,GAAA,GAAM,IAAA,CAAK,GAAA,CAAI,QAAQ,CAAC,CAAA;AACvC,EAAA,OAAO,MAAA,IAAU,CAAA,GAAI,EAAE,IAAA,EAAM,QAAO,GAAI,IAAA;AAC1C;AAEA,SAAS,aAAA,CACP,YACA,IAAA,EACc;AACd,EAAA,OAAO,UAAA,CAAW,GAAA,CAAI,CAAC,CAAA,MAAO,EAAE,IAAA,EAAM,CAAA,CAAE,IAAA,EAAM,GAAA,EAAK,IAAI,GAAA,CAAI,IAAA,CAAK,CAAC,CAAC,GAAE,CAAE,CAAA;AACxE;AAIA,SAAS,UAAA,CAAW,MAAc,MAAA,EAAwD;AACxF,EAAA,MAAM,CAAA,GAAI,MAAA;AAAA,IACR,KAAA;AAAA,MACE,KAAK,WAAA,EAAY;AAAA,MACjB,aAAA,CAAc,QAAQ,CAAC,CAAA,KAAM,EAAE,QAAA,IAAY,CAAA,CAAE,SAAS,EAAA,CAAG;AAAA;AAC3D,GACF;AACA,EAAA,OAAO,CAAA,GAAI,EAAE,QAAA,EAAU,CAAA,CAAE,IAAA,EAAM,QAAQ,CAAA,CAAE,MAAA,EAAQ,IAAA,EAAM,CAAA,EAAE,GAAI,IAAA;AAC/D;AAGA,SAAS,QAAA,CACP,MAAA,EACA,MAAA,EACA,IAAA,EACA,IAAA,EACoB;AACpB,EAAA,MAAM,CAAA,GAAI,MAAA;AAAA,IACR,KAAA;AAAA,MACE,MAAA;AAAA,MACA,aAAA,CAAc,QAAQ,CAAC,CAAA,KAAM,EAAE,KAAA,GAAQ,IAAI,CAAA,IAAK,EAAE;AAAA;AACpD,GACF;AACA,EAAA,OAAO,CAAA,GAAI,EAAE,QAAA,EAAU,CAAA,CAAE,MAAM,MAAA,EAAQ,CAAA,CAAE,MAAA,EAAQ,IAAA,EAAK,GAAI,IAAA;AAC5D;AAcO,SAAS,iBAAA,CACd,IAAA,EACA,UAAA,EACA,KAAA,EACgB;AAChB,EAAA,IAAI,CAAC,IAAA,IAAQ,UAAA,CAAW,MAAA,KAAW,GAAG,OAAO,OAAA;AAI7C,EAAA,MAAM,OAAA,GAAU,WAAW,IAAI,CAAA;AAG/B,EAAA,MAAM,MAAA,GAAS,eAAA,CAAgB,OAAA,EAAS,UAAU,CAAA;AAClD,EAAA,IAAI,MAAA,CAAO,MAAA,KAAW,CAAA,EAAG,OAAO,OAAA;AAKhC,EAAA,MAAM,cAAA,GAAiB,OAAO,MAAA,IAAU,CAAA;AAExC,EAAA,MAAM,QAAA,GAAW,UAAA,CAAW,OAAA,EAAS,MAAM,CAAA;AAC3C,EAAA,IAAI,QAAA,EAAU,OAAO,EAAE,GAAG,UAAU,cAAA,EAAe;AAEnD,EAAA,MAAM,MAAA,GAAS,SAAS,OAAO,CAAA;AAC/B,EAAA,IAAI,MAAA,CAAO,MAAA,KAAW,CAAA,EAAG,OAAO,OAAA;AAEhC,EAAA,MAAM,MAAA,GACJ,QAAA,CAAS,MAAA,EAAQ,MAAA,EAAQ,YAAY,IAAI,CAAA,IACzC,QAAA,CAAS,MAAA,EAAQ,QAAQ,UAAA,EAAY,IAAI,CAAA,IACzC,KAAA,GAAQ,SAAS,MAAM,CAAA;AACzB,EAAA,OAAO,MAAA,GAAS,EAAE,GAAG,MAAA,EAAQ,gBAAe,GAAI,OAAA;AAClD","file":"chunk-U34Z3ZSV.js","sourcesContent":["/**\n * Per-snippet language classification by candidate-set-relative set-difference.\n *\n * A ladder of rungs; the first rung whose leader clears a lead (margin) of ≥1\n * wins; otherwise `\"unknown\"`:\n *\n * 1 alphabet — characters distinctive within the candidate set\n * 2a function words — curated grammatical markers (highest precision)\n * 2b frequent words — corpus content words\n * 3 franc — optional trigram backstop for the distinctive-free\n * residual, injected as a resolver (this module stays\n * franc-free and importable without franc's tables)\n *\n * \"Distinctive\" is ALWAYS relative to the candidate set: a signal counts for a\n * candidate iff it appears in that candidate's profile and in NO other\n * candidate's. So `і` decides {uk, ru} (only uk has it) but is inert in\n * {uk, be} (both have it), and the word `и` decides {uk, ru} even though the\n * *letter* `и` is shared. Nothing is precomputed — uniqueness is the runtime\n * output, never stored.\n *\n * Adapted to langtell's {@link LanguageProfile} shape: the `words` and `iso6393`\n * fields are optional here, so a bare `{ code, alphabet }` profile still\n * classifies on rung 1.\n */\nimport type { LanguageProfile } from \"../types.js\";\n\nexport const FRANC_RUNG = 3;\n\n/** Which rung decided a verdict; `null` when unknown. */\nexport type Rung = 1 | \"2a\" | \"2b\" | typeof FRANC_RUNG | null;\n\nexport interface SnippetVerdict {\n /** Winning language code, or the sentinel `\"unknown\"`. */\n language: string;\n /** Lead of the winner over the runner-up, in the rung's own unit (distinctive\n * char/word count for rungs 1–2; franc score-gap for rung 3). 0 when unknown. */\n margin: number;\n /** Which rung decided; `null` when unknown. */\n rung: Rung;\n /** Whether ≥2 same-script candidates were in scope when the verdict was\n * reached. `true` ⇒ the distinctive-letter/word machinery actually chose\n * between candidates; `false` ⇒ the winner was the lone candidate in its\n * script, selected by script alone (no evidence it is *distinctively* that\n * language). `false` for `\"unknown\"`. */\n discriminating: boolean;\n}\n\n/** A rung's verdict before {@link classifyBySnippet} stamps on the scope-derived\n * `discriminating` flag (which a single rung can't know — it depends on how many\n * same-script candidates were scoped). */\nexport type RungVerdict = Pick<SnippetVerdict, \"language\" | \"margin\" | \"rung\">;\n\nconst UNKNOWN: SnippetVerdict = {\n language: \"unknown\",\n margin: 0,\n rung: null,\n discriminating: false,\n};\n\n/** Resolver for rung 3 (the optional trigram backstop), injected into\n * {@link classifyBySnippet} by callers that have franc available. Kept as an\n * injected seam — not a direct import — so this module stays franc-free and\n * importable without pulling franc's tables. Returns a rung-3 verdict or\n * `null` (abstain).\n *\n * Generic over the concrete profile type `P` the caller classifies with, so a\n * consumer that defines a stricter profile (e.g. `words` required) can type its\n * resolver over that exact shape and hand it to {@link classifyBySnippet} with\n * no adapter — the resolver sees `readonly P[]`, the same array the classifier\n * scoped from its input. Defaults to {@link LanguageProfile} for callers that\n * don't narrow. */\nexport type Rung3Resolver<P extends LanguageProfile = LanguageProfile> = (\n text: string,\n scoped: readonly P[],\n) => RungVerdict | null;\n\nconst CYRILLIC_RE = /\\p{Script=Cyrillic}/u;\nconst LATIN_RE = /\\p{Script=Latin}/u;\n\n/** A coarse script bucket — the only two the candidate-relative classifier\n * distinguishes today. `null` means \"no letters / undetermined\". */\nexport type ScriptName = \"cyrillic\" | \"latin\";\n\n/** Below this length, trigrams are too noisy to justify a rung-3 verdict. */\nexport const RUNG3_MIN_LENGTH = 24;\n\n/**\n * Trailing/inline Latin \"noise\" tokens — URLs, @handles, #hashtags — that a\n * Cyrillic title commonly carries (a headline followed by a link or a social\n * handle). These are almost always Latin even on Cyrillic-language content, so\n * left in they can flip {@link dominantScript} to Latin and let genuinely\n * Cyrillic content scope to the wrong roster. Stripped before the script vote\n * AND before the rung tallies so the URL's letters never contribute either.\n *\n * Kept as separate simple patterns (applied in order — schemes/www before bare\n * domains) rather than one big alternation, so each stays readable. ASCII-only\n * `[a-z0-9-]` in the domain pattern means a Cyrillic word is never mistaken for\n * a domain.\n */\nconst NOISE_PATTERNS: readonly RegExp[] = [\n /\\bhttps?:\\/\\/\\S+/gi, // full URLs\n /\\bwww\\.\\S+/gi, // www.… without a scheme\n /\\b[a-z0-9-]+(?:\\.[a-z0-9-]+)+(?:\\/\\S*)?/gi, // bare domains (example.com/path)\n /[@#][\\p{L}\\p{N}_]+/gu, // @handles and #hashtags\n];\n\n/** Drop URLs / @handles / #hashtags so trailing Latin noise can't outvote the\n * prose's script or pollute the per-rung tallies. */\nexport function stripNoise(text: string): string {\n let out = text;\n for (const re of NOISE_PATTERNS) out = out.replace(re, \" \");\n return out;\n}\n\n/** The script most of `text` is written in, or `null` if it carries no letters.\n * Noise (URLs/handles/hashtags) is stripped first so a single trailing link\n * can't flip a multi-word Cyrillic title's vote to Latin. */\nfunction dominantScript(text: string): \"cyrillic\" | \"latin\" | null {\n let cyr = 0;\n let lat = 0;\n for (const ch of stripNoise(text)) {\n if (CYRILLIC_RE.test(ch)) cyr += 1;\n else if (LATIN_RE.test(ch)) lat += 1;\n }\n if (cyr === 0 && lat === 0) return null;\n return cyr >= lat ? \"cyrillic\" : \"latin\";\n}\n\n/** The script a profile's alphabet is written in, or `null` if it carries no\n * Cyrillic/Latin letter. Exported so the fuser can derive each roster\n * candidate's script without re-deriving the script regexes — a Latin alphabet\n * ⇒ `\"latin\"`, a Cyrillic one ⇒ `\"cyrillic\"`. */\nexport function scriptOfProfile(profile: LanguageProfile): ScriptName | null {\n for (const ch of profile.alphabet) {\n if (CYRILLIC_RE.test(ch)) return \"cyrillic\";\n if (LATIN_RE.test(ch)) return \"latin\";\n }\n return null;\n}\n\n/** Candidates whose script matches the text's dominant script (others can't tip\n * the verdict). Empty when the text carries no letters. Generic over the\n * concrete profile type `P`: the result is a subset of the input array, so it\n * keeps `P` — a stricter caller's profiles stay strictly typed downstream. */\nexport function scopeCandidates<P extends LanguageProfile>(\n text: string,\n candidates: readonly P[],\n): P[] {\n const script = dominantScript(text);\n if (script === null) return [];\n // Keep one profile per code. A language listed twice would otherwise make its\n // own distinctive chars/words read as \"owned by ≥2 candidates\" in `tally`,\n // cancelling them out and collapsing the verdict to \"unknown\".\n const seen = new Set<string>();\n const scoped: P[] = [];\n for (const c of candidates) {\n if (scriptOfProfile(c) !== script || seen.has(c.code)) continue;\n seen.add(c.code);\n scoped.push(c);\n }\n return scoped;\n}\n\n/**\n * Per-language set of characters globally unique within `profiles` — present in\n * exactly one profile's alphabet. Relative to the given profile set: the unique\n * set shrinks as languages are added (a second Latin language un-uniques a–z).\n */\nexport function distinctiveChars(profiles: readonly LanguageProfile[]): Map<string, Set<string>> {\n const owners = new Map<string, string[]>();\n for (const p of profiles) {\n for (const ch of new Set(p.alphabet)) {\n const list = owners.get(ch);\n if (list) list.push(p.code);\n else owners.set(ch, [p.code]);\n }\n }\n const result = new Map<string, Set<string>>(profiles.map((p) => [p.code, new Set()]));\n for (const [ch, codes] of owners) {\n const [only] = codes;\n if (codes.length === 1 && only !== undefined) result.get(only)?.add(ch);\n }\n return result;\n}\n\ninterface Membership {\n code: string;\n set: ReadonlySet<string>;\n}\n\n/** Lowercased Unicode letter-run tokens. Keeps single-char tokens (`і`, `и`). */\nfunction tokenize(text: string): string[] {\n return text.toLowerCase().match(/\\p{L}+/gu) ?? [];\n}\n\n/**\n * Tally how many items (characters or word tokens) are distinctive to each\n * candidate — present in exactly one candidate's set. Items owned by zero or by\n * ≥2 candidates contribute nothing.\n */\nfunction tally(items: Iterable<string>, membership: readonly Membership[]): Map<string, number> {\n const scores = new Map<string, number>(membership.map((m) => [m.code, 0]));\n for (const item of items) {\n let owner: string | null = null;\n let owners = 0;\n for (const m of membership) {\n if (m.set.has(item)) {\n owners += 1;\n if (owners > 1) {\n owner = null;\n break;\n }\n owner = m.code;\n }\n }\n if (owner !== null) scores.set(owner, (scores.get(owner) ?? 0) + 1);\n }\n return scores;\n}\n\n/** The leading candidate and its lead over the runner-up, or `null` if <1. */\nfunction leader(scores: Map<string, number>): { code: string; margin: number } | null {\n let max = -1;\n let second = -1;\n let code: string | null = null;\n for (const [c, score] of scores) {\n if (score > max) {\n second = max;\n max = score;\n code = c;\n } else if (score > second) {\n second = score;\n }\n }\n if (code === null || max < 1) return null;\n const margin = max - Math.max(second, 0);\n return margin >= 1 ? { code, margin } : null;\n}\n\nfunction membershipFor(\n candidates: readonly LanguageProfile[],\n pick: (p: LanguageProfile) => Iterable<string>,\n): Membership[] {\n return candidates.map((c) => ({ code: c.code, set: new Set(pick(c)) }));\n}\n\n/** Rung 1 — characters (alphabet + orthographic {@link LanguageProfile.marks})\n * distinctive within the scoped candidate set. */\nfunction letterRung(text: string, scoped: readonly LanguageProfile[]): RungVerdict | null {\n const r = leader(\n tally(\n text.toLowerCase(),\n membershipFor(scoped, (p) => p.alphabet + (p.marks ?? \"\")),\n ),\n );\n return r ? { language: r.code, margin: r.margin, rung: 1 } : null;\n}\n\n/** Rung 2 — distinctive words from the given tier (2a function, 2b frequent). */\nfunction wordRung(\n tokens: readonly string[],\n scoped: readonly LanguageProfile[],\n tier: \"function\" | \"frequent\",\n rung: \"2a\" | \"2b\",\n): RungVerdict | null {\n const r = leader(\n tally(\n tokens,\n membershipFor(scoped, (p) => p.words?.[tier] ?? []),\n ),\n );\n return r ? { language: r.code, margin: r.margin, rung } : null;\n}\n\n/**\n * Classify `text` among `candidates`. Synchronous and allocation-light. Returns\n * `\"unknown\"` on empty evidence, on a tie inside the candidate set, or when\n * nothing is distinctive.\n *\n * Generic over the concrete profile type `P`, inferred from `candidates`. The\n * optional `rung3` resolver is typed over the same `P`, so a consumer with a\n * stricter profile (e.g. `words` required) can pass its own resolver directly,\n * with no adapter — the resolver sees exactly the profiles the caller passed.\n * `P` defaults to {@link LanguageProfile}, so the bare two-argument form and\n * every existing call site are unchanged.\n */\nexport function classifyBySnippet<P extends LanguageProfile = LanguageProfile>(\n text: string,\n candidates: readonly P[],\n rung3?: Rung3Resolver<P>,\n): SnippetVerdict {\n if (!text || candidates.length === 0) return UNKNOWN;\n\n // Drop URLs / @handles / #hashtags once, up front: trailing Latin noise must\n // not flip the dominant-script vote nor pollute the per-rung tallies.\n const cleaned = stripNoise(text);\n\n // Restrict to candidates in the text's dominant script.\n const scoped = scopeCandidates(cleaned, candidates);\n if (scoped.length === 0) return UNKNOWN;\n\n // ≥2 same-script candidates means the distinctive machinery actually had a\n // choice to make; a lone scoped candidate wins by script alone. Stamped onto\n // whichever rung decides — a single rung can't see the scope size.\n const discriminating = scoped.length >= 2;\n\n const byLetter = letterRung(cleaned, scoped);\n if (byLetter) return { ...byLetter, discriminating };\n\n const tokens = tokenize(cleaned);\n if (tokens.length === 0) return UNKNOWN;\n\n const byWord =\n wordRung(tokens, scoped, \"function\", \"2a\") ??\n wordRung(tokens, scoped, \"frequent\", \"2b\") ??\n rung3?.(cleaned, scoped);\n return byWord ? { ...byWord, discriminating } : UNKNOWN;\n}\n"]}
@@ -51,13 +51,27 @@ type RungVerdict = Pick<SnippetVerdict, "language" | "margin" | "rung">;
51
51
  * {@link classifyBySnippet} by callers that have franc available. Kept as an
52
52
  * injected seam — not a direct import — so this module stays franc-free and
53
53
  * importable without pulling franc's tables. Returns a rung-3 verdict or
54
- * `null` (abstain). */
55
- type Rung3Resolver = (text: string, scoped: readonly LanguageProfile[]) => RungVerdict | null;
54
+ * `null` (abstain).
55
+ *
56
+ * Generic over the concrete profile type `P` the caller classifies with, so a
57
+ * consumer that defines a stricter profile (e.g. `words` required) can type its
58
+ * resolver over that exact shape and hand it to {@link classifyBySnippet} with
59
+ * no adapter — the resolver sees `readonly P[]`, the same array the classifier
60
+ * scoped from its input. Defaults to {@link LanguageProfile} for callers that
61
+ * don't narrow. */
62
+ type Rung3Resolver<P extends LanguageProfile = LanguageProfile> = (text: string, scoped: readonly P[]) => RungVerdict | null;
56
63
  /**
57
64
  * Classify `text` among `candidates`. Synchronous and allocation-light. Returns
58
65
  * `"unknown"` on empty evidence, on a tie inside the candidate set, or when
59
66
  * nothing is distinctive.
67
+ *
68
+ * Generic over the concrete profile type `P`, inferred from `candidates`. The
69
+ * optional `rung3` resolver is typed over the same `P`, so a consumer with a
70
+ * stricter profile (e.g. `words` required) can pass its own resolver directly,
71
+ * with no adapter — the resolver sees exactly the profiles the caller passed.
72
+ * `P` defaults to {@link LanguageProfile}, so the bare two-argument form and
73
+ * every existing call site are unchanged.
60
74
  */
61
- declare function classifyBySnippet(text: string, candidates: readonly LanguageProfile[], rung3?: Rung3Resolver): SnippetVerdict;
75
+ declare function classifyBySnippet<P extends LanguageProfile = LanguageProfile>(text: string, candidates: readonly P[], rung3?: Rung3Resolver<P>): SnippetVerdict;
62
76
 
63
77
  export { FRANC_RUNG, type Rung, type Rung3Resolver, type RungVerdict, type SnippetVerdict, classifyBySnippet };
package/dist/classify.js CHANGED
@@ -1,3 +1,3 @@
1
- export { FRANC_RUNG, classifyBySnippet } from './chunk-NCGZPEDA.js';
1
+ export { FRANC_RUNG, classifyBySnippet } from './chunk-U34Z3ZSV.js';
2
2
  //# sourceMappingURL=classify.js.map
3
3
  //# sourceMappingURL=classify.js.map
package/dist/franc.js CHANGED
@@ -1,4 +1,4 @@
1
- import { scopeCandidates } from './chunk-NCGZPEDA.js';
1
+ import { scopeCandidates } from './chunk-U34Z3ZSV.js';
2
2
  import { francAll } from 'franc';
3
3
 
4
4
  var RUNG_MIN_LENGTH = 24;
package/dist/fuse.js CHANGED
@@ -1,4 +1,5 @@
1
- export { fuse } from './chunk-7G3MEXWK.js';
1
+ export { fuse } from './chunk-G44HHVK5.js';
2
2
  import './chunk-OVSPOZ5J.js';
3
+ import './chunk-U34Z3ZSV.js';
3
4
  //# sourceMappingURL=fuse.js.map
4
5
  //# sourceMappingURL=fuse.js.map
package/dist/index.js CHANGED
@@ -1,13 +1,13 @@
1
- import { evidenceFromText } from './chunk-PT7R2BRQ.js';
2
- export { evidenceFromText } from './chunk-PT7R2BRQ.js';
1
+ import { evidenceFromText } from './chunk-KCK3XWCJ.js';
2
+ export { evidenceFromText } from './chunk-KCK3XWCJ.js';
3
3
  import { evidenceFromHtml } from './chunk-KI4MAI3N.js';
4
4
  export { evidenceFromHtml } from './chunk-KI4MAI3N.js';
5
5
  import { evidenceFromHeaders } from './chunk-3LDE35U2.js';
6
6
  export { evidenceFromHeaders } from './chunk-3LDE35U2.js';
7
- import { fuse } from './chunk-7G3MEXWK.js';
8
- export { fuse } from './chunk-7G3MEXWK.js';
7
+ import { fuse } from './chunk-G44HHVK5.js';
8
+ export { fuse } from './chunk-G44HHVK5.js';
9
9
  export { normalizeBCP47, normalizeLanguageCode, primarySubtag } from './chunk-OVSPOZ5J.js';
10
- import './chunk-NCGZPEDA.js';
10
+ import './chunk-U34Z3ZSV.js';
11
11
 
12
12
  // src/compile.ts
13
13
  function builtIns(candidates) {
package/dist/text.js CHANGED
@@ -1,4 +1,4 @@
1
- export { evidenceFromText } from './chunk-PT7R2BRQ.js';
2
- import './chunk-NCGZPEDA.js';
1
+ export { evidenceFromText } from './chunk-KCK3XWCJ.js';
2
+ import './chunk-U34Z3ZSV.js';
3
3
  //# sourceMappingURL=text.js.map
4
4
  //# sourceMappingURL=text.js.map
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "langtell",
3
- "version": "0.3.0",
3
+ "version": "0.4.0",
4
4
  "description": "Tell me the language — evidence-fusion language detection for short strings, with an auditable confidence trail.",
5
5
  "type": "module",
6
6
  "license": "MIT",
@@ -1 +0,0 @@
1
- {"version":3,"sources":["../src/fuse.ts"],"names":[],"mappings":";;;AA0BA,IAAM,mBAAA,GAA8C;AAAA,EAClD,cAAA,EAAgB,CAAA;AAAA,EAChB,iBAAA,EAAmB,CAAA;AAAA,EACnB,WAAA,EAAa,CAAA;AAAA,EACb,cAAA,EAAgB,GAAA;AAAA,EAChB,KAAA,EAAO,GAAA;AAAA,EACP,uBAAA,EAAyB,GAAA;AAAA,EACzB,uBAAA,EAAyB,IAAA;AAAA,EACzB,gBAAA,EAAkB,IAAA;AAAA,EAClB,WAAA,EAAa;AACf,CAAA;AAKA,IAAM,+BAAe,IAAI,GAAA,CAAY,CAAC,cAAA,EAAgB,OAAA,EAAS,WAAW,CAAC,CAAA;AAI3E,IAAM,uBAAA,GAA0B,GAAA;AAEhC,IAAM,iBAAA,GAAoB,IAAA;AAC1B,IAAM,UAAA,GAAa,IAAA;AAcZ,SAAS,IAAA,CACd,QAAA,EACA,OAAA,GAAuB,EAAC,EACR;AAChB,EAAA,MAAM,OAAA,GAAU,OAAA,CAAQ,OAAA,IAAW,EAAC;AACpC,EAAA,MAAM,UAAA,GAAa,iBAAA,CAAkB,QAAA,EAAU,OAAA,CAAQ,UAAU,CAAA;AAKjE,EAAA,MAAM,OAAA,GACJ,OAAA,CAAQ,uBAAA,KAA4B,SAAA,GAChC,UAAA,CAAW,MAAA,CAAO,CAAC,IAAA,KAAS,CAAC,aAAA,CAAc,IAAA,EAAM,UAAU,CAAC,CAAA,GAC5D,UAAA;AAEN,EAAA,MAAM,MAAA,uBAAa,GAAA,EAAoB;AACvC,EAAA,KAAA,MAAW,QAAQ,OAAA,EAAS;AAC1B,IAAA,IAAI,IAAA,CAAK,aAAa,SAAA,EAAW;AACjC,IAAA,MAAM,MAAA,GACJ,OAAA,CAAQ,IAAA,CAAK,MAAM,CAAA,IAAK,OAAA,CAAQ,IAAA,CAAK,IAAI,CAAA,IAAK,mBAAA,CAAoB,IAAA,CAAK,IAAI,CAAA,IAAK,GAAA;AAClF,IAAA,MAAA,CAAO,GAAA,CAAI,IAAA,CAAK,QAAA,EAAA,CAAW,MAAA,CAAO,GAAA,CAAI,IAAA,CAAK,QAAQ,CAAA,IAAK,CAAA,IAAK,OAAA,CAAQ,IAAA,CAAK,UAAU,IAAI,MAAM,CAAA;AAAA,EAChG;AAGA,EAAA,MAAM,MAAA,GAAS,wBAAwB,OAAO,CAAA;AAE9C,EAAA,MAAM,EAAE,IAAA,EAAM,SAAA,EAAW,aAAY,GAAI,MAAA,CAAO,QAAQ,MAAM,CAAA;AAE9D,EAAA,IAAI,SAAS,IAAA,IAAQ,SAAA,GAAY,iBAAA,IAAqB,SAAA,GAAY,cAAc,UAAA,EAAY;AAG1F,IAAA,IAAI,MAAA,KAAW,IAAA,IAAQ,MAAA,CAAO,GAAA,CAAI,MAAM,CAAA,EAAG;AACzC,MAAA,MAAM,KAAA,GAAQ,MAAA,CAAO,GAAA,CAAI,MAAM,CAAA,IAAK,CAAA;AACpC,MAAA,OAAO;AAAA,QACL,QAAA,EAAU,MAAA;AAAA,QACV,UAAA,EAAY,OAAA,CAAQ,KAAA,IAAS,KAAA,GAAQ,IAAA,CAAK,CAAA;AAAA,QAC1C,QAAA,EAAU,CAAC,GAAG,UAAU;AAAA,OAC1B;AAAA,IACF;AACA,IAAA,OAAO,EAAE,QAAA,EAAU,SAAA,EAAW,UAAA,EAAY,OAAA,CAAQ,SAAS,CAAA,EAAG,QAAA,EAAU,CAAC,GAAG,UAAU,CAAA,EAAE;AAAA,EAC1F;AAEA,EAAA,OAAO;AAAA,IACL,QAAA,EAAU,IAAA;AAAA,IACV,UAAA,EAAY,OAAA,CAAQ,SAAA,IAAa,SAAA,GAAY,cAAc,IAAA,CAAK,CAAA;AAAA,IAChE,QAAA,EAAU,CAAC,GAAG,UAAU;AAAA,GAC1B;AACF;AAYA,SAAS,iBAAA,CACP,UACA,WAAA,EACoB;AACpB,EAAA,OAAO,QAAA,CAAS,GAAA,CAAI,CAAC,IAAA,KAAS;AAC5B,IAAA,IAAI,IAAA,CAAK,QAAA,KAAa,SAAA,EAAW,OAAO,IAAA;AACxC,IAAA,MAAM,UAAA,GAAa,cAAA,CAAe,IAAA,CAAK,QAAQ,KAAK,IAAA,CAAK,QAAA;AACzD,IAAA,IAAI,UAAA,KAAe,IAAA,CAAK,QAAA,EAAU,OAAO,IAAA;AACzC,IAAA,OAAO,EAAE,GAAG,IAAA,EAAM,QAAA,EAAU,UAAA,EAAW;AAAA,EACzC,CAAC,CAAA;AACH;AAUA,SAAS,aAAA,CAAc,MAAwB,GAAA,EAA2C;AACxF,EAAA,IAAI,IAAA,CAAK,mBAAmB,KAAA,IAAS,CAAC,aAAa,GAAA,CAAI,IAAA,CAAK,IAAI,CAAA,EAAG,OAAO,KAAA;AAC1E,EAAA,OAAO,CAAC,GAAA,CAAI,IAAA;AAAA,IACV,CAAC,KAAA,KACC,KAAA,CAAM,QAAA,KAAa,IAAA,CAAK,QAAA,IACxB,KAAA,CAAM,QAAA,KAAa,SAAA,IACnB,CAAC,YAAA,CAAa,GAAA,CAAI,MAAM,IAAI;AAAA,GAChC;AACF;AAKA,SAAS,wBAAwB,QAAA,EAAsD;AACrF,EAAA,IAAI,IAAA,GAAsB,IAAA;AAC1B,EAAA,IAAI,cAAA,GAAiB,CAAA;AACrB,EAAA,KAAA,MAAW,QAAQ,QAAA,EAAU;AAC3B,IAAA,IAAI,IAAA,CAAK,aAAa,SAAA,IAAa,CAAC,aAAa,GAAA,CAAI,IAAA,CAAK,IAAI,CAAA,EAAG;AACjE,IAAA,MAAM,CAAA,GAAI,OAAA,CAAQ,IAAA,CAAK,UAAU,CAAA;AACjC,IAAA,IAAI,IAAI,uBAAA,EAAyB;AACjC,IAAA,IAAI,IAAI,cAAA,EAAgB;AACtB,MAAA,cAAA,GAAiB,CAAA;AACjB,MAAA,IAAA,GAAO,IAAA,CAAK,QAAA;AAAA,IACd,CAAA,MAAA,IAAW,CAAA,KAAM,cAAA,IAAkB,IAAA,CAAK,aAAa,IAAA,EAAM;AAEzD,MAAA,IAAA,GAAO,IAAA;AAAA,IACT;AAAA,EACF;AACA,EAAA,OAAO,IAAA;AACT;AAQA,SAAS,MAAA,CACP,QACA,MAAA,EACiE;AACjE,EAAA,IAAI,IAAA,GAAsB,IAAA;AAC1B,EAAA,IAAI,SAAA,GAAY,CAAA;AAChB,EAAA,IAAI,WAAA,GAAc,CAAA;AAClB,EAAA,MAAM,cAAc,MAAA,KAAW,IAAA,GAAQ,OAAO,GAAA,CAAI,MAAM,KAAK,CAAA,GAAK,CAAA;AAElE,EAAA,KAAA,MAAW,CAAC,QAAA,EAAU,GAAG,CAAA,IAAK,MAAA,EAAQ;AAEpC,IAAA,MAAM,KAAA,GAAQ,WAAW,IAAA,IAAQ,QAAA,KAAa,SAAS,IAAA,CAAK,GAAA,CAAI,GAAA,EAAK,WAAW,CAAA,GAAI,GAAA;AACpF,IAAA,IAAI,QAAQ,SAAA,EAAW;AACrB,MAAA,WAAA,GAAc,SAAA;AACd,MAAA,SAAA,GAAY,KAAA;AACZ,MAAA,IAAA,GAAO,QAAA;AAAA,IACT,CAAA,MAAA,IAAW,QAAQ,WAAA,EAAa;AAC9B,MAAA,WAAA,GAAc,KAAA;AAAA,IAChB;AAAA,EACF;AAEA,EAAA,IAAI,WAAW,IAAA,IAAQ,IAAA,KAAS,UAAU,SAAA,KAAc,WAAA,IAAe,cAAc,CAAA,EAAG;AACtF,IAAA,WAAA,GAAc,SAAA;AACd,IAAA,IAAA,GAAO,MAAA;AACP,IAAA,SAAA,GAAY,WAAA;AAAA,EACd;AACA,EAAA,OAAO,EAAE,IAAA,EAAM,SAAA,EAAW,WAAA,EAAY;AACxC;AAEA,SAAS,QAAQ,KAAA,EAAuB;AACtC,EAAA,IAAI,CAAC,MAAA,CAAO,QAAA,CAAS,KAAK,GAAG,OAAO,CAAA;AACpC,EAAA,IAAI,KAAA,GAAQ,GAAG,OAAO,CAAA;AACtB,EAAA,IAAI,KAAA,GAAQ,GAAG,OAAO,CAAA;AACtB,EAAA,OAAO,KAAA;AACT","file":"chunk-7G3MEXWK.js","sourcesContent":["import type {\n Classification,\n LanguageEvidence,\n LanguageProfile,\n NonDiscriminatingScript,\n Weights,\n} from \"./types.js\";\nimport { normalizeBCP47 } from \"./internal/bcp47.js\";\n\nexport interface FuseOptions {\n weights?: Weights;\n /** The candidate roster. When present, incoming evidence tags are normalized\n * into it (`uk-UA` → `uk`, `ua` → `uk`) so context signals (page/header\n * locale) land on the same code the text rungs use. */\n candidates?: readonly LanguageProfile[];\n /** How to resolve a *non-discriminating* script read (one flagged\n * `discriminating: false` — its winning script owned by ≤1 roster candidate).\n * Default `\"candidate\"` keeps current behavior; `\"unknown\"` drops such a read\n * unless non-script evidence corroborates the same language. See\n * {@link NonDiscriminatingScript}. */\n nonDiscriminatingScript?: NonDiscriminatingScript;\n}\n\n/** Default per-kind weights. Clear lexical signal (script, explicit locale)\n * outweighs contextual signal (page tags, headers). Callers override per\n * `source` id or `kind` via {@link FuseOptions.weights}. */\nconst DEFAULT_KIND_WEIGHT: Record<string, number> = {\n \"title-script\": 1,\n \"explicit-locale\": 1,\n \"chrome-ai\": 1,\n \"source-prior\": 0.7,\n franc: 0.7,\n \"http-content-language\": 0.6,\n \"meta-content-language\": 0.55,\n \"meta-og-locale\": 0.55,\n \"html-lang\": 0.5,\n};\n\n/** Evidence kinds that constitute *clear script evidence* — a verdict the text\n * classifier or an on-device model reached by actually reading the string. The\n * guard below forbids weaker page/header *context* from flipping these. */\nconst SCRIPT_KINDS = new Set<string>([\"title-script\", \"franc\", \"chrome-ai\"]);\n\n/** A script verdict this confident is treated as settled — context may add to it\n * but must not flip the winner to a different language. */\nconst SCRIPT_CONFIDENCE_FLOOR = 0.6;\n\nconst MIN_WINNING_SCORE = 0.35;\nconst MIN_MARGIN = 0.12;\n\n/**\n * Combine evidence into a single weighted verdict with an audit trail.\n *\n * Three steps:\n * 1. Normalize each item's language tag into the candidate roster (BCP-47:\n * `uk-UA`/`ua` → `uk`) so text, page, and header signals agree on a code.\n * 2. Weighted argmax over languages (caller weights override per `source`/`kind`).\n * 3. Apply the guard **context must never override clear script evidence**: when\n * the text classifier (or an on-device model) confidently read one language,\n * weaker page/header context for a *different* language cannot win — a\n * Ukrainian page chrome does not make a Latin/English title Ukrainian.\n */\nexport function fuse(\n evidence: readonly LanguageEvidence[],\n options: FuseOptions = {},\n): Classification {\n const weights = options.weights ?? {};\n const normalized = normalizeEvidence(evidence, options.candidates);\n\n // Under `\"unknown\"`, a non-discriminating script read scores nothing on its own\n // — it's dropped from the tally and the pin below — but stays in the trail. The\n // full `normalized` set is still returned as evidence.\n const scoring =\n options.nonDiscriminatingScript === \"unknown\"\n ? normalized.filter((item) => !isNeutralized(item, normalized))\n : normalized;\n\n const scores = new Map<string, number>();\n for (const item of scoring) {\n if (item.language === \"unknown\") continue;\n const weight =\n weights[item.source] ?? weights[item.kind] ?? DEFAULT_KIND_WEIGHT[item.kind] ?? 0.5;\n scores.set(item.language, (scores.get(item.language) ?? 0) + clamp01(item.confidence) * weight);\n }\n\n // The context-vs-script guard: a confident script read pins the winner.\n const pinned = confidentScriptLanguage(scoring);\n\n const { best, bestScore, secondScore } = argmax(scores, pinned);\n\n if (best === null || bestScore < MIN_WINNING_SCORE || bestScore - secondScore < MIN_MARGIN) {\n // A pinned script language still wins even on a thin margin — clear script\n // evidence is never demoted to \"unknown\" by competing context.\n if (pinned !== null && scores.has(pinned)) {\n const score = scores.get(pinned) ?? 0;\n return {\n language: pinned,\n confidence: clamp01(score / (score + 0.15)),\n evidence: [...normalized],\n };\n }\n return { language: \"unknown\", confidence: clamp01(bestScore), evidence: [...normalized] };\n }\n\n return {\n language: best,\n confidence: clamp01(bestScore / (bestScore + secondScore + 0.15)),\n evidence: [...normalized],\n };\n}\n\n/** Normalize each item's tag into the roster's code space (BCP-47-aware). Items\n * already `\"unknown\"` pass through untouched. Tags are BCP-47-normalized\n * (`en-US` → `en`, `ua` → `uk`) so text, page, and header signals land on the\n * same code. The normalized code is kept even when it falls outside the roster —\n * argmax simply won't favor an out-of-roster context tag, but it stays in the\n * audit trail.\n *\n * The roster is accepted (and reserved) so a future revision can fold roster\n * aliasing in without a signature change; today BCP-47 normalization alone\n * reconciles the codes the producers emit. */\nfunction normalizeEvidence(\n evidence: readonly LanguageEvidence[],\n _candidates: readonly LanguageProfile[] | undefined,\n): LanguageEvidence[] {\n return evidence.map((item) => {\n if (item.language === \"unknown\") return item;\n const normalized = normalizeBCP47(item.language) ?? item.language;\n if (normalized === item.language) return item;\n return { ...item, language: normalized };\n });\n}\n\n/**\n * Whether a non-discriminating script read should score nothing (mode\n * `\"unknown\"`). True when `item` is a script kind flagged `discriminating:\n * false` (its winning script is owned by ≤1 roster candidate) AND no *non-script*\n * evidence corroborates its language. Corroboration must come from context kinds\n * (page tags, headers): two lone-candidate script reads agreeing is still two\n * defaults, not real evidence — so script kinds never corroborate one another.\n */\nfunction isNeutralized(item: LanguageEvidence, all: readonly LanguageEvidence[]): boolean {\n if (item.discriminating !== false || !SCRIPT_KINDS.has(item.kind)) return false;\n return !all.some(\n (other) =>\n other.language === item.language &&\n other.language !== \"unknown\" &&\n !SCRIPT_KINDS.has(other.kind),\n );\n}\n\n/** The language of a *clear script* read confident enough to pin the verdict, or\n * `null` when none qualifies. When two script reads disagree, the higher-\n * confidence one pins (a tie leaves nothing pinned — argmax decides normally). */\nfunction confidentScriptLanguage(evidence: readonly LanguageEvidence[]): string | null {\n let best: string | null = null;\n let bestConfidence = 0;\n for (const item of evidence) {\n if (item.language === \"unknown\" || !SCRIPT_KINDS.has(item.kind)) continue;\n const c = clamp01(item.confidence);\n if (c < SCRIPT_CONFIDENCE_FLOOR) continue;\n if (c > bestConfidence) {\n bestConfidence = c;\n best = item.language;\n } else if (c === bestConfidence && item.language !== best) {\n // Two equally-confident script reads for different languages — ambiguous.\n best = null;\n }\n }\n return best;\n}\n\n/**\n * Weighted argmax. When `pinned` is set (a confident script language), any\n * *other* language's score may only come from context kinds; that score is\n * capped so it can never exceed the pinned language. This enforces the guard\n * without discarding the context from the audit trail.\n */\nfunction argmax(\n scores: Map<string, number>,\n pinned: string | null,\n): { best: string | null; bestScore: number; secondScore: number } {\n let best: string | null = null;\n let bestScore = 0;\n let secondScore = 0;\n const pinnedScore = pinned !== null ? (scores.get(pinned) ?? 0) : 0;\n\n for (const [language, raw] of scores) {\n // Guard: a non-pinned language cannot out-score the pinned one.\n const score = pinned !== null && language !== pinned ? Math.min(raw, pinnedScore) : raw;\n if (score > bestScore) {\n secondScore = bestScore;\n bestScore = score;\n best = language;\n } else if (score > secondScore) {\n secondScore = score;\n }\n }\n // On a pinned tie (pinned capped equal to a context language), prefer pinned.\n if (pinned !== null && best !== pinned && bestScore === pinnedScore && pinnedScore > 0) {\n secondScore = bestScore;\n best = pinned;\n bestScore = pinnedScore;\n }\n return { best, bestScore, secondScore };\n}\n\nfunction clamp01(value: number): number {\n if (!Number.isFinite(value)) return 0;\n if (value < 0) return 0;\n if (value > 1) return 1;\n return value;\n}\n"]}
@@ -1 +0,0 @@
1
- {"version":3,"sources":["../src/internal/classify.ts"],"names":[],"mappings":";AA0BO,IAAM,UAAA,GAAa;AA0B1B,IAAM,OAAA,GAA0B;AAAA,EAC9B,QAAA,EAAU,SAAA;AAAA,EACV,MAAA,EAAQ,CAAA;AAAA,EACR,IAAA,EAAM,IAAA;AAAA,EACN,cAAA,EAAgB;AAClB,CAAA;AAYA,IAAM,WAAA,GAAc,sBAAA;AACpB,IAAM,QAAA,GAAW,mBAAA;AAkBjB,IAAM,cAAA,GAAoC;AAAA,EACxC,oBAAA;AAAA;AAAA,EACA,cAAA;AAAA;AAAA,EACA,2CAAA;AAAA;AAAA,EACA;AAAA;AACF,CAAA;AAIO,SAAS,WAAW,IAAA,EAAsB;AAC/C,EAAA,IAAI,GAAA,GAAM,IAAA;AACV,EAAA,KAAA,MAAW,MAAM,cAAA,EAAgB,GAAA,GAAM,GAAA,CAAI,OAAA,CAAQ,IAAI,GAAG,CAAA;AAC1D,EAAA,OAAO,GAAA;AACT;AAKA,SAAS,eAAe,IAAA,EAA2C;AACjE,EAAA,IAAI,GAAA,GAAM,CAAA;AACV,EAAA,IAAI,GAAA,GAAM,CAAA;AACV,EAAA,KAAA,MAAW,EAAA,IAAM,UAAA,CAAW,IAAI,CAAA,EAAG;AACjC,IAAA,IAAI,WAAA,CAAY,IAAA,CAAK,EAAE,CAAA,EAAG,GAAA,IAAO,CAAA;AAAA,SAAA,IACxB,QAAA,CAAS,IAAA,CAAK,EAAE,CAAA,EAAG,GAAA,IAAO,CAAA;AAAA,EACrC;AACA,EAAA,IAAI,GAAA,KAAQ,CAAA,IAAK,GAAA,KAAQ,CAAA,EAAG,OAAO,IAAA;AACnC,EAAA,OAAO,GAAA,IAAO,MAAM,UAAA,GAAa,OAAA;AACnC;AAGA,SAAS,cAAc,OAAA,EAAuD;AAC5E,EAAA,KAAA,MAAW,EAAA,IAAM,QAAQ,QAAA,EAAU;AACjC,IAAA,IAAI,WAAA,CAAY,IAAA,CAAK,EAAE,CAAA,EAAG,OAAO,UAAA;AACjC,IAAA,IAAI,QAAA,CAAS,IAAA,CAAK,EAAE,CAAA,EAAG,OAAO,OAAA;AAAA,EAChC;AACA,EAAA,OAAO,IAAA;AACT;AAIO,SAAS,eAAA,CACd,MACA,UAAA,EACmB;AACnB,EAAA,MAAM,MAAA,GAAS,eAAe,IAAI,CAAA;AAClC,EAAA,IAAI,MAAA,KAAW,IAAA,EAAM,OAAO,EAAC;AAI7B,EAAA,MAAM,IAAA,uBAAW,GAAA,EAAY;AAC7B,EAAA,MAAM,SAA4B,EAAC;AACnC,EAAA,KAAA,MAAW,KAAK,UAAA,EAAY;AAC1B,IAAA,IAAI,aAAA,CAAc,CAAC,CAAA,KAAM,MAAA,IAAU,KAAK,GAAA,CAAI,CAAA,CAAE,IAAI,CAAA,EAAG;AACrD,IAAA,IAAA,CAAK,GAAA,CAAI,EAAE,IAAI,CAAA;AACf,IAAA,MAAA,CAAO,KAAK,CAAC,CAAA;AAAA,EACf;AACA,EAAA,OAAO,MAAA;AACT;AA8BA,SAAS,SAAS,IAAA,EAAwB;AACxC,EAAA,OAAO,KAAK,WAAA,EAAY,CAAE,KAAA,CAAM,UAAU,KAAK,EAAC;AAClD;AAOA,SAAS,KAAA,CAAM,OAAyB,UAAA,EAAwD;AAC9F,EAAA,MAAM,MAAA,GAAS,IAAI,GAAA,CAAoB,UAAA,CAAW,GAAA,CAAI,CAAC,CAAA,KAAM,CAAC,CAAA,CAAE,IAAA,EAAM,CAAC,CAAC,CAAC,CAAA;AACzE,EAAA,KAAA,MAAW,QAAQ,KAAA,EAAO;AACxB,IAAA,IAAI,KAAA,GAAuB,IAAA;AAC3B,IAAA,IAAI,MAAA,GAAS,CAAA;AACb,IAAA,KAAA,MAAW,KAAK,UAAA,EAAY;AAC1B,MAAA,IAAI,CAAA,CAAE,GAAA,CAAI,GAAA,CAAI,IAAI,CAAA,EAAG;AACnB,QAAA,MAAA,IAAU,CAAA;AACV,QAAA,IAAI,SAAS,CAAA,EAAG;AACd,UAAA,KAAA,GAAQ,IAAA;AACR,UAAA;AAAA,QACF;AACA,QAAA,KAAA,GAAQ,CAAA,CAAE,IAAA;AAAA,MACZ;AAAA,IACF;AACA,IAAA,IAAI,KAAA,KAAU,IAAA,EAAM,MAAA,CAAO,GAAA,CAAI,KAAA,EAAA,CAAQ,OAAO,GAAA,CAAI,KAAK,CAAA,IAAK,CAAA,IAAK,CAAC,CAAA;AAAA,EACpE;AACA,EAAA,OAAO,MAAA;AACT;AAGA,SAAS,OAAO,MAAA,EAAsE;AACpF,EAAA,IAAI,GAAA,GAAM,EAAA;AACV,EAAA,IAAI,MAAA,GAAS,EAAA;AACb,EAAA,IAAI,IAAA,GAAsB,IAAA;AAC1B,EAAA,KAAA,MAAW,CAAC,CAAA,EAAG,KAAK,CAAA,IAAK,MAAA,EAAQ;AAC/B,IAAA,IAAI,QAAQ,GAAA,EAAK;AACf,MAAA,MAAA,GAAS,GAAA;AACT,MAAA,GAAA,GAAM,KAAA;AACN,MAAA,IAAA,GAAO,CAAA;AAAA,IACT,CAAA,MAAA,IAAW,QAAQ,MAAA,EAAQ;AACzB,MAAA,MAAA,GAAS,KAAA;AAAA,IACX;AAAA,EACF;AACA,EAAA,IAAI,IAAA,KAAS,IAAA,IAAQ,GAAA,GAAM,CAAA,EAAG,OAAO,IAAA;AACrC,EAAA,MAAM,MAAA,GAAS,GAAA,GAAM,IAAA,CAAK,GAAA,CAAI,QAAQ,CAAC,CAAA;AACvC,EAAA,OAAO,MAAA,IAAU,CAAA,GAAI,EAAE,IAAA,EAAM,QAAO,GAAI,IAAA;AAC1C;AAEA,SAAS,aAAA,CACP,YACA,IAAA,EACc;AACd,EAAA,OAAO,UAAA,CAAW,GAAA,CAAI,CAAC,CAAA,MAAO,EAAE,IAAA,EAAM,CAAA,CAAE,IAAA,EAAM,GAAA,EAAK,IAAI,GAAA,CAAI,IAAA,CAAK,CAAC,CAAC,GAAE,CAAE,CAAA;AACxE;AAIA,SAAS,UAAA,CAAW,MAAc,MAAA,EAAwD;AACxF,EAAA,MAAM,CAAA,GAAI,MAAA;AAAA,IACR,KAAA;AAAA,MACE,KAAK,WAAA,EAAY;AAAA,MACjB,aAAA,CAAc,QAAQ,CAAC,CAAA,KAAM,EAAE,QAAA,IAAY,CAAA,CAAE,SAAS,EAAA,CAAG;AAAA;AAC3D,GACF;AACA,EAAA,OAAO,CAAA,GAAI,EAAE,QAAA,EAAU,CAAA,CAAE,IAAA,EAAM,QAAQ,CAAA,CAAE,MAAA,EAAQ,IAAA,EAAM,CAAA,EAAE,GAAI,IAAA;AAC/D;AAGA,SAAS,QAAA,CACP,MAAA,EACA,MAAA,EACA,IAAA,EACA,IAAA,EACoB;AACpB,EAAA,MAAM,CAAA,GAAI,MAAA;AAAA,IACR,KAAA;AAAA,MACE,MAAA;AAAA,MACA,aAAA,CAAc,QAAQ,CAAC,CAAA,KAAM,EAAE,KAAA,GAAQ,IAAI,CAAA,IAAK,EAAE;AAAA;AACpD,GACF;AACA,EAAA,OAAO,CAAA,GAAI,EAAE,QAAA,EAAU,CAAA,CAAE,MAAM,MAAA,EAAQ,CAAA,CAAE,MAAA,EAAQ,IAAA,EAAK,GAAI,IAAA;AAC5D;AAOO,SAAS,iBAAA,CACd,IAAA,EACA,UAAA,EACA,KAAA,EACgB;AAChB,EAAA,IAAI,CAAC,IAAA,IAAQ,UAAA,CAAW,MAAA,KAAW,GAAG,OAAO,OAAA;AAI7C,EAAA,MAAM,OAAA,GAAU,WAAW,IAAI,CAAA;AAG/B,EAAA,MAAM,MAAA,GAAS,eAAA,CAAgB,OAAA,EAAS,UAAU,CAAA;AAClD,EAAA,IAAI,MAAA,CAAO,MAAA,KAAW,CAAA,EAAG,OAAO,OAAA;AAKhC,EAAA,MAAM,cAAA,GAAiB,OAAO,MAAA,IAAU,CAAA;AAExC,EAAA,MAAM,QAAA,GAAW,UAAA,CAAW,OAAA,EAAS,MAAM,CAAA;AAC3C,EAAA,IAAI,QAAA,EAAU,OAAO,EAAE,GAAG,UAAU,cAAA,EAAe;AAEnD,EAAA,MAAM,MAAA,GAAS,SAAS,OAAO,CAAA;AAC/B,EAAA,IAAI,MAAA,CAAO,MAAA,KAAW,CAAA,EAAG,OAAO,OAAA;AAEhC,EAAA,MAAM,MAAA,GACJ,QAAA,CAAS,MAAA,EAAQ,MAAA,EAAQ,YAAY,IAAI,CAAA,IACzC,QAAA,CAAS,MAAA,EAAQ,QAAQ,UAAA,EAAY,IAAI,CAAA,IACzC,KAAA,GAAQ,SAAS,MAAM,CAAA;AACzB,EAAA,OAAO,MAAA,GAAS,EAAE,GAAG,MAAA,EAAQ,gBAAe,GAAI,OAAA;AAClD","file":"chunk-NCGZPEDA.js","sourcesContent":["/**\n * Per-snippet language classification by candidate-set-relative set-difference.\n *\n * A ladder of rungs; the first rung whose leader clears a lead (margin) of ≥1\n * wins; otherwise `\"unknown\"`:\n *\n * 1 alphabet — characters distinctive within the candidate set\n * 2a function words — curated grammatical markers (highest precision)\n * 2b frequent words — corpus content words\n * 3 franc — optional trigram backstop for the distinctive-free\n * residual, injected as a resolver (this module stays\n * franc-free and importable without franc's tables)\n *\n * \"Distinctive\" is ALWAYS relative to the candidate set: a signal counts for a\n * candidate iff it appears in that candidate's profile and in NO other\n * candidate's. So `і` decides {uk, ru} (only uk has it) but is inert in\n * {uk, be} (both have it), and the word `и` decides {uk, ru} even though the\n * *letter* `и` is shared. Nothing is precomputed — uniqueness is the runtime\n * output, never stored.\n *\n * Adapted to langtell's {@link LanguageProfile} shape: the `words` and `iso6393`\n * fields are optional here, so a bare `{ code, alphabet }` profile still\n * classifies on rung 1.\n */\nimport type { LanguageProfile } from \"../types.js\";\n\nexport const FRANC_RUNG = 3;\n\n/** Which rung decided a verdict; `null` when unknown. */\nexport type Rung = 1 | \"2a\" | \"2b\" | typeof FRANC_RUNG | null;\n\nexport interface SnippetVerdict {\n /** Winning language code, or the sentinel `\"unknown\"`. */\n language: string;\n /** Lead of the winner over the runner-up, in the rung's own unit (distinctive\n * char/word count for rungs 1–2; franc score-gap for rung 3). 0 when unknown. */\n margin: number;\n /** Which rung decided; `null` when unknown. */\n rung: Rung;\n /** Whether ≥2 same-script candidates were in scope when the verdict was\n * reached. `true` ⇒ the distinctive-letter/word machinery actually chose\n * between candidates; `false` ⇒ the winner was the lone candidate in its\n * script, selected by script alone (no evidence it is *distinctively* that\n * language). `false` for `\"unknown\"`. */\n discriminating: boolean;\n}\n\n/** A rung's verdict before {@link classifyBySnippet} stamps on the scope-derived\n * `discriminating` flag (which a single rung can't know — it depends on how many\n * same-script candidates were scoped). */\nexport type RungVerdict = Pick<SnippetVerdict, \"language\" | \"margin\" | \"rung\">;\n\nconst UNKNOWN: SnippetVerdict = {\n language: \"unknown\",\n margin: 0,\n rung: null,\n discriminating: false,\n};\n\n/** Resolver for rung 3 (the optional trigram backstop), injected into\n * {@link classifyBySnippet} by callers that have franc available. Kept as an\n * injected seam — not a direct import — so this module stays franc-free and\n * importable without pulling franc's tables. Returns a rung-3 verdict or\n * `null` (abstain). */\nexport type Rung3Resolver = (\n text: string,\n scoped: readonly LanguageProfile[],\n) => RungVerdict | null;\n\nconst CYRILLIC_RE = /\\p{Script=Cyrillic}/u;\nconst LATIN_RE = /\\p{Script=Latin}/u;\n\n/** Below this length, trigrams are too noisy to justify a rung-3 verdict. */\nexport const RUNG3_MIN_LENGTH = 24;\n\n/**\n * Trailing/inline Latin \"noise\" tokens — URLs, @handles, #hashtags — that a\n * Cyrillic title commonly carries (a headline followed by a link or a social\n * handle). These are almost always Latin even on Cyrillic-language content, so\n * left in they can flip {@link dominantScript} to Latin and let genuinely\n * Cyrillic content scope to the wrong roster. Stripped before the script vote\n * AND before the rung tallies so the URL's letters never contribute either.\n *\n * Kept as separate simple patterns (applied in order — schemes/www before bare\n * domains) rather than one big alternation, so each stays readable. ASCII-only\n * `[a-z0-9-]` in the domain pattern means a Cyrillic word is never mistaken for\n * a domain.\n */\nconst NOISE_PATTERNS: readonly RegExp[] = [\n /\\bhttps?:\\/\\/\\S+/gi, // full URLs\n /\\bwww\\.\\S+/gi, // www.… without a scheme\n /\\b[a-z0-9-]+(?:\\.[a-z0-9-]+)+(?:\\/\\S*)?/gi, // bare domains (example.com/path)\n /[@#][\\p{L}\\p{N}_]+/gu, // @handles and #hashtags\n];\n\n/** Drop URLs / @handles / #hashtags so trailing Latin noise can't outvote the\n * prose's script or pollute the per-rung tallies. */\nexport function stripNoise(text: string): string {\n let out = text;\n for (const re of NOISE_PATTERNS) out = out.replace(re, \" \");\n return out;\n}\n\n/** The script most of `text` is written in, or `null` if it carries no letters.\n * Noise (URLs/handles/hashtags) is stripped first so a single trailing link\n * can't flip a multi-word Cyrillic title's vote to Latin. */\nfunction dominantScript(text: string): \"cyrillic\" | \"latin\" | null {\n let cyr = 0;\n let lat = 0;\n for (const ch of stripNoise(text)) {\n if (CYRILLIC_RE.test(ch)) cyr += 1;\n else if (LATIN_RE.test(ch)) lat += 1;\n }\n if (cyr === 0 && lat === 0) return null;\n return cyr >= lat ? \"cyrillic\" : \"latin\";\n}\n\n/** The script of a profile's alphabet. */\nfunction profileScript(profile: LanguageProfile): \"cyrillic\" | \"latin\" | null {\n for (const ch of profile.alphabet) {\n if (CYRILLIC_RE.test(ch)) return \"cyrillic\";\n if (LATIN_RE.test(ch)) return \"latin\";\n }\n return null;\n}\n\n/** Candidates whose script matches the text's dominant script (others can't tip\n * the verdict). Empty when the text carries no letters. */\nexport function scopeCandidates(\n text: string,\n candidates: readonly LanguageProfile[],\n): LanguageProfile[] {\n const script = dominantScript(text);\n if (script === null) return [];\n // Keep one profile per code. A language listed twice would otherwise make its\n // own distinctive chars/words read as \"owned by ≥2 candidates\" in `tally`,\n // cancelling them out and collapsing the verdict to \"unknown\".\n const seen = new Set<string>();\n const scoped: LanguageProfile[] = [];\n for (const c of candidates) {\n if (profileScript(c) !== script || seen.has(c.code)) continue;\n seen.add(c.code);\n scoped.push(c);\n }\n return scoped;\n}\n\n/**\n * Per-language set of characters globally unique within `profiles` — present in\n * exactly one profile's alphabet. Relative to the given profile set: the unique\n * set shrinks as languages are added (a second Latin language un-uniques a–z).\n */\nexport function distinctiveChars(profiles: readonly LanguageProfile[]): Map<string, Set<string>> {\n const owners = new Map<string, string[]>();\n for (const p of profiles) {\n for (const ch of new Set(p.alphabet)) {\n const list = owners.get(ch);\n if (list) list.push(p.code);\n else owners.set(ch, [p.code]);\n }\n }\n const result = new Map<string, Set<string>>(profiles.map((p) => [p.code, new Set()]));\n for (const [ch, codes] of owners) {\n const [only] = codes;\n if (codes.length === 1 && only !== undefined) result.get(only)?.add(ch);\n }\n return result;\n}\n\ninterface Membership {\n code: string;\n set: ReadonlySet<string>;\n}\n\n/** Lowercased Unicode letter-run tokens. Keeps single-char tokens (`і`, `и`). */\nfunction tokenize(text: string): string[] {\n return text.toLowerCase().match(/\\p{L}+/gu) ?? [];\n}\n\n/**\n * Tally how many items (characters or word tokens) are distinctive to each\n * candidate — present in exactly one candidate's set. Items owned by zero or by\n * ≥2 candidates contribute nothing.\n */\nfunction tally(items: Iterable<string>, membership: readonly Membership[]): Map<string, number> {\n const scores = new Map<string, number>(membership.map((m) => [m.code, 0]));\n for (const item of items) {\n let owner: string | null = null;\n let owners = 0;\n for (const m of membership) {\n if (m.set.has(item)) {\n owners += 1;\n if (owners > 1) {\n owner = null;\n break;\n }\n owner = m.code;\n }\n }\n if (owner !== null) scores.set(owner, (scores.get(owner) ?? 0) + 1);\n }\n return scores;\n}\n\n/** The leading candidate and its lead over the runner-up, or `null` if <1. */\nfunction leader(scores: Map<string, number>): { code: string; margin: number } | null {\n let max = -1;\n let second = -1;\n let code: string | null = null;\n for (const [c, score] of scores) {\n if (score > max) {\n second = max;\n max = score;\n code = c;\n } else if (score > second) {\n second = score;\n }\n }\n if (code === null || max < 1) return null;\n const margin = max - Math.max(second, 0);\n return margin >= 1 ? { code, margin } : null;\n}\n\nfunction membershipFor(\n candidates: readonly LanguageProfile[],\n pick: (p: LanguageProfile) => Iterable<string>,\n): Membership[] {\n return candidates.map((c) => ({ code: c.code, set: new Set(pick(c)) }));\n}\n\n/** Rung 1 — characters (alphabet + orthographic {@link LanguageProfile.marks})\n * distinctive within the scoped candidate set. */\nfunction letterRung(text: string, scoped: readonly LanguageProfile[]): RungVerdict | null {\n const r = leader(\n tally(\n text.toLowerCase(),\n membershipFor(scoped, (p) => p.alphabet + (p.marks ?? \"\")),\n ),\n );\n return r ? { language: r.code, margin: r.margin, rung: 1 } : null;\n}\n\n/** Rung 2 — distinctive words from the given tier (2a function, 2b frequent). */\nfunction wordRung(\n tokens: readonly string[],\n scoped: readonly LanguageProfile[],\n tier: \"function\" | \"frequent\",\n rung: \"2a\" | \"2b\",\n): RungVerdict | null {\n const r = leader(\n tally(\n tokens,\n membershipFor(scoped, (p) => p.words?.[tier] ?? []),\n ),\n );\n return r ? { language: r.code, margin: r.margin, rung } : null;\n}\n\n/**\n * Classify `text` among `candidates`. Synchronous and allocation-light. Returns\n * `\"unknown\"` on empty evidence, on a tie inside the candidate set, or when\n * nothing is distinctive.\n */\nexport function classifyBySnippet(\n text: string,\n candidates: readonly LanguageProfile[],\n rung3?: Rung3Resolver,\n): SnippetVerdict {\n if (!text || candidates.length === 0) return UNKNOWN;\n\n // Drop URLs / @handles / #hashtags once, up front: trailing Latin noise must\n // not flip the dominant-script vote nor pollute the per-rung tallies.\n const cleaned = stripNoise(text);\n\n // Restrict to candidates in the text's dominant script.\n const scoped = scopeCandidates(cleaned, candidates);\n if (scoped.length === 0) return UNKNOWN;\n\n // ≥2 same-script candidates means the distinctive machinery actually had a\n // choice to make; a lone scoped candidate wins by script alone. Stamped onto\n // whichever rung decides — a single rung can't see the scope size.\n const discriminating = scoped.length >= 2;\n\n const byLetter = letterRung(cleaned, scoped);\n if (byLetter) return { ...byLetter, discriminating };\n\n const tokens = tokenize(cleaned);\n if (tokens.length === 0) return UNKNOWN;\n\n const byWord =\n wordRung(tokens, scoped, \"function\", \"2a\") ??\n wordRung(tokens, scoped, \"frequent\", \"2b\") ??\n rung3?.(cleaned, scoped);\n return byWord ? { ...byWord, discriminating } : UNKNOWN;\n}\n"]}