langtell 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
- import { normalizeBCP47 } from './chunk-OVSPOZ5J.js';
2
- import { scriptOfProfile } from './chunk-U34Z3ZSV.js';
1
+ import { normalizeBCP47 } from './chunk-YCUSX3GG.js';
2
+ import { scriptOfProfile } from './chunk-BL627TWI.js';
3
3
 
4
4
  // src/fuse.ts
5
5
  var DEFAULT_KIND_WEIGHT = {
@@ -136,5 +136,5 @@ function clamp01(value) {
136
136
  }
137
137
 
138
138
  export { fuse };
139
- //# sourceMappingURL=chunk-G44HHVK5.js.map
140
- //# sourceMappingURL=chunk-G44HHVK5.js.map
139
+ //# sourceMappingURL=chunk-6PWEE3SR.js.map
140
+ //# sourceMappingURL=chunk-6PWEE3SR.js.map
@@ -1 +1 @@
1
- {"version":3,"sources":["../src/fuse.ts"],"names":[],"mappings":";;;;AA2BA,IAAM,mBAAA,GAA8C;AAAA,EAClD,cAAA,EAAgB,CAAA;AAAA,EAChB,iBAAA,EAAmB,CAAA;AAAA,EACnB,WAAA,EAAa,CAAA;AAAA,EACb,cAAA,EAAgB,GAAA;AAAA,EAChB,KAAA,EAAO,GAAA;AAAA,EACP,uBAAA,EAAyB,GAAA;AAAA,EACzB,uBAAA,EAAyB,IAAA;AAAA,EACzB,gBAAA,EAAkB,IAAA;AAAA,EAClB,WAAA,EAAa;AACf,CAAA;AAKA,IAAM,+BAAe,IAAI,GAAA,CAAY,CAAC,cAAA,EAAgB,OAAA,EAAS,WAAW,CAAC,CAAA;AAI3E,IAAM,uBAAA,GAA0B,GAAA;AAEhC,IAAM,iBAAA,GAAoB,IAAA;AAC1B,IAAM,UAAA,GAAa,IAAA;AAcZ,SAAS,IAAA,CACd,QAAA,EACA,OAAA,GAAuB,EAAC,EACR;AAChB,EAAA,MAAM,OAAA,GAAU,OAAA,CAAQ,OAAA,IAAW,EAAC;AACpC,EAAA,MAAM,UAAA,GAAa,iBAAA,CAAkB,QAAA,EAAU,OAAA,CAAQ,UAAU,CAAA;AASjE,EAAA,MAAM,OAAA,GACJ,QAAQ,uBAAA,KAA4B,SAAA,GAChC,qBAAqB,UAAA,EAAY,OAAA,CAAQ,UAAU,CAAA,GACnD,UAAA;AAEN,EAAA,MAAM,MAAA,uBAAa,GAAA,EAAoB;AACvC,EAAA,KAAA,MAAW,QAAQ,OAAA,EAAS;AAC1B,IAAA,IAAI,IAAA,CAAK,aAAa,SAAA,EAAW;AACjC,IAAA,MAAM,MAAA,GACJ,OAAA,CAAQ,IAAA,CAAK,MAAM,CAAA,IAAK,OAAA,CAAQ,IAAA,CAAK,IAAI,CAAA,IAAK,mBAAA,CAAoB,IAAA,CAAK,IAAI,CAAA,IAAK,GAAA;AAClF,IAAA,MAAA,CAAO,GAAA,CAAI,IAAA,CAAK,QAAA,EAAA,CAAW,MAAA,CAAO,GAAA,CAAI,IAAA,CAAK,QAAQ,CAAA,IAAK,CAAA,IAAK,OAAA,CAAQ,IAAA,CAAK,UAAU,IAAI,MAAM,CAAA;AAAA,EAChG;AAGA,EAAA,MAAM,MAAA,GAAS,wBAAwB,OAAO,CAAA;AAE9C,EAAA,MAAM,EAAE,IAAA,EAAM,SAAA,EAAW,aAAY,GAAI,MAAA,CAAO,QAAQ,MAAM,CAAA;AAE9D,EAAA,IAAI,SAAS,IAAA,IAAQ,SAAA,GAAY,iBAAA,IAAqB,SAAA,GAAY,cAAc,UAAA,EAAY;AAG1F,IAAA,IAAI,MAAA,KAAW,IAAA,IAAQ,MAAA,CAAO,GAAA,CAAI,MAAM,CAAA,EAAG;AACzC,MAAA,MAAM,KAAA,GAAQ,MAAA,CAAO,GAAA,CAAI,MAAM,CAAA,IAAK,CAAA;AACpC,MAAA,OAAO;AAAA,QACL,QAAA,EAAU,MAAA;AAAA,QACV,UAAA,EAAY,OAAA,CAAQ,KAAA,IAAS,KAAA,GAAQ,IAAA,CAAK,CAAA;AAAA,QAC1C,QAAA,EAAU,CAAC,GAAG,UAAU;AAAA,OAC1B;AAAA,IACF;AACA,IAAA,OAAO,EAAE,QAAA,EAAU,SAAA,EAAW,UAAA,EAAY,OAAA,CAAQ,SAAS,CAAA,EAAG,QAAA,EAAU,CAAC,GAAG,UAAU,CAAA,EAAE;AAAA,EAC1F;AAEA,EAAA,OAAO;AAAA,IACL,QAAA,EAAU,IAAA;AAAA,IACV,UAAA,EAAY,OAAA,CAAQ,SAAA,IAAa,SAAA,GAAY,cAAc,IAAA,CAAK,CAAA;AAAA,IAChE,QAAA,EAAU,CAAC,GAAG,UAAU;AAAA,GAC1B;AACF;AAYA,SAAS,iBAAA,CACP,UACA,WAAA,EACoB;AAC
pB,EAAA,OAAO,QAAA,CAAS,GAAA,CAAI,CAAC,IAAA,KAAS;AAC5B,IAAA,IAAI,IAAA,CAAK,QAAA,KAAa,SAAA,EAAW,OAAO,IAAA;AACxC,IAAA,MAAM,UAAA,GAAa,cAAA,CAAe,IAAA,CAAK,QAAQ,KAAK,IAAA,CAAK,QAAA;AACzD,IAAA,IAAI,UAAA,KAAe,IAAA,CAAK,QAAA,EAAU,OAAO,IAAA;AACzC,IAAA,OAAO,EAAE,GAAG,IAAA,EAAM,QAAA,EAAU,UAAA,EAAW;AAAA,EACzC,CAAC,CAAA;AACH;AAoBA,SAAS,oBAAA,CACP,YACA,UAAA,EACoB;AACpB,EAAA,MAAM,SAAA,GAAY,WAAW,MAAA,CAAO,CAAC,SAAS,CAAC,aAAA,CAAc,IAAA,EAAM,UAAU,CAAC,CAAA;AAE9E,EAAA,MAAM,WAAA,GAAc,4BAAA,CAA6B,UAAA,EAAY,UAAU,CAAA;AACvE,EAAA,IAAI,WAAA,KAAgB,MAAM,OAAO,SAAA;AAEjC,EAAA,MAAM,QAAA,GAAW,YAAA,CAAa,UAAA,IAAc,EAAE,CAAA;AAC9C,EAAA,OAAO,SAAA,CAAU,MAAA,CAAO,CAAC,IAAA,KAAS;AAGhC,IAAA,IAAI,YAAA,CAAa,IAAI,IAAA,CAAK,IAAI,KAAK,IAAA,CAAK,QAAA,KAAa,WAAW,OAAO,IAAA;AACvE,IAAA,MAAM,UAAA,GAAa,QAAA,CAAS,GAAA,CAAI,IAAA,CAAK,QAAQ,CAAA;AAC7C,IAAA,OAAO,UAAA,KAAe,UAAa,UAAA,KAAe,WAAA;AAAA,EACpD,CAAC,CAAA;AACH;AAKA,SAAS,4BAAA,CACP,YACA,UAAA,EACmB;AACnB,EAAA,IAAI,UAAA,KAAe,QAAW,OAAO,IAAA;AACrC,EAAA,MAAM,QAAA,GAAW,aAAa,UAAU,CAAA;AACxC,EAAA,KAAA,MAAW,QAAQ,UAAA,EAAY;AAC7B,IAAA,IAAI,aAAA,CAAc,IAAA,EAAM,UAAU,CAAA,EAAG;AACnC,MAAA,MAAM,MAAA,GAAS,QAAA,CAAS,GAAA,CAAI,IAAA,CAAK,QAAQ,CAAA;AACzC,MAAA,IAAI,MAAA,KAAW,QAAW,OAAO,MAAA;AAAA,IACnC;AAAA,EACF;AACA,EAAA,OAAO,IAAA;AACT;AAIA,SAAS,aAAa,UAAA,EAAiE;AACrF,EAAA,MAAM,GAAA,uBAAU,GAAA,EAAwB;AACxC,EAAA,KAAA,MAAW,KAAK,UAAA,EAAY;AAC1B,IAAA,MAAM,MAAA,GAAS,gBAAgB,CAAC,CAAA;AAChC,IAAA,IAAI,WAAW,IAAA,EAAM,GAAA,CAAI,GAAA,CAAI,CAAA,CAAE,MAAM,MAAM,CAAA;AAAA,EAC7C;AACA,EAAA,OAAO,GAAA;AACT;AAUA,SAAS,aAAA,CAAc,MAAwB,GAAA,EAA2C;AACxF,EAAA,IAAI,IAAA,CAAK,mBAAmB,KAAA,IAAS,CAAC,aAAa,GAAA,CAAI,IAAA,CAAK,IAAI,CAAA,EAAG,OAAO,KAAA;AAC1E,EAAA,OAAO,CAAC,GAAA,CAAI,IAAA;AAAA,IACV,CAAC,KAAA,KACC,KAAA,CAAM,QAAA,KAAa,IAAA,CAAK,QAAA,IACxB,KAAA,CAAM,QAAA,KAAa,SAAA,IACnB,CAAC,YAAA,CAAa,GAAA,CAAI,MAAM,IAAI;AAAA,GAChC;AACF;AAKA,SAAS,wBAAwB,QAAA,EAAsD;AACrF,EAAA,IAAI,IAAA,GAAsB,IAAA;AAC1B,EAAA,IAAI,cAAA,GAAiB,CAAA;AACrB,EAAA,KAAA,MAAW,QAAQ,QAAA,EAAU;AAC3B,IAAA,IAAI,IAAA,CAAK,aAAa,SAAA,IAAa,CAAC,aAAa,GAAA,CAAI,IAAA,
CAAK,IAAI,CAAA,EAAG;AACjE,IAAA,MAAM,CAAA,GAAI,OAAA,CAAQ,IAAA,CAAK,UAAU,CAAA;AACjC,IAAA,IAAI,IAAI,uBAAA,EAAyB;AACjC,IAAA,IAAI,IAAI,cAAA,EAAgB;AACtB,MAAA,cAAA,GAAiB,CAAA;AACjB,MAAA,IAAA,GAAO,IAAA,CAAK,QAAA;AAAA,IACd,CAAA,MAAA,IAAW,CAAA,KAAM,cAAA,IAAkB,IAAA,CAAK,aAAa,IAAA,EAAM;AAEzD,MAAA,IAAA,GAAO,IAAA;AAAA,IACT;AAAA,EACF;AACA,EAAA,OAAO,IAAA;AACT;AAQA,SAAS,MAAA,CACP,QACA,MAAA,EACiE;AACjE,EAAA,IAAI,IAAA,GAAsB,IAAA;AAC1B,EAAA,IAAI,SAAA,GAAY,CAAA;AAChB,EAAA,IAAI,WAAA,GAAc,CAAA;AAClB,EAAA,MAAM,cAAc,MAAA,KAAW,IAAA,GAAQ,OAAO,GAAA,CAAI,MAAM,KAAK,CAAA,GAAK,CAAA;AAElE,EAAA,KAAA,MAAW,CAAC,QAAA,EAAU,GAAG,CAAA,IAAK,MAAA,EAAQ;AAEpC,IAAA,MAAM,KAAA,GAAQ,WAAW,IAAA,IAAQ,QAAA,KAAa,SAAS,IAAA,CAAK,GAAA,CAAI,GAAA,EAAK,WAAW,CAAA,GAAI,GAAA;AACpF,IAAA,IAAI,QAAQ,SAAA,EAAW;AACrB,MAAA,WAAA,GAAc,SAAA;AACd,MAAA,SAAA,GAAY,KAAA;AACZ,MAAA,IAAA,GAAO,QAAA;AAAA,IACT,CAAA,MAAA,IAAW,QAAQ,WAAA,EAAa;AAC9B,MAAA,WAAA,GAAc,KAAA;AAAA,IAChB;AAAA,EACF;AAEA,EAAA,IAAI,WAAW,IAAA,IAAQ,IAAA,KAAS,UAAU,SAAA,KAAc,WAAA,IAAe,cAAc,CAAA,EAAG;AACtF,IAAA,WAAA,GAAc,SAAA;AACd,IAAA,IAAA,GAAO,MAAA;AACP,IAAA,SAAA,GAAY,WAAA;AAAA,EACd;AACA,EAAA,OAAO,EAAE,IAAA,EAAM,SAAA,EAAW,WAAA,EAAY;AACxC;AAEA,SAAS,QAAQ,KAAA,EAAuB;AACtC,EAAA,IAAI,CAAC,MAAA,CAAO,QAAA,CAAS,KAAK,GAAG,OAAO,CAAA;AACpC,EAAA,IAAI,KAAA,GAAQ,GAAG,OAAO,CAAA;AACtB,EAAA,IAAI,KAAA,GAAQ,GAAG,OAAO,CAAA;AACtB,EAAA,OAAO,KAAA;AACT","file":"chunk-G44HHVK5.js","sourcesContent":["import type {\n Classification,\n LanguageEvidence,\n LanguageProfile,\n NonDiscriminatingScript,\n Weights,\n} from \"./types.js\";\nimport { normalizeBCP47 } from \"./internal/bcp47.js\";\nimport { scriptOfProfile, type ScriptName } from \"./internal/classify.js\";\n\nexport interface FuseOptions {\n weights?: Weights;\n /** The candidate roster. When present, incoming evidence tags are normalized\n * into it (`uk-UA` → `uk`, `ua` → `uk`) so context signals (page/header\n * locale) land on the same code the text rungs use. 
*/\n candidates?: readonly LanguageProfile[];\n /** How to resolve a *non-discriminating* script read (one flagged\n * `discriminating: false` — its winning script owned by ≤1 roster candidate).\n * Default `\"candidate\"` keeps current behavior; `\"unknown\"` drops such a read\n * unless non-script evidence corroborates the same language. See\n * {@link NonDiscriminatingScript}. */\n nonDiscriminatingScript?: NonDiscriminatingScript;\n}\n\n/** Default per-kind weights. Clear lexical signal (script, explicit locale)\n * outweighs contextual signal (page tags, headers). Callers override per\n * `source` id or `kind` via {@link FuseOptions.weights}. */\nconst DEFAULT_KIND_WEIGHT: Record<string, number> = {\n \"title-script\": 1,\n \"explicit-locale\": 1,\n \"chrome-ai\": 1,\n \"source-prior\": 0.7,\n franc: 0.7,\n \"http-content-language\": 0.6,\n \"meta-content-language\": 0.55,\n \"meta-og-locale\": 0.55,\n \"html-lang\": 0.5,\n};\n\n/** Evidence kinds that constitute *clear script evidence* — a verdict the text\n * classifier or an on-device model reached by actually reading the string. The\n * guard below forbids weaker page/header *context* from flipping these. */\nconst SCRIPT_KINDS = new Set<string>([\"title-script\", \"franc\", \"chrome-ai\"]);\n\n/** A script verdict this confident is treated as settled — context may add to it\n * but must not flip the winner to a different language. */\nconst SCRIPT_CONFIDENCE_FLOOR = 0.6;\n\nconst MIN_WINNING_SCORE = 0.35;\nconst MIN_MARGIN = 0.12;\n\n/**\n * Combine evidence into a single weighted verdict with an audit trail.\n *\n * Three steps:\n * 1. Normalize each item's language tag into the candidate roster (BCP-47:\n * `uk-UA`/`ua` → `uk`) so text, page, and header signals agree on a code.\n * 2. Weighted argmax over languages (caller weights override per `source`/`kind`).\n * 3. 
Apply the guard **context must never override clear script evidence**: when\n * the text classifier (or an on-device model) confidently read one language,\n * weaker page/header context for a *different* language cannot win — a\n * Ukrainian page chrome does not make a Latin/English title Ukrainian.\n */\nexport function fuse(\n evidence: readonly LanguageEvidence[],\n options: FuseOptions = {},\n): Classification {\n const weights = options.weights ?? {};\n const normalized = normalizeEvidence(evidence, options.candidates);\n\n // Under `\"unknown\"`, a non-discriminating script read scores nothing on its own\n // — it's dropped from the tally and the pin below — but stays in the trail. AND\n // context written in a *different script* than that title is dropped too: a\n // foreign-script title's language is never named by page/transport context in\n // another script (a Latin title on a Ukrainian page is a foreign title in a\n // Ukrainian UI, not a Ukrainian title). The full `normalized` set is still\n // returned as evidence.\n const scoring =\n options.nonDiscriminatingScript === \"unknown\"\n ? filterForUnknownMode(normalized, options.candidates)\n : normalized;\n\n const scores = new Map<string, number>();\n for (const item of scoring) {\n if (item.language === \"unknown\") continue;\n const weight =\n weights[item.source] ?? weights[item.kind] ?? DEFAULT_KIND_WEIGHT[item.kind] ?? 0.5;\n scores.set(item.language, (scores.get(item.language) ?? 
0) + clamp01(item.confidence) * weight);\n }\n\n // The context-vs-script guard: a confident script read pins the winner.\n const pinned = confidentScriptLanguage(scoring);\n\n const { best, bestScore, secondScore } = argmax(scores, pinned);\n\n if (best === null || bestScore < MIN_WINNING_SCORE || bestScore - secondScore < MIN_MARGIN) {\n // A pinned script language still wins even on a thin margin — clear script\n // evidence is never demoted to \"unknown\" by competing context.\n if (pinned !== null && scores.has(pinned)) {\n const score = scores.get(pinned) ?? 0;\n return {\n language: pinned,\n confidence: clamp01(score / (score + 0.15)),\n evidence: [...normalized],\n };\n }\n return { language: \"unknown\", confidence: clamp01(bestScore), evidence: [...normalized] };\n }\n\n return {\n language: best,\n confidence: clamp01(bestScore / (bestScore + secondScore + 0.15)),\n evidence: [...normalized],\n };\n}\n\n/** Normalize each item's tag into the roster's code space (BCP-47-aware). Items\n * already `\"unknown\"` pass through untouched. Tags are BCP-47-normalized\n * (`en-US` → `en`, `ua` → `uk`) so text, page, and header signals land on the\n * same code. The normalized code is kept even when it falls outside the roster —\n * argmax simply won't favor an out-of-roster context tag, but it stays in the\n * audit trail.\n *\n * The roster is accepted (and reserved) so a future revision can fold roster\n * aliasing in without a signature change; today BCP-47 normalization alone\n * reconciles the codes the producers emit. */\nfunction normalizeEvidence(\n evidence: readonly LanguageEvidence[],\n _candidates: readonly LanguageProfile[] | undefined,\n): LanguageEvidence[] {\n return evidence.map((item) => {\n if (item.language === \"unknown\") return item;\n const normalized = normalizeBCP47(item.language) ?? 
item.language;\n if (normalized === item.language) return item;\n return { ...item, language: normalized };\n });\n}\n\n/**\n * The scoring set under `nonDiscriminatingScript: \"unknown\"`. Two cuts:\n *\n * 1. Drop every *neutralized* non-discriminating script read (see\n * {@link isNeutralized}) — it names a language only by being the lone\n * candidate in its script, with nothing corroborating it.\n * 2. Drop context (page/transport) evidence whose language is in a **different\n * script** than such a neutralized title. A foreign-script title's language\n * is not the page's language: a Latin title on a `lang=\"uk\"` page must not\n * resolve to `uk`. Same-script context (an explicit `en` `Content-Language`\n * for a Latin title) survives and may still name — or, among same-script\n * candidates, disambiguate — the title.\n *\n * The second cut needs each language's script, which is derived from the\n * candidate roster's alphabets. When `candidates` is absent the scripts can't be\n * derived, so the cut is skipped and behavior falls back to cut 1 alone (the\n * 0.3.0 behavior) — never throwing.\n */\nfunction filterForUnknownMode(\n normalized: readonly LanguageEvidence[],\n candidates: readonly LanguageProfile[] | undefined,\n): LanguageEvidence[] {\n const surviving = normalized.filter((item) => !isNeutralized(item, normalized));\n\n const titleScript = nonDiscriminatingTitleScript(normalized, candidates);\n if (titleScript === null) return surviving;\n\n const scriptOf = scriptByCode(candidates ?? 
[]);\n return surviving.filter((item) => {\n // Keep the script reads themselves and anything whose script we can't place;\n // only cross-script *context* in a known, different script is excluded.\n if (SCRIPT_KINDS.has(item.kind) || item.language === \"unknown\") return true;\n const itemScript = scriptOf.get(item.language);\n return itemScript === undefined || itemScript === titleScript;\n });\n}\n\n/** The script of the title under `\"unknown\"` mode, or `null` when there is no\n * neutralized non-discriminating script read to anchor on (so no cross-script\n * cut applies) or the roster can't place that read's language. */\nfunction nonDiscriminatingTitleScript(\n normalized: readonly LanguageEvidence[],\n candidates: readonly LanguageProfile[] | undefined,\n): ScriptName | null {\n if (candidates === undefined) return null;\n const scriptOf = scriptByCode(candidates);\n for (const item of normalized) {\n if (isNeutralized(item, normalized)) {\n const script = scriptOf.get(item.language);\n if (script !== undefined) return script;\n }\n }\n return null;\n}\n\n/** Map each roster code to the script of its alphabet (Cyrillic/Latin). Codes\n * whose alphabet carries no Cyrillic/Latin letter are omitted. */\nfunction scriptByCode(candidates: readonly LanguageProfile[]): Map<string, ScriptName> {\n const map = new Map<string, ScriptName>();\n for (const c of candidates) {\n const script = scriptOfProfile(c);\n if (script !== null) map.set(c.code, script);\n }\n return map;\n}\n\n/**\n * Whether a non-discriminating script read should score nothing (mode\n * `\"unknown\"`). True when `item` is a script kind flagged `discriminating:\n * false` (its winning script is owned by ≤1 roster candidate) AND no *non-script*\n * evidence corroborates its language. 
Corroboration must come from context kinds\n * (page tags, headers): two lone-candidate script reads agreeing is still two\n * defaults, not real evidence — so script kinds never corroborate one another.\n */\nfunction isNeutralized(item: LanguageEvidence, all: readonly LanguageEvidence[]): boolean {\n if (item.discriminating !== false || !SCRIPT_KINDS.has(item.kind)) return false;\n return !all.some(\n (other) =>\n other.language === item.language &&\n other.language !== \"unknown\" &&\n !SCRIPT_KINDS.has(other.kind),\n );\n}\n\n/** The language of a *clear script* read confident enough to pin the verdict, or\n * `null` when none qualifies. When two script reads disagree, the higher-\n * confidence one pins (a tie leaves nothing pinned — argmax decides normally). */\nfunction confidentScriptLanguage(evidence: readonly LanguageEvidence[]): string | null {\n let best: string | null = null;\n let bestConfidence = 0;\n for (const item of evidence) {\n if (item.language === \"unknown\" || !SCRIPT_KINDS.has(item.kind)) continue;\n const c = clamp01(item.confidence);\n if (c < SCRIPT_CONFIDENCE_FLOOR) continue;\n if (c > bestConfidence) {\n bestConfidence = c;\n best = item.language;\n } else if (c === bestConfidence && item.language !== best) {\n // Two equally-confident script reads for different languages — ambiguous.\n best = null;\n }\n }\n return best;\n}\n\n/**\n * Weighted argmax. When `pinned` is set (a confident script language), any\n * *other* language's score may only come from context kinds; that score is\n * capped so it can never exceed the pinned language. This enforces the guard\n * without discarding the context from the audit trail.\n */\nfunction argmax(\n scores: Map<string, number>,\n pinned: string | null,\n): { best: string | null; bestScore: number; secondScore: number } {\n let best: string | null = null;\n let bestScore = 0;\n let secondScore = 0;\n const pinnedScore = pinned !== null ? (scores.get(pinned) ?? 
0) : 0;\n\n for (const [language, raw] of scores) {\n // Guard: a non-pinned language cannot out-score the pinned one.\n const score = pinned !== null && language !== pinned ? Math.min(raw, pinnedScore) : raw;\n if (score > bestScore) {\n secondScore = bestScore;\n bestScore = score;\n best = language;\n } else if (score > secondScore) {\n secondScore = score;\n }\n }\n // On a pinned tie (pinned capped equal to a context language), prefer pinned.\n if (pinned !== null && best !== pinned && bestScore === pinnedScore && pinnedScore > 0) {\n secondScore = bestScore;\n best = pinned;\n bestScore = pinnedScore;\n }\n return { best, bestScore, secondScore };\n}\n\nfunction clamp01(value: number): number {\n if (!Number.isFinite(value)) return 0;\n if (value < 0) return 0;\n if (value > 1) return 1;\n return value;\n}\n"]}
1
+ {"version":3,"sources":["../src/fuse.ts"],"names":[],"mappings":";;;;AA2BA,IAAM,mBAAA,GAA8C;AAAA,EAClD,cAAA,EAAgB,CAAA;AAAA,EAChB,iBAAA,EAAmB,CAAA;AAAA,EACnB,WAAA,EAAa,CAAA;AAAA,EACb,cAAA,EAAgB,GAAA;AAAA,EAChB,KAAA,EAAO,GAAA;AAAA,EACP,uBAAA,EAAyB,GAAA;AAAA,EACzB,uBAAA,EAAyB,IAAA;AAAA,EACzB,gBAAA,EAAkB,IAAA;AAAA,EAClB,WAAA,EAAa;AACf,CAAA;AAKA,IAAM,+BAAe,IAAI,GAAA,CAAY,CAAC,cAAA,EAAgB,OAAA,EAAS,WAAW,CAAC,CAAA;AAI3E,IAAM,uBAAA,GAA0B,GAAA;AAEhC,IAAM,iBAAA,GAAoB,IAAA;AAC1B,IAAM,UAAA,GAAa,IAAA;AAcZ,SAAS,IAAA,CACd,QAAA,EACA,OAAA,GAAuB,EAAC,EACR;AAChB,EAAA,MAAM,OAAA,GAAU,OAAA,CAAQ,OAAA,IAAW,EAAC;AACpC,EAAA,MAAM,UAAA,GAAa,iBAAA,CAAkB,QAAA,EAAU,OAAA,CAAQ,UAAU,CAAA;AASjE,EAAA,MAAM,OAAA,GACJ,QAAQ,uBAAA,KAA4B,SAAA,GAChC,qBAAqB,UAAA,EAAY,OAAA,CAAQ,UAAU,CAAA,GACnD,UAAA;AAEN,EAAA,MAAM,MAAA,uBAAa,GAAA,EAAoB;AACvC,EAAA,KAAA,MAAW,QAAQ,OAAA,EAAS;AAC1B,IAAA,IAAI,IAAA,CAAK,aAAa,SAAA,EAAW;AACjC,IAAA,MAAM,MAAA,GACJ,OAAA,CAAQ,IAAA,CAAK,MAAM,CAAA,IAAK,OAAA,CAAQ,IAAA,CAAK,IAAI,CAAA,IAAK,mBAAA,CAAoB,IAAA,CAAK,IAAI,CAAA,IAAK,GAAA;AAClF,IAAA,MAAA,CAAO,GAAA,CAAI,IAAA,CAAK,QAAA,EAAA,CAAW,MAAA,CAAO,GAAA,CAAI,IAAA,CAAK,QAAQ,CAAA,IAAK,CAAA,IAAK,OAAA,CAAQ,IAAA,CAAK,UAAU,IAAI,MAAM,CAAA;AAAA,EAChG;AAGA,EAAA,MAAM,MAAA,GAAS,wBAAwB,OAAO,CAAA;AAE9C,EAAA,MAAM,EAAE,IAAA,EAAM,SAAA,EAAW,aAAY,GAAI,MAAA,CAAO,QAAQ,MAAM,CAAA;AAE9D,EAAA,IAAI,SAAS,IAAA,IAAQ,SAAA,GAAY,iBAAA,IAAqB,SAAA,GAAY,cAAc,UAAA,EAAY;AAG1F,IAAA,IAAI,MAAA,KAAW,IAAA,IAAQ,MAAA,CAAO,GAAA,CAAI,MAAM,CAAA,EAAG;AACzC,MAAA,MAAM,KAAA,GAAQ,MAAA,CAAO,GAAA,CAAI,MAAM,CAAA,IAAK,CAAA;AACpC,MAAA,OAAO;AAAA,QACL,QAAA,EAAU,MAAA;AAAA,QACV,UAAA,EAAY,OAAA,CAAQ,KAAA,IAAS,KAAA,GAAQ,IAAA,CAAK,CAAA;AAAA,QAC1C,QAAA,EAAU,CAAC,GAAG,UAAU;AAAA,OAC1B;AAAA,IACF;AACA,IAAA,OAAO,EAAE,QAAA,EAAU,SAAA,EAAW,UAAA,EAAY,OAAA,CAAQ,SAAS,CAAA,EAAG,QAAA,EAAU,CAAC,GAAG,UAAU,CAAA,EAAE;AAAA,EAC1F;AAEA,EAAA,OAAO;AAAA,IACL,QAAA,EAAU,IAAA;AAAA,IACV,UAAA,EAAY,OAAA,CAAQ,SAAA,IAAa,SAAA,GAAY,cAAc,IAAA,CAAK,CAAA;AAAA,IAChE,QAAA,EAAU,CAAC,GAAG,UAAU;AAAA,GAC1B;AACF;AAYA,SAAS,iBAAA,CACP,UACA,WAAA,EACoB;AAC
pB,EAAA,OAAO,QAAA,CAAS,GAAA,CAAI,CAAC,IAAA,KAAS;AAC5B,IAAA,IAAI,IAAA,CAAK,QAAA,KAAa,SAAA,EAAW,OAAO,IAAA;AACxC,IAAA,MAAM,UAAA,GAAa,cAAA,CAAe,IAAA,CAAK,QAAQ,KAAK,IAAA,CAAK,QAAA;AACzD,IAAA,IAAI,UAAA,KAAe,IAAA,CAAK,QAAA,EAAU,OAAO,IAAA;AACzC,IAAA,OAAO,EAAE,GAAG,IAAA,EAAM,QAAA,EAAU,UAAA,EAAW;AAAA,EACzC,CAAC,CAAA;AACH;AAoBA,SAAS,oBAAA,CACP,YACA,UAAA,EACoB;AACpB,EAAA,MAAM,SAAA,GAAY,WAAW,MAAA,CAAO,CAAC,SAAS,CAAC,aAAA,CAAc,IAAA,EAAM,UAAU,CAAC,CAAA;AAE9E,EAAA,MAAM,WAAA,GAAc,4BAAA,CAA6B,UAAA,EAAY,UAAU,CAAA;AACvE,EAAA,IAAI,WAAA,KAAgB,MAAM,OAAO,SAAA;AAEjC,EAAA,MAAM,QAAA,GAAW,YAAA,CAAa,UAAA,IAAc,EAAE,CAAA;AAC9C,EAAA,OAAO,SAAA,CAAU,MAAA,CAAO,CAAC,IAAA,KAAS;AAGhC,IAAA,IAAI,YAAA,CAAa,IAAI,IAAA,CAAK,IAAI,KAAK,IAAA,CAAK,QAAA,KAAa,WAAW,OAAO,IAAA;AACvE,IAAA,MAAM,UAAA,GAAa,QAAA,CAAS,GAAA,CAAI,IAAA,CAAK,QAAQ,CAAA;AAC7C,IAAA,OAAO,UAAA,KAAe,UAAa,UAAA,KAAe,WAAA;AAAA,EACpD,CAAC,CAAA;AACH;AAKA,SAAS,4BAAA,CACP,YACA,UAAA,EACmB;AACnB,EAAA,IAAI,UAAA,KAAe,QAAW,OAAO,IAAA;AACrC,EAAA,MAAM,QAAA,GAAW,aAAa,UAAU,CAAA;AACxC,EAAA,KAAA,MAAW,QAAQ,UAAA,EAAY;AAC7B,IAAA,IAAI,aAAA,CAAc,IAAA,EAAM,UAAU,CAAA,EAAG;AACnC,MAAA,MAAM,MAAA,GAAS,QAAA,CAAS,GAAA,CAAI,IAAA,CAAK,QAAQ,CAAA;AACzC,MAAA,IAAI,MAAA,KAAW,QAAW,OAAO,MAAA;AAAA,IACnC;AAAA,EACF;AACA,EAAA,OAAO,IAAA;AACT;AAIA,SAAS,aAAa,UAAA,EAAiE;AACrF,EAAA,MAAM,GAAA,uBAAU,GAAA,EAAwB;AACxC,EAAA,KAAA,MAAW,KAAK,UAAA,EAAY;AAC1B,IAAA,MAAM,MAAA,GAAS,gBAAgB,CAAC,CAAA;AAChC,IAAA,IAAI,WAAW,IAAA,EAAM,GAAA,CAAI,GAAA,CAAI,CAAA,CAAE,MAAM,MAAM,CAAA;AAAA,EAC7C;AACA,EAAA,OAAO,GAAA;AACT;AAUA,SAAS,aAAA,CAAc,MAAwB,GAAA,EAA2C;AACxF,EAAA,IAAI,IAAA,CAAK,mBAAmB,KAAA,IAAS,CAAC,aAAa,GAAA,CAAI,IAAA,CAAK,IAAI,CAAA,EAAG,OAAO,KAAA;AAC1E,EAAA,OAAO,CAAC,GAAA,CAAI,IAAA;AAAA,IACV,CAAC,KAAA,KACC,KAAA,CAAM,QAAA,KAAa,IAAA,CAAK,QAAA,IACxB,KAAA,CAAM,QAAA,KAAa,SAAA,IACnB,CAAC,YAAA,CAAa,GAAA,CAAI,MAAM,IAAI;AAAA,GAChC;AACF;AAKA,SAAS,wBAAwB,QAAA,EAAsD;AACrF,EAAA,IAAI,IAAA,GAAsB,IAAA;AAC1B,EAAA,IAAI,cAAA,GAAiB,CAAA;AACrB,EAAA,KAAA,MAAW,QAAQ,QAAA,EAAU;AAC3B,IAAA,IAAI,IAAA,CAAK,aAAa,SAAA,IAAa,CAAC,aAAa,GAAA,CAAI,IAAA,
CAAK,IAAI,CAAA,EAAG;AACjE,IAAA,MAAM,CAAA,GAAI,OAAA,CAAQ,IAAA,CAAK,UAAU,CAAA;AACjC,IAAA,IAAI,IAAI,uBAAA,EAAyB;AACjC,IAAA,IAAI,IAAI,cAAA,EAAgB;AACtB,MAAA,cAAA,GAAiB,CAAA;AACjB,MAAA,IAAA,GAAO,IAAA,CAAK,QAAA;AAAA,IACd,CAAA,MAAA,IAAW,CAAA,KAAM,cAAA,IAAkB,IAAA,CAAK,aAAa,IAAA,EAAM;AAEzD,MAAA,IAAA,GAAO,IAAA;AAAA,IACT;AAAA,EACF;AACA,EAAA,OAAO,IAAA;AACT;AAQA,SAAS,MAAA,CACP,QACA,MAAA,EACiE;AACjE,EAAA,IAAI,IAAA,GAAsB,IAAA;AAC1B,EAAA,IAAI,SAAA,GAAY,CAAA;AAChB,EAAA,IAAI,WAAA,GAAc,CAAA;AAClB,EAAA,MAAM,cAAc,MAAA,KAAW,IAAA,GAAQ,OAAO,GAAA,CAAI,MAAM,KAAK,CAAA,GAAK,CAAA;AAElE,EAAA,KAAA,MAAW,CAAC,QAAA,EAAU,GAAG,CAAA,IAAK,MAAA,EAAQ;AAEpC,IAAA,MAAM,KAAA,GAAQ,WAAW,IAAA,IAAQ,QAAA,KAAa,SAAS,IAAA,CAAK,GAAA,CAAI,GAAA,EAAK,WAAW,CAAA,GAAI,GAAA;AACpF,IAAA,IAAI,QAAQ,SAAA,EAAW;AACrB,MAAA,WAAA,GAAc,SAAA;AACd,MAAA,SAAA,GAAY,KAAA;AACZ,MAAA,IAAA,GAAO,QAAA;AAAA,IACT,CAAA,MAAA,IAAW,QAAQ,WAAA,EAAa;AAC9B,MAAA,WAAA,GAAc,KAAA;AAAA,IAChB;AAAA,EACF;AAEA,EAAA,IAAI,WAAW,IAAA,IAAQ,IAAA,KAAS,UAAU,SAAA,KAAc,WAAA,IAAe,cAAc,CAAA,EAAG;AACtF,IAAA,WAAA,GAAc,SAAA;AACd,IAAA,IAAA,GAAO,MAAA;AACP,IAAA,SAAA,GAAY,WAAA;AAAA,EACd;AACA,EAAA,OAAO,EAAE,IAAA,EAAM,SAAA,EAAW,WAAA,EAAY;AACxC;AAEA,SAAS,QAAQ,KAAA,EAAuB;AACtC,EAAA,IAAI,CAAC,MAAA,CAAO,QAAA,CAAS,KAAK,GAAG,OAAO,CAAA;AACpC,EAAA,IAAI,KAAA,GAAQ,GAAG,OAAO,CAAA;AACtB,EAAA,IAAI,KAAA,GAAQ,GAAG,OAAO,CAAA;AACtB,EAAA,OAAO,KAAA;AACT","file":"chunk-6PWEE3SR.js","sourcesContent":["import type {\n Classification,\n LanguageEvidence,\n LanguageProfile,\n NonDiscriminatingScript,\n Weights,\n} from \"./types.js\";\nimport { normalizeBCP47 } from \"./internal/bcp47.js\";\nimport { scriptOfProfile, type ScriptName } from \"./internal/classify.js\";\n\nexport interface FuseOptions {\n weights?: Weights;\n /** The candidate roster. When present, incoming evidence tags are normalized\n * into it (`uk-UA` → `uk`, `ua` → `uk`) so context signals (page/header\n * locale) land on the same code the text rungs use. 
*/\n candidates?: readonly LanguageProfile[];\n /** How to resolve a *non-discriminating* script read (one flagged\n * `discriminating: false` — its winning script owned by ≤1 roster candidate).\n * Default `\"candidate\"` keeps current behavior; `\"unknown\"` drops such a read\n * unless non-script evidence corroborates the same language. See\n * {@link NonDiscriminatingScript}. */\n nonDiscriminatingScript?: NonDiscriminatingScript;\n}\n\n/** Default per-kind weights. Clear lexical signal (script, explicit locale)\n * outweighs contextual signal (page tags, headers). Callers override per\n * `source` id or `kind` via {@link FuseOptions.weights}. */\nconst DEFAULT_KIND_WEIGHT: Record<string, number> = {\n \"title-script\": 1,\n \"explicit-locale\": 1,\n \"chrome-ai\": 1,\n \"source-prior\": 0.7,\n franc: 0.7,\n \"http-content-language\": 0.6,\n \"meta-content-language\": 0.55,\n \"meta-og-locale\": 0.55,\n \"html-lang\": 0.5,\n};\n\n/** Evidence kinds that constitute *clear script evidence* — a verdict the text\n * classifier or an on-device model reached by actually reading the string. The\n * guard below forbids weaker page/header *context* from flipping these. */\nconst SCRIPT_KINDS = new Set<string>([\"title-script\", \"franc\", \"chrome-ai\"]);\n\n/** A script verdict this confident is treated as settled — context may add to it\n * but must not flip the winner to a different language. */\nconst SCRIPT_CONFIDENCE_FLOOR = 0.6;\n\nconst MIN_WINNING_SCORE = 0.35;\nconst MIN_MARGIN = 0.12;\n\n/**\n * Combine evidence into a single weighted verdict with an audit trail.\n *\n * Three steps:\n * 1. Normalize each item's language tag into the candidate roster (BCP-47:\n * `uk-UA`/`ua` → `uk`) so text, page, and header signals agree on a code.\n * 2. Weighted argmax over languages (caller weights override per `source`/`kind`).\n * 3. 
Apply the guard **context must never override clear script evidence**: when\n * the text classifier (or an on-device model) confidently read one language,\n * weaker page/header context for a *different* language cannot win — a\n * Ukrainian page chrome does not make a Latin/English title Ukrainian.\n */\nexport function fuse(\n evidence: readonly LanguageEvidence[],\n options: FuseOptions = {},\n): Classification {\n const weights = options.weights ?? {};\n const normalized = normalizeEvidence(evidence, options.candidates);\n\n // Under `\"unknown\"`, a non-discriminating script read scores nothing on its own\n // — it's dropped from the tally and the pin below — but stays in the trail. AND\n // context written in a *different script* than that title is dropped too: a\n // foreign-script title's language is never named by page/transport context in\n // another script (a Latin title on a Ukrainian page is a foreign title in a\n // Ukrainian UI, not a Ukrainian title). The full `normalized` set is still\n // returned as evidence.\n const scoring =\n options.nonDiscriminatingScript === \"unknown\"\n ? filterForUnknownMode(normalized, options.candidates)\n : normalized;\n\n const scores = new Map<string, number>();\n for (const item of scoring) {\n if (item.language === \"unknown\") continue;\n const weight =\n weights[item.source] ?? weights[item.kind] ?? DEFAULT_KIND_WEIGHT[item.kind] ?? 0.5;\n scores.set(item.language, (scores.get(item.language) ?? 
0) + clamp01(item.confidence) * weight);\n }\n\n // The context-vs-script guard: a confident script read pins the winner.\n const pinned = confidentScriptLanguage(scoring);\n\n const { best, bestScore, secondScore } = argmax(scores, pinned);\n\n if (best === null || bestScore < MIN_WINNING_SCORE || bestScore - secondScore < MIN_MARGIN) {\n // A pinned script language still wins even on a thin margin — clear script\n // evidence is never demoted to \"unknown\" by competing context.\n if (pinned !== null && scores.has(pinned)) {\n const score = scores.get(pinned) ?? 0;\n return {\n language: pinned,\n confidence: clamp01(score / (score + 0.15)),\n evidence: [...normalized],\n };\n }\n return { language: \"unknown\", confidence: clamp01(bestScore), evidence: [...normalized] };\n }\n\n return {\n language: best,\n confidence: clamp01(bestScore / (bestScore + secondScore + 0.15)),\n evidence: [...normalized],\n };\n}\n\n/** Normalize each item's tag into the roster's code space (BCP-47-aware). Items\n * already `\"unknown\"` pass through untouched. Tags are BCP-47-normalized\n * (`en-US` → `en`, `ua` → `uk`) so text, page, and header signals land on the\n * same code. The normalized code is kept even when it falls outside the roster —\n * argmax simply won't favor an out-of-roster context tag, but it stays in the\n * audit trail.\n *\n * The roster is accepted (and reserved) so a future revision can fold roster\n * aliasing in without a signature change; today BCP-47 normalization alone\n * reconciles the codes the producers emit. */\nfunction normalizeEvidence(\n evidence: readonly LanguageEvidence[],\n _candidates: readonly LanguageProfile[] | undefined,\n): LanguageEvidence[] {\n return evidence.map((item) => {\n if (item.language === \"unknown\") return item;\n const normalized = normalizeBCP47(item.language) ?? 
item.language;\n if (normalized === item.language) return item;\n return { ...item, language: normalized };\n });\n}\n\n/**\n * The scoring set under `nonDiscriminatingScript: \"unknown\"`. Two cuts:\n *\n * 1. Drop every *neutralized* non-discriminating script read (see\n * {@link isNeutralized}) — it names a language only by being the lone\n * candidate in its script, with nothing corroborating it.\n * 2. Drop context (page/transport) evidence whose language is in a **different\n * script** than such a neutralized title. A foreign-script title's language\n * is not the page's language: a Latin title on a `lang=\"uk\"` page must not\n * resolve to `uk`. Same-script context (an explicit `en` `Content-Language`\n * for a Latin title) survives and may still name — or, among same-script\n * candidates, disambiguate — the title.\n *\n * The second cut needs each language's script, which is derived from the\n * candidate roster's alphabets. When `candidates` is absent the scripts can't be\n * derived, so the cut is skipped and behavior falls back to cut 1 alone (the\n * 0.3.0 behavior) — never throwing.\n */\nfunction filterForUnknownMode(\n normalized: readonly LanguageEvidence[],\n candidates: readonly LanguageProfile[] | undefined,\n): LanguageEvidence[] {\n const surviving = normalized.filter((item) => !isNeutralized(item, normalized));\n\n const titleScript = nonDiscriminatingTitleScript(normalized, candidates);\n if (titleScript === null) return surviving;\n\n const scriptOf = scriptByCode(candidates ?? 
[]);\n return surviving.filter((item) => {\n // Keep the script reads themselves and anything whose script we can't place;\n // only cross-script *context* in a known, different script is excluded.\n if (SCRIPT_KINDS.has(item.kind) || item.language === \"unknown\") return true;\n const itemScript = scriptOf.get(item.language);\n return itemScript === undefined || itemScript === titleScript;\n });\n}\n\n/** The script of the title under `\"unknown\"` mode, or `null` when there is no\n * neutralized non-discriminating script read to anchor on (so no cross-script\n * cut applies) or the roster can't place that read's language. */\nfunction nonDiscriminatingTitleScript(\n normalized: readonly LanguageEvidence[],\n candidates: readonly LanguageProfile[] | undefined,\n): ScriptName | null {\n if (candidates === undefined) return null;\n const scriptOf = scriptByCode(candidates);\n for (const item of normalized) {\n if (isNeutralized(item, normalized)) {\n const script = scriptOf.get(item.language);\n if (script !== undefined) return script;\n }\n }\n return null;\n}\n\n/** Map each roster code to the script of its alphabet (Cyrillic/Latin). Codes\n * whose alphabet carries no Cyrillic/Latin letter are omitted. */\nfunction scriptByCode(candidates: readonly LanguageProfile[]): Map<string, ScriptName> {\n const map = new Map<string, ScriptName>();\n for (const c of candidates) {\n const script = scriptOfProfile(c);\n if (script !== null) map.set(c.code, script);\n }\n return map;\n}\n\n/**\n * Whether a non-discriminating script read should score nothing (mode\n * `\"unknown\"`). True when `item` is a script kind flagged `discriminating:\n * false` (its winning script is owned by ≤1 roster candidate) AND no *non-script*\n * evidence corroborates its language. 
Corroboration must come from context kinds\n * (page tags, headers): two lone-candidate script reads agreeing is still two\n * defaults, not real evidence — so script kinds never corroborate one another.\n */\nfunction isNeutralized(item: LanguageEvidence, all: readonly LanguageEvidence[]): boolean {\n if (item.discriminating !== false || !SCRIPT_KINDS.has(item.kind)) return false;\n return !all.some(\n (other) =>\n other.language === item.language &&\n other.language !== \"unknown\" &&\n !SCRIPT_KINDS.has(other.kind),\n );\n}\n\n/** The language of a *clear script* read confident enough to pin the verdict, or\n * `null` when none qualifies. When two script reads disagree, the higher-\n * confidence one pins (a tie leaves nothing pinned — argmax decides normally). */\nfunction confidentScriptLanguage(evidence: readonly LanguageEvidence[]): string | null {\n let best: string | null = null;\n let bestConfidence = 0;\n for (const item of evidence) {\n if (item.language === \"unknown\" || !SCRIPT_KINDS.has(item.kind)) continue;\n const c = clamp01(item.confidence);\n if (c < SCRIPT_CONFIDENCE_FLOOR) continue;\n if (c > bestConfidence) {\n bestConfidence = c;\n best = item.language;\n } else if (c === bestConfidence && item.language !== best) {\n // Two equally-confident script reads for different languages — ambiguous.\n best = null;\n }\n }\n return best;\n}\n\n/**\n * Weighted argmax. When `pinned` is set (a confident script language), any\n * *other* language's score may only come from context kinds; that score is\n * capped so it can never exceed the pinned language. This enforces the guard\n * without discarding the context from the audit trail.\n */\nfunction argmax(\n scores: Map<string, number>,\n pinned: string | null,\n): { best: string | null; bestScore: number; secondScore: number } {\n let best: string | null = null;\n let bestScore = 0;\n let secondScore = 0;\n const pinnedScore = pinned !== null ? (scores.get(pinned) ?? 
0) : 0;\n\n for (const [language, raw] of scores) {\n // Guard: a non-pinned language cannot out-score the pinned one.\n const score = pinned !== null && language !== pinned ? Math.min(raw, pinnedScore) : raw;\n if (score > bestScore) {\n secondScore = bestScore;\n bestScore = score;\n best = language;\n } else if (score > secondScore) {\n secondScore = score;\n }\n }\n // On a pinned tie (pinned capped equal to a context language), prefer pinned.\n if (pinned !== null && best !== pinned && bestScore === pinnedScore && pinnedScore > 0) {\n secondScore = bestScore;\n best = pinned;\n bestScore = pinnedScore;\n }\n return { best, bestScore, secondScore };\n}\n\nfunction clamp01(value: number): number {\n if (!Number.isFinite(value)) return 0;\n if (value < 0) return 0;\n if (value > 1) return 1;\n return value;\n}\n"]}
@@ -1,4 +1,4 @@
1
- import { classifyBySnippet } from './chunk-U34Z3ZSV.js';
1
+ import { classifyBySnippet } from './chunk-BL627TWI.js';
2
2
 
3
3
  // src/text.ts
4
4
  function evidenceFromText(text, candidates, rung3) {
@@ -31,5 +31,5 @@ function clamp01(value) {
31
31
  }
32
32
 
33
33
  export { evidenceFromText };
34
- //# sourceMappingURL=chunk-KCK3XWCJ.js.map
35
- //# sourceMappingURL=chunk-KCK3XWCJ.js.map
34
+ //# sourceMappingURL=chunk-7TDGJOIJ.js.map
35
+ //# sourceMappingURL=chunk-7TDGJOIJ.js.map
@@ -1 +1 @@
1
- {"version":3,"sources":["../src/text.ts"],"names":[],"mappings":";;;AAmBO,SAAS,gBAAA,CACd,IAAA,EACA,UAAA,EACA,KAAA,EACoB;AACpB,EAAA,IAAI,IAAA,KAAS,UAAa,IAAA,CAAK,IAAA,GAAO,MAAA,KAAW,CAAA,SAAU,EAAC;AAC5D,EAAA,IAAI,eAAe,MAAA,IAAa,UAAA,CAAW,MAAA,KAAW,CAAA,SAAU,EAAC;AAEjE,EAAA,MAAM,OAAA,GAAU,iBAAA,CAAkB,IAAA,EAAM,UAAA,EAAY,KAAK,CAAA;AACzD,EAAA,IAAI,OAAA,CAAQ,QAAA,KAAa,SAAA,EAAW,OAAO,EAAC;AAE5C,EAAA,MAAM,IAAA,GAAyB;AAAA,IAC7B,IAAA,EAAM,cAAA;AAAA,IACN,UAAU,OAAA,CAAQ,QAAA;AAAA,IAClB,UAAA,EAAY,kBAAA,CAAmB,OAAA,CAAQ,MAAA,EAAQ,QAAQ,IAAI,CAAA;AAAA,IAC3D,MAAA,EAAQ,cAAA;AAAA,IACR,OAAO,IAAA,CAAK,IAAA,EAAK,CAAE,KAAA,CAAM,GAAG,EAAE;AAAA,GAChC;AAIA,EAAA,IAAI,CAAC,OAAA,CAAQ,cAAA,EAAgB,IAAA,CAAK,cAAA,GAAiB,KAAA;AACnD,EAAA,OAAO,CAAC,IAAI,CAAA;AACd;AAWA,SAAS,kBAAA,CAAmB,QAAgB,IAAA,EAAoB;AAC9D,EAAA,IAAI,SAAS,CAAA,EAAG;AAEd,IAAA,OAAO,OAAA,CAAQ,GAAA,GAAM,IAAA,CAAK,GAAA,CAAI,IAAA,CAAK,GAAA,CAAI,MAAA,EAAQ,CAAC,CAAA,EAAG,CAAC,CAAA,GAAI,IAAI,CAAA;AAAA,EAC9D;AACA,EAAA,MAAM,IAAA,GAAO,IAAA,CAAK,GAAA,CAAI,MAAA,EAAQ,CAAC,CAAA;AAC/B,EAAA,OAAO,OAAA,CAAQ,MAAO,IAAA,CAAK,GAAA,CAAI,MAAM,CAAC,CAAA,GAAI,IAAK,IAAI,CAAA;AACrD;AAEA,SAAS,QAAQ,KAAA,EAAuB;AACtC,EAAA,IAAI,CAAC,MAAA,CAAO,QAAA,CAAS,KAAK,GAAG,OAAO,CAAA;AACpC,EAAA,IAAI,KAAA,GAAQ,GAAG,OAAO,CAAA;AACtB,EAAA,IAAI,KAAA,GAAQ,GAAG,OAAO,CAAA;AACtB,EAAA,OAAO,KAAA;AACT","file":"chunk-KCK3XWCJ.js","sourcesContent":["import type { LanguageEvidence, LanguageProfile } from \"./types.js\";\nimport { classifyBySnippet, type Rung, type Rung3Resolver } from \"./internal/classify.js\";\n\n/**\n * Producer: candidate-relative script + lexical signals from the title text.\n *\n * Wraps the ported snippet classifier ({@link classifyBySnippet}): noise strip →\n * dominant-script scope → distinctive letters (rung 1) → function words (2a) →\n * frequent words (2b). The `candidates` roster makes scoring roster-relative —\n * `і` decides Ukrainian only when Russian is also a candidate. 
Sync and\n * zero-dependency; the optional franc rung is injected via `rung3`.\n *\n * Emits at most one `kind: \"title-script\"` evidence item. The classifier's\n * integer `margin` (the winner's lead over the runner-up) maps to a 0..1\n * `confidence`: a verdict at all means the dominant script and the deciding rung\n * agreed, so the floor is high; a wider lead nudges it up. With no candidates\n * (or no usable distinctive signal) it abstains — emitting nothing rather than a\n * coarse \"unknown\", since the roster decides relevance.\n */\nexport function evidenceFromText(\n text: string | undefined,\n candidates?: readonly LanguageProfile[],\n rung3?: Rung3Resolver,\n): LanguageEvidence[] {\n if (text === undefined || text.trim().length === 0) return [];\n if (candidates === undefined || candidates.length === 0) return [];\n\n const verdict = classifyBySnippet(text, candidates, rung3);\n if (verdict.language === \"unknown\") return [];\n\n const item: LanguageEvidence = {\n kind: \"title-script\",\n language: verdict.language,\n confidence: marginToConfidence(verdict.margin, verdict.rung),\n source: \"title-script\",\n value: text.trim().slice(0, 80),\n };\n // Surface only the meaningful negative: the script was owned by ≤1 candidate,\n // so it didn't choose between candidates. The discriminating case stays narrow\n // (flag omitted). `fuse({ nonDiscriminatingScript: \"unknown\" })` reads this.\n if (!verdict.discriminating) item.discriminating = false;\n return [item];\n}\n\n/**\n * Map the classifier's per-rung lead to a 0..1 confidence.\n *\n * Rungs 1–2 carry an integer count of distinctive items (≥1). A verdict already\n * means script + rung agreed, so the floor is high (0.6) and each extra\n * distinctive item adds up to a 0.35 bonus, saturating by a lead of 4. 
Rung 3\n * (franc) carries franc's own 0..1 score-gap, which is weaker evidence, so it is\n * scaled into a 0.4..0.75 band.\n */\nfunction marginToConfidence(margin: number, rung: Rung): number {\n if (rung === 3) {\n // franc score-gap is already 0..1; weaker than the distinctive rungs.\n return clamp01(0.4 + Math.min(Math.max(margin, 0), 1) * 0.35);\n }\n const lead = Math.max(margin, 1);\n return clamp01(0.6 + (Math.min(lead, 4) / 4) * 0.35);\n}\n\nfunction clamp01(value: number): number {\n if (!Number.isFinite(value)) return 0;\n if (value < 0) return 0;\n if (value > 1) return 1;\n return value;\n}\n"]}
1
+ {"version":3,"sources":["../src/text.ts"],"names":[],"mappings":";;;AAmBO,SAAS,gBAAA,CACd,IAAA,EACA,UAAA,EACA,KAAA,EACoB;AACpB,EAAA,IAAI,IAAA,KAAS,UAAa,IAAA,CAAK,IAAA,GAAO,MAAA,KAAW,CAAA,SAAU,EAAC;AAC5D,EAAA,IAAI,eAAe,MAAA,IAAa,UAAA,CAAW,MAAA,KAAW,CAAA,SAAU,EAAC;AAEjE,EAAA,MAAM,OAAA,GAAU,iBAAA,CAAkB,IAAA,EAAM,UAAA,EAAY,KAAK,CAAA;AACzD,EAAA,IAAI,OAAA,CAAQ,QAAA,KAAa,SAAA,EAAW,OAAO,EAAC;AAE5C,EAAA,MAAM,IAAA,GAAyB;AAAA,IAC7B,IAAA,EAAM,cAAA;AAAA,IACN,UAAU,OAAA,CAAQ,QAAA;AAAA,IAClB,UAAA,EAAY,kBAAA,CAAmB,OAAA,CAAQ,MAAA,EAAQ,QAAQ,IAAI,CAAA;AAAA,IAC3D,MAAA,EAAQ,cAAA;AAAA,IACR,OAAO,IAAA,CAAK,IAAA,EAAK,CAAE,KAAA,CAAM,GAAG,EAAE;AAAA,GAChC;AAIA,EAAA,IAAI,CAAC,OAAA,CAAQ,cAAA,EAAgB,IAAA,CAAK,cAAA,GAAiB,KAAA;AACnD,EAAA,OAAO,CAAC,IAAI,CAAA;AACd;AAWA,SAAS,kBAAA,CAAmB,QAAgB,IAAA,EAAoB;AAC9D,EAAA,IAAI,SAAS,CAAA,EAAG;AAEd,IAAA,OAAO,OAAA,CAAQ,GAAA,GAAM,IAAA,CAAK,GAAA,CAAI,IAAA,CAAK,GAAA,CAAI,MAAA,EAAQ,CAAC,CAAA,EAAG,CAAC,CAAA,GAAI,IAAI,CAAA;AAAA,EAC9D;AACA,EAAA,MAAM,IAAA,GAAO,IAAA,CAAK,GAAA,CAAI,MAAA,EAAQ,CAAC,CAAA;AAC/B,EAAA,OAAO,OAAA,CAAQ,MAAO,IAAA,CAAK,GAAA,CAAI,MAAM,CAAC,CAAA,GAAI,IAAK,IAAI,CAAA;AACrD;AAEA,SAAS,QAAQ,KAAA,EAAuB;AACtC,EAAA,IAAI,CAAC,MAAA,CAAO,QAAA,CAAS,KAAK,GAAG,OAAO,CAAA;AACpC,EAAA,IAAI,KAAA,GAAQ,GAAG,OAAO,CAAA;AACtB,EAAA,IAAI,KAAA,GAAQ,GAAG,OAAO,CAAA;AACtB,EAAA,OAAO,KAAA;AACT","file":"chunk-7TDGJOIJ.js","sourcesContent":["import type { LanguageEvidence, LanguageProfile } from \"./types.js\";\nimport { classifyBySnippet, type Rung, type Rung3Resolver } from \"./internal/classify.js\";\n\n/**\n * Producer: candidate-relative script + lexical signals from the title text.\n *\n * Wraps the ported snippet classifier ({@link classifyBySnippet}): noise strip →\n * dominant-script scope → distinctive letters (rung 1) → function words (2a) →\n * frequent words (2b). The `candidates` roster makes scoring roster-relative —\n * `і` decides Ukrainian only when Russian is also a candidate. 
Sync and\n * zero-dependency; the optional franc rung is injected via `rung3`.\n *\n * Emits at most one `kind: \"title-script\"` evidence item. The classifier's\n * integer `margin` (the winner's lead over the runner-up) maps to a 0..1\n * `confidence`: a verdict at all means the dominant script and the deciding rung\n * agreed, so the floor is high; a wider lead nudges it up. With no candidates\n * (or no usable distinctive signal) it abstains — emitting nothing rather than a\n * coarse \"unknown\", since the roster decides relevance.\n */\nexport function evidenceFromText(\n text: string | undefined,\n candidates?: readonly LanguageProfile[],\n rung3?: Rung3Resolver,\n): LanguageEvidence[] {\n if (text === undefined || text.trim().length === 0) return [];\n if (candidates === undefined || candidates.length === 0) return [];\n\n const verdict = classifyBySnippet(text, candidates, rung3);\n if (verdict.language === \"unknown\") return [];\n\n const item: LanguageEvidence = {\n kind: \"title-script\",\n language: verdict.language,\n confidence: marginToConfidence(verdict.margin, verdict.rung),\n source: \"title-script\",\n value: text.trim().slice(0, 80),\n };\n // Surface only the meaningful negative: the script was owned by ≤1 candidate,\n // so it didn't choose between candidates. The discriminating case stays narrow\n // (flag omitted). `fuse({ nonDiscriminatingScript: \"unknown\" })` reads this.\n if (!verdict.discriminating) item.discriminating = false;\n return [item];\n}\n\n/**\n * Map the classifier's per-rung lead to a 0..1 confidence.\n *\n * Rungs 1–2 carry an integer count of distinctive items (≥1). A verdict already\n * means script + rung agreed, so the floor is high (0.6) and each extra\n * distinctive item adds up to a 0.35 bonus, saturating by a lead of 4. 
Rung 3\n * (franc) carries franc's own 0..1 score-gap, which is weaker evidence, so it is\n * scaled into a 0.4..0.75 band.\n */\nfunction marginToConfidence(margin: number, rung: Rung): number {\n if (rung === 3) {\n // franc score-gap is already 0..1; weaker than the distinctive rungs.\n return clamp01(0.4 + Math.min(Math.max(margin, 0), 1) * 0.35);\n }\n const lead = Math.max(margin, 1);\n return clamp01(0.6 + (Math.min(lead, 4) / 4) * 0.35);\n}\n\nfunction clamp01(value: number): number {\n if (!Number.isFinite(value)) return 0;\n if (value < 0) return 0;\n if (value > 1) return 1;\n return value;\n}\n"]}
@@ -8,6 +8,7 @@ var UNKNOWN = {
8
8
  };
9
9
  var CYRILLIC_RE = /\p{Script=Cyrillic}/u;
10
10
  var LATIN_RE = /\p{Script=Latin}/u;
11
+ var RUNG3_MIN_LENGTH = 24;
11
12
  var NOISE_PATTERNS = [
12
13
  /\bhttps?:\/\/\S+/gi,
13
14
  // full URLs
@@ -126,6 +127,6 @@ function classifyBySnippet(text, candidates, rung3) {
126
127
  return byWord ? { ...byWord, discriminating } : UNKNOWN;
127
128
  }
128
129
 
129
- export { FRANC_RUNG, classifyBySnippet, scopeCandidates, scriptOfProfile };
130
- //# sourceMappingURL=chunk-U34Z3ZSV.js.map
131
- //# sourceMappingURL=chunk-U34Z3ZSV.js.map
130
+ export { FRANC_RUNG, RUNG3_MIN_LENGTH, classifyBySnippet, scopeCandidates, scriptOfProfile };
131
+ //# sourceMappingURL=chunk-BL627TWI.js.map
132
+ //# sourceMappingURL=chunk-BL627TWI.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/internal/classify.ts"],"names":[],"mappings":";AA0BO,IAAM,UAAA,GAAa;AA0B1B,IAAM,OAAA,GAA0B;AAAA,EAC9B,QAAA,EAAU,SAAA;AAAA,EACV,MAAA,EAAQ,CAAA;AAAA,EACR,IAAA,EAAM,IAAA;AAAA,EACN,cAAA,EAAgB;AAClB,CAAA;AAmBA,IAAM,WAAA,GAAc,sBAAA;AACpB,IAAM,QAAA,GAAW,mBAAA;AAOV,IAAM,gBAAA,GAAmB;AAehC,IAAM,cAAA,GAAoC;AAAA,EACxC,oBAAA;AAAA;AAAA,EACA,cAAA;AAAA;AAAA,EACA,2CAAA;AAAA;AAAA,EACA;AAAA;AACF,CAAA;AAIO,SAAS,WAAW,IAAA,EAAsB;AAC/C,EAAA,IAAI,GAAA,GAAM,IAAA;AACV,EAAA,KAAA,MAAW,MAAM,cAAA,EAAgB,GAAA,GAAM,GAAA,CAAI,OAAA,CAAQ,IAAI,GAAG,CAAA;AAC1D,EAAA,OAAO,GAAA;AACT;AAKA,SAAS,eAAe,IAAA,EAA2C;AACjE,EAAA,IAAI,GAAA,GAAM,CAAA;AACV,EAAA,IAAI,GAAA,GAAM,CAAA;AACV,EAAA,KAAA,MAAW,EAAA,IAAM,UAAA,CAAW,IAAI,CAAA,EAAG;AACjC,IAAA,IAAI,WAAA,CAAY,IAAA,CAAK,EAAE,CAAA,EAAG,GAAA,IAAO,CAAA;AAAA,SAAA,IACxB,QAAA,CAAS,IAAA,CAAK,EAAE,CAAA,EAAG,GAAA,IAAO,CAAA;AAAA,EACrC;AACA,EAAA,IAAI,GAAA,KAAQ,CAAA,IAAK,GAAA,KAAQ,CAAA,EAAG,OAAO,IAAA;AACnC,EAAA,OAAO,GAAA,IAAO,MAAM,UAAA,GAAa,OAAA;AACnC;AAMO,SAAS,gBAAgB,OAAA,EAA6C;AAC3E,EAAA,KAAA,MAAW,EAAA,IAAM,QAAQ,QAAA,EAAU;AACjC,IAAA,IAAI,WAAA,CAAY,IAAA,CAAK,EAAE,CAAA,EAAG,OAAO,UAAA;AACjC,IAAA,IAAI,QAAA,CAAS,IAAA,CAAK,EAAE,CAAA,EAAG,OAAO,OAAA;AAAA,EAChC;AACA,EAAA,OAAO,IAAA;AACT;AAMO,SAAS,eAAA,CACd,MACA,UAAA,EACK;AACL,EAAA,MAAM,MAAA,GAAS,eAAe,IAAI,CAAA;AAClC,EAAA,IAAI,MAAA,KAAW,IAAA,EAAM,OAAO,EAAC;AAI7B,EAAA,MAAM,IAAA,uBAAW,GAAA,EAAY;AAC7B,EAAA,MAAM,SAAc,EAAC;AACrB,EAAA,KAAA,MAAW,KAAK,UAAA,EAAY;AAC1B,IAAA,IAAI,eAAA,CAAgB,CAAC,CAAA,KAAM,MAAA,IAAU,KAAK,GAAA,CAAI,CAAA,CAAE,IAAI,CAAA,EAAG;AACvD,IAAA,IAAA,CAAK,GAAA,CAAI,EAAE,IAAI,CAAA;AACf,IAAA,MAAA,CAAO,KAAK,CAAC,CAAA;AAAA,EACf;AACA,EAAA,OAAO,MAAA;AACT;AA8BA,SAAS,SAAS,IAAA,EAAwB;AACxC,EAAA,OAAO,KAAK,WAAA,EAAY,CAAE,KAAA,CAAM,UAAU,KAAK,EAAC;AAClD;AAOA,SAAS,KAAA,CAAM,OAAyB,UAAA,EAAwD;AAC9F,EAAA,MAAM,MAAA,GAAS,IAAI,GAAA,CAAoB,UAAA,CAAW,GAAA,CAAI,CAAC,CAAA,KAAM,CAAC,CAAA,CAAE,IAAA,EAAM,CAAC,CAAC,CAAC,CAAA;AACzE,EAAA,KAAA,MAAW,QAAQ,KAAA,EAAO;AACxB,IAAA,IAAI,KAAA,GAAuB,IAAA;AAC3B,IAAA,IAAI,MAAA,GAAS,CAAA;AAC
b,IAAA,KAAA,MAAW,KAAK,UAAA,EAAY;AAC1B,MAAA,IAAI,CAAA,CAAE,GAAA,CAAI,GAAA,CAAI,IAAI,CAAA,EAAG;AACnB,QAAA,MAAA,IAAU,CAAA;AACV,QAAA,IAAI,SAAS,CAAA,EAAG;AACd,UAAA,KAAA,GAAQ,IAAA;AACR,UAAA;AAAA,QACF;AACA,QAAA,KAAA,GAAQ,CAAA,CAAE,IAAA;AAAA,MACZ;AAAA,IACF;AACA,IAAA,IAAI,KAAA,KAAU,IAAA,EAAM,MAAA,CAAO,GAAA,CAAI,KAAA,EAAA,CAAQ,OAAO,GAAA,CAAI,KAAK,CAAA,IAAK,CAAA,IAAK,CAAC,CAAA;AAAA,EACpE;AACA,EAAA,OAAO,MAAA;AACT;AAGA,SAAS,OAAO,MAAA,EAAsE;AACpF,EAAA,IAAI,GAAA,GAAM,EAAA;AACV,EAAA,IAAI,MAAA,GAAS,EAAA;AACb,EAAA,IAAI,IAAA,GAAsB,IAAA;AAC1B,EAAA,KAAA,MAAW,CAAC,CAAA,EAAG,KAAK,CAAA,IAAK,MAAA,EAAQ;AAC/B,IAAA,IAAI,QAAQ,GAAA,EAAK;AACf,MAAA,MAAA,GAAS,GAAA;AACT,MAAA,GAAA,GAAM,KAAA;AACN,MAAA,IAAA,GAAO,CAAA;AAAA,IACT,CAAA,MAAA,IAAW,QAAQ,MAAA,EAAQ;AACzB,MAAA,MAAA,GAAS,KAAA;AAAA,IACX;AAAA,EACF;AACA,EAAA,IAAI,IAAA,KAAS,IAAA,IAAQ,GAAA,GAAM,CAAA,EAAG,OAAO,IAAA;AACrC,EAAA,MAAM,MAAA,GAAS,GAAA,GAAM,IAAA,CAAK,GAAA,CAAI,QAAQ,CAAC,CAAA;AACvC,EAAA,OAAO,MAAA,IAAU,CAAA,GAAI,EAAE,IAAA,EAAM,QAAO,GAAI,IAAA;AAC1C;AAEA,SAAS,aAAA,CACP,YACA,IAAA,EACc;AACd,EAAA,OAAO,UAAA,CAAW,GAAA,CAAI,CAAC,CAAA,MAAO,EAAE,IAAA,EAAM,CAAA,CAAE,IAAA,EAAM,GAAA,EAAK,IAAI,GAAA,CAAI,IAAA,CAAK,CAAC,CAAC,GAAE,CAAE,CAAA;AACxE;AAIA,SAAS,UAAA,CAAW,MAAc,MAAA,EAAwD;AACxF,EAAA,MAAM,CAAA,GAAI,MAAA;AAAA,IACR,KAAA;AAAA,MACE,KAAK,WAAA,EAAY;AAAA,MACjB,aAAA,CAAc,QAAQ,CAAC,CAAA,KAAM,EAAE,QAAA,IAAY,CAAA,CAAE,SAAS,EAAA,CAAG;AAAA;AAC3D,GACF;AACA,EAAA,OAAO,CAAA,GAAI,EAAE,QAAA,EAAU,CAAA,CAAE,IAAA,EAAM,QAAQ,CAAA,CAAE,MAAA,EAAQ,IAAA,EAAM,CAAA,EAAE,GAAI,IAAA;AAC/D;AAGA,SAAS,QAAA,CACP,MAAA,EACA,MAAA,EACA,IAAA,EACA,IAAA,EACoB;AACpB,EAAA,MAAM,CAAA,GAAI,MAAA;AAAA,IACR,KAAA;AAAA,MACE,MAAA;AAAA,MACA,aAAA,CAAc,QAAQ,CAAC,CAAA,KAAM,EAAE,KAAA,GAAQ,IAAI,CAAA,IAAK,EAAE;AAAA;AACpD,GACF;AACA,EAAA,OAAO,CAAA,GAAI,EAAE,QAAA,EAAU,CAAA,CAAE,MAAM,MAAA,EAAQ,CAAA,CAAE,MAAA,EAAQ,IAAA,EAAK,GAAI,IAAA;AAC5D;AAcO,SAAS,iBAAA,CACd,IAAA,EACA,UAAA,EACA,KAAA,EACgB;AAChB,EAAA,IAAI,CAAC,IAAA,IAAQ,UAAA,CAAW,MAAA,KAAW,GAAG,OAAO,OAAA;AAI7C,EAAA,MAAM,OAAA,GAAU,WAAW,IAAI,CAAA;AAG/B,EAAA,MAAM,MAAA,G
AAS,eAAA,CAAgB,OAAA,EAAS,UAAU,CAAA;AAClD,EAAA,IAAI,MAAA,CAAO,MAAA,KAAW,CAAA,EAAG,OAAO,OAAA;AAKhC,EAAA,MAAM,cAAA,GAAiB,OAAO,MAAA,IAAU,CAAA;AAExC,EAAA,MAAM,QAAA,GAAW,UAAA,CAAW,OAAA,EAAS,MAAM,CAAA;AAC3C,EAAA,IAAI,QAAA,EAAU,OAAO,EAAE,GAAG,UAAU,cAAA,EAAe;AAEnD,EAAA,MAAM,MAAA,GAAS,SAAS,OAAO,CAAA;AAC/B,EAAA,IAAI,MAAA,CAAO,MAAA,KAAW,CAAA,EAAG,OAAO,OAAA;AAEhC,EAAA,MAAM,MAAA,GACJ,QAAA,CAAS,MAAA,EAAQ,MAAA,EAAQ,YAAY,IAAI,CAAA,IACzC,QAAA,CAAS,MAAA,EAAQ,QAAQ,UAAA,EAAY,IAAI,CAAA,IACzC,KAAA,GAAQ,SAAS,MAAM,CAAA;AACzB,EAAA,OAAO,MAAA,GAAS,EAAE,GAAG,MAAA,EAAQ,gBAAe,GAAI,OAAA;AAClD","file":"chunk-BL627TWI.js","sourcesContent":["/**\n * Per-snippet language classification by candidate-set-relative set-difference.\n *\n * A ladder of rungs; the first rung whose leader clears a lead (margin) of ≥1\n * wins; otherwise `\"unknown\"`:\n *\n * 1 alphabet — characters distinctive within the candidate set\n * 2a function words — curated grammatical markers (highest precision)\n * 2b frequent words — corpus content words\n * 3 franc — optional trigram backstop for the distinctive-free\n * residual, injected as a resolver (this module stays\n * franc-free and importable without franc's tables)\n *\n * \"Distinctive\" is ALWAYS relative to the candidate set: a signal counts for a\n * candidate iff it appears in that candidate's profile and in NO other\n * candidate's. So `і` decides {uk, ru} (only uk has it) but is inert in\n * {uk, be} (both have it), and the word `и` decides {uk, ru} even though the\n * *letter* `и` is shared. Nothing is precomputed — uniqueness is the runtime\n * output, never stored.\n *\n * Adapted to langtell's {@link LanguageProfile} shape: the `words` and `iso6393`\n * fields are optional here, so a bare `{ code, alphabet }` profile still\n * classifies on rung 1.\n */\nimport type { LanguageProfile } from \"../types.js\";\n\nexport const FRANC_RUNG = 3;\n\n/** Which rung decided a verdict; `null` when unknown. 
*/\nexport type Rung = 1 | \"2a\" | \"2b\" | typeof FRANC_RUNG | null;\n\nexport interface SnippetVerdict {\n /** Winning language code, or the sentinel `\"unknown\"`. */\n language: string;\n /** Lead of the winner over the runner-up, in the rung's own unit (distinctive\n * char/word count for rungs 1–2; franc score-gap for rung 3). 0 when unknown. */\n margin: number;\n /** Which rung decided; `null` when unknown. */\n rung: Rung;\n /** Whether ≥2 same-script candidates were in scope when the verdict was\n * reached. `true` ⇒ the distinctive-letter/word machinery actually chose\n * between candidates; `false` ⇒ the winner was the lone candidate in its\n * script, selected by script alone (no evidence it is *distinctively* that\n * language). `false` for `\"unknown\"`. */\n discriminating: boolean;\n}\n\n/** A rung's verdict before {@link classifyBySnippet} stamps on the scope-derived\n * `discriminating` flag (which a single rung can't know — it depends on how many\n * same-script candidates were scoped). */\nexport type RungVerdict = Pick<SnippetVerdict, \"language\" | \"margin\" | \"rung\">;\n\nconst UNKNOWN: SnippetVerdict = {\n language: \"unknown\",\n margin: 0,\n rung: null,\n discriminating: false,\n};\n\n/** Resolver for rung 3 (the optional trigram backstop), injected into\n * {@link classifyBySnippet} by callers that have franc available. Kept as an\n * injected seam — not a direct import — so this module stays franc-free and\n * importable without pulling franc's tables. Returns a rung-3 verdict or\n * `null` (abstain).\n *\n * Generic over the concrete profile type `P` the caller classifies with, so a\n * consumer that defines a stricter profile (e.g. `words` required) can type its\n * resolver over that exact shape and hand it to {@link classifyBySnippet} with\n * no adapter — the resolver sees `readonly P[]`, the same array the classifier\n * scoped from its input. Defaults to {@link LanguageProfile} for callers that\n * don't narrow. 
*/\nexport type Rung3Resolver<P extends LanguageProfile = LanguageProfile> = (\n text: string,\n scoped: readonly P[],\n) => RungVerdict | null;\n\nconst CYRILLIC_RE = /\\p{Script=Cyrillic}/u;\nconst LATIN_RE = /\\p{Script=Latin}/u;\n\n/** A coarse script bucket — the only two the candidate-relative classifier\n * distinguishes today. `null` means \"no letters / undetermined\". */\nexport type ScriptName = \"cyrillic\" | \"latin\";\n\n/** Below this length, trigrams are too noisy to justify a rung-3 verdict. */\nexport const RUNG3_MIN_LENGTH = 24;\n\n/**\n * Trailing/inline Latin \"noise\" tokens — URLs, @handles, #hashtags — that a\n * Cyrillic title commonly carries (a headline followed by a link or a social\n * handle). These are almost always Latin even on Cyrillic-language content, so\n * left in they can flip {@link dominantScript} to Latin and let genuinely\n * Cyrillic content scope to the wrong roster. Stripped before the script vote\n * AND before the rung tallies so the URL's letters never contribute either.\n *\n * Kept as separate simple patterns (applied in order — schemes/www before bare\n * domains) rather than one big alternation, so each stays readable. ASCII-only\n * `[a-z0-9-]` in the domain pattern means a Cyrillic word is never mistaken for\n * a domain.\n */\nconst NOISE_PATTERNS: readonly RegExp[] = [\n /\\bhttps?:\\/\\/\\S+/gi, // full URLs\n /\\bwww\\.\\S+/gi, // www.… without a scheme\n /\\b[a-z0-9-]+(?:\\.[a-z0-9-]+)+(?:\\/\\S*)?/gi, // bare domains (example.com/path)\n /[@#][\\p{L}\\p{N}_]+/gu, // @handles and #hashtags\n];\n\n/** Drop URLs / @handles / #hashtags so trailing Latin noise can't outvote the\n * prose's script or pollute the per-rung tallies. 
*/\nexport function stripNoise(text: string): string {\n let out = text;\n for (const re of NOISE_PATTERNS) out = out.replace(re, \" \");\n return out;\n}\n\n/** The script most of `text` is written in, or `null` if it carries no letters.\n * Noise (URLs/handles/hashtags) is stripped first so a single trailing link\n * can't flip a multi-word Cyrillic title's vote to Latin. */\nfunction dominantScript(text: string): \"cyrillic\" | \"latin\" | null {\n let cyr = 0;\n let lat = 0;\n for (const ch of stripNoise(text)) {\n if (CYRILLIC_RE.test(ch)) cyr += 1;\n else if (LATIN_RE.test(ch)) lat += 1;\n }\n if (cyr === 0 && lat === 0) return null;\n return cyr >= lat ? \"cyrillic\" : \"latin\";\n}\n\n/** The script a profile's alphabet is written in, or `null` if it carries no\n * Cyrillic/Latin letter. Exported so the fuser can derive each roster\n * candidate's script without re-deriving the script regexes — a Latin alphabet\n * ⇒ `\"latin\"`, a Cyrillic one ⇒ `\"cyrillic\"`. */\nexport function scriptOfProfile(profile: LanguageProfile): ScriptName | null {\n for (const ch of profile.alphabet) {\n if (CYRILLIC_RE.test(ch)) return \"cyrillic\";\n if (LATIN_RE.test(ch)) return \"latin\";\n }\n return null;\n}\n\n/** Candidates whose script matches the text's dominant script (others can't tip\n * the verdict). Empty when the text carries no letters. Generic over the\n * concrete profile type `P`: the result is a subset of the input array, so it\n * keeps `P` — a stricter caller's profiles stay strictly typed downstream. */\nexport function scopeCandidates<P extends LanguageProfile>(\n text: string,\n candidates: readonly P[],\n): P[] {\n const script = dominantScript(text);\n if (script === null) return [];\n // Keep one profile per code. 
A language listed twice would otherwise make its\n // own distinctive chars/words read as \"owned by ≥2 candidates\" in `tally`,\n // cancelling them out and collapsing the verdict to \"unknown\".\n const seen = new Set<string>();\n const scoped: P[] = [];\n for (const c of candidates) {\n if (scriptOfProfile(c) !== script || seen.has(c.code)) continue;\n seen.add(c.code);\n scoped.push(c);\n }\n return scoped;\n}\n\n/**\n * Per-language set of characters globally unique within `profiles` — present in\n * exactly one profile's alphabet. Relative to the given profile set: the unique\n * set shrinks as languages are added (a second Latin language un-uniques a–z).\n */\nexport function distinctiveChars(profiles: readonly LanguageProfile[]): Map<string, Set<string>> {\n const owners = new Map<string, string[]>();\n for (const p of profiles) {\n for (const ch of new Set(p.alphabet)) {\n const list = owners.get(ch);\n if (list) list.push(p.code);\n else owners.set(ch, [p.code]);\n }\n }\n const result = new Map<string, Set<string>>(profiles.map((p) => [p.code, new Set()]));\n for (const [ch, codes] of owners) {\n const [only] = codes;\n if (codes.length === 1 && only !== undefined) result.get(only)?.add(ch);\n }\n return result;\n}\n\ninterface Membership {\n code: string;\n set: ReadonlySet<string>;\n}\n\n/** Lowercased Unicode letter-run tokens. Keeps single-char tokens (`і`, `и`). */\nfunction tokenize(text: string): string[] {\n return text.toLowerCase().match(/\\p{L}+/gu) ?? [];\n}\n\n/**\n * Tally how many items (characters or word tokens) are distinctive to each\n * candidate — present in exactly one candidate's set. 
Items owned by zero or by\n * ≥2 candidates contribute nothing.\n */\nfunction tally(items: Iterable<string>, membership: readonly Membership[]): Map<string, number> {\n const scores = new Map<string, number>(membership.map((m) => [m.code, 0]));\n for (const item of items) {\n let owner: string | null = null;\n let owners = 0;\n for (const m of membership) {\n if (m.set.has(item)) {\n owners += 1;\n if (owners > 1) {\n owner = null;\n break;\n }\n owner = m.code;\n }\n }\n if (owner !== null) scores.set(owner, (scores.get(owner) ?? 0) + 1);\n }\n return scores;\n}\n\n/** The leading candidate and its lead over the runner-up, or `null` if <1. */\nfunction leader(scores: Map<string, number>): { code: string; margin: number } | null {\n let max = -1;\n let second = -1;\n let code: string | null = null;\n for (const [c, score] of scores) {\n if (score > max) {\n second = max;\n max = score;\n code = c;\n } else if (score > second) {\n second = score;\n }\n }\n if (code === null || max < 1) return null;\n const margin = max - Math.max(second, 0);\n return margin >= 1 ? { code, margin } : null;\n}\n\nfunction membershipFor(\n candidates: readonly LanguageProfile[],\n pick: (p: LanguageProfile) => Iterable<string>,\n): Membership[] {\n return candidates.map((c) => ({ code: c.code, set: new Set(pick(c)) }));\n}\n\n/** Rung 1 — characters (alphabet + orthographic {@link LanguageProfile.marks})\n * distinctive within the scoped candidate set. */\nfunction letterRung(text: string, scoped: readonly LanguageProfile[]): RungVerdict | null {\n const r = leader(\n tally(\n text.toLowerCase(),\n membershipFor(scoped, (p) => p.alphabet + (p.marks ?? \"\")),\n ),\n );\n return r ? { language: r.code, margin: r.margin, rung: 1 } : null;\n}\n\n/** Rung 2 — distinctive words from the given tier (2a function, 2b frequent). 
*/\nfunction wordRung(\n tokens: readonly string[],\n scoped: readonly LanguageProfile[],\n tier: \"function\" | \"frequent\",\n rung: \"2a\" | \"2b\",\n): RungVerdict | null {\n const r = leader(\n tally(\n tokens,\n membershipFor(scoped, (p) => p.words?.[tier] ?? []),\n ),\n );\n return r ? { language: r.code, margin: r.margin, rung } : null;\n}\n\n/**\n * Classify `text` among `candidates`. Synchronous and allocation-light. Returns\n * `\"unknown\"` on empty evidence, on a tie inside the candidate set, or when\n * nothing is distinctive.\n *\n * Generic over the concrete profile type `P`, inferred from `candidates`. The\n * optional `rung3` resolver is typed over the same `P`, so a consumer with a\n * stricter profile (e.g. `words` required) can pass its own resolver directly,\n * with no adapter — the resolver sees exactly the profiles the caller passed.\n * `P` defaults to {@link LanguageProfile}, so the bare two-argument form and\n * every existing call site are unchanged.\n */\nexport function classifyBySnippet<P extends LanguageProfile = LanguageProfile>(\n text: string,\n candidates: readonly P[],\n rung3?: Rung3Resolver<P>,\n): SnippetVerdict {\n if (!text || candidates.length === 0) return UNKNOWN;\n\n // Drop URLs / @handles / #hashtags once, up front: trailing Latin noise must\n // not flip the dominant-script vote nor pollute the per-rung tallies.\n const cleaned = stripNoise(text);\n\n // Restrict to candidates in the text's dominant script.\n const scoped = scopeCandidates(cleaned, candidates);\n if (scoped.length === 0) return UNKNOWN;\n\n // ≥2 same-script candidates means the distinctive machinery actually had a\n // choice to make; a lone scoped candidate wins by script alone. 
Stamped onto\n // whichever rung decides — a single rung can't see the scope size.\n const discriminating = scoped.length >= 2;\n\n const byLetter = letterRung(cleaned, scoped);\n if (byLetter) return { ...byLetter, discriminating };\n\n const tokens = tokenize(cleaned);\n if (tokens.length === 0) return UNKNOWN;\n\n const byWord =\n wordRung(tokens, scoped, \"function\", \"2a\") ??\n wordRung(tokens, scoped, \"frequent\", \"2b\") ??\n rung3?.(cleaned, scoped);\n return byWord ? { ...byWord, discriminating } : UNKNOWN;\n}\n"]}
@@ -1,4 +1,4 @@
1
- import { primarySubtag } from './chunk-OVSPOZ5J.js';
1
+ import { primarySubtag } from './chunk-YCUSX3GG.js';
2
2
 
3
3
  // src/headers.ts
4
4
  function evidenceFromHeaders(headers) {
@@ -32,5 +32,5 @@ function isHeaders(headers) {
32
32
  }
33
33
 
34
34
  export { evidenceFromHeaders };
35
- //# sourceMappingURL=chunk-3LDE35U2.js.map
36
- //# sourceMappingURL=chunk-3LDE35U2.js.map
35
+ //# sourceMappingURL=chunk-FEKBPTHK.js.map
36
+ //# sourceMappingURL=chunk-FEKBPTHK.js.map
@@ -1 +1 @@
1
- {"version":3,"sources":["../src/headers.ts"],"names":[],"mappings":";;;AAIO,SAAS,oBAAoB,OAAA,EAAoD;AACtF,EAAA,IAAI,OAAA,KAAY,MAAA,EAAW,OAAO,EAAC;AAEnC,EAAA,MAAM,KAAA,GAAQ,SAAA,CAAU,OAAA,EAAS,kBAAkB,CAAA;AACnD,EAAA,MAAM,IAAA,GAAO,cAAc,KAAK,CAAA;AAChC,EAAA,IAAI,IAAA,KAAS,IAAA,EAAM,OAAO,EAAC;AAE3B,EAAA,OAAO;AAAA,IACL;AAAA,MACE,IAAA,EAAM,uBAAA;AAAA,MACN,QAAA,EAAU,IAAA;AAAA,MACV,UAAA,EAAY,GAAA;AAAA,MACZ,MAAA,EAAQ,uBAAA;AAAA,MACR,OAAO,KAAA,IAAS;AAAA;AAClB,GACF;AACF;AAEA,SAAS,SAAA,CAAU,SAAoB,IAAA,EAAkC;AACvE,EAAA,IAAI,SAAA,CAAU,OAAO,CAAA,EAAG;AACtB,IAAA,OAAO,OAAA,CAAQ,GAAA,CAAI,IAAI,CAAA,IAAK,MAAA;AAAA,EAC9B;AACA,EAAA,KAAA,MAAW,CAAC,GAAA,EAAK,KAAK,KAAK,MAAA,CAAO,OAAA,CAAQ,OAAO,CAAA,EAAG;AAClD,IAAA,IAAI,GAAA,CAAI,WAAA,EAAY,KAAM,IAAA,EAAM;AAChC,IAAA,IAAI,MAAM,OAAA,CAAQ,KAAK,GAAG,OAAO,KAAA,CAAM,KAAK,GAAG,CAAA;AAC/C,IAAA,OAAO,KAAA,IAAS,MAAA;AAAA,EAClB;AACA,EAAA,OAAO,MAAA;AACT;AAEA,SAAS,UAAU,OAAA,EAAwC;AACzD,EAAA,OAAO,OAAO,OAAA,KAAY,WAAA,IAAe,OAAA,YAAmB,OAAA;AAC9D","file":"chunk-3LDE35U2.js","sourcesContent":["import type { HeaderBag, LanguageEvidence } from \"./types.js\";\nimport { primarySubtag } from \"./internal/bcp47.js\";\n\n/** Producer: the HTTP `Content-Language` response header. */\nexport function evidenceFromHeaders(headers: HeaderBag | undefined): LanguageEvidence[] {\n if (headers === undefined) return [];\n\n const value = getHeader(headers, \"content-language\");\n const lang = primarySubtag(value);\n if (lang === null) return [];\n\n return [\n {\n kind: \"http-content-language\",\n language: lang,\n confidence: 0.8,\n source: \"http-content-language\",\n value: value ?? \"\",\n },\n ];\n}\n\nfunction getHeader(headers: HeaderBag, name: string): string | undefined {\n if (isHeaders(headers)) {\n return headers.get(name) ?? undefined;\n }\n for (const [key, value] of Object.entries(headers)) {\n if (key.toLowerCase() !== name) continue;\n if (Array.isArray(value)) return value.join(\",\");\n return value ?? 
undefined;\n }\n return undefined;\n}\n\nfunction isHeaders(headers: HeaderBag): headers is Headers {\n return typeof Headers !== \"undefined\" && headers instanceof Headers;\n}\n"]}
1
+ {"version":3,"sources":["../src/headers.ts"],"names":[],"mappings":";;;AAIO,SAAS,oBAAoB,OAAA,EAAoD;AACtF,EAAA,IAAI,OAAA,KAAY,MAAA,EAAW,OAAO,EAAC;AAEnC,EAAA,MAAM,KAAA,GAAQ,SAAA,CAAU,OAAA,EAAS,kBAAkB,CAAA;AACnD,EAAA,MAAM,IAAA,GAAO,cAAc,KAAK,CAAA;AAChC,EAAA,IAAI,IAAA,KAAS,IAAA,EAAM,OAAO,EAAC;AAE3B,EAAA,OAAO;AAAA,IACL;AAAA,MACE,IAAA,EAAM,uBAAA;AAAA,MACN,QAAA,EAAU,IAAA;AAAA,MACV,UAAA,EAAY,GAAA;AAAA,MACZ,MAAA,EAAQ,uBAAA;AAAA,MACR,OAAO,KAAA,IAAS;AAAA;AAClB,GACF;AACF;AAEA,SAAS,SAAA,CAAU,SAAoB,IAAA,EAAkC;AACvE,EAAA,IAAI,SAAA,CAAU,OAAO,CAAA,EAAG;AACtB,IAAA,OAAO,OAAA,CAAQ,GAAA,CAAI,IAAI,CAAA,IAAK,MAAA;AAAA,EAC9B;AACA,EAAA,KAAA,MAAW,CAAC,GAAA,EAAK,KAAK,KAAK,MAAA,CAAO,OAAA,CAAQ,OAAO,CAAA,EAAG;AAClD,IAAA,IAAI,GAAA,CAAI,WAAA,EAAY,KAAM,IAAA,EAAM;AAChC,IAAA,IAAI,MAAM,OAAA,CAAQ,KAAK,GAAG,OAAO,KAAA,CAAM,KAAK,GAAG,CAAA;AAC/C,IAAA,OAAO,KAAA,IAAS,MAAA;AAAA,EAClB;AACA,EAAA,OAAO,MAAA;AACT;AAEA,SAAS,UAAU,OAAA,EAAwC;AACzD,EAAA,OAAO,OAAO,OAAA,KAAY,WAAA,IAAe,OAAA,YAAmB,OAAA;AAC9D","file":"chunk-FEKBPTHK.js","sourcesContent":["import type { HeaderBag, LanguageEvidence } from \"./types.js\";\nimport { primarySubtag } from \"./internal/bcp47.js\";\n\n/** Producer: the HTTP `Content-Language` response header. */\nexport function evidenceFromHeaders(headers: HeaderBag | undefined): LanguageEvidence[] {\n if (headers === undefined) return [];\n\n const value = getHeader(headers, \"content-language\");\n const lang = primarySubtag(value);\n if (lang === null) return [];\n\n return [\n {\n kind: \"http-content-language\",\n language: lang,\n confidence: 0.8,\n source: \"http-content-language\",\n value: value ?? \"\",\n },\n ];\n}\n\nfunction getHeader(headers: HeaderBag, name: string): string | undefined {\n if (isHeaders(headers)) {\n return headers.get(name) ?? undefined;\n }\n for (const [key, value] of Object.entries(headers)) {\n if (key.toLowerCase() !== name) continue;\n if (Array.isArray(value)) return value.join(\",\");\n return value ?? 
undefined;\n }\n return undefined;\n}\n\nfunction isHeaders(headers: HeaderBag): headers is Headers {\n return typeof Headers !== \"undefined\" && headers instanceof Headers;\n}\n"]}
@@ -1,4 +1,4 @@
1
- import { normalizeBCP47 } from './chunk-OVSPOZ5J.js';
1
+ import { normalizeBCP47 } from './chunk-YCUSX3GG.js';
2
2
 
3
3
  // src/html.ts
4
4
  function evidenceFromHtml(html) {
@@ -23,5 +23,5 @@ function pushTag(out, kind, confidence, raw) {
23
23
  }
24
24
 
25
25
  export { evidenceFromHtml };
26
- //# sourceMappingURL=chunk-KI4MAI3N.js.map
27
- //# sourceMappingURL=chunk-KI4MAI3N.js.map
26
+ //# sourceMappingURL=chunk-K4MXTIY7.js.map
27
+ //# sourceMappingURL=chunk-K4MXTIY7.js.map
@@ -1 +1 @@
1
- {"version":3,"sources":["../src/html.ts"],"names":[],"mappings":";;;AAeO,SAAS,iBAAiB,IAAA,EAA8C;AAC7E,EAAA,IAAI,IAAA,KAAS,UAAa,IAAA,CAAK,IAAA,GAAO,MAAA,KAAW,CAAA,SAAU,EAAC;AAE5D,EAAA,MAAM,MAA0B,EAAC;AAEjC,EAAA,MAAM,QAAA,GAAW,sCAAA,CAAuC,IAAA,CAAK,IAAI,IAAI,CAAC,CAAA;AACtE,EAAA,OAAA,CAAQ,GAAA,EAAK,WAAA,EAAa,GAAA,EAAK,QAAQ,CAAA;AAGvC,EAAA,MAAM,kBACJ,qFAAA,CAAsF,IAAA;AAAA,IACpF;AAAA,GACF,GAAI,CAAC,CAAA,IACL,qFAAA,CAAsF,IAAA;AAAA,IACpF;AAAA,MACE,CAAC,CAAA;AACP,EAAA,OAAA,CAAQ,GAAA,EAAK,uBAAA,EAAyB,GAAA,EAAK,eAAe,CAAA;AAG1D,EAAA,MAAM,QAAA,GACJ,4EAAA,CAA6E,IAAA,CAAK,IAAI,CAAA,GAAI,CAAC,CAAA,IAC3F,4EAAA,CAA6E,IAAA,CAAK,IAAI,CAAA,GAAI,CAAC,CAAA;AAC7F,EAAA,OAAA,CAAQ,GAAA,EAAK,gBAAA,EAAkB,GAAA,EAAK,QAAQ,CAAA;AAE5C,EAAA,OAAO,GAAA;AACT;AAEA,SAAS,OAAA,CACP,GAAA,EACA,IAAA,EACA,UAAA,EACA,GAAA,EACM;AACN,EAAA,MAAM,IAAA,GAAO,eAAe,GAAG,CAAA;AAC/B,EAAA,IAAI,SAAS,IAAA,EAAM;AACnB,EAAA,GAAA,CAAI,IAAA,CAAK,EAAE,IAAA,EAAM,QAAA,EAAU,IAAA,EAAM,UAAA,EAAY,MAAA,EAAQ,IAAA,EAAM,KAAA,EAAO,GAAA,IAAO,EAAA,EAAI,CAAA;AAC/E","file":"chunk-KI4MAI3N.js","sourcesContent":["import type { LanguageEvidence } from \"./types.js\";\nimport { normalizeBCP47 } from \"./internal/bcp47.js\";\n\n/**\n * Producer: language clues from an HTML string's metadata.\n *\n * Reads three independent declarations, each emitted as its own evidence item\n * (the fuser weighs them):\n * - `<html lang>` → `html-lang`\n * - `<meta http-equiv=\"content-language\">` → `meta-content-language`\n * - `<meta property=\"og:locale\">` → `meta-og-locale`\n *\n * All tags are BCP-47-normalized (`uk-UA` → `uk`, `en_US` → `en`). 
Sync and\n * zero-dependency — regex extraction only, never a DOM parse.\n */\nexport function evidenceFromHtml(html: string | undefined): LanguageEvidence[] {\n if (html === undefined || html.trim().length === 0) return [];\n\n const out: LanguageEvidence[] = [];\n\n const htmlLang = /<html\\b[^>]*\\blang=[\"']?([^\"'\\s>]+)/i.exec(html)?.[1];\n pushTag(out, \"html-lang\", 0.7, htmlLang);\n\n // <meta http-equiv=\"content-language\" content=\"uk\"> (attribute order varies).\n const metaContentLang =\n /<meta\\b[^>]*\\bhttp-equiv=[\"']?content-language[\"']?[^>]*\\bcontent=[\"']?([^\"'\\s>]+)/i.exec(\n html,\n )?.[1] ??\n /<meta\\b[^>]*\\bcontent=[\"']?([^\"'\\s>]+)[\"']?[^>]*\\bhttp-equiv=[\"']?content-language/i.exec(\n html,\n )?.[1];\n pushTag(out, \"meta-content-language\", 0.6, metaContentLang);\n\n // <meta property=\"og:locale\" content=\"uk_UA\"> (attribute order varies).\n const ogLocale =\n /<meta\\b[^>]*\\bproperty=[\"']?og:locale[\"']?[^>]*\\bcontent=[\"']?([^\"'\\s>]+)/i.exec(html)?.[1] ??\n /<meta\\b[^>]*\\bcontent=[\"']?([^\"'\\s>]+)[\"']?[^>]*\\bproperty=[\"']?og:locale/i.exec(html)?.[1];\n pushTag(out, \"meta-og-locale\", 0.6, ogLocale);\n\n return out;\n}\n\nfunction pushTag(\n out: LanguageEvidence[],\n kind: \"html-lang\" | \"meta-content-language\" | \"meta-og-locale\",\n confidence: number,\n raw: string | undefined,\n): void {\n const lang = normalizeBCP47(raw);\n if (lang === null) return;\n out.push({ kind, language: lang, confidence, source: kind, value: raw ?? \"\" });\n}\n"]}
1
+ {"version":3,"sources":["../src/html.ts"],"names":[],"mappings":";;;AAeO,SAAS,iBAAiB,IAAA,EAA8C;AAC7E,EAAA,IAAI,IAAA,KAAS,UAAa,IAAA,CAAK,IAAA,GAAO,MAAA,KAAW,CAAA,SAAU,EAAC;AAE5D,EAAA,MAAM,MAA0B,EAAC;AAEjC,EAAA,MAAM,QAAA,GAAW,sCAAA,CAAuC,IAAA,CAAK,IAAI,IAAI,CAAC,CAAA;AACtE,EAAA,OAAA,CAAQ,GAAA,EAAK,WAAA,EAAa,GAAA,EAAK,QAAQ,CAAA;AAGvC,EAAA,MAAM,kBACJ,qFAAA,CAAsF,IAAA;AAAA,IACpF;AAAA,GACF,GAAI,CAAC,CAAA,IACL,qFAAA,CAAsF,IAAA;AAAA,IACpF;AAAA,MACE,CAAC,CAAA;AACP,EAAA,OAAA,CAAQ,GAAA,EAAK,uBAAA,EAAyB,GAAA,EAAK,eAAe,CAAA;AAG1D,EAAA,MAAM,QAAA,GACJ,4EAAA,CAA6E,IAAA,CAAK,IAAI,CAAA,GAAI,CAAC,CAAA,IAC3F,4EAAA,CAA6E,IAAA,CAAK,IAAI,CAAA,GAAI,CAAC,CAAA;AAC7F,EAAA,OAAA,CAAQ,GAAA,EAAK,gBAAA,EAAkB,GAAA,EAAK,QAAQ,CAAA;AAE5C,EAAA,OAAO,GAAA;AACT;AAEA,SAAS,OAAA,CACP,GAAA,EACA,IAAA,EACA,UAAA,EACA,GAAA,EACM;AACN,EAAA,MAAM,IAAA,GAAO,eAAe,GAAG,CAAA;AAC/B,EAAA,IAAI,SAAS,IAAA,EAAM;AACnB,EAAA,GAAA,CAAI,IAAA,CAAK,EAAE,IAAA,EAAM,QAAA,EAAU,IAAA,EAAM,UAAA,EAAY,MAAA,EAAQ,IAAA,EAAM,KAAA,EAAO,GAAA,IAAO,EAAA,EAAI,CAAA;AAC/E","file":"chunk-K4MXTIY7.js","sourcesContent":["import type { LanguageEvidence } from \"./types.js\";\nimport { normalizeBCP47 } from \"./internal/bcp47.js\";\n\n/**\n * Producer: language clues from an HTML string's metadata.\n *\n * Reads three independent declarations, each emitted as its own evidence item\n * (the fuser weighs them):\n * - `<html lang>` → `html-lang`\n * - `<meta http-equiv=\"content-language\">` → `meta-content-language`\n * - `<meta property=\"og:locale\">` → `meta-og-locale`\n *\n * All tags are BCP-47-normalized (`uk-UA` → `uk`, `en_US` → `en`). 
Sync and\n * zero-dependency — regex extraction only, never a DOM parse.\n */\nexport function evidenceFromHtml(html: string | undefined): LanguageEvidence[] {\n if (html === undefined || html.trim().length === 0) return [];\n\n const out: LanguageEvidence[] = [];\n\n const htmlLang = /<html\\b[^>]*\\blang=[\"']?([^\"'\\s>]+)/i.exec(html)?.[1];\n pushTag(out, \"html-lang\", 0.7, htmlLang);\n\n // <meta http-equiv=\"content-language\" content=\"uk\"> (attribute order varies).\n const metaContentLang =\n /<meta\\b[^>]*\\bhttp-equiv=[\"']?content-language[\"']?[^>]*\\bcontent=[\"']?([^\"'\\s>]+)/i.exec(\n html,\n )?.[1] ??\n /<meta\\b[^>]*\\bcontent=[\"']?([^\"'\\s>]+)[\"']?[^>]*\\bhttp-equiv=[\"']?content-language/i.exec(\n html,\n )?.[1];\n pushTag(out, \"meta-content-language\", 0.6, metaContentLang);\n\n // <meta property=\"og:locale\" content=\"uk_UA\"> (attribute order varies).\n const ogLocale =\n /<meta\\b[^>]*\\bproperty=[\"']?og:locale[\"']?[^>]*\\bcontent=[\"']?([^\"'\\s>]+)/i.exec(html)?.[1] ??\n /<meta\\b[^>]*\\bcontent=[\"']?([^\"'\\s>]+)[\"']?[^>]*\\bproperty=[\"']?og:locale/i.exec(html)?.[1];\n pushTag(out, \"meta-og-locale\", 0.6, ogLocale);\n\n return out;\n}\n\nfunction pushTag(\n out: LanguageEvidence[],\n kind: \"html-lang\" | \"meta-content-language\" | \"meta-og-locale\",\n confidence: number,\n raw: string | undefined,\n): void {\n const lang = normalizeBCP47(raw);\n if (lang === null) return;\n out.push({ kind, language: lang, confidence, source: kind, value: raw ?? \"\" });\n}\n"]}
@@ -54,6 +54,8 @@ var ALIASES = {
54
54
  "po polsku": "pl",
55
55
  polish: "pl",
56
56
  \u043F\u043E\u043B\u044C\u0441\u044C\u043A\u0430: "pl",
57
+ "\u043F\u043E\u043B\u044C\u0441\u044C\u043A\u0430 \u043C\u043E\u0432\u0430": "pl",
58
+ "\u043F\u043E-\u043F\u043E\u043B\u044C\u0441\u044C\u043A\u0438": "pl",
57
59
  // German
58
60
  de: "de",
59
61
  deu: "de",
@@ -62,6 +64,8 @@ var ALIASES = {
62
64
  "auf deutsch": "de",
63
65
  german: "de",
64
66
  \u043D\u0456\u043C\u0435\u0446\u044C\u043A\u0430: "de",
67
+ "\u043D\u0456\u043C\u0435\u0446\u044C\u043A\u0430 \u043C\u043E\u0432\u0430": "de",
68
+ "\u043F\u043E-\u043D\u0456\u043C\u0435\u0446\u044C\u043A\u0438": "de",
65
69
  // French
66
70
  fr: "fr",
67
71
  fra: "fr",
@@ -70,6 +74,8 @@ var ALIASES = {
70
74
  "en fran\xE7ais": "fr",
71
75
  french: "fr",
72
76
  \u0444\u0440\u0430\u043D\u0446\u0443\u0437\u044C\u043A\u0430: "fr",
77
+ "\u0444\u0440\u0430\u043D\u0446\u0443\u0437\u044C\u043A\u0430 \u043C\u043E\u0432\u0430": "fr",
78
+ "\u043F\u043E-\u0444\u0440\u0430\u043D\u0446\u0443\u0437\u044C\u043A\u0438": "fr",
73
79
  // Spanish
74
80
  es: "es",
75
81
  spa: "es",
@@ -78,13 +84,17 @@ var ALIASES = {
78
84
  "en espa\xF1ol": "es",
79
85
  spanish: "es",
80
86
  \u0456\u0441\u043F\u0430\u043D\u0441\u044C\u043A\u0430: "es",
87
+ "\u0456\u0441\u043F\u0430\u043D\u0441\u044C\u043A\u0430 \u043C\u043E\u0432\u0430": "es",
88
+ "\u043F\u043E-\u0456\u0441\u043F\u0430\u043D\u0441\u044C\u043A\u0438": "es",
81
89
  // Italian
82
90
  it: "it",
83
91
  ita: "it",
84
92
  italiano: "it",
85
93
  "in italiano": "it",
86
94
  italian: "it",
87
- \u0456\u0442\u0430\u043B\u0456\u0439\u0441\u044C\u043A\u0430: "it"
95
+ \u0456\u0442\u0430\u043B\u0456\u0439\u0441\u044C\u043A\u0430: "it",
96
+ "\u0456\u0442\u0430\u043B\u0456\u0439\u0441\u044C\u043A\u0430 \u043C\u043E\u0432\u0430": "it",
97
+ "\u043F\u043E-\u0456\u0442\u0430\u043B\u0456\u0439\u0441\u044C\u043A\u0438": "it"
88
98
  };
89
99
  function normalizeLanguageCode(input) {
90
100
  if (input === void 0 || input === null) return null;
@@ -92,7 +102,7 @@ function normalizeLanguageCode(input) {
92
102
  if (cleaned.length === 0) return null;
93
103
  return ALIASES[cleaned] ?? null;
94
104
  }
95
- function normalizeBCP47(input) {
105
+ function normalizeBCP47(input, options) {
96
106
  if (input === void 0 || input === null) return null;
97
107
  const cleaned = input.trim().toLowerCase().replace(/_/g, "-");
98
108
  if (cleaned.length === 0) return null;
@@ -100,7 +110,9 @@ function normalizeBCP47(input) {
100
110
  if (direct !== void 0) return direct;
101
111
  const head = cleaned.split("-")[0];
102
112
  if (head === void 0 || head.length === 0) return null;
103
- return ALIASES[head] ?? head;
113
+ const aliased = ALIASES[head];
114
+ if (aliased !== void 0) return aliased;
115
+ return options?.unknownHead === "null" ? null : head;
104
116
  }
105
117
  function primarySubtag(value) {
106
118
  if (value === void 0 || value === null) return null;
@@ -111,5 +123,5 @@ function primarySubtag(value) {
111
123
  }
112
124
 
113
125
  export { normalizeBCP47, normalizeLanguageCode, primarySubtag };
114
- //# sourceMappingURL=chunk-OVSPOZ5J.js.map
115
- //# sourceMappingURL=chunk-OVSPOZ5J.js.map
126
+ //# sourceMappingURL=chunk-YCUSX3GG.js.map
127
+ //# sourceMappingURL=chunk-YCUSX3GG.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/internal/bcp47.ts"],"names":[],"mappings":";AAuBA,IAAM,OAAA,GAAkC;AAAA;AAAA,EAEtC,EAAA,EAAI,IAAA;AAAA,EACJ,EAAA,EAAI,IAAA;AAAA,EACJ,kBAAA,EAAK,IAAA;AAAA,EACL,4DAAA,EAAY,IAAA;AAAA,EACZ,kEAAA,EAAa,IAAA;AAAA,EACb,uFAAA,EAAmB,IAAA;AAAA,EACnB,iFAAA,EAAkB,IAAA;AAAA,EAClB,mGAAA,EAAqB,IAAA;AAAA,EACrB,SAAA,EAAW,IAAA;AAAA,EACX,cAAA,EAAgB,IAAA;AAAA;AAAA,EAGhB,EAAA,EAAI,IAAA;AAAA,EACJ,GAAA,EAAK,IAAA;AAAA,EACL,kBAAA,EAAK,IAAA;AAAA,EACL,0CAAA,EAAS,IAAA;AAAA,EACT,mDAAA,EAAa,IAAA;AAAA,EACb,mDAAA,EAAa,IAAA;AAAA,EACb,qEAAA,EAAgB,IAAA;AAAA,EAChB,yDAAA,EAAc,IAAA;AAAA,EACd,OAAA,EAAS,IAAA;AAAA,EACT,YAAA,EAAc,IAAA;AAAA,EACd,sDAAA,EAAW,IAAA;AAAA,EACX,iFAAA,EAAkB,IAAA;AAAA,EAClB,qEAAA,EAAgB,IAAA;AAAA,EAChB,qEAAA,EAAgB,IAAA;AAAA;AAAA,EAGhB,EAAA,EAAI,IAAA;AAAA,EACJ,GAAA,EAAK,IAAA;AAAA,EACL,4DAAA,EAAY,IAAA;AAAA,EACZ,uFAAA,EAAmB,IAAA;AAAA,EACnB,UAAA,EAAY,IAAA;AAAA,EACZ,eAAA,EAAiB,IAAA;AAAA;AAAA,EAGjB,EAAA,EAAI,IAAA;AAAA,EACJ,GAAA,EAAK,IAAA;AAAA,EACL,sDAAA,EAAW,IAAA;AAAA,EACX,iFAAA,EAAkB,IAAA;AAAA,EAClB,SAAA,EAAW,IAAA;AAAA,EACX,cAAA,EAAgB,IAAA;AAAA;AAAA,EAGhB,EAAA,EAAI,IAAA;AAAA,EACJ,GAAA,EAAK,IAAA;AAAA,EACL,OAAA,EAAS,IAAA;AAAA,EACT,YAAA,EAAc,IAAA;AAAA,EACd,4DAAA,EAAY,IAAA;AAAA,EACZ,4DAAA,EAAY,IAAA;AAAA;AAAA,EAGZ,EAAA,EAAI,IAAA;AAAA,EACJ,GAAA,EAAK,IAAA;AAAA,EACL,MAAA,EAAQ,IAAA;AAAA,EACR,WAAA,EAAa,IAAA;AAAA,EACb,MAAA,EAAQ,IAAA;AAAA,EACR,gDAAA,EAAU,IAAA;AAAA,EACV,2EAAA,EAAiB,IAAA;AAAA,EACjB,+DAAA,EAAe,IAAA;AAAA;AAAA,EAGf,EAAA,EAAI,IAAA;AAAA,EACJ,GAAA,EAAK,IAAA;AAAA,EACL,GAAA,EAAK,IAAA;AAAA,EACL,OAAA,EAAS,IAAA;AAAA,EACT,aAAA,EAAe,IAAA;AAAA,EACf,MAAA,EAAQ,IAAA;AAAA,EACR,gDAAA,EAAU,IAAA;AAAA,EACV,2EAAA,EAAiB,IAAA;AAAA,EACjB,+DAAA,EAAe,IAAA;AAAA;AAAA,EAGf,EAAA,EAAI,IAAA;AAAA,EACJ,GAAA,EAAK,IAAA;AAAA,EACL,aAAA,EAAU,IAAA;AAAA,EACV,QAAA,EAAU,IAAA;AAAA,EACV,gBAAA,EAAe,IAAA;AAAA,EACf,MAAA,EAAQ,IAAA;AAAA,EACR,4DAAA,EAAY,IAAA;AAAA,EACZ,uFAAA,EAAmB,IAAA;AAAA,EACnB,2EAAA,EAAiB,IAAA;AAAA;AAAA,EAGjB,EAAA,EAAI,IAAA;AAAA,EACJ,GAAA,EAAK,IAAA;AAAA,EACL,YAAA,EAAS,IAAA;AAAA,EACT,OAAA,
EAAS,IAAA;AAAA,EACT,eAAA,EAAc,IAAA;AAAA,EACd,OAAA,EAAS,IAAA;AAAA,EACT,sDAAA,EAAW,IAAA;AAAA,EACX,iFAAA,EAAkB,IAAA;AAAA,EAClB,qEAAA,EAAgB,IAAA;AAAA;AAAA,EAGhB,EAAA,EAAI,IAAA;AAAA,EACJ,GAAA,EAAK,IAAA;AAAA,EACL,QAAA,EAAU,IAAA;AAAA,EACV,aAAA,EAAe,IAAA;AAAA,EACf,OAAA,EAAS,IAAA;AAAA,EACT,4DAAA,EAAY,IAAA;AAAA,EACZ,uFAAA,EAAmB,IAAA;AAAA,EACnB,2EAAA,EAAiB;AACnB,CAAA;AASO,SAAS,sBAAsB,KAAA,EAAiD;AACrF,EAAA,IAAI,KAAA,KAAU,MAAA,IAAa,KAAA,KAAU,IAAA,EAAM,OAAO,IAAA;AAClD,EAAA,MAAM,OAAA,GAAU,KAAA,CAAM,IAAA,EAAK,CAAE,WAAA,EAAY;AACzC,EAAA,IAAI,OAAA,CAAQ,MAAA,KAAW,CAAA,EAAG,OAAO,IAAA;AACjC,EAAA,OAAO,OAAA,CAAQ,OAAO,CAAA,IAAK,IAAA;AAC7B;AA8BO,SAAS,cAAA,CACd,OACA,OAAA,EACe;AACf,EAAA,IAAI,KAAA,KAAU,MAAA,IAAa,KAAA,KAAU,IAAA,EAAM,OAAO,IAAA;AAClD,EAAA,MAAM,OAAA,GAAU,MAAM,IAAA,EAAK,CAAE,aAAY,CAAE,OAAA,CAAQ,MAAM,GAAG,CAAA;AAC5D,EAAA,IAAI,OAAA,CAAQ,MAAA,KAAW,CAAA,EAAG,OAAO,IAAA;AACjC,EAAA,MAAM,MAAA,GAAS,QAAQ,OAAO,CAAA;AAC9B,EAAA,IAAI,MAAA,KAAW,QAAW,OAAO,MAAA;AACjC,EAAA,MAAM,IAAA,GAAO,OAAA,CAAQ,KAAA,CAAM,GAAG,EAAE,CAAC,CAAA;AACjC,EAAA,IAAI,IAAA,KAAS,MAAA,IAAa,IAAA,CAAK,MAAA,KAAW,GAAG,OAAO,IAAA;AACpD,EAAA,MAAM,OAAA,GAAU,QAAQ,IAAI,CAAA;AAC5B,EAAA,IAAI,OAAA,KAAY,QAAW,OAAO,OAAA;AAClC,EAAA,OAAO,OAAA,EAAS,WAAA,KAAgB,MAAA,GAAS,IAAA,GAAO,IAAA;AAClD;AAWO,SAAS,cAAc,KAAA,EAAiD;AAC7E,EAAA,IAAI,KAAA,KAAU,MAAA,IAAa,KAAA,KAAU,IAAA,EAAM,OAAO,IAAA;AAClD,EAAA,MAAM,QAAQ,KAAA,CAAM,KAAA,CAAM,GAAG,CAAA,CAAE,CAAC,GAAG,IAAA,EAAK;AACxC,EAAA,IAAI,KAAA,KAAU,MAAA,IAAa,KAAA,CAAM,MAAA,KAAW,GAAG,OAAO,IAAA;AAEtD,EAAA,MAAM,MAAM,KAAA,CAAM,KAAA,CAAM,GAAG,CAAA,CAAE,CAAC,GAAG,IAAA,EAAK;AACtC,EAAA,OAAO,eAAe,GAAG,CAAA;AAC3B","file":"chunk-YCUSX3GG.js","sourcesContent":["/**\n * BCP-47 / language-code normalization.\n *\n * Two entry points with deliberately different strictness:\n * - {@link normalizeBCP47} — for inputs documented to be BCP-47 (`<html lang>`,\n * hreflang, `Content-Language`): try the full string, then strip a\n * region/script suffix (`en-US` → `en`, `zh_CN` → `zh`).\n * - {@link normalizeLanguageCode} — strict 
exact-match only, for free-text\n * contexts (URL slugs, link text) where a hyphen split could be a coincidence.\n *\n * Both resolve aliases that appear in the wild (`ua` → `uk`, `rus` → `ru`,\n * localized picker phrases) to a canonical ISO 639-1 code.\n */\n\n/**\n * Aliases mapped to canonical ISO 639-1 codes. Keys are lowercased.\n *\n * Ukrainian is the load-bearing case: most sites use `ua` in URLs even though\n * the ISO code is `uk`. Both are accepted on input; `uk` is always output.\n *\n * Includes localized phrases users see in language pickers (`українською`,\n * `по-русски`, `in english`, …).\n */\nconst ALIASES: Record<string, string> = {\n // Ukrainian\n ua: \"uk\",\n uk: \"uk\",\n укр: \"uk\",\n українська: \"uk\",\n українською: \"uk\",\n \"українська мова\": \"uk\",\n \"на українській\": \"uk\",\n \"українською мовою\": \"uk\",\n ukrainian: \"uk\",\n \"in ukrainian\": \"uk\",\n\n // Russian\n ru: \"ru\",\n rus: \"ru\",\n рус: \"ru\",\n русский: \"ru\",\n \"по-русски\": \"ru\",\n \"по русски\": \"ru\",\n \"русский язык\": \"ru\",\n \"на русском\": \"ru\",\n russian: \"ru\",\n \"in russian\": \"ru\",\n російська: \"ru\",\n \"російська мова\": \"ru\",\n \"по-російськи\": \"ru\",\n \"по російськи\": \"ru\",\n\n // Belarusian\n be: \"be\",\n bel: \"be\",\n беларуская: \"be\",\n \"беларуская мова\": \"be\",\n belarusian: \"be\",\n \"in belarusian\": \"be\",\n\n // Bulgarian\n bg: \"bg\",\n bul: \"bg\",\n български: \"bg\",\n \"български език\": \"bg\",\n bulgarian: \"bg\",\n \"in bulgarian\": \"bg\",\n\n // English\n en: \"en\",\n eng: \"en\",\n english: \"en\",\n \"in english\": \"en\",\n англійська: \"en\",\n английский: \"en\",\n\n // Polish\n pl: \"pl\",\n pol: \"pl\",\n polski: \"pl\",\n \"po polsku\": \"pl\",\n polish: \"pl\",\n польська: \"pl\",\n \"польська мова\": \"pl\",\n \"по-польськи\": \"pl\",\n\n // German\n de: \"de\",\n deu: \"de\",\n ger: \"de\",\n deutsch: \"de\",\n \"auf deutsch\": \"de\",\n german: \"de\",\n німецька: \"de\",\n 
\"німецька мова\": \"de\",\n \"по-німецьки\": \"de\",\n\n // French\n fr: \"fr\",\n fra: \"fr\",\n français: \"fr\",\n francais: \"fr\",\n \"en français\": \"fr\",\n french: \"fr\",\n французька: \"fr\",\n \"французька мова\": \"fr\",\n \"по-французьки\": \"fr\",\n\n // Spanish\n es: \"es\",\n spa: \"es\",\n español: \"es\",\n espanol: \"es\",\n \"en español\": \"es\",\n spanish: \"es\",\n іспанська: \"es\",\n \"іспанська мова\": \"es\",\n \"по-іспанськи\": \"es\",\n\n // Italian\n it: \"it\",\n ita: \"it\",\n italiano: \"it\",\n \"in italiano\": \"it\",\n italian: \"it\",\n італійська: \"it\",\n \"італійська мова\": \"it\",\n \"по-італійськи\": \"it\",\n};\n\n/**\n * Strict, exact-match lookup. Returns `null` for unknown inputs and does NOT\n * fall back to a hyphen prefix. Use anywhere a hyphen split could be a\n * coincidence — URL path segments (`/ru-return-warranty`), title attrs, link\n * text. The phrase aliases (`по-русски`, `in english`) are in the table\n * directly, so exact lookup still finds them.\n */\nexport function normalizeLanguageCode(input: string | undefined | null): string | null {\n if (input === undefined || input === null) return null;\n const cleaned = input.trim().toLowerCase();\n if (cleaned.length === 0) return null;\n return ALIASES[cleaned] ?? null;\n}\n\n/** Options for {@link normalizeBCP47}. */\nexport interface NormalizeBCP47Options {\n /**\n * What to return when the input's primary subtag is not in the alias table.\n * - `\"subtag\"` (default) — pass the raw primary subtag through, so a code\n * outside the table still resolves to its language (`pt-BR` → `pt`,\n * `sv` → `sv`). Best for a permissive normalizer whose roster decides\n * relevance downstream.\n * - `\"null\"` — treat an unknown head as unsupported and return `null`. 
Best\n * for callers that gate on a fixed alias set and read `null` as \"not a\n * language I handle\".\n */\n unknownHead?: \"subtag\" | \"null\";\n}\n\n/**\n * BCP-47-aware normalization: try the full string first, then strip a\n * region/script suffix (`en-US` → `en`, `zh_CN` → `zh`). Use ONLY for inputs\n * documented to be BCP-47 — `hreflang`, `<html lang>`, `Content-Language`,\n * `data-lang`/`data-locale` — never for free-text URL slugs.\n *\n * By default, falls back to the raw primary subtag when no alias matches, so a\n * code outside the alias table (e.g. `pt-BR` → `pt`) still resolves to its\n * language; the roster decides relevance downstream. Pass\n * `{ unknownHead: \"null\" }` to instead return `null` for any tag whose head\n * isn't in the table — for callers that treat \"not in my alias set\" as\n * unsupported. The default (`\"subtag\"`) is unchanged.\n */\nexport function normalizeBCP47(\n input: string | undefined | null,\n options?: NormalizeBCP47Options,\n): string | null {\n if (input === undefined || input === null) return null;\n const cleaned = input.trim().toLowerCase().replace(/_/g, \"-\");\n if (cleaned.length === 0) return null;\n const direct = ALIASES[cleaned];\n if (direct !== undefined) return direct;\n const head = cleaned.split(\"-\")[0];\n if (head === undefined || head.length === 0) return null;\n const aliased = ALIASES[head];\n if (aliased !== undefined) return aliased;\n return options?.unknownHead === \"null\" ? null : head;\n}\n\n/**\n * Extract the primary subtag from a BCP-47-ish value, lowercased, then resolve\n * it through the alias table (`ua` → `uk`). Handles `Accept-Language`-style\n * comma lists (`en-US,en;q=0.9` → `en`). 
Returns `null` for empty/nullish.\n *\n * This is the header/HTML extraction helper: it tolerates the messy shapes those\n * sources carry (comma lists, `q` weights) where {@link normalizeBCP47} expects\n * a single tag.\n */\nexport function primarySubtag(value: string | undefined | null): string | null {\n if (value === undefined || value === null) return null;\n const first = value.split(\",\")[0]?.trim();\n if (first === undefined || first.length === 0) return null;\n // Drop a `;q=…` weight if present.\n const tag = first.split(\";\")[0]?.trim();\n return normalizeBCP47(tag);\n}\n"]}
@@ -60,6 +60,13 @@ type RungVerdict = Pick<SnippetVerdict, "language" | "margin" | "rung">;
60
60
  * scoped from its input. Defaults to {@link LanguageProfile} for callers that
61
61
  * don't narrow. */
62
62
  type Rung3Resolver<P extends LanguageProfile = LanguageProfile> = (text: string, scoped: readonly P[]) => RungVerdict | null;
63
+ /** Below this length, trigrams are too noisy to justify a rung-3 verdict. */
64
+ declare const RUNG3_MIN_LENGTH = 24;
65
+ /** Candidates whose script matches the text's dominant script (others can't tip
66
+ * the verdict). Empty when the text carries no letters. Generic over the
67
+ * concrete profile type `P`: the result is a subset of the input array, so it
68
+ * keeps `P` — a stricter caller's profiles stay strictly typed downstream. */
69
+ declare function scopeCandidates<P extends LanguageProfile>(text: string, candidates: readonly P[]): P[];
63
70
  /**
64
71
  * Classify `text` among `candidates`. Synchronous and allocation-light. Returns
65
72
  * `"unknown"` on empty evidence, on a tie inside the candidate set, or when
@@ -74,4 +81,4 @@ type Rung3Resolver<P extends LanguageProfile = LanguageProfile> = (text: string,
74
81
  */
75
82
  declare function classifyBySnippet<P extends LanguageProfile = LanguageProfile>(text: string, candidates: readonly P[], rung3?: Rung3Resolver<P>): SnippetVerdict;
76
83
 
77
- export { FRANC_RUNG, type Rung, type Rung3Resolver, type RungVerdict, type SnippetVerdict, classifyBySnippet };
84
+ export { FRANC_RUNG, RUNG3_MIN_LENGTH, type Rung, type Rung3Resolver, type RungVerdict, type SnippetVerdict, classifyBySnippet, scopeCandidates };
package/dist/classify.js CHANGED
@@ -1,3 +1,3 @@
1
- export { FRANC_RUNG, classifyBySnippet } from './chunk-U34Z3ZSV.js';
1
+ export { FRANC_RUNG, RUNG3_MIN_LENGTH, classifyBySnippet, scopeCandidates } from './chunk-BL627TWI.js';
2
2
  //# sourceMappingURL=classify.js.map
3
3
  //# sourceMappingURL=classify.js.map
@@ -0,0 +1,53 @@
1
+ /**
2
+ * `langtell/cyrillic` — a cheap, roster-free Cyrillic language fast-path.
3
+ *
4
+ * Where {@link classifyBySnippet} (`langtell/classify`) scores a snippet
5
+ * *relative to a candidate roster you pass in*, this is the opposite trade: a
6
+ * fixed, zero-config discriminator for the four Cyrillic languages langtell
7
+ * profiles — Ukrainian, Russian, Belarusian, Bulgarian — decided purely by
8
+ * letters distinctive to each, with no profiles, no tokenization, and no franc.
9
+ * Reach for it when you just need "is this Russian / is this Ukrainian?" on a
10
+ * hot path and don't want to assemble a candidate set.
11
+ *
12
+ * Each language carries letters the others (mostly) don't:
13
+ * Ukrainian — і ї є ґ
14
+ * Russian — ы ё (ъ э are shared with bg/be and handled separately)
15
+ * Belarusian — ў (uniquely Belarusian)
16
+ * Bulgarian — ъ used as a vowel in nearly every word
17
+ *
18
+ * The shared letters `ъ` and `э` need care: `подъезд` is Russian, `съм
19
+ * българин` is Bulgarian — both contain `ъ`. We disambiguate by density and
20
+ * length: a single `ъ` in a short snippet leans RU; multiple `ъ` in longer text
21
+ * leans BG. A lone `э` with no other distinctives stays `"unknown"` rather than
22
+ * silently guessing (Russian uses it in loanwords/`Это`, Belarusian in
23
+ * `гэта`/`сэрца`).
24
+ *
25
+ * Zero-dependency and side-effect-free. The cheap heuristic stays cheap; if a
26
+ * use case needs more than letter signals can give, escalate to
27
+ * {@link classifyBySnippet} or a franc-backed source.
28
+ */
29
+ /** The four Cyrillic languages this fast-path tells apart, plus the `"unknown"`
30
+ * sentinel when letter signals are insufficient. */
31
+ type CyrillicLanguage = "uk" | "ru" | "be" | "bg" | "unknown";
32
+ /** The verdict from {@link detectCyrillicLanguage}: the chosen language plus the
33
+ * raw distinctive-letter tallies that drove a uk-vs-ru decision (informational;
34
+ * `ruScore` carries the deciding count for the ru/bg fallbacks too). */
35
+ interface CyrillicVerdict {
36
+ language: CyrillicLanguage;
37
+ ukScore: number;
38
+ ruScore: number;
39
+ }
40
+ /**
41
+ * Identify the Cyrillic language of `text` by distinctive letters, returning the
42
+ * chosen language and the uk/ru tallies behind it. `"unknown"` on a uk/ru tie, when
43
+ * only an ambiguous `э` is present, or when there is too little Cyrillic to judge (fewer than 10 letters, none distinctive).
44
+ */
45
+ declare function detectCyrillicLanguage(text: string): CyrillicVerdict;
46
+ /** Convenience predicate: `true` iff {@link detectCyrillicLanguage} calls `text`
47
+ * Russian. */
48
+ declare function isRussian(text: string): boolean;
49
+ /** Convenience predicate: `true` iff {@link detectCyrillicLanguage} calls `text`
50
+ * Ukrainian. */
51
+ declare function isUkrainian(text: string): boolean;
52
+
53
+ export { type CyrillicLanguage, type CyrillicVerdict, detectCyrillicLanguage, isRussian, isUkrainian };
@@ -0,0 +1,64 @@
1
+ // src/cyrillic.ts
2
// Letter classes distinctive to each language. All carry the `g` flag so that
// `String.prototype.match` returns every occurrence, and `i` so upper-case
// forms are tallied as well.
var UK_DISTINCTIVE = /[іїєґ]/gi;
var RU_DISTINCTIVE = /[ыё]/gi;
var BE_DISTINCTIVE = /ў/gi;
// `ъ` and `э` are shared between languages, so they are tallied separately and
// only consulted after the distinctive classes above.
var HARD_SIGN = /ъ/gi;
var E_OBOROT = /э/gi;
// U+0400–U+04FF: the Cyrillic Unicode block.
var CYRILLIC = /[\u0400-\u04FF]/g;
// Minimum number of Cyrillic letters before the plain "ru" fallback fires.
var MIN_CYRILLIC_FOR_FALLBACK = 10;
// Minimum text length before two or more `ъ` are read as Bulgarian.
var MIN_LEN_FOR_BG = 10;

/**
 * Count the matches of `re` in `text`.
 *
 * @param {string} text Text to scan.
 * @param {RegExp} re Pattern to count; every caller in this module passes a
 *   global (`g`) pattern so that all occurrences are returned by `match`.
 * @returns {number} Number of matches, `0` when there are none.
 */
function count(text, re) {
  // `match` yields `null` (not an empty array) when nothing matches.
  return (text.match(re) ?? []).length;
}

/**
 * Tally every letter signal the decision cascade reads.
 *
 * @param {string} text Text to scan.
 * @returns {{ukScore: number, ruDistinctive: number, beScore: number,
 *   hardSigns: number, eOborot: number, cyrillicCount: number}} Raw counts.
 */
function countSignals(text) {
  return {
    ukScore: count(text, UK_DISTINCTIVE),
    ruDistinctive: count(text, RU_DISTINCTIVE),
    beScore: count(text, BE_DISTINCTIVE),
    hardSigns: count(text, HARD_SIGN),
    eOborot: count(text, E_OBOROT),
    cyrillicCount: count(text, CYRILLIC)
  };
}

/**
 * Identify the Cyrillic language of `text` from its distinctive letters.
 *
 * The input is normalized to NFC first. Without that, canonically decomposed
 * text (`у` + U+0306 for `ў`, `е` + U+0308 for `ё`) carries none of the
 * precomposed letters the patterns look for, so its Belarusian and Russian
 * signals would be silently dropped. Already-composed input is unaffected.
 *
 * @param {string} text Text to classify.
 * @returns {{language: "uk" | "ru" | "be" | "bg" | "unknown", ukScore: number,
 *   ruScore: number}} The verdict plus the tallies behind it. `ruScore` holds
 *   the `ъ` count on the Bulgarian and lone-`ъ` Russian branches.
 */
function detectCyrillicLanguage(text) {
  // Compose combining sequences so each letter is a single code point; the
  // same string feeds both the tallies and the length guard below.
  const normalized = text.normalize("NFC");
  const { ukScore, ruDistinctive, beScore, hardSigns, eOborot, cyrillicCount } =
    countSignals(normalized);

  // Any `ў` decides for Belarusian before the other signals are weighed.
  if (beScore > 0) {
    return { language: "be", ukScore, ruScore: ruDistinctive };
  }

  // Both uk and ru letters present: the larger tally wins, a tie is "unknown".
  if (ukScore > 0 && ruDistinctive > 0) {
    if (ukScore === ruDistinctive) {
      return { language: "unknown", ukScore, ruScore: ruDistinctive };
    }
    return {
      language: ukScore > ruDistinctive ? "uk" : "ru",
      ukScore,
      ruScore: ruDistinctive
    };
  }

  // Only ru letters (ы, ё).
  if (ruDistinctive > 0) {
    return { language: "ru", ukScore, ruScore: ruDistinctive };
  }

  // Only uk letters (і, ї, є, ґ).
  if (ukScore > 0) {
    return { language: "uk", ukScore, ruScore: 0 };
  }

  // Two or more `ъ` in text of at least MIN_LEN_FOR_BG characters: Bulgarian.
  if (hardSigns >= 2 && normalized.length >= MIN_LEN_FOR_BG) {
    return { language: "bg", ukScore: 0, ruScore: hardSigns };
  }

  // Enough Cyrillic and no `э`: fall back to Russian.
  if (cyrillicCount >= MIN_CYRILLIC_FOR_FALLBACK && eOborot === 0) {
    return { language: "ru", ukScore: 0, ruScore: 0 };
  }

  // A `ъ` with no `э` in text that did not qualify above: Russian.
  if (hardSigns > 0 && eOborot === 0) {
    return { language: "ru", ukScore: 0, ruScore: hardSigns };
  }

  return { language: "unknown", ukScore: 0, ruScore: 0 };
}

/**
 * Convenience predicate over {@link detectCyrillicLanguage}.
 *
 * @param {string} text Text to classify.
 * @returns {boolean} `true` iff the verdict's language is `"ru"`.
 */
function isRussian(text) {
  return detectCyrillicLanguage(text).language === "ru";
}

/**
 * Convenience predicate over {@link detectCyrillicLanguage}.
 *
 * @param {string} text Text to classify.
 * @returns {boolean} `true` iff the verdict's language is `"uk"`.
 */
function isUkrainian(text) {
  return detectCyrillicLanguage(text).language === "uk";
}
61
+
62
+ export { detectCyrillicLanguage, isRussian, isUkrainian };
63
+ //# sourceMappingURL=cyrillic.js.map
64
+ //# sourceMappingURL=cyrillic.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/cyrillic.ts"],"names":[],"mappings":";AAiCA,IAAM,cAAA,GAAiB,UAAA;AACvB,IAAM,cAAA,GAAiB,QAAA;AACvB,IAAM,cAAA,GAAiB,KAAA;AACvB,IAAM,SAAA,GAAY,KAAA;AAClB,IAAM,QAAA,GAAW,KAAA;AAGjB,IAAM,QAAA,GAAW,kBAAA;AAIjB,IAAM,yBAAA,GAA4B,EAAA;AAGlC,IAAM,cAAA,GAAiB,EAAA;AAWvB,SAAS,KAAA,CAAM,MAAc,EAAA,EAAoB;AAC/C,EAAA,OAAA,CAAQ,IAAA,CAAK,KAAA,CAAM,EAAE,CAAA,IAAK,EAAC,EAAG,MAAA;AAChC;AAcA,SAAS,aAAa,IAAA,EAAuB;AAC3C,EAAA,OAAO;AAAA,IACL,OAAA,EAAS,KAAA,CAAM,IAAA,EAAM,cAAc,CAAA;AAAA,IACnC,aAAA,EAAe,KAAA,CAAM,IAAA,EAAM,cAAc,CAAA;AAAA,IACzC,OAAA,EAAS,KAAA,CAAM,IAAA,EAAM,cAAc,CAAA;AAAA,IACnC,SAAA,EAAW,KAAA,CAAM,IAAA,EAAM,SAAS,CAAA;AAAA,IAChC,OAAA,EAAS,KAAA,CAAM,IAAA,EAAM,QAAQ,CAAA;AAAA,IAC7B,aAAA,EAAe,KAAA,CAAM,IAAA,EAAM,QAAQ;AAAA,GACrC;AACF;AAOO,SAAS,uBAAuB,IAAA,EAA+B;AACpE,EAAA,MAAM,EAAE,SAAS,aAAA,EAAe,OAAA,EAAS,WAAW,OAAA,EAAS,aAAA,EAAc,GAAI,YAAA,CAAa,IAAI,CAAA;AAGhG,EAAA,IAAI,UAAU,CAAA,EAAG;AACf,IAAA,OAAO,EAAE,QAAA,EAAU,IAAA,EAAM,OAAA,EAAS,SAAS,aAAA,EAAc;AAAA,EAC3D;AAIA,EAAA,IAAI,OAAA,GAAU,CAAA,IAAK,aAAA,GAAgB,CAAA,EAAG;AACpC,IAAA,IAAI,YAAY,aAAA,EAAe;AAC7B,MAAA,OAAO,EAAE,QAAA,EAAU,SAAA,EAAW,OAAA,EAAS,SAAS,aAAA,EAAc;AAAA,IAChE;AACA,IAAA,OAAO;AAAA,MACL,QAAA,EAAU,OAAA,GAAU,aAAA,GAAgB,IAAA,GAAO,IAAA;AAAA,MAC3C,OAAA;AAAA,MACA,OAAA,EAAS;AAAA,KACX;AAAA,EACF;AAGA,EAAA,IAAI,gBAAgB,CAAA,EAAG;AACrB,IAAA,OAAO,EAAE,QAAA,EAAU,IAAA,EAAM,OAAA,EAAS,SAAS,aAAA,EAAc;AAAA,EAC3D;AAGA,EAAA,IAAI,UAAU,CAAA,EAAG;AACf,IAAA,OAAO,EAAE,QAAA,EAAU,IAAA,EAAM,OAAA,EAAS,SAAS,CAAA,EAAE;AAAA,EAC/C;AAKA,EAAA,IAAI,SAAA,IAAa,CAAA,IAAK,IAAA,CAAK,MAAA,IAAU,cAAA,EAAgB;AACnD,IAAA,OAAO,EAAE,QAAA,EAAU,IAAA,EAAM,OAAA,EAAS,CAAA,EAAG,SAAS,SAAA,EAAU;AAAA,EAC1D;AAKA,EAAA,IAAI,aAAA,IAAiB,yBAAA,IAA6B,OAAA,KAAY,CAAA,EAAG;AAC/D,IAAA,OAAO,EAAE,QAAA,EAAU,IAAA,EAAM,OAAA,EAAS,CAAA,EAAG,SAAS,CAAA,EAAE;AAAA,EAClD;AAIA,EAAA,IAAI,SAAA,GAAY,CAAA,IAAK,OAAA,KAAY,CAAA,EAAG;AAClC,IAAA,OAAO,EAAE,QAAA,EAAU,IAAA,EAAM,OAAA,EAAS,CAAA,EAAG,SAAS,SAAA,EAAU;AAAA,EAC1D;AAEA,EAAA,OAAO,EAAE,QAAA,EAAU,SAAA,EAAW,OAAA,EAAS,CAAA,EAAG,SAAS
,CAAA,EAAE;AACvD;AAIO,SAAS,UAAU,IAAA,EAAuB;AAC/C,EAAA,OAAO,sBAAA,CAAuB,IAAI,CAAA,CAAE,QAAA,KAAa,IAAA;AACnD;AAIO,SAAS,YAAY,IAAA,EAAuB;AACjD,EAAA,OAAO,sBAAA,CAAuB,IAAI,CAAA,CAAE,QAAA,KAAa,IAAA;AACnD","file":"cyrillic.js","sourcesContent":["/**\n * `langtell/cyrillic` — a cheap, roster-free Cyrillic language fast-path.\n *\n * Where {@link classifyBySnippet} (`langtell/classify`) scores a snippet\n * *relative to a candidate roster you pass in*, this is the opposite trade: a\n * fixed, zero-config discriminator for the four Cyrillic languages langtell\n * profiles — Ukrainian, Russian, Belarusian, Bulgarian — decided purely by\n * letters distinctive to each, with no profiles, no tokenization, and no franc.\n * Reach for it when you just need \"is this Russian / is this Ukrainian?\" on a\n * hot path and don't want to assemble a candidate set.\n *\n * Each language carries letters the others (mostly) don't:\n * Ukrainian — і ї є ґ\n * Russian — ы ё (ъ э are shared with bg/be and handled separately)\n * Belarusian — ў (uniquely Belarusian)\n * Bulgarian — ъ used as a vowel in nearly every word\n *\n * The shared letters `ъ` and `э` need care: `подъезд` is Russian, `съм\n * българин` is Bulgarian — both contain `ъ`. We disambiguate by density and\n * length: a single `ъ` in a short snippet leans RU; multiple `ъ` in longer text\n * leans BG. A lone `э` with no other distinctives stays `\"unknown\"` rather than\n * silently guessing (Russian uses it in loanwords/`Это`, Belarusian in\n * `гэта`/`сэрца`).\n *\n * Zero-dependency and side-effect-free. The cheap heuristic stays cheap; if a\n * use case needs more than letter signals can give, escalate to\n * {@link classifyBySnippet} or a franc-backed source.\n */\n\n/** The four Cyrillic languages this fast-path tells apart, plus the `\"unknown\"`\n * sentinel when letter signals are insufficient. 
*/\nexport type CyrillicLanguage = \"uk\" | \"ru\" | \"be\" | \"bg\" | \"unknown\";\n\nconst UK_DISTINCTIVE = /[іїєґ]/gi;\nconst RU_DISTINCTIVE = /[ыё]/gi;\nconst BE_DISTINCTIVE = /ў/gi;\nconst HARD_SIGN = /ъ/gi;\nconst E_OBOROT = /э/gi;\n// U+0400–U+04FF is the Cyrillic Unicode block; written as explicit \\u escapes\n// so the range bounds are unambiguous (regexp/no-obscure-range).\nconst CYRILLIC = /[\\u0400-\\u04FF]/g;\n\n/** Minimum Cyrillic-letter count before the fallback guesses a language. Below\n * this, a short snippet (`Привет`, `Хочу`) is too ambiguous to act on. */\nconst MIN_CYRILLIC_FOR_FALLBACK = 10;\n/** Minimum text length before `ъ`-density is read as Bulgarian. Single-word\n * Russian samples like `подъезд` (7 chars) would otherwise misclassify. */\nconst MIN_LEN_FOR_BG = 10;\n\n/** The verdict from {@link detectCyrillicLanguage}: the chosen language plus the\n * raw distinctive-letter tallies that drove a uk-vs-ru decision (informational;\n * `ruScore` carries the deciding count for the ru/bg fallbacks too). */\nexport interface CyrillicVerdict {\n language: CyrillicLanguage;\n ukScore: number;\n ruScore: number;\n}\n\nfunction count(text: string, re: RegExp): number {\n return (text.match(re) ?? []).length;\n}\n\n/** Raw letter-signal tallies over `text`, one pass per distinctive class.\n * Gathered up front so {@link detectCyrillicLanguage} reads as a pure decision\n * cascade over these counts rather than interleaving counting and branching. 
*/\ninterface Signals {\n ukScore: number;\n ruDistinctive: number;\n beScore: number;\n hardSigns: number;\n eOborot: number;\n cyrillicCount: number;\n}\n\nfunction countSignals(text: string): Signals {\n return {\n ukScore: count(text, UK_DISTINCTIVE),\n ruDistinctive: count(text, RU_DISTINCTIVE),\n beScore: count(text, BE_DISTINCTIVE),\n hardSigns: count(text, HARD_SIGN),\n eOborot: count(text, E_OBOROT),\n cyrillicCount: count(text, CYRILLIC),\n };\n}\n\n/**\n * Identify the Cyrillic language of `text` by distinctive letters, returning the\n * chosen language and the uk/ru tallies behind it. `\"unknown\"` when there is no\n * Cyrillic evidence, on a uk/ru tie, or when only an ambiguous `э` is present.\n */\nexport function detectCyrillicLanguage(text: string): CyrillicVerdict {\n const { ukScore, ruDistinctive, beScore, hardSigns, eOborot, cyrillicCount } = countSignals(text);\n\n // ў is uniquely Belarusian — strongest single signal.\n if (beScore > 0) {\n return { language: \"be\", ukScore, ruScore: ruDistinctive };\n }\n\n // Both UA and RU evidence present — a tie is \"unknown\", not a silent UA call\n // (which would misclassify Belarusian/Bulgarian whenever і or ё balances out).\n if (ukScore > 0 && ruDistinctive > 0) {\n if (ukScore === ruDistinctive) {\n return { language: \"unknown\", ukScore, ruScore: ruDistinctive };\n }\n return {\n language: ukScore > ruDistinctive ? \"uk\" : \"ru\",\n ukScore,\n ruScore: ruDistinctive,\n };\n }\n\n // Distinctive RU letters (ы, ё) with no UA evidence — unambiguously RU.\n if (ruDistinctive > 0) {\n return { language: \"ru\", ukScore, ruScore: ruDistinctive };\n }\n\n // Distinctive UA letters with no RU distinctives — UA.\n if (ukScore > 0) {\n return { language: \"uk\", ukScore, ruScore: 0 };\n }\n\n // Bulgarian: ъ as a vowel — multiple occurrences in non-trivial text, no UA /\n // RU distinctives. 
Length guard keeps short Russian compounds like `подъезд`\n // from sliding into BG.\n if (hardSigns >= 2 && text.length >= MIN_LEN_FOR_BG) {\n return { language: \"bg\", ukScore: 0, ruScore: hardSigns };\n }\n\n // Russian fallback: substantial Cyrillic with no UA, no BE distinctives, and\n // no э (which signals possible Belarusian). Catches short-but-clear Russian\n // text with none of ы/ё/ъ/э (`Привет, мир`, `Здравствуйте, меня зовут …`).\n if (cyrillicCount >= MIN_CYRILLIC_FOR_FALLBACK && eOborot === 0) {\n return { language: \"ru\", ukScore: 0, ruScore: 0 };\n }\n\n // Lone ъ (no э, no other distinctives) in shorter text — RU compound-word\n // pattern (подъезд, объект, съезд).\n if (hardSigns > 0 && eOborot === 0) {\n return { language: \"ru\", ukScore: 0, ruScore: hardSigns };\n }\n\n return { language: \"unknown\", ukScore: 0, ruScore: 0 };\n}\n\n/** Convenience predicate: `true` iff {@link detectCyrillicLanguage} calls `text`\n * Russian. */\nexport function isRussian(text: string): boolean {\n return detectCyrillicLanguage(text).language === \"ru\";\n}\n\n/** Convenience predicate: `true` iff {@link detectCyrillicLanguage} calls `text`\n * Ukrainian. */\nexport function isUkrainian(text: string): boolean {\n return detectCyrillicLanguage(text).language === \"uk\";\n}\n"]}
package/dist/franc.js CHANGED
@@ -1,4 +1,4 @@
1
- import { scopeCandidates } from './chunk-U34Z3ZSV.js';
1
+ import { scopeCandidates } from './chunk-BL627TWI.js';
2
2
  import { francAll } from 'franc';
3
3
 
4
4
  var RUNG_MIN_LENGTH = 24;
package/dist/fuse.js CHANGED
@@ -1,5 +1,5 @@
1
- export { fuse } from './chunk-G44HHVK5.js';
2
- import './chunk-OVSPOZ5J.js';
3
- import './chunk-U34Z3ZSV.js';
1
+ export { fuse } from './chunk-6PWEE3SR.js';
2
+ import './chunk-YCUSX3GG.js';
3
+ import './chunk-BL627TWI.js';
4
4
  //# sourceMappingURL=fuse.js.map
5
5
  //# sourceMappingURL=fuse.js.map
package/dist/headers.js CHANGED
@@ -1,4 +1,4 @@
1
- export { evidenceFromHeaders } from './chunk-3LDE35U2.js';
2
- import './chunk-OVSPOZ5J.js';
1
+ export { evidenceFromHeaders } from './chunk-FEKBPTHK.js';
2
+ import './chunk-YCUSX3GG.js';
3
3
  //# sourceMappingURL=headers.js.map
4
4
  //# sourceMappingURL=headers.js.map
package/dist/html.js CHANGED
@@ -1,4 +1,4 @@
1
- export { evidenceFromHtml } from './chunk-KI4MAI3N.js';
2
- import './chunk-OVSPOZ5J.js';
1
+ export { evidenceFromHtml } from './chunk-K4MXTIY7.js';
2
+ import './chunk-YCUSX3GG.js';
3
3
  //# sourceMappingURL=html.js.map
4
4
  //# sourceMappingURL=html.js.map
package/dist/index.d.ts CHANGED
@@ -35,17 +35,34 @@ declare function compile<const E extends readonly EvidenceSource[] = []>(config?
35
35
  * directly, so exact lookup still finds them.
36
36
  */
37
37
  declare function normalizeLanguageCode(input: string | undefined | null): string | null;
38
+ /** Options for {@link normalizeBCP47}. */
39
+ interface NormalizeBCP47Options {
40
+ /**
41
+ * What to return when the input's primary subtag is not in the alias table.
42
+ * - `"subtag"` (default) — pass the raw primary subtag through, so a code
43
+ * outside the table still resolves to its language (`pt-BR` → `pt`,
44
+ * `sv` → `sv`). Best for a permissive normalizer whose roster decides
45
+ * relevance downstream.
46
+ * - `"null"` — treat an unknown head as unsupported and return `null`. Best
47
+ * for callers that gate on a fixed alias set and read `null` as "not a
48
+ * language I handle".
49
+ */
50
+ unknownHead?: "subtag" | "null";
51
+ }
38
52
  /**
39
53
  * BCP-47-aware normalization: try the full string first, then strip a
40
54
  * region/script suffix (`en-US` → `en`, `zh_CN` → `zh`). Use ONLY for inputs
41
55
  * documented to be BCP-47 — `hreflang`, `<html lang>`, `Content-Language`,
42
56
  * `data-lang`/`data-locale` — never for free-text URL slugs.
43
57
  *
44
- * Falls back to the raw primary subtag when no alias matches, so a code outside
45
- * the alias table (e.g. `pt-BR` → `pt`) still resolves to its language. The
46
- * roster decides relevance downstream.
58
+ * By default, falls back to the raw primary subtag when no alias matches, so a
59
+ * code outside the alias table (e.g. `pt-BR` → `pt`) still resolves to its
60
+ * language; the roster decides relevance downstream. Pass
61
+ * `{ unknownHead: "null" }` to instead return `null` for any tag whose head
62
+ * isn't in the table — for callers that treat "not in my alias set" as
63
+ * unsupported. The default (`"subtag"`) is unchanged.
47
64
  */
48
- declare function normalizeBCP47(input: string | undefined | null): string | null;
65
+ declare function normalizeBCP47(input: string | undefined | null, options?: NormalizeBCP47Options): string | null;
49
66
  /**
50
67
  * Extract the primary subtag from a BCP-47-ish value, lowercased, then resolve
51
68
  * it through the alias table (`ua` → `uk`). Handles `Accept-Language`-style
@@ -57,4 +74,4 @@ declare function normalizeBCP47(input: string | undefined | null): string | null
57
74
  */
58
75
  declare function primarySubtag(value: string | undefined | null): string | null;
59
76
 
60
- export { DetectFn, DetectorConfig, EvidenceSource, compile, normalizeBCP47, normalizeLanguageCode, primarySubtag };
77
+ export { DetectFn, DetectorConfig, EvidenceSource, type NormalizeBCP47Options, compile, normalizeBCP47, normalizeLanguageCode, primarySubtag };
package/dist/index.js CHANGED
@@ -1,13 +1,13 @@
1
- import { evidenceFromText } from './chunk-KCK3XWCJ.js';
2
- export { evidenceFromText } from './chunk-KCK3XWCJ.js';
3
- import { evidenceFromHtml } from './chunk-KI4MAI3N.js';
4
- export { evidenceFromHtml } from './chunk-KI4MAI3N.js';
5
- import { evidenceFromHeaders } from './chunk-3LDE35U2.js';
6
- export { evidenceFromHeaders } from './chunk-3LDE35U2.js';
7
- import { fuse } from './chunk-G44HHVK5.js';
8
- export { fuse } from './chunk-G44HHVK5.js';
9
- export { normalizeBCP47, normalizeLanguageCode, primarySubtag } from './chunk-OVSPOZ5J.js';
10
- import './chunk-U34Z3ZSV.js';
1
+ import { evidenceFromText } from './chunk-7TDGJOIJ.js';
2
+ export { evidenceFromText } from './chunk-7TDGJOIJ.js';
3
+ import { evidenceFromHtml } from './chunk-K4MXTIY7.js';
4
+ export { evidenceFromHtml } from './chunk-K4MXTIY7.js';
5
+ import { evidenceFromHeaders } from './chunk-FEKBPTHK.js';
6
+ export { evidenceFromHeaders } from './chunk-FEKBPTHK.js';
7
+ import { fuse } from './chunk-6PWEE3SR.js';
8
+ export { fuse } from './chunk-6PWEE3SR.js';
9
+ export { normalizeBCP47, normalizeLanguageCode, primarySubtag } from './chunk-YCUSX3GG.js';
10
+ import './chunk-BL627TWI.js';
11
11
 
12
12
  // src/compile.ts
13
13
  function builtIns(candidates) {
package/dist/text.js CHANGED
@@ -1,4 +1,4 @@
1
- export { evidenceFromText } from './chunk-KCK3XWCJ.js';
2
- import './chunk-U34Z3ZSV.js';
1
+ export { evidenceFromText } from './chunk-7TDGJOIJ.js';
2
+ import './chunk-BL627TWI.js';
3
3
  //# sourceMappingURL=text.js.map
4
4
  //# sourceMappingURL=text.js.map
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "langtell",
3
- "version": "0.4.0",
3
+ "version": "0.5.0",
4
4
  "description": "Tell me the language — evidence-fusion language detection for short strings, with an auditable confidence trail.",
5
5
  "type": "module",
6
6
  "license": "MIT",
@@ -65,6 +65,10 @@
65
65
  "types": "./dist/chrome-ai.d.ts",
66
66
  "import": "./dist/chrome-ai.js"
67
67
  },
68
+ "./cyrillic": {
69
+ "types": "./dist/cyrillic.d.ts",
70
+ "import": "./dist/cyrillic.js"
71
+ },
68
72
  "./package.json": "./package.json"
69
73
  },
70
74
  "files": [
@@ -1 +0,0 @@
1
- {"version":3,"sources":["../src/internal/bcp47.ts"],"names":[],"mappings":";AAuBA,IAAM,OAAA,GAAkC;AAAA;AAAA,EAEtC,EAAA,EAAI,IAAA;AAAA,EACJ,EAAA,EAAI,IAAA;AAAA,EACJ,kBAAA,EAAK,IAAA;AAAA,EACL,4DAAA,EAAY,IAAA;AAAA,EACZ,kEAAA,EAAa,IAAA;AAAA,EACb,uFAAA,EAAmB,IAAA;AAAA,EACnB,iFAAA,EAAkB,IAAA;AAAA,EAClB,mGAAA,EAAqB,IAAA;AAAA,EACrB,SAAA,EAAW,IAAA;AAAA,EACX,cAAA,EAAgB,IAAA;AAAA;AAAA,EAGhB,EAAA,EAAI,IAAA;AAAA,EACJ,GAAA,EAAK,IAAA;AAAA,EACL,kBAAA,EAAK,IAAA;AAAA,EACL,0CAAA,EAAS,IAAA;AAAA,EACT,mDAAA,EAAa,IAAA;AAAA,EACb,mDAAA,EAAa,IAAA;AAAA,EACb,qEAAA,EAAgB,IAAA;AAAA,EAChB,yDAAA,EAAc,IAAA;AAAA,EACd,OAAA,EAAS,IAAA;AAAA,EACT,YAAA,EAAc,IAAA;AAAA,EACd,sDAAA,EAAW,IAAA;AAAA,EACX,iFAAA,EAAkB,IAAA;AAAA,EAClB,qEAAA,EAAgB,IAAA;AAAA,EAChB,qEAAA,EAAgB,IAAA;AAAA;AAAA,EAGhB,EAAA,EAAI,IAAA;AAAA,EACJ,GAAA,EAAK,IAAA;AAAA,EACL,4DAAA,EAAY,IAAA;AAAA,EACZ,uFAAA,EAAmB,IAAA;AAAA,EACnB,UAAA,EAAY,IAAA;AAAA,EACZ,eAAA,EAAiB,IAAA;AAAA;AAAA,EAGjB,EAAA,EAAI,IAAA;AAAA,EACJ,GAAA,EAAK,IAAA;AAAA,EACL,sDAAA,EAAW,IAAA;AAAA,EACX,iFAAA,EAAkB,IAAA;AAAA,EAClB,SAAA,EAAW,IAAA;AAAA,EACX,cAAA,EAAgB,IAAA;AAAA;AAAA,EAGhB,EAAA,EAAI,IAAA;AAAA,EACJ,GAAA,EAAK,IAAA;AAAA,EACL,OAAA,EAAS,IAAA;AAAA,EACT,YAAA,EAAc,IAAA;AAAA,EACd,4DAAA,EAAY,IAAA;AAAA,EACZ,4DAAA,EAAY,IAAA;AAAA;AAAA,EAGZ,EAAA,EAAI,IAAA;AAAA,EACJ,GAAA,EAAK,IAAA;AAAA,EACL,MAAA,EAAQ,IAAA;AAAA,EACR,WAAA,EAAa,IAAA;AAAA,EACb,MAAA,EAAQ,IAAA;AAAA,EACR,gDAAA,EAAU,IAAA;AAAA;AAAA,EAGV,EAAA,EAAI,IAAA;AAAA,EACJ,GAAA,EAAK,IAAA;AAAA,EACL,GAAA,EAAK,IAAA;AAAA,EACL,OAAA,EAAS,IAAA;AAAA,EACT,aAAA,EAAe,IAAA;AAAA,EACf,MAAA,EAAQ,IAAA;AAAA,EACR,gDAAA,EAAU,IAAA;AAAA;AAAA,EAGV,EAAA,EAAI,IAAA;AAAA,EACJ,GAAA,EAAK,IAAA;AAAA,EACL,aAAA,EAAU,IAAA;AAAA,EACV,QAAA,EAAU,IAAA;AAAA,EACV,gBAAA,EAAe,IAAA;AAAA,EACf,MAAA,EAAQ,IAAA;AAAA,EACR,4DAAA,EAAY,IAAA;AAAA;AAAA,EAGZ,EAAA,EAAI,IAAA;AAAA,EACJ,GAAA,EAAK,IAAA;AAAA,EACL,YAAA,EAAS,IAAA;AAAA,EACT,OAAA,EAAS,IAAA;AAAA,EACT,eAAA,EAAc,IAAA;AAAA,EACd,OAAA,EAAS,IAAA;AAAA,EACT,sDAAA,EAAW,IAAA;AAAA;AAAA,EAGX,EAAA,EAAI,IAAA;AAAA,EACJ,GAAA,EAAK,IAAA;AAAA,EACL,QAAA,EAAU,IAA
A;AAAA,EACV,aAAA,EAAe,IAAA;AAAA,EACf,OAAA,EAAS,IAAA;AAAA,EACT,4DAAA,EAAY;AACd,CAAA;AASO,SAAS,sBAAsB,KAAA,EAAiD;AACrF,EAAA,IAAI,KAAA,KAAU,MAAA,IAAa,KAAA,KAAU,IAAA,EAAM,OAAO,IAAA;AAClD,EAAA,MAAM,OAAA,GAAU,KAAA,CAAM,IAAA,EAAK,CAAE,WAAA,EAAY;AACzC,EAAA,IAAI,OAAA,CAAQ,MAAA,KAAW,CAAA,EAAG,OAAO,IAAA;AACjC,EAAA,OAAO,OAAA,CAAQ,OAAO,CAAA,IAAK,IAAA;AAC7B;AAYO,SAAS,eAAe,KAAA,EAAiD;AAC9E,EAAA,IAAI,KAAA,KAAU,MAAA,IAAa,KAAA,KAAU,IAAA,EAAM,OAAO,IAAA;AAClD,EAAA,MAAM,OAAA,GAAU,MAAM,IAAA,EAAK,CAAE,aAAY,CAAE,OAAA,CAAQ,MAAM,GAAG,CAAA;AAC5D,EAAA,IAAI,OAAA,CAAQ,MAAA,KAAW,CAAA,EAAG,OAAO,IAAA;AACjC,EAAA,MAAM,MAAA,GAAS,QAAQ,OAAO,CAAA;AAC9B,EAAA,IAAI,MAAA,KAAW,QAAW,OAAO,MAAA;AACjC,EAAA,MAAM,IAAA,GAAO,OAAA,CAAQ,KAAA,CAAM,GAAG,EAAE,CAAC,CAAA;AACjC,EAAA,IAAI,IAAA,KAAS,MAAA,IAAa,IAAA,CAAK,MAAA,KAAW,GAAG,OAAO,IAAA;AACpD,EAAA,OAAO,OAAA,CAAQ,IAAI,CAAA,IAAK,IAAA;AAC1B;AAWO,SAAS,cAAc,KAAA,EAAiD;AAC7E,EAAA,IAAI,KAAA,KAAU,MAAA,IAAa,KAAA,KAAU,IAAA,EAAM,OAAO,IAAA;AAClD,EAAA,MAAM,QAAQ,KAAA,CAAM,KAAA,CAAM,GAAG,CAAA,CAAE,CAAC,GAAG,IAAA,EAAK;AACxC,EAAA,IAAI,KAAA,KAAU,MAAA,IAAa,KAAA,CAAM,MAAA,KAAW,GAAG,OAAO,IAAA;AAEtD,EAAA,MAAM,MAAM,KAAA,CAAM,KAAA,CAAM,GAAG,CAAA,CAAE,CAAC,GAAG,IAAA,EAAK;AACtC,EAAA,OAAO,eAAe,GAAG,CAAA;AAC3B","file":"chunk-OVSPOZ5J.js","sourcesContent":["/**\n * BCP-47 / language-code normalization.\n *\n * Two entry points with deliberately different strictness:\n * - {@link normalizeBCP47} — for inputs documented to be BCP-47 (`<html lang>`,\n * hreflang, `Content-Language`): try the full string, then strip a\n * region/script suffix (`en-US` → `en`, `zh_CN` → `zh`).\n * - {@link normalizeLanguageCode} — strict exact-match only, for free-text\n * contexts (URL slugs, link text) where a hyphen split could be a coincidence.\n *\n * Both resolve aliases that appear in the wild (`ua` → `uk`, `rus` → `ru`,\n * localized picker phrases) to a canonical ISO 639-1 code.\n */\n\n/**\n * Aliases mapped to canonical ISO 639-1 codes. 
Keys are lowercased.\n *\n * Ukrainian is the load-bearing case: most sites use `ua` in URLs even though\n * the ISO code is `uk`. Both are accepted on input; `uk` is always output.\n *\n * Includes localized phrases users see in language pickers (`українською`,\n * `по-русски`, `in english`, …).\n */\nconst ALIASES: Record<string, string> = {\n // Ukrainian\n ua: \"uk\",\n uk: \"uk\",\n укр: \"uk\",\n українська: \"uk\",\n українською: \"uk\",\n \"українська мова\": \"uk\",\n \"на українській\": \"uk\",\n \"українською мовою\": \"uk\",\n ukrainian: \"uk\",\n \"in ukrainian\": \"uk\",\n\n // Russian\n ru: \"ru\",\n rus: \"ru\",\n рус: \"ru\",\n русский: \"ru\",\n \"по-русски\": \"ru\",\n \"по русски\": \"ru\",\n \"русский язык\": \"ru\",\n \"на русском\": \"ru\",\n russian: \"ru\",\n \"in russian\": \"ru\",\n російська: \"ru\",\n \"російська мова\": \"ru\",\n \"по-російськи\": \"ru\",\n \"по російськи\": \"ru\",\n\n // Belarusian\n be: \"be\",\n bel: \"be\",\n беларуская: \"be\",\n \"беларуская мова\": \"be\",\n belarusian: \"be\",\n \"in belarusian\": \"be\",\n\n // Bulgarian\n bg: \"bg\",\n bul: \"bg\",\n български: \"bg\",\n \"български език\": \"bg\",\n bulgarian: \"bg\",\n \"in bulgarian\": \"bg\",\n\n // English\n en: \"en\",\n eng: \"en\",\n english: \"en\",\n \"in english\": \"en\",\n англійська: \"en\",\n английский: \"en\",\n\n // Polish\n pl: \"pl\",\n pol: \"pl\",\n polski: \"pl\",\n \"po polsku\": \"pl\",\n polish: \"pl\",\n польська: \"pl\",\n\n // German\n de: \"de\",\n deu: \"de\",\n ger: \"de\",\n deutsch: \"de\",\n \"auf deutsch\": \"de\",\n german: \"de\",\n німецька: \"de\",\n\n // French\n fr: \"fr\",\n fra: \"fr\",\n français: \"fr\",\n francais: \"fr\",\n \"en français\": \"fr\",\n french: \"fr\",\n французька: \"fr\",\n\n // Spanish\n es: \"es\",\n spa: \"es\",\n español: \"es\",\n espanol: \"es\",\n \"en español\": \"es\",\n spanish: \"es\",\n іспанська: \"es\",\n\n // Italian\n it: \"it\",\n ita: \"it\",\n italiano: \"it\",\n \"in 
italiano\": \"it\",\n italian: \"it\",\n італійська: \"it\",\n};\n\n/**\n * Strict, exact-match lookup. Returns `null` for unknown inputs and does NOT\n * fall back to a hyphen prefix. Use anywhere a hyphen split could be a\n * coincidence — URL path segments (`/ru-return-warranty`), title attrs, link\n * text. The phrase aliases (`по-русски`, `in english`) are in the table\n * directly, so exact lookup still finds them.\n */\nexport function normalizeLanguageCode(input: string | undefined | null): string | null {\n if (input === undefined || input === null) return null;\n const cleaned = input.trim().toLowerCase();\n if (cleaned.length === 0) return null;\n return ALIASES[cleaned] ?? null;\n}\n\n/**\n * BCP-47-aware normalization: try the full string first, then strip a\n * region/script suffix (`en-US` → `en`, `zh_CN` → `zh`). Use ONLY for inputs\n * documented to be BCP-47 — `hreflang`, `<html lang>`, `Content-Language`,\n * `data-lang`/`data-locale` — never for free-text URL slugs.\n *\n * Falls back to the raw primary subtag when no alias matches, so a code outside\n * the alias table (e.g. `pt-BR` → `pt`) still resolves to its language. The\n * roster decides relevance downstream.\n */\nexport function normalizeBCP47(input: string | undefined | null): string | null {\n if (input === undefined || input === null) return null;\n const cleaned = input.trim().toLowerCase().replace(/_/g, \"-\");\n if (cleaned.length === 0) return null;\n const direct = ALIASES[cleaned];\n if (direct !== undefined) return direct;\n const head = cleaned.split(\"-\")[0];\n if (head === undefined || head.length === 0) return null;\n return ALIASES[head] ?? head;\n}\n\n/**\n * Extract the primary subtag from a BCP-47-ish value, lowercased, then resolve\n * it through the alias table (`ua` → `uk`). Handles `Accept-Language`-style\n * comma lists (`en-US,en;q=0.9` → `en`). 
Returns `null` for empty/nullish.\n *\n * This is the header/HTML extraction helper: it tolerates the messy shapes those\n * sources carry (comma lists, `q` weights) where {@link normalizeBCP47} expects\n * a single tag.\n */\nexport function primarySubtag(value: string | undefined | null): string | null {\n if (value === undefined || value === null) return null;\n const first = value.split(\",\")[0]?.trim();\n if (first === undefined || first.length === 0) return null;\n // Drop a `;q=…` weight if present.\n const tag = first.split(\";\")[0]?.trim();\n return normalizeBCP47(tag);\n}\n"]}
@@ -1 +0,0 @@
1
- {"version":3,"sources":["../src/internal/classify.ts"],"names":[],"mappings":";AA0BO,IAAM,UAAA,GAAa;AA0B1B,IAAM,OAAA,GAA0B;AAAA,EAC9B,QAAA,EAAU,SAAA;AAAA,EACV,MAAA,EAAQ,CAAA;AAAA,EACR,IAAA,EAAM,IAAA;AAAA,EACN,cAAA,EAAgB;AAClB,CAAA;AAmBA,IAAM,WAAA,GAAc,sBAAA;AACpB,IAAM,QAAA,GAAW,mBAAA;AAsBjB,IAAM,cAAA,GAAoC;AAAA,EACxC,oBAAA;AAAA;AAAA,EACA,cAAA;AAAA;AAAA,EACA,2CAAA;AAAA;AAAA,EACA;AAAA;AACF,CAAA;AAIO,SAAS,WAAW,IAAA,EAAsB;AAC/C,EAAA,IAAI,GAAA,GAAM,IAAA;AACV,EAAA,KAAA,MAAW,MAAM,cAAA,EAAgB,GAAA,GAAM,GAAA,CAAI,OAAA,CAAQ,IAAI,GAAG,CAAA;AAC1D,EAAA,OAAO,GAAA;AACT;AAKA,SAAS,eAAe,IAAA,EAA2C;AACjE,EAAA,IAAI,GAAA,GAAM,CAAA;AACV,EAAA,IAAI,GAAA,GAAM,CAAA;AACV,EAAA,KAAA,MAAW,EAAA,IAAM,UAAA,CAAW,IAAI,CAAA,EAAG;AACjC,IAAA,IAAI,WAAA,CAAY,IAAA,CAAK,EAAE,CAAA,EAAG,GAAA,IAAO,CAAA;AAAA,SAAA,IACxB,QAAA,CAAS,IAAA,CAAK,EAAE,CAAA,EAAG,GAAA,IAAO,CAAA;AAAA,EACrC;AACA,EAAA,IAAI,GAAA,KAAQ,CAAA,IAAK,GAAA,KAAQ,CAAA,EAAG,OAAO,IAAA;AACnC,EAAA,OAAO,GAAA,IAAO,MAAM,UAAA,GAAa,OAAA;AACnC;AAMO,SAAS,gBAAgB,OAAA,EAA6C;AAC3E,EAAA,KAAA,MAAW,EAAA,IAAM,QAAQ,QAAA,EAAU;AACjC,IAAA,IAAI,WAAA,CAAY,IAAA,CAAK,EAAE,CAAA,EAAG,OAAO,UAAA;AACjC,IAAA,IAAI,QAAA,CAAS,IAAA,CAAK,EAAE,CAAA,EAAG,OAAO,OAAA;AAAA,EAChC;AACA,EAAA,OAAO,IAAA;AACT;AAMO,SAAS,eAAA,CACd,MACA,UAAA,EACK;AACL,EAAA,MAAM,MAAA,GAAS,eAAe,IAAI,CAAA;AAClC,EAAA,IAAI,MAAA,KAAW,IAAA,EAAM,OAAO,EAAC;AAI7B,EAAA,MAAM,IAAA,uBAAW,GAAA,EAAY;AAC7B,EAAA,MAAM,SAAc,EAAC;AACrB,EAAA,KAAA,MAAW,KAAK,UAAA,EAAY;AAC1B,IAAA,IAAI,eAAA,CAAgB,CAAC,CAAA,KAAM,MAAA,IAAU,KAAK,GAAA,CAAI,CAAA,CAAE,IAAI,CAAA,EAAG;AACvD,IAAA,IAAA,CAAK,GAAA,CAAI,EAAE,IAAI,CAAA;AACf,IAAA,MAAA,CAAO,KAAK,CAAC,CAAA;AAAA,EACf;AACA,EAAA,OAAO,MAAA;AACT;AA8BA,SAAS,SAAS,IAAA,EAAwB;AACxC,EAAA,OAAO,KAAK,WAAA,EAAY,CAAE,KAAA,CAAM,UAAU,KAAK,EAAC;AAClD;AAOA,SAAS,KAAA,CAAM,OAAyB,UAAA,EAAwD;AAC9F,EAAA,MAAM,MAAA,GAAS,IAAI,GAAA,CAAoB,UAAA,CAAW,GAAA,CAAI,CAAC,CAAA,KAAM,CAAC,CAAA,CAAE,IAAA,EAAM,CAAC,CAAC,CAAC,CAAA;AACzE,EAAA,KAAA,MAAW,QAAQ,KAAA,EAAO;AACxB,IAAA,IAAI,KAAA,GAAuB,IAAA;AAC3B,IAAA,IAAI,MAAA,GAAS,CAAA;AACb,IAAA,KAAA,MAAW,KAAK
,UAAA,EAAY;AAC1B,MAAA,IAAI,CAAA,CAAE,GAAA,CAAI,GAAA,CAAI,IAAI,CAAA,EAAG;AACnB,QAAA,MAAA,IAAU,CAAA;AACV,QAAA,IAAI,SAAS,CAAA,EAAG;AACd,UAAA,KAAA,GAAQ,IAAA;AACR,UAAA;AAAA,QACF;AACA,QAAA,KAAA,GAAQ,CAAA,CAAE,IAAA;AAAA,MACZ;AAAA,IACF;AACA,IAAA,IAAI,KAAA,KAAU,IAAA,EAAM,MAAA,CAAO,GAAA,CAAI,KAAA,EAAA,CAAQ,OAAO,GAAA,CAAI,KAAK,CAAA,IAAK,CAAA,IAAK,CAAC,CAAA;AAAA,EACpE;AACA,EAAA,OAAO,MAAA;AACT;AAGA,SAAS,OAAO,MAAA,EAAsE;AACpF,EAAA,IAAI,GAAA,GAAM,EAAA;AACV,EAAA,IAAI,MAAA,GAAS,EAAA;AACb,EAAA,IAAI,IAAA,GAAsB,IAAA;AAC1B,EAAA,KAAA,MAAW,CAAC,CAAA,EAAG,KAAK,CAAA,IAAK,MAAA,EAAQ;AAC/B,IAAA,IAAI,QAAQ,GAAA,EAAK;AACf,MAAA,MAAA,GAAS,GAAA;AACT,MAAA,GAAA,GAAM,KAAA;AACN,MAAA,IAAA,GAAO,CAAA;AAAA,IACT,CAAA,MAAA,IAAW,QAAQ,MAAA,EAAQ;AACzB,MAAA,MAAA,GAAS,KAAA;AAAA,IACX;AAAA,EACF;AACA,EAAA,IAAI,IAAA,KAAS,IAAA,IAAQ,GAAA,GAAM,CAAA,EAAG,OAAO,IAAA;AACrC,EAAA,MAAM,MAAA,GAAS,GAAA,GAAM,IAAA,CAAK,GAAA,CAAI,QAAQ,CAAC,CAAA;AACvC,EAAA,OAAO,MAAA,IAAU,CAAA,GAAI,EAAE,IAAA,EAAM,QAAO,GAAI,IAAA;AAC1C;AAEA,SAAS,aAAA,CACP,YACA,IAAA,EACc;AACd,EAAA,OAAO,UAAA,CAAW,GAAA,CAAI,CAAC,CAAA,MAAO,EAAE,IAAA,EAAM,CAAA,CAAE,IAAA,EAAM,GAAA,EAAK,IAAI,GAAA,CAAI,IAAA,CAAK,CAAC,CAAC,GAAE,CAAE,CAAA;AACxE;AAIA,SAAS,UAAA,CAAW,MAAc,MAAA,EAAwD;AACxF,EAAA,MAAM,CAAA,GAAI,MAAA;AAAA,IACR,KAAA;AAAA,MACE,KAAK,WAAA,EAAY;AAAA,MACjB,aAAA,CAAc,QAAQ,CAAC,CAAA,KAAM,EAAE,QAAA,IAAY,CAAA,CAAE,SAAS,EAAA,CAAG;AAAA;AAC3D,GACF;AACA,EAAA,OAAO,CAAA,GAAI,EAAE,QAAA,EAAU,CAAA,CAAE,IAAA,EAAM,QAAQ,CAAA,CAAE,MAAA,EAAQ,IAAA,EAAM,CAAA,EAAE,GAAI,IAAA;AAC/D;AAGA,SAAS,QAAA,CACP,MAAA,EACA,MAAA,EACA,IAAA,EACA,IAAA,EACoB;AACpB,EAAA,MAAM,CAAA,GAAI,MAAA;AAAA,IACR,KAAA;AAAA,MACE,MAAA;AAAA,MACA,aAAA,CAAc,QAAQ,CAAC,CAAA,KAAM,EAAE,KAAA,GAAQ,IAAI,CAAA,IAAK,EAAE;AAAA;AACpD,GACF;AACA,EAAA,OAAO,CAAA,GAAI,EAAE,QAAA,EAAU,CAAA,CAAE,MAAM,MAAA,EAAQ,CAAA,CAAE,MAAA,EAAQ,IAAA,EAAK,GAAI,IAAA;AAC5D;AAcO,SAAS,iBAAA,CACd,IAAA,EACA,UAAA,EACA,KAAA,EACgB;AAChB,EAAA,IAAI,CAAC,IAAA,IAAQ,UAAA,CAAW,MAAA,KAAW,GAAG,OAAO,OAAA;AAI7C,EAAA,MAAM,OAAA,GAAU,WAAW,IAAI,CAAA;AAG/B,EAAA,MAAM,MAAA,GAAS,eAAA,CAAgB,OAAA,E
AAS,UAAU,CAAA;AAClD,EAAA,IAAI,MAAA,CAAO,MAAA,KAAW,CAAA,EAAG,OAAO,OAAA;AAKhC,EAAA,MAAM,cAAA,GAAiB,OAAO,MAAA,IAAU,CAAA;AAExC,EAAA,MAAM,QAAA,GAAW,UAAA,CAAW,OAAA,EAAS,MAAM,CAAA;AAC3C,EAAA,IAAI,QAAA,EAAU,OAAO,EAAE,GAAG,UAAU,cAAA,EAAe;AAEnD,EAAA,MAAM,MAAA,GAAS,SAAS,OAAO,CAAA;AAC/B,EAAA,IAAI,MAAA,CAAO,MAAA,KAAW,CAAA,EAAG,OAAO,OAAA;AAEhC,EAAA,MAAM,MAAA,GACJ,QAAA,CAAS,MAAA,EAAQ,MAAA,EAAQ,YAAY,IAAI,CAAA,IACzC,QAAA,CAAS,MAAA,EAAQ,QAAQ,UAAA,EAAY,IAAI,CAAA,IACzC,KAAA,GAAQ,SAAS,MAAM,CAAA;AACzB,EAAA,OAAO,MAAA,GAAS,EAAE,GAAG,MAAA,EAAQ,gBAAe,GAAI,OAAA;AAClD","file":"chunk-U34Z3ZSV.js","sourcesContent":["/**\n * Per-snippet language classification by candidate-set-relative set-difference.\n *\n * A ladder of rungs; the first rung whose leader clears a lead (margin) of ≥1\n * wins; otherwise `\"unknown\"`:\n *\n * 1 alphabet — characters distinctive within the candidate set\n * 2a function words — curated grammatical markers (highest precision)\n * 2b frequent words — corpus content words\n * 3 franc — optional trigram backstop for the distinctive-free\n * residual, injected as a resolver (this module stays\n * franc-free and importable without franc's tables)\n *\n * \"Distinctive\" is ALWAYS relative to the candidate set: a signal counts for a\n * candidate iff it appears in that candidate's profile and in NO other\n * candidate's. So `і` decides {uk, ru} (only uk has it) but is inert in\n * {uk, be} (both have it), and the word `и` decides {uk, ru} even though the\n * *letter* `и` is shared. Nothing is precomputed — uniqueness is the runtime\n * output, never stored.\n *\n * Adapted to langtell's {@link LanguageProfile} shape: the `words` and `iso6393`\n * fields are optional here, so a bare `{ code, alphabet }` profile still\n * classifies on rung 1.\n */\nimport type { LanguageProfile } from \"../types.js\";\n\nexport const FRANC_RUNG = 3;\n\n/** Which rung decided a verdict; `null` when unknown. 
*/\nexport type Rung = 1 | \"2a\" | \"2b\" | typeof FRANC_RUNG | null;\n\nexport interface SnippetVerdict {\n /** Winning language code, or the sentinel `\"unknown\"`. */\n language: string;\n /** Lead of the winner over the runner-up, in the rung's own unit (distinctive\n * char/word count for rungs 1–2; franc score-gap for rung 3). 0 when unknown. */\n margin: number;\n /** Which rung decided; `null` when unknown. */\n rung: Rung;\n /** Whether ≥2 same-script candidates were in scope when the verdict was\n * reached. `true` ⇒ the distinctive-letter/word machinery actually chose\n * between candidates; `false` ⇒ the winner was the lone candidate in its\n * script, selected by script alone (no evidence it is *distinctively* that\n * language). `false` for `\"unknown\"`. */\n discriminating: boolean;\n}\n\n/** A rung's verdict before {@link classifyBySnippet} stamps on the scope-derived\n * `discriminating` flag (which a single rung can't know — it depends on how many\n * same-script candidates were scoped). */\nexport type RungVerdict = Pick<SnippetVerdict, \"language\" | \"margin\" | \"rung\">;\n\nconst UNKNOWN: SnippetVerdict = {\n language: \"unknown\",\n margin: 0,\n rung: null,\n discriminating: false,\n};\n\n/** Resolver for rung 3 (the optional trigram backstop), injected into\n * {@link classifyBySnippet} by callers that have franc available. Kept as an\n * injected seam — not a direct import — so this module stays franc-free and\n * importable without pulling franc's tables. Returns a rung-3 verdict or\n * `null` (abstain).\n *\n * Generic over the concrete profile type `P` the caller classifies with, so a\n * consumer that defines a stricter profile (e.g. `words` required) can type its\n * resolver over that exact shape and hand it to {@link classifyBySnippet} with\n * no adapter — the resolver sees `readonly P[]`, the same array the classifier\n * scoped from its input. Defaults to {@link LanguageProfile} for callers that\n * don't narrow. 
*/\nexport type Rung3Resolver<P extends LanguageProfile = LanguageProfile> = (\n text: string,\n scoped: readonly P[],\n) => RungVerdict | null;\n\nconst CYRILLIC_RE = /\\p{Script=Cyrillic}/u;\nconst LATIN_RE = /\\p{Script=Latin}/u;\n\n/** A coarse script bucket — the only two the candidate-relative classifier\n * distinguishes today. `null` means \"no letters / undetermined\". */\nexport type ScriptName = \"cyrillic\" | \"latin\";\n\n/** Below this length, trigrams are too noisy to justify a rung-3 verdict. */\nexport const RUNG3_MIN_LENGTH = 24;\n\n/**\n * Trailing/inline Latin \"noise\" tokens — URLs, @handles, #hashtags — that a\n * Cyrillic title commonly carries (a headline followed by a link or a social\n * handle). These are almost always Latin even on Cyrillic-language content, so\n * left in they can flip {@link dominantScript} to Latin and let genuinely\n * Cyrillic content scope to the wrong roster. Stripped before the script vote\n * AND before the rung tallies so the URL's letters never contribute either.\n *\n * Kept as separate simple patterns (applied in order — schemes/www before bare\n * domains) rather than one big alternation, so each stays readable. ASCII-only\n * `[a-z0-9-]` in the domain pattern means a Cyrillic word is never mistaken for\n * a domain.\n */\nconst NOISE_PATTERNS: readonly RegExp[] = [\n /\\bhttps?:\\/\\/\\S+/gi, // full URLs\n /\\bwww\\.\\S+/gi, // www.… without a scheme\n /\\b[a-z0-9-]+(?:\\.[a-z0-9-]+)+(?:\\/\\S*)?/gi, // bare domains (example.com/path)\n /[@#][\\p{L}\\p{N}_]+/gu, // @handles and #hashtags\n];\n\n/** Drop URLs / @handles / #hashtags so trailing Latin noise can't outvote the\n * prose's script or pollute the per-rung tallies. 
*/\nexport function stripNoise(text: string): string {\n let out = text;\n for (const re of NOISE_PATTERNS) out = out.replace(re, \" \");\n return out;\n}\n\n/** The script most of `text` is written in, or `null` if it carries no letters.\n * Noise (URLs/handles/hashtags) is stripped first so a single trailing link\n * can't flip a multi-word Cyrillic title's vote to Latin. */\nfunction dominantScript(text: string): \"cyrillic\" | \"latin\" | null {\n let cyr = 0;\n let lat = 0;\n for (const ch of stripNoise(text)) {\n if (CYRILLIC_RE.test(ch)) cyr += 1;\n else if (LATIN_RE.test(ch)) lat += 1;\n }\n if (cyr === 0 && lat === 0) return null;\n return cyr >= lat ? \"cyrillic\" : \"latin\";\n}\n\n/** The script a profile's alphabet is written in, or `null` if it carries no\n * Cyrillic/Latin letter. Exported so the fuser can derive each roster\n * candidate's script without re-deriving the script regexes — a Latin alphabet\n * ⇒ `\"latin\"`, a Cyrillic one ⇒ `\"cyrillic\"`. */\nexport function scriptOfProfile(profile: LanguageProfile): ScriptName | null {\n for (const ch of profile.alphabet) {\n if (CYRILLIC_RE.test(ch)) return \"cyrillic\";\n if (LATIN_RE.test(ch)) return \"latin\";\n }\n return null;\n}\n\n/** Candidates whose script matches the text's dominant script (others can't tip\n * the verdict). Empty when the text carries no letters. Generic over the\n * concrete profile type `P`: the result is a subset of the input array, so it\n * keeps `P` — a stricter caller's profiles stay strictly typed downstream. */\nexport function scopeCandidates<P extends LanguageProfile>(\n text: string,\n candidates: readonly P[],\n): P[] {\n const script = dominantScript(text);\n if (script === null) return [];\n // Keep one profile per code. 
A language listed twice would otherwise make its\n // own distinctive chars/words read as \"owned by ≥2 candidates\" in `tally`,\n // cancelling them out and collapsing the verdict to \"unknown\".\n const seen = new Set<string>();\n const scoped: P[] = [];\n for (const c of candidates) {\n if (scriptOfProfile(c) !== script || seen.has(c.code)) continue;\n seen.add(c.code);\n scoped.push(c);\n }\n return scoped;\n}\n\n/**\n * Per-language set of characters globally unique within `profiles` — present in\n * exactly one profile's alphabet. Relative to the given profile set: the unique\n * set shrinks as languages are added (a second Latin language un-uniques a–z).\n */\nexport function distinctiveChars(profiles: readonly LanguageProfile[]): Map<string, Set<string>> {\n const owners = new Map<string, string[]>();\n for (const p of profiles) {\n for (const ch of new Set(p.alphabet)) {\n const list = owners.get(ch);\n if (list) list.push(p.code);\n else owners.set(ch, [p.code]);\n }\n }\n const result = new Map<string, Set<string>>(profiles.map((p) => [p.code, new Set()]));\n for (const [ch, codes] of owners) {\n const [only] = codes;\n if (codes.length === 1 && only !== undefined) result.get(only)?.add(ch);\n }\n return result;\n}\n\ninterface Membership {\n code: string;\n set: ReadonlySet<string>;\n}\n\n/** Lowercased Unicode letter-run tokens. Keeps single-char tokens (`і`, `и`). */\nfunction tokenize(text: string): string[] {\n return text.toLowerCase().match(/\\p{L}+/gu) ?? [];\n}\n\n/**\n * Tally how many items (characters or word tokens) are distinctive to each\n * candidate — present in exactly one candidate's set. 
Items owned by zero or by\n * ≥2 candidates contribute nothing.\n */\nfunction tally(items: Iterable<string>, membership: readonly Membership[]): Map<string, number> {\n const scores = new Map<string, number>(membership.map((m) => [m.code, 0]));\n for (const item of items) {\n let owner: string | null = null;\n let owners = 0;\n for (const m of membership) {\n if (m.set.has(item)) {\n owners += 1;\n if (owners > 1) {\n owner = null;\n break;\n }\n owner = m.code;\n }\n }\n if (owner !== null) scores.set(owner, (scores.get(owner) ?? 0) + 1);\n }\n return scores;\n}\n\n/** The leading candidate and its lead over the runner-up, or `null` if <1. */\nfunction leader(scores: Map<string, number>): { code: string; margin: number } | null {\n let max = -1;\n let second = -1;\n let code: string | null = null;\n for (const [c, score] of scores) {\n if (score > max) {\n second = max;\n max = score;\n code = c;\n } else if (score > second) {\n second = score;\n }\n }\n if (code === null || max < 1) return null;\n const margin = max - Math.max(second, 0);\n return margin >= 1 ? { code, margin } : null;\n}\n\nfunction membershipFor(\n candidates: readonly LanguageProfile[],\n pick: (p: LanguageProfile) => Iterable<string>,\n): Membership[] {\n return candidates.map((c) => ({ code: c.code, set: new Set(pick(c)) }));\n}\n\n/** Rung 1 — characters (alphabet + orthographic {@link LanguageProfile.marks})\n * distinctive within the scoped candidate set. */\nfunction letterRung(text: string, scoped: readonly LanguageProfile[]): RungVerdict | null {\n const r = leader(\n tally(\n text.toLowerCase(),\n membershipFor(scoped, (p) => p.alphabet + (p.marks ?? \"\")),\n ),\n );\n return r ? { language: r.code, margin: r.margin, rung: 1 } : null;\n}\n\n/** Rung 2 — distinctive words from the given tier (2a function, 2b frequent). 
*/\nfunction wordRung(\n tokens: readonly string[],\n scoped: readonly LanguageProfile[],\n tier: \"function\" | \"frequent\",\n rung: \"2a\" | \"2b\",\n): RungVerdict | null {\n const r = leader(\n tally(\n tokens,\n membershipFor(scoped, (p) => p.words?.[tier] ?? []),\n ),\n );\n return r ? { language: r.code, margin: r.margin, rung } : null;\n}\n\n/**\n * Classify `text` among `candidates`. Synchronous and allocation-light. Returns\n * `\"unknown\"` on empty evidence, on a tie inside the candidate set, or when\n * nothing is distinctive.\n *\n * Generic over the concrete profile type `P`, inferred from `candidates`. The\n * optional `rung3` resolver is typed over the same `P`, so a consumer with a\n * stricter profile (e.g. `words` required) can pass its own resolver directly,\n * with no adapter — the resolver sees exactly the profiles the caller passed.\n * `P` defaults to {@link LanguageProfile}, so the bare two-argument form and\n * every existing call site are unchanged.\n */\nexport function classifyBySnippet<P extends LanguageProfile = LanguageProfile>(\n text: string,\n candidates: readonly P[],\n rung3?: Rung3Resolver<P>,\n): SnippetVerdict {\n if (!text || candidates.length === 0) return UNKNOWN;\n\n // Drop URLs / @handles / #hashtags once, up front: trailing Latin noise must\n // not flip the dominant-script vote nor pollute the per-rung tallies.\n const cleaned = stripNoise(text);\n\n // Restrict to candidates in the text's dominant script.\n const scoped = scopeCandidates(cleaned, candidates);\n if (scoped.length === 0) return UNKNOWN;\n\n // ≥2 same-script candidates means the distinctive machinery actually had a\n // choice to make; a lone scoped candidate wins by script alone. 
Stamped onto\n // whichever rung decides — a single rung can't see the scope size.\n const discriminating = scoped.length >= 2;\n\n const byLetter = letterRung(cleaned, scoped);\n if (byLetter) return { ...byLetter, discriminating };\n\n const tokens = tokenize(cleaned);\n if (tokens.length === 0) return UNKNOWN;\n\n const byWord =\n wordRung(tokens, scoped, \"function\", \"2a\") ??\n wordRung(tokens, scoped, \"frequent\", \"2b\") ??\n rung3?.(cleaned, scoped);\n return byWord ? { ...byWord, discriminating } : UNKNOWN;\n}\n"]}