langtell 0.0.1 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +45 -26
- package/dist/chrome-ai.d.ts +29 -0
- package/dist/chrome-ai.js +71 -0
- package/dist/chrome-ai.js.map +1 -0
- package/dist/chunk-3LDE35U2.js +36 -0
- package/dist/chunk-3LDE35U2.js.map +1 -0
- package/dist/chunk-3SO2WI75.js +35 -0
- package/dist/chunk-3SO2WI75.js.map +1 -0
- package/dist/chunk-KI4MAI3N.js +27 -0
- package/dist/chunk-KI4MAI3N.js.map +1 -0
- package/dist/chunk-OVSPOZ5J.js +115 -0
- package/dist/chunk-OVSPOZ5J.js.map +1 -0
- package/dist/chunk-RFR5I7P7.js +123 -0
- package/dist/chunk-RFR5I7P7.js.map +1 -0
- package/dist/chunk-TYSRYQN7.js +102 -0
- package/dist/chunk-TYSRYQN7.js.map +1 -0
- package/dist/franc.d.ts +25 -0
- package/dist/franc.js +59 -0
- package/dist/franc.js.map +1 -0
- package/dist/fuse.d.ts +24 -0
- package/dist/fuse.js +4 -0
- package/dist/fuse.js.map +1 -0
- package/dist/headers.d.ts +6 -0
- package/dist/headers.js +4 -0
- package/dist/headers.js.map +1 -0
- package/dist/html.d.ts +17 -0
- package/dist/html.js +4 -0
- package/dist/html.js.map +1 -0
- package/dist/index.d.ts +59 -0
- package/dist/index.js +64 -0
- package/dist/index.js.map +1 -0
- package/dist/profiles.d.ts +39 -0
- package/dist/profiles.js +1026 -0
- package/dist/profiles.js.map +1 -0
- package/dist/text.d.ts +65 -0
- package/dist/text.js +4 -0
- package/dist/text.js.map +1 -0
- package/dist/types-D4Ux-xA6.d.ts +97 -0
- package/package.json +100 -3
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/internal/classify.ts"],"names":[],"mappings":";AAyCA,IAAM,UAA0B,EAAE,QAAA,EAAU,WAAW,MAAA,EAAQ,CAAA,EAAG,MAAM,IAAA,EAAK;AAY7E,IAAM,WAAA,GAAc,sBAAA;AACpB,IAAM,QAAA,GAAW,mBAAA;AAkBjB,IAAM,cAAA,GAAoC;AAAA,EACxC,oBAAA;AAAA;AAAA,EACA,cAAA;AAAA;AAAA,EACA,2CAAA;AAAA;AAAA,EACA;AAAA;AACF,CAAA;AAIO,SAAS,WAAW,IAAA,EAAsB;AAC/C,EAAA,IAAI,GAAA,GAAM,IAAA;AACV,EAAA,KAAA,MAAW,MAAM,cAAA,EAAgB,GAAA,GAAM,GAAA,CAAI,OAAA,CAAQ,IAAI,GAAG,CAAA;AAC1D,EAAA,OAAO,GAAA;AACT;AAKA,SAAS,eAAe,IAAA,EAA2C;AACjE,EAAA,IAAI,GAAA,GAAM,CAAA;AACV,EAAA,IAAI,GAAA,GAAM,CAAA;AACV,EAAA,KAAA,MAAW,EAAA,IAAM,UAAA,CAAW,IAAI,CAAA,EAAG;AACjC,IAAA,IAAI,WAAA,CAAY,IAAA,CAAK,EAAE,CAAA,EAAG,GAAA,IAAO,CAAA;AAAA,SAAA,IACxB,QAAA,CAAS,IAAA,CAAK,EAAE,CAAA,EAAG,GAAA,IAAO,CAAA;AAAA,EACrC;AACA,EAAA,IAAI,GAAA,KAAQ,CAAA,IAAK,GAAA,KAAQ,CAAA,EAAG,OAAO,IAAA;AACnC,EAAA,OAAO,GAAA,IAAO,MAAM,UAAA,GAAa,OAAA;AACnC;AAGA,SAAS,cAAc,OAAA,EAAuD;AAC5E,EAAA,KAAA,MAAW,EAAA,IAAM,QAAQ,QAAA,EAAU;AACjC,IAAA,IAAI,WAAA,CAAY,IAAA,CAAK,EAAE,CAAA,EAAG,OAAO,UAAA;AACjC,IAAA,IAAI,QAAA,CAAS,IAAA,CAAK,EAAE,CAAA,EAAG,OAAO,OAAA;AAAA,EAChC;AACA,EAAA,OAAO,IAAA;AACT;AAIO,SAAS,eAAA,CACd,MACA,UAAA,EACmB;AACnB,EAAA,MAAM,MAAA,GAAS,eAAe,IAAI,CAAA;AAClC,EAAA,IAAI,MAAA,KAAW,IAAA,EAAM,OAAO,EAAC;AAI7B,EAAA,MAAM,IAAA,uBAAW,GAAA,EAAY;AAC7B,EAAA,MAAM,SAA4B,EAAC;AACnC,EAAA,KAAA,MAAW,KAAK,UAAA,EAAY;AAC1B,IAAA,IAAI,aAAA,CAAc,CAAC,CAAA,KAAM,MAAA,IAAU,KAAK,GAAA,CAAI,CAAA,CAAE,IAAI,CAAA,EAAG;AACrD,IAAA,IAAA,CAAK,GAAA,CAAI,EAAE,IAAI,CAAA;AACf,IAAA,MAAA,CAAO,KAAK,CAAC,CAAA;AAAA,EACf;AACA,EAAA,OAAO,MAAA;AACT;AA8BA,SAAS,SAAS,IAAA,EAAwB;AACxC,EAAA,OAAO,KAAK,WAAA,EAAY,CAAE,KAAA,CAAM,UAAU,KAAK,EAAC;AAClD;AAOA,SAAS,KAAA,CAAM,OAAyB,UAAA,EAAwD;AAC9F,EAAA,MAAM,MAAA,GAAS,IAAI,GAAA,CAAoB,UAAA,CAAW,GAAA,CAAI,CAAC,CAAA,KAAM,CAAC,CAAA,CAAE,IAAA,EAAM,CAAC,CAAC,CAAC,CAAA;AACzE,EAAA,KAAA,MAAW,QAAQ,KAAA,EAAO;AACxB,IAAA,IAAI,KAAA,GAAuB,IAAA;AAC3B,IAAA,IAAI,MAAA,GAAS,CAAA;AACb,IAAA,KAAA,MAAW,KAAK,UAAA,EAAY;AAC1B,MAAA,IAAI,CAAA,CAAE,GAAA,CAAI,GAAA,CAAI,IAAI,CAAA,EAAG;AACnB,QAAA
,MAAA,IAAU,CAAA;AACV,QAAA,IAAI,SAAS,CAAA,EAAG;AACd,UAAA,KAAA,GAAQ,IAAA;AACR,UAAA;AAAA,QACF;AACA,QAAA,KAAA,GAAQ,CAAA,CAAE,IAAA;AAAA,MACZ;AAAA,IACF;AACA,IAAA,IAAI,KAAA,KAAU,IAAA,EAAM,MAAA,CAAO,GAAA,CAAI,KAAA,EAAA,CAAQ,OAAO,GAAA,CAAI,KAAK,CAAA,IAAK,CAAA,IAAK,CAAC,CAAA;AAAA,EACpE;AACA,EAAA,OAAO,MAAA;AACT;AAGA,SAAS,OAAO,MAAA,EAAsE;AACpF,EAAA,IAAI,GAAA,GAAM,EAAA;AACV,EAAA,IAAI,MAAA,GAAS,EAAA;AACb,EAAA,IAAI,IAAA,GAAsB,IAAA;AAC1B,EAAA,KAAA,MAAW,CAAC,CAAA,EAAG,KAAK,CAAA,IAAK,MAAA,EAAQ;AAC/B,IAAA,IAAI,QAAQ,GAAA,EAAK;AACf,MAAA,MAAA,GAAS,GAAA;AACT,MAAA,GAAA,GAAM,KAAA;AACN,MAAA,IAAA,GAAO,CAAA;AAAA,IACT,CAAA,MAAA,IAAW,QAAQ,MAAA,EAAQ;AACzB,MAAA,MAAA,GAAS,KAAA;AAAA,IACX;AAAA,EACF;AACA,EAAA,IAAI,IAAA,KAAS,IAAA,IAAQ,GAAA,GAAM,CAAA,EAAG,OAAO,IAAA;AACrC,EAAA,MAAM,MAAA,GAAS,GAAA,GAAM,IAAA,CAAK,GAAA,CAAI,QAAQ,CAAC,CAAA;AACvC,EAAA,OAAO,MAAA,IAAU,CAAA,GAAI,EAAE,IAAA,EAAM,QAAO,GAAI,IAAA;AAC1C;AAEA,SAAS,aAAA,CACP,YACA,IAAA,EACc;AACd,EAAA,OAAO,UAAA,CAAW,GAAA,CAAI,CAAC,CAAA,MAAO,EAAE,IAAA,EAAM,CAAA,CAAE,IAAA,EAAM,GAAA,EAAK,IAAI,GAAA,CAAI,IAAA,CAAK,CAAC,CAAC,GAAE,CAAE,CAAA;AACxE;AAIA,SAAS,UAAA,CAAW,MAAc,MAAA,EAA2D;AAC3F,EAAA,MAAM,CAAA,GAAI,MAAA;AAAA,IACR,KAAA;AAAA,MACE,KAAK,WAAA,EAAY;AAAA,MACjB,aAAA,CAAc,QAAQ,CAAC,CAAA,KAAM,EAAE,QAAA,IAAY,CAAA,CAAE,SAAS,EAAA,CAAG;AAAA;AAC3D,GACF;AACA,EAAA,OAAO,CAAA,GAAI,EAAE,QAAA,EAAU,CAAA,CAAE,IAAA,EAAM,QAAQ,CAAA,CAAE,MAAA,EAAQ,IAAA,EAAM,CAAA,EAAE,GAAI,IAAA;AAC/D;AAGA,SAAS,QAAA,CACP,MAAA,EACA,MAAA,EACA,IAAA,EACA,IAAA,EACuB;AACvB,EAAA,MAAM,CAAA,GAAI,MAAA;AAAA,IACR,KAAA;AAAA,MACE,MAAA;AAAA,MACA,aAAA,CAAc,QAAQ,CAAC,CAAA,KAAM,EAAE,KAAA,GAAQ,IAAI,CAAA,IAAK,EAAE;AAAA;AACpD,GACF;AACA,EAAA,OAAO,CAAA,GAAI,EAAE,QAAA,EAAU,CAAA,CAAE,MAAM,MAAA,EAAQ,CAAA,CAAE,MAAA,EAAQ,IAAA,EAAK,GAAI,IAAA;AAC5D;AAOO,SAAS,iBAAA,CACd,IAAA,EACA,UAAA,EACA,KAAA,EACgB;AAChB,EAAA,IAAI,CAAC,IAAA,IAAQ,UAAA,CAAW,MAAA,KAAW,GAAG,OAAO,OAAA;AAI7C,EAAA,MAAM,OAAA,GAAU,WAAW,IAAI,CAAA;AAG/B,EAAA,MAAM,MAAA,GAAS,eAAA,CAAgB,OAAA,EAAS,UAAU,CAAA;AAClD,EAAA,IAAI,MAAA,CAAO,MAAA,KAAW,CAAA,EAAG,OAAO,OAAA;AAEhC,EAAA,M
AAM,QAAA,GAAW,UAAA,CAAW,OAAA,EAAS,MAAM,CAAA;AAC3C,EAAA,IAAI,UAAU,OAAO,QAAA;AAErB,EAAA,MAAM,MAAA,GAAS,SAAS,OAAO,CAAA;AAC/B,EAAA,IAAI,MAAA,CAAO,MAAA,KAAW,CAAA,EAAG,OAAO,OAAA;AAEhC,EAAA,OACE,QAAA,CAAS,MAAA,EAAQ,MAAA,EAAQ,UAAA,EAAY,IAAI,CAAA,IACzC,QAAA,CAAS,MAAA,EAAQ,MAAA,EAAQ,YAAY,IAAI,CAAA,IACzC,KAAA,GAAQ,OAAA,EAAS,MAAM,CAAA,IACvB,OAAA;AAEJ","file":"chunk-RFR5I7P7.js","sourcesContent":["/**\n * Per-snippet language classification by candidate-set-relative set-difference.\n *\n * A ladder of rungs; the first rung whose leader clears a lead (margin) of ≥1\n * wins; otherwise `\"unknown\"`:\n *\n * 1 alphabet — characters distinctive within the candidate set\n * 2a function words — curated grammatical markers (highest precision)\n * 2b frequent words — corpus content words\n * 3 franc — optional trigram backstop for the distinctive-free\n * residual, injected as a resolver (this module stays\n * franc-free and importable without franc's tables)\n *\n * \"Distinctive\" is ALWAYS relative to the candidate set: a signal counts for a\n * candidate iff it appears in that candidate's profile and in NO other\n * candidate's. So `і` decides {uk, ru} (only uk has it) but is inert in\n * {uk, be} (both have it), and the word `и` decides {uk, ru} even though the\n * *letter* `и` is shared. Nothing is precomputed — uniqueness is the runtime\n * output, never stored.\n *\n * Adapted to langtell's {@link LanguageProfile} shape: the `words` and `iso6393`\n * fields are optional here, so a bare `{ code, alphabet }` profile still\n * classifies on rung 1.\n */\nimport type { LanguageProfile } from \"../types.js\";\n\nexport const FRANC_RUNG = 3;\n\n/** Which rung decided a verdict; `null` when unknown. */\nexport type Rung = 1 | \"2a\" | \"2b\" | typeof FRANC_RUNG | null;\n\nexport interface SnippetVerdict {\n /** Winning language code, or the sentinel `\"unknown\"`. 
*/\n language: string;\n /** Lead of the winner over the runner-up, in the rung's own unit (distinctive\n * char/word count for rungs 1–2; franc score-gap for rung 3). 0 when unknown. */\n margin: number;\n /** Which rung decided; `null` when unknown. */\n rung: Rung;\n}\n\nconst UNKNOWN: SnippetVerdict = { language: \"unknown\", margin: 0, rung: null };\n\n/** Resolver for rung 3 (the optional trigram backstop), injected into\n * {@link classifyBySnippet} by callers that have franc available. Kept as an\n * injected seam — not a direct import — so this module stays franc-free and\n * importable without pulling franc's tables. Returns a rung-3 verdict or\n * `null` (abstain). */\nexport type Rung3Resolver = (\n text: string,\n scoped: readonly LanguageProfile[],\n) => SnippetVerdict | null;\n\nconst CYRILLIC_RE = /\\p{Script=Cyrillic}/u;\nconst LATIN_RE = /\\p{Script=Latin}/u;\n\n/** Below this length, trigrams are too noisy to justify a rung-3 verdict. */\nexport const RUNG3_MIN_LENGTH = 24;\n\n/**\n * Trailing/inline Latin \"noise\" tokens — URLs, @handles, #hashtags — that a\n * Cyrillic title commonly carries (a headline followed by a link or a social\n * handle). These are almost always Latin even on Cyrillic-language content, so\n * left in they can flip {@link dominantScript} to Latin and let genuinely\n * Cyrillic content scope to the wrong roster. Stripped before the script vote\n * AND before the rung tallies so the URL's letters never contribute either.\n *\n * Kept as separate simple patterns (applied in order — schemes/www before bare\n * domains) rather than one big alternation, so each stays readable. 
ASCII-only\n * `[a-z0-9-]` in the domain pattern means a Cyrillic word is never mistaken for\n * a domain.\n */\nconst NOISE_PATTERNS: readonly RegExp[] = [\n /\\bhttps?:\\/\\/\\S+/gi, // full URLs\n /\\bwww\\.\\S+/gi, // www.… without a scheme\n /\\b[a-z0-9-]+(?:\\.[a-z0-9-]+)+(?:\\/\\S*)?/gi, // bare domains (example.com/path)\n /[@#][\\p{L}\\p{N}_]+/gu, // @handles and #hashtags\n];\n\n/** Drop URLs / @handles / #hashtags so trailing Latin noise can't outvote the\n * prose's script or pollute the per-rung tallies. */\nexport function stripNoise(text: string): string {\n let out = text;\n for (const re of NOISE_PATTERNS) out = out.replace(re, \" \");\n return out;\n}\n\n/** The script most of `text` is written in, or `null` if it carries no letters.\n * Noise (URLs/handles/hashtags) is stripped first so a single trailing link\n * can't flip a multi-word Cyrillic title's vote to Latin. */\nfunction dominantScript(text: string): \"cyrillic\" | \"latin\" | null {\n let cyr = 0;\n let lat = 0;\n for (const ch of stripNoise(text)) {\n if (CYRILLIC_RE.test(ch)) cyr += 1;\n else if (LATIN_RE.test(ch)) lat += 1;\n }\n if (cyr === 0 && lat === 0) return null;\n return cyr >= lat ? \"cyrillic\" : \"latin\";\n}\n\n/** The script of a profile's alphabet. */\nfunction profileScript(profile: LanguageProfile): \"cyrillic\" | \"latin\" | null {\n for (const ch of profile.alphabet) {\n if (CYRILLIC_RE.test(ch)) return \"cyrillic\";\n if (LATIN_RE.test(ch)) return \"latin\";\n }\n return null;\n}\n\n/** Candidates whose script matches the text's dominant script (others can't tip\n * the verdict). Empty when the text carries no letters. */\nexport function scopeCandidates(\n text: string,\n candidates: readonly LanguageProfile[],\n): LanguageProfile[] {\n const script = dominantScript(text);\n if (script === null) return [];\n // Keep one profile per code. 
A language listed twice would otherwise make its\n // own distinctive chars/words read as \"owned by ≥2 candidates\" in `tally`,\n // cancelling them out and collapsing the verdict to \"unknown\".\n const seen = new Set<string>();\n const scoped: LanguageProfile[] = [];\n for (const c of candidates) {\n if (profileScript(c) !== script || seen.has(c.code)) continue;\n seen.add(c.code);\n scoped.push(c);\n }\n return scoped;\n}\n\n/**\n * Per-language set of characters globally unique within `profiles` — present in\n * exactly one profile's alphabet. Relative to the given profile set: the unique\n * set shrinks as languages are added (a second Latin language un-uniques a–z).\n */\nexport function distinctiveChars(profiles: readonly LanguageProfile[]): Map<string, Set<string>> {\n const owners = new Map<string, string[]>();\n for (const p of profiles) {\n for (const ch of new Set(p.alphabet)) {\n const list = owners.get(ch);\n if (list) list.push(p.code);\n else owners.set(ch, [p.code]);\n }\n }\n const result = new Map<string, Set<string>>(profiles.map((p) => [p.code, new Set()]));\n for (const [ch, codes] of owners) {\n const [only] = codes;\n if (codes.length === 1 && only !== undefined) result.get(only)?.add(ch);\n }\n return result;\n}\n\ninterface Membership {\n code: string;\n set: ReadonlySet<string>;\n}\n\n/** Lowercased Unicode letter-run tokens. Keeps single-char tokens (`і`, `и`). */\nfunction tokenize(text: string): string[] {\n return text.toLowerCase().match(/\\p{L}+/gu) ?? [];\n}\n\n/**\n * Tally how many items (characters or word tokens) are distinctive to each\n * candidate — present in exactly one candidate's set. 
Items owned by zero or by\n * ≥2 candidates contribute nothing.\n */\nfunction tally(items: Iterable<string>, membership: readonly Membership[]): Map<string, number> {\n const scores = new Map<string, number>(membership.map((m) => [m.code, 0]));\n for (const item of items) {\n let owner: string | null = null;\n let owners = 0;\n for (const m of membership) {\n if (m.set.has(item)) {\n owners += 1;\n if (owners > 1) {\n owner = null;\n break;\n }\n owner = m.code;\n }\n }\n if (owner !== null) scores.set(owner, (scores.get(owner) ?? 0) + 1);\n }\n return scores;\n}\n\n/** The leading candidate and its lead over the runner-up, or `null` if <1. */\nfunction leader(scores: Map<string, number>): { code: string; margin: number } | null {\n let max = -1;\n let second = -1;\n let code: string | null = null;\n for (const [c, score] of scores) {\n if (score > max) {\n second = max;\n max = score;\n code = c;\n } else if (score > second) {\n second = score;\n }\n }\n if (code === null || max < 1) return null;\n const margin = max - Math.max(second, 0);\n return margin >= 1 ? { code, margin } : null;\n}\n\nfunction membershipFor(\n candidates: readonly LanguageProfile[],\n pick: (p: LanguageProfile) => Iterable<string>,\n): Membership[] {\n return candidates.map((c) => ({ code: c.code, set: new Set(pick(c)) }));\n}\n\n/** Rung 1 — characters (alphabet + orthographic {@link LanguageProfile.marks})\n * distinctive within the scoped candidate set. */\nfunction letterRung(text: string, scoped: readonly LanguageProfile[]): SnippetVerdict | null {\n const r = leader(\n tally(\n text.toLowerCase(),\n membershipFor(scoped, (p) => p.alphabet + (p.marks ?? \"\")),\n ),\n );\n return r ? { language: r.code, margin: r.margin, rung: 1 } : null;\n}\n\n/** Rung 2 — distinctive words from the given tier (2a function, 2b frequent). 
*/\nfunction wordRung(\n tokens: readonly string[],\n scoped: readonly LanguageProfile[],\n tier: \"function\" | \"frequent\",\n rung: \"2a\" | \"2b\",\n): SnippetVerdict | null {\n const r = leader(\n tally(\n tokens,\n membershipFor(scoped, (p) => p.words?.[tier] ?? []),\n ),\n );\n return r ? { language: r.code, margin: r.margin, rung } : null;\n}\n\n/**\n * Classify `text` among `candidates`. Synchronous and allocation-light. Returns\n * `\"unknown\"` on empty evidence, on a tie inside the candidate set, or when\n * nothing is distinctive.\n */\nexport function classifyBySnippet(\n text: string,\n candidates: readonly LanguageProfile[],\n rung3?: Rung3Resolver,\n): SnippetVerdict {\n if (!text || candidates.length === 0) return UNKNOWN;\n\n // Drop URLs / @handles / #hashtags once, up front: trailing Latin noise must\n // not flip the dominant-script vote nor pollute the per-rung tallies.\n const cleaned = stripNoise(text);\n\n // Restrict to candidates in the text's dominant script.\n const scoped = scopeCandidates(cleaned, candidates);\n if (scoped.length === 0) return UNKNOWN;\n\n const byLetter = letterRung(cleaned, scoped);\n if (byLetter) return byLetter;\n\n const tokens = tokenize(cleaned);\n if (tokens.length === 0) return UNKNOWN;\n\n return (\n wordRung(tokens, scoped, \"function\", \"2a\") ??\n wordRung(tokens, scoped, \"frequent\", \"2b\") ??\n rung3?.(cleaned, scoped) ??\n UNKNOWN\n );\n}\n"]}
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
import { normalizeBCP47 } from './chunk-OVSPOZ5J.js';
|
|
2
|
+
|
|
3
|
+
// src/fuse.ts
|
|
4
|
+
/**
 * Default weight applied to an evidence item, keyed by the item's `kind`.
 * Kinds that read the text itself or carry an explicit locale weigh 1;
 * page/header context kinds weigh less. `fuse` consults caller-supplied
 * weights first and only then falls back to this table.
 */
var DEFAULT_KIND_WEIGHT = {
  "title-script": 1,
  "explicit-locale": 1,
  "chrome-ai": 1,
  "source-prior": 0.7,
  "franc": 0.7,
  "http-content-language": 0.6,
  "meta-content-language": 0.55,
  "meta-og-locale": 0.55,
  "html-lang": 0.5,
};

/**
 * Evidence kinds treated as a direct read of the string ("script evidence").
 * Only these kinds are eligible to pin the verdict in
 * `confidentScriptLanguage`.
 */
var SCRIPT_KINDS = /* @__PURE__ */ new Set([
  "title-script",
  "franc",
  "chrome-ai",
]);

/** Minimum clamped confidence a script-kind item needs before it can pin. */
var SCRIPT_CONFIDENCE_FLOOR = 0.6;

/** A winner scoring below this is reported as "unknown" (unless pinned). */
var MIN_WINNING_SCORE = 0.35;

/** Minimum lead of the winner over the runner-up (unless pinned). */
var MIN_MARGIN = 0.12;
|
|
19
|
+
/**
 * Read `table[key]` only when `key` is an OWN property of `table`.
 *
 * Evidence `source` / `kind` ids are arbitrary strings. A plain `table[key]`
 * lookup would resolve ids such as "constructor" or "toString" to inherited
 * `Object.prototype` members, yielding a function instead of a number and
 * turning the language's score into NaN.
 *
 * @returns the stored value, or `undefined` when the key is not an own property.
 */
function ownWeight(table, key) {
  return Object.prototype.hasOwnProperty.call(table, key) ? table[key] : undefined;
}

/**
 * Combine evidence into a single weighted verdict with an audit trail.
 *
 * Steps:
 *  1. Normalize each item's language tag (via `normalizeEvidence`).
 *  2. Sum `clamp01(confidence) * weight` per language. The weight is resolved
 *     in order: caller weight for the item's `source`, caller weight for its
 *     `kind`, the default for its `kind`, then 0.5.
 *  3. Pick the best language with `argmax`, where a confident script read
 *     (see `confidentScriptLanguage`) pins the winner so other languages
 *     cannot out-score it.
 *
 * @param evidence Evidence items; items whose language is "unknown" add no score.
 * @param options  Optional `weights` (per-source / per-kind overrides) and
 *                 `candidates` (forwarded to `normalizeEvidence`).
 * @returns `{ language, confidence, evidence }`, where `evidence` is a copy of
 *          the normalized items and `language` is "unknown" when no language
 *          clears the score and margin thresholds and nothing is pinned.
 */
function fuse(evidence, options = {}) {
  const weights = options.weights ?? {};
  const normalized = normalizeEvidence(evidence, options.candidates);

  const scores = /* @__PURE__ */ new Map();
  for (const item of normalized) {
    if (item.language === "unknown") continue;
    // Own-property lookups only: ids come from producers and must never hit
    // inherited Object.prototype members.
    const weight =
      ownWeight(weights, item.source) ??
      ownWeight(weights, item.kind) ??
      ownWeight(DEFAULT_KIND_WEIGHT, item.kind) ??
      0.5;
    scores.set(item.language, (scores.get(item.language) ?? 0) + clamp01(item.confidence) * weight);
  }

  // A confident script read pins the winner.
  const pinned = confidentScriptLanguage(normalized);
  const { best, bestScore, secondScore } = argmax(scores, pinned);

  const tooWeak =
    best === null || bestScore < MIN_WINNING_SCORE || bestScore - secondScore < MIN_MARGIN;
  if (tooWeak) {
    // A pinned language still wins on a thin margin or a low score rather
    // than being demoted to "unknown".
    if (pinned !== null && scores.has(pinned)) {
      const score = scores.get(pinned) ?? 0;
      return {
        language: pinned,
        confidence: clamp01(score / (score + 0.15)),
        evidence: [...normalized],
      };
    }
    return { language: "unknown", confidence: clamp01(bestScore), evidence: [...normalized] };
  }

  return {
    language: best,
    confidence: clamp01(bestScore / (bestScore + secondScore + 0.15)),
    evidence: [...normalized],
  };
}
|
|
47
|
+
/**
 * Return a new array in which every item's language tag has been passed
 * through `normalizeBCP47`.
 *
 * Items tagged "unknown" and items whose tag is already in normalized form are
 * returned as the same object; only items whose tag changes are copied. When
 * `normalizeBCP47` yields null/undefined the original tag is kept.
 *
 * @param evidence    Evidence items to normalize.
 * @param _candidates Accepted but not read by this implementation.
 */
function normalizeEvidence(evidence, _candidates) {
  return evidence.map((entry) => {
    const tag = entry.language;
    if (tag === "unknown") return entry;
    const canonical = normalizeBCP47(tag) ?? tag;
    return canonical === tag ? entry : { ...entry, language: canonical };
  });
}
|
|
55
|
+
/**
 * Find the language of the most confident script-kind evidence item, provided
 * its clamped confidence reaches `SCRIPT_CONFIDENCE_FLOOR`.
 *
 * @param evidence Evidence items to scan.
 * @returns the pinned language, or `null` when no item qualifies or when the
 *          top confidence is shared by items naming different languages.
 */
function confidentScriptLanguage(evidence) {
  let pinnedLanguage = null;
  let topConfidence = 0;
  for (const { language, kind, confidence } of evidence) {
    const isScriptRead = language !== "unknown" && SCRIPT_KINDS.has(kind);
    if (!isScriptRead) continue;

    const level = clamp01(confidence);
    if (level < SCRIPT_CONFIDENCE_FLOOR) continue;

    if (level > topConfidence) {
      topConfidence = level;
      pinnedLanguage = language;
      continue;
    }
    // Equal confidence for a different language: ambiguous, so pin nothing.
    if (level === topConfidence && language !== pinnedLanguage) {
      pinnedLanguage = null;
    }
  }
  return pinnedLanguage;
}
|
|
71
|
+
/**
 * Weighted argmax over `scores`, returning the leader, its score, and the
 * runner-up score.
 *
 * When `pinned` is not null, every other language's score is capped at the
 * pinned language's score, and an exact tie with the pinned language is
 * resolved in the pinned language's favour. Scores that are not greater than
 * zero never become the leader.
 *
 * @param scores Map of language code to accumulated score.
 * @param pinned Language that other languages may not out-score, or `null`.
 * @returns `{ best, bestScore, secondScore }`; `best` is `null` when no score
 *          is positive.
 */
function argmax(scores, pinned) {
  const hasPin = pinned !== null;
  const pinnedScore = hasPin ? (scores.get(pinned) ?? 0) : 0;

  let winner = null;
  let top = 0;
  let runnerUp = 0;
  for (const [language, raw] of scores) {
    // Cap non-pinned languages at the pinned language's score.
    const capped = hasPin && language !== pinned;
    const effective = capped ? Math.min(raw, pinnedScore) : raw;
    if (effective > top) {
      runnerUp = top;
      top = effective;
      winner = language;
    } else if (effective > runnerUp) {
      runnerUp = effective;
    }
  }

  // A capped language that was iterated first can hold the lead on an exact
  // tie; hand the lead to the pinned language (scores are equal here).
  const pinnedLostTie = hasPin && winner !== pinned && top === pinnedScore && pinnedScore > 0;
  if (pinnedLostTie) {
    runnerUp = top;
    winner = pinned;
  }
  return { best: winner, bestScore: top, secondScore: runnerUp };
}
|
|
93
|
+
/**
 * Clamp `value` into the closed interval [0, 1].
 *
 * @param value Number to clamp.
 * @returns 0 for anything `Number.isFinite` rejects (NaN, ±Infinity,
 *          non-numbers), 0 for negatives, 1 for values above 1, otherwise
 *          `value` unchanged.
 */
function clamp01(value) {
  if (!Number.isFinite(value)) return 0;
  return value < 0 ? 0 : value > 1 ? 1 : value;
}
|
|
99
|
+
|
|
100
|
+
export { fuse };
|
|
101
|
+
//# sourceMappingURL=chunk-TYSRYQN7.js.map
|
|
102
|
+
//# sourceMappingURL=chunk-TYSRYQN7.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/fuse.ts"],"names":[],"mappings":";;;AAcA,IAAM,mBAAA,GAA8C;AAAA,EAClD,cAAA,EAAgB,CAAA;AAAA,EAChB,iBAAA,EAAmB,CAAA;AAAA,EACnB,WAAA,EAAa,CAAA;AAAA,EACb,cAAA,EAAgB,GAAA;AAAA,EAChB,KAAA,EAAO,GAAA;AAAA,EACP,uBAAA,EAAyB,GAAA;AAAA,EACzB,uBAAA,EAAyB,IAAA;AAAA,EACzB,gBAAA,EAAkB,IAAA;AAAA,EAClB,WAAA,EAAa;AACf,CAAA;AAKA,IAAM,+BAAe,IAAI,GAAA,CAAY,CAAC,cAAA,EAAgB,OAAA,EAAS,WAAW,CAAC,CAAA;AAI3E,IAAM,uBAAA,GAA0B,GAAA;AAEhC,IAAM,iBAAA,GAAoB,IAAA;AAC1B,IAAM,UAAA,GAAa,IAAA;AAcZ,SAAS,IAAA,CACd,QAAA,EACA,OAAA,GAAuB,EAAC,EACR;AAChB,EAAA,MAAM,OAAA,GAAU,OAAA,CAAQ,OAAA,IAAW,EAAC;AACpC,EAAA,MAAM,UAAA,GAAa,iBAAA,CAAkB,QAAA,EAAU,OAAA,CAAQ,UAAU,CAAA;AAEjE,EAAA,MAAM,MAAA,uBAAa,GAAA,EAAoB;AACvC,EAAA,KAAA,MAAW,QAAQ,UAAA,EAAY;AAC7B,IAAA,IAAI,IAAA,CAAK,aAAa,SAAA,EAAW;AACjC,IAAA,MAAM,MAAA,GACJ,OAAA,CAAQ,IAAA,CAAK,MAAM,CAAA,IAAK,OAAA,CAAQ,IAAA,CAAK,IAAI,CAAA,IAAK,mBAAA,CAAoB,IAAA,CAAK,IAAI,CAAA,IAAK,GAAA;AAClF,IAAA,MAAA,CAAO,GAAA,CAAI,IAAA,CAAK,QAAA,EAAA,CAAW,MAAA,CAAO,GAAA,CAAI,IAAA,CAAK,QAAQ,CAAA,IAAK,CAAA,IAAK,OAAA,CAAQ,IAAA,CAAK,UAAU,IAAI,MAAM,CAAA;AAAA,EAChG;AAGA,EAAA,MAAM,MAAA,GAAS,wBAAwB,UAAU,CAAA;AAEjD,EAAA,MAAM,EAAE,IAAA,EAAM,SAAA,EAAW,aAAY,GAAI,MAAA,CAAO,QAAQ,MAAM,CAAA;AAE9D,EAAA,IAAI,SAAS,IAAA,IAAQ,SAAA,GAAY,iBAAA,IAAqB,SAAA,GAAY,cAAc,UAAA,EAAY;AAG1F,IAAA,IAAI,MAAA,KAAW,IAAA,IAAQ,MAAA,CAAO,GAAA,CAAI,MAAM,CAAA,EAAG;AACzC,MAAA,MAAM,KAAA,GAAQ,MAAA,CAAO,GAAA,CAAI,MAAM,CAAA,IAAK,CAAA;AACpC,MAAA,OAAO;AAAA,QACL,QAAA,EAAU,MAAA;AAAA,QACV,UAAA,EAAY,OAAA,CAAQ,KAAA,IAAS,KAAA,GAAQ,IAAA,CAAK,CAAA;AAAA,QAC1C,QAAA,EAAU,CAAC,GAAG,UAAU;AAAA,OAC1B;AAAA,IACF;AACA,IAAA,OAAO,EAAE,QAAA,EAAU,SAAA,EAAW,UAAA,EAAY,OAAA,CAAQ,SAAS,CAAA,EAAG,QAAA,EAAU,CAAC,GAAG,UAAU,CAAA,EAAE;AAAA,EAC1F;AAEA,EAAA,OAAO;AAAA,IACL,QAAA,EAAU,IAAA;AAAA,IACV,UAAA,EAAY,OAAA,CAAQ,SAAA,IAAa,SAAA,GAAY,cAAc,IAAA,CAAK,CAAA;AAAA,IAChE,QAAA,EAAU,CAAC,GAAG,UAAU;AAAA,GAC1B;AACF;AAYA,SAAS,iBAAA,CACP,UACA,WAAA,EACoB;AACpB,EAAA,OAAO,QAAA,CAAS,GAAA,CAAI,CAAC,IAAA,KAAS;AAC5B,IAAA,IAAI,IAAA,CAAK,QAAA,KAAa,SAAA,EAAW,OAAO,IAAA;A
ACxC,IAAA,MAAM,UAAA,GAAa,cAAA,CAAe,IAAA,CAAK,QAAQ,KAAK,IAAA,CAAK,QAAA;AACzD,IAAA,IAAI,UAAA,KAAe,IAAA,CAAK,QAAA,EAAU,OAAO,IAAA;AACzC,IAAA,OAAO,EAAE,GAAG,IAAA,EAAM,QAAA,EAAU,UAAA,EAAW;AAAA,EACzC,CAAC,CAAA;AACH;AAKA,SAAS,wBAAwB,QAAA,EAAsD;AACrF,EAAA,IAAI,IAAA,GAAsB,IAAA;AAC1B,EAAA,IAAI,cAAA,GAAiB,CAAA;AACrB,EAAA,KAAA,MAAW,QAAQ,QAAA,EAAU;AAC3B,IAAA,IAAI,IAAA,CAAK,aAAa,SAAA,IAAa,CAAC,aAAa,GAAA,CAAI,IAAA,CAAK,IAAI,CAAA,EAAG;AACjE,IAAA,MAAM,CAAA,GAAI,OAAA,CAAQ,IAAA,CAAK,UAAU,CAAA;AACjC,IAAA,IAAI,IAAI,uBAAA,EAAyB;AACjC,IAAA,IAAI,IAAI,cAAA,EAAgB;AACtB,MAAA,cAAA,GAAiB,CAAA;AACjB,MAAA,IAAA,GAAO,IAAA,CAAK,QAAA;AAAA,IACd,CAAA,MAAA,IAAW,CAAA,KAAM,cAAA,IAAkB,IAAA,CAAK,aAAa,IAAA,EAAM;AAEzD,MAAA,IAAA,GAAO,IAAA;AAAA,IACT;AAAA,EACF;AACA,EAAA,OAAO,IAAA;AACT;AAQA,SAAS,MAAA,CACP,QACA,MAAA,EACiE;AACjE,EAAA,IAAI,IAAA,GAAsB,IAAA;AAC1B,EAAA,IAAI,SAAA,GAAY,CAAA;AAChB,EAAA,IAAI,WAAA,GAAc,CAAA;AAClB,EAAA,MAAM,cAAc,MAAA,KAAW,IAAA,GAAQ,OAAO,GAAA,CAAI,MAAM,KAAK,CAAA,GAAK,CAAA;AAElE,EAAA,KAAA,MAAW,CAAC,QAAA,EAAU,GAAG,CAAA,IAAK,MAAA,EAAQ;AAEpC,IAAA,MAAM,KAAA,GAAQ,WAAW,IAAA,IAAQ,QAAA,KAAa,SAAS,IAAA,CAAK,GAAA,CAAI,GAAA,EAAK,WAAW,CAAA,GAAI,GAAA;AACpF,IAAA,IAAI,QAAQ,SAAA,EAAW;AACrB,MAAA,WAAA,GAAc,SAAA;AACd,MAAA,SAAA,GAAY,KAAA;AACZ,MAAA,IAAA,GAAO,QAAA;AAAA,IACT,CAAA,MAAA,IAAW,QAAQ,WAAA,EAAa;AAC9B,MAAA,WAAA,GAAc,KAAA;AAAA,IAChB;AAAA,EACF;AAEA,EAAA,IAAI,WAAW,IAAA,IAAQ,IAAA,KAAS,UAAU,SAAA,KAAc,WAAA,IAAe,cAAc,CAAA,EAAG;AACtF,IAAA,WAAA,GAAc,SAAA;AACd,IAAA,IAAA,GAAO,MAAA;AACP,IAAA,SAAA,GAAY,WAAA;AAAA,EACd;AACA,EAAA,OAAO,EAAE,IAAA,EAAM,SAAA,EAAW,WAAA,EAAY;AACxC;AAEA,SAAS,QAAQ,KAAA,EAAuB;AACtC,EAAA,IAAI,CAAC,MAAA,CAAO,QAAA,CAAS,KAAK,GAAG,OAAO,CAAA;AACpC,EAAA,IAAI,KAAA,GAAQ,GAAG,OAAO,CAAA;AACtB,EAAA,IAAI,KAAA,GAAQ,GAAG,OAAO,CAAA;AACtB,EAAA,OAAO,KAAA;AACT","file":"chunk-TYSRYQN7.js","sourcesContent":["import type { Classification, LanguageEvidence, LanguageProfile, Weights } from \"./types.js\";\nimport { normalizeBCP47 } from \"./internal/bcp47.js\";\n\nexport interface FuseOptions {\n weights?: Weights;\n /** 
The candidate roster. When present, incoming evidence tags are normalized\n * into it (`uk-UA` → `uk`, `ua` → `uk`) so context signals (page/header\n * locale) land on the same code the text rungs use. */\n candidates?: readonly LanguageProfile[];\n}\n\n/** Default per-kind weights. Clear lexical signal (script, explicit locale)\n * outweighs contextual signal (page tags, headers). Callers override per\n * `source` id or `kind` via {@link FuseOptions.weights}. */\nconst DEFAULT_KIND_WEIGHT: Record<string, number> = {\n \"title-script\": 1,\n \"explicit-locale\": 1,\n \"chrome-ai\": 1,\n \"source-prior\": 0.7,\n franc: 0.7,\n \"http-content-language\": 0.6,\n \"meta-content-language\": 0.55,\n \"meta-og-locale\": 0.55,\n \"html-lang\": 0.5,\n};\n\n/** Evidence kinds that constitute *clear script evidence* — a verdict the text\n * classifier or an on-device model reached by actually reading the string. The\n * guard below forbids weaker page/header *context* from flipping these. */\nconst SCRIPT_KINDS = new Set<string>([\"title-script\", \"franc\", \"chrome-ai\"]);\n\n/** A script verdict this confident is treated as settled — context may add to it\n * but must not flip the winner to a different language. */\nconst SCRIPT_CONFIDENCE_FLOOR = 0.6;\n\nconst MIN_WINNING_SCORE = 0.35;\nconst MIN_MARGIN = 0.12;\n\n/**\n * Combine evidence into a single weighted verdict with an audit trail.\n *\n * Three steps:\n * 1. Normalize each item's language tag into the candidate roster (BCP-47:\n * `uk-UA`/`ua` → `uk`) so text, page, and header signals agree on a code.\n * 2. Weighted argmax over languages (caller weights override per `source`/`kind`).\n * 3. 
Apply the guard **context must never override clear script evidence**: when\n * the text classifier (or an on-device model) confidently read one language,\n * weaker page/header context for a *different* language cannot win — a\n * Ukrainian page chrome does not make a Latin/English title Ukrainian.\n */\nexport function fuse(\n evidence: readonly LanguageEvidence[],\n options: FuseOptions = {},\n): Classification {\n const weights = options.weights ?? {};\n const normalized = normalizeEvidence(evidence, options.candidates);\n\n const scores = new Map<string, number>();\n for (const item of normalized) {\n if (item.language === \"unknown\") continue;\n const weight =\n weights[item.source] ?? weights[item.kind] ?? DEFAULT_KIND_WEIGHT[item.kind] ?? 0.5;\n scores.set(item.language, (scores.get(item.language) ?? 0) + clamp01(item.confidence) * weight);\n }\n\n // The context-vs-script guard: a confident script read pins the winner.\n const pinned = confidentScriptLanguage(normalized);\n\n const { best, bestScore, secondScore } = argmax(scores, pinned);\n\n if (best === null || bestScore < MIN_WINNING_SCORE || bestScore - secondScore < MIN_MARGIN) {\n // A pinned script language still wins even on a thin margin — clear script\n // evidence is never demoted to \"unknown\" by competing context.\n if (pinned !== null && scores.has(pinned)) {\n const score = scores.get(pinned) ?? 0;\n return {\n language: pinned,\n confidence: clamp01(score / (score + 0.15)),\n evidence: [...normalized],\n };\n }\n return { language: \"unknown\", confidence: clamp01(bestScore), evidence: [...normalized] };\n }\n\n return {\n language: best,\n confidence: clamp01(bestScore / (bestScore + secondScore + 0.15)),\n evidence: [...normalized],\n };\n}\n\n/** Normalize each item's tag into the roster's code space (BCP-47-aware). Items\n * already `\"unknown\"` pass through untouched. 
Tags are BCP-47-normalized\n * (`en-US` → `en`, `ua` → `uk`) so text, page, and header signals land on the\n * same code. The normalized code is kept even when it falls outside the roster —\n * argmax simply won't favor an out-of-roster context tag, but it stays in the\n * audit trail.\n *\n * The roster is accepted (and reserved) so a future revision can fold roster\n * aliasing in without a signature change; today BCP-47 normalization alone\n * reconciles the codes the producers emit. */\nfunction normalizeEvidence(\n evidence: readonly LanguageEvidence[],\n _candidates: readonly LanguageProfile[] | undefined,\n): LanguageEvidence[] {\n return evidence.map((item) => {\n if (item.language === \"unknown\") return item;\n const normalized = normalizeBCP47(item.language) ?? item.language;\n if (normalized === item.language) return item;\n return { ...item, language: normalized };\n });\n}\n\n/** The language of a *clear script* read confident enough to pin the verdict, or\n * `null` when none qualifies. When two script reads disagree, the higher-\n * confidence one pins (a tie leaves nothing pinned — argmax decides normally). */\nfunction confidentScriptLanguage(evidence: readonly LanguageEvidence[]): string | null {\n let best: string | null = null;\n let bestConfidence = 0;\n for (const item of evidence) {\n if (item.language === \"unknown\" || !SCRIPT_KINDS.has(item.kind)) continue;\n const c = clamp01(item.confidence);\n if (c < SCRIPT_CONFIDENCE_FLOOR) continue;\n if (c > bestConfidence) {\n bestConfidence = c;\n best = item.language;\n } else if (c === bestConfidence && item.language !== best) {\n // Two equally-confident script reads for different languages — ambiguous.\n best = null;\n }\n }\n return best;\n}\n\n/**\n * Weighted argmax. When `pinned` is set (a confident script language), any\n * *other* language's score may only come from context kinds; that score is\n * capped so it can never exceed the pinned language. 
This enforces the guard\n * without discarding the context from the audit trail.\n */\nfunction argmax(\n scores: Map<string, number>,\n pinned: string | null,\n): { best: string | null; bestScore: number; secondScore: number } {\n let best: string | null = null;\n let bestScore = 0;\n let secondScore = 0;\n const pinnedScore = pinned !== null ? (scores.get(pinned) ?? 0) : 0;\n\n for (const [language, raw] of scores) {\n // Guard: a non-pinned language cannot out-score the pinned one.\n const score = pinned !== null && language !== pinned ? Math.min(raw, pinnedScore) : raw;\n if (score > bestScore) {\n secondScore = bestScore;\n bestScore = score;\n best = language;\n } else if (score > secondScore) {\n secondScore = score;\n }\n }\n // On a pinned tie (pinned capped equal to a context language), prefer pinned.\n if (pinned !== null && best !== pinned && bestScore === pinnedScore && pinnedScore > 0) {\n secondScore = bestScore;\n best = pinned;\n bestScore = pinnedScore;\n }\n return { best, bestScore, secondScore };\n}\n\nfunction clamp01(value: number): number {\n if (!Number.isFinite(value)) return 0;\n if (value < 0) return 0;\n if (value > 1) return 1;\n return value;\n}\n"]}
|
package/dist/franc.d.ts
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
import { L as LanguageProfile, S as SyncSource, a as LanguageEvidence } from './types-D4Ux-xA6.js';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Producer: the franc trigram backstop over `text`, scoped to `candidates`.
|
|
5
|
+
* Synchronous — franc itself is sync. Emits at most one `kind: "franc"` item. Returns an empty array when `text` is undefined or trims to fewer than 24 characters, when `candidates` is undefined or empty, or when scoring abstains (for example, fewer than two scoped candidates carry an `iso6393` code).
|
|
6
|
+
*/
|
|
7
|
+
declare function evidenceFromFranc(text: string | undefined, candidates: readonly LanguageProfile[] | undefined): LanguageEvidence[];
|
|
8
|
+
/**
|
|
9
|
+
* Build a franc {@link SyncSource} bound to a candidate roster, for use in
|
|
10
|
+
* `compile({ engines: [createFrancEngine(candidates)] })`. franc needs the
|
|
11
|
+
* roster to scope its `only` restriction, so it is bound at construction (the
|
|
12
|
+
* same shape `compile` uses to bind the built-in text producer).
|
|
13
|
+
*
|
|
14
|
+
* A `SyncSource` (not async): franc runs in-process and synchronously, so the
|
|
15
|
+
* compiled `detect` stays synchronous — no `await` ceremony on the hot path. The returned source has `id: "franc"`, `sync: true` and `inputs: ["text"]`; its `detect` forwards `input.text` and the bound roster to `evidenceFromFranc`.
|
|
16
|
+
*/
|
|
17
|
+
declare function createFrancEngine(candidates: readonly LanguageProfile[]): SyncSource;
|
|
18
|
+
/**
|
|
19
|
+
* A bare franc engine with no bound roster — it abstains until given candidates.
|
|
20
|
+
* Prefer {@link createFrancEngine} with your roster; this default exists so the
|
|
21
|
+
* engine has a stable named export and a no-config import works.
|
|
22
|
+
*/
|
|
23
|
+
declare const francEngine: SyncSource;
|
|
24
|
+
|
|
25
|
+
export { createFrancEngine, evidenceFromFranc, francEngine };
|
package/dist/franc.js
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
import { scopeCandidates } from './chunk-RFR5I7P7.js';
|
|
2
|
+
import { francAll } from 'franc';
|
|
3
|
+
|
|
4
|
+
// Minimum trimmed length of `text` before evidenceFromFranc will run franc at all.
var RUNG_MIN_LENGTH = 24;
// Passed to franc as its own `minLength` option.
var FRANC_MIN_LENGTH = 10;
// Upper bound on how many leading characters of `text` are handed to franc.
var DEFAULT_MAX_CHARS = 2000;
|
|
7
|
+
/**
 * Rank `text` with franc, restricted to the ISO 639-3 codes carried by the
 * `scoped` profiles, and translate the winner back to its profile `code`.
 *
 * Returns `null` when fewer than two distinct ISO 639-3 codes are available,
 * when franc yields no result or `"und"`, or when the winning tag cannot be
 * mapped back to a profile. Otherwise returns `{ language, margin }`, where
 * `margin` is the winner's score minus the runner-up's score (the runner-up
 * counts as 0 when absent).
 */
function francScore(text, scoped) {
  // ISO 639-3 tag -> profile code. A later profile with the same tag overwrites an earlier one.
  const codeByIso = /* @__PURE__ */ new Map();
  for (const profile of scoped) {
    if (profile.iso6393 === void 0) continue;
    codeByIso.set(profile.iso6393, profile.code);
  }
  if (codeByIso.size < 2) return null;

  // Only the leading DEFAULT_MAX_CHARS characters are scored.
  const ranking = francAll(text.slice(0, DEFAULT_MAX_CHARS), {
    only: [...codeByIso.keys()],
    minLength: FRANC_MIN_LENGTH
  });

  const winner = ranking[0];
  if (!winner || winner[0] === "und") return null;

  const language = codeByIso.get(winner[0]);
  if (language === void 0) return null;

  const runnerUpScore = ranking[1]?.[1] ?? 0;
  return { language, margin: winner[1] - runnerUpScore };
}
|
|
19
|
+
/**
 * Producer for the franc engine: returns zero or one `kind: "franc"` evidence item.
 *
 * Abstains (returns `[]`) when `text` is undefined or its trimmed length is
 * below RUNG_MIN_LENGTH, when `candidates` is undefined or empty, or when
 * `francScore` yields no verdict.
 */
function evidenceFromFranc(text, candidates) {
  const hasEnoughText = text !== void 0 && text.trim().length >= RUNG_MIN_LENGTH;
  if (!hasEnoughText) return [];

  const hasRoster = candidates !== void 0 && candidates.length !== 0;
  if (!hasRoster) return [];

  // Narrow the roster for this text first, then let franc pick among what is left.
  const verdict = francScore(text, scopeCandidates(text, candidates));
  if (verdict === null) return [];

  const { language, margin } = verdict;
  const item = {
    kind: "franc",
    language,
    // Map the score gap onto a confidence of 0.4 + margin * 0.5, clamped to [0, 1].
    confidence: clamp01(0.4 + margin * 0.5),
    source: "franc",
    value: language
  };
  return [item];
}
|
|
36
|
+
/**
 * Build a synchronous franc source whose `detect` is bound to `candidates`.
 * The source declares `text` as its only required input.
 */
function createFrancEngine(candidates) {
  const detect = (input) => evidenceFromFranc(input.text, candidates);
  return { id: "franc", sync: true, inputs: ["text"], detect };
}
|
|
44
|
+
// Roster-less franc source: its `detect` passes `undefined` as the candidate
// list, so evidenceFromFranc always returns no evidence for it.
var francEngine = createFrancEngine(void 0);
|
|
50
|
+
/**
 * Clamp `value` into the closed interval [0, 1].
 * Non-finite input (NaN, +Infinity, -Infinity) maps to 0.
 */
function clamp01(value) {
  if (!Number.isFinite(value)) return 0;
  return value < 0 ? 0 : value > 1 ? 1 : value;
}
|
|
56
|
+
|
|
57
|
+
export { createFrancEngine, evidenceFromFranc, francEngine };
|
|
58
|
+
//# sourceMappingURL=franc.js.map
|
|
59
|
+
//# sourceMappingURL=franc.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/franc.ts"],"names":[],"mappings":";;;AAmBA,IAAM,eAAA,GAAkB,EAAA;AAExB,IAAM,gBAAA,GAAmB,EAAA;AAEzB,IAAM,iBAAA,GAAoB,GAAA;AAQ1B,SAAS,UAAA,CACP,MACA,MAAA,EAC6C;AAC7C,EAAA,MAAM,KAAA,uBAAY,GAAA,EAAoB;AACtC,EAAA,KAAA,MAAW,CAAA,IAAK,MAAA,EAAQ,IAAI,CAAA,CAAE,OAAA,KAAY,MAAA,EAAW,KAAA,CAAM,GAAA,CAAI,CAAA,CAAE,OAAA,EAAS,CAAA,CAAE,IAAI,CAAA;AAChF,EAAA,IAAI,KAAA,CAAM,IAAA,GAAO,CAAA,EAAG,OAAO,IAAA;AAC3B,EAAA,MAAM,MAAA,GAAS,IAAA,CAAK,KAAA,CAAM,CAAA,EAAG,iBAAiB,CAAA;AAC9C,EAAA,MAAM,MAAA,GAAS,QAAA,CAAS,MAAA,EAAQ,EAAE,IAAA,EAAM,CAAC,GAAG,KAAA,CAAM,IAAA,EAAM,CAAA,EAAG,SAAA,EAAW,kBAAkB,CAAA;AACxF,EAAA,MAAM,GAAA,GAAM,OAAO,CAAC,CAAA;AACpB,EAAA,IAAI,CAAC,GAAA,IAAO,GAAA,CAAI,CAAC,CAAA,KAAM,OAAO,OAAO,IAAA;AACrC,EAAA,MAAM,QAAA,GAAW,KAAA,CAAM,GAAA,CAAI,GAAA,CAAI,CAAC,CAAC,CAAA;AACjC,EAAA,IAAI,QAAA,KAAa,QAAW,OAAO,IAAA;AACnC,EAAA,OAAO,EAAE,QAAA,EAAU,MAAA,EAAQ,GAAA,CAAI,CAAC,CAAA,IAAK,MAAA,CAAO,CAAC,CAAA,GAAI,CAAC,CAAA,IAAK,CAAA,CAAA,EAAG;AAC5D;AAMO,SAAS,iBAAA,CACd,MACA,UAAA,EACoB;AACpB,EAAA,IAAI,IAAA,KAAS,UAAa,IAAA,CAAK,IAAA,GAAO,MAAA,GAAS,eAAA,SAAwB,EAAC;AACxE,EAAA,IAAI,eAAe,MAAA,IAAa,UAAA,CAAW,MAAA,KAAW,CAAA,SAAU,EAAC;AAEjE,EAAA,MAAM,MAAA,GAAS,eAAA,CAAgB,IAAA,EAAM,UAAU,CAAA;AAC/C,EAAA,MAAM,MAAA,GAAS,UAAA,CAAW,IAAA,EAAM,MAAM,CAAA;AACtC,EAAA,IAAI,MAAA,KAAW,IAAA,EAAM,OAAO,EAAC;AAE7B,EAAA,OAAO;AAAA,IACL;AAAA,MACE,IAAA,EAAM,OAAA;AAAA,MACN,UAAU,MAAA,CAAO,QAAA;AAAA;AAAA,MAEjB,UAAA,EAAY,OAAA,CAAQ,GAAA,GAAM,MAAA,CAAO,SAAS,GAAG,CAAA;AAAA,MAC7C,MAAA,EAAQ,OAAA;AAAA,MACR,OAAO,MAAA,CAAO;AAAA;AAChB,GACF;AACF;AAWO,SAAS,kBAAkB,UAAA,EAAoD;AACpF,EAAA,OAAO;AAAA,IACL,EAAA,EAAI,OAAA;AAAA,IACJ,IAAA,EAAM,IAAA;AAAA,IACN,MAAA,EAAQ,CAAC,MAAM,CAAA;AAAA,IACf,QAAQ,CAAC,KAAA,KAAU,iBAAA,CAAkB,KAAA,CAAM,MAAM,UAAU;AAAA,GAC7D;AACF;AAOO,IAAM,WAAA,GAA0B;AAAA,EACrC,EAAA,EAAI,OAAA;AAAA,EACJ,IAAA,EAAM,IAAA;AAAA,EACN,MAAA,EAAQ,CAAC,MAAM,CAAA;AAAA,EACf,QAAQ,CAAC,KAAA,KAAU,iBAAA,CAAkB,KAAA,CAAM,MAAM,MAAS;AAC5D;AAEA,SAAS,QAAQ,KAAA,EAAuB;AACtC,EAAA,IAAI,CAAC,MAAA,CAAO,QAAA,CAAS,KAAK,GAAG,OAAO,CAAA;AACpC,E
AAA,IAAI,KAAA,GAAQ,GAAG,OAAO,CAAA;AACtB,EAAA,IAAI,KAAA,GAAQ,GAAG,OAAO,CAAA;AACtB,EAAA,OAAO,KAAA;AACT","file":"franc.js","sourcesContent":["/**\n * `langtell/franc` — the opt-in franc engine (trigram-based detection, ~187\n * languages). Importing this module statically pulls `franc` and its trigram\n * tables, so it lives behind its own subpath; the zero-dependency core never\n * imports it (enforced by an ESLint boundary rule). `franc` is declared as an\n * OPTIONAL peer dependency — install it only if you use this engine.\n *\n * The engine is a candidate-relative *backstop*: franc is scoped to the\n * candidates' ISO 639-3 codes (`only`), runs only on text past a length floor\n * where trigrams are reliable, and emits `kind: \"franc\"` evidence with franc's\n * own score-gap as the confidence. It abstains (emits nothing) when fewer than\n * two candidates carry an `iso6393`, when franc returns `und`, or when the\n * sample is too short.\n */\nimport { francAll } from \"franc\";\nimport type { LanguageEvidence, LanguageProfile, SyncSource } from \"./types.js\";\nimport { scopeCandidates } from \"./internal/classify.js\";\n\n/** Minimum sample length, in characters. Below this trigrams are too noisy. */\nconst RUNG_MIN_LENGTH = 24;\n/** Floor franc itself uses to bail to `und` rather than guess. */\nconst FRANC_MIN_LENGTH = 10;\n/** Cap on text length sent to franc (longer adds cost, not accuracy). */\nconst DEFAULT_MAX_CHARS = 2000;\n\n/**\n * Run franc scoped to the candidates' ISO 639-3 codes, mapping the winner back\n * to its BCP-47 code. Returns `null` when fewer than two candidates carry an\n * `iso6393` code or franc abstains (`und`). 
The margin is franc's own score-gap\n * (top1 − top2, 0..1).\n */\nfunction francScore(\n text: string,\n scoped: readonly LanguageProfile[],\n): { language: string; margin: number } | null {\n const byIso = new Map<string, string>();\n for (const c of scoped) if (c.iso6393 !== undefined) byIso.set(c.iso6393, c.code);\n if (byIso.size < 2) return null;\n const sample = text.slice(0, DEFAULT_MAX_CHARS);\n const ranked = francAll(sample, { only: [...byIso.keys()], minLength: FRANC_MIN_LENGTH });\n const top = ranked[0];\n if (!top || top[0] === \"und\") return null;\n const language = byIso.get(top[0]);\n if (language === undefined) return null;\n return { language, margin: top[1] - (ranked[1]?.[1] ?? 0) };\n}\n\n/**\n * Producer: the franc trigram backstop over `text`, scoped to `candidates`.\n * Synchronous — franc itself is sync. Emits at most one `kind: \"franc\"` item.\n */\nexport function evidenceFromFranc(\n text: string | undefined,\n candidates: readonly LanguageProfile[] | undefined,\n): LanguageEvidence[] {\n if (text === undefined || text.trim().length < RUNG_MIN_LENGTH) return [];\n if (candidates === undefined || candidates.length === 0) return [];\n\n const scoped = scopeCandidates(text, candidates);\n const scored = francScore(text, scoped);\n if (scored === null) return [];\n\n return [\n {\n kind: \"franc\",\n language: scored.language,\n // franc's score-gap is 0..1; lift it into a usable confidence band.\n confidence: clamp01(0.4 + scored.margin * 0.5),\n source: \"franc\",\n value: scored.language,\n },\n ];\n}\n\n/**\n * Build a franc {@link SyncSource} bound to a candidate roster, for use in\n * `compile({ engines: [createFrancEngine(candidates)] })`. 
franc needs the\n * roster to scope its `only` restriction, so it is bound at construction (the\n * same shape `compile` uses to bind the built-in text producer).\n *\n * A `SyncSource` (not async): franc runs in-process and synchronously, so the\n * compiled `detect` stays synchronous — no `await` ceremony on the hot path.\n */\nexport function createFrancEngine(candidates: readonly LanguageProfile[]): SyncSource {\n return {\n id: \"franc\",\n sync: true,\n inputs: [\"text\"],\n detect: (input) => evidenceFromFranc(input.text, candidates),\n };\n}\n\n/**\n * A bare franc engine with no bound roster — it abstains until given candidates.\n * Prefer {@link createFrancEngine} with your roster; this default exists so the\n * engine has a stable named export and a no-config import works.\n */\nexport const francEngine: SyncSource = {\n id: \"franc\",\n sync: true,\n inputs: [\"text\"],\n detect: (input) => evidenceFromFranc(input.text, undefined),\n};\n\nfunction clamp01(value: number): number {\n if (!Number.isFinite(value)) return 0;\n if (value < 0) return 0;\n if (value > 1) return 1;\n return value;\n}\n"]}
|
package/dist/fuse.d.ts
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import { W as Weights, L as LanguageProfile, a as LanguageEvidence, C as Classification } from './types-D4Ux-xA6.js';
|
|
2
|
+
|
|
3
|
+
interface FuseOptions {
|
|
4
|
+
weights?: Weights;
|
|
5
|
+
/** The candidate roster. When present, incoming evidence tags are normalized
|
|
6
|
+
* into it (`uk-UA` → `uk`, `ua` → `uk`) so context signals (page/header
|
|
7
|
+
* locale) land on the same code the text rungs use. */
|
|
8
|
+
candidates?: readonly LanguageProfile[];
|
|
9
|
+
}
|
|
10
|
+
/**
|
|
11
|
+
* Combine evidence into a single weighted verdict with an audit trail.
|
|
12
|
+
*
|
|
13
|
+
* Three steps:
|
|
14
|
+
* 1. Normalize each item's language tag into the candidate roster (BCP-47:
|
|
15
|
+
* `uk-UA`/`ua` → `uk`) so text, page, and header signals agree on a code.
|
|
16
|
+
* 2. Weighted argmax over languages (caller weights override per `source`/`kind`).
|
|
17
|
+
* 3. Apply the guard **context must never override clear script evidence**: when
|
|
18
|
+
* the text classifier (or an on-device model) confidently read one language,
|
|
19
|
+
* weaker page/header context for a *different* language cannot win — a
|
|
20
|
+
* Ukrainian page chrome does not make a Latin/English title Ukrainian.
|
|
21
|
+
*/
|
|
22
|
+
declare function fuse(evidence: readonly LanguageEvidence[], options?: FuseOptions): Classification;
|
|
23
|
+
|
|
24
|
+
export { type FuseOptions, fuse };
|
package/dist/fuse.js
ADDED
package/dist/fuse.js.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":[],"names":[],"mappings":"","file":"fuse.js"}
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
import { g as HeaderBag, a as LanguageEvidence } from './types-D4Ux-xA6.js';
|
|
2
|
+
|
|
3
|
+
/** Producer: the HTTP `Content-Language` response header. */
|
|
4
|
+
declare function evidenceFromHeaders(headers: HeaderBag | undefined): LanguageEvidence[];
|
|
5
|
+
|
|
6
|
+
export { evidenceFromHeaders };
|
package/dist/headers.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":[],"names":[],"mappings":"","file":"headers.js"}
|
package/dist/html.d.ts
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import { a as LanguageEvidence } from './types-D4Ux-xA6.js';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Producer: language clues from an HTML string's metadata.
|
|
5
|
+
*
|
|
6
|
+
* Reads three independent declarations, each emitted as its own evidence item
|
|
7
|
+
* (the fuser weighs them):
|
|
8
|
+
* - `<html lang>` → `html-lang`
|
|
9
|
+
* - `<meta http-equiv="content-language">` → `meta-content-language`
|
|
10
|
+
* - `<meta property="og:locale">` → `meta-og-locale`
|
|
11
|
+
*
|
|
12
|
+
* All tags are BCP-47-normalized (`uk-UA` → `uk`, `en_US` → `en`). Sync and
|
|
13
|
+
* zero-dependency — regex extraction only, never a DOM parse.
|
|
14
|
+
*/
|
|
15
|
+
declare function evidenceFromHtml(html: string | undefined): LanguageEvidence[];
|
|
16
|
+
|
|
17
|
+
export { evidenceFromHtml };
|
package/dist/html.js
ADDED
package/dist/html.js.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":[],"names":[],"mappings":"","file":"html.js"}
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
import { E as EvidenceSource, D as DetectorConfig, b as DetectFn } from './types-D4Ux-xA6.js';
|
|
2
|
+
export { A as AsyncSource, C as Classification, c as DetectContext, d as DetectInput, e as EarlyExit, f as EvidenceKind, H as HasAsync, g as HeaderBag, h as LanguageCode, a as LanguageEvidence, L as LanguageProfile, i as SourceInput, S as SyncSource, W as Weights } from './types-D4Ux-xA6.js';
|
|
3
|
+
export { FuseOptions, fuse } from './fuse.js';
|
|
4
|
+
export { evidenceFromText } from './text.js';
|
|
5
|
+
export { evidenceFromHtml } from './html.js';
|
|
6
|
+
export { evidenceFromHeaders } from './headers.js';
|
|
7
|
+
|
|
8
|
+
/**
|
|
9
|
+
* Build a configured detector. Does the per-roster setup once and returns a
|
|
10
|
+
* `detect` function whose sync/async shape is fixed by the registered engines
|
|
11
|
+
* (see {@link DetectFn}). The built-in producers are always registered; opt-in
|
|
12
|
+
* engines (franc, chrome-ai) are added via `config.engines`.
|
|
13
|
+
*/
|
|
14
|
+
declare function compile<const E extends readonly EvidenceSource[] = []>(config?: DetectorConfig<E>): DetectFn<E>;
|
|
15
|
+
|
|
16
|
+
/**
|
|
17
|
+
* BCP-47 / language-code normalization.
|
|
18
|
+
*
|
|
19
|
+
* Two entry points with deliberately different strictness:
|
|
20
|
+
* - {@link normalizeBCP47} — for inputs documented to be BCP-47 (`<html lang>`,
|
|
21
|
+
* hreflang, `Content-Language`): try the full string, then strip a
|
|
22
|
+
* region/script suffix (`en-US` → `en`, `zh_CN` → `zh`).
|
|
23
|
+
* - {@link normalizeLanguageCode} — strict exact-match only, for free-text
|
|
24
|
+
* contexts (URL slugs, link text) where a hyphen split could be a coincidence.
|
|
25
|
+
*
|
|
26
|
+
* Both resolve aliases that appear in the wild (`ua` → `uk`, `rus` → `ru`,
|
|
27
|
+
* localized picker phrases) to a canonical ISO 639-1 code.
|
|
28
|
+
*/
|
|
29
|
+
/**
|
|
30
|
+
* Strict, exact-match lookup. Returns `null` for unknown inputs and does NOT
|
|
31
|
+
* fall back to a hyphen prefix. Use anywhere a hyphen split could be a
|
|
32
|
+
* coincidence — URL path segments (`/ru-return-warranty`), title attrs, link
|
|
33
|
+
* text. The phrase aliases (`по-русски`, `in english`) are in the table
|
|
34
|
+
* directly, so exact lookup still finds them.
|
|
35
|
+
*/
|
|
36
|
+
declare function normalizeLanguageCode(input: string | undefined | null): string | null;
|
|
37
|
+
/**
|
|
38
|
+
* BCP-47-aware normalization: try the full string first, then strip a
|
|
39
|
+
* region/script suffix (`en-US` → `en`, `zh_CN` → `zh`). Use ONLY for inputs
|
|
40
|
+
* documented to be BCP-47 — `hreflang`, `<html lang>`, `Content-Language`,
|
|
41
|
+
* `data-lang`/`data-locale` — never for free-text URL slugs.
|
|
42
|
+
*
|
|
43
|
+
* Falls back to the raw primary subtag when no alias matches, so a code outside
|
|
44
|
+
* the alias table (e.g. `pt-BR` → `pt`) still resolves to its language. The
|
|
45
|
+
* roster decides relevance downstream.
|
|
46
|
+
*/
|
|
47
|
+
declare function normalizeBCP47(input: string | undefined | null): string | null;
|
|
48
|
+
/**
|
|
49
|
+
* Extract the primary subtag from a BCP-47-ish value, lowercased, then resolve
|
|
50
|
+
* it through the alias table (`ua` → `uk`). Handles `Accept-Language`-style
|
|
51
|
+
* comma lists (`en-US,en;q=0.9` → `en`). Returns `null` for empty/nullish.
|
|
52
|
+
*
|
|
53
|
+
* This is the header/HTML extraction helper: it tolerates the messy shapes those
|
|
54
|
+
* sources carry (comma lists, `q` weights) where {@link normalizeBCP47} expects
|
|
55
|
+
* a single tag.
|
|
56
|
+
*/
|
|
57
|
+
declare function primarySubtag(value: string | undefined | null): string | null;
|
|
58
|
+
|
|
59
|
+
export { DetectFn, DetectorConfig, EvidenceSource, compile, normalizeBCP47, normalizeLanguageCode, primarySubtag };
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
import { evidenceFromText } from './chunk-3SO2WI75.js';
|
|
2
|
+
export { evidenceFromText } from './chunk-3SO2WI75.js';
|
|
3
|
+
import { evidenceFromHtml } from './chunk-KI4MAI3N.js';
|
|
4
|
+
export { evidenceFromHtml } from './chunk-KI4MAI3N.js';
|
|
5
|
+
import { evidenceFromHeaders } from './chunk-3LDE35U2.js';
|
|
6
|
+
export { evidenceFromHeaders } from './chunk-3LDE35U2.js';
|
|
7
|
+
import { fuse } from './chunk-TYSRYQN7.js';
|
|
8
|
+
export { fuse } from './chunk-TYSRYQN7.js';
|
|
9
|
+
export { normalizeBCP47, normalizeLanguageCode, primarySubtag } from './chunk-OVSPOZ5J.js';
|
|
10
|
+
import './chunk-RFR5I7P7.js';
|
|
11
|
+
|
|
12
|
+
// src/compile.ts
|
|
13
|
+
/**
 * The three built-in synchronous sources, in order: `text`, `html`, `headers`.
 * Only the text source receives the `candidates` roster; the html and headers
 * sources read just their own input field.
 */
function builtIns(candidates) {
  const textSource = {
    id: "text",
    sync: true,
    inputs: ["text"],
    detect: (input) => evidenceFromText(input.text, candidates)
  };
  const htmlSource = {
    id: "html",
    sync: true,
    inputs: ["html"],
    detect: (input) => evidenceFromHtml(input.html)
  };
  const headersSource = {
    id: "headers",
    sync: true,
    inputs: ["headers"],
    detect: (input) => evidenceFromHeaders(input.headers)
  };
  return [textSource, htmlSource, headersSource];
}
|
|
30
|
+
/**
 * True when every input key the source declares is present on `input`.
 * "Present" means not `undefined`; `null` and other falsy values count as present.
 */
function applicable(source, input) {
  const hasMissingInput = source.inputs.some((key) => input[key] === void 0);
  return !hasMissingInput;
}
|
|
33
|
+
/**
 * Build a configured detector.
 *
 * Sources are the built-in producers followed by `config.engines`. If every
 * source is marked `sync`, the returned `detect(input)` is synchronous.
 * Otherwise it is `async detect(input, ctx)`: sync sources run inline, async
 * sources run concurrently, and a rejected async source contributes no
 * evidence instead of failing the whole detection.
 *
 * Both shapes hand the same `{ weights, candidates }` to `fuse`, so evidence
 * is fused against the configured roster whether or not an async engine is
 * registered.
 */
function compile(config = {}) {
  const sources = [...builtIns(config.candidates), ...config.engines ?? []];
  const hasAsync = sources.some((source) => !source.sync);
  const weights = config.weights;
  const candidates = config.candidates;
  if (!hasAsync) {
    const detect2 = (input) => {
      const evidence = [];
      for (const source of sources) {
        if (source.sync && applicable(source, input)) evidence.push(...source.detect(input));
      }
      return fuse(evidence, { weights, candidates });
    };
    return detect2;
  }
  const detect = async (input, ctx = {}) => {
    const evidence = [];
    const pending = [];
    for (const source of sources) {
      if (!applicable(source, input)) continue;
      if (source.sync) evidence.push(...source.detect(input));
      // Best effort: a failing async engine yields no evidence rather than rejecting detect().
      else pending.push(Promise.resolve(source.detect(input, ctx)).catch(() => []));
    }
    for (const batch of await Promise.all(pending)) evidence.push(...batch);
    // Pass the roster here too; the async path previously omitted `candidates`,
    // unlike the sync path above.
    return fuse(evidence, { weights, candidates });
  };
  return detect;
}
|
|
61
|
+
|
|
62
|
+
export { compile };
|
|
63
|
+
//# sourceMappingURL=index.js.map
|
|
64
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/compile.ts"],"names":["detect"],"mappings":";;;;;;;;;;;;AAmBA,SAAS,SAAS,UAAA,EAAkE;AAClF,EAAA,OAAO;AAAA,IACL;AAAA,MACE,EAAA,EAAI,MAAA;AAAA,MACJ,IAAA,EAAM,IAAA;AAAA,MACN,MAAA,EAAQ,CAAC,MAAM,CAAA;AAAA,MACf,QAAQ,CAAC,CAAA,KAAM,gBAAA,CAAiB,CAAA,CAAE,MAAM,UAAU;AAAA,KACpD;AAAA,IACA,EAAE,EAAA,EAAI,MAAA,EAAQ,IAAA,EAAM,MAAM,MAAA,EAAQ,CAAC,MAAM,CAAA,EAAG,QAAQ,CAAC,CAAA,KAAM,gBAAA,CAAiB,CAAA,CAAE,IAAI,CAAA,EAAE;AAAA,IACpF;AAAA,MACE,EAAA,EAAI,SAAA;AAAA,MACJ,IAAA,EAAM,IAAA;AAAA,MACN,MAAA,EAAQ,CAAC,SAAS,CAAA;AAAA,MAClB,MAAA,EAAQ,CAAC,CAAA,KAAM,mBAAA,CAAoB,EAAE,OAAO;AAAA;AAC9C,GACF;AACF;AAGA,SAAS,UAAA,CAAW,QAAwB,KAAA,EAA6B;AACvE,EAAA,OAAO,MAAA,CAAO,OAAO,KAAA,CAAM,CAAC,QAAQ,KAAA,CAAM,GAAG,MAAM,MAAS,CAAA;AAC9D;AAQO,SAAS,OAAA,CACd,MAAA,GAA4B,EAAC,EAChB;AACb,EAAA,MAAM,OAAA,GAA4B,CAAC,GAAG,QAAA,CAAS,MAAA,CAAO,UAAU,CAAA,EAAG,GAAI,MAAA,CAAO,OAAA,IAAW,EAAG,CAAA;AAC5F,EAAA,MAAM,WAAW,OAAA,CAAQ,IAAA,CAAK,CAAC,MAAA,KAAW,CAAC,OAAO,IAAI,CAAA;AACtD,EAAA,MAAM,UAAU,MAAA,CAAO,OAAA;AACvB,EAAA,MAAM,aAAa,MAAA,CAAO,UAAA;AAE1B,EAAA,IAAI,CAAC,QAAA,EAAU;AACb,IAAA,MAAMA,OAAAA,GAAS,CAAC,KAAA,KAAuC;AACrD,MAAA,MAAM,WAA+B,EAAC;AACtC,MAAA,KAAA,MAAW,UAAU,OAAA,EAAS;AAC5B,QAAA,IAAI,MAAA,CAAO,IAAA,IAAQ,UAAA,CAAW,MAAA,EAAQ,KAAK,CAAA,EAAG,QAAA,CAAS,IAAA,CAAK,GAAG,MAAA,CAAO,MAAA,CAAO,KAAK,CAAC,CAAA;AAAA,MACrF;AACA,MAAA,OAAO,IAAA,CAAK,QAAA,EAAU,EAAE,OAAA,EAAS,YAAY,CAAA;AAAA,IAC/C,CAAA;AACA,IAAA,OAAOA,OAAAA;AAAA,EACT;AAEA,EAAA,MAAM,MAAA,GAAS,OAAO,KAAA,EAAoB,GAAA,GAAqB,EAAC,KAA+B;AAC7F,IAAA,MAAM,WAA+B,EAAC;AACtC,IAAA,MAAM,UAAyC,EAAC;AAChD,IAAA,KAAA,MAAW,UAAU,OAAA,EAAS;AAC5B,MAAA,IAAI,CAAC,UAAA,CAAW,MAAA,EAAQ,KAAK,CAAA,EAAG;AAChC,MAAA,IAAI,MAAA,CAAO,MAAM,QAAA,CAAS,IAAA,CAAK,GAAG,MAAA,CAAO,MAAA,CAAO,KAAK,CAAC,CAAA;AAAA,WACjD,OAAA,CAAQ,IAAA,CAAK,OAAA,CAAQ,OAAA,CAAQ,OAAO,MAAA,CAAO,KAAA,EAAO,GAAG,CAAC,CAAA,CAAE,KAAA,CAAM,MAAM,EAAE,CAAC,CAAA;AAAA,IAC9E;AACA,IAAA,KAAA,MAAW,KAAA,IAAS,MAAM,OAAA,CAAQ,GAAA,CAAI,OAAO,CAAA,EAAG,QAAA,CAAS,IAAA,CAAK,GAAG,KAAK,CAAA;AACtE,IAAA,OAAO,IAAA,CAAK,QAAA,EAAU,EAAE
,OAAA,EAAS,CAAA;AAAA,EACnC,CAAA;AACA,EAAA,OAAO,MAAA;AACT","file":"index.js","sourcesContent":["import { evidenceFromHeaders } from \"./headers.js\";\nimport { evidenceFromHtml } from \"./html.js\";\nimport { evidenceFromText } from \"./text.js\";\nimport { fuse } from \"./fuse.js\";\nimport type {\n Classification,\n DetectContext,\n DetectFn,\n DetectInput,\n DetectorConfig,\n EvidenceSource,\n LanguageEvidence,\n LanguageProfile,\n SyncSource,\n} from \"./types.js\";\n\n/** The always-on, zero-dependency producers. The text producer is bound to the\n * configured candidate roster so its scoring is roster-relative (and so it\n * abstains when no roster was supplied — its signals need candidates). */\nfunction builtIns(candidates: readonly LanguageProfile[] | undefined): SyncSource[] {\n return [\n {\n id: \"text\",\n sync: true,\n inputs: [\"text\"],\n detect: (i) => evidenceFromText(i.text, candidates),\n },\n { id: \"html\", sync: true, inputs: [\"html\"], detect: (i) => evidenceFromHtml(i.html) },\n {\n id: \"headers\",\n sync: true,\n inputs: [\"headers\"],\n detect: (i) => evidenceFromHeaders(i.headers),\n },\n ];\n}\n\n/** Run a source only when every input it declares is present. */\nfunction applicable(source: EvidenceSource, input: DetectInput): boolean {\n return source.inputs.every((key) => input[key] !== undefined);\n}\n\n/**\n * Build a configured detector. Does the per-roster setup once and returns a\n * `detect` function whose sync/async shape is fixed by the registered engines\n * (see {@link DetectFn}). The built-in producers are always registered; opt-in\n * engines (franc, chrome-ai) are added via `config.engines`.\n */\nexport function compile<const E extends readonly EvidenceSource[] = []>(\n config: DetectorConfig<E> = {},\n): DetectFn<E> {\n const sources: EvidenceSource[] = [...builtIns(config.candidates), ...(config.engines ?? 
[])];\n const hasAsync = sources.some((source) => !source.sync);\n const weights = config.weights;\n const candidates = config.candidates;\n\n if (!hasAsync) {\n const detect = (input: DetectInput): Classification => {\n const evidence: LanguageEvidence[] = [];\n for (const source of sources) {\n if (source.sync && applicable(source, input)) evidence.push(...source.detect(input));\n }\n return fuse(evidence, { weights, candidates });\n };\n return detect as DetectFn<E>;\n }\n\n const detect = async (input: DetectInput, ctx: DetectContext = {}): Promise<Classification> => {\n const evidence: LanguageEvidence[] = [];\n const pending: Promise<LanguageEvidence[]>[] = [];\n for (const source of sources) {\n if (!applicable(source, input)) continue;\n if (source.sync) evidence.push(...source.detect(input));\n else pending.push(Promise.resolve(source.detect(input, ctx)).catch(() => []));\n }\n for (const batch of await Promise.all(pending)) evidence.push(...batch);\n return fuse(evidence, { weights });\n };\n return detect as DetectFn<E>;\n}\n"]}
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
import { h as LanguageCode, L as LanguageProfile } from './types-D4Ux-xA6.js';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* `langtell/profiles` — ready-to-use {@link LanguageProfile} data.
|
|
5
|
+
*
|
|
6
|
+
* This is the heavy DATA half of the library: alphabets, curated function-word
|
|
7
|
+
* lists, and corpus-frequent word lists. It is deliberately kept behind its own
|
|
8
|
+
* subpath, OUT of the zero-dependency core, so `import { compile } from
|
|
9
|
+
* "langtell"` never drags the word corpora into a bundle that only needs the
|
|
10
|
+
* script/letter rungs. Pass these into `compile({ candidates: [...] })`.
|
|
11
|
+
*
|
|
12
|
+
* Each profile is declarative and auditable:
|
|
13
|
+
* - `alphabet` — the language's lowercased alphabet (raw; distinctiveness
|
|
14
|
+
* is computed at runtime per candidate set).
|
|
15
|
+
* - `marks` — orthographic marks that count as rung-1 evidence but
|
|
16
|
+
* are not alphabet letters (the intra-word apostrophe).
|
|
17
|
+
* - `words.function` — curated grammatical markers, hand-verified.
|
|
18
|
+
* - `words.frequent` — common everyday words from a subtitle-frequency corpus.
|
|
19
|
+
* - `iso6393` — ISO 639-3 code for the optional franc engine.
|
|
20
|
+
*
|
|
21
|
+
* Curation rule for `function`: a token may appear in exactly one candidate's
|
|
22
|
+
* list ONLY if that form is genuinely used by only that language among those we
|
|
23
|
+
* support. Shared forms must be in every list that uses them (set-difference
|
|
24
|
+
* then cancels them) or omitted from all. When in doubt, omit: a missing marker
|
|
25
|
+
* only costs recall.
|
|
26
|
+
*/
|
|
27
|
+
|
|
28
|
+
declare const uk: LanguageProfile;
|
|
29
|
+
declare const ru: LanguageProfile;
|
|
30
|
+
declare const be: LanguageProfile;
|
|
31
|
+
declare const bg: LanguageProfile;
|
|
32
|
+
declare const en: LanguageProfile;
|
|
33
|
+
|
|
34
|
+
/** Registry of shipped profiles, keyed by BCP-47 code. */
|
|
35
|
+
declare const PROFILES: Readonly<Record<LanguageCode, LanguageProfile>>;
|
|
36
|
+
/** Resolve profiles for the given codes, skipping any without a shipped profile. */
|
|
37
|
+
declare function getProfiles(codes: readonly LanguageCode[]): LanguageProfile[];
|
|
38
|
+
|
|
39
|
+
export { PROFILES, be, bg, en, getProfiles, ru, uk };
|