langtell 0.0.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +67 -26
- package/dist/chrome-ai.d.ts +29 -0
- package/dist/chrome-ai.js +71 -0
- package/dist/chrome-ai.js.map +1 -0
- package/dist/chunk-3LDE35U2.js +36 -0
- package/dist/chunk-3LDE35U2.js.map +1 -0
- package/dist/chunk-7G3MEXWK.js +109 -0
- package/dist/chunk-7G3MEXWK.js.map +1 -0
- package/dist/chunk-KI4MAI3N.js +27 -0
- package/dist/chunk-KI4MAI3N.js.map +1 -0
- package/dist/chunk-NCGZPEDA.js +131 -0
- package/dist/chunk-NCGZPEDA.js.map +1 -0
- package/dist/chunk-OVSPOZ5J.js +115 -0
- package/dist/chunk-OVSPOZ5J.js.map +1 -0
- package/dist/chunk-PT7R2BRQ.js +35 -0
- package/dist/chunk-PT7R2BRQ.js.map +1 -0
- package/dist/classify.d.ts +63 -0
- package/dist/classify.js +3 -0
- package/dist/classify.js.map +1 -0
- package/dist/franc.d.ts +25 -0
- package/dist/franc.js +59 -0
- package/dist/franc.js.map +1 -0
- package/dist/fuse.d.ts +30 -0
- package/dist/fuse.js +4 -0
- package/dist/fuse.js.map +1 -0
- package/dist/headers.d.ts +6 -0
- package/dist/headers.js +4 -0
- package/dist/headers.js.map +1 -0
- package/dist/html.d.ts +17 -0
- package/dist/html.js +4 -0
- package/dist/html.js.map +1 -0
- package/dist/index.d.ts +60 -0
- package/dist/index.js +67 -0
- package/dist/index.js.map +1 -0
- package/dist/profiles.d.ts +47 -0
- package/dist/profiles.js +1030 -0
- package/dist/profiles.js.map +1 -0
- package/dist/text.d.ts +22 -0
- package/dist/text.js +4 -0
- package/dist/text.js.map +1 -0
- package/dist/types-BIXrkuAr.d.ts +120 -0
- package/package.json +104 -3
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/internal/classify.ts"],"names":[],"mappings":";AA0BO,IAAM,UAAA,GAAa;AA0B1B,IAAM,OAAA,GAA0B;AAAA,EAC9B,QAAA,EAAU,SAAA;AAAA,EACV,MAAA,EAAQ,CAAA;AAAA,EACR,IAAA,EAAM,IAAA;AAAA,EACN,cAAA,EAAgB;AAClB,CAAA;AAYA,IAAM,WAAA,GAAc,sBAAA;AACpB,IAAM,QAAA,GAAW,mBAAA;AAkBjB,IAAM,cAAA,GAAoC;AAAA,EACxC,oBAAA;AAAA;AAAA,EACA,cAAA;AAAA;AAAA,EACA,2CAAA;AAAA;AAAA,EACA;AAAA;AACF,CAAA;AAIO,SAAS,WAAW,IAAA,EAAsB;AAC/C,EAAA,IAAI,GAAA,GAAM,IAAA;AACV,EAAA,KAAA,MAAW,MAAM,cAAA,EAAgB,GAAA,GAAM,GAAA,CAAI,OAAA,CAAQ,IAAI,GAAG,CAAA;AAC1D,EAAA,OAAO,GAAA;AACT;AAKA,SAAS,eAAe,IAAA,EAA2C;AACjE,EAAA,IAAI,GAAA,GAAM,CAAA;AACV,EAAA,IAAI,GAAA,GAAM,CAAA;AACV,EAAA,KAAA,MAAW,EAAA,IAAM,UAAA,CAAW,IAAI,CAAA,EAAG;AACjC,IAAA,IAAI,WAAA,CAAY,IAAA,CAAK,EAAE,CAAA,EAAG,GAAA,IAAO,CAAA;AAAA,SAAA,IACxB,QAAA,CAAS,IAAA,CAAK,EAAE,CAAA,EAAG,GAAA,IAAO,CAAA;AAAA,EACrC;AACA,EAAA,IAAI,GAAA,KAAQ,CAAA,IAAK,GAAA,KAAQ,CAAA,EAAG,OAAO,IAAA;AACnC,EAAA,OAAO,GAAA,IAAO,MAAM,UAAA,GAAa,OAAA;AACnC;AAGA,SAAS,cAAc,OAAA,EAAuD;AAC5E,EAAA,KAAA,MAAW,EAAA,IAAM,QAAQ,QAAA,EAAU;AACjC,IAAA,IAAI,WAAA,CAAY,IAAA,CAAK,EAAE,CAAA,EAAG,OAAO,UAAA;AACjC,IAAA,IAAI,QAAA,CAAS,IAAA,CAAK,EAAE,CAAA,EAAG,OAAO,OAAA;AAAA,EAChC;AACA,EAAA,OAAO,IAAA;AACT;AAIO,SAAS,eAAA,CACd,MACA,UAAA,EACmB;AACnB,EAAA,MAAM,MAAA,GAAS,eAAe,IAAI,CAAA;AAClC,EAAA,IAAI,MAAA,KAAW,IAAA,EAAM,OAAO,EAAC;AAI7B,EAAA,MAAM,IAAA,uBAAW,GAAA,EAAY;AAC7B,EAAA,MAAM,SAA4B,EAAC;AACnC,EAAA,KAAA,MAAW,KAAK,UAAA,EAAY;AAC1B,IAAA,IAAI,aAAA,CAAc,CAAC,CAAA,KAAM,MAAA,IAAU,KAAK,GAAA,CAAI,CAAA,CAAE,IAAI,CAAA,EAAG;AACrD,IAAA,IAAA,CAAK,GAAA,CAAI,EAAE,IAAI,CAAA;AACf,IAAA,MAAA,CAAO,KAAK,CAAC,CAAA;AAAA,EACf;AACA,EAAA,OAAO,MAAA;AACT;AA8BA,SAAS,SAAS,IAAA,EAAwB;AACxC,EAAA,OAAO,KAAK,WAAA,EAAY,CAAE,KAAA,CAAM,UAAU,KAAK,EAAC;AAClD;AAOA,SAAS,KAAA,CAAM,OAAyB,UAAA,EAAwD;AAC9F,EAAA,MAAM,MAAA,GAAS,IAAI,GAAA,CAAoB,UAAA,CAAW,GAAA,CAAI,CAAC,CAAA,KAAM,CAAC,CAAA,CAAE,IAAA,EAAM,CAAC,CAAC,CAAC,CAAA;AACzE,EAAA,KAAA,MAAW,QAAQ,KAAA,EAAO;AACxB,IAAA,IAAI,KAAA,GAAuB,IAAA;AAC3B,IAAA,IAAI,MAAA,GAAS,CAAA;AACb,IAAA,KAAA,MAAW,KAAK,UA
AA,EAAY;AAC1B,MAAA,IAAI,CAAA,CAAE,GAAA,CAAI,GAAA,CAAI,IAAI,CAAA,EAAG;AACnB,QAAA,MAAA,IAAU,CAAA;AACV,QAAA,IAAI,SAAS,CAAA,EAAG;AACd,UAAA,KAAA,GAAQ,IAAA;AACR,UAAA;AAAA,QACF;AACA,QAAA,KAAA,GAAQ,CAAA,CAAE,IAAA;AAAA,MACZ;AAAA,IACF;AACA,IAAA,IAAI,KAAA,KAAU,IAAA,EAAM,MAAA,CAAO,GAAA,CAAI,KAAA,EAAA,CAAQ,OAAO,GAAA,CAAI,KAAK,CAAA,IAAK,CAAA,IAAK,CAAC,CAAA;AAAA,EACpE;AACA,EAAA,OAAO,MAAA;AACT;AAGA,SAAS,OAAO,MAAA,EAAsE;AACpF,EAAA,IAAI,GAAA,GAAM,EAAA;AACV,EAAA,IAAI,MAAA,GAAS,EAAA;AACb,EAAA,IAAI,IAAA,GAAsB,IAAA;AAC1B,EAAA,KAAA,MAAW,CAAC,CAAA,EAAG,KAAK,CAAA,IAAK,MAAA,EAAQ;AAC/B,IAAA,IAAI,QAAQ,GAAA,EAAK;AACf,MAAA,MAAA,GAAS,GAAA;AACT,MAAA,GAAA,GAAM,KAAA;AACN,MAAA,IAAA,GAAO,CAAA;AAAA,IACT,CAAA,MAAA,IAAW,QAAQ,MAAA,EAAQ;AACzB,MAAA,MAAA,GAAS,KAAA;AAAA,IACX;AAAA,EACF;AACA,EAAA,IAAI,IAAA,KAAS,IAAA,IAAQ,GAAA,GAAM,CAAA,EAAG,OAAO,IAAA;AACrC,EAAA,MAAM,MAAA,GAAS,GAAA,GAAM,IAAA,CAAK,GAAA,CAAI,QAAQ,CAAC,CAAA;AACvC,EAAA,OAAO,MAAA,IAAU,CAAA,GAAI,EAAE,IAAA,EAAM,QAAO,GAAI,IAAA;AAC1C;AAEA,SAAS,aAAA,CACP,YACA,IAAA,EACc;AACd,EAAA,OAAO,UAAA,CAAW,GAAA,CAAI,CAAC,CAAA,MAAO,EAAE,IAAA,EAAM,CAAA,CAAE,IAAA,EAAM,GAAA,EAAK,IAAI,GAAA,CAAI,IAAA,CAAK,CAAC,CAAC,GAAE,CAAE,CAAA;AACxE;AAIA,SAAS,UAAA,CAAW,MAAc,MAAA,EAAwD;AACxF,EAAA,MAAM,CAAA,GAAI,MAAA;AAAA,IACR,KAAA;AAAA,MACE,KAAK,WAAA,EAAY;AAAA,MACjB,aAAA,CAAc,QAAQ,CAAC,CAAA,KAAM,EAAE,QAAA,IAAY,CAAA,CAAE,SAAS,EAAA,CAAG;AAAA;AAC3D,GACF;AACA,EAAA,OAAO,CAAA,GAAI,EAAE,QAAA,EAAU,CAAA,CAAE,IAAA,EAAM,QAAQ,CAAA,CAAE,MAAA,EAAQ,IAAA,EAAM,CAAA,EAAE,GAAI,IAAA;AAC/D;AAGA,SAAS,QAAA,CACP,MAAA,EACA,MAAA,EACA,IAAA,EACA,IAAA,EACoB;AACpB,EAAA,MAAM,CAAA,GAAI,MAAA;AAAA,IACR,KAAA;AAAA,MACE,MAAA;AAAA,MACA,aAAA,CAAc,QAAQ,CAAC,CAAA,KAAM,EAAE,KAAA,GAAQ,IAAI,CAAA,IAAK,EAAE;AAAA;AACpD,GACF;AACA,EAAA,OAAO,CAAA,GAAI,EAAE,QAAA,EAAU,CAAA,CAAE,MAAM,MAAA,EAAQ,CAAA,CAAE,MAAA,EAAQ,IAAA,EAAK,GAAI,IAAA;AAC5D;AAOO,SAAS,iBAAA,CACd,IAAA,EACA,UAAA,EACA,KAAA,EACgB;AAChB,EAAA,IAAI,CAAC,IAAA,IAAQ,UAAA,CAAW,MAAA,KAAW,GAAG,OAAO,OAAA;AAI7C,EAAA,MAAM,OAAA,GAAU,WAAW,IAAI,CAAA;AAG/B,EAAA,MAAM,MAAA,GAAS,eAAA,CAAgB,OAAA,EAAS
,UAAU,CAAA;AAClD,EAAA,IAAI,MAAA,CAAO,MAAA,KAAW,CAAA,EAAG,OAAO,OAAA;AAKhC,EAAA,MAAM,cAAA,GAAiB,OAAO,MAAA,IAAU,CAAA;AAExC,EAAA,MAAM,QAAA,GAAW,UAAA,CAAW,OAAA,EAAS,MAAM,CAAA;AAC3C,EAAA,IAAI,QAAA,EAAU,OAAO,EAAE,GAAG,UAAU,cAAA,EAAe;AAEnD,EAAA,MAAM,MAAA,GAAS,SAAS,OAAO,CAAA;AAC/B,EAAA,IAAI,MAAA,CAAO,MAAA,KAAW,CAAA,EAAG,OAAO,OAAA;AAEhC,EAAA,MAAM,MAAA,GACJ,QAAA,CAAS,MAAA,EAAQ,MAAA,EAAQ,YAAY,IAAI,CAAA,IACzC,QAAA,CAAS,MAAA,EAAQ,QAAQ,UAAA,EAAY,IAAI,CAAA,IACzC,KAAA,GAAQ,SAAS,MAAM,CAAA;AACzB,EAAA,OAAO,MAAA,GAAS,EAAE,GAAG,MAAA,EAAQ,gBAAe,GAAI,OAAA;AAClD","file":"chunk-NCGZPEDA.js","sourcesContent":["/**\n * Per-snippet language classification by candidate-set-relative set-difference.\n *\n * A ladder of rungs; the first rung whose leader clears a lead (margin) of ≥1\n * wins; otherwise `\"unknown\"`:\n *\n * 1 alphabet — characters distinctive within the candidate set\n * 2a function words — curated grammatical markers (highest precision)\n * 2b frequent words — corpus content words\n * 3 franc — optional trigram backstop for the distinctive-free\n * residual, injected as a resolver (this module stays\n * franc-free and importable without franc's tables)\n *\n * \"Distinctive\" is ALWAYS relative to the candidate set: a signal counts for a\n * candidate iff it appears in that candidate's profile and in NO other\n * candidate's. So `і` decides {uk, ru} (only uk has it) but is inert in\n * {uk, be} (both have it), and the word `и` decides {uk, ru} even though the\n * *letter* `и` is shared. Nothing is precomputed — uniqueness is the runtime\n * output, never stored.\n *\n * Adapted to langtell's {@link LanguageProfile} shape: the `words` and `iso6393`\n * fields are optional here, so a bare `{ code, alphabet }` profile still\n * classifies on rung 1.\n */\nimport type { LanguageProfile } from \"../types.js\";\n\nexport const FRANC_RUNG = 3;\n\n/** Which rung decided a verdict; `null` when unknown. 
*/\nexport type Rung = 1 | \"2a\" | \"2b\" | typeof FRANC_RUNG | null;\n\nexport interface SnippetVerdict {\n /** Winning language code, or the sentinel `\"unknown\"`. */\n language: string;\n /** Lead of the winner over the runner-up, in the rung's own unit (distinctive\n * char/word count for rungs 1–2; franc score-gap for rung 3). 0 when unknown. */\n margin: number;\n /** Which rung decided; `null` when unknown. */\n rung: Rung;\n /** Whether ≥2 same-script candidates were in scope when the verdict was\n * reached. `true` ⇒ the distinctive-letter/word machinery actually chose\n * between candidates; `false` ⇒ the winner was the lone candidate in its\n * script, selected by script alone (no evidence it is *distinctively* that\n * language). `false` for `\"unknown\"`. */\n discriminating: boolean;\n}\n\n/** A rung's verdict before {@link classifyBySnippet} stamps on the scope-derived\n * `discriminating` flag (which a single rung can't know — it depends on how many\n * same-script candidates were scoped). */\nexport type RungVerdict = Pick<SnippetVerdict, \"language\" | \"margin\" | \"rung\">;\n\nconst UNKNOWN: SnippetVerdict = {\n language: \"unknown\",\n margin: 0,\n rung: null,\n discriminating: false,\n};\n\n/** Resolver for rung 3 (the optional trigram backstop), injected into\n * {@link classifyBySnippet} by callers that have franc available. Kept as an\n * injected seam — not a direct import — so this module stays franc-free and\n * importable without pulling franc's tables. Returns a rung-3 verdict or\n * `null` (abstain). */\nexport type Rung3Resolver = (\n text: string,\n scoped: readonly LanguageProfile[],\n) => RungVerdict | null;\n\nconst CYRILLIC_RE = /\\p{Script=Cyrillic}/u;\nconst LATIN_RE = /\\p{Script=Latin}/u;\n\n/** Below this length, trigrams are too noisy to justify a rung-3 verdict. 
*/\nexport const RUNG3_MIN_LENGTH = 24;\n\n/**\n * Trailing/inline Latin \"noise\" tokens — URLs, @handles, #hashtags — that a\n * Cyrillic title commonly carries (a headline followed by a link or a social\n * handle). These are almost always Latin even on Cyrillic-language content, so\n * left in they can flip {@link dominantScript} to Latin and let genuinely\n * Cyrillic content scope to the wrong roster. Stripped before the script vote\n * AND before the rung tallies so the URL's letters never contribute either.\n *\n * Kept as separate simple patterns (applied in order — schemes/www before bare\n * domains) rather than one big alternation, so each stays readable. ASCII-only\n * `[a-z0-9-]` in the domain pattern means a Cyrillic word is never mistaken for\n * a domain.\n */\nconst NOISE_PATTERNS: readonly RegExp[] = [\n /\\bhttps?:\\/\\/\\S+/gi, // full URLs\n /\\bwww\\.\\S+/gi, // www.… without a scheme\n /\\b[a-z0-9-]+(?:\\.[a-z0-9-]+)+(?:\\/\\S*)?/gi, // bare domains (example.com/path)\n /[@#][\\p{L}\\p{N}_]+/gu, // @handles and #hashtags\n];\n\n/** Drop URLs / @handles / #hashtags so trailing Latin noise can't outvote the\n * prose's script or pollute the per-rung tallies. */\nexport function stripNoise(text: string): string {\n let out = text;\n for (const re of NOISE_PATTERNS) out = out.replace(re, \" \");\n return out;\n}\n\n/** The script most of `text` is written in, or `null` if it carries no letters.\n * Noise (URLs/handles/hashtags) is stripped first so a single trailing link\n * can't flip a multi-word Cyrillic title's vote to Latin. */\nfunction dominantScript(text: string): \"cyrillic\" | \"latin\" | null {\n let cyr = 0;\n let lat = 0;\n for (const ch of stripNoise(text)) {\n if (CYRILLIC_RE.test(ch)) cyr += 1;\n else if (LATIN_RE.test(ch)) lat += 1;\n }\n if (cyr === 0 && lat === 0) return null;\n return cyr >= lat ? \"cyrillic\" : \"latin\";\n}\n\n/** The script of a profile's alphabet. 
*/\nfunction profileScript(profile: LanguageProfile): \"cyrillic\" | \"latin\" | null {\n for (const ch of profile.alphabet) {\n if (CYRILLIC_RE.test(ch)) return \"cyrillic\";\n if (LATIN_RE.test(ch)) return \"latin\";\n }\n return null;\n}\n\n/** Candidates whose script matches the text's dominant script (others can't tip\n * the verdict). Empty when the text carries no letters. */\nexport function scopeCandidates(\n text: string,\n candidates: readonly LanguageProfile[],\n): LanguageProfile[] {\n const script = dominantScript(text);\n if (script === null) return [];\n // Keep one profile per code. A language listed twice would otherwise make its\n // own distinctive chars/words read as \"owned by ≥2 candidates\" in `tally`,\n // cancelling them out and collapsing the verdict to \"unknown\".\n const seen = new Set<string>();\n const scoped: LanguageProfile[] = [];\n for (const c of candidates) {\n if (profileScript(c) !== script || seen.has(c.code)) continue;\n seen.add(c.code);\n scoped.push(c);\n }\n return scoped;\n}\n\n/**\n * Per-language set of characters globally unique within `profiles` — present in\n * exactly one profile's alphabet. Relative to the given profile set: the unique\n * set shrinks as languages are added (a second Latin language un-uniques a–z).\n */\nexport function distinctiveChars(profiles: readonly LanguageProfile[]): Map<string, Set<string>> {\n const owners = new Map<string, string[]>();\n for (const p of profiles) {\n for (const ch of new Set(p.alphabet)) {\n const list = owners.get(ch);\n if (list) list.push(p.code);\n else owners.set(ch, [p.code]);\n }\n }\n const result = new Map<string, Set<string>>(profiles.map((p) => [p.code, new Set()]));\n for (const [ch, codes] of owners) {\n const [only] = codes;\n if (codes.length === 1 && only !== undefined) result.get(only)?.add(ch);\n }\n return result;\n}\n\ninterface Membership {\n code: string;\n set: ReadonlySet<string>;\n}\n\n/** Lowercased Unicode letter-run tokens. 
Keeps single-char tokens (`і`, `и`). */\nfunction tokenize(text: string): string[] {\n return text.toLowerCase().match(/\\p{L}+/gu) ?? [];\n}\n\n/**\n * Tally how many items (characters or word tokens) are distinctive to each\n * candidate — present in exactly one candidate's set. Items owned by zero or by\n * ≥2 candidates contribute nothing.\n */\nfunction tally(items: Iterable<string>, membership: readonly Membership[]): Map<string, number> {\n const scores = new Map<string, number>(membership.map((m) => [m.code, 0]));\n for (const item of items) {\n let owner: string | null = null;\n let owners = 0;\n for (const m of membership) {\n if (m.set.has(item)) {\n owners += 1;\n if (owners > 1) {\n owner = null;\n break;\n }\n owner = m.code;\n }\n }\n if (owner !== null) scores.set(owner, (scores.get(owner) ?? 0) + 1);\n }\n return scores;\n}\n\n/** The leading candidate and its lead over the runner-up, or `null` if <1. */\nfunction leader(scores: Map<string, number>): { code: string; margin: number } | null {\n let max = -1;\n let second = -1;\n let code: string | null = null;\n for (const [c, score] of scores) {\n if (score > max) {\n second = max;\n max = score;\n code = c;\n } else if (score > second) {\n second = score;\n }\n }\n if (code === null || max < 1) return null;\n const margin = max - Math.max(second, 0);\n return margin >= 1 ? { code, margin } : null;\n}\n\nfunction membershipFor(\n candidates: readonly LanguageProfile[],\n pick: (p: LanguageProfile) => Iterable<string>,\n): Membership[] {\n return candidates.map((c) => ({ code: c.code, set: new Set(pick(c)) }));\n}\n\n/** Rung 1 — characters (alphabet + orthographic {@link LanguageProfile.marks})\n * distinctive within the scoped candidate set. */\nfunction letterRung(text: string, scoped: readonly LanguageProfile[]): RungVerdict | null {\n const r = leader(\n tally(\n text.toLowerCase(),\n membershipFor(scoped, (p) => p.alphabet + (p.marks ?? \"\")),\n ),\n );\n return r ? 
{ language: r.code, margin: r.margin, rung: 1 } : null;\n}\n\n/** Rung 2 — distinctive words from the given tier (2a function, 2b frequent). */\nfunction wordRung(\n tokens: readonly string[],\n scoped: readonly LanguageProfile[],\n tier: \"function\" | \"frequent\",\n rung: \"2a\" | \"2b\",\n): RungVerdict | null {\n const r = leader(\n tally(\n tokens,\n membershipFor(scoped, (p) => p.words?.[tier] ?? []),\n ),\n );\n return r ? { language: r.code, margin: r.margin, rung } : null;\n}\n\n/**\n * Classify `text` among `candidates`. Synchronous and allocation-light. Returns\n * `\"unknown\"` on empty evidence, on a tie inside the candidate set, or when\n * nothing is distinctive.\n */\nexport function classifyBySnippet(\n text: string,\n candidates: readonly LanguageProfile[],\n rung3?: Rung3Resolver,\n): SnippetVerdict {\n if (!text || candidates.length === 0) return UNKNOWN;\n\n // Drop URLs / @handles / #hashtags once, up front: trailing Latin noise must\n // not flip the dominant-script vote nor pollute the per-rung tallies.\n const cleaned = stripNoise(text);\n\n // Restrict to candidates in the text's dominant script.\n const scoped = scopeCandidates(cleaned, candidates);\n if (scoped.length === 0) return UNKNOWN;\n\n // ≥2 same-script candidates means the distinctive machinery actually had a\n // choice to make; a lone scoped candidate wins by script alone. Stamped onto\n // whichever rung decides — a single rung can't see the scope size.\n const discriminating = scoped.length >= 2;\n\n const byLetter = letterRung(cleaned, scoped);\n if (byLetter) return { ...byLetter, discriminating };\n\n const tokens = tokenize(cleaned);\n if (tokens.length === 0) return UNKNOWN;\n\n const byWord =\n wordRung(tokens, scoped, \"function\", \"2a\") ??\n wordRung(tokens, scoped, \"frequent\", \"2b\") ??\n rung3?.(cleaned, scoped);\n return byWord ? { ...byWord, discriminating } : UNKNOWN;\n}\n"]}
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
// src/internal/bcp47.ts
|
|
2
|
+
// Alias table: lowercased input label -> canonical language code.
//
// The table is built on a null-prototype object. Callers index it with
// arbitrary, externally supplied strings; on an ordinary object literal a key
// such as "constructor" or "__proto__" would resolve to a member inherited from
// Object.prototype and leak a non-string value out of the lookup. With no
// prototype, a bracket lookup can only ever hit the entries listed here.
var ALIASES = Object.assign(Object.create(null), {
  // Ukrainian
  ua: "uk",
  uk: "uk",
  \u0443\u043A\u0440: "uk",
  \u0443\u043A\u0440\u0430\u0457\u043D\u0441\u044C\u043A\u0430: "uk",
  \u0443\u043A\u0440\u0430\u0457\u043D\u0441\u044C\u043A\u043E\u044E: "uk",
  "\u0443\u043A\u0440\u0430\u0457\u043D\u0441\u044C\u043A\u0430 \u043C\u043E\u0432\u0430": "uk",
  "\u043D\u0430 \u0443\u043A\u0440\u0430\u0457\u043D\u0441\u044C\u043A\u0456\u0439": "uk",
  "\u0443\u043A\u0440\u0430\u0457\u043D\u0441\u044C\u043A\u043E\u044E \u043C\u043E\u0432\u043E\u044E": "uk",
  ukrainian: "uk",
  "in ukrainian": "uk",
  // Russian
  ru: "ru",
  rus: "ru",
  \u0440\u0443\u0441: "ru",
  \u0440\u0443\u0441\u0441\u043A\u0438\u0439: "ru",
  "\u043F\u043E-\u0440\u0443\u0441\u0441\u043A\u0438": "ru",
  "\u043F\u043E \u0440\u0443\u0441\u0441\u043A\u0438": "ru",
  "\u0440\u0443\u0441\u0441\u043A\u0438\u0439 \u044F\u0437\u044B\u043A": "ru",
  "\u043D\u0430 \u0440\u0443\u0441\u0441\u043A\u043E\u043C": "ru",
  russian: "ru",
  "in russian": "ru",
  \u0440\u043E\u0441\u0456\u0439\u0441\u044C\u043A\u0430: "ru",
  "\u0440\u043E\u0441\u0456\u0439\u0441\u044C\u043A\u0430 \u043C\u043E\u0432\u0430": "ru",
  "\u043F\u043E-\u0440\u043E\u0441\u0456\u0439\u0441\u044C\u043A\u0438": "ru",
  "\u043F\u043E \u0440\u043E\u0441\u0456\u0439\u0441\u044C\u043A\u0438": "ru",
  // Belarusian
  be: "be",
  bel: "be",
  \u0431\u0435\u043B\u0430\u0440\u0443\u0441\u043A\u0430\u044F: "be",
  "\u0431\u0435\u043B\u0430\u0440\u0443\u0441\u043A\u0430\u044F \u043C\u043E\u0432\u0430": "be",
  belarusian: "be",
  "in belarusian": "be",
  // Bulgarian
  bg: "bg",
  bul: "bg",
  \u0431\u044A\u043B\u0433\u0430\u0440\u0441\u043A\u0438: "bg",
  "\u0431\u044A\u043B\u0433\u0430\u0440\u0441\u043A\u0438 \u0435\u0437\u0438\u043A": "bg",
  bulgarian: "bg",
  "in bulgarian": "bg",
  // English
  en: "en",
  eng: "en",
  english: "en",
  "in english": "en",
  \u0430\u043D\u0433\u043B\u0456\u0439\u0441\u044C\u043A\u0430: "en",
  \u0430\u043D\u0433\u043B\u0438\u0439\u0441\u043A\u0438\u0439: "en",
  // Polish
  pl: "pl",
  pol: "pl",
  polski: "pl",
  "po polsku": "pl",
  polish: "pl",
  \u043F\u043E\u043B\u044C\u0441\u044C\u043A\u0430: "pl",
  // German
  de: "de",
  deu: "de",
  ger: "de",
  deutsch: "de",
  "auf deutsch": "de",
  german: "de",
  \u043D\u0456\u043C\u0435\u0446\u044C\u043A\u0430: "de",
  // French
  fr: "fr",
  fra: "fr",
  fran\u00E7ais: "fr",
  francais: "fr",
  "en fran\xE7ais": "fr",
  french: "fr",
  \u0444\u0440\u0430\u043D\u0446\u0443\u0437\u044C\u043A\u0430: "fr",
  // Spanish
  es: "es",
  spa: "es",
  espa\u00F1ol: "es",
  espanol: "es",
  "en espa\xF1ol": "es",
  spanish: "es",
  \u0456\u0441\u043F\u0430\u043D\u0441\u044C\u043A\u0430: "es",
  // Italian
  it: "it",
  ita: "it",
  italiano: "it",
  "in italiano": "it",
  italian: "it",
  \u0456\u0442\u0430\u043B\u0456\u0439\u0441\u044C\u043A\u0430: "it"
});
|
|
89
|
+
/**
 * Strict, exact-match alias lookup.
 *
 * Trims and lowercases `input`, then looks the result up in `ALIASES`. No
 * hyphen/prefix fallback is attempted.
 *
 * @param input Raw language label or code; may be `undefined` or `null`.
 * @returns The mapped code, or `null` when the input is nullish, blank after
 *   trimming, or has no entry in `ALIASES`.
 */
function normalizeLanguageCode(input) {
  if (input === void 0 || input === null) return null;
  const cleaned = input.trim().toLowerCase();
  if (cleaned.length === 0) return null;
  // Own-property check: a bare bracket lookup on a plain object also matches
  // names inherited from Object.prototype (e.g. "constructor"), which would
  // hand back a function instead of `null`.
  return Object.prototype.hasOwnProperty.call(ALIASES, cleaned) ? ALIASES[cleaned] : null;
}
|
|
95
|
+
/**
 * Normalizes a BCP-47-style tag to a language code.
 *
 * The input is trimmed, lowercased, and has `_` rewritten to `-`. The whole
 * string is tried against `ALIASES` first; on a miss, the text before the
 * first `-` is used: its alias when one exists, otherwise that text unchanged.
 *
 * @param input Raw tag (e.g. `en-US`, `zh_CN`); may be `undefined` or `null`.
 * @returns The alias or primary subtag, or `null` when the input is nullish,
 *   blank after trimming, or begins with `-` (empty primary subtag).
 */
function normalizeBCP47(input) {
  if (input === void 0 || input === null) return null;
  const cleaned = input.trim().toLowerCase().replace(/_/g, "-");
  if (cleaned.length === 0) return null;
  // Only own entries count as aliases: a bare bracket lookup on a plain object
  // would also match inherited names such as "constructor" and return a
  // function where callers expect a string.
  const hasAlias = (key) => Object.prototype.hasOwnProperty.call(ALIASES, key);
  if (hasAlias(cleaned)) return ALIASES[cleaned];
  const head = cleaned.split("-")[0];
  if (head === void 0 || head.length === 0) return null;
  // Unknown primary subtags pass through as-is.
  return hasAlias(head) ? ALIASES[head] : head;
}
|
|
105
|
+
/**
 * Pulls the first entry out of a comma-separated language list, drops anything
 * after a `;` in that entry, and passes the remainder to `normalizeBCP47`.
 *
 * @param value Header/attribute value such as `en-US,en;q=0.9`; may be nullish.
 * @returns Whatever `normalizeBCP47` yields for the first tag, or `null` when
 *   `value` is nullish or its first entry is blank.
 */
function primarySubtag(value) {
  if (value == null) return null;
  // Only the first list entry is considered.
  const [leading] = value.split(",");
  const entry = leading?.trim();
  if (!entry) return null;
  // Cut off a trailing `;…` parameter section, if any.
  const [rawTag] = entry.split(";");
  return normalizeBCP47(rawTag?.trim());
}
|
|
112
|
+
|
|
113
|
+
export { normalizeBCP47, normalizeLanguageCode, primarySubtag };
|
|
114
|
+
//# sourceMappingURL=chunk-OVSPOZ5J.js.map
|
|
115
|
+
//# sourceMappingURL=chunk-OVSPOZ5J.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/internal/bcp47.ts"],"names":[],"mappings":";AAuBA,IAAM,OAAA,GAAkC;AAAA;AAAA,EAEtC,EAAA,EAAI,IAAA;AAAA,EACJ,EAAA,EAAI,IAAA;AAAA,EACJ,kBAAA,EAAK,IAAA;AAAA,EACL,4DAAA,EAAY,IAAA;AAAA,EACZ,kEAAA,EAAa,IAAA;AAAA,EACb,uFAAA,EAAmB,IAAA;AAAA,EACnB,iFAAA,EAAkB,IAAA;AAAA,EAClB,mGAAA,EAAqB,IAAA;AAAA,EACrB,SAAA,EAAW,IAAA;AAAA,EACX,cAAA,EAAgB,IAAA;AAAA;AAAA,EAGhB,EAAA,EAAI,IAAA;AAAA,EACJ,GAAA,EAAK,IAAA;AAAA,EACL,kBAAA,EAAK,IAAA;AAAA,EACL,0CAAA,EAAS,IAAA;AAAA,EACT,mDAAA,EAAa,IAAA;AAAA,EACb,mDAAA,EAAa,IAAA;AAAA,EACb,qEAAA,EAAgB,IAAA;AAAA,EAChB,yDAAA,EAAc,IAAA;AAAA,EACd,OAAA,EAAS,IAAA;AAAA,EACT,YAAA,EAAc,IAAA;AAAA,EACd,sDAAA,EAAW,IAAA;AAAA,EACX,iFAAA,EAAkB,IAAA;AAAA,EAClB,qEAAA,EAAgB,IAAA;AAAA,EAChB,qEAAA,EAAgB,IAAA;AAAA;AAAA,EAGhB,EAAA,EAAI,IAAA;AAAA,EACJ,GAAA,EAAK,IAAA;AAAA,EACL,4DAAA,EAAY,IAAA;AAAA,EACZ,uFAAA,EAAmB,IAAA;AAAA,EACnB,UAAA,EAAY,IAAA;AAAA,EACZ,eAAA,EAAiB,IAAA;AAAA;AAAA,EAGjB,EAAA,EAAI,IAAA;AAAA,EACJ,GAAA,EAAK,IAAA;AAAA,EACL,sDAAA,EAAW,IAAA;AAAA,EACX,iFAAA,EAAkB,IAAA;AAAA,EAClB,SAAA,EAAW,IAAA;AAAA,EACX,cAAA,EAAgB,IAAA;AAAA;AAAA,EAGhB,EAAA,EAAI,IAAA;AAAA,EACJ,GAAA,EAAK,IAAA;AAAA,EACL,OAAA,EAAS,IAAA;AAAA,EACT,YAAA,EAAc,IAAA;AAAA,EACd,4DAAA,EAAY,IAAA;AAAA,EACZ,4DAAA,EAAY,IAAA;AAAA;AAAA,EAGZ,EAAA,EAAI,IAAA;AAAA,EACJ,GAAA,EAAK,IAAA;AAAA,EACL,MAAA,EAAQ,IAAA;AAAA,EACR,WAAA,EAAa,IAAA;AAAA,EACb,MAAA,EAAQ,IAAA;AAAA,EACR,gDAAA,EAAU,IAAA;AAAA;AAAA,EAGV,EAAA,EAAI,IAAA;AAAA,EACJ,GAAA,EAAK,IAAA;AAAA,EACL,GAAA,EAAK,IAAA;AAAA,EACL,OAAA,EAAS,IAAA;AAAA,EACT,aAAA,EAAe,IAAA;AAAA,EACf,MAAA,EAAQ,IAAA;AAAA,EACR,gDAAA,EAAU,IAAA;AAAA;AAAA,EAGV,EAAA,EAAI,IAAA;AAAA,EACJ,GAAA,EAAK,IAAA;AAAA,EACL,aAAA,EAAU,IAAA;AAAA,EACV,QAAA,EAAU,IAAA;AAAA,EACV,gBAAA,EAAe,IAAA;AAAA,EACf,MAAA,EAAQ,IAAA;AAAA,EACR,4DAAA,EAAY,IAAA;AAAA;AAAA,EAGZ,EAAA,EAAI,IAAA;AAAA,EACJ,GAAA,EAAK,IAAA;AAAA,EACL,YAAA,EAAS,IAAA;AAAA,EACT,OAAA,EAAS,IAAA;AAAA,EACT,eAAA,EAAc,IAAA;AAAA,EACd,OAAA,EAAS,IAAA;AAAA,EACT,sDAAA,EAAW,IAAA;AAAA;AAAA,EAGX,EAAA,EAAI,IAAA;AAAA,EACJ,GAAA,EAAK,IAAA;AAAA,EACL,QAAA,EAAU,IAAA;
AAAA,EACV,aAAA,EAAe,IAAA;AAAA,EACf,OAAA,EAAS,IAAA;AAAA,EACT,4DAAA,EAAY;AACd,CAAA;AASO,SAAS,sBAAsB,KAAA,EAAiD;AACrF,EAAA,IAAI,KAAA,KAAU,MAAA,IAAa,KAAA,KAAU,IAAA,EAAM,OAAO,IAAA;AAClD,EAAA,MAAM,OAAA,GAAU,KAAA,CAAM,IAAA,EAAK,CAAE,WAAA,EAAY;AACzC,EAAA,IAAI,OAAA,CAAQ,MAAA,KAAW,CAAA,EAAG,OAAO,IAAA;AACjC,EAAA,OAAO,OAAA,CAAQ,OAAO,CAAA,IAAK,IAAA;AAC7B;AAYO,SAAS,eAAe,KAAA,EAAiD;AAC9E,EAAA,IAAI,KAAA,KAAU,MAAA,IAAa,KAAA,KAAU,IAAA,EAAM,OAAO,IAAA;AAClD,EAAA,MAAM,OAAA,GAAU,MAAM,IAAA,EAAK,CAAE,aAAY,CAAE,OAAA,CAAQ,MAAM,GAAG,CAAA;AAC5D,EAAA,IAAI,OAAA,CAAQ,MAAA,KAAW,CAAA,EAAG,OAAO,IAAA;AACjC,EAAA,MAAM,MAAA,GAAS,QAAQ,OAAO,CAAA;AAC9B,EAAA,IAAI,MAAA,KAAW,QAAW,OAAO,MAAA;AACjC,EAAA,MAAM,IAAA,GAAO,OAAA,CAAQ,KAAA,CAAM,GAAG,EAAE,CAAC,CAAA;AACjC,EAAA,IAAI,IAAA,KAAS,MAAA,IAAa,IAAA,CAAK,MAAA,KAAW,GAAG,OAAO,IAAA;AACpD,EAAA,OAAO,OAAA,CAAQ,IAAI,CAAA,IAAK,IAAA;AAC1B;AAWO,SAAS,cAAc,KAAA,EAAiD;AAC7E,EAAA,IAAI,KAAA,KAAU,MAAA,IAAa,KAAA,KAAU,IAAA,EAAM,OAAO,IAAA;AAClD,EAAA,MAAM,QAAQ,KAAA,CAAM,KAAA,CAAM,GAAG,CAAA,CAAE,CAAC,GAAG,IAAA,EAAK;AACxC,EAAA,IAAI,KAAA,KAAU,MAAA,IAAa,KAAA,CAAM,MAAA,KAAW,GAAG,OAAO,IAAA;AAEtD,EAAA,MAAM,MAAM,KAAA,CAAM,KAAA,CAAM,GAAG,CAAA,CAAE,CAAC,GAAG,IAAA,EAAK;AACtC,EAAA,OAAO,eAAe,GAAG,CAAA;AAC3B","file":"chunk-OVSPOZ5J.js","sourcesContent":["/**\n * BCP-47 / language-code normalization.\n *\n * Two entry points with deliberately different strictness:\n * - {@link normalizeBCP47} — for inputs documented to be BCP-47 (`<html lang>`,\n * hreflang, `Content-Language`): try the full string, then strip a\n * region/script suffix (`en-US` → `en`, `zh_CN` → `zh`).\n * - {@link normalizeLanguageCode} — strict exact-match only, for free-text\n * contexts (URL slugs, link text) where a hyphen split could be a coincidence.\n *\n * Both resolve aliases that appear in the wild (`ua` → `uk`, `rus` → `ru`,\n * localized picker phrases) to a canonical ISO 639-1 code.\n */\n\n/**\n * Aliases mapped to canonical ISO 639-1 codes. 
Keys are lowercased.\n *\n * Ukrainian is the load-bearing case: most sites use `ua` in URLs even though\n * the ISO code is `uk`. Both are accepted on input; `uk` is always output.\n *\n * Includes localized phrases users see in language pickers (`українською`,\n * `по-русски`, `in english`, …).\n */\nconst ALIASES: Record<string, string> = {\n // Ukrainian\n ua: \"uk\",\n uk: \"uk\",\n укр: \"uk\",\n українська: \"uk\",\n українською: \"uk\",\n \"українська мова\": \"uk\",\n \"на українській\": \"uk\",\n \"українською мовою\": \"uk\",\n ukrainian: \"uk\",\n \"in ukrainian\": \"uk\",\n\n // Russian\n ru: \"ru\",\n rus: \"ru\",\n рус: \"ru\",\n русский: \"ru\",\n \"по-русски\": \"ru\",\n \"по русски\": \"ru\",\n \"русский язык\": \"ru\",\n \"на русском\": \"ru\",\n russian: \"ru\",\n \"in russian\": \"ru\",\n російська: \"ru\",\n \"російська мова\": \"ru\",\n \"по-російськи\": \"ru\",\n \"по російськи\": \"ru\",\n\n // Belarusian\n be: \"be\",\n bel: \"be\",\n беларуская: \"be\",\n \"беларуская мова\": \"be\",\n belarusian: \"be\",\n \"in belarusian\": \"be\",\n\n // Bulgarian\n bg: \"bg\",\n bul: \"bg\",\n български: \"bg\",\n \"български език\": \"bg\",\n bulgarian: \"bg\",\n \"in bulgarian\": \"bg\",\n\n // English\n en: \"en\",\n eng: \"en\",\n english: \"en\",\n \"in english\": \"en\",\n англійська: \"en\",\n английский: \"en\",\n\n // Polish\n pl: \"pl\",\n pol: \"pl\",\n polski: \"pl\",\n \"po polsku\": \"pl\",\n polish: \"pl\",\n польська: \"pl\",\n\n // German\n de: \"de\",\n deu: \"de\",\n ger: \"de\",\n deutsch: \"de\",\n \"auf deutsch\": \"de\",\n german: \"de\",\n німецька: \"de\",\n\n // French\n fr: \"fr\",\n fra: \"fr\",\n français: \"fr\",\n francais: \"fr\",\n \"en français\": \"fr\",\n french: \"fr\",\n французька: \"fr\",\n\n // Spanish\n es: \"es\",\n spa: \"es\",\n español: \"es\",\n espanol: \"es\",\n \"en español\": \"es\",\n spanish: \"es\",\n іспанська: \"es\",\n\n // Italian\n it: \"it\",\n ita: \"it\",\n italiano: \"it\",\n \"in 
italiano\": \"it\",\n italian: \"it\",\n італійська: \"it\",\n};\n\n/**\n * Strict, exact-match lookup. Returns `null` for unknown inputs and does NOT\n * fall back to a hyphen prefix. Use anywhere a hyphen split could be a\n * coincidence — URL path segments (`/ru-return-warranty`), title attrs, link\n * text. The phrase aliases (`по-русски`, `in english`) are in the table\n * directly, so exact lookup still finds them.\n */\nexport function normalizeLanguageCode(input: string | undefined | null): string | null {\n if (input === undefined || input === null) return null;\n const cleaned = input.trim().toLowerCase();\n if (cleaned.length === 0) return null;\n return ALIASES[cleaned] ?? null;\n}\n\n/**\n * BCP-47-aware normalization: try the full string first, then strip a\n * region/script suffix (`en-US` → `en`, `zh_CN` → `zh`). Use ONLY for inputs\n * documented to be BCP-47 — `hreflang`, `<html lang>`, `Content-Language`,\n * `data-lang`/`data-locale` — never for free-text URL slugs.\n *\n * Falls back to the raw primary subtag when no alias matches, so a code outside\n * the alias table (e.g. `pt-BR` → `pt`) still resolves to its language. The\n * roster decides relevance downstream.\n */\nexport function normalizeBCP47(input: string | undefined | null): string | null {\n if (input === undefined || input === null) return null;\n const cleaned = input.trim().toLowerCase().replace(/_/g, \"-\");\n if (cleaned.length === 0) return null;\n const direct = ALIASES[cleaned];\n if (direct !== undefined) return direct;\n const head = cleaned.split(\"-\")[0];\n if (head === undefined || head.length === 0) return null;\n return ALIASES[head] ?? head;\n}\n\n/**\n * Extract the primary subtag from a BCP-47-ish value, lowercased, then resolve\n * it through the alias table (`ua` → `uk`). Handles `Accept-Language`-style\n * comma lists (`en-US,en;q=0.9` → `en`). 
Returns `null` for empty/nullish.\n *\n * This is the header/HTML extraction helper: it tolerates the messy shapes those\n * sources carry (comma lists, `q` weights) where {@link normalizeBCP47} expects\n * a single tag.\n */\nexport function primarySubtag(value: string | undefined | null): string | null {\n if (value === undefined || value === null) return null;\n const first = value.split(\",\")[0]?.trim();\n if (first === undefined || first.length === 0) return null;\n // Drop a `;q=…` weight if present.\n const tag = first.split(\";\")[0]?.trim();\n return normalizeBCP47(tag);\n}\n"]}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import { classifyBySnippet } from './chunk-NCGZPEDA.js';
|
|
2
|
+
|
|
3
|
+
// src/text.ts
|
|
4
|
+
/**
 * Produces at most one "title-script" evidence item for `text`.
 *
 * Runs `classifyBySnippet(text, candidates, rung3)` and, unless the verdict is
 * `"unknown"`, wraps it in an evidence object whose `value` is the first 80
 * UTF-16 code units of the trimmed text.
 *
 * @param text Text to classify; `undefined` or whitespace-only yields `[]`.
 * @param candidates Candidate profiles; `undefined` or empty yields `[]`.
 * @param rung3 Optional resolver forwarded unchanged to `classifyBySnippet`.
 * @returns An empty array, or a single-element array with the evidence item.
 */
function evidenceFromText(text, candidates, rung3) {
  const snippet = text === void 0 ? "" : text.trim();
  if (snippet.length === 0) return [];
  if (candidates === void 0 || candidates.length === 0) return [];
  // The classifier receives the original (untrimmed) text.
  const { language, margin, rung, discriminating } = classifyBySnippet(text, candidates, rung3);
  if (language === "unknown") return [];
  const evidence = {
    kind: "title-script",
    language,
    confidence: marginToConfidence(margin, rung),
    source: "title-script",
    value: snippet.slice(0, 80)
  };
  // The flag is only attached in the negative case; otherwise the key is absent.
  if (!discriminating) evidence.discriminating = false;
  return [evidence];
}
|
|
19
|
+
/**
 * Maps a verdict's margin to a confidence value, using a different scale for
 * rung 3 than for every other rung.
 *
 * - rung 3: margin is limited to [0, 1] and mapped onto 0.4 … 0.75.
 * - other rungs: margin is limited to [1, 4] and mapped onto 0.6875 … 0.95.
 *
 * The result is passed through `clamp01` before being returned.
 *
 * @param margin Lead of the winner over the runner-up.
 * @param rung Rung identifier of the verdict; only `3` is special-cased.
 * @returns The clamped confidence.
 */
function marginToConfidence(margin, rung) {
  let raw;
  if (rung !== 3) {
    // Leads below 1 count as 1; leads above 4 add nothing further.
    const cappedLead = Math.min(Math.max(margin, 1), 4);
    raw = 0.6 + cappedLead / 4 * 0.35;
  } else {
    const unitGap = Math.min(Math.max(margin, 0), 1);
    raw = 0.4 + unitGap * 0.35;
  }
  return clamp01(raw);
}
|
|
26
|
+
/**
 * Limit `value` to the closed interval 0..1.
 *
 * Anything `Number.isFinite` rejects (NaN, ±Infinity, non-numbers) maps to 0,
 * so `Infinity` becomes 0 rather than 1.
 *
 * @param value - Number to limit.
 * @returns `value` when it already lies in 0..1, otherwise the nearer bound.
 */
function clamp01(value) {
  if (!Number.isFinite(value) || value < 0) return 0;
  return value > 1 ? 1 : value;
}
|
|
32
|
+
|
|
33
|
+
export { evidenceFromText };
|
|
34
|
+
//# sourceMappingURL=chunk-PT7R2BRQ.js.map
|
|
35
|
+
//# sourceMappingURL=chunk-PT7R2BRQ.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/text.ts"],"names":[],"mappings":";;;AAmBO,SAAS,gBAAA,CACd,IAAA,EACA,UAAA,EACA,KAAA,EACoB;AACpB,EAAA,IAAI,IAAA,KAAS,UAAa,IAAA,CAAK,IAAA,GAAO,MAAA,KAAW,CAAA,SAAU,EAAC;AAC5D,EAAA,IAAI,eAAe,MAAA,IAAa,UAAA,CAAW,MAAA,KAAW,CAAA,SAAU,EAAC;AAEjE,EAAA,MAAM,OAAA,GAAU,iBAAA,CAAkB,IAAA,EAAM,UAAA,EAAY,KAAK,CAAA;AACzD,EAAA,IAAI,OAAA,CAAQ,QAAA,KAAa,SAAA,EAAW,OAAO,EAAC;AAE5C,EAAA,MAAM,IAAA,GAAyB;AAAA,IAC7B,IAAA,EAAM,cAAA;AAAA,IACN,UAAU,OAAA,CAAQ,QAAA;AAAA,IAClB,UAAA,EAAY,kBAAA,CAAmB,OAAA,CAAQ,MAAA,EAAQ,QAAQ,IAAI,CAAA;AAAA,IAC3D,MAAA,EAAQ,cAAA;AAAA,IACR,OAAO,IAAA,CAAK,IAAA,EAAK,CAAE,KAAA,CAAM,GAAG,EAAE;AAAA,GAChC;AAIA,EAAA,IAAI,CAAC,OAAA,CAAQ,cAAA,EAAgB,IAAA,CAAK,cAAA,GAAiB,KAAA;AACnD,EAAA,OAAO,CAAC,IAAI,CAAA;AACd;AAWA,SAAS,kBAAA,CAAmB,QAAgB,IAAA,EAAoB;AAC9D,EAAA,IAAI,SAAS,CAAA,EAAG;AAEd,IAAA,OAAO,OAAA,CAAQ,GAAA,GAAM,IAAA,CAAK,GAAA,CAAI,IAAA,CAAK,GAAA,CAAI,MAAA,EAAQ,CAAC,CAAA,EAAG,CAAC,CAAA,GAAI,IAAI,CAAA;AAAA,EAC9D;AACA,EAAA,MAAM,IAAA,GAAO,IAAA,CAAK,GAAA,CAAI,MAAA,EAAQ,CAAC,CAAA;AAC/B,EAAA,OAAO,OAAA,CAAQ,MAAO,IAAA,CAAK,GAAA,CAAI,MAAM,CAAC,CAAA,GAAI,IAAK,IAAI,CAAA;AACrD;AAEA,SAAS,QAAQ,KAAA,EAAuB;AACtC,EAAA,IAAI,CAAC,MAAA,CAAO,QAAA,CAAS,KAAK,GAAG,OAAO,CAAA;AACpC,EAAA,IAAI,KAAA,GAAQ,GAAG,OAAO,CAAA;AACtB,EAAA,IAAI,KAAA,GAAQ,GAAG,OAAO,CAAA;AACtB,EAAA,OAAO,KAAA;AACT","file":"chunk-PT7R2BRQ.js","sourcesContent":["import type { LanguageEvidence, LanguageProfile } from \"./types.js\";\nimport { classifyBySnippet, type Rung, type Rung3Resolver } from \"./internal/classify.js\";\n\n/**\n * Producer: candidate-relative script + lexical signals from the title text.\n *\n * Wraps the ported snippet classifier ({@link classifyBySnippet}): noise strip →\n * dominant-script scope → distinctive letters (rung 1) → function words (2a) →\n * frequent words (2b). The `candidates` roster makes scoring roster-relative —\n * `і` decides Ukrainian only when Russian is also a candidate. 
Sync and\n * zero-dependency; the optional franc rung is injected via `rung3`.\n *\n * Emits at most one `kind: \"title-script\"` evidence item. The classifier's\n * integer `margin` (the winner's lead over the runner-up) maps to a 0..1\n * `confidence`: a verdict at all means the dominant script and the deciding rung\n * agreed, so the floor is high; a wider lead nudges it up. With no candidates\n * (or no usable distinctive signal) it abstains — emitting nothing rather than a\n * coarse \"unknown\", since the roster decides relevance.\n */\nexport function evidenceFromText(\n text: string | undefined,\n candidates?: readonly LanguageProfile[],\n rung3?: Rung3Resolver,\n): LanguageEvidence[] {\n if (text === undefined || text.trim().length === 0) return [];\n if (candidates === undefined || candidates.length === 0) return [];\n\n const verdict = classifyBySnippet(text, candidates, rung3);\n if (verdict.language === \"unknown\") return [];\n\n const item: LanguageEvidence = {\n kind: \"title-script\",\n language: verdict.language,\n confidence: marginToConfidence(verdict.margin, verdict.rung),\n source: \"title-script\",\n value: text.trim().slice(0, 80),\n };\n // Surface only the meaningful negative: the script was owned by ≤1 candidate,\n // so it didn't choose between candidates. The discriminating case stays narrow\n // (flag omitted). `fuse({ nonDiscriminatingScript: \"unknown\" })` reads this.\n if (!verdict.discriminating) item.discriminating = false;\n return [item];\n}\n\n/**\n * Map the classifier's per-rung lead to a 0..1 confidence.\n *\n * Rungs 1–2 carry an integer count of distinctive items (≥1). A verdict already\n * means script + rung agreed, so the floor is high (0.6) and each extra\n * distinctive item adds up to a 0.35 bonus, saturating by a lead of 4. 
Rung 3\n * (franc) carries franc's own 0..1 score-gap, which is weaker evidence, so it is\n * scaled into a 0.4..0.75 band.\n */\nfunction marginToConfidence(margin: number, rung: Rung): number {\n if (rung === 3) {\n // franc score-gap is already 0..1; weaker than the distinctive rungs.\n return clamp01(0.4 + Math.min(Math.max(margin, 0), 1) * 0.35);\n }\n const lead = Math.max(margin, 1);\n return clamp01(0.6 + (Math.min(lead, 4) / 4) * 0.35);\n}\n\nfunction clamp01(value: number): number {\n if (!Number.isFinite(value)) return 0;\n if (value < 0) return 0;\n if (value > 1) return 1;\n return value;\n}\n"]}
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
import { L as LanguageProfile } from './types-BIXrkuAr.js';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Per-snippet language classification by candidate-set-relative set-difference.
|
|
5
|
+
*
|
|
6
|
+
* A ladder of rungs; the first rung whose leader clears a lead (margin) of ≥1
|
|
7
|
+
* wins; otherwise `"unknown"`:
|
|
8
|
+
*
|
|
9
|
+
* 1 alphabet — characters distinctive within the candidate set
|
|
10
|
+
* 2a function words — curated grammatical markers (highest precision)
|
|
11
|
+
* 2b frequent words — corpus content words
|
|
12
|
+
* 3 franc — optional trigram backstop for the distinctive-free
|
|
13
|
+
* residual, injected as a resolver (this module stays
|
|
14
|
+
* franc-free and importable without franc's tables)
|
|
15
|
+
*
|
|
16
|
+
* "Distinctive" is ALWAYS relative to the candidate set: a signal counts for a
|
|
17
|
+
* candidate iff it appears in that candidate's profile and in NO other
|
|
18
|
+
* candidate's. So `і` decides {uk, ru} (only uk has it) but is inert in
|
|
19
|
+
* {uk, be} (both have it), and the word `и` decides {uk, ru} even though the
|
|
20
|
+
* *letter* `и` is shared. Nothing is precomputed — uniqueness is the runtime
|
|
21
|
+
* output, never stored.
|
|
22
|
+
*
|
|
23
|
+
* Adapted to langtell's {@link LanguageProfile} shape: the `words` and `iso6393`
|
|
24
|
+
* fields are optional here, so a bare `{ code, alphabet }` profile still
|
|
25
|
+
* classifies on rung 1.
|
|
26
|
+
*/
|
|
27
|
+
|
|
28
|
+
/** Rung identifier for the optional franc trigram backstop (rung 3 of the ladder); it enters the `Rung` union through `typeof FRANC_RUNG`. */
declare const FRANC_RUNG = 3;
|
|
29
|
+
/** Which rung decided a verdict; `null` when unknown. */
|
|
30
|
+
type Rung = 1 | "2a" | "2b" | typeof FRANC_RUNG | null;
|
|
31
|
+
interface SnippetVerdict {
|
|
32
|
+
/** Winning language code, or the sentinel `"unknown"`. */
|
|
33
|
+
language: string;
|
|
34
|
+
/** Lead of the winner over the runner-up, in the rung's own unit (distinctive
|
|
35
|
+
* char/word count for rungs 1–2; franc score-gap for rung 3). 0 when unknown. */
|
|
36
|
+
margin: number;
|
|
37
|
+
/** Which rung decided; `null` when unknown. */
|
|
38
|
+
rung: Rung;
|
|
39
|
+
/** Whether ≥2 same-script candidates were in scope when the verdict was
|
|
40
|
+
* reached. `true` ⇒ the distinctive-letter/word machinery actually chose
|
|
41
|
+
* between candidates; `false` ⇒ the winner was the lone candidate in its
|
|
42
|
+
* script, selected by script alone (no evidence it is *distinctively* that
|
|
43
|
+
* language). `false` for `"unknown"`. */
|
|
44
|
+
discriminating: boolean;
|
|
45
|
+
}
|
|
46
|
+
/** A rung's verdict before {@link classifyBySnippet} stamps on the scope-derived
|
|
47
|
+
* `discriminating` flag (which a single rung can't know — it depends on how many
|
|
48
|
+
* same-script candidates were scoped). */
|
|
49
|
+
type RungVerdict = Pick<SnippetVerdict, "language" | "margin" | "rung">;
|
|
50
|
+
/** Resolver for rung 3 (the optional trigram backstop), injected into
|
|
51
|
+
* {@link classifyBySnippet} by callers that have franc available. Kept as an
|
|
52
|
+
* injected seam — not a direct import — so this module stays franc-free and
|
|
53
|
+
* importable without pulling franc's tables. Returns a rung-3 verdict or
|
|
54
|
+
* `null` (abstain). */
|
|
55
|
+
type Rung3Resolver = (text: string, scoped: readonly LanguageProfile[]) => RungVerdict | null;
|
|
56
|
+
/**
|
|
57
|
+
* Classify `text` among `candidates`. Synchronous and allocation-light. Returns
|
|
58
|
+
* `"unknown"` on empty evidence, on a tie inside the candidate set, or when
|
|
59
|
+
* nothing is distinctive.
|
|
60
|
+
*/
|
|
61
|
+
declare function classifyBySnippet(text: string, candidates: readonly LanguageProfile[], rung3?: Rung3Resolver): SnippetVerdict;
|
|
62
|
+
|
|
63
|
+
export { FRANC_RUNG, type Rung, type Rung3Resolver, type RungVerdict, type SnippetVerdict, classifyBySnippet };
|
package/dist/classify.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":[],"names":[],"mappings":"","file":"classify.js"}
|
package/dist/franc.d.ts
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
import { L as LanguageProfile, S as SyncSource, a as LanguageEvidence } from './types-BIXrkuAr.js';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Producer: the franc trigram backstop over `text`, scoped to `candidates`.
|
|
5
|
+
* Synchronous — franc itself is sync. Emits at most one `kind: "franc"` item.
|
|
6
|
+
*/
|
|
7
|
+
declare function evidenceFromFranc(text: string | undefined, candidates: readonly LanguageProfile[] | undefined): LanguageEvidence[];
|
|
8
|
+
/**
|
|
9
|
+
* Build a franc {@link SyncSource} bound to a candidate roster, for use in
|
|
10
|
+
* `compile({ engines: [createFrancEngine(candidates)] })`. franc needs the
|
|
11
|
+
* roster to scope its `only` restriction, so it is bound at construction (the
|
|
12
|
+
* same shape `compile` uses to bind the built-in text producer).
|
|
13
|
+
*
|
|
14
|
+
* A `SyncSource` (not async): franc runs in-process and synchronously, so the
|
|
15
|
+
* compiled `detect` stays synchronous — no `await` ceremony on the hot path.
|
|
16
|
+
*/
|
|
17
|
+
declare function createFrancEngine(candidates: readonly LanguageProfile[]): SyncSource;
|
|
18
|
+
/**
|
|
19
|
+
* A bare franc engine with no bound roster — it abstains until given candidates.
|
|
20
|
+
* Prefer {@link createFrancEngine} with your roster; this default exists so the
|
|
21
|
+
* engine has a stable named export and a no-config import works.
|
|
22
|
+
*/
|
|
23
|
+
declare const francEngine: SyncSource;
|
|
24
|
+
|
|
25
|
+
export { createFrancEngine, evidenceFromFranc, francEngine };
|
package/dist/franc.js
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
import { scopeCandidates } from './chunk-NCGZPEDA.js';
|
|
2
|
+
import { francAll } from 'franc';
|
|
3
|
+
|
|
4
|
+
var
  // Shortest trimmed text `evidenceFromFranc` will hand to franc.
  RUNG_MIN_LENGTH = 24,
  // Passed to franc as its `minLength` option.
  FRANC_MIN_LENGTH = 10,
  // Upper bound on the number of characters sampled from the text.
  DEFAULT_MAX_CHARS = 2000;
|
|
7
|
+
/**
 * Rank `text` with franc, restricted to the ISO 639-3 codes carried by the
 * `scoped` profiles, and translate the winner back to its profile `code`.
 *
 * @param text - Text to rank; only the first `DEFAULT_MAX_CHARS` characters are used.
 * @param scoped - Profiles in scope; those without `iso6393` are ignored.
 * @returns `{ language, margin }`, where `margin` is the top score minus the
 *   runner-up's score (0 when there is no runner-up); `null` when fewer than
 *   two distinct ISO codes are available, when franc's top entry is missing or
 *   `"und"`, or when the top code maps to no profile.
 */
function francScore(text, scoped) {
  // ISO 639-3 code -> profile code. A later profile with the same ISO code
  // replaces an earlier one.
  const codeByIso = new Map();
  for (const profile of scoped) {
    if (profile.iso6393 === void 0) continue;
    codeByIso.set(profile.iso6393, profile.code);
  }
  if (codeByIso.size < 2) return null;

  const [best, runnerUp] = francAll(text.slice(0, DEFAULT_MAX_CHARS), {
    only: Array.from(codeByIso.keys()),
    minLength: FRANC_MIN_LENGTH
  });
  if (!best || best[0] === "und") return null;

  const language = codeByIso.get(best[0]);
  if (language === void 0) return null;
  return { language, margin: best[1] - (runnerUp?.[1] ?? 0) };
}
|
|
19
|
+
/**
 * Producer: franc's ranking of `text`, restricted to the `candidates` roster.
 * Emits at most one `kind: "franc"` evidence item.
 *
 * The text is trimmed once and that trimmed sample is used for the length
 * gate, for candidate scoping and for scoring. Previously only the gate looked
 * at the trimmed text, so leading whitespace counted against the character cap
 * applied inside `francScore` and the gate and the sample could disagree.
 *
 * @param text - Text to classify; `undefined`, or shorter than
 *   `RUNG_MIN_LENGTH` after trimming, yields `[]`.
 * @param candidates - Candidate roster; `undefined` or empty yields `[]`.
 * @returns `[]` when gated out or when `francScore` returns `null`; otherwise
 *   a one-element array whose confidence is `0.4 + margin * 0.5`, clamped to 0..1.
 */
function evidenceFromFranc(text, candidates) {
  if (text === void 0) return [];
  const sample = text.trim();
  if (sample.length < RUNG_MIN_LENGTH) return [];
  if (candidates === void 0 || candidates.length === 0) return [];

  const scoped = scopeCandidates(sample, candidates);
  const scored = francScore(sample, scoped);
  if (scored === null) return [];

  return [
    {
      kind: "franc",
      language: scored.language,
      // Lift the score gap into a usable confidence band.
      confidence: clamp01(0.4 + scored.margin * 0.5),
      source: "franc",
      value: scored.language
    }
  ];
}
|
|
36
|
+
/**
 * Build a synchronous `"franc"` source bound to `candidates`.
 *
 * @param candidates - Roster captured by the returned source's `detect`.
 * @returns A source object (`id: "franc"`, `sync: true`, `inputs: ["text"]`)
 *   whose `detect(input)` calls `evidenceFromFranc(input.text, candidates)`.
 */
function createFrancEngine(candidates) {
  const engine = { id: "franc", sync: true, inputs: ["text"] };
  engine.detect = (input) => evidenceFromFranc(input.text, candidates);
  return engine;
}
|
|
44
|
+
/**
 * Roster-less franc source. Its `detect` always passes `undefined` as the
 * candidate list, and `evidenceFromFranc` returns `[]` for an undefined
 * roster, so this engine never emits evidence.
 */
var francEngine = /* @__PURE__ */ createFrancEngine(void 0);
|
|
50
|
+
/**
 * Limit `value` to the closed interval 0..1.
 *
 * Values that fail `Number.isFinite` (NaN, ±Infinity, non-numbers) and
 * negative values both produce 0; values above 1 produce 1.
 *
 * @param value - Number to limit.
 * @returns The limited number.
 */
function clamp01(value) {
  const usable = Number.isFinite(value) && value >= 0;
  if (usable) return value > 1 ? 1 : value;
  return 0;
}
|
|
56
|
+
|
|
57
|
+
export { createFrancEngine, evidenceFromFranc, francEngine };
|
|
58
|
+
//# sourceMappingURL=franc.js.map
|
|
59
|
+
//# sourceMappingURL=franc.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/franc.ts"],"names":[],"mappings":";;;AAmBA,IAAM,eAAA,GAAkB,EAAA;AAExB,IAAM,gBAAA,GAAmB,EAAA;AAEzB,IAAM,iBAAA,GAAoB,GAAA;AAQ1B,SAAS,UAAA,CACP,MACA,MAAA,EAC6C;AAC7C,EAAA,MAAM,KAAA,uBAAY,GAAA,EAAoB;AACtC,EAAA,KAAA,MAAW,CAAA,IAAK,MAAA,EAAQ,IAAI,CAAA,CAAE,OAAA,KAAY,MAAA,EAAW,KAAA,CAAM,GAAA,CAAI,CAAA,CAAE,OAAA,EAAS,CAAA,CAAE,IAAI,CAAA;AAChF,EAAA,IAAI,KAAA,CAAM,IAAA,GAAO,CAAA,EAAG,OAAO,IAAA;AAC3B,EAAA,MAAM,MAAA,GAAS,IAAA,CAAK,KAAA,CAAM,CAAA,EAAG,iBAAiB,CAAA;AAC9C,EAAA,MAAM,MAAA,GAAS,QAAA,CAAS,MAAA,EAAQ,EAAE,IAAA,EAAM,CAAC,GAAG,KAAA,CAAM,IAAA,EAAM,CAAA,EAAG,SAAA,EAAW,kBAAkB,CAAA;AACxF,EAAA,MAAM,GAAA,GAAM,OAAO,CAAC,CAAA;AACpB,EAAA,IAAI,CAAC,GAAA,IAAO,GAAA,CAAI,CAAC,CAAA,KAAM,OAAO,OAAO,IAAA;AACrC,EAAA,MAAM,QAAA,GAAW,KAAA,CAAM,GAAA,CAAI,GAAA,CAAI,CAAC,CAAC,CAAA;AACjC,EAAA,IAAI,QAAA,KAAa,QAAW,OAAO,IAAA;AACnC,EAAA,OAAO,EAAE,QAAA,EAAU,MAAA,EAAQ,GAAA,CAAI,CAAC,CAAA,IAAK,MAAA,CAAO,CAAC,CAAA,GAAI,CAAC,CAAA,IAAK,CAAA,CAAA,EAAG;AAC5D;AAMO,SAAS,iBAAA,CACd,MACA,UAAA,EACoB;AACpB,EAAA,IAAI,IAAA,KAAS,UAAa,IAAA,CAAK,IAAA,GAAO,MAAA,GAAS,eAAA,SAAwB,EAAC;AACxE,EAAA,IAAI,eAAe,MAAA,IAAa,UAAA,CAAW,MAAA,KAAW,CAAA,SAAU,EAAC;AAEjE,EAAA,MAAM,MAAA,GAAS,eAAA,CAAgB,IAAA,EAAM,UAAU,CAAA;AAC/C,EAAA,MAAM,MAAA,GAAS,UAAA,CAAW,IAAA,EAAM,MAAM,CAAA;AACtC,EAAA,IAAI,MAAA,KAAW,IAAA,EAAM,OAAO,EAAC;AAE7B,EAAA,OAAO;AAAA,IACL;AAAA,MACE,IAAA,EAAM,OAAA;AAAA,MACN,UAAU,MAAA,CAAO,QAAA;AAAA;AAAA,MAEjB,UAAA,EAAY,OAAA,CAAQ,GAAA,GAAM,MAAA,CAAO,SAAS,GAAG,CAAA;AAAA,MAC7C,MAAA,EAAQ,OAAA;AAAA,MACR,OAAO,MAAA,CAAO;AAAA;AAChB,GACF;AACF;AAWO,SAAS,kBAAkB,UAAA,EAAoD;AACpF,EAAA,OAAO;AAAA,IACL,EAAA,EAAI,OAAA;AAAA,IACJ,IAAA,EAAM,IAAA;AAAA,IACN,MAAA,EAAQ,CAAC,MAAM,CAAA;AAAA,IACf,QAAQ,CAAC,KAAA,KAAU,iBAAA,CAAkB,KAAA,CAAM,MAAM,UAAU;AAAA,GAC7D;AACF;AAOO,IAAM,WAAA,GAA0B;AAAA,EACrC,EAAA,EAAI,OAAA;AAAA,EACJ,IAAA,EAAM,IAAA;AAAA,EACN,MAAA,EAAQ,CAAC,MAAM,CAAA;AAAA,EACf,QAAQ,CAAC,KAAA,KAAU,iBAAA,CAAkB,KAAA,CAAM,MAAM,MAAS;AAC5D;AAEA,SAAS,QAAQ,KAAA,EAAuB;AACtC,EAAA,IAAI,CAAC,MAAA,CAAO,QAAA,CAAS,KAAK,GAAG,OAAO,CAAA;AACpC,E
AAA,IAAI,KAAA,GAAQ,GAAG,OAAO,CAAA;AACtB,EAAA,IAAI,KAAA,GAAQ,GAAG,OAAO,CAAA;AACtB,EAAA,OAAO,KAAA;AACT","file":"franc.js","sourcesContent":["/**\n * `langtell/franc` — the opt-in franc engine (trigram-based detection, ~187\n * languages). Importing this module statically pulls `franc` and its trigram\n * tables, so it lives behind its own subpath; the zero-dependency core never\n * imports it (enforced by an ESLint boundary rule). `franc` is declared as an\n * OPTIONAL peer dependency — install it only if you use this engine.\n *\n * The engine is a candidate-relative *backstop*: franc is scoped to the\n * candidates' ISO 639-3 codes (`only`), runs only on text past a length floor\n * where trigrams are reliable, and emits `kind: \"franc\"` evidence with franc's\n * own score-gap as the confidence. It abstains (emits nothing) when fewer than\n * two candidates carry an `iso6393`, when franc returns `und`, or when the\n * sample is too short.\n */\nimport { francAll } from \"franc\";\nimport type { LanguageEvidence, LanguageProfile, SyncSource } from \"./types.js\";\nimport { scopeCandidates } from \"./internal/classify.js\";\n\n/** Minimum sample length, in characters. Below this trigrams are too noisy. */\nconst RUNG_MIN_LENGTH = 24;\n/** Floor franc itself uses to bail to `und` rather than guess. */\nconst FRANC_MIN_LENGTH = 10;\n/** Cap on text length sent to franc (longer adds cost, not accuracy). */\nconst DEFAULT_MAX_CHARS = 2000;\n\n/**\n * Run franc scoped to the candidates' ISO 639-3 codes, mapping the winner back\n * to its BCP-47 code. Returns `null` when fewer than two candidates carry an\n * `iso6393` code or franc abstains (`und`). 
The margin is franc's own score-gap\n * (top1 − top2, 0..1).\n */\nfunction francScore(\n text: string,\n scoped: readonly LanguageProfile[],\n): { language: string; margin: number } | null {\n const byIso = new Map<string, string>();\n for (const c of scoped) if (c.iso6393 !== undefined) byIso.set(c.iso6393, c.code);\n if (byIso.size < 2) return null;\n const sample = text.slice(0, DEFAULT_MAX_CHARS);\n const ranked = francAll(sample, { only: [...byIso.keys()], minLength: FRANC_MIN_LENGTH });\n const top = ranked[0];\n if (!top || top[0] === \"und\") return null;\n const language = byIso.get(top[0]);\n if (language === undefined) return null;\n return { language, margin: top[1] - (ranked[1]?.[1] ?? 0) };\n}\n\n/**\n * Producer: the franc trigram backstop over `text`, scoped to `candidates`.\n * Synchronous — franc itself is sync. Emits at most one `kind: \"franc\"` item.\n */\nexport function evidenceFromFranc(\n text: string | undefined,\n candidates: readonly LanguageProfile[] | undefined,\n): LanguageEvidence[] {\n if (text === undefined || text.trim().length < RUNG_MIN_LENGTH) return [];\n if (candidates === undefined || candidates.length === 0) return [];\n\n const scoped = scopeCandidates(text, candidates);\n const scored = francScore(text, scoped);\n if (scored === null) return [];\n\n return [\n {\n kind: \"franc\",\n language: scored.language,\n // franc's score-gap is 0..1; lift it into a usable confidence band.\n confidence: clamp01(0.4 + scored.margin * 0.5),\n source: \"franc\",\n value: scored.language,\n },\n ];\n}\n\n/**\n * Build a franc {@link SyncSource} bound to a candidate roster, for use in\n * `compile({ engines: [createFrancEngine(candidates)] })`. 
franc needs the\n * roster to scope its `only` restriction, so it is bound at construction (the\n * same shape `compile` uses to bind the built-in text producer).\n *\n * A `SyncSource` (not async): franc runs in-process and synchronously, so the\n * compiled `detect` stays synchronous — no `await` ceremony on the hot path.\n */\nexport function createFrancEngine(candidates: readonly LanguageProfile[]): SyncSource {\n return {\n id: \"franc\",\n sync: true,\n inputs: [\"text\"],\n detect: (input) => evidenceFromFranc(input.text, candidates),\n };\n}\n\n/**\n * A bare franc engine with no bound roster — it abstains until given candidates.\n * Prefer {@link createFrancEngine} with your roster; this default exists so the\n * engine has a stable named export and a no-config import works.\n */\nexport const francEngine: SyncSource = {\n id: \"franc\",\n sync: true,\n inputs: [\"text\"],\n detect: (input) => evidenceFromFranc(input.text, undefined),\n};\n\nfunction clamp01(value: number): number {\n if (!Number.isFinite(value)) return 0;\n if (value < 0) return 0;\n if (value > 1) return 1;\n return value;\n}\n"]}
|
package/dist/fuse.d.ts
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import { W as Weights, L as LanguageProfile, N as NonDiscriminatingScript, a as LanguageEvidence, C as Classification } from './types-BIXrkuAr.js';
|
|
2
|
+
|
|
3
|
+
interface FuseOptions {
|
|
4
|
+
weights?: Weights;
|
|
5
|
+
/** The candidate roster. When present, incoming evidence tags are normalized
|
|
6
|
+
* into it (`uk-UA` → `uk`, `ua` → `uk`) so context signals (page/header
|
|
7
|
+
* locale) land on the same code the text rungs use. */
|
|
8
|
+
candidates?: readonly LanguageProfile[];
|
|
9
|
+
/** How to resolve a *non-discriminating* script read (one flagged
|
|
10
|
+
* `discriminating: false` — its winning script owned by ≤1 roster candidate).
|
|
11
|
+
* Default `"candidate"` keeps current behavior; `"unknown"` drops such a read
|
|
12
|
+
* unless non-script evidence corroborates the same language. See
|
|
13
|
+
* {@link NonDiscriminatingScript}. */
|
|
14
|
+
nonDiscriminatingScript?: NonDiscriminatingScript;
|
|
15
|
+
}
|
|
16
|
+
/**
|
|
17
|
+
* Combine evidence into a single weighted verdict with an audit trail.
|
|
18
|
+
*
|
|
19
|
+
* Three steps:
|
|
20
|
+
* 1. Normalize each item's language tag into the candidate roster (BCP-47:
|
|
21
|
+
* `uk-UA`/`ua` → `uk`) so text, page, and header signals agree on a code.
|
|
22
|
+
* 2. Weighted argmax over languages (caller weights override per `source`/`kind`).
|
|
23
|
+
* 3. Apply the guard **context must never override clear script evidence**: when
|
|
24
|
+
* the text classifier (or an on-device model) confidently read one language,
|
|
25
|
+
* weaker page/header context for a *different* language cannot win — a
|
|
26
|
+
* Ukrainian page chrome does not make a Latin/English title Ukrainian.
|
|
27
|
+
*/
|
|
28
|
+
declare function fuse(evidence: readonly LanguageEvidence[], options?: FuseOptions): Classification;
|
|
29
|
+
|
|
30
|
+
export { type FuseOptions, fuse };
|
package/dist/fuse.js
ADDED
package/dist/fuse.js.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":[],"names":[],"mappings":"","file":"fuse.js"}
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
import { H as HeaderBag, a as LanguageEvidence } from './types-BIXrkuAr.js';
|
|
2
|
+
|
|
3
|
+
/** Producer: the HTTP `Content-Language` response header. */
|
|
4
|
+
declare function evidenceFromHeaders(headers: HeaderBag | undefined): LanguageEvidence[];
|
|
5
|
+
|
|
6
|
+
export { evidenceFromHeaders };
|
package/dist/headers.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":[],"names":[],"mappings":"","file":"headers.js"}
|
package/dist/html.d.ts
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import { a as LanguageEvidence } from './types-BIXrkuAr.js';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Producer: language clues from an HTML string's metadata.
|
|
5
|
+
*
|
|
6
|
+
* Reads three independent declarations, each emitted as its own evidence item
|
|
7
|
+
* (the fuser weighs them):
|
|
8
|
+
* - `<html lang>` → `html-lang`
|
|
9
|
+
* - `<meta http-equiv="content-language">` → `meta-content-language`
|
|
10
|
+
* - `<meta property="og:locale">` → `meta-og-locale`
|
|
11
|
+
*
|
|
12
|
+
* All tags are BCP-47-normalized (`uk-UA` → `uk`, `en_US` → `en`). Sync and
|
|
13
|
+
* zero-dependency — regex extraction only, never a DOM parse.
|
|
14
|
+
*/
|
|
15
|
+
declare function evidenceFromHtml(html: string | undefined): LanguageEvidence[];
|
|
16
|
+
|
|
17
|
+
export { evidenceFromHtml };
|
package/dist/html.js
ADDED
package/dist/html.js.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":[],"names":[],"mappings":"","file":"html.js"}
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
import { E as EvidenceSource, D as DetectorConfig, c as DetectFn } from './types-BIXrkuAr.js';
|
|
2
|
+
export { A as AsyncSource, C as Classification, d as DetectContext, e as DetectInput, f as EarlyExit, g as EvidenceKind, h as HasAsync, H as HeaderBag, b as LanguageCode, a as LanguageEvidence, L as LanguageProfile, N as NonDiscriminatingScript, i as SourceInput, S as SyncSource, W as Weights } from './types-BIXrkuAr.js';
|
|
3
|
+
export { FuseOptions, fuse } from './fuse.js';
|
|
4
|
+
export { evidenceFromText } from './text.js';
|
|
5
|
+
export { evidenceFromHtml } from './html.js';
|
|
6
|
+
export { evidenceFromHeaders } from './headers.js';
|
|
7
|
+
import './classify.js';
|
|
8
|
+
|
|
9
|
+
/**
|
|
10
|
+
* Build a configured detector. Does the per-roster setup once and returns a
|
|
11
|
+
* `detect` function whose sync/async shape is fixed by the registered engines
|
|
12
|
+
* (see {@link DetectFn}). The built-in producers are always registered; opt-in
|
|
13
|
+
* engines (franc, chrome-ai) are added via `config.engines`.
|
|
14
|
+
*/
|
|
15
|
+
declare function compile<const E extends readonly EvidenceSource[] = []>(config?: DetectorConfig<E>): DetectFn<E>;
|
|
16
|
+
|
|
17
|
+
/**
|
|
18
|
+
* BCP-47 / language-code normalization.
|
|
19
|
+
*
|
|
20
|
+
* Two entry points with deliberately different strictness:
|
|
21
|
+
* - {@link normalizeBCP47} — for inputs documented to be BCP-47 (`<html lang>`,
|
|
22
|
+
* hreflang, `Content-Language`): try the full string, then strip a
|
|
23
|
+
* region/script suffix (`en-US` → `en`, `zh_CN` → `zh`).
|
|
24
|
+
* - {@link normalizeLanguageCode} — strict exact-match only, for free-text
|
|
25
|
+
* contexts (URL slugs, link text) where a hyphen split could be a coincidence.
|
|
26
|
+
*
|
|
27
|
+
* Both resolve aliases that appear in the wild (`ua` → `uk`, `rus` → `ru`,
|
|
28
|
+
* localized picker phrases) to a canonical ISO 639-1 code.
|
|
29
|
+
*/
|
|
30
|
+
/**
|
|
31
|
+
* Strict, exact-match lookup. Returns `null` for unknown inputs and does NOT
|
|
32
|
+
* fall back to a hyphen prefix. Use anywhere a hyphen split could be a
|
|
33
|
+
* coincidence — URL path segments (`/ru-return-warranty`), title attrs, link
|
|
34
|
+
* text. The phrase aliases (`по-русски`, `in english`) are in the table
|
|
35
|
+
* directly, so exact lookup still finds them.
|
|
36
|
+
*/
|
|
37
|
+
declare function normalizeLanguageCode(input: string | undefined | null): string | null;
|
|
38
|
+
/**
|
|
39
|
+
* BCP-47-aware normalization: try the full string first, then strip a
|
|
40
|
+
* region/script suffix (`en-US` → `en`, `zh_CN` → `zh`). Use ONLY for inputs
|
|
41
|
+
* documented to be BCP-47 — `hreflang`, `<html lang>`, `Content-Language`,
|
|
42
|
+
* `data-lang`/`data-locale` — never for free-text URL slugs.
|
|
43
|
+
*
|
|
44
|
+
* Falls back to the raw primary subtag when no alias matches, so a code outside
|
|
45
|
+
* the alias table (e.g. `pt-BR` → `pt`) still resolves to its language. The
|
|
46
|
+
* roster decides relevance downstream.
|
|
47
|
+
*/
|
|
48
|
+
declare function normalizeBCP47(input: string | undefined | null): string | null;
|
|
49
|
+
/**
|
|
50
|
+
* Extract the primary subtag from a BCP-47-ish value, lowercased, then resolve
|
|
51
|
+
* it through the alias table (`ua` → `uk`). Handles `Accept-Language`-style
|
|
52
|
+
* comma lists (`en-US,en;q=0.9` → `en`). Returns `null` for empty/nullish.
|
|
53
|
+
*
|
|
54
|
+
* This is the header/HTML extraction helper: it tolerates the messy shapes those
|
|
55
|
+
* sources carry (comma lists, `q` weights) where {@link normalizeBCP47} expects
|
|
56
|
+
* a single tag.
|
|
57
|
+
*/
|
|
58
|
+
declare function primarySubtag(value: string | undefined | null): string | null;
|
|
59
|
+
|
|
60
|
+
export { DetectFn, DetectorConfig, EvidenceSource, compile, normalizeBCP47, normalizeLanguageCode, primarySubtag };
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
import { evidenceFromText } from './chunk-PT7R2BRQ.js';
|
|
2
|
+
export { evidenceFromText } from './chunk-PT7R2BRQ.js';
|
|
3
|
+
import { evidenceFromHtml } from './chunk-KI4MAI3N.js';
|
|
4
|
+
export { evidenceFromHtml } from './chunk-KI4MAI3N.js';
|
|
5
|
+
import { evidenceFromHeaders } from './chunk-3LDE35U2.js';
|
|
6
|
+
export { evidenceFromHeaders } from './chunk-3LDE35U2.js';
|
|
7
|
+
import { fuse } from './chunk-7G3MEXWK.js';
|
|
8
|
+
export { fuse } from './chunk-7G3MEXWK.js';
|
|
9
|
+
export { normalizeBCP47, normalizeLanguageCode, primarySubtag } from './chunk-OVSPOZ5J.js';
|
|
10
|
+
import './chunk-NCGZPEDA.js';
|
|
11
|
+
|
|
12
|
+
// src/compile.ts
|
|
13
|
+
/**
 * The three built-in synchronous sources, in registration order: text, html,
 * headers. Each declares the single input key it reads.
 *
 * @param candidates - Roster handed to the text producer on every call; the
 *   html and headers producers do not receive it.
 * @returns A new array of source objects.
 */
function builtIns(candidates) {
  const text = {
    id: "text",
    sync: true,
    inputs: ["text"],
    detect: (i) => evidenceFromText(i.text, candidates)
  };
  const html = {
    id: "html",
    sync: true,
    inputs: ["html"],
    detect: (i) => evidenceFromHtml(i.html)
  };
  const headers = {
    id: "headers",
    sync: true,
    inputs: ["headers"],
    detect: (i) => evidenceFromHeaders(i.headers)
  };
  return [text, html, headers];
}
|
|
30
|
+
/**
 * Whether `input` supplies every key that `source` declares in `inputs`.
 * Only `undefined` counts as missing; `null` and `""` count as present.
 *
 * @param source - Source whose `inputs` array lists the required keys.
 * @param input - Detection input to check.
 * @returns `true` when no declared key is `undefined` on `input`.
 */
function applicable(source, input) {
  const missing = source.inputs.some((key) => input[key] === void 0);
  return !missing;
}
|
|
33
|
+
/**
 * Build a detector from `config`.
 *
 * The source list is the built-in producers followed by `config.engines`.
 * When every source is flagged `sync`, the returned `detect(input)` is
 * synchronous. As soon as one source is not, the returned function is
 * `detect(input, ctx)` and is `async`: sync sources run inline, non-sync
 * sources run concurrently, and their batches are appended in registration
 * order before fusing.
 *
 * A failing non-sync source contributes no evidence, whether it rejects or
 * throws synchronously before returning a promise. Exceptions from sync
 * sources still propagate to the caller.
 *
 * @param config - Optional; reads `candidates`, `engines`, `weights` and
 *   `nonDiscriminatingScript`.
 * @returns The detect function; its result is whatever `fuse` returns.
 */
function compile(config = {}) {
  const sources = [...builtIns(config.candidates), ...config.engines ?? []];
  const hasAsync = sources.some((source) => !source.sync);
  const fuseOptions = {
    weights: config.weights,
    candidates: config.candidates,
    nonDiscriminatingScript: config.nonDiscriminatingScript
  };

  if (!hasAsync) {
    const detect2 = (input) => {
      const evidence = [];
      for (const source of sources) {
        if (source.sync && applicable(source, input)) evidence.push(...source.detect(input));
      }
      return fuse(evidence, fuseOptions);
    };
    return detect2;
  }

  const detect = async (input, ctx = {}) => {
    const evidence = [];
    const pending = [];
    for (const source of sources) {
      if (!applicable(source, input)) continue;
      if (source.sync) {
        evidence.push(...source.detect(input));
        continue;
      }
      let outcome;
      try {
        outcome = source.detect(input, ctx);
      } catch {
        // A synchronous throw from a non-sync source is treated like a
        // rejection: the source simply contributes nothing.
        continue;
      }
      pending.push(Promise.resolve(outcome).catch(() => []));
    }
    for (const batch of await Promise.all(pending)) evidence.push(...batch);
    return fuse(evidence, fuseOptions);
  };
  return detect;
}
|
|
64
|
+
|
|
65
|
+
export { compile };
|
|
66
|
+
//# sourceMappingURL=index.js.map
|
|
67
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/compile.ts"],"names":["detect"],"mappings":";;;;;;;;;;;;AAmBA,SAAS,SAAS,UAAA,EAAkE;AAClF,EAAA,OAAO;AAAA,IACL;AAAA,MACE,EAAA,EAAI,MAAA;AAAA,MACJ,IAAA,EAAM,IAAA;AAAA,MACN,MAAA,EAAQ,CAAC,MAAM,CAAA;AAAA,MACf,QAAQ,CAAC,CAAA,KAAM,gBAAA,CAAiB,CAAA,CAAE,MAAM,UAAU;AAAA,KACpD;AAAA,IACA,EAAE,EAAA,EAAI,MAAA,EAAQ,IAAA,EAAM,MAAM,MAAA,EAAQ,CAAC,MAAM,CAAA,EAAG,QAAQ,CAAC,CAAA,KAAM,gBAAA,CAAiB,CAAA,CAAE,IAAI,CAAA,EAAE;AAAA,IACpF;AAAA,MACE,EAAA,EAAI,SAAA;AAAA,MACJ,IAAA,EAAM,IAAA;AAAA,MACN,MAAA,EAAQ,CAAC,SAAS,CAAA;AAAA,MAClB,MAAA,EAAQ,CAAC,CAAA,KAAM,mBAAA,CAAoB,EAAE,OAAO;AAAA;AAC9C,GACF;AACF;AAGA,SAAS,UAAA,CAAW,QAAwB,KAAA,EAA6B;AACvE,EAAA,OAAO,MAAA,CAAO,OAAO,KAAA,CAAM,CAAC,QAAQ,KAAA,CAAM,GAAG,MAAM,MAAS,CAAA;AAC9D;AAQO,SAAS,OAAA,CACd,MAAA,GAA4B,EAAC,EAChB;AACb,EAAA,MAAM,OAAA,GAA4B,CAAC,GAAG,QAAA,CAAS,MAAA,CAAO,UAAU,CAAA,EAAG,GAAI,MAAA,CAAO,OAAA,IAAW,EAAG,CAAA;AAC5F,EAAA,MAAM,WAAW,OAAA,CAAQ,IAAA,CAAK,CAAC,MAAA,KAAW,CAAC,OAAO,IAAI,CAAA;AACtD,EAAA,MAAM,WAAA,GAA2B;AAAA,IAC/B,SAAS,MAAA,CAAO,OAAA;AAAA,IAChB,YAAY,MAAA,CAAO,UAAA;AAAA,IACnB,yBAAyB,MAAA,CAAO;AAAA,GAClC;AAEA,EAAA,IAAI,CAAC,QAAA,EAAU;AACb,IAAA,MAAMA,OAAAA,GAAS,CAAC,KAAA,KAAuC;AACrD,MAAA,MAAM,WAA+B,EAAC;AACtC,MAAA,KAAA,MAAW,UAAU,OAAA,EAAS;AAC5B,QAAA,IAAI,MAAA,CAAO,IAAA,IAAQ,UAAA,CAAW,MAAA,EAAQ,KAAK,CAAA,EAAG,QAAA,CAAS,IAAA,CAAK,GAAG,MAAA,CAAO,MAAA,CAAO,KAAK,CAAC,CAAA;AAAA,MACrF;AACA,MAAA,OAAO,IAAA,CAAK,UAAU,WAAW,CAAA;AAAA,IACnC,CAAA;AACA,IAAA,OAAOA,OAAAA;AAAA,EACT;AAEA,EAAA,MAAM,MAAA,GAAS,OAAO,KAAA,EAAoB,GAAA,GAAqB,EAAC,KAA+B;AAC7F,IAAA,MAAM,WAA+B,EAAC;AACtC,IAAA,MAAM,UAAyC,EAAC;AAChD,IAAA,KAAA,MAAW,UAAU,OAAA,EAAS;AAC5B,MAAA,IAAI,CAAC,UAAA,CAAW,MAAA,EAAQ,KAAK,CAAA,EAAG;AAChC,MAAA,IAAI,MAAA,CAAO,MAAM,QAAA,CAAS,IAAA,CAAK,GAAG,MAAA,CAAO,MAAA,CAAO,KAAK,CAAC,CAAA;AAAA,WACjD,OAAA,CAAQ,IAAA,CAAK,OAAA,CAAQ,OAAA,CAAQ,OAAO,MAAA,CAAO,KAAA,EAAO,GAAG,CAAC,CAAA,CAAE,KAAA,CAAM,MAAM,EAAE,CAAC,CAAA;AAAA,IAC9E;AACA,IAAA,KAAA,MAAW,KAAA,IAAS,MAAM,OAAA,CAAQ,GAAA,CAAI,OAAO,CAAA,EAAG,QAAA,CAAS,IAAA,CAAK,GAAG,KAAK,CAAA;AACtE
,IAAA,OAAO,IAAA,CAAK,UAAU,WAAW,CAAA;AAAA,EACnC,CAAA;AACA,EAAA,OAAO,MAAA;AACT","file":"index.js","sourcesContent":["import { evidenceFromHeaders } from \"./headers.js\";\nimport { evidenceFromHtml } from \"./html.js\";\nimport { evidenceFromText } from \"./text.js\";\nimport { fuse, type FuseOptions } from \"./fuse.js\";\nimport type {\n Classification,\n DetectContext,\n DetectFn,\n DetectInput,\n DetectorConfig,\n EvidenceSource,\n LanguageEvidence,\n LanguageProfile,\n SyncSource,\n} from \"./types.js\";\n\n/** The always-on, zero-dependency producers. The text producer is bound to the\n * configured candidate roster so its scoring is roster-relative (and so it\n * abstains when no roster was supplied — its signals need candidates). */\nfunction builtIns(candidates: readonly LanguageProfile[] | undefined): SyncSource[] {\n return [\n {\n id: \"text\",\n sync: true,\n inputs: [\"text\"],\n detect: (i) => evidenceFromText(i.text, candidates),\n },\n { id: \"html\", sync: true, inputs: [\"html\"], detect: (i) => evidenceFromHtml(i.html) },\n {\n id: \"headers\",\n sync: true,\n inputs: [\"headers\"],\n detect: (i) => evidenceFromHeaders(i.headers),\n },\n ];\n}\n\n/** Run a source only when every input it declares is present. */\nfunction applicable(source: EvidenceSource, input: DetectInput): boolean {\n return source.inputs.every((key) => input[key] !== undefined);\n}\n\n/**\n * Build a configured detector. Does the per-roster setup once and returns a\n * `detect` function whose sync/async shape is fixed by the registered engines\n * (see {@link DetectFn}). The built-in producers are always registered; opt-in\n * engines (franc, chrome-ai) are added via `config.engines`.\n */\nexport function compile<const E extends readonly EvidenceSource[] = []>(\n config: DetectorConfig<E> = {},\n): DetectFn<E> {\n const sources: EvidenceSource[] = [...builtIns(config.candidates), ...(config.engines ?? 
[])];\n const hasAsync = sources.some((source) => !source.sync);\n const fuseOptions: FuseOptions = {\n weights: config.weights,\n candidates: config.candidates,\n nonDiscriminatingScript: config.nonDiscriminatingScript,\n };\n\n if (!hasAsync) {\n const detect = (input: DetectInput): Classification => {\n const evidence: LanguageEvidence[] = [];\n for (const source of sources) {\n if (source.sync && applicable(source, input)) evidence.push(...source.detect(input));\n }\n return fuse(evidence, fuseOptions);\n };\n return detect as DetectFn<E>;\n }\n\n const detect = async (input: DetectInput, ctx: DetectContext = {}): Promise<Classification> => {\n const evidence: LanguageEvidence[] = [];\n const pending: Promise<LanguageEvidence[]>[] = [];\n for (const source of sources) {\n if (!applicable(source, input)) continue;\n if (source.sync) evidence.push(...source.detect(input));\n else pending.push(Promise.resolve(source.detect(input, ctx)).catch(() => []));\n }\n for (const batch of await Promise.all(pending)) evidence.push(...batch);\n return fuse(evidence, fuseOptions);\n };\n return detect as DetectFn<E>;\n}\n"]}
|