langtell 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-PT7R2BRQ.js → chunk-NQK7CZR2.js} +3 -3
- package/dist/{chunk-PT7R2BRQ.js.map → chunk-NQK7CZR2.js.map} +1 -1
- package/dist/{chunk-NCGZPEDA.js → chunk-WYSCL5J5.js} +2 -2
- package/dist/chunk-WYSCL5J5.js.map +1 -0
- package/dist/classify.d.ts +17 -3
- package/dist/classify.js +1 -1
- package/dist/franc.js +1 -1
- package/dist/index.js +3 -3
- package/dist/text.js +2 -2
- package/package.json +1 -1
- package/dist/chunk-NCGZPEDA.js.map +0 -1
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { classifyBySnippet } from './chunk-NCGZPEDA.js';
|
|
1
|
+
import { classifyBySnippet } from './chunk-WYSCL5J5.js';
|
|
2
2
|
|
|
3
3
|
// src/text.ts
|
|
4
4
|
function evidenceFromText(text, candidates, rung3) {
|
|
@@ -31,5 +31,5 @@ function clamp01(value) {
|
|
|
31
31
|
}
|
|
32
32
|
|
|
33
33
|
export { evidenceFromText };
|
|
34
|
-
//# sourceMappingURL=chunk-PT7R2BRQ.js.map
|
|
35
|
-
//# sourceMappingURL=chunk-PT7R2BRQ.js.map
|
|
34
|
+
//# sourceMappingURL=chunk-NQK7CZR2.js.map
|
|
35
|
+
//# sourceMappingURL=chunk-NQK7CZR2.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/text.ts"],"names":[],"mappings":";;;AAmBO,SAAS,gBAAA,CACd,IAAA,EACA,UAAA,EACA,KAAA,EACoB;AACpB,EAAA,IAAI,IAAA,KAAS,UAAa,IAAA,CAAK,IAAA,GAAO,MAAA,KAAW,CAAA,SAAU,EAAC;AAC5D,EAAA,IAAI,eAAe,MAAA,IAAa,UAAA,CAAW,MAAA,KAAW,CAAA,SAAU,EAAC;AAEjE,EAAA,MAAM,OAAA,GAAU,iBAAA,CAAkB,IAAA,EAAM,UAAA,EAAY,KAAK,CAAA;AACzD,EAAA,IAAI,OAAA,CAAQ,QAAA,KAAa,SAAA,EAAW,OAAO,EAAC;AAE5C,EAAA,MAAM,IAAA,GAAyB;AAAA,IAC7B,IAAA,EAAM,cAAA;AAAA,IACN,UAAU,OAAA,CAAQ,QAAA;AAAA,IAClB,UAAA,EAAY,kBAAA,CAAmB,OAAA,CAAQ,MAAA,EAAQ,QAAQ,IAAI,CAAA;AAAA,IAC3D,MAAA,EAAQ,cAAA;AAAA,IACR,OAAO,IAAA,CAAK,IAAA,EAAK,CAAE,KAAA,CAAM,GAAG,EAAE;AAAA,GAChC;AAIA,EAAA,IAAI,CAAC,OAAA,CAAQ,cAAA,EAAgB,IAAA,CAAK,cAAA,GAAiB,KAAA;AACnD,EAAA,OAAO,CAAC,IAAI,CAAA;AACd;AAWA,SAAS,kBAAA,CAAmB,QAAgB,IAAA,EAAoB;AAC9D,EAAA,IAAI,SAAS,CAAA,EAAG;AAEd,IAAA,OAAO,OAAA,CAAQ,GAAA,GAAM,IAAA,CAAK,GAAA,CAAI,IAAA,CAAK,GAAA,CAAI,MAAA,EAAQ,CAAC,CAAA,EAAG,CAAC,CAAA,GAAI,IAAI,CAAA;AAAA,EAC9D;AACA,EAAA,MAAM,IAAA,GAAO,IAAA,CAAK,GAAA,CAAI,MAAA,EAAQ,CAAC,CAAA;AAC/B,EAAA,OAAO,OAAA,CAAQ,MAAO,IAAA,CAAK,GAAA,CAAI,MAAM,CAAC,CAAA,GAAI,IAAK,IAAI,CAAA;AACrD;AAEA,SAAS,QAAQ,KAAA,EAAuB;AACtC,EAAA,IAAI,CAAC,MAAA,CAAO,QAAA,CAAS,KAAK,GAAG,OAAO,CAAA;AACpC,EAAA,IAAI,KAAA,GAAQ,GAAG,OAAO,CAAA;AACtB,EAAA,IAAI,KAAA,GAAQ,GAAG,OAAO,CAAA;AACtB,EAAA,OAAO,KAAA;AACT","file":"chunk-
|
|
1
|
+
{"version":3,"sources":["../src/text.ts"],"names":[],"mappings":";;;AAmBO,SAAS,gBAAA,CACd,IAAA,EACA,UAAA,EACA,KAAA,EACoB;AACpB,EAAA,IAAI,IAAA,KAAS,UAAa,IAAA,CAAK,IAAA,GAAO,MAAA,KAAW,CAAA,SAAU,EAAC;AAC5D,EAAA,IAAI,eAAe,MAAA,IAAa,UAAA,CAAW,MAAA,KAAW,CAAA,SAAU,EAAC;AAEjE,EAAA,MAAM,OAAA,GAAU,iBAAA,CAAkB,IAAA,EAAM,UAAA,EAAY,KAAK,CAAA;AACzD,EAAA,IAAI,OAAA,CAAQ,QAAA,KAAa,SAAA,EAAW,OAAO,EAAC;AAE5C,EAAA,MAAM,IAAA,GAAyB;AAAA,IAC7B,IAAA,EAAM,cAAA;AAAA,IACN,UAAU,OAAA,CAAQ,QAAA;AAAA,IAClB,UAAA,EAAY,kBAAA,CAAmB,OAAA,CAAQ,MAAA,EAAQ,QAAQ,IAAI,CAAA;AAAA,IAC3D,MAAA,EAAQ,cAAA;AAAA,IACR,OAAO,IAAA,CAAK,IAAA,EAAK,CAAE,KAAA,CAAM,GAAG,EAAE;AAAA,GAChC;AAIA,EAAA,IAAI,CAAC,OAAA,CAAQ,cAAA,EAAgB,IAAA,CAAK,cAAA,GAAiB,KAAA;AACnD,EAAA,OAAO,CAAC,IAAI,CAAA;AACd;AAWA,SAAS,kBAAA,CAAmB,QAAgB,IAAA,EAAoB;AAC9D,EAAA,IAAI,SAAS,CAAA,EAAG;AAEd,IAAA,OAAO,OAAA,CAAQ,GAAA,GAAM,IAAA,CAAK,GAAA,CAAI,IAAA,CAAK,GAAA,CAAI,MAAA,EAAQ,CAAC,CAAA,EAAG,CAAC,CAAA,GAAI,IAAI,CAAA;AAAA,EAC9D;AACA,EAAA,MAAM,IAAA,GAAO,IAAA,CAAK,GAAA,CAAI,MAAA,EAAQ,CAAC,CAAA;AAC/B,EAAA,OAAO,OAAA,CAAQ,MAAO,IAAA,CAAK,GAAA,CAAI,MAAM,CAAC,CAAA,GAAI,IAAK,IAAI,CAAA;AACrD;AAEA,SAAS,QAAQ,KAAA,EAAuB;AACtC,EAAA,IAAI,CAAC,MAAA,CAAO,QAAA,CAAS,KAAK,GAAG,OAAO,CAAA;AACpC,EAAA,IAAI,KAAA,GAAQ,GAAG,OAAO,CAAA;AACtB,EAAA,IAAI,KAAA,GAAQ,GAAG,OAAO,CAAA;AACtB,EAAA,OAAO,KAAA;AACT","file":"chunk-NQK7CZR2.js","sourcesContent":["import type { LanguageEvidence, LanguageProfile } from \"./types.js\";\nimport { classifyBySnippet, type Rung, type Rung3Resolver } from \"./internal/classify.js\";\n\n/**\n * Producer: candidate-relative script + lexical signals from the title text.\n *\n * Wraps the ported snippet classifier ({@link classifyBySnippet}): noise strip →\n * dominant-script scope → distinctive letters (rung 1) → function words (2a) →\n * frequent words (2b). The `candidates` roster makes scoring roster-relative —\n * `і` decides Ukrainian only when Russian is also a candidate. 
Sync and\n * zero-dependency; the optional franc rung is injected via `rung3`.\n *\n * Emits at most one `kind: \"title-script\"` evidence item. The classifier's\n * integer `margin` (the winner's lead over the runner-up) maps to a 0..1\n * `confidence`: a verdict at all means the dominant script and the deciding rung\n * agreed, so the floor is high; a wider lead nudges it up. With no candidates\n * (or no usable distinctive signal) it abstains — emitting nothing rather than a\n * coarse \"unknown\", since the roster decides relevance.\n */\nexport function evidenceFromText(\n text: string | undefined,\n candidates?: readonly LanguageProfile[],\n rung3?: Rung3Resolver,\n): LanguageEvidence[] {\n if (text === undefined || text.trim().length === 0) return [];\n if (candidates === undefined || candidates.length === 0) return [];\n\n const verdict = classifyBySnippet(text, candidates, rung3);\n if (verdict.language === \"unknown\") return [];\n\n const item: LanguageEvidence = {\n kind: \"title-script\",\n language: verdict.language,\n confidence: marginToConfidence(verdict.margin, verdict.rung),\n source: \"title-script\",\n value: text.trim().slice(0, 80),\n };\n // Surface only the meaningful negative: the script was owned by ≤1 candidate,\n // so it didn't choose between candidates. The discriminating case stays narrow\n // (flag omitted). `fuse({ nonDiscriminatingScript: \"unknown\" })` reads this.\n if (!verdict.discriminating) item.discriminating = false;\n return [item];\n}\n\n/**\n * Map the classifier's per-rung lead to a 0..1 confidence.\n *\n * Rungs 1–2 carry an integer count of distinctive items (≥1). A verdict already\n * means script + rung agreed, so the floor is high (0.6) and each extra\n * distinctive item adds up to a 0.35 bonus, saturating by a lead of 4. 
Rung 3\n * (franc) carries franc's own 0..1 score-gap, which is weaker evidence, so it is\n * scaled into a 0.4..0.75 band.\n */\nfunction marginToConfidence(margin: number, rung: Rung): number {\n if (rung === 3) {\n // franc score-gap is already 0..1; weaker than the distinctive rungs.\n return clamp01(0.4 + Math.min(Math.max(margin, 0), 1) * 0.35);\n }\n const lead = Math.max(margin, 1);\n return clamp01(0.6 + (Math.min(lead, 4) / 4) * 0.35);\n}\n\nfunction clamp01(value: number): number {\n if (!Number.isFinite(value)) return 0;\n if (value < 0) return 0;\n if (value > 1) return 1;\n return value;\n}\n"]}
|
|
@@ -127,5 +127,5 @@ function classifyBySnippet(text, candidates, rung3) {
|
|
|
127
127
|
}
|
|
128
128
|
|
|
129
129
|
export { FRANC_RUNG, classifyBySnippet, scopeCandidates };
|
|
130
|
-
//# sourceMappingURL=chunk-NCGZPEDA.js.map
|
|
131
|
-
//# sourceMappingURL=chunk-NCGZPEDA.js.map
|
|
130
|
+
//# sourceMappingURL=chunk-WYSCL5J5.js.map
|
|
131
|
+
//# sourceMappingURL=chunk-WYSCL5J5.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/internal/classify.ts"],"names":[],"mappings":";AA0BO,IAAM,UAAA,GAAa;AA0B1B,IAAM,OAAA,GAA0B;AAAA,EAC9B,QAAA,EAAU,SAAA;AAAA,EACV,MAAA,EAAQ,CAAA;AAAA,EACR,IAAA,EAAM,IAAA;AAAA,EACN,cAAA,EAAgB;AAClB,CAAA;AAmBA,IAAM,WAAA,GAAc,sBAAA;AACpB,IAAM,QAAA,GAAW,mBAAA;AAkBjB,IAAM,cAAA,GAAoC;AAAA,EACxC,oBAAA;AAAA;AAAA,EACA,cAAA;AAAA;AAAA,EACA,2CAAA;AAAA;AAAA,EACA;AAAA;AACF,CAAA;AAIO,SAAS,WAAW,IAAA,EAAsB;AAC/C,EAAA,IAAI,GAAA,GAAM,IAAA;AACV,EAAA,KAAA,MAAW,MAAM,cAAA,EAAgB,GAAA,GAAM,GAAA,CAAI,OAAA,CAAQ,IAAI,GAAG,CAAA;AAC1D,EAAA,OAAO,GAAA;AACT;AAKA,SAAS,eAAe,IAAA,EAA2C;AACjE,EAAA,IAAI,GAAA,GAAM,CAAA;AACV,EAAA,IAAI,GAAA,GAAM,CAAA;AACV,EAAA,KAAA,MAAW,EAAA,IAAM,UAAA,CAAW,IAAI,CAAA,EAAG;AACjC,IAAA,IAAI,WAAA,CAAY,IAAA,CAAK,EAAE,CAAA,EAAG,GAAA,IAAO,CAAA;AAAA,SAAA,IACxB,QAAA,CAAS,IAAA,CAAK,EAAE,CAAA,EAAG,GAAA,IAAO,CAAA;AAAA,EACrC;AACA,EAAA,IAAI,GAAA,KAAQ,CAAA,IAAK,GAAA,KAAQ,CAAA,EAAG,OAAO,IAAA;AACnC,EAAA,OAAO,GAAA,IAAO,MAAM,UAAA,GAAa,OAAA;AACnC;AAGA,SAAS,cAAc,OAAA,EAAuD;AAC5E,EAAA,KAAA,MAAW,EAAA,IAAM,QAAQ,QAAA,EAAU;AACjC,IAAA,IAAI,WAAA,CAAY,IAAA,CAAK,EAAE,CAAA,EAAG,OAAO,UAAA;AACjC,IAAA,IAAI,QAAA,CAAS,IAAA,CAAK,EAAE,CAAA,EAAG,OAAO,OAAA;AAAA,EAChC;AACA,EAAA,OAAO,IAAA;AACT;AAMO,SAAS,eAAA,CACd,MACA,UAAA,EACK;AACL,EAAA,MAAM,MAAA,GAAS,eAAe,IAAI,CAAA;AAClC,EAAA,IAAI,MAAA,KAAW,IAAA,EAAM,OAAO,EAAC;AAI7B,EAAA,MAAM,IAAA,uBAAW,GAAA,EAAY;AAC7B,EAAA,MAAM,SAAc,EAAC;AACrB,EAAA,KAAA,MAAW,KAAK,UAAA,EAAY;AAC1B,IAAA,IAAI,aAAA,CAAc,CAAC,CAAA,KAAM,MAAA,IAAU,KAAK,GAAA,CAAI,CAAA,CAAE,IAAI,CAAA,EAAG;AACrD,IAAA,IAAA,CAAK,GAAA,CAAI,EAAE,IAAI,CAAA;AACf,IAAA,MAAA,CAAO,KAAK,CAAC,CAAA;AAAA,EACf;AACA,EAAA,OAAO,MAAA;AACT;AA8BA,SAAS,SAAS,IAAA,EAAwB;AACxC,EAAA,OAAO,KAAK,WAAA,EAAY,CAAE,KAAA,CAAM,UAAU,KAAK,EAAC;AAClD;AAOA,SAAS,KAAA,CAAM,OAAyB,UAAA,EAAwD;AAC9F,EAAA,MAAM,MAAA,GAAS,IAAI,GAAA,CAAoB,UAAA,CAAW,GAAA,CAAI,CAAC,CAAA,KAAM,CAAC,CAAA,CAAE,IAAA,EAAM,CAAC,CAAC,CAAC,CAAA;AACzE,EAAA,KAAA,MAAW,QAAQ,KAAA,EAAO;AACxB,IAAA,IAAI,KAAA,GAAuB,IAAA;AAC3B,IAAA,IAAI,MAAA,GAAS,CAAA;AACb,IAAA,KAAA,MAAW,KAAK,UAAA
,EAAY;AAC1B,MAAA,IAAI,CAAA,CAAE,GAAA,CAAI,GAAA,CAAI,IAAI,CAAA,EAAG;AACnB,QAAA,MAAA,IAAU,CAAA;AACV,QAAA,IAAI,SAAS,CAAA,EAAG;AACd,UAAA,KAAA,GAAQ,IAAA;AACR,UAAA;AAAA,QACF;AACA,QAAA,KAAA,GAAQ,CAAA,CAAE,IAAA;AAAA,MACZ;AAAA,IACF;AACA,IAAA,IAAI,KAAA,KAAU,IAAA,EAAM,MAAA,CAAO,GAAA,CAAI,KAAA,EAAA,CAAQ,OAAO,GAAA,CAAI,KAAK,CAAA,IAAK,CAAA,IAAK,CAAC,CAAA;AAAA,EACpE;AACA,EAAA,OAAO,MAAA;AACT;AAGA,SAAS,OAAO,MAAA,EAAsE;AACpF,EAAA,IAAI,GAAA,GAAM,EAAA;AACV,EAAA,IAAI,MAAA,GAAS,EAAA;AACb,EAAA,IAAI,IAAA,GAAsB,IAAA;AAC1B,EAAA,KAAA,MAAW,CAAC,CAAA,EAAG,KAAK,CAAA,IAAK,MAAA,EAAQ;AAC/B,IAAA,IAAI,QAAQ,GAAA,EAAK;AACf,MAAA,MAAA,GAAS,GAAA;AACT,MAAA,GAAA,GAAM,KAAA;AACN,MAAA,IAAA,GAAO,CAAA;AAAA,IACT,CAAA,MAAA,IAAW,QAAQ,MAAA,EAAQ;AACzB,MAAA,MAAA,GAAS,KAAA;AAAA,IACX;AAAA,EACF;AACA,EAAA,IAAI,IAAA,KAAS,IAAA,IAAQ,GAAA,GAAM,CAAA,EAAG,OAAO,IAAA;AACrC,EAAA,MAAM,MAAA,GAAS,GAAA,GAAM,IAAA,CAAK,GAAA,CAAI,QAAQ,CAAC,CAAA;AACvC,EAAA,OAAO,MAAA,IAAU,CAAA,GAAI,EAAE,IAAA,EAAM,QAAO,GAAI,IAAA;AAC1C;AAEA,SAAS,aAAA,CACP,YACA,IAAA,EACc;AACd,EAAA,OAAO,UAAA,CAAW,GAAA,CAAI,CAAC,CAAA,MAAO,EAAE,IAAA,EAAM,CAAA,CAAE,IAAA,EAAM,GAAA,EAAK,IAAI,GAAA,CAAI,IAAA,CAAK,CAAC,CAAC,GAAE,CAAE,CAAA;AACxE;AAIA,SAAS,UAAA,CAAW,MAAc,MAAA,EAAwD;AACxF,EAAA,MAAM,CAAA,GAAI,MAAA;AAAA,IACR,KAAA;AAAA,MACE,KAAK,WAAA,EAAY;AAAA,MACjB,aAAA,CAAc,QAAQ,CAAC,CAAA,KAAM,EAAE,QAAA,IAAY,CAAA,CAAE,SAAS,EAAA,CAAG;AAAA;AAC3D,GACF;AACA,EAAA,OAAO,CAAA,GAAI,EAAE,QAAA,EAAU,CAAA,CAAE,IAAA,EAAM,QAAQ,CAAA,CAAE,MAAA,EAAQ,IAAA,EAAM,CAAA,EAAE,GAAI,IAAA;AAC/D;AAGA,SAAS,QAAA,CACP,MAAA,EACA,MAAA,EACA,IAAA,EACA,IAAA,EACoB;AACpB,EAAA,MAAM,CAAA,GAAI,MAAA;AAAA,IACR,KAAA;AAAA,MACE,MAAA;AAAA,MACA,aAAA,CAAc,QAAQ,CAAC,CAAA,KAAM,EAAE,KAAA,GAAQ,IAAI,CAAA,IAAK,EAAE;AAAA;AACpD,GACF;AACA,EAAA,OAAO,CAAA,GAAI,EAAE,QAAA,EAAU,CAAA,CAAE,MAAM,MAAA,EAAQ,CAAA,CAAE,MAAA,EAAQ,IAAA,EAAK,GAAI,IAAA;AAC5D;AAcO,SAAS,iBAAA,CACd,IAAA,EACA,UAAA,EACA,KAAA,EACgB;AAChB,EAAA,IAAI,CAAC,IAAA,IAAQ,UAAA,CAAW,MAAA,KAAW,GAAG,OAAO,OAAA;AAI7C,EAAA,MAAM,OAAA,GAAU,WAAW,IAAI,CAAA;AAG/B,EAAA,MAAM,MAAA,GAAS,eAAA,CAAgB,OAAA,EAAS,U
AAU,CAAA;AAClD,EAAA,IAAI,MAAA,CAAO,MAAA,KAAW,CAAA,EAAG,OAAO,OAAA;AAKhC,EAAA,MAAM,cAAA,GAAiB,OAAO,MAAA,IAAU,CAAA;AAExC,EAAA,MAAM,QAAA,GAAW,UAAA,CAAW,OAAA,EAAS,MAAM,CAAA;AAC3C,EAAA,IAAI,QAAA,EAAU,OAAO,EAAE,GAAG,UAAU,cAAA,EAAe;AAEnD,EAAA,MAAM,MAAA,GAAS,SAAS,OAAO,CAAA;AAC/B,EAAA,IAAI,MAAA,CAAO,MAAA,KAAW,CAAA,EAAG,OAAO,OAAA;AAEhC,EAAA,MAAM,MAAA,GACJ,QAAA,CAAS,MAAA,EAAQ,MAAA,EAAQ,YAAY,IAAI,CAAA,IACzC,QAAA,CAAS,MAAA,EAAQ,QAAQ,UAAA,EAAY,IAAI,CAAA,IACzC,KAAA,GAAQ,SAAS,MAAM,CAAA;AACzB,EAAA,OAAO,MAAA,GAAS,EAAE,GAAG,MAAA,EAAQ,gBAAe,GAAI,OAAA;AAClD","file":"chunk-WYSCL5J5.js","sourcesContent":["/**\n * Per-snippet language classification by candidate-set-relative set-difference.\n *\n * A ladder of rungs; the first rung whose leader clears a lead (margin) of ≥1\n * wins; otherwise `\"unknown\"`:\n *\n * 1 alphabet — characters distinctive within the candidate set\n * 2a function words — curated grammatical markers (highest precision)\n * 2b frequent words — corpus content words\n * 3 franc — optional trigram backstop for the distinctive-free\n * residual, injected as a resolver (this module stays\n * franc-free and importable without franc's tables)\n *\n * \"Distinctive\" is ALWAYS relative to the candidate set: a signal counts for a\n * candidate iff it appears in that candidate's profile and in NO other\n * candidate's. So `і` decides {uk, ru} (only uk has it) but is inert in\n * {uk, be} (both have it), and the word `и` decides {uk, ru} even though the\n * *letter* `и` is shared. Nothing is precomputed — uniqueness is the runtime\n * output, never stored.\n *\n * Adapted to langtell's {@link LanguageProfile} shape: the `words` and `iso6393`\n * fields are optional here, so a bare `{ code, alphabet }` profile still\n * classifies on rung 1.\n */\nimport type { LanguageProfile } from \"../types.js\";\n\nexport const FRANC_RUNG = 3;\n\n/** Which rung decided a verdict; `null` when unknown. 
*/\nexport type Rung = 1 | \"2a\" | \"2b\" | typeof FRANC_RUNG | null;\n\nexport interface SnippetVerdict {\n /** Winning language code, or the sentinel `\"unknown\"`. */\n language: string;\n /** Lead of the winner over the runner-up, in the rung's own unit (distinctive\n * char/word count for rungs 1–2; franc score-gap for rung 3). 0 when unknown. */\n margin: number;\n /** Which rung decided; `null` when unknown. */\n rung: Rung;\n /** Whether ≥2 same-script candidates were in scope when the verdict was\n * reached. `true` ⇒ the distinctive-letter/word machinery actually chose\n * between candidates; `false` ⇒ the winner was the lone candidate in its\n * script, selected by script alone (no evidence it is *distinctively* that\n * language). `false` for `\"unknown\"`. */\n discriminating: boolean;\n}\n\n/** A rung's verdict before {@link classifyBySnippet} stamps on the scope-derived\n * `discriminating` flag (which a single rung can't know — it depends on how many\n * same-script candidates were scoped). */\nexport type RungVerdict = Pick<SnippetVerdict, \"language\" | \"margin\" | \"rung\">;\n\nconst UNKNOWN: SnippetVerdict = {\n language: \"unknown\",\n margin: 0,\n rung: null,\n discriminating: false,\n};\n\n/** Resolver for rung 3 (the optional trigram backstop), injected into\n * {@link classifyBySnippet} by callers that have franc available. Kept as an\n * injected seam — not a direct import — so this module stays franc-free and\n * importable without pulling franc's tables. Returns a rung-3 verdict or\n * `null` (abstain).\n *\n * Generic over the concrete profile type `P` the caller classifies with, so a\n * consumer that defines a stricter profile (e.g. `words` required) can type its\n * resolver over that exact shape and hand it to {@link classifyBySnippet} with\n * no adapter — the resolver sees `readonly P[]`, the same array the classifier\n * scoped from its input. Defaults to {@link LanguageProfile} for callers that\n * don't narrow. 
*/\nexport type Rung3Resolver<P extends LanguageProfile = LanguageProfile> = (\n text: string,\n scoped: readonly P[],\n) => RungVerdict | null;\n\nconst CYRILLIC_RE = /\\p{Script=Cyrillic}/u;\nconst LATIN_RE = /\\p{Script=Latin}/u;\n\n/** Below this length, trigrams are too noisy to justify a rung-3 verdict. */\nexport const RUNG3_MIN_LENGTH = 24;\n\n/**\n * Trailing/inline Latin \"noise\" tokens — URLs, @handles, #hashtags — that a\n * Cyrillic title commonly carries (a headline followed by a link or a social\n * handle). These are almost always Latin even on Cyrillic-language content, so\n * left in they can flip {@link dominantScript} to Latin and let genuinely\n * Cyrillic content scope to the wrong roster. Stripped before the script vote\n * AND before the rung tallies so the URL's letters never contribute either.\n *\n * Kept as separate simple patterns (applied in order — schemes/www before bare\n * domains) rather than one big alternation, so each stays readable. ASCII-only\n * `[a-z0-9-]` in the domain pattern means a Cyrillic word is never mistaken for\n * a domain.\n */\nconst NOISE_PATTERNS: readonly RegExp[] = [\n /\\bhttps?:\\/\\/\\S+/gi, // full URLs\n /\\bwww\\.\\S+/gi, // www.… without a scheme\n /\\b[a-z0-9-]+(?:\\.[a-z0-9-]+)+(?:\\/\\S*)?/gi, // bare domains (example.com/path)\n /[@#][\\p{L}\\p{N}_]+/gu, // @handles and #hashtags\n];\n\n/** Drop URLs / @handles / #hashtags so trailing Latin noise can't outvote the\n * prose's script or pollute the per-rung tallies. */\nexport function stripNoise(text: string): string {\n let out = text;\n for (const re of NOISE_PATTERNS) out = out.replace(re, \" \");\n return out;\n}\n\n/** The script most of `text` is written in, or `null` if it carries no letters.\n * Noise (URLs/handles/hashtags) is stripped first so a single trailing link\n * can't flip a multi-word Cyrillic title's vote to Latin. 
*/\nfunction dominantScript(text: string): \"cyrillic\" | \"latin\" | null {\n let cyr = 0;\n let lat = 0;\n for (const ch of stripNoise(text)) {\n if (CYRILLIC_RE.test(ch)) cyr += 1;\n else if (LATIN_RE.test(ch)) lat += 1;\n }\n if (cyr === 0 && lat === 0) return null;\n return cyr >= lat ? \"cyrillic\" : \"latin\";\n}\n\n/** The script of a profile's alphabet. */\nfunction profileScript(profile: LanguageProfile): \"cyrillic\" | \"latin\" | null {\n for (const ch of profile.alphabet) {\n if (CYRILLIC_RE.test(ch)) return \"cyrillic\";\n if (LATIN_RE.test(ch)) return \"latin\";\n }\n return null;\n}\n\n/** Candidates whose script matches the text's dominant script (others can't tip\n * the verdict). Empty when the text carries no letters. Generic over the\n * concrete profile type `P`: the result is a subset of the input array, so it\n * keeps `P` — a stricter caller's profiles stay strictly typed downstream. */\nexport function scopeCandidates<P extends LanguageProfile>(\n text: string,\n candidates: readonly P[],\n): P[] {\n const script = dominantScript(text);\n if (script === null) return [];\n // Keep one profile per code. A language listed twice would otherwise make its\n // own distinctive chars/words read as \"owned by ≥2 candidates\" in `tally`,\n // cancelling them out and collapsing the verdict to \"unknown\".\n const seen = new Set<string>();\n const scoped: P[] = [];\n for (const c of candidates) {\n if (profileScript(c) !== script || seen.has(c.code)) continue;\n seen.add(c.code);\n scoped.push(c);\n }\n return scoped;\n}\n\n/**\n * Per-language set of characters globally unique within `profiles` — present in\n * exactly one profile's alphabet. 
Relative to the given profile set: the unique\n * set shrinks as languages are added (a second Latin language un-uniques a–z).\n */\nexport function distinctiveChars(profiles: readonly LanguageProfile[]): Map<string, Set<string>> {\n const owners = new Map<string, string[]>();\n for (const p of profiles) {\n for (const ch of new Set(p.alphabet)) {\n const list = owners.get(ch);\n if (list) list.push(p.code);\n else owners.set(ch, [p.code]);\n }\n }\n const result = new Map<string, Set<string>>(profiles.map((p) => [p.code, new Set()]));\n for (const [ch, codes] of owners) {\n const [only] = codes;\n if (codes.length === 1 && only !== undefined) result.get(only)?.add(ch);\n }\n return result;\n}\n\ninterface Membership {\n code: string;\n set: ReadonlySet<string>;\n}\n\n/** Lowercased Unicode letter-run tokens. Keeps single-char tokens (`і`, `и`). */\nfunction tokenize(text: string): string[] {\n return text.toLowerCase().match(/\\p{L}+/gu) ?? [];\n}\n\n/**\n * Tally how many items (characters or word tokens) are distinctive to each\n * candidate — present in exactly one candidate's set. Items owned by zero or by\n * ≥2 candidates contribute nothing.\n */\nfunction tally(items: Iterable<string>, membership: readonly Membership[]): Map<string, number> {\n const scores = new Map<string, number>(membership.map((m) => [m.code, 0]));\n for (const item of items) {\n let owner: string | null = null;\n let owners = 0;\n for (const m of membership) {\n if (m.set.has(item)) {\n owners += 1;\n if (owners > 1) {\n owner = null;\n break;\n }\n owner = m.code;\n }\n }\n if (owner !== null) scores.set(owner, (scores.get(owner) ?? 0) + 1);\n }\n return scores;\n}\n\n/** The leading candidate and its lead over the runner-up, or `null` if <1. 
*/\nfunction leader(scores: Map<string, number>): { code: string; margin: number } | null {\n let max = -1;\n let second = -1;\n let code: string | null = null;\n for (const [c, score] of scores) {\n if (score > max) {\n second = max;\n max = score;\n code = c;\n } else if (score > second) {\n second = score;\n }\n }\n if (code === null || max < 1) return null;\n const margin = max - Math.max(second, 0);\n return margin >= 1 ? { code, margin } : null;\n}\n\nfunction membershipFor(\n candidates: readonly LanguageProfile[],\n pick: (p: LanguageProfile) => Iterable<string>,\n): Membership[] {\n return candidates.map((c) => ({ code: c.code, set: new Set(pick(c)) }));\n}\n\n/** Rung 1 — characters (alphabet + orthographic {@link LanguageProfile.marks})\n * distinctive within the scoped candidate set. */\nfunction letterRung(text: string, scoped: readonly LanguageProfile[]): RungVerdict | null {\n const r = leader(\n tally(\n text.toLowerCase(),\n membershipFor(scoped, (p) => p.alphabet + (p.marks ?? \"\")),\n ),\n );\n return r ? { language: r.code, margin: r.margin, rung: 1 } : null;\n}\n\n/** Rung 2 — distinctive words from the given tier (2a function, 2b frequent). */\nfunction wordRung(\n tokens: readonly string[],\n scoped: readonly LanguageProfile[],\n tier: \"function\" | \"frequent\",\n rung: \"2a\" | \"2b\",\n): RungVerdict | null {\n const r = leader(\n tally(\n tokens,\n membershipFor(scoped, (p) => p.words?.[tier] ?? []),\n ),\n );\n return r ? { language: r.code, margin: r.margin, rung } : null;\n}\n\n/**\n * Classify `text` among `candidates`. Synchronous and allocation-light. Returns\n * `\"unknown\"` on empty evidence, on a tie inside the candidate set, or when\n * nothing is distinctive.\n *\n * Generic over the concrete profile type `P`, inferred from `candidates`. The\n * optional `rung3` resolver is typed over the same `P`, so a consumer with a\n * stricter profile (e.g. 
`words` required) can pass its own resolver directly,\n * with no adapter — the resolver sees exactly the profiles the caller passed.\n * `P` defaults to {@link LanguageProfile}, so the bare two-argument form and\n * every existing call site are unchanged.\n */\nexport function classifyBySnippet<P extends LanguageProfile = LanguageProfile>(\n text: string,\n candidates: readonly P[],\n rung3?: Rung3Resolver<P>,\n): SnippetVerdict {\n if (!text || candidates.length === 0) return UNKNOWN;\n\n // Drop URLs / @handles / #hashtags once, up front: trailing Latin noise must\n // not flip the dominant-script vote nor pollute the per-rung tallies.\n const cleaned = stripNoise(text);\n\n // Restrict to candidates in the text's dominant script.\n const scoped = scopeCandidates(cleaned, candidates);\n if (scoped.length === 0) return UNKNOWN;\n\n // ≥2 same-script candidates means the distinctive machinery actually had a\n // choice to make; a lone scoped candidate wins by script alone. Stamped onto\n // whichever rung decides — a single rung can't see the scope size.\n const discriminating = scoped.length >= 2;\n\n const byLetter = letterRung(cleaned, scoped);\n if (byLetter) return { ...byLetter, discriminating };\n\n const tokens = tokenize(cleaned);\n if (tokens.length === 0) return UNKNOWN;\n\n const byWord =\n wordRung(tokens, scoped, \"function\", \"2a\") ??\n wordRung(tokens, scoped, \"frequent\", \"2b\") ??\n rung3?.(cleaned, scoped);\n return byWord ? { ...byWord, discriminating } : UNKNOWN;\n}\n"]}
|
package/dist/classify.d.ts
CHANGED
|
@@ -51,13 +51,27 @@ type RungVerdict = Pick<SnippetVerdict, "language" | "margin" | "rung">;
|
|
|
51
51
|
* {@link classifyBySnippet} by callers that have franc available. Kept as an
|
|
52
52
|
* injected seam — not a direct import — so this module stays franc-free and
|
|
53
53
|
* importable without pulling franc's tables. Returns a rung-3 verdict or
|
|
54
|
-
* `null` (abstain).
|
|
55
|
-
|
|
54
|
+
* `null` (abstain).
|
|
55
|
+
*
|
|
56
|
+
* Generic over the concrete profile type `P` the caller classifies with, so a
|
|
57
|
+
* consumer that defines a stricter profile (e.g. `words` required) can type its
|
|
58
|
+
* resolver over that exact shape and hand it to {@link classifyBySnippet} with
|
|
59
|
+
* no adapter — the resolver sees `readonly P[]`, the same array the classifier
|
|
60
|
+
* scoped from its input. Defaults to {@link LanguageProfile} for callers that
|
|
61
|
+
* don't narrow. */
|
|
62
|
+
type Rung3Resolver<P extends LanguageProfile = LanguageProfile> = (text: string, scoped: readonly P[]) => RungVerdict | null;
|
|
56
63
|
/**
|
|
57
64
|
* Classify `text` among `candidates`. Synchronous and allocation-light. Returns
|
|
58
65
|
* `"unknown"` on empty evidence, on a tie inside the candidate set, or when
|
|
59
66
|
* nothing is distinctive.
|
|
67
|
+
*
|
|
68
|
+
* Generic over the concrete profile type `P`, inferred from `candidates`. The
|
|
69
|
+
* optional `rung3` resolver is typed over the same `P`, so a consumer with a
|
|
70
|
+
* stricter profile (e.g. `words` required) can pass its own resolver directly,
|
|
71
|
+
* with no adapter — the resolver sees exactly the profiles the caller passed.
|
|
72
|
+
* `P` defaults to {@link LanguageProfile}, so the bare two-argument form and
|
|
73
|
+
* every existing call site are unchanged.
|
|
60
74
|
*/
|
|
61
|
-
declare function classifyBySnippet(text: string, candidates: readonly
|
|
75
|
+
declare function classifyBySnippet<P extends LanguageProfile = LanguageProfile>(text: string, candidates: readonly P[], rung3?: Rung3Resolver<P>): SnippetVerdict;
|
|
62
76
|
|
|
63
77
|
export { FRANC_RUNG, type Rung, type Rung3Resolver, type RungVerdict, type SnippetVerdict, classifyBySnippet };
|
package/dist/classify.js
CHANGED
package/dist/franc.js
CHANGED
package/dist/index.js
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { evidenceFromText } from './chunk-PT7R2BRQ.js';
|
|
2
|
-
export { evidenceFromText } from './chunk-PT7R2BRQ.js';
|
|
1
|
+
import { evidenceFromText } from './chunk-NQK7CZR2.js';
|
|
2
|
+
export { evidenceFromText } from './chunk-NQK7CZR2.js';
|
|
3
3
|
import { evidenceFromHtml } from './chunk-KI4MAI3N.js';
|
|
4
4
|
export { evidenceFromHtml } from './chunk-KI4MAI3N.js';
|
|
5
5
|
import { evidenceFromHeaders } from './chunk-3LDE35U2.js';
|
|
@@ -7,7 +7,7 @@ export { evidenceFromHeaders } from './chunk-3LDE35U2.js';
|
|
|
7
7
|
import { fuse } from './chunk-7G3MEXWK.js';
|
|
8
8
|
export { fuse } from './chunk-7G3MEXWK.js';
|
|
9
9
|
export { normalizeBCP47, normalizeLanguageCode, primarySubtag } from './chunk-OVSPOZ5J.js';
|
|
10
|
-
import './chunk-NCGZPEDA.js';
|
|
10
|
+
import './chunk-WYSCL5J5.js';
|
|
11
11
|
|
|
12
12
|
// src/compile.ts
|
|
13
13
|
function builtIns(candidates) {
|
package/dist/text.js
CHANGED
package/package.json
CHANGED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/internal/classify.ts"],"names":[],"mappings":";AA0BO,IAAM,UAAA,GAAa;AA0B1B,IAAM,OAAA,GAA0B;AAAA,EAC9B,QAAA,EAAU,SAAA;AAAA,EACV,MAAA,EAAQ,CAAA;AAAA,EACR,IAAA,EAAM,IAAA;AAAA,EACN,cAAA,EAAgB;AAClB,CAAA;AAYA,IAAM,WAAA,GAAc,sBAAA;AACpB,IAAM,QAAA,GAAW,mBAAA;AAkBjB,IAAM,cAAA,GAAoC;AAAA,EACxC,oBAAA;AAAA;AAAA,EACA,cAAA;AAAA;AAAA,EACA,2CAAA;AAAA;AAAA,EACA;AAAA;AACF,CAAA;AAIO,SAAS,WAAW,IAAA,EAAsB;AAC/C,EAAA,IAAI,GAAA,GAAM,IAAA;AACV,EAAA,KAAA,MAAW,MAAM,cAAA,EAAgB,GAAA,GAAM,GAAA,CAAI,OAAA,CAAQ,IAAI,GAAG,CAAA;AAC1D,EAAA,OAAO,GAAA;AACT;AAKA,SAAS,eAAe,IAAA,EAA2C;AACjE,EAAA,IAAI,GAAA,GAAM,CAAA;AACV,EAAA,IAAI,GAAA,GAAM,CAAA;AACV,EAAA,KAAA,MAAW,EAAA,IAAM,UAAA,CAAW,IAAI,CAAA,EAAG;AACjC,IAAA,IAAI,WAAA,CAAY,IAAA,CAAK,EAAE,CAAA,EAAG,GAAA,IAAO,CAAA;AAAA,SAAA,IACxB,QAAA,CAAS,IAAA,CAAK,EAAE,CAAA,EAAG,GAAA,IAAO,CAAA;AAAA,EACrC;AACA,EAAA,IAAI,GAAA,KAAQ,CAAA,IAAK,GAAA,KAAQ,CAAA,EAAG,OAAO,IAAA;AACnC,EAAA,OAAO,GAAA,IAAO,MAAM,UAAA,GAAa,OAAA;AACnC;AAGA,SAAS,cAAc,OAAA,EAAuD;AAC5E,EAAA,KAAA,MAAW,EAAA,IAAM,QAAQ,QAAA,EAAU;AACjC,IAAA,IAAI,WAAA,CAAY,IAAA,CAAK,EAAE,CAAA,EAAG,OAAO,UAAA;AACjC,IAAA,IAAI,QAAA,CAAS,IAAA,CAAK,EAAE,CAAA,EAAG,OAAO,OAAA;AAAA,EAChC;AACA,EAAA,OAAO,IAAA;AACT;AAIO,SAAS,eAAA,CACd,MACA,UAAA,EACmB;AACnB,EAAA,MAAM,MAAA,GAAS,eAAe,IAAI,CAAA;AAClC,EAAA,IAAI,MAAA,KAAW,IAAA,EAAM,OAAO,EAAC;AAI7B,EAAA,MAAM,IAAA,uBAAW,GAAA,EAAY;AAC7B,EAAA,MAAM,SAA4B,EAAC;AACnC,EAAA,KAAA,MAAW,KAAK,UAAA,EAAY;AAC1B,IAAA,IAAI,aAAA,CAAc,CAAC,CAAA,KAAM,MAAA,IAAU,KAAK,GAAA,CAAI,CAAA,CAAE,IAAI,CAAA,EAAG;AACrD,IAAA,IAAA,CAAK,GAAA,CAAI,EAAE,IAAI,CAAA;AACf,IAAA,MAAA,CAAO,KAAK,CAAC,CAAA;AAAA,EACf;AACA,EAAA,OAAO,MAAA;AACT;AA8BA,SAAS,SAAS,IAAA,EAAwB;AACxC,EAAA,OAAO,KAAK,WAAA,EAAY,CAAE,KAAA,CAAM,UAAU,KAAK,EAAC;AAClD;AAOA,SAAS,KAAA,CAAM,OAAyB,UAAA,EAAwD;AAC9F,EAAA,MAAM,MAAA,GAAS,IAAI,GAAA,CAAoB,UAAA,CAAW,GAAA,CAAI,CAAC,CAAA,KAAM,CAAC,CAAA,CAAE,IAAA,EAAM,CAAC,CAAC,CAAC,CAAA;AACzE,EAAA,KAAA,MAAW,QAAQ,KAAA,EAAO;AACxB,IAAA,IAAI,KAAA,GAAuB,IAAA;AAC3B,IAAA,IAAI,MAAA,GAAS,CAAA;AACb,IAAA,KAAA,MAAW,KAAK,UA
AA,EAAY;AAC1B,MAAA,IAAI,CAAA,CAAE,GAAA,CAAI,GAAA,CAAI,IAAI,CAAA,EAAG;AACnB,QAAA,MAAA,IAAU,CAAA;AACV,QAAA,IAAI,SAAS,CAAA,EAAG;AACd,UAAA,KAAA,GAAQ,IAAA;AACR,UAAA;AAAA,QACF;AACA,QAAA,KAAA,GAAQ,CAAA,CAAE,IAAA;AAAA,MACZ;AAAA,IACF;AACA,IAAA,IAAI,KAAA,KAAU,IAAA,EAAM,MAAA,CAAO,GAAA,CAAI,KAAA,EAAA,CAAQ,OAAO,GAAA,CAAI,KAAK,CAAA,IAAK,CAAA,IAAK,CAAC,CAAA;AAAA,EACpE;AACA,EAAA,OAAO,MAAA;AACT;AAGA,SAAS,OAAO,MAAA,EAAsE;AACpF,EAAA,IAAI,GAAA,GAAM,EAAA;AACV,EAAA,IAAI,MAAA,GAAS,EAAA;AACb,EAAA,IAAI,IAAA,GAAsB,IAAA;AAC1B,EAAA,KAAA,MAAW,CAAC,CAAA,EAAG,KAAK,CAAA,IAAK,MAAA,EAAQ;AAC/B,IAAA,IAAI,QAAQ,GAAA,EAAK;AACf,MAAA,MAAA,GAAS,GAAA;AACT,MAAA,GAAA,GAAM,KAAA;AACN,MAAA,IAAA,GAAO,CAAA;AAAA,IACT,CAAA,MAAA,IAAW,QAAQ,MAAA,EAAQ;AACzB,MAAA,MAAA,GAAS,KAAA;AAAA,IACX;AAAA,EACF;AACA,EAAA,IAAI,IAAA,KAAS,IAAA,IAAQ,GAAA,GAAM,CAAA,EAAG,OAAO,IAAA;AACrC,EAAA,MAAM,MAAA,GAAS,GAAA,GAAM,IAAA,CAAK,GAAA,CAAI,QAAQ,CAAC,CAAA;AACvC,EAAA,OAAO,MAAA,IAAU,CAAA,GAAI,EAAE,IAAA,EAAM,QAAO,GAAI,IAAA;AAC1C;AAEA,SAAS,aAAA,CACP,YACA,IAAA,EACc;AACd,EAAA,OAAO,UAAA,CAAW,GAAA,CAAI,CAAC,CAAA,MAAO,EAAE,IAAA,EAAM,CAAA,CAAE,IAAA,EAAM,GAAA,EAAK,IAAI,GAAA,CAAI,IAAA,CAAK,CAAC,CAAC,GAAE,CAAE,CAAA;AACxE;AAIA,SAAS,UAAA,CAAW,MAAc,MAAA,EAAwD;AACxF,EAAA,MAAM,CAAA,GAAI,MAAA;AAAA,IACR,KAAA;AAAA,MACE,KAAK,WAAA,EAAY;AAAA,MACjB,aAAA,CAAc,QAAQ,CAAC,CAAA,KAAM,EAAE,QAAA,IAAY,CAAA,CAAE,SAAS,EAAA,CAAG;AAAA;AAC3D,GACF;AACA,EAAA,OAAO,CAAA,GAAI,EAAE,QAAA,EAAU,CAAA,CAAE,IAAA,EAAM,QAAQ,CAAA,CAAE,MAAA,EAAQ,IAAA,EAAM,CAAA,EAAE,GAAI,IAAA;AAC/D;AAGA,SAAS,QAAA,CACP,MAAA,EACA,MAAA,EACA,IAAA,EACA,IAAA,EACoB;AACpB,EAAA,MAAM,CAAA,GAAI,MAAA;AAAA,IACR,KAAA;AAAA,MACE,MAAA;AAAA,MACA,aAAA,CAAc,QAAQ,CAAC,CAAA,KAAM,EAAE,KAAA,GAAQ,IAAI,CAAA,IAAK,EAAE;AAAA;AACpD,GACF;AACA,EAAA,OAAO,CAAA,GAAI,EAAE,QAAA,EAAU,CAAA,CAAE,MAAM,MAAA,EAAQ,CAAA,CAAE,MAAA,EAAQ,IAAA,EAAK,GAAI,IAAA;AAC5D;AAOO,SAAS,iBAAA,CACd,IAAA,EACA,UAAA,EACA,KAAA,EACgB;AAChB,EAAA,IAAI,CAAC,IAAA,IAAQ,UAAA,CAAW,MAAA,KAAW,GAAG,OAAO,OAAA;AAI7C,EAAA,MAAM,OAAA,GAAU,WAAW,IAAI,CAAA;AAG/B,EAAA,MAAM,MAAA,GAAS,eAAA,CAAgB,OAAA,EAAS
,UAAU,CAAA;AAClD,EAAA,IAAI,MAAA,CAAO,MAAA,KAAW,CAAA,EAAG,OAAO,OAAA;AAKhC,EAAA,MAAM,cAAA,GAAiB,OAAO,MAAA,IAAU,CAAA;AAExC,EAAA,MAAM,QAAA,GAAW,UAAA,CAAW,OAAA,EAAS,MAAM,CAAA;AAC3C,EAAA,IAAI,QAAA,EAAU,OAAO,EAAE,GAAG,UAAU,cAAA,EAAe;AAEnD,EAAA,MAAM,MAAA,GAAS,SAAS,OAAO,CAAA;AAC/B,EAAA,IAAI,MAAA,CAAO,MAAA,KAAW,CAAA,EAAG,OAAO,OAAA;AAEhC,EAAA,MAAM,MAAA,GACJ,QAAA,CAAS,MAAA,EAAQ,MAAA,EAAQ,YAAY,IAAI,CAAA,IACzC,QAAA,CAAS,MAAA,EAAQ,QAAQ,UAAA,EAAY,IAAI,CAAA,IACzC,KAAA,GAAQ,SAAS,MAAM,CAAA;AACzB,EAAA,OAAO,MAAA,GAAS,EAAE,GAAG,MAAA,EAAQ,gBAAe,GAAI,OAAA;AAClD","file":"chunk-NCGZPEDA.js","sourcesContent":["/**\n * Per-snippet language classification by candidate-set-relative set-difference.\n *\n * A ladder of rungs; the first rung whose leader clears a lead (margin) of ≥1\n * wins; otherwise `\"unknown\"`:\n *\n * 1 alphabet — characters distinctive within the candidate set\n * 2a function words — curated grammatical markers (highest precision)\n * 2b frequent words — corpus content words\n * 3 franc — optional trigram backstop for the distinctive-free\n * residual, injected as a resolver (this module stays\n * franc-free and importable without franc's tables)\n *\n * \"Distinctive\" is ALWAYS relative to the candidate set: a signal counts for a\n * candidate iff it appears in that candidate's profile and in NO other\n * candidate's. So `і` decides {uk, ru} (only uk has it) but is inert in\n * {uk, be} (both have it), and the word `и` decides {uk, ru} even though the\n * *letter* `и` is shared. Nothing is precomputed — uniqueness is the runtime\n * output, never stored.\n *\n * Adapted to langtell's {@link LanguageProfile} shape: the `words` and `iso6393`\n * fields are optional here, so a bare `{ code, alphabet }` profile still\n * classifies on rung 1.\n */\nimport type { LanguageProfile } from \"../types.js\";\n\nexport const FRANC_RUNG = 3;\n\n/** Which rung decided a verdict; `null` when unknown. 
*/\nexport type Rung = 1 | \"2a\" | \"2b\" | typeof FRANC_RUNG | null;\n\nexport interface SnippetVerdict {\n /** Winning language code, or the sentinel `\"unknown\"`. */\n language: string;\n /** Lead of the winner over the runner-up, in the rung's own unit (distinctive\n * char/word count for rungs 1–2; franc score-gap for rung 3). 0 when unknown. */\n margin: number;\n /** Which rung decided; `null` when unknown. */\n rung: Rung;\n /** Whether ≥2 same-script candidates were in scope when the verdict was\n * reached. `true` ⇒ the distinctive-letter/word machinery actually chose\n * between candidates; `false` ⇒ the winner was the lone candidate in its\n * script, selected by script alone (no evidence it is *distinctively* that\n * language). `false` for `\"unknown\"`. */\n discriminating: boolean;\n}\n\n/** A rung's verdict before {@link classifyBySnippet} stamps on the scope-derived\n * `discriminating` flag (which a single rung can't know — it depends on how many\n * same-script candidates were scoped). */\nexport type RungVerdict = Pick<SnippetVerdict, \"language\" | \"margin\" | \"rung\">;\n\nconst UNKNOWN: SnippetVerdict = {\n language: \"unknown\",\n margin: 0,\n rung: null,\n discriminating: false,\n};\n\n/** Resolver for rung 3 (the optional trigram backstop), injected into\n * {@link classifyBySnippet} by callers that have franc available. Kept as an\n * injected seam — not a direct import — so this module stays franc-free and\n * importable without pulling franc's tables. Returns a rung-3 verdict or\n * `null` (abstain). */\nexport type Rung3Resolver = (\n text: string,\n scoped: readonly LanguageProfile[],\n) => RungVerdict | null;\n\nconst CYRILLIC_RE = /\\p{Script=Cyrillic}/u;\nconst LATIN_RE = /\\p{Script=Latin}/u;\n\n/** Below this length, trigrams are too noisy to justify a rung-3 verdict. 
*/\nexport const RUNG3_MIN_LENGTH = 24;\n\n/**\n * Trailing/inline Latin \"noise\" tokens — URLs, @handles, #hashtags — that a\n * Cyrillic title commonly carries (a headline followed by a link or a social\n * handle). These are almost always Latin even on Cyrillic-language content, so\n * left in they can flip {@link dominantScript} to Latin and let genuinely\n * Cyrillic content scope to the wrong roster. Stripped before the script vote\n * AND before the rung tallies so the URL's letters never contribute either.\n *\n * Kept as separate simple patterns (applied in order — schemes/www before bare\n * domains) rather than one big alternation, so each stays readable. ASCII-only\n * `[a-z0-9-]` in the domain pattern means a Cyrillic word is never mistaken for\n * a domain.\n */\nconst NOISE_PATTERNS: readonly RegExp[] = [\n /\\bhttps?:\\/\\/\\S+/gi, // full URLs\n /\\bwww\\.\\S+/gi, // www.… without a scheme\n /\\b[a-z0-9-]+(?:\\.[a-z0-9-]+)+(?:\\/\\S*)?/gi, // bare domains (example.com/path)\n /[@#][\\p{L}\\p{N}_]+/gu, // @handles and #hashtags\n];\n\n/** Drop URLs / @handles / #hashtags so trailing Latin noise can't outvote the\n * prose's script or pollute the per-rung tallies. */\nexport function stripNoise(text: string): string {\n let out = text;\n for (const re of NOISE_PATTERNS) out = out.replace(re, \" \");\n return out;\n}\n\n/** The script most of `text` is written in, or `null` if it carries no letters.\n * Noise (URLs/handles/hashtags) is stripped first so a single trailing link\n * can't flip a multi-word Cyrillic title's vote to Latin. */\nfunction dominantScript(text: string): \"cyrillic\" | \"latin\" | null {\n let cyr = 0;\n let lat = 0;\n for (const ch of stripNoise(text)) {\n if (CYRILLIC_RE.test(ch)) cyr += 1;\n else if (LATIN_RE.test(ch)) lat += 1;\n }\n if (cyr === 0 && lat === 0) return null;\n return cyr >= lat ? \"cyrillic\" : \"latin\";\n}\n\n/** The script of a profile's alphabet. 
*/\nfunction profileScript(profile: LanguageProfile): \"cyrillic\" | \"latin\" | null {\n for (const ch of profile.alphabet) {\n if (CYRILLIC_RE.test(ch)) return \"cyrillic\";\n if (LATIN_RE.test(ch)) return \"latin\";\n }\n return null;\n}\n\n/** Candidates whose script matches the text's dominant script (others can't tip\n * the verdict). Empty when the text carries no letters. */\nexport function scopeCandidates(\n text: string,\n candidates: readonly LanguageProfile[],\n): LanguageProfile[] {\n const script = dominantScript(text);\n if (script === null) return [];\n // Keep one profile per code. A language listed twice would otherwise make its\n // own distinctive chars/words read as \"owned by ≥2 candidates\" in `tally`,\n // cancelling them out and collapsing the verdict to \"unknown\".\n const seen = new Set<string>();\n const scoped: LanguageProfile[] = [];\n for (const c of candidates) {\n if (profileScript(c) !== script || seen.has(c.code)) continue;\n seen.add(c.code);\n scoped.push(c);\n }\n return scoped;\n}\n\n/**\n * Per-language set of characters globally unique within `profiles` — present in\n * exactly one profile's alphabet. Relative to the given profile set: the unique\n * set shrinks as languages are added (a second Latin language un-uniques a–z).\n */\nexport function distinctiveChars(profiles: readonly LanguageProfile[]): Map<string, Set<string>> {\n const owners = new Map<string, string[]>();\n for (const p of profiles) {\n for (const ch of new Set(p.alphabet)) {\n const list = owners.get(ch);\n if (list) list.push(p.code);\n else owners.set(ch, [p.code]);\n }\n }\n const result = new Map<string, Set<string>>(profiles.map((p) => [p.code, new Set()]));\n for (const [ch, codes] of owners) {\n const [only] = codes;\n if (codes.length === 1 && only !== undefined) result.get(only)?.add(ch);\n }\n return result;\n}\n\ninterface Membership {\n code: string;\n set: ReadonlySet<string>;\n}\n\n/** Lowercased Unicode letter-run tokens. 
Keeps single-char tokens (`і`, `и`). */\nfunction tokenize(text: string): string[] {\n return text.toLowerCase().match(/\\p{L}+/gu) ?? [];\n}\n\n/**\n * Tally how many items (characters or word tokens) are distinctive to each\n * candidate — present in exactly one candidate's set. Items owned by zero or by\n * ≥2 candidates contribute nothing.\n */\nfunction tally(items: Iterable<string>, membership: readonly Membership[]): Map<string, number> {\n const scores = new Map<string, number>(membership.map((m) => [m.code, 0]));\n for (const item of items) {\n let owner: string | null = null;\n let owners = 0;\n for (const m of membership) {\n if (m.set.has(item)) {\n owners += 1;\n if (owners > 1) {\n owner = null;\n break;\n }\n owner = m.code;\n }\n }\n if (owner !== null) scores.set(owner, (scores.get(owner) ?? 0) + 1);\n }\n return scores;\n}\n\n/** The leading candidate and its lead over the runner-up, or `null` if <1. */\nfunction leader(scores: Map<string, number>): { code: string; margin: number } | null {\n let max = -1;\n let second = -1;\n let code: string | null = null;\n for (const [c, score] of scores) {\n if (score > max) {\n second = max;\n max = score;\n code = c;\n } else if (score > second) {\n second = score;\n }\n }\n if (code === null || max < 1) return null;\n const margin = max - Math.max(second, 0);\n return margin >= 1 ? { code, margin } : null;\n}\n\nfunction membershipFor(\n candidates: readonly LanguageProfile[],\n pick: (p: LanguageProfile) => Iterable<string>,\n): Membership[] {\n return candidates.map((c) => ({ code: c.code, set: new Set(pick(c)) }));\n}\n\n/** Rung 1 — characters (alphabet + orthographic {@link LanguageProfile.marks})\n * distinctive within the scoped candidate set. */\nfunction letterRung(text: string, scoped: readonly LanguageProfile[]): RungVerdict | null {\n const r = leader(\n tally(\n text.toLowerCase(),\n membershipFor(scoped, (p) => p.alphabet + (p.marks ?? \"\")),\n ),\n );\n return r ? 
{ language: r.code, margin: r.margin, rung: 1 } : null;\n}\n\n/** Rung 2 — distinctive words from the given tier (2a function, 2b frequent). */\nfunction wordRung(\n tokens: readonly string[],\n scoped: readonly LanguageProfile[],\n tier: \"function\" | \"frequent\",\n rung: \"2a\" | \"2b\",\n): RungVerdict | null {\n const r = leader(\n tally(\n tokens,\n membershipFor(scoped, (p) => p.words?.[tier] ?? []),\n ),\n );\n return r ? { language: r.code, margin: r.margin, rung } : null;\n}\n\n/**\n * Classify `text` among `candidates`. Synchronous and allocation-light. Returns\n * `\"unknown\"` on empty evidence, on a tie inside the candidate set, or when\n * nothing is distinctive.\n */\nexport function classifyBySnippet(\n text: string,\n candidates: readonly LanguageProfile[],\n rung3?: Rung3Resolver,\n): SnippetVerdict {\n if (!text || candidates.length === 0) return UNKNOWN;\n\n // Drop URLs / @handles / #hashtags once, up front: trailing Latin noise must\n // not flip the dominant-script vote nor pollute the per-rung tallies.\n const cleaned = stripNoise(text);\n\n // Restrict to candidates in the text's dominant script.\n const scoped = scopeCandidates(cleaned, candidates);\n if (scoped.length === 0) return UNKNOWN;\n\n // ≥2 same-script candidates means the distinctive machinery actually had a\n // choice to make; a lone scoped candidate wins by script alone. Stamped onto\n // whichever rung decides — a single rung can't see the scope size.\n const discriminating = scoped.length >= 2;\n\n const byLetter = letterRung(cleaned, scoped);\n if (byLetter) return { ...byLetter, discriminating };\n\n const tokens = tokenize(cleaned);\n if (tokens.length === 0) return UNKNOWN;\n\n const byWord =\n wordRung(tokens, scoped, \"function\", \"2a\") ??\n wordRung(tokens, scoped, \"frequent\", \"2b\") ??\n rung3?.(cleaned, scoped);\n return byWord ? { ...byWord, discriminating } : UNKNOWN;\n}\n"]}
|