langtell 0.0.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +67 -26
- package/dist/chrome-ai.d.ts +29 -0
- package/dist/chrome-ai.js +71 -0
- package/dist/chrome-ai.js.map +1 -0
- package/dist/chunk-3LDE35U2.js +36 -0
- package/dist/chunk-3LDE35U2.js.map +1 -0
- package/dist/chunk-7G3MEXWK.js +109 -0
- package/dist/chunk-7G3MEXWK.js.map +1 -0
- package/dist/chunk-KI4MAI3N.js +27 -0
- package/dist/chunk-KI4MAI3N.js.map +1 -0
- package/dist/chunk-NCGZPEDA.js +131 -0
- package/dist/chunk-NCGZPEDA.js.map +1 -0
- package/dist/chunk-OVSPOZ5J.js +115 -0
- package/dist/chunk-OVSPOZ5J.js.map +1 -0
- package/dist/chunk-PT7R2BRQ.js +35 -0
- package/dist/chunk-PT7R2BRQ.js.map +1 -0
- package/dist/classify.d.ts +63 -0
- package/dist/classify.js +3 -0
- package/dist/classify.js.map +1 -0
- package/dist/franc.d.ts +25 -0
- package/dist/franc.js +59 -0
- package/dist/franc.js.map +1 -0
- package/dist/fuse.d.ts +30 -0
- package/dist/fuse.js +4 -0
- package/dist/fuse.js.map +1 -0
- package/dist/headers.d.ts +6 -0
- package/dist/headers.js +4 -0
- package/dist/headers.js.map +1 -0
- package/dist/html.d.ts +17 -0
- package/dist/html.js +4 -0
- package/dist/html.js.map +1 -0
- package/dist/index.d.ts +60 -0
- package/dist/index.js +67 -0
- package/dist/index.js.map +1 -0
- package/dist/profiles.d.ts +47 -0
- package/dist/profiles.js +1030 -0
- package/dist/profiles.js.map +1 -0
- package/dist/text.d.ts +22 -0
- package/dist/text.js +4 -0
- package/dist/text.js.map +1 -0
- package/dist/types-BIXrkuAr.d.ts +120 -0
- package/package.json +104 -3
package/README.md
CHANGED
|
@@ -4,20 +4,21 @@
|
|
|
4
4
|
|
|
5
5
|
`langtell` infers the language of short strings — titles, snippets, headlines —
|
|
6
6
|
by **fusing evidence from many signals** into a single weighted verdict with a
|
|
7
|
-
confidence score and an auditable trail. It reads the
|
|
7
|
+
confidence score and an auditable trail. It reads the _tells_: the script and
|
|
8
8
|
distinctive letters of the text, the `<html lang>` / `og:locale` / meta tags of
|
|
9
9
|
the page it came from, the HTTP `Content-Language` header, and — optionally —
|
|
10
10
|
heavier statistical engines like [franc](https://github.com/wooorm/franc) or the
|
|
11
11
|
on-device Chrome AI language detector.
|
|
12
12
|
|
|
13
13
|
It is **not** another trigram detector competing with franc/cld3/tinyld. Those
|
|
14
|
-
answer
|
|
15
|
-
`langtell` answers
|
|
16
|
-
and source it arrived in?"
|
|
14
|
+
answer _"what language is this body of text?"_ from the characters alone.
|
|
15
|
+
`langtell` answers _"what language is this **title**, given the page, transport,
|
|
16
|
+
and source it arrived in?"_ — and shows its work.
|
|
17
17
|
|
|
18
|
-
> **Status:**
|
|
19
|
-
>
|
|
20
|
-
>
|
|
18
|
+
> **Status:** early. The core detector (candidate-relative script/letter
|
|
19
|
+
> scoring, the BCP-47-aware fuser with the context-vs-script guard, and the
|
|
20
|
+
> opt-in franc and Chrome AI engines) is implemented and tested. The API below
|
|
21
|
+
> reflects the committed design.
|
|
21
22
|
|
|
22
23
|
## Why
|
|
23
24
|
|
|
@@ -26,7 +27,7 @@ and source it arrived in?"* — and shows its work.
|
|
|
26
27
|
out-of-band metadata that a pure text detector never sees.
|
|
27
28
|
- **Auditable, not magic.** Every verdict carries the list of signals that
|
|
28
29
|
produced it (`evidence[]`), each with its kind, language, confidence, and raw
|
|
29
|
-
value — so you can debug
|
|
30
|
+
value — so you can debug _why_ a title was classified the way it was.
|
|
30
31
|
- **Pay only for what you use.** The zero-dependency core (script + HTML + header
|
|
31
32
|
signals) is fully synchronous. Heavy engines (franc's trigram tables, the
|
|
32
33
|
browser detector) live behind their own subpaths and only enter your bundle —
|
|
@@ -36,41 +37,81 @@ and source it arrived in?"* — and shows its work.
|
|
|
36
37
|
|
|
37
38
|
```ts
|
|
38
39
|
import { compile } from "langtell";
|
|
40
|
+
import { uk, ru, en } from "langtell/profiles"; // ready-made roster data
|
|
39
41
|
|
|
40
42
|
// compile() does the per-roster setup once; call the returned fn many times.
|
|
41
|
-
const detect = compile({ candidates: [
|
|
43
|
+
const detect = compile({ candidates: [uk, ru, en] });
|
|
42
44
|
|
|
43
45
|
const result = detect({
|
|
44
46
|
text: "Їжак Сонік",
|
|
45
|
-
html,
|
|
47
|
+
html, // optional: <html lang>, og:locale, meta content-language
|
|
46
48
|
responseHeaders, // optional: HTTP Content-Language
|
|
47
49
|
});
|
|
48
50
|
// → { language: "uk", confidence: 0.9x, evidence: [{ kind: "title-script", ... }, ...] }
|
|
49
51
|
```
|
|
50
52
|
|
|
51
|
-
Add
|
|
52
|
-
|
|
53
|
+
Add the franc engine — it stays behind its own import door so its trigram tables
|
|
54
|
+
never reach a bundle that doesn't use it. franc runs in-process and
|
|
55
|
+
synchronously, so `detect` stays synchronous:
|
|
53
56
|
|
|
54
57
|
```ts
|
|
55
|
-
import { compile }
|
|
56
|
-
import {
|
|
58
|
+
import { compile } from "langtell";
|
|
59
|
+
import { uk, ru, en } from "langtell/profiles";
|
|
60
|
+
import { createFrancEngine } from "langtell/franc";
|
|
61
|
+
|
|
62
|
+
const candidates = [uk, ru, en];
|
|
63
|
+
const detect = compile({ candidates, engines: [createFrancEngine(candidates)] });
|
|
64
|
+
const result = detect({ text, html, responseHeaders });
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
Register the on-device Chrome AI engine and the return type becomes `Promise`
|
|
68
|
+
automatically, because that engine is async:
|
|
69
|
+
|
|
70
|
+
```ts
|
|
71
|
+
import { compile } from "langtell";
|
|
72
|
+
import { uk, ru, en } from "langtell/profiles";
|
|
73
|
+
import { chromeAiEngine } from "langtell/chrome-ai";
|
|
57
74
|
|
|
58
|
-
const detect = compile({ candidates: [
|
|
59
|
-
const result = await detect({ text
|
|
75
|
+
const detect = compile({ candidates: [uk, ru, en], engines: [chromeAiEngine] });
|
|
76
|
+
const result = await detect({ text }); // Promise<Classification>
|
|
60
77
|
```
|
|
61
78
|
|
|
79
|
+
Need more than "what language + how sure"? The default `Classification` collapses
|
|
80
|
+
the candidate-relative ladder into one `confidence` float. When you need the raw
|
|
81
|
+
structure — _which_ rung decided (distinctive letters → function words → frequent
|
|
82
|
+
words → optional trigram backstop) and the integer **margin** (the winner's lead
|
|
83
|
+
over the runner-up) — reach for the opt-in `langtell/classify` door. It stays
|
|
84
|
+
zero-dependency and franc-free; scoring is relative to the roster you pass in.
|
|
85
|
+
|
|
86
|
+
```ts
|
|
87
|
+
import { classifyBySnippet } from "langtell/classify";
|
|
88
|
+
import { uk, ru } from "langtell/profiles";
|
|
89
|
+
|
|
90
|
+
classifyBySnippet("Слава Україні", [uk, ru]);
|
|
91
|
+
// → { language: "uk", margin: 2, rung: 1, discriminating: true } (a distinctive letter)
|
|
92
|
+
classifyBySnippet("Кофе и чай", [uk, ru]);
|
|
93
|
+
// → { language: "ru", margin: 1, rung: "2a", … } (a function-word marker)
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
This powers per-rung safety gates ("act only when a _weak_ rung clears a high
|
|
97
|
+
margin") and diagnostics — uses a single confidence number can't serve. The
|
|
98
|
+
high-level `compile`/`detect`/`fuse` output is unchanged; this is purely additive.
|
|
99
|
+
|
|
62
100
|
## API at a glance
|
|
63
101
|
|
|
64
|
-
| Export
|
|
65
|
-
|
|
|
66
|
-
| `compile(config)`
|
|
67
|
-
| `detect(input)`
|
|
68
|
-
| `evidenceFromText(text)` | Producer: script + distinctive-letter signals. Zero-dep, sync.
|
|
69
|
-
| `evidenceFromHtml(html)`
|
|
70
|
-
| `evidenceFromHeaders(h)`
|
|
71
|
-
| `
|
|
72
|
-
| `
|
|
73
|
-
| `langtell/
|
|
102
|
+
| Export | Role |
|
|
103
|
+
| ------------------------------------- | ------------------------------------------------------------------------------- |
|
|
104
|
+
| `compile(config)` | Build a configured `detect` function (does the precompute once). |
|
|
105
|
+
| `detect(input)` | The compiled detector. Sync or `Promise`, by config — see below. |
|
|
106
|
+
| `evidenceFromText(text, candidates?)` | Producer: roster-relative script + distinctive-letter signals. Zero-dep, sync. |
|
|
107
|
+
| `evidenceFromHtml(html)` | Producer: `<html lang>`, meta content-language, `og:locale`. Zero-dep, sync. |
|
|
108
|
+
| `evidenceFromHeaders(h)` | Producer: HTTP `Content-Language`. Zero-dep, sync. |
|
|
109
|
+
| `normalizeBCP47(tag)` | Normalize a BCP-47 tag/alias to a canonical code (`uk-UA`/`ua` → `uk`). |
|
|
110
|
+
| `fuse(evidence, opts?)` | Weighted blend + "context never overrides clear script" guard. |
|
|
111
|
+
| `langtell/profiles` | Ready-made `LanguageProfile` data (uk/ru/be/bg/en). Opt-in (carries word data). |
|
|
112
|
+
| `langtell/classify` | Opt-in structured snippet verdict (`{ language, margin, rung }`). Zero-dep. |
|
|
113
|
+
| `langtell/franc` | Opt-in franc engine (pulls trigram tables). Sync. |
|
|
114
|
+
| `langtell/chrome-ai` | Opt-in on-device Chrome AI engine (browser). Async. |
|
|
74
115
|
|
|
75
116
|
`detect` returns a plain `Classification` when every registered source is
|
|
76
117
|
synchronous, and `Promise<Classification>` the moment an async engine is in the
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import { A as AsyncSource } from './types-BIXrkuAr.js';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* `langtell/chrome-ai` — the opt-in on-device engine wrapping the browser's
|
|
5
|
+
* `LanguageDetector` API (Gemini Nano on Chrome 138+ / Edge). Lives behind its
|
|
6
|
+
* own subpath; the zero-dependency core never imports it.
|
|
7
|
+
*
|
|
8
|
+
* Opportunistic: it never triggers a model download. Availability (per Chrome
|
|
9
|
+
* docs):
|
|
10
|
+
* - `available` — model loaded, ready. We're available.
|
|
11
|
+
* - `downloadable` — could be fetched on demand. Treated as unavailable so we
|
|
12
|
+
* never initiate a download the user hasn't consented to.
|
|
13
|
+
* - `downloading` — same reasoning; wait for the model to land.
|
|
14
|
+
* - `unavailable` — Chrome's flat-out no. Skip.
|
|
15
|
+
*
|
|
16
|
+
* Emits `kind: "chrome-ai"` evidence with the model's own confidence. An
|
|
17
|
+
* `AsyncSource`: registering it flips the compiled `detect` to `Promise`-typed.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
/**
|
|
21
|
+
* Build a chrome-ai {@link AsyncSource}. State (availability + session) is
|
|
22
|
+
* cached per instance: once availability is confirmed it is not re-probed for
|
|
23
|
+
* the instance's lifetime, and the detector session is created once and reused.
|
|
24
|
+
*/
|
|
25
|
+
declare function createChromeAiEngine(): AsyncSource;
|
|
26
|
+
/** A ready-to-register chrome-ai engine instance. */
|
|
27
|
+
declare const chromeAiEngine: AsyncSource;
|
|
28
|
+
|
|
29
|
+
export { chromeAiEngine, createChromeAiEngine };
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
// src/chrome-ai.ts
|
|
2
|
+
/** Character cap applied to the input text when the detect context supplies no `maxChars`. */
const DEFAULT_MAX_CHARS = 2000;
/** Minimum confidence the top detector result needs; anything lower yields no evidence. */
const CONFIDENCE_THRESHOLD = 0.6;
|
|
4
|
+
/**
 * Look up the `LanguageDetector` global.
 *
 * @returns {object | null} The value of `globalThis.LanguageDetector`, or
 *   `null` when that global is missing or nullish.
 */
function getApi() {
  const detectorApi = globalThis.LanguageDetector;
  return detectorApi ?? null;
}
|
|
8
|
+
/**
 * Build a chrome-ai engine backed by the `LanguageDetector` global.
 *
 * Availability and the detector session are cached per engine instance. Both
 * lookups are also de-duplicated while in flight, so concurrent callers share
 * a single `availability()` probe and a single `create()` call instead of
 * racing to start several. A failed probe or a failed `create()` is not
 * cached; the next call retries.
 *
 * @returns {{
 *   id: string,
 *   sync: boolean,
 *   inputs: string[],
 *   isAvailable: () => (boolean | Promise<boolean>),
 *   detect: (input: { text?: string }, ctx?: { maxChars?: number }) => Promise<object[]>
 * }} The engine object: `id` is `"chrome-ai"`, `sync` is `false`.
 */
function createChromeAiEngine() {
  let cachedAvailability = null;
  let pendingAvailability = null;
  let sessionPromise = null;

  /**
   * Resolve to `true` only when the API reports the state `"available"`.
   * The settled answer is cached for the lifetime of this engine instance.
   */
  async function checkAvailability() {
    if (cachedAvailability !== null) return cachedAvailability;
    const api = getApi();
    if (!api) {
      cachedAvailability = false;
      return false;
    }
    if (pendingAvailability === null) {
      // One shared probe: callers arriving while it is in flight reuse it.
      const probe = (async () => (await api.availability()) === "available")();
      pendingAvailability = probe;
      probe.then(
        (isReady) => {
          cachedAvailability = isReady;
          pendingAvailability = null;
        },
        () => {
          // Leave the cache empty so a later call can probe again.
          pendingAvailability = null;
        },
      );
    }
    return pendingAvailability;
  }

  /**
   * Return the detector session, creating it on first use. The *promise* is
   * cached (not the resolved session) so overlapping calls never trigger a
   * second `create()`.
   */
  function getSession() {
    if (sessionPromise !== null) return sessionPromise;
    const api = getApi();
    if (!api) return Promise.reject(new Error("chrome-ai: LanguageDetector API missing"));
    const creating = (async () => api.create())();
    sessionPromise = creating;
    // A failed create() must not poison the cache; drop it so the next call retries.
    creating.catch(() => {
      if (sessionPromise === creating) sessionPromise = null;
    });
    return creating;
  }

  return {
    id: "chrome-ai",
    sync: false,
    inputs: ["text"],
    /**
     * Synchronous `boolean` once the answer is cached or when the API global is
     * absent; otherwise a `Promise<boolean>` for the pending probe.
     */
    isAvailable() {
      if (cachedAvailability !== null) return cachedAvailability;
      if (!getApi()) {
        cachedAvailability = false;
        return false;
      }
      return checkAvailability();
    },
    /**
     * Detect the language of `input.text`.
     *
     * @param {{ text?: string }} input - Only `text` is read.
     * @param {{ maxChars?: number }} [ctx] - `maxChars` caps the sample length
     *   (falls back to `DEFAULT_MAX_CHARS`).
     * @returns {Promise<object[]>} At most one `kind: "chrome-ai"` evidence
     *   item; empty when the text is missing/blank, the model is not ready,
     *   the top result is below `CONFIDENCE_THRESHOLD`, or it is `und`.
     * @throws {Error} When the `LanguageDetector` global is absent.
     */
    async detect(input, ctx = {}) {
      const text = input.text;
      if (text === undefined || text.trim().length === 0) return [];
      // Keep the established failure mode when the API is absent altogether.
      if (!getApi()) throw new Error("chrome-ai: LanguageDetector API missing");
      // Calling create() while the model is only "downloadable"/"downloading"
      // would start or join a download; abstain unless the model is ready.
      if (!(await checkAvailability())) return [];
      const session = await getSession();
      const sample = text.slice(0, ctx.maxChars ?? DEFAULT_MAX_CHARS);
      const results = await session.detect(sample);
      const top = results[0];
      if (!top || top.confidence < CONFIDENCE_THRESHOLD) return [];
      // "und" is the BCP-47 tag for "undetermined": the detector is saying it
      // does not know the language, so it must not be reported as one.
      if (top.detectedLanguage === "und") return [];
      return [
        {
          kind: "chrome-ai",
          language: top.detectedLanguage,
          confidence: clamp01(top.confidence),
          source: "chrome-ai",
          value: top.detectedLanguage,
        },
      ];
    },
  };
}
|
|
61
|
+
/** Shared engine instance built once when this module is evaluated. */
const chromeAiEngine = createChromeAiEngine();
|
|
62
|
+
/**
 * Restrict a number to the closed interval [0, 1].
 *
 * @param {number} value - Candidate value.
 * @returns {number} `0` for anything `Number.isFinite` rejects and for
 *   negative values, `1` for values above one, otherwise `value` unchanged.
 */
function clamp01(value) {
  if (!Number.isFinite(value) || value < 0) return 0;
  return value > 1 ? 1 : value;
}
|
|
68
|
+
|
|
69
|
+
export { chromeAiEngine, createChromeAiEngine };
|
|
70
|
+
//# sourceMappingURL=chrome-ai.js.map
|
|
71
|
+
//# sourceMappingURL=chrome-ai.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/chrome-ai.ts"],"names":[],"mappings":";AAkBA,IAAM,iBAAA,GAAoB,GAAA;AAG1B,IAAM,oBAAA,GAAuB,GAAA;AAkB7B,SAAS,MAAA,GAAqC;AAC5C,EAAA,MAAM,SAAA,GAAY,UAAA;AAClB,EAAA,OAAO,UAAU,gBAAA,IAAoB,IAAA;AACvC;AAOO,SAAS,oBAAA,GAAoC;AAClD,EAAA,IAAI,kBAAA,GAAqC,IAAA;AACzC,EAAA,IAAI,aAAA,GAAgD,IAAA;AAEpD,EAAA,eAAe,iBAAA,GAAsC;AACnD,IAAA,IAAI,kBAAA,KAAuB,MAAM,OAAO,kBAAA;AACxC,IAAA,MAAM,MAAM,MAAA,EAAO;AACnB,IAAA,IAAI,CAAC,GAAA,EAAK;AACR,MAAA,kBAAA,GAAqB,KAAA;AACrB,MAAA,OAAO,KAAA;AAAA,IACT;AACA,IAAA,MAAM,KAAA,GAAQ,MAAM,GAAA,CAAI,YAAA,EAAa;AACrC,IAAA,kBAAA,GAAqB,KAAA,KAAU,WAAA;AAC/B,IAAA,OAAO,kBAAA;AAAA,EACT;AAEA,EAAA,eAAe,UAAA,GAA+C;AAC5D,IAAA,IAAI,eAAe,OAAO,aAAA;AAC1B,IAAA,MAAM,MAAM,MAAA,EAAO;AACnB,IAAA,IAAI,CAAC,GAAA,EAAK,MAAM,IAAI,MAAM,yCAAyC,CAAA;AACnE,IAAA,aAAA,GAAgB,MAAM,IAAI,MAAA,EAAO;AACjC,IAAA,OAAO,aAAA;AAAA,EACT;AAEA,EAAA,OAAO;AAAA,IACL,EAAA,EAAI,WAAA;AAAA,IACJ,IAAA,EAAM,KAAA;AAAA,IACN,MAAA,EAAQ,CAAC,MAAM,CAAA;AAAA,IACf,WAAA,GAA0C;AACxC,MAAA,IAAI,kBAAA,KAAuB,MAAM,OAAO,kBAAA;AAGxC,MAAA,IAAI,CAAC,QAAO,EAAG;AACb,QAAA,kBAAA,GAAqB,KAAA;AACrB,QAAA,OAAO,KAAA;AAAA,MACT;AACA,MAAA,OAAO,iBAAA,EAAkB;AAAA,IAC3B,CAAA;AAAA,IACA,MAAM,MAAA,CAAO,KAAA,EAAO,GAAA,GAAqB,EAAC,EAAgC;AACxE,MAAA,MAAM,OAAO,KAAA,CAAM,IAAA;AACnB,MAAA,IAAI,IAAA,KAAS,UAAa,IAAA,CAAK,IAAA,GAAO,MAAA,KAAW,CAAA,SAAU,EAAC;AAC5D,MAAA,MAAM,OAAA,GAAU,MAAM,UAAA,EAAW;AACjC,MAAA,MAAM,SAAS,IAAA,CAAK,KAAA,CAAM,CAAA,EAAG,GAAA,CAAI,YAAY,iBAAiB,CAAA;AAC9D,MAAA,MAAM,OAAA,GAAU,MAAM,OAAA,CAAQ,MAAA,CAAO,MAAM,CAAA;AAC3C,MAAA,MAAM,GAAA,GAAM,QAAQ,CAAC,CAAA;AACrB,MAAA,IAAI,CAAC,GAAA,IAAO,GAAA,CAAI,UAAA,GAAa,oBAAA,SAA6B,EAAC;AAC3D,MAAA,OAAO;AAAA,QACL;AAAA,UACE,IAAA,EAAM,WAAA;AAAA,UACN,UAAU,GAAA,CAAI,gBAAA;AAAA,UACd,UAAA,EAAY,OAAA,CAAQ,GAAA,CAAI,UAAU,CAAA;AAAA,UAClC,MAAA,EAAQ,WAAA;AAAA,UACR,OAAO,GAAA,CAAI;AAAA;AACb,OACF;AAAA,IACF;AAAA,GACF;AACF;AAGO,IAAM,iBAA8B,oBAAA;AAE3C,SAAS,QAAQ,KAAA,EAAuB;AACtC,EAAA,IAAI,CAAC,MAAA,CAAO,QAAA,CAAS,KAAK,GAAG,OAAO,CAAA;AACpC,EAAA,IAAI,KAAA,GAAQ,GAAG,OAAO,CAAA;AACtB,EAAA,IAAI,KAAA,GAAQ,G
AAG,OAAO,CAAA;AACtB,EAAA,OAAO,KAAA;AACT","file":"chrome-ai.js","sourcesContent":["/**\n * `langtell/chrome-ai` — the opt-in on-device engine wrapping the browser's\n * `LanguageDetector` API (Gemini Nano on Chrome 138+ / Edge). Lives behind its\n * own subpath; the zero-dependency core never imports it.\n *\n * Opportunistic: it never triggers a model download. Availability (per Chrome\n * docs):\n * - `available` — model loaded, ready. We're available.\n * - `downloadable` — could be fetched on demand. Treated as unavailable so we\n * never initiate a download the user hasn't consented to.\n * - `downloading` — same reasoning; wait for the model to land.\n * - `unavailable` — Chrome's flat-out no. Skip.\n *\n * Emits `kind: \"chrome-ai\"` evidence with the model's own confidence. An\n * `AsyncSource`: registering it flips the compiled `detect` to `Promise`-typed.\n */\nimport type { AsyncSource, DetectContext, LanguageEvidence } from \"./types.js\";\n\nconst DEFAULT_MAX_CHARS = 2000;\n/** Minimum top-language confidence to count as a confident detection; below\n * this we abstain rather than risk a thin-plurality misclassification. */\nconst CONFIDENCE_THRESHOLD = 0.6;\n\ntype AvailabilityState = \"available\" | \"downloadable\" | \"downloading\" | \"unavailable\";\n\ninterface LanguageDetectorResult {\n detectedLanguage: string;\n confidence: number;\n}\n\ninterface LanguageDetectorSession {\n detect(text: string): Promise<LanguageDetectorResult[]>;\n}\n\ninterface LanguageDetectorApi {\n availability(): Promise<AvailabilityState>;\n create(): Promise<LanguageDetectorSession>;\n}\n\nfunction getApi(): LanguageDetectorApi | null {\n const globalRef = globalThis as unknown as { LanguageDetector?: LanguageDetectorApi };\n return globalRef.LanguageDetector ?? null;\n}\n\n/**\n * Build a chrome-ai {@link AsyncSource}. 
State (availability + session) is\n * cached per instance: once availability is confirmed it is not re-probed for\n * the instance's lifetime, and the detector session is created once and reused.\n */\nexport function createChromeAiEngine(): AsyncSource {\n let cachedAvailability: boolean | null = null;\n let cachedSession: LanguageDetectorSession | null = null;\n\n async function checkAvailability(): Promise<boolean> {\n if (cachedAvailability !== null) return cachedAvailability;\n const api = getApi();\n if (!api) {\n cachedAvailability = false;\n return false;\n }\n const state = await api.availability();\n cachedAvailability = state === \"available\";\n return cachedAvailability;\n }\n\n async function getSession(): Promise<LanguageDetectorSession> {\n if (cachedSession) return cachedSession;\n const api = getApi();\n if (!api) throw new Error(\"chrome-ai: LanguageDetector API missing\");\n cachedSession = await api.create();\n return cachedSession;\n }\n\n return {\n id: \"chrome-ai\",\n sync: false,\n inputs: [\"text\"],\n isAvailable(): boolean | Promise<boolean> {\n if (cachedAvailability !== null) return cachedAvailability;\n // No API at all — return false synchronously so the common (non-Chrome)\n // case skips without a promise round-trip.\n if (!getApi()) {\n cachedAvailability = false;\n return false;\n }\n return checkAvailability();\n },\n async detect(input, ctx: DetectContext = {}): Promise<LanguageEvidence[]> {\n const text = input.text;\n if (text === undefined || text.trim().length === 0) return [];\n const session = await getSession();\n const sample = text.slice(0, ctx.maxChars ?? 
DEFAULT_MAX_CHARS);\n const results = await session.detect(sample);\n const top = results[0];\n if (!top || top.confidence < CONFIDENCE_THRESHOLD) return [];\n return [\n {\n kind: \"chrome-ai\",\n language: top.detectedLanguage,\n confidence: clamp01(top.confidence),\n source: \"chrome-ai\",\n value: top.detectedLanguage,\n },\n ];\n },\n };\n}\n\n/** A ready-to-register chrome-ai engine instance. */\nexport const chromeAiEngine: AsyncSource = createChromeAiEngine();\n\nfunction clamp01(value: number): number {\n if (!Number.isFinite(value)) return 0;\n if (value < 0) return 0;\n if (value > 1) return 1;\n return value;\n}\n"]}
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import { primarySubtag } from './chunk-OVSPOZ5J.js';
|
|
2
|
+
|
|
3
|
+
// src/headers.ts
|
|
4
|
+
/**
 * Producer: turn the HTTP `Content-Language` response header into evidence.
 *
 * @param {object | null | undefined} headers - A `Headers` instance or a plain
 *   object of header names to values. `null` is treated like `undefined`, so a
 *   caller with no headers at hand gets an empty result rather than a
 *   `TypeError` from enumerating `null`.
 * @returns {object[]} An empty array when there are no headers or
 *   `primarySubtag` yields `null` for the header value; otherwise a single
 *   `kind: "http-content-language"` item with confidence `0.8` whose `value`
 *   is the raw header string.
 */
function evidenceFromHeaders(headers) {
  // `== null` deliberately matches both `undefined` and `null`.
  if (headers == null) return [];
  const value = getHeader(headers, "content-language");
  const lang = primarySubtag(value);
  if (lang === null) return [];
  return [
    {
      kind: "http-content-language",
      language: lang,
      confidence: 0.8,
      source: "http-content-language",
      value: value ?? "",
    },
  ];
}
|
|
19
|
+
/**
 * Read one header value from either a `Headers` instance or a plain object.
 *
 * For plain objects the key comparison lower-cases each key, so `name` must
 * already be lower-case. Only the first matching key is considered.
 *
 * @param {object} headers - `Headers` instance or plain header object.
 * @param {string} name - Lower-case header name.
 * @returns {string | undefined} The header value (array values joined with
 *   `","`), or `undefined` when absent or nullish.
 */
function getHeader(headers, name) {
  if (isHeaders(headers)) {
    const fromHeaders = headers.get(name);
    return fromHeaders ?? undefined;
  }
  const match = Object.entries(headers).find(([key]) => key.toLowerCase() === name);
  if (match === undefined) return undefined;
  const raw = match[1];
  return Array.isArray(raw) ? raw.join(",") : (raw ?? undefined);
}
|
|
30
|
+
/**
 * Tell whether `headers` is an instance of the global `Headers` class.
 *
 * @param {unknown} headers - Value to test.
 * @returns {boolean} `false` whenever no global `Headers` exists in this
 *   runtime; otherwise the result of the `instanceof` check.
 */
function isHeaders(headers) {
  if (typeof Headers === "undefined") return false;
  return headers instanceof Headers;
}
|
|
33
|
+
|
|
34
|
+
export { evidenceFromHeaders };
|
|
35
|
+
//# sourceMappingURL=chunk-3LDE35U2.js.map
|
|
36
|
+
//# sourceMappingURL=chunk-3LDE35U2.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/headers.ts"],"names":[],"mappings":";;;AAIO,SAAS,oBAAoB,OAAA,EAAoD;AACtF,EAAA,IAAI,OAAA,KAAY,MAAA,EAAW,OAAO,EAAC;AAEnC,EAAA,MAAM,KAAA,GAAQ,SAAA,CAAU,OAAA,EAAS,kBAAkB,CAAA;AACnD,EAAA,MAAM,IAAA,GAAO,cAAc,KAAK,CAAA;AAChC,EAAA,IAAI,IAAA,KAAS,IAAA,EAAM,OAAO,EAAC;AAE3B,EAAA,OAAO;AAAA,IACL;AAAA,MACE,IAAA,EAAM,uBAAA;AAAA,MACN,QAAA,EAAU,IAAA;AAAA,MACV,UAAA,EAAY,GAAA;AAAA,MACZ,MAAA,EAAQ,uBAAA;AAAA,MACR,OAAO,KAAA,IAAS;AAAA;AAClB,GACF;AACF;AAEA,SAAS,SAAA,CAAU,SAAoB,IAAA,EAAkC;AACvE,EAAA,IAAI,SAAA,CAAU,OAAO,CAAA,EAAG;AACtB,IAAA,OAAO,OAAA,CAAQ,GAAA,CAAI,IAAI,CAAA,IAAK,MAAA;AAAA,EAC9B;AACA,EAAA,KAAA,MAAW,CAAC,GAAA,EAAK,KAAK,KAAK,MAAA,CAAO,OAAA,CAAQ,OAAO,CAAA,EAAG;AAClD,IAAA,IAAI,GAAA,CAAI,WAAA,EAAY,KAAM,IAAA,EAAM;AAChC,IAAA,IAAI,MAAM,OAAA,CAAQ,KAAK,GAAG,OAAO,KAAA,CAAM,KAAK,GAAG,CAAA;AAC/C,IAAA,OAAO,KAAA,IAAS,MAAA;AAAA,EAClB;AACA,EAAA,OAAO,MAAA;AACT;AAEA,SAAS,UAAU,OAAA,EAAwC;AACzD,EAAA,OAAO,OAAO,OAAA,KAAY,WAAA,IAAe,OAAA,YAAmB,OAAA;AAC9D","file":"chunk-3LDE35U2.js","sourcesContent":["import type { HeaderBag, LanguageEvidence } from \"./types.js\";\nimport { primarySubtag } from \"./internal/bcp47.js\";\n\n/** Producer: the HTTP `Content-Language` response header. */\nexport function evidenceFromHeaders(headers: HeaderBag | undefined): LanguageEvidence[] {\n if (headers === undefined) return [];\n\n const value = getHeader(headers, \"content-language\");\n const lang = primarySubtag(value);\n if (lang === null) return [];\n\n return [\n {\n kind: \"http-content-language\",\n language: lang,\n confidence: 0.8,\n source: \"http-content-language\",\n value: value ?? \"\",\n },\n ];\n}\n\nfunction getHeader(headers: HeaderBag, name: string): string | undefined {\n if (isHeaders(headers)) {\n return headers.get(name) ?? undefined;\n }\n for (const [key, value] of Object.entries(headers)) {\n if (key.toLowerCase() !== name) continue;\n if (Array.isArray(value)) return value.join(\",\");\n return value ?? 
undefined;\n }\n return undefined;\n}\n\nfunction isHeaders(headers: HeaderBag): headers is Headers {\n return typeof Headers !== \"undefined\" && headers instanceof Headers;\n}\n"]}
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
import { normalizeBCP47 } from './chunk-OVSPOZ5J.js';
|
|
2
|
+
|
|
3
|
+
// src/fuse.ts
|
|
4
|
+
/** Weight applied per evidence `kind` when the caller supplies no override for the item's `source` or `kind`. */
const DEFAULT_KIND_WEIGHT = {
  "title-script": 1,
  "explicit-locale": 1,
  "chrome-ai": 1,
  "source-prior": 0.7,
  "franc": 0.7,
  "http-content-language": 0.6,
  "meta-content-language": 0.55,
  "meta-og-locale": 0.55,
  "html-lang": 0.5,
};
/** Evidence kinds treated as script reads by the neutralization and pinning checks. */
const SCRIPT_KINDS = /* @__PURE__ */ new Set(["title-script", "franc", "chrome-ai"]);
/** A script read needs at least this (clamped) confidence to pin the verdict. */
const SCRIPT_CONFIDENCE_FLOOR = 0.6;
/** The leading score must reach this value for `fuse` to accept the leader outright. */
const MIN_WINNING_SCORE = 0.35;
/** The leader must beat the runner-up by at least this much for `fuse` to accept it outright. */
const MIN_MARGIN = 0.12;
|
|
19
|
+
/**
 * Combine evidence items into one verdict plus the full evidence trail.
 *
 * Steps, in order:
 * 1. Normalize every item's language tag (`normalizeEvidence`).
 * 2. When `options.nonDiscriminatingScript === "unknown"`, leave neutralized
 *    script reads out of the scoring (they still appear in the returned trail).
 * 3. Sum `clamp01(confidence) * weight` per language. The weight is the first
 *    non-nullish of: caller weight for the item's `source`, caller weight for
 *    its `kind`, the default for its `kind`, `0.5`.
 * 4. Pick the leader with `argmax`, honoring a confidently read script language.
 * 5. If the leader is missing, too weak, or too close to the runner-up, fall
 *    back to the pinned script language when it has a score, else `"unknown"`.
 *
 * @param {readonly object[]} evidence - Evidence items.
 * @param {{ weights?: object, candidates?: readonly object[], nonDiscriminatingScript?: string }} [options]
 * @returns {{ language: string, confidence: number, evidence: object[] }}
 */
function fuse(evidence, options = {}) {
  const callerWeights = options.weights ?? {};
  const trail = normalizeEvidence(evidence, options.candidates);

  // Items that actually take part in scoring and pinning.
  let counted = trail;
  if (options.nonDiscriminatingScript === "unknown") {
    counted = trail.filter((entry) => !isNeutralized(entry, trail));
  }

  const tally = /* @__PURE__ */ new Map();
  for (const entry of counted) {
    if (entry.language === "unknown") continue;
    const weight =
      callerWeights[entry.source] ??
      callerWeights[entry.kind] ??
      DEFAULT_KIND_WEIGHT[entry.kind] ??
      0.5;
    const soFar = tally.get(entry.language) ?? 0;
    tally.set(entry.language, soFar + clamp01(entry.confidence) * weight);
  }

  const pinned = confidentScriptLanguage(counted);
  const { best, bestScore, secondScore } = argmax(tally, pinned);

  // Every verdict carries its own copy of the complete normalized trail.
  const verdict = (language, confidence) => ({ language, confidence, evidence: [...trail] });

  const tooWeak =
    best === null || bestScore < MIN_WINNING_SCORE || bestScore - secondScore < MIN_MARGIN;
  if (!tooWeak) {
    return verdict(best, clamp01(bestScore / (bestScore + secondScore + 0.15)));
  }

  if (pinned !== null && tally.has(pinned)) {
    const score = tally.get(pinned) ?? 0;
    return verdict(pinned, clamp01(score / (score + 0.15)));
  }
  return verdict("unknown", clamp01(bestScore));
}
|
|
48
|
+
/**
 * Return a new array in which every item's language tag has been passed
 * through `normalizeBCP47`.
 *
 * Items tagged `"unknown"`, and items whose tag is already canonical (or that
 * `normalizeBCP47` answers nullish for), are returned as the same object;
 * only items whose tag changes are copied.
 *
 * @param {readonly object[]} evidence - Evidence items.
 * @param {readonly object[]} [_candidates] - Accepted but not read here.
 * @returns {object[]}
 */
function normalizeEvidence(evidence, _candidates) {
  return evidence.map((entry) => {
    const original = entry.language;
    if (original === "unknown") return entry;
    const canonical = normalizeBCP47(original) ?? original;
    return canonical === original ? entry : { ...entry, language: canonical };
  });
}
|
|
56
|
+
/**
 * Decide whether a script read should be left out of scoring.
 *
 * An item is neutralized when it is flagged `discriminating: false`, its kind
 * is one of `SCRIPT_KINDS`, and no item of a non-script kind in `all` names
 * the same (non-`"unknown"`) language.
 *
 * @param {object} item - The evidence item under test.
 * @param {readonly object[]} all - Every evidence item, including `item`.
 * @returns {boolean}
 */
function isNeutralized(item, all) {
  const weakScriptRead = item.discriminating === false && SCRIPT_KINDS.has(item.kind);
  if (!weakScriptRead) return false;
  for (const other of all) {
    if (other.language === "unknown" || SCRIPT_KINDS.has(other.kind)) continue;
    // Non-script evidence backing the same language keeps the read alive.
    if (other.language === item.language) return false;
  }
  return true;
}
|
|
62
|
+
/**
 * Find the language that script evidence read most confidently.
 *
 * Only items whose kind is in `SCRIPT_KINDS`, whose language is not
 * `"unknown"`, and whose clamped confidence reaches
 * `SCRIPT_CONFIDENCE_FLOOR` are considered. The highest confidence wins; if a
 * different language matches the current top confidence exactly, the result
 * becomes `null` (until a strictly higher confidence appears).
 *
 * @param {readonly object[]} evidence - Evidence items, in order.
 * @returns {string | null} The pinned language, or `null` when none qualifies
 *   or the top confidence is shared by different languages.
 */
function confidentScriptLanguage(evidence) {
  let leader = null;
  let leaderConfidence = 0;
  for (const { language, kind, confidence } of evidence) {
    if (language === "unknown") continue;
    if (!SCRIPT_KINDS.has(kind)) continue;
    const level = clamp01(confidence);
    if (level < SCRIPT_CONFIDENCE_FLOOR) continue;
    if (level > leaderConfidence) {
      leader = language;
      leaderConfidence = level;
      continue;
    }
    // An exact tie between different languages leaves nothing to pin.
    if (level === leaderConfidence && language !== leader) leader = null;
  }
  return leader;
}
|
|
78
|
+
/**
 * Find the best- and second-best-scoring languages.
 *
 * When `pinned` is set, every other language's score is capped at the pinned
 * language's score, and an exact tie for first place is resolved in favor of
 * `pinned` (provided its score is positive).
 *
 * @param {Map<string, number>} scores - Score per language.
 * @param {string | null} pinned - Language that must not be outscored, or `null`.
 * @returns {{ best: string | null, bestScore: number, secondScore: number }}
 *   `best` is `null` when no score is above zero.
 */
function argmax(scores, pinned) {
  const hasPin = pinned !== null;
  const pinnedScore = hasPin ? (scores.get(pinned) ?? 0) : 0;

  let best = null;
  let bestScore = 0;
  let secondScore = 0;

  scores.forEach((raw, language) => {
    const capped = hasPin && language !== pinned;
    const score = capped ? Math.min(raw, pinnedScore) : raw;
    if (score > bestScore) {
      secondScore = bestScore;
      bestScore = score;
      best = language;
      return;
    }
    if (score > secondScore) secondScore = score;
  });

  // A capped rival seen first can hold the lead on a tie; hand it to the pin.
  if (hasPin && best !== pinned && bestScore === pinnedScore && pinnedScore > 0) {
    secondScore = bestScore;
    best = pinned;
    bestScore = pinnedScore;
  }
  return { best, bestScore, secondScore };
}
|
|
100
|
+
/**
 * Restrict a number to the closed interval [0, 1].
 *
 * @param {number} value - Candidate value.
 * @returns {number} `0` for anything `Number.isFinite` rejects and for
 *   negative values, `1` for values above one, otherwise `value` unchanged.
 */
function clamp01(value) {
  if (!Number.isFinite(value) || value < 0) return 0;
  return value > 1 ? 1 : value;
}
|
|
106
|
+
|
|
107
|
+
export { fuse };
|
|
108
|
+
//# sourceMappingURL=chunk-7G3MEXWK.js.map
|
|
109
|
+
//# sourceMappingURL=chunk-7G3MEXWK.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/fuse.ts"],"names":[],"mappings":";;;AA0BA,IAAM,mBAAA,GAA8C;AAAA,EAClD,cAAA,EAAgB,CAAA;AAAA,EAChB,iBAAA,EAAmB,CAAA;AAAA,EACnB,WAAA,EAAa,CAAA;AAAA,EACb,cAAA,EAAgB,GAAA;AAAA,EAChB,KAAA,EAAO,GAAA;AAAA,EACP,uBAAA,EAAyB,GAAA;AAAA,EACzB,uBAAA,EAAyB,IAAA;AAAA,EACzB,gBAAA,EAAkB,IAAA;AAAA,EAClB,WAAA,EAAa;AACf,CAAA;AAKA,IAAM,+BAAe,IAAI,GAAA,CAAY,CAAC,cAAA,EAAgB,OAAA,EAAS,WAAW,CAAC,CAAA;AAI3E,IAAM,uBAAA,GAA0B,GAAA;AAEhC,IAAM,iBAAA,GAAoB,IAAA;AAC1B,IAAM,UAAA,GAAa,IAAA;AAcZ,SAAS,IAAA,CACd,QAAA,EACA,OAAA,GAAuB,EAAC,EACR;AAChB,EAAA,MAAM,OAAA,GAAU,OAAA,CAAQ,OAAA,IAAW,EAAC;AACpC,EAAA,MAAM,UAAA,GAAa,iBAAA,CAAkB,QAAA,EAAU,OAAA,CAAQ,UAAU,CAAA;AAKjE,EAAA,MAAM,OAAA,GACJ,OAAA,CAAQ,uBAAA,KAA4B,SAAA,GAChC,UAAA,CAAW,MAAA,CAAO,CAAC,IAAA,KAAS,CAAC,aAAA,CAAc,IAAA,EAAM,UAAU,CAAC,CAAA,GAC5D,UAAA;AAEN,EAAA,MAAM,MAAA,uBAAa,GAAA,EAAoB;AACvC,EAAA,KAAA,MAAW,QAAQ,OAAA,EAAS;AAC1B,IAAA,IAAI,IAAA,CAAK,aAAa,SAAA,EAAW;AACjC,IAAA,MAAM,MAAA,GACJ,OAAA,CAAQ,IAAA,CAAK,MAAM,CAAA,IAAK,OAAA,CAAQ,IAAA,CAAK,IAAI,CAAA,IAAK,mBAAA,CAAoB,IAAA,CAAK,IAAI,CAAA,IAAK,GAAA;AAClF,IAAA,MAAA,CAAO,GAAA,CAAI,IAAA,CAAK,QAAA,EAAA,CAAW,MAAA,CAAO,GAAA,CAAI,IAAA,CAAK,QAAQ,CAAA,IAAK,CAAA,IAAK,OAAA,CAAQ,IAAA,CAAK,UAAU,IAAI,MAAM,CAAA;AAAA,EAChG;AAGA,EAAA,MAAM,MAAA,GAAS,wBAAwB,OAAO,CAAA;AAE9C,EAAA,MAAM,EAAE,IAAA,EAAM,SAAA,EAAW,aAAY,GAAI,MAAA,CAAO,QAAQ,MAAM,CAAA;AAE9D,EAAA,IAAI,SAAS,IAAA,IAAQ,SAAA,GAAY,iBAAA,IAAqB,SAAA,GAAY,cAAc,UAAA,EAAY;AAG1F,IAAA,IAAI,MAAA,KAAW,IAAA,IAAQ,MAAA,CAAO,GAAA,CAAI,MAAM,CAAA,EAAG;AACzC,MAAA,MAAM,KAAA,GAAQ,MAAA,CAAO,GAAA,CAAI,MAAM,CAAA,IAAK,CAAA;AACpC,MAAA,OAAO;AAAA,QACL,QAAA,EAAU,MAAA;AAAA,QACV,UAAA,EAAY,OAAA,CAAQ,KAAA,IAAS,KAAA,GAAQ,IAAA,CAAK,CAAA;AAAA,QAC1C,QAAA,EAAU,CAAC,GAAG,UAAU;AAAA,OAC1B;AAAA,IACF;AACA,IAAA,OAAO,EAAE,QAAA,EAAU,SAAA,EAAW,UAAA,EAAY,OAAA,CAAQ,SAAS,CAAA,EAAG,QAAA,EAAU,CAAC,GAAG,UAAU,CAAA,EAAE;AAAA,EAC1F;AAEA,EAAA,OAAO;AAAA,IACL,QAAA,EAAU,IAAA;AAAA,IACV,UAAA,EAAY,OAAA,CAAQ,SAAA,IAAa,SAAA,GAAY,cAAc,IAAA,CAAK,CAAA;AAAA,IAChE,QAAA,EAAU,CAAC,GAAG,UAAU;AAAA,GAC1B;AACF;
AAYA,SAAS,iBAAA,CACP,UACA,WAAA,EACoB;AACpB,EAAA,OAAO,QAAA,CAAS,GAAA,CAAI,CAAC,IAAA,KAAS;AAC5B,IAAA,IAAI,IAAA,CAAK,QAAA,KAAa,SAAA,EAAW,OAAO,IAAA;AACxC,IAAA,MAAM,UAAA,GAAa,cAAA,CAAe,IAAA,CAAK,QAAQ,KAAK,IAAA,CAAK,QAAA;AACzD,IAAA,IAAI,UAAA,KAAe,IAAA,CAAK,QAAA,EAAU,OAAO,IAAA;AACzC,IAAA,OAAO,EAAE,GAAG,IAAA,EAAM,QAAA,EAAU,UAAA,EAAW;AAAA,EACzC,CAAC,CAAA;AACH;AAUA,SAAS,aAAA,CAAc,MAAwB,GAAA,EAA2C;AACxF,EAAA,IAAI,IAAA,CAAK,mBAAmB,KAAA,IAAS,CAAC,aAAa,GAAA,CAAI,IAAA,CAAK,IAAI,CAAA,EAAG,OAAO,KAAA;AAC1E,EAAA,OAAO,CAAC,GAAA,CAAI,IAAA;AAAA,IACV,CAAC,KAAA,KACC,KAAA,CAAM,QAAA,KAAa,IAAA,CAAK,QAAA,IACxB,KAAA,CAAM,QAAA,KAAa,SAAA,IACnB,CAAC,YAAA,CAAa,GAAA,CAAI,MAAM,IAAI;AAAA,GAChC;AACF;AAKA,SAAS,wBAAwB,QAAA,EAAsD;AACrF,EAAA,IAAI,IAAA,GAAsB,IAAA;AAC1B,EAAA,IAAI,cAAA,GAAiB,CAAA;AACrB,EAAA,KAAA,MAAW,QAAQ,QAAA,EAAU;AAC3B,IAAA,IAAI,IAAA,CAAK,aAAa,SAAA,IAAa,CAAC,aAAa,GAAA,CAAI,IAAA,CAAK,IAAI,CAAA,EAAG;AACjE,IAAA,MAAM,CAAA,GAAI,OAAA,CAAQ,IAAA,CAAK,UAAU,CAAA;AACjC,IAAA,IAAI,IAAI,uBAAA,EAAyB;AACjC,IAAA,IAAI,IAAI,cAAA,EAAgB;AACtB,MAAA,cAAA,GAAiB,CAAA;AACjB,MAAA,IAAA,GAAO,IAAA,CAAK,QAAA;AAAA,IACd,CAAA,MAAA,IAAW,CAAA,KAAM,cAAA,IAAkB,IAAA,CAAK,aAAa,IAAA,EAAM;AAEzD,MAAA,IAAA,GAAO,IAAA;AAAA,IACT;AAAA,EACF;AACA,EAAA,OAAO,IAAA;AACT;AAQA,SAAS,MAAA,CACP,QACA,MAAA,EACiE;AACjE,EAAA,IAAI,IAAA,GAAsB,IAAA;AAC1B,EAAA,IAAI,SAAA,GAAY,CAAA;AAChB,EAAA,IAAI,WAAA,GAAc,CAAA;AAClB,EAAA,MAAM,cAAc,MAAA,KAAW,IAAA,GAAQ,OAAO,GAAA,CAAI,MAAM,KAAK,CAAA,GAAK,CAAA;AAElE,EAAA,KAAA,MAAW,CAAC,QAAA,EAAU,GAAG,CAAA,IAAK,MAAA,EAAQ;AAEpC,IAAA,MAAM,KAAA,GAAQ,WAAW,IAAA,IAAQ,QAAA,KAAa,SAAS,IAAA,CAAK,GAAA,CAAI,GAAA,EAAK,WAAW,CAAA,GAAI,GAAA;AACpF,IAAA,IAAI,QAAQ,SAAA,EAAW;AACrB,MAAA,WAAA,GAAc,SAAA;AACd,MAAA,SAAA,GAAY,KAAA;AACZ,MAAA,IAAA,GAAO,QAAA;AAAA,IACT,CAAA,MAAA,IAAW,QAAQ,WAAA,EAAa;AAC9B,MAAA,WAAA,GAAc,KAAA;AAAA,IAChB;AAAA,EACF;AAEA,EAAA,IAAI,WAAW,IAAA,IAAQ,IAAA,KAAS,UAAU,SAAA,KAAc,WAAA,IAAe,cAAc,CAAA,EAAG;AACtF,IAAA,WAAA,GAAc,SAAA;AACd,IAAA,IAAA,GAAO,MAAA;AACP,IAAA,SAAA,GAAY,WAAA;AAAA,EACd;AACA,EAAA,OAAO,EAAE,IAAA,EAAM,SAAA,EAAW,WAAA,EAAY;
AACxC;AAEA,SAAS,QAAQ,KAAA,EAAuB;AACtC,EAAA,IAAI,CAAC,MAAA,CAAO,QAAA,CAAS,KAAK,GAAG,OAAO,CAAA;AACpC,EAAA,IAAI,KAAA,GAAQ,GAAG,OAAO,CAAA;AACtB,EAAA,IAAI,KAAA,GAAQ,GAAG,OAAO,CAAA;AACtB,EAAA,OAAO,KAAA;AACT","file":"chunk-7G3MEXWK.js","sourcesContent":["import type {\n Classification,\n LanguageEvidence,\n LanguageProfile,\n NonDiscriminatingScript,\n Weights,\n} from \"./types.js\";\nimport { normalizeBCP47 } from \"./internal/bcp47.js\";\n\nexport interface FuseOptions {\n weights?: Weights;\n /** The candidate roster. When present, incoming evidence tags are normalized\n * into it (`uk-UA` → `uk`, `ua` → `uk`) so context signals (page/header\n * locale) land on the same code the text rungs use. */\n candidates?: readonly LanguageProfile[];\n /** How to resolve a *non-discriminating* script read (one flagged\n * `discriminating: false` — its winning script owned by ≤1 roster candidate).\n * Default `\"candidate\"` keeps current behavior; `\"unknown\"` drops such a read\n * unless non-script evidence corroborates the same language. See\n * {@link NonDiscriminatingScript}. */\n nonDiscriminatingScript?: NonDiscriminatingScript;\n}\n\n/** Default per-kind weights. Clear lexical signal (script, explicit locale)\n * outweighs contextual signal (page tags, headers). Callers override per\n * `source` id or `kind` via {@link FuseOptions.weights}. */\nconst DEFAULT_KIND_WEIGHT: Record<string, number> = {\n \"title-script\": 1,\n \"explicit-locale\": 1,\n \"chrome-ai\": 1,\n \"source-prior\": 0.7,\n franc: 0.7,\n \"http-content-language\": 0.6,\n \"meta-content-language\": 0.55,\n \"meta-og-locale\": 0.55,\n \"html-lang\": 0.5,\n};\n\n/** Evidence kinds that constitute *clear script evidence* — a verdict the text\n * classifier or an on-device model reached by actually reading the string. The\n * guard below forbids weaker page/header *context* from flipping these. 
*/\nconst SCRIPT_KINDS = new Set<string>([\"title-script\", \"franc\", \"chrome-ai\"]);\n\n/** A script verdict this confident is treated as settled — context may add to it\n * but must not flip the winner to a different language. */\nconst SCRIPT_CONFIDENCE_FLOOR = 0.6;\n\nconst MIN_WINNING_SCORE = 0.35;\nconst MIN_MARGIN = 0.12;\n\n/**\n * Combine evidence into a single weighted verdict with an audit trail.\n *\n * Three steps:\n * 1. Normalize each item's language tag into the candidate roster (BCP-47:\n * `uk-UA`/`ua` → `uk`) so text, page, and header signals agree on a code.\n * 2. Weighted argmax over languages (caller weights override per `source`/`kind`).\n * 3. Apply the guard **context must never override clear script evidence**: when\n * the text classifier (or an on-device model) confidently read one language,\n * weaker page/header context for a *different* language cannot win — a\n * Ukrainian page chrome does not make a Latin/English title Ukrainian.\n */\nexport function fuse(\n evidence: readonly LanguageEvidence[],\n options: FuseOptions = {},\n): Classification {\n const weights = options.weights ?? {};\n const normalized = normalizeEvidence(evidence, options.candidates);\n\n // Under `\"unknown\"`, a non-discriminating script read scores nothing on its own\n // — it's dropped from the tally and the pin below — but stays in the trail. The\n // full `normalized` set is still returned as evidence.\n const scoring =\n options.nonDiscriminatingScript === \"unknown\"\n ? normalized.filter((item) => !isNeutralized(item, normalized))\n : normalized;\n\n const scores = new Map<string, number>();\n for (const item of scoring) {\n if (item.language === \"unknown\") continue;\n const weight =\n weights[item.source] ?? weights[item.kind] ?? DEFAULT_KIND_WEIGHT[item.kind] ?? 0.5;\n scores.set(item.language, (scores.get(item.language) ?? 
0) + clamp01(item.confidence) * weight);\n }\n\n // The context-vs-script guard: a confident script read pins the winner.\n const pinned = confidentScriptLanguage(scoring);\n\n const { best, bestScore, secondScore } = argmax(scores, pinned);\n\n if (best === null || bestScore < MIN_WINNING_SCORE || bestScore - secondScore < MIN_MARGIN) {\n // A pinned script language still wins even on a thin margin — clear script\n // evidence is never demoted to \"unknown\" by competing context.\n if (pinned !== null && scores.has(pinned)) {\n const score = scores.get(pinned) ?? 0;\n return {\n language: pinned,\n confidence: clamp01(score / (score + 0.15)),\n evidence: [...normalized],\n };\n }\n return { language: \"unknown\", confidence: clamp01(bestScore), evidence: [...normalized] };\n }\n\n return {\n language: best,\n confidence: clamp01(bestScore / (bestScore + secondScore + 0.15)),\n evidence: [...normalized],\n };\n}\n\n/** Normalize each item's tag into the roster's code space (BCP-47-aware). Items\n * already `\"unknown\"` pass through untouched. Tags are BCP-47-normalized\n * (`en-US` → `en`, `ua` → `uk`) so text, page, and header signals land on the\n * same code. The normalized code is kept even when it falls outside the roster —\n * argmax simply won't favor an out-of-roster context tag, but it stays in the\n * audit trail.\n *\n * The roster is accepted (and reserved) so a future revision can fold roster\n * aliasing in without a signature change; today BCP-47 normalization alone\n * reconciles the codes the producers emit. */\nfunction normalizeEvidence(\n evidence: readonly LanguageEvidence[],\n _candidates: readonly LanguageProfile[] | undefined,\n): LanguageEvidence[] {\n return evidence.map((item) => {\n if (item.language === \"unknown\") return item;\n const normalized = normalizeBCP47(item.language) ?? 
item.language;\n if (normalized === item.language) return item;\n return { ...item, language: normalized };\n });\n}\n\n/**\n * Whether a non-discriminating script read should score nothing (mode\n * `\"unknown\"`). True when `item` is a script kind flagged `discriminating:\n * false` (its winning script is owned by ≤1 roster candidate) AND no *non-script*\n * evidence corroborates its language. Corroboration must come from context kinds\n * (page tags, headers): two lone-candidate script reads agreeing is still two\n * defaults, not real evidence — so script kinds never corroborate one another.\n */\nfunction isNeutralized(item: LanguageEvidence, all: readonly LanguageEvidence[]): boolean {\n if (item.discriminating !== false || !SCRIPT_KINDS.has(item.kind)) return false;\n return !all.some(\n (other) =>\n other.language === item.language &&\n other.language !== \"unknown\" &&\n !SCRIPT_KINDS.has(other.kind),\n );\n}\n\n/** The language of a *clear script* read confident enough to pin the verdict, or\n * `null` when none qualifies. When two script reads disagree, the higher-\n * confidence one pins (a tie leaves nothing pinned — argmax decides normally). */\nfunction confidentScriptLanguage(evidence: readonly LanguageEvidence[]): string | null {\n let best: string | null = null;\n let bestConfidence = 0;\n for (const item of evidence) {\n if (item.language === \"unknown\" || !SCRIPT_KINDS.has(item.kind)) continue;\n const c = clamp01(item.confidence);\n if (c < SCRIPT_CONFIDENCE_FLOOR) continue;\n if (c > bestConfidence) {\n bestConfidence = c;\n best = item.language;\n } else if (c === bestConfidence && item.language !== best) {\n // Two equally-confident script reads for different languages — ambiguous.\n best = null;\n }\n }\n return best;\n}\n\n/**\n * Weighted argmax. When `pinned` is set (a confident script language), any\n * *other* language's score may only come from context kinds; that score is\n * capped so it can never exceed the pinned language. 
This enforces the guard\n * without discarding the context from the audit trail.\n */\nfunction argmax(\n scores: Map<string, number>,\n pinned: string | null,\n): { best: string | null; bestScore: number; secondScore: number } {\n let best: string | null = null;\n let bestScore = 0;\n let secondScore = 0;\n const pinnedScore = pinned !== null ? (scores.get(pinned) ?? 0) : 0;\n\n for (const [language, raw] of scores) {\n // Guard: a non-pinned language cannot out-score the pinned one.\n const score = pinned !== null && language !== pinned ? Math.min(raw, pinnedScore) : raw;\n if (score > bestScore) {\n secondScore = bestScore;\n bestScore = score;\n best = language;\n } else if (score > secondScore) {\n secondScore = score;\n }\n }\n // On a pinned tie (pinned capped equal to a context language), prefer pinned.\n if (pinned !== null && best !== pinned && bestScore === pinnedScore && pinnedScore > 0) {\n secondScore = bestScore;\n best = pinned;\n bestScore = pinnedScore;\n }\n return { best, bestScore, secondScore };\n}\n\nfunction clamp01(value: number): number {\n if (!Number.isFinite(value)) return 0;\n if (value < 0) return 0;\n if (value > 1) return 1;\n return value;\n}\n"]}
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
import { normalizeBCP47 } from './chunk-OVSPOZ5J.js';
|
|
2
|
+
|
|
3
|
+
// src/html.ts
|
|
4
|
+
/**
 * Producer: language clues from an HTML string's metadata.
 *
 * Reads three independent declarations, each handed to `pushTag` as its own
 * evidence item:
 *   - `<html lang>`                           → `html-lang` (confidence 0.7)
 *   - `<meta http-equiv="content-language">`  → `meta-content-language` (0.6)
 *   - `<meta property="og:locale">`           → `meta-og-locale` (0.6)
 *
 * Extraction is regex-only (never a DOM parse). Attribute names must not be
 * preceded by a character that can itself belong to an attribute name
 * (`\w`, `:`, `.`, `-`), so `data-lang=`, `xml:lang=` and `data-content=` are
 * no longer mistaken for `lang=` / `content=`. Lazy quantifiers make the first
 * matching attribute inside a tag win, and whitespace around `=` is accepted.
 *
 * @param {string | undefined} html Raw HTML; `undefined` or blank yields `[]`.
 * @returns {Array<object>} Evidence items in the order html-lang,
 *   meta-content-language, meta-og-locale (only those `pushTag` accepted).
 */
function evidenceFromHtml(html) {
  if (html === void 0 || html.trim().length === 0) return [];
  const out = [];

  // <html lang="uk">
  const htmlLang = /<html\b[^>]*?[^\w:.>-]lang\s*=\s*["']?([^"'\s>]+)/i.exec(html)?.[1];
  pushTag(out, "html-lang", 0.7, htmlLang);

  // <meta http-equiv="content-language" content="uk"> — attribute order varies,
  // so try http-equiv-first and fall back to content-first.
  const metaContentLang =
    /<meta\b[^>]*?[^\w:.>-]http-equiv\s*=\s*["']?content-language["']?[^>]*?[^\w:.>-]content\s*=\s*["']?([^"'\s>]+)/i.exec(
      html
    )?.[1] ??
    /<meta\b[^>]*?[^\w:.>-]content\s*=\s*["']?([^"'\s>]+)["']?[^>]*?[^\w:.>-]http-equiv\s*=\s*["']?content-language/i.exec(
      html
    )?.[1];
  pushTag(out, "meta-content-language", 0.6, metaContentLang);

  // <meta property="og:locale" content="uk_UA"> — attribute order varies.
  const ogLocale =
    /<meta\b[^>]*?[^\w:.>-]property\s*=\s*["']?og:locale["']?[^>]*?[^\w:.>-]content\s*=\s*["']?([^"'\s>]+)/i.exec(
      html
    )?.[1] ??
    /<meta\b[^>]*?[^\w:.>-]content\s*=\s*["']?([^"'\s>]+)["']?[^>]*?[^\w:.>-]property\s*=\s*["']?og:locale/i.exec(
      html
    )?.[1];
  pushTag(out, "meta-og-locale", 0.6, ogLocale);

  return out;
}
|
|
19
|
+
/**
 * Append one evidence item to `out` for a raw language tag, unless
 * `normalizeBCP47` rejects the tag by returning `null`.
 *
 * The item's `source` mirrors its `kind`, and `value` keeps the raw tag as it
 * was written (an empty string when `raw` is nullish).
 *
 * @param {Array<object>} out Evidence list, mutated in place.
 * @param {string} kind Evidence kind; also stored as `source`.
 * @param {number} confidence Confidence stored on the item as-is.
 * @param {string | undefined} raw Tag exactly as captured from the markup.
 * @returns {void}
 */
function pushTag(out, kind, confidence, raw) {
  const language = normalizeBCP47(raw);
  if (language !== null) {
    out.push({ kind, language, confidence, source: kind, value: raw ?? "" });
  }
}
|
|
24
|
+
|
|
25
|
+
export { evidenceFromHtml };
|
|
26
|
+
//# sourceMappingURL=chunk-KI4MAI3N.js.map
|
|
27
|
+
//# sourceMappingURL=chunk-KI4MAI3N.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/html.ts"],"names":[],"mappings":";;;AAeO,SAAS,iBAAiB,IAAA,EAA8C;AAC7E,EAAA,IAAI,IAAA,KAAS,UAAa,IAAA,CAAK,IAAA,GAAO,MAAA,KAAW,CAAA,SAAU,EAAC;AAE5D,EAAA,MAAM,MAA0B,EAAC;AAEjC,EAAA,MAAM,QAAA,GAAW,sCAAA,CAAuC,IAAA,CAAK,IAAI,IAAI,CAAC,CAAA;AACtE,EAAA,OAAA,CAAQ,GAAA,EAAK,WAAA,EAAa,GAAA,EAAK,QAAQ,CAAA;AAGvC,EAAA,MAAM,kBACJ,qFAAA,CAAsF,IAAA;AAAA,IACpF;AAAA,GACF,GAAI,CAAC,CAAA,IACL,qFAAA,CAAsF,IAAA;AAAA,IACpF;AAAA,MACE,CAAC,CAAA;AACP,EAAA,OAAA,CAAQ,GAAA,EAAK,uBAAA,EAAyB,GAAA,EAAK,eAAe,CAAA;AAG1D,EAAA,MAAM,QAAA,GACJ,4EAAA,CAA6E,IAAA,CAAK,IAAI,CAAA,GAAI,CAAC,CAAA,IAC3F,4EAAA,CAA6E,IAAA,CAAK,IAAI,CAAA,GAAI,CAAC,CAAA;AAC7F,EAAA,OAAA,CAAQ,GAAA,EAAK,gBAAA,EAAkB,GAAA,EAAK,QAAQ,CAAA;AAE5C,EAAA,OAAO,GAAA;AACT;AAEA,SAAS,OAAA,CACP,GAAA,EACA,IAAA,EACA,UAAA,EACA,GAAA,EACM;AACN,EAAA,MAAM,IAAA,GAAO,eAAe,GAAG,CAAA;AAC/B,EAAA,IAAI,SAAS,IAAA,EAAM;AACnB,EAAA,GAAA,CAAI,IAAA,CAAK,EAAE,IAAA,EAAM,QAAA,EAAU,IAAA,EAAM,UAAA,EAAY,MAAA,EAAQ,IAAA,EAAM,KAAA,EAAO,GAAA,IAAO,EAAA,EAAI,CAAA;AAC/E","file":"chunk-KI4MAI3N.js","sourcesContent":["import type { LanguageEvidence } from \"./types.js\";\nimport { normalizeBCP47 } from \"./internal/bcp47.js\";\n\n/**\n * Producer: language clues from an HTML string's metadata.\n *\n * Reads three independent declarations, each emitted as its own evidence item\n * (the fuser weighs them):\n * - `<html lang>` → `html-lang`\n * - `<meta http-equiv=\"content-language\">` → `meta-content-language`\n * - `<meta property=\"og:locale\">` → `meta-og-locale`\n *\n * All tags are BCP-47-normalized (`uk-UA` → `uk`, `en_US` → `en`). 
Sync and\n * zero-dependency — regex extraction only, never a DOM parse.\n */\nexport function evidenceFromHtml(html: string | undefined): LanguageEvidence[] {\n if (html === undefined || html.trim().length === 0) return [];\n\n const out: LanguageEvidence[] = [];\n\n const htmlLang = /<html\\b[^>]*\\blang=[\"']?([^\"'\\s>]+)/i.exec(html)?.[1];\n pushTag(out, \"html-lang\", 0.7, htmlLang);\n\n // <meta http-equiv=\"content-language\" content=\"uk\"> (attribute order varies).\n const metaContentLang =\n /<meta\\b[^>]*\\bhttp-equiv=[\"']?content-language[\"']?[^>]*\\bcontent=[\"']?([^\"'\\s>]+)/i.exec(\n html,\n )?.[1] ??\n /<meta\\b[^>]*\\bcontent=[\"']?([^\"'\\s>]+)[\"']?[^>]*\\bhttp-equiv=[\"']?content-language/i.exec(\n html,\n )?.[1];\n pushTag(out, \"meta-content-language\", 0.6, metaContentLang);\n\n // <meta property=\"og:locale\" content=\"uk_UA\"> (attribute order varies).\n const ogLocale =\n /<meta\\b[^>]*\\bproperty=[\"']?og:locale[\"']?[^>]*\\bcontent=[\"']?([^\"'\\s>]+)/i.exec(html)?.[1] ??\n /<meta\\b[^>]*\\bcontent=[\"']?([^\"'\\s>]+)[\"']?[^>]*\\bproperty=[\"']?og:locale/i.exec(html)?.[1];\n pushTag(out, \"meta-og-locale\", 0.6, ogLocale);\n\n return out;\n}\n\nfunction pushTag(\n out: LanguageEvidence[],\n kind: \"html-lang\" | \"meta-content-language\" | \"meta-og-locale\",\n confidence: number,\n raw: string | undefined,\n): void {\n const lang = normalizeBCP47(raw);\n if (lang === null) return;\n out.push({ kind, language: lang, confidence, source: kind, value: raw ?? \"\" });\n}\n"]}
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
// src/internal/classify.ts
|
|
2
|
+
// Exported rung identifier with the value 3.
// NOTE(review): presumably the id an optional third-rung (franc) classifier
// reports — confirm against the franc entry point.
var FRANC_RUNG = 3;

// Sentinel result meaning "no verdict": no language, zero margin, no rung.
var UNKNOWN = { language: "unknown", margin: 0, rung: null, discriminating: false };

// Single-character script probes (no `g` flag, so `.test` is stateless).
var CYRILLIC_RE = /\p{Script=Cyrillic}/u;
var LATIN_RE = /\p{Script=Latin}/u;

// Spans blanked out before any script or word counting, applied in this order.
var NOISE_PATTERNS = [
  // full URLs
  /\bhttps?:\/\/\S+/gi,
  // www.… without a scheme
  /\bwww\.\S+/gi,
  // bare domains (example.com/path)
  /\b[a-z0-9-]+(?:\.[a-z0-9-]+)+(?:\/\S*)?/gi,
  // @handles and #hashtags
  /[@#][\p{L}\p{N}_]+/gu,
];
|
|
21
|
+
/**
 * Blank out every span matched by `NOISE_PATTERNS`, applying the patterns in
 * list order. Each match is replaced by a single space, so the text on either
 * side of it does not fuse together.
 *
 * @param {string} text Input string.
 * @returns {string} The text with every matched span replaced by a space.
 */
function stripNoise(text) {
  return NOISE_PATTERNS.reduce((cleaned, pattern) => cleaned.replace(pattern, " "), text);
}
|
|
26
|
+
/**
 * Decide which script dominates `text` once noise has been stripped.
 *
 * Counts Cyrillic and Latin characters; every other character is ignored.
 *
 * @param {string} text Input string.
 * @returns {"cyrillic" | "latin" | null} `null` when neither script occurs;
 *   a tie resolves to `"cyrillic"`.
 */
function dominantScript(text) {
  const counts = { cyrillic: 0, latin: 0 };
  for (const ch of stripNoise(text)) {
    if (CYRILLIC_RE.test(ch)) {
      counts.cyrillic += 1;
    } else if (LATIN_RE.test(ch)) {
      counts.latin += 1;
    }
  }
  if (counts.cyrillic === 0 && counts.latin === 0) return null;
  return counts.latin > counts.cyrillic ? "latin" : "cyrillic";
}
|
|
36
|
+
/**
 * Script of a language profile, taken from the first character of
 * `profile.alphabet` that is either Cyrillic or Latin.
 *
 * @param {{ alphabet: string }} profile Profile whose alphabet is inspected.
 * @returns {"cyrillic" | "latin" | null} `null` when no character of the
 *   alphabet belongs to either script.
 */
function profileScript(profile) {
  for (const letter of profile.alphabet) {
    const isCyrillic = CYRILLIC_RE.test(letter);
    if (isCyrillic || LATIN_RE.test(letter)) {
      return isCyrillic ? "cyrillic" : "latin";
    }
  }
  return null;
}
|
|
43
|
+
/**
 * Narrow the candidate roster to profiles written in the text's dominant
 * script, keeping roster order and only the first profile per `code`.
 *
 * @param {string} text Text whose dominant script selects the candidates.
 * @param {Iterable<{ code: string, alphabet: string }>} candidates Roster.
 * @returns {Array<object>} Matching profiles; empty when the text has no
 *   dominant script.
 */
function scopeCandidates(text, candidates) {
  const script = dominantScript(text);
  if (script === null) return [];

  // A Map keeps insertion order, so it doubles as the de-duplicated result.
  const firstByCode = new Map();
  for (const candidate of candidates) {
    if (profileScript(candidate) === script && !firstByCode.has(candidate.code)) {
      firstByCode.set(candidate.code, candidate);
    }
  }
  return [...firstByCode.values()];
}
|
|
55
|
+
/**
 * Lower-case `text` and split it into maximal runs of Unicode letters.
 * Digits, punctuation and whitespace all act as separators.
 *
 * @param {string} text Input string.
 * @returns {string[]} Lower-cased letter runs; empty when there are none.
 */
function tokenize(text) {
  const words = text.toLowerCase().match(/\p{L}+/gu);
  return words === null ? [] : words;
}
|
|
58
|
+
/**
 * Count, per candidate code, the items that belong to that candidate alone.
 *
 * Every code starts at 0. An item scores one point only when exactly one
 * membership set contains it; items shared by several candidates, or owned by
 * none, score nothing.
 *
 * @param {Iterable<string>} items Things to count (a string iterates by
 *   character, an array by element).
 * @param {Array<{ code: string, set: Set<string> }>} membership Candidate sets.
 * @returns {Map<string, number>} Exclusive-hit count per code, in roster order.
 */
function tally(items, membership) {
  const scores = new Map();
  for (const { code } of membership) scores.set(code, 0);

  for (const item of items) {
    const holders = membership.filter(({ set }) => set.has(item));
    if (holders.length !== 1) continue;
    const [{ code }] = holders;
    scores.set(code, (scores.get(code) ?? 0) + 1);
  }
  return scores;
}
|
|
77
|
+
/**
 * Pick the clear winner from a score table.
 *
 * @param {Iterable<[string, number]>} scores Code/score pairs (e.g. a Map).
 * @returns {{ code: string, margin: number } | null} The top-scoring code and
 *   its lead over the runner-up (a runner-up below 0 counts as 0). `null` when
 *   the top score is below 1 or the lead is below 1, which includes ties.
 */
function leader(scores) {
  let top = { code: null, score: -1 };
  let runnerUp = -1;

  for (const [code, score] of scores) {
    if (score > top.score) {
      // The previous leader becomes the runner-up.
      runnerUp = top.score;
      top = { code, score };
    } else if (score > runnerUp) {
      runnerUp = score;
    }
  }

  if (top.code === null || top.score < 1) return null;
  const margin = top.score - Math.max(runnerUp, 0);
  if (margin >= 1) return { code: top.code, margin };
  return null;
}
|
|
94
|
+
/**
 * Build one membership record per candidate: its `code` plus a `Set` of
 * whatever `pick` returns for it (a string becomes a set of characters, an
 * array a set of elements).
 *
 * @param {Array<{ code: string }>} candidates Profiles to index.
 * @param {(candidate: object) => Iterable<string>} pick Selects the items a
 *   candidate owns.
 * @returns {Array<{ code: string, set: Set<string> }>} Records in roster order.
 */
function membershipFor(candidates, pick) {
  return candidates.map((candidate) => {
    const { code } = candidate;
    const set = new Set(pick(candidate));
    return { code, set };
  });
}
|
|
97
|
+
/**
 * Rung 1: decide by letters that only one scoped candidate owns.
 *
 * Each candidate's letter set is its `alphabet` plus optional `marks`; the
 * lower-cased text is tallied character by character against those sets.
 *
 * @param {string} text Text to classify.
 * @param {Array<{ code: string, alphabet: string, marks?: string }>} scoped
 *   Candidates already narrowed to the text's script.
 * @returns {{ language: string, margin: number, rung: 1 } | null} The verdict,
 *   or `null` when `leader` finds no clear winner.
 */
function letterRung(text, scoped) {
  const lowered = text.toLowerCase();
  const letterSets = membershipFor(scoped, (profile) => profile.alphabet + (profile.marks ?? ""));
  const winner = leader(tally(lowered, letterSets));
  if (!winner) return null;
  return { language: winner.code, margin: winner.margin, rung: 1 };
}
|
|
106
|
+
/**
 * Word rung: decide by tokens that only one scoped candidate lists in the
 * given word tier (`profile.words[tier]`; a missing list counts as empty).
 *
 * @param {string[]} tokens Tokens to tally.
 * @param {Array<{ code: string, words?: Record<string, string[]> }>} scoped
 *   Candidates already narrowed to the text's script.
 * @param {string} tier Key into each profile's `words` table.
 * @param {string | number} rung Rung label copied onto the verdict.
 * @returns {{ language: string, margin: number, rung: string | number } | null}
 *   The verdict, or `null` when `leader` finds no clear winner.
 */
function wordRung(tokens, scoped, tier, rung) {
  const wordSets = membershipFor(scoped, (profile) => profile.words?.[tier] ?? []);
  const winner = leader(tally(tokens, wordSets));
  if (!winner) return null;
  return { language: winner.code, margin: winner.margin, rung };
}
|
|
115
|
+
/**
 * Classify a short string against a candidate roster by walking the rungs in
 * order and returning the first verdict:
 *   1. letters owned by a single candidate (`letterRung`)
 *   2a. words from the "function" tier, then 2b. the "frequent" tier (`wordRung`)
 *   3. the optional caller-supplied `rung3` fallback
 *
 * The text is noise-stripped first, and the roster is narrowed to candidates
 * in the text's dominant script. `discriminating` on a verdict is `true` only
 * when at least two candidates survived that narrowing.
 *
 * Every "unknown" outcome is returned as a fresh copy of the `UNKNOWN`
 * sentinel. Handing out the shared object itself would let a caller that
 * mutates one result silently corrupt every later "unknown" result.
 *
 * @param {string} text Text to classify; empty text yields "unknown".
 * @param {Array<object>} candidates Candidate profiles; an empty roster yields
 *   "unknown".
 * @param {(cleaned: string, scoped: Array<object>) => object | null | undefined} [rung3]
 *   Optional last-resort classifier, given the cleaned text and scoped roster.
 * @returns {{ language: string, margin: number, rung: *, discriminating: boolean }}
 *   The verdict, or an "unknown" result when no rung decides.
 */
function classifyBySnippet(text, candidates, rung3) {
  // Fresh object per call so callers can never alias the module-level sentinel.
  const unknown = () => ({ ...UNKNOWN });

  if (!text || candidates.length === 0) return unknown();
  const cleaned = stripNoise(text);
  const scoped = scopeCandidates(cleaned, candidates);
  if (scoped.length === 0) return unknown();

  // With a single candidate in scope, any verdict is a default, not a choice.
  const discriminating = scoped.length >= 2;

  const byLetter = letterRung(cleaned, scoped);
  if (byLetter) return { ...byLetter, discriminating };

  const tokens = tokenize(cleaned);
  if (tokens.length === 0) return unknown();

  const byWord =
    wordRung(tokens, scoped, "function", "2a") ??
    wordRung(tokens, scoped, "frequent", "2b") ??
    rung3?.(cleaned, scoped);
  return byWord ? { ...byWord, discriminating } : unknown();
}
|
|
128
|
+
|
|
129
|
+
export { FRANC_RUNG, classifyBySnippet, scopeCandidates };
|
|
130
|
+
//# sourceMappingURL=chunk-NCGZPEDA.js.map
|
|
131
|
+
//# sourceMappingURL=chunk-NCGZPEDA.js.map
|