@oomfware/lang-detect 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +14 -0
- package/README.md +68 -0
- package/dist/eval.d.ts +8 -0
- package/dist/eval.d.ts.map +1 -0
- package/dist/eval.js +145 -0
- package/dist/eval.js.map +1 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +20 -0
- package/dist/index.js.map +1 -0
- package/dist/lite.d.ts +3 -0
- package/dist/lite.d.ts.map +1 -0
- package/dist/lite.js +20 -0
- package/dist/lite.js.map +1 -0
- package/dist/nn/detect.d.ts +25 -0
- package/dist/nn/detect.d.ts.map +1 -0
- package/dist/nn/detect.js +209 -0
- package/dist/nn/detect.js.map +1 -0
- package/dist/nn/forward.d.ts +38 -0
- package/dist/nn/forward.d.ts.map +1 -0
- package/dist/nn/forward.js +154 -0
- package/dist/nn/forward.js.map +1 -0
- package/dist/nn/groups.d.ts +23 -0
- package/dist/nn/groups.d.ts.map +1 -0
- package/dist/nn/groups.js +81 -0
- package/dist/nn/groups.js.map +1 -0
- package/dist/nn/load.d.ts +15 -0
- package/dist/nn/load.d.ts.map +1 -0
- package/dist/nn/load.js +21 -0
- package/dist/nn/load.js.map +1 -0
- package/dist/nn/load.node.d.ts +15 -0
- package/dist/nn/load.node.d.ts.map +1 -0
- package/dist/nn/load.node.js +23 -0
- package/dist/nn/load.node.js.map +1 -0
- package/dist/nn/normalize.d.ts +17 -0
- package/dist/nn/normalize.d.ts.map +1 -0
- package/dist/nn/normalize.js +34 -0
- package/dist/nn/normalize.js.map +1 -0
- package/package.json +61 -0
- package/src/eval.ts +173 -0
- package/src/index.ts +22 -0
- package/src/lite.ts +25 -0
- package/src/nn/detect.ts +309 -0
- package/src/nn/forward.ts +181 -0
- package/src/nn/load.node.ts +24 -0
- package/src/nn/load.ts +21 -0
- package/src/nn/normalize.ts +38 -0
- package/weights/lite/arabic.bin +0 -0
- package/weights/lite/arabic.json +1 -0
- package/weights/lite/cyrillic.bin +5 -0
- package/weights/lite/cyrillic.json +1 -0
- package/weights/lite/devanagari.bin +0 -0
- package/weights/lite/devanagari.json +1 -0
- package/weights/lite/latin.bin +5 -0
- package/weights/lite/latin.json +1 -0
- package/weights/standard/arabic.bin +0 -0
- package/weights/standard/arabic.json +1 -0
- package/weights/standard/cyrillic.bin +0 -0
- package/weights/standard/cyrillic.json +1 -0
- package/weights/standard/devanagari.bin +9 -0
- package/weights/standard/devanagari.json +1 -0
- package/weights/standard/latin.bin +0 -0
- package/weights/standard/latin.json +1 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
BSD Zero Clause License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Mary
|
|
4
|
+
|
|
5
|
+
Permission to use, copy, modify, and/or distribute this software for any
|
|
6
|
+
purpose with or without fee is hereby granted.
|
|
7
|
+
|
|
8
|
+
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
|
|
9
|
+
REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
|
|
10
|
+
AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
|
|
11
|
+
INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
|
|
12
|
+
LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
|
|
13
|
+
OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
|
|
14
|
+
PERFORMANCE OF THIS SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
# @oomfware/lang-detect
|
|
2
|
+
|
|
3
|
+
natural language detection library.
|
|
4
|
+
|
|
5
|
+
```sh
|
|
6
|
+
npm install @oomfware/lang-detect
|
|
7
|
+
```
|
|
8
|
+
|
|
9
|
+
## usage
|
|
10
|
+
|
|
11
|
+
call `initialize()` once to load model weights via `fetch()`, then use `detect()` synchronously on
|
|
12
|
+
any text. results are sorted by probability, highest first.
|
|
13
|
+
|
|
14
|
+
```ts
|
|
15
|
+
import { initialize, detect } from '@oomfware/lang-detect';
|
|
16
|
+
|
|
17
|
+
await initialize();
|
|
18
|
+
|
|
19
|
+
const results = detect('the quick brown fox jumps over the lazy dog');
|
|
20
|
+
console.log(results[0]); // -> ['eng', 0.98]
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
mixed-script text returns detections for each script present, with probabilities scaled by
|
|
24
|
+
proportion:
|
|
25
|
+
|
|
26
|
+
```ts
|
|
27
|
+
const results = detect('Hello Мир');
|
|
28
|
+
// -> [['eng', ...], ['rus', ...]]
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
### variants
|
|
32
|
+
|
|
33
|
+
two variants are available, trading accuracy for smaller weights:
|
|
34
|
+
|
|
35
|
+
| subpath | weights | accuracy |
|
|
36
|
+
| --------- | ------- | -------- |
|
|
37
|
+
| (default) | 57.4 KB | 95.2% |
|
|
38
|
+
| `/lite` | 43.1 KB | 95.1% |
|
|
39
|
+
|
|
40
|
+
```ts
|
|
41
|
+
import { initialize, detect } from '@oomfware/lang-detect/lite';
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
### supported languages
|
|
45
|
+
|
|
46
|
+
50 languages across Latin, Cyrillic, Arabic, Devanagari, CJK, and unique-script families.
|
|
47
|
+
|
|
48
|
+
| code | language | code | language | code | language |
|
|
49
|
+
| ----- | ---------------- | ----- | ---------------- | ----- | ---------- |
|
|
50
|
+
| `afr` | Afrikaans | `hau` | Hausa | `por` | Portuguese |
|
|
51
|
+
| `ara` | Arabic | `heb` | Hebrew | `ron` | Romanian |
|
|
52
|
+
| `aze` | Azerbaijani | `hin` | Hindi | `run` | Rundi |
|
|
53
|
+
| `bel` | Belarusian | `hrv` | Croatian | `rus` | Russian |
|
|
54
|
+
| `ben` | Bengali | `hun` | Hungarian | `slk` | Slovak |
|
|
55
|
+
| `bul` | Bulgarian | `hye` | Armenian | `spa` | Spanish |
|
|
56
|
+
| `cat` | Catalan | `ind` | Indonesian | `srp` | Serbian |
|
|
57
|
+
| `ces` | Czech | `isl` | Icelandic | `swe` | Swedish |
|
|
58
|
+
| `ckb` | Central Kurdish | `ita` | Italian | `tgl` | Tagalog |
|
|
59
|
+
| `cmn` | Mandarin Chinese | `jpn` | Japanese | `tur` | Turkish |
|
|
60
|
+
| `dan` | Danish | `kat` | Georgian | `ukr` | Ukrainian |
|
|
61
|
+
| `deu` | German | `kaz` | Kazakh | `vie` | Vietnamese |
|
|
62
|
+
| `ell` | Greek | `kor` | Korean | | |
|
|
63
|
+
| `eng` | English | `lit` | Lithuanian | | |
|
|
64
|
+
| `est` | Estonian | `mar` | Marathi | | |
|
|
65
|
+
| `eus` | Basque | `mkd` | Macedonian | | |
|
|
66
|
+
| `fin` | Finnish | `nld` | Dutch | | |
|
|
67
|
+
| `fra` | French | `nob` | Norwegian Bokmål | | |
|
|
68
|
+
| `pes` | Persian | `pol` | Polish | | |
|
package/dist/eval.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"eval.d.ts","sourceRoot":"","sources":["../src/eval.ts"],"names":[],"mappings":"AAAA;;;;;GAKG"}
|
package/dist/eval.js
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* evaluate detection accuracy against the UDHR dataset.
|
|
3
|
+
*
|
|
4
|
+
* usage:
|
|
5
|
+
* node --conditions source src/eval.ts [--lite] [--lande]
|
|
6
|
+
*/
|
|
7
|
+
import fs from 'node:fs';
|
|
8
|
+
import path from 'node:path';
|
|
9
|
+
import { parseArgs } from 'node:util';
|
|
10
|
+
import { create } from "./nn/detect.js";
|
|
11
|
+
const { values: args } = parseArgs({
|
|
12
|
+
options: {
|
|
13
|
+
lite: { type: 'boolean', default: false },
|
|
14
|
+
lande: { type: 'boolean', default: false },
|
|
15
|
+
},
|
|
16
|
+
});
|
|
17
|
+
const variant = args.lite ? 'lite' : 'standard';
|
|
18
|
+
const quantBits = args.lite ? 6 : 8;
|
|
19
|
+
// resolve the weight files relative to this module, as the package entry points
// (index.js / lite.js) do. concatenating a filesystem path onto `file://` breaks
// when a directory name contains `#`, `?` or `%`, because the URL parser treats
// those as fragment/query/escape syntax.
const weightsBase = new URL(`../weights/${variant}/`, import.meta.url);

/**
 * builds the weight/metadata URL pair for one script group.
 *
 * @param group group name, also used as the file stem inside the weights directory
 * @returns URLs of the group's `.bin` and `.json` files
 */
const groupSource = (group) => ({
	weights: new URL(`${group}.bin`, weightsBase),
	meta: new URL(`${group}.json`, weightsBase),
});

const { initialize, detect } = create(
	{
		cyrillic: groupSource('cyrillic'),
		arabic: groupSource('arabic'),
		devanagari: groupSource('devanagari'),
		latin: groupSource('latin'),
	},
	quantBits,
);
|
|
38
|
+
// ── UDHR code → ISO 639-3 mapping ──
|
|
39
|
+
const UDHR_CODE_TO_LANG = {
|
|
40
|
+
afr: 'afr',
|
|
41
|
+
bel: 'bel',
|
|
42
|
+
ben: 'ben',
|
|
43
|
+
bul: 'bul',
|
|
44
|
+
cat: 'cat',
|
|
45
|
+
ces: 'ces',
|
|
46
|
+
ckb: 'ckb',
|
|
47
|
+
cmn_hans: 'cmn',
|
|
48
|
+
dan: 'dan',
|
|
49
|
+
deu_1996: 'deu',
|
|
50
|
+
ell_monotonic: 'ell',
|
|
51
|
+
eng: 'eng',
|
|
52
|
+
eus: 'eus',
|
|
53
|
+
fin: 'fin',
|
|
54
|
+
fra: 'fra',
|
|
55
|
+
hau_NG: 'hau',
|
|
56
|
+
heb: 'heb',
|
|
57
|
+
hin: 'hin',
|
|
58
|
+
hrv: 'hrv',
|
|
59
|
+
hun: 'hun',
|
|
60
|
+
hye: 'hye',
|
|
61
|
+
ind: 'ind',
|
|
62
|
+
isl: 'isl',
|
|
63
|
+
ita: 'ita',
|
|
64
|
+
jpn: 'jpn',
|
|
65
|
+
kat: 'kat',
|
|
66
|
+
kaz: 'kaz',
|
|
67
|
+
kor: 'kor',
|
|
68
|
+
lit: 'lit',
|
|
69
|
+
mar: 'mar',
|
|
70
|
+
mkd: 'mkd',
|
|
71
|
+
nld: 'nld',
|
|
72
|
+
nob: 'nob',
|
|
73
|
+
pes_1: 'pes',
|
|
74
|
+
pol: 'pol',
|
|
75
|
+
por_BR: 'por',
|
|
76
|
+
por_PT: 'por',
|
|
77
|
+
ron_2006: 'ron',
|
|
78
|
+
run: 'run',
|
|
79
|
+
rus: 'rus',
|
|
80
|
+
slk: 'slk',
|
|
81
|
+
spa: 'spa',
|
|
82
|
+
srp_cyrl: 'srp',
|
|
83
|
+
srp_latn: 'srp',
|
|
84
|
+
swe: 'swe',
|
|
85
|
+
tgl: 'tgl',
|
|
86
|
+
tur: 'tur',
|
|
87
|
+
ukr: 'ukr',
|
|
88
|
+
vie: 'vie',
|
|
89
|
+
};
|
|
90
|
+
const TAG_RE = /<[^>]+>/g;
|
|
91
|
+
// ── load UDHR sentences ──
|
|
92
|
+
const declDir = path.resolve(import.meta.dirname, '..', 'train', 'resources', 'udhr', 'declaration');
|
|
93
|
+
const sentences = [];
|
|
94
|
+
for (const [code, lang] of Object.entries(UDHR_CODE_TO_LANG)) {
|
|
95
|
+
const htmlFile = path.join(declDir, `${code}.html`);
|
|
96
|
+
if (!fs.existsSync(htmlFile)) {
|
|
97
|
+
continue;
|
|
98
|
+
}
|
|
99
|
+
const content = fs.readFileSync(htmlFile, 'utf-8');
|
|
100
|
+
for (const match of content.matchAll(/<p>(.*?)<\/p>/gs)) {
|
|
101
|
+
const text = match[1].replace(TAG_RE, '').trim();
|
|
102
|
+
if (text.length < 10) {
|
|
103
|
+
continue;
|
|
104
|
+
}
|
|
105
|
+
sentences.push({ lang, text });
|
|
106
|
+
}
|
|
107
|
+
}
|
|
108
|
+
/**
 * scores a detector against every loaded UDHR sentence and prints a report.
 *
 * the report has a header, the sentence and language counts, the overall
 * accuracy, and one line per language that scored below 100%, worst first.
 *
 * @param name label printed in the report header
 * @param detectFn returns the top language code predicted for a text
 */
const evaluate = (name, detectFn) => {
	const perLang = new Map();
	let totalPass = 0;

	for (const { lang, text } of sentences) {
		let stats = perLang.get(lang);
		if (!stats) {
			stats = { pass: 0, total: 0 };
			perLang.set(lang, stats);
		}

		stats.total++;
		if (detectFn(text) === lang) {
			stats.pass++;
			totalPass++;
		}
	}

	console.log(`\n=== ${name} ===`);
	console.log(`${sentences.length} sentences, ${perLang.size} languages`);

	// no sentences were loaded (every UDHR file was skipped as missing);
	// 0 / 0 would otherwise be reported as "NaN%".
	if (sentences.length === 0) {
		console.log(`overall accuracy: n/a (no sentences loaded)`);
		return;
	}

	const overallAcc = (totalPass / sentences.length) * 100;
	console.log(`overall accuracy: ${overallAcc.toFixed(2)}%`);

	// worst-performing languages first; perfect scores are omitted from the listing
	const ratio = (stats) => stats.pass / stats.total;
	const sorted = [...perLang.entries()].sort((a, b) => ratio(a[1]) - ratio(b[1]));
	for (const [lang, stats] of sorted) {
		const acc = ratio(stats) * 100;
		if (acc < 100) {
			console.log(`  ${lang}: ${acc.toFixed(1)}% (${stats.total})`);
		}
	}
};
|
|
131
|
+
// ── evaluate ──
|
|
132
|
+
await initialize();
|
|
133
|
+
evaluate(`UDHR: ${variant} (${quantBits}-bit)`, (text) => {
|
|
134
|
+
const result = detect(text);
|
|
135
|
+
return result[0]?.[0];
|
|
136
|
+
});
|
|
137
|
+
if (args.lande) {
|
|
138
|
+
// eslint-disable-next-line @typescript-eslint/no-require-imports
|
|
139
|
+
const { default: lande } = await import('lande');
|
|
140
|
+
evaluate('UDHR: lande', (text) => {
|
|
141
|
+
const result = lande(text);
|
|
142
|
+
return result?.[0]?.[0];
|
|
143
|
+
});
|
|
144
|
+
}
|
|
145
|
+
//# sourceMappingURL=eval.js.map
|
package/dist/eval.js.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"eval.js","sourceRoot":"","sources":["../src/eval.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,MAAM,SAAS,CAAC;AACzB,OAAO,IAAI,MAAM,WAAW,CAAC;AAC7B,OAAO,EAAE,SAAS,EAAE,MAAM,WAAW,CAAC;AAEtC,OAAO,EAAE,MAAM,EAAE,MAAM,gBAAgB,CAAC;AAExC,MAAM,EAAE,MAAM,EAAE,IAAI,EAAE,GAAG,SAAS,CAAC;IAClC,OAAO,EAAE;QACR,IAAI,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE,OAAO,EAAE,KAAK,EAAE;QACzC,KAAK,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE,OAAO,EAAE,KAAK,EAAE;KAC1C;CACD,CAAC,CAAC;AAEH,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,UAAU,CAAC;AAChD,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;AAEpC,MAAM,UAAU,GAAG,IAAI,CAAC,OAAO,CAAC,MAAM,CAAC,IAAI,CAAC,OAAQ,EAAE,IAAI,EAAE,SAAS,EAAE,OAAO,CAAC,CAAC;AAChF,MAAM,EAAE,UAAU,EAAE,MAAM,EAAE,GAAG,MAAM,CACpC;IACC,QAAQ,EAAE;QACT,OAAO,EAAE,IAAI,GAAG,CAAC,UAAU,IAAI,CAAC,IAAI,CAAC,UAAU,EAAE,cAAc,CAAC,EAAE,CAAC;QACnE,IAAI,EAAE,IAAI,GAAG,CAAC,UAAU,IAAI,CAAC,IAAI,CAAC,UAAU,EAAE,eAAe,CAAC,EAAE,CAAC;KACjE;IACD,MAAM,EAAE;QACP,OAAO,EAAE,IAAI,GAAG,CAAC,UAAU,IAAI,CAAC,IAAI,CAAC,UAAU,EAAE,YAAY,CAAC,EAAE,CAAC;QACjE,IAAI,EAAE,IAAI,GAAG,CAAC,UAAU,IAAI,CAAC,IAAI,CAAC,UAAU,EAAE,aAAa,CAAC,EAAE,CAAC;KAC/D;IACD,UAAU,EAAE;QACX,OAAO,EAAE,IAAI,GAAG,CAAC,UAAU,IAAI,CAAC,IAAI,CAAC,UAAU,EAAE,gBAAgB,CAAC,EAAE,CAAC;QACrE,IAAI,EAAE,IAAI,GAAG,CAAC,UAAU,IAAI,CAAC,IAAI,CAAC,UAAU,EAAE,iBAAiB,CAAC,EAAE,CAAC;KACnE;IACD,KAAK,EAAE;QACN,OAAO,EAAE,IAAI,GAAG,CAAC,UAAU,IAAI,CAAC,IAAI,CAAC,UAAU,EAAE,WAAW,CAAC,EAAE,CAAC;QAChE,IAAI,EAAE,IAAI,GAAG,CAAC,UAAU,IAAI,CAAC,IAAI,CAAC,UAAU,EAAE,YAAY,CAAC,EAAE,CAAC;KAC9D;CACD,EACD,SAAS,CACT,CAAC;AAEF,sCAAsC;AAEtC,MAAM,iBAAiB,GAA2B;IACjD,GAAG,EAAE,KAAK;IACV,GAAG,EAAE,KAAK;IACV,GAAG,EAAE,KAAK;IACV,GAAG,EAAE,KAAK;IACV,GAAG,EAAE,KAAK;IACV,GAAG,EAAE,KAAK;IACV,GAAG,EAAE,KAAK;IACV,QAAQ,EAAE,KAAK;IACf,GAAG,EAAE,KAAK;IACV,QAAQ,EAAE,KAAK;IACf,aAAa,EAAE,KAAK;IACpB,GAAG,EAAE,KAAK;IACV,GAAG,EAAE,KAAK;IACV,GAAG,EAAE,KAAK;IACV,GAAG,EAAE,KAAK;IACV,MAAM,EAAE,KAAK;IACb,GAAG,EAAE,KAAK;IACV,GAAG,EAAE,KAAK;IACV,GAAG,EAAE,KAAK;
IACV,GAAG,EAAE,KAAK;IACV,GAAG,EAAE,KAAK;IACV,GAAG,EAAE,KAAK;IACV,GAAG,EAAE,KAAK;IACV,GAAG,EAAE,KAAK;IACV,GAAG,EAAE,KAAK;IACV,GAAG,EAAE,KAAK;IACV,GAAG,EAAE,KAAK;IACV,GAAG,EAAE,KAAK;IACV,GAAG,EAAE,KAAK;IACV,GAAG,EAAE,KAAK;IACV,GAAG,EAAE,KAAK;IACV,GAAG,EAAE,KAAK;IACV,GAAG,EAAE,KAAK;IACV,KAAK,EAAE,KAAK;IACZ,GAAG,EAAE,KAAK;IACV,MAAM,EAAE,KAAK;IACb,MAAM,EAAE,KAAK;IACb,QAAQ,EAAE,KAAK;IACf,GAAG,EAAE,KAAK;IACV,GAAG,EAAE,KAAK;IACV,GAAG,EAAE,KAAK;IACV,GAAG,EAAE,KAAK;IACV,QAAQ,EAAE,KAAK;IACf,QAAQ,EAAE,KAAK;IACf,GAAG,EAAE,KAAK;IACV,GAAG,EAAE,KAAK;IACV,GAAG,EAAE,KAAK;IACV,GAAG,EAAE,KAAK;IACV,GAAG,EAAE,KAAK;CACV,CAAC;AAEF,MAAM,MAAM,GAAG,UAAU,CAAC;AAE1B,4BAA4B;AAE5B,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO,CAAC,MAAM,CAAC,IAAI,CAAC,OAAQ,EAAE,IAAI,EAAE,OAAO,EAAE,WAAW,EAAE,MAAM,EAAE,aAAa,CAAC,CAAC;AACtG,MAAM,SAAS,GAAqC,EAAE,CAAC;AAEvD,KAAK,MAAM,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,iBAAiB,CAAC,EAAE,CAAC;IAC9D,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,GAAG,IAAI,OAAO,CAAC,CAAC;IACpD,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,QAAQ,CAAC,EAAE,CAAC;QAC9B,SAAS;IACV,CAAC;IAED,MAAM,OAAO,GAAG,EAAE,CAAC,YAAY,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;IACnD,KAAK,MAAM,KAAK,IAAI,OAAO,CAAC,QAAQ,CAAC,iBAAiB,CAAC,EAAE,CAAC;QACzD,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;QACjD,IAAI,IAAI,CAAC,MAAM,GAAG,EAAE,EAAE,CAAC;YACtB,SAAS;QACV,CAAC;QACD,SAAS,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC,CAAC;IAChC,CAAC;AACF,CAAC;AAMD,MAAM,QAAQ,GAAG,CAAC,IAAY,EAAE,QAA8C,EAAE,EAAE;IACjF,MAAM,OAAO,GAA0B,EAAE,CAAC;IAC1C,IAAI,SAAS,GAAG,CAAC,CAAC;IAElB,KAAK,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,SAAS,EAAE,CAAC;QACxC,OAAO,CAAC,IAAI,CAAC,KAAK,EAAE,IAAI,EAAE,CAAC,EAAE,KAAK,EAAE,CAAC,EAAE,CAAC;QACxC,OAAO,CAAC,IAAI,CAAC,CAAC,KAAK,EAAE,CAAC;QAEtB,IAAI,QAAQ,CAAC,IAAI,CAAC,KAAK,IAAI,EAAE,CAAC;YAC7B,OAAO,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC;YACrB,SAAS,EAAE,CAAC;QACb,CAAC;IACF,CAAC;IAED,MAAM,UAAU,GAAG,CAAC,SAAS,GAAG,SAAS,CAAC,MAAM,CAAC,GAAG,GAAG,CAAC;IAExD,OAAO,CAAC,GAAG,CAAC,SAAS,IAAI,MAAM,CAAC,CAAC;IACjC,OAAO
,CAAC,GAAG,CAAC,GAAG,SAAS,CAAC,MAAM,eAAe,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,MAAM,YAAY,CAAC,CAAC;IACvF,OAAO,CAAC,GAAG,CAAC,qBAAqB,UAAU,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC;IAE3D,MAAM,MAAM,GAAG,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC;IACvG,KAAK,MAAM,CAAC,IAAI,EAAE,KAAK,CAAC,IAAI,MAAM,EAAE,CAAC;QACpC,MAAM,GAAG,GAAG,CAAC,KAAK,CAAC,IAAI,GAAG,KAAK,CAAC,KAAK,CAAC,GAAG,GAAG,CAAC;QAC7C,IAAI,GAAG,GAAG,GAAG,EAAE,CAAC;YACf,OAAO,CAAC,GAAG,CAAC,KAAK,IAAI,KAAK,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,KAAK,CAAC,KAAK,GAAG,CAAC,CAAC;QAC/D,CAAC;IACF,CAAC;AACF,CAAC,CAAC;AAEF,iBAAiB;AAEjB,MAAM,UAAU,EAAE,CAAC;AAEnB,QAAQ,CAAC,SAAS,OAAO,KAAK,SAAS,OAAO,EAAE,CAAC,IAAI,EAAE,EAAE;IACxD,MAAM,MAAM,GAAG,MAAM,CAAC,IAAI,CAAC,CAAC;IAC5B,OAAO,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;AACvB,CAAC,CAAC,CAAC;AAEH,IAAI,IAAI,CAAC,KAAK,EAAE,CAAC;IAChB,iEAAiE;IACjE,MAAM,EAAE,OAAO,EAAE,KAAK,EAAE,GAAG,MAAM,MAAM,CAAC,OAAO,CAAC,CAAC;IACjD,QAAQ,CAAC,aAAa,EAAE,CAAC,IAAI,EAAE,EAAE;QAChC,MAAM,MAAM,GAAG,KAAK,CAAC,IAAI,CAAC,CAAC;QAC3B,OAAO,MAAM,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;IACzB,CAAC,CAAC,CAAC;AACJ,CAAC"}
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAEA,YAAY,EAAE,SAAS,EAAE,MAAM,gBAAgB,CAAC;AAEhD,eAAO,MAAQ,UAAU,uBAAE,MAAM,wDAiB/B,CAAC"}
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import { create } from "./nn/detect.js";
|
|
2
|
+
export const { initialize, detect } = create({
|
|
3
|
+
cyrillic: {
|
|
4
|
+
weights: new URL('../weights/standard/cyrillic.bin', import.meta.url),
|
|
5
|
+
meta: new URL('../weights/standard/cyrillic.json', import.meta.url),
|
|
6
|
+
},
|
|
7
|
+
arabic: {
|
|
8
|
+
weights: new URL('../weights/standard/arabic.bin', import.meta.url),
|
|
9
|
+
meta: new URL('../weights/standard/arabic.json', import.meta.url),
|
|
10
|
+
},
|
|
11
|
+
devanagari: {
|
|
12
|
+
weights: new URL('../weights/standard/devanagari.bin', import.meta.url),
|
|
13
|
+
meta: new URL('../weights/standard/devanagari.json', import.meta.url),
|
|
14
|
+
},
|
|
15
|
+
latin: {
|
|
16
|
+
weights: new URL('../weights/standard/latin.bin', import.meta.url),
|
|
17
|
+
meta: new URL('../weights/standard/latin.json', import.meta.url),
|
|
18
|
+
},
|
|
19
|
+
});
|
|
20
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,EAAE,MAAM,gBAAgB,CAAC;AAIxC,MAAM,CAAC,MAAM,EAAE,UAAU,EAAE,MAAM,EAAE,GAAG,MAAM,CAAC;IAC5C,QAAQ,EAAE;QACT,OAAO,EAAE,IAAI,GAAG,CAAC,kCAAkC,EAAE,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC;QACrE,IAAI,EAAE,IAAI,GAAG,CAAC,mCAAmC,EAAE,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC;KACnE;IACD,MAAM,EAAE;QACP,OAAO,EAAE,IAAI,GAAG,CAAC,gCAAgC,EAAE,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC;QACnE,IAAI,EAAE,IAAI,GAAG,CAAC,iCAAiC,EAAE,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC;KACjE;IACD,UAAU,EAAE;QACX,OAAO,EAAE,IAAI,GAAG,CAAC,oCAAoC,EAAE,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC;QACvE,IAAI,EAAE,IAAI,GAAG,CAAC,qCAAqC,EAAE,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC;KACrE;IACD,KAAK,EAAE;QACN,OAAO,EAAE,IAAI,GAAG,CAAC,+BAA+B,EAAE,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC;QAClE,IAAI,EAAE,IAAI,GAAG,CAAC,gCAAgC,EAAE,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC;KAChE;CACD,CAAC,CAAC"}
|
package/dist/lite.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"lite.d.ts","sourceRoot":"","sources":["../src/lite.ts"],"names":[],"mappings":"AAEA,YAAY,EAAE,SAAS,EAAE,MAAM,gBAAgB,CAAC;AAEhD,eAAO,MAAQ,UAAU,uBAAE,MAAM,wDAoBhC,CAAC"}
|
package/dist/lite.js
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import { create } from "./nn/detect.js";
|
|
2
|
+
export const { initialize, detect } = create({
|
|
3
|
+
cyrillic: {
|
|
4
|
+
weights: new URL('../weights/lite/cyrillic.bin', import.meta.url),
|
|
5
|
+
meta: new URL('../weights/lite/cyrillic.json', import.meta.url),
|
|
6
|
+
},
|
|
7
|
+
arabic: {
|
|
8
|
+
weights: new URL('../weights/lite/arabic.bin', import.meta.url),
|
|
9
|
+
meta: new URL('../weights/lite/arabic.json', import.meta.url),
|
|
10
|
+
},
|
|
11
|
+
devanagari: {
|
|
12
|
+
weights: new URL('../weights/lite/devanagari.bin', import.meta.url),
|
|
13
|
+
meta: new URL('../weights/lite/devanagari.json', import.meta.url),
|
|
14
|
+
},
|
|
15
|
+
latin: {
|
|
16
|
+
weights: new URL('../weights/lite/latin.bin', import.meta.url),
|
|
17
|
+
meta: new URL('../weights/lite/latin.json', import.meta.url),
|
|
18
|
+
},
|
|
19
|
+
}, 6);
|
|
20
|
+
//# sourceMappingURL=lite.js.map
|
package/dist/lite.js.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"lite.js","sourceRoot":"","sources":["../src/lite.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,EAAE,MAAM,gBAAgB,CAAC;AAIxC,MAAM,CAAC,MAAM,EAAE,UAAU,EAAE,MAAM,EAAE,GAAG,MAAM,CAC3C;IACC,QAAQ,EAAE;QACT,OAAO,EAAE,IAAI,GAAG,CAAC,8BAA8B,EAAE,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC;QACjE,IAAI,EAAE,IAAI,GAAG,CAAC,+BAA+B,EAAE,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC;KAC/D;IACD,MAAM,EAAE;QACP,OAAO,EAAE,IAAI,GAAG,CAAC,4BAA4B,EAAE,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC;QAC/D,IAAI,EAAE,IAAI,GAAG,CAAC,6BAA6B,EAAE,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC;KAC7D;IACD,UAAU,EAAE;QACX,OAAO,EAAE,IAAI,GAAG,CAAC,gCAAgC,EAAE,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC;QACnE,IAAI,EAAE,IAAI,GAAG,CAAC,iCAAiC,EAAE,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC;KACjE;IACD,KAAK,EAAE;QACN,OAAO,EAAE,IAAI,GAAG,CAAC,2BAA2B,EAAE,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC;QAC9D,IAAI,EAAE,IAAI,GAAG,CAAC,4BAA4B,EAAE,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC;KAC5D;CACD,EACD,CAAC,CACD,CAAC"}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
/** a single detection result: ISO 639-3 language code and its probability. */
|
|
2
|
+
export type Detection = [lang: string, probability: number];
|
|
3
|
+
/** URLs for a single group's weight + metadata files. */
|
|
4
|
+
type GroupSource = {
|
|
5
|
+
weights: URL;
|
|
6
|
+
meta: URL;
|
|
7
|
+
};
|
|
8
|
+
/** returned by {@link create} — call initialize() once, then detect() synchronously. */
|
|
9
|
+
type Detector = {
|
|
10
|
+
initialize: () => Promise<void>;
|
|
11
|
+
detect: (text: string) => Detection[];
|
|
12
|
+
};
|
|
13
|
+
/**
|
|
14
|
+
* creates a detector for a specific weight variant.
|
|
15
|
+
*
|
|
16
|
+
* call initialize() once to load and dequantize weights via fetch(), then
|
|
17
|
+
* call detect() synchronously for each input text.
|
|
18
|
+
*
|
|
19
|
+
* @param sources record of group names to their weight/meta file URLs
|
|
20
|
+
* @param quantBits quantization bit width (default 8)
|
|
21
|
+
* @returns detector with initialize() and detect() methods
|
|
22
|
+
*/
|
|
23
|
+
export declare const create: (sources: Record<string, GroupSource>, quantBits?: number) => Detector;
|
|
24
|
+
export {};
|
|
25
|
+
//# sourceMappingURL=detect.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"detect.d.ts","sourceRoot":"","sources":["../../src/nn/detect.ts"],"names":[],"mappings":"AAOA,8EAA8E;AAC9E,MAAM,MAAM,SAAS,GAAG,CAAC,IAAI,EAAE,MAAM,EAAE,WAAW,EAAE,MAAM,CAAC,CAAC;AAE5D,yDAAyD;AACzD,KAAK,WAAW,GAAG;IAClB,OAAO,EAAE,GAAG,CAAC;IACb,IAAI,EAAE,GAAG,CAAC;CACV,CAAC;AAwBF,wFAAwF;AACxF,KAAK,QAAQ,GAAG;IACf,UAAU,EAAE,MAAM,OAAO,CAAC,IAAI,CAAC,CAAC;IAChC,MAAM,EAAE,CAAC,IAAI,EAAE,MAAM,KAAK,SAAS,EAAE,CAAC;CACtC,CAAC;AA+IF;;;;;;;;;GASG;AACH,eAAO,MAAM,MAAM,GAAI,SAAS,MAAM,CAAC,MAAM,EAAE,WAAW,CAAC,EAAE,kBAAa,KAAG,QAmF5E,CAAC"}
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
import { loadBinary, loadJson } from '#load';
|
|
2
|
+
import { forward, loadWeights, loadWeights6 } from "./forward.js";
|
|
3
|
+
import { normalize, extractNgrams } from "./normalize.js";
|
|
4
|
+
/**
 * inclusive codepoint ranges and the script family each belongs to.
 *
 * the ranges do not overlap, so lookup order does not affect the result.
 */
const SCRIPT_RANGES = [
	// unique scripts
	{ from: 0xac00, to: 0xd7af, family: 'korean' },
	{ from: 0x1100, to: 0x11ff, family: 'korean' },
	{ from: 0x10a0, to: 0x10ff, family: 'georgian' },
	{ from: 0x2d00, to: 0x2d2f, family: 'georgian' },
	{ from: 0x0530, to: 0x058f, family: 'armenian' },
	{ from: 0x0980, to: 0x09ff, family: 'bengali' },
	{ from: 0x0370, to: 0x03ff, family: 'greek' },
	{ from: 0x1f00, to: 0x1fff, family: 'greek' },
	{ from: 0x0590, to: 0x05ff, family: 'hebrew' },

	// CJK
	{ from: 0x3040, to: 0x309f, family: 'cjk_kana' },
	{ from: 0x30a0, to: 0x30ff, family: 'cjk_kana' },
	{ from: 0x4e00, to: 0x9fff, family: 'cjk_han' },
	{ from: 0x3400, to: 0x4dbf, family: 'cjk_han' },

	// NN groups
	{ from: 0x0400, to: 0x04ff, family: 'cyrillic' },
	{ from: 0x0600, to: 0x06ff, family: 'arabic' },
	{ from: 0x0750, to: 0x077f, family: 'arabic' },
	{ from: 0x0900, to: 0x097f, family: 'devanagari' },
	{ from: 0x0041, to: 0x005a, family: 'latin' },
	{ from: 0x0061, to: 0x007a, family: 'latin' },
	{ from: 0x00c0, to: 0x024f, family: 'latin' },
	// Latin Extended Additional: precomposed letters such as U+1EA1 and U+1EC7,
	// which Vietnamese text relies on
	{ from: 0x1e00, to: 0x1eff, family: 'latin' },
];

/**
 * classifies a character's Unicode codepoint into a script family.
 *
 * @param cp the codepoint to classify
 * @returns the script family, or `null` if not recognized
 */
const classifyCodepoint = (cp) => {
	for (const range of SCRIPT_RANGES) {
		if (cp >= range.from && cp <= range.to) {
			return range.family;
		}
	}

	return null;
};
|
|
52
|
+
/** maps unique script families to their ISO 639-3 language code. */
|
|
53
|
+
const UNIQUE_SCRIPT_MAP = {
|
|
54
|
+
korean: 'kor',
|
|
55
|
+
georgian: 'kat',
|
|
56
|
+
armenian: 'hye',
|
|
57
|
+
bengali: 'ben',
|
|
58
|
+
greek: 'ell',
|
|
59
|
+
hebrew: 'heb',
|
|
60
|
+
};
|
|
61
|
+
/** maps script families to NN group names. */
|
|
62
|
+
const SCRIPT_TO_GROUP = {
|
|
63
|
+
cyrillic: 'cyrillic',
|
|
64
|
+
arabic: 'arabic',
|
|
65
|
+
devanagari: 'devanagari',
|
|
66
|
+
latin: 'latin',
|
|
67
|
+
};
|
|
68
|
+
// #endregion
|
|
69
|
+
// #region inference helpers
|
|
70
|
+
/**
 * turns normalized text into the feature vector a group model consumes.
 *
 * the vector has one slot per vocabulary entry, laid out as unigrams, bigrams,
 * trigrams, then quadgrams. each slot holds the value extractNgrams() reports
 * for that ngram, or 0 when the text does not contain it.
 *
 * @param text normalized text
 * @param ngrams the group's ngram vocabulary
 * @returns float32 input vector matching the model's expected layout
 */
const buildInput = (text, ngrams) => {
	// extract every order up front, shortest first
	const extracted = [1, 2, 3, 4].map((order) => extractNgrams(text, order));
	const vocabularies = [ngrams.unigrams, ngrams.bigrams, ngrams.trigrams, ngrams.quadgrams];

	const features = [];
	vocabularies.forEach((vocabulary, idx) => {
		const found = extracted[idx];
		for (const gram of vocabulary) {
			features.push(found[gram] || 0);
		}
	});

	return Float32Array.from(features);
};
|
|
90
|
+
// #endregion
|
|
91
|
+
// #region weight loading
|
|
92
|
+
/**
 * fetches one group's binary weights and JSON metadata together, then
 * dequantizes the weights using the layer sizes given in the metadata.
 *
 * @param source URLs for the group's weight and metadata files
 * @param quantBits quantization bit width; 6 selects the 6-bit loader, any other value the default loader
 * @returns the loaded model ready for inference
 */
const loadGroup = async (source, quantBits) => {
	const pending = [loadBinary(source.weights), loadJson(source.meta)];
	const [bin, meta] = await Promise.all(pending);

	let weights;
	if (quantBits === 6) {
		weights = loadWeights6(bin, meta.inputSize, meta.outputSize);
	} else {
		weights = loadWeights(bin, meta.inputSize, meta.outputSize);
	}

	return { meta, weights };
};
|
|
106
|
+
// #endregion
|
|
107
|
+
// #region detection
|
|
108
|
+
/**
 * creates a detector for a specific weight variant.
 *
 * call initialize() once to load every group's weights, then call detect()
 * synchronously for each input text.
 *
 * detect() always returns detections sorted by probability, highest first.
 * characters are bucketed by script family, and each family contributes in
 * proportion to its share of the classified characters:
 * - single-language scripts (Korean, Georgian, ...) map directly to their language
 * - kana implies Japanese, and any Han characters alongside kana count as Japanese too
 * - Han without kana is reported as Mandarin Chinese
 * - Cyrillic, Arabic, Devanagari and Latin run the group's model, scaled by the share
 *
 * @param sources record of group names to their weight/meta file URLs
 * @param quantBits quantization bit width (default 8)
 * @returns detector with initialize() and detect() methods
 */
export const create = (sources, quantBits = 8) => {
	let models = null;

	/** orders detections by descending probability, in place. */
	const byProbability = (detections) => detections.sort((a, b) => b[1] - a[1]);

	const initialize = async () => {
		const entries = Object.entries(sources);
		const loaded = await Promise.all(entries.map(([, source]) => loadGroup(source, quantBits)));

		// publish only once every group has loaded, so a failed load leaves the detector uninitialized
		const next = {};
		entries.forEach(([name], i) => {
			next[name] = loaded[i];
		});
		models = next;
	};

	const detect = (text) => {
		if (!models) {
			throw new Error(`call initialize() first`);
		}

		// classify characters by script family
		const scriptCounts = new Map();
		let totalClassified = 0;

		for (let i = 0; i < text.length; i++) {
			const cp = text.codePointAt(i);

			// astral characters occupy two UTF-16 units; step over the trailing surrogate
			if (cp > 0xffff) {
				i++;
			}

			const family = classifyCodepoint(cp);
			if (family) {
				scriptCounts.set(family, (scriptCounts.get(family) || 0) + 1);
				totalClassified++;
			}
		}

		// no classified characters: fall back to latin, still honoring the sort order
		if (totalClassified === 0) {
			return byProbability(detectGroup(text, 'latin', models));
		}

		const kanaCount = scriptCounts.get('cjk_kana') || 0;
		const hanCount = scriptCounts.get('cjk_han') || 0;

		const results = [];

		for (const [family, count] of scriptCounts) {
			const proportion = count / totalClassified;

			// unique script languages: use proportion directly as probability
			const uniqueLang = UNIQUE_SCRIPT_MAP[family];
			if (uniqueLang) {
				results.push([uniqueLang, proportion]);
				continue;
			}

			// CJK: kana implies Japanese, and Han next to kana is Japanese text too,
			// so both counts feed the Japanese share instead of dropping the Han share
			if (family === 'cjk_kana') {
				results.push(['jpn', (kanaCount + hanCount) / totalClassified]);
				continue;
			}
			if (family === 'cjk_han') {
				// Han-only text is reported as Chinese; with kana it was already counted above
				if (kanaCount === 0) {
					results.push(['cmn', proportion]);
				}
				continue;
			}

			// NN group: run model and scale by proportion
			const groupName = SCRIPT_TO_GROUP[family];
			if (groupName && models[groupName]) {
				results.push(...detectGroup(text, groupName, models, proportion));
			}
		}

		// nothing produced (e.g. the only script present has no loaded model): fall back to latin
		if (results.length === 0) {
			return byProbability(detectGroup(text, 'latin', models));
		}

		return byProbability(results);
	};

	return { initialize, detect };
};
|
|
188
|
+
/**
 * scores the text with one group's model and pairs each of the group's
 * languages with its model output multiplied by `proportion`.
 *
 * @param text raw input text
 * @param groupName key into the loaded models
 * @param models loaded model records
 * @param proportion script proportion to scale probabilities by
 * @returns detections for this group, in the order the group's metadata lists its languages
 * @throws if no model is loaded under `groupName`
 */
const detectGroup = (text, groupName, models, proportion = 1) => {
	const model = models[groupName];
	if (!model) {
		throw new Error(`weights not loaded for group '${groupName}'`);
	}

	const { meta, weights } = model;
	const features = buildInput(normalize(text), meta.ngrams);
	const scores = forward(features, weights);

	const detections = [];
	for (let i = 0; i < meta.langs.length; i++) {
		detections.push([meta.langs[i], scores[i] * proportion]);
	}
	return detections;
};
|
|
208
|
+
// #endregion
|
|
209
|
+
//# sourceMappingURL=detect.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"detect.js","sourceRoot":"","sources":["../../src/nn/detect.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,QAAQ,EAAE,MAAM,OAAO,CAAC;AAE7C,OAAO,EAAE,OAAO,EAAE,WAAW,EAAE,YAAY,EAAqB,MAAM,cAAc,CAAC;AACrF,OAAO,EAAE,SAAS,EAAE,aAAa,EAAE,MAAM,gBAAgB,CAAC;AA4D1D;;;;;GAKG;AACH,MAAM,iBAAiB,GAAG,CAAC,EAAU,EAAuB,EAAE;IAC7D,iBAAiB;IACjB,IAAI,CAAC,EAAE,IAAI,MAAM,IAAI,EAAE,IAAI,MAAM,CAAC,IAAI,CAAC,EAAE,IAAI,MAAM,IAAI,EAAE,IAAI,MAAM,CAAC,EAAE,CAAC;QACtE,OAAO,QAAQ,CAAC;IACjB,CAAC;IACD,IAAI,CAAC,EAAE,IAAI,MAAM,IAAI,EAAE,IAAI,MAAM,CAAC,IAAI,CAAC,EAAE,IAAI,MAAM,IAAI,EAAE,IAAI,MAAM,CAAC,EAAE,CAAC;QACtE,OAAO,UAAU,CAAC;IACnB,CAAC;IACD,IAAI,EAAE,IAAI,MAAM,IAAI,EAAE,IAAI,MAAM,EAAE,CAAC;QAClC,OAAO,UAAU,CAAC;IACnB,CAAC;IACD,IAAI,EAAE,IAAI,MAAM,IAAI,EAAE,IAAI,MAAM,EAAE,CAAC;QAClC,OAAO,SAAS,CAAC;IAClB,CAAC;IACD,IAAI,CAAC,EAAE,IAAI,MAAM,IAAI,EAAE,IAAI,MAAM,CAAC,IAAI,CAAC,EAAE,IAAI,MAAM,IAAI,EAAE,IAAI,MAAM,CAAC,EAAE,CAAC;QACtE,OAAO,OAAO,CAAC;IAChB,CAAC;IACD,IAAI,EAAE,IAAI,MAAM,IAAI,EAAE,IAAI,MAAM,EAAE,CAAC;QAClC,OAAO,QAAQ,CAAC;IACjB,CAAC;IAED,MAAM;IACN,IAAI,CAAC,EAAE,IAAI,MAAM,IAAI,EAAE,IAAI,MAAM,CAAC,IAAI,CAAC,EAAE,IAAI,MAAM,IAAI,EAAE,IAAI,MAAM,CAAC,EAAE,CAAC;QACtE,OAAO,UAAU,CAAC;IACnB,CAAC;IACD,IAAI,CAAC,EAAE,IAAI,MAAM,IAAI,EAAE,IAAI,MAAM,CAAC,IAAI,CAAC,EAAE,IAAI,MAAM,IAAI,EAAE,IAAI,MAAM,CAAC,EAAE,CAAC;QACtE,OAAO,SAAS,CAAC;IAClB,CAAC;IAED,YAAY;IACZ,IAAI,EAAE,IAAI,MAAM,IAAI,EAAE,IAAI,MAAM,EAAE,CAAC;QAClC,OAAO,UAAU,CAAC;IACnB,CAAC;IACD,IAAI,CAAC,EAAE,IAAI,MAAM,IAAI,EAAE,IAAI,MAAM,CAAC,IAAI,CAAC,EAAE,IAAI,MAAM,IAAI,EAAE,IAAI,MAAM,CAAC,EAAE,CAAC;QACtE,OAAO,QAAQ,CAAC;IACjB,CAAC;IACD,IAAI,EAAE,IAAI,MAAM,IAAI,EAAE,IAAI,MAAM,EAAE,CAAC;QAClC,OAAO,YAAY,CAAC;IACrB,CAAC;IACD,IAAI,CAAC,EAAE,IAAI,MAAM,IAAI,EAAE,IAAI,MAAM,CAAC,IAAI,CAAC,EAAE,IAAI,MAAM,IAAI,EAAE,IAAI,MAAM,CAAC,IAAI,CAAC,EAAE,IAAI,MAAM,IAAI,EAAE,IAAI,MAAM,CAAC,EAAE,CAAC;QACxG,OAAO,OAAO,CAAC;IAChB,CAAC;IAED,OAAO,IAAI,CAAC;AACb,CAAC,CAAC;AAEF,oEAAoE;AACpE,MAAM,iBAAiB,GAA0C;IAChE,MAAM,EAAE,KAAK;IACb,QAAQ,EAAE,KAAK;IACf,QAA
Q,EAAE,KAAK;IACf,OAAO,EAAE,KAAK;IACd,KAAK,EAAE,KAAK;IACZ,MAAM,EAAE,KAAK;CACb,CAAC;AAEF,8CAA8C;AAC9C,MAAM,eAAe,GAA0C;IAC9D,QAAQ,EAAE,UAAU;IACpB,MAAM,EAAE,QAAQ;IAChB,UAAU,EAAE,YAAY;IACxB,KAAK,EAAE,OAAO;CACd,CAAC;AAEF,aAAa;AAEb,4BAA4B;AAE5B;;;;;;GAMG;AACH,MAAM,UAAU,GAAG,CAAC,IAAY,EAAE,MAAmB,EAAgB,EAAE;IACtE,MAAM,QAAQ,GAAG,aAAa,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC;IACxC,MAAM,OAAO,GAAG,aAAa,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC;IACvC,MAAM,QAAQ,GAAG,aAAa,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC;IACxC,MAAM,SAAS,GAAG,aAAa,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC;IAEzC,MAAM,MAAM,GAAG;QACd,GAAG,MAAM,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,QAAQ,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;QAC/C,GAAG,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;QAC7C,GAAG,MAAM,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,QAAQ,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;QAC/C,GAAG,MAAM,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,SAAS,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;KACjD,CAAC;IAEF,OAAO,IAAI,YAAY,CAAC,MAAM,CAAC,CAAC;AACjC,CAAC,CAAC;AAEF,aAAa;AAEb,yBAAyB;AAEzB;;;;;;GAMG;AACH,MAAM,SAAS,GAAG,KAAK,EAAE,MAAmB,EAAE,SAAiB,EAAuB,EAAE;IACvF,MAAM,CAAC,GAAG,EAAE,OAAO,CAAC,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC,CAAC,UAAU,CAAC,MAAM,CAAC,OAAO,CAAC,EAAE,QAAQ,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAC9F,MAAM,IAAI,GAAG,OAAoB,CAAC;IAElC,MAAM,IAAI,GAAG,SAAS,KAAK,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,WAAW,CAAC;IAC1D,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,EAAE,IAAI,CAAC,SAAS,EAAE,IAAI,CAAC,UAAU,CAAC,CAAC;IAE3D,OAAO,EAAE,IAAI,EAAE,OAAO,EAAE,CAAC;AAC1B,CAAC,CAAC;AAEF,aAAa;AAEb,oBAAoB;AAEpB;;;;;;;;;GASG;AACH,MAAM,CAAC,MAAM,MAAM,GAAG,CAAC,OAAoC,EAAE,SAAS,GAAG,CAAC,EAAY,EAAE;IACvF,IAAI,MAAM,GAAsC,IAAI,CAAC;IAErD,MAAM,UAAU,GAAG,KAAK,IAAI,EAAE;QAC7B,MAAM,OAAO,GAAG,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;QACxC,MAAM,MAAM,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,MAAM,CAAC,EAAE,EAAE,CAAC,SAAS,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC,CAAC,CAAC;QAE5F,MAAM,GAAG,EAAE,CAAC;QACZ,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC
,EAAE,EAAE,CAAC;YACzC,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC;QACnC,CAAC;IACF,CAAC,CAAC;IAEF,MAAM,MAAM,GAAG,CAAC,IAAY,EAAe,EAAE;QAC5C,IAAI,CAAC,MAAM,EAAE,CAAC;YACb,MAAM,IAAI,KAAK,CAAC,yBAAyB,CAAC,CAAC;QAC5C,CAAC;QAED,uCAAuC;QACvC,MAAM,YAAY,GAAG,IAAI,GAAG,EAAwB,CAAC;QACrD,IAAI,eAAe,GAAG,CAAC,CAAC;QAExB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACtC,MAAM,EAAE,GAAG,IAAI,CAAC,WAAW,CAAC,CAAC,CAAE,CAAC;YAChC,wCAAwC;YACxC,IAAI,EAAE,GAAG,MAAM,EAAE,CAAC;gBACjB,CAAC,EAAE,CAAC;YACL,CAAC;YACD,MAAM,MAAM,GAAG,iBAAiB,CAAC,EAAE,CAAC,CAAC;YACrC,IAAI,MAAM,EAAE,CAAC;gBACZ,YAAY,CAAC,GAAG,CAAC,MAAM,EAAE,CAAC,YAAY,CAAC,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;gBAC9D,eAAe,EAAE,CAAC;YACnB,CAAC;QACF,CAAC;QAED,+CAA+C;QAC/C,IAAI,eAAe,KAAK,CAAC,EAAE,CAAC;YAC3B,OAAO,WAAW,CAAC,IAAI,EAAE,OAAO,EAAE,MAAM,CAAC,CAAC;QAC3C,CAAC;QAED,MAAM,OAAO,GAAgB,EAAE,CAAC;QAEhC,KAAK,MAAM,CAAC,MAAM,EAAE,KAAK,CAAC,IAAI,YAAY,EAAE,CAAC;YAC5C,MAAM,UAAU,GAAG,KAAK,GAAG,eAAe,CAAC;YAE3C,mEAAmE;YACnE,MAAM,UAAU,GAAG,iBAAiB,CAAC,MAAM,CAAC,CAAC;YAC7C,IAAI,UAAU,EAAE,CAAC;gBAChB,OAAO,CAAC,IAAI,CAAC,CAAC,UAAU,EAAE,UAAU,CAAC,CAAC,CAAC;gBACvC,SAAS;YACV,CAAC;YAED,wDAAwD;YACxD,IAAI,MAAM,KAAK,UAAU,EAAE,CAAC;gBAC3B,OAAO,CAAC,IAAI,CAAC,CAAC,KAAK,EAAE,UAAU,CAAC,CAAC,CAAC;gBAClC,SAAS;YACV,CAAC;YACD,IAAI,MAAM,KAAK,SAAS,EAAE,CAAC;gBAC1B,gFAAgF;gBAChF,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,UAAU,CAAC,EAAE,CAAC;oBACnC,OAAO,CAAC,IAAI,CAAC,CAAC,KAAK,EAAE,UAAU,CAAC,CAAC,CAAC;gBACnC,CAAC;gBACD,SAAS;YACV,CAAC;YAED,+CAA+C;YAC/C,MAAM,SAAS,GAAG,eAAe,CAAC,MAAM,CAAC,CAAC;YAC1C,IAAI,SAAS,IAAI,MAAM,CAAC,SAAS,CAAC,EAAE,CAAC;gBACpC,MAAM,YAAY,GAAG,WAAW,CAAC,IAAI,EAAE,SAAS,EAAE,MAAM,EAAE,UAAU,CAAC,CAAC;gBACtE,OAAO,CAAC,IAAI,CAAC,GAAG,YAAY,CAAC,CAAC;YAC/B,CAAC;QACF,CAAC;QAED,4EAA4E;QAC5E,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC1B,OAAO,WAAW,CAAC,IAAI,EAAE,OAAO,EAAE,MAAM,CAAC,CAAC;QAC3C,CAAC;QAED,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CA
AC,CAAC,CAAC,CAAC,CAAC,CAAC;QACpC,OAAO,OAAO,CAAC;IAChB,CAAC,CAAC;IAEF,OAAO,EAAE,UAAU,EAAE,MAAM,EAAE,CAAC;AAC/B,CAAC,CAAC;AAEF;;;;;;;;GAQG;AACH,MAAM,WAAW,GAAG,CACnB,IAAY,EACZ,SAAiB,EACjB,MAAkC,EAClC,UAAU,GAAG,CAAC,EACA,EAAE;IAChB,MAAM,KAAK,GAAG,MAAM,CAAC,SAAS,CAAC,CAAC;IAChC,IAAI,CAAC,KAAK,EAAE,CAAC;QACZ,MAAM,IAAI,KAAK,CAAC,iCAAiC,SAAS,GAAG,CAAC,CAAC;IAChE,CAAC;IAED,MAAM,UAAU,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;IACnC,MAAM,KAAK,GAAG,UAAU,CAAC,UAAU,EAAE,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IACxD,MAAM,MAAM,GAAG,OAAO,CAAC,KAAK,EAAE,KAAK,CAAC,OAAO,CAAC,CAAC;IAE7C,MAAM,OAAO,GAAgB,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC,CAAC,GAAG,UAAU,CAAC,CAAC,CAAC;IAC/F,OAAO,OAAO,CAAC;AAChB,CAAC,CAAC;AAEF,aAAa"}
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
/** float32 weights and biases for a linear model (dense → softmax). */
|
|
2
|
+
export type ModelWeights = {
|
|
3
|
+
w: Float32Array; // weight values, stored as float32
|
|
4
|
+
b: Float32Array; // bias values, stored as float32
|
|
5
|
+
inputSize: number; // presumably the number of input features — confirm in forward.ts
|
|
6
|
+
outputSize: number; // presumably the number of output classes — confirm in forward.ts
|
|
7
|
+
};
|
|
8
|
+
/**
|
|
9
|
+
* loads int8 quantized weights from a binary buffer and dequantizes them to float32.
|
|
10
|
+
*
|
|
11
|
+
* binary format: a header of 2 × f32 scales (wScale, bScale), followed by the weight bytes, then the bias bytes.
|
|
12
|
+
*
|
|
13
|
+
* @param bin raw binary weight data, laid out in the binary format described above
|
|
14
|
+
* @param inputSize number of input features
|
|
15
|
+
* @param outputSize number of output classes
|
|
16
|
+
* @returns dequantized model weights, as a ModelWeights record
|
|
17
|
+
*/
|
|
18
|
+
export declare const loadWeights: (bin: ArrayBuffer, inputSize: number, outputSize: number) => ModelWeights;
|
|
19
|
+
/**
|
|
20
|
+
* loads int6 packed quantized weights from a binary buffer and dequantizes them to float32.
|
|
21
|
+
*
|
|
22
|
+
* same header as int8 (2 × f32 scales), but the payload is 6-bit packed. NOTE(review): the bit-packing order is not visible from this declaration — confirm in forward.ts.
|
|
23
|
+
*
|
|
24
|
+
* @param bin raw binary weight data (scales header followed by the 6-bit packed payload)
|
|
25
|
+
* @param inputSize number of input features
|
|
26
|
+
* @param outputSize number of output classes
|
|
27
|
+
* @returns dequantized model weights, as a ModelWeights record
|
|
28
|
+
*/
|
|
29
|
+
export declare const loadWeights6: (bin: ArrayBuffer, inputSize: number, outputSize: number) => ModelWeights;
|
|
30
|
+
/**
|
|
31
|
+
* forward pass for a linear model: dense layer followed by softmax.
|
|
32
|
+
*
|
|
33
|
+
* @param input input feature vector (ngram frequencies); presumably of length m.inputSize — confirm in forward.ts
|
|
34
|
+
* @param m model weights (the ModelWeights shape returned by loadWeights / loadWeights6)
|
|
35
|
+
* @returns output probabilities (one per language in the group)
|
|
36
|
+
*/
|
|
37
|
+
export declare const forward: (input: Float32Array, m: ModelWeights) => Float32Array;
|
|
38
|
+
//# sourceMappingURL=forward.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"forward.d.ts","sourceRoot":"","sources":["../../src/nn/forward.ts"],"names":[],"mappings":"AAEA,4DAA4D;AAC5D,MAAM,MAAM,YAAY,GAAG;IAC1B,CAAC,EAAE,YAAY,CAAC;IAChB,CAAC,EAAE,YAAY,CAAC;IAChB,SAAS,EAAE,MAAM,CAAC;IAClB,UAAU,EAAE,MAAM,CAAC;CACnB,CAAC;AAmEF;;;;;;;;;GASG;AACH,eAAO,MAAM,WAAW,GAAI,KAAK,WAAW,EAAE,WAAW,MAAM,EAAE,YAAY,MAAM,KAAG,YAerF,CAAC;AAEF;;;;;;;;;GASG;AACH,eAAO,MAAM,YAAY,GAAI,KAAK,WAAW,EAAE,WAAW,MAAM,EAAE,YAAY,MAAM,KAAG,YAkBtF,CAAC;AA4BF;;;;;;GAMG;AACH,eAAO,MAAM,OAAO,GAAI,OAAO,YAAY,EAAE,GAAG,YAAY,KAAG,YAa9D,CAAC"}
|