@oomfware/lang-detect 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +14 -0
- package/README.md +68 -0
- package/dist/eval.d.ts +8 -0
- package/dist/eval.d.ts.map +1 -0
- package/dist/eval.js +145 -0
- package/dist/eval.js.map +1 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +20 -0
- package/dist/index.js.map +1 -0
- package/dist/lite.d.ts +3 -0
- package/dist/lite.d.ts.map +1 -0
- package/dist/lite.js +20 -0
- package/dist/lite.js.map +1 -0
- package/dist/nn/detect.d.ts +25 -0
- package/dist/nn/detect.d.ts.map +1 -0
- package/dist/nn/detect.js +209 -0
- package/dist/nn/detect.js.map +1 -0
- package/dist/nn/forward.d.ts +38 -0
- package/dist/nn/forward.d.ts.map +1 -0
- package/dist/nn/forward.js +154 -0
- package/dist/nn/forward.js.map +1 -0
- package/dist/nn/groups.d.ts +23 -0
- package/dist/nn/groups.d.ts.map +1 -0
- package/dist/nn/groups.js +81 -0
- package/dist/nn/groups.js.map +1 -0
- package/dist/nn/load.d.ts +15 -0
- package/dist/nn/load.d.ts.map +1 -0
- package/dist/nn/load.js +21 -0
- package/dist/nn/load.js.map +1 -0
- package/dist/nn/load.node.d.ts +15 -0
- package/dist/nn/load.node.d.ts.map +1 -0
- package/dist/nn/load.node.js +23 -0
- package/dist/nn/load.node.js.map +1 -0
- package/dist/nn/normalize.d.ts +17 -0
- package/dist/nn/normalize.d.ts.map +1 -0
- package/dist/nn/normalize.js +34 -0
- package/dist/nn/normalize.js.map +1 -0
- package/package.json +61 -0
- package/src/eval.ts +173 -0
- package/src/index.ts +22 -0
- package/src/lite.ts +25 -0
- package/src/nn/detect.ts +309 -0
- package/src/nn/forward.ts +181 -0
- package/src/nn/load.node.ts +24 -0
- package/src/nn/load.ts +21 -0
- package/src/nn/normalize.ts +38 -0
- package/weights/lite/arabic.bin +0 -0
- package/weights/lite/arabic.json +1 -0
- package/weights/lite/cyrillic.bin +5 -0
- package/weights/lite/cyrillic.json +1 -0
- package/weights/lite/devanagari.bin +0 -0
- package/weights/lite/devanagari.json +1 -0
- package/weights/lite/latin.bin +5 -0
- package/weights/lite/latin.json +1 -0
- package/weights/standard/arabic.bin +0 -0
- package/weights/standard/arabic.json +1 -0
- package/weights/standard/cyrillic.bin +0 -0
- package/weights/standard/cyrillic.json +1 -0
- package/weights/standard/devanagari.bin +9 -0
- package/weights/standard/devanagari.json +1 -0
- package/weights/standard/latin.bin +0 -0
- package/weights/standard/latin.json +1 -0
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
// #region types
|
|
2
|
+
// #endregion
|
|
3
|
+
// #region dequantization
|
|
4
|
+
/**
 * dequantizes an int8 array back to float32 using its absmax scale.
 *
 * @param data quantized int8 values
 * @param scale the scale factor used during quantization (scaleMax / absmax)
 * @returns dequantized float32 array
 */
const dequantize = (data, scale) => Float32Array.from(data, (value) => value / scale);
/**
 * unpacks 6-bit packed bytes into signed int8 values.
 *
 * packing scheme: 4 values (6 bits each, unsigned offset by +31) → 3 bytes.
 * byte0 = (u0 << 2) | (u1 >> 4)
 * byte1 = ((u1 & 0x0F) << 4) | (u2 >> 2)
 * byte2 = ((u2 & 0x03) << 6) | u3
 *
 * @param packed packed 6-bit data
 * @param count number of original values
 * @returns signed int8 values in [-31, 31]
 */
const unpack6 = (packed, count) => {
    const values = new Int8Array(count);
    // largest multiple of 4 that fits in count
    const whole = count & ~3;
    let src = 0;
    for (let dst = 0; dst < whole; dst += 4, src += 3) {
        const b0 = packed[src];
        const b1 = packed[src + 1];
        const b2 = packed[src + 2];
        values[dst] = (b0 >> 2) - 31;
        values[dst + 1] = (((b0 & 0x03) << 4) | (b1 >> 4)) - 31;
        values[dst + 2] = (((b1 & 0x0f) << 2) | (b2 >> 6)) - 31;
        values[dst + 3] = (b2 & 0x3f) - 31;
    }
    // trailing partial group of 1-3 values
    const leftover = count - whole;
    if (leftover >= 1) {
        values[whole] = (packed[src] >> 2) - 31;
    }
    if (leftover >= 2) {
        values[whole + 1] = (((packed[src] & 0x03) << 4) | (packed[src + 1] >> 4)) - 31;
    }
    if (leftover >= 3) {
        values[whole + 2] = (((packed[src + 1] & 0x0f) << 2) | (packed[src + 2] >> 6)) - 31;
    }
    return values;
};
/**
 * loads int8 quantized weights from a binary buffer and dequantizes to float32.
 *
 * binary format: 2 × f32 scales (wScale, bScale), then weight bytes, then bias bytes.
 *
 * @param bin raw binary weight data
 * @param inputSize number of input features
 * @param outputSize number of output classes
 * @returns dequantized model weights
 */
export const loadWeights = (bin, inputSize, outputSize) => {
    const header = new DataView(bin);
    const weightScale = header.getFloat32(0, true);
    const biasScale = header.getFloat32(4, true);
    const weightCount = outputSize * inputSize;
    return {
        w: dequantize(new Int8Array(bin, 8, weightCount), weightScale),
        b: dequantize(new Int8Array(bin, 8 + weightCount, outputSize), biasScale),
        inputSize,
        outputSize,
    };
};
/**
 * loads int6 packed quantized weights from a binary buffer and dequantizes to float32.
 *
 * same header as int8 (2 × f32 scales), but payload is 6-bit packed.
 *
 * @param bin raw binary weight data
 * @param inputSize number of input features
 * @param outputSize number of output classes
 * @returns dequantized model weights
 */
export const loadWeights6 = (bin, inputSize, outputSize) => {
    const header = new DataView(bin);
    const weightScale = header.getFloat32(0, true);
    const biasScale = header.getFloat32(4, true);
    const weightCount = outputSize * inputSize;
    // 6 bits per value → 3 bytes per 4 values, rounded up
    const packedWeightBytes = Math.ceil((weightCount * 3) / 4);
    const packedBiasBytes = Math.ceil((outputSize * 3) / 4);
    const packedWeights = new Uint8Array(bin, 8, packedWeightBytes);
    const packedBiases = new Uint8Array(bin, 8 + packedWeightBytes, packedBiasBytes);
    return {
        w: dequantize(unpack6(packedWeights, weightCount), weightScale),
        b: dequantize(unpack6(packedBiases, outputSize), biasScale),
        inputSize,
        outputSize,
    };
};
|
|
110
|
+
// #endregion
|
|
111
|
+
// #region forward pass
|
|
112
|
+
/**
 * applies softmax in-place to an output array.
 *
 * @param output logit array to convert to probabilities
 */
const softmax = (output) => {
    let peak = -Infinity;
    for (let i = 0; i < output.length; i++) {
        peak = Math.max(peak, output[i]);
    }
    let total = 0;
    for (let i = 0; i < output.length; i++) {
        // subtract the max before exponentiating for numerical stability;
        // accumulate the stored (f32-rounded) value so the sum matches the array
        output[i] = Math.exp(output[i] - peak);
        total += output[i];
    }
    for (let i = 0; i < output.length; i++) {
        output[i] /= total;
    }
};
/**
 * forward pass for a linear model: dense → softmax.
 *
 * @param input input feature vector (ngram frequencies)
 * @param m model weights
 * @returns output probabilities (one per language in the group)
 */
export const forward = (input, m) => {
    const output = new Float32Array(m.outputSize);
    for (let row = 0; row < m.outputSize; row++) {
        const base = row * m.inputSize;
        let acc = m.b[row];
        for (let col = 0; col < m.inputSize; col++) {
            acc += input[col] * m.w[base + col];
        }
        output[row] = acc;
    }
    softmax(output);
    return output;
};
|
|
153
|
+
// #endregion
|
|
154
|
+
//# sourceMappingURL=forward.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"forward.js","sourceRoot":"","sources":["../../src/nn/forward.ts"],"names":[],"mappings":"AAAA,gBAAgB;AAUhB,aAAa;AAEb,yBAAyB;AAEzB;;;;;;GAMG;AACH,MAAM,UAAU,GAAG,CAAC,IAAe,EAAE,KAAa,EAAgB,EAAE;IACnE,MAAM,MAAM,GAAG,IAAI,YAAY,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IAC7C,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,MAAM,CAAC,CAAC,CAAC,GAAG,IAAI,CAAC,CAAC,CAAC,GAAG,KAAK,CAAC;IAC7B,CAAC;IACD,OAAO,MAAM,CAAC;AACf,CAAC,CAAC;AAEF;;;;;;;;;;;GAWG;AACH,MAAM,OAAO,GAAG,CAAC,MAAkB,EAAE,KAAa,EAAa,EAAE;IAChE,MAAM,MAAM,GAAG,IAAI,SAAS,CAAC,KAAK,CAAC,CAAC;IACpC,IAAI,EAAE,GAAG,CAAC,CAAC;IACX,IAAI,EAAE,GAAG,CAAC,CAAC;IAEX,2BAA2B;IAC3B,MAAM,UAAU,GAAG,CAAC,KAAK,IAAI,CAAC,CAAC,IAAI,CAAC,CAAC;IACrC,OAAO,EAAE,GAAG,UAAU,EAAE,CAAC;QACxB,MAAM,EAAE,GAAG,MAAM,CAAC,EAAE,CAAC,CAAC;QACtB,MAAM,EAAE,GAAG,MAAM,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC;QAC1B,MAAM,EAAE,GAAG,MAAM,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC;QAC1B,MAAM,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,IAAI,CAAC,CAAC,GAAG,EAAE,CAAC;QAC5B,MAAM,CAAC,EAAE,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,EAAE,IAAI,CAAC,CAAC,CAAC,GAAG,EAAE,CAAC;QACvD,MAAM,CAAC,EAAE,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,EAAE,IAAI,CAAC,CAAC,CAAC,GAAG,EAAE,CAAC;QACvD,MAAM,CAAC,EAAE,GAAG,CAAC,CAAC,GAAG,CAAC,EAAE,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAClC,EAAE,IAAI,CAAC,CAAC;QACR,EAAE,IAAI,CAAC,CAAC;IACT,CAAC;IAED,yBAAyB;IACzB,MAAM,GAAG,GAAG,KAAK,GAAG,UAAU,CAAC;IAC/B,IAAI,GAAG,IAAI,CAAC,EAAE,CAAC;QACd,MAAM,CAAC,EAAE,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,IAAI,CAAC,CAAC,GAAG,EAAE,CAAC;IACrC,CAAC;IACD,IAAI,GAAG,IAAI,CAAC,EAAE,CAAC;QACd,MAAM,CAAC,EAAE,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,MAAM,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,GAAG,EAAE,CAAC;IAC5E,CAAC;IACD,IAAI,GAAG,IAAI,CAAC,EAAE,CAAC;QACd,MAAM,CAAC,EAAE,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,MAAM,CAAC,EAAE,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,MAAM,CAAC,EAA
E,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,GAAG,EAAE,CAAC;IAChF,CAAC;IAED,OAAO,MAAM,CAAC;AACf,CAAC,CAAC;AAEF;;;;;;;;;GASG;AACH,MAAM,CAAC,MAAM,WAAW,GAAG,CAAC,GAAgB,EAAE,SAAiB,EAAE,UAAkB,EAAgB,EAAE;IACpG,MAAM,IAAI,GAAG,IAAI,QAAQ,CAAC,GAAG,CAAC,CAAC;IAC/B,MAAM,MAAM,GAAG,IAAI,CAAC,UAAU,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC;IACxC,MAAM,MAAM,GAAG,IAAI,CAAC,UAAU,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC;IAExC,MAAM,KAAK,GAAG,UAAU,GAAG,SAAS,CAAC;IACrC,MAAM,CAAC,GAAG,IAAI,SAAS,CAAC,GAAG,EAAE,CAAC,EAAE,KAAK,CAAC,CAAC;IACvC,MAAM,CAAC,GAAG,IAAI,SAAS,CAAC,GAAG,EAAE,CAAC,GAAG,KAAK,EAAE,UAAU,CAAC,CAAC;IAEpD,OAAO;QACN,CAAC,EAAE,UAAU,CAAC,CAAC,EAAE,MAAM,CAAC;QACxB,CAAC,EAAE,UAAU,CAAC,CAAC,EAAE,MAAM,CAAC;QACxB,SAAS;QACT,UAAU;KACV,CAAC;AACH,CAAC,CAAC;AAEF;;;;;;;;;GASG;AACH,MAAM,CAAC,MAAM,YAAY,GAAG,CAAC,GAAgB,EAAE,SAAiB,EAAE,UAAkB,EAAgB,EAAE;IACrG,MAAM,IAAI,GAAG,IAAI,QAAQ,CAAC,GAAG,CAAC,CAAC;IAC/B,MAAM,MAAM,GAAG,IAAI,CAAC,UAAU,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC;IACxC,MAAM,MAAM,GAAG,IAAI,CAAC,UAAU,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC;IAExC,MAAM,MAAM,GAAG,UAAU,GAAG,SAAS,CAAC;IACtC,MAAM,WAAW,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAChD,MAAM,WAAW,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC,UAAU,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAEpD,MAAM,OAAO,GAAG,IAAI,UAAU,CAAC,GAAG,EAAE,CAAC,EAAE,WAAW,CAAC,CAAC;IACpD,MAAM,OAAO,GAAG,IAAI,UAAU,CAAC,GAAG,EAAE,CAAC,GAAG,WAAW,EAAE,WAAW,CAAC,CAAC;IAElE,OAAO;QACN,CAAC,EAAE,UAAU,CAAC,OAAO,CAAC,OAAO,EAAE,MAAM,CAAC,EAAE,MAAM,CAAC;QAC/C,CAAC,EAAE,UAAU,CAAC,OAAO,CAAC,OAAO,EAAE,UAAU,CAAC,EAAE,MAAM,CAAC;QACnD,SAAS;QACT,UAAU;KACV,CAAC;AACH,CAAC,CAAC;AAEF,aAAa;AAEb,uBAAuB;AAEvB;;;;GAIG;AACH,MAAM,OAAO,GAAG,CAAC,MAAoB,EAAQ,EAAE;IAC9C,IAAI,GAAG,GAAG,CAAC,QAAQ,CAAC;IACpB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACxC,IAAI,MAAM,CAAC,CAAC,CAAC,GAAG,GAAG,EAAE,CAAC;YACrB,GAAG,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC;QACjB,CAAC;IACF,CAAC;IACD,IAAI,MAAM,GAAG,CAAC,CAAC;IACf,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACxC,MAAM,CAAC,CAAC,
CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC;QACtC,MAAM,IAAI,MAAM,CAAC,CAAC,CAAC,CAAC;IACrB,CAAC;IACD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACxC,MAAM,CAAC,CAAC,CAAC,IAAI,MAAM,CAAC;IACrB,CAAC;AACF,CAAC,CAAC;AAEF;;;;;;GAMG;AACH,MAAM,CAAC,MAAM,OAAO,GAAG,CAAC,KAAmB,EAAE,CAAe,EAAgB,EAAE;IAC7E,MAAM,MAAM,GAAG,IAAI,YAAY,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC;IAC9C,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,CAAC,UAAU,EAAE,CAAC,EAAE,EAAE,CAAC;QACvC,IAAI,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QACjB,MAAM,GAAG,GAAG,CAAC,GAAG,CAAC,CAAC,SAAS,CAAC;QAC5B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,CAAC,SAAS,EAAE,CAAC,EAAE,EAAE,CAAC;YACtC,GAAG,IAAI,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC;QAChC,CAAC;QACD,MAAM,CAAC,CAAC,CAAC,GAAG,GAAG,CAAC;IACjB,CAAC;IAED,OAAO,CAAC,MAAM,CAAC,CAAC;IAChB,OAAO,MAAM,CAAC;AACf,CAAC,CAAC;AAEF,aAAa"}
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
/** languages identifiable by unique Unicode script ranges alone (no NN needed). */
|
|
2
|
+
declare const UNIQUE_SCRIPT_LANGS: Record<string, (text: string) => boolean>;
|
|
3
|
+
/**
|
|
4
|
+
* detects CJK languages by script. Japanese is identified by kana; if only
|
|
5
|
+
* Han characters are present, defaults to Mandarin Chinese.
|
|
6
|
+
*
|
|
7
|
+
* @param text input text
|
|
8
|
+
* @returns `'jpn'`, `'cmn'`, or `null` if no CJK script detected
|
|
9
|
+
*/
|
|
10
|
+
declare const detectCJK: (text: string) => string | null;
|
|
11
|
+
/** a script group with its languages and detection test. */
|
|
12
|
+
type Group = {
|
|
13
|
+
langs: string[];
|
|
14
|
+
test: (text: string) => boolean;
|
|
15
|
+
};
|
|
16
|
+
/**
|
|
17
|
+
* script-family groups for NN-based detection.
|
|
18
|
+
* architecture sizes are defined in the weight metadata (.json files).
|
|
19
|
+
*/
|
|
20
|
+
declare const GROUPS: Record<string, Group>;
|
|
21
|
+
export type { Group };
|
|
22
|
+
export { UNIQUE_SCRIPT_LANGS, detectCJK, GROUPS };
|
|
23
|
+
//# sourceMappingURL=groups.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"groups.d.ts","sourceRoot":"","sources":["../../src/nn/groups.ts"],"names":[],"mappings":"AAEA,mFAAmF;AACnF,QAAA,MAAM,mBAAmB,EAAE,MAAM,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,MAAM,KAAK,OAAO,CAOlE,CAAC;AAEF;;;;;;GAMG;AACH,QAAA,MAAM,SAAS,GAAI,MAAM,MAAM,KAAG,MAAM,GAAG,IAQ1C,CAAC;AAMF,4DAA4D;AAC5D,KAAK,KAAK,GAAG;IACZ,KAAK,EAAE,MAAM,EAAE,CAAC;IAChB,IAAI,EAAE,CAAC,IAAI,EAAE,MAAM,KAAK,OAAO,CAAC;CAChC,CAAC;AAEF;;;GAGG;AACH,QAAA,MAAM,MAAM,EAAE,MAAM,CAAC,MAAM,EAAE,KAAK,CAgDjC,CAAC;AAIF,YAAY,EAAE,KAAK,EAAE,CAAC;AACtB,OAAO,EAAE,mBAAmB,EAAE,SAAS,EAAE,MAAM,EAAE,CAAC"}
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
// #region script detection
|
|
2
|
+
/** languages identifiable by unique Unicode script ranges alone (no NN needed). */
const UNIQUE_SCRIPT_LANGS = {
    kor: (text) => /[\uAC00-\uD7AF\u1100-\u11FF]/.test(text), // Hangul syllables + jamo
    kat: (text) => /[\u10A0-\u10FF\u2D00-\u2D2F]/.test(text), // Georgian + supplement
    hye: (text) => /[\u0530-\u058F]/.test(text), // Armenian
    ben: (text) => /[\u0980-\u09FF]/.test(text), // Bengali
    ell: (text) => /[\u0370-\u03FF\u1F00-\u1FFF]/.test(text), // Greek + extended
    heb: (text) => /[\u0590-\u05FF]/.test(text), // Hebrew
};
|
|
11
|
+
/**
 * detects CJK languages by script. Japanese is identified by kana; if only
 * Han characters are present, defaults to Mandarin Chinese.
 *
 * @param text input text
 * @returns `'jpn'`, `'cmn'`, or `null` if no CJK script detected
 */
const detectCJK = (text) => {
    // hiragana or katakana only appears in Japanese
    if (/[\u3040-\u309F\u30A0-\u30FF]/.test(text)) {
        return 'jpn';
    }
    // Han (CJK unified + extension A) without kana: default to Mandarin
    return /[\u4E00-\u9FFF\u3400-\u4DBF]/.test(text) ? 'cmn' : null;
};
|
|
27
|
+
/**
 * script-family groups for NN-based detection.
 * architecture sizes are defined in the weight metadata (.json files).
 */
const GROUPS = {
    cyrillic: {
        langs: ['bel', 'bul', 'kaz', 'mkd', 'rus', 'srp', 'ukr'],
        test: (text) => /[\u0400-\u04FF]/.test(text),
    },
    arabic: {
        langs: ['ara', 'ckb', 'pes'],
        test: (text) => /[\u0600-\u06FF\u0750-\u077F]/.test(text),
    },
    devanagari: {
        langs: ['hin', 'mar'],
        test: (text) => /[\u0900-\u097F]/.test(text),
    },
    latin: {
        langs: [
            'afr', 'aze', 'cat', 'ces', 'dan', 'deu', 'eng', 'est', 'eus', 'fin',
            'fra', 'hau', 'hrv', 'hun', 'ind', 'isl', 'ita', 'lit', 'nld', 'nob',
            'pol', 'por', 'ron', 'run', 'slk', 'spa', 'swe', 'tgl', 'tur', 'vie',
        ],
        test: (text) => /[a-zA-Z\u00C0-\u024F]/.test(text),
    },
};
|
|
80
|
+
export { UNIQUE_SCRIPT_LANGS, detectCJK, GROUPS };
|
|
81
|
+
//# sourceMappingURL=groups.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"groups.js","sourceRoot":"","sources":["../../src/nn/groups.ts"],"names":[],"mappings":"AAAA,2BAA2B;AAE3B,mFAAmF;AACnF,MAAM,mBAAmB,GAA8C;IACtE,GAAG,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,8BAA8B,CAAC,IAAI,CAAC,CAAC,CAAC;IAClD,GAAG,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,8BAA8B,CAAC,IAAI,CAAC,CAAC,CAAC;IAClD,GAAG,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,iBAAiB,CAAC,IAAI,CAAC,CAAC,CAAC;IACrC,GAAG,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,iBAAiB,CAAC,IAAI,CAAC,CAAC,CAAC;IACrC,GAAG,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,8BAA8B,CAAC,IAAI,CAAC,CAAC,CAAC;IAClD,GAAG,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,iBAAiB,CAAC,IAAI,CAAC,CAAC,CAAC;CACrC,CAAC;AAEF;;;;;;GAMG;AACH,MAAM,SAAS,GAAG,CAAC,IAAY,EAAiB,EAAE;IACjD,IAAI,8BAA8B,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;QAC/C,OAAO,KAAK,CAAC;IACd,CAAC;IACD,IAAI,8BAA8B,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;QAC/C,OAAO,KAAK,CAAC;IACd,CAAC;IACD,OAAO,IAAI,CAAC;AACb,CAAC,CAAC;AAYF;;;GAGG;AACH,MAAM,MAAM,GAA0B;IACrC,QAAQ,EAAE;QACT,KAAK,EAAE,CAAC,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,CAAC;QACxD,IAAI,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,iBAAiB,CAAC,IAAI,CAAC,CAAC,CAAC;KACtC;IACD,MAAM,EAAE;QACP,KAAK,EAAE,CAAC,KAAK,EAAE,KAAK,EAAE,KAAK,CAAC;QAC5B,IAAI,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,8BAA8B,CAAC,IAAI,CAAC,CAAC,CAAC;KACnD;IACD,UAAU,EAAE;QACX,KAAK,EAAE,CAAC,KAAK,EAAE,KAAK,CAAC;QACrB,IAAI,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,iBAAiB,CAAC,IAAI,CAAC,CAAC,CAAC;KACtC;IACD,KAAK,EAAE;QACN,KAAK,EAAE;YACN,KAAK;YACL,KAAK;YACL,KAAK;YACL,KAAK;YACL,KAAK;YACL,KAAK;YACL,KAAK;YACL,KAAK;YACL,KAAK;YACL,KAAK;YACL,KAAK;YACL,KAAK;YACL,KAAK;YACL,KAAK;YACL,KAAK;YACL,KAAK;YACL,KAAK;YACL,KAAK;YACL,KAAK;YACL,KAAK;YACL,KAAK;YACL,KAAK;YACL,KAAK;YACL,KAAK;YACL,KAAK;YACL,KAAK;YACL,KAAK;YACL,KAAK;YACL,KAAK;YACL,KAAK;SACL;QACD,IAAI,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,uBAAuB,CAAC,IAAI,CAAC,CAAC,CAAC;KAC5C;CACD,CAAC;AAKF,OAAO,EAAE,mBAAmB,EAAE,SAAS,EAAE,MAAM,EAAE,CAAC"}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* loads binary data from a URL via fetch.
|
|
3
|
+
*
|
|
4
|
+
* @param url URL to fetch
|
|
5
|
+
* @returns the response body as an ArrayBuffer
|
|
6
|
+
*/
|
|
7
|
+
export declare const loadBinary: (url: URL) => Promise<ArrayBuffer>;
|
|
8
|
+
/**
|
|
9
|
+
* loads and parses JSON from a URL via fetch.
|
|
10
|
+
*
|
|
11
|
+
* @param url URL to fetch
|
|
12
|
+
* @returns the parsed JSON value
|
|
13
|
+
*/
|
|
14
|
+
export declare const loadJson: (url: URL) => Promise<unknown>;
|
|
15
|
+
//# sourceMappingURL=load.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"load.d.ts","sourceRoot":"","sources":["../../src/nn/load.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AACH,eAAO,MAAM,UAAU,GAAU,KAAK,GAAG,KAAG,OAAO,CAAC,WAAW,CAG9D,CAAC;AAEF;;;;;GAKG;AACH,eAAO,MAAM,QAAQ,GAAU,KAAK,GAAG,KAAG,OAAO,CAAC,OAAO,CAGxD,CAAC"}
|
package/dist/nn/load.js
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/**
 * loads binary data from a URL via fetch.
 *
 * @param url URL to fetch
 * @returns the response body as an ArrayBuffer
 * @throws {Error} when the request completes with a non-2xx status
 */
export const loadBinary = async (url) => {
    const response = await fetch(url);
    // fetch only rejects on network failure; without this check an HTTP error
    // body (e.g. a 404 page) would be silently decoded as weight data.
    if (!response.ok) {
        throw new Error(`failed to load ${url}: ${response.status}`);
    }
    return response.arrayBuffer();
};
|
|
11
|
+
/**
 * loads and parses JSON from a URL via fetch.
 *
 * @param url URL to fetch
 * @returns the parsed JSON value
 * @throws {Error} when the request completes with a non-2xx status
 */
export const loadJson = async (url) => {
    const response = await fetch(url);
    // fetch only rejects on network failure; surface HTTP errors explicitly
    // instead of attempting to parse an error body as metadata JSON.
    if (!response.ok) {
        throw new Error(`failed to load ${url}: ${response.status}`);
    }
    return response.json();
};
|
|
21
|
+
//# sourceMappingURL=load.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"load.js","sourceRoot":"","sources":["../../src/nn/load.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AACH,MAAM,CAAC,MAAM,UAAU,GAAG,KAAK,EAAE,GAAQ,EAAwB,EAAE;IAClE,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,CAAC,CAAC;IAClC,OAAO,QAAQ,CAAC,WAAW,EAAE,CAAC;AAC/B,CAAC,CAAC;AAEF;;;;;GAKG;AACH,MAAM,CAAC,MAAM,QAAQ,GAAG,KAAK,EAAE,GAAQ,EAAoB,EAAE;IAC5D,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,CAAC,CAAC;IAClC,OAAO,QAAQ,CAAC,IAAI,EAAE,CAAC;AACxB,CAAC,CAAC"}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* loads binary data from a file URL using node:fs.
|
|
3
|
+
*
|
|
4
|
+
* @param url file URL to load
|
|
5
|
+
* @returns the file contents as an ArrayBuffer
|
|
6
|
+
*/
|
|
7
|
+
export declare const loadBinary: (url: URL) => Promise<ArrayBuffer>;
|
|
8
|
+
/**
|
|
9
|
+
* loads and parses JSON from a file URL using node:fs.
|
|
10
|
+
*
|
|
11
|
+
* @param url file URL to load
|
|
12
|
+
* @returns the parsed JSON value
|
|
13
|
+
*/
|
|
14
|
+
export declare const loadJson: (url: URL) => Promise<unknown>;
|
|
15
|
+
//# sourceMappingURL=load.node.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"load.node.d.ts","sourceRoot":"","sources":["../../src/nn/load.node.ts"],"names":[],"mappings":"AAGA;;;;;GAKG;AACH,eAAO,MAAM,UAAU,GAAU,KAAK,GAAG,KAAG,OAAO,CAAC,WAAW,CAG9D,CAAC;AAEF;;;;;GAKG;AACH,eAAO,MAAM,QAAQ,GAAU,KAAK,GAAG,KAAG,OAAO,CAAC,OAAO,CAGxD,CAAC"}
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import { readFileSync } from 'node:fs';
|
|
2
|
+
import { fileURLToPath } from 'node:url';
|
|
3
|
+
/**
 * loads binary data from a file URL using node:fs.
 *
 * @param url file URL to load
 * @returns the file contents as an ArrayBuffer
 */
export const loadBinary = async (url) => {
    const bytes = readFileSync(fileURLToPath(url));
    // slice out the exact byte range: a Node Buffer can be a view into a
    // larger shared allocation, so its .buffer may contain unrelated data.
    const { buffer, byteOffset, byteLength } = bytes;
    return buffer.slice(byteOffset, byteOffset + byteLength);
};
|
|
13
|
+
/**
 * loads and parses JSON from a file URL using node:fs.
 *
 * @param url file URL to load
 * @returns the parsed JSON value
 */
export const loadJson = async (url) => {
    const raw = readFileSync(fileURLToPath(url), 'utf-8');
    return JSON.parse(raw);
};
|
|
23
|
+
//# sourceMappingURL=load.node.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"load.node.js","sourceRoot":"","sources":["../../src/nn/load.node.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,SAAS,CAAC;AACvC,OAAO,EAAE,aAAa,EAAE,MAAM,UAAU,CAAC;AAEzC;;;;;GAKG;AACH,MAAM,CAAC,MAAM,UAAU,GAAG,KAAK,EAAE,GAAQ,EAAwB,EAAE;IAClE,MAAM,MAAM,GAAG,YAAY,CAAC,aAAa,CAAC,GAAG,CAAC,CAAC,CAAC;IAChD,OAAO,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,UAAU,EAAE,MAAM,CAAC,UAAU,GAAG,MAAM,CAAC,UAAU,CAAC,CAAC;AACtF,CAAC,CAAC;AAEF;;;;;GAKG;AACH,MAAM,CAAC,MAAM,QAAQ,GAAG,KAAK,EAAE,GAAQ,EAAoB,EAAE;IAC5D,MAAM,OAAO,GAAG,YAAY,CAAC,aAAa,CAAC,GAAG,CAAC,EAAE,OAAO,CAAC,CAAC;IAC1D,OAAO,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;AAC5B,CAAC,CAAC"}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* normalizes text for ngram extraction: lowercases, strips non-letter/non-mark
|
|
3
|
+
* characters, collapses whitespace, and pads with spaces.
|
|
4
|
+
*
|
|
5
|
+
* @param text raw input text
|
|
6
|
+
* @returns normalized text padded with leading/trailing spaces
|
|
7
|
+
*/
|
|
8
|
+
export declare const normalize: (text: string) => string;
|
|
9
|
+
/**
|
|
10
|
+
* extracts ngram frequencies from a string.
|
|
11
|
+
*
|
|
12
|
+
* @param text normalized text (from {@link normalize})
|
|
13
|
+
* @param length ngram length (1 for unigrams, 2 for bigrams, etc.)
|
|
14
|
+
* @returns map of ngram string to its relative frequency (count / total)
|
|
15
|
+
*/
|
|
16
|
+
export declare const extractNgrams: (text: string, length: number) => Record<string, number>;
|
|
17
|
+
//# sourceMappingURL=normalize.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"normalize.d.ts","sourceRoot":"","sources":["../../src/nn/normalize.ts"],"names":[],"mappings":"AAIA;;;;;;GAMG;AACH,eAAO,MAAM,SAAS,GAAI,MAAM,MAAM,KAAG,MAExC,CAAC;AAEF;;;;;;GAMG;AACH,eAAO,MAAM,aAAa,GAAI,MAAM,MAAM,EAAE,QAAQ,MAAM,KAAG,MAAM,CAAC,MAAM,EAAE,MAAM,CAejF,CAAC"}
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
// hoisted so each regex is compiled once, not per call
const HYPHEN_RE = /-+/g;
const NON_LETTER_RE = /[^\p{L}\p{M}\s]/gu;
const MULTI_SPACE_RE = /\s{2,}/g;
/**
 * normalizes text for ngram extraction: lowercases, strips non-letter/non-mark
 * characters, collapses whitespace, and pads with spaces.
 *
 * @param text raw input text
 * @returns normalized text padded with leading/trailing spaces
 */
export const normalize = (text) => {
    const cleaned = text
        .replace(HYPHEN_RE, ' ') // hyphenated compounds become separate words
        .replace(NON_LETTER_RE, '') // keep letters, combining marks, whitespace
        .replace(MULTI_SPACE_RE, ' ')
        .toLowerCase()
        .trim();
    // pad so word-boundary ngrams (' a', 'b ') are captured
    return ` ${cleaned} `;
};
|
|
14
|
+
/**
 * extracts ngram frequencies from a string.
 *
 * @param text normalized text (from {@link normalize})
 * @param length ngram length (1 for unigrams, 2 for bigrams, etc.)
 * @returns map of ngram string to its relative frequency (count / total)
 */
export const extractNgrams = (text, length) => {
    const counts = {};
    const last = text.length - length;
    let total = 0;
    // count every window of `length` characters
    for (let start = 0; start <= last; start++) {
        const gram = text.slice(start, start + length);
        counts[gram] = (counts[gram] ?? 0) + 1;
        total += 1;
    }
    // convert raw counts to relative frequencies
    for (const gram of Object.keys(counts)) {
        counts[gram] /= total;
    }
    return counts;
};
|
|
34
|
+
//# sourceMappingURL=normalize.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"normalize.js","sourceRoot":"","sources":["../../src/nn/normalize.ts"],"names":[],"mappings":"AAAA,MAAM,SAAS,GAAG,KAAK,CAAC;AACxB,MAAM,aAAa,GAAG,mBAAmB,CAAC;AAC1C,MAAM,cAAc,GAAG,SAAS,CAAC;AAEjC;;;;;;GAMG;AACH,MAAM,CAAC,MAAM,SAAS,GAAG,CAAC,IAAY,EAAU,EAAE;IACjD,OAAO,IAAI,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE,GAAG,CAAC,CAAC,OAAO,CAAC,aAAa,EAAE,EAAE,CAAC,CAAC,OAAO,CAAC,cAAc,EAAE,GAAG,CAAC,CAAC,WAAW,EAAE,CAAC,IAAI,EAAE,GAAG,CAAC;AACzH,CAAC,CAAC;AAEF;;;;;;GAMG;AACH,MAAM,CAAC,MAAM,aAAa,GAAG,CAAC,IAAY,EAAE,MAAc,EAA0B,EAAE;IACrF,MAAM,MAAM,GAA2B,EAAE,CAAC;IAC1C,IAAI,KAAK,GAAG,CAAC,CAAC;IAEd,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,GAAG,MAAM,EAAE,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QACvD,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,CAAC;QACxC,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC;QACzC,KAAK,EAAE,CAAC;IACT,CAAC;IAED,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;QAC5B,MAAM,CAAC,KAAK,CAAC,IAAI,KAAK,CAAC;IACxB,CAAC;IAED,OAAO,MAAM,CAAC;AACf,CAAC,CAAC"}
|
package/package.json
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@oomfware/lang-detect",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "natural language detection library",
|
|
5
|
+
"license": "0BSD",
|
|
6
|
+
"repository": {
|
|
7
|
+
"type": "git",
|
|
8
|
+
"url": "https://codeberg.org/oomfware/lang-detect"
|
|
9
|
+
},
|
|
10
|
+
"files": [
|
|
11
|
+
"dist/",
|
|
12
|
+
"src/",
|
|
13
|
+
"weights/",
|
|
14
|
+
"!src/**/*.bench.ts",
|
|
15
|
+
"!src/**/*.test.ts"
|
|
16
|
+
],
|
|
17
|
+
"type": "module",
|
|
18
|
+
"types": "./dist/index.d.ts",
|
|
19
|
+
"imports": {
|
|
20
|
+
"#load": {
|
|
21
|
+
"source": "./src/nn/load.node.ts",
|
|
22
|
+
"node": "./dist/nn/load.node.js",
|
|
23
|
+
"default": "./dist/nn/load.js"
|
|
24
|
+
}
|
|
25
|
+
},
|
|
26
|
+
"exports": {
|
|
27
|
+
".": {
|
|
28
|
+
"types": "./dist/index.d.ts",
|
|
29
|
+
"default": "./dist/index.js"
|
|
30
|
+
},
|
|
31
|
+
"./lite": {
|
|
32
|
+
"types": "./dist/lite.d.ts",
|
|
33
|
+
"default": "./dist/lite.js"
|
|
34
|
+
},
|
|
35
|
+
"./package.json": "./package.json"
|
|
36
|
+
},
|
|
37
|
+
"publishConfig": {
|
|
38
|
+
"access": "public"
|
|
39
|
+
},
|
|
40
|
+
"devDependencies": {
|
|
41
|
+
"@types/node": "^25.3.3",
|
|
42
|
+
"bumpp": "^10.4.1",
|
|
43
|
+
"lande": "^1.0.10",
|
|
44
|
+
"oxfmt": "^0.36.0",
|
|
45
|
+
"oxlint": "^1.51.0",
|
|
46
|
+
"typescript": "^5.9.3"
|
|
47
|
+
},
|
|
48
|
+
"scripts": {
|
|
49
|
+
"build": "tsc",
|
|
50
|
+
"dev": "tsc --watch",
|
|
51
|
+
"typecheck": "tsc --noEmit",
|
|
52
|
+
"fmt": "oxfmt",
|
|
53
|
+
"lint": "oxlint",
|
|
54
|
+
"eval": "node --conditions source src/eval.ts",
|
|
55
|
+
"test": "node --test --conditions source src/**/*.test.ts",
|
|
56
|
+
"download-dataset": "cd train && bash download_dataset.sh",
|
|
57
|
+
"export": "pnpm run export:standard && pnpm run export:lite",
|
|
58
|
+
"export:standard": "cd train && uv run export.py -e pruned_mega_aug50 -o ../weights/standard",
|
|
59
|
+
"export:lite": "cd train && uv run export.py -e pruned_mega_aug50 -o ../weights/lite --quant-bits 6"
|
|
60
|
+
}
|
|
61
|
+
}
|
package/src/eval.ts
ADDED
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* evaluate detection accuracy against the UDHR dataset.
|
|
3
|
+
*
|
|
4
|
+
* usage:
|
|
5
|
+
* node --conditions source src/eval.ts [--lite] [--lande]
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import fs from 'node:fs';
|
|
9
|
+
import path from 'node:path';
|
|
10
|
+
import { parseArgs } from 'node:util';
|
|
11
|
+
|
|
12
|
+
import { create } from './nn/detect.ts';
|
|
13
|
+
|
|
14
|
+
const { values: args } = parseArgs({
|
|
15
|
+
options: {
|
|
16
|
+
lite: { type: 'boolean', default: false },
|
|
17
|
+
lande: { type: 'boolean', default: false },
|
|
18
|
+
},
|
|
19
|
+
});
|
|
20
|
+
|
|
21
|
+
const variant = args.lite ? 'lite' : 'standard';
|
|
22
|
+
const quantBits = args.lite ? 6 : 8;
|
|
23
|
+
|
|
24
|
+
const weightsDir = path.resolve(import.meta.dirname!, '..', 'weights', variant);
|
|
25
|
+
const { initialize, detect } = create(
|
|
26
|
+
{
|
|
27
|
+
cyrillic: {
|
|
28
|
+
weights: new URL(`file://${path.join(weightsDir, 'cyrillic.bin')}`),
|
|
29
|
+
meta: new URL(`file://${path.join(weightsDir, 'cyrillic.json')}`),
|
|
30
|
+
},
|
|
31
|
+
arabic: {
|
|
32
|
+
weights: new URL(`file://${path.join(weightsDir, 'arabic.bin')}`),
|
|
33
|
+
meta: new URL(`file://${path.join(weightsDir, 'arabic.json')}`),
|
|
34
|
+
},
|
|
35
|
+
devanagari: {
|
|
36
|
+
weights: new URL(`file://${path.join(weightsDir, 'devanagari.bin')}`),
|
|
37
|
+
meta: new URL(`file://${path.join(weightsDir, 'devanagari.json')}`),
|
|
38
|
+
},
|
|
39
|
+
latin: {
|
|
40
|
+
weights: new URL(`file://${path.join(weightsDir, 'latin.bin')}`),
|
|
41
|
+
meta: new URL(`file://${path.join(weightsDir, 'latin.json')}`),
|
|
42
|
+
},
|
|
43
|
+
},
|
|
44
|
+
quantBits,
|
|
45
|
+
);
|
|
46
|
+
|
|
47
|
+
// ── UDHR code → ISO 639-3 mapping ──
|
|
48
|
+
|
|
49
|
+
// Maps a UDHR declaration file code (the filename stem under
// train/resources/udhr/declaration) to the ISO 639-3 language code the
// detector is expected to return. Several UDHR codes carry a script,
// orthography, or region suffix that collapses to a single base language
// (e.g. por_BR / por_PT → por, srp_cyrl / srp_latn → srp, deu_1996 → deu).
const UDHR_CODE_TO_LANG: Record<string, string> = {
  afr: 'afr',
  bel: 'bel',
  ben: 'ben',
  bul: 'bul',
  cat: 'cat',
  ces: 'ces',
  ckb: 'ckb',
  cmn_hans: 'cmn',
  dan: 'dan',
  deu_1996: 'deu',
  ell_monotonic: 'ell',
  eng: 'eng',
  eus: 'eus',
  fin: 'fin',
  fra: 'fra',
  hau_NG: 'hau',
  heb: 'heb',
  hin: 'hin',
  hrv: 'hrv',
  hun: 'hun',
  hye: 'hye',
  ind: 'ind',
  isl: 'isl',
  ita: 'ita',
  jpn: 'jpn',
  kat: 'kat',
  kaz: 'kaz',
  kor: 'kor',
  lit: 'lit',
  mar: 'mar',
  mkd: 'mkd',
  nld: 'nld',
  nob: 'nob',
  pes_1: 'pes',
  pol: 'pol',
  por_BR: 'por',
  por_PT: 'por',
  ron_2006: 'ron',
  run: 'run',
  rus: 'rus',
  slk: 'slk',
  spa: 'spa',
  srp_cyrl: 'srp',
  srp_latn: 'srp',
  swe: 'swe',
  tgl: 'tgl',
  tur: 'tur',
  ukr: 'ukr',
  vie: 'vie',
};
|
|
100
|
+
|
|
101
|
+
const TAG_RE = /<[^>]+>/g;
|
|
102
|
+
|
|
103
|
+
// ── load UDHR sentences ──
|
|
104
|
+
|
|
105
|
+
const declDir = path.resolve(import.meta.dirname!, '..', 'train', 'resources', 'udhr', 'declaration');
|
|
106
|
+
const sentences: { lang: string; text: string }[] = [];
|
|
107
|
+
|
|
108
|
+
for (const [code, lang] of Object.entries(UDHR_CODE_TO_LANG)) {
|
|
109
|
+
const htmlFile = path.join(declDir, `${code}.html`);
|
|
110
|
+
if (!fs.existsSync(htmlFile)) {
|
|
111
|
+
continue;
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
const content = fs.readFileSync(htmlFile, 'utf-8');
|
|
115
|
+
for (const match of content.matchAll(/<p>(.*?)<\/p>/gs)) {
|
|
116
|
+
const text = match[1].replace(TAG_RE, '').trim();
|
|
117
|
+
if (text.length < 10) {
|
|
118
|
+
continue;
|
|
119
|
+
}
|
|
120
|
+
sentences.push({ lang, text });
|
|
121
|
+
}
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
// ── helpers ──
|
|
125
|
+
|
|
126
|
+
type Stats = { pass: number; total: number };
|
|
127
|
+
|
|
128
|
+
const evaluate = (name: string, detectFn: (text: string) => string | undefined) => {
|
|
129
|
+
const perLang: Record<string, Stats> = {};
|
|
130
|
+
let totalPass = 0;
|
|
131
|
+
|
|
132
|
+
for (const { lang, text } of sentences) {
|
|
133
|
+
perLang[lang] ??= { pass: 0, total: 0 };
|
|
134
|
+
perLang[lang].total++;
|
|
135
|
+
|
|
136
|
+
if (detectFn(text) === lang) {
|
|
137
|
+
perLang[lang].pass++;
|
|
138
|
+
totalPass++;
|
|
139
|
+
}
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
const overallAcc = (totalPass / sentences.length) * 100;
|
|
143
|
+
|
|
144
|
+
console.log(`\n=== ${name} ===`);
|
|
145
|
+
console.log(`${sentences.length} sentences, ${Object.keys(perLang).length} languages`);
|
|
146
|
+
console.log(`overall accuracy: ${overallAcc.toFixed(2)}%`);
|
|
147
|
+
|
|
148
|
+
const sorted = Object.entries(perLang).sort((a, b) => a[1].pass / a[1].total - b[1].pass / b[1].total);
|
|
149
|
+
for (const [lang, stats] of sorted) {
|
|
150
|
+
const acc = (stats.pass / stats.total) * 100;
|
|
151
|
+
if (acc < 100) {
|
|
152
|
+
console.log(` ${lang}: ${acc.toFixed(1)}% (${stats.total})`);
|
|
153
|
+
}
|
|
154
|
+
}
|
|
155
|
+
};
|
|
156
|
+
|
|
157
|
+
// ── evaluate ──
|
|
158
|
+
|
|
159
|
+
await initialize();
|
|
160
|
+
|
|
161
|
+
evaluate(`UDHR: ${variant} (${quantBits}-bit)`, (text) => {
|
|
162
|
+
const result = detect(text);
|
|
163
|
+
return result[0]?.[0];
|
|
164
|
+
});
|
|
165
|
+
|
|
166
|
+
if (args.lande) {
|
|
167
|
+
// eslint-disable-next-line @typescript-eslint/no-require-imports
|
|
168
|
+
const { default: lande } = await import('lande');
|
|
169
|
+
evaluate('UDHR: lande', (text) => {
|
|
170
|
+
const result = lande(text);
|
|
171
|
+
return result?.[0]?.[0];
|
|
172
|
+
});
|
|
173
|
+
}
|