@oomfware/lang-detect 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. package/LICENSE +14 -0
  2. package/README.md +68 -0
  3. package/dist/eval.d.ts +8 -0
  4. package/dist/eval.d.ts.map +1 -0
  5. package/dist/eval.js +145 -0
  6. package/dist/eval.js.map +1 -0
  7. package/dist/index.d.ts +3 -0
  8. package/dist/index.d.ts.map +1 -0
  9. package/dist/index.js +20 -0
  10. package/dist/index.js.map +1 -0
  11. package/dist/lite.d.ts +3 -0
  12. package/dist/lite.d.ts.map +1 -0
  13. package/dist/lite.js +20 -0
  14. package/dist/lite.js.map +1 -0
  15. package/dist/nn/detect.d.ts +25 -0
  16. package/dist/nn/detect.d.ts.map +1 -0
  17. package/dist/nn/detect.js +209 -0
  18. package/dist/nn/detect.js.map +1 -0
  19. package/dist/nn/forward.d.ts +38 -0
  20. package/dist/nn/forward.d.ts.map +1 -0
  21. package/dist/nn/forward.js +154 -0
  22. package/dist/nn/forward.js.map +1 -0
  23. package/dist/nn/groups.d.ts +23 -0
  24. package/dist/nn/groups.d.ts.map +1 -0
  25. package/dist/nn/groups.js +81 -0
  26. package/dist/nn/groups.js.map +1 -0
  27. package/dist/nn/load.d.ts +15 -0
  28. package/dist/nn/load.d.ts.map +1 -0
  29. package/dist/nn/load.js +21 -0
  30. package/dist/nn/load.js.map +1 -0
  31. package/dist/nn/load.node.d.ts +15 -0
  32. package/dist/nn/load.node.d.ts.map +1 -0
  33. package/dist/nn/load.node.js +23 -0
  34. package/dist/nn/load.node.js.map +1 -0
  35. package/dist/nn/normalize.d.ts +17 -0
  36. package/dist/nn/normalize.d.ts.map +1 -0
  37. package/dist/nn/normalize.js +34 -0
  38. package/dist/nn/normalize.js.map +1 -0
  39. package/package.json +61 -0
  40. package/src/eval.ts +173 -0
  41. package/src/index.ts +22 -0
  42. package/src/lite.ts +25 -0
  43. package/src/nn/detect.ts +309 -0
  44. package/src/nn/forward.ts +181 -0
  45. package/src/nn/load.node.ts +24 -0
  46. package/src/nn/load.ts +21 -0
  47. package/src/nn/normalize.ts +38 -0
  48. package/weights/lite/arabic.bin +0 -0
  49. package/weights/lite/arabic.json +1 -0
  50. package/weights/lite/cyrillic.bin +5 -0
  51. package/weights/lite/cyrillic.json +1 -0
  52. package/weights/lite/devanagari.bin +0 -0
  53. package/weights/lite/devanagari.json +1 -0
  54. package/weights/lite/latin.bin +5 -0
  55. package/weights/lite/latin.json +1 -0
  56. package/weights/standard/arabic.bin +0 -0
  57. package/weights/standard/arabic.json +1 -0
  58. package/weights/standard/cyrillic.bin +0 -0
  59. package/weights/standard/cyrillic.json +1 -0
  60. package/weights/standard/devanagari.bin +9 -0
  61. package/weights/standard/devanagari.json +1 -0
  62. package/weights/standard/latin.bin +0 -0
  63. package/weights/standard/latin.json +1 -0
package/src/index.ts ADDED
@@ -0,0 +1,22 @@
1
+ import { create } from './nn/detect.ts';
2
+
3
+ export type { Detection } from './nn/detect.ts';
4
+
5
/**
 * weight and metadata locations of the standard variant, keyed by group name.
 *
 * every location is written out as a literal `new URL(path, import.meta.url)`
 * expression, resolved relative to this module.
 */
const standardSources = {
	cyrillic: {
		weights: new URL('../weights/standard/cyrillic.bin', import.meta.url),
		meta: new URL('../weights/standard/cyrillic.json', import.meta.url),
	},
	arabic: {
		weights: new URL('../weights/standard/arabic.bin', import.meta.url),
		meta: new URL('../weights/standard/arabic.json', import.meta.url),
	},
	devanagari: {
		weights: new URL('../weights/standard/devanagari.bin', import.meta.url),
		meta: new URL('../weights/standard/devanagari.json', import.meta.url),
	},
	latin: {
		weights: new URL('../weights/standard/latin.bin', import.meta.url),
		meta: new URL('../weights/standard/latin.json', import.meta.url),
	},
};

/**
 * detector backed by the standard weights; the quantization width is left at
 * `create`'s default. call `initialize()` once before calling `detect()`.
 */
export const { initialize, detect } = create(standardSources);
package/src/lite.ts ADDED
@@ -0,0 +1,25 @@
1
+ import { create } from './nn/detect.ts';
2
+
3
+ export type { Detection } from './nn/detect.ts';
4
+
5
/**
 * weight and metadata locations of the lite variant, keyed by group name.
 *
 * every location is written out as a literal `new URL(path, import.meta.url)`
 * expression, resolved relative to this module.
 */
const liteSources = {
	cyrillic: {
		weights: new URL('../weights/lite/cyrillic.bin', import.meta.url),
		meta: new URL('../weights/lite/cyrillic.json', import.meta.url),
	},
	arabic: {
		weights: new URL('../weights/lite/arabic.bin', import.meta.url),
		meta: new URL('../weights/lite/arabic.json', import.meta.url),
	},
	devanagari: {
		weights: new URL('../weights/lite/devanagari.bin', import.meta.url),
		meta: new URL('../weights/lite/devanagari.json', import.meta.url),
	},
	latin: {
		weights: new URL('../weights/lite/latin.bin', import.meta.url),
		meta: new URL('../weights/lite/latin.json', import.meta.url),
	},
};

/** quantization bit width passed to `create` for the lite weight files. */
const liteQuantBits = 6;

/**
 * detector backed by the lite weights. call `initialize()` once before calling
 * `detect()`.
 */
export const { initialize, detect } = create(liteSources, liteQuantBits);
package/src/nn/detect.ts ADDED
@@ -0,0 +1,309 @@
1
+ import { loadBinary, loadJson } from '#load';
2
+
3
+ import { forward, loadWeights, loadWeights6, type ModelWeights } from './forward.ts';
4
+ import { normalize, extractNgrams } from './normalize.ts';
5
+
6
+ // #region types
7
+
8
+ /** a single detection result: ISO 639-3 language code and its probability. */
9
+ export type Detection = [lang: string, probability: number];
10
+
11
+ /** URLs for a single group's weight + metadata files. */
12
+ type GroupSource = {
13
+ weights: URL;
14
+ meta: URL;
15
+ };
16
+
17
+ /** ngram vocabulary lists that define the input vector layout for a group model. */
18
+ type GroupNgrams = {
19
+ unigrams: string[];
20
+ bigrams: string[];
21
+ trigrams: string[];
22
+ quadgrams: string[];
23
+ };
24
+
25
+ /** weight metadata loaded from a group's .json file. */
26
+ type GroupMeta = {
27
+ langs: string[];
28
+ ngrams: GroupNgrams;
29
+ inputSize: number;
30
+ outputSize: number;
31
+ };
32
+
33
+ /** a loaded group model ready for inference. */
34
+ type ReadyModel = {
35
+ meta: GroupMeta;
36
+ weights: ModelWeights;
37
+ };
38
+
39
+ /** returned by {@link create} — call initialize() once, then detect() synchronously. */
40
+ type Detector = {
41
+ initialize: () => Promise<void>;
42
+ detect: (text: string) => Detection[];
43
+ };
44
+
45
+ // #endregion
46
+
47
+ // #region script classification
48
+
49
+ /** script family identifiers for character classification. */
50
+ type ScriptFamily =
51
+ | 'korean'
52
+ | 'georgian'
53
+ | 'armenian'
54
+ | 'bengali'
55
+ | 'greek'
56
+ | 'hebrew'
57
+ | 'cjk_kana'
58
+ | 'cjk_han'
59
+ | 'cyrillic'
60
+ | 'arabic'
61
+ | 'devanagari'
62
+ | 'latin';
63
+
64
/**
 * classifies a character's Unicode codepoint into a script family.
 *
 * checks run in a fixed order: the single-language scripts first, then kana and
 * Han, then the four scripts that are handled by a group model. the ranges are
 * whole Unicode blocks, except for Latin, where the multiplication sign (U+00D7)
 * and division sign (U+00F7) that sit inside the Latin-1 letter range are
 * excluded because they are math symbols rather than letters.
 *
 * @param cp the codepoint to classify
 * @returns the script family, or `null` if not recognized
 */
const classifyCodepoint = (cp: number): ScriptFamily | null => {
	/** true when `cp` lies in the inclusive range [lo, hi]. */
	const within = (lo: number, hi: number): boolean => cp >= lo && cp <= hi;

	// unique scripts
	// Hangul Syllables, Hangul Jamo
	if (within(0xac00, 0xd7af) || within(0x1100, 0x11ff)) {
		return 'korean';
	}
	// Georgian, Georgian Supplement
	if (within(0x10a0, 0x10ff) || within(0x2d00, 0x2d2f)) {
		return 'georgian';
	}
	if (within(0x0530, 0x058f)) {
		return 'armenian';
	}
	if (within(0x0980, 0x09ff)) {
		return 'bengali';
	}
	// Greek and Coptic, Greek Extended
	if (within(0x0370, 0x03ff) || within(0x1f00, 0x1fff)) {
		return 'greek';
	}
	if (within(0x0590, 0x05ff)) {
		return 'hebrew';
	}

	// CJK
	// Hiragana, Katakana
	if (within(0x3040, 0x309f) || within(0x30a0, 0x30ff)) {
		return 'cjk_kana';
	}
	// CJK Unified Ideographs, CJK Unified Ideographs Extension A
	if (within(0x4e00, 0x9fff) || within(0x3400, 0x4dbf)) {
		return 'cjk_han';
	}

	// NN groups
	if (within(0x0400, 0x04ff)) {
		return 'cyrillic';
	}
	// Arabic, Arabic Supplement
	if (within(0x0600, 0x06ff) || within(0x0750, 0x077f)) {
		return 'arabic';
	}
	if (within(0x0900, 0x097f)) {
		return 'devanagari';
	}
	// ASCII letters
	if (within(0x0041, 0x005a) || within(0x0061, 0x007a)) {
		return 'latin';
	}
	// Latin-1 letters through Latin Extended-B, minus the two math signs in that range
	if (within(0x00c0, 0x024f) && cp !== 0x00d7 && cp !== 0x00f7) {
		return 'latin';
	}

	return null;
};
115
+
116
+ /** maps unique script families to their ISO 639-3 language code. */
117
+ const UNIQUE_SCRIPT_MAP: Partial<Record<ScriptFamily, string>> = {
118
+ korean: 'kor',
119
+ georgian: 'kat',
120
+ armenian: 'hye',
121
+ bengali: 'ben',
122
+ greek: 'ell',
123
+ hebrew: 'heb',
124
+ };
125
+
126
+ /** maps script families to NN group names. */
127
+ const SCRIPT_TO_GROUP: Partial<Record<ScriptFamily, string>> = {
128
+ cyrillic: 'cyrillic',
129
+ arabic: 'arabic',
130
+ devanagari: 'devanagari',
131
+ latin: 'latin',
132
+ };
133
+
134
+ // #endregion
135
+
136
+ // #region inference helpers
137
+
138
/**
 * turns normalized text into the feature vector a group model expects.
 *
 * the vector is laid out as the group's unigram vocabulary, then bigrams, then
 * trigrams, then quadgrams; each slot holds the relative frequency of that ngram
 * in the text, or 0 when the ngram does not occur.
 *
 * @param text normalized text
 * @param ngrams the group's ngram vocabulary
 * @returns float32 input vector matching the model's expected layout
 */
const buildInput = (text: string, ngrams: GroupNgrams): Float32Array => {
	// vocabulary lists in vector order, each paired with its ngram length
	const layout: [vocab: string[], length: number][] = [
		[ngrams.unigrams, 1],
		[ngrams.bigrams, 2],
		[ngrams.trigrams, 3],
		[ngrams.quadgrams, 4],
	];

	let size = 0;
	for (const [vocab] of layout) {
		size += vocab.length;
	}

	const input = new Float32Array(size);
	let slot = 0;

	for (const [vocab, length] of layout) {
		const frequencies = extractNgrams(text, length);
		for (const gram of vocab) {
			input[slot++] = frequencies[gram] || 0;
		}
	}

	return input;
};
160
+
161
+ // #endregion
162
+
163
+ // #region weight loading
164
+
165
/**
 * fetches one group's weight file and metadata file in parallel and dequantizes
 * the weights.
 *
 * the metadata is trusted as-is: it is cast to the expected shape without any
 * validation, and its inputSize/outputSize decide how the binary is sliced.
 *
 * @param source URLs for the group's weight and metadata files
 * @param quantBits quantization bit width; 6 selects the packed 6-bit loader, any other value the int8 loader
 * @returns the loaded model ready for inference
 */
const loadGroup = async (source: GroupSource, quantBits: number): Promise<ReadyModel> => {
	const pendingBinary = loadBinary(source.weights);
	const pendingMeta = loadJson(source.meta);
	const [bin, rawMeta] = await Promise.all([pendingBinary, pendingMeta]);

	const meta = rawMeta as GroupMeta;
	const { inputSize, outputSize } = meta;

	const weights =
		quantBits === 6 ? loadWeights6(bin, inputSize, outputSize) : loadWeights(bin, inputSize, outputSize);

	return { meta, weights };
};
181
+
182
+ // #endregion
183
+
184
+ // #region detection
185
+
186
/**
 * creates a detector for a specific weight variant.
 *
 * call initialize() once to load and dequantize every group's weights, then
 * call detect() synchronously for each input text.
 *
 * detect() counts the characters of each recognized script family and turns each
 * family's share of the classified characters into detections:
 * - single-language scripts map straight to their language, with the share as probability
 * - kana implies Japanese; Han characters that appear alongside kana are credited to
 *   Japanese as well, while Han without any kana is reported as Chinese
 * - cyrillic / arabic / devanagari / latin run their group model, with the model's
 *   probabilities scaled by the share
 *
 * the returned list is always sorted by descending probability, including when the
 * latin model is used as the fallback for text with no classified characters.
 *
 * detect() throws if initialize() has not completed, and the latin fallback throws
 * if no 'latin' group was supplied in `sources`.
 *
 * @param sources record of group names to their weight/meta file URLs
 * @param quantBits quantization bit width (default 8)
 * @returns detector with initialize() and detect() methods
 */
export const create = (sources: Record<string, GroupSource>, quantBits = 8): Detector => {
	let models: Record<string, ReadyModel> | null = null;

	/** orders detections from most to least probable. */
	const byProbability = (a: Detection, b: Detection): number => b[1] - a[1];

	const initialize = async (): Promise<void> => {
		const entries = Object.entries(sources);
		const loaded = await Promise.all(entries.map(([, source]) => loadGroup(source, quantBits)));

		// assigned only after every group has loaded, so detect() never sees a partial set
		const ready: Record<string, ReadyModel> = {};
		entries.forEach(([name], i) => {
			ready[name] = loaded[i];
		});
		models = ready;
	};

	const detect = (text: string): Detection[] => {
		if (!models) {
			throw new Error(`call initialize() first`);
		}
		const ready = models;

		// classify characters by script family
		const scriptCounts = new Map<ScriptFamily, number>();
		let totalClassified = 0;

		// for...of walks the string by code point, so astral characters are visited once
		for (const ch of text) {
			const family = classifyCodepoint(ch.codePointAt(0)!);
			if (family) {
				scriptCounts.set(family, (scriptCounts.get(family) || 0) + 1);
				totalClassified++;
			}
		}

		// no classified characters — fallback to latin, sorted like every other result
		if (totalClassified === 0) {
			return detectGroup(text, 'latin', ready).sort(byProbability);
		}

		const kanaCount = scriptCounts.get('cjk_kana') || 0;
		const hanCount = scriptCounts.get('cjk_han') || 0;
		const results: Detection[] = [];

		for (const [family, count] of scriptCounts) {
			const proportion = count / totalClassified;

			// unique script languages — use proportion directly as probability
			const uniqueLang = UNIQUE_SCRIPT_MAP[family];
			if (uniqueLang) {
				results.push([uniqueLang, proportion]);
				continue;
			}

			// CJK — kana implies Japanese, and any Han next to kana is Japanese text too,
			// so its share is added here instead of being dropped
			if (family === 'cjk_kana') {
				results.push(['jpn', (count + hanCount) / totalClassified]);
				continue;
			}
			if (family === 'cjk_han') {
				// Han-only text is reported as Chinese; with kana present it was counted above
				if (kanaCount === 0) {
					results.push(['cmn', proportion]);
				}
				continue;
			}

			// NN group — run model and scale by proportion; groups without loaded weights are skipped
			const groupName = SCRIPT_TO_GROUP[family];
			if (groupName && ready[groupName]) {
				results.push(...detectGroup(text, groupName, ready, proportion));
			}
		}

		// nothing was produced (e.g. the only script present has no loaded group) — fallback to latin
		if (results.length === 0) {
			return detectGroup(text, 'latin', ready).sort(byProbability);
		}

		results.sort(byProbability);
		return results;
	};

	return { initialize, detect };
};
280
+
281
/**
 * scores the text with one group's model.
 *
 * the raw text is normalized, converted to the group's ngram feature vector and
 * pushed through the model; each output probability is paired with the language
 * at the same index in the group's metadata and multiplied by `proportion`.
 *
 * throws when no model is loaded under `groupName`.
 *
 * @param text raw input text
 * @param groupName key into the loaded models
 * @param models loaded model records
 * @param proportion script proportion to scale probabilities by
 * @returns detections for this group, in the metadata's language order
 */
const detectGroup = (
	text: string,
	groupName: string,
	models: Record<string, ReadyModel>,
	proportion = 1,
): Detection[] => {
	const model = models[groupName];
	if (!model) {
		throw new Error(`weights not loaded for group '${groupName}'`);
	}

	const { meta, weights } = model;
	const probabilities = forward(buildInput(normalize(text), meta.ngrams), weights);

	const detections: Detection[] = [];
	meta.langs.forEach((lang, index) => {
		detections.push([lang, probabilities[index] * proportion]);
	});
	return detections;
};
308
+
309
+ // #endregion
package/src/nn/forward.ts ADDED
@@ -0,0 +1,181 @@
1
+ // #region types
2
+
3
+ /** float32 weights for a linear model (dense → softmax). */
4
+ export type ModelWeights = {
5
+ w: Float32Array;
6
+ b: Float32Array;
7
+ inputSize: number;
8
+ outputSize: number;
9
+ };
10
+
11
+ // #endregion
12
+
13
+ // #region dequantization
14
+
15
/**
 * converts quantized int8 values back to float32 by dividing each one by the
 * quantization scale.
 *
 * @param data quantized int8 values
 * @param scale the scale factor used during quantization (scaleMax / absmax)
 * @returns a new float32 array of the same length
 */
const dequantize = (data: Int8Array, scale: number): Float32Array => {
	return Float32Array.from(data, (quantized) => quantized / scale);
};
29
+
30
/**
 * unpacks 6-bit packed bytes into signed int8 values.
 *
 * every 3 packed bytes hold 4 unsigned 6-bit fields, most significant field
 * first; each field is the original value offset by +31:
 * byte0 = (u0 << 2) | (u1 >> 4)
 * byte1 = ((u1 & 0x0F) << 4) | (u2 >> 2)
 * byte2 = ((u2 & 0x03) << 6) | u3
 *
 * a final partial group of 1-3 values only uses the leading fields of its bytes.
 *
 * @param packed packed 6-bit data
 * @param count number of original values
 * @returns `count` signed values, each a 6-bit field minus 31
 */
const unpack6 = (packed: Uint8Array, count: number): Int8Array => {
	const result = new Int8Array(count);

	for (let out = 0, src = 0; out < count; out += 4, src += 3) {
		// glue the group's three bytes into one 24-bit word; fields sit at bits 18, 12, 6, 0
		const word = (packed[src] << 16) | (packed[src + 1] << 8) | packed[src + 2];

		// the last group may carry fewer than 4 values
		const lanes = Math.min(4, count - out);
		for (let lane = 0; lane < lanes; lane++) {
			result[out + lane] = ((word >> (18 - 6 * lane)) & 0x3f) - 31;
		}
	}

	return result;
};
75
+
76
/**
 * reads an int8-quantized model from a binary buffer and dequantizes it to float32.
 *
 * buffer layout: two little-endian f32 scales (weights, then biases), followed by
 * `outputSize * inputSize` weight bytes, followed by `outputSize` bias bytes.
 *
 * @param bin raw binary weight data
 * @param inputSize number of input features
 * @param outputSize number of output classes
 * @returns dequantized model weights
 */
export const loadWeights = (bin: ArrayBuffer, inputSize: number, outputSize: number): ModelWeights => {
	// the two f32 scales occupy the first 8 bytes
	const headerBytes = 8;
	const header = new DataView(bin);
	const weightScale = header.getFloat32(0, true);
	const biasScale = header.getFloat32(4, true);

	const weightCount = outputSize * inputSize;
	const quantizedWeights = new Int8Array(bin, headerBytes, weightCount);
	const quantizedBiases = new Int8Array(bin, headerBytes + weightCount, outputSize);

	const w = dequantize(quantizedWeights, weightScale);
	const b = dequantize(quantizedBiases, biasScale);

	return { w, b, inputSize, outputSize };
};
102
+
103
/**
 * reads a 6-bit packed quantized model from a binary buffer and dequantizes it to float32.
 *
 * the header matches the int8 format (two little-endian f32 scales: weights, then
 * biases). the payload is the packed weights followed by the packed biases, where
 * `n` values take `ceil(n * 3 / 4)` bytes.
 *
 * @param bin raw binary weight data
 * @param inputSize number of input features
 * @param outputSize number of output classes
 * @returns dequantized model weights
 */
export const loadWeights6 = (bin: ArrayBuffer, inputSize: number, outputSize: number): ModelWeights => {
	/** bytes needed to hold `values` 6-bit fields. */
	const packedBytes = (values: number): number => Math.ceil((values * 3) / 4);

	// the two f32 scales occupy the first 8 bytes
	const headerBytes = 8;
	const header = new DataView(bin);
	const weightScale = header.getFloat32(0, true);
	const biasScale = header.getFloat32(4, true);

	const weightCount = outputSize * inputSize;
	const weightBytes = packedBytes(weightCount);

	const packedWeights = new Uint8Array(bin, headerBytes, weightBytes);
	const packedBiases = new Uint8Array(bin, headerBytes + weightBytes, packedBytes(outputSize));

	const w = dequantize(unpack6(packedWeights, weightCount), weightScale);
	const b = dequantize(unpack6(packedBiases, outputSize), biasScale);

	return { w, b, inputSize, outputSize };
};
132
+
133
+ // #endregion
134
+
135
+ // #region forward pass
136
+
137
/**
 * replaces the logits in `output` with their softmax probabilities, in place.
 *
 * the largest logit is subtracted before exponentiating so the exponentials
 * cannot overflow. an empty array is left untouched.
 *
 * @param output logit array to convert to probabilities
 */
const softmax = (output: Float32Array): void => {
	const size = output.length;

	let peak = -Infinity;
	for (const logit of output) {
		if (logit > peak) {
			peak = logit;
		}
	}

	let total = 0;
	for (let k = 0; k < size; k++) {
		output[k] = Math.exp(output[k] - peak);
		// sum the value as stored (rounded to float32), not the double-precision exponential
		total += output[k];
	}

	for (let k = 0; k < size; k++) {
		output[k] = output[k] / total;
	}
};
158
+
159
/**
 * runs the linear model: one dense layer followed by softmax.
 *
 * the weight matrix is stored row-major, one row of `inputSize` weights per
 * output class; each logit is that row's dot product with the input plus the
 * class bias.
 *
 * @param input input feature vector (ngram frequencies)
 * @param m model weights
 * @returns output probabilities (one per language in the group)
 */
export const forward = (input: Float32Array, m: ModelWeights): Float32Array => {
	const { w, b, inputSize, outputSize } = m;
	const output = new Float32Array(outputSize);

	for (let row = 0; row < outputSize; row++) {
		const rowStart = row * inputSize;
		let logit = b[row];
		for (let col = 0; col < inputSize; col++) {
			logit += input[col] * w[rowStart + col];
		}
		output[row] = logit;
	}

	softmax(output);
	return output;
};
180
+
181
+ // #endregion
package/src/nn/load.node.ts ADDED
@@ -0,0 +1,24 @@
1
+ import { readFileSync } from 'node:fs';
2
+ import { fileURLToPath } from 'node:url';
3
+
4
/**
 * reads a file addressed by a file URL into an ArrayBuffer using node:fs.
 *
 * the read itself is synchronous; the function is async only to match the
 * fetch-based loader's signature. the returned buffer is a copy covering
 * exactly the file's bytes.
 *
 * @param url file URL to load
 * @returns the file contents as an ArrayBuffer
 */
export const loadBinary = async (url: URL): Promise<ArrayBuffer> => {
	const file = readFileSync(fileURLToPath(url));

	// copy out only this file's window of the underlying buffer
	const start = file.byteOffset;
	const end = start + file.byteLength;
	return file.buffer.slice(start, end);
};
14
+
15
/**
 * reads a file addressed by a file URL as UTF-8 text and parses it as JSON using node:fs.
 *
 * the read itself is synchronous; the function is async only to match the
 * fetch-based loader's signature. the parsed value is returned unvalidated.
 *
 * @param url file URL to load
 * @returns the parsed JSON value
 */
export const loadJson = async (url: URL): Promise<unknown> => {
	const path = fileURLToPath(url);
	return JSON.parse(readFileSync(path, 'utf-8'));
};
package/src/nn/load.ts ADDED
@@ -0,0 +1,21 @@
1
/**
 * loads binary data from a URL via fetch.
 *
 * fetch only rejects on network failures, so the HTTP status is checked here:
 * without it, the body of an error page (e.g. a 404) would be handed on as if it
 * were the requested file.
 *
 * @param url URL to fetch
 * @returns the response body as an ArrayBuffer
 * @throws Error when the response status is not in the 2xx range
 */
export const loadBinary = async (url: URL): Promise<ArrayBuffer> => {
	const response = await fetch(url);
	if (!response.ok) {
		throw new Error(`failed to load ${url.href}: ${response.status} ${response.statusText}`);
	}
	return response.arrayBuffer();
};
11
+
12
/**
 * loads and parses JSON from a URL via fetch.
 *
 * fetch only rejects on network failures, so the HTTP status is checked here:
 * without it, an error response would either fail with an unrelated JSON parse
 * error or be returned as if it were the requested document.
 *
 * @param url URL to fetch
 * @returns the parsed JSON value, unvalidated
 * @throws Error when the response status is not in the 2xx range
 */
export const loadJson = async (url: URL): Promise<unknown> => {
	const response = await fetch(url);
	if (!response.ok) {
		throw new Error(`failed to load ${url.href}: ${response.status} ${response.statusText}`);
	}
	return response.json();
};
package/src/nn/normalize.ts ADDED
@@ -0,0 +1,38 @@
1
/** runs of hyphens; they separate words, so they are turned into a space. */
const HYPHEN_RE = /-+/g;
/** any character that is not a letter, a combining mark, or whitespace. */
const NON_LETTER_RE = /[^\p{L}\p{M}\s]/gu;
/** any run of whitespace, including a single tab or newline. */
const WHITESPACE_RE = /\s+/g;

/**
 * normalizes text for ngram extraction: lowercases, strips non-letter/non-mark
 * characters, collapses whitespace, and pads with spaces.
 *
 * every run of whitespace becomes exactly one space, so a lone tab or newline
 * between words is also turned into a space instead of being left in the text.
 *
 * @param text raw input text
 * @returns normalized text padded with leading/trailing spaces
 */
export const normalize = (text: string): string => {
	const cleaned = text
		.replace(HYPHEN_RE, ' ')
		.replace(NON_LETTER_RE, '')
		// runs after stripping, so gaps left by removed characters collapse too
		.replace(WHITESPACE_RE, ' ')
		.toLowerCase()
		.trim();

	return ` ${cleaned} `;
};
15
+
16
/**
 * computes the relative frequency of every ngram of the given length in a string.
 *
 * a window of `length` UTF-16 code units slides over the text one unit at a time;
 * each distinct window's count is divided by the number of windows. text shorter
 * than `length` yields an empty record.
 *
 * @param text normalized text (from {@link normalize})
 * @param length ngram length (1 for unigrams, 2 for bigrams, etc.)
 * @returns map of ngram string to its relative frequency (count / total)
 */
export const extractNgrams = (text: string, length: number): Record<string, number> => {
	const frequencies: Record<string, number> = {};
	const lastStart = text.length - length;

	let windows = 0;
	for (let start = 0; start <= lastStart; start++, windows++) {
		const gram = text.slice(start, start + length);
		frequencies[gram] = (frequencies[gram] || 0) + 1;
	}

	// turn raw counts into shares of all windows
	for (const gram of Object.keys(frequencies)) {
		frequencies[gram] /= windows;
	}

	return frequencies;
};
Binary file
@@ -0,0 +1 @@
1
+ {"langs":["ara","ckb","pes"],"ngrams":{"unigrams":[" ","ە","ا","ل","ی","ن","ي","و","د","م","ت","ر","ب","ک","ه","ع","ێ","س","ك","ۆ","ش","أ","ئ","ز","ة","گ","خ","ف","پ","ق","ح","ڵ","آ","ج","ڕ","چ","ّ","e","ط","ى","ژ","ص","ذ","a","غ","إ","i","ض","َ","n","ظ","ث","r","ٔ","ء","ê","ً","ُ","k","ِ","ْ","ؤ","ٍ","b","o","ٌ","ھ","m","ـ","d","t","l","y","s","c","h","â","p","w","u","ٱ","ۀ","ٓ","v","ā","g","f","j","x","z","q","ﻻ","ڤ","ٰ","ۜ","あ","š","а","ﻹ",""],"bigrams":["ال","ە ","ی "," ا","د ","م ","ه ","ة "," د","ي ","ر "," أ"," ل","می","ل ","دە","ند"," ي","ای"," ئ","ست"," ر","ى ","ەم","لە","وو"," آ","بە"," ك","ز ","به","أن","ەر","ید","نە","کا","از","ەن","ري","یا","که","ێت","در","تە","ك ","مە","ب ","هە","ین"," إ","ێک","کن","كا","او","يا","خو","ود","ين","رە","ری"," ی","ده","ير","ەت","تا","اه","ع ","سە","ني","لت","ۆ ","دي","ش ","ول","ەس","نه","هم","ائ","بۆ","بی","لق","دن","ێ ","ەب","یر","كن"," ڕ","یش","بي"," گ","گر","ێن","یم","ۆر","سي","هی","ء ","بێ","تن","تێ","رة","زا","لی","لن","خۆ","شە","شد","ذا","فت","هر","شو","اء","اڵ","نێ","مل","ئا","مه","ەز","یس","فر","رێ","ته","ەد","يم","ێر","عن","نو","ێش","زن","لد","لێ","قد","شم","کس","يس","یگ","رف","ً ","ڕا","يو","خە","گا","ەخ","چە","عد","وز","ۆش","إن","کو","مت"],"trigrams":[" ال"," دە"," می"," لە","ند ","ست "," را","را "," بە"," به","که ","به ","ن ا","است"," اس"," هە"," که","لە ","ێت ","از ","كان"," در"," تۆ"," از","ما ","ای ","ید ","ەی ","در ","ها "," خو"," كا"," دا","ه ا","ين ","می ","نا "," پێ"," بر","ین ","وو ","ود "," بۆ"," یک"," نا"," آن","بوو","یک ","ه ب","امي","رة ","یەک"," کن","کرد"," ای","ا ا","ی ا","دار","ەکا","ه م","سام","ار ","ني ","یت ","ده ","های"," او","این","ت ا","ێک ","د ب","انە"," لي"," لم","بۆ "," ان","ذا ","ر ا","رای","یان"," هم","یم ","کات","دن ","اء ","آن ","بال"," نم","ري ","ەر ","ی م"," هذ","ا م","کان"," عن","ال ","با ","م ا"," نە","دة ","سەر","ی ئ","ارە"," تا","ة م","رە ","میک","نمی","ۆر "," لل","نە ","ير "," سە","ه د","ي م","ەند"," دو"," تو","الب","ێکی","ن ر","اً ","نم ","الن"," ئا","تر 
","ایە","نند","زان","ا د","اد ","مة ","واه","اوە","بود","ی ل","ی ر"," خۆ","ول "," رو","هم ","دوو"," ام","الإ","تە ","اید","خود","كل "," هر","د م"],"quadgrams":[" لە "," را "," الم"," به "," که ","ن ال"," است","است "," بۆ "," از "," در "," یک "," كان"," می ","ئەوە"," این"," کرد"," او "," آن ","برای","كان "," زۆر","رای ","امي ","توان","سامي"," بوو"," سام"," دار","زۆر ","ا ال","ەکان"," نمی"," با ","ت ال","بوو ","را ب","ل ال","های "," الت","ێکی ","این "," میک"," الج","یان ","خواه"," بال","نند ","ارد ","م ال"," خود","ن را","ه ال"," بود"," لم ","مان ","اید "," ما ","دارد"," الب"," الن","د ال","ی دە","میتو","کات ","یتوا"," چیز","ر ال","ویست","ر می","ایە "," الإ","توم ","کرد ","ا می"," توم","انم ","ا به","باید"," هست","ی را"," الش","انە ","شود ","ووە ","ب ال","ه با","ی کن"," سەر"," هیچ"," الا"," بای","بود "," باش"," الو"," هل ","ە کە"," شما","نید ","و ال"]},"inputSize":500,"outputSize":3}
@@ -0,0 +1,5 @@
1
+ �o�?�INB�'u��~'�q��v\�WЁן��ى�)u�aiz`M�RvSe�T}Ԛz�uu�Ug�}�a�Şy�u�ݍ��~'�a�^e��m��e��u��q�u����u�y�y��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}�ڍ���!u5�9��n'!�'aix[m�b��nc^X��_�D�`�^y��^(I�u�fI�^��n&މ�[��zZ(��YigU�`uuPbYqŕv!Q��}�d�U�qVZnH�]�f����ށ��gz&`��Y~f6ana�4օ��m�qhj^�8�}�i��Y��]�a�c�H~W��\�'ڂ)`U�`��T}֥y�ۖG܅�ډf�y� j
2
+ Zy�cU��~X��m�m���y�_���uؖ�j�m���ؕ�獖�m��Yf�i��e�"��`Q�ׂ�Y��y��a����݂'Y��"e��zc}����}�^b� V��'��$u��e����9�u��G�u��y�a�y߆����q�ۉ�"y��e�ߍ���z�mǜ�ޅ�am�Z�硉�υכy�^y��]G�}�!nF�}��n7�u�Q��q� y�Wnj$u����V���������[�����Y��&�e�am�Չ�i�u� e��z$�q暅�i�a���g"e7u�}�]�r%��቙#m�m���}ƙ}�^�x`���r_u&䉖�}��y�$}��n��"���q��ך~$yh���y��u�Xv&�a��q��nv�b8}e^��Xm��]Ơm��q֢z'�jG�GXvG�r7���"a�_r�rށ�Z~b~�z_y��e�"��ug$�v��VrV�}g��ށ�`y7]y��~����h�"y瀉���cvT�tv�L�\A֔m�m�Zq��}��d�\e�]�`r^y�\��ru�)��v�q�u�}�v�y�y�u�y�y��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��]��]�Y��R7�~G�u����=Wm��vWm�WY��m��}����V�x���BG^n�^^Yi���q���cu�au�_a��jW؎2be�#Y��m�Xy7�������TzT؍ i�^m��~�(&yv�v�V�^ yɝ�gnc]�蠥��m��e�N��E�h����6�~ �bV�Yz!��dy���F�ziR�w`^Wc��Xz�~g����֚m�T}�X�֠r���&ݮ�UX�~Tוg�nV�f�z6!y��w�aGڂt��_�����$^ ]ؠ}�ҍɟI�e�ȟ��]�[a��z\y�zQq��J:!t�ڂ�&�A�XyiX���mX!�נf(rV�q�am6ߒ~V_Y� �ّ�ނ' �%�yǟy�!}h��(^5�a��\U���'�V��3���Zm�^�7��5�����W���bW�iե�h��W ���I�uٟ�'�m�lf(}�݁��f7�a��iؘ�9Vf�^W!b�nVh��z���q�Z�b�Ɯ��eU��f����i�Yq�]����h���ar�e�~8%mԎQHq��M�["ԙ� �&Zf(�vG�|�m^8 �ݦۍ�ݒ�^v7�����B�jS�b�v(�v'����I��a����6�u��u�feT�j�!���y��\vڥh�}7aVH�qȠr��q���VW!�h%��"~xQV�m�[��r�a�d�7 �g֑6ܞw�N�䢉Y�'}����~X eh�}�q�)������Q�r'�y�]���uU�u�\q��e��~�ev�W��+uD�m�y(�vuqّuצa�^�g��w��מ���瞅�z���z�y��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}�X���iW�v9ny�Xm��6u�^���y���^mלu��~'�nG����}�&u����]m�}v��Ɨa��u�#�&�~G�e��m��vj�e�g~�y�Xa���zw_q��aǗ�&u� ]�`��^��bm�c~��vS�E_i��m�}W�r7VyƜu�ډƜU�ލW�iV]��a�Y��m٘VU�V��_U��Y�Yr5�r7�y�v$�~�I�n6ߍ�����y�a��b���z�}��awYy�q矅�~�e��}�ڑ�}�i�q�^y�a�#n$r7�WY}ǢuG�iu^��y�ז'�y�\�&[m��iv}��a�\^5�m�}� ]�&v��$Y�]y�[qvY]�^B�Y��nv�u�Wfj�u��m�mלq��y� ���e؛��`�W`uv!y��u��u��q�m�ۍ�Z��Z%�M��q��q�i�\���]��Iu�i����vEm�cF!m�#v�a��ug�}�BVq�a�֞a��z�q��j}vq硁�]���M�܅�du���h�}�_�n�ncu�d}���ay�\m�^q�]j�a��y�z&��!z6�Z(���ߕ�ގ}ǚ��ׁ��uvۅǡvF�vGi�#n�Y؟��r7�֙e� ���i�#�(��azb��c��څ�`�ןv܊!��߆Gq�$zI�]uv�i�]�&�}棅��i�_���vfނj�W�za}��}� 
}���\5���V]~�y��Ux�m��iY]��q��q�\y�݂i7�q�[���}�^z\u�^q�_uǠ��z'�p��u��u�u�y�y�u�y�y��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��ew�n&"���u�"z)"i������]j`���r�y�!e�}ș�w u�O�(�m��q�_m��E�\�^&�z'm��U]yfcz�m��ٍw~F�vgZ'Z����g[y�_v}�]�]b7�u��R r�m��y���(��ނ�B��a�d�&��#Y�T��FWy��q8҆$�N7�� vw]�ՍH��F�a��}�❇�q�U���xjQfbQ֘�f�jW�N'��y*�u�jh���fw!u�\}����vg[eXV���ǡ��_�)�x�_��b�0Ւ6����Ǟm�\^ �]�]��&fH�u����rfZ���n'��\y)[�Ȟ�X�y��b(q��^ڄ���v&ee�����r& m����ߝ�y�\�Ȣ�7ˉ��zH�z&]��bG���^�
3
+ Zr(Xb'e�~`��q�Xi��q�"uF}��zXݙ��� zڍ��z9�'�e�e�^��a�G�i蚆�v5frVf'�vHr7!�����m螊"]� v4WY�0n'��؝Y�[v�ףu�X��ej6���#t����9�hU�^��!� �'��&�^��'��u �5u�>`i�`ifz'V�^��m�� be�]j�擮G�ja�e�Y�a���G��vd^y�fZa��f� ��І&VRXfv'�v:ev:����H�}��i�b�X`~(^u��iؙ���e�ߊfXyw�e�>'�qU��֊F�u�"�� �'�z��\}Ȕ~fq� }�SJ z8]`��JuXa�u�Ս�[uơ�X��i�b�n���\u�_q�^uן1�^v�m�uמu�q�y�y�q�y�y��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}�݆'؉��e�&bM(\�v�f�mX�qW�]������a�\��^q�%b'�E�N�nJT~#vH!��bvaa�#f��4���\]�_e�b^&�VXҊ&�}��q�b�m�gmu!v(Tb:WY��Tm�z�~u����Iءn9�I(dFfa�����%ى�Yj8ֵ�TrG�]�m�����ե����~'ݝ�&Y��e�\eơm�YqXXfI^z�b6�����m�"mf�Mמr��Y��]�_my�a�aq�#��Wuh���m�$M�Z���r}螕��^F�r �]����vwc��fe�ci�!64a��[��%Q�Z:!z�q��Uw#q�aJ7���}�"m�)i��T���M��Vۑ����\e��}��w�Yȡ�d^v��^hyv߉��j b�E�`i�q�cu�br %^'�uS"fUUz�e�\�'���y�ㅦ���rX$�g���Se��n&%^'�uG#j5�i��z(�U����ܕg���܎WI�ai�Xn8]r�e�"a�u����(�W�yȢ�)~'�y�Yr9_xY!q���~�ev΢Vr�v�E��]�`�"��yd\��g�]�ba�U�`�V�i8�ju���c��#9��u�b��aG�u�[�i �&W�u�E�^�h^v��U$y�߉���\egYm�ayw�����Y�`r'E���X�Q��n���Wa�Ff8"r6�u�e�"n�v�a�_yY!qXc���}� r(�����u�N�}Hq�)n�Y��6�z�aauaN�&�e�߅�]�h!gX�=�Sbi՞z6�uy^q�"qǘ��^bg�s�f'�v�q��u�y�u�y�y��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}�^iw�j �~Su��y�m�\a��o\u���]Q�cj%�r7�i�^q�܅����Ԏ'�v���m��Z��q��ywX}��}g�q��Sq�a4ԗq�؉�z7v6�V'�Yɡu�^m؜m��r6Zm��y�&^�ra��y�]����zRtee��}�U�c]��q�]�"z9Yu�؁�^e�=ge�^�Ǚ���q�܆��Y}Ve�mؕ��Y����v]G�M�!��Zy�!��Zu��u��^U�d^�qx҆5 ��y�]��~��}��}ԝu� u�Xu�!r6����&�%��qHT}֠��_�h[��u�aq�z&Zi�}� q�a��q��r�\R'�}(]i���U����e�[��╷�J!�蚉xY}��G��Yu�&u�����u�a�Y�m�#d��i�]jby�}Wv��y�y��]r&�z7����q��~'���ۖaE�!zd~8�y��}��Gm��rG�q��a��z%�u�\u�anG�n9!jт7���^��ߝ�^m��iأb
4
+ vc��ނ6�~��6ȉ��]���BO��n[�X�q��}i�[��m�ցwSu��I6�U�^I�ԉ�$�������$��\R��[��$�4e��j�~j%��ZjG!u��}�� �'�q梞FYa��u��N7���q�bnGea�cm�q��u�u�#a��m��}�nGaq���~fy�!}� qz)`}��}��~'���R}�SV!v!����q��ię��u�v�U7y��f}��y�u��}֛�מ:�y�uhy���rv�y� ���!���}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}��}� bc�(�!�c�6����a�aq��u7��bQ���c���fx$����_���Y�։�#q���HRF�v)u�v`m��~W��re��Sm�u�a�բ��Zm��q�Yy�b�\���^X^gi��u�Q�h�m���Wr��a����Z#b��~_}�VV4ڑ��s#���^�v�i�NS�z_BG�f��آrei�de�j�q�Z�W�qɢ���m�]�Ǘ��!ugdq�)n\y� z]�ߊᩨci��m֗mfւhnq�!a�Yu��rfa�UjFZq��f(ҙ�ߑ� ��Y�vUN��m�dj�rߩ��6`��%Eמ���h�m�}��m��qf`Yiz��U��a�ەv��q��VY�e'�x����}��q�gu�_Y���m��m���!y��}z���h�ٜ��ށ��]֞�~Ӊ���Ve�(�������r6u��z }ǧq���u��a�y�a�ەgQ����!R`q�Wy�\Jhr^�E\�Yq�"b&zDX��^�^��UZ �a�juJ5�z
5
+ �N�R&W�f�iG�Q6}�uw�:8]ix�y�_e�SuG�6ڎ"�hv'���QnU��֦:H�j ��ga}ד���v6Wiy`r�Xz^�ɛ~'!�������%Q�aqV��B "��e��r4�}��g\e8�m���x�m�m��u��} ڊV��4�f8bR �_�~��