fr-spell 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json ADDED
@@ -0,0 +1,50 @@
1
+ {
2
+ "name": "fr-spell",
3
+ "description": "An npm library to convert derived forms of French nouns, verbs, and adjectives to their lemmas, or vice versa.",
4
+ "version": "1.0.1",
5
+ "type": "module",
6
+ "main": "./src/frspell.js",
7
+ "exports": {
8
+ ".": "./src/frspell.js"
9
+ },
10
+ "scripts": {
11
+ "help": "node scripts/help.js",
12
+ "test": "node test/test.js",
13
+ "benchmark:prepare": "node benchmark/generate-checklists.js",
14
+ "benchmark": "node benchmark/run-benchmark.js",
15
+ "benchmark:lemma": "node benchmark/run-benchmark.js lemma",
16
+ "benchmark:noun": "node benchmark/run-benchmark.js noun",
17
+ "benchmark:verb": "node benchmark/run-benchmark.js verb",
18
+ "benchmark:adje": "node benchmark/run-benchmark.js adje"
19
+ },
20
+ "repository": {
21
+ "type": "git",
22
+ "url": "https://github.com/davychxn/FR-SPELL"
23
+ },
24
+ "keywords": [
25
+ "french",
26
+ "Le français",
27
+ "word lemma",
28
+ "word derivative",
29
+ "lemmatization",
30
+ "conjugation",
31
+ "spelling",
32
+ "french noun",
33
+ "french verb",
34
+ "french adjective"
35
+ ],
36
+ "homepage": "https://github.com/davychxn/FR-SPELL",
37
+ "license": "MIT",
38
+ "funding": {
39
+ "type": "patreon",
40
+ "url": "https://www.patreon.com/davychxn"
41
+ },
42
+ "author": {
43
+ "name": "Davy Chen",
44
+ "email": "davy.chen@163.com",
45
+ "url": "https://www.linkedin.com/in/davychxn/"
46
+ },
47
+ "dependencies": {
48
+ "onnxruntime-node": "^1.24.3"
49
+ }
50
+ }
@@ -0,0 +1,54 @@
1
// Help text grouped by section. Each inner array is one section (heading first);
// sections are separated by a single blank line when printed.
const helpSections = [
  [
    'FR-SPELL Help',
    '============',
  ],
  [
    'Create predictor:',
    " import { FrSpell } from 'fr-spell';",
    ' const predictor = await FrSpell();',
  ],
  [
    'API methods:',
    ' lemma(input)',
    ' nounDerive(lemma, person)',
    ' adjeDerive(lemma, person)',
    ' verbDerive(lemma, person, mode, tense)',
    ' derive(lemma, wordType, person, mode, tense)',
  ],
  [
    'Parameter descriptions:',
    ' input : inflected or conjugated surface form (example: mangeons)',
    ' lemma : base form (example: manger)',
    ' wordType : NOUN | ADJE | VERB',
    ' mode/tense : only needed for verbDerive and derive with wordType=VERB',
  ],
  [
    'Allowed person values:',
    ' FST (1st person singular)',
    ' SND (2nd person singular)',
    ' THD_M (3rd person masculine singular)',
    ' THD_F (3rd person feminine singular)',
    ' FST_PL (1st person plural)',
    ' SND_PL (2nd person plural)',
    ' THD_PLM (3rd person masculine plural)',
    ' THD_PLF (3rd person feminine plural)',
  ],
  [
    'Allowed mode values:',
    ' INDI (indicative)',
    ' SUBJ (subjunctive)',
    ' COND (conditional)',
    ' PART (participle)',
    ' IMPE (imperative)',
    ' INFI (infinitive)',
  ],
  [
    'Supported tense values in this implementation:',
    ' PRES (present), IMPA (imperfect), FUTU (future), PASS (past)',
  ],
  [
    'Notes:',
    ' - noun/adje derive do not require mode or tense in user input',
    ' - for verbs, use one of PRES/IMPA/FUTU/PASS in tense',
  ],
  [
    'Examples:',
    " await predictor.lemma('mangeons');",
    " await predictor.nounDerive('chat', 'THD_PLF');",
    " await predictor.adjeDerive('beau', 'THD_F');",
    " await predictor.verbDerive('manger', 'FST_PL', 'INDI', 'PRES');",
  ],
];

// A double newline between sections yields exactly one empty line.
const helpText = helpSections
  .map((section) => section.join('\n'))
  .join('\n\n');

console.log(helpText);
package/src/frspell.js ADDED
@@ -0,0 +1,9 @@
1
+ /**
2
+ * FR-SPELL source entry
3
+ * Author: Davy Chen <davy.chen@163.com>
4
+ * Profile: https://www.linkedin.com/in/davychxn/
5
+ */
6
+
7
// Public entry point: expose the FrSpell factory implemented in the predictor module.
export { FrSpell } from './module/Predictor.js';
@@ -0,0 +1,416 @@
1
+ /**
2
+ * FR-SPELL core predictor implementation
3
+ * Author: Davy Chen <davy.chen@163.com>
4
+ * Profile: https://www.linkedin.com/in/davychxn/
5
+ */
6
+
7
+ import fs from 'node:fs/promises';
8
+ import path from 'node:path';
9
+ import { fileURLToPath } from 'node:url';
10
+ import * as ort from 'onnxruntime-node';
11
+
12
/**
 * Locate the largest element of an indexable numeric sequence.
 *
 * Ties resolve to the earliest position because only a strictly greater
 * value replaces the current best.
 *
 * @param {ArrayLike<number>} arr - Values to scan.
 * @returns {{index: number, value: number}} Position and value of the maximum;
 *   for an empty sequence this is `{ index: 0, value: undefined }`.
 */
function argmax(arr) {
  const best = { index: 0, value: arr[0] };
  for (let pos = 1, end = arr.length; pos < end; pos += 1) {
    const candidate = arr[pos];
    if (!(candidate > best.value)) continue;
    best.index = pos;
    best.value = candidate;
  }
  return best;
}
23
+
24
/**
 * Softmax probability of a single entry of a logit vector.
 *
 * The maximum logit is subtracted before exponentiating so large logits do
 * not overflow.
 *
 * @param {Iterable<number> & ArrayLike<number>} logits - Raw scores.
 * @param {number} idx - Position whose probability is wanted.
 * @returns {number} exp(logits[idx]) / sum(exp(logits)), computed stably.
 */
function softmaxAt(logits, idx) {
  let peak = -Infinity;
  for (const score of logits) {
    peak = score > peak ? score : peak;
  }

  let denominator = 0;
  for (const score of logits) {
    denominator += Math.exp(score - peak);
  }

  const numerator = Math.exp(logits[idx] - peak);
  return numerator / denominator;
}
35
+
36
/**
 * Pack a rectangular array of integers into a 2-D int64 ONNX Runtime tensor.
 *
 * The column count is taken from the first row; every row is read up to
 * that width and written in row-major order.
 *
 * @param {Array<ArrayLike<number>>} rows - Non-empty list of equally long rows.
 * @returns {ort.Tensor} Tensor of type 'int64' with dims [rows.length, rows[0].length].
 */
function makeInt64Tensor2D(rows) {
  const rowCount = rows.length;
  const colCount = rows[0].length;
  const flat = new BigInt64Array(rowCount * colCount);

  for (let r = 0; r < rowCount; r += 1) {
    const row = rows[r];
    const base = r * colCount;
    for (let c = 0; c < colCount; c += 1) {
      flat[base + c] = BigInt(row[c]);
    }
  }

  return new ort.Tensor('int64', flat, [rowCount, colCount]);
}
49
+
50
/**
 * Pack a list of integers into a 1-D int64 ONNX Runtime tensor.
 *
 * @param {ArrayLike<number>} values - Integers to store.
 * @returns {ort.Tensor} Tensor of type 'int64' with dims [values.length].
 */
function makeInt64Tensor1D(values) {
  const count = values.length;
  const packed = new BigInt64Array(count);
  let pos = 0;
  while (pos < count) {
    packed[pos] = BigInt(values[pos]);
    pos += 1;
  }
  return new ort.Tensor('int64', packed, [count]);
}
57
+
58
/**
 * Invert an index-to-token list into a token-to-index lookup.
 *
 * When a token occurs more than once, the last position wins.
 *
 * @param {ArrayLike<string>} itos - Tokens ordered by id.
 * @returns {Object<string, number>} Map from token text to its id.
 */
function toStoi(itos) {
  const lookup = {};
  let position = 0;
  while (position < itos.length) {
    lookup[itos[position]] = position;
    position += 1;
  }
  return lookup;
}
63
+
64
/**
 * Turn a sequence of token ids back into text.
 *
 * Decoding stops at the first end-of-sequence token. Padding, begin-of-sequence
 * and unknown tokens are dropped. Ids that have no entry in `itos` (for example
 * when the vocabulary file is shorter than the model's output layer) are also
 * dropped instead of leaking the literal text "undefined" into the result.
 *
 * @param {Iterable<number>} ids - Token ids in generation order.
 * @param {ArrayLike<string>} itos - Token text indexed by id.
 * @param {{pad: string, bos: string, eos: string, unk: string}} specials - Special token texts.
 * @returns {string} Concatenation of the remaining token texts.
 */
function decodeTextFromIds(ids, itos, specials) {
  let out = '';
  for (const id of ids) {
    const tok = itos[id];
    if (tok === specials.eos) break;
    // Out-of-vocabulary id: nothing meaningful to append.
    if (tok === undefined) continue;
    if (tok === specials.pad || tok === specials.bos || tok === specials.unk) continue;
    out += tok;
  }
  return out;
}
74
+
75
/**
 * Canonicalise an enum-like option into trimmed upper case.
 *
 * @param {*} value - Caller-supplied option; only non-blank strings are used.
 * @param {*} fallback - Returned unchanged when `value` is not a non-blank string.
 * @returns {*} The trimmed, upper-cased string, or `fallback`.
 */
function normalizeEnumToken(value, fallback) {
  if (typeof value !== 'string') return fallback;
  const trimmed = value.trim();
  return trimmed.length > 0 ? trimmed.toUpperCase() : fallback;
}
81
+
82
// Directory containing this module, derived from its file URL.
const MODULE_DIR = path.dirname(fileURLToPath(import.meta.url));
// Two directory levels above this module.
const PACKAGE_ROOT = path.resolve(MODULE_DIR, '..', '..');

// Default model and vocabulary files, resolved to absolute paths under PACKAGE_ROOT.
const DEFAULT_MODEL_PATHS = Object.fromEntries(
  Object.entries({
    lemmaModelPath: 'models/small/lemma_type_model.int8.onnx',
    lemmaVocabPath: 'models/small/lemma_type_vocab.json',
    lemmaLabelsPath: 'models/small/lemma_type_labels.json',
    derivativeModelPath: 'models/small/derive_form_model.int8.onnx',
    derivativeVocabPath: 'models/small/derive_form_vocab.json',
  }).map(([optionName, relativeFile]) => [
    optionName,
    path.resolve(PACKAGE_ROOT, relativeFile),
  ]),
);
92
+
93
/**
 * Create a predictor that maps a surface form to its lemma and word type.
 *
 * Reads the vocabulary and label JSON files, opens an ONNX Runtime inference
 * session on the model, and returns greedy-decoding helpers around it. The
 * model is fed the inputs `src`, `src_len` and `tgt_in` (each only if the
 * session declares it) and must produce `token_logits` and `code_logits`.
 *
 * @param {object} config
 * @param {string} config.modelPath - Path to the ONNX model file.
 * @param {string} config.vocabPath - Path to a JSON file providing `itos` and `id_to_code`.
 * @param {string} config.labelsPath - Path to a JSON file; `code_char_to_name` and
 *   `max_decode_len` are read from it when present.
 * @param {number} [config.maxDecodeLen] - Maximum number of decoding steps; falls back
 *   to `labels.max_decode_len`, then 32.
 * @param {string[]} [config.executionProviders] - Execution providers; defaults to `['cpu']`.
 * @returns {Promise<{predict: Function, lemma: Function, predictBatch: Function, metadata: object}>}
 * @throws {Error} If the vocabulary has no `<bos>` or `<eos>` token.
 * @throws {RangeError} If the effective decode limit is not a positive number.
 */
export async function createLemmaTypePredictor({
  modelPath,
  vocabPath,
  labelsPath,
  maxDecodeLen,
  executionProviders,
}) {
  const resolvedModel = path.resolve(modelPath);
  const resolvedVocab = path.resolve(vocabPath);
  const resolvedLabels = path.resolve(labelsPath);

  const [vocabRaw, labelsRaw] = await Promise.all([
    fs.readFile(resolvedVocab, 'utf-8'),
    fs.readFile(resolvedLabels, 'utf-8'),
  ]);

  const vocab = JSON.parse(vocabRaw);
  const labels = JSON.parse(labelsRaw);

  const itos = vocab.itos;
  const stoi = toStoi(itos);

  // JSON object keys are strings; re-key the code table by numeric class id.
  const idToCode = {};
  for (const [k, v] of Object.entries(vocab.id_to_code)) {
    idToCode[Number(k)] = v;
  }
  const codeCharToName = labels.code_char_to_name || {};

  const PAD = '<pad>';
  const BOS = '<bos>';
  const EOS = '<eos>';
  const UNK = '<unk>';

  const bosId = stoi[BOS];
  const eosId = stoi[EOS];
  const unkId = stoi[UNK];

  // <bos> and <eos> are written into every request; without them the first
  // prediction would fail deep inside tensor construction with an opaque
  // BigInt conversion error, so reject the vocabulary up front.
  if (bosId === undefined || eosId === undefined) {
    throw new Error('Invalid lemma vocab: missing special tokens <bos>/<eos>.');
  }

  const decodeLimit = maxDecodeLen || labels.max_decode_len || 32;
  // The decoding loop must run at least once, otherwise there are no
  // code logits to classify the word type from.
  if (!(decodeLimit > 0)) {
    throw new RangeError(`maxDecodeLen must be a positive number, got ${String(decodeLimit)}.`);
  }

  const session = await ort.InferenceSession.create(resolvedModel, {
    executionProviders: executionProviders || ['cpu'],
    graphOptimizationLevel: 'all',
  });
  const inputNameSet = new Set(session.inputNames);

  /** Encode a word one code point at a time and append <eos>. */
  function encodeWord(word) {
    const ids = [];
    for (const ch of word) {
      const id = stoi[ch] ?? unkId;
      if (id === undefined) {
        throw new Error(
          `Cannot encode character ${JSON.stringify(ch)}: not in vocab and vocab has no <unk> token.`,
        );
      }
      ids.push(id);
    }
    ids.push(eosId);
    return ids;
  }

  const specials = {
    pad: PAD,
    bos: BOS,
    eos: EOS,
    unk: UNK,
  };

  /**
   * Greedily decode the lemma of `word` and classify its word type.
   *
   * @param {string} word - Surface form to analyse.
   * @returns {Promise<{input: string, lemma: string, wordType: string, confidence: number, timeMs: number}>}
   *   `confidence` is the softmax probability of the chosen word-type class.
   */
  async function predict(word) {
    const t0 = performance.now();
    const srcIds = encodeWord(word);
    const srcTensor = makeInt64Tensor2D([srcIds]);
    const srcLenTensor = makeInt64Tensor1D([srcIds.length]);

    const tgt = [bosId];
    let codeLogits = null;
    const lemmaTokenIds = [];

    for (let step = 0; step < decodeLimit; step += 1) {
      // The whole target prefix is re-submitted at every step.
      const tgtTensor = makeInt64Tensor2D([tgt]);
      const outputs = await session.run({
        ...(inputNameSet.has('src') ? { src: srcTensor } : {}),
        ...(inputNameSet.has('src_len') ? { src_len: srcLenTensor } : {}),
        ...(inputNameSet.has('tgt_in') ? { tgt_in: tgtTensor } : {}),
      });

      const token = outputs.token_logits;
      const code = outputs.code_logits;
      // Keep the code logits of the most recent step for the type prediction.
      codeLogits = code.data;

      // token_logits shape: [1, tgt_len, vocab_size]
      const vocabSize = token.dims[2];
      const tgtLen = tgt.length;
      const lastOffset = (tgtLen - 1) * vocabSize;
      const lastStep = token.data.slice(lastOffset, lastOffset + vocabSize);
      const { index: nextId } = argmax(lastStep);

      lemmaTokenIds.push(nextId);
      if (nextId === eosId) break;
      tgt.push(nextId);
    }

    const predLemma = decodeTextFromIds(lemmaTokenIds, itos, specials);
    const codeBest = argmax(codeLogits);
    // Unmapped class ids fall back to code 'A'; unmapped codes to 'UNKNOWN'.
    const predCode = idToCode[codeBest.index] || 'A';
    const codeConf = softmaxAt(codeLogits, codeBest.index);
    const predType = codeCharToName[predCode] || 'UNKNOWN';

    return {
      input: word,
      lemma: predLemma,
      wordType: predType,
      confidence: codeConf,
      timeMs: performance.now() - t0,
    };
  }

  /** Alias of `predict`. */
  async function lemma(word) {
    return predict(word);
  }

  /** Run `predict` over each word in order and collect the results. */
  async function predictBatch(words) {
    const out = [];
    for (const w of words) {
      // Keep sequential to keep implementation simple and deterministic.
      out.push(await predict(w));
    }
    return out;
  }

  return {
    predict,
    lemma,
    predictBatch,
    metadata: {
      modelPath: resolvedModel,
      vocabSize: itos.length,
      decodeLimit,
    },
  };
}
228
+
229
/**
 * Create a predictor that generates an inflected form from a lemma plus
 * grammatical tags.
 *
 * Reads the vocabulary JSON file, opens an ONNX Runtime inference session on
 * the model, and returns greedy-decoding helpers around it. The request is
 * serialised as the text `L:<lemma>|W:<wordType>|P:<person>|M:<mode>|T:<tense>`
 * and encoded one code point at a time.
 *
 * @param {object} config
 * @param {string} config.modelPath - Path to the ONNX model file.
 * @param {string} config.vocabPath - Path to a JSON file providing `itos`.
 * @param {number} [config.maxDecodeLen] - Maximum number of decoding steps; defaults to 48.
 * @param {string[]} [config.executionProviders] - Execution providers; defaults to `['cpu']`.
 * @returns {Promise<{predict: Function, derive: Function, nounDerive: Function,
 *   adjeDerive: Function, verbDerive: Function, metadata: object}>}
 * @throws {Error} If the vocabulary lacks any of `<pad>`, `<bos>`, `<eos>`, `<unk>`.
 * @throws {RangeError} If the effective decode limit is not a positive number.
 */
export async function createDerivativeTypePredictor({
  modelPath,
  vocabPath,
  maxDecodeLen,
  executionProviders,
}) {
  const resolvedModel = path.resolve(modelPath);
  const resolvedVocab = path.resolve(vocabPath);

  const vocabRaw = await fs.readFile(resolvedVocab, 'utf-8');
  const vocab = JSON.parse(vocabRaw);
  const itos = vocab.itos;
  const stoi = toStoi(itos);

  const PAD = '<pad>';
  const BOS = '<bos>';
  const EOS = '<eos>';
  const UNK = '<unk>';

  const padId = stoi[PAD];
  const bosId = stoi[BOS];
  const eosId = stoi[EOS];
  const unkId = stoi[UNK];

  if (padId === undefined || bosId === undefined || eosId === undefined || unkId === undefined) {
    throw new Error('Invalid derive vocab: missing special tokens <pad>/<bos>/<eos>/<unk>.');
  }

  const decodeLimit = maxDecodeLen || 48;
  // The decoding loop must run at least once; otherwise no logits exist and
  // the confidence computation would crash on a null value.
  if (!(decodeLimit > 0)) {
    throw new RangeError(`maxDecodeLen must be a positive number, got ${String(decodeLimit)}.`);
  }

  const session = await ort.InferenceSession.create(resolvedModel, {
    executionProviders: executionProviders || ['cpu'],
    graphOptimizationLevel: 'all',
  });
  const inputNameSet = new Set(session.inputNames);
  // Prefer the output named 'token_logits'; otherwise use the first output.
  const outputTokenName = session.outputNames.includes('token_logits')
    ? 'token_logits'
    : session.outputNames[0];

  const specials = {
    pad: PAD,
    bos: BOS,
    eos: EOS,
    unk: UNK,
  };

  /** Encode text one code point at a time, optionally wrapped in <bos>/<eos>. */
  function encodeText(text, addBos = false, addEos = false) {
    const ids = [];
    if (addBos) ids.push(bosId);
    for (const ch of text) ids.push(stoi[ch] ?? unkId);
    if (addEos) ids.push(eosId);
    return ids;
  }

  /** Serialise the request into the tagged source string fed to the model. */
  function buildSourceText(lemma, wordType, person, mode, tense) {
    return `L:${lemma}|W:${wordType}|P:${person}|M:${mode}|T:${tense}`;
  }

  /**
   * Greedily decode the inflected form for the given lemma and tags.
   *
   * Tags are trimmed and upper-cased. Missing tags default to: wordType
   * 'NONE', person 'ALL', and mode/tense 'ALL' for NOUN/ADJE or 'NONE' for
   * every other word type.
   *
   * @param {string} lemma - Base form.
   * @param {string} wordType - Word type tag (e.g. NOUN, ADJE, VERB).
   * @param {string} [sentencePerson] - Person tag.
   * @param {string} [sentenceMode] - Mode tag.
   * @param {string} [sentenceTense] - Tense tag.
   * @returns {Promise<{lemma: string, wordType: string, person: string, mode: string,
   *   tense: string, output: string, confidence: number, timeMs: number}>}
   *   `confidence` is the softmax probability of the token chosen at the
   *   final decoding step only, not of the whole generated form.
   */
  async function predict(lemma, wordType, sentencePerson, sentenceMode, sentenceTense) {
    const t0 = performance.now();
    const normalizedLemma = String(lemma || '').trim();
    const normalizedWordType = normalizeEnumToken(wordType, 'NONE');
    const normalizedPerson = normalizeEnumToken(sentencePerson, 'ALL');
    const isNounOrAdje = normalizedWordType === 'NOUN' || normalizedWordType === 'ADJE';
    const normalizedMode = isNounOrAdje
      ? normalizeEnumToken(sentenceMode, 'ALL')
      : normalizeEnumToken(sentenceMode, 'NONE');
    const normalizedTense = isNounOrAdje
      ? normalizeEnumToken(sentenceTense, 'ALL')
      : normalizeEnumToken(sentenceTense, 'NONE');

    const source = buildSourceText(
      normalizedLemma,
      normalizedWordType,
      normalizedPerson,
      normalizedMode,
      normalizedTense,
    );

    const srcIds = encodeText(source, false, true);
    const srcTensor = makeInt64Tensor2D([srcIds]);
    const srcLenTensor = makeInt64Tensor1D([srcIds.length]);

    const tokenIds = [];
    let lastStepLogits = null;
    let lastId = 0;
    const tgt = [bosId];

    for (let step = 0; step < decodeLimit; step += 1) {
      // The whole target prefix is re-submitted at every step.
      const tgtTensor = makeInt64Tensor2D([tgt]);
      const outputs = await session.run({
        ...(inputNameSet.has('src') ? { src: srcTensor } : {}),
        ...(inputNameSet.has('src_len') ? { src_len: srcLenTensor } : {}),
        ...(inputNameSet.has('tgt_in') ? { tgt_in: tgtTensor } : {}),
      });

      const token = outputs[outputTokenName];
      // Logits for the last target position sit at the end of the flat buffer.
      const vocabSize = token.dims[2];
      const tgtLen = tgt.length;
      const lastOffset = (tgtLen - 1) * vocabSize;
      lastStepLogits = token.data.slice(lastOffset, lastOffset + vocabSize);
      const { index: nextId } = argmax(lastStepLogits);
      lastId = nextId;

      tokenIds.push(nextId);
      if (nextId === eosId) break;
      tgt.push(nextId);
    }

    const form = decodeTextFromIds(tokenIds, itos, specials);
    // lastId is the argmax of lastStepLogits, so no second scan is needed.
    const confidence = softmaxAt(lastStepLogits, lastId);

    return {
      lemma: normalizedLemma,
      wordType: normalizedWordType,
      person: normalizedPerson,
      mode: normalizedMode,
      tense: normalizedTense,
      output: form,
      confidence,
      timeMs: performance.now() - t0,
    };
  }

  /** `predict` with the word type fixed to 'NOUN'. */
  async function nounDerive(lemma, sentencePerson, sentenceMode, sentenceTense) {
    return predict(lemma, 'NOUN', sentencePerson, sentenceMode, sentenceTense);
  }

  /** `predict` with the word type fixed to 'ADJE'. */
  async function adjeDerive(lemma, sentencePerson, sentenceMode, sentenceTense) {
    return predict(lemma, 'ADJE', sentencePerson, sentenceMode, sentenceTense);
  }

  /** `predict` with the word type fixed to 'VERB'. */
  async function verbDerive(lemma, sentencePerson, sentenceMode, sentenceTense) {
    return predict(lemma, 'VERB', sentencePerson, sentenceMode, sentenceTense);
  }

  return {
    predict,
    derive: predict,
    nounDerive,
    adjeDerive,
    verbDerive,
    metadata: {
      modelPath: resolvedModel,
      vocabSize: itos.length,
      decodeLimit,
    },
  };
}
376
+
377
/**
 * Build the combined FR-SPELL predictor.
 *
 * Creates the lemma predictor and the derivative predictor concurrently and
 * exposes their methods on a single object.
 *
 * @param {object} [options]
 * @param {string} [options.lemmaModelPath] - Overrides the default lemma model file.
 * @param {string} [options.lemmaVocabPath] - Overrides the default lemma vocabulary file.
 * @param {string} [options.lemmaLabelsPath] - Overrides the default lemma labels file.
 * @param {string} [options.derivativeModelPath] - Overrides the default derivative model file.
 * @param {string} [options.derivativeVocabPath] - Overrides the default derivative vocabulary file.
 * @param {number} [options.lemmaMaxDecodeLen] - Decode limit passed to the lemma predictor.
 * @param {number} [options.derivativeMaxDecodeLen] - Decode limit passed to the derivative predictor.
 * @param {string[]} [options.executionProviders] - Execution providers passed to both predictors.
 * @returns {Promise<{lemma: Function, derive: Function, nounDerive: Function,
 *   adjeDerive: Function, verbDerive: Function, metadata: object}>}
 */
export async function FrSpell(options = {}) {
  const {
    lemmaModelPath = DEFAULT_MODEL_PATHS.lemmaModelPath,
    lemmaVocabPath = DEFAULT_MODEL_PATHS.lemmaVocabPath,
    lemmaLabelsPath = DEFAULT_MODEL_PATHS.lemmaLabelsPath,
    derivativeModelPath = DEFAULT_MODEL_PATHS.derivativeModelPath,
    derivativeVocabPath = DEFAULT_MODEL_PATHS.derivativeVocabPath,
    lemmaMaxDecodeLen,
    derivativeMaxDecodeLen,
    executionProviders,
  } = options;

  const lemmaConfig = {
    modelPath: lemmaModelPath,
    vocabPath: lemmaVocabPath,
    labelsPath: lemmaLabelsPath,
    maxDecodeLen: lemmaMaxDecodeLen,
    executionProviders,
  };
  const derivativeConfig = {
    modelPath: derivativeModelPath,
    vocabPath: derivativeVocabPath,
    maxDecodeLen: derivativeMaxDecodeLen,
    executionProviders,
  };

  // Both predictors are started before either is awaited, so they load in parallel.
  const loaded = await Promise.all([
    createLemmaTypePredictor(lemmaConfig),
    createDerivativeTypePredictor(derivativeConfig),
  ]);
  const lemmaPredictor = loaded[0];
  const derivativePredictor = loaded[1];

  const metadata = {
    lemma: lemmaPredictor.metadata,
    derivative: derivativePredictor.metadata,
  };

  return {
    lemma: lemmaPredictor.lemma,
    derive: derivativePredictor.derive,
    nounDerive: derivativePredictor.nounDerive,
    adjeDerive: derivativePredictor.adjeDerive,
    verbDerive: derivativePredictor.verbDerive,
    metadata,
  };
}
package/test/test.js ADDED
@@ -0,0 +1,21 @@
1
import { FrSpell } from '../src/frspell.js';

const predictor = await FrSpell();

// Smoke-test calls as [method name, ...arguments], in execution order.
const smokeCalls = [
  ['lemma', 'mangeons'],
  ['nounDerive', 'chat', 'THD_PLF'],
  ['adjeDerive', 'beau', 'THD_F'],
  ['verbDerive', 'manger', 'FST_PL', 'INDI', 'PRES'],
  ['verbDerive', 'manger', 'SND_PL', 'INDI', 'FUTU'],
  ['verbDerive', 'manger', 'FST_PL', 'INDI', 'PASS'],
  ['verbDerive', 'manger', 'SND', 'SUBJ', 'PRES'],
  ['verbDerive', 'manger', 'THD_PLF', 'PART', 'PASS'],
];

// Run every call to completion first, one after another; print only afterwards.
const results = [];
for (const [method, ...args] of smokeCalls) {
  results.push(await predictor[method](...args));
}

for (const result of results) {
  console.log(result);
}