fr-spell 1.0.1 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.cn.md +33 -46
- package/README.fr.md +33 -46
- package/README.md +33 -46
- package/frspell.browser.js +10936 -0
- package/models/community/derive_form_model.int8.onnx +0 -0
- package/models/community/lemma_type_model.int8.onnx +0 -0
- package/{src/module → module}/Predictor.js +5 -5
- package/package.json +19 -13
- package/benchmark/checklist_adje_100.json +0 -702
- package/benchmark/checklist_lemma_verb_100.json +0 -402
- package/benchmark/checklist_noun_100.json +0 -702
- package/benchmark/checklist_verb_100.json +0 -702
- package/benchmark/generate-checklists.js +0 -192
- package/benchmark/run-benchmark.js +0 -123
- package/models/small/derive_form_model.int8.onnx +0 -0
- package/models/small/lemma_type_model.int8.onnx +0 -0
- package/test/test.js +0 -21
- /package/{src/frspell.js → index.js} +0 -0
- /package/models/{small → community}/derive_form_vocab.json +0 -0
- /package/models/{small → community}/lemma_type_labels.json +0 -0
- /package/models/{small → community}/lemma_type_vocab.json +0 -0
|
@@ -1,192 +0,0 @@
|
|
|
1
|
-
import fs from 'node:fs/promises';
|
|
2
|
-
import path from 'node:path';
|
|
3
|
-
|
|
4
|
-
// Matches a whole string made only of ASCII letters, Latin-1 accented letters
// (U+00C0–U+00FF minus the × and ÷ signs), straight or curly apostrophes, and hyphens.
// NOTE(review): ligatures outside Latin-1 such as "œ" (U+0153) are not in these ranges,
// so words containing them are rejected — confirm that is intended.
const WORD_RE = /^[A-Za-zÀ-ÖØ-öø-ÿ'’-]+$/;

// Fallback CSV locations, used only when the FRSPELL_TRAINING_CSV / FRSPELL_DERIVE_CSV
// environment variables are unset or empty.
// NOTE(review): these are absolute paths on a specific Windows drive; on any other
// machine the environment variables presumably have to be set — confirm.
const DEFAULT_TRAINING_CSV =
  'F:/davychen/2026doc/work/20260331_coderhome/20260420_dict/WORD-SUGGEST/training.csv';
const DEFAULT_DERIVE_CSV =
  'F:/davychen/2026doc/work/20260331_coderhome/20260420_dict/WORD-SUGGEST/training_derive.csv';

// Output goes to "<current working directory>/benchmark", i.e. it depends on where the
// script is launched from, not on where this file lives.
const ROOT = process.cwd();
const BENCH_DIR = path.join(ROOT, 'benchmark');
|
|
13
|
-
|
|
14
|
-
/**
 * Checks whether a value looks like an ordinary single word.
 *
 * A value qualifies when it is a string whose trimmed form is 2 to 24 UTF-16
 * code units long and matches WORD_RE in full.
 *
 * @param {unknown} value - Candidate value, typically a CSV cell.
 * @returns {boolean} True when the value passes every check; false otherwise,
 *   including for any non-string input.
 */
function isCommonWordLike(value) {
  if (typeof value === 'string') {
    const trimmed = value.trim();
    const { length } = trimmed;
    const hasPlausibleLength = length >= 2 && length <= 24;
    return hasPlausibleLength && WORD_RE.test(trimmed);
  }
  return false;
}
|
|
19
|
-
|
|
20
|
-
/**
 * Parses simple comma-separated text into an array of row objects.
 *
 * The first non-empty line supplies the column names. Every later non-empty
 * line is split on "," and turned into an object keyed by those names, with
 * each cell trimmed. Lines with fewer cells than there are headers are
 * dropped; surplus cells are ignored. Quoting is not interpreted, so a comma
 * inside quotes still splits the cell.
 *
 * @param {string} content - Raw CSV text, with LF or CRLF line endings.
 * @returns {Array<Object<string, string>>} One object per accepted data line,
 *   or an empty array when there is no header line.
 */
function parseCsv(content) {
  const [headerLine, ...dataLines] = content
    .split(/\r?\n/)
    .filter((line) => line.length > 0);
  if (headerLine === undefined) return [];

  const columnNames = headerLine.split(',').map((name) => name.trim());

  const toRecord = (cells) => {
    const record = {};
    columnNames.forEach((name, position) => {
      record[name] = (cells[position] ?? '').trim();
    });
    return record;
  };

  return dataLines
    .map((line) => line.split(','))
    .filter((cells) => cells.length >= columnNames.length)
    .map(toRecord);
}
|
|
40
|
-
|
|
41
|
-
/**
 * Returns a randomly reordered copy of `items` (Fisher–Yates, driven by
 * Math.random). The input itself is left untouched.
 *
 * @param {Iterable<*>} items - Values to shuffle.
 * @returns {Array<*>} A new array holding the same values in random order.
 */
function shuffle(items) {
  const shuffled = [...items];
  let remaining = shuffled.length;

  // Walk from the tail: pick a random slot among the not-yet-fixed prefix and
  // swap it into the last unfixed position.
  while (remaining > 1) {
    const pick = Math.floor(Math.random() * remaining);
    remaining -= 1;
    const held = shuffled[remaining];
    shuffled[remaining] = shuffled[pick];
    shuffled[pick] = held;
  }

  return shuffled;
}
|
|
49
|
-
|
|
50
|
-
/**
 * Draws up to `count` rows at random, without replacement.
 *
 * @param {Array<*>} rows - Pool to sample from; it is not modified.
 * @param {number} count - Maximum number of rows wanted. Values above
 *   `rows.length` return every row (shuffled); zero, negative, or NaN values
 *   return an empty array.
 * @returns {Array<*>} A new array of randomly chosen rows.
 */
function sampleRows(rows, count) {
  // Clamp at 0: a negative end index would make slice() count from the end of
  // the array and hand back rows.length + count items instead of none.
  const size = Math.max(0, Math.min(count, rows.length));
  return shuffle(rows).slice(0, size);
}
|
|
53
|
-
|
|
54
|
-
/**
 * Finds the lemmas that occur in the most rows.
 *
 * @param {Iterable<{lemma: *}>} rows - Rows carrying a `lemma` property.
 * @param {number} topN - How many of the most frequent lemmas to keep.
 * @returns {Set<*>} The `topN` lemmas with the highest row counts, in
 *   descending count order; ties keep first-seen order.
 */
function topFrequentLemmas(rows, topN) {
  const counts = new Map();
  for (const { lemma } of rows) {
    const seenSoFar = counts.get(lemma) ?? 0;
    counts.set(lemma, seenSoFar + 1);
  }

  const ranked = Array.from(counts).sort(([, left], [, right]) => right - left);

  const winners = new Set();
  for (const [lemma] of ranked.slice(0, topN)) {
    winners.add(lemma);
  }
  return winners;
}
|
|
67
|
-
|
|
68
|
-
/**
 * Removes duplicates from `rows`, where two rows are duplicates when
 * `keySelector` returns the same key for both. The first row seen for each
 * key is kept and the original order is preserved.
 *
 * @param {Iterable<*>} rows - Rows to deduplicate.
 * @param {function(*): *} keySelector - Maps a row to its identity key.
 * @returns {Array<*>} A new array holding the first row for each distinct key.
 */
function uniqueBy(rows, keySelector) {
  const knownKeys = new Set();
  const kept = [];

  for (const candidate of rows) {
    const fingerprint = keySelector(candidate);
    if (!knownKeys.has(fingerprint)) {
      knownKeys.add(fingerprint);
      kept.push(candidate);
    }
  }

  return kept;
}
|
|
79
|
-
|
|
80
|
-
/**
 * Builds a random checklist of { input, target } pairs for verb lemmatisation.
 *
 * Each training row's `target` is expected to read "<lemma>_<typeCode>". A row
 * is kept only when both fields are non-empty after trimming, the type code
 * is "F", and both the input and the lemma pass isCommonWordLike. Duplicate
 * input/lemma pairs are collapsed before sampling.
 *
 * @param {Iterable<{input?: string, target?: string}>} trainingRows - Parsed training rows.
 * @param {number} limit - Maximum number of checklist entries to return.
 * @returns {Array<{input: string, target: string}>} Up to `limit` randomly
 *   chosen entries, where `target` is the bare lemma.
 */
function buildVerbLemmaChecklist(trainingRows, limit) {
  const candidates = [];

  for (const { input: rawInput, target: rawTarget } of trainingRows) {
    const input = rawInput?.trim();
    const label = rawTarget?.trim();
    if (!input || !label) continue;

    // The label must contain a "_" that is neither its first nor its last character.
    const separator = label.lastIndexOf('_');
    const hasLemmaPart = separator > 0;
    const hasTypePart = separator !== label.length - 1;
    if (!(hasLemmaPart && hasTypePart)) continue;

    const lemma = label.slice(0, separator);
    // F => VERB in current model label mapping.
    if (label.slice(separator + 1) !== 'F') continue;
    if (!(isCommonWordLike(input) && isCommonWordLike(lemma))) continue;

    candidates.push({ input, target: lemma });
  }

  const distinct = uniqueBy(candidates, ({ input, target }) => `${input}|${target}`);
  return sampleRows(distinct, limit);
}
|
|
104
|
-
|
|
105
|
-
/**
 * Builds a random checklist of inflection test cases for one word type.
 *
 * Rows are kept when their `word_type` equals `wordType`, their `lemma` and
 * `form` both pass isCommonWordLike, and `person`, `mode` and `tense` are all
 * truthy. Of those, only rows whose lemma is among the 2000 most frequent
 * lemmas are used. Exact duplicates are removed before sampling.
 *
 * @param {Array<Object>} rows - Parsed derive rows with `word_type`, `lemma`,
 *   `form`, `person`, `mode` and `tense` properties.
 * @param {string} wordType - Value to match against `row.word_type`.
 * @param {number} limit - Maximum number of checklist entries to return.
 * @returns {Array<{lemma: string, person: string, mode: string, tense: string, target: string}>}
 *   Up to `limit` randomly chosen entries, where `target` is the row's `form`.
 */
function buildDeriveChecklist(rows, wordType, limit) {
  const isUsable = (row) => {
    if (row.word_type !== wordType) return false;
    if (!isCommonWordLike(row.lemma) || !isCommonWordLike(row.form)) return false;
    return Boolean(row.person && row.mode && row.tense);
  };

  const candidates = rows.filter(isUsable);
  const frequentLemmas = topFrequentLemmas(candidates, 2000);

  const entries = [];
  for (const { lemma, person, mode, tense, form } of candidates) {
    if (frequentLemmas.has(lemma)) {
      entries.push({ lemma, person, mode, tense, target: form });
    }
  }

  const identityOf = (entry) =>
    `${entry.lemma}|${entry.person}|${entry.mode}|${entry.tense}|${entry.target}`;

  return sampleRows(uniqueBy(entries, identityOf), limit);
}
|
|
134
|
-
|
|
135
|
-
/**
 * Script entry point: reads the two training CSVs, builds the four benchmark
 * checklists, and writes them as pretty-printed JSON into BENCH_DIR.
 *
 * Environment variables:
 * - FRSPELL_TRAINING_CSV: path of the lemma training CSV (default DEFAULT_TRAINING_CSV).
 * - FRSPELL_DERIVE_CSV: path of the derive training CSV (default DEFAULT_DERIVE_CSV).
 * - FRSPELL_BENCHMARK_SIZE: entries per checklist, a positive integer (default 100).
 *
 * @returns {Promise<void>} Resolves once all four files are written and the
 *   summary has been logged.
 * @throws {Error} When FRSPELL_BENCHMARK_SIZE is not a positive integer, when
 *   any checklist ends up with fewer entries than requested, or when reading
 *   or writing a file fails.
 */
async function main() {
  const trainingCsvPath = process.env.FRSPELL_TRAINING_CSV || DEFAULT_TRAINING_CSV;
  const deriveCsvPath = process.env.FRSPELL_DERIVE_CSV || DEFAULT_DERIVE_CSV;
  const limit = Number(process.env.FRSPELL_BENCHMARK_SIZE || '100');

  // Without this guard a non-numeric size becomes NaN, every "length < limit"
  // comparison below is false, and four empty checklists get written silently.
  if (!Number.isInteger(limit) || limit < 1) {
    throw new Error(
      `FRSPELL_BENCHMARK_SIZE must be a positive integer, got "${process.env.FRSPELL_BENCHMARK_SIZE}"`,
    );
  }

  const [trainingRaw, deriveRaw] = await Promise.all([
    fs.readFile(trainingCsvPath, 'utf-8'),
    fs.readFile(deriveCsvPath, 'utf-8'),
  ]);

  const trainingRows = parseCsv(trainingRaw);
  const deriveRows = parseCsv(deriveRaw);

  const verbLemmaChecklist = buildVerbLemmaChecklist(trainingRows, limit);
  const nounChecklist = buildDeriveChecklist(deriveRows, 'NOUN', limit);
  const verbChecklist = buildDeriveChecklist(deriveRows, 'VERB', limit);
  const adjeChecklist = buildDeriveChecklist(deriveRows, 'ADJE', limit);

  if (
    verbLemmaChecklist.length < limit ||
    nounChecklist.length < limit ||
    verbChecklist.length < limit ||
    adjeChecklist.length < limit
  ) {
    throw new Error(
      `Not enough data to build all checklists at size ${limit}. ` +
        `Generated sizes: lemmaVerb=${verbLemmaChecklist.length}, noun=${nounChecklist.length}, ` +
        `verb=${verbChecklist.length}, adje=${adjeChecklist.length}`,
    );
  }

  // File names keep the fixed "_100" suffix whatever the configured size is;
  // they are left unchanged so existing readers of these files keep working.
  const outputs = [
    ['checklist_lemma_verb_100.json', verbLemmaChecklist],
    ['checklist_noun_100.json', nounChecklist],
    ['checklist_verb_100.json', verbChecklist],
    ['checklist_adje_100.json', adjeChecklist],
  ];

  // Create the output directory if it is missing so the writes cannot fail with ENOENT.
  await fs.mkdir(BENCH_DIR, { recursive: true });

  await Promise.all(
    outputs.map(([fileName, checklist]) =>
      fs.writeFile(path.join(BENCH_DIR, fileName), JSON.stringify(checklist, null, 2)),
    ),
  );

  console.log('Generated benchmark checklists:');
  for (const [fileName, checklist] of outputs) {
    console.log(`- ${fileName} (${checklist.length})`);
  }
}

await main();
|
|
@@ -1,123 +0,0 @@
|
|
|
1
|
-
import fs from 'node:fs/promises';
|
|
2
|
-
import path from 'node:path';
|
|
3
|
-
import { FrSpell } from '../src/frspell.js';
|
|
4
|
-
|
|
5
|
-
// Checklists are looked up relative to the current working directory, so the
// location depends on where the script is launched from.
const ROOT = process.cwd();

// Benchmark kind -> checklist JSON file name inside "<ROOT>/benchmark".
const CHECKLIST_FILES = {
  lemma: 'checklist_lemma_verb_100.json',
  noun: 'checklist_noun_100.json',
  verb: 'checklist_verb_100.json',
  adje: 'checklist_adje_100.json',
};
|
|
12
|
-
|
|
13
|
-
/**
 * Computes the arithmetic mean of a list of numbers.
 *
 * @param {number[]} values - Numbers to average.
 * @returns {number} The average, or 0 for an empty list.
 */
function mean(values) {
  const count = values.length;
  if (count === 0) return 0;

  let total = 0;
  values.forEach((n) => {
    total += n;
  });
  return total / count;
}
|
|
17
|
-
|
|
18
|
-
/**
 * Reads and parses one checklist JSON file from "<ROOT>/benchmark".
 *
 * @param {string} fileName - File name inside the benchmark directory.
 * @returns {Promise<*>} The parsed JSON content.
 * @throws {Error} When the file cannot be read or is not valid JSON.
 */
async function loadChecklist(fileName) {
  const location = path.join(ROOT, 'benchmark', fileName);
  return JSON.parse(await fs.readFile(location, 'utf-8'));
}
|
|
23
|
-
|
|
24
|
-
/**
 * Runs the lemma benchmark: calls `predictor.lemma(item.input)` for every
 * checklist item, one after another, timing each call and counting how often
 * the returned `lemma` equals `item.target`.
 *
 * @param {{lemma: function(string): Promise<{lemma: string}>}} predictor - Object exposing `lemma()`.
 * @param {Array<{input: string, target: string}>} checklist - Cases to evaluate.
 * @returns {Promise<{name: string, size: number, correct: number, accuracy: number, avgMs: number}>}
 *   Summary with the hit count, the hit ratio (0 for an empty checklist), and
 *   the mean per-call duration in milliseconds.
 */
async function runLemmaBenchmark(predictor, checklist) {
  const durations = [];
  let hits = 0;

  for (const item of checklist) {
    const startedAt = performance.now();
    const prediction = await predictor.lemma(item.input);
    durations.push(performance.now() - startedAt);

    hits += prediction.lemma === item.target ? 1 : 0;
  }

  const size = checklist.length;
  return {
    name: 'lemma-from-conjugation',
    size,
    correct: hits,
    accuracy: size > 0 ? hits / size : 0,
    avgMs: mean(durations),
  };
}
|
|
46
|
-
|
|
47
|
-
/**
 * Runs one derive benchmark: for every checklist item, calls the predictor's
 * derive method for `type` with (lemma, person, mode, tense), timing each
 * call and counting how often the returned `output` equals `item.target`.
 *
 * `type` "noun" uses `nounDerive`, "verb" uses `verbDerive`, and any other
 * value uses `adjeDerive`.
 *
 * @param {Object} predictor - Object exposing nounDerive/verbDerive/adjeDerive.
 * @param {Array<{lemma: string, person: string, mode: string, tense: string, target: string}>} checklist
 *   Cases to evaluate.
 * @param {string} type - Benchmark kind; also used to build the summary name.
 * @returns {Promise<{name: string, size: number, correct: number, accuracy: number, avgMs: number}>}
 *   Summary with the hit count, the hit ratio (0 for an empty checklist), and
 *   the mean per-call duration in milliseconds.
 */
async function runDeriveBenchmark(predictor, checklist, type) {
  const derive = (item) => {
    if (type === 'noun') {
      return predictor.nounDerive(item.lemma, item.person, item.mode, item.tense);
    }
    if (type === 'verb') {
      return predictor.verbDerive(item.lemma, item.person, item.mode, item.tense);
    }
    return predictor.adjeDerive(item.lemma, item.person, item.mode, item.tense);
  };

  const durations = [];
  let hits = 0;

  for (const item of checklist) {
    const startedAt = performance.now();
    const prediction = await derive(item);
    durations.push(performance.now() - startedAt);

    hits += prediction.output === item.target ? 1 : 0;
  }

  const size = checklist.length;
  return {
    name: `${type}-derive`,
    size,
    correct: hits,
    accuracy: size > 0 ? hits / size : 0,
    avgMs: mean(durations),
  };
}
|
|
74
|
-
|
|
75
|
-
/**
 * Logs a benchmark summary to the console as five lines: the bracketed name,
 * the sample count, the correct count, the accuracy as a percentage with two
 * decimals, and the average time in milliseconds with two decimals.
 *
 * @param {{name: string, size: number, correct: number, accuracy: number, avgMs: number}} summary
 *   Result object as returned by the benchmark runners.
 * @returns {void}
 */
function printSummary(summary) {
  const { name, size, correct, accuracy, avgMs } = summary;

  // Format everything first so a malformed summary throws before anything is printed.
  const report = [
    `[${name}]`,
    ` samples : ${size}`,
    ` correct : ${correct}`,
    ` accuracy: ${(accuracy * 100).toFixed(2)}%`,
    ` avg time: ${avgMs.toFixed(2)} ms`,
  ];

  for (const line of report) {
    console.log(line);
  }
}
|
|
85
|
-
|
|
86
|
-
/**
 * Script entry point: runs the benchmarks selected by the first CLI argument
 * ("all", "lemma", "noun", "verb" or "adje"; default "all") and prints one
 * summary per benchmark.
 *
 * Benchmarks run strictly one after another. Starting them all at once and
 * awaiting them together would interleave their calls on the shared
 * predictor, so each per-call timing would also include time spent waiting
 * on the other benchmarks.
 *
 * @returns {Promise<void>} Resolves after every selected summary is printed.
 * @throws {Error} When the mode is not one of the supported values, or when
 *   loading the predictor or a checklist fails.
 */
async function main() {
  const mode = process.argv[2] || 'all';

  const plan = [
    ['lemma', (predictor, checklist) => runLemmaBenchmark(predictor, checklist)],
    ['noun', (predictor, checklist) => runDeriveBenchmark(predictor, checklist, 'noun')],
    ['verb', (predictor, checklist) => runDeriveBenchmark(predictor, checklist, 'verb')],
    ['adje', (predictor, checklist) => runDeriveBenchmark(predictor, checklist, 'adje')],
  ].filter(([kind]) => mode === 'all' || mode === kind);

  // Reject a bad mode before creating the predictor, so a typo fails fast.
  if (plan.length === 0) {
    throw new Error(`Unsupported benchmark mode: ${mode}`);
  }

  const predictor = await FrSpell();

  const summaries = [];
  for (const [kind, run] of plan) {
    const checklist = await loadChecklist(CHECKLIST_FILES[kind]);
    summaries.push(await run(predictor, checklist));
  }

  // Print only once everything has finished, so a failure partway through
  // produces no partial report.
  for (const summary of summaries) {
    printSummary(summary);
  }
}

await main();
|
|
Binary file
|
|
Binary file
|
package/test/test.js
DELETED
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
import { FrSpell } from '../src/frspell.js';
|
|
2
|
-
|
|
3
|
-
// Smoke script: exercises each predictor entry point once and prints the raw results.
const predictor = await FrSpell();

// One probe per call, in the order the results are printed.
const probes = [
  () => predictor.lemma('mangeons'),
  () => predictor.nounDerive('chat', 'THD_PLF'),
  () => predictor.adjeDerive('beau', 'THD_F'),
  () => predictor.verbDerive('manger', 'FST_PL', 'INDI', 'PRES'),
  () => predictor.verbDerive('manger', 'SND_PL', 'INDI', 'FUTU'),
  () => predictor.verbDerive('manger', 'FST_PL', 'INDI', 'PASS'),
  () => predictor.verbDerive('manger', 'SND', 'SUBJ', 'PRES'),
  () => predictor.verbDerive('manger', 'THD_PLF', 'PART', 'PASS'),
];

// Run every probe sequentially before printing anything, so a failing call
// produces no partial output.
const outcomes = [];
for (const probe of probes) {
  outcomes.push(await probe());
}

for (const outcome of outcomes) {
  console.log(outcome);
}
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|