npm - fr-spell - Versions diffs - 1.0.2 → 1.0.4 - Mend

fr-spell 1.0.2 → 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22)

package/README.cn.md +17 -63
package/README.fr.md +17 -63
package/README.md +17 -63
package/{src/module → module}/Predictor.js +9 -1
package/package.json +17 -16
package/benchmark/checklist_adje_100.json +0 -702
package/benchmark/checklist_lemma_verb_100.json +0 -402
package/benchmark/checklist_noun_100.json +0 -702
package/benchmark/checklist_verb_100.json +0 -702
package/benchmark/generate-checklists.js +0 -192
package/benchmark/run-benchmark.js +0 -123
package/dist/models/community/derive_form_model.int8.onnx +0 -0
package/dist/models/community/derive_form_vocab.json +0 -74
package/dist/models/community/lemma_type_labels.json +0 -47
package/dist/models/community/lemma_type_model.int8.onnx +0 -0
package/dist/models/community/lemma_type_vocab.json +0 -244
package/scripts/build.js +0 -53
package/src/frspell.browser.global.js +0 -9
package/src/module/Predictor.browser.js +0 -428
package/test/test.js +0 -21
/package/{dist/frspell.browser.js → frspell.browser.js} +0 -0
/package/{src/frspell.js → index.js} +0 -0

package/scripts/build.js DELETED Viewed

@@ -1,53 +0,0 @@
-import fs from 'node:fs/promises';
-import path from 'node:path';
-import { build } from 'esbuild';
-const rootDir = path.resolve(process.cwd());
-const distDir = path.join(rootDir, 'dist');
-const distModelsDir = path.join(distDir, 'models', 'community');
-const sourceModelsDir = path.join(rootDir, 'models', 'community');
-async function cleanDist() {
-  await fs.rm(distDir, { recursive: true, force: true });
-  await fs.mkdir(distModelsDir, { recursive: true });
-}
-async function bundleBrowserBuild() {
-  await build({
-    entryPoints: [path.join(rootDir, 'src', 'frspell.browser.global.js')],
-    bundle: true,
-    outfile: path.join(distDir, 'frspell.browser.js'),
-    format: 'iife',
-    platform: 'browser',
-    target: ['es2020'],
-    minify: false,
-  });
-}
-async function copyModelAssets() {
-  const assetFiles = [
-    'lemma_type_model.int8.onnx',
-    'derive_form_model.int8.onnx',
-    'lemma_type_vocab.json',
-    'lemma_type_labels.json',
-    'derive_form_vocab.json',
-  ];
-  await Promise.all(
-    assetFiles.map((name) =>
-      fs.copyFile(path.join(sourceModelsDir, name), path.join(distModelsDir, name)),
-    ),
-  );
-}
-async function main() {
-  await cleanDist();
-  await bundleBrowserBuild();
-  await copyModelAssets();
-  console.log('Build completed: dist/frspell.browser.js and model assets are ready.');
-}
-main().catch((error) => {
-  console.error(error);
-  process.exitCode = 1;
-});

package/src/frspell.browser.global.js DELETED Viewed

@@ -1,9 +0,0 @@
-/**
- * FR-SPELL browser global entry
- */
-import { FrSpell } from './module/Predictor.browser.js';
-if (typeof window !== 'undefined') {
-  window.FrSpell = FrSpell;
-}

package/src/module/Predictor.browser.js DELETED Viewed

@@ -1,428 +0,0 @@
-/**
- * FR-SPELL browser predictor implementation
- * Author: Davy Chen <davy.chen@163.com>
- * Profile: https://www.linkedin.com/in/davychxn/
- */
-import * as ort from 'onnxruntime-web';
-function argmax(arr) {
-  let bestIdx = 0;
-  let bestVal = arr[0];
-  for (let i = 1; i < arr.length; i += 1) {
-    if (arr[i] > bestVal) {
-      bestVal = arr[i];
-      bestIdx = i;
-    }
-  }
-  return { index: bestIdx, value: bestVal };
-}
-function softmaxAt(logits, idx) {
-  let maxVal = -Infinity;
-  for (let i = 0; i < logits.length; i += 1) {
-    if (logits[i] > maxVal) maxVal = logits[i];
-  }
-  let sum = 0;
-  for (let i = 0; i < logits.length; i += 1) {
-    sum += Math.exp(logits[i] - maxVal);
-  }
-  return Math.exp(logits[idx] - maxVal) / sum;
-}
-function makeInt64Tensor2D(rows) {
-  const batch = rows.length;
-  const seq = rows[0].length;
-  const data = new BigInt64Array(batch * seq);
-  let k = 0;
-  for (let i = 0; i < batch; i += 1) {
-    for (let j = 0; j < seq; j += 1) {
-      data[k] = BigInt(rows[i][j]);
-      k += 1;
-    }
-  }
-  return new ort.Tensor('int64', data, [batch, seq]);
-}
-function makeInt64Tensor1D(values) {
-  const data = new BigInt64Array(values.length);
-  for (let i = 0; i < values.length; i += 1) {
-    data[i] = BigInt(values[i]);
-  }
-  return new ort.Tensor('int64', data, [values.length]);
-}
-function toStoi(itos) {
-  const stoi = {};
-  for (let i = 0; i < itos.length; i += 1) stoi[itos[i]] = i;
-  return stoi;
-}
-function decodeTextFromIds(ids, itos, specials) {
-  let out = '';
-  for (const id of ids) {
-    const tok = itos[id];
-    if (tok === specials.eos) break;
-    if (tok === specials.pad || tok === specials.bos || tok === specials.unk) continue;
-    out += tok;
-  }
-  return out;
-}
-function normalizeEnumToken(value, fallback) {
-  if (typeof value === 'string' && value.trim().length > 0) {
-    return value.trim().toUpperCase();
-  }
-  return fallback;
-}
-function trimTrailingSlash(v) {
-  return String(v || '').replace(/\/+$/, '');
-}
-function resolveAssetPath(basePath, assetName) {
-  return `${trimTrailingSlash(basePath)}/${assetName}`;
-}
-async function readJsonFromUrl(url) {
-  const response = await fetch(url);
-  if (!response.ok) {
-    throw new Error(`Failed to fetch JSON asset: ${url} (status ${response.status})`);
-  }
-  return response.json();
-}
-const DEFAULT_MODEL_BASE_PATH = './models/community';
-function getDefaultModelPaths(modelBasePath = DEFAULT_MODEL_BASE_PATH) {
-  return {
-    lemmaModelPath: resolveAssetPath(modelBasePath, 'lemma_type_model.int8.onnx'),
-    lemmaVocabPath: resolveAssetPath(modelBasePath, 'lemma_type_vocab.json'),
-    lemmaLabelsPath: resolveAssetPath(modelBasePath, 'lemma_type_labels.json'),
-    derivativeModelPath: resolveAssetPath(modelBasePath, 'derive_form_model.int8.onnx'),
-    derivativeVocabPath: resolveAssetPath(modelBasePath, 'derive_form_vocab.json'),
-  };
-}
-export async function createLemmaTypePredictor({
-  modelPath,
-  vocabPath,
-  labelsPath,
-  maxDecodeLen,
-  executionProviders,
-}) {
-  const [vocab, labels] = await Promise.all([
-    readJsonFromUrl(vocabPath),
-    readJsonFromUrl(labelsPath),
-  ]);
-  const itos = vocab.itos;
-  const stoi = toStoi(itos);
-  const idToCode = {};
-  for (const [k, v] of Object.entries(vocab.id_to_code)) {
-    idToCode[Number(k)] = v;
-  }
-  const codeCharToName = labels.code_char_to_name || {};
-  const PAD = '<pad>';
-  const BOS = '<bos>';
-  const EOS = '<eos>';
-  const UNK = '<unk>';
-  const padId = stoi[PAD];
-  const bosId = stoi[BOS];
-  const eosId = stoi[EOS];
-  const unkId = stoi[UNK];
-  const session = await ort.InferenceSession.create(modelPath, {
-    executionProviders: executionProviders || ['wasm'],
-    graphOptimizationLevel: 'all',
-  });
-  const inputNameSet = new Set(session.inputNames);
-  const decodeLimit = maxDecodeLen || labels.max_decode_len || 32;
-  function encodeWord(word) {
-    const ids = [];
-    for (const ch of word) {
-      ids.push(stoi[ch] ?? unkId);
-    }
-    ids.push(eosId);
-    return ids;
-  }
-  const specials = {
-    pad: PAD,
-    bos: BOS,
-    eos: EOS,
-    unk: UNK,
-  };
-  async function predict(word) {
-    const t0 = performance.now();
-    const srcIds = encodeWord(word);
-    const srcTensor = makeInt64Tensor2D([srcIds]);
-    const srcLenTensor = makeInt64Tensor1D([srcIds.length]);
-    let tgt = [bosId];
-    let codeLogits = null;
-    const lemmaTokenIds = [];
-    for (let step = 0; step < decodeLimit; step += 1) {
-      const tgtTensor = makeInt64Tensor2D([tgt]);
-      const outputs = await session.run({
-        ...(inputNameSet.has('src') ? { src: srcTensor } : {}),
-        ...(inputNameSet.has('src_len') ? { src_len: srcLenTensor } : {}),
-        ...(inputNameSet.has('tgt_in') ? { tgt_in: tgtTensor } : {}),
-      });
-      const token = outputs.token_logits;
-      const code = outputs.code_logits;
-      codeLogits = code.data;
-      // token_logits shape: [1, tgt_len, vocab_size]
-      const vocabSize = token.dims[2];
-      const tgtLen = tgt.length;
-      const lastOffset = (tgtLen - 1) * vocabSize;
-      const lastStep = token.data.slice(lastOffset, lastOffset + vocabSize);
-      const { index: nextId } = argmax(lastStep);
-      lemmaTokenIds.push(nextId);
-      if (nextId === eosId) break;
-      tgt.push(nextId);
-    }
-    const predLemma = decodeTextFromIds(lemmaTokenIds, itos, specials);
-    const codeBest = argmax(codeLogits);
-    const predCode = idToCode[codeBest.index] || 'A';
-    const codeConf = softmaxAt(codeLogits, codeBest.index);
-    const predType = codeCharToName[predCode] || 'UNKNOWN';
-    return {
-      input: word,
-      lemma: predLemma,
-      wordType: predType,
-      confidence: codeConf,
-      timeMs: performance.now() - t0,
-    };
-  }
-  async function lemma(word) {
-    return predict(word);
-  }
-  async function predictBatch(words) {
-    const out = [];
-    for (const w of words) {
-      // Keep sequential to keep implementation simple and deterministic.
-      out.push(await predict(w));
-    }
-    return out;
-  }
-  return {
-    predict,
-    lemma,
-    predictBatch,
-    metadata: {
-      modelPath,
-      vocabSize: itos.length,
-      decodeLimit,
-    },
-  };
-}
-export async function createDerivativeTypePredictor({
-  modelPath,
-  vocabPath,
-  maxDecodeLen,
-  executionProviders,
-}) {
-  const vocab = await readJsonFromUrl(vocabPath);
-  const itos = vocab.itos;
-  const stoi = toStoi(itos);
-  const PAD = '<pad>';
-  const BOS = '<bos>';
-  const EOS = '<eos>';
-  const UNK = '<unk>';
-  const padId = stoi[PAD];
-  const bosId = stoi[BOS];
-  const eosId = stoi[EOS];
-  const unkId = stoi[UNK];
-  if (padId === undefined || bosId === undefined || eosId === undefined || unkId === undefined) {
-    throw new Error('Invalid derive vocab: missing special tokens <pad>/<bos>/<eos>/<unk>.');
-  }
-  const session = await ort.InferenceSession.create(modelPath, {
-    executionProviders: executionProviders || ['wasm'],
-    graphOptimizationLevel: 'all',
-  });
-  const inputNameSet = new Set(session.inputNames);
-  const outputTokenName = session.outputNames.includes('token_logits')
-    ? 'token_logits'
-    : session.outputNames[0];
-  const decodeLimit = maxDecodeLen || 48;
-  const specials = {
-    pad: PAD,
-    bos: BOS,
-    eos: EOS,
-    unk: UNK,
-  };
-  function encodeText(text, addBos = false, addEos = false) {
-    const ids = [];
-    if (addBos) ids.push(bosId);
-    for (const ch of text) ids.push(stoi[ch] ?? unkId);
-    if (addEos) ids.push(eosId);
-    return ids;
-  }
-  function buildSourceText(lemma, wordType, person, mode, tense) {
-    return `L:${lemma}|W:${wordType}|P:${person}|M:${mode}|T:${tense}`;
-  }
-  async function predict(lemma, wordType, sentencePerson, sentenceMode, sentenceTense) {
-    const t0 = performance.now();
-    const normalizedLemma = String(lemma || '').trim();
-    const normalizedWordType = normalizeEnumToken(wordType, 'NONE');
-    const normalizedPerson = normalizeEnumToken(sentencePerson, 'ALL');
-    const isNounOrAdje = normalizedWordType === 'NOUN' || normalizedWordType === 'ADJE';
-    const normalizedMode = isNounOrAdje
-      ? normalizeEnumToken(sentenceMode, 'ALL')
-      : normalizeEnumToken(sentenceMode, 'NONE');
-    const normalizedTense = isNounOrAdje
-      ? normalizeEnumToken(sentenceTense, 'ALL')
-      : normalizeEnumToken(sentenceTense, 'NONE');
-    const source = buildSourceText(
-      normalizedLemma,
-      normalizedWordType,
-      normalizedPerson,
-      normalizedMode,
-      normalizedTense,
-    );
-    const srcIds = encodeText(source, false, true);
-    const srcTensor = makeInt64Tensor2D([srcIds]);
-    const srcLenTensor = makeInt64Tensor1D([srcIds.length]);
-    const tokenIds = [];
-    let lastStepLogits = null;
-    let tgt = [bosId];
-    for (let step = 0; step < decodeLimit; step += 1) {
-      const tgtTensor = makeInt64Tensor2D([tgt]);
-      const outputs = await session.run({
-        ...(inputNameSet.has('src') ? { src: srcTensor } : {}),
-        ...(inputNameSet.has('src_len') ? { src_len: srcLenTensor } : {}),
-        ...(inputNameSet.has('tgt_in') ? { tgt_in: tgtTensor } : {}),
-      });
-      const token = outputs[outputTokenName];
-      const vocabSize = token.dims[2];
-      const tgtLen = tgt.length;
-      const lastOffset = (tgtLen - 1) * vocabSize;
-      lastStepLogits = token.data.slice(lastOffset, lastOffset + vocabSize);
-      const { index: nextId } = argmax(lastStepLogits);
-      tokenIds.push(nextId);
-      if (nextId === eosId) break;
-      tgt.push(nextId);
-    }
-    const form = decodeTextFromIds(tokenIds, itos, specials);
-    const bestId = argmax(lastStepLogits).index;
-    const confidence = softmaxAt(lastStepLogits, bestId);
-    return {
-      lemma: normalizedLemma,
-      wordType: normalizedWordType,
-      person: normalizedPerson,
-      mode: normalizedMode,
-      tense: normalizedTense,
-      output: form,
-      confidence,
-      timeMs: performance.now() - t0,
-    };
-  }
-  async function nounDerive(lemma, sentencePerson, sentenceMode, sentenceTense) {
-    return predict(lemma, 'NOUN', sentencePerson, sentenceMode, sentenceTense);
-  }
-  async function adjeDerive(lemma, sentencePerson, sentenceMode, sentenceTense) {
-    return predict(lemma, 'ADJE', sentencePerson, sentenceMode, sentenceTense);
-  }
-  async function verbDerive(lemma, sentencePerson, sentenceMode, sentenceTense) {
-    return predict(lemma, 'VERB', sentencePerson, sentenceMode, sentenceTense);
-  }
-  return {
-    predict,
-    derive: predict,
-    nounDerive,
-    adjeDerive,
-    verbDerive,
-    metadata: {
-      modelPath,
-      vocabSize: itos.length,
-      decodeLimit,
-    },
-  };
-}
-export async function FrSpell(options = {}) {
-  const {
-    modelBasePath = DEFAULT_MODEL_BASE_PATH,
-    lemmaMaxDecodeLen,
-    derivativeMaxDecodeLen,
-    executionProviders,
-    wasmPaths = 'https://cdn.jsdelivr.net/npm/onnxruntime-web/dist/',
-  } = options;
-  const defaultPaths = getDefaultModelPaths(modelBasePath);
-  const lemmaModelPath = options.lemmaModelPath || defaultPaths.lemmaModelPath;
-  const lemmaVocabPath = options.lemmaVocabPath || defaultPaths.lemmaVocabPath;
-  const lemmaLabelsPath = options.lemmaLabelsPath || defaultPaths.lemmaLabelsPath;
-  const derivativeModelPath = options.derivativeModelPath || defaultPaths.derivativeModelPath;
-  const derivativeVocabPath = options.derivativeVocabPath || defaultPaths.derivativeVocabPath;
-  if (wasmPaths && ort.env && ort.env.wasm) {
-    ort.env.wasm.wasmPaths = wasmPaths;
-  }
-  const [lemmaPredictor, derivativePredictor] = await Promise.all([
-    createLemmaTypePredictor({
-      modelPath: lemmaModelPath,
-      vocabPath: lemmaVocabPath,
-      labelsPath: lemmaLabelsPath,
-      maxDecodeLen: lemmaMaxDecodeLen,
-      executionProviders,
-    }),
-    createDerivativeTypePredictor({
-      modelPath: derivativeModelPath,
-      vocabPath: derivativeVocabPath,
-      maxDecodeLen: derivativeMaxDecodeLen,
-      executionProviders,
-    }),
-  ]);
-  return {
-    lemma: lemmaPredictor.lemma,
-    derive: derivativePredictor.derive,
-    nounDerive: derivativePredictor.nounDerive,
-    adjeDerive: derivativePredictor.adjeDerive,
-    verbDerive: derivativePredictor.verbDerive,
-    metadata: {
-      lemma: lemmaPredictor.metadata,
-      derivative: derivativePredictor.metadata,
-    },
-  };
-}

package/test/test.js DELETED Viewed

@@ -1,21 +0,0 @@
-import { FrSpell } from '../src/frspell.js';
-const predictor = await FrSpell();
-const lemmaResult = await predictor.lemma('mangeons');
-const nounResult = await predictor.nounDerive('chat', 'THD_PLF');
-const adjeResult = await predictor.adjeDerive('beau', 'THD_F');
-const verbResult1 = await predictor.verbDerive('manger', 'FST_PL', 'INDI', 'PRES');
-const verbResult2 = await predictor.verbDerive('manger', 'SND_PL', 'INDI', 'FUTU');
-const verbResult3 = await predictor.verbDerive('manger', 'FST_PL', 'INDI', 'PASS');
-const verbResult4 = await predictor.verbDerive('manger', 'SND', 'SUBJ', 'PRES');
-const verbResult5 = await predictor.verbDerive('manger', 'THD_PLF', 'PART', 'PASS');
-console.log(lemmaResult);
-console.log(nounResult);
-console.log(adjeResult);
-console.log(verbResult1);
-console.log(verbResult2);
-console.log(verbResult3);
-console.log(verbResult4);
-console.log(verbResult5);

/package/{dist/frspell.browser.js → frspell.browser.js} RENAMED Viewed

File without changes

/package/{src/frspell.js → index.js} RENAMED Viewed

File without changes