lemma-is 0.2.2 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/README.md +21 -20
  2. package/package.json +1 -1
package/README.md CHANGED
@@ -25,15 +25,18 @@ npm install lemma-is
25
25
  import { readFileSync } from "fs";
26
26
  import { BinaryLemmatizer, extractIndexableLemmas } from "lemma-is";
27
27
 
28
- const buffer = readFileSync("./lemma-is.bin");
29
- const lemmatizer = BinaryLemmatizer.loadFromBuffer(buffer.buffer.slice(
30
- buffer.byteOffset, buffer.byteOffset + buffer.byteLength
31
- ));
32
-
33
- app.post("/lemmatize", (req, res) => {
34
- const lemmas = extractIndexableLemmas(req.body.text, lemmatizer);
35
- res.json({ lemmas: [...lemmas] });
36
- });
28
+ // Binary data is bundled with the package
29
+ const buffer = readFileSync("node_modules/lemma-is/data-dist/lemma-is.bin");
30
+ const lemmatizer = BinaryLemmatizer.loadFromBuffer(
31
+ buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength)
32
+ );
33
+
34
+ lemmatizer.lemmatize("börnin"); // ["barn"]
35
+ lemmatizer.lemmatize("fóru"); // ["fara", "fóra"]
36
+
37
+ // Full pipeline for search indexing
38
+ const lemmas = extractIndexableLemmas("Börnin fóru í bíó", lemmatizer);
39
+ // → ["barn", "fara", "fóra", "í", "bíó"]
37
40
  ```
38
41
 
39
42
  ## The Problem
@@ -52,10 +55,6 @@ If you index "Börnin fóru í bíó" by splitting on whitespace, a search for "
52
55
  ## Solution
53
56
 
54
57
  ```typescript
55
- import { BinaryLemmatizer } from "lemma-is";
56
-
57
- const lemmatizer = await BinaryLemmatizer.load("/data/lemma-is.bin");
58
-
59
58
  lemmatizer.lemmatize("börnin"); // → ["barn"]
60
59
  lemmatizer.lemmatize("fóru"); // → ["fara"]
61
60
  lemmatizer.lemmatize("kvenna"); // → ["kona"]
@@ -86,9 +85,9 @@ lemmatizer.lemmatize("við");
86
85
  The library uses shallow grammar rules based on Icelandic case government to disambiguate prepositions:
87
86
 
88
87
  ```typescript
89
- import { BinaryLemmatizer, Disambiguator } from "lemma-is";
88
+ import { Disambiguator } from "lemma-is";
90
89
 
91
- const lemmatizer = await BinaryLemmatizer.load("/data/lemma-is.bin");
90
+ // lemmatizer loaded as shown in Quickstart
92
91
  const disambiguator = new Disambiguator(lemmatizer, lemmatizer, { useGrammarRules: true });
93
92
 
94
93
  // "á borðinu" - borðinu is dative (þgf), á governs dative → preposition
@@ -139,9 +138,7 @@ lemmatizer.lemmatizeWithMorph("börnum");
139
138
  Use corpus frequencies to pick the most likely lemma based on context:
140
139
 
141
140
  ```typescript
142
- import { BinaryLemmatizer, processText } from "lemma-is";
143
-
144
- const lemmatizer = await BinaryLemmatizer.load("/data/lemma-is.bin");
141
+ import { processText } from "lemma-is";
145
142
 
146
143
  // BinaryLemmatizer has built-in bigram frequencies for disambiguation
147
144
  // "við erum" = "we are" → bigrams favor pronoun "ég" over preposition
@@ -271,6 +268,7 @@ const lemmas = extractIndexableLemmas(text, lemmatizer, {
271
268
  ### Setup
272
269
 
273
270
  ```typescript
271
+ import { readFileSync } from "fs";
274
272
  import {
275
273
  BinaryLemmatizer,
276
274
  extractIndexableLemmas,
@@ -278,7 +276,10 @@ import {
278
276
  createKnownLemmaSet
279
277
  } from "lemma-is";
280
278
 
281
- const lemmatizer = await BinaryLemmatizer.load("/data/lemma-is.bin");
279
+ const buffer = readFileSync("node_modules/lemma-is/data-dist/lemma-is.bin");
280
+ const lemmatizer = BinaryLemmatizer.loadFromBuffer(
281
+ buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength)
282
+ );
282
283
  const knownLemmas = createKnownLemmaSet(lemmatizer.getAllLemmas());
283
284
  const splitter = new CompoundSplitter(lemmatizer, knownLemmas);
284
285
  ```
@@ -366,7 +367,7 @@ uv run python scripts/build-binary.py # builds lemma-is.bin with morph features
366
367
  import { readFileSync } from "fs";
367
368
  import { BinaryLemmatizer } from "lemma-is";
368
369
 
369
- const buffer = readFileSync("data-dist/lemma-is.bin");
370
+ const buffer = readFileSync("node_modules/lemma-is/data-dist/lemma-is.bin");
370
371
  const lemmatizer = BinaryLemmatizer.loadFromBuffer(
371
372
  buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength)
372
373
  );
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "lemma-is",
3
- "version": "0.2.2",
3
+ "version": "0.2.3",
4
4
  "description": "Icelandic word form to lemma lookup for browser and Node.js",
5
5
  "keywords": [
6
6
  "icelandic",