lemma-is 0.2.2 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +21 -20
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -25,15 +25,18 @@ npm install lemma-is
|
|
|
25
25
|
import { readFileSync } from "fs";
|
|
26
26
|
import { BinaryLemmatizer, extractIndexableLemmas } from "lemma-is";
|
|
27
27
|
|
|
28
|
-
|
|
29
|
-
const
|
|
30
|
-
|
|
31
|
-
)
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
28
|
+
// Binary data is bundled with the package
|
|
29
|
+
const buffer = readFileSync("node_modules/lemma-is/data-dist/lemma-is.bin");
|
|
30
|
+
const lemmatizer = BinaryLemmatizer.loadFromBuffer(
|
|
31
|
+
buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength)
|
|
32
|
+
);
|
|
33
|
+
|
|
34
|
+
lemmatizer.lemmatize("börnin"); // → ["barn"]
|
|
35
|
+
lemmatizer.lemmatize("fóru"); // → ["fara", "fóra"]
|
|
36
|
+
|
|
37
|
+
// Full pipeline for search indexing
|
|
38
|
+
const lemmas = extractIndexableLemmas("Börnin fóru í bíó", lemmatizer);
|
|
39
|
+
// → ["barn", "fara", "fóra", "í", "bíó"]
|
|
37
40
|
```
|
|
38
41
|
|
|
39
42
|
## The Problem
|
|
@@ -52,10 +55,6 @@ If you index "Börnin fóru í bíó" by splitting on whitespace, a search for "
|
|
|
52
55
|
## Solution
|
|
53
56
|
|
|
54
57
|
```typescript
|
|
55
|
-
import { BinaryLemmatizer } from "lemma-is";
|
|
56
|
-
|
|
57
|
-
const lemmatizer = await BinaryLemmatizer.load("/data/lemma-is.bin");
|
|
58
|
-
|
|
59
58
|
lemmatizer.lemmatize("börnin"); // → ["barn"]
|
|
60
59
|
lemmatizer.lemmatize("fóru"); // → ["fara", "fóra"]
|
|
61
60
|
lemmatizer.lemmatize("kvenna"); // → ["kona"]
|
|
@@ -86,9 +85,9 @@ lemmatizer.lemmatize("við");
|
|
|
86
85
|
The library uses shallow grammar rules based on Icelandic case government to disambiguate prepositions:
|
|
87
86
|
|
|
88
87
|
```typescript
|
|
89
|
-
import {
|
|
88
|
+
import { Disambiguator } from "lemma-is";
|
|
90
89
|
|
|
91
|
-
|
|
90
|
+
// lemmatizer loaded as shown in Quickstart
|
|
92
91
|
const disambiguator = new Disambiguator(lemmatizer, lemmatizer, { useGrammarRules: true });
|
|
93
92
|
|
|
94
93
|
// "á borðinu" - borðinu is dative (þgf), á governs dative → preposition
|
|
@@ -139,9 +138,7 @@ lemmatizer.lemmatizeWithMorph("börnum");
|
|
|
139
138
|
Use corpus frequencies to pick the most likely lemma based on context:
|
|
140
139
|
|
|
141
140
|
```typescript
|
|
142
|
-
import {
|
|
143
|
-
|
|
144
|
-
const lemmatizer = await BinaryLemmatizer.load("/data/lemma-is.bin");
|
|
141
|
+
import { processText } from "lemma-is";
|
|
145
142
|
|
|
146
143
|
// BinaryLemmatizer has built-in bigram frequencies for disambiguation
|
|
147
144
|
// "við erum" = "we are" → bigrams favor pronoun "ég" over preposition
|
|
@@ -271,6 +268,7 @@ const lemmas = extractIndexableLemmas(text, lemmatizer, {
|
|
|
271
268
|
### Setup
|
|
272
269
|
|
|
273
270
|
```typescript
|
|
271
|
+
import { readFileSync } from "fs";
|
|
274
272
|
import {
|
|
275
273
|
BinaryLemmatizer,
|
|
276
274
|
extractIndexableLemmas,
|
|
@@ -278,7 +276,10 @@ import {
|
|
|
278
276
|
createKnownLemmaSet
|
|
279
277
|
} from "lemma-is";
|
|
280
278
|
|
|
281
|
-
const
|
|
279
|
+
const buffer = readFileSync("node_modules/lemma-is/data-dist/lemma-is.bin");
|
|
280
|
+
const lemmatizer = BinaryLemmatizer.loadFromBuffer(
|
|
281
|
+
buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength)
|
|
282
|
+
);
|
|
282
283
|
const knownLemmas = createKnownLemmaSet(lemmatizer.getAllLemmas());
|
|
283
284
|
const splitter = new CompoundSplitter(lemmatizer, knownLemmas);
|
|
284
285
|
```
|
|
@@ -366,7 +367,7 @@ uv run python scripts/build-binary.py # builds lemma-is.bin with morph featur
|
|
|
366
367
|
import { readFileSync } from "fs";
|
|
367
368
|
import { BinaryLemmatizer } from "lemma-is";
|
|
368
369
|
|
|
369
|
-
const buffer = readFileSync("data-dist/lemma-is.bin");
|
|
370
|
+
const buffer = readFileSync("node_modules/lemma-is/data-dist/lemma-is.bin");
|
|
370
371
|
const lemmatizer = BinaryLemmatizer.loadFromBuffer(
|
|
371
372
|
buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength)
|
|
372
373
|
);
|