lemma-is 0.2.1 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +43 -49
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -4,15 +4,15 @@ Icelandic lemmatization for JavaScript. Maps inflected word forms to base forms
|
|
|
4
4
|
|
|
5
5
|
## Why?
|
|
6
6
|
|
|
7
|
-
Existing Icelandic NLP tools
|
|
7
|
+
Existing Icelandic NLP tools are Python/C++:
|
|
8
8
|
|
|
9
9
|
| Tool | Runtime | Standalone? | Notes |
|
|
10
10
|
|------|---------|-------------|-------|
|
|
11
|
-
| **[GreynirEngine](https://github.com/mideind/GreynirEngine)** | Python + C++ | ✓ | Gold standard. Full parser, POS tagger
|
|
11
|
+
| **[GreynirEngine](https://github.com/mideind/GreynirEngine)** | Python + C++ | ✓ | Gold standard. Full parser, POS tagger. |
|
|
12
12
|
| **[Nefnir](https://github.com/lexis-project/Nefnir)** | Python | ✗ | Requires POS tags from IceNLP/IceStagger (Java, unmaintained). |
|
|
13
|
-
| **lemma-is** | TypeScript | ✓ |
|
|
13
|
+
| **lemma-is** | TypeScript | ✓ | Node.js servers. Grammar-based disambiguation, compound splitting. |
|
|
14
14
|
|
|
15
|
-
lemma-is trades parsing accuracy for
|
|
15
|
+
lemma-is trades parsing accuracy for JS ecosystem integration—good enough for search indexing, runs in any Node.js environment.
|
|
16
16
|
|
|
17
17
|
## Quickstart
|
|
18
18
|
|
|
@@ -20,37 +20,23 @@ lemma-is trades parsing accuracy for portability—good enough for search, runs
|
|
|
20
20
|
npm install lemma-is
|
|
21
21
|
```
|
|
22
22
|
|
|
23
|
-
**
|
|
24
|
-
```bash
|
|
25
|
-
# Option 1: Download pre-built from npm
|
|
26
|
-
cp node_modules/lemma-is/data-dist/lemma-is.bin ./public/
|
|
27
|
-
|
|
28
|
-
# Option 2: Build from source (requires BÍN data + Python)
|
|
29
|
-
# Download SHsnid.csv from https://bin.arnastofnun.is/DMII/LTdata/k-LTdata/
|
|
30
|
-
uv run python scripts/build-data.py && uv run python scripts/build-binary.py
|
|
31
|
-
```
|
|
32
|
-
|
|
33
|
-
**Browser (Web Worker)** — see [`test.html`](test.html) for a complete example:
|
|
34
|
-
```typescript
|
|
35
|
-
// Load in worker to avoid blocking main thread
|
|
36
|
-
const lemmatizer = await BinaryLemmatizer.load("/data/lemma-is.bin");
|
|
37
|
-
self.postMessage({ lemmas: lemmatizer.lemmatize("börnin") }); // → ["barn"]
|
|
38
|
-
```
|
|
39
|
-
|
|
40
|
-
**Node.js endpoint**:
|
|
23
|
+
**Node.js**:
|
|
41
24
|
```typescript
|
|
42
25
|
import { readFileSync } from "fs";
|
|
43
26
|
import { BinaryLemmatizer, extractIndexableLemmas } from "lemma-is";
|
|
44
27
|
|
|
45
|
-
|
|
46
|
-
const
|
|
47
|
-
|
|
48
|
-
)
|
|
28
|
+
// Binary data is bundled with the package
|
|
29
|
+
const buffer = readFileSync("node_modules/lemma-is/data-dist/lemma-is.bin");
|
|
30
|
+
const lemmatizer = BinaryLemmatizer.loadFromBuffer(
|
|
31
|
+
buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength)
|
|
32
|
+
);
|
|
49
33
|
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
34
|
+
lemmatizer.lemmatize("börnin"); // → ["barn"]
|
|
35
|
+
lemmatizer.lemmatize("fóru"); // → ["fara", "fóra"]
|
|
36
|
+
|
|
37
|
+
// Full pipeline for search indexing
|
|
38
|
+
const lemmas = extractIndexableLemmas("Börnin fóru í bíó", lemmatizer);
|
|
39
|
+
// → ["barn", "fara", "fóra", "í", "bíó"]
|
|
54
40
|
```
|
|
55
41
|
|
|
56
42
|
## The Problem
|
|
@@ -69,10 +55,6 @@ If you index "Börnin fóru í bíó" by splitting on whitespace, a search for "
|
|
|
69
55
|
## Solution
|
|
70
56
|
|
|
71
57
|
```typescript
|
|
72
|
-
import { BinaryLemmatizer } from "lemma-is";
|
|
73
|
-
|
|
74
|
-
const lemmatizer = await BinaryLemmatizer.load("/data/lemma-is.bin");
|
|
75
|
-
|
|
76
58
|
lemmatizer.lemmatize("börnin"); // → ["barn"]
|
|
77
59
|
lemmatizer.lemmatize("fóru"); // → ["fara", "fóra"]
|
|
78
60
|
lemmatizer.lemmatize("kvenna"); // → ["kona"]
|
|
@@ -103,9 +85,9 @@ lemmatizer.lemmatize("við");
|
|
|
103
85
|
The library uses shallow grammar rules based on Icelandic case government to disambiguate prepositions:
|
|
104
86
|
|
|
105
87
|
```typescript
|
|
106
|
-
import {
|
|
88
|
+
import { Disambiguator } from "lemma-is";
|
|
107
89
|
|
|
108
|
-
|
|
90
|
+
// lemmatizer loaded as shown in Quickstart
|
|
109
91
|
const disambiguator = new Disambiguator(lemmatizer, lemmatizer, { useGrammarRules: true });
|
|
110
92
|
|
|
111
93
|
// "á borðinu" - borðinu is dative (þgf), á governs dative → preposition
|
|
@@ -156,9 +138,7 @@ lemmatizer.lemmatizeWithMorph("börnum");
|
|
|
156
138
|
Use corpus frequencies to pick the most likely lemma based on context:
|
|
157
139
|
|
|
158
140
|
```typescript
|
|
159
|
-
import {
|
|
160
|
-
|
|
161
|
-
const lemmatizer = await BinaryLemmatizer.load("/data/lemma-is.bin");
|
|
141
|
+
import { processText } from "lemma-is";
|
|
162
142
|
|
|
163
143
|
// BinaryLemmatizer has built-in bigram frequencies for disambiguation
|
|
164
144
|
// "við erum" = "we are" → bigrams favor pronoun "ég" over preposition
|
|
@@ -288,6 +268,7 @@ const lemmas = extractIndexableLemmas(text, lemmatizer, {
|
|
|
288
268
|
### Setup
|
|
289
269
|
|
|
290
270
|
```typescript
|
|
271
|
+
import { readFileSync } from "fs";
|
|
291
272
|
import {
|
|
292
273
|
BinaryLemmatizer,
|
|
293
274
|
extractIndexableLemmas,
|
|
@@ -295,7 +276,10 @@ import {
|
|
|
295
276
|
createKnownLemmaSet
|
|
296
277
|
} from "lemma-is";
|
|
297
278
|
|
|
298
|
-
const
|
|
279
|
+
const buffer = readFileSync("node_modules/lemma-is/data-dist/lemma-is.bin");
|
|
280
|
+
const lemmatizer = BinaryLemmatizer.loadFromBuffer(
|
|
281
|
+
buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength)
|
|
282
|
+
);
|
|
299
283
|
const knownLemmas = createKnownLemmaSet(lemmatizer.getAllLemmas());
|
|
300
284
|
const splitter = new CompoundSplitter(lemmatizer, knownLemmas);
|
|
301
285
|
```
|
|
@@ -383,7 +367,7 @@ uv run python scripts/build-binary.py # builds lemma-is.bin with morph featur
|
|
|
383
367
|
import { readFileSync } from "fs";
|
|
384
368
|
import { BinaryLemmatizer } from "lemma-is";
|
|
385
369
|
|
|
386
|
-
const buffer = readFileSync("data-dist/lemma-is.bin");
|
|
370
|
+
const buffer = readFileSync("node_modules/lemma-is/data-dist/lemma-is.bin");
|
|
387
371
|
const lemmatizer = BinaryLemmatizer.loadFromBuffer(
|
|
388
372
|
buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength)
|
|
389
373
|
);
|
|
@@ -441,15 +425,14 @@ This library makes tradeoffs for portability. Know what you're getting.
|
|
|
441
425
|
|
|
442
426
|
### File Size
|
|
443
427
|
|
|
444
|
-
The binary is **~91 MB**.
|
|
428
|
+
The binary is **~91 MB**. This library targets Node.js server environments where the data is loaded once at startup.
|
|
445
429
|
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
```
|
|
430
|
+
Not recommended for:
|
|
431
|
+
- **Serverless/edge** — cold-start latency from loading the 91 MB binary
|
|
432
|
+
- **Browser/Web Workers** — download size prohibitive for most users
|
|
433
|
+
- **Cloudflare Workers** — fits within the 128 MB memory limit, but cold starts are slow
|
|
451
434
|
|
|
452
|
-
|
|
435
|
+
For browser applications, run lemmatization server-side and expose an API endpoint.
|
|
453
436
|
|
|
454
437
|
### No Query Expansion
|
|
455
438
|
|
|
@@ -573,4 +556,15 @@ pnpm build:data # rebuild binary from BÍN source
|
|
|
573
556
|
|
|
574
557
|
## License
|
|
575
558
|
|
|
576
|
-
MIT
|
|
559
|
+
MIT for the code.
|
|
560
|
+
|
|
561
|
+
### Data License (BÍN)
|
|
562
|
+
|
|
563
|
+
The linguistic data is derived from [BÍN](https://bin.arnastofnun.is/) (Beygingarlýsing íslensks nútímamáls) © Árni Magnússon Institute for Icelandic Studies.
|
|
564
|
+
|
|
565
|
+
**By using this package, you agree to BÍN's conditions:**
|
|
566
|
+
- Credit the Árni Magnússon Institute in your product's UI
|
|
567
|
+
- Do not redistribute the raw data separately
|
|
568
|
+
- Do not publish inflection paradigms without permission
|
|
569
|
+
|
|
570
|
+
Full terms: [BÍN License Conditions](https://bin.arnastofnun.is/DMII/LTdata/conditions/)
|