lemma-is 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +31 -12
- package/package.json +13 -14
package/README.md
CHANGED
|
@@ -26,20 +26,39 @@ If a user searches "hestur" but your document contains "hestinum", they won't fi
|
|
|
26
26
|
|
|
27
27
|
## Why lemma-is?
|
|
28
28
|
|
|
29
|
-
|
|
29
|
+
GreynirEngine remains the gold standard for **sentence parsing** and grammatical analysis in Icelandic. But full parsing is not forgiving: if a sentence doesn't parse, you don't get disambiguated lemmas. That makes it a poor fit for messy, real‑world search indexing where recall matters.
|
|
30
30
|
|
|
31
|
-
|
|
31
|
+
GreynirEngine also exposes a non‑parsing lemmatizer via its `bintokenizer`/`simple_lemmatize` pipeline, which can return all possible lemmas for a token. This is more forgiving but **overindexes heavily** without sentence‑level disambiguation.
|
|
32
32
|
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
|
38
|
-
|
|
39
|
-
| **
|
|
40
|
-
| **
|
|
33
|
+
lemma-is targets this gap: high‑recall lemmatization for search, tolerant of noise, with light disambiguation and compound splitting, and it runs anywhere JavaScript runs.
|
|
34
|
+
|
|
35
|
+
IFD benchmark summary (lemma recall + overindexing measured against gold lemmas in the Icelandic Frequency Dictionary corpus):
|
|
36
|
+
|
|
37
|
+
| | lemma-is core | lemma-is full | GreynirEngine (BÍN lookup) |
|
|
38
|
+
|---|---|---|---|
|
|
39
|
+
| **Runtime** | Node, Bun, Deno | Node, Bun, Deno | Python |
|
|
40
|
+
| **Throughput** | ~19.0M words/min | ~14.7M words/min | ~13.3K words/min |
|
|
41
|
+
| **Recall (IFD)** | 95.996% | 98.585% | 81.4% (parsed-only) |
|
|
42
|
+
| **Avg candidates** | 1.57 | 1.57 | 1.0 |
|
|
43
|
+
| **Overindexing (extraRate)** | 0.388 | 0.373 | 0.186 |
|
|
44
|
+
| **Memory (load)** | ~18.5 MB | ~182 MB | ~417 MB RSS |
|
|
45
|
+
| **Parse failures** | n/a | n/a | 27% (sample) |
|
|
46
|
+
| **Disambiguation** | Bigrams + grammar rules | Bigrams + grammar rules | Full grammar + BÍN |
|
|
47
|
+
| **Use case** | Search indexing | Search indexing | NLP analysis |
|
|
41
48
|
|
|
42
49
|
See [BENCHMARKS.md](./BENCHMARKS.md) for methodology and detailed results.
|
|
50
|
+
The IFD gold corpus is referenced here: `https://repository.clarin.is/repository/xmlui/handle/20.500.12537/36`.
|
|
51
|
+
GreynirEngine numbers are from full sentence parsing on a 1,000-sentence IFD sample; parse failures and tokenization mismatches lower measured recall. The bintokenizer-based lemmatizer is more forgiving but overindexes heavily when all lemmas are kept.
|
|
52
|
+
|
|
53
|
+
### Optimization summary (0.5.0)
|
|
54
|
+
|
|
55
|
+
- **Core memory**: ~18.5 MB load (heap + ArrayBuffers) for `lemma-is.core.bin`
|
|
56
|
+
- **Full memory**: ~182 MB load for `lemma-is.bin`
|
|
57
|
+
- **Greynir full parser memory**: ~417 MB RSS (sample run)
|
|
58
|
+
- **Core speed**: ~19.0M words/min; **Full speed**: ~14.7M words/min
|
|
59
|
+
- **Core recall**: 95.996% on IFD; **Full recall**: 98.585%
|
|
60
|
+
- **Core recall boost**: unknown‑word suffix fallback enabled only in core to raise recall without hurting full
|
|
61
|
+
- **Lower memory compound lookup**: Bloom filter known‑lemma lookup reduces RAM when splitting compounds
|
|
43
62
|
|
|
44
63
|
### The Trade-off
|
|
45
64
|
|
|
@@ -111,9 +130,9 @@ disambiguator.disambiguate("á", null, "borðinu");
|
|
|
111
130
|
Icelandic forms long compounds. Split them for better search coverage:
|
|
112
131
|
|
|
113
132
|
```typescript
|
|
114
|
-
import { CompoundSplitter,
|
|
133
|
+
import { CompoundSplitter, createKnownLemmaFilter } from "lemma-is";
|
|
115
134
|
|
|
116
|
-
const knownLemmas =
|
|
135
|
+
const knownLemmas = createKnownLemmaFilter(lemmatizer.getAllLemmas());
|
|
117
136
|
const splitter = new CompoundSplitter(lemmatizer, knownLemmas);
|
|
118
137
|
|
|
119
138
|
splitter.split("landbúnaðarráðherra");
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "lemma-is",
|
|
3
|
-
"version": "0.4.0",
|
|
3
|
+
"version": "0.5.0",
|
|
4
4
|
"description": "Icelandic word form to lemma lookup for browser and Node.js",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"icelandic",
|
|
@@ -27,17 +27,6 @@
|
|
|
27
27
|
"data-dist/lemma-is.core.bin",
|
|
28
28
|
"README.md"
|
|
29
29
|
],
|
|
30
|
-
"scripts": {
|
|
31
|
-
"build": "tsdown",
|
|
32
|
-
"build:data": "uv run python scripts/build-data.py",
|
|
33
|
-
"build:binary": "uv run python scripts/build-binary.py",
|
|
34
|
-
"build:core": "uv run python scripts/build-binary.py --no-bigrams --no-morph --top-words 350000 --output data-dist/lemma-is.core.bin",
|
|
35
|
-
"benchmark:core-sweep": "node --import=tsx scripts/benchmark/core-sweep.ts",
|
|
36
|
-
"test": "NODE_OPTIONS='--max-old-space-size=8192' vitest run",
|
|
37
|
-
"test:watch": "NODE_OPTIONS='--max-old-space-size=8192' vitest",
|
|
38
|
-
"typecheck": "tsc --noEmit",
|
|
39
|
-
"serve": "python3 -m http.server 8080"
|
|
40
|
-
},
|
|
41
30
|
"devDependencies": {
|
|
42
31
|
"@types/node": "^22.0.0",
|
|
43
32
|
"tsdown": "^0.20.1",
|
|
@@ -45,11 +34,21 @@
|
|
|
45
34
|
"typescript": "^5.9.3",
|
|
46
35
|
"vitest": "^4.0.18"
|
|
47
36
|
},
|
|
48
|
-
"packageManager": "pnpm@10.10.0",
|
|
49
37
|
"dependencies": {
|
|
50
38
|
"dawg-lookup": "^2.2.1",
|
|
51
39
|
"dawg-set": "^0.0.0",
|
|
52
40
|
"tokenize-is": "^0.1.0",
|
|
53
41
|
"trie-mapping": "^4.0.0"
|
|
42
|
+
},
|
|
43
|
+
"scripts": {
|
|
44
|
+
"build": "tsdown",
|
|
45
|
+
"build:data": "uv run python scripts/build-data.py",
|
|
46
|
+
"build:binary": "uv run python scripts/build-binary.py",
|
|
47
|
+
"build:core": "uv run python scripts/build-binary.py --no-bigrams --no-morph --top-words 350000 --output data-dist/lemma-is.core.bin",
|
|
48
|
+
"benchmark:core-sweep": "node --import=tsx scripts/benchmark/core-sweep.ts",
|
|
49
|
+
"test": "NODE_OPTIONS='--max-old-space-size=8192' vitest run",
|
|
50
|
+
"test:watch": "NODE_OPTIONS='--max-old-space-size=8192' vitest",
|
|
51
|
+
"typecheck": "tsc --noEmit",
|
|
52
|
+
"serve": "python3 -m http.server 8080"
|
|
54
53
|
}
|
|
55
|
-
}
|
|
54
|
+
}
|