lemma-is 0.2.3 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +127 -424
- package/data-dist/lemma-is.core.bin +0 -0
- package/dist/index.d.mts +38 -7
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +1 -1
- package/dist/index.mjs.map +1 -1
- package/package.json +5 -2
- package/data-dist/lemma-is.bin +0 -0
|
Binary file
|
package/dist/index.d.mts
CHANGED
|
@@ -250,8 +250,6 @@ interface DisambiguatorOptions {
|
|
|
250
250
|
leftWeight?: number;
|
|
251
251
|
/** Weight for right context (next word) */
|
|
252
252
|
rightWeight?: number;
|
|
253
|
-
/** Enable phrase-based disambiguation */
|
|
254
|
-
usePhraseRules?: boolean;
|
|
255
253
|
/** Enable preference rules (e.g., "á" context rules) */
|
|
256
254
|
usePreferenceRules?: boolean;
|
|
257
255
|
/** Enable grammar rules (case government) */
|
|
@@ -281,6 +279,10 @@ interface DisambiguatedToken {
|
|
|
281
279
|
interface MorphLemmatizerLike extends LemmatizerLike {
|
|
282
280
|
lemmatizeWithMorph?(word: string): LemmaWithMorph[];
|
|
283
281
|
}
|
|
282
|
+
interface DisambiguationContextHint {
|
|
283
|
+
prevLemmas?: string[];
|
|
284
|
+
nextLemmas?: string[];
|
|
285
|
+
}
|
|
284
286
|
/**
|
|
285
287
|
* Disambiguate lemmas using a multi-phase pipeline.
|
|
286
288
|
*/
|
|
@@ -289,7 +291,6 @@ declare class Disambiguator {
|
|
|
289
291
|
bigrams: BigramProvider | null;
|
|
290
292
|
leftWeight: number;
|
|
291
293
|
rightWeight: number;
|
|
292
|
-
usePhraseRules: boolean;
|
|
293
294
|
usePreferenceRules: boolean;
|
|
294
295
|
useGrammarRules: boolean;
|
|
295
296
|
constructor(lemmatizer: LemmatizerLike, bigrams?: BigramProvider | null, options?: DisambiguatorOptions);
|
|
@@ -300,7 +301,7 @@ declare class Disambiguator {
|
|
|
300
301
|
* @param prevWord - Previous word (left context), or null
|
|
301
302
|
* @param nextWord - Next word (right context), or null
|
|
302
303
|
*/
|
|
303
|
-
disambiguate(word: string, prevWord: string | null, nextWord: string | null): DisambiguatedToken;
|
|
304
|
+
disambiguate(word: string, prevWord: string | null, nextWord: string | null, hint?: DisambiguationContextHint): DisambiguatedToken;
|
|
304
305
|
/**
|
|
305
306
|
* Disambiguate an array of tokens.
|
|
306
307
|
*
|
|
@@ -363,6 +364,12 @@ declare function getRulesForWord(word: string): DisambiguationRule[];
|
|
|
363
364
|
declare function hasDisambiguationRules(word: string): boolean;
|
|
364
365
|
//#endregion
|
|
365
366
|
//#region src/mini-grammar.d.ts
|
|
367
|
+
/**
|
|
368
|
+
* Interface for lemmatizer used in grammar rules.
|
|
369
|
+
*/
|
|
370
|
+
interface GrammarLemmatizerLike {
|
|
371
|
+
lemmatizeWithPOS?(word: string): LemmaWithPOS[];
|
|
372
|
+
}
|
|
366
373
|
/**
|
|
367
374
|
* Preposition case government rules.
|
|
368
375
|
*
|
|
@@ -422,19 +429,41 @@ declare function applyPrepositionRule(candidates: LemmaWithMorph[], nextWordMorp
|
|
|
422
429
|
* @returns GrammarRuleMatch if a rule applies, null otherwise
|
|
423
430
|
*/
|
|
424
431
|
declare function applyPronounVerbRule(candidates: LemmaWithMorph[], prevWord: string | null): GrammarRuleMatch | null;
|
|
432
|
+
/**
|
|
433
|
+
* Apply noun-after-preposition rule to disambiguate.
|
|
434
|
+
*
|
|
435
|
+
* If the previous word is a preposition and the current word has a
|
|
436
|
+
* noun candidate with a case governed by that preposition, prefer
|
|
437
|
+
* the noun reading.
|
|
438
|
+
*
|
|
439
|
+
* This rule only applies when:
|
|
440
|
+
* - The previous word is UNAMBIGUOUSLY a preposition (no pronoun reading), OR
|
|
441
|
+
* - The current word has no verb candidate
|
|
442
|
+
*
|
|
443
|
+
* Example: "til fundar" → "fundar" is noun "fundur" (genitive), not verb "funda"
|
|
444
|
+
* Counter-example: "við fórum" → "við" is pronoun, "fórum" is verb "fara"
|
|
445
|
+
*
|
|
446
|
+
* @param candidates - All possible readings of the current word
|
|
447
|
+
* @param prevWord - The previous word (raw form)
|
|
448
|
+
* @param lemmatizer - Lemmatizer for looking up the previous word
|
|
449
|
+
* @returns GrammarRuleMatch if a rule applies, null otherwise
|
|
450
|
+
*/
|
|
451
|
+
declare function applyNounAfterPrepositionRule(candidates: LemmaWithMorph[], prevWord: string | null, lemmatizer: GrammarLemmatizerLike | null): GrammarRuleMatch | null;
|
|
425
452
|
/**
|
|
426
453
|
* Apply all mini-grammar rules in sequence.
|
|
427
454
|
*
|
|
428
455
|
* Rules are applied in order of specificity:
|
|
429
456
|
* 1. Preposition + case government (most reliable)
|
|
430
|
-
* 2.
|
|
457
|
+
* 2. Noun after preposition (governed case)
|
|
458
|
+
* 3. Pronoun + verb pattern
|
|
431
459
|
*
|
|
432
460
|
* @param candidates - All possible readings of the current word
|
|
433
461
|
* @param prevWord - Previous word (raw form)
|
|
434
462
|
* @param nextWordMorph - Morphological analyses of the next word
|
|
463
|
+
* @param lemmatizer - Optional lemmatizer for looking up previous word POS
|
|
435
464
|
* @returns GrammarRuleMatch if any rule applies, null otherwise
|
|
436
465
|
*/
|
|
437
|
-
declare function applyGrammarRules(candidates: LemmaWithMorph[], prevWord: string | null, nextWordMorph: LemmaWithMorph[]): GrammarRuleMatch | null;
|
|
466
|
+
declare function applyGrammarRules(candidates: LemmaWithMorph[], prevWord: string | null, nextWordMorph: LemmaWithMorph[], lemmatizer?: GrammarLemmatizerLike | null): GrammarRuleMatch | null;
|
|
438
467
|
/**
|
|
439
468
|
* Check if a word is a known preposition.
|
|
440
469
|
*/
|
|
@@ -583,6 +612,8 @@ interface ProcessedToken {
|
|
|
583
612
|
confidence?: number;
|
|
584
613
|
/** Compound split result if applicable */
|
|
585
614
|
compoundSplit?: CompoundSplit;
|
|
615
|
+
/** Lemmas derived from compound parts (if any) */
|
|
616
|
+
compoundLemmas?: string[];
|
|
586
617
|
}
|
|
587
618
|
/**
|
|
588
619
|
* Options for text processing.
|
|
@@ -673,5 +704,5 @@ declare function runBenchmark(text: string, lemmatizer: LemmatizerLike, strategy
|
|
|
673
704
|
compoundSplitter?: CompoundSplitter;
|
|
674
705
|
}): ProcessingMetrics;
|
|
675
706
|
//#endregion
|
|
676
|
-
export { type BigramProvider, type BinaryLemmatizeOptions, BinaryLemmatizer, type BinaryLemmatizerOptions, CASE_NAMES, CONTEXTUAL_STOPWORDS, type CompoundSplit, type CompoundSplitMode, CompoundSplitter, type CompoundSplitterOptions, DISAMBIGUATION_RULES, type DisambiguatedToken, type DisambiguationRule, Disambiguator, type DisambiguatorOptions, GENDER_NAMES, type GrammarRuleMatch, type GrammaticalCase, type GrammaticalGender, type GrammaticalNumber, type LemmaWithMorph, type LemmaWithPOS, type LemmatizerLike, type MorphFeatures, NOMINATIVE_PRONOUNS, NUMBER_NAMES, PREPOSITION_CASES, PROTECTED_LEMMAS, type ProcessOptions, type ProcessedToken, type ProcessingMetrics, type ProcessingStrategy, STATIC_PHRASES, STOPWORDS_IS, type StaticPhrase, WORD_CLASS_NAMES, WORD_CLASS_NAMES_IS, type WordClass, applyGrammarRules, applyPrepositionRule, applyPronounVerbRule, canGovernCase, createKnownLemmaSet, extractDisambiguatedLemmas, extractIndexableLemmas, getGovernedCases, getPhraseInfo, getRulesForWord, hasDisambiguationRules, isContextualStopword, isKnownPhrase, isKnownPreposition, isStopword, matchPhrase, processText, removeStopwords, runBenchmark };
|
|
707
|
+
export { type BigramProvider, type BinaryLemmatizeOptions, BinaryLemmatizer, type BinaryLemmatizerOptions, CASE_NAMES, CONTEXTUAL_STOPWORDS, type CompoundSplit, type CompoundSplitMode, CompoundSplitter, type CompoundSplitterOptions, DISAMBIGUATION_RULES, type DisambiguatedToken, type DisambiguationRule, Disambiguator, type DisambiguatorOptions, GENDER_NAMES, type GrammarLemmatizerLike, type GrammarRuleMatch, type GrammaticalCase, type GrammaticalGender, type GrammaticalNumber, type LemmaWithMorph, type LemmaWithPOS, type LemmatizerLike, type MorphFeatures, NOMINATIVE_PRONOUNS, NUMBER_NAMES, PREPOSITION_CASES, PROTECTED_LEMMAS, type ProcessOptions, type ProcessedToken, type ProcessingMetrics, type ProcessingStrategy, STATIC_PHRASES, STOPWORDS_IS, type StaticPhrase, WORD_CLASS_NAMES, WORD_CLASS_NAMES_IS, type WordClass, applyGrammarRules, applyNounAfterPrepositionRule, applyPrepositionRule, applyPronounVerbRule, canGovernCase, createKnownLemmaSet, extractDisambiguatedLemmas, extractIndexableLemmas, getGovernedCases, getPhraseInfo, getRulesForWord, hasDisambiguationRules, isContextualStopword, isKnownPhrase, isKnownPreposition, isStopword, matchPhrase, processText, removeStopwords, runBenchmark };
|
|
677
708
|
//# sourceMappingURL=index.d.mts.map
|
package/dist/index.d.mts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.mts","names":[],"sources":["../src/stopwords.ts","../src/types.ts","../src/binary-lemmatizer.ts","../src/disambiguate.ts","../src/disambiguation-rules.ts","../src/mini-grammar.ts","../src/compounds.ts","../src/phrases.ts","../src/pipeline.ts"],"mappings":";;AAUA;;;;;AA4EA;;cA5Ea,YAAA,EAAY,GAAA;;;AA0FzB;iBAdgB,UAAA,CAAW,IAAA;;;;AAuD3B;;;;;AAiBA;;cA1Da,oBAAA,EAAsB,GAAA,SAAY,GAAA;;;;;;;;;;;iBAyC/B,oBAAA,CAAqB,KAAA,UAAe,GAAA;;;;iBAiBpC,eAAA,kBAAA,CAAkC,KAAA,EAAO,CAAA,KAAM,CAAA;;;;AApJ/D;;;;;AA4EA;;;;KC3EY,SAAA;ADyFZ;;;AAAA,cC1Ea,gBAAA,EAAkB,MAAA,CAAO,SAAA;;ADmHtC;;cCnGa,mBAAA,EAAqB,MAAA,CAAO,SAAA;;;ADoHzC;KCpGY,eAAA;;;;KAKA,iBAAA;;;;KAKA,iBAAA;;;;cAKC,UAAA,EAAY,MAAA,CAAO,eAAA;;;;cAUnB,YAAA,EAAc,MAAA,CAAO,iBAAA;AAzDlC;;;AAAA,cAkEa,YAAA,EAAc,MAAA,CAAO,iBAAA;;AAlDlC;;UA0DiB,aAAA;EACf,IAAA,GAAO,eAAA;EACP,MAAA,GAAS,iBAAA;EACT,MAAA,GAAS,iBAAA;AAAA;;;;UAMM,YAAA;EACf,KAAA;EACA,GAAA,EAAK,SAAA;AAAA;;AA3CP;;UAiDiB,cAAA,SAAuB,YAAA;EACtC,KAAA,GAAQ,aAAA;AAAA;AA7CV;;;;AAAA,UAoDiB,cAAA;EACf,SAAA,CAAU,IAAA;EACV,gBAAA,EAAkB,IAAA,WAAe,YAAA;AAAA;;;AAnCnC;;UA0CiB,cAAA;EACf,IAAA,CAAK,KAAA,UAAe,KAAA;AAAA;;;UCjEL,uBAAA;EACf,KAAA,UAAe,KAAA;AAAA;AAAA,UAGA,sBAAA;EACf,SAAA,GAAY,SAAA;AAAA;AAAA,cAGD,gBAAA,YAA4B,cAAA,EAAgB,cAAA;EAAA,QAC/C,MAAA;EAAA,QACA,UAAA;EAAA,QACA,YAAA;EAAA,QACA,YAAA;EAAA,QACA,WAAA;EAAA,QACA,WAAA;EAAA,QACA,YAAA;EAAA,QACA,OAAA;EAAA,QACA,eAAA;EAAA,QACA,eAAA;EAAA,QACA,eAAA;EAAA,QACA,eAAA;EAAA,QACA,WAAA;EAAA,QAEA,UAAA;EAAA,QACA,SAAA;EAAA,QACA,UAAA;EAAA,QACA,WAAA;EAAA,QACA,OAAA;EAAA,QAEA,OAAA;EAAA,QAED,WAAA,CAAA;ED3EsC;AAgB/C;;EAhB+C,OCiKhC,IAAA,CACX,GAAA,UACA,OAAA,GAAS,uBAAA,GACR,OAAA,CAAQ,gBAAA;EDpJqB;;AAgBlC;EAhBkC,OCmKzB,cAAA,CAAe,MAAA,EAAQ,WAAA,GAAc,gBAAA;;;;UAOpC,SAAA;EDrJmB;;;EAAA,QC4JnB,QAAA;EDvJE;;;EAAA,QC8JF,OAAA;ED9JmB;AAK7B;;;EAL6B,QCsKnB,QAAA;EDjKqC;AAU/C;;;;ECiLE,SAAA,CAAU,IAAA,UAAc,OAAA,GAAS,sBAAA;EDxKtB;;;;;EAAA,QCkNH,WAAA;ED1MoB;;;;ECwO5B,gBAAA,CAAiB,IAAA,WAAe,YAAA;EDrON;;;;ECqQ1B,kBAAA,CAAmB,IAAA,WAAe,cAAA;EDtQzB;;;EC4ST,gBAAA,CAAA;ED3S0B;AAM5B;;EC4SE,UAAA,CAAA;ED1Sc;;;EAAA,QCiTN,UAAA;EDjTM;;AAMhB;;ECmVE,UAAA,CAAW,KAAA,UAAe,KAAA;EDnVwB;;;;EC4VlD,IAAA,CAAK,KAAA,UAAe,KAAA;ED3VC;AAOvB;;EC2VE,OAAA,CAAQ,IAAA;EDzVqC;;;EAAA,ICgWzC,eAAA,CAAA;EDhWc;;;EAAA,ICuWd,aAAA,CAAA;EDhWW;;;EAAA,ICuWX,gBAAA,CAAA;EDtWJ;;;EAAA,IC6WI,UAAA,CAAA;ED7W6B;;;;ECqXjC,YAAA,CAAA;AAAA;;;UCxee,oBAAA;EHyHD;EGvHd,UAAA;;EAEA,WAAA;EHqH8D;EGnH9D,
|
|
1
|
+
{"version":3,"file":"index.d.mts","names":[],"sources":["../src/stopwords.ts","../src/types.ts","../src/binary-lemmatizer.ts","../src/disambiguate.ts","../src/disambiguation-rules.ts","../src/mini-grammar.ts","../src/compounds.ts","../src/phrases.ts","../src/pipeline.ts"],"mappings":";;AAUA;;;;;AA4EA;;cA5Ea,YAAA,EAAY,GAAA;;;AA0FzB;iBAdgB,UAAA,CAAW,IAAA;;;;AAuD3B;;;;;AAiBA;;cA1Da,oBAAA,EAAsB,GAAA,SAAY,GAAA;;;;;;;;;;;iBAyC/B,oBAAA,CAAqB,KAAA,UAAe,GAAA;;;;iBAiBpC,eAAA,kBAAA,CAAkC,KAAA,EAAO,CAAA,KAAM,CAAA;;;;AApJ/D;;;;;AA4EA;;;;KC3EY,SAAA;ADyFZ;;;AAAA,cC1Ea,gBAAA,EAAkB,MAAA,CAAO,SAAA;;ADmHtC;;cCnGa,mBAAA,EAAqB,MAAA,CAAO,SAAA;;;ADoHzC;KCpGY,eAAA;;;;KAKA,iBAAA;;;;KAKA,iBAAA;;;;cAKC,UAAA,EAAY,MAAA,CAAO,eAAA;;;;cAUnB,YAAA,EAAc,MAAA,CAAO,iBAAA;AAzDlC;;;AAAA,cAkEa,YAAA,EAAc,MAAA,CAAO,iBAAA;;AAlDlC;;UA0DiB,aAAA;EACf,IAAA,GAAO,eAAA;EACP,MAAA,GAAS,iBAAA;EACT,MAAA,GAAS,iBAAA;AAAA;;;;UAMM,YAAA;EACf,KAAA;EACA,GAAA,EAAK,SAAA;AAAA;;AA3CP;;UAiDiB,cAAA,SAAuB,YAAA;EACtC,KAAA,GAAQ,aAAA;AAAA;AA7CV;;;;AAAA,UAoDiB,cAAA;EACf,SAAA,CAAU,IAAA;EACV,gBAAA,EAAkB,IAAA,WAAe,YAAA;AAAA;;;AAnCnC;;UA0CiB,cAAA;EACf,IAAA,CAAK,KAAA,UAAe,KAAA;AAAA;;;UCjEL,uBAAA;EACf,KAAA,UAAe,KAAA;AAAA;AAAA,UAGA,sBAAA;EACf,SAAA,GAAY,SAAA;AAAA;AAAA,cAGD,gBAAA,YAA4B,cAAA,EAAgB,cAAA;EAAA,QAC/C,MAAA;EAAA,QACA,UAAA;EAAA,QACA,YAAA;EAAA,QACA,YAAA;EAAA,QACA,WAAA;EAAA,QACA,WAAA;EAAA,QACA,YAAA;EAAA,QACA,OAAA;EAAA,QACA,eAAA;EAAA,QACA,eAAA;EAAA,QACA,eAAA;EAAA,QACA,eAAA;EAAA,QACA,WAAA;EAAA,QAEA,UAAA;EAAA,QACA,SAAA;EAAA,QACA,UAAA;EAAA,QACA,WAAA;EAAA,QACA,OAAA;EAAA,QAEA,OAAA;EAAA,QAED,WAAA,CAAA;ED3EsC;AAgB/C;;EAhB+C,OCiKhC,IAAA,CACX,GAAA,UACA,OAAA,GAAS,uBAAA,GACR,OAAA,CAAQ,gBAAA;EDpJqB;;AAgBlC;EAhBkC,OCmKzB,cAAA,CAAe,MAAA,EAAQ,WAAA,GAAc,gBAAA;;;;UAOpC,SAAA;EDrJmB;;;EAAA,QC4JnB,QAAA;EDvJE;;;EAAA,QC8JF,OAAA;ED9JmB;AAK7B;;;EAL6B,QCsKnB,QAAA;EDjKqC;AAU/C;;;;ECiLE,SAAA,CAAU,IAAA,UAAc,OAAA,GAAS,sBAAA;EDxKtB;;;;;EAAA,QCkNH,WAAA;ED1MoB;;;;ECwO5B,gBAAA,CAAiB,IAAA,WAAe,YAAA;EDrON;;;;ECqQ1B,kBAAA,CAAmB,IAAA,WAAe,cAAA;EDtQzB;;;EC4ST,gBAAA,CAAA;ED3S0B;AAM5B;;EC4SE,UAAA,CAAA;ED1Sc;;;EAAA,QCiTN,UAAA;EDjTM;;AAMhB;;ECmVE,UAAA,CAAW,KAAA,UAAe,KAAA;EDnVwB;;;;EC4VlD,IAAA,CAAK,KAAA,UAAe,KAAA;ED3VC;AAOvB;;EC2VE,OAAA,CAAQ,IAAA;EDzVqC;;;EAAA,ICgWzC,eAAA,CAAA;EDhWc;;;EAAA,ICuWd,aAAA,CAAA;EDhWW;;;EAAA,ICuWX,gBAAA,CAAA;EDtWJ;;;EAAA,IC6WI,UAAA,CAAA;ED7W6B;;;;ECqXjC,YAAA,CAAA;AAAA;;;UCxee,oBAAA;EHyHD;EGvHd,UAAA;;EAEA,WAAA;EHqH8D;EGnH9D,kBAAA;EHoI6B;EGlI7B,eAAA;AAAA;AAAA,UAGe,kBAAA;EH+HwC;EG7HvD,KAAA;EH6H6D;EG3H7D,KAAA;EH2H8D;EGzH9D,GAAA,GAAM,SAAA;;EAEN,UAAA;EF5BU;EE8BV,iBAAA,GAAoB,YAAA;;EAEpB,SAAA;EFhCmB;EEkCnB,UAAA;EFRD;EEUC,UAAA;AAAA;;AFLF;;UEWU,mBAAA,SAA4B,cAAA;EACpC,kBAAA,EAAoB,IAAA,WAAe,cAAA;AAAA;AAAA,UAuBpB,yBAAA;EACf,UAAA;EACA,UAAA;AAAA;;AFhBF;;cE6Pa,aAAA;EACX,UAAA,EAAY,mBAAA;EACZ,OAAA,EAAS,cAAA;EACT,UAAA;EACA,WAAA;EACA,kBAAA;EACA,eAAA;cAGE,UAAA,EAAY,cAAA,EACZ,OAAA,GAAS,cAAA,SACT,OAAA,GAAS,oBAAA;EF9PA;;;;;AAUb;;EEqQE,YAAA,CACE,IAAA,UACA,QAAA,iBACA,QAAA,iBACA,IAAA,GAAM,yBAAA,GACL,kBAAA;EF1QsB;;AAS3B;;;;EEmUE,eAAA,CAAgB,MAAA,aAAmB,kBAAA;EF3TpB;;;;;;EE+Uf,aAAA,CAAc,MAAA,aAAmB,GAAA;AAAA;;;;iBAenB,0BAAA,CACd,IAAA,UACA,UAAA,EAAY,cAAA,EACZ,OAAA,EAAS,cAAA,EACT,OAAA;EACE,QAAA,IAAY,IAAA;EACZ,eAAA;AAAA,IAED,GAAA;;;;;;;AHtWH;;UIrFiB,kBAAA;EJqFkB;EInFjC,IAAA;EJ4Hc;EI1Hd,MAAA,EAAQ,SAAA;;EAER,IAAA,EAAM,SAAA;EJwHwD;EItH9D,OAAA;EJuI6B;EIrI7B,WAAA;AAAA;;;;;;;;;;cAYW,oBAAA,EAAsB,kBAAA;;;;iBA6InB,eAAA,CAAgB,IAAA,WAAe,kBAAA;AHxJ/C;;;AAAA,iBGgKgB,sBAAA,CAAuB,IAAA;;;AJtFvC;;;AAAA,UK/EiB,qBAAA;EACf,gBAAA,EAAkB,IAAA,WAAe,YAAA;AAAA;;;;;ALwInC;;;;;cK5Ha,iBAAA,EAAmB,GAAA,SAAY,GAAA,CAAI,eAAA;;;;;;cA+CnC,mBAAA,EAAmB,GAAA;;AJtEhC;;UIsFiB,gBAAA;EJtFI;EIwFnB,KAAA;EJzEW;EI2EX,GAAA,EAAK,SAAA;;EAEL,IAAA;EJ7E6C;EI+E7C,UAAA;AAAA;;;;AJ/CF;;;;iBIyDgB,aAAA,CACd,SAAA,UACA,YAAA,EAAc,eAAA;AJtDhB;;;;;AAKA;;;;;AALA,iBIuEgB,oBAAA,CACd,UAAA,EAAY,cAAA,IACZ,aAAA,EAAe,cAAA,KACd,gBAAA;;;;;AJtDH;;;;;AASA;iBI6EgB,oBAAA,CACd,UAAA,EAAY,cAAA,IACZ,QAAA,kBACC,gBAAA;;;;AJxEH;;;;;;;;;;;;;;;;iBIqHgB,6BAAA,CACd,UAAA,EAAY,cAAA,IACZ,QAAA,iBACA,UAAA,EAAY,qBAAA,UACX,gBAAA;;;;;;;;;AJxGH;;;;;;iBI8JgB,iBAAA,CACd,UAAA,EAAY,cAAA,IACZ,QAAA,iBACA,aAAA,EAAe,cAAA,IACf,UAAA,GAAY,qBAAA,UACX,gBAAA;;;AJ3JH;iBI8KgB,kBAAA,CAAmB,KAAA;;;;iBAOnB,gBAAA,CAAiB,SAAA,WAAoB,GAAA,CAAI,eAAA;;;;;ALrKzD;;cMzHa,gBAAA,EAAgB,GAAA;AAAA,UAoFZ,aAAA;ENqC+C;EMnC9D,IAAA;ENoD6B;EMlD7B,KAAA;ENkD8D;EMhD9D,UAAA;ENgDuD;EM9CvD,UAAA;EN8C6D;EM5C7D,UAAA;AAAA;;;;ALvGF;;;;KKiHY,iBAAA;AAAA,UAEK,uBAAA;ELzFhB;;;;EK8FC,aAAA;EL9ED;EKgFC,iBAAA;EL3FgC;;AAgBlC;;EKgFE,IAAA,GAAO,iBAAA;AAAA;AAAA,cA8EI,gBAAA;EAAA,QACH,UAAA;EAAA,QACA,aAAA;EAAA,QACA,iBAAA;EAAA,QACA,WAAA;EAAA,QACA,IAAA;cAGN,UAAA,EAAY,cAAA,EACZ,WAAA,EAAa,GAAA,UACb,OAAA,GAAS,uBAAA;EL9JgB;;;EAAA,QK0KnB,OAAA;ELrKG;;;;;AAUb;;;EK6KE,KAAA,CAAM,IAAA,WAAe,aAAA;EL7K4B;AASnD;;EATmD,QK0RzC,aAAA;EAAA,QAwBA,QAAA;ELzSyC;AAQnD;;;EKqWE,YAAA,CAAa,IAAA;AAAA;;;;;iBAUC,mBAAA,CAAoB,MAAA,aAAmB,GAAA;;;;ANzcvD;;;;;AA4EA;;;;;AAcA;UOvFiB,YAAA;;EAEf,KAAA;EPqFgD;EOnFhD,UAAA;EP4HkC;EO1HlC,GAAA;AAAA;;AP2IF;;;cOpIa,cAAA,EAAgB,GAAA,SAAY,YAAA;;;;;iBA6GzB,WAAA,CACd,KAAA,YACA,UAAA;EACG,MAAA,EAAQ,YAAA;EAAc,SAAA;AAAA;;AN/H3B;;iBM+IgB,aAAA,CAAc,IAAA;;;ANhI9B;iBMuIgB,aAAA,CAAc,IAAA,WAAe,YAAA;;;;;;UC5H5B,cAAA;ER4Ff;EQ1FA,QAAA;ER6DiC;EQ3DjC,IAAA;ERoGc;EQlGd,MAAA;;EAEA,QAAA;ERgG8D;EQ9F9D,aAAA;ER+G6B;EQ7G7B,UAAA;ER6G8D;EQ3G9D,aAAA,GAAgB,aAAA;ER2GuC;EQzGvD,cAAA;AAAA;;;;UAMe,cAAA;;EAEf,OAAA,GAAU,cAAA;EPlDS;EOoDnB,gBAAA,GAAmB,gBAAA;EPpDA;EOsDnB,eAAA;EPvCW;;;;;AAgBb;EO8BE,sBAAA;;EAEA,cAAA;EPhCgD;AAgBlD;;;;;EOuBE,kBAAA;EPlB2B;;;;AAK7B;;EOoBE,kBAAA;AAAA;;APfF;;;;;AAUA;;iBOgBgB,WAAA,CACd,IAAA,UACA,UAAA,EAAY,cAAA,EACZ,OAAA,GAAS,cAAA,GACR,cAAA;;;APXH;;;;;AAQA;iBOiJgB,sBAAA,CACd,IAAA,UACA,UAAA,EAAY,cAAA,EACZ,OAAA,GAAS,cAAA,GACR,GAAA;;;;KAiES,kBAAA;;;;UAKK,iBAAA;EPzNf;EO2NA,SAAA;EP1NA;EO4NA,eAAA;EP5N0B;EO8N1B,QAAA;EPxNe;EO0Nf,cAAA;;EAEA,aAAA;EP3NA;EO6NA,aAAA;EP5NK;EO8NL,cAAA;EP9Nc;EOgOd,eAAA;EP1N8B;EO4N9B,YAAA;EP5NkD;EO8NlD,MAAA;AAAA;;;;iBAMc,YAAA,CACd,IAAA,UACA,UAAA,EAAY,cAAA,EACZ,QAAA,EAAU,kBAAA,EACV,SAAA;EACE,OAAA,GAAU,cAAA;EACV,gBAAA,GAAmB,gBAAA;AAAA,IAEpB,iBAAA"}
|
package/dist/index.mjs
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import{tokenize as e}from"tokenize-is";const t=new Set(`á.að.aðra.aðrar.aðrir.af.alla.allan.allar.allir.allnokkra.allnokkrar.allnokkrir.allnokkru.allnokkrum.allnokkuð.allnokkur.allnokkurn.allnokkurra.allnokkurrar.allnokkurri.allnokkurs.allnokkurt.allra.allrar.allri.alls.allt.alltað.allur.án.andspænis.annað.annaðhvort.annan.annar.annarra.annarrar.annarri.annars.árla.ásamt.auk.austan.austanundir.austur.báða.báðar.báðir.báðum.bæði.bak.beggja.eða.eður.ef.eftir.ég.ein.eina.einar.einhver.einhverja.einhverjar.einhverjir.einhverju.einhverjum.einhvern.einhverra.einhverrar.einhverri.einhvers.einir.einn.einna.einnar.einni.eins.einskis.einu.einum.eitt.eitthvað.eitthvert.ekkert.ella.ellegar.en.enda.enga.engan.engar.engin.enginn.engir.engra.engrar.engri.engu.engum.er.fáein.fáeina.fáeinar.fáeinir.fáeinna.fáeinum.fjær.fjarri.flestalla.flestallan.flestallar.flestallir.flestallra.flestallrar.flestallri.flestalls.flestallt.flestallur.flestöll.flestöllu.flestöllum.frá.fram.fyrir.fyrst.gagnstætt.gagnvart.gegn.gegnt.gegnum.hana.handa.handan.hann.hans.heldur.hennar.henni.hið.hin.hina.hinar.hinir.hinn.hinna.hinnar.hinni.hins.hinu.hinum.hitt.hjá.honum.hún.hvað.hvaða.hvenær.hver.hverja.hverjar.hverjir.hverju.hverjum.hvern.hverra.hverrar.hverri.hvers.hvert.hvílík.hvílíka.hvílíkan.hvílíkar.hvílíkir.hvílíkra.hvílíkrar.hvílíkri.hvílíks.hvílíkt.hvílíku.hvílíkum.hvílíkur.hvor.hvora.hvorar.hvorir.hvorki.hvorn.hvorra.hvorrar.hvorri.hvors.hvort.hvoru.hvorug.hvoruga.hvorugan.hvorugar.hvorugir.hvorugra.hvorugrar.hvorugri.hvorugs.hvorugt.hvorugu.hvorugum.hvorugur.hvorum.í.inn.innan.innanundir.jafnframt.jafnhliða.kring.kringum.með.meðal.meðan.meður.mér.mestalla.mestallan.mestallar.mestallir.mestallra.mestallrar.mestallri.mestalls.mestallt.mestallur.mestöll.mestöllu.mestöllum.miðli.mig.milli.millum.mín.mína.mínar.mínir.minn.minna.minnar.minni.míns.mínu.mínum.mitt.mót.móti.nær.nærri.næst.næstum.nálægt.né.neðan.nein.neina.neinar.neinir.neinn.neinna.neinnar.neinni.neins.neinu.neinum.neitt.nema.niður.nokkra.nokkrar.nokkrir.nokkru.nokkrum.nokkuð.nokkur.nokkurn.nokkurra.nokkurrar.nokkurri.nokkurs.nokkurt.norðan.nú.öðru.öðrum.of.ofan.ofar.og.óháð.okkar.okkur.öll.öllu.öllum.önnur.órafjarri.oss.sá.sakir.sama.saman.samar.samfara.samhliða.sami.samir.samkvæmt.samra.samrar.samri.sams.samskipa.samt.samtímis.samur.sem.sér.sérhvað.sérhver.sérhverja.sérhverjar.sérhverjir.sérhverju.sérhverjum.sérhvern.sérhverra.sérhverrar.sérhverri.sérhvers.sérhvert.síðan.síðla.sig.sín.sína.sínar.sínhver.sínhverja.sínhverjar.sínhverjir.sínhverju.sínhverjum.sínhvern.sínhverra.sínhverrar.sínhverri.sínhvers.sínhvert.sínhvor.sínhvora.sínhvorar.sínhvorir.sínhvorn.sínhvorra.sínhvorrar.sínhvorri.sínhvors.sínhvort.sínhvoru.sínhvorum.sínir.sinn.sinna.sinnar.sinnhver.sinnhverja.sinnhverjar.sinnhverjir.sinnhverju.sinnhverjum.sinnhvern.sinnhverra.sinnhverrar.sinnhverri.sinnhvers.sinnhvert.sinnhvor.sinnhvora.sinnhvorar.sinnhvorir.sinnhvorn.sinnhvorra.sinnhvorrar.sinnhvorri.sinnhvors.sinnhvort.sinnhvoru.sinnhvorum.sinni.síns.sínu.sínum.sitt.sitthvað.sitthver.sitthverja.sitthverjar.sitthverjir.sitthverju.sitthverjum.sitthvern.sitthverra.sitthverrar.sitthverri.sitthvers.sitthvert.sitthvor.sitthvora.sitthvorar.sitthvorir.sitthvorn.sitthvorra.sitthvorrar.sitthvorri.sitthvors.sitthvort.sitthvoru.sitthvorum.sjálf.sjálfa.sjálfan.sjálfar.sjálfir.sjálfra.sjálfrar.sjálfri.sjálfs.sjálft.sjálfu.sjálfum.sjálfur.slík.slíka.slíkan.slíkar.slíkir.slíkra.slíkrar.slíkri.slíks.slíkt.slíku.slíkum.slíkur.snemma.sökum.söm.sömu.sömum.sú.sum.suma.suman.sumar.sumir.sumra.sumrar.sumri.sums.sumt.sumu.sumum.sumur.sunnan.svo.til.tráss.um.umfram.umhverfis.undan.undir.uns.upp.úr.út.utan.útundan.vegna.vér.vestan.vestur.vettugi.við.viður.vor.vora.vorar.vorir.vorn.vorra.vorrar.vorri.vors.vort.voru.vorum.yðar.yður.yfir.ykkar.ykkur.ýmis.ýmiss.ýmissa.ýmissar.ýmissi.ýmist.ýmsa.ýmsan.ýmsar.ýmsir.ýmsu.ýmsum.þá.það.þær.þann.þar.þau.þegar.þeim.þeir.þeirra.þeirrar.þeirri.þennan.þér.þess.þessa.þessar.þessara.þessarar.þessari.þessi.þessir.þessu.þessum.þetta.þið.þig.þín.þína.þínar.þínir.þinn.þinna.þinnar.þinni.þíns.þínu.þínum.þitt.þó.þónokkra.þónokkrar.þónokkrir.þónokkru.þónokkrum.þónokkuð.þónokkur.þónokkurn.þónokkurra.þónokkurrar.þónokkurri.þónokkurs.þónokkurt.þótt.þú.því.þvílík.þvílíka.þvílíkan.þvílíkar.þvílíkir.þvílíkra.þvílíkrar.þvílíkri.þvílíks.þvílíkt.þvílíku.þvílíkum.þvílíkur`.split(`.`));function n(e){return t.has(e.toLowerCase())}const r=new Map([[`á`,new Set([`fs`,`ao`])],[`við`,new Set([`fs`,`fn`])],[`af`,new Set([`fs`,`ao`])],[`til`,new Set([`fs`])],[`um`,new Set([`fs`])],[`frá`,new Set([`fs`])],[`yfir`,new Set([`fs`,`ao`])],[`undir`,new Set([`fs`,`ao`])],[`fyrir`,new Set([`fs`,`ao`])],[`eftir`,new Set([`fs`,`ao`])],[`gegn`,new Set([`fs`])],[`hjá`,new Set([`fs`])],[`úr`,new Set([`fs`])],[`í`,new Set([`fs`])]]);function i(e,n){let i=e.toLowerCase(),a=r.get(i);return a&&n?a.has(n):t.has(i)}function a(e){return e.filter(e=>!n(e))}const o=1279610177,s=[`no`,`so`,`lo`,`ao`,`fs`,`fn`,`st`,`to`,`gr`,`uh`],c=[void 0,`nf`,`þf`,`þgf`,`ef`],l=[void 0,`kk`,`kvk`,`hk`],u=[`et`,`ft`];var d=class e{buffer;stringPool;lemmaOffsets;lemmaLengths;wordOffsets;wordLengths;entryOffsets;entries;bigramW1Offsets;bigramW1Lengths;bigramW2Offsets;bigramW2Lengths;bigramFreqs;lemmaCount;wordCount;entryCount;bigramCount;version;decoder=new TextDecoder(`utf-8`);constructor(e){this.buffer=e;let t=new DataView(e),n=t.getUint32(0,!0);if(n!==o)throw Error(`Invalid binary format: expected magic 0x${o.toString(16)}, got 0x${n.toString(16)}`);if(this.version=t.getUint32(4,!0),this.version!==1&&this.version!==2)throw Error(`Unsupported version: ${this.version}`);let r=t.getUint32(8,!0);this.lemmaCount=t.getUint32(12,!0),this.wordCount=t.getUint32(16,!0),this.entryCount=t.getUint32(20,!0),this.bigramCount=t.getUint32(24,!0);let i=32;this.stringPool=new Uint8Array(e,i,r),i+=r,this.lemmaOffsets=new Uint32Array(e,i,this.lemmaCount),i+=this.lemmaCount*4,this.lemmaLengths=new Uint8Array(e,i,this.lemmaCount),i+=this.lemmaCount,i=i+3&-4,this.wordOffsets=new Uint32Array(e,i,this.wordCount),i+=this.wordCount*4,this.wordLengths=new Uint8Array(e,i,this.wordCount),i+=this.wordCount,i=i+3&-4,this.entryOffsets=new Uint32Array(e,i,this.wordCount+1),i+=(this.wordCount+1)*4,this.entries=new Uint32Array(e,i,this.entryCount),i+=this.entryCount*4,this.bigramW1Offsets=new Uint32Array(e,i,this.bigramCount),i+=this.bigramCount*4,this.bigramW1Lengths=new Uint8Array(e,i,this.bigramCount),i+=this.bigramCount,i=i+3&-4,this.bigramW2Offsets=new Uint32Array(e,i,this.bigramCount),i+=this.bigramCount*4,this.bigramW2Lengths=new Uint8Array(e,i,this.bigramCount),i+=this.bigramCount,i=i+3&-4,this.bigramFreqs=new Uint32Array(e,i,this.bigramCount)}static async load(t,n={}){let r=await(n.fetch??fetch)(t);if(!r.ok)throw Error(`Failed to load binary data: ${r.status}`);return new e(await r.arrayBuffer())}static loadFromBuffer(t){return new e(t)}getString(e,t){return this.decoder.decode(this.stringPool.subarray(e,e+t))}getLemma(e){return this.getString(this.lemmaOffsets[e],this.lemmaLengths[e])}getWord(e){return this.getString(this.wordOffsets[e],this.wordLengths[e])}findWord(e){let t=0,n=this.wordCount-1;for(;t<=n;){let r=t+n>>>1,i=this.getWord(r);if(i===e)return r;i<e?t=r+1:n=r-1}return-1}lemmatize(e,t={}){let n=e.toLowerCase(),r=this.findWord(n);if(r===-1)return[n];let i=this.entryOffsets[r],a=this.entryOffsets[r+1],{wordClass:o}=t,c=new Set,l=[];for(let e=i;e<a;e++){let{lemmaIdx:t,posCode:n}=this.unpackEntry(this.entries[e]),r=s[n];if(o&&r!==o)continue;let i=this.getLemma(t);c.has(i)||(c.add(i),l.push(i))}return l.length===0?[n]:l}unpackEntry(e){return this.version===1?{lemmaIdx:e>>>4,posCode:e&15,caseCode:0,genderCode:0,numberCode:0}:{lemmaIdx:e>>>10,posCode:e&15,caseCode:e>>>4&7,genderCode:e>>>7&3,numberCode:e>>>9&1}}lemmatizeWithPOS(e){let t=e.toLowerCase(),n=this.findWord(t);if(n===-1)return[];let r=this.entryOffsets[n],i=this.entryOffsets[n+1],a=new Set,o=[];for(let e=r;e<i;e++){let{lemmaIdx:t,posCode:n}=this.unpackEntry(this.entries[e]),r=this.getLemma(t),i=s[n]??``,c=`${r}:${i}`;a.has(c)||(a.add(c),o.push({lemma:r,pos:i}))}return o}lemmatizeWithMorph(e){let t=e.toLowerCase(),n=this.findWord(t);if(n===-1)return[];let r=this.entryOffsets[n],i=this.entryOffsets[n+1],a=[];for(let e=r;e<i;e++){let{lemmaIdx:t,posCode:n,caseCode:r,genderCode:i,numberCode:o}=this.unpackEntry(this.entries[e]),d={},f=c[r],p=l[i],m=u[o];f&&(d.case=f),p&&(d.gender=p),m&&(d.number=m),a.push({lemma:this.getLemma(t),pos:s[n]??``,morph:Object.keys(d).length>0?d:void 0})}return a}hasMorphFeatures(){return this.version>=2}getVersion(){return this.version}findBigram(e,t){let n=0,r=this.bigramCount-1;for(;n<=r;){let i=n+r>>>1,a=this.getString(this.bigramW1Offsets[i],this.bigramW1Lengths[i]);if(a<e)n=i+1;else if(a>e)r=i-1;else{let e=this.getString(this.bigramW2Offsets[i],this.bigramW2Lengths[i]);if(e===t)return i;e<t?n=i+1:r=i-1}}return-1}bigramFreq(e,t){let n=this.findBigram(e.toLowerCase(),t.toLowerCase());return n===-1?0:this.bigramFreqs[n]}freq(e,t){return this.bigramFreq(e,t)}isKnown(e){return this.findWord(e.toLowerCase())!==-1}get lemmaCountValue(){return this.lemmaCount}get wordFormCount(){return this.wordCount}get bigramCountValue(){return this.bigramCount}get bufferSize(){return this.buffer.byteLength}getAllLemmas(){let e=[];for(let t=0;t<this.lemmaCount;t++)e.push(this.getLemma(t));return e}};const f=[{word:`á`,prefer:`so`,over:`fs`,context:`after_pronoun`,description:`á after pronoun = verb 'eiga' (I own, you own)`},{word:`á`,prefer:`fs`,over:`so`,context:`before_noun`,description:`á before noun = preposition (on, at)`},{word:`við`,prefer:`fn`,over:`fs`,context:`sentence_start`,description:`við at sentence start = pronoun 'we'`},{word:`við`,prefer:`fs`,over:`fn`,context:`before_noun`,description:`við before noun = preposition 'by/at'`},{word:`af`,prefer:`fs`,over:`ao`,context:`before_noun`,description:`af before noun = preposition 'of/from'`},{word:`til`,prefer:`fs`,over:`ao`,context:`before_noun`,description:`til before noun = preposition 'to'`},{word:`um`,prefer:`fs`,over:`ao`,context:`before_noun`,description:`um before noun = preposition 'about/around'`},{word:`yfir`,prefer:`fs`,over:`ao`,context:`before_noun`,description:`yfir before noun = preposition 'over'`},{word:`undir`,prefer:`fs`,over:`ao`,context:`before_noun`,description:`undir before noun = preposition 'under'`},{word:`fyrir`,prefer:`fs`,over:`ao`,context:`before_noun`,description:`fyrir before noun = preposition 'for/before'`},{word:`eftir`,prefer:`fs`,over:`ao`,context:`before_noun`,description:`eftir before noun = preposition 'after'`},{word:`frá`,prefer:`fs`,over:`ao`,context:`before_noun`,description:`frá before noun = preposition 'from'`},{word:`með`,prefer:`fs`,over:`ao`,context:`before_noun`,description:`með before noun = preposition 'with'`},{word:`í`,prefer:`fs`,over:`ao`,context:`before_noun`,description:`í before noun = preposition 'in'`},{word:`úr`,prefer:`fs`,over:`no`,context:`before_noun`,description:`úr before noun = preposition 'out of'`}];function p(e){let t=e.toLowerCase();return f.filter(e=>e.word===t)}function m(e){return f.some(t=>t.word===e.toLowerCase())}const h=new Map([[`á`,new Set([`þf`,`þgf`])],[`í`,new Set([`þf`,`þgf`])],[`við`,new Set([`þf`,`þgf`])],[`með`,new Set([`þf`,`þgf`])],[`undir`,new Set([`þf`,`þgf`])],[`yfir`,new Set([`þf`,`þgf`])],[`fyrir`,new Set([`þf`,`þgf`])],[`um`,new Set([`þf`])],[`gegnum`,new Set([`þf`])],[`kringum`,new Set([`þf`])],[`umhverfis`,new Set([`þf`])],[`af`,new Set([`þgf`])],[`frá`,new Set([`þgf`])],[`hjá`,new Set([`þgf`])],[`úr`,new Set([`þgf`])],[`að`,new Set([`þgf`])],[`móti`,new Set([`þgf`])],[`nálægt`,new Set([`þgf`])],[`gegn`,new Set([`þgf`])],[`gagnvart`,new Set([`þgf`])],[`handa`,new Set([`þgf`])],[`meðal`,new Set([`ef`])],[`til`,new Set([`ef`])],[`án`,new Set([`ef`])],[`vegna`,new Set([`ef`])],[`sakir`,new Set([`ef`])],[`utan`,new Set([`ef`])],[`innan`,new Set([`ef`])],[`meðfram`,new Set([`þgf`])],[`milli`,new Set([`ef`])],[`auk`,new Set([`ef`])],[`í stað`,new Set([`ef`])]]),g=new Set([`ég`,`þú`,`hann`,`hún`,`það`,`við`,`þið`,`þeir`,`þær`,`þau`]);function _(e,t){return t?h.get(e)?.has(t)??!1:!1}function v(e,t){let n=e.filter(e=>e.pos===`fs`);if(n.length===0)return null;for(let e of n)for(let n of t)if(n.morph?.case&&_(e.lemma,n.morph.case))return{lemma:e.lemma,pos:`fs`,rule:`prep+${n.morph.case}`,confidence:.9};return null}function y(e,t){if(!t)return null;let n=t.toLowerCase();if(!g.has(n))return null;let r=e.filter(e=>e.pos===`so`);return r.length===0||!e.some(e=>e.pos!==`so`)?null:{lemma:(r.find(e=>e.lemma===`eiga`)??r[0]).lemma,pos:`so`,rule:`pronoun+verb`,confidence:.85}}function b(e,t,n){return v(e,n)||y(e,t)||null}function x(e){return h.has(e)}function S(e){return h.get(e)}const C={name:`unambiguous`,run(e){return e.length===1?{lemma:e[0].lemma,pos:e[0].pos,confidence:1}:null}},w={name:`preference_rules`,run(e,t,n){if(!n.usePreferenceRules)return null;for(let n of f){let r=T(n,e,t);if(r)return{lemma:r.lemma,pos:r.pos,confidence:.85}}return null}};function T(e,t,n){let r=t.find(t=>t.lemma.toLowerCase()===e.word.toLowerCase()&&t.pos===e.prefer),i=t.find(t=>t.lemma.toLowerCase()===e.word.toLowerCase()&&t.pos===e.over);if(!r||!i)return null;if(e.context===`before_noun`){let e=n.nextWord;if(e&&/^[A-ZÁÉÍÓÚÝÞÆÖ]/.test(e))return r}else if(e.context===`before_verb`){let e=n.nextWord?.toLowerCase();if(e&&![`þessi`,`þetta`,`sá`,`sú`,`það`,`hinn`,`hin`,`hið`].includes(e))return r}else if(e.context===`after_pronoun`){let e=n.prevWord?.toLowerCase();if(e&&[`ég`,`þú`,`hann`,`hún`,`það`,`við`,`þið`,`þeir`,`þær`,`þau`].includes(e))return r}return null}const E=[C,w,{name:`grammar_rules`,run(e,t,n){if(!n.useGrammarRules)return null;let r=e.map(e=>({...e,morph:void 0}));if(n.lemmatizer.lemmatizeWithMorph){let e=t.allTokens[t.index];if(e){let t=n.lemmatizer.lemmatizeWithMorph(e);r.length=0,r.push(...t)}}let i=b(r,t.prevWord,t.nextWordMorph??[]);return i?{lemma:i.lemma,pos:i.pos,confidence:i.confidence}:null}},{name:`word_bigrams`,run(e,t,n){if(!n.bigrams||e.length===0)return null;let r=[];for(let i of e){let e=0;if(t.prevWord){let r=t.prevLemmas||n.lemmatizer.lemmatize(t.prevWord);for(let t of r){let r=n.bigrams.freq(t,i.lemma);r>0&&(e+=Math.log(r+1)*n.leftWeight)}}if(t.nextWord){let r=t.nextLemmas||n.lemmatizer.lemmatize(t.nextWord);for(let t of r){let r=n.bigrams.freq(i.lemma,t);r>0&&(e+=Math.log(r+1)*n.rightWeight)}}r.push({candidate:i,score:e})}if(r.sort((e,t)=>t.score-e.score),r.length>0&&r[0].score>0){let e=r[0].score,t=r.reduce((e,t)=>e+Math.exp(t.score),0),n=t>0?Math.exp(e)/t:.5;return{lemma:r[0].candidate.lemma,pos:r[0].candidate.pos,confidence:n}}return null}},{name:`fallback`,run(e){return e.length>0?{lemma:e[0].lemma,pos:e[0].pos,confidence:1/e.length}:null}}];var D=class{lemmatizer;bigrams;leftWeight;rightWeight;usePhraseRules;usePreferenceRules;useGrammarRules;constructor(e,t=null,n={}){this.lemmatizer=e,this.bigrams=t,this.leftWeight=n.leftWeight??1,this.rightWeight=n.rightWeight??1,this.usePhraseRules=n.usePhraseRules??!0,this.usePreferenceRules=n.usePreferenceRules??!0,this.useGrammarRules=n.useGrammarRules??!0}disambiguate(e,t,n){let r;r=this.lemmatizer.lemmatizeWithPOS?this.lemmatizer.lemmatizeWithPOS(e):this.lemmatizer.lemmatize(e).map(e=>({lemma:e,pos:`no`}));let i=r.map(e=>e.lemma),a=e,o;n&&this.lemmatizer.lemmatizeWithMorph&&(o=this.lemmatizer.lemmatizeWithMorph(n));let s={prevWord:t,nextWord:n,nextWordMorph:o,allTokens:[e],index:0};for(let e of E){let t=e.run(r,s,this);if(t)return{token:a,lemma:t.lemma,pos:t.pos,candidates:i,candidatesWithPOS:r,ambiguous:i.length>1,confidence:t.confidence,resolvedBy:e.name}}return{token:a,lemma:e.toLowerCase(),candidates:i,candidatesWithPOS:r,ambiguous:!1,confidence:0,resolvedBy:`none`}}disambiguateAll(e){let t=[];for(let n=0;n<e.length;n++){let r=e[n],i=n>0?e[n-1]:null,a=n<e.length-1?e[n+1]:null;t.push(this.disambiguate(r,i,a))}return t}extractLemmas(e){let t=new Set,n=this.disambiguateAll(e);for(let e of n)t.add(e.lemma);return t}};function O(e,n,r,i={}){let{tokenize:a,removeStopwords:o}=i,s=a?a(e):e.split(/\s+/).filter(e=>e.length>0).map(e=>e.replace(/^[^\p{L}\p{N}]+|[^\p{L}\p{N}]+$/gu,``)).filter(e=>e.length>0),c=new D(n,r).extractLemmas(s);if(o)for(let e of c)t.has(e)&&c.delete(e);return c}const k={no:`noun`,so:`verb`,lo:`adjective`,ao:`adverb`,fs:`preposition`,fn:`pronoun`,st:`conjunction`,to:`numeral`,gr:`article`,uh:`interjection`},A={no:`nafnorð`,so:`sagnorð`,lo:`lýsingarorð`,ao:`atviksorð`,fs:`forsetning`,fn:`fornafn`,st:`samtenging`,to:`töluorð`,gr:`greinir`,uh:`upphrópun`},j={nf:`nominative`,þf:`accusative`,þgf:`dative`,ef:`genitive`},M={kk:`masculine`,kvk:`feminine`,hk:`neuter`},N={et:`singular`,ft:`plural`},P=new Set(`ísland.england.írland.skotland.finnland.grænland.holland.þýskaland.frakkland.pólland.tékkland.svissland.rússland.eistland.lettland.litháen.danmörk.noregur.svíþjóð.bandaríkin.spánn.portúgal.ítalía.grikkland.þingvellir.akureyri.ísafjörður.reykjavík.keflavík.hafnarfjörður.kópavogur.seltjarnarnes.garðabær.mosfellsbær.vestmannaeyjar.húsavík.sauðárkrókur.siglufjörður.ólafsfjörður.dalvík.egilsstaðir.neskaupstaður.seyðisfjörður.eskifjörður.reyðarfjörður.fáskrúðsfjörður.stöðvarfjörður.djúpivogur.höfn.vík.selfoss.hveragerði.þorlákshöfn.grindavík.sandgerði.borgarnes.stykkishólmur.grundarfjörður.ólafsvík.búðardalur.patreksfjörður.flateyri.suðureyri.bolungarvík.hólmavík.hvammstangi.blönduós.skagaströnd.varmahlíð.hlíðarendi.bergþórshvol.íslandsbanki.landsbankinn.arionbanki.alþingi`.split(`.`)),F=new Set(`maður.kona.stjóri.ráðherra.forseti.formaður.fulltrúi.starfsmaður.hús.staður.vegur.borg.bær.dalur.fjörður.félag.banki.sjóður.stofnun.ráð.rannsókn.greiðsla.mál.kerfi.verk.þjónusta.rekstur.viðskipti.verð.kostnaður`.split(`.`)),I=new Set([`vera`,`hafa`,`gera`,`fara`,`koma`,`segja`,`vilja`,`mega`,`þurfa`,`verða`,`geta`,`sjá`,`taka`,`eiga`,`láta`,`halda`,`leyfa`,`búa`]),L=[`s`,`u`,`a`];var R=class{lemmatizer;minPartLength;tryLinkingLetters;knownLemmas;mode;constructor(e,t,n={}){this.lemmatizer=e,this.knownLemmas=t,this.minPartLength=n.minPartLength??3,this.tryLinkingLetters=n.tryLinkingLetters??!0,this.mode=n.mode??`balanced`}noSplit(e,t){return{word:e,parts:t,indexTerms:t,confidence:0,isCompound:!1}}split(e){let t=e.toLowerCase(),n=this.lemmatizer.lemmatize(e),r=n[0]?.toLowerCase();if(r&&P.has(r)||P.has(t))return this.noSplit(e,n);let i=n.length>0&&n[0].toLowerCase()!==t,a=n.length===1;if(this.mode===`conservative`)return e.includes(`-`)?this.splitAtHyphen(e,n):this.noSplit(e,n);if(this.mode===`balanced`&&i&&a&&t.length<12||t.length<this.minPartLength*2)return this.noSplit(e,n);let o=[];for(let e=this.minPartLength;e<=t.length-this.minPartLength;e++){let n=t.slice(0,e),r=t.slice(e),i=this.trySplit(n,r);if(i&&o.push(i),this.tryLinkingLetters){for(let e of L)if(n.endsWith(e)&&n.length>this.minPartLength){let e=n.slice(0,-1),t=this.trySplit(e,r);t&&o.push({...t,score:t.score*.95})}}}if(o.length===0)return this.noSplit(e,n);o.sort((e,t)=>t.score-e.score);let s=o[0];if(this.mode===`balanced`&&i&&s.score<.6)return this.noSplit(e,n);let c=[...new Set([...s.leftParts,...s.rightParts])];return{word:e,parts:c,indexTerms:[...new Set([...c,t])],confidence:Math.min(s.score,1),isCompound:!0}}splitAtHyphen(e,t){let n=e.split(`-`).filter(e=>e.length>0);if(n.length<2)return this.noSplit(e,t);let r=[];for(let e of n){let t=this.lemmatizer.lemmatize(e);r.push(...t)}let i=[...new Set(r)];return{word:e,parts:i,indexTerms:[...new Set([...i,e.toLowerCase()])],confidence:.9,isCompound:!0}}trySplit(e,t){let n=this.lemmatizer.lemmatize(e),r=this.lemmatizer.lemmatize(t),i=[...new Set(n.filter(e=>this.knownLemmas.has(e)))],a=[...new Set(r.filter(e=>this.knownLemmas.has(e)))];if(i.length===0||a.length===0)return null;let o=0,s=1-Math.abs(e.length-t.length)/(e.length+t.length);o+=s*.2;let c=(e.length+t.length)/2,l=Math.min(c/6,1);o+=l*.2,a.some(e=>F.has(e))&&(o+=.3);let u=i.some(e=>I.has(e)),d=a.some(e=>I.has(e));return u&&d?o-=.3:!u&&!d&&(o+=.2),(e.length<4||t.length<4)&&(o-=.15),{leftParts:i,rightParts:a,score:Math.max(0,o)}}getAllLemmas(e){return this.split(e).indexTerms}};function z(e){return new Set(e.map(e=>e.toLowerCase()))}const B=new Map([[`til dæmis`,{lemma:`til dæmi`,isStopword:!0,pos:`ao`}],[`með öðrum orðum`,{lemma:`með annar orð`,isStopword:!0,pos:`ao`}],[`í raun`,{lemma:`í raun`,isStopword:!0,pos:`ao`}],[`í raun og veru`,{lemma:`í raun og vera`,isStopword:!0,pos:`ao`}],[`af og til`,{lemma:`af og til`,isStopword:!0,pos:`ao`}],[`aftur á móti`,{lemma:`aftur á mót`,isStopword:!0,pos:`ao`}],[`alla vega`,{lemma:`allur vegur`,isStopword:!0,pos:`ao`}],[`alls ekki`,{lemma:`alls ekki`,isStopword:!0,pos:`ao`}],[`alls staðar`,{lemma:`allur staður`,isStopword:!0,pos:`ao`}],[`allt í allt`,{lemma:`allur í allur`,isStopword:!0,pos:`ao`}],[`annars vegar`,{lemma:`annar vegur`,isStopword:!0,pos:`ao`}],[`auk þess`,{lemma:`auk það`,isStopword:!0,pos:`ao`}],[`að auki`,{lemma:`að auki`,isStopword:!0,pos:`ao`}],[`að vísu`,{lemma:`að vís`,isStopword:!0,pos:`ao`}],[`að sjálfsögðu`,{lemma:`að sjálfsagður`,isStopword:!0,pos:`ao`}],[`að minnsta kosti`,{lemma:`að lítill kostur`,isStopword:!0,pos:`ao`}],[`að öllu leyti`,{lemma:`að allur leyti`,isStopword:!0,pos:`ao`}],[`að nokkru leyti`,{lemma:`að nokkur leyti`,isStopword:!0,pos:`ao`}],[`ef til vill`,{lemma:`ef til vilja`,isStopword:!0,pos:`ao`}],[`einhvers staðar`,{lemma:`einhver staður`,isStopword:!0,pos:`ao`}],[`einhvern veginn`,{lemma:`einhver vegur`,isStopword:!0,pos:`ao`}],[`ekki síst`,{lemma:`ekki síður`,isStopword:!0,pos:`ao`}],[`engu að síður`,{lemma:`enginn að síður`,isStopword:!0,pos:`ao`}],[`fyrst og fremst`,{lemma:`snemma og fremri`,isStopword:!0,pos:`ao`}],[`hins vegar`,{lemma:`hinn vegur`,isStopword:!0,pos:`ao`}],[`hér og þar`,{lemma:`hér og þar`,isStopword:!0,pos:`ao`}],[`hér um bil`,{lemma:`hér um bil`,isStopword:!0,pos:`ao`}],[`hér á landi`,{lemma:`hér á land`,isStopword:!0,pos:`ao`}],[`hvað mest`,{lemma:`hvað mjög`,isStopword:!0,pos:`ao`}],[`hverju sinni`,{lemma:`hver sinn`,isStopword:!0,pos:`ao`}],[`hvorki né`,{lemma:`hvorki né`,isStopword:!0,pos:`ao`}],[`í burtu`,{lemma:`í burtu`,isStopword:!0,pos:`ao`}],[`í gær`,{lemma:`í gær`,isStopword:!0,pos:`ao`}],[`í senn`,{lemma:`í senn`,isStopword:!0,pos:`ao`}],[`í sífellu`,{lemma:`í sífella`,isStopword:!0,pos:`ao`}],[`lengi vel`,{lemma:`lengi vel`,isStopword:!0,pos:`ao`}],[`meira að segja`,{lemma:`mikill að segja`,isStopword:!0,pos:`ao`}],[`meira og minna`,{lemma:`mikill og lítill`,isStopword:!0,pos:`ao`}],[`meðal annars`,{lemma:`meðal annar`,isStopword:!0,pos:`ao`}],[`nokkurn veginn`,{lemma:`nokkur vegur`,isStopword:!0,pos:`ao`}],[`og svo framvegis`,{lemma:`og svo framvegis`,isStopword:!0,pos:`ao`}],[`satt að segja`,{lemma:`sannur að segja`,isStopword:!0,pos:`ao`}],[`sem betur fer`,{lemma:`sem vel fara`,isStopword:!0,pos:`ao`}],[`smám saman`,{lemma:`smátt saman`,isStopword:!0,pos:`ao`}],[`svo sem`,{lemma:`svo sem`,isStopword:!0,pos:`ao`}],[`sér í lagi`,{lemma:`sér í lag`,isStopword:!0,pos:`ao`}],[`til og frá`,{lemma:`til og frá`,isStopword:!0,pos:`ao`}],[`til baka`,{lemma:`til baka`,isStopword:!0,pos:`ao`}],[`vítt og breitt`,{lemma:`vítt og breitt`,isStopword:!0,pos:`ao`}],[`á ný`,{lemma:`á ný`,isStopword:!0,pos:`ao`}],[`á meðan`,{lemma:`á meðan`,isStopword:!0,pos:`ao`}],[`á sama tíma`,{lemma:`á samur tími`,isStopword:!0,pos:`ao`}],[`á hinn bóginn`,{lemma:`á hinn bógur`,isStopword:!0,pos:`ao`}],[`þar af leiðandi`,{lemma:`þar af leiða`,isStopword:!0,pos:`ao`}],[`þar að auki`,{lemma:`þar að auki`,isStopword:!0,pos:`ao`}],[`það er að segja`,{lemma:`það vera að segja`,isStopword:!0,pos:`ao`}],[`þess vegna`,{lemma:`það vegna`,isStopword:!0,pos:`ao`}],[`því miður`,{lemma:`það lítt`,isStopword:!0,pos:`ao`}],[`þrátt fyrir`,{lemma:`þrátt fyrir`,isStopword:!0,pos:`ao`}],[`á dögunum`,{lemma:`á dagur`,isStopword:!0,pos:`ao`}],[`á sínum tíma`,{lemma:`á sinn tími`,isStopword:!0,pos:`ao`}],[`á endanum`,{lemma:`á endi`,isStopword:!0,pos:`ao`}],[`einu sinni`,{lemma:`einn sinn`,isStopword:!1,pos:`ao`}],[`eitt sinn`,{lemma:`einn sinn`,isStopword:!1,pos:`ao`}],[`í fyrsta sinn`,{lemma:`í fyrstur sinn`,isStopword:!1,pos:`ao`}],[`í kvöld`,{lemma:`í kvöld`,isStopword:!1,pos:`ao`}],[`í morgun`,{lemma:`í morgunn`,isStopword:!1,pos:`ao`}],[`á morgun`,{lemma:`á morgunn`,isStopword:!1,pos:`ao`}],[`fyrir hönd`,{lemma:`fyrir hönd`,isStopword:!1,pos:`fs`}],[`með tilliti til`,{lemma:`með tillit til`,isStopword:!1,pos:`fs`}],[`í ljósi`,{lemma:`í ljós`,isStopword:!1,pos:`fs`}],[`í stað`,{lemma:`í staður`,isStopword:!1,pos:`fs`}],[`fyrir aftan`,{lemma:`fyrir aftan`,isStopword:!1,pos:`fs`}],[`fyrir austan`,{lemma:`fyrir austan`,isStopword:!1,pos:`fs`}],[`fyrir framan`,{lemma:`fyrir framan`,isStopword:!1,pos:`fs`}],[`fyrir handan`,{lemma:`fyrir handan`,isStopword:!1,pos:`fs`}],[`fyrir innan`,{lemma:`fyrir innan`,isStopword:!1,pos:`fs`}],[`fyrir neðan`,{lemma:`fyrir neðan`,isStopword:!1,pos:`fs`}],[`fyrir norðan`,{lemma:`fyrir norðan`,isStopword:!1,pos:`fs`}],[`fyrir ofan`,{lemma:`fyrir ofan`,isStopword:!1,pos:`fs`}],[`fyrir sunnan`,{lemma:`fyrir sunnan`,isStopword:!1,pos:`fs`}],[`fyrir utan`,{lemma:`fyrir utan`,isStopword:!1,pos:`fs`}],[`fyrir vestan`,{lemma:`fyrir vestan`,isStopword:!1,pos:`fs`}],[`í gegnum`,{lemma:`í gegnum`,isStopword:!1,pos:`fs`}],[`í kringum`,{lemma:`í kringum`,isStopword:!1,pos:`fs`}],[`innan við`,{lemma:`innan við`,isStopword:!1,pos:`fs`}],[`upp úr`,{lemma:`upp úr`,isStopword:!1,pos:`fs`}],[`þvert á`,{lemma:`þvert á`,isStopword:!1,pos:`fs`}],[`þar eð`,{lemma:`þar eð`,isStopword:!0,pos:`st`}],[`sameinuðu þjóðirnar`,{lemma:`Sameinuðu þjóðirnar`,isStopword:!1,pos:`entity`}],[`evrópusambandið`,{lemma:`Evrópusambandið`,isStopword:!1,pos:`entity`}],[`nato`,{lemma:`NATO`,isStopword:!1,pos:`entity`}],[`nató`,{lemma:`NATO`,isStopword:!1,pos:`entity`}]]);function V(e,t){for(let n=Math.min(4,e.length-t);n>=2;n--){let r=e.slice(t,t+n).join(` `).toLowerCase(),i=B.get(r);if(i)return{phrase:i,wordCount:n}}return null}function H(e){return B.has(e.toLowerCase())}function U(e){return B.get(e.toLowerCase())}const W=new Set([`word`]),G=new Set([`person`,`company`,`entity`]),K=new Set([`punctuation`,`s_begin`,`s_end`,`s_split`,`unknown`]);function q(t,n,r={}){let{bigrams:i,compoundSplitter:a,includeNumbers:o=!1,alwaysTryCompounds:s=!0}=r,c=e(t),l=[],u=[];for(let e=0;e<c.length;e++){let t=c[e];if(!K.has(t.kind)){if(G.has(t.kind)){l.push({original:t.text??``,kind:t.kind,lemmas:[],isEntity:!0});continue}if(t.kind===`number`||t.kind===`ordinal`){o&&l.push({original:t.text??``,kind:t.kind,lemmas:[],isEntity:!1});continue}if(W.has(t.kind)){let e=t.text??``,r=n.lemmatize(e),i={original:e,kind:t.kind,lemmas:r,isEntity:!1},o=r.length===1&&r[0]===e.toLowerCase();if(a&&(s||o)){let t=a.split(e);if(t.isCompound){i.compoundSplit=t;let e=t.parts.flatMap(e=>n.lemmatize(e));i.lemmas=[...new Set([...r,...e])]}}l.push(i),u.push({index:l.length-1,token:t});continue}l.push({original:t.text??``,kind:t.kind,lemmas:[],isEntity:!1})}}if(i&&u.length>0){let e=new D(n,i);for(let t=0;t<u.length;t++){let{index:n,token:r}=u[t],i=t>0?u[t-1].token:null,a=t<u.length-1?u[t+1].token:null,o=e.disambiguate(r.text??``,i?.text??null,a?.text??null);l[n].disambiguated=o.lemma,l[n].confidence=o.confidence}}else for(let{index:e}of u){let t=l[e];t.lemmas.length>0&&(t.disambiguated=t.lemmas[0],t.confidence=t.lemmas.length===1?1:.5)}return l}function J(e,n,r={}){let{removeStopwords:a=!1,indexAllCandidates:o=!0,useContextualStopwords:s=!1}=r,c=q(e,n,r),l=new Set,u=(e,n)=>a?s?i(e,n):t.has(e):!1;for(let e of c)if(!e.isEntity){if(o)for(let t of e.lemmas)u(t)||l.add(t);else e.disambiguated&&(u(e.disambiguated)||l.add(e.disambiguated));if(e.compoundSplit?.isCompound)for(let t of e.compoundSplit.parts){let e=n.lemmatize(t);for(let t of e)u(t)||l.add(t)}}return l}function Y(e,t,n,r={}){let i=performance.now(),a,o;switch(n){case`naive`:{let n=e.split(/\s+/).filter(e=>e.length>0),r=[];for(let e of n){let n=e.replace(/^[^\p{L}\p{N}]+|[^\p{L}\p{N}]+$/gu,``);if(n){let e=t.lemmatize(n);r.push({original:n,kind:`word`,lemmas:e,isEntity:!1,disambiguated:e[0],confidence:e.length===1?1:.5})}}a=r,o=new Set(r.map(e=>e.disambiguated).filter(Boolean));break}case`tokenized`:a=q(e,t),o=new Set(a.filter(e=>e.kind===`word`&&e.lemmas.length>0).map(e=>e.lemmas[0]));break;case`disambiguated`:a=q(e,t,{bigrams:r.bigrams}),o=J(e,t,{bigrams:r.bigrams});break;case`full`:a=q(e,t,{bigrams:r.bigrams,compoundSplitter:r.compoundSplitter}),o=J(e,t,{bigrams:r.bigrams,compoundSplitter:r.compoundSplitter});break}let s=performance.now()-i,c=a.filter(e=>e.kind===`word`),l=c.length,u=c.filter(e=>e.lemmas.length>0&&!(e.lemmas.length===1&&e.lemmas[0]===e.original.toLowerCase())).length,d=c.filter(e=>e.lemmas.length>1).length,f=c.filter(e=>e.confidence!==void 0).map(e=>e.confidence),p=f.length>0?f.reduce((e,t)=>e+t,0)/f.length:0,m=c.filter(e=>e.compoundSplit?.isCompound).length,h=a.filter(e=>e.isEntity).length;return{wordCount:l,lemmatizedCount:u,coverage:l>0?u/l:0,ambiguousCount:d,ambiguityRate:l>0?d/l:0,avgConfidence:p,compoundsFound:m,entitiesSkipped:h,uniqueLemmas:o.size,timeMs:s}}export{d as BinaryLemmatizer,j as CASE_NAMES,r as CONTEXTUAL_STOPWORDS,R as CompoundSplitter,f as DISAMBIGUATION_RULES,D as Disambiguator,M as GENDER_NAMES,g as NOMINATIVE_PRONOUNS,N as NUMBER_NAMES,h as PREPOSITION_CASES,P as PROTECTED_LEMMAS,B as STATIC_PHRASES,t as STOPWORDS_IS,k as WORD_CLASS_NAMES,A as WORD_CLASS_NAMES_IS,b as applyGrammarRules,v as applyPrepositionRule,y as applyPronounVerbRule,_ as canGovernCase,z as createKnownLemmaSet,O as extractDisambiguatedLemmas,J as extractIndexableLemmas,S as getGovernedCases,U as getPhraseInfo,p as getRulesForWord,m as hasDisambiguationRules,i as isContextualStopword,H as isKnownPhrase,x as isKnownPreposition,n as isStopword,V as matchPhrase,q as processText,a as removeStopwords,Y as runBenchmark};
|
|
1
|
+
import{tokenize as e}from"tokenize-is";const t=new Set(`á.að.aðra.aðrar.aðrir.af.alla.allan.allar.allir.allnokkra.allnokkrar.allnokkrir.allnokkru.allnokkrum.allnokkuð.allnokkur.allnokkurn.allnokkurra.allnokkurrar.allnokkurri.allnokkurs.allnokkurt.allra.allrar.allri.alls.allt.alltað.allur.án.andspænis.annað.annaðhvort.annan.annar.annarra.annarrar.annarri.annars.árla.ásamt.auk.austan.austanundir.austur.báða.báðar.báðir.báðum.bæði.bak.beggja.eða.eður.ef.eftir.ég.ein.eina.einar.einhver.einhverja.einhverjar.einhverjir.einhverju.einhverjum.einhvern.einhverra.einhverrar.einhverri.einhvers.einir.einn.einna.einnar.einni.eins.einskis.einu.einum.eitt.eitthvað.eitthvert.ekkert.ella.ellegar.en.enda.enga.engan.engar.engin.enginn.engir.engra.engrar.engri.engu.engum.er.fáein.fáeina.fáeinar.fáeinir.fáeinna.fáeinum.fjær.fjarri.flestalla.flestallan.flestallar.flestallir.flestallra.flestallrar.flestallri.flestalls.flestallt.flestallur.flestöll.flestöllu.flestöllum.frá.fram.fyrir.fyrst.gagnstætt.gagnvart.gegn.gegnt.gegnum.hana.handa.handan.hann.hans.heldur.hennar.henni.hið.hin.hina.hinar.hinir.hinn.hinna.hinnar.hinni.hins.hinu.hinum.hitt.hjá.honum.hún.hvað.hvaða.hvenær.hver.hverja.hverjar.hverjir.hverju.hverjum.hvern.hverra.hverrar.hverri.hvers.hvert.hvílík.hvílíka.hvílíkan.hvílíkar.hvílíkir.hvílíkra.hvílíkrar.hvílíkri.hvílíks.hvílíkt.hvílíku.hvílíkum.hvílíkur.hvor.hvora.hvorar.hvorir.hvorki.hvorn.hvorra.hvorrar.hvorri.hvors.hvort.hvoru.hvorug.hvoruga.hvorugan.hvorugar.hvorugir.hvorugra.hvorugrar.hvorugri.hvorugs.hvorugt.hvorugu.hvorugum.hvorugur.hvorum.í.inn.innan.innanundir.jafnframt.jafnhliða.kring.kringum.með.meðal.meðan.meður.mér.mestalla.mestallan.mestallar.mestallir.mestallra.mestallrar.mestallri.mestalls.mestallt.mestallur.mestöll.mestöllu.mestöllum.miðli.mig.milli.millum.mín.mína.mínar.mínir.minn.minna.minnar.minni.míns.mínu.mínum.mitt.mót.móti.nær.nærri.næst.næstum.nálægt.né.neðan.nein.neina.neinar.neinir.neinn.neinna.neinnar.neinni.neins.neinu.neinum.neitt.nema.niður.nokkra.nokkrar.nokkrir.nokkru.nokkrum.nokkuð.nokkur.nokkurn.nokkurra.nokkurrar.nokkurri.nokkurs.nokkurt.norðan.nú.öðru.öðrum.of.ofan.ofar.og.óháð.okkar.okkur.öll.öllu.öllum.önnur.órafjarri.oss.sá.sakir.sama.saman.samar.samfara.samhliða.sami.samir.samkvæmt.samra.samrar.samri.sams.samskipa.samt.samtímis.samur.sem.sér.sérhvað.sérhver.sérhverja.sérhverjar.sérhverjir.sérhverju.sérhverjum.sérhvern.sérhverra.sérhverrar.sérhverri.sérhvers.sérhvert.síðan.síðla.sig.sín.sína.sínar.sínhver.sínhverja.sínhverjar.sínhverjir.sínhverju.sínhverjum.sínhvern.sínhverra.sínhverrar.sínhverri.sínhvers.sínhvert.sínhvor.sínhvora.sínhvorar.sínhvorir.sínhvorn.sínhvorra.sínhvorrar.sínhvorri.sínhvors.sínhvort.sínhvoru.sínhvorum.sínir.sinn.sinna.sinnar.sinnhver.sinnhverja.sinnhverjar.sinnhverjir.sinnhverju.sinnhverjum.sinnhvern.sinnhverra.sinnhverrar.sinnhverri.sinnhvers.sinnhvert.sinnhvor.sinnhvora.sinnhvorar.sinnhvorir.sinnhvorn.sinnhvorra.sinnhvorrar.sinnhvorri.sinnhvors.sinnhvort.sinnhvoru.sinnhvorum.sinni.síns.sínu.sínum.sitt.sitthvað.sitthver.sitthverja.sitthverjar.sitthverjir.sitthverju.sitthverjum.sitthvern.sitthverra.sitthverrar.sitthverri.sitthvers.sitthvert.sitthvor.sitthvora.sitthvorar.sitthvorir.sitthvorn.sitthvorra.sitthvorrar.sitthvorri.sitthvors.sitthvort.sitthvoru.sitthvorum.sjálf.sjálfa.sjálfan.sjálfar.sjálfir.sjálfra.sjálfrar.sjálfri.sjálfs.sjálft.sjálfu.sjálfum.sjálfur.slík.slíka.slíkan.slíkar.slíkir.slíkra.slíkrar.slíkri.slíks.slíkt.slíku.slíkum.slíkur.snemma.sökum.söm.sömu.sömum.sú.sum.suma.suman.sumar.sumir.sumra.sumrar.sumri.sums.sumt.sumu.sumum.sumur.sunnan.svo.til.tráss.um.umfram.umhverfis.undan.undir.uns.upp.úr.út.utan.útundan.vegna.vér.vestan.vestur.vettugi.við.viður.vor.vora.vorar.vorir.vorn.vorra.vorrar.vorri.vors.vort.voru.vorum.yðar.yður.yfir.ykkar.ykkur.ýmis.ýmiss.ýmissa.ýmissar.ýmissi.ýmist.ýmsa.ýmsan.ýmsar.ýmsir.ýmsu.ýmsum.þá.það.þær.þann.þar.þau.þegar.þeim.þeir.þeirra.þeirrar.þeirri.þennan.þér.þess.þessa.þessar.þessara.þessarar.þessari.þessi.þessir.þessu.þessum.þetta.þið.þig.þín.þína.þínar.þínir.þinn.þinna.þinnar.þinni.þíns.þínu.þínum.þitt.þó.þónokkra.þónokkrar.þónokkrir.þónokkru.þónokkrum.þónokkuð.þónokkur.þónokkurn.þónokkurra.þónokkurrar.þónokkurri.þónokkurs.þónokkurt.þótt.þú.því.þvílík.þvílíka.þvílíkan.þvílíkar.þvílíkir.þvílíkra.þvílíkrar.þvílíkri.þvílíks.þvílíkt.þvílíku.þvílíkum.þvílíkur`.split(`.`));function n(e){return t.has(e.toLowerCase())}const r=new Map([[`á`,new Set([`fs`,`ao`])],[`við`,new Set([`fs`,`fn`])],[`af`,new Set([`fs`,`ao`])],[`til`,new Set([`fs`])],[`um`,new Set([`fs`])],[`frá`,new Set([`fs`])],[`yfir`,new Set([`fs`,`ao`])],[`undir`,new Set([`fs`,`ao`])],[`fyrir`,new Set([`fs`,`ao`])],[`eftir`,new Set([`fs`,`ao`])],[`gegn`,new Set([`fs`])],[`hjá`,new Set([`fs`])],[`úr`,new Set([`fs`])],[`í`,new Set([`fs`])]]);function i(e,n){let i=e.toLowerCase(),a=r.get(i);return a&&n?a.has(n):t.has(i)}function a(e){return e.filter(e=>!n(e))}const o=1279610177,s=[`no`,`so`,`lo`,`ao`,`fs`,`fn`,`st`,`to`,`gr`,`uh`],c=[void 0,`nf`,`þf`,`þgf`,`ef`],l=[void 0,`kk`,`kvk`,`hk`],u=[`et`,`ft`];var d=class e{buffer;stringPool;lemmaOffsets;lemmaLengths;wordOffsets;wordLengths;entryOffsets;entries;bigramW1Offsets;bigramW1Lengths;bigramW2Offsets;bigramW2Lengths;bigramFreqs;lemmaCount;wordCount;entryCount;bigramCount;version;decoder=new TextDecoder(`utf-8`);constructor(e){this.buffer=e;let t=new DataView(e),n=t.getUint32(0,!0);if(n!==o)throw Error(`Invalid binary format: expected magic 0x${o.toString(16)}, got 0x${n.toString(16)}`);if(this.version=t.getUint32(4,!0),this.version!==1&&this.version!==2)throw Error(`Unsupported version: ${this.version}`);let r=t.getUint32(8,!0);this.lemmaCount=t.getUint32(12,!0),this.wordCount=t.getUint32(16,!0),this.entryCount=t.getUint32(20,!0),this.bigramCount=t.getUint32(24,!0);let i=32;this.stringPool=new Uint8Array(e,i,r),i+=r,this.lemmaOffsets=new Uint32Array(e,i,this.lemmaCount),i+=this.lemmaCount*4,this.lemmaLengths=new Uint8Array(e,i,this.lemmaCount),i+=this.lemmaCount,i=i+3&-4,this.wordOffsets=new Uint32Array(e,i,this.wordCount),i+=this.wordCount*4,this.wordLengths=new Uint8Array(e,i,this.wordCount),i+=this.wordCount,i=i+3&-4,this.entryOffsets=new Uint32Array(e,i,this.wordCount+1),i+=(this.wordCount+1)*4,this.entries=new Uint32Array(e,i,this.entryCount),i+=this.entryCount*4,this.bigramW1Offsets=new Uint32Array(e,i,this.bigramCount),i+=this.bigramCount*4,this.bigramW1Lengths=new Uint8Array(e,i,this.bigramCount),i+=this.bigramCount,i=i+3&-4,this.bigramW2Offsets=new Uint32Array(e,i,this.bigramCount),i+=this.bigramCount*4,this.bigramW2Lengths=new Uint8Array(e,i,this.bigramCount),i+=this.bigramCount,i=i+3&-4,this.bigramFreqs=new Uint32Array(e,i,this.bigramCount)}static async load(t,n={}){let r=await(n.fetch??fetch)(t);if(!r.ok)throw Error(`Failed to load binary data: ${r.status}`);return new e(await r.arrayBuffer())}static loadFromBuffer(t){return new e(t)}getString(e,t){return this.decoder.decode(this.stringPool.subarray(e,e+t))}getLemma(e){return this.getString(this.lemmaOffsets[e],this.lemmaLengths[e])}getWord(e){return this.getString(this.wordOffsets[e],this.wordLengths[e])}findWord(e){let t=0,n=this.wordCount-1;for(;t<=n;){let r=t+n>>>1,i=this.getWord(r);if(i===e)return r;i<e?t=r+1:n=r-1}return-1}lemmatize(e,t={}){let n=e.toLowerCase(),r=this.findWord(n);if(r===-1)return[n];let i=this.entryOffsets[r],a=this.entryOffsets[r+1],{wordClass:o}=t,c=new Set,l=[];for(let e=i;e<a;e++){let{lemmaIdx:t,posCode:n}=this.unpackEntry(this.entries[e]),r=s[n];if(o&&r!==o)continue;let i=this.getLemma(t);c.has(i)||(c.add(i),l.push(i))}return l.length===0?[n]:l}unpackEntry(e){return this.version===1?{lemmaIdx:e>>>4,posCode:e&15,caseCode:0,genderCode:0,numberCode:0}:{lemmaIdx:e>>>10,posCode:e&15,caseCode:e>>>4&7,genderCode:e>>>7&3,numberCode:e>>>9&1}}lemmatizeWithPOS(e){let t=e.toLowerCase(),n=this.findWord(t);if(n===-1)return[];let r=this.entryOffsets[n],i=this.entryOffsets[n+1],a=new Set,o=[];for(let e=r;e<i;e++){let{lemmaIdx:t,posCode:n}=this.unpackEntry(this.entries[e]),r=this.getLemma(t),i=s[n]??``,c=`${r}:${i}`;a.has(c)||(a.add(c),o.push({lemma:r,pos:i}))}return o}lemmatizeWithMorph(e){let t=e.toLowerCase(),n=this.findWord(t);if(n===-1)return[];let r=this.entryOffsets[n],i=this.entryOffsets[n+1],a=[];for(let e=r;e<i;e++){let{lemmaIdx:t,posCode:n,caseCode:r,genderCode:i,numberCode:o}=this.unpackEntry(this.entries[e]),d={},f=c[r],p=l[i],m=u[o];f&&(d.case=f),p&&(d.gender=p),m&&(d.number=m),a.push({lemma:this.getLemma(t),pos:s[n]??``,morph:Object.keys(d).length>0?d:void 0})}return a}hasMorphFeatures(){return this.version>=2}getVersion(){return this.version}findBigram(e,t){let n=0,r=this.bigramCount-1;for(;n<=r;){let i=n+r>>>1,a=this.getString(this.bigramW1Offsets[i],this.bigramW1Lengths[i]);if(a<e)n=i+1;else if(a>e)r=i-1;else{let e=this.getString(this.bigramW2Offsets[i],this.bigramW2Lengths[i]);if(e===t)return i;e<t?n=i+1:r=i-1}}return-1}bigramFreq(e,t){let n=this.findBigram(e.toLowerCase(),t.toLowerCase());return n===-1?0:this.bigramFreqs[n]}freq(e,t){return this.bigramFreq(e,t)}isKnown(e){return this.findWord(e.toLowerCase())!==-1}get lemmaCountValue(){return this.lemmaCount}get wordFormCount(){return this.wordCount}get bigramCountValue(){return this.bigramCount}get bufferSize(){return this.buffer.byteLength}getAllLemmas(){let e=[];for(let t=0;t<this.lemmaCount;t++)e.push(this.getLemma(t));return e}};const f=[{word:`á`,prefer:`so`,over:`fs`,context:`after_pronoun`,description:`á after pronoun = verb 'eiga' (I own, you own)`},{word:`á`,prefer:`fs`,over:`so`,context:`before_noun`,description:`á before noun = preposition (on, at)`},{word:`við`,prefer:`fn`,over:`fs`,context:`sentence_start`,description:`við at sentence start = pronoun 'we'`},{word:`við`,prefer:`fs`,over:`fn`,context:`before_noun`,description:`við before noun = preposition 'by/at'`},{word:`af`,prefer:`fs`,over:`ao`,context:`before_noun`,description:`af before noun = preposition 'of/from'`},{word:`til`,prefer:`fs`,over:`ao`,context:`before_noun`,description:`til before noun = preposition 'to'`},{word:`um`,prefer:`fs`,over:`ao`,context:`before_noun`,description:`um before noun = preposition 'about/around'`},{word:`yfir`,prefer:`fs`,over:`ao`,context:`before_noun`,description:`yfir before noun = preposition 'over'`},{word:`undir`,prefer:`fs`,over:`ao`,context:`before_noun`,description:`undir before noun = preposition 'under'`},{word:`fyrir`,prefer:`fs`,over:`ao`,context:`before_noun`,description:`fyrir before noun = preposition 'for/before'`},{word:`eftir`,prefer:`fs`,over:`ao`,context:`before_noun`,description:`eftir before noun = preposition 'after'`},{word:`frá`,prefer:`fs`,over:`ao`,context:`before_noun`,description:`frá before noun = preposition 'from'`},{word:`með`,prefer:`fs`,over:`ao`,context:`before_noun`,description:`með before noun = preposition 'with'`},{word:`í`,prefer:`fs`,over:`ao`,context:`before_noun`,description:`í before noun = preposition 'in'`},{word:`úr`,prefer:`fs`,over:`no`,context:`before_noun`,description:`úr before noun = preposition 'out of'`}];function p(e){let t=e.toLowerCase();return f.filter(e=>e.word===t)}function m(e){return f.some(t=>t.word===e.toLowerCase())}const h=new Map([[`á`,new Set([`þf`,`þgf`])],[`í`,new Set([`þf`,`þgf`])],[`við`,new Set([`þf`,`þgf`])],[`með`,new Set([`þf`,`þgf`])],[`undir`,new Set([`þf`,`þgf`])],[`yfir`,new Set([`þf`,`þgf`])],[`fyrir`,new Set([`þf`,`þgf`])],[`um`,new Set([`þf`])],[`gegnum`,new Set([`þf`])],[`kringum`,new Set([`þf`])],[`umhverfis`,new Set([`þf`])],[`af`,new Set([`þgf`])],[`frá`,new Set([`þgf`])],[`hjá`,new Set([`þgf`])],[`úr`,new Set([`þgf`])],[`að`,new Set([`þgf`])],[`móti`,new Set([`þgf`])],[`nálægt`,new Set([`þgf`])],[`gegn`,new Set([`þgf`])],[`gagnvart`,new Set([`þgf`])],[`handa`,new Set([`þgf`])],[`meðal`,new Set([`ef`])],[`til`,new Set([`ef`])],[`án`,new Set([`ef`])],[`vegna`,new Set([`ef`])],[`sakir`,new Set([`ef`])],[`utan`,new Set([`ef`])],[`innan`,new Set([`ef`])],[`meðfram`,new Set([`þgf`])],[`milli`,new Set([`ef`])],[`auk`,new Set([`ef`])],[`í stað`,new Set([`ef`])]]),g=new Set([`ég`,`þú`,`hann`,`hún`,`það`,`við`,`þið`,`þeir`,`þær`,`þau`]);function _(e,t){return t?h.get(e)?.has(t)??!1:!1}function v(e,t){let n=e.filter(e=>e.pos===`fs`);if(n.length===0)return null;for(let e of n)for(let n of t)if(n.morph?.case&&_(e.lemma,n.morph.case))return{lemma:e.lemma,pos:`fs`,rule:`prep+${n.morph.case}`,confidence:.9};return null}function y(e,t){if(!t)return null;let n=t.toLowerCase();if(!g.has(n))return null;let r=e.filter(e=>e.pos===`so`);return r.length===0||!e.some(e=>e.pos!==`so`)?null:{lemma:(r.find(e=>e.lemma===`eiga`)??r[0]).lemma,pos:`so`,rule:`pronoun+verb`,confidence:.85}}function b(e,t,n){if(!t||!n?.lemmatizeWithPOS)return null;let r=n.lemmatizeWithPOS(t),i=r.find(e=>e.pos===`fs`);if(!i)return null;let a=r.some(e=>e.pos===`fn`),o=e.some(e=>e.pos===`so`);if(a&&o)return null;let s=h.get(i.lemma);if(!s)return null;let c=e.filter(e=>e.pos===`no`);for(let e of c)if(e.morph?.case&&s.has(e.morph.case))return{lemma:e.lemma,pos:`no`,rule:`noun_after_prep+${e.morph.case}`,confidence:.9};return null}function x(e,t,n,r=null){return v(e,n)||b(e,t,r)||y(e,t)||null}function S(e){return h.has(e)}function C(e){return h.get(e)}const w={name:`unambiguous`,run(e){return e.length===1?{lemma:e[0].lemma,pos:e[0].pos,confidence:1}:null}},T={name:`preference_rules`,run(e,t,n){if(!n.usePreferenceRules)return null;for(let n of f){let r=E(n,e,t);if(r)return{lemma:r.lemma,pos:r.pos,confidence:.85}}return null}};function E(e,t,n){let r=t.find(t=>t.lemma.toLowerCase()===e.word.toLowerCase()&&t.pos===e.prefer),i=t.find(t=>t.lemma.toLowerCase()===e.word.toLowerCase()&&t.pos===e.over);if(!r||!i)return null;if(e.context===`before_noun`){let e=n.nextWord;if(e&&/^[A-ZÁÉÍÓÚÝÞÆÖ]/.test(e))return r}else if(e.context===`before_verb`){let e=n.nextWord?.toLowerCase();if(e&&![`þessi`,`þetta`,`sá`,`sú`,`það`,`hinn`,`hin`,`hið`].includes(e))return r}else if(e.context===`after_pronoun`){let e=n.prevWord?.toLowerCase();if(e&&[`ég`,`þú`,`hann`,`hún`,`það`,`við`,`þið`,`þeir`,`þær`,`þau`].includes(e))return r}return null}const D=[w,T,{name:`grammar_rules`,run(e,t,n){if(!n.useGrammarRules)return null;let r=e.map(e=>({...e,morph:void 0}));if(n.lemmatizer.lemmatizeWithMorph){let e=t.allTokens[t.index];if(e){let t=n.lemmatizer.lemmatizeWithMorph(e);r.length=0,r.push(...t)}}let i=x(r,t.prevWord,t.nextWordMorph??[],n.lemmatizer);return i?{lemma:i.lemma,pos:i.pos,confidence:i.confidence}:null}},{name:`word_bigrams`,run(e,t,n){if(!n.bigrams||e.length===0)return null;let r=[];for(let i of e){let e=0;if(t.prevWord){let r=t.prevLemmas||n.lemmatizer.lemmatize(t.prevWord);for(let t of r){let r=n.bigrams.freq(t,i.lemma);r>0&&(e+=Math.log(r+1)*n.leftWeight)}}if(t.nextWord){let r=t.nextLemmas||n.lemmatizer.lemmatize(t.nextWord);for(let t of r){let r=n.bigrams.freq(i.lemma,t);r>0&&(e+=Math.log(r+1)*n.rightWeight)}}r.push({candidate:i,score:e})}if(r.sort((e,t)=>t.score-e.score),r.length>0&&r[0].score>0){let e=r[0].score,t=r.reduce((e,t)=>e+Math.exp(t.score),0),n=t>0?Math.exp(e)/t:.5;return{lemma:r[0].candidate.lemma,pos:r[0].candidate.pos,confidence:n}}return null}},{name:`fallback`,run(e){return e.length>0?{lemma:e[0].lemma,pos:e[0].pos,confidence:1/e.length}:null}}];var O=class{lemmatizer;bigrams;leftWeight;rightWeight;usePreferenceRules;useGrammarRules;constructor(e,t=null,n={}){this.lemmatizer=e,this.bigrams=t,this.leftWeight=n.leftWeight??1,this.rightWeight=n.rightWeight??1,this.usePreferenceRules=n.usePreferenceRules??!0,this.useGrammarRules=n.useGrammarRules??!0}disambiguate(e,t,n,r={}){let i;i=this.lemmatizer.lemmatizeWithPOS?this.lemmatizer.lemmatizeWithPOS(e):this.lemmatizer.lemmatize(e).map(e=>({lemma:e,pos:`no`}));let a=i.map(e=>e.lemma),o=e,s;n&&this.lemmatizer.lemmatizeWithMorph&&(s=this.lemmatizer.lemmatizeWithMorph(n));let c={prevWord:t,nextWord:n,prevLemmas:r.prevLemmas,nextLemmas:r.nextLemmas,nextWordMorph:s,allTokens:[e],index:0};for(let e of D){let t=e.run(i,c,this);if(t)return{token:o,lemma:t.lemma,pos:t.pos,candidates:a,candidatesWithPOS:i,ambiguous:a.length>1,confidence:t.confidence,resolvedBy:e.name}}return{token:o,lemma:e.toLowerCase(),candidates:a,candidatesWithPOS:i,ambiguous:!1,confidence:0,resolvedBy:`none`}}disambiguateAll(e){let t=[];for(let n=0;n<e.length;n++){let r=e[n],i=n>0?e[n-1]:null,a=n<e.length-1?e[n+1]:null;t.push(this.disambiguate(r,i,a))}return t}extractLemmas(e){let t=new Set,n=this.disambiguateAll(e);for(let e of n)t.add(e.lemma);return t}};function k(e,n,r,i={}){let{tokenize:a,removeStopwords:o}=i,s=a?a(e):e.split(/\s+/).filter(e=>e.length>0).map(e=>e.replace(/^[^\p{L}\p{N}]+|[^\p{L}\p{N}]+$/gu,``)).filter(e=>e.length>0),c=new O(n,r).extractLemmas(s);if(o)for(let e of c)t.has(e)&&c.delete(e);return c}const A={no:`noun`,so:`verb`,lo:`adjective`,ao:`adverb`,fs:`preposition`,fn:`pronoun`,st:`conjunction`,to:`numeral`,gr:`article`,uh:`interjection`},j={no:`nafnorð`,so:`sagnorð`,lo:`lýsingarorð`,ao:`atviksorð`,fs:`forsetning`,fn:`fornafn`,st:`samtenging`,to:`töluorð`,gr:`greinir`,uh:`upphrópun`},M={nf:`nominative`,þf:`accusative`,þgf:`dative`,ef:`genitive`},N={kk:`masculine`,kvk:`feminine`,hk:`neuter`},P={et:`singular`,ft:`plural`},F=new Set(`ísland.england.írland.skotland.finnland.grænland.holland.þýskaland.frakkland.pólland.tékkland.svissland.rússland.eistland.lettland.litháen.danmörk.noregur.svíþjóð.bandaríkin.spánn.portúgal.ítalía.grikkland.þingvellir.akureyri.ísafjörður.reykjavík.keflavík.hafnarfjörður.kópavogur.seltjarnarnes.garðabær.mosfellsbær.vestmannaeyjar.húsavík.sauðárkrókur.siglufjörður.ólafsfjörður.dalvík.egilsstaðir.neskaupstaður.seyðisfjörður.eskifjörður.reyðarfjörður.fáskrúðsfjörður.stöðvarfjörður.djúpivogur.höfn.vík.selfoss.hveragerði.þorlákshöfn.grindavík.sandgerði.borgarnes.stykkishólmur.grundarfjörður.ólafsvík.búðardalur.patreksfjörður.flateyri.suðureyri.bolungarvík.hólmavík.hvammstangi.blönduós.skagaströnd.varmahlíð.hlíðarendi.bergþórshvol.íslandsbanki.landsbankinn.arionbanki.alþingi`.split(`.`)),I=new Set(`maður.kona.stjóri.ráðherra.forseti.formaður.fulltrúi.starfsmaður.hús.staður.vegur.borg.bær.dalur.fjörður.félag.banki.sjóður.stofnun.ráð.rannsókn.greiðsla.mál.kerfi.verk.þjónusta.rekstur.viðskipti.verð.kostnaður`.split(`.`)),L=new Set([`vera`,`hafa`,`gera`,`fara`,`koma`,`segja`,`vilja`,`mega`,`þurfa`,`verða`,`geta`,`sjá`,`taka`,`eiga`,`láta`,`halda`,`leyfa`,`búa`]),R=[`s`,`u`,`a`];var z=class{lemmatizer;minPartLength;tryLinkingLetters;knownLemmas;mode;constructor(e,t,n={}){this.lemmatizer=e,this.knownLemmas=t,this.minPartLength=n.minPartLength??3,this.tryLinkingLetters=n.tryLinkingLetters??!0,this.mode=n.mode??`balanced`}noSplit(e,t){return{word:e,parts:t,indexTerms:t,confidence:0,isCompound:!1}}split(e){let t=e.toLowerCase(),n=this.lemmatizer.lemmatize(e),r=n[0]?.toLowerCase();if(r&&F.has(r)||F.has(t))return this.noSplit(e,n);let i=n.length>0&&n[0].toLowerCase()!==t,a=n.length===1;if(this.mode===`conservative`)return e.includes(`-`)?this.splitAtHyphen(e,n):this.noSplit(e,n);if(this.mode===`balanced`&&i&&a&&t.length<12||t.length<this.minPartLength*2)return this.noSplit(e,n);let o=[];for(let e=this.minPartLength;e<=t.length-this.minPartLength;e++){let n=t.slice(0,e),r=t.slice(e),i=this.trySplit(n,r);if(i&&o.push(i),this.tryLinkingLetters){for(let e of R)if(n.endsWith(e)&&n.length>this.minPartLength){let e=n.slice(0,-1),t=this.trySplit(e,r);t&&o.push({...t,score:t.score*.95})}}}if(o.length===0)return this.noSplit(e,n);o.sort((e,t)=>t.score-e.score);let s=o[0];if(this.mode===`balanced`&&i&&s.score<.6)return this.noSplit(e,n);let c=[...new Set([...s.leftParts,...s.rightParts])];return{word:e,parts:c,indexTerms:[...new Set([...c,t])],confidence:Math.min(s.score,1),isCompound:!0}}splitAtHyphen(e,t){let n=e.split(`-`).filter(e=>e.length>0);if(n.length<2)return this.noSplit(e,t);let r=[];for(let e of n){let t=this.lemmatizer.lemmatize(e);r.push(...t)}let i=[...new Set(r)];return{word:e,parts:i,indexTerms:[...new Set([...i,e.toLowerCase()])],confidence:.9,isCompound:!0}}trySplit(e,t){let n=this.lemmatizer.lemmatize(e),r=this.lemmatizer.lemmatize(t),i=[...new Set(n.filter(e=>this.knownLemmas.has(e)))],a=[...new Set(r.filter(e=>this.knownLemmas.has(e)))];if(i.length===0||a.length===0)return null;let o=0,s=1-Math.abs(e.length-t.length)/(e.length+t.length);o+=s*.2;let c=(e.length+t.length)/2,l=Math.min(c/6,1);o+=l*.2,a.some(e=>I.has(e))&&(o+=.3);let u=i.some(e=>L.has(e)),d=a.some(e=>L.has(e));return u&&d?o-=.3:!u&&!d&&(o+=.2),(e.length<4||t.length<4)&&(o-=.15),{leftParts:i,rightParts:a,score:Math.max(0,o)}}getAllLemmas(e){return this.split(e).indexTerms}};function B(e){return new Set(e.map(e=>e.toLowerCase()))}const V=new Map([[`til dæmis`,{lemma:`til dæmi`,isStopword:!0,pos:`ao`}],[`með öðrum orðum`,{lemma:`með annar orð`,isStopword:!0,pos:`ao`}],[`í raun`,{lemma:`í raun`,isStopword:!0,pos:`ao`}],[`í raun og veru`,{lemma:`í raun og vera`,isStopword:!0,pos:`ao`}],[`af og til`,{lemma:`af og til`,isStopword:!0,pos:`ao`}],[`aftur á móti`,{lemma:`aftur á mót`,isStopword:!0,pos:`ao`}],[`alla vega`,{lemma:`allur vegur`,isStopword:!0,pos:`ao`}],[`alls ekki`,{lemma:`alls ekki`,isStopword:!0,pos:`ao`}],[`alls staðar`,{lemma:`allur staður`,isStopword:!0,pos:`ao`}],[`allt í allt`,{lemma:`allur í allur`,isStopword:!0,pos:`ao`}],[`annars vegar`,{lemma:`annar vegur`,isStopword:!0,pos:`ao`}],[`auk þess`,{lemma:`auk það`,isStopword:!0,pos:`ao`}],[`að auki`,{lemma:`að auki`,isStopword:!0,pos:`ao`}],[`að vísu`,{lemma:`að vís`,isStopword:!0,pos:`ao`}],[`að sjálfsögðu`,{lemma:`að sjálfsagður`,isStopword:!0,pos:`ao`}],[`að minnsta kosti`,{lemma:`að lítill kostur`,isStopword:!0,pos:`ao`}],[`að öllu leyti`,{lemma:`að allur leyti`,isStopword:!0,pos:`ao`}],[`að nokkru leyti`,{lemma:`að nokkur leyti`,isStopword:!0,pos:`ao`}],[`ef til vill`,{lemma:`ef til vilja`,isStopword:!0,pos:`ao`}],[`einhvers staðar`,{lemma:`einhver staður`,isStopword:!0,pos:`ao`}],[`einhvern veginn`,{lemma:`einhver vegur`,isStopword:!0,pos:`ao`}],[`ekki síst`,{lemma:`ekki síður`,isStopword:!0,pos:`ao`}],[`engu að síður`,{lemma:`enginn að síður`,isStopword:!0,pos:`ao`}],[`fyrst og fremst`,{lemma:`snemma og fremri`,isStopword:!0,pos:`ao`}],[`hins vegar`,{lemma:`hinn vegur`,isStopword:!0,pos:`ao`}],[`hér og þar`,{lemma:`hér og þar`,isStopword:!0,pos:`ao`}],[`hér um bil`,{lemma:`hér um bil`,isStopword:!0,pos:`ao`}],[`hér á landi`,{lemma:`hér á land`,isStopword:!0,pos:`ao`}],[`hvað mest`,{lemma:`hvað mjög`,isStopword:!0,pos:`ao`}],[`hverju sinni`,{lemma:`hver sinn`,isStopword:!0,pos:`ao`}],[`hvorki né`,{lemma:`hvorki né`,isStopword:!0,pos:`ao`}],[`í burtu`,{lemma:`í burtu`,isStopword:!0,pos:`ao`}],[`í gær`,{lemma:`í gær`,isStopword:!0,pos:`ao`}],[`í senn`,{lemma:`í senn`,isStopword:!0,pos:`ao`}],[`í sífellu`,{lemma:`í sífella`,isStopword:!0,pos:`ao`}],[`lengi vel`,{lemma:`lengi vel`,isStopword:!0,pos:`ao`}],[`meira að segja`,{lemma:`mikill að segja`,isStopword:!0,pos:`ao`}],[`meira og minna`,{lemma:`mikill og lítill`,isStopword:!0,pos:`ao`}],[`meðal annars`,{lemma:`meðal annar`,isStopword:!0,pos:`ao`}],[`nokkurn veginn`,{lemma:`nokkur vegur`,isStopword:!0,pos:`ao`}],[`og svo framvegis`,{lemma:`og svo framvegis`,isStopword:!0,pos:`ao`}],[`satt að segja`,{lemma:`sannur að segja`,isStopword:!0,pos:`ao`}],[`sem betur fer`,{lemma:`sem vel fara`,isStopword:!0,pos:`ao`}],[`smám saman`,{lemma:`smátt saman`,isStopword:!0,pos:`ao`}],[`svo sem`,{lemma:`svo sem`,isStopword:!0,pos:`ao`}],[`sér í lagi`,{lemma:`sér í lag`,isStopword:!0,pos:`ao`}],[`til og frá`,{lemma:`til og frá`,isStopword:!0,pos:`ao`}],[`til baka`,{lemma:`til baka`,isStopword:!0,pos:`ao`}],[`vítt og breitt`,{lemma:`vítt og breitt`,isStopword:!0,pos:`ao`}],[`á ný`,{lemma:`á ný`,isStopword:!0,pos:`ao`}],[`á meðan`,{lemma:`á meðan`,isStopword:!0,pos:`ao`}],[`á sama tíma`,{lemma:`á samur tími`,isStopword:!0,pos:`ao`}],[`á hinn bóginn`,{lemma:`á hinn bógur`,isStopword:!0,pos:`ao`}],[`þar af leiðandi`,{lemma:`þar af leiða`,isStopword:!0,pos:`ao`}],[`þar að auki`,{lemma:`þar að auki`,isStopword:!0,pos:`ao`}],[`það er að segja`,{lemma:`það vera að segja`,isStopword:!0,pos:`ao`}],[`þess vegna`,{lemma:`það vegna`,isStopword:!0,pos:`ao`}],[`því miður`,{lemma:`það lítt`,isStopword:!0,pos:`ao`}],[`þrátt fyrir`,{lemma:`þrátt fyrir`,isStopword:!0,pos:`ao`}],[`á dögunum`,{lemma:`á dagur`,isStopword:!0,pos:`ao`}],[`á sínum tíma`,{lemma:`á sinn tími`,isStopword:!0,pos:`ao`}],[`á endanum`,{lemma:`á endi`,isStopword:!0,pos:`ao`}],[`einu sinni`,{lemma:`einn sinn`,isStopword:!1,pos:`ao`}],[`eitt sinn`,{lemma:`einn sinn`,isStopword:!1,pos:`ao`}],[`í fyrsta sinn`,{lemma:`í fyrstur sinn`,isStopword:!1,pos:`ao`}],[`í kvöld`,{lemma:`í kvöld`,isStopword:!1,pos:`ao`}],[`í morgun`,{lemma:`í morgunn`,isStopword:!1,pos:`ao`}],[`á morgun`,{lemma:`á morgunn`,isStopword:!1,pos:`ao`}],[`fyrir hönd`,{lemma:`fyrir hönd`,isStopword:!1,pos:`fs`}],[`með tilliti til`,{lemma:`með tillit til`,isStopword:!1,pos:`fs`}],[`í ljósi`,{lemma:`í ljós`,isStopword:!1,pos:`fs`}],[`í stað`,{lemma:`í staður`,isStopword:!1,pos:`fs`}],[`fyrir aftan`,{lemma:`fyrir aftan`,isStopword:!1,pos:`fs`}],[`fyrir austan`,{lemma:`fyrir austan`,isStopword:!1,pos:`fs`}],[`fyrir framan`,{lemma:`fyrir framan`,isStopword:!1,pos:`fs`}],[`fyrir handan`,{lemma:`fyrir handan`,isStopword:!1,pos:`fs`}],[`fyrir innan`,{lemma:`fyrir innan`,isStopword:!1,pos:`fs`}],[`fyrir neðan`,{lemma:`fyrir neðan`,isStopword:!1,pos:`fs`}],[`fyrir norðan`,{lemma:`fyrir norðan`,isStopword:!1,pos:`fs`}],[`fyrir ofan`,{lemma:`fyrir ofan`,isStopword:!1,pos:`fs`}],[`fyrir sunnan`,{lemma:`fyrir sunnan`,isStopword:!1,pos:`fs`}],[`fyrir utan`,{lemma:`fyrir utan`,isStopword:!1,pos:`fs`}],[`fyrir vestan`,{lemma:`fyrir vestan`,isStopword:!1,pos:`fs`}],[`í gegnum`,{lemma:`í gegnum`,isStopword:!1,pos:`fs`}],[`í kringum`,{lemma:`í kringum`,isStopword:!1,pos:`fs`}],[`innan við`,{lemma:`innan við`,isStopword:!1,pos:`fs`}],[`upp úr`,{lemma:`upp úr`,isStopword:!1,pos:`fs`}],[`þvert á`,{lemma:`þvert á`,isStopword:!1,pos:`fs`}],[`þar eð`,{lemma:`þar eð`,isStopword:!0,pos:`st`}],[`sameinuðu þjóðirnar`,{lemma:`Sameinuðu þjóðirnar`,isStopword:!1,pos:`entity`}],[`evrópusambandið`,{lemma:`Evrópusambandið`,isStopword:!1,pos:`entity`}],[`nato`,{lemma:`NATO`,isStopword:!1,pos:`entity`}],[`nató`,{lemma:`NATO`,isStopword:!1,pos:`entity`}]]);function H(e,t){for(let n=Math.min(4,e.length-t);n>=2;n--){let r=e.slice(t,t+n).join(` `).toLowerCase(),i=V.get(r);if(i)return{phrase:i,wordCount:n}}return null}function U(e){return V.has(e.toLowerCase())}function W(e){return V.get(e.toLowerCase())}const G=new Set([`word`]),K=new Set([`person`,`company`,`entity`]),q=new Set([`punctuation`,`s_begin`,`s_end`,`s_split`,`unknown`]);function J(t,n,r={}){let{bigrams:i,compoundSplitter:a,includeNumbers:o=!1,alwaysTryCompounds:s=!0}=r,c=e(t),l=[],u=[],d=new Map,f=e=>{let t=e.toLowerCase(),r=d.get(t);if(r)return r;let i=n.lemmatize(e);return d.set(t,i),i};for(let e=0;e<c.length;e++){let t=c[e];if(!q.has(t.kind)){if(K.has(t.kind)){l.push({original:t.text??``,kind:t.kind,lemmas:[],isEntity:!0});continue}if(t.kind===`number`||t.kind===`ordinal`){o&&l.push({original:t.text??``,kind:t.kind,lemmas:[],isEntity:!1});continue}if(G.has(t.kind)){let e=t.text??``,n=f(e),r={original:e,kind:t.kind,lemmas:n,isEntity:!1},i=n.length===1&&n[0]===e.toLowerCase();if(a&&(s||i)){let t=a.split(e);if(t.isCompound){r.compoundSplit=t;let e=t.parts.flatMap(e=>f(e));r.compoundLemmas=e,r.lemmas=[...new Set([...n,...e])]}}l.push(r),u.push({index:l.length-1,token:t});continue}l.push({original:t.text??``,kind:t.kind,lemmas:[],isEntity:!1})}}if(i&&u.length>0){let e=new O(n,i);for(let t=0;t<u.length;t++){let{index:n,token:r}=u[t],i=t>0?u[t-1].token:null,a=t<u.length-1?u[t+1].token:null,o=e.disambiguate(r.text??``,i?.text??null,a?.text??null,{prevLemmas:i?.text?f(i.text):void 0,nextLemmas:a?.text?f(a.text):void 0});l[n].disambiguated=o.lemma,l[n].confidence=o.confidence}}else for(let{index:e}of u){let t=l[e];t.lemmas.length>0&&(t.disambiguated=t.lemmas[0],t.confidence=t.lemmas.length===1?1:.5)}return l}function Y(e,n,r={}){let{removeStopwords:a=!1,indexAllCandidates:o=!0,useContextualStopwords:s=!1}=r,c=J(e,n,r),l=new Set,u=(e,n)=>a?s?i(e,n):t.has(e):!1;for(let e of c)if(!e.isEntity){if(o)for(let t of e.lemmas)u(t)||l.add(t);else e.disambiguated&&(u(e.disambiguated)||l.add(e.disambiguated));if(e.compoundSplit?.isCompound){let t=e.compoundLemmas?e.compoundLemmas:e.compoundSplit.parts.flatMap(e=>n.lemmatize(e));for(let e of t)u(e)||l.add(e)}}return l}function X(e,t,n,r={}){let i=performance.now(),a,o;switch(n){case`naive`:{let n=e.split(/\s+/).filter(e=>e.length>0),r=[];for(let e of n){let n=e.replace(/^[^\p{L}\p{N}]+|[^\p{L}\p{N}]+$/gu,``);if(n){let e=t.lemmatize(n);r.push({original:n,kind:`word`,lemmas:e,isEntity:!1,disambiguated:e[0],confidence:e.length===1?1:.5})}}a=r,o=new Set(r.map(e=>e.disambiguated).filter(Boolean));break}case`tokenized`:a=J(e,t),o=new Set(a.filter(e=>e.kind===`word`&&e.lemmas.length>0).map(e=>e.lemmas[0]));break;case`disambiguated`:a=J(e,t,{bigrams:r.bigrams}),o=Y(e,t,{bigrams:r.bigrams});break;case`full`:a=J(e,t,{bigrams:r.bigrams,compoundSplitter:r.compoundSplitter}),o=Y(e,t,{bigrams:r.bigrams,compoundSplitter:r.compoundSplitter});break}let s=performance.now()-i,c=a.filter(e=>e.kind===`word`),l=c.length,u=c.filter(e=>e.lemmas.length>0&&!(e.lemmas.length===1&&e.lemmas[0]===e.original.toLowerCase())).length,d=c.filter(e=>e.lemmas.length>1).length,f=c.filter(e=>e.confidence!==void 0).map(e=>e.confidence),p=f.length>0?f.reduce((e,t)=>e+t,0)/f.length:0,m=c.filter(e=>e.compoundSplit?.isCompound).length,h=a.filter(e=>e.isEntity).length;return{wordCount:l,lemmatizedCount:u,coverage:l>0?u/l:0,ambiguousCount:d,ambiguityRate:l>0?d/l:0,avgConfidence:p,compoundsFound:m,entitiesSkipped:h,uniqueLemmas:o.size,timeMs:s}}export{d as BinaryLemmatizer,M as CASE_NAMES,r as CONTEXTUAL_STOPWORDS,z as CompoundSplitter,f as DISAMBIGUATION_RULES,O as Disambiguator,N as GENDER_NAMES,g as NOMINATIVE_PRONOUNS,P as NUMBER_NAMES,h as PREPOSITION_CASES,F as PROTECTED_LEMMAS,V as STATIC_PHRASES,t as STOPWORDS_IS,A as WORD_CLASS_NAMES,j as WORD_CLASS_NAMES_IS,x as applyGrammarRules,b as applyNounAfterPrepositionRule,v as applyPrepositionRule,y as applyPronounVerbRule,_ as canGovernCase,B as createKnownLemmaSet,k as extractDisambiguatedLemmas,Y as extractIndexableLemmas,C as getGovernedCases,W as getPhraseInfo,p as getRulesForWord,m as hasDisambiguationRules,i as isContextualStopword,U as isKnownPhrase,S as isKnownPreposition,n as isStopword,H as matchPhrase,J as processText,a as removeStopwords,X as runBenchmark};
|
|
2
2
|
//# sourceMappingURL=index.mjs.map
|