lemma-is 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +107 -430
- package/dist/index.d.mts +31 -3
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +1 -1
- package/dist/index.mjs.map +1 -1
- package/package.json +1 -1
package/dist/index.d.mts
CHANGED
|
@@ -363,6 +363,12 @@ declare function getRulesForWord(word: string): DisambiguationRule[];
|
|
|
363
363
|
declare function hasDisambiguationRules(word: string): boolean;
|
|
364
364
|
//#endregion
|
|
365
365
|
//#region src/mini-grammar.d.ts
|
|
366
|
+
/**
|
|
367
|
+
* Interface for lemmatizer used in grammar rules.
|
|
368
|
+
*/
|
|
369
|
+
interface GrammarLemmatizerLike {
|
|
370
|
+
lemmatizeWithPOS?(word: string): LemmaWithPOS[];
|
|
371
|
+
}
|
|
366
372
|
/**
|
|
367
373
|
* Preposition case government rules.
|
|
368
374
|
*
|
|
@@ -422,19 +428,41 @@ declare function applyPrepositionRule(candidates: LemmaWithMorph[], nextWordMorp
|
|
|
422
428
|
* @returns GrammarRuleMatch if a rule applies, null otherwise
|
|
423
429
|
*/
|
|
424
430
|
declare function applyPronounVerbRule(candidates: LemmaWithMorph[], prevWord: string | null): GrammarRuleMatch | null;
|
|
431
|
+
/**
|
|
432
|
+
* Apply noun-after-preposition rule to disambiguate.
|
|
433
|
+
*
|
|
434
|
+
* If the previous word is a preposition and the current word has a
|
|
435
|
+
* noun candidate with a case governed by that preposition, prefer
|
|
436
|
+
* the noun reading.
|
|
437
|
+
*
|
|
438
|
+
* This rule only applies when:
|
|
439
|
+
* - The previous word is UNAMBIGUOUSLY a preposition (no pronoun reading), OR
|
|
440
|
+
* - The current word has no verb candidate
|
|
441
|
+
*
|
|
442
|
+
* Example: "til fundar" → "fundar" is noun "fundur" (genitive), not verb "funda"
|
|
443
|
+
* Counter-example: "við fórum" → "við" is pronoun, "fórum" is verb "fara"
|
|
444
|
+
*
|
|
445
|
+
* @param candidates - All possible readings of the current word
|
|
446
|
+
* @param prevWord - The previous word (raw form)
|
|
447
|
+
* @param lemmatizer - Lemmatizer for looking up the previous word
|
|
448
|
+
* @returns GrammarRuleMatch if a rule applies, null otherwise
|
|
449
|
+
*/
|
|
450
|
+
declare function applyNounAfterPrepositionRule(candidates: LemmaWithMorph[], prevWord: string | null, lemmatizer: GrammarLemmatizerLike | null): GrammarRuleMatch | null;
|
|
425
451
|
/**
|
|
426
452
|
* Apply all mini-grammar rules in sequence.
|
|
427
453
|
*
|
|
428
454
|
* Rules are applied in order of specificity:
|
|
429
455
|
* 1. Preposition + case government (most reliable)
|
|
430
|
-
* 2.
|
|
456
|
+
* 2. Noun after preposition (governed case)
|
|
457
|
+
* 3. Pronoun + verb pattern
|
|
431
458
|
*
|
|
432
459
|
* @param candidates - All possible readings of the current word
|
|
433
460
|
* @param prevWord - Previous word (raw form)
|
|
434
461
|
* @param nextWordMorph - Morphological analyses of the next word
|
|
462
|
+
* @param lemmatizer - Optional lemmatizer for looking up previous word POS
|
|
435
463
|
* @returns GrammarRuleMatch if any rule applies, null otherwise
|
|
436
464
|
*/
|
|
437
|
-
declare function applyGrammarRules(candidates: LemmaWithMorph[], prevWord: string | null, nextWordMorph: LemmaWithMorph[]): GrammarRuleMatch | null;
|
|
465
|
+
declare function applyGrammarRules(candidates: LemmaWithMorph[], prevWord: string | null, nextWordMorph: LemmaWithMorph[], lemmatizer?: GrammarLemmatizerLike | null): GrammarRuleMatch | null;
|
|
438
466
|
/**
|
|
439
467
|
* Check if a word is a known preposition.
|
|
440
468
|
*/
|
|
@@ -673,5 +701,5 @@ declare function runBenchmark(text: string, lemmatizer: LemmatizerLike, strategy
|
|
|
673
701
|
compoundSplitter?: CompoundSplitter;
|
|
674
702
|
}): ProcessingMetrics;
|
|
675
703
|
//#endregion
|
|
676
|
-
export { type BigramProvider, type BinaryLemmatizeOptions, BinaryLemmatizer, type BinaryLemmatizerOptions, CASE_NAMES, CONTEXTUAL_STOPWORDS, type CompoundSplit, type CompoundSplitMode, CompoundSplitter, type CompoundSplitterOptions, DISAMBIGUATION_RULES, type DisambiguatedToken, type DisambiguationRule, Disambiguator, type DisambiguatorOptions, GENDER_NAMES, type GrammarRuleMatch, type GrammaticalCase, type GrammaticalGender, type GrammaticalNumber, type LemmaWithMorph, type LemmaWithPOS, type LemmatizerLike, type MorphFeatures, NOMINATIVE_PRONOUNS, NUMBER_NAMES, PREPOSITION_CASES, PROTECTED_LEMMAS, type ProcessOptions, type ProcessedToken, type ProcessingMetrics, type ProcessingStrategy, STATIC_PHRASES, STOPWORDS_IS, type StaticPhrase, WORD_CLASS_NAMES, WORD_CLASS_NAMES_IS, type WordClass, applyGrammarRules, applyPrepositionRule, applyPronounVerbRule, canGovernCase, createKnownLemmaSet, extractDisambiguatedLemmas, extractIndexableLemmas, getGovernedCases, getPhraseInfo, getRulesForWord, hasDisambiguationRules, isContextualStopword, isKnownPhrase, isKnownPreposition, isStopword, matchPhrase, processText, removeStopwords, runBenchmark };
|
|
704
|
+
export { type BigramProvider, type BinaryLemmatizeOptions, BinaryLemmatizer, type BinaryLemmatizerOptions, CASE_NAMES, CONTEXTUAL_STOPWORDS, type CompoundSplit, type CompoundSplitMode, CompoundSplitter, type CompoundSplitterOptions, DISAMBIGUATION_RULES, type DisambiguatedToken, type DisambiguationRule, Disambiguator, type DisambiguatorOptions, GENDER_NAMES, type GrammarLemmatizerLike, type GrammarRuleMatch, type GrammaticalCase, type GrammaticalGender, type GrammaticalNumber, type LemmaWithMorph, type LemmaWithPOS, type LemmatizerLike, type MorphFeatures, NOMINATIVE_PRONOUNS, NUMBER_NAMES, PREPOSITION_CASES, PROTECTED_LEMMAS, type ProcessOptions, type ProcessedToken, type ProcessingMetrics, type ProcessingStrategy, STATIC_PHRASES, STOPWORDS_IS, type StaticPhrase, WORD_CLASS_NAMES, WORD_CLASS_NAMES_IS, type WordClass, applyGrammarRules, applyNounAfterPrepositionRule, applyPrepositionRule, applyPronounVerbRule, canGovernCase, createKnownLemmaSet, extractDisambiguatedLemmas, extractIndexableLemmas, getGovernedCases, getPhraseInfo, getRulesForWord, hasDisambiguationRules, isContextualStopword, isKnownPhrase, isKnownPreposition, isStopword, matchPhrase, processText, removeStopwords, runBenchmark };
|
|
677
705
|
//# sourceMappingURL=index.d.mts.map
|
package/dist/index.d.mts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.mts","names":[],"sources":["../src/stopwords.ts","../src/types.ts","../src/binary-lemmatizer.ts","../src/disambiguate.ts","../src/disambiguation-rules.ts","../src/mini-grammar.ts","../src/compounds.ts","../src/phrases.ts","../src/pipeline.ts"],"mappings":";;AAUA;;;;;AA4EA;;cA5Ea,YAAA,EAAY,GAAA;;;AA0FzB;iBAdgB,UAAA,CAAW,IAAA;;;;AAuD3B;;;;;AAiBA;;cA1Da,oBAAA,EAAsB,GAAA,SAAY,GAAA;;;;;;;;;;;iBAyC/B,oBAAA,CAAqB,KAAA,UAAe,GAAA;;;;iBAiBpC,eAAA,kBAAA,CAAkC,KAAA,EAAO,CAAA,KAAM,CAAA;;;;AApJ/D;;;;;AA4EA;;;;KC3EY,SAAA;ADyFZ;;;AAAA,cC1Ea,gBAAA,EAAkB,MAAA,CAAO,SAAA;;ADmHtC;;cCnGa,mBAAA,EAAqB,MAAA,CAAO,SAAA;;;ADoHzC;KCpGY,eAAA;;;;KAKA,iBAAA;;;;KAKA,iBAAA;;;;cAKC,UAAA,EAAY,MAAA,CAAO,eAAA;;;;cAUnB,YAAA,EAAc,MAAA,CAAO,iBAAA;AAzDlC;;;AAAA,cAkEa,YAAA,EAAc,MAAA,CAAO,iBAAA;;AAlDlC;;UA0DiB,aAAA;EACf,IAAA,GAAO,eAAA;EACP,MAAA,GAAS,iBAAA;EACT,MAAA,GAAS,iBAAA;AAAA;;;;UAMM,YAAA;EACf,KAAA;EACA,GAAA,EAAK,SAAA;AAAA;;AA3CP;;UAiDiB,cAAA,SAAuB,YAAA;EACtC,KAAA,GAAQ,aAAA;AAAA;AA7CV;;;;AAAA,UAoDiB,cAAA;EACf,SAAA,CAAU,IAAA;EACV,gBAAA,EAAkB,IAAA,WAAe,YAAA;AAAA;;;AAnCnC;;UA0CiB,cAAA;EACf,IAAA,CAAK,KAAA,UAAe,KAAA;AAAA;;;UCjEL,uBAAA;EACf,KAAA,UAAe,KAAA;AAAA;AAAA,UAGA,sBAAA;EACf,SAAA,GAAY,SAAA;AAAA;AAAA,cAGD,gBAAA,YAA4B,cAAA,EAAgB,cAAA;EAAA,QAC/C,MAAA;EAAA,QACA,UAAA;EAAA,QACA,YAAA;EAAA,QACA,YAAA;EAAA,QACA,WAAA;EAAA,QACA,WAAA;EAAA,QACA,YAAA;EAAA,QACA,OAAA;EAAA,QACA,eAAA;EAAA,QACA,eAAA;EAAA,QACA,eAAA;EAAA,QACA,eAAA;EAAA,QACA,WAAA;EAAA,QAEA,UAAA;EAAA,QACA,SAAA;EAAA,QACA,UAAA;EAAA,QACA,WAAA;EAAA,QACA,OAAA;EAAA,QAEA,OAAA;EAAA,QAED,WAAA,CAAA;ED3EsC;AAgB/C;;EAhB+C,OCiKhC,IAAA,CACX,GAAA,UACA,OAAA,GAAS,uBAAA,GACR,OAAA,CAAQ,gBAAA;EDpJqB;;AAgBlC;EAhBkC,OCmKzB,cAAA,CAAe,MAAA,EAAQ,WAAA,GAAc,gBAAA;;;;UAOpC,SAAA;EDrJmB;;;EAAA,QC4JnB,QAAA;EDvJE;;;EAAA,QC8JF,OAAA;ED9JmB;AAK7B;;;EAL6B,QCsKnB,QAAA;EDjKqC;AAU/C;;;;ECiLE,SAAA,CAAU,IAAA,UAAc,OAAA,GAAS,sBAAA;EDxKtB;;;;;EAAA,QCkNH,WAAA;ED1MoB;;;;ECwO5B,gBAAA,CAAiB,IAAA,WAAe,YAAA;EDrON;;;;ECqQ1B,kBAAA,CAAmB,IAAA,WAAe,cAAA;EDtQzB;;;EC4ST,gBAAA,CAAA;ED3S0B;AAM5B;;EC4SE,UAAA,CAAA;ED1Sc;;;EAAA,QCiTN,UAAA;EDjTM;;AAMhB;;ECmVE,UAAA,CAAW,KAAA,UAAe,KAAA;EDnVwB;;;;EC4VlD,IAAA,CAAK,KAAA,UAAe,KAAA;ED3VC;AAOvB;;EC2VE,OAAA,CAAQ,IAAA;EDzVqC;;;EAAA,ICgWzC,eAAA,CAAA;EDhWc;;;EAAA,ICuWd,aAAA,CAAA;EDhWW;;;EAAA,ICuWX,gBAAA,CAAA;EDtWJ;;;EAAA,IC6WI,UAAA,CAAA;ED7W6B;;;;ECqXjC,YAAA,CAAA;AAAA;;;UCxee,oBAAA;EHyHD;EGvHd,UAAA;;EAEA,WAAA;EHqH8D;EGnH9D,cAAA;EHoI6B;EGlI7B,kBAAA;EHkI8D;EGhI9D,eAAA;AAAA;AAAA,UAGe,kBAAA;EH6H8C;EG3H7D,KAAA;EH2H8D;EGzH9D,KAAA;;EAEA,GAAA,GAAM,SAAA;EF5BI;EE8BV,UAAA;;EAEA,iBAAA,GAAoB,YAAA;EFhCD;EEkCnB,SAAA;EFRD;EEUC,UAAA;EFrB6B;EEuB7B,UAAA;AAAA;;;;UAMQ,mBAAA,SAA4B,cAAA;EACpC,kBAAA,EAAoB,IAAA,WAAe,cAAA;AAAA;;;;
|
|
1
|
+
{"version":3,"file":"index.d.mts","names":[],"sources":["../src/stopwords.ts","../src/types.ts","../src/binary-lemmatizer.ts","../src/disambiguate.ts","../src/disambiguation-rules.ts","../src/mini-grammar.ts","../src/compounds.ts","../src/phrases.ts","../src/pipeline.ts"],"mappings":";;AAUA;;;;;AA4EA;;cA5Ea,YAAA,EAAY,GAAA;;;AA0FzB;iBAdgB,UAAA,CAAW,IAAA;;;;AAuD3B;;;;;AAiBA;;cA1Da,oBAAA,EAAsB,GAAA,SAAY,GAAA;;;;;;;;;;;iBAyC/B,oBAAA,CAAqB,KAAA,UAAe,GAAA;;;;iBAiBpC,eAAA,kBAAA,CAAkC,KAAA,EAAO,CAAA,KAAM,CAAA;;;;AApJ/D;;;;;AA4EA;;;;KC3EY,SAAA;ADyFZ;;;AAAA,cC1Ea,gBAAA,EAAkB,MAAA,CAAO,SAAA;;ADmHtC;;cCnGa,mBAAA,EAAqB,MAAA,CAAO,SAAA;;;ADoHzC;KCpGY,eAAA;;;;KAKA,iBAAA;;;;KAKA,iBAAA;;;;cAKC,UAAA,EAAY,MAAA,CAAO,eAAA;;;;cAUnB,YAAA,EAAc,MAAA,CAAO,iBAAA;AAzDlC;;;AAAA,cAkEa,YAAA,EAAc,MAAA,CAAO,iBAAA;;AAlDlC;;UA0DiB,aAAA;EACf,IAAA,GAAO,eAAA;EACP,MAAA,GAAS,iBAAA;EACT,MAAA,GAAS,iBAAA;AAAA;;;;UAMM,YAAA;EACf,KAAA;EACA,GAAA,EAAK,SAAA;AAAA;;AA3CP;;UAiDiB,cAAA,SAAuB,YAAA;EACtC,KAAA,GAAQ,aAAA;AAAA;AA7CV;;;;AAAA,UAoDiB,cAAA;EACf,SAAA,CAAU,IAAA;EACV,gBAAA,EAAkB,IAAA,WAAe,YAAA;AAAA;;;AAnCnC;;UA0CiB,cAAA;EACf,IAAA,CAAK,KAAA,UAAe,KAAA;AAAA;;;UCjEL,uBAAA;EACf,KAAA,UAAe,KAAA;AAAA;AAAA,UAGA,sBAAA;EACf,SAAA,GAAY,SAAA;AAAA;AAAA,cAGD,gBAAA,YAA4B,cAAA,EAAgB,cAAA;EAAA,QAC/C,MAAA;EAAA,QACA,UAAA;EAAA,QACA,YAAA;EAAA,QACA,YAAA;EAAA,QACA,WAAA;EAAA,QACA,WAAA;EAAA,QACA,YAAA;EAAA,QACA,OAAA;EAAA,QACA,eAAA;EAAA,QACA,eAAA;EAAA,QACA,eAAA;EAAA,QACA,eAAA;EAAA,QACA,WAAA;EAAA,QAEA,UAAA;EAAA,QACA,SAAA;EAAA,QACA,UAAA;EAAA,QACA,WAAA;EAAA,QACA,OAAA;EAAA,QAEA,OAAA;EAAA,QAED,WAAA,CAAA;ED3EsC;AAgB/C;;EAhB+C,OCiKhC,IAAA,CACX,GAAA,UACA,OAAA,GAAS,uBAAA,GACR,OAAA,CAAQ,gBAAA;EDpJqB;;AAgBlC;EAhBkC,OCmKzB,cAAA,CAAe,MAAA,EAAQ,WAAA,GAAc,gBAAA;;;;UAOpC,SAAA;EDrJmB;;;EAAA,QC4JnB,QAAA;EDvJE;;;EAAA,QC8JF,OAAA;ED9JmB;AAK7B;;;EAL6B,QCsKnB,QAAA;EDjKqC;AAU/C;;;;ECiLE,SAAA,CAAU,IAAA,UAAc,OAAA,GAAS,sBAAA;EDxKtB;;;;;EAAA,QCkNH,WAAA;ED1MoB;;;;ECwO5B,gBAAA,CAAiB,IAAA,WAAe,YAAA;EDrON;;;;ECqQ1B,kBAAA,CAAmB,IAAA,WAAe,cAAA;EDtQzB;;;EC4ST,gBAAA,CAAA;ED3S0B;AAM5B;;EC4SE,UAAA,CAAA;ED1Sc;;;EAAA,QCiTN,UAAA;EDjTM;;AAMhB;;ECmVE,UAAA,CAAW,KAAA,UAAe,KAAA;EDnVwB;;;;EC4VlD,IAAA,CAAK,KAAA,UAAe,KAAA;ED3VC;AAOvB;;EC2VE,OAAA,CAAQ,IAAA;EDzVqC;;;EAAA,ICgWzC,eAAA,CAAA;EDhWc;;;EAAA,ICuWd,aAAA,CAAA;EDhWW;;;EAAA,ICuWX,gBAAA,CAAA;EDtWJ;;;EAAA,IC6WI,UAAA,CAAA;ED7W6B;;;;ECqXjC,YAAA,CAAA;AAAA;;;UCxee,oBAAA;EHyHD;EGvHd,UAAA;;EAEA,WAAA;EHqH8D;EGnH9D,cAAA;EHoI6B;EGlI7B,kBAAA;EHkI8D;EGhI9D,eAAA;AAAA;AAAA,UAGe,kBAAA;EH6H8C;EG3H7D,KAAA;EH2H8D;EGzH9D,KAAA;;EAEA,GAAA,GAAM,SAAA;EF5BI;EE8BV,UAAA;;EAEA,iBAAA,GAAoB,YAAA;EFhCD;EEkCnB,SAAA;EFRD;EEUC,UAAA;EFrB6B;EEuB7B,UAAA;AAAA;;;;UAMQ,mBAAA,SAA4B,cAAA;EACpC,kBAAA,EAAoB,IAAA,WAAe,cAAA;AAAA;;;;cAiQxB,aAAA;EACX,UAAA,EAAY,mBAAA;EACZ,OAAA,EAAS,cAAA;EACT,UAAA;EACA,WAAA;EACA,cAAA;EACA,kBAAA;EACA,eAAA;cAGE,UAAA,EAAY,cAAA,EACZ,OAAA,GAAS,cAAA,SACT,OAAA,GAAS,oBAAA;EFjQgB;AAK7B;;;;;AAUA;EEoQE,YAAA,CACE,IAAA,UACA,QAAA,iBACA,QAAA,kBACC,kBAAA;;;;AF/PL;;;EE+TE,eAAA,CAAgB,MAAA,aAAmB,kBAAA;EF/Tc;AAQnD;;;;;EE2UE,aAAA,CAAc,MAAA,aAAmB,GAAA;AAAA;;;;iBAenB,0BAAA,CACd,IAAA,UACA,UAAA,EAAY,cAAA,EACZ,OAAA,EAAS,cAAA,EACT,OAAA;EACE,QAAA,IAAY,IAAA;EACZ,eAAA;AAAA,IAED,GAAA;;;;;;;AHlWH;;UIrFiB,kBAAA;EJqFkB;EInFjC,IAAA;EJ4Hc;EI1Hd,MAAA,EAAQ,SAAA;;EAER,IAAA,EAAM,SAAA;EJwHwD;EItH9D,OAAA;EJuI6B;EIrI7B,WAAA;AAAA;;;;;;;;;;cAYW,oBAAA,EAAsB,kBAAA;;;;iBA6InB,eAAA,CAAgB,IAAA,WAAe,kBAAA;AHxJ/C;;;AAAA,iBGgKgB,sBAAA,CAAuB,IAAA;;;AJtFvC;;;AAAA,UK/EiB,qBAAA;EACf,gBAAA,EAAkB,IAAA,WAAe,YAAA;AAAA;;;;;ALwInC;;;;;cK5Ha,iBAAA,EAAmB,GAAA,SAAY,GAAA,CAAI,eAAA;;;;;;cA+CnC,mBAAA,EAAmB,GAAA;;AJtEhC;;UIsFiB,gBAAA;EJtFI;EIwFnB,KAAA;EJzEW;EI2EX,GAAA,EAAK,SAAA;;EAEL,IAAA;EJ7E6C;EI+E7C,UAAA;AAAA;;;;AJ/CF;;;;iBIyDgB,aAAA,CACd,SAAA,UACA,YAAA,EAAc,eAAA;AJtDhB;;;;;AAKA;;;;;AALA,iBIuEgB,oBAAA,CACd,UAAA,EAAY,cAAA,IACZ,aAAA,EAAe,cAAA,KACd,gBAAA;;;;;AJtDH;;;;;AASA;iBI6EgB,oBAAA,CACd,UAAA,EAAY,cAAA,IACZ,QAAA,kBACC,gBAAA;;;;AJxEH;;;;;;;;;;;;;;;;iBIqHgB,6BAAA,CACd,UAAA,EAAY,cAAA,IACZ,QAAA,iBACA,UAAA,EAAY,qBAAA,UACX,gBAAA;;;;;;;;;AJxGH;;;;;;iBI8JgB,iBAAA,CACd,UAAA,EAAY,cAAA,IACZ,QAAA,iBACA,aAAA,EAAe,cAAA,IACf,UAAA,GAAY,qBAAA,UACX,gBAAA;;;AJ3JH;iBI8KgB,kBAAA,CAAmB,KAAA;;;;iBAOnB,gBAAA,CAAiB,SAAA,WAAoB,GAAA,CAAI,eAAA;;;;;ALrKzD;;cMzHa,gBAAA,EAAgB,GAAA;AAAA,UAoFZ,aAAA;ENqC+C;EMnC9D,IAAA;ENoD6B;EMlD7B,KAAA;ENkD8D;EMhD9D,UAAA;ENgDuD;EM9CvD,UAAA;EN8C6D;EM5C7D,UAAA;AAAA;;;;ALvGF;;;;KKiHY,iBAAA;AAAA,UAEK,uBAAA;ELzFhB;;;;EK8FC,aAAA;EL9ED;EKgFC,iBAAA;EL3FgC;;AAgBlC;;EKgFE,IAAA,GAAO,iBAAA;AAAA;AAAA,cA8EI,gBAAA;EAAA,QACH,UAAA;EAAA,QACA,aAAA;EAAA,QACA,iBAAA;EAAA,QACA,WAAA;EAAA,QACA,IAAA;cAGN,UAAA,EAAY,cAAA,EACZ,WAAA,EAAa,GAAA,UACb,OAAA,GAAS,uBAAA;EL9JgB;;;EAAA,QK0KnB,OAAA;ELrKG;;;;;AAUb;;;EK6KE,KAAA,CAAM,IAAA,WAAe,aAAA;EL7K4B;AASnD;;EATmD,QK0RzC,aAAA;EAAA,QAwBA,QAAA;ELzSyC;AAQnD;;;EKqWE,YAAA,CAAa,IAAA;AAAA;;;;;iBAUC,mBAAA,CAAoB,MAAA,aAAmB,GAAA;;;;ANzcvD;;;;;AA4EA;;;;;AAcA;UOvFiB,YAAA;;EAEf,KAAA;EPqFgD;EOnFhD,UAAA;EP4HkC;EO1HlC,GAAA;AAAA;;AP2IF;;;cOpIa,cAAA,EAAgB,GAAA,SAAY,YAAA;;;;;iBA6GzB,WAAA,CACd,KAAA,YACA,UAAA;EACG,MAAA,EAAQ,YAAA;EAAc,SAAA;AAAA;;AN/H3B;;iBM+IgB,aAAA,CAAc,IAAA;;;ANhI9B;iBMuIgB,aAAA,CAAc,IAAA,WAAe,YAAA;;;;;;UC5H5B,cAAA;ER4Ff;EQ1FA,QAAA;ER6DiC;EQ3DjC,IAAA;ERoGc;EQlGd,MAAA;;EAEA,QAAA;ERgG8D;EQ9F9D,aAAA;ER+G6B;EQ7G7B,UAAA;ER6G8D;EQ3G9D,aAAA,GAAgB,aAAA;AAAA;;;;UAMD,cAAA;;EAEf,OAAA,GAAU,cAAA;;EAEV,gBAAA,GAAmB,gBAAA;EPlDA;EOoDnB,eAAA;EPpDmB;;AAerB;;;;EO4CE,sBAAA;EP5BW;EO8BX,cAAA;;;;APdF;;;EOqBE,kBAAA;EPrByB;AAK3B;;;;;EOuBE,kBAAA;AAAA;;;;APbF;;;;;iBOwBgB,WAAA,CACd,IAAA,UACA,UAAA,EAAY,cAAA,EACZ,OAAA,GAAS,cAAA,GACR,cAAA;;;;;APTH;;;;iBOwIgB,sBAAA,CACd,IAAA,UACA,UAAA,EAAY,cAAA,EACZ,OAAA,GAAS,cAAA,GACR,GAAA;APpIH;;;AAAA,KOqMY,kBAAA;;;;UAKK,iBAAA;EPzMf;EO2MA,SAAA;EP1MA;EO4MA,eAAA;EP3MA;EO6MA,QAAA;EP7M0B;EO+M1B,cAAA;EPzMe;EO2Mf,aAAA;;EAEA,aAAA;EP5MA;EO8MA,cAAA;EP7MK;EO+ML,eAAA;EP/Mc;EOiNd,YAAA;EP3M8B;EO6M9B,MAAA;AAAA;;;;iBAMc,YAAA,CACd,IAAA,UACA,UAAA,EAAY,cAAA,EACZ,QAAA,EAAU,kBAAA,EACV,SAAA;EACE,OAAA,GAAU,cAAA;EACV,gBAAA,GAAmB,gBAAA;AAAA,IAEpB,iBAAA"}
|
package/dist/index.mjs
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import{tokenize as e}from"tokenize-is";const t=new Set(`á.að.aðra.aðrar.aðrir.af.alla.allan.allar.allir.allnokkra.allnokkrar.allnokkrir.allnokkru.allnokkrum.allnokkuð.allnokkur.allnokkurn.allnokkurra.allnokkurrar.allnokkurri.allnokkurs.allnokkurt.allra.allrar.allri.alls.allt.alltað.allur.án.andspænis.annað.annaðhvort.annan.annar.annarra.annarrar.annarri.annars.árla.ásamt.auk.austan.austanundir.austur.báða.báðar.báðir.báðum.bæði.bak.beggja.eða.eður.ef.eftir.ég.ein.eina.einar.einhver.einhverja.einhverjar.einhverjir.einhverju.einhverjum.einhvern.einhverra.einhverrar.einhverri.einhvers.einir.einn.einna.einnar.einni.eins.einskis.einu.einum.eitt.eitthvað.eitthvert.ekkert.ella.ellegar.en.enda.enga.engan.engar.engin.enginn.engir.engra.engrar.engri.engu.engum.er.fáein.fáeina.fáeinar.fáeinir.fáeinna.fáeinum.fjær.fjarri.flestalla.flestallan.flestallar.flestallir.flestallra.flestallrar.flestallri.flestalls.flestallt.flestallur.flestöll.flestöllu.flestöllum.frá.fram.fyrir.fyrst.gagnstætt.gagnvart.gegn.gegnt.gegnum.hana.handa.handan.hann.hans.heldur.hennar.henni.hið.hin.hina.hinar.hinir.hinn.hinna.hinnar.hinni.hins.hinu.hinum.hitt.hjá.honum.hún.hvað.hvaða.hvenær.hver.hverja.hverjar.hverjir.hverju.hverjum.hvern.hverra.hverrar.hverri.hvers.hvert.hvílík.hvílíka.hvílíkan.hvílíkar.hvílíkir.hvílíkra.hvílíkrar.hvílíkri.hvílíks.hvílíkt.hvílíku.hvílíkum.hvílíkur.hvor.hvora.hvorar.hvorir.hvorki.hvorn.hvorra.hvorrar.hvorri.hvors.hvort.hvoru.hvorug.hvoruga.hvorugan.hvorugar.hvorugir.hvorugra.hvorugrar.hvorugri.hvorugs.hvorugt.hvorugu.hvorugum.hvorugur.hvorum.í.inn.innan.innanundir.jafnframt.jafnhliða.kring.kringum.með.meðal.meðan.meður.mér.mestalla.mestallan.mestallar.mestallir.mestallra.mestallrar.mestallri.mestalls.mestallt.mestallur.mestöll.mestöllu.mestöllum.miðli.mig.milli.millum.mín.mína.mínar.mínir.minn.minna.minnar.minni.míns.mínu.mínum.mitt.mót.móti.nær.nærri.næst.næstum.nálægt.né.neðan.nein.neina.neinar.neinir.neinn.neinna.neinnar.neinni.neins.neinu.neinum.neitt.nema.niður.nokkra.nokkrar.nokkrir.nokkru.nokkrum.nokkuð.nokkur.nokkurn.nokkurra.nokkurrar.nokkurri.nokkurs.nokkurt.norðan.nú.öðru.öðrum.of.ofan.ofar.og.óháð.okkar.okkur.öll.öllu.öllum.önnur.órafjarri.oss.sá.sakir.sama.saman.samar.samfara.samhliða.sami.samir.samkvæmt.samra.samrar.samri.sams.samskipa.samt.samtímis.samur.sem.sér.sérhvað.sérhver.sérhverja.sérhverjar.sérhverjir.sérhverju.sérhverjum.sérhvern.sérhverra.sérhverrar.sérhverri.sérhvers.sérhvert.síðan.síðla.sig.sín.sína.sínar.sínhver.sínhverja.sínhverjar.sínhverjir.sínhverju.sínhverjum.sínhvern.sínhverra.sínhverrar.sínhverri.sínhvers.sínhvert.sínhvor.sínhvora.sínhvorar.sínhvorir.sínhvorn.sínhvorra.sínhvorrar.sínhvorri.sínhvors.sínhvort.sínhvoru.sínhvorum.sínir.sinn.sinna.sinnar.sinnhver.sinnhverja.sinnhverjar.sinnhverjir.sinnhverju.sinnhverjum.sinnhvern.sinnhverra.sinnhverrar.sinnhverri.sinnhvers.sinnhvert.sinnhvor.sinnhvora.sinnhvorar.sinnhvorir.sinnhvorn.sinnhvorra.sinnhvorrar.sinnhvorri.sinnhvors.sinnhvort.sinnhvoru.sinnhvorum.sinni.síns.sínu.sínum.sitt.sitthvað.sitthver.sitthverja.sitthverjar.sitthverjir.sitthverju.sitthverjum.sitthvern.sitthverra.sitthverrar.sitthverri.sitthvers.sitthvert.sitthvor.sitthvora.sitthvorar.sitthvorir.sitthvorn.sitthvorra.sitthvorrar.sitthvorri.sitthvors.sitthvort.sitthvoru.sitthvorum.sjálf.sjálfa.sjálfan.sjálfar.sjálfir.sjálfra.sjálfrar.sjálfri.sjálfs.sjálft.sjálfu.sjálfum.sjálfur.slík.slíka.slíkan.slíkar.slíkir.slíkra.slíkrar.slíkri.slíks.slíkt.slíku.slíkum.slíkur.snemma.sökum.söm.sömu.sömum.sú.sum.suma.suman.sumar.sumir.sumra.sumrar.sumri.sums.sumt.sumu.sumum.sumur.sunnan.svo.til.tráss.um.umfram.umhverfis.undan.undir.uns.upp.úr.út.utan.útundan.vegna.vér.vestan.vestur.vettugi.við.viður.vor.vora.vorar.vorir.vorn.vorra.vorrar.vorri.vors.vort.voru.vorum.yðar.yður.yfir.ykkar.ykkur.ýmis.ýmiss.ýmissa.ýmissar.ýmissi.ýmist.ýmsa.ýmsan.ýmsar.ýmsir.ýmsu.ýmsum.þá.það.þær.þann.þar.þau.þegar.þeim.þeir.þeirra.þeirrar.þeirri.þennan.þér.þess.þessa.þessar.þessara.þessarar.þessari.þessi.þessir.þessu.þessum.þetta.þið.þig.þín.þína.þínar.þínir.þinn.þinna.þinnar.þinni.þíns.þínu.þínum.þitt.þó.þónokkra.þónokkrar.þónokkrir.þónokkru.þónokkrum.þónokkuð.þónokkur.þónokkurn.þónokkurra.þónokkurrar.þónokkurri.þónokkurs.þónokkurt.þótt.þú.því.þvílík.þvílíka.þvílíkan.þvílíkar.þvílíkir.þvílíkra.þvílíkrar.þvílíkri.þvílíks.þvílíkt.þvílíku.þvílíkum.þvílíkur`.split(`.`));function n(e){return t.has(e.toLowerCase())}const r=new Map([[`á`,new Set([`fs`,`ao`])],[`við`,new Set([`fs`,`fn`])],[`af`,new Set([`fs`,`ao`])],[`til`,new Set([`fs`])],[`um`,new Set([`fs`])],[`frá`,new Set([`fs`])],[`yfir`,new Set([`fs`,`ao`])],[`undir`,new Set([`fs`,`ao`])],[`fyrir`,new Set([`fs`,`ao`])],[`eftir`,new Set([`fs`,`ao`])],[`gegn`,new Set([`fs`])],[`hjá`,new Set([`fs`])],[`úr`,new Set([`fs`])],[`í`,new Set([`fs`])]]);function i(e,n){let i=e.toLowerCase(),a=r.get(i);return a&&n?a.has(n):t.has(i)}function a(e){return e.filter(e=>!n(e))}const o=1279610177,s=[`no`,`so`,`lo`,`ao`,`fs`,`fn`,`st`,`to`,`gr`,`uh`],c=[void 0,`nf`,`þf`,`þgf`,`ef`],l=[void 0,`kk`,`kvk`,`hk`],u=[`et`,`ft`];var d=class e{buffer;stringPool;lemmaOffsets;lemmaLengths;wordOffsets;wordLengths;entryOffsets;entries;bigramW1Offsets;bigramW1Lengths;bigramW2Offsets;bigramW2Lengths;bigramFreqs;lemmaCount;wordCount;entryCount;bigramCount;version;decoder=new TextDecoder(`utf-8`);constructor(e){this.buffer=e;let t=new DataView(e),n=t.getUint32(0,!0);if(n!==o)throw Error(`Invalid binary format: expected magic 0x${o.toString(16)}, got 0x${n.toString(16)}`);if(this.version=t.getUint32(4,!0),this.version!==1&&this.version!==2)throw Error(`Unsupported version: ${this.version}`);let r=t.getUint32(8,!0);this.lemmaCount=t.getUint32(12,!0),this.wordCount=t.getUint32(16,!0),this.entryCount=t.getUint32(20,!0),this.bigramCount=t.getUint32(24,!0);let i=32;this.stringPool=new Uint8Array(e,i,r),i+=r,this.lemmaOffsets=new Uint32Array(e,i,this.lemmaCount),i+=this.lemmaCount*4,this.lemmaLengths=new Uint8Array(e,i,this.lemmaCount),i+=this.lemmaCount,i=i+3&-4,this.wordOffsets=new Uint32Array(e,i,this.wordCount),i+=this.wordCount*4,this.wordLengths=new Uint8Array(e,i,this.wordCount),i+=this.wordCount,i=i+3&-4,this.entryOffsets=new Uint32Array(e,i,this.wordCount+1),i+=(this.wordCount+1)*4,this.entries=new Uint32Array(e,i,this.entryCount),i+=this.entryCount*4,this.bigramW1Offsets=new Uint32Array(e,i,this.bigramCount),i+=this.bigramCount*4,this.bigramW1Lengths=new Uint8Array(e,i,this.bigramCount),i+=this.bigramCount,i=i+3&-4,this.bigramW2Offsets=new Uint32Array(e,i,this.bigramCount),i+=this.bigramCount*4,this.bigramW2Lengths=new Uint8Array(e,i,this.bigramCount),i+=this.bigramCount,i=i+3&-4,this.bigramFreqs=new Uint32Array(e,i,this.bigramCount)}static async load(t,n={}){let r=await(n.fetch??fetch)(t);if(!r.ok)throw Error(`Failed to load binary data: ${r.status}`);return new e(await r.arrayBuffer())}static loadFromBuffer(t){return new e(t)}getString(e,t){return this.decoder.decode(this.stringPool.subarray(e,e+t))}getLemma(e){return this.getString(this.lemmaOffsets[e],this.lemmaLengths[e])}getWord(e){return this.getString(this.wordOffsets[e],this.wordLengths[e])}findWord(e){let t=0,n=this.wordCount-1;for(;t<=n;){let r=t+n>>>1,i=this.getWord(r);if(i===e)return r;i<e?t=r+1:n=r-1}return-1}lemmatize(e,t={}){let n=e.toLowerCase(),r=this.findWord(n);if(r===-1)return[n];let i=this.entryOffsets[r],a=this.entryOffsets[r+1],{wordClass:o}=t,c=new Set,l=[];for(let e=i;e<a;e++){let{lemmaIdx:t,posCode:n}=this.unpackEntry(this.entries[e]),r=s[n];if(o&&r!==o)continue;let i=this.getLemma(t);c.has(i)||(c.add(i),l.push(i))}return l.length===0?[n]:l}unpackEntry(e){return this.version===1?{lemmaIdx:e>>>4,posCode:e&15,caseCode:0,genderCode:0,numberCode:0}:{lemmaIdx:e>>>10,posCode:e&15,caseCode:e>>>4&7,genderCode:e>>>7&3,numberCode:e>>>9&1}}lemmatizeWithPOS(e){let t=e.toLowerCase(),n=this.findWord(t);if(n===-1)return[];let r=this.entryOffsets[n],i=this.entryOffsets[n+1],a=new Set,o=[];for(let e=r;e<i;e++){let{lemmaIdx:t,posCode:n}=this.unpackEntry(this.entries[e]),r=this.getLemma(t),i=s[n]??``,c=`${r}:${i}`;a.has(c)||(a.add(c),o.push({lemma:r,pos:i}))}return o}lemmatizeWithMorph(e){let t=e.toLowerCase(),n=this.findWord(t);if(n===-1)return[];let r=this.entryOffsets[n],i=this.entryOffsets[n+1],a=[];for(let e=r;e<i;e++){let{lemmaIdx:t,posCode:n,caseCode:r,genderCode:i,numberCode:o}=this.unpackEntry(this.entries[e]),d={},f=c[r],p=l[i],m=u[o];f&&(d.case=f),p&&(d.gender=p),m&&(d.number=m),a.push({lemma:this.getLemma(t),pos:s[n]??``,morph:Object.keys(d).length>0?d:void 0})}return a}hasMorphFeatures(){return this.version>=2}getVersion(){return this.version}findBigram(e,t){let n=0,r=this.bigramCount-1;for(;n<=r;){let i=n+r>>>1,a=this.getString(this.bigramW1Offsets[i],this.bigramW1Lengths[i]);if(a<e)n=i+1;else if(a>e)r=i-1;else{let e=this.getString(this.bigramW2Offsets[i],this.bigramW2Lengths[i]);if(e===t)return i;e<t?n=i+1:r=i-1}}return-1}bigramFreq(e,t){let n=this.findBigram(e.toLowerCase(),t.toLowerCase());return n===-1?0:this.bigramFreqs[n]}freq(e,t){return this.bigramFreq(e,t)}isKnown(e){return this.findWord(e.toLowerCase())!==-1}get lemmaCountValue(){return this.lemmaCount}get wordFormCount(){return this.wordCount}get bigramCountValue(){return this.bigramCount}get bufferSize(){return this.buffer.byteLength}getAllLemmas(){let e=[];for(let t=0;t<this.lemmaCount;t++)e.push(this.getLemma(t));return e}};const f=[{word:`á`,prefer:`so`,over:`fs`,context:`after_pronoun`,description:`á after pronoun = verb 'eiga' (I own, you own)`},{word:`á`,prefer:`fs`,over:`so`,context:`before_noun`,description:`á before noun = preposition (on, at)`},{word:`við`,prefer:`fn`,over:`fs`,context:`sentence_start`,description:`við at sentence start = pronoun 'we'`},{word:`við`,prefer:`fs`,over:`fn`,context:`before_noun`,description:`við before noun = preposition 'by/at'`},{word:`af`,prefer:`fs`,over:`ao`,context:`before_noun`,description:`af before noun = preposition 'of/from'`},{word:`til`,prefer:`fs`,over:`ao`,context:`before_noun`,description:`til before noun = preposition 'to'`},{word:`um`,prefer:`fs`,over:`ao`,context:`before_noun`,description:`um before noun = preposition 'about/around'`},{word:`yfir`,prefer:`fs`,over:`ao`,context:`before_noun`,description:`yfir before noun = preposition 'over'`},{word:`undir`,prefer:`fs`,over:`ao`,context:`before_noun`,description:`undir before noun = preposition 'under'`},{word:`fyrir`,prefer:`fs`,over:`ao`,context:`before_noun`,description:`fyrir before noun = preposition 'for/before'`},{word:`eftir`,prefer:`fs`,over:`ao`,context:`before_noun`,description:`eftir before noun = preposition 'after'`},{word:`frá`,prefer:`fs`,over:`ao`,context:`before_noun`,description:`frá before noun = preposition 'from'`},{word:`með`,prefer:`fs`,over:`ao`,context:`before_noun`,description:`með before noun = preposition 'with'`},{word:`í`,prefer:`fs`,over:`ao`,context:`before_noun`,description:`í before noun = preposition 'in'`},{word:`úr`,prefer:`fs`,over:`no`,context:`before_noun`,description:`úr before noun = preposition 'out of'`}];function p(e){let t=e.toLowerCase();return f.filter(e=>e.word===t)}function m(e){return f.some(t=>t.word===e.toLowerCase())}const h=new Map([[`á`,new Set([`þf`,`þgf`])],[`í`,new Set([`þf`,`þgf`])],[`við`,new Set([`þf`,`þgf`])],[`með`,new Set([`þf`,`þgf`])],[`undir`,new Set([`þf`,`þgf`])],[`yfir`,new Set([`þf`,`þgf`])],[`fyrir`,new Set([`þf`,`þgf`])],[`um`,new Set([`þf`])],[`gegnum`,new Set([`þf`])],[`kringum`,new Set([`þf`])],[`umhverfis`,new Set([`þf`])],[`af`,new Set([`þgf`])],[`frá`,new Set([`þgf`])],[`hjá`,new Set([`þgf`])],[`úr`,new Set([`þgf`])],[`að`,new Set([`þgf`])],[`móti`,new Set([`þgf`])],[`nálægt`,new Set([`þgf`])],[`gegn`,new Set([`þgf`])],[`gagnvart`,new Set([`þgf`])],[`handa`,new Set([`þgf`])],[`meðal`,new Set([`ef`])],[`til`,new Set([`ef`])],[`án`,new Set([`ef`])],[`vegna`,new Set([`ef`])],[`sakir`,new Set([`ef`])],[`utan`,new Set([`ef`])],[`innan`,new Set([`ef`])],[`meðfram`,new Set([`þgf`])],[`milli`,new Set([`ef`])],[`auk`,new Set([`ef`])],[`í stað`,new Set([`ef`])]]),g=new Set([`ég`,`þú`,`hann`,`hún`,`það`,`við`,`þið`,`þeir`,`þær`,`þau`]);function _(e,t){return t?h.get(e)?.has(t)??!1:!1}function v(e,t){let n=e.filter(e=>e.pos===`fs`);if(n.length===0)return null;for(let e of n)for(let n of t)if(n.morph?.case&&_(e.lemma,n.morph.case))return{lemma:e.lemma,pos:`fs`,rule:`prep+${n.morph.case}`,confidence:.9};return null}function y(e,t){if(!t)return null;let n=t.toLowerCase();if(!g.has(n))return null;let r=e.filter(e=>e.pos===`so`);return r.length===0||!e.some(e=>e.pos!==`so`)?null:{lemma:(r.find(e=>e.lemma===`eiga`)??r[0]).lemma,pos:`so`,rule:`pronoun+verb`,confidence:.85}}function b(e,t,n){return v(e,n)||y(e,t)||null}function x(e){return h.has(e)}function S(e){return h.get(e)}const C={name:`unambiguous`,run(e){return e.length===1?{lemma:e[0].lemma,pos:e[0].pos,confidence:1}:null}},w={name:`preference_rules`,run(e,t,n){if(!n.usePreferenceRules)return null;for(let n of f){let r=T(n,e,t);if(r)return{lemma:r.lemma,pos:r.pos,confidence:.85}}return null}};function T(e,t,n){let r=t.find(t=>t.lemma.toLowerCase()===e.word.toLowerCase()&&t.pos===e.prefer),i=t.find(t=>t.lemma.toLowerCase()===e.word.toLowerCase()&&t.pos===e.over);if(!r||!i)return null;if(e.context===`before_noun`){let e=n.nextWord;if(e&&/^[A-ZÁÉÍÓÚÝÞÆÖ]/.test(e))return r}else if(e.context===`before_verb`){let e=n.nextWord?.toLowerCase();if(e&&![`þessi`,`þetta`,`sá`,`sú`,`það`,`hinn`,`hin`,`hið`].includes(e))return r}else if(e.context===`after_pronoun`){let e=n.prevWord?.toLowerCase();if(e&&[`ég`,`þú`,`hann`,`hún`,`það`,`við`,`þið`,`þeir`,`þær`,`þau`].includes(e))return r}return null}const E=[C,w,{name:`grammar_rules`,run(e,t,n){if(!n.useGrammarRules)return null;let r=e.map(e=>({...e,morph:void 0}));if(n.lemmatizer.lemmatizeWithMorph){let e=t.allTokens[t.index];if(e){let t=n.lemmatizer.lemmatizeWithMorph(e);r.length=0,r.push(...t)}}let i=b(r,t.prevWord,t.nextWordMorph??[]);return i?{lemma:i.lemma,pos:i.pos,confidence:i.confidence}:null}},{name:`word_bigrams`,run(e,t,n){if(!n.bigrams||e.length===0)return null;let r=[];for(let i of e){let e=0;if(t.prevWord){let r=t.prevLemmas||n.lemmatizer.lemmatize(t.prevWord);for(let t of r){let r=n.bigrams.freq(t,i.lemma);r>0&&(e+=Math.log(r+1)*n.leftWeight)}}if(t.nextWord){let r=t.nextLemmas||n.lemmatizer.lemmatize(t.nextWord);for(let t of r){let r=n.bigrams.freq(i.lemma,t);r>0&&(e+=Math.log(r+1)*n.rightWeight)}}r.push({candidate:i,score:e})}if(r.sort((e,t)=>t.score-e.score),r.length>0&&r[0].score>0){let e=r[0].score,t=r.reduce((e,t)=>e+Math.exp(t.score),0),n=t>0?Math.exp(e)/t:.5;return{lemma:r[0].candidate.lemma,pos:r[0].candidate.pos,confidence:n}}return null}},{name:`fallback`,run(e){return e.length>0?{lemma:e[0].lemma,pos:e[0].pos,confidence:1/e.length}:null}}];var D=class{lemmatizer;bigrams;leftWeight;rightWeight;usePhraseRules;usePreferenceRules;useGrammarRules;constructor(e,t=null,n={}){this.lemmatizer=e,this.bigrams=t,this.leftWeight=n.leftWeight??1,this.rightWeight=n.rightWeight??1,this.usePhraseRules=n.usePhraseRules??!0,this.usePreferenceRules=n.usePreferenceRules??!0,this.useGrammarRules=n.useGrammarRules??!0}disambiguate(e,t,n){let r;r=this.lemmatizer.lemmatizeWithPOS?this.lemmatizer.lemmatizeWithPOS(e):this.lemmatizer.lemmatize(e).map(e=>({lemma:e,pos:`no`}));let i=r.map(e=>e.lemma),a=e,o;n&&this.lemmatizer.lemmatizeWithMorph&&(o=this.lemmatizer.lemmatizeWithMorph(n));let s={prevWord:t,nextWord:n,nextWordMorph:o,allTokens:[e],index:0};for(let e of E){let t=e.run(r,s,this);if(t)return{token:a,lemma:t.lemma,pos:t.pos,candidates:i,candidatesWithPOS:r,ambiguous:i.length>1,confidence:t.confidence,resolvedBy:e.name}}return{token:a,lemma:e.toLowerCase(),candidates:i,candidatesWithPOS:r,ambiguous:!1,confidence:0,resolvedBy:`none`}}disambiguateAll(e){let t=[];for(let n=0;n<e.length;n++){let r=e[n],i=n>0?e[n-1]:null,a=n<e.length-1?e[n+1]:null;t.push(this.disambiguate(r,i,a))}return t}extractLemmas(e){let t=new Set,n=this.disambiguateAll(e);for(let e of n)t.add(e.lemma);return t}};function O(e,n,r,i={}){let{tokenize:a,removeStopwords:o}=i,s=a?a(e):e.split(/\s+/).filter(e=>e.length>0).map(e=>e.replace(/^[^\p{L}\p{N}]+|[^\p{L}\p{N}]+$/gu,``)).filter(e=>e.length>0),c=new D(n,r).extractLemmas(s);if(o)for(let e of c)t.has(e)&&c.delete(e);return c}const k={no:`noun`,so:`verb`,lo:`adjective`,ao:`adverb`,fs:`preposition`,fn:`pronoun`,st:`conjunction`,to:`numeral`,gr:`article`,uh:`interjection`},A={no:`nafnorð`,so:`sagnorð`,lo:`lýsingarorð`,ao:`atviksorð`,fs:`forsetning`,fn:`fornafn`,st:`samtenging`,to:`töluorð`,gr:`greinir`,uh:`upphrópun`},j={nf:`nominative`,þf:`accusative`,þgf:`dative`,ef:`genitive`},M={kk:`masculine`,kvk:`feminine`,hk:`neuter`},N={et:`singular`,ft:`plural`},P=new Set(`ísland.england.írland.skotland.finnland.grænland.holland.þýskaland.frakkland.pólland.tékkland.svissland.rússland.eistland.lettland.litháen.danmörk.noregur.svíþjóð.bandaríkin.spánn.portúgal.ítalía.grikkland.þingvellir.akureyri.ísafjörður.reykjavík.keflavík.hafnarfjörður.kópavogur.seltjarnarnes.garðabær.mosfellsbær.vestmannaeyjar.húsavík.sauðárkrókur.siglufjörður.ólafsfjörður.dalvík.egilsstaðir.neskaupstaður.seyðisfjörður.eskifjörður.reyðarfjörður.fáskrúðsfjörður.stöðvarfjörður.djúpivogur.höfn.vík.selfoss.hveragerði.þorlákshöfn.grindavík.sandgerði.borgarnes.stykkishólmur.grundarfjörður.ólafsvík.búðardalur.patreksfjörður.flateyri.suðureyri.bolungarvík.hólmavík.hvammstangi.blönduós.skagaströnd.varmahlíð.hlíðarendi.bergþórshvol.íslandsbanki.landsbankinn.arionbanki.alþingi`.split(`.`)),F=new Set(`maður.kona.stjóri.ráðherra.forseti.formaður.fulltrúi.starfsmaður.hús.staður.vegur.borg.bær.dalur.fjörður.félag.banki.sjóður.stofnun.ráð.rannsókn.greiðsla.mál.kerfi.verk.þjónusta.rekstur.viðskipti.verð.kostnaður`.split(`.`)),I=new Set([`vera`,`hafa`,`gera`,`fara`,`koma`,`segja`,`vilja`,`mega`,`þurfa`,`verða`,`geta`,`sjá`,`taka`,`eiga`,`láta`,`halda`,`leyfa`,`búa`]),L=[`s`,`u`,`a`];var R=class{lemmatizer;minPartLength;tryLinkingLetters;knownLemmas;mode;constructor(e,t,n={}){this.lemmatizer=e,this.knownLemmas=t,this.minPartLength=n.minPartLength??3,this.tryLinkingLetters=n.tryLinkingLetters??!0,this.mode=n.mode??`balanced`}noSplit(e,t){return{word:e,parts:t,indexTerms:t,confidence:0,isCompound:!1}}split(e){let t=e.toLowerCase(),n=this.lemmatizer.lemmatize(e),r=n[0]?.toLowerCase();if(r&&P.has(r)||P.has(t))return this.noSplit(e,n);let i=n.length>0&&n[0].toLowerCase()!==t,a=n.length===1;if(this.mode===`conservative`)return e.includes(`-`)?this.splitAtHyphen(e,n):this.noSplit(e,n);if(this.mode===`balanced`&&i&&a&&t.length<12||t.length<this.minPartLength*2)return this.noSplit(e,n);let o=[];for(let e=this.minPartLength;e<=t.length-this.minPartLength;e++){let n=t.slice(0,e),r=t.slice(e),i=this.trySplit(n,r);if(i&&o.push(i),this.tryLinkingLetters){for(let e of L)if(n.endsWith(e)&&n.length>this.minPartLength){let e=n.slice(0,-1),t=this.trySplit(e,r);t&&o.push({...t,score:t.score*.95})}}}if(o.length===0)return this.noSplit(e,n);o.sort((e,t)=>t.score-e.score);let s=o[0];if(this.mode===`balanced`&&i&&s.score<.6)return this.noSplit(e,n);let c=[...new Set([...s.leftParts,...s.rightParts])];return{word:e,parts:c,indexTerms:[...new Set([...c,t])],confidence:Math.min(s.score,1),isCompound:!0}}splitAtHyphen(e,t){let n=e.split(`-`).filter(e=>e.length>0);if(n.length<2)return this.noSplit(e,t);let r=[];for(let e of n){let t=this.lemmatizer.lemmatize(e);r.push(...t)}let i=[...new Set(r)];return{word:e,parts:i,indexTerms:[...new Set([...i,e.toLowerCase()])],confidence:.9,isCompound:!0}}trySplit(e,t){let n=this.lemmatizer.lemmatize(e),r=this.lemmatizer.lemmatize(t),i=[...new Set(n.filter(e=>this.knownLemmas.has(e)))],a=[...new Set(r.filter(e=>this.knownLemmas.has(e)))];if(i.length===0||a.length===0)return null;let o=0,s=1-Math.abs(e.length-t.length)/(e.length+t.length);o+=s*.2;let c=(e.length+t.length)/2,l=Math.min(c/6,1);o+=l*.2,a.some(e=>F.has(e))&&(o+=.3);let u=i.some(e=>I.has(e)),d=a.some(e=>I.has(e));return u&&d?o-=.3:!u&&!d&&(o+=.2),(e.length<4||t.length<4)&&(o-=.15),{leftParts:i,rightParts:a,score:Math.max(0,o)}}getAllLemmas(e){return this.split(e).indexTerms}};function z(e){return new Set(e.map(e=>e.toLowerCase()))}const B=new Map([[`til dæmis`,{lemma:`til dæmi`,isStopword:!0,pos:`ao`}],[`með öðrum orðum`,{lemma:`með annar orð`,isStopword:!0,pos:`ao`}],[`í raun`,{lemma:`í raun`,isStopword:!0,pos:`ao`}],[`í raun og veru`,{lemma:`í raun og vera`,isStopword:!0,pos:`ao`}],[`af og til`,{lemma:`af og til`,isStopword:!0,pos:`ao`}],[`aftur á móti`,{lemma:`aftur á mót`,isStopword:!0,pos:`ao`}],[`alla vega`,{lemma:`allur vegur`,isStopword:!0,pos:`ao`}],[`alls ekki`,{lemma:`alls ekki`,isStopword:!0,pos:`ao`}],[`alls staðar`,{lemma:`allur staður`,isStopword:!0,pos:`ao`}],[`allt í allt`,{lemma:`allur í allur`,isStopword:!0,pos:`ao`}],[`annars vegar`,{lemma:`annar vegur`,isStopword:!0,pos:`ao`}],[`auk þess`,{lemma:`auk það`,isStopword:!0,pos:`ao`}],[`að auki`,{lemma:`að auki`,isStopword:!0,pos:`ao`}],[`að vísu`,{lemma:`að vís`,isStopword:!0,pos:`ao`}],[`að sjálfsögðu`,{lemma:`að sjálfsagður`,isStopword:!0,pos:`ao`}],[`að minnsta kosti`,{lemma:`að lítill kostur`,isStopword:!0,pos:`ao`}],[`að öllu leyti`,{lemma:`að allur leyti`,isStopword:!0,pos:`ao`}],[`að nokkru leyti`,{lemma:`að nokkur leyti`,isStopword:!0,pos:`ao`}],[`ef til vill`,{lemma:`ef til vilja`,isStopword:!0,pos:`ao`}],[`einhvers staðar`,{lemma:`einhver staður`,isStopword:!0,pos:`ao`}],[`einhvern veginn`,{lemma:`einhver vegur`,isStopword:!0,pos:`ao`}],[`ekki síst`,{lemma:`ekki síður`,isStopword:!0,pos:`ao`}],[`engu að síður`,{lemma:`enginn að síður`,isStopword:!0,pos:`ao`}],[`fyrst og fremst`,{lemma:`snemma og fremri`,isStopword:!0,pos:`ao`}],[`hins vegar`,{lemma:`hinn vegur`,isStopword:!0,pos:`ao`}],[`hér og þar`,{lemma:`hér og þar`,isStopword:!0,pos:`ao`}],[`hér um bil`,{lemma:`hér um bil`,isStopword:!0,pos:`ao`}],[`hér á landi`,{lemma:`hér á land`,isStopword:!0,pos:`ao`}],[`hvað mest`,{lemma:`hvað mjög`,isStopword:!0,pos:`ao`}],[`hverju sinni`,{lemma:`hver sinn`,isStopword:!0,pos:`ao`}],[`hvorki né`,{lemma:`hvorki né`,isStopword:!0,pos:`ao`}],[`í burtu`,{lemma:`í burtu`,isStopword:!0,pos:`ao`}],[`í gær`,{lemma:`í gær`,isStopword:!0,pos:`ao`}],[`í senn`,{lemma:`í senn`,isStopword:!0,pos:`ao`}],[`í sífellu`,{lemma:`í sífella`,isStopword:!0,pos:`ao`}],[`lengi vel`,{lemma:`lengi vel`,isStopword:!0,pos:`ao`}],[`meira að segja`,{lemma:`mikill að segja`,isStopword:!0,pos:`ao`}],[`meira og minna`,{lemma:`mikill og lítill`,isStopword:!0,pos:`ao`}],[`meðal annars`,{lemma:`meðal annar`,isStopword:!0,pos:`ao`}],[`nokkurn veginn`,{lemma:`nokkur vegur`,isStopword:!0,pos:`ao`}],[`og svo framvegis`,{lemma:`og svo framvegis`,isStopword:!0,pos:`ao`}],[`satt að segja`,{lemma:`sannur að segja`,isStopword:!0,pos:`ao`}],[`sem betur fer`,{lemma:`sem vel fara`,isStopword:!0,pos:`ao`}],[`smám saman`,{lemma:`smátt saman`,isStopword:!0,pos:`ao`}],[`svo sem`,{lemma:`svo sem`,isStopword:!0,pos:`ao`}],[`sér í lagi`,{lemma:`sér í lag`,isStopword:!0,pos:`ao`}],[`til og frá`,{lemma:`til og frá`,isStopword:!0,pos:`ao`}],[`til baka`,{lemma:`til baka`,isStopword:!0,pos:`ao`}],[`vítt og breitt`,{lemma:`vítt og breitt`,isStopword:!0,pos:`ao`}],[`á ný`,{lemma:`á ný`,isStopword:!0,pos:`ao`}],[`á meðan`,{lemma:`á meðan`,isStopword:!0,pos:`ao`}],[`á sama tíma`,{lemma:`á samur tími`,isStopword:!0,pos:`ao`}],[`á hinn bóginn`,{lemma:`á hinn bógur`,isStopword:!0,pos:`ao`}],[`þar af leiðandi`,{lemma:`þar af leiða`,isStopword:!0,pos:`ao`}],[`þar að auki`,{lemma:`þar að auki`,isStopword:!0,pos:`ao`}],[`það er að segja`,{lemma:`það vera að segja`,isStopword:!0,pos:`ao`}],[`þess vegna`,{lemma:`það vegna`,isStopword:!0,pos:`ao`}],[`því miður`,{lemma:`það lítt`,isStopword:!0,pos:`ao`}],[`þrátt fyrir`,{lemma:`þrátt fyrir`,isStopword:!0,pos:`ao`}],[`á dögunum`,{lemma:`á dagur`,isStopword:!0,pos:`ao`}],[`á sínum tíma`,{lemma:`á sinn tími`,isStopword:!0,pos:`ao`}],[`á endanum`,{lemma:`á endi`,isStopword:!0,pos:`ao`}],[`einu sinni`,{lemma:`einn sinn`,isStopword:!1,pos:`ao`}],[`eitt sinn`,{lemma:`einn sinn`,isStopword:!1,pos:`ao`}],[`í fyrsta sinn`,{lemma:`í fyrstur sinn`,isStopword:!1,pos:`ao`}],[`í kvöld`,{lemma:`í kvöld`,isStopword:!1,pos:`ao`}],[`í morgun`,{lemma:`í morgunn`,isStopword:!1,pos:`ao`}],[`á morgun`,{lemma:`á morgunn`,isStopword:!1,pos:`ao`}],[`fyrir hönd`,{lemma:`fyrir hönd`,isStopword:!1,pos:`fs`}],[`með tilliti til`,{lemma:`með tillit til`,isStopword:!1,pos:`fs`}],[`í ljósi`,{lemma:`í ljós`,isStopword:!1,pos:`fs`}],[`í stað`,{lemma:`í staður`,isStopword:!1,pos:`fs`}],[`fyrir aftan`,{lemma:`fyrir aftan`,isStopword:!1,pos:`fs`}],[`fyrir austan`,{lemma:`fyrir austan`,isStopword:!1,pos:`fs`}],[`fyrir framan`,{lemma:`fyrir framan`,isStopword:!1,pos:`fs`}],[`fyrir handan`,{lemma:`fyrir handan`,isStopword:!1,pos:`fs`}],[`fyrir innan`,{lemma:`fyrir innan`,isStopword:!1,pos:`fs`}],[`fyrir neðan`,{lemma:`fyrir neðan`,isStopword:!1,pos:`fs`}],[`fyrir norðan`,{lemma:`fyrir norðan`,isStopword:!1,pos:`fs`}],[`fyrir ofan`,{lemma:`fyrir ofan`,isStopword:!1,pos:`fs`}],[`fyrir sunnan`,{lemma:`fyrir sunnan`,isStopword:!1,pos:`fs`}],[`fyrir utan`,{lemma:`fyrir utan`,isStopword:!1,pos:`fs`}],[`fyrir vestan`,{lemma:`fyrir vestan`,isStopword:!1,pos:`fs`}],[`í gegnum`,{lemma:`í gegnum`,isStopword:!1,pos:`fs`}],[`í kringum`,{lemma:`í kringum`,isStopword:!1,pos:`fs`}],[`innan við`,{lemma:`innan við`,isStopword:!1,pos:`fs`}],[`upp úr`,{lemma:`upp úr`,isStopword:!1,pos:`fs`}],[`þvert á`,{lemma:`þvert á`,isStopword:!1,pos:`fs`}],[`þar eð`,{lemma:`þar eð`,isStopword:!0,pos:`st`}],[`sameinuðu þjóðirnar`,{lemma:`Sameinuðu þjóðirnar`,isStopword:!1,pos:`entity`}],[`evrópusambandið`,{lemma:`Evrópusambandið`,isStopword:!1,pos:`entity`}],[`nato`,{lemma:`NATO`,isStopword:!1,pos:`entity`}],[`nató`,{lemma:`NATO`,isStopword:!1,pos:`entity`}]]);function V(e,t){for(let n=Math.min(4,e.length-t);n>=2;n--){let r=e.slice(t,t+n).join(` `).toLowerCase(),i=B.get(r);if(i)return{phrase:i,wordCount:n}}return null}function H(e){return B.has(e.toLowerCase())}function U(e){return B.get(e.toLowerCase())}const W=new Set([`word`]),G=new Set([`person`,`company`,`entity`]),K=new Set([`punctuation`,`s_begin`,`s_end`,`s_split`,`unknown`]);function q(t,n,r={}){let{bigrams:i,compoundSplitter:a,includeNumbers:o=!1,alwaysTryCompounds:s=!0}=r,c=e(t),l=[],u=[];for(let e=0;e<c.length;e++){let t=c[e];if(!K.has(t.kind)){if(G.has(t.kind)){l.push({original:t.text??``,kind:t.kind,lemmas:[],isEntity:!0});continue}if(t.kind===`number`||t.kind===`ordinal`){o&&l.push({original:t.text??``,kind:t.kind,lemmas:[],isEntity:!1});continue}if(W.has(t.kind)){let e=t.text??``,r=n.lemmatize(e),i={original:e,kind:t.kind,lemmas:r,isEntity:!1},o=r.length===1&&r[0]===e.toLowerCase();if(a&&(s||o)){let t=a.split(e);if(t.isCompound){i.compoundSplit=t;let e=t.parts.flatMap(e=>n.lemmatize(e));i.lemmas=[...new Set([...r,...e])]}}l.push(i),u.push({index:l.length-1,token:t});continue}l.push({original:t.text??``,kind:t.kind,lemmas:[],isEntity:!1})}}if(i&&u.length>0){let e=new D(n,i);for(let t=0;t<u.length;t++){let{index:n,token:r}=u[t],i=t>0?u[t-1].token:null,a=t<u.length-1?u[t+1].token:null,o=e.disambiguate(r.text??``,i?.text??null,a?.text??null);l[n].disambiguated=o.lemma,l[n].confidence=o.confidence}}else for(let{index:e}of u){let t=l[e];t.lemmas.length>0&&(t.disambiguated=t.lemmas[0],t.confidence=t.lemmas.length===1?1:.5)}return l}function J(e,n,r={}){let{removeStopwords:a=!1,indexAllCandidates:o=!0,useContextualStopwords:s=!1}=r,c=q(e,n,r),l=new Set,u=(e,n)=>a?s?i(e,n):t.has(e):!1;for(let e of c)if(!e.isEntity){if(o)for(let t of e.lemmas)u(t)||l.add(t);else e.disambiguated&&(u(e.disambiguated)||l.add(e.disambiguated));if(e.compoundSplit?.isCompound)for(let t of e.compoundSplit.parts){let e=n.lemmatize(t);for(let t of e)u(t)||l.add(t)}}return l}function Y(e,t,n,r={}){let i=performance.now(),a,o;switch(n){case`naive`:{let n=e.split(/\s+/).filter(e=>e.length>0),r=[];for(let e of n){let n=e.replace(/^[^\p{L}\p{N}]+|[^\p{L}\p{N}]+$/gu,``);if(n){let e=t.lemmatize(n);r.push({original:n,kind:`word`,lemmas:e,isEntity:!1,disambiguated:e[0],confidence:e.length===1?1:.5})}}a=r,o=new Set(r.map(e=>e.disambiguated).filter(Boolean));break}case`tokenized`:a=q(e,t),o=new Set(a.filter(e=>e.kind===`word`&&e.lemmas.length>0).map(e=>e.lemmas[0]));break;case`disambiguated`:a=q(e,t,{bigrams:r.bigrams}),o=J(e,t,{bigrams:r.bigrams});break;case`full`:a=q(e,t,{bigrams:r.bigrams,compoundSplitter:r.compoundSplitter}),o=J(e,t,{bigrams:r.bigrams,compoundSplitter:r.compoundSplitter});break}let s=performance.now()-i,c=a.filter(e=>e.kind===`word`),l=c.length,u=c.filter(e=>e.lemmas.length>0&&!(e.lemmas.length===1&&e.lemmas[0]===e.original.toLowerCase())).length,d=c.filter(e=>e.lemmas.length>1).length,f=c.filter(e=>e.confidence!==void 0).map(e=>e.confidence),p=f.length>0?f.reduce((e,t)=>e+t,0)/f.length:0,m=c.filter(e=>e.compoundSplit?.isCompound).length,h=a.filter(e=>e.isEntity).length;return{wordCount:l,lemmatizedCount:u,coverage:l>0?u/l:0,ambiguousCount:d,ambiguityRate:l>0?d/l:0,avgConfidence:p,compoundsFound:m,entitiesSkipped:h,uniqueLemmas:o.size,timeMs:s}}export{d as BinaryLemmatizer,j as CASE_NAMES,r as CONTEXTUAL_STOPWORDS,R as CompoundSplitter,f as DISAMBIGUATION_RULES,D as Disambiguator,M as GENDER_NAMES,g as NOMINATIVE_PRONOUNS,N as NUMBER_NAMES,h as PREPOSITION_CASES,P as PROTECTED_LEMMAS,B as STATIC_PHRASES,t as STOPWORDS_IS,k as WORD_CLASS_NAMES,A as WORD_CLASS_NAMES_IS,b as applyGrammarRules,v as applyPrepositionRule,y as applyPronounVerbRule,_ as canGovernCase,z as createKnownLemmaSet,O as extractDisambiguatedLemmas,J as extractIndexableLemmas,S as getGovernedCases,U as getPhraseInfo,p as getRulesForWord,m as hasDisambiguationRules,i as isContextualStopword,H as isKnownPhrase,x as isKnownPreposition,n as isStopword,V as matchPhrase,q as processText,a as removeStopwords,Y as runBenchmark};
|
|
1
|
+
import{tokenize as e}from"tokenize-is";const t=new Set(`á.að.aðra.aðrar.aðrir.af.alla.allan.allar.allir.allnokkra.allnokkrar.allnokkrir.allnokkru.allnokkrum.allnokkuð.allnokkur.allnokkurn.allnokkurra.allnokkurrar.allnokkurri.allnokkurs.allnokkurt.allra.allrar.allri.alls.allt.alltað.allur.án.andspænis.annað.annaðhvort.annan.annar.annarra.annarrar.annarri.annars.árla.ásamt.auk.austan.austanundir.austur.báða.báðar.báðir.báðum.bæði.bak.beggja.eða.eður.ef.eftir.ég.ein.eina.einar.einhver.einhverja.einhverjar.einhverjir.einhverju.einhverjum.einhvern.einhverra.einhverrar.einhverri.einhvers.einir.einn.einna.einnar.einni.eins.einskis.einu.einum.eitt.eitthvað.eitthvert.ekkert.ella.ellegar.en.enda.enga.engan.engar.engin.enginn.engir.engra.engrar.engri.engu.engum.er.fáein.fáeina.fáeinar.fáeinir.fáeinna.fáeinum.fjær.fjarri.flestalla.flestallan.flestallar.flestallir.flestallra.flestallrar.flestallri.flestalls.flestallt.flestallur.flestöll.flestöllu.flestöllum.frá.fram.fyrir.fyrst.gagnstætt.gagnvart.gegn.gegnt.gegnum.hana.handa.handan.hann.hans.heldur.hennar.henni.hið.hin.hina.hinar.hinir.hinn.hinna.hinnar.hinni.hins.hinu.hinum.hitt.hjá.honum.hún.hvað.hvaða.hvenær.hver.hverja.hverjar.hverjir.hverju.hverjum.hvern.hverra.hverrar.hverri.hvers.hvert.hvílík.hvílíka.hvílíkan.hvílíkar.hvílíkir.hvílíkra.hvílíkrar.hvílíkri.hvílíks.hvílíkt.hvílíku.hvílíkum.hvílíkur.hvor.hvora.hvorar.hvorir.hvorki.hvorn.hvorra.hvorrar.hvorri.hvors.hvort.hvoru.hvorug.hvoruga.hvorugan.hvorugar.hvorugir.hvorugra.hvorugrar.hvorugri.hvorugs.hvorugt.hvorugu.hvorugum.hvorugur.hvorum.í.inn.innan.innanundir.jafnframt.jafnhliða.kring.kringum.með.meðal.meðan.meður.mér.mestalla.mestallan.mestallar.mestallir.mestallra.mestallrar.mestallri.mestalls.mestallt.mestallur.mestöll.mestöllu.mestöllum.miðli.mig.milli.millum.mín.mína.mínar.mínir.minn.minna.minnar.minni.míns.mínu.mínum.mitt.mót.móti.nær.nærri.næst.næstum.nálægt.né.neðan.nein.neina.neinar.neinir.neinn.neinna.neinnar.neinni.neins.neinu.neinum.neitt.nema.niður.nokkra.nokkrar.nokkrir.nokkru.nokkrum.nokkuð.nokkur.nokkurn.nokkurra.nokkurrar.nokkurri.nokkurs.nokkurt.norðan.nú.öðru.öðrum.of.ofan.ofar.og.óháð.okkar.okkur.öll.öllu.öllum.önnur.órafjarri.oss.sá.sakir.sama.saman.samar.samfara.samhliða.sami.samir.samkvæmt.samra.samrar.samri.sams.samskipa.samt.samtímis.samur.sem.sér.sérhvað.sérhver.sérhverja.sérhverjar.sérhverjir.sérhverju.sérhverjum.sérhvern.sérhverra.sérhverrar.sérhverri.sérhvers.sérhvert.síðan.síðla.sig.sín.sína.sínar.sínhver.sínhverja.sínhverjar.sínhverjir.sínhverju.sínhverjum.sínhvern.sínhverra.sínhverrar.sínhverri.sínhvers.sínhvert.sínhvor.sínhvora.sínhvorar.sínhvorir.sínhvorn.sínhvorra.sínhvorrar.sínhvorri.sínhvors.sínhvort.sínhvoru.sínhvorum.sínir.sinn.sinna.sinnar.sinnhver.sinnhverja.sinnhverjar.sinnhverjir.sinnhverju.sinnhverjum.sinnhvern.sinnhverra.sinnhverrar.sinnhverri.sinnhvers.sinnhvert.sinnhvor.sinnhvora.sinnhvorar.sinnhvorir.sinnhvorn.sinnhvorra.sinnhvorrar.sinnhvorri.sinnhvors.sinnhvort.sinnhvoru.sinnhvorum.sinni.síns.sínu.sínum.sitt.sitthvað.sitthver.sitthverja.sitthverjar.sitthverjir.sitthverju.sitthverjum.sitthvern.sitthverra.sitthverrar.sitthverri.sitthvers.sitthvert.sitthvor.sitthvora.sitthvorar.sitthvorir.sitthvorn.sitthvorra.sitthvorrar.sitthvorri.sitthvors.sitthvort.sitthvoru.sitthvorum.sjálf.sjálfa.sjálfan.sjálfar.sjálfir.sjálfra.sjálfrar.sjálfri.sjálfs.sjálft.sjálfu.sjálfum.sjálfur.slík.slíka.slíkan.slíkar.slíkir.slíkra.slíkrar.slíkri.slíks.slíkt.slíku.slíkum.slíkur.snemma.sökum.söm.sömu.sömum.sú.sum.suma.suman.sumar.sumir.sumra.sumrar.sumri.sums.sumt.sumu.sumum.sumur.sunnan.svo.til.tráss.um.umfram.umhverfis.undan.undir.uns.upp.úr.út.utan.útundan.vegna.vér.vestan.vestur.vettugi.við.viður.vor.vora.vorar.vorir.vorn.vorra.vorrar.vorri.vors.vort.voru.vorum.yðar.yður.yfir.ykkar.ykkur.ýmis.ýmiss.ýmissa.ýmissar.ýmissi.ýmist.ýmsa.ýmsan.ýmsar.ýmsir.ýmsu.ýmsum.þá.það.þær.þann.þar.þau.þegar.þeim.þeir.þeirra.þeirrar.þeirri.þennan.þér.þess.þessa.þessar.þessara.þessarar.þessari.þessi.þessir.þessu.þessum.þetta.þið.þig.þín.þína.þínar.þínir.þinn.þinna.þinnar.þinni.þíns.þínu.þínum.þitt.þó.þónokkra.þónokkrar.þónokkrir.þónokkru.þónokkrum.þónokkuð.þónokkur.þónokkurn.þónokkurra.þónokkurrar.þónokkurri.þónokkurs.þónokkurt.þótt.þú.því.þvílík.þvílíka.þvílíkan.þvílíkar.þvílíkir.þvílíkra.þvílíkrar.þvílíkri.þvílíks.þvílíkt.þvílíku.þvílíkum.þvílíkur`.split(`.`));function n(e){return t.has(e.toLowerCase())}const r=new Map([[`á`,new Set([`fs`,`ao`])],[`við`,new Set([`fs`,`fn`])],[`af`,new Set([`fs`,`ao`])],[`til`,new Set([`fs`])],[`um`,new Set([`fs`])],[`frá`,new Set([`fs`])],[`yfir`,new Set([`fs`,`ao`])],[`undir`,new Set([`fs`,`ao`])],[`fyrir`,new Set([`fs`,`ao`])],[`eftir`,new Set([`fs`,`ao`])],[`gegn`,new Set([`fs`])],[`hjá`,new Set([`fs`])],[`úr`,new Set([`fs`])],[`í`,new Set([`fs`])]]);function i(e,n){let i=e.toLowerCase(),a=r.get(i);return a&&n?a.has(n):t.has(i)}function a(e){return e.filter(e=>!n(e))}const o=1279610177,s=[`no`,`so`,`lo`,`ao`,`fs`,`fn`,`st`,`to`,`gr`,`uh`],c=[void 0,`nf`,`þf`,`þgf`,`ef`],l=[void 0,`kk`,`kvk`,`hk`],u=[`et`,`ft`];var d=class e{buffer;stringPool;lemmaOffsets;lemmaLengths;wordOffsets;wordLengths;entryOffsets;entries;bigramW1Offsets;bigramW1Lengths;bigramW2Offsets;bigramW2Lengths;bigramFreqs;lemmaCount;wordCount;entryCount;bigramCount;version;decoder=new TextDecoder(`utf-8`);constructor(e){this.buffer=e;let t=new DataView(e),n=t.getUint32(0,!0);if(n!==o)throw Error(`Invalid binary format: expected magic 0x${o.toString(16)}, got 0x${n.toString(16)}`);if(this.version=t.getUint32(4,!0),this.version!==1&&this.version!==2)throw Error(`Unsupported version: ${this.version}`);let r=t.getUint32(8,!0);this.lemmaCount=t.getUint32(12,!0),this.wordCount=t.getUint32(16,!0),this.entryCount=t.getUint32(20,!0),this.bigramCount=t.getUint32(24,!0);let i=32;this.stringPool=new Uint8Array(e,i,r),i+=r,this.lemmaOffsets=new Uint32Array(e,i,this.lemmaCount),i+=this.lemmaCount*4,this.lemmaLengths=new Uint8Array(e,i,this.lemmaCount),i+=this.lemmaCount,i=i+3&-4,this.wordOffsets=new Uint32Array(e,i,this.wordCount),i+=this.wordCount*4,this.wordLengths=new Uint8Array(e,i,this.wordCount),i+=this.wordCount,i=i+3&-4,this.entryOffsets=new Uint32Array(e,i,this.wordCount+1),i+=(this.wordCount+1)*4,this.entries=new Uint32Array(e,i,this.entryCount),i+=this.entryCount*4,this.bigramW1Offsets=new Uint32Array(e,i,this.bigramCount),i+=this.bigramCount*4,this.bigramW1Lengths=new Uint8Array(e,i,this.bigramCount),i+=this.bigramCount,i=i+3&-4,this.bigramW2Offsets=new Uint32Array(e,i,this.bigramCount),i+=this.bigramCount*4,this.bigramW2Lengths=new Uint8Array(e,i,this.bigramCount),i+=this.bigramCount,i=i+3&-4,this.bigramFreqs=new Uint32Array(e,i,this.bigramCount)}static async load(t,n={}){let r=await(n.fetch??fetch)(t);if(!r.ok)throw Error(`Failed to load binary data: ${r.status}`);return new e(await r.arrayBuffer())}static loadFromBuffer(t){return new e(t)}getString(e,t){return this.decoder.decode(this.stringPool.subarray(e,e+t))}getLemma(e){return this.getString(this.lemmaOffsets[e],this.lemmaLengths[e])}getWord(e){return this.getString(this.wordOffsets[e],this.wordLengths[e])}findWord(e){let t=0,n=this.wordCount-1;for(;t<=n;){let r=t+n>>>1,i=this.getWord(r);if(i===e)return r;i<e?t=r+1:n=r-1}return-1}lemmatize(e,t={}){let n=e.toLowerCase(),r=this.findWord(n);if(r===-1)return[n];let i=this.entryOffsets[r],a=this.entryOffsets[r+1],{wordClass:o}=t,c=new Set,l=[];for(let e=i;e<a;e++){let{lemmaIdx:t,posCode:n}=this.unpackEntry(this.entries[e]),r=s[n];if(o&&r!==o)continue;let i=this.getLemma(t);c.has(i)||(c.add(i),l.push(i))}return l.length===0?[n]:l}unpackEntry(e){return this.version===1?{lemmaIdx:e>>>4,posCode:e&15,caseCode:0,genderCode:0,numberCode:0}:{lemmaIdx:e>>>10,posCode:e&15,caseCode:e>>>4&7,genderCode:e>>>7&3,numberCode:e>>>9&1}}lemmatizeWithPOS(e){let t=e.toLowerCase(),n=this.findWord(t);if(n===-1)return[];let r=this.entryOffsets[n],i=this.entryOffsets[n+1],a=new Set,o=[];for(let e=r;e<i;e++){let{lemmaIdx:t,posCode:n}=this.unpackEntry(this.entries[e]),r=this.getLemma(t),i=s[n]??``,c=`${r}:${i}`;a.has(c)||(a.add(c),o.push({lemma:r,pos:i}))}return o}lemmatizeWithMorph(e){let t=e.toLowerCase(),n=this.findWord(t);if(n===-1)return[];let r=this.entryOffsets[n],i=this.entryOffsets[n+1],a=[];for(let e=r;e<i;e++){let{lemmaIdx:t,posCode:n,caseCode:r,genderCode:i,numberCode:o}=this.unpackEntry(this.entries[e]),d={},f=c[r],p=l[i],m=u[o];f&&(d.case=f),p&&(d.gender=p),m&&(d.number=m),a.push({lemma:this.getLemma(t),pos:s[n]??``,morph:Object.keys(d).length>0?d:void 0})}return a}hasMorphFeatures(){return this.version>=2}getVersion(){return this.version}findBigram(e,t){let n=0,r=this.bigramCount-1;for(;n<=r;){let i=n+r>>>1,a=this.getString(this.bigramW1Offsets[i],this.bigramW1Lengths[i]);if(a<e)n=i+1;else if(a>e)r=i-1;else{let e=this.getString(this.bigramW2Offsets[i],this.bigramW2Lengths[i]);if(e===t)return i;e<t?n=i+1:r=i-1}}return-1}bigramFreq(e,t){let n=this.findBigram(e.toLowerCase(),t.toLowerCase());return n===-1?0:this.bigramFreqs[n]}freq(e,t){return this.bigramFreq(e,t)}isKnown(e){return this.findWord(e.toLowerCase())!==-1}get lemmaCountValue(){return this.lemmaCount}get wordFormCount(){return this.wordCount}get bigramCountValue(){return this.bigramCount}get bufferSize(){return this.buffer.byteLength}getAllLemmas(){let e=[];for(let t=0;t<this.lemmaCount;t++)e.push(this.getLemma(t));return e}};const f=[{word:`á`,prefer:`so`,over:`fs`,context:`after_pronoun`,description:`á after pronoun = verb 'eiga' (I own, you own)`},{word:`á`,prefer:`fs`,over:`so`,context:`before_noun`,description:`á before noun = preposition (on, at)`},{word:`við`,prefer:`fn`,over:`fs`,context:`sentence_start`,description:`við at sentence start = pronoun 'we'`},{word:`við`,prefer:`fs`,over:`fn`,context:`before_noun`,description:`við before noun = preposition 'by/at'`},{word:`af`,prefer:`fs`,over:`ao`,context:`before_noun`,description:`af before noun = preposition 'of/from'`},{word:`til`,prefer:`fs`,over:`ao`,context:`before_noun`,description:`til before noun = preposition 'to'`},{word:`um`,prefer:`fs`,over:`ao`,context:`before_noun`,description:`um before noun = preposition 'about/around'`},{word:`yfir`,prefer:`fs`,over:`ao`,context:`before_noun`,description:`yfir before noun = preposition 'over'`},{word:`undir`,prefer:`fs`,over:`ao`,context:`before_noun`,description:`undir before noun = preposition 'under'`},{word:`fyrir`,prefer:`fs`,over:`ao`,context:`before_noun`,description:`fyrir before noun = preposition 'for/before'`},{word:`eftir`,prefer:`fs`,over:`ao`,context:`before_noun`,description:`eftir before noun = preposition 'after'`},{word:`frá`,prefer:`fs`,over:`ao`,context:`before_noun`,description:`frá before noun = preposition 'from'`},{word:`með`,prefer:`fs`,over:`ao`,context:`before_noun`,description:`með before noun = preposition 'with'`},{word:`í`,prefer:`fs`,over:`ao`,context:`before_noun`,description:`í before noun = preposition 'in'`},{word:`úr`,prefer:`fs`,over:`no`,context:`before_noun`,description:`úr before noun = preposition 'out of'`}];function p(e){let t=e.toLowerCase();return f.filter(e=>e.word===t)}function m(e){return f.some(t=>t.word===e.toLowerCase())}const h=new Map([[`á`,new Set([`þf`,`þgf`])],[`í`,new Set([`þf`,`þgf`])],[`við`,new Set([`þf`,`þgf`])],[`með`,new Set([`þf`,`þgf`])],[`undir`,new Set([`þf`,`þgf`])],[`yfir`,new Set([`þf`,`þgf`])],[`fyrir`,new Set([`þf`,`þgf`])],[`um`,new Set([`þf`])],[`gegnum`,new Set([`þf`])],[`kringum`,new Set([`þf`])],[`umhverfis`,new Set([`þf`])],[`af`,new Set([`þgf`])],[`frá`,new Set([`þgf`])],[`hjá`,new Set([`þgf`])],[`úr`,new Set([`þgf`])],[`að`,new Set([`þgf`])],[`móti`,new Set([`þgf`])],[`nálægt`,new Set([`þgf`])],[`gegn`,new Set([`þgf`])],[`gagnvart`,new Set([`þgf`])],[`handa`,new Set([`þgf`])],[`meðal`,new Set([`ef`])],[`til`,new Set([`ef`])],[`án`,new Set([`ef`])],[`vegna`,new Set([`ef`])],[`sakir`,new Set([`ef`])],[`utan`,new Set([`ef`])],[`innan`,new Set([`ef`])],[`meðfram`,new Set([`þgf`])],[`milli`,new Set([`ef`])],[`auk`,new Set([`ef`])],[`í stað`,new Set([`ef`])]]),g=new Set([`ég`,`þú`,`hann`,`hún`,`það`,`við`,`þið`,`þeir`,`þær`,`þau`]);function _(e,t){return t?h.get(e)?.has(t)??!1:!1}function v(e,t){let n=e.filter(e=>e.pos===`fs`);if(n.length===0)return null;for(let e of n)for(let n of t)if(n.morph?.case&&_(e.lemma,n.morph.case))return{lemma:e.lemma,pos:`fs`,rule:`prep+${n.morph.case}`,confidence:.9};return null}function y(e,t){if(!t)return null;let n=t.toLowerCase();if(!g.has(n))return null;let r=e.filter(e=>e.pos===`so`);return r.length===0||!e.some(e=>e.pos!==`so`)?null:{lemma:(r.find(e=>e.lemma===`eiga`)??r[0]).lemma,pos:`so`,rule:`pronoun+verb`,confidence:.85}}function b(e,t,n){if(!t||!n?.lemmatizeWithPOS)return null;let r=n.lemmatizeWithPOS(t),i=r.find(e=>e.pos===`fs`);if(!i)return null;let a=r.some(e=>e.pos===`fn`),o=e.some(e=>e.pos===`so`);if(a&&o)return null;let s=h.get(i.lemma);if(!s)return null;let c=e.filter(e=>e.pos===`no`);for(let e of c)if(e.morph?.case&&s.has(e.morph.case))return{lemma:e.lemma,pos:`no`,rule:`noun_after_prep+${e.morph.case}`,confidence:.9};return null}function x(e,t,n,r=null){return v(e,n)||b(e,t,r)||y(e,t)||null}function S(e){return h.has(e)}function C(e){return h.get(e)}const w={name:`unambiguous`,run(e){return e.length===1?{lemma:e[0].lemma,pos:e[0].pos,confidence:1}:null}},T={name:`preference_rules`,run(e,t,n){if(!n.usePreferenceRules)return null;for(let n of f){let r=E(n,e,t);if(r)return{lemma:r.lemma,pos:r.pos,confidence:.85}}return null}};function E(e,t,n){let r=t.find(t=>t.lemma.toLowerCase()===e.word.toLowerCase()&&t.pos===e.prefer),i=t.find(t=>t.lemma.toLowerCase()===e.word.toLowerCase()&&t.pos===e.over);if(!r||!i)return null;if(e.context===`before_noun`){let e=n.nextWord;if(e&&/^[A-ZÁÉÍÓÚÝÞÆÖ]/.test(e))return r}else if(e.context===`before_verb`){let e=n.nextWord?.toLowerCase();if(e&&![`þessi`,`þetta`,`sá`,`sú`,`það`,`hinn`,`hin`,`hið`].includes(e))return r}else if(e.context===`after_pronoun`){let e=n.prevWord?.toLowerCase();if(e&&[`ég`,`þú`,`hann`,`hún`,`það`,`við`,`þið`,`þeir`,`þær`,`þau`].includes(e))return r}return null}const D=[w,T,{name:`grammar_rules`,run(e,t,n){if(!n.useGrammarRules)return null;let r=e.map(e=>({...e,morph:void 0}));if(n.lemmatizer.lemmatizeWithMorph){let e=t.allTokens[t.index];if(e){let t=n.lemmatizer.lemmatizeWithMorph(e);r.length=0,r.push(...t)}}let i=x(r,t.prevWord,t.nextWordMorph??[],n.lemmatizer);return i?{lemma:i.lemma,pos:i.pos,confidence:i.confidence}:null}},{name:`word_bigrams`,run(e,t,n){if(!n.bigrams||e.length===0)return null;let r=[];for(let i of e){let e=0;if(t.prevWord){let r=t.prevLemmas||n.lemmatizer.lemmatize(t.prevWord);for(let t of r){let r=n.bigrams.freq(t,i.lemma);r>0&&(e+=Math.log(r+1)*n.leftWeight)}}if(t.nextWord){let r=t.nextLemmas||n.lemmatizer.lemmatize(t.nextWord);for(let t of r){let r=n.bigrams.freq(i.lemma,t);r>0&&(e+=Math.log(r+1)*n.rightWeight)}}r.push({candidate:i,score:e})}if(r.sort((e,t)=>t.score-e.score),r.length>0&&r[0].score>0){let e=r[0].score,t=r.reduce((e,t)=>e+Math.exp(t.score),0),n=t>0?Math.exp(e)/t:.5;return{lemma:r[0].candidate.lemma,pos:r[0].candidate.pos,confidence:n}}return null}},{name:`fallback`,run(e){return e.length>0?{lemma:e[0].lemma,pos:e[0].pos,confidence:1/e.length}:null}}];var O=class{lemmatizer;bigrams;leftWeight;rightWeight;usePhraseRules;usePreferenceRules;useGrammarRules;constructor(e,t=null,n={}){this.lemmatizer=e,this.bigrams=t,this.leftWeight=n.leftWeight??1,this.rightWeight=n.rightWeight??1,this.usePhraseRules=n.usePhraseRules??!0,this.usePreferenceRules=n.usePreferenceRules??!0,this.useGrammarRules=n.useGrammarRules??!0}disambiguate(e,t,n){let r;r=this.lemmatizer.lemmatizeWithPOS?this.lemmatizer.lemmatizeWithPOS(e):this.lemmatizer.lemmatize(e).map(e=>({lemma:e,pos:`no`}));let i=r.map(e=>e.lemma),a=e,o;n&&this.lemmatizer.lemmatizeWithMorph&&(o=this.lemmatizer.lemmatizeWithMorph(n));let s={prevWord:t,nextWord:n,nextWordMorph:o,allTokens:[e],index:0};for(let e of D){let t=e.run(r,s,this);if(t)return{token:a,lemma:t.lemma,pos:t.pos,candidates:i,candidatesWithPOS:r,ambiguous:i.length>1,confidence:t.confidence,resolvedBy:e.name}}return{token:a,lemma:e.toLowerCase(),candidates:i,candidatesWithPOS:r,ambiguous:!1,confidence:0,resolvedBy:`none`}}disambiguateAll(e){let t=[];for(let n=0;n<e.length;n++){let r=e[n],i=n>0?e[n-1]:null,a=n<e.length-1?e[n+1]:null;t.push(this.disambiguate(r,i,a))}return t}extractLemmas(e){let t=new Set,n=this.disambiguateAll(e);for(let e of n)t.add(e.lemma);return t}};function k(e,n,r,i={}){let{tokenize:a,removeStopwords:o}=i,s=a?a(e):e.split(/\s+/).filter(e=>e.length>0).map(e=>e.replace(/^[^\p{L}\p{N}]+|[^\p{L}\p{N}]+$/gu,``)).filter(e=>e.length>0),c=new O(n,r).extractLemmas(s);if(o)for(let e of c)t.has(e)&&c.delete(e);return c}const A={no:`noun`,so:`verb`,lo:`adjective`,ao:`adverb`,fs:`preposition`,fn:`pronoun`,st:`conjunction`,to:`numeral`,gr:`article`,uh:`interjection`},j={no:`nafnorð`,so:`sagnorð`,lo:`lýsingarorð`,ao:`atviksorð`,fs:`forsetning`,fn:`fornafn`,st:`samtenging`,to:`töluorð`,gr:`greinir`,uh:`upphrópun`},M={nf:`nominative`,þf:`accusative`,þgf:`dative`,ef:`genitive`},N={kk:`masculine`,kvk:`feminine`,hk:`neuter`},P={et:`singular`,ft:`plural`},F=new Set(`ísland.england.írland.skotland.finnland.grænland.holland.þýskaland.frakkland.pólland.tékkland.svissland.rússland.eistland.lettland.litháen.danmörk.noregur.svíþjóð.bandaríkin.spánn.portúgal.ítalía.grikkland.þingvellir.akureyri.ísafjörður.reykjavík.keflavík.hafnarfjörður.kópavogur.seltjarnarnes.garðabær.mosfellsbær.vestmannaeyjar.húsavík.sauðárkrókur.siglufjörður.ólafsfjörður.dalvík.egilsstaðir.neskaupstaður.seyðisfjörður.eskifjörður.reyðarfjörður.fáskrúðsfjörður.stöðvarfjörður.djúpivogur.höfn.vík.selfoss.hveragerði.þorlákshöfn.grindavík.sandgerði.borgarnes.stykkishólmur.grundarfjörður.ólafsvík.búðardalur.patreksfjörður.flateyri.suðureyri.bolungarvík.hólmavík.hvammstangi.blönduós.skagaströnd.varmahlíð.hlíðarendi.bergþórshvol.íslandsbanki.landsbankinn.arionbanki.alþingi`.split(`.`)),I=new Set(`maður.kona.stjóri.ráðherra.forseti.formaður.fulltrúi.starfsmaður.hús.staður.vegur.borg.bær.dalur.fjörður.félag.banki.sjóður.stofnun.ráð.rannsókn.greiðsla.mál.kerfi.verk.þjónusta.rekstur.viðskipti.verð.kostnaður`.split(`.`)),L=new Set([`vera`,`hafa`,`gera`,`fara`,`koma`,`segja`,`vilja`,`mega`,`þurfa`,`verða`,`geta`,`sjá`,`taka`,`eiga`,`láta`,`halda`,`leyfa`,`búa`]),R=[`s`,`u`,`a`];var z=class{lemmatizer;minPartLength;tryLinkingLetters;knownLemmas;mode;constructor(e,t,n={}){this.lemmatizer=e,this.knownLemmas=t,this.minPartLength=n.minPartLength??3,this.tryLinkingLetters=n.tryLinkingLetters??!0,this.mode=n.mode??`balanced`}noSplit(e,t){return{word:e,parts:t,indexTerms:t,confidence:0,isCompound:!1}}split(e){let t=e.toLowerCase(),n=this.lemmatizer.lemmatize(e),r=n[0]?.toLowerCase();if(r&&F.has(r)||F.has(t))return this.noSplit(e,n);let i=n.length>0&&n[0].toLowerCase()!==t,a=n.length===1;if(this.mode===`conservative`)return e.includes(`-`)?this.splitAtHyphen(e,n):this.noSplit(e,n);if(this.mode===`balanced`&&i&&a&&t.length<12||t.length<this.minPartLength*2)return this.noSplit(e,n);let o=[];for(let e=this.minPartLength;e<=t.length-this.minPartLength;e++){let n=t.slice(0,e),r=t.slice(e),i=this.trySplit(n,r);if(i&&o.push(i),this.tryLinkingLetters){for(let e of R)if(n.endsWith(e)&&n.length>this.minPartLength){let e=n.slice(0,-1),t=this.trySplit(e,r);t&&o.push({...t,score:t.score*.95})}}}if(o.length===0)return this.noSplit(e,n);o.sort((e,t)=>t.score-e.score);let s=o[0];if(this.mode===`balanced`&&i&&s.score<.6)return this.noSplit(e,n);let c=[...new Set([...s.leftParts,...s.rightParts])];return{word:e,parts:c,indexTerms:[...new Set([...c,t])],confidence:Math.min(s.score,1),isCompound:!0}}splitAtHyphen(e,t){let n=e.split(`-`).filter(e=>e.length>0);if(n.length<2)return this.noSplit(e,t);let r=[];for(let e of n){let t=this.lemmatizer.lemmatize(e);r.push(...t)}let i=[...new Set(r)];return{word:e,parts:i,indexTerms:[...new Set([...i,e.toLowerCase()])],confidence:.9,isCompound:!0}}trySplit(e,t){let n=this.lemmatizer.lemmatize(e),r=this.lemmatizer.lemmatize(t),i=[...new Set(n.filter(e=>this.knownLemmas.has(e)))],a=[...new Set(r.filter(e=>this.knownLemmas.has(e)))];if(i.length===0||a.length===0)return null;let o=0,s=1-Math.abs(e.length-t.length)/(e.length+t.length);o+=s*.2;let c=(e.length+t.length)/2,l=Math.min(c/6,1);o+=l*.2,a.some(e=>I.has(e))&&(o+=.3);let u=i.some(e=>L.has(e)),d=a.some(e=>L.has(e));return u&&d?o-=.3:!u&&!d&&(o+=.2),(e.length<4||t.length<4)&&(o-=.15),{leftParts:i,rightParts:a,score:Math.max(0,o)}}getAllLemmas(e){return this.split(e).indexTerms}};function B(e){return new Set(e.map(e=>e.toLowerCase()))}const V=new Map([[`til dæmis`,{lemma:`til dæmi`,isStopword:!0,pos:`ao`}],[`með öðrum orðum`,{lemma:`með annar orð`,isStopword:!0,pos:`ao`}],[`í raun`,{lemma:`í raun`,isStopword:!0,pos:`ao`}],[`í raun og veru`,{lemma:`í raun og vera`,isStopword:!0,pos:`ao`}],[`af og til`,{lemma:`af og til`,isStopword:!0,pos:`ao`}],[`aftur á móti`,{lemma:`aftur á mót`,isStopword:!0,pos:`ao`}],[`alla vega`,{lemma:`allur vegur`,isStopword:!0,pos:`ao`}],[`alls ekki`,{lemma:`alls ekki`,isStopword:!0,pos:`ao`}],[`alls staðar`,{lemma:`allur staður`,isStopword:!0,pos:`ao`}],[`allt í allt`,{lemma:`allur í allur`,isStopword:!0,pos:`ao`}],[`annars vegar`,{lemma:`annar vegur`,isStopword:!0,pos:`ao`}],[`auk þess`,{lemma:`auk það`,isStopword:!0,pos:`ao`}],[`að auki`,{lemma:`að auki`,isStopword:!0,pos:`ao`}],[`að vísu`,{lemma:`að vís`,isStopword:!0,pos:`ao`}],[`að sjálfsögðu`,{lemma:`að sjálfsagður`,isStopword:!0,pos:`ao`}],[`að minnsta kosti`,{lemma:`að lítill kostur`,isStopword:!0,pos:`ao`}],[`að öllu leyti`,{lemma:`að allur leyti`,isStopword:!0,pos:`ao`}],[`að nokkru leyti`,{lemma:`að nokkur leyti`,isStopword:!0,pos:`ao`}],[`ef til vill`,{lemma:`ef til vilja`,isStopword:!0,pos:`ao`}],[`einhvers staðar`,{lemma:`einhver staður`,isStopword:!0,pos:`ao`}],[`einhvern veginn`,{lemma:`einhver vegur`,isStopword:!0,pos:`ao`}],[`ekki síst`,{lemma:`ekki síður`,isStopword:!0,pos:`ao`}],[`engu að síður`,{lemma:`enginn að síður`,isStopword:!0,pos:`ao`}],[`fyrst og fremst`,{lemma:`snemma og fremri`,isStopword:!0,pos:`ao`}],[`hins vegar`,{lemma:`hinn vegur`,isStopword:!0,pos:`ao`}],[`hér og þar`,{lemma:`hér og þar`,isStopword:!0,pos:`ao`}],[`hér um bil`,{lemma:`hér um bil`,isStopword:!0,pos:`ao`}],[`hér á landi`,{lemma:`hér á land`,isStopword:!0,pos:`ao`}],[`hvað mest`,{lemma:`hvað mjög`,isStopword:!0,pos:`ao`}],[`hverju sinni`,{lemma:`hver sinn`,isStopword:!0,pos:`ao`}],[`hvorki né`,{lemma:`hvorki né`,isStopword:!0,pos:`ao`}],[`í burtu`,{lemma:`í burtu`,isStopword:!0,pos:`ao`}],[`í gær`,{lemma:`í gær`,isStopword:!0,pos:`ao`}],[`í senn`,{lemma:`í senn`,isStopword:!0,pos:`ao`}],[`í sífellu`,{lemma:`í sífella`,isStopword:!0,pos:`ao`}],[`lengi vel`,{lemma:`lengi vel`,isStopword:!0,pos:`ao`}],[`meira að segja`,{lemma:`mikill að segja`,isStopword:!0,pos:`ao`}],[`meira og minna`,{lemma:`mikill og lítill`,isStopword:!0,pos:`ao`}],[`meðal annars`,{lemma:`meðal annar`,isStopword:!0,pos:`ao`}],[`nokkurn veginn`,{lemma:`nokkur vegur`,isStopword:!0,pos:`ao`}],[`og svo framvegis`,{lemma:`og svo framvegis`,isStopword:!0,pos:`ao`}],[`satt að segja`,{lemma:`sannur að segja`,isStopword:!0,pos:`ao`}],[`sem betur fer`,{lemma:`sem vel fara`,isStopword:!0,pos:`ao`}],[`smám saman`,{lemma:`smátt saman`,isStopword:!0,pos:`ao`}],[`svo sem`,{lemma:`svo sem`,isStopword:!0,pos:`ao`}],[`sér í lagi`,{lemma:`sér í lag`,isStopword:!0,pos:`ao`}],[`til og frá`,{lemma:`til og frá`,isStopword:!0,pos:`ao`}],[`til baka`,{lemma:`til baka`,isStopword:!0,pos:`ao`}],[`vítt og breitt`,{lemma:`vítt og breitt`,isStopword:!0,pos:`ao`}],[`á ný`,{lemma:`á ný`,isStopword:!0,pos:`ao`}],[`á meðan`,{lemma:`á meðan`,isStopword:!0,pos:`ao`}],[`á sama tíma`,{lemma:`á samur tími`,isStopword:!0,pos:`ao`}],[`á hinn bóginn`,{lemma:`á hinn bógur`,isStopword:!0,pos:`ao`}],[`þar af leiðandi`,{lemma:`þar af leiða`,isStopword:!0,pos:`ao`}],[`þar að auki`,{lemma:`þar að auki`,isStopword:!0,pos:`ao`}],[`það er að segja`,{lemma:`það vera að segja`,isStopword:!0,pos:`ao`}],[`þess vegna`,{lemma:`það vegna`,isStopword:!0,pos:`ao`}],[`því miður`,{lemma:`það lítt`,isStopword:!0,pos:`ao`}],[`þrátt fyrir`,{lemma:`þrátt fyrir`,isStopword:!0,pos:`ao`}],[`á dögunum`,{lemma:`á dagur`,isStopword:!0,pos:`ao`}],[`á sínum tíma`,{lemma:`á sinn tími`,isStopword:!0,pos:`ao`}],[`á endanum`,{lemma:`á endi`,isStopword:!0,pos:`ao`}],[`einu sinni`,{lemma:`einn sinn`,isStopword:!1,pos:`ao`}],[`eitt sinn`,{lemma:`einn sinn`,isStopword:!1,pos:`ao`}],[`í fyrsta sinn`,{lemma:`í fyrstur sinn`,isStopword:!1,pos:`ao`}],[`í kvöld`,{lemma:`í kvöld`,isStopword:!1,pos:`ao`}],[`í morgun`,{lemma:`í morgunn`,isStopword:!1,pos:`ao`}],[`á morgun`,{lemma:`á morgunn`,isStopword:!1,pos:`ao`}],[`fyrir hönd`,{lemma:`fyrir hönd`,isStopword:!1,pos:`fs`}],[`með tilliti til`,{lemma:`með tillit til`,isStopword:!1,pos:`fs`}],[`í ljósi`,{lemma:`í ljós`,isStopword:!1,pos:`fs`}],[`í stað`,{lemma:`í staður`,isStopword:!1,pos:`fs`}],[`fyrir aftan`,{lemma:`fyrir aftan`,isStopword:!1,pos:`fs`}],[`fyrir austan`,{lemma:`fyrir austan`,isStopword:!1,pos:`fs`}],[`fyrir framan`,{lemma:`fyrir framan`,isStopword:!1,pos:`fs`}],[`fyrir handan`,{lemma:`fyrir handan`,isStopword:!1,pos:`fs`}],[`fyrir innan`,{lemma:`fyrir innan`,isStopword:!1,pos:`fs`}],[`fyrir neðan`,{lemma:`fyrir neðan`,isStopword:!1,pos:`fs`}],[`fyrir norðan`,{lemma:`fyrir norðan`,isStopword:!1,pos:`fs`}],[`fyrir ofan`,{lemma:`fyrir ofan`,isStopword:!1,pos:`fs`}],[`fyrir sunnan`,{lemma:`fyrir sunnan`,isStopword:!1,pos:`fs`}],[`fyrir utan`,{lemma:`fyrir utan`,isStopword:!1,pos:`fs`}],[`fyrir vestan`,{lemma:`fyrir vestan`,isStopword:!1,pos:`fs`}],[`í gegnum`,{lemma:`í gegnum`,isStopword:!1,pos:`fs`}],[`í kringum`,{lemma:`í kringum`,isStopword:!1,pos:`fs`}],[`innan við`,{lemma:`innan við`,isStopword:!1,pos:`fs`}],[`upp úr`,{lemma:`upp úr`,isStopword:!1,pos:`fs`}],[`þvert á`,{lemma:`þvert á`,isStopword:!1,pos:`fs`}],[`þar eð`,{lemma:`þar eð`,isStopword:!0,pos:`st`}],[`sameinuðu þjóðirnar`,{lemma:`Sameinuðu þjóðirnar`,isStopword:!1,pos:`entity`}],[`evrópusambandið`,{lemma:`Evrópusambandið`,isStopword:!1,pos:`entity`}],[`nato`,{lemma:`NATO`,isStopword:!1,pos:`entity`}],[`nató`,{lemma:`NATO`,isStopword:!1,pos:`entity`}]]);function H(e,t){for(let n=Math.min(4,e.length-t);n>=2;n--){let r=e.slice(t,t+n).join(` `).toLowerCase(),i=V.get(r);if(i)return{phrase:i,wordCount:n}}return null}function U(e){return V.has(e.toLowerCase())}function W(e){return V.get(e.toLowerCase())}const G=new Set([`word`]),K=new Set([`person`,`company`,`entity`]),q=new Set([`punctuation`,`s_begin`,`s_end`,`s_split`,`unknown`]);function J(t,n,r={}){let{bigrams:i,compoundSplitter:a,includeNumbers:o=!1,alwaysTryCompounds:s=!0}=r,c=e(t),l=[],u=[];for(let e=0;e<c.length;e++){let t=c[e];if(!q.has(t.kind)){if(K.has(t.kind)){l.push({original:t.text??``,kind:t.kind,lemmas:[],isEntity:!0});continue}if(t.kind===`number`||t.kind===`ordinal`){o&&l.push({original:t.text??``,kind:t.kind,lemmas:[],isEntity:!1});continue}if(G.has(t.kind)){let e=t.text??``,r=n.lemmatize(e),i={original:e,kind:t.kind,lemmas:r,isEntity:!1},o=r.length===1&&r[0]===e.toLowerCase();if(a&&(s||o)){let t=a.split(e);if(t.isCompound){i.compoundSplit=t;let e=t.parts.flatMap(e=>n.lemmatize(e));i.lemmas=[...new Set([...r,...e])]}}l.push(i),u.push({index:l.length-1,token:t});continue}l.push({original:t.text??``,kind:t.kind,lemmas:[],isEntity:!1})}}if(i&&u.length>0){let e=new O(n,i);for(let t=0;t<u.length;t++){let{index:n,token:r}=u[t],i=t>0?u[t-1].token:null,a=t<u.length-1?u[t+1].token:null,o=e.disambiguate(r.text??``,i?.text??null,a?.text??null);l[n].disambiguated=o.lemma,l[n].confidence=o.confidence}}else for(let{index:e}of u){let t=l[e];t.lemmas.length>0&&(t.disambiguated=t.lemmas[0],t.confidence=t.lemmas.length===1?1:.5)}return l}function Y(e,n,r={}){let{removeStopwords:a=!1,indexAllCandidates:o=!0,useContextualStopwords:s=!1}=r,c=J(e,n,r),l=new Set,u=(e,n)=>a?s?i(e,n):t.has(e):!1;for(let e of c)if(!e.isEntity){if(o)for(let t of e.lemmas)u(t)||l.add(t);else e.disambiguated&&(u(e.disambiguated)||l.add(e.disambiguated));if(e.compoundSplit?.isCompound)for(let t of e.compoundSplit.parts){let e=n.lemmatize(t);for(let t of e)u(t)||l.add(t)}}return l}function X(e,t,n,r={}){let i=performance.now(),a,o;switch(n){case`naive`:{let n=e.split(/\s+/).filter(e=>e.length>0),r=[];for(let e of n){let n=e.replace(/^[^\p{L}\p{N}]+|[^\p{L}\p{N}]+$/gu,``);if(n){let e=t.lemmatize(n);r.push({original:n,kind:`word`,lemmas:e,isEntity:!1,disambiguated:e[0],confidence:e.length===1?1:.5})}}a=r,o=new Set(r.map(e=>e.disambiguated).filter(Boolean));break}case`tokenized`:a=J(e,t),o=new Set(a.filter(e=>e.kind===`word`&&e.lemmas.length>0).map(e=>e.lemmas[0]));break;case`disambiguated`:a=J(e,t,{bigrams:r.bigrams}),o=Y(e,t,{bigrams:r.bigrams});break;case`full`:a=J(e,t,{bigrams:r.bigrams,compoundSplitter:r.compoundSplitter}),o=Y(e,t,{bigrams:r.bigrams,compoundSplitter:r.compoundSplitter});break}let s=performance.now()-i,c=a.filter(e=>e.kind===`word`),l=c.length,u=c.filter(e=>e.lemmas.length>0&&!(e.lemmas.length===1&&e.lemmas[0]===e.original.toLowerCase())).length,d=c.filter(e=>e.lemmas.length>1).length,f=c.filter(e=>e.confidence!==void 0).map(e=>e.confidence),p=f.length>0?f.reduce((e,t)=>e+t,0)/f.length:0,m=c.filter(e=>e.compoundSplit?.isCompound).length,h=a.filter(e=>e.isEntity).length;return{wordCount:l,lemmatizedCount:u,coverage:l>0?u/l:0,ambiguousCount:d,ambiguityRate:l>0?d/l:0,avgConfidence:p,compoundsFound:m,entitiesSkipped:h,uniqueLemmas:o.size,timeMs:s}}export{d as BinaryLemmatizer,M as CASE_NAMES,r as CONTEXTUAL_STOPWORDS,z as CompoundSplitter,f as DISAMBIGUATION_RULES,O as Disambiguator,N as GENDER_NAMES,g as NOMINATIVE_PRONOUNS,P as NUMBER_NAMES,h as PREPOSITION_CASES,F as PROTECTED_LEMMAS,V as STATIC_PHRASES,t as STOPWORDS_IS,A as WORD_CLASS_NAMES,j as WORD_CLASS_NAMES_IS,x as applyGrammarRules,b as applyNounAfterPrepositionRule,v as applyPrepositionRule,y as applyPronounVerbRule,_ as canGovernCase,B as createKnownLemmaSet,k as extractDisambiguatedLemmas,Y as extractIndexableLemmas,C as getGovernedCases,W as getPhraseInfo,p as getRulesForWord,m as hasDisambiguationRules,i as isContextualStopword,U as isKnownPhrase,S as isKnownPreposition,n as isStopword,H as matchPhrase,J as processText,a as removeStopwords,X as runBenchmark};
|
|
2
2
|
//# sourceMappingURL=index.mjs.map
|