@lokascript/semantic 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +686 -0
- package/dist/browser-ar.ar.global.js +2 -0
- package/dist/browser-core.core.global.js +2 -0
- package/dist/browser-de.de.global.js +2 -0
- package/dist/browser-east-asian.east-asian.global.js +2 -0
- package/dist/browser-en-tr.en-tr.global.js +2 -0
- package/dist/browser-en.en.global.js +2 -0
- package/dist/browser-es-en.es-en.global.js +2 -0
- package/dist/browser-es.es.global.js +2 -0
- package/dist/browser-fr.fr.global.js +2 -0
- package/dist/browser-id.id.global.js +2 -0
- package/dist/browser-ja.ja.global.js +2 -0
- package/dist/browser-ko.ko.global.js +2 -0
- package/dist/browser-lazy.lazy.global.js +2 -0
- package/dist/browser-priority.priority.global.js +2 -0
- package/dist/browser-pt.pt.global.js +2 -0
- package/dist/browser-qu.qu.global.js +2 -0
- package/dist/browser-sw.sw.global.js +2 -0
- package/dist/browser-tr.tr.global.js +2 -0
- package/dist/browser-western.western.global.js +2 -0
- package/dist/browser-zh.zh.global.js +2 -0
- package/dist/browser.global.js +3 -0
- package/dist/browser.global.js.map +1 -0
- package/dist/index.cjs +35051 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +3426 -0
- package/dist/index.d.ts +3426 -0
- package/dist/index.js +34890 -0
- package/dist/index.js.map +1 -0
- package/dist/languages/ar.d.ts +78 -0
- package/dist/languages/ar.js +1622 -0
- package/dist/languages/ar.js.map +1 -0
- package/dist/languages/de.d.ts +38 -0
- package/dist/languages/de.js +1168 -0
- package/dist/languages/de.js.map +1 -0
- package/dist/languages/en.d.ts +44 -0
- package/dist/languages/en.js +3491 -0
- package/dist/languages/en.js.map +1 -0
- package/dist/languages/es.d.ts +52 -0
- package/dist/languages/es.js +1493 -0
- package/dist/languages/es.js.map +1 -0
- package/dist/languages/fr.d.ts +37 -0
- package/dist/languages/fr.js +1159 -0
- package/dist/languages/fr.js.map +1 -0
- package/dist/languages/id.d.ts +35 -0
- package/dist/languages/id.js +1152 -0
- package/dist/languages/id.js.map +1 -0
- package/dist/languages/ja.d.ts +53 -0
- package/dist/languages/ja.js +1430 -0
- package/dist/languages/ja.js.map +1 -0
- package/dist/languages/ko.d.ts +51 -0
- package/dist/languages/ko.js +1729 -0
- package/dist/languages/ko.js.map +1 -0
- package/dist/languages/pt.d.ts +37 -0
- package/dist/languages/pt.js +1127 -0
- package/dist/languages/pt.js.map +1 -0
- package/dist/languages/qu.d.ts +36 -0
- package/dist/languages/qu.js +1143 -0
- package/dist/languages/qu.js.map +1 -0
- package/dist/languages/sw.d.ts +35 -0
- package/dist/languages/sw.js +1147 -0
- package/dist/languages/sw.js.map +1 -0
- package/dist/languages/tr.d.ts +45 -0
- package/dist/languages/tr.js +1529 -0
- package/dist/languages/tr.js.map +1 -0
- package/dist/languages/zh.d.ts +58 -0
- package/dist/languages/zh.js +1257 -0
- package/dist/languages/zh.js.map +1 -0
- package/dist/types-C4dcj53L.d.ts +600 -0
- package/package.json +202 -0
- package/src/__test-utils__/index.ts +7 -0
- package/src/__test-utils__/test-helpers.ts +8 -0
- package/src/__types__/test-helpers.ts +122 -0
- package/src/analysis/index.ts +479 -0
- package/src/ast-builder/command-mappers.ts +1133 -0
- package/src/ast-builder/expression-parser/index.ts +41 -0
- package/src/ast-builder/expression-parser/parser.ts +563 -0
- package/src/ast-builder/expression-parser/tokenizer.ts +394 -0
- package/src/ast-builder/expression-parser/types.ts +208 -0
- package/src/ast-builder/index.ts +536 -0
- package/src/ast-builder/value-converters.ts +172 -0
- package/src/bridge.ts +275 -0
- package/src/browser-ar.ts +162 -0
- package/src/browser-core.ts +231 -0
- package/src/browser-de.ts +162 -0
- package/src/browser-east-asian.ts +173 -0
- package/src/browser-en-tr.ts +165 -0
- package/src/browser-en.ts +157 -0
- package/src/browser-es-en.ts +200 -0
- package/src/browser-es.ts +170 -0
- package/src/browser-fr.ts +162 -0
- package/src/browser-id.ts +162 -0
- package/src/browser-ja.ts +162 -0
- package/src/browser-ko.ts +162 -0
- package/src/browser-lazy.ts +189 -0
- package/src/browser-priority.ts +214 -0
- package/src/browser-pt.ts +162 -0
- package/src/browser-qu.ts +162 -0
- package/src/browser-sw.ts +162 -0
- package/src/browser-tr.ts +162 -0
- package/src/browser-western.ts +181 -0
- package/src/browser-zh.ts +162 -0
- package/src/browser.ts +268 -0
- package/src/cache/index.ts +14 -0
- package/src/cache/semantic-cache.ts +344 -0
- package/src/core-bridge.ts +372 -0
- package/src/explicit/converter.ts +258 -0
- package/src/explicit/index.ts +18 -0
- package/src/explicit/parser.ts +236 -0
- package/src/explicit/renderer.ts +424 -0
- package/src/generators/command-schemas.ts +1636 -0
- package/src/generators/event-handler-generator.ts +109 -0
- package/src/generators/index.ts +117 -0
- package/src/generators/language-profiles.ts +139 -0
- package/src/generators/pattern-generator.ts +537 -0
- package/src/generators/profiles/arabic.ts +131 -0
- package/src/generators/profiles/bengali.ts +132 -0
- package/src/generators/profiles/chinese.ts +124 -0
- package/src/generators/profiles/english.ts +113 -0
- package/src/generators/profiles/french.ts +125 -0
- package/src/generators/profiles/german.ts +126 -0
- package/src/generators/profiles/hindi.ts +146 -0
- package/src/generators/profiles/index.ts +46 -0
- package/src/generators/profiles/indonesian.ts +125 -0
- package/src/generators/profiles/italian.ts +139 -0
- package/src/generators/profiles/japanese.ts +149 -0
- package/src/generators/profiles/korean.ts +127 -0
- package/src/generators/profiles/marker-templates.ts +288 -0
- package/src/generators/profiles/ms.ts +130 -0
- package/src/generators/profiles/polish.ts +249 -0
- package/src/generators/profiles/portuguese.ts +115 -0
- package/src/generators/profiles/quechua.ts +113 -0
- package/src/generators/profiles/russian.ts +260 -0
- package/src/generators/profiles/spanish.ts +130 -0
- package/src/generators/profiles/swahili.ts +129 -0
- package/src/generators/profiles/thai.ts +132 -0
- package/src/generators/profiles/tl.ts +128 -0
- package/src/generators/profiles/turkish.ts +124 -0
- package/src/generators/profiles/types.ts +165 -0
- package/src/generators/profiles/ukrainian.ts +270 -0
- package/src/generators/profiles/vietnamese.ts +133 -0
- package/src/generators/schema-error-codes.ts +160 -0
- package/src/generators/schema-validator.ts +391 -0
- package/src/index.ts +429 -0
- package/src/language-building-schema.ts +3170 -0
- package/src/language-loader.ts +394 -0
- package/src/languages/_all.ts +65 -0
- package/src/languages/ar.ts +15 -0
- package/src/languages/bn.ts +16 -0
- package/src/languages/de.ts +15 -0
- package/src/languages/en.ts +29 -0
- package/src/languages/es.ts +15 -0
- package/src/languages/fr.ts +15 -0
- package/src/languages/hi.ts +26 -0
- package/src/languages/id.ts +15 -0
- package/src/languages/index.ts +18 -0
- package/src/languages/it.ts +15 -0
- package/src/languages/ja.ts +15 -0
- package/src/languages/ko.ts +15 -0
- package/src/languages/ms.ts +16 -0
- package/src/languages/pl.ts +18 -0
- package/src/languages/pt.ts +15 -0
- package/src/languages/qu.ts +15 -0
- package/src/languages/ru.ts +26 -0
- package/src/languages/sw.ts +15 -0
- package/src/languages/th.ts +16 -0
- package/src/languages/tl.ts +16 -0
- package/src/languages/tr.ts +15 -0
- package/src/languages/uk.ts +26 -0
- package/src/languages/vi.ts +16 -0
- package/src/languages/zh.ts +15 -0
- package/src/parser/index.ts +15 -0
- package/src/parser/pattern-matcher.ts +1181 -0
- package/src/parser/semantic-parser.ts +573 -0
- package/src/parser/utils/index.ts +35 -0
- package/src/parser/utils/marker-resolution.ts +111 -0
- package/src/parser/utils/possessive-keywords.ts +43 -0
- package/src/parser/utils/role-positioning.ts +70 -0
- package/src/parser/utils/type-validation.ts +134 -0
- package/src/patterns/add/ar.ts +71 -0
- package/src/patterns/add/bn.ts +70 -0
- package/src/patterns/add/hi.ts +69 -0
- package/src/patterns/add/index.ts +87 -0
- package/src/patterns/add/it.ts +61 -0
- package/src/patterns/add/ja.ts +93 -0
- package/src/patterns/add/ko.ts +74 -0
- package/src/patterns/add/ms.ts +30 -0
- package/src/patterns/add/pl.ts +62 -0
- package/src/patterns/add/ru.ts +62 -0
- package/src/patterns/add/th.ts +49 -0
- package/src/patterns/add/tl.ts +30 -0
- package/src/patterns/add/tr.ts +71 -0
- package/src/patterns/add/uk.ts +62 -0
- package/src/patterns/add/vi.ts +61 -0
- package/src/patterns/add/zh.ts +71 -0
- package/src/patterns/builders.ts +207 -0
- package/src/patterns/decrement/bn.ts +70 -0
- package/src/patterns/decrement/de.ts +42 -0
- package/src/patterns/decrement/hi.ts +68 -0
- package/src/patterns/decrement/index.ts +79 -0
- package/src/patterns/decrement/it.ts +69 -0
- package/src/patterns/decrement/ms.ts +30 -0
- package/src/patterns/decrement/pl.ts +58 -0
- package/src/patterns/decrement/ru.ts +58 -0
- package/src/patterns/decrement/th.ts +49 -0
- package/src/patterns/decrement/tl.ts +30 -0
- package/src/patterns/decrement/tr.ts +48 -0
- package/src/patterns/decrement/uk.ts +58 -0
- package/src/patterns/decrement/vi.ts +61 -0
- package/src/patterns/decrement/zh.ts +32 -0
- package/src/patterns/en.ts +302 -0
- package/src/patterns/event-handler/ar.ts +151 -0
- package/src/patterns/event-handler/bn.ts +72 -0
- package/src/patterns/event-handler/de.ts +117 -0
- package/src/patterns/event-handler/en.ts +117 -0
- package/src/patterns/event-handler/es.ts +136 -0
- package/src/patterns/event-handler/fr.ts +117 -0
- package/src/patterns/event-handler/hi.ts +64 -0
- package/src/patterns/event-handler/id.ts +117 -0
- package/src/patterns/event-handler/index.ts +119 -0
- package/src/patterns/event-handler/it.ts +54 -0
- package/src/patterns/event-handler/ja.ts +118 -0
- package/src/patterns/event-handler/ko.ts +133 -0
- package/src/patterns/event-handler/ms.ts +30 -0
- package/src/patterns/event-handler/pl.ts +62 -0
- package/src/patterns/event-handler/pt.ts +117 -0
- package/src/patterns/event-handler/qu.ts +66 -0
- package/src/patterns/event-handler/ru.ts +62 -0
- package/src/patterns/event-handler/shared.ts +270 -0
- package/src/patterns/event-handler/sw.ts +117 -0
- package/src/patterns/event-handler/th.ts +53 -0
- package/src/patterns/event-handler/tl.ts +30 -0
- package/src/patterns/event-handler/tr.ts +170 -0
- package/src/patterns/event-handler/uk.ts +62 -0
- package/src/patterns/event-handler/vi.ts +61 -0
- package/src/patterns/event-handler/zh.ts +150 -0
- package/src/patterns/get/ar.ts +49 -0
- package/src/patterns/get/bn.ts +47 -0
- package/src/patterns/get/de.ts +32 -0
- package/src/patterns/get/hi.ts +52 -0
- package/src/patterns/get/index.ts +83 -0
- package/src/patterns/get/it.ts +56 -0
- package/src/patterns/get/ja.ts +53 -0
- package/src/patterns/get/ko.ts +53 -0
- package/src/patterns/get/ms.ts +30 -0
- package/src/patterns/get/pl.ts +57 -0
- package/src/patterns/get/ru.ts +57 -0
- package/src/patterns/get/th.ts +29 -0
- package/src/patterns/get/tl.ts +30 -0
- package/src/patterns/get/uk.ts +57 -0
- package/src/patterns/get/vi.ts +48 -0
- package/src/patterns/grammar-transformed/index.ts +39 -0
- package/src/patterns/grammar-transformed/ja.ts +1713 -0
- package/src/patterns/grammar-transformed/ko.ts +1311 -0
- package/src/patterns/grammar-transformed/tr.ts +1067 -0
- package/src/patterns/hide/ar.ts +67 -0
- package/src/patterns/hide/bn.ts +47 -0
- package/src/patterns/hide/de.ts +36 -0
- package/src/patterns/hide/hi.ts +61 -0
- package/src/patterns/hide/index.ts +91 -0
- package/src/patterns/hide/it.ts +56 -0
- package/src/patterns/hide/ja.ts +69 -0
- package/src/patterns/hide/ko.ts +69 -0
- package/src/patterns/hide/ms.ts +30 -0
- package/src/patterns/hide/pl.ts +57 -0
- package/src/patterns/hide/ru.ts +57 -0
- package/src/patterns/hide/th.ts +29 -0
- package/src/patterns/hide/tl.ts +30 -0
- package/src/patterns/hide/tr.ts +65 -0
- package/src/patterns/hide/uk.ts +57 -0
- package/src/patterns/hide/vi.ts +56 -0
- package/src/patterns/hide/zh.ts +68 -0
- package/src/patterns/increment/bn.ts +70 -0
- package/src/patterns/increment/de.ts +36 -0
- package/src/patterns/increment/hi.ts +68 -0
- package/src/patterns/increment/index.ts +79 -0
- package/src/patterns/increment/it.ts +69 -0
- package/src/patterns/increment/ms.ts +30 -0
- package/src/patterns/increment/pl.ts +58 -0
- package/src/patterns/increment/ru.ts +58 -0
- package/src/patterns/increment/th.ts +49 -0
- package/src/patterns/increment/tl.ts +30 -0
- package/src/patterns/increment/tr.ts +52 -0
- package/src/patterns/increment/uk.ts +58 -0
- package/src/patterns/increment/vi.ts +61 -0
- package/src/patterns/increment/zh.ts +32 -0
- package/src/patterns/index.ts +84 -0
- package/src/patterns/languages/en/control-flow.ts +93 -0
- package/src/patterns/languages/en/fetch.ts +62 -0
- package/src/patterns/languages/en/index.ts +42 -0
- package/src/patterns/languages/en/repeat.ts +67 -0
- package/src/patterns/languages/en/set.ts +48 -0
- package/src/patterns/languages/en/swap.ts +38 -0
- package/src/patterns/languages/en/temporal.ts +57 -0
- package/src/patterns/put/ar.ts +74 -0
- package/src/patterns/put/bn.ts +53 -0
- package/src/patterns/put/en.ts +74 -0
- package/src/patterns/put/es.ts +74 -0
- package/src/patterns/put/hi.ts +69 -0
- package/src/patterns/put/id.ts +96 -0
- package/src/patterns/put/index.ts +99 -0
- package/src/patterns/put/it.ts +56 -0
- package/src/patterns/put/ja.ts +75 -0
- package/src/patterns/put/ko.ts +67 -0
- package/src/patterns/put/ms.ts +30 -0
- package/src/patterns/put/pl.ts +81 -0
- package/src/patterns/put/ru.ts +85 -0
- package/src/patterns/put/th.ts +32 -0
- package/src/patterns/put/tl.ts +30 -0
- package/src/patterns/put/tr.ts +67 -0
- package/src/patterns/put/uk.ts +85 -0
- package/src/patterns/put/vi.ts +72 -0
- package/src/patterns/put/zh.ts +62 -0
- package/src/patterns/registry.ts +163 -0
- package/src/patterns/remove/ar.ts +71 -0
- package/src/patterns/remove/bn.ts +68 -0
- package/src/patterns/remove/hi.ts +69 -0
- package/src/patterns/remove/index.ts +87 -0
- package/src/patterns/remove/it.ts +69 -0
- package/src/patterns/remove/ja.ts +74 -0
- package/src/patterns/remove/ko.ts +78 -0
- package/src/patterns/remove/ms.ts +30 -0
- package/src/patterns/remove/pl.ts +62 -0
- package/src/patterns/remove/ru.ts +62 -0
- package/src/patterns/remove/th.ts +49 -0
- package/src/patterns/remove/tl.ts +30 -0
- package/src/patterns/remove/tr.ts +78 -0
- package/src/patterns/remove/uk.ts +62 -0
- package/src/patterns/remove/vi.ts +61 -0
- package/src/patterns/remove/zh.ts +72 -0
- package/src/patterns/set/ar.ts +84 -0
- package/src/patterns/set/bn.ts +53 -0
- package/src/patterns/set/de.ts +84 -0
- package/src/patterns/set/es.ts +92 -0
- package/src/patterns/set/fr.ts +88 -0
- package/src/patterns/set/hi.ts +56 -0
- package/src/patterns/set/id.ts +84 -0
- package/src/patterns/set/index.ts +107 -0
- package/src/patterns/set/it.ts +56 -0
- package/src/patterns/set/ja.ts +86 -0
- package/src/patterns/set/ko.ts +85 -0
- package/src/patterns/set/ms.ts +30 -0
- package/src/patterns/set/pl.ts +57 -0
- package/src/patterns/set/pt.ts +84 -0
- package/src/patterns/set/ru.ts +57 -0
- package/src/patterns/set/th.ts +31 -0
- package/src/patterns/set/tl.ts +30 -0
- package/src/patterns/set/tr.ts +107 -0
- package/src/patterns/set/uk.ts +57 -0
- package/src/patterns/set/vi.ts +53 -0
- package/src/patterns/set/zh.ts +84 -0
- package/src/patterns/show/ar.ts +67 -0
- package/src/patterns/show/bn.ts +47 -0
- package/src/patterns/show/de.ts +32 -0
- package/src/patterns/show/fr.ts +32 -0
- package/src/patterns/show/hi.ts +61 -0
- package/src/patterns/show/index.ts +95 -0
- package/src/patterns/show/it.ts +56 -0
- package/src/patterns/show/ja.ts +69 -0
- package/src/patterns/show/ko.ts +73 -0
- package/src/patterns/show/ms.ts +30 -0
- package/src/patterns/show/pl.ts +57 -0
- package/src/patterns/show/ru.ts +57 -0
- package/src/patterns/show/th.ts +29 -0
- package/src/patterns/show/tl.ts +30 -0
- package/src/patterns/show/tr.ts +65 -0
- package/src/patterns/show/uk.ts +57 -0
- package/src/patterns/show/vi.ts +56 -0
- package/src/patterns/show/zh.ts +68 -0
- package/src/patterns/take/ar.ts +51 -0
- package/src/patterns/take/index.ts +31 -0
- package/src/patterns/toggle/ar.ts +61 -0
- package/src/patterns/toggle/bn.ts +70 -0
- package/src/patterns/toggle/en.ts +61 -0
- package/src/patterns/toggle/es.ts +61 -0
- package/src/patterns/toggle/hi.ts +80 -0
- package/src/patterns/toggle/index.ts +95 -0
- package/src/patterns/toggle/it.ts +69 -0
- package/src/patterns/toggle/ja.ts +156 -0
- package/src/patterns/toggle/ko.ts +113 -0
- package/src/patterns/toggle/ms.ts +30 -0
- package/src/patterns/toggle/pl.ts +62 -0
- package/src/patterns/toggle/ru.ts +62 -0
- package/src/patterns/toggle/th.ts +50 -0
- package/src/patterns/toggle/tl.ts +30 -0
- package/src/patterns/toggle/tr.ts +88 -0
- package/src/patterns/toggle/uk.ts +62 -0
- package/src/patterns/toggle/vi.ts +61 -0
- package/src/patterns/toggle/zh.ts +99 -0
- package/src/public-api.ts +286 -0
- package/src/registry.ts +441 -0
- package/src/tokenizers/arabic.ts +723 -0
- package/src/tokenizers/base.ts +1300 -0
- package/src/tokenizers/bengali.ts +289 -0
- package/src/tokenizers/chinese.ts +481 -0
- package/src/tokenizers/english.ts +416 -0
- package/src/tokenizers/french.ts +326 -0
- package/src/tokenizers/german.ts +324 -0
- package/src/tokenizers/hindi.ts +319 -0
- package/src/tokenizers/index.ts +127 -0
- package/src/tokenizers/indonesian.ts +306 -0
- package/src/tokenizers/italian.ts +458 -0
- package/src/tokenizers/japanese.ts +447 -0
- package/src/tokenizers/korean.ts +642 -0
- package/src/tokenizers/morphology/arabic-normalizer.ts +242 -0
- package/src/tokenizers/morphology/french-normalizer.ts +268 -0
- package/src/tokenizers/morphology/german-normalizer.ts +256 -0
- package/src/tokenizers/morphology/index.ts +46 -0
- package/src/tokenizers/morphology/italian-normalizer.ts +329 -0
- package/src/tokenizers/morphology/japanese-normalizer.ts +288 -0
- package/src/tokenizers/morphology/korean-normalizer.ts +428 -0
- package/src/tokenizers/morphology/polish-normalizer.ts +264 -0
- package/src/tokenizers/morphology/portuguese-normalizer.ts +310 -0
- package/src/tokenizers/morphology/spanish-normalizer.ts +327 -0
- package/src/tokenizers/morphology/turkish-normalizer.ts +412 -0
- package/src/tokenizers/morphology/types.ts +211 -0
- package/src/tokenizers/ms.ts +198 -0
- package/src/tokenizers/polish.ts +354 -0
- package/src/tokenizers/portuguese.ts +304 -0
- package/src/tokenizers/quechua.ts +339 -0
- package/src/tokenizers/russian.ts +375 -0
- package/src/tokenizers/spanish.ts +403 -0
- package/src/tokenizers/swahili.ts +303 -0
- package/src/tokenizers/thai.ts +236 -0
- package/src/tokenizers/tl.ts +198 -0
- package/src/tokenizers/turkish.ts +411 -0
- package/src/tokenizers/ukrainian.ts +369 -0
- package/src/tokenizers/vietnamese.ts +410 -0
- package/src/types/grammar-types.ts +617 -0
- package/src/types/unified-profile.ts +267 -0
- package/src/types.ts +709 -0
- package/src/utils/confidence-calculator.ts +147 -0
- package/src/validators/command-validator.ts +380 -0
- package/src/validators/index.ts +15 -0
|
@@ -0,0 +1,256 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* German Morphological Normalizer
|
|
3
|
+
*
|
|
4
|
+
* Reduces German verb conjugations to their infinitive forms.
|
|
5
|
+
* German verbs have:
|
|
6
|
+
* - Weak verbs (regular): machen → machte (past)
|
|
7
|
+
* - Strong verbs (stem changes): fahren → fuhr (past)
|
|
8
|
+
* - Mixed verbs: kennen → kannte
|
|
9
|
+
* - Separable prefixes: an-, auf-, aus-, ein-, mit-, vor-, zu-
|
|
10
|
+
*
|
|
11
|
+
* Key features:
|
|
12
|
+
* - Handles common conjugation endings
|
|
13
|
+
* - Recognizes past participle ge- prefix
|
|
14
|
+
* - Handles separable prefix verbs
|
|
15
|
+
*
|
|
16
|
+
* Examples:
|
|
17
|
+
* zeigt → zeigen (3rd person present)
|
|
18
|
+
* gemacht → machen (past participle)
|
|
19
|
+
* anzeigen → anzeigen (separable prefix verb)
|
|
20
|
+
*/
|
|
21
|
+
|
|
22
|
+
import type { MorphologicalNormalizer, NormalizationResult, ConjugationType } from './types';
|
|
23
|
+
import { noChange, normalized } from './types';
|
|
24
|
+
|
|
25
|
+
/**
 * Separable verb prefixes (an-, auf-, aus-, ...) known to this normalizer.
 *
 * Element order is significant to any consumer that stops at the first
 * matching prefix, so new entries should be appended rather than inserted.
 */
const SEPARABLE_PREFIXES = [
  'an', 'auf', 'aus', 'ein', 'mit', 'vor', 'zu',
  'ab', 'bei', 'nach', 'weg', 'um', 'her', 'hin',
];
|
|
44
|
+
|
|
45
|
+
/**
 * Cheap surface heuristic: does this word have the shape of a German verb form?
 *
 * A word qualifies when (case-insensitively) it
 *  - ends in an infinitive ending (-en, -eln, -ern), or
 *  - starts with "ge" and ends in -t or -en (past-participle shape), or
 *  - contains one of the letters ä, ö, ü, ß.
 *
 * @param word - Token to inspect.
 * @returns `true` when any of the checks above matches.
 */
function looksLikeGermanVerb(word: string): boolean {
  const candidate = word.toLowerCase();

  const hasInfinitiveEnding = ['en', 'eln', 'ern'].some((suffix) => candidate.endsWith(suffix));
  if (hasInfinitiveEnding) return true;

  const hasParticipleShape =
    candidate.startsWith('ge') && (candidate.endsWith('t') || candidate.endsWith('en'));

  // The umlaut/eszett test runs against the caller's original string.
  return hasParticipleShape || /[äöüß]/i.test(word);
}
|
|
59
|
+
|
|
60
|
+
/**
 * Suffix rules for regular (weak) verbs whose infinitive ends in -en.
 *
 * Each rule says: when a word ends in `ending`, replace that ending with
 * `stem` to obtain an infinitive candidate, reported with the given
 * `confidence` and conjugation `type`.
 *
 * NOTE(review): several endings are listed more than once (past vs.
 * subjunctive, present vs. imperative). A lookup that returns on the first
 * matching `ending` can only ever report the earliest entry for that ending.
 */
const VERB_ENDINGS: ReadonlyArray<{
  ending: string;
  stem: string;
  confidence: number;
  type: ConjugationType;
}> = [
  // --- Present participle ---
  { ending: 'end', stem: 'en', confidence: 0.88, type: 'gerund' },

  // --- Present indicative ---
  { ending: 'e', stem: 'en', confidence: 0.75, type: 'present' }, // ich
  { ending: 'st', stem: 'en', confidence: 0.8, type: 'present' }, // du
  { ending: 't', stem: 'en', confidence: 0.78, type: 'present' }, // er/sie/es, ihr
  { ending: 'en', stem: 'en', confidence: 0.85, type: 'dictionary' }, // wir/sie/Sie and the infinitive itself

  // --- Simple past: -te, -test, -te, -ten, -tet, -ten ---
  { ending: 'test', stem: 'en', confidence: 0.85, type: 'past' }, // du
  { ending: 'ten', stem: 'en', confidence: 0.82, type: 'past' }, // wir/sie/Sie
  { ending: 'tet', stem: 'en', confidence: 0.85, type: 'past' }, // ihr
  { ending: 'te', stem: 'en', confidence: 0.82, type: 'past' }, // ich/er/sie/es

  // --- Subjunctive II (same surface endings as the simple past) ---
  { ending: 'test', stem: 'en', confidence: 0.8, type: 'subjunctive' },
  { ending: 'ten', stem: 'en', confidence: 0.78, type: 'subjunctive' },
  { ending: 'tet', stem: 'en', confidence: 0.8, type: 'subjunctive' },
  { ending: 'te', stem: 'en', confidence: 0.78, type: 'subjunctive' },

  // --- Imperative ---
  { ending: 'e', stem: 'en', confidence: 0.72, type: 'imperative' }, // du (informal singular)
  { ending: 't', stem: 'en', confidence: 0.72, type: 'imperative' }, // ihr (informal plural)
  { ending: 'en', stem: 'en', confidence: 0.75, type: 'imperative' }, // Sie (formal)
];
|
|
96
|
+
|
|
97
|
+
/**
 * Suffix rules for verbs whose infinitive ends in -eln or -ern
 * (e.g. sammeln, wandern).
 *
 * Same rule shape as the -en table: strip `ending`, append `stem`.
 */
const ELN_ERN_ENDINGS: ReadonlyArray<{
  ending: string;
  stem: string;
  confidence: number;
  type: ConjugationType;
}> = [
  // --- -eln verbs, present tense ---
  { ending: 'le', stem: 'eln', confidence: 0.82, type: 'present' }, // ich sammle
  { ending: 'elst', stem: 'eln', confidence: 0.85, type: 'present' }, // du sammelst
  { ending: 'elt', stem: 'eln', confidence: 0.85, type: 'present' }, // er/sie/es sammelt
  { ending: 'eln', stem: 'eln', confidence: 0.88, type: 'dictionary' }, // infinitive

  // --- -ern verbs, present tense ---
  { ending: 're', stem: 'ern', confidence: 0.82, type: 'present' }, // ich wandre
  { ending: 'erst', stem: 'ern', confidence: 0.85, type: 'present' }, // du wanderst
  { ending: 'ert', stem: 'ern', confidence: 0.85, type: 'present' }, // er/sie/es wandert
  { ending: 'ern', stem: 'ern', confidence: 0.88, type: 'dictionary' }, // infinitive
];
|
|
117
|
+
|
|
118
|
+
/**
 * Every suffix rule from both tables in one list, ordered longest ending
 * first so that a longer ending (e.g. "test") is tried before any shorter
 * ending it contains (e.g. "st", "t").
 *
 * The source tables are not modified: `concat` produces a fresh array that is
 * then sorted in place.
 */
const ALL_ENDINGS = VERB_ENDINGS.concat(ELN_ERN_ENDINGS).sort(
  (left, right) => right.ending.length - left.ending.length
);
|
|
124
|
+
|
|
125
|
+
/**
 * German morphological normalizer.
 *
 * Heuristically maps an inflected German verb form to an infinitive candidate
 * by stripping the participle "ge-" prefix and/or a conjugation ending and
 * appending the infinitive ending. The rules are purely suffix/prefix based;
 * forms whose stem vowel changes (e.g. "fuhr", "gegangen") are not mapped back
 * to their infinitive.
 */
export class GermanMorphologicalNormalizer implements MorphologicalNormalizer {
  readonly language = 'de';

  /**
   * Check if a word might be a German verb that can be normalized.
   *
   * @param word - Candidate token.
   * @returns `false` for words shorter than three characters, otherwise the
   *   verdict of the `looksLikeGermanVerb` heuristic.
   */
  isNormalizable(word: string): boolean {
    if (word.length < 3) return false;
    return looksLikeGermanVerb(word);
  }

  /**
   * Normalize a German word to its infinitive form.
   *
   * Order of attempts: infinitive-shaped words are returned unchanged, then
   * past-participle rules are tried, then the conjugation-ending table.
   *
   * @param word - Token to normalize. Matching is done on the lowercased form.
   * @returns A `normalized(...)` result built from the lowercased word when a
   *   rule applies, otherwise `noChange(word)` with the caller's original
   *   casing.
   */
  normalize(word: string): NormalizationResult {
    const lower = word.toLowerCase();

    // Already shaped like an infinitive: leave untouched.
    // NOTE(review): this guard also catches every other form ending in -en
    // (e.g. "machten", or ge-...-en participles such as "gegeben"), so those
    // are returned unchanged and never reach the participle rules below.
    if (lower.endsWith('en') && lower.length >= 4) {
      return noChange(word);
    }
    if ((lower.endsWith('eln') || lower.endsWith('ern')) && lower.length >= 5) {
      return noChange(word);
    }

    // Try past participle normalization (ge-...-t or ge-...-en)
    const participleResult = this.tryParticipleNormalization(lower);
    if (participleResult) return participleResult;

    // Try standard conjugation normalization
    const conjugationResult = this.tryConjugationNormalization(lower);
    if (conjugationResult) return conjugationResult;

    // No rule matched.
    return noChange(word);
  }

  /**
   * Join a stem and an infinitive ending without doubling the linking "e".
   *
   * Weak verbs whose stem ends in -t/-d (and similar) insert an "e" before
   * the personal ending ("arbeitet", "redet", "geöffnet"). Stripping only the
   * final "t" leaves that "e" on the stem, and blindly appending "en" would
   * yield "arbeiteen" / "redeen" / "öffneen". When the stem already ends in
   * "e" and the ending starts with "e", the two are merged into one.
   */
  private joinStem(stem: string, ending: string): string {
    if (stem.endsWith('e') && ending.startsWith('e')) {
      return stem + ending.slice(1);
    }
    return stem + ending;
  }

  /**
   * Try to normalize a past participle.
   * German past participles often have ge- prefix and -t or -en suffix.
   *
   * Examples:
   *   gemacht → machen (weak verb)
   *   gearbeitet → arbeiten (weak verb with linking "e")
   *   angemacht → anmachen (separable prefix)
   *
   * @param word - Lowercased word.
   * @returns The normalization result, or `null` when the word does not have
   *   a supported participle shape.
   */
  private tryParticipleNormalization(word: string): NormalizationResult | null {
    // Check for separable prefix verbs first (e.g., "angemacht" → "anmachen")
    for (const prefix of SEPARABLE_PREFIXES) {
      if (word.startsWith(prefix + 'ge')) {
        const afterPrefix = word.slice(prefix.length);
        const innerResult = this.trySimpleParticipleNormalization(afterPrefix);
        if (innerResult) {
          // removedSuffixes is only set when the inner result carries one, so
          // the key is absent (rather than undefined) otherwise.
          const metadata: {
            removedPrefixes: string[];
            removedSuffixes?: readonly string[];
            conjugationType: 'participle';
          } = {
            removedPrefixes: ['ge'],
            conjugationType: 'participle',
          };
          if (innerResult.metadata?.removedSuffixes) {
            metadata.removedSuffixes = innerResult.metadata.removedSuffixes;
          }
          // Re-attach the separable prefix; confidence is scaled down by 5%
          // relative to the plain participle match.
          return normalized(prefix + innerResult.stem, innerResult.confidence * 0.95, metadata);
        }
      }
    }

    // Try simple ge- prefix participle
    return this.trySimpleParticipleNormalization(word);
  }

  /**
   * Try to normalize a simple ge-...-t or ge-...-en participle.
   *
   * @param word - Lowercased word, expected to start with "ge".
   * @returns The normalization result, or `null` when the word does not start
   *   with "ge" or the remainder is too short / has no supported ending.
   */
  private trySimpleParticipleNormalization(word: string): NormalizationResult | null {
    if (!word.startsWith('ge')) return null;

    const withoutGe = word.slice(2);

    // Weak verb participle: ge-...-t → ...-en
    if (withoutGe.endsWith('t') && withoutGe.length >= 3) {
      const stem = withoutGe.slice(0, -1);
      return normalized(this.joinStem(stem, 'en'), 0.85, {
        removedPrefixes: ['ge'],
        removedSuffixes: ['t'],
        conjugationType: 'participle',
      });
    }

    // Strong verb participle: ge-...-en → ...-en (same ending, stem kept as is)
    if (withoutGe.endsWith('en') && withoutGe.length >= 4) {
      return normalized(withoutGe, 0.82, {
        removedPrefixes: ['ge'],
        conjugationType: 'participle',
      });
    }

    return null;
  }

  /**
   * Try to normalize a conjugated verb to its infinitive.
   *
   * Walks `ALL_ENDINGS` in order and applies the first rule whose ending
   * matches and leaves a stem of at least two characters.
   *
   * @param word - Lowercased word.
   * @returns The normalization result, or `null` when no rule applies.
   */
  private tryConjugationNormalization(word: string): NormalizationResult | null {
    for (const rule of ALL_ENDINGS) {
      if (word.endsWith(rule.ending)) {
        const stemBase = word.slice(0, -rule.ending.length);

        // Must have a meaningful stem (at least 2 characters)
        if (stemBase.length < 2) continue;

        // Reconstruct infinitive, merging a linking "e" left on the stem
        // ("rede" + "en" → "reden", not "redeen").
        const infinitive = this.joinStem(stemBase, rule.stem);

        return normalized(infinitive, rule.confidence, {
          removedSuffixes: [rule.ending],
          conjugationType: rule.type,
        });
      }
    }

    return null;
  }
}
|
|
254
|
+
|
|
255
|
+
/** Shared, ready-to-use instance of the German normalizer. */
export const germanMorphologicalNormalizer: GermanMorphologicalNormalizer =
  new GermanMorphologicalNormalizer();
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Morphological Normalizers
|
|
3
|
+
*
|
|
4
|
+
* Re-exports the shared normalizer types and helpers plus the Japanese, Korean, Spanish, Arabic, Turkish, Portuguese, French, and German normalizers.
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
// Types
|
|
8
|
+
export type {
|
|
9
|
+
NormalizationResult,
|
|
10
|
+
NormalizationMetadata,
|
|
11
|
+
ConjugationType,
|
|
12
|
+
MorphologicalNormalizer,
|
|
13
|
+
SuffixRule,
|
|
14
|
+
PrefixRule,
|
|
15
|
+
} from './types';
|
|
16
|
+
|
|
17
|
+
export { noChange, normalized } from './types';
|
|
18
|
+
|
|
19
|
+
// Language-specific normalizers
|
|
20
|
+
export {
|
|
21
|
+
JapaneseMorphologicalNormalizer,
|
|
22
|
+
japaneseMorphologicalNormalizer,
|
|
23
|
+
} from './japanese-normalizer';
|
|
24
|
+
|
|
25
|
+
export { KoreanMorphologicalNormalizer, koreanMorphologicalNormalizer } from './korean-normalizer';
|
|
26
|
+
|
|
27
|
+
export {
|
|
28
|
+
SpanishMorphologicalNormalizer,
|
|
29
|
+
spanishMorphologicalNormalizer,
|
|
30
|
+
} from './spanish-normalizer';
|
|
31
|
+
|
|
32
|
+
export { ArabicMorphologicalNormalizer, arabicMorphologicalNormalizer } from './arabic-normalizer';
|
|
33
|
+
|
|
34
|
+
export {
|
|
35
|
+
TurkishMorphologicalNormalizer,
|
|
36
|
+
turkishMorphologicalNormalizer,
|
|
37
|
+
} from './turkish-normalizer';
|
|
38
|
+
|
|
39
|
+
export {
|
|
40
|
+
PortugueseMorphologicalNormalizer,
|
|
41
|
+
portugueseMorphologicalNormalizer,
|
|
42
|
+
} from './portuguese-normalizer';
|
|
43
|
+
|
|
44
|
+
export { FrenchMorphologicalNormalizer, frenchMorphologicalNormalizer } from './french-normalizer';
|
|
45
|
+
|
|
46
|
+
export { GermanMorphologicalNormalizer, germanMorphologicalNormalizer } from './german-normalizer';
|
|
@@ -0,0 +1,329 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Italian Morphological Normalizer
|
|
3
|
+
*
|
|
4
|
+
* Reduces Italian verb conjugations to their infinitive forms.
|
|
5
|
+
* Italian has three verb conjugation classes (-are, -ere, -ire) and
|
|
6
|
+
* supports reflexive verbs (verbs with -si suffix).
|
|
7
|
+
*
|
|
8
|
+
* Key features:
|
|
9
|
+
* - Reflexive verb handling: mostrarsi → mostrare, nascondersi → nascondere
|
|
10
|
+
* - Regular conjugation patterns for -are, -ere, -ire verbs
|
|
11
|
+
* - Irregular verbs are not special-cased; only the regular suffix rules below are applied
|
|
12
|
+
*
|
|
13
|
+
* Examples:
|
|
14
|
+
* mostrarsi → mostrare (reflexive infinitive)
|
|
15
|
+
* alternando → alternare (gerund)
|
|
16
|
+
* mostrato → mostrare (past participle)
|
|
17
|
+
* mostra → mostrare (3rd person present)
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
import type { MorphologicalNormalizer, NormalizationResult, ConjugationType } from './types';
|
|
21
|
+
import { noChange, normalized } from './types';
|
|
22
|
+
|
|
23
|
+
/**
|
|
24
|
+
* Check if a character is an Italian-specific letter (accented characters).
|
|
25
|
+
*/
|
|
26
|
+
function isItalianSpecificLetter(char: string): boolean {
  // Accented vowels (grave, acute and circumflex forms), in both cases.
  const accentedVowels = /[àèéìíîòóùúÀÈÉÌÍÎÒÓÙÚ]/;
  // `search` yields -1 when nothing in the input matches the character class.
  return char.search(accentedVowels) !== -1;
}
|
|
29
|
+
|
|
30
|
+
/**
|
|
31
|
+
* Check if a word looks like an Italian verb.
|
|
32
|
+
* Italian verbs end in -are, -ere, or -ire, or have Italian-specific characters.
|
|
33
|
+
*/
|
|
34
|
+
function looksLikeItalianVerb(word: string): boolean {
  const candidate = word.toLowerCase();

  // Suffixes that hint at a verb form: infinitives, gerunds,
  // masculine-singular past participles and reflexive infinitives.
  const verbalSuffixes = [
    'are',
    'ere',
    'ire',
    'ando',
    'endo',
    'ato',
    'uto',
    'ito',
    'arsi',
    'ersi',
    'irsi',
  ];
  if (verbalSuffixes.some(suffix => candidate.endsWith(suffix))) {
    return true;
  }

  // Fallback: any accented Italian letter anywhere in the original word.
  return Array.from(word).some(letter => isItalianSpecificLetter(letter));
}
|
|
49
|
+
|
|
50
|
+
/**
|
|
51
|
+
* Reflexive pronoun patterns that can be attached to verbs.
|
|
52
|
+
*/
|
|
53
|
+
const REFLEXIVE_SUFFIXES = ['si', 'mi', 'ti', 'ci', 'vi'];
|
|
54
|
+
|
|
55
|
+
/**
|
|
56
|
+
* -ARE verb conjugation endings mapped to infinitive reconstruction.
|
|
57
|
+
*/
|
|
58
|
+
const ARE_ENDINGS: readonly {
|
|
59
|
+
ending: string;
|
|
60
|
+
stem: string;
|
|
61
|
+
confidence: number;
|
|
62
|
+
type: ConjugationType;
|
|
63
|
+
}[] = [
|
|
64
|
+
// Gerund (-ando)
|
|
65
|
+
{ ending: 'ando', stem: 'are', confidence: 0.88, type: 'gerund' },
|
|
66
|
+
// Past participle (-ato)
|
|
67
|
+
{ ending: 'ato', stem: 'are', confidence: 0.88, type: 'participle' },
|
|
68
|
+
{ ending: 'ata', stem: 'are', confidence: 0.88, type: 'participle' },
|
|
69
|
+
{ ending: 'ati', stem: 'are', confidence: 0.88, type: 'participle' },
|
|
70
|
+
{ ending: 'ate', stem: 'are', confidence: 0.88, type: 'participle' },
|
|
71
|
+
// Present indicative
|
|
72
|
+
{ ending: 'o', stem: 'are', confidence: 0.75, type: 'present' }, // io
|
|
73
|
+
{ ending: 'i', stem: 'are', confidence: 0.72, type: 'present' }, // tu
|
|
74
|
+
{ ending: 'a', stem: 'are', confidence: 0.75, type: 'present' }, // lui/lei
|
|
75
|
+
{ ending: 'iamo', stem: 'are', confidence: 0.85, type: 'present' }, // noi
|
|
76
|
+
{ ending: 'ate', stem: 'are', confidence: 0.85, type: 'present' }, // voi
|
|
77
|
+
{ ending: 'ano', stem: 'are', confidence: 0.85, type: 'present' }, // loro
|
|
78
|
+
// Imperfect
|
|
79
|
+
{ ending: 'avo', stem: 'are', confidence: 0.88, type: 'past' }, // io
|
|
80
|
+
{ ending: 'avi', stem: 'are', confidence: 0.88, type: 'past' }, // tu
|
|
81
|
+
{ ending: 'ava', stem: 'are', confidence: 0.88, type: 'past' }, // lui/lei
|
|
82
|
+
{ ending: 'avamo', stem: 'are', confidence: 0.88, type: 'past' }, // noi
|
|
83
|
+
{ ending: 'avate', stem: 'are', confidence: 0.88, type: 'past' }, // voi
|
|
84
|
+
{ ending: 'avano', stem: 'are', confidence: 0.88, type: 'past' }, // loro
|
|
85
|
+
// Preterite (passato remoto)
|
|
86
|
+
{ ending: 'ai', stem: 'are', confidence: 0.85, type: 'past' }, // io
|
|
87
|
+
{ ending: 'asti', stem: 'are', confidence: 0.88, type: 'past' }, // tu
|
|
88
|
+
{ ending: 'ò', stem: 'are', confidence: 0.85, type: 'past' }, // lui/lei
|
|
89
|
+
{ ending: 'ammo', stem: 'are', confidence: 0.88, type: 'past' }, // noi
|
|
90
|
+
{ ending: 'aste', stem: 'are', confidence: 0.88, type: 'past' }, // voi
|
|
91
|
+
{ ending: 'arono', stem: 'are', confidence: 0.88, type: 'past' }, // loro
|
|
92
|
+
// Subjunctive present
|
|
93
|
+
{ ending: 'i', stem: 'are', confidence: 0.72, type: 'subjunctive' }, // io/tu/lui (ambiguous)
|
|
94
|
+
{ ending: 'ino', stem: 'are', confidence: 0.82, type: 'subjunctive' }, // loro
|
|
95
|
+
// Imperative
|
|
96
|
+
{ ending: 'a', stem: 'are', confidence: 0.75, type: 'imperative' }, // tu
|
|
97
|
+
// Infinitive
|
|
98
|
+
{ ending: 'are', stem: 'are', confidence: 0.92, type: 'dictionary' },
|
|
99
|
+
];
|
|
100
|
+
|
|
101
|
+
/**
|
|
102
|
+
* -ERE verb conjugation endings.
|
|
103
|
+
*/
|
|
104
|
+
const ERE_ENDINGS: readonly {
|
|
105
|
+
ending: string;
|
|
106
|
+
stem: string;
|
|
107
|
+
confidence: number;
|
|
108
|
+
type: ConjugationType;
|
|
109
|
+
}[] = [
|
|
110
|
+
// Gerund (-endo)
|
|
111
|
+
{ ending: 'endo', stem: 'ere', confidence: 0.88, type: 'gerund' },
|
|
112
|
+
// Past participle (-uto)
|
|
113
|
+
{ ending: 'uto', stem: 'ere', confidence: 0.85, type: 'participle' },
|
|
114
|
+
{ ending: 'uta', stem: 'ere', confidence: 0.85, type: 'participle' },
|
|
115
|
+
{ ending: 'uti', stem: 'ere', confidence: 0.85, type: 'participle' },
|
|
116
|
+
{ ending: 'ute', stem: 'ere', confidence: 0.85, type: 'participle' },
|
|
117
|
+
// Present indicative
|
|
118
|
+
{ ending: 'o', stem: 'ere', confidence: 0.72, type: 'present' }, // io
|
|
119
|
+
{ ending: 'i', stem: 'ere', confidence: 0.72, type: 'present' }, // tu
|
|
120
|
+
{ ending: 'e', stem: 'ere', confidence: 0.72, type: 'present' }, // lui/lei
|
|
121
|
+
{ ending: 'iamo', stem: 'ere', confidence: 0.85, type: 'present' }, // noi
|
|
122
|
+
{ ending: 'ete', stem: 'ere', confidence: 0.85, type: 'present' }, // voi
|
|
123
|
+
{ ending: 'ono', stem: 'ere', confidence: 0.82, type: 'present' }, // loro
|
|
124
|
+
// Imperfect
|
|
125
|
+
{ ending: 'evo', stem: 'ere', confidence: 0.88, type: 'past' }, // io
|
|
126
|
+
{ ending: 'evi', stem: 'ere', confidence: 0.88, type: 'past' }, // tu
|
|
127
|
+
{ ending: 'eva', stem: 'ere', confidence: 0.88, type: 'past' }, // lui/lei
|
|
128
|
+
{ ending: 'evamo', stem: 'ere', confidence: 0.88, type: 'past' }, // noi
|
|
129
|
+
{ ending: 'evate', stem: 'ere', confidence: 0.88, type: 'past' }, // voi
|
|
130
|
+
{ ending: 'evano', stem: 'ere', confidence: 0.88, type: 'past' }, // loro
|
|
131
|
+
// Preterite
|
|
132
|
+
{ ending: 'ei', stem: 'ere', confidence: 0.85, type: 'past' }, // io
|
|
133
|
+
{ ending: 'etti', stem: 'ere', confidence: 0.85, type: 'past' }, // io (variant)
|
|
134
|
+
{ ending: 'esti', stem: 'ere', confidence: 0.88, type: 'past' }, // tu
|
|
135
|
+
{ ending: 'é', stem: 'ere', confidence: 0.85, type: 'past' }, // lui/lei
|
|
136
|
+
{ ending: 'ette', stem: 'ere', confidence: 0.85, type: 'past' }, // lui/lei (variant)
|
|
137
|
+
{ ending: 'emmo', stem: 'ere', confidence: 0.88, type: 'past' }, // noi
|
|
138
|
+
{ ending: 'este', stem: 'ere', confidence: 0.88, type: 'past' }, // voi
|
|
139
|
+
{ ending: 'erono', stem: 'ere', confidence: 0.88, type: 'past' }, // loro
|
|
140
|
+
{ ending: 'ettero', stem: 'ere', confidence: 0.88, type: 'past' }, // loro (variant)
|
|
141
|
+
// Infinitive
|
|
142
|
+
{ ending: 'ere', stem: 'ere', confidence: 0.92, type: 'dictionary' },
|
|
143
|
+
];
|
|
144
|
+
|
|
145
|
+
/**
|
|
146
|
+
* -IRE verb conjugation endings.
|
|
147
|
+
*/
|
|
148
|
+
const IRE_ENDINGS: readonly {
|
|
149
|
+
ending: string;
|
|
150
|
+
stem: string;
|
|
151
|
+
confidence: number;
|
|
152
|
+
type: ConjugationType;
|
|
153
|
+
}[] = [
|
|
154
|
+
// Gerund (-endo)
|
|
155
|
+
{ ending: 'endo', stem: 'ire', confidence: 0.85, type: 'gerund' },
|
|
156
|
+
// Past participle (-ito)
|
|
157
|
+
{ ending: 'ito', stem: 'ire', confidence: 0.85, type: 'participle' },
|
|
158
|
+
{ ending: 'ita', stem: 'ire', confidence: 0.85, type: 'participle' },
|
|
159
|
+
{ ending: 'iti', stem: 'ire', confidence: 0.85, type: 'participle' },
|
|
160
|
+
{ ending: 'ite', stem: 'ire', confidence: 0.85, type: 'participle' },
|
|
161
|
+
// Present indicative (standard)
|
|
162
|
+
{ ending: 'o', stem: 'ire', confidence: 0.7, type: 'present' }, // io
|
|
163
|
+
{ ending: 'i', stem: 'ire', confidence: 0.7, type: 'present' }, // tu
|
|
164
|
+
{ ending: 'e', stem: 'ire', confidence: 0.7, type: 'present' }, // lui/lei
|
|
165
|
+
{ ending: 'iamo', stem: 'ire', confidence: 0.85, type: 'present' }, // noi
|
|
166
|
+
{ ending: 'ite', stem: 'ire', confidence: 0.85, type: 'present' }, // voi
|
|
167
|
+
{ ending: 'ono', stem: 'ire', confidence: 0.78, type: 'present' }, // loro
|
|
168
|
+
// Present indicative (-isco verbs)
|
|
169
|
+
{ ending: 'isco', stem: 'ire', confidence: 0.85, type: 'present' }, // io
|
|
170
|
+
{ ending: 'isci', stem: 'ire', confidence: 0.85, type: 'present' }, // tu
|
|
171
|
+
{ ending: 'isce', stem: 'ire', confidence: 0.85, type: 'present' }, // lui/lei
|
|
172
|
+
{ ending: 'iscono', stem: 'ire', confidence: 0.88, type: 'present' }, // loro
|
|
173
|
+
// Imperfect
|
|
174
|
+
{ ending: 'ivo', stem: 'ire', confidence: 0.88, type: 'past' }, // io
|
|
175
|
+
{ ending: 'ivi', stem: 'ire', confidence: 0.88, type: 'past' }, // tu
|
|
176
|
+
{ ending: 'iva', stem: 'ire', confidence: 0.88, type: 'past' }, // lui/lei
|
|
177
|
+
{ ending: 'ivamo', stem: 'ire', confidence: 0.88, type: 'past' }, // noi
|
|
178
|
+
{ ending: 'ivate', stem: 'ire', confidence: 0.88, type: 'past' }, // voi
|
|
179
|
+
{ ending: 'ivano', stem: 'ire', confidence: 0.88, type: 'past' }, // loro
|
|
180
|
+
// Preterite
|
|
181
|
+
{ ending: 'ii', stem: 'ire', confidence: 0.85, type: 'past' }, // io
|
|
182
|
+
{ ending: 'isti', stem: 'ire', confidence: 0.88, type: 'past' }, // tu
|
|
183
|
+
{ ending: 'ì', stem: 'ire', confidence: 0.85, type: 'past' }, // lui/lei
|
|
184
|
+
{ ending: 'immo', stem: 'ire', confidence: 0.88, type: 'past' }, // noi
|
|
185
|
+
{ ending: 'iste', stem: 'ire', confidence: 0.88, type: 'past' }, // voi
|
|
186
|
+
{ ending: 'irono', stem: 'ire', confidence: 0.88, type: 'past' }, // loro
|
|
187
|
+
// Infinitive
|
|
188
|
+
{ ending: 'ire', stem: 'ire', confidence: 0.92, type: 'dictionary' },
|
|
189
|
+
];
|
|
190
|
+
|
|
191
|
+
/**
|
|
192
|
+
* All endings combined, sorted by length (longest first).
|
|
193
|
+
*/
|
|
194
|
+
const ALL_ENDINGS = ARE_ENDINGS.concat(ERE_ENDINGS, IRE_ENDINGS).sort(
  // Descending by suffix length. Array.prototype.sort is stable (ES2019+), so
  // endings of equal length keep their concatenation order: -are, -ere, -ire.
  (left, right) => right.ending.length - left.ending.length
);
|
|
197
|
+
|
|
198
|
+
/**
|
|
199
|
+
* Italian morphological normalizer.
|
|
200
|
+
*/
|
|
201
|
+
export class ItalianMorphologicalNormalizer implements MorphologicalNormalizer {
  readonly language = 'it';

  /**
   * Cheap pre-filter for {@link normalize}.
   *
   * @param word - Candidate token.
   * @returns `false` for tokens shorter than three characters; otherwise the
   *   verdict of `looksLikeItalianVerb`.
   */
  isNormalizable(word: string): boolean {
    if (word.length < 3) return false;
    return looksLikeItalianVerb(word);
  }

  /**
   * Normalize an Italian word to its (guessed) infinitive form.
   *
   * Matching is done on the lower-cased word. Resolution order:
   *   1. words already ending in -are / -ere / -ire are returned unchanged;
   *   2. reflexive forms (clitic pronoun attached at the end);
   *   3. regular conjugation endings from `ALL_ENDINGS`;
   *   4. otherwise the word is returned unchanged.
   *
   * @param word - Token to normalize (any casing).
   * @returns The normalization result; `noChange(word)` keeps the caller's
   *   original casing when nothing applies.
   */
  normalize(word: string): NormalizationResult {
    const lower = word.toLowerCase();

    // Already an infinitive. Italian clitics attach AFTER the infinitive stem
    // (mostrarsi), never before the -are/-ere/-ire ending, so a word with one
    // of these endings cannot be a reflexive form and needs no further work.
    // (The previous check for "ci" + "are" etc. only caused infinitives such
    // as "cominciare" or "inviare" to miss this early return.)
    if (lower.endsWith('are') || lower.endsWith('ere') || lower.endsWith('ire')) {
      return noChange(word);
    }

    // Reflexive forms take priority over plain conjugation endings.
    const reflexiveResult = this.tryReflexiveNormalization(lower);
    if (reflexiveResult) return reflexiveResult;

    const conjugationResult = this.tryConjugationNormalization(lower);
    if (conjugationResult) return conjugationResult;

    return noChange(word);
  }

  /**
   * Try to normalize a verb carrying an attached clitic pronoun
   * (-si, -mi, -ti, -ci, -vi).
   *
   * Reflexive infinitives drop the final -e before the pronoun:
   *   mostrare + si → mostrarsi, nascondere + si → nascondersi
   * so stripping the pronoun and re-adding "e" restores the infinitive.
   *
   * @param word - Lower-cased word.
   * @returns A result tagged `'reflexive'`, or `null` when no reflexive
   *   reading applies.
   */
  private tryReflexiveNormalization(word: string): NormalizationResult | null {
    for (const suffix of REFLEXIVE_SUFFIXES) {
      if (!word.endsWith(suffix)) continue;

      const withoutReflexive = word.slice(0, -suffix.length);

      // Truncated infinitive + pronoun (mostrar + si): restore the dropped "e".
      if (
        withoutReflexive.endsWith('ar') ||
        withoutReflexive.endsWith('er') ||
        withoutReflexive.endsWith('ir')
      ) {
        return normalized(withoutReflexive + 'e', 0.88, {
          removedSuffixes: [suffix],
          conjugationType: 'reflexive',
        });
      }

      // Full infinitive + pronoun (less common case).
      if (
        withoutReflexive.endsWith('are') ||
        withoutReflexive.endsWith('ere') ||
        withoutReflexive.endsWith('ire')
      ) {
        return normalized(withoutReflexive, 0.88, {
          removedSuffixes: [suffix],
          conjugationType: 'reflexive',
        });
      }

      // If a conjugation ending longer than the pronoun already matches the
      // whole word (e.g. "iti" in "finiti", "ivi" in "finivi"), the trailing
      // "ti"/"vi" is part of that ending rather than a clitic. Peeling it off
      // here would leave "fini" and yield the wrong infinitive "finare", so
      // defer to the plain conjugation pass instead.
      const directRule = this.findConjugationRule(word);
      if (directRule && directRule.ending.length > suffix.length) continue;

      // Conjugated form + pronoun (e.g. gerund "alzandosi").
      const innerResult = this.tryConjugationNormalization(withoutReflexive);
      if (innerResult && innerResult.stem !== withoutReflexive) {
        // Slightly discount confidence: two heuristic steps were chained.
        return normalized(innerResult.stem, innerResult.confidence * 0.95, {
          removedSuffixes: [suffix, ...(innerResult.metadata?.removedSuffixes ?? [])],
          conjugationType: 'reflexive',
        });
      }
    }

    return null;
  }

  /**
   * Try to normalize a conjugated verb to its infinitive by swapping the
   * first matching ending in `ALL_ENDINGS` for that rule's infinitive ending.
   *
   * @param word - Lower-cased word.
   * @returns The reconstructed infinitive with the rule's confidence and
   *   conjugation type, or `null` when no rule matches.
   */
  private tryConjugationNormalization(word: string): NormalizationResult | null {
    const rule = this.findConjugationRule(word);
    if (!rule) return null;

    const stemBase = word.slice(0, -rule.ending.length);
    return normalized(stemBase + rule.stem, rule.confidence, {
      removedSuffixes: [rule.ending],
      conjugationType: rule.type,
    });
  }

  /**
   * Find the first rule in `ALL_ENDINGS` whose ending matches `word` while
   * leaving a stem of at least two characters.
   */
  private findConjugationRule(word: string): (typeof ALL_ENDINGS)[number] | undefined {
    return ALL_ENDINGS.find(
      rule => word.endsWith(rule.ending) && word.length - rule.ending.length >= 2
    );
  }
}
|
|
327
|
+
|
|
328
|
+
// Export singleton instance
|
|
329
|
+
export const italianMorphologicalNormalizer = new ItalianMorphologicalNormalizer();
|