@lokascript/semantic 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +686 -0
- package/dist/browser-ar.ar.global.js +2 -0
- package/dist/browser-core.core.global.js +2 -0
- package/dist/browser-de.de.global.js +2 -0
- package/dist/browser-east-asian.east-asian.global.js +2 -0
- package/dist/browser-en-tr.en-tr.global.js +2 -0
- package/dist/browser-en.en.global.js +2 -0
- package/dist/browser-es-en.es-en.global.js +2 -0
- package/dist/browser-es.es.global.js +2 -0
- package/dist/browser-fr.fr.global.js +2 -0
- package/dist/browser-id.id.global.js +2 -0
- package/dist/browser-ja.ja.global.js +2 -0
- package/dist/browser-ko.ko.global.js +2 -0
- package/dist/browser-lazy.lazy.global.js +2 -0
- package/dist/browser-priority.priority.global.js +2 -0
- package/dist/browser-pt.pt.global.js +2 -0
- package/dist/browser-qu.qu.global.js +2 -0
- package/dist/browser-sw.sw.global.js +2 -0
- package/dist/browser-tr.tr.global.js +2 -0
- package/dist/browser-western.western.global.js +2 -0
- package/dist/browser-zh.zh.global.js +2 -0
- package/dist/browser.global.js +3 -0
- package/dist/browser.global.js.map +1 -0
- package/dist/index.cjs +35051 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +3426 -0
- package/dist/index.d.ts +3426 -0
- package/dist/index.js +34890 -0
- package/dist/index.js.map +1 -0
- package/dist/languages/ar.d.ts +78 -0
- package/dist/languages/ar.js +1622 -0
- package/dist/languages/ar.js.map +1 -0
- package/dist/languages/de.d.ts +38 -0
- package/dist/languages/de.js +1168 -0
- package/dist/languages/de.js.map +1 -0
- package/dist/languages/en.d.ts +44 -0
- package/dist/languages/en.js +3491 -0
- package/dist/languages/en.js.map +1 -0
- package/dist/languages/es.d.ts +52 -0
- package/dist/languages/es.js +1493 -0
- package/dist/languages/es.js.map +1 -0
- package/dist/languages/fr.d.ts +37 -0
- package/dist/languages/fr.js +1159 -0
- package/dist/languages/fr.js.map +1 -0
- package/dist/languages/id.d.ts +35 -0
- package/dist/languages/id.js +1152 -0
- package/dist/languages/id.js.map +1 -0
- package/dist/languages/ja.d.ts +53 -0
- package/dist/languages/ja.js +1430 -0
- package/dist/languages/ja.js.map +1 -0
- package/dist/languages/ko.d.ts +51 -0
- package/dist/languages/ko.js +1729 -0
- package/dist/languages/ko.js.map +1 -0
- package/dist/languages/pt.d.ts +37 -0
- package/dist/languages/pt.js +1127 -0
- package/dist/languages/pt.js.map +1 -0
- package/dist/languages/qu.d.ts +36 -0
- package/dist/languages/qu.js +1143 -0
- package/dist/languages/qu.js.map +1 -0
- package/dist/languages/sw.d.ts +35 -0
- package/dist/languages/sw.js +1147 -0
- package/dist/languages/sw.js.map +1 -0
- package/dist/languages/tr.d.ts +45 -0
- package/dist/languages/tr.js +1529 -0
- package/dist/languages/tr.js.map +1 -0
- package/dist/languages/zh.d.ts +58 -0
- package/dist/languages/zh.js +1257 -0
- package/dist/languages/zh.js.map +1 -0
- package/dist/types-C4dcj53L.d.ts +600 -0
- package/package.json +202 -0
- package/src/__test-utils__/index.ts +7 -0
- package/src/__test-utils__/test-helpers.ts +8 -0
- package/src/__types__/test-helpers.ts +122 -0
- package/src/analysis/index.ts +479 -0
- package/src/ast-builder/command-mappers.ts +1133 -0
- package/src/ast-builder/expression-parser/index.ts +41 -0
- package/src/ast-builder/expression-parser/parser.ts +563 -0
- package/src/ast-builder/expression-parser/tokenizer.ts +394 -0
- package/src/ast-builder/expression-parser/types.ts +208 -0
- package/src/ast-builder/index.ts +536 -0
- package/src/ast-builder/value-converters.ts +172 -0
- package/src/bridge.ts +275 -0
- package/src/browser-ar.ts +162 -0
- package/src/browser-core.ts +231 -0
- package/src/browser-de.ts +162 -0
- package/src/browser-east-asian.ts +173 -0
- package/src/browser-en-tr.ts +165 -0
- package/src/browser-en.ts +157 -0
- package/src/browser-es-en.ts +200 -0
- package/src/browser-es.ts +170 -0
- package/src/browser-fr.ts +162 -0
- package/src/browser-id.ts +162 -0
- package/src/browser-ja.ts +162 -0
- package/src/browser-ko.ts +162 -0
- package/src/browser-lazy.ts +189 -0
- package/src/browser-priority.ts +214 -0
- package/src/browser-pt.ts +162 -0
- package/src/browser-qu.ts +162 -0
- package/src/browser-sw.ts +162 -0
- package/src/browser-tr.ts +162 -0
- package/src/browser-western.ts +181 -0
- package/src/browser-zh.ts +162 -0
- package/src/browser.ts +268 -0
- package/src/cache/index.ts +14 -0
- package/src/cache/semantic-cache.ts +344 -0
- package/src/core-bridge.ts +372 -0
- package/src/explicit/converter.ts +258 -0
- package/src/explicit/index.ts +18 -0
- package/src/explicit/parser.ts +236 -0
- package/src/explicit/renderer.ts +424 -0
- package/src/generators/command-schemas.ts +1636 -0
- package/src/generators/event-handler-generator.ts +109 -0
- package/src/generators/index.ts +117 -0
- package/src/generators/language-profiles.ts +139 -0
- package/src/generators/pattern-generator.ts +537 -0
- package/src/generators/profiles/arabic.ts +131 -0
- package/src/generators/profiles/bengali.ts +132 -0
- package/src/generators/profiles/chinese.ts +124 -0
- package/src/generators/profiles/english.ts +113 -0
- package/src/generators/profiles/french.ts +125 -0
- package/src/generators/profiles/german.ts +126 -0
- package/src/generators/profiles/hindi.ts +146 -0
- package/src/generators/profiles/index.ts +46 -0
- package/src/generators/profiles/indonesian.ts +125 -0
- package/src/generators/profiles/italian.ts +139 -0
- package/src/generators/profiles/japanese.ts +149 -0
- package/src/generators/profiles/korean.ts +127 -0
- package/src/generators/profiles/marker-templates.ts +288 -0
- package/src/generators/profiles/ms.ts +130 -0
- package/src/generators/profiles/polish.ts +249 -0
- package/src/generators/profiles/portuguese.ts +115 -0
- package/src/generators/profiles/quechua.ts +113 -0
- package/src/generators/profiles/russian.ts +260 -0
- package/src/generators/profiles/spanish.ts +130 -0
- package/src/generators/profiles/swahili.ts +129 -0
- package/src/generators/profiles/thai.ts +132 -0
- package/src/generators/profiles/tl.ts +128 -0
- package/src/generators/profiles/turkish.ts +124 -0
- package/src/generators/profiles/types.ts +165 -0
- package/src/generators/profiles/ukrainian.ts +270 -0
- package/src/generators/profiles/vietnamese.ts +133 -0
- package/src/generators/schema-error-codes.ts +160 -0
- package/src/generators/schema-validator.ts +391 -0
- package/src/index.ts +429 -0
- package/src/language-building-schema.ts +3170 -0
- package/src/language-loader.ts +394 -0
- package/src/languages/_all.ts +65 -0
- package/src/languages/ar.ts +15 -0
- package/src/languages/bn.ts +16 -0
- package/src/languages/de.ts +15 -0
- package/src/languages/en.ts +29 -0
- package/src/languages/es.ts +15 -0
- package/src/languages/fr.ts +15 -0
- package/src/languages/hi.ts +26 -0
- package/src/languages/id.ts +15 -0
- package/src/languages/index.ts +18 -0
- package/src/languages/it.ts +15 -0
- package/src/languages/ja.ts +15 -0
- package/src/languages/ko.ts +15 -0
- package/src/languages/ms.ts +16 -0
- package/src/languages/pl.ts +18 -0
- package/src/languages/pt.ts +15 -0
- package/src/languages/qu.ts +15 -0
- package/src/languages/ru.ts +26 -0
- package/src/languages/sw.ts +15 -0
- package/src/languages/th.ts +16 -0
- package/src/languages/tl.ts +16 -0
- package/src/languages/tr.ts +15 -0
- package/src/languages/uk.ts +26 -0
- package/src/languages/vi.ts +16 -0
- package/src/languages/zh.ts +15 -0
- package/src/parser/index.ts +15 -0
- package/src/parser/pattern-matcher.ts +1181 -0
- package/src/parser/semantic-parser.ts +573 -0
- package/src/parser/utils/index.ts +35 -0
- package/src/parser/utils/marker-resolution.ts +111 -0
- package/src/parser/utils/possessive-keywords.ts +43 -0
- package/src/parser/utils/role-positioning.ts +70 -0
- package/src/parser/utils/type-validation.ts +134 -0
- package/src/patterns/add/ar.ts +71 -0
- package/src/patterns/add/bn.ts +70 -0
- package/src/patterns/add/hi.ts +69 -0
- package/src/patterns/add/index.ts +87 -0
- package/src/patterns/add/it.ts +61 -0
- package/src/patterns/add/ja.ts +93 -0
- package/src/patterns/add/ko.ts +74 -0
- package/src/patterns/add/ms.ts +30 -0
- package/src/patterns/add/pl.ts +62 -0
- package/src/patterns/add/ru.ts +62 -0
- package/src/patterns/add/th.ts +49 -0
- package/src/patterns/add/tl.ts +30 -0
- package/src/patterns/add/tr.ts +71 -0
- package/src/patterns/add/uk.ts +62 -0
- package/src/patterns/add/vi.ts +61 -0
- package/src/patterns/add/zh.ts +71 -0
- package/src/patterns/builders.ts +207 -0
- package/src/patterns/decrement/bn.ts +70 -0
- package/src/patterns/decrement/de.ts +42 -0
- package/src/patterns/decrement/hi.ts +68 -0
- package/src/patterns/decrement/index.ts +79 -0
- package/src/patterns/decrement/it.ts +69 -0
- package/src/patterns/decrement/ms.ts +30 -0
- package/src/patterns/decrement/pl.ts +58 -0
- package/src/patterns/decrement/ru.ts +58 -0
- package/src/patterns/decrement/th.ts +49 -0
- package/src/patterns/decrement/tl.ts +30 -0
- package/src/patterns/decrement/tr.ts +48 -0
- package/src/patterns/decrement/uk.ts +58 -0
- package/src/patterns/decrement/vi.ts +61 -0
- package/src/patterns/decrement/zh.ts +32 -0
- package/src/patterns/en.ts +302 -0
- package/src/patterns/event-handler/ar.ts +151 -0
- package/src/patterns/event-handler/bn.ts +72 -0
- package/src/patterns/event-handler/de.ts +117 -0
- package/src/patterns/event-handler/en.ts +117 -0
- package/src/patterns/event-handler/es.ts +136 -0
- package/src/patterns/event-handler/fr.ts +117 -0
- package/src/patterns/event-handler/hi.ts +64 -0
- package/src/patterns/event-handler/id.ts +117 -0
- package/src/patterns/event-handler/index.ts +119 -0
- package/src/patterns/event-handler/it.ts +54 -0
- package/src/patterns/event-handler/ja.ts +118 -0
- package/src/patterns/event-handler/ko.ts +133 -0
- package/src/patterns/event-handler/ms.ts +30 -0
- package/src/patterns/event-handler/pl.ts +62 -0
- package/src/patterns/event-handler/pt.ts +117 -0
- package/src/patterns/event-handler/qu.ts +66 -0
- package/src/patterns/event-handler/ru.ts +62 -0
- package/src/patterns/event-handler/shared.ts +270 -0
- package/src/patterns/event-handler/sw.ts +117 -0
- package/src/patterns/event-handler/th.ts +53 -0
- package/src/patterns/event-handler/tl.ts +30 -0
- package/src/patterns/event-handler/tr.ts +170 -0
- package/src/patterns/event-handler/uk.ts +62 -0
- package/src/patterns/event-handler/vi.ts +61 -0
- package/src/patterns/event-handler/zh.ts +150 -0
- package/src/patterns/get/ar.ts +49 -0
- package/src/patterns/get/bn.ts +47 -0
- package/src/patterns/get/de.ts +32 -0
- package/src/patterns/get/hi.ts +52 -0
- package/src/patterns/get/index.ts +83 -0
- package/src/patterns/get/it.ts +56 -0
- package/src/patterns/get/ja.ts +53 -0
- package/src/patterns/get/ko.ts +53 -0
- package/src/patterns/get/ms.ts +30 -0
- package/src/patterns/get/pl.ts +57 -0
- package/src/patterns/get/ru.ts +57 -0
- package/src/patterns/get/th.ts +29 -0
- package/src/patterns/get/tl.ts +30 -0
- package/src/patterns/get/uk.ts +57 -0
- package/src/patterns/get/vi.ts +48 -0
- package/src/patterns/grammar-transformed/index.ts +39 -0
- package/src/patterns/grammar-transformed/ja.ts +1713 -0
- package/src/patterns/grammar-transformed/ko.ts +1311 -0
- package/src/patterns/grammar-transformed/tr.ts +1067 -0
- package/src/patterns/hide/ar.ts +67 -0
- package/src/patterns/hide/bn.ts +47 -0
- package/src/patterns/hide/de.ts +36 -0
- package/src/patterns/hide/hi.ts +61 -0
- package/src/patterns/hide/index.ts +91 -0
- package/src/patterns/hide/it.ts +56 -0
- package/src/patterns/hide/ja.ts +69 -0
- package/src/patterns/hide/ko.ts +69 -0
- package/src/patterns/hide/ms.ts +30 -0
- package/src/patterns/hide/pl.ts +57 -0
- package/src/patterns/hide/ru.ts +57 -0
- package/src/patterns/hide/th.ts +29 -0
- package/src/patterns/hide/tl.ts +30 -0
- package/src/patterns/hide/tr.ts +65 -0
- package/src/patterns/hide/uk.ts +57 -0
- package/src/patterns/hide/vi.ts +56 -0
- package/src/patterns/hide/zh.ts +68 -0
- package/src/patterns/increment/bn.ts +70 -0
- package/src/patterns/increment/de.ts +36 -0
- package/src/patterns/increment/hi.ts +68 -0
- package/src/patterns/increment/index.ts +79 -0
- package/src/patterns/increment/it.ts +69 -0
- package/src/patterns/increment/ms.ts +30 -0
- package/src/patterns/increment/pl.ts +58 -0
- package/src/patterns/increment/ru.ts +58 -0
- package/src/patterns/increment/th.ts +49 -0
- package/src/patterns/increment/tl.ts +30 -0
- package/src/patterns/increment/tr.ts +52 -0
- package/src/patterns/increment/uk.ts +58 -0
- package/src/patterns/increment/vi.ts +61 -0
- package/src/patterns/increment/zh.ts +32 -0
- package/src/patterns/index.ts +84 -0
- package/src/patterns/languages/en/control-flow.ts +93 -0
- package/src/patterns/languages/en/fetch.ts +62 -0
- package/src/patterns/languages/en/index.ts +42 -0
- package/src/patterns/languages/en/repeat.ts +67 -0
- package/src/patterns/languages/en/set.ts +48 -0
- package/src/patterns/languages/en/swap.ts +38 -0
- package/src/patterns/languages/en/temporal.ts +57 -0
- package/src/patterns/put/ar.ts +74 -0
- package/src/patterns/put/bn.ts +53 -0
- package/src/patterns/put/en.ts +74 -0
- package/src/patterns/put/es.ts +74 -0
- package/src/patterns/put/hi.ts +69 -0
- package/src/patterns/put/id.ts +96 -0
- package/src/patterns/put/index.ts +99 -0
- package/src/patterns/put/it.ts +56 -0
- package/src/patterns/put/ja.ts +75 -0
- package/src/patterns/put/ko.ts +67 -0
- package/src/patterns/put/ms.ts +30 -0
- package/src/patterns/put/pl.ts +81 -0
- package/src/patterns/put/ru.ts +85 -0
- package/src/patterns/put/th.ts +32 -0
- package/src/patterns/put/tl.ts +30 -0
- package/src/patterns/put/tr.ts +67 -0
- package/src/patterns/put/uk.ts +85 -0
- package/src/patterns/put/vi.ts +72 -0
- package/src/patterns/put/zh.ts +62 -0
- package/src/patterns/registry.ts +163 -0
- package/src/patterns/remove/ar.ts +71 -0
- package/src/patterns/remove/bn.ts +68 -0
- package/src/patterns/remove/hi.ts +69 -0
- package/src/patterns/remove/index.ts +87 -0
- package/src/patterns/remove/it.ts +69 -0
- package/src/patterns/remove/ja.ts +74 -0
- package/src/patterns/remove/ko.ts +78 -0
- package/src/patterns/remove/ms.ts +30 -0
- package/src/patterns/remove/pl.ts +62 -0
- package/src/patterns/remove/ru.ts +62 -0
- package/src/patterns/remove/th.ts +49 -0
- package/src/patterns/remove/tl.ts +30 -0
- package/src/patterns/remove/tr.ts +78 -0
- package/src/patterns/remove/uk.ts +62 -0
- package/src/patterns/remove/vi.ts +61 -0
- package/src/patterns/remove/zh.ts +72 -0
- package/src/patterns/set/ar.ts +84 -0
- package/src/patterns/set/bn.ts +53 -0
- package/src/patterns/set/de.ts +84 -0
- package/src/patterns/set/es.ts +92 -0
- package/src/patterns/set/fr.ts +88 -0
- package/src/patterns/set/hi.ts +56 -0
- package/src/patterns/set/id.ts +84 -0
- package/src/patterns/set/index.ts +107 -0
- package/src/patterns/set/it.ts +56 -0
- package/src/patterns/set/ja.ts +86 -0
- package/src/patterns/set/ko.ts +85 -0
- package/src/patterns/set/ms.ts +30 -0
- package/src/patterns/set/pl.ts +57 -0
- package/src/patterns/set/pt.ts +84 -0
- package/src/patterns/set/ru.ts +57 -0
- package/src/patterns/set/th.ts +31 -0
- package/src/patterns/set/tl.ts +30 -0
- package/src/patterns/set/tr.ts +107 -0
- package/src/patterns/set/uk.ts +57 -0
- package/src/patterns/set/vi.ts +53 -0
- package/src/patterns/set/zh.ts +84 -0
- package/src/patterns/show/ar.ts +67 -0
- package/src/patterns/show/bn.ts +47 -0
- package/src/patterns/show/de.ts +32 -0
- package/src/patterns/show/fr.ts +32 -0
- package/src/patterns/show/hi.ts +61 -0
- package/src/patterns/show/index.ts +95 -0
- package/src/patterns/show/it.ts +56 -0
- package/src/patterns/show/ja.ts +69 -0
- package/src/patterns/show/ko.ts +73 -0
- package/src/patterns/show/ms.ts +30 -0
- package/src/patterns/show/pl.ts +57 -0
- package/src/patterns/show/ru.ts +57 -0
- package/src/patterns/show/th.ts +29 -0
- package/src/patterns/show/tl.ts +30 -0
- package/src/patterns/show/tr.ts +65 -0
- package/src/patterns/show/uk.ts +57 -0
- package/src/patterns/show/vi.ts +56 -0
- package/src/patterns/show/zh.ts +68 -0
- package/src/patterns/take/ar.ts +51 -0
- package/src/patterns/take/index.ts +31 -0
- package/src/patterns/toggle/ar.ts +61 -0
- package/src/patterns/toggle/bn.ts +70 -0
- package/src/patterns/toggle/en.ts +61 -0
- package/src/patterns/toggle/es.ts +61 -0
- package/src/patterns/toggle/hi.ts +80 -0
- package/src/patterns/toggle/index.ts +95 -0
- package/src/patterns/toggle/it.ts +69 -0
- package/src/patterns/toggle/ja.ts +156 -0
- package/src/patterns/toggle/ko.ts +113 -0
- package/src/patterns/toggle/ms.ts +30 -0
- package/src/patterns/toggle/pl.ts +62 -0
- package/src/patterns/toggle/ru.ts +62 -0
- package/src/patterns/toggle/th.ts +50 -0
- package/src/patterns/toggle/tl.ts +30 -0
- package/src/patterns/toggle/tr.ts +88 -0
- package/src/patterns/toggle/uk.ts +62 -0
- package/src/patterns/toggle/vi.ts +61 -0
- package/src/patterns/toggle/zh.ts +99 -0
- package/src/public-api.ts +286 -0
- package/src/registry.ts +441 -0
- package/src/tokenizers/arabic.ts +723 -0
- package/src/tokenizers/base.ts +1300 -0
- package/src/tokenizers/bengali.ts +289 -0
- package/src/tokenizers/chinese.ts +481 -0
- package/src/tokenizers/english.ts +416 -0
- package/src/tokenizers/french.ts +326 -0
- package/src/tokenizers/german.ts +324 -0
- package/src/tokenizers/hindi.ts +319 -0
- package/src/tokenizers/index.ts +127 -0
- package/src/tokenizers/indonesian.ts +306 -0
- package/src/tokenizers/italian.ts +458 -0
- package/src/tokenizers/japanese.ts +447 -0
- package/src/tokenizers/korean.ts +642 -0
- package/src/tokenizers/morphology/arabic-normalizer.ts +242 -0
- package/src/tokenizers/morphology/french-normalizer.ts +268 -0
- package/src/tokenizers/morphology/german-normalizer.ts +256 -0
- package/src/tokenizers/morphology/index.ts +46 -0
- package/src/tokenizers/morphology/italian-normalizer.ts +329 -0
- package/src/tokenizers/morphology/japanese-normalizer.ts +288 -0
- package/src/tokenizers/morphology/korean-normalizer.ts +428 -0
- package/src/tokenizers/morphology/polish-normalizer.ts +264 -0
- package/src/tokenizers/morphology/portuguese-normalizer.ts +310 -0
- package/src/tokenizers/morphology/spanish-normalizer.ts +327 -0
- package/src/tokenizers/morphology/turkish-normalizer.ts +412 -0
- package/src/tokenizers/morphology/types.ts +211 -0
- package/src/tokenizers/ms.ts +198 -0
- package/src/tokenizers/polish.ts +354 -0
- package/src/tokenizers/portuguese.ts +304 -0
- package/src/tokenizers/quechua.ts +339 -0
- package/src/tokenizers/russian.ts +375 -0
- package/src/tokenizers/spanish.ts +403 -0
- package/src/tokenizers/swahili.ts +303 -0
- package/src/tokenizers/thai.ts +236 -0
- package/src/tokenizers/tl.ts +198 -0
- package/src/tokenizers/turkish.ts +411 -0
- package/src/tokenizers/ukrainian.ts +369 -0
- package/src/tokenizers/vietnamese.ts +410 -0
- package/src/types/grammar-types.ts +617 -0
- package/src/types/unified-profile.ts +267 -0
- package/src/types.ts +709 -0
- package/src/utils/confidence-calculator.ts +147 -0
- package/src/validators/command-validator.ts +380 -0
- package/src/validators/index.ts +15 -0
|
@@ -0,0 +1,310 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Portuguese Morphological Normalizer
|
|
3
|
+
*
|
|
4
|
+
* Reduces Portuguese verb conjugations to their infinitive forms.
|
|
5
|
+
* Portuguese has three verb conjugation classes (-ar, -er, -ir) and
|
|
6
|
+
* supports reflexive verbs (verbs with -se suffix).
|
|
7
|
+
*
|
|
8
|
+
* Key features:
|
|
9
|
+
* - Reflexive verb handling: mostrar-se → mostrar, esconder-se → esconder
|
|
10
|
+
* - Regular conjugation patterns for -ar, -er, -ir verbs
|
|
11
|
+
* - Handles common irregular verbs
|
|
12
|
+
* - Brazilian Portuguese variants
|
|
13
|
+
*
|
|
14
|
+
* Examples:
|
|
15
|
+
* mostrar-se → mostrar (reflexive infinitive)
|
|
16
|
+
* alternando → alternar (gerund)
|
|
17
|
+
* escondido → esconder (past participle)
|
|
18
|
+
* mostra → mostrar (3rd person present)
|
|
19
|
+
* clicou → clicar (3rd person preterite)
|
|
20
|
+
*/
|
|
21
|
+
|
|
22
|
+
import type { MorphologicalNormalizer, NormalizationResult, ConjugationType } from './types';
|
|
23
|
+
import { noChange, normalized } from './types';
|
|
24
|
+
|
|
25
|
+
/**
 * Report whether the given text contains a Portuguese-specific letter
 * (an accented vowel or ç, in either case).
 *
 * @param char - Character to test. Longer strings are accepted too; the result
 *   is true as soon as any character in the string is in the set.
 * @returns true when at least one Portuguese-specific letter is present.
 */
function isPortugueseSpecificLetter(char: string): boolean {
  return /[áàâãéêíóôõúüçÁÀÂÃÉÊÍÓÔÕÚÜÇ]/.test(char);
}

/**
 * Heuristically decide whether a word looks like a Portuguese verb form.
 *
 * A word qualifies when, compared case-insensitively, it:
 * - ends in an infinitive ending (-ar, -er, -ir),
 * - ends in a gerund ending (-ando, -endo, -indo),
 * - ends in a masculine singular participle ending (-ado, -ido),
 * - is an infinitive followed by a hyphenated pronoun
 *   (-se, -me, -te, -nos, -vos), e.g. "mostrar-se", "esconder-me", or
 * - contains any Portuguese-specific letter (accented vowel or ç).
 *
 * Previously only the "-se" pronoun was recognised here, so reflexive
 * infinitives such as "esconder-me" were rejected even though the file's
 * reflexive suffix list covers all five pronouns.
 *
 * @param word - Candidate word, in any letter case.
 * @returns true when the word matches one of the heuristics above.
 */
function looksLikePortugueseVerb(word: string): boolean {
  const lower = word.toLowerCase();

  // Infinitive, gerund and participle endings.
  if (/(?:ar|er|ir|ando|endo|indo|ado|ido)$/.test(lower)) return true;

  // Infinitive + hyphenated reflexive/object pronoun (all five pronouns).
  if (/(?:ar|er|ir)-(?:se|me|te|nos|vos)$/.test(lower)) return true;

  // Fall back to spotting Portuguese-specific letters anywhere in the word.
  return [...word].some(letter => isPortugueseSpecificLetter(letter));
}
|
|
51
|
+
|
|
52
|
+
/**
 * Hyphenated pronoun suffixes that may trail a Portuguese verb,
 * e.g. "mostrar-se" or "esconder-me". Each entry includes the leading hyphen.
 */
const REFLEXIVE_SUFFIXES: string[] = [
  '-se',
  '-me',
  '-te',
  '-nos',
  '-vos',
];
|
|
57
|
+
|
|
58
|
+
/**
 * Conjugation endings of -AR verbs.
 *
 * Written as compact `[ending, confidence, type]` rows and expanded into
 * `{ ending, stem, confidence, type }` records; every record gets
 * `stem: 'ar'` (presumably the infinitive suffix that replaces `ending` —
 * confirm against the matcher). Row order is preserved in the result.
 */
const AR_ENDINGS: readonly {
  ending: string;
  stem: string;
  confidence: number;
  type: ConjugationType;
}[] = (
  [
    // Gerund
    ['ando', 0.88, 'gerund'],
    // Past participle (masculine/feminine, singular/plural)
    ['ado', 0.88, 'participle'],
    ['ada', 0.88, 'participle'],
    ['ados', 0.88, 'participle'],
    ['adas', 0.88, 'participle'],
    // Present indicative
    ['o', 0.75, 'present'], // eu
    ['as', 0.82, 'present'], // tu
    ['a', 0.75, 'present'], // ele/ela/você
    ['amos', 0.85, 'present'], // nós
    ['ais', 0.85, 'present'], // vós
    ['am', 0.8, 'present'], // eles/elas/vocês
    // Preterite
    ['ei', 0.88, 'past'], // eu
    ['aste', 0.88, 'past'], // tu
    ['ou', 0.88, 'past'], // ele/ela/você
    ['ámos', 0.88, 'past'], // nós, accented spelling
    ['amos', 0.85, 'past'], // nós, Brazilian spelling
    ['astes', 0.88, 'past'], // vós
    ['aram', 0.88, 'past'], // eles/elas/vocês
    // Imperfect
    ['ava', 0.88, 'past'], // eu/ele
    ['avas', 0.88, 'past'], // tu
    ['ávamos', 0.88, 'past'], // nós
    ['avamos', 0.85, 'past'], // nós, typed without the accent
    ['áveis', 0.88, 'past'], // vós
    ['aveis', 0.85, 'past'], // vós, typed without the accent
    ['avam', 0.88, 'past'], // eles
    // Subjunctive
    ['e', 0.72, 'subjunctive'], // eu/ele (ambiguous)
    ['es', 0.78, 'subjunctive'], // tu
    ['emos', 0.82, 'subjunctive'], // nós
    ['eis', 0.82, 'subjunctive'], // vós
    ['em', 0.78, 'subjunctive'], // eles
    // Imperative
    ['a', 0.75, 'imperative'], // tu/você
    ['ai', 0.85, 'imperative'], // vós
    // Infinitive (dictionary form)
    ['ar', 0.92, 'dictionary'],
  ] as const
).map(([ending, confidence, type]) => ({ ending, stem: 'ar', confidence, type }));
|
|
109
|
+
|
|
110
|
+
/**
 * Conjugation endings of -ER verbs.
 *
 * Written as compact `[ending, confidence, type]` rows and expanded into
 * `{ ending, stem, confidence, type }` records; every record gets
 * `stem: 'er'`. Row order is preserved in the result.
 */
const ER_ENDINGS: readonly {
  ending: string;
  stem: string;
  confidence: number;
  type: ConjugationType;
}[] = (
  [
    // Gerund
    ['endo', 0.88, 'gerund'],
    // Past participle (masculine/feminine, singular/plural)
    ['ido', 0.85, 'participle'],
    ['ida', 0.85, 'participle'],
    ['idos', 0.85, 'participle'],
    ['idas', 0.85, 'participle'],
    // Present indicative
    ['o', 0.72, 'present'], // eu
    ['es', 0.78, 'present'], // tu
    ['e', 0.72, 'present'], // ele
    ['emos', 0.85, 'present'], // nós
    ['eis', 0.82, 'present'], // vós
    ['em', 0.78, 'present'], // eles
    // Preterite
    ['i', 0.85, 'past'], // eu
    ['este', 0.88, 'past'], // tu
    ['eu', 0.88, 'past'], // ele
    ['emos', 0.85, 'past'], // nós
    ['estes', 0.88, 'past'], // vós
    ['eram', 0.88, 'past'], // eles
    // Imperfect
    ['ia', 0.85, 'past'], // eu/ele
    ['ias', 0.85, 'past'], // tu
    ['íamos', 0.88, 'past'], // nós
    ['iamos', 0.85, 'past'], // nós, typed without the accent
    ['íeis', 0.88, 'past'], // vós
    ['ieis', 0.85, 'past'], // vós, typed without the accent
    ['iam', 0.85, 'past'], // eles
    // Infinitive (dictionary form)
    ['er', 0.92, 'dictionary'],
  ] as const
).map(([ending, confidence, type]) => ({ ending, stem: 'er', confidence, type }));
|
|
151
|
+
|
|
152
|
+
/**
 * Conjugation endings of -IR verbs.
 *
 * Written as compact `[ending, confidence, type]` rows and expanded into
 * `{ ending, stem, confidence, type }` records; every record gets
 * `stem: 'ir'`. Row order is preserved in the result.
 */
const IR_ENDINGS: readonly {
  ending: string;
  stem: string;
  confidence: number;
  type: ConjugationType;
}[] = (
  [
    // Gerund
    ['indo', 0.88, 'gerund'],
    // Past participle (masculine/feminine, singular/plural)
    ['ido', 0.85, 'participle'],
    ['ida', 0.85, 'participle'],
    ['idos', 0.85, 'participle'],
    ['idas', 0.85, 'participle'],
    // Present indicative
    ['o', 0.72, 'present'], // eu
    ['es', 0.78, 'present'], // tu
    ['e', 0.72, 'present'], // ele
    ['imos', 0.85, 'present'], // nós
    ['is', 0.82, 'present'], // vós
    ['em', 0.78, 'present'], // eles
    // Preterite
    ['i', 0.85, 'past'], // eu
    ['iste', 0.88, 'past'], // tu
    ['iu', 0.88, 'past'], // ele
    ['imos', 0.85, 'past'], // nós
    ['istes', 0.88, 'past'], // vós
    ['iram', 0.88, 'past'], // eles
    // Imperfect (same spellings as the -er table)
    ['ia', 0.85, 'past'],
    ['ias', 0.85, 'past'],
    ['íamos', 0.88, 'past'],
    ['iamos', 0.85, 'past'],
    ['íeis', 0.88, 'past'],
    ['ieis', 0.85, 'past'],
    ['iam', 0.85, 'past'],
    // Infinitive (dictionary form)
    ['ir', 0.92, 'dictionary'],
  ] as const
).map(([ending, confidence, type]) => ({ ending, stem: 'ir', confidence, type }));
|
|
193
|
+
|
|
194
|
+
/**
 * The -ar, -er and -ir ending tables merged into one new array and ordered
 * by ending length, longest first. Entries with equal-length endings stay in
 * their original relative order (-ar, then -er, then -ir) because
 * Array.prototype.sort is stable.
 */
const ALL_ENDINGS = AR_ENDINGS.concat(ER_ENDINGS, IR_ENDINGS).sort(
  (left, right) => right.ending.length - left.ending.length
);
|
|
200
|
+
|
|
201
|
+
/**
|
|
202
|
+
* Portuguese morphological normalizer.
|
|
203
|
+
*/
|
|
204
|
+
export class PortugueseMorphologicalNormalizer implements MorphologicalNormalizer {
|
|
205
|
+
readonly language = 'pt';
|
|
206
|
+
|
|
207
|
+
/**
 * Decide whether a word is worth passing to `normalize`.
 *
 * @param word - Candidate word.
 * @returns false for words shorter than three characters; otherwise the
 *   result of the `looksLikePortugueseVerb` heuristic.
 */
isNormalizable(word: string): boolean {
  return word.length >= 3 && looksLikePortugueseVerb(word);
}
|
|
214
|
+
|
|
215
|
+
/**
 * Reduce a Portuguese word to its infinitive form where possible.
 *
 * Matching is done on the lower-cased word. Resolution order:
 * 1. A word that already ends in -ar/-er/-ir (and carries no reflexive
 *    pronoun suffix) is returned unchanged.
 * 2. Reflexive normalization is attempted.
 * 3. Standard conjugation normalization is attempted.
 * 4. Otherwise the word is returned unchanged.
 *
 * @param word - Word to normalize, in any letter case.
 * @returns The first successful normalization, or a "no change" result
 *   built from the original (not lower-cased) word.
 */
normalize(word: string): NormalizationResult {
  const lowered = word.toLowerCase();

  const endsLikeInfinitive = ['ar', 'er', 'ir'].some(ending => lowered.endsWith(ending));
  if (endsLikeInfinitive && !REFLEXIVE_SUFFIXES.some(suffix => lowered.endsWith(suffix))) {
    // Plain infinitive: nothing to strip.
    return noChange(word);
  }

  // Reflexive handling takes priority over plain conjugation matching;
  // fall through to "no change" when neither strategy produces a result.
  return (
    this.tryReflexiveNormalization(lowered) ||
    this.tryConjugationNormalization(lowered) ||
    noChange(word)
  );
}
|
|
241
|
+
|
|
242
|
+
/**
|
|
243
|
+
* Try to normalize a reflexive verb.
|
|
244
|
+
* Portuguese reflexive verbs use hyphenated pronouns: mostrar-se, esconder-me
|
|
245
|
+
*
|
|
246
|
+
* Examples:
|
|
247
|
+
* mostrar-se → mostrar
|
|
248
|
+
* esconder-se → esconder
|
|
249
|
+
* exibir-se → exibir
|
|
250
|
+
*/
|
|
251
|
+
private tryReflexiveNormalization(word: string): NormalizationResult | null {
|
|
252
|
+
for (const suffix of REFLEXIVE_SUFFIXES) {
|
|
253
|
+
if (word.endsWith(suffix)) {
|
|
254
|
+
const withoutReflexive = word.slice(0, -suffix.length);
|
|
255
|
+
|
|
256
|
+
// Check if this looks like an infinitive
|
|
257
|
+
if (
|
|
258
|
+
withoutReflexive.endsWith('ar') ||
|
|
259
|
+
withoutReflexive.endsWith('er') ||
|
|
260
|
+
withoutReflexive.endsWith('ir')
|
|
261
|
+
) {
|
|
262
|
+
// It's a reflexive infinitive (e.g., mostrar-se → mostrar)
|
|
263
|
+
return normalized(withoutReflexive, 0.88, {
|
|
264
|
+
removedSuffixes: [suffix],
|
|
265
|
+
conjugationType: 'reflexive',
|
|
266
|
+
});
|
|
267
|
+
}
|
|
268
|
+
|
|
269
|
+
// Try to normalize the remaining part as a conjugated verb
|
|
270
|
+
const innerResult = this.tryConjugationNormalization(withoutReflexive);
|
|
271
|
+
if (innerResult && innerResult.stem !== withoutReflexive) {
|
|
272
|
+
// It's a reflexive conjugated form
|
|
273
|
+
return normalized(innerResult.stem, innerResult.confidence * 0.95, {
|
|
274
|
+
removedSuffixes: [suffix, ...(innerResult.metadata?.removedSuffixes || [])],
|
|
275
|
+
conjugationType: 'reflexive',
|
|
276
|
+
});
|
|
277
|
+
}
|
|
278
|
+
}
|
|
279
|
+
}
|
|
280
|
+
|
|
281
|
+
return null;
|
|
282
|
+
}
|
|
283
|
+
|
|
284
|
+
/**
|
|
285
|
+
* Try to normalize a conjugated verb to its infinitive.
|
|
286
|
+
*/
|
|
287
|
+
private tryConjugationNormalization(word: string): NormalizationResult | null {
|
|
288
|
+
for (const rule of ALL_ENDINGS) {
|
|
289
|
+
if (word.endsWith(rule.ending)) {
|
|
290
|
+
const stemBase = word.slice(0, -rule.ending.length);
|
|
291
|
+
|
|
292
|
+
// Must have a meaningful stem (at least 2 characters)
|
|
293
|
+
if (stemBase.length < 2) continue;
|
|
294
|
+
|
|
295
|
+
// Reconstruct infinitive
|
|
296
|
+
const infinitive = stemBase + rule.stem;
|
|
297
|
+
|
|
298
|
+
return normalized(infinitive, rule.confidence, {
|
|
299
|
+
removedSuffixes: [rule.ending],
|
|
300
|
+
conjugationType: rule.type,
|
|
301
|
+
});
|
|
302
|
+
}
|
|
303
|
+
}
|
|
304
|
+
|
|
305
|
+
return null;
|
|
306
|
+
}
|
|
307
|
+
}
|
|
308
|
+
|
|
309
|
+
// Shared Portuguese normalizer instance, created once when this module loads.
export const portugueseMorphologicalNormalizer: PortugueseMorphologicalNormalizer =
  new PortugueseMorphologicalNormalizer();
|
|
@@ -0,0 +1,327 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Spanish Morphological Normalizer
|
|
3
|
+
*
|
|
4
|
+
* Reduces Spanish verb conjugations to their infinitive forms.
|
|
5
|
+
* Spanish has three verb conjugation classes (-ar, -er, -ir) and
|
|
6
|
+
* supports reflexive verbs (verbs with -se suffix).
|
|
7
|
+
*
|
|
8
|
+
* Key features:
|
|
9
|
+
* - Reflexive verb handling: mostrarse → mostrar, ocultarse → ocultar
|
|
10
|
+
* - Regular conjugation patterns for -ar, -er, -ir verbs
|
|
11
|
+
* - Suffix rules only: irregular and stem-changing verbs are not resolved
|
|
12
|
+
*
|
|
13
|
+
* Examples:
|
|
14
|
+
* mostrarse → mostrar (reflexive infinitive)
|
|
15
|
+
* alternando → alternar (gerund)
|
|
16
|
+
* escondido → esconder (past participle)
|
|
17
|
+
* alterna → alternar (3rd person present)
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
import type { MorphologicalNormalizer, NormalizationResult, ConjugationType } from './types';
|
|
21
|
+
import { noChange, normalized } from './types';
|
|
22
|
+
|
|
23
|
+
/**
 * Report whether the given text contains a letter written with a Spanish
 * diacritic: an acute-accented vowel, ü or ñ, in either case.
 *
 * @param char - Normally a single character; for longer input the result is
 *   true as soon as any character matches.
 * @returns true when at least one such letter is present.
 */
function isSpanishSpecificLetter(char: string): boolean {
  const spanishOnlyLetter = /[áéíóúüñÁÉÍÓÚÜÑ]/;
  return spanishOnlyLetter.test(char);
}
|
|
29
|
+
|
|
30
|
+
/**
 * Heuristic test for "this word could be a Spanish verb form".
 *
 * A word qualifies when its lower-cased form ends in an infinitive ending
 * (-ar, -er, -ir), a gerund or participle ending (-ando, -iendo, -ado, -ido),
 * or a reflexive infinitive ending (-arse, -erse, -irse), or when it contains
 * any Spanish-specific letter.
 *
 * @param word - Word in any letter case.
 * @returns true when any of the checks above succeeds.
 */
function looksLikeSpanishVerb(word: string): boolean {
  const verbLikeEndings = [
    // Infinitives
    'ar',
    'er',
    'ir',
    // Gerunds and past participles
    'ando',
    'iendo',
    'ado',
    'ido',
    // Reflexive infinitives
    'arse',
    'erse',
    'irse',
  ];

  const lowered = word.toLowerCase();
  if (verbLikeEndings.some(ending => lowered.endsWith(ending))) {
    return true;
  }

  // Fall back to scanning for accented vowels, ü or ñ.
  return Array.from(word).some(letter => isSpanishSpecificLetter(letter));
}
|
|
49
|
+
|
|
50
|
+
/**
|
|
51
|
+
* Reflexive pronoun patterns that can be attached to verbs.
|
|
52
|
+
*/
|
|
53
|
+
const REFLEXIVE_SUFFIXES = ['se', 'me', 'te', 'nos', 'os'];
|
|
54
|
+
|
|
55
|
+
/**
|
|
56
|
+
* Reflexive pronouns that appear before conjugated verbs.
|
|
57
|
+
* Note: These are handled at the tokenizer level, not here.
|
|
58
|
+
*/
|
|
59
|
+
// const REFLEXIVE_PREFIXES = ['me', 'te', 'se', 'nos', 'os'];
|
|
60
|
+
|
|
61
|
+
/**
 * Conjugation endings of -ar verbs.
 *
 * Each rule pairs an ending with the infinitive ending that replaces it
 * (always 'ar' here), a heuristic confidence and the conjugation type.
 * 'amos' and 'a' are listed twice with different types; a consumer that stops
 * at the first matching ending only ever sees the earlier entry.
 */
const AR_ENDINGS: readonly {
  ending: string;
  stem: string;
  confidence: number;
  type: ConjugationType;
}[] = (() => {
  // Builds one -ar rule; the replacement infinitive ending is fixed.
  const ar = (ending: string, confidence: number, type: ConjugationType) => ({
    ending,
    stem: 'ar',
    confidence,
    type,
  });

  return [
    // Gerund (-ando)
    ar('ando', 0.88, 'gerund'),
    // Past participle (-ado), with gender and number variants
    ar('ado', 0.88, 'participle'),
    ar('ada', 0.88, 'participle'),
    ar('ados', 0.88, 'participle'),
    ar('adas', 0.88, 'participle'),
    // Present indicative
    ar('o', 0.75, 'present'), // yo
    ar('as', 0.82, 'present'), // tú
    ar('a', 0.75, 'present'), // él/ella
    ar('amos', 0.85, 'present'), // nosotros
    ar('áis', 0.85, 'present'), // vosotros
    ar('ais', 0.82, 'present'), // vosotros (no accent)
    ar('an', 0.8, 'present'), // ellos
    // Preterite
    ar('é', 0.85, 'past'), // yo
    ar('aste', 0.88, 'past'), // tú
    ar('ó', 0.82, 'past'), // él/ella
    ar('amos', 0.85, 'past'), // nosotros (same as present)
    ar('asteis', 0.88, 'past'), // vosotros
    ar('aron', 0.88, 'past'), // ellos
    // Imperfect
    ar('aba', 0.88, 'past'), // yo/él
    ar('abas', 0.88, 'past'), // tú
    ar('ábamos', 0.88, 'past'), // nosotros
    ar('abamos', 0.85, 'past'), // nosotros (no accent)
    ar('abais', 0.88, 'past'), // vosotros
    ar('aban', 0.88, 'past'), // ellos
    // Subjunctive
    ar('e', 0.72, 'subjunctive'), // yo/él (ambiguous)
    ar('es', 0.78, 'subjunctive'), // tú
    ar('emos', 0.82, 'subjunctive'), // nosotros
    ar('éis', 0.85, 'subjunctive'), // vosotros
    ar('eis', 0.82, 'subjunctive'), // vosotros (no accent)
    ar('en', 0.78, 'subjunctive'), // ellos
    // Imperative
    ar('a', 0.75, 'imperative'), // tú (same as 3rd present)
    ar('ad', 0.85, 'imperative'), // vosotros
    // Infinitive
    ar('ar', 0.92, 'dictionary'),
  ];
})();
|
|
112
|
+
|
|
113
|
+
/**
 * Conjugation endings of -er verbs.
 *
 * Each rule pairs an ending with the infinitive ending that replaces it
 * (always 'er' here), a heuristic confidence and the conjugation type.
 */
const ER_ENDINGS: readonly {
  ending: string;
  stem: string;
  confidence: number;
  type: ConjugationType;
}[] = (() => {
  // Builds one -er rule; the replacement infinitive ending is fixed.
  const er = (ending: string, confidence: number, type: ConjugationType) => ({
    ending,
    stem: 'er',
    confidence,
    type,
  });

  return [
    // Gerund (-iendo)
    er('iendo', 0.88, 'gerund'),
    // Past participle (-ido), with gender and number variants
    er('ido', 0.85, 'participle'),
    er('ida', 0.85, 'participle'),
    er('idos', 0.85, 'participle'),
    er('idas', 0.85, 'participle'),
    // Present indicative
    er('o', 0.72, 'present'), // yo
    er('es', 0.78, 'present'), // tú
    er('e', 0.72, 'present'), // él/ella
    er('emos', 0.85, 'present'), // nosotros
    er('éis', 0.85, 'present'), // vosotros
    er('eis', 0.82, 'present'), // vosotros (no accent)
    er('en', 0.78, 'present'), // ellos
    // Preterite
    er('í', 0.85, 'past'), // yo
    er('iste', 0.88, 'past'), // tú
    er('ió', 0.85, 'past'), // él/ella
    er('io', 0.82, 'past'), // él/ella (no accent)
    er('imos', 0.85, 'past'), // nosotros
    er('isteis', 0.88, 'past'), // vosotros
    er('ieron', 0.88, 'past'), // ellos
    // Imperfect
    er('ía', 0.88, 'past'), // yo/él
    er('ia', 0.85, 'past'), // yo/él (no accent)
    er('ías', 0.88, 'past'), // tú
    er('ias', 0.85, 'past'), // tú (no accent)
    er('íamos', 0.88, 'past'), // nosotros
    er('iamos', 0.85, 'past'), // nosotros (no accent)
    er('íais', 0.88, 'past'), // vosotros
    er('iais', 0.85, 'past'), // vosotros (no accent)
    er('ían', 0.88, 'past'), // ellos
    er('ian', 0.85, 'past'), // ellos (no accent)
    // Infinitive
    er('er', 0.92, 'dictionary'),
  ];
})();
|
|
159
|
+
|
|
160
|
+
/**
 * Conjugation endings of -ir verbs.
 *
 * Each rule pairs an ending with the infinitive ending that replaces it
 * (always 'ir' here), a heuristic confidence and the conjugation type.
 * Most endings are spelled the same as in the -er table; only 'ís', 'is' and
 * 'ir' are specific to this one.
 */
const IR_ENDINGS: readonly {
  ending: string;
  stem: string;
  confidence: number;
  type: ConjugationType;
}[] = (() => {
  // Builds one -ir rule; the replacement infinitive ending is fixed.
  const ir = (ending: string, confidence: number, type: ConjugationType) => ({
    ending,
    stem: 'ir',
    confidence,
    type,
  });

  return [
    // Gerund (-iendo)
    ir('iendo', 0.88, 'gerund'),
    // Past participle (-ido), with gender and number variants
    ir('ido', 0.85, 'participle'),
    ir('ida', 0.85, 'participle'),
    ir('idos', 0.85, 'participle'),
    ir('idas', 0.85, 'participle'),
    // Present indicative
    ir('o', 0.72, 'present'), // yo
    ir('es', 0.78, 'present'), // tú
    ir('e', 0.72, 'present'), // él/ella
    ir('imos', 0.85, 'present'), // nosotros
    ir('ís', 0.85, 'present'), // vosotros
    ir('is', 0.82, 'present'), // vosotros (no accent)
    ir('en', 0.78, 'present'), // ellos
    // Preterite (same as -er)
    ir('í', 0.85, 'past'), // yo
    ir('iste', 0.88, 'past'), // tú
    ir('ió', 0.85, 'past'), // él/ella
    ir('io', 0.82, 'past'), // él/ella (no accent)
    ir('imos', 0.85, 'past'), // nosotros
    ir('isteis', 0.88, 'past'), // vosotros
    ir('ieron', 0.88, 'past'), // ellos
    // Imperfect (same as -er)
    ir('ía', 0.88, 'past'),
    ir('ia', 0.85, 'past'),
    ir('ías', 0.88, 'past'),
    ir('ias', 0.85, 'past'),
    ir('íamos', 0.88, 'past'),
    ir('iamos', 0.85, 'past'),
    ir('íais', 0.88, 'past'),
    ir('iais', 0.85, 'past'),
    ir('ían', 0.88, 'past'),
    ir('ian', 0.85, 'past'),
    // Infinitive
    ir('ir', 0.92, 'dictionary'),
  ];
})();
|
|
206
|
+
|
|
207
|
+
/**
 * Every conjugation rule from the -ar, -er and -ir tables in a single list,
 * ordered so that longer endings come before shorter ones.
 *
 * concat() returns a new array, so the source tables are never reordered.
 * Rules whose endings have equal length keep their -ar, -er, -ir order,
 * provided Array.prototype.sort is stable (guaranteed from ES2019). A lookup
 * that stops at the first match therefore prefers -ar over -er over -ir
 * whenever the same ending appears in more than one table.
 */
const ALL_ENDINGS = AR_ENDINGS.concat(ER_ENDINGS, IR_ENDINGS).sort(
  (left, right) => right.ending.length - left.ending.length
);
|
|
213
|
+
|
|
214
|
+
/**
 * Spanish morphological normalizer.
 *
 * Reduces a Spanish verb form to a candidate infinitive using suffix rules
 * only: an attached pronoun from REFLEXIVE_SUFFIXES is stripped and/or a
 * conjugation ending from ALL_ENDINGS is replaced with -ar/-er/-ir. There is
 * no lexicon, so irregular and stem-changing forms are not resolved.
 */
export class SpanishMorphologicalNormalizer implements MorphologicalNormalizer {
  readonly language = 'es';

  /**
   * Check if a word might be a Spanish verb that can be normalized.
   *
   * @param word - Candidate token.
   * @returns false for words shorter than three characters; otherwise the
   *   result of looksLikeSpanishVerb.
   */
  isNormalizable(word: string): boolean {
    if (word.length < 3) return false;
    return looksLikeSpanishVerb(word);
  }

  /**
   * Normalize a Spanish word to its infinitive form.
   *
   * @param word - Word in any letter case; matching is done on a lower-cased copy.
   * @returns noChange(word) when the word already ends in -ar/-er/-ir or when
   *   no rule applies; otherwise the result of the reflexive pass or, failing
   *   that, the conjugation pass (both produce lower-cased stems).
   */
  normalize(word: string): NormalizationResult {
    const lower = word.toLowerCase();

    // A word ending in -ar/-er/-ir is already in dictionary form. None of the
    // attachable pronouns ends in "r", so such a word cannot carry a reflexive
    // suffix and needs no further check. Deliberately do NOT look for
    // pronoun + ending sequences here: stems such as dese-ar, pate-ar and
    // pos-ar legitimately end in "se", "te" and "os".
    if (lower.endsWith('ar') || lower.endsWith('er') || lower.endsWith('ir')) {
      return noChange(word);
    }

    // Try reflexive verb normalization first (highest priority)
    const reflexiveResult = this.tryReflexiveNormalization(lower);
    if (reflexiveResult) return reflexiveResult;

    // Try standard conjugation normalization
    const conjugationResult = this.tryConjugationNormalization(lower);
    if (conjugationResult) return conjugationResult;

    // No normalization needed
    return noChange(word);
  }

  /**
   * Try to normalize a verb with an attached pronoun.
   * Pronouns are tried in REFLEXIVE_SUFFIXES order (-se, -me, -te, -nos, -os).
   *
   * Examples:
   *   mostrarse → mostrar
   *   ocultarse → ocultar
   *   esconderse → esconder
   *
   * @param word - Lower-cased word.
   * @returns A 'reflexive' result, or null when no pronoun leaves a verb form.
   */
  private tryReflexiveNormalization(word: string): NormalizationResult | null {
    for (const suffix of REFLEXIVE_SUFFIXES) {
      if (word.endsWith(suffix)) {
        const withoutReflexive = word.slice(0, -suffix.length);

        // Check if this looks like an infinitive
        if (
          withoutReflexive.endsWith('ar') ||
          withoutReflexive.endsWith('er') ||
          withoutReflexive.endsWith('ir')
        ) {
          // It's a reflexive infinitive (e.g., mostrarse → mostrar)
          return normalized(withoutReflexive, 0.88, {
            removedSuffixes: [suffix],
            conjugationType: 'reflexive',
          });
        }

        // Try to normalize the remaining part as a conjugated verb
        const innerResult = this.tryConjugationNormalization(withoutReflexive);
        if (innerResult && innerResult.stem !== withoutReflexive) {
          // Pronoun attached to a conjugated form (e.g., levantate → levantar).
          // Two stacked heuristics, so the inner confidence is scaled down.
          return normalized(innerResult.stem, innerResult.confidence * 0.95, {
            removedSuffixes: [suffix, ...(innerResult.metadata?.removedSuffixes ?? [])],
            conjugationType: 'reflexive',
          });
        }
      }
    }

    return null;
  }

  /**
   * Try to normalize a conjugated verb to its infinitive.
   *
   * Scans ALL_ENDINGS in order and applies the first rule whose ending matches
   * and leaves a stem of at least two characters.
   *
   * @param word - Lower-cased word.
   * @returns The normalized result, or null when no rule applies.
   */
  private tryConjugationNormalization(word: string): NormalizationResult | null {
    for (const rule of ALL_ENDINGS) {
      if (word.endsWith(rule.ending)) {
        const stemBase = word.slice(0, -rule.ending.length);

        // Must have a meaningful stem (at least 2 characters)
        if (stemBase.length < 2) continue;

        // Reconstruct infinitive
        const infinitive = stemBase + rule.stem;

        return normalized(infinitive, rule.confidence, {
          removedSuffixes: [rule.ending],
          conjugationType: rule.type,
        });
      }
    }

    return null;
  }
}
|
|
325
|
+
|
|
326
|
+
// Shared Spanish normalizer instance, created once when this module loads.
export const spanishMorphologicalNormalizer: SpanishMorphologicalNormalizer =
  new SpanishMorphologicalNormalizer();
|