@lokascript/semantic 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +686 -0
- package/dist/browser-ar.ar.global.js +2 -0
- package/dist/browser-core.core.global.js +2 -0
- package/dist/browser-de.de.global.js +2 -0
- package/dist/browser-east-asian.east-asian.global.js +2 -0
- package/dist/browser-en-tr.en-tr.global.js +2 -0
- package/dist/browser-en.en.global.js +2 -0
- package/dist/browser-es-en.es-en.global.js +2 -0
- package/dist/browser-es.es.global.js +2 -0
- package/dist/browser-fr.fr.global.js +2 -0
- package/dist/browser-id.id.global.js +2 -0
- package/dist/browser-ja.ja.global.js +2 -0
- package/dist/browser-ko.ko.global.js +2 -0
- package/dist/browser-lazy.lazy.global.js +2 -0
- package/dist/browser-priority.priority.global.js +2 -0
- package/dist/browser-pt.pt.global.js +2 -0
- package/dist/browser-qu.qu.global.js +2 -0
- package/dist/browser-sw.sw.global.js +2 -0
- package/dist/browser-tr.tr.global.js +2 -0
- package/dist/browser-western.western.global.js +2 -0
- package/dist/browser-zh.zh.global.js +2 -0
- package/dist/browser.global.js +3 -0
- package/dist/browser.global.js.map +1 -0
- package/dist/index.cjs +35051 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +3426 -0
- package/dist/index.d.ts +3426 -0
- package/dist/index.js +34890 -0
- package/dist/index.js.map +1 -0
- package/dist/languages/ar.d.ts +78 -0
- package/dist/languages/ar.js +1622 -0
- package/dist/languages/ar.js.map +1 -0
- package/dist/languages/de.d.ts +38 -0
- package/dist/languages/de.js +1168 -0
- package/dist/languages/de.js.map +1 -0
- package/dist/languages/en.d.ts +44 -0
- package/dist/languages/en.js +3491 -0
- package/dist/languages/en.js.map +1 -0
- package/dist/languages/es.d.ts +52 -0
- package/dist/languages/es.js +1493 -0
- package/dist/languages/es.js.map +1 -0
- package/dist/languages/fr.d.ts +37 -0
- package/dist/languages/fr.js +1159 -0
- package/dist/languages/fr.js.map +1 -0
- package/dist/languages/id.d.ts +35 -0
- package/dist/languages/id.js +1152 -0
- package/dist/languages/id.js.map +1 -0
- package/dist/languages/ja.d.ts +53 -0
- package/dist/languages/ja.js +1430 -0
- package/dist/languages/ja.js.map +1 -0
- package/dist/languages/ko.d.ts +51 -0
- package/dist/languages/ko.js +1729 -0
- package/dist/languages/ko.js.map +1 -0
- package/dist/languages/pt.d.ts +37 -0
- package/dist/languages/pt.js +1127 -0
- package/dist/languages/pt.js.map +1 -0
- package/dist/languages/qu.d.ts +36 -0
- package/dist/languages/qu.js +1143 -0
- package/dist/languages/qu.js.map +1 -0
- package/dist/languages/sw.d.ts +35 -0
- package/dist/languages/sw.js +1147 -0
- package/dist/languages/sw.js.map +1 -0
- package/dist/languages/tr.d.ts +45 -0
- package/dist/languages/tr.js +1529 -0
- package/dist/languages/tr.js.map +1 -0
- package/dist/languages/zh.d.ts +58 -0
- package/dist/languages/zh.js +1257 -0
- package/dist/languages/zh.js.map +1 -0
- package/dist/types-C4dcj53L.d.ts +600 -0
- package/package.json +202 -0
- package/src/__test-utils__/index.ts +7 -0
- package/src/__test-utils__/test-helpers.ts +8 -0
- package/src/__types__/test-helpers.ts +122 -0
- package/src/analysis/index.ts +479 -0
- package/src/ast-builder/command-mappers.ts +1133 -0
- package/src/ast-builder/expression-parser/index.ts +41 -0
- package/src/ast-builder/expression-parser/parser.ts +563 -0
- package/src/ast-builder/expression-parser/tokenizer.ts +394 -0
- package/src/ast-builder/expression-parser/types.ts +208 -0
- package/src/ast-builder/index.ts +536 -0
- package/src/ast-builder/value-converters.ts +172 -0
- package/src/bridge.ts +275 -0
- package/src/browser-ar.ts +162 -0
- package/src/browser-core.ts +231 -0
- package/src/browser-de.ts +162 -0
- package/src/browser-east-asian.ts +173 -0
- package/src/browser-en-tr.ts +165 -0
- package/src/browser-en.ts +157 -0
- package/src/browser-es-en.ts +200 -0
- package/src/browser-es.ts +170 -0
- package/src/browser-fr.ts +162 -0
- package/src/browser-id.ts +162 -0
- package/src/browser-ja.ts +162 -0
- package/src/browser-ko.ts +162 -0
- package/src/browser-lazy.ts +189 -0
- package/src/browser-priority.ts +214 -0
- package/src/browser-pt.ts +162 -0
- package/src/browser-qu.ts +162 -0
- package/src/browser-sw.ts +162 -0
- package/src/browser-tr.ts +162 -0
- package/src/browser-western.ts +181 -0
- package/src/browser-zh.ts +162 -0
- package/src/browser.ts +268 -0
- package/src/cache/index.ts +14 -0
- package/src/cache/semantic-cache.ts +344 -0
- package/src/core-bridge.ts +372 -0
- package/src/explicit/converter.ts +258 -0
- package/src/explicit/index.ts +18 -0
- package/src/explicit/parser.ts +236 -0
- package/src/explicit/renderer.ts +424 -0
- package/src/generators/command-schemas.ts +1636 -0
- package/src/generators/event-handler-generator.ts +109 -0
- package/src/generators/index.ts +117 -0
- package/src/generators/language-profiles.ts +139 -0
- package/src/generators/pattern-generator.ts +537 -0
- package/src/generators/profiles/arabic.ts +131 -0
- package/src/generators/profiles/bengali.ts +132 -0
- package/src/generators/profiles/chinese.ts +124 -0
- package/src/generators/profiles/english.ts +113 -0
- package/src/generators/profiles/french.ts +125 -0
- package/src/generators/profiles/german.ts +126 -0
- package/src/generators/profiles/hindi.ts +146 -0
- package/src/generators/profiles/index.ts +46 -0
- package/src/generators/profiles/indonesian.ts +125 -0
- package/src/generators/profiles/italian.ts +139 -0
- package/src/generators/profiles/japanese.ts +149 -0
- package/src/generators/profiles/korean.ts +127 -0
- package/src/generators/profiles/marker-templates.ts +288 -0
- package/src/generators/profiles/ms.ts +130 -0
- package/src/generators/profiles/polish.ts +249 -0
- package/src/generators/profiles/portuguese.ts +115 -0
- package/src/generators/profiles/quechua.ts +113 -0
- package/src/generators/profiles/russian.ts +260 -0
- package/src/generators/profiles/spanish.ts +130 -0
- package/src/generators/profiles/swahili.ts +129 -0
- package/src/generators/profiles/thai.ts +132 -0
- package/src/generators/profiles/tl.ts +128 -0
- package/src/generators/profiles/turkish.ts +124 -0
- package/src/generators/profiles/types.ts +165 -0
- package/src/generators/profiles/ukrainian.ts +270 -0
- package/src/generators/profiles/vietnamese.ts +133 -0
- package/src/generators/schema-error-codes.ts +160 -0
- package/src/generators/schema-validator.ts +391 -0
- package/src/index.ts +429 -0
- package/src/language-building-schema.ts +3170 -0
- package/src/language-loader.ts +394 -0
- package/src/languages/_all.ts +65 -0
- package/src/languages/ar.ts +15 -0
- package/src/languages/bn.ts +16 -0
- package/src/languages/de.ts +15 -0
- package/src/languages/en.ts +29 -0
- package/src/languages/es.ts +15 -0
- package/src/languages/fr.ts +15 -0
- package/src/languages/hi.ts +26 -0
- package/src/languages/id.ts +15 -0
- package/src/languages/index.ts +18 -0
- package/src/languages/it.ts +15 -0
- package/src/languages/ja.ts +15 -0
- package/src/languages/ko.ts +15 -0
- package/src/languages/ms.ts +16 -0
- package/src/languages/pl.ts +18 -0
- package/src/languages/pt.ts +15 -0
- package/src/languages/qu.ts +15 -0
- package/src/languages/ru.ts +26 -0
- package/src/languages/sw.ts +15 -0
- package/src/languages/th.ts +16 -0
- package/src/languages/tl.ts +16 -0
- package/src/languages/tr.ts +15 -0
- package/src/languages/uk.ts +26 -0
- package/src/languages/vi.ts +16 -0
- package/src/languages/zh.ts +15 -0
- package/src/parser/index.ts +15 -0
- package/src/parser/pattern-matcher.ts +1181 -0
- package/src/parser/semantic-parser.ts +573 -0
- package/src/parser/utils/index.ts +35 -0
- package/src/parser/utils/marker-resolution.ts +111 -0
- package/src/parser/utils/possessive-keywords.ts +43 -0
- package/src/parser/utils/role-positioning.ts +70 -0
- package/src/parser/utils/type-validation.ts +134 -0
- package/src/patterns/add/ar.ts +71 -0
- package/src/patterns/add/bn.ts +70 -0
- package/src/patterns/add/hi.ts +69 -0
- package/src/patterns/add/index.ts +87 -0
- package/src/patterns/add/it.ts +61 -0
- package/src/patterns/add/ja.ts +93 -0
- package/src/patterns/add/ko.ts +74 -0
- package/src/patterns/add/ms.ts +30 -0
- package/src/patterns/add/pl.ts +62 -0
- package/src/patterns/add/ru.ts +62 -0
- package/src/patterns/add/th.ts +49 -0
- package/src/patterns/add/tl.ts +30 -0
- package/src/patterns/add/tr.ts +71 -0
- package/src/patterns/add/uk.ts +62 -0
- package/src/patterns/add/vi.ts +61 -0
- package/src/patterns/add/zh.ts +71 -0
- package/src/patterns/builders.ts +207 -0
- package/src/patterns/decrement/bn.ts +70 -0
- package/src/patterns/decrement/de.ts +42 -0
- package/src/patterns/decrement/hi.ts +68 -0
- package/src/patterns/decrement/index.ts +79 -0
- package/src/patterns/decrement/it.ts +69 -0
- package/src/patterns/decrement/ms.ts +30 -0
- package/src/patterns/decrement/pl.ts +58 -0
- package/src/patterns/decrement/ru.ts +58 -0
- package/src/patterns/decrement/th.ts +49 -0
- package/src/patterns/decrement/tl.ts +30 -0
- package/src/patterns/decrement/tr.ts +48 -0
- package/src/patterns/decrement/uk.ts +58 -0
- package/src/patterns/decrement/vi.ts +61 -0
- package/src/patterns/decrement/zh.ts +32 -0
- package/src/patterns/en.ts +302 -0
- package/src/patterns/event-handler/ar.ts +151 -0
- package/src/patterns/event-handler/bn.ts +72 -0
- package/src/patterns/event-handler/de.ts +117 -0
- package/src/patterns/event-handler/en.ts +117 -0
- package/src/patterns/event-handler/es.ts +136 -0
- package/src/patterns/event-handler/fr.ts +117 -0
- package/src/patterns/event-handler/hi.ts +64 -0
- package/src/patterns/event-handler/id.ts +117 -0
- package/src/patterns/event-handler/index.ts +119 -0
- package/src/patterns/event-handler/it.ts +54 -0
- package/src/patterns/event-handler/ja.ts +118 -0
- package/src/patterns/event-handler/ko.ts +133 -0
- package/src/patterns/event-handler/ms.ts +30 -0
- package/src/patterns/event-handler/pl.ts +62 -0
- package/src/patterns/event-handler/pt.ts +117 -0
- package/src/patterns/event-handler/qu.ts +66 -0
- package/src/patterns/event-handler/ru.ts +62 -0
- package/src/patterns/event-handler/shared.ts +270 -0
- package/src/patterns/event-handler/sw.ts +117 -0
- package/src/patterns/event-handler/th.ts +53 -0
- package/src/patterns/event-handler/tl.ts +30 -0
- package/src/patterns/event-handler/tr.ts +170 -0
- package/src/patterns/event-handler/uk.ts +62 -0
- package/src/patterns/event-handler/vi.ts +61 -0
- package/src/patterns/event-handler/zh.ts +150 -0
- package/src/patterns/get/ar.ts +49 -0
- package/src/patterns/get/bn.ts +47 -0
- package/src/patterns/get/de.ts +32 -0
- package/src/patterns/get/hi.ts +52 -0
- package/src/patterns/get/index.ts +83 -0
- package/src/patterns/get/it.ts +56 -0
- package/src/patterns/get/ja.ts +53 -0
- package/src/patterns/get/ko.ts +53 -0
- package/src/patterns/get/ms.ts +30 -0
- package/src/patterns/get/pl.ts +57 -0
- package/src/patterns/get/ru.ts +57 -0
- package/src/patterns/get/th.ts +29 -0
- package/src/patterns/get/tl.ts +30 -0
- package/src/patterns/get/uk.ts +57 -0
- package/src/patterns/get/vi.ts +48 -0
- package/src/patterns/grammar-transformed/index.ts +39 -0
- package/src/patterns/grammar-transformed/ja.ts +1713 -0
- package/src/patterns/grammar-transformed/ko.ts +1311 -0
- package/src/patterns/grammar-transformed/tr.ts +1067 -0
- package/src/patterns/hide/ar.ts +67 -0
- package/src/patterns/hide/bn.ts +47 -0
- package/src/patterns/hide/de.ts +36 -0
- package/src/patterns/hide/hi.ts +61 -0
- package/src/patterns/hide/index.ts +91 -0
- package/src/patterns/hide/it.ts +56 -0
- package/src/patterns/hide/ja.ts +69 -0
- package/src/patterns/hide/ko.ts +69 -0
- package/src/patterns/hide/ms.ts +30 -0
- package/src/patterns/hide/pl.ts +57 -0
- package/src/patterns/hide/ru.ts +57 -0
- package/src/patterns/hide/th.ts +29 -0
- package/src/patterns/hide/tl.ts +30 -0
- package/src/patterns/hide/tr.ts +65 -0
- package/src/patterns/hide/uk.ts +57 -0
- package/src/patterns/hide/vi.ts +56 -0
- package/src/patterns/hide/zh.ts +68 -0
- package/src/patterns/increment/bn.ts +70 -0
- package/src/patterns/increment/de.ts +36 -0
- package/src/patterns/increment/hi.ts +68 -0
- package/src/patterns/increment/index.ts +79 -0
- package/src/patterns/increment/it.ts +69 -0
- package/src/patterns/increment/ms.ts +30 -0
- package/src/patterns/increment/pl.ts +58 -0
- package/src/patterns/increment/ru.ts +58 -0
- package/src/patterns/increment/th.ts +49 -0
- package/src/patterns/increment/tl.ts +30 -0
- package/src/patterns/increment/tr.ts +52 -0
- package/src/patterns/increment/uk.ts +58 -0
- package/src/patterns/increment/vi.ts +61 -0
- package/src/patterns/increment/zh.ts +32 -0
- package/src/patterns/index.ts +84 -0
- package/src/patterns/languages/en/control-flow.ts +93 -0
- package/src/patterns/languages/en/fetch.ts +62 -0
- package/src/patterns/languages/en/index.ts +42 -0
- package/src/patterns/languages/en/repeat.ts +67 -0
- package/src/patterns/languages/en/set.ts +48 -0
- package/src/patterns/languages/en/swap.ts +38 -0
- package/src/patterns/languages/en/temporal.ts +57 -0
- package/src/patterns/put/ar.ts +74 -0
- package/src/patterns/put/bn.ts +53 -0
- package/src/patterns/put/en.ts +74 -0
- package/src/patterns/put/es.ts +74 -0
- package/src/patterns/put/hi.ts +69 -0
- package/src/patterns/put/id.ts +96 -0
- package/src/patterns/put/index.ts +99 -0
- package/src/patterns/put/it.ts +56 -0
- package/src/patterns/put/ja.ts +75 -0
- package/src/patterns/put/ko.ts +67 -0
- package/src/patterns/put/ms.ts +30 -0
- package/src/patterns/put/pl.ts +81 -0
- package/src/patterns/put/ru.ts +85 -0
- package/src/patterns/put/th.ts +32 -0
- package/src/patterns/put/tl.ts +30 -0
- package/src/patterns/put/tr.ts +67 -0
- package/src/patterns/put/uk.ts +85 -0
- package/src/patterns/put/vi.ts +72 -0
- package/src/patterns/put/zh.ts +62 -0
- package/src/patterns/registry.ts +163 -0
- package/src/patterns/remove/ar.ts +71 -0
- package/src/patterns/remove/bn.ts +68 -0
- package/src/patterns/remove/hi.ts +69 -0
- package/src/patterns/remove/index.ts +87 -0
- package/src/patterns/remove/it.ts +69 -0
- package/src/patterns/remove/ja.ts +74 -0
- package/src/patterns/remove/ko.ts +78 -0
- package/src/patterns/remove/ms.ts +30 -0
- package/src/patterns/remove/pl.ts +62 -0
- package/src/patterns/remove/ru.ts +62 -0
- package/src/patterns/remove/th.ts +49 -0
- package/src/patterns/remove/tl.ts +30 -0
- package/src/patterns/remove/tr.ts +78 -0
- package/src/patterns/remove/uk.ts +62 -0
- package/src/patterns/remove/vi.ts +61 -0
- package/src/patterns/remove/zh.ts +72 -0
- package/src/patterns/set/ar.ts +84 -0
- package/src/patterns/set/bn.ts +53 -0
- package/src/patterns/set/de.ts +84 -0
- package/src/patterns/set/es.ts +92 -0
- package/src/patterns/set/fr.ts +88 -0
- package/src/patterns/set/hi.ts +56 -0
- package/src/patterns/set/id.ts +84 -0
- package/src/patterns/set/index.ts +107 -0
- package/src/patterns/set/it.ts +56 -0
- package/src/patterns/set/ja.ts +86 -0
- package/src/patterns/set/ko.ts +85 -0
- package/src/patterns/set/ms.ts +30 -0
- package/src/patterns/set/pl.ts +57 -0
- package/src/patterns/set/pt.ts +84 -0
- package/src/patterns/set/ru.ts +57 -0
- package/src/patterns/set/th.ts +31 -0
- package/src/patterns/set/tl.ts +30 -0
- package/src/patterns/set/tr.ts +107 -0
- package/src/patterns/set/uk.ts +57 -0
- package/src/patterns/set/vi.ts +53 -0
- package/src/patterns/set/zh.ts +84 -0
- package/src/patterns/show/ar.ts +67 -0
- package/src/patterns/show/bn.ts +47 -0
- package/src/patterns/show/de.ts +32 -0
- package/src/patterns/show/fr.ts +32 -0
- package/src/patterns/show/hi.ts +61 -0
- package/src/patterns/show/index.ts +95 -0
- package/src/patterns/show/it.ts +56 -0
- package/src/patterns/show/ja.ts +69 -0
- package/src/patterns/show/ko.ts +73 -0
- package/src/patterns/show/ms.ts +30 -0
- package/src/patterns/show/pl.ts +57 -0
- package/src/patterns/show/ru.ts +57 -0
- package/src/patterns/show/th.ts +29 -0
- package/src/patterns/show/tl.ts +30 -0
- package/src/patterns/show/tr.ts +65 -0
- package/src/patterns/show/uk.ts +57 -0
- package/src/patterns/show/vi.ts +56 -0
- package/src/patterns/show/zh.ts +68 -0
- package/src/patterns/take/ar.ts +51 -0
- package/src/patterns/take/index.ts +31 -0
- package/src/patterns/toggle/ar.ts +61 -0
- package/src/patterns/toggle/bn.ts +70 -0
- package/src/patterns/toggle/en.ts +61 -0
- package/src/patterns/toggle/es.ts +61 -0
- package/src/patterns/toggle/hi.ts +80 -0
- package/src/patterns/toggle/index.ts +95 -0
- package/src/patterns/toggle/it.ts +69 -0
- package/src/patterns/toggle/ja.ts +156 -0
- package/src/patterns/toggle/ko.ts +113 -0
- package/src/patterns/toggle/ms.ts +30 -0
- package/src/patterns/toggle/pl.ts +62 -0
- package/src/patterns/toggle/ru.ts +62 -0
- package/src/patterns/toggle/th.ts +50 -0
- package/src/patterns/toggle/tl.ts +30 -0
- package/src/patterns/toggle/tr.ts +88 -0
- package/src/patterns/toggle/uk.ts +62 -0
- package/src/patterns/toggle/vi.ts +61 -0
- package/src/patterns/toggle/zh.ts +99 -0
- package/src/public-api.ts +286 -0
- package/src/registry.ts +441 -0
- package/src/tokenizers/arabic.ts +723 -0
- package/src/tokenizers/base.ts +1300 -0
- package/src/tokenizers/bengali.ts +289 -0
- package/src/tokenizers/chinese.ts +481 -0
- package/src/tokenizers/english.ts +416 -0
- package/src/tokenizers/french.ts +326 -0
- package/src/tokenizers/german.ts +324 -0
- package/src/tokenizers/hindi.ts +319 -0
- package/src/tokenizers/index.ts +127 -0
- package/src/tokenizers/indonesian.ts +306 -0
- package/src/tokenizers/italian.ts +458 -0
- package/src/tokenizers/japanese.ts +447 -0
- package/src/tokenizers/korean.ts +642 -0
- package/src/tokenizers/morphology/arabic-normalizer.ts +242 -0
- package/src/tokenizers/morphology/french-normalizer.ts +268 -0
- package/src/tokenizers/morphology/german-normalizer.ts +256 -0
- package/src/tokenizers/morphology/index.ts +46 -0
- package/src/tokenizers/morphology/italian-normalizer.ts +329 -0
- package/src/tokenizers/morphology/japanese-normalizer.ts +288 -0
- package/src/tokenizers/morphology/korean-normalizer.ts +428 -0
- package/src/tokenizers/morphology/polish-normalizer.ts +264 -0
- package/src/tokenizers/morphology/portuguese-normalizer.ts +310 -0
- package/src/tokenizers/morphology/spanish-normalizer.ts +327 -0
- package/src/tokenizers/morphology/turkish-normalizer.ts +412 -0
- package/src/tokenizers/morphology/types.ts +211 -0
- package/src/tokenizers/ms.ts +198 -0
- package/src/tokenizers/polish.ts +354 -0
- package/src/tokenizers/portuguese.ts +304 -0
- package/src/tokenizers/quechua.ts +339 -0
- package/src/tokenizers/russian.ts +375 -0
- package/src/tokenizers/spanish.ts +403 -0
- package/src/tokenizers/swahili.ts +303 -0
- package/src/tokenizers/thai.ts +236 -0
- package/src/tokenizers/tl.ts +198 -0
- package/src/tokenizers/turkish.ts +411 -0
- package/src/tokenizers/ukrainian.ts +369 -0
- package/src/tokenizers/vietnamese.ts +410 -0
- package/src/types/grammar-types.ts +617 -0
- package/src/types/unified-profile.ts +267 -0
- package/src/types.ts +709 -0
- package/src/utils/confidence-calculator.ts +147 -0
- package/src/validators/command-validator.ts +380 -0
- package/src/validators/index.ts +15 -0
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Arabic Morphological Normalizer
|
|
3
|
+
*
|
|
4
|
+
* Arabic uses a complex root-pattern morphology system where most words
|
|
5
|
+
* are derived from triliteral (3-consonant) roots. This normalizer focuses
|
|
6
|
+
* on prefix/suffix stripping rather than full root extraction.
|
|
7
|
+
*
|
|
8
|
+
* Key features:
|
|
9
|
+
* - Definite article prefix: ال (al-)
|
|
10
|
+
* - Conjunction/preposition prefixes: و (wa-), ف (fa-), ب (bi-), ل (li-), ك (ka-)
|
|
11
|
+
* - Verb prefixes (present tense markers): ي (ya-), ت (ta-), ن (na-), أ (a-)
|
|
12
|
+
* - Plural/gender suffixes: ون (ūn), ين (īn), ات (āt), ة (a)
|
|
13
|
+
* - Pronoun suffixes: ها (hā), هم (hum), etc.
|
|
14
|
+
* - Diacritics handling: Words with and without diacritics should match
|
|
15
|
+
*
|
|
16
|
+
* Examples:
|
|
17
|
+
* والتبديل → تبديل (and the changing → changing; reducing further to the root form بدّل is not attempted)
|
|
18
|
+
* يبدّل → بدل (he changes → change!; the shadda is dropped together with the other diacritics)
|
|
19
|
+
* المستخدمين → مستخدم (the users → user)
|
|
20
|
+
*/
|
|
21
|
+
|
|
22
|
+
import type { MorphologicalNormalizer, NormalizationResult, PrefixRule } from './types';
|
|
23
|
+
import { noChange, normalized } from './types';
|
|
24
|
+
|
|
25
|
+
/**
 * Check if a character is Arabic.
 *
 * Only the first UTF-16 code unit of `char` is inspected. An empty string
 * yields `false` because `charCodeAt(0)` is `NaN` and fails every range test.
 *
 * @param char - Single character (or a string whose first code unit is tested).
 * @returns `true` when the code unit lies in one of the Arabic blocks below.
 */
function isArabic(char: string): boolean {
  const code = char.charCodeAt(0);
  return (
    (code >= 0x0600 && code <= 0x06ff) || // Arabic
    (code >= 0x0750 && code <= 0x077f) || // Arabic Supplement
    (code >= 0x08a0 && code <= 0x08ff) || // Arabic Extended-A
    (code >= 0xfb50 && code <= 0xfdff) || // Arabic Presentation Forms-A
    // Arabic Presentation Forms-B. The block nominally ends at U+FEFF, but that
    // code point is the byte order mark / zero width no-break space, not an
    // Arabic character, so it is excluded.
    (code >= 0xfe70 && code <= 0xfefe)
  );
}
|
|
38
|
+
|
|
39
|
+
/**
 * Report whether at least one character of `word` is Arabic.
 *
 * The word is walked character by character and each one is handed to
 * `isArabic`; an empty word therefore yields `false`.
 */
function containsArabic(word: string): boolean {
  return Array.from(word).some((ch) => isArabic(ch));
}
|
|
48
|
+
|
|
49
|
+
/**
 * Strip Arabic vocalization marks (tashkeel) from a word so that vocalized
 * and unvocalized spellings compare equal.
 *
 * Removes code points U+064B through U+0652 and U+0670; every other
 * character is kept unchanged and in its original order.
 */
function removeDiacritics(word: string): string {
  let stripped = '';
  for (const ch of word) {
    const code = ch.charCodeAt(0);
    const isMark = (code >= 0x064b && code <= 0x0652) || code === 0x0670;
    if (!isMark) {
      stripped += ch;
    }
  }
  return stripped;
}
|
|
57
|
+
|
|
58
|
+
/**
|
|
59
|
+
* Prefix rules for Arabic, ordered by priority.
|
|
60
|
+
* Combined prefixes should be checked first.
|
|
61
|
+
*/
|
|
62
|
+
const COMBINED_PREFIXES: readonly PrefixRule[] = [
|
|
63
|
+
// Conjunction/preposition + definite article combinations (3 chars; the assimilated li + al form is 2)
|
|
64
|
+
{ pattern: 'وال', confidencePenalty: 0.15, prefixType: 'conjunction' }, // wa + al
|
|
65
|
+
{ pattern: 'فال', confidencePenalty: 0.15, prefixType: 'conjunction' }, // fa + al
|
|
66
|
+
{ pattern: 'بال', confidencePenalty: 0.15, prefixType: 'preposition' }, // bi + al
|
|
67
|
+
{ pattern: 'كال', confidencePenalty: 0.15, prefixType: 'preposition' }, // ka + al
|
|
68
|
+
{ pattern: 'لل', confidencePenalty: 0.12, prefixType: 'preposition' }, // li + al (assimilation)
|
|
69
|
+
];
|
|
70
|
+
|
|
71
|
+
/**
|
|
72
|
+
* Single prefix rules.
|
|
73
|
+
* Note: Single-character prefixes require minimum 3-char remaining stem
|
|
74
|
+
* to avoid over-stripping words where the character is part of the root.
|
|
75
|
+
*/
|
|
76
|
+
const SINGLE_PREFIXES: readonly PrefixRule[] = [
|
|
77
|
+
// List order is match priority: normalize() removes only the first rule that
// matches and still leaves at least minRemaining characters.
// Definite article (2 chars) - can leave 2-char stem
|
|
78
|
+
{ pattern: 'ال', confidencePenalty: 0.08, prefixType: 'article', minRemaining: 2 },
|
|
79
|
+
|
|
80
|
+
// Conjunctions and prepositions (1 char) - need longer stem to be safe
|
|
81
|
+
{ pattern: 'و', confidencePenalty: 0.08, prefixType: 'conjunction', minRemaining: 3 }, // wa- (and)
|
|
82
|
+
{ pattern: 'ف', confidencePenalty: 0.08, prefixType: 'conjunction', minRemaining: 3 }, // fa- (then/so)
|
|
83
|
+
{ pattern: 'ب', confidencePenalty: 0.1, prefixType: 'preposition', minRemaining: 3 }, // bi- (with/by)
|
|
84
|
+
{ pattern: 'ل', confidencePenalty: 0.1, prefixType: 'preposition', minRemaining: 3 }, // li- (to/for)
|
|
85
|
+
{ pattern: 'ك', confidencePenalty: 0.1, prefixType: 'preposition', minRemaining: 3 }, // ka- (like/as)
|
|
86
|
+
];
|
|
87
|
+
|
|
88
|
+
/**
|
|
89
|
+
* Verb prefixes (present tense markers).
|
|
90
|
+
* These are more tentative as they change verb meaning.
|
|
91
|
+
* Require minimum 3-char remaining to avoid over-stripping.
|
|
92
|
+
*/
|
|
93
|
+
const VERB_PREFIXES: readonly PrefixRule[] = [
|
|
94
|
+
// normalize() consults this list only when the stem does not end in one of its
// noun/pronoun endings and the prefix removed so far is either nothing or a
// bare conjunction (wa- / fa-); at most one entry is removed.
{ pattern: 'ي', confidencePenalty: 0.12, prefixType: 'verb-marker', minRemaining: 3 }, // ya- (he/it)
|
|
95
|
+
{ pattern: 'ت', confidencePenalty: 0.12, prefixType: 'verb-marker', minRemaining: 3 }, // ta- (she/you)
|
|
96
|
+
{ pattern: 'ن', confidencePenalty: 0.12, prefixType: 'verb-marker', minRemaining: 3 }, // na- (we)
|
|
97
|
+
{ pattern: 'أ', confidencePenalty: 0.12, prefixType: 'verb-marker', minRemaining: 3 }, // a- (I)
|
|
98
|
+
{ pattern: 'ا', confidencePenalty: 0.12, prefixType: 'verb-marker', minRemaining: 3 }, // a- without hamza
|
|
99
|
+
];
|
|
100
|
+
|
|
101
|
+
/**
 * Suffix rules for Arabic.
 *
 * The normalizer walks this list once, in order, and removes every entry that
 * matches the current end of the stem. Each surface form must therefore appear
 * only once: a repeated pattern would be stripped (and penalised) twice from a
 * stem that happens to end in the doubled sequence.
 */
const SUFFIXES: readonly { pattern: string; confidencePenalty: number; type: string }[] = [
  // Plural forms
  { pattern: 'ون', confidencePenalty: 0.1, type: 'masculine-plural' },
  // Unvocalized, this ending is also the dual accusative, so it is listed once.
  { pattern: 'ين', confidencePenalty: 0.1, type: 'masculine-plural-accusative' },
  { pattern: 'ات', confidencePenalty: 0.1, type: 'feminine-plural' },
  // Dual forms (the dual accusative is covered by the 'ين' entry above)
  { pattern: 'ان', confidencePenalty: 0.1, type: 'dual-nominative' },
  // Pronoun suffixes
  { pattern: 'ها', confidencePenalty: 0.1, type: 'pronoun-her' },
  { pattern: 'هم', confidencePenalty: 0.1, type: 'pronoun-them' },
  { pattern: 'هن', confidencePenalty: 0.1, type: 'pronoun-them-f' },
  { pattern: 'نا', confidencePenalty: 0.1, type: 'pronoun-us' },
  { pattern: 'كم', confidencePenalty: 0.1, type: 'pronoun-you-pl' },
  { pattern: 'ك', confidencePenalty: 0.08, type: 'pronoun-you' },
  { pattern: 'ه', confidencePenalty: 0.08, type: 'pronoun-him' },
  { pattern: 'ي', confidencePenalty: 0.08, type: 'pronoun-me' },
  // Feminine marker
  { pattern: 'ة', confidencePenalty: 0.08, type: 'feminine' },
];
|
|
124
|
+
|
|
125
|
+
/**
 * Arabic morphological normalizer.
 *
 * Works on the diacritic-free form of a word and removes, in this order:
 * at most one combined prefix, otherwise at most one single prefix, then
 * (for verb-looking stems) at most one present-tense verb prefix, and
 * finally every matching suffix in one ordered pass.
 */
export class ArabicMorphologicalNormalizer implements MorphologicalNormalizer {
  readonly language = 'ar';

  /**
   * Check if a word might be an Arabic word that can be normalized.
   *
   * @param word - Token to inspect.
   * @returns `true` when the word contains at least one Arabic character and
   *   is at least two UTF-16 code units long.
   */
  isNormalizable(word: string): boolean {
    if (!containsArabic(word)) return false;
    // Arabic words are typically at least 2 characters
    if (word.length < 2) return false;
    return true;
  }

  /**
   * Normalize an Arabic word by stripping diacritics, prefixes and suffixes.
   *
   * @param word - Token to normalize.
   * @returns `noChange(word)` when the word is already bare (no diacritics or
   *   affixes were removed); otherwise `normalized(...)` carrying the stripped
   *   stem, a confidence of at least 0.5, and the removed prefixes/suffixes.
   */
  normalize(word: string): NormalizationResult {
    // All matching below runs on the unvocalized form.
    let stem = removeDiacritics(word);
    let confidence = 1.0;
    const removedPrefixes: string[] = [];
    const removedSuffixes: string[] = [];

    // Removes the first rule (in list order) that matches the start of the
    // stem and still leaves enough characters; later rules are not tried.
    const stripOnePrefix = (rules: readonly PrefixRule[], defaultMinRemaining: number): void => {
      for (const rule of rules) {
        if (!stem.startsWith(rule.pattern)) continue;
        const remaining = stem.slice(rule.pattern.length);
        if (remaining.length < (rule.minRemaining ?? defaultMinRemaining)) continue;
        stem = remaining;
        confidence -= rule.confidencePenalty;
        removedPrefixes.push(rule.pattern);
        return;
      }
    };

    // Combined prefixes first (longest match); they must leave at least 2 characters.
    stripOnePrefix(COMBINED_PREFIXES, 2);

    // Single prefixes only when no combined prefix was removed.
    if (removedPrefixes.length === 0) {
      stripOnePrefix(SINGLE_PREFIXES, 2);
    }

    // Try verb prefixes ONLY for words that look like verbs (not nouns).
    // Stems ending in a noun-pattern or pronoun suffix are skipped; this prevents
    // stripping ت from تغييرات (changes) or تغييرها (her change).
    const nounLikeEndings = ['ات', 'ة', 'ون', 'ين', 'ها', 'هم', 'هن', 'نا', 'كم'];
    const looksLikeNoun = nounLikeEndings.some((ending) => stem.endsWith(ending));
    // A verb marker may follow nothing at all or a bare conjunction (wa- / fa-).
    const verbPrefixAllowed =
      removedPrefixes.length === 0 || removedPrefixes[0] === 'و' || removedPrefixes[0] === 'ف';
    if (!looksLikeNoun && verbPrefixAllowed) {
      stripOnePrefix(VERB_PREFIXES, 3);
    }

    // Suffixes: a single pass over the ordered list. There is deliberately no
    // break, so several different suffixes can be removed one after another.
    for (const rule of SUFFIXES) {
      if (!stem.endsWith(rule.pattern)) continue;
      const remaining = stem.slice(0, -rule.pattern.length);
      // Must leave a meaningful stem
      if (remaining.length >= 2) {
        stem = remaining;
        confidence -= rule.confidencePenalty;
        removedSuffixes.push(rule.pattern);
      }
    }

    // Ensure confidence stays reasonable
    confidence = Math.max(0.5, confidence);

    const strippedAffix = removedPrefixes.length > 0 || removedSuffixes.length > 0;
    // Only a word that is already bare is reported as unchanged. If just
    // diacritics were removed, the stripped form is still returned so that
    // vocalized and unvocalized spellings normalize to the same stem.
    if (!strippedAffix && stem === word) {
      return noChange(word);
    }

    return normalized(stem, confidence, {
      removedPrefixes,
      removedSuffixes,
    });
  }
}
|
|
240
|
+
|
|
241
|
+
// Shared singleton instance; the class keeps no mutable instance state, so one instance can be reused.
|
|
242
|
+
export const arabicMorphologicalNormalizer = new ArabicMorphologicalNormalizer();
|
|
@@ -0,0 +1,268 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* French Morphological Normalizer
|
|
3
|
+
*
|
|
4
|
+
* Reduces French verb conjugations to their infinitive forms.
|
|
5
|
+
* French has three verb conjugation groups:
|
|
6
|
+
* - 1st group: -er verbs (parler, montrer, afficher)
|
|
7
|
+
* - 2nd group: -ir verbs with -iss- forms (finir, choisir)
|
|
8
|
+
* - 3rd group: irregular -ir, -re, -oir verbs (partir, prendre, voir)
|
|
9
|
+
*
|
|
10
|
+
* Key features:
|
|
11
|
+
* - Reflexive verb handling: se montrer → montrer
|
|
12
|
+
* - Regular conjugation patterns for all three groups
|
|
13
|
+
* - Past participle (-é, -i, -u) and present participle (-ant) forms
|
|
14
|
+
*
|
|
15
|
+
* Examples:
|
|
16
|
+
* affiche → afficher (3rd person present)
|
|
17
|
+
* montrant → montrer (present participle)
|
|
18
|
+
* caché → cacher (past participle)
|
|
19
|
+
* finissons → finir (1st person plural present)
|
|
20
|
+
*/
|
|
21
|
+
|
|
22
|
+
import type { MorphologicalNormalizer, NormalizationResult, ConjugationType } from './types';
|
|
23
|
+
import { noChange, normalized } from './types';
|
|
24
|
+
|
|
25
|
+
/**
 * Heuristic test for whether a word could be a French verb form.
 *
 * A word qualifies when its lower-cased form ends in an infinitive ending
 * (-er, -ir, -re), the present-participle ending (-ant) or a past-participle
 * vowel (-é, -i, -u), or when it contains any French accented letter or
 * ligature.
 */
function looksLikeFrenchVerb(word: string): boolean {
  const lowered = word.toLowerCase();
  const verbLikeEndings = ['er', 'ir', 're', 'ant', 'é', 'i', 'u'];
  const hasVerbEnding = verbLikeEndings.some((ending) => lowered.endsWith(ending));
  if (hasVerbEnding) {
    return true;
  }
  // Fall back to French-specific characters anywhere in the word.
  return /[àâäéèêëïîôùûüÿçœæ]/i.test(word);
}
|
|
39
|
+
|
|
40
|
+
/**
 * Pronouns that can follow an imperative verb after a hyphen.
 * The normalizer prefixes each one with '-' before matching it against
 * the end of a word.
 */
const REFLEXIVE_SUFFIXES: string[] = [
  'toi',
  'vous',
  'nous',
];
|
|
44
|
+
|
|
45
|
+
/**
 * Suffix rules for first-group (-er) verbs.
 *
 * Each row below is `[ending, confidence, conjugation type]`; every row is
 * expanded into a rule object whose replacement `stem` is always 'er'.
 * Row order is preserved, including the endings that appear twice under
 * different conjugation types.
 */
const ER_ENDINGS: readonly {
  ending: string;
  stem: string;
  confidence: number;
  type: ConjugationType;
}[] = (
  [
    // Present participle
    ['ant', 0.88, 'gerund'],
    // Past participle
    ['é', 0.88, 'participle'],
    ['ée', 0.88, 'participle'],
    ['és', 0.88, 'participle'],
    ['ées', 0.88, 'participle'],
    // Present indicative
    ['e', 0.75, 'present'], // je/il/elle
    ['es', 0.78, 'present'], // tu
    ['ons', 0.85, 'present'], // nous
    ['ez', 0.85, 'present'], // vous
    ['ent', 0.82, 'present'], // ils/elles
    // Imperfect
    ['ais', 0.82, 'past'], // je/tu
    ['ait', 0.82, 'past'], // il/elle
    ['ions', 0.85, 'past'], // nous
    ['iez', 0.85, 'past'], // vous
    ['aient', 0.85, 'past'], // ils/elles
    // Simple past (passé simple)
    ['ai', 0.8, 'past'], // je
    ['as', 0.78, 'past'], // tu
    ['a', 0.75, 'past'], // il/elle
    ['âmes', 0.88, 'past'], // nous
    ['âtes', 0.88, 'past'], // vous
    ['èrent', 0.88, 'past'], // ils/elles
    // Future
    ['erai', 0.85, 'future'], // je
    ['eras', 0.85, 'future'], // tu
    ['era', 0.82, 'future'], // il/elle
    ['erons', 0.88, 'future'], // nous
    ['erez', 0.88, 'future'], // vous
    ['eront', 0.88, 'future'], // ils/elles
    // Conditional
    ['erais', 0.85, 'conditional'], // je/tu
    ['erait', 0.85, 'conditional'], // il/elle
    ['erions', 0.88, 'conditional'], // nous
    ['eriez', 0.88, 'conditional'], // vous
    ['eraient', 0.88, 'conditional'], // ils/elles
    // Subjunctive
    ['ions', 0.8, 'subjunctive'], // nous
    ['iez', 0.8, 'subjunctive'], // vous
    // Imperative
    ['ons', 0.82, 'imperative'], // nous
    ['ez', 0.82, 'imperative'], // vous
    // Infinitive
    ['er', 0.92, 'dictionary'],
  ] as const
).map(([ending, confidence, type]) => ({ ending, stem: 'er', confidence, type }));
|
|
102
|
+
|
|
103
|
+
/**
 * Suffix rules for second-group (-ir) verbs, i.e. those with -iss- forms
 * (finir → finissons, choisir → choisissons).
 *
 * Each row below is `[ending, confidence, conjugation type]`; every row is
 * expanded into a rule object whose replacement `stem` is always 'ir'.
 * Row order is preserved, including 'is', which is listed both as a
 * participle and as a present-tense ending.
 */
const IR_ENDINGS: readonly {
  ending: string;
  stem: string;
  confidence: number;
  type: ConjugationType;
}[] = (
  [
    // Present participle
    ['issant', 0.88, 'gerund'],
    // Past participle
    ['i', 0.8, 'participle'],
    ['ie', 0.82, 'participle'],
    ['is', 0.78, 'participle'],
    ['ies', 0.82, 'participle'],
    // Present indicative (plural persons carry -iss-)
    ['is', 0.78, 'present'], // je/tu
    ['it', 0.78, 'present'], // il/elle
    ['issons', 0.88, 'present'], // nous
    ['issez', 0.88, 'present'], // vous
    ['issent', 0.88, 'present'], // ils/elles
    // Imperfect
    ['issais', 0.85, 'past'], // je/tu
    ['issait', 0.85, 'past'], // il/elle
    ['issions', 0.88, 'past'], // nous
    ['issiez', 0.88, 'past'], // vous
    ['issaient', 0.88, 'past'], // ils/elles
    // Future
    ['irai', 0.85, 'future'], // je
    ['iras', 0.85, 'future'], // tu
    ['ira', 0.82, 'future'], // il/elle
    ['irons', 0.88, 'future'], // nous
    ['irez', 0.88, 'future'], // vous
    ['iront', 0.88, 'future'], // ils/elles
    // Infinitive
    ['ir', 0.9, 'dictionary'],
  ] as const
).map(([ending, confidence, type]) => ({ ending, stem: 'ir', confidence, type }));
|
|
142
|
+
|
|
143
|
+
/**
 * Suffix rules for third-group -re verbs (prendre, vendre, attendre).
 *
 * For every rule the matched `ending` is removed from the word and `stem`
 * is appended to rebuild the infinitive, e.g. "vendu" → "vend" + "re".
 *
 * The 3rd-person-singular rule matches the bare final "d" (prend, vend).
 * Removing that "d" also removes the root consonant, so its replacement
 * must be 'dre' rather than 're'; otherwise "prend" is rebuilt as "prenre".
 */
const RE_ENDINGS: readonly {
  ending: string;
  stem: string;
  confidence: number;
  type: ConjugationType;
}[] = [
  // Present participle
  { ending: 'ant', stem: 're', confidence: 0.82, type: 'gerund' },
  // Past participle (common patterns)
  { ending: 'u', stem: 're', confidence: 0.8, type: 'participle' },
  { ending: 'ue', stem: 're', confidence: 0.82, type: 'participle' },
  { ending: 'us', stem: 're', confidence: 0.82, type: 'participle' },
  { ending: 'ues', stem: 're', confidence: 0.82, type: 'participle' },
  // Present indicative
  { ending: 's', stem: 're', confidence: 0.72, type: 'present' }, // je/tu (prends → prendre)
  { ending: 'd', stem: 'dre', confidence: 0.75, type: 'present' }, // il/elle (prend → prendre, vend → vendre)
  { ending: 'ons', stem: 're', confidence: 0.82, type: 'present' }, // nous
  { ending: 'ez', stem: 're', confidence: 0.82, type: 'present' }, // vous
  { ending: 'ent', stem: 're', confidence: 0.8, type: 'present' }, // ils/elles
  // Infinitive
  { ending: 're', stem: 're', confidence: 0.9, type: 'dictionary' },
];
|
|
169
|
+
|
|
170
|
+
/**
 * Every rule from the three conjugation groups in one list, ordered so that
 * longer endings come before shorter ones.
 *
 * The comparator looks at ending length only, so rules whose endings have the
 * same length keep their concatenation order: -er rules, then -ir, then -re.
 */
const ALL_ENDINGS = ER_ENDINGS.concat(IR_ENDINGS, RE_ENDINGS).sort(
  (left, right) => right.ending.length - left.ending.length
);
|
|
176
|
+
|
|
177
|
+
/**
 * Morphological normalizer for French (`language` is 'fr').
 *
 * Rebuilds an infinitive from a conjugated verb form by swapping a known
 * ending (see ALL_ENDINGS) for its group's infinitive ending, and strips a
 * hyphen-attached imperative pronoun (see REFLEXIVE_SUFFIXES) beforehand.
 */
export class FrenchMorphologicalNormalizer implements MorphologicalNormalizer {
  readonly language = 'fr';

  /**
   * Quick pre-check before normalization.
   *
   * @param word - Candidate word.
   * @returns `false` for words shorter than three characters; otherwise the
   *   verdict of `looksLikeFrenchVerb`.
   */
  isNormalizable(word: string): boolean {
    return word.length >= 3 && looksLikeFrenchVerb(word);
  }

  /**
   * Reduce a French word to its infinitive where a rule applies.
   *
   * Matching is done on the lower-cased word; when nothing applies the
   * original `word` is handed to `noChange`.
   *
   * @param word - Word to normalize.
   * @returns The reflexive result if there is one, else the conjugation
   *   result, else `noChange(word)`.
   */
  normalize(word: string): NormalizationResult {
    const lowered = word.toLowerCase();

    // A word of 4+ characters ending in -er/-ir/-re is treated as an infinitive already.
    const hasInfinitiveEnding = ['er', 'ir', 're'].some(ending => lowered.endsWith(ending));
    if (hasInfinitiveEnding && lowered.length >= 4) {
      return noChange(word);
    }

    // Hyphenated imperative (e.g. "montrez-vous") is tried before plain conjugation.
    return (
      this.tryReflexiveNormalization(lowered) ??
      this.tryConjugationNormalization(lowered) ??
      noChange(word)
    );
  }

  /**
   * Handle imperatives with a hyphen-attached pronoun, e.g. montrez-vous → montrer.
   *
   * @param word - Lower-cased word.
   * @returns A result typed 'reflexive' whose confidence is the inner
   *   conjugation confidence scaled by 0.95, or `null` when no pronoun is
   *   attached or the verb part is not changed by conjugation normalization.
   */
  private tryReflexiveNormalization(word: string): NormalizationResult | null {
    for (const pronoun of REFLEXIVE_SUFFIXES) {
      const attached = '-' + pronoun;
      if (!word.endsWith(attached)) continue;

      const verbPart = word.slice(0, -attached.length);
      const inner = this.tryConjugationNormalization(verbPart);

      // Only accept the match when the verb part was actually rewritten.
      if (!inner || inner.stem === verbPart) continue;

      const innerSuffixes = inner.metadata?.removedSuffixes || [];
      return normalized(inner.stem, inner.confidence * 0.95, {
        removedSuffixes: [attached, ...innerSuffixes],
        conjugationType: 'reflexive',
      });
    }

    return null;
  }

  /**
   * Apply the first rule in ALL_ENDINGS whose ending matches and leaves at
   * least two characters in front of it.
   *
   * @param word - Lower-cased word.
   * @returns The rebuilt infinitive with the rule's confidence and type, or
   *   `null` when no rule applies.
   */
  private tryConjugationNormalization(word: string): NormalizationResult | null {
    for (const { ending, stem, confidence, type } of ALL_ENDINGS) {
      if (!word.endsWith(ending)) continue;

      const base = word.slice(0, -ending.length);
      if (base.length >= 2) {
        // Swap the conjugated ending for the group's infinitive ending.
        return normalized(base + stem, confidence, {
          removedSuffixes: [ending],
          conjugationType: type,
        });
      }
    }

    return null;
  }
}
|
|
266
|
+
|
|
267
|
+
// Export singleton instance
|
|
268
|
+
// Created once when this module is evaluated; the constructor is called without arguments.
export const frenchMorphologicalNormalizer = new FrenchMorphologicalNormalizer();
|