@lokascript/semantic 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +686 -0
- package/dist/browser-ar.ar.global.js +2 -0
- package/dist/browser-core.core.global.js +2 -0
- package/dist/browser-de.de.global.js +2 -0
- package/dist/browser-east-asian.east-asian.global.js +2 -0
- package/dist/browser-en-tr.en-tr.global.js +2 -0
- package/dist/browser-en.en.global.js +2 -0
- package/dist/browser-es-en.es-en.global.js +2 -0
- package/dist/browser-es.es.global.js +2 -0
- package/dist/browser-fr.fr.global.js +2 -0
- package/dist/browser-id.id.global.js +2 -0
- package/dist/browser-ja.ja.global.js +2 -0
- package/dist/browser-ko.ko.global.js +2 -0
- package/dist/browser-lazy.lazy.global.js +2 -0
- package/dist/browser-priority.priority.global.js +2 -0
- package/dist/browser-pt.pt.global.js +2 -0
- package/dist/browser-qu.qu.global.js +2 -0
- package/dist/browser-sw.sw.global.js +2 -0
- package/dist/browser-tr.tr.global.js +2 -0
- package/dist/browser-western.western.global.js +2 -0
- package/dist/browser-zh.zh.global.js +2 -0
- package/dist/browser.global.js +3 -0
- package/dist/browser.global.js.map +1 -0
- package/dist/index.cjs +35051 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +3426 -0
- package/dist/index.d.ts +3426 -0
- package/dist/index.js +34890 -0
- package/dist/index.js.map +1 -0
- package/dist/languages/ar.d.ts +78 -0
- package/dist/languages/ar.js +1622 -0
- package/dist/languages/ar.js.map +1 -0
- package/dist/languages/de.d.ts +38 -0
- package/dist/languages/de.js +1168 -0
- package/dist/languages/de.js.map +1 -0
- package/dist/languages/en.d.ts +44 -0
- package/dist/languages/en.js +3491 -0
- package/dist/languages/en.js.map +1 -0
- package/dist/languages/es.d.ts +52 -0
- package/dist/languages/es.js +1493 -0
- package/dist/languages/es.js.map +1 -0
- package/dist/languages/fr.d.ts +37 -0
- package/dist/languages/fr.js +1159 -0
- package/dist/languages/fr.js.map +1 -0
- package/dist/languages/id.d.ts +35 -0
- package/dist/languages/id.js +1152 -0
- package/dist/languages/id.js.map +1 -0
- package/dist/languages/ja.d.ts +53 -0
- package/dist/languages/ja.js +1430 -0
- package/dist/languages/ja.js.map +1 -0
- package/dist/languages/ko.d.ts +51 -0
- package/dist/languages/ko.js +1729 -0
- package/dist/languages/ko.js.map +1 -0
- package/dist/languages/pt.d.ts +37 -0
- package/dist/languages/pt.js +1127 -0
- package/dist/languages/pt.js.map +1 -0
- package/dist/languages/qu.d.ts +36 -0
- package/dist/languages/qu.js +1143 -0
- package/dist/languages/qu.js.map +1 -0
- package/dist/languages/sw.d.ts +35 -0
- package/dist/languages/sw.js +1147 -0
- package/dist/languages/sw.js.map +1 -0
- package/dist/languages/tr.d.ts +45 -0
- package/dist/languages/tr.js +1529 -0
- package/dist/languages/tr.js.map +1 -0
- package/dist/languages/zh.d.ts +58 -0
- package/dist/languages/zh.js +1257 -0
- package/dist/languages/zh.js.map +1 -0
- package/dist/types-C4dcj53L.d.ts +600 -0
- package/package.json +202 -0
- package/src/__test-utils__/index.ts +7 -0
- package/src/__test-utils__/test-helpers.ts +8 -0
- package/src/__types__/test-helpers.ts +122 -0
- package/src/analysis/index.ts +479 -0
- package/src/ast-builder/command-mappers.ts +1133 -0
- package/src/ast-builder/expression-parser/index.ts +41 -0
- package/src/ast-builder/expression-parser/parser.ts +563 -0
- package/src/ast-builder/expression-parser/tokenizer.ts +394 -0
- package/src/ast-builder/expression-parser/types.ts +208 -0
- package/src/ast-builder/index.ts +536 -0
- package/src/ast-builder/value-converters.ts +172 -0
- package/src/bridge.ts +275 -0
- package/src/browser-ar.ts +162 -0
- package/src/browser-core.ts +231 -0
- package/src/browser-de.ts +162 -0
- package/src/browser-east-asian.ts +173 -0
- package/src/browser-en-tr.ts +165 -0
- package/src/browser-en.ts +157 -0
- package/src/browser-es-en.ts +200 -0
- package/src/browser-es.ts +170 -0
- package/src/browser-fr.ts +162 -0
- package/src/browser-id.ts +162 -0
- package/src/browser-ja.ts +162 -0
- package/src/browser-ko.ts +162 -0
- package/src/browser-lazy.ts +189 -0
- package/src/browser-priority.ts +214 -0
- package/src/browser-pt.ts +162 -0
- package/src/browser-qu.ts +162 -0
- package/src/browser-sw.ts +162 -0
- package/src/browser-tr.ts +162 -0
- package/src/browser-western.ts +181 -0
- package/src/browser-zh.ts +162 -0
- package/src/browser.ts +268 -0
- package/src/cache/index.ts +14 -0
- package/src/cache/semantic-cache.ts +344 -0
- package/src/core-bridge.ts +372 -0
- package/src/explicit/converter.ts +258 -0
- package/src/explicit/index.ts +18 -0
- package/src/explicit/parser.ts +236 -0
- package/src/explicit/renderer.ts +424 -0
- package/src/generators/command-schemas.ts +1636 -0
- package/src/generators/event-handler-generator.ts +109 -0
- package/src/generators/index.ts +117 -0
- package/src/generators/language-profiles.ts +139 -0
- package/src/generators/pattern-generator.ts +537 -0
- package/src/generators/profiles/arabic.ts +131 -0
- package/src/generators/profiles/bengali.ts +132 -0
- package/src/generators/profiles/chinese.ts +124 -0
- package/src/generators/profiles/english.ts +113 -0
- package/src/generators/profiles/french.ts +125 -0
- package/src/generators/profiles/german.ts +126 -0
- package/src/generators/profiles/hindi.ts +146 -0
- package/src/generators/profiles/index.ts +46 -0
- package/src/generators/profiles/indonesian.ts +125 -0
- package/src/generators/profiles/italian.ts +139 -0
- package/src/generators/profiles/japanese.ts +149 -0
- package/src/generators/profiles/korean.ts +127 -0
- package/src/generators/profiles/marker-templates.ts +288 -0
- package/src/generators/profiles/ms.ts +130 -0
- package/src/generators/profiles/polish.ts +249 -0
- package/src/generators/profiles/portuguese.ts +115 -0
- package/src/generators/profiles/quechua.ts +113 -0
- package/src/generators/profiles/russian.ts +260 -0
- package/src/generators/profiles/spanish.ts +130 -0
- package/src/generators/profiles/swahili.ts +129 -0
- package/src/generators/profiles/thai.ts +132 -0
- package/src/generators/profiles/tl.ts +128 -0
- package/src/generators/profiles/turkish.ts +124 -0
- package/src/generators/profiles/types.ts +165 -0
- package/src/generators/profiles/ukrainian.ts +270 -0
- package/src/generators/profiles/vietnamese.ts +133 -0
- package/src/generators/schema-error-codes.ts +160 -0
- package/src/generators/schema-validator.ts +391 -0
- package/src/index.ts +429 -0
- package/src/language-building-schema.ts +3170 -0
- package/src/language-loader.ts +394 -0
- package/src/languages/_all.ts +65 -0
- package/src/languages/ar.ts +15 -0
- package/src/languages/bn.ts +16 -0
- package/src/languages/de.ts +15 -0
- package/src/languages/en.ts +29 -0
- package/src/languages/es.ts +15 -0
- package/src/languages/fr.ts +15 -0
- package/src/languages/hi.ts +26 -0
- package/src/languages/id.ts +15 -0
- package/src/languages/index.ts +18 -0
- package/src/languages/it.ts +15 -0
- package/src/languages/ja.ts +15 -0
- package/src/languages/ko.ts +15 -0
- package/src/languages/ms.ts +16 -0
- package/src/languages/pl.ts +18 -0
- package/src/languages/pt.ts +15 -0
- package/src/languages/qu.ts +15 -0
- package/src/languages/ru.ts +26 -0
- package/src/languages/sw.ts +15 -0
- package/src/languages/th.ts +16 -0
- package/src/languages/tl.ts +16 -0
- package/src/languages/tr.ts +15 -0
- package/src/languages/uk.ts +26 -0
- package/src/languages/vi.ts +16 -0
- package/src/languages/zh.ts +15 -0
- package/src/parser/index.ts +15 -0
- package/src/parser/pattern-matcher.ts +1181 -0
- package/src/parser/semantic-parser.ts +573 -0
- package/src/parser/utils/index.ts +35 -0
- package/src/parser/utils/marker-resolution.ts +111 -0
- package/src/parser/utils/possessive-keywords.ts +43 -0
- package/src/parser/utils/role-positioning.ts +70 -0
- package/src/parser/utils/type-validation.ts +134 -0
- package/src/patterns/add/ar.ts +71 -0
- package/src/patterns/add/bn.ts +70 -0
- package/src/patterns/add/hi.ts +69 -0
- package/src/patterns/add/index.ts +87 -0
- package/src/patterns/add/it.ts +61 -0
- package/src/patterns/add/ja.ts +93 -0
- package/src/patterns/add/ko.ts +74 -0
- package/src/patterns/add/ms.ts +30 -0
- package/src/patterns/add/pl.ts +62 -0
- package/src/patterns/add/ru.ts +62 -0
- package/src/patterns/add/th.ts +49 -0
- package/src/patterns/add/tl.ts +30 -0
- package/src/patterns/add/tr.ts +71 -0
- package/src/patterns/add/uk.ts +62 -0
- package/src/patterns/add/vi.ts +61 -0
- package/src/patterns/add/zh.ts +71 -0
- package/src/patterns/builders.ts +207 -0
- package/src/patterns/decrement/bn.ts +70 -0
- package/src/patterns/decrement/de.ts +42 -0
- package/src/patterns/decrement/hi.ts +68 -0
- package/src/patterns/decrement/index.ts +79 -0
- package/src/patterns/decrement/it.ts +69 -0
- package/src/patterns/decrement/ms.ts +30 -0
- package/src/patterns/decrement/pl.ts +58 -0
- package/src/patterns/decrement/ru.ts +58 -0
- package/src/patterns/decrement/th.ts +49 -0
- package/src/patterns/decrement/tl.ts +30 -0
- package/src/patterns/decrement/tr.ts +48 -0
- package/src/patterns/decrement/uk.ts +58 -0
- package/src/patterns/decrement/vi.ts +61 -0
- package/src/patterns/decrement/zh.ts +32 -0
- package/src/patterns/en.ts +302 -0
- package/src/patterns/event-handler/ar.ts +151 -0
- package/src/patterns/event-handler/bn.ts +72 -0
- package/src/patterns/event-handler/de.ts +117 -0
- package/src/patterns/event-handler/en.ts +117 -0
- package/src/patterns/event-handler/es.ts +136 -0
- package/src/patterns/event-handler/fr.ts +117 -0
- package/src/patterns/event-handler/hi.ts +64 -0
- package/src/patterns/event-handler/id.ts +117 -0
- package/src/patterns/event-handler/index.ts +119 -0
- package/src/patterns/event-handler/it.ts +54 -0
- package/src/patterns/event-handler/ja.ts +118 -0
- package/src/patterns/event-handler/ko.ts +133 -0
- package/src/patterns/event-handler/ms.ts +30 -0
- package/src/patterns/event-handler/pl.ts +62 -0
- package/src/patterns/event-handler/pt.ts +117 -0
- package/src/patterns/event-handler/qu.ts +66 -0
- package/src/patterns/event-handler/ru.ts +62 -0
- package/src/patterns/event-handler/shared.ts +270 -0
- package/src/patterns/event-handler/sw.ts +117 -0
- package/src/patterns/event-handler/th.ts +53 -0
- package/src/patterns/event-handler/tl.ts +30 -0
- package/src/patterns/event-handler/tr.ts +170 -0
- package/src/patterns/event-handler/uk.ts +62 -0
- package/src/patterns/event-handler/vi.ts +61 -0
- package/src/patterns/event-handler/zh.ts +150 -0
- package/src/patterns/get/ar.ts +49 -0
- package/src/patterns/get/bn.ts +47 -0
- package/src/patterns/get/de.ts +32 -0
- package/src/patterns/get/hi.ts +52 -0
- package/src/patterns/get/index.ts +83 -0
- package/src/patterns/get/it.ts +56 -0
- package/src/patterns/get/ja.ts +53 -0
- package/src/patterns/get/ko.ts +53 -0
- package/src/patterns/get/ms.ts +30 -0
- package/src/patterns/get/pl.ts +57 -0
- package/src/patterns/get/ru.ts +57 -0
- package/src/patterns/get/th.ts +29 -0
- package/src/patterns/get/tl.ts +30 -0
- package/src/patterns/get/uk.ts +57 -0
- package/src/patterns/get/vi.ts +48 -0
- package/src/patterns/grammar-transformed/index.ts +39 -0
- package/src/patterns/grammar-transformed/ja.ts +1713 -0
- package/src/patterns/grammar-transformed/ko.ts +1311 -0
- package/src/patterns/grammar-transformed/tr.ts +1067 -0
- package/src/patterns/hide/ar.ts +67 -0
- package/src/patterns/hide/bn.ts +47 -0
- package/src/patterns/hide/de.ts +36 -0
- package/src/patterns/hide/hi.ts +61 -0
- package/src/patterns/hide/index.ts +91 -0
- package/src/patterns/hide/it.ts +56 -0
- package/src/patterns/hide/ja.ts +69 -0
- package/src/patterns/hide/ko.ts +69 -0
- package/src/patterns/hide/ms.ts +30 -0
- package/src/patterns/hide/pl.ts +57 -0
- package/src/patterns/hide/ru.ts +57 -0
- package/src/patterns/hide/th.ts +29 -0
- package/src/patterns/hide/tl.ts +30 -0
- package/src/patterns/hide/tr.ts +65 -0
- package/src/patterns/hide/uk.ts +57 -0
- package/src/patterns/hide/vi.ts +56 -0
- package/src/patterns/hide/zh.ts +68 -0
- package/src/patterns/increment/bn.ts +70 -0
- package/src/patterns/increment/de.ts +36 -0
- package/src/patterns/increment/hi.ts +68 -0
- package/src/patterns/increment/index.ts +79 -0
- package/src/patterns/increment/it.ts +69 -0
- package/src/patterns/increment/ms.ts +30 -0
- package/src/patterns/increment/pl.ts +58 -0
- package/src/patterns/increment/ru.ts +58 -0
- package/src/patterns/increment/th.ts +49 -0
- package/src/patterns/increment/tl.ts +30 -0
- package/src/patterns/increment/tr.ts +52 -0
- package/src/patterns/increment/uk.ts +58 -0
- package/src/patterns/increment/vi.ts +61 -0
- package/src/patterns/increment/zh.ts +32 -0
- package/src/patterns/index.ts +84 -0
- package/src/patterns/languages/en/control-flow.ts +93 -0
- package/src/patterns/languages/en/fetch.ts +62 -0
- package/src/patterns/languages/en/index.ts +42 -0
- package/src/patterns/languages/en/repeat.ts +67 -0
- package/src/patterns/languages/en/set.ts +48 -0
- package/src/patterns/languages/en/swap.ts +38 -0
- package/src/patterns/languages/en/temporal.ts +57 -0
- package/src/patterns/put/ar.ts +74 -0
- package/src/patterns/put/bn.ts +53 -0
- package/src/patterns/put/en.ts +74 -0
- package/src/patterns/put/es.ts +74 -0
- package/src/patterns/put/hi.ts +69 -0
- package/src/patterns/put/id.ts +96 -0
- package/src/patterns/put/index.ts +99 -0
- package/src/patterns/put/it.ts +56 -0
- package/src/patterns/put/ja.ts +75 -0
- package/src/patterns/put/ko.ts +67 -0
- package/src/patterns/put/ms.ts +30 -0
- package/src/patterns/put/pl.ts +81 -0
- package/src/patterns/put/ru.ts +85 -0
- package/src/patterns/put/th.ts +32 -0
- package/src/patterns/put/tl.ts +30 -0
- package/src/patterns/put/tr.ts +67 -0
- package/src/patterns/put/uk.ts +85 -0
- package/src/patterns/put/vi.ts +72 -0
- package/src/patterns/put/zh.ts +62 -0
- package/src/patterns/registry.ts +163 -0
- package/src/patterns/remove/ar.ts +71 -0
- package/src/patterns/remove/bn.ts +68 -0
- package/src/patterns/remove/hi.ts +69 -0
- package/src/patterns/remove/index.ts +87 -0
- package/src/patterns/remove/it.ts +69 -0
- package/src/patterns/remove/ja.ts +74 -0
- package/src/patterns/remove/ko.ts +78 -0
- package/src/patterns/remove/ms.ts +30 -0
- package/src/patterns/remove/pl.ts +62 -0
- package/src/patterns/remove/ru.ts +62 -0
- package/src/patterns/remove/th.ts +49 -0
- package/src/patterns/remove/tl.ts +30 -0
- package/src/patterns/remove/tr.ts +78 -0
- package/src/patterns/remove/uk.ts +62 -0
- package/src/patterns/remove/vi.ts +61 -0
- package/src/patterns/remove/zh.ts +72 -0
- package/src/patterns/set/ar.ts +84 -0
- package/src/patterns/set/bn.ts +53 -0
- package/src/patterns/set/de.ts +84 -0
- package/src/patterns/set/es.ts +92 -0
- package/src/patterns/set/fr.ts +88 -0
- package/src/patterns/set/hi.ts +56 -0
- package/src/patterns/set/id.ts +84 -0
- package/src/patterns/set/index.ts +107 -0
- package/src/patterns/set/it.ts +56 -0
- package/src/patterns/set/ja.ts +86 -0
- package/src/patterns/set/ko.ts +85 -0
- package/src/patterns/set/ms.ts +30 -0
- package/src/patterns/set/pl.ts +57 -0
- package/src/patterns/set/pt.ts +84 -0
- package/src/patterns/set/ru.ts +57 -0
- package/src/patterns/set/th.ts +31 -0
- package/src/patterns/set/tl.ts +30 -0
- package/src/patterns/set/tr.ts +107 -0
- package/src/patterns/set/uk.ts +57 -0
- package/src/patterns/set/vi.ts +53 -0
- package/src/patterns/set/zh.ts +84 -0
- package/src/patterns/show/ar.ts +67 -0
- package/src/patterns/show/bn.ts +47 -0
- package/src/patterns/show/de.ts +32 -0
- package/src/patterns/show/fr.ts +32 -0
- package/src/patterns/show/hi.ts +61 -0
- package/src/patterns/show/index.ts +95 -0
- package/src/patterns/show/it.ts +56 -0
- package/src/patterns/show/ja.ts +69 -0
- package/src/patterns/show/ko.ts +73 -0
- package/src/patterns/show/ms.ts +30 -0
- package/src/patterns/show/pl.ts +57 -0
- package/src/patterns/show/ru.ts +57 -0
- package/src/patterns/show/th.ts +29 -0
- package/src/patterns/show/tl.ts +30 -0
- package/src/patterns/show/tr.ts +65 -0
- package/src/patterns/show/uk.ts +57 -0
- package/src/patterns/show/vi.ts +56 -0
- package/src/patterns/show/zh.ts +68 -0
- package/src/patterns/take/ar.ts +51 -0
- package/src/patterns/take/index.ts +31 -0
- package/src/patterns/toggle/ar.ts +61 -0
- package/src/patterns/toggle/bn.ts +70 -0
- package/src/patterns/toggle/en.ts +61 -0
- package/src/patterns/toggle/es.ts +61 -0
- package/src/patterns/toggle/hi.ts +80 -0
- package/src/patterns/toggle/index.ts +95 -0
- package/src/patterns/toggle/it.ts +69 -0
- package/src/patterns/toggle/ja.ts +156 -0
- package/src/patterns/toggle/ko.ts +113 -0
- package/src/patterns/toggle/ms.ts +30 -0
- package/src/patterns/toggle/pl.ts +62 -0
- package/src/patterns/toggle/ru.ts +62 -0
- package/src/patterns/toggle/th.ts +50 -0
- package/src/patterns/toggle/tl.ts +30 -0
- package/src/patterns/toggle/tr.ts +88 -0
- package/src/patterns/toggle/uk.ts +62 -0
- package/src/patterns/toggle/vi.ts +61 -0
- package/src/patterns/toggle/zh.ts +99 -0
- package/src/public-api.ts +286 -0
- package/src/registry.ts +441 -0
- package/src/tokenizers/arabic.ts +723 -0
- package/src/tokenizers/base.ts +1300 -0
- package/src/tokenizers/bengali.ts +289 -0
- package/src/tokenizers/chinese.ts +481 -0
- package/src/tokenizers/english.ts +416 -0
- package/src/tokenizers/french.ts +326 -0
- package/src/tokenizers/german.ts +324 -0
- package/src/tokenizers/hindi.ts +319 -0
- package/src/tokenizers/index.ts +127 -0
- package/src/tokenizers/indonesian.ts +306 -0
- package/src/tokenizers/italian.ts +458 -0
- package/src/tokenizers/japanese.ts +447 -0
- package/src/tokenizers/korean.ts +642 -0
- package/src/tokenizers/morphology/arabic-normalizer.ts +242 -0
- package/src/tokenizers/morphology/french-normalizer.ts +268 -0
- package/src/tokenizers/morphology/german-normalizer.ts +256 -0
- package/src/tokenizers/morphology/index.ts +46 -0
- package/src/tokenizers/morphology/italian-normalizer.ts +329 -0
- package/src/tokenizers/morphology/japanese-normalizer.ts +288 -0
- package/src/tokenizers/morphology/korean-normalizer.ts +428 -0
- package/src/tokenizers/morphology/polish-normalizer.ts +264 -0
- package/src/tokenizers/morphology/portuguese-normalizer.ts +310 -0
- package/src/tokenizers/morphology/spanish-normalizer.ts +327 -0
- package/src/tokenizers/morphology/turkish-normalizer.ts +412 -0
- package/src/tokenizers/morphology/types.ts +211 -0
- package/src/tokenizers/ms.ts +198 -0
- package/src/tokenizers/polish.ts +354 -0
- package/src/tokenizers/portuguese.ts +304 -0
- package/src/tokenizers/quechua.ts +339 -0
- package/src/tokenizers/russian.ts +375 -0
- package/src/tokenizers/spanish.ts +403 -0
- package/src/tokenizers/swahili.ts +303 -0
- package/src/tokenizers/thai.ts +236 -0
- package/src/tokenizers/tl.ts +198 -0
- package/src/tokenizers/turkish.ts +411 -0
- package/src/tokenizers/ukrainian.ts +369 -0
- package/src/tokenizers/vietnamese.ts +410 -0
- package/src/types/grammar-types.ts +617 -0
- package/src/types/unified-profile.ts +267 -0
- package/src/types.ts +709 -0
- package/src/utils/confidence-calculator.ts +147 -0
- package/src/validators/command-validator.ts +380 -0
- package/src/validators/index.ts +15 -0
|
@@ -0,0 +1,412 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Turkish Morphological Normalizer
|
|
3
|
+
*
|
|
4
|
+
* Turkish is a highly agglutinative language with strict vowel harmony.
|
|
5
|
+
* Suffixes attach in sequence and their vowels change based on the last
|
|
6
|
+
* vowel of the stem (front/back, rounded/unrounded).
|
|
7
|
+
*
|
|
8
|
+
* Vowel Harmony Rules:
|
|
9
|
+
* - Back vowels (a, ı, o, u) take back vowel suffixes
|
|
10
|
+
* - Front vowels (e, i, ö, ü) take front vowel suffixes
|
|
11
|
+
*
|
|
12
|
+
* Common verb suffixes:
|
|
13
|
+
* - Infinitive: -mak/-mek (değiştirmek = to change)
|
|
14
|
+
* - Present continuous: -iyor/-ıyor/-üyor/-uyor (değiştiriyor = is changing)
|
|
15
|
+
* - Past: -di/-dı/-dü/-du (değiştirdi = changed)
|
|
16
|
+
* - Reported past: -miş/-mış/-müş/-muş (değiştirmiş = apparently changed)
|
|
17
|
+
* - Future: -ecek/-acak (değiştirecek = will change)
|
|
18
|
+
* - Negation: -me/-ma before tense (değiştirmiyor = is not changing)
|
|
19
|
+
* - Passive: -il/-ıl/-ül/-ul (değiştirildi = was changed)
|
|
20
|
+
* - Causative: -tir/-tır/-tür/-tur (değiştirtmek = to make change)
|
|
21
|
+
*
|
|
22
|
+
* Person suffixes (after tense):
|
|
23
|
+
* - 1sg: -im/-ım/-üm/-um or -m (yapıyorum = I am doing)
|
|
24
|
+
* - 2sg: -sin/-sın/-sün/-sun (yapıyorsun = you are doing)
|
|
25
|
+
* - 3sg: (no suffix) (yapıyor = he/she is doing)
|
|
26
|
+
* - 1pl: -iz/-ız/-üz/-uz (yapıyoruz = we are doing)
|
|
27
|
+
* - 2pl: -siniz/-sınız/-sünüz/-sunuz (yapıyorsunuz = you all are doing)
|
|
28
|
+
* - 3pl: -ler/-lar (yapıyorlar = they are doing)
|
|
29
|
+
*
|
|
30
|
+
* Examples:
|
|
31
|
+
* değiştiriyorum → değiştir (I am changing)
|
|
32
|
+
* değiştirmek → değiştir (to change)
|
|
33
|
+
* gösterdi → göster (showed)
|
|
34
|
+
* gizleniyor → gizle (is being hidden)
|
|
35
|
+
*/
|
|
36
|
+
|
|
37
|
+
import type {
|
|
38
|
+
MorphologicalNormalizer,
|
|
39
|
+
NormalizationResult,
|
|
40
|
+
SuffixRule,
|
|
41
|
+
ConjugationType,
|
|
42
|
+
} from './types';
|
|
43
|
+
import { noChange, normalized } from './types';
|
|
44
|
+
|
|
45
|
+
/**
 * Report whether `char` is a single letter accepted by the Turkish normalizer.
 *
 * Accepted letters are the basic Latin range (A-Z, a-z) plus the Turkish
 * additions ç, ğ, ı, ö, ş, ü and their capitals (including dotted İ).
 *
 * @param char - The candidate character; expected to be one UTF-16 code unit.
 * @returns `true` only when `char` is exactly one accepted letter. The empty
 *   string and multi-character strings are rejected.
 */
function isTurkishLetter(char: string): boolean {
  // Guard first: `String.prototype.includes('')` is always true, and a longer
  // string could match as a substring of the lookup table (e.g. 'çÇ') or pass
  // on the strength of its first code unit alone.
  if (char.length !== 1) {
    return false;
  }

  const code = char.charCodeAt(0);
  // Basic Latin letters: 'A'..'Z' and 'a'..'z'.
  const isBasicLatin = (code >= 0x41 && code <= 0x5a) || (code >= 0x61 && code <= 0x7a);
  if (isBasicLatin) {
    return true;
  }

  // Turkish special characters, lower- and upper-case.
  const turkishChars = 'çÇğĞıİöÖşŞüÜ';
  return turkishChars.includes(char);
}
|
|
59
|
+
|
|
60
|
+
/**
 * Report whether at least one character of `word` is a letter recognised by
 * `isTurkishLetter` (basic Latin letters as well as the Turkish-specific ones).
 *
 * @param word - The text to scan, character by character.
 * @returns `true` as soon as any character qualifies; `false` for an empty
 *   string or a string with no qualifying character.
 */
function containsTurkish(word: string): boolean {
  // Array.from walks the string by code point, the same way `for...of` does.
  return Array.from(word).some(letter => isTurkishLetter(letter));
}
|
|
69
|
+
|
|
70
|
+
/**
 * Report whether `char` is a Turkish back vowel: a, ı, o, u or their capitals
 * (A, dotless I, O, U).
 *
 * @param char - The candidate character; expected to be one UTF-16 code unit.
 * @returns `true` only for exactly one back-vowel character. The empty string
 *   and multi-character strings are rejected.
 */
function isBackVowel(char: string): boolean {
  // The length check matters: `'...'.includes('')` is always true, and a
  // two-letter string such as 'ou' would otherwise match as a substring.
  return char.length === 1 && 'aıouAIOU'.includes(char);
}
|
|
76
|
+
|
|
77
|
+
/**
 * Report whether `char` is a Turkish front vowel: e, i, ö, ü or their capitals
 * (E, dotted İ, Ö, Ü).
 *
 * @param char - The candidate character; expected to be one UTF-16 code unit.
 * @returns `true` only for exactly one front-vowel character. The empty string
 *   and multi-character strings are rejected.
 */
function isFrontVowel(char: string): boolean {
  // The length check matters: `'...'.includes('')` is always true, and a
  // two-letter string such as 'ei' would otherwise match as a substring.
  return char.length === 1 && 'eiöüEİÖÜ'.includes(char);
}
|
|
83
|
+
|
|
84
|
+
/**
 * Report whether `char` is a Turkish vowel of either harmony class.
 *
 * @param char - The candidate character.
 * @returns `true` when `isBackVowel` or `isFrontVowel` accepts `char`.
 */
function isVowel(char: string): boolean {
  if (isBackVowel(char)) {
    return true;
  }
  return isFrontVowel(char);
}
|
|
90
|
+
|
|
91
|
+
/**
 * Find the right-most vowel of `word`.
 *
 * @param word - The text to scan from its end towards its start.
 * @returns The last character for which `isVowel` is true, or `null` when the
 *   word contains no vowel (including when it is empty).
 */
function getLastVowel(word: string): string | null {
  let position = word.length;
  // Walk backwards one UTF-16 code unit at a time.
  while (position-- > 0) {
    const candidate = word.charAt(position);
    if (isVowel(candidate)) {
      return candidate;
    }
  }
  return null;
}
|
|
102
|
+
|
|
103
|
+
/**
 * Check front/back vowel harmony between a stem and a candidate suffix.
 *
 * The last vowel of `stem` is compared with the first vowel of `suffix`: a
 * back-vowel stem requires a back-vowel suffix, any other stem requires a
 * front-vowel suffix. Only the front/back class is compared.
 *
 * @param stem - The word with the candidate suffix already removed.
 * @param suffix - The candidate suffix.
 * @returns `true` when the two vowels agree, and also when either side has no
 *   vowel at all (nothing to validate against).
 */
function matchesVowelHarmony(stem: string, suffix: string): boolean {
  const stemVowel = getLastVowel(stem);
  if (stemVowel === null) {
    // No vowel in the stem: harmony cannot be checked, so accept.
    return true;
  }

  // Locate the first vowel of the suffix, scanning code unit by code unit.
  let suffixVowel: string | undefined;
  for (let i = 0; i < suffix.length; i++) {
    const unit = suffix.charAt(i);
    if (isVowel(unit)) {
      suffixVowel = unit;
      break;
    }
  }
  if (suffixVowel === undefined) {
    // No vowel in the suffix: harmony cannot be checked, so accept.
    return true;
  }

  return isBackVowel(stemVowel) ? isBackVowel(suffixVowel) : isFrontVowel(suffixVowel);
}
|
|
121
|
+
|
|
122
|
+
/**
|
|
123
|
+
* Suffix rules for Turkish verb conjugation.
|
|
124
|
+
* Each pattern includes all vowel harmony variants.
|
|
125
|
+
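* Grouped by tense, with compound (tense + person) suffixes listed before the
* bare tense suffixes so that longer forms are tried first (greedy matching).
* Note: entries are not strictly sorted by length within a group.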
|
|
126
|
+
*/
|
|
127
|
+
const TURKISH_SUFFIX_RULES: readonly SuffixRule[] = [
|
|
128
|
+
// Compound tense + person (longest patterns first)
|
|
129
|
+
// Present continuous + person
|
|
130
|
+
{ pattern: 'iyorsunuz', confidence: 0.82, conjugationType: 'progressive', minStemLength: 2 },
|
|
131
|
+
{ pattern: 'ıyorsunuz', confidence: 0.82, conjugationType: 'progressive', minStemLength: 2 },
|
|
132
|
+
{ pattern: 'üyorsunuz', confidence: 0.82, conjugationType: 'progressive', minStemLength: 2 },
|
|
133
|
+
{ pattern: 'uyorsunuz', confidence: 0.82, conjugationType: 'progressive', minStemLength: 2 },
|
|
134
|
+
{ pattern: 'iyorsun', confidence: 0.82, conjugationType: 'progressive', minStemLength: 2 },
|
|
135
|
+
{ pattern: 'ıyorsun', confidence: 0.82, conjugationType: 'progressive', minStemLength: 2 },
|
|
136
|
+
{ pattern: 'üyorsun', confidence: 0.82, conjugationType: 'progressive', minStemLength: 2 },
|
|
137
|
+
{ pattern: 'uyorsun', confidence: 0.82, conjugationType: 'progressive', minStemLength: 2 },
|
|
138
|
+
{ pattern: 'iyoruz', confidence: 0.82, conjugationType: 'progressive', minStemLength: 2 },
|
|
139
|
+
{ pattern: 'ıyoruz', confidence: 0.82, conjugationType: 'progressive', minStemLength: 2 },
|
|
140
|
+
{ pattern: 'üyoruz', confidence: 0.82, conjugationType: 'progressive', minStemLength: 2 },
|
|
141
|
+
{ pattern: 'uyoruz', confidence: 0.82, conjugationType: 'progressive', minStemLength: 2 },
|
|
142
|
+
{ pattern: 'iyorum', confidence: 0.85, conjugationType: 'progressive', minStemLength: 2 },
|
|
143
|
+
{ pattern: 'ıyorum', confidence: 0.85, conjugationType: 'progressive', minStemLength: 2 },
|
|
144
|
+
{ pattern: 'üyorum', confidence: 0.85, conjugationType: 'progressive', minStemLength: 2 },
|
|
145
|
+
{ pattern: 'uyorum', confidence: 0.85, conjugationType: 'progressive', minStemLength: 2 },
|
|
146
|
+
{ pattern: 'iyorlar', confidence: 0.82, conjugationType: 'progressive', minStemLength: 2 },
|
|
147
|
+
{ pattern: 'ıyorlar', confidence: 0.82, conjugationType: 'progressive', minStemLength: 2 },
|
|
148
|
+
{ pattern: 'üyorlar', confidence: 0.82, conjugationType: 'progressive', minStemLength: 2 },
|
|
149
|
+
{ pattern: 'uyorlar', confidence: 0.82, conjugationType: 'progressive', minStemLength: 2 },
|
|
150
|
+
|
|
151
|
+
// Future tense + person
|
|
152
|
+
{ pattern: 'eceksiniz', confidence: 0.82, conjugationType: 'future', minStemLength: 2 },
|
|
153
|
+
{ pattern: 'acaksınız', confidence: 0.82, conjugationType: 'future', minStemLength: 2 },
|
|
154
|
+
{ pattern: 'eceksin', confidence: 0.82, conjugationType: 'future', minStemLength: 2 },
|
|
155
|
+
{ pattern: 'acaksın', confidence: 0.82, conjugationType: 'future', minStemLength: 2 },
|
|
156
|
+
{ pattern: 'eceğiz', confidence: 0.82, conjugationType: 'future', minStemLength: 2 },
|
|
157
|
+
{ pattern: 'acağız', confidence: 0.82, conjugationType: 'future', minStemLength: 2 },
|
|
158
|
+
{ pattern: 'eceğim', confidence: 0.85, conjugationType: 'future', minStemLength: 2 },
|
|
159
|
+
{ pattern: 'acağım', confidence: 0.85, conjugationType: 'future', minStemLength: 2 },
|
|
160
|
+
{ pattern: 'ecekler', confidence: 0.82, conjugationType: 'future', minStemLength: 2 },
|
|
161
|
+
{ pattern: 'acaklar', confidence: 0.82, conjugationType: 'future', minStemLength: 2 },
|
|
162
|
+
|
|
163
|
+
// Reported past + person
|
|
164
|
+
{ pattern: 'mişsiniz', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
|
|
165
|
+
{ pattern: 'mışsınız', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
|
|
166
|
+
{ pattern: 'müşsünüz', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
|
|
167
|
+
{ pattern: 'muşsunuz', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
|
|
168
|
+
{ pattern: 'mişsin', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
|
|
169
|
+
{ pattern: 'mışsın', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
|
|
170
|
+
{ pattern: 'müşsün', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
|
|
171
|
+
{ pattern: 'muşsun', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
|
|
172
|
+
{ pattern: 'mişiz', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
|
|
173
|
+
{ pattern: 'mışız', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
|
|
174
|
+
{ pattern: 'müşüz', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
|
|
175
|
+
{ pattern: 'muşuz', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
|
|
176
|
+
{ pattern: 'mişim', confidence: 0.85, conjugationType: 'past', minStemLength: 2 },
|
|
177
|
+
{ pattern: 'mışım', confidence: 0.85, conjugationType: 'past', minStemLength: 2 },
|
|
178
|
+
{ pattern: 'müşüm', confidence: 0.85, conjugationType: 'past', minStemLength: 2 },
|
|
179
|
+
{ pattern: 'muşum', confidence: 0.85, conjugationType: 'past', minStemLength: 2 },
|
|
180
|
+
{ pattern: 'mişler', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
|
|
181
|
+
{ pattern: 'mışlar', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
|
|
182
|
+
{ pattern: 'müşler', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
|
|
183
|
+
{ pattern: 'muşlar', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
|
|
184
|
+
|
|
185
|
+
// Past tense + person
|
|
186
|
+
{ pattern: 'diniz', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
|
|
187
|
+
{ pattern: 'dınız', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
|
|
188
|
+
{ pattern: 'dünüz', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
|
|
189
|
+
{ pattern: 'dunuz', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
|
|
190
|
+
{ pattern: 'tiniz', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
|
|
191
|
+
{ pattern: 'tınız', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
|
|
192
|
+
{ pattern: 'tünüz', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
|
|
193
|
+
{ pattern: 'tunuz', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
|
|
194
|
+
{ pattern: 'diler', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
|
|
195
|
+
{ pattern: 'dılar', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
|
|
196
|
+
{ pattern: 'düler', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
|
|
197
|
+
{ pattern: 'dular', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
|
|
198
|
+
{ pattern: 'tiler', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
|
|
199
|
+
{ pattern: 'tılar', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
|
|
200
|
+
{ pattern: 'tüler', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
|
|
201
|
+
{ pattern: 'tular', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
|
|
202
|
+
{ pattern: 'din', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
|
|
203
|
+
{ pattern: 'dın', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
|
|
204
|
+
{ pattern: 'dün', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
|
|
205
|
+
{ pattern: 'dun', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
|
|
206
|
+
{ pattern: 'tin', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
|
|
207
|
+
{ pattern: 'tın', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
|
|
208
|
+
{ pattern: 'tün', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
|
|
209
|
+
{ pattern: 'tun', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
|
|
210
|
+
{ pattern: 'dik', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
|
|
211
|
+
{ pattern: 'dık', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
|
|
212
|
+
{ pattern: 'dük', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
|
|
213
|
+
{ pattern: 'duk', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
|
|
214
|
+
{ pattern: 'tik', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
|
|
215
|
+
{ pattern: 'tık', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
|
|
216
|
+
{ pattern: 'tük', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
|
|
217
|
+
{ pattern: 'tuk', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
|
|
218
|
+
{ pattern: 'dim', confidence: 0.85, conjugationType: 'past', minStemLength: 2 },
|
|
219
|
+
{ pattern: 'dım', confidence: 0.85, conjugationType: 'past', minStemLength: 2 },
|
|
220
|
+
{ pattern: 'düm', confidence: 0.85, conjugationType: 'past', minStemLength: 2 },
|
|
221
|
+
{ pattern: 'dum', confidence: 0.85, conjugationType: 'past', minStemLength: 2 },
|
|
222
|
+
{ pattern: 'tim', confidence: 0.85, conjugationType: 'past', minStemLength: 2 },
|
|
223
|
+
{ pattern: 'tım', confidence: 0.85, conjugationType: 'past', minStemLength: 2 },
|
|
224
|
+
{ pattern: 'tüm', confidence: 0.85, conjugationType: 'past', minStemLength: 2 },
|
|
225
|
+
{ pattern: 'tum', confidence: 0.85, conjugationType: 'past', minStemLength: 2 },
|
|
226
|
+
|
|
227
|
+
// Present continuous (no person - 3rd person singular)
|
|
228
|
+
{ pattern: 'iyor', confidence: 0.85, conjugationType: 'progressive', minStemLength: 2 },
|
|
229
|
+
{ pattern: 'ıyor', confidence: 0.85, conjugationType: 'progressive', minStemLength: 2 },
|
|
230
|
+
{ pattern: 'üyor', confidence: 0.85, conjugationType: 'progressive', minStemLength: 2 },
|
|
231
|
+
{ pattern: 'uyor', confidence: 0.85, conjugationType: 'progressive', minStemLength: 2 },
|
|
232
|
+
|
|
233
|
+
// Future (no person - 3rd person singular)
|
|
234
|
+
{ pattern: 'ecek', confidence: 0.85, conjugationType: 'future', minStemLength: 2 },
|
|
235
|
+
{ pattern: 'acak', confidence: 0.85, conjugationType: 'future', minStemLength: 2 },
|
|
236
|
+
|
|
237
|
+
// Reported past (no person - 3rd person singular)
|
|
238
|
+
{ pattern: 'miş', confidence: 0.85, conjugationType: 'past', minStemLength: 2 },
|
|
239
|
+
{ pattern: 'mış', confidence: 0.85, conjugationType: 'past', minStemLength: 2 },
|
|
240
|
+
{ pattern: 'müş', confidence: 0.85, conjugationType: 'past', minStemLength: 2 },
|
|
241
|
+
{ pattern: 'muş', confidence: 0.85, conjugationType: 'past', minStemLength: 2 },
|
|
242
|
+
|
|
243
|
+
// Simple past (no person - 3rd person singular)
|
|
244
|
+
{ pattern: 'di', confidence: 0.85, conjugationType: 'past', minStemLength: 2 },
|
|
245
|
+
{ pattern: 'dı', confidence: 0.85, conjugationType: 'past', minStemLength: 2 },
|
|
246
|
+
{ pattern: 'dü', confidence: 0.85, conjugationType: 'past', minStemLength: 2 },
|
|
247
|
+
{ pattern: 'du', confidence: 0.85, conjugationType: 'past', minStemLength: 2 },
|
|
248
|
+
{ pattern: 'ti', confidence: 0.85, conjugationType: 'past', minStemLength: 2 },
|
|
249
|
+
{ pattern: 'tı', confidence: 0.85, conjugationType: 'past', minStemLength: 2 },
|
|
250
|
+
{ pattern: 'tü', confidence: 0.85, conjugationType: 'past', minStemLength: 2 },
|
|
251
|
+
{ pattern: 'tu', confidence: 0.85, conjugationType: 'past', minStemLength: 2 },
|
|
252
|
+
|
|
253
|
+
// Infinitive
|
|
254
|
+
{ pattern: 'mek', confidence: 0.88, conjugationType: 'dictionary', minStemLength: 2 },
|
|
255
|
+
{ pattern: 'mak', confidence: 0.88, conjugationType: 'dictionary', minStemLength: 2 },
|
|
256
|
+
|
|
257
|
+
// Optative mood (let me/us...) - -eyim/-ayım/-elim/-alım
|
|
258
|
+
{ pattern: 'eyelim', confidence: 0.82, conjugationType: 'optative', minStemLength: 2 },
|
|
259
|
+
{ pattern: 'ayalım', confidence: 0.82, conjugationType: 'optative', minStemLength: 2 },
|
|
260
|
+
{ pattern: 'eyim', confidence: 0.82, conjugationType: 'optative', minStemLength: 2 },
|
|
261
|
+
{ pattern: 'ayım', confidence: 0.82, conjugationType: 'optative', minStemLength: 2 },
|
|
262
|
+
{ pattern: 'elim', confidence: 0.82, conjugationType: 'optative', minStemLength: 2 },
|
|
263
|
+
{ pattern: 'alım', confidence: 0.82, conjugationType: 'optative', minStemLength: 2 },
|
|
264
|
+
|
|
265
|
+
// Necessitative (must/should) - -meli/-malı
|
|
266
|
+
{ pattern: 'melisiniz', confidence: 0.82, conjugationType: 'necessitative', minStemLength: 2 },
|
|
267
|
+
{ pattern: 'malısınız', confidence: 0.82, conjugationType: 'necessitative', minStemLength: 2 },
|
|
268
|
+
{ pattern: 'melisin', confidence: 0.82, conjugationType: 'necessitative', minStemLength: 2 },
|
|
269
|
+
{ pattern: 'malısın', confidence: 0.82, conjugationType: 'necessitative', minStemLength: 2 },
|
|
270
|
+
{ pattern: 'meliyiz', confidence: 0.82, conjugationType: 'necessitative', minStemLength: 2 },
|
|
271
|
+
{ pattern: 'malıyız', confidence: 0.82, conjugationType: 'necessitative', minStemLength: 2 },
|
|
272
|
+
{ pattern: 'meliyim', confidence: 0.85, conjugationType: 'necessitative', minStemLength: 2 },
|
|
273
|
+
{ pattern: 'malıyım', confidence: 0.85, conjugationType: 'necessitative', minStemLength: 2 },
|
|
274
|
+
{ pattern: 'meliler', confidence: 0.82, conjugationType: 'necessitative', minStemLength: 2 },
|
|
275
|
+
{ pattern: 'malılar', confidence: 0.82, conjugationType: 'necessitative', minStemLength: 2 },
|
|
276
|
+
{ pattern: 'meli', confidence: 0.85, conjugationType: 'necessitative', minStemLength: 2 },
|
|
277
|
+
{ pattern: 'malı', confidence: 0.85, conjugationType: 'necessitative', minStemLength: 2 },
|
|
278
|
+
|
|
279
|
+
// Ability (can) - -ebil/-abil + tense suffixes
|
|
280
|
+
{ pattern: 'ebiliyor', confidence: 0.82, conjugationType: 'potential', minStemLength: 2 },
|
|
281
|
+
{ pattern: 'abiliyor', confidence: 0.82, conjugationType: 'potential', minStemLength: 2 },
|
|
282
|
+
{ pattern: 'ebilir', confidence: 0.85, conjugationType: 'potential', minStemLength: 2 },
|
|
283
|
+
{ pattern: 'abilir', confidence: 0.85, conjugationType: 'potential', minStemLength: 2 },
|
|
284
|
+
{ pattern: 'ebildi', confidence: 0.82, conjugationType: 'potential', minStemLength: 2 },
|
|
285
|
+
{ pattern: 'abildi', confidence: 0.82, conjugationType: 'potential', minStemLength: 2 },
|
|
286
|
+
{ pattern: 'ebilmek', confidence: 0.85, conjugationType: 'potential', minStemLength: 2 },
|
|
287
|
+
{ pattern: 'abilmek', confidence: 0.85, conjugationType: 'potential', minStemLength: 2 },
|
|
288
|
+
|
|
289
|
+
// Imperative (2nd person singular is just stem, 2nd person plural has suffix)
|
|
290
|
+
{ pattern: 'iniz', confidence: 0.82, conjugationType: 'imperative', minStemLength: 2 },
|
|
291
|
+
{ pattern: 'ınız', confidence: 0.82, conjugationType: 'imperative', minStemLength: 2 },
|
|
292
|
+
{ pattern: 'ünüz', confidence: 0.82, conjugationType: 'imperative', minStemLength: 2 },
|
|
293
|
+
{ pattern: 'unuz', confidence: 0.82, conjugationType: 'imperative', minStemLength: 2 },
|
|
294
|
+
{ pattern: 'in', confidence: 0.8, conjugationType: 'imperative', minStemLength: 2 },
|
|
295
|
+
{ pattern: 'ın', confidence: 0.8, conjugationType: 'imperative', minStemLength: 2 },
|
|
296
|
+
{ pattern: 'ün', confidence: 0.8, conjugationType: 'imperative', minStemLength: 2 },
|
|
297
|
+
{ pattern: 'un', confidence: 0.8, conjugationType: 'imperative', minStemLength: 2 },
|
|
298
|
+
|
|
299
|
+
// Passive voice
|
|
300
|
+
{ pattern: 'ildi', confidence: 0.82, conjugationType: 'passive', minStemLength: 2 },
|
|
301
|
+
{ pattern: 'ıldı', confidence: 0.82, conjugationType: 'passive', minStemLength: 2 },
|
|
302
|
+
{ pattern: 'üldü', confidence: 0.82, conjugationType: 'passive', minStemLength: 2 },
|
|
303
|
+
{ pattern: 'uldu', confidence: 0.82, conjugationType: 'passive', minStemLength: 2 },
|
|
304
|
+
{ pattern: 'ilir', confidence: 0.82, conjugationType: 'passive', minStemLength: 2 },
|
|
305
|
+
{ pattern: 'ılır', confidence: 0.82, conjugationType: 'passive', minStemLength: 2 },
|
|
306
|
+
{ pattern: 'ülür', confidence: 0.82, conjugationType: 'passive', minStemLength: 2 },
|
|
307
|
+
{ pattern: 'ulur', confidence: 0.82, conjugationType: 'passive', minStemLength: 2 },
|
|
308
|
+
|
|
309
|
+
// Causative
|
|
310
|
+
{ pattern: 'tirmek', confidence: 0.82, conjugationType: 'causative', minStemLength: 2 },
|
|
311
|
+
{ pattern: 'tırmak', confidence: 0.82, conjugationType: 'causative', minStemLength: 2 },
|
|
312
|
+
{ pattern: 'türmek', confidence: 0.82, conjugationType: 'causative', minStemLength: 2 },
|
|
313
|
+
{ pattern: 'turmak', confidence: 0.82, conjugationType: 'causative', minStemLength: 2 },
|
|
314
|
+
{ pattern: 'dirmek', confidence: 0.82, conjugationType: 'causative', minStemLength: 2 },
|
|
315
|
+
{ pattern: 'dırmak', confidence: 0.82, conjugationType: 'causative', minStemLength: 2 },
|
|
316
|
+
{ pattern: 'dürmek', confidence: 0.82, conjugationType: 'causative', minStemLength: 2 },
|
|
317
|
+
{ pattern: 'durmak', confidence: 0.82, conjugationType: 'causative', minStemLength: 2 },
|
|
318
|
+
|
|
319
|
+
// Negation + tense combinations (very common)
|
|
320
|
+
{ pattern: 'miyorsunuz', confidence: 0.8, conjugationType: 'negative', minStemLength: 2 },
|
|
321
|
+
{ pattern: 'mıyorsunuz', confidence: 0.8, conjugationType: 'negative', minStemLength: 2 },
|
|
322
|
+
{ pattern: 'müyorsunuz', confidence: 0.8, conjugationType: 'negative', minStemLength: 2 },
|
|
323
|
+
{ pattern: 'muyorsunuz', confidence: 0.8, conjugationType: 'negative', minStemLength: 2 },
|
|
324
|
+
{ pattern: 'miyorsun', confidence: 0.8, conjugationType: 'negative', minStemLength: 2 },
|
|
325
|
+
{ pattern: 'mıyorsun', confidence: 0.8, conjugationType: 'negative', minStemLength: 2 },
|
|
326
|
+
{ pattern: 'müyorsun', confidence: 0.8, conjugationType: 'negative', minStemLength: 2 },
|
|
327
|
+
{ pattern: 'muyorsun', confidence: 0.8, conjugationType: 'negative', minStemLength: 2 },
|
|
328
|
+
{ pattern: 'miyoruz', confidence: 0.8, conjugationType: 'negative', minStemLength: 2 },
|
|
329
|
+
{ pattern: 'mıyoruz', confidence: 0.8, conjugationType: 'negative', minStemLength: 2 },
|
|
330
|
+
{ pattern: 'müyoruz', confidence: 0.8, conjugationType: 'negative', minStemLength: 2 },
|
|
331
|
+
{ pattern: 'muyoruz', confidence: 0.8, conjugationType: 'negative', minStemLength: 2 },
|
|
332
|
+
{ pattern: 'miyorum', confidence: 0.82, conjugationType: 'negative', minStemLength: 2 },
|
|
333
|
+
{ pattern: 'mıyorum', confidence: 0.82, conjugationType: 'negative', minStemLength: 2 },
|
|
334
|
+
{ pattern: 'müyorum', confidence: 0.82, conjugationType: 'negative', minStemLength: 2 },
|
|
335
|
+
{ pattern: 'muyorum', confidence: 0.82, conjugationType: 'negative', minStemLength: 2 },
|
|
336
|
+
{ pattern: 'miyor', confidence: 0.82, conjugationType: 'negative', minStemLength: 2 },
|
|
337
|
+
{ pattern: 'mıyor', confidence: 0.82, conjugationType: 'negative', minStemLength: 2 },
|
|
338
|
+
{ pattern: 'müyor', confidence: 0.82, conjugationType: 'negative', minStemLength: 2 },
|
|
339
|
+
{ pattern: 'muyor', confidence: 0.82, conjugationType: 'negative', minStemLength: 2 },
|
|
340
|
+
{ pattern: 'medi', confidence: 0.82, conjugationType: 'negative', minStemLength: 2 },
|
|
341
|
+
{ pattern: 'madı', confidence: 0.82, conjugationType: 'negative', minStemLength: 2 },
|
|
342
|
+
{ pattern: 'me', confidence: 0.75, conjugationType: 'negative', minStemLength: 3 },
|
|
343
|
+
{ pattern: 'ma', confidence: 0.75, conjugationType: 'negative', minStemLength: 3 },
|
|
344
|
+
];
|
|
345
|
+
|
|
346
|
+
/**
 * Turkish morphological normalizer.
 *
 * Strips one suffix from a word using TURKISH_SUFFIX_RULES. Rules are tried
 * in array order and the first rule whose pattern ends the word and leaves a
 * long-enough stem is applied; no further suffixes are stripped afterwards.
 */
export class TurkishMorphologicalNormalizer implements MorphologicalNormalizer {
  readonly language = 'tr';

  /**
   * Check if a word might be a Turkish verb that can be normalized.
   *
   * @param word - The word to check
   * @returns true when `containsTurkish(word)` holds and the word has at
   *   least 3 characters
   */
  isNormalizable(word: string): boolean {
    // Must pass the Turkish-character check
    if (!containsTurkish(word)) return false;

    // Must be at least 3 characters (Turkish verb stems are usually 2+ chars)
    return word.length >= 3;
  }

  /**
   * Normalize a Turkish word to its stem form.
   *
   * @param word - The word to normalize
   * @returns The lowercased stem with the matched rule's confidence and
   *   metadata, or an unchanged result (original casing) when no rule applies
   */
  normalize(word: string): NormalizationResult {
    // Lowercase with Turkish casing rules. Plain toLowerCase() maps 'İ' to
    // 'i' + U+0307 (combining dot) and 'I' to 'i' instead of 'ı', so
    // upper-case input such as "GELDİ" would never end with a rule pattern.
    const lowerWord = word.toLocaleLowerCase('tr-TR');

    // Try suffix rules in declaration order; the first usable rule wins.
    for (const rule of TURKISH_SUFFIX_RULES) {
      if (!lowerWord.endsWith(rule.pattern)) continue;

      const stem = lowerWord.slice(0, -rule.pattern.length);

      // Reject rules that would leave too short a stem (over-stripping).
      const minLength = rule.minStemLength ?? 2;
      if (stem.length < minLength) continue;

      // A vowel-harmony mismatch lowers confidence by 10% but is still
      // accepted, since there are exceptions to harmony.
      const confidence = matchesVowelHarmony(stem, rule.pattern)
        ? rule.confidence
        : rule.confidence * 0.9;

      const metadata: { removedSuffixes: string[]; conjugationType?: ConjugationType } = {
        removedSuffixes: [rule.pattern],
      };
      if (rule.conjugationType) {
        metadata.conjugationType = rule.conjugationType;
      }
      return normalized(stem, confidence, metadata);
    }

    // No normalization needed
    return noChange(word);
  }
}
|
|
410
|
+
|
|
411
|
+
// Module-level instance of TurkishMorphologicalNormalizer, constructed once at import time and exported.
|
|
412
|
+
export const turkishMorphologicalNormalizer = new TurkishMorphologicalNormalizer();
|
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Morphological Normalizer Types
|
|
3
|
+
*
|
|
4
|
+
* Defines interfaces for language-specific morphological analysis.
|
|
5
|
+
* Normalizers reduce conjugated/inflected forms to canonical stems
|
|
6
|
+
* that can be matched against keyword dictionaries.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
/**
 * Outcome of a morphological normalization attempt.
 */
export interface NormalizationResult {
  /** Stem/root form extracted from the input word. */
  readonly stem: string;

  /** Confidence in this normalization, in the range 0.0-1.0. */
  readonly confidence: number;

  /** Details about the transformation, when any were recorded. */
  readonly metadata?: NormalizationMetadata;
}
|
|
22
|
+
|
|
23
|
+
/**
 * Optional details describing which morphological transformations
 * were applied to reach a stem.
 */
export interface NormalizationMetadata {
  /** Prefixes stripped from the word. */
  readonly removedPrefixes?: ReadonlyArray<string>;

  /** Suffixes stripped from the word. */
  readonly removedSuffixes?: ReadonlyArray<string>;

  /** Conjugation type that was detected. */
  readonly conjugationType?: ConjugationType;

  /** Classification of the original form. */
  readonly originalForm?: string;

  /** Transformation rules that were applied (for debugging). */
  readonly appliedRules?: ReadonlyArray<string>;
}
|
|
42
|
+
|
|
43
|
+
/**
 * Verb conjugation/inflection categories a normalizer can report.
 */
export type ConjugationType =
  // Tense
  | 'present'
  | 'past'
  | 'future'
  | 'progressive'
  | 'perfect'
  // Mood
  | 'imperative'
  | 'subjunctive'
  | 'conditional'
  // Voice
  | 'passive'
  | 'causative'
  // Politeness (Japanese/Korean)
  | 'polite'
  | 'humble'
  | 'honorific'
  // Form
  | 'negative'
  | 'potential'
  | 'volitional'
  // Japanese conditional forms
  | 'conditional-tara' // たら/したら - if/when (completed action)
  | 'conditional-to' // と/すると - when (habitual/expected)
  | 'conditional-ba' // ば/すれば - if (hypothetical)
  // Korean-specific
  | 'connective' // 하고, 해서 etc.
  | 'conditional-myeon' // -(으)면 - if/when (general conditional)
  | 'temporal-ttae' // -(으)ㄹ 때 - when (at the time of)
  | 'causal-nikka' // -(으)니까 - because/since
  // Korean honorific forms (-시- infix)
  | 'honorific-conditional' // -하시면 - if (honorific)
  | 'honorific-temporal' // -하실 때 - when (honorific)
  | 'honorific-causal' // -하시니까 - because (honorific)
  | 'honorific-past' // -하셨어요 - past (honorific)
  | 'honorific-polite' // -하십니다 - polite (honorific)
  // Korean sequential forms
  | 'sequential-after' // -고 나서 - after doing
  | 'sequential-before' // -기 전에 - before doing
  | 'immediate' // -자마자 - as soon as
  | 'obligation' // -아야/어야 해 - must do, should do
  // Spanish-specific
  | 'reflexive'
  | 'reflexive-imperative'
  | 'gerund'
  | 'participle'
  // Arabic-specific
  | 'conditional-idha' // إذا - if/when (hypothetical)
  | 'temporal-indama' // عندما - when (temporal conjunction)
  | 'temporal-hina' // حين - at the time of
  | 'temporal-lamma' // لمّا - when (past emphasis)
  | 'past-verb' // فعل ماضي - past tense verb
  // Turkish-specific
  | 'conditional-se' // -se/-sa - if (hypothetical)
  | 'temporal-ince' // -ince/-ınca/-unca/-ünce - when/as
  | 'temporal-dikce' // -dikçe/-dıkça/-dukça/-dükçe - as/while
  | 'aorist' // -ir/-ar - habitual/general
  | 'optative' // -eyim/-ayım/-elim/-alım - let me/us
  | 'necessitative' // -meli/-malı - must/should
  // Japanese request/contracted forms
  | 'request' // てください/でください - polite request
  | 'casual-request' // てくれ/でくれ - casual request
  | 'contracted' // ちゃう/じゃう - contracted completion (てしまう)
  | 'contracted-past' // ちゃった/じゃった - contracted past completion
  // Compound
  | 'compound' // Multi-layer suffixes (ていなかった, 하고나서였어)
  | 'te-form' // Japanese て-form
  | 'dictionary'; // Base/infinitive form
|
|
115
|
+
|
|
116
|
+
/**
 * Contract implemented by each language-specific morphological normalizer.
 *
 * A normalizer tries to reduce an inflected word form to its canonical
 * stem, so that conjugated verbs can be matched against keyword
 * dictionaries that only contain base forms.
 *
 * Example (Japanese):
 *   切り替えた (past) → { stem: '切り替え', confidence: 0.85 }
 *   切り替えます (polite) → { stem: '切り替え', confidence: 0.85 }
 *
 * Example (Spanish):
 *   mostrarse (reflexive infinitive) → { stem: 'mostrar', confidence: 0.85 }
 *   alternando (gerund) → { stem: 'alternar', confidence: 0.85 }
 */
export interface MorphologicalNormalizer {
  /** Code of the language this normalizer handles. */
  readonly language: string;

  /**
   * Reduce a word to its canonical stem form.
   *
   * @param word - The word to normalize
   * @returns The normalization result, carrying the stem and a confidence
   */
  normalize(word: string): NormalizationResult;

  /**
   * Report whether a word looks like a verb form that can be normalized.
   * This is an optional optimization that lets callers skip normalization
   * for non-verb tokens.
   *
   * @param word - The word to check
   * @returns true if the word might be a normalizable verb form
   */
  isNormalizable?(word: string): boolean;
}
|
|
152
|
+
|
|
153
|
+
/**
 * One suffix-stripping rule for suffix-based normalization.
 * Used by agglutinative languages (Japanese, Korean, Turkish).
 */
export interface SuffixRule {
  /** Suffix text to match. */
  readonly pattern: string;

  /** Confidence assigned when this suffix is stripped. */
  readonly confidence: number;

  /** Text to put in place of the suffix (empty string for simple removal). */
  readonly replacement?: string;

  /** Conjugation type indicated by this suffix. */
  readonly conjugationType?: ConjugationType;

  /** Minimum length of the stem left after stripping (avoids over-stripping). */
  readonly minStemLength?: number;
}
|
|
173
|
+
|
|
174
|
+
/**
 * One prefix-stripping rule for prefix-based normalization.
 * Used primarily by Arabic for article/conjunction prefixes.
 */
export interface PrefixRule {
  /** Prefix text to match. */
  readonly pattern: string;

  /** Confidence penalty applied when this prefix is stripped. */
  readonly confidencePenalty: number;

  /** Kind of element the prefix represents (for metadata). */
  readonly prefixType?: 'article' | 'conjunction' | 'preposition' | 'verb-marker';

  /** Minimum number of characters left after stripping (avoids over-stripping). */
  readonly minRemaining?: number;
}
|
|
191
|
+
|
|
192
|
+
/**
 * Build the result used when a word is left untouched: the word itself
 * becomes the stem, with full confidence (1.0) and no metadata.
 *
 * @param word - The word that was not normalized
 * @returns A result whose stem is `word` and whose confidence is 1.0
 */
export function noChange(word: string): NormalizationResult {
  const unchanged: NormalizationResult = { stem: word, confidence: 1.0 };
  return unchanged;
}
|
|
198
|
+
|
|
199
|
+
/**
 * Build a normalization result, attaching metadata only when it is given.
 *
 * @param stem - The extracted stem
 * @param confidence - Confidence in the normalization
 * @param metadata - Optional transformation details; when omitted (or
 *   otherwise falsy) the result has no `metadata` property at all
 * @returns The assembled normalization result
 */
export function normalized(
  stem: string,
  confidence: number,
  metadata?: NormalizationMetadata
): NormalizationResult {
  // Leave the key off entirely rather than setting `metadata: undefined`.
  return metadata ? { stem, confidence, metadata } : { stem, confidence };
}
|