@lokascript/semantic 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +686 -0
- package/dist/browser-ar.ar.global.js +2 -0
- package/dist/browser-core.core.global.js +2 -0
- package/dist/browser-de.de.global.js +2 -0
- package/dist/browser-east-asian.east-asian.global.js +2 -0
- package/dist/browser-en-tr.en-tr.global.js +2 -0
- package/dist/browser-en.en.global.js +2 -0
- package/dist/browser-es-en.es-en.global.js +2 -0
- package/dist/browser-es.es.global.js +2 -0
- package/dist/browser-fr.fr.global.js +2 -0
- package/dist/browser-id.id.global.js +2 -0
- package/dist/browser-ja.ja.global.js +2 -0
- package/dist/browser-ko.ko.global.js +2 -0
- package/dist/browser-lazy.lazy.global.js +2 -0
- package/dist/browser-priority.priority.global.js +2 -0
- package/dist/browser-pt.pt.global.js +2 -0
- package/dist/browser-qu.qu.global.js +2 -0
- package/dist/browser-sw.sw.global.js +2 -0
- package/dist/browser-tr.tr.global.js +2 -0
- package/dist/browser-western.western.global.js +2 -0
- package/dist/browser-zh.zh.global.js +2 -0
- package/dist/browser.global.js +3 -0
- package/dist/browser.global.js.map +1 -0
- package/dist/index.cjs +35051 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +3426 -0
- package/dist/index.d.ts +3426 -0
- package/dist/index.js +34890 -0
- package/dist/index.js.map +1 -0
- package/dist/languages/ar.d.ts +78 -0
- package/dist/languages/ar.js +1622 -0
- package/dist/languages/ar.js.map +1 -0
- package/dist/languages/de.d.ts +38 -0
- package/dist/languages/de.js +1168 -0
- package/dist/languages/de.js.map +1 -0
- package/dist/languages/en.d.ts +44 -0
- package/dist/languages/en.js +3491 -0
- package/dist/languages/en.js.map +1 -0
- package/dist/languages/es.d.ts +52 -0
- package/dist/languages/es.js +1493 -0
- package/dist/languages/es.js.map +1 -0
- package/dist/languages/fr.d.ts +37 -0
- package/dist/languages/fr.js +1159 -0
- package/dist/languages/fr.js.map +1 -0
- package/dist/languages/id.d.ts +35 -0
- package/dist/languages/id.js +1152 -0
- package/dist/languages/id.js.map +1 -0
- package/dist/languages/ja.d.ts +53 -0
- package/dist/languages/ja.js +1430 -0
- package/dist/languages/ja.js.map +1 -0
- package/dist/languages/ko.d.ts +51 -0
- package/dist/languages/ko.js +1729 -0
- package/dist/languages/ko.js.map +1 -0
- package/dist/languages/pt.d.ts +37 -0
- package/dist/languages/pt.js +1127 -0
- package/dist/languages/pt.js.map +1 -0
- package/dist/languages/qu.d.ts +36 -0
- package/dist/languages/qu.js +1143 -0
- package/dist/languages/qu.js.map +1 -0
- package/dist/languages/sw.d.ts +35 -0
- package/dist/languages/sw.js +1147 -0
- package/dist/languages/sw.js.map +1 -0
- package/dist/languages/tr.d.ts +45 -0
- package/dist/languages/tr.js +1529 -0
- package/dist/languages/tr.js.map +1 -0
- package/dist/languages/zh.d.ts +58 -0
- package/dist/languages/zh.js +1257 -0
- package/dist/languages/zh.js.map +1 -0
- package/dist/types-C4dcj53L.d.ts +600 -0
- package/package.json +202 -0
- package/src/__test-utils__/index.ts +7 -0
- package/src/__test-utils__/test-helpers.ts +8 -0
- package/src/__types__/test-helpers.ts +122 -0
- package/src/analysis/index.ts +479 -0
- package/src/ast-builder/command-mappers.ts +1133 -0
- package/src/ast-builder/expression-parser/index.ts +41 -0
- package/src/ast-builder/expression-parser/parser.ts +563 -0
- package/src/ast-builder/expression-parser/tokenizer.ts +394 -0
- package/src/ast-builder/expression-parser/types.ts +208 -0
- package/src/ast-builder/index.ts +536 -0
- package/src/ast-builder/value-converters.ts +172 -0
- package/src/bridge.ts +275 -0
- package/src/browser-ar.ts +162 -0
- package/src/browser-core.ts +231 -0
- package/src/browser-de.ts +162 -0
- package/src/browser-east-asian.ts +173 -0
- package/src/browser-en-tr.ts +165 -0
- package/src/browser-en.ts +157 -0
- package/src/browser-es-en.ts +200 -0
- package/src/browser-es.ts +170 -0
- package/src/browser-fr.ts +162 -0
- package/src/browser-id.ts +162 -0
- package/src/browser-ja.ts +162 -0
- package/src/browser-ko.ts +162 -0
- package/src/browser-lazy.ts +189 -0
- package/src/browser-priority.ts +214 -0
- package/src/browser-pt.ts +162 -0
- package/src/browser-qu.ts +162 -0
- package/src/browser-sw.ts +162 -0
- package/src/browser-tr.ts +162 -0
- package/src/browser-western.ts +181 -0
- package/src/browser-zh.ts +162 -0
- package/src/browser.ts +268 -0
- package/src/cache/index.ts +14 -0
- package/src/cache/semantic-cache.ts +344 -0
- package/src/core-bridge.ts +372 -0
- package/src/explicit/converter.ts +258 -0
- package/src/explicit/index.ts +18 -0
- package/src/explicit/parser.ts +236 -0
- package/src/explicit/renderer.ts +424 -0
- package/src/generators/command-schemas.ts +1636 -0
- package/src/generators/event-handler-generator.ts +109 -0
- package/src/generators/index.ts +117 -0
- package/src/generators/language-profiles.ts +139 -0
- package/src/generators/pattern-generator.ts +537 -0
- package/src/generators/profiles/arabic.ts +131 -0
- package/src/generators/profiles/bengali.ts +132 -0
- package/src/generators/profiles/chinese.ts +124 -0
- package/src/generators/profiles/english.ts +113 -0
- package/src/generators/profiles/french.ts +125 -0
- package/src/generators/profiles/german.ts +126 -0
- package/src/generators/profiles/hindi.ts +146 -0
- package/src/generators/profiles/index.ts +46 -0
- package/src/generators/profiles/indonesian.ts +125 -0
- package/src/generators/profiles/italian.ts +139 -0
- package/src/generators/profiles/japanese.ts +149 -0
- package/src/generators/profiles/korean.ts +127 -0
- package/src/generators/profiles/marker-templates.ts +288 -0
- package/src/generators/profiles/ms.ts +130 -0
- package/src/generators/profiles/polish.ts +249 -0
- package/src/generators/profiles/portuguese.ts +115 -0
- package/src/generators/profiles/quechua.ts +113 -0
- package/src/generators/profiles/russian.ts +260 -0
- package/src/generators/profiles/spanish.ts +130 -0
- package/src/generators/profiles/swahili.ts +129 -0
- package/src/generators/profiles/thai.ts +132 -0
- package/src/generators/profiles/tl.ts +128 -0
- package/src/generators/profiles/turkish.ts +124 -0
- package/src/generators/profiles/types.ts +165 -0
- package/src/generators/profiles/ukrainian.ts +270 -0
- package/src/generators/profiles/vietnamese.ts +133 -0
- package/src/generators/schema-error-codes.ts +160 -0
- package/src/generators/schema-validator.ts +391 -0
- package/src/index.ts +429 -0
- package/src/language-building-schema.ts +3170 -0
- package/src/language-loader.ts +394 -0
- package/src/languages/_all.ts +65 -0
- package/src/languages/ar.ts +15 -0
- package/src/languages/bn.ts +16 -0
- package/src/languages/de.ts +15 -0
- package/src/languages/en.ts +29 -0
- package/src/languages/es.ts +15 -0
- package/src/languages/fr.ts +15 -0
- package/src/languages/hi.ts +26 -0
- package/src/languages/id.ts +15 -0
- package/src/languages/index.ts +18 -0
- package/src/languages/it.ts +15 -0
- package/src/languages/ja.ts +15 -0
- package/src/languages/ko.ts +15 -0
- package/src/languages/ms.ts +16 -0
- package/src/languages/pl.ts +18 -0
- package/src/languages/pt.ts +15 -0
- package/src/languages/qu.ts +15 -0
- package/src/languages/ru.ts +26 -0
- package/src/languages/sw.ts +15 -0
- package/src/languages/th.ts +16 -0
- package/src/languages/tl.ts +16 -0
- package/src/languages/tr.ts +15 -0
- package/src/languages/uk.ts +26 -0
- package/src/languages/vi.ts +16 -0
- package/src/languages/zh.ts +15 -0
- package/src/parser/index.ts +15 -0
- package/src/parser/pattern-matcher.ts +1181 -0
- package/src/parser/semantic-parser.ts +573 -0
- package/src/parser/utils/index.ts +35 -0
- package/src/parser/utils/marker-resolution.ts +111 -0
- package/src/parser/utils/possessive-keywords.ts +43 -0
- package/src/parser/utils/role-positioning.ts +70 -0
- package/src/parser/utils/type-validation.ts +134 -0
- package/src/patterns/add/ar.ts +71 -0
- package/src/patterns/add/bn.ts +70 -0
- package/src/patterns/add/hi.ts +69 -0
- package/src/patterns/add/index.ts +87 -0
- package/src/patterns/add/it.ts +61 -0
- package/src/patterns/add/ja.ts +93 -0
- package/src/patterns/add/ko.ts +74 -0
- package/src/patterns/add/ms.ts +30 -0
- package/src/patterns/add/pl.ts +62 -0
- package/src/patterns/add/ru.ts +62 -0
- package/src/patterns/add/th.ts +49 -0
- package/src/patterns/add/tl.ts +30 -0
- package/src/patterns/add/tr.ts +71 -0
- package/src/patterns/add/uk.ts +62 -0
- package/src/patterns/add/vi.ts +61 -0
- package/src/patterns/add/zh.ts +71 -0
- package/src/patterns/builders.ts +207 -0
- package/src/patterns/decrement/bn.ts +70 -0
- package/src/patterns/decrement/de.ts +42 -0
- package/src/patterns/decrement/hi.ts +68 -0
- package/src/patterns/decrement/index.ts +79 -0
- package/src/patterns/decrement/it.ts +69 -0
- package/src/patterns/decrement/ms.ts +30 -0
- package/src/patterns/decrement/pl.ts +58 -0
- package/src/patterns/decrement/ru.ts +58 -0
- package/src/patterns/decrement/th.ts +49 -0
- package/src/patterns/decrement/tl.ts +30 -0
- package/src/patterns/decrement/tr.ts +48 -0
- package/src/patterns/decrement/uk.ts +58 -0
- package/src/patterns/decrement/vi.ts +61 -0
- package/src/patterns/decrement/zh.ts +32 -0
- package/src/patterns/en.ts +302 -0
- package/src/patterns/event-handler/ar.ts +151 -0
- package/src/patterns/event-handler/bn.ts +72 -0
- package/src/patterns/event-handler/de.ts +117 -0
- package/src/patterns/event-handler/en.ts +117 -0
- package/src/patterns/event-handler/es.ts +136 -0
- package/src/patterns/event-handler/fr.ts +117 -0
- package/src/patterns/event-handler/hi.ts +64 -0
- package/src/patterns/event-handler/id.ts +117 -0
- package/src/patterns/event-handler/index.ts +119 -0
- package/src/patterns/event-handler/it.ts +54 -0
- package/src/patterns/event-handler/ja.ts +118 -0
- package/src/patterns/event-handler/ko.ts +133 -0
- package/src/patterns/event-handler/ms.ts +30 -0
- package/src/patterns/event-handler/pl.ts +62 -0
- package/src/patterns/event-handler/pt.ts +117 -0
- package/src/patterns/event-handler/qu.ts +66 -0
- package/src/patterns/event-handler/ru.ts +62 -0
- package/src/patterns/event-handler/shared.ts +270 -0
- package/src/patterns/event-handler/sw.ts +117 -0
- package/src/patterns/event-handler/th.ts +53 -0
- package/src/patterns/event-handler/tl.ts +30 -0
- package/src/patterns/event-handler/tr.ts +170 -0
- package/src/patterns/event-handler/uk.ts +62 -0
- package/src/patterns/event-handler/vi.ts +61 -0
- package/src/patterns/event-handler/zh.ts +150 -0
- package/src/patterns/get/ar.ts +49 -0
- package/src/patterns/get/bn.ts +47 -0
- package/src/patterns/get/de.ts +32 -0
- package/src/patterns/get/hi.ts +52 -0
- package/src/patterns/get/index.ts +83 -0
- package/src/patterns/get/it.ts +56 -0
- package/src/patterns/get/ja.ts +53 -0
- package/src/patterns/get/ko.ts +53 -0
- package/src/patterns/get/ms.ts +30 -0
- package/src/patterns/get/pl.ts +57 -0
- package/src/patterns/get/ru.ts +57 -0
- package/src/patterns/get/th.ts +29 -0
- package/src/patterns/get/tl.ts +30 -0
- package/src/patterns/get/uk.ts +57 -0
- package/src/patterns/get/vi.ts +48 -0
- package/src/patterns/grammar-transformed/index.ts +39 -0
- package/src/patterns/grammar-transformed/ja.ts +1713 -0
- package/src/patterns/grammar-transformed/ko.ts +1311 -0
- package/src/patterns/grammar-transformed/tr.ts +1067 -0
- package/src/patterns/hide/ar.ts +67 -0
- package/src/patterns/hide/bn.ts +47 -0
- package/src/patterns/hide/de.ts +36 -0
- package/src/patterns/hide/hi.ts +61 -0
- package/src/patterns/hide/index.ts +91 -0
- package/src/patterns/hide/it.ts +56 -0
- package/src/patterns/hide/ja.ts +69 -0
- package/src/patterns/hide/ko.ts +69 -0
- package/src/patterns/hide/ms.ts +30 -0
- package/src/patterns/hide/pl.ts +57 -0
- package/src/patterns/hide/ru.ts +57 -0
- package/src/patterns/hide/th.ts +29 -0
- package/src/patterns/hide/tl.ts +30 -0
- package/src/patterns/hide/tr.ts +65 -0
- package/src/patterns/hide/uk.ts +57 -0
- package/src/patterns/hide/vi.ts +56 -0
- package/src/patterns/hide/zh.ts +68 -0
- package/src/patterns/increment/bn.ts +70 -0
- package/src/patterns/increment/de.ts +36 -0
- package/src/patterns/increment/hi.ts +68 -0
- package/src/patterns/increment/index.ts +79 -0
- package/src/patterns/increment/it.ts +69 -0
- package/src/patterns/increment/ms.ts +30 -0
- package/src/patterns/increment/pl.ts +58 -0
- package/src/patterns/increment/ru.ts +58 -0
- package/src/patterns/increment/th.ts +49 -0
- package/src/patterns/increment/tl.ts +30 -0
- package/src/patterns/increment/tr.ts +52 -0
- package/src/patterns/increment/uk.ts +58 -0
- package/src/patterns/increment/vi.ts +61 -0
- package/src/patterns/increment/zh.ts +32 -0
- package/src/patterns/index.ts +84 -0
- package/src/patterns/languages/en/control-flow.ts +93 -0
- package/src/patterns/languages/en/fetch.ts +62 -0
- package/src/patterns/languages/en/index.ts +42 -0
- package/src/patterns/languages/en/repeat.ts +67 -0
- package/src/patterns/languages/en/set.ts +48 -0
- package/src/patterns/languages/en/swap.ts +38 -0
- package/src/patterns/languages/en/temporal.ts +57 -0
- package/src/patterns/put/ar.ts +74 -0
- package/src/patterns/put/bn.ts +53 -0
- package/src/patterns/put/en.ts +74 -0
- package/src/patterns/put/es.ts +74 -0
- package/src/patterns/put/hi.ts +69 -0
- package/src/patterns/put/id.ts +96 -0
- package/src/patterns/put/index.ts +99 -0
- package/src/patterns/put/it.ts +56 -0
- package/src/patterns/put/ja.ts +75 -0
- package/src/patterns/put/ko.ts +67 -0
- package/src/patterns/put/ms.ts +30 -0
- package/src/patterns/put/pl.ts +81 -0
- package/src/patterns/put/ru.ts +85 -0
- package/src/patterns/put/th.ts +32 -0
- package/src/patterns/put/tl.ts +30 -0
- package/src/patterns/put/tr.ts +67 -0
- package/src/patterns/put/uk.ts +85 -0
- package/src/patterns/put/vi.ts +72 -0
- package/src/patterns/put/zh.ts +62 -0
- package/src/patterns/registry.ts +163 -0
- package/src/patterns/remove/ar.ts +71 -0
- package/src/patterns/remove/bn.ts +68 -0
- package/src/patterns/remove/hi.ts +69 -0
- package/src/patterns/remove/index.ts +87 -0
- package/src/patterns/remove/it.ts +69 -0
- package/src/patterns/remove/ja.ts +74 -0
- package/src/patterns/remove/ko.ts +78 -0
- package/src/patterns/remove/ms.ts +30 -0
- package/src/patterns/remove/pl.ts +62 -0
- package/src/patterns/remove/ru.ts +62 -0
- package/src/patterns/remove/th.ts +49 -0
- package/src/patterns/remove/tl.ts +30 -0
- package/src/patterns/remove/tr.ts +78 -0
- package/src/patterns/remove/uk.ts +62 -0
- package/src/patterns/remove/vi.ts +61 -0
- package/src/patterns/remove/zh.ts +72 -0
- package/src/patterns/set/ar.ts +84 -0
- package/src/patterns/set/bn.ts +53 -0
- package/src/patterns/set/de.ts +84 -0
- package/src/patterns/set/es.ts +92 -0
- package/src/patterns/set/fr.ts +88 -0
- package/src/patterns/set/hi.ts +56 -0
- package/src/patterns/set/id.ts +84 -0
- package/src/patterns/set/index.ts +107 -0
- package/src/patterns/set/it.ts +56 -0
- package/src/patterns/set/ja.ts +86 -0
- package/src/patterns/set/ko.ts +85 -0
- package/src/patterns/set/ms.ts +30 -0
- package/src/patterns/set/pl.ts +57 -0
- package/src/patterns/set/pt.ts +84 -0
- package/src/patterns/set/ru.ts +57 -0
- package/src/patterns/set/th.ts +31 -0
- package/src/patterns/set/tl.ts +30 -0
- package/src/patterns/set/tr.ts +107 -0
- package/src/patterns/set/uk.ts +57 -0
- package/src/patterns/set/vi.ts +53 -0
- package/src/patterns/set/zh.ts +84 -0
- package/src/patterns/show/ar.ts +67 -0
- package/src/patterns/show/bn.ts +47 -0
- package/src/patterns/show/de.ts +32 -0
- package/src/patterns/show/fr.ts +32 -0
- package/src/patterns/show/hi.ts +61 -0
- package/src/patterns/show/index.ts +95 -0
- package/src/patterns/show/it.ts +56 -0
- package/src/patterns/show/ja.ts +69 -0
- package/src/patterns/show/ko.ts +73 -0
- package/src/patterns/show/ms.ts +30 -0
- package/src/patterns/show/pl.ts +57 -0
- package/src/patterns/show/ru.ts +57 -0
- package/src/patterns/show/th.ts +29 -0
- package/src/patterns/show/tl.ts +30 -0
- package/src/patterns/show/tr.ts +65 -0
- package/src/patterns/show/uk.ts +57 -0
- package/src/patterns/show/vi.ts +56 -0
- package/src/patterns/show/zh.ts +68 -0
- package/src/patterns/take/ar.ts +51 -0
- package/src/patterns/take/index.ts +31 -0
- package/src/patterns/toggle/ar.ts +61 -0
- package/src/patterns/toggle/bn.ts +70 -0
- package/src/patterns/toggle/en.ts +61 -0
- package/src/patterns/toggle/es.ts +61 -0
- package/src/patterns/toggle/hi.ts +80 -0
- package/src/patterns/toggle/index.ts +95 -0
- package/src/patterns/toggle/it.ts +69 -0
- package/src/patterns/toggle/ja.ts +156 -0
- package/src/patterns/toggle/ko.ts +113 -0
- package/src/patterns/toggle/ms.ts +30 -0
- package/src/patterns/toggle/pl.ts +62 -0
- package/src/patterns/toggle/ru.ts +62 -0
- package/src/patterns/toggle/th.ts +50 -0
- package/src/patterns/toggle/tl.ts +30 -0
- package/src/patterns/toggle/tr.ts +88 -0
- package/src/patterns/toggle/uk.ts +62 -0
- package/src/patterns/toggle/vi.ts +61 -0
- package/src/patterns/toggle/zh.ts +99 -0
- package/src/public-api.ts +286 -0
- package/src/registry.ts +441 -0
- package/src/tokenizers/arabic.ts +723 -0
- package/src/tokenizers/base.ts +1300 -0
- package/src/tokenizers/bengali.ts +289 -0
- package/src/tokenizers/chinese.ts +481 -0
- package/src/tokenizers/english.ts +416 -0
- package/src/tokenizers/french.ts +326 -0
- package/src/tokenizers/german.ts +324 -0
- package/src/tokenizers/hindi.ts +319 -0
- package/src/tokenizers/index.ts +127 -0
- package/src/tokenizers/indonesian.ts +306 -0
- package/src/tokenizers/italian.ts +458 -0
- package/src/tokenizers/japanese.ts +447 -0
- package/src/tokenizers/korean.ts +642 -0
- package/src/tokenizers/morphology/arabic-normalizer.ts +242 -0
- package/src/tokenizers/morphology/french-normalizer.ts +268 -0
- package/src/tokenizers/morphology/german-normalizer.ts +256 -0
- package/src/tokenizers/morphology/index.ts +46 -0
- package/src/tokenizers/morphology/italian-normalizer.ts +329 -0
- package/src/tokenizers/morphology/japanese-normalizer.ts +288 -0
- package/src/tokenizers/morphology/korean-normalizer.ts +428 -0
- package/src/tokenizers/morphology/polish-normalizer.ts +264 -0
- package/src/tokenizers/morphology/portuguese-normalizer.ts +310 -0
- package/src/tokenizers/morphology/spanish-normalizer.ts +327 -0
- package/src/tokenizers/morphology/turkish-normalizer.ts +412 -0
- package/src/tokenizers/morphology/types.ts +211 -0
- package/src/tokenizers/ms.ts +198 -0
- package/src/tokenizers/polish.ts +354 -0
- package/src/tokenizers/portuguese.ts +304 -0
- package/src/tokenizers/quechua.ts +339 -0
- package/src/tokenizers/russian.ts +375 -0
- package/src/tokenizers/spanish.ts +403 -0
- package/src/tokenizers/swahili.ts +303 -0
- package/src/tokenizers/thai.ts +236 -0
- package/src/tokenizers/tl.ts +198 -0
- package/src/tokenizers/turkish.ts +411 -0
- package/src/tokenizers/ukrainian.ts +369 -0
- package/src/tokenizers/vietnamese.ts +410 -0
- package/src/types/grammar-types.ts +617 -0
- package/src/types/unified-profile.ts +267 -0
- package/src/types.ts +709 -0
- package/src/utils/confidence-calculator.ts +147 -0
- package/src/validators/command-validator.ts +380 -0
- package/src/validators/index.ts +15 -0
|
@@ -0,0 +1,428 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Korean Morphological Normalizer
|
|
3
|
+
*
|
|
4
|
+
* Reduces Korean verb conjugations to their stem forms.
|
|
5
|
+
* Korean verbs conjugate by modifying their endings:
|
|
6
|
+
*
|
|
7
|
+
* Base: 토글 (togeul) - "toggle" (loanword)
|
|
8
|
+
* 다 ending: 토글하다 (togeul-hada) - "to toggle" (dictionary form)
|
|
9
|
+
* 요 ending: 토글해요 (togeul-haeyo) - polite present
|
|
10
|
+
* 니다 ending: 토글합니다 (togeul-hamnida) - formal present
|
|
11
|
+
* 세요 ending: 토글하세요 (togeul-haseyo) - honorific request
|
|
12
|
+
* 았/었 past: 토글했어 (togeul-haesseo) - informal past
|
|
13
|
+
*
|
|
14
|
+
* Korean also has vowel harmony affecting suffix forms.
|
|
15
|
+
*
|
|
16
|
+
* This normalizer strips these suffixes to find the stem,
|
|
17
|
+
* which can then be matched against keyword dictionaries.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
import type {
|
|
21
|
+
MorphologicalNormalizer,
|
|
22
|
+
NormalizationResult,
|
|
23
|
+
SuffixRule,
|
|
24
|
+
ConjugationType,
|
|
25
|
+
} from './types';
|
|
26
|
+
import { noChange, normalized } from './types';
|
|
27
|
+
|
|
28
|
+
/**
 * Tell whether `char` starts with a precomposed Hangul syllable.
 *
 * Only the first UTF-16 code unit is examined, and only the Hangul
 * Syllables range (U+AC00 through U+D7A3) is accepted. An empty string
 * produces `false`, because its first code unit is NaN and fails both
 * comparisons.
 */
function isHangul(char: string): boolean {
  const firstSyllable = 0xac00;
  const lastSyllable = 0xd7a3;
  const unit = char.charCodeAt(0);

  if (unit < firstSyllable) return false;
  return unit <= lastSyllable;
}
|
|
36
|
+
|
|
37
|
+
/**
 * Report whether at least one character of `word` is a Hangul syllable,
 * as decided by `isHangul`. An empty word yields `false`.
 */
function containsKorean(word: string): boolean {
  // Array.from walks the string by code point, the same way for...of does.
  return Array.from(word).some((symbol) => isHangul(symbol));
}
|
|
46
|
+
|
|
47
|
+
/**
 * Suffix rules for Korean verb conjugation.
 *
 * Rules are grouped by grammatical category rather than sorted strictly by
 * length. The ordering invariant that matters for greedy matching is:
 * whenever one pattern is the tail of another (e.g. '하다' is the tail of
 * '안하다'), the longer pattern must appear first. Otherwise the shorter
 * pattern matches every word the longer one would, and the longer rule can
 * never be selected.
 *
 * NOTE(review): this assumes the consumer takes the first rule whose pattern
 * the word ends with — confirm against the normalizer's lookup loop.
 */
const KOREAN_SUFFIX_RULES: readonly SuffixRule[] = [
  // Honorific conditional/temporal forms (-시- infix)
  // These are critical for polite/formal Korean
  { pattern: '하시니까', confidence: 0.85, conjugationType: 'honorific-causal', minStemLength: 1 },
  { pattern: '하실때', confidence: 0.88, conjugationType: 'honorific-temporal', minStemLength: 1 },
  { pattern: '하실 때', confidence: 0.88, conjugationType: 'honorific-temporal', minStemLength: 1 },
  {
    pattern: '하시면',
    confidence: 0.88,
    conjugationType: 'honorific-conditional',
    minStemLength: 1,
  },
  {
    pattern: '으시면',
    confidence: 0.85,
    conjugationType: 'honorific-conditional',
    minStemLength: 2,
  },
  { pattern: '시면', confidence: 0.82, conjugationType: 'honorific-conditional', minStemLength: 2 },

  // Sequential/temporal forms - "after doing", "before doing", "as soon as"
  { pattern: '하고나서', confidence: 0.85, conjugationType: 'sequential-after', minStemLength: 1 },
  { pattern: '하고 나서', confidence: 0.85, conjugationType: 'sequential-after', minStemLength: 1 },
  { pattern: '하고서', confidence: 0.85, conjugationType: 'sequential-after', minStemLength: 1 },
  { pattern: '고나서', confidence: 0.82, conjugationType: 'sequential-after', minStemLength: 2 },
  { pattern: '고 나서', confidence: 0.82, conjugationType: 'sequential-after', minStemLength: 2 },
  { pattern: '고서', confidence: 0.82, conjugationType: 'sequential-after', minStemLength: 2 },
  { pattern: '하기전에', confidence: 0.85, conjugationType: 'sequential-before', minStemLength: 1 },
  {
    pattern: '하기 전에',
    confidence: 0.85,
    conjugationType: 'sequential-before',
    minStemLength: 1,
  },
  { pattern: '기전에', confidence: 0.82, conjugationType: 'sequential-before', minStemLength: 2 },
  { pattern: '기 전에', confidence: 0.82, conjugationType: 'sequential-before', minStemLength: 2 },
  { pattern: '하자마자', confidence: 0.88, conjugationType: 'immediate', minStemLength: 1 },
  { pattern: '자마자', confidence: 0.85, conjugationType: 'immediate', minStemLength: 2 },

  // Obligation forms - "must do", "should do"
  { pattern: '해야해요', confidence: 0.85, conjugationType: 'obligation', minStemLength: 1 },
  { pattern: '해야해', confidence: 0.85, conjugationType: 'obligation', minStemLength: 1 },
  { pattern: '해야하다', confidence: 0.85, conjugationType: 'obligation', minStemLength: 1 },
  { pattern: '어야해요', confidence: 0.82, conjugationType: 'obligation', minStemLength: 2 },
  { pattern: '어야해', confidence: 0.82, conjugationType: 'obligation', minStemLength: 2 },
  { pattern: '아야해요', confidence: 0.82, conjugationType: 'obligation', minStemLength: 2 },
  { pattern: '아야해', confidence: 0.82, conjugationType: 'obligation', minStemLength: 2 },

  // Conditional forms - most natural for event handlers (longest first)
  // These are critical for native Korean idioms like "클릭하면 증가"
  { pattern: '하니까', confidence: 0.85, conjugationType: 'causal-nikka', minStemLength: 1 },
  { pattern: '할때', confidence: 0.88, conjugationType: 'temporal-ttae', minStemLength: 1 },
  { pattern: '할 때', confidence: 0.88, conjugationType: 'temporal-ttae', minStemLength: 1 },
  { pattern: '을때', confidence: 0.85, conjugationType: 'temporal-ttae', minStemLength: 2 },
  { pattern: '을 때', confidence: 0.85, conjugationType: 'temporal-ttae', minStemLength: 2 },
  { pattern: '하면', confidence: 0.88, conjugationType: 'conditional-myeon', minStemLength: 1 },
  { pattern: '으면', confidence: 0.85, conjugationType: 'conditional-myeon', minStemLength: 2 },
  { pattern: '니까', confidence: 0.82, conjugationType: 'causal-nikka', minStemLength: 2 },
  { pattern: '면', confidence: 0.8, conjugationType: 'conditional-myeon', minStemLength: 2 },

  // Formal polite forms (longest first)
  { pattern: '하였습니다', confidence: 0.85, conjugationType: 'past', minStemLength: 1 },
  { pattern: '했습니다', confidence: 0.85, conjugationType: 'past', minStemLength: 1 },
  { pattern: '합니다', confidence: 0.85, conjugationType: 'polite', minStemLength: 1 },
  { pattern: '습니다', confidence: 0.82, conjugationType: 'polite', minStemLength: 2 },
  { pattern: '됩니다', confidence: 0.82, conjugationType: 'polite', minStemLength: 1 },
  { pattern: 'ㅂ니다', confidence: 0.82, conjugationType: 'polite', minStemLength: 2 },

  // Honorific request forms
  { pattern: '하세요', confidence: 0.85, conjugationType: 'honorific', minStemLength: 1 },
  { pattern: '하십시오', confidence: 0.85, conjugationType: 'honorific', minStemLength: 1 },
  { pattern: '세요', confidence: 0.82, conjugationType: 'honorific', minStemLength: 2 },
  { pattern: '십시오', confidence: 0.82, conjugationType: 'honorific', minStemLength: 2 },

  // Informal polite (요) forms
  { pattern: '하고있어요', confidence: 0.82, conjugationType: 'progressive', minStemLength: 1 },
  { pattern: '하고있어', confidence: 0.82, conjugationType: 'progressive', minStemLength: 1 },
  { pattern: '했어요', confidence: 0.85, conjugationType: 'past', minStemLength: 1 },
  { pattern: '해요', confidence: 0.85, conjugationType: 'polite', minStemLength: 1 },
  { pattern: '어요', confidence: 0.82, conjugationType: 'polite', minStemLength: 2 },
  { pattern: '아요', confidence: 0.82, conjugationType: 'polite', minStemLength: 2 },

  // Informal (반말) forms
  { pattern: '했어', confidence: 0.85, conjugationType: 'past', minStemLength: 1 },
  { pattern: '해', confidence: 0.8, conjugationType: 'present', minStemLength: 1 },
  { pattern: '었어', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
  { pattern: '았어', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },

  // Progressive forms
  { pattern: '하고있다', confidence: 0.82, conjugationType: 'progressive', minStemLength: 1 },
  { pattern: '고있다', confidence: 0.8, conjugationType: 'progressive', minStemLength: 2 },
  { pattern: '고있어', confidence: 0.8, conjugationType: 'progressive', minStemLength: 2 },

  // Negative forms
  // Listed before the dictionary form: '안하다' ends in '하다', so the
  // shorter pattern would otherwise match first and hide the negative rule.
  { pattern: '하지않다', confidence: 0.82, conjugationType: 'negative', minStemLength: 1 },
  { pattern: '안하다', confidence: 0.82, conjugationType: 'negative', minStemLength: 1 },
  { pattern: '지않다', confidence: 0.8, conjugationType: 'negative', minStemLength: 2 },

  // Dictionary/infinitive form (하다 verbs)
  { pattern: '하다', confidence: 0.88, conjugationType: 'dictionary', minStemLength: 1 },

  // Imperative forms
  { pattern: '해라', confidence: 0.82, conjugationType: 'imperative', minStemLength: 1 },
  { pattern: '하라', confidence: 0.82, conjugationType: 'imperative', minStemLength: 1 },

  // Generic verb endings (lower confidence)
  { pattern: '다', confidence: 0.75, conjugationType: 'dictionary', minStemLength: 2 },
];
|
|
159
|
+
|
|
160
|
+
/**
 * 하다 verb pattern - very common pattern in Korean.
 * Noun + 하다 forms a verb.
 * e.g., 토글 + 하다 = 토글하다 (to toggle)
 *
 * Ordering invariant: when one pattern is the tail of another (e.g. '해요'
 * is the tail of '안해요'), the longer pattern is listed first so that it is
 * not hidden by the shorter one.
 *
 * NOTE(review): this assumes the consumer stops at the first pattern the
 * word ends with — confirm against the normalizer's lookup loop.
 */
const HADA_PATTERNS: readonly {
  pattern: string;
  confidence: number;
  conjugationType: ConjugationType;
}[] = [
  // Honorific forms (-시- infix) - polite/formal Korean
  // 클릭하시면 → 클릭 (if you click - honorific)
  { pattern: '하시니까', confidence: 0.88, conjugationType: 'honorific-causal' },
  { pattern: '하실때', confidence: 0.88, conjugationType: 'honorific-temporal' },
  { pattern: '하실 때', confidence: 0.88, conjugationType: 'honorific-temporal' },
  { pattern: '하시면', confidence: 0.88, conjugationType: 'honorific-conditional' },
  { pattern: '하셨어요', confidence: 0.85, conjugationType: 'honorific-past' },
  { pattern: '하셨어', confidence: 0.85, conjugationType: 'honorific-past' },
  { pattern: '하십니다', confidence: 0.85, conjugationType: 'honorific-polite' },

  // Sequential/temporal forms - "after doing", "before doing", "as soon as"
  { pattern: '하고나서', confidence: 0.88, conjugationType: 'sequential-after' },
  { pattern: '하고 나서', confidence: 0.88, conjugationType: 'sequential-after' },
  { pattern: '하고서', confidence: 0.88, conjugationType: 'sequential-after' },
  { pattern: '하기전에', confidence: 0.88, conjugationType: 'sequential-before' },
  { pattern: '하기 전에', confidence: 0.88, conjugationType: 'sequential-before' },
  { pattern: '하자마자', confidence: 0.88, conjugationType: 'immediate' },

  // Obligation forms - "must do", "should do"
  { pattern: '해야해요', confidence: 0.88, conjugationType: 'obligation' },
  { pattern: '해야해', confidence: 0.88, conjugationType: 'obligation' },
  { pattern: '해야하다', confidence: 0.88, conjugationType: 'obligation' },

  // Conditional forms - most natural for event handlers (highest priority)
  // 클릭하면 → 클릭 (if clicked)
  { pattern: '하니까', confidence: 0.88, conjugationType: 'causal-nikka' },
  { pattern: '할때', confidence: 0.88, conjugationType: 'temporal-ttae' },
  { pattern: '할 때', confidence: 0.88, conjugationType: 'temporal-ttae' },
  { pattern: '하면', confidence: 0.88, conjugationType: 'conditional-myeon' },

  // Formal
  { pattern: '하였습니다', confidence: 0.85, conjugationType: 'past' },
  { pattern: '했습니다', confidence: 0.85, conjugationType: 'past' },
  { pattern: '합니다', confidence: 0.85, conjugationType: 'polite' },
  { pattern: '하십시오', confidence: 0.85, conjugationType: 'honorific' },
  { pattern: '하세요', confidence: 0.85, conjugationType: 'honorific' },
  // Negative
  // Listed before the informal forms: '안해요' ends in '해요' and '안해'
  // ends in '해', so the shorter patterns would otherwise hide them.
  { pattern: '하지않아요', confidence: 0.82, conjugationType: 'negative' },
  { pattern: '하지않다', confidence: 0.82, conjugationType: 'negative' },
  { pattern: '안해요', confidence: 0.82, conjugationType: 'negative' },
  { pattern: '안해', confidence: 0.82, conjugationType: 'negative' },
  // Informal polite
  { pattern: '했어요', confidence: 0.85, conjugationType: 'past' },
  { pattern: '해요', confidence: 0.85, conjugationType: 'polite' },
  // Informal
  { pattern: '했어', confidence: 0.85, conjugationType: 'past' },
  { pattern: '해', confidence: 0.8, conjugationType: 'present' },
  // Progressive
  { pattern: '하고있어요', confidence: 0.82, conjugationType: 'progressive' },
  { pattern: '하고있어', confidence: 0.82, conjugationType: 'progressive' },
  { pattern: '하고있다', confidence: 0.82, conjugationType: 'progressive' },
  // Connective forms (해서 = because/so, 하고 = and)
  { pattern: '해서', confidence: 0.82, conjugationType: 'connective' },
  { pattern: '하고', confidence: 0.8, conjugationType: 'connective' },
  // Imperative
  { pattern: '해라', confidence: 0.82, conjugationType: 'imperative' },
  { pattern: '하라', confidence: 0.82, conjugationType: 'imperative' },
  // Dictionary form
  { pattern: '하다', confidence: 0.88, conjugationType: 'dictionary' },
];
|
|
230
|
+
|
|
231
|
+
/**
 * Korean morphological normalizer.
 *
 * Strips a conjugation ending from the end of a word. Three strategies are
 * tried in a fixed order and the first one that matches wins:
 *   1. compound (multi-layer) endings,
 *   2. 하다-verb endings from HADA_PATTERNS,
 *   3. the general KOREAN_SUFFIX_RULES table.
 */
export class KoreanMorphologicalNormalizer implements MorphologicalNormalizer {
  /**
   * Compound endings, checked in listing order. Longer endings are listed
   * before the shorter endings they contain (e.g. 하고나서였어 before 하고나서).
   */
  private static readonly COMPOUND_PATTERNS: readonly {
    pattern: string;
    suffixes: readonly string[];
    confidence: number;
    conjugationType: ConjugationType;
    minStemLength: number;
  }[] = [
    // Sequential past forms (after doing, was)
    {
      pattern: '하고나서였어',
      suffixes: ['하고나서', '였어'],
      confidence: 0.78,
      conjugationType: 'sequential-after',
      minStemLength: 2,
    },
    {
      pattern: '하고나서였다',
      suffixes: ['하고나서', '였다'],
      confidence: 0.78,
      conjugationType: 'sequential-after',
      minStemLength: 2,
    },
    {
      pattern: '하고나서',
      suffixes: ['하고', '나서'],
      confidence: 0.85,
      conjugationType: 'sequential-after',
      minStemLength: 2,
    },

    // Modal necessity past forms (had to do)
    {
      pattern: '해야했어',
      suffixes: ['해야', '했어'],
      confidence: 0.8,
      conjugationType: 'obligation',
      minStemLength: 2,
    },
    {
      pattern: '해야했다',
      suffixes: ['해야', '했다'],
      confidence: 0.8,
      conjugationType: 'obligation',
      minStemLength: 2,
    },
    {
      pattern: '해야했습니다',
      suffixes: ['해야', '했습니다'],
      confidence: 0.8,
      conjugationType: 'obligation',
      minStemLength: 2,
    },

    // Honorific simultaneous forms (while doing, honorific)
    {
      pattern: '하시면서',
      suffixes: ['하시', '면서'],
      confidence: 0.82,
      conjugationType: 'connective',
      minStemLength: 2,
    },
    {
      pattern: '하시며',
      suffixes: ['하시', '며'],
      confidence: 0.82,
      conjugationType: 'connective',
      minStemLength: 2,
    },

    // Progressive forms with copula
    {
      pattern: '하고있었어',
      suffixes: ['하고', '있었어'],
      confidence: 0.8,
      conjugationType: 'progressive',
      minStemLength: 2,
    },
    {
      pattern: '하고있었다',
      suffixes: ['하고', '있었다'],
      confidence: 0.8,
      conjugationType: 'progressive',
      minStemLength: 2,
    },
  ];

  readonly language = 'ko';

  /**
   * Cheap pre-check: the word must contain Korean characters and be at
   * least two UTF-16 code units long.
   */
  isNormalizable(word: string): boolean {
    return containsKorean(word) && word.length >= 2;
  }

  /**
   * Normalize a Korean word to its stem form.
   *
   * @returns the result of the first strategy that matches, or
   *          `noChange(word)` when none does.
   */
  normalize(word: string): NormalizationResult {
    return (
      this.normalizeCompound(word) ??
      this.tryHadaNormalization(word) ??
      this.applySuffixRules(word) ??
      noChange(word)
    );
  }

  /**
   * Walk KOREAN_SUFFIX_RULES in order and strip the first ending that
   * matches and leaves a long enough stem (default minimum: 2).
   */
  private applySuffixRules(word: string): NormalizationResult | null {
    for (const rule of KOREAN_SUFFIX_RULES) {
      const ending = rule.pattern;
      if (!word.endsWith(ending)) continue;

      const base = word.slice(0, -ending.length);
      if (base.length < (rule.minStemLength ?? 2)) continue;

      // conjugationType is only attached when the rule defines one, so the
      // key is absent (not `undefined`) otherwise.
      const details: {
        removedSuffixes: string[];
        conjugationType?: typeof rule.conjugationType;
      } = { removedSuffixes: [ending] };
      if (rule.conjugationType) {
        details.conjugationType = rule.conjugationType;
      }
      return normalized(base, rule.confidence, details);
    }
    return null;
  }

  /**
   * Try to normalize a 하다 verb (noun + 하다).
   *
   * @returns the noun part with the 하다 conjugation removed, or `null`
   *          when no HADA_PATTERNS entry matches with a non-empty noun.
   */
  private tryHadaNormalization(word: string): NormalizationResult | null {
    for (const entry of HADA_PATTERNS) {
      if (!word.endsWith(entry.pattern)) continue;

      const noun = word.slice(0, -entry.pattern.length);
      // A bare conjugated 하다 with nothing in front is not a noun + 하다 verb.
      if (noun.length < 1) continue;

      return normalized(noun, entry.confidence, {
        removedSuffixes: [entry.pattern],
        conjugationType: entry.conjugationType,
        originalForm: 'hada-verb',
      });
    }
    return null;
  }

  /**
   * Normalize compound conjugations (multi-layer suffixes).
   *
   * @returns the stem plus the individual suffix layers that were removed,
   *          or `null` when no compound ending applies.
   */
  private normalizeCompound(word: string): NormalizationResult | null {
    for (const entry of KoreanMorphologicalNormalizer.COMPOUND_PATTERNS) {
      if (!word.endsWith(entry.pattern)) continue;

      const base = word.slice(0, -entry.pattern.length);
      if (base.length < entry.minStemLength) continue;

      return normalized(base, entry.confidence, {
        // Copy so callers never share (or mutate) the static table's array.
        removedSuffixes: [...entry.suffixes],
        conjugationType: entry.conjugationType,
      });
    }
    return null;
  }
}
|
|
426
|
+
|
|
427
|
+
// Export singleton instance
|
|
428
|
+
// The class declares no mutable instance fields (only the readonly `language` tag), so one shared instance is enough.
export const koreanMorphologicalNormalizer = new KoreanMorphologicalNormalizer();
|
|
@@ -0,0 +1,264 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Polish Morphological Normalizer
|
|
3
|
+
*
|
|
4
|
+
* Normalizes Polish verb forms to their base/infinitive form.
|
|
5
|
+
*
|
|
6
|
+
* Polish verb conjugation is complex with:
|
|
7
|
+
* - Three main conjugation classes (determined by infinitive ending)
|
|
8
|
+
* - Person/number agreement (6 forms per tense)
|
|
9
|
+
* - Aspect pairs (perfective/imperfective)
|
|
10
|
+
*
|
|
11
|
+
* For software UI, Polish uses IMPERATIVE form (unlike most languages):
|
|
12
|
+
* - zapisz (save), otwórz (open), usuń (delete)
|
|
13
|
+
*
|
|
14
|
+
* This normalizer focuses on recognizing imperative forms and
|
|
15
|
+
* mapping them back to their base form for keyword matching.
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
/**
 * Outcome of normalizing a single Polish word.
 */
export interface NormalizationResult {
  /** Normalized form: the reconstructed infinitive, or the lowercased input when no rule applied. */
  stem: string;
  /** Heuristic score for the result; this file uses values from 0.5 (no rule matched) up to 1.0 (already an infinitive). */
  confidence: number;
  /** Ending that was replaced, or the label 'imperative' for lookup-table hits. Absent when nothing was stripped. */
  suffix?: string;
  /** The word the stem was derived from. Absent when the input was already an infinitive. */
  originalForm?: string;
}
|
|
24
|
+
|
|
25
|
+
/**
 * Polish Morphological Normalizer
 *
 * Maps inflected Polish verb forms back to an infinitive for keyword matching.
 *
 * Key patterns:
 * - Imperative suffixes: -aj, -ij, -uj (2nd person singular)
 * - Infinitive endings: -ać, -eć, -ić, -yć, -ąć
 * - Present tense endings handled here: -am, -em, -ę, -uję/-uje
 * - Past tense endings handled here: -ał(a)(m)/-ałem, -ił(a)(m)/-iłem
 *
 * All rules are purely suffix based: consonant alternations and irregular
 * stems are only covered by the imperative lookup table.
 */
export class PolishMorphologicalNormalizer {
  /**
   * Imperative forms commonly used in Polish software UI, mapped to the
   * infinitive used as keyword. ASCII-only spellings (without diacritics)
   * map to ASCII-only infinitives. Built once and shared by all calls.
   */
  private static readonly IMPERATIVE_TO_INFINITIVE: ReadonlyMap<string, string> = new Map([
    // Core commands
    ['przełącz', 'przełączać'],
    ['przelacz', 'przelaczac'],
    ['dodaj', 'dodawać'],
    ['usuń', 'usuwać'],
    ['usun', 'usuwac'],
    ['umieść', 'umieszczać'],
    ['umiesc', 'umieszczac'],
    ['wstaw', 'wstawiać'],
    ['ustaw', 'ustawiać'],
    ['pobierz', 'pobierać'],
    ['weź', 'brać'],
    ['wez', 'brac'],
    ['zwiększ', 'zwiększać'],
    ['zwieksz', 'zwiekszac'],
    ['zmniejsz', 'zmniejszać'],
    ['pokaż', 'pokazywać'],
    ['pokaz', 'pokazywac'],
    ['ukryj', 'ukrywać'],
    ['schowaj', 'schowywać'],
    ['czekaj', 'czekać'],
    ['poczekaj', 'poczekać'],
    ['idź', 'iść'],
    ['idz', 'isc'],
    ['przejdź', 'przejść'],
    ['przejdz', 'przejsc'],
    ['wywołaj', 'wywoływać'],
    ['wywolaj', 'wywolywac'],
    ['wyślij', 'wysyłać'],
    ['wyslij', 'wysylac'],
    ['loguj', 'logować'],
    ['wypisz', 'wypisywać'],
    ['sklonuj', 'sklonować'],
    ['kopiuj', 'kopiować'],
    ['zamień', 'zamieniać'],
    ['zamien', 'zamieniac'],
    ['utwórz', 'tworzyć'],
    ['utworz', 'tworzyc'],
    ['stwórz', 'stwarzać'],
    ['stworz', 'stwarzac'],
    ['skup', 'skupiać'],
    ['rozmyj', 'rozmywać'],
    ['nawiguj', 'nawigować'],
    ['załaduj', 'ładować'],
    ['zaladuj', 'ladowac'],
    ['powtórz', 'powtarzać'],
    ['powtorz', 'powtarzac'],
    ['kontynuuj', 'kontynuować'],
    ['zatrzymaj', 'zatrzymywać'],
    ['przerwij', 'przerywać'],
    ['rzuć', 'rzucać'],
    ['rzuc', 'rzucac'],
    ['zwróć', 'zwracać'],
    ['zwroc', 'zwracac'],
    ['inicjuj', 'inicjować'],
    ['zainstaluj', 'instalować'],
    ['zmierz', 'mierzyć'],
  ]);

  /**
   * Normalize a Polish verb to its base/infinitive form.
   *
   * The input is lowercased, then checked in this order: already an
   * infinitive, imperative, past tense, present tense.
   *
   * @param word - the word to normalize (any casing)
   * @returns the normalized stem with a confidence score; when no rule
   *          applies the lowercased word is returned with confidence 0.5.
   */
  normalize(word: string): NormalizationResult {
    const lower = word.toLowerCase();

    // Already in infinitive form (-ać, -eć, -ić, -yć, -ąć, -ować)?
    if (this.isInfinitive(lower)) {
      return { stem: lower, confidence: 1.0 };
    }

    // Past tense must run before present tense: first-person past forms
    // (-ałem, -ałam, -iłem, -iłam) also end in -em/-am and would otherwise
    // be mangled by the present-tense rules (czytałem → "czytałeć").
    return (
      this.tryImperativeNormalization(lower) ??
      this.tryPastTenseNormalization(lower) ??
      this.tryPresentTenseNormalization(lower) ?? {
        // Return as-is if no normalization found
        stem: lower,
        confidence: 0.5,
        originalForm: word,
      }
    );
  }

  /**
   * Check if word is already in infinitive form (by ending only).
   */
  private isInfinitive(word: string): boolean {
    const infinitiveEndings = ['ać', 'eć', 'ić', 'yć', 'ąć', 'ować', 'iwać', 'ywać'];
    return infinitiveEndings.some(ending => word.endsWith(ending));
  }

  /**
   * Try to normalize imperative form to infinitive.
   *
   * Known UI imperatives are resolved through the lookup table
   * (confidence 0.95, suffix reported as 'imperative'). Anything else
   * falls back to three generic suffix rewrites:
   * - -aj → -ać   (czytaj → czytać)
   * - -uj → -ować (kopiuj → kopiować)
   * - -ij → -ić   (pij → pić)
   *
   * Bare-stem imperatives such as pisz, rób or mów are only recognized
   * when they are present in the lookup table.
   *
   * @returns the normalization, or `null` when the word does not look
   *          like an imperative.
   */
  private tryImperativeNormalization(word: string): NormalizationResult | null {
    const known = PolishMorphologicalNormalizer.IMPERATIVE_TO_INFINITIVE.get(word);
    if (known !== undefined) {
      return {
        stem: known,
        suffix: 'imperative',
        confidence: 0.95,
        originalForm: word,
      };
    }

    // Pattern: -aj → -ać (czytaj → czytać)
    if (word.endsWith('aj')) {
      const stem = word.slice(0, -2) + 'ać';
      return { stem, suffix: 'aj', confidence: 0.8, originalForm: word };
    }

    // Pattern: -uj → -ować (kopiuj → kopiować)
    if (word.endsWith('uj')) {
      const stem = word.slice(0, -2) + 'ować';
      return { stem, suffix: 'uj', confidence: 0.8, originalForm: word };
    }

    // Pattern: -ij → -ić (pij → pić)
    if (word.endsWith('ij')) {
      const stem = word.slice(0, -2) + 'ić';
      return { stem, suffix: 'ij', confidence: 0.75, originalForm: word };
    }

    return null;
  }

  /**
   * Try to normalize present tense form to infinitive.
   *
   * @returns the normalization, or `null` when no present-tense ending
   *          matches.
   */
  private tryPresentTenseNormalization(word: string): NormalizationResult | null {
    // Pattern: -uję/-uje → -ować (pracuję → pracować).
    // Checked before the bare -ę rule, which would otherwise swallow -uję.
    if (word.endsWith('uję') || word.endsWith('uje')) {
      const stem = word.slice(0, -3) + 'ować';
      return { stem, suffix: word.slice(-3), confidence: 0.85, originalForm: word };
    }

    // Pattern: -am → -ać (czytam → czytać)
    if (word.endsWith('am')) {
      const stem = word.slice(0, -2) + 'ać';
      return { stem, suffix: 'am', confidence: 0.8, originalForm: word };
    }

    // Pattern: -em → -eć (rozumiem → rozumieć)
    if (word.endsWith('em') && word.length > 3) {
      const stem = word.slice(0, -2) + 'eć';
      return { stem, suffix: 'em', confidence: 0.75, originalForm: word };
    }

    // Pattern: -ę → -ać. Lowest confidence: consonant alternations are not
    // undone, so e.g. piszę yields "piszać" rather than pisać.
    if (word.endsWith('ę')) {
      const stem = word.slice(0, -1) + 'ać';
      return { stem, suffix: 'ę', confidence: 0.7, originalForm: word };
    }

    return null;
  }

  /**
   * Try to normalize past tense form to infinitive.
   *
   * Longer endings are tested before the shorter endings they contain.
   *
   * @returns the normalization (suffix is the ending actually removed),
   *          or `null` when no past-tense ending matches.
   */
  private tryPastTenseNormalization(word: string): NormalizationResult | null {
    // Pattern: -ałem/-ałam → -ać (czytałem → czytać)
    if (word.endsWith('ałem') || word.endsWith('ałam')) {
      const stem = word.slice(0, -4) + 'ać';
      return { stem, suffix: word.slice(-4), confidence: 0.85, originalForm: word };
    }

    // Pattern: -ał/-ała → -ać (czytał → czytać)
    if (word.endsWith('ał') || word.endsWith('ała')) {
      const suffixLen = word.endsWith('ała') ? 3 : 2;
      const stem = word.slice(0, -suffixLen) + 'ać';
      return { stem, suffix: word.slice(-suffixLen), confidence: 0.8, originalForm: word };
    }

    // Pattern: -iłem/-iłam → -ić (robiłem → robić), including ASCII spellings
    if (
      word.endsWith('iłem') ||
      word.endsWith('iłam') ||
      word.endsWith('ilem') ||
      word.endsWith('ilam')
    ) {
      const stem = word.slice(0, -4) + 'ić';
      return { stem, suffix: word.slice(-4), confidence: 0.85, originalForm: word };
    }

    // Pattern: -ił/-iła → -ić (robił → robić), including ASCII spellings
    if (
      word.endsWith('ił') ||
      word.endsWith('iła') ||
      word.endsWith('il') ||
      word.endsWith('ila')
    ) {
      const suffixLen = word.endsWith('iła') || word.endsWith('ila') ? 3 : 2;
      const stem = word.slice(0, -suffixLen) + 'ić';
      return { stem, suffix: word.slice(-suffixLen), confidence: 0.8, originalForm: word };
    }

    return null;
  }

  /**
   * Check if two words are morphologically related.
   *
   * Two words are related when they normalize to the same stem, or when
   * one normalized stem is the other plus a leading prefix — the shape of
   * prefixed aspectual pairs such as robić / zrobić.
   *
   * @returns `true` when the words are considered related.
   */
  areMorphologicallyRelated(word1: string, word2: string): boolean {
    const first = this.normalize(word1).stem;
    const second = this.normalize(word2).stem;

    // Same stem
    if (first === second) return true;

    const [shorter, longer] =
      first.length <= second.length ? [first, second] : [second, first];

    // An empty stem is a suffix of everything; never treat it as a match.
    if (shorter.length === 0) return false;

    // Compare against the whole shorter stem. Comparing only its last few
    // characters would relate any two verbs that share an ending like -ować.
    return longer.endsWith(shorter);
  }
}
|