@lokascript/semantic 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +686 -0
- package/dist/browser-ar.ar.global.js +2 -0
- package/dist/browser-core.core.global.js +2 -0
- package/dist/browser-de.de.global.js +2 -0
- package/dist/browser-east-asian.east-asian.global.js +2 -0
- package/dist/browser-en-tr.en-tr.global.js +2 -0
- package/dist/browser-en.en.global.js +2 -0
- package/dist/browser-es-en.es-en.global.js +2 -0
- package/dist/browser-es.es.global.js +2 -0
- package/dist/browser-fr.fr.global.js +2 -0
- package/dist/browser-id.id.global.js +2 -0
- package/dist/browser-ja.ja.global.js +2 -0
- package/dist/browser-ko.ko.global.js +2 -0
- package/dist/browser-lazy.lazy.global.js +2 -0
- package/dist/browser-priority.priority.global.js +2 -0
- package/dist/browser-pt.pt.global.js +2 -0
- package/dist/browser-qu.qu.global.js +2 -0
- package/dist/browser-sw.sw.global.js +2 -0
- package/dist/browser-tr.tr.global.js +2 -0
- package/dist/browser-western.western.global.js +2 -0
- package/dist/browser-zh.zh.global.js +2 -0
- package/dist/browser.global.js +3 -0
- package/dist/browser.global.js.map +1 -0
- package/dist/index.cjs +35051 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +3426 -0
- package/dist/index.d.ts +3426 -0
- package/dist/index.js +34890 -0
- package/dist/index.js.map +1 -0
- package/dist/languages/ar.d.ts +78 -0
- package/dist/languages/ar.js +1622 -0
- package/dist/languages/ar.js.map +1 -0
- package/dist/languages/de.d.ts +38 -0
- package/dist/languages/de.js +1168 -0
- package/dist/languages/de.js.map +1 -0
- package/dist/languages/en.d.ts +44 -0
- package/dist/languages/en.js +3491 -0
- package/dist/languages/en.js.map +1 -0
- package/dist/languages/es.d.ts +52 -0
- package/dist/languages/es.js +1493 -0
- package/dist/languages/es.js.map +1 -0
- package/dist/languages/fr.d.ts +37 -0
- package/dist/languages/fr.js +1159 -0
- package/dist/languages/fr.js.map +1 -0
- package/dist/languages/id.d.ts +35 -0
- package/dist/languages/id.js +1152 -0
- package/dist/languages/id.js.map +1 -0
- package/dist/languages/ja.d.ts +53 -0
- package/dist/languages/ja.js +1430 -0
- package/dist/languages/ja.js.map +1 -0
- package/dist/languages/ko.d.ts +51 -0
- package/dist/languages/ko.js +1729 -0
- package/dist/languages/ko.js.map +1 -0
- package/dist/languages/pt.d.ts +37 -0
- package/dist/languages/pt.js +1127 -0
- package/dist/languages/pt.js.map +1 -0
- package/dist/languages/qu.d.ts +36 -0
- package/dist/languages/qu.js +1143 -0
- package/dist/languages/qu.js.map +1 -0
- package/dist/languages/sw.d.ts +35 -0
- package/dist/languages/sw.js +1147 -0
- package/dist/languages/sw.js.map +1 -0
- package/dist/languages/tr.d.ts +45 -0
- package/dist/languages/tr.js +1529 -0
- package/dist/languages/tr.js.map +1 -0
- package/dist/languages/zh.d.ts +58 -0
- package/dist/languages/zh.js +1257 -0
- package/dist/languages/zh.js.map +1 -0
- package/dist/types-C4dcj53L.d.ts +600 -0
- package/package.json +202 -0
- package/src/__test-utils__/index.ts +7 -0
- package/src/__test-utils__/test-helpers.ts +8 -0
- package/src/__types__/test-helpers.ts +122 -0
- package/src/analysis/index.ts +479 -0
- package/src/ast-builder/command-mappers.ts +1133 -0
- package/src/ast-builder/expression-parser/index.ts +41 -0
- package/src/ast-builder/expression-parser/parser.ts +563 -0
- package/src/ast-builder/expression-parser/tokenizer.ts +394 -0
- package/src/ast-builder/expression-parser/types.ts +208 -0
- package/src/ast-builder/index.ts +536 -0
- package/src/ast-builder/value-converters.ts +172 -0
- package/src/bridge.ts +275 -0
- package/src/browser-ar.ts +162 -0
- package/src/browser-core.ts +231 -0
- package/src/browser-de.ts +162 -0
- package/src/browser-east-asian.ts +173 -0
- package/src/browser-en-tr.ts +165 -0
- package/src/browser-en.ts +157 -0
- package/src/browser-es-en.ts +200 -0
- package/src/browser-es.ts +170 -0
- package/src/browser-fr.ts +162 -0
- package/src/browser-id.ts +162 -0
- package/src/browser-ja.ts +162 -0
- package/src/browser-ko.ts +162 -0
- package/src/browser-lazy.ts +189 -0
- package/src/browser-priority.ts +214 -0
- package/src/browser-pt.ts +162 -0
- package/src/browser-qu.ts +162 -0
- package/src/browser-sw.ts +162 -0
- package/src/browser-tr.ts +162 -0
- package/src/browser-western.ts +181 -0
- package/src/browser-zh.ts +162 -0
- package/src/browser.ts +268 -0
- package/src/cache/index.ts +14 -0
- package/src/cache/semantic-cache.ts +344 -0
- package/src/core-bridge.ts +372 -0
- package/src/explicit/converter.ts +258 -0
- package/src/explicit/index.ts +18 -0
- package/src/explicit/parser.ts +236 -0
- package/src/explicit/renderer.ts +424 -0
- package/src/generators/command-schemas.ts +1636 -0
- package/src/generators/event-handler-generator.ts +109 -0
- package/src/generators/index.ts +117 -0
- package/src/generators/language-profiles.ts +139 -0
- package/src/generators/pattern-generator.ts +537 -0
- package/src/generators/profiles/arabic.ts +131 -0
- package/src/generators/profiles/bengali.ts +132 -0
- package/src/generators/profiles/chinese.ts +124 -0
- package/src/generators/profiles/english.ts +113 -0
- package/src/generators/profiles/french.ts +125 -0
- package/src/generators/profiles/german.ts +126 -0
- package/src/generators/profiles/hindi.ts +146 -0
- package/src/generators/profiles/index.ts +46 -0
- package/src/generators/profiles/indonesian.ts +125 -0
- package/src/generators/profiles/italian.ts +139 -0
- package/src/generators/profiles/japanese.ts +149 -0
- package/src/generators/profiles/korean.ts +127 -0
- package/src/generators/profiles/marker-templates.ts +288 -0
- package/src/generators/profiles/ms.ts +130 -0
- package/src/generators/profiles/polish.ts +249 -0
- package/src/generators/profiles/portuguese.ts +115 -0
- package/src/generators/profiles/quechua.ts +113 -0
- package/src/generators/profiles/russian.ts +260 -0
- package/src/generators/profiles/spanish.ts +130 -0
- package/src/generators/profiles/swahili.ts +129 -0
- package/src/generators/profiles/thai.ts +132 -0
- package/src/generators/profiles/tl.ts +128 -0
- package/src/generators/profiles/turkish.ts +124 -0
- package/src/generators/profiles/types.ts +165 -0
- package/src/generators/profiles/ukrainian.ts +270 -0
- package/src/generators/profiles/vietnamese.ts +133 -0
- package/src/generators/schema-error-codes.ts +160 -0
- package/src/generators/schema-validator.ts +391 -0
- package/src/index.ts +429 -0
- package/src/language-building-schema.ts +3170 -0
- package/src/language-loader.ts +394 -0
- package/src/languages/_all.ts +65 -0
- package/src/languages/ar.ts +15 -0
- package/src/languages/bn.ts +16 -0
- package/src/languages/de.ts +15 -0
- package/src/languages/en.ts +29 -0
- package/src/languages/es.ts +15 -0
- package/src/languages/fr.ts +15 -0
- package/src/languages/hi.ts +26 -0
- package/src/languages/id.ts +15 -0
- package/src/languages/index.ts +18 -0
- package/src/languages/it.ts +15 -0
- package/src/languages/ja.ts +15 -0
- package/src/languages/ko.ts +15 -0
- package/src/languages/ms.ts +16 -0
- package/src/languages/pl.ts +18 -0
- package/src/languages/pt.ts +15 -0
- package/src/languages/qu.ts +15 -0
- package/src/languages/ru.ts +26 -0
- package/src/languages/sw.ts +15 -0
- package/src/languages/th.ts +16 -0
- package/src/languages/tl.ts +16 -0
- package/src/languages/tr.ts +15 -0
- package/src/languages/uk.ts +26 -0
- package/src/languages/vi.ts +16 -0
- package/src/languages/zh.ts +15 -0
- package/src/parser/index.ts +15 -0
- package/src/parser/pattern-matcher.ts +1181 -0
- package/src/parser/semantic-parser.ts +573 -0
- package/src/parser/utils/index.ts +35 -0
- package/src/parser/utils/marker-resolution.ts +111 -0
- package/src/parser/utils/possessive-keywords.ts +43 -0
- package/src/parser/utils/role-positioning.ts +70 -0
- package/src/parser/utils/type-validation.ts +134 -0
- package/src/patterns/add/ar.ts +71 -0
- package/src/patterns/add/bn.ts +70 -0
- package/src/patterns/add/hi.ts +69 -0
- package/src/patterns/add/index.ts +87 -0
- package/src/patterns/add/it.ts +61 -0
- package/src/patterns/add/ja.ts +93 -0
- package/src/patterns/add/ko.ts +74 -0
- package/src/patterns/add/ms.ts +30 -0
- package/src/patterns/add/pl.ts +62 -0
- package/src/patterns/add/ru.ts +62 -0
- package/src/patterns/add/th.ts +49 -0
- package/src/patterns/add/tl.ts +30 -0
- package/src/patterns/add/tr.ts +71 -0
- package/src/patterns/add/uk.ts +62 -0
- package/src/patterns/add/vi.ts +61 -0
- package/src/patterns/add/zh.ts +71 -0
- package/src/patterns/builders.ts +207 -0
- package/src/patterns/decrement/bn.ts +70 -0
- package/src/patterns/decrement/de.ts +42 -0
- package/src/patterns/decrement/hi.ts +68 -0
- package/src/patterns/decrement/index.ts +79 -0
- package/src/patterns/decrement/it.ts +69 -0
- package/src/patterns/decrement/ms.ts +30 -0
- package/src/patterns/decrement/pl.ts +58 -0
- package/src/patterns/decrement/ru.ts +58 -0
- package/src/patterns/decrement/th.ts +49 -0
- package/src/patterns/decrement/tl.ts +30 -0
- package/src/patterns/decrement/tr.ts +48 -0
- package/src/patterns/decrement/uk.ts +58 -0
- package/src/patterns/decrement/vi.ts +61 -0
- package/src/patterns/decrement/zh.ts +32 -0
- package/src/patterns/en.ts +302 -0
- package/src/patterns/event-handler/ar.ts +151 -0
- package/src/patterns/event-handler/bn.ts +72 -0
- package/src/patterns/event-handler/de.ts +117 -0
- package/src/patterns/event-handler/en.ts +117 -0
- package/src/patterns/event-handler/es.ts +136 -0
- package/src/patterns/event-handler/fr.ts +117 -0
- package/src/patterns/event-handler/hi.ts +64 -0
- package/src/patterns/event-handler/id.ts +117 -0
- package/src/patterns/event-handler/index.ts +119 -0
- package/src/patterns/event-handler/it.ts +54 -0
- package/src/patterns/event-handler/ja.ts +118 -0
- package/src/patterns/event-handler/ko.ts +133 -0
- package/src/patterns/event-handler/ms.ts +30 -0
- package/src/patterns/event-handler/pl.ts +62 -0
- package/src/patterns/event-handler/pt.ts +117 -0
- package/src/patterns/event-handler/qu.ts +66 -0
- package/src/patterns/event-handler/ru.ts +62 -0
- package/src/patterns/event-handler/shared.ts +270 -0
- package/src/patterns/event-handler/sw.ts +117 -0
- package/src/patterns/event-handler/th.ts +53 -0
- package/src/patterns/event-handler/tl.ts +30 -0
- package/src/patterns/event-handler/tr.ts +170 -0
- package/src/patterns/event-handler/uk.ts +62 -0
- package/src/patterns/event-handler/vi.ts +61 -0
- package/src/patterns/event-handler/zh.ts +150 -0
- package/src/patterns/get/ar.ts +49 -0
- package/src/patterns/get/bn.ts +47 -0
- package/src/patterns/get/de.ts +32 -0
- package/src/patterns/get/hi.ts +52 -0
- package/src/patterns/get/index.ts +83 -0
- package/src/patterns/get/it.ts +56 -0
- package/src/patterns/get/ja.ts +53 -0
- package/src/patterns/get/ko.ts +53 -0
- package/src/patterns/get/ms.ts +30 -0
- package/src/patterns/get/pl.ts +57 -0
- package/src/patterns/get/ru.ts +57 -0
- package/src/patterns/get/th.ts +29 -0
- package/src/patterns/get/tl.ts +30 -0
- package/src/patterns/get/uk.ts +57 -0
- package/src/patterns/get/vi.ts +48 -0
- package/src/patterns/grammar-transformed/index.ts +39 -0
- package/src/patterns/grammar-transformed/ja.ts +1713 -0
- package/src/patterns/grammar-transformed/ko.ts +1311 -0
- package/src/patterns/grammar-transformed/tr.ts +1067 -0
- package/src/patterns/hide/ar.ts +67 -0
- package/src/patterns/hide/bn.ts +47 -0
- package/src/patterns/hide/de.ts +36 -0
- package/src/patterns/hide/hi.ts +61 -0
- package/src/patterns/hide/index.ts +91 -0
- package/src/patterns/hide/it.ts +56 -0
- package/src/patterns/hide/ja.ts +69 -0
- package/src/patterns/hide/ko.ts +69 -0
- package/src/patterns/hide/ms.ts +30 -0
- package/src/patterns/hide/pl.ts +57 -0
- package/src/patterns/hide/ru.ts +57 -0
- package/src/patterns/hide/th.ts +29 -0
- package/src/patterns/hide/tl.ts +30 -0
- package/src/patterns/hide/tr.ts +65 -0
- package/src/patterns/hide/uk.ts +57 -0
- package/src/patterns/hide/vi.ts +56 -0
- package/src/patterns/hide/zh.ts +68 -0
- package/src/patterns/increment/bn.ts +70 -0
- package/src/patterns/increment/de.ts +36 -0
- package/src/patterns/increment/hi.ts +68 -0
- package/src/patterns/increment/index.ts +79 -0
- package/src/patterns/increment/it.ts +69 -0
- package/src/patterns/increment/ms.ts +30 -0
- package/src/patterns/increment/pl.ts +58 -0
- package/src/patterns/increment/ru.ts +58 -0
- package/src/patterns/increment/th.ts +49 -0
- package/src/patterns/increment/tl.ts +30 -0
- package/src/patterns/increment/tr.ts +52 -0
- package/src/patterns/increment/uk.ts +58 -0
- package/src/patterns/increment/vi.ts +61 -0
- package/src/patterns/increment/zh.ts +32 -0
- package/src/patterns/index.ts +84 -0
- package/src/patterns/languages/en/control-flow.ts +93 -0
- package/src/patterns/languages/en/fetch.ts +62 -0
- package/src/patterns/languages/en/index.ts +42 -0
- package/src/patterns/languages/en/repeat.ts +67 -0
- package/src/patterns/languages/en/set.ts +48 -0
- package/src/patterns/languages/en/swap.ts +38 -0
- package/src/patterns/languages/en/temporal.ts +57 -0
- package/src/patterns/put/ar.ts +74 -0
- package/src/patterns/put/bn.ts +53 -0
- package/src/patterns/put/en.ts +74 -0
- package/src/patterns/put/es.ts +74 -0
- package/src/patterns/put/hi.ts +69 -0
- package/src/patterns/put/id.ts +96 -0
- package/src/patterns/put/index.ts +99 -0
- package/src/patterns/put/it.ts +56 -0
- package/src/patterns/put/ja.ts +75 -0
- package/src/patterns/put/ko.ts +67 -0
- package/src/patterns/put/ms.ts +30 -0
- package/src/patterns/put/pl.ts +81 -0
- package/src/patterns/put/ru.ts +85 -0
- package/src/patterns/put/th.ts +32 -0
- package/src/patterns/put/tl.ts +30 -0
- package/src/patterns/put/tr.ts +67 -0
- package/src/patterns/put/uk.ts +85 -0
- package/src/patterns/put/vi.ts +72 -0
- package/src/patterns/put/zh.ts +62 -0
- package/src/patterns/registry.ts +163 -0
- package/src/patterns/remove/ar.ts +71 -0
- package/src/patterns/remove/bn.ts +68 -0
- package/src/patterns/remove/hi.ts +69 -0
- package/src/patterns/remove/index.ts +87 -0
- package/src/patterns/remove/it.ts +69 -0
- package/src/patterns/remove/ja.ts +74 -0
- package/src/patterns/remove/ko.ts +78 -0
- package/src/patterns/remove/ms.ts +30 -0
- package/src/patterns/remove/pl.ts +62 -0
- package/src/patterns/remove/ru.ts +62 -0
- package/src/patterns/remove/th.ts +49 -0
- package/src/patterns/remove/tl.ts +30 -0
- package/src/patterns/remove/tr.ts +78 -0
- package/src/patterns/remove/uk.ts +62 -0
- package/src/patterns/remove/vi.ts +61 -0
- package/src/patterns/remove/zh.ts +72 -0
- package/src/patterns/set/ar.ts +84 -0
- package/src/patterns/set/bn.ts +53 -0
- package/src/patterns/set/de.ts +84 -0
- package/src/patterns/set/es.ts +92 -0
- package/src/patterns/set/fr.ts +88 -0
- package/src/patterns/set/hi.ts +56 -0
- package/src/patterns/set/id.ts +84 -0
- package/src/patterns/set/index.ts +107 -0
- package/src/patterns/set/it.ts +56 -0
- package/src/patterns/set/ja.ts +86 -0
- package/src/patterns/set/ko.ts +85 -0
- package/src/patterns/set/ms.ts +30 -0
- package/src/patterns/set/pl.ts +57 -0
- package/src/patterns/set/pt.ts +84 -0
- package/src/patterns/set/ru.ts +57 -0
- package/src/patterns/set/th.ts +31 -0
- package/src/patterns/set/tl.ts +30 -0
- package/src/patterns/set/tr.ts +107 -0
- package/src/patterns/set/uk.ts +57 -0
- package/src/patterns/set/vi.ts +53 -0
- package/src/patterns/set/zh.ts +84 -0
- package/src/patterns/show/ar.ts +67 -0
- package/src/patterns/show/bn.ts +47 -0
- package/src/patterns/show/de.ts +32 -0
- package/src/patterns/show/fr.ts +32 -0
- package/src/patterns/show/hi.ts +61 -0
- package/src/patterns/show/index.ts +95 -0
- package/src/patterns/show/it.ts +56 -0
- package/src/patterns/show/ja.ts +69 -0
- package/src/patterns/show/ko.ts +73 -0
- package/src/patterns/show/ms.ts +30 -0
- package/src/patterns/show/pl.ts +57 -0
- package/src/patterns/show/ru.ts +57 -0
- package/src/patterns/show/th.ts +29 -0
- package/src/patterns/show/tl.ts +30 -0
- package/src/patterns/show/tr.ts +65 -0
- package/src/patterns/show/uk.ts +57 -0
- package/src/patterns/show/vi.ts +56 -0
- package/src/patterns/show/zh.ts +68 -0
- package/src/patterns/take/ar.ts +51 -0
- package/src/patterns/take/index.ts +31 -0
- package/src/patterns/toggle/ar.ts +61 -0
- package/src/patterns/toggle/bn.ts +70 -0
- package/src/patterns/toggle/en.ts +61 -0
- package/src/patterns/toggle/es.ts +61 -0
- package/src/patterns/toggle/hi.ts +80 -0
- package/src/patterns/toggle/index.ts +95 -0
- package/src/patterns/toggle/it.ts +69 -0
- package/src/patterns/toggle/ja.ts +156 -0
- package/src/patterns/toggle/ko.ts +113 -0
- package/src/patterns/toggle/ms.ts +30 -0
- package/src/patterns/toggle/pl.ts +62 -0
- package/src/patterns/toggle/ru.ts +62 -0
- package/src/patterns/toggle/th.ts +50 -0
- package/src/patterns/toggle/tl.ts +30 -0
- package/src/patterns/toggle/tr.ts +88 -0
- package/src/patterns/toggle/uk.ts +62 -0
- package/src/patterns/toggle/vi.ts +61 -0
- package/src/patterns/toggle/zh.ts +99 -0
- package/src/public-api.ts +286 -0
- package/src/registry.ts +441 -0
- package/src/tokenizers/arabic.ts +723 -0
- package/src/tokenizers/base.ts +1300 -0
- package/src/tokenizers/bengali.ts +289 -0
- package/src/tokenizers/chinese.ts +481 -0
- package/src/tokenizers/english.ts +416 -0
- package/src/tokenizers/french.ts +326 -0
- package/src/tokenizers/german.ts +324 -0
- package/src/tokenizers/hindi.ts +319 -0
- package/src/tokenizers/index.ts +127 -0
- package/src/tokenizers/indonesian.ts +306 -0
- package/src/tokenizers/italian.ts +458 -0
- package/src/tokenizers/japanese.ts +447 -0
- package/src/tokenizers/korean.ts +642 -0
- package/src/tokenizers/morphology/arabic-normalizer.ts +242 -0
- package/src/tokenizers/morphology/french-normalizer.ts +268 -0
- package/src/tokenizers/morphology/german-normalizer.ts +256 -0
- package/src/tokenizers/morphology/index.ts +46 -0
- package/src/tokenizers/morphology/italian-normalizer.ts +329 -0
- package/src/tokenizers/morphology/japanese-normalizer.ts +288 -0
- package/src/tokenizers/morphology/korean-normalizer.ts +428 -0
- package/src/tokenizers/morphology/polish-normalizer.ts +264 -0
- package/src/tokenizers/morphology/portuguese-normalizer.ts +310 -0
- package/src/tokenizers/morphology/spanish-normalizer.ts +327 -0
- package/src/tokenizers/morphology/turkish-normalizer.ts +412 -0
- package/src/tokenizers/morphology/types.ts +211 -0
- package/src/tokenizers/ms.ts +198 -0
- package/src/tokenizers/polish.ts +354 -0
- package/src/tokenizers/portuguese.ts +304 -0
- package/src/tokenizers/quechua.ts +339 -0
- package/src/tokenizers/russian.ts +375 -0
- package/src/tokenizers/spanish.ts +403 -0
- package/src/tokenizers/swahili.ts +303 -0
- package/src/tokenizers/thai.ts +236 -0
- package/src/tokenizers/tl.ts +198 -0
- package/src/tokenizers/turkish.ts +411 -0
- package/src/tokenizers/ukrainian.ts +369 -0
- package/src/tokenizers/vietnamese.ts +410 -0
- package/src/types/grammar-types.ts +617 -0
- package/src/types/unified-profile.ts +267 -0
- package/src/types.ts +709 -0
- package/src/utils/confidence-calculator.ts +147 -0
- package/src/validators/command-validator.ts +380 -0
- package/src/validators/index.ts +15 -0
|
@@ -0,0 +1,288 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Japanese Morphological Normalizer
|
|
3
|
+
*
|
|
4
|
+
* Reduces Japanese verb conjugations to their stem forms.
|
|
5
|
+
* Japanese verbs conjugate by modifying their endings:
|
|
6
|
+
*
|
|
7
|
+
* Base: 切り替え (kiri-kae) - "toggle"
|
|
8
|
+
* て-form: 切り替えて (kiri-kaete) - "toggle and..."
|
|
9
|
+
* た-form: 切り替えた (kiri-kaeta) - "toggled" (past)
|
|
10
|
+
* ます-form: 切り替えます (kiri-kaemasu) - polite present
|
|
11
|
+
* ている: 切り替えている (kiri-kaeteiru) - "is toggling" (progressive)
|
|
12
|
+
* ない: 切り替えない (kiri-kaenai) - "don't toggle" (negative)
|
|
13
|
+
*
|
|
14
|
+
* This normalizer strips these suffixes to find the stem,
|
|
15
|
+
* which can then be matched against keyword dictionaries.
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
import type {
|
|
19
|
+
MorphologicalNormalizer,
|
|
20
|
+
NormalizationResult,
|
|
21
|
+
SuffixRule,
|
|
22
|
+
ConjugationType,
|
|
23
|
+
} from './types';
|
|
24
|
+
import { noChange, normalized } from './types';
|
|
25
|
+
|
|
26
|
+
/**
 * Suffix rules for Japanese verb conjugation.
 *
 * The rules are tried in array order and the first rule that matches (with a
 * long enough stem) wins. A rule must therefore appear BEFORE any shorter rule
 * whose pattern is a suffix of it, otherwise the shorter rule shadows it:
 * e.g. なかった and られた must precede た, and られる must precede れる and る.
 */
const JAPANESE_SUFFIX_RULES: readonly SuffixRule[] = [
  // Conditional forms - very common for event handlers (longest first)
  // したら/すると/すれば are する verb conditionals
  { pattern: 'したら', confidence: 0.88, conjugationType: 'conditional-tara', minStemLength: 2 },
  { pattern: 'すると', confidence: 0.88, conjugationType: 'conditional-to', minStemLength: 2 },
  { pattern: 'すれば', confidence: 0.85, conjugationType: 'conditional-ba', minStemLength: 2 },
  // たら/れば are regular verb conditionals
  { pattern: 'たら', confidence: 0.85, conjugationType: 'conditional-tara', minStemLength: 2 },
  { pattern: 'れば', confidence: 0.82, conjugationType: 'conditional-ba', minStemLength: 2 },

  // Compound forms (longest first)
  { pattern: 'ていました', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
  { pattern: 'ています', confidence: 0.85, conjugationType: 'progressive', minStemLength: 2 },
  { pattern: 'てください', confidence: 0.85, conjugationType: 'request', minStemLength: 2 },
  { pattern: 'でください', confidence: 0.85, conjugationType: 'request', minStemLength: 2 },
  { pattern: 'ている', confidence: 0.85, conjugationType: 'progressive', minStemLength: 2 },
  { pattern: 'ておく', confidence: 0.82, conjugationType: 'progressive', minStemLength: 2 },
  { pattern: 'てみる', confidence: 0.82, conjugationType: 'progressive', minStemLength: 2 },
  { pattern: 'てある', confidence: 0.82, conjugationType: 'progressive', minStemLength: 2 },

  // Casual request forms
  { pattern: 'てくれ', confidence: 0.8, conjugationType: 'casual-request', minStemLength: 2 },
  { pattern: 'でくれ', confidence: 0.8, conjugationType: 'casual-request', minStemLength: 2 },

  // Contracted/colloquial forms (ちゃう/じゃう = てしまう/でしまう)
  { pattern: 'ちゃった', confidence: 0.82, conjugationType: 'contracted-past', minStemLength: 2 },
  { pattern: 'じゃった', confidence: 0.82, conjugationType: 'contracted-past', minStemLength: 2 },
  { pattern: 'ちゃう', confidence: 0.82, conjugationType: 'contracted', minStemLength: 2 },
  { pattern: 'じゃう', confidence: 0.82, conjugationType: 'contracted', minStemLength: 2 },

  // Polite forms
  { pattern: 'ました', confidence: 0.85, conjugationType: 'past', minStemLength: 2 },
  { pattern: 'ません', confidence: 0.85, conjugationType: 'negative', minStemLength: 2 },
  { pattern: 'ます', confidence: 0.85, conjugationType: 'polite', minStemLength: 2 },

  // Negative-past and passive-past forms.
  // Both end in た, so they must come before the bare た rule below;
  // otherwise た strips only the final character and these never match.
  { pattern: 'なかった', confidence: 0.82, conjugationType: 'past', minStemLength: 2 },
  { pattern: 'られた', confidence: 0.8, conjugationType: 'passive', minStemLength: 2 },

  // て/た forms (very common)
  { pattern: 'て', confidence: 0.85, conjugationType: 'te-form', minStemLength: 2 },
  { pattern: 'た', confidence: 0.85, conjugationType: 'past', minStemLength: 2 },

  // Negative forms
  { pattern: 'ない', confidence: 0.82, conjugationType: 'negative', minStemLength: 2 },

  // Potential forms
  { pattern: 'られる', confidence: 0.8, conjugationType: 'potential', minStemLength: 2 },
  { pattern: 'れる', confidence: 0.78, conjugationType: 'potential', minStemLength: 2 },

  // Causative forms
  { pattern: 'させる', confidence: 0.8, conjugationType: 'causative', minStemLength: 2 },
  { pattern: 'せる', confidence: 0.78, conjugationType: 'causative', minStemLength: 2 },

  // Volitional forms
  { pattern: 'よう', confidence: 0.8, conjugationType: 'volitional', minStemLength: 2 },

  // Dictionary form ending (る-verbs) - lower confidence due to ambiguity
  { pattern: 'る', confidence: 0.75, conjugationType: 'dictionary', minStemLength: 3 },
];
|
|
90
|
+
|
|
91
|
+
/**
 * Conjugated endings of する verbs (noun + する compounds).
 *
 * Entries are matched in array order; a longer ending always precedes any
 * shorter ending that is its suffix (e.g. しました before した).
 * Each row is [ending, confidence, conjugation type].
 */
const SURU_PATTERNS: readonly {
  pattern: string;
  confidence: number;
  conjugationType: ConjugationType;
}[] = (
  [
    // Conditional forms (most important for native idioms)
    ['したら', 0.88, 'conditional-tara'],
    ['すると', 0.88, 'conditional-to'],
    ['すれば', 0.85, 'conditional-ba'],
    // Progressive forms
    ['しています', 0.85, 'progressive'],
    ['している', 0.85, 'progressive'],
    // Other forms
    ['しました', 0.85, 'past'],
    ['します', 0.85, 'polite'],
    ['しない', 0.82, 'negative'],
    ['して', 0.85, 'te-form'],
    ['した', 0.85, 'past'],
    ['する', 0.88, 'dictionary'],
  ] as const
).map(([pattern, confidence, conjugationType]) => ({ pattern, confidence, conjugationType }));
|
|
116
|
+
|
|
117
|
+
/**
 * Report whether the first UTF-16 unit of `char` lies in the
 * Hiragana block (U+3040 - U+309F). An empty string yields false.
 */
function isHiragana(char: string): boolean {
  // Single-unit strings compare by code unit, so this is a plain range check.
  const first = char.charAt(0);
  return first >= '\u3040' && first <= '\u309f';
}
|
|
124
|
+
|
|
125
|
+
/**
 * Report whether the first UTF-16 unit of `char` lies in the
 * Katakana block (U+30A0 - U+30FF). An empty string yields false.
 */
function isKatakana(char: string): boolean {
  const unit = char.charCodeAt(0);
  // For an empty string `unit` is NaN; both comparisons are then false.
  if (unit < 0x30a0) {
    return false;
  }
  return unit <= 0x30ff;
}
|
|
132
|
+
|
|
133
|
+
/**
 * Report whether the first UTF-16 unit of `char` is a kanji, i.e. lies in
 * U+3400 - U+4DBF or U+4E00 - U+9FFF. An empty string yields false.
 */
function isKanji(char: string): boolean {
  const unit = char.charCodeAt(0);

  // U+3400 - U+4DBF
  if (unit >= 0x3400 && unit <= 0x4dbf) {
    return true;
  }

  // U+4E00 - U+9FFF
  return unit >= 0x4e00 && unit <= 0x9fff;
}
|
|
140
|
+
|
|
141
|
+
/**
 * Report whether `word` contains at least one hiragana, katakana or kanji
 * character. An empty string yields false.
 */
function containsJapanese(word: string): boolean {
  // Array.from walks the string by code point, exactly like for...of.
  return Array.from(word).some(
    (symbol) => isHiragana(symbol) || isKatakana(symbol) || isKanji(symbol)
  );
}
|
|
152
|
+
|
|
153
|
+
/**
 * Japanese morphological normalizer.
 *
 * Strips conjugation endings from a word by plain suffix matching. The
 * strategies are tried in this order and the first one that applies wins:
 * compound (multi-layer) endings, する-verb endings, general suffix rules.
 */
export class JapaneseMorphologicalNormalizer implements MorphologicalNormalizer {
  readonly language = 'ja';

  /**
   * Multi-layer endings such as ていなかった (was not doing) or でいない
   * (is not doing), tried in array order. `suffixes` lists the individual
   * layers that are reported as the removed suffixes.
   */
  private static readonly COMPOUND_PATTERNS: readonly {
    readonly pattern: string;
    readonly suffixes: readonly string[];
    readonly confidence: number;
    readonly minStemLength: number;
  }[] = [
    // Progressive negative past forms
    { pattern: 'ていなかった', suffixes: ['て', 'い', 'なかった'], confidence: 0.8, minStemLength: 2 },
    { pattern: 'でいなかった', suffixes: ['で', 'い', 'なかった'], confidence: 0.8, minStemLength: 2 },
    // Progressive negative forms
    { pattern: 'ていない', suffixes: ['て', 'い', 'ない'], confidence: 0.82, minStemLength: 2 },
    { pattern: 'でいない', suffixes: ['で', 'い', 'ない'], confidence: 0.82, minStemLength: 2 },
    // Progressive past forms
    { pattern: 'ていた', suffixes: ['て', 'い', 'た'], confidence: 0.85, minStemLength: 2 },
    { pattern: 'でいた', suffixes: ['で', 'い', 'た'], confidence: 0.85, minStemLength: 2 },
  ];

  /**
   * Check if a word might be a Japanese verb that can be normalized.
   *
   * @param word - The word to inspect.
   * @returns true when the word has at least 2 UTF-16 units, contains a
   *   Japanese character, and ends in hiragana (verbs typically do).
   */
  isNormalizable(word: string): boolean {
    return (
      word.length >= 2 &&
      containsJapanese(word) &&
      isHiragana(word.charAt(word.length - 1))
    );
  }

  /**
   * Normalize a Japanese word to its stem form.
   *
   * @param word - The (possibly conjugated) word.
   * @returns The result of the first strategy that matches, or
   *   `noChange(word)` when none does.
   */
  normalize(word: string): NormalizationResult {
    return (
      // Multi-layer suffixes first, so they are not split by shorter rules
      this.normalizeCompound(word) ??
      // する verb patterns (most common compound verbs)
      this.trySuruNormalization(word) ??
      // General single-suffix rules
      this.applySuffixRules(word) ??
      // No normalization needed
      noChange(word)
    );
  }

  /**
   * Apply the first rule of JAPANESE_SUFFIX_RULES whose pattern ends the word
   * and leaves a stem of at least `minStemLength` (default 2) units.
   */
  private applySuffixRules(word: string): NormalizationResult | null {
    const rule = JAPANESE_SUFFIX_RULES.find(
      (candidate) =>
        word.endsWith(candidate.pattern) &&
        word.length - candidate.pattern.length >= (candidate.minStemLength ?? 2)
    );
    if (!rule) return null;

    const stem = word.slice(0, word.length - rule.pattern.length);

    // conjugationType is only attached when the rule defines one.
    const metadata: {
      removedSuffixes: string[];
      conjugationType?: SuffixRule['conjugationType'];
    } = {
      removedSuffixes: [rule.pattern],
    };
    if (rule.conjugationType) {
      metadata.conjugationType = rule.conjugationType;
    }
    return normalized(stem, rule.confidence, metadata);
  }

  /**
   * Try to normalize a する verb.
   *
   * @returns The noun part (the word without its する ending), or null when
   *   no SURU_PATTERNS entry ends the word with a non-empty noun part.
   */
  private trySuruNormalization(word: string): NormalizationResult | null {
    // する verbs need at least one character for the noun part
    const match = SURU_PATTERNS.find(
      (candidate) => word.endsWith(candidate.pattern) && word.length > candidate.pattern.length
    );
    if (!match) return null;

    return normalized(word.slice(0, word.length - match.pattern.length), match.confidence, {
      removedSuffixes: [match.pattern],
      conjugationType: match.conjugationType,
      originalForm: 'suru-verb',
    });
  }

  /**
   * Normalize compound conjugations (multi-layer suffixes).
   * Handles cases that single-suffix rules miss.
   *
   * @returns The stem with all layers removed, or null when no compound
   *   pattern ends the word with a long enough stem.
   */
  private normalizeCompound(word: string): NormalizationResult | null {
    const match = JapaneseMorphologicalNormalizer.COMPOUND_PATTERNS.find(
      (candidate) =>
        word.endsWith(candidate.pattern) &&
        word.length - candidate.pattern.length >= candidate.minStemLength
    );
    if (!match) return null;

    return normalized(word.slice(0, word.length - match.pattern.length), match.confidence, {
      // Copy so callers never hold a reference to the shared table.
      removedSuffixes: [...match.suffixes],
      conjugationType: 'compound',
    });
  }
}
|
|
286
|
+
|
|
287
|
+
// Export singleton instance
|
|
288
|
+
/**
 * Shared JapaneseMorphologicalNormalizer instance, created once when this
 * module is loaded.
 */
export const japaneseMorphologicalNormalizer = new JapaneseMorphologicalNormalizer();
|