@lokascript/semantic 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +686 -0
- package/dist/browser-ar.ar.global.js +2 -0
- package/dist/browser-core.core.global.js +2 -0
- package/dist/browser-de.de.global.js +2 -0
- package/dist/browser-east-asian.east-asian.global.js +2 -0
- package/dist/browser-en-tr.en-tr.global.js +2 -0
- package/dist/browser-en.en.global.js +2 -0
- package/dist/browser-es-en.es-en.global.js +2 -0
- package/dist/browser-es.es.global.js +2 -0
- package/dist/browser-fr.fr.global.js +2 -0
- package/dist/browser-id.id.global.js +2 -0
- package/dist/browser-ja.ja.global.js +2 -0
- package/dist/browser-ko.ko.global.js +2 -0
- package/dist/browser-lazy.lazy.global.js +2 -0
- package/dist/browser-priority.priority.global.js +2 -0
- package/dist/browser-pt.pt.global.js +2 -0
- package/dist/browser-qu.qu.global.js +2 -0
- package/dist/browser-sw.sw.global.js +2 -0
- package/dist/browser-tr.tr.global.js +2 -0
- package/dist/browser-western.western.global.js +2 -0
- package/dist/browser-zh.zh.global.js +2 -0
- package/dist/browser.global.js +3 -0
- package/dist/browser.global.js.map +1 -0
- package/dist/index.cjs +35051 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +3426 -0
- package/dist/index.d.ts +3426 -0
- package/dist/index.js +34890 -0
- package/dist/index.js.map +1 -0
- package/dist/languages/ar.d.ts +78 -0
- package/dist/languages/ar.js +1622 -0
- package/dist/languages/ar.js.map +1 -0
- package/dist/languages/de.d.ts +38 -0
- package/dist/languages/de.js +1168 -0
- package/dist/languages/de.js.map +1 -0
- package/dist/languages/en.d.ts +44 -0
- package/dist/languages/en.js +3491 -0
- package/dist/languages/en.js.map +1 -0
- package/dist/languages/es.d.ts +52 -0
- package/dist/languages/es.js +1493 -0
- package/dist/languages/es.js.map +1 -0
- package/dist/languages/fr.d.ts +37 -0
- package/dist/languages/fr.js +1159 -0
- package/dist/languages/fr.js.map +1 -0
- package/dist/languages/id.d.ts +35 -0
- package/dist/languages/id.js +1152 -0
- package/dist/languages/id.js.map +1 -0
- package/dist/languages/ja.d.ts +53 -0
- package/dist/languages/ja.js +1430 -0
- package/dist/languages/ja.js.map +1 -0
- package/dist/languages/ko.d.ts +51 -0
- package/dist/languages/ko.js +1729 -0
- package/dist/languages/ko.js.map +1 -0
- package/dist/languages/pt.d.ts +37 -0
- package/dist/languages/pt.js +1127 -0
- package/dist/languages/pt.js.map +1 -0
- package/dist/languages/qu.d.ts +36 -0
- package/dist/languages/qu.js +1143 -0
- package/dist/languages/qu.js.map +1 -0
- package/dist/languages/sw.d.ts +35 -0
- package/dist/languages/sw.js +1147 -0
- package/dist/languages/sw.js.map +1 -0
- package/dist/languages/tr.d.ts +45 -0
- package/dist/languages/tr.js +1529 -0
- package/dist/languages/tr.js.map +1 -0
- package/dist/languages/zh.d.ts +58 -0
- package/dist/languages/zh.js +1257 -0
- package/dist/languages/zh.js.map +1 -0
- package/dist/types-C4dcj53L.d.ts +600 -0
- package/package.json +202 -0
- package/src/__test-utils__/index.ts +7 -0
- package/src/__test-utils__/test-helpers.ts +8 -0
- package/src/__types__/test-helpers.ts +122 -0
- package/src/analysis/index.ts +479 -0
- package/src/ast-builder/command-mappers.ts +1133 -0
- package/src/ast-builder/expression-parser/index.ts +41 -0
- package/src/ast-builder/expression-parser/parser.ts +563 -0
- package/src/ast-builder/expression-parser/tokenizer.ts +394 -0
- package/src/ast-builder/expression-parser/types.ts +208 -0
- package/src/ast-builder/index.ts +536 -0
- package/src/ast-builder/value-converters.ts +172 -0
- package/src/bridge.ts +275 -0
- package/src/browser-ar.ts +162 -0
- package/src/browser-core.ts +231 -0
- package/src/browser-de.ts +162 -0
- package/src/browser-east-asian.ts +173 -0
- package/src/browser-en-tr.ts +165 -0
- package/src/browser-en.ts +157 -0
- package/src/browser-es-en.ts +200 -0
- package/src/browser-es.ts +170 -0
- package/src/browser-fr.ts +162 -0
- package/src/browser-id.ts +162 -0
- package/src/browser-ja.ts +162 -0
- package/src/browser-ko.ts +162 -0
- package/src/browser-lazy.ts +189 -0
- package/src/browser-priority.ts +214 -0
- package/src/browser-pt.ts +162 -0
- package/src/browser-qu.ts +162 -0
- package/src/browser-sw.ts +162 -0
- package/src/browser-tr.ts +162 -0
- package/src/browser-western.ts +181 -0
- package/src/browser-zh.ts +162 -0
- package/src/browser.ts +268 -0
- package/src/cache/index.ts +14 -0
- package/src/cache/semantic-cache.ts +344 -0
- package/src/core-bridge.ts +372 -0
- package/src/explicit/converter.ts +258 -0
- package/src/explicit/index.ts +18 -0
- package/src/explicit/parser.ts +236 -0
- package/src/explicit/renderer.ts +424 -0
- package/src/generators/command-schemas.ts +1636 -0
- package/src/generators/event-handler-generator.ts +109 -0
- package/src/generators/index.ts +117 -0
- package/src/generators/language-profiles.ts +139 -0
- package/src/generators/pattern-generator.ts +537 -0
- package/src/generators/profiles/arabic.ts +131 -0
- package/src/generators/profiles/bengali.ts +132 -0
- package/src/generators/profiles/chinese.ts +124 -0
- package/src/generators/profiles/english.ts +113 -0
- package/src/generators/profiles/french.ts +125 -0
- package/src/generators/profiles/german.ts +126 -0
- package/src/generators/profiles/hindi.ts +146 -0
- package/src/generators/profiles/index.ts +46 -0
- package/src/generators/profiles/indonesian.ts +125 -0
- package/src/generators/profiles/italian.ts +139 -0
- package/src/generators/profiles/japanese.ts +149 -0
- package/src/generators/profiles/korean.ts +127 -0
- package/src/generators/profiles/marker-templates.ts +288 -0
- package/src/generators/profiles/ms.ts +130 -0
- package/src/generators/profiles/polish.ts +249 -0
- package/src/generators/profiles/portuguese.ts +115 -0
- package/src/generators/profiles/quechua.ts +113 -0
- package/src/generators/profiles/russian.ts +260 -0
- package/src/generators/profiles/spanish.ts +130 -0
- package/src/generators/profiles/swahili.ts +129 -0
- package/src/generators/profiles/thai.ts +132 -0
- package/src/generators/profiles/tl.ts +128 -0
- package/src/generators/profiles/turkish.ts +124 -0
- package/src/generators/profiles/types.ts +165 -0
- package/src/generators/profiles/ukrainian.ts +270 -0
- package/src/generators/profiles/vietnamese.ts +133 -0
- package/src/generators/schema-error-codes.ts +160 -0
- package/src/generators/schema-validator.ts +391 -0
- package/src/index.ts +429 -0
- package/src/language-building-schema.ts +3170 -0
- package/src/language-loader.ts +394 -0
- package/src/languages/_all.ts +65 -0
- package/src/languages/ar.ts +15 -0
- package/src/languages/bn.ts +16 -0
- package/src/languages/de.ts +15 -0
- package/src/languages/en.ts +29 -0
- package/src/languages/es.ts +15 -0
- package/src/languages/fr.ts +15 -0
- package/src/languages/hi.ts +26 -0
- package/src/languages/id.ts +15 -0
- package/src/languages/index.ts +18 -0
- package/src/languages/it.ts +15 -0
- package/src/languages/ja.ts +15 -0
- package/src/languages/ko.ts +15 -0
- package/src/languages/ms.ts +16 -0
- package/src/languages/pl.ts +18 -0
- package/src/languages/pt.ts +15 -0
- package/src/languages/qu.ts +15 -0
- package/src/languages/ru.ts +26 -0
- package/src/languages/sw.ts +15 -0
- package/src/languages/th.ts +16 -0
- package/src/languages/tl.ts +16 -0
- package/src/languages/tr.ts +15 -0
- package/src/languages/uk.ts +26 -0
- package/src/languages/vi.ts +16 -0
- package/src/languages/zh.ts +15 -0
- package/src/parser/index.ts +15 -0
- package/src/parser/pattern-matcher.ts +1181 -0
- package/src/parser/semantic-parser.ts +573 -0
- package/src/parser/utils/index.ts +35 -0
- package/src/parser/utils/marker-resolution.ts +111 -0
- package/src/parser/utils/possessive-keywords.ts +43 -0
- package/src/parser/utils/role-positioning.ts +70 -0
- package/src/parser/utils/type-validation.ts +134 -0
- package/src/patterns/add/ar.ts +71 -0
- package/src/patterns/add/bn.ts +70 -0
- package/src/patterns/add/hi.ts +69 -0
- package/src/patterns/add/index.ts +87 -0
- package/src/patterns/add/it.ts +61 -0
- package/src/patterns/add/ja.ts +93 -0
- package/src/patterns/add/ko.ts +74 -0
- package/src/patterns/add/ms.ts +30 -0
- package/src/patterns/add/pl.ts +62 -0
- package/src/patterns/add/ru.ts +62 -0
- package/src/patterns/add/th.ts +49 -0
- package/src/patterns/add/tl.ts +30 -0
- package/src/patterns/add/tr.ts +71 -0
- package/src/patterns/add/uk.ts +62 -0
- package/src/patterns/add/vi.ts +61 -0
- package/src/patterns/add/zh.ts +71 -0
- package/src/patterns/builders.ts +207 -0
- package/src/patterns/decrement/bn.ts +70 -0
- package/src/patterns/decrement/de.ts +42 -0
- package/src/patterns/decrement/hi.ts +68 -0
- package/src/patterns/decrement/index.ts +79 -0
- package/src/patterns/decrement/it.ts +69 -0
- package/src/patterns/decrement/ms.ts +30 -0
- package/src/patterns/decrement/pl.ts +58 -0
- package/src/patterns/decrement/ru.ts +58 -0
- package/src/patterns/decrement/th.ts +49 -0
- package/src/patterns/decrement/tl.ts +30 -0
- package/src/patterns/decrement/tr.ts +48 -0
- package/src/patterns/decrement/uk.ts +58 -0
- package/src/patterns/decrement/vi.ts +61 -0
- package/src/patterns/decrement/zh.ts +32 -0
- package/src/patterns/en.ts +302 -0
- package/src/patterns/event-handler/ar.ts +151 -0
- package/src/patterns/event-handler/bn.ts +72 -0
- package/src/patterns/event-handler/de.ts +117 -0
- package/src/patterns/event-handler/en.ts +117 -0
- package/src/patterns/event-handler/es.ts +136 -0
- package/src/patterns/event-handler/fr.ts +117 -0
- package/src/patterns/event-handler/hi.ts +64 -0
- package/src/patterns/event-handler/id.ts +117 -0
- package/src/patterns/event-handler/index.ts +119 -0
- package/src/patterns/event-handler/it.ts +54 -0
- package/src/patterns/event-handler/ja.ts +118 -0
- package/src/patterns/event-handler/ko.ts +133 -0
- package/src/patterns/event-handler/ms.ts +30 -0
- package/src/patterns/event-handler/pl.ts +62 -0
- package/src/patterns/event-handler/pt.ts +117 -0
- package/src/patterns/event-handler/qu.ts +66 -0
- package/src/patterns/event-handler/ru.ts +62 -0
- package/src/patterns/event-handler/shared.ts +270 -0
- package/src/patterns/event-handler/sw.ts +117 -0
- package/src/patterns/event-handler/th.ts +53 -0
- package/src/patterns/event-handler/tl.ts +30 -0
- package/src/patterns/event-handler/tr.ts +170 -0
- package/src/patterns/event-handler/uk.ts +62 -0
- package/src/patterns/event-handler/vi.ts +61 -0
- package/src/patterns/event-handler/zh.ts +150 -0
- package/src/patterns/get/ar.ts +49 -0
- package/src/patterns/get/bn.ts +47 -0
- package/src/patterns/get/de.ts +32 -0
- package/src/patterns/get/hi.ts +52 -0
- package/src/patterns/get/index.ts +83 -0
- package/src/patterns/get/it.ts +56 -0
- package/src/patterns/get/ja.ts +53 -0
- package/src/patterns/get/ko.ts +53 -0
- package/src/patterns/get/ms.ts +30 -0
- package/src/patterns/get/pl.ts +57 -0
- package/src/patterns/get/ru.ts +57 -0
- package/src/patterns/get/th.ts +29 -0
- package/src/patterns/get/tl.ts +30 -0
- package/src/patterns/get/uk.ts +57 -0
- package/src/patterns/get/vi.ts +48 -0
- package/src/patterns/grammar-transformed/index.ts +39 -0
- package/src/patterns/grammar-transformed/ja.ts +1713 -0
- package/src/patterns/grammar-transformed/ko.ts +1311 -0
- package/src/patterns/grammar-transformed/tr.ts +1067 -0
- package/src/patterns/hide/ar.ts +67 -0
- package/src/patterns/hide/bn.ts +47 -0
- package/src/patterns/hide/de.ts +36 -0
- package/src/patterns/hide/hi.ts +61 -0
- package/src/patterns/hide/index.ts +91 -0
- package/src/patterns/hide/it.ts +56 -0
- package/src/patterns/hide/ja.ts +69 -0
- package/src/patterns/hide/ko.ts +69 -0
- package/src/patterns/hide/ms.ts +30 -0
- package/src/patterns/hide/pl.ts +57 -0
- package/src/patterns/hide/ru.ts +57 -0
- package/src/patterns/hide/th.ts +29 -0
- package/src/patterns/hide/tl.ts +30 -0
- package/src/patterns/hide/tr.ts +65 -0
- package/src/patterns/hide/uk.ts +57 -0
- package/src/patterns/hide/vi.ts +56 -0
- package/src/patterns/hide/zh.ts +68 -0
- package/src/patterns/increment/bn.ts +70 -0
- package/src/patterns/increment/de.ts +36 -0
- package/src/patterns/increment/hi.ts +68 -0
- package/src/patterns/increment/index.ts +79 -0
- package/src/patterns/increment/it.ts +69 -0
- package/src/patterns/increment/ms.ts +30 -0
- package/src/patterns/increment/pl.ts +58 -0
- package/src/patterns/increment/ru.ts +58 -0
- package/src/patterns/increment/th.ts +49 -0
- package/src/patterns/increment/tl.ts +30 -0
- package/src/patterns/increment/tr.ts +52 -0
- package/src/patterns/increment/uk.ts +58 -0
- package/src/patterns/increment/vi.ts +61 -0
- package/src/patterns/increment/zh.ts +32 -0
- package/src/patterns/index.ts +84 -0
- package/src/patterns/languages/en/control-flow.ts +93 -0
- package/src/patterns/languages/en/fetch.ts +62 -0
- package/src/patterns/languages/en/index.ts +42 -0
- package/src/patterns/languages/en/repeat.ts +67 -0
- package/src/patterns/languages/en/set.ts +48 -0
- package/src/patterns/languages/en/swap.ts +38 -0
- package/src/patterns/languages/en/temporal.ts +57 -0
- package/src/patterns/put/ar.ts +74 -0
- package/src/patterns/put/bn.ts +53 -0
- package/src/patterns/put/en.ts +74 -0
- package/src/patterns/put/es.ts +74 -0
- package/src/patterns/put/hi.ts +69 -0
- package/src/patterns/put/id.ts +96 -0
- package/src/patterns/put/index.ts +99 -0
- package/src/patterns/put/it.ts +56 -0
- package/src/patterns/put/ja.ts +75 -0
- package/src/patterns/put/ko.ts +67 -0
- package/src/patterns/put/ms.ts +30 -0
- package/src/patterns/put/pl.ts +81 -0
- package/src/patterns/put/ru.ts +85 -0
- package/src/patterns/put/th.ts +32 -0
- package/src/patterns/put/tl.ts +30 -0
- package/src/patterns/put/tr.ts +67 -0
- package/src/patterns/put/uk.ts +85 -0
- package/src/patterns/put/vi.ts +72 -0
- package/src/patterns/put/zh.ts +62 -0
- package/src/patterns/registry.ts +163 -0
- package/src/patterns/remove/ar.ts +71 -0
- package/src/patterns/remove/bn.ts +68 -0
- package/src/patterns/remove/hi.ts +69 -0
- package/src/patterns/remove/index.ts +87 -0
- package/src/patterns/remove/it.ts +69 -0
- package/src/patterns/remove/ja.ts +74 -0
- package/src/patterns/remove/ko.ts +78 -0
- package/src/patterns/remove/ms.ts +30 -0
- package/src/patterns/remove/pl.ts +62 -0
- package/src/patterns/remove/ru.ts +62 -0
- package/src/patterns/remove/th.ts +49 -0
- package/src/patterns/remove/tl.ts +30 -0
- package/src/patterns/remove/tr.ts +78 -0
- package/src/patterns/remove/uk.ts +62 -0
- package/src/patterns/remove/vi.ts +61 -0
- package/src/patterns/remove/zh.ts +72 -0
- package/src/patterns/set/ar.ts +84 -0
- package/src/patterns/set/bn.ts +53 -0
- package/src/patterns/set/de.ts +84 -0
- package/src/patterns/set/es.ts +92 -0
- package/src/patterns/set/fr.ts +88 -0
- package/src/patterns/set/hi.ts +56 -0
- package/src/patterns/set/id.ts +84 -0
- package/src/patterns/set/index.ts +107 -0
- package/src/patterns/set/it.ts +56 -0
- package/src/patterns/set/ja.ts +86 -0
- package/src/patterns/set/ko.ts +85 -0
- package/src/patterns/set/ms.ts +30 -0
- package/src/patterns/set/pl.ts +57 -0
- package/src/patterns/set/pt.ts +84 -0
- package/src/patterns/set/ru.ts +57 -0
- package/src/patterns/set/th.ts +31 -0
- package/src/patterns/set/tl.ts +30 -0
- package/src/patterns/set/tr.ts +107 -0
- package/src/patterns/set/uk.ts +57 -0
- package/src/patterns/set/vi.ts +53 -0
- package/src/patterns/set/zh.ts +84 -0
- package/src/patterns/show/ar.ts +67 -0
- package/src/patterns/show/bn.ts +47 -0
- package/src/patterns/show/de.ts +32 -0
- package/src/patterns/show/fr.ts +32 -0
- package/src/patterns/show/hi.ts +61 -0
- package/src/patterns/show/index.ts +95 -0
- package/src/patterns/show/it.ts +56 -0
- package/src/patterns/show/ja.ts +69 -0
- package/src/patterns/show/ko.ts +73 -0
- package/src/patterns/show/ms.ts +30 -0
- package/src/patterns/show/pl.ts +57 -0
- package/src/patterns/show/ru.ts +57 -0
- package/src/patterns/show/th.ts +29 -0
- package/src/patterns/show/tl.ts +30 -0
- package/src/patterns/show/tr.ts +65 -0
- package/src/patterns/show/uk.ts +57 -0
- package/src/patterns/show/vi.ts +56 -0
- package/src/patterns/show/zh.ts +68 -0
- package/src/patterns/take/ar.ts +51 -0
- package/src/patterns/take/index.ts +31 -0
- package/src/patterns/toggle/ar.ts +61 -0
- package/src/patterns/toggle/bn.ts +70 -0
- package/src/patterns/toggle/en.ts +61 -0
- package/src/patterns/toggle/es.ts +61 -0
- package/src/patterns/toggle/hi.ts +80 -0
- package/src/patterns/toggle/index.ts +95 -0
- package/src/patterns/toggle/it.ts +69 -0
- package/src/patterns/toggle/ja.ts +156 -0
- package/src/patterns/toggle/ko.ts +113 -0
- package/src/patterns/toggle/ms.ts +30 -0
- package/src/patterns/toggle/pl.ts +62 -0
- package/src/patterns/toggle/ru.ts +62 -0
- package/src/patterns/toggle/th.ts +50 -0
- package/src/patterns/toggle/tl.ts +30 -0
- package/src/patterns/toggle/tr.ts +88 -0
- package/src/patterns/toggle/uk.ts +62 -0
- package/src/patterns/toggle/vi.ts +61 -0
- package/src/patterns/toggle/zh.ts +99 -0
- package/src/public-api.ts +286 -0
- package/src/registry.ts +441 -0
- package/src/tokenizers/arabic.ts +723 -0
- package/src/tokenizers/base.ts +1300 -0
- package/src/tokenizers/bengali.ts +289 -0
- package/src/tokenizers/chinese.ts +481 -0
- package/src/tokenizers/english.ts +416 -0
- package/src/tokenizers/french.ts +326 -0
- package/src/tokenizers/german.ts +324 -0
- package/src/tokenizers/hindi.ts +319 -0
- package/src/tokenizers/index.ts +127 -0
- package/src/tokenizers/indonesian.ts +306 -0
- package/src/tokenizers/italian.ts +458 -0
- package/src/tokenizers/japanese.ts +447 -0
- package/src/tokenizers/korean.ts +642 -0
- package/src/tokenizers/morphology/arabic-normalizer.ts +242 -0
- package/src/tokenizers/morphology/french-normalizer.ts +268 -0
- package/src/tokenizers/morphology/german-normalizer.ts +256 -0
- package/src/tokenizers/morphology/index.ts +46 -0
- package/src/tokenizers/morphology/italian-normalizer.ts +329 -0
- package/src/tokenizers/morphology/japanese-normalizer.ts +288 -0
- package/src/tokenizers/morphology/korean-normalizer.ts +428 -0
- package/src/tokenizers/morphology/polish-normalizer.ts +264 -0
- package/src/tokenizers/morphology/portuguese-normalizer.ts +310 -0
- package/src/tokenizers/morphology/spanish-normalizer.ts +327 -0
- package/src/tokenizers/morphology/turkish-normalizer.ts +412 -0
- package/src/tokenizers/morphology/types.ts +211 -0
- package/src/tokenizers/ms.ts +198 -0
- package/src/tokenizers/polish.ts +354 -0
- package/src/tokenizers/portuguese.ts +304 -0
- package/src/tokenizers/quechua.ts +339 -0
- package/src/tokenizers/russian.ts +375 -0
- package/src/tokenizers/spanish.ts +403 -0
- package/src/tokenizers/swahili.ts +303 -0
- package/src/tokenizers/thai.ts +236 -0
- package/src/tokenizers/tl.ts +198 -0
- package/src/tokenizers/turkish.ts +411 -0
- package/src/tokenizers/ukrainian.ts +369 -0
- package/src/tokenizers/vietnamese.ts +410 -0
- package/src/types/grammar-types.ts +617 -0
- package/src/types/unified-profile.ts +267 -0
- package/src/types.ts +709 -0
- package/src/utils/confidence-calculator.ts +147 -0
- package/src/validators/command-validator.ts +380 -0
- package/src/validators/index.ts +15 -0
|
@@ -0,0 +1,642 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Korean Tokenizer
|
|
3
|
+
*
|
|
4
|
+
* Tokenizes Korean hyperscript input.
|
|
5
|
+
* Korean is an agglutinative language with:
|
|
6
|
+
* - Hangul syllable blocks (가-힣)
|
|
7
|
+
* - Particles (조사) mark grammatical roles
|
|
8
|
+
* - 하다 verbs (noun + 하다)
|
|
9
|
+
* - CSS selectors are embedded ASCII
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
import type { LanguageToken, TokenKind, TokenStream } from '../types';
|
|
13
|
+
import {
|
|
14
|
+
BaseTokenizer,
|
|
15
|
+
TokenStreamImpl,
|
|
16
|
+
createToken,
|
|
17
|
+
createPosition,
|
|
18
|
+
createUnicodeRangeClassifier,
|
|
19
|
+
combineClassifiers,
|
|
20
|
+
isWhitespace,
|
|
21
|
+
isSelectorStart,
|
|
22
|
+
isQuote,
|
|
23
|
+
isDigit,
|
|
24
|
+
isAsciiIdentifierChar,
|
|
25
|
+
isUrlStart,
|
|
26
|
+
type KeywordEntry,
|
|
27
|
+
type TimeUnitMapping,
|
|
28
|
+
} from './base';
|
|
29
|
+
import { KoreanMorphologicalNormalizer } from './morphology/korean-normalizer';
|
|
30
|
+
import { koreanProfile } from '../generators/profiles/korean';
|
|
31
|
+
|
|
32
|
+
// =============================================================================
|
|
33
|
+
// Korean Character Classification
|
|
34
|
+
// =============================================================================
|
|
35
|
+
|
|
36
|
+
/** Check if character is a Korean syllable block (U+AC00-U+D7A3). */
|
|
37
|
+
// The single range is the precomposed Hangul Syllables block: 가 (U+AC00) through 힣 (U+D7A3).
const isHangul = createUnicodeRangeClassifier([[0xac00, 0xd7a3]]);
|
|
38
|
+
|
|
39
|
+
/**
 * Check if character is a Hangul Jamo, i.e. an individual consonant or vowel
 * letter rather than a precomposed syllable block.
 *
 * Two Unicode blocks are covered:
 * - U+1100-U+11FF  Hangul Jamo
 * - U+3130-U+318F  Hangul Compatibility Jamo
 */
const isJamo = createUnicodeRangeClassifier([
  [0x1100, 0x11ff], // Hangul Jamo block
  [0x3130, 0x318f], // Hangul Compatibility Jamo block
]);
|
|
44
|
+
|
|
45
|
+
/** Check if character is Korean (Hangul syllable or Jamo). */
|
|
46
|
+
// Built from the syllable and Jamo classifiers; presumably matches when either
// one matches — confirm against combineClassifiers in './base'.
const isKorean = combineClassifiers(isHangul, isJamo);
|
|
47
|
+
|
|
48
|
+
// =============================================================================
|
|
49
|
+
// Korean Particles (조사)
|
|
50
|
+
// =============================================================================
|
|
51
|
+
|
|
52
|
+
/**
 * Korean particles (조사) that mark grammatical roles.
 *
 * Particles attach directly after a noun. Several come in pairs whose form
 * depends on whether the preceding syllable ends in a consonant (받침) or in
 * a vowel, e.g. 을/를 or 이/가. (This is final-consonant alternation, not
 * vowel harmony.)
 *
 * This set is the single source of truth for the particle inventory:
 * SINGLE_CHAR_PARTICLES and MULTI_CHAR_PARTICLES are derived from it, so a
 * particle only ever needs to be added or removed here.
 */
const PARTICLES = new Set([
  // Subject markers
  '이', // i - after consonant
  '가', // ga - after vowel
  // Object markers
  '을', // eul - after consonant
  '를', // reul - after vowel
  // Topic markers
  '은', // eun - after consonant
  '는', // neun - after vowel
  // Location/time markers
  '에', // e - at, to
  '에서', // eseo - at (action location), from
  '로', // ro - to, by means (after vowel or ㄹ)
  '으로', // euro - to, by means (after consonant)
  // Others
  '와', // wa - and, with (after vowel)
  '과', // gwa - and, with (after consonant)
  '의', // ui - possessive ('s)
  '도', // do - also
  '만', // man - only
  '부터', // buteo - from
  '까지', // kkaji - until
  '처럼', // cheoreom - like
  '보다', // boda - than
]);

/**
 * Single-character particles.
 *
 * Derived from PARTICLES and kept in its insertion order. Every particle is
 * written in Hangul syllables from the Basic Multilingual Plane, so
 * `String.prototype.length` equals the number of syllables.
 */
const SINGLE_CHAR_PARTICLES = new Set(
  Array.from(PARTICLES).filter(particle => particle.length === 1)
);

/**
 * Multi-character particles.
 *
 * Derived from PARTICLES and kept in its insertion order.
 */
const MULTI_CHAR_PARTICLES = Array.from(PARTICLES).filter(particle => particle.length > 1);
|
|
106
|
+
|
|
107
|
+
/**
 * Metadata attached to a particle: the semantic role it signals, how much
 * confidence that signal carries, and, for particles that come in pairs,
 * which member of the pair it is.
 *
 * Paired particles alternate on whether the preceding syllable ends in a
 * consonant or in a vowel (final-consonant alternation, not vowel harmony).
 */
interface ParticleMetadata {
  /** Semantic role name (a SemanticRole value, carried here as a plain string). */
  readonly role: string;
  /** Confidence score for the role assignment. */
  readonly confidence: number;
  /** Which form of a consonant/vowel pair this particle is; absent for unpaired particles. */
  readonly variant?: 'consonant' | 'vowel';
  /** Short human-readable gloss. */
  readonly description?: string;
}

/**
 * Particle -> role metadata lookup. Entries are inserted in the order listed
 * below, grouped by function.
 */
const PARTICLE_ROLES = new Map<string, ParticleMetadata>()
  // Subject markers (consonant/vowel pair)
  .set('이', {
    role: 'agent',
    confidence: 0.85,
    variant: 'consonant',
    description: 'subject marker (after consonant)',
  })
  .set('가', {
    role: 'agent',
    confidence: 0.85,
    variant: 'vowel',
    description: 'subject marker (after vowel)',
  })
  // Object markers (consonant/vowel pair)
  .set('을', {
    role: 'patient',
    confidence: 0.95,
    variant: 'consonant',
    description: 'object marker (after consonant)',
  })
  .set('를', {
    role: 'patient',
    confidence: 0.95,
    variant: 'vowel',
    description: 'object marker (after vowel)',
  })
  // Topic markers (consonant/vowel pair)
  .set('은', {
    role: 'agent',
    confidence: 0.75,
    variant: 'consonant',
    description: 'topic marker (after consonant)',
  })
  .set('는', {
    role: 'agent',
    confidence: 0.75,
    variant: 'vowel',
    description: 'topic marker (after vowel)',
  })
  // Location/time markers
  .set('에', { role: 'destination', confidence: 0.85, description: 'at/to marker' })
  .set('에서', {
    role: 'source',
    confidence: 0.8,
    description: 'at/from marker (action location)',
  })
  // Direction/means markers (consonant/vowel pair)
  .set('로', {
    role: 'destination',
    confidence: 0.85,
    variant: 'vowel',
    description: 'to/by means (after vowel or ㄹ)',
  })
  .set('으로', {
    role: 'destination',
    confidence: 0.85,
    variant: 'consonant',
    description: 'to/by means (after consonant)',
  })
  // And/with markers (consonant/vowel pair)
  .set('와', {
    role: 'style',
    confidence: 0.7,
    variant: 'vowel',
    description: 'and/with (after vowel)',
  })
  .set('과', {
    role: 'style',
    confidence: 0.7,
    variant: 'consonant',
    description: 'and/with (after consonant)',
  })
  // Other markers
  .set('의', { role: 'patient', confidence: 0.6, description: 'possessive marker' })
  .set('도', { role: 'patient', confidence: 0.65, description: 'also/too marker' })
  .set('만', { role: 'patient', confidence: 0.65, description: 'only marker' })
  .set('부터', { role: 'source', confidence: 0.9, description: 'from/since marker' })
  .set('까지', { role: 'destination', confidence: 0.75, description: 'until/to marker' })
  .set('처럼', { role: 'manner', confidence: 0.8, description: 'like/as marker' })
  .set('보다', { role: 'source', confidence: 0.75, description: 'than marker' });
|
|
228
|
+
|
|
229
|
+
// =============================================================================
|
|
230
|
+
// Korean Extras (keywords not in profile)
|
|
231
|
+
// =============================================================================
|
|
232
|
+
|
|
233
|
+
/**
 * Commands that also get an "attached object particle" keyword form, as
 * [native command word, normalized command] pairs. Order is significant:
 * it fixes the order of the generated KOREAN_EXTRAS entries.
 */
const ATTACHED_OBJECT_COMMANDS: ReadonlyArray<readonly [string, string]> = [
  ['토글', 'toggle'],
  ['전환', 'toggle'],
  ['추가', 'add'],
  ['제거', 'remove'],
  ['삭제', 'remove'],
  ['증가', 'increment'],
  ['감소', 'decrement'],
  ['표시', 'show'],
  ['숨기다', 'hide'],
  ['설정', 'set'],
];

/**
 * Build the keyword entries for one object particle glued directly onto each
 * command word (native idiom: particle + verb written without a space).
 */
const attachObjectParticle = (particle: '를' | '을'): KeywordEntry[] =>
  ATTACHED_OBJECT_COMMANDS.map(([command, normalized]) => ({
    native: particle + command,
    normalized,
  }));

/**
 * Extra keywords not covered by the profile:
 * - Literals (true, false, null, undefined)
 * - Positional words
 * - Event names
 * - Additional reference forms
 * - Conditional event forms (native idioms)
 * - Control flow helpers and logical operators
 * - Command overrides
 * - Attached particle forms (native idioms)
 * - Time units
 *
 * NOTE(review): '할 때' and '을 때' contain a space; confirm that keyword
 * lookup can match across whitespace, otherwise only the unspaced forms apply.
 */
const KOREAN_EXTRAS: KeywordEntry[] = [
  // Values/Literals
  { native: '참', normalized: 'true' },
  { native: '거짓', normalized: 'false' },
  { native: '널', normalized: 'null' },
  { native: '미정의', normalized: 'undefined' },

  // Positional
  { native: '첫번째', normalized: 'first' },
  { native: '마지막', normalized: 'last' },
  { native: '다음', normalized: 'next' },
  { native: '이전', normalized: 'previous' },
  { native: '가장가까운', normalized: 'closest' },
  { native: '부모', normalized: 'parent' },

  // Events
  { native: '클릭', normalized: 'click' },
  { native: '더블클릭', normalized: 'dblclick' },
  { native: '변경', normalized: 'change' },
  { native: '제출', normalized: 'submit' },
  { native: '입력', normalized: 'input' },
  { native: '로드', normalized: 'load' },
  { native: '스크롤', normalized: 'scroll' },
  { native: '키다운', normalized: 'keydown' },
  { native: '키업', normalized: 'keyup' },
  { native: '마우스오버', normalized: 'mouseover' },
  { native: '마우스아웃', normalized: 'mouseout' },

  // References (additional forms)
  { native: '내', normalized: 'my' },
  { native: '그것의', normalized: 'its' },

  // Conditional event forms (native idioms)
  { native: '하면', normalized: 'on' },
  { native: '으면', normalized: 'on' },
  { native: '면', normalized: 'on' },
  { native: '할때', normalized: 'on' },
  { native: '할 때', normalized: 'on' },
  { native: '을때', normalized: 'on' },
  { native: '을 때', normalized: 'on' },
  { native: '하니까', normalized: 'on' },
  { native: '니까', normalized: 'on' },

  // Control flow helpers
  { native: '그러면', normalized: 'then' },
  { native: '그렇지않으면', normalized: 'otherwise' },
  { native: '중단', normalized: 'break' },

  // Logical
  { native: '그리고', normalized: 'and' },
  { native: '또는', normalized: 'or' },
  { native: '아니', normalized: 'not' },
  { native: '이다', normalized: 'is' },

  // Command overrides (ensure correct mapping when profile has multiple meanings)
  { native: '추가', normalized: 'add' }, // Profile may have this as 'append'

  // Attached particle forms: object particle 를 (after vowel) first,
  // then object particle 을 (after consonant)
  ...attachObjectParticle('를'),
  ...attachObjectParticle('을'),

  // Time units
  { native: '초', normalized: 's' },
  { native: '밀리초', normalized: 'ms' },
  { native: '분', normalized: 'm' },
  { native: '시간', normalized: 'h' },
];
|
|
329
|
+
|
|
330
|
+
// =============================================================================
|
|
331
|
+
// Korean Time Units
|
|
332
|
+
// =============================================================================
|
|
333
|
+
|
|
334
|
+
/**
 * Time-unit suffixes that may directly follow a number in Korean input,
 * each paired with the normalized suffix it maps to.
 *
 * Entries are listed longest pattern first so that a longer unit
 * (e.g. 밀리초) is tried before a shorter unit it ends with (초).
 * `length` is the pattern's length in UTF-16 code units and is derived
 * from the pattern itself rather than written out by hand.
 */
const KOREAN_TIME_UNITS: readonly TimeUnitMapping[] = (
  [
    ['밀리초', 'ms'],
    ['시간', 'h'],
    ['초', 's'],
    ['분', 'm'],
  ] as const
).map(([pattern, suffix]) => ({ pattern, suffix, length: pattern.length }));
|
|
345
|
+
|
|
346
|
+
// =============================================================================
|
|
347
|
+
// Korean Tokenizer Implementation
|
|
348
|
+
// =============================================================================
|
|
349
|
+
|
|
350
|
+
/**
 * Tokenizer for Korean (`ko`) input, read left-to-right.
 *
 * At every input position a fixed sequence of recognizers is tried (see
 * `tokenize`); the first recognizer that yields a token wins and scanning
 * resumes at that token's end position.
 */
export class KoreanTokenizer extends BaseTokenizer {
  readonly language = 'ko';
  readonly direction = 'ltr' as const;

  /**
   * Builds the keyword table from `koreanProfile` plus `KOREAN_EXTRAS` and
   * installs a `KoreanMorphologicalNormalizer` as this tokenizer's normalizer.
   */
  constructor() {
    super();
    // Profile + extras together are the single source of keywords.
    this.initializeKeywordsFromProfile(koreanProfile, KOREAN_EXTRAS);
    // Used by the morphological keyword matching for conjugated verb forms.
    this.normalizer = new KoreanMorphologicalNormalizer();
  }

  /**
   * Splits `input` into tokens and wraps them in a `TokenStreamImpl` tagged 'ko'.
   *
   * Recognizer order at each position:
   *   1. whitespace (skipped)
   *   2. event modifier, then CSS selector (only on a selector-start character)
   *   3. quoted string
   *   4. URL
   *   5. number, optionally with a Korean time unit (only on a digit)
   *   6. variable reference
   *   7. Korean word / keyword
   *   8. multi-character particle
   *   9. single-character particle
   *  10. ASCII word
   * A character matched by none of these is skipped without producing a token.
   *
   * @param input - Raw source text to tokenize.
   * @returns A token stream over the recognized tokens, in input order.
   */
  tokenize(input: string): TokenStream {
    const tokens: LanguageToken[] = [];
    let pos = 0;

    // Records a recognizer's result (if any) and advances past it.
    const emit = (token: LanguageToken | null | undefined): boolean => {
      if (!token) return false;
      tokens.push(token);
      pos = token.position.end;
      return true;
    };

    // Attaches role/confidence/variant metadata when the particle is listed in
    // PARTICLE_ROLES; otherwise hands the token back untouched.
    const withParticleRole = (token: LanguageToken, particle: string): LanguageToken => {
      const info = PARTICLE_ROLES.get(particle);
      if (!info) return token;
      return {
        ...token,
        metadata: {
          particleRole: info.role,
          particleConfidence: info.confidence,
          particleVariant: info.variant,
        },
      };
    };

    while (pos < input.length) {
      const ch = input[pos];

      if (isWhitespace(ch)) {
        pos += 1;
        continue;
      }

      // Selector-start characters: an event modifier (.once, .debounce(), ...)
      // takes precedence over a plain CSS selector.
      if (isSelectorStart(ch)) {
        if (emit(this.tryEventModifier(input, pos))) continue;
        if (emit(this.trySelector(input, pos))) continue;
      }

      // String literal.
      if (isQuote(ch) && emit(this.tryString(input, pos))) continue;

      // URL (/path, ./path, http://, ...).
      if (isUrlStart(input, pos) && emit(this.tryUrl(input, pos))) continue;

      // Number, possibly followed directly by a Korean time unit.
      if (isDigit(ch) && emit(this.extractKoreanNumber(input, pos))) continue;

      // Variable reference (:varname) - attempted at every remaining position.
      if (emit(this.tryVariableRef(input, pos))) continue;

      // Korean words are tried BEFORE particles so that keywords containing a
      // particle character (e.g. 로그) are not split apart.
      // NOTE(review): extractKoreanWord always returns a token when its first
      // character satisfies isKorean, so the two particle branches below are
      // only reached for characters isKorean rejects - confirm that is intended.
      if (isKorean(ch) && emit(this.extractKoreanWord(input, pos))) continue;

      // Multi-character particle (checked before single-character ones).
      const multiParticle = this.tryMultiCharParticle(input, pos, MULTI_CHAR_PARTICLES);
      if (multiParticle) {
        tokens.push(withParticleRole(multiParticle, multiParticle.value));
        pos = multiParticle.position.end;
        continue;
      }

      // Single-character particle.
      if (SINGLE_CHAR_PARTICLES.has(ch)) {
        const bare = createToken(ch, 'particle', createPosition(pos, pos + 1));
        tokens.push(withParticleRole(bare, ch));
        pos += 1;
        continue;
      }

      // ASCII word, for mixed Korean/English content.
      if (isAsciiIdentifierChar(ch) && emit(this.extractAsciiWord(input, pos))) continue;

      // Nothing recognized this character: drop it and move on.
      pos += 1;
    }

    return new TokenStreamImpl(tokens, 'ko');
  }

  /**
   * Classifies a standalone token string.
   *
   * Checks run in this order: known particle, known keyword, selector prefix
   * (`#`, `.`, `[`), quote prefix, leading digit; anything else is an identifier.
   *
   * @param token - Token text to classify.
   * @returns The token kind for `token`.
   */
  classifyToken(token: string): TokenKind {
    if (PARTICLES.has(token)) return 'particle';
    if (this.isKeyword(token)) return 'keyword';

    switch (token.charAt(0)) {
      case '#':
      case '.':
      case '[':
        return 'selector';
      case '"':
      case "'":
        return 'literal';
    }

    return /^\d/.test(token) ? 'literal' : 'identifier';
  }

  /**
   * Reads a Korean word beginning at `startPos`.
   *
   * Phase 1 looks for the longest all-Korean prefix (6 down to 2 characters)
   * that is a keyword, either verbatim or via morphological normalization.
   * Phase 2, used when phase 1 finds nothing, consumes Korean characters until
   * a particle that sits on a word boundary, then classifies the collected
   * word as keyword (verbatim or morphological) or identifier.
   *
   * @param input - Full input text.
   * @param startPos - Index of the first character of the word.
   * @returns The token read, or `null` when no character could be consumed.
   */
  private extractKoreanWord(input: string, startPos: number): LanguageToken | null {
    // Upper bound on keyword length tried in phase 1.
    // NOTE(review): assumes no keyword exceeds 6 characters - confirm against the profile.
    const maxKeywordLen = 6;
    const longest = Math.min(maxKeywordLen, input.length - startPos);

    for (let len = longest; len >= 2; len--) {
      const stop = startPos + len;
      const candidate = input.slice(startPos, stop);

      // Only consider prefixes made up entirely of Korean characters
      // (checked per UTF-16 code unit).
      if (!candidate.split('').every((unit) => isKorean(unit))) continue;

      const exact = this.lookupKeyword(candidate);
      if (exact) {
        return createToken(candidate, 'keyword', createPosition(startPos, stop), exact.normalized);
      }

      // Conjugated forms may still normalize to a keyword.
      const viaMorph = this.tryMorphKeywordMatch(candidate, startPos, stop);
      if (viaMorph) return viaMorph;
    }

    // Phase 2: plain word scan bounded by particles.
    let end = startPos;
    while (end < input.length) {
      const ch = input[end];
      const hasContent = end > startPos;

      if (hasContent) {
        // A single-character particle ends the word only when it sits on a
        // boundary: end of input, whitespace, a non-Korean character, or
        // another single-character particle. Otherwise it is part of the word.
        if (SINGLE_CHAR_PARTICLES.has(ch)) {
          const next = input.charAt(end + 1);
          if (
            next === '' ||
            isWhitespace(next) ||
            !isKorean(next) ||
            SINGLE_CHAR_PARTICLES.has(next)
          ) {
            break;
          }
        }

        // A multi-character particle ends the word when whatever follows it
        // is end of input, whitespace, or a non-Korean character.
        let atMultiParticle = false;
        for (const particle of MULTI_CHAR_PARTICLES) {
          if (!input.startsWith(particle, end)) continue;
          const following = input.charAt(end + particle.length);
          if (following === '' || isWhitespace(following) || !isKorean(following)) {
            atMultiParticle = true;
            break;
          }
        }
        if (atMultiParticle) break;
      }

      if (!isKorean(ch)) break;
      end += 1;
    }

    if (end === startPos) return null;

    const word = input.slice(startPos, end);
    const span = createPosition(startPos, end);

    const exact = this.lookupKeyword(word);
    if (exact) {
      return createToken(word, 'keyword', span, exact.normalized);
    }

    // Conjugated forms may still normalize to a keyword.
    const viaMorph = this.tryMorphKeywordMatch(word, startPos, end);
    if (viaMorph) return viaMorph;

    return createToken(word, 'identifier', span);
  }

  /**
   * Reads a run of ASCII identifier characters starting at `startPos`.
   *
   * @param input - Full input text.
   * @param startPos - Index at which the run begins.
   * @returns An identifier token, or `null` when the run is empty.
   */
  private extractAsciiWord(input: string, startPos: number): LanguageToken | null {
    let end = startPos;
    while (end < input.length && isAsciiIdentifierChar(input[end])) {
      end += 1;
    }

    if (end === startPos) return null;

    return createToken(input.slice(startPos, end), 'identifier', createPosition(startPos, end));
  }

  /**
   * Reads a number at `startPos`, including any Korean time-unit suffix from
   * `KOREAN_TIME_UNITS`. Delegates to `tryNumberWithTimeUnits` with sign
   * handling and whitespace skipping both turned off.
   *
   * @param input - Full input text.
   * @param startPos - Index of the first digit.
   * @returns Whatever `tryNumberWithTimeUnits` produces for this position.
   */
  private extractKoreanNumber(input: string, startPos: number): LanguageToken | null {
    const options = { allowSign: false, skipWhitespace: false };
    return this.tryNumberWithTimeUnits(input, startPos, KOREAN_TIME_UNITS, options);
  }
}
|
|
638
|
+
|
|
639
|
+
/**
|
|
640
|
+
* Singleton instance.
|
|
641
|
+
*/
|
|
642
|
+
// Shared module-level instance, constructed once when this module is evaluated.
export const koreanTokenizer = new KoreanTokenizer();
|