@lokascript/semantic 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +686 -0
- package/dist/browser-ar.ar.global.js +2 -0
- package/dist/browser-core.core.global.js +2 -0
- package/dist/browser-de.de.global.js +2 -0
- package/dist/browser-east-asian.east-asian.global.js +2 -0
- package/dist/browser-en-tr.en-tr.global.js +2 -0
- package/dist/browser-en.en.global.js +2 -0
- package/dist/browser-es-en.es-en.global.js +2 -0
- package/dist/browser-es.es.global.js +2 -0
- package/dist/browser-fr.fr.global.js +2 -0
- package/dist/browser-id.id.global.js +2 -0
- package/dist/browser-ja.ja.global.js +2 -0
- package/dist/browser-ko.ko.global.js +2 -0
- package/dist/browser-lazy.lazy.global.js +2 -0
- package/dist/browser-priority.priority.global.js +2 -0
- package/dist/browser-pt.pt.global.js +2 -0
- package/dist/browser-qu.qu.global.js +2 -0
- package/dist/browser-sw.sw.global.js +2 -0
- package/dist/browser-tr.tr.global.js +2 -0
- package/dist/browser-western.western.global.js +2 -0
- package/dist/browser-zh.zh.global.js +2 -0
- package/dist/browser.global.js +3 -0
- package/dist/browser.global.js.map +1 -0
- package/dist/index.cjs +35051 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +3426 -0
- package/dist/index.d.ts +3426 -0
- package/dist/index.js +34890 -0
- package/dist/index.js.map +1 -0
- package/dist/languages/ar.d.ts +78 -0
- package/dist/languages/ar.js +1622 -0
- package/dist/languages/ar.js.map +1 -0
- package/dist/languages/de.d.ts +38 -0
- package/dist/languages/de.js +1168 -0
- package/dist/languages/de.js.map +1 -0
- package/dist/languages/en.d.ts +44 -0
- package/dist/languages/en.js +3491 -0
- package/dist/languages/en.js.map +1 -0
- package/dist/languages/es.d.ts +52 -0
- package/dist/languages/es.js +1493 -0
- package/dist/languages/es.js.map +1 -0
- package/dist/languages/fr.d.ts +37 -0
- package/dist/languages/fr.js +1159 -0
- package/dist/languages/fr.js.map +1 -0
- package/dist/languages/id.d.ts +35 -0
- package/dist/languages/id.js +1152 -0
- package/dist/languages/id.js.map +1 -0
- package/dist/languages/ja.d.ts +53 -0
- package/dist/languages/ja.js +1430 -0
- package/dist/languages/ja.js.map +1 -0
- package/dist/languages/ko.d.ts +51 -0
- package/dist/languages/ko.js +1729 -0
- package/dist/languages/ko.js.map +1 -0
- package/dist/languages/pt.d.ts +37 -0
- package/dist/languages/pt.js +1127 -0
- package/dist/languages/pt.js.map +1 -0
- package/dist/languages/qu.d.ts +36 -0
- package/dist/languages/qu.js +1143 -0
- package/dist/languages/qu.js.map +1 -0
- package/dist/languages/sw.d.ts +35 -0
- package/dist/languages/sw.js +1147 -0
- package/dist/languages/sw.js.map +1 -0
- package/dist/languages/tr.d.ts +45 -0
- package/dist/languages/tr.js +1529 -0
- package/dist/languages/tr.js.map +1 -0
- package/dist/languages/zh.d.ts +58 -0
- package/dist/languages/zh.js +1257 -0
- package/dist/languages/zh.js.map +1 -0
- package/dist/types-C4dcj53L.d.ts +600 -0
- package/package.json +202 -0
- package/src/__test-utils__/index.ts +7 -0
- package/src/__test-utils__/test-helpers.ts +8 -0
- package/src/__types__/test-helpers.ts +122 -0
- package/src/analysis/index.ts +479 -0
- package/src/ast-builder/command-mappers.ts +1133 -0
- package/src/ast-builder/expression-parser/index.ts +41 -0
- package/src/ast-builder/expression-parser/parser.ts +563 -0
- package/src/ast-builder/expression-parser/tokenizer.ts +394 -0
- package/src/ast-builder/expression-parser/types.ts +208 -0
- package/src/ast-builder/index.ts +536 -0
- package/src/ast-builder/value-converters.ts +172 -0
- package/src/bridge.ts +275 -0
- package/src/browser-ar.ts +162 -0
- package/src/browser-core.ts +231 -0
- package/src/browser-de.ts +162 -0
- package/src/browser-east-asian.ts +173 -0
- package/src/browser-en-tr.ts +165 -0
- package/src/browser-en.ts +157 -0
- package/src/browser-es-en.ts +200 -0
- package/src/browser-es.ts +170 -0
- package/src/browser-fr.ts +162 -0
- package/src/browser-id.ts +162 -0
- package/src/browser-ja.ts +162 -0
- package/src/browser-ko.ts +162 -0
- package/src/browser-lazy.ts +189 -0
- package/src/browser-priority.ts +214 -0
- package/src/browser-pt.ts +162 -0
- package/src/browser-qu.ts +162 -0
- package/src/browser-sw.ts +162 -0
- package/src/browser-tr.ts +162 -0
- package/src/browser-western.ts +181 -0
- package/src/browser-zh.ts +162 -0
- package/src/browser.ts +268 -0
- package/src/cache/index.ts +14 -0
- package/src/cache/semantic-cache.ts +344 -0
- package/src/core-bridge.ts +372 -0
- package/src/explicit/converter.ts +258 -0
- package/src/explicit/index.ts +18 -0
- package/src/explicit/parser.ts +236 -0
- package/src/explicit/renderer.ts +424 -0
- package/src/generators/command-schemas.ts +1636 -0
- package/src/generators/event-handler-generator.ts +109 -0
- package/src/generators/index.ts +117 -0
- package/src/generators/language-profiles.ts +139 -0
- package/src/generators/pattern-generator.ts +537 -0
- package/src/generators/profiles/arabic.ts +131 -0
- package/src/generators/profiles/bengali.ts +132 -0
- package/src/generators/profiles/chinese.ts +124 -0
- package/src/generators/profiles/english.ts +113 -0
- package/src/generators/profiles/french.ts +125 -0
- package/src/generators/profiles/german.ts +126 -0
- package/src/generators/profiles/hindi.ts +146 -0
- package/src/generators/profiles/index.ts +46 -0
- package/src/generators/profiles/indonesian.ts +125 -0
- package/src/generators/profiles/italian.ts +139 -0
- package/src/generators/profiles/japanese.ts +149 -0
- package/src/generators/profiles/korean.ts +127 -0
- package/src/generators/profiles/marker-templates.ts +288 -0
- package/src/generators/profiles/ms.ts +130 -0
- package/src/generators/profiles/polish.ts +249 -0
- package/src/generators/profiles/portuguese.ts +115 -0
- package/src/generators/profiles/quechua.ts +113 -0
- package/src/generators/profiles/russian.ts +260 -0
- package/src/generators/profiles/spanish.ts +130 -0
- package/src/generators/profiles/swahili.ts +129 -0
- package/src/generators/profiles/thai.ts +132 -0
- package/src/generators/profiles/tl.ts +128 -0
- package/src/generators/profiles/turkish.ts +124 -0
- package/src/generators/profiles/types.ts +165 -0
- package/src/generators/profiles/ukrainian.ts +270 -0
- package/src/generators/profiles/vietnamese.ts +133 -0
- package/src/generators/schema-error-codes.ts +160 -0
- package/src/generators/schema-validator.ts +391 -0
- package/src/index.ts +429 -0
- package/src/language-building-schema.ts +3170 -0
- package/src/language-loader.ts +394 -0
- package/src/languages/_all.ts +65 -0
- package/src/languages/ar.ts +15 -0
- package/src/languages/bn.ts +16 -0
- package/src/languages/de.ts +15 -0
- package/src/languages/en.ts +29 -0
- package/src/languages/es.ts +15 -0
- package/src/languages/fr.ts +15 -0
- package/src/languages/hi.ts +26 -0
- package/src/languages/id.ts +15 -0
- package/src/languages/index.ts +18 -0
- package/src/languages/it.ts +15 -0
- package/src/languages/ja.ts +15 -0
- package/src/languages/ko.ts +15 -0
- package/src/languages/ms.ts +16 -0
- package/src/languages/pl.ts +18 -0
- package/src/languages/pt.ts +15 -0
- package/src/languages/qu.ts +15 -0
- package/src/languages/ru.ts +26 -0
- package/src/languages/sw.ts +15 -0
- package/src/languages/th.ts +16 -0
- package/src/languages/tl.ts +16 -0
- package/src/languages/tr.ts +15 -0
- package/src/languages/uk.ts +26 -0
- package/src/languages/vi.ts +16 -0
- package/src/languages/zh.ts +15 -0
- package/src/parser/index.ts +15 -0
- package/src/parser/pattern-matcher.ts +1181 -0
- package/src/parser/semantic-parser.ts +573 -0
- package/src/parser/utils/index.ts +35 -0
- package/src/parser/utils/marker-resolution.ts +111 -0
- package/src/parser/utils/possessive-keywords.ts +43 -0
- package/src/parser/utils/role-positioning.ts +70 -0
- package/src/parser/utils/type-validation.ts +134 -0
- package/src/patterns/add/ar.ts +71 -0
- package/src/patterns/add/bn.ts +70 -0
- package/src/patterns/add/hi.ts +69 -0
- package/src/patterns/add/index.ts +87 -0
- package/src/patterns/add/it.ts +61 -0
- package/src/patterns/add/ja.ts +93 -0
- package/src/patterns/add/ko.ts +74 -0
- package/src/patterns/add/ms.ts +30 -0
- package/src/patterns/add/pl.ts +62 -0
- package/src/patterns/add/ru.ts +62 -0
- package/src/patterns/add/th.ts +49 -0
- package/src/patterns/add/tl.ts +30 -0
- package/src/patterns/add/tr.ts +71 -0
- package/src/patterns/add/uk.ts +62 -0
- package/src/patterns/add/vi.ts +61 -0
- package/src/patterns/add/zh.ts +71 -0
- package/src/patterns/builders.ts +207 -0
- package/src/patterns/decrement/bn.ts +70 -0
- package/src/patterns/decrement/de.ts +42 -0
- package/src/patterns/decrement/hi.ts +68 -0
- package/src/patterns/decrement/index.ts +79 -0
- package/src/patterns/decrement/it.ts +69 -0
- package/src/patterns/decrement/ms.ts +30 -0
- package/src/patterns/decrement/pl.ts +58 -0
- package/src/patterns/decrement/ru.ts +58 -0
- package/src/patterns/decrement/th.ts +49 -0
- package/src/patterns/decrement/tl.ts +30 -0
- package/src/patterns/decrement/tr.ts +48 -0
- package/src/patterns/decrement/uk.ts +58 -0
- package/src/patterns/decrement/vi.ts +61 -0
- package/src/patterns/decrement/zh.ts +32 -0
- package/src/patterns/en.ts +302 -0
- package/src/patterns/event-handler/ar.ts +151 -0
- package/src/patterns/event-handler/bn.ts +72 -0
- package/src/patterns/event-handler/de.ts +117 -0
- package/src/patterns/event-handler/en.ts +117 -0
- package/src/patterns/event-handler/es.ts +136 -0
- package/src/patterns/event-handler/fr.ts +117 -0
- package/src/patterns/event-handler/hi.ts +64 -0
- package/src/patterns/event-handler/id.ts +117 -0
- package/src/patterns/event-handler/index.ts +119 -0
- package/src/patterns/event-handler/it.ts +54 -0
- package/src/patterns/event-handler/ja.ts +118 -0
- package/src/patterns/event-handler/ko.ts +133 -0
- package/src/patterns/event-handler/ms.ts +30 -0
- package/src/patterns/event-handler/pl.ts +62 -0
- package/src/patterns/event-handler/pt.ts +117 -0
- package/src/patterns/event-handler/qu.ts +66 -0
- package/src/patterns/event-handler/ru.ts +62 -0
- package/src/patterns/event-handler/shared.ts +270 -0
- package/src/patterns/event-handler/sw.ts +117 -0
- package/src/patterns/event-handler/th.ts +53 -0
- package/src/patterns/event-handler/tl.ts +30 -0
- package/src/patterns/event-handler/tr.ts +170 -0
- package/src/patterns/event-handler/uk.ts +62 -0
- package/src/patterns/event-handler/vi.ts +61 -0
- package/src/patterns/event-handler/zh.ts +150 -0
- package/src/patterns/get/ar.ts +49 -0
- package/src/patterns/get/bn.ts +47 -0
- package/src/patterns/get/de.ts +32 -0
- package/src/patterns/get/hi.ts +52 -0
- package/src/patterns/get/index.ts +83 -0
- package/src/patterns/get/it.ts +56 -0
- package/src/patterns/get/ja.ts +53 -0
- package/src/patterns/get/ko.ts +53 -0
- package/src/patterns/get/ms.ts +30 -0
- package/src/patterns/get/pl.ts +57 -0
- package/src/patterns/get/ru.ts +57 -0
- package/src/patterns/get/th.ts +29 -0
- package/src/patterns/get/tl.ts +30 -0
- package/src/patterns/get/uk.ts +57 -0
- package/src/patterns/get/vi.ts +48 -0
- package/src/patterns/grammar-transformed/index.ts +39 -0
- package/src/patterns/grammar-transformed/ja.ts +1713 -0
- package/src/patterns/grammar-transformed/ko.ts +1311 -0
- package/src/patterns/grammar-transformed/tr.ts +1067 -0
- package/src/patterns/hide/ar.ts +67 -0
- package/src/patterns/hide/bn.ts +47 -0
- package/src/patterns/hide/de.ts +36 -0
- package/src/patterns/hide/hi.ts +61 -0
- package/src/patterns/hide/index.ts +91 -0
- package/src/patterns/hide/it.ts +56 -0
- package/src/patterns/hide/ja.ts +69 -0
- package/src/patterns/hide/ko.ts +69 -0
- package/src/patterns/hide/ms.ts +30 -0
- package/src/patterns/hide/pl.ts +57 -0
- package/src/patterns/hide/ru.ts +57 -0
- package/src/patterns/hide/th.ts +29 -0
- package/src/patterns/hide/tl.ts +30 -0
- package/src/patterns/hide/tr.ts +65 -0
- package/src/patterns/hide/uk.ts +57 -0
- package/src/patterns/hide/vi.ts +56 -0
- package/src/patterns/hide/zh.ts +68 -0
- package/src/patterns/increment/bn.ts +70 -0
- package/src/patterns/increment/de.ts +36 -0
- package/src/patterns/increment/hi.ts +68 -0
- package/src/patterns/increment/index.ts +79 -0
- package/src/patterns/increment/it.ts +69 -0
- package/src/patterns/increment/ms.ts +30 -0
- package/src/patterns/increment/pl.ts +58 -0
- package/src/patterns/increment/ru.ts +58 -0
- package/src/patterns/increment/th.ts +49 -0
- package/src/patterns/increment/tl.ts +30 -0
- package/src/patterns/increment/tr.ts +52 -0
- package/src/patterns/increment/uk.ts +58 -0
- package/src/patterns/increment/vi.ts +61 -0
- package/src/patterns/increment/zh.ts +32 -0
- package/src/patterns/index.ts +84 -0
- package/src/patterns/languages/en/control-flow.ts +93 -0
- package/src/patterns/languages/en/fetch.ts +62 -0
- package/src/patterns/languages/en/index.ts +42 -0
- package/src/patterns/languages/en/repeat.ts +67 -0
- package/src/patterns/languages/en/set.ts +48 -0
- package/src/patterns/languages/en/swap.ts +38 -0
- package/src/patterns/languages/en/temporal.ts +57 -0
- package/src/patterns/put/ar.ts +74 -0
- package/src/patterns/put/bn.ts +53 -0
- package/src/patterns/put/en.ts +74 -0
- package/src/patterns/put/es.ts +74 -0
- package/src/patterns/put/hi.ts +69 -0
- package/src/patterns/put/id.ts +96 -0
- package/src/patterns/put/index.ts +99 -0
- package/src/patterns/put/it.ts +56 -0
- package/src/patterns/put/ja.ts +75 -0
- package/src/patterns/put/ko.ts +67 -0
- package/src/patterns/put/ms.ts +30 -0
- package/src/patterns/put/pl.ts +81 -0
- package/src/patterns/put/ru.ts +85 -0
- package/src/patterns/put/th.ts +32 -0
- package/src/patterns/put/tl.ts +30 -0
- package/src/patterns/put/tr.ts +67 -0
- package/src/patterns/put/uk.ts +85 -0
- package/src/patterns/put/vi.ts +72 -0
- package/src/patterns/put/zh.ts +62 -0
- package/src/patterns/registry.ts +163 -0
- package/src/patterns/remove/ar.ts +71 -0
- package/src/patterns/remove/bn.ts +68 -0
- package/src/patterns/remove/hi.ts +69 -0
- package/src/patterns/remove/index.ts +87 -0
- package/src/patterns/remove/it.ts +69 -0
- package/src/patterns/remove/ja.ts +74 -0
- package/src/patterns/remove/ko.ts +78 -0
- package/src/patterns/remove/ms.ts +30 -0
- package/src/patterns/remove/pl.ts +62 -0
- package/src/patterns/remove/ru.ts +62 -0
- package/src/patterns/remove/th.ts +49 -0
- package/src/patterns/remove/tl.ts +30 -0
- package/src/patterns/remove/tr.ts +78 -0
- package/src/patterns/remove/uk.ts +62 -0
- package/src/patterns/remove/vi.ts +61 -0
- package/src/patterns/remove/zh.ts +72 -0
- package/src/patterns/set/ar.ts +84 -0
- package/src/patterns/set/bn.ts +53 -0
- package/src/patterns/set/de.ts +84 -0
- package/src/patterns/set/es.ts +92 -0
- package/src/patterns/set/fr.ts +88 -0
- package/src/patterns/set/hi.ts +56 -0
- package/src/patterns/set/id.ts +84 -0
- package/src/patterns/set/index.ts +107 -0
- package/src/patterns/set/it.ts +56 -0
- package/src/patterns/set/ja.ts +86 -0
- package/src/patterns/set/ko.ts +85 -0
- package/src/patterns/set/ms.ts +30 -0
- package/src/patterns/set/pl.ts +57 -0
- package/src/patterns/set/pt.ts +84 -0
- package/src/patterns/set/ru.ts +57 -0
- package/src/patterns/set/th.ts +31 -0
- package/src/patterns/set/tl.ts +30 -0
- package/src/patterns/set/tr.ts +107 -0
- package/src/patterns/set/uk.ts +57 -0
- package/src/patterns/set/vi.ts +53 -0
- package/src/patterns/set/zh.ts +84 -0
- package/src/patterns/show/ar.ts +67 -0
- package/src/patterns/show/bn.ts +47 -0
- package/src/patterns/show/de.ts +32 -0
- package/src/patterns/show/fr.ts +32 -0
- package/src/patterns/show/hi.ts +61 -0
- package/src/patterns/show/index.ts +95 -0
- package/src/patterns/show/it.ts +56 -0
- package/src/patterns/show/ja.ts +69 -0
- package/src/patterns/show/ko.ts +73 -0
- package/src/patterns/show/ms.ts +30 -0
- package/src/patterns/show/pl.ts +57 -0
- package/src/patterns/show/ru.ts +57 -0
- package/src/patterns/show/th.ts +29 -0
- package/src/patterns/show/tl.ts +30 -0
- package/src/patterns/show/tr.ts +65 -0
- package/src/patterns/show/uk.ts +57 -0
- package/src/patterns/show/vi.ts +56 -0
- package/src/patterns/show/zh.ts +68 -0
- package/src/patterns/take/ar.ts +51 -0
- package/src/patterns/take/index.ts +31 -0
- package/src/patterns/toggle/ar.ts +61 -0
- package/src/patterns/toggle/bn.ts +70 -0
- package/src/patterns/toggle/en.ts +61 -0
- package/src/patterns/toggle/es.ts +61 -0
- package/src/patterns/toggle/hi.ts +80 -0
- package/src/patterns/toggle/index.ts +95 -0
- package/src/patterns/toggle/it.ts +69 -0
- package/src/patterns/toggle/ja.ts +156 -0
- package/src/patterns/toggle/ko.ts +113 -0
- package/src/patterns/toggle/ms.ts +30 -0
- package/src/patterns/toggle/pl.ts +62 -0
- package/src/patterns/toggle/ru.ts +62 -0
- package/src/patterns/toggle/th.ts +50 -0
- package/src/patterns/toggle/tl.ts +30 -0
- package/src/patterns/toggle/tr.ts +88 -0
- package/src/patterns/toggle/uk.ts +62 -0
- package/src/patterns/toggle/vi.ts +61 -0
- package/src/patterns/toggle/zh.ts +99 -0
- package/src/public-api.ts +286 -0
- package/src/registry.ts +441 -0
- package/src/tokenizers/arabic.ts +723 -0
- package/src/tokenizers/base.ts +1300 -0
- package/src/tokenizers/bengali.ts +289 -0
- package/src/tokenizers/chinese.ts +481 -0
- package/src/tokenizers/english.ts +416 -0
- package/src/tokenizers/french.ts +326 -0
- package/src/tokenizers/german.ts +324 -0
- package/src/tokenizers/hindi.ts +319 -0
- package/src/tokenizers/index.ts +127 -0
- package/src/tokenizers/indonesian.ts +306 -0
- package/src/tokenizers/italian.ts +458 -0
- package/src/tokenizers/japanese.ts +447 -0
- package/src/tokenizers/korean.ts +642 -0
- package/src/tokenizers/morphology/arabic-normalizer.ts +242 -0
- package/src/tokenizers/morphology/french-normalizer.ts +268 -0
- package/src/tokenizers/morphology/german-normalizer.ts +256 -0
- package/src/tokenizers/morphology/index.ts +46 -0
- package/src/tokenizers/morphology/italian-normalizer.ts +329 -0
- package/src/tokenizers/morphology/japanese-normalizer.ts +288 -0
- package/src/tokenizers/morphology/korean-normalizer.ts +428 -0
- package/src/tokenizers/morphology/polish-normalizer.ts +264 -0
- package/src/tokenizers/morphology/portuguese-normalizer.ts +310 -0
- package/src/tokenizers/morphology/spanish-normalizer.ts +327 -0
- package/src/tokenizers/morphology/turkish-normalizer.ts +412 -0
- package/src/tokenizers/morphology/types.ts +211 -0
- package/src/tokenizers/ms.ts +198 -0
- package/src/tokenizers/polish.ts +354 -0
- package/src/tokenizers/portuguese.ts +304 -0
- package/src/tokenizers/quechua.ts +339 -0
- package/src/tokenizers/russian.ts +375 -0
- package/src/tokenizers/spanish.ts +403 -0
- package/src/tokenizers/swahili.ts +303 -0
- package/src/tokenizers/thai.ts +236 -0
- package/src/tokenizers/tl.ts +198 -0
- package/src/tokenizers/turkish.ts +411 -0
- package/src/tokenizers/ukrainian.ts +369 -0
- package/src/tokenizers/vietnamese.ts +410 -0
- package/src/types/grammar-types.ts +617 -0
- package/src/types/unified-profile.ts +267 -0
- package/src/types.ts +709 -0
- package/src/utils/confidence-calculator.ts +147 -0
- package/src/validators/command-validator.ts +380 -0
- package/src/validators/index.ts +15 -0
|
@@ -0,0 +1,447 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Japanese Tokenizer
|
|
3
|
+
*
|
|
4
|
+
* Tokenizes Japanese hyperscript input.
|
|
5
|
+
* Japanese is challenging because:
|
|
6
|
+
* - No spaces between words
|
|
7
|
+
* - Particles (助詞) mark grammatical roles
|
|
8
|
+
* - Mixed scripts (hiragana, katakana, kanji, romaji)
|
|
9
|
+
* - CSS selectors are embedded ASCII
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
import type { LanguageToken, TokenKind, TokenStream } from '../types';
|
|
13
|
+
import {
|
|
14
|
+
BaseTokenizer,
|
|
15
|
+
TokenStreamImpl,
|
|
16
|
+
createToken,
|
|
17
|
+
createPosition,
|
|
18
|
+
createUnicodeRangeClassifier,
|
|
19
|
+
combineClassifiers,
|
|
20
|
+
isWhitespace,
|
|
21
|
+
isSelectorStart,
|
|
22
|
+
isQuote,
|
|
23
|
+
isDigit,
|
|
24
|
+
isAsciiIdentifierChar,
|
|
25
|
+
isUrlStart,
|
|
26
|
+
type KeywordEntry,
|
|
27
|
+
type TimeUnitMapping,
|
|
28
|
+
} from './base';
|
|
29
|
+
import { JapaneseMorphologicalNormalizer } from './morphology/japanese-normalizer';
|
|
30
|
+
import { japaneseProfile } from '../generators/profiles/japanese';
|
|
31
|
+
|
|
32
|
+
// =============================================================================
|
|
33
|
+
// Japanese Character Classification
|
|
34
|
+
// =============================================================================
|
|
35
|
+
|
|
36
|
+
/** Check if character is hiragana (U+3040-U+309F). */
|
|
37
|
+
// Classifier built from a single [start, end] pair: 0x3040 to 0x309f (the Hiragana block).
// NOTE(review): both bounds are presumably inclusive code points — confirm in createUnicodeRangeClassifier (./base).
const isHiragana = createUnicodeRangeClassifier([[0x3040, 0x309f]]);
|
|
38
|
+
|
|
39
|
+
/** Check if character is katakana (U+30A0-U+30FF). */
|
|
40
|
+
// Classifier built from a single [start, end] pair: 0x30a0 to 0x30ff (the Katakana block).
// Half-width katakana (U+FF65-U+FF9F) lie outside this pair and are therefore not matched here.
// NOTE(review): both bounds are presumably inclusive code points — confirm in createUnicodeRangeClassifier (./base).
const isKatakana = createUnicodeRangeClassifier([[0x30a0, 0x30ff]]);
|
|
41
|
+
|
|
42
|
+
/** Check if character is kanji (CJK Unified Ideographs + Extension A). */
|
|
43
|
+
// Kanji classifier: two [start, end] pairs, listed main block first, then Extension A.
const isKanji = createUnicodeRangeClassifier([
  [0x4e00, 0x9fff], // U+4E00 to U+9FFF: CJK Unified Ideographs
  [0x3400, 0x4dbf], // U+3400 to U+4DBF: CJK Unified Ideographs Extension A
]);
|
|
47
|
+
|
|
48
|
+
/** Check if character is Japanese (hiragana, katakana, or kanji). */
|
|
49
|
+
// Built by handing the hiragana, katakana and kanji classifiers to combineClassifiers.
// NOTE(review): combineClassifiers presumably yields true when any one of them matches — confirm in ./base.
const isJapanese = combineClassifiers(isHiragana, isKatakana, isKanji);
|
|
50
|
+
|
|
51
|
+
// =============================================================================
|
|
52
|
+
// Japanese Particles
|
|
53
|
+
// =============================================================================
|
|
54
|
+
|
|
55
|
+
/**
|
|
56
|
+
* Japanese particles that mark grammatical roles.
|
|
57
|
+
* These are short hiragana markers (one or two characters) that appear after nouns/verbs.
|
|
58
|
+
*/
|
|
59
|
+
// Every particle recognised in this file, in insertion order:
//   を (wo)   object marker          に (ni)   destination, time
//   で (de)   location of action     から (kara) from
//   まで (made) until                 へ (e)    direction
//   と (to)   and, with              の (no)   possessive
//   が (ga)   subject marker         は (wa)   topic marker
//   も (mo)   also                   より (yori) than, from
const PARTICLES = new Set('を に で から まで へ と の が は も より'.split(' '));
|
|
73
|
+
|
|
74
|
+
/**
|
|
75
|
+
* Single-character particles (most common).
|
|
76
|
+
*/
|
|
77
|
+
// One set member per character of the literal: を に で へ と の が は も.
const SINGLE_CHAR_PARTICLES = new Set([...'をにでへとのがはも']);
|
|
78
|
+
|
|
79
|
+
/**
|
|
80
|
+
* Multi-character particles.
|
|
81
|
+
*/
|
|
82
|
+
// から (kara), まで (made), より (yori), kept as a plain mutable string array.
const MULTI_CHAR_PARTICLES = 'から,まで,より'.split(',');
|
|
83
|
+
|
|
84
|
+
/**
|
|
85
|
+
* Particle metadata mapping particles to semantic roles and confidence scores.
|
|
86
|
+
* Used to enhance particle tokens with role information for the pattern matcher.
|
|
87
|
+
*/
|
|
88
|
+
/** Shape of the per-particle record stored in the role lookup table. */
interface ParticleMetadata {
  /** Name of the semantic role the particle signals (a SemanticRole, held as a plain string). */
  readonly role: string;
  /** Numeric confidence recorded for that role. */
  readonly confidence: number;
  /** Optional short gloss of what the particle marks. */
  readonly description?: string;
}
|
|
93
|
+
|
|
94
|
+
// Lookup from particle text to its role record. The tokenizer reads `role` and
// `confidence` from here when it attaches metadata to particle tokens.
// Entries are added one at a time; insertion order is を, に, で, から, まで, へ, と, の, が, は, も, より.
const PARTICLE_ROLES = new Map<string, ParticleMetadata>()
  .set('を', { role: 'patient', confidence: 0.95, description: 'object marker' })
  .set('に', { role: 'destination', confidence: 0.85, description: 'destination/time marker' })
  .set('で', { role: 'manner', confidence: 0.88, description: 'means/location marker' })
  .set('から', { role: 'source', confidence: 0.9, description: 'from/source marker' })
  .set('まで', { role: 'destination', confidence: 0.75, description: 'until/boundary marker' })
  .set('へ', { role: 'destination', confidence: 0.9, description: 'direction marker' })
  .set('と', { role: 'style', confidence: 0.7, description: 'with/and marker' })
  .set('の', { role: 'patient', confidence: 0.6, description: 'possessive marker' })
  .set('が', { role: 'agent', confidence: 0.85, description: 'subject marker' })
  .set('は', { role: 'agent', confidence: 0.75, description: 'topic marker' })
  .set('も', { role: 'patient', confidence: 0.65, description: 'also/too marker' })
  .set('より', { role: 'source', confidence: 0.85, description: 'from/than marker' });
|
|
108
|
+
|
|
109
|
+
// =============================================================================
|
|
110
|
+
// Japanese Extras (keywords not in profile)
|
|
111
|
+
// =============================================================================
|
|
112
|
+
|
|
113
|
+
/**
|
|
114
|
+
* Extra keywords not covered by the profile:
|
|
115
|
+
* - Literals (true, false, null, undefined)
|
|
116
|
+
* - Positional words
|
|
117
|
+
* - Event names
|
|
118
|
+
* - Additional reference forms (me, my, its); attached particle forms are deliberately left out (see the note inside the list)
|
|
119
|
+
* - Conditional event forms and control-flow helpers (if / then)
|
|
120
|
+
* - Time units
|
|
121
|
+
*/
|
|
122
|
+
// Written as [native, normalized] pairs; each pair is expanded into one
// { native, normalized } entry, in the order listed.
const JAPANESE_EXTRAS: KeywordEntry[] = (
  [
    // Values/Literals
    ['真', 'true'],
    ['偽', 'false'],
    ['ヌル', 'null'],
    ['未定義', 'undefined'],

    // Positional
    ['最初', 'first'],
    ['最後', 'last'],
    ['次', 'next'],
    ['前', 'previous'],
    ['最も近い', 'closest'],
    ['親', 'parent'],

    // Events
    ['クリック', 'click'],
    ['変更', 'change'],
    ['送信', 'submit'],
    ['入力', 'input'],
    ['ロード', 'load'],
    ['スクロール', 'scroll'],
    ['キーダウン', 'keydown'],
    ['キーアップ', 'keyup'],
    ['マウスオーバー', 'mouseover'],
    ['マウスアウト', 'mouseout'],
    ['ブラー', 'blur'],

    // References (additional forms)
    ['私', 'me'],
    ['私の', 'my'],
    ['その', 'its'],

    // Attached particle forms (を切り替え, を追加, ...) are left out on purpose:
    // they would make parsing ambiguous. The separate particle + verb pattern
    // (を + 切り替え) is preferred for consistent semantic analysis.

    // Conditional event forms
    ['したら', 'on'],
    ['すると', 'on'],
    ['時に', 'on'],

    // Control flow helpers
    ['もし', 'if'], // begins with the particle も, so it needs its own entry
    ['ならば', 'then'],
    ['なら', 'then'],

    // Time units
    ['秒', 's'],
    ['ミリ秒', 'ms'],
    ['分', 'm'],
    ['時間', 'h'],
  ] as const
).map(([native, normalized]) => ({ native, normalized }));
|
|
175
|
+
|
|
176
|
+
// =============================================================================
|
|
177
|
+
// Japanese Time Units
|
|
178
|
+
// =============================================================================
|
|
179
|
+
|
|
180
|
+
/**
|
|
181
|
+
* Japanese time unit patterns for number parsing.
|
|
182
|
+
* Sorted by length (longest first) to ensure correct matching.
|
|
183
|
+
* Japanese time units attach directly without whitespace.
|
|
184
|
+
*/
|
|
185
|
+
// Listed longest pattern first. `length` is taken from the pattern text itself,
// giving 3, 2, 1 and 1 for the four patterns below.
const JAPANESE_TIME_UNITS: readonly TimeUnitMapping[] = (
  [
    ['ミリ秒', 'ms'],
    ['時間', 'h'],
    ['秒', 's'],
    ['分', 'm'],
  ] as const
).map(([pattern, suffix]) => ({ pattern, suffix, length: pattern.length }));
|
|
191
|
+
|
|
192
|
+
// =============================================================================
|
|
193
|
+
// Japanese Tokenizer Implementation
|
|
194
|
+
// =============================================================================
|
|
195
|
+
|
|
196
|
+
export class JapaneseTokenizer extends BaseTokenizer {
|
|
197
|
+
readonly language = 'ja';
|
|
198
|
+
readonly direction = 'ltr' as const;
|
|
199
|
+
|
|
200
|
+
constructor() {
|
|
201
|
+
super();
|
|
202
|
+
// Initialize keywords from profile + extras (single source of truth)
|
|
203
|
+
this.initializeKeywordsFromProfile(japaneseProfile, JAPANESE_EXTRAS);
|
|
204
|
+
// Set morphological normalizer for verb conjugations
|
|
205
|
+
this.normalizer = new JapaneseMorphologicalNormalizer();
|
|
206
|
+
}
|
|
207
|
+
|
|
208
|
+
tokenize(input: string): TokenStream {
|
|
209
|
+
const tokens: LanguageToken[] = [];
|
|
210
|
+
let pos = 0;
|
|
211
|
+
|
|
212
|
+
while (pos < input.length) {
|
|
213
|
+
// Skip whitespace (Japanese can have spaces for readability)
|
|
214
|
+
if (isWhitespace(input[pos])) {
|
|
215
|
+
pos++;
|
|
216
|
+
continue;
|
|
217
|
+
}
|
|
218
|
+
|
|
219
|
+
// Try CSS selector first (ASCII-based, highest priority)
|
|
220
|
+
if (isSelectorStart(input[pos])) {
|
|
221
|
+
// Check for event modifier first (.once, .debounce(), etc.)
|
|
222
|
+
const modifierToken = this.tryEventModifier(input, pos);
|
|
223
|
+
if (modifierToken) {
|
|
224
|
+
tokens.push(modifierToken);
|
|
225
|
+
pos = modifierToken.position.end;
|
|
226
|
+
continue;
|
|
227
|
+
}
|
|
228
|
+
|
|
229
|
+
const selectorToken = this.trySelector(input, pos);
|
|
230
|
+
if (selectorToken) {
|
|
231
|
+
tokens.push(selectorToken);
|
|
232
|
+
pos = selectorToken.position.end;
|
|
233
|
+
continue;
|
|
234
|
+
}
|
|
235
|
+
}
|
|
236
|
+
|
|
237
|
+
// Try string literal (both ASCII and Japanese quotes)
|
|
238
|
+
if (isQuote(input[pos])) {
|
|
239
|
+
const stringToken = this.tryString(input, pos);
|
|
240
|
+
if (stringToken) {
|
|
241
|
+
tokens.push(stringToken);
|
|
242
|
+
pos = stringToken.position.end;
|
|
243
|
+
continue;
|
|
244
|
+
}
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
// Try URL (/path, ./path, http://, etc.)
|
|
248
|
+
if (isUrlStart(input, pos)) {
|
|
249
|
+
const urlToken = this.tryUrl(input, pos);
|
|
250
|
+
if (urlToken) {
|
|
251
|
+
tokens.push(urlToken);
|
|
252
|
+
pos = urlToken.position.end;
|
|
253
|
+
continue;
|
|
254
|
+
}
|
|
255
|
+
}
|
|
256
|
+
|
|
257
|
+
// Try number (including Japanese time units)
|
|
258
|
+
if (isDigit(input[pos])) {
|
|
259
|
+
const numberToken = this.extractJapaneseNumber(input, pos);
|
|
260
|
+
if (numberToken) {
|
|
261
|
+
tokens.push(numberToken);
|
|
262
|
+
pos = numberToken.position.end;
|
|
263
|
+
continue;
|
|
264
|
+
}
|
|
265
|
+
}
|
|
266
|
+
|
|
267
|
+
// Try variable reference (:varname)
|
|
268
|
+
const varToken = this.tryVariableRef(input, pos);
|
|
269
|
+
if (varToken) {
|
|
270
|
+
tokens.push(varToken);
|
|
271
|
+
pos = varToken.position.end;
|
|
272
|
+
continue;
|
|
273
|
+
}
|
|
274
|
+
|
|
275
|
+
// Try multi-character particle (before single-character)
|
|
276
|
+
const multiParticle = this.tryMultiCharParticle(input, pos, MULTI_CHAR_PARTICLES);
|
|
277
|
+
if (multiParticle) {
|
|
278
|
+
// Add role metadata to particle token
|
|
279
|
+
const metadata = PARTICLE_ROLES.get(multiParticle.value);
|
|
280
|
+
if (metadata) {
|
|
281
|
+
tokens.push({
|
|
282
|
+
...multiParticle,
|
|
283
|
+
metadata: {
|
|
284
|
+
particleRole: metadata.role,
|
|
285
|
+
particleConfidence: metadata.confidence,
|
|
286
|
+
},
|
|
287
|
+
});
|
|
288
|
+
} else {
|
|
289
|
+
tokens.push(multiParticle);
|
|
290
|
+
}
|
|
291
|
+
pos = multiParticle.position.end;
|
|
292
|
+
continue;
|
|
293
|
+
}
|
|
294
|
+
|
|
295
|
+
// Check if this starts a multi-character keyword (before single-char particle check)
|
|
296
|
+
// This prevents splitting keywords like もし (if) into も (particle) + し (identifier)
|
|
297
|
+
if (SINGLE_CHAR_PARTICLES.has(input[pos])) {
|
|
298
|
+
const keywordToken = this.tryProfileKeyword(input, pos);
|
|
299
|
+
// Only accept keywords longer than 1 char (e.g., もし but not を/で/に which are role markers)
|
|
300
|
+
if (keywordToken && keywordToken.value.length > 1) {
|
|
301
|
+
tokens.push(keywordToken);
|
|
302
|
+
pos = keywordToken.position.end;
|
|
303
|
+
continue;
|
|
304
|
+
}
|
|
305
|
+
// Not a multi-char keyword, treat as particle
|
|
306
|
+
const particle = input[pos];
|
|
307
|
+
const metadata = PARTICLE_ROLES.get(particle);
|
|
308
|
+
if (metadata) {
|
|
309
|
+
tokens.push({
|
|
310
|
+
...createToken(particle, 'particle', createPosition(pos, pos + 1)),
|
|
311
|
+
metadata: {
|
|
312
|
+
particleRole: metadata.role,
|
|
313
|
+
particleConfidence: metadata.confidence,
|
|
314
|
+
},
|
|
315
|
+
});
|
|
316
|
+
} else {
|
|
317
|
+
tokens.push(createToken(particle, 'particle', createPosition(pos, pos + 1)));
|
|
318
|
+
}
|
|
319
|
+
pos++;
|
|
320
|
+
continue;
|
|
321
|
+
}
|
|
322
|
+
|
|
323
|
+
// Try Japanese word (kanji/kana sequence)
|
|
324
|
+
if (isJapanese(input[pos])) {
|
|
325
|
+
const wordToken = this.extractJapaneseWord(input, pos);
|
|
326
|
+
if (wordToken) {
|
|
327
|
+
tokens.push(wordToken);
|
|
328
|
+
pos = wordToken.position.end;
|
|
329
|
+
continue;
|
|
330
|
+
}
|
|
331
|
+
}
|
|
332
|
+
|
|
333
|
+
// Try ASCII word (for mixed content)
|
|
334
|
+
if (isAsciiIdentifierChar(input[pos])) {
|
|
335
|
+
const asciiToken = this.extractAsciiWord(input, pos);
|
|
336
|
+
if (asciiToken) {
|
|
337
|
+
tokens.push(asciiToken);
|
|
338
|
+
pos = asciiToken.position.end;
|
|
339
|
+
continue;
|
|
340
|
+
}
|
|
341
|
+
}
|
|
342
|
+
|
|
343
|
+
// Skip unknown character
|
|
344
|
+
pos++;
|
|
345
|
+
}
|
|
346
|
+
|
|
347
|
+
return new TokenStreamImpl(tokens, 'ja');
|
|
348
|
+
}
|
|
349
|
+
|
|
350
|
+
/**
 * Classify a raw token string into a TokenKind.
 *
 * Checks run in a fixed priority order:
 *   1. membership in PARTICLES        -> 'particle'
 *   2. this.isKeyword(token)          -> 'keyword'
 *   3. leading '#', '.' or '['        -> 'selector'
 *   4. leading '"', "'" or '「'        -> 'literal'
 *   5. leading ASCII digit            -> 'literal'
 *   6. anything else                  -> 'identifier'
 *
 * @param token - The token text to classify.
 * @returns The kind assigned by the first matching rule above.
 */
classifyToken(token: string): TokenKind {
  if (PARTICLES.has(token)) {
    return 'particle';
  }
  if (this.isKeyword(token)) {
    return 'keyword';
  }

  // Every remaining rule only looks at the first UTF-16 code unit.
  // charAt(0) yields '' for an empty token, which matches none of the cases.
  const lead = token.charAt(0);
  switch (lead) {
    case '#':
    case '.':
    case '[':
      return 'selector';
    case '"':
    case "'":
    case '「':
      return 'literal';
    default:
      break;
  }

  // ASCII digits only, same set that \d matches.
  const startsWithDigit = lead >= '0' && lead <= '9';
  return startsWithDigit ? 'literal' : 'identifier';
}
|
|
360
|
+
|
|
361
|
+
/**
 * Extract a Japanese word starting at `startPos`.
 *
 * Scanning consumes consecutive characters accepted by `isJapanese` and
 * stops at the first character that is not, or at end of input. Once at
 * least one character has been consumed, it also stops in front of a
 * single-character particle or any entry of MULTI_CHAR_PARTICLES, so a
 * particle at the very first position is still taken as part of the word.
 *
 * The collected text is then resolved in this order:
 *   1. exact hit in `lookupKeyword`        -> 'keyword' token carrying the
 *      entry's normalized form
 *   2. `tryMorphKeywordMatch` (handles conjugated forms, per the original
 *      notes) returns a token              -> that token
 *   3. otherwise                           -> 'identifier' token
 *
 * @param input - Full text being tokenized.
 * @param startPos - Index in `input` where the word begins.
 * @returns The token for the word, or null if no character was consumed.
 */
private extractJapaneseWord(input: string, startPos: number): LanguageToken | null {
  let end = startPos;

  scan: while (end < input.length) {
    const ch = input[end];

    // Particle boundaries only apply after the word has at least one character.
    if (end > startPos) {
      if (SINGLE_CHAR_PARTICLES.has(ch)) {
        break;
      }
      for (const candidate of MULTI_CHAR_PARTICLES) {
        if (input.slice(end, end + candidate.length) === candidate) {
          break scan;
        }
      }
    }

    if (!isJapanese(ch)) {
      break;
    }
    end += 1;
  }

  // Nothing consumed: no word at this position.
  if (end === startPos) {
    return null;
  }

  const word = input.slice(startPos, end);

  // Exact keyword match first.
  const entry = this.lookupKeyword(word);
  if (entry) {
    return createToken(word, 'keyword', createPosition(startPos, end), entry.normalized);
  }

  // Then the morphological fallback; failing that, a plain identifier.
  const conjugated = this.tryMorphKeywordMatch(word, startPos, end);
  return conjugated
    ? conjugated
    : createToken(word, 'identifier', createPosition(startPos, end));
}
|
|
415
|
+
|
|
416
|
+
/**
 * Extract a run of ASCII identifier characters (used for mixed
 * Japanese/English content).
 *
 * @param input - Full text being tokenized.
 * @param startPos - Index in `input` where the run is expected to begin.
 * @returns An 'identifier' token covering the run, or null when the
 *          character at `startPos` is not accepted by `isAsciiIdentifierChar`.
 */
private extractAsciiWord(input: string, startPos: number): LanguageToken | null {
  // Advance past every consecutive identifier character.
  let end = startPos;
  while (end < input.length && isAsciiIdentifierChar(input[end])) {
    end += 1;
  }

  if (end === startPos) {
    return null;
  }

  const text = input.slice(startPos, end);
  return createToken(text, 'identifier', createPosition(startPos, end));
}
|
|
431
|
+
|
|
432
|
+
/**
 * Extract a number at `startPos`, together with a Japanese time-unit
 * suffix if one follows.
 *
 * Delegates to `tryNumberWithTimeUnits` using JAPANESE_TIME_UNITS, with
 * sign handling and whitespace skipping both switched off (Japanese time
 * units attach directly to the number without whitespace).
 *
 * @param input - Full text being tokenized.
 * @param startPos - Index in `input` where the number begins.
 * @returns Whatever `tryNumberWithTimeUnits` produces: a token, or null.
 */
private extractJapaneseNumber(input: string, startPos: number): LanguageToken | null {
  const numberOptions = {
    allowSign: false,
    skipWhitespace: false,
  };

  return this.tryNumberWithTimeUnits(input, startPos, JAPANESE_TIME_UNITS, numberOptions);
}
|
|
442
|
+
}
|
|
443
|
+
|
|
444
|
+
/**
 * Singleton instance: a single JapaneseTokenizer constructed when this
 * module is evaluated and exported for shared use.
 */
|
|
447
|
+
export const japaneseTokenizer = new JapaneseTokenizer();
|