@lokascript/semantic 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +686 -0
- package/dist/browser-ar.ar.global.js +2 -0
- package/dist/browser-core.core.global.js +2 -0
- package/dist/browser-de.de.global.js +2 -0
- package/dist/browser-east-asian.east-asian.global.js +2 -0
- package/dist/browser-en-tr.en-tr.global.js +2 -0
- package/dist/browser-en.en.global.js +2 -0
- package/dist/browser-es-en.es-en.global.js +2 -0
- package/dist/browser-es.es.global.js +2 -0
- package/dist/browser-fr.fr.global.js +2 -0
- package/dist/browser-id.id.global.js +2 -0
- package/dist/browser-ja.ja.global.js +2 -0
- package/dist/browser-ko.ko.global.js +2 -0
- package/dist/browser-lazy.lazy.global.js +2 -0
- package/dist/browser-priority.priority.global.js +2 -0
- package/dist/browser-pt.pt.global.js +2 -0
- package/dist/browser-qu.qu.global.js +2 -0
- package/dist/browser-sw.sw.global.js +2 -0
- package/dist/browser-tr.tr.global.js +2 -0
- package/dist/browser-western.western.global.js +2 -0
- package/dist/browser-zh.zh.global.js +2 -0
- package/dist/browser.global.js +3 -0
- package/dist/browser.global.js.map +1 -0
- package/dist/index.cjs +35051 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +3426 -0
- package/dist/index.d.ts +3426 -0
- package/dist/index.js +34890 -0
- package/dist/index.js.map +1 -0
- package/dist/languages/ar.d.ts +78 -0
- package/dist/languages/ar.js +1622 -0
- package/dist/languages/ar.js.map +1 -0
- package/dist/languages/de.d.ts +38 -0
- package/dist/languages/de.js +1168 -0
- package/dist/languages/de.js.map +1 -0
- package/dist/languages/en.d.ts +44 -0
- package/dist/languages/en.js +3491 -0
- package/dist/languages/en.js.map +1 -0
- package/dist/languages/es.d.ts +52 -0
- package/dist/languages/es.js +1493 -0
- package/dist/languages/es.js.map +1 -0
- package/dist/languages/fr.d.ts +37 -0
- package/dist/languages/fr.js +1159 -0
- package/dist/languages/fr.js.map +1 -0
- package/dist/languages/id.d.ts +35 -0
- package/dist/languages/id.js +1152 -0
- package/dist/languages/id.js.map +1 -0
- package/dist/languages/ja.d.ts +53 -0
- package/dist/languages/ja.js +1430 -0
- package/dist/languages/ja.js.map +1 -0
- package/dist/languages/ko.d.ts +51 -0
- package/dist/languages/ko.js +1729 -0
- package/dist/languages/ko.js.map +1 -0
- package/dist/languages/pt.d.ts +37 -0
- package/dist/languages/pt.js +1127 -0
- package/dist/languages/pt.js.map +1 -0
- package/dist/languages/qu.d.ts +36 -0
- package/dist/languages/qu.js +1143 -0
- package/dist/languages/qu.js.map +1 -0
- package/dist/languages/sw.d.ts +35 -0
- package/dist/languages/sw.js +1147 -0
- package/dist/languages/sw.js.map +1 -0
- package/dist/languages/tr.d.ts +45 -0
- package/dist/languages/tr.js +1529 -0
- package/dist/languages/tr.js.map +1 -0
- package/dist/languages/zh.d.ts +58 -0
- package/dist/languages/zh.js +1257 -0
- package/dist/languages/zh.js.map +1 -0
- package/dist/types-C4dcj53L.d.ts +600 -0
- package/package.json +202 -0
- package/src/__test-utils__/index.ts +7 -0
- package/src/__test-utils__/test-helpers.ts +8 -0
- package/src/__types__/test-helpers.ts +122 -0
- package/src/analysis/index.ts +479 -0
- package/src/ast-builder/command-mappers.ts +1133 -0
- package/src/ast-builder/expression-parser/index.ts +41 -0
- package/src/ast-builder/expression-parser/parser.ts +563 -0
- package/src/ast-builder/expression-parser/tokenizer.ts +394 -0
- package/src/ast-builder/expression-parser/types.ts +208 -0
- package/src/ast-builder/index.ts +536 -0
- package/src/ast-builder/value-converters.ts +172 -0
- package/src/bridge.ts +275 -0
- package/src/browser-ar.ts +162 -0
- package/src/browser-core.ts +231 -0
- package/src/browser-de.ts +162 -0
- package/src/browser-east-asian.ts +173 -0
- package/src/browser-en-tr.ts +165 -0
- package/src/browser-en.ts +157 -0
- package/src/browser-es-en.ts +200 -0
- package/src/browser-es.ts +170 -0
- package/src/browser-fr.ts +162 -0
- package/src/browser-id.ts +162 -0
- package/src/browser-ja.ts +162 -0
- package/src/browser-ko.ts +162 -0
- package/src/browser-lazy.ts +189 -0
- package/src/browser-priority.ts +214 -0
- package/src/browser-pt.ts +162 -0
- package/src/browser-qu.ts +162 -0
- package/src/browser-sw.ts +162 -0
- package/src/browser-tr.ts +162 -0
- package/src/browser-western.ts +181 -0
- package/src/browser-zh.ts +162 -0
- package/src/browser.ts +268 -0
- package/src/cache/index.ts +14 -0
- package/src/cache/semantic-cache.ts +344 -0
- package/src/core-bridge.ts +372 -0
- package/src/explicit/converter.ts +258 -0
- package/src/explicit/index.ts +18 -0
- package/src/explicit/parser.ts +236 -0
- package/src/explicit/renderer.ts +424 -0
- package/src/generators/command-schemas.ts +1636 -0
- package/src/generators/event-handler-generator.ts +109 -0
- package/src/generators/index.ts +117 -0
- package/src/generators/language-profiles.ts +139 -0
- package/src/generators/pattern-generator.ts +537 -0
- package/src/generators/profiles/arabic.ts +131 -0
- package/src/generators/profiles/bengali.ts +132 -0
- package/src/generators/profiles/chinese.ts +124 -0
- package/src/generators/profiles/english.ts +113 -0
- package/src/generators/profiles/french.ts +125 -0
- package/src/generators/profiles/german.ts +126 -0
- package/src/generators/profiles/hindi.ts +146 -0
- package/src/generators/profiles/index.ts +46 -0
- package/src/generators/profiles/indonesian.ts +125 -0
- package/src/generators/profiles/italian.ts +139 -0
- package/src/generators/profiles/japanese.ts +149 -0
- package/src/generators/profiles/korean.ts +127 -0
- package/src/generators/profiles/marker-templates.ts +288 -0
- package/src/generators/profiles/ms.ts +130 -0
- package/src/generators/profiles/polish.ts +249 -0
- package/src/generators/profiles/portuguese.ts +115 -0
- package/src/generators/profiles/quechua.ts +113 -0
- package/src/generators/profiles/russian.ts +260 -0
- package/src/generators/profiles/spanish.ts +130 -0
- package/src/generators/profiles/swahili.ts +129 -0
- package/src/generators/profiles/thai.ts +132 -0
- package/src/generators/profiles/tl.ts +128 -0
- package/src/generators/profiles/turkish.ts +124 -0
- package/src/generators/profiles/types.ts +165 -0
- package/src/generators/profiles/ukrainian.ts +270 -0
- package/src/generators/profiles/vietnamese.ts +133 -0
- package/src/generators/schema-error-codes.ts +160 -0
- package/src/generators/schema-validator.ts +391 -0
- package/src/index.ts +429 -0
- package/src/language-building-schema.ts +3170 -0
- package/src/language-loader.ts +394 -0
- package/src/languages/_all.ts +65 -0
- package/src/languages/ar.ts +15 -0
- package/src/languages/bn.ts +16 -0
- package/src/languages/de.ts +15 -0
- package/src/languages/en.ts +29 -0
- package/src/languages/es.ts +15 -0
- package/src/languages/fr.ts +15 -0
- package/src/languages/hi.ts +26 -0
- package/src/languages/id.ts +15 -0
- package/src/languages/index.ts +18 -0
- package/src/languages/it.ts +15 -0
- package/src/languages/ja.ts +15 -0
- package/src/languages/ko.ts +15 -0
- package/src/languages/ms.ts +16 -0
- package/src/languages/pl.ts +18 -0
- package/src/languages/pt.ts +15 -0
- package/src/languages/qu.ts +15 -0
- package/src/languages/ru.ts +26 -0
- package/src/languages/sw.ts +15 -0
- package/src/languages/th.ts +16 -0
- package/src/languages/tl.ts +16 -0
- package/src/languages/tr.ts +15 -0
- package/src/languages/uk.ts +26 -0
- package/src/languages/vi.ts +16 -0
- package/src/languages/zh.ts +15 -0
- package/src/parser/index.ts +15 -0
- package/src/parser/pattern-matcher.ts +1181 -0
- package/src/parser/semantic-parser.ts +573 -0
- package/src/parser/utils/index.ts +35 -0
- package/src/parser/utils/marker-resolution.ts +111 -0
- package/src/parser/utils/possessive-keywords.ts +43 -0
- package/src/parser/utils/role-positioning.ts +70 -0
- package/src/parser/utils/type-validation.ts +134 -0
- package/src/patterns/add/ar.ts +71 -0
- package/src/patterns/add/bn.ts +70 -0
- package/src/patterns/add/hi.ts +69 -0
- package/src/patterns/add/index.ts +87 -0
- package/src/patterns/add/it.ts +61 -0
- package/src/patterns/add/ja.ts +93 -0
- package/src/patterns/add/ko.ts +74 -0
- package/src/patterns/add/ms.ts +30 -0
- package/src/patterns/add/pl.ts +62 -0
- package/src/patterns/add/ru.ts +62 -0
- package/src/patterns/add/th.ts +49 -0
- package/src/patterns/add/tl.ts +30 -0
- package/src/patterns/add/tr.ts +71 -0
- package/src/patterns/add/uk.ts +62 -0
- package/src/patterns/add/vi.ts +61 -0
- package/src/patterns/add/zh.ts +71 -0
- package/src/patterns/builders.ts +207 -0
- package/src/patterns/decrement/bn.ts +70 -0
- package/src/patterns/decrement/de.ts +42 -0
- package/src/patterns/decrement/hi.ts +68 -0
- package/src/patterns/decrement/index.ts +79 -0
- package/src/patterns/decrement/it.ts +69 -0
- package/src/patterns/decrement/ms.ts +30 -0
- package/src/patterns/decrement/pl.ts +58 -0
- package/src/patterns/decrement/ru.ts +58 -0
- package/src/patterns/decrement/th.ts +49 -0
- package/src/patterns/decrement/tl.ts +30 -0
- package/src/patterns/decrement/tr.ts +48 -0
- package/src/patterns/decrement/uk.ts +58 -0
- package/src/patterns/decrement/vi.ts +61 -0
- package/src/patterns/decrement/zh.ts +32 -0
- package/src/patterns/en.ts +302 -0
- package/src/patterns/event-handler/ar.ts +151 -0
- package/src/patterns/event-handler/bn.ts +72 -0
- package/src/patterns/event-handler/de.ts +117 -0
- package/src/patterns/event-handler/en.ts +117 -0
- package/src/patterns/event-handler/es.ts +136 -0
- package/src/patterns/event-handler/fr.ts +117 -0
- package/src/patterns/event-handler/hi.ts +64 -0
- package/src/patterns/event-handler/id.ts +117 -0
- package/src/patterns/event-handler/index.ts +119 -0
- package/src/patterns/event-handler/it.ts +54 -0
- package/src/patterns/event-handler/ja.ts +118 -0
- package/src/patterns/event-handler/ko.ts +133 -0
- package/src/patterns/event-handler/ms.ts +30 -0
- package/src/patterns/event-handler/pl.ts +62 -0
- package/src/patterns/event-handler/pt.ts +117 -0
- package/src/patterns/event-handler/qu.ts +66 -0
- package/src/patterns/event-handler/ru.ts +62 -0
- package/src/patterns/event-handler/shared.ts +270 -0
- package/src/patterns/event-handler/sw.ts +117 -0
- package/src/patterns/event-handler/th.ts +53 -0
- package/src/patterns/event-handler/tl.ts +30 -0
- package/src/patterns/event-handler/tr.ts +170 -0
- package/src/patterns/event-handler/uk.ts +62 -0
- package/src/patterns/event-handler/vi.ts +61 -0
- package/src/patterns/event-handler/zh.ts +150 -0
- package/src/patterns/get/ar.ts +49 -0
- package/src/patterns/get/bn.ts +47 -0
- package/src/patterns/get/de.ts +32 -0
- package/src/patterns/get/hi.ts +52 -0
- package/src/patterns/get/index.ts +83 -0
- package/src/patterns/get/it.ts +56 -0
- package/src/patterns/get/ja.ts +53 -0
- package/src/patterns/get/ko.ts +53 -0
- package/src/patterns/get/ms.ts +30 -0
- package/src/patterns/get/pl.ts +57 -0
- package/src/patterns/get/ru.ts +57 -0
- package/src/patterns/get/th.ts +29 -0
- package/src/patterns/get/tl.ts +30 -0
- package/src/patterns/get/uk.ts +57 -0
- package/src/patterns/get/vi.ts +48 -0
- package/src/patterns/grammar-transformed/index.ts +39 -0
- package/src/patterns/grammar-transformed/ja.ts +1713 -0
- package/src/patterns/grammar-transformed/ko.ts +1311 -0
- package/src/patterns/grammar-transformed/tr.ts +1067 -0
- package/src/patterns/hide/ar.ts +67 -0
- package/src/patterns/hide/bn.ts +47 -0
- package/src/patterns/hide/de.ts +36 -0
- package/src/patterns/hide/hi.ts +61 -0
- package/src/patterns/hide/index.ts +91 -0
- package/src/patterns/hide/it.ts +56 -0
- package/src/patterns/hide/ja.ts +69 -0
- package/src/patterns/hide/ko.ts +69 -0
- package/src/patterns/hide/ms.ts +30 -0
- package/src/patterns/hide/pl.ts +57 -0
- package/src/patterns/hide/ru.ts +57 -0
- package/src/patterns/hide/th.ts +29 -0
- package/src/patterns/hide/tl.ts +30 -0
- package/src/patterns/hide/tr.ts +65 -0
- package/src/patterns/hide/uk.ts +57 -0
- package/src/patterns/hide/vi.ts +56 -0
- package/src/patterns/hide/zh.ts +68 -0
- package/src/patterns/increment/bn.ts +70 -0
- package/src/patterns/increment/de.ts +36 -0
- package/src/patterns/increment/hi.ts +68 -0
- package/src/patterns/increment/index.ts +79 -0
- package/src/patterns/increment/it.ts +69 -0
- package/src/patterns/increment/ms.ts +30 -0
- package/src/patterns/increment/pl.ts +58 -0
- package/src/patterns/increment/ru.ts +58 -0
- package/src/patterns/increment/th.ts +49 -0
- package/src/patterns/increment/tl.ts +30 -0
- package/src/patterns/increment/tr.ts +52 -0
- package/src/patterns/increment/uk.ts +58 -0
- package/src/patterns/increment/vi.ts +61 -0
- package/src/patterns/increment/zh.ts +32 -0
- package/src/patterns/index.ts +84 -0
- package/src/patterns/languages/en/control-flow.ts +93 -0
- package/src/patterns/languages/en/fetch.ts +62 -0
- package/src/patterns/languages/en/index.ts +42 -0
- package/src/patterns/languages/en/repeat.ts +67 -0
- package/src/patterns/languages/en/set.ts +48 -0
- package/src/patterns/languages/en/swap.ts +38 -0
- package/src/patterns/languages/en/temporal.ts +57 -0
- package/src/patterns/put/ar.ts +74 -0
- package/src/patterns/put/bn.ts +53 -0
- package/src/patterns/put/en.ts +74 -0
- package/src/patterns/put/es.ts +74 -0
- package/src/patterns/put/hi.ts +69 -0
- package/src/patterns/put/id.ts +96 -0
- package/src/patterns/put/index.ts +99 -0
- package/src/patterns/put/it.ts +56 -0
- package/src/patterns/put/ja.ts +75 -0
- package/src/patterns/put/ko.ts +67 -0
- package/src/patterns/put/ms.ts +30 -0
- package/src/patterns/put/pl.ts +81 -0
- package/src/patterns/put/ru.ts +85 -0
- package/src/patterns/put/th.ts +32 -0
- package/src/patterns/put/tl.ts +30 -0
- package/src/patterns/put/tr.ts +67 -0
- package/src/patterns/put/uk.ts +85 -0
- package/src/patterns/put/vi.ts +72 -0
- package/src/patterns/put/zh.ts +62 -0
- package/src/patterns/registry.ts +163 -0
- package/src/patterns/remove/ar.ts +71 -0
- package/src/patterns/remove/bn.ts +68 -0
- package/src/patterns/remove/hi.ts +69 -0
- package/src/patterns/remove/index.ts +87 -0
- package/src/patterns/remove/it.ts +69 -0
- package/src/patterns/remove/ja.ts +74 -0
- package/src/patterns/remove/ko.ts +78 -0
- package/src/patterns/remove/ms.ts +30 -0
- package/src/patterns/remove/pl.ts +62 -0
- package/src/patterns/remove/ru.ts +62 -0
- package/src/patterns/remove/th.ts +49 -0
- package/src/patterns/remove/tl.ts +30 -0
- package/src/patterns/remove/tr.ts +78 -0
- package/src/patterns/remove/uk.ts +62 -0
- package/src/patterns/remove/vi.ts +61 -0
- package/src/patterns/remove/zh.ts +72 -0
- package/src/patterns/set/ar.ts +84 -0
- package/src/patterns/set/bn.ts +53 -0
- package/src/patterns/set/de.ts +84 -0
- package/src/patterns/set/es.ts +92 -0
- package/src/patterns/set/fr.ts +88 -0
- package/src/patterns/set/hi.ts +56 -0
- package/src/patterns/set/id.ts +84 -0
- package/src/patterns/set/index.ts +107 -0
- package/src/patterns/set/it.ts +56 -0
- package/src/patterns/set/ja.ts +86 -0
- package/src/patterns/set/ko.ts +85 -0
- package/src/patterns/set/ms.ts +30 -0
- package/src/patterns/set/pl.ts +57 -0
- package/src/patterns/set/pt.ts +84 -0
- package/src/patterns/set/ru.ts +57 -0
- package/src/patterns/set/th.ts +31 -0
- package/src/patterns/set/tl.ts +30 -0
- package/src/patterns/set/tr.ts +107 -0
- package/src/patterns/set/uk.ts +57 -0
- package/src/patterns/set/vi.ts +53 -0
- package/src/patterns/set/zh.ts +84 -0
- package/src/patterns/show/ar.ts +67 -0
- package/src/patterns/show/bn.ts +47 -0
- package/src/patterns/show/de.ts +32 -0
- package/src/patterns/show/fr.ts +32 -0
- package/src/patterns/show/hi.ts +61 -0
- package/src/patterns/show/index.ts +95 -0
- package/src/patterns/show/it.ts +56 -0
- package/src/patterns/show/ja.ts +69 -0
- package/src/patterns/show/ko.ts +73 -0
- package/src/patterns/show/ms.ts +30 -0
- package/src/patterns/show/pl.ts +57 -0
- package/src/patterns/show/ru.ts +57 -0
- package/src/patterns/show/th.ts +29 -0
- package/src/patterns/show/tl.ts +30 -0
- package/src/patterns/show/tr.ts +65 -0
- package/src/patterns/show/uk.ts +57 -0
- package/src/patterns/show/vi.ts +56 -0
- package/src/patterns/show/zh.ts +68 -0
- package/src/patterns/take/ar.ts +51 -0
- package/src/patterns/take/index.ts +31 -0
- package/src/patterns/toggle/ar.ts +61 -0
- package/src/patterns/toggle/bn.ts +70 -0
- package/src/patterns/toggle/en.ts +61 -0
- package/src/patterns/toggle/es.ts +61 -0
- package/src/patterns/toggle/hi.ts +80 -0
- package/src/patterns/toggle/index.ts +95 -0
- package/src/patterns/toggle/it.ts +69 -0
- package/src/patterns/toggle/ja.ts +156 -0
- package/src/patterns/toggle/ko.ts +113 -0
- package/src/patterns/toggle/ms.ts +30 -0
- package/src/patterns/toggle/pl.ts +62 -0
- package/src/patterns/toggle/ru.ts +62 -0
- package/src/patterns/toggle/th.ts +50 -0
- package/src/patterns/toggle/tl.ts +30 -0
- package/src/patterns/toggle/tr.ts +88 -0
- package/src/patterns/toggle/uk.ts +62 -0
- package/src/patterns/toggle/vi.ts +61 -0
- package/src/patterns/toggle/zh.ts +99 -0
- package/src/public-api.ts +286 -0
- package/src/registry.ts +441 -0
- package/src/tokenizers/arabic.ts +723 -0
- package/src/tokenizers/base.ts +1300 -0
- package/src/tokenizers/bengali.ts +289 -0
- package/src/tokenizers/chinese.ts +481 -0
- package/src/tokenizers/english.ts +416 -0
- package/src/tokenizers/french.ts +326 -0
- package/src/tokenizers/german.ts +324 -0
- package/src/tokenizers/hindi.ts +319 -0
- package/src/tokenizers/index.ts +127 -0
- package/src/tokenizers/indonesian.ts +306 -0
- package/src/tokenizers/italian.ts +458 -0
- package/src/tokenizers/japanese.ts +447 -0
- package/src/tokenizers/korean.ts +642 -0
- package/src/tokenizers/morphology/arabic-normalizer.ts +242 -0
- package/src/tokenizers/morphology/french-normalizer.ts +268 -0
- package/src/tokenizers/morphology/german-normalizer.ts +256 -0
- package/src/tokenizers/morphology/index.ts +46 -0
- package/src/tokenizers/morphology/italian-normalizer.ts +329 -0
- package/src/tokenizers/morphology/japanese-normalizer.ts +288 -0
- package/src/tokenizers/morphology/korean-normalizer.ts +428 -0
- package/src/tokenizers/morphology/polish-normalizer.ts +264 -0
- package/src/tokenizers/morphology/portuguese-normalizer.ts +310 -0
- package/src/tokenizers/morphology/spanish-normalizer.ts +327 -0
- package/src/tokenizers/morphology/turkish-normalizer.ts +412 -0
- package/src/tokenizers/morphology/types.ts +211 -0
- package/src/tokenizers/ms.ts +198 -0
- package/src/tokenizers/polish.ts +354 -0
- package/src/tokenizers/portuguese.ts +304 -0
- package/src/tokenizers/quechua.ts +339 -0
- package/src/tokenizers/russian.ts +375 -0
- package/src/tokenizers/spanish.ts +403 -0
- package/src/tokenizers/swahili.ts +303 -0
- package/src/tokenizers/thai.ts +236 -0
- package/src/tokenizers/tl.ts +198 -0
- package/src/tokenizers/turkish.ts +411 -0
- package/src/tokenizers/ukrainian.ts +369 -0
- package/src/tokenizers/vietnamese.ts +410 -0
- package/src/types/grammar-types.ts +617 -0
- package/src/types/unified-profile.ts +267 -0
- package/src/types.ts +709 -0
- package/src/utils/confidence-calculator.ts +147 -0
- package/src/validators/command-validator.ts +380 -0
- package/src/validators/index.ts +15 -0
|
@@ -0,0 +1,481 @@
|
|
|
1
|
+
/**
 * Chinese Tokenizer
 *
 * Tokenizes Chinese hyperscript input.
 * Chinese is challenging because:
 * - No spaces between words (like Japanese)
 * - Uses CJK characters (shared with Japanese Kanji)
 * - SVO word order (like English)
 * - Uses prepositions (把, 在, 从, etc.) for grammatical roles
 * - No conjugation (unlike Japanese/Korean)
 * - CSS selectors are embedded ASCII
 */
|
|
13
|
+
|
|
14
|
+
import type { LanguageToken, TokenKind, TokenStream } from '../types';
|
|
15
|
+
import {
|
|
16
|
+
BaseTokenizer,
|
|
17
|
+
TokenStreamImpl,
|
|
18
|
+
createToken,
|
|
19
|
+
createPosition,
|
|
20
|
+
createUnicodeRangeClassifier,
|
|
21
|
+
isWhitespace,
|
|
22
|
+
isSelectorStart,
|
|
23
|
+
isQuote,
|
|
24
|
+
isDigit,
|
|
25
|
+
isAsciiIdentifierChar,
|
|
26
|
+
isUrlStart,
|
|
27
|
+
type KeywordEntry,
|
|
28
|
+
type TimeUnitMapping,
|
|
29
|
+
} from './base';
|
|
30
|
+
import { chineseProfile } from '../generators/profiles/chinese';
|
|
31
|
+
|
|
32
|
+
// =============================================================================
|
|
33
|
+
// Chinese Character Classification
|
|
34
|
+
// =============================================================================
|
|
35
|
+
|
|
36
|
+
/**
 * Classifier for CJK ideographs, built from the code-point ranges listed below.
 *
 * NOTE(review): the Extension B and Compatibility Supplement ranges lie above
 * U+FFFF. A caller that passes a single UTF-16 code unit (such as `input[pos]`)
 * hands over a lone surrogate for those characters, so these two ranges
 * presumably never match on that path — confirm against
 * createUnicodeRangeClassifier and the call sites.
 */
const isChinese = createUnicodeRangeClassifier([
  [0x4e00, 0x9fff], // CJK Unified Ideographs
  [0x3400, 0x4dbf], // CJK Unified Ideographs Extension A
  [0x20000, 0x2a6df], // CJK Unified Ideographs Extension B
  [0xf900, 0xfaff], // CJK Compatibility Ideographs
  [0x2f800, 0x2fa1f], // CJK Compatibility Ideographs Supplement
]);
|
|
44
|
+
|
|
45
|
+
// =============================================================================
|
|
46
|
+
// Chinese Particles/Prepositions
|
|
47
|
+
// =============================================================================
|
|
48
|
+
|
|
49
|
+
/**
 * Single-character Chinese grammatical particles and prepositions.
 *
 * A token that is exactly one of these strings is classified as a particle.
 * The set is only ever queried, so it is exposed as read-only.
 */
const PARTICLES: ReadonlySet<string> = new Set([
  // Prepositions / coverbs
  '把', // ba - marks direct object (BA construction)
  '在', // zai - at, in, on (location)
  '从', // cong - from
  '到', // dao - to, until
  '向', // xiang - towards
  '给', // gei - to, for (recipient)
  '对', // dui - to, towards
  '用', // yong - with, using
  '被', // bei - by (passive)
  '让', // rang - let, allow

  // Structural markers
  '的', // de - possessive/attributive
  '地', // de - adverbial marker
  '得', // de - complement marker

  // Aspect markers
  '了', // le - completion marker
  '着', // zhe - progressive marker
  '过', // guo - experiential marker

  // Sentence-final particles
  '吗', // ma - question particle
  '呢', // ne - question/emphasis particle
  '吧', // ba - suggestion particle
]);
|
|
74
|
+
|
|
75
|
+
/**
 * Multi-character connective phrases that the tokenizer tries to match as a
 * unit before it looks at individual characters.
 *
 * NOTE(review): '或者' and '那么' are also listed in CHINESE_EXTRAS (as 'or' and
 * 'then'). Which path wins depends on the scanner order in tokenize() —
 * confirm that is intended.
 */
const MULTI_CHAR_PARTICLES = [
  '然后',
  '接着',
  '并且',
  '或者',
  '如果',
  '那么',
  '否则',
];
|
|
79
|
+
|
|
80
|
+
// =============================================================================
|
|
81
|
+
// Chinese Extras (keywords not in profile)
|
|
82
|
+
// =============================================================================
|
|
83
|
+
|
|
84
|
+
/**
 * Extra keywords registered in addition to those from the Chinese profile.
 *
 * Each entry maps a native spelling to the normalized keyword it stands for:
 * - Literals (true, false, null, undefined)
 * - Positional words
 * - Event names
 * - Possessive references
 * - Time units
 * - Logical operators
 * - Additional command synonyms
 * - Modifiers
 */
const CHINESE_EXTRAS: KeywordEntry[] = [
  // Values/Literals
  { native: '真', normalized: 'true' },
  { native: '假', normalized: 'false' },
  { native: '空', normalized: 'null' },
  { native: '未定义', normalized: 'undefined' },

  // Positional
  { native: '第一个', normalized: 'first' },
  { native: '首个', normalized: 'first' },
  { native: '最后一个', normalized: 'last' },
  { native: '末个', normalized: 'last' },
  { native: '下一个', normalized: 'next' },
  { native: '上一个', normalized: 'previous' },
  { native: '最近的', normalized: 'closest' },
  { native: '父级', normalized: 'parent' },

  // Events
  { native: '点击', normalized: 'click' },
  { native: '双击', normalized: 'dblclick' },
  { native: '输入', normalized: 'input' },
  { native: '变更', normalized: 'change' },
  { native: '改变', normalized: 'change' },
  { native: '提交', normalized: 'submit' },
  { native: '按键', normalized: 'keydown' },
  { native: '释放键', normalized: 'keyup' },
  { native: '鼠标移入', normalized: 'mouseover' },
  { native: '鼠标移出', normalized: 'mouseout' },
  { native: '获得焦点', normalized: 'focus' },
  { native: '失去焦点', normalized: 'blur' },
  { native: '加载', normalized: 'load' },
  { native: '滚动', normalized: 'scroll' },

  // Additional references
  { native: '我的', normalized: 'my' },
  { native: '它的', normalized: 'its' },

  // Time units
  { native: '秒', normalized: 's' },
  { native: '毫秒', normalized: 'ms' },
  { native: '分钟', normalized: 'm' },
  { native: '小时', normalized: 'h' },

  // Logical operators
  { native: '和', normalized: 'and' },
  { native: '或者', normalized: 'or' },
  { native: '或', normalized: 'or' },
  { native: '不', normalized: 'not' },
  { native: '非', normalized: 'not' },
  { native: '是', normalized: 'is' },

  // Additional synonyms not in profile
  { native: '若', normalized: 'if' },
  { native: '不然', normalized: 'else' },
  { native: '循环', normalized: 'repeat' },
  { native: '遍历', normalized: 'for' },
  { native: '每个', normalized: 'for' },
  { native: '为每', normalized: 'for' },
  { native: '中止', normalized: 'halt' },
  { native: '抛', normalized: 'throw' },
  { native: '呼叫', normalized: 'call' },
  { native: '回', normalized: 'return' },
  { native: '脚本', normalized: 'js' },
  { native: '通知', normalized: 'tell' },
  { native: '缺省', normalized: 'default' },
  { native: '初始', normalized: 'init' },
  { native: '动作', normalized: 'behavior' },
  { native: '激发', normalized: 'trigger' },
  { native: '对焦', normalized: 'focus' },
  { native: '模糊', normalized: 'blur' },
  { native: '跳转', normalized: 'go' },
  { native: '导航', normalized: 'go' },
  { native: '抓取', normalized: 'fetch' },
  { native: '获取数据', normalized: 'fetch' },
  { native: '安定', normalized: 'settle' },
  { native: '拿取', normalized: 'take' },
  { native: '取', normalized: 'take' },
  { native: '创建', normalized: 'make' },
  { native: '克隆', normalized: 'clone' },
  { native: '记录', normalized: 'log' },
  { native: '打印', normalized: 'log' },
  { native: '动画', normalized: 'transition' },

  // Modifiers
  { native: '到里面', normalized: 'into' },
  { native: '里', normalized: 'into' },
  { native: '前', normalized: 'before' },
  { native: '后', normalized: 'after' },
  { native: '那么', normalized: 'then' },
  { native: '完', normalized: 'end' },
];
|
|
183
|
+
|
|
184
|
+
// =============================================================================
|
|
185
|
+
// Chinese Time Units
|
|
186
|
+
// =============================================================================
|
|
187
|
+
|
|
188
|
+
/**
 * Chinese time-unit suffixes recognised when parsing numbers.
 *
 * Each entry pairs the native unit text with the normalized suffix it maps to
 * and the length of the native text. Entries are ordered longest-first so that
 * a two-character unit such as '分钟' is tried before its one-character
 * prefix '分'. Chinese time units attach to the number without whitespace.
 */
const CHINESE_TIME_UNITS: readonly TimeUnitMapping[] = [
  // Two-character units
  { pattern: '毫秒', suffix: 'ms', length: 2 },
  { pattern: '分钟', suffix: 'm', length: 2 },
  { pattern: '小时', suffix: 'h', length: 2 },
  // One-character units
  { pattern: '秒', suffix: 's', length: 1 },
  { pattern: '分', suffix: 'm', length: 1 },
];
|
|
200
|
+
|
|
201
|
+
// =============================================================================
|
|
202
|
+
// Chinese Tokenizer Implementation
|
|
203
|
+
// =============================================================================
|
|
204
|
+
|
|
205
|
+
export class ChineseTokenizer extends BaseTokenizer {
|
|
206
|
+
readonly language = 'zh';
|
|
207
|
+
readonly direction = 'ltr' as const;
|
|
208
|
+
|
|
209
|
+
/**
 * Build the tokenizer and register its keyword table from the Chinese
 * profile plus the extra entries in CHINESE_EXTRAS.
 */
constructor() {
  super();
  this.initializeKeywordsFromProfile(chineseProfile, CHINESE_EXTRAS);
}
|
|
213
|
+
|
|
214
|
+
/**
 * Split Chinese hyperscript input into tokens.
 *
 * At each position the scanners are tried in a fixed order and the first one
 * that yields a token wins: event modifier, CSS selector, string literal, URL,
 * number, variable reference, multi-character particle, CJK word, ASCII word.
 * Whitespace is skipped; a character that no scanner accepts is dropped.
 *
 * @param input - Raw source text to tokenize.
 * @returns A TokenStreamImpl built from the collected tokens and the 'zh' code.
 */
tokenize(input: string): TokenStream {
  const tokens: LanguageToken[] = [];
  let pos = 0;

  // Record a scanner result and jump past it; reports whether a token was produced.
  const accept = (token: LanguageToken | null | undefined): boolean => {
    if (!token) return false;
    tokens.push(token);
    pos = token.position.end;
    return true;
  };

  while (pos < input.length) {
    const ch = input[pos];

    // Skip whitespace (Chinese can have spaces for readability)
    if (isWhitespace(ch)) {
      pos++;
      continue;
    }

    // CSS selector first (ASCII-based, highest priority). An event modifier
    // (.once, .debounce(), etc.) shares the same start character, so it is
    // tried before the plain selector.
    if (isSelectorStart(ch)) {
      if (accept(this.tryEventModifier(input, pos))) continue;
      if (accept(this.trySelector(input, pos))) continue;
    }

    // String literal, opened by an ASCII quote or a Chinese quote
    // (\u201C " \u201D " \u2018 ' \u2019 ')
    if (
      isQuote(ch) ||
      ch === '\u201C' ||
      ch === '\u201D' ||
      ch === '\u2018' ||
      ch === '\u2019'
    ) {
      if (accept(this.tryChineseString(input, pos))) continue;
    }

    // URL (/path, ./path, http://, etc.)
    if (isUrlStart(input, pos)) {
      if (accept(this.tryUrl(input, pos))) continue;
    }

    // Number (including Chinese time units)
    if (isDigit(ch)) {
      if (accept(this.extractChineseNumber(input, pos))) continue;
    }

    // Variable reference (:varname)
    if (accept(this.tryVariableRef(input, pos))) continue;

    // Multi-character particle (before single-character handling)
    if (accept(this.tryMultiCharParticle(input, pos, MULTI_CHAR_PARTICLES))) continue;

    // Chinese word (CJK sequence)
    if (isChinese(ch)) {
      if (accept(this.extractChineseWord(input, pos))) continue;
    }

    // ASCII word (for mixed content)
    if (isAsciiIdentifierChar(ch)) {
      if (accept(this.extractAsciiWord(input, pos))) continue;
    }

    // Nothing matched: skip the unknown character
    pos++;
  }

  return new TokenStreamImpl(tokens, 'zh');
}
|
|
322
|
+
|
|
323
|
+
/**
 * Classify a raw token string into a token kind.
 *
 * Checks run in this order: particle, keyword, selector prefix
 * (`#`, `.`, `[`, `<`), opening string quote, leading ASCII digit.
 * Anything that matches none of them is an identifier.
 *
 * The backtick is included among the string quotes so that the result
 * agrees with tryChineseString, which treats a leading backtick as the
 * start of a string literal.
 *
 * @param token - Token text to classify.
 * @returns The first matching kind, or 'identifier' when nothing matches.
 */
classifyToken(token: string): TokenKind {
  if (PARTICLES.has(token)) return 'particle';
  if (this.isKeyword(token)) return 'keyword';

  // charAt(0) yields '' for an empty token, which matches no case below.
  switch (token.charAt(0)) {
    case '#':
    case '.':
    case '[':
    case '<':
      return 'selector';
    case '"':
    case "'":
    case '`':
    case '\u201C': // Chinese opening double quote
    case '\u2018': // Chinese opening single quote
      return 'literal';
    default:
      break;
  }

  // A leading ASCII digit marks a numeric literal.
  if (/^\d/.test(token)) return 'literal';

  return 'identifier';
}
|
|
345
|
+
|
|
346
|
+
/**
 * Read one Chinese word starting at `startPos`.
 *
 * First pass: return the first entry of `this.profileKeywords` whose native
 * form appears at `startPos` and consists only of Chinese characters. The
 * list is expected to be ordered longest-first (not verified here), which is
 * what makes the first hit the longest match. No morphological normalization
 * is attempted.
 *
 * Fallback: consume consecutive Chinese characters, stopping before a
 * single-character particle once at least one character has been taken.
 *
 * @param input - Full text being tokenized.
 * @param startPos - Index at which to start reading.
 * @returns A keyword, particle, or identifier token, or null when no Chinese
 *   character is found at `startPos`.
 */
private extractChineseWord(input: string, startPos: number): LanguageToken | null {
  // Keyword pass: the first matching entry wins.
  for (const entry of this.profileKeywords) {
    const native = entry.native;
    if (!input.startsWith(native, startPos)) continue;

    // Reject entries containing non-Chinese characters (avoids partial ASCII matches).
    const onlyChinese = native.split('').every((ch) => isChinese(ch));
    if (!onlyChinese) continue;

    return createToken(
      native,
      'keyword',
      createPosition(startPos, startPos + native.length),
      entry.normalized
    );
  }

  // Fallback pass: scan forward over Chinese characters.
  let end = startPos;
  while (end < input.length) {
    const ch = input[end];
    // A particle terminates the word, but only after something was consumed.
    if (PARTICLES.has(ch) && end > startPos) break;
    if (!isChinese(ch)) break;
    end += 1;
  }

  if (end === startPos) return null;

  const text = input.slice(startPos, end);
  const kind = PARTICLES.has(text) ? 'particle' : 'identifier';
  return createToken(text, kind, createPosition(startPos, end));
}
|
|
410
|
+
|
|
411
|
+
/**
 * Read a run of ASCII identifier characters starting at `startPos`
 * (used for mixed Chinese/English content).
 *
 * @param input - Full text being tokenized.
 * @param startPos - Index at which to start reading.
 * @returns An 'identifier' token covering the run, or null when the
 *   character at `startPos` is not an ASCII identifier character.
 */
private extractAsciiWord(input: string, startPos: number): LanguageToken | null {
  let end = startPos;
  while (end < input.length && isAsciiIdentifierChar(input[end])) {
    end += 1;
  }

  // Nothing consumed means there is no word here.
  if (end === startPos) return null;

  return createToken(input.slice(startPos, end), 'identifier', createPosition(startPos, end));
}
|
|
426
|
+
|
|
427
|
+
/**
 * Attempt to read a quoted string literal starting at `pos`.
 *
 * ASCII quotes (", ', `) are handed to tryString. Chinese curly quotes are
 * paired: \u201C is closed by \u201D, and \u2018 is closed by \u2019. The
 * resulting token value includes both quote characters.
 *
 * @param input - Full text being tokenized.
 * @param pos - Index of the candidate opening quote.
 * @returns Whatever tryString returns for ASCII quotes; a 'literal' token
 *   for a closed Chinese-quoted string; null when `pos` is not on an opening
 *   quote or the matching closing quote is never found.
 */
private tryChineseString(input: string, pos: number): LanguageToken | null {
  const opener = input[pos];

  if (opener === '"' || opener === "'" || opener === '`') {
    return this.tryString(input, pos);
  }

  // Pick the closing quote that pairs with the Chinese opening quote.
  let closer: string;
  if (opener === '\u201C') {
    closer = '\u201D';
  } else if (opener === '\u2018') {
    closer = '\u2019';
  } else {
    return null;
  }

  const closeIndex = input.indexOf(closer, pos + 1);
  if (closeIndex === -1) return null; // unterminated string

  const stop = closeIndex + 1;
  return createToken(input.slice(pos, stop), 'literal', createPosition(pos, stop));
}
|
|
465
|
+
|
|
466
|
+
/**
 * Read a number at `startPos`, together with any Chinese time-unit suffix.
 *
 * Delegates to tryNumberWithTimeUnits with CHINESE_TIME_UNITS, with both the
 * `allowSign` and `skipWhitespace` options turned off (Chinese time units
 * attach directly to the number, without whitespace).
 *
 * @param input - Full text being tokenized.
 * @param startPos - Index at which the number starts.
 * @returns The token produced by tryNumberWithTimeUnits, or null.
 */
private extractChineseNumber(input: string, startPos: number): LanguageToken | null {
  const numberOptions = { allowSign: false, skipWhitespace: false };
  return this.tryNumberWithTimeUnits(input, startPos, CHINESE_TIME_UNITS, numberOptions);
}
|
|
476
|
+
}
|
|
477
|
+
|
|
478
|
+
/**
|
|
479
|
+
* Shared ChineseTokenizer instance, constructed once when this module is evaluated.
|
|
480
|
+
*/
|
|
481
|
+
export const chineseTokenizer = new ChineseTokenizer();
|