@lokascript/semantic 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +686 -0
- package/dist/browser-ar.ar.global.js +2 -0
- package/dist/browser-core.core.global.js +2 -0
- package/dist/browser-de.de.global.js +2 -0
- package/dist/browser-east-asian.east-asian.global.js +2 -0
- package/dist/browser-en-tr.en-tr.global.js +2 -0
- package/dist/browser-en.en.global.js +2 -0
- package/dist/browser-es-en.es-en.global.js +2 -0
- package/dist/browser-es.es.global.js +2 -0
- package/dist/browser-fr.fr.global.js +2 -0
- package/dist/browser-id.id.global.js +2 -0
- package/dist/browser-ja.ja.global.js +2 -0
- package/dist/browser-ko.ko.global.js +2 -0
- package/dist/browser-lazy.lazy.global.js +2 -0
- package/dist/browser-priority.priority.global.js +2 -0
- package/dist/browser-pt.pt.global.js +2 -0
- package/dist/browser-qu.qu.global.js +2 -0
- package/dist/browser-sw.sw.global.js +2 -0
- package/dist/browser-tr.tr.global.js +2 -0
- package/dist/browser-western.western.global.js +2 -0
- package/dist/browser-zh.zh.global.js +2 -0
- package/dist/browser.global.js +3 -0
- package/dist/browser.global.js.map +1 -0
- package/dist/index.cjs +35051 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +3426 -0
- package/dist/index.d.ts +3426 -0
- package/dist/index.js +34890 -0
- package/dist/index.js.map +1 -0
- package/dist/languages/ar.d.ts +78 -0
- package/dist/languages/ar.js +1622 -0
- package/dist/languages/ar.js.map +1 -0
- package/dist/languages/de.d.ts +38 -0
- package/dist/languages/de.js +1168 -0
- package/dist/languages/de.js.map +1 -0
- package/dist/languages/en.d.ts +44 -0
- package/dist/languages/en.js +3491 -0
- package/dist/languages/en.js.map +1 -0
- package/dist/languages/es.d.ts +52 -0
- package/dist/languages/es.js +1493 -0
- package/dist/languages/es.js.map +1 -0
- package/dist/languages/fr.d.ts +37 -0
- package/dist/languages/fr.js +1159 -0
- package/dist/languages/fr.js.map +1 -0
- package/dist/languages/id.d.ts +35 -0
- package/dist/languages/id.js +1152 -0
- package/dist/languages/id.js.map +1 -0
- package/dist/languages/ja.d.ts +53 -0
- package/dist/languages/ja.js +1430 -0
- package/dist/languages/ja.js.map +1 -0
- package/dist/languages/ko.d.ts +51 -0
- package/dist/languages/ko.js +1729 -0
- package/dist/languages/ko.js.map +1 -0
- package/dist/languages/pt.d.ts +37 -0
- package/dist/languages/pt.js +1127 -0
- package/dist/languages/pt.js.map +1 -0
- package/dist/languages/qu.d.ts +36 -0
- package/dist/languages/qu.js +1143 -0
- package/dist/languages/qu.js.map +1 -0
- package/dist/languages/sw.d.ts +35 -0
- package/dist/languages/sw.js +1147 -0
- package/dist/languages/sw.js.map +1 -0
- package/dist/languages/tr.d.ts +45 -0
- package/dist/languages/tr.js +1529 -0
- package/dist/languages/tr.js.map +1 -0
- package/dist/languages/zh.d.ts +58 -0
- package/dist/languages/zh.js +1257 -0
- package/dist/languages/zh.js.map +1 -0
- package/dist/types-C4dcj53L.d.ts +600 -0
- package/package.json +202 -0
- package/src/__test-utils__/index.ts +7 -0
- package/src/__test-utils__/test-helpers.ts +8 -0
- package/src/__types__/test-helpers.ts +122 -0
- package/src/analysis/index.ts +479 -0
- package/src/ast-builder/command-mappers.ts +1133 -0
- package/src/ast-builder/expression-parser/index.ts +41 -0
- package/src/ast-builder/expression-parser/parser.ts +563 -0
- package/src/ast-builder/expression-parser/tokenizer.ts +394 -0
- package/src/ast-builder/expression-parser/types.ts +208 -0
- package/src/ast-builder/index.ts +536 -0
- package/src/ast-builder/value-converters.ts +172 -0
- package/src/bridge.ts +275 -0
- package/src/browser-ar.ts +162 -0
- package/src/browser-core.ts +231 -0
- package/src/browser-de.ts +162 -0
- package/src/browser-east-asian.ts +173 -0
- package/src/browser-en-tr.ts +165 -0
- package/src/browser-en.ts +157 -0
- package/src/browser-es-en.ts +200 -0
- package/src/browser-es.ts +170 -0
- package/src/browser-fr.ts +162 -0
- package/src/browser-id.ts +162 -0
- package/src/browser-ja.ts +162 -0
- package/src/browser-ko.ts +162 -0
- package/src/browser-lazy.ts +189 -0
- package/src/browser-priority.ts +214 -0
- package/src/browser-pt.ts +162 -0
- package/src/browser-qu.ts +162 -0
- package/src/browser-sw.ts +162 -0
- package/src/browser-tr.ts +162 -0
- package/src/browser-western.ts +181 -0
- package/src/browser-zh.ts +162 -0
- package/src/browser.ts +268 -0
- package/src/cache/index.ts +14 -0
- package/src/cache/semantic-cache.ts +344 -0
- package/src/core-bridge.ts +372 -0
- package/src/explicit/converter.ts +258 -0
- package/src/explicit/index.ts +18 -0
- package/src/explicit/parser.ts +236 -0
- package/src/explicit/renderer.ts +424 -0
- package/src/generators/command-schemas.ts +1636 -0
- package/src/generators/event-handler-generator.ts +109 -0
- package/src/generators/index.ts +117 -0
- package/src/generators/language-profiles.ts +139 -0
- package/src/generators/pattern-generator.ts +537 -0
- package/src/generators/profiles/arabic.ts +131 -0
- package/src/generators/profiles/bengali.ts +132 -0
- package/src/generators/profiles/chinese.ts +124 -0
- package/src/generators/profiles/english.ts +113 -0
- package/src/generators/profiles/french.ts +125 -0
- package/src/generators/profiles/german.ts +126 -0
- package/src/generators/profiles/hindi.ts +146 -0
- package/src/generators/profiles/index.ts +46 -0
- package/src/generators/profiles/indonesian.ts +125 -0
- package/src/generators/profiles/italian.ts +139 -0
- package/src/generators/profiles/japanese.ts +149 -0
- package/src/generators/profiles/korean.ts +127 -0
- package/src/generators/profiles/marker-templates.ts +288 -0
- package/src/generators/profiles/ms.ts +130 -0
- package/src/generators/profiles/polish.ts +249 -0
- package/src/generators/profiles/portuguese.ts +115 -0
- package/src/generators/profiles/quechua.ts +113 -0
- package/src/generators/profiles/russian.ts +260 -0
- package/src/generators/profiles/spanish.ts +130 -0
- package/src/generators/profiles/swahili.ts +129 -0
- package/src/generators/profiles/thai.ts +132 -0
- package/src/generators/profiles/tl.ts +128 -0
- package/src/generators/profiles/turkish.ts +124 -0
- package/src/generators/profiles/types.ts +165 -0
- package/src/generators/profiles/ukrainian.ts +270 -0
- package/src/generators/profiles/vietnamese.ts +133 -0
- package/src/generators/schema-error-codes.ts +160 -0
- package/src/generators/schema-validator.ts +391 -0
- package/src/index.ts +429 -0
- package/src/language-building-schema.ts +3170 -0
- package/src/language-loader.ts +394 -0
- package/src/languages/_all.ts +65 -0
- package/src/languages/ar.ts +15 -0
- package/src/languages/bn.ts +16 -0
- package/src/languages/de.ts +15 -0
- package/src/languages/en.ts +29 -0
- package/src/languages/es.ts +15 -0
- package/src/languages/fr.ts +15 -0
- package/src/languages/hi.ts +26 -0
- package/src/languages/id.ts +15 -0
- package/src/languages/index.ts +18 -0
- package/src/languages/it.ts +15 -0
- package/src/languages/ja.ts +15 -0
- package/src/languages/ko.ts +15 -0
- package/src/languages/ms.ts +16 -0
- package/src/languages/pl.ts +18 -0
- package/src/languages/pt.ts +15 -0
- package/src/languages/qu.ts +15 -0
- package/src/languages/ru.ts +26 -0
- package/src/languages/sw.ts +15 -0
- package/src/languages/th.ts +16 -0
- package/src/languages/tl.ts +16 -0
- package/src/languages/tr.ts +15 -0
- package/src/languages/uk.ts +26 -0
- package/src/languages/vi.ts +16 -0
- package/src/languages/zh.ts +15 -0
- package/src/parser/index.ts +15 -0
- package/src/parser/pattern-matcher.ts +1181 -0
- package/src/parser/semantic-parser.ts +573 -0
- package/src/parser/utils/index.ts +35 -0
- package/src/parser/utils/marker-resolution.ts +111 -0
- package/src/parser/utils/possessive-keywords.ts +43 -0
- package/src/parser/utils/role-positioning.ts +70 -0
- package/src/parser/utils/type-validation.ts +134 -0
- package/src/patterns/add/ar.ts +71 -0
- package/src/patterns/add/bn.ts +70 -0
- package/src/patterns/add/hi.ts +69 -0
- package/src/patterns/add/index.ts +87 -0
- package/src/patterns/add/it.ts +61 -0
- package/src/patterns/add/ja.ts +93 -0
- package/src/patterns/add/ko.ts +74 -0
- package/src/patterns/add/ms.ts +30 -0
- package/src/patterns/add/pl.ts +62 -0
- package/src/patterns/add/ru.ts +62 -0
- package/src/patterns/add/th.ts +49 -0
- package/src/patterns/add/tl.ts +30 -0
- package/src/patterns/add/tr.ts +71 -0
- package/src/patterns/add/uk.ts +62 -0
- package/src/patterns/add/vi.ts +61 -0
- package/src/patterns/add/zh.ts +71 -0
- package/src/patterns/builders.ts +207 -0
- package/src/patterns/decrement/bn.ts +70 -0
- package/src/patterns/decrement/de.ts +42 -0
- package/src/patterns/decrement/hi.ts +68 -0
- package/src/patterns/decrement/index.ts +79 -0
- package/src/patterns/decrement/it.ts +69 -0
- package/src/patterns/decrement/ms.ts +30 -0
- package/src/patterns/decrement/pl.ts +58 -0
- package/src/patterns/decrement/ru.ts +58 -0
- package/src/patterns/decrement/th.ts +49 -0
- package/src/patterns/decrement/tl.ts +30 -0
- package/src/patterns/decrement/tr.ts +48 -0
- package/src/patterns/decrement/uk.ts +58 -0
- package/src/patterns/decrement/vi.ts +61 -0
- package/src/patterns/decrement/zh.ts +32 -0
- package/src/patterns/en.ts +302 -0
- package/src/patterns/event-handler/ar.ts +151 -0
- package/src/patterns/event-handler/bn.ts +72 -0
- package/src/patterns/event-handler/de.ts +117 -0
- package/src/patterns/event-handler/en.ts +117 -0
- package/src/patterns/event-handler/es.ts +136 -0
- package/src/patterns/event-handler/fr.ts +117 -0
- package/src/patterns/event-handler/hi.ts +64 -0
- package/src/patterns/event-handler/id.ts +117 -0
- package/src/patterns/event-handler/index.ts +119 -0
- package/src/patterns/event-handler/it.ts +54 -0
- package/src/patterns/event-handler/ja.ts +118 -0
- package/src/patterns/event-handler/ko.ts +133 -0
- package/src/patterns/event-handler/ms.ts +30 -0
- package/src/patterns/event-handler/pl.ts +62 -0
- package/src/patterns/event-handler/pt.ts +117 -0
- package/src/patterns/event-handler/qu.ts +66 -0
- package/src/patterns/event-handler/ru.ts +62 -0
- package/src/patterns/event-handler/shared.ts +270 -0
- package/src/patterns/event-handler/sw.ts +117 -0
- package/src/patterns/event-handler/th.ts +53 -0
- package/src/patterns/event-handler/tl.ts +30 -0
- package/src/patterns/event-handler/tr.ts +170 -0
- package/src/patterns/event-handler/uk.ts +62 -0
- package/src/patterns/event-handler/vi.ts +61 -0
- package/src/patterns/event-handler/zh.ts +150 -0
- package/src/patterns/get/ar.ts +49 -0
- package/src/patterns/get/bn.ts +47 -0
- package/src/patterns/get/de.ts +32 -0
- package/src/patterns/get/hi.ts +52 -0
- package/src/patterns/get/index.ts +83 -0
- package/src/patterns/get/it.ts +56 -0
- package/src/patterns/get/ja.ts +53 -0
- package/src/patterns/get/ko.ts +53 -0
- package/src/patterns/get/ms.ts +30 -0
- package/src/patterns/get/pl.ts +57 -0
- package/src/patterns/get/ru.ts +57 -0
- package/src/patterns/get/th.ts +29 -0
- package/src/patterns/get/tl.ts +30 -0
- package/src/patterns/get/uk.ts +57 -0
- package/src/patterns/get/vi.ts +48 -0
- package/src/patterns/grammar-transformed/index.ts +39 -0
- package/src/patterns/grammar-transformed/ja.ts +1713 -0
- package/src/patterns/grammar-transformed/ko.ts +1311 -0
- package/src/patterns/grammar-transformed/tr.ts +1067 -0
- package/src/patterns/hide/ar.ts +67 -0
- package/src/patterns/hide/bn.ts +47 -0
- package/src/patterns/hide/de.ts +36 -0
- package/src/patterns/hide/hi.ts +61 -0
- package/src/patterns/hide/index.ts +91 -0
- package/src/patterns/hide/it.ts +56 -0
- package/src/patterns/hide/ja.ts +69 -0
- package/src/patterns/hide/ko.ts +69 -0
- package/src/patterns/hide/ms.ts +30 -0
- package/src/patterns/hide/pl.ts +57 -0
- package/src/patterns/hide/ru.ts +57 -0
- package/src/patterns/hide/th.ts +29 -0
- package/src/patterns/hide/tl.ts +30 -0
- package/src/patterns/hide/tr.ts +65 -0
- package/src/patterns/hide/uk.ts +57 -0
- package/src/patterns/hide/vi.ts +56 -0
- package/src/patterns/hide/zh.ts +68 -0
- package/src/patterns/increment/bn.ts +70 -0
- package/src/patterns/increment/de.ts +36 -0
- package/src/patterns/increment/hi.ts +68 -0
- package/src/patterns/increment/index.ts +79 -0
- package/src/patterns/increment/it.ts +69 -0
- package/src/patterns/increment/ms.ts +30 -0
- package/src/patterns/increment/pl.ts +58 -0
- package/src/patterns/increment/ru.ts +58 -0
- package/src/patterns/increment/th.ts +49 -0
- package/src/patterns/increment/tl.ts +30 -0
- package/src/patterns/increment/tr.ts +52 -0
- package/src/patterns/increment/uk.ts +58 -0
- package/src/patterns/increment/vi.ts +61 -0
- package/src/patterns/increment/zh.ts +32 -0
- package/src/patterns/index.ts +84 -0
- package/src/patterns/languages/en/control-flow.ts +93 -0
- package/src/patterns/languages/en/fetch.ts +62 -0
- package/src/patterns/languages/en/index.ts +42 -0
- package/src/patterns/languages/en/repeat.ts +67 -0
- package/src/patterns/languages/en/set.ts +48 -0
- package/src/patterns/languages/en/swap.ts +38 -0
- package/src/patterns/languages/en/temporal.ts +57 -0
- package/src/patterns/put/ar.ts +74 -0
- package/src/patterns/put/bn.ts +53 -0
- package/src/patterns/put/en.ts +74 -0
- package/src/patterns/put/es.ts +74 -0
- package/src/patterns/put/hi.ts +69 -0
- package/src/patterns/put/id.ts +96 -0
- package/src/patterns/put/index.ts +99 -0
- package/src/patterns/put/it.ts +56 -0
- package/src/patterns/put/ja.ts +75 -0
- package/src/patterns/put/ko.ts +67 -0
- package/src/patterns/put/ms.ts +30 -0
- package/src/patterns/put/pl.ts +81 -0
- package/src/patterns/put/ru.ts +85 -0
- package/src/patterns/put/th.ts +32 -0
- package/src/patterns/put/tl.ts +30 -0
- package/src/patterns/put/tr.ts +67 -0
- package/src/patterns/put/uk.ts +85 -0
- package/src/patterns/put/vi.ts +72 -0
- package/src/patterns/put/zh.ts +62 -0
- package/src/patterns/registry.ts +163 -0
- package/src/patterns/remove/ar.ts +71 -0
- package/src/patterns/remove/bn.ts +68 -0
- package/src/patterns/remove/hi.ts +69 -0
- package/src/patterns/remove/index.ts +87 -0
- package/src/patterns/remove/it.ts +69 -0
- package/src/patterns/remove/ja.ts +74 -0
- package/src/patterns/remove/ko.ts +78 -0
- package/src/patterns/remove/ms.ts +30 -0
- package/src/patterns/remove/pl.ts +62 -0
- package/src/patterns/remove/ru.ts +62 -0
- package/src/patterns/remove/th.ts +49 -0
- package/src/patterns/remove/tl.ts +30 -0
- package/src/patterns/remove/tr.ts +78 -0
- package/src/patterns/remove/uk.ts +62 -0
- package/src/patterns/remove/vi.ts +61 -0
- package/src/patterns/remove/zh.ts +72 -0
- package/src/patterns/set/ar.ts +84 -0
- package/src/patterns/set/bn.ts +53 -0
- package/src/patterns/set/de.ts +84 -0
- package/src/patterns/set/es.ts +92 -0
- package/src/patterns/set/fr.ts +88 -0
- package/src/patterns/set/hi.ts +56 -0
- package/src/patterns/set/id.ts +84 -0
- package/src/patterns/set/index.ts +107 -0
- package/src/patterns/set/it.ts +56 -0
- package/src/patterns/set/ja.ts +86 -0
- package/src/patterns/set/ko.ts +85 -0
- package/src/patterns/set/ms.ts +30 -0
- package/src/patterns/set/pl.ts +57 -0
- package/src/patterns/set/pt.ts +84 -0
- package/src/patterns/set/ru.ts +57 -0
- package/src/patterns/set/th.ts +31 -0
- package/src/patterns/set/tl.ts +30 -0
- package/src/patterns/set/tr.ts +107 -0
- package/src/patterns/set/uk.ts +57 -0
- package/src/patterns/set/vi.ts +53 -0
- package/src/patterns/set/zh.ts +84 -0
- package/src/patterns/show/ar.ts +67 -0
- package/src/patterns/show/bn.ts +47 -0
- package/src/patterns/show/de.ts +32 -0
- package/src/patterns/show/fr.ts +32 -0
- package/src/patterns/show/hi.ts +61 -0
- package/src/patterns/show/index.ts +95 -0
- package/src/patterns/show/it.ts +56 -0
- package/src/patterns/show/ja.ts +69 -0
- package/src/patterns/show/ko.ts +73 -0
- package/src/patterns/show/ms.ts +30 -0
- package/src/patterns/show/pl.ts +57 -0
- package/src/patterns/show/ru.ts +57 -0
- package/src/patterns/show/th.ts +29 -0
- package/src/patterns/show/tl.ts +30 -0
- package/src/patterns/show/tr.ts +65 -0
- package/src/patterns/show/uk.ts +57 -0
- package/src/patterns/show/vi.ts +56 -0
- package/src/patterns/show/zh.ts +68 -0
- package/src/patterns/take/ar.ts +51 -0
- package/src/patterns/take/index.ts +31 -0
- package/src/patterns/toggle/ar.ts +61 -0
- package/src/patterns/toggle/bn.ts +70 -0
- package/src/patterns/toggle/en.ts +61 -0
- package/src/patterns/toggle/es.ts +61 -0
- package/src/patterns/toggle/hi.ts +80 -0
- package/src/patterns/toggle/index.ts +95 -0
- package/src/patterns/toggle/it.ts +69 -0
- package/src/patterns/toggle/ja.ts +156 -0
- package/src/patterns/toggle/ko.ts +113 -0
- package/src/patterns/toggle/ms.ts +30 -0
- package/src/patterns/toggle/pl.ts +62 -0
- package/src/patterns/toggle/ru.ts +62 -0
- package/src/patterns/toggle/th.ts +50 -0
- package/src/patterns/toggle/tl.ts +30 -0
- package/src/patterns/toggle/tr.ts +88 -0
- package/src/patterns/toggle/uk.ts +62 -0
- package/src/patterns/toggle/vi.ts +61 -0
- package/src/patterns/toggle/zh.ts +99 -0
- package/src/public-api.ts +286 -0
- package/src/registry.ts +441 -0
- package/src/tokenizers/arabic.ts +723 -0
- package/src/tokenizers/base.ts +1300 -0
- package/src/tokenizers/bengali.ts +289 -0
- package/src/tokenizers/chinese.ts +481 -0
- package/src/tokenizers/english.ts +416 -0
- package/src/tokenizers/french.ts +326 -0
- package/src/tokenizers/german.ts +324 -0
- package/src/tokenizers/hindi.ts +319 -0
- package/src/tokenizers/index.ts +127 -0
- package/src/tokenizers/indonesian.ts +306 -0
- package/src/tokenizers/italian.ts +458 -0
- package/src/tokenizers/japanese.ts +447 -0
- package/src/tokenizers/korean.ts +642 -0
- package/src/tokenizers/morphology/arabic-normalizer.ts +242 -0
- package/src/tokenizers/morphology/french-normalizer.ts +268 -0
- package/src/tokenizers/morphology/german-normalizer.ts +256 -0
- package/src/tokenizers/morphology/index.ts +46 -0
- package/src/tokenizers/morphology/italian-normalizer.ts +329 -0
- package/src/tokenizers/morphology/japanese-normalizer.ts +288 -0
- package/src/tokenizers/morphology/korean-normalizer.ts +428 -0
- package/src/tokenizers/morphology/polish-normalizer.ts +264 -0
- package/src/tokenizers/morphology/portuguese-normalizer.ts +310 -0
- package/src/tokenizers/morphology/spanish-normalizer.ts +327 -0
- package/src/tokenizers/morphology/turkish-normalizer.ts +412 -0
- package/src/tokenizers/morphology/types.ts +211 -0
- package/src/tokenizers/ms.ts +198 -0
- package/src/tokenizers/polish.ts +354 -0
- package/src/tokenizers/portuguese.ts +304 -0
- package/src/tokenizers/quechua.ts +339 -0
- package/src/tokenizers/russian.ts +375 -0
- package/src/tokenizers/spanish.ts +403 -0
- package/src/tokenizers/swahili.ts +303 -0
- package/src/tokenizers/thai.ts +236 -0
- package/src/tokenizers/tl.ts +198 -0
- package/src/tokenizers/turkish.ts +411 -0
- package/src/tokenizers/ukrainian.ts +369 -0
- package/src/tokenizers/vietnamese.ts +410 -0
- package/src/types/grammar-types.ts +617 -0
- package/src/types/unified-profile.ts +267 -0
- package/src/types.ts +709 -0
- package/src/utils/confidence-calculator.ts +147 -0
- package/src/validators/command-validator.ts +380 -0
- package/src/validators/index.ts +15 -0
|
@@ -0,0 +1,326 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* French Tokenizer
|
|
3
|
+
*
|
|
4
|
+
* Tokenizes French hyperscript input.
|
|
5
|
+
* French characteristics:
|
|
6
|
+
* - SVO word order
|
|
7
|
+
* - Space-separated words
|
|
8
|
+
* - Prepositions
|
|
9
|
+
* - Accented letters and ligatures (é, è, ê, ë, à, â, ä, ù, û, ü, ô, î, ï, ç, œ, æ)
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
import type { LanguageToken, TokenKind, TokenStream } from '../types';
|
|
13
|
+
import {
|
|
14
|
+
BaseTokenizer,
|
|
15
|
+
TokenStreamImpl,
|
|
16
|
+
createToken,
|
|
17
|
+
createPosition,
|
|
18
|
+
createLatinCharClassifiers,
|
|
19
|
+
isWhitespace,
|
|
20
|
+
isSelectorStart,
|
|
21
|
+
isQuote,
|
|
22
|
+
isDigit,
|
|
23
|
+
isUrlStart,
|
|
24
|
+
type KeywordEntry,
|
|
25
|
+
type TimeUnitMapping,
|
|
26
|
+
} from './base';
|
|
27
|
+
import { frenchProfile } from '../generators/profiles/french';
|
|
28
|
+
|
|
29
|
+
// =============================================================================
|
|
30
|
+
// French Character Classification
|
|
31
|
+
// =============================================================================
|
|
32
|
+
|
|
33
|
+
/**
 * Single-character pattern accepted as a French letter: ASCII a-z / A-Z plus
 * the accented letters and ligatures listed in the character class.
 */
const FRENCH_LETTER_PATTERN = /[a-zA-ZàâäéèêëîïôùûüçœæÀÂÄÉÈÊËÎÏÔÙÛÜÇŒÆ]/;

// Classifier pair built by the shared Latin-script factory from the pattern above.
const frenchCharClassifiers = createLatinCharClassifiers(FRENCH_LETTER_PATTERN);

/** Used by the tokenizer to decide whether a word can start at a character. */
const isFrenchLetter = frenchCharClassifiers.isLetter;

/** Used by the tokenizer to decide whether a character continues the current word. */
const isFrenchIdentifierChar = frenchCharClassifiers.isIdentifierChar;
|
|
35
|
+
|
|
36
|
+
// =============================================================================
|
|
37
|
+
// French Prepositions
|
|
38
|
+
// =============================================================================
|
|
39
|
+
|
|
40
|
+
/**
 * French prepositions (and article contractions) that the tokenizer reports
 * as `particle` tokens. Lookups in this file are done on the lower-cased word,
 * so every entry is lower case. Accent-free spellings ('a', 'apres') are
 * listed next to their accented forms.
 */
const PREPOSITIONS = new Set<string>([
  // à (to, at) with its accent-free spelling; de (of, from) with its contractions du / des
  'à', 'a', 'de', 'du', 'des',
  // in, on, under
  'dans', 'sur', 'sous',
  // with, without, by, for, between
  'avec', 'sans', 'par', 'pour', 'entre',
  // before, after (accented and accent-free), since/from
  'avant', 'après', 'apres', 'depuis',
  // towards, at (someone's place), against
  'vers', 'chez', 'contre',
  // à + le, à + les
  'au', 'aux',
]);
|
|
64
|
+
|
|
65
|
+
// =============================================================================
|
|
66
|
+
// French Extras (keywords not in profile)
|
|
67
|
+
// =============================================================================
|
|
68
|
+
|
|
69
|
+
/**
 * `[native, normalized]` pairs for French words that are registered as
 * keywords in addition to whatever the language profile supplies.
 *
 * Groups, in order: literals, positional words, event names, references,
 * time units, accent-free spellings of accented keywords, and a few synonyms.
 *
 * NOTE(review): some natives contain a space ('plus proche', 'touche bas',
 * 'tant que', ...). Whether those can ever match depends on how words are
 * extracted and looked up by the tokenizer — confirm they are reachable.
 */
const FRENCH_EXTRA_PAIRS: ReadonlyArray<readonly [string, string]> = [
  // Values / literals
  ['vrai', 'true'],
  ['faux', 'false'],
  ['nul', 'null'],
  ['indéfini', 'undefined'],
  ['indefini', 'undefined'],

  // Positional words
  ['premier', 'first'],
  ['première', 'first'],
  ['premiere', 'first'],
  ['dernier', 'last'],
  ['dernière', 'last'],
  ['derniere', 'last'],
  ['suivant', 'next'],
  ['précédent', 'previous'],
  ['precedent', 'previous'],
  ['plus proche', 'closest'],
  ['parent', 'parent'],

  // Event names
  ['clic', 'click'],
  ['click', 'click'],
  ['entrée', 'input'],
  ['entree', 'input'],
  ['changement', 'change'],
  ['soumission', 'submit'],
  ['touche bas', 'keydown'],
  ['touche haut', 'keyup'],
  ['souris dessus', 'mouseover'],
  ['souris dehors', 'mouseout'],
  ['focus', 'focus'],
  ['flou', 'blur'],
  ['chargement', 'load'],
  ['défilement', 'scroll'],
  ['defilement', 'scroll'],

  // Additional references
  ['je', 'me'],
  ['mon', 'my'],
  ['ma', 'my'],
  ['mes', 'my'],
  ['ça', 'it'],
  ['ca', 'it'],
  ['resultat', 'result'],
  ['evenement', 'event'],

  // Time units
  ['seconde', 's'],
  ['secondes', 's'],
  ['milliseconde', 'ms'],
  ['millisecondes', 'ms'],
  ['minute', 'm'],
  ['minutes', 'm'],
  ['heure', 'h'],
  ['heures', 'h'],

  // Accent-free spellings (user convenience)
  ['prefixer', 'prepend'],
  ['creer', 'make'],
  ['definir', 'set'],
  ['etablir', 'set'],
  ['incrementer', 'increment'],
  ['decrementer', 'decrement'],
  ['declencher', 'trigger'],
  ['defocaliser', 'blur'],
  ['recuperer', 'fetch'],
  ['repeter', 'repeat'],
  ['arreter', 'halt'],
  ['defaut', 'default'],
  ['jusqua', 'until'],
  ['apres', 'after'],

  // Extra synonym for log
  ['journaliser', 'log'],

  // Extra synonym for morph
  ['transmuter', 'morph'],

  // Multi-word phrase
  ['tant que', 'while'],
];

/**
 * Extra keyword entries handed to `initializeKeywordsFromProfile` alongside
 * the French profile. Built from `FRENCH_EXTRA_PAIRS`, preserving its order.
 */
const FRENCH_EXTRAS: KeywordEntry[] = FRENCH_EXTRA_PAIRS.map(([native, normalized]) => ({
  native,
  normalized,
}));
|
|
160
|
+
|
|
161
|
+
// =============================================================================
|
|
162
|
+
// French Time Units
|
|
163
|
+
// =============================================================================
|
|
164
|
+
|
|
165
|
+
/**
 * French time-unit words and the suffix each one maps to, listed longest
 * pattern first so that a longer unit is tried before any shorter unit.
 */
const FRENCH_TIME_UNIT_WORDS: ReadonlyArray<readonly [string, string]> = [
  ['millisecondes', 'ms'],
  ['milliseconde', 'ms'],
  ['secondes', 's'],
  ['seconde', 's'],
  ['minutes', 'm'],
  ['minute', 'm'],
  ['heures', 'h'],
  ['heure', 'h'],
];

/**
 * Time-unit mappings passed to `tryNumberWithTimeUnits` when a number is
 * extracted. `length` is taken from the pattern itself and every entry is
 * matched case-insensitively.
 */
const FRENCH_TIME_UNITS: readonly TimeUnitMapping[] = FRENCH_TIME_UNIT_WORDS.map(
  ([pattern, suffix]) => ({
    pattern,
    suffix,
    length: pattern.length,
    caseInsensitive: true,
  })
);
|
|
179
|
+
|
|
180
|
+
// =============================================================================
|
|
181
|
+
// French Tokenizer Implementation
|
|
182
|
+
// =============================================================================
|
|
183
|
+
|
|
184
|
+
export class FrenchTokenizer extends BaseTokenizer {
|
|
185
|
+
readonly language = 'fr';
|
|
186
|
+
readonly direction = 'ltr' as const;
|
|
187
|
+
|
|
188
|
+
constructor() {
|
|
189
|
+
super();
|
|
190
|
+
this.initializeKeywordsFromProfile(frenchProfile, FRENCH_EXTRAS);
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
tokenize(input: string): TokenStream {
|
|
194
|
+
const tokens: LanguageToken[] = [];
|
|
195
|
+
let pos = 0;
|
|
196
|
+
|
|
197
|
+
while (pos < input.length) {
|
|
198
|
+
if (isWhitespace(input[pos])) {
|
|
199
|
+
pos++;
|
|
200
|
+
continue;
|
|
201
|
+
}
|
|
202
|
+
|
|
203
|
+
if (isSelectorStart(input[pos])) {
|
|
204
|
+
// Check for event modifier first (.once, .debounce(), etc.)
|
|
205
|
+
const modifierToken = this.tryEventModifier(input, pos);
|
|
206
|
+
if (modifierToken) {
|
|
207
|
+
tokens.push(modifierToken);
|
|
208
|
+
pos = modifierToken.position.end;
|
|
209
|
+
continue;
|
|
210
|
+
}
|
|
211
|
+
|
|
212
|
+
const selectorToken = this.trySelector(input, pos);
|
|
213
|
+
if (selectorToken) {
|
|
214
|
+
tokens.push(selectorToken);
|
|
215
|
+
pos = selectorToken.position.end;
|
|
216
|
+
continue;
|
|
217
|
+
}
|
|
218
|
+
}
|
|
219
|
+
|
|
220
|
+
if (isQuote(input[pos])) {
|
|
221
|
+
const stringToken = this.tryString(input, pos);
|
|
222
|
+
if (stringToken) {
|
|
223
|
+
tokens.push(stringToken);
|
|
224
|
+
pos = stringToken.position.end;
|
|
225
|
+
continue;
|
|
226
|
+
}
|
|
227
|
+
}
|
|
228
|
+
|
|
229
|
+
if (isUrlStart(input, pos)) {
|
|
230
|
+
const urlToken = this.tryUrl(input, pos);
|
|
231
|
+
if (urlToken) {
|
|
232
|
+
tokens.push(urlToken);
|
|
233
|
+
pos = urlToken.position.end;
|
|
234
|
+
continue;
|
|
235
|
+
}
|
|
236
|
+
}
|
|
237
|
+
|
|
238
|
+
if (
|
|
239
|
+
isDigit(input[pos]) ||
|
|
240
|
+
(input[pos] === '-' && pos + 1 < input.length && isDigit(input[pos + 1]))
|
|
241
|
+
) {
|
|
242
|
+
const numberToken = this.extractNumber(input, pos);
|
|
243
|
+
if (numberToken) {
|
|
244
|
+
tokens.push(numberToken);
|
|
245
|
+
pos = numberToken.position.end;
|
|
246
|
+
continue;
|
|
247
|
+
}
|
|
248
|
+
}
|
|
249
|
+
|
|
250
|
+
const varToken = this.tryVariableRef(input, pos);
|
|
251
|
+
if (varToken) {
|
|
252
|
+
tokens.push(varToken);
|
|
253
|
+
pos = varToken.position.end;
|
|
254
|
+
continue;
|
|
255
|
+
}
|
|
256
|
+
|
|
257
|
+
if (isFrenchLetter(input[pos])) {
|
|
258
|
+
const wordToken = this.extractWord(input, pos);
|
|
259
|
+
if (wordToken) {
|
|
260
|
+
tokens.push(wordToken);
|
|
261
|
+
pos = wordToken.position.end;
|
|
262
|
+
continue;
|
|
263
|
+
}
|
|
264
|
+
}
|
|
265
|
+
|
|
266
|
+
const operatorToken = this.tryOperator(input, pos);
|
|
267
|
+
if (operatorToken) {
|
|
268
|
+
tokens.push(operatorToken);
|
|
269
|
+
pos = operatorToken.position.end;
|
|
270
|
+
continue;
|
|
271
|
+
}
|
|
272
|
+
|
|
273
|
+
pos++;
|
|
274
|
+
}
|
|
275
|
+
|
|
276
|
+
return new TokenStreamImpl(tokens, 'fr');
|
|
277
|
+
}
|
|
278
|
+
|
|
279
|
+
classifyToken(token: string): TokenKind {
|
|
280
|
+
const lower = token.toLowerCase();
|
|
281
|
+
if (PREPOSITIONS.has(lower)) return 'particle';
|
|
282
|
+
// O(1) Map lookup instead of O(n) array search
|
|
283
|
+
if (this.isKeyword(lower)) return 'keyword';
|
|
284
|
+
if (token.startsWith('#') || token.startsWith('.') || token.startsWith('[')) return 'selector';
|
|
285
|
+
if (token.startsWith('"') || token.startsWith("'")) return 'literal';
|
|
286
|
+
if (/^\d/.test(token)) return 'literal';
|
|
287
|
+
return 'identifier';
|
|
288
|
+
}
|
|
289
|
+
|
|
290
|
+
/**
 * Read a run of French identifier characters starting at `startPos` and wrap it in a token.
 *
 * The lower-cased word is looked up as a keyword first (the token then carries
 * the keyword's normalized form), then as a preposition ('particle'); otherwise
 * it becomes an 'identifier'.
 *
 * @param input - Full text being tokenized.
 * @param startPos - Index at which the word is expected to begin.
 * @returns The word token, or `null` when no identifier character is found at `startPos`.
 */
private extractWord(input: string, startPos: number): LanguageToken | null {
  let end = startPos;
  while (end < input.length && isFrenchIdentifierChar(input[end])) {
    end += 1;
  }

  // Nothing consumed means there is no word here.
  if (end === startPos) {
    return null;
  }

  const word = input.slice(startPos, end);
  const lowered = word.toLowerCase();

  const entry = this.lookupKeyword(lowered);
  if (entry) {
    return createToken(word, 'keyword', createPosition(startPos, end), entry.normalized);
  }

  const kind = PREPOSITIONS.has(lowered) ? 'particle' : 'identifier';
  return createToken(word, kind, createPosition(startPos, end));
}
|
|
314
|
+
|
|
315
|
+
/**
 * Extract a number at `startPos`, including French time unit suffixes.
 *
 * Delegates to `tryNumberWithTimeUnits` with `FRENCH_TIME_UNITS`, with the
 * `allowSign` and `skipWhitespace` options both enabled.
 *
 * @param input - Full text being tokenized.
 * @param startPos - Index at which the number is expected to begin.
 * @returns Whatever `tryNumberWithTimeUnits` yields: a token, or `null`.
 */
private extractNumber(input: string, startPos: number): LanguageToken | null {
  const numberOptions = { allowSign: true, skipWhitespace: true };
  return this.tryNumberWithTimeUnits(input, startPos, FRENCH_TIME_UNITS, numberOptions);
}
|
|
324
|
+
}
|
|
325
|
+
|
|
326
|
+
/** Exported French tokenizer instance, constructed once when this module is evaluated. */
export const frenchTokenizer = new FrenchTokenizer();
|
|
@@ -0,0 +1,324 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* German Tokenizer
|
|
3
|
+
*
|
|
4
|
+
* Tokenizes German hyperscript input.
|
|
5
|
+
* German characteristics:
|
|
6
|
+
* - SVO word order (V2 in main clauses, but SVO for our purposes)
|
|
7
|
+
* - Space-separated words
|
|
8
|
+
* - Prepositions
|
|
9
|
+
* - Umlauts (ä, ö, ü) and ß
|
|
10
|
+
* - Compound nouns
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import type { LanguageToken, TokenKind, TokenStream } from '../types';
|
|
14
|
+
import {
|
|
15
|
+
BaseTokenizer,
|
|
16
|
+
TokenStreamImpl,
|
|
17
|
+
createToken,
|
|
18
|
+
createPosition,
|
|
19
|
+
createLatinCharClassifiers,
|
|
20
|
+
isWhitespace,
|
|
21
|
+
isSelectorStart,
|
|
22
|
+
isQuote,
|
|
23
|
+
isDigit,
|
|
24
|
+
isUrlStart,
|
|
25
|
+
type KeywordEntry,
|
|
26
|
+
type TimeUnitMapping,
|
|
27
|
+
} from './base';
|
|
28
|
+
import { germanProfile } from '../generators/profiles/german';
|
|
29
|
+
|
|
30
|
+
// =============================================================================
|
|
31
|
+
// German Character Classification
|
|
32
|
+
// =============================================================================
|
|
33
|
+
|
|
34
|
+
// Character predicates built from the German letter class (ASCII letters, umlauts and ß).
const germanCharClassifiers = createLatinCharClassifiers(/[a-zA-ZäöüÄÖÜß]/);
const isGermanLetter = germanCharClassifiers.isLetter;
const isGermanIdentifierChar = germanCharClassifiers.isIdentifierChar;
|
|
36
|
+
|
|
37
|
+
// =============================================================================
|
|
38
|
+
// German Prepositions
|
|
39
|
+
// =============================================================================
|
|
40
|
+
|
|
41
|
+
/**
 * German prepositions, stored in lower case.
 * Several entries have a second spelling without the umlaut or ß
 * (e.g. 'für' / 'fur', 'außerhalb' / 'ausserhalb').
 */
const PREPOSITIONS = new Set<string>([
  // at/on, on, from/out of, at/near, through
  'an', 'auf', 'aus', 'bei', 'durch',
  // for (with and without umlaut)
  'für', 'fur',
  // against, in, with, after/to, without, since
  'gegen', 'in', 'mit', 'nach', 'ohne', 'seit',
  // over/about (with and without umlaut)
  'über', 'uber',
  // around/at, under, from/of, before, to, between, until
  'um', 'unter', 'von', 'vor', 'zu', 'zwischen', 'bis',
  // opposite (with and without umlaut)
  'gegenüber', 'gegenuber',
  // during (with and without umlaut)
  'während', 'wahrend',
  // because of, despite, instead of, inside
  'wegen', 'trotz', 'statt', 'innerhalb',
  // outside (with ß and with "ss")
  'außerhalb', 'ausserhalb',
]);
|
|
75
|
+
|
|
76
|
+
// =============================================================================
|
|
77
|
+
// German Extras (keywords not in profile)
|
|
78
|
+
// =============================================================================
|
|
79
|
+
|
|
80
|
+
/**
 * `[native, normalized]` pairs for keywords not covered by the profile:
 * literals, positional words, event names, extra references, time units,
 * umlaut-free spellings and imperative verb forms.
 *
 * NOTE(review): 'nächste' is listed twice (once as 'next', once as 'closest');
 * which mapping takes effect depends on how duplicates are handled when the
 * keywords are registered — confirm the intended meaning.
 * NOTE(review): some event names contain a space (e.g. 'taste unten'); verify
 * that the word scanner can actually produce such a lookup key.
 */
const GERMAN_EXTRA_PAIRS: ReadonlyArray<readonly [string, string]> = [
  // Values/Literals
  ['wahr', 'true'],
  ['falsch', 'false'],
  ['null', 'null'],
  ['undefiniert', 'undefined'],

  // Positional
  ['erste', 'first'],
  ['erster', 'first'],
  ['erstes', 'first'],
  ['letzte', 'last'],
  ['letzter', 'last'],
  ['letztes', 'last'],
  ['nächste', 'next'],
  ['nachste', 'next'],
  ['vorherige', 'previous'],
  ['nächste', 'closest'],
  ['eltern', 'parent'],

  // Events
  ['klick', 'click'],
  ['click', 'click'],
  ['eingabe', 'input'],
  ['änderung', 'change'],
  ['anderung', 'change'],
  ['absenden', 'submit'],
  ['taste unten', 'keydown'],
  ['taste oben', 'keyup'],
  ['maus drüber', 'mouseover'],
  ['maus druber', 'mouseover'],
  ['maus weg', 'mouseout'],
  ['fokus', 'focus'],
  ['unschärfe', 'blur'],
  ['unscharfe', 'blur'],
  ['scrollen', 'scroll'],

  // Additional references
  ['meine', 'my'],
  ['meinen', 'my'],
  ['ergebnis', 'result'],
  ['ziel', 'target'],

  // Time units
  ['sekunde', 's'],
  ['sekunden', 's'],
  ['millisekunde', 'ms'],
  ['millisekunden', 'ms'],
  ['minute', 'm'],
  ['minuten', 'm'],
  ['stunde', 'h'],
  ['stunden', 'h'],

  // Umlaut-free variants (for user convenience)
  ['hinzufugen', 'add'],
  ['hinzufgen', 'add'],
  ['loschen', 'remove'],
  ['anhangen', 'append'],
  ['erhohen', 'increment'],
  ['ubergang', 'transition'],
  ['auslosen', 'trigger'],
  ['zuruckgeben', 'return'],
  ['anschliessend', 'then'],

  // Verb conjugation variants (imperatives for test cases)
  ['erhöhe', 'increment'],
  ['erhohe', 'increment'],
  ['verringere', 'decrement'],
];

/** Extra keyword entries, in table order, built from `GERMAN_EXTRA_PAIRS`. */
const GERMAN_EXTRAS: KeywordEntry[] = GERMAN_EXTRA_PAIRS.map(([native, normalized]) => ({
  native,
  normalized,
}));
|
|
158
|
+
|
|
159
|
+
// =============================================================================
|
|
160
|
+
// German Time Units
|
|
161
|
+
// =============================================================================
|
|
162
|
+
|
|
163
|
+
/**
 * German time unit patterns for number parsing.
 *
 * Sorted by pattern length (longest first) so that a plural form such as
 * 'sekunden' is always listed before the singular 'sekunde' that is its
 * prefix. Each `length` equals the character count of its `pattern`.
 */
const GERMAN_TIME_UNITS: readonly TimeUnitMapping[] = [
  { pattern: 'millisekunden', suffix: 'ms', length: 13, caseInsensitive: true },
  { pattern: 'millisekunde', suffix: 'ms', length: 12, caseInsensitive: true },
  { pattern: 'sekunden', suffix: 's', length: 8, caseInsensitive: true },
  { pattern: 'sekunde', suffix: 's', length: 7, caseInsensitive: true },
  { pattern: 'minuten', suffix: 'm', length: 7, caseInsensitive: true },
  // 'stunden' (7) moved ahead of 'minute' (6) so the longest-first ordering actually holds.
  { pattern: 'stunden', suffix: 'h', length: 7, caseInsensitive: true },
  { pattern: 'minute', suffix: 'm', length: 6, caseInsensitive: true },
  { pattern: 'stunde', suffix: 'h', length: 6, caseInsensitive: true },
];
|
|
177
|
+
|
|
178
|
+
// =============================================================================
|
|
179
|
+
// German Tokenizer Implementation
|
|
180
|
+
// =============================================================================
|
|
181
|
+
|
|
182
|
+
/**
 * Tokenizer for German hyperscript input.
 *
 * Keywords are registered from `germanProfile` together with `GERMAN_EXTRAS`.
 * Words that are not keywords are checked against `PREPOSITIONS` before being
 * treated as plain identifiers.
 */
export class GermanTokenizer extends BaseTokenizer {
  readonly language = 'de';
  readonly direction = 'ltr' as const;

  constructor() {
    super();
    this.initializeKeywordsFromProfile(germanProfile, GERMAN_EXTRAS);
  }

  /**
   * Scan `input` from left to right and collect the recognised tokens.
   *
   * At every non-whitespace position the recognisers are tried in a fixed
   * order: event modifier, selector, string, URL, number, variable reference,
   * word, operator. The first one that yields a token wins and scanning resumes
   * at that token's end position. When none matches, the character is skipped.
   *
   * @param input - Text to tokenize.
   * @returns A token stream over the collected tokens, tagged 'de'.
   */
  tokenize(input: string): TokenStream {
    const collected: LanguageToken[] = [];
    let cursor = 0;

    while (cursor < input.length) {
      const ch = input[cursor];

      if (isWhitespace(ch)) {
        cursor += 1;
        continue;
      }

      let match: LanguageToken | null = null;

      if (isSelectorStart(ch)) {
        // Event modifiers (.once, .debounce(), etc.) are tried before plain selectors.
        match = this.tryEventModifier(input, cursor) || this.trySelector(input, cursor);
      }

      if (!match && isQuote(ch)) {
        match = this.tryString(input, cursor);
      }

      if (!match && isUrlStart(input, cursor)) {
        match = this.tryUrl(input, cursor);
      }

      // A number starts with a digit, or with '-' immediately followed by a digit.
      if (
        !match &&
        (isDigit(ch) || (ch === '-' && cursor + 1 < input.length && isDigit(input[cursor + 1])))
      ) {
        match = this.extractNumber(input, cursor);
      }

      if (!match) {
        match = this.tryVariableRef(input, cursor);
      }

      if (!match && isGermanLetter(ch)) {
        match = this.extractWord(input, cursor);
      }

      if (!match) {
        match = this.tryOperator(input, cursor);
      }

      if (match) {
        collected.push(match);
        cursor = match.position.end;
      } else {
        // Unrecognised character: step over it.
        cursor += 1;
      }
    }

    return new TokenStreamImpl(collected, 'de');
  }

  /**
   * Classify a standalone token string.
   *
   * Checks are applied in this order: preposition ('particle'), keyword,
   * selector prefix (`#`, `.`, `[`), quoted string, leading digit. Anything
   * else is reported as an 'identifier'.
   *
   * @param token - Raw token text; preposition and keyword checks use its lower-cased form.
   * @returns The token kind for `token`.
   */
  classifyToken(token: string): TokenKind {
    const lowered = token.toLowerCase();

    if (PREPOSITIONS.has(lowered)) {
      return 'particle';
    }
    if (this.isKeyword(lowered)) {
      return 'keyword';
    }

    // The remaining rules depend only on how the original (non-lowered) text begins.
    switch (token.charAt(0)) {
      case '#':
      case '.':
      case '[':
        return 'selector';
      case '"':
      case "'":
        return 'literal';
      default:
        return /^\d/.test(token) ? 'literal' : 'identifier';
    }
  }

  /**
   * Read a run of German identifier characters starting at `startPos` and wrap it in a token.
   *
   * The lower-cased word is looked up as a keyword first (the token then
   * carries the keyword's normalized form), then as a preposition
   * ('particle'); otherwise it becomes an 'identifier'.
   *
   * @param input - Full text being tokenized.
   * @param startPos - Index at which the word is expected to begin.
   * @returns The word token, or `null` when no identifier character is found at `startPos`.
   */
  private extractWord(input: string, startPos: number): LanguageToken | null {
    let end = startPos;
    while (end < input.length && isGermanIdentifierChar(input[end])) {
      end += 1;
    }

    // Nothing consumed means there is no word here.
    if (end === startPos) {
      return null;
    }

    const word = input.slice(startPos, end);
    const lowered = word.toLowerCase();

    const entry = this.lookupKeyword(lowered);
    if (entry) {
      return createToken(word, 'keyword', createPosition(startPos, end), entry.normalized);
    }

    const kind = PREPOSITIONS.has(lowered) ? 'particle' : 'identifier';
    return createToken(word, kind, createPosition(startPos, end));
  }

  /**
   * Extract a number at `startPos`, including German time unit suffixes.
   *
   * Delegates to `tryNumberWithTimeUnits` with `GERMAN_TIME_UNITS`, with the
   * `allowSign` and `skipWhitespace` options both enabled.
   *
   * @param input - Full text being tokenized.
   * @param startPos - Index at which the number is expected to begin.
   * @returns Whatever `tryNumberWithTimeUnits` yields: a token, or `null`.
   */
  private extractNumber(input: string, startPos: number): LanguageToken | null {
    const numberOptions = { allowSign: true, skipWhitespace: true };
    return this.tryNumberWithTimeUnits(input, startPos, GERMAN_TIME_UNITS, numberOptions);
  }
}
|
|
323
|
+
|
|
324
|
+
/** Exported German tokenizer instance, constructed once when this module is evaluated. */
export const germanTokenizer = new GermanTokenizer();
|