@lokascript/semantic 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +686 -0
- package/dist/browser-ar.ar.global.js +2 -0
- package/dist/browser-core.core.global.js +2 -0
- package/dist/browser-de.de.global.js +2 -0
- package/dist/browser-east-asian.east-asian.global.js +2 -0
- package/dist/browser-en-tr.en-tr.global.js +2 -0
- package/dist/browser-en.en.global.js +2 -0
- package/dist/browser-es-en.es-en.global.js +2 -0
- package/dist/browser-es.es.global.js +2 -0
- package/dist/browser-fr.fr.global.js +2 -0
- package/dist/browser-id.id.global.js +2 -0
- package/dist/browser-ja.ja.global.js +2 -0
- package/dist/browser-ko.ko.global.js +2 -0
- package/dist/browser-lazy.lazy.global.js +2 -0
- package/dist/browser-priority.priority.global.js +2 -0
- package/dist/browser-pt.pt.global.js +2 -0
- package/dist/browser-qu.qu.global.js +2 -0
- package/dist/browser-sw.sw.global.js +2 -0
- package/dist/browser-tr.tr.global.js +2 -0
- package/dist/browser-western.western.global.js +2 -0
- package/dist/browser-zh.zh.global.js +2 -0
- package/dist/browser.global.js +3 -0
- package/dist/browser.global.js.map +1 -0
- package/dist/index.cjs +35051 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +3426 -0
- package/dist/index.d.ts +3426 -0
- package/dist/index.js +34890 -0
- package/dist/index.js.map +1 -0
- package/dist/languages/ar.d.ts +78 -0
- package/dist/languages/ar.js +1622 -0
- package/dist/languages/ar.js.map +1 -0
- package/dist/languages/de.d.ts +38 -0
- package/dist/languages/de.js +1168 -0
- package/dist/languages/de.js.map +1 -0
- package/dist/languages/en.d.ts +44 -0
- package/dist/languages/en.js +3491 -0
- package/dist/languages/en.js.map +1 -0
- package/dist/languages/es.d.ts +52 -0
- package/dist/languages/es.js +1493 -0
- package/dist/languages/es.js.map +1 -0
- package/dist/languages/fr.d.ts +37 -0
- package/dist/languages/fr.js +1159 -0
- package/dist/languages/fr.js.map +1 -0
- package/dist/languages/id.d.ts +35 -0
- package/dist/languages/id.js +1152 -0
- package/dist/languages/id.js.map +1 -0
- package/dist/languages/ja.d.ts +53 -0
- package/dist/languages/ja.js +1430 -0
- package/dist/languages/ja.js.map +1 -0
- package/dist/languages/ko.d.ts +51 -0
- package/dist/languages/ko.js +1729 -0
- package/dist/languages/ko.js.map +1 -0
- package/dist/languages/pt.d.ts +37 -0
- package/dist/languages/pt.js +1127 -0
- package/dist/languages/pt.js.map +1 -0
- package/dist/languages/qu.d.ts +36 -0
- package/dist/languages/qu.js +1143 -0
- package/dist/languages/qu.js.map +1 -0
- package/dist/languages/sw.d.ts +35 -0
- package/dist/languages/sw.js +1147 -0
- package/dist/languages/sw.js.map +1 -0
- package/dist/languages/tr.d.ts +45 -0
- package/dist/languages/tr.js +1529 -0
- package/dist/languages/tr.js.map +1 -0
- package/dist/languages/zh.d.ts +58 -0
- package/dist/languages/zh.js +1257 -0
- package/dist/languages/zh.js.map +1 -0
- package/dist/types-C4dcj53L.d.ts +600 -0
- package/package.json +202 -0
- package/src/__test-utils__/index.ts +7 -0
- package/src/__test-utils__/test-helpers.ts +8 -0
- package/src/__types__/test-helpers.ts +122 -0
- package/src/analysis/index.ts +479 -0
- package/src/ast-builder/command-mappers.ts +1133 -0
- package/src/ast-builder/expression-parser/index.ts +41 -0
- package/src/ast-builder/expression-parser/parser.ts +563 -0
- package/src/ast-builder/expression-parser/tokenizer.ts +394 -0
- package/src/ast-builder/expression-parser/types.ts +208 -0
- package/src/ast-builder/index.ts +536 -0
- package/src/ast-builder/value-converters.ts +172 -0
- package/src/bridge.ts +275 -0
- package/src/browser-ar.ts +162 -0
- package/src/browser-core.ts +231 -0
- package/src/browser-de.ts +162 -0
- package/src/browser-east-asian.ts +173 -0
- package/src/browser-en-tr.ts +165 -0
- package/src/browser-en.ts +157 -0
- package/src/browser-es-en.ts +200 -0
- package/src/browser-es.ts +170 -0
- package/src/browser-fr.ts +162 -0
- package/src/browser-id.ts +162 -0
- package/src/browser-ja.ts +162 -0
- package/src/browser-ko.ts +162 -0
- package/src/browser-lazy.ts +189 -0
- package/src/browser-priority.ts +214 -0
- package/src/browser-pt.ts +162 -0
- package/src/browser-qu.ts +162 -0
- package/src/browser-sw.ts +162 -0
- package/src/browser-tr.ts +162 -0
- package/src/browser-western.ts +181 -0
- package/src/browser-zh.ts +162 -0
- package/src/browser.ts +268 -0
- package/src/cache/index.ts +14 -0
- package/src/cache/semantic-cache.ts +344 -0
- package/src/core-bridge.ts +372 -0
- package/src/explicit/converter.ts +258 -0
- package/src/explicit/index.ts +18 -0
- package/src/explicit/parser.ts +236 -0
- package/src/explicit/renderer.ts +424 -0
- package/src/generators/command-schemas.ts +1636 -0
- package/src/generators/event-handler-generator.ts +109 -0
- package/src/generators/index.ts +117 -0
- package/src/generators/language-profiles.ts +139 -0
- package/src/generators/pattern-generator.ts +537 -0
- package/src/generators/profiles/arabic.ts +131 -0
- package/src/generators/profiles/bengali.ts +132 -0
- package/src/generators/profiles/chinese.ts +124 -0
- package/src/generators/profiles/english.ts +113 -0
- package/src/generators/profiles/french.ts +125 -0
- package/src/generators/profiles/german.ts +126 -0
- package/src/generators/profiles/hindi.ts +146 -0
- package/src/generators/profiles/index.ts +46 -0
- package/src/generators/profiles/indonesian.ts +125 -0
- package/src/generators/profiles/italian.ts +139 -0
- package/src/generators/profiles/japanese.ts +149 -0
- package/src/generators/profiles/korean.ts +127 -0
- package/src/generators/profiles/marker-templates.ts +288 -0
- package/src/generators/profiles/ms.ts +130 -0
- package/src/generators/profiles/polish.ts +249 -0
- package/src/generators/profiles/portuguese.ts +115 -0
- package/src/generators/profiles/quechua.ts +113 -0
- package/src/generators/profiles/russian.ts +260 -0
- package/src/generators/profiles/spanish.ts +130 -0
- package/src/generators/profiles/swahili.ts +129 -0
- package/src/generators/profiles/thai.ts +132 -0
- package/src/generators/profiles/tl.ts +128 -0
- package/src/generators/profiles/turkish.ts +124 -0
- package/src/generators/profiles/types.ts +165 -0
- package/src/generators/profiles/ukrainian.ts +270 -0
- package/src/generators/profiles/vietnamese.ts +133 -0
- package/src/generators/schema-error-codes.ts +160 -0
- package/src/generators/schema-validator.ts +391 -0
- package/src/index.ts +429 -0
- package/src/language-building-schema.ts +3170 -0
- package/src/language-loader.ts +394 -0
- package/src/languages/_all.ts +65 -0
- package/src/languages/ar.ts +15 -0
- package/src/languages/bn.ts +16 -0
- package/src/languages/de.ts +15 -0
- package/src/languages/en.ts +29 -0
- package/src/languages/es.ts +15 -0
- package/src/languages/fr.ts +15 -0
- package/src/languages/hi.ts +26 -0
- package/src/languages/id.ts +15 -0
- package/src/languages/index.ts +18 -0
- package/src/languages/it.ts +15 -0
- package/src/languages/ja.ts +15 -0
- package/src/languages/ko.ts +15 -0
- package/src/languages/ms.ts +16 -0
- package/src/languages/pl.ts +18 -0
- package/src/languages/pt.ts +15 -0
- package/src/languages/qu.ts +15 -0
- package/src/languages/ru.ts +26 -0
- package/src/languages/sw.ts +15 -0
- package/src/languages/th.ts +16 -0
- package/src/languages/tl.ts +16 -0
- package/src/languages/tr.ts +15 -0
- package/src/languages/uk.ts +26 -0
- package/src/languages/vi.ts +16 -0
- package/src/languages/zh.ts +15 -0
- package/src/parser/index.ts +15 -0
- package/src/parser/pattern-matcher.ts +1181 -0
- package/src/parser/semantic-parser.ts +573 -0
- package/src/parser/utils/index.ts +35 -0
- package/src/parser/utils/marker-resolution.ts +111 -0
- package/src/parser/utils/possessive-keywords.ts +43 -0
- package/src/parser/utils/role-positioning.ts +70 -0
- package/src/parser/utils/type-validation.ts +134 -0
- package/src/patterns/add/ar.ts +71 -0
- package/src/patterns/add/bn.ts +70 -0
- package/src/patterns/add/hi.ts +69 -0
- package/src/patterns/add/index.ts +87 -0
- package/src/patterns/add/it.ts +61 -0
- package/src/patterns/add/ja.ts +93 -0
- package/src/patterns/add/ko.ts +74 -0
- package/src/patterns/add/ms.ts +30 -0
- package/src/patterns/add/pl.ts +62 -0
- package/src/patterns/add/ru.ts +62 -0
- package/src/patterns/add/th.ts +49 -0
- package/src/patterns/add/tl.ts +30 -0
- package/src/patterns/add/tr.ts +71 -0
- package/src/patterns/add/uk.ts +62 -0
- package/src/patterns/add/vi.ts +61 -0
- package/src/patterns/add/zh.ts +71 -0
- package/src/patterns/builders.ts +207 -0
- package/src/patterns/decrement/bn.ts +70 -0
- package/src/patterns/decrement/de.ts +42 -0
- package/src/patterns/decrement/hi.ts +68 -0
- package/src/patterns/decrement/index.ts +79 -0
- package/src/patterns/decrement/it.ts +69 -0
- package/src/patterns/decrement/ms.ts +30 -0
- package/src/patterns/decrement/pl.ts +58 -0
- package/src/patterns/decrement/ru.ts +58 -0
- package/src/patterns/decrement/th.ts +49 -0
- package/src/patterns/decrement/tl.ts +30 -0
- package/src/patterns/decrement/tr.ts +48 -0
- package/src/patterns/decrement/uk.ts +58 -0
- package/src/patterns/decrement/vi.ts +61 -0
- package/src/patterns/decrement/zh.ts +32 -0
- package/src/patterns/en.ts +302 -0
- package/src/patterns/event-handler/ar.ts +151 -0
- package/src/patterns/event-handler/bn.ts +72 -0
- package/src/patterns/event-handler/de.ts +117 -0
- package/src/patterns/event-handler/en.ts +117 -0
- package/src/patterns/event-handler/es.ts +136 -0
- package/src/patterns/event-handler/fr.ts +117 -0
- package/src/patterns/event-handler/hi.ts +64 -0
- package/src/patterns/event-handler/id.ts +117 -0
- package/src/patterns/event-handler/index.ts +119 -0
- package/src/patterns/event-handler/it.ts +54 -0
- package/src/patterns/event-handler/ja.ts +118 -0
- package/src/patterns/event-handler/ko.ts +133 -0
- package/src/patterns/event-handler/ms.ts +30 -0
- package/src/patterns/event-handler/pl.ts +62 -0
- package/src/patterns/event-handler/pt.ts +117 -0
- package/src/patterns/event-handler/qu.ts +66 -0
- package/src/patterns/event-handler/ru.ts +62 -0
- package/src/patterns/event-handler/shared.ts +270 -0
- package/src/patterns/event-handler/sw.ts +117 -0
- package/src/patterns/event-handler/th.ts +53 -0
- package/src/patterns/event-handler/tl.ts +30 -0
- package/src/patterns/event-handler/tr.ts +170 -0
- package/src/patterns/event-handler/uk.ts +62 -0
- package/src/patterns/event-handler/vi.ts +61 -0
- package/src/patterns/event-handler/zh.ts +150 -0
- package/src/patterns/get/ar.ts +49 -0
- package/src/patterns/get/bn.ts +47 -0
- package/src/patterns/get/de.ts +32 -0
- package/src/patterns/get/hi.ts +52 -0
- package/src/patterns/get/index.ts +83 -0
- package/src/patterns/get/it.ts +56 -0
- package/src/patterns/get/ja.ts +53 -0
- package/src/patterns/get/ko.ts +53 -0
- package/src/patterns/get/ms.ts +30 -0
- package/src/patterns/get/pl.ts +57 -0
- package/src/patterns/get/ru.ts +57 -0
- package/src/patterns/get/th.ts +29 -0
- package/src/patterns/get/tl.ts +30 -0
- package/src/patterns/get/uk.ts +57 -0
- package/src/patterns/get/vi.ts +48 -0
- package/src/patterns/grammar-transformed/index.ts +39 -0
- package/src/patterns/grammar-transformed/ja.ts +1713 -0
- package/src/patterns/grammar-transformed/ko.ts +1311 -0
- package/src/patterns/grammar-transformed/tr.ts +1067 -0
- package/src/patterns/hide/ar.ts +67 -0
- package/src/patterns/hide/bn.ts +47 -0
- package/src/patterns/hide/de.ts +36 -0
- package/src/patterns/hide/hi.ts +61 -0
- package/src/patterns/hide/index.ts +91 -0
- package/src/patterns/hide/it.ts +56 -0
- package/src/patterns/hide/ja.ts +69 -0
- package/src/patterns/hide/ko.ts +69 -0
- package/src/patterns/hide/ms.ts +30 -0
- package/src/patterns/hide/pl.ts +57 -0
- package/src/patterns/hide/ru.ts +57 -0
- package/src/patterns/hide/th.ts +29 -0
- package/src/patterns/hide/tl.ts +30 -0
- package/src/patterns/hide/tr.ts +65 -0
- package/src/patterns/hide/uk.ts +57 -0
- package/src/patterns/hide/vi.ts +56 -0
- package/src/patterns/hide/zh.ts +68 -0
- package/src/patterns/increment/bn.ts +70 -0
- package/src/patterns/increment/de.ts +36 -0
- package/src/patterns/increment/hi.ts +68 -0
- package/src/patterns/increment/index.ts +79 -0
- package/src/patterns/increment/it.ts +69 -0
- package/src/patterns/increment/ms.ts +30 -0
- package/src/patterns/increment/pl.ts +58 -0
- package/src/patterns/increment/ru.ts +58 -0
- package/src/patterns/increment/th.ts +49 -0
- package/src/patterns/increment/tl.ts +30 -0
- package/src/patterns/increment/tr.ts +52 -0
- package/src/patterns/increment/uk.ts +58 -0
- package/src/patterns/increment/vi.ts +61 -0
- package/src/patterns/increment/zh.ts +32 -0
- package/src/patterns/index.ts +84 -0
- package/src/patterns/languages/en/control-flow.ts +93 -0
- package/src/patterns/languages/en/fetch.ts +62 -0
- package/src/patterns/languages/en/index.ts +42 -0
- package/src/patterns/languages/en/repeat.ts +67 -0
- package/src/patterns/languages/en/set.ts +48 -0
- package/src/patterns/languages/en/swap.ts +38 -0
- package/src/patterns/languages/en/temporal.ts +57 -0
- package/src/patterns/put/ar.ts +74 -0
- package/src/patterns/put/bn.ts +53 -0
- package/src/patterns/put/en.ts +74 -0
- package/src/patterns/put/es.ts +74 -0
- package/src/patterns/put/hi.ts +69 -0
- package/src/patterns/put/id.ts +96 -0
- package/src/patterns/put/index.ts +99 -0
- package/src/patterns/put/it.ts +56 -0
- package/src/patterns/put/ja.ts +75 -0
- package/src/patterns/put/ko.ts +67 -0
- package/src/patterns/put/ms.ts +30 -0
- package/src/patterns/put/pl.ts +81 -0
- package/src/patterns/put/ru.ts +85 -0
- package/src/patterns/put/th.ts +32 -0
- package/src/patterns/put/tl.ts +30 -0
- package/src/patterns/put/tr.ts +67 -0
- package/src/patterns/put/uk.ts +85 -0
- package/src/patterns/put/vi.ts +72 -0
- package/src/patterns/put/zh.ts +62 -0
- package/src/patterns/registry.ts +163 -0
- package/src/patterns/remove/ar.ts +71 -0
- package/src/patterns/remove/bn.ts +68 -0
- package/src/patterns/remove/hi.ts +69 -0
- package/src/patterns/remove/index.ts +87 -0
- package/src/patterns/remove/it.ts +69 -0
- package/src/patterns/remove/ja.ts +74 -0
- package/src/patterns/remove/ko.ts +78 -0
- package/src/patterns/remove/ms.ts +30 -0
- package/src/patterns/remove/pl.ts +62 -0
- package/src/patterns/remove/ru.ts +62 -0
- package/src/patterns/remove/th.ts +49 -0
- package/src/patterns/remove/tl.ts +30 -0
- package/src/patterns/remove/tr.ts +78 -0
- package/src/patterns/remove/uk.ts +62 -0
- package/src/patterns/remove/vi.ts +61 -0
- package/src/patterns/remove/zh.ts +72 -0
- package/src/patterns/set/ar.ts +84 -0
- package/src/patterns/set/bn.ts +53 -0
- package/src/patterns/set/de.ts +84 -0
- package/src/patterns/set/es.ts +92 -0
- package/src/patterns/set/fr.ts +88 -0
- package/src/patterns/set/hi.ts +56 -0
- package/src/patterns/set/id.ts +84 -0
- package/src/patterns/set/index.ts +107 -0
- package/src/patterns/set/it.ts +56 -0
- package/src/patterns/set/ja.ts +86 -0
- package/src/patterns/set/ko.ts +85 -0
- package/src/patterns/set/ms.ts +30 -0
- package/src/patterns/set/pl.ts +57 -0
- package/src/patterns/set/pt.ts +84 -0
- package/src/patterns/set/ru.ts +57 -0
- package/src/patterns/set/th.ts +31 -0
- package/src/patterns/set/tl.ts +30 -0
- package/src/patterns/set/tr.ts +107 -0
- package/src/patterns/set/uk.ts +57 -0
- package/src/patterns/set/vi.ts +53 -0
- package/src/patterns/set/zh.ts +84 -0
- package/src/patterns/show/ar.ts +67 -0
- package/src/patterns/show/bn.ts +47 -0
- package/src/patterns/show/de.ts +32 -0
- package/src/patterns/show/fr.ts +32 -0
- package/src/patterns/show/hi.ts +61 -0
- package/src/patterns/show/index.ts +95 -0
- package/src/patterns/show/it.ts +56 -0
- package/src/patterns/show/ja.ts +69 -0
- package/src/patterns/show/ko.ts +73 -0
- package/src/patterns/show/ms.ts +30 -0
- package/src/patterns/show/pl.ts +57 -0
- package/src/patterns/show/ru.ts +57 -0
- package/src/patterns/show/th.ts +29 -0
- package/src/patterns/show/tl.ts +30 -0
- package/src/patterns/show/tr.ts +65 -0
- package/src/patterns/show/uk.ts +57 -0
- package/src/patterns/show/vi.ts +56 -0
- package/src/patterns/show/zh.ts +68 -0
- package/src/patterns/take/ar.ts +51 -0
- package/src/patterns/take/index.ts +31 -0
- package/src/patterns/toggle/ar.ts +61 -0
- package/src/patterns/toggle/bn.ts +70 -0
- package/src/patterns/toggle/en.ts +61 -0
- package/src/patterns/toggle/es.ts +61 -0
- package/src/patterns/toggle/hi.ts +80 -0
- package/src/patterns/toggle/index.ts +95 -0
- package/src/patterns/toggle/it.ts +69 -0
- package/src/patterns/toggle/ja.ts +156 -0
- package/src/patterns/toggle/ko.ts +113 -0
- package/src/patterns/toggle/ms.ts +30 -0
- package/src/patterns/toggle/pl.ts +62 -0
- package/src/patterns/toggle/ru.ts +62 -0
- package/src/patterns/toggle/th.ts +50 -0
- package/src/patterns/toggle/tl.ts +30 -0
- package/src/patterns/toggle/tr.ts +88 -0
- package/src/patterns/toggle/uk.ts +62 -0
- package/src/patterns/toggle/vi.ts +61 -0
- package/src/patterns/toggle/zh.ts +99 -0
- package/src/public-api.ts +286 -0
- package/src/registry.ts +441 -0
- package/src/tokenizers/arabic.ts +723 -0
- package/src/tokenizers/base.ts +1300 -0
- package/src/tokenizers/bengali.ts +289 -0
- package/src/tokenizers/chinese.ts +481 -0
- package/src/tokenizers/english.ts +416 -0
- package/src/tokenizers/french.ts +326 -0
- package/src/tokenizers/german.ts +324 -0
- package/src/tokenizers/hindi.ts +319 -0
- package/src/tokenizers/index.ts +127 -0
- package/src/tokenizers/indonesian.ts +306 -0
- package/src/tokenizers/italian.ts +458 -0
- package/src/tokenizers/japanese.ts +447 -0
- package/src/tokenizers/korean.ts +642 -0
- package/src/tokenizers/morphology/arabic-normalizer.ts +242 -0
- package/src/tokenizers/morphology/french-normalizer.ts +268 -0
- package/src/tokenizers/morphology/german-normalizer.ts +256 -0
- package/src/tokenizers/morphology/index.ts +46 -0
- package/src/tokenizers/morphology/italian-normalizer.ts +329 -0
- package/src/tokenizers/morphology/japanese-normalizer.ts +288 -0
- package/src/tokenizers/morphology/korean-normalizer.ts +428 -0
- package/src/tokenizers/morphology/polish-normalizer.ts +264 -0
- package/src/tokenizers/morphology/portuguese-normalizer.ts +310 -0
- package/src/tokenizers/morphology/spanish-normalizer.ts +327 -0
- package/src/tokenizers/morphology/turkish-normalizer.ts +412 -0
- package/src/tokenizers/morphology/types.ts +211 -0
- package/src/tokenizers/ms.ts +198 -0
- package/src/tokenizers/polish.ts +354 -0
- package/src/tokenizers/portuguese.ts +304 -0
- package/src/tokenizers/quechua.ts +339 -0
- package/src/tokenizers/russian.ts +375 -0
- package/src/tokenizers/spanish.ts +403 -0
- package/src/tokenizers/swahili.ts +303 -0
- package/src/tokenizers/thai.ts +236 -0
- package/src/tokenizers/tl.ts +198 -0
- package/src/tokenizers/turkish.ts +411 -0
- package/src/tokenizers/ukrainian.ts +369 -0
- package/src/tokenizers/vietnamese.ts +410 -0
- package/src/types/grammar-types.ts +617 -0
- package/src/types/unified-profile.ts +267 -0
- package/src/types.ts +709 -0
- package/src/utils/confidence-calculator.ts +147 -0
- package/src/validators/command-validator.ts +380 -0
- package/src/validators/index.ts +15 -0
|
@@ -0,0 +1,723 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Arabic Tokenizer
|
|
3
|
+
*
|
|
4
|
+
* Tokenizes Arabic hyperscript input.
|
|
5
|
+
* Arabic is challenging because:
|
|
6
|
+
* - Right-to-left (RTL) text direction
|
|
7
|
+
* - Prefix prepositions that attach to words (بـ, لـ, كـ)
|
|
8
|
+
* - Root-pattern morphology
|
|
9
|
+
* - CSS selectors are LTR islands within RTL text
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
import type { LanguageToken, TokenKind, TokenStream } from '../types';
|
|
13
|
+
import {
|
|
14
|
+
BaseTokenizer,
|
|
15
|
+
TokenStreamImpl,
|
|
16
|
+
createToken,
|
|
17
|
+
createPosition,
|
|
18
|
+
createUnicodeRangeClassifier,
|
|
19
|
+
isWhitespace,
|
|
20
|
+
isSelectorStart,
|
|
21
|
+
isQuote,
|
|
22
|
+
isDigit,
|
|
23
|
+
isAsciiIdentifierChar,
|
|
24
|
+
isUrlStart,
|
|
25
|
+
type KeywordEntry,
|
|
26
|
+
type TimeUnitMapping,
|
|
27
|
+
} from './base';
|
|
28
|
+
import { ArabicMorphologicalNormalizer } from './morphology/arabic-normalizer';
|
|
29
|
+
import { arabicProfile } from '../generators/profiles/arabic';
|
|
30
|
+
|
|
31
|
+
// =============================================================================
|
|
32
|
+
// Arabic Character Classification
|
|
33
|
+
// =============================================================================
|
|
34
|
+
|
|
35
|
+
/**
 * Check if character is Arabic.
 *
 * Built by `createUnicodeRangeClassifier` from one `[start, end]` code point
 * pair per Arabic Unicode block listed below.
 *
 * NOTE(review): the final pair ends at 0xfeff. U+FEFF is the byte-order mark /
 * ZERO WIDTH NO-BREAK SPACE rather than a letter — confirm that treating it as
 * Arabic is intended (presumably the bounds are inclusive; verify in `./base`).
 */
const isArabic = createUnicodeRangeClassifier([
  [0x0600, 0x06ff], // Arabic
  [0x0750, 0x077f], // Arabic Supplement
  [0x08a0, 0x08ff], // Arabic Extended-A
  [0xfb50, 0xfdff], // Arabic Presentation Forms-A
  [0xfe70, 0xfeff], // Arabic Presentation Forms-B
]);
|
|
43
|
+
|
|
44
|
+
// =============================================================================
|
|
45
|
+
// Arabic Prefixes and Prepositions
|
|
46
|
+
// =============================================================================
|
|
47
|
+
|
|
48
|
+
/**
 * Arabic one-letter particles that are written attached to the following word.
 *
 * Each entry is spelled with a trailing elongation mark (ـ) to indicate the
 * attachment, mirroring how these particles appear in patterns. Three of the
 * entries are prepositions (bi-, li-, ka-); the fourth, wa-, is the
 * conjunction "and".
 */
const ATTACHED_PREFIXES = new Set<string>([
  // Prepositions
  'بـ', // bi- (with, by)
  'لـ', // li- (to, for)
  'كـ', // ka- (like, as)

  // Conjunction
  'وـ', // wa- (and)
]);
|
|
58
|
+
|
|
59
|
+
/**
 * Lookup table of Arabic proclitics — particles written directly in front of
 * the following word with no separating space.
 *
 * Keys are the bare Arabic letters (no elongation mark). Each value carries the
 * English `normalized` form and a coarse `type`:
 *
 * - Single-letter conjunctions (و, ف), intended to be emitted as separate
 *   conjunction tokens so polysyndetic coordination (A وB وC) works.
 * - Single-letter attached prepositions (ب, ل, ك).
 * - Two-letter conjunction + preposition sequences (ول, وب, فل, ...). These are
 *   typed `'conjunction'` and use a hyphenated `normalized` value such as
 *   `'and-to'`; any splitting into components happens in the tokenizer, not in
 *   this table.
 *
 * @see NATIVE_REVIEW_NEEDED.md for implementation details
 */
const PROCLITICS = new Map<string, { normalized: string; type: 'conjunction' | 'preposition' }>(
  (
    [
      // Conjunctions (single character)
      ['و', 'and', 'conjunction'], // wa - conjunction "and"
      ['ف', 'then', 'conjunction'], // fa - conjunction "then/so"

      // Attached prefix prepositions
      ['ب', 'with', 'preposition'], // bi- (with, by)
      ['ل', 'to', 'preposition'], // li- (to, for)
      ['ك', 'like', 'preposition'], // ka- (like, as)

      // Multi-proclitic sequences (conjunction + preposition)
      ['ول', 'and-to', 'conjunction'], // wa + li-
      ['وب', 'and-with', 'conjunction'], // wa + bi-
      ['وك', 'and-like', 'conjunction'], // wa + ka-
      ['فل', 'then-to', 'conjunction'], // fa + li-
      ['فب', 'then-with', 'conjunction'], // fa + bi-
      ['فك', 'then-like', 'conjunction'], // fa + ka-
    ] as const
  ).map(
    ([surface, normalized, type]): [
      string,
      { normalized: string; type: 'conjunction' | 'preposition' },
    ] => [surface, { normalized, type }],
  ),
);
|
|
89
|
+
|
|
90
|
+
/**
 * Arabic temporal markers (event trigger keywords) with formality and confidence tracking.
 *
 * Formality levels:
 * - 'formal': Modern Standard Arabic (MSA) - preferred in written/formal contexts
 * - 'neutral': Common in both MSA and dialects
 * - 'dialectal': Informal/colloquial - common in spoken Arabic
 *
 * Confidence reflects how reliably the marker indicates an event trigger ("on" event).
 * Formal markers have higher confidence due to standardization.
 */
interface TemporalMarkerMetadata {
  /** English keyword the marker normalizes to (every entry in TEMPORAL_MARKERS uses 'on'). */
  readonly normalized: string;
  /** Register of the marker; see the formality levels above. */
  readonly formality: 'formal' | 'neutral' | 'dialectal';
  /** How reliably the marker signals an event trigger; the table uses values between 0 and 1. */
  readonly confidence: number;
  /** Short human-readable gloss of the marker. */
  readonly description: string;
}
|
|
107
|
+
|
|
108
|
+
/**
 * Arabic words that introduce an event trigger, keyed by their surface form.
 *
 * Every marker normalizes to `'on'`; the entries differ in formality, in the
 * confidence assigned to them and in their gloss. `لمّا` (with shadda) and `لما`
 * (without) are listed separately, so both spellings are found by exact lookup.
 *
 * The object literal is converted with `Object.entries`, which yields string
 * keys in insertion order, so the Map iterates in the order written here.
 */
const TEMPORAL_MARKERS = new Map<string, TemporalMarkerMetadata>(
  Object.entries<TemporalMarkerMetadata>({
    // Formal (MSA)
    'عندما': {
      normalized: 'on',
      formality: 'formal',
      confidence: 0.95,
      description: 'when (formal MSA)',
    },
    'حينما': {
      normalized: 'on',
      formality: 'formal',
      confidence: 0.93,
      description: 'when/whenever (formal)',
    },

    // Neutral
    'عند': {
      normalized: 'on',
      formality: 'neutral',
      confidence: 0.88,
      description: 'at/when (neutral)',
    },
    'حين': {
      normalized: 'on',
      formality: 'neutral',
      confidence: 0.85,
      description: 'when/time (neutral)',
    },

    // Dialectal
    'لمّا': {
      normalized: 'on',
      formality: 'dialectal',
      confidence: 0.7,
      description: 'when (dialectal, with shadda)',
    },
    'لما': {
      normalized: 'on',
      formality: 'dialectal',
      confidence: 0.68,
      description: 'when (dialectal, no diacritic)',
    },

    // Neutral (temporal use of a preposition)
    'لدى': {
      normalized: 'on',
      formality: 'neutral',
      confidence: 0.82,
      description: 'at/with (temporal)',
    },
  }),
);
|
|
173
|
+
|
|
174
|
+
/**
 * Arabic prepositions that stand as separate words.
 *
 * Temporal markers such as عند and لدى are deliberately absent: they live in
 * TEMPORAL_MARKERS, where they carry formality metadata.
 */
const PREPOSITIONS = new Set<string>([
  // Location and direction
  'في', // fī (in)
  'على', // ʿalā (on)
  'من', // min (from)
  'إلى', // ilā (to)
  'الى', // ilā (alternative spelling)
  // 'عند' removed - it's a temporal marker with metadata

  // Accompaniment and topic
  'مع', // maʿa (with)
  'عن', // ʿan (about, from)

  // Relative position
  'قبل', // qabl (before)
  'بعد', // baʿd (after)
  'بين', // bayn (between)
]);
|
|
192
|
+
|
|
193
|
+
// =============================================================================
|
|
194
|
+
// Arabic Extras (keywords not in profile)
|
|
195
|
+
// =============================================================================
|
|
196
|
+
|
|
197
|
+
/**
 * Keywords supplied in addition to those in the Arabic language profile.
 *
 * Groups, in order:
 * - Literals (true, false, null, undefined)
 * - Positional words
 * - Event names
 * - References
 * - Time units
 * - Spelling variants of commands written without diacritics/hamza
 * - Control flow helpers
 * - Modifiers
 *
 * Temporal markers (عندما, حينما, etc.) are NOT listed here; they are kept in
 * the TEMPORAL_MARKERS map together with their formality metadata.
 *
 * Each row below is `[native, normalized]` and is expanded into a
 * `{ native, normalized }` entry, preserving the order written.
 *
 * NOTE(review): some single words are prefixes of multi-word entries with a
 * different meaning ('غير' → toggle vs 'غير معرف' → undefined; 'تمرير' → scroll
 * vs 'تمرير الفأرة' → mouseover), and 'قبل' / 'بعد' also appear in PREPOSITIONS —
 * confirm the matcher resolves these the intended way.
 */
const ARABIC_EXTRAS: KeywordEntry[] = (
  [
    // Values/Literals
    ['صحيح', 'true'],
    ['خطأ', 'false'],
    ['null', 'null'],
    ['فارغ', 'null'],
    ['غير معرف', 'undefined'],

    // Positional
    ['الأول', 'first'],
    ['أول', 'first'],
    ['الأخير', 'last'],
    ['آخر', 'last'],
    ['التالي', 'next'],
    ['السابق', 'previous'],
    ['الأقرب', 'closest'],
    ['الأب', 'parent'],

    // Events
    ['النقر', 'click'],
    ['نقر', 'click'],
    ['الإدخال', 'input'],
    ['إدخال', 'input'],
    ['التغيير', 'change'],
    ['تغيير', 'change'],
    ['الإرسال', 'submit'],
    ['إرسال', 'submit'],
    ['التركيز', 'focus'],
    ['فقدان التركيز', 'blur'],
    ['ضغط', 'keydown'],
    ['رفع', 'keyup'],
    ['تمرير الفأرة', 'mouseover'],
    ['مغادرة الفأرة', 'mouseout'],
    ['تحميل', 'load'],
    ['تمرير', 'scroll'],

    // References
    ['أنا', 'me'],
    ['هو', 'it'],
    ['هي', 'it'],
    ['النتيجة', 'result'],
    ['الحدث', 'event'],
    ['الهدف', 'target'],

    // Time units
    ['ثانية', 's'],
    ['ثواني', 's'],
    ['ملي ثانية', 'ms'],
    ['دقيقة', 'm'],
    ['دقائق', 'm'],
    ['ساعة', 'h'],
    ['ساعات', 'h'],

    // Additional spelling variants (without diacritics)
    ['بدل', 'toggle'],
    ['غير', 'toggle'],
    ['اضف', 'add'],
    ['ازل', 'remove'],
    ['اضع', 'put'],
    ['يضع', 'put'],
    ['اجعل', 'put'],
    ['عين', 'set'],
    ['زد', 'increment'],
    ['ارفع', 'increment'],
    ['انقص', 'decrement'],
    ['قلل', 'decrement'],
    ['سجل', 'log'],
    ['اظهر', 'show'],
    ['اعرض', 'show'],
    ['اخف', 'hide'],
    ['اخفي', 'hide'],
    ['شغل', 'trigger'],
    ['ارسل', 'send'],
    ['ركز', 'focus'],
    ['شوش', 'blur'],
    ['اذا', 'if'],
    ['لو', 'if'],
    ['والا', 'else'],
    ['توقف', 'halt'],
    ['انسخ', 'clone'],

    // Control flow helpers
    ['إذن', 'then'],
    ['فإن', 'then'],
    ['نهاية', 'end'],

    // Modifiers
    ['قبل', 'before'],
    ['بعد', 'after'],
  ] as const
).map(([native, normalized]) => ({ native, normalized }));
|
|
299
|
+
|
|
300
|
+
// =============================================================================
|
|
301
|
+
// Arabic Time Units
|
|
302
|
+
// =============================================================================
|
|
303
|
+
|
|
304
|
+
/**
 * Arabic time-unit spellings that may follow a numeric literal.
 *
 * Each entry maps a native spelling (`pattern`) to the canonical unit
 * `suffix` (`ms`, `s`, `m`, `h`). `length` must equal `pattern.length`;
 * entries are listed longest-first so a longer spelling is tried before any
 * shorter one.
 *
 * The millisecond unit is written as two words, so both the space-separated
 * and the underscore-separated spellings are listed (each is 9 characters).
 * Matching is case-sensitive because Arabic script has no letter case.
 */
const ARABIC_TIME_UNITS: readonly TimeUnitMapping[] = [
  { pattern: 'ملي ثانية', suffix: 'ms', length: 9, caseInsensitive: false },
  { pattern: 'ملي_ثانية', suffix: 'ms', length: 9, caseInsensitive: false },
  { pattern: 'دقائق', suffix: 'm', length: 5, caseInsensitive: false },
  { pattern: 'دقيقة', suffix: 'm', length: 5, caseInsensitive: false },
  { pattern: 'ثواني', suffix: 's', length: 5, caseInsensitive: false },
  { pattern: 'ثانية', suffix: 's', length: 5, caseInsensitive: false },
  { pattern: 'ساعات', suffix: 'h', length: 5, caseInsensitive: false },
  { pattern: 'ساعة', suffix: 'h', length: 4, caseInsensitive: false },
];
|
|
319
|
+
|
|
320
|
+
// =============================================================================
|
|
321
|
+
// Arabic Tokenizer Implementation
|
|
322
|
+
// =============================================================================
|
|
323
|
+
|
|
324
|
+
/**
 * Tokenizer for Arabic input.
 *
 * Scans the input left to right (string order, not visual order) and emits
 * selectors, string literals, URLs, numbers with optional time units,
 * variable references, prepositions, attached proclitics, Arabic words and
 * ASCII identifiers. Characters that match none of these are skipped.
 */
export class ArabicTokenizer extends BaseTokenizer {
  readonly language = 'ar';
  readonly direction = 'rtl' as const;

  /**
   * PREPOSITIONS ordered longest-first, built on first use by tryPreposition
   * so the copy-and-sort is not repeated at every token position.
   * NOTE(review): assumes PREPOSITIONS is not mutated after first use - confirm.
   */
  private prepositionsByLength: readonly string[] | null = null;

  constructor() {
    super();
    this.initializeKeywordsFromProfile(arabicProfile, ARABIC_EXTRAS);
    // Set morphological normalizer for prefix/suffix stripping
    this.normalizer = new ArabicMorphologicalNormalizer();
  }

  /**
   * Split `input` into a token stream.
   *
   * At each non-whitespace position the recognisers are tried in a fixed
   * order; the first one that yields a token wins and scanning resumes at
   * that token's end position.
   *
   * @param input - Raw source text.
   * @returns A TokenStreamImpl over the collected tokens, tagged `'ar'`.
   */
  tokenize(input: string): TokenStream {
    const tokens: LanguageToken[] = [];
    let pos = 0;

    while (pos < input.length) {
      // Skip whitespace
      if (isWhitespace(input[pos])) {
        pos++;
        continue;
      }

      // Try CSS selector first (LTR island in RTL text)
      if (isSelectorStart(input[pos])) {
        // Event modifiers (.once, .debounce(), etc.) share the leading
        // character with selectors, so they are checked first.
        const modifierToken = this.tryEventModifier(input, pos);
        if (modifierToken) {
          tokens.push(modifierToken);
          pos = modifierToken.position.end;
          continue;
        }

        const selectorToken = this.trySelector(input, pos);
        if (selectorToken) {
          tokens.push(selectorToken);
          pos = selectorToken.position.end;
          continue;
        }
      }

      // Try string literal
      if (isQuote(input[pos])) {
        const stringToken = this.tryString(input, pos);
        if (stringToken) {
          tokens.push(stringToken);
          pos = stringToken.position.end;
          continue;
        }
      }

      // Try URL (/path, ./path, http://, etc.)
      if (isUrlStart(input, pos)) {
        const urlToken = this.tryUrl(input, pos);
        if (urlToken) {
          tokens.push(urlToken);
          pos = urlToken.position.end;
          continue;
        }
      }

      // Try number (optionally followed by an Arabic time unit)
      if (isDigit(input[pos])) {
        const numberToken = this.extractArabicNumber(input, pos);
        if (numberToken) {
          tokens.push(numberToken);
          pos = numberToken.position.end;
          continue;
        }
      }

      // Try variable reference (:varname)
      const varToken = this.tryVariableRef(input, pos);
      if (varToken) {
        tokens.push(varToken);
        pos = varToken.position.end;
        continue;
      }

      // Try Arabic preposition (longest spelling first)
      const prepToken = this.tryPreposition(input, pos);
      if (prepToken) {
        tokens.push(prepToken);
        pos = prepToken.position.end;
        continue;
      }

      // Try Arabic word (with proclitic detection)
      if (isArabic(input[pos])) {
        // An attached proclitic is emitted as its own token; the rest of the
        // word is picked up by the next loop iteration.
        const procliticResult = this.tryProclitic(input, pos);
        if (procliticResult) {
          tokens.push(procliticResult.conjunction);
          pos = procliticResult.conjunction.position.end;
          continue;
        }

        const wordToken = this.extractArabicWord(input, pos);
        if (wordToken) {
          tokens.push(wordToken);
          pos = wordToken.position.end;
          continue;
        }
      }

      // Try ASCII word (for mixed content)
      if (isAsciiIdentifierChar(input[pos])) {
        const asciiToken = this.extractAsciiWord(input, pos);
        if (asciiToken) {
          tokens.push(asciiToken);
          pos = asciiToken.position.end;
          continue;
        }
      }

      // Skip unknown character
      pos++;
    }

    return new TokenStreamImpl(tokens, 'ar');
  }

  /**
   * Classify a single token string.
   *
   * Checks run in order: preposition -> keyword -> selector prefix
   * (`#`, `.`, `[`) -> quoted string -> leading digit; anything else is an
   * identifier.
   */
  classifyToken(token: string): TokenKind {
    if (PREPOSITIONS.has(token)) return 'particle';
    if (this.isKeyword(token)) return 'keyword';
    if (token.startsWith('#') || token.startsWith('.') || token.startsWith('[')) return 'selector';
    if (token.startsWith('"') || token.startsWith("'")) return 'literal';
    if (/^\d/.test(token)) return 'literal';

    return 'identifier';
  }

  /**
   * Try to match a standalone Arabic preposition at `pos`.
   *
   * Candidates are tried longest-first. A candidate only matches when it is
   * followed by end of input, whitespace, or a non-Arabic character, so a
   * preposition that is merely the prefix of a longer Arabic word is not
   * split off here.
   *
   * @returns A `'particle'` token carrying `metadata.prepositionValue`, or
   *   `null` when no preposition starts at `pos`.
   */
  private tryPreposition(input: string, pos: number): LanguageToken | null {
    if (this.prepositionsByLength === null) {
      this.prepositionsByLength = Array.from(PREPOSITIONS).sort((a, b) => b.length - a.length);
    }

    for (const prep of this.prepositionsByLength) {
      if (!input.startsWith(prep, pos)) continue;

      // Check that it's a standalone word (followed by space or non-Arabic)
      const nextPos = pos + prep.length;
      if (nextPos >= input.length || isWhitespace(input[nextPos]) || !isArabic(input[nextPos])) {
        const token = createToken(prep, 'particle', createPosition(pos, nextPos));
        // Attach metadata for preposition disambiguation
        return {
          ...token,
          metadata: {
            prepositionValue: prep,
          },
        };
      }
    }
    return null;
  }

  /**
   * Try to split off a proclitic (conjunction or preposition) that is
   * attached to the word starting at `pos`.
   *
   * Arabic proclitics attach directly to words without a space:
   * - والنقر → و + النقر (and + the-click)
   * - فالتبديل → ف + التبديل (then + the-toggle)
   * - بالنقر → ب + النقر (with + the-click)
   * - ولالنقر → و + ل + النقر (and + to + the-click)
   *
   * Two-character proclitics are preferred over one-character ones, except
   * when stripping only the first character would leave a keyword
   * (e.g. "وبدل" is "و" + "بدل", not "وب" + "دل").
   *
   * Returns null if:
   * - the full word is itself a keyword, temporal marker or preposition
   *   (e.g. بدل must NOT be split into ب + دل)
   * - the text at `pos` is not a known proclitic
   * - fewer than 2 Arabic characters follow the proclitic
   *
   * Only the proclitic token is returned; the caller re-scans the remainder.
   *
   * @see NATIVE_REVIEW_NEEDED.md for implementation rationale
   */
  private tryProclitic(input: string, pos: number): { conjunction: LanguageToken } | null {
    // CRITICAL: inspect the whole word BEFORE splitting anything off.
    let wordEnd = pos;
    while (wordEnd < input.length && (isArabic(input[wordEnd]) || input[wordEnd] === 'ـ')) {
      wordEnd++;
    }
    const fullWord = input.slice(pos, wordEnd);

    if (
      this.lookupKeyword(fullWord) ||
      TEMPORAL_MARKERS.has(fullWord) ||
      PREPOSITIONS.has(fullWord)
    ) {
      return null; // Let extractArabicWord handle it
    }

    const char = input[pos];
    const entry = PROCLITICS.get(char);

    // Try multi-character proclitics first (ول, وب, فل, فب, etc.)
    if (pos + 2 <= input.length) {
      const twoChar = input.slice(pos, pos + 2);
      const twoCharEntry = PROCLITICS.get(twoChar);
      const afterTwo = pos + 2;

      // Require at least 2 Arabic characters after the proclitic.
      if (twoCharEntry && this.countArabicRun(input, afterTwo) >= 2) {
        // If the one-character split leaves a keyword, prefer that split and
        // fall through to the single-character handling below.
        const singleLeavesKeyword =
          Boolean(entry) && Boolean(this.lookupKeyword(input.slice(pos + 1, wordEnd)));
        if (!singleLeavesKeyword) {
          return this.buildProcliticResult(twoChar, twoCharEntry, pos, afterTwo);
        }
      }
    }

    // Try single-character proclitics
    if (!entry) return null;

    // The proclitic must be attached to at least 2 Arabic characters
    // (e.g. وو could be a typo, and short roots need protection).
    const nextPos = pos + 1;
    if (this.countArabicRun(input, nextPos) < 2) {
      return null;
    }

    return this.buildProcliticResult(char, entry, pos, nextPos);
  }

  /**
   * Count consecutive characters starting at `from` for which `isArabic`
   * holds. Returns 0 when `from` is at or past the end of `input`.
   */
  private countArabicRun(input: string, from: number): number {
    let end = from;
    while (end < input.length && isArabic(input[end])) {
      end++;
    }
    return end - from;
  }

  /**
   * Wrap a matched proclitic in the result shape returned by tryProclitic.
   * Entries whose `type` is `'conjunction'` become `'conjunction'` tokens;
   * every other entry becomes a `'particle'` token.
   */
  private buildProcliticResult(
    text: string,
    entry: NonNullable<ReturnType<typeof PROCLITICS.get>>,
    start: number,
    end: number
  ): { conjunction: LanguageToken } {
    const tokenKind =
      entry.type === 'conjunction' ? ('conjunction' as const) : ('particle' as const);
    return {
      conjunction: createToken(text, tokenKind, createPosition(start, end), entry.normalized),
    };
  }

  /**
   * Extract the Arabic word starting at `startPos` and classify it.
   *
   * Classification order: temporal marker (keyword with formality and
   * confidence metadata) -> exact keyword -> preposition (particle with
   * `prepositionValue` metadata) -> morphological keyword match ->
   * plain identifier.
   *
   * @returns The token, or `null` when no Arabic character is at `startPos`.
   */
  private extractArabicWord(input: string, startPos: number): LanguageToken | null {
    // NOTE(review): this loop has no effect - attached prefixes are detected
    // but intentionally left inside the word for now. It is kept so that
    // ATTACHED_PREFIXES remains referenced; remove both together.
    for (const prefix of ATTACHED_PREFIXES) {
      const basePrefix = prefix.replace('ـ', '');
      if (input.slice(startPos, startPos + basePrefix.length) === basePrefix) {
        // This is a prefix - extract it separately
        // For now, include it in the word
      }
    }

    // Consume Arabic characters (tatweel is accepted explicitly).
    let pos = startPos;
    while (pos < input.length && (isArabic(input[pos]) || input[pos] === 'ـ')) {
      pos++;
    }
    if (pos === startPos) return null;
    const word = input.slice(startPos, pos);

    // Check if it's a temporal marker (with formality metadata)
    const temporalMarker = TEMPORAL_MARKERS.get(word);
    if (temporalMarker) {
      const token = createToken(
        word,
        'keyword',
        createPosition(startPos, pos),
        temporalMarker.normalized
      );
      return {
        ...token,
        metadata: {
          temporalFormality: temporalMarker.formality,
          temporalConfidence: temporalMarker.confidence,
        },
      };
    }

    // Exact keyword lookup
    const keywordEntry = this.lookupKeyword(word);
    if (keywordEntry) {
      return createToken(word, 'keyword', createPosition(startPos, pos), keywordEntry.normalized);
    }

    // Check if it's a preposition (with metadata for disambiguation)
    if (PREPOSITIONS.has(word)) {
      const token = createToken(word, 'particle', createPosition(startPos, pos));
      return {
        ...token,
        metadata: {
          prepositionValue: word,
        },
      };
    }

    // Try morphological normalization for conjugated/inflected forms
    const morphToken = this.tryMorphKeywordMatch(word, startPos, pos);
    if (morphToken) return morphToken;

    // Not a keyword or recognized form, return as identifier
    return createToken(word, 'identifier', createPosition(startPos, pos));
  }

  /**
   * Extract a run of ASCII identifier characters starting at `startPos`.
   *
   * @returns An `'identifier'` token, or `null` when the run is empty.
   */
  private extractAsciiWord(input: string, startPos: number): LanguageToken | null {
    let pos = startPos;
    while (pos < input.length && isAsciiIdentifierChar(input[pos])) {
      pos++;
    }

    if (pos === startPos) return null;

    return createToken(input.slice(startPos, pos), 'identifier', createPosition(startPos, pos));
  }

  /**
   * Extract a number, including an optional Arabic time-unit suffix.
   * Delegates to `tryNumberWithTimeUnits` with sign parsing disabled and
   * whitespace skipping enabled (Arabic allows a space between number and unit).
   */
  private extractArabicNumber(input: string, startPos: number): LanguageToken | null {
    return this.tryNumberWithTimeUnits(input, startPos, ARABIC_TIME_UNITS, {
      allowSign: false,
      skipWhitespace: true,
    });
  }
}
|
|
719
|
+
|
|
720
|
+
/**
 * Shared module-level ArabicTokenizer, constructed once when this module is
 * first evaluated.
 */
export const arabicTokenizer = new ArabicTokenizer();
|