@lokascript/semantic 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +686 -0
- package/dist/browser-ar.ar.global.js +2 -0
- package/dist/browser-core.core.global.js +2 -0
- package/dist/browser-de.de.global.js +2 -0
- package/dist/browser-east-asian.east-asian.global.js +2 -0
- package/dist/browser-en-tr.en-tr.global.js +2 -0
- package/dist/browser-en.en.global.js +2 -0
- package/dist/browser-es-en.es-en.global.js +2 -0
- package/dist/browser-es.es.global.js +2 -0
- package/dist/browser-fr.fr.global.js +2 -0
- package/dist/browser-id.id.global.js +2 -0
- package/dist/browser-ja.ja.global.js +2 -0
- package/dist/browser-ko.ko.global.js +2 -0
- package/dist/browser-lazy.lazy.global.js +2 -0
- package/dist/browser-priority.priority.global.js +2 -0
- package/dist/browser-pt.pt.global.js +2 -0
- package/dist/browser-qu.qu.global.js +2 -0
- package/dist/browser-sw.sw.global.js +2 -0
- package/dist/browser-tr.tr.global.js +2 -0
- package/dist/browser-western.western.global.js +2 -0
- package/dist/browser-zh.zh.global.js +2 -0
- package/dist/browser.global.js +3 -0
- package/dist/browser.global.js.map +1 -0
- package/dist/index.cjs +35051 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +3426 -0
- package/dist/index.d.ts +3426 -0
- package/dist/index.js +34890 -0
- package/dist/index.js.map +1 -0
- package/dist/languages/ar.d.ts +78 -0
- package/dist/languages/ar.js +1622 -0
- package/dist/languages/ar.js.map +1 -0
- package/dist/languages/de.d.ts +38 -0
- package/dist/languages/de.js +1168 -0
- package/dist/languages/de.js.map +1 -0
- package/dist/languages/en.d.ts +44 -0
- package/dist/languages/en.js +3491 -0
- package/dist/languages/en.js.map +1 -0
- package/dist/languages/es.d.ts +52 -0
- package/dist/languages/es.js +1493 -0
- package/dist/languages/es.js.map +1 -0
- package/dist/languages/fr.d.ts +37 -0
- package/dist/languages/fr.js +1159 -0
- package/dist/languages/fr.js.map +1 -0
- package/dist/languages/id.d.ts +35 -0
- package/dist/languages/id.js +1152 -0
- package/dist/languages/id.js.map +1 -0
- package/dist/languages/ja.d.ts +53 -0
- package/dist/languages/ja.js +1430 -0
- package/dist/languages/ja.js.map +1 -0
- package/dist/languages/ko.d.ts +51 -0
- package/dist/languages/ko.js +1729 -0
- package/dist/languages/ko.js.map +1 -0
- package/dist/languages/pt.d.ts +37 -0
- package/dist/languages/pt.js +1127 -0
- package/dist/languages/pt.js.map +1 -0
- package/dist/languages/qu.d.ts +36 -0
- package/dist/languages/qu.js +1143 -0
- package/dist/languages/qu.js.map +1 -0
- package/dist/languages/sw.d.ts +35 -0
- package/dist/languages/sw.js +1147 -0
- package/dist/languages/sw.js.map +1 -0
- package/dist/languages/tr.d.ts +45 -0
- package/dist/languages/tr.js +1529 -0
- package/dist/languages/tr.js.map +1 -0
- package/dist/languages/zh.d.ts +58 -0
- package/dist/languages/zh.js +1257 -0
- package/dist/languages/zh.js.map +1 -0
- package/dist/types-C4dcj53L.d.ts +600 -0
- package/package.json +202 -0
- package/src/__test-utils__/index.ts +7 -0
- package/src/__test-utils__/test-helpers.ts +8 -0
- package/src/__types__/test-helpers.ts +122 -0
- package/src/analysis/index.ts +479 -0
- package/src/ast-builder/command-mappers.ts +1133 -0
- package/src/ast-builder/expression-parser/index.ts +41 -0
- package/src/ast-builder/expression-parser/parser.ts +563 -0
- package/src/ast-builder/expression-parser/tokenizer.ts +394 -0
- package/src/ast-builder/expression-parser/types.ts +208 -0
- package/src/ast-builder/index.ts +536 -0
- package/src/ast-builder/value-converters.ts +172 -0
- package/src/bridge.ts +275 -0
- package/src/browser-ar.ts +162 -0
- package/src/browser-core.ts +231 -0
- package/src/browser-de.ts +162 -0
- package/src/browser-east-asian.ts +173 -0
- package/src/browser-en-tr.ts +165 -0
- package/src/browser-en.ts +157 -0
- package/src/browser-es-en.ts +200 -0
- package/src/browser-es.ts +170 -0
- package/src/browser-fr.ts +162 -0
- package/src/browser-id.ts +162 -0
- package/src/browser-ja.ts +162 -0
- package/src/browser-ko.ts +162 -0
- package/src/browser-lazy.ts +189 -0
- package/src/browser-priority.ts +214 -0
- package/src/browser-pt.ts +162 -0
- package/src/browser-qu.ts +162 -0
- package/src/browser-sw.ts +162 -0
- package/src/browser-tr.ts +162 -0
- package/src/browser-western.ts +181 -0
- package/src/browser-zh.ts +162 -0
- package/src/browser.ts +268 -0
- package/src/cache/index.ts +14 -0
- package/src/cache/semantic-cache.ts +344 -0
- package/src/core-bridge.ts +372 -0
- package/src/explicit/converter.ts +258 -0
- package/src/explicit/index.ts +18 -0
- package/src/explicit/parser.ts +236 -0
- package/src/explicit/renderer.ts +424 -0
- package/src/generators/command-schemas.ts +1636 -0
- package/src/generators/event-handler-generator.ts +109 -0
- package/src/generators/index.ts +117 -0
- package/src/generators/language-profiles.ts +139 -0
- package/src/generators/pattern-generator.ts +537 -0
- package/src/generators/profiles/arabic.ts +131 -0
- package/src/generators/profiles/bengali.ts +132 -0
- package/src/generators/profiles/chinese.ts +124 -0
- package/src/generators/profiles/english.ts +113 -0
- package/src/generators/profiles/french.ts +125 -0
- package/src/generators/profiles/german.ts +126 -0
- package/src/generators/profiles/hindi.ts +146 -0
- package/src/generators/profiles/index.ts +46 -0
- package/src/generators/profiles/indonesian.ts +125 -0
- package/src/generators/profiles/italian.ts +139 -0
- package/src/generators/profiles/japanese.ts +149 -0
- package/src/generators/profiles/korean.ts +127 -0
- package/src/generators/profiles/marker-templates.ts +288 -0
- package/src/generators/profiles/ms.ts +130 -0
- package/src/generators/profiles/polish.ts +249 -0
- package/src/generators/profiles/portuguese.ts +115 -0
- package/src/generators/profiles/quechua.ts +113 -0
- package/src/generators/profiles/russian.ts +260 -0
- package/src/generators/profiles/spanish.ts +130 -0
- package/src/generators/profiles/swahili.ts +129 -0
- package/src/generators/profiles/thai.ts +132 -0
- package/src/generators/profiles/tl.ts +128 -0
- package/src/generators/profiles/turkish.ts +124 -0
- package/src/generators/profiles/types.ts +165 -0
- package/src/generators/profiles/ukrainian.ts +270 -0
- package/src/generators/profiles/vietnamese.ts +133 -0
- package/src/generators/schema-error-codes.ts +160 -0
- package/src/generators/schema-validator.ts +391 -0
- package/src/index.ts +429 -0
- package/src/language-building-schema.ts +3170 -0
- package/src/language-loader.ts +394 -0
- package/src/languages/_all.ts +65 -0
- package/src/languages/ar.ts +15 -0
- package/src/languages/bn.ts +16 -0
- package/src/languages/de.ts +15 -0
- package/src/languages/en.ts +29 -0
- package/src/languages/es.ts +15 -0
- package/src/languages/fr.ts +15 -0
- package/src/languages/hi.ts +26 -0
- package/src/languages/id.ts +15 -0
- package/src/languages/index.ts +18 -0
- package/src/languages/it.ts +15 -0
- package/src/languages/ja.ts +15 -0
- package/src/languages/ko.ts +15 -0
- package/src/languages/ms.ts +16 -0
- package/src/languages/pl.ts +18 -0
- package/src/languages/pt.ts +15 -0
- package/src/languages/qu.ts +15 -0
- package/src/languages/ru.ts +26 -0
- package/src/languages/sw.ts +15 -0
- package/src/languages/th.ts +16 -0
- package/src/languages/tl.ts +16 -0
- package/src/languages/tr.ts +15 -0
- package/src/languages/uk.ts +26 -0
- package/src/languages/vi.ts +16 -0
- package/src/languages/zh.ts +15 -0
- package/src/parser/index.ts +15 -0
- package/src/parser/pattern-matcher.ts +1181 -0
- package/src/parser/semantic-parser.ts +573 -0
- package/src/parser/utils/index.ts +35 -0
- package/src/parser/utils/marker-resolution.ts +111 -0
- package/src/parser/utils/possessive-keywords.ts +43 -0
- package/src/parser/utils/role-positioning.ts +70 -0
- package/src/parser/utils/type-validation.ts +134 -0
- package/src/patterns/add/ar.ts +71 -0
- package/src/patterns/add/bn.ts +70 -0
- package/src/patterns/add/hi.ts +69 -0
- package/src/patterns/add/index.ts +87 -0
- package/src/patterns/add/it.ts +61 -0
- package/src/patterns/add/ja.ts +93 -0
- package/src/patterns/add/ko.ts +74 -0
- package/src/patterns/add/ms.ts +30 -0
- package/src/patterns/add/pl.ts +62 -0
- package/src/patterns/add/ru.ts +62 -0
- package/src/patterns/add/th.ts +49 -0
- package/src/patterns/add/tl.ts +30 -0
- package/src/patterns/add/tr.ts +71 -0
- package/src/patterns/add/uk.ts +62 -0
- package/src/patterns/add/vi.ts +61 -0
- package/src/patterns/add/zh.ts +71 -0
- package/src/patterns/builders.ts +207 -0
- package/src/patterns/decrement/bn.ts +70 -0
- package/src/patterns/decrement/de.ts +42 -0
- package/src/patterns/decrement/hi.ts +68 -0
- package/src/patterns/decrement/index.ts +79 -0
- package/src/patterns/decrement/it.ts +69 -0
- package/src/patterns/decrement/ms.ts +30 -0
- package/src/patterns/decrement/pl.ts +58 -0
- package/src/patterns/decrement/ru.ts +58 -0
- package/src/patterns/decrement/th.ts +49 -0
- package/src/patterns/decrement/tl.ts +30 -0
- package/src/patterns/decrement/tr.ts +48 -0
- package/src/patterns/decrement/uk.ts +58 -0
- package/src/patterns/decrement/vi.ts +61 -0
- package/src/patterns/decrement/zh.ts +32 -0
- package/src/patterns/en.ts +302 -0
- package/src/patterns/event-handler/ar.ts +151 -0
- package/src/patterns/event-handler/bn.ts +72 -0
- package/src/patterns/event-handler/de.ts +117 -0
- package/src/patterns/event-handler/en.ts +117 -0
- package/src/patterns/event-handler/es.ts +136 -0
- package/src/patterns/event-handler/fr.ts +117 -0
- package/src/patterns/event-handler/hi.ts +64 -0
- package/src/patterns/event-handler/id.ts +117 -0
- package/src/patterns/event-handler/index.ts +119 -0
- package/src/patterns/event-handler/it.ts +54 -0
- package/src/patterns/event-handler/ja.ts +118 -0
- package/src/patterns/event-handler/ko.ts +133 -0
- package/src/patterns/event-handler/ms.ts +30 -0
- package/src/patterns/event-handler/pl.ts +62 -0
- package/src/patterns/event-handler/pt.ts +117 -0
- package/src/patterns/event-handler/qu.ts +66 -0
- package/src/patterns/event-handler/ru.ts +62 -0
- package/src/patterns/event-handler/shared.ts +270 -0
- package/src/patterns/event-handler/sw.ts +117 -0
- package/src/patterns/event-handler/th.ts +53 -0
- package/src/patterns/event-handler/tl.ts +30 -0
- package/src/patterns/event-handler/tr.ts +170 -0
- package/src/patterns/event-handler/uk.ts +62 -0
- package/src/patterns/event-handler/vi.ts +61 -0
- package/src/patterns/event-handler/zh.ts +150 -0
- package/src/patterns/get/ar.ts +49 -0
- package/src/patterns/get/bn.ts +47 -0
- package/src/patterns/get/de.ts +32 -0
- package/src/patterns/get/hi.ts +52 -0
- package/src/patterns/get/index.ts +83 -0
- package/src/patterns/get/it.ts +56 -0
- package/src/patterns/get/ja.ts +53 -0
- package/src/patterns/get/ko.ts +53 -0
- package/src/patterns/get/ms.ts +30 -0
- package/src/patterns/get/pl.ts +57 -0
- package/src/patterns/get/ru.ts +57 -0
- package/src/patterns/get/th.ts +29 -0
- package/src/patterns/get/tl.ts +30 -0
- package/src/patterns/get/uk.ts +57 -0
- package/src/patterns/get/vi.ts +48 -0
- package/src/patterns/grammar-transformed/index.ts +39 -0
- package/src/patterns/grammar-transformed/ja.ts +1713 -0
- package/src/patterns/grammar-transformed/ko.ts +1311 -0
- package/src/patterns/grammar-transformed/tr.ts +1067 -0
- package/src/patterns/hide/ar.ts +67 -0
- package/src/patterns/hide/bn.ts +47 -0
- package/src/patterns/hide/de.ts +36 -0
- package/src/patterns/hide/hi.ts +61 -0
- package/src/patterns/hide/index.ts +91 -0
- package/src/patterns/hide/it.ts +56 -0
- package/src/patterns/hide/ja.ts +69 -0
- package/src/patterns/hide/ko.ts +69 -0
- package/src/patterns/hide/ms.ts +30 -0
- package/src/patterns/hide/pl.ts +57 -0
- package/src/patterns/hide/ru.ts +57 -0
- package/src/patterns/hide/th.ts +29 -0
- package/src/patterns/hide/tl.ts +30 -0
- package/src/patterns/hide/tr.ts +65 -0
- package/src/patterns/hide/uk.ts +57 -0
- package/src/patterns/hide/vi.ts +56 -0
- package/src/patterns/hide/zh.ts +68 -0
- package/src/patterns/increment/bn.ts +70 -0
- package/src/patterns/increment/de.ts +36 -0
- package/src/patterns/increment/hi.ts +68 -0
- package/src/patterns/increment/index.ts +79 -0
- package/src/patterns/increment/it.ts +69 -0
- package/src/patterns/increment/ms.ts +30 -0
- package/src/patterns/increment/pl.ts +58 -0
- package/src/patterns/increment/ru.ts +58 -0
- package/src/patterns/increment/th.ts +49 -0
- package/src/patterns/increment/tl.ts +30 -0
- package/src/patterns/increment/tr.ts +52 -0
- package/src/patterns/increment/uk.ts +58 -0
- package/src/patterns/increment/vi.ts +61 -0
- package/src/patterns/increment/zh.ts +32 -0
- package/src/patterns/index.ts +84 -0
- package/src/patterns/languages/en/control-flow.ts +93 -0
- package/src/patterns/languages/en/fetch.ts +62 -0
- package/src/patterns/languages/en/index.ts +42 -0
- package/src/patterns/languages/en/repeat.ts +67 -0
- package/src/patterns/languages/en/set.ts +48 -0
- package/src/patterns/languages/en/swap.ts +38 -0
- package/src/patterns/languages/en/temporal.ts +57 -0
- package/src/patterns/put/ar.ts +74 -0
- package/src/patterns/put/bn.ts +53 -0
- package/src/patterns/put/en.ts +74 -0
- package/src/patterns/put/es.ts +74 -0
- package/src/patterns/put/hi.ts +69 -0
- package/src/patterns/put/id.ts +96 -0
- package/src/patterns/put/index.ts +99 -0
- package/src/patterns/put/it.ts +56 -0
- package/src/patterns/put/ja.ts +75 -0
- package/src/patterns/put/ko.ts +67 -0
- package/src/patterns/put/ms.ts +30 -0
- package/src/patterns/put/pl.ts +81 -0
- package/src/patterns/put/ru.ts +85 -0
- package/src/patterns/put/th.ts +32 -0
- package/src/patterns/put/tl.ts +30 -0
- package/src/patterns/put/tr.ts +67 -0
- package/src/patterns/put/uk.ts +85 -0
- package/src/patterns/put/vi.ts +72 -0
- package/src/patterns/put/zh.ts +62 -0
- package/src/patterns/registry.ts +163 -0
- package/src/patterns/remove/ar.ts +71 -0
- package/src/patterns/remove/bn.ts +68 -0
- package/src/patterns/remove/hi.ts +69 -0
- package/src/patterns/remove/index.ts +87 -0
- package/src/patterns/remove/it.ts +69 -0
- package/src/patterns/remove/ja.ts +74 -0
- package/src/patterns/remove/ko.ts +78 -0
- package/src/patterns/remove/ms.ts +30 -0
- package/src/patterns/remove/pl.ts +62 -0
- package/src/patterns/remove/ru.ts +62 -0
- package/src/patterns/remove/th.ts +49 -0
- package/src/patterns/remove/tl.ts +30 -0
- package/src/patterns/remove/tr.ts +78 -0
- package/src/patterns/remove/uk.ts +62 -0
- package/src/patterns/remove/vi.ts +61 -0
- package/src/patterns/remove/zh.ts +72 -0
- package/src/patterns/set/ar.ts +84 -0
- package/src/patterns/set/bn.ts +53 -0
- package/src/patterns/set/de.ts +84 -0
- package/src/patterns/set/es.ts +92 -0
- package/src/patterns/set/fr.ts +88 -0
- package/src/patterns/set/hi.ts +56 -0
- package/src/patterns/set/id.ts +84 -0
- package/src/patterns/set/index.ts +107 -0
- package/src/patterns/set/it.ts +56 -0
- package/src/patterns/set/ja.ts +86 -0
- package/src/patterns/set/ko.ts +85 -0
- package/src/patterns/set/ms.ts +30 -0
- package/src/patterns/set/pl.ts +57 -0
- package/src/patterns/set/pt.ts +84 -0
- package/src/patterns/set/ru.ts +57 -0
- package/src/patterns/set/th.ts +31 -0
- package/src/patterns/set/tl.ts +30 -0
- package/src/patterns/set/tr.ts +107 -0
- package/src/patterns/set/uk.ts +57 -0
- package/src/patterns/set/vi.ts +53 -0
- package/src/patterns/set/zh.ts +84 -0
- package/src/patterns/show/ar.ts +67 -0
- package/src/patterns/show/bn.ts +47 -0
- package/src/patterns/show/de.ts +32 -0
- package/src/patterns/show/fr.ts +32 -0
- package/src/patterns/show/hi.ts +61 -0
- package/src/patterns/show/index.ts +95 -0
- package/src/patterns/show/it.ts +56 -0
- package/src/patterns/show/ja.ts +69 -0
- package/src/patterns/show/ko.ts +73 -0
- package/src/patterns/show/ms.ts +30 -0
- package/src/patterns/show/pl.ts +57 -0
- package/src/patterns/show/ru.ts +57 -0
- package/src/patterns/show/th.ts +29 -0
- package/src/patterns/show/tl.ts +30 -0
- package/src/patterns/show/tr.ts +65 -0
- package/src/patterns/show/uk.ts +57 -0
- package/src/patterns/show/vi.ts +56 -0
- package/src/patterns/show/zh.ts +68 -0
- package/src/patterns/take/ar.ts +51 -0
- package/src/patterns/take/index.ts +31 -0
- package/src/patterns/toggle/ar.ts +61 -0
- package/src/patterns/toggle/bn.ts +70 -0
- package/src/patterns/toggle/en.ts +61 -0
- package/src/patterns/toggle/es.ts +61 -0
- package/src/patterns/toggle/hi.ts +80 -0
- package/src/patterns/toggle/index.ts +95 -0
- package/src/patterns/toggle/it.ts +69 -0
- package/src/patterns/toggle/ja.ts +156 -0
- package/src/patterns/toggle/ko.ts +113 -0
- package/src/patterns/toggle/ms.ts +30 -0
- package/src/patterns/toggle/pl.ts +62 -0
- package/src/patterns/toggle/ru.ts +62 -0
- package/src/patterns/toggle/th.ts +50 -0
- package/src/patterns/toggle/tl.ts +30 -0
- package/src/patterns/toggle/tr.ts +88 -0
- package/src/patterns/toggle/uk.ts +62 -0
- package/src/patterns/toggle/vi.ts +61 -0
- package/src/patterns/toggle/zh.ts +99 -0
- package/src/public-api.ts +286 -0
- package/src/registry.ts +441 -0
- package/src/tokenizers/arabic.ts +723 -0
- package/src/tokenizers/base.ts +1300 -0
- package/src/tokenizers/bengali.ts +289 -0
- package/src/tokenizers/chinese.ts +481 -0
- package/src/tokenizers/english.ts +416 -0
- package/src/tokenizers/french.ts +326 -0
- package/src/tokenizers/german.ts +324 -0
- package/src/tokenizers/hindi.ts +319 -0
- package/src/tokenizers/index.ts +127 -0
- package/src/tokenizers/indonesian.ts +306 -0
- package/src/tokenizers/italian.ts +458 -0
- package/src/tokenizers/japanese.ts +447 -0
- package/src/tokenizers/korean.ts +642 -0
- package/src/tokenizers/morphology/arabic-normalizer.ts +242 -0
- package/src/tokenizers/morphology/french-normalizer.ts +268 -0
- package/src/tokenizers/morphology/german-normalizer.ts +256 -0
- package/src/tokenizers/morphology/index.ts +46 -0
- package/src/tokenizers/morphology/italian-normalizer.ts +329 -0
- package/src/tokenizers/morphology/japanese-normalizer.ts +288 -0
- package/src/tokenizers/morphology/korean-normalizer.ts +428 -0
- package/src/tokenizers/morphology/polish-normalizer.ts +264 -0
- package/src/tokenizers/morphology/portuguese-normalizer.ts +310 -0
- package/src/tokenizers/morphology/spanish-normalizer.ts +327 -0
- package/src/tokenizers/morphology/turkish-normalizer.ts +412 -0
- package/src/tokenizers/morphology/types.ts +211 -0
- package/src/tokenizers/ms.ts +198 -0
- package/src/tokenizers/polish.ts +354 -0
- package/src/tokenizers/portuguese.ts +304 -0
- package/src/tokenizers/quechua.ts +339 -0
- package/src/tokenizers/russian.ts +375 -0
- package/src/tokenizers/spanish.ts +403 -0
- package/src/tokenizers/swahili.ts +303 -0
- package/src/tokenizers/thai.ts +236 -0
- package/src/tokenizers/tl.ts +198 -0
- package/src/tokenizers/turkish.ts +411 -0
- package/src/tokenizers/ukrainian.ts +369 -0
- package/src/tokenizers/vietnamese.ts +410 -0
- package/src/types/grammar-types.ts +617 -0
- package/src/types/unified-profile.ts +267 -0
- package/src/types.ts +709 -0
- package/src/utils/confidence-calculator.ts +147 -0
- package/src/validators/command-validator.ts +380 -0
- package/src/validators/index.ts +15 -0
|
@@ -0,0 +1,1622 @@
|
|
|
1
|
+
// src/registry.ts
|
|
2
|
+
/** Tokenizers that have been registered, keyed by language code. */
var tokenizers = /* @__PURE__ */ new Map();
/** Language profiles that have been registered, keyed by language code. */
var profiles = /* @__PURE__ */ new Map();
/** Per-language cache; the entry for a code is dropped whenever that code is registered. */
var patternCache = /* @__PURE__ */ new Map();

/**
 * Register (or re-register) a language.
 *
 * Stores the tokenizer and the profile under `code`, replacing any earlier
 * registration, and discards the `patternCache` entry for that code.
 *
 * @param {string} code - Language code used as the key in all three maps.
 * @param {*} tokenizer - Tokenizer to associate with the language.
 * @param {*} profile - Profile to associate with the language.
 * @returns {void}
 */
function registerLanguage(code, tokenizer, profile) {
  profiles.set(code, profile);
  tokenizers.set(code, tokenizer);
  // Drop whatever was cached for this code under a previous registration.
  patternCache.delete(code);
}
|
|
10
|
+
|
|
11
|
+
// src/tokenizers/base.ts
|
|
12
|
+
/**
 * Cursor over an array of tokens with lookahead and backtracking support.
 */
var TokenStreamImpl = class {
  /**
   * @param {Array} tokens - Tokens to iterate over (stored by reference).
   * @param {*} language - Language identifier kept on the stream.
   */
  constructor(tokens, language) {
    this.pos = 0;
    this.tokens = tokens;
    this.language = language;
  }

  /**
   * Look at a token relative to the cursor without consuming it.
   *
   * @param {number} [offset=0] - Distance from the current position.
   * @returns {*} The token, or null when the index is outside the array.
   */
  peek(offset = 0) {
    const target = this.pos + offset;
    const outOfRange = target < 0 || target >= this.tokens.length;
    return outOfRange ? null : this.tokens[target];
  }

  /**
   * Consume and return the current token.
   *
   * @returns {*} The token that was at the cursor.
   * @throws {Error} When the stream is already exhausted.
   */
  advance() {
    if (this.isAtEnd()) {
      throw new Error("Unexpected end of token stream");
    }
    const current = this.tokens[this.pos];
    this.pos++;
    return current;
  }

  /** @returns {boolean} True once the cursor has passed the last token. */
  isAtEnd() {
    return this.pos >= this.tokens.length;
  }

  /** @returns {{position: number}} Snapshot of the cursor for a later reset(). */
  mark() {
    const snapshot = { position: this.pos };
    return snapshot;
  }

  /**
   * Move the cursor back (or forward) to a snapshot taken with mark().
   *
   * @param {{position: number}} mark - Snapshot to restore.
   */
  reset(mark) {
    const { position } = mark;
    this.pos = position;
  }

  /** @returns {number} Current cursor index. */
  position() {
    return this.pos;
  }

  /**
   * Get remaining tokens as a new array (the cursor is not moved).
   */
  remaining() {
    return this.tokens.slice(this.pos);
  }

  /**
   * Consume tokens for as long as the predicate accepts the current one.
   *
   * @param {function(*): boolean} predicate - Test applied to each token.
   * @returns {Array} The consumed tokens, in order.
   */
  takeWhile(predicate) {
    const taken = [];
    for (;;) {
      if (this.isAtEnd() || !predicate(this.peek())) {
        break;
      }
      taken.push(this.advance());
    }
    return taken;
  }

  /**
   * Discard tokens for as long as the predicate accepts the current one.
   *
   * @param {function(*): boolean} predicate - Test applied to each token.
   */
  skipWhile(predicate) {
    for (;;) {
      if (this.isAtEnd() || !predicate(this.peek())) {
        return;
      }
      this.advance();
    }
  }
};
|
|
68
|
+
/**
 * Build a position record from a start and an end offset.
 *
 * @param {number} start - Start offset.
 * @param {number} end - End offset.
 * @returns {{start: number, end: number}} The position record.
 */
function createPosition(start, end) {
  const span = { start: start, end: end };
  return span;
}
|
|
71
|
+
/**
 * Build a token object.
 *
 * The fourth argument is either the normalized form as a string, or an
 * options object whose `normalized`, `stem` and `stemConfidence` fields are
 * copied onto the token when they are not undefined. `stemConfidence` is only
 * copied when `stem` is present.
 *
 * @param {string} value - Token text.
 * @param {string} kind - Token kind.
 * @param {{start: number, end: number}} position - Token position.
 * @param {string|{normalized?: string, stem?: string, stemConfidence?: number}} [normalizedOrOptions]
 * @returns {object} Token with `value`, `kind`, `position` and any optional fields.
 */
function createToken(value, kind, position, normalizedOrOptions) {
  const token = { value, kind, position };

  if (typeof normalizedOrOptions === "string") {
    token.normalized = normalizedOrOptions;
    return token;
  }
  if (!normalizedOrOptions) {
    return token;
  }

  const { normalized: normalizedForm, stem, stemConfidence } = normalizedOrOptions;
  if (normalizedForm !== void 0) {
    token.normalized = normalizedForm;
  }
  const hasStem = stem !== void 0;
  if (hasStem) {
    token.stem = stem;
  }
  // A confidence value is meaningless without the stem it refers to.
  if (hasStem && stemConfidence !== void 0) {
    token.stemConfidence = stemConfidence;
  }
  return token;
}
|
|
91
|
+
/**
 * Test whether the given text contains a whitespace character.
 *
 * @param {string} char - Text to test, normally a single character.
 * @returns {boolean} True when `\s` matches somewhere in the text.
 */
function isWhitespace(char) {
  const whitespacePattern = /\s/;
  return whitespacePattern.test(char);
}
|
|
94
|
+
/**
 * Test whether a character can open a selector-like token:
 * `#`, `.`, `[`, `@`, `*` or `<`.
 *
 * @param {string} char - Single character to test.
 * @returns {boolean} True when the character is exactly one of the openers.
 */
function isSelectorStart(char) {
  switch (char) {
    case "#":
    case ".":
    case "[":
    case "@":
    case "*":
    case "<":
      return true;
    default:
      return false;
  }
}
|
|
97
|
+
/**
 * Test whether a character is a recognised quote: double quote, single quote,
 * backtick, or either corner bracket (U+300C / U+300D).
 *
 * @param {string} char - Single character to test.
 * @returns {boolean} True when the character is exactly one of the quotes.
 */
function isQuote(char) {
  switch (char) {
    case '"':
    case "'":
    case "`":
    case "\u300C":
    case "\u300D":
      return true;
    default:
      return false;
  }
}
|
|
100
|
+
/**
 * Test whether the given text contains an ASCII digit (0-9).
 *
 * @param {string} char - Text to test, normally a single character.
 * @returns {boolean} True when `\d` matches somewhere in the text.
 */
function isDigit(char) {
  return /\d/.exec(char) !== null;
}
|
|
103
|
+
/**
 * Test whether the given text contains an ASCII letter (a-z or A-Z).
 *
 * @param {string} char - Text to test, normally a single character.
 * @returns {boolean} True when an ASCII letter is found.
 */
function isAsciiLetter(char) {
  const asciiLetterPattern = /[a-zA-Z]/;
  return asciiLetterPattern.test(char);
}
|
|
106
|
+
/**
 * Test whether the given text contains a character allowed inside an ASCII
 * identifier: a letter, a digit, an underscore or a hyphen.
 *
 * @param {string} char - Text to test, normally a single character.
 * @returns {boolean} True when such a character is found.
 */
function isAsciiIdentifierChar(char) {
  return /[a-zA-Z0-9_-]/.exec(char) !== null;
}
|
|
109
|
+
/**
 * Create a predicate that reports whether a character falls inside any of the
 * given inclusive code point ranges.
 *
 * The character is read with `codePointAt(0)`, so a supplementary-plane
 * character passed as a full surrogate pair is classified by its real code
 * point. BMP characters and lone UTF-16 code units give the same value as
 * `charCodeAt(0)` would.
 *
 * @param {Array<[number, number]>} ranges - Inclusive `[start, end]` code point pairs.
 * @returns {function(string): boolean} Classifier; returns false for an empty string.
 */
function createUnicodeRangeClassifier(ranges) {
  return (char) => {
    // codePointAt yields undefined for "", which fails every comparison below.
    const code = char.codePointAt(0);
    return ranges.some(([start, end]) => code >= start && code <= end);
  };
}
|
|
115
|
+
/**
 * Extract a selector-like token that starts at `startPos`.
 *
 * Recognised forms, chosen by the first character:
 *  - `#name`, `.name`, `@name`, `*name`: the opener followed by at least one
 *    identifier character. Scanning stops at the first non-identifier
 *    character, so a trailing `.method(` after `#id` is never consumed.
 *  - `[...]`: a bracket group with nested brackets, quotes and backslash
 *    escapes honoured; must be balanced.
 *  - `<tag ...>`: a tag name starting with an ASCII letter, any number of
 *    `.class`, `#id` or `[attr]` modifiers, optional whitespace, an optional
 *    `/` (with optional whitespace after it), then a mandatory `>`.
 *
 * @param {string} input - Text being tokenized.
 * @param {number} startPos - Index at which the selector should begin.
 * @returns {string|null} The selector text, or null when nothing valid starts here.
 */
function extractCssSelector(input, startPos) {
  if (startPos >= input.length) return null;
  const opener = input[startPos];
  if (!isSelectorStart(opener)) return null;

  // Everything consumed is a contiguous slice, so only the end index is tracked.
  let cursor = startPos;

  // Advance over a run of identifier characters.
  const skipIdentifierRun = () => {
    while (cursor < input.length && isAsciiIdentifierChar(input[cursor])) {
      cursor++;
    }
  };

  // Advance over a run of whitespace.
  const skipWhitespaceRun = () => {
    while (cursor < input.length && isWhitespace(input[cursor])) {
      cursor++;
    }
  };

  // Advance over a bracket group; the cursor must sit on the opening "[".
  // Returns false when the input ends before the group is closed.
  const skipBracketGroup = () => {
    let depth = 1;
    let activeQuote = null;
    let escaped = false;
    cursor++;
    while (cursor < input.length && depth > 0) {
      const c = input[cursor];
      cursor++;
      if (escaped) {
        escaped = false;
      } else if (c === "\\") {
        escaped = true;
      } else if (activeQuote !== null) {
        if (c === activeQuote) {
          activeQuote = null;
        }
      } else if (c === '"' || c === "'" || c === "`") {
        activeQuote = c;
      } else if (c === "[") {
        depth++;
      } else if (c === "]") {
        depth--;
      }
    }
    return depth === 0;
  };

  switch (opener) {
    case "#":
    case ".":
    case "@":
    case "*":
      cursor++;
      skipIdentifierRun();
      // The opener alone is not a selector.
      if (cursor - startPos <= 1) return null;
      break;

    case "[":
      if (!skipBracketGroup()) return null;
      break;

    case "<": {
      cursor++;
      if (cursor >= input.length || !isAsciiLetter(input[cursor])) return null;
      skipIdentifierRun();

      // Any number of .class / #id / [attr] modifiers after the tag name.
      while (cursor < input.length) {
        const modifier = input[cursor];
        if (modifier === "." || modifier === "#") {
          cursor++;
          if (cursor >= input.length || !isAsciiIdentifierChar(input[cursor])) {
            return null;
          }
          skipIdentifierRun();
        } else if (modifier === "[") {
          if (!skipBracketGroup()) return null;
        } else {
          break;
        }
      }

      skipWhitespaceRun();
      if (cursor < input.length && input[cursor] === "/") {
        cursor++;
        skipWhitespaceRun();
      }
      if (cursor >= input.length || input[cursor] !== ">") return null;
      cursor++;
      break;
    }
  }

  return input.slice(startPos, cursor) || null;
}
|
|
253
|
+
/**
 * Decide whether the apostrophe at `pos` begins an English possessive (`'s`)
 * rather than a quoted string.
 *
 * It is a possessive when the apostrophe is followed by `s`/`S` and that `s`
 * is either the last character of the input or followed by something that
 * cannot continue an identifier (whitespace, `*`, or any other
 * non-identifier character).
 *
 * @param {string} input - Text being tokenized.
 * @param {number} pos - Index of the candidate apostrophe.
 * @returns {boolean} True when `'s` at `pos` is a possessive marker.
 */
function isPossessiveMarker(input, pos) {
  const hasApostrophe = pos < input.length && input[pos] === "'";
  if (!hasApostrophe || pos + 1 >= input.length) return false;

  if (input[pos + 1].toLowerCase() !== "s") return false;

  const followerIndex = pos + 2;
  if (followerIndex >= input.length) return true;

  const follower = input[followerIndex];
  if (isWhitespace(follower)) return true;
  if (follower === "*") return true;
  return !isAsciiIdentifierChar(follower);
}
|
|
262
|
+
/**
 * Extract a quoted string literal that starts at `startPos`.
 *
 * Supported delimiters are `"`, `'`, `` ` `` (closed by the same character)
 * and the corner bracket U+300C (closed by U+300D). A backslash escapes the
 * character after it. An apostrophe that is a possessive marker is not
 * treated as a string opener.
 *
 * When no closing delimiter is found, everything from the opener to the end
 * of the input is returned.
 *
 * @param {string} input - Text being tokenized.
 * @param {number} startPos - Index of the candidate opening quote.
 * @returns {string|null} The literal including its delimiters, or null when
 *   no string starts at `startPos`.
 */
function extractStringLiteral(input, startPos) {
  if (startPos >= input.length) return null;

  const opener = input[startPos];
  if (!isQuote(opener)) return null;
  if (opener === "'" && isPossessiveMarker(input, startPos)) return null;

  let closer;
  switch (opener) {
    case '"':
    case "'":
    case "`":
      closer = opener;
      break;
    case "\u300C":
      closer = "\u300D";
      break;
    default:
      // A closing corner bracket counts as a quote but cannot open a literal.
      return null;
  }

  let escaped = false;
  for (let i = startPos + 1; i < input.length; i++) {
    const ch = input[i];
    if (escaped) {
      escaped = false;
      continue;
    }
    if (ch === "\\") {
      escaped = true;
      continue;
    }
    if (ch === closer) {
      return input.slice(startPos, i + 1);
    }
  }

  // Unterminated: hand back the rest of the input.
  return input.slice(startPos);
}
|
|
294
|
+
/**
 * Test whether a URL or path begins at `pos`.
 *
 * Accepted openings:
 *  - `/x` where x is a letter, digit, `.`, `_` or `-` (absolute path)
 *  - `//x` where x is an ASCII letter (protocol-relative URL)
 *  - `./` or `../` (relative path)
 *  - `http://` or `https://`, case-insensitively
 *
 * @param {string} input - Text being tokenized.
 * @param {number} pos - Index to inspect.
 * @returns {boolean} True when one of the openings above is found at `pos`.
 */
function isUrlStart(input, pos) {
  if (pos >= input.length) return false;

  const first = input[pos];
  const second = input[pos + 1] || "";
  const third = input[pos + 2] || "";

  if (first === "/") {
    if (second === "/") {
      if (/[a-zA-Z]/.test(third)) return true;
    } else if (/[a-zA-Z0-9._-]/.test(second)) {
      return true;
    }
  } else if (first === ".") {
    const isCurrentDir = second === "/";
    const isParentDir = second === "." && third === "/";
    if (isCurrentDir || isParentDir) return true;
  }

  // "https://" is eight characters long, so eight is enough for both schemes.
  const head = input.slice(pos, pos + 8).toLowerCase();
  return head.startsWith("http://") || head.startsWith("https://");
}
|
|
314
|
+
/**
 * Extract a URL or path that starts at `startPos`.
 *
 * Characters are consumed while they belong to the allowed URL character set.
 * A `#` always ends the scan: it is kept, together with the identifier
 * characters after it, only when the text consumed so far ends in a letter,
 * digit, `/` or `.` (i.e. it looks like a fragment rather than the start of
 * an id selector).
 *
 * @param {string} input - Text being tokenized.
 * @param {number} startPos - Index at which the URL should begin.
 * @returns {string|null} The URL text, or null when no URL starts here or
 *   fewer than two characters were consumed.
 */
function extractUrl(input, startPos) {
  if (!isUrlStart(input, startPos)) return null;

  const urlChars = /[a-zA-Z0-9/:._\-?&=%@+~!$'()*,;[\]]/;
  let end = startPos;

  for (; end < input.length; end++) {
    const ch = input[end];

    if (ch === "#") {
      const consumed = input.slice(startPos, end);
      if (consumed.length > 0 && /[a-zA-Z0-9/.]$/.test(consumed)) {
        end++;
        while (end < input.length && /[a-zA-Z0-9_-]/.test(input[end])) {
          end++;
        }
      }
      break;
    }

    if (!urlChars.test(ch)) break;
  }

  const url = input.slice(startPos, end);
  return url.length < 2 ? null : url;
}
|
|
341
|
+
/**
 * Extract a numeric literal that starts at `startPos`.
 *
 * Grammar: optional sign (`+`/`-`), one or more ASCII digits, an optional
 * fraction (`.` followed by at least one digit), and an optional duration
 * suffix (`ms`, `s`, `m` or `h`).
 *
 * A `.` that is not followed by a digit is left in the input instead of being
 * absorbed into the number, so "5." yields "5" and the dot remains available
 * to the caller (for example as punctuation or as the start of a selector).
 *
 * @param {string} input - Text being tokenized.
 * @param {number} startPos - Index at which the number should begin.
 * @returns {string|null} The matched text, or null when no number starts here.
 */
function extractNumber(input, startPos) {
  // Covers out-of-range, negative and non-index positions in one check.
  if (input[startPos] === undefined) return null;

  // Sticky matching anchors the pattern at lastIndex without copying the input.
  // "ms" is listed before the single-letter suffixes so it wins over "m".
  const numberPattern = /[+-]?\d+(?:\.\d+)?(?:ms|[smh])?/y;
  numberPattern.lastIndex = startPos;

  const match = numberPattern.exec(input);
  return match === null ? null : match[0];
}
|
|
372
|
+
/**
 * Shared base for the language tokenizers.
 *
 * Holds the keyword table derived from a language profile and offers a set of
 * `try*` helpers that each attempt to read one kind of token at a position,
 * returning the token on success and null otherwise.
 */
var _BaseTokenizer = class _BaseTokenizer {
  constructor() {
    /** Keywords derived from profile, sorted longest-first for greedy matching */
    this.profileKeywords = [];
    /** Map for O(1) keyword lookups by lowercase native word */
    this.profileKeywordMap = /* @__PURE__ */ new Map();
  }

  /**
   * Initialize keyword mappings from a language profile.
   * Builds a list of native→english mappings from:
   * - profile.keywords (primary + alternatives)
   * - profile.references (me, it, you, etc.)
   * - profile.roleMarkers (into, from, with, etc.)
   *
   * Results are sorted longest-first for greedy matching (important for non-space languages).
   * Extras take precedence over profile entries when there are duplicates.
   *
   * Entries whose native form is empty or missing are skipped for every
   * source, not only role markers: an empty string is a prefix of every
   * input, so it would make isKeywordStart() always true and let
   * tryProfileKeyword() return a zero-length token.
   *
   * @param profile - Language profile containing keyword translations
   * @param extras - Additional keyword entries to include (literals, positional, events)
   */
  initializeKeywordsFromProfile(profile, extras = []) {
    const keywordMap = /* @__PURE__ */ new Map();
    const register = (native, entry) => {
      if (native) keywordMap.set(native, entry);
    };

    if (profile.keywords) {
      for (const [key, translation] of Object.entries(profile.keywords)) {
        const normalizedName = translation.normalized || key;
        register(translation.primary, { native: translation.primary, normalized: normalizedName });
        for (const alt of translation.alternatives ?? []) {
          register(alt, { native: alt, normalized: normalizedName });
        }
      }
    }

    if (profile.references) {
      for (const [key, native] of Object.entries(profile.references)) {
        register(native, { native, normalized: key });
      }
    }

    if (profile.roleMarkers) {
      for (const [role, marker] of Object.entries(profile.roleMarkers)) {
        register(marker.primary, { native: marker.primary, normalized: role });
        for (const alt of marker.alternatives ?? []) {
          register(alt, { native: alt, normalized: role });
        }
      }
    }

    // Registered last so that extras override profile entries with the same native form.
    for (const extra of extras) {
      register(extra.native, extra);
    }

    this.profileKeywords = Array.from(keywordMap.values()).sort(
      (a, b) => b.native.length - a.native.length
    );

    this.profileKeywordMap = /* @__PURE__ */ new Map();
    for (const keyword of this.profileKeywords) {
      this.profileKeywordMap.set(keyword.native.toLowerCase(), keyword);
      // Also index the diacritic-free spelling, without displacing an
      // entry that is already registered under that spelling.
      const stripped = this.removeDiacritics(keyword.native);
      if (stripped !== keyword.native && !this.profileKeywordMap.has(stripped.toLowerCase())) {
        this.profileKeywordMap.set(stripped.toLowerCase(), keyword);
      }
    }
  }

  /**
   * Remove diacritical marks from a word for normalization.
   * Strips the code points U+064B–U+0652 and U+0670 (Arabic marks such as
   * shadda, fatha, kasra, damma, sukun); other characters are left untouched.
   *
   * @param word - Word to normalize
   * @returns Word without diacritics
   */
  removeDiacritics(word) {
    return word.replace(/[\u064B-\u0652\u0670]/g, "");
  }

  /**
   * Try to match a keyword from profile at the current position.
   * Uses longest-first greedy matching (important for non-space languages).
   * The comparison is case-sensitive and exact.
   *
   * @param input - Input string
   * @param pos - Current position
   * @returns Token if matched, null otherwise
   */
  tryProfileKeyword(input, pos) {
    for (const entry of this.profileKeywords) {
      // startsWith(…, pos) avoids allocating a substring for every entry.
      if (input.startsWith(entry.native, pos)) {
        return createToken(
          entry.native,
          "keyword",
          createPosition(pos, pos + entry.native.length),
          entry.normalized
        );
      }
    }
    return null;
  }

  /**
   * Check if the remaining input starts with any known keyword.
   * Useful for non-space languages to detect word boundaries.
   *
   * @param input - Input string
   * @param pos - Current position
   * @returns true if a keyword starts at this position
   */
  isKeywordStart(input, pos) {
    return this.profileKeywords.some((entry) => input.startsWith(entry.native, pos));
  }

  /**
   * Look up a keyword by native word (case-insensitive).
   * O(1) lookup using the keyword map.
   *
   * @param native - Native word to look up
   * @returns KeywordEntry if found, undefined otherwise
   */
  lookupKeyword(native) {
    return this.profileKeywordMap.get(native.toLowerCase());
  }

  /**
   * Check if a word is a known keyword (case-insensitive).
   * O(1) lookup using the keyword map.
   *
   * @param native - Native word to check
   * @returns true if the word is a keyword
   */
  isKeyword(native) {
    return this.profileKeywordMap.has(native.toLowerCase());
  }

  /**
   * Set the morphological normalizer for this tokenizer.
   *
   * @param normalizer - Object exposing normalize(word) → { stem, confidence }
   */
  setNormalizer(normalizer) {
    this.normalizer = normalizer;
  }

  /**
   * Try to normalize a word using the morphological normalizer.
   * Returns null if no normalizer is set, if the stem equals the input, or
   * if the reported confidence is below 0.7.
   *
   * Note: We don't check isNormalizable() here because the individual tokenizers
   * historically called normalize() directly without that check. The normalize()
   * method itself handles returning noChange() for words that can't be normalized.
   *
   * @param word - Word to normalize
   * @returns The normalizer's result object, or null
   */
  tryNormalize(word) {
    if (!this.normalizer) return null;
    const result = this.normalizer.normalize(word);
    if (result.stem !== word && result.confidence >= 0.7) {
      return result;
    }
    return null;
  }

  /**
   * Try morphological normalization and keyword lookup.
   *
   * If the word can be normalized to a stem that matches a known keyword,
   * returns a keyword token with morphological metadata (stem, stemConfidence).
   *
   * This is the common pattern for handling conjugated verbs across languages:
   * 1. Normalize the word (e.g., "toggled" → "toggle")
   * 2. Look up the stem in the keyword map
   * 3. Create a token with both the original form and stem metadata
   *
   * @param word - The word to normalize and look up
   * @param startPos - Start position for the token
   * @param endPos - End position for the token
   * @returns Token if stem matches a keyword, null otherwise
   */
  tryMorphKeywordMatch(word, startPos, endPos) {
    const result = this.tryNormalize(word);
    if (!result) return null;
    const stemEntry = this.lookupKeyword(result.stem);
    if (!stemEntry) return null;
    const tokenOptions = {
      normalized: stemEntry.normalized,
      stem: result.stem,
      stemConfidence: result.confidence
    };
    return createToken(word, "keyword", createPosition(startPos, endPos), tokenOptions);
  }

  /**
   * Try to extract a CSS selector at the current position.
   *
   * @param input - Input string
   * @param pos - Current position
   * @returns Selector token, or null
   */
  trySelector(input, pos) {
    const selector = extractCssSelector(input, pos);
    if (selector) {
      return createToken(selector, "selector", createPosition(pos, pos + selector.length));
    }
    return null;
  }

  /**
   * Try to extract an event modifier at the current position.
   * Event modifiers are .once, .debounce(N), .throttle(N), .queue(strategy)
   *
   * The modifier must be followed by whitespace, another `.`, or the end of
   * input; that terminator is not part of the token. The returned token
   * carries `metadata.modifierName` and `metadata.value`: the raw argument
   * for `queue`, the argument parsed as a base-10 integer for the others,
   * and undefined when no argument was given.
   *
   * @param input - Input string
   * @param pos - Current position
   * @returns Event-modifier token, or null
   */
  tryEventModifier(input, pos) {
    if (input[pos] !== ".") {
      return null;
    }
    const match = input.slice(pos).match(/^\.(?:once|debounce|throttle|queue)(?:\(([^)]+)\))?(?:\s|$|\.)/);
    if (!match) {
      return null;
    }
    // Drop the trailing delimiter that the regex had to consume.
    const fullMatch = match[0].replace(/(\s|\.)$/, "");
    const modifierName = fullMatch.slice(1).split("(")[0];
    const rawValue = match[1];
    let value;
    if (rawValue) {
      value = modifierName === "queue" ? rawValue : parseInt(rawValue, 10);
    }
    const token = createToken(
      fullMatch,
      "event-modifier",
      createPosition(pos, pos + fullMatch.length)
    );
    return {
      ...token,
      metadata: {
        modifierName,
        value
      }
    };
  }

  /**
   * Try to extract a string literal at the current position.
   *
   * @param input - Input string
   * @param pos - Current position
   * @returns Literal token, or null
   */
  tryString(input, pos) {
    const literal = extractStringLiteral(input, pos);
    if (literal) {
      return createToken(literal, "literal", createPosition(pos, pos + literal.length));
    }
    return null;
  }

  /**
   * Try to extract a number at the current position.
   *
   * @param input - Input string
   * @param pos - Current position
   * @returns Literal token, or null
   */
  tryNumber(input, pos) {
    const number = extractNumber(input, pos);
    if (number) {
      return createToken(number, "literal", createPosition(pos, pos + number.length));
    }
    return null;
  }

  /**
   * Try to match a time unit from a list of patterns.
   *
   * Units are tried in array order and the first acceptable one wins. A unit
   * is rejected when the character after it equals `unit.notFollowedBy`, or
   * when `unit.checkBoundary` is set and that character is an ASCII
   * identifier character.
   *
   * @param input - Input string
   * @param pos - Position after the number
   * @param timeUnits - Array of time unit mappings (native pattern → standard suffix)
   * @param skipWhitespace - Whether to skip whitespace before time unit (default: false)
   * @returns Object with matched suffix and new position, or null if no match
   */
  tryMatchTimeUnit(input, pos, timeUnits, skipWhitespace = false) {
    let unitPos = pos;
    if (skipWhitespace) {
      while (unitPos < input.length && isWhitespace(input[unitPos])) {
        unitPos++;
      }
    }
    const remaining = input.slice(unitPos);
    for (const unit of timeUnits) {
      const candidate = remaining.slice(0, unit.length);
      const matches = unit.caseInsensitive
        ? candidate.toLowerCase() === unit.pattern.toLowerCase()
        : candidate === unit.pattern;
      if (!matches) continue;

      const nextChar = remaining[unit.length] || "";
      if (unit.notFollowedBy && nextChar === unit.notFollowedBy) continue;
      if (unit.checkBoundary && isAsciiIdentifierChar(nextChar)) continue;

      return { suffix: unit.suffix, endPos: unitPos + unit.length };
    }
    return null;
  }

  /**
   * Parse a base number (sign, integer, decimal) without time units.
   * Returns the number string and end position. A `.` directly after the
   * integer part is consumed even when no digits follow it.
   *
   * @param input - Input string
   * @param startPos - Start position
   * @param allowSign - Whether to allow +/- sign (default: true)
   * @returns Object with number string and end position, or null
   */
  parseBaseNumber(input, startPos, allowSign = true) {
    let pos = startPos;
    let number = "";
    if (allowSign && (input[pos] === "-" || input[pos] === "+")) {
      number += input[pos++];
    }
    // At least one digit is required, so a bare sign never yields a number.
    if (pos >= input.length || !isDigit(input[pos])) {
      return null;
    }
    while (pos < input.length && isDigit(input[pos])) {
      number += input[pos++];
    }
    if (pos < input.length && input[pos] === ".") {
      number += input[pos++];
      while (pos < input.length && isDigit(input[pos])) {
        number += input[pos++];
      }
    }
    return { number, endPos: pos };
  }

  /**
   * Try to extract a number with native language time units.
   *
   * This is a template method that handles the common pattern:
   * 1. Parse the base number (sign, integer, decimal)
   * 2. Try to match native language time units
   * 3. Fall back to standard time units (ms, s, m, h)
   *
   * @param input - Input string
   * @param pos - Start position
   * @param nativeTimeUnits - Language-specific time unit mappings
   * @param options - Configuration options: `allowSign` (default true) and
   *                  `skipWhitespace` (default false)
   * @returns Token if number found, null otherwise
   */
  tryNumberWithTimeUnits(input, pos, nativeTimeUnits, options = {}) {
    const { allowSign = true, skipWhitespace = false } = options;
    const baseResult = this.parseBaseNumber(input, pos, allowSign);
    if (!baseResult) return null;
    let { number, endPos } = baseResult;
    // Native units come first so they win over the standard suffixes.
    const allUnits = [...nativeTimeUnits, ..._BaseTokenizer.STANDARD_TIME_UNITS];
    const timeMatch = this.tryMatchTimeUnit(input, endPos, allUnits, skipWhitespace);
    if (timeMatch) {
      number += timeMatch.suffix;
      endPos = timeMatch.endPos;
    }
    return createToken(number, "literal", createPosition(pos, endPos));
  }

  /**
   * Try to extract a URL at the current position.
   * Handles /path, ./path, ../path, //domain.com, http://, https://
   *
   * @param input - Input string
   * @param pos - Current position
   * @returns URL token, or null
   */
  tryUrl(input, pos) {
    const url = extractUrl(input, pos);
    if (url) {
      return createToken(url, "url", createPosition(pos, pos + url.length));
    }
    return null;
  }

  /**
   * Try to extract a variable reference (:varname) at the current position.
   * In hyperscript, :x refers to a local variable named x.
   * The returned token value includes the leading colon.
   *
   * @param input - Input string
   * @param pos - Current position
   * @returns Identifier token, or null
   */
  tryVariableRef(input, pos) {
    if (input[pos] !== ":") return null;
    if (pos + 1 >= input.length) return null;
    if (!isAsciiIdentifierChar(input[pos + 1])) return null;
    let endPos = pos + 1;
    while (endPos < input.length && isAsciiIdentifierChar(input[endPos])) {
      endPos++;
    }
    const varRef = input.slice(pos, endPos);
    return createToken(varRef, "identifier", createPosition(pos, endPos));
  }

  /**
   * Try to extract an operator or punctuation token at the current position.
   * Handles two-character operators (==, !=, etc.) and single-character operators.
   * Two-character operators are tested first so that `<=` is not split into
   * `<` and `=`.
   *
   * @param input - Input string
   * @param pos - Current position
   * @returns Operator or punctuation token, or null
   */
  tryOperator(input, pos) {
    const twoChar = input.slice(pos, pos + 2);
    if (["==", "!=", "<=", ">=", "&&", "||", "->"].includes(twoChar)) {
      return createToken(twoChar, "operator", createPosition(pos, pos + 2));
    }
    const oneChar = input[pos];
    if (["<", ">", "!", "+", "-", "*", "/", "="].includes(oneChar)) {
      return createToken(oneChar, "operator", createPosition(pos, pos + 1));
    }
    if (["(", ")", "{", "}", ",", ";", ":"].includes(oneChar)) {
      return createToken(oneChar, "punctuation", createPosition(pos, pos + 1));
    }
    return null;
  }

  /**
   * Try to match a multi-character particle from a list.
   *
   * Used by languages like Japanese, Korean, and Chinese that have
   * multi-character particles (e.g., Japanese から, まで, より).
   * Particles are tried in array order; the first match wins.
   *
   * @param input - Input string
   * @param pos - Current position
   * @param particles - Array of multi-character particles to match
   * @returns Token if matched, null otherwise
   */
  tryMultiCharParticle(input, pos, particles) {
    for (const particle of particles) {
      if (input.slice(pos, pos + particle.length) === particle) {
        return createToken(particle, "particle", createPosition(pos, pos + particle.length));
      }
    }
    return null;
  }
};
|
|
769
|
+
/**
 * Standard (language-independent) time-unit suffixes: ms, s, m, h.
 *
 * tryNumberWithTimeUnits() appends this list after the language-specific
 * units, so these act as the fallback. Each entry is consumed by
 * tryMatchTimeUnit():
 *  - pattern / length: text to match right after the number
 *  - suffix: text appended to the number token on a match
 *  - checkBoundary: reject the match when the next character is an ASCII
 *    identifier character
 *  - notFollowedBy: reject the match when the next character equals this one
 *    (keeps "m" from matching the start of "ms")
 */
_BaseTokenizer.STANDARD_TIME_UNITS = [
  { pattern: "ms", suffix: "ms", length: 2 },
  { pattern: "s", suffix: "s", length: 1, checkBoundary: true },
  { pattern: "m", suffix: "m", length: 1, checkBoundary: true, notFollowedBy: "s" },
  { pattern: "h", suffix: "h", length: 1, checkBoundary: true }
];
// Public name for the tokenizer base class.
var BaseTokenizer = _BaseTokenizer;
|
|
780
|
+
|
|
781
|
+
// src/tokenizers/morphology/types.ts
|
|
782
|
+
/**
 * Build the result for a word that was left as-is: the word is its own stem
 * and the confidence is 1.
 *
 * @param word - The unchanged word
 * @returns `{ stem, confidence }` result object
 */
function noChange(word) {
  const confidence = 1;
  return { stem: word, confidence };
}
|
|
785
|
+
/**
 * Build the result for a word that was reduced to a stem.
 *
 * @param stem - The extracted stem
 * @param confidence - Confidence score for the stem
 * @param metadata - Optional details; the `metadata` key is only present on
 *                   the result when a truthy value is supplied
 * @returns `{ stem, confidence }` or `{ stem, confidence, metadata }`
 */
function normalized(stem, confidence, metadata) {
  const result = { stem, confidence };
  if (metadata) {
    result.metadata = metadata;
  }
  return result;
}
|
|
791
|
+
|
|
792
|
+
// src/tokenizers/morphology/arabic-normalizer.ts
|
|
793
|
+
/**
 * Test whether the first UTF-16 code unit of `char` lies in one of the
 * Arabic Unicode blocks listed below.
 *
 * @param char - String whose first code unit is examined
 * @returns true for a code unit inside any listed block; false otherwise,
 *          including for an empty string
 */
function isArabic(char) {
  const arabicBlocks = [
    [0x0600, 0x06ff], // Arabic
    [0x0750, 0x077f], // Arabic Supplement
    [0x08a0, 0x08ff], // Arabic Extended-A
    [0xfb50, 0xfdff], // Arabic Presentation Forms-A
    [0xfe70, 0xfeff] // Arabic Presentation Forms-B
  ];
  const unit = char.charCodeAt(0);
  return arabicBlocks.some(([low, high]) => unit >= low && unit <= high);
}
|
|
801
|
+
/**
 * Test whether any character of `word` is Arabic according to isArabic().
 *
 * @param word - Text to inspect
 * @returns true as soon as one Arabic character is found, false otherwise
 */
function containsArabic(word) {
  // Spreading iterates the string the same way `for...of` does.
  return [...word].some((ch) => isArabic(ch));
}
|
|
807
|
+
/**
 * Strip Arabic diacritical marks from a word.
 *
 * Removes every code point in U+064B–U+0652 and U+0670; all other characters
 * are returned unchanged and in order.
 *
 * @param word - Text to clean
 * @returns The text without those marks
 */
function removeDiacritics(word) {
  const diacriticMarks = /[\u064B-\u0652\u0670]/g;
  return word.replace(diacriticMarks, "");
}
|
|
810
|
+
// Compound prefixes tried first by ArabicMorphologicalNormalizer.normalize().
// The first rule whose pattern starts the word and leaves at least two
// characters is stripped, and its confidencePenalty is subtracted from the
// confidence score. prefixType is a descriptive label.
var COMBINED_PREFIXES = [
  // Conjunction/preposition + article combinations (3 chars)
  // wa + al
  { pattern: "\u0648\u0627\u0644", confidencePenalty: 0.15, prefixType: "conjunction" },
  // fa + al
  { pattern: "\u0641\u0627\u0644", confidencePenalty: 0.15, prefixType: "conjunction" },
  // bi + al
  { pattern: "\u0628\u0627\u0644", confidencePenalty: 0.15, prefixType: "preposition" },
  // ka + al
  { pattern: "\u0643\u0627\u0644", confidencePenalty: 0.15, prefixType: "preposition" },
  // li + al (assimilation; 2 chars)
  { pattern: "\u0644\u0644", confidencePenalty: 0.12, prefixType: "preposition" }
];
|
|
823
|
+
// Simple prefixes, tried by ArabicMorphologicalNormalizer.normalize() only
// when no combined prefix was stripped. A rule applies when its pattern
// starts the word and at least minRemaining characters are left; its
// confidencePenalty is then subtracted from the confidence score.
var SINGLE_PREFIXES = [
  // Definite article (2 chars) - can leave 2-char stem
  { pattern: "\u0627\u0644", confidencePenalty: 0.08, prefixType: "article", minRemaining: 2 },
  // Conjunctions and prepositions (1 char) - need longer stem to be safe
  // wa- (and)
  { pattern: "\u0648", confidencePenalty: 0.08, prefixType: "conjunction", minRemaining: 3 },
  // fa- (then/so)
  { pattern: "\u0641", confidencePenalty: 0.08, prefixType: "conjunction", minRemaining: 3 },
  // bi- (with/by)
  { pattern: "\u0628", confidencePenalty: 0.1, prefixType: "preposition", minRemaining: 3 },
  // li- (to/for)
  { pattern: "\u0644", confidencePenalty: 0.1, prefixType: "preposition", minRemaining: 3 },
  // ka- (like/as)
  { pattern: "\u0643", confidencePenalty: 0.1, prefixType: "preposition", minRemaining: 3 }
];
|
|
838
|
+
// Verb (person-marker) prefixes. ArabicMorphologicalNormalizer.normalize()
// tries these only when the word does not end in one of its noun-like
// endings and either no prefix was stripped so far or the stripped prefix
// was the single-letter wa-/fa- conjunction. Each rule needs at least
// minRemaining characters left after stripping.
var VERB_PREFIXES = [
  // ya- (he/it)
  { pattern: "\u064A", confidencePenalty: 0.12, prefixType: "verb-marker", minRemaining: 3 },
  // ta- (she/you)
  { pattern: "\u062A", confidencePenalty: 0.12, prefixType: "verb-marker", minRemaining: 3 },
  // na- (we)
  { pattern: "\u0646", confidencePenalty: 0.12, prefixType: "verb-marker", minRemaining: 3 },
  // a- (I)
  { pattern: "\u0623", confidencePenalty: 0.12, prefixType: "verb-marker", minRemaining: 3 },
  // a- without hamza
  { pattern: "\u0627", confidencePenalty: 0.12, prefixType: "verb-marker", minRemaining: 3 }
];
|
|
850
|
+
// Suffix rules used by ArabicMorphologicalNormalizer.normalize(). The
// normalizer walks this list once, in order, without stopping at the first
// hit: every rule whose pattern ends the current stem (and leaves at least
// two characters) is stripped, so several suffixes can be removed from one
// word. type is a descriptive label.
// NOTE(review): "\u064A\u0646" appears twice (masculine-plural-accusative and
// dual-accusative); because the walk does not break, a stem ending in that
// sequence twice would lose both occurrences — confirm this is intended.
var SUFFIXES = [
  // Plural forms
  { pattern: "\u0648\u0646", confidencePenalty: 0.1, type: "masculine-plural" },
  { pattern: "\u064A\u0646", confidencePenalty: 0.1, type: "masculine-plural-accusative" },
  { pattern: "\u0627\u062A", confidencePenalty: 0.1, type: "feminine-plural" },
  // Dual forms
  { pattern: "\u0627\u0646", confidencePenalty: 0.1, type: "dual-nominative" },
  { pattern: "\u064A\u0646", confidencePenalty: 0.1, type: "dual-accusative" },
  // Pronoun suffixes
  { pattern: "\u0647\u0627", confidencePenalty: 0.1, type: "pronoun-her" },
  { pattern: "\u0647\u0645", confidencePenalty: 0.1, type: "pronoun-them" },
  { pattern: "\u0647\u0646", confidencePenalty: 0.1, type: "pronoun-them-f" },
  { pattern: "\u0646\u0627", confidencePenalty: 0.1, type: "pronoun-us" },
  { pattern: "\u0643\u0645", confidencePenalty: 0.1, type: "pronoun-you-pl" },
  { pattern: "\u0643", confidencePenalty: 0.08, type: "pronoun-you" },
  { pattern: "\u0647", confidencePenalty: 0.08, type: "pronoun-him" },
  { pattern: "\u064A", confidencePenalty: 0.08, type: "pronoun-me" },
  // Feminine marker
  { pattern: "\u0629", confidencePenalty: 0.08, type: "feminine" }
];
|
|
870
|
+
/**
 * Rule-based stemmer for Arabic words: strips diacritics, then at most one
 * article/conjunction/preposition prefix, optionally one verb prefix, and
 * any matching suffixes, lowering a confidence score for each removal.
 */
var ArabicMorphologicalNormalizer = class {
  constructor() {
    this.language = "ar";
  }

  /**
   * Check if a word might be an Arabic word that can be normalized.
   *
   * @param word - Candidate word
   * @returns true when the word contains an Arabic character and is at
   *          least two code units long
   */
  isNormalizable(word) {
    return containsArabic(word) && !(word.length < 2);
  }

  /**
   * Normalize an Arabic word by stripping prefixes and suffixes.
   *
   * @param word - Word to normalize
   * @returns noChange(word) when nothing was stripped; otherwise a result
   *          with the stem, a confidence no lower than 0.5, and metadata
   *          listing the removed prefixes and suffixes
   */
  normalize(word) {
    const removedPrefixes = [];
    const removedSuffixes = [];
    let stem = removeDiacritics(word);
    let confidence = 1;

    // Strip the first rule in `rules` that matches and leaves enough characters.
    const stripFirstPrefix = (rules, fallbackMin) => {
      for (const { pattern, confidencePenalty, minRemaining } of rules) {
        if (!stem.startsWith(pattern)) continue;
        const rest = stem.slice(pattern.length);
        if (rest.length < (minRemaining ?? fallbackMin)) continue;
        stem = rest;
        confidence -= confidencePenalty;
        removedPrefixes.push(pattern);
        return;
      }
    };

    stripFirstPrefix(COMBINED_PREFIXES, 2);
    if (removedPrefixes.length === 0) {
      stripFirstPrefix(SINGLE_PREFIXES, 2);
    }

    // Evaluated after the first prefix pass, before any verb prefix is tried.
    const nounEndings = [
      "\u0627\u062A",
      "\u0629",
      "\u0648\u0646",
      "\u064A\u0646",
      "\u0647\u0627",
      "\u0647\u0645",
      "\u0647\u0646",
      "\u0646\u0627",
      "\u0643\u0645"
    ];
    const looksLikeNoun = nounEndings.some((ending) => stem.endsWith(ending));
    const firstPrefix = removedPrefixes[0];
    const verbPrefixAllowed =
      removedPrefixes.length === 0 || firstPrefix === "\u0648" || firstPrefix === "\u0641";
    if (!looksLikeNoun && verbPrefixAllowed) {
      stripFirstPrefix(VERB_PREFIXES, 3);
    }

    // Every matching suffix rule is applied, in list order.
    for (const { pattern, confidencePenalty } of SUFFIXES) {
      if (!stem.endsWith(pattern)) continue;
      const rest = stem.slice(0, -pattern.length);
      if (rest.length < 2) continue;
      stem = rest;
      confidence -= confidencePenalty;
      removedSuffixes.push(pattern);
    }

    if (removedPrefixes.length === 0 && removedSuffixes.length === 0) {
      return noChange(word);
    }

    return normalized(stem, Math.max(0.5, confidence), {
      removedPrefixes,
      removedSuffixes
    });
  }
};
var arabicMorphologicalNormalizer = new ArabicMorphologicalNormalizer();
|
|
951
|
+
|
|
952
|
+
// src/generators/profiles/arabic.ts
|
|
953
|
+
var arabicProfile = {
|
|
954
|
+
code: "ar",
|
|
955
|
+
name: "Arabic",
|
|
956
|
+
nativeName: "\u0627\u0644\u0639\u0631\u0628\u064A\u0629",
|
|
957
|
+
direction: "rtl",
|
|
958
|
+
wordOrder: "VSO",
|
|
959
|
+
markingStrategy: "preposition",
|
|
960
|
+
usesSpaces: true,
|
|
961
|
+
verb: {
|
|
962
|
+
position: "start",
|
|
963
|
+
subjectDrop: true
|
|
964
|
+
},
|
|
965
|
+
references: {
|
|
966
|
+
me: "\u0623\u0646\u0627",
|
|
967
|
+
// "I/me" - first person
|
|
968
|
+
it: "\u0647\u0648",
|
|
969
|
+
// "it" (masculine)
|
|
970
|
+
you: "\u0623\u0646\u062A",
|
|
971
|
+
// "you"
|
|
972
|
+
result: "\u0627\u0644\u0646\u062A\u064A\u062C\u0629",
|
|
973
|
+
event: "\u0627\u0644\u062D\u062F\u062B",
|
|
974
|
+
target: "\u0627\u0644\u0647\u062F\u0641",
|
|
975
|
+
body: "\u0627\u0644\u062C\u0633\u0645"
|
|
976
|
+
},
|
|
977
|
+
possessive: {
|
|
978
|
+
marker: "",
|
|
979
|
+
// No explicit marker - uses possessive pronouns
|
|
980
|
+
markerPosition: "after-object",
|
|
981
|
+
usePossessiveAdjectives: true,
|
|
982
|
+
specialForms: {
|
|
983
|
+
// Arabic: "value لي" (value for-me) - possessive pronoun follows property
|
|
984
|
+
me: "\u0644\u064A",
|
|
985
|
+
// "for me" / "mine"
|
|
986
|
+
it: "\u0644\u0647",
|
|
987
|
+
// "for it" / "its"
|
|
988
|
+
you: "\u0644\u0643"
|
|
989
|
+
// "for you" / "yours"
|
|
990
|
+
},
|
|
991
|
+
keywords: {
|
|
992
|
+
// "my" / "mine"
|
|
993
|
+
\u0644\u064A: "me",
|
|
994
|
+
// "your" / "yours"
|
|
995
|
+
\u0644\u0643: "you",
|
|
996
|
+
// "its/his/her"
|
|
997
|
+
\u0644\u0647: "it",
|
|
998
|
+
// his/its (masculine)
|
|
999
|
+
\u0644\u0647\u0627: "it"
|
|
1000
|
+
// her/its (feminine)
|
|
1001
|
+
}
|
|
1002
|
+
},
|
|
1003
|
+
roleMarkers: {
|
|
1004
|
+
destination: { primary: "\u0639\u0644\u0649", alternatives: ["\u0641\u064A", "\u0625\u0644\u0649", "\u0628"], position: "before" },
|
|
1005
|
+
source: { primary: "\u0645\u0646", position: "before" },
|
|
1006
|
+
patient: { primary: "", position: "before" },
|
|
1007
|
+
style: { primary: "\u0628\u0640", alternatives: ["\u0628\u0627\u0633\u062A\u062E\u062F\u0627\u0645"], position: "before" }
|
|
1008
|
+
},
|
|
1009
|
+
keywords: {
|
|
1010
|
+
// Class/Attribute operations
|
|
1011
|
+
toggle: { primary: "\u0628\u062F\u0651\u0644", alternatives: ["\u0628\u062F\u0644", "\u063A\u064A\u0651\u0631", "\u063A\u064A\u0631"], normalized: "toggle" },
|
|
1012
|
+
add: { primary: "\u0623\u0636\u0641", alternatives: ["\u0627\u0636\u0641", "\u0632\u0650\u062F"], normalized: "add" },
|
|
1013
|
+
remove: { primary: "\u0627\u062D\u0630\u0641", alternatives: ["\u0623\u0632\u0644", "\u0627\u0645\u0633\u062D"], normalized: "remove" },
|
|
1014
|
+
// Content operations
|
|
1015
|
+
put: { primary: "\u0636\u0639", alternatives: ["\u0627\u062C\u0639\u0644"], normalized: "put" },
|
|
1016
|
+
append: { primary: "\u0623\u0644\u062D\u0642", normalized: "append" },
|
|
1017
|
+
prepend: { primary: "\u0633\u0628\u0642", normalized: "prepend" },
|
|
1018
|
+
take: { primary: "\u062E\u0630", alternatives: ["\u0627\u062D\u0635\u0644"], normalized: "take" },
|
|
1019
|
+
make: { primary: "\u0627\u0635\u0646\u0639", alternatives: ["\u0623\u0646\u0634\u0626"], normalized: "make" },
|
|
1020
|
+
clone: { primary: "\u0627\u0633\u062A\u0646\u0633\u062E", alternatives: ["\u0627\u0646\u0633\u062E"], normalized: "clone" },
|
|
1021
|
+
swap: { primary: "\u0627\u0633\u062A\u0628\u062F\u0644", alternatives: ["\u062A\u0628\u0627\u062F\u0644"], normalized: "swap" },
|
|
1022
|
+
morph: { primary: "\u062D\u0648\u0651\u0644", alternatives: ["\u063A\u064A\u0651\u0631"], normalized: "morph" },
|
|
1023
|
+
// Variable operations
|
|
1024
|
+
set: { primary: "\u0627\u0636\u0628\u0637", alternatives: ["\u0639\u064A\u0651\u0646", "\u062D\u062F\u062F"], normalized: "set" },
|
|
1025
|
+
get: { primary: "\u0627\u062D\u0635\u0644", alternatives: ["\u062E\u0630"], normalized: "get" },
|
|
1026
|
+
increment: { primary: "\u0632\u0650\u062F", alternatives: ["\u0632\u062F", "\u0627\u0631\u0641\u0639"], normalized: "increment" },
|
|
1027
|
+
decrement: { primary: "\u0623\u0646\u0642\u0635", alternatives: ["\u0627\u0646\u0642\u0635", "\u0642\u0644\u0644"], normalized: "decrement" },
|
|
1028
|
+
log: { primary: "\u0633\u062C\u0644", normalized: "log" },
|
|
1029
|
+
// Visibility
|
|
1030
|
+
show: { primary: "\u0627\u0638\u0647\u0631", alternatives: ["\u0623\u0638\u0647\u0631", "\u0627\u0639\u0631\u0636"], normalized: "show" },
|
|
1031
|
+
hide: { primary: "\u0627\u062E\u0641", alternatives: ["\u0623\u062E\u0641\u0650", "\u0627\u062E\u0641\u064A"], normalized: "hide" },
|
|
1032
|
+
transition: { primary: "\u0627\u0646\u062A\u0642\u0627\u0644", alternatives: ["\u0627\u0646\u062A\u0642\u0644"], normalized: "transition" },
|
|
1033
|
+
// Events
|
|
1034
|
+
on: { primary: "\u0639\u0644\u0649", alternatives: ["\u0639\u0646\u062F", "\u0644\u062F\u0649", "\u062D\u064A\u0646"], normalized: "on" },
|
|
1035
|
+
trigger: { primary: "\u062A\u0634\u063A\u064A\u0644", alternatives: ["\u0623\u0637\u0644\u0642", "\u0641\u0639\u0651\u0644"], normalized: "trigger" },
|
|
1036
|
+
send: { primary: "\u0623\u0631\u0633\u0644", normalized: "send" },
|
|
1037
|
+
// DOM focus
|
|
1038
|
+
focus: { primary: "\u062A\u0631\u0643\u064A\u0632", alternatives: ["\u0631\u0643\u0632"], normalized: "focus" },
|
|
1039
|
+
blur: { primary: "\u0636\u0628\u0627\u0628\u064A\u0629", alternatives: ["\u0634\u0648\u0634"], normalized: "blur" },
|
|
1040
|
+
// Navigation
|
|
1041
|
+
go: { primary: "\u0627\u0630\u0647\u0628", alternatives: ["\u0627\u0646\u062A\u0642\u0644"], normalized: "go" },
|
|
1042
|
+
// Async
|
|
1043
|
+
wait: { primary: "\u0627\u0646\u062A\u0638\u0631", normalized: "wait" },
|
|
1044
|
+
fetch: { primary: "\u0627\u062D\u0636\u0631", alternatives: ["\u062C\u0644\u0628"], normalized: "fetch" },
|
|
1045
|
+
settle: { primary: "\u0627\u0633\u062A\u0642\u0631", normalized: "settle" },
|
|
1046
|
+
// Control flow
|
|
1047
|
+
if: { primary: "\u0625\u0630\u0627", normalized: "if" },
|
|
1048
|
+
when: { primary: "\u0639\u0646\u062F\u0645\u0627", normalized: "when" },
|
|
1049
|
+
where: { primary: "\u0623\u064A\u0646", normalized: "where" },
|
|
1050
|
+
else: { primary: "\u0648\u0625\u0644\u0627", alternatives: ["\u062E\u0644\u0627\u0641 \u0630\u0644\u0643"], normalized: "else" },
|
|
1051
|
+
repeat: { primary: "\u0643\u0631\u0631", normalized: "repeat" },
|
|
1052
|
+
for: { primary: "\u0644\u0643\u0644", normalized: "for" },
|
|
1053
|
+
while: { primary: "\u0628\u064A\u0646\u0645\u0627", normalized: "while" },
|
|
1054
|
+
continue: { primary: "\u0648\u0627\u0635\u0644", normalized: "continue" },
|
|
1055
|
+
halt: { primary: "\u0623\u0648\u0642\u0641", alternatives: ["\u062A\u0648\u0642\u0641"], normalized: "halt" },
|
|
1056
|
+
throw: { primary: "\u0627\u0631\u0645", alternatives: ["\u0627\u0631\u0645\u0650"], normalized: "throw" },
|
|
1057
|
+
call: { primary: "\u0627\u0633\u062A\u062F\u0639", alternatives: ["\u0646\u0627\u062F\u0650"], normalized: "call" },
|
|
1058
|
+
return: { primary: "\u0627\u0631\u062C\u0639", alternatives: ["\u0639\u064F\u062F"], normalized: "return" },
|
|
1059
|
+
then: { primary: "\u062B\u0645", alternatives: ["\u0628\u0639\u062F\u0647\u0627", "\u062B\u0645\u0651"], normalized: "then" },
|
|
1060
|
+
and: { primary: "\u0648\u0623\u064A\u0636\u0627\u064B", alternatives: ["\u0623\u064A\u0636\u0627\u064B"], normalized: "and" },
|
|
1061
|
+
end: { primary: "\u0646\u0647\u0627\u064A\u0629", alternatives: ["\u0627\u0646\u062A\u0647\u0649", "\u0622\u062E\u0631"], normalized: "end" },
|
|
1062
|
+
// Advanced
|
|
1063
|
+
js: { primary: "\u062C\u0627\u0641\u0627\u0633\u0643\u0631\u0628\u062A", alternatives: ["js"], normalized: "js" },
|
|
1064
|
+
async: { primary: "\u0645\u062A\u0632\u0627\u0645\u0646", normalized: "async" },
|
|
1065
|
+
tell: { primary: "\u0623\u062E\u0628\u0631", normalized: "tell" },
|
|
1066
|
+
default: { primary: "\u0627\u0641\u062A\u0631\u0627\u0636\u064A", normalized: "default" },
|
|
1067
|
+
init: { primary: "\u062A\u0647\u064A\u0626\u0629", alternatives: ["\u0628\u062F\u0621"], normalized: "init" },
|
|
1068
|
+
behavior: { primary: "\u0633\u0644\u0648\u0643", normalized: "behavior" },
|
|
1069
|
+
install: { primary: "\u062A\u062B\u0628\u064A\u062A", alternatives: ["\u062B\u0628\u0651\u062A"], normalized: "install" },
|
|
1070
|
+
measure: { primary: "\u0642\u064A\u0627\u0633", alternatives: ["\u0642\u0650\u0633"], normalized: "measure" },
|
|
1071
|
+
// Modifiers
|
|
1072
|
+
into: { primary: "\u0641\u064A", alternatives: ["\u0625\u0644\u0649"], normalized: "into" },
|
|
1073
|
+
before: { primary: "\u0642\u0628\u0644", normalized: "before" },
|
|
1074
|
+
after: { primary: "\u0628\u0639\u062F", normalized: "after" },
|
|
1075
|
+
// Event modifiers (for repeat until event)
|
|
1076
|
+
until: { primary: "\u062D\u062A\u0649", normalized: "until" },
|
|
1077
|
+
event: { primary: "\u062D\u062F\u062B", normalized: "event" },
|
|
1078
|
+
from: { primary: "\u0645\u0646", normalized: "from" }
|
|
1079
|
+
},
|
|
1080
|
+
tokenization: {
|
|
1081
|
+
prefixes: ["\u0627\u0644", "\u0648", "\u0641", "\u0628", "\u0643", "\u0644"]
|
|
1082
|
+
}
|
|
1083
|
+
};
|
|
1084
|
+
|
|
1085
|
+
// src/tokenizers/arabic.ts
|
|
1086
|
+
/**
 * Character classifier built from [start, end] code-point pairs, one pair per
 * Unicode block that carries Arabic script. The pairs are handed unchanged to
 * createUnicodeRangeClassifier (defined outside this chunk); boundary handling
 * is whatever that helper implements.
 */
var isArabic2 = createUnicodeRangeClassifier([
  [0x0600, 0x06ff], // Arabic
  [0x0750, 0x077f], // Arabic Supplement
  [0x08a0, 0x08ff], // Arabic Extended-A
  [0xfb50, 0xfdff], // Arabic Presentation Forms-A
  [0xfe70, 0xfeff] // Arabic Presentation Forms-B
]);
|
|
1098
|
+
/**
 * Attached prefix letters, each written with a trailing tatweel (U+0640):
 *   bi- (with, by), li- (to, for), ka- (like, as), wa- (and).
 */
var ATTACHED_PREFIXES = /* @__PURE__ */ new Set([
  "\u0628\u0640", // bi-
  "\u0644\u0640", // li-
  "\u0643\u0640", // ka-
  "\u0648\u0640" // wa-
]);
|
|
1108
|
+
/**
 * Proclitic lookup table: prefix text -> { normalized, type }.
 * Holds the single-letter conjunctions and prepositions plus the two-letter
 * conjunction + preposition sequences. Two-letter sequences are typed as
 * "conjunction".
 */
var PROCLITICS = /* @__PURE__ */ (() => {
  const conjunction = (normalized) => ({ normalized, type: "conjunction" });
  const preposition = (normalized) => ({ normalized, type: "preposition" });
  return new Map([
    // Conjunctions (single character)
    ["\u0648", conjunction("and")], // wa - "and"
    ["\u0641", conjunction("then")], // fa - "then/so"
    // Attached prefix prepositions
    ["\u0628", preposition("with")], // bi- (with, by)
    ["\u0644", preposition("to")], // li- (to, for)
    ["\u0643", preposition("like")], // ka- (like, as)
    // Multi-proclitic sequences (conjunction + preposition)
    ["\u0648\u0644", conjunction("and-to")], // wa + li-
    ["\u0648\u0628", conjunction("and-with")], // wa + bi-
    ["\u0648\u0643", conjunction("and-like")], // wa + ka-
    ["\u0641\u0644", conjunction("then-to")], // fa + li-
    ["\u0641\u0628", conjunction("then-with")], // fa + bi-
    ["\u0641\u0643", conjunction("then-like")] // fa + ka-
  ]);
})();
|
|
1135
|
+
/**
 * Temporal marker words, all normalized to "on". Each entry also records a
 * formality label, a confidence score and a short English gloss.
 */
var TEMPORAL_MARKERS = /* @__PURE__ */ (() => {
  const marker = (formality, confidence, description) => ({
    normalized: "on",
    formality,
    confidence,
    description
  });
  return new Map([
    ["\u0639\u0646\u062F\u0645\u0627", marker("formal", 0.95, "when (formal MSA)")],
    ["\u062D\u064A\u0646\u0645\u0627", marker("formal", 0.93, "when/whenever (formal)")],
    ["\u0639\u0646\u062F", marker("neutral", 0.88, "at/when (neutral)")],
    ["\u062D\u064A\u0646", marker("neutral", 0.85, "when/time (neutral)")],
    ["\u0644\u0645\u0651\u0627", marker("dialectal", 0.7, "when (dialectal, with shadda)")],
    ["\u0644\u0645\u0627", marker("dialectal", 0.68, "when (dialectal, no diacritic)")],
    ["\u0644\u062F\u0649", marker("neutral", 0.82, "at/with (temporal)")]
  ]);
})();
|
|
1200
|
+
/**
 * Standalone Arabic prepositions.
 * \u0639\u0646\u062F is deliberately absent: it lives in TEMPORAL_MARKERS, where it carries
 * formality/confidence metadata.
 */
var PREPOSITIONS = /* @__PURE__ */ new Set([
  "\u0641\u064A", // fī (in)
  "\u0639\u0644\u0649", // ʿalā (on)
  "\u0645\u0646", // min (from)
  "\u0625\u0644\u0649", // ilā (to)
  "\u0627\u0644\u0649", // ilā (alternative spelling)
  "\u0645\u0639", // maʿa (with)
  "\u0639\u0646", // ʿan (about, from)
  "\u0642\u0628\u0644", // qabl (before)
  "\u0628\u0639\u062F", // baʿd (after)
  "\u0628\u064A\u0646" // bayn (between)
]);
|
|
1223
|
+
/**
 * Extra native -> normalized keyword entries passed to
 * initializeKeywordsFromProfile alongside the Arabic profile.
 * Written as [native, normalized] pairs and expanded into
 * { native, normalized } records, in the same order.
 */
var ARABIC_EXTRAS = [
  // Values/Literals
  ["\u0635\u062D\u064A\u062D", "true"],
  ["\u062E\u0637\u0623", "false"],
  ["null", "null"],
  ["\u0641\u0627\u0631\u063A", "null"],
  ["\u063A\u064A\u0631 \u0645\u0639\u0631\u0641", "undefined"],
  // Positional
  ["\u0627\u0644\u0623\u0648\u0644", "first"],
  ["\u0623\u0648\u0644", "first"],
  ["\u0627\u0644\u0623\u062E\u064A\u0631", "last"],
  ["\u0622\u062E\u0631", "last"],
  ["\u0627\u0644\u062A\u0627\u0644\u064A", "next"],
  ["\u0627\u0644\u0633\u0627\u0628\u0642", "previous"],
  ["\u0627\u0644\u0623\u0642\u0631\u0628", "closest"],
  ["\u0627\u0644\u0623\u0628", "parent"],
  // Events
  ["\u0627\u0644\u0646\u0642\u0631", "click"],
  ["\u0646\u0642\u0631", "click"],
  ["\u0627\u0644\u0625\u062F\u062E\u0627\u0644", "input"],
  ["\u0625\u062F\u062E\u0627\u0644", "input"],
  ["\u0627\u0644\u062A\u063A\u064A\u064A\u0631", "change"],
  ["\u062A\u063A\u064A\u064A\u0631", "change"],
  ["\u0627\u0644\u0625\u0631\u0633\u0627\u0644", "submit"],
  ["\u0625\u0631\u0633\u0627\u0644", "submit"],
  ["\u0627\u0644\u062A\u0631\u0643\u064A\u0632", "focus"],
  ["\u0641\u0642\u062F\u0627\u0646 \u0627\u0644\u062A\u0631\u0643\u064A\u0632", "blur"],
  ["\u0636\u063A\u0637", "keydown"],
  ["\u0631\u0641\u0639", "keyup"],
  ["\u062A\u0645\u0631\u064A\u0631 \u0627\u0644\u0641\u0623\u0631\u0629", "mouseover"],
  ["\u0645\u063A\u0627\u062F\u0631\u0629 \u0627\u0644\u0641\u0623\u0631\u0629", "mouseout"],
  ["\u062A\u062D\u0645\u064A\u0644", "load"],
  ["\u062A\u0645\u0631\u064A\u0631", "scroll"],
  // References
  ["\u0623\u0646\u0627", "me"],
  ["\u0647\u0648", "it"],
  ["\u0647\u064A", "it"],
  ["\u0627\u0644\u0646\u062A\u064A\u062C\u0629", "result"],
  ["\u0627\u0644\u062D\u062F\u062B", "event"],
  ["\u0627\u0644\u0647\u062F\u0641", "target"],
  // Time units
  ["\u062B\u0627\u0646\u064A\u0629", "s"],
  ["\u062B\u0648\u0627\u0646\u064A", "s"],
  ["\u0645\u0644\u064A \u062B\u0627\u0646\u064A\u0629", "ms"],
  ["\u062F\u0642\u064A\u0642\u0629", "m"],
  ["\u062F\u0642\u0627\u0626\u0642", "m"],
  ["\u0633\u0627\u0639\u0629", "h"],
  ["\u0633\u0627\u0639\u0627\u062A", "h"],
  // Temporal markers are intentionally not listed here: they live in the
  // TEMPORAL_MARKERS map, which carries formality metadata.
  // Additional spelling variants (without diacritics)
  ["\u0628\u062F\u0644", "toggle"],
  ["\u063A\u064A\u0631", "toggle"],
  ["\u0627\u0636\u0641", "add"],
  ["\u0627\u0632\u0644", "remove"],
  ["\u0627\u0636\u0639", "put"],
  ["\u064A\u0636\u0639", "put"],
  ["\u0627\u062C\u0639\u0644", "put"],
  ["\u0639\u064A\u0646", "set"],
  ["\u0632\u062F", "increment"],
  ["\u0627\u0631\u0641\u0639", "increment"],
  ["\u0627\u0646\u0642\u0635", "decrement"],
  ["\u0642\u0644\u0644", "decrement"],
  ["\u0633\u062C\u0644", "log"],
  ["\u0627\u0638\u0647\u0631", "show"],
  ["\u0627\u0639\u0631\u0636", "show"],
  ["\u0627\u062E\u0641", "hide"],
  ["\u0627\u062E\u0641\u064A", "hide"],
  ["\u0634\u063A\u0644", "trigger"],
  ["\u0627\u0631\u0633\u0644", "send"],
  ["\u0631\u0643\u0632", "focus"],
  ["\u0634\u0648\u0634", "blur"],
  ["\u0627\u0630\u0627", "if"],
  ["\u0644\u0648", "if"],
  ["\u0648\u0627\u0644\u0627", "else"],
  ["\u062A\u0648\u0642\u0641", "halt"],
  ["\u0627\u0646\u0633\u062E", "clone"],
  // Control flow helpers
  ["\u0625\u0630\u0646", "then"],
  ["\u0641\u0625\u0646", "then"],
  ["\u0646\u0647\u0627\u064A\u0629", "end"],
  // Modifiers
  ["\u0642\u0628\u0644", "before"],
  ["\u0628\u0639\u062F", "after"]
].map(([native, normalized]) => ({ native, normalized }));
|
|
1308
|
+
/**
 * Arabic time-unit spellings recognised after a number, passed to
 * tryNumberWithTimeUnits. Multi-word and longer spellings are listed first.
 *
 * Each record is { pattern, suffix, length, caseInsensitive }. `length` is
 * derived from the pattern itself: the underscore millisecond spelling was
 * previously declared as 8 although the pattern is 9 UTF-16 units long
 * (3 letters + "_" + 5 letters), so its declared length disagreed with the
 * pattern.
 */
var ARABIC_TIME_UNITS = [
  ["\u0645\u0644\u064A \u062B\u0627\u0646\u064A\u0629", "ms"],
  ["\u0645\u0644\u064A_\u062B\u0627\u0646\u064A\u0629", "ms"],
  ["\u062F\u0642\u0627\u0626\u0642", "m"],
  ["\u062F\u0642\u064A\u0642\u0629", "m"],
  ["\u062B\u0648\u0627\u0646\u064A", "s"],
  ["\u062B\u0627\u0646\u064A\u0629", "s"],
  ["\u0633\u0627\u0639\u0627\u062A", "h"],
  ["\u0633\u0627\u0639\u0629", "h"]
].map(([pattern, suffix]) => ({
  pattern,
  suffix,
  length: pattern.length,
  caseInsensitive: false
}));
|
|
1318
|
+
/**
 * Tokenizer for Arabic input (language "ar", direction "rtl").
 *
 * Keywords come from arabicProfile plus ARABIC_EXTRAS; word forms that are not
 * direct keyword hits are handed to the inherited tryMorphKeywordMatch, backed
 * by an ArabicMorphologicalNormalizer. Helpers such as trySelector, tryString,
 * tryUrl, tryVariableRef, lookupKeyword and tryNumberWithTimeUnits are
 * inherited from BaseTokenizer (defined outside this chunk).
 */
var ArabicTokenizer = class extends BaseTokenizer {
  constructor() {
    super();
    this.language = "ar";
    this.direction = "rtl";
    this.initializeKeywordsFromProfile(arabicProfile, ARABIC_EXTRAS);
    this.normalizer = new ArabicMorphologicalNormalizer();
  }
  /**
   * Split `input` into tokens.
   *
   * At every non-whitespace position the matchers are tried in a fixed order:
   * event modifier / selector, quoted string, URL, number (with optional time
   * unit), variable reference, standalone preposition, Arabic proclitic,
   * Arabic word, ASCII word. The first matcher that yields a token wins; a
   * character nobody claims is skipped.
   *
   * @param {string} input - Source text to tokenize.
   * @returns {TokenStreamImpl} Token stream tagged with language "ar".
   */
  tokenize(input) {
    const tokens = [];
    let pos = 0;
    // Record a successful match and move the cursor past it.
    const accept = (token) => {
      if (!token) return false;
      tokens.push(token);
      pos = token.position.end;
      return true;
    };
    while (pos < input.length) {
      const ch = input[pos];
      if (isWhitespace(ch)) {
        pos++;
        continue;
      }
      if (
        isSelectorStart(ch) &&
        (accept(this.tryEventModifier(input, pos)) || accept(this.trySelector(input, pos)))
      ) {
        continue;
      }
      if (isQuote(ch) && accept(this.tryString(input, pos))) continue;
      if (isUrlStart(input, pos) && accept(this.tryUrl(input, pos))) continue;
      if (isDigit(ch) && accept(this.extractArabicNumber(input, pos))) continue;
      if (accept(this.tryVariableRef(input, pos))) continue;
      // Standalone prepositions are tried before generic words so they are
      // emitted as particles carrying prepositionValue metadata.
      if (accept(this.tryPreposition(input, pos))) continue;
      if (isArabic2(ch)) {
        // Only the proclitic itself is consumed here; the rest of the word is
        // tokenized by the next loop iteration.
        const procliticResult = this.tryProclitic(input, pos);
        if (procliticResult) {
          tokens.push(procliticResult.conjunction);
          pos = procliticResult.conjunction.position.end;
          continue;
        }
        if (accept(this.extractArabicWord(input, pos))) continue;
      }
      if (isAsciiIdentifierChar(ch) && accept(this.extractAsciiWord(input, pos))) continue;
      pos++;
    }
    return new TokenStreamImpl(tokens, "ar");
  }
  /**
   * Classify a raw token string.
   *
   * @param {string} token - Token text.
   * @returns {string} "particle" for a known preposition, "keyword" for a
   *   registered keyword, "selector" for text starting with "#", "." or "[",
   *   "literal" for quoted text or text starting with a digit, otherwise
   *   "identifier".
   */
  classifyToken(token) {
    if (PREPOSITIONS.has(token)) return "particle";
    if (this.isKeyword(token)) return "keyword";
    if (token.startsWith("#") || token.startsWith(".") || token.startsWith("[")) return "selector";
    if (token.startsWith('"') || token.startsWith("'")) return "literal";
    if (/^\d/.test(token)) return "literal";
    return "identifier";
  }
  /**
   * Try to match a standalone Arabic preposition at `pos`.
   * Longer prepositions are tried first, and a match only counts when the
   * preposition is followed by end of input, whitespace, or a non-Arabic
   * character (so it is not merely the start of a longer word).
   * Attaches prepositionValue metadata for disambiguation in pattern matching.
   *
   * @param {string} input - Source text.
   * @param {number} pos - Index at which to look.
   * @returns {object|null} A "particle" token with metadata, or null.
   */
  tryPreposition(input, pos) {
    // The preposition set never changes, so sort it once per instance instead
    // of on every call.
    if (!this.arabicPrepositionsByLength) {
      this.arabicPrepositionsByLength = Array.from(PREPOSITIONS).sort((a, b) => b.length - a.length);
    }
    for (const prep of this.arabicPrepositionsByLength) {
      if (!input.startsWith(prep, pos)) continue;
      const nextPos = pos + prep.length;
      const atBoundary =
        nextPos >= input.length || isWhitespace(input[nextPos]) || !isArabic2(input[nextPos]);
      if (!atBoundary) continue;
      return {
        ...createToken(prep, "particle", createPosition(pos, nextPos)),
        metadata: {
          prepositionValue: prep
        }
      };
    }
    return null;
  }
  /**
   * Try to extract a proclitic (conjunction or preposition) that is attached
   * to the following word.
   *
   * Arabic proclitics attach directly to words without a space:
   * - والنقر → و + النقر (and + the-click)
   * - فالتبديل → ف + التبديل (then + the-toggle)
   * - بالنقر → ب + النقر (with + the-click)
   * - ولالنقر → و + ل + النقر (and + to + the-click)
   *
   * This enables polysyndetic coordination (A وB وC), attached prepositions
   * and multi-proclitic sequences.
   *
   * Returns null if:
   * - the full word is a recognized keyword, temporal marker or preposition
   *   (e.g. بدل must NOT be split into ب + دل)
   * - the text does not start with a proclitic character/sequence
   * - the proclitic is standalone (not followed by an Arabic character)
   * - the remaining word is too short (< 2 chars, to avoid false positives)
   *
   * A two-character proclitic is preferred, unless stripping only the first
   * character already leaves a keyword; in that case the single-character
   * split is used.
   *
   * @see NATIVE_REVIEW_NEEDED.md for implementation rationale
   * @param {string} input - Source text.
   * @param {number} pos - Index of the first character of the word.
   * @returns {{conjunction: object}|null} Wrapper holding the proclitic token
   *   (kind "conjunction" or "particle"), or null.
   */
  tryProclitic(input, pos) {
    let wordEnd = pos;
    while (wordEnd < input.length && (isArabic2(input[wordEnd]) || input[wordEnd] === "\u0640")) {
      wordEnd++;
    }
    const fullWord = input.slice(pos, wordEnd);
    if (this.lookupKeyword(fullWord) || TEMPORAL_MARKERS.has(fullWord) || PREPOSITIONS.has(fullWord)) {
      return null;
    }
    // True when at least two consecutive Arabic characters start at `from`.
    const hasStem = (from) => {
      let count = 0;
      for (let i = from; i < input.length && isArabic2(input[i]); i++) {
        count++;
        if (count >= 2) return true;
      }
      return false;
    };
    const toResult = (text, entry) => ({
      conjunction: createToken(
        text,
        entry.type === "conjunction" ? "conjunction" : "particle",
        createPosition(pos, pos + text.length),
        entry.normalized
      )
    });
    if (pos + 2 <= input.length) {
      const twoChar = input.slice(pos, pos + 2);
      const twoCharEntry = PROCLITICS.get(twoChar);
      if (twoCharEntry && hasStem(pos + 2)) {
        const leadEntry = PROCLITICS.get(input[pos]);
        const restIsKeyword = leadEntry && this.lookupKeyword(input.slice(pos + 1, wordEnd));
        if (!restIsKeyword) {
          return toResult(twoChar, twoCharEntry);
        }
        // Otherwise fall through and split off the single character only.
      }
    }
    const char = input[pos];
    const entry = PROCLITICS.get(char);
    if (!entry || !hasStem(pos + 1)) {
      return null;
    }
    return toResult(char, entry);
  }
  /**
   * Extract an Arabic word starting at `startPos` (a run of Arabic characters
   * and tatweel) and classify it, in this order:
   * temporal marker (keyword with temporalFormality / temporalConfidence
   * metadata), direct keyword, preposition (particle with prepositionValue
   * metadata), morphological keyword match, plain identifier.
   *
   * @param {string} input - Source text.
   * @param {number} startPos - Index of the first character of the word.
   * @returns {object|null} The token, or null when no Arabic character is
   *   found at `startPos`.
   */
  extractArabicWord(input, startPos) {
    let pos = startPos;
    while (pos < input.length && (isArabic2(input[pos]) || input[pos] === "\u0640")) {
      pos++;
    }
    const word = input.slice(startPos, pos);
    if (!word) return null;
    const span = createPosition(startPos, pos);
    const temporalMarker = TEMPORAL_MARKERS.get(word);
    if (temporalMarker) {
      return {
        ...createToken(word, "keyword", span, temporalMarker.normalized),
        metadata: {
          temporalFormality: temporalMarker.formality,
          temporalConfidence: temporalMarker.confidence
        }
      };
    }
    const keywordEntry = this.lookupKeyword(word);
    if (keywordEntry) {
      return createToken(word, "keyword", span, keywordEntry.normalized);
    }
    if (PREPOSITIONS.has(word)) {
      return {
        ...createToken(word, "particle", span),
        metadata: {
          prepositionValue: word
        }
      };
    }
    const morphToken = this.tryMorphKeywordMatch(word, startPos, pos);
    if (morphToken) return morphToken;
    return createToken(word, "identifier", span);
  }
  /**
   * Extract a run of ASCII identifier characters as an "identifier" token.
   *
   * @param {string} input - Source text.
   * @param {number} startPos - Index of the first character.
   * @returns {object|null} The token, or null when nothing matches.
   */
  extractAsciiWord(input, startPos) {
    let pos = startPos;
    while (pos < input.length && isAsciiIdentifierChar(input[pos])) {
      pos++;
    }
    const word = input.slice(startPos, pos);
    if (!word) return null;
    return createToken(word, "identifier", createPosition(startPos, pos));
  }
  /**
   * Extract a number, including Arabic time unit suffixes.
   * Arabic allows a space between number and unit, so whitespace skipping is
   * enabled; a leading sign is not accepted.
   *
   * @param {string} input - Source text.
   * @param {number} startPos - Index of the first digit.
   * @returns {object|null} Whatever tryNumberWithTimeUnits returns.
   */
  extractArabicNumber(input, startPos) {
    return this.tryNumberWithTimeUnits(input, startPos, ARABIC_TIME_UNITS, {
      allowSign: false,
      skipWhitespace: true
    });
  }
};
|
|
1614
|
+
// Module-level ArabicTokenizer instance; this is the object passed to
// registerLanguage and exported below.
var arabicTokenizer = new ArabicTokenizer();
|
|
1615
|
+
|
|
1616
|
+
// src/languages/ar.ts
|
|
1617
|
+
// Runs at module evaluation: passes the "ar" code, the tokenizer instance and
// the Arabic profile to registerLanguage (defined outside this chunk).
registerLanguage("ar", arabicTokenizer, arabicProfile);
|
|
1618
|
+
export {
|
|
1619
|
+
arabicProfile,
|
|
1620
|
+
arabicTokenizer
|
|
1621
|
+
};
|
|
1622
|
+
//# sourceMappingURL=ar.js.map
|