@lokascript/semantic 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +686 -0
- package/dist/browser-ar.ar.global.js +2 -0
- package/dist/browser-core.core.global.js +2 -0
- package/dist/browser-de.de.global.js +2 -0
- package/dist/browser-east-asian.east-asian.global.js +2 -0
- package/dist/browser-en-tr.en-tr.global.js +2 -0
- package/dist/browser-en.en.global.js +2 -0
- package/dist/browser-es-en.es-en.global.js +2 -0
- package/dist/browser-es.es.global.js +2 -0
- package/dist/browser-fr.fr.global.js +2 -0
- package/dist/browser-id.id.global.js +2 -0
- package/dist/browser-ja.ja.global.js +2 -0
- package/dist/browser-ko.ko.global.js +2 -0
- package/dist/browser-lazy.lazy.global.js +2 -0
- package/dist/browser-priority.priority.global.js +2 -0
- package/dist/browser-pt.pt.global.js +2 -0
- package/dist/browser-qu.qu.global.js +2 -0
- package/dist/browser-sw.sw.global.js +2 -0
- package/dist/browser-tr.tr.global.js +2 -0
- package/dist/browser-western.western.global.js +2 -0
- package/dist/browser-zh.zh.global.js +2 -0
- package/dist/browser.global.js +3 -0
- package/dist/browser.global.js.map +1 -0
- package/dist/index.cjs +35051 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +3426 -0
- package/dist/index.d.ts +3426 -0
- package/dist/index.js +34890 -0
- package/dist/index.js.map +1 -0
- package/dist/languages/ar.d.ts +78 -0
- package/dist/languages/ar.js +1622 -0
- package/dist/languages/ar.js.map +1 -0
- package/dist/languages/de.d.ts +38 -0
- package/dist/languages/de.js +1168 -0
- package/dist/languages/de.js.map +1 -0
- package/dist/languages/en.d.ts +44 -0
- package/dist/languages/en.js +3491 -0
- package/dist/languages/en.js.map +1 -0
- package/dist/languages/es.d.ts +52 -0
- package/dist/languages/es.js +1493 -0
- package/dist/languages/es.js.map +1 -0
- package/dist/languages/fr.d.ts +37 -0
- package/dist/languages/fr.js +1159 -0
- package/dist/languages/fr.js.map +1 -0
- package/dist/languages/id.d.ts +35 -0
- package/dist/languages/id.js +1152 -0
- package/dist/languages/id.js.map +1 -0
- package/dist/languages/ja.d.ts +53 -0
- package/dist/languages/ja.js +1430 -0
- package/dist/languages/ja.js.map +1 -0
- package/dist/languages/ko.d.ts +51 -0
- package/dist/languages/ko.js +1729 -0
- package/dist/languages/ko.js.map +1 -0
- package/dist/languages/pt.d.ts +37 -0
- package/dist/languages/pt.js +1127 -0
- package/dist/languages/pt.js.map +1 -0
- package/dist/languages/qu.d.ts +36 -0
- package/dist/languages/qu.js +1143 -0
- package/dist/languages/qu.js.map +1 -0
- package/dist/languages/sw.d.ts +35 -0
- package/dist/languages/sw.js +1147 -0
- package/dist/languages/sw.js.map +1 -0
- package/dist/languages/tr.d.ts +45 -0
- package/dist/languages/tr.js +1529 -0
- package/dist/languages/tr.js.map +1 -0
- package/dist/languages/zh.d.ts +58 -0
- package/dist/languages/zh.js +1257 -0
- package/dist/languages/zh.js.map +1 -0
- package/dist/types-C4dcj53L.d.ts +600 -0
- package/package.json +202 -0
- package/src/__test-utils__/index.ts +7 -0
- package/src/__test-utils__/test-helpers.ts +8 -0
- package/src/__types__/test-helpers.ts +122 -0
- package/src/analysis/index.ts +479 -0
- package/src/ast-builder/command-mappers.ts +1133 -0
- package/src/ast-builder/expression-parser/index.ts +41 -0
- package/src/ast-builder/expression-parser/parser.ts +563 -0
- package/src/ast-builder/expression-parser/tokenizer.ts +394 -0
- package/src/ast-builder/expression-parser/types.ts +208 -0
- package/src/ast-builder/index.ts +536 -0
- package/src/ast-builder/value-converters.ts +172 -0
- package/src/bridge.ts +275 -0
- package/src/browser-ar.ts +162 -0
- package/src/browser-core.ts +231 -0
- package/src/browser-de.ts +162 -0
- package/src/browser-east-asian.ts +173 -0
- package/src/browser-en-tr.ts +165 -0
- package/src/browser-en.ts +157 -0
- package/src/browser-es-en.ts +200 -0
- package/src/browser-es.ts +170 -0
- package/src/browser-fr.ts +162 -0
- package/src/browser-id.ts +162 -0
- package/src/browser-ja.ts +162 -0
- package/src/browser-ko.ts +162 -0
- package/src/browser-lazy.ts +189 -0
- package/src/browser-priority.ts +214 -0
- package/src/browser-pt.ts +162 -0
- package/src/browser-qu.ts +162 -0
- package/src/browser-sw.ts +162 -0
- package/src/browser-tr.ts +162 -0
- package/src/browser-western.ts +181 -0
- package/src/browser-zh.ts +162 -0
- package/src/browser.ts +268 -0
- package/src/cache/index.ts +14 -0
- package/src/cache/semantic-cache.ts +344 -0
- package/src/core-bridge.ts +372 -0
- package/src/explicit/converter.ts +258 -0
- package/src/explicit/index.ts +18 -0
- package/src/explicit/parser.ts +236 -0
- package/src/explicit/renderer.ts +424 -0
- package/src/generators/command-schemas.ts +1636 -0
- package/src/generators/event-handler-generator.ts +109 -0
- package/src/generators/index.ts +117 -0
- package/src/generators/language-profiles.ts +139 -0
- package/src/generators/pattern-generator.ts +537 -0
- package/src/generators/profiles/arabic.ts +131 -0
- package/src/generators/profiles/bengali.ts +132 -0
- package/src/generators/profiles/chinese.ts +124 -0
- package/src/generators/profiles/english.ts +113 -0
- package/src/generators/profiles/french.ts +125 -0
- package/src/generators/profiles/german.ts +126 -0
- package/src/generators/profiles/hindi.ts +146 -0
- package/src/generators/profiles/index.ts +46 -0
- package/src/generators/profiles/indonesian.ts +125 -0
- package/src/generators/profiles/italian.ts +139 -0
- package/src/generators/profiles/japanese.ts +149 -0
- package/src/generators/profiles/korean.ts +127 -0
- package/src/generators/profiles/marker-templates.ts +288 -0
- package/src/generators/profiles/ms.ts +130 -0
- package/src/generators/profiles/polish.ts +249 -0
- package/src/generators/profiles/portuguese.ts +115 -0
- package/src/generators/profiles/quechua.ts +113 -0
- package/src/generators/profiles/russian.ts +260 -0
- package/src/generators/profiles/spanish.ts +130 -0
- package/src/generators/profiles/swahili.ts +129 -0
- package/src/generators/profiles/thai.ts +132 -0
- package/src/generators/profiles/tl.ts +128 -0
- package/src/generators/profiles/turkish.ts +124 -0
- package/src/generators/profiles/types.ts +165 -0
- package/src/generators/profiles/ukrainian.ts +270 -0
- package/src/generators/profiles/vietnamese.ts +133 -0
- package/src/generators/schema-error-codes.ts +160 -0
- package/src/generators/schema-validator.ts +391 -0
- package/src/index.ts +429 -0
- package/src/language-building-schema.ts +3170 -0
- package/src/language-loader.ts +394 -0
- package/src/languages/_all.ts +65 -0
- package/src/languages/ar.ts +15 -0
- package/src/languages/bn.ts +16 -0
- package/src/languages/de.ts +15 -0
- package/src/languages/en.ts +29 -0
- package/src/languages/es.ts +15 -0
- package/src/languages/fr.ts +15 -0
- package/src/languages/hi.ts +26 -0
- package/src/languages/id.ts +15 -0
- package/src/languages/index.ts +18 -0
- package/src/languages/it.ts +15 -0
- package/src/languages/ja.ts +15 -0
- package/src/languages/ko.ts +15 -0
- package/src/languages/ms.ts +16 -0
- package/src/languages/pl.ts +18 -0
- package/src/languages/pt.ts +15 -0
- package/src/languages/qu.ts +15 -0
- package/src/languages/ru.ts +26 -0
- package/src/languages/sw.ts +15 -0
- package/src/languages/th.ts +16 -0
- package/src/languages/tl.ts +16 -0
- package/src/languages/tr.ts +15 -0
- package/src/languages/uk.ts +26 -0
- package/src/languages/vi.ts +16 -0
- package/src/languages/zh.ts +15 -0
- package/src/parser/index.ts +15 -0
- package/src/parser/pattern-matcher.ts +1181 -0
- package/src/parser/semantic-parser.ts +573 -0
- package/src/parser/utils/index.ts +35 -0
- package/src/parser/utils/marker-resolution.ts +111 -0
- package/src/parser/utils/possessive-keywords.ts +43 -0
- package/src/parser/utils/role-positioning.ts +70 -0
- package/src/parser/utils/type-validation.ts +134 -0
- package/src/patterns/add/ar.ts +71 -0
- package/src/patterns/add/bn.ts +70 -0
- package/src/patterns/add/hi.ts +69 -0
- package/src/patterns/add/index.ts +87 -0
- package/src/patterns/add/it.ts +61 -0
- package/src/patterns/add/ja.ts +93 -0
- package/src/patterns/add/ko.ts +74 -0
- package/src/patterns/add/ms.ts +30 -0
- package/src/patterns/add/pl.ts +62 -0
- package/src/patterns/add/ru.ts +62 -0
- package/src/patterns/add/th.ts +49 -0
- package/src/patterns/add/tl.ts +30 -0
- package/src/patterns/add/tr.ts +71 -0
- package/src/patterns/add/uk.ts +62 -0
- package/src/patterns/add/vi.ts +61 -0
- package/src/patterns/add/zh.ts +71 -0
- package/src/patterns/builders.ts +207 -0
- package/src/patterns/decrement/bn.ts +70 -0
- package/src/patterns/decrement/de.ts +42 -0
- package/src/patterns/decrement/hi.ts +68 -0
- package/src/patterns/decrement/index.ts +79 -0
- package/src/patterns/decrement/it.ts +69 -0
- package/src/patterns/decrement/ms.ts +30 -0
- package/src/patterns/decrement/pl.ts +58 -0
- package/src/patterns/decrement/ru.ts +58 -0
- package/src/patterns/decrement/th.ts +49 -0
- package/src/patterns/decrement/tl.ts +30 -0
- package/src/patterns/decrement/tr.ts +48 -0
- package/src/patterns/decrement/uk.ts +58 -0
- package/src/patterns/decrement/vi.ts +61 -0
- package/src/patterns/decrement/zh.ts +32 -0
- package/src/patterns/en.ts +302 -0
- package/src/patterns/event-handler/ar.ts +151 -0
- package/src/patterns/event-handler/bn.ts +72 -0
- package/src/patterns/event-handler/de.ts +117 -0
- package/src/patterns/event-handler/en.ts +117 -0
- package/src/patterns/event-handler/es.ts +136 -0
- package/src/patterns/event-handler/fr.ts +117 -0
- package/src/patterns/event-handler/hi.ts +64 -0
- package/src/patterns/event-handler/id.ts +117 -0
- package/src/patterns/event-handler/index.ts +119 -0
- package/src/patterns/event-handler/it.ts +54 -0
- package/src/patterns/event-handler/ja.ts +118 -0
- package/src/patterns/event-handler/ko.ts +133 -0
- package/src/patterns/event-handler/ms.ts +30 -0
- package/src/patterns/event-handler/pl.ts +62 -0
- package/src/patterns/event-handler/pt.ts +117 -0
- package/src/patterns/event-handler/qu.ts +66 -0
- package/src/patterns/event-handler/ru.ts +62 -0
- package/src/patterns/event-handler/shared.ts +270 -0
- package/src/patterns/event-handler/sw.ts +117 -0
- package/src/patterns/event-handler/th.ts +53 -0
- package/src/patterns/event-handler/tl.ts +30 -0
- package/src/patterns/event-handler/tr.ts +170 -0
- package/src/patterns/event-handler/uk.ts +62 -0
- package/src/patterns/event-handler/vi.ts +61 -0
- package/src/patterns/event-handler/zh.ts +150 -0
- package/src/patterns/get/ar.ts +49 -0
- package/src/patterns/get/bn.ts +47 -0
- package/src/patterns/get/de.ts +32 -0
- package/src/patterns/get/hi.ts +52 -0
- package/src/patterns/get/index.ts +83 -0
- package/src/patterns/get/it.ts +56 -0
- package/src/patterns/get/ja.ts +53 -0
- package/src/patterns/get/ko.ts +53 -0
- package/src/patterns/get/ms.ts +30 -0
- package/src/patterns/get/pl.ts +57 -0
- package/src/patterns/get/ru.ts +57 -0
- package/src/patterns/get/th.ts +29 -0
- package/src/patterns/get/tl.ts +30 -0
- package/src/patterns/get/uk.ts +57 -0
- package/src/patterns/get/vi.ts +48 -0
- package/src/patterns/grammar-transformed/index.ts +39 -0
- package/src/patterns/grammar-transformed/ja.ts +1713 -0
- package/src/patterns/grammar-transformed/ko.ts +1311 -0
- package/src/patterns/grammar-transformed/tr.ts +1067 -0
- package/src/patterns/hide/ar.ts +67 -0
- package/src/patterns/hide/bn.ts +47 -0
- package/src/patterns/hide/de.ts +36 -0
- package/src/patterns/hide/hi.ts +61 -0
- package/src/patterns/hide/index.ts +91 -0
- package/src/patterns/hide/it.ts +56 -0
- package/src/patterns/hide/ja.ts +69 -0
- package/src/patterns/hide/ko.ts +69 -0
- package/src/patterns/hide/ms.ts +30 -0
- package/src/patterns/hide/pl.ts +57 -0
- package/src/patterns/hide/ru.ts +57 -0
- package/src/patterns/hide/th.ts +29 -0
- package/src/patterns/hide/tl.ts +30 -0
- package/src/patterns/hide/tr.ts +65 -0
- package/src/patterns/hide/uk.ts +57 -0
- package/src/patterns/hide/vi.ts +56 -0
- package/src/patterns/hide/zh.ts +68 -0
- package/src/patterns/increment/bn.ts +70 -0
- package/src/patterns/increment/de.ts +36 -0
- package/src/patterns/increment/hi.ts +68 -0
- package/src/patterns/increment/index.ts +79 -0
- package/src/patterns/increment/it.ts +69 -0
- package/src/patterns/increment/ms.ts +30 -0
- package/src/patterns/increment/pl.ts +58 -0
- package/src/patterns/increment/ru.ts +58 -0
- package/src/patterns/increment/th.ts +49 -0
- package/src/patterns/increment/tl.ts +30 -0
- package/src/patterns/increment/tr.ts +52 -0
- package/src/patterns/increment/uk.ts +58 -0
- package/src/patterns/increment/vi.ts +61 -0
- package/src/patterns/increment/zh.ts +32 -0
- package/src/patterns/index.ts +84 -0
- package/src/patterns/languages/en/control-flow.ts +93 -0
- package/src/patterns/languages/en/fetch.ts +62 -0
- package/src/patterns/languages/en/index.ts +42 -0
- package/src/patterns/languages/en/repeat.ts +67 -0
- package/src/patterns/languages/en/set.ts +48 -0
- package/src/patterns/languages/en/swap.ts +38 -0
- package/src/patterns/languages/en/temporal.ts +57 -0
- package/src/patterns/put/ar.ts +74 -0
- package/src/patterns/put/bn.ts +53 -0
- package/src/patterns/put/en.ts +74 -0
- package/src/patterns/put/es.ts +74 -0
- package/src/patterns/put/hi.ts +69 -0
- package/src/patterns/put/id.ts +96 -0
- package/src/patterns/put/index.ts +99 -0
- package/src/patterns/put/it.ts +56 -0
- package/src/patterns/put/ja.ts +75 -0
- package/src/patterns/put/ko.ts +67 -0
- package/src/patterns/put/ms.ts +30 -0
- package/src/patterns/put/pl.ts +81 -0
- package/src/patterns/put/ru.ts +85 -0
- package/src/patterns/put/th.ts +32 -0
- package/src/patterns/put/tl.ts +30 -0
- package/src/patterns/put/tr.ts +67 -0
- package/src/patterns/put/uk.ts +85 -0
- package/src/patterns/put/vi.ts +72 -0
- package/src/patterns/put/zh.ts +62 -0
- package/src/patterns/registry.ts +163 -0
- package/src/patterns/remove/ar.ts +71 -0
- package/src/patterns/remove/bn.ts +68 -0
- package/src/patterns/remove/hi.ts +69 -0
- package/src/patterns/remove/index.ts +87 -0
- package/src/patterns/remove/it.ts +69 -0
- package/src/patterns/remove/ja.ts +74 -0
- package/src/patterns/remove/ko.ts +78 -0
- package/src/patterns/remove/ms.ts +30 -0
- package/src/patterns/remove/pl.ts +62 -0
- package/src/patterns/remove/ru.ts +62 -0
- package/src/patterns/remove/th.ts +49 -0
- package/src/patterns/remove/tl.ts +30 -0
- package/src/patterns/remove/tr.ts +78 -0
- package/src/patterns/remove/uk.ts +62 -0
- package/src/patterns/remove/vi.ts +61 -0
- package/src/patterns/remove/zh.ts +72 -0
- package/src/patterns/set/ar.ts +84 -0
- package/src/patterns/set/bn.ts +53 -0
- package/src/patterns/set/de.ts +84 -0
- package/src/patterns/set/es.ts +92 -0
- package/src/patterns/set/fr.ts +88 -0
- package/src/patterns/set/hi.ts +56 -0
- package/src/patterns/set/id.ts +84 -0
- package/src/patterns/set/index.ts +107 -0
- package/src/patterns/set/it.ts +56 -0
- package/src/patterns/set/ja.ts +86 -0
- package/src/patterns/set/ko.ts +85 -0
- package/src/patterns/set/ms.ts +30 -0
- package/src/patterns/set/pl.ts +57 -0
- package/src/patterns/set/pt.ts +84 -0
- package/src/patterns/set/ru.ts +57 -0
- package/src/patterns/set/th.ts +31 -0
- package/src/patterns/set/tl.ts +30 -0
- package/src/patterns/set/tr.ts +107 -0
- package/src/patterns/set/uk.ts +57 -0
- package/src/patterns/set/vi.ts +53 -0
- package/src/patterns/set/zh.ts +84 -0
- package/src/patterns/show/ar.ts +67 -0
- package/src/patterns/show/bn.ts +47 -0
- package/src/patterns/show/de.ts +32 -0
- package/src/patterns/show/fr.ts +32 -0
- package/src/patterns/show/hi.ts +61 -0
- package/src/patterns/show/index.ts +95 -0
- package/src/patterns/show/it.ts +56 -0
- package/src/patterns/show/ja.ts +69 -0
- package/src/patterns/show/ko.ts +73 -0
- package/src/patterns/show/ms.ts +30 -0
- package/src/patterns/show/pl.ts +57 -0
- package/src/patterns/show/ru.ts +57 -0
- package/src/patterns/show/th.ts +29 -0
- package/src/patterns/show/tl.ts +30 -0
- package/src/patterns/show/tr.ts +65 -0
- package/src/patterns/show/uk.ts +57 -0
- package/src/patterns/show/vi.ts +56 -0
- package/src/patterns/show/zh.ts +68 -0
- package/src/patterns/take/ar.ts +51 -0
- package/src/patterns/take/index.ts +31 -0
- package/src/patterns/toggle/ar.ts +61 -0
- package/src/patterns/toggle/bn.ts +70 -0
- package/src/patterns/toggle/en.ts +61 -0
- package/src/patterns/toggle/es.ts +61 -0
- package/src/patterns/toggle/hi.ts +80 -0
- package/src/patterns/toggle/index.ts +95 -0
- package/src/patterns/toggle/it.ts +69 -0
- package/src/patterns/toggle/ja.ts +156 -0
- package/src/patterns/toggle/ko.ts +113 -0
- package/src/patterns/toggle/ms.ts +30 -0
- package/src/patterns/toggle/pl.ts +62 -0
- package/src/patterns/toggle/ru.ts +62 -0
- package/src/patterns/toggle/th.ts +50 -0
- package/src/patterns/toggle/tl.ts +30 -0
- package/src/patterns/toggle/tr.ts +88 -0
- package/src/patterns/toggle/uk.ts +62 -0
- package/src/patterns/toggle/vi.ts +61 -0
- package/src/patterns/toggle/zh.ts +99 -0
- package/src/public-api.ts +286 -0
- package/src/registry.ts +441 -0
- package/src/tokenizers/arabic.ts +723 -0
- package/src/tokenizers/base.ts +1300 -0
- package/src/tokenizers/bengali.ts +289 -0
- package/src/tokenizers/chinese.ts +481 -0
- package/src/tokenizers/english.ts +416 -0
- package/src/tokenizers/french.ts +326 -0
- package/src/tokenizers/german.ts +324 -0
- package/src/tokenizers/hindi.ts +319 -0
- package/src/tokenizers/index.ts +127 -0
- package/src/tokenizers/indonesian.ts +306 -0
- package/src/tokenizers/italian.ts +458 -0
- package/src/tokenizers/japanese.ts +447 -0
- package/src/tokenizers/korean.ts +642 -0
- package/src/tokenizers/morphology/arabic-normalizer.ts +242 -0
- package/src/tokenizers/morphology/french-normalizer.ts +268 -0
- package/src/tokenizers/morphology/german-normalizer.ts +256 -0
- package/src/tokenizers/morphology/index.ts +46 -0
- package/src/tokenizers/morphology/italian-normalizer.ts +329 -0
- package/src/tokenizers/morphology/japanese-normalizer.ts +288 -0
- package/src/tokenizers/morphology/korean-normalizer.ts +428 -0
- package/src/tokenizers/morphology/polish-normalizer.ts +264 -0
- package/src/tokenizers/morphology/portuguese-normalizer.ts +310 -0
- package/src/tokenizers/morphology/spanish-normalizer.ts +327 -0
- package/src/tokenizers/morphology/turkish-normalizer.ts +412 -0
- package/src/tokenizers/morphology/types.ts +211 -0
- package/src/tokenizers/ms.ts +198 -0
- package/src/tokenizers/polish.ts +354 -0
- package/src/tokenizers/portuguese.ts +304 -0
- package/src/tokenizers/quechua.ts +339 -0
- package/src/tokenizers/russian.ts +375 -0
- package/src/tokenizers/spanish.ts +403 -0
- package/src/tokenizers/swahili.ts +303 -0
- package/src/tokenizers/thai.ts +236 -0
- package/src/tokenizers/tl.ts +198 -0
- package/src/tokenizers/turkish.ts +411 -0
- package/src/tokenizers/ukrainian.ts +369 -0
- package/src/tokenizers/vietnamese.ts +410 -0
- package/src/types/grammar-types.ts +617 -0
- package/src/types/unified-profile.ts +267 -0
- package/src/types.ts +709 -0
- package/src/utils/confidence-calculator.ts +147 -0
- package/src/validators/command-validator.ts +380 -0
- package/src/validators/index.ts +15 -0
|
@@ -0,0 +1,1181 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pattern Matcher
|
|
3
|
+
*
|
|
4
|
+
* Matches tokenized input against language patterns to extract semantic roles.
|
|
5
|
+
* This is the core algorithm for multilingual parsing.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import type {
|
|
9
|
+
LanguagePattern,
|
|
10
|
+
PatternToken,
|
|
11
|
+
PatternMatchResult,
|
|
12
|
+
SemanticRole,
|
|
13
|
+
SemanticValue,
|
|
14
|
+
TokenStream,
|
|
15
|
+
LanguageToken,
|
|
16
|
+
} from '../types';
|
|
17
|
+
import { createSelector, createLiteral, createReference, createPropertyPath } from '../types';
|
|
18
|
+
import { isTypeCompatible } from './utils/type-validation';
|
|
19
|
+
import { getPossessiveReference } from './utils/possessive-keywords';
|
|
20
|
+
import type { LanguageProfile } from '../generators/profiles/types';
|
|
21
|
+
import { tryGetProfile } from '../registry';
|
|
22
|
+
|
|
23
|
+
// =============================================================================
|
|
24
|
+
// Pattern Matcher
|
|
25
|
+
// =============================================================================
|
|
26
|
+
|
|
27
|
+
export class PatternMatcher {
|
|
28
|
+
/** Current language profile for the pattern being matched */
|
|
29
|
+
private currentProfile: LanguageProfile | undefined;
|
|
30
|
+
|
|
31
|
+
/**
 * Attempt one pattern against the token stream.
 *
 * On failure the stream is rewound to its position on entry and `null` is
 * returned. On success the stream is left where matching stopped and the
 * result carries the captured roles, the number of tokens consumed and the
 * confidence score.
 *
 * @param tokens - Stream to match against; advanced only on success.
 * @param pattern - Pattern whose template tokens are matched in order.
 * @returns The match result, or `null` when the pattern does not match.
 */
matchPattern(tokens: TokenStream, pattern: LanguagePattern): PatternMatchResult | null {
  const start = tokens.mark();
  const roles = new Map<SemanticRole, SemanticValue>();

  // The profile is consulted for possessive keyword lookup while matching.
  this.currentProfile = tryGetProfile(pattern.language);

  // Keyword counters are tracked per pattern, so start them from zero.
  this.stemMatchCount = 0;
  this.totalKeywordMatches = 0;

  if (!this.matchTokenSequence(tokens, pattern.template.tokens, roles)) {
    tokens.reset(start);
    return null;
  }

  // Score first: roles filled in by defaults below must not inflate confidence.
  const confidence = this.calculateConfidence(pattern, roles);

  // Fill any roles that are still missing from the pattern's extraction rules.
  this.applyExtractionRules(pattern, roles);

  const consumedTokens = tokens.position() - start.position;
  return { pattern, captured: roles, consumedTokens, confidence };
}
|
|
67
|
+
|
|
68
|
+
/**
 * Try every pattern from the same starting position and return the best match.
 *
 * Matches are ranked by pattern priority (higher first); equal priorities are
 * ordered by confidence (higher first). The stream is rewound after each
 * trial, and the winning pattern is matched once more at the end so the
 * stream finishes advanced past that match.
 *
 * @param tokens - Stream to match against.
 * @param patterns - Candidate patterns, each tried in turn.
 * @returns The top-ranked match, or `null` when no pattern matches.
 */
matchBest(tokens: TokenStream, patterns: LanguagePattern[]): PatternMatchResult | null {
  const candidates: PatternMatchResult[] = [];

  for (const candidate of patterns) {
    const start = tokens.mark();
    const outcome = this.matchPattern(tokens, candidate);
    // Every trial must begin from the same position, matched or not.
    tokens.reset(start);

    if (outcome) {
      candidates.push(outcome);
    }
  }

  if (candidates.length === 0) {
    return null;
  }

  // Priority decides first; confidence only breaks priority ties.
  const byPriorityThenConfidence = (x: PatternMatchResult, y: PatternMatchResult): number => {
    const priorityGap = y.pattern.priority - x.pattern.priority;
    return priorityGap !== 0 ? priorityGap : y.confidence - x.confidence;
  };
  candidates.sort(byPriorityThenConfidence);

  // Replay the winner so the stream ends up consumed for that match.
  const winner = candidates[0];
  this.matchPattern(tokens, winner.pattern);

  return winner;
}
|
|
105
|
+
|
|
106
|
+
/**
 * Match an ordered list of pattern tokens against the stream.
 *
 * A pattern token that fails to match is tolerated only when `isOptional`
 * reports it as optional; the first failing required token ends the attempt.
 * The stream is not rewound here on failure.
 *
 * @param tokens - Stream to consume from.
 * @param patternTokens - Pattern tokens to match, in order.
 * @param captured - Receives the values captured for role tokens.
 * @returns `true` when every required pattern token matched.
 */
private matchTokenSequence(
  tokens: TokenStream,
  patternTokens: PatternToken[],
  captured: Map<SemanticRole, SemanticValue>
): boolean {
  // Arabic input may open with conjunction proclitics (و, ف, ول, وب, etc.); drop them.
  const isArabic = this.currentProfile?.code === 'ar';
  while (isArabic && tokens.peek()?.kind === 'conjunction') {
    tokens.advance();
  }

  // `every` stops at the first required token that fails, like an early return.
  return patternTokens.every(
    (patternToken) =>
      this.matchPatternToken(tokens, patternToken, captured) || this.isOptional(patternToken)
  );
}
|
|
135
|
+
|
|
136
|
+
/**
 * Dispatch one pattern token to the matcher for its `type`.
 *
 * @param tokens - Stream positioned at the token to examine.
 * @param patternToken - Pattern token to match (`literal`, `role` or `group`).
 * @param captured - Receives values captured by role and group tokens.
 * @returns The chosen matcher's result; `false` for an unrecognised type.
 */
private matchPatternToken(
  tokens: TokenStream,
  patternToken: PatternToken,
  captured: Map<SemanticRole, SemanticValue>
): boolean {
  if (patternToken.type === 'literal') {
    return this.matchLiteralToken(tokens, patternToken);
  }

  if (patternToken.type === 'role') {
    return this.matchRoleToken(tokens, patternToken, captured);
  }

  if (patternToken.type === 'group') {
    return this.matchGroupToken(tokens, patternToken, captured);
  }

  // Unknown pattern token types never match.
  return false;
}
|
|
158
|
+
|
|
159
|
+
/**
 * Match a literal pattern token (keyword or particle) at the current position.
 *
 * The token's main value is tried first, followed by its alternatives in
 * declaration order. The first candidate whose match type is not `'none'`
 * wins: the keyword-match counter is bumped, the stem-match counter is bumped
 * as well when the match type is `'stem'`, and the stream advances by one.
 *
 * @param tokens - Stream positioned at the token to examine.
 * @param patternToken - Literal pattern token carrying `value` and optional `alternatives`.
 * @returns `true` when a candidate matched and the token was consumed.
 */
private matchLiteralToken(
  tokens: TokenStream,
  patternToken: PatternToken & { type: 'literal' }
): boolean {
  const current = tokens.peek();
  if (!current) return false;

  // Main value first, then alternatives; candidates are only evaluated until one hits.
  const candidates = [patternToken.value, ...(patternToken.alternatives ?? [])];

  for (const candidate of candidates) {
    const kind = this.getMatchType(current, candidate);
    if (kind === 'none') continue;

    this.totalKeywordMatches += 1;
    if (kind === 'stem') {
      this.stemMatchCount += 1;
    }
    tokens.advance();
    return true;
  }

  return false;
}
|
|
197
|
+
|
|
198
|
+
/**
 * Match a role pattern token and capture its semantic value.
 *
 * After skipping noise words, multi-token expression forms are tried in this
 * order, each only if the previous ones produced nothing:
 *  1. possessive keyword + property, e.g. 'my value', 'its innerHTML'
 *  2. method call, e.g. '#dialog.showModal()'
 *  3. possessive selector + property, e.g. "#element's *opacity"
 *  4. property access, e.g. 'userData.name', 'it.data'
 *  5. selector + property, e.g. '#output.innerText' (tokenizer emitted two
 *     selector tokens)
 * If none applies, the current single token is converted with
 * `tokenToSemanticValue`.
 *
 * When `expectedTypes` is non-empty the value's type is validated:
 *  - forms 1, 2 and 4 pass if the list contains the value's type or 'expression';
 *  - forms 3 and 5 are checked with `isTypeCompatible`;
 *  - a single token must have its exact type listed.
 *
 * On a type mismatch the role is not captured and the stream is rewound to
 * where it stood after noise-word skipping. The expression helpers advance
 * the stream themselves when they match (visible for the possessive helper;
 * presumably the same for the others), so without the rewind an optional
 * role would report success while silently swallowing those tokens. The
 * single-token path already consumes nothing on a mismatch.
 *
 * @param tokens - Stream positioned at the start of the role's value.
 * @param patternToken - Role pattern token (role name, optional flag, expected types).
 * @param captured - Receives the captured value under `patternToken.role`.
 * @returns `true` when a value was captured; otherwise whether the role is optional.
 */
private matchRoleToken(
  tokens: TokenStream,
  patternToken: PatternToken & { type: 'role' },
  captured: Map<SemanticRole, SemanticValue>
): boolean {
  // Skip noise words like "the" before selectors (English idiom support).
  this.skipNoiseWords(tokens);

  const token = tokens.peek();
  if (!token) {
    return patternToken.optional || false;
  }

  const expectedTypes = patternToken.expectedTypes;

  // Passes when unconstrained, or when the exact type or 'expression' is listed.
  const acceptsExactOrExpression = (value: SemanticValue): boolean =>
    !expectedTypes ||
    expectedTypes.length === 0 ||
    expectedTypes.includes(value.type) ||
    expectedTypes.includes('expression');

  // Passes when unconstrained, or when isTypeCompatible accepts the value's type.
  const acceptsCompatible = (value: SemanticValue): boolean =>
    !expectedTypes ||
    expectedTypes.length === 0 ||
    isTypeCompatible(value.type, expectedTypes);

  // Ordered, lazily evaluated attempts: a later form is only tried when every
  // earlier one returned nothing.
  const attempts = [
    {
      tryMatch: () => this.tryMatchPossessiveExpression(tokens),
      accepts: acceptsExactOrExpression,
    },
    {
      tryMatch: () => this.tryMatchMethodCallExpression(tokens),
      accepts: acceptsExactOrExpression,
    },
    {
      tryMatch: () => this.tryMatchPossessiveSelectorExpression(tokens),
      accepts: acceptsCompatible,
    },
    {
      tryMatch: () => this.tryMatchPropertyAccessExpression(tokens),
      accepts: acceptsExactOrExpression,
    },
    {
      tryMatch: () => this.tryMatchSelectorPropertyExpression(tokens),
      accepts: acceptsCompatible,
    },
  ];

  // Rewind point for expressions that match structurally but have the wrong type.
  const expressionStart = tokens.mark();

  for (const { tryMatch, accepts } of attempts) {
    const expressionValue = tryMatch();
    if (!expressionValue) continue;

    if (!accepts(expressionValue)) {
      // Give the tokens back: nothing was captured for this role.
      tokens.reset(expressionStart);
      return patternToken.optional || false;
    }

    captured.set(patternToken.role, expressionValue);
    return true;
  }

  // Fall back to interpreting the single current token.
  const value = this.tokenToSemanticValue(token);
  if (!value) {
    return patternToken.optional || false;
  }

  // Single tokens must match an expected type exactly.
  if (expectedTypes && expectedTypes.length > 0 && !expectedTypes.includes(value.type)) {
    return patternToken.optional || false;
  }

  captured.set(patternToken.role, value);
  tokens.advance();
  return true;
}
|
|
307
|
+
|
|
308
|
+
/**
 * Try to read a possessive expression such as 'my value' or 'its innerHTML'.
 *
 * The current token must be a possessive keyword of the active language
 * profile, and the next token must be usable as a property name: an
 * identifier, a non-structural keyword, or a selector starting with '*'
 * (e.g. "my *background", "my *opacity"). On success both tokens are
 * consumed; otherwise the stream is left where it was.
 *
 * @param tokens - Stream positioned at the candidate possessive keyword.
 * @returns A property-path value (e.g. my value -> { object: me, property: 'value' }),
 *   or `null` when no possessive expression starts here.
 */
private tryMatchPossessiveExpression(tokens: TokenStream): SemanticValue | null {
  const first = tokens.peek();
  // Possessive keywords come from the profile, so without one nothing can match.
  if (!first || !this.currentProfile) return null;

  const keyword = (first.normalized || first.value).toLowerCase();
  const owner = getPossessiveReference(this.currentProfile, keyword);
  if (!owner) return null;

  // Possessive keyword found: step over it and inspect the following token.
  const rewindPoint = tokens.mark();
  tokens.advance();

  const property = tokens.peek();
  let usableAsProperty = false;
  if (property) {
    switch (property.kind) {
      case 'identifier':
        usableAsProperty = true;
        break;
      case 'keyword':
        // Structural keywords (prepositions, control flow) are not property names.
        usableAsProperty = !this.isStructuralKeyword(property.value);
        break;
      case 'selector':
        // Only '*'-prefixed selectors (style properties) qualify.
        usableAsProperty = property.value.startsWith('*');
        break;
    }
  }

  if (!property || !usableAsProperty) {
    // Keyword alone, or followed by something that is not a property: revert.
    tokens.reset(rewindPoint);
    return null;
  }

  tokens.advance();

  // NOTE(review): `as any` sidesteps createReference's parameter type; consider
  // tightening once that signature is confirmed.
  return createPropertyPath(createReference(owner as any), property.value);
}
|
|
352
|
+
|
|
353
|
+
/**
 * Keywords that must never be consumed as a property name after a possessive:
 * prepositions, control-flow words and command names. Built once for the class
 * instead of on every lookup.
 */
private static readonly STRUCTURAL_KEYWORDS: ReadonlySet<string> = new Set([
  // Prepositions
  'into',
  'in',
  'to',
  'from',
  'at',
  'by',
  'with',
  'without',
  'before',
  'after',
  'of',
  'as',
  'on',
  // Control flow
  'then',
  'end',
  'else',
  'if',
  'repeat',
  'while',
  'for',
  // Commands (shouldn't be property names)
  'toggle',
  'add',
  'remove',
  'put',
  'set',
  'show',
  'hide',
  'increment',
  'decrement',
  'send',
  'trigger',
  'call',
]);

/**
 * Check if a keyword is a structural keyword (preposition, control flow, command)
 * that shouldn't be consumed as a property name.
 *
 * @param value Keyword text; compared case-insensitively.
 * @returns true when the lower-cased value is in the structural keyword set.
 */
private isStructuralKeyword(value: string): boolean {
  return PatternMatcher.STRUCTURAL_KEYWORDS.has(value.toLowerCase());
}
|
|
397
|
+
|
|
398
|
+
/**
 * Try to match a method call on a selector, like '#dialog.showModal()'.
 * Pattern: selector + '.' + identifier + '(' + [args] + ')'
 *
 * Arguments are collected as raw token values (commas are skipped) and at most
 * MAX_METHOD_ARGS values are accepted. The call only matches when its closing
 * ')' is actually found; otherwise the stream is rewound and null is returned,
 * so a truncated call is never reported as a successful match.
 *
 * @param tokens Token stream; advanced past the whole call on success, untouched on failure.
 * @returns An expression value whose `raw` is the reconstructed call text, or null.
 */
private tryMatchMethodCallExpression(tokens: TokenStream): SemanticValue | null {
  const token = tokens.peek();
  if (!token || token.kind !== 'selector') return null;

  // Look ahead for: . identifier (
  const mark = tokens.mark();
  tokens.advance(); // consume selector

  const dotToken = tokens.peek();
  if (!dotToken || dotToken.kind !== 'operator' || dotToken.value !== '.') {
    tokens.reset(mark);
    return null;
  }
  tokens.advance(); // consume .

  const methodToken = tokens.peek();
  if (!methodToken || methodToken.kind !== 'identifier') {
    tokens.reset(mark);
    return null;
  }
  tokens.advance(); // consume method name

  const openParen = tokens.peek();
  if (!openParen || openParen.kind !== 'punctuation' || openParen.value !== '(') {
    tokens.reset(mark);
    return null;
  }
  tokens.advance(); // consume (

  // Consume arguments until we find ) (bounded by MAX_METHOD_ARGS)
  const args: string[] = [];
  let closed = false;
  while (!tokens.isAtEnd() && args.length < PatternMatcher.MAX_METHOD_ARGS) {
    const argToken = tokens.peek();
    if (!argToken) break;
    if (argToken.kind === 'punctuation') {
      if (argToken.value === ')') {
        tokens.advance(); // consume )
        closed = true;
        break;
      }
      if (argToken.value === ',') {
        tokens.advance(); // separators carry no value
        continue;
      }
    }
    args.push(argToken.value);
    tokens.advance();
  }

  // The loop also stops right after the last permitted argument was collected;
  // the call is still well-formed if the very next token closes it.
  if (!closed) {
    const tail = tokens.peek();
    if (tail && tail.kind === 'punctuation' && tail.value === ')') {
      tokens.advance(); // consume )
      closed = true;
    }
  }

  if (!closed) {
    // Unterminated or over-long argument list: not a method call we can represent.
    tokens.reset(mark);
    return null;
  }

  // Create expression value: #dialog.showModal()
  const methodCall = `${token.value}.${methodToken.value}(${args.join(', ')})`;
  return {
    type: 'expression',
    raw: methodCall,
  } as SemanticValue;
}
|
|
458
|
+
|
|
459
|
+
/**
 * Try to match a property access expression like 'userData.name' or 'it.data',
 * optionally followed by a call such as 'me.insertBefore(a, b)'.
 * Pattern: (identifier | keyword) + '.' + identifier [+ '.' + identifier ...] [+ '(' args ')']
 *
 * The chain is limited to MAX_PROPERTY_DEPTH property accesses and a call to
 * MAX_METHOD_ARGS collected argument tokens. A '.' that is not followed by an
 * identifier is left in the stream rather than being silently swallowed.
 *
 * @param tokens Token stream; advanced past the matched expression on success,
 *               untouched when the leading `x.y` shape is not present.
 * @returns An expression value whose `raw` is the reconstructed text, or null.
 */
private tryMatchPropertyAccessExpression(tokens: TokenStream): SemanticValue | null {
  const token = tokens.peek();
  if (!token) return null;

  // Must start with an identifier or keyword reference
  if (token.kind !== 'identifier' && token.kind !== 'keyword') return null;

  // Look ahead for: . identifier
  const mark = tokens.mark();
  tokens.advance(); // consume first token

  const dotToken = tokens.peek();
  if (!dotToken || dotToken.kind !== 'operator' || dotToken.value !== '.') {
    tokens.reset(mark);
    return null;
  }
  tokens.advance(); // consume .

  const propertyToken = tokens.peek();
  if (!propertyToken || propertyToken.kind !== 'identifier') {
    tokens.reset(mark);
    return null;
  }
  tokens.advance(); // consume property name

  // Build the property chain
  let chain = `${token.value}.${propertyToken.value}`;
  let depth = 1; // Already have one property access

  // Continue for nested property access (e.g., userData.address.city), bounded in depth
  while (!tokens.isAtEnd() && depth < PatternMatcher.MAX_PROPERTY_DEPTH) {
    const nextDot = tokens.peek();
    if (!nextDot || nextDot.kind !== 'operator' || nextDot.value !== '.') {
      break;
    }
    const beforeDot = tokens.mark();
    tokens.advance(); // consume .

    const nextProp = tokens.peek();
    if (!nextProp || nextProp.kind !== 'identifier') {
      // Dot without a property: put the dot back so it is not lost, then stop.
      tokens.reset(beforeDot);
      break;
    }
    tokens.advance(); // consume property
    chain += `.${nextProp.value}`;
    depth++;
  }

  // Check for method call: chain + '(' + args + ')'
  // e.g., me.insertBefore(draggedItem, dropTarget)
  const openParen = tokens.peek();
  if (openParen && openParen.kind === 'punctuation' && openParen.value === '(') {
    tokens.advance(); // consume (

    // Collect arguments (comma-separated values)
    const args: string[] = [];
    let argDepth = 0; // Track nested parentheses
    while (!tokens.isAtEnd() && args.length < PatternMatcher.MAX_METHOD_ARGS) {
      const argToken = tokens.peek();
      if (!argToken) break;

      // Handle close paren - only the one at nesting level 0 ends the call
      if (argToken.kind === 'punctuation' && argToken.value === ')') {
        if (argDepth === 0) {
          tokens.advance(); // consume )
          break;
        }
        argDepth--;
      }
      // Track nested open parens
      if (argToken.kind === 'punctuation' && argToken.value === '(') {
        argDepth++;
      }
      // Skip commas between arguments
      if (argToken.kind === 'punctuation' && argToken.value === ',') {
        tokens.advance();
        continue;
      }
      // Collect arg value (nested parentheses are recorded as their own entries)
      args.push(argToken.value);
      tokens.advance();
    }

    // Create expression value with method call: me.insertBefore(a, b)
    const methodCall = `${chain}(${args.join(', ')})`;
    return {
      type: 'expression',
      raw: methodCall,
    } as SemanticValue;
  }

  // Create expression value: userData.name
  return {
    type: 'expression',
    raw: chain,
  } as SemanticValue;
}
|
|
562
|
+
|
|
563
|
+
/**
 * Attempt to read a possessive selector expression such as "#element's *opacity".
 * Shape: selector, then the "'s" punctuation token, then a selector or identifier.
 *
 * @param tokens Token stream; advanced past all three tokens on success, rewound otherwise.
 * @returns A property-path value (selector object + property token text), or null.
 */
private tryMatchPossessiveSelectorExpression(tokens: TokenStream): SemanticValue | null {
  const owner = tokens.peek();
  if (!owner || owner.kind !== 'selector') return null;

  const rewind = tokens.mark();
  // Shared failure path: undo any lookahead and report "no match".
  const giveUp = (): null => {
    tokens.reset(rewind);
    return null;
  };

  tokens.advance(); // past the selector

  const marker = tokens.peek();
  const isPossessiveMarker =
    !!marker && marker.kind === 'punctuation' && marker.value === "'s";
  if (!isPossessiveMarker) return giveUp();
  tokens.advance(); // past 's

  const property = tokens.peek();
  if (!property) return giveUp();

  // The property may be written as a selector (*opacity) or a bare identifier.
  const acceptable = property.kind === 'selector' || property.kind === 'identifier';
  if (!acceptable) return giveUp();
  tokens.advance(); // past the property

  return createPropertyPath(createSelector(owner.value), property.value);
}
|
|
603
|
+
|
|
604
|
+
/**
 * Try to match a selector + property expression like "#output.innerText".
 * This handles cases where the tokenizer produces two selector tokens:
 * - #output (id selector)
 * - .innerText (looks like a class selector, but is read here as a property)
 *
 * Only the first two selector tokens are taken; anything after them is left
 * in the stream.
 *
 * @param tokens Token stream; advanced past both selectors on success, rewound otherwise.
 * @returns A property-path value whose property name is the second token
 *          without its leading '.', or null if the shape does not match.
 */
private tryMatchSelectorPropertyExpression(tokens: TokenStream): SemanticValue | null {
  const token = tokens.peek();
  if (!token || token.kind !== 'selector') return null;

  // Must be an ID selector (starts with #)
  if (!token.value.startsWith('#')) return null;

  // Look ahead for: selector that looks like a property (.something)
  const mark = tokens.mark();
  tokens.advance(); // consume first selector

  const propertyToken = tokens.peek();
  if (
    !propertyToken ||
    propertyToken.kind !== 'selector' ||
    !propertyToken.value.startsWith('.')
  ) {
    tokens.reset(mark);
    return null;
  }

  tokens.advance(); // consume property selector

  // Create property-path: #output.innerText (property name without the leading dot)
  const propertyName = propertyToken.value.slice(1);

  return createPropertyPath(createSelector(token.value), propertyName);
}
|
|
652
|
+
|
|
653
|
+
/**
 * Match a group pattern token (a sub-sequence of pattern tokens).
 *
 * On failure the token stream is rewound and `captured` is restored to exactly
 * the state it had before the attempt: roles added by the failed attempt are
 * dropped and roles it overwrote get their previous value back.
 *
 * @param tokens Token stream to match against.
 * @param patternToken The group token whose `tokens` form the sub-sequence.
 * @param captured Role captures accumulated so far; extended on success.
 * @returns true when the group matched; on failure, whether the group is optional.
 */
private matchGroupToken(
  tokens: TokenStream,
  patternToken: PatternToken & { type: 'group' },
  captured: Map<SemanticRole, SemanticValue>
): boolean {
  const mark = tokens.mark();

  // Snapshot keys AND values so a failed attempt can be undone completely.
  const capturedBefore = new Map(captured);

  if (this.matchTokenSequence(tokens, patternToken.tokens, captured)) {
    return true;
  }

  tokens.reset(mark);
  captured.clear();
  for (const [role, value] of capturedBefore) {
    captured.set(role, value);
  }
  return patternToken.optional || false;
}
|
|
681
|
+
|
|
682
|
+
/**
 * Classify how a token relates to an expected value; used for confidence calculation.
 *
 * Checks run from strongest to weakest: identical text, explicit normalized form,
 * stem (only when the stem confidence is at least 0.7), then a case-insensitive
 * comparison that applies to keyword tokens only.
 *
 * @param token The language token under test.
 * @param value The expected text.
 * @returns The first classification that applies, or 'none'.
 */
private getMatchType(
  token: LanguageToken,
  value: string
): 'exact' | 'normalized' | 'stem' | 'case-insensitive' | 'none' {
  if (token.value === value) {
    return 'exact';
  }

  if (token.normalized === value) {
    return 'normalized';
  }

  // A stem only counts when the morphological normalizer was reasonably sure.
  const stemIsTrusted = token.stemConfidence !== undefined && token.stemConfidence >= 0.7;
  if (stemIsTrusted && token.stem === value) {
    return 'stem';
  }

  const sameIgnoringCase =
    token.kind === 'keyword' && token.value.toLowerCase() === value.toLowerCase();
  return sameIgnoringCase ? 'case-insensitive' : 'none';
}
|
|
709
|
+
|
|
710
|
+
/**
 * Counters for stem-based keyword matches.
 * Written while matching, read back when the confidence score is computed.
 */
private stemMatchCount = 0;
private totalKeywordMatches = 0;

// ==========================================================================
// Depth Limits for Expression Parsing (security hardening)
// ==========================================================================

/** Upper bound on chained property accesses (a.b.c.d...). */
private static readonly MAX_PROPERTY_DEPTH = 10;

/** Upper bound on argument values collected for a method call. */
private static readonly MAX_METHOD_ARGS = 20;
|
|
726
|
+
|
|
727
|
+
/**
 * Convert a language token to a semantic value.
 *
 * - selector: selector value
 * - literal: parsed via parseLiteralValue
 * - keyword: built-in reference (me, you, it, ...) or a literal of the normalized text
 * - identifier: ':name' reference, built-in reference, or a raw expression
 * - url: string literal
 *
 * @param token The token to convert.
 * @returns The semantic value, or null for token kinds that carry no value.
 */
private tokenToSemanticValue(token: LanguageToken): SemanticValue | null {
  // Names that resolve to built-in references for both keywords and identifiers.
  const builtInReferences = ['me', 'you', 'it', 'result', 'event', 'target', 'body'];

  switch (token.kind) {
    case 'selector':
      return createSelector(token.value);

    case 'literal':
      return this.parseLiteralValue(token.value);

    case 'keyword': {
      // Keywords might be references or values
      const lower = (token.normalized || token.value).toLowerCase();
      if (builtInReferences.includes(lower)) {
        return createReference(lower as any);
      }
      return createLiteral(token.normalized || token.value);
    }

    case 'identifier': {
      // Check if it's a variable reference (:varname)
      if (token.value.startsWith(':')) {
        return createReference(token.value as any);
      }
      // Check if it's a built-in reference
      const identLower = token.value.toLowerCase();
      if (builtInReferences.includes(identLower)) {
        return createReference(identLower as any);
      }
      // Regular identifiers are variable references - use 'expression' type
      // which gets converted to 'identifier' AST nodes by semantic-integration.ts
      return { type: 'expression', raw: token.value } as const;
    }

    case 'url':
      // URLs are treated as string literals (paths/URLs for navigation/fetch)
      return createLiteral(token.value, 'string');

    default:
      return null;
  }
}
|
|
768
|
+
|
|
769
|
+
/**
 * Parse a literal value (string, boolean, duration or number).
 *
 * A value that starts with ", ', ` or 「 is a string literal. The closing
 * delimiter is removed only when it is actually present, so an unterminated
 * literal keeps its last character instead of losing it.
 *
 * @param value Raw token text.
 * @returns A literal semantic value tagged 'string', 'boolean', 'duration' or 'number'.
 */
private parseLiteralValue(value: string): SemanticValue {
  // String literal: map each opening delimiter to its expected closing one
  const closingFor = new Map<string, string>([
    ['"', '"'],
    ["'", "'"],
    ['`', '`'],
    ['「', '」'],
  ]);
  const closing = closingFor.get(value.charAt(0));
  if (closing !== undefined) {
    const terminated = value.length >= 2 && value.endsWith(closing);
    const inner = terminated ? value.slice(1, -1) : value.slice(1);
    return createLiteral(inner, 'string');
  }

  // Boolean
  if (value === 'true') return createLiteral(true, 'boolean');
  if (value === 'false') return createLiteral(false, 'boolean');

  // Duration (number with ms/s/m/h suffix) or an unsigned plain number
  const durationMatch = value.match(/^(\d+(?:\.\d+)?)(ms|s|m|h)?$/);
  if (durationMatch) {
    const num = parseFloat(durationMatch[1]);
    const unit = durationMatch[2];
    if (unit) {
      // Durations keep their original text, suffix included
      return createLiteral(value, 'duration');
    }
    return createLiteral(num, 'number');
  }

  // Plain number. parseFloat reads a leading numeric prefix, so this also
  // accepts signed values and ignores any trailing non-numeric text.
  const num = parseFloat(value);
  if (!isNaN(num)) {
    return createLiteral(num, 'number');
  }

  // Default to string
  return createLiteral(value, 'string');
}
|
|
808
|
+
|
|
809
|
+
/**
 * Apply extraction rules to fill in default values for missing roles.
 *
 * A role is filled only when it has not been captured and its rule carries a
 * default. A pattern without an `extraction` table is treated as having no rules.
 *
 * @param pattern Pattern whose extraction rules are consulted.
 * @param captured Captured roles; mutated in place.
 */
private applyExtractionRules(
  pattern: LanguagePattern,
  captured: Map<SemanticRole, SemanticValue>
): void {
  // `extraction` is read with optional chaining elsewhere in this class, so guard it here too.
  for (const [role, rule] of Object.entries(pattern.extraction ?? {})) {
    if (!captured.has(role as SemanticRole) && rule.default) {
      captured.set(role as SemanticRole, rule.default);
    }
  }
}
|
|
822
|
+
|
|
823
|
+
/**
 * Report whether a pattern token is flagged optional.
 * Only a literal `true` counts; a missing or otherwise-valued flag is not optional.
 */
private isOptional(patternToken: PatternToken): boolean {
  const { optional } = patternToken as any;
  return optional === true;
}
|
|
829
|
+
|
|
830
|
+
/**
 * Calculate confidence score for a match (0-1).
 *
 * Base score: captured roles / expected roles. Top-level roles weigh 1; roles
 * inside a group weigh 0.8 and earn 60% of that weight when they are missing
 * but have a default in the extraction rules.
 *
 * Confidence is reduced for:
 * - Stem matches: up to 0.15, proportional to the share of keyword matches that
 *   were stem matches. The penalty never pushes the score below 0.5 and never
 *   raises a score that is already below 0.5.
 *
 * Confidence is then adjusted by:
 * - the VSO boost from calculateVSOConfidenceBoost (capped at 1.0)
 * - the adjustment from arabicPrepositionDisambiguation (clamped to 0..1)
 *
 * @param pattern The pattern that matched.
 * @param captured Roles captured during matching.
 * @returns Confidence in the range 0..1.
 */
private calculateConfidence(
  pattern: LanguagePattern,
  captured: Map<SemanticRole, SemanticValue>
): number {
  let score = 0;
  let maxScore = 0;

  // Helper to check if a role has a default value in extraction rules
  const hasDefault = (role: SemanticRole): boolean => {
    return pattern.extraction?.[role]?.default !== undefined;
  };

  // Score based on captured roles
  for (const token of pattern.template.tokens) {
    if (token.type === 'role') {
      maxScore += 1;
      if (captured.has(token.role)) {
        score += 1;
      }
    } else if (token.type === 'group') {
      // Roles inside a group get a reduced weight
      for (const subToken of token.tokens) {
        if (subToken.type === 'role') {
          const weight = 0.8; // Group roles: 80% weight
          maxScore += weight;

          if (captured.has(subToken.role)) {
            // Role was explicitly provided by user
            score += weight;
          } else if (hasDefault(subToken.role)) {
            // Role has a default - give 60% partial credit since command is semantically complete
            // This prevents penalizing common patterns like "toggle .active" (default: me)
            score += weight * 0.6;
          }
          // If no default and not captured, score += 0 (true penalty for missing info)
        }
      }
    }
  }

  let baseConfidence = maxScore > 0 ? score / maxScore : 1;

  // Apply penalty for stem matches so exact matches are preferred over
  // morphological ones.
  if (this.stemMatchCount > 0 && this.totalKeywordMatches > 0) {
    const stemPenalty = (this.stemMatchCount / this.totalKeywordMatches) * 0.15;
    // Floor at 0.5, but never above the current score: a penalty must not
    // increase confidence when the base is already below 0.5.
    const floor = Math.min(baseConfidence, 0.5);
    baseConfidence = Math.max(floor, baseConfidence - stemPenalty);
  }

  // Apply VSO confidence boost for Arabic verb-first patterns
  const vsoBoost = this.calculateVSOConfidenceBoost(pattern);
  baseConfidence = Math.min(1.0, baseConfidence + vsoBoost);

  // Apply preposition disambiguation adjustment for Arabic
  const prepositionAdjustment = this.arabicPrepositionDisambiguation(pattern, captured);
  baseConfidence = Math.max(0.0, Math.min(1.0, baseConfidence + prepositionAdjustment));

  return baseConfidence;
}
|
|
901
|
+
|
|
902
|
+
/**
 * Confidence boost for verb-first Arabic patterns.
 *
 * Yields 0.15 when the pattern language is 'ar', the template's first token
 * is a literal, and either its value or one of its alternatives appears in
 * the verb list below. Every other case yields 0.
 *
 * @param pattern The language pattern being matched
 * @returns Confidence boost (0 or 0.15)
 */
private calculateVSOConfidenceBoost(pattern: LanguagePattern): number {
  if (pattern.language !== 'ar') {
    return 0;
  }

  const leading = pattern.template.tokens[0];
  if (!leading || leading.type !== 'literal') {
    return 0;
  }

  // Arabic command words recognised as pattern-initial verbs
  const verbList = new Set([
    'بدل',
    'غير',
    'أضف',
    'أزل',
    'ضع',
    'اجعل',
    'عين',
    'زد',
    'انقص',
    'سجل',
    'أظهر',
    'أخف',
    'شغل',
    'أرسل',
    'ركز',
    'شوش',
    'توقف',
    'انسخ',
    'احذف',
    'اصنع',
    'انتظر',
    'انتقال',
    'أو',
  ]);

  // The primary spelling is tested first, then each alternative in order.
  const spellings = [leading.value, ...(leading.alternatives || [])];
  return spellings.some(word => verbList.has(word)) ? 0.15 : 0;
}
|
|
969
|
+
|
|
970
|
+
/**
 * Confidence adjustment based on which Arabic preposition accompanied each role.
 *
 * For every captured role that has a preferred-preposition list, the value's
 * `metadata.prepositionValue` (when it is a string) is compared with that list:
 * a listed preposition adds 0.1, any other subtracts 0.1. Values without such
 * metadata contribute nothing. The summed result is clamped to [-0.1, +0.1].
 * Patterns whose language is not 'ar' always yield 0.
 *
 * @param pattern The language pattern being matched
 * @param captured The captured semantic values
 * @returns Confidence adjustment (-0.10 to +0.10)
 */
private arabicPrepositionDisambiguation(
  pattern: LanguagePattern,
  captured: Map<SemanticRole, SemanticValue>
): number {
  if (pattern.language !== 'ar') {
    return 0;
  }

  // Idiomatic prepositions per role (only roles that commonly take one)
  const idiomaticByRole: Partial<Record<SemanticRole, string[]>> = {
    patient: ['على'], // on/upon
    destination: ['إلى', 'الى'], // to
    source: ['من'], // from
    agent: ['من'], // from/by
    manner: ['ب'], // with/by
    style: ['ب'], // with
    goal: ['إلى', 'الى'], // to
    method: ['ب'], // with/by
  };

  let net = 0;
  for (const [role, value] of captured) {
    const idiomatic = idiomaticByRole[role];
    if (!idiomatic || idiomatic.length === 0) {
      continue; // role has no preposition preference
    }

    // Preposition recorded on the value's metadata, if any
    const used = (value as any).metadata?.prepositionValue;
    if (typeof used !== 'string') {
      continue;
    }

    if (idiomatic.includes(used)) {
      net += 0.1; // natural choice
    } else {
      net -= 0.1; // less natural choice
    }
  }

  // Overall adjustment is limited to ±0.10 regardless of how many roles voted
  return Math.max(-0.1, Math.min(0.1, net));
}
|
|
1040
|
+
|
|
1041
|
+
// ===========================================================================
|
|
1042
|
+
// English Idiom Support - Noise Word Handling
|
|
1043
|
+
// ===========================================================================
|
|
1044
|
+
|
|
1045
|
+
/**
 * English articles that skipNoiseWords may drop when they precede a selector,
 * e.g. "toggle the .active" → "toggle .active".
 */
private static readonly ENGLISH_NOISE_WORDS = new Set<string>(['the', 'a', 'an']);
|
|
1051
|
+
|
|
1052
|
+
/**
 * Step over filler words so that phrases like "toggle the .active" or
 * "add .visible class" match the same patterns as their terse forms.
 *
 * - The word "class" at the current position is always skipped.
 * - An article from ENGLISH_NOISE_WORDS is skipped only when the token right
 *   after it is a selector; otherwise the stream is left where it was.
 */
private skipNoiseWords(tokens: TokenStream): void {
  const head = tokens.peek();
  if (!head) return;

  const word = head.value.toLowerCase();

  // "class" is not an article, so it is handled on its own.
  if (word === 'class') {
    tokens.advance();
    return;
  }

  if (!PatternMatcher.ENGLISH_NOISE_WORDS.has(word)) return;

  // Tentatively drop the article, then check what follows it.
  const rewind = tokens.mark();
  tokens.advance();
  const following = tokens.peek();
  if (following?.kind !== 'selector') {
    // No selector after the article: keep the article in the stream.
    tokens.reset(rewind);
  }
}
|
|
1086
|
+
|
|
1087
|
+
/**
 * Collect event modifiers (.once, .debounce(N), .throttle(N), .queue(strategy))
 * from the front of the token stream.
 *
 * Consecutive 'event-modifier' tokens are consumed until a different token kind,
 * a modifier token without metadata, or the end of the stream is reached.
 * Numeric values are required for debounce/throttle and one of
 * 'first' | 'last' | 'all' | 'none' for queue; other values are ignored,
 * although the token itself is still consumed.
 *
 * @returns The collected modifiers, or undefined when no modifier token was consumed.
 */
extractEventModifiers(tokens: TokenStream): import('../types').EventModifiers | undefined {
  const collected: {
    once?: boolean;
    debounce?: number;
    throttle?: number;
    queue?: 'first' | 'last' | 'all' | 'none';
    from?: SemanticValue;
  } = {};

  let sawModifier = false;

  while (!tokens.isAtEnd()) {
    const current = tokens.peek();
    if (!current || current.kind !== 'event-modifier') {
      break;
    }

    const details = current.metadata as
      | { modifierName: string; value?: number | string }
      | undefined;
    if (!details) {
      break;
    }

    sawModifier = true;

    const { modifierName, value } = details;
    if (modifierName === 'once') {
      collected.once = true;
    } else if (modifierName === 'debounce' || modifierName === 'throttle') {
      if (typeof value === 'number') {
        collected[modifierName] = value;
      }
    } else if (modifierName === 'queue') {
      if (value === 'first' || value === 'last' || value === 'all' || value === 'none') {
        collected.queue = value;
      }
    }

    tokens.advance();
  }

  if (!sawModifier) {
    return undefined;
  }
  return collected;
}
|
|
1152
|
+
}
|
|
1153
|
+
|
|
1154
|
+
// =============================================================================
|
|
1155
|
+
// Convenience Functions
|
|
1156
|
+
// =============================================================================
|
|
1157
|
+
|
|
1158
|
+
/**
 * Shared, module-level PatternMatcher instance, created once when the
 * module is loaded.
 */
export const patternMatcher: PatternMatcher = new PatternMatcher();
|
|
1162
|
+
|
|
1163
|
+
/**
 * Match a token stream against a single pattern by delegating to the
 * shared `patternMatcher` instance.
 *
 * @param tokens  Token stream to match.
 * @param pattern Pattern to try.
 * @returns Whatever `patternMatcher.matchPattern` produces: a match result,
 *          or `null`.
 */
export function matchPattern(tokens: TokenStream, pattern: LanguagePattern): PatternMatchResult | null {
  const outcome = patternMatcher.matchPattern(tokens, pattern);
  return outcome;
}
|
|
1172
|
+
|
|
1173
|
+
/**
 * Match a token stream against several candidate patterns by delegating to
 * the shared `patternMatcher` instance, which selects the best match.
 *
 * @param tokens   Token stream to match.
 * @param patterns Candidate patterns to try.
 * @returns Whatever `patternMatcher.matchBest` produces: the chosen match
 *          result, or `null`.
 */
export function matchBest(tokens: TokenStream, patterns: LanguagePattern[]): PatternMatchResult | null {
  const winner = patternMatcher.matchBest(tokens, patterns);
  return winner;
}
|