npm - @lokascript/semantic - Versions diffs - 1.0.0 → 1.1.0 - Mend

@lokascript/semantic 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (162) hide show

package/dist/browser-ar.ar.global.js +2 -2
package/dist/browser-core.core.global.js +2 -2
package/dist/browser-de.de.global.js +2 -2
package/dist/browser-east-asian.east-asian.global.js +2 -2
package/dist/browser-en-tr.en-tr.global.js +2 -2
package/dist/browser-en.en.global.js +2 -2
package/dist/browser-es-en.es-en.global.js +2 -2
package/dist/browser-es.es.global.js +2 -2
package/dist/browser-fr.fr.global.js +2 -2
package/dist/browser-id.id.global.js +2 -2
package/dist/browser-ja.ja.global.js +2 -2
package/dist/browser-ko.ko.global.js +2 -2
package/dist/browser-lazy.lazy.global.js +2 -2
package/dist/browser-priority.priority.global.js +2 -2
package/dist/browser-pt.pt.global.js +2 -2
package/dist/browser-qu.qu.global.js +2 -2
package/dist/browser-sw.sw.global.js +2 -2
package/dist/browser-tr.tr.global.js +2 -2
package/dist/browser-western.western.global.js +2 -2
package/dist/browser-zh.zh.global.js +2 -2
package/dist/browser.global.js +2 -2
package/dist/browser.global.js.map +1 -1
package/dist/index.cjs +13042 -17462
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +49 -5
package/dist/index.d.ts +49 -5
package/dist/index.js +14044 -18464
package/dist/index.js.map +1 -1
package/dist/languages/ar.d.ts +1 -1
package/dist/languages/ar.js +31 -44
package/dist/languages/ar.js.map +1 -1
package/dist/languages/de.d.ts +1 -1
package/dist/languages/de.js +14 -2
package/dist/languages/de.js.map +1 -1
package/dist/languages/en.d.ts +1 -1
package/dist/languages/en.js +558 -12
package/dist/languages/en.js.map +1 -1
package/dist/languages/es.d.ts +1 -1
package/dist/languages/es.js +16 -0
package/dist/languages/es.js.map +1 -1
package/dist/languages/fr.d.ts +1 -1
package/dist/languages/fr.js +14 -2
package/dist/languages/fr.js.map +1 -1
package/dist/languages/id.d.ts +1 -1
package/dist/languages/id.js +14 -2
package/dist/languages/id.js.map +1 -1
package/dist/languages/ja.d.ts +1 -1
package/dist/languages/ja.js +18 -3
package/dist/languages/ja.js.map +1 -1
package/dist/languages/ko.d.ts +8 -1
package/dist/languages/ko.js +75 -43
package/dist/languages/ko.js.map +1 -1
package/dist/languages/pt.d.ts +1 -1
package/dist/languages/pt.js +17 -0
package/dist/languages/pt.js.map +1 -1
package/dist/languages/qu.d.ts +12 -1
package/dist/languages/qu.js +77 -2
package/dist/languages/qu.js.map +1 -1
package/dist/languages/sw.d.ts +1 -1
package/dist/languages/sw.js.map +1 -1
package/dist/languages/tr.d.ts +9 -1
package/dist/languages/tr.js +96 -72
package/dist/languages/tr.js.map +1 -1
package/dist/languages/zh.d.ts +1 -1
package/dist/languages/zh.js +16 -0
package/dist/languages/zh.js.map +1 -1
package/dist/{types-C4dcj53L.d.ts → types-BY3Id07j.d.ts} +20 -5
package/package.json +20 -29
package/src/generators/command-schemas.ts +21 -10
package/src/generators/event-handler-generator.ts +50 -44
package/src/generators/language-profiles.ts +6 -0
package/src/generators/pattern-generator.ts +883 -1
package/src/generators/profiles/arabic.ts +19 -3
package/src/generators/profiles/bengali.ts +12 -1
package/src/generators/profiles/chinese.ts +15 -0
package/src/generators/profiles/french.ts +12 -1
package/src/generators/profiles/german.ts +12 -1
package/src/generators/profiles/hebrew.ts +148 -0
package/src/generators/profiles/hindi.ts +12 -1
package/src/generators/profiles/index.ts +2 -0
package/src/generators/profiles/indonesian.ts +12 -1
package/src/generators/profiles/italian.ts +16 -0
package/src/generators/profiles/japanese.ts +11 -2
package/src/generators/profiles/korean.ts +15 -1
package/src/generators/profiles/polish.ts +12 -0
package/src/generators/profiles/portuguese.ts +16 -0
package/src/generators/profiles/russian.ts +11 -0
package/src/generators/profiles/spanish.ts +15 -0
package/src/generators/profiles/spanishMexico.ts +176 -0
package/src/generators/profiles/thai.ts +11 -0
package/src/generators/profiles/turkish.ts +49 -7
package/src/generators/profiles/types.ts +21 -5
package/src/generators/profiles/ukrainian.ts +11 -0
package/src/generators/profiles/vietnamese.ts +11 -0
package/src/language-building-schema.ts +111 -0
package/src/languages/_all.ts +5 -1
package/src/languages/es-MX.ts +32 -0
package/src/languages/he.ts +15 -0
package/src/parser/pattern-matcher.ts +10 -1
package/src/parser/semantic-parser.ts +3 -0
package/src/patterns/add/ar.ts +3 -59
package/src/patterns/add/index.ts +5 -1
package/src/patterns/add/ja.ts +3 -81
package/src/patterns/add/ko.ts +3 -62
package/src/patterns/add/qu.ts +69 -0
package/src/patterns/add/tr.ts +3 -59
package/src/patterns/builders.ts +1 -0
package/src/patterns/decrement/tr.ts +3 -36
package/src/patterns/event-handler/ar.ts +3 -139
package/src/patterns/event-handler/he.ts +15 -0
package/src/patterns/event-handler/index.ts +5 -1
package/src/patterns/event-handler/ja.ts +3 -106
package/src/patterns/event-handler/ko.ts +3 -121
package/src/patterns/event-handler/ms.ts +45 -20
package/src/patterns/event-handler/tr.ts +3 -158
package/src/patterns/get/ar.ts +3 -37
package/src/patterns/get/ja.ts +3 -41
package/src/patterns/get/ko.ts +3 -41
package/src/patterns/grammar-transformed/ja.ts +3 -1701
package/src/patterns/grammar-transformed/ko.ts +3 -1299
package/src/patterns/grammar-transformed/tr.ts +3 -1055
package/src/patterns/hide/ar.ts +3 -55
package/src/patterns/hide/ja.ts +3 -57
package/src/patterns/hide/ko.ts +3 -57
package/src/patterns/hide/tr.ts +3 -53
package/src/patterns/increment/tr.ts +3 -40
package/src/patterns/put/ar.ts +3 -62
package/src/patterns/put/ja.ts +3 -63
package/src/patterns/put/ko.ts +3 -55
package/src/patterns/put/tr.ts +3 -55
package/src/patterns/remove/ar.ts +3 -59
package/src/patterns/remove/index.ts +5 -1
package/src/patterns/remove/ja.ts +3 -62
package/src/patterns/remove/ko.ts +3 -66
package/src/patterns/remove/qu.ts +69 -0
package/src/patterns/remove/tr.ts +3 -66
package/src/patterns/set/ar.ts +3 -72
package/src/patterns/set/ja.ts +3 -74
package/src/patterns/set/ko.ts +3 -73
package/src/patterns/set/tr.ts +3 -95
package/src/patterns/show/ar.ts +3 -55
package/src/patterns/show/ja.ts +3 -57
package/src/patterns/show/ko.ts +3 -61
package/src/patterns/show/tr.ts +3 -53
package/src/patterns/take/ar.ts +3 -39
package/src/patterns/toggle/ar.ts +3 -49
package/src/patterns/toggle/index.ts +5 -1
package/src/patterns/toggle/ja.ts +3 -144
package/src/patterns/toggle/ko.ts +3 -101
package/src/patterns/toggle/qu.ts +90 -0
package/src/patterns/toggle/tr.ts +3 -76
package/src/registry.ts +179 -15
package/src/tokenizers/arabic.ts +13 -46
package/src/tokenizers/bengali.ts +2 -16
package/src/tokenizers/he.ts +542 -0
package/src/tokenizers/index.ts +1 -0
package/src/tokenizers/japanese.ts +3 -1
package/src/tokenizers/korean.ts +104 -48
package/src/tokenizers/ms.ts +3 -0
package/src/tokenizers/quechua.ts +101 -2
package/src/tokenizers/turkish.ts +64 -69
package/src/types.ts +13 -0

package/src/tokenizers/korean.ts CHANGED Viewed

@@ -104,6 +104,17 @@ const SINGLE_CHAR_PARTICLES = new Set([
  */
 const MULTI_CHAR_PARTICLES = ['에서', '으로', '부터', '까지', '처럼', '보다'];
+/**
+ * Temporal event suffixes that should be split from compound words.
+ * These are verb endings that indicate "when" something happens.
+ * Checked in array order. No entry is a suffix of another entry, so the order
+ * does not change which suffix matches.
+ *
+ * Examples:
+ * - 클릭할때 → 클릭 + 할때 (click + when)
+ * - 입력할때 → 입력 + 할때 (input + when)
+ */
+const TEMPORAL_EVENT_SUFFIXES = ['할때', '하면', '하니까', '할 때'];
 /**
  * Particle metadata mapping particles to semantic roles, confidence scores,
  * and vowel harmony variants. Korean particles change based on whether the
@@ -231,13 +242,19 @@ const PARTICLE_ROLES = new Map<string, ParticleMetadata>([
 // =============================================================================
 /**
- * Extra keywords not covered by the profile:
+ * Extra keywords not covered by the profile.
+ *
+ * SIMPLIFIED: Following the Tagalog/Hindi model of minimal EXTRAS.
+ * Removed attached particle+verb compounds (를토글, 을토글, etc.) that create
+ * parsing ambiguity. Japanese tokenizer explicitly avoids these - we follow suit.
+ *
+ * Only includes:
  * - Literals (true, false, null, undefined)
  * - Positional words
  * - Event names
- * - Attached particle forms (native idioms)
- * - Conditional event forms
  * - Time units
+ * - References not in profile
+ * - Logical operators
  */
 const KOREAN_EXTRAS: KeywordEntry[] = [
   // Values/Literals
@@ -267,64 +284,26 @@ const KOREAN_EXTRAS: KeywordEntry[] = [
   { native: '마우스오버', normalized: 'mouseover' },
   { native: '마우스아웃', normalized: 'mouseout' },
-  // References (additional forms)
+  // References (additional forms not in profile)
   { native: '내', normalized: 'my' },
   { native: '그것의', normalized: 'its' },
-  // Conditional event forms (native idioms)
-  { native: '하면', normalized: 'on' },
-  { native: '으면', normalized: 'on' },
-  { native: '면', normalized: 'on' },
-  { native: '할때', normalized: 'on' },
-  { native: '할 때', normalized: 'on' },
-  { native: '을때', normalized: 'on' },
-  { native: '을 때', normalized: 'on' },
-  { native: '하니까', normalized: 'on' },
-  { native: '니까', normalized: 'on' },
-  // Control flow helpers
-  { native: '그러면', normalized: 'then' },
-  { native: '그렇지않으면', normalized: 'otherwise' },
-  { native: '중단', normalized: 'break' },
   // Logical
   { native: '그리고', normalized: 'and' },
   { native: '또는', normalized: 'or' },
   { native: '아니', normalized: 'not' },
   { native: '이다', normalized: 'is' },
-  // Command overrides (ensure correct mapping when profile has multiple meanings)
-  { native: '추가', normalized: 'add' }, // Profile may have this as 'append'
-  // Attached particle forms (native idioms - particle + verb without space)
-  // Object particle 를 (after vowel)
-  { native: '를토글', normalized: 'toggle' },
-  { native: '를전환', normalized: 'toggle' },
-  { native: '를추가', normalized: 'add' },
-  { native: '를제거', normalized: 'remove' },
-  { native: '를삭제', normalized: 'remove' },
-  { native: '를증가', normalized: 'increment' },
-  { native: '를감소', normalized: 'decrement' },
-  { native: '를표시', normalized: 'show' },
-  { native: '를숨기다', normalized: 'hide' },
-  { native: '를설정', normalized: 'set' },
-  // Object particle 을 (after consonant)
-  { native: '을토글', normalized: 'toggle' },
-  { native: '을전환', normalized: 'toggle' },
-  { native: '을추가', normalized: 'add' },
-  { native: '을제거', normalized: 'remove' },
-  { native: '을삭제', normalized: 'remove' },
-  { native: '을증가', normalized: 'increment' },
-  { native: '을감소', normalized: 'decrement' },
-  { native: '을표시', normalized: 'show' },
-  { native: '을숨기다', normalized: 'hide' },
-  { native: '을설정', normalized: 'set' },
   // Time units
   { native: '초', normalized: 's' },
   { native: '밀리초', normalized: 'ms' },
   { native: '분', normalized: 'm' },
   { native: '시간', normalized: 'h' },
+  // Note: Attached particle+verb forms (를토글, 을토글, etc.) are intentionally
+  // NOT included because they cause ambiguous parsing. The separate particle + verb
+  // pattern (를 + 토글) is preferred for consistent semantic analysis.
+  // This follows the same approach as the Japanese tokenizer.
 ];
 // =============================================================================
@@ -431,7 +410,14 @@ export class KoreanTokenizer extends BaseTokenizer {
       if (isKorean(input[pos])) {
         const wordToken = this.extractKoreanWord(input, pos);
         if (wordToken) {
-          tokens.push(wordToken);
+          // Check if the word ends with a temporal event suffix (e.g., 클릭할때 → 클릭 + 할때)
+          const splitResult = this.trySplitTemporalSuffix(wordToken);
+          if (splitResult) {
+            tokens.push(splitResult.stemToken);
+            tokens.push(splitResult.suffixToken);
+          } else {
+            tokens.push(wordToken);
+          }
           pos = wordToken.position.end;
           continue;
         }
@@ -528,6 +514,19 @@ export class KoreanTokenizer extends BaseTokenizer {
       }
       if (!allKorean) continue;
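+      // If this candidate is a standalone particle, return null so the main
+      // tokenize loop classifies it as a particle.
+      // This prevents roleMarker keywords from overriding particle classification.
+      // NOTE(review): the `startPos === startPos` operand below is always true, so
+      // only `PARTICLES.has(candidate)` gates this branch - confirm the intended check.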
+      if (PARTICLES.has(candidate) && startPos === startPos) {
+        // Check if this particle-like candidate is at a word boundary (standalone)
+        const afterCandidate = startPos + len;
+        const nextChar = afterCandidate < input.length ? input[afterCandidate] : '';
+        if (nextChar === '' || isWhitespace(nextChar) || !isKorean(nextChar)) {
+          return null; // Let main loop handle as particle
+        }
+        // Otherwise it's part of a larger word, continue checking
+      }
       // O(1) Map lookup instead of O(n) array search
       const keywordEntry = this.lookupKeyword(candidate);
       if (keywordEntry) {
@@ -552,6 +551,12 @@ export class KoreanTokenizer extends BaseTokenizer {
       const char = input[pos];
       const nextChar = pos + 1 < input.length ? input[pos + 1] : '';
+      // If we're at a particle with no content yet, return null to let main loop handle it
+      // This ensures particles like 를, 을 (e.g. in #count를증가) are separated properly
+      if (word.length === 0 && SINGLE_CHAR_PARTICLES.has(char)) {
+        return null;
+      }
       // Stop at single-char particles only if:
       // 1. We have content already
       // 2. The particle is at a word boundary (followed by whitespace, end, non-Korean, or another particle)
@@ -594,6 +599,12 @@ export class KoreanTokenizer extends BaseTokenizer {
     if (!word) return null;
+    // If the word is a particle, return null to let the main tokenize loop handle it
+    // This prevents roleMarker keywords from overriding particle classification
+    if (PARTICLES.has(word)) {
+      return null;
+    }
     // O(1) Map lookup instead of O(n) array search
     const keywordEntry = this.lookupKeyword(word);
     if (keywordEntry) {
@@ -634,6 +645,51 @@ export class KoreanTokenizer extends BaseTokenizer {
       skipWhitespace: false,
     });
   }
+  /**
+   * Splits a trailing temporal event suffix off a Korean word token.
+   * Handles compact forms such as 클릭할때 → 클릭 + 할때.
+   *
+   * A split happens only when the word ends with an entry of
+   * TEMPORAL_EVENT_SUFFIXES, a non-empty stem remains, and that stem is
+   * found by lookupKeyword. The first suffix (in array order) that meets
+   * all three conditions is used.
+   *
+   * @param wordToken - Token whose value may carry an attached temporal suffix
+   * @returns The stem and suffix tokens, or null when no suffix qualifies
+   */
+  private trySplitTemporalSuffix(
+    wordToken: LanguageToken
+  ): { stemToken: LanguageToken; suffixToken: LanguageToken } | null {
+    const text = wordToken.value;
+    for (const ending of TEMPORAL_EVENT_SUFFIXES) {
+      // The suffix must be attached to something: a bare suffix is never split
+      if (text.length <= ending.length || !text.endsWith(ending)) continue;
+      const head = text.slice(0, -ending.length);
+      // Only a stem that is itself a known keyword justifies the split
+      const headEntry = this.lookupKeyword(head.toLowerCase());
+      if (!headEntry) continue;
+      // Offset where the stem ends and the suffix begins
+      const boundary = wordToken.position.start + head.length;
+      return {
+        stemToken: createToken(
+          head,
+          'keyword',
+          createPosition(wordToken.position.start, boundary),
+          headEntry.normalized
+        ),
+        // Every temporal suffix is normalized to 'when'
+        suffixToken: createToken(
+          ending,
+          'keyword',
+          createPosition(boundary, wordToken.position.end),
+          'when'
+        ),
+      };
+    }
+    return null;
+  }
 }
 /**

package/src/tokenizers/ms.ts CHANGED Viewed

@@ -54,10 +54,13 @@ const MALAY_EXTRAS: KeywordEntry[] = [
   // Events
   { native: 'klik', normalized: 'click' },
   { native: 'berubah', normalized: 'change' },
+  { native: 'ubah', normalized: 'change' }, // Alternative for change
   { native: 'hantar', normalized: 'submit' },
   { native: 'input', normalized: 'input' },
+  { native: 'masuk', normalized: 'input' }, // Alternative for input (means "enter")
   { native: 'muat', normalized: 'load' },
   { native: 'tatal', normalized: 'scroll' },
+  { native: 'hover', normalized: 'hover' }, // English loanword commonly used
 ];
 // =============================================================================

package/src/tokenizers/quechua.ts CHANGED Viewed

@@ -82,6 +82,7 @@ const QUECHUA_EXTRAS: KeywordEntry[] = [
   // Events
   { native: 'llikllay', normalized: 'click' },
+  { native: 'ñitiy', normalized: 'click' },
   { native: 'click', normalized: 'click' },
   { native: 'yaykuy', normalized: 'input' },
   { native: 'llave uray', normalized: 'keydown' },
@@ -172,8 +173,18 @@ export class QuechuaTokenizer extends BaseTokenizer {
         const selectorToken = this.trySelector(input, pos);
         if (selectorToken) {
-          tokens.push(selectorToken);
-          pos = selectorToken.position.end;
+          // Check if selector has a Quechua suffix attached
+          const selectorWithSuffix = this.splitSelectorSuffix(selectorToken);
+          if (selectorWithSuffix.length === 2) {
+            // Selector + suffix: push both tokens
+            tokens.push(selectorWithSuffix[0]);
+            tokens.push(selectorWithSuffix[1]);
+            pos = selectorWithSuffix[1].position.end;
+          } else {
+            // Just selector: push as-is
+            tokens.push(selectorToken);
+            pos = selectorToken.position.end;
+          }
           continue;
         }
       }
@@ -226,6 +237,14 @@ export class QuechuaTokenizer extends BaseTokenizer {
       }
       if (isQuechuaLetter(input[pos])) {
+        // Try multi-word keywords first (e.g., "mana qhawachiy" = blur)
+        const multiWordToken = this.tryMultiWordKeyword(input, pos);
+        if (multiWordToken) {
+          tokens.push(multiWordToken);
+          pos = multiWordToken.position.end;
+          continue;
+        }
         const wordToken = this.extractWord(input, pos);
         if (wordToken) {
           tokens.push(wordToken);
@@ -272,6 +291,86 @@ export class QuechuaTokenizer extends BaseTokenizer {
     return null;
   }
+  /**
+   * Split a selector token if it has a Quechua suffix attached.
+   * E.g., ".openta" -> [".open", "ta"] when "ta" is an entry of SUFFIXES.
+   * The suffix token's value is the trailing slice of the selector text, so it
+   * keeps the casing used in the input.
+   *
+   * A suffix is only split off when at least two characters remain for the
+   * base, so a selector such as ".ta" is never reduced to a bare "." token.
+   * NOTE(review): this assumes selectors start with a one-character sigil
+   * ('.', '#', ...) - confirm against trySelector.
+   *
+   * @param selectorToken - Selector token produced by trySelector
+   * @returns Array with 1 token (no suffix) or 2 tokens (selector + suffix)
+   */
+  private splitSelectorSuffix(selectorToken: LanguageToken): LanguageToken[] {
+    const text = selectorToken.value;
+    // Lowercase once; suffix matching is case-insensitive
+    const lowered = text.toLowerCase();
+    const start = selectorToken.position.start;
+    // Check if selector ends with any known suffix
+    for (const suffix of SUFFIXES) {
+      // An empty suffix would yield an empty particle token
+      if (suffix.length === 0 || !lowered.endsWith(suffix)) continue;
+      const baseEnd = text.length - suffix.length;
+      // Keep the sigil plus at least one character in the base selector
+      if (baseEnd < 2) continue;
+      // Create base selector token
+      const baseToken = createToken(
+        text.slice(0, baseEnd),
+        'selector',
+        createPosition(start, start + baseEnd)
+      );
+      // Create suffix particle token
+      const suffixToken = createToken(
+        text.slice(baseEnd),
+        'particle',
+        createPosition(start + baseEnd, selectorToken.position.end)
+      );
+      return [baseToken, suffixToken];
+    }
+    // No suffix found, return original token
+    return [selectorToken];
+  }
+  /**
+   * Space-containing phrases that are tokenized as one keyword.
+   * No pattern is a prefix of another, so the lookup order does not affect
+   * which pattern matches. Patterns are stored in lowercase.
+   */
+  private static readonly MULTI_WORD_KEYWORDS: ReadonlyArray<{
+    pattern: string;
+    normalized: string;
+  }> = [
+    { pattern: 'mana qhawachiy', normalized: 'blur' },
+    { pattern: 'mana qhaway', normalized: 'blur' },
+    { pattern: 'mana riqsisqa', normalized: 'undefined' },
+    { pattern: 'mana waqtalla', normalized: 'async' },
+    { pattern: 'ñawpaq kaq', normalized: 'previous' },
+    { pattern: 'aswan qayllaqa', normalized: 'closest' },
+    { pattern: 'llave uray', normalized: 'keydown' },
+    { pattern: 'llave hawa', normalized: 'keyup' },
+    { pattern: 'mausiri yayku', normalized: 'mouseover' },
+    { pattern: 'mausiri lluqsi', normalized: 'mouseout' },
+    { pattern: 'waranqa sikundu', normalized: 'ms' },
+  ];
+  /**
+   * Try to match multi-word keywords that should be treated as a single unit.
+   * E.g., "mana qhawachiy" (not focus = blur)
+   *
+   * Matching is case-insensitive and requires the phrase to end at the end of
+   * input, at whitespace, or before a character that is not a Quechua letter.
+   *
+   * @param input - Full source text being tokenized
+   * @param pos - Offset in input at which the phrase must start
+   * @returns A keyword token covering the phrase as written in input, or null
+   */
+  private tryMultiWordKeyword(input: string, pos: number): LanguageToken | null {
+    for (const { pattern, normalized } of QuechuaTokenizer.MULTI_WORD_KEYWORDS) {
+      const endPos = pos + pattern.length;
+      // Lowercase only the candidate slice: lowercasing the whole input on each
+      // call costs O(input length) per token, and toLowerCase() can change the
+      // string's length, which would shift indices relative to `pos`.
+      if (input.slice(pos, endPos).toLowerCase() !== pattern) continue;
+      // Check that it's followed by a word boundary or end of input
+      if (
+        endPos >= input.length ||
+        isWhitespace(input[endPos]) ||
+        !isQuechuaLetter(input[endPos])
+      ) {
+        return createToken(
+          input.slice(pos, endPos),
+          'keyword',
+          createPosition(pos, endPos),
+          normalized
+        );
+      }
+    }
+    return null;
+  }
   private extractWord(input: string, startPos: number): LanguageToken | null {
     let pos = startPos;
     let word = '';

package/src/tokenizers/turkish.ts CHANGED Viewed

@@ -96,13 +96,17 @@ const CASE_SUFFIXES = new Set([
 // =============================================================================
 /**
- * Extra keywords not covered by the profile:
+ * Extra keywords not covered by the profile.
+ *
+ * SIMPLIFIED: Following the Tagalog/Hindi model of minimal EXTRAS.
+ * Command synonyms and diacritic-free variants should be in profile alternatives,
+ * not duplicated here. Only includes:
  * - Literals (true, false, null, undefined)
  * - Positional words
  * - Event names
  * - Time units
- * - Diacritic-free variants
- * - Additional synonyms
+ * - References not in profile
+ * - Logical operators
  */
 const TURKISH_EXTRAS: KeywordEntry[] = [
   // Values/Literals
@@ -149,15 +153,9 @@ const TURKISH_EXTRAS: KeywordEntry[] = [
   { native: 'tuş_bırak', normalized: 'keyup' },
   { native: 'tus_birak', normalized: 'keyup' },
-  // References
-  { native: 'ben', normalized: 'me' },
+  // References (possessive forms not in profile)
   { native: 'benim', normalized: 'my' },
-  { native: 'o', normalized: 'it' },
   { native: 'onun', normalized: 'its' },
-  { native: 'sonuç', normalized: 'result' },
-  { native: 'sonuc', normalized: 'result' },
-  { native: 'olay', normalized: 'event' },
-  { native: 'hedef', normalized: 'target' },
   // Time units
   { native: 'saniye', normalized: 's' },
@@ -171,61 +169,8 @@ const TURKISH_EXTRAS: KeywordEntry[] = [
   { native: 'değil', normalized: 'not' },
   { native: 'degil', normalized: 'not' },
-  // Event triggers (on)
-  { native: 'üzerinde', normalized: 'on' },
-  { native: 'uzerinde', normalized: 'on' },
-  { native: 'olduğunda', normalized: 'on' },
-  { native: 'oldugunda', normalized: 'on' },
-  // Command overrides (ensure correct mapping when profile has multiple meanings)
-  { native: 'ekle', normalized: 'add' }, // Profile may have this as 'append'
-  { native: 'değiştir', normalized: 'toggle' }, // Profile has this as 'swap'
-  // Diacritic-free variants of commands
-  { native: 'değistir', normalized: 'toggle' },
-  { native: 'kaldir', normalized: 'remove' },
-  { native: 'yerlestir', normalized: 'put' },
-  { native: 'olustur', normalized: 'make' },
-  { native: 'artir', normalized: 'increment' },
-  { native: 'yazdir', normalized: 'log' },
-  { native: 'goster', normalized: 'show' },
-  { native: 'gecis', normalized: 'transition' },
-  { native: 'atesle', normalized: 'trigger' },
-  { native: 'gonder', normalized: 'send' },
-  { native: 'bulaniklastir', normalized: 'blur' },
-  { native: 'odak_kaldir', normalized: 'blur' },
-  { native: 'yonlendir', normalized: 'go' },
-  { native: 'cek', normalized: 'fetch' },
-  { native: 'yerles', normalized: 'settle' },
-  { native: 'eger', normalized: 'if' },
-  { native: 'degilse', normalized: 'else' },
-  { native: 'firlat', normalized: 'throw' },
-  { native: 'cagir', normalized: 'call' },
-  { native: 'don', normalized: 'return' },
-  { native: 'dondur', normalized: 'return' },
-  { native: 'eszamansiz', normalized: 'async' },
-  { native: 'soyle', normalized: 'tell' },
-  { native: 'varsayilan', normalized: 'default' },
-  { native: 'baslat', normalized: 'init' },
-  { native: 'basla', normalized: 'init' },
-  { native: 'davranis', normalized: 'behavior' },
-  { native: 'yukle', normalized: 'install' },
-  { native: 'olc', normalized: 'measure' },
-  { native: 'icine', normalized: 'into' },
-  { native: 'once', normalized: 'before' },
-  { native: 'icin', normalized: 'for' },
-  // Colloquial forms
-  { native: 'al', normalized: 'get' },
-  { native: 'yap', normalized: 'set' },
-  // Control flow helpers
-  { native: 'o_zaman', normalized: 'then' },
-  { native: 'bitir', normalized: 'end' },
-  // Case suffix modifiers
-  { native: '-den', normalized: 'from' },
-  { native: '-dan', normalized: 'from' },
+  // Note: Command synonyms and diacritic-free variants should be in profile alternatives.
+  // Event triggers (üzerinde, olduğunda) should be in profile as 'on' alternatives.
 ];
 // =============================================================================
@@ -328,6 +273,14 @@ export class TurkishTokenizer extends BaseTokenizer {
         continue;
       }
+      // Try multi-word phrases first (e.g., "üzerine gelme", "fare üzerinde")
+      const phraseToken = this.tryMultiWordPhrase(input, pos);
+      if (phraseToken) {
+        tokens.push(phraseToken);
+        pos = phraseToken.position.end;
+        continue;
+      }
       // Try Turkish word
       if (isTurkishLetter(input[pos])) {
         const wordToken = this.extractTurkishWord(input, pos);
@@ -358,6 +311,42 @@ export class TurkishTokenizer extends BaseTokenizer {
     return 'identifier';
   }
+  /**
+   * Attempts to read a multi-word phrase that acts as a single keyword,
+   * e.g. "üzerine gelme" (hover) or "fare üzerinde" (mouseover).
+   *
+   * Scans this.profileKeywords in iteration order, considering only entries
+   * whose native form contains a space. The comparison is case-insensitive and
+   * the phrase must end at the end of input, at whitespace, or before a
+   * character that is not a Turkish letter.
+   * NOTE(review): presumably profileKeywords is sorted longest-first so longer
+   * phrases win over shorter ones - confirm where the list is built.
+   *
+   * @param input - Full source text being tokenized
+   * @param pos - Offset in input at which the phrase must start
+   * @returns A keyword token covering the phrase as written in input, or null
+   */
+  private tryMultiWordPhrase(input: string, pos: number): LanguageToken | null {
+    for (const keyword of this.profileKeywords) {
+      const native = keyword.native;
+      // Single words are handled by the regular word extraction
+      if (!native.includes(' ')) continue;
+      const end = pos + native.length;
+      const written = input.slice(pos, end);
+      if (written.toLowerCase() !== native.toLowerCase()) continue;
+      // The phrase must not run into a following Turkish letter
+      const atBoundary =
+        end >= input.length || isWhitespace(input[end]) || !isTurkishLetter(input[end]);
+      if (!atBoundary) continue;
+      return createToken(written, 'keyword', createPosition(pos, end), keyword.normalized);
+    }
+    return null;
+  }
   /**
    * Extract a Turkish word.
    * Uses morphological normalization to handle verb conjugations.
@@ -375,10 +364,10 @@ export class TurkishTokenizer extends BaseTokenizer {
     const lowerWord = word.toLowerCase();
-    // O(1) Map lookup instead of O(n) array search
-    const keywordEntry = this.lookupKeyword(lowerWord);
-    if (keywordEntry) {
-      return createToken(word, 'keyword', createPosition(startPos, pos), keywordEntry.normalized);
+    // Check if it's a case suffix (particle) first
+    // This prevents roleMarker keywords from overriding particle classification
+    if (CASE_SUFFIXES.has(lowerWord)) {
+      return createToken(word, 'particle', createPosition(startPos, pos));
     }
     // Check if it's a postposition
@@ -386,6 +375,12 @@ export class TurkishTokenizer extends BaseTokenizer {
       return createToken(word, 'particle', createPosition(startPos, pos));
     }
+    // O(1) Map lookup instead of O(n) array search
+    const keywordEntry = this.lookupKeyword(lowerWord);
+    if (keywordEntry) {
+      return createToken(word, 'keyword', createPosition(startPos, pos), keywordEntry.normalized);
+    }
     // Try morphological normalization for conjugated forms
     const morphToken = this.tryMorphKeywordMatch(lowerWord, startPos, pos);
     if (morphToken) return morphToken;

package/src/types.ts CHANGED Viewed

@@ -157,6 +157,15 @@ export interface SemanticMetadata {
   readonly sourceText?: string;
   readonly sourcePosition?: SourcePosition;
   readonly patternId?: string;
+  /**
+   * Confidence score for the parse (0-1).
+   * Higher values indicate more certain matches.
+   * - 1.0: Exact match with all roles captured
+   * - 0.8-0.99: High confidence with minor uncertainty (stem matching, optional roles)
+   * - 0.6-0.79: Medium confidence (morphological normalization, defaults applied)
+   * - <0.6: Low confidence (may need fallback to traditional parser)
+   */
+  readonly confidence?: number;
 }
 export interface SourcePosition {
@@ -336,6 +345,10 @@ export interface ExtractionRule {
   readonly transform?: (raw: string) => SemanticValue;
   /** Default value if not found (for optional roles) */
   readonly default?: SemanticValue;
+  /** Static value extraction (for event handler wrapped commands) */
+  readonly value?: string;
+  /** Extract value from a pattern role by name */
+  readonly fromRole?: string;
 }
 /**