@lokascript/semantic 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162)
  1. package/dist/browser-ar.ar.global.js +2 -2
  2. package/dist/browser-core.core.global.js +2 -2
  3. package/dist/browser-de.de.global.js +2 -2
  4. package/dist/browser-east-asian.east-asian.global.js +2 -2
  5. package/dist/browser-en-tr.en-tr.global.js +2 -2
  6. package/dist/browser-en.en.global.js +2 -2
  7. package/dist/browser-es-en.es-en.global.js +2 -2
  8. package/dist/browser-es.es.global.js +2 -2
  9. package/dist/browser-fr.fr.global.js +2 -2
  10. package/dist/browser-id.id.global.js +2 -2
  11. package/dist/browser-ja.ja.global.js +2 -2
  12. package/dist/browser-ko.ko.global.js +2 -2
  13. package/dist/browser-lazy.lazy.global.js +2 -2
  14. package/dist/browser-priority.priority.global.js +2 -2
  15. package/dist/browser-pt.pt.global.js +2 -2
  16. package/dist/browser-qu.qu.global.js +2 -2
  17. package/dist/browser-sw.sw.global.js +2 -2
  18. package/dist/browser-tr.tr.global.js +2 -2
  19. package/dist/browser-western.western.global.js +2 -2
  20. package/dist/browser-zh.zh.global.js +2 -2
  21. package/dist/browser.global.js +2 -2
  22. package/dist/browser.global.js.map +1 -1
  23. package/dist/index.cjs +13042 -17462
  24. package/dist/index.cjs.map +1 -1
  25. package/dist/index.d.cts +49 -5
  26. package/dist/index.d.ts +49 -5
  27. package/dist/index.js +14044 -18464
  28. package/dist/index.js.map +1 -1
  29. package/dist/languages/ar.d.ts +1 -1
  30. package/dist/languages/ar.js +31 -44
  31. package/dist/languages/ar.js.map +1 -1
  32. package/dist/languages/de.d.ts +1 -1
  33. package/dist/languages/de.js +14 -2
  34. package/dist/languages/de.js.map +1 -1
  35. package/dist/languages/en.d.ts +1 -1
  36. package/dist/languages/en.js +558 -12
  37. package/dist/languages/en.js.map +1 -1
  38. package/dist/languages/es.d.ts +1 -1
  39. package/dist/languages/es.js +16 -0
  40. package/dist/languages/es.js.map +1 -1
  41. package/dist/languages/fr.d.ts +1 -1
  42. package/dist/languages/fr.js +14 -2
  43. package/dist/languages/fr.js.map +1 -1
  44. package/dist/languages/id.d.ts +1 -1
  45. package/dist/languages/id.js +14 -2
  46. package/dist/languages/id.js.map +1 -1
  47. package/dist/languages/ja.d.ts +1 -1
  48. package/dist/languages/ja.js +18 -3
  49. package/dist/languages/ja.js.map +1 -1
  50. package/dist/languages/ko.d.ts +8 -1
  51. package/dist/languages/ko.js +75 -43
  52. package/dist/languages/ko.js.map +1 -1
  53. package/dist/languages/pt.d.ts +1 -1
  54. package/dist/languages/pt.js +17 -0
  55. package/dist/languages/pt.js.map +1 -1
  56. package/dist/languages/qu.d.ts +12 -1
  57. package/dist/languages/qu.js +77 -2
  58. package/dist/languages/qu.js.map +1 -1
  59. package/dist/languages/sw.d.ts +1 -1
  60. package/dist/languages/sw.js.map +1 -1
  61. package/dist/languages/tr.d.ts +9 -1
  62. package/dist/languages/tr.js +96 -72
  63. package/dist/languages/tr.js.map +1 -1
  64. package/dist/languages/zh.d.ts +1 -1
  65. package/dist/languages/zh.js +16 -0
  66. package/dist/languages/zh.js.map +1 -1
  67. package/dist/{types-C4dcj53L.d.ts → types-BY3Id07j.d.ts} +20 -5
  68. package/package.json +20 -29
  69. package/src/generators/command-schemas.ts +21 -10
  70. package/src/generators/event-handler-generator.ts +50 -44
  71. package/src/generators/language-profiles.ts +6 -0
  72. package/src/generators/pattern-generator.ts +883 -1
  73. package/src/generators/profiles/arabic.ts +19 -3
  74. package/src/generators/profiles/bengali.ts +12 -1
  75. package/src/generators/profiles/chinese.ts +15 -0
  76. package/src/generators/profiles/french.ts +12 -1
  77. package/src/generators/profiles/german.ts +12 -1
  78. package/src/generators/profiles/hebrew.ts +148 -0
  79. package/src/generators/profiles/hindi.ts +12 -1
  80. package/src/generators/profiles/index.ts +2 -0
  81. package/src/generators/profiles/indonesian.ts +12 -1
  82. package/src/generators/profiles/italian.ts +16 -0
  83. package/src/generators/profiles/japanese.ts +11 -2
  84. package/src/generators/profiles/korean.ts +15 -1
  85. package/src/generators/profiles/polish.ts +12 -0
  86. package/src/generators/profiles/portuguese.ts +16 -0
  87. package/src/generators/profiles/russian.ts +11 -0
  88. package/src/generators/profiles/spanish.ts +15 -0
  89. package/src/generators/profiles/spanishMexico.ts +176 -0
  90. package/src/generators/profiles/thai.ts +11 -0
  91. package/src/generators/profiles/turkish.ts +49 -7
  92. package/src/generators/profiles/types.ts +21 -5
  93. package/src/generators/profiles/ukrainian.ts +11 -0
  94. package/src/generators/profiles/vietnamese.ts +11 -0
  95. package/src/language-building-schema.ts +111 -0
  96. package/src/languages/_all.ts +5 -1
  97. package/src/languages/es-MX.ts +32 -0
  98. package/src/languages/he.ts +15 -0
  99. package/src/parser/pattern-matcher.ts +10 -1
  100. package/src/parser/semantic-parser.ts +3 -0
  101. package/src/patterns/add/ar.ts +3 -59
  102. package/src/patterns/add/index.ts +5 -1
  103. package/src/patterns/add/ja.ts +3 -81
  104. package/src/patterns/add/ko.ts +3 -62
  105. package/src/patterns/add/qu.ts +69 -0
  106. package/src/patterns/add/tr.ts +3 -59
  107. package/src/patterns/builders.ts +1 -0
  108. package/src/patterns/decrement/tr.ts +3 -36
  109. package/src/patterns/event-handler/ar.ts +3 -139
  110. package/src/patterns/event-handler/he.ts +15 -0
  111. package/src/patterns/event-handler/index.ts +5 -1
  112. package/src/patterns/event-handler/ja.ts +3 -106
  113. package/src/patterns/event-handler/ko.ts +3 -121
  114. package/src/patterns/event-handler/ms.ts +45 -20
  115. package/src/patterns/event-handler/tr.ts +3 -158
  116. package/src/patterns/get/ar.ts +3 -37
  117. package/src/patterns/get/ja.ts +3 -41
  118. package/src/patterns/get/ko.ts +3 -41
  119. package/src/patterns/grammar-transformed/ja.ts +3 -1701
  120. package/src/patterns/grammar-transformed/ko.ts +3 -1299
  121. package/src/patterns/grammar-transformed/tr.ts +3 -1055
  122. package/src/patterns/hide/ar.ts +3 -55
  123. package/src/patterns/hide/ja.ts +3 -57
  124. package/src/patterns/hide/ko.ts +3 -57
  125. package/src/patterns/hide/tr.ts +3 -53
  126. package/src/patterns/increment/tr.ts +3 -40
  127. package/src/patterns/put/ar.ts +3 -62
  128. package/src/patterns/put/ja.ts +3 -63
  129. package/src/patterns/put/ko.ts +3 -55
  130. package/src/patterns/put/tr.ts +3 -55
  131. package/src/patterns/remove/ar.ts +3 -59
  132. package/src/patterns/remove/index.ts +5 -1
  133. package/src/patterns/remove/ja.ts +3 -62
  134. package/src/patterns/remove/ko.ts +3 -66
  135. package/src/patterns/remove/qu.ts +69 -0
  136. package/src/patterns/remove/tr.ts +3 -66
  137. package/src/patterns/set/ar.ts +3 -72
  138. package/src/patterns/set/ja.ts +3 -74
  139. package/src/patterns/set/ko.ts +3 -73
  140. package/src/patterns/set/tr.ts +3 -95
  141. package/src/patterns/show/ar.ts +3 -55
  142. package/src/patterns/show/ja.ts +3 -57
  143. package/src/patterns/show/ko.ts +3 -61
  144. package/src/patterns/show/tr.ts +3 -53
  145. package/src/patterns/take/ar.ts +3 -39
  146. package/src/patterns/toggle/ar.ts +3 -49
  147. package/src/patterns/toggle/index.ts +5 -1
  148. package/src/patterns/toggle/ja.ts +3 -144
  149. package/src/patterns/toggle/ko.ts +3 -101
  150. package/src/patterns/toggle/qu.ts +90 -0
  151. package/src/patterns/toggle/tr.ts +3 -76
  152. package/src/registry.ts +179 -15
  153. package/src/tokenizers/arabic.ts +13 -46
  154. package/src/tokenizers/bengali.ts +2 -16
  155. package/src/tokenizers/he.ts +542 -0
  156. package/src/tokenizers/index.ts +1 -0
  157. package/src/tokenizers/japanese.ts +3 -1
  158. package/src/tokenizers/korean.ts +104 -48
  159. package/src/tokenizers/ms.ts +3 -0
  160. package/src/tokenizers/quechua.ts +101 -2
  161. package/src/tokenizers/turkish.ts +64 -69
  162. package/src/types.ts +13 -0
@@ -104,6 +104,17 @@ const SINGLE_CHAR_PARTICLES = new Set([
104
104
  */
105
105
  const MULTI_CHAR_PARTICLES = ['에서', '으로', '부터', '까지', '처럼', '보다'];
106
106
 
107
+ /**
108
+ * Temporal event suffixes that should be split from compound words.
109
+ * These are verb endings that indicate "when" something happens.
110
+ * Longer suffixes should be listed before shorter ones to ensure greedy matching.
111
+ *
112
+ * Examples:
113
+ * - 클릭할때 → 클릭 + 할때 (click + when)
114
+ * - 입력할때 → 입력 + 할때 (input + when)
115
+ */
116
+ const TEMPORAL_EVENT_SUFFIXES = ['할때', '하면', '하니까', '할 때'];
117
+
107
118
  /**
108
119
  * Particle metadata mapping particles to semantic roles, confidence scores,
109
120
  * and vowel harmony variants. Korean particles change based on whether the
@@ -231,13 +242,19 @@ const PARTICLE_ROLES = new Map<string, ParticleMetadata>([
231
242
  // =============================================================================
232
243
 
233
244
  /**
234
- * Extra keywords not covered by the profile:
245
+ * Extra keywords not covered by the profile.
246
+ *
247
+ * SIMPLIFIED: Following the Tagalog/Hindi model of minimal EXTRAS.
248
+ * Removed attached particle+verb compounds (를토글, 을토글, etc.) that create
249
+ * parsing ambiguity. Japanese tokenizer explicitly avoids these - we follow suit.
250
+ *
251
+ * Only includes:
235
252
  * - Literals (true, false, null, undefined)
236
253
  * - Positional words
237
254
  * - Event names
238
- * - Attached particle forms (native idioms)
239
- * - Conditional event forms
240
255
  * - Time units
256
+ * - References not in profile
257
+ * - Logical operators
241
258
  */
242
259
  const KOREAN_EXTRAS: KeywordEntry[] = [
243
260
  // Values/Literals
@@ -267,64 +284,26 @@ const KOREAN_EXTRAS: KeywordEntry[] = [
267
284
  { native: '마우스오버', normalized: 'mouseover' },
268
285
  { native: '마우스아웃', normalized: 'mouseout' },
269
286
 
270
- // References (additional forms)
287
+ // References (additional forms not in profile)
271
288
  { native: '내', normalized: 'my' },
272
289
  { native: '그것의', normalized: 'its' },
273
290
 
274
- // Conditional event forms (native idioms)
275
- { native: '하면', normalized: 'on' },
276
- { native: '으면', normalized: 'on' },
277
- { native: '면', normalized: 'on' },
278
- { native: '할때', normalized: 'on' },
279
- { native: '할 때', normalized: 'on' },
280
- { native: '을때', normalized: 'on' },
281
- { native: '을 때', normalized: 'on' },
282
- { native: '하니까', normalized: 'on' },
283
- { native: '니까', normalized: 'on' },
284
-
285
- // Control flow helpers
286
- { native: '그러면', normalized: 'then' },
287
- { native: '그렇지않으면', normalized: 'otherwise' },
288
- { native: '중단', normalized: 'break' },
289
-
290
291
  // Logical
291
292
  { native: '그리고', normalized: 'and' },
292
293
  { native: '또는', normalized: 'or' },
293
294
  { native: '아니', normalized: 'not' },
294
295
  { native: '이다', normalized: 'is' },
295
296
 
296
- // Command overrides (ensure correct mapping when profile has multiple meanings)
297
- { native: '추가', normalized: 'add' }, // Profile may have this as 'append'
298
-
299
- // Attached particle forms (native idioms - particle + verb without space)
300
- // Object particle 를 (after vowel)
301
- { native: '를토글', normalized: 'toggle' },
302
- { native: '를전환', normalized: 'toggle' },
303
- { native: '를추가', normalized: 'add' },
304
- { native: '를제거', normalized: 'remove' },
305
- { native: '를삭제', normalized: 'remove' },
306
- { native: '를증가', normalized: 'increment' },
307
- { native: '를감소', normalized: 'decrement' },
308
- { native: '를표시', normalized: 'show' },
309
- { native: '를숨기다', normalized: 'hide' },
310
- { native: '를설정', normalized: 'set' },
311
- // Object particle 을 (after consonant)
312
- { native: '을토글', normalized: 'toggle' },
313
- { native: '을전환', normalized: 'toggle' },
314
- { native: '을추가', normalized: 'add' },
315
- { native: '을제거', normalized: 'remove' },
316
- { native: '을삭제', normalized: 'remove' },
317
- { native: '을증가', normalized: 'increment' },
318
- { native: '을감소', normalized: 'decrement' },
319
- { native: '을표시', normalized: 'show' },
320
- { native: '을숨기다', normalized: 'hide' },
321
- { native: '을설정', normalized: 'set' },
322
-
323
297
  // Time units
324
298
  { native: '초', normalized: 's' },
325
299
  { native: '밀리초', normalized: 'ms' },
326
300
  { native: '분', normalized: 'm' },
327
301
  { native: '시간', normalized: 'h' },
302
+
303
+ // Note: Attached particle+verb forms (를토글, 을토글, etc.) are intentionally
304
+ // NOT included because they cause ambiguous parsing. The separate particle + verb
305
+ // pattern (를 + 토글) is preferred for consistent semantic analysis.
306
+ // This follows the same approach as the Japanese tokenizer.
328
307
  ];
329
308
 
330
309
  // =============================================================================
@@ -431,7 +410,14 @@ export class KoreanTokenizer extends BaseTokenizer {
431
410
  if (isKorean(input[pos])) {
432
411
  const wordToken = this.extractKoreanWord(input, pos);
433
412
  if (wordToken) {
434
- tokens.push(wordToken);
413
+ // Check if the word ends with a temporal event suffix (e.g., 클릭할때 → 클릭 + 할때)
414
+ const splitResult = this.trySplitTemporalSuffix(wordToken);
415
+ if (splitResult) {
416
+ tokens.push(splitResult.stemToken);
417
+ tokens.push(splitResult.suffixToken);
418
+ } else {
419
+ tokens.push(wordToken);
420
+ }
435
421
  pos = wordToken.position.end;
436
422
  continue;
437
423
  }
@@ -528,6 +514,19 @@ export class KoreanTokenizer extends BaseTokenizer {
528
514
  }
529
515
  if (!allKorean) continue;
530
516
 
517
+ // If this candidate starting at the beginning is a particle, return null
518
+ // to let the main tokenize loop handle it as a particle
519
+ // This prevents roleMarker keywords from overriding particle classification
520
+ if (PARTICLES.has(candidate) && startPos === startPos) {
521
+ // Check if this particle-like candidate is at a word boundary (standalone)
522
+ const afterCandidate = startPos + len;
523
+ const nextChar = afterCandidate < input.length ? input[afterCandidate] : '';
524
+ if (nextChar === '' || isWhitespace(nextChar) || !isKorean(nextChar)) {
525
+ return null; // Let main loop handle as particle
526
+ }
527
+ // Otherwise it's part of a larger word, continue checking
528
+ }
529
+
531
530
  // O(1) Map lookup instead of O(n) array search
532
531
  const keywordEntry = this.lookupKeyword(candidate);
533
532
  if (keywordEntry) {
@@ -552,6 +551,12 @@ export class KoreanTokenizer extends BaseTokenizer {
552
551
  const char = input[pos];
553
552
  const nextChar = pos + 1 < input.length ? input[pos + 1] : '';
554
553
 
554
+ // If we're at a particle with no content yet, return null to let main loop handle it
555
+ // This ensures particles like 를, 을 in #count를증가 are separated properly
556
+ if (word.length === 0 && SINGLE_CHAR_PARTICLES.has(char)) {
557
+ return null;
558
+ }
559
+
555
560
  // Stop at single-char particles only if:
556
561
  // 1. We have content already
557
562
  // 2. The particle is at a word boundary (followed by whitespace, end, non-Korean, or another particle)
@@ -594,6 +599,12 @@ export class KoreanTokenizer extends BaseTokenizer {
594
599
 
595
600
  if (!word) return null;
596
601
 
602
+ // If the word is a particle, return null to let the main tokenize loop handle it
603
+ // This prevents roleMarker keywords from overriding particle classification
604
+ if (PARTICLES.has(word)) {
605
+ return null;
606
+ }
607
+
597
608
  // O(1) Map lookup instead of O(n) array search
598
609
  const keywordEntry = this.lookupKeyword(word);
599
610
  if (keywordEntry) {
@@ -634,6 +645,51 @@ export class KoreanTokenizer extends BaseTokenizer {
634
645
  skipWhitespace: false,
635
646
  });
636
647
  }
648
+
649
+ /**
650
+ * Try to split a temporal event suffix from a word token.
651
+ * This handles compact forms like 클릭할때 → 클릭 + 할때
652
+ *
653
+ * @returns Split tokens if a suffix is found, null otherwise
654
+ */
655
+ private trySplitTemporalSuffix(
656
+ wordToken: LanguageToken
657
+ ): { stemToken: LanguageToken; suffixToken: LanguageToken } | null {
658
+ const word = wordToken.value;
659
+
660
+ // Check for temporal suffixes (longest first)
661
+ for (const suffix of TEMPORAL_EVENT_SUFFIXES) {
662
+ if (word.endsWith(suffix) && word.length > suffix.length) {
663
+ const stem = word.slice(0, -suffix.length);
664
+
665
+ // Only split if the stem is a known keyword
666
+ const stemLower = stem.toLowerCase();
667
+ const keywordEntry = this.lookupKeyword(stemLower);
668
+ if (!keywordEntry) continue;
669
+
670
+ const stemEnd = wordToken.position.start + stem.length;
671
+
672
+ const stemToken = createToken(
673
+ stem,
674
+ 'keyword',
675
+ createPosition(wordToken.position.start, stemEnd),
676
+ keywordEntry.normalized
677
+ );
678
+
679
+ // Create suffix token as a keyword (event marker)
680
+ const suffixToken = createToken(
681
+ suffix,
682
+ 'keyword',
683
+ createPosition(stemEnd, wordToken.position.end),
684
+ 'when' // Normalize temporal suffixes to 'when'
685
+ );
686
+
687
+ return { stemToken, suffixToken };
688
+ }
689
+ }
690
+
691
+ return null;
692
+ }
637
693
  }
638
694
 
639
695
  /**
@@ -54,10 +54,13 @@ const MALAY_EXTRAS: KeywordEntry[] = [
54
54
  // Events
55
55
  { native: 'klik', normalized: 'click' },
56
56
  { native: 'berubah', normalized: 'change' },
57
+ { native: 'ubah', normalized: 'change' }, // Alternative for change
57
58
  { native: 'hantar', normalized: 'submit' },
58
59
  { native: 'input', normalized: 'input' },
60
+ { native: 'masuk', normalized: 'input' }, // Alternative for input (means "enter")
59
61
  { native: 'muat', normalized: 'load' },
60
62
  { native: 'tatal', normalized: 'scroll' },
63
+ { native: 'hover', normalized: 'hover' }, // English loanword commonly used
61
64
  ];
62
65
 
63
66
  // =============================================================================
@@ -82,6 +82,7 @@ const QUECHUA_EXTRAS: KeywordEntry[] = [
82
82
 
83
83
  // Events
84
84
  { native: 'llikllay', normalized: 'click' },
85
+ { native: 'ñitiy', normalized: 'click' },
85
86
  { native: 'click', normalized: 'click' },
86
87
  { native: 'yaykuy', normalized: 'input' },
87
88
  { native: 'llave uray', normalized: 'keydown' },
@@ -172,8 +173,18 @@ export class QuechuaTokenizer extends BaseTokenizer {
172
173
 
173
174
  const selectorToken = this.trySelector(input, pos);
174
175
  if (selectorToken) {
175
- tokens.push(selectorToken);
176
- pos = selectorToken.position.end;
176
+ // Check if selector has a Quechua suffix attached
177
+ const selectorWithSuffix = this.splitSelectorSuffix(selectorToken);
178
+ if (selectorWithSuffix.length === 2) {
179
+ // Selector + suffix: push both tokens
180
+ tokens.push(selectorWithSuffix[0]);
181
+ tokens.push(selectorWithSuffix[1]);
182
+ pos = selectorWithSuffix[1].position.end;
183
+ } else {
184
+ // Just selector: push as-is
185
+ tokens.push(selectorToken);
186
+ pos = selectorToken.position.end;
187
+ }
177
188
  continue;
178
189
  }
179
190
  }
@@ -226,6 +237,14 @@ export class QuechuaTokenizer extends BaseTokenizer {
226
237
  }
227
238
 
228
239
  if (isQuechuaLetter(input[pos])) {
240
+ // Try multi-word keywords first (e.g., "mana qhawachiy" = blur)
241
+ const multiWordToken = this.tryMultiWordKeyword(input, pos);
242
+ if (multiWordToken) {
243
+ tokens.push(multiWordToken);
244
+ pos = multiWordToken.position.end;
245
+ continue;
246
+ }
247
+
229
248
  const wordToken = this.extractWord(input, pos);
230
249
  if (wordToken) {
231
250
  tokens.push(wordToken);
@@ -272,6 +291,86 @@ export class QuechuaTokenizer extends BaseTokenizer {
272
291
  return null;
273
292
  }
274
293
 
294
+ /**
295
+ * Split a selector token if it has a Quechua suffix attached.
296
+ * E.g., ".openta" -> [".open", "ta"]
297
+ * Returns array with 1 token (no suffix) or 2 tokens (selector + suffix)
298
+ */
299
+ private splitSelectorSuffix(selectorToken: LanguageToken): LanguageToken[] {
300
+ const text = selectorToken.value;
301
+
302
+ // Check if selector ends with any known suffix
303
+ for (const suffix of SUFFIXES) {
304
+ if (text.toLowerCase().endsWith(suffix)) {
305
+ const baseEnd = text.length - suffix.length;
306
+ const base = text.slice(0, baseEnd);
307
+ const suffixPart = text.slice(baseEnd);
308
+
309
+ // Create base selector token
310
+ const baseToken = createToken(
311
+ base,
312
+ 'selector',
313
+ createPosition(selectorToken.position.start, selectorToken.position.start + baseEnd)
314
+ );
315
+
316
+ // Create suffix particle token
317
+ const suffixToken = createToken(
318
+ suffixPart,
319
+ 'particle',
320
+ createPosition(selectorToken.position.start + baseEnd, selectorToken.position.end)
321
+ );
322
+
323
+ return [baseToken, suffixToken];
324
+ }
325
+ }
326
+
327
+ // No suffix found, return original token
328
+ return [selectorToken];
329
+ }
330
+
331
+ /**
332
+ * Try to match multi-word keywords that should be treated as a single unit.
333
+ * E.g., "mana qhawachiy" (not focus = blur)
334
+ */
335
+ private tryMultiWordKeyword(input: string, pos: number): LanguageToken | null {
336
+ // Multi-word keywords (longest first)
337
+ const multiWordKeywords: Array<{ pattern: string; normalized: string }> = [
338
+ { pattern: 'mana qhawachiy', normalized: 'blur' },
339
+ { pattern: 'mana qhaway', normalized: 'blur' },
340
+ { pattern: 'mana riqsisqa', normalized: 'undefined' },
341
+ { pattern: 'mana waqtalla', normalized: 'async' },
342
+ { pattern: 'ñawpaq kaq', normalized: 'previous' },
343
+ { pattern: 'aswan qayllaqa', normalized: 'closest' },
344
+ { pattern: 'llave uray', normalized: 'keydown' },
345
+ { pattern: 'llave hawa', normalized: 'keyup' },
346
+ { pattern: 'mausiri yayku', normalized: 'mouseover' },
347
+ { pattern: 'mausiri lluqsi', normalized: 'mouseout' },
348
+ { pattern: 'waranqa sikundu', normalized: 'ms' },
349
+ ];
350
+
351
+ const inputLower = input.toLowerCase();
352
+ for (const { pattern, normalized } of multiWordKeywords) {
353
+ if (inputLower.slice(pos, pos + pattern.length) === pattern) {
354
+ // Check that it's followed by whitespace or end of input
355
+ const endPos = pos + pattern.length;
356
+ if (
357
+ endPos >= input.length ||
358
+ isWhitespace(input[endPos]) ||
359
+ !isQuechuaLetter(input[endPos])
360
+ ) {
361
+ return createToken(
362
+ input.slice(pos, endPos),
363
+ 'keyword',
364
+ createPosition(pos, endPos),
365
+ normalized
366
+ );
367
+ }
368
+ }
369
+ }
370
+
371
+ return null;
372
+ }
373
+
275
374
  private extractWord(input: string, startPos: number): LanguageToken | null {
276
375
  let pos = startPos;
277
376
  let word = '';
@@ -96,13 +96,17 @@ const CASE_SUFFIXES = new Set([
96
96
  // =============================================================================
97
97
 
98
98
  /**
99
- * Extra keywords not covered by the profile:
99
+ * Extra keywords not covered by the profile.
100
+ *
101
+ * SIMPLIFIED: Following the Tagalog/Hindi model of minimal EXTRAS.
102
+ * Command synonyms and diacritic-free variants should be in profile alternatives,
103
+ * not duplicated here. Only includes:
100
104
  * - Literals (true, false, null, undefined)
101
105
  * - Positional words
102
106
  * - Event names
103
107
  * - Time units
104
- * - Diacritic-free variants
105
- * - Additional synonyms
108
+ * - References not in profile
109
+ * - Logical operators
106
110
  */
107
111
  const TURKISH_EXTRAS: KeywordEntry[] = [
108
112
  // Values/Literals
@@ -149,15 +153,9 @@ const TURKISH_EXTRAS: KeywordEntry[] = [
149
153
  { native: 'tuş_bırak', normalized: 'keyup' },
150
154
  { native: 'tus_birak', normalized: 'keyup' },
151
155
 
152
- // References
153
- { native: 'ben', normalized: 'me' },
156
+ // References (possessive forms not in profile)
154
157
  { native: 'benim', normalized: 'my' },
155
- { native: 'o', normalized: 'it' },
156
158
  { native: 'onun', normalized: 'its' },
157
- { native: 'sonuç', normalized: 'result' },
158
- { native: 'sonuc', normalized: 'result' },
159
- { native: 'olay', normalized: 'event' },
160
- { native: 'hedef', normalized: 'target' },
161
159
 
162
160
  // Time units
163
161
  { native: 'saniye', normalized: 's' },
@@ -171,61 +169,8 @@ const TURKISH_EXTRAS: KeywordEntry[] = [
171
169
  { native: 'değil', normalized: 'not' },
172
170
  { native: 'degil', normalized: 'not' },
173
171
 
174
- // Event triggers (on)
175
- { native: 'üzerinde', normalized: 'on' },
176
- { native: 'uzerinde', normalized: 'on' },
177
- { native: 'olduğunda', normalized: 'on' },
178
- { native: 'oldugunda', normalized: 'on' },
179
-
180
- // Command overrides (ensure correct mapping when profile has multiple meanings)
181
- { native: 'ekle', normalized: 'add' }, // Profile may have this as 'append'
182
- { native: 'değiştir', normalized: 'toggle' }, // Profile has this as 'swap'
183
-
184
- // Diacritic-free variants of commands
185
- { native: 'değistir', normalized: 'toggle' },
186
- { native: 'kaldir', normalized: 'remove' },
187
- { native: 'yerlestir', normalized: 'put' },
188
- { native: 'olustur', normalized: 'make' },
189
- { native: 'artir', normalized: 'increment' },
190
- { native: 'yazdir', normalized: 'log' },
191
- { native: 'goster', normalized: 'show' },
192
- { native: 'gecis', normalized: 'transition' },
193
- { native: 'atesle', normalized: 'trigger' },
194
- { native: 'gonder', normalized: 'send' },
195
- { native: 'bulaniklastir', normalized: 'blur' },
196
- { native: 'odak_kaldir', normalized: 'blur' },
197
- { native: 'yonlendir', normalized: 'go' },
198
- { native: 'cek', normalized: 'fetch' },
199
- { native: 'yerles', normalized: 'settle' },
200
- { native: 'eger', normalized: 'if' },
201
- { native: 'degilse', normalized: 'else' },
202
- { native: 'firlat', normalized: 'throw' },
203
- { native: 'cagir', normalized: 'call' },
204
- { native: 'don', normalized: 'return' },
205
- { native: 'dondur', normalized: 'return' },
206
- { native: 'eszamansiz', normalized: 'async' },
207
- { native: 'soyle', normalized: 'tell' },
208
- { native: 'varsayilan', normalized: 'default' },
209
- { native: 'baslat', normalized: 'init' },
210
- { native: 'basla', normalized: 'init' },
211
- { native: 'davranis', normalized: 'behavior' },
212
- { native: 'yukle', normalized: 'install' },
213
- { native: 'olc', normalized: 'measure' },
214
- { native: 'icine', normalized: 'into' },
215
- { native: 'once', normalized: 'before' },
216
- { native: 'icin', normalized: 'for' },
217
-
218
- // Colloquial forms
219
- { native: 'al', normalized: 'get' },
220
- { native: 'yap', normalized: 'set' },
221
-
222
- // Control flow helpers
223
- { native: 'o_zaman', normalized: 'then' },
224
- { native: 'bitir', normalized: 'end' },
225
-
226
- // Case suffix modifiers
227
- { native: '-den', normalized: 'from' },
228
- { native: '-dan', normalized: 'from' },
172
+ // Note: Command synonyms and diacritic-free variants should be in profile alternatives.
173
+ // Event triggers (üzerinde, olduğunda) should be in profile as 'on' alternatives.
229
174
  ];
230
175
 
231
176
  // =============================================================================
@@ -328,6 +273,14 @@ export class TurkishTokenizer extends BaseTokenizer {
328
273
  continue;
329
274
  }
330
275
 
276
+ // Try multi-word phrases first (e.g., "üzerine gelme", "fare üzerinde")
277
+ const phraseToken = this.tryMultiWordPhrase(input, pos);
278
+ if (phraseToken) {
279
+ tokens.push(phraseToken);
280
+ pos = phraseToken.position.end;
281
+ continue;
282
+ }
283
+
331
284
  // Try Turkish word
332
285
  if (isTurkishLetter(input[pos])) {
333
286
  const wordToken = this.extractTurkishWord(input, pos);
@@ -358,6 +311,42 @@ export class TurkishTokenizer extends BaseTokenizer {
358
311
  return 'identifier';
359
312
  }
360
313
 
314
+ /**
315
+ * Try to match multi-word phrases that function as single units.
316
+ * Multi-word phrases are included in profileKeywords and sorted longest-first,
317
+ * so they'll be matched before their constituent words.
318
+ *
319
+ * Examples: "üzerine gelme" (hover), "fare üzerinde" (mouseover)
320
+ */
321
+ private tryMultiWordPhrase(input: string, pos: number): LanguageToken | null {
322
+ // Check against multi-word entries in profileKeywords (sorted longest-first)
323
+ for (const entry of this.profileKeywords) {
324
+ // Only check multi-word phrases (contain space)
325
+ if (!entry.native.includes(' ')) continue;
326
+
327
+ const phrase = entry.native;
328
+ const candidate = input.slice(pos, pos + phrase.length).toLowerCase();
329
+ if (candidate === phrase.toLowerCase()) {
330
+ // Check word boundary
331
+ const nextPos = pos + phrase.length;
332
+ if (
333
+ nextPos >= input.length ||
334
+ isWhitespace(input[nextPos]) ||
335
+ !isTurkishLetter(input[nextPos])
336
+ ) {
337
+ return createToken(
338
+ input.slice(pos, pos + phrase.length),
339
+ 'keyword',
340
+ createPosition(pos, nextPos),
341
+ entry.normalized
342
+ );
343
+ }
344
+ }
345
+ }
346
+
347
+ return null;
348
+ }
349
+
361
350
  /**
362
351
  * Extract a Turkish word.
363
352
  * Uses morphological normalization to handle verb conjugations.
@@ -375,10 +364,10 @@ export class TurkishTokenizer extends BaseTokenizer {
375
364
 
376
365
  const lowerWord = word.toLowerCase();
377
366
 
378
- // O(1) Map lookup instead of O(n) array search
379
- const keywordEntry = this.lookupKeyword(lowerWord);
380
- if (keywordEntry) {
381
- return createToken(word, 'keyword', createPosition(startPos, pos), keywordEntry.normalized);
367
+ // Check if it's a case suffix (particle) first
368
+ // This prevents roleMarker keywords from overriding particle classification
369
+ if (CASE_SUFFIXES.has(lowerWord)) {
370
+ return createToken(word, 'particle', createPosition(startPos, pos));
382
371
  }
383
372
 
384
373
  // Check if it's a postposition
@@ -386,6 +375,12 @@ export class TurkishTokenizer extends BaseTokenizer {
386
375
  return createToken(word, 'particle', createPosition(startPos, pos));
387
376
  }
388
377
 
378
+ // O(1) Map lookup instead of O(n) array search
379
+ const keywordEntry = this.lookupKeyword(lowerWord);
380
+ if (keywordEntry) {
381
+ return createToken(word, 'keyword', createPosition(startPos, pos), keywordEntry.normalized);
382
+ }
383
+
389
384
  // Try morphological normalization for conjugated forms
390
385
  const morphToken = this.tryMorphKeywordMatch(lowerWord, startPos, pos);
391
386
  if (morphToken) return morphToken;
package/src/types.ts CHANGED
@@ -157,6 +157,15 @@ export interface SemanticMetadata {
157
157
  readonly sourceText?: string;
158
158
  readonly sourcePosition?: SourcePosition;
159
159
  readonly patternId?: string;
160
+ /**
161
+ * Confidence score for the parse (0-1).
162
+ * Higher values indicate more certain matches.
163
+ * - 1.0: Exact match with all roles captured
164
+ * - 0.8-0.99: High confidence with minor uncertainty (stem matching, optional roles)
165
+ * - 0.6-0.79: Medium confidence (morphological normalization, defaults applied)
166
+ * - <0.6: Low confidence (may need fallback to traditional parser)
167
+ */
168
+ readonly confidence?: number;
160
169
  }
161
170
 
162
171
  export interface SourcePosition {
@@ -336,6 +345,10 @@ export interface ExtractionRule {
336
345
  readonly transform?: (raw: string) => SemanticValue;
337
346
  /** Default value if not found (for optional roles) */
338
347
  readonly default?: SemanticValue;
348
+ /** Static value extraction (for event handler wrapped commands) */
349
+ readonly value?: string;
350
+ /** Extract value from a pattern role by name */
351
+ readonly fromRole?: string;
339
352
  }
340
353
 
341
354
  /**