@lokascript/semantic 1.0.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/browser-ar.ar.global.js +2 -2
- package/dist/browser-core.core.global.js +2 -2
- package/dist/browser-de.de.global.js +2 -2
- package/dist/browser-east-asian.east-asian.global.js +2 -2
- package/dist/browser-en-tr.en-tr.global.js +2 -2
- package/dist/browser-en.en.global.js +2 -2
- package/dist/browser-es-en.es-en.global.js +2 -2
- package/dist/browser-es.es.global.js +2 -2
- package/dist/browser-fr.fr.global.js +2 -2
- package/dist/browser-id.id.global.js +2 -2
- package/dist/browser-ja.ja.global.js +2 -2
- package/dist/browser-ko.ko.global.js +2 -2
- package/dist/browser-lazy.lazy.global.js +2 -2
- package/dist/browser-priority.priority.global.js +2 -2
- package/dist/browser-pt.pt.global.js +2 -2
- package/dist/browser-qu.qu.global.js +2 -2
- package/dist/browser-sw.sw.global.js +2 -2
- package/dist/browser-tr.tr.global.js +2 -2
- package/dist/browser-western.western.global.js +2 -2
- package/dist/browser-zh.zh.global.js +2 -2
- package/dist/browser.global.js +2 -2
- package/dist/browser.global.js.map +1 -1
- package/dist/index.cjs +13042 -17462
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +49 -5
- package/dist/index.d.ts +49 -5
- package/dist/index.js +14044 -18464
- package/dist/index.js.map +1 -1
- package/dist/languages/ar.d.ts +1 -1
- package/dist/languages/ar.js +31 -44
- package/dist/languages/ar.js.map +1 -1
- package/dist/languages/de.d.ts +1 -1
- package/dist/languages/de.js +14 -2
- package/dist/languages/de.js.map +1 -1
- package/dist/languages/en.d.ts +1 -1
- package/dist/languages/en.js +558 -12
- package/dist/languages/en.js.map +1 -1
- package/dist/languages/es.d.ts +1 -1
- package/dist/languages/es.js +16 -0
- package/dist/languages/es.js.map +1 -1
- package/dist/languages/fr.d.ts +1 -1
- package/dist/languages/fr.js +14 -2
- package/dist/languages/fr.js.map +1 -1
- package/dist/languages/id.d.ts +1 -1
- package/dist/languages/id.js +14 -2
- package/dist/languages/id.js.map +1 -1
- package/dist/languages/ja.d.ts +1 -1
- package/dist/languages/ja.js +18 -3
- package/dist/languages/ja.js.map +1 -1
- package/dist/languages/ko.d.ts +8 -1
- package/dist/languages/ko.js +75 -43
- package/dist/languages/ko.js.map +1 -1
- package/dist/languages/pt.d.ts +1 -1
- package/dist/languages/pt.js +17 -0
- package/dist/languages/pt.js.map +1 -1
- package/dist/languages/qu.d.ts +12 -1
- package/dist/languages/qu.js +77 -2
- package/dist/languages/qu.js.map +1 -1
- package/dist/languages/sw.d.ts +1 -1
- package/dist/languages/sw.js.map +1 -1
- package/dist/languages/tr.d.ts +9 -1
- package/dist/languages/tr.js +96 -72
- package/dist/languages/tr.js.map +1 -1
- package/dist/languages/zh.d.ts +1 -1
- package/dist/languages/zh.js +16 -0
- package/dist/languages/zh.js.map +1 -1
- package/dist/{types-C4dcj53L.d.ts → types-BY3Id07j.d.ts} +20 -5
- package/package.json +20 -29
- package/src/generators/command-schemas.ts +21 -10
- package/src/generators/event-handler-generator.ts +50 -44
- package/src/generators/language-profiles.ts +6 -0
- package/src/generators/pattern-generator.ts +883 -1
- package/src/generators/profiles/arabic.ts +19 -3
- package/src/generators/profiles/bengali.ts +12 -1
- package/src/generators/profiles/chinese.ts +15 -0
- package/src/generators/profiles/french.ts +12 -1
- package/src/generators/profiles/german.ts +12 -1
- package/src/generators/profiles/hebrew.ts +148 -0
- package/src/generators/profiles/hindi.ts +12 -1
- package/src/generators/profiles/index.ts +2 -0
- package/src/generators/profiles/indonesian.ts +12 -1
- package/src/generators/profiles/italian.ts +16 -0
- package/src/generators/profiles/japanese.ts +11 -2
- package/src/generators/profiles/korean.ts +15 -1
- package/src/generators/profiles/polish.ts +12 -0
- package/src/generators/profiles/portuguese.ts +16 -0
- package/src/generators/profiles/russian.ts +11 -0
- package/src/generators/profiles/spanish.ts +15 -0
- package/src/generators/profiles/spanishMexico.ts +176 -0
- package/src/generators/profiles/thai.ts +11 -0
- package/src/generators/profiles/turkish.ts +49 -7
- package/src/generators/profiles/types.ts +21 -5
- package/src/generators/profiles/ukrainian.ts +11 -0
- package/src/generators/profiles/vietnamese.ts +11 -0
- package/src/language-building-schema.ts +111 -0
- package/src/languages/_all.ts +5 -1
- package/src/languages/es-MX.ts +32 -0
- package/src/languages/he.ts +15 -0
- package/src/parser/pattern-matcher.ts +10 -1
- package/src/parser/semantic-parser.ts +3 -0
- package/src/patterns/add/ar.ts +3 -59
- package/src/patterns/add/index.ts +5 -1
- package/src/patterns/add/ja.ts +3 -81
- package/src/patterns/add/ko.ts +3 -62
- package/src/patterns/add/qu.ts +69 -0
- package/src/patterns/add/tr.ts +3 -59
- package/src/patterns/builders.ts +1 -0
- package/src/patterns/decrement/tr.ts +3 -36
- package/src/patterns/event-handler/ar.ts +3 -139
- package/src/patterns/event-handler/he.ts +15 -0
- package/src/patterns/event-handler/index.ts +5 -1
- package/src/patterns/event-handler/ja.ts +3 -106
- package/src/patterns/event-handler/ko.ts +3 -121
- package/src/patterns/event-handler/ms.ts +45 -20
- package/src/patterns/event-handler/tr.ts +3 -158
- package/src/patterns/get/ar.ts +3 -37
- package/src/patterns/get/ja.ts +3 -41
- package/src/patterns/get/ko.ts +3 -41
- package/src/patterns/grammar-transformed/ja.ts +3 -1701
- package/src/patterns/grammar-transformed/ko.ts +3 -1299
- package/src/patterns/grammar-transformed/tr.ts +3 -1055
- package/src/patterns/hide/ar.ts +3 -55
- package/src/patterns/hide/ja.ts +3 -57
- package/src/patterns/hide/ko.ts +3 -57
- package/src/patterns/hide/tr.ts +3 -53
- package/src/patterns/increment/tr.ts +3 -40
- package/src/patterns/put/ar.ts +3 -62
- package/src/patterns/put/ja.ts +3 -63
- package/src/patterns/put/ko.ts +3 -55
- package/src/patterns/put/tr.ts +3 -55
- package/src/patterns/remove/ar.ts +3 -59
- package/src/patterns/remove/index.ts +5 -1
- package/src/patterns/remove/ja.ts +3 -62
- package/src/patterns/remove/ko.ts +3 -66
- package/src/patterns/remove/qu.ts +69 -0
- package/src/patterns/remove/tr.ts +3 -66
- package/src/patterns/set/ar.ts +3 -72
- package/src/patterns/set/ja.ts +3 -74
- package/src/patterns/set/ko.ts +3 -73
- package/src/patterns/set/tr.ts +3 -95
- package/src/patterns/show/ar.ts +3 -55
- package/src/patterns/show/ja.ts +3 -57
- package/src/patterns/show/ko.ts +3 -61
- package/src/patterns/show/tr.ts +3 -53
- package/src/patterns/take/ar.ts +3 -39
- package/src/patterns/toggle/ar.ts +3 -49
- package/src/patterns/toggle/index.ts +5 -1
- package/src/patterns/toggle/ja.ts +3 -144
- package/src/patterns/toggle/ko.ts +3 -101
- package/src/patterns/toggle/qu.ts +90 -0
- package/src/patterns/toggle/tr.ts +3 -76
- package/src/registry.ts +179 -15
- package/src/tokenizers/arabic.ts +13 -46
- package/src/tokenizers/bengali.ts +2 -16
- package/src/tokenizers/he.ts +542 -0
- package/src/tokenizers/index.ts +1 -0
- package/src/tokenizers/japanese.ts +3 -1
- package/src/tokenizers/korean.ts +104 -48
- package/src/tokenizers/ms.ts +3 -0
- package/src/tokenizers/quechua.ts +101 -2
- package/src/tokenizers/turkish.ts +64 -69
- package/src/types.ts +13 -0
package/src/tokenizers/korean.ts
CHANGED
|
@@ -104,6 +104,17 @@ const SINGLE_CHAR_PARTICLES = new Set([
|
|
|
104
104
|
*/
|
|
105
105
|
const MULTI_CHAR_PARTICLES = ['에서', '으로', '부터', '까지', '처럼', '보다'];
|
|
106
106
|
|
|
107
|
+
/**
 * Temporal event suffixes that should be split from compound words.
 * These are verb endings that indicate "when" something happens.
 * Sorted by length (longest first) to ensure greedy matching.
 *
 * Examples:
 * - 클릭할때 → 클릭 + 할때 (click + when)
 * - 입력할때 → 입력 + 할때 (input + when)
 *
 * NOTE(review): '할 때' contains a space, so it can only match if the word
 * extractor ever produces a token that spans whitespace — confirm.
 */
const TEMPORAL_EVENT_SUFFIXES = ['하니까', '할 때', '할때', '하면'];
|
|
117
|
+
|
|
107
118
|
/**
|
|
108
119
|
* Particle metadata mapping particles to semantic roles, confidence scores,
|
|
109
120
|
* and vowel harmony variants. Korean particles change based on whether the
|
|
@@ -231,13 +242,19 @@ const PARTICLE_ROLES = new Map<string, ParticleMetadata>([
|
|
|
231
242
|
// =============================================================================
|
|
232
243
|
|
|
233
244
|
/**
|
|
234
|
-
* Extra keywords not covered by the profile
|
|
245
|
+
* Extra keywords not covered by the profile.
|
|
246
|
+
*
|
|
247
|
+
* SIMPLIFIED: Following the Tagalog/Hindi model of minimal EXTRAS.
|
|
248
|
+
* Removed attached particle+verb compounds (를토글, 을토글, etc.) that create
|
|
249
|
+
* parsing ambiguity. Japanese tokenizer explicitly avoids these - we follow suit.
|
|
250
|
+
*
|
|
251
|
+
* Only includes:
|
|
235
252
|
* - Literals (true, false, null, undefined)
|
|
236
253
|
* - Positional words
|
|
237
254
|
* - Event names
|
|
238
|
-
* - Attached particle forms (native idioms)
|
|
239
|
-
* - Conditional event forms
|
|
240
255
|
* - Time units
|
|
256
|
+
* - References not in profile
|
|
257
|
+
* - Logical operators
|
|
241
258
|
*/
|
|
242
259
|
const KOREAN_EXTRAS: KeywordEntry[] = [
|
|
243
260
|
// Values/Literals
|
|
@@ -267,64 +284,26 @@ const KOREAN_EXTRAS: KeywordEntry[] = [
|
|
|
267
284
|
{ native: '마우스오버', normalized: 'mouseover' },
|
|
268
285
|
{ native: '마우스아웃', normalized: 'mouseout' },
|
|
269
286
|
|
|
270
|
-
// References (additional forms)
|
|
287
|
+
// References (additional forms not in profile)
|
|
271
288
|
{ native: '내', normalized: 'my' },
|
|
272
289
|
{ native: '그것의', normalized: 'its' },
|
|
273
290
|
|
|
274
|
-
// Conditional event forms (native idioms)
|
|
275
|
-
{ native: '하면', normalized: 'on' },
|
|
276
|
-
{ native: '으면', normalized: 'on' },
|
|
277
|
-
{ native: '면', normalized: 'on' },
|
|
278
|
-
{ native: '할때', normalized: 'on' },
|
|
279
|
-
{ native: '할 때', normalized: 'on' },
|
|
280
|
-
{ native: '을때', normalized: 'on' },
|
|
281
|
-
{ native: '을 때', normalized: 'on' },
|
|
282
|
-
{ native: '하니까', normalized: 'on' },
|
|
283
|
-
{ native: '니까', normalized: 'on' },
|
|
284
|
-
|
|
285
|
-
// Control flow helpers
|
|
286
|
-
{ native: '그러면', normalized: 'then' },
|
|
287
|
-
{ native: '그렇지않으면', normalized: 'otherwise' },
|
|
288
|
-
{ native: '중단', normalized: 'break' },
|
|
289
|
-
|
|
290
291
|
// Logical
|
|
291
292
|
{ native: '그리고', normalized: 'and' },
|
|
292
293
|
{ native: '또는', normalized: 'or' },
|
|
293
294
|
{ native: '아니', normalized: 'not' },
|
|
294
295
|
{ native: '이다', normalized: 'is' },
|
|
295
296
|
|
|
296
|
-
// Command overrides (ensure correct mapping when profile has multiple meanings)
|
|
297
|
-
{ native: '추가', normalized: 'add' }, // Profile may have this as 'append'
|
|
298
|
-
|
|
299
|
-
// Attached particle forms (native idioms - particle + verb without space)
|
|
300
|
-
// Object particle 를 (after vowel)
|
|
301
|
-
{ native: '를토글', normalized: 'toggle' },
|
|
302
|
-
{ native: '를전환', normalized: 'toggle' },
|
|
303
|
-
{ native: '를추가', normalized: 'add' },
|
|
304
|
-
{ native: '를제거', normalized: 'remove' },
|
|
305
|
-
{ native: '를삭제', normalized: 'remove' },
|
|
306
|
-
{ native: '를증가', normalized: 'increment' },
|
|
307
|
-
{ native: '를감소', normalized: 'decrement' },
|
|
308
|
-
{ native: '를표시', normalized: 'show' },
|
|
309
|
-
{ native: '를숨기다', normalized: 'hide' },
|
|
310
|
-
{ native: '를설정', normalized: 'set' },
|
|
311
|
-
// Object particle 을 (after consonant)
|
|
312
|
-
{ native: '을토글', normalized: 'toggle' },
|
|
313
|
-
{ native: '을전환', normalized: 'toggle' },
|
|
314
|
-
{ native: '을추가', normalized: 'add' },
|
|
315
|
-
{ native: '을제거', normalized: 'remove' },
|
|
316
|
-
{ native: '을삭제', normalized: 'remove' },
|
|
317
|
-
{ native: '을증가', normalized: 'increment' },
|
|
318
|
-
{ native: '을감소', normalized: 'decrement' },
|
|
319
|
-
{ native: '을표시', normalized: 'show' },
|
|
320
|
-
{ native: '을숨기다', normalized: 'hide' },
|
|
321
|
-
{ native: '을설정', normalized: 'set' },
|
|
322
|
-
|
|
323
297
|
// Time units
|
|
324
298
|
{ native: '초', normalized: 's' },
|
|
325
299
|
{ native: '밀리초', normalized: 'ms' },
|
|
326
300
|
{ native: '분', normalized: 'm' },
|
|
327
301
|
{ native: '시간', normalized: 'h' },
|
|
302
|
+
|
|
303
|
+
// Note: Attached particle+verb forms (를토글, 을토글, etc.) are intentionally
|
|
304
|
+
// NOT included because they cause ambiguous parsing. The separate particle + verb
|
|
305
|
+
// pattern (를 + 토글) is preferred for consistent semantic analysis.
|
|
306
|
+
// This follows the same approach as the Japanese tokenizer.
|
|
328
307
|
];
|
|
329
308
|
|
|
330
309
|
// =============================================================================
|
|
@@ -431,7 +410,14 @@ export class KoreanTokenizer extends BaseTokenizer {
|
|
|
431
410
|
if (isKorean(input[pos])) {
|
|
432
411
|
const wordToken = this.extractKoreanWord(input, pos);
|
|
433
412
|
if (wordToken) {
|
|
434
|
-
|
|
413
|
+
// Check if the word ends with a temporal event suffix (e.g., 클릭할때 → 클릭 + 할때)
|
|
414
|
+
const splitResult = this.trySplitTemporalSuffix(wordToken);
|
|
415
|
+
if (splitResult) {
|
|
416
|
+
tokens.push(splitResult.stemToken);
|
|
417
|
+
tokens.push(splitResult.suffixToken);
|
|
418
|
+
} else {
|
|
419
|
+
tokens.push(wordToken);
|
|
420
|
+
}
|
|
435
421
|
pos = wordToken.position.end;
|
|
436
422
|
continue;
|
|
437
423
|
}
|
|
@@ -528,6 +514,19 @@ export class KoreanTokenizer extends BaseTokenizer {
|
|
|
528
514
|
}
|
|
529
515
|
if (!allKorean) continue;
|
|
530
516
|
|
|
517
|
+
// If this candidate starting at the beginning is a particle, return null
|
|
518
|
+
// to let the main tokenize loop handle it as a particle
|
|
519
|
+
// This prevents roleMarker keywords from overriding particle classification
|
|
520
|
+
if (PARTICLES.has(candidate) && startPos === startPos) {
|
|
521
|
+
// Check if this particle-like candidate is at a word boundary (standalone)
|
|
522
|
+
const afterCandidate = startPos + len;
|
|
523
|
+
const nextChar = afterCandidate < input.length ? input[afterCandidate] : '';
|
|
524
|
+
if (nextChar === '' || isWhitespace(nextChar) || !isKorean(nextChar)) {
|
|
525
|
+
return null; // Let main loop handle as particle
|
|
526
|
+
}
|
|
527
|
+
// Otherwise it's part of a larger word, continue checking
|
|
528
|
+
}
|
|
529
|
+
|
|
531
530
|
// O(1) Map lookup instead of O(n) array search
|
|
532
531
|
const keywordEntry = this.lookupKeyword(candidate);
|
|
533
532
|
if (keywordEntry) {
|
|
@@ -552,6 +551,12 @@ export class KoreanTokenizer extends BaseTokenizer {
|
|
|
552
551
|
const char = input[pos];
|
|
553
552
|
const nextChar = pos + 1 < input.length ? input[pos + 1] : '';
|
|
554
553
|
|
|
554
|
+
// If we're at a particle with no content yet, return null to let main loop handle it
|
|
555
|
+
// This ensures particles like 를 in #count를증가 are separated properly
|
|
556
|
+
if (word.length === 0 && SINGLE_CHAR_PARTICLES.has(char)) {
|
|
557
|
+
return null;
|
|
558
|
+
}
|
|
559
|
+
|
|
555
560
|
// Stop at single-char particles only if:
|
|
556
561
|
// 1. We have content already
|
|
557
562
|
// 2. The particle is at a word boundary (followed by whitespace, end, non-Korean, or another particle)
|
|
@@ -594,6 +599,12 @@ export class KoreanTokenizer extends BaseTokenizer {
|
|
|
594
599
|
|
|
595
600
|
if (!word) return null;
|
|
596
601
|
|
|
602
|
+
// If the word is a particle, return null to let the main tokenize loop handle it
|
|
603
|
+
// This prevents roleMarker keywords from overriding particle classification
|
|
604
|
+
if (PARTICLES.has(word)) {
|
|
605
|
+
return null;
|
|
606
|
+
}
|
|
607
|
+
|
|
597
608
|
// O(1) Map lookup instead of O(n) array search
|
|
598
609
|
const keywordEntry = this.lookupKeyword(word);
|
|
599
610
|
if (keywordEntry) {
|
|
@@ -634,6 +645,51 @@ export class KoreanTokenizer extends BaseTokenizer {
|
|
|
634
645
|
skipWhitespace: false,
|
|
635
646
|
});
|
|
636
647
|
}
|
|
648
|
+
|
|
649
|
+
/**
 * Peel a temporal event suffix off the end of a word token.
 * Handles compact forms such as 클릭할때 → 클릭 + 할때.
 *
 * A split only happens when the text remaining in front of the suffix is
 * non-empty and resolves through `lookupKeyword`; otherwise the next suffix
 * in TEMPORAL_EVENT_SUFFIXES is tried.
 *
 * @param wordToken - Token whose value may end in a temporal suffix
 * @returns The stem and suffix as two keyword tokens (the suffix is
 *          normalized to 'when'), or null when nothing can be split
 */
private trySplitTemporalSuffix(
  wordToken: LanguageToken
): { stemToken: LanguageToken; suffixToken: LanguageToken } | null {
  const text = wordToken.value;

  for (const suffix of TEMPORAL_EVENT_SUFFIXES) {
    // The suffix must terminate the word and leave a non-empty stem.
    const leavesStem = text.length > suffix.length;
    if (!text.endsWith(suffix) || !leavesStem) continue;

    const stem = text.slice(0, -suffix.length);

    // Unknown stems are never split.
    const stemKeyword = this.lookupKeyword(stem.toLowerCase());
    if (!stemKeyword) continue;

    // Offset in the source where the stem ends and the suffix begins.
    const boundary = wordToken.position.start + stem.length;

    return {
      stemToken: createToken(
        stem,
        'keyword',
        createPosition(wordToken.position.start, boundary),
        stemKeyword.normalized
      ),
      // The suffix acts as the event marker.
      suffixToken: createToken(
        suffix,
        'keyword',
        createPosition(boundary, wordToken.position.end),
        'when'
      ),
    };
  }

  return null;
}
|
|
637
693
|
}
|
|
638
694
|
|
|
639
695
|
/**
|
package/src/tokenizers/ms.ts
CHANGED
|
@@ -54,10 +54,13 @@ const MALAY_EXTRAS: KeywordEntry[] = [
|
|
|
54
54
|
// Events
|
|
55
55
|
{ native: 'klik', normalized: 'click' },
|
|
56
56
|
{ native: 'berubah', normalized: 'change' },
|
|
57
|
+
{ native: 'ubah', normalized: 'change' }, // Alternative for change
|
|
57
58
|
{ native: 'hantar', normalized: 'submit' },
|
|
58
59
|
{ native: 'input', normalized: 'input' },
|
|
60
|
+
{ native: 'masuk', normalized: 'input' }, // Alternative for input (means "enter")
|
|
59
61
|
{ native: 'muat', normalized: 'load' },
|
|
60
62
|
{ native: 'tatal', normalized: 'scroll' },
|
|
63
|
+
{ native: 'hover', normalized: 'hover' }, // English loanword commonly used
|
|
61
64
|
];
|
|
62
65
|
|
|
63
66
|
// =============================================================================
|
|
@@ -82,6 +82,7 @@ const QUECHUA_EXTRAS: KeywordEntry[] = [
|
|
|
82
82
|
|
|
83
83
|
// Events
|
|
84
84
|
{ native: 'llikllay', normalized: 'click' },
|
|
85
|
+
{ native: 'ñitiy', normalized: 'click' },
|
|
85
86
|
{ native: 'click', normalized: 'click' },
|
|
86
87
|
{ native: 'yaykuy', normalized: 'input' },
|
|
87
88
|
{ native: 'llave uray', normalized: 'keydown' },
|
|
@@ -172,8 +173,18 @@ export class QuechuaTokenizer extends BaseTokenizer {
|
|
|
172
173
|
|
|
173
174
|
const selectorToken = this.trySelector(input, pos);
|
|
174
175
|
if (selectorToken) {
|
|
175
|
-
|
|
176
|
-
|
|
176
|
+
// Check if selector has a Quechua suffix attached
|
|
177
|
+
const selectorWithSuffix = this.splitSelectorSuffix(selectorToken);
|
|
178
|
+
if (selectorWithSuffix.length === 2) {
|
|
179
|
+
// Selector + suffix: push both tokens
|
|
180
|
+
tokens.push(selectorWithSuffix[0]);
|
|
181
|
+
tokens.push(selectorWithSuffix[1]);
|
|
182
|
+
pos = selectorWithSuffix[1].position.end;
|
|
183
|
+
} else {
|
|
184
|
+
// Just selector: push as-is
|
|
185
|
+
tokens.push(selectorToken);
|
|
186
|
+
pos = selectorToken.position.end;
|
|
187
|
+
}
|
|
177
188
|
continue;
|
|
178
189
|
}
|
|
179
190
|
}
|
|
@@ -226,6 +237,14 @@ export class QuechuaTokenizer extends BaseTokenizer {
|
|
|
226
237
|
}
|
|
227
238
|
|
|
228
239
|
if (isQuechuaLetter(input[pos])) {
|
|
240
|
+
// Try multi-word keywords first (e.g., "mana qhawachiy" = blur)
|
|
241
|
+
const multiWordToken = this.tryMultiWordKeyword(input, pos);
|
|
242
|
+
if (multiWordToken) {
|
|
243
|
+
tokens.push(multiWordToken);
|
|
244
|
+
pos = multiWordToken.position.end;
|
|
245
|
+
continue;
|
|
246
|
+
}
|
|
247
|
+
|
|
229
248
|
const wordToken = this.extractWord(input, pos);
|
|
230
249
|
if (wordToken) {
|
|
231
250
|
tokens.push(wordToken);
|
|
@@ -272,6 +291,86 @@ export class QuechuaTokenizer extends BaseTokenizer {
|
|
|
272
291
|
return null;
|
|
273
292
|
}
|
|
274
293
|
|
|
294
|
+
/**
 * Split a selector token if it has a Quechua suffix attached.
 * E.g. ".openta" -> selector ".open" followed by a particle holding the
 * trailing suffix text (presumably "ta" is an entry of SUFFIXES — confirm).
 *
 * The suffix comparison is case-insensitive; the emitted tokens keep the
 * original casing of the selector text.
 *
 * A split is rejected when it would leave nothing after the selector's
 * leading character (e.g. ".ta" must stay the class selector ".ta" rather
 * than become "." + "ta").
 *
 * @param selectorToken - Selector token produced by the tokenizer
 * @returns One token (no suffix found) or two tokens (selector + particle)
 */
private splitSelectorSuffix(selectorToken: LanguageToken): LanguageToken[] {
  const text = selectorToken.value;
  // Loop-invariant: lower-case once instead of once per suffix.
  const lowerText = text.toLowerCase();

  for (const suffix of SUFFIXES) {
    if (!lowerText.endsWith(suffix)) continue;

    const baseEnd = text.length - suffix.length;

    // Keep at least one character beyond the leading sigil, otherwise the
    // "base" would be an empty or sigil-only selector.
    if (baseEnd <= 1) continue;

    const splitAt = selectorToken.position.start + baseEnd;

    // Base selector token
    const baseToken = createToken(
      text.slice(0, baseEnd),
      'selector',
      createPosition(selectorToken.position.start, splitAt)
    );

    // Suffix particle token
    const suffixToken = createToken(
      text.slice(baseEnd),
      'particle',
      createPosition(splitAt, selectorToken.position.end)
    );

    return [baseToken, suffixToken];
  }

  // No usable suffix found, return the original token
  return [selectorToken];
}
|
|
330
|
+
|
|
331
|
+
/**
 * Try to match a multi-word keyword that should be treated as a single unit.
 * E.g., "mana qhawachiy" (not focus = blur)
 *
 * Matching is case-insensitive. A match is only accepted when the phrase is
 * followed by end of input, whitespace, or a non-Quechua-letter character.
 * The returned token keeps the original casing of the input.
 *
 * @param input - Full text being tokenized
 * @param pos - Offset in `input` at which the phrase must start
 * @returns A keyword token spanning the phrase, or null if none matches
 */
private tryMultiWordKeyword(input: string, pos: number): LanguageToken | null {
  // No entry is a prefix of another, so the order here does not affect which
  // phrase is matched.
  const multiWordKeywords: Array<{ pattern: string; normalized: string }> = [
    { pattern: 'mana qhawachiy', normalized: 'blur' },
    { pattern: 'mana qhaway', normalized: 'blur' },
    { pattern: 'mana riqsisqa', normalized: 'undefined' },
    { pattern: 'mana waqtalla', normalized: 'async' },
    { pattern: 'ñawpaq kaq', normalized: 'previous' },
    { pattern: 'aswan qayllaqa', normalized: 'closest' },
    { pattern: 'llave uray', normalized: 'keydown' },
    { pattern: 'llave hawa', normalized: 'keyup' },
    { pattern: 'mausiri yayku', normalized: 'mouseover' },
    { pattern: 'mausiri lluqsi', normalized: 'mouseout' },
    { pattern: 'waranqa sikundu', normalized: 'ms' },
  ];

  for (const { pattern, normalized } of multiWordKeywords) {
    const endPos = pos + pattern.length;

    // Slice first, then lower-case: this avoids lower-casing the whole input
    // on every call and keeps offsets aligned with the original string even
    // when lower-casing changes a character's length.
    const raw = input.slice(pos, endPos);
    if (raw.toLowerCase() !== pattern) continue;

    // Check that it's followed by whitespace, a non-letter, or end of input
    if (
      endPos >= input.length ||
      isWhitespace(input[endPos]) ||
      !isQuechuaLetter(input[endPos])
    ) {
      return createToken(raw, 'keyword', createPosition(pos, endPos), normalized);
    }
  }

  return null;
}
|
|
373
|
+
|
|
275
374
|
private extractWord(input: string, startPos: number): LanguageToken | null {
|
|
276
375
|
let pos = startPos;
|
|
277
376
|
let word = '';
|
|
@@ -96,13 +96,17 @@ const CASE_SUFFIXES = new Set([
|
|
|
96
96
|
// =============================================================================
|
|
97
97
|
|
|
98
98
|
/**
|
|
99
|
-
* Extra keywords not covered by the profile
|
|
99
|
+
* Extra keywords not covered by the profile.
|
|
100
|
+
*
|
|
101
|
+
* SIMPLIFIED: Following the Tagalog/Hindi model of minimal EXTRAS.
|
|
102
|
+
* Command synonyms and diacritic-free variants should be in profile alternatives,
|
|
103
|
+
* not duplicated here. Only includes:
|
|
100
104
|
* - Literals (true, false, null, undefined)
|
|
101
105
|
* - Positional words
|
|
102
106
|
* - Event names
|
|
103
107
|
* - Time units
|
|
104
|
-
* -
|
|
105
|
-
* -
|
|
108
|
+
* - References not in profile
|
|
109
|
+
* - Logical operators
|
|
106
110
|
*/
|
|
107
111
|
const TURKISH_EXTRAS: KeywordEntry[] = [
|
|
108
112
|
// Values/Literals
|
|
@@ -149,15 +153,9 @@ const TURKISH_EXTRAS: KeywordEntry[] = [
|
|
|
149
153
|
{ native: 'tuş_bırak', normalized: 'keyup' },
|
|
150
154
|
{ native: 'tus_birak', normalized: 'keyup' },
|
|
151
155
|
|
|
152
|
-
// References
|
|
153
|
-
{ native: 'ben', normalized: 'me' },
|
|
156
|
+
// References (possessive forms not in profile)
|
|
154
157
|
{ native: 'benim', normalized: 'my' },
|
|
155
|
-
{ native: 'o', normalized: 'it' },
|
|
156
158
|
{ native: 'onun', normalized: 'its' },
|
|
157
|
-
{ native: 'sonuç', normalized: 'result' },
|
|
158
|
-
{ native: 'sonuc', normalized: 'result' },
|
|
159
|
-
{ native: 'olay', normalized: 'event' },
|
|
160
|
-
{ native: 'hedef', normalized: 'target' },
|
|
161
159
|
|
|
162
160
|
// Time units
|
|
163
161
|
{ native: 'saniye', normalized: 's' },
|
|
@@ -171,61 +169,8 @@ const TURKISH_EXTRAS: KeywordEntry[] = [
|
|
|
171
169
|
{ native: 'değil', normalized: 'not' },
|
|
172
170
|
{ native: 'degil', normalized: 'not' },
|
|
173
171
|
|
|
174
|
-
//
|
|
175
|
-
|
|
176
|
-
{ native: 'uzerinde', normalized: 'on' },
|
|
177
|
-
{ native: 'olduğunda', normalized: 'on' },
|
|
178
|
-
{ native: 'oldugunda', normalized: 'on' },
|
|
179
|
-
|
|
180
|
-
// Command overrides (ensure correct mapping when profile has multiple meanings)
|
|
181
|
-
{ native: 'ekle', normalized: 'add' }, // Profile may have this as 'append'
|
|
182
|
-
{ native: 'değiştir', normalized: 'toggle' }, // Profile has this as 'swap'
|
|
183
|
-
|
|
184
|
-
// Diacritic-free variants of commands
|
|
185
|
-
{ native: 'değistir', normalized: 'toggle' },
|
|
186
|
-
{ native: 'kaldir', normalized: 'remove' },
|
|
187
|
-
{ native: 'yerlestir', normalized: 'put' },
|
|
188
|
-
{ native: 'olustur', normalized: 'make' },
|
|
189
|
-
{ native: 'artir', normalized: 'increment' },
|
|
190
|
-
{ native: 'yazdir', normalized: 'log' },
|
|
191
|
-
{ native: 'goster', normalized: 'show' },
|
|
192
|
-
{ native: 'gecis', normalized: 'transition' },
|
|
193
|
-
{ native: 'atesle', normalized: 'trigger' },
|
|
194
|
-
{ native: 'gonder', normalized: 'send' },
|
|
195
|
-
{ native: 'bulaniklastir', normalized: 'blur' },
|
|
196
|
-
{ native: 'odak_kaldir', normalized: 'blur' },
|
|
197
|
-
{ native: 'yonlendir', normalized: 'go' },
|
|
198
|
-
{ native: 'cek', normalized: 'fetch' },
|
|
199
|
-
{ native: 'yerles', normalized: 'settle' },
|
|
200
|
-
{ native: 'eger', normalized: 'if' },
|
|
201
|
-
{ native: 'degilse', normalized: 'else' },
|
|
202
|
-
{ native: 'firlat', normalized: 'throw' },
|
|
203
|
-
{ native: 'cagir', normalized: 'call' },
|
|
204
|
-
{ native: 'don', normalized: 'return' },
|
|
205
|
-
{ native: 'dondur', normalized: 'return' },
|
|
206
|
-
{ native: 'eszamansiz', normalized: 'async' },
|
|
207
|
-
{ native: 'soyle', normalized: 'tell' },
|
|
208
|
-
{ native: 'varsayilan', normalized: 'default' },
|
|
209
|
-
{ native: 'baslat', normalized: 'init' },
|
|
210
|
-
{ native: 'basla', normalized: 'init' },
|
|
211
|
-
{ native: 'davranis', normalized: 'behavior' },
|
|
212
|
-
{ native: 'yukle', normalized: 'install' },
|
|
213
|
-
{ native: 'olc', normalized: 'measure' },
|
|
214
|
-
{ native: 'icine', normalized: 'into' },
|
|
215
|
-
{ native: 'once', normalized: 'before' },
|
|
216
|
-
{ native: 'icin', normalized: 'for' },
|
|
217
|
-
|
|
218
|
-
// Colloquial forms
|
|
219
|
-
{ native: 'al', normalized: 'get' },
|
|
220
|
-
{ native: 'yap', normalized: 'set' },
|
|
221
|
-
|
|
222
|
-
// Control flow helpers
|
|
223
|
-
{ native: 'o_zaman', normalized: 'then' },
|
|
224
|
-
{ native: 'bitir', normalized: 'end' },
|
|
225
|
-
|
|
226
|
-
// Case suffix modifiers
|
|
227
|
-
{ native: '-den', normalized: 'from' },
|
|
228
|
-
{ native: '-dan', normalized: 'from' },
|
|
172
|
+
// Note: Command synonyms and diacritic-free variants should be in profile alternatives.
|
|
173
|
+
// Event triggers (üzerinde, olduğunda) should be in profile as 'on' alternatives.
|
|
229
174
|
];
|
|
230
175
|
|
|
231
176
|
// =============================================================================
|
|
@@ -328,6 +273,14 @@ export class TurkishTokenizer extends BaseTokenizer {
|
|
|
328
273
|
continue;
|
|
329
274
|
}
|
|
330
275
|
|
|
276
|
+
// Try multi-word phrases first (e.g., "üzerine gelme", "fare üzerinde")
|
|
277
|
+
const phraseToken = this.tryMultiWordPhrase(input, pos);
|
|
278
|
+
if (phraseToken) {
|
|
279
|
+
tokens.push(phraseToken);
|
|
280
|
+
pos = phraseToken.position.end;
|
|
281
|
+
continue;
|
|
282
|
+
}
|
|
283
|
+
|
|
331
284
|
// Try Turkish word
|
|
332
285
|
if (isTurkishLetter(input[pos])) {
|
|
333
286
|
const wordToken = this.extractTurkishWord(input, pos);
|
|
@@ -358,6 +311,42 @@ export class TurkishTokenizer extends BaseTokenizer {
|
|
|
358
311
|
return 'identifier';
|
|
359
312
|
}
|
|
360
313
|
|
|
314
|
+
/**
 * Match a multi-word phrase that functions as a single keyword.
 *
 * Candidates are the entries of `this.profileKeywords` whose native form
 * contains a space; they are tried in the order that list provides
 * (presumably longest-first, so phrases win over their constituent words —
 * verify where profileKeywords is built).
 *
 * Examples: "üzerine gelme" (hover), "fare üzerinde" (mouseover)
 *
 * @param input - Full text being tokenized
 * @param pos - Offset in `input` at which the phrase must start
 * @returns A keyword token covering the phrase (original casing preserved),
 *          or null when no phrase matches at a word boundary
 */
private tryMultiWordPhrase(input: string, pos: number): LanguageToken | null {
  for (const entry of this.profileKeywords) {
    const phrase = entry.native;

    // Single-word entries are handled by the regular word extractor.
    if (!phrase.includes(' ')) continue;

    const nextPos = pos + phrase.length;
    const matched = input.slice(pos, nextPos);

    // Case-insensitive comparison against the profile phrase.
    if (matched.toLowerCase() !== phrase.toLowerCase()) continue;

    // The phrase must end at a word boundary.
    const atBoundary =
      nextPos >= input.length ||
      isWhitespace(input[nextPos]) ||
      !isTurkishLetter(input[nextPos]);

    if (atBoundary) {
      return createToken(matched, 'keyword', createPosition(pos, nextPos), entry.normalized);
    }
  }

  return null;
}
|
|
349
|
+
|
|
361
350
|
/**
|
|
362
351
|
* Extract a Turkish word.
|
|
363
352
|
* Uses morphological normalization to handle verb conjugations.
|
|
@@ -375,10 +364,10 @@ export class TurkishTokenizer extends BaseTokenizer {
|
|
|
375
364
|
|
|
376
365
|
const lowerWord = word.toLowerCase();
|
|
377
366
|
|
|
378
|
-
//
|
|
379
|
-
|
|
380
|
-
if (
|
|
381
|
-
return createToken(word, '
|
|
367
|
+
// Check if it's a case suffix (particle) first
|
|
368
|
+
// This prevents roleMarker keywords from overriding particle classification
|
|
369
|
+
if (CASE_SUFFIXES.has(lowerWord)) {
|
|
370
|
+
return createToken(word, 'particle', createPosition(startPos, pos));
|
|
382
371
|
}
|
|
383
372
|
|
|
384
373
|
// Check if it's a postposition
|
|
@@ -386,6 +375,12 @@ export class TurkishTokenizer extends BaseTokenizer {
|
|
|
386
375
|
return createToken(word, 'particle', createPosition(startPos, pos));
|
|
387
376
|
}
|
|
388
377
|
|
|
378
|
+
// O(1) Map lookup instead of O(n) array search
|
|
379
|
+
const keywordEntry = this.lookupKeyword(lowerWord);
|
|
380
|
+
if (keywordEntry) {
|
|
381
|
+
return createToken(word, 'keyword', createPosition(startPos, pos), keywordEntry.normalized);
|
|
382
|
+
}
|
|
383
|
+
|
|
389
384
|
// Try morphological normalization for conjugated forms
|
|
390
385
|
const morphToken = this.tryMorphKeywordMatch(lowerWord, startPos, pos);
|
|
391
386
|
if (morphToken) return morphToken;
|
package/src/types.ts
CHANGED
|
@@ -157,6 +157,15 @@ export interface SemanticMetadata {
|
|
|
157
157
|
readonly sourceText?: string;
|
|
158
158
|
readonly sourcePosition?: SourcePosition;
|
|
159
159
|
readonly patternId?: string;
|
|
160
|
+
/**
|
|
161
|
+
* Confidence score for the parse (0-1).
|
|
162
|
+
* Higher values indicate more certain matches.
|
|
163
|
+
* - 1.0: Exact match with all roles captured
|
|
164
|
+
* - 0.8-0.99: High confidence with minor uncertainty (stem matching, optional roles)
|
|
165
|
+
* - 0.6-0.8: Medium confidence (morphological normalization, defaults applied)
|
|
166
|
+
* - <0.6: Low confidence (may need fallback to traditional parser)
|
|
167
|
+
*/
|
|
168
|
+
readonly confidence?: number;
|
|
160
169
|
}
|
|
161
170
|
|
|
162
171
|
export interface SourcePosition {
|
|
@@ -336,6 +345,10 @@ export interface ExtractionRule {
|
|
|
336
345
|
readonly transform?: (raw: string) => SemanticValue;
|
|
337
346
|
/** Default value if not found (for optional roles) */
|
|
338
347
|
readonly default?: SemanticValue;
|
|
348
|
+
/** Static value extraction (for event handler wrapped commands) */
|
|
349
|
+
readonly value?: string;
|
|
350
|
+
/** Extract value from a pattern role by name */
|
|
351
|
+
readonly fromRole?: string;
|
|
339
352
|
}
|
|
340
353
|
|
|
341
354
|
/**
|