@lokascript/semantic 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. package/dist/browser-ar.ar.global.js +2 -2
  2. package/dist/browser-core.core.global.js +2 -2
  3. package/dist/browser-de.de.global.js +2 -2
  4. package/dist/browser-east-asian.east-asian.global.js +2 -2
  5. package/dist/browser-en-tr.en-tr.global.js +2 -2
  6. package/dist/browser-en.en.global.js +2 -2
  7. package/dist/browser-es-en.es-en.global.js +2 -2
  8. package/dist/browser-es.es.global.js +2 -2
  9. package/dist/browser-fr.fr.global.js +2 -2
  10. package/dist/browser-id.id.global.js +2 -2
  11. package/dist/browser-ja.ja.global.js +2 -2
  12. package/dist/browser-ko.ko.global.js +2 -2
  13. package/dist/browser-lazy.lazy.global.js +2 -2
  14. package/dist/browser-priority.priority.global.js +2 -2
  15. package/dist/browser-pt.pt.global.js +2 -2
  16. package/dist/browser-qu.qu.global.js +2 -2
  17. package/dist/browser-sw.sw.global.js +2 -2
  18. package/dist/browser-tr.tr.global.js +2 -2
  19. package/dist/browser-western.western.global.js +2 -2
  20. package/dist/browser-zh.zh.global.js +2 -2
  21. package/dist/browser.global.js +2 -2
  22. package/dist/browser.global.js.map +1 -1
  23. package/dist/index.cjs +13042 -17462
  24. package/dist/index.cjs.map +1 -1
  25. package/dist/index.d.cts +49 -5
  26. package/dist/index.d.ts +49 -5
  27. package/dist/index.js +14044 -18464
  28. package/dist/index.js.map +1 -1
  29. package/dist/languages/ar.d.ts +1 -1
  30. package/dist/languages/ar.js +31 -44
  31. package/dist/languages/ar.js.map +1 -1
  32. package/dist/languages/de.d.ts +1 -1
  33. package/dist/languages/de.js +14 -2
  34. package/dist/languages/de.js.map +1 -1
  35. package/dist/languages/en.d.ts +1 -1
  36. package/dist/languages/en.js +558 -12
  37. package/dist/languages/en.js.map +1 -1
  38. package/dist/languages/es.d.ts +1 -1
  39. package/dist/languages/es.js +16 -0
  40. package/dist/languages/es.js.map +1 -1
  41. package/dist/languages/fr.d.ts +1 -1
  42. package/dist/languages/fr.js +14 -2
  43. package/dist/languages/fr.js.map +1 -1
  44. package/dist/languages/id.d.ts +1 -1
  45. package/dist/languages/id.js +14 -2
  46. package/dist/languages/id.js.map +1 -1
  47. package/dist/languages/ja.d.ts +1 -1
  48. package/dist/languages/ja.js +18 -3
  49. package/dist/languages/ja.js.map +1 -1
  50. package/dist/languages/ko.d.ts +8 -1
  51. package/dist/languages/ko.js +75 -43
  52. package/dist/languages/ko.js.map +1 -1
  53. package/dist/languages/pt.d.ts +1 -1
  54. package/dist/languages/pt.js +17 -0
  55. package/dist/languages/pt.js.map +1 -1
  56. package/dist/languages/qu.d.ts +12 -1
  57. package/dist/languages/qu.js +77 -2
  58. package/dist/languages/qu.js.map +1 -1
  59. package/dist/languages/sw.d.ts +1 -1
  60. package/dist/languages/sw.js.map +1 -1
  61. package/dist/languages/tr.d.ts +9 -1
  62. package/dist/languages/tr.js +96 -72
  63. package/dist/languages/tr.js.map +1 -1
  64. package/dist/languages/zh.d.ts +1 -1
  65. package/dist/languages/zh.js +16 -0
  66. package/dist/languages/zh.js.map +1 -1
  67. package/dist/{types-C4dcj53L.d.ts → types-BY3Id07j.d.ts} +20 -5
  68. package/package.json +20 -29
  69. package/src/generators/command-schemas.ts +21 -10
  70. package/src/generators/event-handler-generator.ts +50 -44
  71. package/src/generators/language-profiles.ts +6 -0
  72. package/src/generators/pattern-generator.ts +883 -1
  73. package/src/generators/profiles/arabic.ts +19 -3
  74. package/src/generators/profiles/bengali.ts +12 -1
  75. package/src/generators/profiles/chinese.ts +15 -0
  76. package/src/generators/profiles/french.ts +12 -1
  77. package/src/generators/profiles/german.ts +12 -1
  78. package/src/generators/profiles/hebrew.ts +148 -0
  79. package/src/generators/profiles/hindi.ts +12 -1
  80. package/src/generators/profiles/index.ts +2 -0
  81. package/src/generators/profiles/indonesian.ts +12 -1
  82. package/src/generators/profiles/italian.ts +16 -0
  83. package/src/generators/profiles/japanese.ts +11 -2
  84. package/src/generators/profiles/korean.ts +15 -1
  85. package/src/generators/profiles/polish.ts +12 -0
  86. package/src/generators/profiles/portuguese.ts +16 -0
  87. package/src/generators/profiles/russian.ts +11 -0
  88. package/src/generators/profiles/spanish.ts +15 -0
  89. package/src/generators/profiles/spanishMexico.ts +176 -0
  90. package/src/generators/profiles/thai.ts +11 -0
  91. package/src/generators/profiles/turkish.ts +49 -7
  92. package/src/generators/profiles/types.ts +21 -5
  93. package/src/generators/profiles/ukrainian.ts +11 -0
  94. package/src/generators/profiles/vietnamese.ts +11 -0
  95. package/src/language-building-schema.ts +111 -0
  96. package/src/languages/_all.ts +5 -1
  97. package/src/languages/es-MX.ts +32 -0
  98. package/src/languages/he.ts +15 -0
  99. package/src/parser/pattern-matcher.ts +10 -1
  100. package/src/parser/semantic-parser.ts +3 -0
  101. package/src/patterns/add/ar.ts +3 -59
  102. package/src/patterns/add/index.ts +5 -1
  103. package/src/patterns/add/ja.ts +3 -81
  104. package/src/patterns/add/ko.ts +3 -62
  105. package/src/patterns/add/qu.ts +69 -0
  106. package/src/patterns/add/tr.ts +3 -59
  107. package/src/patterns/builders.ts +1 -0
  108. package/src/patterns/decrement/tr.ts +3 -36
  109. package/src/patterns/event-handler/ar.ts +3 -139
  110. package/src/patterns/event-handler/he.ts +15 -0
  111. package/src/patterns/event-handler/index.ts +5 -1
  112. package/src/patterns/event-handler/ja.ts +3 -106
  113. package/src/patterns/event-handler/ko.ts +3 -121
  114. package/src/patterns/event-handler/ms.ts +45 -20
  115. package/src/patterns/event-handler/tr.ts +3 -158
  116. package/src/patterns/get/ar.ts +3 -37
  117. package/src/patterns/get/ja.ts +3 -41
  118. package/src/patterns/get/ko.ts +3 -41
  119. package/src/patterns/grammar-transformed/ja.ts +3 -1701
  120. package/src/patterns/grammar-transformed/ko.ts +3 -1299
  121. package/src/patterns/grammar-transformed/tr.ts +3 -1055
  122. package/src/patterns/hide/ar.ts +3 -55
  123. package/src/patterns/hide/ja.ts +3 -57
  124. package/src/patterns/hide/ko.ts +3 -57
  125. package/src/patterns/hide/tr.ts +3 -53
  126. package/src/patterns/increment/tr.ts +3 -40
  127. package/src/patterns/put/ar.ts +3 -62
  128. package/src/patterns/put/ja.ts +3 -63
  129. package/src/patterns/put/ko.ts +3 -55
  130. package/src/patterns/put/tr.ts +3 -55
  131. package/src/patterns/remove/ar.ts +3 -59
  132. package/src/patterns/remove/index.ts +5 -1
  133. package/src/patterns/remove/ja.ts +3 -62
  134. package/src/patterns/remove/ko.ts +3 -66
  135. package/src/patterns/remove/qu.ts +69 -0
  136. package/src/patterns/remove/tr.ts +3 -66
  137. package/src/patterns/set/ar.ts +3 -72
  138. package/src/patterns/set/ja.ts +3 -74
  139. package/src/patterns/set/ko.ts +3 -73
  140. package/src/patterns/set/tr.ts +3 -95
  141. package/src/patterns/show/ar.ts +3 -55
  142. package/src/patterns/show/ja.ts +3 -57
  143. package/src/patterns/show/ko.ts +3 -61
  144. package/src/patterns/show/tr.ts +3 -53
  145. package/src/patterns/take/ar.ts +3 -39
  146. package/src/patterns/toggle/ar.ts +3 -49
  147. package/src/patterns/toggle/index.ts +5 -1
  148. package/src/patterns/toggle/ja.ts +3 -144
  149. package/src/patterns/toggle/ko.ts +3 -101
  150. package/src/patterns/toggle/qu.ts +90 -0
  151. package/src/patterns/toggle/tr.ts +3 -76
  152. package/src/registry.ts +179 -15
  153. package/src/tokenizers/arabic.ts +13 -46
  154. package/src/tokenizers/bengali.ts +2 -16
  155. package/src/tokenizers/he.ts +542 -0
  156. package/src/tokenizers/index.ts +1 -0
  157. package/src/tokenizers/japanese.ts +3 -1
  158. package/src/tokenizers/korean.ts +104 -48
  159. package/src/tokenizers/ms.ts +3 -0
  160. package/src/tokenizers/quechua.ts +101 -2
  161. package/src/tokenizers/turkish.ts +64 -69
  162. package/src/types.ts +13 -0
@@ -0,0 +1,542 @@
1
+ /**
2
+ * Hebrew Tokenizer
3
+ *
4
+ * Tokenizes Hebrew hyperscript input.
5
+ * Hebrew is challenging because:
6
+ * - Right-to-left (RTL) text direction
7
+ * - One-letter prefixes (prepositions, definite article, conjunction, relativizer) that attach to words (ב, ל, מ, כ, ה, ו, ש)
8
+ * - Optional vowel points (nikkud) typically omitted in modern text
9
+ * - CSS selectors are LTR islands within RTL text
10
+ */
11
+
12
+ import type { LanguageToken, TokenKind, TokenStream } from '../types';
13
+ import {
14
+ BaseTokenizer,
15
+ TokenStreamImpl,
16
+ createToken,
17
+ createPosition,
18
+ createUnicodeRangeClassifier,
19
+ isWhitespace,
20
+ isSelectorStart,
21
+ isQuote,
22
+ isDigit,
23
+ isAsciiIdentifierChar,
24
+ isUrlStart,
25
+ type KeywordEntry,
26
+ type TimeUnitMapping,
27
+ } from './base';
28
+ import { hebrewProfile } from '../generators/profiles/hebrew';
29
+
30
+ // =============================================================================
31
+ // Hebrew Character Classification
32
+ // =============================================================================
33
+
34
/**
 * Character predicate for Hebrew script.
 *
 * Built from two inclusive code-point ranges, so it accepts base letters as
 * well as the combining marks (nikkud, cantillation) and the precomposed
 * presentation forms.
 */
const isHebrew = createUnicodeRangeClassifier([
  [0x0590, 0x05ff], // U+0590–U+05FF: Hebrew block
  [0xfb1d, 0xfb4f], // U+FB1D–U+FB4F: Hebrew presentation forms
]);
39
+
40
+ // =============================================================================
41
+ // Hebrew Prefixes and Prepositions
42
+ // =============================================================================
43
+
44
+ /**
45
+ * Hebrew one-letter prefixes (prepositions, article, conjunction, relativizer) that attach to the following word.
46
+ * These are common prefixes in Hebrew that modify meaning.
47
+ * Reserved for future morphological analysis integration.
48
+ *
49
+ * Prefixes:
50
+ * - ב (b') - in, at, with
51
+ * - ל (l') - to, for
52
+ * - מ (m') - from
53
+ * - כ (k') - like, as
54
+ * - ה (h') - definite article "the"
55
+ * - ו (v') - conjunction "and"
56
+ * - ש (sh') - relative pronoun "that"
57
+ */
58
+
59
/**
 * One-letter conjunction prefixes, keyed by the Hebrew letter and mapped to
 * the normalized English connector.
 */
const CONJUNCTIONS: Map<string, string> = new Map([
  // vav: "and"
  ['ו', 'and'],
]);
65
+
66
/**
 * One-letter prefixes that may be glued to the front of an event name,
 * mapped to the normalized marker they stand for ("on" / "when").
 */
const EVENT_MARKER_PREFIXES: Map<string, string> = new Map([
  // bet: "at/in/on" — event marker
  ['ב', 'on'],
  // kaf: "as/when" — temporal marker
  ['כ', 'when'],
]);
74
+
75
/**
 * Event names (in Hebrew) that are recognized after an event marker prefix.
 * Several events have more than one accepted spelling.
 */
const EVENT_NAMES = new Set<string>([
  // click (native word, loanword)
  'לחיצה',
  'קליק',
  // submit (two alternatives)
  'שליחה',
  'הגשה',
  // hover, hover/transition
  'ריחוף',
  'מעבר',
  // change, update/change
  'שינוי',
  'עדכון',
  // input (two alternatives)
  'קלט',
  'הזנה',
  // focus, blur
  'מיקוד',
  'טשטוש',
  // load, scroll
  'טעינה',
  'גלילה',
]);
94
+
95
/**
 * Prepositions that appear as separate words (as opposed to the one-letter
 * prefixes that attach to the following word).
 */
const PREPOSITIONS = new Set<string>([
  'על', // al — on, upon
  'את', // et — direct object marker
  'אל', // el — to, toward
  'מן', // min — from
  'עם', // im — with
  'בתוך', // betoch — inside
  'מתוך', // mitoch — from inside
  'ליד', // leyad — next to
  'אחרי', // acharey — after
  'לפני', // lifney — before
  'בין', // beyn — between
  'עד', // ad — until
  'של', // shel — of (possessive)
]);
113
+
114
+ // =============================================================================
115
+ // Hebrew Extras (keywords not in profile)
116
+ // =============================================================================
117
+
118
/**
 * Extra keywords passed to `initializeKeywordsFromProfile` alongside the
 * Hebrew profile. Each entry maps a native Hebrew spelling to its normalized
 * English keyword.
 *
 * NOTE(review): several entries contain a space ('לא מוגדר', 'לחיצת מקש', ...).
 * The word scanner in this file only looks up contiguous runs of Hebrew
 * characters, so these presumably never match during `tokenize` — confirm
 * whether the base class handles multi-word keywords elsewhere.
 */
const HEBREW_EXTRAS: KeywordEntry[] = [
  // Values/Literals
  { native: 'אמת', normalized: 'true' },
  { native: 'שקר', normalized: 'false' },
  { native: 'null', normalized: 'null' },
  { native: 'ריק', normalized: 'null' },
  { native: 'לא מוגדר', normalized: 'undefined' },

  // Positional
  { native: 'ראשון', normalized: 'first' },
  { native: 'אחרון', normalized: 'last' },
  { native: 'הבא', normalized: 'next' },
  { native: 'הקודם', normalized: 'previous' },
  { native: 'הקרוב', normalized: 'closest' },
  { native: 'הורה', normalized: 'parent' },

  // Events
  { native: 'לחיצה', normalized: 'click' },
  { native: 'קליק', normalized: 'click' },
  { native: 'קלט', normalized: 'input' },
  { native: 'שינוי', normalized: 'change' },
  { native: 'שליחה', normalized: 'submit' },
  { native: 'מיקוד', normalized: 'focus' },
  { native: 'טשטוש', normalized: 'blur' },
  { native: 'לחיצת מקש', normalized: 'keydown' },
  { native: 'שחרור מקש', normalized: 'keyup' },
  { native: 'מעבר עכבר', normalized: 'mouseover' },
  { native: 'יציאת עכבר', normalized: 'mouseout' },
  { native: 'טעינה', normalized: 'load' },
  { native: 'גלילה', normalized: 'scroll' },

  // References (gendered pronoun forms)
  { native: 'היא', normalized: 'it' }, // feminine "it"
  { native: 'הוא', normalized: 'it' }, // masculine "it"
  // NOTE(review): 'את' is also listed in PREPOSITIONS, and `tokenize` tries
  // prepositions before Hebrew words, so a standalone 'את' is emitted as a
  // particle rather than as this keyword — confirm that is intended.
  { native: 'את', normalized: 'you' }, // feminine "you"

  // Time units (singular and plural)
  { native: 'שנייה', normalized: 's' },
  { native: 'שניות', normalized: 's' },
  { native: 'מילישנייה', normalized: 'ms' },
  { native: 'מילישניות', normalized: 'ms' }, // plural, matching HEBREW_TIME_UNITS
  { native: 'דקה', normalized: 'm' },
  { native: 'דקות', normalized: 'm' },
  { native: 'שעה', normalized: 'h' },
  { native: 'שעות', normalized: 'h' },
];
166
+
167
+ // =============================================================================
168
+ // Hebrew Time Units
169
+ // =============================================================================
170
+
171
/**
 * Hebrew time-unit words recognized after a number, passed to
 * `tryNumberWithTimeUnits`.
 *
 * `length` must equal the number of characters in `pattern`; longer spellings
 * of the same unit are listed first.
 */
const HEBREW_TIME_UNITS: readonly TimeUnitMapping[] = [
  // 'מילישנייה' has 9 characters (was mistakenly declared as 8).
  { pattern: 'מילישנייה', suffix: 'ms', length: 9, caseInsensitive: false },
  { pattern: 'מילישניות', suffix: 'ms', length: 9, caseInsensitive: false },
  { pattern: 'שניות', suffix: 's', length: 5, caseInsensitive: false },
  { pattern: 'שנייה', suffix: 's', length: 5, caseInsensitive: false },
  { pattern: 'דקות', suffix: 'm', length: 4, caseInsensitive: false },
  { pattern: 'דקה', suffix: 'm', length: 3, caseInsensitive: false },
  { pattern: 'שעות', suffix: 'h', length: 4, caseInsensitive: false },
  { pattern: 'שעה', suffix: 'h', length: 3, caseInsensitive: false },
];
184
+
185
+ // =============================================================================
186
+ // Hebrew Tokenizer Implementation
187
+ // =============================================================================
188
+
189
/**
 * Tokenizer for hyperscript written in Hebrew.
 *
 * The input is scanned by string index and split into tokens for selectors,
 * strings, URLs, numbers, variable references, standalone prepositions,
 * Hebrew words and ASCII identifiers. One-letter prefixes are split off in
 * exactly two situations: an event marker (ב / כ) directly in front of a known
 * event name, and the conjunction ו directly in front of a known keyword.
 */
export class HebrewTokenizer extends BaseTokenizer {
  readonly language = 'he';
  readonly direction = 'rtl' as const;

  /**
   * Standalone prepositions ordered longest-first so that a longer
   * preposition wins over a shorter one sharing the same start.
   * Computed once instead of on every `tryPreposition` call.
   */
  private static readonly PREPOSITIONS_BY_LENGTH: readonly string[] = Array.from(
    PREPOSITIONS
  ).sort((a, b) => b.length - a.length);

  constructor() {
    super();
    this.initializeKeywordsFromProfile(hebrewProfile, HEBREW_EXTRAS);
  }

  /**
   * Split `input` into a token stream.
   *
   * At each non-whitespace position the matchers below are tried in a fixed
   * order and the first one that produces a token wins. A character that no
   * matcher accepts is skipped without emitting a token.
   *
   * @param input - Raw Hebrew hyperscript source.
   * @returns A token stream tagged with language 'he'.
   */
  tokenize(input: string): TokenStream {
    const tokens: LanguageToken[] = [];
    let pos = 0;

    while (pos < input.length) {
      const ch = input[pos];

      if (isWhitespace(ch)) {
        pos++;
        continue;
      }

      // CSS selectors are LTR islands inside RTL text; the event-modifier form
      // (.once, .debounce(), ...) shares the same start characters and is
      // tried first.
      if (isSelectorStart(ch)) {
        const modifierToken = this.tryEventModifier(input, pos);
        if (modifierToken) {
          tokens.push(modifierToken);
          pos = modifierToken.position.end;
          continue;
        }

        const selectorToken = this.trySelector(input, pos);
        if (selectorToken) {
          tokens.push(selectorToken);
          pos = selectorToken.position.end;
          continue;
        }
      }

      // String literal
      if (isQuote(ch)) {
        const stringToken = this.tryString(input, pos);
        if (stringToken) {
          tokens.push(stringToken);
          pos = stringToken.position.end;
          continue;
        }
      }

      // URL (/path, ./path, http://, ...)
      if (isUrlStart(input, pos)) {
        const urlToken = this.tryUrl(input, pos);
        if (urlToken) {
          tokens.push(urlToken);
          pos = urlToken.position.end;
          continue;
        }
      }

      // Number, optionally followed by a Hebrew time unit
      if (isDigit(ch)) {
        const numberToken = this.extractHebrewNumber(input, pos);
        if (numberToken) {
          tokens.push(numberToken);
          pos = numberToken.position.end;
          continue;
        }
      }

      // Variable reference (:varname)
      const varToken = this.tryVariableRef(input, pos);
      if (varToken) {
        tokens.push(varToken);
        pos = varToken.position.end;
        continue;
      }

      // Standalone preposition. This runs before general word extraction, so
      // a word listed in PREPOSITIONS is always emitted as a particle here.
      const prepToken = this.tryPreposition(input, pos);
      if (prepToken) {
        tokens.push(prepToken);
        pos = prepToken.position.end;
        continue;
      }

      if (isHebrew(ch)) {
        // Event marker prefix (ב, כ) glued to an event name: two tokens.
        const eventMarkerResult = this.tryEventMarkerPrefix(input, pos);
        if (eventMarkerResult) {
          tokens.push(eventMarkerResult.marker, eventMarkerResult.event);
          pos = eventMarkerResult.event.position.end;
          continue;
        }

        // Conjunction prefix (ו) glued to a keyword: emit only the
        // conjunction; the next iteration picks up the remaining word.
        const prefixResult = this.tryPrefixConjunction(input, pos);
        if (prefixResult) {
          tokens.push(prefixResult.conjunction);
          pos = prefixResult.conjunction.position.end;
          continue;
        }

        const wordToken = this.extractHebrewWord(input, pos);
        if (wordToken) {
          tokens.push(wordToken);
          pos = wordToken.position.end;
          continue;
        }
      }

      // ASCII word (mixed-language content)
      if (isAsciiIdentifierChar(ch)) {
        const asciiToken = this.extractAsciiWord(input, pos);
        if (asciiToken) {
          tokens.push(asciiToken);
          pos = asciiToken.position.end;
          continue;
        }
      }

      // Nothing matched: skip this character.
      pos++;
    }

    return new TokenStreamImpl(tokens, 'he');
  }

  /**
   * Classify an already-isolated token string.
   *
   * Checks run in priority order: standalone preposition, keyword, selector
   * (leading `#`, `.` or `[`), quoted or numeric literal; anything else is an
   * identifier.
   *
   * @param token - Token text to classify.
   * @returns The token kind.
   */
  classifyToken(token: string): TokenKind {
    if (PREPOSITIONS.has(token)) return 'particle';
    if (this.isKeyword(token)) return 'keyword';
    if (token.startsWith('#') || token.startsWith('.') || token.startsWith('[')) return 'selector';
    if (token.startsWith('"') || token.startsWith("'")) return 'literal';
    if (/^\d/.test(token)) return 'literal';

    return 'identifier';
  }

  /**
   * Try to match a standalone Hebrew preposition starting at `pos`.
   *
   * A preposition only matches when it is a whole word, i.e. it is followed
   * by the end of input or by a non-Hebrew character.
   *
   * @returns A 'particle' token carrying `metadata.prepositionValue`, or null.
   */
  private tryPreposition(input: string, pos: number): LanguageToken | null {
    for (const prep of HebrewTokenizer.PREPOSITIONS_BY_LENGTH) {
      if (!input.startsWith(prep, pos)) continue;

      const nextPos = pos + prep.length;
      // Whitespace is non-Hebrew, so a single non-Hebrew test covers both the
      // "followed by space" and "followed by other script" cases.
      if (nextPos >= input.length || !isHebrew(input[nextPos])) {
        return {
          ...createToken(prep, 'particle', createPosition(pos, nextPos)),
          metadata: {
            prepositionValue: prep,
          },
        };
      }
    }
    return null;
  }

  /**
   * Try to split off a conjunction prefix (ו - "and") attached to a keyword.
   *
   * Examples:
   * - והחלף → ו + החלף (and + toggle)
   * - ולחיצה → ו + לחיצה (and + click)
   *
   * The split is refused when the whole word is itself a keyword or a
   * preposition, when fewer than two Hebrew characters follow the prefix, or
   * when the remainder is not a keyword.
   *
   * @returns The conjunction token only (the remainder is tokenized by the
   *   caller on its next iteration), or null when no split applies.
   */
  private tryPrefixConjunction(input: string, pos: number): { conjunction: LanguageToken } | null {
    // Measure the whole Hebrew run first: the full word must be checked
    // BEFORE any splitting so real keywords starting with ו stay intact.
    let wordEnd = pos;
    while (wordEnd < input.length && isHebrew(input[wordEnd])) {
      wordEnd++;
    }
    const fullWord = input.slice(pos, wordEnd);

    if (this.lookupKeyword(fullWord) || PREPOSITIONS.has(fullWord)) {
      return null; // Handled as a single word elsewhere
    }

    const char = input[pos];
    const conjEntry = CONJUNCTIONS.get(char);
    if (!conjEntry) return null;

    // The run measured above already tells us how many Hebrew characters
    // follow the prefix. Fewer than two (including none at all) is rejected
    // to avoid false positives.
    const nextPos = pos + 1;
    if (wordEnd - nextPos < 2) {
      return null;
    }

    const afterPrefix = input.slice(nextPos, wordEnd);
    if (!this.lookupKeyword(afterPrefix)) {
      return null;
    }

    return {
      conjunction: createToken(char, 'conjunction', createPosition(pos, nextPos), conjEntry),
    };
  }

  /**
   * Try to split an event marker prefix (ב, כ) from an attached event name.
   *
   * Examples:
   * - בלחיצה → ב + לחיצה (on + click)
   * - כשינוי → כ + שינוי (when + change)
   *
   * @returns Both the marker token (normalized to 'on' or 'when') and the
   *   event-name token, or null when the word after the prefix is not in
   *   EVENT_NAMES.
   */
  private tryEventMarkerPrefix(
    input: string,
    pos: number
  ): { marker: LanguageToken; event: LanguageToken } | null {
    const char = input[pos];
    const markerNormalized = EVENT_MARKER_PREFIXES.get(char);
    if (!markerNormalized) return null;

    // The prefix must be directly followed by a Hebrew character.
    const nextPos = pos + 1;
    if (nextPos >= input.length || !isHebrew(input[nextPos])) {
      return null;
    }

    // Take the rest of the Hebrew run as the candidate event name.
    let wordEnd = nextPos;
    while (wordEnd < input.length && isHebrew(input[wordEnd])) {
      wordEnd++;
    }
    const afterPrefix = input.slice(nextPos, wordEnd);

    if (!EVENT_NAMES.has(afterPrefix)) {
      return null;
    }

    const marker = createToken(char, 'keyword', createPosition(pos, nextPos), markerNormalized);

    // Attach the normalized form when the event name is a registered keyword;
    // otherwise emit the keyword token without one.
    const eventKeywordEntry = this.lookupKeyword(afterPrefix);
    const event = eventKeywordEntry
      ? createToken(
          afterPrefix,
          'keyword',
          createPosition(nextPos, wordEnd),
          eventKeywordEntry.normalized
        )
      : createToken(afterPrefix, 'keyword', createPosition(nextPos, wordEnd));

    return { marker, event };
  }

  /**
   * Extract the run of Hebrew characters starting at `startPos` as one token.
   *
   * Resolution order: exact keyword, standalone preposition (with
   * `metadata.prepositionValue`), morphological keyword match, and finally a
   * plain identifier.
   *
   * @returns The token, or null when `startPos` is not on a Hebrew character.
   */
  private extractHebrewWord(input: string, startPos: number): LanguageToken | null {
    let end = startPos;
    while (end < input.length && isHebrew(input[end])) {
      end++;
    }

    const word = input.slice(startPos, end);
    if (!word) return null;

    const keywordEntry = this.lookupKeyword(word);
    if (keywordEntry) {
      return createToken(word, 'keyword', createPosition(startPos, end), keywordEntry.normalized);
    }

    if (PREPOSITIONS.has(word)) {
      return {
        ...createToken(word, 'particle', createPosition(startPos, end)),
        metadata: {
          prepositionValue: word,
        },
      };
    }

    // Morphological normalization for prefixed variants of keywords.
    const morphToken = this.tryMorphKeywordMatch(word, startPos, end);
    if (morphToken) return morphToken;

    return createToken(word, 'identifier', createPosition(startPos, end));
  }

  /**
   * Extract the run of ASCII identifier characters starting at `startPos`.
   *
   * @returns An 'identifier' token, or null when the run is empty.
   */
  private extractAsciiWord(input: string, startPos: number): LanguageToken | null {
    let end = startPos;
    while (end < input.length && isAsciiIdentifierChar(input[end])) {
      end++;
    }

    const word = input.slice(startPos, end);
    if (!word) return null;

    return createToken(word, 'identifier', createPosition(startPos, end));
  }

  /**
   * Extract a number at `startPos`, including an optional Hebrew time-unit
   * suffix from HEBREW_TIME_UNITS. A leading sign is not accepted, and
   * `skipWhitespace` is enabled for the base-class helper.
   */
  private extractHebrewNumber(input: string, startPos: number): LanguageToken | null {
    return this.tryNumberWithTimeUnits(input, startPos, HEBREW_TIME_UNITS, {
      allowSign: false,
      skipWhitespace: true,
    });
  }
}

/**
 * Shared module-level tokenizer instance.
 */
export const hebrewTokenizer = new HebrewTokenizer();
@@ -85,6 +85,7 @@ export { vietnameseTokenizer } from './vietnamese';
85
85
  export { polishTokenizer } from './polish';
86
86
  export { russianTokenizer } from './russian';
87
87
  export { ukrainianTokenizer } from './ukrainian';
88
+ export { hebrewTokenizer } from './he';
88
89
  export { hindiTokenizer } from './hindi';
89
90
  export { bengaliTokenizer } from './bengali';
90
91
  export { thaiTokenizer } from './thai';
@@ -99,7 +99,7 @@ const PARTICLE_ROLES = new Map<string, ParticleMetadata>([
99
99
  ['まで', { role: 'destination', confidence: 0.75, description: 'until/boundary marker' }],
100
100
  ['へ', { role: 'destination', confidence: 0.9, description: 'direction marker' }],
101
101
  ['と', { role: 'style', confidence: 0.7, description: 'with/and marker' }],
102
- ['の', { role: 'patient', confidence: 0.6, description: 'possessive marker' }],
102
+ ['の', { role: 'destination', confidence: 0.75, description: 'possessive/destination marker' }],
103
103
  ['が', { role: 'agent', confidence: 0.85, description: 'subject marker' }],
104
104
  ['は', { role: 'agent', confidence: 0.75, description: 'topic marker' }],
105
105
  ['も', { role: 'patient', confidence: 0.65, description: 'also/too marker' }],
@@ -165,6 +165,8 @@ const JAPANESE_EXTRAS: KeywordEntry[] = [
165
165
  { native: 'もし', normalized: 'if' }, // Starts with particle も, needs explicit entry
166
166
  { native: 'ならば', normalized: 'then' },
167
167
  { native: 'なら', normalized: 'then' },
168
+ { native: 'それから', normalized: 'then' }, // Chain connector
169
+ { native: 'そして', normalized: 'and' }, // Alternative connector
168
170
 
169
171
  // Time units
170
172
  { native: '秒', normalized: 's' },