@lokascript/semantic 1.0.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/browser-ar.ar.global.js +2 -2
- package/dist/browser-core.core.global.js +2 -2
- package/dist/browser-de.de.global.js +2 -2
- package/dist/browser-east-asian.east-asian.global.js +2 -2
- package/dist/browser-en-tr.en-tr.global.js +2 -2
- package/dist/browser-en.en.global.js +2 -2
- package/dist/browser-es-en.es-en.global.js +2 -2
- package/dist/browser-es.es.global.js +2 -2
- package/dist/browser-fr.fr.global.js +2 -2
- package/dist/browser-id.id.global.js +2 -2
- package/dist/browser-ja.ja.global.js +2 -2
- package/dist/browser-ko.ko.global.js +2 -2
- package/dist/browser-lazy.lazy.global.js +2 -2
- package/dist/browser-priority.priority.global.js +2 -2
- package/dist/browser-pt.pt.global.js +2 -2
- package/dist/browser-qu.qu.global.js +2 -2
- package/dist/browser-sw.sw.global.js +2 -2
- package/dist/browser-tr.tr.global.js +2 -2
- package/dist/browser-western.western.global.js +2 -2
- package/dist/browser-zh.zh.global.js +2 -2
- package/dist/browser.global.js +2 -2
- package/dist/browser.global.js.map +1 -1
- package/dist/index.cjs +13042 -17462
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +49 -5
- package/dist/index.d.ts +49 -5
- package/dist/index.js +14044 -18464
- package/dist/index.js.map +1 -1
- package/dist/languages/ar.d.ts +1 -1
- package/dist/languages/ar.js +31 -44
- package/dist/languages/ar.js.map +1 -1
- package/dist/languages/de.d.ts +1 -1
- package/dist/languages/de.js +14 -2
- package/dist/languages/de.js.map +1 -1
- package/dist/languages/en.d.ts +1 -1
- package/dist/languages/en.js +558 -12
- package/dist/languages/en.js.map +1 -1
- package/dist/languages/es.d.ts +1 -1
- package/dist/languages/es.js +16 -0
- package/dist/languages/es.js.map +1 -1
- package/dist/languages/fr.d.ts +1 -1
- package/dist/languages/fr.js +14 -2
- package/dist/languages/fr.js.map +1 -1
- package/dist/languages/id.d.ts +1 -1
- package/dist/languages/id.js +14 -2
- package/dist/languages/id.js.map +1 -1
- package/dist/languages/ja.d.ts +1 -1
- package/dist/languages/ja.js +18 -3
- package/dist/languages/ja.js.map +1 -1
- package/dist/languages/ko.d.ts +8 -1
- package/dist/languages/ko.js +75 -43
- package/dist/languages/ko.js.map +1 -1
- package/dist/languages/pt.d.ts +1 -1
- package/dist/languages/pt.js +17 -0
- package/dist/languages/pt.js.map +1 -1
- package/dist/languages/qu.d.ts +12 -1
- package/dist/languages/qu.js +77 -2
- package/dist/languages/qu.js.map +1 -1
- package/dist/languages/sw.d.ts +1 -1
- package/dist/languages/sw.js.map +1 -1
- package/dist/languages/tr.d.ts +9 -1
- package/dist/languages/tr.js +96 -72
- package/dist/languages/tr.js.map +1 -1
- package/dist/languages/zh.d.ts +1 -1
- package/dist/languages/zh.js +16 -0
- package/dist/languages/zh.js.map +1 -1
- package/dist/{types-C4dcj53L.d.ts → types-BY3Id07j.d.ts} +20 -5
- package/package.json +20 -29
- package/src/generators/command-schemas.ts +21 -10
- package/src/generators/event-handler-generator.ts +50 -44
- package/src/generators/language-profiles.ts +6 -0
- package/src/generators/pattern-generator.ts +883 -1
- package/src/generators/profiles/arabic.ts +19 -3
- package/src/generators/profiles/bengali.ts +12 -1
- package/src/generators/profiles/chinese.ts +15 -0
- package/src/generators/profiles/french.ts +12 -1
- package/src/generators/profiles/german.ts +12 -1
- package/src/generators/profiles/hebrew.ts +148 -0
- package/src/generators/profiles/hindi.ts +12 -1
- package/src/generators/profiles/index.ts +2 -0
- package/src/generators/profiles/indonesian.ts +12 -1
- package/src/generators/profiles/italian.ts +16 -0
- package/src/generators/profiles/japanese.ts +11 -2
- package/src/generators/profiles/korean.ts +15 -1
- package/src/generators/profiles/polish.ts +12 -0
- package/src/generators/profiles/portuguese.ts +16 -0
- package/src/generators/profiles/russian.ts +11 -0
- package/src/generators/profiles/spanish.ts +15 -0
- package/src/generators/profiles/spanishMexico.ts +176 -0
- package/src/generators/profiles/thai.ts +11 -0
- package/src/generators/profiles/turkish.ts +49 -7
- package/src/generators/profiles/types.ts +21 -5
- package/src/generators/profiles/ukrainian.ts +11 -0
- package/src/generators/profiles/vietnamese.ts +11 -0
- package/src/language-building-schema.ts +111 -0
- package/src/languages/_all.ts +5 -1
- package/src/languages/es-MX.ts +32 -0
- package/src/languages/he.ts +15 -0
- package/src/parser/pattern-matcher.ts +10 -1
- package/src/parser/semantic-parser.ts +3 -0
- package/src/patterns/add/ar.ts +3 -59
- package/src/patterns/add/index.ts +5 -1
- package/src/patterns/add/ja.ts +3 -81
- package/src/patterns/add/ko.ts +3 -62
- package/src/patterns/add/qu.ts +69 -0
- package/src/patterns/add/tr.ts +3 -59
- package/src/patterns/builders.ts +1 -0
- package/src/patterns/decrement/tr.ts +3 -36
- package/src/patterns/event-handler/ar.ts +3 -139
- package/src/patterns/event-handler/he.ts +15 -0
- package/src/patterns/event-handler/index.ts +5 -1
- package/src/patterns/event-handler/ja.ts +3 -106
- package/src/patterns/event-handler/ko.ts +3 -121
- package/src/patterns/event-handler/ms.ts +45 -20
- package/src/patterns/event-handler/tr.ts +3 -158
- package/src/patterns/get/ar.ts +3 -37
- package/src/patterns/get/ja.ts +3 -41
- package/src/patterns/get/ko.ts +3 -41
- package/src/patterns/grammar-transformed/ja.ts +3 -1701
- package/src/patterns/grammar-transformed/ko.ts +3 -1299
- package/src/patterns/grammar-transformed/tr.ts +3 -1055
- package/src/patterns/hide/ar.ts +3 -55
- package/src/patterns/hide/ja.ts +3 -57
- package/src/patterns/hide/ko.ts +3 -57
- package/src/patterns/hide/tr.ts +3 -53
- package/src/patterns/increment/tr.ts +3 -40
- package/src/patterns/put/ar.ts +3 -62
- package/src/patterns/put/ja.ts +3 -63
- package/src/patterns/put/ko.ts +3 -55
- package/src/patterns/put/tr.ts +3 -55
- package/src/patterns/remove/ar.ts +3 -59
- package/src/patterns/remove/index.ts +5 -1
- package/src/patterns/remove/ja.ts +3 -62
- package/src/patterns/remove/ko.ts +3 -66
- package/src/patterns/remove/qu.ts +69 -0
- package/src/patterns/remove/tr.ts +3 -66
- package/src/patterns/set/ar.ts +3 -72
- package/src/patterns/set/ja.ts +3 -74
- package/src/patterns/set/ko.ts +3 -73
- package/src/patterns/set/tr.ts +3 -95
- package/src/patterns/show/ar.ts +3 -55
- package/src/patterns/show/ja.ts +3 -57
- package/src/patterns/show/ko.ts +3 -61
- package/src/patterns/show/tr.ts +3 -53
- package/src/patterns/take/ar.ts +3 -39
- package/src/patterns/toggle/ar.ts +3 -49
- package/src/patterns/toggle/index.ts +5 -1
- package/src/patterns/toggle/ja.ts +3 -144
- package/src/patterns/toggle/ko.ts +3 -101
- package/src/patterns/toggle/qu.ts +90 -0
- package/src/patterns/toggle/tr.ts +3 -76
- package/src/registry.ts +179 -15
- package/src/tokenizers/arabic.ts +13 -46
- package/src/tokenizers/bengali.ts +2 -16
- package/src/tokenizers/he.ts +542 -0
- package/src/tokenizers/index.ts +1 -0
- package/src/tokenizers/japanese.ts +3 -1
- package/src/tokenizers/korean.ts +104 -48
- package/src/tokenizers/ms.ts +3 -0
- package/src/tokenizers/quechua.ts +101 -2
- package/src/tokenizers/turkish.ts +64 -69
- package/src/types.ts +13 -0
|
@@ -0,0 +1,542 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Hebrew Tokenizer
|
|
3
|
+
*
|
|
4
|
+
* Tokenizes Hebrew hyperscript input.
|
|
5
|
+
* Hebrew is challenging because:
|
|
6
|
+
* - Right-to-left (RTL) text direction
|
|
7
|
+
* - Prefix prepositions that attach to words (ב, ל, מ, כ, ה, ו, ש)
|
|
8
|
+
* - Optional vowel points (nikkud) typically omitted in modern text
|
|
9
|
+
* - CSS selectors are LTR islands within RTL text
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
import type { LanguageToken, TokenKind, TokenStream } from '../types';
|
|
13
|
+
import {
|
|
14
|
+
BaseTokenizer,
|
|
15
|
+
TokenStreamImpl,
|
|
16
|
+
createToken,
|
|
17
|
+
createPosition,
|
|
18
|
+
createUnicodeRangeClassifier,
|
|
19
|
+
isWhitespace,
|
|
20
|
+
isSelectorStart,
|
|
21
|
+
isQuote,
|
|
22
|
+
isDigit,
|
|
23
|
+
isAsciiIdentifierChar,
|
|
24
|
+
isUrlStart,
|
|
25
|
+
type KeywordEntry,
|
|
26
|
+
type TimeUnitMapping,
|
|
27
|
+
} from './base';
|
|
28
|
+
import { hebrewProfile } from '../generators/profiles/hebrew';
|
|
29
|
+
|
|
30
|
+
// =============================================================================
|
|
31
|
+
// Hebrew Character Classification
|
|
32
|
+
// =============================================================================
|
|
33
|
+
|
|
34
|
+
/**
 * Inclusive code-point ranges treated as Hebrew script:
 * U+0590–U+05FF (Hebrew) and U+FB1D–U+FB4F (Hebrew presentation forms).
 */
const HEBREW_CODE_POINT_RANGES: [number, number][] = [
  [0x0590, 0x05ff],
  [0xfb1d, 0xfb4f],
];

/** Classifier built from the Hebrew code-point ranges above. */
const isHebrew = createUnicodeRangeClassifier(HEBREW_CODE_POINT_RANGES);
|
|
39
|
+
|
|
40
|
+
// =============================================================================
|
|
41
|
+
// Hebrew Prefixes and Prepositions
|
|
42
|
+
// =============================================================================
|
|
43
|
+
|
|
44
|
+
/**
|
|
45
|
+
* Hebrew prefix prepositions that attach to the following word.
|
|
46
|
+
* These are common prefixes in Hebrew that modify meaning.
|
|
47
|
+
* Documentation only: no constant is defined for this list yet; it is reserved for future morphological analysis integration.
|
|
48
|
+
*
|
|
49
|
+
* Prefixes:
|
|
50
|
+
* - ב (b') - in, at, with
|
|
51
|
+
* - ל (l') - to, for
|
|
52
|
+
* - מ (m') - from
|
|
53
|
+
* - כ (k') - like, as
|
|
54
|
+
* - ה (h') - definite article "the"
|
|
55
|
+
* - ו (v') - conjunction "and"
|
|
56
|
+
* - ש (sh') - relative pronoun "that"
|
|
57
|
+
*/
|
|
58
|
+
|
|
59
|
+
/**
 * Single-letter Hebrew conjunction prefixes, keyed by the Hebrew letter and
 * mapped to their normalized English form.
 */
const CONJUNCTIONS: Map<string, string> = new Map([
  // vav: the conjunction "and"
  ['ו', 'and'],
]);
|
|
65
|
+
|
|
66
|
+
/**
 * Single-letter prefixes that attach to an event name to mean "on/at/when"
 * that event, mapped to the normalized marker they stand for.
 */
const EVENT_MARKER_PREFIXES: Map<string, string> = new Map([
  // bet: "at/in/on", used as the event marker
  ['ב', 'on'],
  // kaf: "as/when", temporal
  ['כ', 'when'],
]);
|
|
74
|
+
|
|
75
|
+
/**
 * Hebrew event names that are recognized directly after an event marker
 * prefix. Several events have more than one accepted spelling.
 */
const EVENT_NAMES = new Set<string>([
  // click
  'לחיצה',
  'קליק', // loanword
  // submit
  'שליחה',
  'הגשה', // alternative
  // hover
  'ריחוף',
  'מעבר', // hover/transition
  // change
  'שינוי',
  'עדכון', // update/change
  // input
  'קלט',
  'הזנה', // alternative
  // focus / blur
  'מיקוד',
  'טשטוש',
  // load / scroll
  'טעינה',
  'גלילה',
]);
|
|
94
|
+
|
|
95
|
+
/**
 * Hebrew prepositions that appear as standalone words (as opposed to the
 * single-letter prefixes that attach to the following word).
 */
const PREPOSITIONS = new Set<string>([
  'על', // al: on, upon
  'את', // et: direct object marker
  'אל', // el: to, toward
  'מן', // min: from
  'עם', // im: with
  'בתוך', // betoch: inside
  'מתוך', // mitoch: from inside
  'ליד', // leyad: next to
  'אחרי', // acharey: after
  'לפני', // lifney: before
  'בין', // beyn: between
  'עד', // ad: until
  'של', // shel: of (possessive)
]);
|
|
113
|
+
|
|
114
|
+
// =============================================================================
|
|
115
|
+
// Hebrew Extras (keywords not in profile)
|
|
116
|
+
// =============================================================================
|
|
117
|
+
|
|
118
|
+
/**
 * (native, normalized) pairs for keywords that the Hebrew profile does not
 * supply. Kept as a compact table; expanded into keyword entries below.
 */
const HEBREW_EXTRA_PAIRS: ReadonlyArray<readonly [string, string]> = [
  // Values / literals
  ['אמת', 'true'],
  ['שקר', 'false'],
  ['null', 'null'],
  ['ריק', 'null'],
  ['לא מוגדר', 'undefined'],

  // Positional
  ['ראשון', 'first'],
  ['אחרון', 'last'],
  ['הבא', 'next'],
  ['הקודם', 'previous'],
  ['הקרוב', 'closest'],
  ['הורה', 'parent'],

  // Events
  ['לחיצה', 'click'],
  ['קליק', 'click'],
  ['קלט', 'input'],
  ['שינוי', 'change'],
  ['שליחה', 'submit'],
  ['מיקוד', 'focus'],
  ['טשטוש', 'blur'],
  ['לחיצת מקש', 'keydown'],
  ['שחרור מקש', 'keyup'],
  ['מעבר עכבר', 'mouseover'],
  ['יציאת עכבר', 'mouseout'],
  ['טעינה', 'load'],
  ['גלילה', 'scroll'],

  // References (pronoun forms)
  ['היא', 'it'], // feminine "it"
  ['הוא', 'it'], // masculine "it"
  ['את', 'you'], // feminine "you"; the same spelling is also listed as a standalone preposition

  // Time units
  ['שנייה', 's'],
  ['שניות', 's'],
  ['מילישנייה', 'ms'],
  ['דקה', 'm'],
  ['דקות', 'm'],
  ['שעה', 'h'],
  ['שעות', 'h'],
];

/**
 * Extra keywords not covered by the profile, registered alongside it when the
 * tokenizer is constructed.
 */
const HEBREW_EXTRAS: KeywordEntry[] = HEBREW_EXTRA_PAIRS.map(([native, normalized]) => ({
  native,
  normalized,
}));
|
|
166
|
+
|
|
167
|
+
// =============================================================================
|
|
168
|
+
// Hebrew Time Units
|
|
169
|
+
// =============================================================================
|
|
170
|
+
|
|
171
|
+
/**
 * (pattern, suffix) pairs for Hebrew time-unit words that may follow a number.
 * Order is preserved in the resulting table (millisecond forms first).
 */
const HEBREW_TIME_UNIT_PAIRS: ReadonlyArray<readonly [pattern: string, suffix: string]> = [
  ['מילישנייה', 'ms'],
  ['מילישניות', 'ms'],
  ['שניות', 's'],
  ['שנייה', 's'],
  ['דקות', 'm'],
  ['דקה', 'm'],
  ['שעות', 'h'],
  ['שעה', 'h'],
];

/**
 * Hebrew time unit patterns for number parsing.
 *
 * `length` is derived from the pattern instead of being typed by hand, so the
 * two can never disagree. The hand-written table declared `length: 8` for the
 * nine-character singular millisecond form, which did not match its pattern.
 * All patterns are matched case-sensitively (Hebrew has no letter case).
 */
const HEBREW_TIME_UNITS: readonly TimeUnitMapping[] = HEBREW_TIME_UNIT_PAIRS.map(
  ([pattern, suffix]) => ({
    pattern,
    suffix,
    length: pattern.length,
    caseInsensitive: false,
  })
);
|
|
184
|
+
|
|
185
|
+
// =============================================================================
|
|
186
|
+
// Hebrew Tokenizer Implementation
|
|
187
|
+
// =============================================================================
|
|
188
|
+
|
|
189
|
+
/**
 * Tokenizer for Hebrew hyperscript input.
 *
 * Scans the input left to right by string index and emits tokens for
 * selectors, strings, URLs, numbers (with Hebrew time units), variable
 * references, standalone prepositions, Hebrew words and ASCII words.
 * Attached single-letter prefixes (event markers ב/כ and the conjunction ו)
 * are split off into their own tokens when what follows is recognized.
 */
export class HebrewTokenizer extends BaseTokenizer {
  readonly language = 'he';
  readonly direction = 'rtl' as const;

  /**
   * Standalone prepositions ordered longest-first so that a longer
   * preposition is tried before any shorter one. Computed once instead of
   * being rebuilt and re-sorted on every `tryPreposition` call.
   */
  private static readonly prepositionsLongestFirst: readonly string[] = Array.from(
    PREPOSITIONS
  ).sort((a, b) => b.length - a.length);

  constructor() {
    super();
    this.initializeKeywordsFromProfile(hebrewProfile, HEBREW_EXTRAS);
  }

  /**
   * Split `input` into a token stream.
   *
   * At each non-whitespace position the recognizers are tried in a fixed
   * order; the first one that yields a token wins and scanning resumes at
   * that token's end. A character that no recognizer accepts is skipped.
   *
   * @param input - Raw Hebrew hyperscript source.
   * @returns A token stream tagged with the language code 'he'.
   */
  tokenize(input: string): TokenStream {
    const tokens: LanguageToken[] = [];
    let pos = 0;

    // Record a token and resume scanning immediately after it.
    const accept = (token: LanguageToken): void => {
      tokens.push(token);
      pos = token.position.end;
    };

    while (pos < input.length) {
      // Skip whitespace
      if (isWhitespace(input[pos])) {
        pos++;
        continue;
      }

      // CSS selector (an LTR island inside RTL text). Event modifiers
      // (.once, .debounce(), ...) share the same start characters, so they
      // are tried first.
      if (isSelectorStart(input[pos])) {
        const modifierToken = this.tryEventModifier(input, pos);
        if (modifierToken) {
          accept(modifierToken);
          continue;
        }

        const selectorToken = this.trySelector(input, pos);
        if (selectorToken) {
          accept(selectorToken);
          continue;
        }
      }

      // String literal
      if (isQuote(input[pos])) {
        const stringToken = this.tryString(input, pos);
        if (stringToken) {
          accept(stringToken);
          continue;
        }
      }

      // URL (/path, ./path, http://, etc.)
      if (isUrlStart(input, pos)) {
        const urlToken = this.tryUrl(input, pos);
        if (urlToken) {
          accept(urlToken);
          continue;
        }
      }

      // Number, optionally followed by a Hebrew time unit
      if (isDigit(input[pos])) {
        const numberToken = this.extractHebrewNumber(input, pos);
        if (numberToken) {
          accept(numberToken);
          continue;
        }
      }

      // Variable reference (:varname)
      const varToken = this.tryVariableRef(input, pos);
      if (varToken) {
        accept(varToken);
        continue;
      }

      // Standalone Hebrew preposition (longest candidate first)
      const prepToken = this.tryPreposition(input, pos);
      if (prepToken) {
        accept(prepToken);
        continue;
      }

      // Hebrew word, with attached-prefix detection
      if (isHebrew(input[pos])) {
        // Event marker prefix (ב, כ) attached to an event name: two tokens
        const eventMarkerResult = this.tryEventMarkerPrefix(input, pos);
        if (eventMarkerResult) {
          tokens.push(eventMarkerResult.marker);
          accept(eventMarkerResult.event);
          continue;
        }

        // Conjunction prefix (ו) attached to a keyword: emit only the
        // conjunction here; the next iteration tokenizes the rest of the word.
        const prefixResult = this.tryPrefixConjunction(input, pos);
        if (prefixResult) {
          accept(prefixResult.conjunction);
          continue;
        }

        const wordToken = this.extractHebrewWord(input, pos);
        if (wordToken) {
          accept(wordToken);
          continue;
        }
      }

      // ASCII word (for mixed content)
      if (isAsciiIdentifierChar(input[pos])) {
        const asciiToken = this.extractAsciiWord(input, pos);
        if (asciiToken) {
          accept(asciiToken);
          continue;
        }
      }

      // Skip unknown character
      pos++;
    }

    return new TokenStreamImpl(tokens, 'he');
  }

  /**
   * Classify a single token string.
   *
   * Checks are ordered: standalone preposition, keyword, selector prefix
   * (`#`, `.`, `[`), quoted string, leading digit; anything else is an
   * identifier.
   *
   * @param token - Token text to classify.
   * @returns The token kind.
   */
  classifyToken(token: string): TokenKind {
    if (PREPOSITIONS.has(token)) return 'particle';
    if (this.isKeyword(token)) return 'keyword';
    if (token.startsWith('#') || token.startsWith('.') || token.startsWith('[')) return 'selector';
    if (token.startsWith('"') || token.startsWith("'")) return 'literal';
    if (/^\d/.test(token)) return 'literal';

    return 'identifier';
  }

  /**
   * Return the index just past the run of Hebrew characters starting at
   * `from` (equal to `from` when the character there is not Hebrew).
   */
  private scanHebrewRun(input: string, from: number): number {
    let end = from;
    while (end < input.length && isHebrew(input[end])) {
      end++;
    }
    return end;
  }

  /**
   * Build a 'particle' token for a preposition, carrying the preposition
   * text in `metadata.prepositionValue`.
   */
  private createPrepositionToken(prep: string, start: number, end: number): LanguageToken {
    return {
      ...createToken(prep, 'particle', createPosition(start, end)),
      metadata: {
        prepositionValue: prep,
      },
    };
  }

  /**
   * Try to match a standalone Hebrew preposition at `pos`.
   *
   * A candidate only counts when it is followed by end of input, whitespace,
   * or a non-Hebrew character, i.e. when it is not the start of a longer
   * Hebrew word.
   *
   * @returns The particle token, or `null` when no preposition matches.
   */
  private tryPreposition(input: string, pos: number): LanguageToken | null {
    for (const prep of HebrewTokenizer.prepositionsLongestFirst) {
      if (!input.startsWith(prep, pos)) continue;

      const nextPos = pos + prep.length;
      const standalone =
        nextPos >= input.length || isWhitespace(input[nextPos]) || !isHebrew(input[nextPos]);
      if (standalone) {
        return this.createPrepositionToken(prep, pos, nextPos);
      }
    }
    return null;
  }

  /**
   * Try to split off a conjunction prefix (ו - "and") attached to the
   * following word.
   *
   * The conjunction attaches directly to words without a space:
   * - והחלף → ו + החלף (and + toggle)
   * - ולחיצה → ו + לחיצה (and + click)
   *
   * The split only happens when the whole word is neither a keyword nor a
   * standalone preposition, at least two Hebrew characters follow the prefix,
   * and the remainder is itself a keyword.
   *
   * @returns The conjunction token only (the caller re-scans the remainder),
   *   or `null` when the word should not be split.
   */
  private tryPrefixConjunction(input: string, pos: number): { conjunction: LanguageToken } | null {
    // Cheapest test first: most words do not start with a conjunction letter.
    const char = input[pos];
    const conjEntry = CONJUNCTIONS.get(char);
    if (!conjEntry) return null;

    // CRITICAL: the full word must be checked BEFORE splitting, otherwise a
    // keyword or preposition that merely begins with ו would be torn apart.
    const wordEnd = this.scanHebrewRun(input, pos);
    const fullWord = input.slice(pos, wordEnd);
    if (this.lookupKeyword(fullWord) || PREPOSITIONS.has(fullWord)) {
      return null; // Let the regular word/preposition handling deal with it
    }

    // Require at least 2 Hebrew characters after the prefix to avoid false
    // positives. This also covers a standalone ו and ו at end of input.
    const nextPos = pos + 1;
    if (wordEnd - nextPos < 2) {
      return null;
    }

    // Split only when what remains is a known keyword.
    const afterPrefix = input.slice(nextPos, wordEnd);
    if (!this.lookupKeyword(afterPrefix)) {
      return null;
    }

    return {
      conjunction: createToken(char, 'conjunction', createPosition(pos, nextPos), conjEntry),
    };
  }

  /**
   * Try to split an event marker prefix (ב, כ) from the event name it is
   * attached to.
   *
   * Event markers attach directly to event names without a space:
   * - בלחיצה → ב + לחיצה (on + click)
   * - כשינוי → כ + שינוי (when + change)
   *
   * @returns Both the marker token (normalized to 'on' or 'when') and the
   *   event name token, or `null` when the prefix is not followed by a known
   *   event name.
   */
  private tryEventMarkerPrefix(
    input: string,
    pos: number
  ): { marker: LanguageToken; event: LanguageToken } | null {
    const char = input[pos];
    const markerNormalized = EVENT_MARKER_PREFIXES.get(char);
    if (!markerNormalized) return null;

    // The word after the prefix; empty when no Hebrew character follows, in
    // which case it cannot be a known event name.
    const nextPos = pos + 1;
    const wordEnd = this.scanHebrewRun(input, nextPos);
    const afterPrefix = input.slice(nextPos, wordEnd);
    if (!EVENT_NAMES.has(afterPrefix)) return null;

    const markerToken = createToken(
      char,
      'keyword',
      createPosition(pos, nextPos),
      markerNormalized // normalized to 'on' or 'when'
    );

    // Attach the event's normalized form when the keyword table knows it.
    const eventKeywordEntry = this.lookupKeyword(afterPrefix);
    const eventPosition = createPosition(nextPos, wordEnd);
    const eventToken = eventKeywordEntry
      ? createToken(afterPrefix, 'keyword', eventPosition, eventKeywordEntry.normalized)
      : createToken(afterPrefix, 'keyword', eventPosition);

    return { marker: markerToken, event: eventToken };
  }

  /**
   * Extract the run of Hebrew characters starting at `startPos` as a token.
   *
   * Resolution order: exact keyword, standalone preposition, morphological
   * keyword match, and finally a plain identifier.
   *
   * NOTE(review): a word here is a single run of Hebrew characters, so keyword
   * entries that contain a space are presumably never matched by this lookup.
   * Confirm against the keyword lookup in the base tokenizer.
   *
   * @returns The token, or `null` when no Hebrew character is at `startPos`.
   */
  private extractHebrewWord(input: string, startPos: number): LanguageToken | null {
    const end = this.scanHebrewRun(input, startPos);
    if (end === startPos) return null;

    const word = input.slice(startPos, end);

    const keywordEntry = this.lookupKeyword(word);
    if (keywordEntry) {
      return createToken(word, 'keyword', createPosition(startPos, end), keywordEntry.normalized);
    }

    // Preposition (with metadata for disambiguation)
    if (PREPOSITIONS.has(word)) {
      return this.createPrepositionToken(word, startPos, end);
    }

    // Morphological normalization for prefix variations
    const morphToken = this.tryMorphKeywordMatch(word, startPos, end);
    if (morphToken) return morphToken;

    // Not a keyword or recognized form
    return createToken(word, 'identifier', createPosition(startPos, end));
  }

  /**
   * Extract the run of ASCII identifier characters starting at `startPos`
   * as an identifier token.
   *
   * @returns The token, or `null` when no ASCII identifier character is at
   *   `startPos`.
   */
  private extractAsciiWord(input: string, startPos: number): LanguageToken | null {
    let end = startPos;
    while (end < input.length && isAsciiIdentifierChar(input[end])) {
      end++;
    }
    if (end === startPos) return null;

    return createToken(input.slice(startPos, end), 'identifier', createPosition(startPos, end));
  }

  /**
   * Extract a number, including Hebrew time unit suffixes. Signs are not
   * accepted; whitespace between the number and its unit is allowed.
   */
  private extractHebrewNumber(input: string, startPos: number): LanguageToken | null {
    return this.tryNumberWithTimeUnits(input, startPos, HEBREW_TIME_UNITS, {
      allowSign: false,
      skipWhitespace: true,
    });
  }
}

/**
 * Singleton instance.
 */
export const hebrewTokenizer = new HebrewTokenizer();
|
package/src/tokenizers/index.ts
CHANGED
|
@@ -85,6 +85,7 @@ export { vietnameseTokenizer } from './vietnamese';
|
|
|
85
85
|
export { polishTokenizer } from './polish';
|
|
86
86
|
export { russianTokenizer } from './russian';
|
|
87
87
|
export { ukrainianTokenizer } from './ukrainian';
|
|
88
|
+
export { hebrewTokenizer } from './he';
|
|
88
89
|
export { hindiTokenizer } from './hindi';
|
|
89
90
|
export { bengaliTokenizer } from './bengali';
|
|
90
91
|
export { thaiTokenizer } from './thai';
|
|
@@ -99,7 +99,7 @@ const PARTICLE_ROLES = new Map<string, ParticleMetadata>([
|
|
|
99
99
|
['まで', { role: 'destination', confidence: 0.75, description: 'until/boundary marker' }],
|
|
100
100
|
['へ', { role: 'destination', confidence: 0.9, description: 'direction marker' }],
|
|
101
101
|
['と', { role: 'style', confidence: 0.7, description: 'with/and marker' }],
|
|
102
|
-
['の', { role: '
|
|
102
|
+
['の', { role: 'destination', confidence: 0.75, description: 'possessive/destination marker' }],
|
|
103
103
|
['が', { role: 'agent', confidence: 0.85, description: 'subject marker' }],
|
|
104
104
|
['は', { role: 'agent', confidence: 0.75, description: 'topic marker' }],
|
|
105
105
|
['も', { role: 'patient', confidence: 0.65, description: 'also/too marker' }],
|
|
@@ -165,6 +165,8 @@ const JAPANESE_EXTRAS: KeywordEntry[] = [
|
|
|
165
165
|
{ native: 'もし', normalized: 'if' }, // Starts with particle も, needs explicit entry
|
|
166
166
|
{ native: 'ならば', normalized: 'then' },
|
|
167
167
|
{ native: 'なら', normalized: 'then' },
|
|
168
|
+
{ native: 'それから', normalized: 'then' }, // Chain connector
|
|
169
|
+
{ native: 'そして', normalized: 'and' }, // Alternative connector
|
|
168
170
|
|
|
169
171
|
// Time units
|
|
170
172
|
{ native: '秒', normalized: 's' },
|