@lokascript/i18n 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +286 -0
- package/dist/browser.cjs +7669 -0
- package/dist/browser.cjs.map +1 -0
- package/dist/browser.d.cts +50 -0
- package/dist/browser.d.ts +50 -0
- package/dist/browser.js +7592 -0
- package/dist/browser.js.map +1 -0
- package/dist/hyperfixi-i18n.min.js +2 -0
- package/dist/hyperfixi-i18n.min.js.map +1 -0
- package/dist/hyperfixi-i18n.mjs +8558 -0
- package/dist/hyperfixi-i18n.mjs.map +1 -0
- package/dist/index.cjs +14205 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +947 -0
- package/dist/index.d.ts +947 -0
- package/dist/index.js +14095 -0
- package/dist/index.js.map +1 -0
- package/dist/transformer-Ckask-yw.d.cts +1041 -0
- package/dist/transformer-Ckask-yw.d.ts +1041 -0
- package/package.json +84 -0
- package/src/browser.ts +122 -0
- package/src/compatibility/browser-tests/grammar-demo.spec.ts +169 -0
- package/src/constants.ts +366 -0
- package/src/dictionaries/ar.ts +233 -0
- package/src/dictionaries/bn.ts +156 -0
- package/src/dictionaries/de.ts +233 -0
- package/src/dictionaries/derive.ts +515 -0
- package/src/dictionaries/en.ts +237 -0
- package/src/dictionaries/es.ts +233 -0
- package/src/dictionaries/fr.ts +233 -0
- package/src/dictionaries/hi.ts +270 -0
- package/src/dictionaries/id.ts +233 -0
- package/src/dictionaries/index.ts +238 -0
- package/src/dictionaries/it.ts +233 -0
- package/src/dictionaries/ja.ts +233 -0
- package/src/dictionaries/ko.ts +233 -0
- package/src/dictionaries/ms.ts +276 -0
- package/src/dictionaries/pl.ts +239 -0
- package/src/dictionaries/pt.ts +237 -0
- package/src/dictionaries/qu.ts +233 -0
- package/src/dictionaries/ru.ts +270 -0
- package/src/dictionaries/sw.ts +233 -0
- package/src/dictionaries/th.ts +156 -0
- package/src/dictionaries/tl.ts +276 -0
- package/src/dictionaries/tr.ts +233 -0
- package/src/dictionaries/uk.ts +270 -0
- package/src/dictionaries/vi.ts +210 -0
- package/src/dictionaries/zh.ts +233 -0
- package/src/enhanced-i18n.test.ts +454 -0
- package/src/enhanced-i18n.ts +713 -0
- package/src/examples/new-languages.ts +326 -0
- package/src/formatting.test.ts +213 -0
- package/src/formatting.ts +416 -0
- package/src/grammar/direct-mappings.ts +353 -0
- package/src/grammar/grammar.test.ts +1053 -0
- package/src/grammar/index.ts +59 -0
- package/src/grammar/profiles/index.ts +860 -0
- package/src/grammar/transformer.ts +1318 -0
- package/src/grammar/types.ts +630 -0
- package/src/index.ts +202 -0
- package/src/new-languages.test.ts +389 -0
- package/src/parser/analyze-conflicts.test.ts +229 -0
- package/src/parser/ar.ts +40 -0
- package/src/parser/create-provider.ts +309 -0
- package/src/parser/de.ts +36 -0
- package/src/parser/es.ts +31 -0
- package/src/parser/fr.ts +31 -0
- package/src/parser/id.ts +34 -0
- package/src/parser/index.ts +50 -0
- package/src/parser/ja.ts +36 -0
- package/src/parser/ko.ts +37 -0
- package/src/parser/locale-manager.test.ts +198 -0
- package/src/parser/locale-manager.ts +197 -0
- package/src/parser/parser-integration.test.ts +439 -0
- package/src/parser/pt.ts +37 -0
- package/src/parser/qu.ts +37 -0
- package/src/parser/sw.ts +37 -0
- package/src/parser/tr.ts +38 -0
- package/src/parser/types.ts +113 -0
- package/src/parser/zh.ts +38 -0
- package/src/plugins/vite.ts +224 -0
- package/src/plugins/webpack.ts +124 -0
- package/src/pluralization.test.ts +197 -0
- package/src/pluralization.ts +393 -0
- package/src/runtime.ts +441 -0
- package/src/ssr-integration.ts +225 -0
- package/src/test-setup.ts +195 -0
- package/src/translation-validation.test.ts +171 -0
- package/src/translator.test.ts +252 -0
- package/src/translator.ts +297 -0
- package/src/types.ts +209 -0
- package/src/utils/locale.ts +190 -0
- package/src/utils/tokenizer-adapter.ts +469 -0
- package/src/utils/tokenizer.ts +19 -0
- package/src/validators/index.ts +174 -0
- package/src/validators/schema.ts +129 -0
package/src/utils/tokenizer-adapter.ts
@@ -0,0 +1,469 @@
/**
 * Tokenizer Adapter
 *
 * Bridges the semantic package's sophisticated tokenizers to i18n's
 * dictionary-based token categorization system.
 *
 * The semantic tokenizers handle:
 * - Language-specific word boundaries
 * - CSS selectors, URLs, string literals
 * - Grammatical particles (を, に, من)
 * - Morphological normalization
 *
 * This adapter converts semantic tokens to i18n tokens by:
 * 1. Using semantic's tokenize() for sophisticated tokenization
 * 2. Mapping TokenKind to TokenType via dictionary lookup for keywords
 * 3. Preserving position information for round-trip support
 */

import type { Token, TokenType } from '../types';
import { dictionaries } from '../dictionaries';

// =============================================================================
// Semantic Tokenizer Integration
// =============================================================================

// Types from semantic package (inlined to avoid circular dependency issues)
interface SourcePosition {
  readonly start: number;
  readonly end: number;
  readonly line?: number;
  readonly column?: number;
}

type TokenKind =
  | 'keyword'
  | 'selector'
  | 'literal'
  | 'particle'
  | 'conjunction' // Grammatical conjunction (Arabic و/ف proclitics)
  | 'event-modifier' // Event modifiers (.once, .prevent, etc.)
  | 'identifier'
  | 'operator'
  | 'punctuation'
  | 'url';

interface LanguageToken {
  readonly value: string;
  readonly kind: TokenKind;
  readonly position: SourcePosition;
  readonly normalized?: string;
  readonly stem?: string;
  readonly stemConfidence?: number;
}

interface TokenStream {
  readonly tokens: readonly LanguageToken[];
  readonly language: string;
}

// Lazy load semantic tokenizers to avoid circular dependency at module load time
let _semanticTokenize: ((input: string, language: string) => TokenStream) | null = null;
let _isLanguageSupported: ((language: string) => boolean) | null = null;
let _semanticLoaded = false;
let _semanticLoadFailed = false;

async function loadSemantic(): Promise<boolean> {
  if (_semanticLoaded) return !_semanticLoadFailed;
  if (_semanticLoadFailed) return false;

  try {
    // Dynamic import to break circular dependency
    const semantic = await import('@lokascript/semantic');
    _semanticTokenize = semantic.tokenize;
    _isLanguageSupported = semantic.isLanguageSupported;
    _semanticLoaded = true;
    return true;
  } catch {
    _semanticLoadFailed = true;
    return false;
  }
}

// Synchronous check - only works after first async load
function isSemanticAvailable(): boolean {
  return _semanticLoaded && !_semanticLoadFailed;
}

function semanticTokenize(input: string, language: string): TokenStream | null {
  if (!_semanticTokenize) return null;
  try {
    return _semanticTokenize(input, language);
  } catch {
    return null;
  }
}

function semanticIsLanguageSupported(language: string): boolean {
  if (!_isLanguageSupported) return false;
  return _isLanguageSupported(language);
}

// =============================================================================
// Token Type Mapping
// =============================================================================

/**
 * Map semantic TokenKind to i18n TokenType.
 *
 * Semantic TokenKind → i18n TokenType:
 * - keyword: use dictionary lookup to determine the specific type
 * - selector: identifier (selectors don't get translated)
 * - literal: literal
 * - particle: modifier (grammatical particles)
 * - conjunction: modifier
 * - event-modifier: modifier
 * - identifier: identifier
 * - operator: operator
 * - punctuation: operator
 * - url: literal
 */
function mapTokenKind(kind: TokenKind): TokenType {
  switch (kind) {
    case 'keyword':
      // Will be refined by dictionary lookup
      return 'identifier';
    case 'selector':
      return 'identifier';
    case 'literal':
      return 'literal';
    case 'particle':
      return 'modifier';
    case 'conjunction':
      return 'modifier';
    case 'event-modifier':
      return 'modifier';
    case 'identifier':
      return 'identifier';
    case 'operator':
      return 'operator';
    case 'punctuation':
      return 'operator';
    case 'url':
      return 'literal';
    default:
      return 'identifier';
  }
}

/**
 * Determine the specific i18n TokenType for a word by looking it up
 * in the locale's dictionary.
 *
 * Order matters: events before commands, since many events (click, focus)
 * also appear in commands.
 */
function categorizeWord(word: string, locale: string): TokenType {
  const lowerWord = word.toLowerCase();

  // Map dictionary categories to token types (events first to handle 'click' etc.)
  const categoryToType: Array<[string, TokenType]> = [
    ['events', 'event'],
    ['commands', 'command'],
    ['expressions', 'expression'],
    ['modifiers', 'modifier'],
    ['logical', 'logical'],
    ['temporal', 'temporal'],
    ['values', 'value'],
    ['attributes', 'attribute'],
  ];

  // Check all supported dictionaries (source locale + English)
  const localesToCheck = locale === 'en' ? ['en'] : [locale, 'en'];

  for (const loc of localesToCheck) {
    const dict = dictionaries[loc];
    if (!dict) continue;

    // Check categories in priority order (events before commands)
    for (const [category, tokenType] of categoryToType) {
      const translations = dict[category as keyof typeof dict];
      if (!translations || typeof translations !== 'object') continue;

      // Check if word matches a key (English) or value (translated)
      for (const [key, value] of Object.entries(translations)) {
        if (key.toLowerCase() === lowerWord || value.toLowerCase() === lowerWord) {
          return tokenType;
        }
      }
    }
  }

  // Default to identifier
  return 'identifier';
}

// =============================================================================
// Token Conversion
// =============================================================================

/**
 * Convert a semantic LanguageToken to an i18n Token.
 */
function convertToken(token: LanguageToken, locale: string): Token {
  let type: TokenType;

  if (token.kind === 'keyword') {
    // For keywords, use dictionary lookup to get the specific type.
    // Use the normalized form if available (e.g., 切り替え → toggle).
    const lookupWord = token.normalized || token.value;
    type = categorizeWord(lookupWord, locale);
  } else {
    type = mapTokenKind(token.kind);
  }

  return {
    type,
    value: token.value,
    position: {
      start: token.position.start,
      end: token.position.end,
      line: token.position.line ?? 1,
      column: token.position.column ?? token.position.start + 1,
    },
  };
}

// =============================================================================
// Public API
// =============================================================================

/**
 * Initialize the semantic tokenizer (call once at app startup for best performance).
 * This is optional - tokenize() will work without it, using fallback tokenization.
 */
export async function initSemanticTokenizer(): Promise<boolean> {
  return loadSemantic();
}

/**
 * Tokenize input using the semantic package's sophisticated tokenizers,
 * converting to i18n-compatible tokens.
 *
 * Falls back to basic tokenization if:
 * - the semantic package is not loaded yet
 * - the language is not supported by semantic
 * - any error occurs
 *
 * For best performance, call initSemanticTokenizer() at app startup.
 */
export function tokenize(text: string, locale: string): Token[] {
  // Try semantic tokenization if available
  if (isSemanticAvailable() && semanticIsLanguageSupported(locale)) {
    const stream = semanticTokenize(text, locale);
    if (stream) {
      const tokens: Token[] = [];
      for (const semanticToken of stream.tokens) {
        tokens.push(convertToken(semanticToken, locale));
      }
      return tokens;
    }
  }

  // Fall back to basic tokenization
  return tokenizeBasic(text, locale);
}

/**
 * Tokenize with async initialization of the semantic tokenizers.
 * Useful when you want to ensure semantic tokenization is used.
 */
export async function tokenizeAsync(text: string, locale: string): Promise<Token[]> {
  await loadSemantic();
  return tokenize(text, locale);
}

/**
 * Basic tokenization fallback for unsupported languages.
 * This is a simplified version that handles common patterns.
 */
function tokenizeBasic(text: string, locale: string): Token[] {
  const tokens: Token[] = [];
  let position = 0;
  let line = 1;
  let column = 1;

  while (position < text.length) {
    const start = position;
    const startLine = line;
    const startColumn = column;

    // Skip whitespace but track it
    if (isWhitespace(text[position])) {
      const whitespace = consumeWhitespace(text, position);
      tokens.push({
        type: 'literal',
        value: whitespace,
        position: {
          start,
          end: position + whitespace.length,
          line: startLine,
          column: startColumn,
        },
      });

      // Update position tracking
      for (let i = 0; i < whitespace.length; i++) {
        if (whitespace[i] === '\n') {
          line++;
          column = 1;
        } else {
          column++;
        }
      }
      position += whitespace.length;
      continue;
    }

    // String literals
    if (text[position] === '"' || text[position] === "'") {
      const quote = text[position];
      let value = quote;
      position++;
      column++;

      while (position < text.length && text[position] !== quote) {
        if (text[position] === '\\' && position + 1 < text.length) {
          value += text[position] + text[position + 1];
          position += 2;
          column += 2;
        } else {
          value += text[position];
          if (text[position] === '\n') {
            line++;
            column = 1;
          } else {
            column++;
          }
          position++;
        }
      }

      if (position < text.length) {
        value += text[position];
        position++;
        column++;
      }

      tokens.push({
        type: 'literal',
        value,
        position: {
          start,
          end: position,
          line: startLine,
          column: startColumn,
        },
      });
      continue;
    }

    // Numbers
    if (isDigit(text[position])) {
      const number = consumeNumber(text, position);
      tokens.push({
        type: 'literal',
        value: number,
        position: {
          start,
          end: position + number.length,
          line: startLine,
          column: startColumn,
        },
      });
      position += number.length;
      column += number.length;
      continue;
    }

    // Identifiers and keywords
    if (isIdentifierStart(text[position])) {
      const word = consumeIdentifier(text, position);
      const tokenType = categorizeWord(word, locale);

      tokens.push({
        type: tokenType,
        value: word,
        position: {
          start,
          end: position + word.length,
          line: startLine,
          column: startColumn,
        },
      });
      position += word.length;
      column += word.length;
      continue;
    }

    // Operators and punctuation
    const operator = consumeOperator(text, position);
    tokens.push({
      type: 'operator',
      value: operator,
      position: {
        start,
        end: position + operator.length,
        line: startLine,
        column: startColumn,
      },
    });
    position += operator.length;
    column += operator.length;
  }

  return tokens;
}

// =============================================================================
// Helper Functions
// =============================================================================

function isWhitespace(char: string): boolean {
  return /\s/.test(char);
}

function isDigit(char: string): boolean {
  return /\d/.test(char);
}

function isIdentifierStart(char: string): boolean {
  return /[a-zA-Z_$áéíóúñÑàèìòùÀÈÌÒÙ一-龯ㄱ-ㅎㅏ-ㅣ가-힣]/.test(char);
}

function isIdentifierPart(char: string): boolean {
  return /[a-zA-Z0-9_$áéíóúñÑàèìòùÀÈÌÒÙ一-龯ㄱ-ㅎㅏ-ㅣ가-힣-]/.test(char);
}

function consumeWhitespace(text: string, start: number): string {
  let end = start;
  while (end < text.length && isWhitespace(text[end])) {
    end++;
  }
  return text.substring(start, end);
}

function consumeNumber(text: string, start: number): string {
  let end = start;
  while (end < text.length && (isDigit(text[end]) || text[end] === '.')) {
    end++;
  }
  return text.substring(start, end);
}

function consumeIdentifier(text: string, start: number): string {
  let end = start;
  while (end < text.length && isIdentifierPart(text[end])) {
    end++;
  }
  return text.substring(start, end);
}

function consumeOperator(text: string, start: number): string {
  // Try to match multi-character operators first
  const twoChar = text.substring(start, start + 2);
  if (['==', '!=', '<=', '>=', '&&', '||', '..'].includes(twoChar)) {
    return twoChar;
  }

  // Single character operators
  return text[start];
}
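Taken together, the public API above supports two usage patterns: eager initialization at startup followed by synchronous tokenize() calls, or a one-shot tokenizeAsync(). A minimal consumer sketch follows; the import path and the sample inputs are illustrative assumptions, not taken from this package's documented entry point:

// Hypothetical consumer of the adapter's public API; adjust the import
// path to the package's actual public entry point.
import { initSemanticTokenizer, tokenize, tokenizeAsync } from '@lokascript/i18n';

async function main(): Promise<void> {
  // Pattern 1: load the semantic tokenizers once at startup; if loading
  // fails, tokenize() silently falls back to tokenizeBasic().
  const semanticReady = await initSemanticTokenizer();
  console.log('semantic tokenizer loaded:', semanticReady);

  const tokens = tokenize('toggle .menu on click', 'en');
  for (const t of tokens) {
    console.log(t.type, JSON.stringify(t.value), t.position.start, t.position.end);
  }

  // Pattern 2: a one-shot call that awaits the load attempt first.
  const jaTokens = await tokenizeAsync('切り替え .menu', 'ja');
  console.log(jaTokens.length, 'tokens');
}

main().catch(console.error);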
package/src/utils/tokenizer.ts
@@ -0,0 +1,19 @@
/**
 * Tokenizer Module
 *
 * This module now delegates to the semantic package's sophisticated tokenizers
 * via the tokenizer-adapter. This consolidation provides:
 *
 * - Language-specific tokenization (13 languages)
 * - CSS selector, URL, and string literal handling
 * - Grammatical particle recognition (を, に, من)
 * - Morphological normalization
 *
 * The adapter converts semantic tokens to i18n-compatible Token types
 * using dictionary-based categorization.
 *
 * @see ./tokenizer-adapter.ts for the implementation
 */

// Re-export functions from the adapter
export { tokenize, tokenizeAsync, initSemanticTokenizer } from './tokenizer-adapter';
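For reference, a test-style sketch of this re-exported surface. The expected shapes are inferred from the fallback logic in tokenizer-adapter.ts, plus the assumption that 'click' appears under the English dictionary's events category; the framework shown is vitest-style, matching the repo's *.test.ts naming, but is not confirmed by this diff:

// Hypothetical test sketch; the assertions follow from tokenizeBasic():
// known English keywords are categorized via dictionary lookup, and
// whitespace between words is emitted as a literal token.
import { describe, expect, it } from 'vitest';
import { tokenize } from './tokenizer';

describe('tokenizer re-exports', () => {
  it('categorizes known keywords via dictionary lookup', () => {
    const tokens = tokenize('click menu', 'en');
    // Assumes 'click' is listed under events in the English dictionary.
    expect(tokens[0]).toMatchObject({ type: 'event', value: 'click' });
    // The separating space is preserved as a literal token.
    expect(tokens[1]).toMatchObject({ type: 'literal', value: ' ' });
  });
});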
package/src/validators/index.ts
@@ -0,0 +1,174 @@
// packages/i18n/src/validators/index.ts

import {
  Dictionary,
  DICTIONARY_CATEGORIES,
  ValidationResult,
  ValidationError,
  ValidationWarning,
} from '../types';
import { RequiredCategories, RequiredKeys } from './schema';

export function validate(dictionary: Dictionary, locale: string): ValidationResult {
  const errors: ValidationError[] = [];
  const warnings: ValidationWarning[] = [];
  const coverage = {
    total: 0,
    translated: 0,
    missing: [] as string[],
  };

  // Check required categories
  for (const category of RequiredCategories) {
    if (!(category in dictionary)) {
      errors.push({
        type: 'missing',
        key: category,
        message: `Missing required category: ${category}`,
      });
    }
  }

  // Check required keys in each category
  for (const [category, requiredKeys] of Object.entries(RequiredKeys)) {
    const categoryDict = dictionary[category as keyof Dictionary];

    if (!categoryDict) continue;

    for (const key of requiredKeys) {
      coverage.total++;

      if (!(key in categoryDict)) {
        errors.push({
          type: 'missing',
          key: `${category}.${key}`,
          message: `Missing required key: ${key} in category ${category}`,
        });
        coverage.missing.push(`${category}.${key}`);
      } else if (!categoryDict[key] || categoryDict[key].trim() === '') {
        errors.push({
          type: 'invalid',
          key: `${category}.${key}`,
          message: `Empty translation for key: ${key} in category ${category}`,
        });
      } else {
        coverage.translated++;
      }
    }
  }

  // Check for duplicates using type-safe category iteration
  const seen = new Map<string, string>();

  for (const category of DICTIONARY_CATEGORIES) {
    const translations = dictionary[category];
    if (!translations) continue;

    for (const [key, value] of Object.entries(translations)) {
      if (seen.has(value)) {
        warnings.push({
          type: 'inconsistent',
          key: `${category}.${key}`,
          message: `Duplicate translation "${value}" also used for ${seen.get(value)}`,
        });
      } else {
        seen.set(value, `${category}.${key}`);
      }
    }
  }

  // Language-specific validations
  validateLocaleSpecific(dictionary, locale, errors, warnings);

  return {
    valid: errors.length === 0,
    errors,
    warnings,
    coverage,
  };
}

function validateLocaleSpecific(
  dictionary: Dictionary,
  locale: string,
  _errors: ValidationError[],
  warnings: ValidationWarning[]
): void {
  // Spanish-specific validations
  if (locale === 'es') {
    // Check for gender consistency
    if (dictionary.values?.true === 'verdadero' && dictionary.values?.false === 'falso') {
      // Both masculine, good
    } else if (dictionary.values?.true === 'verdadera' && dictionary.values?.false === 'falsa') {
      // Both feminine, good
    } else if (dictionary.values?.true && dictionary.values?.false) {
      warnings.push({
        type: 'inconsistent',
        key: 'values.true/false',
        message: 'Gender inconsistency between true/false translations',
      });
    }
  }

  // Korean-specific validations
  if (locale === 'ko') {
    // Check for consistent honorific levels
    const formalEndings = ['습니다', '세요'];
    const informalEndings = ['다', '어', '아'];

    let formalCount = 0;
    let informalCount = 0;

    for (const category of DICTIONARY_CATEGORIES) {
      const translations = dictionary[category];
      if (!translations) continue;
      for (const value of Object.values(translations)) {
        if (formalEndings.some(ending => value.endsWith(ending))) {
          formalCount++;
        }
        if (informalEndings.some(ending => value.endsWith(ending))) {
          informalCount++;
        }
      }
    }

    if (formalCount > 0 && informalCount > 0) {
      warnings.push({
        type: 'inconsistent',
        key: 'global',
        message: 'Mixed formal and informal speech levels',
      });
    }
  }

  // Chinese-specific validations
  if (locale === 'zh' || locale === 'zh-TW') {
    // Check for simplified vs traditional consistency
    const simplified = ['设', '获', '显', '发'];
    const traditional = ['設', '獲', '顯', '發'];

    let hasSimplified = false;
    let hasTraditional = false;

    for (const category of DICTIONARY_CATEGORIES) {
      const translations = dictionary[category];
      if (!translations) continue;
      for (const value of Object.values(translations)) {
        if (simplified.some(char => value.includes(char))) {
          hasSimplified = true;
        }
        if (traditional.some(char => value.includes(char))) {
          hasTraditional = true;
        }
      }
    }

    if (hasSimplified && hasTraditional) {
      warnings.push({
        type: 'inconsistent',
        key: 'global',
        message: 'Mixed simplified and traditional Chinese characters',
      });
    }
  }
}
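A quick usage sketch for validate(). The partial Spanish dictionary below is hypothetical, and the exact Dictionary shape and required keys are defined in '../types' and './schema', so treat this only as a shape of the call, not a verified fixture:

// Hypothetical usage of the validator; the cast acknowledges that this
// partial object does not satisfy the full Dictionary type.
import { validate } from './validators';
import type { Dictionary } from './types';

const esDictionary = {
  commands: { toggle: 'alternar', show: 'mostrar' },
  values: { true: 'verdadero', false: 'falsa' }, // mixed gender on purpose
} as unknown as Dictionary;

const result = validate(esDictionary, 'es');

console.log(result.valid); // false if any required category or key is missing
console.log(`${result.coverage.translated}/${result.coverage.total} required keys translated`);
for (const w of result.warnings) {
  // Expect a gender-consistency warning on values.true/false
  console.log(`[warn] ${w.key}: ${w.message}`);
}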