glost 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/types.ts ADDED
@@ -0,0 +1,565 @@
1
+ import type { Literal as NlcstLiteral, Node as NlcstNode, Paragraph as NlcstParagraph, Parent as NlcstParent, Punctuation as NlcstPunctuation, Root as NlcstRoot, RootContent as NlcstRootContent, Sentence as NlcstSentence, Source as NlcstSource, Symbol as NlcstSymbol, Text as NlcstText, WhiteSpace as NlcstWhiteSpace, Word as NlcstWord } from "nlcst";
2
+
3
+
4
+
5
+
6
+
7
+ // ============================================================================
8
+ // Core GLOST Types
9
+ // ============================================================================
10
+
11
+ /**
12
+ * Linguistic level of a text segment
13
+ */
14
+ export type LinguisticLevel =
15
+ | "character"
16
+ | "syllable"
17
+ | "word"
18
+ | "phrase"
19
+ | "sentence"
20
+ | "paragraph";
21
+
22
+ /**
23
+ * Context for pronunciation variants
24
+ */
25
+ export type PronunciationContext =
26
+ | "formal"
27
+ | "informal"
28
+ | "historical"
29
+ | "regional"
30
+ | "dialectal";
31
+
32
+ /**
33
+ * Transcription system identifiers
34
+ */
35
+ export type TranscriptionSystem =
36
+ | "rtgs" // Royal Thai General System
37
+ | "aua" // AUA (American University Alumni)
38
+ | "paiboon" // Paiboon system
39
+ | "romaji" // Japanese romanization
40
+ | "furigana" // Japanese furigana
41
+ | "ipa" // International Phonetic Alphabet
42
+ | "pinyin" // Chinese pinyin
43
+ | "hangul" // Korean hangul
44
+ | string; // Allow custom systems
45
+
46
+ /**
47
+ * Language codes following BCP-47 format (RFC 5646)
48
+ * Format: language[-script][-region][-variant]
49
+ * Examples: th-TH, ja-JP, zh-CN, ko-KR, en-US, fr-FR, de-DE
50
+ */
51
+ export type LanguageCode =
52
+ // Thai
53
+ | "th-TH" // Thai (Thailand)
54
+ | "th" // Thai (generic)
55
+
56
+ // Japanese
57
+ | "ja-JP" // Japanese (Japan)
58
+ | "ja" // Japanese (generic)
59
+
60
+ // Chinese
61
+ | "zh-CN" // Chinese (Simplified, China)
62
+ | "zh-TW" // Chinese (Traditional, Taiwan)
63
+ | "zh-HK" // Chinese (Hong Kong)
64
+ | "zh" // Chinese (generic)
65
+
66
+ // Korean
67
+ | "ko-KR" // Korean (South Korea)
68
+ | "ko-KP" // Korean (North Korea)
69
+ | "ko" // Korean (generic)
70
+
71
+ // English
72
+ | "en-US" // English (United States)
73
+ | "en-GB" // English (United Kingdom)
74
+ | "en-CA" // English (Canada)
75
+ | "en-AU" // English (Australia)
76
+ | "en" // English (generic)
77
+
78
+ // French
79
+ | "fr-FR" // French (France)
80
+ | "fr-CA" // French (Canada)
81
+ | "fr-BE" // French (Belgium)
82
+ | "fr" // French (generic)
83
+
84
+ // German
85
+ | "de-DE" // German (Germany)
86
+ | "de-AT" // German (Austria)
87
+ | "de-CH" // German (Switzerland)
88
+ | "de" // German (generic)
89
+
90
+ // Spanish
91
+ | "es-ES" // Spanish (Spain)
92
+ | "es-MX" // Spanish (Mexico)
93
+ | "es-AR" // Spanish (Argentina)
94
+ | "es" // Spanish (generic)
95
+
96
+ // Italian
97
+ | "it-IT" // Italian (Italy)
98
+ | "it-CH" // Italian (Switzerland)
99
+ | "it" // Italian (generic)
100
+
101
+ // Portuguese
102
+ | "pt-PT" // Portuguese (Portugal)
103
+ | "pt-BR" // Portuguese (Brazil)
104
+ | "pt" // Portuguese (generic)
105
+
106
+ // Russian
107
+ | "ru-RU" // Russian (Russia)
108
+ | "ru" // Russian (generic)
109
+
110
+ // Arabic
111
+ | "ar-SA" // Arabic (Saudi Arabia)
112
+ | "ar-EG" // Arabic (Egypt)
113
+ | "ar" // Arabic (generic)
114
+
115
+ // Hindi
116
+ | "hi-IN" // Hindi (India)
117
+ | "hi" // Hindi (generic)
118
+
119
+ // Allow custom BCP-47 language tags
120
+ | string;
121
+
122
+ /**
123
+ * Script system identifiers
124
+ */
125
+ export type ScriptSystem =
126
+ | "thai" // Thai script
127
+ | "hiragana" // Japanese hiragana
128
+ | "katakana" // Japanese katakana
129
+ | "kanji" // Japanese kanji (characters of Chinese origin)
130
+ | "hanzi" // Chinese characters
131
+ | "hangul" // Korean hangul
132
+ | "latin" // Latin alphabet
133
+ | "mixed" // Mixed scripts
134
+ | string; // Allow other scripts
135
+
136
+ // ============================================================================
137
+ // Extras Field Types for i18n and Extensions
138
+ // ============================================================================
139
+
140
+ /**
141
+ * Quick translations keyed by BCP-47 language tag (e.g. "en-US", "th")
142
+ */
143
+ export type QuickTranslations = {
144
+ /** English translations */
145
+ "en-US"?: string; // English (United States)
146
+ "en-GB"?: string; // English (United Kingdom)
147
+ "en"?: string; // English (generic)
148
+
149
+ /** Thai translations */
150
+ "th-TH"?: string; // Thai (Thailand)
151
+ "th"?: string; // Thai (generic)
152
+
153
+ /** Japanese translations */
154
+ "ja-JP"?: string; // Japanese (Japan)
155
+ "ja"?: string; // Japanese (generic)
156
+
157
+ /** Chinese translations */
158
+ "zh-CN"?: string; // Chinese (Simplified, China)
159
+ "zh-TW"?: string; // Chinese (Traditional, Taiwan)
160
+ "zh"?: string; // Chinese (generic)
161
+
162
+ /** Korean translations */
163
+ "ko-KR"?: string; // Korean (South Korea)
164
+ "ko"?: string; // Korean (generic)
165
+
166
+ /** French translations */
167
+ "fr-FR"?: string; // French (France)
168
+ "fr-CA"?: string; // French (Canada)
169
+ "fr"?: string; // French (generic)
170
+
171
+ /** German translations */
172
+ "de-DE"?: string; // German (Germany)
173
+ "de"?: string; // German (generic)
174
+
175
+ /** Spanish translations */
176
+ "es-ES"?: string; // Spanish (Spain)
177
+ "es-MX"?: string; // Spanish (Mexico)
178
+ "es"?: string; // Spanish (generic)
179
+
180
+ /** Custom language translations using BCP-47 format */
181
+ [lang: string]: string | undefined;
182
+ };
183
+
184
+ /**
185
+ * Extended metadata for enhanced functionality
186
+ */
187
+ export type ExtendedMetadata = {
188
+ /** Quick translations in multiple languages */
189
+ translations?: QuickTranslations;
190
+ /** Difficulty level for learners */
191
+ difficulty?: "beginner" | "intermediate" | "advanced";
192
+ /** Frequency in common usage */
193
+ frequency?: "rare" | "uncommon" | "common" | "very-common";
194
+ /** Cultural notes */
195
+ culturalNotes?: string;
196
+ /** Related words or concepts */
197
+ related?: string[];
198
+ /** Example sentences */
199
+ examples?: string[];
200
+ /** Custom extensions */
201
+ [key: string]: any;
202
+ };
203
+
204
+ /**
205
+ * Extras field for extending GLOST nodes
206
+ */
207
+ export type GLOSTExtras = {
208
+ /** Quick translations */
209
+ translations?: QuickTranslations;
210
+ /** Extended metadata */
211
+ metadata?: ExtendedMetadata;
212
+ /** Custom extensions */
213
+ [key: string]: any;
214
+ };
215
+
216
+ // ============================================================================
217
+ // Transcription and Pronunciation Types
218
+ // ============================================================================
219
+
220
+ /**
221
+ * Pronunciation variant for a text segment
222
+ */
223
+ export type PronunciationVariant = {
224
+ /** The variant text in the transcription system */
225
+ text: string;
226
+ /** Context where this variant is used */
227
+ context: PronunciationContext;
228
+ /** Additional notes about this variant */
229
+ notes?: string;
230
+ };
231
+
232
+ /**
233
+ * Transcription information for a text segment
234
+ */
235
+ export type TranscriptionInfo = {
236
+ /** The transcription text */
237
+ text: string;
238
+ /** The transcription system used */
239
+ system: TranscriptionSystem;
240
+ /** Pronunciation variants */
241
+ variants?: PronunciationVariant[];
242
+ /** Tone information (for tonal languages) */
243
+ tone?: number;
244
+ /** Syllable breakdown */
245
+ syllables?: string[];
246
+ /** Additional phonetic information */
247
+ phonetic?: string;
248
+ };
249
+
250
+ /**
251
+ * Complete transliteration data for a text segment
252
+ */
253
+ export type TransliterationData = {
254
+ /** Map of transcription systems to their data */
255
+ [system: string]: TranscriptionInfo;
256
+ };
257
+
258
+ // ============================================================================
259
+ // Linguistic Metadata Types
260
+ // ============================================================================
261
+
262
+ /**
263
+ * Linguistic metadata for a text segment
264
+ */
265
+ export type LinguisticMetadata = {
266
+ /** @deprecated Use extras.translations instead */
267
+ meaning?: string;
268
+ /** Part of speech */
269
+ partOfSpeech: string;
270
+ /** Usage notes */
271
+ usage?: string;
272
+ /** Etymology information */
273
+ etymology?: string;
274
+ /** Example usage */
275
+ examples?: string[];
276
+ /** Frequency information */
277
+ frequency?: "high" | "medium" | "low";
278
+ /** Formality level */
279
+ formality?: "formal" | "neutral" | "informal";
280
+ /** Register (academic, colloquial, etc.) */
281
+ register?: string;
282
+ /** @deprecated Use extras.translations instead */
283
+ shortDefinition?: string;
284
+ /** @deprecated Use extras.translations instead */
285
+ fullDefinition?: string;
286
+ /** @deprecated Use metadata enrichment extensions instead */
287
+ difficulty?: "beginner" | "intermediate" | "advanced";
288
+ };
289
+
290
+ // ============================================================================
291
+ // Extended Node Types
292
+ // ============================================================================
293
+
294
+ /**
295
+ * Union type for all GLOST nodes
296
+ */
297
+ export type GLOSTNode =
298
+ | GLOSTWord
299
+ | GLOSTSentence
300
+ | GLOSTParagraph
301
+ | GLOSTRoot
302
+ | GLOSTText
303
+ | GLOSTSymbol
304
+ | GLOSTPunctuation
305
+ | GLOSTWhiteSpace
306
+ | GLOSTSource
307
+ // New transformer node types
308
+ | GLOSTClause
309
+ | GLOSTPhrase
310
+ | GLOSTSyllable
311
+ | GLOSTCharacter;
312
+
313
+ /**
314
+ * GLOST nodes that extend nlcst Literal (have a value property)
315
+ */
316
+ export type GLOSTLiteral = NlcstLiteral & {
317
+ /** Language code for this node */
318
+ lang?: LanguageCode;
319
+ /** Script system used */
320
+ script?: ScriptSystem;
321
+ /** Linguistic level of this segment */
322
+ level?: LinguisticLevel;
323
+ /** Extras field for extensions */
324
+ extras?: GLOSTExtras;
325
+ };
326
+
327
+ /**
328
+ * GLOST Punctuation node (extends nlcst PunctuationNode)
329
+ */
330
+ export type GLOSTPunctuation = NlcstPunctuation & {};
331
+
332
+ /**
333
+ * GLOST WhiteSpace node (extends nlcst WhiteSpaceNode)
334
+ */
335
+ export type GLOSTWhiteSpace = NlcstWhiteSpace & {};
336
+
337
+ /**
338
+ * GLOST Symbol node (extends nlcst SymbolNode)
339
+ */
340
+ export type GLOSTSymbol = NlcstSymbol & {};
341
+
342
+ /**
343
+ * GLOST Text node (extends nlcst TextNode)
344
+ */
345
+ export type GLOSTText = NlcstText & {
346
+ // May later carry character-level information
347
+ };
348
+
349
+ /**
350
+ * GLOST Source node (extends nlcst SourceNode)
351
+ */
352
+ export type GLOSTSource = NlcstSource & {
353
+
354
+ };
355
+
356
+ /**
357
+ * Extended word node with transcription support
358
+ * Extends nlcst WordNode and adds GLOST-specific properties
359
+ */
360
+ export type GLOSTWord = Omit<NlcstWord, "children"> & {
361
+ /** Transcription data */
362
+ transcription: TransliterationData;
363
+ /** Linguistic metadata */
364
+ metadata: LinguisticMetadata;
365
+ /** @deprecated Use extras.translations instead */
366
+ shortDefinition?: string;
367
+ /** @deprecated Use extras.translations instead */
368
+ fullDefinition?: string;
369
+ /** @deprecated Use metadata enrichment extensions instead */
370
+ difficulty?: "beginner" | "intermediate" | "advanced";
371
+ /** Language code for this node */
372
+ lang?: LanguageCode;
373
+ /** Script system used */
374
+ script?: ScriptSystem;
375
+ /** Linguistic level of this segment */
376
+ level?: LinguisticLevel;
377
+ /** Extras field for extensions */
378
+ extras?: GLOSTExtras;
379
+ /** Children nodes - expected to contain at least one Text node (not enforced by the type) */
380
+ children: GLOSTWordContent[];
381
+ };
382
+
383
+ /**
384
+ * Extended sentence node
385
+ * Extends nlcst SentenceNode and adds GLOST-specific properties
386
+ */
387
+ export type GLOSTSentence = Omit<NlcstSentence, "children"> & {
388
+ /** Language of the sentence */
389
+ lang: LanguageCode;
390
+ /** Script system used */
391
+ script: ScriptSystem;
392
+ /** Original text */
393
+ originalText: string;
394
+ /** Transcription data for the entire sentence */
395
+ transcription?: TransliterationData;
396
+ /** Extras field for extensions */
397
+ extras?: GLOSTExtras;
398
+ /** Children nodes - clauses, words, punctuation, symbols, white space, or source (see GLOSTSentenceContent) */
399
+ children: GLOSTSentenceContent[];
400
+ };
401
+
402
+ /**
403
+ * Extended paragraph node
404
+ */
405
+ export type GLOSTParagraph = Omit<NlcstParagraph, "children"> & {
406
+ /** Language of the paragraph */
407
+ lang?: LanguageCode;
408
+ /** Script system used */
409
+ script?: ScriptSystem;
410
+ /** Extras field for extensions */
411
+ extras?: GLOSTExtras;
412
+ /** Children nodes - must be nlcst-compliant */
413
+ children: GLOSTParagraphContent[];
414
+ };
415
+
416
+ /**
417
+ * Extended root node
418
+ */
419
+ export type GLOSTRoot = Omit<NlcstRoot, "children"> & {
420
+ /** Primary language of the document */
421
+ lang: LanguageCode;
422
+ /** Primary script system */
423
+ script: ScriptSystem;
424
+ /** Extras field for extensions */
425
+ extras?: GLOSTExtras;
426
+ /** Document metadata */
427
+ metadata?: {
428
+ title?: string;
429
+ author?: string;
430
+ date?: string;
431
+ description?: string;
432
+ };
433
+ /** Children nodes - must be nlcst-compliant */
434
+ children: GLOSTRootContent[];
435
+ };
436
+
437
+ // ============================================================================
438
+ // Transformer Node Types
439
+ // ============================================================================
440
+
441
+ /**
442
+ * Clause node - represents grammatical clauses within sentences
443
+ * Created by ClauseSegmenterExtension transformer
444
+ */
445
+ export type GLOSTClause = {
446
+ type: "ClauseNode";
447
+ /** Type of clause */
448
+ clauseType: "main" | "subordinate" | "relative" | "adverbial";
449
+ /** Children nodes - phrases, words, or punctuation */
450
+ children: (GLOSTPhrase | GLOSTWord | GLOSTPunctuation)[];
451
+ /** Language code for this clause */
452
+ lang?: LanguageCode;
453
+ /** Script system used */
454
+ script?: ScriptSystem;
455
+ /** Extras field for extensions */
456
+ extras?: GLOSTExtras & {
457
+ /** Whether this clause has been negated */
458
+ isNegated?: boolean;
459
+ /** Grammatical mood */
460
+ mood?: "declarative" | "interrogative" | "imperative" | "conditional";
461
+ /** Tense information */
462
+ tense?: string;
463
+ /** Original form before transformation */
464
+ originalForm?: string;
465
+ };
466
+ };
467
+
468
+ /**
469
+ * Phrase node - groups words into grammatical phrases
470
+ * Created by PhraseSegmenterExtension transformer
471
+ */
472
+ export type GLOSTPhrase = {
473
+ type: "PhraseNode";
474
+ /** Type of phrase */
475
+ phraseType: "noun" | "verb" | "prepositional" | "adjectival" | "adverbial";
476
+ /** Main word of the phrase (head) */
477
+ headWord?: string;
478
+ /** Children nodes - words or punctuation */
479
+ children: (GLOSTWord | GLOSTPunctuation)[];
480
+ /** Language code for this phrase */
481
+ lang?: LanguageCode;
482
+ /** Script system used */
483
+ script?: ScriptSystem;
484
+ /** Extras field for extensions */
485
+ extras?: GLOSTExtras & {
486
+ /** Grammatical role in the clause/sentence */
487
+ role?: "subject" | "object" | "complement" | "modifier";
488
+ };
489
+ };
490
+
491
+ /**
492
+ * Syllable node - represents phonological syllable structure
493
+ * Created by SyllableSegmenterExtension transformer (language-specific)
494
+ */
495
+ export type GLOSTSyllable = {
496
+ type: "SyllableNode";
497
+ /** Syllable structure information */
498
+ structure: {
499
+ /** Initial consonant(s) - Generic (e.g., "h" in "hello") */
500
+ onset?: string;
501
+ /** Vowel - Generic (e.g., "e" in "hello") */
502
+ nucleus: string;
503
+ /** Final consonant(s) - Generic (e.g., "l" in "hello") */
504
+ coda?: string;
505
+
506
+ // Thai-specific structure (optional)
507
+ /** Initial consonant (Thai: พยัญชนะต้น) */
508
+ Ci?: string;
509
+ /** Vowel (Thai: สระ) */
510
+ V?: string;
511
+ /** Final consonant (Thai: ตัวสะกด) */
512
+ Cf?: string;
513
+ /** Tone mark (Thai: วรรณยุกต์) */
514
+ T?: string;
515
+ };
516
+ /** Children nodes - individual characters */
517
+ children: GLOSTCharacter[];
518
+ /** Language code for this syllable */
519
+ lang?: LanguageCode;
520
+ /** Script system used */
521
+ script?: ScriptSystem;
522
+ /** Tone number (for tonal languages like Thai, Mandarin) */
523
+ tone?: number;
524
+ /** Stress level (for stress languages like English) */
525
+ stress?: "primary" | "secondary" | "unstressed";
526
+ /** Extras field for extensions */
527
+ extras?: GLOSTExtras;
528
+ };
529
+
530
+ /**
531
+ * Character node - represents individual characters with linguistic roles
532
+ * Created by SyllableSegmenterExtension or CharacterSegmenterExtension
533
+ */
534
+ export type GLOSTCharacter = {
535
+ type: "CharacterNode";
536
+ /** The character value (single character) */
537
+ value: string;
538
+ /** Linguistic role of the character */
539
+ role?: "consonant" | "vowel" | "tone" | "diacritic" | "modifier";
540
+ /** Placement in the syllable/word (renamed from 'position' to avoid conflict with unist Position) */
541
+ placement?: "initial" | "medial" | "final" | "above" | "below" | "before" | "after";
542
+ /** Language code for this character */
543
+ lang?: LanguageCode;
544
+ /** Script system used */
545
+ script?: ScriptSystem;
546
+ /** Extras field for extensions */
547
+ extras?: GLOSTExtras & {
548
+ /** Unicode code point (e.g., "U+0E04") */
549
+ unicode?: string;
550
+ /** Thai consonant class (high/mid/low) */
551
+ class?: "high" | "mid" | "low";
552
+ /** Phonological sound class */
553
+ soundClass?: string;
554
+ };
555
+ };
556
+
557
+ export type GLOSTRootContent = GLOSTParagraph | GLOSTSentence | GLOSTWord | GLOSTText | GLOSTSymbol | GLOSTPunctuation | GLOSTWhiteSpace | GLOSTSource;
558
+ export type GLOSTParagraphContent = GLOSTSentence | GLOSTPunctuation | GLOSTSymbol | GLOSTWhiteSpace | GLOSTSource;
559
+ export type GLOSTSentenceContent = GLOSTClause | GLOSTWord | GLOSTPunctuation | GLOSTSymbol | GLOSTWhiteSpace | GLOSTSource;
560
+ export type GLOSTWordContent = GLOSTSyllable | GLOSTText | GLOSTSymbol | GLOSTPunctuation | GLOSTWhiteSpace | GLOSTSource;
561
+ // ============================================================================
562
+ // Utility Types
563
+ // ============================================================================
564
+
565
+ // Type guards are now implemented in utils.ts using unist-util-is