glost-core 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. package/CHANGELOG.md +63 -0
  2. package/LICENSE +21 -0
  3. package/README.md +199 -0
  4. package/dist/__benchmarks__/document-creation.bench.d.ts +7 -0
  5. package/dist/__benchmarks__/document-creation.bench.d.ts.map +1 -0
  6. package/dist/__benchmarks__/document-creation.bench.js +71 -0
  7. package/dist/__benchmarks__/document-creation.bench.js.map +1 -0
  8. package/dist/__benchmarks__/traversal.bench.d.ts +7 -0
  9. package/dist/__benchmarks__/traversal.bench.d.ts.map +1 -0
  10. package/dist/__benchmarks__/traversal.bench.js +124 -0
  11. package/dist/__benchmarks__/traversal.bench.js.map +1 -0
  12. package/dist/cli/migrate.d.ts +8 -0
  13. package/dist/cli/migrate.d.ts.map +1 -0
  14. package/dist/cli/migrate.js +229 -0
  15. package/dist/cli/migrate.js.map +1 -0
  16. package/dist/errors.d.ts +168 -0
  17. package/dist/errors.d.ts.map +1 -0
  18. package/dist/errors.js +300 -0
  19. package/dist/errors.js.map +1 -0
  20. package/dist/guards.d.ts +103 -0
  21. package/dist/guards.d.ts.map +1 -0
  22. package/dist/guards.js +264 -0
  23. package/dist/guards.js.map +1 -0
  24. package/dist/index.d.ts +9 -0
  25. package/dist/index.d.ts.map +1 -0
  26. package/dist/index.js +25 -0
  27. package/dist/index.js.map +1 -0
  28. package/dist/nodes.d.ts +227 -0
  29. package/dist/nodes.d.ts.map +1 -0
  30. package/dist/nodes.js +243 -0
  31. package/dist/nodes.js.map +1 -0
  32. package/dist/types.d.ts +442 -0
  33. package/dist/types.d.ts.map +1 -0
  34. package/dist/types.js +51 -0
  35. package/dist/types.js.map +1 -0
  36. package/dist/utils.d.ts +247 -0
  37. package/dist/utils.d.ts.map +1 -0
  38. package/dist/utils.js +564 -0
  39. package/dist/utils.js.map +1 -0
  40. package/dist/validators.d.ts +1876 -0
  41. package/dist/validators.d.ts.map +1 -0
  42. package/dist/validators.js +302 -0
  43. package/dist/validators.js.map +1 -0
  44. package/package.json +73 -0
  45. package/src/__benchmarks__/document-creation.bench.ts +92 -0
  46. package/src/__benchmarks__/traversal.bench.ts +152 -0
  47. package/src/__tests__/README.md +20 -0
  48. package/src/__tests__/example.test.ts +43 -0
  49. package/src/__tests__/example.ts +186 -0
  50. package/src/__tests__/helpers.test.ts +178 -0
  51. package/src/__tests__/mock-data.ts +624 -0
  52. package/src/__tests__/performance.test.ts +317 -0
  53. package/src/__tests__/traversal.test.ts +170 -0
  54. package/src/cli/migrate.ts +294 -0
  55. package/src/errors.ts +394 -0
  56. package/src/guards.ts +341 -0
  57. package/src/index.ts +69 -0
  58. package/src/nodes.ts +409 -0
  59. package/src/types.ts +633 -0
  60. package/src/utils.ts +730 -0
  61. package/src/validators.ts +336 -0
  62. package/tsconfig.json +9 -0
package/src/utils.ts ADDED
@@ -0,0 +1,730 @@
1
+ import { is as isNode } from "unist-util-is";
2
+ import { visit, SKIP } from "unist-util-visit";
3
+
4
+ import type {
5
+ LanguageCode,
6
+ GLOSTCharacter,
7
+ GLOSTClause,
8
+ GLOSTNode,
9
+ GLOSTParagraph,
10
+ GLOSTPhrase,
11
+ GLOSTRoot,
12
+ GLOSTSentence,
13
+ GLOSTSyllable,
14
+ GLOSTWord,
15
+ TranscriptionSystem,
16
+ } from "./types.js";
17
+
18
+ // ============================================================================
19
+ // BCP-47 Language Tag Utilities
20
+ // ============================================================================
21
+
22
/**
 * Parse a BCP-47 language tag into its components.
 *
 * Expected layout: `language[-script][-region][-variant...]`.
 * Subtags after the language are classified by shape rather than by position:
 * - script: exactly four letters (e.g. `Hans`)
 * - region: two or three letters, or three digits (e.g. `US`, `419`)
 * - variant: everything that remains, re-joined with `-`
 *
 * Classifying by shape keeps variants such as `de-1996` or `sl-rozaj` from
 * being reported as regions. Empty subtags (from doubled or trailing dashes)
 * are ignored.
 *
 * @param tag - Language tag to parse; it is not validated
 * @returns The parsed components. `script`, `region` and `variant` are
 *   `undefined` when absent; `fullTag` is the input unchanged.
 */
export function parseLanguageTag(tag: string): {
  language: string;
  script?: string;
  region?: string;
  variant?: string;
  fullTag: string;
} {
  const [language = "", ...rawRest] = tag.split("-");
  const rest = rawRest.filter((subtag) => subtag.length > 0);

  const result = {
    language,
    script: undefined as string | undefined,
    region: undefined as string | undefined,
    variant: undefined as string | undefined,
    fullTag: tag,
  };

  let index = 0;

  // A script subtag is exactly four ASCII letters.
  const scriptCandidate = rest[index];
  if (scriptCandidate !== undefined && /^[A-Za-z]{4}$/.test(scriptCandidate)) {
    result.script = scriptCandidate;
    index += 1;
  }

  // A region subtag is 2-3 letters or a 3-digit numeric code.
  const regionCandidate = rest[index];
  if (
    regionCandidate !== undefined &&
    /^(?:[A-Za-z]{2,3}|\d{3})$/.test(regionCandidate)
  ) {
    result.region = regionCandidate;
    index += 1;
  }

  // Whatever is left is reported verbatim as the variant part.
  const remainder = rest.slice(index).join("-");
  if (remainder) {
    result.variant = remainder;
  }

  return result;
}
65
+
66
/**
 * Return the primary language subtag of a BCP-47 tag, i.e. the text before
 * the first `-`.
 *
 * If that leading segment is empty (for example the tag starts with `-`),
 * the whole tag is returned unchanged.
 *
 * @example
 * getBaseLanguage("en-US"); // "en"
 * getBaseLanguage("zh-CN"); // "zh"
 */
export function getBaseLanguage(tag: string): string {
  const dashAt = tag.indexOf("-");
  const head = dashAt === -1 ? tag : tag.slice(0, dashAt);
  return head || tag;
}
74
+
75
/**
 * Check whether two language tags share the same base language.
 *
 * The comparison ignores case, because BCP-47 tags are case-insensitive:
 * `"EN-us"` and `"en-GB"` are compatible.
 *
 * @param tag1 - First language tag
 * @param tag2 - Second language tag
 * @returns `true` when both tags have the same base language
 *
 * @example
 * areLanguagesCompatible("en-US", "en-GB"); // true
 * areLanguagesCompatible("en-US", "fr-FR"); // false
 */
export function areLanguagesCompatible(tag1: string, tag2: string): boolean {
  const base1 = getBaseLanguage(tag1).toLowerCase();
  const base2 = getBaseLanguage(tag2).toLowerCase();
  return base1 === base2;
}
82
+
83
/**
 * Pick the best available language tag for a requested one.
 *
 * Preference order:
 * 1. an entry identical to `target`
 * 2. the first entry with the same language AND the same region
 * 3. the first entry with the same language
 *
 * @param target - Requested language tag
 * @param available - Candidate tags, in priority order
 * @returns The chosen entry from `available`, or `null` when nothing shares
 *   the target's language
 */
export function findBestLanguageMatch(
  target: string,
  available: string[],
): string | null {
  if (available.includes(target)) {
    return target;
  }

  const wanted = parseLanguageTag(target);

  // Every candidate for steps 2 and 3 must at least share the language.
  const sameLanguage = available.filter(
    (option) => parseLanguageTag(option).language === wanted.language,
  );
  const sameRegion = sameLanguage.find(
    (option) => parseLanguageTag(option).region === wanted.region,
  );

  return sameRegion ?? sameLanguage[0] ?? null;
}
118
+
119
/**
 * Return the fallback tag to use when an exact tag is unavailable: the
 * `language` component produced by `parseLanguageTag`.
 *
 * @example
 * getLanguageFallback("en-US"); // "en"
 * getLanguageFallback("zh-CN"); // "zh"
 */
export function getLanguageFallback(tag: string): string {
  const { language } = parseLanguageTag(tag);
  return language;
}
127
+
128
/**
 * Normalize the casing of a language tag to the conventional BCP-47 form.
 *
 * - language: lowercase (`EN` -> `en`)
 * - script (four letters, directly after the language): title case (`hans` -> `Hans`)
 * - region (2-3 letters or 3 digits, after the language or script): uppercase (`us` -> `US`)
 * - every remaining subtag (variants etc.): lowercase
 *
 * A subtag is upper-cased only when it has the shape of a region, so variants
 * such as `rozaj` in `sl-rozaj` stay lowercase. The number and order of
 * subtags are preserved; only casing changes.
 *
 * @param tag - Language tag in any casing
 * @returns The tag with normalized casing
 */
export function normalizeLanguageTag(tag: string): string {
  const [rawLanguage = "", ...rest] = tag.split("-");
  const normalized: string[] = [rawLanguage.toLowerCase()];

  let index = 0;

  // Optional script: exactly four ASCII letters, written in title case.
  const script = rest[index];
  if (script !== undefined && /^[A-Za-z]{4}$/.test(script)) {
    normalized.push(
      script.charAt(0).toUpperCase() + script.slice(1).toLowerCase(),
    );
    index += 1;
  }

  // Optional region: 2-3 letters or a 3-digit code, written in uppercase.
  const region = rest[index];
  if (region !== undefined && /^(?:[A-Za-z]{2,3}|\d{3})$/.test(region)) {
    normalized.push(region.toUpperCase());
    index += 1;
  }

  // Everything else is treated as a variant and lower-cased.
  for (const variant of rest.slice(index)) {
    normalized.push(variant.toLowerCase());
  }

  return normalized.join("-");
}
173
+
174
/**
 * Check whether a tag matches a basic BCP-47 shape.
 *
 * This is a pattern check only: a lowercase 2-3 letter language, then an
 * optional 4-letter script, an optional 2-3 letter region, and any number of
 * alphanumeric subtags, all separated by `-`.
 *
 * @param tag - Tag to test
 * @returns `true` when the tag matches the pattern
 */
export function isValidLanguageTag(tag: string): boolean {
  return /^[a-z]{2,3}(-[A-Za-z]{4})?(-[A-Za-z]{2,3})?(-[A-Za-z0-9]+)*$/.test(
    tag,
  );
}
183
+
184
+ // ============================================================================
185
+ // Enhanced Tree Traversal Utilities (using unist-util-visit)
186
+ // ============================================================================
187
+
188
/**
 * Collect every word node found in a GLOST tree.
 *
 * Visits nodes of type `"WordNode"` and keeps those that pass `isGLOSTWord`.
 *
 * @param node - Root of the (sub)tree to search
 * @returns Matching word nodes in visiting order
 */
export function getAllWords(node: GLOSTNode): GLOSTWord[] {
  const collected: GLOSTWord[] = [];

  visit(node, "WordNode", (candidate) => {
    if (!isGLOSTWord(candidate)) {
      return;
    }
    collected.push(candidate);
  });

  return collected;
}
202
+
203
/**
 * Get the first word from a document.
 *
 * Returns the first `"WordNode"` reached by the traversal that passes
 * `isGLOSTWord`, and stops walking the tree as soon as it is found.
 *
 * @param document - GLOST document root
 * @returns First word node, or `undefined` if the document has no words
 *
 * @example
 * ```typescript
 * const firstWord = getFirstWord(doc);
 * if (firstWord) {
 *   console.log(getWordText(firstWord));
 * }
 * ```
 */
export function getFirstWord(document: GLOSTRoot): GLOSTWord | undefined {
  let firstWord: GLOSTWord | undefined;

  visit(document, "WordNode", (wordNode) => {
    if (!isGLOSTWord(wordNode)) {
      return undefined; // not a usable word: keep traversing
    }
    firstWord = wordNode;
    // `false` is unist-util-visit's EXIT action: stop the whole traversal.
    // (SKIP would only skip this node's children and keep walking the tree.)
    return false;
  });

  return firstWord;
}
233
+
234
/**
 * Look up a word by its position in the document hierarchy.
 *
 * Follows `document.children[paragraph].children[sentence].children[word]`
 * and checks the node type at every level (`"ParagraphNode"`,
 * `"SentenceNode"`, `"WordNode"`).
 *
 * @param document - GLOST document root
 * @param path - 0-based paragraph, sentence and word indices
 * @returns The word node at that path, or `undefined` when an index is out
 *   of range or a node on the way has a different type
 *
 * @example
 * ```typescript
 * // First word of the second sentence in the first paragraph
 * const word = getWordAtPath(doc, { paragraph: 0, sentence: 1, word: 0 });
 * ```
 */
export function getWordAtPath(
  document: GLOSTRoot,
  path: { paragraph: number; sentence: number; word: number }
): GLOSTWord | undefined {
  const paragraphNode = document.children[path.paragraph];
  if (paragraphNode?.type !== "ParagraphNode") {
    return undefined;
  }

  const sentenceNode = (paragraphNode as GLOSTParagraph).children[path.sentence];
  if (sentenceNode?.type !== "SentenceNode") {
    return undefined;
  }

  const wordNode = (sentenceNode as GLOSTSentence).children[path.word];
  return wordNode?.type === "WordNode" ? (wordNode as GLOSTWord) : undefined;
}
280
+
281
/**
 * Collect every sentence node found in a GLOST tree.
 *
 * Visits nodes of type `"SentenceNode"` and keeps those that pass
 * `isGLOSTSentence`.
 *
 * @param node - Root of the (sub)tree to search
 * @returns Matching sentence nodes in visiting order
 */
export function getAllSentences(node: GLOSTNode): GLOSTSentence[] {
  const found: GLOSTSentence[] = [];

  visit(node, "SentenceNode", (candidate) => {
    if (!isGLOSTSentence(candidate)) {
      return;
    }
    found.push(candidate);
  });

  return found;
}
295
+
296
/**
 * Collect every paragraph node found in a GLOST tree.
 *
 * Visits nodes of type `"ParagraphNode"` and keeps those that pass
 * `isGLOSTParagraph`.
 *
 * @param node - Root of the (sub)tree to search
 * @returns Matching paragraph nodes in visiting order
 */
export function getAllParagraphs(node: GLOSTNode): GLOSTParagraph[] {
  const found: GLOSTParagraph[] = [];

  visit(node, "ParagraphNode", (candidate) => {
    if (!isGLOSTParagraph(candidate)) {
      return;
    }
    found.push(candidate);
  });

  return found;
}
310
+
311
/**
 * Collect every clause node found in a GLOST tree.
 *
 * Visits nodes of type `"ClauseNode"` and keeps those that pass
 * `isGLOSTClause`.
 *
 * @param node - Root of the (sub)tree to search
 * @returns Matching clause nodes in visiting order
 */
export function getAllClauses(node: GLOSTNode): GLOSTClause[] {
  const found: GLOSTClause[] = [];

  visit(node, "ClauseNode", (candidate) => {
    if (!isGLOSTClause(candidate)) {
      return;
    }
    found.push(candidate);
  });

  return found;
}
325
+
326
/**
 * Collect every phrase node found in a GLOST tree.
 *
 * Visits nodes of type `"PhraseNode"` and keeps those that pass
 * `isGLOSTPhrase`.
 *
 * @param node - Root of the (sub)tree to search
 * @returns Matching phrase nodes in visiting order
 */
export function getAllPhrases(node: GLOSTNode): GLOSTPhrase[] {
  const found: GLOSTPhrase[] = [];

  visit(node, "PhraseNode", (candidate) => {
    if (!isGLOSTPhrase(candidate)) {
      return;
    }
    found.push(candidate);
  });

  return found;
}
340
+
341
/**
 * Collect every syllable node found in a GLOST tree.
 *
 * Visits nodes of type `"SyllableNode"` and keeps those that pass
 * `isGLOSTSyllable`.
 *
 * @param node - Root of the (sub)tree to search
 * @returns Matching syllable nodes in visiting order
 */
export function getAllSyllables(node: GLOSTNode): GLOSTSyllable[] {
  const found: GLOSTSyllable[] = [];

  visit(node, "SyllableNode", (candidate) => {
    if (!isGLOSTSyllable(candidate)) {
      return;
    }
    found.push(candidate);
  });

  return found;
}
355
+
356
/**
 * Collect every character node found in a GLOST tree.
 *
 * Visits nodes of type `"CharacterNode"` and keeps those that pass
 * `isGLOSTCharacter`.
 *
 * @param node - Root of the (sub)tree to search
 * @returns Matching character nodes in visiting order
 */
export function getAllCharacters(node: GLOSTNode): GLOSTCharacter[] {
  const found: GLOSTCharacter[] = [];

  visit(node, "CharacterNode", (candidate) => {
    if (!isGLOSTCharacter(candidate)) {
      return;
    }
    found.push(candidate);
  });

  return found;
}
370
+
371
/**
 * Collect all nodes of a given `type` from a GLOST tree.
 *
 * The result is cast to `T` without any runtime check beyond the type-name
 * match done by `visit`, so the caller is responsible for choosing a `T` that
 * fits `type`.
 *
 * @param node - Root of the (sub)tree to search
 * @param type - Node type name to match (e.g. `"WordNode"`)
 * @returns Every visited node of that type, in visiting order
 */
export function findNodesByType<T extends GLOSTNode>(
  node: GLOSTNode,
  type: string,
): T[] {
  const matches: T[] = [];

  visit(node, type, (match) => {
    const typed = match as T;
    matches.push(typed);
  });

  return matches;
}
386
+
387
+ // ============================================================================
388
+ // New Utilities for Transcription Components
389
+ // ============================================================================
390
+
391
/**
 * Get every word node in a document.
 *
 * Thin wrapper around `getAllWords` that takes a document root.
 *
 * @param doc - GLOST document root
 * @returns All word nodes found under `doc`
 */
export function getWordsFromDocument(doc: GLOSTRoot): GLOSTWord[] {
  const documentWords = getAllWords(doc);
  return documentWords;
}
397
+
398
/**
 * Get the first sentence of the first paragraph in a document.
 *
 * Only the first paragraph is inspected: if it contains no sentences the
 * result is `null`, even when later paragraphs do.
 *
 * @param doc - GLOST document root
 * @returns The first sentence node, or `null` if there is no paragraph or the
 *   first paragraph has no sentences
 */
export function getFirstSentence(doc: GLOSTRoot): GLOSTSentence | null {
  const [leadingParagraph] = getAllParagraphs(doc);
  if (!leadingParagraph) {
    return null;
  }

  const [leadingSentence] = getAllSentences(leadingParagraph);
  return leadingSentence ?? null;
}
414
+
415
/**
 * Get every word node contained in a sentence.
 *
 * Thin wrapper around `getAllWords` that takes a sentence node.
 *
 * @param sentence - Sentence node to search
 * @returns All word nodes found under `sentence`
 */
export function getWordsFromSentence(sentence: GLOSTSentence): GLOSTWord[] {
  const sentenceWords = getAllWords(sentence);
  return sentenceWords;
}
421
+
422
/**
 * Get every word node contained in a paragraph.
 *
 * Visits nodes of type `"WordNode"` under the paragraph and keeps those that
 * pass `isGLOSTWord`.
 *
 * @param paragraph - Paragraph node to search
 * @returns Matching word nodes in visiting order
 */
export function getWordsFromParagraph(paragraph: GLOSTParagraph): GLOSTWord[] {
  const paragraphWords: GLOSTWord[] = [];

  visit(paragraph, "WordNode", (candidate) => {
    if (!isGLOSTWord(candidate)) {
      return;
    }
    paragraphWords.push(candidate);
  });

  return paragraphWords;
}
436
+
437
/**
 * Find the word nodes whose `lang` is exactly the given language code.
 *
 * The comparison is strict equality; no tag normalization or base-language
 * matching is applied.
 *
 * @param node - Root of the (sub)tree to search
 * @param lang - Language code to match
 * @returns Word nodes with `word.lang === lang`
 */
export function findWordsByLanguage(
  node: GLOSTNode,
  lang: LanguageCode,
): GLOSTWord[] {
  return getAllWords(node).filter(({ lang: wordLang }) => wordLang === lang);
}
447
+
448
/**
 * Find the word nodes that carry a transcription entry for a given system.
 *
 * A word matches when `word.transcription[system]` is truthy; the entry's
 * contents are not inspected.
 *
 * @param node - Root of the (sub)tree to search
 * @param system - Transcription system key to look for
 * @returns Word nodes that have an entry for `system`
 */
export function findWordsByTranscriptionSystem(
  node: GLOSTNode,
  system: TranscriptionSystem,
): GLOSTWord[] {
  const hasEntry = (word: GLOSTWord): boolean =>
    Boolean(word.transcription?.[system]);

  return getAllWords(node).filter(hasEntry);
}
460
+
461
+ // ============================================================================
462
+ // Enhanced Type Guards (using unist-util-is)
463
+ // ============================================================================
464
+
465
/**
 * Type guard for GLOSTWord nodes.
 *
 * A node qualifies when it is a `"WordNode"` and has both a `transcription`
 * and a `metadata` property (their values are not checked).
 */
export function isGLOSTWord(node: any): node is GLOSTWord {
  if (!isNode(node, "WordNode")) {
    return false;
  }
  return "transcription" in node && "metadata" in node;
}
473
+
474
/**
 * Type guard for GLOSTSentence nodes.
 *
 * A node qualifies when it is a `"SentenceNode"` and has `lang`, `script`
 * and `originalText` properties (their values are not checked).
 */
export function isGLOSTSentence(node: any): node is GLOSTSentence {
  if (!isNode(node, "SentenceNode")) {
    return false;
  }
  return "lang" in node && "script" in node && "originalText" in node;
}
482
+
483
/**
 * Type guard for GLOSTParagraph nodes.
 *
 * Only the node type (`"ParagraphNode"`) is checked.
 */
export function isGLOSTParagraph(node: any): node is GLOSTParagraph {
  const matchesType = isNode(node, "ParagraphNode");
  return matchesType;
}
486
+
487
/**
 * Type guard for GLOSTRoot nodes.
 *
 * A node qualifies when it is a `"Root"` node and has both `lang` and
 * `script` properties (their values are not checked).
 */
export function isGLOSTRoot(node: any): node is GLOSTRoot {
  if (!isNode(node, "Root")) {
    return false;
  }
  return "lang" in node && "script" in node;
}
490
+
491
/**
 * Type guard for GLOSTClause nodes.
 *
 * A node qualifies when it is a `"ClauseNode"` and has a `clauseType`
 * property.
 */
export function isGLOSTClause(node: any): node is GLOSTClause {
  if (!isNode(node, "ClauseNode")) {
    return false;
  }
  return "clauseType" in node;
}
497
+
498
/**
 * Type guard for GLOSTPhrase nodes.
 *
 * A node qualifies when it is a `"PhraseNode"` and has a `phraseType`
 * property.
 */
export function isGLOSTPhrase(node: any): node is GLOSTPhrase {
  if (!isNode(node, "PhraseNode")) {
    return false;
  }
  return "phraseType" in node;
}
504
+
505
/**
 * Type guard for GLOSTSyllable nodes.
 *
 * A node qualifies when it is a `"SyllableNode"` and has a `structure`
 * property.
 */
export function isGLOSTSyllable(node: any): node is GLOSTSyllable {
  if (!isNode(node, "SyllableNode")) {
    return false;
  }
  return "structure" in node;
}
511
+
512
/**
 * Type guard for GLOSTCharacter nodes.
 *
 * A node qualifies when it is a `"CharacterNode"` and has a `value`
 * property.
 */
export function isGLOSTCharacter(node: any): node is GLOSTCharacter {
  if (!isNode(node, "CharacterNode")) {
    return false;
  }
  return "value" in node;
}
518
+
519
+ // ============================================================================
520
+ // Utility Functions for Transcription Components
521
+ // ============================================================================
522
+
523
/**
 * Extract the text value of a word node.
 *
 * Uses the first direct child of type `"TextNode"`; later text children are
 * ignored.
 *
 * @param word - Word node to read
 * @returns That child's `value`, or `""` when there is no text child or its
 *   value is missing
 */
export function getWordText(word: GLOSTWord): string {
  for (const child of word.children) {
    if (child.type === "TextNode") {
      return child.value ?? "";
    }
  }
  return "";
}
530
+
531
/**
 * Get a word's transcription text for a specific system.
 *
 * @param word - Word node to read
 * @param system - Transcription system key
 * @returns `word.transcription[system].text`, or `null` when the word has no
 *   transcription, no entry for `system`, or the entry has no text
 */
export function getWordTranscription(
  word: GLOSTWord,
  system: TranscriptionSystem,
): string | null {
  const entry = word.transcription?.[system];
  const text = entry?.text;
  return text ?? null;
}
540
+
541
/**
 * Check whether a word has non-empty transcription text for a system.
 *
 * @param word - Word node to inspect
 * @param system - Transcription system key
 * @returns `true` only when `word.transcription` exists, contains `system`,
 *   and that entry's `text` is truthy
 */
export function hasWordTranscription(
  word: GLOSTWord,
  system: TranscriptionSystem,
): boolean {
  const { transcription } = word;
  if (!transcription || !(system in transcription)) {
    return false;
  }
  return Boolean(transcription[system]?.text);
}
550
+
551
/**
 * Get a word's translation for a specific language.
 *
 * Reads `word.extras.translations`, first under the exact `language` key and
 * then under its short code (the part before the first `-`, e.g. `"en"` for
 * `"en-US"`).
 *
 * @param word - The word node
 * @param language - Target language code (default: "en-US")
 * @returns The translation, or an empty string if none is found
 */
export function getWordTranslation(
  word: GLOSTWord,
  language = "en-US",
): string {
  const translations = word.extras?.translations;

  const exact = translations?.[language];
  if (exact) {
    return exact;
  }

  const [shortCode] = language.split("-");
  const viaShortCode = shortCode ? translations?.[shortCode] : undefined;
  return viaShortCode || "";
}
572
+
573
/**
 * Get a word's meaning/definition.
 *
 * Lookup order: the "en-US" translation from `getWordTranslation`, then
 * `metadata.meaning`, then `shortDefinition`, then `fullDefinition`.
 *
 * @deprecated Use getWordTranslation for multi-language support.
 * This function is kept for backward compatibility.
 *
 * @param word - The word node
 * @returns The first value found, or an empty string
 */
export function getWordMeaning(word: GLOSTWord): string {
  const preferred = getWordTranslation(word, "en-US");
  if (preferred) {
    return preferred;
  }

  // Deprecated locations, kept for documents written in the older format.
  const legacy =
    word.metadata?.meaning ?? word.shortDefinition ?? word.fullDefinition;
  return legacy ?? "";
}
587
+
588
/**
 * Get a word's part of speech.
 *
 * @param word - The word node
 * @returns `word.metadata.partOfSpeech`, or an empty string when it is not set
 */
export function getWordPartOfSpeech(word: GLOSTWord): string {
  const partOfSpeech = word.metadata?.partOfSpeech;
  return partOfSpeech ?? "";
}
594
+
595
/**
 * Get a word's difficulty.
 *
 * Prefers `word.difficulty`; when that is null or undefined, falls back to
 * `word.extras.metadata.difficulty`.
 *
 * @param word - The word node
 * @returns The difficulty value, or an empty string when neither is set
 */
export function getWordDifficulty(word: GLOSTWord): string | number {
  if (word.difficulty != null) {
    return word.difficulty;
  }
  return word.extras?.metadata?.difficulty ?? "";
}
601
+
602
/**
 * Get a sentence's translation.
 *
 * Returns `sentence.extras.translations[language]` when present. Otherwise
 * builds a rough gloss by joining the non-empty `getWordMeaning` results of
 * the sentence's words with single spaces. Note that this fallback does not
 * take `language` into account.
 *
 * @param sentence - The sentence node
 * @param language - Translation key to look up (default: "en")
 * @returns The translation or gloss, or `null` when neither is available
 */
export function getSentenceTranslation(
  sentence: GLOSTSentence,
  language = "en",
): string | null {
  const stored = sentence.extras?.translations?.[language];
  if (stored) {
    return stored;
  }

  // Fallback: stitch the sentence together from per-word meanings.
  const glosses: string[] = [];
  for (const word of getWordsFromSentence(sentence)) {
    const meaning = getWordMeaning(word);
    if (meaning) {
      glosses.push(meaning);
    }
  }

  const gloss = glosses.join(" ");
  return gloss || null;
}
622
+
623
+ // ============================================================================
624
+ // Content Statistics Utilities
625
+ // ============================================================================
626
+
627
/**
 * Minimal paragraph shape accepted by the word-count helpers.
 *
 * Lets external paragraph structures be converted to GLOST format: each
 * entry carries the sentence text and, optionally, its translation.
 */
export type ParagraphLike = {
  sentences: {
    sentence: string;
    translation?: string;
  }[];
};
637
+
638
/**
 * Convert a paragraph-like structure into a GLOST paragraph node.
 *
 * This is a minimal adapter meant for word counting. Each input sentence
 * becomes a `"SentenceNode"` with `lang` and `script` set to `"unknown"`, no
 * children, the sentence text as `originalText`, and the optional translation
 * stored under `extras.translations.en`.
 *
 * @param paragraph - Paragraph structure with sentences containing text and optional translations
 * @returns GLOST paragraph node
 *
 * @example
 * ```ts
 * const paragraph = {
 *   sentences: [
 *     { sentence: "Hello", translation: "สวัสดี" },
 *     { sentence: "World", translation: "โลก" }
 *   ]
 * };
 * const glostParagraph = adaptParagraphLikeToGLOST(paragraph);
 * const wordCount = getGLOSTWordCount(glostParagraph);
 * ```
 */
export function adaptParagraphLikeToGLOST(
  paragraph: ParagraphLike,
): GLOSTParagraph {
  return {
    type: "ParagraphNode",
    children: paragraph.sentences.map(
      ({ sentence: originalText, translation }) => {
        // The `translations` key is always present; it is undefined when the
        // input sentence has no translation.
        const translations = translation ? { en: translation } : undefined;
        return {
          type: "SentenceNode",
          lang: "unknown",
          script: "unknown",
          originalText,
          children: [],
          extras: { translations },
        };
      },
    ),
  };
}
676
+
677
/**
 * Calculate a word count from GLOST content.
 *
 * For each sentence the counted text is its translation (as returned by
 * `getSentenceTranslation`) or, failing that, its `originalText`. Words are
 * the non-empty pieces obtained by splitting that text on whitespace.
 *
 * - Paragraph: sum over its sentences; `undefined` if it has no sentences.
 * - Sentence: count of its text; `undefined` if the text is empty.
 * - Root: sum over its paragraphs (a paragraph without sentences counts as
 *   0); `undefined` if it has no paragraphs.
 *
 * @param content - GLOST paragraph, sentence, or root node
 * @param language - Optional language code for translation preference (default: 'en')
 * @returns Word count as a number, or undefined if content is empty
 *
 * @example
 * ```ts
 * const wordCount = getGLOSTWordCount(paragraph, 'en');
 * // Returns: 245
 * ```
 */
export function getGLOSTWordCount(
  content: GLOSTParagraph | GLOSTSentence | GLOSTRoot,
  language = "en",
): number | undefined {
  // Text to count for one sentence: translation first, original text second.
  const textOf = (sentence: GLOSTSentence): string =>
    getSentenceTranslation(sentence, language) || sentence.originalText || "";

  const countWords = (text: string): number =>
    text.split(/\s+/).filter((token) => token.length > 0).length;

  if (isGLOSTParagraph(content)) {
    const sentences = getAllSentences(content);
    if (sentences.length === 0) {
      return undefined;
    }

    let total = 0;
    for (const sentence of sentences) {
      total += countWords(textOf(sentence));
    }
    return total;
  }

  if (isGLOSTSentence(content)) {
    const text = textOf(content);
    return text ? countWords(text) : undefined;
  }

  if (isGLOSTRoot(content)) {
    const paragraphs = getAllParagraphs(content);
    if (paragraphs.length === 0) {
      return undefined;
    }

    let total = 0;
    for (const paragraph of paragraphs) {
      total += getGLOSTWordCount(paragraph, language) ?? 0;
    }
    return total;
  }

  return undefined;
}