glost 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +62 -0
- package/dist/example.d.ts +10 -0
- package/dist/example.d.ts.map +1 -0
- package/dist/example.js +82 -0
- package/dist/example.js.map +1 -0
- package/dist/guards.d.ts +103 -0
- package/dist/guards.d.ts.map +1 -0
- package/dist/guards.js +264 -0
- package/dist/guards.js.map +1 -0
- package/dist/index.d.ts +9 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +26 -0
- package/dist/index.js.map +1 -0
- package/dist/mock-data.d.ts +35 -0
- package/dist/mock-data.d.ts.map +1 -0
- package/dist/mock-data.js +494 -0
- package/dist/mock-data.js.map +1 -0
- package/dist/nodes.d.ts +68 -0
- package/dist/nodes.d.ts.map +1 -0
- package/dist/nodes.js +181 -0
- package/dist/nodes.js.map +1 -0
- package/dist/types.d.ts +379 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +6 -0
- package/dist/types.js.map +1 -0
- package/dist/utils.d.ts +203 -0
- package/dist/utils.d.ts.map +1 -0
- package/dist/utils.js +497 -0
- package/dist/utils.js.map +1 -0
- package/dist/validators.d.ts +1876 -0
- package/dist/validators.d.ts.map +1 -0
- package/dist/validators.js +302 -0
- package/dist/validators.js.map +1 -0
- package/package.json +67 -0
- package/src/example.ts +186 -0
- package/src/guards.ts +341 -0
- package/src/index.ts +69 -0
- package/src/mock-data.ts +635 -0
- package/src/nodes.ts +301 -0
- package/src/types.ts +565 -0
- package/src/utils.ts +653 -0
- package/src/validators.ts +336 -0
- package/tsconfig.json +9 -0
package/dist/utils.d.ts
ADDED
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
import type { LanguageCode, GLOSTCharacter, GLOSTClause, GLOSTNode, GLOSTParagraph, GLOSTPhrase, GLOSTRoot, GLOSTSentence, GLOSTSyllable, GLOSTWord, TranscriptionSystem } from "./types";
|
|
2
|
+
/**
|
|
3
|
+
* Parse a BCP-47 language tag into its components
|
|
4
|
+
* Format: language[-script][-region][-variant]
|
|
5
|
+
*/
|
|
6
|
+
export declare function parseLanguageTag(tag: string): {
|
|
7
|
+
language: string;
|
|
8
|
+
script?: string;
|
|
9
|
+
region?: string;
|
|
10
|
+
variant?: string;
|
|
11
|
+
fullTag: string;
|
|
12
|
+
};
|
|
13
|
+
/**
|
|
14
|
+
* Get the base language from a BCP-47 tag
|
|
15
|
+
* Examples: "en-US" -> "en", "zh-CN" -> "zh"
|
|
16
|
+
*/
|
|
17
|
+
export declare function getBaseLanguage(tag: string): string;
|
|
18
|
+
/**
|
|
19
|
+
* Check if two language tags are compatible (same base language)
|
|
20
|
+
* Examples: "en-US" and "en-GB" are compatible
|
|
21
|
+
*/
|
|
22
|
+
export declare function areLanguagesCompatible(tag1: string, tag2: string): boolean;
|
|
23
|
+
/**
|
|
24
|
+
* Find the best matching language tag from available options
|
|
25
|
+
* Prioritizes exact matches, then region matches, then base language matches
|
|
26
|
+
*/
|
|
27
|
+
export declare function findBestLanguageMatch(target: string, available: string[]): string | null;
|
|
28
|
+
/**
|
|
29
|
+
* Get a fallback language tag when the exact one isn't available
|
|
30
|
+
* Examples: "en-US" -> "en", "zh-CN" -> "zh"
|
|
31
|
+
*/
|
|
32
|
+
export declare function getLanguageFallback(tag: string): string;
|
|
33
|
+
/**
|
|
34
|
+
* Normalize a language tag to standard format
|
|
35
|
+
* Lowercases the language, title-cases a 4-character script, uppercases the region, and lowercases any remaining subtags
|
|
36
|
+
*/
|
|
37
|
+
export declare function normalizeLanguageTag(tag: string): string;
|
|
38
|
+
/**
|
|
39
|
+
* Check if a language tag is valid BCP-47 format
|
|
40
|
+
*/
|
|
41
|
+
export declare function isValidLanguageTag(tag: string): boolean;
|
|
42
|
+
/**
|
|
43
|
+
* Get all word nodes from a GLOST tree
|
|
44
|
+
*/
|
|
45
|
+
export declare function getAllWords(node: GLOSTNode): GLOSTWord[];
|
|
46
|
+
/**
|
|
47
|
+
* Get all sentence nodes from a GLOST tree
|
|
48
|
+
*/
|
|
49
|
+
export declare function getAllSentences(node: GLOSTNode): GLOSTSentence[];
|
|
50
|
+
/**
|
|
51
|
+
* Get all paragraph nodes from a GLOST tree
|
|
52
|
+
*/
|
|
53
|
+
export declare function getAllParagraphs(node: GLOSTNode): GLOSTParagraph[];
|
|
54
|
+
/**
|
|
55
|
+
* Get all clause nodes from a GLOST tree
|
|
56
|
+
*/
|
|
57
|
+
export declare function getAllClauses(node: GLOSTNode): GLOSTClause[];
|
|
58
|
+
/**
|
|
59
|
+
* Get all phrase nodes from a GLOST tree
|
|
60
|
+
*/
|
|
61
|
+
export declare function getAllPhrases(node: GLOSTNode): GLOSTPhrase[];
|
|
62
|
+
/**
|
|
63
|
+
* Get all syllable nodes from a GLOST tree
|
|
64
|
+
*/
|
|
65
|
+
export declare function getAllSyllables(node: GLOSTNode): GLOSTSyllable[];
|
|
66
|
+
/**
|
|
67
|
+
* Get all character nodes from a GLOST tree
|
|
68
|
+
*/
|
|
69
|
+
export declare function getAllCharacters(node: GLOSTNode): GLOSTCharacter[];
|
|
70
|
+
/**
|
|
71
|
+
* Find nodes by type with better typing
|
|
72
|
+
*/
|
|
73
|
+
export declare function findNodesByType<T extends GLOSTNode>(node: GLOSTNode, type: string): T[];
|
|
74
|
+
/**
|
|
75
|
+
* Get all words from a document with proper typing
|
|
76
|
+
*/
|
|
77
|
+
export declare function getWordsFromDocument(doc: GLOSTRoot): GLOSTWord[];
|
|
78
|
+
/**
|
|
79
|
+
* Get the first sentence from a document
|
|
80
|
+
*/
|
|
81
|
+
export declare function getFirstSentence(doc: GLOSTRoot): GLOSTSentence | null;
|
|
82
|
+
/**
|
|
83
|
+
* Get words from a specific sentence
|
|
84
|
+
*/
|
|
85
|
+
export declare function getWordsFromSentence(sentence: GLOSTSentence): GLOSTWord[];
|
|
86
|
+
/**
|
|
87
|
+
* Get words from a specific paragraph
|
|
88
|
+
*/
|
|
89
|
+
export declare function getWordsFromParagraph(paragraph: GLOSTParagraph): GLOSTWord[];
|
|
90
|
+
/**
|
|
91
|
+
* Find word nodes with specific language
|
|
92
|
+
*/
|
|
93
|
+
export declare function findWordsByLanguage(node: GLOSTNode, lang: LanguageCode): GLOSTWord[];
|
|
94
|
+
/**
|
|
95
|
+
* Find word nodes with specific transcription system
|
|
96
|
+
*/
|
|
97
|
+
export declare function findWordsByTranscriptionSystem(node: GLOSTNode, system: TranscriptionSystem): GLOSTWord[];
|
|
98
|
+
/**
|
|
99
|
+
* Type guard for GLOSTWord nodes
|
|
100
|
+
*/
|
|
101
|
+
export declare function isGLOSTWord(node: any): node is GLOSTWord;
|
|
102
|
+
export declare function isGLOSTSentence(node: any): node is GLOSTSentence;
|
|
103
|
+
export declare function isGLOSTParagraph(node: any): node is GLOSTParagraph;
|
|
104
|
+
export declare function isGLOSTRoot(node: any): node is GLOSTRoot;
|
|
105
|
+
/**
|
|
106
|
+
* Type guard for GLOSTClause nodes
|
|
107
|
+
*/
|
|
108
|
+
export declare function isGLOSTClause(node: any): node is GLOSTClause;
|
|
109
|
+
/**
|
|
110
|
+
* Type guard for GLOSTPhrase nodes
|
|
111
|
+
*/
|
|
112
|
+
export declare function isGLOSTPhrase(node: any): node is GLOSTPhrase;
|
|
113
|
+
/**
|
|
114
|
+
* Type guard for GLOSTSyllable nodes
|
|
115
|
+
*/
|
|
116
|
+
export declare function isGLOSTSyllable(node: any): node is GLOSTSyllable;
|
|
117
|
+
/**
|
|
118
|
+
* Type guard for GLOSTCharacter nodes
|
|
119
|
+
*/
|
|
120
|
+
export declare function isGLOSTCharacter(node: any): node is GLOSTCharacter;
|
|
121
|
+
/**
|
|
122
|
+
* Extract text value from a word node
|
|
123
|
+
*/
|
|
124
|
+
export declare function getWordText(word: GLOSTWord): string;
|
|
125
|
+
/**
|
|
126
|
+
* Get transcription for a specific system
|
|
127
|
+
*/
|
|
128
|
+
export declare function getWordTranscription(word: GLOSTWord, system: TranscriptionSystem): string | null;
|
|
129
|
+
/**
|
|
130
|
+
* Check if a word has transcription for a specific system
|
|
131
|
+
*/
|
|
132
|
+
export declare function hasWordTranscription(word: GLOSTWord, system: TranscriptionSystem): boolean;
|
|
133
|
+
/**
|
|
134
|
+
* Get word translation for a specific language
|
|
135
|
+
* @param word - The word node
|
|
136
|
+
* @param language - Target language code (default: "en-US")
|
|
137
|
+
* @returns Translation string or empty string if not found
|
|
138
|
+
*/
|
|
139
|
+
export declare function getWordTranslation(word: GLOSTWord, language?: string): string;
|
|
140
|
+
/**
|
|
141
|
+
* Get word meaning/definition
|
|
142
|
+
* @deprecated Use getWordTranslation for multi-language support.
|
|
143
|
+
* This function is kept for backward compatibility.
|
|
144
|
+
*/
|
|
145
|
+
export declare function getWordMeaning(word: GLOSTWord): string;
|
|
146
|
+
/**
|
|
147
|
+
* Get word part of speech
|
|
148
|
+
*/
|
|
149
|
+
export declare function getWordPartOfSpeech(word: GLOSTWord): string;
|
|
150
|
+
/**
|
|
151
|
+
* Get word difficulty
|
|
152
|
+
*/
|
|
153
|
+
export declare function getWordDifficulty(word: GLOSTWord): string;
|
|
154
|
+
/**
|
|
155
|
+
* Get sentence translation
|
|
156
|
+
*/
|
|
157
|
+
export declare function getSentenceTranslation(sentence: GLOSTSentence, language?: string): string | null;
|
|
158
|
+
/**
|
|
159
|
+
* Generic paragraph structure for word count calculation
|
|
160
|
+
* This interface allows converting external paragraph structures to GLOST format
|
|
161
|
+
*/
|
|
162
|
+
export type ParagraphLike = {
|
|
163
|
+
sentences: Array<{
|
|
164
|
+
sentence: string;
|
|
165
|
+
translation?: string;
|
|
166
|
+
}>;
|
|
167
|
+
};
|
|
168
|
+
/**
|
|
169
|
+
* Convert a paragraph-like structure to GLOST format for word count calculation
|
|
170
|
+
* This is a minimal adapter that only converts what's needed for word counting
|
|
171
|
+
*
|
|
172
|
+
* @param paragraph - Paragraph structure with sentences containing text and optional translations
|
|
173
|
+
* @returns GLOST paragraph node
|
|
174
|
+
*
|
|
175
|
+
* @example
|
|
176
|
+
* ```ts
|
|
177
|
+
* const paragraph = {
|
|
178
|
+
* sentences: [
|
|
179
|
+
* { sentence: "Hello", translation: "สวัสดี" },
|
|
180
|
+
* { sentence: "World", translation: "โลก" }
|
|
181
|
+
* ]
|
|
182
|
+
* };
|
|
183
|
+
* const mtstParagraph = adaptParagraphLikeToGLOST(paragraph);
|
|
184
|
+
* const wordCount = getGLOSTWordCount(mtstParagraph);
|
|
185
|
+
* ```
|
|
186
|
+
*/
|
|
187
|
+
export declare function adaptParagraphLikeToGLOST(paragraph: ParagraphLike): GLOSTParagraph;
|
|
188
|
+
/**
|
|
189
|
+
* Calculate word count from GLOST content
|
|
190
|
+
* Counts words from sentence translations or original text
|
|
191
|
+
*
|
|
192
|
+
* @param content - GLOST paragraph, sentence, or root node
|
|
193
|
+
* @param language - Optional language code for translation preference (default: 'en')
|
|
194
|
+
* @returns Word count as a number, or undefined if content is empty
|
|
195
|
+
*
|
|
196
|
+
* @example
|
|
197
|
+
* ```ts
|
|
198
|
+
* const wordCount = getGLOSTWordCount(paragraph, 'en');
|
|
199
|
+
* // Returns: 245
|
|
200
|
+
* ```
|
|
201
|
+
*/
|
|
202
|
+
export declare function getGLOSTWordCount(content: GLOSTParagraph | GLOSTSentence | GLOSTRoot, language?: string): number | undefined;
|
|
203
|
+
//# sourceMappingURL=utils.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"utils.d.ts","sourceRoot":"","sources":["../src/utils.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EACV,YAAY,EACZ,cAAc,EACd,WAAW,EACX,SAAS,EACT,cAAc,EACd,WAAW,EACX,SAAS,EACT,aAAa,EACb,aAAa,EACb,SAAS,EACT,mBAAmB,EAEpB,MAAM,SAAS,CAAC;AAMjB;;;GAGG;AACH,wBAAgB,gBAAgB,CAAC,GAAG,EAAE,MAAM,GAAG;IAC7C,QAAQ,EAAE,MAAM,CAAC;IACjB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,OAAO,EAAE,MAAM,CAAC;CACjB,CAgCA;AAED;;;GAGG;AACH,wBAAgB,eAAe,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,CAGnD;AAED;;;GAGG;AACH,wBAAgB,sBAAsB,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,GAAG,OAAO,CAE1E;AAED;;;GAGG;AACH,wBAAgB,qBAAqB,CACnC,MAAM,EAAE,MAAM,EACd,SAAS,EAAE,MAAM,EAAE,GAClB,MAAM,GAAG,IAAI,CA2Bf;AAED;;;GAGG;AACH,wBAAgB,mBAAmB,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,CAGvD;AAED;;;GAGG;AACH,wBAAgB,oBAAoB,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,CAwCxD;AAED;;GAEG;AACH,wBAAgB,kBAAkB,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAKvD;AAMD;;GAEG;AACH,wBAAgB,WAAW,CAAC,IAAI,EAAE,SAAS,GAAG,SAAS,EAAE,CAUxD;AAED;;GAEG;AACH,wBAAgB,eAAe,CAAC,IAAI,EAAE,SAAS,GAAG,aAAa,EAAE,CAUhE;AAED;;GAEG;AACH,wBAAgB,gBAAgB,CAAC,IAAI,EAAE,SAAS,GAAG,cAAc,EAAE,CAUlE;AAED;;GAEG;AACH,wBAAgB,aAAa,CAAC,IAAI,EAAE,SAAS,GAAG,WAAW,EAAE,CAU5D;AAED;;GAEG;AACH,wBAAgB,aAAa,CAAC,IAAI,EAAE,SAAS,GAAG,WAAW,EAAE,CAU5D;AAED;;GAEG;AACH,wBAAgB,eAAe,CAAC,IAAI,EAAE,SAAS,GAAG,aAAa,EAAE,CAUhE;AAED;;GAEG;AACH,wBAAgB,gBAAgB,CAAC,IAAI,EAAE,SAAS,GAAG,cAAc,EAAE,CAUlE;AAED;;GAEG;AACH,wBAAgB,eAAe,CAAC,CAAC,SAAS,SAAS,EACjD,IAAI,EAAE,SAAS,EACf,IAAI,EAAE,MAAM,GACX,CAAC,EAAE,CAQL;AAMD;;GAEG;AACH,wBAAgB,oBAAoB,CAAC,GAAG,EAAE,SAAS,GAAG,SAAS,EAAE,CAEhE;AAED;;GAEG;AACH,wBAAgB,gBAAgB,CAAC,GAAG,EAAE,SAAS,GAAG,aAAa,GAAG,IAAI,CAYrE;AAED;;GAEG;AACH,wBAAgB,oBAAoB,CAAC,QAAQ,EAAE,aAAa,GAAG,SAAS,EAAE,CAEzE;AAED;;GAEG;AACH,wBAAgB,qBAAqB,CAAC,SAAS,EAAE,cAAc,GAAG,SAAS,EAAE,CAU5E;AAED;;GAEG;AACH,wBAAgB,mBAAmB,CACjC,IAAI,EAAE,SAAS,EACf,IAAI,EAAE,YAAY,GACjB,SAAS,EAAE,CAGb;AAED;;GAEG;AACH,wBAAgB,8BAA8B,CAC5C,IAAI,EAAE,SAAS,EACf,MAAM,EAAE,mBAAmB,GAC1B,SAAS,EAAE,CAKb;AAMD;;GA
EG;AACH,wBAAgB,WAAW,CAAC,IAAI,EAAE,GAAG,GAAG,IAAI,IAAI,SAAS,CAIxD;AAED,wBAAgB,eAAe,CAAC,IAAI,EAAE,GAAG,GAAG,IAAI,IAAI,aAAa,CAOhE;AAED,wBAAgB,gBAAgB,CAAC,IAAI,EAAE,GAAG,GAAG,IAAI,IAAI,cAAc,CAElE;AAED,wBAAgB,WAAW,CAAC,IAAI,EAAE,GAAG,GAAG,IAAI,IAAI,SAAS,CAExD;AAED;;GAEG;AACH,wBAAgB,aAAa,CAAC,IAAI,EAAE,GAAG,GAAG,IAAI,IAAI,WAAW,CAE5D;AAED;;GAEG;AACH,wBAAgB,aAAa,CAAC,IAAI,EAAE,GAAG,GAAG,IAAI,IAAI,WAAW,CAE5D;AAED;;GAEG;AACH,wBAAgB,eAAe,CAAC,IAAI,EAAE,GAAG,GAAG,IAAI,IAAI,aAAa,CAEhE;AAED;;GAEG;AACH,wBAAgB,gBAAgB,CAAC,IAAI,EAAE,GAAG,GAAG,IAAI,IAAI,cAAc,CAElE;AAMD;;GAEG;AACH,wBAAgB,WAAW,CAAC,IAAI,EAAE,SAAS,GAAG,MAAM,CAGnD;AAED;;GAEG;AACH,wBAAgB,oBAAoB,CAClC,IAAI,EAAE,SAAS,EACf,MAAM,EAAE,mBAAmB,GAC1B,MAAM,GAAG,IAAI,CAEf;AAED;;GAEG;AACH,wBAAgB,oBAAoB,CAClC,IAAI,EAAE,SAAS,EACf,MAAM,EAAE,mBAAmB,GAC1B,OAAO,CAET;AAED;;;;;GAKG;AACH,wBAAgB,kBAAkB,CAChC,IAAI,EAAE,SAAS,EACf,QAAQ,SAAU,GACjB,MAAM,CAWR;AAED;;;;GAIG;AACH,wBAAgB,cAAc,CAAC,IAAI,EAAE,SAAS,GAAG,MAAM,CAQtD;AAED;;GAEG;AACH,wBAAgB,mBAAmB,CAAC,IAAI,EAAE,SAAS,GAAG,MAAM,CAE3D;AAED;;GAEG;AACH,wBAAgB,iBAAiB,CAAC,IAAI,EAAE,SAAS,GAAG,MAAM,CAEzD;AAED;;GAEG;AACH,wBAAgB,sBAAsB,CACpC,QAAQ,EAAE,aAAa,EACvB,QAAQ,SAAO,GACd,MAAM,GAAG,IAAI,CAaf;AAMD;;;GAGG;AACH,MAAM,MAAM,aAAa,GAAG;IAC1B,SAAS,EAAE,KAAK,CAAC;QACf,QAAQ,EAAE,MAAM,CAAC;QACjB,WAAW,CAAC,EAAE,MAAM,CAAC;KACtB,CAAC,CAAC;CACJ,CAAC;AAEF;;;;;;;;;;;;;;;;;;GAkBG;AACH,wBAAgB,yBAAyB,CACvC,SAAS,EAAE,aAAa,GACvB,cAAc,CAgBhB;AAED;;;;;;;;;;;;;GAaG;AACH,wBAAgB,iBAAiB,CAC/B,OAAO,EAAE,cAAc,GAAG,aAAa,GAAG,SAAS,EACnD,QAAQ,SAAO,GACd,MAAM,GAAG,SAAS,CAoCpB"}
|
package/dist/utils.js
ADDED
|
@@ -0,0 +1,497 @@
|
|
|
1
|
+
import { is as isNode } from "unist-util-is";
|
|
2
|
+
import { visit } from "unist-util-visit";
|
|
3
|
+
// ============================================================================
|
|
4
|
+
// BCP-47 Language Tag Utilities
|
|
5
|
+
// ============================================================================
|
|
6
|
+
/**
 * Split a BCP-47 style language tag into its components.
 * Expected shape: language[-script][-region][-variant]
 *
 * The second subtag is treated as a script only when it is exactly four
 * ASCII letters; otherwise it is taken as the region. Everything after the
 * region is re-joined with "-" and reported as the variant.
 *
 * @param tag - Language tag such as "en-US" or "zh-Hans-CN"
 * @returns Object with `language`, `script`, `region`, `variant` (undefined
 *   when absent) and the untouched input as `fullTag`
 */
export function parseLanguageTag(tag) {
    const subtags = tag.split("-");
    const parsed = {
        language: subtags[0] || "",
        script: undefined,
        region: undefined,
        variant: undefined,
        fullTag: tag,
    };
    const second = subtags[1];
    if (!second) {
        // Either a bare language or an empty second subtag ("en-"): nothing more to read.
        return parsed;
    }
    if (/^[A-Za-z]{4}$/.test(second)) {
        parsed.script = second;
        const third = subtags[2];
        if (third) {
            parsed.region = third;
            if (subtags.length >= 4) {
                parsed.variant = subtags.slice(3).join("-");
            }
        }
        return parsed;
    }
    // No script subtag: the second subtag is taken as the region.
    parsed.region = second;
    if (subtags.length >= 3) {
        parsed.variant = subtags.slice(2).join("-");
    }
    return parsed;
}
|
|
42
|
+
/**
 * Return the primary language subtag of a language tag.
 * Examples: "en-US" -> "en", "zh-CN" -> "zh"
 *
 * @param tag - Language tag
 * @returns The text before the first "-"; the whole tag when that text is empty
 */
export function getBaseLanguage(tag) {
    const firstPart = tag.split("-").shift();
    return firstPart ? firstPart : tag;
}
|
|
50
|
+
/**
 * Check whether two language tags share the same base language.
 * Examples: "en-US" and "en-GB" are compatible; so are "EN-us" and "en-GB".
 *
 * BCP-47 tags are case-insensitive, so the base languages are lowercased
 * before they are compared.
 *
 * @param tag1 - First language tag
 * @param tag2 - Second language tag
 * @returns true when both tags have the same base language, ignoring case
 */
export function areLanguagesCompatible(tag1, tag2) {
    const firstBase = getBaseLanguage(tag1).toLowerCase();
    const secondBase = getBaseLanguage(tag2).toLowerCase();
    return firstBase === secondBase;
}
|
|
57
|
+
/**
 * Pick the closest available language tag for a requested one.
 *
 * Preference order:
 *   1. the target itself, when it is listed verbatim;
 *   2. the first option whose parsed language and region both equal the target's;
 *   3. the first option whose parsed language equals the target's.
 *
 * @param target - Requested language tag
 * @param available - Candidate tags, checked in array order
 * @returns The chosen entry of `available`, or null when nothing matches
 */
export function findBestLanguageMatch(target, available) {
    if (available.includes(target)) {
        return target;
    }
    const wanted = parseLanguageTag(target);

    // Pass 1: same language and same region.
    const regionIndex = available.findIndex((option) => {
        const candidate = parseLanguageTag(option);
        return candidate.language === wanted.language && candidate.region === wanted.region;
    });
    if (regionIndex !== -1) {
        return available[regionIndex];
    }

    // Pass 2: same language only.
    const languageIndex = available.findIndex(
        (option) => parseLanguageTag(option).language === wanted.language
    );
    return languageIndex !== -1 ? available[languageIndex] : null;
}
|
|
83
|
+
/**
 * Produce the fallback tag to use when the exact tag is unavailable.
 * Examples: "en-US" -> "en", "zh-CN" -> "zh"
 *
 * @param tag - Language tag
 * @returns The `language` component reported by parseLanguageTag
 */
export function getLanguageFallback(tag) {
    const { language } = parseLanguageTag(tag);
    return language;
}
|
|
91
|
+
/**
 * Normalize the letter case of a language tag.
 *
 * - the language subtag is lowercased;
 * - a second subtag of exactly four characters is treated as a script and
 *   title-cased, and the subtag after it is uppercased as the region;
 * - any other second subtag is uppercased as the region;
 * - everything after the region is lowercased.
 *
 * @param tag - Language tag in any letter case
 * @returns The re-cased tag, e.g. "ZH-hans-cn" -> "zh-Hans-CN"
 */
export function normalizeLanguageTag(tag) {
    const [primary, second, ...rest] = tag.split("-");
    const pieces = [primary.toLowerCase()];
    if (second === undefined) {
        // Single subtag: only the language is present.
        return pieces[0];
    }
    if (second.length === 4) {
        // Script: lowercase everything, then capitalize each word start.
        pieces.push(second.toLowerCase().replace(/\b\w/g, (ch) => ch.toUpperCase()));
        if (rest.length > 0) {
            const [regionPart, ...variantParts] = rest;
            pieces.push(regionPart.toUpperCase());
            if (variantParts.length > 0) {
                pieces.push(variantParts.join("-").toLowerCase());
            }
        }
    }
    else {
        // No script: language-region[-variants].
        pieces.push(second.toUpperCase());
        if (rest.length > 0) {
            pieces.push(rest.join("-").toLowerCase());
        }
    }
    return pieces.join("-");
}
|
|
129
|
+
/**
 * Basic shape check for a BCP-47 style language tag.
 *
 * Accepts a 2-3 letter lowercase language, optionally followed by a
 * 4-letter script, a 2-3 letter region, and any number of alphanumeric
 * subtags, each introduced by "-". This is a pattern test only.
 *
 * @param tag - Language tag to test
 * @returns true when the tag matches the pattern
 */
export function isValidLanguageTag(tag) {
    return /^[a-z]{2,3}(-[A-Za-z]{4})?(-[A-Za-z]{2,3})?(-[A-Za-z0-9]+)*$/.test(tag);
}
|
|
137
|
+
// ============================================================================
|
|
138
|
+
// Enhanced Tree Traversal Utilities (using unist-util-visit)
|
|
139
|
+
// ============================================================================
|
|
140
|
+
/**
 * Collect every word node in a GLOST tree.
 *
 * @param node - Root of the tree or subtree to search
 * @returns Nodes of type "WordNode" that also pass isGLOSTWord, in visit order
 */
export function getAllWords(node) {
    const collected = [];
    visit(node, "WordNode", (candidate) => {
        if (!isGLOSTWord(candidate)) {
            return;
        }
        // Statement form on purpose: the visitor must keep returning undefined.
        collected.push(candidate);
    });
    return collected;
}
|
|
152
|
+
/**
 * Collect every sentence node in a GLOST tree.
 *
 * @param node - Root of the tree or subtree to search
 * @returns Nodes of type "SentenceNode" that also pass isGLOSTSentence, in visit order
 */
export function getAllSentences(node) {
    const collected = [];
    visit(node, "SentenceNode", (candidate) => {
        if (!isGLOSTSentence(candidate)) {
            return;
        }
        // Statement form on purpose: the visitor must keep returning undefined.
        collected.push(candidate);
    });
    return collected;
}
|
|
164
|
+
/**
 * Collect every paragraph node in a GLOST tree.
 *
 * @param node - Root of the tree or subtree to search
 * @returns Nodes of type "ParagraphNode" that also pass isGLOSTParagraph, in visit order
 */
export function getAllParagraphs(node) {
    const collected = [];
    visit(node, "ParagraphNode", (candidate) => {
        if (!isGLOSTParagraph(candidate)) {
            return;
        }
        // Statement form on purpose: the visitor must keep returning undefined.
        collected.push(candidate);
    });
    return collected;
}
|
|
176
|
+
/**
 * Collect every clause node in a GLOST tree.
 *
 * @param node - Root of the tree or subtree to search
 * @returns Nodes of type "ClauseNode" that also pass isGLOSTClause, in visit order
 */
export function getAllClauses(node) {
    const collected = [];
    visit(node, "ClauseNode", (candidate) => {
        if (!isGLOSTClause(candidate)) {
            return;
        }
        // Statement form on purpose: the visitor must keep returning undefined.
        collected.push(candidate);
    });
    return collected;
}
|
|
188
|
+
/**
 * Collect every phrase node in a GLOST tree.
 *
 * @param node - Root of the tree or subtree to search
 * @returns Nodes of type "PhraseNode" that also pass isGLOSTPhrase, in visit order
 */
export function getAllPhrases(node) {
    const collected = [];
    visit(node, "PhraseNode", (candidate) => {
        if (!isGLOSTPhrase(candidate)) {
            return;
        }
        // Statement form on purpose: the visitor must keep returning undefined.
        collected.push(candidate);
    });
    return collected;
}
|
|
200
|
+
/**
 * Collect every syllable node in a GLOST tree.
 *
 * @param node - Root of the tree or subtree to search
 * @returns Nodes of type "SyllableNode" that also pass isGLOSTSyllable, in visit order
 */
export function getAllSyllables(node) {
    const collected = [];
    visit(node, "SyllableNode", (candidate) => {
        if (!isGLOSTSyllable(candidate)) {
            return;
        }
        // Statement form on purpose: the visitor must keep returning undefined.
        collected.push(candidate);
    });
    return collected;
}
|
|
212
|
+
/**
 * Collect every character node in a GLOST tree.
 *
 * @param node - Root of the tree or subtree to search
 * @returns Nodes of type "CharacterNode" that also pass isGLOSTCharacter, in visit order
 */
export function getAllCharacters(node) {
    const collected = [];
    visit(node, "CharacterNode", (candidate) => {
        if (!isGLOSTCharacter(candidate)) {
            return;
        }
        // Statement form on purpose: the visitor must keep returning undefined.
        collected.push(candidate);
    });
    return collected;
}
|
|
224
|
+
/**
 * Collect every node of a given type from a GLOST tree.
 *
 * No guard is applied beyond the type test passed to visit, so callers are
 * responsible for the element type they expect back.
 *
 * @param node - Root of the tree or subtree to search
 * @param type - Node type to look for, e.g. "WordNode"
 * @returns All nodes reported by visit for that type, in visit order
 */
export function findNodesByType(node, type) {
    const matches = [];
    const collect = (match) => {
        // Block body so the visitor returns undefined rather than push()'s result.
        matches.push(match);
    };
    visit(node, type, collect);
    return matches;
}
|
|
234
|
+
// ============================================================================
|
|
235
|
+
// New Utilities for Transcription Components
|
|
236
|
+
// ============================================================================
|
|
237
|
+
/**
 * List all word nodes of a document.
 *
 * @param doc - Document root node
 * @returns The result of getAllWords for the whole document
 */
export function getWordsFromDocument(doc) {
    const documentWords = getAllWords(doc);
    return documentWords;
}
|
|
243
|
+
/**
 * Get the first sentence from a document.
 *
 * Paragraphs are examined in the order getAllParagraphs returns them, and
 * the first sentence of the first paragraph that contains one is returned.
 * A paragraph without sentences is skipped rather than ending the search,
 * so a leading empty paragraph no longer hides later sentences.
 *
 * @param doc - Document root node
 * @returns The first sentence found inside a paragraph, or null when no
 *   paragraph contains a sentence
 */
export function getFirstSentence(doc) {
    for (const paragraph of getAllParagraphs(doc)) {
        const [firstSentence] = getAllSentences(paragraph);
        if (firstSentence) {
            return firstSentence;
        }
    }
    return null;
}
|
|
259
|
+
/**
 * List all word nodes of one sentence.
 *
 * @param sentence - Sentence node to search
 * @returns The result of getAllWords for that sentence
 */
export function getWordsFromSentence(sentence) {
    const sentenceWords = getAllWords(sentence);
    return sentenceWords;
}
|
|
265
|
+
/**
 * List all word nodes of one paragraph.
 *
 * @param paragraph - Paragraph node to search
 * @returns Nodes of type "WordNode" that also pass isGLOSTWord, in visit order
 */
export function getWordsFromParagraph(paragraph) {
    const collected = [];
    visit(paragraph, "WordNode", (candidate) => {
        if (!isGLOSTWord(candidate)) {
            return;
        }
        // Statement form on purpose: the visitor must keep returning undefined.
        collected.push(candidate);
    });
    return collected;
}
|
|
277
|
+
/**
 * Find the word nodes tagged with a specific language.
 *
 * @param node - Root of the tree or subtree to search
 * @param lang - Language code compared strictly against each word's `lang`
 * @returns The matching word nodes, in the order getAllWords returns them
 */
export function findWordsByLanguage(node, lang) {
    const matching = [];
    for (const word of getAllWords(node)) {
        if (word.lang === lang) {
            matching.push(word);
        }
    }
    return matching;
}
|
|
284
|
+
/**
 * Find the word nodes that carry a transcription for a specific system.
 *
 * @param node - Root of the tree or subtree to search
 * @param system - Transcription system key
 * @returns Word nodes whose `transcription[system]` entry is truthy
 */
export function findWordsByTranscriptionSystem(node, system) {
    const matching = [];
    for (const word of getAllWords(node)) {
        const transcription = word.transcription;
        if (transcription && transcription[system]) {
            matching.push(word);
        }
    }
    return matching;
}
|
|
291
|
+
// ============================================================================
|
|
292
|
+
// Enhanced Type Guards (using unist-util-is)
|
|
293
|
+
// ============================================================================
|
|
294
|
+
/**
 * Type guard for GLOSTWord nodes.
 *
 * @param node - Value to test
 * @returns true when `node` passes the "WordNode" type test and has both a
 *   `transcription` and a `metadata` property
 */
export function isGLOSTWord(node) {
    if (!isNode(node, "WordNode")) {
        return false;
    }
    return "transcription" in node && "metadata" in node;
}
|
|
300
|
+
/**
 * Type guard for GLOSTSentence nodes.
 *
 * @param node - Value to test
 * @returns true when `node` passes the "SentenceNode" type test and has
 *   `lang`, `script` and `originalText` properties
 */
export function isGLOSTSentence(node) {
    if (!isNode(node, "SentenceNode")) {
        return false;
    }
    const requiredKeys = ["lang", "script", "originalText"];
    return requiredKeys.every((key) => key in node);
}
|
|
306
|
+
/**
 * Type guard for GLOSTParagraph nodes.
 *
 * @param node - Value to test
 * @returns true when `node` passes the "ParagraphNode" type test
 */
export function isGLOSTParagraph(node) {
    const isParagraph = isNode(node, "ParagraphNode");
    return isParagraph;
}
|
|
309
|
+
/**
 * Type guard for GLOSTRoot nodes.
 *
 * @param node - Value to test
 * @returns true when `node` passes the "Root" type test and has both a
 *   `lang` and a `script` property
 */
export function isGLOSTRoot(node) {
    if (!isNode(node, "Root")) {
        return false;
    }
    return "lang" in node && "script" in node;
}
|
|
312
|
+
/**
 * Type guard for GLOSTClause nodes.
 *
 * @param node - Value to test
 * @returns true when `node` passes the "ClauseNode" type test and has a
 *   `clauseType` property
 */
export function isGLOSTClause(node) {
    if (!isNode(node, "ClauseNode")) {
        return false;
    }
    return "clauseType" in node;
}
|
|
318
|
+
/**
 * Type guard for GLOSTPhrase nodes.
 *
 * @param node - Value to test
 * @returns true when `node` passes the "PhraseNode" type test and has a
 *   `phraseType` property
 */
export function isGLOSTPhrase(node) {
    if (!isNode(node, "PhraseNode")) {
        return false;
    }
    return "phraseType" in node;
}
|
|
324
|
+
/**
 * Type guard for GLOSTSyllable nodes.
 *
 * @param node - Value to test
 * @returns true when `node` passes the "SyllableNode" type test and has a
 *   `structure` property
 */
export function isGLOSTSyllable(node) {
    if (!isNode(node, "SyllableNode")) {
        return false;
    }
    return "structure" in node;
}
|
|
330
|
+
/**
 * Type guard for GLOSTCharacter nodes.
 *
 * @param node - Value to test
 * @returns true when `node` passes the "CharacterNode" type test and has a
 *   `value` property
 */
export function isGLOSTCharacter(node) {
    if (!isNode(node, "CharacterNode")) {
        return false;
    }
    return "value" in node;
}
|
|
336
|
+
// ============================================================================
|
|
337
|
+
// Utility Functions for Transcription Components
|
|
338
|
+
// ============================================================================
|
|
339
|
+
/**
 * Read the text of a word node.
 *
 * Only the first direct child of type "TextNode" is consulted.
 *
 * @param word - Word node
 * @returns That child's `value`, or "" when there is no such child or its
 *   value is null/undefined
 */
export function getWordText(word) {
    for (const child of word.children) {
        if (child.type === "TextNode") {
            return child.value ?? "";
        }
    }
    return "";
}
|
|
346
|
+
/**
 * Look up a word's transcription text for one transcription system.
 *
 * @param word - Word node
 * @param system - Transcription system key
 * @returns The entry's `text`, or null when the entry or its text is
 *   null/undefined
 */
export function getWordTranscription(word, system) {
    const entry = word.transcription[system];
    if (entry === undefined || entry === null) {
        return null;
    }
    return entry.text ?? null;
}
|
|
352
|
+
/**
 * Tell whether a word has a non-empty transcription for one system.
 *
 * @param word - Word node
 * @param system - Transcription system key
 * @returns true when `system` is a key of the word's transcription object
 *   and that entry has a truthy `text`
 */
export function hasWordTranscription(word, system) {
    const { transcription } = word;
    if (!(system in transcription)) {
        return false;
    }
    return Boolean(transcription[system]?.text);
}
|
|
358
|
+
/**
 * Look up a word's translation for a target language.
 *
 * The exact language key in `extras.translations` is tried first, then the
 * part of the language code before the first "-" (e.g. "en" for "en-US").
 *
 * @param word - The word node
 * @param language - Target language code (default: "en-US")
 * @returns The first truthy translation found, or "" when there is none
 */
export function getWordTranslation(word, language = "en-US") {
    const translations = word.extras?.translations;
    const exact = translations?.[language];
    if (exact) {
        return exact;
    }
    const [baseLanguage] = language.split("-");
    const fallback = baseLanguage ? translations?.[baseLanguage] : undefined;
    return fallback || "";
}
|
|
376
|
+
/**
 * Get a word's meaning/definition.
 *
 * Lookup order: the "en-US" result of getWordTranslation, then
 * `metadata.meaning`, then `shortDefinition`, then `fullDefinition`.
 *
 * @deprecated Use getWordTranslation for multi-language support.
 * This function is kept for backward compatibility.
 * @param word - The word node
 * @returns The first available meaning, or "" when none is set
 */
export function getWordMeaning(word) {
    const english = getWordTranslation(word, "en-US");
    if (!english) {
        // Older fields, consulted only when no translation exists.
        return (word.metadata?.meaning ?? word.shortDefinition ?? word.fullDefinition ?? "");
    }
    return english;
}
|
|
388
|
+
/**
 * Get a word's part of speech.
 *
 * @param word - The word node
 * @returns `metadata.partOfSpeech`, or "" when the metadata or the field is
 *   null/undefined
 */
export function getWordPartOfSpeech(word) {
    const partOfSpeech = word.metadata?.partOfSpeech;
    return partOfSpeech ?? "";
}
|
|
394
|
+
/**
 * Get a word's difficulty.
 *
 * The word's own `difficulty` field wins; `extras.metadata.difficulty` is
 * the fallback.
 *
 * @param word - The word node
 * @returns The difficulty value, or "" when neither location provides one
 */
export function getWordDifficulty(word) {
    const ownDifficulty = word.difficulty;
    if (ownDifficulty !== undefined && ownDifficulty !== null) {
        return ownDifficulty;
    }
    return word.extras?.metadata?.difficulty ?? "";
}
|
|
400
|
+
/**
 * Get a sentence's translation for a target language.
 *
 * Lookup order:
 *   1. `extras.translations[language]`;
 *   2. `extras.translations` under the part of `language` before the first
 *      "-" (e.g. "en" for "en-US"), mirroring getWordTranslation;
 *   3. the meanings of the sentence's words (getWordMeaning), joined with
 *      single spaces.
 *
 * @param sentence - Sentence node
 * @param language - Target language code (default: "en")
 * @returns The translation, or null when no translation and no word
 *   meanings are available
 */
export function getSentenceTranslation(sentence, language = "en") {
    const translations = sentence.extras?.translations;
    const exact = translations?.[language];
    if (exact) {
        return exact;
    }
    // Regional request (e.g. "en-US") falls back to the base language key.
    if (typeof language === "string") {
        const [baseLanguage] = language.split("-");
        if (baseLanguage && baseLanguage !== language) {
            const baseTranslation = translations?.[baseLanguage];
            if (baseTranslation) {
                return baseTranslation;
            }
        }
    }
    // Last resort: stitch the sentence together from per-word meanings.
    const wordMeanings = getWordsFromSentence(sentence)
        .map((word) => getWordMeaning(word))
        .filter(Boolean)
        .join(" ");
    return wordMeanings || null;
}
|
|
415
|
+
/**
 * Convert a paragraph-like structure into a GLOST paragraph node.
 * This is a minimal adapter that only fills in what word counting needs.
 *
 * Each input sentence becomes a "SentenceNode" with `lang` and `script` set
 * to "unknown", no children, the sentence text as `originalText`, and a
 * truthy translation stored under the "en" key of `extras.translations`
 * (otherwise `translations` is undefined).
 *
 * @param paragraph - Paragraph structure with sentences containing text and optional translations
 * @returns GLOST paragraph node
 *
 * @example
 * ```ts
 * const paragraph = {
 *   sentences: [
 *     { sentence: "Hello", translation: "สวัสดี" },
 *     { sentence: "World", translation: "โลก" }
 *   ]
 * };
 * const glostParagraph = adaptParagraphLikeToGLOST(paragraph);
 * const wordCount = getGLOSTWordCount(glostParagraph);
 * ```
 */
export function adaptParagraphLikeToGLOST(paragraph) {
    const toSentenceNode = (entry) => {
        const translations = entry.translation
            ? { en: entry.translation }
            : undefined;
        return {
            type: "SentenceNode",
            lang: "unknown",
            script: "unknown",
            originalText: entry.sentence,
            children: [],
            extras: { translations },
        };
    };
    const sentenceNodes = paragraph.sentences.map((entry) => toSentenceNode(entry));
    return { type: "ParagraphNode", children: sentenceNodes };
}
|
|
451
|
+
/**
 * Calculate a word count for GLOST content.
 *
 * For each sentence the text counted is its translation
 * (getSentenceTranslation) when available, otherwise its `originalText`.
 * Words are the non-empty pieces left after splitting on whitespace.
 *
 * - Paragraph: sum over its sentences; undefined when it has no sentences.
 * - Sentence: count of its text; undefined when the text is empty.
 * - Root: sum over its paragraphs; undefined when it has no paragraphs.
 * - Anything else: undefined.
 *
 * @param content - GLOST paragraph, sentence, or root node
 * @param language - Optional language code for translation preference (default: 'en')
 * @returns Word count as a number, or undefined as described above
 *
 * @example
 * ```ts
 * const wordCount = getGLOSTWordCount(paragraph, 'en');
 * // Returns: 245
 * ```
 */
export function getGLOSTWordCount(content, language = "en") {
    // Text used for counting: translation first, original text second.
    const textOf = (sentence) => getSentenceTranslation(sentence, language) || sentence.originalText || "";
    const countWords = (text) => text.split(/\s+/).filter((token) => token.length > 0).length;

    if (isGLOSTParagraph(content)) {
        const sentences = getAllSentences(content);
        if (sentences.length === 0) {
            return undefined;
        }
        let total = 0;
        for (const sentence of sentences) {
            total += countWords(textOf(sentence));
        }
        return total;
    }
    if (isGLOSTSentence(content)) {
        const text = textOf(content);
        return text ? countWords(text) : undefined;
    }
    if (isGLOSTRoot(content)) {
        const paragraphs = getAllParagraphs(content);
        if (paragraphs.length === 0) {
            return undefined;
        }
        let total = 0;
        for (const paragraph of paragraphs) {
            // A paragraph without sentences reports undefined; count it as zero.
            total += getGLOSTWordCount(paragraph, language) ?? 0;
        }
        return total;
    }
    return undefined;
}
|
|
497
|
+
//# sourceMappingURL=utils.js.map
|