tibetan-word-tokenizer 1.0.0

@@ -0,0 +1,228 @@
+ /**
+ * Sanskrit detection for Tibetan text
+ * Ported from Botok's has_skrt_syl.py
+ *
+ * Detects Sanskrit syllables in Tibetan script based on:
+ * 1. Sanskrit-specific vowels (long vowels, vocalic R/L)
+ * 2. Sanskrit consonants (retroflex, aspirated)
+ * 3. Consonant clusters that are invalid in Tibetan but valid in Sanskrit
+ */
+
+ import { TSEK, CharMarkers as c } from './constants.js';
+
+ /**
+ * Sanskrit-specific characters in Tibetan script
+ */
+
+ // Vowel signs and marks; only the Sanskrit-specific members trigger detection below
+ const SKRT_LONG_VOWELS = new Set([
+ '\u0F71', // ཱ - long a
+ '\u0F72', // ི - vowel i (can combine with ཱ)
+ '\u0F73', // ཱི - long i
+ '\u0F74', // ུ - vowel u (can combine with ཱ)
+ '\u0F75', // ཱུ - long u
+ '\u0F76', // ྲྀ - vocalic r
+ '\u0F77', // ཷ - long vocalic r
+ '\u0F78', // ླྀ - vocalic l
+ '\u0F79', // ཹ - long vocalic l
+ '\u0F7A', // ེ - vowel e
+ '\u0F7B', // ཻ - ai vowel (Sanskrit)
+ '\u0F7C', // ོ - vowel o
+ '\u0F7D', // ཽ - au vowel (Sanskrit)
+ '\u0F7E', // ཾ - anusvara
+ '\u0F7F', // ཿ - visarga
+ '\u0F80', // ྀ - reversed i
+ '\u0F81', // ཱྀ - reversed long i
+ '\u0F82', // ྂ - candrabindu
+ '\u0F83', // ྃ - candrabindu with ornament
+ ]);
+
+ // Retroflex consonants (Sanskrit-specific)
+ const SKRT_RETROFLEX = new Set([
+ '\u0F4A', // ཊ - ta retroflex
+ '\u0F4B', // ཋ - tha retroflex
+ '\u0F4C', // ཌ - da retroflex
+ '\u0F4D', // ཌྷ - ddha (retroflex dha)
+ '\u0F4E', // ཎ - na retroflex
+ '\u0F65', // ཥ - ssa (retroflex sha)
+ ]);
+
+ // Subjoined aspiration mark; base + ྷ indicates a Sanskrit aspirate
+ const SKRT_ASPIRATION_MARKS = new Set([
+ '\u0FB7', // ྷ - subjoined ha (aspiration mark)
+ ]);
+
+ // Base consonants that form aspirated pairs in Sanskrit
+ const SKRT_ASPIRATABLE_BASES = new Set([
+ '\u0F42', // ག - ga
+ '\u0F4C', // ཌ - retroflex da
+ '\u0F51', // ད - da
+ '\u0F56', // བ - ba
+ '\u0F5B', // ཛ - dza
+ ]);
+
+ // Subjoined retroflex consonants
+ const SKRT_SUB_RETROFLEX = new Set([
+ '\u0F9A', // ྚ - subjoined retroflex ta
+ '\u0F9B', // ྛ - subjoined retroflex tha
+ '\u0F9C', // ྜ - subjoined retroflex da
+ '\u0F9D', // ྜྷ - subjoined retroflex dha
+ '\u0F9E', // ྞ - subjoined retroflex na
+ '\u0FA5', // ྥ - subjoined pha
+ ]);
+
+ // Special Sanskrit consonants and clusters
+ const SKRT_SPECIAL_CONS = new Set([
+ '\u0F69', // ཀྵ - kssa
+ '\u0F6B', // ཫ - kka
+ '\u0F6C', // ཬ - rra
+ '\u0FB5', // ྵ - subjoined ssa
+ '\u0FB9', // ྐྵ - subjoined kssa
+ ]);
+
+ // Tsa-phru mark (used when transliterating foreign sounds, e.g. from Chinese)
+ const TSA_PHRU = '\u0F39';
+
+ /**
+ * Check if a syllable contains Sanskrit features
+ * @param {string} syllable - Single Tibetan syllable (without tsek)
+ * @returns {boolean} True if Sanskrit features detected
+ */
+ export function isSanskritSyllable(syllable) {
+ if (!syllable) return false;
+
+ let prevChar = null;
+
+ for (let i = 0; i < syllable.length; i++) {
+ const char = syllable[i];
+
+ // Check for Sanskrit long vowels (ཱ and combinations)
+ if (char === '\u0F71') { // ཱ - the long vowel mark
+ return true;
+ }
+
+ // Check for other Sanskrit-specific vowels
+ if (SKRT_LONG_VOWELS.has(char)) {
+ // ཻ (ai) and ཽ (au) are Sanskrit-specific
+ if (char === '\u0F7B' || char === '\u0F7D') {
+ return true;
+ }
+ // Anusvara (ཾ) is Sanskrit-specific
+ if (char === '\u0F7E') {
+ return true;
+ }
+ // Visarga (ཿ) is Sanskrit-specific
+ if (char === '\u0F7F') {
+ return true;
+ }
+ // Vocalic R/L
+ if (char >= '\u0F76' && char <= '\u0F79') {
+ return true;
+ }
+ // Candrabindu marks
+ if (char === '\u0F82' || char === '\u0F83') {
+ return true;
+ }
+ // Reversed vowels
+ if (char === '\u0F80' || char === '\u0F81') {
+ return true;
+ }
+ }
+
+ // Check for retroflex consonants
+ if (SKRT_RETROFLEX.has(char)) {
+ return true;
+ }
+
+ // Check for subjoined retroflex
+ if (SKRT_SUB_RETROFLEX.has(char)) {
+ return true;
+ }
+
+ // Check for aspirated consonants (base + ྷ)
+ if (char === '\u0FB7' && prevChar && SKRT_ASPIRATABLE_BASES.has(prevChar)) {
+ return true;
+ }
+
+ // Check for special Sanskrit consonants
+ if (SKRT_SPECIAL_CONS.has(char)) {
+ return true;
+ }
+
+ // Check for tsa-phru mark
+ if (char === TSA_PHRU) {
+ return true;
+ }
+
+ // Check for subjoined ha after aspiratable consonants in subjoined form
+ if (char === '\u0FB7') {
+ // Check if previous was a subjoined aspiratable: ྒ ྜ ྡ ྦ ྫ
+ if (prevChar === '\u0F92' || // ྒ
+ prevChar === '\u0F9C' || // ྜ
+ prevChar === '\u0FA1' || // ྡ
+ prevChar === '\u0FA6' || // ྦ
+ prevChar === '\u0FAB') { // ྫ
+ return true;
+ }
+ }
+
+ prevChar = char;
+ }
+
+ return false;
+ }
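+
+ // A minimal usage sketch (illustrative, not part of the original module):
+ //   isSanskritSyllable('ཨོཾ')  // true  - contains the anusvara ཾ (U+0F7E)
+ //   isSanskritSyllable('བོད')  // false - only native Tibetan characters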
+
+ /**
+ * Check if a word (potentially multi-syllable) contains any Sanskrit syllables
+ * @param {string} word - Tibetan word (may contain tseks)
+ * @returns {boolean} True if any syllable is Sanskrit
+ */
+ export function hasSanskritSyllable(word) {
+ if (!word) return false;
+
+ // First check the whole word directly
+ if (isSanskritSyllable(word.replace(new RegExp(TSEK, 'g'), ''))) {
+ return true;
+ }
+
+ // Then check individual syllables
+ const syllables = word.trim().replace(new RegExp(TSEK + '+$'), '').split(TSEK);
+
+ for (const syl of syllables) {
+ if (syl && isSanskritSyllable(syl)) {
+ return true;
+ }
+ }
+
+ return false;
+ }
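+
+ // Sketch: multi-syllable input is split on the tsek before re-checking, e.g.
+ //   hasSanskritSyllable('ཨོཾ་ཨཱཿ་ཧཱུྃ')  // true  - ཨཱཿ carries the long ཱ and visarga ཿ
+ //   hasSanskritSyllable('བོད་སྐད')      // false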
+
+ /**
+ * Check if character categories contain Sanskrit markers
+ * @param {Map<number, number>} charCategories - Map of index to category
+ * @returns {boolean} True if Sanskrit character categories found
+ */
+ export function hasSanskritCharCategory(charCategories) {
+ for (const cat of charCategories.values()) {
+ if (
+ cat === c.SKRT_VOW ||
+ cat === c.SKRT_CONS ||
+ cat === c.SKRT_SUB_CONS ||
+ cat === c.SKRT_LONG_VOW
+ ) {
+ return true;
+ }
+ }
+ return false;
+ }
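+
+ // Sketch (c.CONS below is an assumed plain-consonant marker from constants.js):
+ //   hasSanskritCharCategory(new Map([[0, c.SKRT_CONS]]))  // true
+ //   hasSanskritCharCategory(new Map([[0, c.CONS]]))       // false for any non-Sanskrit marker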
+
+ /**
+ * Combined Sanskrit detection using both character categories and pattern matching
+ * @param {Map<number, number>} charCategories - Character categories
+ * @param {string} word - The word text
+ * @returns {boolean} True if Sanskrit detected
+ */
+ export function isSanskrit(charCategories, word) {
+ return hasSanskritCharCategory(charCategories) || hasSanskritSyllable(word);
+ }
@@ -0,0 +1,434 @@
+ /**
+ * Tibetan word tokenizer
+ * Ported from Botok's tokenize.py and wordtokenizer.py
+ */
+
+ import { TSEK, WordMarkers as w, chunkMarkerNames } from './constants.js';
+ import { TokChunks } from './chunks.js';
+ import { Trie } from './trie.js';
+ import { isSanskrit } from './sanskrit.js';
+
+ /**
+ * Token - represents a tokenized word or chunk
+ */
+ export class Token {
+ constructor() {
+ this.text = ''; // Original text
+ this.textCleaned = ''; // Cleaned text (same as text for now)
+ this.textUnaffixed = ''; // Text without affixes
+ this.start = 0; // Start position in original string
+ this.len = 0; // Length in original string
+ this.syls = []; // Syllables as strings
+ this.sylsIdx = []; // Syllable character indices
+ this.sylsStartEnd = []; // Syllable start/end positions
+ this.chunkType = ''; // Type of chunk (TEXT, PUNCT, etc.)
+ this.charTypes = []; // Character type for each char
+ this.pos = null; // Part of speech
+ this.lemma = null; // Lemma form
+ this.senses = []; // Meanings/senses
+ this.skrt = false; // Is Sanskrit
+ this.affixed = false; // Has affix
+ this.affix = false; // Is an affix
+ this.affixHost = false; // Hosts an affix
+ }
+
+ /**
+ * Get a property from the token itself, falling back to the first sense
+ */
+ get(key) {
+ if (this[key] !== undefined && this[key] !== null) {
+ return this[key];
+ }
+ if (this.senses.length > 0 && this.senses[0][key] !== undefined) {
+ return this.senses[0][key];
+ }
+ return null;
+ }
+
+ toString() {
+ let str = `text: "${this.text}"\n`;
+ str += `start: ${this.start}, len: ${this.len}\n`;
+ str += `chunkType: ${this.chunkType}\n`;
+ if (this.syls.length > 0) {
+ str += `syls: [${this.syls.map(s => `"${s}"`).join(', ')}]\n`;
+ }
+ if (this.pos) {
+ str += `pos: ${this.pos}\n`;
+ }
+ if (this.skrt) {
+ str += `skrt: true\n`;
+ }
+ return str;
+ }
+ }
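+
+ // Sketch: with the null check in get(), a token whose own `lemma` is still null
+ // falls back to its first sense, e.g. token.get('lemma') -> token.senses[0].lemma.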
+
+ /**
+ * Tokenize - core tokenization algorithm using trie
+ */
+ export class Tokenize {
+ /**
+ * @param {Trie} trie - Dictionary trie
+ */
+ constructor(trie) {
+ this.trie = trie;
+ this.preProcessed = null;
+ }
+
+ /**
+ * Tokenize preprocessed text
+ * @param {TokChunks} preProcessed - Preprocessed text chunks
+ * @returns {Token[]} Array of tokens
+ */
+ tokenize(preProcessed) {
+ this.preProcessed = preProcessed;
+ const tokens = [];
+
+ let cIdx = 0;
+ while (cIdx < preProcessed.chunks.length) {
+ let walker = cIdx;
+ const syls = [];
+ const maxMatch = [];
+ const matchData = new Map();
+ let currentNode = null;
+ let foundMaxMatch = false;
+
+ while (true) {
+ const [curSyl, chunkInfo] = preProcessed.chunks[walker];
+
+ // Chunk is a syllable
+ if (curSyl !== null) {
+ const syl = curSyl.map(i => preProcessed.bs.string[i]).join('');
+ currentNode = this.trie.walk(syl, currentNode);
+
+ if (currentNode) {
+ syls.push(walker);
+
+ if (currentNode.isMatch()) {
+ matchData.set(walker, currentNode.data);
+ maxMatch.push([...syls]);
+
+ // Check whether the match ends at the last chunk
+ if (walker + 1 === preProcessed.chunks.length) {
+ foundMaxMatch = true;
+ }
+ } else {
+ if (walker + 1 === preProcessed.chunks.length) {
+ if (maxMatch.length > 0) {
+ foundMaxMatch = true;
+ } else {
+ // OOV syllables become independent tokens
+ this._addFoundWordOrNonWord(walker, matchData, syls, tokens);
+ cIdx += syls.length;
+ break;
+ }
+ }
+ }
+ } else {
+ // Can't continue walking the trie
+ if (maxMatch.length > 0) {
+ foundMaxMatch = true;
+ } else {
+ if (syls.length > 0) {
+ cIdx = this._addFoundWordOrNonWord(walker, matchData, syls, tokens);
+ break;
+ } else {
+ // Syllable not in dictionary
+ const nonWord = [walker];
+ tokens.push(this._chunksToToken(nonWord, {}, w.NON_WORD));
+ cIdx += 1;
+ break;
+ }
+ }
+ }
+ } else {
+ // Chunk is non-syllable (punctuation, etc.)
+ if (maxMatch.length > 0) {
+ foundMaxMatch = true;
+ } else if (syls.length > 0) {
+ cIdx = this._addFoundWordOrNonWord(walker, matchData, syls, tokens);
+ if (syls.length === 1) {
+ cIdx += 1;
+ }
+ break;
+ } else {
+ // Non-syllable becomes its own token
+ tokens.push(this._chunksToToken([cIdx], {}));
+ cIdx += 1;
+ break;
+ }
+ }
+
+ if (foundMaxMatch) {
+ const lastMaxMatch = maxMatch[maxMatch.length - 1];
+ this._addFoundWordOrNonWord(
+ cIdx + lastMaxMatch.length - 1,
+ matchData,
+ lastMaxMatch,
+ tokens
+ );
+ cIdx += lastMaxMatch.length;
+ break;
+ }
+
+ walker += 1;
+ if (walker >= preProcessed.chunks.length) {
+ // Reached the end without finding more matches
+ if (maxMatch.length > 0) {
+ const lastMaxMatch = maxMatch[maxMatch.length - 1];
+ this._addFoundWordOrNonWord(
+ cIdx + lastMaxMatch.length - 1,
+ matchData,
+ lastMaxMatch,
+ tokens
+ );
+ cIdx += lastMaxMatch.length;
+ } else if (syls.length > 0) {
+ this._addFoundWordOrNonWord(walker - 1, matchData, syls, tokens);
+ cIdx += syls.length;
+ }
+ break;
+ }
+ }
+ }
+
+ this.preProcessed = null;
+ return tokens;
+ }
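+
+ // Maximal-match behaviour in a nutshell (hypothetical dictionary): if the trie
+ // holds both ཀ and ཀ་ཁ, the input ཀ་ཁ་ག is tokenized as [ཀ་ཁ] [ག]; the longest
+ // match is emitted and the walk resumes at the following chunk.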
+
+ /**
+ * Add a word or non-word token based on match data
+ * @private
+ */
+ _addFoundWordOrNonWord(cIdx, matchData, syls, tokens) {
+ if (matchData.has(cIdx)) {
+ // There is a match
+ const data = matchData.get(cIdx);
+ const ttype = (!data.senses || data.senses.length === 0 ||
+ !data.senses.some(m => m.pos)) ? w.NO_POS : null;
+ tokens.push(this._chunksToToken(syls, data, ttype));
+ } else if (matchData.size > 0) {
+ // Use the longest partial match
+ const sortedKeys = Array.from(matchData.keys()).sort((a, b) => a - b);
+ const nonMaxIdx = sortedKeys[sortedKeys.length - 1];
+ const nonMaxSyls = syls.filter(s => s <= nonMaxIdx);
+ const data = matchData.get(nonMaxIdx);
+ const ttype = (!data.senses || data.senses.length === 0 ||
+ !data.senses.some(m => m.pos)) ? w.NO_POS : null;
+ tokens.push(this._chunksToToken(nonMaxSyls, data, ttype));
+ return nonMaxIdx;
+ } else {
+ // No match - add the first syllable as a non-word
+ tokens.push(this._chunksToToken([syls[0]], {}, w.NO_POS));
+
+ // Step back so the remaining syllables are retried on the next pass
+ if (syls.length > 1) {
+ return cIdx - (syls.length - 1) - 1;
+ }
+ }
+ return cIdx;
+ }
+
+ /**
+ * Convert chunks to a token
+ * @private
+ */
+ _chunksToToken(sylIndices, data, ttype = null) {
+ const token = new Token();
+
+ if (sylIndices.length === 1) {
+ const [sylChars, chunkInfo] = this.preProcessed.chunks[sylIndices[0]];
+ const [chunkType, chunkStart, chunkLen] = chunkInfo;
+
+ token.text = this.preProcessed.bs.string.slice(chunkStart, chunkStart + chunkLen);
+ token.start = chunkStart;
+ token.len = chunkLen;
+ token.chunkType = chunkMarkerNames[chunkType] || 'UNKNOWN';
+
+ if (sylChars) {
+ token.syls = [sylChars.map(i => this.preProcessed.bs.string[i]).join('')];
+ token.sylsIdx = [sylChars.map(i => i - chunkStart)];
+ token.sylsStartEnd = [{ start: 0, end: chunkLen }];
+ }
+ } else if (sylIndices.length > 1) {
+ const firstChunk = this.preProcessed.chunks[sylIndices[0]];
+ const lastChunk = this.preProcessed.chunks[sylIndices[sylIndices.length - 1]];
+
+ const startPos = firstChunk[1][1];
+ let totalLen = 0;
+
+ token.syls = [];
+ token.sylsIdx = [];
+ token.sylsStartEnd = [];
+
+ for (const idx of sylIndices) {
+ const [sylChars, chunkInfo] = this.preProcessed.chunks[idx];
+ const [_, chunkStart, chunkLen] = chunkInfo;
+
+ totalLen += chunkLen;
+
+ if (sylChars) {
+ token.syls.push(sylChars.map(i => this.preProcessed.bs.string[i]).join(''));
+ token.sylsIdx.push(sylChars.map(i => i - startPos));
+ token.sylsStartEnd.push({
+ start: chunkStart - startPos,
+ end: chunkStart - startPos + chunkLen
+ });
+ }
+ }
+
+ token.text = this.preProcessed.bs.string.slice(startPos, startPos + totalLen);
+ token.start = startPos;
+ token.len = totalLen;
+ token.chunkType = chunkMarkerNames[lastChunk[1][0]] || 'UNKNOWN';
+ }
+
+ // Set text cleaned and unaffixed
+ token.textCleaned = token.text;
+ token.textUnaffixed = token.text;
+
+ // Copy data to token
+ if (data) {
+ if (data.senses) {
+ token.senses = [...data.senses];
+ }
+ if (data.skrt) {
+ token.skrt = data.skrt;
+ }
+ }
+
+ // Set token type in senses
+ if (ttype) {
+ const typeName = ttype === w.NO_POS ? 'NO_POS' :
+ ttype === w.NON_WORD ? 'NON_WORD' : 'WORD';
+
+ if (token.senses.length === 0) {
+ token.senses.push({ pos: typeName });
+ } else {
+ for (const sense of token.senses) {
+ if (!sense.pos) {
+ sense.pos = typeName;
+ }
+ }
+ }
+ }
+
+ // Sanskrit detection
+ if (!token.skrt && token.syls.length > 0) {
+ const charGroups = this.preProcessed.bs.exportGroups(token.start, token.len);
+ token.skrt = isSanskrit(charGroups, token.text);
+ }
+
+ // Set character types
+ token.charTypes = [];
+ for (let i = token.start; i < token.start + token.len; i++) {
+ token.charTypes.push(this.preProcessed.bs.getCategory(i));
+ }
+
+ // Set POS from the first sense if available
+ if (token.senses.length > 0 && token.senses[0].pos) {
+ token.pos = token.senses[0].pos;
+ }
+
+ return token;
+ }
+ }
+
+ /**
+ * WordTokenizer - main tokenizer class
+ */
+ export class WordTokenizer {
+ /**
+ * @param {Trie} trie - Pre-built dictionary trie
+ */
+ constructor(trie = null) {
+ this.trie = trie || new Trie();
+ this.tokenizer = new Tokenize(this.trie);
+ }
+
+ /**
+ * Load dictionary from JSON data
+ * @param {Object} dictData - Dictionary data { words: [...] }
+ */
+ loadDictionary(dictData) {
+ if (!dictData || !dictData.words) {
+ return;
+ }
+
+ for (const entry of dictData.words) {
+ const syllables = this._wordToSyllables(entry.word);
+ if (syllables && syllables.length > 0) {
+ const data = {};
+ if (entry.pos || entry.lemma || entry.sense || entry.freq) {
+ data.senses = [{
+ pos: entry.pos || null,
+ lemma: entry.lemma || null,
+ sense: entry.sense || null,
+ freq: entry.freq || null,
+ }];
+ }
+ this.trie.add(syllables, data);
+ }
+ }
+ }
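+
+ // Sketch of the expected dictionary shape (illustrative entries; only `word` is
+ // required, the other fields become the entry's first sense):
+ //   tokenizer.loadDictionary({
+ //     words: [
+ //       { word: 'བོད་སྐད', pos: 'NOUN', freq: 100 },
+ //       { word: 'ཡིན', pos: 'VERB' },
+ //     ],
+ //   });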
+
+ /**
+ * Convert a word to syllables for trie lookup
+ * @private
+ */
+ _wordToSyllables(word) {
+ // Use TokChunks to get clean syllables
+ const chunks = new TokChunks(word);
+ return chunks.getSyls();
+ }
+
+ /**
+ * Tokenize a string
+ * @param {string} text - Input text
+ * @param {Object} options - Options
+ * @param {boolean} options.spacesAsPunct - Treat spaces as punctuation
+ * @returns {Token[]} Array of tokens
+ */
+ tokenize(text, options = {}) {
+ const { spacesAsPunct = false } = options;
+
+ const preProcessed = new TokChunks(text, [], spacesAsPunct);
+ preProcessed.serveSylsToTrie();
+
+ return this.tokenizer.tokenize(preProcessed);
+ }
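+
+ // Sketch (assuming `tokenizer` is a WordTokenizer with a loaded dictionary):
+ //   const tokens = tokenizer.tokenize('བོད་སྐད་ཡིན།', { spacesAsPunct: true });
+ //   tokens.map(t => [t.text, t.pos]);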
+
+ /**
+ * Tokenize and return a simple word list
+ * @param {string} text - Input text
+ * @returns {string[]} Array of word strings
+ */
+ tokenizeToStrings(text) {
+ const tokens = this.tokenize(text);
+ return tokens.map(t => t.text);
+ }
+
+ /**
+ * Segment text into words (space-separated string)
+ * @param {string} text - Input text
+ * @returns {string} Space-separated words
+ */
+ segment(text) {
+ const tokens = this.tokenize(text);
+ const words = [];
+
+ for (const token of tokens) {
+ // Skip pure punctuation/whitespace unless it's meaningful
+ if (token.chunkType === 'TEXT' || token.chunkType === 'NUM' || token.chunkType === 'SYM') {
+ words.push(token.text.trim());
+ } else if (token.chunkType === 'PUNCT') {
+ // Include punctuation that's not just whitespace
+ const trimmed = token.text.trim();
+ if (trimmed && trimmed !== TSEK) {
+ words.push(trimmed);
+ }
+ }
+ }
+
+ return words.filter(word => word).join(' ');
+ }
+ }
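+
+ // A minimal end-to-end sketch (hypothetical dictionary entry; exact spacing of
+ // segment() output depends on how the chunker attaches the tsek):
+ //   const tk = new WordTokenizer();
+ //   tk.loadDictionary({ words: [{ word: 'བོད་སྐད', pos: 'NOUN' }] });
+ //   tk.tokenizeToStrings('བོད་སྐད་ཡིན།');  // e.g. ['བོད་སྐད་', 'ཡིན', '།']
+ //   tk.segment('བོད་སྐད་ཡིན།');           // e.g. 'བོད་སྐད་ ཡིན །'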