tibetan-word-tokenizer 1.0.0
- package/data/dictionary.json +1 -0
- package/data/dictionary.pretty.json +123406 -0
- package/package.json +38 -0
- package/src/char-categories.js +349 -0
- package/src/chunks.js +516 -0
- package/src/constants.js +102 -0
- package/src/index.js +68 -0
- package/src/sanskrit.js +228 -0
- package/src/tokenizer.js +434 -0
- package/src/trie.js +263 -0
package/src/sanskrit.js
ADDED
@@ -0,0 +1,228 @@
/**
 * Sanskrit detection for Tibetan text
 * Ported from Botok's has_skrt_syl.py
 *
 * Detects Sanskrit syllables in Tibetan script based on:
 * 1. Sanskrit-specific vowels (long vowels, vocalic R/L)
 * 2. Sanskrit consonants (retroflex, aspirated)
 * 3. Invalid Tibetan consonant clusters that are valid in Sanskrit
 */

import { TSEK } from './constants.js';
import { CharMarkers as c } from './constants.js';

/**
 * Sanskrit-specific characters in Tibetan script
 */

// Sanskrit long vowels and special vowel marks
const SKRT_LONG_VOWELS = new Set([
  '\u0F71', // ཱ - long a
  '\u0F72', // ི - vowel i (can combine with ཱ)
  '\u0F73', // ཱི - long i
  '\u0F74', // ུ - vowel u (can combine with ཱ)
  '\u0F75', // ཱུ - long u
  '\u0F76', // ྲྀ - vocalic r
  '\u0F77', // ཷ - long vocalic r
  '\u0F78', // ླྀ - vocalic l
  '\u0F79', // ཹ - long vocalic l
  '\u0F7A', // ེ - vowel e
  '\u0F7B', // ཻ - ai vowel (Sanskrit)
  '\u0F7C', // ོ - vowel o
  '\u0F7D', // ཽ - au vowel (Sanskrit)
  '\u0F7E', // ཾ - anusvara
  '\u0F7F', // ཿ - visarga
  '\u0F80', // ྀ - reversed i
  '\u0F81', // ཱྀ - reversed long i
  '\u0F82', // ྂ - candrabindu
  '\u0F83', // ྃ - candrabindu with ornament
]);

// Retroflex consonants (Sanskrit-specific)
const SKRT_RETROFLEX = new Set([
  '\u0F4A', // ཊ - ta retroflex
  '\u0F4B', // ཋ - tha retroflex
  '\u0F4C', // ཌ - da retroflex
  '\u0F4D', // ཌྷ - dda
  '\u0F4E', // ཎ - na retroflex
  '\u0F65', // ཥ - ssa (retroflex sha)
]);

// Aspirated consonants subjoined forms (these indicate Sanskrit)
const SKRT_ASPIRATION_MARKS = new Set([
  '\u0FB7', // ྷ - subjoined ha (aspiration mark)
]);

// Base consonants that form aspirated pairs in Sanskrit
const SKRT_ASPIRATABLE_BASES = new Set([
  '\u0F42', // ག - ga
  '\u0F4C', // ཌ - retroflex da
  '\u0F51', // ད - da
  '\u0F56', // བ - ba
  '\u0F5B', // ཛ - dza
]);

// Subjoined retroflex consonants
const SKRT_SUB_RETROFLEX = new Set([
  '\u0F9A', // ྚ - subjoined retroflex ta
  '\u0F9B', // ྛ - subjoined retroflex tha
  '\u0F9C', // ྜ - subjoined retroflex da
  '\u0F9D', // ྜྷ - subjoined retroflex dha
  '\u0F9E', // ྞ - subjoined retroflex na
  '\u0FA5', // ྥ - subjoined sha
]);

// Special Sanskrit consonants and clusters
const SKRT_SPECIAL_CONS = new Set([
  '\u0F69', // ཀྵ - kssa
  '\u0F6B', // ཫ - kka
  '\u0F6C', // ཬ - rra
  '\u0FB5', // ྵ - subjoined ssa
  '\u0FB9', // ྐྵ - subjoined kssa
]);

// Tsa-phru mark (Chinese transliteration)
const TSA_PHRU = '\u0F39';

/**
 * Check if a syllable contains Sanskrit features
 * @param {string} syllable - Single Tibetan syllable (without tsek)
 * @returns {boolean} True if Sanskrit features detected
 */
export function isSanskritSyllable(syllable) {
  if (!syllable) return false;

  let prevChar = null;

  for (let i = 0; i < syllable.length; i++) {
    const char = syllable[i];

    // Check for Sanskrit long vowels (ཱ and combinations)
    if (char === '\u0F71') { // ཱ - the long vowel mark
      return true;
    }

    // Check for other Sanskrit-specific vowels
    if (SKRT_LONG_VOWELS.has(char)) {
      // ཻ (ai) and ཽ (au) are Sanskrit-specific
      if (char === '\u0F7B' || char === '\u0F7D') {
        return true;
      }
      // Anusvara (ཾ) is Sanskrit-specific
      if (char === '\u0F7E') {
        return true;
      }
      // Visarga (ཿ) is Sanskrit-specific
      if (char === '\u0F7F') {
        return true;
      }
      // Vocalic R/L
      if (char >= '\u0F76' && char <= '\u0F79') {
        return true;
      }
      // Candrabindu marks
      if (char === '\u0F82' || char === '\u0F83') {
        return true;
      }
      // Reversed vowels
      if (char === '\u0F80' || char === '\u0F81') {
        return true;
      }
    }

    // Check for retroflex consonants
    if (SKRT_RETROFLEX.has(char)) {
      return true;
    }

    // Check for subjoined retroflex
    if (SKRT_SUB_RETROFLEX.has(char)) {
      return true;
    }

    // Check for aspirated consonants (base + ྷ)
    if (char === '\u0FB7' && prevChar && SKRT_ASPIRATABLE_BASES.has(prevChar)) {
      return true;
    }

    // Check for special Sanskrit consonants
    if (SKRT_SPECIAL_CONS.has(char)) {
      return true;
    }

    // Check for tsa-phru mark
    if (char === TSA_PHRU) {
      return true;
    }

    // Check for subjoined ha after aspiratable consonants in subjoined form
    if (char === '\u0FB7') {
      // Check if previous was a subjoined aspiratable: ྒ ྜ ྡ ྦ ྫ
      if (prevChar === '\u0F92' || // ྒ
          prevChar === '\u0F9C' || // ྜ
          prevChar === '\u0FA1' || // ྡ
          prevChar === '\u0FA6' || // ྦ
          prevChar === '\u0FAB') { // ྫ
        return true;
      }
    }

    prevChar = char;
  }

  return false;
}

/**
 * Check if a word (potentially multi-syllable) contains any Sanskrit syllables
 * @param {string} word - Tibetan word (may contain tseks)
 * @returns {boolean} True if any syllable is Sanskrit
 */
export function hasSanskritSyllable(word) {
  if (!word) return false;

  // First check the whole word directly
  if (isSanskritSyllable(word.replace(new RegExp(TSEK, 'g'), ''))) {
    return true;
  }

  // Then check individual syllables
  const syllables = word.trim().replace(new RegExp(TSEK + '+$'), '').split(TSEK);

  for (const syl of syllables) {
    if (syl && isSanskritSyllable(syl)) {
      return true;
    }
  }

  return false;
}

/**
 * Check if character categories contain Sanskrit markers
 * @param {Map<number, number>} charCategories - Map of index to category
 * @returns {boolean} True if Sanskrit character categories found
 */
export function hasSanskritCharCategory(charCategories) {
  for (const cat of charCategories.values()) {
    if (
      cat === c.SKRT_VOW ||
      cat === c.SKRT_CONS ||
      cat === c.SKRT_SUB_CONS ||
      cat === c.SKRT_LONG_VOW
    ) {
      return true;
    }
  }
  return false;
}

/**
 * Combined Sanskrit detection using both character categories and pattern matching
 * @param {Map<number, number>} charCategories - Character categories
 * @param {string} word - The word text
 * @returns {boolean} True if Sanskrit detected
 */
export function isSanskrit(charCategories, word) {
  return hasSanskritCharCategory(charCategories) || hasSanskritSyllable(word);
}
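A minimal usage sketch of the exported detection helpers. The Tibetan sample strings and expected results are illustrative (traced from the checks above), not taken from the package's tests:

import { isSanskritSyllable, hasSanskritSyllable } from './src/sanskrit.js';

// ཨོཾ ("om") carries the anusvara ཾ (U+0F7E), a Sanskrit-only mark
isSanskritSyllable('\u0F68\u0F7C\u0F7E'); // true

// བོད ("bod") uses only native Tibetan letters and vowel signs
isSanskritSyllable('\u0F56\u0F7C\u0F51'); // false

// Whole words are accepted too; the word is checked as a whole and then
// syllable by syllable (assuming TSEK in constants.js is the tsheg ་, U+0F0B)
hasSanskritSyllable('\u0F56\u0F7C\u0F51\u0F0B\u0F68\u0F7C\u0F7E'); // true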
package/src/tokenizer.js
ADDED
@@ -0,0 +1,434 @@
/**
 * Tibetan word tokenizer
 * Ported from Botok's tokenize.py and wordtokenizer.py
 */

import { TSEK, NAMCHE, ChunkMarkers as u, WordMarkers as w, chunkMarkerNames } from './constants.js';
import { TokChunks } from './chunks.js';
import { Trie } from './trie.js';
import { isSanskrit } from './sanskrit.js';

/**
 * Token - represents a tokenized word or chunk
 */
export class Token {
  constructor() {
    this.text = '';          // Original text
    this.textCleaned = '';   // Cleaned text (same as text for now)
    this.textUnaffixed = ''; // Text without affixes
    this.start = 0;          // Start position in original string
    this.len = 0;            // Length in original string
    this.syls = [];          // Syllables as strings
    this.sylsIdx = [];       // Syllable character indices
    this.sylsStartEnd = [];  // Syllable start/end positions
    this.chunkType = '';     // Type of chunk (TEXT, PUNCT, etc.)
    this.charTypes = [];     // Character type for each char
    this.pos = null;         // Part of speech
    this.lemma = null;       // Lemma form
    this.senses = [];        // Meanings/senses
    this.skrt = false;       // Is Sanskrit
    this.affixed = false;    // Has affix
    this.affix = false;      // Is an affix
    this.affixHost = false;  // Hosts an affix
  }

  /**
   * Get property from token or senses
   */
  get(key) {
    if (this[key] !== undefined) {
      return this[key];
    }
    if (this.senses.length > 0 && this.senses[0][key] !== undefined) {
      return this.senses[0][key];
    }
    return null;
  }

  toString() {
    let str = `text: "${this.text}"\n`;
    str += `start: ${this.start}, len: ${this.len}\n`;
    str += `chunkType: ${this.chunkType}\n`;
    if (this.syls.length > 0) {
      str += `syls: [${this.syls.map(s => `"${s}"`).join(', ')}]\n`;
    }
    if (this.pos) {
      str += `pos: ${this.pos}\n`;
    }
    if (this.skrt) {
      str += `skrt: true\n`;
    }
    return str;
  }
}

/**
 * Tokenize - core tokenization algorithm using trie
 */
export class Tokenize {
  /**
   * @param {Trie} trie - Dictionary trie
   */
  constructor(trie) {
    this.trie = trie;
    this.preProcessed = null;
  }

  /**
   * Tokenize preprocessed text
   * @param {TokChunks} preProcessed - Preprocessed text chunks
   * @returns {Token[]} Array of tokens
   */
  tokenize(preProcessed) {
    this.preProcessed = preProcessed;
    const tokens = [];

    let cIdx = 0;
    while (cIdx < preProcessed.chunks.length) {
      let walker = cIdx;
      const syls = [];
      const maxMatch = [];
      const matchData = new Map();
      let currentNode = null;
      let foundMaxMatch = false;

      while (true) {
        const [curSyl, chunkInfo] = preProcessed.chunks[walker];

        // Chunk is a syllable
        if (curSyl !== null) {
          const syl = curSyl.map(i => preProcessed.bs.string[i]).join('');
          currentNode = this.trie.walk(syl, currentNode);

          if (currentNode) {
            syls.push(walker);

            if (currentNode.isMatch()) {
              matchData.set(walker, currentNode.data);
              maxMatch.push([...syls]);

              // Check if matched is last chunk
              if (walker + 1 === preProcessed.chunks.length) {
                foundMaxMatch = true;
              }
            } else {
              if (walker + 1 === preProcessed.chunks.length) {
                if (maxMatch.length > 0) {
                  foundMaxMatch = true;
                } else {
                  // OOV syllables become independent tokens
                  this._addFoundWordOrNonWord(walker, matchData, syls, tokens);
                  cIdx += syls.length;
                  break;
                }
              }
            }
          } else {
            // Can't continue walking
            if (maxMatch.length > 0) {
              foundMaxMatch = true;
            } else {
              if (syls.length > 0) {
                cIdx = this._addFoundWordOrNonWord(walker, matchData, syls, tokens);
                break;
              } else {
                // Syllable not in dictionary
                const nonWord = [walker];
                tokens.push(this._chunksToToken(nonWord, {}, w.NON_WORD));
                cIdx += 1;
                break;
              }
            }
          }
        } else {
          // Chunk is non-syllable (punctuation, etc.)
          if (maxMatch.length > 0) {
            foundMaxMatch = true;
          } else if (syls.length > 0) {
            cIdx = this._addFoundWordOrNonWord(walker, matchData, syls, tokens);
            if (syls.length === 1) {
              cIdx += 1;
            }
            break;
          } else {
            // Non-syllable becomes its own token
            tokens.push(this._chunksToToken([cIdx], {}));
            cIdx += 1;
            break;
          }
        }

        if (foundMaxMatch) {
          const lastMaxMatch = maxMatch[maxMatch.length - 1];
          this._addFoundWordOrNonWord(
            cIdx + lastMaxMatch.length - 1,
            matchData,
            lastMaxMatch,
            tokens
          );
          cIdx += lastMaxMatch.length;
          break;
        }

        walker += 1;
        if (walker >= preProcessed.chunks.length) {
          // Reached end without finding more matches
          if (maxMatch.length > 0) {
            const lastMaxMatch = maxMatch[maxMatch.length - 1];
            this._addFoundWordOrNonWord(
              cIdx + lastMaxMatch.length - 1,
              matchData,
              lastMaxMatch,
              tokens
            );
            cIdx += lastMaxMatch.length;
          } else if (syls.length > 0) {
            this._addFoundWordOrNonWord(walker - 1, matchData, syls, tokens);
            cIdx += syls.length;
          }
          break;
        }
      }
    }

    this.preProcessed = null;
    return tokens;
  }

  /**
   * Add word or non-word token based on match data
   * @private
   */
  _addFoundWordOrNonWord(cIdx, matchData, syls, tokens) {
    if (matchData.has(cIdx)) {
      // There is a match
      const data = matchData.get(cIdx);
      const ttype = (!data.senses || data.senses.length === 0 ||
                     !data.senses.some(m => m.pos)) ? w.NO_POS : null;
      tokens.push(this._chunksToToken(syls, data, ttype));
    } else if (matchData.size > 0) {
      // Use longest partial match
      const sortedKeys = Array.from(matchData.keys()).sort((a, b) => a - b);
      const nonMaxIdx = sortedKeys[sortedKeys.length - 1];
      const nonMaxSyls = syls.filter(s => s <= nonMaxIdx);
      const data = matchData.get(nonMaxIdx);
      const ttype = (!data.senses || data.senses.length === 0 ||
                     !data.senses.some(m => m.pos)) ? w.NO_POS : null;
      tokens.push(this._chunksToToken(nonMaxSyls, data, ttype));
      return nonMaxIdx;
    } else {
      // No match - add first syllable as non-word
      tokens.push(this._chunksToToken([syls[0]], {}, w.NO_POS));

      // Decrement for retry
      if (syls.length > 1) {
        return cIdx - (syls.length - 1) - 1;
      }
    }
    return cIdx;
  }

  /**
   * Convert chunks to a token
   * @private
   */
  _chunksToToken(sylIndices, data, ttype = null) {
    const token = new Token();

    if (sylIndices.length === 1) {
      const [sylChars, chunkInfo] = this.preProcessed.chunks[sylIndices[0]];
      const [chunkType, chunkStart, chunkLen] = chunkInfo;

      token.text = this.preProcessed.bs.string.slice(chunkStart, chunkStart + chunkLen);
      token.start = chunkStart;
      token.len = chunkLen;
      token.chunkType = chunkMarkerNames[chunkType] || 'UNKNOWN';

      if (sylChars) {
        token.syls = [sylChars.map(i => this.preProcessed.bs.string[i]).join('')];
        token.sylsIdx = [sylChars.map(i => i - chunkStart)];
        token.sylsStartEnd = [{ start: 0, end: chunkLen }];
      }
    } else if (sylIndices.length > 1) {
      const firstChunk = this.preProcessed.chunks[sylIndices[0]];
      const lastChunk = this.preProcessed.chunks[sylIndices[sylIndices.length - 1]];

      const startPos = firstChunk[1][1];
      let totalLen = 0;

      token.syls = [];
      token.sylsIdx = [];
      token.sylsStartEnd = [];

      for (const idx of sylIndices) {
        const [sylChars, chunkInfo] = this.preProcessed.chunks[idx];
        const [_, chunkStart, chunkLen] = chunkInfo;

        totalLen += chunkLen;

        if (sylChars) {
          token.syls.push(sylChars.map(i => this.preProcessed.bs.string[i]).join(''));
          token.sylsIdx.push(sylChars.map(i => i - startPos));
          token.sylsStartEnd.push({
            start: chunkStart - startPos,
            end: chunkStart - startPos + chunkLen
          });
        }
      }

      token.text = this.preProcessed.bs.string.slice(startPos, startPos + totalLen);
      token.start = startPos;
      token.len = totalLen;
      token.chunkType = chunkMarkerNames[lastChunk[1][0]] || 'UNKNOWN';
    }

    // Set text cleaned and unaffixed
    token.textCleaned = token.text;
    token.textUnaffixed = token.text;

    // Copy data to token
    if (data) {
      if (data.senses) {
        token.senses = [...data.senses];
      }
      if (data.skrt) {
        token.skrt = data.skrt;
      }
    }

    // Set token type in senses
    if (ttype) {
      const typeName = ttype === w.NO_POS ? 'NO_POS' :
                       ttype === w.NON_WORD ? 'NON_WORD' : 'WORD';

      if (token.senses.length === 0) {
        token.senses.push({ pos: typeName });
      } else {
        for (const sense of token.senses) {
          if (!sense.pos) {
            sense.pos = typeName;
          }
        }
      }
    }

    // Sanskrit detection
    if (!token.skrt && token.syls.length > 0) {
      const charGroups = this.preProcessed.bs.exportGroups(token.start, token.len);
      token.skrt = isSanskrit(charGroups, token.text);
    }

    // Set character types
    token.charTypes = [];
    for (let i = token.start; i < token.start + token.len; i++) {
      token.charTypes.push(this.preProcessed.bs.getCategory(i));
    }

    // Set POS from first sense if available
    if (token.senses.length > 0 && token.senses[0].pos) {
      token.pos = token.senses[0].pos;
    }

    return token;
  }
}

/**
 * WordTokenizer - main tokenizer class
 */
export class WordTokenizer {
  /**
   * @param {Trie} trie - Pre-built dictionary trie
   */
  constructor(trie = null) {
    this.trie = trie || new Trie();
    this.tokenizer = new Tokenize(this.trie);
  }

  /**
   * Load dictionary from JSON data
   * @param {Object} dictData - Dictionary data { words: [...] }
   */
  loadDictionary(dictData) {
    if (!dictData || !dictData.words) {
      return;
    }

    for (const entry of dictData.words) {
      const syllables = this._wordToSyllables(entry.word);
      if (syllables && syllables.length > 0) {
        const data = {};
        if (entry.pos || entry.lemma || entry.sense || entry.freq) {
          data.senses = [{
            pos: entry.pos || null,
            lemma: entry.lemma || null,
            sense: entry.sense || null,
            freq: entry.freq || null,
          }];
        }
        this.trie.add(syllables, data);
      }
    }
  }

  /**
   * Convert word to syllables for trie lookup
   * @private
   */
  _wordToSyllables(word) {
    // Use TokChunks to get clean syllables
    const chunks = new TokChunks(word);
    return chunks.getSyls();
  }

  /**
   * Tokenize a string
   * @param {string} text - Input text
   * @param {Object} options - Options
   * @param {boolean} options.spacesAsPunct - Treat spaces as punctuation
   * @returns {Token[]} Array of tokens
   */
  tokenize(text, options = {}) {
    const { spacesAsPunct = false } = options;

    const preProcessed = new TokChunks(text, [], spacesAsPunct);
    preProcessed.serveSylsToTrie();

    return this.tokenizer.tokenize(preProcessed);
  }

  /**
   * Tokenize and return simple word list
   * @param {string} text - Input text
   * @returns {string[]} Array of word strings
   */
  tokenizeToStrings(text) {
    const tokens = this.tokenize(text);
    return tokens.map(t => t.text);
  }

  /**
   * Segment text into words (space-separated string)
   * @param {string} text - Input text
   * @returns {string} Space-separated words
   */
  segment(text) {
    const tokens = this.tokenize(text);
    const words = [];

    for (const token of tokens) {
      // Skip pure punctuation/whitespace unless it's meaningful
      if (token.chunkType === 'TEXT' || token.chunkType === 'NUM' || token.chunkType === 'SYM') {
        words.push(token.text.trim());
      } else if (token.chunkType === 'PUNCT') {
        // Include punctuation that's not just whitespace
        const trimmed = token.text.trim();
        if (trimmed && trimmed !== TSEK) {
          words.push(trimmed);
        }
      }
    }

    return words.filter(w => w).join(' ');
  }
}
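A minimal end-to-end sketch of the WordTokenizer API, assuming a small inline dictionary in the { words: [...] } shape that loadDictionary expects. The dictionary entries and the sample sentence are illustrative only; the published package ships its own data/dictionary.json:

import { WordTokenizer } from './src/tokenizer.js';

const tokenizer = new WordTokenizer();

// Each entry may carry word, pos, lemma, sense and freq
tokenizer.loadDictionary({
  words: [
    { word: 'བོད', pos: 'PROPN' },     // illustrative entry: "Tibet"
    { word: 'སྐད', pos: 'NOUN' },      // illustrative entry: "language"
    { word: 'བོད་སྐད', pos: 'NOUN' },  // illustrative entry: "Tibetan language"
  ],
});

// Full Token objects (text, start, len, syls, pos, skrt, ...)
const tokens = tokenizer.tokenize('བོད་སྐད་ཡིག');
console.log(tokens.map(t => `${t.text} [${t.pos}]`).join(' '));

// Maximal matching should keep བོད་སྐད as one token rather than two
console.log(tokenizer.tokenizeToStrings('བོད་སྐད་ཡིག'));

// Space-separated segmentation, dropping tsek-only punctuation
console.log(tokenizer.segment('བོད་སྐད་ཡིག'));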