npm - tokana - Versions diffs - 0.1.0 - Mend

tokana 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

package/LICENSE +21 -0
package/README.md +164 -0
package/dist/cli.js +39 -0
package/dist/cli.js.map +1 -0
package/dist/compile-VQJF62SJ.js +741 -0
package/dist/compile-VQJF62SJ.js.map +1 -0
package/dist/index.cjs +1478 -0
package/dist/index.cjs.map +1 -0
package/dist/index.d.cts +566 -0
package/dist/index.d.ts +566 -0
package/dist/index.js +1432 -0
package/dist/index.js.map +1 -0
package/dist/info-OVE32SWZ.js +54 -0
package/dist/info-OVE32SWZ.js.map +1 -0
package/package.json +63 -0

package/dist/index.d.ts ADDED Viewed

@@ -0,0 +1,566 @@
+/**
+ * Token type definitions for different dictionary formats.
+ */
+/**
+ * Base token properties common to all formats.
+ * The format-specific token interfaces in this file extend this shape with their own feature fields.
+ */
+interface BaseToken {
+    /** Word ID in the dictionary */
+    wordId: number;
+    /** Token type; one of the `TokenType` literals: "KNOWN", "UNKNOWN", "BOS", "EOS" */
+    type: TokenType;
+    /** Surface form (the actual text) */
+    surface: string;
+    /**
+     * Start position in the input string.
+     * NOTE(review): the unit (UTF-16 code units vs. code points) is not stated in this
+     * declaration — confirm against the implementation before slicing the input with it.
+     */
+    offset: number;
+    /** Length of the surface form (NOTE(review): same unit question as `offset` — confirm) */
+    length: number;
+    /** Cumulative cost from Viterbi */
+    cost: number;
+}
+type TokenType = "KNOWN" | "UNKNOWN" | "BOS" | "EOS";
+/**
+ * IPAdic token: the `BaseToken` fields plus the nine IPAdic feature fields declared below.
+ * NOTE(review): how an absent feature is represented (e.g. "*" vs. an empty string) is not
+ * visible in this declaration — confirm in the format handler.
+ */
+interface IpadicToken extends BaseToken {
+    /** Part-of-speech (品詞) */
+    pos: string;
+    /** POS detail 1 (品詞細分類1) */
+    posDetail1: string;
+    /** POS detail 2 (品詞細分類2) */
+    posDetail2: string;
+    /** POS detail 3 (品詞細分類3) */
+    posDetail3: string;
+    /** Conjugation type (活用型) */
+    conjugationType: string;
+    /** Conjugation form (活用形) */
+    conjugationForm: string;
+    /** Base (dictionary) form (原形) */
+    baseForm: string;
+    /** Reading (読み) */
+    reading: string;
+    /** Pronunciation (発音) */
+    pronunciation: string;
+}
+/**
+ * UniDic token: the `BaseToken` fields plus the seventeen UniDic feature fields declared below.
+ * NOTE(review): UniDic's own feature column order is orth, pron, orthBase, pronBase, whereas the
+ * properties here are declared orth, orthBase, pron, pronBase — confirm that the parser maps
+ * features by dictionary column order and not by this declaration order.
+ */
+interface UnidicToken extends BaseToken {
+    /** Part-of-speech 1 */
+    pos1: string;
+    /** Part-of-speech 2 */
+    pos2: string;
+    /** Part-of-speech 3 */
+    pos3: string;
+    /** Part-of-speech 4 */
+    pos4: string;
+    /** Conjugation type (活用型) */
+    cType: string;
+    /** Conjugation form (活用形) */
+    cForm: string;
+    /** Lemma reading form (語彙素読み) */
+    lForm: string;
+    /** Lemma (語彙素) */
+    lemma: string;
+    /** Orthographic form as it appears in the text (書字形出現形) */
+    orth: string;
+    /** Orthographic base form (書字形基本形) */
+    orthBase: string;
+    /** Pronunciation as it appears in the text (発音形出現形) */
+    pron: string;
+    /** Pronunciation base form (発音形基本形) */
+    pronBase: string;
+    /** Word origin (語種) */
+    goshu: string;
+    /** Word-initial sound change type (語頭変化型) */
+    iType: string;
+    /** Word-initial sound change form (語頭変化形) */
+    iForm: string;
+    /** Word-final sound change type (語末変化型) */
+    fType: string;
+    /** Word-final sound change form (語末変化形) */
+    fForm: string;
+}
+/**
+ * Union of all format-specific token types.
+ * Note: the members share no literal discriminant property (`type` is the same `TokenType` in both),
+ * so narrow by property presence, e.g. `"pos" in token` for IPAdic vs. `"pos1" in token` for UniDic.
+ */
+type Token = IpadicToken | UnidicToken;
+/**
+ * Double Array Trie for efficient prefix matching.
+ * Supports exact lookup and common prefix search.
+ *
+ * NOTE: the description above refers to the `DoubleArray` class declared right after this
+ * interface. `KeyValue` itself is the element type returned by `DoubleArray.commonPrefixSearch`:
+ * one entry per key that matched as a prefix.
+ */
+interface KeyValue {
+    /** The value associated with the key */
+    value: number;
+    /** Length of the matched key (in code units) */
+    length: number;
+}
+declare class DoubleArray {
+    private base;
+    private check;
+    constructor(base?: Int32Array, check?: Int32Array);
+    /**
+     * Build a DoubleArray from sorted string keys.
+     * Keys must be sorted lexicographically.
+     *
+     * @param keys Key/value pairs to index.
+     * @returns A new DoubleArray containing the given keys.
+     */
+    static build(keys: {
+        key: string;
+        value: number;
+    }[]): DoubleArray;
+    /**
+     * Exact lookup of a key string.
+     * Returns the associated value, or -1 if not found.
+     * NOTE(review): because -1 is the "not found" sentinel, stored values are presumably
+     * expected to be non-negative — confirm.
+     */
+    lookup(key: string): number;
+    /**
+     * Find all keys that are prefixes of the given string.
+     * Returns one `KeyValue` per matching key, carrying its value and matched length.
+     */
+    commonPrefixSearch(key: string): KeyValue[];
+    /**
+     * Check if any key exists that starts with the given prefix.
+     * Note that this is documented as a prefix test; use `lookup` for an exact match.
+     */
+    contain(key: string): boolean;
+    /**
+     * Get the raw base array (for serialization).
+     * NOTE(review): whether this is a copy or the live internal array is not visible here — confirm before mutating.
+     */
+    getBase(): Int32Array;
+    /**
+     * Get the raw check array (for serialization).
+     * NOTE(review): whether this is a copy or the live internal array is not visible here — confirm before mutating.
+     */
+    getCheck(): Int32Array;
+    /**
+     * Create a DoubleArray from raw base and check arrays (for deserialization).
+     * The arguments have the same types that `getBase` and `getCheck` return.
+     */
+    static fromArrays(base: Int32Array, check: Int32Array): DoubleArray;
+}
+/**
+ * Connection cost matrix.
+ * Stores the cost of transitioning from one morpheme to the next.
+ * Matrix is indexed by [right_id][left_id].
+ *
+ * NOTE(review): the accessors below name their indices `forwardId`/`backwardId`; how those
+ * correspond to the right_id/left_id indexing stated above is not visible in this
+ * declaration — confirm the argument order before calling `get`/`put`.
+ */
+declare class ConnectionCosts {
+    private readonly forwardSize;
+    private readonly backwardSize;
+    private readonly costs;
+    constructor(forwardSize: number, backwardSize: number);
+    put(forwardId: number, backwardId: number, cost: number): void;
+    get(forwardId: number, backwardId: number): number;
+    getForwardSize(): number;
+    getBackwardSize(): number;
+    /**
+     * Load from a raw Int16Array buffer.
+     * Format: [forwardSize (as int16), backwardSize (as int16), ...costs]
+     * NOTE(review): both dimensions are stored as int16, so each presumably has to fit
+     * in the int16 range — TODO confirm how larger matrices are handled.
+     */
+    static fromBuffer(buffer: Int16Array): ConnectionCosts;
+    /**
+     * Serialize to Int16Array buffer.
+     * Presumably emits the same layout that `fromBuffer` accepts — confirm.
+     */
+    toBuffer(): Int16Array;
+}
+/**
+ * Growable byte buffer for binary data manipulation.
+ * Used for reading/writing dictionary binary data.
+ *
+ * The accessors come in three families, distinguishable by signature:
+ * - `getXxx(pos)` take an explicit position argument;
+ * - `putXxx(value)` and `readXxx()` take no position — presumably they operate at, and
+ *   advance, the internal cursor exposed by `getPosition`/`setPosition` (TODO confirm).
+ *
+ * The constructor accepts either a number or an existing `Uint8Array`
+ * (presumably an initial capacity vs. initial contents — confirm).
+ * NOTE(review): byte order and the string encoding used by `readString`/`putString`
+ * are not visible in this declaration — confirm before relying on them.
+ */
+declare class ByteBuffer {
+    private buffer;
+    private view;
+    private position;
+    constructor(arg?: number | Uint8Array);
+    private ensureCapacity;
+    size(): number;
+    getPosition(): number;
+    setPosition(pos: number): void;
+    getInt8(pos: number): number;
+    getInt16(pos: number): number;
+    getInt32(pos: number): number;
+    getUint8(pos: number): number;
+    getUint16(pos: number): number;
+    getUint32(pos: number): number;
+    putInt8(value: number): void;
+    putInt16(value: number): void;
+    putInt32(value: number): void;
+    putUint8(value: number): void;
+    putUint16(value: number): void;
+    putUint32(value: number): void;
+    readInt8(): number;
+    readInt16(): number;
+    readInt32(): number;
+    readUint8(): number;
+    readUint16(): number;
+    readUint32(): number;
+    readString(length: number): string;
+    putString(str: string): void;
+    shrink(): Uint8Array;
+    toUint8Array(): Uint8Array;
+    getArrayBuffer(): ArrayBuffer;
+    getDataView(): DataView;
+}
+/**
+ * Token information dictionary.
+ * Stores morpheme features (POS, reading, pronunciation, etc.)
+ * indexed by word ID.
+ *
+ * The `load*` methods return `this`, so they can be chained.
+ */
+declare class TokenInfoDictionary {
+    /** Feature data for each word, stored as concatenated strings */
+    private dictionary;
+    /** Mapping from word ID to position in features buffer */
+    private targetMap;
+    constructor();
+    buildDictionary(entries: {
+        wordId: number;
+        leftId: number;
+        rightId: number;
+        cost: number;
+    }[]): {
+        wordId: number;
+        leftId: number;
+        rightId: number;
+        cost: number;
+    }[];
+    /**
+     * Add a token entry to the dictionary.
+     *
+     * @param wordId Word ID of the entry.
+     * @param leftId Left context ID.
+     * @param rightId Right context ID.
+     * @param cost Word cost.
+     * @param features Feature string for the word.
+     */
+    put(wordId: number, leftId: number, rightId: number, cost: number, features: string): void;
+    /**
+     * Add a mapping from word ID to buffer position.
+     */
+    addMapping(wordId: number, position: number): void;
+    /**
+     * Get left context ID for a word.
+     */
+    getLeftId(wordId: number): number;
+    /**
+     * Get right context ID for a word.
+     */
+    getRightId(wordId: number): number;
+    /**
+     * Get word cost for a word.
+     */
+    getWordCost(wordId: number): number;
+    /**
+     * Get feature string for a word.
+     */
+    getFeatures(wordId: number): string;
+    /**
+     * Load the dictionary data buffer.
+     * Returns `this` for chaining.
+     */
+    loadDictionary(data: Uint8Array): this;
+    /**
+     * Load the position data buffer.
+     * Returns `this` for chaining.
+     * NOTE(review): the parameter is underscore-prefixed, which suggests the data may be
+     * ignored by the implementation — confirm.
+     */
+    loadPosVector(_data: Uint8Array): this;
+    /**
+     * Load the target map buffer.
+     * Format: sequence of int32 pairs [wordId, position]
+     * Returns `this` for chaining.
+     */
+    loadTargetMap(data: Uint8Array): this;
+    getDictionary(): ByteBuffer;
+    getTargetMap(): Record<number, number>;
+    getTargetMapData(): Uint8Array;
+}
+/**
+ * Dictionary-related type definitions.
+ */
+/**
+ * Character class definition entry.
+ * Describes how characters of one class are treated during unknown word processing.
+ */
+interface CharacterClass {
+    /** Class name (e.g., DEFAULT, SPACE, KANJI) */
+    name: string;
+    /**
+     * Whether to invoke unknown word processing.
+     * NOTE(review): in MeCab's char.def this flag means "always invoke, even when a known
+     * word matches"; presumably the same semantics apply here — confirm.
+     */
+    invoke: boolean;
+    /** Whether to group consecutive same-class chars */
+    group: boolean;
+    /** Maximum unknown word length (0 = unlimited) */
+    length: number;
+}
+/**
+ * Character type definition for unknown word processing.
+ * Maps Unicode characters to character classes used in MeCab's char.def.
+ *
+ * The per-character accessors below (`isInvoke`, `isGroup`, `getMaxLength`) take a character
+ * code, not a class index.
+ */
+declare class CharacterDefinition {
+    /** Character class definitions */
+    private characterClasses;
+    /** Mapping from character code to class index */
+    private characterCategoryMap;
+    /** Mapping from character code to compatible class bitmask */
+    private compatibleCategoryMap;
+    constructor();
+    addCharacterClass(charClass: CharacterClass): number;
+    setCharacterCategory(start: number, end: number, classId: number, compatibleClasses: number[]): void;
+    /**
+     * Lookup the character class for a given character code.
+     */
+    lookup(charCode: number): CharacterClass;
+    /**
+     * Get the character class index for a character code.
+     */
+    getCharacterClass(charCode: number): number;
+    /**
+     * Check if two character codes are in compatible classes.
+     */
+    isCompatible(charCode1: number, charCode2: number): boolean;
+    /**
+     * Get the invoke flag of the character class that `charCode` belongs to.
+     */
+    isInvoke(charCode: number): boolean;
+    /**
+     * Get the group flag of the character class that `charCode` belongs to.
+     */
+    isGroup(charCode: number): boolean;
+    /**
+     * Get the max length of the character class that `charCode` belongs to.
+     */
+    getMaxLength(charCode: number): number;
+    getCharacterClasses(): CharacterClass[];
+    getCategoryMap(): Uint8Array;
+    getCompatibleCategoryMap(): Uint32Array;
+    /**
+     * Load from binary buffers.
+     * The three argument types match what `getCategoryMap`, `getCompatibleCategoryMap` and
+     * `toInvokeBuffer` return; presumably those are the intended sources — confirm.
+     */
+    static fromBuffers(categoryMap: Uint8Array, compatMap: Uint32Array, invokeDefBuf: Uint8Array): CharacterDefinition;
+    /**
+     * Serialize invoke definitions to buffer.
+     */
+    toInvokeBuffer(): Uint8Array;
+}
+/**
+ * Unknown word dictionary.
+ * Handles words not found in the main dictionary by using character type information.
+ * Extends `TokenInfoDictionary`, so the word-ID accessors declared there are available here too.
+ */
+declare class UnknownDictionary extends TokenInfoDictionary {
+    private characterDefinition;
+    constructor();
+    setCharacterDefinition(charDef: CharacterDefinition): void;
+    getCharacterDefinition(): CharacterDefinition;
+    /**
+     * Lookup unknown word entries for a given character code.
+     * Returns word IDs.
+     * NOTE(review): the argument is a character code, not a class index; presumably it is
+     * mapped to its character class before the lookup (see `lookupByCharClass`) — confirm.
+     */
+    lookup(charCode: number): number[];
+    /**
+     * Lookup word IDs by character class index.
+     * NOTE(review): the parameter is underscore-prefixed, which suggests it may be ignored
+     * by the implementation — confirm.
+     */
+    lookupByCharClass(_classId: number): number[];
+    /**
+     * Attach the given character definition.
+     * Returns `this` for chaining.
+     * NOTE(review): how this differs from `setCharacterDefinition` is not visible here — confirm.
+     */
+    loadCharacterDefinition(charDef: CharacterDefinition): this;
+}
+/**
+ * Container that holds all dictionary components needed for tokenization.
+ * Exposes four read-only parts: the trie, the token info dictionary, the connection cost
+ * matrix and the unknown word dictionary, all supplied through the constructor.
+ */
+declare class DictionaryContainer {
+    readonly trie: DoubleArray;
+    readonly tokenInfoDictionary: TokenInfoDictionary;
+    readonly connectionCosts: ConnectionCosts;
+    readonly unknownDictionary: UnknownDictionary;
+    constructor(trie: DoubleArray, tokenInfoDictionary: TokenInfoDictionary, connectionCosts: ConnectionCosts, unknownDictionary: UnknownDictionary);
+}
+/**
+ * Dictionary format interface.
+ * Abstracts the differences between IPAdic, UniDic, and NEologd.
+ *
+ * The type parameter `T` is the token type this format produces; it defaults to `BaseToken`.
+ */
+interface DictionaryFormat<T extends BaseToken = BaseToken> {
+    /** Format name */
+    readonly name: string;
+    /**
+     * Parse features string into a typed token.
+     * Features is a comma-separated string from the dictionary.
+     *
+     * Note: the `base` parameter type, `Omit<BaseToken, "cost"> & { cost: number }`, is
+     * structurally the same as `BaseToken`, because `BaseToken.cost` is already a `number`.
+     */
+    parseToken(base: Omit<BaseToken, "cost"> & {
+        cost: number;
+    }, features: string): T;
+    /**
+     * Get the number of feature fields expected in this format.
+     */
+    getFeatureCount(): number;
+}
+/**
+ * Main tokenizer class.
+ * Performs Japanese morphological analysis using Viterbi algorithm.
+ *
+ * The type parameter `T` is the token type produced; it is fixed by the
+ * `DictionaryFormat<T>` passed to the constructor.
+ */
+declare class Tokenizer<T extends BaseToken = BaseToken> {
+    private readonly viterbiBuilder;
+    private readonly viterbiSearcher;
+    private readonly dictionary;
+    private readonly format;
+    constructor(dictionary: DictionaryContainer, format: DictionaryFormat<T>);
+    /**
+     * Tokenize input text into morphemes.
+     * NOTE(review): whether BOS/EOS tokens are included in the result is not visible in
+     * this declaration — confirm.
+     */
+    tokenize(text: string): T[];
+    /**
+     * Get the dictionary format handler.
+     */
+    getFormat(): DictionaryFormat<T>;
+}
+/**
+ * Tokenizer configuration options.
+ *
+ * `DictionaryFormatType` lists the supported dictionary format identifiers;
+ * `TokenizerOptions` is the options object itself.
+ */
+type DictionaryFormatType = "ipadic" | "unidic" | "neologd";
+interface TokenizerOptions {
+    /**
+     * Dictionary format to use.
+     * Optional; the default applied when omitted is not visible in this declaration.
+     */
+    format?: DictionaryFormatType;
+    /** Path to compiled dictionary directory */
+    dicPath: string;
+}
+/**
+ * Dictionary loader interface.
+ * Abstracts loading of compiled dictionary files across environments.
+ */
+interface DictionaryLoader {
+    /**
+     * Load a gzipped binary file and return the decompressed data.
+     *
+     * @param path Identifies the file to load. The implementations declared in this file
+     *   name this parameter `fileName` and take a base path/URL in their constructor, so it
+     *   is presumably resolved relative to that base — confirm.
+     * @returns A promise for the decompressed bytes.
+     */
+    loadArrayBuffer(path: string): Promise<ArrayBuffer>;
+}
+/**
+ * Asynchronous tokenizer builder.
+ * Loads dictionary files and constructs a ready-to-use Tokenizer.
+ */
+declare class TokenizerBuilder {
+    /**
+     * Build a tokenizer with the given options.
+     *
+     * @param _options Tokenizer options. NOTE(review): the underscore prefix suggests the
+     *   implementation may not read this argument — confirm.
+     * @param loader Loader used to fetch the dictionary files.
+     * @param format Format handler that determines the token type `T` of the result.
+     * @returns A promise for the constructed tokenizer.
+     */
+    static build<T extends BaseToken>(_options: TokenizerOptions, loader: DictionaryLoader, format: DictionaryFormat<T>): Promise<Tokenizer<T>>;
+}
+/**
+ * IPAdic dictionary format parser.
+ * Parses features in the IPAdic format (MeCab-IPADIC).
+ * Implements `DictionaryFormat<IpadicToken>`, so `parseToken` yields `IpadicToken` values.
+ */
+declare class IpadicFormatHandler implements DictionaryFormat<IpadicToken> {
+    readonly name: string;
+    parseToken(base: Omit<BaseToken, "cost"> & {
+        cost: number;
+    }, features: string): IpadicToken;
+    getFeatureCount(): number;
+}
+/**
+ * UniDic dictionary format parser.
+ * Implements `DictionaryFormat<UnidicToken>`, so `parseToken` yields `UnidicToken` values.
+ * Its `name` is declared with the literal type "unidic".
+ */
+declare class UnidicFormatHandler implements DictionaryFormat<UnidicToken> {
+    readonly name = "unidic";
+    parseToken(base: Omit<BaseToken, "cost"> & {
+        cost: number;
+    }, features: string): UnidicToken;
+    getFeatureCount(): number;
+}
+/**
+ * NEologd dictionary format parser.
+ * NEologd is based on IPAdic format with the same feature structure.
+ * Extends `IpadicFormatHandler` and redeclares only `name`, so parsing is inherited
+ * and tokens are `IpadicToken` values.
+ */
+declare class NeologdFormatHandler extends IpadicFormatHandler {
+    readonly name: string;
+}
+/**
+ * Builds a double array trie from a set of sorted keys.
+ *
+ * The double array structure uses two parallel arrays (base and check)
+ * to represent a trie compactly. This is the standard Aoe algorithm.
+ *
+ * NOTE: the description above refers to the `DoubleArrayBuilder` class declared right after
+ * this interface. `DoubleArrayBuildResult` itself is the return type of
+ * `DoubleArrayBuilder.build`: the pair of arrays that make up the trie.
+ */
+interface DoubleArrayBuildResult {
+    base: Int32Array;
+    check: Int32Array;
+}
+declare class DoubleArrayBuilder {
+    /**
+     * Build a double array from sorted key-value pairs.
+     * Keys must be sorted in lexicographic order.
+     * Each key is an array of character codes (unsigned integers > 0).
+     *
+     * @param keys Key/value pairs; note that keys here are code arrays, unlike
+     *   `DoubleArray.build`, which takes string keys.
+     * @returns The `base` and `check` arrays of the built trie.
+     */
+    static build(keys: {
+        key: number[];
+        value: number;
+    }[]): DoubleArrayBuildResult;
+    private base;
+    private check;
+    private nextCheckPos;
+    constructor();
+    private buildFromKeys;
+    private buildTrie;
+    private ensureSize;
+    private insert;
+}
+/**
+ * Dictionary loader for Node.js using native fs and zlib.
+ * Constructed with a base path; `fileName` is presumably resolved relative to it — confirm.
+ */
+declare class NodeLoader implements DictionaryLoader {
+    private readonly basePath;
+    constructor(basePath: string);
+    loadArrayBuffer(fileName: string): Promise<ArrayBuffer>;
+}
+/**
+ * Dictionary loader for browser using fetch and DecompressionStream.
+ * Constructed with a base URL; `fileName` is presumably resolved relative to it — confirm.
+ */
+declare class BrowserLoader implements DictionaryLoader {
+    private readonly baseUrl;
+    constructor(baseUrl: string);
+    loadArrayBuffer(fileName: string): Promise<ArrayBuffer>;
+}
+/**
+ * Unicode-aware string that correctly handles surrogate pairs.
+ * Provides character-level indexing that treats surrogate pairs as single characters.
+ *
+ * The method names mirror those of the built-in `String`, but indices count characters as
+ * defined above rather than UTF-16 code units.
+ * NOTE(review): `length` is presumably the number of such characters, and `charCodeAt`
+ * presumably returns a full code point — confirm against the implementation.
+ */
+declare class SurrogateAwareString {
+    private readonly codePoints;
+    readonly length: number;
+    constructor(str: string);
+    charAt(index: number): string;
+    charCodeAt(index: number): number;
+    substring(start: number, end?: number): string;
+    slice(start: number, end?: number): string;
+    toString(): string;
+}
+/**
+ * tokana - Modern Japanese Morphological Analyzer
+ */
+/**
+ * Create a tokenizer with the given options.
+ *
+ * The overloads narrow the resulting token type from the literal value of `options.format`:
+ * - "ipadic" and "neologd" resolve to `Tokenizer<IpadicToken>`;
+ * - "unidic" resolves to `Tokenizer<UnidicToken>`;
+ * - when `format` is omitted or is not a literal type, the result is `Tokenizer<BaseToken>`.
+ *
+ * @example
+ * ```typescript
+ * const tokenizer = await createTokenizer({
+ *   format: "ipadic",
+ *   dicPath: "./dict",
+ * });
+ * const tokens = tokenizer.tokenize("東京都に住んでいる");
+ * ```
+ */
+declare function createTokenizer(options: TokenizerOptions & {
+    format: "ipadic";
+}): Promise<Tokenizer<IpadicToken>>;
+declare function createTokenizer(options: TokenizerOptions & {
+    format: "unidic";
+}): Promise<Tokenizer<UnidicToken>>;
+declare function createTokenizer(options: TokenizerOptions & {
+    format: "neologd";
+}): Promise<Tokenizer<IpadicToken>>;
+declare function createTokenizer(options: TokenizerOptions): Promise<Tokenizer<BaseToken>>;
+export { type BaseToken, BrowserLoader, ByteBuffer, CharacterDefinition, ConnectionCosts, DictionaryContainer, type DictionaryFormat, type DictionaryFormatType, type DictionaryLoader, DoubleArray, DoubleArrayBuilder, IpadicFormatHandler, type IpadicToken, NeologdFormatHandler, NodeLoader, SurrogateAwareString, type Token, TokenInfoDictionary, type TokenType, Tokenizer, TokenizerBuilder, type TokenizerOptions, UnidicFormatHandler, type UnidicToken, UnknownDictionary, createTokenizer };