npm - @dniskav/neuron - Versions diffs - 0.3.1 → 0.3.2 - Mend

@dniskav/neuron 0.3.1 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

package/README.md CHANGED Viewed

@@ -678,8 +678,42 @@ npm test        # run test suite
 If you are an AI agent or LLM working with this codebase, read [AGENTS.md](AGENTS.md) first. It contains the full class hierarchy, design constraints, and what this library does not do.
+## Roadmap (nice to have)
+These features are intentionally out of scope for the current didactic focus but are documented here for reference.
+### ONNX export
+Export trained models to the [ONNX](https://onnx.ai/) interchange format so they can be run in Python (onnxruntime), browsers (onnxruntime-web), mobile, or production inference servers.
+**What it would require:**
+- Serialize each layer's weights + op type to the protobuf ONNX schema (`onnx.proto`).
+- Map neuron layer types to standard ONNX operators (`Gemm`, `MatMul`, `LSTM`, `Conv`, `Relu`, `Softmax`, …).
+- Handle dynamic batch dimensions in the graph IR.
+- Ship a build step that compiles the `.proto` definitions (adds a dev dependency on `protobufjs` or `onnx-proto`).
+**Why it's skipped:** It adds a non-trivial build pipeline and a dependency. The library has zero runtime dependencies by design. ONNX export makes sense once you outgrow the library for training — at that point PyTorch/TF are the right tools.
+### WebGL / WASM backend
+Replace the current pure-JS number arrays with a GPU-accelerated or WASM-compiled backend so larger models (e.g. ViT, GPT-2 scale) become feasible in the browser.
+**What it would require:**
+- Abstract `Tensor` type that backends implement (JS arrays, WebGL textures, WASM memory).
+- WebGL backend: encode matrix ops as fragment-shader programs (similar to `gpu.js` or `tfjs-backend-webgl`).
+- WASM backend: compile a BLAS-like C/Rust core (e.g. `wasm-bindgen` + `ndarray`) and bind it to TypeScript.
+- Every layer's `forward` / `backward` rewritten against the `Tensor` API.
+**Why it's skipped:** The goal is to make the math readable. GPU shader code and WASM bindings are implementation details that obscure the algorithms. The library intentionally trades performance for pedagogical clarity.
+---
 ## Changelog
+### v0.3.2
+- **New — NLP:** `Tokenizer` (char / word / whitespace modes, special tokens PAD/UNK/BOS/EOS, one-hot encoding, `fit` / `encode` / `decode` / `encodeBatch`, JSON serialization)
+- **New — Data:** `DatasetLoader` (parse CSV and JSON into `DataPair`; auto one-hot encoding for string columns; returns `categoricalMaps` for decoding predictions)
 ### v0.3.1
 - **New — Embeddings:** `Word2Vec` (Skip-gram + CBOW, full-softmax, cosine similarity, analogies), `TSNE` (binary-search perplexity, Student-t kernel, KL gradient, early exaggeration, seeded PRNG), `PositionalEncoding` (sinusoidal, Vaswani et al.), `LearnedPositionalEncoding` (trainable), `ContrastiveLearning` (NT-Xent, SimCLR encoder + projection head), `Augmenter` (noise, feature dropout, `makePair`)

package/dist/index.d.mts CHANGED Viewed

@@ -570,6 +570,63 @@ declare class Trainer {
     private _computeMetricsArray;
 }
+interface DatasetLoaderOptions {
+    /** Column names to use as input features. */
+    featureCols: string[];
+    /** Column names to use as targets / labels. */
+    targetCols: string[];
+    /**
+     * When true, string values in feature/target columns are one-hot encoded.
+     * When false, non-numeric values throw an error. Default: true.
+     */
+    encodeStrings?: boolean;
+}
+/**
+ * Maps a column name to its {value → one-hot index} dictionary.
+ * Useful for decoding model predictions back to class names.
+ */
+type CategoricalMap = Record<string, Record<string, number>>;
+interface DatasetLoaderResult extends DataPair {
+    /**
+     * For each string column that was one-hot encoded, maps the column name to
+     * the {category → index} dictionary used during encoding.
+     */
+    categoricalMaps: CategoricalMap;
+    /** Feature column names (or expanded one-hot names) in the order they appear in each input vector. */
+    featureNames: string[];
+    /** Column names (or expanded one-hot names) in the order they appear in each target vector. */
+    targetNames: string[];
+    /** Total number of rows parsed. */
+    numRows: number;
+}
+declare class DatasetLoader {
+    /**
+     * Parse a CSV string into a DataPair.
+     *
+     * - The first non-empty row is treated as a header.
+     * - Numeric values are parsed with parseFloat.
+     * - String values are one-hot encoded (one column → N binary columns).
+     * - Empty rows and comment lines (starting with #) are skipped.
+     *
+     * @param csv     - raw CSV text
+     * @param options - which columns to use as features / targets
+     */
+    static fromCSV(csv: string, options: DatasetLoaderOptions): DatasetLoaderResult;
+    /**
+     * Parse a JSON string (array of objects) into a DataPair.
+     *
+     * Expected format:
+     *   [{ "col1": 1.0, "col2": "cat", "label": "dog" }, ...]
+     *
+     * @param json    - raw JSON text or a pre-parsed array of objects
+     * @param options - which columns to use as features / targets
+     */
+    static fromJSON(json: string | Record<string, unknown>[], options: DatasetLoaderOptions): DatasetLoaderResult;
+    private static _buildDataPair;
+    private static _parseCSV;
+    private static _parseCSVRow;
+}
 declare class LRScheduler {
     stepDecay(lr: number, epoch: number, dropRate: number, epochsDrop: number): number;
     exponentialDecay(lr: number, epoch: number, decayRate: number): number;
@@ -1107,6 +1164,121 @@ declare function perplexity(yTrue: number[], probabilities: number[][]): number;
 declare function printConfusionMatrix(matrix: number[][], labels?: string[]): void;
 declare function classificationReport(yTrue: number[], yPred: number[], labels?: string[]): void;
+type TokenizerMode = 'char' | 'word' | 'whitespace';
+interface TokenizerOptions {
+    /** Tokenization strategy. Default: 'word' */
+    mode?: TokenizerMode;
+    /** Normalize text to lowercase before processing. Default: true */
+    lowercase?: boolean;
+    /** Maximum vocabulary size, counting special tokens (most frequent tokens kept). 0 = unlimited. Default: 0 */
+    maxVocab?: number;
+    /** Additional special tokens to reserve (appended after PAD/UNK/BOS/EOS). */
+    specialTokens?: string[];
+}
+interface EncodeOptions {
+    /** Prepend <BOS> token. Default: false */
+    addBOS?: boolean;
+    /** Append <EOS> token. Default: false */
+    addEOS?: boolean;
+}
+interface EncodeBatchOptions extends EncodeOptions {
+    /**
+     * Pad or truncate all sequences to this length.
+     * Sequences shorter than padTo are right-padded with <PAD> (id 0).
+     * Sequences longer than padTo are truncated on the right.
+     * If omitted, sequences are left at their natural length.
+     */
+    padTo?: number;
+}
+interface TokenizerSnapshot {
+    mode: TokenizerMode;
+    lowercase: boolean;
+    maxVocab: number;
+    token2id: Record<string, number>;
+}
+declare class Tokenizer {
+    static readonly PAD = "<PAD>";
+    static readonly UNK = "<UNK>";
+    static readonly BOS = "<BOS>";
+    static readonly EOS = "<EOS>";
+    private readonly _mode;
+    private readonly _lowercase;
+    private readonly _maxVocab;
+    private readonly _extraSpecial;
+    private _token2id;
+    private _id2token;
+    private _fitted;
+    constructor(options?: TokenizerOptions);
+    /**
+     * Build vocabulary from an array of text strings.
+     * Calling fit() again resets and rebuilds the vocabulary from scratch.
+     *
+     * @param texts - corpus to build the vocabulary from
+     * @returns this (chainable)
+     */
+    fit(texts: string[]): this;
+    /**
+     * Split raw text into an array of string tokens (no ID conversion yet).
+     * Useful for inspecting what the tokenizer produces before encoding.
+     */
+    tokenize(text: string): string[];
+    /**
+     * Convert a text string to a sequence of token IDs.
+     * Unknown tokens map to <UNK> (id 1).
+     *
+     * @param text    - input text
+     * @param options - addBOS / addEOS flags
+     */
+    encode(text: string, options?: EncodeOptions): number[];
+    /**
+     * Encode an array of texts, optionally padding/truncating to a fixed length.
+     *
+     * @param texts   - array of input texts
+     * @param options - addBOS / addEOS / padTo
+     */
+    encodeBatch(texts: string[], options?: EncodeBatchOptions): number[][];
+    /**
+     * Convert a sequence of token IDs back to a human-readable string.
+     *
+     * @param ids          - array of token IDs
+     * @param stripSpecial - remove PAD/BOS/EOS tokens from output. Default: true
+     */
+    decode(ids: number[], stripSpecial?: boolean): string;
+    /**
+     * Convert a sequence of token IDs to one-hot vectors.
+     * Each vector has length `vocabSize` with a single 1 at the token's position.
+     * Useful when feeding tokens directly into a Network without an embedding layer.
+     *
+     * @param ids - array of token IDs (e.g. from encode())
+     * @returns   - 2D array of shape [seqLen, vocabSize]
+     */
+    oneHot(ids: number[]): number[][];
+    /** Number of tokens in the vocabulary (including special tokens). */
+    get vocabSize(): number;
+    /** True if fit() has been called at least once. */
+    get isFitted(): boolean;
+    /** Get the integer ID for a token string, or undefined if not in vocabulary. */
+    tokenToId(token: string): number | undefined;
+    /** Get the token string for an integer ID, or undefined if out of range. */
+    idToToken(id: number): string | undefined;
+    /**
+     * Return the full vocabulary as an array ordered by ID.
+     * Index i of the returned array is the token with ID i.
+     */
+    getVocabulary(): string[];
+    /**
+     * Serialize the fitted tokenizer to a plain JSON-compatible object.
+     * Store it with JSON.stringify(); reload with Tokenizer.fromJSON().
+     */
+    toJSON(): TokenizerSnapshot;
+    /**
+     * Restore a Tokenizer from a snapshot produced by toJSON().
+     */
+    static fromJSON(snapshot: TokenizerSnapshot): Tokenizer;
+    private _register;
+    private _assertFitted;
+}
 declare class EarlyStopping {
     bestValue: number;
     readonly patience: number;
@@ -1179,4 +1351,4 @@ declare class DataAugmentation {
     };
 }
-export { type Activation, Adam, AttentionHead, Augmenter, Autoencoder, BatchNorm, BiasVector, CausalConv1D, ClipOptimizer, ClippedOptimizerFactory, ContrastiveLearning, Conv1D, Conv2D, DataAugmentation, DataLoader, type DataPair, DecisionTree, Dropout, EarlyStopping, EmbeddingMatrix, Flatten, GAN, GRULayer, GaussianNaiveBayes, HopfieldNetwork, KMeans, type KMeansOptions, LRScheduler, LSTMLayer, Layer, LayerNorm, LearnedPositionalEncoding, LinearRegression, LogisticRegression, LossPlotter, MaxPool2D, ModelSaver, Momentum, MultiHeadAttention, Network, NetworkLSTM, type NetworkLSTMOptions, NetworkN, type NetworkNOptions, NetworkTransformer, type NetworkTransformerOptions, NetworkTransformerRL, type NetworkTransformerRLOptions, Neuron, NeuronN, type Optimizer, type OptimizerFactory, PCA, Perceptron, PositionalEncoding, RNN, SGD, SOM, type SOMOptions, Seq2Seq, type Serializable, SoftmaxRegression, TCN, TSNE, type TSNEOptions, type TrainDataset, type TrainMetrics, type TrainableNetwork, type TrainableNetworkWithWeights, Trainer, type TrainerOptions, TransformerBlock, type TransformerBlockOptions, VAE, Value, WeightInspector, WeightMatrix, type WeightStats, Word2Vec, type Word2VecModel, type Word2VecOptions, accuracy, auc, classificationReport, confusionMatrix, crossEntropy, crossEntropyDelta, crossEntropyDeltaRaw, defaultOptimizer, elu, f1Score, leakyRelu, linear, mae, makeElu, makeLeakyRelu, matMul, mse, mseDelta, perplexity, precision, printConfusionMatrix, r2Score, recall, relu, rmse, rocCurve, sigmoid, softmax, softmaxBackward, tanh, transpose, validate2DArray, validateArray, validateArrayMinLength, validateNumber };
+export { type Activation, Adam, AttentionHead, Augmenter, Autoencoder, BatchNorm, BiasVector, type CategoricalMap, CausalConv1D, ClipOptimizer, ClippedOptimizerFactory, ContrastiveLearning, Conv1D, Conv2D, DataAugmentation, DataLoader, type DataPair, DatasetLoader, type DatasetLoaderOptions, type DatasetLoaderResult, DecisionTree, Dropout, EarlyStopping, EmbeddingMatrix, type EncodeBatchOptions, type EncodeOptions, Flatten, GAN, GRULayer, GaussianNaiveBayes, HopfieldNetwork, KMeans, type KMeansOptions, LRScheduler, LSTMLayer, Layer, LayerNorm, LearnedPositionalEncoding, LinearRegression, LogisticRegression, LossPlotter, MaxPool2D, ModelSaver, Momentum, MultiHeadAttention, Network, NetworkLSTM, type NetworkLSTMOptions, NetworkN, type NetworkNOptions, NetworkTransformer, type NetworkTransformerOptions, NetworkTransformerRL, type NetworkTransformerRLOptions, Neuron, NeuronN, type Optimizer, type OptimizerFactory, PCA, Perceptron, PositionalEncoding, RNN, SGD, SOM, type SOMOptions, Seq2Seq, type Serializable, SoftmaxRegression, TCN, TSNE, type TSNEOptions, Tokenizer, type TokenizerMode, type TokenizerOptions, type TokenizerSnapshot, type TrainDataset, type TrainMetrics, type TrainableNetwork, type TrainableNetworkWithWeights, Trainer, type TrainerOptions, TransformerBlock, type TransformerBlockOptions, VAE, Value, WeightInspector, WeightMatrix, type WeightStats, Word2Vec, type Word2VecModel, type Word2VecOptions, accuracy, auc, classificationReport, confusionMatrix, crossEntropy, crossEntropyDelta, crossEntropyDeltaRaw, defaultOptimizer, elu, f1Score, leakyRelu, linear, mae, makeElu, makeLeakyRelu, matMul, mse, mseDelta, perplexity, precision, printConfusionMatrix, r2Score, recall, relu, rmse, rocCurve, sigmoid, softmax, softmaxBackward, tanh, transpose, validate2DArray, validateArray, validateArrayMinLength, validateNumber };

package/dist/index.d.ts CHANGED Viewed

@@ -570,6 +570,63 @@ declare class Trainer {
     private _computeMetricsArray;
 }
+interface DatasetLoaderOptions {
+    /** Column names to use as input features. */
+    featureCols: string[];
+    /** Column names to use as targets / labels. */
+    targetCols: string[];
+    /**
+     * When true, string values in feature/target columns are one-hot encoded.
+     * When false, non-numeric values throw an error. Default: true.
+     */
+    encodeStrings?: boolean;
+}
+/**
+ * Maps a column name to its {value → one-hot index} dictionary.
+ * Useful for decoding model predictions back to class names.
+ */
+type CategoricalMap = Record<string, Record<string, number>>;
+interface DatasetLoaderResult extends DataPair {
+    /**
+     * For each string column that was one-hot encoded, maps the column name to
+     * the {category → index} dictionary used during encoding.
+     */
+    categoricalMaps: CategoricalMap;
+    /** Feature column names (or expanded one-hot names) in the order they appear in each input vector. */
+    featureNames: string[];
+    /** Column names (or expanded one-hot names) in the order they appear in each target vector. */
+    targetNames: string[];
+    /** Total number of rows parsed. */
+    numRows: number;
+}
+declare class DatasetLoader {
+    /**
+     * Parse a CSV string into a DataPair.
+     *
+     * - The first non-empty row is treated as a header.
+     * - Numeric values are parsed with parseFloat.
+     * - String values are one-hot encoded (one column → N binary columns).
+     * - Empty rows and comment lines (starting with #) are skipped.
+     *
+     * @param csv     - raw CSV text
+     * @param options - which columns to use as features / targets
+     */
+    static fromCSV(csv: string, options: DatasetLoaderOptions): DatasetLoaderResult;
+    /**
+     * Parse a JSON string (array of objects) into a DataPair.
+     *
+     * Expected format:
+     *   [{ "col1": 1.0, "col2": "cat", "label": "dog" }, ...]
+     *
+     * @param json    - raw JSON text or a pre-parsed array of objects
+     * @param options - which columns to use as features / targets
+     */
+    static fromJSON(json: string | Record<string, unknown>[], options: DatasetLoaderOptions): DatasetLoaderResult;
+    private static _buildDataPair;
+    private static _parseCSV;
+    private static _parseCSVRow;
+}
 declare class LRScheduler {
     stepDecay(lr: number, epoch: number, dropRate: number, epochsDrop: number): number;
     exponentialDecay(lr: number, epoch: number, decayRate: number): number;
@@ -1107,6 +1164,121 @@ declare function perplexity(yTrue: number[], probabilities: number[][]): number;
 declare function printConfusionMatrix(matrix: number[][], labels?: string[]): void;
 declare function classificationReport(yTrue: number[], yPred: number[], labels?: string[]): void;
+type TokenizerMode = 'char' | 'word' | 'whitespace';
+interface TokenizerOptions {
+    /** Tokenization strategy. Default: 'word' */
+    mode?: TokenizerMode;
+    /** Normalize text to lowercase before processing. Default: true */
+    lowercase?: boolean;
+    /** Maximum vocabulary size, counting special tokens (most frequent tokens kept). 0 = unlimited. Default: 0 */
+    maxVocab?: number;
+    /** Additional special tokens to reserve (appended after PAD/UNK/BOS/EOS). */
+    specialTokens?: string[];
+}
+interface EncodeOptions {
+    /** Prepend <BOS> token. Default: false */
+    addBOS?: boolean;
+    /** Append <EOS> token. Default: false */
+    addEOS?: boolean;
+}
+interface EncodeBatchOptions extends EncodeOptions {
+    /**
+     * Pad or truncate all sequences to this length.
+     * Sequences shorter than padTo are right-padded with <PAD> (id 0).
+     * Sequences longer than padTo are truncated on the right.
+     * If omitted, sequences are left at their natural length.
+     */
+    padTo?: number;
+}
+interface TokenizerSnapshot {
+    mode: TokenizerMode;
+    lowercase: boolean;
+    maxVocab: number;
+    token2id: Record<string, number>;
+}
+declare class Tokenizer {
+    static readonly PAD = "<PAD>";
+    static readonly UNK = "<UNK>";
+    static readonly BOS = "<BOS>";
+    static readonly EOS = "<EOS>";
+    private readonly _mode;
+    private readonly _lowercase;
+    private readonly _maxVocab;
+    private readonly _extraSpecial;
+    private _token2id;
+    private _id2token;
+    private _fitted;
+    constructor(options?: TokenizerOptions);
+    /**
+     * Build vocabulary from an array of text strings.
+     * Calling fit() again resets and rebuilds the vocabulary from scratch.
+     *
+     * @param texts - corpus to build the vocabulary from
+     * @returns this (chainable)
+     */
+    fit(texts: string[]): this;
+    /**
+     * Split raw text into an array of string tokens (no ID conversion yet).
+     * Useful for inspecting what the tokenizer produces before encoding.
+     */
+    tokenize(text: string): string[];
+    /**
+     * Convert a text string to a sequence of token IDs.
+     * Unknown tokens map to <UNK> (id 1).
+     *
+     * @param text    - input text
+     * @param options - addBOS / addEOS flags
+     */
+    encode(text: string, options?: EncodeOptions): number[];
+    /**
+     * Encode an array of texts, optionally padding/truncating to a fixed length.
+     *
+     * @param texts   - array of input texts
+     * @param options - addBOS / addEOS / padTo
+     */
+    encodeBatch(texts: string[], options?: EncodeBatchOptions): number[][];
+    /**
+     * Convert a sequence of token IDs back to a human-readable string.
+     *
+     * @param ids          - array of token IDs
+     * @param stripSpecial - remove PAD/BOS/EOS tokens from output. Default: true
+     */
+    decode(ids: number[], stripSpecial?: boolean): string;
+    /**
+     * Convert a sequence of token IDs to one-hot vectors.
+     * Each vector has length `vocabSize` with a single 1 at the token's position.
+     * Useful when feeding tokens directly into a Network without an embedding layer.
+     *
+     * @param ids - array of token IDs (e.g. from encode())
+     * @returns   - 2D array of shape [seqLen, vocabSize]
+     */
+    oneHot(ids: number[]): number[][];
+    /** Number of tokens in the vocabulary (including special tokens). */
+    get vocabSize(): number;
+    /** True if fit() has been called at least once. */
+    get isFitted(): boolean;
+    /** Get the integer ID for a token string, or undefined if not in vocabulary. */
+    tokenToId(token: string): number | undefined;
+    /** Get the token string for an integer ID, or undefined if out of range. */
+    idToToken(id: number): string | undefined;
+    /**
+     * Return the full vocabulary as an array ordered by ID.
+     * Index i of the returned array is the token with ID i.
+     */
+    getVocabulary(): string[];
+    /**
+     * Serialize the fitted tokenizer to a plain JSON-compatible object.
+     * Store it with JSON.stringify(); reload with Tokenizer.fromJSON().
+     */
+    toJSON(): TokenizerSnapshot;
+    /**
+     * Restore a Tokenizer from a snapshot produced by toJSON().
+     */
+    static fromJSON(snapshot: TokenizerSnapshot): Tokenizer;
+    private _register;
+    private _assertFitted;
+}
 declare class EarlyStopping {
     bestValue: number;
     readonly patience: number;
@@ -1179,4 +1351,4 @@ declare class DataAugmentation {
     };
 }
-export { type Activation, Adam, AttentionHead, Augmenter, Autoencoder, BatchNorm, BiasVector, CausalConv1D, ClipOptimizer, ClippedOptimizerFactory, ContrastiveLearning, Conv1D, Conv2D, DataAugmentation, DataLoader, type DataPair, DecisionTree, Dropout, EarlyStopping, EmbeddingMatrix, Flatten, GAN, GRULayer, GaussianNaiveBayes, HopfieldNetwork, KMeans, type KMeansOptions, LRScheduler, LSTMLayer, Layer, LayerNorm, LearnedPositionalEncoding, LinearRegression, LogisticRegression, LossPlotter, MaxPool2D, ModelSaver, Momentum, MultiHeadAttention, Network, NetworkLSTM, type NetworkLSTMOptions, NetworkN, type NetworkNOptions, NetworkTransformer, type NetworkTransformerOptions, NetworkTransformerRL, type NetworkTransformerRLOptions, Neuron, NeuronN, type Optimizer, type OptimizerFactory, PCA, Perceptron, PositionalEncoding, RNN, SGD, SOM, type SOMOptions, Seq2Seq, type Serializable, SoftmaxRegression, TCN, TSNE, type TSNEOptions, type TrainDataset, type TrainMetrics, type TrainableNetwork, type TrainableNetworkWithWeights, Trainer, type TrainerOptions, TransformerBlock, type TransformerBlockOptions, VAE, Value, WeightInspector, WeightMatrix, type WeightStats, Word2Vec, type Word2VecModel, type Word2VecOptions, accuracy, auc, classificationReport, confusionMatrix, crossEntropy, crossEntropyDelta, crossEntropyDeltaRaw, defaultOptimizer, elu, f1Score, leakyRelu, linear, mae, makeElu, makeLeakyRelu, matMul, mse, mseDelta, perplexity, precision, printConfusionMatrix, r2Score, recall, relu, rmse, rocCurve, sigmoid, softmax, softmaxBackward, tanh, transpose, validate2DArray, validateArray, validateArrayMinLength, validateNumber };
+export { type Activation, Adam, AttentionHead, Augmenter, Autoencoder, BatchNorm, BiasVector, type CategoricalMap, CausalConv1D, ClipOptimizer, ClippedOptimizerFactory, ContrastiveLearning, Conv1D, Conv2D, DataAugmentation, DataLoader, type DataPair, DatasetLoader, type DatasetLoaderOptions, type DatasetLoaderResult, DecisionTree, Dropout, EarlyStopping, EmbeddingMatrix, type EncodeBatchOptions, type EncodeOptions, Flatten, GAN, GRULayer, GaussianNaiveBayes, HopfieldNetwork, KMeans, type KMeansOptions, LRScheduler, LSTMLayer, Layer, LayerNorm, LearnedPositionalEncoding, LinearRegression, LogisticRegression, LossPlotter, MaxPool2D, ModelSaver, Momentum, MultiHeadAttention, Network, NetworkLSTM, type NetworkLSTMOptions, NetworkN, type NetworkNOptions, NetworkTransformer, type NetworkTransformerOptions, NetworkTransformerRL, type NetworkTransformerRLOptions, Neuron, NeuronN, type Optimizer, type OptimizerFactory, PCA, Perceptron, PositionalEncoding, RNN, SGD, SOM, type SOMOptions, Seq2Seq, type Serializable, SoftmaxRegression, TCN, TSNE, type TSNEOptions, Tokenizer, type TokenizerMode, type TokenizerOptions, type TokenizerSnapshot, type TrainDataset, type TrainMetrics, type TrainableNetwork, type TrainableNetworkWithWeights, Trainer, type TrainerOptions, TransformerBlock, type TransformerBlockOptions, VAE, Value, WeightInspector, WeightMatrix, type WeightStats, Word2Vec, type Word2VecModel, type Word2VecOptions, accuracy, auc, classificationReport, confusionMatrix, crossEntropy, crossEntropyDelta, crossEntropyDeltaRaw, defaultOptimizer, elu, f1Score, leakyRelu, linear, mae, makeElu, makeLeakyRelu, matMul, mse, mseDelta, perplexity, precision, printConfusionMatrix, r2Score, recall, relu, rmse, rocCurve, sigmoid, softmax, softmaxBackward, tanh, transpose, validate2DArray, validateArray, validateArrayMinLength, validateNumber };

package/dist/index.js CHANGED Viewed

@@ -34,6 +34,7 @@ __export(index_exports, {
   Conv2D: () => Conv2D,
   DataAugmentation: () => DataAugmentation,
   DataLoader: () => DataLoader,
+  DatasetLoader: () => DatasetLoader,
   DecisionTree: () => DecisionTree,
   Dropout: () => Dropout,
   EarlyStopping: () => EarlyStopping,
@@ -73,6 +74,7 @@ __export(index_exports, {
   SoftmaxRegression: () => SoftmaxRegression,
   TCN: () => TCN,
   TSNE: () => TSNE,
+  Tokenizer: () => Tokenizer,
   Trainer: () => Trainer,
   TransformerBlock: () => TransformerBlock,
   VAE: () => VAE,
@@ -2651,6 +2653,155 @@ var DataLoader = class _DataLoader {
   }
 };
+// src/DatasetLoader.ts
+var DatasetLoader = class _DatasetLoader {
+  // ── CSV ─────────────────────────────────────────────────────────────────────
+  /**
+   * Parse a CSV string into a DataPair.
+   *
+   * - The first non-empty row is treated as a header.
+   * - Numeric values are parsed with parseFloat.
+   * - String values are one-hot encoded (one column → N binary columns).
+   * - Empty rows and comment lines (starting with #) are skipped.
+   *
+   * @param csv     - raw CSV text
+   * @param options - which columns to use as features / targets
+   */
+  static fromCSV(csv, options) {
+    const rows = _DatasetLoader._parseCSV(csv);
+    if (rows.length < 2) throw new Error("DatasetLoader.fromCSV: CSV must have a header row and at least one data row.");
+    const header = rows[0];
+    const dataRows = rows.slice(1);
+    return _DatasetLoader._buildDataPair(header, dataRows, options);
+  }
+  // ── JSON ─────────────────────────────────────────────────────────────────────
+  /**
+   * Parse a JSON string (array of objects) into a DataPair.
+   *
+   * Expected format:
+   *   [{ "col1": 1.0, "col2": "cat", "label": "dog" }, ...]
+   *
+   * @param json    - raw JSON text or a pre-parsed array of objects
+   * @param options - which columns to use as features / targets
+   */
+  static fromJSON(json, options) {
+    const records = typeof json === "string" ? JSON.parse(json) : json;
+    if (!Array.isArray(records) || records.length === 0) {
+      throw new Error("DatasetLoader.fromJSON: expected a non-empty JSON array of objects.");
+    }
+    const header = Object.keys(records[0]);
+    const dataRows = records.map((row) => header.map((col) => String(row[col] ?? "")));
+    return _DatasetLoader._buildDataPair(header, dataRows, options);
+  }
+  // ── Private: shared pipeline ──────────────────────────────────────────────
+  static _buildDataPair(header, dataRows, options) {
+    const { featureCols, targetCols, encodeStrings = true } = options;
+    for (const col of [...featureCols, ...targetCols]) {
+      if (!header.includes(col)) {
+        throw new Error(`DatasetLoader: column "${col}" not found in header [${header.join(", ")}].`);
+      }
+    }
+    const catMaps = {};
+    const buildEncoder = (cols) => {
+      for (const col of cols) {
+        const colIdx = header.indexOf(col);
+        const values = dataRows.map((row) => row[colIdx]);
+        const isNumeric = values.every((v) => v === "" || !isNaN(Number(v)));
+        if (!isNumeric) {
+          if (!encodeStrings) {
+            throw new Error(`DatasetLoader: column "${col}" contains non-numeric values. Set encodeStrings: true to one-hot encode them.`);
+          }
+          const unique = [...new Set(values)].sort();
+          catMaps[col] = Object.fromEntries(unique.map((v, i) => [v, i]));
+        }
+      }
+    };
+    buildEncoder(featureCols);
+    buildEncoder(targetCols);
+    const encodeValue = (col, raw) => {
+      if (catMaps[col]) {
+        const categories = catMaps[col];
+        const n = Object.keys(categories).length;
+        const vec = new Array(n).fill(0);
+        const idx = categories[raw];
+        if (idx !== void 0) vec[idx] = 1;
+        return vec;
+      }
+      return [parseFloat(raw)];
+    };
+    const expandNames = (cols) => cols.flatMap((col) => {
+      if (catMaps[col]) {
+        return Object.keys(catMaps[col]).map((cat) => `${col}_${cat}`);
+      }
+      return [col];
+    });
+    const featureNames = expandNames(featureCols);
+    const targetNames = expandNames(targetCols);
+    const inputs = [];
+    const targets = [];
+    for (const row of dataRows) {
+      const input = featureCols.flatMap((col) => {
+        const raw = row[header.indexOf(col)];
+        return encodeValue(col, raw);
+      });
+      const target = targetCols.flatMap((col) => {
+        const raw = row[header.indexOf(col)];
+        return encodeValue(col, raw);
+      });
+      inputs.push(input);
+      targets.push(target);
+    }
+    return {
+      inputs,
+      targets,
+      categoricalMaps: catMaps,
+      featureNames,
+      targetNames,
+      numRows: dataRows.length
+    };
+  }
+  // ── Private: minimal CSV parser (quoted fields, "" escapes; one record per line) ──
+  static _parseCSV(csv) {
+    const rows = [];
+    const lines = csv.split(/\r?\n/);
+    for (const line of lines) {
+      const trimmed = line.trim();
+      if (!trimmed || trimmed.startsWith("#")) continue;
+      rows.push(_DatasetLoader._parseCSVRow(trimmed));
+    }
+    return rows;
+  }
+  static _parseCSVRow(line) {
+    const fields = [];
+    let current = "";
+    let inQuotes = false;
+    for (let i = 0; i < line.length; i++) {
+      const ch = line[i];
+      if (inQuotes) {
+        if (ch === '"' && line[i + 1] === '"') {
+          current += '"';
+          i++;
+        } else if (ch === '"') {
+          inQuotes = false;
+        } else {
+          current += ch;
+        }
+      } else {
+        if (ch === '"') {
+          inQuotes = true;
+        } else if (ch === ",") {
+          fields.push(current.trim());
+          current = "";
+        } else {
+          current += ch;
+        }
+      }
+    }
+    fields.push(current.trim());
+    return fields;
+  }
+};
 // src/LRScheduler.ts
 var LRScheduler = class {
   // ── Step Decay ────────────────────────────────────────────────────────────
@@ -6135,6 +6286,216 @@ function _binaryRecall(yTrue, yPred, pos) {
   return tp + fn > 0 ? tp / (tp + fn) : 0;
 }
+// src/Tokenizer.ts
+var _Tokenizer = class _Tokenizer {
+  /**
+   * Create an unfitted tokenizer.
+   *
+   * @param options - optional settings:
+   *   - mode: "char", "whitespace" or "word" (default "word"; any other value is treated as "word")
+   *   - lowercase: lowercase text before splitting (default true)
+   *   - maxVocab: upper bound on vocabulary size including special tokens; 0 means unlimited (default 0)
+   *   - specialTokens: extra tokens registered right after the built-in ones (default [])
+   */
+  constructor(options = {}) {
+    this._token2id = /* @__PURE__ */ new Map();
+    this._id2token = /* @__PURE__ */ new Map();
+    this._fitted = false;
+    this._mode = options.mode ?? "word";
+    this._lowercase = options.lowercase ?? true;
+    this._maxVocab = options.maxVocab ?? 0;
+    this._extraSpecial = options.specialTokens ?? [];
+  }
+  // ── Fit ───────────────────────────────────────────────────────────────────
+  /**
+   * Build vocabulary from an array of text strings.
+   * Calling fit() again resets and rebuilds the vocabulary from scratch.
+   *
+   * Special tokens are registered first (PAD, UNK, BOS, EOS, then the extra
+   * ones), followed by corpus tokens ordered by descending frequency, ties
+   * broken with localeCompare. When maxVocab > 0 the total vocabulary,
+   * special tokens included, never exceeds maxVocab unless the special tokens
+   * alone already do.
+   *
+   * @param texts - corpus to build the vocabulary from
+   * @returns this (chainable)
+   */
+  fit(texts) {
+    this._token2id = /* @__PURE__ */ new Map();
+    this._id2token = /* @__PURE__ */ new Map();
+    const specials = [
+      _Tokenizer.PAD,
+      _Tokenizer.UNK,
+      _Tokenizer.BOS,
+      _Tokenizer.EOS,
+      ...this._extraSpecial
+    ];
+    for (const s of specials) this._register(s);
+    const freq = /* @__PURE__ */ new Map();
+    for (const text of texts) {
+      for (const token of this.tokenize(text)) {
+        freq.set(token, (freq.get(token) ?? 0) + 1);
+      }
+    }
+    // Corpus tokens that collide with a special token are already registered;
+    // drop them so they do not use up a vocabulary slot for nothing.
+    let entries = [...freq.entries()].filter(([token]) => !this._token2id.has(token)).sort(
+      ([a, fa], [b, fb]) => fb - fa || a.localeCompare(b)
+    );
+    if (this._maxVocab > 0) {
+      // Budget against the specials actually registered (duplicates count once)
+      // and clamp at 0: a negative slice end would count from the END of the
+      // array and keep almost the whole corpus instead of nothing.
+      const slots = Math.max(0, this._maxVocab - this._token2id.size);
+      entries = entries.slice(0, slots);
+    }
+    for (const [token] of entries) this._register(token);
+    this._fitted = true;
+    return this;
+  }
+  // ── Tokenize ──────────────────────────────────────────────────────────────
+  /**
+   * Split raw text into an array of string tokens (no ID conversion yet).
+   * Useful for inspecting what the tokenizer produces before encoding.
+   *
+   * - "char": one token per UTF-16 code unit, so a character outside the
+   *   Basic Multilingual Plane yields two tokens.
+   * - "whitespace": split on runs of whitespace, empty pieces removed.
+   * - "word" (and any other mode): runs of the letters/digits listed in the
+   *   pattern below, plus each single character that is neither \w nor \s.
+   *   Characters matching neither alternative (for example "_") are dropped.
+   *
+   * @param text - raw input text
+   * @returns array of string tokens
+   */
+  tokenize(text) {
+    const t = this._lowercase ? text.toLowerCase() : text;
+    switch (this._mode) {
+      case "char":
+        return t.split("");
+      case "whitespace":
+        return t.split(/\s+/).filter(Boolean);
+      case "word":
+      default:
+        return t.match(/[a-z0-9àáâãäåæçèéêëìíîïðñòóôõöùúûüýþÿ]+|[^\w\s]/gi) ?? [];
+    }
+  }
+  // ── Encode ────────────────────────────────────────────────────────────────
+  /**
+   * Convert a text string to a sequence of token IDs.
+   * Unknown tokens map to <UNK> (id 1 in a vocabulary built by fit()).
+   *
+   * @param text    - input text
+   * @param options - addBOS / addEOS flags
+   * @returns array of token IDs
+   * @throws Error if the tokenizer has not been fitted
+   */
+  encode(text, options = {}) {
+    this._assertFitted();
+    const ids = [];
+    if (options.addBOS) ids.push(this._token2id.get(_Tokenizer.BOS));
+    for (const token of this.tokenize(text)) {
+      ids.push(this._token2id.get(token) ?? this._token2id.get(_Tokenizer.UNK));
+    }
+    if (options.addEOS) ids.push(this._token2id.get(_Tokenizer.EOS));
+    return ids;
+  }
+  // ── Encode batch ──────────────────────────────────────────────────────────
+  /**
+   * Encode an array of texts, optionally padding/truncating to a fixed length.
+   *
+   * Sequences shorter than padTo are right-padded with the <PAD> id; longer
+   * ones are cut at padTo, which also removes a trailing <EOS> added by addEOS.
+   *
+   * @param texts   - array of input texts
+   * @param options - addBOS / addEOS / padTo
+   * @returns array of ID sequences, one per input text
+   */
+  encodeBatch(texts, options = {}) {
+    const sequences = texts.map((t) => this.encode(t, options));
+    if (options.padTo !== void 0) {
+      const len = options.padTo;
+      const padId = this._token2id.get(_Tokenizer.PAD);
+      return sequences.map((seq) => {
+        if (seq.length >= len) return seq.slice(0, len);
+        return [...seq, ...Array(len - seq.length).fill(padId)];
+      });
+    }
+    return sequences;
+  }
+  // ── Decode ────────────────────────────────────────────────────────────────
+  /**
+   * Convert a sequence of token IDs back to a human-readable string.
+   * IDs missing from the vocabulary decode to <UNK>. Tokens are joined
+   * without a separator in "char" mode and with a single space otherwise.
+   *
+   * @param ids          - array of token IDs
+   * @param stripSpecial - remove PAD/BOS/EOS tokens from output. Default: true
+   * @returns the decoded text
+   * @throws Error if the tokenizer has not been fitted
+   */
+  decode(ids, stripSpecial = true) {
+    this._assertFitted();
+    const specials = /* @__PURE__ */ new Set([_Tokenizer.PAD, _Tokenizer.BOS, _Tokenizer.EOS]);
+    const tokens = [];
+    for (const id of ids) {
+      const token = this._id2token.get(id) ?? _Tokenizer.UNK;
+      if (stripSpecial && specials.has(token)) continue;
+      tokens.push(token);
+    }
+    return this._mode === "char" ? tokens.join("") : tokens.join(" ");
+  }
+  // ── One-hot encoding ──────────────────────────────────────────────────────
+  /**
+   * Convert a sequence of token IDs to one-hot vectors.
+   * Each vector has length `vocabSize` with a single 1 at the token's position.
+   * An ID that is not an integer in [0, vocabSize) produces an all-zero vector.
+   * Useful when feeding tokens directly into a Network without an embedding layer.
+   *
+   * @param ids - array of token IDs (e.g. from encode())
+   * @returns   - 2D array of shape [seqLen, vocabSize]
+   * @throws Error if the tokenizer has not been fitted
+   */
+  oneHot(ids) {
+    this._assertFitted();
+    const V = this.vocabSize;
+    return ids.map((id) => {
+      const vec = new Array(V).fill(0);
+      // Integer check: a fractional index would add a stray property to the
+      // array instead of setting one of its V elements.
+      if (Number.isInteger(id) && id >= 0 && id < V) vec[id] = 1;
+      return vec;
+    });
+  }
+  // ── Vocabulary helpers ────────────────────────────────────────────────────
+  /** Number of tokens in the vocabulary (including special tokens). */
+  get vocabSize() {
+    return this._token2id.size;
+  }
+  /** True once fit() has completed or the instance was restored with fromJSON(). */
+  get isFitted() {
+    return this._fitted;
+  }
+  /** Get the integer ID for a token string, or undefined if not in vocabulary. */
+  tokenToId(token) {
+    return this._token2id.get(token);
+  }
+  /** Get the token string for an integer ID, or undefined if out of range. */
+  idToToken(id) {
+    return this._id2token.get(id);
+  }
+  /**
+   * Return the full vocabulary as an array ordered by ID.
+   * Index i of the returned array is the token with ID i.
+   */
+  getVocabulary() {
+    return Array.from({ length: this.vocabSize }, (_, i) => this._id2token.get(i));
+  }
+  // ── Persistence ───────────────────────────────────────────────────────────
+  /**
+   * Serialize the fitted tokenizer to a plain JSON-compatible object.
+   * Store it with JSON.stringify(); reload with Tokenizer.fromJSON().
+   *
+   * The extra special tokens are included so that a restored tokenizer
+   * registers them again if fit() is called on it later.
+   *
+   * @throws Error if the tokenizer has not been fitted
+   */
+  toJSON() {
+    this._assertFitted();
+    return {
+      mode: this._mode,
+      lowercase: this._lowercase,
+      maxVocab: this._maxVocab,
+      specialTokens: [...this._extraSpecial],
+      token2id: Object.fromEntries(this._token2id)
+    };
+  }
+  /**
+   * Restore a Tokenizer from a snapshot produced by toJSON().
+   * Snapshots without a `specialTokens` field are still accepted; the extra
+   * special tokens then default to an empty list.
+   *
+   * @param snapshot - object with mode, lowercase, maxVocab, token2id and optionally specialTokens
+   * @returns a fitted Tokenizer holding the snapshot's vocabulary
+   */
+  static fromJSON(snapshot) {
+    const tok = new _Tokenizer({
+      mode: snapshot.mode,
+      lowercase: snapshot.lowercase,
+      maxVocab: snapshot.maxVocab,
+      specialTokens: snapshot.specialTokens
+    });
+    for (const [token, id] of Object.entries(snapshot.token2id)) {
+      tok._token2id.set(token, id);
+      tok._id2token.set(id, token);
+    }
+    tok._fitted = true;
+    return tok;
+  }
+  // ── Private ───────────────────────────────────────────────────────────────
+  /** Assign the next free ID to `token`; does nothing if it is already registered. */
+  _register(token) {
+    if (this._token2id.has(token)) return;
+    const id = this._token2id.size;
+    this._token2id.set(token, id);
+    this._id2token.set(id, token);
+  }
+  /** Throw unless a vocabulary is available (after fit() or fromJSON()). */
+  _assertFitted() {
+    if (!this._fitted) {
+      throw new Error("Tokenizer: call fit() before encoding or decoding.");
+    }
+  }
+};
+// ── Built-in special tokens ────────────────────────────────────────────────
+_Tokenizer.PAD = "<PAD>";
+_Tokenizer.UNK = "<UNK>";
+_Tokenizer.BOS = "<BOS>";
+_Tokenizer.EOS = "<EOS>";
+var Tokenizer = _Tokenizer;
 // src/EarlyStopping.ts
 var EarlyStopping = class {
   constructor(options) {
@@ -6422,6 +6783,7 @@ function _sampleNormal() {
   Conv2D,
   DataAugmentation,
   DataLoader,
+  DatasetLoader,
   DecisionTree,
   Dropout,
   EarlyStopping,
@@ -6461,6 +6823,7 @@ function _sampleNormal() {
   SoftmaxRegression,
   TCN,
   TSNE,
+  Tokenizer,
   Trainer,
   TransformerBlock,
   VAE,

package/dist/index.mjs CHANGED Viewed

@@ -2531,6 +2531,155 @@ var DataLoader = class _DataLoader {
   }
 };
+// src/DatasetLoader.ts
+var DatasetLoader = class _DatasetLoader {
+  // ── CSV ─────────────────────────────────────────────────────────────────────
+  /**
+   * Parse a CSV string into a DataPair.
+   *
+   * - The first non-empty row is treated as a header.
+   * - Numeric values are converted with Number(); empty cells become NaN.
+   * - String values are one-hot encoded (one column → N binary columns).
+   * - Empty rows and comment lines (starting with #) are skipped.
+   * - Quoted fields may contain commas, doubled quotes and line breaks.
+   *
+   * @param csv     - raw CSV text
+   * @param options - which columns to use as features / targets
+   * @throws Error if there is no header row plus at least one data row
+   */
+  static fromCSV(csv, options) {
+    const rows = _DatasetLoader._parseCSV(csv);
+    if (rows.length < 2) throw new Error("DatasetLoader.fromCSV: CSV must have a header row and at least one data row.");
+    const header = rows[0];
+    const dataRows = rows.slice(1);
+    return _DatasetLoader._buildDataPair(header, dataRows, options);
+  }
+  // ── JSON ─────────────────────────────────────────────────────────────────────
+  /**
+   * Parse a JSON string (array of objects) into a DataPair.
+   *
+   * Expected format:
+   *   [{ "col1": 1.0, "col2": "cat", "label": "dog" }, ...]
+   *
+   * The column list is the union of the keys of all records (first-seen
+   * order), so a column missing from the first record is still found.
+   * Missing or null values are treated as empty cells.
+   *
+   * @param json    - raw JSON text or a pre-parsed array of objects
+   * @param options - which columns to use as features / targets
+   * @throws Error if the input is not a non-empty array
+   */
+  static fromJSON(json, options) {
+    const records = typeof json === "string" ? JSON.parse(json) : json;
+    if (!Array.isArray(records) || records.length === 0) {
+      throw new Error("DatasetLoader.fromJSON: expected a non-empty JSON array of objects.");
+    }
+    const header = [...new Set(records.flatMap((row) => Object.keys(row)))];
+    const dataRows = records.map((row) => header.map((col) => String(row[col] ?? "")));
+    return _DatasetLoader._buildDataPair(header, dataRows, options);
+  }
+  // ── Private: shared pipeline ──────────────────────────────────────────────
+  /**
+   * Turn a header plus string rows into encoded inputs and targets.
+   *
+   * A column is numeric when every cell is empty or accepted by Number();
+   * otherwise it is categorical and one-hot encoded over its sorted unique values.
+   *
+   * @param header   - column names, aligned with the cells of each row
+   * @param dataRows - rows of string cells; cells missing from short rows count as empty
+   * @param options  - featureCols, targetCols and encodeStrings (default true)
+   * @returns inputs, targets, categoricalMaps, featureNames, targetNames, numRows
+   * @throws Error if a requested column is absent, or if a column is
+   *         non-numeric while encodeStrings is false
+   */
+  static _buildDataPair(header, dataRows, options) {
+    const { featureCols, targetCols, encodeStrings = true } = options;
+    // Resolve every requested column once instead of calling indexOf per row.
+    const colIndex = /* @__PURE__ */ new Map();
+    for (const col of [...featureCols, ...targetCols]) {
+      const idx = header.indexOf(col);
+      if (idx === -1) {
+        throw new Error(`DatasetLoader: column "${col}" not found in header [${header.join(", ")}].`);
+      }
+      colIndex.set(col, idx);
+    }
+    // A short row yields undefined for its trailing cells; treat that as an
+    // empty cell so one ragged row cannot turn a numeric column categorical.
+    const cell = (row, col) => row[colIndex.get(col)] ?? "";
+    const hasOwn = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key);
+    const catMaps = {};
+    const catWidths = /* @__PURE__ */ new Map();
+    // Own-property test: a plain truthiness check on catMaps[col] would also
+    // match inherited members, so a column named "constructor" or "toString"
+    // would be mistaken for a categorical column.
+    const isCategorical = (col) => hasOwn(catMaps, col);
+    const buildEncoder = (cols) => {
+      for (const col of cols) {
+        const values = dataRows.map((row) => cell(row, col));
+        const isNumeric = values.every((v) => v === "" || !isNaN(Number(v)));
+        if (!isNumeric) {
+          if (!encodeStrings) {
+            throw new Error(`DatasetLoader: column "${col}" contains non-numeric values. Set encodeStrings: true to one-hot encode them.`);
+          }
+          const unique = [...new Set(values)].sort();
+          catMaps[col] = Object.fromEntries(unique.map((v, i) => [v, i]));
+          catWidths.set(col, unique.length);
+        }
+      }
+    };
+    buildEncoder(featureCols);
+    buildEncoder(targetCols);
+    const encodeValue = (col, raw) => {
+      if (isCategorical(col)) {
+        const categories = catMaps[col];
+        const vec = new Array(catWidths.get(col)).fill(0);
+        if (hasOwn(categories, raw)) vec[categories[raw]] = 1;
+        return vec;
+      }
+      // Convert with Number(), the same function used for detection above.
+      // parseFloat would accept "0x10" as numeric and then read it as 0.
+      const text = raw.trim();
+      return [text === "" ? NaN : Number(text)];
+    };
+    const expandNames = (cols) => cols.flatMap((col) => {
+      if (isCategorical(col)) {
+        return Object.keys(catMaps[col]).map((cat) => `${col}_${cat}`);
+      }
+      return [col];
+    });
+    const featureNames = expandNames(featureCols);
+    const targetNames = expandNames(targetCols);
+    const inputs = [];
+    const targets = [];
+    for (const row of dataRows) {
+      inputs.push(featureCols.flatMap((col) => encodeValue(col, cell(row, col))));
+      targets.push(targetCols.flatMap((col) => encodeValue(col, cell(row, col))));
+    }
+    return {
+      inputs,
+      targets,
+      categoricalMaps: catMaps,
+      featureNames,
+      targetNames,
+      numRows: dataRows.length
+    };
+  }
+  // ── Private: RFC 4180-compatible CSV parser ───────────────────────────────
+  /**
+   * Split raw CSV text into records and parse each record into trimmed fields.
+   *
+   * Records end at LF or CRLF, but only outside double quotes, so a quoted
+   * field may span several lines (RFC 4180, section 2, rule 6). Blank records
+   * and comment lines (first non-blank character is #) are skipped.
+   *
+   * Note: an unterminated quote makes the rest of the input part of that record.
+   *
+   * @param csv - raw CSV text
+   * @returns array of rows, each row being an array of field strings
+   */
+  static _parseCSV(csv) {
+    const rows = [];
+    let record = "";
+    let inQuotes = false;
+    let inComment = false;
+    // Emit the record collected so far (unless blank or a comment) and reset state.
+    const flush = () => {
+      const trimmed = record.trim();
+      record = "";
+      inQuotes = false;
+      inComment = false;
+      if (!trimmed || trimmed.startsWith("#")) return;
+      rows.push(_DatasetLoader._parseCSVRow(trimmed));
+    };
+    for (let i = 0; i < csv.length; i++) {
+      const ch = csv[i];
+      const isBreak = ch === "\n" || (ch === "\r" && csv[i + 1] === "\n");
+      if (isBreak && !inQuotes) {
+        if (ch === "\r") i++; // consume the LF of a CRLF pair
+        flush();
+        continue;
+      }
+      // Quotes inside a comment line must not open a quoted field.
+      if (ch === "#" && !inQuotes && record.trim() === "") inComment = true;
+      // Toggling on every quote keeps the same parity as _parseCSVRow, because
+      // an escaped quote ("") toggles twice and leaves the state unchanged.
+      if (ch === '"' && !inComment) inQuotes = !inQuotes;
+      record += ch;
+    }
+    flush();
+    return rows;
+  }
+  /**
+   * Split one CSV record into its fields.
+   *
+   * A double quote toggles quoted mode; inside quoted mode a doubled quote ("")
+   * yields one literal quote and commas are ordinary characters. Every field
+   * is trimmed before it is stored.
+   *
+   * @param line - one CSV record
+   * @returns the record's fields, in order
+   */
+  static _parseCSVRow(line) {
+    const fields = [];
+    let current = "";
+    let inQuotes = false;
+    for (let i = 0; i < line.length; i++) {
+      const ch = line[i];
+      if (inQuotes) {
+        if (ch === '"' && line[i + 1] === '"') {
+          current += '"';
+          i++;
+        } else if (ch === '"') {
+          inQuotes = false;
+        } else {
+          current += ch;
+        }
+      } else {
+        if (ch === '"') {
+          inQuotes = true;
+        } else if (ch === ",") {
+          fields.push(current.trim());
+          current = "";
+        } else {
+          current += ch;
+        }
+      }
+    }
+    fields.push(current.trim());
+    return fields;
+  }
+};
 // src/LRScheduler.ts
 var LRScheduler = class {
   // ── Step Decay ────────────────────────────────────────────────────────────
@@ -6015,6 +6164,216 @@ function _binaryRecall(yTrue, yPred, pos) {
   return tp + fn > 0 ? tp / (tp + fn) : 0;
 }
+// src/Tokenizer.ts
+var _Tokenizer = class _Tokenizer {
+  /**
+   * Create an unfitted tokenizer.
+   *
+   * @param options - optional settings:
+   *   - mode: "char", "whitespace" or "word" (default "word"; any other value is treated as "word")
+   *   - lowercase: lowercase text before splitting (default true)
+   *   - maxVocab: upper bound on vocabulary size including special tokens; 0 means unlimited (default 0)
+   *   - specialTokens: extra tokens registered right after the built-in ones (default [])
+   */
+  constructor(options = {}) {
+    this._token2id = /* @__PURE__ */ new Map();
+    this._id2token = /* @__PURE__ */ new Map();
+    this._fitted = false;
+    this._mode = options.mode ?? "word";
+    this._lowercase = options.lowercase ?? true;
+    this._maxVocab = options.maxVocab ?? 0;
+    this._extraSpecial = options.specialTokens ?? [];
+  }
+  // ── Fit ───────────────────────────────────────────────────────────────────
+  /**
+   * Build vocabulary from an array of text strings.
+   * Calling fit() again resets and rebuilds the vocabulary from scratch.
+   *
+   * Special tokens are registered first (PAD, UNK, BOS, EOS, then the extra
+   * ones), followed by corpus tokens ordered by descending frequency, ties
+   * broken with localeCompare. When maxVocab > 0 the total vocabulary,
+   * special tokens included, never exceeds maxVocab unless the special tokens
+   * alone already do.
+   *
+   * @param texts - corpus to build the vocabulary from
+   * @returns this (chainable)
+   */
+  fit(texts) {
+    this._token2id = /* @__PURE__ */ new Map();
+    this._id2token = /* @__PURE__ */ new Map();
+    const specials = [
+      _Tokenizer.PAD,
+      _Tokenizer.UNK,
+      _Tokenizer.BOS,
+      _Tokenizer.EOS,
+      ...this._extraSpecial
+    ];
+    for (const s of specials) this._register(s);
+    const freq = /* @__PURE__ */ new Map();
+    for (const text of texts) {
+      for (const token of this.tokenize(text)) {
+        freq.set(token, (freq.get(token) ?? 0) + 1);
+      }
+    }
+    // Corpus tokens that collide with a special token are already registered;
+    // drop them so they do not use up a vocabulary slot for nothing.
+    let entries = [...freq.entries()].filter(([token]) => !this._token2id.has(token)).sort(
+      ([a, fa], [b, fb]) => fb - fa || a.localeCompare(b)
+    );
+    if (this._maxVocab > 0) {
+      // Budget against the specials actually registered (duplicates count once)
+      // and clamp at 0: a negative slice end would count from the END of the
+      // array and keep almost the whole corpus instead of nothing.
+      const slots = Math.max(0, this._maxVocab - this._token2id.size);
+      entries = entries.slice(0, slots);
+    }
+    for (const [token] of entries) this._register(token);
+    this._fitted = true;
+    return this;
+  }
+  // ── Tokenize ──────────────────────────────────────────────────────────────
+  /**
+   * Split raw text into an array of string tokens (no ID conversion yet).
+   * Useful for inspecting what the tokenizer produces before encoding.
+   *
+   * - "char": one token per UTF-16 code unit, so a character outside the
+   *   Basic Multilingual Plane yields two tokens.
+   * - "whitespace": split on runs of whitespace, empty pieces removed.
+   * - "word" (and any other mode): runs of the letters/digits listed in the
+   *   pattern below, plus each single character that is neither \w nor \s.
+   *   Characters matching neither alternative (for example "_") are dropped.
+   *
+   * @param text - raw input text
+   * @returns array of string tokens
+   */
+  tokenize(text) {
+    const t = this._lowercase ? text.toLowerCase() : text;
+    switch (this._mode) {
+      case "char":
+        return t.split("");
+      case "whitespace":
+        return t.split(/\s+/).filter(Boolean);
+      case "word":
+      default:
+        return t.match(/[a-z0-9àáâãäåæçèéêëìíîïðñòóôõöùúûüýþÿ]+|[^\w\s]/gi) ?? [];
+    }
+  }
+  // ── Encode ────────────────────────────────────────────────────────────────
+  /**
+   * Convert a text string to a sequence of token IDs.
+   * Unknown tokens map to <UNK> (id 1 in a vocabulary built by fit()).
+   *
+   * @param text    - input text
+   * @param options - addBOS / addEOS flags
+   * @returns array of token IDs
+   * @throws Error if the tokenizer has not been fitted
+   */
+  encode(text, options = {}) {
+    this._assertFitted();
+    const ids = [];
+    if (options.addBOS) ids.push(this._token2id.get(_Tokenizer.BOS));
+    for (const token of this.tokenize(text)) {
+      ids.push(this._token2id.get(token) ?? this._token2id.get(_Tokenizer.UNK));
+    }
+    if (options.addEOS) ids.push(this._token2id.get(_Tokenizer.EOS));
+    return ids;
+  }
+  // ── Encode batch ──────────────────────────────────────────────────────────
+  /**
+   * Encode an array of texts, optionally padding/truncating to a fixed length.
+   *
+   * Sequences shorter than padTo are right-padded with the <PAD> id; longer
+   * ones are cut at padTo, which also removes a trailing <EOS> added by addEOS.
+   *
+   * @param texts   - array of input texts
+   * @param options - addBOS / addEOS / padTo
+   * @returns array of ID sequences, one per input text
+   */
+  encodeBatch(texts, options = {}) {
+    const sequences = texts.map((t) => this.encode(t, options));
+    if (options.padTo !== void 0) {
+      const len = options.padTo;
+      const padId = this._token2id.get(_Tokenizer.PAD);
+      return sequences.map((seq) => {
+        if (seq.length >= len) return seq.slice(0, len);
+        return [...seq, ...Array(len - seq.length).fill(padId)];
+      });
+    }
+    return sequences;
+  }
+  // ── Decode ────────────────────────────────────────────────────────────────
+  /**
+   * Convert a sequence of token IDs back to a human-readable string.
+   * IDs missing from the vocabulary decode to <UNK>. Tokens are joined
+   * without a separator in "char" mode and with a single space otherwise.
+   *
+   * @param ids          - array of token IDs
+   * @param stripSpecial - remove PAD/BOS/EOS tokens from output. Default: true
+   * @returns the decoded text
+   * @throws Error if the tokenizer has not been fitted
+   */
+  decode(ids, stripSpecial = true) {
+    this._assertFitted();
+    const specials = /* @__PURE__ */ new Set([_Tokenizer.PAD, _Tokenizer.BOS, _Tokenizer.EOS]);
+    const tokens = [];
+    for (const id of ids) {
+      const token = this._id2token.get(id) ?? _Tokenizer.UNK;
+      if (stripSpecial && specials.has(token)) continue;
+      tokens.push(token);
+    }
+    return this._mode === "char" ? tokens.join("") : tokens.join(" ");
+  }
+  // ── One-hot encoding ──────────────────────────────────────────────────────
+  /**
+   * Convert a sequence of token IDs to one-hot vectors.
+   * Each vector has length `vocabSize` with a single 1 at the token's position.
+   * An ID that is not an integer in [0, vocabSize) produces an all-zero vector.
+   * Useful when feeding tokens directly into a Network without an embedding layer.
+   *
+   * @param ids - array of token IDs (e.g. from encode())
+   * @returns   - 2D array of shape [seqLen, vocabSize]
+   * @throws Error if the tokenizer has not been fitted
+   */
+  oneHot(ids) {
+    this._assertFitted();
+    const V = this.vocabSize;
+    return ids.map((id) => {
+      const vec = new Array(V).fill(0);
+      // Integer check: a fractional index would add a stray property to the
+      // array instead of setting one of its V elements.
+      if (Number.isInteger(id) && id >= 0 && id < V) vec[id] = 1;
+      return vec;
+    });
+  }
+  // ── Vocabulary helpers ────────────────────────────────────────────────────
+  /** Number of tokens in the vocabulary (including special tokens). */
+  get vocabSize() {
+    return this._token2id.size;
+  }
+  /** True once fit() has completed or the instance was restored with fromJSON(). */
+  get isFitted() {
+    return this._fitted;
+  }
+  /** Get the integer ID for a token string, or undefined if not in vocabulary. */
+  tokenToId(token) {
+    return this._token2id.get(token);
+  }
+  /** Get the token string for an integer ID, or undefined if out of range. */
+  idToToken(id) {
+    return this._id2token.get(id);
+  }
+  /**
+   * Return the full vocabulary as an array ordered by ID.
+   * Index i of the returned array is the token with ID i.
+   */
+  getVocabulary() {
+    return Array.from({ length: this.vocabSize }, (_, i) => this._id2token.get(i));
+  }
+  // ── Persistence ───────────────────────────────────────────────────────────
+  /**
+   * Serialize the fitted tokenizer to a plain JSON-compatible object.
+   * Store it with JSON.stringify(); reload with Tokenizer.fromJSON().
+   *
+   * The extra special tokens are included so that a restored tokenizer
+   * registers them again if fit() is called on it later.
+   *
+   * @throws Error if the tokenizer has not been fitted
+   */
+  toJSON() {
+    this._assertFitted();
+    return {
+      mode: this._mode,
+      lowercase: this._lowercase,
+      maxVocab: this._maxVocab,
+      specialTokens: [...this._extraSpecial],
+      token2id: Object.fromEntries(this._token2id)
+    };
+  }
+  /**
+   * Restore a Tokenizer from a snapshot produced by toJSON().
+   * Snapshots without a `specialTokens` field are still accepted; the extra
+   * special tokens then default to an empty list.
+   *
+   * @param snapshot - object with mode, lowercase, maxVocab, token2id and optionally specialTokens
+   * @returns a fitted Tokenizer holding the snapshot's vocabulary
+   */
+  static fromJSON(snapshot) {
+    const tok = new _Tokenizer({
+      mode: snapshot.mode,
+      lowercase: snapshot.lowercase,
+      maxVocab: snapshot.maxVocab,
+      specialTokens: snapshot.specialTokens
+    });
+    for (const [token, id] of Object.entries(snapshot.token2id)) {
+      tok._token2id.set(token, id);
+      tok._id2token.set(id, token);
+    }
+    tok._fitted = true;
+    return tok;
+  }
+  // ── Private ───────────────────────────────────────────────────────────────
+  /** Assign the next free ID to `token`; does nothing if it is already registered. */
+  _register(token) {
+    if (this._token2id.has(token)) return;
+    const id = this._token2id.size;
+    this._token2id.set(token, id);
+    this._id2token.set(id, token);
+  }
+  /** Throw unless a vocabulary is available (after fit() or fromJSON()). */
+  _assertFitted() {
+    if (!this._fitted) {
+      throw new Error("Tokenizer: call fit() before encoding or decoding.");
+    }
+  }
+};
+// ── Built-in special tokens ────────────────────────────────────────────────
+_Tokenizer.PAD = "<PAD>";
+_Tokenizer.UNK = "<UNK>";
+_Tokenizer.BOS = "<BOS>";
+_Tokenizer.EOS = "<EOS>";
+var Tokenizer = _Tokenizer;
 // src/EarlyStopping.ts
 var EarlyStopping = class {
   constructor(options) {
@@ -6301,6 +6660,7 @@ export {
   Conv2D,
   DataAugmentation,
   DataLoader,
+  DatasetLoader,
   DecisionTree,
   Dropout,
   EarlyStopping,
@@ -6340,6 +6700,7 @@ export {
   SoftmaxRegression,
   TCN,
   TSNE,
+  Tokenizer,
   Trainer,
   TransformerBlock,
   VAE,

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@dniskav/neuron",
-  "version": "0.3.1",
+  "version": "0.3.2",
   "description": "Minimal neural network from scratch — neuron, layer, network, backpropagation, classical ML, unsupervised, generative models, autograd. No dependencies.",
   "main": "dist/index.js",
   "module": "dist/index.mjs",