npm - @dniskav/neuron - Versions diffs - 0.3.0 → 0.3.2 - Mend

@dniskav/neuron 0.3.0 → 0.3.2

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (6)

package/dist/index.d.ts (changed)

@@ -570,6 +570,63 @@ declare class Trainer {
     private _computeMetricsArray;
 }
+interface DatasetLoaderOptions {
+    /** Column names to use as input features. */
+    featureCols: string[];
+    /** Column names to use as targets / labels. */
+    targetCols: string[];
+    /**
+     * When true, string values in feature/target columns are one-hot encoded.
+     * When false, non-numeric values throw an error. Default: true.
+     */
+    encodeStrings?: boolean;
+}
+/**
+ * Maps a column name to its {value → one-hot index} dictionary.
+ * Useful for decoding model predictions back to class names.
+ */
+type CategoricalMap = Record<string, Record<string, number>>;
+interface DatasetLoaderResult extends DataPair {
+    /**
+     * For each string column that was one-hot encoded, maps the column name to
+     * the {category → index} dictionary used during encoding.
+     */
+    categoricalMaps: CategoricalMap;
+    /** Column names in the order they appear in each input vector. */
+    featureNames: string[];
+    /** Column names (or expanded one-hot names) in the order they appear in each target vector. */
+    targetNames: string[];
+    /** Total number of rows parsed. */
+    numRows: number;
+}
+declare class DatasetLoader {
+    /**
+     * Parse a CSV string into a DataPair.
+     *
+     * - The first non-empty row is treated as a header.
+     * - Numeric values are parsed with parseFloat.
+     * - String values are one-hot encoded (one column → N binary columns).
+     * - Empty rows and comment lines (starting with #) are skipped.
+     *
+     * @param csv     - raw CSV text
+     * @param options - which columns to use as features / targets
+     */
+    static fromCSV(csv: string, options: DatasetLoaderOptions): DatasetLoaderResult;
+    /**
+     * Parse a JSON string (array of objects) into a DataPair.
+     *
+     * Expected format:
+     *   [{ "col1": 1.0, "col2": "cat", "label": "dog" }, ...]
+     *
+     * @param json    - raw JSON text or a pre-parsed array of objects
+     * @param options - which columns to use as features / targets
+     */
+    static fromJSON(json: string | Record<string, unknown>[], options: DatasetLoaderOptions): DatasetLoaderResult;
+    private static _buildDataPair;
+    private static _parseCSV;
+    private static _parseCSVRow;
+}
 declare class LRScheduler {
     stepDecay(lr: number, epoch: number, dropRate: number, epochsDrop: number): number;
     exponentialDecay(lr: number, epoch: number, decayRate: number): number;
@@ -893,6 +950,123 @@ declare class TCN {
     train(sequence: number[][], targets: number[][], lr: number): number;
 }
+type Word2VecModel = 'skipgram' | 'cbow';
+interface Word2VecOptions {
+    /** Size of the sliding context window on each side of the center word. Default 2. */
+    windowSize?: number;
+    /** Training architecture. Default 'skipgram'. */
+    model?: Word2VecModel;
+    /** Ignore words with corpus frequency below this threshold. Default 1. */
+    minCount?: number;
+}
+declare class Word2Vec {
+    /** Learned word vectors, shape [vocabSize][embeddingDim]. */
+    embeddings: number[][];
+    /** Maps each vocabulary word to its integer index. */
+    vocab: Map<string, number>;
+    vocabSize: number;
+    embeddingDim: number;
+    private _indexToWord;
+    private _W2;
+    private _windowSize;
+    private _model;
+    private _minCount;
+    private _trained;
+    constructor(embeddingDim?: number, options?: Word2VecOptions);
+    buildVocab(sentences: string[][]): void;
+    static tokenize(text: string): string[];
+    train(sentences: string[][], lr?: number, epochs?: number): number[];
+    getEmbedding(word: string): number[];
+    similarity(word1: string, word2: string): number;
+    mostSimilar(word: string, topK?: number): {
+        word: string;
+        score: number;
+    }[];
+    analogy(positive1: string, negative: string, positive2: string, topK?: number): {
+        word: string;
+        score: number;
+    }[];
+    private _skipgramStep;
+    private _cbowStep;
+    private _hiddenToScores;
+    private _nearestByVector;
+    private _cosine;
+}
+interface TSNEOptions {
+    /** Dimensionality of the output embedding. Default 2. */
+    nComponents?: number;
+    /**
+     * Perplexity — loosely controls the effective number of neighbors considered
+     * for each point. Typical values: 5–50. Default 30.
+     * Must be less than the number of data points.
+     */
+    perplexity?: number;
+    /** Learning rate for gradient descent. Default 200. */
+    lr?: number;
+    /** Number of gradient-descent iterations. Default 1000. */
+    nIter?: number;
+    /**
+     * Seed for the pseudo-random number generator.
+     * Set to any integer for reproducible results. Default uses Math.random.
+     */
+    seed?: number;
+}
+declare class TSNE {
+    /** Result of the embedding, shape [n][nComponents]. Available after fit(). */
+    embedding: number[][];
+    private readonly _nComponents;
+    private readonly _perplexity;
+    private readonly _lr;
+    private readonly _nIter;
+    private readonly _seed;
+    private _klDivergence;
+    private _P;
+    constructor(options?: TSNEOptions);
+    fit(X: number[][]): void;
+    fitTransform(X: number[][]): number[][];
+    kl(): number;
+    private _computePcond;
+}
+declare class PositionalEncoding {
+    static encode(pos: number, dModel: number): number[];
+    static encodeSequence(seqLen: number, dModel: number): number[][];
+    static apply(embeddings: number[][], seqLen?: number): number[][];
+}
+declare class LearnedPositionalEncoding {
+    readonly maxSeqLen: number;
+    readonly dModel: number;
+    weights: number[][];
+    constructor(maxSeqLen: number, dModel: number);
+    getEncoding(pos: number): number[];
+    apply(embeddings: number[][], seqLen?: number): number[][];
+    update(dWeights: number[][], lr: number): void;
+}
+declare class Augmenter {
+    static addNoise(x: number[], sigma?: number): number[];
+    static dropoutFeatures(x: number[], rate?: number): number[];
+    static augment(x: number[], noiseStd?: number, dropRate?: number): number[];
+    static makePair(x: number[]): [number[], number[]];
+}
+declare class ContrastiveLearning {
+    encoder: NetworkN;
+    projectionHead: NetworkN;
+    temperature: number;
+    constructor(inputSize: number, encoderHidden: number[], projectionDim: number, options?: {
+        temperature?: number;
+        encoderOptions?: NetworkNOptions;
+    });
+    encode(x: number[]): number[];
+    project(x: number[]): number[];
+    static cosineSimilarity(a: number[], b: number[]): number;
+    computeLoss(pairs: [number[], number[]][]): number;
+    trainStep(pairs: [number[], number[]][], lr: number): number;
+    private _forwardProjections;
+    private _ntXentLoss;
+}
 declare class GAN {
     readonly generator: NetworkN;
     readonly discriminator: NetworkN;
@@ -990,6 +1164,121 @@ declare function perplexity(yTrue: number[], probabilities: number[][]): number;
 declare function printConfusionMatrix(matrix: number[][], labels?: string[]): void;
 declare function classificationReport(yTrue: number[], yPred: number[], labels?: string[]): void;
+type TokenizerMode = 'char' | 'word' | 'whitespace';
+interface TokenizerOptions {
+    /** Tokenization strategy. Default: 'word' */
+    mode?: TokenizerMode;
+    /** Normalize text to lowercase before processing. Default: true */
+    lowercase?: boolean;
+    /** Maximum vocabulary size (most frequent tokens kept). 0 = unlimited. Default: 0 */
+    maxVocab?: number;
+    /** Additional special tokens to reserve (appended after PAD/UNK/BOS/EOS). */
+    specialTokens?: string[];
+}
+interface EncodeOptions {
+    /** Prepend <BOS> token. Default: false */
+    addBOS?: boolean;
+    /** Append <EOS> token. Default: false */
+    addEOS?: boolean;
+}
+interface EncodeBatchOptions extends EncodeOptions {
+    /**
+     * Pad or truncate all sequences to this length.
+     * Sequences shorter than padTo are right-padded with <PAD> (id 0).
+     * Sequences longer than padTo are truncated on the right.
+     * If omitted, sequences are left at their natural length.
+     */
+    padTo?: number;
+}
+interface TokenizerSnapshot {
+    mode: TokenizerMode;
+    lowercase: boolean;
+    maxVocab: number;
+    token2id: Record<string, number>;
+}
+declare class Tokenizer {
+    static readonly PAD = "<PAD>";
+    static readonly UNK = "<UNK>";
+    static readonly BOS = "<BOS>";
+    static readonly EOS = "<EOS>";
+    private readonly _mode;
+    private readonly _lowercase;
+    private readonly _maxVocab;
+    private readonly _extraSpecial;
+    private _token2id;
+    private _id2token;
+    private _fitted;
+    constructor(options?: TokenizerOptions);
+    /**
+     * Build vocabulary from an array of text strings.
+     * Calling fit() again resets and rebuilds the vocabulary from scratch.
+     *
+     * @param texts - corpus to build the vocabulary from
+     * @returns this (chainable)
+     */
+    fit(texts: string[]): this;
+    /**
+     * Split raw text into an array of string tokens (no ID conversion yet).
+     * Useful for inspecting what the tokenizer produces before encoding.
+     */
+    tokenize(text: string): string[];
+    /**
+     * Convert a text string to a sequence of token IDs.
+     * Unknown tokens map to <UNK> (id 1).
+     *
+     * @param text    - input text
+     * @param options - addBOS / addEOS flags
+     */
+    encode(text: string, options?: EncodeOptions): number[];
+    /**
+     * Encode an array of texts, optionally padding/truncating to a fixed length.
+     *
+     * @param texts   - array of input texts
+     * @param options - addBOS / addEOS / padTo
+     */
+    encodeBatch(texts: string[], options?: EncodeBatchOptions): number[][];
+    /**
+     * Convert a sequence of token IDs back to a human-readable string.
+     *
+     * @param ids          - array of token IDs
+     * @param stripSpecial - remove PAD/BOS/EOS tokens from output. Default: true
+     */
+    decode(ids: number[], stripSpecial?: boolean): string;
+    /**
+     * Convert a sequence of token IDs to one-hot vectors.
+     * Each vector has length `vocabSize` with a single 1 at the token's position.
+     * Useful when feeding tokens directly into a Network without an embedding layer.
+     *
+     * @param ids - array of token IDs (e.g. from encode())
+     * @returns   - 2D array of shape [seqLen, vocabSize]
+     */
+    oneHot(ids: number[]): number[][];
+    /** Number of tokens in the vocabulary (including special tokens). */
+    get vocabSize(): number;
+    /** True if fit() has been called at least once. */
+    get isFitted(): boolean;
+    /** Get the integer ID for a token string, or undefined if not in vocabulary. */
+    tokenToId(token: string): number | undefined;
+    /** Get the token string for an integer ID, or undefined if out of range. */
+    idToToken(id: number): string | undefined;
+    /**
+     * Return the full vocabulary as an array ordered by ID.
+     * Index i of the returned array is the token with ID i.
+     */
+    getVocabulary(): string[];
+    /**
+     * Serialize the fitted tokenizer to a plain JSON-compatible object.
+     * Store it with JSON.stringify(); reload with Tokenizer.fromJSON().
+     */
+    toJSON(): TokenizerSnapshot;
+    /**
+     * Restore a Tokenizer from a snapshot produced by toJSON().
+     */
+    static fromJSON(snapshot: TokenizerSnapshot): Tokenizer;
+    private _register;
+    private _assertFitted;
+}
 declare class EarlyStopping {
     bestValue: number;
     readonly patience: number;
@@ -1062,4 +1351,4 @@ declare class DataAugmentation {
     };
 }
-export { type Activation, Adam, AttentionHead, Autoencoder, BatchNorm, BiasVector, CausalConv1D, ClipOptimizer, ClippedOptimizerFactory, Conv1D, Conv2D, DataAugmentation, DataLoader, type DataPair, DecisionTree, Dropout, EarlyStopping, EmbeddingMatrix, Flatten, GAN, GRULayer, GaussianNaiveBayes, HopfieldNetwork, KMeans, type KMeansOptions, LRScheduler, LSTMLayer, Layer, LayerNorm, LinearRegression, LogisticRegression, LossPlotter, MaxPool2D, ModelSaver, Momentum, MultiHeadAttention, Network, NetworkLSTM, type NetworkLSTMOptions, NetworkN, type NetworkNOptions, NetworkTransformer, type NetworkTransformerOptions, NetworkTransformerRL, type NetworkTransformerRLOptions, Neuron, NeuronN, type Optimizer, type OptimizerFactory, PCA, Perceptron, RNN, SGD, SOM, type SOMOptions, Seq2Seq, type Serializable, SoftmaxRegression, TCN, type TrainDataset, type TrainMetrics, type TrainableNetwork, type TrainableNetworkWithWeights, Trainer, type TrainerOptions, TransformerBlock, type TransformerBlockOptions, VAE, Value, WeightInspector, WeightMatrix, type WeightStats, accuracy, auc, classificationReport, confusionMatrix, crossEntropy, crossEntropyDelta, crossEntropyDeltaRaw, defaultOptimizer, elu, f1Score, leakyRelu, linear, mae, makeElu, makeLeakyRelu, matMul, mse, mseDelta, perplexity, precision, printConfusionMatrix, r2Score, recall, relu, rmse, rocCurve, sigmoid, softmax, softmaxBackward, tanh, transpose, validate2DArray, validateArray, validateArrayMinLength, validateNumber };
+export { type Activation, Adam, AttentionHead, Augmenter, Autoencoder, BatchNorm, BiasVector, type CategoricalMap, CausalConv1D, ClipOptimizer, ClippedOptimizerFactory, ContrastiveLearning, Conv1D, Conv2D, DataAugmentation, DataLoader, type DataPair, DatasetLoader, type DatasetLoaderOptions, type DatasetLoaderResult, DecisionTree, Dropout, EarlyStopping, EmbeddingMatrix, type EncodeBatchOptions, type EncodeOptions, Flatten, GAN, GRULayer, GaussianNaiveBayes, HopfieldNetwork, KMeans, type KMeansOptions, LRScheduler, LSTMLayer, Layer, LayerNorm, LearnedPositionalEncoding, LinearRegression, LogisticRegression, LossPlotter, MaxPool2D, ModelSaver, Momentum, MultiHeadAttention, Network, NetworkLSTM, type NetworkLSTMOptions, NetworkN, type NetworkNOptions, NetworkTransformer, type NetworkTransformerOptions, NetworkTransformerRL, type NetworkTransformerRLOptions, Neuron, NeuronN, type Optimizer, type OptimizerFactory, PCA, Perceptron, PositionalEncoding, RNN, SGD, SOM, type SOMOptions, Seq2Seq, type Serializable, SoftmaxRegression, TCN, TSNE, type TSNEOptions, Tokenizer, type TokenizerMode, type TokenizerOptions, type TokenizerSnapshot, type TrainDataset, type TrainMetrics, type TrainableNetwork, type TrainableNetworkWithWeights, Trainer, type TrainerOptions, TransformerBlock, type TransformerBlockOptions, VAE, Value, WeightInspector, WeightMatrix, type WeightStats, Word2Vec, type Word2VecModel, type Word2VecOptions, accuracy, auc, classificationReport, confusionMatrix, crossEntropy, crossEntropyDelta, crossEntropyDeltaRaw, defaultOptimizer, elu, f1Score, leakyRelu, linear, mae, makeElu, makeLeakyRelu, matMul, mse, mseDelta, perplexity, precision, printConfusionMatrix, r2Score, recall, relu, rmse, rocCurve, sigmoid, softmax, softmaxBackward, tanh, transpose, validate2DArray, validateArray, validateArrayMinLength, validateNumber };