@dniskav/neuron 0.3.1 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +34 -0
- package/dist/index.d.mts +173 -1
- package/dist/index.d.ts +173 -1
- package/dist/index.js +363 -0
- package/dist/index.mjs +361 -0
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -678,8 +678,42 @@ npm test # run test suite
|
|
|
678
678
|
|
|
679
679
|
If you are an AI agent or LLM working with this codebase, read [AGENTS.md](AGENTS.md) first. It contains the full class hierarchy, design constraints, and what this library does not do.
|
|
680
680
|
|
|
681
|
+
## Roadmap (nice to have)
|
|
682
|
+
|
|
683
|
+
These features are intentionally out of scope for the current didactic focus but are documented here for reference.
|
|
684
|
+
|
|
685
|
+
### ONNX export
|
|
686
|
+
|
|
687
|
+
Export trained models to the [ONNX](https://onnx.ai/) interchange format so they can be run in Python (onnxruntime), browsers (onnxruntime-web), mobile, or production inference servers.
|
|
688
|
+
|
|
689
|
+
**What it would require:**
|
|
690
|
+
- Serialize each layer's weights + op type to the protobuf ONNX schema (`onnx.proto`).
|
|
691
|
+
- Map neuron layer types to standard ONNX operators (`Gemm`, `MatMul`, `LSTM`, `Conv`, `Relu`, `Softmax`, …).
|
|
692
|
+
- Handle dynamic batch dimensions in the graph IR.
|
|
693
|
+
- Ship a build step that compiles the `.proto` definitions (adds a dev dependency on `protobufjs` or `onnx-proto`).
|
|
694
|
+
|
|
695
|
+
**Why it's skipped:** It adds a non-trivial build pipeline and an extra dependency, whereas the library has zero runtime dependencies by design. ONNX export only becomes worthwhile once you have outgrown this library for training — and at that point PyTorch/TF are the right tools.
|
|
696
|
+
|
|
697
|
+
### WebGL / WASM backend
|
|
698
|
+
|
|
699
|
+
Replace the current pure-JS number arrays with a GPU-accelerated or WASM-compiled backend so larger models (e.g. ViT, GPT-2 scale) become feasible in the browser.
|
|
700
|
+
|
|
701
|
+
**What it would require:**
|
|
702
|
+
- Abstract `Tensor` type that backends implement (JS arrays, WebGL textures, WASM memory).
|
|
703
|
+
- WebGL backend: encode matrix ops as fragment-shader programs (similar to `gpu.js` or `tfjs-backend-webgl`).
|
|
704
|
+
- WASM backend: compile a BLAS-like C/Rust core (e.g. `wasm-bindgen` + `ndarray`) and bind it to TypeScript.
|
|
705
|
+
- Every layer's `forward` / `backward` rewritten against the `Tensor` API.
|
|
706
|
+
|
|
707
|
+
**Why it's skipped:** The goal is to make the math readable. GPU shader code and WASM bindings are implementation details that obscure the algorithms. The library intentionally trades performance for pedagogical clarity.
|
|
708
|
+
|
|
709
|
+
---
|
|
710
|
+
|
|
681
711
|
## Changelog
|
|
682
712
|
|
|
713
|
+
### v0.3.2
|
|
714
|
+
- **New — NLP:** `Tokenizer` (char / word / whitespace modes, special tokens PAD/UNK/BOS/EOS, one-hot encoding, `fit` / `encode` / `decode` / `encodeBatch`, JSON serialization)
|
|
715
|
+
- **New — Data:** `DatasetLoader` (parse CSV and JSON into `DataPair`; auto one-hot encoding for string columns; returns `categoricalMaps` for decoding predictions)
|
|
716
|
+
|
|
683
717
|
### v0.3.1
|
|
684
718
|
- **New — Embeddings:** `Word2Vec` (Skip-gram + CBOW, full-softmax, cosine similarity, analogies), `TSNE` (binary-search perplexity, Student-t kernel, KL gradient, early exaggeration, seeded PRNG), `PositionalEncoding` (sinusoidal, Vaswani et al.), `LearnedPositionalEncoding` (trainable), `ContrastiveLearning` (NT-Xent, SimCLR encoder + projection head), `Augmenter` (noise, feature dropout, `makePair`)
|
|
685
719
|
|
package/dist/index.d.mts
CHANGED
|
@@ -570,6 +570,63 @@ declare class Trainer {
|
|
|
570
570
|
private _computeMetricsArray;
|
|
571
571
|
}
|
|
572
572
|
|
|
573
|
+
interface DatasetLoaderOptions {
|
|
574
|
+
/** Column names to use as input features. */
|
|
575
|
+
featureCols: string[];
|
|
576
|
+
/** Column names to use as targets / labels. */
|
|
577
|
+
targetCols: string[];
|
|
578
|
+
/**
|
|
579
|
+
* When true, string values in feature/target columns are one-hot encoded.
|
|
580
|
+
* When false, non-numeric values throw an error. Default: true.
|
|
581
|
+
*/
|
|
582
|
+
encodeStrings?: boolean;
|
|
583
|
+
}
|
|
584
|
+
/**
|
|
585
|
+
* Maps a column name to its {value → one-hot index} dictionary.
|
|
586
|
+
* Useful for decoding model predictions back to class names.
|
|
587
|
+
*/
|
|
588
|
+
type CategoricalMap = Record<string, Record<string, number>>;
|
|
589
|
+
interface DatasetLoaderResult extends DataPair {
|
|
590
|
+
/**
|
|
591
|
+
* For each column that was one-hot encoded, maps the column name to
|
|
592
|
+
* the {category → index} dictionary used during encoding (indices follow the default-sorted order of the column's distinct values).
|
|
593
|
+
*/
|
|
594
|
+
categoricalMaps: CategoricalMap;
|
|
595
|
+
/** Feature column names (one-hot encoded columns are expanded to `<column>_<category>` names) in the order they appear in each input vector. */
|
|
596
|
+
featureNames: string[];
|
|
597
|
+
/** Target column names (one-hot encoded columns are expanded to `<column>_<category>` names) in the order they appear in each target vector. */
|
|
598
|
+
targetNames: string[];
|
|
599
|
+
/** Total number of rows parsed. */
|
|
600
|
+
numRows: number;
|
|
601
|
+
}
|
|
602
|
+
declare class DatasetLoader {
|
|
603
|
+
/**
|
|
604
|
+
* Parse a CSV string into a DataPair.
|
|
605
|
+
*
|
|
606
|
+
* - The first non-empty row is treated as a header.
|
|
607
|
+
* - Numeric values are parsed with parseFloat.
|
|
608
|
+
* - Columns containing any non-numeric value are one-hot encoded (one column → N binary columns), or rejected with an Error when `encodeStrings` is false.
|
|
609
|
+
* - Empty rows and comment lines (starting with #) are skipped.
|
|
610
|
+
*
|
|
611
|
+
* @param csv - raw CSV text; must yield a header row plus at least one data row, otherwise an Error is thrown
|
|
612
|
+
* @param options - which columns to use as features / targets; every listed column must exist in the header, otherwise an Error is thrown
|
|
613
|
+
*/
|
|
614
|
+
static fromCSV(csv: string, options: DatasetLoaderOptions): DatasetLoaderResult;
|
|
615
|
+
/**
|
|
616
|
+
* Parse a JSON string, or an already-parsed array of objects, into a DataPair.
|
|
617
|
+
*
|
|
618
|
+
* Expected format (column names are taken from the keys of the first object):
|
|
619
|
+
* [{ "col1": 1.0, "col2": "cat", "label": "dog" }, ...]
|
|
620
|
+
*
|
|
621
|
+
* @param json - raw JSON text or a pre-parsed array of objects; must be a non-empty array, otherwise an Error is thrown
|
|
622
|
+
* @param options - which columns to use as features / targets; every listed column must be a key of the first object, otherwise an Error is thrown
|
|
623
|
+
*/
|
|
624
|
+
static fromJSON(json: string | Record<string, unknown>[], options: DatasetLoaderOptions): DatasetLoaderResult;
|
|
625
|
+
private static _buildDataPair;
|
|
626
|
+
private static _parseCSV;
|
|
627
|
+
private static _parseCSVRow;
|
|
628
|
+
}
|
|
629
|
+
|
|
573
630
|
declare class LRScheduler {
|
|
574
631
|
stepDecay(lr: number, epoch: number, dropRate: number, epochsDrop: number): number;
|
|
575
632
|
exponentialDecay(lr: number, epoch: number, decayRate: number): number;
|
|
@@ -1107,6 +1164,121 @@ declare function perplexity(yTrue: number[], probabilities: number[][]): number;
|
|
|
1107
1164
|
declare function printConfusionMatrix(matrix: number[][], labels?: string[]): void;
|
|
1108
1165
|
declare function classificationReport(yTrue: number[], yPred: number[], labels?: string[]): void;
|
|
1109
1166
|
|
|
1167
|
+
type TokenizerMode = 'char' | 'word' | 'whitespace';
|
|
1168
|
+
interface TokenizerOptions {
|
|
1169
|
+
/** Tokenization strategy. Default: 'word' */
|
|
1170
|
+
mode?: TokenizerMode;
|
|
1171
|
+
/** Normalize text to lowercase before processing. Default: true */
|
|
1172
|
+
lowercase?: boolean;
|
|
1173
|
+
/** Maximum vocabulary size (most frequent tokens kept). 0 = unlimited. Default: 0 */
|
|
1174
|
+
maxVocab?: number;
|
|
1175
|
+
/** Additional special tokens to reserve (appended after PAD/UNK/BOS/EOS). */
|
|
1176
|
+
specialTokens?: string[];
|
|
1177
|
+
}
|
|
1178
|
+
interface EncodeOptions {
|
|
1179
|
+
/** Prepend <BOS> token. Default: false */
|
|
1180
|
+
addBOS?: boolean;
|
|
1181
|
+
/** Append <EOS> token. Default: false */
|
|
1182
|
+
addEOS?: boolean;
|
|
1183
|
+
}
|
|
1184
|
+
interface EncodeBatchOptions extends EncodeOptions {
|
|
1185
|
+
/**
|
|
1186
|
+
* Pad or truncate all sequences to this length.
|
|
1187
|
+
* Sequences shorter than padTo are right-padded with <PAD> (id 0).
|
|
1188
|
+
* Sequences longer than padTo are truncated on the right.
|
|
1189
|
+
* If omitted, sequences are left at their natural length.
|
|
1190
|
+
*/
|
|
1191
|
+
padTo?: number;
|
|
1192
|
+
}
|
|
1193
|
+
interface TokenizerSnapshot {
|
|
1194
|
+
mode: TokenizerMode;
|
|
1195
|
+
lowercase: boolean;
|
|
1196
|
+
maxVocab: number;
|
|
1197
|
+
token2id: Record<string, number>;
|
|
1198
|
+
}
|
|
1199
|
+
declare class Tokenizer {
|
|
1200
|
+
static readonly PAD = "<PAD>";
|
|
1201
|
+
static readonly UNK = "<UNK>";
|
|
1202
|
+
static readonly BOS = "<BOS>";
|
|
1203
|
+
static readonly EOS = "<EOS>";
|
|
1204
|
+
private readonly _mode;
|
|
1205
|
+
private readonly _lowercase;
|
|
1206
|
+
private readonly _maxVocab;
|
|
1207
|
+
private readonly _extraSpecial;
|
|
1208
|
+
private _token2id;
|
|
1209
|
+
private _id2token;
|
|
1210
|
+
private _fitted;
|
|
1211
|
+
constructor(options?: TokenizerOptions);
|
|
1212
|
+
/**
|
|
1213
|
+
* Build vocabulary from an array of text strings.
|
|
1214
|
+
* Calling fit() again resets and rebuilds the vocabulary from scratch.
|
|
1215
|
+
*
|
|
1216
|
+
* @param texts - corpus to build the vocabulary from
|
|
1217
|
+
* @returns this (chainable)
|
|
1218
|
+
*/
|
|
1219
|
+
fit(texts: string[]): this;
|
|
1220
|
+
/**
|
|
1221
|
+
* Split raw text into an array of string tokens (no ID conversion yet).
|
|
1222
|
+
* Useful for inspecting what the tokenizer produces before encoding.
|
|
1223
|
+
*/
|
|
1224
|
+
tokenize(text: string): string[];
|
|
1225
|
+
/**
|
|
1226
|
+
* Convert a text string to a sequence of token IDs.
|
|
1227
|
+
* Unknown tokens map to <UNK> (id 1).
|
|
1228
|
+
*
|
|
1229
|
+
* @param text - input text
|
|
1230
|
+
* @param options - addBOS / addEOS flags
|
|
1231
|
+
*/
|
|
1232
|
+
encode(text: string, options?: EncodeOptions): number[];
|
|
1233
|
+
/**
|
|
1234
|
+
* Encode an array of texts, optionally padding/truncating to a fixed length.
|
|
1235
|
+
*
|
|
1236
|
+
* @param texts - array of input texts
|
|
1237
|
+
* @param options - addBOS / addEOS / padTo
|
|
1238
|
+
*/
|
|
1239
|
+
encodeBatch(texts: string[], options?: EncodeBatchOptions): number[][];
|
|
1240
|
+
/**
|
|
1241
|
+
* Convert a sequence of token IDs back to a human-readable string.
|
|
1242
|
+
*
|
|
1243
|
+
* @param ids - array of token IDs
|
|
1244
|
+
* @param stripSpecial - remove PAD/BOS/EOS tokens from output. Default: true
|
|
1245
|
+
*/
|
|
1246
|
+
decode(ids: number[], stripSpecial?: boolean): string;
|
|
1247
|
+
/**
|
|
1248
|
+
* Convert a sequence of token IDs to one-hot vectors.
|
|
1249
|
+
* Each vector has length `vocabSize` with a single 1 at the token's position.
|
|
1250
|
+
* Useful when feeding tokens directly into a Network without an embedding layer.
|
|
1251
|
+
*
|
|
1252
|
+
* @param ids - array of token IDs (e.g. from encode())
|
|
1253
|
+
* @returns - 2D array of shape [seqLen, vocabSize]
|
|
1254
|
+
*/
|
|
1255
|
+
oneHot(ids: number[]): number[][];
|
|
1256
|
+
/** Number of tokens in the vocabulary (including special tokens). */
|
|
1257
|
+
get vocabSize(): number;
|
|
1258
|
+
/** True if fit() has been called at least once. */
|
|
1259
|
+
get isFitted(): boolean;
|
|
1260
|
+
/** Get the integer ID for a token string, or undefined if not in vocabulary. */
|
|
1261
|
+
tokenToId(token: string): number | undefined;
|
|
1262
|
+
/** Get the token string for an integer ID, or undefined if out of range. */
|
|
1263
|
+
idToToken(id: number): string | undefined;
|
|
1264
|
+
/**
|
|
1265
|
+
* Return the full vocabulary as an array ordered by ID.
|
|
1266
|
+
* Index i of the returned array is the token with ID i.
|
|
1267
|
+
*/
|
|
1268
|
+
getVocabulary(): string[];
|
|
1269
|
+
/**
|
|
1270
|
+
* Serialize the fitted tokenizer to a plain JSON-compatible object.
|
|
1271
|
+
* Store it with JSON.stringify(); reload with Tokenizer.fromJSON().
|
|
1272
|
+
*/
|
|
1273
|
+
toJSON(): TokenizerSnapshot;
|
|
1274
|
+
/**
|
|
1275
|
+
* Restore a Tokenizer from a snapshot produced by toJSON().
|
|
1276
|
+
*/
|
|
1277
|
+
static fromJSON(snapshot: TokenizerSnapshot): Tokenizer;
|
|
1278
|
+
private _register;
|
|
1279
|
+
private _assertFitted;
|
|
1280
|
+
}
|
|
1281
|
+
|
|
1110
1282
|
declare class EarlyStopping {
|
|
1111
1283
|
bestValue: number;
|
|
1112
1284
|
readonly patience: number;
|
|
@@ -1179,4 +1351,4 @@ declare class DataAugmentation {
|
|
|
1179
1351
|
};
|
|
1180
1352
|
}
|
|
1181
1353
|
|
|
1182
|
-
export { type Activation, Adam, AttentionHead, Augmenter, Autoencoder, BatchNorm, BiasVector, CausalConv1D, ClipOptimizer, ClippedOptimizerFactory, ContrastiveLearning, Conv1D, Conv2D, DataAugmentation, DataLoader, type DataPair, DecisionTree, Dropout, EarlyStopping, EmbeddingMatrix, Flatten, GAN, GRULayer, GaussianNaiveBayes, HopfieldNetwork, KMeans, type KMeansOptions, LRScheduler, LSTMLayer, Layer, LayerNorm, LearnedPositionalEncoding, LinearRegression, LogisticRegression, LossPlotter, MaxPool2D, ModelSaver, Momentum, MultiHeadAttention, Network, NetworkLSTM, type NetworkLSTMOptions, NetworkN, type NetworkNOptions, NetworkTransformer, type NetworkTransformerOptions, NetworkTransformerRL, type NetworkTransformerRLOptions, Neuron, NeuronN, type Optimizer, type OptimizerFactory, PCA, Perceptron, PositionalEncoding, RNN, SGD, SOM, type SOMOptions, Seq2Seq, type Serializable, SoftmaxRegression, TCN, TSNE, type TSNEOptions, type TrainDataset, type TrainMetrics, type TrainableNetwork, type TrainableNetworkWithWeights, Trainer, type TrainerOptions, TransformerBlock, type TransformerBlockOptions, VAE, Value, WeightInspector, WeightMatrix, type WeightStats, Word2Vec, type Word2VecModel, type Word2VecOptions, accuracy, auc, classificationReport, confusionMatrix, crossEntropy, crossEntropyDelta, crossEntropyDeltaRaw, defaultOptimizer, elu, f1Score, leakyRelu, linear, mae, makeElu, makeLeakyRelu, matMul, mse, mseDelta, perplexity, precision, printConfusionMatrix, r2Score, recall, relu, rmse, rocCurve, sigmoid, softmax, softmaxBackward, tanh, transpose, validate2DArray, validateArray, validateArrayMinLength, validateNumber };
|
|
1354
|
+
export { type Activation, Adam, AttentionHead, Augmenter, Autoencoder, BatchNorm, BiasVector, type CategoricalMap, CausalConv1D, ClipOptimizer, ClippedOptimizerFactory, ContrastiveLearning, Conv1D, Conv2D, DataAugmentation, DataLoader, type DataPair, DatasetLoader, type DatasetLoaderOptions, type DatasetLoaderResult, DecisionTree, Dropout, EarlyStopping, EmbeddingMatrix, type EncodeBatchOptions, type EncodeOptions, Flatten, GAN, GRULayer, GaussianNaiveBayes, HopfieldNetwork, KMeans, type KMeansOptions, LRScheduler, LSTMLayer, Layer, LayerNorm, LearnedPositionalEncoding, LinearRegression, LogisticRegression, LossPlotter, MaxPool2D, ModelSaver, Momentum, MultiHeadAttention, Network, NetworkLSTM, type NetworkLSTMOptions, NetworkN, type NetworkNOptions, NetworkTransformer, type NetworkTransformerOptions, NetworkTransformerRL, type NetworkTransformerRLOptions, Neuron, NeuronN, type Optimizer, type OptimizerFactory, PCA, Perceptron, PositionalEncoding, RNN, SGD, SOM, type SOMOptions, Seq2Seq, type Serializable, SoftmaxRegression, TCN, TSNE, type TSNEOptions, Tokenizer, type TokenizerMode, type TokenizerOptions, type TokenizerSnapshot, type TrainDataset, type TrainMetrics, type TrainableNetwork, type TrainableNetworkWithWeights, Trainer, type TrainerOptions, TransformerBlock, type TransformerBlockOptions, VAE, Value, WeightInspector, WeightMatrix, type WeightStats, Word2Vec, type Word2VecModel, type Word2VecOptions, accuracy, auc, classificationReport, confusionMatrix, crossEntropy, crossEntropyDelta, crossEntropyDeltaRaw, defaultOptimizer, elu, f1Score, leakyRelu, linear, mae, makeElu, makeLeakyRelu, matMul, mse, mseDelta, perplexity, precision, printConfusionMatrix, r2Score, recall, relu, rmse, rocCurve, sigmoid, softmax, softmaxBackward, tanh, transpose, validate2DArray, validateArray, validateArrayMinLength, validateNumber };
|
package/dist/index.d.ts
CHANGED
|
@@ -570,6 +570,63 @@ declare class Trainer {
|
|
|
570
570
|
private _computeMetricsArray;
|
|
571
571
|
}
|
|
572
572
|
|
|
573
|
+
interface DatasetLoaderOptions {
|
|
574
|
+
/** Column names to use as input features. */
|
|
575
|
+
featureCols: string[];
|
|
576
|
+
/** Column names to use as targets / labels. */
|
|
577
|
+
targetCols: string[];
|
|
578
|
+
/**
|
|
579
|
+
* When true, string values in feature/target columns are one-hot encoded.
|
|
580
|
+
* When false, non-numeric values throw an error. Default: true.
|
|
581
|
+
*/
|
|
582
|
+
encodeStrings?: boolean;
|
|
583
|
+
}
|
|
584
|
+
/**
|
|
585
|
+
* Maps a column name to its {value → one-hot index} dictionary.
|
|
586
|
+
* Useful for decoding model predictions back to class names.
|
|
587
|
+
*/
|
|
588
|
+
type CategoricalMap = Record<string, Record<string, number>>;
|
|
589
|
+
interface DatasetLoaderResult extends DataPair {
|
|
590
|
+
/**
|
|
591
|
+
* For each column that was one-hot encoded, maps the column name to
|
|
592
|
+
* the {category → index} dictionary used during encoding (indices follow the default-sorted order of the column's distinct values).
|
|
593
|
+
*/
|
|
594
|
+
categoricalMaps: CategoricalMap;
|
|
595
|
+
/** Feature column names (one-hot encoded columns are expanded to `<column>_<category>` names) in the order they appear in each input vector. */
|
|
596
|
+
featureNames: string[];
|
|
597
|
+
/** Target column names (one-hot encoded columns are expanded to `<column>_<category>` names) in the order they appear in each target vector. */
|
|
598
|
+
targetNames: string[];
|
|
599
|
+
/** Total number of rows parsed. */
|
|
600
|
+
numRows: number;
|
|
601
|
+
}
|
|
602
|
+
declare class DatasetLoader {
|
|
603
|
+
/**
|
|
604
|
+
* Parse a CSV string into a DataPair.
|
|
605
|
+
*
|
|
606
|
+
* - The first non-empty row is treated as a header.
|
|
607
|
+
* - Numeric values are parsed with parseFloat.
|
|
608
|
+
* - Columns containing any non-numeric value are one-hot encoded (one column → N binary columns), or rejected with an Error when `encodeStrings` is false.
|
|
609
|
+
* - Empty rows and comment lines (starting with #) are skipped.
|
|
610
|
+
*
|
|
611
|
+
* @param csv - raw CSV text; must yield a header row plus at least one data row, otherwise an Error is thrown
|
|
612
|
+
* @param options - which columns to use as features / targets; every listed column must exist in the header, otherwise an Error is thrown
|
|
613
|
+
*/
|
|
614
|
+
static fromCSV(csv: string, options: DatasetLoaderOptions): DatasetLoaderResult;
|
|
615
|
+
/**
|
|
616
|
+
* Parse a JSON string, or an already-parsed array of objects, into a DataPair.
|
|
617
|
+
*
|
|
618
|
+
* Expected format (column names are taken from the keys of the first object):
|
|
619
|
+
* [{ "col1": 1.0, "col2": "cat", "label": "dog" }, ...]
|
|
620
|
+
*
|
|
621
|
+
* @param json - raw JSON text or a pre-parsed array of objects; must be a non-empty array, otherwise an Error is thrown
|
|
622
|
+
* @param options - which columns to use as features / targets; every listed column must be a key of the first object, otherwise an Error is thrown
|
|
623
|
+
*/
|
|
624
|
+
static fromJSON(json: string | Record<string, unknown>[], options: DatasetLoaderOptions): DatasetLoaderResult;
|
|
625
|
+
private static _buildDataPair;
|
|
626
|
+
private static _parseCSV;
|
|
627
|
+
private static _parseCSVRow;
|
|
628
|
+
}
|
|
629
|
+
|
|
573
630
|
declare class LRScheduler {
|
|
574
631
|
stepDecay(lr: number, epoch: number, dropRate: number, epochsDrop: number): number;
|
|
575
632
|
exponentialDecay(lr: number, epoch: number, decayRate: number): number;
|
|
@@ -1107,6 +1164,121 @@ declare function perplexity(yTrue: number[], probabilities: number[][]): number;
|
|
|
1107
1164
|
declare function printConfusionMatrix(matrix: number[][], labels?: string[]): void;
|
|
1108
1165
|
declare function classificationReport(yTrue: number[], yPred: number[], labels?: string[]): void;
|
|
1109
1166
|
|
|
1167
|
+
type TokenizerMode = 'char' | 'word' | 'whitespace';
|
|
1168
|
+
interface TokenizerOptions {
|
|
1169
|
+
/** Tokenization strategy. Default: 'word' */
|
|
1170
|
+
mode?: TokenizerMode;
|
|
1171
|
+
/** Normalize text to lowercase before processing. Default: true */
|
|
1172
|
+
lowercase?: boolean;
|
|
1173
|
+
/** Maximum vocabulary size (most frequent tokens kept). 0 = unlimited. Default: 0 */
|
|
1174
|
+
maxVocab?: number;
|
|
1175
|
+
/** Additional special tokens to reserve (appended after PAD/UNK/BOS/EOS). */
|
|
1176
|
+
specialTokens?: string[];
|
|
1177
|
+
}
|
|
1178
|
+
interface EncodeOptions {
|
|
1179
|
+
/** Prepend <BOS> token. Default: false */
|
|
1180
|
+
addBOS?: boolean;
|
|
1181
|
+
/** Append <EOS> token. Default: false */
|
|
1182
|
+
addEOS?: boolean;
|
|
1183
|
+
}
|
|
1184
|
+
interface EncodeBatchOptions extends EncodeOptions {
|
|
1185
|
+
/**
|
|
1186
|
+
* Pad or truncate all sequences to this length.
|
|
1187
|
+
* Sequences shorter than padTo are right-padded with <PAD> (id 0).
|
|
1188
|
+
* Sequences longer than padTo are truncated on the right.
|
|
1189
|
+
* If omitted, sequences are left at their natural length.
|
|
1190
|
+
*/
|
|
1191
|
+
padTo?: number;
|
|
1192
|
+
}
|
|
1193
|
+
interface TokenizerSnapshot {
|
|
1194
|
+
mode: TokenizerMode;
|
|
1195
|
+
lowercase: boolean;
|
|
1196
|
+
maxVocab: number;
|
|
1197
|
+
token2id: Record<string, number>;
|
|
1198
|
+
}
|
|
1199
|
+
declare class Tokenizer {
|
|
1200
|
+
static readonly PAD = "<PAD>";
|
|
1201
|
+
static readonly UNK = "<UNK>";
|
|
1202
|
+
static readonly BOS = "<BOS>";
|
|
1203
|
+
static readonly EOS = "<EOS>";
|
|
1204
|
+
private readonly _mode;
|
|
1205
|
+
private readonly _lowercase;
|
|
1206
|
+
private readonly _maxVocab;
|
|
1207
|
+
private readonly _extraSpecial;
|
|
1208
|
+
private _token2id;
|
|
1209
|
+
private _id2token;
|
|
1210
|
+
private _fitted;
|
|
1211
|
+
constructor(options?: TokenizerOptions);
|
|
1212
|
+
/**
|
|
1213
|
+
* Build vocabulary from an array of text strings.
|
|
1214
|
+
* Calling fit() again resets and rebuilds the vocabulary from scratch.
|
|
1215
|
+
*
|
|
1216
|
+
* @param texts - corpus to build the vocabulary from
|
|
1217
|
+
* @returns this (chainable)
|
|
1218
|
+
*/
|
|
1219
|
+
fit(texts: string[]): this;
|
|
1220
|
+
/**
|
|
1221
|
+
* Split raw text into an array of string tokens (no ID conversion yet).
|
|
1222
|
+
* Useful for inspecting what the tokenizer produces before encoding.
|
|
1223
|
+
*/
|
|
1224
|
+
tokenize(text: string): string[];
|
|
1225
|
+
/**
|
|
1226
|
+
* Convert a text string to a sequence of token IDs.
|
|
1227
|
+
* Unknown tokens map to <UNK> (id 1).
|
|
1228
|
+
*
|
|
1229
|
+
* @param text - input text
|
|
1230
|
+
* @param options - addBOS / addEOS flags
|
|
1231
|
+
*/
|
|
1232
|
+
encode(text: string, options?: EncodeOptions): number[];
|
|
1233
|
+
/**
|
|
1234
|
+
* Encode an array of texts, optionally padding/truncating to a fixed length.
|
|
1235
|
+
*
|
|
1236
|
+
* @param texts - array of input texts
|
|
1237
|
+
* @param options - addBOS / addEOS / padTo
|
|
1238
|
+
*/
|
|
1239
|
+
encodeBatch(texts: string[], options?: EncodeBatchOptions): number[][];
|
|
1240
|
+
/**
|
|
1241
|
+
* Convert a sequence of token IDs back to a human-readable string.
|
|
1242
|
+
*
|
|
1243
|
+
* @param ids - array of token IDs
|
|
1244
|
+
* @param stripSpecial - remove PAD/BOS/EOS tokens from output. Default: true
|
|
1245
|
+
*/
|
|
1246
|
+
decode(ids: number[], stripSpecial?: boolean): string;
|
|
1247
|
+
/**
|
|
1248
|
+
* Convert a sequence of token IDs to one-hot vectors.
|
|
1249
|
+
* Each vector has length `vocabSize` with a single 1 at the token's position.
|
|
1250
|
+
* Useful when feeding tokens directly into a Network without an embedding layer.
|
|
1251
|
+
*
|
|
1252
|
+
* @param ids - array of token IDs (e.g. from encode())
|
|
1253
|
+
* @returns - 2D array of shape [seqLen, vocabSize]
|
|
1254
|
+
*/
|
|
1255
|
+
oneHot(ids: number[]): number[][];
|
|
1256
|
+
/** Number of tokens in the vocabulary (including special tokens). */
|
|
1257
|
+
get vocabSize(): number;
|
|
1258
|
+
/** True if fit() has been called at least once. */
|
|
1259
|
+
get isFitted(): boolean;
|
|
1260
|
+
/** Get the integer ID for a token string, or undefined if not in vocabulary. */
|
|
1261
|
+
tokenToId(token: string): number | undefined;
|
|
1262
|
+
/** Get the token string for an integer ID, or undefined if out of range. */
|
|
1263
|
+
idToToken(id: number): string | undefined;
|
|
1264
|
+
/**
|
|
1265
|
+
* Return the full vocabulary as an array ordered by ID.
|
|
1266
|
+
* Index i of the returned array is the token with ID i.
|
|
1267
|
+
*/
|
|
1268
|
+
getVocabulary(): string[];
|
|
1269
|
+
/**
|
|
1270
|
+
* Serialize the fitted tokenizer to a plain JSON-compatible object.
|
|
1271
|
+
* Store it with JSON.stringify(); reload with Tokenizer.fromJSON().
|
|
1272
|
+
*/
|
|
1273
|
+
toJSON(): TokenizerSnapshot;
|
|
1274
|
+
/**
|
|
1275
|
+
* Restore a Tokenizer from a snapshot produced by toJSON().
|
|
1276
|
+
*/
|
|
1277
|
+
static fromJSON(snapshot: TokenizerSnapshot): Tokenizer;
|
|
1278
|
+
private _register;
|
|
1279
|
+
private _assertFitted;
|
|
1280
|
+
}
|
|
1281
|
+
|
|
1110
1282
|
declare class EarlyStopping {
|
|
1111
1283
|
bestValue: number;
|
|
1112
1284
|
readonly patience: number;
|
|
@@ -1179,4 +1351,4 @@ declare class DataAugmentation {
|
|
|
1179
1351
|
};
|
|
1180
1352
|
}
|
|
1181
1353
|
|
|
1182
|
-
export { type Activation, Adam, AttentionHead, Augmenter, Autoencoder, BatchNorm, BiasVector, CausalConv1D, ClipOptimizer, ClippedOptimizerFactory, ContrastiveLearning, Conv1D, Conv2D, DataAugmentation, DataLoader, type DataPair, DecisionTree, Dropout, EarlyStopping, EmbeddingMatrix, Flatten, GAN, GRULayer, GaussianNaiveBayes, HopfieldNetwork, KMeans, type KMeansOptions, LRScheduler, LSTMLayer, Layer, LayerNorm, LearnedPositionalEncoding, LinearRegression, LogisticRegression, LossPlotter, MaxPool2D, ModelSaver, Momentum, MultiHeadAttention, Network, NetworkLSTM, type NetworkLSTMOptions, NetworkN, type NetworkNOptions, NetworkTransformer, type NetworkTransformerOptions, NetworkTransformerRL, type NetworkTransformerRLOptions, Neuron, NeuronN, type Optimizer, type OptimizerFactory, PCA, Perceptron, PositionalEncoding, RNN, SGD, SOM, type SOMOptions, Seq2Seq, type Serializable, SoftmaxRegression, TCN, TSNE, type TSNEOptions, type TrainDataset, type TrainMetrics, type TrainableNetwork, type TrainableNetworkWithWeights, Trainer, type TrainerOptions, TransformerBlock, type TransformerBlockOptions, VAE, Value, WeightInspector, WeightMatrix, type WeightStats, Word2Vec, type Word2VecModel, type Word2VecOptions, accuracy, auc, classificationReport, confusionMatrix, crossEntropy, crossEntropyDelta, crossEntropyDeltaRaw, defaultOptimizer, elu, f1Score, leakyRelu, linear, mae, makeElu, makeLeakyRelu, matMul, mse, mseDelta, perplexity, precision, printConfusionMatrix, r2Score, recall, relu, rmse, rocCurve, sigmoid, softmax, softmaxBackward, tanh, transpose, validate2DArray, validateArray, validateArrayMinLength, validateNumber };
|
|
1354
|
+
export { type Activation, Adam, AttentionHead, Augmenter, Autoencoder, BatchNorm, BiasVector, type CategoricalMap, CausalConv1D, ClipOptimizer, ClippedOptimizerFactory, ContrastiveLearning, Conv1D, Conv2D, DataAugmentation, DataLoader, type DataPair, DatasetLoader, type DatasetLoaderOptions, type DatasetLoaderResult, DecisionTree, Dropout, EarlyStopping, EmbeddingMatrix, type EncodeBatchOptions, type EncodeOptions, Flatten, GAN, GRULayer, GaussianNaiveBayes, HopfieldNetwork, KMeans, type KMeansOptions, LRScheduler, LSTMLayer, Layer, LayerNorm, LearnedPositionalEncoding, LinearRegression, LogisticRegression, LossPlotter, MaxPool2D, ModelSaver, Momentum, MultiHeadAttention, Network, NetworkLSTM, type NetworkLSTMOptions, NetworkN, type NetworkNOptions, NetworkTransformer, type NetworkTransformerOptions, NetworkTransformerRL, type NetworkTransformerRLOptions, Neuron, NeuronN, type Optimizer, type OptimizerFactory, PCA, Perceptron, PositionalEncoding, RNN, SGD, SOM, type SOMOptions, Seq2Seq, type Serializable, SoftmaxRegression, TCN, TSNE, type TSNEOptions, Tokenizer, type TokenizerMode, type TokenizerOptions, type TokenizerSnapshot, type TrainDataset, type TrainMetrics, type TrainableNetwork, type TrainableNetworkWithWeights, Trainer, type TrainerOptions, TransformerBlock, type TransformerBlockOptions, VAE, Value, WeightInspector, WeightMatrix, type WeightStats, Word2Vec, type Word2VecModel, type Word2VecOptions, accuracy, auc, classificationReport, confusionMatrix, crossEntropy, crossEntropyDelta, crossEntropyDeltaRaw, defaultOptimizer, elu, f1Score, leakyRelu, linear, mae, makeElu, makeLeakyRelu, matMul, mse, mseDelta, perplexity, precision, printConfusionMatrix, r2Score, recall, relu, rmse, rocCurve, sigmoid, softmax, softmaxBackward, tanh, transpose, validate2DArray, validateArray, validateArrayMinLength, validateNumber };
|
package/dist/index.js
CHANGED
|
@@ -34,6 +34,7 @@ __export(index_exports, {
|
|
|
34
34
|
Conv2D: () => Conv2D,
|
|
35
35
|
DataAugmentation: () => DataAugmentation,
|
|
36
36
|
DataLoader: () => DataLoader,
|
|
37
|
+
DatasetLoader: () => DatasetLoader,
|
|
37
38
|
DecisionTree: () => DecisionTree,
|
|
38
39
|
Dropout: () => Dropout,
|
|
39
40
|
EarlyStopping: () => EarlyStopping,
|
|
@@ -73,6 +74,7 @@ __export(index_exports, {
|
|
|
73
74
|
SoftmaxRegression: () => SoftmaxRegression,
|
|
74
75
|
TCN: () => TCN,
|
|
75
76
|
TSNE: () => TSNE,
|
|
77
|
+
Tokenizer: () => Tokenizer,
|
|
76
78
|
Trainer: () => Trainer,
|
|
77
79
|
TransformerBlock: () => TransformerBlock,
|
|
78
80
|
VAE: () => VAE,
|
|
@@ -2651,6 +2653,155 @@ var DataLoader = class _DataLoader {
|
|
|
2651
2653
|
}
|
|
2652
2654
|
};
|
|
2653
2655
|
|
|
2656
|
+
// src/DatasetLoader.ts
|
|
2657
|
+
var DatasetLoader = class _DatasetLoader {
|
|
2658
|
+
// ── CSV ─────────────────────────────────────────────────────────────────────
|
|
2659
|
+
/**
|
|
2660
|
+
* Parse a CSV string into a DataPair by splitting it into rows and handing the header and data rows to the shared _buildDataPair pipeline.
|
|
2661
|
+
*
|
|
2662
|
+
* - The first non-empty row is treated as a header.
|
|
2663
|
+
* - Numeric values are parsed with parseFloat.
|
|
2664
|
+
* - Columns containing any non-numeric value are one-hot encoded (one column → N binary columns), or rejected with an Error when `encodeStrings` is false.
|
|
2665
|
+
* - Empty rows and comment lines (starting with #) are skipped.
|
|
2666
|
+
*
|
|
2667
|
+
* @param csv - raw CSV text; must yield a header row plus at least one data row, otherwise an Error is thrown
|
|
2668
|
+
* @param options - which columns to use as features / targets; every listed column must exist in the header, otherwise an Error is thrown
|
|
2669
|
+
*/
|
|
2670
|
+
static fromCSV(csv, options) {
|
|
2671
|
+
const rows = _DatasetLoader._parseCSV(csv);
|
|
2672
|
+
if (rows.length < 2) throw new Error("DatasetLoader.fromCSV: CSV must have a header row and at least one data row.");
|
|
2673
|
+
const header = rows[0];
|
|
2674
|
+
const dataRows = rows.slice(1);
|
|
2675
|
+
return _DatasetLoader._buildDataPair(header, dataRows, options);
|
|
2676
|
+
}
|
|
2677
|
+
// ── JSON ─────────────────────────────────────────────────────────────────────
|
|
2678
|
+
/**
|
|
2679
|
+
* Parse a JSON string, or an already-parsed array of objects, into a DataPair via the shared _buildDataPair pipeline.
|
|
2680
|
+
*
|
|
2681
|
+
* Expected format (column names are taken from the keys of the first object):
|
|
2682
|
+
* [{ "col1": 1.0, "col2": "cat", "label": "dog" }, ...]
|
|
2683
|
+
*
|
|
2684
|
+
* @param json - raw JSON text or a pre-parsed array of objects; must be a non-empty array, otherwise an Error is thrown
|
|
2685
|
+
* @param options - which columns to use as features / targets; every listed column must be a key of the first object, otherwise an Error is thrown
|
|
2686
|
+
*/
|
|
2687
|
+
static fromJSON(json, options) {
|
|
2688
|
+
const records = typeof json === "string" ? JSON.parse(json) : json;
|
|
2689
|
+
if (!Array.isArray(records) || records.length === 0) {
|
|
2690
|
+
throw new Error("DatasetLoader.fromJSON: expected a non-empty JSON array of objects.");
|
|
2691
|
+
}
|
|
2692
|
+
const header = Object.keys(records[0]); // column names come from the first record only
|
|
2693
|
+
const dataRows = records.map((row) => header.map((col) => String(row[col] ?? ""))); // null/undefined cells become ""; every other value is stringified
|
|
2694
|
+
return _DatasetLoader._buildDataPair(header, dataRows, options);
|
|
2695
|
+
}
|
|
2696
|
+
// ── Private: shared pipeline ──────────────────────────────────────────────
|
|
2697
|
+
static _buildDataPair(header, dataRows, options) {
|
|
2698
|
+
const { featureCols, targetCols, encodeStrings = true } = options;
|
|
2699
|
+
for (const col of [...featureCols, ...targetCols]) {
|
|
2700
|
+
if (!header.includes(col)) {
|
|
2701
|
+
throw new Error(`DatasetLoader: column "${col}" not found in header [${header.join(", ")}].`);
|
|
2702
|
+
}
|
|
2703
|
+
}
|
|
2704
|
+
const catMaps = {};
|
|
2705
|
+
const buildEncoder = (cols) => {
|
|
2706
|
+
for (const col of cols) {
|
|
2707
|
+
const colIdx = header.indexOf(col);
|
|
2708
|
+
const values = dataRows.map((row) => row[colIdx]);
|
|
2709
|
+
const isNumeric = values.every((v) => v === "" || !isNaN(Number(v)));
|
|
2710
|
+
if (!isNumeric) {
|
|
2711
|
+
if (!encodeStrings) {
|
|
2712
|
+
throw new Error(`DatasetLoader: column "${col}" contains non-numeric values. Set encodeStrings: true to one-hot encode them.`);
|
|
2713
|
+
}
|
|
2714
|
+
const unique = [...new Set(values)].sort();
|
|
2715
|
+
catMaps[col] = Object.fromEntries(unique.map((v, i) => [v, i]));
|
|
2716
|
+
}
|
|
2717
|
+
}
|
|
2718
|
+
};
|
|
2719
|
+
buildEncoder(featureCols);
|
|
2720
|
+
buildEncoder(targetCols);
|
|
2721
|
+
const encodeValue = (col, raw) => {
|
|
2722
|
+
if (catMaps[col]) {
|
|
2723
|
+
const categories = catMaps[col];
|
|
2724
|
+
const n = Object.keys(categories).length;
|
|
2725
|
+
const vec = new Array(n).fill(0);
|
|
2726
|
+
const idx = categories[raw];
|
|
2727
|
+
if (idx !== void 0) vec[idx] = 1;
|
|
2728
|
+
return vec;
|
|
2729
|
+
}
|
|
2730
|
+
return [parseFloat(raw)];
|
|
2731
|
+
};
|
|
2732
|
+
const expandNames = (cols) => cols.flatMap((col) => {
|
|
2733
|
+
if (catMaps[col]) {
|
|
2734
|
+
return Object.keys(catMaps[col]).map((cat) => `${col}_${cat}`);
|
|
2735
|
+
}
|
|
2736
|
+
return [col];
|
|
2737
|
+
});
|
|
2738
|
+
const featureNames = expandNames(featureCols);
|
|
2739
|
+
const targetNames = expandNames(targetCols);
|
|
2740
|
+
const inputs = [];
|
|
2741
|
+
const targets = [];
|
|
2742
|
+
for (const row of dataRows) {
|
|
2743
|
+
const input = featureCols.flatMap((col) => {
|
|
2744
|
+
const raw = row[header.indexOf(col)];
|
|
2745
|
+
return encodeValue(col, raw);
|
|
2746
|
+
});
|
|
2747
|
+
const target = targetCols.flatMap((col) => {
|
|
2748
|
+
const raw = row[header.indexOf(col)];
|
|
2749
|
+
return encodeValue(col, raw);
|
|
2750
|
+
});
|
|
2751
|
+
inputs.push(input);
|
|
2752
|
+
targets.push(target);
|
|
2753
|
+
}
|
|
2754
|
+
return {
|
|
2755
|
+
inputs,
|
|
2756
|
+
targets,
|
|
2757
|
+
categoricalMaps: catMaps,
|
|
2758
|
+
featureNames,
|
|
2759
|
+
targetNames,
|
|
2760
|
+
numRows: dataRows.length
|
|
2761
|
+
};
|
|
2762
|
+
}
|
|
2763
|
+
// ── Private: RFC 4180-compatible CSV parser ───────────────────────────────
|
|
2764
|
+
static _parseCSV(csv) {
|
|
2765
|
+
const rows = [];
|
|
2766
|
+
const lines = csv.split(/\r?\n/);
|
|
2767
|
+
for (const line of lines) {
|
|
2768
|
+
const trimmed = line.trim();
|
|
2769
|
+
if (!trimmed || trimmed.startsWith("#")) continue;
|
|
2770
|
+
rows.push(_DatasetLoader._parseCSVRow(trimmed));
|
|
2771
|
+
}
|
|
2772
|
+
return rows;
|
|
2773
|
+
}
|
|
2774
|
+
static _parseCSVRow(line) {
|
|
2775
|
+
const fields = [];
|
|
2776
|
+
let current = "";
|
|
2777
|
+
let inQuotes = false;
|
|
2778
|
+
for (let i = 0; i < line.length; i++) {
|
|
2779
|
+
const ch = line[i];
|
|
2780
|
+
if (inQuotes) {
|
|
2781
|
+
if (ch === '"' && line[i + 1] === '"') {
|
|
2782
|
+
current += '"';
|
|
2783
|
+
i++;
|
|
2784
|
+
} else if (ch === '"') {
|
|
2785
|
+
inQuotes = false;
|
|
2786
|
+
} else {
|
|
2787
|
+
current += ch;
|
|
2788
|
+
}
|
|
2789
|
+
} else {
|
|
2790
|
+
if (ch === '"') {
|
|
2791
|
+
inQuotes = true;
|
|
2792
|
+
} else if (ch === ",") {
|
|
2793
|
+
fields.push(current.trim());
|
|
2794
|
+
current = "";
|
|
2795
|
+
} else {
|
|
2796
|
+
current += ch;
|
|
2797
|
+
}
|
|
2798
|
+
}
|
|
2799
|
+
}
|
|
2800
|
+
fields.push(current.trim());
|
|
2801
|
+
return fields;
|
|
2802
|
+
}
|
|
2803
|
+
};
|
|
2804
|
+
|
|
2654
2805
|
// src/LRScheduler.ts
|
|
2655
2806
|
var LRScheduler = class {
|
|
2656
2807
|
// ── Step Decay ────────────────────────────────────────────────────────────
|
|
@@ -6135,6 +6286,216 @@ function _binaryRecall(yTrue, yPred, pos) {
|
|
|
6135
6286
|
return tp + fn > 0 ? tp / (tp + fn) : 0;
|
|
6136
6287
|
}
|
|
6137
6288
|
|
|
6289
|
+
// src/Tokenizer.ts
|
|
6290
|
+
var _Tokenizer = class _Tokenizer {
  /**
   * Create an unfitted tokenizer.
   *
   * @param options - optional settings: `mode` ("word" | "whitespace" | "char",
   *   default "word"; any other value behaves like "word"), `lowercase`
   *   (default true), `maxVocab` (cap on the total vocabulary size including
   *   special tokens; 0 = no cap, the default) and `specialTokens` (extra
   *   tokens registered right after the built-in ones).
   */
  constructor(options = {}) {
    this._token2id = /* @__PURE__ */ new Map();
    this._id2token = /* @__PURE__ */ new Map();
    this._fitted = false;
    this._mode = options.mode ?? "word";
    this._lowercase = options.lowercase ?? true;
    this._maxVocab = options.maxVocab ?? 0;
    this._extraSpecial = options.specialTokens ?? [];
  }
  // ── Fit ───────────────────────────────────────────────────────────────────
  /**
   * Build vocabulary from an array of text strings.
   * Calling fit() again resets and rebuilds the vocabulary from scratch.
   *
   * Special tokens are registered first (PAD=0, UNK=1, BOS=2, EOS=3, then any
   * extra special tokens). Corpus tokens follow, most frequent first, ties
   * broken with localeCompare. When maxVocab > 0 the corpus tokens are cut so
   * the total vocabulary does not exceed maxVocab; special tokens are always
   * kept, even if they alone exceed the cap.
   *
   * @param texts - corpus to build the vocabulary from
   * @returns this (chainable)
   */
  fit(texts) {
    this._token2id = /* @__PURE__ */ new Map();
    this._id2token = /* @__PURE__ */ new Map();
    const specials = [
      _Tokenizer.PAD,
      _Tokenizer.UNK,
      _Tokenizer.BOS,
      _Tokenizer.EOS,
      ...this._extraSpecial
    ];
    for (const s of specials) this._register(s);
    const freq = /* @__PURE__ */ new Map();
    for (const text of texts) {
      for (const token of this.tokenize(text)) {
        freq.set(token, (freq.get(token) ?? 0) + 1);
      }
    }
    // Corpus tokens that collide with a special token are already registered;
    // drop them so they do not use up a slot of the cap.
    let entries = [...freq.entries()].filter(([token]) => !this._token2id.has(token)).sort(
      ([a, fa], [b, fb]) => fb - fa || a.localeCompare(b)
    );
    if (this._maxVocab > 0) {
      // Clamp at 0: a negative slice end would count from the END of the
      // array and keep almost every token instead of none. Using the actual
      // registered count also ignores duplicated special tokens.
      const budget = Math.max(0, this._maxVocab - this._token2id.size);
      entries = entries.slice(0, budget);
    }
    for (const [token] of entries) this._register(token);
    this._fitted = true;
    return this;
  }
  // ── Tokenize ──────────────────────────────────────────────────────────────
  /**
   * Split raw text into an array of string tokens (no ID conversion yet).
   * Useful for inspecting what the tokenizer produces before encoding.
   *
   * - "char": one token per Unicode code point (astral characters such as
   *   emoji stay whole).
   * - "whitespace": split on runs of whitespace, empty pieces removed.
   * - "word" (default): runs of letters/digits from the listed character
   *   class, plus single non-word, non-space characters.
   *
   * @param text - input text; lowercased first when `lowercase` is enabled
   * @returns array of string tokens
   */
  tokenize(text) {
    const t = this._lowercase ? text.toLowerCase() : text;
    switch (this._mode) {
      case "char":
        // Iterate by code point; split("") would cut surrogate pairs in half.
        return Array.from(t);
      case "whitespace":
        return t.split(/\s+/).filter(Boolean);
      case "word":
      default:
        return t.match(/[a-z0-9àáâãäåæçèéêëìíîïðñòóôõöùúûüýþÿ]+|[^\w\s]/gi) ?? [];
    }
  }
  // ── Encode ────────────────────────────────────────────────────────────────
  /**
   * Convert a text string to a sequence of token IDs.
   * Unknown tokens map to <UNK> (id 1).
   *
   * @param text    - input text
   * @param options - addBOS / addEOS flags
   * @returns array of token IDs
   * @throws Error if the tokenizer has not been fitted
   */
  encode(text, options = {}) {
    this._assertFitted();
    const ids = [];
    if (options.addBOS) ids.push(this._token2id.get(_Tokenizer.BOS));
    for (const token of this.tokenize(text)) {
      ids.push(this._token2id.get(token) ?? this._token2id.get(_Tokenizer.UNK));
    }
    if (options.addEOS) ids.push(this._token2id.get(_Tokenizer.EOS));
    return ids;
  }
  // ── Encode batch ──────────────────────────────────────────────────────────
  /**
   * Encode an array of texts, optionally padding/truncating to a fixed length.
   *
   * @param texts   - array of input texts
   * @param options - addBOS / addEOS / padTo
   * @returns one ID sequence per text; with padTo set, every sequence has
   *          exactly padTo entries (truncated, or right-padded with <PAD>)
   */
  encodeBatch(texts, options = {}) {
    const sequences = texts.map((t) => this.encode(t, options));
    if (options.padTo !== void 0) {
      const len = options.padTo;
      const padId = this._token2id.get(_Tokenizer.PAD);
      return sequences.map((seq) => {
        if (seq.length >= len) return seq.slice(0, len);
        return [...seq, ...Array(len - seq.length).fill(padId)];
      });
    }
    return sequences;
  }
  // ── Decode ────────────────────────────────────────────────────────────────
  /**
   * Convert a sequence of token IDs back to a human-readable string.
   * IDs that are not in the vocabulary are rendered as <UNK>. Tokens are
   * joined with "" in char mode and with a single space otherwise.
   *
   * @param ids          - array of token IDs
   * @param stripSpecial - remove PAD/BOS/EOS tokens from output. Default: true
   * @returns the decoded text
   */
  decode(ids, stripSpecial = true) {
    this._assertFitted();
    const specials = /* @__PURE__ */ new Set([_Tokenizer.PAD, _Tokenizer.BOS, _Tokenizer.EOS]);
    const tokens = [];
    for (const id of ids) {
      const token = this._id2token.get(id) ?? _Tokenizer.UNK;
      if (stripSpecial && specials.has(token)) continue;
      tokens.push(token);
    }
    return this._mode === "char" ? tokens.join("") : tokens.join(" ");
  }
  // ── One-hot encoding ──────────────────────────────────────────────────────
  /**
   * Convert a sequence of token IDs to one-hot vectors.
   * Each vector has length `vocabSize` with a single 1 at the token's position;
   * an ID outside [0, vocabSize) produces an all-zero vector.
   * Useful when feeding tokens directly into a Network without an embedding layer.
   *
   * @param ids - array of token IDs (e.g. from encode())
   * @returns   - 2D array of shape [seqLen, vocabSize]
   */
  oneHot(ids) {
    this._assertFitted();
    const V = this.vocabSize;
    return ids.map((id) => {
      const vec = new Array(V).fill(0);
      if (id >= 0 && id < V) vec[id] = 1;
      return vec;
    });
  }
  // ── Vocabulary helpers ────────────────────────────────────────────────────
  /** Number of tokens in the vocabulary (including special tokens). */
  get vocabSize() {
    return this._token2id.size;
  }
  /** True once fit() has run or the instance was restored with fromJSON(). */
  get isFitted() {
    return this._fitted;
  }
  /** Get the integer ID for a token string, or undefined if not in vocabulary. */
  tokenToId(token) {
    return this._token2id.get(token);
  }
  /** Get the token string for an integer ID, or undefined if out of range. */
  idToToken(id) {
    return this._id2token.get(id);
  }
  /**
   * Return the full vocabulary as an array ordered by ID.
   * Index i of the returned array is the token with ID i.
   */
  getVocabulary() {
    return Array.from({ length: this.vocabSize }, (_, i) => this._id2token.get(i));
  }
  // ── Persistence ───────────────────────────────────────────────────────────
  /**
   * Serialize the fitted tokenizer to a plain JSON-compatible object.
   * Store it with JSON.stringify(); reload with Tokenizer.fromJSON().
   * The snapshot holds mode, lowercase, maxVocab and the token → id table;
   * the `specialTokens` option itself is not stored separately.
   *
   * @throws Error if the tokenizer has not been fitted
   */
  toJSON() {
    this._assertFitted();
    return {
      mode: this._mode,
      lowercase: this._lowercase,
      maxVocab: this._maxVocab,
      token2id: Object.fromEntries(this._token2id)
    };
  }
  /**
   * Restore a Tokenizer from a snapshot produced by toJSON().
   * The returned instance is marked as fitted.
   */
  static fromJSON(snapshot) {
    const tok = new _Tokenizer({
      mode: snapshot.mode,
      lowercase: snapshot.lowercase,
      maxVocab: snapshot.maxVocab
    });
    for (const [token, id] of Object.entries(snapshot.token2id)) {
      tok._token2id.set(token, id);
      tok._id2token.set(id, token);
    }
    tok._fitted = true;
    return tok;
  }
  // ── Private ───────────────────────────────────────────────────────────────
  /** Assign the next free ID to `token`; no-op if it is already registered. */
  _register(token) {
    if (this._token2id.has(token)) return;
    const id = this._token2id.size;
    this._token2id.set(token, id);
    this._id2token.set(id, token);
  }
  /** Throw unless the vocabulary has been built or restored. */
  _assertFitted() {
    if (!this._fitted) {
      throw new Error("Tokenizer: call fit() before encoding or decoding.");
    }
  }
};
// ── Built-in special tokens ────────────────────────────────────────────────
_Tokenizer.PAD = "<PAD>";
_Tokenizer.UNK = "<UNK>";
_Tokenizer.BOS = "<BOS>";
_Tokenizer.EOS = "<EOS>";
var Tokenizer = _Tokenizer;
|
|
6498
|
+
|
|
6138
6499
|
// src/EarlyStopping.ts
|
|
6139
6500
|
var EarlyStopping = class {
|
|
6140
6501
|
constructor(options) {
|
|
@@ -6422,6 +6783,7 @@ function _sampleNormal() {
|
|
|
6422
6783
|
Conv2D,
|
|
6423
6784
|
DataAugmentation,
|
|
6424
6785
|
DataLoader,
|
|
6786
|
+
DatasetLoader,
|
|
6425
6787
|
DecisionTree,
|
|
6426
6788
|
Dropout,
|
|
6427
6789
|
EarlyStopping,
|
|
@@ -6461,6 +6823,7 @@ function _sampleNormal() {
|
|
|
6461
6823
|
SoftmaxRegression,
|
|
6462
6824
|
TCN,
|
|
6463
6825
|
TSNE,
|
|
6826
|
+
Tokenizer,
|
|
6464
6827
|
Trainer,
|
|
6465
6828
|
TransformerBlock,
|
|
6466
6829
|
VAE,
|
package/dist/index.mjs
CHANGED
|
@@ -2531,6 +2531,155 @@ var DataLoader = class _DataLoader {
|
|
|
2531
2531
|
}
|
|
2532
2532
|
};
|
|
2533
2533
|
|
|
2534
|
+
// src/DatasetLoader.ts
|
|
2535
|
+
var DatasetLoader = class _DatasetLoader {
  // ── CSV ─────────────────────────────────────────────────────────────────────
  /**
   * Parse a CSV string into a DataPair.
   *
   * - The first non-empty, non-comment row is treated as a header.
   * - Numeric values are parsed with parseFloat.
   * - String values are one-hot encoded (one column → N binary columns).
   * - Empty rows and comment lines (starting with #) are skipped.
   * - Quoted fields may contain commas, doubled quotes ("") and line breaks.
   *
   * @param csv     - raw CSV text
   * @param options - which columns to use as features / targets
   * @throws Error if fewer than two rows (header + one data row) remain
   */
  static fromCSV(csv, options) {
    const rows = _DatasetLoader._parseCSV(csv);
    if (rows.length < 2) throw new Error("DatasetLoader.fromCSV: CSV must have a header row and at least one data row.");
    const header = rows[0];
    const dataRows = rows.slice(1);
    return _DatasetLoader._buildDataPair(header, dataRows, options);
  }
  // ── JSON ─────────────────────────────────────────────────────────────────────
  /**
   * Parse a JSON string (array of objects) into a DataPair.
   *
   * Expected format:
   *   [{ "col1": 1.0, "col2": "cat", "label": "dog" }, ...]
   *
   * The column list is the union of the keys of every record, in first-seen
   * order, so a column that is absent from the first record can still be
   * selected. Cells that are missing, null or undefined become "" before
   * encoding; every other value is converted with String().
   *
   * @param json    - raw JSON text or a pre-parsed array of objects
   * @param options - which columns to use as features / targets
   * @throws Error if the input is not a non-empty array, or if any element
   *         is null or not an object
   */
  static fromJSON(json, options) {
    const records = typeof json === "string" ? JSON.parse(json) : json;
    if (!Array.isArray(records) || records.length === 0) {
      throw new Error("DatasetLoader.fromJSON: expected a non-empty JSON array of objects.");
    }
    // Collect keys from all records, not just the first one, so sparse
    // records do not hide columns.
    const seenKeys = /* @__PURE__ */ new Set();
    records.forEach((record, i) => {
      if (record === null || typeof record !== "object") {
        throw new Error(`DatasetLoader.fromJSON: record at index ${i} is not an object.`);
      }
      for (const key of Object.keys(record)) seenKeys.add(key);
    });
    const header = [...seenKeys];
    const dataRows = records.map((row) => header.map((col) => String(row[col] ?? "")));
    return _DatasetLoader._buildDataPair(header, dataRows, options);
  }
  // ── Private: shared pipeline ──────────────────────────────────────────────
  /**
   * Turn a header plus string rows into the DataPair result.
   *
   * A selected column is numeric when every cell is "" or converts with
   * Number(); numeric cells are parsed with parseFloat (so "" becomes NaN).
   * Any other column is one-hot encoded over its sorted distinct values.
   *
   * @param header   - column names, positionally aligned with each row
   * @param dataRows - rows of string cells; cells missing from a short row
   *                   are treated as ""
   * @param options  - { featureCols, targetCols, encodeStrings = true }
   * @returns { inputs, targets, categoricalMaps, featureNames, targetNames, numRows }
   * @throws Error if a selected column is not in the header, or if a column
   *         is non-numeric while encodeStrings is false
   */
  static _buildDataPair(header, dataRows, options) {
    const { featureCols, targetCols, encodeStrings = true } = options;
    // Resolve every selected column once instead of calling indexOf per cell.
    const columnIndex = /* @__PURE__ */ new Map();
    for (const col of [...featureCols, ...targetCols]) {
      const idx = header.indexOf(col);
      if (idx === -1) {
        throw new Error(`DatasetLoader: column "${col}" not found in header [${header.join(", ")}].`);
      }
      columnIndex.set(col, idx);
    }
    // Short (ragged) rows yield undefined cells; treat them as empty so they
    // do not turn a numeric column into a categorical one.
    const cell = (row, col) => row[columnIndex.get(col)] ?? "";
    // col -> ordered category list; a category's position is its one-hot index.
    // Maps are used for lookups so column names such as "constructor" never
    // resolve to inherited Object.prototype members.
    const categoryLists = /* @__PURE__ */ new Map();
    for (const col of columnIndex.keys()) {
      const values = dataRows.map((row) => cell(row, col));
      const isNumeric = values.every((v) => v === "" || !isNaN(Number(v)));
      if (isNumeric) continue;
      if (!encodeStrings) {
        throw new Error(`DatasetLoader: column "${col}" contains non-numeric values. Set encodeStrings: true to one-hot encode them.`);
      }
      categoryLists.set(col, [...new Set(values)].sort());
    }
    const categoryLookup = /* @__PURE__ */ new Map();
    for (const [col, cats] of categoryLists) {
      categoryLookup.set(col, new Map(cats.map((v, i) => [v, i])));
    }
    const encodeValue = (col, raw) => {
      const lookup = categoryLookup.get(col);
      if (lookup === void 0) return [parseFloat(raw)];
      const vec = new Array(lookup.size).fill(0);
      const idx = lookup.get(raw);
      if (idx !== void 0) vec[idx] = 1;
      return vec;
    };
    // Names follow the sorted category list, i.e. the same order as the
    // one-hot positions. Object.keys() would list integer-like categories
    // first and misalign names with vector slots.
    const expandNames = (cols) => cols.flatMap((col) => {
      const cats = categoryLists.get(col);
      return cats === void 0 ? [col] : cats.map((cat) => `${col}_${cat}`);
    });
    const featureNames = expandNames(featureCols);
    const targetNames = expandNames(targetCols);
    const inputs = [];
    const targets = [];
    for (const row of dataRows) {
      inputs.push(featureCols.flatMap((col) => encodeValue(col, cell(row, col))));
      targets.push(targetCols.flatMap((col) => encodeValue(col, cell(row, col))));
    }
    // Object.fromEntries defines own properties, so unusual column names
    // (e.g. "__proto__") cannot trigger prototype setters.
    const categoricalMaps = Object.fromEntries(
      [...categoryLists].map(([col, cats]) => [col, Object.fromEntries(cats.map((v, i) => [v, i]))])
    );
    return {
      inputs,
      targets,
      categoricalMaps,
      featureNames,
      targetNames,
      numRows: dataRows.length
    };
  }
  // ── Private: RFC 4180-compatible CSV parser ───────────────────────────────
  /**
   * Split CSV text into rows of fields.
   *
   * Blank lines and lines starting with `#` are skipped, except inside an
   * open quoted field. A quoted field may span several physical lines; the
   * lines are re-joined with "\n" before the record is parsed. If a quote is
   * never closed before the end of the input, the affected lines are parsed
   * one by one instead.
   *
   * @param csv - raw CSV text
   * @returns array of rows, each an array of trimmed field strings
   */
  static _parseCSV(csv) {
    const rows = [];
    let pending = []; // physical lines of the record currently being assembled
    let insideQuotes = false;
    for (const line of csv.split(/\r?\n/)) {
      if (!insideQuotes) {
        const trimmed = line.trim();
        if (!trimmed || trimmed.startsWith("#")) continue;
      }
      pending.push(line);
      // Every quote character toggles the quoted state; an escaped quote ("")
      // toggles twice, so only the parity of the count matters.
      const quoteCount = line.split('"').length - 1;
      if (quoteCount % 2 === 1) insideQuotes = !insideQuotes;
      if (!insideQuotes) {
        rows.push(_DatasetLoader._parseCSVRow(pending.join("\n").trim()));
        pending = [];
      }
    }
    // Unterminated quote: parse the leftover lines individually rather than
    // merging the rest of the file into one record.
    for (const line of pending) {
      const trimmed = line.trim();
      if (!trimmed || trimmed.startsWith("#")) continue;
      rows.push(_DatasetLoader._parseCSVRow(trimmed));
    }
    return rows;
  }
  /**
   * Split one CSV record into its fields.
   *
   * Commas separate fields unless they appear inside double quotes; inside
   * quotes a doubled quote ("") stands for one literal quote. Every field is
   * trimmed before it is stored.
   *
   * @param line - a single CSV record
   * @returns the record's field strings
   */
  static _parseCSVRow(line) {
    const fields = [];
    let current = "";
    let inQuotes = false;
    for (let i = 0; i < line.length; i++) {
      const ch = line[i];
      if (inQuotes) {
        if (ch === '"' && line[i + 1] === '"') {
          current += '"';
          i++;
        } else if (ch === '"') {
          inQuotes = false;
        } else {
          current += ch;
        }
      } else {
        if (ch === '"') {
          inQuotes = true;
        } else if (ch === ",") {
          fields.push(current.trim());
          current = "";
        } else {
          current += ch;
        }
      }
    }
    fields.push(current.trim());
    return fields;
  }
};
|
|
2682
|
+
|
|
2534
2683
|
// src/LRScheduler.ts
|
|
2535
2684
|
var LRScheduler = class {
|
|
2536
2685
|
// ── Step Decay ────────────────────────────────────────────────────────────
|
|
@@ -6015,6 +6164,216 @@ function _binaryRecall(yTrue, yPred, pos) {
|
|
|
6015
6164
|
return tp + fn > 0 ? tp / (tp + fn) : 0;
|
|
6016
6165
|
}
|
|
6017
6166
|
|
|
6167
|
+
// src/Tokenizer.ts
|
|
6168
|
+
var _Tokenizer = class _Tokenizer {
  /**
   * Create an unfitted tokenizer.
   *
   * @param options - optional settings: `mode` ("word" | "whitespace" | "char",
   *   default "word"; any other value behaves like "word"), `lowercase`
   *   (default true), `maxVocab` (cap on the total vocabulary size including
   *   special tokens; 0 = no cap, the default) and `specialTokens` (extra
   *   tokens registered right after the built-in ones).
   */
  constructor(options = {}) {
    this._token2id = /* @__PURE__ */ new Map();
    this._id2token = /* @__PURE__ */ new Map();
    this._fitted = false;
    this._mode = options.mode ?? "word";
    this._lowercase = options.lowercase ?? true;
    this._maxVocab = options.maxVocab ?? 0;
    this._extraSpecial = options.specialTokens ?? [];
  }
  // ── Fit ───────────────────────────────────────────────────────────────────
  /**
   * Build vocabulary from an array of text strings.
   * Calling fit() again resets and rebuilds the vocabulary from scratch.
   *
   * Special tokens are registered first (PAD=0, UNK=1, BOS=2, EOS=3, then any
   * extra special tokens). Corpus tokens follow, most frequent first, ties
   * broken with localeCompare. When maxVocab > 0 the corpus tokens are cut so
   * the total vocabulary does not exceed maxVocab; special tokens are always
   * kept, even if they alone exceed the cap.
   *
   * @param texts - corpus to build the vocabulary from
   * @returns this (chainable)
   */
  fit(texts) {
    this._token2id = /* @__PURE__ */ new Map();
    this._id2token = /* @__PURE__ */ new Map();
    const specials = [
      _Tokenizer.PAD,
      _Tokenizer.UNK,
      _Tokenizer.BOS,
      _Tokenizer.EOS,
      ...this._extraSpecial
    ];
    for (const s of specials) this._register(s);
    const freq = /* @__PURE__ */ new Map();
    for (const text of texts) {
      for (const token of this.tokenize(text)) {
        freq.set(token, (freq.get(token) ?? 0) + 1);
      }
    }
    // Corpus tokens that collide with a special token are already registered;
    // drop them so they do not use up a slot of the cap.
    let entries = [...freq.entries()].filter(([token]) => !this._token2id.has(token)).sort(
      ([a, fa], [b, fb]) => fb - fa || a.localeCompare(b)
    );
    if (this._maxVocab > 0) {
      // Clamp at 0: a negative slice end would count from the END of the
      // array and keep almost every token instead of none. Using the actual
      // registered count also ignores duplicated special tokens.
      const budget = Math.max(0, this._maxVocab - this._token2id.size);
      entries = entries.slice(0, budget);
    }
    for (const [token] of entries) this._register(token);
    this._fitted = true;
    return this;
  }
  // ── Tokenize ──────────────────────────────────────────────────────────────
  /**
   * Split raw text into an array of string tokens (no ID conversion yet).
   * Useful for inspecting what the tokenizer produces before encoding.
   *
   * - "char": one token per Unicode code point (astral characters such as
   *   emoji stay whole).
   * - "whitespace": split on runs of whitespace, empty pieces removed.
   * - "word" (default): runs of letters/digits from the listed character
   *   class, plus single non-word, non-space characters.
   *
   * @param text - input text; lowercased first when `lowercase` is enabled
   * @returns array of string tokens
   */
  tokenize(text) {
    const t = this._lowercase ? text.toLowerCase() : text;
    switch (this._mode) {
      case "char":
        // Iterate by code point; split("") would cut surrogate pairs in half.
        return Array.from(t);
      case "whitespace":
        return t.split(/\s+/).filter(Boolean);
      case "word":
      default:
        return t.match(/[a-z0-9àáâãäåæçèéêëìíîïðñòóôõöùúûüýþÿ]+|[^\w\s]/gi) ?? [];
    }
  }
  // ── Encode ────────────────────────────────────────────────────────────────
  /**
   * Convert a text string to a sequence of token IDs.
   * Unknown tokens map to <UNK> (id 1).
   *
   * @param text    - input text
   * @param options - addBOS / addEOS flags
   * @returns array of token IDs
   * @throws Error if the tokenizer has not been fitted
   */
  encode(text, options = {}) {
    this._assertFitted();
    const ids = [];
    if (options.addBOS) ids.push(this._token2id.get(_Tokenizer.BOS));
    for (const token of this.tokenize(text)) {
      ids.push(this._token2id.get(token) ?? this._token2id.get(_Tokenizer.UNK));
    }
    if (options.addEOS) ids.push(this._token2id.get(_Tokenizer.EOS));
    return ids;
  }
  // ── Encode batch ──────────────────────────────────────────────────────────
  /**
   * Encode an array of texts, optionally padding/truncating to a fixed length.
   *
   * @param texts   - array of input texts
   * @param options - addBOS / addEOS / padTo
   * @returns one ID sequence per text; with padTo set, every sequence has
   *          exactly padTo entries (truncated, or right-padded with <PAD>)
   */
  encodeBatch(texts, options = {}) {
    const sequences = texts.map((t) => this.encode(t, options));
    if (options.padTo !== void 0) {
      const len = options.padTo;
      const padId = this._token2id.get(_Tokenizer.PAD);
      return sequences.map((seq) => {
        if (seq.length >= len) return seq.slice(0, len);
        return [...seq, ...Array(len - seq.length).fill(padId)];
      });
    }
    return sequences;
  }
  // ── Decode ────────────────────────────────────────────────────────────────
  /**
   * Convert a sequence of token IDs back to a human-readable string.
   * IDs that are not in the vocabulary are rendered as <UNK>. Tokens are
   * joined with "" in char mode and with a single space otherwise.
   *
   * @param ids          - array of token IDs
   * @param stripSpecial - remove PAD/BOS/EOS tokens from output. Default: true
   * @returns the decoded text
   */
  decode(ids, stripSpecial = true) {
    this._assertFitted();
    const specials = /* @__PURE__ */ new Set([_Tokenizer.PAD, _Tokenizer.BOS, _Tokenizer.EOS]);
    const tokens = [];
    for (const id of ids) {
      const token = this._id2token.get(id) ?? _Tokenizer.UNK;
      if (stripSpecial && specials.has(token)) continue;
      tokens.push(token);
    }
    return this._mode === "char" ? tokens.join("") : tokens.join(" ");
  }
  // ── One-hot encoding ──────────────────────────────────────────────────────
  /**
   * Convert a sequence of token IDs to one-hot vectors.
   * Each vector has length `vocabSize` with a single 1 at the token's position;
   * an ID outside [0, vocabSize) produces an all-zero vector.
   * Useful when feeding tokens directly into a Network without an embedding layer.
   *
   * @param ids - array of token IDs (e.g. from encode())
   * @returns   - 2D array of shape [seqLen, vocabSize]
   */
  oneHot(ids) {
    this._assertFitted();
    const V = this.vocabSize;
    return ids.map((id) => {
      const vec = new Array(V).fill(0);
      if (id >= 0 && id < V) vec[id] = 1;
      return vec;
    });
  }
  // ── Vocabulary helpers ────────────────────────────────────────────────────
  /** Number of tokens in the vocabulary (including special tokens). */
  get vocabSize() {
    return this._token2id.size;
  }
  /** True once fit() has run or the instance was restored with fromJSON(). */
  get isFitted() {
    return this._fitted;
  }
  /** Get the integer ID for a token string, or undefined if not in vocabulary. */
  tokenToId(token) {
    return this._token2id.get(token);
  }
  /** Get the token string for an integer ID, or undefined if out of range. */
  idToToken(id) {
    return this._id2token.get(id);
  }
  /**
   * Return the full vocabulary as an array ordered by ID.
   * Index i of the returned array is the token with ID i.
   */
  getVocabulary() {
    return Array.from({ length: this.vocabSize }, (_, i) => this._id2token.get(i));
  }
  // ── Persistence ───────────────────────────────────────────────────────────
  /**
   * Serialize the fitted tokenizer to a plain JSON-compatible object.
   * Store it with JSON.stringify(); reload with Tokenizer.fromJSON().
   * The snapshot holds mode, lowercase, maxVocab and the token → id table;
   * the `specialTokens` option itself is not stored separately.
   *
   * @throws Error if the tokenizer has not been fitted
   */
  toJSON() {
    this._assertFitted();
    return {
      mode: this._mode,
      lowercase: this._lowercase,
      maxVocab: this._maxVocab,
      token2id: Object.fromEntries(this._token2id)
    };
  }
  /**
   * Restore a Tokenizer from a snapshot produced by toJSON().
   * The returned instance is marked as fitted.
   */
  static fromJSON(snapshot) {
    const tok = new _Tokenizer({
      mode: snapshot.mode,
      lowercase: snapshot.lowercase,
      maxVocab: snapshot.maxVocab
    });
    for (const [token, id] of Object.entries(snapshot.token2id)) {
      tok._token2id.set(token, id);
      tok._id2token.set(id, token);
    }
    tok._fitted = true;
    return tok;
  }
  // ── Private ───────────────────────────────────────────────────────────────
  /** Assign the next free ID to `token`; no-op if it is already registered. */
  _register(token) {
    if (this._token2id.has(token)) return;
    const id = this._token2id.size;
    this._token2id.set(token, id);
    this._id2token.set(id, token);
  }
  /** Throw unless the vocabulary has been built or restored. */
  _assertFitted() {
    if (!this._fitted) {
      throw new Error("Tokenizer: call fit() before encoding or decoding.");
    }
  }
};
// ── Built-in special tokens ────────────────────────────────────────────────
_Tokenizer.PAD = "<PAD>";
_Tokenizer.UNK = "<UNK>";
_Tokenizer.BOS = "<BOS>";
_Tokenizer.EOS = "<EOS>";
var Tokenizer = _Tokenizer;
|
|
6376
|
+
|
|
6018
6377
|
// src/EarlyStopping.ts
|
|
6019
6378
|
var EarlyStopping = class {
|
|
6020
6379
|
constructor(options) {
|
|
@@ -6301,6 +6660,7 @@ export {
|
|
|
6301
6660
|
Conv2D,
|
|
6302
6661
|
DataAugmentation,
|
|
6303
6662
|
DataLoader,
|
|
6663
|
+
DatasetLoader,
|
|
6304
6664
|
DecisionTree,
|
|
6305
6665
|
Dropout,
|
|
6306
6666
|
EarlyStopping,
|
|
@@ -6340,6 +6700,7 @@ export {
|
|
|
6340
6700
|
SoftmaxRegression,
|
|
6341
6701
|
TCN,
|
|
6342
6702
|
TSNE,
|
|
6703
|
+
Tokenizer,
|
|
6343
6704
|
Trainer,
|
|
6344
6705
|
TransformerBlock,
|
|
6345
6706
|
VAE,
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@dniskav/neuron",
|
|
3
|
-
"version": "0.3.1",
|
|
3
|
+
"version": "0.3.2",
|
|
4
4
|
"description": "Minimal neural network from scratch — neuron, layer, network, backpropagation, classical ML, unsupervised, generative models, autograd. No dependencies.",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"module": "dist/index.mjs",
|