@dniskav/neuron 0.3.0 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -22,16 +22,19 @@ var index_exports = {};
22
22
  __export(index_exports, {
23
23
  Adam: () => Adam,
24
24
  AttentionHead: () => AttentionHead,
25
+ Augmenter: () => Augmenter,
25
26
  Autoencoder: () => Autoencoder,
26
27
  BatchNorm: () => BatchNorm,
27
28
  BiasVector: () => BiasVector,
28
29
  CausalConv1D: () => CausalConv1D,
29
30
  ClipOptimizer: () => ClipOptimizer,
30
31
  ClippedOptimizerFactory: () => ClippedOptimizerFactory,
32
+ ContrastiveLearning: () => ContrastiveLearning,
31
33
  Conv1D: () => Conv1D,
32
34
  Conv2D: () => Conv2D,
33
35
  DataAugmentation: () => DataAugmentation,
34
36
  DataLoader: () => DataLoader,
37
+ DatasetLoader: () => DatasetLoader,
35
38
  DecisionTree: () => DecisionTree,
36
39
  Dropout: () => Dropout,
37
40
  EarlyStopping: () => EarlyStopping,
@@ -46,6 +49,7 @@ __export(index_exports, {
46
49
  LSTMLayer: () => LSTMLayer,
47
50
  Layer: () => Layer,
48
51
  LayerNorm: () => LayerNorm,
52
+ LearnedPositionalEncoding: () => LearnedPositionalEncoding,
49
53
  LinearRegression: () => LinearRegression,
50
54
  LogisticRegression: () => LogisticRegression,
51
55
  LossPlotter: () => LossPlotter,
@@ -62,18 +66,22 @@ __export(index_exports, {
62
66
  NeuronN: () => NeuronN,
63
67
  PCA: () => PCA,
64
68
  Perceptron: () => Perceptron,
69
+ PositionalEncoding: () => PositionalEncoding,
65
70
  RNN: () => RNN,
66
71
  SGD: () => SGD,
67
72
  SOM: () => SOM,
68
73
  Seq2Seq: () => Seq2Seq,
69
74
  SoftmaxRegression: () => SoftmaxRegression,
70
75
  TCN: () => TCN,
76
+ TSNE: () => TSNE,
77
+ Tokenizer: () => Tokenizer,
71
78
  Trainer: () => Trainer,
72
79
  TransformerBlock: () => TransformerBlock,
73
80
  VAE: () => VAE,
74
81
  Value: () => Value,
75
82
  WeightInspector: () => WeightInspector,
76
83
  WeightMatrix: () => WeightMatrix,
84
+ Word2Vec: () => Word2Vec,
77
85
  accuracy: () => accuracy,
78
86
  auc: () => auc,
79
87
  classificationReport: () => classificationReport,
@@ -2645,6 +2653,155 @@ var DataLoader = class _DataLoader {
2645
2653
  }
2646
2654
  };
2647
2655
 
2656
+ // src/DatasetLoader.ts
2657
var DatasetLoader = class _DatasetLoader {
  // ── CSV ─────────────────────────────────────────────────────────────────────
  /**
   * Parse a CSV string into a DataPair.
   *
   * - The first non-empty, non-comment row is treated as the header.
   * - A column is numeric when every value is empty or converts with Number();
   *   empty values in a numeric column become NaN.
   * - Any other column is one-hot encoded (one column → N binary columns),
   *   unless `encodeStrings` is false, in which case an Error is thrown.
   * - Empty rows and comment lines (starting with #) are skipped.
   *
   * @param csv - raw CSV text
   * @param options - `{ featureCols, targetCols, encodeStrings = true }`
   * @returns `{ inputs, targets, categoricalMaps, featureNames, targetNames, numRows }`
   * @throws Error when there is no data row or a requested column is missing
   */
  static fromCSV(csv, options) {
    const rows = _DatasetLoader._parseCSV(csv);
    if (rows.length < 2) {
      throw new Error("DatasetLoader.fromCSV: CSV must have a header row and at least one data row.");
    }
    const header = rows[0];
    const dataRows = rows.slice(1);
    return _DatasetLoader._buildDataPair(header, dataRows, options);
  }

  // ── JSON ─────────────────────────────────────────────────────────────────────
  /**
   * Parse a JSON string (array of objects) into a DataPair.
   *
   * Expected format:
   *   [{ "col1": 1.0, "col2": "cat", "label": "dog" }, ...]
   *
   * The header is taken from the keys of the FIRST record only; null or
   * missing values are turned into empty strings before encoding.
   *
   * @param json - raw JSON text or a pre-parsed array of objects
   * @param options - `{ featureCols, targetCols, encodeStrings = true }`
   * @throws Error when the input is not a non-empty array
   */
  static fromJSON(json, options) {
    const records = typeof json === "string" ? JSON.parse(json) : json;
    if (!Array.isArray(records) || records.length === 0) {
      throw new Error("DatasetLoader.fromJSON: expected a non-empty JSON array of objects.");
    }
    const header = Object.keys(records[0]);
    const dataRows = records.map((row) => header.map((col) => String(row[col] ?? "")));
    return _DatasetLoader._buildDataPair(header, dataRows, options);
  }

  // ── Private: shared pipeline ──────────────────────────────────────────────
  /**
   * Turn a header plus string rows into encoded inputs/targets.
   *
   * @param header - column names
   * @param dataRows - rows of raw string cells, aligned with `header`
   * @param options - `{ featureCols, targetCols, encodeStrings = true }`
   */
  static _buildDataPair(header, dataRows, options) {
    const { featureCols, targetCols, encodeStrings = true } = options;

    // Resolve each column name once (first occurrence wins, like indexOf)
    // instead of scanning the header for every cell.
    const columnIndex = new Map();
    header.forEach((name, i) => {
      if (!columnIndex.has(name)) columnIndex.set(name, i);
    });
    for (const col of [...featureCols, ...targetCols]) {
      if (!columnIndex.has(col)) {
        throw new Error(`DatasetLoader: column "${col}" not found in header [${header.join(", ")}].`);
      }
    }

    // `encoders` is the internal source of truth for categorical columns.
    // A Map is used so that column names such as "constructor" or "toString"
    // are never confused with inherited Object.prototype members.
    // `catMaps` is the plain-object view returned to the caller.
    const encoders = new Map();
    const catMaps = {};
    for (const col of [...featureCols, ...targetCols]) {
      const colIdx = columnIndex.get(col);
      const values = dataRows.map((row) => row[colIdx]);
      const isNumeric = values.every((v) => v === "" || !isNaN(Number(v)));
      if (isNumeric) continue;
      if (!encodeStrings) {
        throw new Error(`DatasetLoader: column "${col}" contains non-numeric values. Set encodeStrings: true to one-hot encode them.`);
      }
      const labels = [...new Set(values)].sort();
      encoders.set(col, {
        labels,
        index: new Map(labels.map((label, i) => [label, i]))
      });
      catMaps[col] = Object.fromEntries(labels.map((label, i) => [label, i]));
    }

    // Numeric cells use Number(), the same conversion used for detection
    // above, so values like "0x10" are not accepted as numeric and then
    // silently encoded as 0. Blank cells stay NaN (missing value).
    const toNumber = (raw) => {
      const text = String(raw).trim();
      return text === "" ? NaN : Number(text);
    };

    const encodeValue = (col, raw) => {
      const encoder = encoders.get(col);
      if (!encoder) return [toNumber(raw)];
      const vec = new Array(encoder.labels.length).fill(0);
      const idx = encoder.index.get(raw);
      if (idx !== void 0) vec[idx] = 1;
      return vec;
    };

    // Names are emitted in one-hot index order. Object.keys() on the category
    // map would list integer-like labels ("2", "10") first in numeric order,
    // which does not match the lexicographically sorted indices.
    const expandNames = (cols) => cols.flatMap((col) => {
      const encoder = encoders.get(col);
      return encoder ? encoder.labels.map((cat) => `${col}_${cat}`) : [col];
    });

    const featureIdx = featureCols.map((col) => columnIndex.get(col));
    const targetIdx = targetCols.map((col) => columnIndex.get(col));
    const inputs = [];
    const targets = [];
    for (const row of dataRows) {
      inputs.push(featureCols.flatMap((col, k) => encodeValue(col, row[featureIdx[k]])));
      targets.push(targetCols.flatMap((col, k) => encodeValue(col, row[targetIdx[k]])));
    }

    return {
      inputs,
      targets,
      categoricalMaps: catMaps,
      featureNames: expandNames(featureCols),
      targetNames: expandNames(targetCols),
      numRows: dataRows.length
    };
  }

  // ── Private: line-based CSV parser ────────────────────────────────────────
  /**
   * Split CSV text into rows of fields.
   *
   * Parsing is line-based: the text is split on newlines first, so quoted
   * fields may contain commas and doubled quotes but NOT embedded newlines.
   * Blank lines and lines starting with # are dropped.
   */
  static _parseCSV(csv) {
    const rows = [];
    for (const line of csv.split(/\r?\n/)) {
      const trimmed = line.trim();
      if (!trimmed || trimmed.startsWith("#")) continue;
      rows.push(_DatasetLoader._parseCSVRow(trimmed));
    }
    return rows;
  }

  /**
   * Parse one CSV line into trimmed fields.
   * Inside quotes, `""` is an escaped quote and commas are literal.
   */
  static _parseCSVRow(line) {
    const fields = [];
    let current = "";
    let inQuotes = false;
    for (let i = 0; i < line.length; i++) {
      const ch = line[i];
      if (inQuotes) {
        if (ch === '"' && line[i + 1] === '"') {
          current += '"';
          i++; // skip the second quote of the escape pair
        } else if (ch === '"') {
          inQuotes = false;
        } else {
          current += ch;
        }
      } else if (ch === '"') {
        inQuotes = true;
      } else if (ch === ",") {
        fields.push(current.trim());
        current = "";
      } else {
        current += ch;
      }
    }
    fields.push(current.trim());
    return fields;
  }
};
2804
+
2648
2805
  // src/LRScheduler.ts
2649
2806
  var LRScheduler = class {
2650
2807
  // ── Step Decay ────────────────────────────────────────────────────────────
@@ -4850,6 +5007,749 @@ var TCN = class {
4850
5007
  }
4851
5008
  };
4852
5009
 
5010
+ // src/Word2Vec.ts
5011
/**
 * Educational Word2Vec (skip-gram or CBOW) trained with a full-vocabulary
 * softmax. `embeddings` is the [vocabSize][embeddingDim] input matrix;
 * `_W2` is the [embeddingDim][vocabSize] output matrix.
 */
var Word2Vec = class {
  /**
   * @param embeddingDim - size of each word vector (default 50)
   * @param options - `{ windowSize = 2, model = "skipgram", minCount = 1 }`
   */
  constructor(embeddingDim = 50, options = {}) {
    this._trained = false;
    this.embeddingDim = embeddingDim;
    this._windowSize = options.windowSize ?? 2;
    this._model = options.model ?? "skipgram";
    this._minCount = options.minCount ?? 1;
    this.embeddings = [];
    this._W2 = [];
    this.vocab = new Map();
    this._indexToWord = [];
    this.vocabSize = 0;
  }

  // ── buildVocab ─────────────────────────────────────────────────────────────
  /**
   * Count word frequencies, drop words seen fewer than `minCount` times,
   * assign each remaining word an index, and (re)initialise both weight
   * matrices with small uniform random values.
   *
   * @param sentences - array of token arrays
   * @throws Error when no word survives the minCount filter
   */
  buildVocab(sentences) {
    const freq = new Map();
    for (const sentence of sentences) {
      for (const word of sentence) {
        freq.set(word, (freq.get(word) ?? 0) + 1);
      }
    }
    this.vocab = new Map();
    this._indexToWord = [];
    for (const [word, count] of freq) {
      if (count >= this._minCount) {
        this.vocab.set(word, this._indexToWord.length);
        this._indexToWord.push(word);
      }
    }
    this.vocabSize = this._indexToWord.length;
    if (this.vocabSize === 0) {
      throw new Error("Word2Vec.buildVocab: vocabulary is empty after applying minCount filter");
    }
    const scale1 = Math.sqrt(1 / this.embeddingDim);
    const scale2 = Math.sqrt(1 / this.vocabSize);
    this.embeddings = Array.from(
      { length: this.vocabSize },
      () => Array.from({ length: this.embeddingDim }, () => (Math.random() * 2 - 1) * scale1)
    );
    this._W2 = Array.from(
      { length: this.embeddingDim },
      () => Array.from({ length: this.vocabSize }, () => (Math.random() * 2 - 1) * scale2)
    );
    this._trained = false;
  }

  // ── tokenize ───────────────────────────────────────────────────────────────
  /**
   * Simple tokenizer: lowercase, replace everything except a-z, 0-9,
   * whitespace, apostrophe and hyphen with a space, then split on whitespace.
   */
  static tokenize(text) {
    return text.toLowerCase().replace(/[^a-z0-9\s'-]/g, " ").split(/\s+/).filter((t) => t.length > 0);
  }

  // ── train ──────────────────────────────────────────────────────────────────
  /**
   * Run SGD over every (center, context) pair for `epochs` passes.
   * Builds the vocabulary from `sentences` first if none exists yet.
   *
   * Uses a full-vocabulary softmax (not negative sampling), so each step
   * costs O(vocabSize · embeddingDim).
   *
   * @param sentences - array of token arrays
   * @param lr - learning rate (default 0.025)
   * @param epochs - number of passes (default 5)
   * @returns average cross-entropy loss per epoch
   */
  train(sentences, lr = 0.025, epochs = 5) {
    if (this.vocabSize === 0) this.buildVocab(sentences);
    const lossHistory = [];
    for (let epoch = 0; epoch < epochs; epoch++) {
      let totalLoss = 0;
      let nPairs = 0;
      for (const sentence of sentences) {
        // Out-of-vocabulary words are dropped before windows are formed.
        const indices = sentence.map((w) => this.vocab.get(w)).filter((idx) => idx !== void 0);
        for (let t = 0; t < indices.length; t++) {
          const centerIdx = indices[t];
          const contextIndices = [];
          for (let offset = -this._windowSize; offset <= this._windowSize; offset++) {
            if (offset === 0) continue;
            const pos = t + offset;
            if (pos >= 0 && pos < indices.length) {
              contextIndices.push(indices[pos]);
            }
          }
          if (contextIndices.length === 0) continue;
          if (this._model === "skipgram") {
            for (const contextIdx of contextIndices) {
              totalLoss += this._skipgramStep(centerIdx, contextIdx, lr);
              nPairs++;
            }
          } else {
            totalLoss += this._cbowStep(centerIdx, contextIndices, lr);
            nPairs++;
          }
        }
      }
      lossHistory.push(nPairs > 0 ? totalLoss / nPairs : 0);
    }
    this._trained = true;
    return lossHistory;
  }

  // ── getEmbedding ───────────────────────────────────────────────────────────
  /**
   * Return the embedding row for `word` (the live row, not a copy).
   * @throws Error when the word is not in the vocabulary
   */
  getEmbedding(word) {
    const idx = this.vocab.get(word);
    if (idx === void 0) throw new Error(`Word2Vec: unknown word "${word}"`);
    return this.embeddings[idx];
  }

  // ── similarity ─────────────────────────────────────────────────────────────
  /** Cosine similarity between the embeddings of two words, in [-1, 1]. */
  similarity(word1, word2) {
    return this._cosine(this.getEmbedding(word1), this.getEmbedding(word2));
  }

  // ── mostSimilar ────────────────────────────────────────────────────────────
  /** Top `topK` `{ word, score }` entries by cosine similarity, excluding `word`. */
  mostSimilar(word, topK = 10) {
    const v = this.getEmbedding(word);
    return this._nearestByVector(v, topK, new Set([word]));
  }

  // ── analogy ───────────────────────────────────────────────────────────────
  /**
   * Vector-arithmetic analogy: positive1 - negative + positive2 ≈ result.
   *
   * analogy('king', 'man', 'woman') ranks words by closeness to
   *   vec('king') - vec('man') + vec('woman')
   *
   * The three input words are excluded from the results so they do not
   * crowd out the top-K.
   */
  analogy(positive1, negative, positive2, topK = 5) {
    const vPos1 = this.getEmbedding(positive1);
    const vNeg = this.getEmbedding(negative);
    const vPos2 = this.getEmbedding(positive2);
    const target = vPos1.map((v, i) => v - vNeg[i] + vPos2[i]);
    return this._nearestByVector(target, topK, new Set([positive1, negative, positive2]));
  }

  // ── Private: skip-gram step ───────────────────────────────────────────────
  /**
   * Forward + backward for one (center, target) pair.
   * @returns cross-entropy loss for this pair
   */
  _skipgramStep(centerIdx, targetIdx, lr) {
    const h = this.embeddings[centerIdx];
    const probs = _softmax(this._hiddenToScores(h));
    const loss = -Math.log(probs[targetIdx] + 1e-12);
    const err = probs.map((p, j) => j === targetIdx ? p - 1 : p);
    const dh = this._applyOutputGradient(h, err, lr);
    for (let d = 0; d < this.embeddingDim; d++) {
      this.embeddings[centerIdx][d] -= lr * dh[d];
    }
    return loss;
  }

  // ── Private: CBOW step ────────────────────────────────────────────────────
  /**
   * Forward + backward for one (contextIndices → centerIdx) example.
   * h is the mean of the context embeddings; the hidden gradient is split
   * equally across the context rows.
   * @returns cross-entropy loss for this example
   */
  _cbowStep(centerIdx, contextIndices, lr) {
    const k = contextIndices.length;
    const h = new Array(this.embeddingDim).fill(0);
    for (const ci of contextIndices) {
      for (let d = 0; d < this.embeddingDim; d++) {
        h[d] += this.embeddings[ci][d];
      }
    }
    for (let d = 0; d < this.embeddingDim; d++) h[d] /= k;
    const probs = _softmax(this._hiddenToScores(h));
    const loss = -Math.log(probs[centerIdx] + 1e-12);
    const err = probs.map((p, j) => j === centerIdx ? p - 1 : p);
    const dh = this._applyOutputGradient(h, err, lr);
    for (const ci of contextIndices) {
      for (let d = 0; d < this.embeddingDim; d++) {
        this.embeddings[ci][d] -= lr * dh[d] / k;
      }
    }
    return loss;
  }

  /**
   * Shared backward pass through the output matrix.
   * Returns dL/dh = W2 · err and applies W2 -= lr · h ⊗ err.
   * Each weight is read for dh BEFORE it is updated, so the hidden gradient
   * is taken at the same parameters the forward pass used.
   */
  _applyOutputGradient(h, err, lr) {
    const dh = new Array(this.embeddingDim).fill(0);
    for (let d = 0; d < this.embeddingDim; d++) {
      const row = this._W2[d];
      for (let j = 0; j < this.vocabSize; j++) {
        dh[d] += row[j] * err[j];
        row[j] -= lr * h[d] * err[j];
      }
    }
    return dh;
  }

  /** scores = h · W2 → array of length vocabSize. */
  _hiddenToScores(h) {
    const scores = new Array(this.vocabSize).fill(0);
    for (let d = 0; d < this.embeddingDim; d++) {
      for (let j = 0; j < this.vocabSize; j++) {
        scores[j] += h[d] * this._W2[d][j];
      }
    }
    return scores;
  }

  /**
   * Top `topK` vocabulary words by cosine similarity to `v`, skipping any
   * word in `exclude`. Sorted by descending score.
   */
  _nearestByVector(v, topK, exclude) {
    const results = [];
    for (let i = 0; i < this.vocabSize; i++) {
      const w = this._indexToWord[i];
      if (exclude.has(w)) continue;
      results.push({ word: w, score: this._cosine(v, this.embeddings[i]) });
    }
    results.sort((a, b) => b.score - a.score);
    return results.slice(0, topK);
  }

  /** Cosine similarity (v1 · v2) / (‖v1‖ · ‖v2‖); 0 when either norm is ~0. */
  _cosine(v1, v2) {
    let dot = 0;
    let n1 = 0;
    let n2 = 0;
    for (let i = 0; i < v1.length; i++) {
      dot += v1[i] * v2[i];
      n1 += v1[i] * v1[i];
      n2 += v2[i] * v2[i];
    }
    const denom = Math.sqrt(n1) * Math.sqrt(n2);
    return denom < 1e-12 ? 0 : dot / denom;
  }
};

/**
 * Numerically stable softmax (subtracts the max score before exponentiating).
 * The max is found with reduce rather than Math.max(...scores) so that very
 * large score arrays cannot exceed the engine's call-argument limit.
 */
function _softmax(scores) {
  const max = scores.reduce((m, s) => Math.max(m, s), -Infinity);
  const exps = scores.map((s) => Math.exp(s - max));
  const sum = exps.reduce((a, b) => a + b, 0);
  return exps.map((e) => e / sum);
}
5236
+
5237
+ // src/TSNE.ts
5238
/**
 * Exact (O(n²)) t-SNE with momentum gradient descent and early exaggeration.
 * After fit(), `embedding` holds the [n][nComponents] layout.
 */
var TSNE = class {
  /**
   * @param options - `{ nComponents = 2, perplexity = 30, lr = 200,
   *   nIter = 1000, seed }`; when `seed` is given the initial layout is
   *   drawn from a seeded PRNG, otherwise from Math.random.
   */
  constructor(options = {}) {
    // KL divergence tracked during the last fit() call.
    this._klDivergence = 0;
    // Symmetrised P matrix from the last fit() call.
    this._P = [];
    this._nComponents = options.nComponents ?? 2;
    this._perplexity = options.perplexity ?? 30;
    this._lr = options.lr ?? 200;
    this._nIter = options.nIter ?? 1e3;
    this._seed = options.seed;
    this.embedding = [];
  }

  // ── fit ────────────────────────────────────────────────────────────────────
  /**
   * Run t-SNE on X (shape [n][d]) and store the result in this.embedding.
   *
   * @throws Error when n < 2 or perplexity >= n
   */
  fit(X) {
    const n = X.length;
    if (n < 2) throw new Error("TSNE.fit: need at least 2 data points");
    if (this._perplexity >= n) {
      throw new Error(
        `TSNE.fit: perplexity (${this._perplexity}) must be less than n (${n})`
      );
    }
    const rng = this._seed !== void 0 ? _mulberry32(this._seed) : Math.random;
    const distSq = _pairwiseDistSq(X, n);
    const Pcond = this._computePcond(distSq, n);
    const P = _symmetrize(Pcond, n);
    this._P = P;

    // Initial layout: N(0, 0.01²) samples via the Box-Muller transform.
    let Y = Array.from({ length: n }, () => {
      return Array.from({ length: this._nComponents }, () => {
        const u1 = Math.max(rng(), 1e-12); // keep log(u1) finite
        const u2 = rng();
        const z = Math.sqrt(-2 * Math.log(u1)) * Math.cos(2 * Math.PI * u2);
        return z * 0.01;
      });
    });
    let Yprev = Y.map((row) => [...row]);

    const EXAGGERATION_ITERS = 50;
    const EXAGGERATION_FACTOR = 4;
    const MOMENTUM_SWITCH = 20;
    for (let iter = 0; iter < this._nIter; iter++) {
      const momentum = iter < MOMENTUM_SWITCH ? 0.5 : 0.8;
      const pScale = iter < EXAGGERATION_ITERS ? EXAGGERATION_FACTOR : 1;
      const { Q, invDist } = _computeQ(Y, n, this._nComponents);
      const grad = Array.from(
        { length: n },
        () => new Array(this._nComponents).fill(0)
      );
      // dC/dy_i = 4 Σ_j (p_ij − q_ij) (1 + ‖y_i − y_j‖²)⁻¹ (y_i − y_j)
      for (let i = 0; i < n; i++) {
        for (let j = 0; j < n; j++) {
          if (i === j) continue;
          const pq = pScale * P[i][j] - Q[i][j];
          const c = 4 * pq * invDist[i][j];
          for (let d = 0; d < this._nComponents; d++) {
            grad[i][d] += c * (Y[i][d] - Y[j][d]);
          }
        }
      }
      // Gradient step plus momentum on the previous displacement.
      const Ynext = Array.from(
        { length: n },
        (_, i) => Array.from(
          { length: this._nComponents },
          (_2, d) => Y[i][d] - this._lr * grad[i][d] + momentum * (Y[i][d] - Yprev[i][d])
        )
      );
      Yprev = Y;
      Y = Ynext;
    }
    this.embedding = Y;

    // Final KL(P ‖ Q) on the un-exaggerated P.
    const { Q: Qfinal } = _computeQ(Y, n, this._nComponents);
    let kl = 0;
    for (let i = 0; i < n; i++) {
      for (let j = 0; j < n; j++) {
        if (i === j) continue;
        const p = P[i][j];
        if (p > 1e-12) {
          kl += p * Math.log(p / (Qfinal[i][j] + 1e-12));
        }
      }
    }
    this._klDivergence = kl;
  }

  // ── fitTransform ───────────────────────────────────────────────────────────
  /** Convenience: fit(X) then return this.embedding. */
  fitTransform(X) {
    this.fit(X);
    return this.embedding;
  }

  // ── kl ─────────────────────────────────────────────────────────────────────
  /** KL divergence KL(P ‖ Q) from the last fit() call (0 before any fit). */
  kl() {
    return this._klDivergence;
  }

  // ── Private: binary search for σi ─────────────────────────────────────────
  /**
   * For each point i, search for σi² such that the Shannon entropy (bits) of
   * P(·|i) equals log₂(perplexity), and return the conditional matrix.
   *
   * Distances are shifted by the row minimum before exponentiating. The
   * shift cancels in the normalisation, but it guarantees the nearest
   * neighbour contributes exp(0) = 1, so the row sum cannot underflow to
   * zero when squared distances are large relative to σ².
   *
   * @param distSq - [n][n] squared distances
   * @param n - number of points
   * @returns [n][n] conditional probabilities with a zero diagonal
   */
  _computePcond(distSq, n) {
    const targetEntropy = Math.log2(this._perplexity);
    const Pcond = Array.from({ length: n }, () => new Array(n).fill(0));
    for (let i = 0; i < n; i++) {
      const dists = distSq[i];
      let minDist = Infinity;
      for (let j = 0; j < n; j++) {
        if (j !== i && dists[j] < minDist) minDist = dists[j];
      }
      if (!Number.isFinite(minDist)) continue; // no usable neighbour

      let sigmaLo = 0;
      let sigmaHi = 1e10;
      let sigma2 = 1;
      for (let attempt = 0; attempt < 50; attempt++) {
        let sumExp = 0;
        const exps = new Array(n).fill(0);
        for (let j = 0; j < n; j++) {
          if (j === i) continue;
          const e = Math.exp(-(dists[j] - minDist) / (2 * sigma2));
          exps[j] = e;
          sumExp += e;
        }
        if (sumExp < 1e-12) break;
        let H = 0;
        for (let j = 0; j < n; j++) {
          if (j === i) continue;
          const p = exps[j] / sumExp;
          Pcond[i][j] = p;
          if (p > 1e-12) H -= p * Math.log2(p);
        }
        const delta = H - targetEntropy;
        if (Math.abs(delta) < 1e-5) break;
        if (delta > 0) {
          // Entropy too high → kernel too wide → shrink σ².
          sigmaHi = sigma2;
          sigma2 = (sigmaLo + sigma2) / 2;
        } else {
          // Entropy too low → widen; double until an upper bound is known.
          sigmaLo = sigma2;
          sigma2 = sigmaHi < 1e9 ? (sigma2 + sigmaHi) / 2 : sigma2 * 2;
        }
      }
    }
    return Pcond;
  }
};
5375
/**
 * Build the symmetric [n][n] matrix of squared Euclidean distances between
 * the first n rows of X. The diagonal is 0; each unordered pair is computed
 * once and mirrored.
 */
function _pairwiseDistSq(X, n) {
  const table = [];
  for (let r = 0; r < n; r++) {
    table.push(new Array(n).fill(0));
  }
  for (let a = 0; a < n; a++) {
    for (let b = a + 1; b < n; b++) {
      const rowA = X[a];
      const rowB = X[b];
      let total = 0;
      for (let k = 0; k < rowA.length; k++) {
        const delta = rowA[k] - rowB[k];
        total += delta * delta;
      }
      table[a][b] = total;
      table[b][a] = total;
    }
  }
  return table;
}
5390
/**
 * Symmetrise the conditional matrix into joint probabilities:
 *   P[i][j] = (Pcond[i][j] + Pcond[j][i]) / (2n)
 * Returns a new [n][n] matrix; Pcond is not modified.
 */
function _symmetrize(Pcond, n) {
  const joint = [];
  for (let r = 0; r < n; r++) {
    const row = [];
    for (let c = 0; c < n; c++) {
      row.push((Pcond[r][c] + Pcond[c][r]) / (2 * n));
    }
    joint.push(row);
  }
  return joint;
}
5399
/**
 * Low-dimensional affinities under a Student-t kernel.
 *
 * For every pair i ≠ j the kernel value is 1 / (1 + ‖y_i − y_j‖²); Q is that
 * value divided by the sum over all ordered pairs (floored at 1e-12).
 *
 * @returns `{ Q, invDist }` where invDist is the unnormalised kernel matrix;
 *   both have a zero diagonal.
 */
function _computeQ(Y, n, nComponents) {
  const kernel = [];
  for (let r = 0; r < n; r++) {
    kernel.push(new Array(n).fill(0));
  }
  let total = 0;
  for (let a = 0; a < n; a++) {
    for (let b = a + 1; b < n; b++) {
      let sqDist = 0;
      for (let c = 0; c < nComponents; c++) {
        const gap = Y[a][c] - Y[b][c];
        sqDist += gap * gap;
      }
      const weight = 1 / (1 + sqDist);
      kernel[a][b] = weight;
      kernel[b][a] = weight;
      total += 2 * weight; // counts both (a, b) and (b, a)
    }
  }
  if (total < 1e-12) {
    total = 1e-12;
  }
  const Q = [];
  for (let r = 0; r < n; r++) {
    Q.push(kernel[r].map((value) => value / total));
  }
  return { Q, invDist: kernel };
}
5422
/**
 * Create a seeded pseudo-random generator.
 *
 * The seed is coerced to an unsigned 32-bit integer. Each call advances the
 * 32-bit state by a fixed increment, scrambles it with integer multiplies and
 * xor-shifts, and returns the result scaled into [0, 1).
 */
function _mulberry32(seed) {
  let state = seed >>> 0;
  return () => {
    state = (state + 0x6d2b79f5) >>> 0;
    let mixed = Math.imul(state ^ (state >>> 15), state | 1);
    mixed ^= mixed + Math.imul(mixed ^ (mixed >>> 7), mixed | 61);
    const unsigned = (mixed ^ (mixed >>> 14)) >>> 0;
    return unsigned / 4294967296;
  };
}
5433
+
5434
+ // src/PositionalEncoding.ts
5435
/**
 * Fixed sinusoidal positional encodings.
 */
var PositionalEncoding = class _PositionalEncoding {
  /**
   * Compute the encoding vector for one token at position `pos`.
   *
   * Dimensions 2i and 2i+1 share the frequency 1/10000^(2i/dModel) and hold
   * sin and cos respectively. When dModel is odd, the final dimension holds
   * the sin term of the last (unpaired) frequency.
   *
   * @param pos - token position
   * @param dModel - encoding length
   * @returns array of length `dModel`
   */
  static encode(pos, dModel) {
    const pe = new Array(dModel);
    for (let d = 0; d < dModel; d++) {
      const pairIdx = Math.floor(d / 2);
      const freq = Math.pow(1e4, 2 * pairIdx / dModel);
      pe[d] = d % 2 === 0 ? Math.sin(pos / freq) : Math.cos(pos / freq);
    }
    return pe;
  }

  /**
   * Build the encoding matrix for positions 0 … seqLen-1.
   * @returns matrix of shape [seqLen][dModel]
   */
  static encodeSequence(seqLen, dModel) {
    return Array.from(
      { length: seqLen },
      (_, pos) => _PositionalEncoding.encode(pos, dModel)
    );
  }

  /**
   * Return a new matrix equal to `embeddings` plus the positional encoding
   * of each row's index. The input is not modified.
   *
   * Every row of `embeddings` is encoded. `seqLen` is accepted for backward
   * compatibility but does not change the result: previously a `seqLen`
   * smaller than `embeddings.length` crashed, and a larger one only computed
   * unused encodings. An empty `embeddings` yields `[]`.
   *
   * @param embeddings - matrix of shape [seqLen][dModel]
   * @param seqLen - optional, see above
   */
  static apply(embeddings, seqLen) {
    if (embeddings.length === 0) return [];
    const dModel = embeddings[0].length;
    const pe = _PositionalEncoding.encodeSequence(embeddings.length, dModel);
    return embeddings.map(
      (emb, pos) => emb.map((val, d) => val + pe[pos][d])
    );
  }
};
5485
/**
 * Trainable positional encodings: a [maxSeqLen][dModel] weight table with
 * one row per position, initialised uniformly in ±sqrt(1/dModel).
 */
var LearnedPositionalEncoding = class {
  /**
   * @param maxSeqLen - number of positions in the table
   * @param dModel - length of each encoding vector
   */
  constructor(maxSeqLen, dModel) {
    this.maxSeqLen = maxSeqLen;
    this.dModel = dModel;
    const limit = Math.sqrt(1 / dModel);
    this.weights = Array.from(
      { length: maxSeqLen },
      () => Array.from({ length: dModel }, () => (Math.random() * 2 - 1) * limit)
    );
  }

  /**
   * Return a copy of the encoding for one position, so callers cannot
   * accidentally mutate the weight table.
   *
   * @throws Error when `pos` is not an integer in [0, maxSeqLen)
   */
  getEncoding(pos) {
    if (pos >= this.maxSeqLen) {
      throw new Error(
        `Position ${pos} exceeds maxSeqLen=${this.maxSeqLen}. Learned encodings cannot generalize beyond their training length.`
      );
    }
    if (!Number.isInteger(pos) || pos < 0) {
      throw new Error(
        `Position ${pos} is not a valid index; expected an integer in [0, ${this.maxSeqLen - 1}].`
      );
    }
    return [...this.weights[pos]];
  }

  /**
   * Return a new matrix equal to `embeddings` plus the learned encoding of
   * each row's index. Shape: [seqLen][dModel] → [seqLen][dModel].
   *
   * @param embeddings - matrix with one row per position
   * @param seqLen - optional declared length; defaults to embeddings.length
   * @throws Error when `seqLen` or the number of rows exceeds maxSeqLen
   */
  apply(embeddings, seqLen) {
    const len = seqLen ?? embeddings.length;
    if (len > this.maxSeqLen) {
      throw new Error(
        `Sequence length ${len} exceeds maxSeqLen=${this.maxSeqLen}.`
      );
    }
    // Every row is indexed into the table below, so the actual row count
    // must fit as well, whatever `seqLen` says.
    if (embeddings.length > this.maxSeqLen) {
      throw new Error(
        `Sequence length ${embeddings.length} exceeds maxSeqLen=${this.maxSeqLen}.`
      );
    }
    return embeddings.map(
      (emb, pos) => emb.map((val, d) => val + this.weights[pos][d])
    );
  }

  /**
   * Apply an update to every position weight: weights += lr * dWeights.
   *
   * `dWeights` must have the same shape as `weights`: [maxSeqLen][dModel].
   *
   * NOTE(review): the step is ADDED. If `dWeights` is the raw loss gradient
   * dL/dW this performs gradient ascent; it is only a descent step when the
   * caller passes the negated gradient. Confirm the sign convention against
   * the callers before changing it.
   */
  update(dWeights, lr) {
    for (let pos = 0; pos < this.maxSeqLen; pos++) {
      for (let d = 0; d < this.dModel; d++) {
        this.weights[pos][d] += lr * dWeights[pos][d];
      }
    }
  }
};
5534
+
5535
+ // src/ContrastiveLearning.ts
5536
/**
 * Stateless random augmentations for feature vectors. Every method returns
 * a new array and draws its randomness from Math.random.
 */
var Augmenter = class _Augmenter {
  /**
   * Add zero-mean Gaussian noise with standard deviation `sigma` to each
   * element. Samples come from the Box-Muller transform:
   *   z = √(-2·ln(u₁)) · cos(2π·u₂)
   * u₁ is floored at 1e-10 so the logarithm stays finite.
   */
  static addNoise(x, sigma = 0.05) {
    const perturb = (value) => {
      const radial = Math.max(1e-10, Math.random());
      const angular = Math.random();
      const gaussian = Math.sqrt(-2 * Math.log(radial)) * Math.cos(2 * Math.PI * angular);
      return value + sigma * gaussian;
    };
    return x.map(perturb);
  }

  /**
   * Replace each element with 0 with probability `rate`; other elements
   * are kept unchanged.
   */
  static dropoutFeatures(x, rate = 0.1) {
    return x.map((value) => {
      if (Math.random() < rate) {
        return 0;
      }
      return value;
    });
  }

  /**
   * Apply noise first, then feature dropout.
   *
   * @param noiseStd - sigma passed to addNoise
   * @param dropRate - rate passed to dropoutFeatures
   */
  static augment(x, noiseStd = 0.05, dropRate = 0.1) {
    const noisy = _Augmenter.addNoise(x, noiseStd);
    return _Augmenter.dropoutFeatures(noisy, dropRate);
  }

  /**
   * Build a positive pair: the original `x` (same reference) and one
   * augmented copy made with the default augment() settings.
   */
  static makePair(x) {
    const view = _Augmenter.augment(x);
    return [x, view];
  }
};
5575
// Contrastive learner: an encoder network followed by a projection head,
// trained with the NT-Xent loss over batches of positive pairs.
var ContrastiveLearning = class _ContrastiveLearning {
  /**
   * @param inputSize      - dimension of the raw input vectors
   * @param encoderHidden  - layer sizes of the encoder after the input layer,
   *                         e.g. inputSize=64, encoderHidden=[256, 128] builds
   *                         NetworkN([64, 256, 128]); the encoder output
   *                         dimension is the last entry
   * @param projectionDim  - output dimension of the projection head (z space)
   * @param options        - `temperature` (τ, default 0.5) and
   *                         `encoderOptions`, which are spread over the encoder's
   *                         NetworkN options (and may override the activations)
   * @throws Error when encoderHidden is empty
   */
  constructor(inputSize, encoderHidden, projectionDim, options = {}) {
    if (encoderHidden.length === 0) {
      throw new Error("encoderHidden must have at least one element.");
    }
    this.temperature = options.temperature ?? 0.5;
    // One relu activation per encoder layer.
    this.encoder = new NetworkN([inputSize, ...encoderHidden], {
      activations: encoderHidden.map(() => relu),
      ...options.encoderOptions
    });
    const encoderOut = encoderHidden[encoderHidden.length - 1];
    // Hidden width of the head: half the encoder output, never below projectionDim.
    const projHidden = Math.max(projectionDim, Math.floor(encoderOut / 2));
    this.projectionHead = new NetworkN(
      [encoderOut, projHidden, projectionDim],
      { activations: [relu, relu] }
    );
  }

  /**
   * Inference path: returns h, the encoder output before the projection head.
   * This is the vector intended for downstream tasks.
   */
  encode(x) {
    return this.encoder.predict(x);
  }

  /**
   * Training path: encoder followed by projection head. Returns z, the vector
   * the NT-Xent loss is computed on. Not intended for downstream tasks.
   */
  project(x) {
    return this.projectionHead.predict(this.encode(x));
  }

  /**
   * Cosine similarity sim(a, b) = aᵀb / (||a|| · ||b||), iterating over the
   * length of `a`. Returns 0 when the product of the norms is below 1e-10
   * (e.g. a zero vector) instead of dividing by ~0.
   */
  static cosineSimilarity(a, b) {
    let dot = 0;
    let normA = 0;
    let normB = 0;
    for (let d = 0; d < a.length; d++) {
      dot += a[d] * b[d];
      normA += a[d] * a[d];
      normB += b[d] * b[d];
    }
    const denom = Math.sqrt(normA) * Math.sqrt(normB);
    return denom < 1e-10 ? 0 : dot / denom;
  }

  /**
   * Forward-only NT-Xent loss for a batch of [view, otherView] pairs.
   * Does not modify any weights.
   */
  computeLoss(pairs) {
    const { projections, N } = this._forwardProjections(pairs);
    return this._ntXentLoss(projections, N);
  }

  /**
   * One training step using central finite differences.
   *
   * Every weight and bias of the encoder, then of the projection head, is
   * visited in order; its partial derivative is estimated as
   * (L(w + eps) - L(w - eps)) / (2 * eps) and the parameter is updated
   * immediately, so later parameters see the already-updated earlier ones.
   * Each parameter costs two full forward passes over the batch.
   *
   * @param pairs - batch of positive pairs
   * @param lr    - learning rate
   * @returns the NT-Xent loss measured before any update
   */
  trainStep(pairs, lr) {
    const loss = this.computeLoss(pairs);
    const eps = 1e-4;
    for (const network of [this.encoder, this.projectionHead]) {
      for (const layer of network.layers) {
        for (const neuron of layer.neurons) {
          for (let j = 0; j < neuron.weights.length; j++) {
            this._finiteDifferenceUpdate(neuron.weights, j, pairs, lr, eps);
          }
          this._finiteDifferenceUpdate(neuron, "bias", pairs, lr, eps);
        }
      }
    }
    return loss;
  }

  // Private: update the single parameter holder[key] by one finite-difference
  // gradient step. The original value is saved and written back explicitly so
  // that no floating-point drift from "+eps, -2eps, +eps" leaks into the weight.
  _finiteDifferenceUpdate(holder, key, pairs, lr, eps) {
    const original = holder[key];
    holder[key] = original + eps;
    const lossPlus = this.computeLoss(pairs);
    holder[key] = original - eps;
    const lossMinus = this.computeLoss(pairs);
    const grad = (lossPlus - lossMinus) / (2 * eps);
    // A non-finite estimate would poison the parameter with NaN/Infinity;
    // leave the parameter untouched in that case.
    holder[key] = Number.isFinite(grad) ? original - lr * grad : original;
  }

  // Private: project every view of every pair.
  //
  // Returns { projections, N } where projections is a flat array of 2N vectors:
  //   [ z_0, z_0', z_1, z_1', ..., z_{N-1}, z_{N-1}' ]
  // Even index 2i is the first view of pair i, odd index 2i+1 its positive.
  _forwardProjections(pairs) {
    const projections = [];
    for (const [x, xAug] of pairs) {
      projections.push(this.project(x), this.project(xAug));
    }
    return { projections, N: pairs.length };
  }

  // Private: NT-Xent loss over 2N projections.
  //
  // projections[2i] and projections[2i+1] are positives of each other; the
  // remaining 2N-2 entries are negatives for that anchor. For anchor i:
  //   loss_i = -log( exp(s_ip) / Σ_{k≠i} exp(s_ik) ),  s = cosine / τ
  // evaluated as logsumexp_{k≠i}(s_ik) - s_ip. Subtracting the row maximum
  // before exponentiating keeps the result finite for small temperatures,
  // where exp(s) would otherwise overflow and yield NaN.
  // Returns the mean over all 2N anchors, or 0 for an empty batch.
  _ntXentLoss(projections, N) {
    const total = 2 * N;
    if (total === 0) {
      return 0;
    }
    const tau = this.temperature;
    let totalLoss = 0;
    for (let i = 0; i < total; i++) {
      const posIdx = i % 2 === 0 ? i + 1 : i - 1;
      const logits = [];
      let posLogit = 0;
      let maxLogit = -Infinity;
      for (let k = 0; k < total; k++) {
        if (k === i) continue; // an anchor is never compared with itself
        const s = _ContrastiveLearning.cosineSimilarity(projections[i], projections[k]) / tau;
        if (k === posIdx) posLogit = s;
        if (s > maxLogit) maxLogit = s;
        logits.push(s);
      }
      let sumExp = 0;
      for (const s of logits) {
        sumExp += Math.exp(s - maxLogit);
      }
      totalLoss += maxLogit + Math.log(sumExp) - posLogit;
    }
    return totalLoss / total;
  }
};
5752
+
4853
5753
  // src/GAN.ts
4854
5754
  var GAN = class {
4855
5755
  constructor(latentDim, generatorHidden, outputDim, discriminatorHidden, options) {
@@ -5386,6 +6286,216 @@ function _binaryRecall(yTrue, yPred, pos) {
5386
6286
  return tp + fn > 0 ? tp / (tp + fn) : 0;
5387
6287
  }
5388
6288
 
6289
+ // src/Tokenizer.ts
6290
// Text tokenizer with a frequency-ordered vocabulary and four built-in special
// tokens (<PAD>=0, <UNK>=1, <BOS>=2, <EOS>=3 after fit()).
var _Tokenizer = class _Tokenizer {
  /**
   * @param options.mode          - "word" (default), "whitespace" or "char"
   * @param options.lowercase     - lowercase text before tokenizing. Default: true
   * @param options.maxVocab      - total vocabulary cap including special tokens;
   *                                0 (default) means unlimited
   * @param options.specialTokens - extra special tokens registered right after
   *                                the built-in ones
   */
  constructor(options = {}) {
    this._token2id = /* @__PURE__ */ new Map();
    this._id2token = /* @__PURE__ */ new Map();
    this._fitted = false;
    this._mode = options.mode ?? "word";
    this._lowercase = options.lowercase ?? true;
    this._maxVocab = options.maxVocab ?? 0;
    this._extraSpecial = options.specialTokens ?? [];
  }

  // ── Fit ───────────────────────────────────────────────────────────────────
  /**
   * Build vocabulary from an array of text strings.
   * Calling fit() again resets and rebuilds the vocabulary from scratch.
   *
   * Special tokens get the lowest IDs; corpus tokens follow in order of
   * descending frequency, ties broken by localeCompare.
   *
   * @param texts - corpus to build the vocabulary from
   * @returns this (chainable)
   */
  fit(texts) {
    this._token2id = /* @__PURE__ */ new Map();
    this._id2token = /* @__PURE__ */ new Map();
    const specials = [
      _Tokenizer.PAD,
      _Tokenizer.UNK,
      _Tokenizer.BOS,
      _Tokenizer.EOS,
      ...this._extraSpecial
    ];
    for (const s of specials) this._register(s);
    const freq = /* @__PURE__ */ new Map();
    for (const text of texts) {
      for (const token of this.tokenize(text)) {
        freq.set(token, (freq.get(token) ?? 0) + 1);
      }
    }
    // Corpus tokens that coincide with a special token are already registered
    // and must not consume vocabulary budget.
    let entries = [...freq.entries()]
      .filter(([token]) => !this._token2id.has(token))
      .sort(([a, fa], [b, fb]) => fb - fa || a.localeCompare(b));
    if (this._maxVocab > 0) {
      // Budget is what remains after the (deduplicated) specials. Clamp at 0:
      // a negative slice end would drop entries from the tail instead of
      // keeping none.
      const budget = Math.max(0, this._maxVocab - this._token2id.size);
      entries = entries.slice(0, budget);
    }
    for (const [token] of entries) this._register(token);
    this._fitted = true;
    return this;
  }

  // ── Tokenize ──────────────────────────────────────────────────────────────
  /**
   * Split raw text into an array of string tokens (no ID conversion yet).
   * Useful for inspecting what the tokenizer produces before encoding.
   *
   * - "char":       one token per Unicode code point (surrogate pairs stay intact)
   * - "whitespace": split on runs of whitespace, empty pieces removed
   * - "word":       runs of ASCII letters/digits and the listed accented letters,
   *                 plus each non-word, non-space character as its own token.
   *                 Note: "_" matches neither alternative and is dropped.
   */
  tokenize(text) {
    const t = this._lowercase ? text.toLowerCase() : text;
    switch (this._mode) {
      case "char":
        return Array.from(t);
      case "whitespace":
        return t.split(/\s+/).filter(Boolean);
      case "word":
      default:
        return t.match(/[a-z0-9àáâãäåæçèéêëìíîïðñòóôõöùúûüýþÿ]+|[^\w\s]/gi) ?? [];
    }
  }

  // ── Encode ────────────────────────────────────────────────────────────────
  /**
   * Convert a text string to a sequence of token IDs.
   * Unknown tokens map to <UNK> (id 1).
   *
   * @param text    - input text
   * @param options - addBOS / addEOS flags
   * @throws Error if fit() has not been called
   */
  encode(text, options = {}) {
    this._assertFitted();
    const unkId = this._token2id.get(_Tokenizer.UNK);
    const ids = [];
    if (options.addBOS) ids.push(this._token2id.get(_Tokenizer.BOS));
    for (const token of this.tokenize(text)) {
      ids.push(this._token2id.get(token) ?? unkId);
    }
    if (options.addEOS) ids.push(this._token2id.get(_Tokenizer.EOS));
    return ids;
  }

  // ── Encode batch ──────────────────────────────────────────────────────────
  /**
   * Encode an array of texts, optionally padding/truncating to a fixed length.
   * Sequences longer than padTo are cut on the right; shorter ones are
   * right-padded with the <PAD> id.
   *
   * @param texts   - array of input texts
   * @param options - addBOS / addEOS / padTo
   */
  encodeBatch(texts, options = {}) {
    const sequences = texts.map((t) => this.encode(t, options));
    if (options.padTo === void 0) {
      return sequences;
    }
    const len = options.padTo;
    const padId = this._token2id.get(_Tokenizer.PAD);
    return sequences.map((seq) => {
      if (seq.length >= len) return seq.slice(0, len);
      return [...seq, ...Array(len - seq.length).fill(padId)];
    });
  }

  // ── Decode ────────────────────────────────────────────────────────────────
  /**
   * Convert a sequence of token IDs back to a human-readable string.
   * IDs not in the vocabulary decode to <UNK>. Tokens are joined without a
   * separator in "char" mode and with a single space otherwise.
   *
   * @param ids          - array of token IDs
   * @param stripSpecial - remove PAD/BOS/EOS tokens from output. Default: true
   * @throws Error if fit() has not been called
   */
  decode(ids, stripSpecial = true) {
    this._assertFitted();
    const specials = /* @__PURE__ */ new Set([_Tokenizer.PAD, _Tokenizer.BOS, _Tokenizer.EOS]);
    const tokens = [];
    for (const id of ids) {
      const token = this._id2token.get(id) ?? _Tokenizer.UNK;
      if (stripSpecial && specials.has(token)) continue;
      tokens.push(token);
    }
    return this._mode === "char" ? tokens.join("") : tokens.join(" ");
  }

  // ── One-hot encoding ──────────────────────────────────────────────────────
  /**
   * Convert a sequence of token IDs to one-hot vectors.
   * Each vector has length `vocabSize` with a single 1 at the token's position.
   * IDs that are not integers in [0, vocabSize) produce an all-zero vector.
   *
   * @param ids - array of token IDs (e.g. from encode())
   * @returns   - 2D array of shape [seqLen, vocabSize]
   * @throws Error if fit() has not been called
   */
  oneHot(ids) {
    this._assertFitted();
    const V = this.vocabSize;
    return ids.map((id) => {
      const vec = new Array(V).fill(0);
      // Integer check: a fractional index would add a stray property instead
      // of setting an element.
      if (Number.isInteger(id) && id >= 0 && id < V) vec[id] = 1;
      return vec;
    });
  }

  // ── Vocabulary helpers ────────────────────────────────────────────────────
  /** Number of tokens in the vocabulary (including special tokens). */
  get vocabSize() {
    return this._token2id.size;
  }

  /** True if fit() has been called at least once (or the instance came from fromJSON()). */
  get isFitted() {
    return this._fitted;
  }

  /** Get the integer ID for a token string, or undefined if not in vocabulary. */
  tokenToId(token) {
    return this._token2id.get(token);
  }

  /** Get the token string for an integer ID, or undefined if out of range. */
  idToToken(id) {
    return this._id2token.get(id);
  }

  /**
   * Return the full vocabulary as an array ordered by ID.
   * Index i of the returned array is the token with ID i.
   */
  getVocabulary() {
    return Array.from({ length: this.vocabSize }, (_, i) => this._id2token.get(i));
  }

  // ── Persistence ───────────────────────────────────────────────────────────
  /**
   * Serialize the fitted tokenizer to a plain JSON-compatible object.
   * Store it with JSON.stringify(); reload with Tokenizer.fromJSON().
   * Extra special tokens are included so that a later fit() on the restored
   * instance registers them again.
   *
   * @throws Error if fit() has not been called
   */
  toJSON() {
    this._assertFitted();
    return {
      mode: this._mode,
      lowercase: this._lowercase,
      maxVocab: this._maxVocab,
      specialTokens: [...this._extraSpecial],
      token2id: Object.fromEntries(this._token2id)
    };
  }

  /**
   * Restore a Tokenizer from a snapshot produced by toJSON().
   * Snapshots without a `specialTokens` field are accepted; the restored
   * instance then has no extra special tokens configured.
   */
  static fromJSON(snapshot) {
    const tok = new _Tokenizer({
      mode: snapshot.mode,
      lowercase: snapshot.lowercase,
      maxVocab: snapshot.maxVocab,
      specialTokens: snapshot.specialTokens
    });
    for (const [token, id] of Object.entries(snapshot.token2id)) {
      tok._token2id.set(token, id);
      tok._id2token.set(id, token);
    }
    tok._fitted = true;
    return tok;
  }

  // ── Private ───────────────────────────────────────────────────────────────
  // Assign the next free ID (current vocabulary size) to a token; no-op when
  // the token is already registered.
  _register(token) {
    if (this._token2id.has(token)) return;
    const id = this._token2id.size;
    this._token2id.set(token, id);
    this._id2token.set(id, token);
  }

  // Guard for methods that need a vocabulary.
  _assertFitted() {
    if (!this._fitted) {
      throw new Error("Tokenizer: call fit() before encoding or decoding.");
    }
  }
};
// ── Built-in special tokens ────────────────────────────────────────────────
_Tokenizer.PAD = "<PAD>";
_Tokenizer.UNK = "<UNK>";
_Tokenizer.BOS = "<BOS>";
_Tokenizer.EOS = "<EOS>";
var Tokenizer = _Tokenizer;
6498
+
5389
6499
  // src/EarlyStopping.ts
5390
6500
  var EarlyStopping = class {
5391
6501
  constructor(options) {
@@ -5661,16 +6771,19 @@ function _sampleNormal() {
5661
6771
  0 && (module.exports = {
5662
6772
  Adam,
5663
6773
  AttentionHead,
6774
+ Augmenter,
5664
6775
  Autoencoder,
5665
6776
  BatchNorm,
5666
6777
  BiasVector,
5667
6778
  CausalConv1D,
5668
6779
  ClipOptimizer,
5669
6780
  ClippedOptimizerFactory,
6781
+ ContrastiveLearning,
5670
6782
  Conv1D,
5671
6783
  Conv2D,
5672
6784
  DataAugmentation,
5673
6785
  DataLoader,
6786
+ DatasetLoader,
5674
6787
  DecisionTree,
5675
6788
  Dropout,
5676
6789
  EarlyStopping,
@@ -5685,6 +6798,7 @@ function _sampleNormal() {
5685
6798
  LSTMLayer,
5686
6799
  Layer,
5687
6800
  LayerNorm,
6801
+ LearnedPositionalEncoding,
5688
6802
  LinearRegression,
5689
6803
  LogisticRegression,
5690
6804
  LossPlotter,
@@ -5701,18 +6815,22 @@ function _sampleNormal() {
5701
6815
  NeuronN,
5702
6816
  PCA,
5703
6817
  Perceptron,
6818
+ PositionalEncoding,
5704
6819
  RNN,
5705
6820
  SGD,
5706
6821
  SOM,
5707
6822
  Seq2Seq,
5708
6823
  SoftmaxRegression,
5709
6824
  TCN,
6825
+ TSNE,
6826
+ Tokenizer,
5710
6827
  Trainer,
5711
6828
  TransformerBlock,
5712
6829
  VAE,
5713
6830
  Value,
5714
6831
  WeightInspector,
5715
6832
  WeightMatrix,
6833
+ Word2Vec,
5716
6834
  accuracy,
5717
6835
  auc,
5718
6836
  classificationReport,