npm - @genai-fi/nanogpt - Versions diffs - 0.9.0 → 0.10.0 - Mend

@genai-fi/nanogpt 0.9.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (343) hide show

package/README.md +352 -14
package/dist/Generator.js +69 -78
package/dist/{RealDiv-D4EzDsC0.js → RealDiv-DgA3z9oO.js} +32 -206
package/dist/Reshape-CF6odzV4.js +16 -0
package/dist/Reshape-_kILl6tK.js +81 -0
package/dist/TeachableLLM.js +28 -22
package/dist/Trainer.d.ts +2 -0
package/dist/Trainer.js +3 -2
package/dist/{axis_util-TbGYJ208.js → axis_util-BvHEw88j.js} +7 -23
package/dist/backend.d.ts +2 -1
package/dist/backend.js +10 -4
package/dist/backend_util-D-rUb2ty.js +474 -0
package/dist/backend_webgpu-B0u2ndUn.js +547 -0
package/dist/binary_op_util-pKXltfxI.js +192 -0
package/dist/broadcast_to-CwF7XIeu.js +30 -0
package/dist/checks/appendCache.js +2 -2
package/dist/checks/attentionMask.js +3 -3
package/dist/checks/check.d.ts +1 -1
package/dist/checks/check.js +8 -8
package/dist/checks/gelu.js +2 -2
package/dist/checks/index.d.ts +2 -0
package/dist/checks/index.js +7 -5
package/dist/checks/matMulGelu.js +6 -6
package/dist/checks/normRMS.js +7 -7
package/dist/checks/normRMSGrad.js +3 -3
package/dist/checks/packUnpack.d.ts +1 -0
package/dist/checks/packUnpack.js +18 -0
package/dist/checks/qkv.js +12 -27
package/dist/checks/rope.js +2 -2
package/dist/checks/weights.js +18 -16
package/dist/complex-CSlYz-2T.js +13 -0
package/dist/complex_util-Yc1A_gV1.js +55 -0
package/dist/concat-BHlIJeyT.js +19 -0
package/dist/concat_util-DcJk7YHS.js +22 -0
package/dist/data/docx.js +1 -1
package/dist/data/parquet.js +2 -2
package/dist/data/pdf.js +1 -1
package/dist/data/textLoader.js +1 -1
package/dist/{dataset-DlZtKmBq.js → dataset-0xP8GjwI.js} +136 -236
package/dist/dropout-C1pM3f11.js +99 -0
package/dist/expand_dims-BPG4fwBP.js +13 -0
package/dist/exports_initializers-xuidcwI4.js +7 -0
package/dist/gather-DykLGqmW.js +10 -0
package/dist/{gelu-Bp_-935b.js → gelu-CNLFZWea.js} +11 -10
package/dist/{gpgpu_math-CDaYiyE_.js → gpgpu_math-DDVJCn6-.js} +90 -265
package/dist/{index-C4L8Cm77.js → index-CieiGp4Y.js} +14 -14
package/dist/index-CjOj7j-u.js +7308 -0
package/dist/{index-Tf7vU29b.js → index-Cp39cXWe.js} +3 -10
package/dist/{index-Dwqa6Zy2.js → index-DvYrXKkX.js} +2 -2
package/dist/index-ZyQhjEPo.js +2157 -0
package/dist/{jszip.min-CjP2V1VV.js → jszip.min-Bz5-11Bk.js} +56 -57
package/dist/kernel_funcs_utils-Dg_-E44D.js +308 -0
package/dist/layers/BaseLayer.d.ts +1 -0
package/dist/layers/BaseLayer.js +7 -6
package/dist/layers/CausalSelfAttention.d.ts +0 -1
package/dist/layers/CausalSelfAttention.js +56 -55
package/dist/layers/MLP.js +15 -16
package/dist/layers/PositionEmbedding.js +5 -14
package/dist/layers/RMSNorm.js +3 -3
package/dist/layers/RoPECache.d.ts +2 -0
package/dist/layers/RoPECache.js +22 -17
package/dist/layers/TiedEmbedding.js +22 -17
package/dist/layers/TransformerBlock.js +21 -20
package/dist/loader/load.js +1 -1
package/dist/loader/loadTransformers.js +1 -1
package/dist/loader/oldZipLoad.js +39 -33
package/dist/loader/save.js +1 -1
package/dist/log_sum_exp-DWI-76TI.js +41 -0
package/dist/main.d.ts +8 -0
package/dist/main.js +63 -52
package/dist/matMul16--R5hOwDG.js +77 -0
package/dist/mat_mul-DeAh4uTH.js +12 -0
package/dist/mod-Gt1rMB4n.js +12 -0
package/dist/models/NanoGPTV1.js +40 -31
package/dist/models/model.d.ts +2 -0
package/dist/models/model.js +37 -29
package/dist/{mulmat_packed_gpu-BT60jmzP.js → mulmat_packed_gpu-BMFhLwta.js} +1 -17
package/dist/{non_max_suppression_impl-CsEgBuMA.js → non_max_suppression_impl-B2W7YjZB.js} +0 -32
package/dist/ones-CAMiP4I2.js +15 -0
package/dist/ops/adamAdjust.js +1 -1
package/dist/ops/adamMoments.d.ts +1 -1
package/dist/ops/adamMoments.js +4 -4
package/dist/ops/add16.d.ts +2 -0
package/dist/ops/add16.js +9 -0
package/dist/ops/appendCache.js +16 -9
package/dist/ops/attentionMask.js +4 -4
package/dist/ops/concat16.d.ts +2 -0
package/dist/ops/concat16.js +9 -0
package/dist/ops/cpu/adamAdjust.js +14 -13
package/dist/ops/cpu/adamMoments.js +10 -9
package/dist/ops/cpu/appendCache.js +9 -8
package/dist/ops/cpu/attentionMask.js +15 -14
package/dist/ops/cpu/fusedSoftmax.js +13 -12
package/dist/ops/cpu/gatherSub.js +9 -24
package/dist/ops/cpu/gelu.js +13 -12
package/dist/ops/cpu/matMul16.d.ts +1 -0
package/dist/ops/cpu/matMul16.js +16 -0
package/dist/ops/cpu/matMulGelu.js +18 -16
package/dist/ops/cpu/matMulMul.js +8 -7
package/dist/ops/cpu/mulDropout.js +4 -3
package/dist/ops/cpu/normRMS.js +11 -10
package/dist/ops/cpu/qkv.js +17 -13
package/dist/ops/cpu/rope.js +23 -22
package/dist/ops/cpu/scatterSub.js +16 -30
package/dist/ops/dot16.d.ts +2 -0
package/dist/ops/dot16.js +42 -0
package/dist/ops/gatherSub.js +1 -1
package/dist/ops/gelu.js +2 -2
package/dist/ops/grads/add16.d.ts +1 -0
package/dist/ops/grads/add16.js +27 -0
package/dist/ops/grads/attentionMask.js +12 -19
package/dist/ops/grads/gelu.js +4 -3
package/dist/ops/grads/matMul16.d.ts +2 -0
package/dist/ops/grads/matMul16.js +9 -0
package/dist/ops/grads/matMulGelu.js +8 -7
package/dist/ops/grads/normRMS.js +8 -7
package/dist/ops/grads/{fusedSoftmax.d.ts → pack16.d.ts} +1 -1
package/dist/ops/grads/pack16.js +7 -0
package/dist/ops/grads/qkv.d.ts +3 -1
package/dist/ops/grads/qkv.js +28 -22
package/dist/ops/grads/rope.d.ts +2 -1
package/dist/ops/grads/rope.js +6 -13
package/dist/ops/grads/softmax16.d.ts +2 -0
package/dist/ops/grads/softmax16.js +26 -0
package/dist/ops/grads/unpack16.d.ts +2 -0
package/dist/ops/grads/unpack16.js +6 -0
package/dist/ops/grads/utils.d.ts +3 -0
package/dist/ops/grads/utils.js +10 -0
package/dist/ops/matMul16.d.ts +15 -0
package/dist/ops/matMul16.js +13 -0
package/dist/ops/matMulGelu.js +1 -1
package/dist/ops/matMulMul.js +1 -1
package/dist/ops/mul16.d.ts +2 -0
package/dist/ops/mul16.js +8 -0
package/dist/ops/mulDrop.js +1 -1
package/dist/ops/normRMS.js +1 -1
package/dist/ops/pack16.d.ts +2 -0
package/dist/ops/pack16.js +6 -0
package/dist/ops/qkv.d.ts +1 -1
package/dist/ops/qkv.js +8 -4
package/dist/ops/reshape16.d.ts +2 -0
package/dist/ops/reshape16.js +43 -0
package/dist/ops/rope.d.ts +1 -1
package/dist/ops/rope.js +8 -10
package/dist/ops/scatterSub.js +1 -1
package/dist/ops/slice16.d.ts +2 -0
package/dist/ops/slice16.js +9 -0
package/dist/ops/softmax16.d.ts +2 -0
package/dist/ops/softmax16.js +12 -0
package/dist/ops/sub16.d.ts +2 -0
package/dist/ops/sub16.js +8 -0
package/dist/ops/sum16.d.ts +2 -0
package/dist/ops/sum16.js +13 -0
package/dist/ops/transpose16.d.ts +3 -0
package/dist/ops/transpose16.js +41 -0
package/dist/ops/unpack16.d.ts +2 -0
package/dist/ops/unpack16.js +6 -0
package/dist/ops/webgl/adamAdjust.js +3 -2
package/dist/ops/webgl/adamMoments.js +2 -1
package/dist/ops/webgl/appendCache.js +2 -1
package/dist/ops/webgl/attentionMask.js +5 -4
package/dist/ops/webgl/fusedSoftmax.js +6 -4
package/dist/ops/webgl/gatherSub.js +7 -6
package/dist/ops/webgl/gelu.js +3 -2
package/dist/ops/webgl/log.js +12 -27
package/dist/ops/webgl/matMul16.d.ts +1 -0
package/dist/ops/webgl/matMul16.js +37 -0
package/dist/ops/webgl/matMulGelu.js +17 -15
package/dist/ops/webgl/matMulMul.js +13 -12
package/dist/ops/webgl/mulDropout.js +9 -8
package/dist/ops/webgl/normRMS.js +8 -7
package/dist/ops/webgl/qkv.js +6 -5
package/dist/ops/webgl/rope.js +11 -10
package/dist/ops/webgl/scatterSub.js +6 -5
package/dist/ops/webgpu/adamAdjust.js +12 -10
package/dist/ops/webgpu/adamMoments.js +27 -22
package/dist/ops/webgpu/add16.d.ts +1 -0
package/dist/ops/webgpu/add16.js +14 -0
package/dist/ops/webgpu/appendCache.js +64 -17
package/dist/ops/webgpu/attentionMask.js +19 -62
package/dist/ops/webgpu/attentionMask32_program.d.ts +19 -0
package/dist/ops/webgpu/attentionMask32_program.js +54 -0
package/dist/ops/webgpu/concat16.d.ts +19 -0
package/dist/ops/webgpu/concat16.js +128 -0
package/dist/ops/webgpu/gatherSub.js +9 -7
package/dist/ops/webgpu/gelu.js +78 -31
package/dist/ops/webgpu/index.js +12 -0
package/dist/ops/webgpu/matMul16.d.ts +1 -0
package/dist/ops/webgpu/matMul16.js +58 -0
package/dist/ops/webgpu/matMul16_program.d.ts +42 -0
package/dist/ops/webgpu/matMul16_program.js +336 -0
package/dist/ops/webgpu/mul16.d.ts +1 -0
package/dist/ops/webgpu/mul16.js +14 -0
package/dist/ops/webgpu/normRMS.js +21 -40
package/dist/ops/webgpu/normRMS16_program.d.ts +9 -0
package/dist/ops/webgpu/normRMS16_program.js +24 -0
package/dist/ops/webgpu/normRMS32_program.d.ts +9 -0
package/dist/ops/webgpu/normRMS32_program.js +24 -0
package/dist/ops/webgpu/normRMSGrad.js +113 -64
package/dist/ops/webgpu/pack16.d.ts +1 -0
package/dist/ops/webgpu/pack16.js +19 -0
package/dist/ops/webgpu/pack16_program.d.ts +19 -0
package/dist/ops/webgpu/pack16_program.js +92 -0
package/dist/ops/webgpu/qkv.js +20 -55
package/dist/ops/webgpu/rope.js +77 -22
package/dist/ops/webgpu/scatterSub.js +9 -7
package/dist/ops/webgpu/slice16.d.ts +7 -0
package/dist/ops/webgpu/slice16.js +71 -0
package/dist/{variable-Bm2OFwGI.js → ops/webgpu/softmax16.d.ts} +2 -8
package/dist/ops/webgpu/softmax16.js +23 -0
package/dist/ops/webgpu/softmax16_program.d.ts +13 -0
package/dist/ops/webgpu/softmax16_program.js +73 -0
package/dist/ops/webgpu/softmax16_subgroup_program.d.ts +17 -0
package/dist/ops/webgpu/softmax16_subgroup_program.js +75 -0
package/dist/ops/webgpu/softmax16grad.d.ts +1 -0
package/dist/ops/webgpu/softmax16grad.js +38 -0
package/dist/ops/webgpu/sub16.d.ts +1 -0
package/dist/ops/webgpu/sub16.js +14 -0
package/dist/ops/webgpu/sum16.d.ts +1 -0
package/dist/ops/webgpu/sum16.js +40 -0
package/dist/ops/webgpu/transpose16.d.ts +1 -0
package/dist/ops/webgpu/transpose16.js +35 -0
package/dist/ops/webgpu/transpose16_program.d.ts +16 -0
package/dist/ops/webgpu/transpose16_program.js +50 -0
package/dist/ops/webgpu/transpose16_shared_program.d.ts +15 -0
package/dist/ops/webgpu/transpose16_shared_program.js +71 -0
package/dist/ops/webgpu/unpack16.d.ts +1 -0
package/dist/ops/webgpu/unpack16.js +49 -0
package/dist/ops/webgpu/utils/binary_op.d.ts +19 -0
package/dist/ops/webgpu/utils/binary_op.js +79 -0
package/dist/ops/webgpu/utils/deviceInfo.d.ts +7 -0
package/dist/ops/webgpu/utils/deviceInfo.js +11 -0
package/dist/ops/webgpu/utils/reductions.d.ts +32 -4
package/dist/ops/webgpu/utils/reductions.js +236 -45
package/dist/ops-CNI3TwqM.js +645 -0
package/dist/pack16-CFUqumar.js +41 -0
package/dist/{papaparse.min-C8l2Kvo1.js → papaparse.min-C0cScC2i.js} +2 -8
package/dist/{parquet-C0Tlmv9c.js → parquet-BE8MU_ge.js} +201 -278
package/dist/patches/PackedTensor.d.ts +12 -0
package/dist/patches/PackedTensor.js +11 -0
package/dist/patches/engine.d.ts +261 -0
package/dist/patches/engine.js +10 -0
package/dist/patches/tape.d.ts +12 -0
package/dist/patches/tape.js +5 -0
package/dist/patches/webgpu_backend.d.ts +18 -0
package/dist/patches/webgpu_backend.js +57 -0
package/dist/{tensor-CZr4dh61.js → patches/webgpu_base.d.ts} +5 -8
package/dist/patches/webgpu_base.js +34 -0
package/dist/patches/webgpu_program.d.ts +36 -0
package/dist/patches/webgpu_program.js +401 -0
package/dist/{pdf-kJD-f258.js → pdf-NIhmP3sq.js} +424 -428
package/dist/random_width-DY6Kk2Dl.js +10051 -0
package/dist/range-BMS52eQi.js +11 -0
package/dist/reciprocal-CTmshQ9J.js +10 -0
package/dist/{register_all_kernels-DIGpEwcf.js → register_all_kernels-Bwu1PTuU.js} +719 -9766
package/dist/relu-yZ2-7WxU.js +10 -0
package/dist/reshape-DevtBWtf.js +10 -0
package/dist/rope-B5UUMsPi.js +32 -0
package/dist/{scatter_nd_util-BQdz--Gn.js → scatter_nd_util-5EL-8VAQ.js} +1 -1
package/dist/selu_util-D1w6yyTO.js +303 -0
package/dist/{shared-DuP7ue-R.js → shared-BRksrJb3.js} +1 -17
package/dist/shared-BuAXb4CI.js +2145 -0
package/dist/sin-BGfy2HZo.js +16 -0
package/dist/slice-D_gkkqZK.js +13 -0
package/dist/slice_util-DtEldBfK.js +261 -0
package/dist/softmax-ZHVebtR1.js +13 -0
package/dist/split-DrfihRpZ.js +10 -0
package/dist/squeeze-DZEpeblb.js +11 -0
package/dist/stack-yOIAalTq.js +13 -0
package/dist/sum-_fzj5ZTB.js +12 -0
package/dist/tensor-DdQUJZlz.js +909 -0
package/dist/tensor-f35l8Odg.js +8 -0
package/dist/tensor1d-CeZuc-Rv.js +12 -0
package/dist/tensor2d-G4Ys2GxX.js +15 -0
package/dist/tensor4d-B8roDgtc.js +15 -0
package/dist/tensor_util-DV-FP5Q3.js +523 -0
package/dist/tfjs_backend-kNyO5L2d.js +653 -0
package/dist/tile-BzyEiF-F.js +13 -0
package/dist/tokeniser/CharTokeniser.js +1 -1
package/dist/tokeniser/bpe.js +1 -1
package/dist/training/Adam.d.ts +2 -1
package/dist/training/Adam.js +12 -28
package/dist/training/AdamExt.d.ts +1 -0
package/dist/training/AdamExt.js +2 -2
package/dist/training/DatasetBuilder.js +3 -20
package/dist/training/FullTrainer.js +82 -64
package/dist/training/Trainer.d.ts +11 -6
package/dist/training/Trainer.js +51 -39
package/dist/training/sparseCrossEntropy.js +3 -3
package/dist/transpose-DKELTqhe.js +38 -0
package/dist/utilities/arrayClose.js +7 -7
package/dist/utilities/dummy.js +35 -27
package/dist/utilities/multinomialCPU.js +2 -2
package/dist/utilities/packed.d.ts +7 -0
package/dist/utilities/packed.js +716 -0
package/dist/utilities/performance.js +1 -1
package/dist/utilities/profile.js +1 -1
package/dist/utilities/safetensors.js +2 -2
package/dist/utilities/sentences.d.ts +5 -0
package/dist/utilities/sentences.js +41 -0
package/dist/utilities/weights.js +2 -2
package/dist/variable-Bhn5bHYv.js +7 -0
package/dist/{webgpu_program-DkQJOJSd.js → webgpu_program-Cigz-7RF.js} +15 -44
package/dist/webgpu_util-BBCnKm2X.js +65 -0
package/dist/zeros-2gldETuK.js +14 -0
package/package.json +4 -3
package/dist/Reshape-Bowtk9BP.js +0 -127
package/dist/Reshape-DUqYftGC.js +0 -30
package/dist/backend_util-CJIiDoV1.js +0 -749
package/dist/broadcast_to-DzlNweb8.js +0 -44
package/dist/concat-B912vBbo.js +0 -33
package/dist/dropout-C-csYCLj.js +0 -193
package/dist/exports_initializers-B8iZMgQ0.js +0 -16
package/dist/gather-Dnpgw-YQ.js +0 -25
package/dist/index-BzFyqcy-.js +0 -4457
package/dist/index-C1rx_Ajs.js +0 -12076
package/dist/kernel_funcs_utils-DKLK0Mg3.js +0 -466
package/dist/log_sum_exp-DO6z8tSE.js +0 -103
package/dist/mat_mul-DzjTFx-u.js +0 -27
package/dist/mod-Dobti4j4.js +0 -27
package/dist/ones-tIJeHlq-.js +0 -29
package/dist/ops/fusedSoftmax.d.ts +0 -2
package/dist/ops/fusedSoftmax.js +0 -10
package/dist/ops/grads/fusedSoftmax.js +0 -22
package/dist/ops-LuCMAnmM.js +0 -1525
package/dist/random_width-CXVRloNK.js +0 -13670
package/dist/range-CWcz7xFA.js +0 -26
package/dist/reciprocal-C4rNcM-S.js +0 -25
package/dist/relu-BjCh_SYb.js +0 -25
package/dist/reshape-CnIwVG1c.js +0 -25
package/dist/selu_util-OtRzVwW5.js +0 -719
package/dist/shared-DmRsFyaJ.js +0 -3134
package/dist/sin-gpDNRxE0.js +0 -47
package/dist/slice-d0Vo9XTN.js +0 -28
package/dist/softmax-D7Jj3p_P.js +0 -28
package/dist/split-DK2k5eHf.js +0 -25
package/dist/stack-DFatutCx.js +0 -27
package/dist/sum-CJ0ULhmt.js +0 -27
package/dist/tensor1d-vML0r3q6.js +0 -27
package/dist/tensor2d-D76QGjF3.js +0 -30
package/dist/tensor4d-Df1WlVDY.js +0 -30
package/dist/webgpu_util-pLEV9tks.js +0 -80
package/dist/zeros-Bj5rMYA7.js +0 -52

package/README.md CHANGED Viewed

@@ -1,28 +1,366 @@
 # GenAI NanoGPT
-Developed as a part of the Finnish Generation AI research project. This is an implementation of [NanoGPT](https://github.com/karpathy/nanoGPT) for Tensorflow.js. It allows GPT models to be training and loaded within a web browser and exposes some XAI functionality.
+GenAI NanoGPT is a browser-native implementation of GPT language models built on TensorFlow.js, developed as part of the Finnish Generation AI research project. This library enables training, fine-tuning, and inference of transformer-based language models entirely in the browser, with support for explainable AI (XAI) features. Because it mostly targets tiny models, it is intended as an educational tool for learning about the model training process. In principle, it could be adapted to load other pre-trained models from Hugging Face.
-Work in progress...
+Live version available here: https://lm.gen-ai.fi
-# Install
+## Overview
-```
+GenAI NanoGPT is inspired by [Andrej Karpathy's NanoGPT](https://github.com/karpathy/nanoGPT) but reimagined for the browser using TensorFlow.js. It provides a complete pipeline for:
+-   **Training** language models from scratch in the browser
+-   **Loading** pre-trained models from various sources (Hugging Face, local files)
+-   **Generating** text efficiently on a wide range of devices
+-   **Analyzing** model behavior through attention visualization and embeddings
+-   **Optimizing** performance across CPU, WebGL, and WebGPU backends
+### Key Features
+-   🚀 **Browser-Native**: No server required - train and run models entirely client-side
+-   📱 **Works on Small Devices**: Train models on iPads, phones, and Chromebooks - no powerful hardware needed
+-   🎯 **Multiple Backends**: Automatic backend selection (CPU, WebGL, WebGPU) for optimal performance
+-   🔧 **Flexible Tokenization**: Support for both character-level and BPE tokenizers
+-   📊 **XAI Support**: Attention score visualization, gradient analysis, and embedding extraction
+-   💾 **Model Persistence**: Save and load models in SafeTensors format
+-   ⚡ **Performance Optimizations**: Custom WebGPU kernels, gradient checkpointing, and mixed precision training
+-   🎨 **Real-time Training**: Live training metrics and generation during training
+## Installation
+```bash
 npm install @genai-fi/nanogpt
 ```
-# Usage
+## Quick Start
+### Creating and Training a Model
+```javascript
+import { TeachableLLM, selectBackend } from '@genai-fi/nanogpt';
+// Select the best available backend
+await selectBackend('webgpu'); // or 'webgl', 'cpu'
+// Create a new model
+const model = TeachableLLM.create('char', {
+    vocabSize: 200,
+    blockSize: 128, // Context window size
+    nLayer: 4, // Number of transformer layers
+    nHead: 4, // Number of attention heads
+    nEmbed: 192, // Embedding dimension
+    dropout: 0.1,
+    useRope: true, // Use Rotary Position Embeddings
+});
+// Training data
+const trainingText = [
+    'The quick brown fox jumps over the lazy dog.',
+    'A journey of a thousand miles begins with a single step.',
+    // ... more text
+];
+// Train the model
+await model.train(trainingText, {
+    batchSize: 16,
+    learningRate: 3e-4,
+    maxSteps: 1000,
+    logInterval: 10,
+    validationSplit: 0.1,
+});
+// Generate text
+const output = await model.generateText('Once upon a time', {
+    maxLength: 100,
+    temperature: 0.8,
+    topP: 0.9,
+});
+console.log(output);
+```
+### Loading a Pre-trained Model
+```javascript
+import { TeachableLLM, waitForModel } from '@genai-fi/nanogpt';
+// Load from Hugging Face
+const model = TeachableLLM.loadModel('username/model-name');
+// Or load from a file
+const fileInput = document.getElementById('fileInput');
+fileInput.addEventListener('change', async (event) => {
+    const file = event.target.files[0];
+    const model = TeachableLLM.loadModel(file);
+    await waitForModel(model);
+    const text = await model.generateText('Hello');
+    console.log(text);
+});
 ```
-import { TeachableLLM, CharTokeniser } from '@genai-fi/nanogpt';
-import * as tf from '@tensorflow/tfjs';
-const tokeniser = new CharTokeniser();
-const model = TeachableLLM.create(tf, tokeniser, {
+## Event Handlers and Real-time Updates
+### Monitoring Training Progress
+Track training metrics in real-time with event handlers:
+```javascript
+const model = TeachableLLM.create('char', config);
+// Listen for training step updates
+model.on('trainStep', (step, progress) => {
+    console.log(`Step ${step.step}/${progress.totalSteps}`);
+    console.log(`Loss: ${step.loss.toFixed(4)}`);
+    console.log(`Validation Loss: ${step.valLoss?.toFixed(4) || 'N/A'}`);
+    console.log(`Progress: ${(progress.progress * 100).toFixed(1)}%`);
+    console.log(`Time Remaining: ${progress.timeRemaining}s`);
+    // Update UI progress bar
+    updateProgressBar(progress.progress);
+    updateLossChart(step.loss, step.valLoss);
+});
+await model.train(trainingText, options);
+```
+### Real-time Token Generation
+Stream generated tokens as they're produced:
+```javascript
+const generator = model.generator();
+// Listen for generated tokens
+generator.on('tokens', (tokens) => {
+    // tokens is an array of new token IDs
+    const text = model.tokeniser.decode(tokens);
+    console.log('New tokens:', text);
+    // Update UI incrementally
+    appendToOutput(text);
+});
+// Generation lifecycle events
+generator.on('start', () => {
+    console.log('Generation started');
+    showSpinner();
+});
+generator.on('stop', () => {
+    console.log('Generation complete');
+    hideSpinner();
+});
+generator.on('error', (error) => {
+    console.error('Generation error:', error);
+});
+// Start generation
+await generator.generate('Once upon a time', {
+    maxLength: 200,
+    temperature: 0.8,
+});
+```
+## Training on Small Devices
+GenAI NanoGPT is designed to work efficiently on resource-constrained devices like iPads, phones, and Chromebooks:
+### Recommended Settings for Small Devices
+```javascript
+// Smaller model configuration for mobile devices
+const mobileModel = TeachableLLM.create('char', {
     vocabSize: 200,
-    blockSize: 128,
-    nLayer: 4,
-    nHead: 3,
-    nEmbed: 192,
-    dropout: 0.0,
+    blockSize: 128, // Smaller context window
+    nLayer: 4, // Fewer layers
+    nHead: 3, // Fewer attention heads
+    nEmbed: 192, // Smaller embeddings
+});
+// Training options optimized for limited memory
+await mobileModel.train(trainingText, {
+    batchSize: 8, // Smaller batch size
+    learningRate: 3e-4,
+    maxSteps: 500,
+    validationSplit: 0.1,
+    logInterval: 50,
+    gradientCheckpointing: true,
+    mixedPrecision: true,
+});
+```
+### Tips for Training on Mobile Devices
+1. **Start Small**: Use smaller models (4 layers) and shorter context windows (128 tokens)
+2. **Reduce Batch Size**: Use batch sizes of 8-16 depending on available memory
+3. **Use Character Tokenization**: Character-level tokenizers use less memory than BPE
+4. **Optimize Training Data**: Use smaller datasets or train in stages
+## Advanced Usage
+### Attention Visualization
+```javascript
+const generator = model.generator();
+const text = await generator.generate('Prompt', {
+    attentionScores: true,
+    maxLength: 50,
 });
+// Get attention data for visualization
+const attentionData = generator.getAttentionData();
+// Shape: [num_tokens][num_layers][num_heads][seq_len][seq_len]
+const probabilities = generator.getProbabilitiesData();
+// Shape: [num_tokens][seq_len][vocab_size]
+```
+### Streaming Generation
+```javascript
+const generator = model.generator();
+generator.on('tokens', (tokens) => {
+    // Update UI with new tokens in real-time
+    updateDisplay(tokens);
+});
+generator.on('start', () => console.log('Generation started'));
+generator.on('stop', () => console.log('Generation complete'));
+await generator.generate('Once upon a time', {
+    maxLength: 200,
+});
+```
+### Memory Management
+```javascript
+// Enable profiling
+model.enableProfiler = true;
+// After training/generation
+const profiler = model.getProfiler();
+if (profiler) {
+    console.log('Memory stats:', profiler.getStats());
+}
+// Clean up
+model.dispose();
+```
+## Examples
+See the [`browser-tests`](browser-tests/) directory for complete examples:
+-   [`generate.html`](browser-tests/generate.html): Text generation with UI
+-   [`rope-train.html`](browser-tests/rope-train.html): Training a model with RoPE
+-   [`hf.html`](browser-tests/hf.html): Loading from Hugging Face
+-   [`loader.html`](browser-tests/loader.html): Loading different file formats
+-   [`perf.html`](browser-tests/perf.html): Performance testing
+## Development
+### Setup
+```bash
+git clone https://github.com/knicos/genai-nanogpt.git
+cd genai-nanogpt
+npm install
+```
+### Building
+```bash
+npm run build       # Build for production
+npm run dev         # Development mode with watch
+```
+### Testing
+```bash
+npm test            # Run all tests
+```
+### Browser Tests
+```bash
+npm run test:gl       # Start dev server
+```
+### Project Structure
+```
+lib/
+├── models/          # Model architectures (NanoGPT)
+├── layers/          # Transformer layers (attention, MLP, etc.)
+├── ops/             # Custom TensorFlow.js operations
+│   ├── cpu/         # CPU kernels
+│   ├── webgl/       # WebGL kernels
+│   └── webgpu/      # WebGPU kernels
+├── training/        # Training utilities and optimizers
+├── tokeniser/       # Tokenization implementations
+├── loader/          # Model loading/saving
+├── utilities/       # Helper functions
+└── TeachableLLM.ts  # Main API
+```
+### Custom Operations
+This library implements several custom TensorFlow.js operations optimized for transformer models:
+-   **RoPE**: Rotary Position Embeddings
+-   **Attention Mask**: Causal attention masking
+-   **RMS Norm**: Root Mean Square normalization
+-   **Adam Optimizer**: Extended Adam with weight decay
+-   **16-bit Operators**: To enable mixed-precision training
+See [`lib/ops`](lib/ops/) for implementations.
+### Contributing
+1. Fork the repository
+2. Create a feature branch: `git checkout -b feature/amazing-feature`
+3. Commit your changes: `git commit -m 'Add amazing feature'`
+4. Push to the branch: `git push origin feature/amazing-feature`
+5. Open a Pull Request
+### Code Style
+This project uses ESLint and Prettier for code formatting:
+```bash
+npm run lint        # Check code style
+```
+## Performance Tips
+1. **Use WebGPU**: Provides the best performance for training and inference
+2. **Batch Size**: Larger batches improve GPU utilization but require more memory
+3. **Mixed Precision**: Enable for faster training on supported hardware
+4. **Gradient Checkpointing**: Reduces memory usage during training, at the cost of slower training steps
+5. **Use RoPE**: More efficient than absolute position embeddings
+6. **Start Small on Mobile**: Use 2-4 layers and batch size 2-8 on phones/tablets
+## Acknowledgments
+-   Inspired by [Andrej Karpathy's NanoGPT](https://github.com/karpathy/nanoGPT)
+-   Built with [TensorFlow.js](https://www.tensorflow.org/js)
+-   Developed as part of the Finnish [Generation AI research project](https://generation-ai-stn.fi)
+## Citation
+If you use this library in your research, please cite:
+```bibtex
+@inproceedings{10.1145/3769994.3770061,
+author = {Pope, Nicolas and Tedre, Matti},
+title = {A Teachable Machine for Transformers},
+year = {2025},
+publisher = {Association for Computing Machinery},
+doi = {10.1145/3769994.3770061},
+booktitle = {Proceedings of the 25th Koli Calling International Conference on Computing Education Research},
+}
 ```

package/dist/Generator.js CHANGED Viewed

@@ -1,82 +1,73 @@
-import { E as C } from "./index-Dwqa6Zy2.js";
-import { E as _, F as I, G as O, a6 as R, t as q, k as K } from "./index-BzFyqcy-.js";
+import { E as C } from "./index-DvYrXKkX.js";
+import { A as _, B as I, E as O, t as R, k as q } from "./index-ZyQhjEPo.js";
+import "./utilities/packed.js";
 import "./ops/cpu/attentionMask.js";
 import "./ops/webgl/attentionMask.js";
 import "./ops/grads/attentionMask.js";
-import "./ops/cpu/qkv.js";
-import "./ops/webgl/qkv.js";
-import "./ops/grads/qkv.js";
-import { p as j } from "./random_width-CXVRloNK.js";
-import { t as G } from "./register_all_kernels-DIGpEwcf.js";
-import "./index-Tf7vU29b.js";
-import "./dataset-DlZtKmBq.js";
+import { p as K } from "./random_width-DY6Kk2Dl.js";
+import { t as j } from "./register_all_kernels-Bwu1PTuU.js";
+import "./index-Cp39cXWe.js";
+import "./dataset-0xP8GjwI.js";
 import "./ops/cpu/rope.js";
 import "./ops/webgl/rope.js";
-import "./ops/grads/rope.js";
+import "./rope-B5UUMsPi.js";
 import "./ops/cpu/appendCache.js";
 import "./ops/webgl/appendCache.js";
-import "./ops/cpu/fusedSoftmax.js";
-import "./ops/webgl/fusedSoftmax.js";
-import "./ops/grads/fusedSoftmax.js";
-import "./ops/cpu/matMulGelu.js";
-import "./ops/webgl/matMulGelu.js";
-import "./ops/grads/matMulGelu.js";
+import "./ops/grads/softmax16.js";
+import "./matMul16--R5hOwDG.js";
+import "./ops/webgl/matMul16.js";
+import "./ops/cpu/matMul16.js";
+import "./pack16-CFUqumar.js";
+import "./ops/transpose16.js";
+import "./ops/reshape16.js";
+import "./ops/cpu/qkv.js";
+import "./ops/webgl/qkv.js";
+import "./ops/grads/qkv.js";
 import "./ops/cpu/normRMS.js";
 import "./ops/webgl/normRMS.js";
 import "./ops/grads/normRMS.js";
+import "./ops/grads/add16.js";
 import { sparseSoftmaxCrossEntropy as V } from "./training/sparseCrossEntropy.js";
-import "./jszip.min-CjP2V1VV.js";
+import "./jszip.min-Bz5-11Bk.js";
 import $ from "./tokeniser/CharTokeniser.js";
 import "./ops/cpu/adamAdjust.js";
 import "./ops/webgl/adamAdjust.js";
 import "./ops/cpu/adamMoments.js";
 import "./ops/webgl/adamMoments.js";
-import "./papaparse.min-C8l2Kvo1.js";
-import M from "./utilities/topP.js";
+import "./papaparse.min-C0cScC2i.js";
+import G from "./utilities/topP.js";
 import "./ops/cpu/scatterSub.js";
 import "./ops/webgl/scatterSub.js";
 import "./ops/cpu/gatherSub.js";
 import "./ops/webgl/gatherSub.js";
+import "./ops/cpu/matMulGelu.js";
+import "./ops/webgl/matMulGelu.js";
+import "./ops/grads/matMulGelu.js";
 import "./ops/cpu/gelu.js";
 import "./ops/webgl/gelu.js";
-import "./gelu-Bp_-935b.js";
+import "./gelu-CNLFZWea.js";
 import "./ops/webgl/log.js";
 import "./checks/normRMS.js";
 import "./checks/normRMSGrad.js";
-import N from "./utilities/multinomialCPU.js";
-import { r as E } from "./reshape-CnIwVG1c.js";
-import { t as P } from "./tensor2d-D76QGjF3.js";
-import { s as S } from "./softmax-D7Jj3p_P.js";
-import { g as F } from "./gather-Dnpgw-YQ.js";
-import { c as H } from "./concat-B912vBbo.js";
-/**
- * @license
- * Copyright 2020 Google LLC. All Rights Reserved.
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- * =============================================================================
- */
-function U(p, t, s, e = !1) {
-  const o = I(p, "logits", "multinomial"), i = o.size, c = o.rank;
+import M from "./utilities/multinomialCPU.js";
+import { i as N } from "./tensor_util-DV-FP5Q3.js";
+import { r as E } from "./reshape-DevtBWtf.js";
+import { t as P } from "./tensor2d-G4Ys2GxX.js";
+import { s as S } from "./softmax-ZHVebtR1.js";
+import { g as B } from "./gather-DykLGqmW.js";
+import { c as H } from "./concat-BHlIJeyT.js";
+function U(l, t, s, e = !1) {
+  const o = I(l, "logits", "multinomial"), i = o.size, c = o.rank;
   if (i < 2)
     throw new Error(`Error in multinomial: you need at least 2 outcomes, but got ${i}.`);
   if (c > 2)
     throw new Error(`Rank of probabilities must be 1 or 2, but is ${c}`);
   s = s || Math.random();
-  const n = { logits: c === 1 ? E(o, [1, -1]) : o }, l = { numSamples: t, seed: s, normalized: e }, d = O.runKernel(R, n, l);
+  const n = { logits: c === 1 ? E(o, [1, -1]) : o }, p = { numSamples: t, seed: s, normalized: e }, d = O.runKernel(N, n, p);
   return c === 1 ? E(d, [d.size]) : d;
 }
 const z = /* @__PURE__ */ _({ multinomial_: U }), W = [
-  ...Array.from({ length: 95 }, (p, t) => String.fromCharCode(t + 32)),
+  ...Array.from({ length: 95 }, (l, t) => String.fromCharCode(t + 32)),
   // ASCII
   // Spanish accented letters and punctuation
   ..."áéíóúüñ¿¡",
@@ -87,10 +78,10 @@ const z = /* @__PURE__ */ _({ multinomial_: U }), W = [
   // Cyrillic letters
   ..."абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ"
 ];
-function B(p, t) {
-  return p.length === t ? p : p.length > t ? p.slice(0, t) : p.concat(Array(t - p.length).fill(""));
+function F(l, t) {
+  return l.length === t ? l : l.length > t ? l.slice(0, t) : l.concat(Array(t - l.length).fill(""));
 }
-class Wt extends C {
+class te extends C {
   constructor(t, s) {
     super(), this.model = t, this.tokeniser = s, this.actualTokeniser = s;
   }
@@ -116,7 +107,7 @@ class Wt extends C {
     const c = await t.decode([i]);
     if (e) {
       const T = await Promise.all(
-        e.map((n) => n.array().then((l) => l))
+        e.map((n) => n.array().then((p) => p))
       );
       e.forEach((n) => n.dispose()), this.attentionData.push(T);
     }
@@ -131,14 +122,14 @@ class Wt extends C {
       } : void 0,
       cache: s,
       outputEmbeddings: !!e?.embeddings
-    }, [l, d] = q(() => {
-      const a = t, m = a.shape[1], h = m <= this.model.config.blockSize ? a : a.slice(
+    }, [p, d] = R(() => {
+      const r = t, m = r.shape[1], h = m <= this.model.config.blockSize ? r : r.slice(
         [0, m - this.model.config.blockSize],
-        [a.shape[0], this.model.config.blockSize]
-      ), r = T ? this.model.config.blockSize - h.shape[1] : 0, v = r > 0 ? j(h, [
+        [r.shape[0], this.model.config.blockSize]
+      ), a = T ? this.model.config.blockSize - h.shape[1] : 0, v = a > 0 ? K(h, [
         [0, 0],
-        [0, r]
-      ]) : h, [g] = this.model.forward(n, v), u = g.shape[1] - 1 - r, f = g.slice([0, u, 0], [g.shape[0], 1, g.shape[2]]);
+        [0, a]
+      ]) : h, [g] = this.model.forward(n, v), u = g.shape[1] - 1 - a, f = g.slice([0, u, 0], [g.shape[0], 1, g.shape[2]]);
       let y;
       if (e?.targets) {
         const k = e.targets.shift();
@@ -148,46 +139,46 @@ class Wt extends C {
         }
       }
       return n.attentionScores?.attentionOut && n.attentionScores.attentionOut.forEach((k, w) => {
-        k.shape[1] !== 1 && (n.attentionScores.attentionOut[w] = K(
+        k.shape[1] !== 1 && (n.attentionScores.attentionOut[w] = q(
           k.slice([0, u, 0], [k.shape[0], 1, k.shape[2]])
         ), k.dispose());
       }), g.dispose(), [f.div(o).squeeze([1]), y];
     });
     let b, x;
     if (c) {
-      const a = S(l), m = await a.array();
-      a.dispose();
-      const h = M(m, c);
-      e?.includeProbabilities && (x = m), b = N(h);
+      const r = S(p), m = await r.array();
+      r.dispose();
+      const h = G(m, c);
+      e?.includeProbabilities && (x = m), b = M(h);
     } else if (i) {
-      const { values: a, indices: m } = G(l, i), h = z(a, 1);
-      b = F(m, h, 1), a.dispose(), m.dispose(), h.dispose();
-    } else if (b = z(l, 1), e?.includeProbabilities) {
-      const a = S(l);
-      x = await a.array(), a.dispose();
+      const { values: r, indices: m } = j(p, i), h = z(r, 1);
+      b = B(m, h, 1), r.dispose(), m.dispose(), h.dispose();
+    } else if (b = z(p, 1), e?.includeProbabilities) {
+      const r = S(p);
+      x = await r.array(), r.dispose();
     }
     if (n.embeddings) {
-      const m = (e?.embeddings === "all" ? n.embeddings : n.embeddings.filter((r) => r.name.startsWith("block_output_"))).map(async (r) => {
-        const v = r.tensor.shape[1], g = r.tensor.slice([0, v - 1, 0], [r.tensor.shape[0], 1, r.tensor.shape[2]]);
-        r.tensor.dispose();
+      const m = (e?.embeddings === "all" ? n.embeddings : n.embeddings.filter((a) => a.name.startsWith("block_output_"))).map(async (a) => {
+        const v = a.tensor.shape[1], g = a.tensor.slice([0, v - 1, 0], [a.tensor.shape[0], 1, a.tensor.shape[2]]);
+        a.tensor.dispose();
         const u = g.squeeze([1]);
         if (g.dispose(), e?.embeddings === "softmax") {
           const f = this.model.project(u);
           u.dispose();
           const y = S(f, -1);
-          return f.dispose(), { name: r.name, tensor: await y.array() };
+          return f.dispose(), { name: a.name, tensor: await y.array() };
         } else if (e?.embeddings === "logits") {
           const f = this.model.project(u);
-          return u.dispose(), { name: r.name, tensor: await f.array() };
+          return u.dispose(), { name: a.name, tensor: await f.array() };
         } else {
           const f = await u.array();
-          return u.dispose(), { name: r.name, tensor: f };
+          return u.dispose(), { name: a.name, tensor: f };
         }
       }), h = await Promise.all(m);
       this.embeddingsData.push(h);
     }
     const A = b.reshape([1, 1]);
-    b.dispose(), b = A, l.dispose();
+    b.dispose(), b = A, p.dispose();
     let L;
     return d && (L = await d.array(), d.dispose()), { output: b, probabilities: x, attention: n.attentionScores?.attentionOut, loss: L };
   }
@@ -211,10 +202,10 @@ class Wt extends C {
         const d = s;
         s = H([s, i], 1), d.dispose();
       }
-      const l = await this.processResponse(this.actualTokeniser, i, T, c);
-      if (this.cache || i.dispose(), l === null)
+      const p = await this.processResponse(this.actualTokeniser, i, T, c);
+      if (this.cache || i.dispose(), p === null)
         break;
-      this.outputText += l;
+      this.outputText += p;
     }
     return s.dispose(), this.outputText;
   }
@@ -233,7 +224,7 @@ class Wt extends C {
         o[i] = { k: void 0, v: void 0, length: 0, cumulativeLength: 0 };
       this.cache = o, this.lastToken = -1;
     }
-    const e = this.tokeniser.trained ? this.tokeniser : new $(B(W, this.tokeniser.vocabSize));
+    const e = this.tokeniser.trained ? this.tokeniser : new $(F(W, this.tokeniser.vocabSize));
     this.actualTokeniser = e;
   }
   async step(t, s) {
@@ -268,5 +259,5 @@ class Wt extends C {
   }
 }
 export {
-  Wt as default
+  te as default
 };