neutro 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- neutro-0.1.0/LICENSE +21 -0
- neutro-0.1.0/PKG-INFO +126 -0
- neutro-0.1.0/README.md +109 -0
- neutro-0.1.0/neutro/__init__.py +8 -0
- neutro-0.1.0/neutro/activations/__init__.py +15 -0
- neutro-0.1.0/neutro/activations/base.py +5 -0
- neutro-0.1.0/neutro/activations/relu.py +9 -0
- neutro-0.1.0/neutro/activations/sigmoid.py +10 -0
- neutro-0.1.0/neutro/activations/silu.py +20 -0
- neutro-0.1.0/neutro/activations/softmax.py +22 -0
- neutro-0.1.0/neutro/activations/tanh.py +8 -0
- neutro-0.1.0/neutro/callbacks/__init__.py +5 -0
- neutro-0.1.0/neutro/callbacks/base.py +13 -0
- neutro-0.1.0/neutro/callbacks/checkpoint.py +25 -0
- neutro-0.1.0/neutro/callbacks/early_stopping.py +27 -0
- neutro-0.1.0/neutro/callbacks/history.py +10 -0
- neutro-0.1.0/neutro/callbacks/lr_scheduler.py +79 -0
- neutro-0.1.0/neutro/data.py +44 -0
- neutro-0.1.0/neutro/initializers/__init__.py +13 -0
- neutro-0.1.0/neutro/initializers/base.py +3 -0
- neutro-0.1.0/neutro/initializers/constant.py +10 -0
- neutro-0.1.0/neutro/initializers/glorot.py +19 -0
- neutro-0.1.0/neutro/initializers/he.py +19 -0
- neutro-0.1.0/neutro/initializers/random.py +9 -0
- neutro-0.1.0/neutro/layers/__init__.py +21 -0
- neutro-0.1.0/neutro/layers/attention/base_attention.py +21 -0
- neutro-0.1.0/neutro/layers/attention/flash_attention.py +209 -0
- neutro-0.1.0/neutro/layers/attention/gqa.py +33 -0
- neutro-0.1.0/neutro/layers/attention/kv_cache.py +32 -0
- neutro-0.1.0/neutro/layers/attention/mha.py +68 -0
- neutro-0.1.0/neutro/layers/attention/mla.py +121 -0
- neutro-0.1.0/neutro/layers/attention/mqa.py +32 -0
- neutro-0.1.0/neutro/layers/base.py +77 -0
- neutro-0.1.0/neutro/layers/convolutional/conv1d.py +105 -0
- neutro-0.1.0/neutro/layers/convolutional/conv2d.py +125 -0
- neutro-0.1.0/neutro/layers/core/__init__.py +7 -0
- neutro-0.1.0/neutro/layers/core/activation.py +32 -0
- neutro-0.1.0/neutro/layers/core/dense.py +49 -0
- neutro-0.1.0/neutro/layers/core/dropout.py +19 -0
- neutro-0.1.0/neutro/layers/core/flatten.py +20 -0
- neutro-0.1.0/neutro/layers/core/merging.py +68 -0
- neutro-0.1.0/neutro/layers/core/moe.py +119 -0
- neutro-0.1.0/neutro/layers/core/reparameterization.py +33 -0
- neutro-0.1.0/neutro/layers/embedding/__init__.py +2 -0
- neutro-0.1.0/neutro/layers/embedding/embedding.py +24 -0
- neutro-0.1.0/neutro/layers/embedding/time_embedding.py +39 -0
- neutro-0.1.0/neutro/layers/normalization/__init__.py +4 -0
- neutro-0.1.0/neutro/layers/normalization/batchnorm.py +59 -0
- neutro-0.1.0/neutro/layers/normalization/groupnorm.py +70 -0
- neutro-0.1.0/neutro/layers/normalization/layernorm.py +28 -0
- neutro-0.1.0/neutro/layers/normalization/rmsnorm.py +42 -0
- neutro-0.1.0/neutro/layers/pooling/__init__.py +3 -0
- neutro-0.1.0/neutro/layers/pooling/global_pooling.py +68 -0
- neutro-0.1.0/neutro/layers/pooling/maxpooling2d.py +80 -0
- neutro-0.1.0/neutro/layers/pooling/upsampling2d.py +33 -0
- neutro-0.1.0/neutro/layers/recurrent/__init__.py +3 -0
- neutro-0.1.0/neutro/layers/recurrent/gru.py +136 -0
- neutro-0.1.0/neutro/layers/recurrent/lstm.py +61 -0
- neutro-0.1.0/neutro/layers/recurrent/simple_rnn.py +56 -0
- neutro-0.1.0/neutro/layers/transformer/transformer_block.py +104 -0
- neutro-0.1.0/neutro/losses/__init__.py +14 -0
- neutro-0.1.0/neutro/losses/base.py +5 -0
- neutro-0.1.0/neutro/losses/categorical_crossentropy.py +12 -0
- neutro-0.1.0/neutro/losses/mse.py +8 -0
- neutro-0.1.0/neutro/losses/sparse_categorical_crossentropy.py +45 -0
- neutro-0.1.0/neutro/losses/vae_loss.py +50 -0
- neutro-0.1.0/neutro/metrics/__init__.py +15 -0
- neutro-0.1.0/neutro/metrics/accuracy.py +10 -0
- neutro-0.1.0/neutro/metrics/base.py +5 -0
- neutro-0.1.0/neutro/metrics/f1_score.py +12 -0
- neutro-0.1.0/neutro/metrics/precision.py +12 -0
- neutro-0.1.0/neutro/metrics/recall.py +12 -0
- neutro-0.1.0/neutro/metrics/sparse_accuracy.py +12 -0
- neutro-0.1.0/neutro/models/__init__.py +7 -0
- neutro-0.1.0/neutro/models/base_model.py +300 -0
- neutro-0.1.0/neutro/models/language/__init__.py +4 -0
- neutro-0.1.0/neutro/models/language/deepseek.py +122 -0
- neutro-0.1.0/neutro/models/language/gpt.py +39 -0
- neutro-0.1.0/neutro/models/language/llama.py +117 -0
- neutro-0.1.0/neutro/models/language/qwen.py +57 -0
- neutro-0.1.0/neutro/models/moe/__init__.py +0 -0
- neutro-0.1.0/neutro/models/vision/__init__.py +2 -0
- neutro-0.1.0/neutro/models/vision/alexnet.py +45 -0
- neutro-0.1.0/neutro/models/vision/diffusion_model.py +48 -0
- neutro-0.1.0/neutro/models/vision/unet.py +95 -0
- neutro-0.1.0/neutro/models/vision/vae.py +72 -0
- neutro-0.1.0/neutro/models/vision/vgg.py +82 -0
- neutro-0.1.0/neutro/optimizers/__init__.py +4 -0
- neutro-0.1.0/neutro/optimizers/adam.py +33 -0
- neutro-0.1.0/neutro/optimizers/adamw.py +37 -0
- neutro-0.1.0/neutro/optimizers/base.py +14 -0
- neutro-0.1.0/neutro/optimizers/schedules.py +37 -0
- neutro-0.1.0/neutro/optimizers/sgd.py +32 -0
- neutro-0.1.0/neutro/preprocessing/__init__.py +3 -0
- neutro-0.1.0/neutro/preprocessing/image.py +91 -0
- neutro-0.1.0/neutro/preprocessing/sequence.py +46 -0
- neutro-0.1.0/neutro/preprocessing/text.py +105 -0
- neutro-0.1.0/neutro/tokenizers/__init__.py +4 -0
- neutro-0.1.0/neutro/tokenizers/bpe.py +202 -0
- neutro-0.1.0/neutro/tokenizers/tiktoken_compat.py +62 -0
- neutro-0.1.0/neutro/utils/__init__.py +0 -0
- neutro-0.1.0/neutro/utils/conv_utils.py +76 -0
- neutro-0.1.0/neutro/utils/data_utils.py +71 -0
- neutro-0.1.0/neutro/utils/diffusion_utils.py +47 -0
- neutro-0.1.0/neutro/utils/rope_utils.py +33 -0
- neutro-0.1.0/neutro/utils/visualization.py +43 -0
- neutro-0.1.0/neutro.egg-info/PKG-INFO +126 -0
- neutro-0.1.0/neutro.egg-info/SOURCES.txt +113 -0
- neutro-0.1.0/neutro.egg-info/dependency_links.txt +1 -0
- neutro-0.1.0/neutro.egg-info/requires.txt +9 -0
- neutro-0.1.0/neutro.egg-info/top_level.txt +1 -0
- neutro-0.1.0/pyproject.toml +23 -0
- neutro-0.1.0/setup.cfg +4 -0
- neutro-0.1.0/tests/test_data.py +35 -0
- neutro-0.1.0/tests/test_preprocessing.py +60 -0
neutro-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
neutro-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: neutro
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A Keras-style deep learning library using NumPy and SciPy
|
|
5
|
+
Requires-Python: >=3.8
|
|
6
|
+
Description-Content-Type: text/markdown
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Requires-Dist: numpy
|
|
9
|
+
Requires-Dist: scipy
|
|
10
|
+
Requires-Dist: joblib
|
|
11
|
+
Requires-Dist: tqdm
|
|
12
|
+
Requires-Dist: regex
|
|
13
|
+
Provides-Extra: test
|
|
14
|
+
Requires-Dist: pytest; extra == "test"
|
|
15
|
+
Requires-Dist: pytest-cov; extra == "test"
|
|
16
|
+
Dynamic: license-file
|
|
17
|
+
|
|
18
|
+
# 🧠 Neutro: The "Old School" Deep Learning Playground
|
|
19
|
+
|
|
20
|
+
[Coverage report on Codecov](https://codecov.io/gh/sourcepirate/neutro)
|
|
21
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
22
|
+
|
|
23
|
+
**Neutro** is an intentionally naive, NumPy-only implementation of modern deep learning architectures. It’s the Keras experience you love, powered by the NumPy you tolerate, built specifically for people who want to peek under the hood and actually *understand* how the gears turn.
|
|
24
|
+
|
|
25
|
+
---
|
|
26
|
+
|
|
27
|
+
## 👴 The Philosophy: Why Does This Exist?
|
|
28
|
+
|
|
29
|
+
Let's be honest: modern DL frameworks are black boxes. You pip install 4GB of binaries and suddenly you're "doing AI."
|
|
30
|
+
|
|
31
|
+
**Neutro is for the curious, the learners, and the "old-school" folks like me** who believe that if you can't build it in a matrix, you don't really know it.
|
|
32
|
+
|
|
33
|
+
- **Learn, Don't just Run**: Every line of code is designed to be readable. We don't hide behind C++ kernels or CUDA kernels. If you want to know how FlashAttention *actually* tiles memory, you can just read the Python file.
|
|
34
|
+
- **A Toy, not a Tool**: This isn't meant for production. It's a playground for learning advanced algorithms (MHA, GQA, FlashAttention, LSTM) in their purest form.
|
|
35
|
+
- **For the Wisdom-Rich**: If you remember when 64MB of RAM was a flex and "vectorization" meant loop unrolling, this is for you. It's a fun way to play with cutting-edge 2024 algorithms using 1990s-era clarity.
|
|
36
|
+
|
|
37
|
+
---
|
|
38
|
+
|
|
39
|
+
## 🚀 What's Inside?
|
|
40
|
+
|
|
41
|
+
- **"I can't believe it's not Keras!"**: Your muscle memory is safe here. `.compile()`, `.fit()`, `.predict()`—it’s all exactly where you left it.
|
|
42
|
+
- **Pure NumPy Math**: We did the math so you don't have to. Every gradient, from Softmax to LSTM gates, is hand-derived and vectorized.
|
|
43
|
+
- **Speed (for a CPU)**: We use `im2col` for convolutions and **FlashAttention** (yes, really) to keep your CPU fans humming in a way that sounds productive.
|
|
44
|
+
- **Zero Heavy Dependencies**: Tired of downloading 4GB of CUDA binaries just to train on MNIST? All the math runs on `numpy` and `scipy`; the only other requirements are the small utility packages `joblib`, `tqdm`, and `regex`. That’s it.
|
|
45
|
+
|
|
46
|
+
---
|
|
47
|
+
|
|
48
|
+
## 🛠 Features That'll Make You Say "Wait, You Implemented That?"
|
|
49
|
+
|
|
50
|
+
| Category | The "Fancy" Stuff | Why You Should Care |
|
|
51
|
+
| :--- | :--- | :--- |
|
|
52
|
+
| **Attention** | `FlashAttention`, `MQA`, `GQA`, `RoPE` | We have more attention variants than a distracted toddler. |
|
|
53
|
+
| **Tokenization** | `BPETokenizer`, `RegexTokenizer` | Byte-level BPE with regex splitting, just like the big kids. |
|
|
54
|
+
| **Vision** | `AlexNet`, `VGG16`, `VGG19`, `im2col` | Classical and modern vision architectures, vectorized. |
|
|
55
|
+
| **LLMs** | `Llama`, `Qwen`, `DeepSeek` (MoE) | Yes, you can run a (very tiny) MoE model on your CPU. |
|
|
56
|
+
| **Modern Ops** | `RMSNorm`, `SiLU`, `SwiGLU` | The secret sauce of modern LLMs, hand-implemented. |
|
|
57
|
+
| **Optimizers** | `AdamW`, `Adam`, `SGD+Momentum` | Keep your weights from exploding like a bad science fair project. |
|
|
58
|
+
|
|
59
|
+
---
|
|
60
|
+
|
|
61
|
+
## 🏆 The Hall of Fame: Pre-built Architectures
|
|
62
|
+
|
|
63
|
+
Why build from scratch when we've already done the heavy lifting?
|
|
64
|
+
|
|
65
|
+
- **The Visionaries**: `AlexNet`, `VGG16`, `VGG19`
|
|
66
|
+
- **The Linguists**: `GPT-2`, `LlamaTiny`, `QwenTiny`, `DeepSeekTiny` (Mixture of Experts)
|
|
67
|
+
|
|
68
|
+
---
|
|
69
|
+
|
|
70
|
+
## 💻 Show Me The Code!
|
|
71
|
+
|
|
72
|
+
If you know Keras, you already know Neutro. It's that simple.
|
|
73
|
+
|
|
74
|
+
```python
|
|
75
|
+
from neutro.models import Sequential
|
|
76
|
+
from neutro.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
|
|
77
|
+
|
|
78
|
+
# Build a CNN that actually fits in your head
|
|
79
|
+
model = Sequential([
|
|
80
|
+
Conv2D(32, kernel_size=3, activation='relu', input_shape=(28, 28, 1)),
|
|
81
|
+
MaxPooling2D(pool_size=2),
|
|
82
|
+
Flatten(),
|
|
83
|
+
Dense(128, activation='relu'),
|
|
84
|
+
Dropout(0.5),
|
|
85
|
+
Dense(10, activation='softmax')
|
|
86
|
+
])
|
|
87
|
+
|
|
88
|
+
# Compile it like it's 2015
|
|
89
|
+
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
|
|
90
|
+
|
|
91
|
+
# Fit it like a tailored suit
|
|
92
|
+
model.fit(train_flow, epochs=10)
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
---
|
|
96
|
+
|
|
97
|
+
## 📂 Deep Dives & Nerdy Stuff
|
|
98
|
+
|
|
99
|
+
We documented everything because we know you like to check the math:
|
|
100
|
+
|
|
101
|
+
- [**Attention Mechanisms**](./docs/layers/attention/) - How we made FlashAttention work on a CPU.
|
|
102
|
+
- [**Convolutional Magic**](./docs/layers/convolutional/) - The `im2col` deep dive.
|
|
103
|
+
- [**Activations & Gradients**](./docs/activations/) - Proofs for the brave.
|
|
104
|
+
- [**Optimizers**](./docs/optimizers/) - Why AdamW is better than your ex.
|
|
105
|
+
|
|
106
|
+
---
|
|
107
|
+
|
|
108
|
+
## 🧪 Examples to Flex Your CPU
|
|
109
|
+
|
|
110
|
+
Check out the `examples/` folder for end-to-end scripts:
|
|
111
|
+
- `mnist_cnn.py`: Standard digit classification with real-time augmentation.
|
|
112
|
+
- `wikitext_llm.py`: A character-level Transformer that actually talks back.
|
|
113
|
+
|
|
114
|
+
---
|
|
115
|
+
|
|
116
|
+
## 🏗 Installation
|
|
117
|
+
|
|
118
|
+
```bash
|
|
119
|
+
git clone https://github.com/sourcepirate/neutro.git
|
|
120
|
+
cd neutro
|
|
121
|
+
pip install -e .
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
---
|
|
125
|
+
|
|
126
|
+
**Disclaimer**: This is a hobby project for learning and exploration. It is intentionally naive, likely inefficient compared to compiled kernels, and 100% focused on the joy of understanding advanced algorithms. If you're looking to change the world with AGI, go to PyTorch. If you're looking to understand why your Transformer works while drinking a nice cup of tea, you're in the right place.
|
neutro-0.1.0/README.md
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
# 🧠 Neutro: The "Old School" Deep Learning Playground
|
|
2
|
+
|
|
3
|
+
[Coverage report on Codecov](https://codecov.io/gh/sourcepirate/neutro)
|
|
4
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
5
|
+
|
|
6
|
+
**Neutro** is an intentionally naive, NumPy-only implementation of modern deep learning architectures. It’s the Keras experience you love, powered by the NumPy you tolerate, built specifically for people who want to peek under the hood and actually *understand* how the gears turn.
|
|
7
|
+
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
## 👴 The Philosophy: Why Does This Exist?
|
|
11
|
+
|
|
12
|
+
Let's be honest: modern DL frameworks are black boxes. You pip install 4GB of binaries and suddenly you're "doing AI."
|
|
13
|
+
|
|
14
|
+
**Neutro is for the curious, the learners, and the "old-school" folks like me** who believe that if you can't build it in a matrix, you don't really know it.
|
|
15
|
+
|
|
16
|
+
- **Learn, Don't just Run**: Every line of code is designed to be readable. We don't hide behind C++ kernels or CUDA kernels. If you want to know how FlashAttention *actually* tiles memory, you can just read the Python file.
|
|
17
|
+
- **A Toy, not a Tool**: This isn't meant for production. It's a playground for learning advanced algorithms (MHA, GQA, FlashAttention, LSTM) in their purest form.
|
|
18
|
+
- **For the Wisdom-Rich**: If you remember when 64MB of RAM was a flex and "vectorization" meant loop unrolling, this is for you. It's a fun way to play with cutting-edge 2024 algorithms using 1990s-era clarity.
|
|
19
|
+
|
|
20
|
+
---
|
|
21
|
+
|
|
22
|
+
## 🚀 What's Inside?
|
|
23
|
+
|
|
24
|
+
- **"I can't believe it's not Keras!"**: Your muscle memory is safe here. `.compile()`, `.fit()`, `.predict()`—it’s all exactly where you left it.
|
|
25
|
+
- **Pure NumPy Math**: We did the math so you don't have to. Every gradient, from Softmax to LSTM gates, is hand-derived and vectorized.
|
|
26
|
+
- **Speed (for a CPU)**: We use `im2col` for convolutions and **FlashAttention** (yes, really) to keep your CPU fans humming in a way that sounds productive.
|
|
27
|
+
- **Zero Heavy Dependencies**: Tired of downloading 4GB of CUDA binaries just to train on MNIST? All the math runs on `numpy` and `scipy`; the only other requirements are the small utility packages `joblib`, `tqdm`, and `regex`. That’s it.
|
|
28
|
+
|
|
29
|
+
---
|
|
30
|
+
|
|
31
|
+
## 🛠 Features That'll Make You Say "Wait, You Implemented That?"
|
|
32
|
+
|
|
33
|
+
| Category | The "Fancy" Stuff | Why You Should Care |
|
|
34
|
+
| :--- | :--- | :--- |
|
|
35
|
+
| **Attention** | `FlashAttention`, `MQA`, `GQA`, `RoPE` | We have more attention variants than a distracted toddler. |
|
|
36
|
+
| **Tokenization** | `BPETokenizer`, `RegexTokenizer` | Byte-level BPE with regex splitting, just like the big kids. |
|
|
37
|
+
| **Vision** | `AlexNet`, `VGG16`, `VGG19`, `im2col` | Classical and modern vision architectures, vectorized. |
|
|
38
|
+
| **LLMs** | `Llama`, `Qwen`, `DeepSeek` (MoE) | Yes, you can run a (very tiny) MoE model on your CPU. |
|
|
39
|
+
| **Modern Ops** | `RMSNorm`, `SiLU`, `SwiGLU` | The secret sauce of modern LLMs, hand-implemented. |
|
|
40
|
+
| **Optimizers** | `AdamW`, `Adam`, `SGD+Momentum` | Keep your weights from exploding like a bad science fair project. |
|
|
41
|
+
|
|
42
|
+
---
|
|
43
|
+
|
|
44
|
+
## 🏆 The Hall of Fame: Pre-built Architectures
|
|
45
|
+
|
|
46
|
+
Why build from scratch when we've already done the heavy lifting?
|
|
47
|
+
|
|
48
|
+
- **The Visionaries**: `AlexNet`, `VGG16`, `VGG19`
|
|
49
|
+
- **The Linguists**: `GPT-2`, `LlamaTiny`, `QwenTiny`, `DeepSeekTiny` (Mixture of Experts)
|
|
50
|
+
|
|
51
|
+
---
|
|
52
|
+
|
|
53
|
+
## 💻 Show Me The Code!
|
|
54
|
+
|
|
55
|
+
If you know Keras, you already know Neutro. It's that simple.
|
|
56
|
+
|
|
57
|
+
```python
|
|
58
|
+
from neutro.models import Sequential
|
|
59
|
+
from neutro.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
|
|
60
|
+
|
|
61
|
+
# Build a CNN that actually fits in your head
|
|
62
|
+
model = Sequential([
|
|
63
|
+
Conv2D(32, kernel_size=3, activation='relu', input_shape=(28, 28, 1)),
|
|
64
|
+
MaxPooling2D(pool_size=2),
|
|
65
|
+
Flatten(),
|
|
66
|
+
Dense(128, activation='relu'),
|
|
67
|
+
Dropout(0.5),
|
|
68
|
+
Dense(10, activation='softmax')
|
|
69
|
+
])
|
|
70
|
+
|
|
71
|
+
# Compile it like it's 2015
|
|
72
|
+
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
|
|
73
|
+
|
|
74
|
+
# Fit it like a tailored suit
|
|
75
|
+
model.fit(train_flow, epochs=10)
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
---
|
|
79
|
+
|
|
80
|
+
## 📂 Deep Dives & Nerdy Stuff
|
|
81
|
+
|
|
82
|
+
We documented everything because we know you like to check the math:
|
|
83
|
+
|
|
84
|
+
- [**Attention Mechanisms**](./docs/layers/attention/) - How we made FlashAttention work on a CPU.
|
|
85
|
+
- [**Convolutional Magic**](./docs/layers/convolutional/) - The `im2col` deep dive.
|
|
86
|
+
- [**Activations & Gradients**](./docs/activations/) - Proofs for the brave.
|
|
87
|
+
- [**Optimizers**](./docs/optimizers/) - Why AdamW is better than your ex.
|
|
88
|
+
|
|
89
|
+
---
|
|
90
|
+
|
|
91
|
+
## 🧪 Examples to Flex Your CPU
|
|
92
|
+
|
|
93
|
+
Check out the `examples/` folder for end-to-end scripts:
|
|
94
|
+
- `mnist_cnn.py`: Standard digit classification with real-time augmentation.
|
|
95
|
+
- `wikitext_llm.py`: A character-level Transformer that actually talks back.
|
|
96
|
+
|
|
97
|
+
---
|
|
98
|
+
|
|
99
|
+
## 🏗 Installation
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
git clone https://github.com/sourcepirate/neutro.git
|
|
103
|
+
cd neutro
|
|
104
|
+
pip install -e .
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
---
|
|
108
|
+
|
|
109
|
+
**Disclaimer**: This is a hobby project for learning and exploration. It is intentionally naive, likely inefficient compared to compiled kernels, and 100% focused on the joy of understanding advanced algorithms. If you're looking to change the world with AGI, go to PyTorch. If you're looking to understand why your Transformer works while drinking a nice cup of tea, you're in the right place.
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from .base import Activation
|
|
2
|
+
from .relu import ReLU
|
|
3
|
+
from .sigmoid import Sigmoid
|
|
4
|
+
from .tanh import Tanh
|
|
5
|
+
from .softmax import Softmax
|
|
6
|
+
from .silu import SiLU
|
|
7
|
+
|
|
8
|
+
def get(identifier):
    """Resolve an activation identifier.

    The names ``'relu'``, ``'sigmoid'``, ``'tanh'``, ``'softmax'`` and
    ``'silu'`` each produce a newly constructed activation object. Any other
    value, including an existing ``Activation`` instance, is returned as is.
    """
    registry = (
        ('relu', ReLU),
        ('sigmoid', Sigmoid),
        ('tanh', Tanh),
        ('softmax', Softmax),
        ('silu', SiLU),
    )
    for name, factory in registry:
        if identifier == name:
            return factory()
    # Unrecognised values (instances, unknown names, None) pass through untouched.
    return identifier
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from .base import Activation
|
|
3
|
+
|
|
4
|
+
class Sigmoid(Activation):
    """Logistic sigmoid activation, ``1 / (1 + exp(-x))``."""

    def __call__(self, x):
        """Apply the sigmoid element-wise and remember the result in ``last_output``."""
        # Inputs are clipped to [-500, 500] before the exponential is taken.
        bounded = np.clip(x, -500, 500)
        activated = 1 / (1 + np.exp(-bounded))
        self.last_output = activated
        return activated

    def gradient(self, x):
        """Return ``s * (1 - s)`` where ``s`` is the sigmoid of ``x``.

        The forward pass is re-run, so ``last_output`` is refreshed as well.
        """
        sig = self(x)
        return sig * (1 - sig)
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from .base import Activation
|
|
3
|
+
|
|
4
|
+
class SiLU(Activation):
    """SiLU (Sigmoid Linear Unit), also known as Swish: ``x * sigmoid(x)``.

    Commonly used in Llama, Qwen, and DeepSeek.
    """

    @staticmethod
    def _sigmoid(x):
        """Return the logistic sigmoid of ``x`` without overflowing ``np.exp``."""
        # Same clipping range as the Sigmoid activation, so exp() stays finite.
        return 1 / (1 + np.exp(-np.clip(x, -500, 500)))

    def __call__(self, x):
        """Compute ``x * sigmoid(x)``; caches ``x`` and ``sigmoid(x)`` on the instance."""
        self.sigmoid_x = self._sigmoid(x)
        self.x = x
        return x * self.sigmoid_x

    def gradient(self, x):
        """Return the element-wise derivative of SiLU evaluated at ``x``.

        The sigmoid is recomputed from the ``x`` that is passed in rather than
        read from the cache of the last forward call, so the result is correct
        even if ``gradient`` is called first or with a different input.
        """
        # f'(x) = f(x) + sigmoid(x) * (1 - f(x))
        sigmoid_x = self._sigmoid(x)
        f_x = x * sigmoid_x
        return f_x + sigmoid_x * (1 - f_x)

    def gradient_fast(self, x, grad_output):
        """Return ``grad_output`` scaled element-wise by the derivative at ``x``."""
        return grad_output * self.gradient(x)
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from .base import Activation
|
|
3
|
+
|
|
4
|
+
class Softmax(Activation):
    """Softmax activation computed over the last axis."""

    def __call__(self, x):
        """Return softmax of ``x`` along the last axis and cache it in ``last_output``."""
        # Subtracting the per-row maximum keeps the exponentials from overflowing.
        exps = np.exp(x - np.max(x, axis=-1, keepdims=True))
        self.last_output = exps / np.sum(exps, axis=-1, keepdims=True)
        return self.last_output

    def gradient(self, x):
        """Return ``s * (1 - s)`` from the cached output (Jacobian diagonal only).

        ``x`` is not used; the value comes from the most recent forward call.
        """
        return self.last_output * (1 - self.last_output)

    def gradient_fast(self, x, grad_output):
        """Back-propagate ``grad_output`` through the full softmax Jacobian.

        For one row with output ``s`` and upstream gradient ``g`` the product
        ``g @ (diag(s) - s s^T)`` equals ``s * (g - sum(g * s))``. Using that
        closed form avoids building a ``units x units`` matrix per sample and
        handles all rows in a single vectorised expression.

        Args:
            x: unused; the cached ``last_output`` of the forward call is used.
            grad_output: array of upstream gradients; the last axis is the
                softmax axis.

        Returns:
            Array with the same shape as ``grad_output``.
        """
        probs = self.last_output.reshape(grad_output.shape)
        # Per-row scalar sum_k g_k * s_k, kept as a trailing axis for broadcasting.
        weighted = np.sum(grad_output * probs, axis=-1, keepdims=True)
        return probs * (grad_output - weighted)
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
class Callback:
    """Base class for callbacks; every hook below is a no-op meant to be overridden."""

    def __init__(self):
        # No model is attached until set_model() is called.
        self.model = None

    def set_model(self, model):
        """Store ``model`` on the callback."""
        self.model = model

    def on_epoch_begin(self, epoch, logs=None):
        """No-op hook."""
        return None

    def on_epoch_end(self, epoch, logs=None):
        """No-op hook."""
        return None

    def on_batch_begin(self, batch, logs=None):
        """No-op hook."""
        return None

    def on_batch_end(self, batch, logs=None):
        """No-op hook."""
        return None

    def on_train_begin(self, logs=None):
        """No-op hook."""
        return None

    def on_train_end(self, logs=None):
        """No-op hook."""
        return None
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from .base import Callback
|
|
3
|
+
|
|
4
|
+
class ModelCheckpoint(Callback):
    """Save the model from ``on_epoch_end``, optionally only when the monitored value improves.

    Args:
        filepath: path handed to ``model.save``. When ``save_best_only`` is
            false it is first passed through ``str.format`` with ``epoch``
            (1-based) and every entry of ``logs``.
        monitor: key looked up in ``logs``.
        save_best_only: if true, save only when the monitored value beats the
            best value seen so far.
        mode: ``'min'``, ``'max'`` or ``'auto'``. In ``'auto'`` (and for any
            unrecognised value) the metric is maximised when ``'acc'`` appears
            in ``monitor`` and minimised otherwise.
    """

    def __init__(self, filepath, monitor='val_loss', save_best_only=False, mode='auto'):
        super().__init__()
        self.filepath = filepath
        self.monitor = monitor
        self.save_best_only = save_best_only
        self.mode = mode
        self.best = -np.inf if self._maximize() else np.inf

    def _maximize(self):
        """Return True when a larger monitored value counts as better."""
        if self.mode == 'max':
            return True
        if self.mode == 'min':
            return False
        # 'auto': previously a monitor with neither 'acc' nor 'loss' in its
        # name could never improve; it is now treated as something to minimise,
        # matching the +inf starting value of ``best``.
        return 'acc' in self.monitor

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        current = logs.get(self.monitor)
        if current is None:
            # Monitored value not reported for this epoch: nothing is saved.
            return

        if not self.save_best_only:
            self.model.save(self.filepath.format(epoch=epoch + 1, **logs))
            return

        improved = current > self.best if self._maximize() else current < self.best
        if improved:
            self.best = current
            self.model.save(self.filepath)
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from .base import Callback
|
|
3
|
+
|
|
4
|
+
class EarlyStopping(Callback):
    """Set ``model.stop_training`` once the monitored value stops improving.

    Args:
        monitor: key looked up in ``logs`` at the end of each epoch.
        patience: number of non-improving epochs tolerated; training is
            flagged to stop when the count reaches this value.
        mode: ``'min'``, ``'max'`` or ``'auto'``. In ``'auto'`` (and for any
            unrecognised value) the metric is maximised when ``'acc'`` appears
            in ``monitor`` and minimised otherwise.
    """

    def __init__(self, monitor='val_loss', patience=0, mode='auto'):
        super().__init__()
        self.monitor = monitor
        self.patience = patience
        self.mode = mode
        self.wait = 0
        self.best = self._initial_best()

    def _maximize(self):
        """Return True when a larger monitored value counts as better."""
        if self.mode == 'max':
            return True
        if self.mode == 'min':
            return False
        # 'auto': a monitor with neither 'acc' nor 'loss' in its name used to
        # be treated as never improving, which forced a stop after `patience`
        # epochs; it is now minimised, matching the +inf starting ``best``.
        return 'acc' in self.monitor

    def _initial_best(self):
        """Return the worst possible value for the current direction."""
        return -np.inf if self._maximize() else np.inf

    def on_train_begin(self, logs=None):
        # Reset the counters so the same instance can be reused for another run.
        self.wait = 0
        self.best = self._initial_best()

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        current = logs.get(self.monitor)
        if current is None:
            return

        improved = current > self.best if self._maximize() else current < self.best
        if improved:
            self.best = current
            self.wait = 0
            return

        self.wait += 1
        if self.wait >= self.patience:
            self.model.stop_training = True
            print(f"Epoch {epoch+1}: early stopping")
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
from .base import Callback
|
|
2
|
+
|
|
3
|
+
class History(Callback):
    """Record the epoch index and every logged value, epoch by epoch.

    ``history`` maps each key seen in ``logs`` to the list of values reported
    for it, plus ``'epoch'`` for the epoch indices.
    """

    def __init__(self):
        super().__init__()
        # Created up front so on_epoch_end works even if on_train_begin was never called.
        self.history = {'loss': [], 'epoch': []}

    def on_train_begin(self, logs=None):
        # Start from an empty record.
        self.history = {'loss': [], 'epoch': []}

    def on_epoch_end(self, epoch, logs=None):
        self.history['epoch'].append(epoch)
        # ``logs`` defaults to None; treat that as "nothing reported".
        for k, v in (logs or {}).items():
            self.history.setdefault(k, []).append(v)
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from .base import Callback
|
|
3
|
+
|
|
4
|
+
class LearningRateScheduler(Callback):
    """Set the optimizer's learning rate from a user function in ``on_epoch_begin``.

    Args:
        schedule: callable invoked as ``schedule(epoch, lr)`` with the epoch
            index and the optimizer's current learning rate converted to
            float; whatever it returns is stored as the new learning rate.
        verbose: when greater than 0, print the learning rate that was set.
    """

    def __init__(self, schedule, verbose=0):
        super().__init__()
        self.verbose = verbose
        self.schedule = schedule

    def on_epoch_begin(self, epoch, logs=None):
        optimizer = self.model.optimizer
        if not hasattr(optimizer, 'lr'):
            raise ValueError('Optimizer must have a "lr" attribute.')

        updated = self.schedule(epoch, float(optimizer.lr))
        optimizer.lr = updated
        if self.verbose > 0:
            print(f'\nEpoch {epoch + 1}: LearningRateScheduler setting learning rate to {updated}.')
|
|
25
|
+
|
|
26
|
+
class ReduceLROnPlateau(Callback):
    """Reduce learning rate when a metric has stopped improving.

    Args:
        monitor: key looked up in ``logs`` at the end of each epoch.
        factor: the learning rate is multiplied by this value on a reduction.
        patience: number of non-improving epochs before a reduction.
        verbose: when greater than 0, print a message on each reduction.
        mode: ``'min'`` or ``'max'`` fixes the direction explicitly; any other
            value (``'auto'``) minimises when ``'loss'`` appears in
            ``monitor`` and maximises otherwise.
        min_delta: margin by which the value must beat ``best`` to count as
            an improvement.
        cooldown: value loaded into the cooldown counter after a reduction.
        min_lr: lower bound for the learning rate.
    """

    def __init__(self, monitor='val_loss', factor=0.1, patience=10, verbose=0, mode='auto', min_delta=1e-4, cooldown=0, min_lr=0):
        super().__init__()
        self.monitor = monitor
        self.factor = factor
        self.patience = patience
        self.verbose = verbose
        self.mode = mode
        self.min_delta = min_delta
        self.cooldown = cooldown
        self.min_lr = min_lr
        self.wait = 0
        self.best = self._initial_best()
        self.cooldown_counter = 0

    def _minimize(self):
        """Return True when a smaller monitored value counts as better."""
        # An explicit mode always wins; previously mode='max' was ignored for
        # monitors containing 'loss', and mode='min' started from best=-inf
        # for other monitors so no value could ever improve on it.
        if self.mode == 'min':
            return True
        if self.mode == 'max':
            return False
        return 'loss' in self.monitor

    def _initial_best(self):
        """Return the worst possible value for the current direction."""
        return np.inf if self._minimize() else -np.inf

    def on_train_begin(self, logs=None):
        self.wait = 0
        self.best = self._initial_best()
        self.cooldown_counter = 0

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        current = logs.get(self.monitor)
        if current is None:
            return

        if self.cooldown_counter > 0:
            # Still cooling down from the last reduction: do not accumulate patience.
            self.cooldown_counter -= 1
            self.wait = 0

        if self._is_improvement(current, self.best):
            self.best = current
            self.wait = 0
        elif self.cooldown_counter <= 0:
            self.wait += 1
            if self.wait >= self.patience:
                old_lr = float(self.model.optimizer.lr)
                if old_lr > self.min_lr:
                    new_lr = max(old_lr * self.factor, self.min_lr)
                    self.model.optimizer.lr = new_lr
                    if self.verbose > 0:
                        print(f'\nEpoch {epoch + 1}: ReduceLROnPlateau reducing learning rate to {new_lr}.')
                    self.cooldown_counter = self.cooldown
                    self.wait = 0

    def _is_improvement(self, current, best):
        """Return True if ``current`` beats ``best`` by more than ``min_delta``."""
        if self._minimize():
            return current < best - self.min_delta
        return current > best + self.min_delta
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
|
|
3
|
+
class DataLoader:
    """Batch, and optionally shuffle and augment, a pair of indexable datasets.

    Args:
        x: Input data (NumPy array).
        y: Target data (NumPy array).
        batch_size: Number of samples per batch.
        shuffle: Whether to shuffle the sample order; done once at
            construction and again after each complete iteration.
        augmenter: Optional object whose ``apply_transform(sample)`` is
            applied to every input sample of a batch.
    """

    def __init__(self, x, y, batch_size=32, shuffle=True, augmenter=None):
        self.x, self.y = x, y
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.augmenter = augmenter
        # Sample order; permuted in place by on_epoch_end() when shuffling.
        self.indices = np.arange(len(x))
        self.on_epoch_end()

    def __len__(self):
        """Number of batches, counting a final partial batch."""
        n_batches = np.ceil(len(self.x) / self.batch_size)
        return int(n_batches)

    def on_epoch_end(self):
        """Reshuffle the sample order if shuffling is enabled."""
        if not self.shuffle:
            return
        np.random.shuffle(self.indices)

    def __getitem__(self, index):
        """Return the ``(batch_x, batch_y)`` pair for batch number ``index``."""
        start = index * self.batch_size
        stop = (index + 1) * self.batch_size
        chosen = self.indices[start:stop]
        batch_x = self.x[chosen]
        batch_y = self.y[chosen]

        if self.augmenter:
            # Output buffer shares shape and dtype with the raw batch.
            transformed = np.zeros_like(batch_x)
            for row, sample in enumerate(batch_x):
                transformed[row] = self.augmenter.apply_transform(sample)
            batch_x = transformed

        return batch_x, batch_y

    def __iter__(self):
        """Yield every batch in order, then trigger the end-of-epoch reshuffle."""
        yield from (self[batch_no] for batch_no in range(len(self)))
        self.on_epoch_end()
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from .base import Initializer
|
|
2
|
+
from .constant import Zeros, Ones
|
|
3
|
+
from .random import RandomNormal
|
|
4
|
+
from .glorot import GlorotUniform
|
|
5
|
+
from .he import HeNormal
|
|
6
|
+
|
|
7
|
+
def get(identifier):
    """Resolve an initializer identifier.

    The names ``'zeros'``, ``'ones'``, ``'glorot_uniform'`` and
    ``'he_normal'`` each produce a newly constructed initializer. Any other
    value, including an existing ``Initializer`` instance, is returned as is.
    """
    registry = (
        ('zeros', Zeros),
        ('ones', Ones),
        ('glorot_uniform', GlorotUniform),
        ('he_normal', HeNormal),
    )
    for name, factory in registry:
        if identifier == name:
            return factory()
    # Unrecognised values (instances, unknown names, None) pass through untouched.
    return identifier
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from .base import Initializer
|
|
3
|
+
|
|
4
|
+
class GlorotUniform(Initializer):
    """Uniform initializer on ``[-limit, limit]`` with ``limit = sqrt(6 / (fan_in + fan_out))``."""

    def __call__(self, shape):
        """Draw an array of the given ``shape`` via ``np.random.uniform``."""
        fans = self._calculate_fan_in_and_fan_out(shape)
        bound = np.sqrt(6 / (fans[0] + fans[1]))
        return np.random.uniform(-bound, bound, shape)

    def _calculate_fan_in_and_fan_out(self, shape):
        """Return ``(fan_in, fan_out)`` for ``shape``.

        Fewer than two dimensions use ``shape[0]`` for both values; exactly
        two use ``(shape[0], shape[1])``; more scale the last two dimensions
        by the product of all leading ones.
        """
        rank = len(shape)
        if rank < 2:
            return shape[0], shape[0]
        if rank == 2:
            return shape[0], shape[1]
        receptive_field_size = np.prod(shape[:-2])
        return shape[-2] * receptive_field_size, shape[-1] * receptive_field_size
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from .base import Initializer
|
|
3
|
+
|
|
4
|
+
class HeNormal(Initializer):
    """Zero-mean normal initializer with standard deviation ``sqrt(2 / fan_in)``."""

    def __call__(self, shape):
        """Draw an array of the given ``shape`` via ``np.random.normal``."""
        fan_in = self._calculate_fan_in_and_fan_out(shape)[0]
        spread = np.sqrt(2 / fan_in)
        return np.random.normal(0, spread, shape)

    def _calculate_fan_in_and_fan_out(self, shape):
        """Return ``(fan_in, fan_out)`` for ``shape``.

        Fewer than two dimensions use ``shape[0]`` for both values; exactly
        two use ``(shape[0], shape[1])``; more scale the last two dimensions
        by the product of all leading ones.
        """
        rank = len(shape)
        if rank < 2:
            return shape[0], shape[0]
        if rank == 2:
            return shape[0], shape[1]
        receptive_field_size = np.prod(shape[:-2])
        return shape[-2] * receptive_field_size, shape[-1] * receptive_field_size
|