neutro 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115) hide show
  1. neutro-0.1.0/LICENSE +21 -0
  2. neutro-0.1.0/PKG-INFO +126 -0
  3. neutro-0.1.0/README.md +109 -0
  4. neutro-0.1.0/neutro/__init__.py +8 -0
  5. neutro-0.1.0/neutro/activations/__init__.py +15 -0
  6. neutro-0.1.0/neutro/activations/base.py +5 -0
  7. neutro-0.1.0/neutro/activations/relu.py +9 -0
  8. neutro-0.1.0/neutro/activations/sigmoid.py +10 -0
  9. neutro-0.1.0/neutro/activations/silu.py +20 -0
  10. neutro-0.1.0/neutro/activations/softmax.py +22 -0
  11. neutro-0.1.0/neutro/activations/tanh.py +8 -0
  12. neutro-0.1.0/neutro/callbacks/__init__.py +5 -0
  13. neutro-0.1.0/neutro/callbacks/base.py +13 -0
  14. neutro-0.1.0/neutro/callbacks/checkpoint.py +25 -0
  15. neutro-0.1.0/neutro/callbacks/early_stopping.py +27 -0
  16. neutro-0.1.0/neutro/callbacks/history.py +10 -0
  17. neutro-0.1.0/neutro/callbacks/lr_scheduler.py +79 -0
  18. neutro-0.1.0/neutro/data.py +44 -0
  19. neutro-0.1.0/neutro/initializers/__init__.py +13 -0
  20. neutro-0.1.0/neutro/initializers/base.py +3 -0
  21. neutro-0.1.0/neutro/initializers/constant.py +10 -0
  22. neutro-0.1.0/neutro/initializers/glorot.py +19 -0
  23. neutro-0.1.0/neutro/initializers/he.py +19 -0
  24. neutro-0.1.0/neutro/initializers/random.py +9 -0
  25. neutro-0.1.0/neutro/layers/__init__.py +21 -0
  26. neutro-0.1.0/neutro/layers/attention/base_attention.py +21 -0
  27. neutro-0.1.0/neutro/layers/attention/flash_attention.py +209 -0
  28. neutro-0.1.0/neutro/layers/attention/gqa.py +33 -0
  29. neutro-0.1.0/neutro/layers/attention/kv_cache.py +32 -0
  30. neutro-0.1.0/neutro/layers/attention/mha.py +68 -0
  31. neutro-0.1.0/neutro/layers/attention/mla.py +121 -0
  32. neutro-0.1.0/neutro/layers/attention/mqa.py +32 -0
  33. neutro-0.1.0/neutro/layers/base.py +77 -0
  34. neutro-0.1.0/neutro/layers/convolutional/conv1d.py +105 -0
  35. neutro-0.1.0/neutro/layers/convolutional/conv2d.py +125 -0
  36. neutro-0.1.0/neutro/layers/core/__init__.py +7 -0
  37. neutro-0.1.0/neutro/layers/core/activation.py +32 -0
  38. neutro-0.1.0/neutro/layers/core/dense.py +49 -0
  39. neutro-0.1.0/neutro/layers/core/dropout.py +19 -0
  40. neutro-0.1.0/neutro/layers/core/flatten.py +20 -0
  41. neutro-0.1.0/neutro/layers/core/merging.py +68 -0
  42. neutro-0.1.0/neutro/layers/core/moe.py +119 -0
  43. neutro-0.1.0/neutro/layers/core/reparameterization.py +33 -0
  44. neutro-0.1.0/neutro/layers/embedding/__init__.py +2 -0
  45. neutro-0.1.0/neutro/layers/embedding/embedding.py +24 -0
  46. neutro-0.1.0/neutro/layers/embedding/time_embedding.py +39 -0
  47. neutro-0.1.0/neutro/layers/normalization/__init__.py +4 -0
  48. neutro-0.1.0/neutro/layers/normalization/batchnorm.py +59 -0
  49. neutro-0.1.0/neutro/layers/normalization/groupnorm.py +70 -0
  50. neutro-0.1.0/neutro/layers/normalization/layernorm.py +28 -0
  51. neutro-0.1.0/neutro/layers/normalization/rmsnorm.py +42 -0
  52. neutro-0.1.0/neutro/layers/pooling/__init__.py +3 -0
  53. neutro-0.1.0/neutro/layers/pooling/global_pooling.py +68 -0
  54. neutro-0.1.0/neutro/layers/pooling/maxpooling2d.py +80 -0
  55. neutro-0.1.0/neutro/layers/pooling/upsampling2d.py +33 -0
  56. neutro-0.1.0/neutro/layers/recurrent/__init__.py +3 -0
  57. neutro-0.1.0/neutro/layers/recurrent/gru.py +136 -0
  58. neutro-0.1.0/neutro/layers/recurrent/lstm.py +61 -0
  59. neutro-0.1.0/neutro/layers/recurrent/simple_rnn.py +56 -0
  60. neutro-0.1.0/neutro/layers/transformer/transformer_block.py +104 -0
  61. neutro-0.1.0/neutro/losses/__init__.py +14 -0
  62. neutro-0.1.0/neutro/losses/base.py +5 -0
  63. neutro-0.1.0/neutro/losses/categorical_crossentropy.py +12 -0
  64. neutro-0.1.0/neutro/losses/mse.py +8 -0
  65. neutro-0.1.0/neutro/losses/sparse_categorical_crossentropy.py +45 -0
  66. neutro-0.1.0/neutro/losses/vae_loss.py +50 -0
  67. neutro-0.1.0/neutro/metrics/__init__.py +15 -0
  68. neutro-0.1.0/neutro/metrics/accuracy.py +10 -0
  69. neutro-0.1.0/neutro/metrics/base.py +5 -0
  70. neutro-0.1.0/neutro/metrics/f1_score.py +12 -0
  71. neutro-0.1.0/neutro/metrics/precision.py +12 -0
  72. neutro-0.1.0/neutro/metrics/recall.py +12 -0
  73. neutro-0.1.0/neutro/metrics/sparse_accuracy.py +12 -0
  74. neutro-0.1.0/neutro/models/__init__.py +7 -0
  75. neutro-0.1.0/neutro/models/base_model.py +300 -0
  76. neutro-0.1.0/neutro/models/language/__init__.py +4 -0
  77. neutro-0.1.0/neutro/models/language/deepseek.py +122 -0
  78. neutro-0.1.0/neutro/models/language/gpt.py +39 -0
  79. neutro-0.1.0/neutro/models/language/llama.py +117 -0
  80. neutro-0.1.0/neutro/models/language/qwen.py +57 -0
  81. neutro-0.1.0/neutro/models/moe/__init__.py +0 -0
  82. neutro-0.1.0/neutro/models/vision/__init__.py +2 -0
  83. neutro-0.1.0/neutro/models/vision/alexnet.py +45 -0
  84. neutro-0.1.0/neutro/models/vision/diffusion_model.py +48 -0
  85. neutro-0.1.0/neutro/models/vision/unet.py +95 -0
  86. neutro-0.1.0/neutro/models/vision/vae.py +72 -0
  87. neutro-0.1.0/neutro/models/vision/vgg.py +82 -0
  88. neutro-0.1.0/neutro/optimizers/__init__.py +4 -0
  89. neutro-0.1.0/neutro/optimizers/adam.py +33 -0
  90. neutro-0.1.0/neutro/optimizers/adamw.py +37 -0
  91. neutro-0.1.0/neutro/optimizers/base.py +14 -0
  92. neutro-0.1.0/neutro/optimizers/schedules.py +37 -0
  93. neutro-0.1.0/neutro/optimizers/sgd.py +32 -0
  94. neutro-0.1.0/neutro/preprocessing/__init__.py +3 -0
  95. neutro-0.1.0/neutro/preprocessing/image.py +91 -0
  96. neutro-0.1.0/neutro/preprocessing/sequence.py +46 -0
  97. neutro-0.1.0/neutro/preprocessing/text.py +105 -0
  98. neutro-0.1.0/neutro/tokenizers/__init__.py +4 -0
  99. neutro-0.1.0/neutro/tokenizers/bpe.py +202 -0
  100. neutro-0.1.0/neutro/tokenizers/tiktoken_compat.py +62 -0
  101. neutro-0.1.0/neutro/utils/__init__.py +0 -0
  102. neutro-0.1.0/neutro/utils/conv_utils.py +76 -0
  103. neutro-0.1.0/neutro/utils/data_utils.py +71 -0
  104. neutro-0.1.0/neutro/utils/diffusion_utils.py +47 -0
  105. neutro-0.1.0/neutro/utils/rope_utils.py +33 -0
  106. neutro-0.1.0/neutro/utils/visualization.py +43 -0
  107. neutro-0.1.0/neutro.egg-info/PKG-INFO +126 -0
  108. neutro-0.1.0/neutro.egg-info/SOURCES.txt +113 -0
  109. neutro-0.1.0/neutro.egg-info/dependency_links.txt +1 -0
  110. neutro-0.1.0/neutro.egg-info/requires.txt +9 -0
  111. neutro-0.1.0/neutro.egg-info/top_level.txt +1 -0
  112. neutro-0.1.0/pyproject.toml +23 -0
  113. neutro-0.1.0/setup.cfg +4 -0
  114. neutro-0.1.0/tests/test_data.py +35 -0
  115. neutro-0.1.0/tests/test_preprocessing.py +60 -0
neutro-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
neutro-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,126 @@
1
+ Metadata-Version: 2.4
2
+ Name: neutro
3
+ Version: 0.1.0
4
+ Summary: A Keras-style deep learning library using NumPy and SciPy
5
+ Requires-Python: >=3.8
6
+ Description-Content-Type: text/markdown
7
+ License-File: LICENSE
8
+ Requires-Dist: numpy
9
+ Requires-Dist: scipy
10
+ Requires-Dist: joblib
11
+ Requires-Dist: tqdm
12
+ Requires-Dist: regex
13
+ Provides-Extra: test
14
+ Requires-Dist: pytest; extra == "test"
15
+ Requires-Dist: pytest-cov; extra == "test"
16
+ Dynamic: license-file
17
+
18
+ # 🧠 Neutro: The "Old School" Deep Learning Playground
19
+
20
+ [![codecov](https://codecov.io/gh/sourcepirate/neutro/graph/badge.svg?token=8H4Q2Q2Q2Q)](https://codecov.io/gh/sourcepirate/neutro)
21
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
22
+
23
+ **Neutro** is an intentionally naive, NumPy-only implementation of modern deep learning architectures. It’s the Keras experience you love, powered by the NumPy you tolerate, built specifically for people who want to peek under the hood and actually *understand* how the gears turn.
24
+
25
+ ---
26
+
27
+ ## 👴 The Philosophy: Why Does This Exist?
28
+
29
+ Let's be honest: modern DL frameworks are black boxes. You pip install 4GB of binaries and suddenly you're "doing AI."
30
+
31
+ **Neutro is for the curious, the learners, and the "old-school" folks like me** who believe that if you can't build it in a matrix, you don't really know it.
32
+
33
+ - **Learn, Don't just Run**: Every line of code is designed to be readable. We don't hide behind C++ kernels or CUDA kernels. If you want to know how FlashAttention *actually* tiles memory, you can just read the Python file.
34
+ - **A Toy, not a Tool**: This isn't meant for production. It's a playground for learning advanced algorithms (MHA, GQA, FlashAttention, LSTM) in their purest form.
35
+ - **For the Wisdom-Rich**: If you remember when 64MB of RAM was a flex and "vectorization" meant loop unrolling, this is for you. It's a fun way to play with cutting-edge 2024 algorithms using 1990s-era clarity.
36
+
37
+ ---
38
+
39
+ ## 🚀 What's Inside?
40
+
41
+ - **"I can't believe it's not Keras!"**: Your muscle memory is safe here. `.compile()`, `.fit()`, `.predict()`—it’s all exactly where you left it.
42
+ - **Pure NumPy Math**: We did the math so you don't have to. Every gradient, from Softmax to LSTM gates, is hand-derived and vectorized.
43
+ - **Speed (for a CPU)**: We use `im2col` for convolutions and **FlashAttention** (yes, really) to keep your CPU fans humming in a way that sounds productive.
44
+ - **Zero Heavy Dependencies**: Tired of downloading 4GB of CUDA binaries just to train on MNIST? We require `numpy` and `scipy`, plus the small helper packages `joblib`, `tqdm`, and `regex`. That’s it.
45
+
46
+ ---
47
+
48
+ ## 🛠 Features That'll Make You Say "Wait, You Implemented That?"
49
+
50
+ | Category | The "Fancy" Stuff | Why You Should Care |
51
+ | :--- | :--- | :--- |
52
+ | **Attention** | `FlashAttention`, `MQA`, `GQA`, `RoPE` | We have more attention variants than a distracted toddler. |
53
+ | **Tokenization** | `BPETokenizer`, `RegexTokenizer` | Byte-level BPE with regex splitting, just like the big kids. |
54
+ | **Vision** | `AlexNet`, `VGG16`, `VGG19`, `im2col` | Classical and modern vision architectures, vectorized. |
55
+ | **LLMs** | `Llama`, `Qwen`, `DeepSeek` (MoE) | Yes, you can run a (very tiny) MoE model on your CPU. |
56
+ | **Modern Ops** | `RMSNorm`, `SiLU`, `SwiGLU` | The secret sauce of modern LLMs, hand-implemented. |
57
+ | **Optimizers** | `AdamW`, `Adam`, `SGD+Momentum` | Keep your weights from exploding like a bad science fair project. |
58
+
59
+ ---
60
+
61
+ ## 🏆 The Hall of Fame: Pre-built Architectures
62
+
63
+ Why build from scratch when we've already done the heavy lifting?
64
+
65
+ - **The Visionaries**: `AlexNet`, `VGG16`, `VGG19`
66
+ - **The Linguists**: `GPT-2`, `LlamaTiny`, `QwenTiny`, `DeepSeekTiny` (Mixture of Experts)
67
+
68
+ ---
69
+
70
+ ## 💻 Show Me The Code!
71
+
72
+ If you know Keras, you already know Neutro. It's that simple.
73
+
74
+ ```python
75
+ from neutro.models import Sequential
76
+ from neutro.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
77
+
78
+ # Build a CNN that actually fits in your head
79
+ model = Sequential([
80
+ Conv2D(32, kernel_size=3, activation='relu', input_shape=(28, 28, 1)),
81
+ MaxPooling2D(pool_size=2),
82
+ Flatten(),
83
+ Dense(128, activation='relu'),
84
+ Dropout(0.5),
85
+ Dense(10, activation='softmax')
86
+ ])
87
+
88
+ # Compile it like it's 2015
89
+ model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
90
+
91
+ # Fit it like a tailored suit
92
+ model.fit(train_flow, epochs=10)
93
+ ```
94
+
95
+ ---
96
+
97
+ ## 📂 Deep Dives & Nerdy Stuff
98
+
99
+ We documented everything because we know you like to check the math:
100
+
101
+ - [**Attention Mechanisms**](./docs/layers/attention/) - How we made FlashAttention work on a CPU.
102
+ - [**Convolutional Magic**](./docs/layers/convolutional/) - The `im2col` deep dive.
103
+ - [**Activations & Gradients**](./docs/activations/) - Proofs for the brave.
104
+ - [**Optimizers**](./docs/optimizers/) - Why AdamW is better than your ex.
105
+
106
+ ---
107
+
108
+ ## 🧪 Examples to Flex Your CPU
109
+
110
+ Check out the `examples/` folder for end-to-end scripts:
111
+ - `mnist_cnn.py`: Standard digit classification with real-time augmentation.
112
+ - `wikitext_llm.py`: A character-level Transformer that actually talks back.
113
+
114
+ ---
115
+
116
+ ## 🏗 Installation
117
+
118
+ ```bash
119
+ git clone https://github.com/sourcepirate/neutro.git
120
+ cd neutro
121
+ pip install -e .
122
+ ```
123
+
124
+ ---
125
+
126
+ **Disclaimer**: This is a hobby project for learning and exploration. It is intentionally naive, likely inefficient compared to compiled kernels, and 100% focused on the joy of understanding advanced algorithms. If you're looking to change the world with AGI, go to PyTorch. If you're looking to understand why your Transformer works while drinking a nice cup of tea, you're in the right place.
neutro-0.1.0/README.md ADDED
@@ -0,0 +1,109 @@
1
+ # 🧠 Neutro: The "Old School" Deep Learning Playground
2
+
3
+ [![codecov](https://codecov.io/gh/sourcepirate/neutro/graph/badge.svg?token=8H4Q2Q2Q2Q)](https://codecov.io/gh/sourcepirate/neutro)
4
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
5
+
6
+ **Neutro** is an intentionally naive, NumPy-only implementation of modern deep learning architectures. It’s the Keras experience you love, powered by the NumPy you tolerate, built specifically for people who want to peek under the hood and actually *understand* how the gears turn.
7
+
8
+ ---
9
+
10
+ ## 👴 The Philosophy: Why Does This Exist?
11
+
12
+ Let's be honest: modern DL frameworks are black boxes. You pip install 4GB of binaries and suddenly you're "doing AI."
13
+
14
+ **Neutro is for the curious, the learners, and the "old-school" folks like me** who believe that if you can't build it in a matrix, you don't really know it.
15
+
16
+ - **Learn, Don't just Run**: Every line of code is designed to be readable. We don't hide behind C++ kernels or CUDA kernels. If you want to know how FlashAttention *actually* tiles memory, you can just read the Python file.
17
+ - **A Toy, not a Tool**: This isn't meant for production. It's a playground for learning advanced algorithms (MHA, GQA, FlashAttention, LSTM) in their purest form.
18
+ - **For the Wisdom-Rich**: If you remember when 64MB of RAM was a flex and "vectorization" meant loop unrolling, this is for you. It's a fun way to play with cutting-edge 2024 algorithms using 1990s-era clarity.
19
+
20
+ ---
21
+
22
+ ## 🚀 What's Inside?
23
+
24
+ - **"I can't believe it's not Keras!"**: Your muscle memory is safe here. `.compile()`, `.fit()`, `.predict()`—it’s all exactly where you left it.
25
+ - **Pure NumPy Math**: We did the math so you don't have to. Every gradient, from Softmax to LSTM gates, is hand-derived and vectorized.
26
+ - **Speed (for a CPU)**: We use `im2col` for convolutions and **FlashAttention** (yes, really) to keep your CPU fans humming in a way that sounds productive.
27
+ - **Zero Heavy Dependencies**: Tired of downloading 4GB of CUDA binaries just to train on MNIST? We require `numpy` and `scipy`, plus the small helper packages `joblib`, `tqdm`, and `regex`. That’s it.
28
+
29
+ ---
30
+
31
+ ## 🛠 Features That'll Make You Say "Wait, You Implemented That?"
32
+
33
+ | Category | The "Fancy" Stuff | Why You Should Care |
34
+ | :--- | :--- | :--- |
35
+ | **Attention** | `FlashAttention`, `MQA`, `GQA`, `RoPE` | We have more attention variants than a distracted toddler. |
36
+ | **Tokenization** | `BPETokenizer`, `RegexTokenizer` | Byte-level BPE with regex splitting, just like the big kids. |
37
+ | **Vision** | `AlexNet`, `VGG16`, `VGG19`, `im2col` | Classical and modern vision architectures, vectorized. |
38
+ | **LLMs** | `Llama`, `Qwen`, `DeepSeek` (MoE) | Yes, you can run a (very tiny) MoE model on your CPU. |
39
+ | **Modern Ops** | `RMSNorm`, `SiLU`, `SwiGLU` | The secret sauce of modern LLMs, hand-implemented. |
40
+ | **Optimizers** | `AdamW`, `Adam`, `SGD+Momentum` | Keep your weights from exploding like a bad science fair project. |
41
+
42
+ ---
43
+
44
+ ## 🏆 The Hall of Fame: Pre-built Architectures
45
+
46
+ Why build from scratch when we've already done the heavy lifting?
47
+
48
+ - **The Visionaries**: `AlexNet`, `VGG16`, `VGG19`
49
+ - **The Linguists**: `GPT-2`, `LlamaTiny`, `QwenTiny`, `DeepSeekTiny` (Mixture of Experts)
50
+
51
+ ---
52
+
53
+ ## 💻 Show Me The Code!
54
+
55
+ If you know Keras, you already know Neutro. It's that simple.
56
+
57
+ ```python
58
+ from neutro.models import Sequential
59
+ from neutro.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
60
+
61
+ # Build a CNN that actually fits in your head
62
+ model = Sequential([
63
+ Conv2D(32, kernel_size=3, activation='relu', input_shape=(28, 28, 1)),
64
+ MaxPooling2D(pool_size=2),
65
+ Flatten(),
66
+ Dense(128, activation='relu'),
67
+ Dropout(0.5),
68
+ Dense(10, activation='softmax')
69
+ ])
70
+
71
+ # Compile it like it's 2015
72
+ model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
73
+
74
+ # Fit it like a tailored suit
75
+ model.fit(train_flow, epochs=10)
76
+ ```
77
+
78
+ ---
79
+
80
+ ## 📂 Deep Dives & Nerdy Stuff
81
+
82
+ We documented everything because we know you like to check the math:
83
+
84
+ - [**Attention Mechanisms**](./docs/layers/attention/) - How we made FlashAttention work on a CPU.
85
+ - [**Convolutional Magic**](./docs/layers/convolutional/) - The `im2col` deep dive.
86
+ - [**Activations & Gradients**](./docs/activations/) - Proofs for the brave.
87
+ - [**Optimizers**](./docs/optimizers/) - Why AdamW is better than your ex.
88
+
89
+ ---
90
+
91
+ ## 🧪 Examples to Flex Your CPU
92
+
93
+ Check out the `examples/` folder for end-to-end scripts:
94
+ - `mnist_cnn.py`: Standard digit classification with real-time augmentation.
95
+ - `wikitext_llm.py`: A character-level Transformer that actually talks back.
96
+
97
+ ---
98
+
99
+ ## 🏗 Installation
100
+
101
+ ```bash
102
+ git clone https://github.com/sourcepirate/neutro.git
103
+ cd neutro
104
+ pip install -e .
105
+ ```
106
+
107
+ ---
108
+
109
+ **Disclaimer**: This is a hobby project for learning and exploration. It is intentionally naive, likely inefficient compared to compiled kernels, and 100% focused on the joy of understanding advanced algorithms. If you're looking to change the world with AGI, go to PyTorch. If you're looking to understand why your Transformer works while drinking a nice cup of tea, you're in the right place.
@@ -0,0 +1,8 @@
1
+ from . import layers
2
+ from . import activations
3
+ from . import initializers
4
+ from . import losses
5
+ from . import optimizers
6
+ from . import metrics
7
+ from . import callbacks
8
+ from .models import Sequential, Model
@@ -0,0 +1,15 @@
1
+ from .base import Activation
2
+ from .relu import ReLU
3
+ from .sigmoid import Sigmoid
4
+ from .tanh import Tanh
5
+ from .softmax import Softmax
6
+ from .silu import SiLU
7
+
8
def get(identifier):
    """Resolve an activation identifier.

    Args:
        identifier: One of the names 'relu', 'sigmoid', 'tanh', 'softmax'
            or 'silu', or any other object.

    Returns:
        A new instance of the matching activation class when ``identifier``
        is a known name. Anything else (an ``Activation`` instance, ``None``,
        an unrecognised name, ...) is handed back unchanged.
    """
    # Only strings can name an activation; everything else passes through.
    if not isinstance(identifier, str):
        return identifier

    known = {
        'relu': ReLU,
        'sigmoid': Sigmoid,
        'tanh': Tanh,
        'softmax': Softmax,
        'silu': SiLU,
    }
    factory = known.get(identifier)
    if factory is None:
        return identifier
    return factory()
@@ -0,0 +1,5 @@
1
class Activation:
    """Interface for activation functions.

    Both methods raise ``NotImplementedError`` here; concrete activations
    are expected to override them.
    """

    def __call__(self, x):
        """Apply the activation to ``x`` (not implemented on the base)."""
        raise NotImplementedError

    def gradient(self, x):
        """Return the derivative for ``x`` (not implemented on the base)."""
        raise NotImplementedError
@@ -0,0 +1,9 @@
1
+ import numpy as np
2
+ from .base import Activation
3
+
4
class ReLU(Activation):
    """Rectified linear unit: element-wise ``max(0, x)``."""

    def __call__(self, x):
        """Store ``x`` on ``self.last_x`` and return ``np.maximum(0, x)``."""
        self.last_x = x
        rectified = np.maximum(0, x)
        return rectified

    def gradient(self, x):
        """Return 1.0 where ``x > 0`` and 0.0 elsewhere, as floats."""
        positive = x > 0
        return positive.astype(float)
@@ -0,0 +1,10 @@
1
+ import numpy as np
2
+ from .base import Activation
3
+
4
class Sigmoid(Activation):
    """Logistic sigmoid ``1 / (1 + exp(-x))``."""

    def __call__(self, x):
        """Return the sigmoid of ``x`` and cache it on ``self.last_output``.

        The input is clipped to [-500, 500] before exponentiation.
        """
        clipped = np.clip(x, -500, 500)
        self.last_output = 1 / (1 + np.exp(-clipped))
        return self.last_output

    def gradient(self, x):
        """Return ``s * (1 - s)`` with ``s`` recomputed from ``x``.

        Recomputing goes through ``__call__``, so ``self.last_output`` is
        overwritten as a side effect.
        """
        sig = self(x)
        return sig * (1 - sig)
@@ -0,0 +1,20 @@
1
+ import numpy as np
2
+ from .base import Activation
3
+
4
class SiLU(Activation):
    """
    SiLU (Sigmoid Linear Unit) or Swish activation function: x * sigmoid(x).
    Commonly used in Llama, Qwen, and DeepSeek.
    """

    @staticmethod
    def _sigmoid(x):
        """Return the logistic sigmoid of ``x``.

        The input is clipped to [-500, 500] (the same bound ``Sigmoid``
        uses) so ``np.exp`` cannot overflow for large negative inputs.
        """
        return 1 / (1 + np.exp(-np.clip(x, -500, 500)))

    def __call__(self, x):
        """Return ``x * sigmoid(x)``.

        The sigmoid and the input are cached on ``self.sigmoid_x`` and
        ``self.x``.
        """
        self.sigmoid_x = self._sigmoid(x)
        self.x = x
        return x * self.sigmoid_x

    def gradient(self, x):
        """Return the derivative of SiLU evaluated at ``x``.

        The sigmoid is recomputed from ``x`` instead of being read from the
        forward-pass cache, so the result is correct even when ``gradient``
        is called before ``__call__`` or with a different input.
        """
        # f'(x) = f(x) + sigmoid(x) * (1 - f(x))
        sigmoid_x = self._sigmoid(x)
        f_x = x * sigmoid_x
        return f_x + sigmoid_x * (1 - f_x)

    def gradient_fast(self, x, grad_output):
        """Return ``grad_output`` scaled element-wise by ``gradient(x)``."""
        return grad_output * self.gradient(x)
@@ -0,0 +1,22 @@
1
+ import numpy as np
2
+ from .base import Activation
3
+
4
class Softmax(Activation):
    """Softmax over the last axis."""

    def __call__(self, x):
        """Return softmax(x) along the last axis, cached on ``self.last_output``.

        The row-wise maximum is subtracted before exponentiation to keep
        ``np.exp`` from overflowing.
        """
        exps = np.exp(x - np.max(x, axis=-1, keepdims=True))
        self.last_output = exps / np.sum(exps, axis=-1, keepdims=True)
        return self.last_output

    def gradient(self, x):
        """Return ``s * (1 - s)`` for the cached output ``s``.

        This is only the diagonal of the softmax Jacobian; ``x`` is not
        used. ``gradient_fast`` applies the full Jacobian.
        """
        return self.last_output * (1 - self.last_output)

    def gradient_fast(self, x, grad_output):
        """Back-propagate ``grad_output`` through the cached softmax output.

        For each row with output ``s`` and incoming gradient ``g`` the
        Jacobian is ``diag(s) - s s^T``, so ``g @ J`` reduces to
        ``s * g - s * sum(s * g)``. Computing that directly avoids building
        one Jacobian per sample in a Python loop.

        Args:
            x: Unused; the output cached by the last ``__call__`` is used.
            grad_output: Array of gradients w.r.t. the softmax output, with
                the class dimension last.

        Returns:
            Array shaped like ``grad_output``. A floating ``grad_output``
            keeps its dtype; other dtypes (e.g. integers) yield a float
            result instead of being truncated.
        """
        probs = self.last_output.reshape(grad_output.shape)
        weighted = grad_output * probs
        result = weighted - probs * np.sum(weighted, axis=-1, keepdims=True)
        if np.issubdtype(grad_output.dtype, np.floating):
            result = result.astype(grad_output.dtype, copy=False)
        return result
@@ -0,0 +1,8 @@
1
+ import numpy as np
2
+ from .base import Activation
3
+
4
class Tanh(Activation):
    """Hyperbolic tangent activation."""

    def __call__(self, x):
        """Return ``np.tanh(x)``."""
        return np.tanh(x)

    def gradient(self, x):
        """Return ``1 - tanh(x)**2``, computed from ``x``."""
        tanh_x = np.tanh(x)
        return 1 - tanh_x**2
@@ -0,0 +1,5 @@
1
+ from .base import Callback
2
+ from .checkpoint import ModelCheckpoint
3
+ from .early_stopping import EarlyStopping
4
+ from .history import History
5
+ from .lr_scheduler import LearningRateScheduler, ReduceLROnPlateau
@@ -0,0 +1,13 @@
1
class Callback:
    """Base class for callbacks.

    Holds a reference to a model and provides hook methods that all do
    nothing; subclasses override the hooks they care about.
    """

    def __init__(self):
        # No model is attached until set_model() is called.
        self.model = None

    def set_model(self, model):
        """Attach ``model`` to this callback."""
        self.model = model

    def on_epoch_begin(self, epoch, logs=None):
        """No-op hook."""
        return None

    def on_epoch_end(self, epoch, logs=None):
        """No-op hook."""
        return None

    def on_batch_begin(self, batch, logs=None):
        """No-op hook."""
        return None

    def on_batch_end(self, batch, logs=None):
        """No-op hook."""
        return None

    def on_train_begin(self, logs=None):
        """No-op hook."""
        return None

    def on_train_end(self, logs=None):
        """No-op hook."""
        return None
@@ -0,0 +1,25 @@
1
+ import numpy as np
2
+ from .base import Callback
3
+
4
class ModelCheckpoint(Callback):
    """Save the model at the end of epochs.

    Args:
        filepath: Target path handed to ``model.save``. When
            ``save_best_only`` is False it is first passed through
            ``str.format`` with ``epoch`` (1-based) and the epoch logs.
        monitor: Key in ``logs`` used to judge improvement.
        save_best_only: If True, save only when ``monitor`` improves on the
            best value seen so far.
        mode: 'min', 'max' or 'auto'. In 'auto' (and for any unrecognised
            value) a monitor whose name contains 'acc' is maximised and
            every other monitor is minimised.
    """

    def __init__(self, filepath, monitor='val_loss', save_best_only=False, mode='auto'):
        super().__init__()
        self.filepath = filepath
        self.monitor = monitor
        self.save_best_only = save_best_only
        self.mode = mode
        self.best = -np.inf if self._higher_is_better() else np.inf

    def _higher_is_better(self):
        """Return True when a larger monitored value counts as better."""
        if self.mode == 'max':
            return True
        if self.mode == 'min':
            return False
        # 'auto': decide from the monitor name, matching the initial `best`.
        return 'acc' in self.monitor

    def on_epoch_end(self, epoch, logs=None):
        """Save the model, subject to ``save_best_only``."""
        logs = logs or {}

        if not self.save_best_only:
            # Unconditional checkpointing must not depend on the monitor
            # being present in the logs.
            self.model.save(self.filepath.format(epoch=epoch + 1, **logs))
            return

        current = logs.get(self.monitor)
        if current is None:
            return

        if self._higher_is_better():
            improved = current > self.best
        else:
            improved = current < self.best

        if improved:
            self.best = current
            # The best-only path saves to the unformatted filepath.
            self.model.save(self.filepath)
@@ -0,0 +1,27 @@
1
+ import numpy as np
2
+ from .base import Callback
3
+
4
class EarlyStopping(Callback):
    """Stop training when a monitored value stops improving.

    Args:
        monitor: Key in ``logs`` to watch.
        patience: Number of consecutive non-improving epochs after which
            ``model.stop_training`` is set to True.
        mode: 'min', 'max' or 'auto'. In 'auto' (and for any unrecognised
            value) a monitor whose name contains 'acc' is maximised and
            every other monitor is minimised.
    """

    def __init__(self, monitor='val_loss', patience=0, mode='auto'):
        super().__init__()
        self.monitor = monitor
        self.patience = patience
        self.wait = 0
        self.mode = mode
        self.best = -np.inf if self._higher_is_better() else np.inf

    def _higher_is_better(self):
        """Return True when a larger monitored value counts as better."""
        if self.mode == 'max':
            return True
        if self.mode == 'min':
            return False
        # 'auto': decide from the monitor name, matching the initial `best`.
        return 'acc' in self.monitor

    def on_epoch_end(self, epoch, logs=None):
        """Update the patience counter and request a stop when exhausted."""
        logs = logs or {}
        current = logs.get(self.monitor)
        if current is None:
            return

        if self._higher_is_better():
            improved = current > self.best
        else:
            improved = current < self.best

        if improved:
            self.best = current
            self.wait = 0
            return

        self.wait += 1
        if self.wait >= self.patience:
            self.model.stop_training = True
            print(f"Epoch {epoch+1}: early stopping")
@@ -0,0 +1,10 @@
1
+ from .base import Callback
2
+
3
class History(Callback):
    """Record per-epoch log values in ``self.history``.

    ``self.history`` is a dict mapping each log key to the list of values
    seen so far, plus an 'epoch' list of epoch indices.
    """

    def on_train_begin(self, logs=None):
        """Start a fresh history with empty 'loss' and 'epoch' lists."""
        self.history = {'loss': [], 'epoch': []}

    def on_epoch_end(self, epoch, logs=None):
        """Append ``epoch`` and every entry of ``logs`` to the history."""
        # `logs` defaults to None; treat that as "nothing to record".
        logs = logs or {}
        self.history['epoch'].append(epoch)
        for key, value in logs.items():
            self.history.setdefault(key, []).append(value)
@@ -0,0 +1,79 @@
1
+ import numpy as np
2
+ from .base import Callback
3
+
4
class LearningRateScheduler(Callback):
    """
    Set the optimizer's learning rate from a user-supplied schedule.

    Args:
        schedule: callable invoked as ``schedule(epoch, lr)`` with the
            0-based epoch index and the current learning rate (a float);
            its return value becomes the new learning rate.
        verbose: when greater than 0, print a message on every update.
    """

    def __init__(self, schedule, verbose=0):
        super().__init__()
        self.verbose = verbose
        self.schedule = schedule

    def on_epoch_begin(self, epoch, logs=None):
        """Apply the schedule to ``self.model.optimizer.lr``.

        Raises:
            ValueError: if the optimizer has no ``lr`` attribute.
        """
        optimizer = self.model.optimizer
        if not hasattr(optimizer, 'lr'):
            raise ValueError('Optimizer must have a "lr" attribute.')

        current_lr = float(optimizer.lr)
        lr = self.schedule(epoch, current_lr)
        optimizer.lr = lr

        if self.verbose > 0:
            print(f'\nEpoch {epoch + 1}: LearningRateScheduler setting learning rate to {lr}.')
25
+
26
class ReduceLROnPlateau(Callback):
    """
    Reduce learning rate when a metric has stopped improving.

    Args:
        monitor: Key in ``logs`` to watch.
        factor: Multiplier applied to the learning rate on each reduction.
        patience: Number of non-improving epochs before a reduction.
        verbose: When greater than 0, print a message on every reduction.
        mode: 'min' or 'max' to fix the direction explicitly. Any other
            value (including 'auto') minimises monitors whose name contains
            'loss' and maximises all others.
        min_delta: Margin by which the monitor must beat the best value to
            count as an improvement.
        cooldown: Number of epochs after a reduction during which
            non-improving epochs are not counted.
        min_lr: Lower bound for the learning rate.
    """

    def __init__(self, monitor='val_loss', factor=0.1, patience=10, verbose=0, mode='auto', min_delta=1e-4, cooldown=0, min_lr=0):
        super().__init__()
        self.monitor = monitor
        self.factor = factor
        self.patience = patience
        self.verbose = verbose
        self.mode = mode
        self.min_delta = min_delta
        self.cooldown = cooldown
        self.min_lr = min_lr
        self._reset()

    def _minimizes(self):
        """Return True when a smaller monitored value counts as better."""
        if self.mode == 'min':
            return True
        if self.mode == 'max':
            return False
        return 'loss' in self.monitor

    def _reset(self):
        """Reset the counters and the best value for the active direction."""
        self.wait = 0
        # The starting point must match the comparison direction, otherwise
        # no value could ever register as an improvement.
        self.best = np.inf if self._minimizes() else -np.inf
        self.cooldown_counter = 0

    def on_train_begin(self, logs=None):
        """Forget any state left over from a previous run."""
        self._reset()

    def on_epoch_end(self, epoch, logs=None):
        """Track the monitor and shrink the learning rate on a plateau."""
        logs = logs or {}
        current = logs.get(self.monitor)
        if current is None:
            return

        if self.cooldown_counter > 0:
            self.cooldown_counter -= 1
            self.wait = 0

        if self._is_improvement(current, self.best):
            self.best = current
            self.wait = 0
        elif self.cooldown_counter <= 0:
            self.wait += 1
            if self.wait >= self.patience:
                old_lr = float(self.model.optimizer.lr)
                if old_lr > self.min_lr:
                    new_lr = max(old_lr * self.factor, self.min_lr)
                    self.model.optimizer.lr = new_lr
                    if self.verbose > 0:
                        print(f'\nEpoch {epoch + 1}: ReduceLROnPlateau reducing learning rate to {new_lr}.')
                    self.cooldown_counter = self.cooldown
                    self.wait = 0

    def _is_improvement(self, current, best):
        """Return True if ``current`` beats ``best`` by more than ``min_delta``."""
        if self._minimizes():
            return current < best - self.min_delta
        return current > best + self.min_delta
@@ -0,0 +1,44 @@
1
+ import numpy as np
2
+
3
class DataLoader:
    """
    Data loader for batching and shuffling data.

    Args:
        x: Input data (NumPy array).
        y: Target data (NumPy array).
        batch_size: Number of samples per batch.
        shuffle: Whether to shuffle the data at the beginning of each epoch.
        augmenter: Optional object with an ``apply_transform(sample)``
            method; when given, it is applied to every sample of a batch.
    """

    def __init__(self, x, y, batch_size=32, shuffle=True, augmenter=None):
        self.x = x
        self.y = y
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.augmenter = augmenter
        self.indices = np.arange(len(x))
        # Shuffle once up front so the first epoch is randomised too.
        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches, counting a final partial batch."""
        return int(np.ceil(len(self.x) / self.batch_size))

    def on_epoch_end(self):
        """Reshuffle the sample order in place when shuffling is enabled."""
        if self.shuffle:
            np.random.shuffle(self.indices)

    def __getitem__(self, index):
        """Return the ``(batch_x, batch_y)`` pair for batch ``index``."""
        start = index * self.batch_size
        indices = self.indices[start:start + self.batch_size]
        batch_x, batch_y = self.x[indices], self.y[indices]

        if self.augmenter and len(batch_x):
            # Stack the transformed samples rather than writing them into a
            # buffer of the input dtype, so float results from the augmenter
            # are not truncated back to e.g. integer pixels.
            batch_x = np.stack(
                [self.augmenter.apply_transform(sample) for sample in batch_x]
            )

        return batch_x, batch_y

    def __iter__(self):
        """Yield every batch once, then reshuffle for the next pass."""
        for i in range(len(self)):
            yield self[i]
        self.on_epoch_end()
@@ -0,0 +1,13 @@
1
+ from .base import Initializer
2
+ from .constant import Zeros, Ones
3
+ from .random import RandomNormal
4
+ from .glorot import GlorotUniform
5
+ from .he import HeNormal
6
+
7
def get(identifier):
    """Resolve an initializer identifier.

    Args:
        identifier: One of the names 'zeros', 'ones', 'random_normal',
            'glorot_uniform' or 'he_normal', or any other object.

    Returns:
        A new instance of the matching initializer class when
        ``identifier`` is a known name. Anything else (an ``Initializer``
        instance, ``None``, an unrecognised name, ...) is handed back
        unchanged.
    """
    # Only strings can name an initializer; everything else passes through.
    if not isinstance(identifier, str):
        return identifier

    known = {
        'zeros': Zeros,
        'ones': Ones,
        # RandomNormal is exported by this package, so it gets a name too.
        'random_normal': RandomNormal,
        'glorot_uniform': GlorotUniform,
        'he_normal': HeNormal,
    }
    factory = known.get(identifier)
    if factory is None:
        return identifier
    return factory()
@@ -0,0 +1,3 @@
1
class Initializer:
    """Interface for weight initializers.

    Calling the base class raises ``NotImplementedError``; concrete
    initializers override ``__call__``.
    """

    def __call__(self, shape):
        """Produce initial values for ``shape`` (not implemented here)."""
        raise NotImplementedError
@@ -0,0 +1,10 @@
1
+ import numpy as np
2
+ from .base import Initializer
3
+
4
class Zeros(Initializer):
    """Initializer that fills the requested shape with zeros."""

    def __call__(self, shape):
        """Return a float array of zeros with the given ``shape``."""
        return np.zeros(shape, dtype=float)
7
+
8
class Ones(Initializer):
    """Initializer that fills the requested shape with ones."""

    def __call__(self, shape):
        """Return a float array of ones with the given ``shape``."""
        return np.ones(shape, dtype=float)
@@ -0,0 +1,19 @@
1
+ import numpy as np
2
+ from .base import Initializer
3
+
4
class GlorotUniform(Initializer):
    """Uniform initializer with limit ``sqrt(6 / (fan_in + fan_out))``."""

    def __call__(self, shape):
        """Draw values for ``shape`` uniformly from ``[-limit, limit)``."""
        fan_in, fan_out = self._calculate_fan_in_and_fan_out(shape)
        limit = np.sqrt(6 / (fan_in + fan_out))
        return np.random.uniform(low=-limit, high=limit, size=shape)

    def _calculate_fan_in_and_fan_out(self, shape):
        """Return ``(fan_in, fan_out)`` for a weight of the given ``shape``.

        Shapes with fewer than two dimensions use ``shape[0]`` for both;
        two-dimensional shapes use their two entries directly. For higher
        ranks the last two entries are each multiplied by the product of
        all leading dimensions.
        """
        rank = len(shape)
        if rank < 2:
            return shape[0], shape[0]
        if rank == 2:
            return shape[0], shape[1]

        receptive_field_size = np.prod(shape[:-2])
        return shape[-2] * receptive_field_size, shape[-1] * receptive_field_size
@@ -0,0 +1,19 @@
1
+ import numpy as np
2
+ from .base import Initializer
3
+
4
class HeNormal(Initializer):
    """Normal initializer with standard deviation ``sqrt(2 / fan_in)``."""

    def __call__(self, shape):
        """Draw values for ``shape`` from a zero-mean normal distribution."""
        fan_in, _ = self._calculate_fan_in_and_fan_out(shape)
        std = np.sqrt(2 / fan_in)
        return np.random.normal(loc=0, scale=std, size=shape)

    def _calculate_fan_in_and_fan_out(self, shape):
        """Return ``(fan_in, fan_out)`` for a weight of the given ``shape``.

        Shapes with fewer than two dimensions use ``shape[0]`` for both;
        two-dimensional shapes use their two entries directly. For higher
        ranks the last two entries are each multiplied by the product of
        all leading dimensions.
        """
        rank = len(shape)
        if rank < 2:
            return shape[0], shape[0]
        if rank == 2:
            return shape[0], shape[1]

        receptive_field_size = np.prod(shape[:-2])
        return shape[-2] * receptive_field_size, shape[-1] * receptive_field_size
@@ -0,0 +1,9 @@
1
+ import numpy as np
2
+ from .base import Initializer
3
+
4
class RandomNormal(Initializer):
    """Initializer drawing from a normal distribution.

    Args:
        mean: Mean of the distribution.
        stddev: Standard deviation of the distribution.
    """

    def __init__(self, mean=0.0, stddev=0.05):
        self.mean, self.stddev = mean, stddev

    def __call__(self, shape):
        """Return samples of the given ``shape`` from N(mean, stddev)."""
        return np.random.normal(loc=self.mean, scale=self.stddev, size=shape)