quarterbit-0.1.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- quarterbit-0.1.0/LICENSE +53 -0
- quarterbit-0.1.0/PKG-INFO +122 -0
- quarterbit-0.1.0/README.md +89 -0
- quarterbit-0.1.0/pyproject.toml +45 -0
- quarterbit-0.1.0/quarterbit/__init__.py +94 -0
- quarterbit-0.1.0/quarterbit/torch/__init__.py +22 -0
- quarterbit-0.1.0/quarterbit/torch/functional.py +229 -0
- quarterbit-0.1.0/quarterbit/torch/optim.py +728 -0
- quarterbit-0.1.0/quarterbit/torch/utils.py +239 -0
- quarterbit-0.1.0/quarterbit.egg-info/PKG-INFO +122 -0
- quarterbit-0.1.0/quarterbit.egg-info/SOURCES.txt +13 -0
- quarterbit-0.1.0/quarterbit.egg-info/dependency_links.txt +1 -0
- quarterbit-0.1.0/quarterbit.egg-info/requires.txt +7 -0
- quarterbit-0.1.0/quarterbit.egg-info/top_level.txt +1 -0
- quarterbit-0.1.0/setup.cfg +4 -0
quarterbit-0.1.0/LICENSE
ADDED
@@ -0,0 +1,53 @@
QuarterBit Software License Agreement
Copyright (c) 2026 Clouthier Simulation Labs. All rights reserved.

================================================================================
TERMS OF USE
================================================================================

This software is proprietary and offered under a tiered licensing model.

FREE TIER
---------
You may use QuarterBit free of charge for:
- Personal projects
- Academic research (non-commercial)
- Evaluation purposes
- Projects with less than 10 GPU-hours per month

FREE TIER RESTRICTIONS:
- No commercial use without a paid license
- No redistribution of source code
- No removal of license notices
- Attribution required in publications

PAID TIERS
----------
Commercial use requires a paid license:

Pro ($299/month) - Up to 10 GPUs, commercial use permitted
Team ($2,499/month) - Up to 100 GPUs, priority support
Enterprise (Custom) - Unlimited GPUs, custom SLA, on-premise deployment

Contact: info@quarterbit.dev
Website: https://quarterbit.dev/pricing

================================================================================
WARRANTY DISCLAIMER
================================================================================

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

================================================================================
CONTACT
================================================================================

Clouthier Simulation Labs
Email: info@quarterbit.dev
Web: https://quarterbit.dev
quarterbit-0.1.0/PKG-INFO
ADDED
@@ -0,0 +1,122 @@
Metadata-Version: 2.4
Name: quarterbit
Version: 0.1.0
Summary: Precision optimizer for PyTorch - 1,000,000x more accurate than FP32
Author-email: Kyle Clouthier <info@quarterbit.dev>
License: Proprietary - Free tier available, commercial use requires license
Project-URL: Homepage, https://quarterbit.dev
Project-URL: Repository, https://github.com/DigitalMax321/quarterbit
Project-URL: Documentation, https://quarterbit.dev/docs
Keywords: pytorch,optimizer,precision,training,gpu,cuda,adam
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: Intended Audience :: Science/Research
Classifier: License :: Other/Proprietary License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Requires-Python: >=3.8
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: torch>=2.0
Requires-Dist: numpy>=1.20
Provides-Extra: dev
Requires-Dist: pytest; extra == "dev"
Requires-Dist: build; extra == "dev"
Requires-Dist: twine; extra == "dev"
Dynamic: license-file

# QuarterBit

**The Pareto-Optimal Optimizer for PyTorch**

Better precision. Less memory. Faster training. No tradeoffs.

## The Problem

Standard FP32 training loses precision over long runs. Tiny gradient updates get rounded away, causing:
- Stalled convergence in late training
- Wasted GPU hours
- Suboptimal final models

## The Solution

QuarterBit's `CompactEFTAdam` combines **compressed storage** with **EFT (Error-Free Transformation) arithmetic** to achieve:

| Metric | PyTorch Adam | CompactEFTAdam | Improvement |
|--------|--------------|----------------|-------------|
| Precision | Loses 100% of tiny updates | Loses 0% | **1,000,000x** |
| Memory | 16 B/param | 9.25-13.25 B/param | **17-42% savings** |
| Convergence | 41 steps to target | 27 steps | **34% faster** |

## Installation

```bash
pip install quarterbit
```

## Quick Start

```python
from quarterbit.torch import CompactEFTAdam

# Drop-in replacement for torch.optim.Adam
optimizer = CompactEFTAdam(model.parameters(), lr=1e-3)

# Train as usual
for batch in dataloader:
    loss = model(batch)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
```

## Why QuarterBit?

### 1. Precision That Matters
After 500K training steps, standard FP32 loses **100%** of tiny gradient updates. QuarterBit's EFT arithmetic preserves every bit.

### 2. Memory Efficiency
Train larger models on the same GPU. CompactEFTAdam uses compressed FP16+FP4 storage, saving 17-42% memory.

### 3. Faster Convergence
Better precision = faster convergence. Reach your target loss in 34% fewer steps.

### 4. Drop-In Replacement
No code changes needed. Just swap your optimizer.

## Benchmarks

See our [Kaggle notebook](https://www.kaggle.com/code/kyleclouthier/quarterbit-benchmark-v2) for full benchmarks on GPT-2.

## Requirements

- Python 3.8+
- PyTorch 2.0+
- NVIDIA GPU with CUDA support

## Pricing

| Tier | Price | Use Case |
|------|-------|----------|
| **Free** | $0 | Personal, research, evaluation (<10 GPU-hrs/mo) |
| **Pro** | $299/mo | Commercial use, up to 10 GPUs |
| **Team** | $2,499/mo | Up to 100 GPUs, priority support |
| **Enterprise** | Custom | Unlimited GPUs, custom SLA |

See [quarterbit.dev/pricing](https://quarterbit.dev/pricing) for details.

## License

Proprietary - see [LICENSE](LICENSE) for details. Free tier available for non-commercial use.

## Links

- Website: [quarterbit.dev](https://quarterbit.dev)
- GitHub: [github.com/DigitalMax321/quarterbit](https://github.com/DigitalMax321/quarterbit)
- Email: info@quarterbit.dev
quarterbit-0.1.0/README.md
ADDED
@@ -0,0 +1,89 @@
# QuarterBit

**The Pareto-Optimal Optimizer for PyTorch**

Better precision. Less memory. Faster training. No tradeoffs.

## The Problem

Standard FP32 training loses precision over long runs. Tiny gradient updates get rounded away, causing:
- Stalled convergence in late training
- Wasted GPU hours
- Suboptimal final models

## The Solution

QuarterBit's `CompactEFTAdam` combines **compressed storage** with **EFT (Error-Free Transformation) arithmetic** to achieve:

| Metric | PyTorch Adam | CompactEFTAdam | Improvement |
|--------|--------------|----------------|-------------|
| Precision | Loses 100% of tiny updates | Loses 0% | **1,000,000x** |
| Memory | 16 B/param | 9.25-13.25 B/param | **17-42% savings** |
| Convergence | 41 steps to target | 27 steps | **34% faster** |
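
Background for the table: an error-free transformation splits a floating-point addition into the rounded result plus the exact rounding error, so the error can be carried forward instead of discarded. A minimal scalar sketch of the underlying two-sum identity (a textbook illustration of the general technique, not QuarterBit's CUDA kernel):

```python
def two_sum(a: float, b: float):
    """Knuth's two-sum EFT: returns (s, e) where s = fl(a + b)
    and a + b == s + e holds exactly in IEEE arithmetic."""
    s = a + b
    b_virtual = s - a
    a_virtual = s - b_virtual
    return s, (a - a_virtual) + (b - b_virtual)

def compensated_add(acc: float, comp: float, update: float):
    """One compensated step: re-inject the saved error so tiny
    updates are not silently rounded away."""
    return two_sum(acc, update + comp)
```

Applied elementwise to optimizer state on the GPU, the same identity is what lets a compensation buffer preserve updates that plain FP32 addition would round to nothing.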

## Installation

```bash
pip install quarterbit
```

## Quick Start

```python
from quarterbit.torch import CompactEFTAdam

# Drop-in replacement for torch.optim.Adam
optimizer = CompactEFTAdam(model.parameters(), lr=1e-3)

# Train as usual
for batch in dataloader:
    loss = model(batch)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
```

## Why QuarterBit?

### 1. Precision That Matters
After 500K training steps, standard FP32 loses **100%** of tiny gradient updates. QuarterBit's EFT arithmetic preserves every bit.

### 2. Memory Efficiency
Train larger models on the same GPU. CompactEFTAdam uses compressed FP16+FP4 storage, saving 17-42% memory.
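
As a sanity check on the 17-42% range, here is the arithmetic implied by the byte counts quoted in the table above (our own back-of-envelope calculation, using no figures beyond those quoted):

```python
baseline = 16.0                  # B/param for standard FP32 Adam state
for compact in (13.25, 9.25):    # B/param bounds quoted for CompactEFTAdam
    print(f"{compact:>5} B/param -> {1 - compact / baseline:.0%} saved")
# 13.25 B/param -> 17% saved
#  9.25 B/param -> 42% saved
```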

### 3. Faster Convergence
Better precision = faster convergence. Reach your target loss in 34% fewer steps.

### 4. Drop-In Replacement
No code changes needed. Just swap your optimizer.
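
Concretely, a swap looks like this. Note the assumption: the Quick Start only shows `lr`, so the `betas`/`eps` keywords below are presumed, not confirmed, to match `torch.optim.Adam`'s signature:

```python
from quarterbit.torch import CompactEFTAdam

# Before:
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-3,
#                              betas=(0.9, 0.999), eps=1e-8)

# After (same call site, different class):
optimizer = CompactEFTAdam(model.parameters(), lr=1e-3,
                           betas=(0.9, 0.999), eps=1e-8)
```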

## Benchmarks

See our [Kaggle notebook](https://www.kaggle.com/code/kyleclouthier/quarterbit-benchmark-v2) for full benchmarks on GPT-2.

## Requirements

- Python 3.8+
- PyTorch 2.0+
- NVIDIA GPU with CUDA support

## Pricing

| Tier | Price | Use Case |
|------|-------|----------|
| **Free** | $0 | Personal, research, evaluation (<10 GPU-hrs/mo) |
| **Pro** | $299/mo | Commercial use, up to 10 GPUs |
| **Team** | $2,499/mo | Up to 100 GPUs, priority support |
| **Enterprise** | Custom | Unlimited GPUs, custom SLA |

See [quarterbit.dev/pricing](https://quarterbit.dev/pricing) for details.

## License

Proprietary - see [LICENSE](LICENSE) for details. Free tier available for non-commercial use.

## Links

- Website: [quarterbit.dev](https://quarterbit.dev)
- GitHub: [github.com/DigitalMax321/quarterbit](https://github.com/DigitalMax321/quarterbit)
- Email: info@quarterbit.dev
quarterbit-0.1.0/pyproject.toml
ADDED
@@ -0,0 +1,45 @@
[build-system]
requires = ["setuptools>=61.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "quarterbit"
version = "0.1.0"
description = "Precision optimizer for PyTorch - 1,000,000x more accurate than FP32"
readme = "README.md"
license = {text = "Proprietary - Free tier available, commercial use requires license"}
authors = [
    {name = "Kyle Clouthier", email = "info@quarterbit.dev"}
]
keywords = ["pytorch", "optimizer", "precision", "training", "gpu", "cuda", "adam"]
classifiers = [
    "Development Status :: 4 - Beta",
    "Intended Audience :: Developers",
    "Intended Audience :: Science/Research",
    "License :: Other/Proprietary License",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.8",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
    "Topic :: Software Development :: Libraries :: Python Modules",
]
requires-python = ">=3.8"
dependencies = [
    "torch>=2.0",
    "numpy>=1.20",
]

[project.urls]
Homepage = "https://quarterbit.dev"
Repository = "https://github.com/DigitalMax321/quarterbit"
Documentation = "https://quarterbit.dev/docs"

[project.optional-dependencies]
dev = ["pytest", "build", "twine"]

[tool.setuptools.packages.find]
where = ["."]
include = ["quarterbit*"]
quarterbit-0.1.0/quarterbit/__init__.py
ADDED
@@ -0,0 +1,94 @@
"""
QuarterBit
==========

Precision computing library.

Usage:
    from quarterbit.torch import Adam

Copyright (c) 2026 Clouthier Simulation Labs. All rights reserved.
"""

__version__ = "0.1.0"  # matches the package version in pyproject.toml
__author__ = "Kyle Clouthier"
__license__ = "Proprietary"

import os as _os
import hashlib as _hashlib
import platform as _platform

# ============================================================================
# License Validation (stub - implement full validation later)
# ============================================================================

_LICENSE_KEY = _os.environ.get("QUARTERBIT_LICENSE_KEY", "")
_VALIDATED = False
_TIER = None

def _validate_license():
    """Validate the license key and return the tier string (cached)."""
    global _VALIDATED, _TIER

    if _VALIDATED:
        return _TIER

    # Free tier: limited functionality
    if not _LICENSE_KEY:
        _VALIDATED = True
        _TIER = "free"
        return _TIER

    # TODO: Implement full license validation against server
    # For now, any key = pro tier
    _VALIDATED = True
    _TIER = "pro"
    return _TIER

def _check_environment():
    """Check for debugging/reverse engineering attempts."""
    # Basic anti-debug checks
    suspicious = []

    # Check for common debuggers
    if _os.environ.get("PYTHONDEBUG"):
        suspicious.append("debug_mode")

    # Check for an active trace function
    import sys
    if sys.gettrace() is not None:
        suspicious.append("trace_active")

    return suspicious

# Run checks on import
_tier = _validate_license()
_suspicious = _check_environment()

if _suspicious and _tier == "free":
    import warnings
    warnings.warn("QuarterBit: Debug mode detected. Some features disabled.")

# ============================================================================
# Public API
# ============================================================================

def get_version():
    """Get QuarterBit version."""
    return __version__

def get_license_tier():
    """Get current license tier."""
    return _tier

def is_available():
    """Check if QuarterBit GPU backend is available."""
    try:
        from .torch.utils import is_available as _gpu_available
        return _gpu_available()
    except ImportError:
        return False

# Lazy imports for submodules
def __getattr__(name):
    if name == "torch":
        from . import torch
        return torch
    raise AttributeError(f"module 'quarterbit' has no attribute '{name}'")
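
# Usage sketch for the module-level API above (editorial illustration, not a
# line of the packaged file; printed values depend on the environment):
import quarterbit

print(quarterbit.get_version())        # "0.1.0"
print(quarterbit.get_license_tier())   # "free" when QUARTERBIT_LICENSE_KEY is unset
print(quarterbit.is_available())       # False unless the native GPU backend loads
qb_torch = quarterbit.torch            # submodule resolved lazily by __getattr__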
quarterbit-0.1.0/quarterbit/torch/__init__.py
ADDED
@@ -0,0 +1,22 @@
"""
QuarterBit PyTorch Backend
==========================

Precision optimizers for PyTorch.

Usage:
    from quarterbit.torch import Adam
    optimizer = Adam(model.parameters(), lr=1e-3)

Copyright (c) 2026 Clouthier Simulation Labs. All rights reserved.
"""

from .optim import SGD, Adam, AdamW, CompactAdam, CompactEFTAdam
from .functional import eft_matmul, eft_sum, eft_accumulate
from .utils import get_backend_info, is_available

__all__ = [
    'SGD', 'Adam', 'AdamW', 'CompactAdam', 'CompactEFTAdam',
    'eft_matmul', 'eft_sum', 'eft_accumulate',
    'get_backend_info', 'is_available'
]
quarterbit-0.1.0/quarterbit/torch/functional.py
ADDED
@@ -0,0 +1,229 @@
"""
QuarterBit Functional Operations
================================

Copyright (c) 2026 Clouthier Simulation Labs. All rights reserved.
"""

import torch
import ctypes
from .utils import get_lib, is_available

def _ptr(tensor):
    """Get ctypes pointer to tensor data."""
    if tensor is None:
        return None
    return ctypes.cast(tensor.data_ptr(), ctypes.POINTER(ctypes.c_float))


def eft_accumulate(dst: torch.Tensor, src: torch.Tensor, comp: torch.Tensor = None):
    """
    Accumulate src into dst with EFT precision: dst += src

    If comp (compensation buffer) is not provided, creates one.
    Returns the compensation buffer for continued accumulation.

    Args:
        dst: Destination tensor (modified in-place)
        src: Source tensor to add
        comp: Compensation buffer (same shape as dst)

    Returns:
        comp: Compensation buffer

    Example:
        >>> dst = torch.zeros(1000).cuda()
        >>> comp = None
        >>> for batch in data:
        ...     comp = eft_accumulate(dst, batch, comp)
    """
    if not is_available():
        # Fallback: standard addition
        dst.add_(src)
        return torch.zeros_like(dst) if comp is None else comp

    if comp is None:
        comp = torch.zeros_like(dst)

    if not (dst.is_cuda and src.is_cuda and dst.dtype == torch.float32):
        # Fallback for non-CUDA or non-float32
        dst.add_(src)
        return comp

    lib = get_lib()
    n = dst.numel()

    lib.eft_accumulate(
        _ptr(dst), _ptr(comp), _ptr(src),
        ctypes.c_int(n)
    )

    return comp
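
# Reference sketch (editorial, not part of the packaged file): the update the
# `lib.eft_accumulate` kernel is documented to perform, reconstructed in plain
# PyTorch from the two-sum identity. comp must start as torch.zeros_like(dst).
def _eft_accumulate_reference(dst, src, comp):
    y = src + comp                     # re-inject previously captured error
    s = dst + y                        # rounded sum
    b_virtual = s - dst
    a_virtual = s - b_virtual
    comp.copy_((dst - a_virtual) + (y - b_virtual))  # exact rounding error
    dst.copy_(s)
    return comp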


def eft_accumulate_scaled(dst: torch.Tensor, src: torch.Tensor,
                          scale: float, comp: torch.Tensor = None):
    """
    Scaled accumulation with EFT: dst += scale * src

    Args:
        dst: Destination tensor
        src: Source tensor
        scale: Scaling factor
        comp: Compensation buffer

    Returns:
        comp: Compensation buffer
    """
    if not is_available():
        dst.add_(src, alpha=scale)
        return torch.zeros_like(dst) if comp is None else comp

    if comp is None:
        comp = torch.zeros_like(dst)

    if not (dst.is_cuda and src.is_cuda and dst.dtype == torch.float32):
        dst.add_(src, alpha=scale)
        return comp

    lib = get_lib()
    n = dst.numel()

    lib.eft_accumulate_scaled(
        _ptr(dst), _ptr(comp), _ptr(src),
        ctypes.c_float(scale),
        ctypes.c_int(n)
    )

    return comp


def eft_sum(tensor: torch.Tensor) -> torch.Tensor:
    """
    Sum all elements with EFT precision.

    More accurate than tensor.sum() for large tensors.

    Args:
        tensor: Input tensor

    Returns:
        Scalar tensor with sum

    Example:
        >>> x = torch.randn(1000000).cuda()
        >>> standard_sum = x.sum()
        >>> eft_precise_sum = eft_sum(x)
    """
    if not is_available() or not tensor.is_cuda or tensor.dtype != torch.float32:
        return tensor.sum()

    lib = get_lib()
    n = tensor.numel()

    output = torch.zeros(1, device=tensor.device, dtype=torch.float32)
    output_comp = torch.zeros(1, device=tensor.device, dtype=torch.float32)

    # Flatten for contiguous access
    flat = tensor.contiguous().view(-1)

    lib.eft_reduce_sum(
        _ptr(flat), _ptr(output), _ptr(output_comp),
        ctypes.c_int(n)
    )

    # Final compensation
    return output + output_comp
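
# Demonstration (editorial, not part of the packaged file): the failure mode
# a compensated reduction addresses. In float32 the ulp at 1e8 is 8, so each
# +1.0 below is rounded away; math.fsum stands in for an exact summation.
def _naive_vs_compensated_demo():
    import math
    import numpy as np
    acc = np.float32(1.0e8)
    for _ in range(1000):
        acc = np.float32(acc + np.float32(1.0))   # stays exactly 1.0e8
    exact = math.fsum([1.0e8] + [1.0] * 1000)     # 100001000.0
    return float(acc), exact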


def eft_matmul(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    """
    Matrix multiplication with EFT precision: C = A @ B

    More accurate than torch.mm for ill-conditioned matrices.

    Args:
        a: First matrix [M, K]
        b: Second matrix [K, N]

    Returns:
        Result matrix [M, N]

    Example:
        >>> A = torch.randn(1000, 500).cuda()
        >>> B = torch.randn(500, 1000).cuda()
        >>> C = eft_matmul(A, B)  # More precise than A @ B
    """
    if not is_available():
        return torch.mm(a, b)

    if not (a.is_cuda and b.is_cuda and a.dtype == torch.float32 and b.dtype == torch.float32):
        return torch.mm(a, b)

    if a.dim() != 2 or b.dim() != 2:
        raise ValueError("eft_matmul requires 2D tensors")

    M, K = a.shape
    K2, N = b.shape
    if K != K2:
        raise ValueError(f"Matrix dimensions don't match: {a.shape} @ {b.shape}")

    lib = get_lib()

    c = torch.empty(M, N, device=a.device, dtype=torch.float32)

    # Ensure contiguous
    a_contig = a.contiguous()
    b_contig = b.contiguous()

    lib.eft_matmul(
        _ptr(a_contig), _ptr(b_contig), _ptr(c),
        ctypes.c_int(M), ctypes.c_int(K), ctypes.c_int(N)
    )

    return c
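
# Accuracy-check sketch (editorial, not part of the packaged file): compare
# both float32 products against a float64 reference. Assumes a CUDA device
# and the native backend, so eft_matmul does not fall back to torch.mm.
def _matmul_error_check():
    a = torch.randn(1000, 500, device="cuda")
    b = torch.randn(500, 1000, device="cuda")
    ref = (a.double() @ b.double()).float()        # float64 ground truth
    err_mm = (torch.mm(a, b) - ref).abs().max()
    err_eft = (eft_matmul(a, b) - ref).abs().max()
    return err_mm.item(), err_eft.item()           # expect err_eft <= err_mm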


def eft_dot(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    """
    Dot product with EFT precision.

    Args:
        a: First vector
        b: Second vector

    Returns:
        Scalar tensor with dot product
    """
    return eft_sum(a * b)


def eft_mean(tensor: torch.Tensor) -> torch.Tensor:
    """
    Mean with EFT precision.

    Args:
        tensor: Input tensor

    Returns:
        Scalar tensor with mean
    """
    return eft_sum(tensor) / tensor.numel()


def eft_var(tensor: torch.Tensor, unbiased: bool = True) -> torch.Tensor:
    """
    Variance with EFT precision.

    Args:
        tensor: Input tensor
        unbiased: Use Bessel's correction (N-1 denominator)

    Returns:
        Scalar tensor with variance
    """
    mean = eft_mean(tensor)
    diff = tensor - mean
    ss = eft_sum(diff * diff)
    n = tensor.numel()
    return ss / (n - 1 if unbiased else n)
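
# Usage sketch (editorial, not part of the packaged file): eft_var composes
# the compensated reduction above into the numerically friendly two-pass
# formula (subtract the mean, then sum squares). A large common offset is the
# classic stress test for variance computations.
def _stats_comparison():
    x = torch.randn(1_000_000, device="cuda") + 1.0e4
    return (eft_mean(x).item(), x.mean().item(),
            eft_var(x).item(), x.var().item())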
|