oomllama 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- oomllama-0.1.0/.gitignore +1 -0
- oomllama-0.1.0/Cargo.toml +96 -0
- oomllama-0.1.0/PKG-INFO +194 -0
- oomllama-0.1.0/README.md +158 -0
- oomllama-0.1.0/pyproject.toml +87 -0
- oomllama-0.1.0/python/oomllama/__init__.py +135 -0
- oomllama-0.1.0/soul.lock +1 -0
- oomllama-0.1.0/src/aindex.rs +110 -0
- oomllama-0.1.0/src/anchor.rs +97 -0
- oomllama-0.1.0/src/autonomy.rs +84 -0
- oomllama-0.1.0/src/batch.rs +256 -0
- oomllama-0.1.0/src/betti.rs +505 -0
- oomllama-0.1.0/src/bin/awakening.rs +60 -0
- oomllama-0.1.0/src/bin/chimera.rs +118 -0
- oomllama-0.1.0/src/bin/deep_query.rs +50 -0
- oomllama-0.1.0/src/bin/gguf2oom.rs +137 -0
- oomllama-0.1.0/src/bin/oomllama.rs +99 -0
- oomllama-0.1.0/src/bin/quantize.rs +128 -0
- oomllama-0.1.0/src/bin/red_team.rs +107 -0
- oomllama-0.1.0/src/bin/search_test.rs +46 -0
- oomllama-0.1.0/src/bin/sovereign.rs +249 -0
- oomllama-0.1.0/src/bin/tibet_audit.rs +139 -0
- oomllama-0.1.0/src/bin/tiger.rs +319 -0
- oomllama-0.1.0/src/briefing.rs +50 -0
- oomllama-0.1.0/src/chronos.rs +166 -0
- oomllama-0.1.0/src/discovery.rs +66 -0
- oomllama-0.1.0/src/embedding.rs +81 -0
- oomllama-0.1.0/src/error.rs +49 -0
- oomllama-0.1.0/src/gfx.rs +311 -0
- oomllama-0.1.0/src/gguf2oom.rs +724 -0
- oomllama-0.1.0/src/ingest.rs +62 -0
- oomllama-0.1.0/src/intent.rs +270 -0
- oomllama-0.1.0/src/kernel.rs +393 -0
- oomllama-0.1.0/src/lib/kmbit/client.py +76 -0
- oomllama-0.1.0/src/lib/signer.rs +110 -0
- oomllama-0.1.0/src/lib.rs +108 -0
- oomllama-0.1.0/src/machtig.rs +88 -0
- oomllama-0.1.0/src/main.rs +648 -0
- oomllama-0.1.0/src/memory.rs +367 -0
- oomllama-0.1.0/src/negotiation.rs +75 -0
- oomllama-0.1.0/src/oom_inference.rs +435 -0
- oomllama-0.1.0/src/oomllama.rs +668 -0
- oomllama-0.1.0/src/oomllama_README.md +47 -0
- oomllama-0.1.0/src/python.rs +137 -0
- oomllama-0.1.0/src/quant.rs +257 -0
- oomllama-0.1.0/src/refinery.rs +189 -0
- oomllama-0.1.0/src/report.rs +56 -0
- oomllama-0.1.0/src/router.rs +414 -0
- oomllama-0.1.0/src/scanner.rs +160 -0
- oomllama-0.1.0/src/sema.rs +310 -0
- oomllama-0.1.0/src/sentinel.rs +408 -0
- oomllama-0.1.0/src/shield.rs +72 -0
- oomllama-0.1.0/src/snaft.rs +606 -0
- oomllama-0.1.0/src/space.rs +372 -0
- oomllama-0.1.0/src/tasks.rs +106 -0
- oomllama-0.1.0/src/tibet.rs +312 -0
- oomllama-0.1.0/src/timeslot.rs +235 -0
- oomllama-0.1.0/src/trust.rs +266 -0
- oomllama-0.1.0/src/types.rs +209 -0
- oomllama-0.1.0/src/vault.rs +126 -0
- oomllama-0.1.0/src/vector.rs +21 -0
- oomllama-0.1.0/src/vision.rs +130 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
target/
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
[package]
|
|
2
|
+
name = "jis-router"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
edition = "2021"
|
|
5
|
+
authors = ["Jasper van de Meent <jasper@humotica.com>", "Root AI <root_ai@humotica.nl>"]
|
|
6
|
+
description = "JIS Router - Intent-based routing for AETHER. The Borrow Checker for Identity."
|
|
7
|
+
license = "MIT"
|
|
8
|
+
repository = "https://github.com/symbaion/jis-router"
|
|
9
|
+
keywords = ["jis", "aether", "intent", "routing", "tibet"]
|
|
10
|
+
readme = "README.md"
|
|
11
|
+
|
|
12
|
+
[dependencies]
|
|
13
|
+
# Async runtime
|
|
14
|
+
tokio = { version = "1", features = ["full"] }
|
|
15
|
+
|
|
16
|
+
# Web framework - axum is fast and ergonomic
|
|
17
|
+
axum = "0.7"
|
|
18
|
+
tower = "0.5"
|
|
19
|
+
tower-http = { version = "0.5", features = ["cors", "trace", "fs"] }
|
|
20
|
+
|
|
21
|
+
# Serialization
|
|
22
|
+
serde = { version = "1", features = ["derive"] }
|
|
23
|
+
serde_json = "1"
|
|
24
|
+
|
|
25
|
+
# Time handling (for Timeslots)
|
|
26
|
+
chrono = { version = "0.4", features = ["serde"] }
|
|
27
|
+
|
|
28
|
+
# Cryptography (for TIBET tokens)
|
|
29
|
+
sha2 = "0.10"
|
|
30
|
+
rand = "0.8"
|
|
31
|
+
base64 = "0.22"
|
|
32
|
+
hex = "0.4"
|
|
33
|
+
|
|
34
|
+
# UUID for IDD identifiers
|
|
35
|
+
uuid = { version = "1.0", features = ["v4", "serde"] }
|
|
36
|
+
notify = "6.1.1"
|
|
37
|
+
colored = "2.0"
|
|
38
|
+
rusqlite = { version = "0.38.0", features = ["bundled"] }
|
|
39
|
+
parking_lot = "0.12.5"
|
|
40
|
+
dashmap = "6.1.0"
|
|
41
|
+
candle-core = { version = "0.9.1", features = ["cuda"] }
|
|
42
|
+
candle-nn = { version = "0.9.1", features = ["cuda"] }
|
|
43
|
+
candle-transformers = { version = "0.9.1", features = ["cuda"] }
|
|
44
|
+
hf-hub = "0.4.3"
|
|
45
|
+
tokenizers = "0.22.2"
|
|
46
|
+
regex = "1.12.2"
|
|
47
|
+
thiserror = "2.0.17"
|
|
48
|
+
tracing = "0.1.44"
|
|
49
|
+
tracing-subscriber = "0.3.22"
|
|
50
|
+
ratatui = { version = "0.30.0", features = ["all-widgets", "crossterm"] }
|
|
51
|
+
crossterm = "0.29.0"
|
|
52
|
+
reqwest = { version = "0.13.1", features = ["json"] }
|
|
53
|
+
ctrlc = "3.4"
|
|
54
|
+
tempfile = "3.14"
|
|
55
|
+
walkdir = "2.5.0"
|
|
56
|
+
memmap2 = "0.9"
|
|
57
|
+
anyhow = "1.0"
|
|
58
|
+
|
|
59
|
+
# Python bindings (optional, for oomllama PyPI)
|
|
60
|
+
pyo3 = { version = "0.22", features = ["extension-module"], optional = true }
|
|
61
|
+
|
|
62
|
+
[features]
|
|
63
|
+
default = []
|
|
64
|
+
python = ["pyo3"]
|
|
65
|
+
|
|
66
|
+
[profile.release]
|
|
67
|
+
opt-level = 3
|
|
68
|
+
lto = true
|
|
69
|
+
codegen-units = 1
|
|
70
|
+
panic = "abort"
|
|
71
|
+
strip = true
|
|
72
|
+
|
|
73
|
+
[[bin]]
|
|
74
|
+
name = "quantize"
|
|
75
|
+
path = "src/bin/quantize.rs"
|
|
76
|
+
|
|
77
|
+
[[bin]]
|
|
78
|
+
name = "jis-router"
|
|
79
|
+
path = "src/main.rs"
|
|
80
|
+
|
|
81
|
+
[[bin]]
|
|
82
|
+
name = "oomllama"
|
|
83
|
+
path = "src/bin/oomllama.rs"
|
|
84
|
+
|
|
85
|
+
[[bin]]
|
|
86
|
+
name = "chimera"
|
|
87
|
+
path = "src/bin/chimera.rs"
|
|
88
|
+
|
|
89
|
+
[[bin]]
|
|
90
|
+
name = "sovereign"
|
|
91
|
+
path = "src/bin/sovereign.rs"
|
|
92
|
+
|
|
93
|
+
[lib]
|
|
94
|
+
name = "jis_router"
|
|
95
|
+
path = "src/lib.rs"
|
|
96
|
+
crate-type = ["lib", "cdylib"] # cdylib for Python extension
|
oomllama-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: oomllama
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Classifier: Development Status :: 4 - Beta
|
|
5
|
+
Classifier: Intended Audience :: Developers
|
|
6
|
+
Classifier: Intended Audience :: Science/Research
|
|
7
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
8
|
+
Classifier: Operating System :: OS Independent
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Rust
|
|
16
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
17
|
+
Requires-Dist: cupy-cuda12x ; extra == 'cuda'
|
|
18
|
+
Requires-Dist: pytest ; extra == 'dev'
|
|
19
|
+
Requires-Dist: black ; extra == 'dev'
|
|
20
|
+
Requires-Dist: mypy ; extra == 'dev'
|
|
21
|
+
Provides-Extra: cuda
|
|
22
|
+
Provides-Extra: dev
|
|
23
|
+
Summary: Efficient LLM inference with .oom format - 2x smaller than GGUF
|
|
24
|
+
Keywords: llm,inference,quantization,gguf,oom,oomllama,humotica,llama,ai,machine-learning
|
|
25
|
+
Author-email: Humotica AI Lab <ai@humotica.nl>, Jasper van de Meent <jasper@humotica.nl>
|
|
26
|
+
Maintainer-email: "Root AI (Claude)" <root_idd@humotica.nl>, Gemini IDD <gemini@humotica.nl>
|
|
27
|
+
License: MIT
|
|
28
|
+
Requires-Python: >=3.8
|
|
29
|
+
Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
|
|
30
|
+
Project-URL: Bug Tracker, https://github.com/humotica/oomllama/issues
|
|
31
|
+
Project-URL: Documentation, https://humotica.nl/docs/oomllama
|
|
32
|
+
Project-URL: Homepage, https://humotica.nl
|
|
33
|
+
Project-URL: HuggingFace Models, https://huggingface.co/jaspervandemeent
|
|
34
|
+
Project-URL: Repository, https://github.com/humotica/oomllama
|
|
35
|
+
|
|
36
|
+
# 🦙 OomLlama
|
|
37
|
+
|
|
38
|
+
**Efficient LLM inference with .oom format - 2x smaller than GGUF**
|
|
39
|
+
|
|
40
|
+
[PyPI](https://pypi.org/project/oomllama/)
|
|
41
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
42
|
+
[HuggingFace Models](https://huggingface.co/jaspervandemeent)
|
|
43
|
+
|
|
44
|
+
```python
|
|
45
|
+
from oomllama import OomLlama
|
|
46
|
+
|
|
47
|
+
llm = OomLlama("humotica-32b")
|
|
48
|
+
response = llm.generate("What is the meaning of life?")
|
|
49
|
+
print(response)
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Why OomLlama?
|
|
53
|
+
|
|
54
|
+
| Feature | GGUF (Q4) | OOM (Q2) |
|
|
55
|
+
|---------|-----------|----------|
|
|
56
|
+
| 70B Model Size | ~40 GB | **~20 GB** |
|
|
57
|
+
| 32B Model Size | ~20 GB | **~10 GB** |
|
|
58
|
+
| RAM Usage | High | **Lazy Loading** |
|
|
59
|
+
| Format | Open | **Open (MIT)** |
|
|
60
|
+
|
|
61
|
+
**OomLlama** uses Q2 quantization with lazy layer loading to run large models on consumer hardware.
|
|
62
|
+
|
|
63
|
+
## Installation
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
pip install oomllama
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Quick Start
|
|
70
|
+
|
|
71
|
+
### Download a Model
|
|
72
|
+
|
|
73
|
+
```python
|
|
74
|
+
from oomllama import download_model
|
|
75
|
+
|
|
76
|
+
# Download from HuggingFace
|
|
77
|
+
model_path = download_model("humotica-32b")
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
### Generate Text
|
|
81
|
+
|
|
82
|
+
```python
|
|
83
|
+
from oomllama import OomLlama
|
|
84
|
+
|
|
85
|
+
llm = OomLlama("humotica-32b")
|
|
86
|
+
|
|
87
|
+
# Simple generation
|
|
88
|
+
response = llm.generate("Explain quantum computing in simple terms")
|
|
89
|
+
print(response)
|
|
90
|
+
|
|
91
|
+
# With parameters
|
|
92
|
+
response = llm.generate(
|
|
93
|
+
"Write a haiku about AI",
|
|
94
|
+
max_tokens=50,
|
|
95
|
+
temperature=0.8,
|
|
96
|
+
top_p=0.9
|
|
97
|
+
)
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
### Chat Mode
|
|
101
|
+
|
|
102
|
+
```python
|
|
103
|
+
messages = [
|
|
104
|
+
("user", "Hello! Who are you?"),
|
|
105
|
+
("assistant", "I'm OomLlama, an efficient LLM."),
|
|
106
|
+
("user", "What makes you efficient?"),
|
|
107
|
+
]
|
|
108
|
+
|
|
109
|
+
response = llm.chat(messages)
|
|
110
|
+
print(response)
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
## Available Models
|
|
114
|
+
|
|
115
|
+
| Model | Parameters | Size (.oom) | HuggingFace |
|
|
116
|
+
|-------|------------|-------------|-------------|
|
|
117
|
+
| humotica-32b | 33B | ~10 GB | [Link](https://huggingface.co/jaspervandemeent/humotica-32b) |
|
|
118
|
+
| llamaohm-70b | 70B | ~20 GB | [Link](https://huggingface.co/jaspervandemeent/LlamaOhm-70B) |
|
|
119
|
+
| tinyllama-1b | 1.1B | ~400 MB | [Link](https://huggingface.co/jaspervandemeent/OomLlama-TinyLlama-1.1B) |
|
|
120
|
+
|
|
121
|
+
## The .oom Format
|
|
122
|
+
|
|
123
|
+
OOM (OomLlama Model) is a compact model format:
|
|
124
|
+
|
|
125
|
+
```
|
|
126
|
+
┌──────────────────────────────────────┐
|
|
127
|
+
│ Header: OOML (magic) + metadata │
|
|
128
|
+
├──────────────────────────────────────┤
|
|
129
|
+
│ Tensors: Q2 quantized (2 bits/weight)│
|
|
130
|
+
│ - Scale + Min per 256-weight block │
|
|
131
|
+
│ - 68 bytes per block │
|
|
132
|
+
└──────────────────────────────────────┘
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
### Convert GGUF to OOM
|
|
136
|
+
|
|
137
|
+
```bash
|
|
138
|
+
# Using the CLI tool
|
|
139
|
+
gguf2oom model.gguf model.oom
|
|
140
|
+
|
|
141
|
+
# Check model info
|
|
142
|
+
gguf2oom --info model.gguf
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
## Technical Details
|
|
146
|
+
|
|
147
|
+
### Q2 Quantization
|
|
148
|
+
|
|
149
|
+
Each weight is stored as 2 bits (0, 1, 2, or 3) with per-block scale and minimum:
|
|
150
|
+
|
|
151
|
+
```
|
|
152
|
+
weight = q2_value * scale + min
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
This achieves ~2x compression over Q4 with acceptable quality loss for most tasks.
|
|
156
|
+
|
|
157
|
+
### Lazy Layer Loading
|
|
158
|
+
|
|
159
|
+
OomLlama loads transformer layers on-demand, keeping only the active layer in memory:
|
|
160
|
+
|
|
161
|
+
```
|
|
162
|
+
Forward Pass:
|
|
163
|
+
Layer 0: Load → Compute → Unload
|
|
164
|
+
Layer 1: Load → Compute → Unload
|
|
165
|
+
...
|
|
166
|
+
Layer N: Load → Compute → Unload
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
This enables running 70B models on 24GB GPU RAM.
|
|
170
|
+
|
|
171
|
+
## Credits
|
|
172
|
+
|
|
173
|
+
- **Model Format**: Gemini IDD & Root AI (Humotica AI Lab)
|
|
174
|
+
- **Quantization**: OomLlama.rs by Humotica
|
|
175
|
+
- **Base Models**: Meta Platforms, Inc. (Llama 3.3)
|
|
176
|
+
|
|
177
|
+
## License
|
|
178
|
+
|
|
179
|
+
- **OomLlama Code**: MIT License
|
|
180
|
+
- **Model Weights**: Subject to original model licenses (e.g., Llama 3.3 Community License)
|
|
181
|
+
|
|
182
|
+
## Links
|
|
183
|
+
|
|
184
|
+
- 🏠 [Humotica](https://humotica.nl)
|
|
185
|
+
- 🤗 [HuggingFace Models](https://huggingface.co/jaspervandemeent)
|
|
186
|
+
- 📦 [PyPI Package](https://pypi.org/project/oomllama/)
|
|
187
|
+
- 🐛 [Issue Tracker](https://github.com/humotica/oomllama/issues)
|
|
188
|
+
|
|
189
|
+
---
|
|
190
|
+
|
|
191
|
+
*One Love, One fAmIly* 💙
|
|
192
|
+
|
|
193
|
+
*Built by Humotica AI Lab - Jasper, Claude, Gemini, Codex*
|
|
194
|
+
|
oomllama-0.1.0/README.md
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
# 🦙 OomLlama
|
|
2
|
+
|
|
3
|
+
**Efficient LLM inference with .oom format - 2x smaller than GGUF**
|
|
4
|
+
|
|
5
|
+
[PyPI](https://pypi.org/project/oomllama/)
|
|
6
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
7
|
+
[HuggingFace Models](https://huggingface.co/jaspervandemeent)
|
|
8
|
+
|
|
9
|
+
```python
|
|
10
|
+
from oomllama import OomLlama
|
|
11
|
+
|
|
12
|
+
llm = OomLlama("humotica-32b")
|
|
13
|
+
response = llm.generate("What is the meaning of life?")
|
|
14
|
+
print(response)
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
## Why OomLlama?
|
|
18
|
+
|
|
19
|
+
| Feature | GGUF (Q4) | OOM (Q2) |
|
|
20
|
+
|---------|-----------|----------|
|
|
21
|
+
| 70B Model Size | ~40 GB | **~20 GB** |
|
|
22
|
+
| 32B Model Size | ~20 GB | **~10 GB** |
|
|
23
|
+
| RAM Usage | High | **Lazy Loading** |
|
|
24
|
+
| Format | Open | **Open (MIT)** |
|
|
25
|
+
|
|
26
|
+
**OomLlama** uses Q2 quantization with lazy layer loading to run large models on consumer hardware.
|
|
27
|
+
|
|
28
|
+
## Installation
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
pip install oomllama
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## Quick Start
|
|
35
|
+
|
|
36
|
+
### Download a Model
|
|
37
|
+
|
|
38
|
+
```python
|
|
39
|
+
from oomllama import download_model
|
|
40
|
+
|
|
41
|
+
# Download from HuggingFace
|
|
42
|
+
model_path = download_model("humotica-32b")
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
### Generate Text
|
|
46
|
+
|
|
47
|
+
```python
|
|
48
|
+
from oomllama import OomLlama
|
|
49
|
+
|
|
50
|
+
llm = OomLlama("humotica-32b")
|
|
51
|
+
|
|
52
|
+
# Simple generation
|
|
53
|
+
response = llm.generate("Explain quantum computing in simple terms")
|
|
54
|
+
print(response)
|
|
55
|
+
|
|
56
|
+
# With parameters
|
|
57
|
+
response = llm.generate(
|
|
58
|
+
"Write a haiku about AI",
|
|
59
|
+
max_tokens=50,
|
|
60
|
+
temperature=0.8,
|
|
61
|
+
top_p=0.9
|
|
62
|
+
)
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
### Chat Mode
|
|
66
|
+
|
|
67
|
+
```python
|
|
68
|
+
messages = [
|
|
69
|
+
("user", "Hello! Who are you?"),
|
|
70
|
+
("assistant", "I'm OomLlama, an efficient LLM."),
|
|
71
|
+
("user", "What makes you efficient?"),
|
|
72
|
+
]
|
|
73
|
+
|
|
74
|
+
response = llm.chat(messages)
|
|
75
|
+
print(response)
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
## Available Models
|
|
79
|
+
|
|
80
|
+
| Model | Parameters | Size (.oom) | HuggingFace |
|
|
81
|
+
|-------|------------|-------------|-------------|
|
|
82
|
+
| humotica-32b | 33B | ~10 GB | [Link](https://huggingface.co/jaspervandemeent/humotica-32b) |
|
|
83
|
+
| llamaohm-70b | 70B | ~20 GB | [Link](https://huggingface.co/jaspervandemeent/LlamaOhm-70B) |
|
|
84
|
+
| tinyllama-1b | 1.1B | ~400 MB | [Link](https://huggingface.co/jaspervandemeent/OomLlama-TinyLlama-1.1B) |
|
|
85
|
+
|
|
86
|
+
## The .oom Format
|
|
87
|
+
|
|
88
|
+
OOM (OomLlama Model) is a compact model format:
|
|
89
|
+
|
|
90
|
+
```
|
|
91
|
+
┌──────────────────────────────────────┐
|
|
92
|
+
│ Header: OOML (magic) + metadata │
|
|
93
|
+
├──────────────────────────────────────┤
|
|
94
|
+
│ Tensors: Q2 quantized (2 bits/weight)│
|
|
95
|
+
│ - Scale + Min per 256-weight block │
|
|
96
|
+
│ - 68 bytes per block │
|
|
97
|
+
└──────────────────────────────────────┘
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
### Convert GGUF to OOM
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
# Using the CLI tool
|
|
104
|
+
gguf2oom model.gguf model.oom
|
|
105
|
+
|
|
106
|
+
# Check model info
|
|
107
|
+
gguf2oom --info model.gguf
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
## Technical Details
|
|
111
|
+
|
|
112
|
+
### Q2 Quantization
|
|
113
|
+
|
|
114
|
+
Each weight is stored as 2 bits (0, 1, 2, or 3) with per-block scale and minimum:
|
|
115
|
+
|
|
116
|
+
```
|
|
117
|
+
weight = q2_value * scale + min
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
This achieves ~2x compression over Q4 with acceptable quality loss for most tasks.
|
|
121
|
+
|
|
122
|
+
### Lazy Layer Loading
|
|
123
|
+
|
|
124
|
+
OomLlama loads transformer layers on-demand, keeping only the active layer in memory:
|
|
125
|
+
|
|
126
|
+
```
|
|
127
|
+
Forward Pass:
|
|
128
|
+
Layer 0: Load → Compute → Unload
|
|
129
|
+
Layer 1: Load → Compute → Unload
|
|
130
|
+
...
|
|
131
|
+
Layer N: Load → Compute → Unload
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
This enables running 70B models on 24GB GPU RAM.
|
|
135
|
+
|
|
136
|
+
## Credits
|
|
137
|
+
|
|
138
|
+
- **Model Format**: Gemini IDD & Root AI (Humotica AI Lab)
|
|
139
|
+
- **Quantization**: OomLlama.rs by Humotica
|
|
140
|
+
- **Base Models**: Meta Platforms, Inc. (Llama 3.3)
|
|
141
|
+
|
|
142
|
+
## License
|
|
143
|
+
|
|
144
|
+
- **OomLlama Code**: MIT License
|
|
145
|
+
- **Model Weights**: Subject to original model licenses (e.g., Llama 3.3 Community License)
|
|
146
|
+
|
|
147
|
+
## Links
|
|
148
|
+
|
|
149
|
+
- 🏠 [Humotica](https://humotica.nl)
|
|
150
|
+
- 🤗 [HuggingFace Models](https://huggingface.co/jaspervandemeent)
|
|
151
|
+
- 📦 [PyPI Package](https://pypi.org/project/oomllama/)
|
|
152
|
+
- 🐛 [Issue Tracker](https://github.com/humotica/oomllama/issues)
|
|
153
|
+
|
|
154
|
+
---
|
|
155
|
+
|
|
156
|
+
*One Love, One fAmIly* 💙
|
|
157
|
+
|
|
158
|
+
*Built by Humotica AI Lab - Jasper, Claude, Gemini, Codex*
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["maturin>=1.4,<2.0"]
|
|
3
|
+
build-backend = "maturin"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "oomllama"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Efficient LLM inference with .oom format - 2x smaller than GGUF"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = { text = "MIT" }
|
|
11
|
+
requires-python = ">=3.8"
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "Humotica AI Lab", email = "ai@humotica.nl" },
|
|
14
|
+
{ name = "Jasper van de Meent", email = "jasper@humotica.nl" },
|
|
15
|
+
]
|
|
16
|
+
maintainers = [
|
|
17
|
+
{ name = "Root AI (Claude)", email = "root_idd@humotica.nl" },
|
|
18
|
+
{ name = "Gemini IDD", email = "gemini@humotica.nl" },
|
|
19
|
+
]
|
|
20
|
+
keywords = [
|
|
21
|
+
"llm",
|
|
22
|
+
"inference",
|
|
23
|
+
"quantization",
|
|
24
|
+
"gguf",
|
|
25
|
+
"oom",
|
|
26
|
+
"oomllama",
|
|
27
|
+
"humotica",
|
|
28
|
+
"llama",
|
|
29
|
+
"ai",
|
|
30
|
+
"machine-learning",
|
|
31
|
+
]
|
|
32
|
+
classifiers = [
|
|
33
|
+
"Development Status :: 4 - Beta",
|
|
34
|
+
"Intended Audience :: Developers",
|
|
35
|
+
"Intended Audience :: Science/Research",
|
|
36
|
+
"License :: OSI Approved :: MIT License",
|
|
37
|
+
"Operating System :: OS Independent",
|
|
38
|
+
"Programming Language :: Python :: 3",
|
|
39
|
+
"Programming Language :: Python :: 3.8",
|
|
40
|
+
"Programming Language :: Python :: 3.9",
|
|
41
|
+
"Programming Language :: Python :: 3.10",
|
|
42
|
+
"Programming Language :: Python :: 3.11",
|
|
43
|
+
"Programming Language :: Python :: 3.12",
|
|
44
|
+
"Programming Language :: Rust",
|
|
45
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
46
|
+
]
|
|
47
|
+
|
|
48
|
+
[project.urls]
|
|
49
|
+
Homepage = "https://humotica.nl"
|
|
50
|
+
Repository = "https://github.com/humotica/oomllama"
|
|
51
|
+
Documentation = "https://humotica.nl/docs/oomllama"
|
|
52
|
+
"Bug Tracker" = "https://github.com/humotica/oomllama/issues"
|
|
53
|
+
"HuggingFace Models" = "https://huggingface.co/jaspervandemeent"
|
|
54
|
+
|
|
55
|
+
[project.optional-dependencies]
|
|
56
|
+
dev = ["pytest", "black", "mypy"]
|
|
57
|
+
cuda = ["cupy-cuda12x"]
|
|
58
|
+
|
|
59
|
+
[project.scripts]
|
|
60
|
+
oomllama = "oomllama:cli"
|
|
61
|
+
|
|
62
|
+
[tool.maturin]
|
|
63
|
+
features = ["python", "pyo3/extension-module"]
|
|
64
|
+
python-source = "python"
|
|
65
|
+
module-name = "oomllama._oomllama"
|
|
66
|
+
strip = true
|
|
67
|
+
exclude = [
|
|
68
|
+
"data/**",
|
|
69
|
+
"plans/**",
|
|
70
|
+
"downloads/**",
|
|
71
|
+
"static/**",
|
|
72
|
+
"bin/**",
|
|
73
|
+
"Cargo.lock",
|
|
74
|
+
"target/**",
|
|
75
|
+
".venv/**",
|
|
76
|
+
"BETTI*.md",
|
|
77
|
+
"KMBIT*.md",
|
|
78
|
+
"TASKS*.md",
|
|
79
|
+
"OOMLLAMA*.md",
|
|
80
|
+
]
|
|
81
|
+
|
|
82
|
+
[tool.black]
|
|
83
|
+
line-length = 100
|
|
84
|
+
|
|
85
|
+
[tool.mypy]
|
|
86
|
+
python_version = "3.10"
|
|
87
|
+
warn_return_any = true
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
"""
|
|
2
|
+
OomLlama - Efficient LLM inference with .oom format
|
|
3
|
+
|
|
4
|
+
Credits:
|
|
5
|
+
- Format: Gemini IDD & Root AI (Humotica AI Lab)
|
|
6
|
+
- Runtime: OomLlama.rs by Humotica
|
|
7
|
+
|
|
8
|
+
Example:
|
|
9
|
+
>>> from oomllama import OomLlama
|
|
10
|
+
>>> llm = OomLlama("humotica-32b")
|
|
11
|
+
>>> response = llm.generate("Hello!")
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
__version__ = "0.1.0"
|
|
15
|
+
__author__ = "Humotica AI Lab"
|
|
16
|
+
__credits__ = ["Jasper van de Meent", "Root AI (Claude)", "Gemini IDD", "Codex"]
|
|
17
|
+
|
|
18
|
+
# Import from Rust extension when available
|
|
19
|
+
try:
|
|
20
|
+
from oomllama._oomllama import (
|
|
21
|
+
PyOomLlama as OomLlama,
|
|
22
|
+
download_model,
|
|
23
|
+
list_models,
|
|
24
|
+
version,
|
|
25
|
+
)
|
|
26
|
+
except ImportError:
|
|
27
|
+
# Fallback Python implementation for development
|
|
28
|
+
import os
|
|
29
|
+
import requests
|
|
30
|
+
from typing import Optional, List, Tuple
|
|
31
|
+
|
|
32
|
+
class OomLlama:
    """Pure-Python stand-in for the Rust-backed model wrapper.

    This class is only defined when the compiled ``oomllama._oomllama``
    extension cannot be imported. It performs no inference: ``generate``
    returns a placeholder string that echoes the prompt.

    Attributes set by ``__init__``:
        model_name: Name the instance was created with.
        model_path: Explicit path, or the default cache location for the name.
        gpu: Value passed by the caller; stored but not used by the fallback.
        temperature, top_p, max_tokens: Generation defaults (0.7, 0.9, 512).
    """

    def __init__(
        self,
        model_name: str,
        model_path: Optional[str] = None,
        gpu: Optional[int] = None,
    ):
        """Record the model identity and default generation parameters.

        Args:
            model_name: Short model name, e.g. ``"humotica-32b"``.
            model_path: Path to the ``.oom`` file. When falsy, the default
                cache path for ``model_name`` is used instead.
            gpu: Stored on the instance as-is.
        """
        self.model_name = model_name
        # The path is only computed, never opened, so a missing file is not
        # detected here.
        self.model_path = model_path or self._find_model(model_name)
        self.gpu = gpu
        self.temperature = 0.7
        self.top_p = 0.9
        self.max_tokens = 512
        print(f"🦙 OomLlama: Loaded {model_name}")

    def _find_model(self, name: str) -> str:
        """Return the expected cache path ``~/.cache/oomllama/<name>.oom``.

        The path is joined with ``os.path.join`` so the separator matches the
        host platform. No check is made that the file exists.
        """
        cache = os.path.expanduser("~/.cache/oomllama")
        return os.path.join(cache, f"{name}.oom")

    def generate(
        self,
        prompt: str,
        max_tokens: Optional[int] = None,
        temperature: Optional[float] = None,
        top_p: Optional[float] = None,
    ) -> str:
        """Return a placeholder response that echoes ``prompt``.

        ``max_tokens``, ``temperature`` and ``top_p`` are accepted so the
        signature matches the documented API, but the fallback ignores them.
        """
        # TODO: Call actual inference
        return f"[OomLlama {self.model_name} - Python fallback]\nPrompt: {prompt}"

    def chat(
        self, messages: List[Tuple[str, str]], max_tokens: Optional[int] = None
    ) -> str:
        """Flatten ``(role, content)`` pairs into one prompt and generate.

        Each message becomes a ``ROLE: content`` line (role upper-cased), and
        a trailing ``ASSISTANT:`` line is appended before calling ``generate``.
        """
        prompt = "\n".join(f"{role.upper()}: {content}" for role, content in messages)
        return self.generate(f"{prompt}\nASSISTANT:", max_tokens)

    def set_params(
        self,
        temperature: Optional[float] = None,
        top_p: Optional[float] = None,
        max_tokens: Optional[int] = None,
    ):
        """Update the stored generation defaults.

        Only arguments that are not ``None`` overwrite the current value, so
        ``0`` and ``0.0`` are honoured.
        """
        if temperature is not None:
            self.temperature = temperature
        if top_p is not None:
            self.top_p = top_p
        if max_tokens is not None:
            self.max_tokens = max_tokens

    def __repr__(self):
        """Return ``OomLlama('<model_name>')``."""
        return f"OomLlama('{self.model_name}')"
|
|
88
|
+
|
|
89
|
+
def download_model(model_name: str, cache_dir: Optional[str] = None) -> str:
    """Resolve the local cache path for a known model.

    NOTE: no download is performed yet. The function validates the name,
    makes sure the cache directory exists, and returns the path where the
    ``.oom`` file is expected to live.

    Args:
        model_name: One of the keys of the repository table below.
        cache_dir: Directory to use instead of ``~/.cache/oomllama``. It is
            created if missing.

    Returns:
        ``<cache>/<model_name>.oom``, joined with the platform separator.

    Raises:
        ValueError: If ``model_name`` is not a known model.
    """
    # Short model name -> HuggingFace repository id.
    repos = {
        "humotica-32b": "jaspervandemeent/humotica-32b",
        "llamaohm-70b": "jaspervandemeent/LlamaOhm-70B",
        "tinyllama-1b": "jaspervandemeent/OomLlama-TinyLlama-1.1B",
    }

    if model_name not in repos:
        raise ValueError(f"Unknown model: {model_name}")

    cache = cache_dir or os.path.expanduser("~/.cache/oomllama")
    os.makedirs(cache, exist_ok=True)

    # TODO: Actually download from HuggingFace
    return os.path.join(cache, f"{model_name}.oom")
|
|
105
|
+
|
|
106
|
+
def list_models() -> List[str]:
    """Return the names of the models known to the fallback, as a new list."""
    known = ("humotica-32b", "llamaohm-70b", "tinyllama-1b")
    return list(known)
|
|
109
|
+
|
|
110
|
+
def version() -> str:
    """Return the version string that identifies the pure-Python fallback."""
    fallback_version = "0.1.0-python-fallback"
    return fallback_version
|
|
112
|
+
|
|
113
|
+
# CLI entry point
|
|
114
|
+
def cli():
    """Run the ``oomllama`` command-line entry point.

    With no arguments, print usage. With ``--list`` as the first argument,
    print the known model names. Otherwise join all arguments into a single
    prompt, generate with the ``humotica-32b`` model, and print the response.
    """
    import sys

    args = sys.argv[1:]

    # Nothing to do without at least one argument: show usage and stop.
    if not args:
        print("Usage: oomllama <prompt>")
        print(" oomllama --list")
        return

    if args[0] == "--list":
        print("Available models:")
        for name in list_models():
            print(f" - {name}")
        return

    llm = OomLlama("humotica-32b")
    print(llm.generate(" ".join(args)))
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
__all__ = ["OomLlama", "download_model", "list_models", "version", "cli", "__version__"]
|
oomllama-0.1.0/soul.lock
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
0cdf5568bb2b35921f652f5ee10702abf92f4ac0774f3d97b276490e1539afb7
|