kandiga 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kandiga-0.1.0/LICENSE +21 -0
- kandiga-0.1.0/PKG-INFO +17 -0
- kandiga-0.1.0/README.md +153 -0
- kandiga-0.1.0/kandiga/__init__.py +2 -0
- kandiga-0.1.0/kandiga/__main__.py +4 -0
- kandiga-0.1.0/kandiga/_build.py +48 -0
- kandiga-0.1.0/kandiga/_pack_experts.py +167 -0
- kandiga-0.1.0/kandiga/_split_experts.py +120 -0
- kandiga-0.1.0/kandiga/bench.py +104 -0
- kandiga-0.1.0/kandiga/chat.py +160 -0
- kandiga-0.1.0/kandiga/cli.py +107 -0
- kandiga-0.1.0/kandiga/engine.py +450 -0
- kandiga-0.1.0/kandiga/metal/Makefile +16 -0
- kandiga-0.1.0/kandiga/metal/kandiga_cpu_expert.h +102 -0
- kandiga-0.1.0/kandiga/metal/kandiga_cpu_expert.m +446 -0
- kandiga-0.1.0/kandiga/metal/libkandiga_cpu_expert.dylib +0 -0
- kandiga-0.1.0/kandiga/serve.py +169 -0
- kandiga-0.1.0/kandiga/setup.py +117 -0
- kandiga-0.1.0/kandiga/tools/__init__.py +1 -0
- kandiga-0.1.0/kandiga.egg-info/PKG-INFO +17 -0
- kandiga-0.1.0/kandiga.egg-info/SOURCES.txt +31 -0
- kandiga-0.1.0/kandiga.egg-info/dependency_links.txt +1 -0
- kandiga-0.1.0/kandiga.egg-info/entry_points.txt +2 -0
- kandiga-0.1.0/kandiga.egg-info/requires.txt +11 -0
- kandiga-0.1.0/kandiga.egg-info/top_level.txt +1 -0
- kandiga-0.1.0/pyproject.toml +32 -0
- kandiga-0.1.0/setup.cfg +27 -0
- kandiga-0.1.0/setup.py +3 -0
- kandiga-0.1.0/tests/__init__.py +0 -0
- kandiga-0.1.0/tests/test_cli.py +66 -0
- kandiga-0.1.0/tests/test_engine.py +98 -0
- kandiga-0.1.0/tests/test_pack.py +49 -0
kandiga-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Kandiga Contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
kandiga-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: kandiga
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Run 35B AI models in 1.5GB of RAM. Any Mac.
|
|
5
|
+
License: MIT
|
|
6
|
+
Requires-Python: >=3.9
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Requires-Dist: mlx>=0.30.0
|
|
9
|
+
Requires-Dist: mlx-lm>=0.24.0
|
|
10
|
+
Requires-Dist: numpy
|
|
11
|
+
Requires-Dist: rich
|
|
12
|
+
Provides-Extra: serve
|
|
13
|
+
Requires-Dist: fastapi; extra == "serve"
|
|
14
|
+
Requires-Dist: uvicorn[standard]; extra == "serve"
|
|
15
|
+
Provides-Extra: tools
|
|
16
|
+
Requires-Dist: duckduckgo-search; extra == "tools"
|
|
17
|
+
Dynamic: license-file
|
kandiga-0.1.0/README.md
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
# Kandiga
|
|
2
|
+
|
|
3
|
+
Run 35B AI models in 1.5GB of RAM. Any Mac.
|
|
4
|
+
|
|
5
|
+
Kandiga is an open-source MoE inference engine that uses **Selective Expert Materialization** to run models that would normally require 20GB+ of memory in under 2GB on any Apple Silicon Mac.
|
|
6
|
+
|
|
7
|
+
## How it works
|
|
8
|
+
|
|
9
|
+
Large MoE (Mixture of Experts) models like Qwen3.5-35B-A3B have 256 experts per layer, but only activate 8 per token. Kandiga exploits this sparsity:
|
|
10
|
+
|
|
11
|
+
1. **Shared layers** (attention, norms, embeddings) load to GPU memory (~1.5GB)
|
|
12
|
+
2. **Expert MLP weights** stay on disk in packed binary files (~17GB SSD)
|
|
13
|
+
3. **Per token**: the router selects 8 experts, which are read from SSD via `pread`
|
|
14
|
+
4. **CPU computes** expert MLP with NEON-vectorized 4-bit dequant + GCD parallelism
|
|
15
|
+
5. **GPU computes** attention simultaneously via MLX (unified memory, zero copy)
|
|
16
|
+
|
|
17
|
+
This is the [KTransformers](https://github.com/kvcache-ai/ktransformers) architecture adapted for Apple Silicon's unified memory.
|
|
18
|
+
|
|
19
|
+
## Install
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
pip install kandiga
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
Requirements: macOS with Apple Silicon (M1/M2/M3/M4), Python 3.10+
|
|
26
|
+
|
|
27
|
+
## Quick start
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
# One-time setup: download model + prepare expert files (~20 min)
|
|
31
|
+
kandiga setup
|
|
32
|
+
|
|
33
|
+
# Interactive chat
|
|
34
|
+
kandiga chat
|
|
35
|
+
|
|
36
|
+
# Fast mode (K=4 experts instead of 8, ~2x speed, slightly less quality)
|
|
37
|
+
kandiga chat --fast
|
|
38
|
+
|
|
39
|
+
# One-shot prompt
|
|
40
|
+
kandiga "What is the capital of France?"
|
|
41
|
+
|
|
42
|
+
# Start an OpenAI-compatible API server
|
|
43
|
+
kandiga serve
|
|
44
|
+
|
|
45
|
+
# Run benchmarks
|
|
46
|
+
kandiga bench
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## Benchmarks
|
|
50
|
+
|
|
51
|
+
Measured on M4 Mac Mini (16GB), Qwen3.5-35B-A3B-4bit:
|
|
52
|
+
|
|
53
|
+
| Mode | Experts | Speed | RAM | Quality |
|
|
54
|
+
|------|---------|-------|-----|---------|
|
|
55
|
+
| Quality (K=8) | 8/256 per layer | ~3.5 tok/s | 1.5GB | Full |
|
|
56
|
+
| Fast (K=4) | 4/256 per layer | ~6.5 tok/s | 1.5GB | Near-equal |
|
|
57
|
+
|
|
58
|
+
For comparison, loading the full model requires 20.4GB of RAM, and MLX alone achieves ~25 tok/s when the whole model fits in memory. Kandiga trades speed for accessibility: if your Mac has 8-16GB of RAM, you can now run a 35B model that previously required 24GB+.
|
|
59
|
+
|
|
60
|
+
## Architecture
|
|
61
|
+
|
|
62
|
+
```
|
|
63
|
+
User prompt
|
|
64
|
+
|
|
|
65
|
+
v
|
|
66
|
+
[Tokenizer + Chat Template]
|
|
67
|
+
|
|
|
68
|
+
v
|
|
69
|
+
[MLX Forward Pass]
|
|
70
|
+
|
|
|
71
|
+
+---> GPU: Attention + Norms + Router + Shared Expert + Blending
|
|
72
|
+
|
|
|
73
|
+
+---> CPU: Routed Expert MLP (NEON 4-bit dequant + GCD parallel)
|
|
74
|
+
| |
|
|
75
|
+
| +-- pread expert weights from SSD (OS page cache)
|
|
76
|
+
| +-- gate_proj matvec (512x2048)
|
|
77
|
+
| +-- up_proj matvec (512x2048)
|
|
78
|
+
| +-- SwiGLU activation
|
|
79
|
+
| +-- down_proj matvec (2048x512)
|
|
80
|
+
|
|
|
81
|
+
v
|
|
82
|
+
[Token Output]
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
Both CPU and GPU operate on the same physical DRAM (Apple Silicon unified memory), so there is zero data transfer overhead between them.
|
|
86
|
+
|
|
87
|
+
## API Server
|
|
88
|
+
|
|
89
|
+
Kandiga includes an OpenAI-compatible HTTP API:
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
kandiga serve --port 8340
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
```python
|
|
96
|
+
import openai
|
|
97
|
+
|
|
98
|
+
client = openai.OpenAI(base_url="http://localhost:8340/v1", api_key="unused")
|
|
99
|
+
response = client.chat.completions.create(
|
|
100
|
+
model="mlx-community/Qwen3.5-35B-A3B-4bit",
|
|
101
|
+
messages=[{"role": "user", "content": "Hello!"}],
|
|
102
|
+
stream=True,
|
|
103
|
+
)
|
|
104
|
+
for chunk in response:
|
|
105
|
+
print(chunk.choices[0].delta.content or "", end="")
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
## Project structure
|
|
109
|
+
|
|
110
|
+
```
|
|
111
|
+
kandiga/
|
|
112
|
+
__init__.py # Package version
|
|
113
|
+
cli.py # CLI entry point (argparse)
|
|
114
|
+
engine.py # Core inference engine (SEM)
|
|
115
|
+
chat.py # Interactive chat (Rich terminal UI)
|
|
116
|
+
serve.py # OpenAI-compatible HTTP API (FastAPI)
|
|
117
|
+
bench.py # Benchmarking suite
|
|
118
|
+
setup.py # Model download + expert splitting + packing
|
|
119
|
+
_split_experts.py # Split stacked weights into per-expert files
|
|
120
|
+
_pack_experts.py # Pack per-expert files into binary format
|
|
121
|
+
_build.py # Compile CPU expert dylib from source
|
|
122
|
+
metal/
|
|
123
|
+
kandiga_cpu_expert.h # C API header
|
|
124
|
+
kandiga_cpu_expert.m # NEON + GCD implementation
|
|
125
|
+
Makefile # Build the dylib
|
|
126
|
+
tools/
|
|
127
|
+
__init__.py # Future: web search, file access
|
|
128
|
+
scripts/
|
|
129
|
+
install.sh # Quick install script
|
|
130
|
+
tests/
|
|
131
|
+
...
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
## Development
|
|
135
|
+
|
|
136
|
+
```bash
|
|
137
|
+
# Clone
|
|
138
|
+
git clone https://github.com/yourusername/kandiga.git
|
|
139
|
+
cd kandiga
|
|
140
|
+
|
|
141
|
+
# Install in development mode
|
|
142
|
+
pip install -e ".[serve]"
|
|
143
|
+
|
|
144
|
+
# Build the CPU expert library
|
|
145
|
+
cd kandiga/metal && make && cd ../..
|
|
146
|
+
|
|
147
|
+
# Run tests
|
|
148
|
+
pytest tests/ -v
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
## License
|
|
152
|
+
|
|
153
|
+
MIT
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
"""Build the CPU expert dylib from source."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
import subprocess
|
|
7
|
+
import sys
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def build_cpu_expert_dylib() -> str:
    """Compile libkandiga_cpu_expert.dylib from the Objective-C source.

    Both the source file and the produced dylib live in the ``metal``
    directory that sits next to this module.

    Returns:
        The path to the built dylib.

    Raises:
        FileNotFoundError: If the Objective-C source file is missing, or if
            the ``clang`` executable cannot be found.
        RuntimeError: If clang exits with a non-zero status, or if it reports
            success but no dylib exists at the expected path afterwards.
    """
    metal_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "metal")
    source = os.path.join(metal_dir, "kandiga_cpu_expert.m")
    dylib = os.path.join(metal_dir, "libkandiga_cpu_expert.dylib")

    if not os.path.exists(source):
        raise FileNotFoundError(f"Source file not found: {source}")

    # Build command, passed as an argument list (no shell involved).
    cmd = [
        "clang",
        "-shared",
        "-o", dylib,
        source,
        "-fobjc-arc",
        "-framework", "Foundation",
        "-O2",
        "-march=native",
    ]

    print(f" Building: {' '.join(cmd)}")
    try:
        result = subprocess.run(cmd, capture_output=True, text=True)
    except FileNotFoundError as exc:
        # subprocess raises FileNotFoundError when the executable itself is
        # missing; keep the exception type but say what to do about it.
        raise FileNotFoundError(
            "clang not found on PATH; install the Xcode Command Line Tools "
            "(xcode-select --install) and retry"
        ) from exc

    if result.returncode != 0:
        raise RuntimeError(
            f"Compilation failed (exit {result.returncode}):\n"
            f"stdout: {result.stdout}\n"
            f"stderr: {result.stderr}"
        )

    if not os.path.exists(dylib):
        raise RuntimeError(f"Build succeeded but dylib not found at {dylib}")

    return dylib
|
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
"""Pack per-expert safetensors into single raw binary files per layer.
|
|
2
|
+
|
|
3
|
+
Reads the split per-expert safetensors files and packs them into a compact
|
|
4
|
+
binary format that can be read with zero parsing overhead using pread().
|
|
5
|
+
|
|
6
|
+
Binary format per file:
|
|
7
|
+
Header (4096 bytes):
|
|
8
|
+
magic: 4 bytes "BKEX"
|
|
9
|
+
version: uint32 1
|
|
10
|
+
num_experts: uint32 256
|
|
11
|
+
expert_size: uint64 1769472 (bytes per expert block)
|
|
12
|
+
num_tensors: uint32 9
|
|
13
|
+
tensor descriptors...
|
|
14
|
+
padding to 4096 bytes
|
|
15
|
+
|
|
16
|
+
Expert data (256 x 1769472 bytes):
|
|
17
|
+
expert_000: [gate.weight][gate.scales][gate.biases]
|
|
18
|
+
[up.weight][up.scales][up.biases]
|
|
19
|
+
[down.weight][down.scales][down.biases]
|
|
20
|
+
...
|
|
21
|
+
expert_255: same layout
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from __future__ import annotations
|
|
25
|
+
|
|
26
|
+
import os
|
|
27
|
+
import struct
|
|
28
|
+
import time
|
|
29
|
+
|
|
30
|
+
import mlx.core as mx
|
|
31
|
+
import numpy as np
|
|
32
|
+
|
|
33
|
+
NUM_EXPERTS = 256
|
|
34
|
+
NUM_MOE_LAYERS = 40
|
|
35
|
+
HEADER_SIZE = 4096
|
|
36
|
+
EXPERT_SIZE = 1_769_472 # 1728KB exactly
|
|
37
|
+
|
|
38
|
+
# Tensor order must match the C library's byte offsets
|
|
39
|
+
TENSOR_ORDER = [
|
|
40
|
+
("gate_proj.weight", (512, 256), "uint32"),
|
|
41
|
+
("gate_proj.scales", (512, 32), "bfloat16"),
|
|
42
|
+
("gate_proj.biases", (512, 32), "bfloat16"),
|
|
43
|
+
("up_proj.weight", (512, 256), "uint32"),
|
|
44
|
+
("up_proj.scales", (512, 32), "bfloat16"),
|
|
45
|
+
("up_proj.biases", (512, 32), "bfloat16"),
|
|
46
|
+
("down_proj.weight", (2048, 64), "uint32"),
|
|
47
|
+
("down_proj.scales", (2048, 8), "bfloat16"),
|
|
48
|
+
("down_proj.biases", (2048, 8), "bfloat16"),
|
|
49
|
+
]
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _tensor_nbytes(shape: tuple, dtype_str: str) -> int:
|
|
53
|
+
"""Calculate raw byte size for a tensor."""
|
|
54
|
+
itemsize = 4 if dtype_str == "uint32" else 2
|
|
55
|
+
return shape[0] * shape[1] * itemsize
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _build_header() -> bytes:
    """Build the 4096-byte binary header.

    Layout (all integers little-endian):
        bytes 0-3    magic ``b"BKEX"``
        bytes 4-7    uint32 format version (1)
        bytes 8-11   uint32 NUM_EXPERTS
        bytes 12-19  uint64 EXPERT_SIZE (bytes per expert block)
        bytes 20-23  uint32 number of tensor descriptors
        bytes 24-    one 42-byte descriptor per TENSOR_ORDER entry:
                     uint8 name length, 24-byte zero-padded ASCII name,
                     uint32 offset inside the expert block, uint32 byte size,
                     uint32 shape[0], uint32 shape[1], uint8 dtype code
                     (0 for "uint32", 1 otherwise)
    Everything after the last descriptor stays zero.

    Returns:
        The header as an immutable ``bytes`` object of HEADER_SIZE bytes.

    Raises:
        ValueError: If a tensor name does not fit the 24-byte name field.
        AssertionError: If the tensor sizes do not add up to EXPERT_SIZE.
    """
    name_field_size = 24
    buf = bytearray(HEADER_SIZE)

    buf[0:4] = b"BKEX"
    struct.pack_into("<I", buf, 4, 1)  # version
    struct.pack_into("<I", buf, 8, NUM_EXPERTS)
    struct.pack_into("<Q", buf, 12, EXPERT_SIZE)
    struct.pack_into("<I", buf, 20, len(TENSOR_ORDER))

    offset_in_expert = 0
    pos = 24
    for name, shape, dtype_str in TENSOR_ORDER:
        nbytes = _tensor_nbytes(shape, dtype_str)
        dtype_code = 0 if dtype_str == "uint32" else 1

        name_bytes = name.encode("ascii")
        if len(name_bytes) > name_field_size:
            # A longer name would spill into the offset/size fields that
            # follow and silently corrupt the descriptor.
            raise ValueError(
                f"Tensor name {name!r} exceeds the {name_field_size}-byte name field"
            )
        struct.pack_into("<B", buf, pos, len(name_bytes))
        pos += 1
        buf[pos:pos + len(name_bytes)] = name_bytes
        pos += name_field_size  # fixed-width field; unused tail stays zero
        struct.pack_into("<I", buf, pos, offset_in_expert)
        pos += 4
        struct.pack_into("<I", buf, pos, nbytes)
        pos += 4
        struct.pack_into("<I", buf, pos, shape[0])
        pos += 4
        struct.pack_into("<I", buf, pos, shape[1])
        pos += 4
        struct.pack_into("<B", buf, pos, dtype_code)
        pos += 1

        offset_in_expert += nbytes

    # Raised explicitly (not via ``assert``) so the check survives ``python -O``.
    if offset_in_expert != EXPERT_SIZE:
        raise AssertionError(
            f"Tensor sizes don't sum to EXPERT_SIZE: {offset_in_expert} != {EXPERT_SIZE}"
        )
    return bytes(buf)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _expert_to_bytes(tensors: dict[str, mx.array]) -> bytes:
    """Serialize one expert's tensors to raw bytes, following TENSOR_ORDER.

    Args:
        tensors: Mapping from tensor name (as listed in TENSOR_ORDER) to the
            corresponding MLX array.

    Returns:
        The concatenated raw bytes of every tensor, EXPERT_SIZE bytes long.

    Raises:
        KeyError: If a name from TENSOR_ORDER is absent from ``tensors``.
        AssertionError: If any tensor, or the total, has an unexpected size.
    """
    chunks = []
    for name, shape, dtype_str in TENSOR_ORDER:
        source = tensors[name]
        mx.eval(source)

        if dtype_str != "uint32":
            # Non-uint32 entries are reinterpreted as uint16 before the NumPy
            # conversion (presumably because NumPy lacks a bfloat16 dtype --
            # TODO confirm); the underlying bits are left untouched.
            source = source.view(mx.uint16)
            mx.eval(source)
        raw = np.array(source, copy=False).tobytes()

        expected = _tensor_nbytes(shape, dtype_str)
        assert len(raw) == expected, f"{name}: got {len(raw)} bytes, expected {expected}"
        chunks.append(raw)

    data = b"".join(chunks)
    assert len(data) == EXPERT_SIZE, f"Expert data {len(data)} != {EXPERT_SIZE}"
    return data
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def _pack_layer(layer_idx: int, input_dir: str, output_dir: str) -> None:
    """Pack all NUM_EXPERTS experts for one layer into a single binary file.

    Reads ``<input_dir>/layer_XX/expert_NNN.safetensors`` for every expert and
    writes ``<output_dir>/layer_XX.bin``: the header followed by each
    expert's raw bytes in index order.

    The data is first written to ``layer_XX.bin.tmp`` and only renamed to the
    final name once its size has been verified, so an interrupted or failed
    run never leaves a truncated ``.bin`` file at the final path.

    Args:
        layer_idx: Index of the layer to pack.
        input_dir: Directory holding the per-layer split expert folders.
        output_dir: Existing directory that receives the packed file.

    Raises:
        AssertionError: If the written file does not have the expected size.
    """
    layer_dir = os.path.join(input_dir, f"layer_{layer_idx:02d}")
    out_path = os.path.join(output_dir, f"layer_{layer_idx:02d}.bin")
    tmp_path = out_path + ".tmp"

    header = _build_header()
    expected_size = HEADER_SIZE + NUM_EXPERTS * EXPERT_SIZE

    try:
        with open(tmp_path, "wb") as f:
            f.write(header)
            for expert_idx in range(NUM_EXPERTS):
                st_path = os.path.join(layer_dir, f"expert_{expert_idx:03d}.safetensors")
                tensors = mx.load(st_path)
                mx.eval(*tensors.values())
                raw = _expert_to_bytes(tensors)
                f.write(raw)
                del tensors

        # Verify file size before publishing the file under its final name.
        actual_size = os.path.getsize(tmp_path)
        if actual_size != expected_size:
            raise AssertionError(
                f"File size mismatch: {actual_size} != {expected_size}"
            )
        os.replace(tmp_path, out_path)
    except BaseException:
        # Includes KeyboardInterrupt: drop the partial temp file, then
        # re-raise the original error unchanged.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def pack_experts(
    input_dir: str,
    output_dir: str,
    num_layers: int = NUM_MOE_LAYERS,
) -> None:
    """Pack every layer's split expert files into the binary format.

    Creates ``output_dir`` if needed, packs layers ``0 .. num_layers - 1``
    one after another, and prints per-layer timing with a running ETA,
    followed by a final summary line.

    Args:
        input_dir: Directory containing the per-layer split expert folders.
        output_dir: Directory that receives one packed file per layer.
        num_layers: How many layers to pack, starting from layer 0.
    """
    os.makedirs(output_dir, exist_ok=True)

    run_started = time.time()
    for layer_idx in range(num_layers):
        layer_started = time.time()
        print(f" Packing layer {layer_idx:2d}/{num_layers - 1}...", end=" ", flush=True)
        _pack_layer(layer_idx, input_dir, output_dir)

        elapsed = time.time() - layer_started
        so_far = time.time() - run_started
        # ETA = mean time per finished layer x layers still to do.
        layers_done = layer_idx + 1
        eta = (so_far / layers_done) * (num_layers - layer_idx - 1)
        print(f"done ({elapsed:.1f}s, ETA {eta:.0f}s)")

    total_elapsed = time.time() - run_started
    print(f" {num_layers} layer files packed in {total_elapsed:.1f}s")
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
"""Split stacked MoE expert weights into per-expert safetensors files.
|
|
2
|
+
|
|
3
|
+
Reads the downloaded Qwen3.5-35B-A3B-4bit model's stacked expert weights
|
|
4
|
+
(256 experts per layer, 40 MoE layers) and splits them into individual
|
|
5
|
+
per-expert files for selective loading.
|
|
6
|
+
|
|
7
|
+
Output structure:
|
|
8
|
+
~/.kandiga/experts/Qwen3.5-35B-A3B-4bit/
|
|
9
|
+
layer_00/
|
|
10
|
+
expert_000.safetensors
|
|
11
|
+
...
|
|
12
|
+
expert_255.safetensors
|
|
13
|
+
layer_01/
|
|
14
|
+
...
|
|
15
|
+
layer_39/
|
|
16
|
+
...
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
import json
|
|
22
|
+
import os
|
|
23
|
+
import time
|
|
24
|
+
|
|
25
|
+
import mlx.core as mx
|
|
26
|
+
|
|
27
|
+
WEIGHT_PREFIX = "language_model.model.layers"
|
|
28
|
+
PROJECTIONS = ("gate_proj", "up_proj", "down_proj")
|
|
29
|
+
COMPONENTS = ("weight", "scales", "biases")
|
|
30
|
+
NUM_EXPERTS = 256
|
|
31
|
+
NUM_MOE_LAYERS = 40
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _build_weight_map(model_dir: str) -> dict[str, str]:
|
|
35
|
+
"""Map tensor names to absolute shard file paths."""
|
|
36
|
+
index_file = os.path.join(model_dir, "model.safetensors.index.json")
|
|
37
|
+
if not os.path.exists(index_file):
|
|
38
|
+
raise FileNotFoundError(f"No index file found at {index_file}")
|
|
39
|
+
with open(index_file) as f:
|
|
40
|
+
data = json.load(f)
|
|
41
|
+
return {k: os.path.join(model_dir, v) for k, v in data["weight_map"].items()}
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _split_layer(
    layer_idx: int,
    weight_map: dict[str, str],
    output_dir: str,
) -> None:
    """Split one layer's stacked expert weights into per-expert files.

    For every projection/component pair the stacked tensor is loaded from its
    shard, indexed along the first axis by expert number, and the slices for
    each expert are saved to
    ``<output_dir>/layer_XX/expert_NNN.safetensors``.

    Args:
        layer_idx: Index of the layer to split.
        weight_map: Mapping from tensor name to the shard file containing it.
        output_dir: Root directory under which ``layer_XX`` is created.

    Raises:
        KeyError: If any expected tensor name is absent from ``weight_map``.
    """
    layer_dir = os.path.join(output_dir, f"layer_{layer_idx:02d}")
    os.makedirs(layer_dir, exist_ok=True)

    prefix = f"{WEIGHT_PREFIX}.{layer_idx}.mlp.switch_mlp."
    pairs = [(proj, comp) for proj in PROJECTIONS for comp in COMPONENTS]

    # Resolve every tensor name up front so a missing key fails before any
    # shard is opened.
    tensor_keys = {}
    for proj, comp in pairs:
        key = f"{prefix}{proj}.{comp}"
        if key not in weight_map:
            raise KeyError(f"Missing weight key: {key}")
        tensor_keys[(proj, comp)] = key

    # Bucket the requests by shard file so each shard is opened only once.
    shards: dict[str, list[tuple[str, str, str]]] = {}
    for (proj, comp), key in tensor_keys.items():
        shard_file = weight_map[key]
        if shard_file not in shards:
            shards[shard_file] = []
        shards[shard_file].append((proj, comp, key))

    # Pull the stacked tensors for this layer out of their shards.
    stacked: dict[tuple[str, str], mx.array] = {}
    for shard_file, entries in shards.items():
        shard_data = mx.load(shard_file)
        for proj, comp, key in entries:
            loaded = shard_data[key]
            mx.eval(loaded)
            stacked[(proj, comp)] = loaded
        del shard_data

    # Write one safetensors file per expert, keyed "<proj>.<comp>".
    for expert_idx in range(NUM_EXPERTS):
        expert_tensors = {}
        for proj, comp in pairs:
            piece = stacked[(proj, comp)][expert_idx]
            mx.eval(piece)
            expert_tensors[f"{proj}.{comp}"] = piece

        out_path = os.path.join(layer_dir, f"expert_{expert_idx:03d}.safetensors")
        mx.save_safetensors(out_path, expert_tensors)
        del expert_tensors

    del stacked
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def split_experts(
    model_dir: str,
    output_dir: str,
    num_layers: int = NUM_MOE_LAYERS,
) -> None:
    """Split every layer's stacked expert weights into per-expert files.

    Creates ``output_dir`` if needed, builds the tensor-name -> shard map
    from ``model_dir``, then splits layers ``0 .. num_layers - 1`` in order
    while printing per-layer timing, a running ETA and a final summary.

    Args:
        model_dir: Directory of the downloaded model (must contain the
            safetensors index file).
        output_dir: Directory that receives the ``layer_XX`` folders.
        num_layers: How many layers to split, starting from layer 0.
    """
    os.makedirs(output_dir, exist_ok=True)

    print(" Building weight map...")
    weight_map = _build_weight_map(model_dir)

    run_started = time.time()
    for layer_idx in range(num_layers):
        layer_started = time.time()
        print(f" Splitting layer {layer_idx:2d}/{num_layers - 1}...", end=" ", flush=True)
        _split_layer(layer_idx, weight_map, output_dir)

        elapsed = time.time() - layer_started
        so_far = time.time() - run_started
        # ETA = mean time per finished layer x layers still to do.
        layers_done = layer_idx + 1
        eta = (so_far / layers_done) * (num_layers - layer_idx - 1)
        print(f"done ({elapsed:.1f}s, ETA {eta:.0f}s)")

    total_elapsed = time.time() - run_started
    total_files = num_layers * NUM_EXPERTS
    print(f" {total_files:,} expert files created in {total_elapsed:.1f}s")
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
"""Benchmarking suite for Kandiga inference engine."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import time
|
|
6
|
+
|
|
7
|
+
from rich.console import Console
|
|
8
|
+
from rich.table import Table
|
|
9
|
+
|
|
10
|
+
console = Console()
|
|
11
|
+
|
|
12
|
+
PROMPTS = [
|
|
13
|
+
("Short", "What is 2+2?"),
|
|
14
|
+
("Medium", "Explain how a transformer neural network works in 3 sentences."),
|
|
15
|
+
("Long", "Write a detailed comparison of Python and Rust for systems programming. "
|
|
16
|
+
"Cover performance, safety, ecosystem, and learning curve."),
|
|
17
|
+
]
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _bench_one(engine, prompt: str, max_tokens: int = 256) -> dict:
|
|
21
|
+
"""Benchmark a single prompt. Returns timing stats."""
|
|
22
|
+
# Warmup: ensure model is loaded
|
|
23
|
+
if not engine.is_ready:
|
|
24
|
+
engine.load()
|
|
25
|
+
|
|
26
|
+
# Time to first token
|
|
27
|
+
t_start = time.time()
|
|
28
|
+
tokens = []
|
|
29
|
+
t_first = None
|
|
30
|
+
|
|
31
|
+
for token in engine.generate(prompt, max_tokens=max_tokens, stream=True):
|
|
32
|
+
if t_first is None:
|
|
33
|
+
t_first = time.time()
|
|
34
|
+
tokens.append(token)
|
|
35
|
+
|
|
36
|
+
t_end = time.time()
|
|
37
|
+
|
|
38
|
+
total_time = t_end - t_start
|
|
39
|
+
ttft = (t_first - t_start) if t_first else total_time
|
|
40
|
+
gen_time = (t_end - t_first) if t_first else 0
|
|
41
|
+
num_tokens = len(tokens)
|
|
42
|
+
tps = num_tokens / gen_time if gen_time > 0 else 0
|
|
43
|
+
|
|
44
|
+
return {
|
|
45
|
+
"num_tokens": num_tokens,
|
|
46
|
+
"total_time": total_time,
|
|
47
|
+
"ttft": ttft,
|
|
48
|
+
"gen_time": gen_time,
|
|
49
|
+
"tps": tps,
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def run_bench():
    """Run the inference benchmarks and print the results.

    For each mode (quality, then fast) a fresh engine is created and loaded,
    its memory stats are printed, every prompt in PROMPTS is timed with
    ``_bench_one``, and the timings are rendered as a Rich table.
    """
    console.print()
    console.print("[bold cyan]Kandiga Benchmark[/]")
    console.print()

    # Imported here, after the banner, rather than at module import time.
    from kandiga.engine import KandigaEngine

    modes = (
        ("Quality (K=8)", False),
        ("Fast (K=4)", True),
    )
    for mode_name, fast in modes:
        console.print(f"[bold]{mode_name}[/]")
        console.print("[dim]Loading model...[/]")

        engine = KandigaEngine(fast_mode=fast, log_memory=False)
        engine.load()

        stats = engine.stats
        memory_line = (
            f"[dim]RSS: {stats['rss_mb']:.0f}MB | "
            f"GPU: {stats['gpu_active_mb']:.0f}MB[/]"
        )
        console.print(memory_line)
        console.print()

        table = Table(show_header=True, header_style="bold")
        table.add_column("Prompt", width=12)
        for heading in ("Tokens", "TTFT", "Gen Time"):
            table.add_column(heading, justify="right")
        table.add_column("tok/s", justify="right", style="cyan")

        for label, prompt in PROMPTS:
            console.print(f" Running: {label}...", end=" ")
            result = _bench_one(engine, prompt, max_tokens=256)
            console.print("[green]done[/]")

            row = (
                label,
                str(result["num_tokens"]),
                f"{result['ttft']:.2f}s",
                f"{result['gen_time']:.1f}s",
                f"{result['tps']:.1f}",
            )
            table.add_row(*row)

        console.print()
        console.print(table)
        console.print()

        # Drop this mode's engine reference before the next mode builds its own.
        del engine

    console.print("[bold green]Benchmark complete.[/]")
    console.print()
|