kandiga 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kandiga-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Kandiga Contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
kandiga-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,17 @@
1
+ Metadata-Version: 2.4
2
+ Name: kandiga
3
+ Version: 0.1.0
4
+ Summary: Run 35B AI models in 1.5GB of RAM. Any Mac.
5
+ License: MIT
6
+ Requires-Python: >=3.9
7
+ License-File: LICENSE
8
+ Requires-Dist: mlx>=0.30.0
9
+ Requires-Dist: mlx-lm>=0.24.0
10
+ Requires-Dist: numpy
11
+ Requires-Dist: rich
12
+ Provides-Extra: serve
13
+ Requires-Dist: fastapi; extra == "serve"
14
+ Requires-Dist: uvicorn[standard]; extra == "serve"
15
+ Provides-Extra: tools
16
+ Requires-Dist: duckduckgo-search; extra == "tools"
17
+ Dynamic: license-file
@@ -0,0 +1,153 @@
1
+ # Kandiga
2
+
3
+ Run 35B AI models in 1.5GB of RAM. Any Mac.
4
+
5
+ Kandiga is an open-source MoE inference engine that uses **Selective Expert Materialization** to run models that would normally require 20GB+ of memory in under 2GB on any Apple Silicon Mac.
6
+
7
+ ## How it works
8
+
9
+ Large MoE (Mixture of Experts) models like Qwen3.5-35B-A3B have 256 experts per layer, but only activate 8 per token. Kandiga exploits this sparsity:
10
+
11
+ 1. **Shared layers** (attention, norms, embeddings) load to GPU memory (~1.5GB)
12
+ 2. **Expert MLP weights** stay on disk in packed binary files (~17GB on SSD)
13
+ 3. **Per token**: the router selects 8 experts, which are read from SSD via `pread`
14
+ 4. **CPU computes** expert MLP with NEON-vectorized 4-bit dequant + GCD parallelism
15
+ 5. **GPU computes** attention simultaneously via MLX (unified memory, zero copy)
16
+
17
+ This is the [KTransformers](https://github.com/kvcache-ai/ktransformers) architecture adapted for Apple Silicon's unified memory.
18
+
19
+ ## Install
20
+
21
+ ```bash
22
+ pip install kandiga
23
+ ```
24
+
25
+ Requirements: macOS with Apple Silicon (M1/M2/M3/M4), Python 3.10+
26
+
27
+ ## Quick start
28
+
29
+ ```bash
30
+ # One-time setup: download model + prepare expert files (~20 min)
31
+ kandiga setup
32
+
33
+ # Interactive chat
34
+ kandiga chat
35
+
36
+ # Fast mode (K=4 experts instead of 8, ~2x speed, slightly lower quality)
37
+ kandiga chat --fast
38
+
39
+ # One-shot prompt
40
+ kandiga "What is the capital of France?"
41
+
42
+ # Start an OpenAI-compatible API server
43
+ kandiga serve
44
+
45
+ # Run benchmarks
46
+ kandiga bench
47
+ ```
48
+
49
+ ## Benchmarks
50
+
51
+ Measured on M4 Mac Mini (16GB), Qwen3.5-35B-A3B-4bit:
52
+
53
+ | Mode | Experts | Speed | RAM | Quality |
54
+ |------|---------|-------|-----|---------|
55
+ | Quality (K=8) | 8/256 per layer | ~3.5 tok/s | 1.5GB | Full |
56
+ | Fast (K=4) | 4/256 per layer | ~6.5 tok/s | 1.5GB | Near-equal |
57
+
58
+ For comparison, loading the full model requires 20.4GB of RAM; when it fits in memory, MLX alone achieves ~25 tok/s. Kandiga trades speed for accessibility: if your Mac has 8-16GB of RAM, you can now run a 35B model that previously required 24GB+.
59
+
60
+ ## Architecture
61
+
62
+ ```
63
+ User prompt
64
+ |
65
+ v
66
+ [Tokenizer + Chat Template]
67
+ |
68
+ v
69
+ [MLX Forward Pass]
70
+ |
71
+ +---> GPU: Attention + Norms + Router + Shared Expert + Blending
72
+ |
73
+ +---> CPU: Routed Expert MLP (NEON 4-bit dequant + GCD parallel)
74
+ | |
75
+ | +-- pread expert weights from SSD (OS page cache)
76
+ | +-- gate_proj matvec (512x2048)
77
+ | +-- up_proj matvec (512x2048)
78
+ | +-- SwiGLU activation
79
+ | +-- down_proj matvec (2048x512)
80
+ |
81
+ v
82
+ [Token Output]
83
+ ```
84
+
85
+ Both CPU and GPU operate on the same physical DRAM (Apple Silicon unified memory), so there is zero data transfer overhead between them.
86
+
87
+ ## API Server
88
+
89
+ Kandiga includes an OpenAI-compatible HTTP API:
90
+
91
+ ```bash
92
+ kandiga serve --port 8340
93
+ ```
94
+
95
+ ```python
96
+ import openai
97
+
98
+ client = openai.OpenAI(base_url="http://localhost:8340/v1", api_key="unused")
99
+ response = client.chat.completions.create(
100
+ model="mlx-community/Qwen3.5-35B-A3B-4bit",
101
+ messages=[{"role": "user", "content": "Hello!"}],
102
+ stream=True,
103
+ )
104
+ for chunk in response:
105
+ print(chunk.choices[0].delta.content or "", end="")
106
+ ```
107
+
108
+ ## Project structure
109
+
110
+ ```
111
+ kandiga/
112
+ __init__.py # Package version
113
+ cli.py # CLI entry point (argparse)
114
+ engine.py # Core inference engine (SEM)
115
+ chat.py # Interactive chat (Rich terminal UI)
116
+ serve.py # OpenAI-compatible HTTP API (FastAPI)
117
+ bench.py # Benchmarking suite
118
+ setup.py # Model download + expert splitting + packing
119
+ _split_experts.py # Split stacked weights into per-expert files
120
+ _pack_experts.py # Pack per-expert files into binary format
121
+ _build.py # Compile CPU expert dylib from source
122
+ metal/
123
+ kandiga_cpu_expert.h # C API header
124
+ kandiga_cpu_expert.m # NEON + GCD implementation
125
+ Makefile # Build the dylib
126
+ tools/
127
+ __init__.py # Future: web search, file access
128
+ scripts/
129
+ install.sh # Quick install script
130
+ tests/
131
+ ...
132
+ ```
133
+
134
+ ## Development
135
+
136
+ ```bash
137
+ # Clone
138
+ git clone https://github.com/yourusername/kandiga.git
139
+ cd kandiga
140
+
141
+ # Install in development mode
142
+ pip install -e ".[serve]"
143
+
144
+ # Build the CPU expert library
145
+ cd kandiga/metal && make && cd ../..
146
+
147
+ # Run tests
148
+ pytest tests/ -v
149
+ ```
150
+
151
+ ## License
152
+
153
+ MIT
@@ -0,0 +1,2 @@
1
+ """Kandiga: Run 35B AI models in 1.5GB of RAM."""
2
# Package version string; the sdist metadata (PKG-INFO) declares the same 0.1.0.
__version__ = "0.1.0"
@@ -0,0 +1,4 @@
1
+ """Allow running kandiga as a module: python -m kandiga"""
2
from kandiga.cli import main

# NOTE: the call is unguarded, so the CLI runs whenever this module is
# executed or imported.
main()
@@ -0,0 +1,48 @@
1
+ """Build the CPU expert dylib from source."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ import subprocess
7
+ import sys
8
+
9
+
10
def build_cpu_expert_dylib() -> str:
    """Compile libkandiga_cpu_expert.dylib from the Objective-C source.

    The source is expected at ``metal/kandiga_cpu_expert.m`` next to this
    module; the dylib is written into that same ``metal`` directory.

    Returns:
        The path to the built dylib.

    Raises:
        FileNotFoundError: If the Objective-C source file does not exist.
        RuntimeError: If ``clang`` cannot be launched, exits with a non-zero
            status, or reports success without producing the dylib.
    """
    metal_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "metal")
    source = os.path.join(metal_dir, "kandiga_cpu_expert.m")
    dylib = os.path.join(metal_dir, "libkandiga_cpu_expert.dylib")

    if not os.path.exists(source):
        raise FileNotFoundError(f"Source file not found: {source}")

    # Build command
    cmd = [
        "clang",
        "-shared",
        "-o", dylib,
        source,
        "-fobjc-arc",
        "-framework", "Foundation",
        "-O2",
        "-march=native",
    ]

    print(f" Building: {' '.join(cmd)}")
    try:
        result = subprocess.run(cmd, capture_output=True, text=True)
    except OSError as err:
        # A missing or non-executable compiler would otherwise surface as a
        # bare FileNotFoundError, which is easy to confuse with the
        # "source file not found" case above.
        raise RuntimeError(f"Could not run clang: {err}") from err

    if result.returncode != 0:
        raise RuntimeError(
            f"Compilation failed (exit {result.returncode}):\n"
            f"stdout: {result.stdout}\n"
            f"stderr: {result.stderr}"
        )

    if not os.path.exists(dylib):
        raise RuntimeError(f"Build succeeded but dylib not found at {dylib}")

    return dylib
@@ -0,0 +1,167 @@
1
+ """Pack per-expert safetensors into single raw binary files per layer.
2
+
3
+ Reads the split per-expert safetensors files and packs them into a compact
4
+ binary format that can be read with zero parsing overhead using pread().
5
+
6
+ Binary format per file:
7
+ Header (4096 bytes):
8
+ magic: 4 bytes "BKEX"
9
+ version: uint32 1
10
+ num_experts: uint32 256
11
+ expert_size: uint64 1769472 (bytes per expert block)
12
+ num_tensors: uint32 9
13
+ tensor descriptors...
14
+ padding to 4096 bytes
15
+
16
+ Expert data (256 x 1769472 bytes):
17
+ expert_000: [gate.weight][gate.scales][gate.biases]
18
+ [up.weight][up.scales][up.biases]
19
+ [down.weight][down.scales][down.biases]
20
+ ...
21
+ expert_255: same layout
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ import os
27
+ import struct
28
+ import time
29
+
30
+ import mlx.core as mx
31
+ import numpy as np
32
+
33
# Number of experts written into each layer file.
NUM_EXPERTS = 256
# Default number of MoE layers to pack (one output file per layer).
NUM_MOE_LAYERS = 40
# Size in bytes of the header block at the start of every layer file.
HEADER_SIZE = 4096
# Bytes per packed expert; must equal the sum of the tensor sizes below.
EXPERT_SIZE = 1_769_472 # 1728KB exactly

# Tensor order must match the C library's byte offsets
# Each entry is (tensor name, 2-D shape, dtype string).
# NOTE(review): the uint32 weight columns presumably hold eight packed 4-bit
# values each (256 * 8 = 2048) — confirm against the C implementation.
TENSOR_ORDER = [
    ("gate_proj.weight", (512, 256), "uint32"),
    ("gate_proj.scales", (512, 32), "bfloat16"),
    ("gate_proj.biases", (512, 32), "bfloat16"),
    ("up_proj.weight", (512, 256), "uint32"),
    ("up_proj.scales", (512, 32), "bfloat16"),
    ("up_proj.biases", (512, 32), "bfloat16"),
    ("down_proj.weight", (2048, 64), "uint32"),
    ("down_proj.scales", (2048, 8), "bfloat16"),
    ("down_proj.biases", (2048, 8), "bfloat16"),
]
50
+
51
+
52
+ def _tensor_nbytes(shape: tuple, dtype_str: str) -> int:
53
+ """Calculate raw byte size for a tensor."""
54
+ itemsize = 4 if dtype_str == "uint32" else 2
55
+ return shape[0] * shape[1] * itemsize
56
+
57
+
58
def _build_header() -> bytes:
    """Assemble the HEADER_SIZE-byte header that starts every layer file.

    Layout (little-endian, no padding between fields):
        bytes 0-3    magic b"BKEX"
        bytes 4-7    uint32 format version (1)
        bytes 8-11   uint32 NUM_EXPERTS
        bytes 12-19  uint64 EXPERT_SIZE
        bytes 20-23  uint32 number of tensor descriptors
        bytes 24-    one 42-byte descriptor per TENSOR_ORDER entry:
                     uint8 name length, 24-byte zero-padded ASCII name,
                     uint32 offset inside the expert block, uint32 byte size,
                     uint32 shape[0], uint32 shape[1], uint8 dtype code
                     (0 for "uint32", 1 otherwise)

    Everything after the last descriptor is left as zero bytes.
    """
    header = bytearray(HEADER_SIZE)

    # Fixed preamble: magic followed by version, counts and expert size.
    header[0:4] = b"BKEX"
    struct.pack_into(
        "<IIQI", header, 4, 1, NUM_EXPERTS, EXPERT_SIZE, len(TENSOR_ORDER)
    )

    # One packed record per tensor; "24s" zero-pads the name field.
    descriptor = struct.Struct("<B24sIIIIB")
    cursor = 24
    running_offset = 0
    for name, shape, dtype_str in TENSOR_ORDER:
        size = _tensor_nbytes(shape, dtype_str)
        encoded_name = name.encode("ascii")
        descriptor.pack_into(
            header,
            cursor,
            len(encoded_name),
            encoded_name,
            running_offset,
            size,
            shape[0],
            shape[1],
            0 if dtype_str == "uint32" else 1,
        )
        cursor += descriptor.size
        running_offset += size

    assert running_offset == EXPERT_SIZE, (
        f"Tensor sizes don't sum to EXPERT_SIZE: {running_offset} != {EXPERT_SIZE}"
    )
    return bytes(header)
96
+
97
+
98
def _expert_to_bytes(tensors: dict[str, mx.array]) -> bytes:
    """Convert an expert's tensor dict to raw bytes in canonical order.

    Args:
        tensors: Mapping from tensor name (as listed in TENSOR_ORDER) to the
            MLX array holding that tensor for a single expert.

    Returns:
        The tensors' raw bytes concatenated in TENSOR_ORDER order; always
        EXPERT_SIZE bytes long.

    Raises:
        KeyError: If a tensor named in TENSOR_ORDER is missing.
        AssertionError: If any tensor, or the concatenated block, does not
            have the expected byte size.
    """
    parts = []
    for name, shape, dtype_str in TENSOR_ORDER:
        tensor = tensors[name]
        if dtype_str != "uint32":
            # Reinterpret the 16-bit payload as uint16 so the bits are copied
            # verbatim (presumably because NumPy has no native bfloat16).
            tensor = tensor.view(mx.uint16)
        mx.eval(tensor)

        # np.asarray rather than np.array(..., copy=False): under NumPy 2 the
        # latter raises ValueError whenever a copy cannot be avoided, whereas
        # asarray copies only if it has to.
        raw = np.asarray(tensor).tobytes()

        expected = _tensor_nbytes(shape, dtype_str)
        assert len(raw) == expected, f"{name}: got {len(raw)} bytes, expected {expected}"
        parts.append(raw)

    data = b"".join(parts)
    assert len(data) == EXPERT_SIZE, f"Expert data {len(data)} != {EXPERT_SIZE}"
    return data
121
+
122
+
123
def _pack_layer(layer_idx: int, input_dir: str, output_dir: str) -> None:
    """Pack all NUM_EXPERTS experts of one layer into a single binary file.

    Reads ``<input_dir>/layer_XX/expert_YYY.safetensors`` for every expert
    and writes ``<output_dir>/layer_XX.bin`` as header + expert blocks.

    The data is first written to ``layer_XX.bin.tmp`` and only renamed to
    its final name after the size check passes, so an interrupted or failed
    run never leaves a truncated ``.bin`` file at the final path.

    Args:
        layer_idx: Index of the layer to pack.
        input_dir: Directory containing the per-layer expert directories.
        output_dir: Existing directory that receives the packed file.

    Raises:
        AssertionError: If the written file does not have the expected size.
    """
    layer_dir = os.path.join(input_dir, f"layer_{layer_idx:02d}")
    out_path = os.path.join(output_dir, f"layer_{layer_idx:02d}.bin")
    tmp_path = out_path + ".tmp"

    header = _build_header()
    expected_size = HEADER_SIZE + NUM_EXPERTS * EXPERT_SIZE

    try:
        with open(tmp_path, "wb") as f:
            f.write(header)
            for expert_idx in range(NUM_EXPERTS):
                st_path = os.path.join(layer_dir, f"expert_{expert_idx:03d}.safetensors")
                tensors = mx.load(st_path)
                mx.eval(*tensors.values())
                f.write(_expert_to_bytes(tensors))
                del tensors

        # Verify file size before publishing the file under its final name
        actual_size = os.path.getsize(tmp_path)
        assert actual_size == expected_size, (
            f"File size mismatch: {actual_size} != {expected_size}"
        )
        os.replace(tmp_path, out_path)
    finally:
        # After a successful replace the temp file is gone; on any failure
        # remove the partial output.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
146
+
147
+
148
def pack_experts(
    input_dir: str,
    output_dir: str,
    num_layers: int = NUM_MOE_LAYERS,
) -> None:
    """Pack layers 0..num_layers-1 from split expert files into binary files.

    Creates ``output_dir`` if needed, packs one layer at a time and prints
    per-layer timing with a running ETA, followed by a final summary line.
    """
    os.makedirs(output_dir, exist_ok=True)

    last_idx = num_layers - 1
    began = time.time()
    for idx in range(num_layers):
        tick = time.time()
        print(f" Packing layer {idx:2d}/{last_idx}...", end=" ", flush=True)
        _pack_layer(idx, input_dir, output_dir)
        layer_secs = time.time() - tick
        # ETA = mean time per finished layer * layers still to do.
        mean_secs = (time.time() - began) / (idx + 1)
        remaining = mean_secs * (num_layers - idx - 1)
        print(f"done ({layer_secs:.1f}s, ETA {remaining:.0f}s)")

    overall = time.time() - began
    print(f" {num_layers} layer files packed in {overall:.1f}s")
@@ -0,0 +1,120 @@
1
+ """Split stacked MoE expert weights into per-expert safetensors files.
2
+
3
+ Reads the downloaded Qwen3.5-35B-A3B-4bit model's stacked expert weights
4
+ (256 experts per layer, 40 MoE layers) and splits them into individual
5
+ per-expert files for selective loading.
6
+
7
+ Output structure:
8
+ ~/.kandiga/experts/Qwen3.5-35B-A3B-4bit/
9
+ layer_00/
10
+ expert_000.safetensors
11
+ ...
12
+ expert_255.safetensors
13
+ layer_01/
14
+ ...
15
+ layer_39/
16
+ ...
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import json
22
+ import os
23
+ import time
24
+
25
+ import mlx.core as mx
26
+
27
# Key prefix of per-layer tensors in the model's weight map; the layer index
# and ".mlp.switch_mlp." are appended to reach the stacked expert weights.
WEIGHT_PREFIX = "language_model.model.layers"
# Every (projection, component) pair names one stacked tensor per layer.
PROJECTIONS = ("gate_proj", "up_proj", "down_proj")
COMPONENTS = ("weight", "scales", "biases")
# Number of experts sliced out of each stacked tensor.
NUM_EXPERTS = 256
# Default number of MoE layers to split.
NUM_MOE_LAYERS = 40
32
+
33
+
34
+ def _build_weight_map(model_dir: str) -> dict[str, str]:
35
+ """Map tensor names to absolute shard file paths."""
36
+ index_file = os.path.join(model_dir, "model.safetensors.index.json")
37
+ if not os.path.exists(index_file):
38
+ raise FileNotFoundError(f"No index file found at {index_file}")
39
+ with open(index_file) as f:
40
+ data = json.load(f)
41
+ return {k: os.path.join(model_dir, v) for k, v in data["weight_map"].items()}
42
+
43
+
44
def _split_layer(
    layer_idx: int,
    weight_map: dict[str, str],
    output_dir: str,
) -> None:
    """Write one safetensors file per expert for a single layer.

    Looks up the nine stacked tensors (each projection x each component)
    for ``layer_idx`` in ``weight_map``, loads them from their shard files,
    and saves index ``i`` of every stacked tensor into
    ``<output_dir>/layer_XX/expert_iii.safetensors``.

    Raises:
        KeyError: If any of the expected tensor keys is absent from
            ``weight_map``.
    """
    layer_dir = os.path.join(output_dir, f"layer_{layer_idx:02d}")
    os.makedirs(layer_dir, exist_ok=True)

    prefix = f"{WEIGHT_PREFIX}.{layer_idx}.mlp.switch_mlp."
    pairs = [(proj, comp) for proj in PROJECTIONS for comp in COMPONENTS]

    # Validate every key up front and bucket the entries by shard file so
    # each shard is opened only once.
    by_shard: dict[str, list[tuple[str, str, str]]] = {}
    for proj, comp in pairs:
        key = f"{prefix}{proj}.{comp}"
        if key not in weight_map:
            raise KeyError(f"Missing weight key: {key}")
        by_shard.setdefault(weight_map[key], []).append((proj, comp, key))

    # Pull the stacked tensors for this layer out of their shards.
    stacked: dict[tuple[str, str], mx.array] = {}
    for shard_file, entries in by_shard.items():
        shard = mx.load(shard_file)
        for proj, comp, key in entries:
            loaded = shard[key]
            mx.eval(loaded)
            stacked[(proj, comp)] = loaded
        del shard

    # Slice out each expert and save it as its own file.
    for expert_idx in range(NUM_EXPERTS):
        per_expert = {}
        for proj, comp in pairs:
            piece = stacked[(proj, comp)][expert_idx]
            mx.eval(piece)
            per_expert[f"{proj}.{comp}"] = piece

        target = os.path.join(layer_dir, f"expert_{expert_idx:03d}.safetensors")
        mx.save_safetensors(target, per_expert)
        del per_expert

    del stacked
95
+
96
+
97
def split_experts(
    model_dir: str,
    output_dir: str,
    num_layers: int = NUM_MOE_LAYERS,
) -> None:
    """Split layers 0..num_layers-1 of the model into per-expert files.

    Creates ``output_dir`` if needed, builds the tensor-to-shard map from
    ``model_dir``, then splits one layer at a time while printing per-layer
    timing, a running ETA and a final file-count summary.
    """
    os.makedirs(output_dir, exist_ok=True)

    print(" Building weight map...")
    shard_lookup = _build_weight_map(model_dir)

    last_idx = num_layers - 1
    began = time.time()
    for idx in range(num_layers):
        tick = time.time()
        print(f" Splitting layer {idx:2d}/{last_idx}...", end=" ", flush=True)
        _split_layer(idx, shard_lookup, output_dir)
        layer_secs = time.time() - tick
        # ETA = mean time per finished layer * layers still to do.
        mean_secs = (time.time() - began) / (idx + 1)
        remaining = mean_secs * (num_layers - idx - 1)
        print(f"done ({layer_secs:.1f}s, ETA {remaining:.0f}s)")

    overall = time.time() - began
    file_count = num_layers * NUM_EXPERTS
    print(f" {file_count:,} expert files created in {overall:.1f}s")
@@ -0,0 +1,104 @@
1
+ """Benchmarking suite for Kandiga inference engine."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import time
6
+
7
+ from rich.console import Console
8
+ from rich.table import Table
9
+
10
# Shared Rich console used for all benchmark output.
console = Console()

# (label, prompt text) pairs; each prompt is benchmarked once per mode.
PROMPTS = [
    ("Short", "What is 2+2?"),
    ("Medium", "Explain how a transformer neural network works in 3 sentences."),
    ("Long", "Write a detailed comparison of Python and Rust for systems programming. "
     "Cover performance, safety, ecosystem, and learning curve."),
]
18
+
19
+
20
+ def _bench_one(engine, prompt: str, max_tokens: int = 256) -> dict:
21
+ """Benchmark a single prompt. Returns timing stats."""
22
+ # Warmup: ensure model is loaded
23
+ if not engine.is_ready:
24
+ engine.load()
25
+
26
+ # Time to first token
27
+ t_start = time.time()
28
+ tokens = []
29
+ t_first = None
30
+
31
+ for token in engine.generate(prompt, max_tokens=max_tokens, stream=True):
32
+ if t_first is None:
33
+ t_first = time.time()
34
+ tokens.append(token)
35
+
36
+ t_end = time.time()
37
+
38
+ total_time = t_end - t_start
39
+ ttft = (t_first - t_start) if t_first else total_time
40
+ gen_time = (t_end - t_first) if t_first else 0
41
+ num_tokens = len(tokens)
42
+ tps = num_tokens / gen_time if gen_time > 0 else 0
43
+
44
+ return {
45
+ "num_tokens": num_tokens,
46
+ "total_time": total_time,
47
+ "ttft": ttft,
48
+ "gen_time": gen_time,
49
+ "tps": tps,
50
+ }
51
+
52
+
53
def run_bench():
    """Benchmark every prompt in both engine modes and print result tables.

    For each mode (quality, then fast) a fresh engine is created and loaded,
    its memory stats are printed, each entry of PROMPTS is timed with a
    256-token limit, and the results are rendered as a Rich table.
    """
    console.print()
    console.print("[bold cyan]Kandiga Benchmark[/]")
    console.print()

    from kandiga.engine import KandigaEngine

    modes = (("Quality (K=8)", False), ("Fast (K=4)", True))
    columns = (
        ("Prompt", {"width": 12}),
        ("Tokens", {"justify": "right"}),
        ("TTFT", {"justify": "right"}),
        ("Gen Time", {"justify": "right"}),
        ("tok/s", {"justify": "right", "style": "cyan"}),
    )

    # Benchmark both modes
    for title, use_fast in modes:
        console.print(f"[bold]{title}[/]")
        console.print("[dim]Loading model...[/]")

        engine = KandigaEngine(fast_mode=use_fast, log_memory=False)
        engine.load()

        memory = engine.stats
        console.print(
            f"[dim]RSS: {memory['rss_mb']:.0f}MB | GPU: {memory['gpu_active_mb']:.0f}MB[/]"
        )
        console.print()

        table = Table(show_header=True, header_style="bold")
        for heading, options in columns:
            table.add_column(heading, **options)

        for label, prompt in PROMPTS:
            console.print(f" Running: {label}...", end=" ")
            timing = _bench_one(engine, prompt, max_tokens=256)
            console.print("[green]done[/]")

            cells = (
                label,
                str(timing["num_tokens"]),
                f"{timing['ttft']:.2f}s",
                f"{timing['gen_time']:.1f}s",
                f"{timing['tps']:.1f}",
            )
            table.add_row(*cells)

        console.print()
        console.print(table)
        console.print()

        # Drop this mode's engine before the next one is created.
        del engine

    console.print("[bold green]Benchmark complete.[/]")
    console.print()