bit-ttt-engine 0.6.2__tar.gz → 0.7.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bit_ttt_engine-0.7.0/PKG-INFO +136 -0
- bit_ttt_engine-0.7.0/README_PYPI.md +103 -0
- bit_ttt_engine-0.7.0/cortex_rust/__init__.py +25 -0
- bit_ttt_engine-0.7.0/cortex_rust/__main__.py +4 -0
- bit_ttt_engine-0.7.0/cortex_rust/chat.py +196 -0
- bit_ttt_engine-0.7.0/cortex_rust/cli.py +381 -0
- bit_ttt_engine-0.7.0/cortex_rust/engine.py +253 -0
- bit_ttt_engine-0.7.0/cortex_rust/server.py +493 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/Cargo.toml +1 -1
- bit_ttt_engine-0.7.0/crates/rust_engine/README_PYPI.md +103 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/build.rs +61 -7
- bit_ttt_engine-0.7.0/crates/rust_engine/cortex_rust/__init__.py +25 -0
- bit_ttt_engine-0.7.0/crates/rust_engine/cortex_rust/__main__.py +4 -0
- bit_ttt_engine-0.7.0/crates/rust_engine/cortex_rust/chat.py +196 -0
- bit_ttt_engine-0.7.0/crates/rust_engine/cortex_rust/cli.py +381 -0
- bit_ttt_engine-0.7.0/crates/rust_engine/cortex_rust/engine.py +253 -0
- bit_ttt_engine-0.7.0/crates/rust_engine/cortex_rust/server.py +493 -0
- bit_ttt_engine-0.7.0/crates/rust_engine/examples/bench_qmatmul.rs +150 -0
- bit_ttt_engine-0.7.0/crates/rust_engine/examples/bench_qmatmul_v2.rs +124 -0
- bit_ttt_engine-0.7.0/crates/rust_engine/examples/debug_gemma2.rs +148 -0
- bit_ttt_engine-0.7.0/crates/rust_engine/examples/profile_7b.rs +104 -0
- bit_ttt_engine-0.7.0/crates/rust_engine/examples/test_7b.rs +108 -0
- bit_ttt_engine-0.7.0/crates/rust_engine/examples/test_qmatmul.rs +97 -0
- bit_ttt_engine-0.7.0/crates/rust_engine/examples/test_quantize_q8.rs +121 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/examples/ttt_benchmark.rs +1 -1
- bit_ttt_engine-0.7.0/crates/rust_engine/src/bin/bench_wmma.rs +166 -0
- bit_ttt_engine-0.7.0/crates/rust_engine/src/bin/test_wmma_small.rs +105 -0
- bit_ttt_engine-0.7.0/crates/rust_engine/src/kernels/cublas_gemv.rs +58 -0
- bit_ttt_engine-0.7.0/crates/rust_engine/src/kernels/flash_attention.cu +189 -0
- bit_ttt_engine-0.7.0/crates/rust_engine/src/kernels/flash_attention.ptx +1022 -0
- bit_ttt_engine-0.7.0/crates/rust_engine/src/kernels/flash_attention_cuda.rs +214 -0
- bit_ttt_engine-0.7.0/crates/rust_engine/src/kernels/flash_decoding.cu +401 -0
- bit_ttt_engine-0.7.0/crates/rust_engine/src/kernels/flash_decoding.ptx +2002 -0
- bit_ttt_engine-0.7.0/crates/rust_engine/src/kernels/flash_decoding.rs +659 -0
- bit_ttt_engine-0.7.0/crates/rust_engine/src/kernels/flash_decoding_fp16.cu +377 -0
- bit_ttt_engine-0.7.0/crates/rust_engine/src/kernels/flash_decoding_fp16.ptx +2880 -0
- bit_ttt_engine-0.7.0/crates/rust_engine/src/kernels/fused_ops.cu +419 -0
- bit_ttt_engine-0.7.0/crates/rust_engine/src/kernels/fused_ops.ptx +864 -0
- bit_ttt_engine-0.7.0/crates/rust_engine/src/kernels/fused_ops.rs +881 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/kernels/matmul_4bit.rs +19 -0
- bit_ttt_engine-0.7.0/crates/rust_engine/src/kernels/mod.rs +12 -0
- bit_ttt_engine-0.7.0/crates/rust_engine/src/kernels/quantize_q8.cu +589 -0
- bit_ttt_engine-0.7.0/crates/rust_engine/src/kernels/quantize_q8.ptx +3582 -0
- bit_ttt_engine-0.7.0/crates/rust_engine/src/kernels/quantize_q8.rs +767 -0
- bit_ttt_engine-0.7.0/crates/rust_engine/src/kernels/wmma_matmul.cu +463 -0
- bit_ttt_engine-0.7.0/crates/rust_engine/src/kernels/wmma_matmul.ptx +3730 -0
- bit_ttt_engine-0.7.0/crates/rust_engine/src/kernels/wmma_matmul.rs +376 -0
- bit_ttt_engine-0.7.0/crates/rust_engine/src/layers/lora.rs +548 -0
- bit_ttt_engine-0.7.0/crates/rust_engine/src/layers/lora_train.rs +469 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/layers/ttt.rs +241 -2
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/layers.rs +4 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/lib.rs +10 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/model/block.rs +4 -2
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/model/config.rs +62 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/model/detector.rs +12 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/model/gguf_loader.rs +55 -15
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/model/gguf_model.rs +979 -703
- bit_ttt_engine-0.7.0/crates/rust_engine/src/model/gguf_model_quantized.rs +1626 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/model.rs +2 -0
- bit_ttt_engine-0.7.0/crates/rust_engine/src/python.rs +2142 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/tests/ttt_test.rs +3 -0
- bit_ttt_engine-0.7.0/pyproject.toml +45 -0
- bit_ttt_engine-0.6.2/PKG-INFO +0 -118
- bit_ttt_engine-0.6.2/README_PYPI.md +0 -99
- bit_ttt_engine-0.6.2/crates/rust_engine/README_PYPI.md +0 -99
- bit_ttt_engine-0.6.2/crates/rust_engine/src/kernels/fused_ops.cu +0 -261
- bit_ttt_engine-0.6.2/crates/rust_engine/src/kernels/fused_ops.ptx +0 -498
- bit_ttt_engine-0.6.2/crates/rust_engine/src/kernels/fused_ops.rs +0 -366
- bit_ttt_engine-0.6.2/crates/rust_engine/src/kernels/mod.rs +0 -7
- bit_ttt_engine-0.6.2/crates/rust_engine/src/python.rs +0 -988
- bit_ttt_engine-0.6.2/pyproject.toml +0 -27
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/LICENSE +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/.cargo/config.toml +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/CHANGELOG.md +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/Cargo.lock +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/LICENSE +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/README.md +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/cortex_rust.pyi +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/docs/paged_attention_quality_investigation.md +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/examples/basic_generate.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/examples/benchmark.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/examples/cuda_test.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/examples/debug_load.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/examples/e2e_benchmark.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/examples/interactive_chat.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/examples/model_info.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/examples/python_sanity_check.py +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/bin/bench_4bit_gpu.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/bin/bench_cpu_kernel.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/bin/bench_gemm_4bit.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/bin/bench_sizes.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/bin/bench_tinyllama.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/bin/detect_model.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/bin/fast_download.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/bin/quick_gen.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/bin/run_4bit_llama.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/bin/test_13b.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/bin/test_4bit_inference.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/bin/test_cuda_gemm.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/bin/test_memory.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/device_utils.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/download.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/error.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/eval/mod.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/eval/perplexity.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/kernels/adaptive_bit_op.cu +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/kernels/adaptive_bit_op.ptx +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/kernels/bit_op.cu +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/kernels/bit_op.ptx +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/kernels/cpu.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/kernels/cuda.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/kernels/matmul_4bit.cu +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/kernels/matmul_4bit.ptx +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/kernels/packing.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/kernels/packing_4bit.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/kernels/paged_attention.cu +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/kernels/paged_attention.ptx +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/kernels/paged_attention.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/layers/adaptive_linear.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/layers/attention.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/layers/bit_linear.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/layers/flash_attention.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/layers/isomorphic.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/layers/kv_cache/mod.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/layers/linear_4bit.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/layers/rms_norm.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/layers/swiglu.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/model/config_common.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/model/llama/bitllama.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/model/llama/llama_fp16.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/model/llama/mod.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/model/llama_4bit.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/model/unified.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/optim/mod.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/optim/schedule_free.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/pack/install.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/pack/lib.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/pack/reader.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/pack/types.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/pack/verify.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/pack/writer.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/paged_attention/block_manager.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/paged_attention/cache_engine.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/paged_attention/mod.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/scheduler/mod.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/speculative/mod.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/tests/attention_test.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/tests/bit_linear_test.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/tests/format_diagnosis.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/tests/isomorphic_test.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/wasm.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/tests/accuracy_test.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/tests/bitllama_e2e.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/tests/common.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/tests/gguf_e2e.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/tests/load_direct_benchmark.rs +0 -0
- {bit_ttt_engine-0.6.2 → bit_ttt_engine-0.7.0}/crates/rust_engine/tests/load_packed_e2e.rs +0 -0
@@ -0,0 +1,136 @@
+Metadata-Version: 2.4
+Name: bit-ttt-engine
+Version: 0.7.0
+Classifier: Development Status :: 4 - Beta
+Classifier: Programming Language :: Rust
+Classifier: Programming Language :: Python :: Implementation :: CPython
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Classifier: Operating System :: Microsoft :: Windows
+Requires-Dist: tokenizers>=0.19
+Requires-Dist: huggingface-hub>=0.20
+Requires-Dist: bit-ttt-engine[server] ; extra == 'all'
+Requires-Dist: fastapi>=0.100 ; extra == 'server'
+Requires-Dist: uvicorn>=0.20 ; extra == 'server'
+Requires-Dist: sse-starlette>=1.6 ; extra == 'server'
+Provides-Extra: all
+Provides-Extra: server
+License-File: LICENSE
+Summary: Fast local LLM inference with TTT (Test-Time Training) and LoRA — the model that learns while it runs
+Keywords: llm,inference,ttt,lora,gguf,quantization,cuda
+Home-Page: https://github.com/imonoonoko/Bit-TTT-Engine
+Author: imonoonoko
+License: MIT
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
+
+# 🧠 Bit-TTT-Engine
+
+[](https://pypi.org/project/bit-ttt-engine/)
+[](LICENSE)
+[](https://www.rust-lang.org/)
+
+**Fast local LLM inference that learns while it runs.**
+
+- 🏎️ **47+ tok/s** on RTX 4060 Ti (7B Q4_K_M)
+- 🧠 **TTT** (Test-Time Training) — adapts during inference (world's first!)
+- 🎨 **LoRA** — fine-tune with one flag
+- 📦 **5 models** — Llama-2/3, Gemma-2, Qwen2.5, Mistral
+- 🔌 **OpenAI-compatible API** — drop-in replacement
+
+## 🚀 Quick Start
+
+```bash
+pip install bit-ttt-engine
+```
+
+```python
+import cortex_rust
+
+# Load any GGUF model (auto-downloads from HuggingFace!)
+model = cortex_rust.load("user/model-GGUF")
+
+# Chat
+response = model.chat([
+    {"role": "user", "content": "Hello!"}
+])
+print(response)
+
+# Stream
+for token in model.chat_stream([
+    {"role": "user", "content": "Tell me a story"}
+]):
+    print(token, end="", flush=True)
+```
+
+## 🖥️ CLI
+
+```bash
+# Interactive chat
+bit-ttt chat model.gguf
+
+# Generate text
+bit-ttt generate model.gguf -p "Once upon a time"
+
+# OpenAI-compatible API server
+bit-ttt serve model.gguf --port 8000
+
+# With LoRA + Q8 KV cache
+bit-ttt chat model.gguf --lora adapter.bin --q8-cache
+```
+
+## 🧠 TTT — Test-Time Training
+
+**The model learns while it generates.** No other local LLM does this.
+
+```python
+model = cortex_rust.load("model.gguf")
+model.enable_ttt(True)
+
+# Each conversation makes the model smarter
+response = model.chat([{"role": "user", "content": "My name is Alice"}])
+# Next time, it remembers context better!
+```
+
+## ⚡ Performance
+
+| Model | Speed | VRAM |
+|-------|-------|------|
+| Llama-2 7B (Q4_K_M) | 47.8 tok/s | ~5 GB |
+| Llama-3 8B (Q4_K_M) | 36.8 tok/s | ~6 GB |
+| Mistral 7B (Q4_K_M) | 40.8 tok/s | ~5 GB |
+| Qwen2.5 1.5B (Q4_K_M) | 70.4 tok/s | ~2 GB |
+
+With `--q8-cache`: **82% VRAM reduction** for KV cache.
+
+## 🔌 OpenAI-Compatible API
+
+```bash
+bit-ttt serve model.gguf --port 8000
+```
+
+```python
+from openai import OpenAI
+client = OpenAI(base_url="http://localhost:8000/v1", api_key="none")
+response = client.chat.completions.create(
+    model="default",
+    messages=[{"role": "user", "content": "Hi!"}],
+    stream=True,
+)
+```
+
+## 📖 Links
+
+- [GitHub](https://github.com/imonoonoko/Bit-TTT-Engine)
+- [Documentation](https://github.com/imonoonoko/Bit-TTT-Engine#readme)
+
+## 💖 License
+
+MIT License
+
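The README embedded in the PKG-INFO above opens a streaming request (`stream=True`) against the OpenAI-compatible server but stops before consuming it. A minimal sketch of reading that stream with the standard `openai` Python client, assuming `bit-ttt serve model.gguf --port 8000` is running locally:

```python
# Sketch only: consumes the stream started in the README example above.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="none")
stream = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "Hi!"}],
    stream=True,
)
for chunk in stream:
    if not chunk.choices:
        continue  # some servers emit a trailing chunk with no choices
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)
print()
```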
@@ -0,0 +1,103 @@
+# 🧠 Bit-TTT-Engine
+
+[](https://pypi.org/project/bit-ttt-engine/)
+[](LICENSE)
+[](https://www.rust-lang.org/)
+
+**Fast local LLM inference that learns while it runs.**
+
+- 🏎️ **47+ tok/s** on RTX 4060 Ti (7B Q4_K_M)
+- 🧠 **TTT** (Test-Time Training) — adapts during inference (world's first!)
+- 🎨 **LoRA** — fine-tune with one flag
+- 📦 **5 models** — Llama-2/3, Gemma-2, Qwen2.5, Mistral
+- 🔌 **OpenAI-compatible API** — drop-in replacement
+
+## 🚀 Quick Start
+
+```bash
+pip install bit-ttt-engine
+```
+
+```python
+import cortex_rust
+
+# Load any GGUF model (auto-downloads from HuggingFace!)
+model = cortex_rust.load("user/model-GGUF")
+
+# Chat
+response = model.chat([
+    {"role": "user", "content": "Hello!"}
+])
+print(response)
+
+# Stream
+for token in model.chat_stream([
+    {"role": "user", "content": "Tell me a story"}
+]):
+    print(token, end="", flush=True)
+```
+
+## 🖥️ CLI
+
+```bash
+# Interactive chat
+bit-ttt chat model.gguf
+
+# Generate text
+bit-ttt generate model.gguf -p "Once upon a time"
+
+# OpenAI-compatible API server
+bit-ttt serve model.gguf --port 8000
+
+# With LoRA + Q8 KV cache
+bit-ttt chat model.gguf --lora adapter.bin --q8-cache
+```
+
+## 🧠 TTT — Test-Time Training
+
+**The model learns while it generates.** No other local LLM does this.
+
+```python
+model = cortex_rust.load("model.gguf")
+model.enable_ttt(True)
+
+# Each conversation makes the model smarter
+response = model.chat([{"role": "user", "content": "My name is Alice"}])
+# Next time, it remembers context better!
+```
+
+## ⚡ Performance
+
+| Model | Speed | VRAM |
+|-------|-------|------|
+| Llama-2 7B (Q4_K_M) | 47.8 tok/s | ~5 GB |
+| Llama-3 8B (Q4_K_M) | 36.8 tok/s | ~6 GB |
+| Mistral 7B (Q4_K_M) | 40.8 tok/s | ~5 GB |
+| Qwen2.5 1.5B (Q4_K_M) | 70.4 tok/s | ~2 GB |
+
+With `--q8-cache`: **82% VRAM reduction** for KV cache.
+
+## 🔌 OpenAI-Compatible API
+
+```bash
+bit-ttt serve model.gguf --port 8000
+```
+
+```python
+from openai import OpenAI
+client = OpenAI(base_url="http://localhost:8000/v1", api_key="none")
+response = client.chat.completions.create(
+    model="default",
+    messages=[{"role": "user", "content": "Hi!"}],
+    stream=True,
+)
+```
+
+## 📖 Links
+
+- [GitHub](https://github.com/imonoonoko/Bit-TTT-Engine)
+- [Documentation](https://github.com/imonoonoko/Bit-TTT-Engine#readme)
+
+## 💖 License
+
+MIT License
@@ -0,0 +1,25 @@
+# Auto-add CUDA DLL path on Windows
+import os
+import sys
+
+if sys.platform == "win32":
+    cuda_paths = [
+        os.environ.get("CUDA_PATH", ""),
+        r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4",
+        r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.3",
+        r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.0",
+        r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8",
+    ]
+    for cuda_path in cuda_paths:
+        bin_path = os.path.join(cuda_path, "bin")
+        if os.path.isdir(bin_path):
+            os.add_dll_directory(bin_path)
+            break
+
+from .cortex_rust import *
+from .chat import format_chat, format_simple, detect_template, list_templates
+from .engine import load, Model
+
+__doc__ = cortex_rust.__doc__
+if hasattr(cortex_rust, "__all__"):
+    __all__ = cortex_rust.__all__
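The `__init__.py` added above resolves the CUDA runtime DLLs on Windows by checking `CUDA_PATH` before a fixed list of default install directories. A small sketch of pointing the loader at a non-default install before import; the path shown is hypothetical:

```python
# Sketch only: CUDA_PATH is consulted first by cortex_rust/__init__.py above.
import os

os.environ["CUDA_PATH"] = r"D:\sdk\CUDA\v12.4"  # hypothetical install location
import cortex_rust  # on Windows, __init__ calls os.add_dll_directory(<CUDA_PATH>\bin)
```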
@@ -0,0 +1,196 @@
+"""Chat template support for various LLM architectures.
+
+Provides format_chat() to convert messages into model-specific prompt strings.
+Supports Llama-3, Llama-2, Gemma-2, Qwen/ChatML, and generic formats.
+
+Usage:
+    from cortex_rust.chat import format_chat, detect_template
+
+    messages = [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "Hello!"},
+    ]
+    prompt = format_chat(messages, template="llama3")
+    # Or auto-detect from model path:
+    template = detect_template("Meta-Llama-3-8B-Instruct.Q4_K_M.gguf")
+    prompt = format_chat(messages, template=template)
+"""
+
+from typing import List, Dict, Optional
+
+# ============================================================================
+# Template Definitions
+# ============================================================================
+
+TEMPLATES = {
+    "llama3": {
+        "bos": "<|begin_of_text|>",
+        "system_start": "<|start_header_id|>system<|end_header_id|>\n\n",
+        "system_end": "<|eot_id|>",
+        "user_start": "<|start_header_id|>user<|end_header_id|>\n\n",
+        "user_end": "<|eot_id|>",
+        "assistant_start": "<|start_header_id|>assistant<|end_header_id|>\n\n",
+        "assistant_end": "<|eot_id|>",
+        "default_system": "You are a helpful assistant.",
+    },
+    "llama2": {
+        "bos": "<s>",
+        "system_start": "<<SYS>>\n",
+        "system_end": "\n<</SYS>>\n\n",
+        "user_start": "[INST] ",
+        "user_end": " [/INST]",
+        "assistant_start": " ",
+        "assistant_end": " </s>",
+        "default_system": "You are a helpful, respectful and honest assistant.",
+        # Llama-2 embeds system inside first [INST]
+        "system_inside_user": True,
+    },
+    "gemma2": {
+        "bos": "<bos>",
+        "system_start": "",  # Gemma-2 has no system role
+        "system_end": "",
+        "user_start": "<start_of_turn>user\n",
+        "user_end": "<end_of_turn>\n",
+        "assistant_start": "<start_of_turn>model\n",
+        "assistant_end": "<end_of_turn>\n",
+        "default_system": None,  # No system support
+    },
+    "chatml": {
+        # Used by Qwen, Mistral-Instruct, etc.
+        "bos": "",
+        "system_start": "<|im_start|>system\n",
+        "system_end": "<|im_end|>\n",
+        "user_start": "<|im_start|>user\n",
+        "user_end": "<|im_end|>\n",
+        "assistant_start": "<|im_start|>assistant\n",
+        "assistant_end": "<|im_end|>\n",
+        "default_system": "You are a helpful assistant.",
+    },
+}
+
+# Model name patterns → template mapping
+_DETECTION_PATTERNS = [
+    ("llama-3", "llama3"),
+    ("llama3", "llama3"),
+    ("meta-llama-3", "llama3"),
+    ("llama-2", "llama2"),
+    ("llama2", "llama2"),
+    ("gemma-2", "gemma2"),
+    ("gemma2", "gemma2"),
+    ("qwen", "chatml"),
+    ("mistral", "chatml"),
+    ("yi-", "chatml"),
+]
+
+
+def detect_template(model_path: str) -> str:
+    """Auto-detect chat template from model filename.
+
+    Args:
+        model_path: Path to model file (e.g., "Meta-Llama-3-8B-Instruct.Q4_K_M.gguf")
+
+    Returns:
+        Template name (e.g., "llama3", "chatml"). Falls back to "chatml" if unknown.
+    """
+    name = model_path.lower().replace("\\", "/").split("/")[-1]
+    for pattern, template in _DETECTION_PATTERNS:
+        if pattern in name:
+            return template
+    return "chatml"  # Safe default
+
+
+def list_templates() -> List[str]:
+    """List all available template names."""
+    return list(TEMPLATES.keys())
+
+
+def format_chat(
+    messages: List[Dict[str, str]],
+    template: str = "chatml",
+    add_generation_prompt: bool = True,
+) -> str:
+    """Format chat messages into a model-specific prompt string.
+
+    Args:
+        messages: List of {"role": "system"|"user"|"assistant", "content": "..."}
+        template: Template name ("llama3", "llama2", "gemma2", "chatml")
+        add_generation_prompt: If True, append assistant start token at the end
+
+    Returns:
+        Formatted prompt string ready for model.generate()
+
+    Example:
+        >>> messages = [{"role": "user", "content": "Hello!"}]
+        >>> format_chat(messages, template="llama3")
+        '<|begin_of_text|><|start_header_id|>user<|end_header_id|>\\n\\nHello!<|eot_id|><|start_header_id|>assistant<|end_header_id|>\\n\\n'
+    """
+    if template not in TEMPLATES:
+        raise ValueError(f"Unknown template '{template}'. Available: {list_templates()}")
+
+    tmpl = TEMPLATES[template]
+    parts = [tmpl["bos"]]
+
+    system_inside_user = tmpl.get("system_inside_user", False)
+    system_content = None
+
+    for msg in messages:
+        role = msg["role"]
+        content = msg["content"]
+
+        if role == "system":
+            if system_inside_user:
+                # Llama-2 style: save system for embedding in first user message
+                system_content = content
+            elif tmpl["system_start"]:  # Skip if no system support (Gemma-2)
+                parts.append(tmpl["system_start"])
+                parts.append(content)
+                parts.append(tmpl["system_end"])
+
+        elif role == "user":
+            parts.append(tmpl["user_start"])
+            if system_inside_user and system_content is not None:
+                # Llama-2: embed system before user content
+                parts.append(tmpl["system_start"])
+                parts.append(system_content)
+                parts.append(tmpl["system_end"])
+                system_content = None  # Only first user message
+            parts.append(content)
+            parts.append(tmpl["user_end"])
+
+        elif role == "assistant":
+            parts.append(tmpl["assistant_start"])
+            parts.append(content)
+            parts.append(tmpl["assistant_end"])
+
+    if add_generation_prompt:
+        parts.append(tmpl["assistant_start"])
+
+    return "".join(parts)
+
+
+def format_simple(
+    user_message: str,
+    system_message: Optional[str] = None,
+    template: str = "chatml",
+) -> str:
+    """Convenience: format a single user message (with optional system prompt).
+
+    Args:
+        user_message: The user's message
+        system_message: Optional system prompt (uses template default if None)
+        template: Template name
+
+    Returns:
+        Formatted prompt string
+    """
+    messages = []
+
+    tmpl = TEMPLATES.get(template, TEMPLATES["chatml"])
+    if system_message is not None:
+        messages.append({"role": "system", "content": system_message})
+    elif tmpl.get("default_system"):
+        messages.append({"role": "system", "content": tmpl["default_system"]})
+
+    messages.append({"role": "user", "content": user_message})
+
+    return format_chat(messages, template=template)
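A short usage sketch of the `chat.py` module added above. The filename passed to `detect_template` is illustrative; unrecognized names fall back to `"chatml"`, and the `llama2` template folds the system prompt into the first `[INST]` block (the `system_inside_user` path in `format_chat`):

```python
# Sketch only: exercises detect_template() and format_chat() from chat.py above.
from cortex_rust.chat import format_chat, detect_template

template = detect_template("llama-2-7b-chat.Q4_K_M.gguf")  # matches "llama-2" -> "llama2"

messages = [
    {"role": "system", "content": "Answer briefly."},
    {"role": "user", "content": "Hello!"},
]
prompt = format_chat(messages, template=template)
# '<s>[INST] <<SYS>>\nAnswer briefly.\n<</SYS>>\n\nHello! [/INST] '
# add_generation_prompt=True (the default) appends the assistant_start (" ")
# so generation continues as the assistant turn.
print(prompt)
```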