bit-ttt-engine 0.6.0.tar.gz → 0.7.0.tar.gz

This diff shows the changes between publicly available package versions as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (157)
  1. bit_ttt_engine-0.7.0/PKG-INFO +136 -0
  2. bit_ttt_engine-0.7.0/README_PYPI.md +103 -0
  3. bit_ttt_engine-0.7.0/cortex_rust/__init__.py +25 -0
  4. bit_ttt_engine-0.7.0/cortex_rust/__main__.py +4 -0
  5. bit_ttt_engine-0.7.0/cortex_rust/chat.py +196 -0
  6. bit_ttt_engine-0.7.0/cortex_rust/cli.py +381 -0
  7. bit_ttt_engine-0.7.0/cortex_rust/engine.py +253 -0
  8. bit_ttt_engine-0.7.0/cortex_rust/server.py +493 -0
  9. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/Cargo.toml +9 -2
  10. bit_ttt_engine-0.7.0/crates/rust_engine/LICENSE +21 -0
  11. bit_ttt_engine-0.7.0/crates/rust_engine/README_PYPI.md +103 -0
  12. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/build.rs +61 -7
  13. bit_ttt_engine-0.7.0/crates/rust_engine/cortex_rust/__init__.py +25 -0
  14. bit_ttt_engine-0.7.0/crates/rust_engine/cortex_rust/__main__.py +4 -0
  15. bit_ttt_engine-0.7.0/crates/rust_engine/cortex_rust/chat.py +196 -0
  16. bit_ttt_engine-0.7.0/crates/rust_engine/cortex_rust/cli.py +381 -0
  17. bit_ttt_engine-0.7.0/crates/rust_engine/cortex_rust/engine.py +253 -0
  18. bit_ttt_engine-0.7.0/crates/rust_engine/cortex_rust/server.py +493 -0
  19. bit_ttt_engine-0.7.0/crates/rust_engine/examples/basic_generate.rs +134 -0
  20. bit_ttt_engine-0.7.0/crates/rust_engine/examples/bench_qmatmul.rs +150 -0
  21. bit_ttt_engine-0.7.0/crates/rust_engine/examples/bench_qmatmul_v2.rs +124 -0
  22. bit_ttt_engine-0.7.0/crates/rust_engine/examples/debug_gemma2.rs +148 -0
  23. bit_ttt_engine-0.7.0/crates/rust_engine/examples/interactive_chat.rs +161 -0
  24. bit_ttt_engine-0.7.0/crates/rust_engine/examples/model_info.rs +113 -0
  25. bit_ttt_engine-0.7.0/crates/rust_engine/examples/profile_7b.rs +104 -0
  26. bit_ttt_engine-0.7.0/crates/rust_engine/examples/test_7b.rs +108 -0
  27. bit_ttt_engine-0.7.0/crates/rust_engine/examples/test_qmatmul.rs +97 -0
  28. bit_ttt_engine-0.7.0/crates/rust_engine/examples/test_quantize_q8.rs +121 -0
  29. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/examples/ttt_benchmark.rs +1 -1
  30. bit_ttt_engine-0.7.0/crates/rust_engine/src/bin/bench_wmma.rs +166 -0
  31. bit_ttt_engine-0.7.0/crates/rust_engine/src/bin/test_wmma_small.rs +105 -0
  32. bit_ttt_engine-0.7.0/crates/rust_engine/src/kernels/cublas_gemv.rs +58 -0
  33. bit_ttt_engine-0.7.0/crates/rust_engine/src/kernels/flash_attention.cu +189 -0
  34. bit_ttt_engine-0.7.0/crates/rust_engine/src/kernels/flash_attention.ptx +1022 -0
  35. bit_ttt_engine-0.7.0/crates/rust_engine/src/kernels/flash_attention_cuda.rs +214 -0
  36. bit_ttt_engine-0.7.0/crates/rust_engine/src/kernels/flash_decoding.cu +401 -0
  37. bit_ttt_engine-0.7.0/crates/rust_engine/src/kernels/flash_decoding.ptx +2002 -0
  38. bit_ttt_engine-0.7.0/crates/rust_engine/src/kernels/flash_decoding.rs +659 -0
  39. bit_ttt_engine-0.7.0/crates/rust_engine/src/kernels/flash_decoding_fp16.cu +377 -0
  40. bit_ttt_engine-0.7.0/crates/rust_engine/src/kernels/flash_decoding_fp16.ptx +2880 -0
  41. bit_ttt_engine-0.7.0/crates/rust_engine/src/kernels/fused_ops.cu +419 -0
  42. bit_ttt_engine-0.7.0/crates/rust_engine/src/kernels/fused_ops.ptx +864 -0
  43. bit_ttt_engine-0.7.0/crates/rust_engine/src/kernels/fused_ops.rs +881 -0
  44. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/kernels/matmul_4bit.rs +19 -0
  45. bit_ttt_engine-0.7.0/crates/rust_engine/src/kernels/mod.rs +12 -0
  46. bit_ttt_engine-0.7.0/crates/rust_engine/src/kernels/quantize_q8.cu +589 -0
  47. bit_ttt_engine-0.7.0/crates/rust_engine/src/kernels/quantize_q8.ptx +3582 -0
  48. bit_ttt_engine-0.7.0/crates/rust_engine/src/kernels/quantize_q8.rs +767 -0
  49. bit_ttt_engine-0.7.0/crates/rust_engine/src/kernels/wmma_matmul.cu +463 -0
  50. bit_ttt_engine-0.7.0/crates/rust_engine/src/kernels/wmma_matmul.ptx +3730 -0
  51. bit_ttt_engine-0.7.0/crates/rust_engine/src/kernels/wmma_matmul.rs +376 -0
  52. bit_ttt_engine-0.7.0/crates/rust_engine/src/layers/lora.rs +548 -0
  53. bit_ttt_engine-0.7.0/crates/rust_engine/src/layers/lora_train.rs +469 -0
  54. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/layers/ttt.rs +241 -2
  55. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/layers.rs +4 -0
  56. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/lib.rs +19 -0
  57. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/model/block.rs +4 -2
  58. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/model/config.rs +62 -0
  59. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/model/detector.rs +12 -0
  60. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/model/gguf_loader.rs +55 -15
  61. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/model/gguf_model.rs +979 -700
  62. bit_ttt_engine-0.7.0/crates/rust_engine/src/model/gguf_model_quantized.rs +1626 -0
  63. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/model.rs +2 -0
  64. bit_ttt_engine-0.7.0/crates/rust_engine/src/python.rs +2142 -0
  65. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/tests/ttt_test.rs +3 -0
  66. bit_ttt_engine-0.7.0/pyproject.toml +45 -0
  67. bit_ttt_engine-0.6.0/PKG-INFO +0 -91
  68. bit_ttt_engine-0.6.0/README_PYPI.md +0 -73
  69. bit_ttt_engine-0.6.0/crates/rust_engine/README_PYPI.md +0 -73
  70. bit_ttt_engine-0.6.0/crates/rust_engine/src/kernels/fused_ops.cu +0 -261
  71. bit_ttt_engine-0.6.0/crates/rust_engine/src/kernels/fused_ops.ptx +0 -498
  72. bit_ttt_engine-0.6.0/crates/rust_engine/src/kernels/fused_ops.rs +0 -366
  73. bit_ttt_engine-0.6.0/crates/rust_engine/src/kernels/mod.rs +0 -7
  74. bit_ttt_engine-0.6.0/crates/rust_engine/src/python.rs +0 -432
  75. bit_ttt_engine-0.6.0/pyproject.toml +0 -26
  76. {bit_ttt_engine-0.6.0/crates/rust_engine → bit_ttt_engine-0.7.0}/LICENSE +0 -0
  77. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/.cargo/config.toml +0 -0
  78. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/CHANGELOG.md +0 -0
  79. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/Cargo.lock +0 -0
  80. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/README.md +0 -0
  81. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/cortex_rust.pyi +0 -0
  82. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/docs/paged_attention_quality_investigation.md +0 -0
  83. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/examples/benchmark.rs +0 -0
  84. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/examples/cuda_test.rs +0 -0
  85. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/examples/debug_load.rs +0 -0
  86. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/examples/e2e_benchmark.rs +0 -0
  87. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/examples/python_sanity_check.py +0 -0
  88. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/bin/bench_4bit_gpu.rs +0 -0
  89. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/bin/bench_cpu_kernel.rs +0 -0
  90. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/bin/bench_gemm_4bit.rs +0 -0
  91. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/bin/bench_sizes.rs +0 -0
  92. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/bin/bench_tinyllama.rs +0 -0
  93. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/bin/detect_model.rs +0 -0
  94. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/bin/fast_download.rs +0 -0
  95. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/bin/quick_gen.rs +0 -0
  96. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/bin/run_4bit_llama.rs +0 -0
  97. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/bin/test_13b.rs +0 -0
  98. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/bin/test_4bit_inference.rs +0 -0
  99. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/bin/test_cuda_gemm.rs +0 -0
  100. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/bin/test_memory.rs +0 -0
  101. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/device_utils.rs +0 -0
  102. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/download.rs +0 -0
  103. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/error.rs +0 -0
  104. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/eval/mod.rs +0 -0
  105. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/eval/perplexity.rs +0 -0
  106. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/kernels/adaptive_bit_op.cu +0 -0
  107. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/kernels/adaptive_bit_op.ptx +0 -0
  108. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/kernels/bit_op.cu +0 -0
  109. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/kernels/bit_op.ptx +0 -0
  110. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/kernels/cpu.rs +0 -0
  111. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/kernels/cuda.rs +0 -0
  112. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/kernels/matmul_4bit.cu +0 -0
  113. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/kernels/matmul_4bit.ptx +0 -0
  114. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/kernels/packing.rs +0 -0
  115. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/kernels/packing_4bit.rs +0 -0
  116. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/kernels/paged_attention.cu +0 -0
  117. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/kernels/paged_attention.ptx +0 -0
  118. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/kernels/paged_attention.rs +0 -0
  119. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/layers/adaptive_linear.rs +0 -0
  120. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/layers/attention.rs +0 -0
  121. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/layers/bit_linear.rs +0 -0
  122. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/layers/flash_attention.rs +0 -0
  123. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/layers/isomorphic.rs +0 -0
  124. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/layers/kv_cache/mod.rs +0 -0
  125. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/layers/linear_4bit.rs +0 -0
  126. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/layers/rms_norm.rs +0 -0
  127. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/layers/swiglu.rs +0 -0
  128. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/model/config_common.rs +0 -0
  129. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/model/llama/bitllama.rs +0 -0
  130. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/model/llama/llama_fp16.rs +0 -0
  131. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/model/llama/mod.rs +0 -0
  132. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/model/llama_4bit.rs +0 -0
  133. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/model/unified.rs +0 -0
  134. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/optim/mod.rs +0 -0
  135. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/optim/schedule_free.rs +0 -0
  136. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/pack/install.rs +0 -0
  137. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/pack/lib.rs +0 -0
  138. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/pack/reader.rs +0 -0
  139. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/pack/types.rs +0 -0
  140. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/pack/verify.rs +0 -0
  141. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/pack/writer.rs +0 -0
  142. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/paged_attention/block_manager.rs +0 -0
  143. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/paged_attention/cache_engine.rs +0 -0
  144. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/paged_attention/mod.rs +0 -0
  145. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/scheduler/mod.rs +0 -0
  146. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/speculative/mod.rs +0 -0
  147. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/tests/attention_test.rs +0 -0
  148. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/tests/bit_linear_test.rs +0 -0
  149. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/tests/format_diagnosis.rs +0 -0
  150. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/tests/isomorphic_test.rs +0 -0
  151. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/src/wasm.rs +0 -0
  152. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/tests/accuracy_test.rs +0 -0
  153. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/tests/bitllama_e2e.rs +0 -0
  154. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/tests/common.rs +0 -0
  155. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/tests/gguf_e2e.rs +0 -0
  156. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/tests/load_direct_benchmark.rs +0 -0
  157. {bit_ttt_engine-0.6.0 → bit_ttt_engine-0.7.0}/crates/rust_engine/tests/load_packed_e2e.rs +0 -0
bit_ttt_engine-0.7.0/PKG-INFO
@@ -0,0 +1,136 @@
+ Metadata-Version: 2.4
+ Name: bit-ttt-engine
+ Version: 0.7.0
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Programming Language :: Rust
+ Classifier: Programming Language :: Python :: Implementation :: CPython
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.8
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+ Classifier: Operating System :: Microsoft :: Windows
+ Requires-Dist: tokenizers>=0.19
+ Requires-Dist: huggingface-hub>=0.20
+ Requires-Dist: bit-ttt-engine[server] ; extra == 'all'
+ Requires-Dist: fastapi>=0.100 ; extra == 'server'
+ Requires-Dist: uvicorn>=0.20 ; extra == 'server'
+ Requires-Dist: sse-starlette>=1.6 ; extra == 'server'
+ Provides-Extra: all
+ Provides-Extra: server
+ License-File: LICENSE
+ Summary: Fast local LLM inference with TTT (Test-Time Training) and LoRA — the model that learns while it runs
+ Keywords: llm,inference,ttt,lora,gguf,quantization,cuda
+ Home-Page: https://github.com/imonoonoko/Bit-TTT-Engine
+ Author: imonoonoko
+ License: MIT
+ Requires-Python: >=3.8
+ Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
+
+ # 🧠 Bit-TTT-Engine
+
+ [![PyPI](https://img.shields.io/pypi/v/bit-ttt-engine.svg)](https://pypi.org/project/bit-ttt-engine/)
+ [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
+ [![Rust](https://img.shields.io/badge/rust-1.70+-orange.svg)](https://www.rust-lang.org/)
+
+ **Fast local LLM inference that learns while it runs.**
+
+ - 🏎️ **47+ tok/s** on RTX 4060 Ti (7B Q4_K_M)
+ - 🧠 **TTT** (Test-Time Training) — adapts during inference (world's first!)
+ - 🎨 **LoRA** — fine-tune with one flag
+ - 📦 **5 models** — Llama-2/3, Gemma-2, Qwen2.5, Mistral
+ - 🔌 **OpenAI-compatible API** — drop-in replacement
+
+ ## 🚀 Quick Start
+
+ ```bash
+ pip install bit-ttt-engine
+ ```
+
+ ```python
+ import cortex_rust
+
+ # Load any GGUF model (auto-downloads from HuggingFace!)
+ model = cortex_rust.load("user/model-GGUF")
+
+ # Chat
+ response = model.chat([
+     {"role": "user", "content": "Hello!"}
+ ])
+ print(response)
+
+ # Stream
+ for token in model.chat_stream([
+     {"role": "user", "content": "Tell me a story"}
+ ]):
+     print(token, end="", flush=True)
+ ```
+
+ ## 🖥️ CLI
+
+ ```bash
+ # Interactive chat
+ bit-ttt chat model.gguf
+
+ # Generate text
+ bit-ttt generate model.gguf -p "Once upon a time"
+
+ # OpenAI-compatible API server
+ bit-ttt serve model.gguf --port 8000
+
+ # With LoRA + Q8 KV cache
+ bit-ttt chat model.gguf --lora adapter.bin --q8-cache
+ ```
+
+ ## 🧠 TTT — Test-Time Training
+
+ **The model learns while it generates.** No other local LLM does this.
+
+ ```python
+ model = cortex_rust.load("model.gguf")
+ model.enable_ttt(True)
+
+ # Each conversation makes the model smarter
+ response = model.chat([{"role": "user", "content": "My name is Alice"}])
+ # Next time, it remembers context better!
+ ```
+
+ ## ⚡ Performance
+
+ | Model | Speed | VRAM |
+ |-------|-------|------|
+ | Llama-2 7B (Q4_K_M) | 47.8 tok/s | ~5 GB |
+ | Llama-3 8B (Q4_K_M) | 36.8 tok/s | ~6 GB |
+ | Mistral 7B (Q4_K_M) | 40.8 tok/s | ~5 GB |
+ | Qwen2.5 1.5B (Q4_K_M) | 70.4 tok/s | ~2 GB |
+
+ With `--q8-cache`: **82% VRAM reduction** for KV cache.
+
+ ## 🔌 OpenAI-Compatible API
+
+ ```bash
+ bit-ttt serve model.gguf --port 8000
+ ```
+
+ ```python
+ from openai import OpenAI
+ client = OpenAI(base_url="http://localhost:8000/v1", api_key="none")
+ response = client.chat.completions.create(
+     model="default",
+     messages=[{"role": "user", "content": "Hi!"}],
+     stream=True,
+ )
+ ```
+
+ ## 📖 Links
+
+ - [GitHub](https://github.com/imonoonoko/Bit-TTT-Engine)
+ - [Documentation](https://github.com/imonoonoko/Bit-TTT-Engine#readme)
+
+ ## 💖 License
+
+ MIT License
+
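The metadata above gates the FastAPI, uvicorn, and sse-starlette dependencies behind the `server` extra (with `all` pulling in `bit-ttt-engine[server]`), so a plain install stays lightweight. As a minimal sketch, the same metadata can be inspected at runtime with only the standard library, assuming bit-ttt-engine 0.7.0 is installed in the current environment:

```python
# Read the installed package's metadata (stdlib importlib.metadata).
from importlib.metadata import metadata, requires

meta = metadata("bit-ttt-engine")
print(meta["Name"], meta["Version"])      # bit-ttt-engine 0.7.0
print(meta.get_all("Provides-Extra"))     # ['all', 'server']

# Requirements gated behind an extra carry an environment marker,
# e.g. 'fastapi>=0.100; extra == "server"'.
for req in requires("bit-ttt-engine") or []:
    print(req)
```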
bit_ttt_engine-0.7.0/README_PYPI.md
@@ -0,0 +1,103 @@
+ # 🧠 Bit-TTT-Engine
+
+ [![PyPI](https://img.shields.io/pypi/v/bit-ttt-engine.svg)](https://pypi.org/project/bit-ttt-engine/)
+ [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
+ [![Rust](https://img.shields.io/badge/rust-1.70+-orange.svg)](https://www.rust-lang.org/)
+
+ **Fast local LLM inference that learns while it runs.**
+
+ - 🏎️ **47+ tok/s** on RTX 4060 Ti (7B Q4_K_M)
+ - 🧠 **TTT** (Test-Time Training) — adapts during inference (world's first!)
+ - 🎨 **LoRA** — fine-tune with one flag
+ - 📦 **5 models** — Llama-2/3, Gemma-2, Qwen2.5, Mistral
+ - 🔌 **OpenAI-compatible API** — drop-in replacement
+
+ ## 🚀 Quick Start
+
+ ```bash
+ pip install bit-ttt-engine
+ ```
+
+ ```python
+ import cortex_rust
+
+ # Load any GGUF model (auto-downloads from HuggingFace!)
+ model = cortex_rust.load("user/model-GGUF")
+
+ # Chat
+ response = model.chat([
+     {"role": "user", "content": "Hello!"}
+ ])
+ print(response)
+
+ # Stream
+ for token in model.chat_stream([
+     {"role": "user", "content": "Tell me a story"}
+ ]):
+     print(token, end="", flush=True)
+ ```
+
+ ## 🖥️ CLI
+
+ ```bash
+ # Interactive chat
+ bit-ttt chat model.gguf
+
+ # Generate text
+ bit-ttt generate model.gguf -p "Once upon a time"
+
+ # OpenAI-compatible API server
+ bit-ttt serve model.gguf --port 8000
+
+ # With LoRA + Q8 KV cache
+ bit-ttt chat model.gguf --lora adapter.bin --q8-cache
+ ```
+
+ ## 🧠 TTT — Test-Time Training
+
+ **The model learns while it generates.** No other local LLM does this.
+
+ ```python
+ model = cortex_rust.load("model.gguf")
+ model.enable_ttt(True)
+
+ # Each conversation makes the model smarter
+ response = model.chat([{"role": "user", "content": "My name is Alice"}])
+ # Next time, it remembers context better!
+ ```
+
+ ## ⚡ Performance
+
+ | Model | Speed | VRAM |
+ |-------|-------|------|
+ | Llama-2 7B (Q4_K_M) | 47.8 tok/s | ~5 GB |
+ | Llama-3 8B (Q4_K_M) | 36.8 tok/s | ~6 GB |
+ | Mistral 7B (Q4_K_M) | 40.8 tok/s | ~5 GB |
+ | Qwen2.5 1.5B (Q4_K_M) | 70.4 tok/s | ~2 GB |
+
+ With `--q8-cache`: **82% VRAM reduction** for KV cache.
+
+ ## 🔌 OpenAI-Compatible API
+
+ ```bash
+ bit-ttt serve model.gguf --port 8000
+ ```
+
+ ```python
+ from openai import OpenAI
+ client = OpenAI(base_url="http://localhost:8000/v1", api_key="none")
+ response = client.chat.completions.create(
+     model="default",
+     messages=[{"role": "user", "content": "Hi!"}],
+     stream=True,
+ )
+ ```
+
+ ## 📖 Links
+
+ - [GitHub](https://github.com/imonoonoko/Bit-TTT-Engine)
+ - [Documentation](https://github.com/imonoonoko/Bit-TTT-Engine#readme)
+
+ ## 💖 License
+
+ MIT License
bit_ttt_engine-0.7.0/cortex_rust/__init__.py
@@ -0,0 +1,25 @@
+ # Auto-add CUDA DLL path on Windows
+ import os
+ import sys
+
+ if sys.platform == "win32":
+     cuda_paths = [
+         os.environ.get("CUDA_PATH", ""),
+         r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4",
+         r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.3",
+         r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.0",
+         r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8",
+     ]
+     for cuda_path in cuda_paths:
+         bin_path = os.path.join(cuda_path, "bin")
+         if os.path.isdir(bin_path):
+             os.add_dll_directory(bin_path)
+             break
+
+ from .cortex_rust import *
+ from .chat import format_chat, format_simple, detect_template, list_templates
+ from .engine import load, Model
+
+ __doc__ = cortex_rust.__doc__
+ if hasattr(cortex_rust, "__all__"):
+     __all__ = cortex_rust.__all__
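Because `os.environ.get("CUDA_PATH", "")` sits first in the search list above and the loop stops at the first directory whose `bin` subfolder exists, a non-default CUDA install can be selected by setting `CUDA_PATH` before the first import. A minimal sketch; the `D:\CUDA\v12.4` location is a hypothetical example:

```python
# Steer the Windows CUDA DLL lookup performed by __init__.py above.
import os

os.environ["CUDA_PATH"] = r"D:\CUDA\v12.4"  # hypothetical custom install

import cortex_rust  # __init__ now calls os.add_dll_directory(r"D:\CUDA\v12.4\bin")
```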
bit_ttt_engine-0.7.0/cortex_rust/__main__.py
@@ -0,0 +1,4 @@
+ """Enable `python -m cortex_rust` as CLI entry point."""
+ from cortex_rust.cli import main
+
+ main()
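This makes the package runnable as `python -m cortex_rust`, reusing the same `main()` as the `bit-ttt` console command. A hypothetical sketch of driving it in-process, assuming `main()` reads `sys.argv` in the usual argparse style (not confirmed by this diff):

```python
# Invoke the CLI without spawning a subprocess (assumed argv convention).
import sys

from cortex_rust.cli import main

sys.argv = ["bit-ttt", "generate", "model.gguf", "-p", "Once upon a time"]
main()
```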
bit_ttt_engine-0.7.0/cortex_rust/chat.py
@@ -0,0 +1,196 @@
+ """Chat template support for various LLM architectures.
+
+ Provides format_chat() to convert messages into model-specific prompt strings.
+ Supports Llama-3, Llama-2, Gemma-2, Qwen/ChatML, and generic formats.
+
+ Usage:
+     from cortex_rust.chat import format_chat, detect_template
+
+     messages = [
+         {"role": "system", "content": "You are a helpful assistant."},
+         {"role": "user", "content": "Hello!"},
+     ]
+     prompt = format_chat(messages, template="llama3")
+     # Or auto-detect from model path:
+     template = detect_template("Meta-Llama-3-8B-Instruct.Q4_K_M.gguf")
+     prompt = format_chat(messages, template=template)
+ """
+
+ from typing import List, Dict, Optional
+
+ # ============================================================================
+ # Template Definitions
+ # ============================================================================
+
+ TEMPLATES = {
+     "llama3": {
+         "bos": "<|begin_of_text|>",
+         "system_start": "<|start_header_id|>system<|end_header_id|>\n\n",
+         "system_end": "<|eot_id|>",
+         "user_start": "<|start_header_id|>user<|end_header_id|>\n\n",
+         "user_end": "<|eot_id|>",
+         "assistant_start": "<|start_header_id|>assistant<|end_header_id|>\n\n",
+         "assistant_end": "<|eot_id|>",
+         "default_system": "You are a helpful assistant.",
+     },
+     "llama2": {
+         "bos": "<s>",
+         "system_start": "<<SYS>>\n",
+         "system_end": "\n<</SYS>>\n\n",
+         "user_start": "[INST] ",
+         "user_end": " [/INST]",
+         "assistant_start": " ",
+         "assistant_end": " </s>",
+         "default_system": "You are a helpful, respectful and honest assistant.",
+         # Llama-2 embeds system inside first [INST]
+         "system_inside_user": True,
+     },
+     "gemma2": {
+         "bos": "<bos>",
+         "system_start": "",  # Gemma-2 has no system role
+         "system_end": "",
+         "user_start": "<start_of_turn>user\n",
+         "user_end": "<end_of_turn>\n",
+         "assistant_start": "<start_of_turn>model\n",
+         "assistant_end": "<end_of_turn>\n",
+         "default_system": None,  # No system support
+     },
+     "chatml": {
+         # Used by Qwen, Mistral-Instruct, etc.
+         "bos": "",
+         "system_start": "<|im_start|>system\n",
+         "system_end": "<|im_end|>\n",
+         "user_start": "<|im_start|>user\n",
+         "user_end": "<|im_end|>\n",
+         "assistant_start": "<|im_start|>assistant\n",
+         "assistant_end": "<|im_end|>\n",
+         "default_system": "You are a helpful assistant.",
+     },
+ }
+
+ # Model name patterns → template mapping
+ _DETECTION_PATTERNS = [
+     ("llama-3", "llama3"),
+     ("llama3", "llama3"),
+     ("meta-llama-3", "llama3"),
+     ("llama-2", "llama2"),
+     ("llama2", "llama2"),
+     ("gemma-2", "gemma2"),
+     ("gemma2", "gemma2"),
+     ("qwen", "chatml"),
+     ("mistral", "chatml"),
+     ("yi-", "chatml"),
+ ]
+
+
+ def detect_template(model_path: str) -> str:
+     """Auto-detect chat template from model filename.
+
+     Args:
+         model_path: Path to model file (e.g., "Meta-Llama-3-8B-Instruct.Q4_K_M.gguf")
+
+     Returns:
+         Template name (e.g., "llama3", "chatml"). Falls back to "chatml" if unknown.
+     """
+     name = model_path.lower().replace("\\", "/").split("/")[-1]
+     for pattern, template in _DETECTION_PATTERNS:
+         if pattern in name:
+             return template
+     return "chatml"  # Safe default
+
+
+ def list_templates() -> List[str]:
+     """List all available template names."""
+     return list(TEMPLATES.keys())
+
+
+ def format_chat(
+     messages: List[Dict[str, str]],
+     template: str = "chatml",
+     add_generation_prompt: bool = True,
+ ) -> str:
+     """Format chat messages into a model-specific prompt string.
+
+     Args:
+         messages: List of {"role": "system"|"user"|"assistant", "content": "..."}
+         template: Template name ("llama3", "llama2", "gemma2", "chatml")
+         add_generation_prompt: If True, append assistant start token at the end
+
+     Returns:
+         Formatted prompt string ready for model.generate()
+
+     Example:
+         >>> messages = [{"role": "user", "content": "Hello!"}]
+         >>> format_chat(messages, template="llama3")
+         '<|begin_of_text|><|start_header_id|>user<|end_header_id|>\\n\\nHello!<|eot_id|><|start_header_id|>assistant<|end_header_id|>\\n\\n'
+     """
+     if template not in TEMPLATES:
+         raise ValueError(f"Unknown template '{template}'. Available: {list_templates()}")
+
+     tmpl = TEMPLATES[template]
+     parts = [tmpl["bos"]]
+
+     system_inside_user = tmpl.get("system_inside_user", False)
+     system_content = None
+
+     for msg in messages:
+         role = msg["role"]
+         content = msg["content"]
+
+         if role == "system":
+             if system_inside_user:
+                 # Llama-2 style: save system for embedding in first user message
+                 system_content = content
+             elif tmpl["system_start"]:  # Skip if no system support (Gemma-2)
+                 parts.append(tmpl["system_start"])
+                 parts.append(content)
+                 parts.append(tmpl["system_end"])
+
+         elif role == "user":
+             parts.append(tmpl["user_start"])
+             if system_inside_user and system_content is not None:
+                 # Llama-2: embed system before user content
+                 parts.append(tmpl["system_start"])
+                 parts.append(system_content)
+                 parts.append(tmpl["system_end"])
+                 system_content = None  # Only first user message
+             parts.append(content)
+             parts.append(tmpl["user_end"])
+
+         elif role == "assistant":
+             parts.append(tmpl["assistant_start"])
+             parts.append(content)
+             parts.append(tmpl["assistant_end"])
+
+     if add_generation_prompt:
+         parts.append(tmpl["assistant_start"])
+
+     return "".join(parts)
+
+
+ def format_simple(
+     user_message: str,
+     system_message: Optional[str] = None,
+     template: str = "chatml",
+ ) -> str:
+     """Convenience: format a single user message (with optional system prompt).
+
+     Args:
+         user_message: The user's message
+         system_message: Optional system prompt (uses template default if None)
+         template: Template name
+
+     Returns:
+         Formatted prompt string
+     """
+     messages = []
+
+     tmpl = TEMPLATES.get(template, TEMPLATES["chatml"])
+     if system_message is not None:
+         messages.append({"role": "system", "content": system_message})
+     elif tmpl.get("default_system"):
+         messages.append({"role": "system", "content": tmpl["default_system"]})
+
+     messages.append({"role": "user", "content": user_message})
+
+     return format_chat(messages, template=template)
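To make the templating rules above concrete, here is a short usage sketch against the functions defined in chat.py; the expected strings follow directly from the `TEMPLATES` table:

```python
from cortex_rust.chat import detect_template, format_chat, format_simple

messages = [
    {"role": "system", "content": "You are terse."},
    {"role": "user", "content": "Hi"},
]

# Filename-based detection; unknown names fall back to "chatml".
assert detect_template("Meta-Llama-3-8B-Instruct.Q4_K_M.gguf") == "llama3"
assert detect_template("some-unrecognized-model.gguf") == "chatml"

# Llama-3 keeps the system turn as its own header block:
# '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are terse.<|eot_id|>'
# '<|start_header_id|>user<|end_header_id|>\n\nHi<|eot_id|>'
# '<|start_header_id|>assistant<|end_header_id|>\n\n'
print(format_chat(messages, template="llama3"))

# Llama-2 instead folds the system prompt into the first [INST] block:
# '<s>[INST] <<SYS>>\nYou are terse.\n<</SYS>>\n\nHi [/INST] '
print(format_chat(messages, template="llama2"))

# format_simple() injects the template's default system prompt when none is given.
print(format_simple("Hi", template="chatml"))
```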