titan-synapse 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CONTRIBUTING.md +187 -0
- package/Cargo.lock +3976 -0
- package/Cargo.toml +10 -0
- package/LICENSE +190 -0
- package/PROGRESS.md +151 -0
- package/README.md +514 -0
- package/TEST_LOG.md +220 -0
- package/config/default.yaml +36 -0
- package/crates/synapse/Cargo.toml +70 -0
- package/crates/synapse/src/cli/bench.rs +44 -0
- package/crates/synapse/src/cli/eval.rs +395 -0
- package/crates/synapse/src/cli/export.rs +45 -0
- package/crates/synapse/src/cli/hub.rs +179 -0
- package/crates/synapse/src/cli/import.rs +35 -0
- package/crates/synapse/src/cli/learn.rs +53 -0
- package/crates/synapse/src/cli/mod.rs +10 -0
- package/crates/synapse/src/cli/models.rs +36 -0
- package/crates/synapse/src/cli/pull.rs +60 -0
- package/crates/synapse/src/cli/status.rs +52 -0
- package/crates/synapse/src/cli/train.rs +99 -0
- package/crates/synapse/src/config.rs +220 -0
- package/crates/synapse/src/dashboard.rs +281 -0
- package/crates/synapse/src/format/manifest.rs +57 -0
- package/crates/synapse/src/format/mod.rs +4 -0
- package/crates/synapse/src/format/packer.rs +213 -0
- package/crates/synapse/src/inference/engine.rs +361 -0
- package/crates/synapse/src/inference/kv_cache.rs +97 -0
- package/crates/synapse/src/inference/lora.rs +166 -0
- package/crates/synapse/src/inference/mod.rs +9 -0
- package/crates/synapse/src/inference/model.rs +167 -0
- package/crates/synapse/src/inference/sampler.rs +133 -0
- package/crates/synapse/src/inference/speculative.rs +153 -0
- package/crates/synapse/src/learn/cloud_fallback.rs +186 -0
- package/crates/synapse/src/learn/engine.rs +109 -0
- package/crates/synapse/src/learn/mod.rs +5 -0
- package/crates/synapse/src/main.rs +185 -0
- package/crates/synapse/src/memory/extractor.rs +201 -0
- package/crates/synapse/src/memory/graph.rs +332 -0
- package/crates/synapse/src/memory/hallucination.rs +259 -0
- package/crates/synapse/src/memory/mod.rs +7 -0
- package/crates/synapse/src/openai.rs +232 -0
- package/crates/synapse/src/server.rs +166 -0
- package/crates/synapse/src/streaming.rs +80 -0
- package/crates/synapse/src/swarm/coordinator.rs +198 -0
- package/crates/synapse/src/swarm/mod.rs +8 -0
- package/crates/synapse/src/swarm/orchestrator.rs +225 -0
- package/crates/synapse/src/swarm/pool.rs +64 -0
- package/crates/synapse/src/swarm/spawner.rs +199 -0
- package/crates/synapse/src/swarm/synthesizer.rs +26 -0
- package/crates/synapse/src/vram/manager.rs +67 -0
- package/crates/synapse/src/vram/mod.rs +3 -0
- package/docker-compose.yml +19 -0
- package/install.sh +311 -0
- package/package.json +36 -0
- package/python/Dockerfile.learn +18 -0
- package/python/requirements.txt +11 -0
- package/python/synapse_learn/__init__.py +0 -0
- package/python/synapse_learn/datasets.py +233 -0
- package/python/synapse_learn/real_eval.py +616 -0
- package/python/synapse_learn/server.py +431 -0
- package/python/synapse_learn/train_base.py +672 -0
- package/python/synapse_learn/train_specialists.py +787 -0
package/TEST_LOG.md
ADDED
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
# TITAN Synapse — Test Log
|
|
2
|
+
|
|
3
|
+
**Date**: March 20, 2026 (updated throughout DAY 1-2)
|
|
4
|
+
**Platform**: RTX 5090 (32GB VRAM), i9-14900KF, 64GB DDR5-6000, Ubuntu 24.04
|
|
5
|
+
**Models**: Qwen2.5-3B-Instruct (Q4_K_M, 1.9GB) + Qwen2.5-0.5B-Instruct (Q4_K_M, 491MB)
|
|
6
|
+
**Rust**: 1.94.0 (release build)
|
|
7
|
+
**Binary size**: ~6.3MB
|
|
8
|
+
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
## Unit Tests (15/15 PASSING)
|
|
12
|
+
|
|
13
|
+
```
|
|
14
|
+
cargo test — 15/15 PASSING in 0.01s
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
| # | Test | Module | Result |
|
|
18
|
+
|---|------|--------|--------|
|
|
19
|
+
| 1 | test_default_config | config | PASS |
|
|
20
|
+
| 2 | test_config_serialization | config | PASS |
|
|
21
|
+
| 3 | test_load_missing_config | config | PASS |
|
|
22
|
+
| 4 | test_greedy_sampling | inference::sampler | PASS |
|
|
23
|
+
| 5 | test_empty_logits | inference::sampler | PASS |
|
|
24
|
+
| 6 | test_stochastic_sampling | inference::sampler | PASS |
|
|
25
|
+
| 7 | test_cache_allocation | inference::kv_cache | PASS |
|
|
26
|
+
| 8 | test_knowledge_graph | memory::graph | PASS |
|
|
27
|
+
| 9 | test_preferences | memory::graph | PASS |
|
|
28
|
+
| 10 | test_hebbian_routing | memory::graph | PASS |
|
|
29
|
+
| 11 | test_specialist_stats | memory::graph | PASS |
|
|
30
|
+
| 12 | test_manifest_creation | format::manifest | PASS |
|
|
31
|
+
| 13 | test_manifest_serialization | format::manifest | PASS |
|
|
32
|
+
| 14 | test_pack_and_unpack | format::packer | PASS |
|
|
33
|
+
| 15 | test_list_bundles | format::packer | PASS |
|
|
34
|
+
|
|
35
|
+
---
|
|
36
|
+
|
|
37
|
+
## Benchmark Results — CPU (Qwen2.5-3B, Q4_K_M)
|
|
38
|
+
|
|
39
|
+
```
|
|
40
|
+
synapse bench — 4 prompts, 759 tokens total (CPU inference)
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
| Prompt | Tokens | Time | Tok/s |
|
|
44
|
+
|--------|--------|------|-------|
|
|
45
|
+
| Python decorator explanation | 109 | 5,001ms | 21.8 |
|
|
46
|
+
| SQL top-10 query | 139 | 6,467ms | 21.5 |
|
|
47
|
+
| TCP vs UDP explanation | 256 | 10,893ms | 23.5 |
|
|
48
|
+
| Go garbage collection | 255 | 10,875ms | 23.4 |
|
|
49
|
+
| **Average** | **190** | **8,309ms** | **23 tok/s** |
|
|
50
|
+
|
|
51
|
+
---
|
|
52
|
+
|
|
53
|
+
## Benchmark Results — CUDA GPU (Qwen2.5-3B, Q4_K_M, RTX 5090)
|
|
54
|
+
|
|
55
|
+
```
|
|
56
|
+
CUDA 12.8 (Blackwell) — 5x speedup over CPU
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
| Test | Tokens | Time | Tok/s |
|
|
60
|
+
|------|--------|------|-------|
|
|
61
|
+
| TCP vs UDP (run 1) | 119 | 935ms | 127.3 |
|
|
62
|
+
| TCP vs UDP (run 2) | 119 | 1,014ms | 117.4 |
|
|
63
|
+
| TCP vs UDP (run 3) | 120 | 1,017ms | 118.0 |
|
|
64
|
+
| Prime function (256 tokens) | 256 | 7,403ms | 34.6 |
|
|
65
|
+
| BST class (512 tokens) | 512 | 4,028ms | 127.1 |
|
|
66
|
+
| Programming languages list | 128 | ~1,300ms | 97.6 |
|
|
67
|
+
| **Average sustained** | — | — | **~97-128 tok/s** (excluding the 34.6 tok/s prime-function outlier) |
|
|
68
|
+
|
|
69
|
+
**Model load times (GPU)**:
|
|
70
|
+
- 3B model: **0.6s** (was 1.1s on CPU — 1.8x faster)
|
|
71
|
+
- 0.5B model: **0.3s** (was 0.7s on CPU — 2.3x faster)
|
|
72
|
+
|
|
73
|
+
**Consistency** (3 runs, same prompt, 119-120 tokens):
|
|
74
|
+
- Run 1: 119 tokens in 935ms (127.3 tok/s)
|
|
75
|
+
- Run 2: 119 tokens in 1,014ms (117.4 tok/s)
|
|
76
|
+
- Run 3: 120 tokens in 1,017ms (118.0 tok/s)
|
|
77
|
+
- **Variance**: <8% — very consistent
|
|
78
|
+
|
|
79
|
+
---
|
|
80
|
+
|
|
81
|
+
## Integration Tests (Live Server, RTX 5090)
|
|
82
|
+
|
|
83
|
+
All tests run against the live server at `http://192.168.1.11:6900`
|
|
84
|
+
|
|
85
|
+
### Test 1: Health Check
|
|
86
|
+
- **Endpoint**: `GET /health`
|
|
87
|
+
- **Result**: PASS
|
|
88
|
+
- **Response**: `ok`
|
|
89
|
+
- **HTTP Code**: 200
|
|
90
|
+
|
|
91
|
+
### Test 2: List Models
|
|
92
|
+
- **Endpoint**: `GET /v1/models`
|
|
93
|
+
- **Result**: PASS
|
|
94
|
+
- **Response**: 4 models listed (synapse, synapse/general, synapse/python_expert, synapse/sql_expert)
|
|
95
|
+
|
|
96
|
+
### Test 3: API Status (Enhanced)
|
|
97
|
+
- **Endpoint**: `GET /api/status`
|
|
98
|
+
- **Result**: PASS
|
|
99
|
+
- **Response**: Includes `models_loaded: ["qwen2.5-3b-instruct-q4_k_m", "qwen2.5-0.5b-instruct-q4_k_m"]`, Hebbian pathway data, knowledge stats
|
|
100
|
+
|
|
101
|
+
### Test 4: Simple Math — "What is 2+2?"
|
|
102
|
+
- **Endpoint**: `POST /v1/chat/completions`
|
|
103
|
+
- **Result**: PASS
|
|
104
|
+
- **Response**: `"2 + 2 equals 4."` (correct, clean stop)
|
|
105
|
+
- **Usage**: `{prompt_tokens: 34, completion_tokens: 8, total_tokens: 42}`
|
|
106
|
+
|
|
107
|
+
### Test 5: Python Code Generation
|
|
108
|
+
- **Endpoint**: `POST /v1/chat/completions`
|
|
109
|
+
- **Result**: PASS
|
|
110
|
+
- **Response**: Correct decorator explanation with working code example
|
|
111
|
+
|
|
112
|
+
### Test 6: SSE Streaming
|
|
113
|
+
- **Endpoint**: `POST /v1/chat/completions` (stream=true)
|
|
114
|
+
- **Result**: PASS
|
|
115
|
+
- **Response**: Role delta → content deltas → [DONE], proper SSE format
|
|
116
|
+
|
|
117
|
+
### Test 7: SQL Query Generation
|
|
118
|
+
- **Endpoint**: `POST /v1/chat/completions`
|
|
119
|
+
- **Result**: PASS
|
|
120
|
+
- **Response**: Correct SQL with JOIN, GROUP BY, COUNT
|
|
121
|
+
- **Specialist routed**: sql_expert
|
|
122
|
+
|
|
123
|
+
### Test 8: Token Counting Accuracy
|
|
124
|
+
- **Endpoint**: `POST /v1/chat/completions`
|
|
125
|
+
- **Result**: PASS
|
|
126
|
+
- **Response**: `usage` field with accurate prompt_tokens, completion_tokens, total_tokens
|
|
127
|
+
- **Verified**: total_tokens = prompt_tokens + completion_tokens
|
|
128
|
+
|
|
129
|
+
### Test 9: Hebbian Routing Accumulation
|
|
130
|
+
- **Method**: Made 3 Python queries, then checked /api/status
|
|
131
|
+
- **Result**: PASS
|
|
132
|
+
- **Hebbian state**: `python_expert: strength=4, avg_score=4.0`, `sql_expert: strength=3`, `general: strength=2`
|
|
133
|
+
- **Verified**: Pathways strengthen with repeated use
|
|
134
|
+
|
|
135
|
+
### Test 10: Multi-Model Loading
|
|
136
|
+
- **Result**: PASS
|
|
137
|
+
- **Models loaded**: qwen2.5-3b-instruct-q4_k_m (1.1s), qwen2.5-0.5b-instruct-q4_k_m (0.7s) — these are the CPU load times; on GPU the same models load in 0.6s and 0.3s
|
|
138
|
+
- **Simultaneous**: Both models in memory, auto-selected by engine
|
|
139
|
+
|
|
140
|
+
### Test 11: Counting Task
|
|
141
|
+
- **Prompt**: "Count from 1 to 5"
|
|
142
|
+
- **Result**: PASS
|
|
143
|
+
- **Response**: "1\n2\n3\n4\n5" (correct, clean)
|
|
144
|
+
|
|
145
|
+
---
|
|
146
|
+
|
|
147
|
+
## Performance Summary
|
|
148
|
+
|
|
149
|
+
| Metric | CPU | GPU (CUDA) |
|
|
150
|
+
|--------|-----|------------|
|
|
151
|
+
| Model load time (3B) | 1.1s | **0.6s** |
|
|
152
|
+
| Model load time (0.5B) | 0.7s | **0.3s** |
|
|
153
|
+
| Health check latency | <1ms | <1ms |
|
|
154
|
+
| Short response (8 tokens) | ~370ms | ~80ms |
|
|
155
|
+
| Medium response (128 tokens) | ~5.5s | **~1s** |
|
|
156
|
+
| Long response (512 tokens) | ~22s | **~4s** |
|
|
157
|
+
| **Throughput (3B Q4)** | **21-24 tok/s** | **97-128 tok/s** |
|
|
158
|
+
| VRAM used by models | N/A | ~2.4 GB |
|
|
159
|
+
|
|
160
|
+
---
|
|
161
|
+
|
|
162
|
+
## Model Output Samples
|
|
163
|
+
|
|
164
|
+
### Math
|
|
165
|
+
```
|
|
166
|
+
Q: What is 2+2?
|
|
167
|
+
A: 2 + 2 equals 4.
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
### Code
|
|
171
|
+
```
|
|
172
|
+
Q: What is a Python decorator? Explain briefly.
|
|
173
|
+
A: A Python decorator is a special function that adds functionality
|
|
174
|
+
to another function without modifying its code. [includes working
|
|
175
|
+
code example with @my_decorator syntax]
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
### SQL
|
|
179
|
+
```
|
|
180
|
+
Q: Write a SQL query to find the top 5 users by order count.
|
|
181
|
+
A: SELECT u.user_id, u.user_name, COUNT(o.order_id) AS order_count
|
|
182
|
+
FROM users u JOIN orders o ON u.user_id = o.user_id
|
|
183
|
+
GROUP BY u.user_id, u.user_name
|
|
184
|
+
ORDER BY order_count DESC LIMIT 5;
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
---
|
|
188
|
+
|
|
189
|
+
## Issues Found & Fixed
|
|
190
|
+
|
|
191
|
+
1. **Tensor rank error**: candle's quantized_qwen2 returns `(batch, vocab)` not `(batch, seq, vocab)`. Fixed with `squeeze(0)`.
|
|
192
|
+
|
|
193
|
+
2. **Chat template**: Model requires `<|im_start|>/<|im_end|>` format. Added `format_chat_prompt()`.
|
|
194
|
+
|
|
195
|
+
3. **Stop token bleed**: Model generated past answer. Fixed by collecting all stop token IDs.
|
|
196
|
+
|
|
197
|
+
4. **`gen` reserved keyword**: Rust 2024 reserves `gen`. Renamed variable.
|
|
198
|
+
|
|
199
|
+
5. **bench.rs broke after the `GenerationResult` change**: `response.len()` broke when the return type changed from `String` to `GenerationResult`. Fixed to use `response.completion_tokens`.
|
|
200
|
+
|
|
201
|
+
6. **KnowledgeGraph not Send+Sync**: `rusqlite::Connection` uses RefCell. Fixed with `std::sync::Mutex<Connection>`.
|
|
202
|
+
|
|
203
|
+
7. **CUDA "no cuda implementation for rms-norm"**: Only `candle-core` had the `cuda` feature. Fixed by enabling `cuda` on `candle-nn` and `candle-transformers` too. See [candle#1916](https://github.com/huggingface/candle/issues/1916).
|
|
204
|
+
|
|
205
|
+
8. **CUDA 12.6 → 12.8**: RTX 5090 (Blackwell) requires compute capability sm_120, which needs CUDA 12.8+. Upgraded from 12.6 to 12.8.
|
|
206
|
+
|
|
207
|
+
---
|
|
208
|
+
|
|
209
|
+
## Build Info
|
|
210
|
+
|
|
211
|
+
```
|
|
212
|
+
Rust edition: 2024
|
|
213
|
+
Cargo workspace: titan-synapse
|
|
214
|
+
Binary crate: synapse
|
|
215
|
+
Dependencies: 385 crates
|
|
216
|
+
Build time (release, CPU): ~36s (first), ~3s (incremental)
|
|
217
|
+
Build time (release, CUDA): ~23s (first), ~12s (incremental)
|
|
218
|
+
CUDA toolkit: 12.8 (Blackwell sm_120)
|
|
219
|
+
Target: x86_64-unknown-linux-gnu
|
|
220
|
+
```
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
port: 6900
|
|
2
|
+
|
|
3
|
+
coordinator_model: qwen3-0.6b
|
|
4
|
+
base_model: qwen3-3b
|
|
5
|
+
|
|
6
|
+
learning:
|
|
7
|
+
enabled: true
|
|
8
|
+
min_pairs_before_training: 10
|
|
9
|
+
sidecar_url: http://localhost:8090
|
|
10
|
+
eval_threshold: 3.0
|
|
11
|
+
|
|
12
|
+
specialists:
|
|
13
|
+
- name: general
|
|
14
|
+
capabilities: [general, chat, help]
|
|
15
|
+
system_prompt: "You are a helpful AI assistant powered by TITAN Synapse."
|
|
16
|
+
priority: 50
|
|
17
|
+
|
|
18
|
+
- name: python_expert
|
|
19
|
+
capabilities: [python, debugging, testing, refactoring, fastapi, django]
|
|
20
|
+
system_prompt: "You are an expert Python developer. Write clean, efficient, well-tested code."
|
|
21
|
+
priority: 60
|
|
22
|
+
|
|
23
|
+
- name: sql_expert
|
|
24
|
+
capabilities: [sql, database, query, postgres, mysql, sqlite]
|
|
25
|
+
system_prompt: "You are an expert database engineer. Write optimized SQL queries."
|
|
26
|
+
priority: 60
|
|
27
|
+
|
|
28
|
+
- name: devops_expert
|
|
29
|
+
capabilities: [docker, kubernetes, ci, cd, deploy, infrastructure, terraform]
|
|
30
|
+
system_prompt: "You are a DevOps expert. Focus on reliability, automation, and security."
|
|
31
|
+
priority: 55
|
|
32
|
+
|
|
33
|
+
- name: frontend_expert
|
|
34
|
+
capabilities: [react, javascript, typescript, css, html, ui, ux, nextjs, vue]
|
|
35
|
+
system_prompt: "You are a frontend expert. Build fast, accessible, beautiful interfaces."
|
|
36
|
+
priority: 55
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
[package]
|
|
2
|
+
name = "synapse"
|
|
3
|
+
version.workspace = true
|
|
4
|
+
edition.workspace = true
|
|
5
|
+
license.workspace = true
|
|
6
|
+
repository.workspace = true
|
|
7
|
+
description.workspace = true
|
|
8
|
+
default-run = "synapse"
|
|
9
|
+
|
|
10
|
+
[[bin]]
|
|
11
|
+
name = "synapse"
|
|
12
|
+
path = "src/main.rs"
|
|
13
|
+
|
|
14
|
+
[dependencies]
|
|
15
|
+
# HTTP / Async
|
|
16
|
+
axum = { version = "0.8", features = ["macros"] }
|
|
17
|
+
tokio = { version = "1", features = ["full"] }
|
|
18
|
+
tokio-stream = "0.1"
|
|
19
|
+
tower-http = { version = "0.6", features = ["cors", "trace"] }
|
|
20
|
+
|
|
21
|
+
# CLI
|
|
22
|
+
clap = { version = "4", features = ["derive"] }
|
|
23
|
+
|
|
24
|
+
# Serialization
|
|
25
|
+
serde = { version = "1", features = ["derive"] }
|
|
26
|
+
serde_json = "1"
|
|
27
|
+
serde_yaml = "0.9"
|
|
28
|
+
|
|
29
|
+
# Inference — we use candle for tensor ops (pure Rust, no C deps)
|
|
30
|
+
candle-core = "0.8"
|
|
31
|
+
candle-nn = "0.8"
|
|
32
|
+
candle-transformers = "0.8"
|
|
33
|
+
tokenizers = "0.21"
|
|
34
|
+
safetensors = "0.4"
|
|
35
|
+
half = "2"
|
|
36
|
+
memmap2 = "0.9"
|
|
37
|
+
|
|
38
|
+
# GPU monitoring
|
|
39
|
+
# nvml-wrapper = "0.10" # Linux only — currently commented out; intended to be gated behind the `nvidia` feature
|
|
40
|
+
|
|
41
|
+
# Storage
|
|
42
|
+
rusqlite = { version = "0.32", features = ["bundled"] }
|
|
43
|
+
|
|
44
|
+
# HTTP client
|
|
45
|
+
reqwest = { version = "0.12", features = ["json", "stream"] }
|
|
46
|
+
|
|
47
|
+
# Logging
|
|
48
|
+
tracing = "0.1"
|
|
49
|
+
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
|
50
|
+
|
|
51
|
+
# Utils
|
|
52
|
+
uuid = { version = "1", features = ["v4"] }
|
|
53
|
+
chrono = { version = "0.4", features = ["serde"] }
|
|
54
|
+
anyhow = "1"
|
|
55
|
+
thiserror = "2"
|
|
56
|
+
dirs = "6"
|
|
57
|
+
indicatif = "0.17"
|
|
58
|
+
colored = "3"
|
|
59
|
+
futures = "0.3"
|
|
60
|
+
async-stream = "0.3"
|
|
61
|
+
bytes = "1"
|
|
62
|
+
|
|
63
|
+
[features]
|
|
64
|
+
default = []
|
|
65
|
+
cuda = ["candle-core/cuda", "candle-nn/cuda", "candle-transformers/cuda"]
|
|
66
|
+
metal = ["candle-core/metal", "candle-nn/metal", "candle-transformers/metal"]
|
|
67
|
+
nvidia = [] # Enable NVML GPU monitoring
|
|
68
|
+
|
|
69
|
+
[dev-dependencies]
|
|
70
|
+
tempfile = "3"
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
use anyhow::Result;
|
|
2
|
+
use colored::Colorize;
|
|
3
|
+
use crate::config::SynapseConfig;
|
|
4
|
+
use crate::inference::InferenceEngine;
|
|
5
|
+
|
|
6
|
+
pub async fn run(config: &SynapseConfig, model: Option<&str>) -> Result<()> {
|
|
7
|
+
let model_name = model.unwrap_or(&config.base_model);
|
|
8
|
+
println!("{} {model_name}", "Benchmarking".bold().cyan());
|
|
9
|
+
println!("{}", "═".repeat(50));
|
|
10
|
+
|
|
11
|
+
let engine = InferenceEngine::new(config)?;
|
|
12
|
+
|
|
13
|
+
let prompts = [
|
|
14
|
+
"What is a Python decorator?",
|
|
15
|
+
"Write a SQL query to find the top 10 users by posts.",
|
|
16
|
+
"Explain the difference between TCP and UDP.",
|
|
17
|
+
"How does garbage collection work in Go?",
|
|
18
|
+
];
|
|
19
|
+
|
|
20
|
+
let mut total_tokens = 0u64;
|
|
21
|
+
let start = std::time::Instant::now();
|
|
22
|
+
|
|
23
|
+
for prompt in &prompts {
|
|
24
|
+
let prompt_start = std::time::Instant::now();
|
|
25
|
+
let response = engine.generate(prompt, None, 256, 0.7).await?;
|
|
26
|
+
let elapsed = prompt_start.elapsed();
|
|
27
|
+
|
|
28
|
+
let tokens = response.completion_tokens as u64;
|
|
29
|
+
total_tokens += tokens;
|
|
30
|
+
|
|
31
|
+
println!(" {} {} tokens in {:.0}ms ({:.1} tok/s)",
|
|
32
|
+
"•".green(), tokens, elapsed.as_millis(), response.tok_per_sec);
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
let total_elapsed = start.elapsed();
|
|
36
|
+
let avg_tok_s = total_tokens as f64 / total_elapsed.as_secs_f64();
|
|
37
|
+
|
|
38
|
+
println!("\n{}", "Results".bold().yellow());
|
|
39
|
+
println!(" {} {:.0} tok/s", "Average throughput:".bold(), avg_tok_s);
|
|
40
|
+
println!(" {} {:.0}ms", "Total time:".bold(), total_elapsed.as_millis());
|
|
41
|
+
println!(" {} {}", "Total tokens:".bold(), total_tokens);
|
|
42
|
+
|
|
43
|
+
Ok(())
|
|
44
|
+
}
|