artemisllmbench 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. artemisllmbench-0.1.0/LICENSE +21 -0
  2. artemisllmbench-0.1.0/PKG-INFO +145 -0
  3. artemisllmbench-0.1.0/PYPI_README.md +109 -0
  4. artemisllmbench-0.1.0/README.md +353 -0
  5. artemisllmbench-0.1.0/artemisllmbench.egg-info/PKG-INFO +145 -0
  6. artemisllmbench-0.1.0/artemisllmbench.egg-info/SOURCES.txt +58 -0
  7. artemisllmbench-0.1.0/artemisllmbench.egg-info/dependency_links.txt +1 -0
  8. artemisllmbench-0.1.0/artemisllmbench.egg-info/entry_points.txt +2 -0
  9. artemisllmbench-0.1.0/artemisllmbench.egg-info/requires.txt +13 -0
  10. artemisllmbench-0.1.0/artemisllmbench.egg-info/top_level.txt +2 -0
  11. artemisllmbench-0.1.0/benchmark.py +2269 -0
  12. artemisllmbench-0.1.0/pyproject.toml +61 -0
  13. artemisllmbench-0.1.0/setup.cfg +4 -0
  14. artemisllmbench-0.1.0/tests/test_candidate_export.py +84 -0
  15. artemisllmbench-0.1.0/tests/test_cli_integration.py +220 -0
  16. artemisllmbench-0.1.0/tests/test_new_metrics.py +312 -0
  17. artemisllmbench-0.1.0/tests/test_progress_writer.py +73 -0
  18. artemisllmbench-0.1.0/tests/test_semantic.py +89 -0
  19. artemisllmbench-0.1.0/tests/test_stability_analysis.py +175 -0
  20. artemisllmbench-0.1.0/tests/test_validity_integration.py +253 -0
  21. artemisllmbench-0.1.0/turing_bench/__init__.py +16 -0
  22. artemisllmbench-0.1.0/turing_bench/adapters/_default.yaml +75 -0
  23. artemisllmbench-0.1.0/turing_bench/adapters/beast3_gpu.yaml +36 -0
  24. artemisllmbench-0.1.0/turing_bench/cli.py +652 -0
  25. artemisllmbench-0.1.0/turing_bench/dashboard/__init__.py +0 -0
  26. artemisllmbench-0.1.0/turing_bench/dashboard/app.py +2799 -0
  27. artemisllmbench-0.1.0/turing_bench/dashboard/components/__init__.py +0 -0
  28. artemisllmbench-0.1.0/turing_bench/export/__init__.py +3 -0
  29. artemisllmbench-0.1.0/turing_bench/export/streamlit_config.py +248 -0
  30. artemisllmbench-0.1.0/turing_bench/report/__init__.py +1 -0
  31. artemisllmbench-0.1.0/turing_bench/report/baseline.py +287 -0
  32. artemisllmbench-0.1.0/turing_bench/report/formatter.py +82 -0
  33. artemisllmbench-0.1.0/turing_bench/runner/__init__.py +30 -0
  34. artemisllmbench-0.1.0/turing_bench/runner/concurrent.py +216 -0
  35. artemisllmbench-0.1.0/turing_bench/runner/conformance.py +110 -0
  36. artemisllmbench-0.1.0/turing_bench/runner/sequential.py +205 -0
  37. artemisllmbench-0.1.0/turing_bench/runner/sse_parser.py +248 -0
  38. artemisllmbench-0.1.0/turing_bench/runner/sweep.py +267 -0
  39. artemisllmbench-0.1.0/turing_bench/scenarios/control_prompt_v1.yaml +37 -0
  40. artemisllmbench-0.1.0/turing_bench/scenarios/decode_probe_v1.yaml +40 -0
  41. artemisllmbench-0.1.0/turing_bench/scenarios/large_prompt_v1.yaml +51 -0
  42. artemisllmbench-0.1.0/turing_bench/scenarios/long_context_v1.yaml +86 -0
  43. artemisllmbench-0.1.0/turing_bench/scenarios/mixed_realistic_v1.yaml +110 -0
  44. artemisllmbench-0.1.0/turing_bench/scenarios/prefill_probe_v1.yaml +57 -0
  45. artemisllmbench-0.1.0/turing_bench/scenarios/prefix_cache_probe_v1.yaml +61 -0
  46. artemisllmbench-0.1.0/turing_bench/scenarios/small_prompt_v1.yaml +36 -0
  47. artemisllmbench-0.1.0/turing_bench/stats/__init__.py +1 -0
  48. artemisllmbench-0.1.0/turing_bench/stats/cv.py +38 -0
  49. artemisllmbench-0.1.0/turing_bench/stats/distribution.py +167 -0
  50. artemisllmbench-0.1.0/turing_bench/stats/drift.py +56 -0
  51. artemisllmbench-0.1.0/turing_bench/stats/live_dashboard.py +249 -0
  52. artemisllmbench-0.1.0/turing_bench/stats/percentiles.py +49 -0
  53. artemisllmbench-0.1.0/turing_bench/stats/progress_writer.py +208 -0
  54. artemisllmbench-0.1.0/turing_bench/stats/spike.py +55 -0
  55. artemisllmbench-0.1.0/turing_bench/stats/visualize.py +167 -0
  56. artemisllmbench-0.1.0/turing_bench/validity/__init__.py +370 -0
  57. artemisllmbench-0.1.0/turing_bench/validity/exact_match.py +35 -0
  58. artemisllmbench-0.1.0/turing_bench/validity/sanity.py +42 -0
  59. artemisllmbench-0.1.0/turing_bench/validity/semantic.py +120 -0
  60. artemisllmbench-0.1.0/turing_bench/validity/structural.py +40 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 TurinTech AI
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,145 @@
1
+ Metadata-Version: 2.4
2
+ Name: artemisllmbench
3
+ Version: 0.1.0
4
+ Summary: LLM benchmark that proves optimization gains — correctness validation + performance measurement for any OpenAI-compatible endpoint
5
+ Author-email: TurinTech AI <neda@turintech.ai>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://pypi.org/project/artemisllmbench
8
+ Project-URL: Documentation, https://pypi.org/project/artemisllmbench
9
+ Project-URL: Bug Tracker, https://turintech.ai
10
+ Keywords: llm,benchmark,vllm,inference,performance,validation
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.9
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
+ Classifier: Topic :: System :: Benchmark
21
+ Requires-Python: >=3.9
22
+ Description-Content-Type: text/markdown
23
+ License-File: LICENSE
24
+ Requires-Dist: httpx>=0.28
25
+ Requires-Dist: click>=8.0
26
+ Requires-Dist: rich>=13.0
27
+ Requires-Dist: pyyaml>=6.0
28
+ Requires-Dist: numpy>=1.24
29
+ Provides-Extra: dashboard
30
+ Requires-Dist: streamlit>=1.30; extra == "dashboard"
31
+ Requires-Dist: plotly>=5.0; extra == "dashboard"
32
+ Provides-Extra: full
33
+ Requires-Dist: artemisllmbench[dashboard]; extra == "full"
34
+ Requires-Dist: sentence-transformers>=2.2; extra == "full"
35
+ Dynamic: license-file
36
+
37
+ # Artemis LLM Benchmark
38
+
39
+ A Python CLI for **correctness validation and performance benchmarking** of LLM serving endpoints. Works with any OpenAI-compatible server — vLLM, Ollama, llama.cpp, and more.
40
+
41
+ ---
42
+
43
+ ## Install
44
+
45
+ ```bash
46
+ pip install artemisllmbench # core
47
+ pip install "artemisllmbench[dashboard]" # + Streamlit dashboard
48
+ pip install "artemisllmbench[full]" # + dashboard + semantic similarity
49
+ ```
50
+
51
+ > `[full]` adds `sentence-transformers` for semantic similarity checks (Layer 3 validity).
52
+
53
+ ---
54
+
55
+ ## What It Does
56
+
57
+ Artemis answers two questions after you optimize an LLM endpoint:
58
+ - **Did the optimization preserve correctness?** — multi-layer validity checks on every response
59
+ - **What is the publishable performance number?** — reproducible latency, throughput, and goodput metrics
60
+
61
+ ---
62
+
63
+ ## Quick Start
64
+
65
+ **Validate a single endpoint** — correctness + performance in one command:
66
+
67
+ ```bash
68
+ artemisllmbench validate \
69
+ --endpoint http://localhost:9000 \
70
+ --model Qwen/Qwen2.5-7B-Instruct \
71
+ --hardware a100
72
+ ```
73
+
74
+ **Compare stock vs optimized** — sequential benchmarking with full GPU resources for each:
75
+
76
+ ```bash
77
+ artemisllmbench compare \
78
+ --endpoint-a http://localhost:9000 \
79
+ --endpoint-b http://localhost:9001 \
80
+ --model Qwen/Qwen2.5-7B-Instruct \
81
+ --hardware a100
82
+ ```
83
+
84
+ **Split-session compare** — when the stock endpoint is already torn down:
85
+
86
+ ```bash
87
+ # Session 1: save the stock baseline
88
+ artemisllmbench baseline --endpoint http://localhost:9000 --model <model> --hardware a100
89
+
90
+ # Session 2: run the optimized candidate
91
+ artemisllmbench candidate --endpoint http://localhost:9000 --model <model> --hardware a100
92
+ ```
93
+
94
+ The dashboard launches automatically after each run. Open it at `http://<your-ip>:8501`.
95
+
96
+ ---
97
+
98
+ ## Key Features
99
+
100
+ - **Multi-layer validity** — sanity, structural, semantic (embedding similarity ≥ 0.92), and exact-match checks catch regressions that latency numbers alone miss
101
+ - **Reproducible metrics** — TTFT, P95/P99 latency, ITL (inter-token latency), throughput, CV, drift, and spike detection
102
+ - **SLO / goodput tracking** — set `--slo-ttft` and `--slo-latency` thresholds; get the % of requests that met them
103
+ - **Streamlit dashboard** — live progress, side-by-side results, analytics charts, and a live response comparison panel
104
+ - **Fast mode** — `--fast` cuts runtime by ~75% for quick iteration checks
105
+ - **Cross-machine support** — endpoints can be on different hosts or different hardware
106
+
107
+ ---
108
+
109
+ ## Common Flags
110
+
111
+ | Flag | Description |
112
+ |------|-------------|
113
+ | `--fast` | Reduced runs (~75% faster). For quick checks only. |
114
+ | `--production` | Full 50-run sequential + concurrent load (default). |
115
+ | `--slo-ttft <ms>` | TTFT SLO threshold — enables goodput reporting. |
116
+ | `--slo-latency <ms>` | End-to-end latency SLO threshold. |
117
+ | `--plots` | ASCII charts inline in terminal output. |
118
+ | `--live` | Rich terminal live view during concurrent phases. |
119
+ | `--no-dashboard` | Skip auto-launching Streamlit. |
120
+ | `--port N` | Streamlit port (default: 8501). |
121
+
122
+ ---
123
+
124
+ ## Validity Layers
125
+
126
+ | Layer | Check | On failure |
127
+ |-------|-------|------------|
128
+ | 1 Sanity | Non-empty, complete sentence, token bounds | Hard fail |
129
+ | 2 Structural | JSON/Python syntax where required | Hard fail |
130
+ | 3 Semantic | Cosine similarity ≥ 0.92 vs. reference | Hard fail (< 0.85) / warning (0.85–0.92) |
131
+ | 4 Exact match | String equality (`control_prompt_v1` only) | Warning |
132
+
133
+ ---
134
+
135
+ ## Pre-flight Conformance Check
136
+
137
+ ```bash
138
+ artemisllmbench check-conformance --endpoint http://localhost:9000
139
+ ```
140
+
141
+ Verifies your endpoint speaks the required OpenAI-compatible SSE format before a full benchmark run.
142
+
143
+ ---
144
+
145
+ **Full documentation:** run `artemisllmbench --help` or `artemisllmbench <command> --help`.
@@ -0,0 +1,109 @@
1
+ # Artemis LLM Benchmark
2
+
3
+ A Python CLI for **correctness validation and performance benchmarking** of LLM serving endpoints. Works with any OpenAI-compatible server — vLLM, Ollama, llama.cpp, and more.
4
+
5
+ ---
6
+
7
+ ## Install
8
+
9
+ ```bash
10
+ pip install artemisllmbench # core
11
+ pip install "artemisllmbench[dashboard]" # + Streamlit dashboard
12
+ pip install "artemisllmbench[full]" # + dashboard + semantic similarity
13
+ ```
14
+
15
+ > `[full]` adds `sentence-transformers` for semantic similarity checks (Layer 3 validity).
16
+
17
+ ---
18
+
19
+ ## What It Does
20
+
21
+ Artemis answers two questions after you optimize an LLM endpoint:
22
+ - **Did the optimization preserve correctness?** — multi-layer validity checks on every response
23
+ - **What is the publishable performance number?** — reproducible latency, throughput, and goodput metrics
24
+
25
+ ---
26
+
27
+ ## Quick Start
28
+
29
+ **Validate a single endpoint** — correctness + performance in one command:
30
+
31
+ ```bash
32
+ artemisllmbench validate \
33
+ --endpoint http://localhost:9000 \
34
+ --model Qwen/Qwen2.5-7B-Instruct \
35
+ --hardware a100
36
+ ```
37
+
38
+ **Compare stock vs optimized** — sequential benchmarking with full GPU resources for each:
39
+
40
+ ```bash
41
+ artemisllmbench compare \
42
+ --endpoint-a http://localhost:9000 \
43
+ --endpoint-b http://localhost:9001 \
44
+ --model Qwen/Qwen2.5-7B-Instruct \
45
+ --hardware a100
46
+ ```
47
+
48
+ **Split-session compare** — when the stock endpoint is already torn down:
49
+
50
+ ```bash
51
+ # Session 1: save the stock baseline
52
+ artemisllmbench baseline --endpoint http://localhost:9000 --model <model> --hardware a100
53
+
54
+ # Session 2: run the optimized candidate
55
+ artemisllmbench candidate --endpoint http://localhost:9000 --model <model> --hardware a100
56
+ ```
57
+
58
+ The dashboard launches automatically after each run. Open it at `http://<your-ip>:8501`.
59
+
60
+ ---
61
+
62
+ ## Key Features
63
+
64
+ - **Multi-layer validity** — sanity, structural, semantic (embedding similarity ≥ 0.92), and exact-match checks catch regressions that latency numbers alone miss
65
+ - **Reproducible metrics** — TTFT, P95/P99 latency, ITL (inter-token latency), throughput, CV, drift, and spike detection
66
+ - **SLO / goodput tracking** — set `--slo-ttft` and `--slo-latency` thresholds; get the % of requests that met them
67
+ - **Streamlit dashboard** — live progress, side-by-side results, analytics charts, and a live response comparison panel
68
+ - **Fast mode** — `--fast` cuts runtime by ~75% for quick iteration checks
69
+ - **Cross-machine support** — endpoints can be on different hosts or different hardware
70
+
71
+ ---
72
+
73
+ ## Common Flags
74
+
75
+ | Flag | Description |
76
+ |------|-------------|
77
+ | `--fast` | Reduced runs (~75% faster). For quick checks only. |
78
+ | `--production` | Full 50-run sequential + concurrent load (default). |
79
+ | `--slo-ttft <ms>` | TTFT SLO threshold — enables goodput reporting. |
80
+ | `--slo-latency <ms>` | End-to-end latency SLO threshold. |
81
+ | `--plots` | ASCII charts inline in terminal output. |
82
+ | `--live` | Rich terminal live view during concurrent phases. |
83
+ | `--no-dashboard` | Skip auto-launching Streamlit. |
84
+ | `--port N` | Streamlit port (default: 8501). |
85
+
86
+ ---
87
+
88
+ ## Validity Layers
89
+
90
+ | Layer | Check | On failure |
91
+ |-------|-------|------------|
92
+ | 1 Sanity | Non-empty, complete sentence, token bounds | Hard fail |
93
+ | 2 Structural | JSON/Python syntax where required | Hard fail |
94
+ | 3 Semantic | Cosine similarity ≥ 0.92 vs. reference | Hard fail (< 0.85) / warning (0.85–0.92) |
95
+ | 4 Exact match | String equality (`control_prompt_v1` only) | Warning |
96
+
97
+ ---
98
+
99
+ ## Pre-flight Conformance Check
100
+
101
+ ```bash
102
+ artemisllmbench check-conformance --endpoint http://localhost:9000
103
+ ```
104
+
105
+ Verifies your endpoint speaks the required OpenAI-compatible SSE format before a full benchmark run.
106
+
107
+ ---
108
+
109
+ **Full documentation:** run `artemisllmbench --help` or `artemisllmbench <command> --help`.
@@ -0,0 +1,353 @@
1
+ # Artemis LLM Benchmark
2
+
3
+ [![PyPI](https://img.shields.io/pypi/v/artemisllmbench)](https://pypi.org/project/artemisllmbench/)
4
+
5
+ A standalone Python CLI for service-level correctness validation and performance benchmarking of LLM serving endpoints. Works with any OpenAI-compatible server (vLLM, Ollama, llama.cpp, etc.).
6
+
7
+ ---
8
+
9
+ ## What This Benchmark Does
10
+
11
+ Artemis is the **final validation gate** — it answers two questions: did the optimization preserve correctness, and what is the publishable performance number?
12
+
13
+ **The correct workflow:**
14
+
15
+ ```
16
+ Project benchmark (llama-bench / vLLM scripts / benchmark_app)
17
+ → Guides optimization opportunities
18
+
19
+ [Apply optimizations]
20
+
21
+ Project benchmark again → Confirms internal improvement
22
+
23
+ Artemis LLM Benchmark ← this tool
24
+ → Validates improvement holds at service level
25
+ → Confirms correctness is preserved
26
+ → Produces the publishable, comparable number
27
+ ```
28
+
29
+ **Use cases:**
30
+
31
+ | Use case | Command |
32
+ |----------|---------|
33
+ | Validate a single optimized endpoint (correctness + performance) | `validate` |
34
+ | Compare Endpoint A vs Endpoint B side-by-side in one session | `compare` |
35
+ | Compare across sessions or machines (stock already torn down) | `baseline` + `candidate` |
36
+ | Export existing results to the dashboard without re-running | `export-dashboard` |
37
+
38
+ ---
39
+
40
+ ## Install
41
+
42
+ ```bash
43
+ pip install artemisllmbench # core
44
+ pip install "artemisllmbench[dashboard]" # + Streamlit dashboard
45
+ pip install "artemisllmbench[full]" # + dashboard + semantic similarity
46
+ ```
47
+
48
+ > `[full]` includes `sentence-transformers` for semantic similarity checks (Layer 3 validity). The package declares `sentence-transformers>=2.2` as its minimum, but **sentence-transformers ≥ 3.0** with a compatible `huggingface_hub` is recommended. If you see `cannot import name 'cached_download'`, upgrade: `pip install --upgrade sentence-transformers`.
49
+
50
+ **Development (from source):**
51
+
52
+ ```bash
53
+ python -m venv .venv
54
+ .venv/bin/pip install -e ".[full]"
55
+ ```
56
+
57
+ `artemisllmbench` is available after install. In development you can also invoke directly with `python benchmark.py`.
58
+
59
+ ---
60
+
61
+ ## Commands
62
+
63
+ ### `validate` — Single endpoint
64
+
65
+ Validate correctness and measure performance for one endpoint.
66
+
67
+ ```bash
68
+ artemisllmbench validate \
69
+ --endpoint http://localhost:9000 \
70
+ --model Qwen/Qwen2.5-7B-Instruct \
71
+ --hardware a100
72
+ ```
73
+
74
+ The dashboard launches automatically. Open it at `http://<your-ip>:8501`.
75
+
76
+ ---
77
+
78
+ ### `compare` — Stock vs optimized
79
+
80
+ Benchmark both endpoints sequentially — each with full dedicated resources — then produce a side-by-side report.
81
+
82
+ ```bash
83
+ artemisllmbench compare \
84
+ --endpoint-a http://localhost:9000 \
85
+ --endpoint-b http://localhost:9001 \
86
+ --model Qwen/Qwen2.5-7B-Instruct \
87
+ --hardware a100
88
+ ```
89
+
90
+ **Why sequential, not simultaneous?** LLM inference is memory-bandwidth bound. Running two models simultaneously splits GPU VRAM and bandwidth, producing numbers that represent neither endpoint accurately. Sequential measurement with full resources for each gives reproducible, representative results.
91
+
92
+ **Flow:**
93
+
94
+ 1. Full benchmark on Endpoint A (sequential validation → concurrent load)
95
+ 2. CLI pauses — prepare Endpoint B (optionally tear down Endpoint A to free memory)
96
+ 3. Full benchmark on Endpoint B
97
+ 4. Side-by-side comparison report printed; dashboard updated automatically
98
+
99
+ **Cross-machine compare** — endpoints do not need to be on the same machine:
100
+
101
+ ```bash
102
+ artemisllmbench compare \
103
+ --endpoint-a http://192.168.1.10:9000 \
104
+ --endpoint-b http://192.168.1.20:9000 \
105
+ --model Qwen/Qwen2.5-7B-Instruct \
106
+ --hardware a100 \
107
+ --hardware-b h100
108
+ ```
109
+
110
+ `--hardware-b` labels the optimized side separately in the dashboard (e.g., `NVIDIA A100 80GB → NVIDIA H100 80GB`). If omitted, both sides use the same hardware label.
111
+
112
+ **Verifying endpoint reachability** before running:
113
+
114
+ ```bash
115
+ curl -s --max-time 5 http://<ip>:<port>/v1/models
116
+ ```
117
+
118
+ Expected: JSON containing the model name. If the request hangs → the port is blocked (check the firewall, e.g. `ufw allow <port>`). If you get "connection refused" → the server (e.g. vLLM) is not running, or it is bound to `127.0.0.1` instead of `0.0.0.0`.
119
+
120
+ ---
121
+
122
+ ### `baseline` + `candidate` — Split sessions
123
+
124
+ Use when Endpoint A is already torn down, or when the two endpoints run on different days or machines.
125
+
126
+ ```bash
127
+ # Step 1: benchmark Endpoint A, save result
128
+ artemisllmbench baseline \
129
+ --endpoint http://localhost:9000 \
130
+ --model Qwen/Qwen2.5-7B-Instruct \
131
+ --hardware a100
132
+
133
+ # Step 2: benchmark Endpoint B — auto-loads saved baseline for comparison
134
+ artemisllmbench candidate \
135
+ --endpoint http://localhost:9000 \
136
+ --model Qwen/Qwen2.5-7B-Instruct \
137
+ --hardware a100
138
+ ```
139
+
140
+ After `candidate` completes, the dashboard shows the same compare-mode layout as `compare` — side-by-side charts, delta table, and the live comparison panel.
141
+
142
+ For most cases, prefer `compare` — it runs both in one session and produces cleaner results.
143
+
144
+ ---
145
+
146
+ ### Common flags
147
+
148
+ | Flag | Description |
149
+ |------|-------------|
150
+ | `--fast` | Fast mode: 5 warmup + reduced runs. Cuts runtime by ~75%. Good for quick checks — not for production comparisons. |
151
+ | `--production` | Production mode: full warmup + 50 sequential runs. Default — explicit flag is optional. |
152
+ | `--slo-ttft <ms>` | SLO threshold for TTFT. Reports **goodput** — % of requests that satisfy all configured SLOs. |
153
+ | `--slo-latency <ms>` | SLO threshold for end-to-end latency. Can be used alone or together with `--slo-ttft`. |
154
+ | `--hardware-b <hw>` | Label Endpoint B hardware separately (`compare` only) |
155
+ | `--live` | Rich terminal dashboard during concurrent phases |
156
+ | `--plots` | ASCII time-series and histogram charts in terminal report |
157
+ | `--rps N` | Override target requests-per-second |
158
+ | `--requests N` | Override total concurrent request count |
159
+ | `--no-dashboard` | Skip auto-launching Streamlit |
160
+ | `--port N` | Streamlit port (default: 8501) |
161
+
162
+ **Goodput example** — answers "did the optimization improve SLO compliance?":
163
+
164
+ ```bash
165
+ artemisllmbench compare \
166
+ --endpoint-a http://localhost:9000 \
167
+ --endpoint-b http://localhost:9001 \
168
+ --model Qwen/Qwen2.5-7B-Instruct \
169
+ --hardware a100 \
170
+ --slo-ttft 200 \
171
+ --slo-latency 500
172
+ ```
173
+
174
+ Terminal output per scenario:
175
+ ```
176
+ PASS: P95=142ms Goodput: 98.7%
177
+ ```
178
+
179
+ The dashboard's Results tab shows a goodput compliance card with the Endpoint A and Endpoint B percentages and the percentage-point (pp) delta between them.
180
+
181
+ Before a run starts, Artemis sends one timing probe and prints an estimate:
182
+
183
+ ```
184
+ Mode: PRODUCTION
185
+ Estimated runtime:
186
+ Sequential 7 scenarios ~95 min
187
+ Concurrent 6 scenarios ~18 min
188
+ Total ~113 min
189
+
190
+ Tip: add --fast to cut this to ~25 min
191
+ ```
192
+
193
+ ---
194
+
195
+ ## Dashboard
196
+
197
+ The dashboard launches automatically with every benchmark command (unless `--no-dashboard` is passed). Open it at `http://<your-ip>:8501`.
198
+
199
+ ### Tabs
200
+
201
+ **Live** — real-time progress during a run:
202
+ - Single-run mode: one progress panel showing phase, scenario, metrics as they arrive
203
+ - Compare mode: frozen Endpoint A panel (top) + live Endpoint B panel (bottom), both visible simultaneously. During the CLI pause between phases, a waiting placeholder appears automatically.
204
+
205
+ **Results** — after the run completes:
206
+ - `validate`: validity gate → output correctness → performance metrics table (TTFT, P95/P99 latency, CV, ITL mean/P95) → goodput card (if `--slo-ttft`/`--slo-latency` were set)
207
+ - `compare` / `candidate`: optimization impact banner → output correctness → side-by-side metrics table with deltas (includes ITL and goodput) → goodput compliance card → live side-by-side comparison panel
208
+
209
+ **Analytics** — detailed charts:
210
+ - TTFT P50/P95/P99 grouped bar charts (Endpoint A vs Endpoint B in compare mode)
211
+ - Queue pressure: sequential → concurrent P95 delta
212
+ - Latency stability (CV%) with 5% / 20% thresholds
213
+ - Isolation probes: prefill throughput, decode throughput, prefix cache detection
214
+ - RPS saturation curve
215
+
216
+ The Results and Analytics tabs are cleared during active runs and populate automatically once the benchmark and export complete — no manual refresh needed.
217
+
218
+ ### Live side-by-side comparison panel
219
+
220
+ Available in `compare` and `candidate` modes under the Results tab. The panel only appears when there is something to show:
221
+
222
+ - **Both endpoints live** — responses stream directly from both models in real time
223
+ - **Recorded responses available** — replays the responses captured during the benchmark run, i.e. generated under the actual benchmark resources
224
+ - **Neither** — the panel is hidden entirely (this is the case in `validate` mode and when a `baseline`/`candidate` result is viewed without a live endpoint)
225
+
226
+ Select a preset prompt or type your own, then click **Run comparison** to see responses stream side by side with TTFT and tok/s metrics.
227
+
228
+ ### Manual launch
229
+
230
+ ```bash
231
+ artemisllmbench dashboard
232
+ ```
233
+
234
+ ### Export existing results
235
+
236
+ ```bash
237
+ artemisllmbench export-dashboard \
238
+ --baseline baselines/<baseline-file>.json \
239
+ --candidate baselines/<candidate-file>.json
240
+ ```
241
+
242
+ ---
243
+
244
+ ## How It Works
245
+
246
+ ### Scenarios
247
+
248
+ All scenarios are frozen, versioned YAML files. A version bump is required to change a scenario.
249
+
250
+ | Scenario | Input | Output | Measures |
251
+ |----------|-------|--------|----------|
252
+ | `small_prompt_v1` | "What is the capital of Japan?" | ~20 tokens | TTFT, scheduling latency, serving overhead |
253
+ | `large_prompt_v1` | ~1500-token technical content | ~120 tokens | PagedAttention, KV-cache efficiency, prefill throughput |
254
+ | `long_context_v1` | ~6500-token context + factual question | ~60 tokens | Memory pressure, attention scaling |
255
+ | `control_prompt_v1` | "What is 144 divided by 12?" | "12" (exact) | Determinism, precision drift (fp32 → bf16) |
256
+ | `mixed_realistic_v1` | 26-prompt pool: code, reasoning, explanation, Q&A | 25–400 tokens | Realistic workload distribution, variable-length ITL |
257
+
258
+ **Isolation probes** (run but not shown as primary metrics):
259
+
260
+ | Probe | Measures |
261
+ |-------|---------|
262
+ | `prefill_probe_v1` | Cold vs. warm prefill throughput (tokens/s) — detects hugepage activation |
263
+ | `decode_probe_v1` | Token generation throughput (tokens/s) and scheduling overhead |
264
+ | `prefix_cache_probe_v1` | KV-cache prefix reuse — speedup ratio cold → warm |
265
+
266
+ ### Execution order
267
+
268
+ For each scenario:
269
+
270
+ 1. **Warmup** (discarded) — GPU cache init, CUDA graph compilation, JIT
271
+ 2. **Sequential ×50** → validity gate + per-request latency distribution
272
+ 3. **Concurrent at pinned RPS ×500** → throughput metrics (only if sequential passed)
273
+ 4. **RPS sweep** (optional `--rps-sweep`) → saturation curve
274
+
275
+ ### Validity layers
276
+
277
+ | Layer | Check | Failure mode |
278
+ |-------|-------|--------------|
279
+ | 1 Sanity | Non-empty, complete sentence, token bounds | Hard fail — stops benchmark |
280
+ | 2 Structural | JSON/Python syntax validity where required | Hard fail |
281
+ | 3 Semantic | Embedding cosine similarity ≥ 0.92 vs. reference outputs | Hard fail (<0.85), warning (0.85–0.92) |
282
+ | 4 Exact match | String equality after strip (`control_prompt_v1` only) | Warning only |
283
+
284
+ Layer 3 uses `sentence-transformers` and handles precision changes (fp32 → bf16) that produce semantically equivalent but not byte-identical outputs.
285
+
286
+ ### Observability signals
287
+
288
+ | Signal | What it reveals |
289
+ |--------|----------------|
290
+ | **Drift** | Mean latency rising over sequential runs — memory pressure, thermal throttling, KV-cache eviction |
291
+ | **Spikes** | Isolated requests >2.5× median — GC pauses, OS scheduling jitter |
292
+ | **Fat tails** | P99 ≫ P95 — degraded long requests, attention scaling problems |
293
+ | **Bimodal** | Two latency clusters — cache-hit vs. cache-miss path divergence |
294
+ | **ITL (Inter-Token Latency)** | Mean / P95 / P99 time between consecutive tokens — exposes decode-stage jitter from speculative decoding, dynamic batching, or scheduler interference |
295
+
296
+ Use `--plots` for ASCII time-series and histograms inline in the terminal report.
297
+ Use `--live` for a Rich terminal dashboard during concurrent phases.
298
+
299
+ ---
300
+
301
+ ## Backend Conformance
302
+
303
+ The benchmark measures a deployed service. It requires an OpenAI-compatible `/v1/chat/completions` endpoint with SSE streaming.
304
+
305
+ **Pre-flight check:**
306
+
307
+ ```bash
308
+ artemisllmbench check-conformance --endpoint http://localhost:9000
309
+ ```
310
+
311
+ **Adapters** handle SSE format variation (one YAML per backend):
312
+
313
+ - vLLM: `choices[0].delta.content`
314
+ - llama.cpp: `choices[0].delta.content`
315
+ - `_default.yaml`: fallback for conformant endpoints
316
+
317
+ Run `artemisllmbench check-conformance --help` for all options.
318
+
319
+ ---
320
+
321
+ ## Development
322
+
323
+ **Run tests:**
324
+
325
+ ```bash
326
+ pytest tests/
327
+ ```
328
+
329
+ **Project structure:**
330
+
331
+ ```
332
+ benchmark.py # CLI entry point (installed as `artemisllmbench`)
333
+ turing_bench/
334
+ ├── scenarios/ # Frozen, versioned scenario YAMLs
335
+ ├── adapters/ # SSE format configs per backend
336
+ ├── runner/ # Sequential and concurrent execution engines
337
+ ├── validity/ # Multi-layer correctness checks
338
+ ├── stats/ # Percentiles, CV, drift, spikes, distribution
339
+ │ ├── live_dashboard.py # Rich terminal live view
340
+ │ └── progress_writer.py # Writes _progress.json for Streamlit polling
341
+ ├── export/ # Converts result JSON → dashboard config JSON
342
+ ├── dashboard/ # Streamlit dashboard (bundled in the package)
343
+ │ └── app.py
344
+ └── report/ # Baseline manager, report formatter
345
+ dashboard/
346
+ └── data/ # Dashboard config JSONs (auto-written after runs)
347
+ tests/
348
+ baselines/ # Saved baseline JSONs (created on first run)
349
+ ```
350
+
351
+ ---
352
+
353
+ **Questions?** Run `artemisllmbench` for the quick-start guide, or `artemisllmbench <command> --help` for full options.