artemisllmbench 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- artemisllmbench-0.1.0/LICENSE +21 -0
- artemisllmbench-0.1.0/PKG-INFO +145 -0
- artemisllmbench-0.1.0/PYPI_README.md +109 -0
- artemisllmbench-0.1.0/README.md +353 -0
- artemisllmbench-0.1.0/artemisllmbench.egg-info/PKG-INFO +145 -0
- artemisllmbench-0.1.0/artemisllmbench.egg-info/SOURCES.txt +58 -0
- artemisllmbench-0.1.0/artemisllmbench.egg-info/dependency_links.txt +1 -0
- artemisllmbench-0.1.0/artemisllmbench.egg-info/entry_points.txt +2 -0
- artemisllmbench-0.1.0/artemisllmbench.egg-info/requires.txt +13 -0
- artemisllmbench-0.1.0/artemisllmbench.egg-info/top_level.txt +2 -0
- artemisllmbench-0.1.0/benchmark.py +2269 -0
- artemisllmbench-0.1.0/pyproject.toml +61 -0
- artemisllmbench-0.1.0/setup.cfg +4 -0
- artemisllmbench-0.1.0/tests/test_candidate_export.py +84 -0
- artemisllmbench-0.1.0/tests/test_cli_integration.py +220 -0
- artemisllmbench-0.1.0/tests/test_new_metrics.py +312 -0
- artemisllmbench-0.1.0/tests/test_progress_writer.py +73 -0
- artemisllmbench-0.1.0/tests/test_semantic.py +89 -0
- artemisllmbench-0.1.0/tests/test_stability_analysis.py +175 -0
- artemisllmbench-0.1.0/tests/test_validity_integration.py +253 -0
- artemisllmbench-0.1.0/turing_bench/__init__.py +16 -0
- artemisllmbench-0.1.0/turing_bench/adapters/_default.yaml +75 -0
- artemisllmbench-0.1.0/turing_bench/adapters/beast3_gpu.yaml +36 -0
- artemisllmbench-0.1.0/turing_bench/cli.py +652 -0
- artemisllmbench-0.1.0/turing_bench/dashboard/__init__.py +0 -0
- artemisllmbench-0.1.0/turing_bench/dashboard/app.py +2799 -0
- artemisllmbench-0.1.0/turing_bench/dashboard/components/__init__.py +0 -0
- artemisllmbench-0.1.0/turing_bench/export/__init__.py +3 -0
- artemisllmbench-0.1.0/turing_bench/export/streamlit_config.py +248 -0
- artemisllmbench-0.1.0/turing_bench/report/__init__.py +1 -0
- artemisllmbench-0.1.0/turing_bench/report/baseline.py +287 -0
- artemisllmbench-0.1.0/turing_bench/report/formatter.py +82 -0
- artemisllmbench-0.1.0/turing_bench/runner/__init__.py +30 -0
- artemisllmbench-0.1.0/turing_bench/runner/concurrent.py +216 -0
- artemisllmbench-0.1.0/turing_bench/runner/conformance.py +110 -0
- artemisllmbench-0.1.0/turing_bench/runner/sequential.py +205 -0
- artemisllmbench-0.1.0/turing_bench/runner/sse_parser.py +248 -0
- artemisllmbench-0.1.0/turing_bench/runner/sweep.py +267 -0
- artemisllmbench-0.1.0/turing_bench/scenarios/control_prompt_v1.yaml +37 -0
- artemisllmbench-0.1.0/turing_bench/scenarios/decode_probe_v1.yaml +40 -0
- artemisllmbench-0.1.0/turing_bench/scenarios/large_prompt_v1.yaml +51 -0
- artemisllmbench-0.1.0/turing_bench/scenarios/long_context_v1.yaml +86 -0
- artemisllmbench-0.1.0/turing_bench/scenarios/mixed_realistic_v1.yaml +110 -0
- artemisllmbench-0.1.0/turing_bench/scenarios/prefill_probe_v1.yaml +57 -0
- artemisllmbench-0.1.0/turing_bench/scenarios/prefix_cache_probe_v1.yaml +61 -0
- artemisllmbench-0.1.0/turing_bench/scenarios/small_prompt_v1.yaml +36 -0
- artemisllmbench-0.1.0/turing_bench/stats/__init__.py +1 -0
- artemisllmbench-0.1.0/turing_bench/stats/cv.py +38 -0
- artemisllmbench-0.1.0/turing_bench/stats/distribution.py +167 -0
- artemisllmbench-0.1.0/turing_bench/stats/drift.py +56 -0
- artemisllmbench-0.1.0/turing_bench/stats/live_dashboard.py +249 -0
- artemisllmbench-0.1.0/turing_bench/stats/percentiles.py +49 -0
- artemisllmbench-0.1.0/turing_bench/stats/progress_writer.py +208 -0
- artemisllmbench-0.1.0/turing_bench/stats/spike.py +55 -0
- artemisllmbench-0.1.0/turing_bench/stats/visualize.py +167 -0
- artemisllmbench-0.1.0/turing_bench/validity/__init__.py +370 -0
- artemisllmbench-0.1.0/turing_bench/validity/exact_match.py +35 -0
- artemisllmbench-0.1.0/turing_bench/validity/sanity.py +42 -0
- artemisllmbench-0.1.0/turing_bench/validity/semantic.py +120 -0
- artemisllmbench-0.1.0/turing_bench/validity/structural.py +40 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 TurinTech AI
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: artemisllmbench
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: LLM benchmark that proves optimization gains — correctness validation + performance measurement for any OpenAI-compatible endpoint
|
|
5
|
+
Author-email: TurinTech AI <neda@turintech.ai>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://pypi.org/project/artemisllmbench
|
|
8
|
+
Project-URL: Documentation, https://pypi.org/project/artemisllmbench
|
|
9
|
+
Project-URL: Bug Tracker, https://turintech.ai
|
|
10
|
+
Keywords: llm,benchmark,vllm,inference,performance,validation
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
20
|
+
Classifier: Topic :: System :: Benchmark
|
|
21
|
+
Requires-Python: >=3.9
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
License-File: LICENSE
|
|
24
|
+
Requires-Dist: httpx>=0.28
|
|
25
|
+
Requires-Dist: click>=8.0
|
|
26
|
+
Requires-Dist: rich>=13.0
|
|
27
|
+
Requires-Dist: pyyaml>=6.0
|
|
28
|
+
Requires-Dist: numpy>=1.24
|
|
29
|
+
Provides-Extra: dashboard
|
|
30
|
+
Requires-Dist: streamlit>=1.30; extra == "dashboard"
|
|
31
|
+
Requires-Dist: plotly>=5.0; extra == "dashboard"
|
|
32
|
+
Provides-Extra: full
|
|
33
|
+
Requires-Dist: artemisllmbench[dashboard]; extra == "full"
|
|
34
|
+
Requires-Dist: sentence-transformers>=2.2; extra == "full"
|
|
35
|
+
Dynamic: license-file
|
|
36
|
+
|
|
37
|
+
# Artemis LLM Benchmark
|
|
38
|
+
|
|
39
|
+
A Python CLI for **correctness validation and performance benchmarking** of LLM serving endpoints. Works with any OpenAI-compatible server — vLLM, Ollama, llama.cpp, and more.
|
|
40
|
+
|
|
41
|
+
---
|
|
42
|
+
|
|
43
|
+
## Install
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
pip install artemisllmbench # core
|
|
47
|
+
pip install "artemisllmbench[dashboard]" # + Streamlit dashboard
|
|
48
|
+
pip install "artemisllmbench[full]" # + dashboard + semantic similarity
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
> `[full]` adds `sentence-transformers` for semantic similarity checks (Layer 3 validity).
|
|
52
|
+
|
|
53
|
+
---
|
|
54
|
+
|
|
55
|
+
## What It Does
|
|
56
|
+
|
|
57
|
+
Artemis answers two questions after you optimize an LLM endpoint:
|
|
58
|
+
- **Did the optimization preserve correctness?** — multi-layer validity checks on every response
|
|
59
|
+
- **What is the publishable performance number?** — reproducible latency, throughput, and goodput metrics
|
|
60
|
+
|
|
61
|
+
---
|
|
62
|
+
|
|
63
|
+
## Quick Start
|
|
64
|
+
|
|
65
|
+
**Validate a single endpoint** — correctness + performance in one command:
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
artemisllmbench validate \
|
|
69
|
+
--endpoint http://localhost:9000 \
|
|
70
|
+
--model Qwen/Qwen2.5-7B-Instruct \
|
|
71
|
+
--hardware a100
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
**Compare stock vs optimized** — sequential benchmarking with full GPU resources for each:
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
artemisllmbench compare \
|
|
78
|
+
--endpoint-a http://localhost:9000 \
|
|
79
|
+
--endpoint-b http://localhost:9001 \
|
|
80
|
+
--model Qwen/Qwen2.5-7B-Instruct \
|
|
81
|
+
--hardware a100
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
**Split-session compare** — when the stock endpoint is already torn down:
|
|
85
|
+
|
|
86
|
+
```bash
|
|
87
|
+
# Session 1: save the stock baseline
|
|
88
|
+
artemisllmbench baseline --endpoint http://localhost:9000 --model <model> --hardware a100
|
|
89
|
+
|
|
90
|
+
# Session 2: run the optimized candidate
|
|
91
|
+
artemisllmbench candidate --endpoint http://localhost:9000 --model <model> --hardware a100
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
The dashboard launches automatically after each run. Open it at `http://<your-ip>:8501`.
|
|
95
|
+
|
|
96
|
+
---
|
|
97
|
+
|
|
98
|
+
## Key Features
|
|
99
|
+
|
|
100
|
+
- **Multi-layer validity** — sanity, structural, semantic (embedding similarity ≥ 0.92), and exact-match checks catch regressions that latency numbers alone miss
|
|
101
|
+
- **Reproducible metrics** — TTFT, P95/P99 latency, ITL (inter-token latency), throughput, CV, drift, and spike detection
|
|
102
|
+
- **SLO / goodput tracking** — set `--slo-ttft` and `--slo-latency` thresholds; get the % of requests that met them
|
|
103
|
+
- **Streamlit dashboard** — live progress, side-by-side results, analytics charts, and a live response comparison panel
|
|
104
|
+
- **Fast mode** — `--fast` cuts runtime by ~75% for quick iteration checks
|
|
105
|
+
- **Cross-machine support** — endpoints can be on different hosts or different hardware
|
|
106
|
+
|
|
107
|
+
---
|
|
108
|
+
|
|
109
|
+
## Common Flags
|
|
110
|
+
|
|
111
|
+
| Flag | Description |
|
|
112
|
+
|------|-------------|
|
|
113
|
+
| `--fast` | Reduced runs (~75% faster). For quick checks only. |
|
|
114
|
+
| `--production` | Full 50-run sequential + concurrent load (default). |
|
|
115
|
+
| `--slo-ttft <ms>` | TTFT SLO threshold — enables goodput reporting. |
|
|
116
|
+
| `--slo-latency <ms>` | End-to-end latency SLO threshold. |
|
|
117
|
+
| `--plots` | ASCII charts inline in terminal output. |
|
|
118
|
+
| `--live` | Rich terminal live view during concurrent phases. |
|
|
119
|
+
| `--no-dashboard` | Skip auto-launching Streamlit. |
|
|
120
|
+
| `--port N` | Streamlit port (default: 8501). |
|
|
121
|
+
|
|
122
|
+
---
|
|
123
|
+
|
|
124
|
+
## Validity Layers
|
|
125
|
+
|
|
126
|
+
| Layer | Check | On failure |
|
|
127
|
+
|-------|-------|------------|
|
|
128
|
+
| 1 Sanity | Non-empty, complete sentence, token bounds | Hard fail |
|
|
129
|
+
| 2 Structural | JSON/Python syntax where required | Hard fail |
|
|
130
|
+
| 3 Semantic | Cosine similarity ≥ 0.92 vs. reference | Hard fail (< 0.85) / warning (0.85–0.92) |
|
|
131
|
+
| 4 Exact match | String equality (`control_prompt_v1` only) | Warning |
|
|
132
|
+
|
|
133
|
+
---
|
|
134
|
+
|
|
135
|
+
## Pre-flight Conformance Check
|
|
136
|
+
|
|
137
|
+
```bash
|
|
138
|
+
artemisllmbench check-conformance --endpoint http://localhost:9000
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
Verifies your endpoint speaks the required OpenAI-compatible SSE format before a full benchmark run.
|
|
142
|
+
|
|
143
|
+
---
|
|
144
|
+
|
|
145
|
+
**Full command reference:** run `artemisllmbench --help` or `artemisllmbench <command> --help`
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
# Artemis LLM Benchmark
|
|
2
|
+
|
|
3
|
+
A Python CLI for **correctness validation and performance benchmarking** of LLM serving endpoints. Works with any OpenAI-compatible server — vLLM, Ollama, llama.cpp, and more.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## Install
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pip install artemisllmbench # core
|
|
11
|
+
pip install "artemisllmbench[dashboard]" # + Streamlit dashboard
|
|
12
|
+
pip install "artemisllmbench[full]" # + dashboard + semantic similarity
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
> `[full]` adds `sentence-transformers` for semantic similarity checks (Layer 3 validity).
|
|
16
|
+
|
|
17
|
+
---
|
|
18
|
+
|
|
19
|
+
## What It Does
|
|
20
|
+
|
|
21
|
+
Artemis answers two questions after you optimize an LLM endpoint:
|
|
22
|
+
- **Did the optimization preserve correctness?** — multi-layer validity checks on every response
|
|
23
|
+
- **What is the publishable performance number?** — reproducible latency, throughput, and goodput metrics
|
|
24
|
+
|
|
25
|
+
---
|
|
26
|
+
|
|
27
|
+
## Quick Start
|
|
28
|
+
|
|
29
|
+
**Validate a single endpoint** — correctness + performance in one command:
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
artemisllmbench validate \
|
|
33
|
+
--endpoint http://localhost:9000 \
|
|
34
|
+
--model Qwen/Qwen2.5-7B-Instruct \
|
|
35
|
+
--hardware a100
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
**Compare stock vs optimized** — sequential benchmarking with full GPU resources for each:
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
artemisllmbench compare \
|
|
42
|
+
--endpoint-a http://localhost:9000 \
|
|
43
|
+
--endpoint-b http://localhost:9001 \
|
|
44
|
+
--model Qwen/Qwen2.5-7B-Instruct \
|
|
45
|
+
--hardware a100
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
**Split-session compare** — when the stock endpoint is already torn down:
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
# Session 1: save the stock baseline
|
|
52
|
+
artemisllmbench baseline --endpoint http://localhost:9000 --model <model> --hardware a100
|
|
53
|
+
|
|
54
|
+
# Session 2: run the optimized candidate
|
|
55
|
+
artemisllmbench candidate --endpoint http://localhost:9000 --model <model> --hardware a100
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
The dashboard launches automatically after each run. Open it at `http://<your-ip>:8501`.
|
|
59
|
+
|
|
60
|
+
---
|
|
61
|
+
|
|
62
|
+
## Key Features
|
|
63
|
+
|
|
64
|
+
- **Multi-layer validity** — sanity, structural, semantic (embedding similarity ≥ 0.92), and exact-match checks catch regressions that latency numbers alone miss
|
|
65
|
+
- **Reproducible metrics** — TTFT, P95/P99 latency, ITL (inter-token latency), throughput, CV, drift, and spike detection
|
|
66
|
+
- **SLO / goodput tracking** — set `--slo-ttft` and `--slo-latency` thresholds; get the % of requests that met them
|
|
67
|
+
- **Streamlit dashboard** — live progress, side-by-side results, analytics charts, and a live response comparison panel
|
|
68
|
+
- **Fast mode** — `--fast` cuts runtime by ~75% for quick iteration checks
|
|
69
|
+
- **Cross-machine support** — endpoints can be on different hosts or different hardware
|
|
70
|
+
|
|
71
|
+
---
|
|
72
|
+
|
|
73
|
+
## Common Flags
|
|
74
|
+
|
|
75
|
+
| Flag | Description |
|
|
76
|
+
|------|-------------|
|
|
77
|
+
| `--fast` | Reduced runs (~75% faster). For quick checks only. |
|
|
78
|
+
| `--production` | Full 50-run sequential + concurrent load (default). |
|
|
79
|
+
| `--slo-ttft <ms>` | TTFT SLO threshold — enables goodput reporting. |
|
|
80
|
+
| `--slo-latency <ms>` | End-to-end latency SLO threshold. |
|
|
81
|
+
| `--plots` | ASCII charts inline in terminal output. |
|
|
82
|
+
| `--live` | Rich terminal live view during concurrent phases. |
|
|
83
|
+
| `--no-dashboard` | Skip auto-launching Streamlit. |
|
|
84
|
+
| `--port N` | Streamlit port (default: 8501). |
|
|
85
|
+
|
|
86
|
+
---
|
|
87
|
+
|
|
88
|
+
## Validity Layers
|
|
89
|
+
|
|
90
|
+
| Layer | Check | On failure |
|
|
91
|
+
|-------|-------|------------|
|
|
92
|
+
| 1 Sanity | Non-empty, complete sentence, token bounds | Hard fail |
|
|
93
|
+
| 2 Structural | JSON/Python syntax where required | Hard fail |
|
|
94
|
+
| 3 Semantic | Cosine similarity ≥ 0.92 vs. reference | Hard fail (< 0.85) / warning (0.85–0.92) |
|
|
95
|
+
| 4 Exact match | String equality (`control_prompt_v1` only) | Warning |
|
|
96
|
+
|
|
97
|
+
---
|
|
98
|
+
|
|
99
|
+
## Pre-flight Conformance Check
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
artemisllmbench check-conformance --endpoint http://localhost:9000
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
Verifies your endpoint speaks the required OpenAI-compatible SSE format before a full benchmark run.
|
|
106
|
+
|
|
107
|
+
---
|
|
108
|
+
|
|
109
|
+
**Full command reference:** run `artemisllmbench --help` or `artemisllmbench <command> --help`
|
|
@@ -0,0 +1,353 @@
|
|
|
1
|
+
# Artemis LLM Benchmark
|
|
2
|
+
|
|
3
|
+
[![PyPI version](https://img.shields.io/pypi/v/artemisllmbench)](https://pypi.org/project/artemisllmbench/)
|
|
4
|
+
|
|
5
|
+
A standalone Python CLI for service-level correctness validation and performance benchmarking of LLM serving endpoints. Works with any OpenAI-compatible server (vLLM, Ollama, llama.cpp, etc.).
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## What This Benchmark Does
|
|
10
|
+
|
|
11
|
+
Artemis is the **final validation gate** — it answers two questions: did the optimization preserve correctness, and what is the publishable performance number?
|
|
12
|
+
|
|
13
|
+
**The correct workflow:**
|
|
14
|
+
|
|
15
|
+
```
|
|
16
|
+
Project benchmark (llama-bench / vLLM scripts / benchmark_app)
|
|
17
|
+
→ Guides optimization opportunities
|
|
18
|
+
|
|
19
|
+
[Apply optimizations]
|
|
20
|
+
|
|
21
|
+
Project benchmark again → Confirms internal improvement
|
|
22
|
+
|
|
23
|
+
Artemis LLM Benchmark ← this tool
|
|
24
|
+
→ Validates improvement holds at service level
|
|
25
|
+
→ Confirms correctness is preserved
|
|
26
|
+
→ Produces the publishable, comparable number
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
**Use cases:**
|
|
30
|
+
|
|
31
|
+
| Use case | Command |
|
|
32
|
+
|----------|---------|
|
|
33
|
+
| Validate a single optimized endpoint (correctness + performance) | `validate` |
|
|
34
|
+
| Compare Endpoint A vs Endpoint B side-by-side in one session | `compare` |
|
|
35
|
+
| Compare across sessions or machines (stock already torn down) | `baseline` + `candidate` |
|
|
36
|
+
| Export existing results to the dashboard without re-running | `export-dashboard` |
|
|
37
|
+
|
|
38
|
+
---
|
|
39
|
+
|
|
40
|
+
## Install
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
pip install artemisllmbench # core
|
|
44
|
+
pip install "artemisllmbench[dashboard]" # + Streamlit dashboard
|
|
45
|
+
pip install "artemisllmbench[full]" # + dashboard + semantic similarity
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
> `[full]` includes `sentence-transformers` for semantic similarity checks (Layer 3 validity). Requires **sentence-transformers ≥ 3.0** and a compatible `huggingface_hub`. If you see `cannot import name 'cached_download'`, upgrade: `pip install --upgrade sentence-transformers`.
|
|
49
|
+
|
|
50
|
+
**Development (from source):**
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
python -m venv .venv
|
|
54
|
+
.venv/bin/pip install -e ".[full]"
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
`artemisllmbench` is available after install. In development you can also invoke directly with `python benchmark.py`.
|
|
58
|
+
|
|
59
|
+
---
|
|
60
|
+
|
|
61
|
+
## Commands
|
|
62
|
+
|
|
63
|
+
### `validate` — Single endpoint
|
|
64
|
+
|
|
65
|
+
Validate correctness and measure performance for one endpoint.
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
artemisllmbench validate \
|
|
69
|
+
--endpoint http://localhost:9000 \
|
|
70
|
+
--model Qwen/Qwen2.5-7B-Instruct \
|
|
71
|
+
--hardware a100
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
The dashboard launches automatically. Open it at `http://<your-ip>:8501`.
|
|
75
|
+
|
|
76
|
+
---
|
|
77
|
+
|
|
78
|
+
### `compare` — Stock vs optimized
|
|
79
|
+
|
|
80
|
+
Benchmark both endpoints sequentially — each with full dedicated resources — then produce a side-by-side report.
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
artemisllmbench compare \
|
|
84
|
+
--endpoint-a http://localhost:9000 \
|
|
85
|
+
--endpoint-b http://localhost:9001 \
|
|
86
|
+
--model Qwen/Qwen2.5-7B-Instruct \
|
|
87
|
+
--hardware a100
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
**Why sequential, not simultaneous?** LLM inference is memory-bandwidth bound. Running two models simultaneously splits GPU VRAM and bandwidth, producing numbers that represent neither endpoint accurately. Sequential measurement with full resources for each gives reproducible, representative results.
|
|
91
|
+
|
|
92
|
+
**Flow:**
|
|
93
|
+
|
|
94
|
+
1. Full benchmark on Endpoint A (sequential validation → concurrent load)
|
|
95
|
+
2. CLI pauses — prepare Endpoint B (optionally tear down Endpoint A to free memory)
|
|
96
|
+
3. Full benchmark on Endpoint B
|
|
97
|
+
4. Side-by-side comparison report printed; dashboard updated automatically
|
|
98
|
+
|
|
99
|
+
**Cross-machine compare** — endpoints do not need to be on the same machine:
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
artemisllmbench compare \
|
|
103
|
+
--endpoint-a http://192.168.1.10:9000 \
|
|
104
|
+
--endpoint-b http://192.168.1.20:9000 \
|
|
105
|
+
--model Qwen/Qwen2.5-7B-Instruct \
|
|
106
|
+
--hardware a100 \
|
|
107
|
+
--hardware-b h100
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
`--hardware-b` labels the optimized side separately in the dashboard (e.g., `NVIDIA A100 80GB → NVIDIA H100 80GB`). If omitted, both sides use the same hardware label.
|
|
111
|
+
|
|
112
|
+
**Verifying endpoint reachability** before running:
|
|
113
|
+
|
|
114
|
+
```bash
|
|
115
|
+
curl -s --max-time 5 http://<ip>:<port>/v1/models
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
Expected: a JSON response containing the model name. If the request hangs, the port is blocked (check the firewall, e.g. `ufw allow <port>`). If you get "connection refused", the server (e.g. vLLM) is either not running or is bound to `127.0.0.1` instead of `0.0.0.0`.
|
|
119
|
+
|
|
120
|
+
---
|
|
121
|
+
|
|
122
|
+
### `baseline` + `candidate` — Split sessions
|
|
123
|
+
|
|
124
|
+
Use when Endpoint A is already torn down, or when the two endpoints run on different days or machines.
|
|
125
|
+
|
|
126
|
+
```bash
|
|
127
|
+
# Step 1: benchmark Endpoint A, save result
|
|
128
|
+
artemisllmbench baseline \
|
|
129
|
+
--endpoint http://localhost:9000 \
|
|
130
|
+
--model Qwen/Qwen2.5-7B-Instruct \
|
|
131
|
+
--hardware a100
|
|
132
|
+
|
|
133
|
+
# Step 2: benchmark Endpoint B — auto-loads saved baseline for comparison
|
|
134
|
+
artemisllmbench candidate \
|
|
135
|
+
--endpoint http://localhost:9000 \
|
|
136
|
+
--model Qwen/Qwen2.5-7B-Instruct \
|
|
137
|
+
--hardware a100
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
After `candidate` completes, the dashboard shows the same compare-mode layout as `compare` — side-by-side charts, delta table, and the live comparison panel.
|
|
141
|
+
|
|
142
|
+
For most cases, prefer `compare` — it runs both in one session and produces cleaner results.
|
|
143
|
+
|
|
144
|
+
---
|
|
145
|
+
|
|
146
|
+
### Common flags
|
|
147
|
+
|
|
148
|
+
| Flag | Description |
|
|
149
|
+
|------|-------------|
|
|
150
|
+
| `--fast` | Fast mode: 5 warmup + reduced runs. Cuts runtime by ~75%. Good for quick checks — not for production comparisons. |
|
|
151
|
+
| `--production` | Production mode: full warmup + 50 sequential runs. Default — explicit flag is optional. |
|
|
152
|
+
| `--slo-ttft <ms>` | SLO threshold for TTFT. Reports **goodput** — % of requests that satisfy all configured SLOs. |
|
|
153
|
+
| `--slo-latency <ms>` | SLO threshold for end-to-end latency. Used together with `--slo-ttft` or alone. |
|
|
154
|
+
| `--hardware-b <hw>` | Label Endpoint B hardware separately (`compare` only) |
|
|
155
|
+
| `--live` | Rich terminal dashboard during concurrent phases |
|
|
156
|
+
| `--plots` | ASCII time-series and histogram charts in terminal report |
|
|
157
|
+
| `--rps N` | Override target requests-per-second |
|
|
158
|
+
| `--requests N` | Override total concurrent request count |
|
|
159
|
+
| `--no-dashboard` | Skip auto-launching Streamlit |
|
|
160
|
+
| `--port N` | Streamlit port (default: 8501) |
|
|
161
|
+
|
|
162
|
+
**Goodput example** — answers "did the optimization improve SLO compliance?":
|
|
163
|
+
|
|
164
|
+
```bash
|
|
165
|
+
artemisllmbench compare \
|
|
166
|
+
--endpoint-a http://localhost:9000 \
|
|
167
|
+
--endpoint-b http://localhost:9001 \
|
|
168
|
+
--model Qwen/Qwen2.5-7B-Instruct \
|
|
169
|
+
--hardware a100 \
|
|
170
|
+
--slo-ttft 200 \
|
|
171
|
+
--slo-latency 500
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
Terminal output per scenario:
|
|
175
|
+
```
|
|
176
|
+
PASS: P95=142ms Goodput: 98.7%
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
The dashboard's Results tab shows a goodput compliance card with the Endpoint A and Endpoint B percentages and the percentage-point (pp) delta between them.
|
|
180
|
+
|
|
181
|
+
Before a run starts, Artemis sends one timing probe and prints an estimate:
|
|
182
|
+
|
|
183
|
+
```
|
|
184
|
+
Mode: PRODUCTION
|
|
185
|
+
Estimated runtime:
|
|
186
|
+
Sequential 7 scenarios ~95 min
|
|
187
|
+
Concurrent 6 scenarios ~18 min
|
|
188
|
+
Total ~113 min
|
|
189
|
+
|
|
190
|
+
Tip: add --fast to cut this to ~25 min
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
---
|
|
194
|
+
|
|
195
|
+
## Dashboard
|
|
196
|
+
|
|
197
|
+
Launches automatically with every benchmark command unless `--no-dashboard` is passed. Open at `http://<your-ip>:8501` (or the port set with `--port`).
|
|
198
|
+
|
|
199
|
+
### Tabs
|
|
200
|
+
|
|
201
|
+
**Live** — real-time progress during a run:
|
|
202
|
+
- Single-run mode: one progress panel showing phase, scenario, metrics as they arrive
|
|
203
|
+
- Compare mode: frozen Endpoint A panel (top) + live Endpoint B panel (bottom), both visible simultaneously. During the CLI pause between phases, a waiting placeholder appears automatically.
|
|
204
|
+
|
|
205
|
+
**Results** — after the run completes:
|
|
206
|
+
- `validate`: validity gate → output correctness → performance metrics table (TTFT, P95/P99 latency, CV, ITL mean/P95) → goodput card (if `--slo-ttft`/`--slo-latency` were set)
|
|
207
|
+
- `compare` / `candidate`: optimization impact banner → output correctness → side-by-side metrics table with deltas (includes ITL and goodput) → goodput compliance card → live side-by-side comparison panel
|
|
208
|
+
|
|
209
|
+
**Analytics** — detailed charts:
|
|
210
|
+
- TTFT P50/P95/P99 grouped bar charts (Endpoint A vs Endpoint B in compare mode)
|
|
211
|
+
- Queue pressure: sequential → concurrent P95 delta
|
|
212
|
+
- Latency stability (CV%) with 5% / 20% thresholds
|
|
213
|
+
- Isolation probes: prefill throughput, decode throughput, prefix cache detection
|
|
214
|
+
- RPS saturation curve
|
|
215
|
+
|
|
216
|
+
The Results and Analytics tabs are cleared during active runs and populate automatically once the benchmark and export complete — no manual refresh needed.
|
|
217
|
+
|
|
218
|
+
### Live side-by-side comparison panel
|
|
219
|
+
|
|
220
|
+
Available in `compare` and `candidate` modes under the Results tab. The panel only appears when there is something to show:
|
|
221
|
+
|
|
222
|
+
- **Both endpoints live** — responses stream directly from both models in real time
|
|
223
|
+
- **Recorded responses available** — replays the responses that were captured during the benchmark run, i.e. generated under the actual benchmark resources
|
|
224
|
+
- **Neither** — the panel is hidden entirely (this is the case in `validate` mode and when a `baseline`/`candidate` result is viewed without a live endpoint)
|
|
225
|
+
|
|
226
|
+
Select a preset prompt or type your own, then click **Run comparison** to see responses stream side by side with TTFT and tok/s metrics.
|
|
227
|
+
|
|
228
|
+
### Manual launch
|
|
229
|
+
|
|
230
|
+
```bash
|
|
231
|
+
artemisllmbench dashboard
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
### Export existing results
|
|
235
|
+
|
|
236
|
+
```bash
|
|
237
|
+
artemisllmbench export-dashboard \
|
|
238
|
+
--baseline baselines/<baseline-file>.json \
|
|
239
|
+
--candidate baselines/<candidate-file>.json
|
|
240
|
+
```
|
|
241
|
+
|
|
242
|
+
---
|
|
243
|
+
|
|
244
|
+
## How It Works
|
|
245
|
+
|
|
246
|
+
### Scenarios
|
|
247
|
+
|
|
248
|
+
All scenarios are frozen, versioned YAML files. A version bump is required to change a scenario.
|
|
249
|
+
|
|
250
|
+
| Scenario | Input | Output | Measures |
|
|
251
|
+
|----------|-------|--------|----------|
|
|
252
|
+
| `small_prompt_v1` | "What is the capital of Japan?" | ~20 tokens | TTFT, scheduling latency, serving overhead |
|
|
253
|
+
| `large_prompt_v1` | ~1500-token technical content | ~120 tokens | PagedAttention, KV-cache efficiency, prefill throughput |
|
|
254
|
+
| `long_context_v1` | ~6500-token context + factual question | ~60 tokens | Memory pressure, attention scaling |
|
|
255
|
+
| `control_prompt_v1` | "What is 144 divided by 12?" | "12" (exact) | Determinism, precision drift (fp32 → bf16) |
|
|
256
|
+
| `mixed_realistic_v1` | 26-prompt pool: code, reasoning, explanation, Q&A | 25–400 tokens | Realistic workload distribution, variable-length ITL |
|
|
257
|
+
|
|
258
|
+
**Isolation probes** (run but not shown as primary metrics):
|
|
259
|
+
|
|
260
|
+
| Probe | Measures |
|
|
261
|
+
|-------|---------|
|
|
262
|
+
| `prefill_probe_v1` | Cold vs. warm prefill throughput (tokens/s) — detects hugepage activation |
|
|
263
|
+
| `decode_probe_v1` | Token generation throughput (tokens/s) and scheduling overhead |
|
|
264
|
+
| `prefix_cache_probe_v1` | KV-cache prefix reuse — speedup ratio cold → warm |
|
|
265
|
+
|
|
266
|
+
### Execution order
|
|
267
|
+
|
|
268
|
+
For each scenario:
|
|
269
|
+
|
|
270
|
+
1. **Warmup** (discarded) — GPU cache init, CUDA graph compilation, JIT
|
|
271
|
+
2. **Sequential ×50** → validity gate + per-request latency distribution
|
|
272
|
+
3. **Concurrent at pinned RPS ×500** → throughput metrics (only if sequential passed)
|
|
273
|
+
4. **RPS sweep** (optional `--rps-sweep`) → saturation curve
|
|
274
|
+
|
|
275
|
+
### Validity layers
|
|
276
|
+
|
|
277
|
+
| Layer | Check | Failure mode |
|
|
278
|
+
|-------|-------|--------------|
|
|
279
|
+
| 1 Sanity | Non-empty, complete sentence, token bounds | Hard fail — stops benchmark |
|
|
280
|
+
| 2 Structural | JSON/Python syntax validity where required | Hard fail |
|
|
281
|
+
| 3 Semantic | Embedding cosine similarity ≥ 0.92 vs. reference outputs | Hard fail (<0.85), warning (0.85–0.92) |
|
|
282
|
+
| 4 Exact match | String equality after strip (`control_prompt_v1` only) | Warning only |
|
|
283
|
+
|
|
284
|
+
Layer 3 uses `sentence-transformers` and handles precision changes (fp32 → bf16) that produce semantically equivalent but not byte-identical outputs.
|
|
285
|
+
|
|
286
|
+
### Observability signals
|
|
287
|
+
|
|
288
|
+
| Signal | What it reveals |
|
|
289
|
+
|--------|----------------|
|
|
290
|
+
| **Drift** | Mean latency rising over sequential runs — memory pressure, thermal throttling, KV-cache eviction |
|
|
291
|
+
| **Spikes** | Isolated requests >2.5× median — GC pauses, OS scheduling jitter |
|
|
292
|
+
| **Fat tails** | P99 ≫ P95 — degraded long requests, attention scaling problems |
|
|
293
|
+
| **Bimodal** | Two latency clusters — cache-hit vs. cache-miss path divergence |
|
|
294
|
+
| **ITL (Inter-Token Latency)** | Mean / P95 / P99 time between consecutive tokens — exposes decode-stage jitter from speculative decoding, dynamic batching, or scheduler interference |
|
|
295
|
+
|
|
296
|
+
Use `--plots` for ASCII time-series and histograms inline in the terminal report.
|
|
297
|
+
Use `--live` for a Rich terminal dashboard during concurrent phases.
|
|
298
|
+
|
|
299
|
+
---
|
|
300
|
+
|
|
301
|
+
## Backend Conformance
|
|
302
|
+
|
|
303
|
+
The benchmark measures a deployed service. It requires an OpenAI-compatible `/v1/chat/completions` endpoint with SSE streaming.
|
|
304
|
+
|
|
305
|
+
**Pre-flight check:**
|
|
306
|
+
|
|
307
|
+
```bash
|
|
308
|
+
artemisllmbench check-conformance --endpoint http://localhost:9000
|
|
309
|
+
```
|
|
310
|
+
|
|
311
|
+
**Adapters** handle SSE format variation (one YAML per backend):
|
|
312
|
+
|
|
313
|
+
- vLLM: `choices[0].delta.content`
|
|
314
|
+
- llama.cpp: `choices[0].delta.content`
|
|
315
|
+
- `_default.yaml`: fallback for conformant endpoints
|
|
316
|
+
|
|
317
|
+
Run `artemisllmbench check-conformance --help` for all options.
|
|
318
|
+
|
|
319
|
+
---
|
|
320
|
+
|
|
321
|
+
## Development
|
|
322
|
+
|
|
323
|
+
**Run tests:**
|
|
324
|
+
|
|
325
|
+
```bash
|
|
326
|
+
pytest tests/
|
|
327
|
+
```
|
|
328
|
+
|
|
329
|
+
**Project structure:**
|
|
330
|
+
|
|
331
|
+
```
|
|
332
|
+
benchmark.py # CLI entry point (installed as `artemisllmbench`)
|
|
333
|
+
turing_bench/
|
|
334
|
+
├── scenarios/ # Frozen, versioned scenario YAMLs
|
|
335
|
+
├── adapters/ # SSE format configs per backend
|
|
336
|
+
├── runner/ # Sequential and concurrent execution engines
|
|
337
|
+
├── validity/ # Multi-layer correctness checks
|
|
338
|
+
├── stats/ # Percentiles, CV, drift, spikes, distribution
|
|
339
|
+
│ ├── live_dashboard.py # Rich terminal live view
|
|
340
|
+
│ └── progress_writer.py # Writes _progress.json for Streamlit polling
|
|
341
|
+
├── export/ # Converts result JSON → dashboard config JSON
|
|
342
|
+
├── dashboard/ # Streamlit dashboard (bundled in the package)
|
|
343
|
+
│ └── app.py
|
|
344
|
+
└── report/ # Baseline manager, report formatter
|
|
345
|
+
dashboard/
|
|
346
|
+
└── data/ # Dashboard config JSONs (auto-written after runs)
|
|
347
|
+
tests/
|
|
348
|
+
baselines/ # Saved baseline JSONs (created on first run)
|
|
349
|
+
```
|
|
350
|
+
|
|
351
|
+
---
|
|
352
|
+
|
|
353
|
+
**Questions?** Run `artemisllmbench` for the quick-start guide, or `artemisllmbench <command> --help` for full options.
|