clawperf 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- clawperf/__init__.py +8 -0
- clawperf/__main__.py +6 -0
- clawperf/cli.py +107 -0
- clawperf/config.py +107 -0
- clawperf/context.py +138 -0
- clawperf/mock_server.py +415 -0
- clawperf/runner.py +705 -0
- clawperf/scheduler.py +38 -0
- clawperf/system_metrics.py +167 -0
- clawperf/tokenizer.py +146 -0
- clawperf-0.1.0.dist-info/METADATA +281 -0
- clawperf-0.1.0.dist-info/RECORD +16 -0
- clawperf-0.1.0.dist-info/WHEEL +5 -0
- clawperf-0.1.0.dist-info/entry_points.txt +3 -0
- clawperf-0.1.0.dist-info/licenses/LICENSE +190 -0
- clawperf-0.1.0.dist-info/top_level.txt +1 -0
clawperf/__init__.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
"""ClawPerf - Performance benchmarking tool for LLM Serving backends.
|
|
2
|
+
|
|
3
|
+
Reuses EvalScope's perf infrastructure for HTTP, streaming, and timing,
|
|
4
|
+
and adds multi-turn long-context workloads with append-mode compaction,
|
|
5
|
+
user arrival scheduling, and system metrics polling.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
# Version of the clawperf package, exposed as ``clawperf.__version__``.
__version__ = "0.1.0"
|
clawperf/__main__.py
ADDED
clawperf/cli.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
"""CLI argument parser for ClawPerf."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import asyncio
|
|
7
|
+
|
|
8
|
+
from clawperf.config import BenchmarkConfig
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def build_parser() -> argparse.ArgumentParser:
    """Build the ``clawperf`` command-line parser.

    Every option carries a ``help`` string. argparse only renders the help
    column -- and therefore the ``(default: ...)`` suffix appended by
    ``ArgumentDefaultsHelpFormatter`` -- for options whose ``help`` is set,
    so options without one would show neither a description nor a default.

    Returns:
        The configured parser. ``--endpoint`` and ``--model`` are required;
        every other option has a default.
    """
    parser = argparse.ArgumentParser(
        prog="clawperf",
        description=(
            "ClawPerf - Performance testing tool for LLM Serving backends. "
            "Simulates multi-user, multi-turn, long-context workloads against "
            "vLLM, SGLang, and MindIE backends. "
            "Built on EvalScope's perf infrastructure."
        ),
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # ── User configuration ──
    g = parser.add_argument_group("User Configuration")
    g.add_argument("--num-users", type=int, default=1, help="Total concurrent users.")
    g.add_argument(
        "--user-arrival", type=str, default="burst",
        help="'burst', 'steady:<seconds>', or 'poisson:<lambda>'.",
    )

    # ── Context configuration ──
    g = parser.add_argument_group("Context Configuration")
    g.add_argument(
        "--system-prefix-tokens", type=int, default=15000,
        help="Number of tokens in the system prefix.",
    )
    g.add_argument(
        "--system-prefix-source", type=str, default="random",
        help="'random' or a file path.",
    )
    g.add_argument(
        "--user-prefix-tokens", type=int, default=5000,
        help="Number of tokens in the per-user prefix.",
    )
    g.add_argument(
        "--input-tokens-per-turn", type=int, default=5000,
        help="Number of input tokens per turn.",
    )
    g.add_argument(
        "--output-tokens-per-turn", type=int, default=1000,
        help="Number of output tokens requested per turn (sent as max_tokens).",
    )
    g.add_argument(
        "--max-context-tokens", type=int, default=128000,
        help="Context token limit; reaching it triggers compaction.",
    )
    g.add_argument(
        "--compaction-prefix-increment", type=int, default=5000,
        help="Tokens added to the user prefix at each compaction.",
    )

    # ── Run configuration ──
    g = parser.add_argument_group("Run Configuration")
    g.add_argument("--max-turns", type=int, default=100, help="Maximum number of turns.")

    # ── API configuration ──
    g = parser.add_argument_group("API Configuration")
    # Required options never fall back to a default, so SUPPRESS keeps the
    # formatter from printing a meaningless "(default: None)" next to them.
    g.add_argument(
        "--endpoint", type=str, required=True, default=argparse.SUPPRESS,
        help="Backend API endpoint URL.",
    )
    g.add_argument(
        "--model", type=str, required=True, default=argparse.SUPPRESS,
        help="Model name.",
    )
    g.add_argument(
        "--api-key", type=str, default="",
        help="API key, sent as a Bearer token in the Authorization header.",
    )
    g.add_argument(
        "--tokenizer", type=str, default="",
        help="Tokenizer name or path; falls back to --model when empty.",
    )
    g.add_argument(
        "--ignore-eos", action="store_true", default=True,
        help="Pass ignore_eos to the backend.",
    )
    # The default for ``ignore_eos`` is owned by --ignore-eos above; SUPPRESS
    # here only hides a confusing "(default: True)" on the negative flag.
    g.add_argument(
        "--no-ignore-eos", action="store_false", dest="ignore_eos",
        default=argparse.SUPPRESS,
        help="Do not pass ignore_eos to the backend.",
    )
    g.add_argument(
        "--request-timeout", type=int, default=600,
        help="Request timeout, forwarded to EvalScope as total_timeout.",
    )

    # ── System metrics ──
    g = parser.add_argument_group("System Metrics")
    g.add_argument(
        "--metrics-endpoint", type=str, default=None,
        help="Endpoint to poll for system metrics.",
    )
    g.add_argument(
        "--metrics-interval", type=int, default=5,
        help="System metrics polling interval.",
    )
    g.add_argument(
        "--backend", type=str, default="vllm", choices=["vllm", "sglang", "mindie"],
        help="Serving backend type.",
    )

    # ── Output ──
    g = parser.add_argument_group("Output")
    g.add_argument(
        "--output", type=str, default="results.json",
        help="Path of the results file.",
    )
    g.add_argument("-v", "--verbose", action="store_true", default=False,
                   help="Print per-turn progress lines (default: tqdm progress bar)")

    return parser
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def parse_args(argv: list[str] | None = None) -> BenchmarkConfig:
    """Parse command-line arguments into a ``BenchmarkConfig``.

    Args:
        argv: Argument list to parse; ``None`` is handed to argparse as-is,
            which then reads the process arguments.

    Returns:
        A ``BenchmarkConfig`` populated from the parsed options.
    """
    namespace = build_parser().parse_args(argv)

    # Each parsed option is forwarded to the config field of the same name.
    forwarded = (
        "num_users",
        "user_arrival",
        "system_prefix_tokens",
        "system_prefix_source",
        "user_prefix_tokens",
        "input_tokens_per_turn",
        "output_tokens_per_turn",
        "max_context_tokens",
        "compaction_prefix_increment",
        "max_turns",
        "endpoint",
        "model",
        "api_key",
        "tokenizer",
        "ignore_eos",
        "request_timeout",
        "metrics_endpoint",
        "metrics_interval",
        "backend",
        "output",
        "verbose",
    )
    settings = {name: getattr(namespace, name) for name in forwarded}
    return BenchmarkConfig(**settings)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def main():
    """Console entry point: parse arguments and run the benchmark.

    On ``KeyboardInterrupt`` a notice is printed and the runner's
    ``shutdown_and_save`` coroutine is run in a fresh ``asyncio.run`` call.
    """
    settings = parse_args()

    # The runner is imported only after the arguments have been parsed.
    from clawperf.runner import BenchmarkRunner

    bench = BenchmarkRunner(settings)
    try:
        asyncio.run(bench.run())
    except KeyboardInterrupt:
        print("\n[ClawPerf] Interrupted. Saving partial results...")
        asyncio.run(bench.shutdown_and_save())
|
clawperf/config.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
"""Configuration for ClawPerfBench.
|
|
2
|
+
|
|
3
|
+
Wraps EvalScope's Arguments where possible, extends with
|
|
4
|
+
ClawPerfBench-specific context/compaction/scheduling parameters.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import dataclasses
|
|
10
|
+
from typing import Optional
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclasses.dataclass
class BenchmarkConfig:
    """All configurable parameters for a benchmark run.

    ``arrival_mode`` and ``arrival_param`` are derived from ``user_arrival``
    in ``__post_init__``; any value passed for them is overwritten.

    Raises:
        ValueError: On construction, if ``user_arrival`` is not ``"burst"``,
            ``"steady:<interval>"`` or ``"poisson:<lambda>"`` with a valid
            numeric parameter.
    """

    # ── User configuration ──
    num_users: int = 1
    user_arrival: str = "burst"  # "burst", "steady:<interval>", "poisson:<lambda>"

    # ── Context configuration ──
    system_prefix_tokens: int = 15000
    system_prefix_source: str = "random"  # "random" or file path
    user_prefix_tokens: int = 5000
    input_tokens_per_turn: int = 5000
    output_tokens_per_turn: int = 1000
    max_context_tokens: int = 128000
    compaction_prefix_increment: int = 5000

    # ── Run configuration ──
    max_turns: int = 100

    # ── API configuration ──
    endpoint: str = ""
    model: str = ""
    api_key: str = ""
    tokenizer: str = ""  # defaults to model if empty
    ignore_eos: bool = True
    request_timeout: int = 600

    # ── System metrics configuration ──
    metrics_endpoint: Optional[str] = None
    metrics_interval: int = 5
    backend: str = "vllm"  # "vllm", "sglang", "mindie"

    # ── Output configuration ──
    output: str = "results.json"
    verbose: bool = False  # per-turn detailed logging

    # ── Derived fields ──
    arrival_mode: str = ""
    arrival_param: float = 0.0

    def __post_init__(self):
        """Fill in the tokenizer fallback and the derived arrival fields."""
        if not self.tokenizer:
            self.tokenizer = self.model
        self._parse_arrival_mode()

    def _parse_arrival_mode(self):
        """Split ``user_arrival`` into ``arrival_mode`` and ``arrival_param``.

        Raises:
            ValueError: If the spec has an unknown mode, a missing or
                non-numeric parameter, a negative or non-finite steady
                interval, or a Poisson rate that is not a positive finite
                number.
        """
        spec = self.user_arrival
        if spec == "burst":
            self.arrival_mode = "burst"
            self.arrival_param = 0.0
            return

        # partition keeps everything after the first ":" so that a spec such
        # as "steady:1:2" is rejected instead of silently read as "steady:1".
        mode, sep, raw_param = spec.partition(":")
        if not sep or mode not in ("steady", "poisson"):
            raise ValueError(
                f"Invalid user_arrival format: {spec!r}. "
                "Expected 'burst', 'steady:<interval>', or 'poisson:<lambda>'."
            )

        try:
            value = float(raw_param)
        except ValueError:
            raise ValueError(
                f"Invalid user_arrival format: {spec!r}. "
                f"The {mode} parameter must be a number, got {raw_param!r}."
            ) from None

        # Chained comparisons are False for NaN, so NaN is rejected as well.
        if mode == "steady" and not (0 <= value < float("inf")):
            raise ValueError(
                f"Invalid user_arrival format: {spec!r}. "
                "The steady interval must be a finite, non-negative number."
            )
        if mode == "poisson" and not (0 < value < float("inf")):
            raise ValueError(
                f"Invalid user_arrival format: {spec!r}. "
                "The poisson lambda must be a finite, positive number."
            )

        self.arrival_mode = mode
        self.arrival_param = value

    def to_evalscope_args(self):
        """Build an EvalScope Arguments object from this config.

        Reuses EvalScope's connection/timeout/stream/model settings.
        Note: EvalScope's number/parallel are lists after validation.

        Returns:
            An ``evalscope.perf.arguments.Arguments`` instance with streaming
            enabled, an ``Authorization`` header when ``api_key`` is set, and
            ``ignore_eos`` added to ``extra_args`` when enabled.
        """
        # Imported here so that evalscope is only needed when this is called.
        from evalscope.perf.arguments import Arguments

        args = Arguments(
            model=self.model,
            url=self.endpoint,
            tokenizer_path=self.tokenizer,
            stream=True,
            max_tokens=self.output_tokens_per_turn,
            number=[self.num_users],
            parallel=[self.num_users],
            total_timeout=self.request_timeout,
            api="openai",
            no_test_connection=True,
        )
        if self.api_key:
            args.headers["Authorization"] = f"Bearer {self.api_key}"

        # Handle ignore_eos via extra_args
        if self.ignore_eos:
            # Copy first so an existing extra_args mapping is not mutated.
            extra = dict(args.extra_args) if args.extra_args else {}
            extra["ignore_eos"] = True
            args.extra_args = extra

        return args

    def to_dict(self) -> dict:
        """Return every field, derived ones included, as a plain dict.

        NOTE(review): the result contains ``api_key`` verbatim -- confirm
        that callers do not persist it where the key should stay private.
        """
        return dataclasses.asdict(self)
|
clawperf/context.py
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
"""Context assembly and compaction logic.
|
|
2
|
+
|
|
3
|
+
Implements the per-turn context structure:
|
|
4
|
+
[System Prefix] [User Prefix] [History] [Current Input]
|
|
5
|
+
|
|
6
|
+
And the append-mode compaction mechanism.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import logging
|
|
12
|
+
from dataclasses import dataclass, field
|
|
13
|
+
from typing import List, Tuple
|
|
14
|
+
|
|
15
|
+
# Logger registered under the fixed name "clawperf" rather than ``__name__``.
logger = logging.getLogger("clawperf")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class CompactionEvent:
    """Record of a single compaction of one user's context."""

    # Id of the user whose context was compacted.
    user_id: int
    # Turn id at which the compaction happened.
    turn: int
    # Timestamp slot; UserContext.prepare_turn creates events with 0.0 here
    # (presumably overwritten by the caller afterwards -- TODO confirm).
    time: float
    # User-prefix token count before the compaction.
    old_prefix_len: int
    # User-prefix token count after the compaction.
    new_prefix_len: int
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass
class UserContext:
    """Evolving conversation state of one simulated user.

    A turn's context is laid out as
    ``[System Prefix] [User Prefix] [History] [Current Input]``.
    """

    user_id: int
    system_prefix: str
    user_prefix_tokens: int
    user_prefix_content: str
    input_tokens_per_turn: int
    max_context_tokens: int
    compaction_prefix_increment: int
    max_turns: int

    # Completed (user message, assistant reply) pairs, oldest first.
    history: List[Tuple[str, str]] = field(default_factory=list)
    # Every compaction performed for this user, in order.
    compaction_events: List[CompactionEvent] = field(default_factory=list)

    def prepare_turn(
        self,
        turn_id: int,
        current_input_content: str,
        tokenizer_manager,
    ) -> dict:
        """Assemble the messages for a turn, compacting when the limit is hit.

        Compaction clears the history, grows the user prefix by
        ``compaction_prefix_increment`` tokens and regenerates its content.
        It is skipped when the context without history already reaches the
        limit, since dropping history cannot help in that case.

        Args:
            turn_id: Identifier of the turn, used in events and log lines.
            current_input_content: Text of the new user input.
            tokenizer_manager: Object providing ``count_chat_tokens(messages)``
                and ``generate_random_content(num_tokens)``.

        Returns:
            Dict with keys ``messages``, ``context_tokens``,
            ``compaction_triggered``, ``compaction_event`` and
            ``context_overflow``.
        """
        limit = self.max_context_tokens
        event = None
        overflow = False

        msgs = self._build_messages(current_input_content)
        n_tokens = tokenizer_manager.count_chat_tokens(msgs)

        if n_tokens >= limit:
            # Measure system + prefix + input alone to see if compaction can help.
            bare_msgs = self._build_messages(current_input_content, skip_history=True)
            bare_tokens = tokenizer_manager.count_chat_tokens(bare_msgs)

            if bare_tokens < limit:
                previous_len = self.user_prefix_tokens
                self.history.clear()
                self.user_prefix_tokens = previous_len + self.compaction_prefix_increment
                self.user_prefix_content = tokenizer_manager.generate_random_content(
                    self.user_prefix_tokens
                )

                event = CompactionEvent(
                    user_id=self.user_id,
                    turn=turn_id,
                    time=0.0,
                    old_prefix_len=previous_len,
                    new_prefix_len=self.user_prefix_tokens,
                )
                self.compaction_events.append(event)
                logger.info(
                    "[User %02d] Compaction at turn %d: prefix %d → %d",
                    self.user_id, turn_id, previous_len, self.user_prefix_tokens,
                )

                # Rebuild with the emptied history and the larger prefix.
                msgs = self._build_messages(current_input_content)
                n_tokens = tokenizer_manager.count_chat_tokens(msgs)

                if n_tokens >= limit:
                    overflow = True
                    logger.warning(
                        "[User %02d] Context still exceeds limit after compaction: "
                        "%d >= %d tokens",
                        self.user_id, n_tokens, limit,
                    )
            else:
                overflow = True
                logger.warning(
                    "[User %02d] Base context (%d tokens) already exceeds limit (%d) "
                    "— compaction cannot help, skipping turn %d",
                    self.user_id, bare_tokens, limit, turn_id,
                )

        return {
            "messages": msgs,
            "context_tokens": n_tokens,
            "compaction_triggered": event is not None,
            "compaction_event": event,
            "context_overflow": overflow,
        }

    def _build_messages(self, current_input: str, skip_history: bool = False) -> list[dict]:
        """Build the chat message list for one request.

        The user prefix is merged, newline-separated, into the first user
        message: the oldest history entry when history is included,
        otherwise the current input.

        Args:
            current_input: Text of the new user input.
            skip_history: When true, leave out all history pairs.

        Returns:
            List of ``{"role": ..., "content": ...}`` dicts.
        """
        messages = []
        if self.system_prefix:
            messages.append({"role": "system", "content": self.system_prefix})

        # Prefix text still waiting to be attached to a user message.
        pending_prefix = self.user_prefix_content

        past_turns = () if skip_history else self.history
        for index, (user_msg, assistant_msg) in enumerate(past_turns):
            if index == 0 and pending_prefix:
                user_text = "\n".join((pending_prefix, user_msg))
                pending_prefix = None
            else:
                user_text = user_msg
            messages.append({"role": "user", "content": user_text})
            messages.append({"role": "assistant", "content": assistant_msg})

        if pending_prefix:
            final_text = "\n".join((pending_prefix, current_input))
        else:
            final_text = current_input
        messages.append({"role": "user", "content": final_text})

        return messages

    def append_history(self, user_message: str, assistant_reply: str):
        """Record a completed exchange at the end of the history."""
        self.history.append((user_message, assistant_reply))
|