xpyd-sim 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpyd_sim-0.1.0/PKG-INFO +55 -0
- xpyd_sim-0.1.0/README.md +35 -0
- xpyd_sim-0.1.0/pyproject.toml +49 -0
- xpyd_sim-0.1.0/setup.cfg +4 -0
- xpyd_sim-0.1.0/src/xpyd_sim/__init__.py +3 -0
- xpyd_sim-0.1.0/src/xpyd_sim/calibrate.py +180 -0
- xpyd_sim-0.1.0/src/xpyd_sim/cli.py +248 -0
- xpyd_sim-0.1.0/src/xpyd_sim/common/__init__.py +1 -0
- xpyd_sim-0.1.0/src/xpyd_sim/common/helpers.py +47 -0
- xpyd_sim-0.1.0/src/xpyd_sim/common/logprobs.py +83 -0
- xpyd_sim-0.1.0/src/xpyd_sim/common/models.py +153 -0
- xpyd_sim-0.1.0/src/xpyd_sim/decode/__init__.py +1 -0
- xpyd_sim-0.1.0/src/xpyd_sim/observability.py +123 -0
- xpyd_sim-0.1.0/src/xpyd_sim/prefill/__init__.py +5 -0
- xpyd_sim-0.1.0/src/xpyd_sim/prefill/app.py +300 -0
- xpyd_sim-0.1.0/src/xpyd_sim/profile.py +61 -0
- xpyd_sim-0.1.0/src/xpyd_sim/scheduler.py +353 -0
- xpyd_sim-0.1.0/src/xpyd_sim/server.py +945 -0
- xpyd_sim-0.1.0/src/xpyd_sim.egg-info/PKG-INFO +55 -0
- xpyd_sim-0.1.0/src/xpyd_sim.egg-info/SOURCES.txt +32 -0
- xpyd_sim-0.1.0/src/xpyd_sim.egg-info/dependency_links.txt +1 -0
- xpyd_sim-0.1.0/src/xpyd_sim.egg-info/entry_points.txt +2 -0
- xpyd_sim-0.1.0/src/xpyd_sim.egg-info/requires.txt +14 -0
- xpyd_sim-0.1.0/src/xpyd_sim.egg-info/top_level.txt +1 -0
- xpyd_sim-0.1.0/tests/test_helpers.py +54 -0
- xpyd_sim-0.1.0/tests/test_m3_params.py +256 -0
- xpyd_sim-0.1.0/tests/test_m4_config.py +268 -0
- xpyd_sim-0.1.0/tests/test_m5_calibrate.py +223 -0
- xpyd_sim-0.1.0/tests/test_m6_observability.py +282 -0
- xpyd_sim-0.1.0/tests/test_m7_e2e_concurrency.py +443 -0
- xpyd_sim-0.1.0/tests/test_m8_backward_compat.py +560 -0
- xpyd_sim-0.1.0/tests/test_models.py +105 -0
- xpyd_sim-0.1.0/tests/test_prefill.py +187 -0
- xpyd_sim-0.1.0/tests/test_server.py +586 -0
xpyd_sim-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: xpyd-sim
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: OpenAI-compatible LLM inference simulator for xPyD
|
|
5
|
+
Requires-Python: >=3.10
|
|
6
|
+
Description-Content-Type: text/markdown
|
|
7
|
+
Requires-Dist: fastapi>=0.111.0
|
|
8
|
+
Requires-Dist: uvicorn>=0.30.0
|
|
9
|
+
Requires-Dist: pydantic>=2.0.0
|
|
10
|
+
Requires-Dist: httpx>=0.27.0
|
|
11
|
+
Requires-Dist: pyyaml>=6.0
|
|
12
|
+
Requires-Dist: numpy>=1.26.0
|
|
13
|
+
Requires-Dist: scipy>=1.12.0
|
|
14
|
+
Requires-Dist: matplotlib>=3.8.0
|
|
15
|
+
Provides-Extra: dev
|
|
16
|
+
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
|
17
|
+
Requires-Dist: pytest-asyncio>=0.23.0; extra == "dev"
|
|
18
|
+
Requires-Dist: ruff>=0.3.0; extra == "dev"
|
|
19
|
+
Requires-Dist: isort>=5.13.0; extra == "dev"
|
|
20
|
+
|
|
21
|
+
# xPyD-sim
|
|
22
|
+
|
|
23
|
+
OpenAI-compatible LLM inference simulator for [xPyD](https://github.com/xPyD-hub).
|
|
24
|
+
|
|
25
|
+
Simulates prefill and decode nodes with realistic behavior for testing xPyD-bench and xPyD-proxy without real GPU hardware.
|
|
26
|
+
|
|
27
|
+
## Features
|
|
28
|
+
|
|
29
|
+
- Separate prefill and decode node simulators
|
|
30
|
+
- Full OpenAI API compatibility (/v1/completions, /v1/chat/completions, /v1/models)
|
|
31
|
+
- Configurable latency (prefill delay, decode delay per token)
|
|
32
|
+
- Streaming support with realistic token-by-token delivery
|
|
33
|
+
- EOS simulation with configurable output length distribution
|
|
34
|
+
- All OpenAI API parameters accepted
|
|
35
|
+
- Spec-compliant response formats
|
|
36
|
+
|
|
37
|
+
## Install
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
pip install xpyd-sim
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## Quick Start
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
# Start prefill node
|
|
47
|
+
xpyd-sim prefill --port 8001
|
|
48
|
+
|
|
49
|
+
# Start decode node
|
|
50
|
+
xpyd-sim decode --port 8002
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## License
|
|
54
|
+
|
|
55
|
+
TBD
|
xpyd_sim-0.1.0/README.md
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# xPyD-sim
|
|
2
|
+
|
|
3
|
+
OpenAI-compatible LLM inference simulator for [xPyD](https://github.com/xPyD-hub).
|
|
4
|
+
|
|
5
|
+
Simulates prefill and decode nodes with realistic behavior for testing xPyD-bench and xPyD-proxy without real GPU hardware.
|
|
6
|
+
|
|
7
|
+
## Features
|
|
8
|
+
|
|
9
|
+
- Separate prefill and decode node simulators
|
|
10
|
+
- Full OpenAI API compatibility (/v1/completions, /v1/chat/completions, /v1/models)
|
|
11
|
+
- Configurable latency (prefill delay, decode delay per token)
|
|
12
|
+
- Streaming support with realistic token-by-token delivery
|
|
13
|
+
- EOS simulation with configurable output length distribution
|
|
14
|
+
- All OpenAI API parameters accepted
|
|
15
|
+
- Spec-compliant response formats
|
|
16
|
+
|
|
17
|
+
## Install
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
pip install xpyd-sim
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Quick Start
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
# Start prefill node
|
|
27
|
+
xpyd-sim prefill --port 8001
|
|
28
|
+
|
|
29
|
+
# Start decode node
|
|
30
|
+
xpyd-sim decode --port 8002
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## License
|
|
34
|
+
|
|
35
|
+
TBD
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "xpyd-sim"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "OpenAI-compatible LLM inference simulator for xPyD"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
dependencies = [
|
|
12
|
+
"fastapi>=0.111.0",
|
|
13
|
+
"uvicorn>=0.30.0",
|
|
14
|
+
"pydantic>=2.0.0",
|
|
15
|
+
"httpx>=0.27.0",
|
|
16
|
+
"pyyaml>=6.0",
|
|
17
|
+
"numpy>=1.26.0",
|
|
18
|
+
"scipy>=1.12.0",
|
|
19
|
+
"matplotlib>=3.8.0",
|
|
20
|
+
]
|
|
21
|
+
|
|
22
|
+
[project.optional-dependencies]
|
|
23
|
+
dev = [
|
|
24
|
+
"pytest>=8.0.0",
|
|
25
|
+
"pytest-asyncio>=0.23.0",
|
|
26
|
+
"ruff>=0.3.0",
|
|
27
|
+
"isort>=5.13.0",
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
[project.scripts]
|
|
31
|
+
xpyd-sim = "xpyd_sim.cli:main"
|
|
32
|
+
|
|
33
|
+
[tool.setuptools.packages.find]
|
|
34
|
+
where = ["src"]
|
|
35
|
+
|
|
36
|
+
[tool.ruff]
|
|
37
|
+
line-length = 100
|
|
38
|
+
target-version = "py310"
|
|
39
|
+
|
|
40
|
+
[tool.ruff.lint]
|
|
41
|
+
select = ["E", "F", "W", "I"]
|
|
42
|
+
|
|
43
|
+
[tool.isort]
|
|
44
|
+
profile = "black"
|
|
45
|
+
line_length = 100
|
|
46
|
+
|
|
47
|
+
[tool.pytest.ini_options]
|
|
48
|
+
testpaths = ["tests"]
|
|
49
|
+
asyncio_mode = "auto"
|
xpyd_sim-0.1.0/setup.cfg
ADDED
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
"""Calibrate tool: fit latency curves from sample points."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import sys
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
import yaml
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _fit_1d(points: list[dict[str, float]], x_key: str, y_key: str) -> dict[str, Any]:
|
|
13
|
+
"""Fit a 1D polynomial (degree 2) to sample points.
|
|
14
|
+
|
|
15
|
+
Returns coefficients for: y = a*x^2 + b*x + c
|
|
16
|
+
"""
|
|
17
|
+
xs = np.array([p[x_key] for p in points], dtype=np.float64)
|
|
18
|
+
ys = np.array([p[y_key] for p in points], dtype=np.float64)
|
|
19
|
+
degree = min(2, len(xs) - 1)
|
|
20
|
+
coeffs = np.polyfit(xs, ys, degree)
|
|
21
|
+
# Pad to always have 3 coefficients (a, b, c)
|
|
22
|
+
padded = [0.0] * (3 - len(coeffs)) + list(coeffs)
|
|
23
|
+
return {
|
|
24
|
+
"type": "poly1d",
|
|
25
|
+
"coefficients": [float(c) for c in padded],
|
|
26
|
+
"x_key": x_key,
|
|
27
|
+
"y_key": y_key,
|
|
28
|
+
"x_range": [float(xs.min()), float(xs.max())],
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _fit_2d(points: list[dict[str, float]]) -> dict[str, Any]:
|
|
33
|
+
"""Fit a 2D surface: delay = a + b*bs + c*ctx + d*bs*ctx + e*bs^2 + f*ctx^2.
|
|
34
|
+
|
|
35
|
+
Uses least-squares on polynomial features.
|
|
36
|
+
"""
|
|
37
|
+
bs = np.array([p["batch_size"] for p in points], dtype=np.float64)
|
|
38
|
+
ctx = np.array([p["context_length"] for p in points], dtype=np.float64)
|
|
39
|
+
y = np.array([p["delay_per_token_ms"] for p in points], dtype=np.float64)
|
|
40
|
+
|
|
41
|
+
# Build design matrix: [1, bs, ctx, bs*ctx, bs^2, ctx^2]
|
|
42
|
+
X = np.column_stack([ # noqa: N806
|
|
43
|
+
np.ones_like(bs),
|
|
44
|
+
bs,
|
|
45
|
+
ctx,
|
|
46
|
+
bs * ctx,
|
|
47
|
+
bs**2,
|
|
48
|
+
ctx**2,
|
|
49
|
+
])
|
|
50
|
+
|
|
51
|
+
# Least squares fit
|
|
52
|
+
coeffs, _, _, _ = np.linalg.lstsq(X, y, rcond=None)
|
|
53
|
+
|
|
54
|
+
return {
|
|
55
|
+
"type": "poly2d",
|
|
56
|
+
"coefficients": [float(c) for c in coeffs],
|
|
57
|
+
"bs_range": [float(bs.min()), float(bs.max())],
|
|
58
|
+
"ctx_range": [float(ctx.min()), float(ctx.max())],
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _validate_points(points: list[dict], min_count: int, label: str) -> None:
|
|
63
|
+
"""Validate minimum number of sample points."""
|
|
64
|
+
if len(points) < min_count:
|
|
65
|
+
print(
|
|
66
|
+
f"Error: {label} requires at least {min_count} sample points, got {len(points)}",
|
|
67
|
+
file=sys.stderr,
|
|
68
|
+
)
|
|
69
|
+
sys.exit(1)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def calibrate(input_path: str, output_path: str, plot_path: str | None = None) -> None:
    """Run calibration: load samples, fit curves, write profile + optional plot.

    Args:
        input_path: YAML file with optional "prefill", "kv_transfer" and
            "decode" sections, each a list of sample points.
        output_path: destination path for the fitted profile YAML.
        plot_path: optional destination path for a visualization PNG.
    """
    with open(input_path) as f:
        # yaml.safe_load returns None for an empty/blank file; normalize to {}
        # so the membership checks below don't raise TypeError.
        data = yaml.safe_load(f) or {}

    profile: dict[str, Any] = {}

    # Prefill: 1D fit (batch_size → delay_ms)
    if "prefill" in data:
        points = data["prefill"]
        _validate_points(points, 3, "prefill")
        profile["prefill"] = _fit_1d(points, "batch_size", "delay_ms")

    # KV transfer: 1D fit (batch_size → delay_ms)
    if "kv_transfer" in data:
        points = data["kv_transfer"]
        _validate_points(points, 3, "kv_transfer")
        profile["kv_transfer"] = _fit_1d(points, "batch_size", "delay_ms")

    # Decode: 2D fit (batch_size, context_length → delay_per_token_ms);
    # the 2D surface has 6 coefficients, hence the higher minimum.
    if "decode" in data:
        points = data["decode"]
        _validate_points(points, 9, "decode (2D)")
        profile["decode"] = _fit_2d(points)

    # Write profile
    with open(output_path, "w") as f:
        yaml.dump(profile, f, default_flow_style=False, sort_keys=False)

    print(f"Profile written to {output_path}")

    # Optional visualization
    if plot_path:
        _plot(data, profile, plot_path)
        print(f"Plot written to {plot_path}")
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def _plot(
    data: dict[str, Any], profile: dict[str, Any], plot_path: str
) -> None:
    """Generate visualization PNG with sample points and fitted curves.

    One subplot per section present in *data*: prefill and kv_transfer are
    1D (batch_size vs delay), decode is 2D and is drawn as one curve per
    distinct batch_size over context_length.
    """
    # Import lazily and force a non-interactive backend so this works headless.
    import matplotlib

    matplotlib.use("Agg")
    import matplotlib.pyplot as plt

    num_plots = sum(1 for k in ("prefill", "kv_transfer", "decode") if k in data)
    fig, axes = plt.subplots(1, num_plots, figsize=(6 * num_plots, 5))
    # subplots() returns a bare Axes (not an array) when there is only one;
    # wrap it so indexing below is uniform.
    if num_plots == 1:
        axes = [axes]

    idx = 0

    # 1D sections: scatter the raw samples, then overlay the fitted polynomial.
    for key, label in [("prefill", "Prefill"), ("kv_transfer", "KV Transfer")]:
        if key not in data:
            continue
        ax = axes[idx]
        idx += 1
        points = data[key]
        xs = [p["batch_size"] for p in points]
        ys = [p["delay_ms"] for p in points]
        ax.scatter(xs, ys, color="red", zorder=5, label="Sample points")

        # Plot fitted curve (coefficients are highest-degree first, as polyval expects);
        # extend 20% past the sampled range on both sides.
        coeffs = profile[key]["coefficients"]
        x_fit = np.linspace(min(xs) * 0.8, max(xs) * 1.2, 100)
        y_fit = np.polyval(coeffs, x_fit)
        ax.plot(x_fit, y_fit, color="blue", label="Fitted curve")
        ax.set_xlabel("Batch size (tokens)")
        ax.set_ylabel("Delay (ms)")
        ax.set_title(f"{label} Latency")
        ax.legend()

    # 2D decode section: one scatter series and one fitted curve per batch size.
    if "decode" in data:
        ax = axes[idx]
        points = data["decode"]
        bs_vals = sorted(set(p["batch_size"] for p in points))
        for b in bs_vals:
            pts = [p for p in points if p["batch_size"] == b]
            ctxs = [p["context_length"] for p in pts]
            delays = [p["delay_per_token_ms"] for p in pts]
            ax.scatter(ctxs, delays, zorder=5, label=f"bs={b} (data)")

        # Fitted curves per batch_size, evaluating the poly2d surface
        # a + b*bs + c*ctx + d*bs*ctx + e*bs^2 + f*ctx^2 at fixed bs.
        coeffs = profile["decode"]["coefficients"]
        ctx_fit = np.linspace(
            min(p["context_length"] for p in points) * 0.8,
            max(p["context_length"] for p in points) * 1.2,
            100,
        )
        for b in bs_vals:
            y_fit = (
                coeffs[0]
                + coeffs[1] * b
                + coeffs[2] * ctx_fit
                + coeffs[3] * b * ctx_fit
                + coeffs[4] * b**2
                + coeffs[5] * ctx_fit**2
            )
            ax.plot(ctx_fit, y_fit, linestyle="--", label=f"bs={b} (fit)")

        ax.set_xlabel("Context length")
        ax.set_ylabel("Delay per token (ms)")
        ax.set_title("Decode Latency (2D)")
        ax.legend(fontsize=7)

    plt.tight_layout()
    plt.savefig(plot_path, dpi=150)
    plt.close()
|
|
@@ -0,0 +1,248 @@
|
|
|
1
|
+
"""CLI entry point with YAML config and environment variable support."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import os
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
import yaml
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _load_yaml_config(path: str | Path) -> dict[str, Any]:
    """Load and flatten a YAML config file into CLI-compatible keys."""
    with open(path) as fh:
        raw = yaml.safe_load(fh) or {}

    flat: dict[str, Any] = {}

    # Top-level scalars pass through under their own names.
    for key in ("mode", "port", "host", "model", "max_model_len"):
        if key in raw:
            flat[key] = raw[key]

    # latency.* keys keep their names when flattened.
    latency = raw.get("latency", {})
    for key in ("prefill_delay_ms", "kv_transfer_delay_ms", "decode_delay_per_token_ms"):
        if key in latency:
            flat[key] = latency[key]

    # eos.min_ratio -> eos_min_ratio
    eos = raw.get("eos", {})
    if "min_ratio" in eos:
        flat["eos_min_ratio"] = eos["min_ratio"]

    # warmup.requests / warmup.penalty_ms -> warmup_requests / warmup_penalty_ms
    warmup = raw.get("warmup", {})
    for src, dst in (("requests", "warmup_requests"), ("penalty_ms", "warmup_penalty_ms")):
        if src in warmup:
            flat[dst] = warmup[src]

    # logging.request_log -> log_requests
    logging_cfg = raw.get("logging", {})
    if "request_log" in logging_cfg:
        flat["log_requests"] = logging_cfg["request_log"]

    # Profile path passes straight through.
    if "profile" in raw:
        flat["profile"] = raw["profile"]

    # scheduling.* keys; scheduling.max_model_len overrides the top-level value,
    # and scheduling.enabled is renamed to scheduling_enabled.
    scheduling = raw.get("scheduling", {})
    for key in ("max_num_batched_tokens", "max_num_seqs"):
        if key in scheduling:
            flat[key] = scheduling[key]
    if "max_model_len" in scheduling:
        flat["max_model_len"] = scheduling["max_model_len"]
    if "enabled" in scheduling:
        flat["scheduling_enabled"] = scheduling["enabled"]

    return flat
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
# Mapping: config key -> (env var name, type converter)
|
|
65
|
+
_ENV_MAP: dict[str, tuple[str, type]] = {
|
|
66
|
+
"mode": ("XPYD_SIM_MODE", str),
|
|
67
|
+
"port": ("XPYD_SIM_PORT", int),
|
|
68
|
+
"host": ("XPYD_SIM_HOST", str),
|
|
69
|
+
"model": ("XPYD_SIM_MODEL", str),
|
|
70
|
+
"prefill_delay_ms": ("XPYD_SIM_PREFILL_DELAY_MS", float),
|
|
71
|
+
"kv_transfer_delay_ms": ("XPYD_SIM_KV_TRANSFER_DELAY_MS", float),
|
|
72
|
+
"decode_delay_per_token_ms": ("XPYD_SIM_DECODE_DELAY_PER_TOKEN_MS", float),
|
|
73
|
+
"eos_min_ratio": ("XPYD_SIM_EOS_MIN_RATIO", float),
|
|
74
|
+
"max_model_len": ("XPYD_SIM_MAX_MODEL_LEN", int),
|
|
75
|
+
"warmup_requests": ("XPYD_SIM_WARMUP_REQUESTS", int),
|
|
76
|
+
"warmup_penalty_ms": ("XPYD_SIM_WARMUP_PENALTY_MS", float),
|
|
77
|
+
"log_requests": ("XPYD_SIM_LOG_REQUESTS", str),
|
|
78
|
+
"profile": ("XPYD_SIM_PROFILE", str),
|
|
79
|
+
"max_num_batched_tokens": ("XPYD_SIM_MAX_NUM_BATCHED_TOKENS", int),
|
|
80
|
+
"max_num_seqs": ("XPYD_SIM_MAX_NUM_SEQS", int),
|
|
81
|
+
"scheduling_enabled": ("XPYD_SIM_SCHEDULING_ENABLED", lambda v: v.lower() in ("1", "true")),
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
# Default values for all config keys
|
|
85
|
+
_DEFAULTS: dict[str, Any] = {
|
|
86
|
+
"mode": "dual",
|
|
87
|
+
"port": 8000,
|
|
88
|
+
"host": "0.0.0.0",
|
|
89
|
+
"model": "dummy",
|
|
90
|
+
"prefill_delay_ms": 50.0,
|
|
91
|
+
"kv_transfer_delay_ms": 5.0,
|
|
92
|
+
"decode_delay_per_token_ms": 10.0,
|
|
93
|
+
"eos_min_ratio": 0.5,
|
|
94
|
+
"max_model_len": 131072,
|
|
95
|
+
"warmup_requests": 0,
|
|
96
|
+
"warmup_penalty_ms": 0.0,
|
|
97
|
+
"log_requests": None,
|
|
98
|
+
"profile": None,
|
|
99
|
+
"max_num_batched_tokens": 8192,
|
|
100
|
+
"max_num_seqs": 256,
|
|
101
|
+
"scheduling_enabled": False,
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def _resolve_config(
|
|
106
|
+
cli_args: argparse.Namespace,
|
|
107
|
+
yaml_config: dict[str, Any] | None = None,
|
|
108
|
+
) -> dict[str, Any]:
|
|
109
|
+
"""Resolve config with priority: CLI > env vars > YAML > defaults."""
|
|
110
|
+
result: dict[str, Any] = {}
|
|
111
|
+
|
|
112
|
+
# Map CLI arg names to config keys
|
|
113
|
+
cli_to_key = {
|
|
114
|
+
"mode": "mode",
|
|
115
|
+
"port": "port",
|
|
116
|
+
"host": "host",
|
|
117
|
+
"model": "model",
|
|
118
|
+
"prefill_delay_ms": "prefill_delay_ms",
|
|
119
|
+
"kv_transfer_delay_ms": "kv_transfer_delay_ms",
|
|
120
|
+
"decode_delay_per_token_ms": "decode_delay_per_token_ms",
|
|
121
|
+
"eos_min_ratio": "eos_min_ratio",
|
|
122
|
+
"max_model_len": "max_model_len",
|
|
123
|
+
"warmup_requests": "warmup_requests",
|
|
124
|
+
"warmup_penalty_ms": "warmup_penalty_ms",
|
|
125
|
+
"log_requests": "log_requests",
|
|
126
|
+
"profile": "profile",
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
for cli_attr, key in cli_to_key.items():
|
|
130
|
+
# 1. Check if CLI arg was explicitly provided
|
|
131
|
+
cli_val = getattr(cli_args, cli_attr, None)
|
|
132
|
+
default_val = _DEFAULTS[key]
|
|
133
|
+
# argparse sets unset args to their default — we use None defaults
|
|
134
|
+
# and check _explicitly_set to detect user intent
|
|
135
|
+
explicitly_set = key in getattr(cli_args, "_explicitly_set", set())
|
|
136
|
+
|
|
137
|
+
if explicitly_set:
|
|
138
|
+
result[key] = cli_val
|
|
139
|
+
continue
|
|
140
|
+
|
|
141
|
+
# 2. Check environment variable
|
|
142
|
+
env_name, converter = _ENV_MAP[key]
|
|
143
|
+
env_val = os.environ.get(env_name)
|
|
144
|
+
if env_val is not None:
|
|
145
|
+
try:
|
|
146
|
+
result[key] = converter(env_val)
|
|
147
|
+
except (ValueError, TypeError):
|
|
148
|
+
result[key] = default_val
|
|
149
|
+
continue
|
|
150
|
+
|
|
151
|
+
# 3. Check YAML config
|
|
152
|
+
if yaml_config and key in yaml_config:
|
|
153
|
+
result[key] = yaml_config[key]
|
|
154
|
+
continue
|
|
155
|
+
|
|
156
|
+
# 4. Use default
|
|
157
|
+
result[key] = default_val
|
|
158
|
+
|
|
159
|
+
return result
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
class _TrackingNamespace(argparse.Namespace):
|
|
163
|
+
"""Namespace that tracks which arguments were explicitly set on CLI."""
|
|
164
|
+
|
|
165
|
+
def __init__(self, **kwargs: Any) -> None:
|
|
166
|
+
super().__init__(**kwargs)
|
|
167
|
+
self._explicitly_set: set[str] = set()
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
class _TrackAction(argparse.Action):
|
|
171
|
+
"""Custom action that records when an arg is explicitly provided."""
|
|
172
|
+
|
|
173
|
+
def __call__( # type: ignore[override]
|
|
174
|
+
self, parser: argparse.ArgumentParser, namespace: argparse.Namespace,
|
|
175
|
+
values: Any, option_string: str | None = None,
|
|
176
|
+
) -> None:
|
|
177
|
+
setattr(namespace, self.dest, values)
|
|
178
|
+
if hasattr(namespace, "_explicitly_set"):
|
|
179
|
+
namespace._explicitly_set.add(self.dest)
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def main(argv: list[str] | None = None) -> None:
    """CLI entry point: dispatch to the 'serve' or 'calibrate' subcommand.

    Args:
        argv: argument list for testing; None means use sys.argv.
    """
    parser = argparse.ArgumentParser(prog="xpyd-sim", description="xPyD inference simulator")
    sub = parser.add_subparsers(dest="command")

    # Serve subcommand. Every option uses _TrackAction so _resolve_config can
    # distinguish "user passed this flag" from "argparse filled in the default".
    serve = sub.add_parser("serve", help="Start the unified simulator server")
    serve.add_argument("--mode", choices=["dual", "prefill", "decode"], default="dual",
                       action=_TrackAction)
    serve.add_argument("--port", type=int, default=8000, action=_TrackAction)
    serve.add_argument("--host", default="0.0.0.0", action=_TrackAction)
    serve.add_argument("--model", default="dummy", action=_TrackAction)
    serve.add_argument("--prefill-delay-ms", type=float, default=50.0, action=_TrackAction)
    serve.add_argument("--kv-transfer-delay-ms", type=float, default=5.0, action=_TrackAction)
    serve.add_argument("--decode-delay-per-token-ms", type=float, default=10.0,
                       action=_TrackAction)
    serve.add_argument("--eos-min-ratio", type=float, default=0.5, action=_TrackAction)
    serve.add_argument("--max-model-len", type=int, default=131072, action=_TrackAction)
    serve.add_argument("--warmup-requests", type=int, default=0, action=_TrackAction)
    serve.add_argument("--warmup-penalty-ms", type=float, default=0.0, action=_TrackAction)
    serve.add_argument("--log-requests", type=str, default=None, action=_TrackAction)
    serve.add_argument("--profile", type=str, default=None, action=_TrackAction)
    serve.add_argument("--config", type=str, default=None, help="YAML config file path",
                       action=_TrackAction)

    # Calibrate subcommand
    cal = sub.add_parser("calibrate", help="Fit latency curves from sample data")
    cal.add_argument("--input", required=True, help="Path to sample points YAML")
    cal.add_argument("--output", required=True, help="Path to write profile YAML")
    cal.add_argument("--plot", default=None, help="Path to write visualization PNG")

    # _TrackingNamespace carries the set that _TrackAction populates.
    args = parser.parse_args(argv, namespace=_TrackingNamespace())
    if not args.command:
        parser.print_help()
        return

    if args.command == "calibrate":
        # Imported lazily so 'serve' users don't pay for numpy/matplotlib deps.
        from xpyd_sim.calibrate import calibrate

        calibrate(args.input, args.output, args.plot)
        return

    if args.command == "serve":
        import uvicorn

        from xpyd_sim.server import ServerConfig, create_app

        # Load YAML config if provided
        yaml_config = None
        if args.config:
            yaml_config = _load_yaml_config(args.config)

        # Merge CLI > env > YAML > defaults.
        cfg = _resolve_config(args, yaml_config)

        # NOTE(review): the scheduling keys (max_num_batched_tokens,
        # max_num_seqs, scheduling_enabled) are never forwarded to
        # ServerConfig here — confirm whether the server reads them elsewhere.
        config = ServerConfig(
            mode=cfg["mode"],
            model_name=cfg["model"],
            prefill_delay_ms=cfg["prefill_delay_ms"],
            kv_transfer_delay_ms=cfg["kv_transfer_delay_ms"],
            decode_delay_per_token_ms=cfg["decode_delay_per_token_ms"],
            eos_min_ratio=cfg["eos_min_ratio"],
            max_model_len=cfg["max_model_len"],
            warmup_requests=cfg["warmup_requests"],
            warmup_penalty_ms=cfg["warmup_penalty_ms"],
            log_requests=cfg["log_requests"],
            profile=cfg["profile"],
        )
        app = create_app(config)
        uvicorn.run(app, host=cfg["host"], port=cfg["port"])
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Common models and helpers."""
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""Shared helper functions."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import time
|
|
6
|
+
import uuid
|
|
7
|
+
from typing import Any, Optional
|
|
8
|
+
|
|
9
|
+
# Pool of simulated output "tokens". NOTE: list() over a string splits it per
# character, so each simulated token is a single character (900 total from the
# repeated pangram).
DUMMY_TOKENS = list("The quick brown fox jumps over the lazy dog. " * 20)
# Fallback output length when a request specifies no max_tokens.
DEFAULT_MAX_TOKENS = 16
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def generate_id(prefix: str = "chatcmpl") -> str:
    """Return a short unique identifier like ``chatcmpl-1a2b3c4d5e6f``."""
    suffix = uuid.uuid4().hex[:12]
    return f"{prefix}-{suffix}"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def now_ts() -> int:
    """Return the current Unix timestamp in whole seconds."""
    seconds = time.time()
    return int(seconds)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def get_effective_max_tokens(*values: Optional[int]) -> int:
    """Return the first non-None candidate, or DEFAULT_MAX_TOKENS if none given."""
    supplied = [v for v in values if v is not None]
    if supplied:
        return supplied[0]
    return DEFAULT_MAX_TOKENS
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def count_prompt_tokens(prompt: Any = None, messages: list | None = None) -> int:
    """Estimate a token count (~4 characters per token, minimum 1).

    *messages* (chat-style) takes precedence over *prompt*; a list prompt of
    all-ints is treated as pre-tokenized ids and counted directly.
    """
    if messages is not None:
        char_total = 0
        for msg in messages:
            char_total += len(str(getattr(msg, "content", "")))
            char_total += len(getattr(msg, "role", ""))
        return max(1, char_total // 4)

    if prompt is None:
        return 1

    if isinstance(prompt, list):
        if all(isinstance(item, int) for item in prompt):
            # Already-tokenized prompt: one entry == one token.
            return len(prompt)
        return max(1, sum(len(str(item)) for item in prompt) // 4)

    # Strings and any other type share the char-count heuristic.
    text = prompt if isinstance(prompt, str) else str(prompt)
    return max(1, len(text) // 4)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def render_dummy_text(n_tokens: int) -> str:
    """Concatenate the first *n_tokens* dummy tokens, capped at the pool size."""
    take = min(n_tokens, len(DUMMY_TOKENS))
    return "".join(DUMMY_TOKENS[:take])
|