xpyd-sim 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. xpyd_sim-0.1.0/PKG-INFO +55 -0
  2. xpyd_sim-0.1.0/README.md +35 -0
  3. xpyd_sim-0.1.0/pyproject.toml +49 -0
  4. xpyd_sim-0.1.0/setup.cfg +4 -0
  5. xpyd_sim-0.1.0/src/xpyd_sim/__init__.py +3 -0
  6. xpyd_sim-0.1.0/src/xpyd_sim/calibrate.py +180 -0
  7. xpyd_sim-0.1.0/src/xpyd_sim/cli.py +248 -0
  8. xpyd_sim-0.1.0/src/xpyd_sim/common/__init__.py +1 -0
  9. xpyd_sim-0.1.0/src/xpyd_sim/common/helpers.py +47 -0
  10. xpyd_sim-0.1.0/src/xpyd_sim/common/logprobs.py +83 -0
  11. xpyd_sim-0.1.0/src/xpyd_sim/common/models.py +153 -0
  12. xpyd_sim-0.1.0/src/xpyd_sim/decode/__init__.py +1 -0
  13. xpyd_sim-0.1.0/src/xpyd_sim/observability.py +123 -0
  14. xpyd_sim-0.1.0/src/xpyd_sim/prefill/__init__.py +5 -0
  15. xpyd_sim-0.1.0/src/xpyd_sim/prefill/app.py +300 -0
  16. xpyd_sim-0.1.0/src/xpyd_sim/profile.py +61 -0
  17. xpyd_sim-0.1.0/src/xpyd_sim/scheduler.py +353 -0
  18. xpyd_sim-0.1.0/src/xpyd_sim/server.py +945 -0
  19. xpyd_sim-0.1.0/src/xpyd_sim.egg-info/PKG-INFO +55 -0
  20. xpyd_sim-0.1.0/src/xpyd_sim.egg-info/SOURCES.txt +32 -0
  21. xpyd_sim-0.1.0/src/xpyd_sim.egg-info/dependency_links.txt +1 -0
  22. xpyd_sim-0.1.0/src/xpyd_sim.egg-info/entry_points.txt +2 -0
  23. xpyd_sim-0.1.0/src/xpyd_sim.egg-info/requires.txt +14 -0
  24. xpyd_sim-0.1.0/src/xpyd_sim.egg-info/top_level.txt +1 -0
  25. xpyd_sim-0.1.0/tests/test_helpers.py +54 -0
  26. xpyd_sim-0.1.0/tests/test_m3_params.py +256 -0
  27. xpyd_sim-0.1.0/tests/test_m4_config.py +268 -0
  28. xpyd_sim-0.1.0/tests/test_m5_calibrate.py +223 -0
  29. xpyd_sim-0.1.0/tests/test_m6_observability.py +282 -0
  30. xpyd_sim-0.1.0/tests/test_m7_e2e_concurrency.py +443 -0
  31. xpyd_sim-0.1.0/tests/test_m8_backward_compat.py +560 -0
  32. xpyd_sim-0.1.0/tests/test_models.py +105 -0
  33. xpyd_sim-0.1.0/tests/test_prefill.py +187 -0
  34. xpyd_sim-0.1.0/tests/test_server.py +586 -0
@@ -0,0 +1,55 @@
1
+ Metadata-Version: 2.4
2
+ Name: xpyd-sim
3
+ Version: 0.1.0
4
+ Summary: OpenAI-compatible LLM inference simulator for xPyD
5
+ Requires-Python: >=3.10
6
+ Description-Content-Type: text/markdown
7
+ Requires-Dist: fastapi>=0.111.0
8
+ Requires-Dist: uvicorn>=0.30.0
9
+ Requires-Dist: pydantic>=2.0.0
10
+ Requires-Dist: httpx>=0.27.0
11
+ Requires-Dist: pyyaml>=6.0
12
+ Requires-Dist: numpy>=1.26.0
13
+ Requires-Dist: scipy>=1.12.0
14
+ Requires-Dist: matplotlib>=3.8.0
15
+ Provides-Extra: dev
16
+ Requires-Dist: pytest>=8.0.0; extra == "dev"
17
+ Requires-Dist: pytest-asyncio>=0.23.0; extra == "dev"
18
+ Requires-Dist: ruff>=0.3.0; extra == "dev"
19
+ Requires-Dist: isort>=5.13.0; extra == "dev"
20
+
21
+ # xPyD-sim
22
+
23
+ OpenAI-compatible LLM inference simulator for [xPyD](https://github.com/xPyD-hub).
24
+
25
+ Simulates prefill and decode nodes with realistic behavior for testing xPyD-bench and xPyD-proxy without real GPU hardware.
26
+
27
+ ## Features
28
+
29
+ - Separate prefill and decode node simulators
30
+ - Full OpenAI API compatibility (/v1/completions, /v1/chat/completions, /v1/models)
31
+ - Configurable latency (prefill delay, decode delay per token)
32
+ - Streaming support with realistic token-by-token delivery
33
+ - EOS simulation with configurable output length distribution
34
+ - All OpenAI API parameters accepted
35
+ - Spec-compliant response formats
36
+
37
+ ## Install
38
+
39
+ ```bash
40
+ pip install xpyd-sim
41
+ ```
42
+
43
+ ## Quick Start
44
+
45
+ ```bash
46
+ # Start prefill node
47
+ xpyd-sim prefill --port 8001
48
+
49
+ # Start decode node
50
+ xpyd-sim decode --port 8002
51
+ ```
52
+
53
+ ## License
54
+
55
+ TBD
@@ -0,0 +1,35 @@
1
+ # xPyD-sim
2
+
3
+ OpenAI-compatible LLM inference simulator for [xPyD](https://github.com/xPyD-hub).
4
+
5
+ Simulates prefill and decode nodes with realistic behavior for testing xPyD-bench and xPyD-proxy without real GPU hardware.
6
+
7
+ ## Features
8
+
9
+ - Separate prefill and decode node simulators
10
+ - Full OpenAI API compatibility (/v1/completions, /v1/chat/completions, /v1/models)
11
+ - Configurable latency (prefill delay, decode delay per token)
12
+ - Streaming support with realistic token-by-token delivery
13
+ - EOS simulation with configurable output length distribution
14
+ - All OpenAI API parameters accepted
15
+ - Spec-compliant response formats
16
+
17
+ ## Install
18
+
19
+ ```bash
20
+ pip install xpyd-sim
21
+ ```
22
+
23
+ ## Quick Start
24
+
25
+ ```bash
26
+ # Start prefill node
27
+ xpyd-sim prefill --port 8001
28
+
29
+ # Start decode node
30
+ xpyd-sim decode --port 8002
31
+ ```
32
+
33
+ ## License
34
+
35
+ TBD
@@ -0,0 +1,49 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "xpyd-sim"
7
+ version = "0.1.0"
8
+ description = "OpenAI-compatible LLM inference simulator for xPyD"
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ dependencies = [
12
+ "fastapi>=0.111.0",
13
+ "uvicorn>=0.30.0",
14
+ "pydantic>=2.0.0",
15
+ "httpx>=0.27.0",
16
+ "pyyaml>=6.0",
17
+ "numpy>=1.26.0",
18
+ "scipy>=1.12.0",
19
+ "matplotlib>=3.8.0",
20
+ ]
21
+
22
+ [project.optional-dependencies]
23
+ dev = [
24
+ "pytest>=8.0.0",
25
+ "pytest-asyncio>=0.23.0",
26
+ "ruff>=0.3.0",
27
+ "isort>=5.13.0",
28
+ ]
29
+
30
+ [project.scripts]
31
+ xpyd-sim = "xpyd_sim.cli:main"
32
+
33
+ [tool.setuptools.packages.find]
34
+ where = ["src"]
35
+
36
+ [tool.ruff]
37
+ line-length = 100
38
+ target-version = "py310"
39
+
40
+ [tool.ruff.lint]
41
+ select = ["E", "F", "W", "I"]
42
+
43
+ [tool.isort]
44
+ profile = "black"
45
+ line_length = 100
46
+
47
+ [tool.pytest.ini_options]
48
+ testpaths = ["tests"]
49
+ asyncio_mode = "auto"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,3 @@
1
+ """xPyD-sim: OpenAI-compatible LLM inference simulator."""
2
+
3
+ __version__ = "0.1.0"
@@ -0,0 +1,180 @@
1
+ """Calibrate tool: fit latency curves from sample points."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import sys
6
+ from typing import Any
7
+
8
+ import numpy as np
9
+ import yaml
10
+
11
+
12
+ def _fit_1d(points: list[dict[str, float]], x_key: str, y_key: str) -> dict[str, Any]:
13
+ """Fit a 1D polynomial (degree 2) to sample points.
14
+
15
+ Returns coefficients for: y = a*x^2 + b*x + c
16
+ """
17
+ xs = np.array([p[x_key] for p in points], dtype=np.float64)
18
+ ys = np.array([p[y_key] for p in points], dtype=np.float64)
19
+ degree = min(2, len(xs) - 1)
20
+ coeffs = np.polyfit(xs, ys, degree)
21
+ # Pad to always have 3 coefficients (a, b, c)
22
+ padded = [0.0] * (3 - len(coeffs)) + list(coeffs)
23
+ return {
24
+ "type": "poly1d",
25
+ "coefficients": [float(c) for c in padded],
26
+ "x_key": x_key,
27
+ "y_key": y_key,
28
+ "x_range": [float(xs.min()), float(xs.max())],
29
+ }
30
+
31
+
32
+ def _fit_2d(points: list[dict[str, float]]) -> dict[str, Any]:
33
+ """Fit a 2D surface: delay = a + b*bs + c*ctx + d*bs*ctx + e*bs^2 + f*ctx^2.
34
+
35
+ Uses least-squares on polynomial features.
36
+ """
37
+ bs = np.array([p["batch_size"] for p in points], dtype=np.float64)
38
+ ctx = np.array([p["context_length"] for p in points], dtype=np.float64)
39
+ y = np.array([p["delay_per_token_ms"] for p in points], dtype=np.float64)
40
+
41
+ # Build design matrix: [1, bs, ctx, bs*ctx, bs^2, ctx^2]
42
+ X = np.column_stack([ # noqa: N806
43
+ np.ones_like(bs),
44
+ bs,
45
+ ctx,
46
+ bs * ctx,
47
+ bs**2,
48
+ ctx**2,
49
+ ])
50
+
51
+ # Least squares fit
52
+ coeffs, _, _, _ = np.linalg.lstsq(X, y, rcond=None)
53
+
54
+ return {
55
+ "type": "poly2d",
56
+ "coefficients": [float(c) for c in coeffs],
57
+ "bs_range": [float(bs.min()), float(bs.max())],
58
+ "ctx_range": [float(ctx.min()), float(ctx.max())],
59
+ }
60
+
61
+
62
+ def _validate_points(points: list[dict], min_count: int, label: str) -> None:
63
+ """Validate minimum number of sample points."""
64
+ if len(points) < min_count:
65
+ print(
66
+ f"Error: {label} requires at least {min_count} sample points, got {len(points)}",
67
+ file=sys.stderr,
68
+ )
69
+ sys.exit(1)
70
+
71
+
72
def calibrate(input_path: str, output_path: str, plot_path: str | None = None) -> None:
    """Run calibration: load samples, fit curves, write profile + optional plot.

    Args:
        input_path: YAML file of sample points with optional top-level
            sections ``prefill``, ``kv_transfer`` and ``decode``.
        output_path: Path where the fitted profile YAML is written.
        plot_path: Optional path for a visualization PNG of the fits.

    Exits the process (via _validate_points) when a section has too few
    sample points for its fit.
    """
    with open(input_path) as f:
        data = yaml.safe_load(f)

    profile: dict[str, Any] = {}

    # Prefill: 1D fit (batch_size → delay_ms)
    if "prefill" in data:
        points = data["prefill"]
        _validate_points(points, 3, "prefill")
        profile["prefill"] = _fit_1d(points, "batch_size", "delay_ms")

    # KV transfer: 1D fit (batch_size → delay_ms)
    if "kv_transfer" in data:
        points = data["kv_transfer"]
        _validate_points(points, 3, "kv_transfer")
        profile["kv_transfer"] = _fit_1d(points, "batch_size", "delay_ms")

    # Decode: 2D fit (batch_size, context_length → delay_per_token_ms)
    if "decode" in data:
        points = data["decode"]
        # The 2D surface has 6 coefficients, so demand more samples for a stable fit.
        _validate_points(points, 9, "decode (2D)")
        profile["decode"] = _fit_2d(points)

    # Write profile
    with open(output_path, "w") as f:
        yaml.dump(profile, f, default_flow_style=False, sort_keys=False)

    print(f"Profile written to {output_path}")

    # Optional visualization
    if plot_path:
        _plot(data, profile, plot_path)
        print(f"Plot written to {plot_path}")
107
+
108
+
109
def _plot(
    data: dict[str, Any], profile: dict[str, Any], plot_path: str
) -> None:
    """Generate visualization PNG with sample points and fitted curves.

    Draws one subplot per section present in *data*: a scatter of the raw
    sample points plus the fitted curve(s) evaluated from *profile*.
    """
    import matplotlib

    # Headless backend; must be selected before pyplot is imported.
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt

    num_plots = sum(1 for k in ("prefill", "kv_transfer", "decode") if k in data)
    fig, axes = plt.subplots(1, num_plots, figsize=(6 * num_plots, 5))
    if num_plots == 1:
        # subplots() returns a bare Axes (not an array) when there is one column.
        axes = [axes]

    idx = 0

    # 1D sections: scatter the samples and overlay the fitted polynomial.
    for key, label in [("prefill", "Prefill"), ("kv_transfer", "KV Transfer")]:
        if key not in data:
            continue
        ax = axes[idx]
        idx += 1
        points = data[key]
        xs = [p["batch_size"] for p in points]
        ys = [p["delay_ms"] for p in points]
        ax.scatter(xs, ys, color="red", zorder=5, label="Sample points")

        # Plot fitted curve
        coeffs = profile[key]["coefficients"]
        # Extend 20% past the sampled range to show extrapolation behavior.
        x_fit = np.linspace(min(xs) * 0.8, max(xs) * 1.2, 100)
        y_fit = np.polyval(coeffs, x_fit)
        ax.plot(x_fit, y_fit, color="blue", label="Fitted curve")
        ax.set_xlabel("Batch size (tokens)")
        ax.set_ylabel("Delay (ms)")
        ax.set_title(f"{label} Latency")
        ax.legend()

    if "decode" in data:
        ax = axes[idx]
        points = data["decode"]
        bs_vals = sorted(set(p["batch_size"] for p in points))
        # One scatter series per distinct batch size.
        for b in bs_vals:
            pts = [p for p in points if p["batch_size"] == b]
            ctxs = [p["context_length"] for p in pts]
            delays = [p["delay_per_token_ms"] for p in pts]
            ax.scatter(ctxs, delays, zorder=5, label=f"bs={b} (data)")

        # Fitted curves per batch_size
        coeffs = profile["decode"]["coefficients"]
        ctx_fit = np.linspace(
            min(p["context_length"] for p in points) * 0.8,
            max(p["context_length"] for p in points) * 1.2,
            100,
        )
        for b in bs_vals:
            # Evaluate the poly2d surface at fixed batch size b:
            # delay = c0 + c1*bs + c2*ctx + c3*bs*ctx + c4*bs^2 + c5*ctx^2
            y_fit = (
                coeffs[0]
                + coeffs[1] * b
                + coeffs[2] * ctx_fit
                + coeffs[3] * b * ctx_fit
                + coeffs[4] * b**2
                + coeffs[5] * ctx_fit**2
            )
            ax.plot(ctx_fit, y_fit, linestyle="--", label=f"bs={b} (fit)")

        ax.set_xlabel("Context length")
        ax.set_ylabel("Delay per token (ms)")
        ax.set_title("Decode Latency (2D)")
        ax.legend(fontsize=7)

    plt.tight_layout()
    plt.savefig(plot_path, dpi=150)
    plt.close()
@@ -0,0 +1,248 @@
1
+ """CLI entry point with YAML config and environment variable support."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import os
7
+ from pathlib import Path
8
+ from typing import Any
9
+
10
+ import yaml
11
+
12
+
13
def _load_yaml_config(path: str | Path) -> dict[str, Any]:
    """Load a YAML config file and flatten its nested sections into CLI-style keys."""
    with open(path) as fh:
        raw = yaml.safe_load(fh) or {}

    out: dict[str, Any] = {}

    # Selected top-level scalars are copied verbatim.
    for name in ("mode", "port", "host", "model", "max_model_len"):
        if name in raw:
            out[name] = raw[name]

    # latency.* keys keep their names.
    section = raw.get("latency", {})
    for name in ("prefill_delay_ms", "kv_transfer_delay_ms", "decode_delay_per_token_ms"):
        if name in section:
            out[name] = section[name]

    # eos.min_ratio -> eos_min_ratio
    section = raw.get("eos", {})
    if "min_ratio" in section:
        out["eos_min_ratio"] = section["min_ratio"]

    # warmup.{requests,penalty_ms} -> warmup_requests / warmup_penalty_ms
    section = raw.get("warmup", {})
    for src, dst in (("requests", "warmup_requests"), ("penalty_ms", "warmup_penalty_ms")):
        if src in section:
            out[dst] = section[src]

    # logging.request_log -> log_requests
    section = raw.get("logging", {})
    if "request_log" in section:
        out["log_requests"] = section["request_log"]

    # Top-level profile path.
    if "profile" in raw:
        out["profile"] = raw["profile"]

    # scheduling.* — note scheduling.max_model_len overrides the top-level
    # value, and scheduling.enabled is renamed to scheduling_enabled.
    section = raw.get("scheduling", {})
    for name in ("max_num_batched_tokens", "max_num_seqs"):
        if name in section:
            out[name] = section[name]
    if "max_model_len" in section:
        out["max_model_len"] = section["max_model_len"]
    if "enabled" in section:
        out["scheduling_enabled"] = section["enabled"]

    return out
62
+
63
+
64
# Mapping: config key -> (env var name, type converter)
# Consulted by _resolve_config when a flag was not explicitly set on the CLI.
_ENV_MAP: dict[str, tuple[str, type]] = {
    "mode": ("XPYD_SIM_MODE", str),
    "port": ("XPYD_SIM_PORT", int),
    "host": ("XPYD_SIM_HOST", str),
    "model": ("XPYD_SIM_MODEL", str),
    "prefill_delay_ms": ("XPYD_SIM_PREFILL_DELAY_MS", float),
    "kv_transfer_delay_ms": ("XPYD_SIM_KV_TRANSFER_DELAY_MS", float),
    "decode_delay_per_token_ms": ("XPYD_SIM_DECODE_DELAY_PER_TOKEN_MS", float),
    "eos_min_ratio": ("XPYD_SIM_EOS_MIN_RATIO", float),
    "max_model_len": ("XPYD_SIM_MAX_MODEL_LEN", int),
    "warmup_requests": ("XPYD_SIM_WARMUP_REQUESTS", int),
    "warmup_penalty_ms": ("XPYD_SIM_WARMUP_PENALTY_MS", float),
    "log_requests": ("XPYD_SIM_LOG_REQUESTS", str),
    "profile": ("XPYD_SIM_PROFILE", str),
    "max_num_batched_tokens": ("XPYD_SIM_MAX_NUM_BATCHED_TOKENS", int),
    "max_num_seqs": ("XPYD_SIM_MAX_NUM_SEQS", int),
    # Boolean converter: only "1"/"true" (case-insensitive) is truthy.
    "scheduling_enabled": ("XPYD_SIM_SCHEDULING_ENABLED", lambda v: v.lower() in ("1", "true")),
}

# Default values for all config keys
# (used when a key is absent from CLI args, env vars and the YAML config).
_DEFAULTS: dict[str, Any] = {
    "mode": "dual",
    "port": 8000,
    "host": "0.0.0.0",
    "model": "dummy",
    "prefill_delay_ms": 50.0,
    "kv_transfer_delay_ms": 5.0,
    "decode_delay_per_token_ms": 10.0,
    "eos_min_ratio": 0.5,
    "max_model_len": 131072,
    "warmup_requests": 0,
    "warmup_penalty_ms": 0.0,
    "log_requests": None,
    "profile": None,
    "max_num_batched_tokens": 8192,
    "max_num_seqs": 256,
    "scheduling_enabled": False,
}
103
+
104
+
105
def _resolve_config(
    cli_args: argparse.Namespace,
    yaml_config: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Resolve config with priority: CLI > env vars > YAML > defaults.

    Args:
        cli_args: Parsed namespace; when it is a _TrackingNamespace, its
            ``_explicitly_set`` set distinguishes user-provided flags from
            argparse defaults.
        yaml_config: Flattened YAML config (from _load_yaml_config), or None.

    Returns:
        A dict containing every key in _DEFAULTS, each resolved from the
        highest-priority source that provides it.

    Fixes over the previous revision:
      * The scheduling keys (max_num_batched_tokens, max_num_seqs,
        scheduling_enabled) were mapped in _ENV_MAP/_DEFAULTS and flattened
        from YAML but never resolved here, so env/YAML scheduling settings
        were silently dropped.
      * A malformed env var used to force the built-in default, shadowing an
        explicit YAML value; it now falls through to YAML, then defaults.
    """
    result: dict[str, Any] = {}

    # CLI attribute names match config keys one-to-one. Scheduling keys have
    # no CLI flags (yet) but remain resolvable from env vars and YAML.
    keys = (
        "mode",
        "port",
        "host",
        "model",
        "prefill_delay_ms",
        "kv_transfer_delay_ms",
        "decode_delay_per_token_ms",
        "eos_min_ratio",
        "max_model_len",
        "warmup_requests",
        "warmup_penalty_ms",
        "log_requests",
        "profile",
        "max_num_batched_tokens",
        "max_num_seqs",
        "scheduling_enabled",
    )

    # argparse fills defaults for unset flags, so we rely on the
    # _TrackAction/_TrackingNamespace bookkeeping to detect user intent.
    explicitly_set = getattr(cli_args, "_explicitly_set", set())

    for key in keys:
        # 1. CLI value, only when the flag was explicitly passed.
        if key in explicitly_set:
            result[key] = getattr(cli_args, key, None)
            continue

        # 2. Environment variable. An unparseable value is ignored (falls
        #    through to YAML/defaults) rather than clobbering them.
        env_name, converter = _ENV_MAP[key]
        env_val = os.environ.get(env_name)
        if env_val is not None:
            try:
                result[key] = converter(env_val)
                continue
            except (ValueError, TypeError):
                pass

        # 3. YAML config file.
        if yaml_config and key in yaml_config:
            result[key] = yaml_config[key]
            continue

        # 4. Built-in default.
        result[key] = _DEFAULTS[key]

    return result
160
+
161
+
162
+ class _TrackingNamespace(argparse.Namespace):
163
+ """Namespace that tracks which arguments were explicitly set on CLI."""
164
+
165
+ def __init__(self, **kwargs: Any) -> None:
166
+ super().__init__(**kwargs)
167
+ self._explicitly_set: set[str] = set()
168
+
169
+
170
+ class _TrackAction(argparse.Action):
171
+ """Custom action that records when an arg is explicitly provided."""
172
+
173
+ def __call__( # type: ignore[override]
174
+ self, parser: argparse.ArgumentParser, namespace: argparse.Namespace,
175
+ values: Any, option_string: str | None = None,
176
+ ) -> None:
177
+ setattr(namespace, self.dest, values)
178
+ if hasattr(namespace, "_explicitly_set"):
179
+ namespace._explicitly_set.add(self.dest)
180
+
181
+
182
def main(argv: list[str] | None = None) -> None:
    """CLI entry point: dispatch to the ``serve`` or ``calibrate`` subcommand.

    Args:
        argv: Optional argument list; defaults to sys.argv[1:] via argparse.
    """
    parser = argparse.ArgumentParser(prog="xpyd-sim", description="xPyD inference simulator")
    sub = parser.add_subparsers(dest="command")

    # Serve subcommand. Every flag uses _TrackAction so _resolve_config can
    # tell an explicit CLI value apart from an argparse default.
    serve = sub.add_parser("serve", help="Start the unified simulator server")
    serve.add_argument("--mode", choices=["dual", "prefill", "decode"], default="dual",
                       action=_TrackAction)
    serve.add_argument("--port", type=int, default=8000, action=_TrackAction)
    serve.add_argument("--host", default="0.0.0.0", action=_TrackAction)
    serve.add_argument("--model", default="dummy", action=_TrackAction)
    serve.add_argument("--prefill-delay-ms", type=float, default=50.0, action=_TrackAction)
    serve.add_argument("--kv-transfer-delay-ms", type=float, default=5.0, action=_TrackAction)
    serve.add_argument("--decode-delay-per-token-ms", type=float, default=10.0,
                       action=_TrackAction)
    serve.add_argument("--eos-min-ratio", type=float, default=0.5, action=_TrackAction)
    serve.add_argument("--max-model-len", type=int, default=131072, action=_TrackAction)
    serve.add_argument("--warmup-requests", type=int, default=0, action=_TrackAction)
    serve.add_argument("--warmup-penalty-ms", type=float, default=0.0, action=_TrackAction)
    serve.add_argument("--log-requests", type=str, default=None, action=_TrackAction)
    serve.add_argument("--profile", type=str, default=None, action=_TrackAction)
    serve.add_argument("--config", type=str, default=None, help="YAML config file path",
                       action=_TrackAction)

    # Calibrate subcommand
    cal = sub.add_parser("calibrate", help="Fit latency curves from sample data")
    cal.add_argument("--input", required=True, help="Path to sample points YAML")
    cal.add_argument("--output", required=True, help="Path to write profile YAML")
    cal.add_argument("--plot", default=None, help="Path to write visualization PNG")

    args = parser.parse_args(argv, namespace=_TrackingNamespace())
    if not args.command:
        # No subcommand given: show usage and exit cleanly.
        parser.print_help()
        return

    if args.command == "calibrate":
        # Lazy import keeps startup light for the serve path.
        from xpyd_sim.calibrate import calibrate

        calibrate(args.input, args.output, args.plot)
        return

    if args.command == "serve":
        # Lazy imports: uvicorn and the server app are only needed here.
        import uvicorn

        from xpyd_sim.server import ServerConfig, create_app

        # Load YAML config if provided
        yaml_config = None
        if args.config:
            yaml_config = _load_yaml_config(args.config)

        # Priority: CLI > env vars > YAML > defaults.
        cfg = _resolve_config(args, yaml_config)

        # NOTE(review): scheduling keys (max_num_batched_tokens, max_num_seqs,
        # scheduling_enabled) are flattened from YAML but never passed to
        # ServerConfig here — confirm whether that is intentional.
        config = ServerConfig(
            mode=cfg["mode"],
            model_name=cfg["model"],
            prefill_delay_ms=cfg["prefill_delay_ms"],
            kv_transfer_delay_ms=cfg["kv_transfer_delay_ms"],
            decode_delay_per_token_ms=cfg["decode_delay_per_token_ms"],
            eos_min_ratio=cfg["eos_min_ratio"],
            max_model_len=cfg["max_model_len"],
            warmup_requests=cfg["warmup_requests"],
            warmup_penalty_ms=cfg["warmup_penalty_ms"],
            log_requests=cfg["log_requests"],
            profile=cfg["profile"],
        )
        app = create_app(config)
        uvicorn.run(app, host=cfg["host"], port=cfg["port"])
@@ -0,0 +1 @@
1
+ """Common models and helpers."""
@@ -0,0 +1,47 @@
1
+ """Shared helper functions."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import time
6
+ import uuid
7
+ from typing import Any, Optional
8
+
9
# Pool of single characters used as fake "tokens" by render_dummy_text
# (a 45-char sentence repeated 20 times → 900 characters).
DUMMY_TOKENS = list("The quick brown fox jumps over the lazy dog. " * 20)
# Fallback used by get_effective_max_tokens when no max_tokens value is given.
DEFAULT_MAX_TOKENS = 16
11
+
12
+
13
def generate_id(prefix: str = "chatcmpl") -> str:
    """Return a request id of the form ``<prefix>-<12 random hex chars>``."""
    suffix = uuid.uuid4().hex[:12]
    return "-".join((prefix, suffix))
15
+
16
+
17
def now_ts() -> int:
    """Current Unix time, truncated to whole seconds."""
    seconds = time.time()
    return int(seconds)
19
+
20
+
21
+ def get_effective_max_tokens(*values: Optional[int]) -> int:
22
+ for v in values:
23
+ if v is not None:
24
+ return v
25
+ return DEFAULT_MAX_TOKENS
26
+
27
+
28
+ def count_prompt_tokens(prompt: Any = None, messages: list | None = None) -> int:
29
+ if messages is not None:
30
+ total = sum(
31
+ len(str(getattr(m, "content", ""))) + len(getattr(m, "role", ""))
32
+ for m in messages
33
+ )
34
+ return max(1, total // 4)
35
+ if prompt is None:
36
+ return 1
37
+ if isinstance(prompt, str):
38
+ return max(1, len(prompt) // 4)
39
+ if isinstance(prompt, list):
40
+ if all(isinstance(i, int) for i in prompt):
41
+ return len(prompt)
42
+ return max(1, sum(len(str(i)) for i in prompt) // 4)
43
+ return max(1, len(str(prompt)) // 4)
44
+
45
+
46
def render_dummy_text(n_tokens: int) -> str:
    """Return placeholder text of up to *n_tokens* characters from DUMMY_TOKENS."""
    count = min(n_tokens, len(DUMMY_TOKENS))
    return "".join(DUMMY_TOKENS[:count])