sglang 0.3.5.post2__py3-none-any.whl → 0.3.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. sglang/bench_latency.py +1 -553
  2. sglang/bench_offline_throughput.py +48 -20
  3. sglang/bench_one_batch.py +474 -0
  4. sglang/{bench_server_latency.py → bench_one_batch_server.py} +3 -3
  5. sglang/bench_serving.py +71 -1
  6. sglang/check_env.py +3 -6
  7. sglang/srt/constrained/outlines_backend.py +15 -2
  8. sglang/srt/constrained/xgrammar_backend.py +22 -14
  9. sglang/srt/layers/activation.py +3 -0
  10. sglang/srt/layers/attention/flashinfer_backend.py +93 -48
  11. sglang/srt/layers/attention/triton_backend.py +9 -7
  12. sglang/srt/layers/custom_op_util.py +26 -0
  13. sglang/srt/layers/fused_moe/fused_moe.py +11 -4
  14. sglang/srt/layers/layernorm.py +4 -0
  15. sglang/srt/layers/logits_processor.py +10 -10
  16. sglang/srt/layers/sampler.py +4 -8
  17. sglang/srt/layers/torchao_utils.py +2 -0
  18. sglang/srt/managers/data_parallel_controller.py +74 -9
  19. sglang/srt/managers/detokenizer_manager.py +1 -0
  20. sglang/srt/managers/io_struct.py +27 -0
  21. sglang/srt/managers/schedule_batch.py +104 -38
  22. sglang/srt/managers/schedule_policy.py +5 -1
  23. sglang/srt/managers/scheduler.py +204 -54
  24. sglang/srt/managers/session_controller.py +62 -0
  25. sglang/srt/managers/tokenizer_manager.py +38 -0
  26. sglang/srt/managers/tp_worker.py +12 -1
  27. sglang/srt/managers/tp_worker_overlap_thread.py +49 -52
  28. sglang/srt/model_executor/cuda_graph_runner.py +43 -6
  29. sglang/srt/model_executor/forward_batch_info.py +109 -15
  30. sglang/srt/model_executor/model_runner.py +99 -43
  31. sglang/srt/model_parallel.py +98 -0
  32. sglang/srt/models/deepseek_v2.py +147 -44
  33. sglang/srt/models/gemma2.py +9 -8
  34. sglang/srt/models/llava.py +1 -1
  35. sglang/srt/models/llavavid.py +1 -1
  36. sglang/srt/models/olmo.py +3 -3
  37. sglang/srt/models/phi3_small.py +447 -0
  38. sglang/srt/models/qwen2_vl.py +13 -6
  39. sglang/srt/models/torch_native_llama.py +94 -78
  40. sglang/srt/openai_api/adapter.py +6 -2
  41. sglang/srt/openai_api/protocol.py +1 -1
  42. sglang/srt/sampling/penaltylib/orchestrator.py +49 -79
  43. sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +3 -8
  44. sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +3 -9
  45. sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +3 -8
  46. sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +3 -8
  47. sglang/srt/sampling/sampling_batch_info.py +58 -57
  48. sglang/srt/sampling/sampling_params.py +1 -1
  49. sglang/srt/server.py +27 -1
  50. sglang/srt/server_args.py +78 -62
  51. sglang/srt/utils.py +71 -52
  52. sglang/test/runners.py +25 -6
  53. sglang/test/srt/sampling/penaltylib/utils.py +23 -21
  54. sglang/test/test_utils.py +30 -19
  55. sglang/version.py +1 -1
  56. {sglang-0.3.5.post2.dist-info → sglang-0.3.6.dist-info}/METADATA +43 -43
  57. {sglang-0.3.5.post2.dist-info → sglang-0.3.6.dist-info}/RECORD +60 -55
  58. {sglang-0.3.5.post2.dist-info → sglang-0.3.6.dist-info}/WHEEL +1 -1
  59. {sglang-0.3.5.post2.dist-info → sglang-0.3.6.dist-info}/LICENSE +0 -0
  60. {sglang-0.3.5.post2.dist-info → sglang-0.3.6.dist-info}/top_level.txt +0 -0
sglang/bench_latency.py CHANGED
@@ -1,553 +1 @@
- """
- Benchmark the latency of running a single static batch.
- This script does not launch a server and uses the low-level APIs.
- It accepts arguments similar to those of launch_server.py.
-
- # Usage (latency test)
- ## with dummy weights:
- python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --load-format dummy
- ## sweep through multiple data points and store (append) the results in a jsonl file:
- python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --output-len 32 256 --result-filename out.jsonl
- ## do some changes, and store the results under a different run_name:
- python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --output-len 32 256 --result-filename out.jsonl --run-name after
- ## plot the results in series of lines:
- python -m sglang.bench_latency --result-filename out.jsonl --graph-sql="select run_name, batch_size, prefill_throughput from results"
-
- # Usage (correctness test):
- python -m sglang.bench_latency --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --correct
-
- ## Reference output (of the correctness test above, can be gpu dependent):
- input_ids=[[1, 450, 7483, 310, 3444, 338], [1, 450, 7483, 310, 278, 3303, 13187, 290, 338], [1, 20628, 338, 263, 6575, 1460, 2462, 322, 306, 763]]
-
- prefill logits (first half): tensor([[-10.0312, -9.5000, 0.8931, ..., -4.9414, -3.2422, -3.3633],
- [-10.0312, -9.5000, 0.8931, ..., -4.9414, -3.2422, -3.3633],
- [ -9.1875, -10.2500, 2.7129, ..., -4.3359, -4.0664, -4.1328]],
- device='cuda:0')
-
- prefill logits (final): tensor([[-8.3125, -7.1172, 3.3457, ..., -4.9570, -4.1328, -3.4141],
- [-8.9141, -9.0156, 4.1445, ..., -4.9922, -4.4961, -4.0781],
- [-9.6328, -9.0547, 4.0195, ..., -5.3047, -4.7148, -4.4570]],
- device='cuda:0')
-
- ========== Prompt 0 ==========
- <s> The capital of France is Paris.
- The capital of the United States is Washington, D.C.
-
-
- ========== Prompt 1 ==========
- <s> The capital of the United Kindom is London.
- The capital of the United Kingdom is London.
- The capital of the
-
- ========== Prompt 2 ==========
- <s> Today is a sunny day and I like to go for a walk in the park.
- I'm going to the park
- """
-
- import argparse
- import dataclasses
- import itertools
- import json
- import logging
- import multiprocessing
- import os
- import sqlite3
- import time
- from typing import Tuple
-
- import numpy as np
- import pandas as pd
- import torch
- import torch.distributed as dist
-
- from sglang.srt.configs.model_config import ModelConfig
- from sglang.srt.hf_transformers_utils import get_tokenizer
- from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
- from sglang.srt.model_executor.forward_batch_info import ForwardBatch
- from sglang.srt.model_executor.model_runner import ModelRunner
- from sglang.srt.sampling.sampling_params import SamplingParams
- from sglang.srt.server import _set_envs_and_config
- from sglang.srt.server_args import PortArgs, ServerArgs
- from sglang.srt.utils import (
-     configure_logger,
-     kill_child_process,
-     suppress_other_loggers,
- )
-
-
- @dataclasses.dataclass
- class BenchArgs:
-     run_name: str = "before"
-     batch_size: Tuple[int] = (1,)
-     input_len: Tuple[int] = (1024,)
-     output_len: Tuple[int] = (16,)
-     result_filename: str = ""
-     correctness_test: bool = False
-     # This is only used for correctness test
-     cut_len: int = 4
-     # Plotting args
-     graph_sql: str = (
-         "select run_name, batch_size, prefill_throughput from results where run_name='before'"
-     )
-     graph_filename: str = "out.png"
-
-     @staticmethod
-     def add_cli_args(parser: argparse.ArgumentParser):
-         parser.add_argument("--run-name", type=str, default=BenchArgs.run_name)
-         parser.add_argument(
-             "--batch-size", type=int, nargs="+", default=BenchArgs.batch_size
-         )
-         parser.add_argument(
-             "--input-len", type=int, nargs="+", default=BenchArgs.input_len
-         )
-         parser.add_argument(
-             "--output-len", type=int, nargs="+", default=BenchArgs.output_len
-         )
-         parser.add_argument(
-             "--result-filename", type=str, default=BenchArgs.result_filename
-         )
-         parser.add_argument("--correctness-test", action="store_true")
-         parser.add_argument("--cut-len", type=int, default=BenchArgs.cut_len)
-         # graphing
-         parser.add_argument("--graph-sql", type=str, default=BenchArgs.graph_sql)
-         parser.add_argument(
-             "--graph-filename", type=str, default=BenchArgs.graph_filename
-         )
-
-     @classmethod
-     def from_cli_args(cls, args: argparse.Namespace):
-         # use the default value's type to case the args into correct types.
-         attrs = [(attr.name, type(attr.default)) for attr in dataclasses.fields(cls)]
-         return cls(
-             **{attr: attr_type(getattr(args, attr)) for attr, attr_type in attrs}
-         )
-
-
- def load_model(server_args, port_args, tp_rank):
-     suppress_other_loggers()
-     rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
-
-     model_config = ModelConfig(
-         server_args.model_path,
-         trust_remote_code=server_args.trust_remote_code,
-         context_length=server_args.context_length,
-         model_override_args=server_args.json_model_override_args,
-     )
-     model_runner = ModelRunner(
-         model_config=model_config,
-         mem_fraction_static=server_args.mem_fraction_static,
-         gpu_id=tp_rank,
-         tp_rank=tp_rank,
-         tp_size=server_args.tp_size,
-         nccl_port=port_args.nccl_port,
-         server_args=server_args,
-     )
-     rank_print(f"max_total_num_tokens={model_runner.max_total_num_tokens}")
-     tokenizer = get_tokenizer(
-         server_args.tokenizer_path,
-         tokenizer_mode=server_args.tokenizer_mode,
-         trust_remote_code=server_args.trust_remote_code,
-     )
-     if server_args.tp_size > 1:
-         dist.barrier()
-     return model_runner, tokenizer
-
-
- def prepare_inputs_for_correctness_test(bench_args, tokenizer):
-     prompts = [
-         "The capital of France is",
-         "The capital of the United Kindom is",
-         "Today is a sunny day and I like",
-     ]
-     input_ids = [tokenizer.encode(p) for p in prompts]
-     sampling_params = SamplingParams(
-         temperature=0,
-         max_new_tokens=BenchArgs.output_len,
-     )
-
-     reqs = []
-     for i in range(len(prompts)):
-         assert len(input_ids[i]) > bench_args.cut_len
-
-         tmp_input_ids = input_ids[i][: bench_args.cut_len]
-         req = Req(
-             rid=i,
-             origin_input_text=prompts[i],
-             origin_input_ids=tmp_input_ids,
-             sampling_params=sampling_params,
-         )
-         req.prefix_indices = []
-         req.fill_ids = req.origin_input_ids
-         req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
-         reqs.append(req)
-
-     return input_ids, reqs
-
-
- def prepare_extend_inputs_for_correctness_test(
-     bench_args, input_ids, reqs, model_runner
- ):
-     for i in range(len(reqs)):
-         req = reqs[i]
-         req.fill_ids += input_ids[i][bench_args.cut_len :]
-         req.prefix_indices = model_runner.req_to_token_pool.req_to_token[
-             i, : bench_args.cut_len
-         ]
-         req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
-     return reqs
-
-
- def prepare_synthetic_inputs_for_latency_test(batch_size, input_len):
-     input_ids = np.ones((batch_size, input_len), dtype=np.int32)
-     sampling_params = SamplingParams(
-         temperature=0,
-         max_new_tokens=BenchArgs.output_len,
-     )
-
-     reqs = []
-     for i in range(len(input_ids)):
-         req = Req(
-             rid=i,
-             origin_input_text="",
-             origin_input_ids=list(input_ids[i]),
-             sampling_params=sampling_params,
-         )
-         req.prefix_indices = []
-         req.fill_ids = req.origin_input_ids
-         req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
-         reqs.append(req)
-
-     return reqs
-
-
- @torch.inference_mode()
- def extend(reqs, model_runner):
-     batch = ScheduleBatch.init_new(
-         reqs=reqs,
-         req_to_token_pool=model_runner.req_to_token_pool,
-         token_to_kv_pool=model_runner.token_to_kv_pool,
-         tree_cache=None,
-         model_config=model_runner.model_config,
-     )
-     batch.prepare_for_extend()
-     model_worker_batch = batch.get_model_worker_batch()
-     forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)
-     logits_output = model_runner.forward(forward_batch)
-     next_token_ids = model_runner.sample(logits_output, forward_batch)
-     return next_token_ids, logits_output.next_token_logits, batch
-
-
- @torch.inference_mode()
- def decode(input_token_ids, batch, model_runner):
-     batch.output_ids = input_token_ids
-     batch.prepare_for_decode()
-     model_worker_batch = batch.get_model_worker_batch()
-     forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)
-     logits_output = model_runner.forward(forward_batch)
-     next_token_ids = model_runner.sample(logits_output, forward_batch)
-     return next_token_ids, logits_output.next_token_logits
-
-
- def correctness_test(
-     server_args,
-     port_args,
-     bench_args,
-     tp_rank,
- ):
-     configure_logger(server_args, prefix=f" TP{tp_rank}")
-     rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
-
-     # Load the model
-     model_runner, tokenizer = load_model(server_args, port_args, tp_rank)
-
-     # Prepare inputs
-     input_ids, reqs = prepare_inputs_for_correctness_test(bench_args, tokenizer)
-     rank_print(f"\n{input_ids=}\n")
-
-     if bench_args.cut_len > 0:
-         # Prefill
-         next_token_ids, next_token_logits, batch = extend(reqs, model_runner)
-         rank_print(f"prefill logits (first half): {next_token_logits} \n")
-
-     # Prepare extend inputs
-     reqs = prepare_extend_inputs_for_correctness_test(
-         bench_args, input_ids, reqs, model_runner
-     )
-
-     # Extend
-     next_token_ids, next_token_logits, batch = extend(reqs, model_runner)
-     rank_print(f"prefill logits (final): {next_token_logits} \n")
-
-     # Decode
-     output_ids = [input_ids[i] + [next_token_ids[i]] for i in range(len(input_ids))]
-     for _ in range(bench_args.output_len[0] - 1):
-         next_token_ids, _ = decode(next_token_ids, batch, model_runner)
-         next_token_ids_list = next_token_ids.tolist()
-         for i in range(len(reqs)):
-             output_ids[i].append(next_token_ids_list[i])
-
-     # Print
-     for i in range(len(reqs)):
-         rank_print(f"========== Prompt {i} ==========")
-         rank_print(tokenizer.decode(output_ids[i]), "\n")
-
-
- def synchronize(device):
-     if device == "cuda":
-         torch.cuda.synchronize()
-     elif device == "xpu":
-         torch.xpu.synchronize()
-
-
- def latency_test_run_once(
-     run_name, model_runner, rank_print, reqs, batch_size, input_len, output_len, device
- ):
-     max_batch_size = model_runner.max_total_num_tokens // (input_len + output_len)
-     if batch_size > max_batch_size:
-         rank_print(
-             f"skipping ({batch_size}, {input_len}, {output_len}) due to max batch size limit"
-         )
-         return
-
-     # Clear the pools.
-     model_runner.req_to_token_pool.clear()
-     model_runner.token_to_kv_pool.clear()
-
-     measurement_results = {
-         "run_name": run_name,
-         "batch_size": batch_size,
-         "input_len": input_len,
-         "output_len": output_len,
-     }
-
-     tot_latency = 0
-
-     # Prefill
-     synchronize(device)
-     tic = time.time()
-     next_token_ids, _, batch = extend(reqs, model_runner)
-     synchronize(device)
-     prefill_latency = time.time() - tic
-     tot_latency += prefill_latency
-     throughput = input_len * batch_size / prefill_latency
-     rank_print(
-         f"Prefill. latency: {prefill_latency:6.5f} s, throughput: {throughput:9.2f} token/s"
-     )
-     measurement_results["prefill_latency"] = prefill_latency
-     measurement_results["prefill_throughput"] = throughput
-
-     # Decode
-     decode_latencies = []
-     for i in range(output_len - 1):
-         synchronize(device)
-         tic = time.time()
-         next_token_ids, _ = decode(next_token_ids, batch, model_runner)
-         synchronize(device)
-         latency = time.time() - tic
-         tot_latency += latency
-         throughput = batch_size / latency
-         decode_latencies.append(latency)
-         if i < 5:
-             rank_print(
-                 f"Decode. latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
-             )
-
-     # record decode timing from 2nd output
-     if output_len > 1:
-         med_decode_latency = np.median(decode_latencies)
-         med_decode_throughput = batch_size / med_decode_latency
-         rank_print(
-             f"Decode. median latency: {med_decode_latency:6.5f} s, median throughput: {med_decode_throughput:9.2f} token/s"
-         )
-         measurement_results["median_decode_latency"] = med_decode_latency
-         measurement_results["median_decode_throughput"] = med_decode_throughput
-
-     throughput = (input_len + output_len) * batch_size / tot_latency
-     rank_print(
-         f"Total. latency: {tot_latency:6.3f} s, throughput: {throughput:9.2f} token/s"
-     )
-     measurement_results["total_latency"] = tot_latency
-     measurement_results["total_throughput"] = throughput
-     return measurement_results
-
-
- def latency_test(
-     server_args,
-     port_args,
-     bench_args,
-     tp_rank,
- ):
-     configure_logger(server_args, prefix=f" TP{tp_rank}")
-     rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
-
-     # Load the model
-     model_runner, tokenizer = load_model(server_args, port_args, tp_rank)
-
-     # Prepare inputs for warm up
-     reqs = prepare_synthetic_inputs_for_latency_test(
-         bench_args.batch_size[0], bench_args.input_len[0]
-     )
-
-     # Warm up
-     rank_print("Warmup ...")
-     latency_test_run_once(
-         bench_args.run_name,
-         model_runner,
-         rank_print,
-         reqs,
-         bench_args.batch_size[0],
-         bench_args.input_len[0],
-         8,  # shorter decoding to speed up the warmup
-         server_args.device,
-     )
-     rank_print("Benchmark ...")
-
-     # Run the sweep
-     result_list = []
-     for bs, il, ol in itertools.product(
-         bench_args.batch_size, bench_args.input_len, bench_args.output_len
-     ):
-         reqs = prepare_synthetic_inputs_for_latency_test(bs, il)
-         ret = latency_test_run_once(
-             bench_args.run_name,
-             model_runner,
-             rank_print,
-             reqs,
-             bs,
-             il,
-             ol,
-             server_args.device,
-         )
-         if ret is not None:
-             result_list.append(ret)
-
-     # Write results in jsonlines format on rank 0.
-     if tp_rank == 0 and bench_args.result_filename:
-         import jsonlines
-
-         with jsonlines.open(bench_args.result_filename, "a") as f:
-             f.write_all(result_list)
-
-
- def plot_latency_test(
-     server_args,
-     bench_args,
-     tp_rank,
- ):
-     assert tp_rank == 0
-
-     # read the jsonl file and put in sqlite
-     df = pd.read_json(bench_args.result_filename, lines=True)
-     conn = sqlite3.connect(":memory:")
-     cur = conn.cursor()
-
-     # get the columns and their types
-     column_names = list(df.iloc[0].keys())
-     type_dict = {
-         str: "TEXT",
-         np.int64: "INTEGER",
-         np.float64: "FLOAT",
-     }
-     column_types = [type_dict[type(i)] for i in list(df.iloc[0])]
-
-     # create the table
-     cur.execute(
-         f"""
-         CREATE TABLE IF NOT EXISTS results (
-             {", ".join([f"{name} {type}" for name, type in zip(column_names, column_types)])}
-         )
-         """
-     )
-     conn.commit()
-
-     # write the results to DB
-     df.to_sql("results", conn, if_exists="replace", index=False)
-     conn.commit()
-
-     # read it back using sql
-     df = pd.read_sql_query(bench_args.graph_sql, conn)
-     conn.close()
-
-     # plot it and save to a file
-     import matplotlib.pyplot as plt
-
-     assert (
-         len(df.columns) == 3
-     ), f"The sql should have fetched <series, x, y> columns, not {df.columns}"
-     for label in df[df.columns[0]].unique():
-         q = f"{df.columns[0]}=='{label}'"
-         series = df.query(q)
-         plt.plot(series[df.columns[1]], series[df.columns[2]], label=q, marker="o")
-     plt.xlabel(df.columns[1])
-     plt.ylabel(df.columns[2])
-     plt.legend()
-     plt.savefig(bench_args.graph_filename, dpi=300)
-
-     # if in kitty, just dump it to the terminal
-     if os.environ["TERM"] == "xterm-kitty":
-         os.system(
-             f"kitty icat --use-window-size 1,1,600,600 {bench_args.graph_filename}"
-         )
-
-
- def main(server_args, bench_args):
-     _set_envs_and_config(server_args)
-
-     if server_args.model_path:
-         if bench_args.correctness_test:
-             work_func = correctness_test
-         else:
-             work_func = latency_test
-     elif os.path.isfile(bench_args.result_filename):
-         assert bench_args.graph_filename, "please provide a filename for the graph"
-         work_func = plot_latency_test
-     else:
-         raise ValueError(
-             "Provide --model-path for running the tests or "
-             "provide --result-filename for plotting the results"
-         )
-
-     port_args = PortArgs.init_new(server_args)
-
-     if server_args.tp_size == 1:
-         work_func(server_args, port_args, bench_args, 0)
-     else:
-         workers = []
-         for tp_rank in range(server_args.tp_size):
-             proc = multiprocessing.Process(
-                 target=work_func,
-                 args=(
-                     server_args,
-                     port_args,
-                     bench_args,
-                     tp_rank,
-                 ),
-             )
-             proc.start()
-             workers.append(proc)
-
-         for proc in workers:
-             proc.join()
-
-         proc.terminate()
-
-
- if __name__ == "__main__":
-     parser = argparse.ArgumentParser()
-     ServerArgs.add_cli_args(parser)
-     BenchArgs.add_cli_args(parser)
-     args = parser.parse_args()
-     server_args = ServerArgs.from_cli_args(args)
-     bench_args = BenchArgs.from_cli_args(args)
-
-     logging.basicConfig(
-         level=getattr(logging, server_args.log_level.upper()),
-         format="%(message)s",
-     )
-
-     try:
-         main(server_args, bench_args)
-     except Exception as e:
-         raise e
-     finally:
-         kill_child_process()
+ raise ValueError("bench_latency.py has been renamed to bench_one_batch.py")
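Note: the single-batch benchmark deleted above now lives in sglang/bench_one_batch.py (file 3 in the list). A hedged example of the new entry point, assuming its CLI flags carry over from the old bench_latency.py usage shown in the removed docstring:
python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --load-format dummy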
sglang/bench_offline_throughput.py CHANGED
@@ -1,20 +1,13 @@
  """
- Benchmark the throughput of using the offline LLM engine.
- This script does not launch a server.
+ Benchmark the throughput in the offline mode.
  It accepts server arguments (the same as launch_server.py) and benchmark arguments (the same as bench_serving.py).

  # Usage
  ## Sharegpt dataset with default args
- python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct
+ python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --num-prompts 10

  ## Random dataset with default args
- python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --dataset-name random
-
- ## Shared prefix dataset with default args
- python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --dataset-name generated-shared-prefix
-
- ## Sharegpt dataset on runtime backend
- python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --backend runtime
+ python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --dataset-name random --random-input 1024 --random-output 1024
  """

  import argparse
@@ -23,7 +16,7 @@ import json
  import logging
  import random
  import time
- from typing import List, Optional, Tuple
+ from typing import Dict, List, Optional, Tuple

  import numpy as np

@@ -55,7 +48,10 @@ class BenchArgs:
      gen_question_len: int = 128
      gen_output_len: int = 256
      disable_ignore_eos: bool = False
+     extra_request_body: Optional[str] = None
      seed: int = 1
+     skip_warmup: bool = False
+     do_not_exit: bool = False

      @staticmethod
      def add_cli_args(parser: argparse.ArgumentParser):
@@ -142,7 +138,24 @@ class BenchArgs:
              default=BenchArgs.disable_ignore_eos,
              help="Disable ignore EOS token",
          )
+         parser.add_argument(
+             "--extra-request-body",
+             metavar='{"key1": "value1", "key2": "value2"}',
+             type=str,
+             help="Append given JSON object to the request payload. You can use this to specify"
+             "additional generate params like sampling params.",
+         )
          parser.add_argument("--seed", type=int, default=1, help="The random seed.")
+         parser.add_argument(
+             "--skip-warmup",
+             action="store_true",
+             help="Skip the warmup batches.",
+         )
+         parser.add_argument(
+             "--do-not-exit",
+             action="store_true",
+             help="Do not exit the program. This is useful for nsys profile with --duration and --delay.",
+         )

      @classmethod
      def from_cli_args(cls, args: argparse.Namespace):
@@ -155,6 +168,7 @@ def throughput_test_once(
      backend,
      reqs: List[Tuple[str, int, int]],
      ignore_eos: bool,
+     extra_request_body: Dict,
  ):
      measurement_results = {
          "backend": backend_name,
@@ -174,6 +188,7 @@ def throughput_test_once(
              "temperature": 0,
              "max_new_tokens": r[2],
              "ignore_eos": ignore_eos,
+             **extra_request_body,
          }
          for r in reqs
      ]
@@ -227,31 +242,41 @@ def throughput_test(
      random.seed(bench_args.seed)
      np.random.seed(bench_args.seed)

+     # Parse args
+     extra_request_body = {}
+     if bench_args.extra_request_body:
+         extra_request_body = json.loads(args.extra_request_body)
+
      # Read dataset
      input_requests = get_dataset(bench_args, tokenizer)

      warmup_requests = sample_random_requests(
-         input_len=20,
-         output_len=4,
-         num_prompts=2,
+         input_len=256,
+         output_len=16,
+         num_prompts=16,
          range_ratio=0.8,
          tokenizer=tokenizer,
          dataset_path=bench_args.dataset_path,
      )

      # Warm up
-     throughput_test_once(
-         backend_name=bench_args.backend,
-         backend=backend,
-         reqs=warmup_requests,
-         ignore_eos=not bench_args.disable_ignore_eos,
-     )
+     if not bench_args.skip_warmup:
+         logging.info("\nWarmup...")
+         throughput_test_once(
+             backend_name=bench_args.backend,
+             backend=backend,
+             reqs=warmup_requests,
+             ignore_eos=not bench_args.disable_ignore_eos,
+             extra_request_body=extra_request_body,
+         )

+     logging.info("\nBenchmark...")
      result = throughput_test_once(
          backend_name=bench_args.backend,
          backend=backend,
          reqs=input_requests,
          ignore_eos=not bench_args.disable_ignore_eos,
+         extra_request_body=extra_request_body,
      )

      if bench_args.result_filename:
@@ -307,3 +332,6 @@ if __name__ == "__main__":
      )

      throughput_test(server_args, bench_args)
+
+     while bench_args.do_not_exit:
+         pass
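The flags added above compose with the existing benchmark arguments. A hedged invocation sketch; the JSON keys passed to --extra-request-body are illustrative sampling parameters, not values taken from this diff:
python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --num-prompts 10 --skip-warmup --extra-request-body '{"top_p": 0.9, "frequency_penalty": 0.2}'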