sglang 0.1.22__py3-none-any.whl → 0.1.25__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. sglang/__init__.py +2 -2
  2. sglang/bench_serving.py +243 -25
  3. sglang/global_config.py +3 -2
  4. sglang/lang/interpreter.py +1 -0
  5. sglang/srt/hf_transformers_utils.py +13 -1
  6. sglang/srt/layers/logits_processor.py +4 -5
  7. sglang/srt/layers/radix_attention.py +38 -49
  8. sglang/srt/managers/controller/cuda_graph_runner.py +58 -16
  9. sglang/srt/managers/controller/infer_batch.py +51 -22
  10. sglang/srt/managers/controller/model_runner.py +58 -4
  11. sglang/srt/managers/controller/schedule_heuristic.py +8 -3
  12. sglang/srt/managers/controller/tp_worker.py +9 -11
  13. sglang/srt/memory_pool.py +13 -5
  14. sglang/srt/models/deepseek.py +430 -0
  15. sglang/srt/models/gpt_bigcode.py +282 -0
  16. sglang/srt/models/llama2.py +19 -10
  17. sglang/srt/server.py +26 -1
  18. sglang/srt/server_args.py +12 -6
  19. sglang/srt/utils.py +93 -1
  20. sglang/version.py +1 -0
  21. {sglang-0.1.22.dist-info → sglang-0.1.25.dist-info}/METADATA +10 -6
  22. {sglang-0.1.22.dist-info → sglang-0.1.25.dist-info}/RECORD +25 -36
  23. {sglang-0.1.22.dist-info → sglang-0.1.25.dist-info}/WHEEL +1 -1
  24. sglang/backend/__init__.py +0 -0
  25. sglang/backend/anthropic.py +0 -77
  26. sglang/backend/base_backend.py +0 -80
  27. sglang/backend/litellm.py +0 -90
  28. sglang/backend/openai.py +0 -438
  29. sglang/backend/runtime_endpoint.py +0 -283
  30. sglang/backend/vertexai.py +0 -149
  31. sglang/bench.py +0 -627
  32. sglang/srt/managers/controller/dp_worker.py +0 -113
  33. sglang/srt/openai_api/api_adapter.py +0 -432
  34. sglang/srt/openai_api/openai_api_adapter.py +0 -431
  35. sglang/srt/openai_api/openai_protocol.py +0 -207
  36. sglang/srt/openai_api_adapter.py +0 -411
  37. sglang/srt/openai_protocol.py +0 -207
  38. {sglang-0.1.22.dist-info → sglang-0.1.25.dist-info}/LICENSE +0 -0
  39. {sglang-0.1.22.dist-info → sglang-0.1.25.dist-info}/top_level.txt +0 -0
sglang/bench.py DELETED
@@ -1,627 +0,0 @@
-# Adapted from https://github.com/vllm-project/vllm/blob/6366efc67b0aedd2c1721c14385370e50b297fb3/benchmarks/backend_request_func.py
-# Adapted from https://github.com/vllm-project/vllm/blob/6366efc67b0aedd2c1721c14385370e50b297fb3/benchmarks/benchmark_serving.py
-
-import argparse
-import asyncio
-import json
-import os
-import random
-import resource
-import sys
-import time
-import traceback
-import warnings
-from argparse import ArgumentParser as FlexibleArgumentParser
-from dataclasses import dataclass, field
-from typing import AsyncGenerator, List, Optional, Tuple, Union
-
-import aiohttp
-import numpy as np
-import requests
-from tqdm.asyncio import tqdm
-from transformers import (
-    AutoTokenizer,
-    PreTrainedTokenizer,
-    PreTrainedTokenizerBase,
-    PreTrainedTokenizerFast,
-)
-
-AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
-
-
-@dataclass
-class RequestFuncInput:
-    prompt: str
-    api_url: str
-    prompt_len: int
-    output_len: int
-    model: str
-
-
-@dataclass
-class RequestFuncOutput:
-    generated_text: str = ""
-    success: bool = False
-    latency: float = 0.0
-    ttft: float = 0.0  # Time to first token
-    itl: List[float] = field(default_factory=list)  # List of inter-token latencies
-    prompt_len: int = 0
-    error: str = ""
-
-
-def remove_prefix(text: str, prefix: str) -> str:
-    return text[len(prefix) :] if text.startswith(prefix) else text
-
-
-# set ignore_eos True by default
-async def async_request_openai_completions(
-    request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
-) -> RequestFuncOutput:
-    api_url = request_func_input.api_url
-    assert api_url.endswith(
-        "completions"
-    ), "OpenAI Completions API URL must end with 'completions'."
-
-    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
-        payload = {
-            "model": request_func_input.model,
-            "prompt": request_func_input.prompt,
-            "temperature": 0.0,
-            "best_of": 1,
-            "max_tokens": request_func_input.output_len,
-            "stream": True,
-            "ignore_eos": True,
-        }
-        headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
-
-        output = RequestFuncOutput()
-        output.prompt_len = request_func_input.prompt_len
-
-        generated_text = ""
-        ttft = 0.0
-        st = time.perf_counter()
-        most_recent_timestamp = st
-        try:
-            async with session.post(
-                url=api_url, json=payload, headers=headers
-            ) as response:
-                if response.status == 200:
-                    async for chunk_bytes in response.content:
-                        chunk_bytes = chunk_bytes.strip()
-                        if not chunk_bytes:
-                            continue
-
-                        chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ")
-                        if chunk == "[DONE]":
-                            latency = time.perf_counter() - st
-                        else:
-                            data = json.loads(chunk)
-
-                            # NOTE: Some completion API might have a last
-                            # usage summary response without a token so we
-                            # want to check a token was generated
-                            if data["choices"][0]["text"]:
-                                timestamp = time.perf_counter()
-                                # First token
-                                if ttft == 0.0:
-                                    ttft = time.perf_counter() - st
-                                    output.ttft = ttft
-
-                                # Decoding phase
-                                output.itl.append(timestamp - most_recent_timestamp)
-
-                                most_recent_timestamp = timestamp
-                                generated_text += data["choices"][0]["text"]
-
-                    output.generated_text = generated_text
-                    output.success = True
-                    output.latency = latency
-                else:
-                    output.error = response.reason or ""
-                    output.success = False
-        except Exception:
-            output.success = False
-            exc_info = sys.exc_info()
-            output.error = "".join(traceback.format_exception(*exc_info))
-
-    if pbar:
-        pbar.update(1)
-    return output
-
-
-def get_model(pretrained_model_name_or_path: str) -> str:
-    if os.getenv("SGLANG_USE_MODELSCOPE", "False").lower() == "true":
-        import huggingface_hub.constants
-        from modelscope import snapshot_download
-
-        model_path = snapshot_download(
-            model_id=pretrained_model_name_or_path,
-            local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
-            ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"],
-        )
-
-        return model_path
-    return pretrained_model_name_or_path
-
-
-def get_tokenizer(
-    pretrained_model_name_or_path: str,
-) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
-    if pretrained_model_name_or_path is not None and not os.path.exists(
-        pretrained_model_name_or_path
-    ):
-        pretrained_model_name_or_path = get_model(pretrained_model_name_or_path)
-    return AutoTokenizer.from_pretrained(
-        pretrained_model_name_or_path, trust_remote_code=True
-    )
-
-
-ASYNC_REQUEST_FUNCS = {
-    "sglang": async_request_openai_completions,
-    "vllm": async_request_openai_completions,
-    "lmdeploy": async_request_openai_completions,
-}
-
-
-@dataclass
-class BenchmarkMetrics:
-    completed: int
-    total_input: int
-    total_output: int
-    request_throughput: float
-    input_throughput: float
-    output_throughput: float
-    mean_ttft_ms: float
-    median_ttft_ms: float
-    std_ttft_ms: float
-    p99_ttft_ms: float
-    mean_tpot_ms: float
-    median_tpot_ms: float
-    std_tpot_ms: float
-    p99_tpot_ms: float
-    mean_itl_ms: float
-    median_itl_ms: float
-    std_itl_ms: float
-    p99_itl_ms: float
-
-
-def sample_sharegpt_requests(
-    dataset_path: str,
-    num_requests: int,
-    tokenizer: PreTrainedTokenizerBase,
-    fixed_output_len: Optional[int] = None,
-) -> List[Tuple[str, int, int]]:
-    if fixed_output_len is not None and fixed_output_len < 4:
-        raise ValueError("output_len too small")
-
-    default_dataset_path = "ShareGPT_V3_unfiltered_cleaned_split.json"
-    url = "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json"
-
-    if not os.path.isfile(dataset_path) and not os.path.isfile(default_dataset_path):
-        print(f"Downloading dataset from {url}")
-        try:
-            response = requests.get(url, stream=True)
-            response.raise_for_status()
-
-            total_size = int(response.headers.get("content-length", 0))
-            block_size = 8192
-
-            with open(default_dataset_path, "wb") as f, tqdm(
-                desc="Downloading",
-                total=total_size,
-                unit="iB",
-                unit_scale=True,
-                unit_divisor=1024,
-            ) as progress_bar:
-                for data in response.iter_content(block_size):
-                    size = f.write(data)
-                    progress_bar.update(size)
-
-            print(f"Dataset downloaded and saved to {default_dataset_path}")
-            dataset_path = default_dataset_path
-        except requests.RequestException as e:
-            raise Exception(f"Failed to download dataset: {e}")
-    else:
-        dataset_path = (
-            dataset_path if os.path.isfile(dataset_path) else default_dataset_path
-        )
-
-    # Load the dataset.
-    with open(dataset_path) as f:
-        dataset = json.load(f)
-    # Filter out the conversations with less than 2 turns.
-    dataset = [data for data in dataset if len(data["conversations"]) >= 2]
-    # Only keep the first two turns of each conversation.
-    dataset = [
-        (data["conversations"][0]["value"], data["conversations"][1]["value"])
-        for data in dataset
-    ]
-
-    # Shuffle the dataset.
-    random.shuffle(dataset)
-
-    # Filter out sequences that are too long or too short
-    filtered_dataset: List[Tuple[str, int, int]] = []
-    for i in range(len(dataset)):
-        if len(filtered_dataset) == num_requests:
-            break
-
-        # Tokenize the prompts and completions.
-        prompt = dataset[i][0]
-        prompt_token_ids = tokenizer(prompt).input_ids
-        completion = dataset[i][1]
-        completion_token_ids = tokenizer(completion).input_ids
-        prompt_len = len(prompt_token_ids)
-        output_len = (
-            len(completion_token_ids) if fixed_output_len is None else fixed_output_len
-        )
-        if prompt_len < 4 or output_len < 4:
-            # Prune too short sequences.
-            continue
-        if prompt_len > 1024 or prompt_len + output_len > 2048:
-            # Prune too long sequences.
-            continue
-        filtered_dataset.append((prompt, prompt_len, output_len))
-
-    return filtered_dataset
-
-
-async def get_request(
-    input_requests: List[Tuple[str, int, int]],
-    request_rate: float,
-) -> AsyncGenerator[Tuple[str, int, int], None]:
-    input_requests = iter(input_requests)
-    for request in input_requests:
-        yield request
-
-        if request_rate == float("inf"):
-            # If the request rate is infinity, then we don't need to wait.
-            continue
-
-        # Sample the request interval from the exponential distribution.
-        interval = np.random.exponential(1.0 / request_rate)
-        # The next request will be sent after the interval.
-        await asyncio.sleep(interval)
-
-
-def calculate_metrics(
-    input_requests: List[Tuple[str, int, int]],
-    outputs: List[RequestFuncOutput],
-    dur_s: float,
-    tokenizer: PreTrainedTokenizerBase,
-) -> Tuple[BenchmarkMetrics, List[int]]:
-    actual_output_lens: List[int] = []
-    total_input = 0
-    completed = 0
-    itls: List[float] = []
-    tpots: List[float] = []
-    ttfts: List[float] = []
-    for i in range(len(outputs)):
-        if outputs[i].success:
-            # We use the tokenizer to count the number of output tokens for all
-            # serving backends instead of looking at len(outputs[i].itl) since
-            # multiple output tokens may be bundled together
-            # Note : this may inflate the output token count slightly
-            output_len = len(
-                tokenizer(outputs[i].generated_text, add_special_tokens=False).input_ids
-            )
-            actual_output_lens.append(output_len)
-            total_input += input_requests[i][1]
-            if output_len > 1:
-                tpots.append((outputs[i].latency - outputs[i].ttft) / (output_len - 1))
-            itls += outputs[i].itl
-            ttfts.append(outputs[i].ttft)
-            completed += 1
-        else:
-            actual_output_lens.append(0)
-
-    if completed == 0:
-        warnings.warn(
-            "All requests failed. This is likely due to a misconfiguration "
-            "on the benchmark arguments.",
-            stacklevel=2,
-        )
-    metrics = BenchmarkMetrics(
-        completed=completed,
-        total_input=total_input,
-        total_output=sum(actual_output_lens),
-        request_throughput=completed / dur_s,
-        input_throughput=total_input / dur_s,
-        output_throughput=sum(actual_output_lens) / dur_s,
-        mean_ttft_ms=np.mean(ttfts or 0)
-        * 1000,  # ttfts is empty if streaming is not supported by backend
-        median_ttft_ms=np.median(ttfts or 0) * 1000,
-        std_ttft_ms=np.std(ttfts or 0) * 1000,
-        p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000,
-        mean_tpot_ms=np.mean(tpots or 0) * 1000,
-        median_tpot_ms=np.median(tpots or 0) * 1000,
-        std_tpot_ms=np.std(tpots or 0) * 1000,
-        p99_tpot_ms=np.percentile(tpots or 0, 99) * 1000,
-        mean_itl_ms=np.mean(itls or 0) * 1000,
-        median_itl_ms=np.median(itls or 0) * 1000,
-        std_itl_ms=np.std(itls or 0) * 1000,
-        p99_itl_ms=np.percentile(itls or 0, 99) * 1000,
-    )
-
-    return metrics, actual_output_lens
-
-
-async def benchmark(
-    backend: str,
-    api_url: str,
-    model_id: str,
-    tokenizer: PreTrainedTokenizerBase,
-    input_requests: List[Tuple[str, int, int]],
-    request_rate: float,
-    disable_tqdm: bool,
-):
-    if backend in ASYNC_REQUEST_FUNCS:
-        request_func = ASYNC_REQUEST_FUNCS[backend]
-    else:
-        raise ValueError(f"Unknown backend: {backend}")
-
-    print("Starting initial single prompt test run...")
-    test_prompt, test_prompt_len, test_output_len = input_requests[0]
-    test_input = RequestFuncInput(
-        model=model_id,
-        prompt=test_prompt,
-        api_url=api_url,
-        prompt_len=test_prompt_len,
-        output_len=test_output_len,
-    )
-    test_output = await request_func(request_func_input=test_input)
-    if not test_output.success:
-        raise ValueError(
-            "Initial test run failed - Please make sure benchmark arguments "
-            f"are correctly specified. Error: {test_output.error}"
-        )
-    else:
-        print("Initial test run completed. Starting main benchmark run...")
-
-    pbar = None if disable_tqdm else tqdm(total=len(input_requests))
-
-    benchmark_start_time = time.perf_counter()
-    tasks: List[asyncio.Task] = []
-    async for request in get_request(input_requests, request_rate):
-        prompt, prompt_len, output_len = request
-        request_func_input = RequestFuncInput(
-            model=model_id,
-            prompt=prompt,
-            api_url=api_url,
-            prompt_len=prompt_len,
-            output_len=output_len,
-        )
-        tasks.append(
-            asyncio.create_task(
-                request_func(request_func_input=request_func_input, pbar=pbar)
-            )
-        )
-    outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
-
-    if pbar is not None:
-        pbar.close()
-
-    benchmark_duration = time.perf_counter() - benchmark_start_time
-
-    metrics, actual_output_lens = calculate_metrics(
-        input_requests=input_requests,
-        outputs=outputs,
-        dur_s=benchmark_duration,
-        tokenizer=tokenizer,
-    )
-
-    print("\n{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
-    print("{:<40} {:<10}".format("Traffic request rate:", request_rate))
-    print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
-    print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
-    print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
-    print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
-    print(
-        "{:<40} {:<10.2f}".format(
-            "Request throughput (req/s):", metrics.request_throughput
-        )
-    )
-    print(
-        "{:<40} {:<10.2f}".format(
-            "Input token throughput (tok/s):", metrics.input_throughput
-        )
-    )
-    print(
-        "{:<40} {:<10.2f}".format(
-            "Output token throughput (tok/s):", metrics.output_throughput
-        )
-    )
-    print("{s:{c}^{n}}".format(s="Time to First Token", n=50, c="-"))
-    print("{:<40} {:<10.2f}".format("Mean TTFT (ms):", metrics.mean_ttft_ms))
-    print("{:<40} {:<10.2f}".format("Median TTFT (ms):", metrics.median_ttft_ms))
-    print("{:<40} {:<10.2f}".format("P99 TTFT (ms):", metrics.p99_ttft_ms))
-    print(
-        "{s:{c}^{n}}".format(s="Time per Output Token (excl. 1st token)", n=50, c="-")
-    )
-    print("{:<40} {:<10.2f}".format("Mean TPOT (ms):", metrics.mean_tpot_ms))
-    print("{:<40} {:<10.2f}".format("Median TPOT (ms):", metrics.median_tpot_ms))
-    print("{:<40} {:<10.2f}".format("P99 TPOT (ms):", metrics.p99_tpot_ms))
-    print("{s:{c}^{n}}".format(s="Inter-token Latency", n=50, c="-"))
-    print("{:<40} {:<10.2f}".format("Mean ITL (ms):", metrics.mean_itl_ms))
-    print("{:<40} {:<10.2f}".format("Median ITL (ms):", metrics.median_itl_ms))
-    print("{:<40} {:<10.2f}".format("P99 ITL (ms):", metrics.p99_itl_ms))
-    print("=" * 50)
-
-    result = {
-        "duration": benchmark_duration,
-        "completed": metrics.completed,
-        "total_input_tokens": metrics.total_input,
-        "total_output_tokens": metrics.total_output,
-        "request_throughput": metrics.request_throughput,
-        "input_throughput": metrics.input_throughput,
-        "output_throughput": metrics.output_throughput,
-        "mean_ttft_ms": metrics.mean_ttft_ms,
-        "median_ttft_ms": metrics.median_ttft_ms,
-        "std_ttft_ms": metrics.std_ttft_ms,
-        "p99_ttft_ms": metrics.p99_ttft_ms,
-        "mean_tpot_ms": metrics.mean_tpot_ms,
-        "median_tpot_ms": metrics.median_tpot_ms,
-        "std_tpot_ms": metrics.std_tpot_ms,
-        "p99_tpot_ms": metrics.p99_tpot_ms,
-        "mean_itl_ms": metrics.mean_itl_ms,
-        "median_itl_ms": metrics.median_itl_ms,
-        "std_itl_ms": metrics.std_itl_ms,
-        "p99_itl_ms": metrics.p99_itl_ms,
-        "input_lens": [output.prompt_len for output in outputs],
-        "output_lens": actual_output_lens,
-        "ttfts": [output.ttft for output in outputs],
-        "itls": [output.itl for output in outputs],
-        "generated_texts": [output.generated_text for output in outputs],
-        "errors": [output.error for output in outputs],
-    }
-    return result
-
-
-def fire(args: argparse.Namespace):
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-
-    if args.port is None:
-        args.port = {
-            "sglang": 30000,
-            "lmdeploy": 23333,
-            "vllm": 8000,
-        }.get(args.backend, 30000)
-
-    api_url = (
-        f"{args.base_url}/v1/completions"
-        if args.base_url
-        else f"http://{args.host}:{args.port}/v1/completions"
-    )
-    model_url = (
-        f"{args.base_url}/v1/models"
-        if args.base_url
-        else f"http://{args.host}:{args.port}/v1/models"
-    )
-
-    if args.model is None:
-        try:
-            response = requests.get(model_url)
-            model_list = response.json().get("data", [])
-            args.model = model_list[0]["id"] if model_list else None
-        except Exception as e:
-            print(f"Failed to fetch model from {model_url}. Error: {e}")
-            print(
-                "Please specify the correct host and port using `--host` and `--port`."
-            )
-            sys.exit(1)
-
-    if args.model is None:
-        print("No model specified or found. Please provide a model using `--model`.")
-        sys.exit(1)
-
-    print(f"{args}\n")
-
-    backend = args.backend
-    model_id = args.model
-    tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
-
-    tokenizer = get_tokenizer(tokenizer_id)
-
-    assert args.dataset is not None
-    input_requests = sample_sharegpt_requests(
-        dataset_path=args.dataset,
-        num_requests=args.num_prompts,
-        tokenizer=tokenizer,
-        fixed_output_len=args.sharegpt_output_len,
-    )
-
-    asyncio.run(
-        benchmark(
-            backend=backend,
-            api_url=api_url,
-            model_id=model_id,
-            tokenizer=tokenizer,
-            input_requests=input_requests,
-            request_rate=args.request_rate,
-            disable_tqdm=args.disable_tqdm,
-        )
-    )
-
-
-# to avoid relying on SGLang's components
-def set_ulimit(target_soft_limit=65535):
-    resource_type = resource.RLIMIT_NOFILE
-    current_soft, current_hard = resource.getrlimit(resource_type)
-
-    if current_soft < target_soft_limit:
-        try:
-            resource.setrlimit(resource_type, (target_soft_limit, current_hard))
-        except ValueError as e:
-            print(f"Fail to set RLIMIT_NOFILE: {e}")
-
-
-if __name__ == "__main__":
-    parser = FlexibleArgumentParser(
-        description="Benchmark the online serving throughput."
-    )
-    parser.add_argument(
-        "--backend",
-        type=str,
-        required=True,
-        choices=list(ASYNC_REQUEST_FUNCS.keys()),
-        help="Must specify a backend, depending on the LLM Inference Engine.",
-    )
-    parser.add_argument(
-        "--base-url",
-        type=str,
-        default=None,
-        help="Server or API base url if not using http host and port.",
-    )
-    parser.add_argument(
-        "--host", type=str, default="0.0.0.0", help="Default host is 0.0.0.0."
-    )
-    parser.add_argument(
-        "--port",
-        type=int,
-        help="If not set, the default port is configured according to its default value for different LLM Inference Engines.",
-    )
-    parser.add_argument(
-        "--dataset", type=str, default="sharegpt", help="Path to the ShareGPT dataset"
-    )
-    parser.add_argument(
-        "--model",
-        type=str,
-        help="Name or path of the model. If not set, the default model will request /v1/models for conf.",
-    )
-    parser.add_argument(
-        "--tokenizer",
-        type=str,
-        help="Name or path of the tokenizer. If not set, using the model conf.",
-    )
-    parser.add_argument(
-        "--num-prompts",
-        type=int,
-        default=1000,
-        help="Number of prompts to process. Default is 1000.",
-    )
-    parser.add_argument(
-        "--sharegpt-output-len",
-        type=int,
-        default=None,
-        help="Output length for each request. Overrides the output length from the ShareGPT dataset.",
-    )
-    parser.add_argument(
-        "--request-rate",
-        type=float,
-        default=128.0,
-        help="Number of requests per second. If this is inf, then all the requests are sent at time 0. "
-        "Otherwise, we use Poisson process to synthesize the request arrival times. Default is 128.0.",
-    )
-    parser.add_argument("--seed", type=int, default=0, help="Default is 0.")
-    parser.add_argument(
-        "--disable-tqdm",
-        action="store_true",
-        help="Specify to disable tqdm progress bar.",
-    )
-
-    set_ulimit()
-
-    args = parser.parse_args()
-    fire(args)
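
For reference, the deleted script was a standalone entry point (roughly `python -m sglang.bench --backend sglang --dataset ShareGPT_V3_unfiltered_cleaned_split.json`, per the argparse block above), and its helpers could also be driven programmatically. The sketch below is not part of the package; it targets sglang 0.1.22 (the module no longer exists in 0.1.25) and uses a placeholder model id plus an assumed local server on the default sglang port 30000.

import asyncio

from sglang.bench import benchmark, get_tokenizer, sample_sharegpt_requests

model_id = "meta-llama/Llama-2-7b-chat-hf"  # placeholder; any model id served by the endpoint works
tokenizer = get_tokenizer(model_id)

# The ShareGPT file is downloaded automatically if it is not present locally.
input_requests = sample_sharegpt_requests(
    dataset_path="ShareGPT_V3_unfiltered_cleaned_split.json",
    num_requests=100,
    tokenizer=tokenizer,
)

asyncio.run(
    benchmark(
        backend="sglang",
        api_url="http://127.0.0.1:30000/v1/completions",  # assumed default sglang port
        model_id=model_id,
        tokenizer=tokenizer,
        input_requests=input_requests,
        request_rate=16.0,
        disable_tqdm=False,
    )
)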