sglang 0.2.13__tar.gz → 0.2.14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122)
  1. {sglang-0.2.13/sglang.egg-info → sglang-0.2.14}/PKG-INFO +92 -25
  2. {sglang-0.2.13 → sglang-0.2.14}/README.md +87 -23
  3. {sglang-0.2.13 → sglang-0.2.14}/pyproject.toml +4 -4
  4. {sglang-0.2.13 → sglang-0.2.14}/sglang/api.py +6 -0
  5. {sglang-0.2.13 → sglang-0.2.14}/sglang/bench_latency.py +7 -3
  6. {sglang-0.2.13 → sglang-0.2.14}/sglang/bench_serving.py +50 -26
  7. {sglang-0.2.13 → sglang-0.2.14}/sglang/check_env.py +15 -0
  8. {sglang-0.2.13 → sglang-0.2.14}/sglang/lang/chat_template.py +10 -5
  9. {sglang-0.2.13 → sglang-0.2.14}/sglang/lang/compiler.py +4 -0
  10. {sglang-0.2.13 → sglang-0.2.14}/sglang/lang/interpreter.py +1 -0
  11. {sglang-0.2.13 → sglang-0.2.14}/sglang/lang/ir.py +9 -0
  12. {sglang-0.2.13 → sglang-0.2.14}/sglang/launch_server.py +8 -1
  13. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/conversation.py +50 -1
  14. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/hf_transformers_utils.py +22 -23
  15. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/layers/activation.py +24 -1
  16. sglang-0.2.14/sglang/srt/layers/decode_attention.py +627 -0
  17. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/layers/fused_moe/layer.py +2 -2
  18. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/layers/layernorm.py +3 -0
  19. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/layers/logits_processor.py +60 -23
  20. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/layers/radix_attention.py +3 -4
  21. sglang-0.2.14/sglang/srt/layers/sampler.py +154 -0
  22. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/managers/controller_multi.py +2 -8
  23. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/managers/controller_single.py +7 -10
  24. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/managers/detokenizer_manager.py +20 -9
  25. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/managers/io_struct.py +44 -11
  26. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/managers/policy_scheduler.py +5 -2
  27. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/managers/schedule_batch.py +52 -167
  28. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/managers/tokenizer_manager.py +192 -83
  29. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/managers/tp_worker.py +130 -43
  30. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/mem_cache/memory_pool.py +82 -8
  31. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/mm_utils.py +79 -7
  32. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/model_executor/cuda_graph_runner.py +49 -11
  33. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/model_executor/forward_batch_info.py +59 -27
  34. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/model_executor/model_runner.py +210 -61
  35. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/models/chatglm.py +4 -12
  36. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/models/commandr.py +5 -1
  37. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/models/dbrx.py +5 -1
  38. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/models/deepseek.py +5 -1
  39. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/models/deepseek_v2.py +5 -1
  40. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/models/gemma.py +5 -1
  41. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/models/gemma2.py +15 -7
  42. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/models/gpt_bigcode.py +5 -1
  43. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/models/grok.py +16 -2
  44. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/models/internlm2.py +5 -1
  45. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/models/llama2.py +7 -3
  46. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/models/llama_classification.py +2 -2
  47. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/models/llama_embedding.py +4 -0
  48. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/models/llava.py +176 -59
  49. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/models/minicpm.py +5 -1
  50. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/models/mixtral.py +5 -1
  51. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/models/mixtral_quant.py +5 -1
  52. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/models/qwen.py +5 -2
  53. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/models/qwen2.py +13 -3
  54. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/models/qwen2_moe.py +5 -14
  55. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/models/stablelm.py +5 -1
  56. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/openai_api/adapter.py +117 -37
  57. sglang-0.2.14/sglang/srt/sampling/sampling_batch_info.py +209 -0
  58. {sglang-0.2.13/sglang/srt → sglang-0.2.14/sglang/srt/sampling}/sampling_params.py +18 -0
  59. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/server.py +84 -56
  60. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/server_args.py +43 -15
  61. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/utils.py +26 -16
  62. {sglang-0.2.13 → sglang-0.2.14}/sglang/test/runners.py +23 -31
  63. {sglang-0.2.13 → sglang-0.2.14}/sglang/test/simple_eval_common.py +9 -10
  64. {sglang-0.2.13 → sglang-0.2.14}/sglang/test/simple_eval_gpqa.py +2 -1
  65. {sglang-0.2.13 → sglang-0.2.14}/sglang/test/simple_eval_humaneval.py +2 -2
  66. {sglang-0.2.13 → sglang-0.2.14}/sglang/test/simple_eval_math.py +2 -1
  67. {sglang-0.2.13 → sglang-0.2.14}/sglang/test/simple_eval_mmlu.py +2 -1
  68. sglang-0.2.14/sglang/test/test_activation.py +55 -0
  69. {sglang-0.2.13 → sglang-0.2.14}/sglang/test/test_utils.py +36 -53
  70. sglang-0.2.14/sglang/version.py +1 -0
  71. {sglang-0.2.13 → sglang-0.2.14/sglang.egg-info}/PKG-INFO +92 -25
  72. {sglang-0.2.13 → sglang-0.2.14}/sglang.egg-info/SOURCES.txt +4 -2
  73. {sglang-0.2.13 → sglang-0.2.14}/sglang.egg-info/requires.txt +4 -1
  74. sglang-0.2.13/sglang/launch_server_llavavid.py +0 -29
  75. sglang-0.2.13/sglang/srt/layers/decode_attention.py +0 -339
  76. sglang-0.2.13/sglang/version.py +0 -1
  77. {sglang-0.2.13 → sglang-0.2.14}/LICENSE +0 -0
  78. {sglang-0.2.13 → sglang-0.2.14}/setup.cfg +0 -0
  79. {sglang-0.2.13 → sglang-0.2.14}/sglang/__init__.py +0 -0
  80. {sglang-0.2.13 → sglang-0.2.14}/sglang/global_config.py +0 -0
  81. {sglang-0.2.13 → sglang-0.2.14}/sglang/lang/__init__.py +0 -0
  82. {sglang-0.2.13 → sglang-0.2.14}/sglang/lang/backend/__init__.py +0 -0
  83. {sglang-0.2.13 → sglang-0.2.14}/sglang/lang/backend/anthropic.py +0 -0
  84. {sglang-0.2.13 → sglang-0.2.14}/sglang/lang/backend/base_backend.py +0 -0
  85. {sglang-0.2.13 → sglang-0.2.14}/sglang/lang/backend/litellm.py +0 -0
  86. {sglang-0.2.13 → sglang-0.2.14}/sglang/lang/backend/openai.py +0 -0
  87. {sglang-0.2.13 → sglang-0.2.14}/sglang/lang/backend/runtime_endpoint.py +0 -0
  88. {sglang-0.2.13 → sglang-0.2.14}/sglang/lang/backend/vertexai.py +0 -0
  89. {sglang-0.2.13 → sglang-0.2.14}/sglang/lang/choices.py +0 -0
  90. {sglang-0.2.13 → sglang-0.2.14}/sglang/lang/tracer.py +0 -0
  91. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/constrained/__init__.py +0 -0
  92. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/constrained/base_tool_cache.py +0 -0
  93. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/constrained/fsm_cache.py +0 -0
  94. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/constrained/jump_forward.py +0 -0
  95. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/layers/extend_attention.py +0 -0
  96. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/layers/fused_moe/__init__.py +0 -0
  97. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/layers/fused_moe/fused_moe.py +0 -0
  98. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/layers/pooler.py +0 -0
  99. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/layers/prefill_attention.py +0 -0
  100. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
  101. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/mem_cache/chunk_cache.py +0 -0
  102. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/mem_cache/flush_cache.py +0 -0
  103. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/mem_cache/radix_cache.py +0 -0
  104. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/model_config.py +0 -0
  105. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/models/llavavid.py +0 -0
  106. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/models/mistral.py +0 -0
  107. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/models/yivl.py +0 -0
  108. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/openai_api/protocol.py +0 -0
  109. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
  110. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
  111. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
  112. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
  113. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
  114. {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
  115. {sglang-0.2.13 → sglang-0.2.14}/sglang/test/run_eval.py +0 -0
  116. {sglang-0.2.13 → sglang-0.2.14}/sglang/test/simple_eval_mgsm.py +0 -0
  117. {sglang-0.2.13 → sglang-0.2.14}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
  118. {sglang-0.2.13 → sglang-0.2.14}/sglang/test/test_layernorm.py +0 -0
  119. {sglang-0.2.13 → sglang-0.2.14}/sglang/test/test_programs.py +0 -0
  120. {sglang-0.2.13 → sglang-0.2.14}/sglang/utils.py +0 -0
  121. {sglang-0.2.13 → sglang-0.2.14}/sglang.egg-info/dependency_links.txt +0 -0
  122. {sglang-0.2.13 → sglang-0.2.14}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.2.13/sglang.egg-info → sglang-0.2.14}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.2.13
+ Version: 0.2.14
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
                             Version 2.0, January 2004
@@ -216,6 +216,7 @@ Requires-Dist: tqdm
  Requires-Dist: numpy
  Provides-Extra: srt
  Requires-Dist: aiohttp; extra == "srt"
+ Requires-Dist: decord; extra == "srt"
  Requires-Dist: fastapi; extra == "srt"
  Requires-Dist: hf_transfer; extra == "srt"
  Requires-Dist: huggingface_hub; extra == "srt"
@@ -229,7 +230,7 @@ Requires-Dist: torch; extra == "srt"
  Requires-Dist: uvicorn; extra == "srt"
  Requires-Dist: uvloop; extra == "srt"
  Requires-Dist: zmq; extra == "srt"
- Requires-Dist: vllm==0.5.4; extra == "srt"
+ Requires-Dist: vllm==0.5.5; extra == "srt"
  Requires-Dist: outlines>=0.0.44; extra == "srt"
  Provides-Extra: openai
  Requires-Dist: openai>=1.0; extra == "openai"
@@ -242,6 +243,8 @@ Provides-Extra: test
  Requires-Dist: jsonlines; extra == "test"
  Requires-Dist: matplotlib; extra == "test"
  Requires-Dist: pandas; extra == "test"
+ Requires-Dist: sentence_transformers; extra == "test"
+ Requires-Dist: accelerate; extra == "test"
  Provides-Extra: all
  Requires-Dist: sglang[srt]; extra == "all"
  Requires-Dist: sglang[openai]; extra == "all"
@@ -270,17 +273,18 @@ SGLang is a fast serving framework for large language models and vision language
  It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.

  The core features include:
- - **Fast Backend Runtime**: Efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, flashinfer kernels, and quantization (AWQ/FP8/GPTQ/Marlin).
+ - **Fast Backend Runtime**: Efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, and quantization (AWQ/FP8/GPTQ/Marlin).
  - **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.

  ## News
  - [2024/07] 🔥 Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
- - [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
+ - [2024/08] 🔥 LLaVA-OneVision with single-image, multi-image and video are supported ([blog](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)).
  - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).

  <details>
  <summary>More</summary>

+ - [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
  - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
  - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).

@@ -308,7 +312,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
  ### Method 2: From source
  ```
  # Use the last release branch
- git clone -b v0.2.13 https://github.com/sgl-project/sglang.git
+ git clone -b v0.2.14 https://github.com/sgl-project/sglang.git
  cd sglang

  pip install --upgrade pip
@@ -334,11 +338,55 @@ docker run --gpus all \

  ### Method 4: Using docker compose

+ <details>
+
  > This method is recommended if you plan to serve it as a service.
  > A better approach is to use the [k8s-sglang-service.yaml](./docker/k8s-sglang-service.yaml).

  1. Copy the [compose.yml](./docker/compose.yaml) to your local machine
  2. Execute the command `docker compose up -d` in your terminal.
+ </details>
+
+ ### Method 5: Run on Kubernetes or Clouds with SkyPilot
+
+ <details>
+
+ To deploy on Kubernetes or 12+ clouds, you can use [SkyPilot](https://github.com/skypilot-org/skypilot).
+
+ 1. Install SkyPilot and set up Kubernetes cluster or cloud access: see [SkyPilot's documentation](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html).
+ 2. Deploy on your own infra with a single command and get the HTTP API endpoint:
+ <details>
+ <summary>SkyPilot YAML: <code>sglang.yaml</code></summary>
+
+ ```yaml
+ # sglang.yaml
+ envs:
+   HF_TOKEN: null
+
+ resources:
+   image_id: docker:lmsysorg/sglang:latest
+   accelerators: A100
+   ports: 30000
+
+ run: |
+   conda deactivate
+   python3 -m sglang.launch_server \
+     --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
+     --host 0.0.0.0 \
+     --port 30000
+ ```
+ </details>
+
+ ```bash
+ # Deploy on any cloud or Kubernetes cluster. Use --cloud <cloud> to select a specific cloud provider.
+ HF_TOKEN=<secret> sky launch -c sglang --env HF_TOKEN sglang.yaml
+
+ # Get the HTTP API endpoint
+ sky status --endpoint 30000 sglang
+ ```
+ 3. To further scale up your deployment with autoscaling and failure recovery, check out the [SkyServe + SGLang guide](https://github.com/skypilot-org/skypilot/tree/master/llm/sglang#serving-llama-2-with-sglang-for-more-traffic-using-skyserve).
+ </details>
+

  ### Common Notes
  - [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is currently one of the dependencies that must be installed for SGLang. If you are using NVIDIA GPU devices below sm80, such as T4, you can't use SGLang for the time being. We expect to resolve this issue soon, so please stay tuned. If you encounter any FlashInfer-related issues on sm80+ devices (e.g., A100, L40S, H100), consider using Triton's kernel by `--disable-flashinfer --disable-flashinfer-sampling` and raise a issue.
@@ -395,6 +443,13 @@ response = client.chat.completions.create(
      max_tokens=64,
  )
  print(response)
+
+ # Text embedding
+ response = client.embeddings.create(
+     model="default",
+     input="How are you today",
+ )
+ print(response)
  ```

  It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
@@ -431,19 +486,21 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct

  ### Supported Models

+ **Generative Models**
+
  - Llama / Llama 2 / Llama 3 / Llama 3.1
  - Mistral / Mixtral / Mistral NeMo
  - Gemma / Gemma 2
  - Qwen / Qwen 2 / Qwen 2 MoE
  - DeepSeek / DeepSeek 2
- - LLaVA 1.5 / 1.6
-   - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
-   - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
-   - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.6-34b --tokenizer-path liuhaotian/llava-v1.6-34b-tokenizer --port 30000`
- - LLaVA-NeXT-Video
-   - see [examples/usage/llava_video](examples/usage/llava_video)
+ - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
+   - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava --chunked-prefill-size=16384`
+   - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
+ - LLaVA 1.5 / 1.6 / NeXT
+   - `python -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --port=30000 --tp-size=1 --chat-template=llava_llama_3`
+   - `python -m sglang.launch_server --model-path lmms-lab/llava-next-72b --port=30000 --tp-size=8 --chat-template=chatml-llava`
+   - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
  - Yi-VL
-   - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
  - StableLM
  - Command-R
  - DBRX
@@ -451,34 +508,45 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  - ChatGLM
  - InternLM 2

+ **Embedding Models**
+
+ - e5-mistral
+ - gte-Qwen2
+   - `python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct --is-embedding`
+
  Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).

  #### Use Models From ModelScope
- To use model from [ModelScope](https://www.modelscope.cn), setting environment variable SGLANG_USE_MODELSCOPE.
+ <details>
+
+ To use a model from [ModelScope](https://www.modelscope.cn), set the environment variable SGLANG_USE_MODELSCOPE.
  ```
  export SGLANG_USE_MODELSCOPE=true
  ```
  Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct) Server
  ```
  SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
- ```
+ ```
+
+ </details>

  #### Run Llama 3.1 405B
+ <details>

  ```bash
- ## Run 405B (fp8) on a single node
+ # Run 405B (fp8) on a single node
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8

- ## Run 405B (fp16) on two nodes
- # replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
+ # Run 405B (fp16) on two nodes
+ ## on the first node, replace the `172.16.4.52:20000` with your own first node ip address and port
+ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph

- # on the first node
- GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
-
- # on the second
- GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
+ ## on the first node, replace the `172.16.4.52:20000` with your own first node ip address and port
+ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph
  ```

+ </details>
+
  ### Benchmark Performance

  - Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`. Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle. A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, consider using `sglang.bench_serving`.
@@ -614,7 +682,7 @@ def tip_suggestion(s):
      s += "In summary" + sgl.gen("summary")
  ```

- #### Multi Modality
+ #### Multi-Modality
  Use `sgl.image` to pass an image as input.

  ```python
@@ -668,7 +736,7 @@ def character_gen(s, name):
      s += sgl.gen("json_output", max_tokens=256, regex=character_regex)
  ```

- See also [json_decode.py](examples/usage/json_decode.py) for an additional example on specifying formats with Pydantic models.
+ See also [json_decode.py](examples/usage/json_decode.py) for an additional example of specifying formats with Pydantic models.

  #### Batching
  Use `run_batch` to run a batch of requests with continuous batching.
@@ -730,7 +798,6 @@ def chat_example(s):
  - The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
  - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.

-
  ## Benchmark And Performance
  ![8b_throughput](https://lmsys.org/images/blog/sglang_llama3/8b_throughput.svg)
  ![70b_fp8_throughput](https://lmsys.org/images/blog/sglang_llama3/70b_fp8_throughput.svg)
{sglang-0.2.13 → sglang-0.2.14}/README.md

@@ -17,17 +17,18 @@ SGLang is a fast serving framework for large language models and vision language
  It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.

  The core features include:
- - **Fast Backend Runtime**: Efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, flashinfer kernels, and quantization (AWQ/FP8/GPTQ/Marlin).
+ - **Fast Backend Runtime**: Efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, and quantization (AWQ/FP8/GPTQ/Marlin).
  - **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.

  ## News
  - [2024/07] 🔥 Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
- - [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
+ - [2024/08] 🔥 LLaVA-OneVision with single-image, multi-image and video are supported ([blog](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)).
  - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).

  <details>
  <summary>More</summary>

+ - [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
  - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
  - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).

@@ -55,7 +56,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
  ### Method 2: From source
  ```
  # Use the last release branch
- git clone -b v0.2.13 https://github.com/sgl-project/sglang.git
+ git clone -b v0.2.14 https://github.com/sgl-project/sglang.git
  cd sglang

  pip install --upgrade pip
@@ -81,11 +82,55 @@ docker run --gpus all \

  ### Method 4: Using docker compose

+ <details>
+
  > This method is recommended if you plan to serve it as a service.
  > A better approach is to use the [k8s-sglang-service.yaml](./docker/k8s-sglang-service.yaml).

  1. Copy the [compose.yml](./docker/compose.yaml) to your local machine
  2. Execute the command `docker compose up -d` in your terminal.
+ </details>
+
+ ### Method 5: Run on Kubernetes or Clouds with SkyPilot
+
+ <details>
+
+ To deploy on Kubernetes or 12+ clouds, you can use [SkyPilot](https://github.com/skypilot-org/skypilot).
+
+ 1. Install SkyPilot and set up Kubernetes cluster or cloud access: see [SkyPilot's documentation](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html).
+ 2. Deploy on your own infra with a single command and get the HTTP API endpoint:
+ <details>
+ <summary>SkyPilot YAML: <code>sglang.yaml</code></summary>
+
+ ```yaml
+ # sglang.yaml
+ envs:
+   HF_TOKEN: null
+
+ resources:
+   image_id: docker:lmsysorg/sglang:latest
+   accelerators: A100
+   ports: 30000
+
+ run: |
+   conda deactivate
+   python3 -m sglang.launch_server \
+     --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
+     --host 0.0.0.0 \
+     --port 30000
+ ```
+ </details>
+
+ ```bash
+ # Deploy on any cloud or Kubernetes cluster. Use --cloud <cloud> to select a specific cloud provider.
+ HF_TOKEN=<secret> sky launch -c sglang --env HF_TOKEN sglang.yaml
+
+ # Get the HTTP API endpoint
+ sky status --endpoint 30000 sglang
+ ```
+ 3. To further scale up your deployment with autoscaling and failure recovery, check out the [SkyServe + SGLang guide](https://github.com/skypilot-org/skypilot/tree/master/llm/sglang#serving-llama-2-with-sglang-for-more-traffic-using-skyserve).
+ </details>
+

  ### Common Notes
  - [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is currently one of the dependencies that must be installed for SGLang. If you are using NVIDIA GPU devices below sm80, such as T4, you can't use SGLang for the time being. We expect to resolve this issue soon, so please stay tuned. If you encounter any FlashInfer-related issues on sm80+ devices (e.g., A100, L40S, H100), consider using Triton's kernel by `--disable-flashinfer --disable-flashinfer-sampling` and raise a issue.
@@ -142,6 +187,13 @@ response = client.chat.completions.create(
      max_tokens=64,
  )
  print(response)
+
+ # Text embedding
+ response = client.embeddings.create(
+     model="default",
+     input="How are you today",
+ )
+ print(response)
  ```

  It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
@@ -178,19 +230,21 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct

  ### Supported Models

+ **Generative Models**
+
  - Llama / Llama 2 / Llama 3 / Llama 3.1
  - Mistral / Mixtral / Mistral NeMo
  - Gemma / Gemma 2
  - Qwen / Qwen 2 / Qwen 2 MoE
  - DeepSeek / DeepSeek 2
- - LLaVA 1.5 / 1.6
-   - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
-   - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
-   - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.6-34b --tokenizer-path liuhaotian/llava-v1.6-34b-tokenizer --port 30000`
- - LLaVA-NeXT-Video
-   - see [examples/usage/llava_video](examples/usage/llava_video)
+ - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
+   - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava --chunked-prefill-size=16384`
+   - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
+ - LLaVA 1.5 / 1.6 / NeXT
+   - `python -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --port=30000 --tp-size=1 --chat-template=llava_llama_3`
+   - `python -m sglang.launch_server --model-path lmms-lab/llava-next-72b --port=30000 --tp-size=8 --chat-template=chatml-llava`
+   - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
  - Yi-VL
-   - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
  - StableLM
  - Command-R
  - DBRX
@@ -198,34 +252,45 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  - ChatGLM
  - InternLM 2

+ **Embedding Models**
+
+ - e5-mistral
+ - gte-Qwen2
+   - `python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct --is-embedding`
+
  Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).

  #### Use Models From ModelScope
- To use model from [ModelScope](https://www.modelscope.cn), setting environment variable SGLANG_USE_MODELSCOPE.
+ <details>
+
+ To use a model from [ModelScope](https://www.modelscope.cn), set the environment variable SGLANG_USE_MODELSCOPE.
  ```
  export SGLANG_USE_MODELSCOPE=true
  ```
  Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct) Server
  ```
  SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
- ```
+ ```
+
+ </details>

  #### Run Llama 3.1 405B
+ <details>

  ```bash
- ## Run 405B (fp8) on a single node
+ # Run 405B (fp8) on a single node
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8

- ## Run 405B (fp16) on two nodes
- # replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
+ # Run 405B (fp16) on two nodes
+ ## on the first node, replace the `172.16.4.52:20000` with your own first node ip address and port
+ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph

- # on the first node
- GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
-
- # on the second
- GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
+ ## on the first node, replace the `172.16.4.52:20000` with your own first node ip address and port
+ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph
  ```

+ </details>
+
  ### Benchmark Performance

  - Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`. Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle. A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, consider using `sglang.bench_serving`.
@@ -361,7 +426,7 @@ def tip_suggestion(s):
      s += "In summary" + sgl.gen("summary")
  ```

- #### Multi Modality
+ #### Multi-Modality
  Use `sgl.image` to pass an image as input.

  ```python
@@ -415,7 +480,7 @@ def character_gen(s, name):
      s += sgl.gen("json_output", max_tokens=256, regex=character_regex)
  ```

- See also [json_decode.py](examples/usage/json_decode.py) for an additional example on specifying formats with Pydantic models.
+ See also [json_decode.py](examples/usage/json_decode.py) for an additional example of specifying formats with Pydantic models.

  #### Batching
  Use `run_batch` to run a batch of requests with continuous batching.
@@ -477,7 +542,6 @@ def chat_example(s):
  - The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
  - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.

-
  ## Benchmark And Performance
  ![8b_throughput](https://lmsys.org/images/blog/sglang_llama3/8b_throughput.svg)
  ![70b_fp8_throughput](https://lmsys.org/images/blog/sglang_llama3/70b_fp8_throughput.svg)
{sglang-0.2.13 → sglang-0.2.14}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "sglang"
- version = "0.2.13"
+ version = "0.2.14"
  description = "SGLang is yet another fast serving framework for large language models and vision language models."
  readme = "README.md"
  requires-python = ">=3.8"
@@ -20,14 +20,14 @@ dependencies = [
  ]

  [project.optional-dependencies]
- srt = ["aiohttp", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
+ srt = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
         "packaging", "pillow", "psutil", "pydantic", "python-multipart",
         "torch", "uvicorn", "uvloop", "zmq",
-        "vllm==0.5.4", "outlines>=0.0.44"]
+        "vllm==0.5.5", "outlines>=0.0.44"]
  openai = ["openai>=1.0", "tiktoken"]
  anthropic = ["anthropic>=0.20.0"]
  litellm = ["litellm>=1.0.0"]
- test = ["jsonlines", "matplotlib", "pandas"]
+ test = ["jsonlines", "matplotlib", "pandas", "sentence_transformers", "accelerate"]
  all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
  dev = ["sglang[all]", "sglang[test]"]

{sglang-0.2.13 → sglang-0.2.14}/sglang/api.py

@@ -66,6 +66,7 @@ def gen(
      temperature: Optional[float] = None,
      top_p: Optional[float] = None,
      top_k: Optional[int] = None,
+     min_p: Optional[float] = None,
      frequency_penalty: Optional[float] = None,
      presence_penalty: Optional[float] = None,
      ignore_eos: Optional[bool] = None,
@@ -103,6 +104,7 @@ def gen(
          temperature,
          top_p,
          top_k,
+         min_p,
          frequency_penalty,
          presence_penalty,
          ignore_eos,
@@ -123,6 +125,7 @@ def gen_int(
      temperature: Optional[float] = None,
      top_p: Optional[float] = None,
      top_k: Optional[int] = None,
+     min_p: Optional[float] = None,
      frequency_penalty: Optional[float] = None,
      presence_penalty: Optional[float] = None,
      ignore_eos: Optional[bool] = None,
@@ -139,6 +142,7 @@ def gen_int(
          temperature,
          top_p,
          top_k,
+         min_p,
          frequency_penalty,
          presence_penalty,
          ignore_eos,
@@ -159,6 +163,7 @@ def gen_string(
      temperature: Optional[float] = None,
      top_p: Optional[float] = None,
      top_k: Optional[int] = None,
+     min_p: Optional[float] = None,
      frequency_penalty: Optional[float] = None,
      presence_penalty: Optional[float] = None,
      ignore_eos: Optional[bool] = None,
@@ -175,6 +180,7 @@ def gen_string(
          temperature,
          top_p,
          top_k,
+         min_p,
          frequency_penalty,
          presence_penalty,
          ignore_eos,
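Note on the hunks above: they only thread the new `min_p` argument through `gen`, `gen_int`, and `gen_string`; the code that consumes it presumably lives in the new sampling modules added in this release (e.g. `sglang/srt/layers/sampler.py`, `sglang/srt/sampling/sampling_batch_info.py`), which are not shown in this diff. As a rough illustration of what a min-p filter does, here is a generic sketch only, not SGLang's implementation; the keep-if-probability-at-least-`min_p`-times-top-probability rule is the commonly used definition assumed here:

```python
import torch

def min_p_filter(logits: torch.Tensor, min_p: float) -> torch.Tensor:
    """Generic min-p sketch: mask tokens whose probability falls below
    min_p times the probability of the most likely token."""
    probs = torch.softmax(logits, dim=-1)
    top_prob, _ = probs.max(dim=-1, keepdim=True)
    return logits.masked_fill(probs < min_p * top_prob, float("-inf"))

# With min_p=0.1, any token below 10% of the top token's probability is
# removed from consideration before sampling.
filtered = min_p_filter(torch.randn(1, 32000), min_p=0.1)
```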
{sglang-0.2.13 → sglang-0.2.14}/sglang/bench_latency.py

@@ -54,7 +54,7 @@ from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
  from sglang.srt.model_config import ModelConfig
  from sglang.srt.model_executor.forward_batch_info import ForwardMode
  from sglang.srt.model_executor.model_runner import ModelRunner
- from sglang.srt.sampling_params import SamplingParams
+ from sglang.srt.sampling.sampling_params import SamplingParams
  from sglang.srt.server_args import ServerArgs
  from sglang.srt.utils import suppress_other_loggers

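This hunk reflects the move of `sampling_params.py` into the new `sglang/srt/sampling/` package (file 58 in the list above). For downstream scripts that need to import `SamplingParams` on both 0.2.13 and 0.2.14, a small fallback shim is one option; this sketch assumes only the two import paths visible in this diff:

```python
# Prefer the new 0.2.14 module path, fall back to the pre-0.2.14 location.
try:
    from sglang.srt.sampling.sampling_params import SamplingParams
except ImportError:
    from sglang.srt.sampling_params import SamplingParams
```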
@@ -111,7 +111,11 @@ def load_model(server_args, tp_rank):
      suppress_other_loggers()
      rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None

-     model_config = ModelConfig(path=server_args.model_path)
+     model_config = ModelConfig(
+         server_args.model_path,
+         server_args.trust_remote_code,
+         context_length=server_args.context_length,
+     )
      model_runner = ModelRunner(
          model_config=model_config,
          mem_fraction_static=server_args.mem_fraction_static,
@@ -350,7 +354,7 @@ def latency_test(
      for bs, il, ol in itertools.product(
          bench_args.batch_size, bench_args.input_len, bench_args.output_len
      ):
-         req = prepare_synthetic_inputs_for_latency_test(bs, il)
+         reqs = prepare_synthetic_inputs_for_latency_test(bs, il)
          ret = latency_test_run_once(
              bench_args.run_name, model_runner, rank_print, reqs, bs, il, ol
          )
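End to end, the frontend change in `sglang/api.py` means `min_p` can now be passed alongside the existing sampling knobs. A minimal usage sketch, assuming a server is already running locally on the default port and using 0.1 only as an arbitrary example value:

```python
import sglang as sgl

@sgl.function
def short_answer(s, question):
    s += "Q: " + question + "\n"
    # min_p is new in 0.2.14; temperature/top_p/top_k were already supported.
    s += "A: " + sgl.gen("answer", max_tokens=32, temperature=0.7, min_p=0.1)

sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
state = short_answer.run(question="What is SGLang?")
print(state["answer"])
```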