sglang 0.2.12__tar.gz → 0.2.14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126)
  1. {sglang-0.2.12/sglang.egg-info → sglang-0.2.14}/PKG-INFO +102 -27
  2. {sglang-0.2.12 → sglang-0.2.14}/README.md +97 -25
  3. {sglang-0.2.12 → sglang-0.2.14}/pyproject.toml +4 -4
  4. {sglang-0.2.12 → sglang-0.2.14}/sglang/api.py +13 -1
  5. {sglang-0.2.12 → sglang-0.2.14}/sglang/bench_latency.py +10 -5
  6. {sglang-0.2.12 → sglang-0.2.14}/sglang/bench_serving.py +50 -26
  7. {sglang-0.2.12 → sglang-0.2.14}/sglang/check_env.py +15 -0
  8. {sglang-0.2.12 → sglang-0.2.14}/sglang/global_config.py +1 -1
  9. {sglang-0.2.12 → sglang-0.2.14}/sglang/lang/backend/runtime_endpoint.py +60 -49
  10. {sglang-0.2.12 → sglang-0.2.14}/sglang/lang/chat_template.py +10 -5
  11. {sglang-0.2.12 → sglang-0.2.14}/sglang/lang/compiler.py +4 -0
  12. {sglang-0.2.12 → sglang-0.2.14}/sglang/lang/interpreter.py +5 -2
  13. {sglang-0.2.12 → sglang-0.2.14}/sglang/lang/ir.py +22 -4
  14. {sglang-0.2.12 → sglang-0.2.14}/sglang/launch_server.py +8 -1
  15. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/constrained/jump_forward.py +13 -2
  16. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/conversation.py +50 -1
  17. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/hf_transformers_utils.py +22 -23
  18. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/layers/activation.py +24 -2
  19. sglang-0.2.14/sglang/srt/layers/decode_attention.py +627 -0
  20. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/layers/extend_attention.py +3 -1
  21. sglang-0.2.14/sglang/srt/layers/fused_moe/__init__.py +1 -0
  22. {sglang-0.2.12/sglang/srt/layers → sglang-0.2.14/sglang/srt/layers/fused_moe}/fused_moe.py +165 -108
  23. sglang-0.2.14/sglang/srt/layers/fused_moe/layer.py +587 -0
  24. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/layers/layernorm.py +3 -0
  25. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/layers/logits_processor.py +64 -27
  26. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/layers/radix_attention.py +41 -18
  27. sglang-0.2.14/sglang/srt/layers/sampler.py +154 -0
  28. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/managers/controller_multi.py +2 -8
  29. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/managers/controller_single.py +7 -10
  30. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/managers/detokenizer_manager.py +20 -9
  31. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/managers/io_struct.py +44 -11
  32. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/managers/policy_scheduler.py +5 -2
  33. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/managers/schedule_batch.py +59 -179
  34. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/managers/tokenizer_manager.py +193 -84
  35. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/managers/tp_worker.py +131 -50
  36. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/mem_cache/memory_pool.py +82 -8
  37. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/mm_utils.py +79 -7
  38. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/model_executor/cuda_graph_runner.py +97 -28
  39. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/model_executor/forward_batch_info.py +188 -82
  40. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/model_executor/model_runner.py +269 -87
  41. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/models/chatglm.py +6 -14
  42. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/models/commandr.py +6 -2
  43. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/models/dbrx.py +5 -1
  44. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/models/deepseek.py +7 -3
  45. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/models/deepseek_v2.py +12 -7
  46. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/models/gemma.py +6 -2
  47. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/models/gemma2.py +22 -8
  48. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/models/gpt_bigcode.py +5 -1
  49. sglang-0.2.14/sglang/srt/models/grok.py +422 -0
  50. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/models/internlm2.py +5 -1
  51. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/models/llama2.py +7 -3
  52. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/models/llama_classification.py +2 -2
  53. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/models/llama_embedding.py +4 -0
  54. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/models/llava.py +176 -59
  55. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/models/minicpm.py +7 -3
  56. sglang-0.2.14/sglang/srt/models/mixtral.py +384 -0
  57. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/models/mixtral_quant.py +6 -5
  58. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/models/qwen.py +7 -4
  59. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/models/qwen2.py +15 -5
  60. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/models/qwen2_moe.py +7 -16
  61. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/models/stablelm.py +6 -2
  62. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/openai_api/adapter.py +149 -58
  63. sglang-0.2.14/sglang/srt/sampling/sampling_batch_info.py +209 -0
  64. {sglang-0.2.12/sglang/srt → sglang-0.2.14/sglang/srt/sampling}/sampling_params.py +18 -4
  65. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/server.py +107 -71
  66. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/server_args.py +49 -15
  67. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/utils.py +27 -18
  68. {sglang-0.2.12 → sglang-0.2.14}/sglang/test/runners.py +38 -38
  69. {sglang-0.2.12 → sglang-0.2.14}/sglang/test/simple_eval_common.py +9 -10
  70. {sglang-0.2.12 → sglang-0.2.14}/sglang/test/simple_eval_gpqa.py +2 -1
  71. {sglang-0.2.12 → sglang-0.2.14}/sglang/test/simple_eval_humaneval.py +2 -2
  72. {sglang-0.2.12 → sglang-0.2.14}/sglang/test/simple_eval_math.py +2 -1
  73. {sglang-0.2.12 → sglang-0.2.14}/sglang/test/simple_eval_mmlu.py +2 -1
  74. sglang-0.2.14/sglang/test/test_activation.py +55 -0
  75. {sglang-0.2.12 → sglang-0.2.14}/sglang/test/test_programs.py +32 -5
  76. {sglang-0.2.12 → sglang-0.2.14}/sglang/test/test_utils.py +37 -50
  77. sglang-0.2.14/sglang/version.py +1 -0
  78. {sglang-0.2.12 → sglang-0.2.14/sglang.egg-info}/PKG-INFO +102 -27
  79. {sglang-0.2.12 → sglang-0.2.14}/sglang.egg-info/SOURCES.txt +7 -5
  80. {sglang-0.2.12 → sglang-0.2.14}/sglang.egg-info/requires.txt +4 -1
  81. sglang-0.2.12/sglang/launch_server_llavavid.py +0 -29
  82. sglang-0.2.12/sglang/srt/layers/decode_attention.py +0 -339
  83. sglang-0.2.12/sglang/srt/model_loader/model_loader.py +0 -292
  84. sglang-0.2.12/sglang/srt/model_loader/utils.py +0 -275
  85. sglang-0.2.12/sglang/srt/models/grok.py +0 -754
  86. sglang-0.2.12/sglang/srt/models/mixtral.py +0 -578
  87. sglang-0.2.12/sglang/version.py +0 -1
  88. {sglang-0.2.12 → sglang-0.2.14}/LICENSE +0 -0
  89. {sglang-0.2.12 → sglang-0.2.14}/setup.cfg +0 -0
  90. {sglang-0.2.12 → sglang-0.2.14}/sglang/__init__.py +0 -0
  91. {sglang-0.2.12 → sglang-0.2.14}/sglang/lang/__init__.py +0 -0
  92. {sglang-0.2.12 → sglang-0.2.14}/sglang/lang/backend/__init__.py +0 -0
  93. {sglang-0.2.12 → sglang-0.2.14}/sglang/lang/backend/anthropic.py +0 -0
  94. {sglang-0.2.12 → sglang-0.2.14}/sglang/lang/backend/base_backend.py +0 -0
  95. {sglang-0.2.12 → sglang-0.2.14}/sglang/lang/backend/litellm.py +0 -0
  96. {sglang-0.2.12 → sglang-0.2.14}/sglang/lang/backend/openai.py +0 -0
  97. {sglang-0.2.12 → sglang-0.2.14}/sglang/lang/backend/vertexai.py +0 -0
  98. {sglang-0.2.12 → sglang-0.2.14}/sglang/lang/choices.py +0 -0
  99. {sglang-0.2.12 → sglang-0.2.14}/sglang/lang/tracer.py +0 -0
  100. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/constrained/__init__.py +0 -0
  101. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/constrained/base_tool_cache.py +0 -0
  102. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/constrained/fsm_cache.py +0 -0
  103. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/layers/pooler.py +0 -0
  104. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/layers/prefill_attention.py +0 -0
  105. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
  106. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/mem_cache/chunk_cache.py +0 -0
  107. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/mem_cache/flush_cache.py +0 -0
  108. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/mem_cache/radix_cache.py +0 -0
  109. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/model_config.py +0 -0
  110. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/models/llavavid.py +0 -0
  111. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/models/mistral.py +0 -0
  112. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/models/yivl.py +0 -0
  113. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/openai_api/protocol.py +0 -0
  114. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
  115. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
  116. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
  117. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
  118. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
  119. {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
  120. {sglang-0.2.12 → sglang-0.2.14}/sglang/test/run_eval.py +0 -0
  121. {sglang-0.2.12 → sglang-0.2.14}/sglang/test/simple_eval_mgsm.py +0 -0
  122. {sglang-0.2.12 → sglang-0.2.14}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
  123. {sglang-0.2.12 → sglang-0.2.14}/sglang/test/test_layernorm.py +0 -0
  124. {sglang-0.2.12 → sglang-0.2.14}/sglang/utils.py +0 -0
  125. {sglang-0.2.12 → sglang-0.2.14}/sglang.egg-info/dependency_links.txt +0 -0
  126. {sglang-0.2.12 → sglang-0.2.14}/sglang.egg-info/top_level.txt +0 -0

{sglang-0.2.12/sglang.egg-info → sglang-0.2.14}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.2.12
+ Version: 0.2.14
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
  Version 2.0, January 2004
@@ -216,6 +216,7 @@ Requires-Dist: tqdm
  Requires-Dist: numpy
  Provides-Extra: srt
  Requires-Dist: aiohttp; extra == "srt"
+ Requires-Dist: decord; extra == "srt"
  Requires-Dist: fastapi; extra == "srt"
  Requires-Dist: hf_transfer; extra == "srt"
  Requires-Dist: huggingface_hub; extra == "srt"
@@ -229,7 +230,7 @@ Requires-Dist: torch; extra == "srt"
  Requires-Dist: uvicorn; extra == "srt"
  Requires-Dist: uvloop; extra == "srt"
  Requires-Dist: zmq; extra == "srt"
- Requires-Dist: vllm==0.5.4; extra == "srt"
+ Requires-Dist: vllm==0.5.5; extra == "srt"
  Requires-Dist: outlines>=0.0.44; extra == "srt"
  Provides-Extra: openai
  Requires-Dist: openai>=1.0; extra == "openai"
@@ -242,6 +243,8 @@ Provides-Extra: test
  Requires-Dist: jsonlines; extra == "test"
  Requires-Dist: matplotlib; extra == "test"
  Requires-Dist: pandas; extra == "test"
+ Requires-Dist: sentence_transformers; extra == "test"
+ Requires-Dist: accelerate; extra == "test"
  Provides-Extra: all
  Requires-Dist: sglang[srt]; extra == "all"
  Requires-Dist: sglang[openai]; extra == "all"
@@ -270,17 +273,18 @@ SGLang is a fast serving framework for large language models and vision language
  It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.

  The core features include:
- - **Fast Backend Runtime**: Efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, flashinfer kernels, and quantization (AWQ/FP8/GPTQ/Marlin).
+ - **Fast Backend Runtime**: Efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, and quantization (AWQ/FP8/GPTQ/Marlin).
  - **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.

  ## News
  - [2024/07] 🔥 Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
- - [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
+ - [2024/08] 🔥 LLaVA-OneVision with single-image, multi-image and video are supported ([blog](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)).
  - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).

  <details>
  <summary>More</summary>

+ - [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
  - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
  - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).

@@ -308,7 +312,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
  ### Method 2: From source
  ```
  # Use the last release branch
- git clone -b v0.2.12 https://github.com/sgl-project/sglang.git
+ git clone -b v0.2.14 https://github.com/sgl-project/sglang.git
  cd sglang

  pip install --upgrade pip
@@ -329,11 +333,63 @@ docker run --gpus all \
      --env "HF_TOKEN=<secret>" \
      --ipc=host \
      lmsysorg/sglang:latest \
-     python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --host 0.0.0.0 --port 30000
+     python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
  ```

+ ### Method 4: Using docker compose
+
+ <details>
+
+ > This method is recommended if you plan to serve it as a service.
+ > A better approach is to use the [k8s-sglang-service.yaml](./docker/k8s-sglang-service.yaml).
+
+ 1. Copy the [compose.yml](./docker/compose.yaml) to your local machine
+ 2. Execute the command `docker compose up -d` in your terminal.
+ </details>
+
+ ### Method 5: Run on Kubernetes or Clouds with SkyPilot
+
+ <details>
+
+ To deploy on Kubernetes or 12+ clouds, you can use [SkyPilot](https://github.com/skypilot-org/skypilot).
+
+ 1. Install SkyPilot and set up Kubernetes cluster or cloud access: see [SkyPilot's documentation](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html).
+ 2. Deploy on your own infra with a single command and get the HTTP API endpoint:
+ <details>
+ <summary>SkyPilot YAML: <code>sglang.yaml</code></summary>
+
+ ```yaml
+ # sglang.yaml
+ envs:
+   HF_TOKEN: null
+
+ resources:
+   image_id: docker:lmsysorg/sglang:latest
+   accelerators: A100
+   ports: 30000
+
+ run: |
+   conda deactivate
+   python3 -m sglang.launch_server \
+     --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
+     --host 0.0.0.0 \
+     --port 30000
+ ```
+ </details>
+
+ ```bash
+ # Deploy on any cloud or Kubernetes cluster. Use --cloud <cloud> to select a specific cloud provider.
+ HF_TOKEN=<secret> sky launch -c sglang --env HF_TOKEN sglang.yaml
+
+ # Get the HTTP API endpoint
+ sky status --endpoint 30000 sglang
+ ```
+ 3. To further scale up your deployment with autoscaling and failure recovery, check out the [SkyServe + SGLang guide](https://github.com/skypilot-org/skypilot/tree/master/llm/sglang#serving-llama-2-with-sglang-for-more-traffic-using-skyserve).
+ </details>
+
+
  ### Common Notes
- - If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
+ - [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is currently one of the dependencies that must be installed for SGLang. If you are using NVIDIA GPU devices below sm80, such as T4, you can't use SGLang for the time being. We expect to resolve this issue soon, so please stay tuned. If you encounter any FlashInfer-related issues on sm80+ devices (e.g., A100, L40S, H100), consider using Triton's kernel by `--disable-flashinfer --disable-flashinfer-sampling` and raise a issue.
  - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.

  ## Backend: SGLang Runtime (SRT)
@@ -387,6 +443,13 @@ response = client.chat.completions.create(
      max_tokens=64,
  )
  print(response)
+
+ # Text embedding
+ response = client.embeddings.create(
+     model="default",
+     input="How are you today",
+ )
+ print(response)
  ```

  It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
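
The hunk above adds a text-embedding request to the OpenAI-compatible client example. As a self-contained illustration (not part of the diff), a minimal sketch is shown below; the base URL, placeholder API key, and launch command are assumptions about a locally running server, not values taken from the package.

```python
import openai

# Assumes an SGLang server is already running locally, e.g. one launched with:
#   python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct --is-embedding --port 30000
client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")

# New in 0.2.14: the OpenAI-compatible embeddings endpoint.
response = client.embeddings.create(
    model="default",
    input="How are you today",
)
print(len(response.data[0].embedding))  # dimensionality of the returned embedding vector
```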
@@ -423,19 +486,21 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct

  ### Supported Models

+ **Generative Models**
+
  - Llama / Llama 2 / Llama 3 / Llama 3.1
  - Mistral / Mixtral / Mistral NeMo
  - Gemma / Gemma 2
  - Qwen / Qwen 2 / Qwen 2 MoE
  - DeepSeek / DeepSeek 2
- - LLaVA 1.5 / 1.6
-   - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
-   - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
-   - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.6-34b --tokenizer-path liuhaotian/llava-v1.6-34b-tokenizer --port 30000`
- - LLaVA-NeXT-Video
-   - see [examples/usage/llava_video](examples/usage/llava_video)
+ - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
+   - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava --chunked-prefill-size=16384`
+   - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
+ - LLaVA 1.5 / 1.6 / NeXT
+   - `python -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --port=30000 --tp-size=1 --chat-template=llava_llama_3`
+   - `python -m sglang.launch_server --model-path lmms-lab/llava-next-72b --port=30000 --tp-size=8 --chat-template=chatml-llava`
+   - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
  - Yi-VL
-   - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
  - StableLM
  - Command-R
  - DBRX
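
The LLaVA entries added above point at the OpenAI Vision API without inlining a request. A minimal sketch of such a query follows for convenience; it is not taken from the package, and the server address, placeholder image URL, and `max_tokens` value are illustrative assumptions.

```python
import openai

# Assumes a vision-capable server launched as in the list above, e.g.:
#   python -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --port=30000 --chat-template=llava_llama_3
client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="default",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image in one sentence."},
                # Placeholder URL; point this at a real, reachable image.
                {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
            ],
        }
    ],
    max_tokens=64,
)
print(response.choices[0].message.content)
```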
@@ -443,34 +508,45 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  - ChatGLM
  - InternLM 2

+ **Embedding Models**
+
+ - e5-mistral
+ - gte-Qwen2
+   - `python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct --is-embedding`
+
  Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).

  #### Use Models From ModelScope
- To use model from [ModelScope](https://www.modelscope.cn), setting environment variable SGLANG_USE_MODELSCOPE.
+ <details>
+
+ To use a model from [ModelScope](https://www.modelscope.cn), set the environment variable SGLANG_USE_MODELSCOPE.
  ```
  export SGLANG_USE_MODELSCOPE=true
  ```
  Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct) Server
  ```
  SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
- ```
+ ```
+
+ </details>

  #### Run Llama 3.1 405B
+ <details>

  ```bash
- ## Run 405B (fp8) on a single node
+ # Run 405B (fp8) on a single node
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8

- ## Run 405B (fp16) on two nodes
- # replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
-
- # on the first node
- GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
+ # Run 405B (fp16) on two nodes
+ ## on the first node, replace the `172.16.4.52:20000` with your own first node ip address and port
+ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph

- # on the second
- GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
+ ## on the first node, replace the `172.16.4.52:20000` with your own first node ip address and port
+ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph
  ```

+ </details>
+
  ### Benchmark Performance

  - Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`. Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle. A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, consider using `sglang.bench_serving`.
@@ -606,7 +682,7 @@ def tip_suggestion(s):
      s += "In summary" + sgl.gen("summary")
  ```

- #### Multi Modality
+ #### Multi-Modality
  Use `sgl.image` to pass an image as input.

  ```python
@@ -660,7 +736,7 @@ def character_gen(s, name):
      s += sgl.gen("json_output", max_tokens=256, regex=character_regex)
  ```

- See also [json_decode.py](examples/usage/json_decode.py) for an additional example on specifying formats with Pydantic models.
+ See also [json_decode.py](examples/usage/json_decode.py) for an additional example of specifying formats with Pydantic models.

  #### Batching
  Use `run_batch` to run a batch of requests with continuous batching.
@@ -722,7 +798,6 @@ def chat_example(s):
  - The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
  - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.

-
  ## Benchmark And Performance
  ![8b_throughput](https://lmsys.org/images/blog/sglang_llama3/8b_throughput.svg)
  ![70b_fp8_throughput](https://lmsys.org/images/blog/sglang_llama3/70b_fp8_throughput.svg)

{sglang-0.2.12 → sglang-0.2.14}/README.md

@@ -17,17 +17,18 @@ SGLang is a fast serving framework for large language models and vision language
  It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.

  The core features include:
- - **Fast Backend Runtime**: Efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, flashinfer kernels, and quantization (AWQ/FP8/GPTQ/Marlin).
+ - **Fast Backend Runtime**: Efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, and quantization (AWQ/FP8/GPTQ/Marlin).
  - **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.

  ## News
  - [2024/07] 🔥 Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
- - [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
+ - [2024/08] 🔥 LLaVA-OneVision with single-image, multi-image and video are supported ([blog](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)).
  - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).

  <details>
  <summary>More</summary>

+ - [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
  - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
  - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).

@@ -55,7 +56,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
  ### Method 2: From source
  ```
  # Use the last release branch
- git clone -b v0.2.12 https://github.com/sgl-project/sglang.git
+ git clone -b v0.2.14 https://github.com/sgl-project/sglang.git
  cd sglang

  pip install --upgrade pip
@@ -76,11 +77,63 @@ docker run --gpus all \
      --env "HF_TOKEN=<secret>" \
      --ipc=host \
      lmsysorg/sglang:latest \
-     python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --host 0.0.0.0 --port 30000
+     python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
  ```

+ ### Method 4: Using docker compose
+
+ <details>
+
+ > This method is recommended if you plan to serve it as a service.
+ > A better approach is to use the [k8s-sglang-service.yaml](./docker/k8s-sglang-service.yaml).
+
+ 1. Copy the [compose.yml](./docker/compose.yaml) to your local machine
+ 2. Execute the command `docker compose up -d` in your terminal.
+ </details>
+
+ ### Method 5: Run on Kubernetes or Clouds with SkyPilot
+
+ <details>
+
+ To deploy on Kubernetes or 12+ clouds, you can use [SkyPilot](https://github.com/skypilot-org/skypilot).
+
+ 1. Install SkyPilot and set up Kubernetes cluster or cloud access: see [SkyPilot's documentation](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html).
+ 2. Deploy on your own infra with a single command and get the HTTP API endpoint:
+ <details>
+ <summary>SkyPilot YAML: <code>sglang.yaml</code></summary>
+
+ ```yaml
+ # sglang.yaml
+ envs:
+   HF_TOKEN: null
+
+ resources:
+   image_id: docker:lmsysorg/sglang:latest
+   accelerators: A100
+   ports: 30000
+
+ run: |
+   conda deactivate
+   python3 -m sglang.launch_server \
+     --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
+     --host 0.0.0.0 \
+     --port 30000
+ ```
+ </details>
+
+ ```bash
+ # Deploy on any cloud or Kubernetes cluster. Use --cloud <cloud> to select a specific cloud provider.
+ HF_TOKEN=<secret> sky launch -c sglang --env HF_TOKEN sglang.yaml
+
+ # Get the HTTP API endpoint
+ sky status --endpoint 30000 sglang
+ ```
+ 3. To further scale up your deployment with autoscaling and failure recovery, check out the [SkyServe + SGLang guide](https://github.com/skypilot-org/skypilot/tree/master/llm/sglang#serving-llama-2-with-sglang-for-more-traffic-using-skyserve).
+ </details>
+
+
  ### Common Notes
- - If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
+ - [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is currently one of the dependencies that must be installed for SGLang. If you are using NVIDIA GPU devices below sm80, such as T4, you can't use SGLang for the time being. We expect to resolve this issue soon, so please stay tuned. If you encounter any FlashInfer-related issues on sm80+ devices (e.g., A100, L40S, H100), consider using Triton's kernel by `--disable-flashinfer --disable-flashinfer-sampling` and raise a issue.
  - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.

  ## Backend: SGLang Runtime (SRT)
@@ -134,6 +187,13 @@ response = client.chat.completions.create(
      max_tokens=64,
  )
  print(response)
+
+ # Text embedding
+ response = client.embeddings.create(
+     model="default",
+     input="How are you today",
+ )
+ print(response)
  ```

  It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
@@ -170,19 +230,21 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct

  ### Supported Models

+ **Generative Models**
+
  - Llama / Llama 2 / Llama 3 / Llama 3.1
  - Mistral / Mixtral / Mistral NeMo
  - Gemma / Gemma 2
  - Qwen / Qwen 2 / Qwen 2 MoE
  - DeepSeek / DeepSeek 2
- - LLaVA 1.5 / 1.6
-   - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
-   - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
-   - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.6-34b --tokenizer-path liuhaotian/llava-v1.6-34b-tokenizer --port 30000`
- - LLaVA-NeXT-Video
-   - see [examples/usage/llava_video](examples/usage/llava_video)
+ - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
+   - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava --chunked-prefill-size=16384`
+   - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
+ - LLaVA 1.5 / 1.6 / NeXT
+   - `python -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --port=30000 --tp-size=1 --chat-template=llava_llama_3`
+   - `python -m sglang.launch_server --model-path lmms-lab/llava-next-72b --port=30000 --tp-size=8 --chat-template=chatml-llava`
+   - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
  - Yi-VL
-   - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
  - StableLM
  - Command-R
  - DBRX
@@ -190,34 +252,45 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  - ChatGLM
  - InternLM 2

+ **Embedding Models**
+
+ - e5-mistral
+ - gte-Qwen2
+   - `python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct --is-embedding`
+
  Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).

  #### Use Models From ModelScope
- To use model from [ModelScope](https://www.modelscope.cn), setting environment variable SGLANG_USE_MODELSCOPE.
+ <details>
+
+ To use a model from [ModelScope](https://www.modelscope.cn), set the environment variable SGLANG_USE_MODELSCOPE.
  ```
  export SGLANG_USE_MODELSCOPE=true
  ```
  Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct) Server
  ```
  SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
- ```
+ ```
+
+ </details>

  #### Run Llama 3.1 405B
+ <details>

  ```bash
- ## Run 405B (fp8) on a single node
+ # Run 405B (fp8) on a single node
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8

- ## Run 405B (fp16) on two nodes
- # replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
-
- # on the first node
- GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
+ # Run 405B (fp16) on two nodes
+ ## on the first node, replace the `172.16.4.52:20000` with your own first node ip address and port
+ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph

- # on the second
- GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
+ ## on the first node, replace the `172.16.4.52:20000` with your own first node ip address and port
+ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph
  ```

+ </details>
+
  ### Benchmark Performance

  - Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`. Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle. A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, consider using `sglang.bench_serving`.
@@ -353,7 +426,7 @@ def tip_suggestion(s):
      s += "In summary" + sgl.gen("summary")
  ```

- #### Multi Modality
+ #### Multi-Modality
  Use `sgl.image` to pass an image as input.

  ```python
@@ -407,7 +480,7 @@ def character_gen(s, name):
      s += sgl.gen("json_output", max_tokens=256, regex=character_regex)
  ```

- See also [json_decode.py](examples/usage/json_decode.py) for an additional example on specifying formats with Pydantic models.
+ See also [json_decode.py](examples/usage/json_decode.py) for an additional example of specifying formats with Pydantic models.

  #### Batching
  Use `run_batch` to run a batch of requests with continuous batching.
@@ -469,7 +542,6 @@ def chat_example(s):
  - The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
  - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.

-
  ## Benchmark And Performance
  ![8b_throughput](https://lmsys.org/images/blog/sglang_llama3/8b_throughput.svg)
  ![70b_fp8_throughput](https://lmsys.org/images/blog/sglang_llama3/70b_fp8_throughput.svg)

{sglang-0.2.12 → sglang-0.2.14}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "sglang"
- version = "0.2.12"
+ version = "0.2.14"
  description = "SGLang is yet another fast serving framework for large language models and vision language models."
  readme = "README.md"
  requires-python = ">=3.8"
@@ -20,14 +20,14 @@ dependencies = [
  ]

  [project.optional-dependencies]
- srt = ["aiohttp", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
+ srt = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
         "packaging", "pillow", "psutil", "pydantic", "python-multipart",
         "torch", "uvicorn", "uvloop", "zmq",
-        "vllm==0.5.4", "outlines>=0.0.44"]
+        "vllm==0.5.5", "outlines>=0.0.44"]
  openai = ["openai>=1.0", "tiktoken"]
  anthropic = ["anthropic>=0.20.0"]
  litellm = ["litellm>=1.0.0"]
- test = ["jsonlines", "matplotlib", "pandas"]
+ test = ["jsonlines", "matplotlib", "pandas", "sentence_transformers", "accelerate"]
  all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
  dev = ["sglang[all]", "sglang[test]"]


{sglang-0.2.12 → sglang-0.2.14}/sglang/api.py

@@ -62,9 +62,11 @@ def gen(
      name: Optional[str] = None,
      max_tokens: Optional[int] = None,
      stop: Optional[Union[str, List[str]]] = None,
+     stop_token_ids: Optional[List[int]] = None,
      temperature: Optional[float] = None,
      top_p: Optional[float] = None,
      top_k: Optional[int] = None,
+     min_p: Optional[float] = None,
      frequency_penalty: Optional[float] = None,
      presence_penalty: Optional[float] = None,
      ignore_eos: Optional[bool] = None,
@@ -72,7 +74,7 @@ def gen(
      logprob_start_len: Optional[int] = None,
      top_logprobs_num: Optional[int] = None,
      return_text_in_logprobs: Optional[bool] = None,
-     dtype: Optional[type] = None,
+     dtype: Optional[Union[type, str]] = None,
      choices: Optional[List[str]] = None,
      choices_method: Optional[ChoicesSamplingMethod] = None,
      regex: Optional[str] = None,
@@ -98,9 +100,11 @@ def gen(
          name,
          max_tokens,
          stop,
+         stop_token_ids,
          temperature,
          top_p,
          top_k,
+         min_p,
          frequency_penalty,
          presence_penalty,
          ignore_eos,
@@ -117,9 +121,11 @@ def gen_int(
      name: Optional[str] = None,
      max_tokens: Optional[int] = None,
      stop: Optional[Union[str, List[str]]] = None,
+     stop_token_ids: Optional[List[int]] = None,
      temperature: Optional[float] = None,
      top_p: Optional[float] = None,
      top_k: Optional[int] = None,
+     min_p: Optional[float] = None,
      frequency_penalty: Optional[float] = None,
      presence_penalty: Optional[float] = None,
      ignore_eos: Optional[bool] = None,
@@ -132,9 +138,11 @@ def gen_int(
          name,
          max_tokens,
          stop,
+         stop_token_ids,
          temperature,
          top_p,
          top_k,
+         min_p,
          frequency_penalty,
          presence_penalty,
          ignore_eos,
@@ -151,9 +159,11 @@ def gen_string(
      name: Optional[str] = None,
      max_tokens: Optional[int] = None,
      stop: Optional[Union[str, List[str]]] = None,
+     stop_token_ids: Optional[List[int]] = None,
      temperature: Optional[float] = None,
      top_p: Optional[float] = None,
      top_k: Optional[int] = None,
+     min_p: Optional[float] = None,
      frequency_penalty: Optional[float] = None,
      presence_penalty: Optional[float] = None,
      ignore_eos: Optional[bool] = None,
@@ -166,9 +176,11 @@ def gen_string(
          name,
          max_tokens,
          stop,
+         stop_token_ids,
          temperature,
          top_p,
          top_k,
+         min_p,
          frequency_penalty,
          presence_penalty,
          ignore_eos,
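
These hunks thread two new sampling arguments, `stop_token_ids` and `min_p`, through `gen`, `gen_int`, and `gen_string`. Below is a minimal sketch of a frontend program passing them; the endpoint URL, prompt, and the specific stop-token id are illustrative assumptions rather than values from the diff.

```python
import sglang as sgl


@sgl.function
def short_answer(s, question):
    s += "Q: " + question + "\n"
    s += "A: " + sgl.gen(
        "answer",
        max_tokens=64,
        temperature=0.7,
        min_p=0.05,               # new in 0.2.14: minimum-probability sampling cutoff
        stop_token_ids=[128009],  # new in 0.2.14; hypothetical id, use your tokenizer's stop token
    )


# Assumes an SGLang server is already running locally on port 30000.
sgl.set_default_backend(sgl.RuntimeEndpoint("http://127.0.0.1:30000"))
state = short_answer.run(question="What is RadixAttention?")
print(state["answer"])
```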

{sglang-0.2.12 → sglang-0.2.14}/sglang/bench_latency.py

@@ -54,7 +54,7 @@ from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
  from sglang.srt.model_config import ModelConfig
  from sglang.srt.model_executor.forward_batch_info import ForwardMode
  from sglang.srt.model_executor.model_runner import ModelRunner
- from sglang.srt.sampling_params import SamplingParams
+ from sglang.srt.sampling.sampling_params import SamplingParams
  from sglang.srt.server_args import ServerArgs
  from sglang.srt.utils import suppress_other_loggers

@@ -64,7 +64,7 @@ class BenchArgs:
      run_name: str = "before"
      batch_size: Tuple[int] = (1,)
      input_len: Tuple[int] = (1024,)
-     output_len: Tuple[int] = (4,)
+     output_len: Tuple[int] = (16,)
      result_filename: str = ""
      correctness_test: bool = False
      # This is only used for correctness test
@@ -111,7 +111,11 @@ def load_model(server_args, tp_rank):
      suppress_other_loggers()
      rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None

-     model_config = ModelConfig(path=server_args.model_path)
+     model_config = ModelConfig(
+         server_args.model_path,
+         server_args.trust_remote_code,
+         context_length=server_args.context_length,
+     )
      model_runner = ModelRunner(
          model_config=model_config,
          mem_fraction_static=server_args.mem_fraction_static,
@@ -195,7 +199,7 @@ def extend(reqs, model_runner):
          token_to_kv_pool=model_runner.token_to_kv_pool,
          tree_cache=None,
      )
-     batch.prepare_for_extend(model_runner.model_config.vocab_size, None)
+     batch.prepare_for_extend(model_runner.model_config.vocab_size)
      output = model_runner.forward(batch, ForwardMode.EXTEND)
      next_token_ids = batch.sample(output.next_token_logits)
      return next_token_ids, output.next_token_logits, batch
@@ -221,6 +225,7 @@ def correctness_test(

      # Prepare inputs
      input_ids, reqs = prepare_inputs_for_correctness_test(bench_args, tokenizer)
+     rank_print(f"{input_ids=}")

      if bench_args.cut_len > 0:
          # Prefill
@@ -349,7 +354,7 @@ def latency_test(
      for bs, il, ol in itertools.product(
          bench_args.batch_size, bench_args.input_len, bench_args.output_len
      ):
-         req = prepare_synthetic_inputs_for_latency_test(bs, il)
+         reqs = prepare_synthetic_inputs_for_latency_test(bs, il)
          ret = latency_test_run_once(
              bench_args.run_name, model_runner, rank_print, reqs, bs, il, ol
          )
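
For reference, the `load_model` hunk above changes how `ModelConfig` is constructed: the model path and `trust_remote_code` are now passed positionally and `context_length` as a keyword, replacing the old `path=` keyword call. A small sketch with illustrative values follows; the model name, flag value, and the behavior of `context_length=None` are assumptions, not taken from the diff.

```python
from sglang.srt.model_config import ModelConfig

# 0.2.12 style (removed in this diff):
#   model_config = ModelConfig(path="meta-llama/Meta-Llama-3.1-8B-Instruct")

# 0.2.14 style, mirroring the call in bench_latency.load_model:
model_config = ModelConfig(
    "meta-llama/Meta-Llama-3.1-8B-Instruct",  # model path, positional
    False,                                    # trust_remote_code, positional
    context_length=None,                      # presumably falls back to the model's own default
)
```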