sglang 0.2.13__tar.gz → 0.2.14.post1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (123)
  1. {sglang-0.2.13/sglang.egg-info → sglang-0.2.14.post1}/PKG-INFO +100 -27
  2. {sglang-0.2.13 → sglang-0.2.14.post1}/README.md +95 -25
  3. {sglang-0.2.13 → sglang-0.2.14.post1}/pyproject.toml +4 -4
  4. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/api.py +6 -0
  5. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/bench_latency.py +7 -3
  6. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/bench_serving.py +50 -26
  7. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/check_env.py +15 -0
  8. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/lang/chat_template.py +10 -5
  9. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/lang/compiler.py +4 -0
  10. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/lang/interpreter.py +1 -0
  11. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/lang/ir.py +9 -0
  12. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/launch_server.py +8 -1
  13. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/constrained/fsm_cache.py +11 -2
  14. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/constrained/jump_forward.py +1 -0
  15. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/conversation.py +50 -1
  16. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/hf_transformers_utils.py +22 -23
  17. sglang-0.2.14.post1/sglang/srt/layers/activation.py +131 -0
  18. sglang-0.2.14.post1/sglang/srt/layers/decode_attention.py +627 -0
  19. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/layers/fused_moe/layer.py +2 -2
  20. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/layers/logits_processor.py +56 -19
  21. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/layers/radix_attention.py +3 -4
  22. sglang-0.2.14.post1/sglang/srt/layers/sampler.py +101 -0
  23. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/managers/controller_multi.py +2 -8
  24. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/managers/controller_single.py +7 -10
  25. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/managers/detokenizer_manager.py +20 -9
  26. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/managers/io_struct.py +44 -11
  27. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/managers/policy_scheduler.py +5 -2
  28. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/managers/schedule_batch.py +46 -166
  29. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/managers/tokenizer_manager.py +192 -83
  30. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/managers/tp_worker.py +118 -24
  31. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/mem_cache/memory_pool.py +82 -8
  32. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/mm_utils.py +79 -7
  33. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/model_executor/cuda_graph_runner.py +32 -8
  34. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/model_executor/forward_batch_info.py +51 -26
  35. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/model_executor/model_runner.py +201 -58
  36. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/models/gemma2.py +10 -6
  37. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/models/gpt_bigcode.py +1 -1
  38. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/models/grok.py +11 -1
  39. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/models/llama_embedding.py +4 -0
  40. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/models/llava.py +176 -59
  41. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/models/qwen2.py +9 -3
  42. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/openai_api/adapter.py +200 -39
  43. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/openai_api/protocol.py +2 -0
  44. sglang-0.2.14.post1/sglang/srt/sampling/sampling_batch_info.py +136 -0
  45. {sglang-0.2.13/sglang/srt → sglang-0.2.14.post1/sglang/srt/sampling}/sampling_params.py +22 -0
  46. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/server.py +92 -57
  47. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/server_args.py +43 -15
  48. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/utils.py +26 -16
  49. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/test/runners.py +22 -30
  50. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/test/simple_eval_common.py +9 -10
  51. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/test/simple_eval_gpqa.py +2 -1
  52. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/test/simple_eval_humaneval.py +2 -2
  53. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/test/simple_eval_math.py +2 -1
  54. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/test/simple_eval_mmlu.py +2 -1
  55. sglang-0.2.14.post1/sglang/test/test_activation.py +55 -0
  56. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/test/test_utils.py +36 -53
  57. sglang-0.2.14.post1/sglang/version.py +1 -0
  58. {sglang-0.2.13 → sglang-0.2.14.post1/sglang.egg-info}/PKG-INFO +100 -27
  59. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang.egg-info/SOURCES.txt +4 -2
  60. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang.egg-info/requires.txt +4 -1
  61. sglang-0.2.13/sglang/launch_server_llavavid.py +0 -29
  62. sglang-0.2.13/sglang/srt/layers/activation.py +0 -32
  63. sglang-0.2.13/sglang/srt/layers/decode_attention.py +0 -339
  64. sglang-0.2.13/sglang/version.py +0 -1
  65. {sglang-0.2.13 → sglang-0.2.14.post1}/LICENSE +0 -0
  66. {sglang-0.2.13 → sglang-0.2.14.post1}/setup.cfg +0 -0
  67. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/__init__.py +0 -0
  68. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/global_config.py +0 -0
  69. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/lang/__init__.py +0 -0
  70. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/lang/backend/__init__.py +0 -0
  71. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/lang/backend/anthropic.py +0 -0
  72. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/lang/backend/base_backend.py +0 -0
  73. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/lang/backend/litellm.py +0 -0
  74. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/lang/backend/openai.py +0 -0
  75. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/lang/backend/runtime_endpoint.py +0 -0
  76. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/lang/backend/vertexai.py +0 -0
  77. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/lang/choices.py +0 -0
  78. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/lang/tracer.py +0 -0
  79. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/constrained/__init__.py +0 -0
  80. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/constrained/base_tool_cache.py +0 -0
  81. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/layers/extend_attention.py +0 -0
  82. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/layers/fused_moe/__init__.py +0 -0
  83. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/layers/fused_moe/fused_moe.py +0 -0
  84. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/layers/layernorm.py +0 -0
  85. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/layers/pooler.py +0 -0
  86. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/layers/prefill_attention.py +0 -0
  87. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
  88. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/mem_cache/chunk_cache.py +0 -0
  89. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/mem_cache/flush_cache.py +0 -0
  90. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/mem_cache/radix_cache.py +0 -0
  91. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/model_config.py +0 -0
  92. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/models/chatglm.py +0 -0
  93. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/models/commandr.py +0 -0
  94. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/models/dbrx.py +0 -0
  95. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/models/deepseek.py +0 -0
  96. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/models/deepseek_v2.py +0 -0
  97. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/models/gemma.py +0 -0
  98. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/models/internlm2.py +0 -0
  99. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/models/llama2.py +0 -0
  100. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/models/llama_classification.py +0 -0
  101. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/models/llavavid.py +0 -0
  102. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/models/minicpm.py +0 -0
  103. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/models/mistral.py +0 -0
  104. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/models/mixtral.py +0 -0
  105. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/models/mixtral_quant.py +0 -0
  106. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/models/qwen.py +0 -0
  107. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/models/qwen2_moe.py +0 -0
  108. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/models/stablelm.py +0 -0
  109. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/models/yivl.py +0 -0
  110. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
  111. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
  112. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
  113. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
  114. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
  115. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
  116. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/test/run_eval.py +0 -0
  117. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/test/simple_eval_mgsm.py +0 -0
  118. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
  119. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/test/test_layernorm.py +0 -0
  120. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/test/test_programs.py +0 -0
  121. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/utils.py +0 -0
  122. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang.egg-info/dependency_links.txt +0 -0
  123. {sglang-0.2.13 → sglang-0.2.14.post1}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.2.13/sglang.egg-info → sglang-0.2.14.post1}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.2.13
+ Version: 0.2.14.post1
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
  Version 2.0, January 2004
@@ -216,6 +216,7 @@ Requires-Dist: tqdm
  Requires-Dist: numpy
  Provides-Extra: srt
  Requires-Dist: aiohttp; extra == "srt"
+ Requires-Dist: decord; extra == "srt"
  Requires-Dist: fastapi; extra == "srt"
  Requires-Dist: hf_transfer; extra == "srt"
  Requires-Dist: huggingface_hub; extra == "srt"
@@ -229,7 +230,7 @@ Requires-Dist: torch; extra == "srt"
  Requires-Dist: uvicorn; extra == "srt"
  Requires-Dist: uvloop; extra == "srt"
  Requires-Dist: zmq; extra == "srt"
- Requires-Dist: vllm==0.5.4; extra == "srt"
+ Requires-Dist: vllm==0.5.5; extra == "srt"
  Requires-Dist: outlines>=0.0.44; extra == "srt"
  Provides-Extra: openai
  Requires-Dist: openai>=1.0; extra == "openai"
@@ -242,6 +243,8 @@ Provides-Extra: test
  Requires-Dist: jsonlines; extra == "test"
  Requires-Dist: matplotlib; extra == "test"
  Requires-Dist: pandas; extra == "test"
+ Requires-Dist: sentence_transformers; extra == "test"
+ Requires-Dist: accelerate; extra == "test"
  Provides-Extra: all
  Requires-Dist: sglang[srt]; extra == "all"
  Requires-Dist: sglang[openai]; extra == "all"
@@ -270,17 +273,18 @@ SGLang is a fast serving framework for large language models and vision language
  It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.

  The core features include:
- - **Fast Backend Runtime**: Efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, flashinfer kernels, and quantization (AWQ/FP8/GPTQ/Marlin).
+ - **Fast Backend Runtime**: Efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, and quantization (AWQ/FP8/GPTQ/Marlin).
  - **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.

  ## News
  - [2024/07] 🔥 Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
- - [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
+ - [2024/08] 🔥 LLaVA-OneVision with single-image, multi-image and video are supported ([blog](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)).
  - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).

  <details>
  <summary>More</summary>

+ - [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
  - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
  - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).

@@ -308,7 +312,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
  ### Method 2: From source
  ```
  # Use the last release branch
- git clone -b v0.2.13 https://github.com/sgl-project/sglang.git
+ git clone -b v0.2.14.post1 https://github.com/sgl-project/sglang.git
  cd sglang

  pip install --upgrade pip
@@ -334,14 +338,60 @@ docker run --gpus all \

  ### Method 4: Using docker compose

+ <details>
+ <summary>More</summary>
+
  > This method is recommended if you plan to serve it as a service.
  > A better approach is to use the [k8s-sglang-service.yaml](./docker/k8s-sglang-service.yaml).

  1. Copy the [compose.yml](./docker/compose.yaml) to your local machine
  2. Execute the command `docker compose up -d` in your terminal.
+ </details>
+
+ ### Method 5: Run on Kubernetes or Clouds with SkyPilot
+
+ <details>
+ <summary>More</summary>
+
+ To deploy on Kubernetes or 12+ clouds, you can use [SkyPilot](https://github.com/skypilot-org/skypilot).
+
+ 1. Install SkyPilot and set up Kubernetes cluster or cloud access: see [SkyPilot's documentation](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html).
+ 2. Deploy on your own infra with a single command and get the HTTP API endpoint:
+ <details>
+ <summary>SkyPilot YAML: <code>sglang.yaml</code></summary>
+
+ ```yaml
+ # sglang.yaml
+ envs:
+   HF_TOKEN: null
+
+ resources:
+   image_id: docker:lmsysorg/sglang:latest
+   accelerators: A100
+   ports: 30000
+
+ run: |
+   conda deactivate
+   python3 -m sglang.launch_server \
+     --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
+     --host 0.0.0.0 \
+     --port 30000
+ ```
+ </details>
+
+ ```bash
+ # Deploy on any cloud or Kubernetes cluster. Use --cloud <cloud> to select a specific cloud provider.
+ HF_TOKEN=<secret> sky launch -c sglang --env HF_TOKEN sglang.yaml
+
+ # Get the HTTP API endpoint
+ sky status --endpoint 30000 sglang
+ ```
+ 3. To further scale up your deployment with autoscaling and failure recovery, check out the [SkyServe + SGLang guide](https://github.com/skypilot-org/skypilot/tree/master/llm/sglang#serving-llama-2-with-sglang-for-more-traffic-using-skyserve).
+ </details>
+

  ### Common Notes
- - [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is currently one of the dependencies that must be installed for SGLang. If you are using NVIDIA GPU devices below sm80, such as T4, you can't use SGLang for the time being. We expect to resolve this issue soon, so please stay tuned. If you encounter any FlashInfer-related issues on sm80+ devices (e.g., A100, L40S, H100), consider using Triton's kernel by `--disable-flashinfer --disable-flashinfer-sampling` and raise a issue.
+ - [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is currently one of the dependencies that must be installed for SGLang. It only supports sm75 and above. If you encounter any FlashInfer-related issues on sm75+ devices (e.g., T4, A10, A100, L4, L40S, H100), consider using Triton's kernel by `--disable-flashinfer --disable-flashinfer-sampling` and raise an issue.
  - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.

  ## Backend: SGLang Runtime (SRT)
@@ -395,6 +445,13 @@ response = client.chat.completions.create(
      max_tokens=64,
  )
  print(response)
+
+ # Text embedding
+ response = client.embeddings.create(
+     model="default",
+     input="How are you today",
+ )
+ print(response)
  ```

  It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
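For reference, the `/v1/embeddings` route added in the hunk above can be exercised end to end with the standard OpenAI client. This is a minimal sketch, assuming an embedding model has already been launched locally with `--is-embedding` (as in the Embedding Models section added further down); the port, model alias, and input string are illustrative only.

```python
# Minimal sketch: query the embeddings endpoint added in 0.2.14.
# Assumes a local server such as:
#   python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct \
#       --is-embedding --port 30000
import openai

client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")
response = client.embeddings.create(model="default", input="How are you today")

vector = response.data[0].embedding  # one embedding vector per input string
print(len(vector))
```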
@@ -431,19 +488,21 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct

  ### Supported Models

+ **Generative Models**
+
  - Llama / Llama 2 / Llama 3 / Llama 3.1
  - Mistral / Mixtral / Mistral NeMo
  - Gemma / Gemma 2
  - Qwen / Qwen 2 / Qwen 2 MoE
  - DeepSeek / DeepSeek 2
- - LLaVA 1.5 / 1.6
-   - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
-   - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
-   - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.6-34b --tokenizer-path liuhaotian/llava-v1.6-34b-tokenizer --port 30000`
- - LLaVA-NeXT-Video
-   - see [examples/usage/llava_video](examples/usage/llava_video)
+ - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
+   - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava --chunked-prefill-size=16384`
+   - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
+ - LLaVA 1.5 / 1.6 / NeXT
+   - `python -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --port=30000 --tp-size=1 --chat-template=llava_llama_3`
+   - `python -m sglang.launch_server --model-path lmms-lab/llava-next-72b --port=30000 --tp-size=8 --chat-template=chatml-llava`
+   - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
  - Yi-VL
-   - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
  - StableLM
  - Command-R
  - DBRX
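The new LLaVA entries above point readers at the OpenAI Vision API for querying the server. A minimal sketch of such a request against a locally launched LLaVA server follows; the port, the `default` model alias, and the image URL are illustrative assumptions, not part of the diff.

```python
# Minimal sketch: OpenAI Vision API request to a local SGLang LLaVA server.
import openai

client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")
response = client.chat.completions.create(
    model="default",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image in one sentence."},
                # Hypothetical image URL; replace with your own.
                {"type": "image_url", "image_url": {"url": "https://example.com/cat.jpg"}},
            ],
        }
    ],
    max_tokens=64,
)
print(response.choices[0].message.content)
```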
@@ -451,37 +510,52 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  - ChatGLM
  - InternLM 2

+ **Embedding Models**
+
+ - e5-mistral
+ - gte-Qwen2
+   - `python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct --is-embedding`
+
  Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).

  #### Use Models From ModelScope
- To use model from [ModelScope](https://www.modelscope.cn), setting environment variable SGLANG_USE_MODELSCOPE.
+ <details>
+ <summary>More</summary>
+
+ To use a model from [ModelScope](https://www.modelscope.cn), set the environment variable SGLANG_USE_MODELSCOPE.
  ```
  export SGLANG_USE_MODELSCOPE=true
  ```
  Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct) Server
  ```
  SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
- ```
+ ```
+
+ </details>

  #### Run Llama 3.1 405B
+ <details>
+ <summary>More</summary>

  ```bash
- ## Run 405B (fp8) on a single node
+ # Run 405B (fp8) on a single node
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8

- ## Run 405B (fp16) on two nodes
- # replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
-
- # on the first node
- GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
+ # Run 405B (fp16) on two nodes
+ ## on the first node, replace the `172.16.4.52:20000` with your own first node ip address and port
+ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph

- # on the second
- GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
+ ## on the second node, replace the `172.16.4.52:20000` with your own first node ip address and port
+ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph
  ```

+ </details>
+
  ### Benchmark Performance

- - Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`. Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle. A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, consider using `sglang.bench_serving`.
+ - Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`.
+   Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle.
+   A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, please use `sglang.bench_serving` instead.
  ```
  python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 32 --input-len 256 --output-len 32
  ```
@@ -614,7 +688,7 @@ def tip_suggestion(s):
      s += "In summary" + sgl.gen("summary")
  ```

- #### Multi Modality
+ #### Multi-Modality
  Use `sgl.image` to pass an image as input.

  ```python
@@ -668,7 +742,7 @@ def character_gen(s, name):
      s += sgl.gen("json_output", max_tokens=256, regex=character_regex)
  ```

- See also [json_decode.py](examples/usage/json_decode.py) for an additional example on specifying formats with Pydantic models.
+ See also [json_decode.py](examples/usage/json_decode.py) for an additional example of specifying formats with Pydantic models.

  #### Batching
  Use `run_batch` to run a batch of requests with continuous batching.
@@ -730,7 +804,6 @@ def chat_example(s):
  - The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
  - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.

-
  ## Benchmark And Performance
  ![8b_throughput](https://lmsys.org/images/blog/sglang_llama3/8b_throughput.svg)
  ![70b_fp8_throughput](https://lmsys.org/images/blog/sglang_llama3/70b_fp8_throughput.svg)
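The `choices` note in the hunk above can be made concrete with a tiny standalone sketch of token-length normalized scoring. This is illustrative only, not SGLang's internal code, and the per-token log probabilities are made-up numbers.

```python
# Illustrative sketch of token-length normalized log-probability scoring,
# the selection rule described for the `choices` argument of sgl.gen above.
def choice_score(token_logprobs):
    # Dividing by the token count keeps longer choices from being
    # penalized simply for containing more tokens.
    return sum(token_logprobs) / len(token_logprobs)

# Hypothetical per-token log probs for two candidate completions.
candidates = {
    "Paris": [-0.12, -0.31],
    "The capital of France is Paris": [-0.10, -0.25, -0.20, -0.15, -0.22, -0.18],
}
best = max(candidates, key=lambda name: choice_score(candidates[name]))
print(best)
```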
{sglang-0.2.13 → sglang-0.2.14.post1}/README.md

@@ -17,17 +17,18 @@ SGLang is a fast serving framework for large language models and vision language
  It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.

  The core features include:
- - **Fast Backend Runtime**: Efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, flashinfer kernels, and quantization (AWQ/FP8/GPTQ/Marlin).
+ - **Fast Backend Runtime**: Efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, and quantization (AWQ/FP8/GPTQ/Marlin).
  - **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.

  ## News
  - [2024/07] 🔥 Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
- - [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
+ - [2024/08] 🔥 LLaVA-OneVision with single-image, multi-image and video are supported ([blog](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)).
  - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).

  <details>
  <summary>More</summary>

+ - [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
  - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
  - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).

@@ -55,7 +56,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
  ### Method 2: From source
  ```
  # Use the last release branch
- git clone -b v0.2.13 https://github.com/sgl-project/sglang.git
+ git clone -b v0.2.14.post1 https://github.com/sgl-project/sglang.git
  cd sglang

  pip install --upgrade pip
@@ -81,14 +82,60 @@ docker run --gpus all \

  ### Method 4: Using docker compose

+ <details>
+ <summary>More</summary>
+
  > This method is recommended if you plan to serve it as a service.
  > A better approach is to use the [k8s-sglang-service.yaml](./docker/k8s-sglang-service.yaml).

  1. Copy the [compose.yml](./docker/compose.yaml) to your local machine
  2. Execute the command `docker compose up -d` in your terminal.
+ </details>
+
+ ### Method 5: Run on Kubernetes or Clouds with SkyPilot
+
+ <details>
+ <summary>More</summary>
+
+ To deploy on Kubernetes or 12+ clouds, you can use [SkyPilot](https://github.com/skypilot-org/skypilot).
+
+ 1. Install SkyPilot and set up Kubernetes cluster or cloud access: see [SkyPilot's documentation](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html).
+ 2. Deploy on your own infra with a single command and get the HTTP API endpoint:
+ <details>
+ <summary>SkyPilot YAML: <code>sglang.yaml</code></summary>
+
+ ```yaml
+ # sglang.yaml
+ envs:
+   HF_TOKEN: null
+
+ resources:
+   image_id: docker:lmsysorg/sglang:latest
+   accelerators: A100
+   ports: 30000
+
+ run: |
+   conda deactivate
+   python3 -m sglang.launch_server \
+     --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
+     --host 0.0.0.0 \
+     --port 30000
+ ```
+ </details>
+
+ ```bash
+ # Deploy on any cloud or Kubernetes cluster. Use --cloud <cloud> to select a specific cloud provider.
+ HF_TOKEN=<secret> sky launch -c sglang --env HF_TOKEN sglang.yaml
+
+ # Get the HTTP API endpoint
+ sky status --endpoint 30000 sglang
+ ```
+ 3. To further scale up your deployment with autoscaling and failure recovery, check out the [SkyServe + SGLang guide](https://github.com/skypilot-org/skypilot/tree/master/llm/sglang#serving-llama-2-with-sglang-for-more-traffic-using-skyserve).
+ </details>
+

  ### Common Notes
- - [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is currently one of the dependencies that must be installed for SGLang. If you are using NVIDIA GPU devices below sm80, such as T4, you can't use SGLang for the time being. We expect to resolve this issue soon, so please stay tuned. If you encounter any FlashInfer-related issues on sm80+ devices (e.g., A100, L40S, H100), consider using Triton's kernel by `--disable-flashinfer --disable-flashinfer-sampling` and raise a issue.
+ - [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is currently one of the dependencies that must be installed for SGLang. It only supports sm75 and above. If you encounter any FlashInfer-related issues on sm75+ devices (e.g., T4, A10, A100, L4, L40S, H100), consider using Triton's kernel by `--disable-flashinfer --disable-flashinfer-sampling` and raise an issue.
  - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.

  ## Backend: SGLang Runtime (SRT)
@@ -142,6 +189,13 @@ response = client.chat.completions.create(
      max_tokens=64,
  )
  print(response)
+
+ # Text embedding
+ response = client.embeddings.create(
+     model="default",
+     input="How are you today",
+ )
+ print(response)
  ```

  It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
@@ -178,19 +232,21 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct

  ### Supported Models

+ **Generative Models**
+
  - Llama / Llama 2 / Llama 3 / Llama 3.1
  - Mistral / Mixtral / Mistral NeMo
  - Gemma / Gemma 2
  - Qwen / Qwen 2 / Qwen 2 MoE
  - DeepSeek / DeepSeek 2
- - LLaVA 1.5 / 1.6
-   - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
-   - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
-   - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.6-34b --tokenizer-path liuhaotian/llava-v1.6-34b-tokenizer --port 30000`
- - LLaVA-NeXT-Video
-   - see [examples/usage/llava_video](examples/usage/llava_video)
+ - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
+   - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava --chunked-prefill-size=16384`
+   - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
+ - LLaVA 1.5 / 1.6 / NeXT
+   - `python -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --port=30000 --tp-size=1 --chat-template=llava_llama_3`
+   - `python -m sglang.launch_server --model-path lmms-lab/llava-next-72b --port=30000 --tp-size=8 --chat-template=chatml-llava`
+   - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
  - Yi-VL
-   - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
  - StableLM
  - Command-R
  - DBRX
@@ -198,37 +254,52 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  - ChatGLM
  - InternLM 2

+ **Embedding Models**
+
+ - e5-mistral
+ - gte-Qwen2
+   - `python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct --is-embedding`
+
  Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).

  #### Use Models From ModelScope
- To use model from [ModelScope](https://www.modelscope.cn), setting environment variable SGLANG_USE_MODELSCOPE.
+ <details>
+ <summary>More</summary>
+
+ To use a model from [ModelScope](https://www.modelscope.cn), set the environment variable SGLANG_USE_MODELSCOPE.
  ```
  export SGLANG_USE_MODELSCOPE=true
  ```
  Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct) Server
  ```
  SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
- ```
+ ```
+
+ </details>

  #### Run Llama 3.1 405B
+ <details>
+ <summary>More</summary>

  ```bash
- ## Run 405B (fp8) on a single node
+ # Run 405B (fp8) on a single node
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8

- ## Run 405B (fp16) on two nodes
- # replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
-
- # on the first node
- GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
+ # Run 405B (fp16) on two nodes
+ ## on the first node, replace the `172.16.4.52:20000` with your own first node ip address and port
+ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph

- # on the second
- GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
+ ## on the second node, replace the `172.16.4.52:20000` with your own first node ip address and port
+ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph
  ```

+ </details>
+
  ### Benchmark Performance

- - Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`. Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle. A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, consider using `sglang.bench_serving`.
+ - Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`.
+   Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle.
+   A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, please use `sglang.bench_serving` instead.
  ```
  python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 32 --input-len 256 --output-len 32
  ```
@@ -361,7 +432,7 @@ def tip_suggestion(s):
      s += "In summary" + sgl.gen("summary")
  ```

- #### Multi Modality
+ #### Multi-Modality
  Use `sgl.image` to pass an image as input.

  ```python
@@ -415,7 +486,7 @@ def character_gen(s, name):
      s += sgl.gen("json_output", max_tokens=256, regex=character_regex)
  ```

- See also [json_decode.py](examples/usage/json_decode.py) for an additional example on specifying formats with Pydantic models.
+ See also [json_decode.py](examples/usage/json_decode.py) for an additional example of specifying formats with Pydantic models.

  #### Batching
  Use `run_batch` to run a batch of requests with continuous batching.
@@ -477,7 +548,6 @@ def chat_example(s):
  - The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
  - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.

-
  ## Benchmark And Performance
  ![8b_throughput](https://lmsys.org/images/blog/sglang_llama3/8b_throughput.svg)
  ![70b_fp8_throughput](https://lmsys.org/images/blog/sglang_llama3/70b_fp8_throughput.svg)
{sglang-0.2.13 → sglang-0.2.14.post1}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "sglang"
- version = "0.2.13"
+ version = "0.2.14.post1"
  description = "SGLang is yet another fast serving framework for large language models and vision language models."
  readme = "README.md"
  requires-python = ">=3.8"
@@ -20,14 +20,14 @@ dependencies = [
  ]

  [project.optional-dependencies]
- srt = ["aiohttp", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
+ srt = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
  "packaging", "pillow", "psutil", "pydantic", "python-multipart",
  "torch", "uvicorn", "uvloop", "zmq",
- "vllm==0.5.4", "outlines>=0.0.44"]
+ "vllm==0.5.5", "outlines>=0.0.44"]
  openai = ["openai>=1.0", "tiktoken"]
  anthropic = ["anthropic>=0.20.0"]
  litellm = ["litellm>=1.0.0"]
- test = ["jsonlines", "matplotlib", "pandas"]
+ test = ["jsonlines", "matplotlib", "pandas", "sentence_transformers", "accelerate"]
  all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
  dev = ["sglang[all]", "sglang[test]"]

{sglang-0.2.13 → sglang-0.2.14.post1}/sglang/api.py

@@ -66,6 +66,7 @@ def gen(
      temperature: Optional[float] = None,
      top_p: Optional[float] = None,
      top_k: Optional[int] = None,
+     min_p: Optional[float] = None,
      frequency_penalty: Optional[float] = None,
      presence_penalty: Optional[float] = None,
      ignore_eos: Optional[bool] = None,
@@ -103,6 +104,7 @@
          temperature,
          top_p,
          top_k,
+         min_p,
          frequency_penalty,
          presence_penalty,
          ignore_eos,
@@ -123,6 +125,7 @@
      temperature: Optional[float] = None,
      top_p: Optional[float] = None,
      top_k: Optional[int] = None,
+     min_p: Optional[float] = None,
      frequency_penalty: Optional[float] = None,
      presence_penalty: Optional[float] = None,
      ignore_eos: Optional[bool] = None,
@@ -139,6 +142,7 @@
          temperature,
          top_p,
          top_k,
+         min_p,
          frequency_penalty,
          presence_penalty,
          ignore_eos,
@@ -159,6 +163,7 @@
      temperature: Optional[float] = None,
      top_p: Optional[float] = None,
      top_k: Optional[int] = None,
+     min_p: Optional[float] = None,
      frequency_penalty: Optional[float] = None,
      presence_penalty: Optional[float] = None,
      ignore_eos: Optional[bool] = None,
@@ -175,6 +180,7 @@
          temperature,
          top_p,
          top_k,
+         min_p,
          frequency_penalty,
          presence_penalty,
          ignore_eos,
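These hunks thread the new `min_p` argument through `gen`, `gen_int`, and `gen_string`. Below is a minimal frontend-language sketch of passing it; the endpoint URL, prompt, and sampling values are illustrative assumptions, and the exact min-p behavior is defined by the sampler added in this release.

```python
# Minimal sketch: using the new min_p sampling argument from the frontend language.
# Assumes an SGLang server is already running at http://localhost:30000.
import sglang as sgl

@sgl.function
def short_story(s, topic):
    s += "Write a one-sentence story about " + topic + ". "
    # min_p keeps only tokens whose probability is at least min_p times that of
    # the most likely token before sampling (the value here is illustrative).
    s += sgl.gen("story", max_tokens=64, temperature=0.8, top_p=0.95, min_p=0.05)

sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
state = short_story.run(topic="a robot learning to paint")
print(state["story"])
```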
{sglang-0.2.13 → sglang-0.2.14.post1}/sglang/bench_latency.py

@@ -54,7 +54,7 @@ from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
  from sglang.srt.model_config import ModelConfig
  from sglang.srt.model_executor.forward_batch_info import ForwardMode
  from sglang.srt.model_executor.model_runner import ModelRunner
- from sglang.srt.sampling_params import SamplingParams
+ from sglang.srt.sampling.sampling_params import SamplingParams
  from sglang.srt.server_args import ServerArgs
  from sglang.srt.utils import suppress_other_loggers

@@ -111,7 +111,11 @@ def load_model(server_args, tp_rank):
      suppress_other_loggers()
      rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None

-     model_config = ModelConfig(path=server_args.model_path)
+     model_config = ModelConfig(
+         server_args.model_path,
+         server_args.trust_remote_code,
+         context_length=server_args.context_length,
+     )
      model_runner = ModelRunner(
          model_config=model_config,
          mem_fraction_static=server_args.mem_fraction_static,
@@ -350,7 +354,7 @@ def latency_test(
      for bs, il, ol in itertools.product(
          bench_args.batch_size, bench_args.input_len, bench_args.output_len
      ):
-         req = prepare_synthetic_inputs_for_latency_test(bs, il)
+         reqs = prepare_synthetic_inputs_for_latency_test(bs, il)
          ret = latency_test_run_once(
              bench_args.run_name, model_runner, rank_print, reqs, bs, il, ol
          )
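Beyond this script, the same rename matters for anyone importing `SamplingParams` directly: the module moved from `sglang.srt.sampling_params` to `sglang.srt.sampling.sampling_params` (file 45 in the list above). A minimal sketch of the updated import follows; the constructor arguments shown are assumptions for illustration and are not taken from this diff.

```python
# Minimal sketch: the 0.2.14 import path for SamplingParams.
# Pre-0.2.14 code used: from sglang.srt.sampling_params import SamplingParams
from sglang.srt.sampling.sampling_params import SamplingParams

# Field names below are illustrative; check the class definition for the full set.
params = SamplingParams(temperature=0.7, top_p=0.9, max_new_tokens=32)
print(params.temperature)
```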