sglang 0.2.11.tar.gz → 0.2.13.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (128)
  1. {sglang-0.2.11/sglang.egg-info → sglang-0.2.13}/PKG-INFO +33 -16
  2. {sglang-0.2.11 → sglang-0.2.13}/README.md +32 -15
  3. {sglang-0.2.11 → sglang-0.2.13}/pyproject.toml +1 -1
  4. {sglang-0.2.11 → sglang-0.2.13}/sglang/api.py +7 -1
  5. {sglang-0.2.11 → sglang-0.2.13}/sglang/bench_latency.py +9 -6
  6. {sglang-0.2.11 → sglang-0.2.13}/sglang/bench_serving.py +46 -22
  7. {sglang-0.2.11 → sglang-0.2.13}/sglang/global_config.py +1 -1
  8. {sglang-0.2.11 → sglang-0.2.13}/sglang/lang/backend/runtime_endpoint.py +60 -49
  9. {sglang-0.2.11 → sglang-0.2.13}/sglang/lang/compiler.py +2 -2
  10. {sglang-0.2.11 → sglang-0.2.13}/sglang/lang/interpreter.py +4 -2
  11. {sglang-0.2.11 → sglang-0.2.13}/sglang/lang/ir.py +16 -7
  12. {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/constrained/base_tool_cache.py +1 -1
  13. {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/constrained/fsm_cache.py +12 -2
  14. {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/constrained/jump_forward.py +13 -2
  15. sglang-0.2.13/sglang/srt/layers/activation.py +32 -0
  16. sglang-0.2.11/sglang/srt/layers/token_attention.py → sglang-0.2.13/sglang/srt/layers/decode_attention.py +9 -5
  17. {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/layers/extend_attention.py +9 -2
  18. sglang-0.2.13/sglang/srt/layers/fused_moe/__init__.py +1 -0
  19. {sglang-0.2.11/sglang/srt/layers → sglang-0.2.13/sglang/srt/layers/fused_moe}/fused_moe.py +165 -108
  20. sglang-0.2.13/sglang/srt/layers/fused_moe/layer.py +587 -0
  21. sglang-0.2.13/sglang/srt/layers/layernorm.py +65 -0
  22. {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/layers/logits_processor.py +7 -2
  23. sglang-0.2.13/sglang/srt/layers/pooler.py +50 -0
  24. sglang-0.2.11/sglang/srt/layers/context_flashattention_nopad.py → sglang-0.2.13/sglang/srt/layers/prefill_attention.py +5 -0
  25. {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/layers/radix_attention.py +40 -16
  26. {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/managers/detokenizer_manager.py +31 -9
  27. {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/managers/io_struct.py +63 -0
  28. sglang-0.2.13/sglang/srt/managers/policy_scheduler.py +233 -0
  29. {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/managers/schedule_batch.py +115 -97
  30. {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/managers/tokenizer_manager.py +194 -112
  31. {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/managers/tp_worker.py +290 -359
  32. sglang-0.2.11/sglang/srt/mem_cache/base_cache.py → sglang-0.2.13/sglang/srt/mem_cache/base_prefix_cache.py +9 -4
  33. sglang-0.2.13/sglang/srt/mem_cache/chunk_cache.py +83 -0
  34. {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/mem_cache/memory_pool.py +2 -2
  35. {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/mem_cache/radix_cache.py +74 -40
  36. {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/model_executor/cuda_graph_runner.py +71 -25
  37. sglang-0.2.13/sglang/srt/model_executor/forward_batch_info.py +393 -0
  38. {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/model_executor/model_runner.py +77 -57
  39. {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/models/chatglm.py +2 -2
  40. {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/models/commandr.py +1 -1
  41. {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/models/deepseek.py +2 -2
  42. {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/models/deepseek_v2.py +7 -6
  43. {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/models/gemma.py +1 -1
  44. {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/models/gemma2.py +11 -6
  45. sglang-0.2.13/sglang/srt/models/grok.py +408 -0
  46. {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/models/internlm2.py +2 -7
  47. {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/models/llama2.py +4 -4
  48. sglang-0.2.13/sglang/srt/models/llama_embedding.py +88 -0
  49. {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/models/minicpm.py +2 -2
  50. sglang-0.2.13/sglang/srt/models/mixtral.py +380 -0
  51. {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/models/mixtral_quant.py +1 -4
  52. {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/models/qwen.py +2 -2
  53. {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/models/qwen2.py +2 -2
  54. {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/models/qwen2_moe.py +2 -13
  55. {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/models/stablelm.py +1 -1
  56. {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/openai_api/adapter.py +187 -48
  57. {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/openai_api/protocol.py +37 -1
  58. sglang-0.2.13/sglang/srt/sampling/penaltylib/__init__.py +13 -0
  59. sglang-0.2.13/sglang/srt/sampling/penaltylib/orchestrator.py +357 -0
  60. sglang-0.2.13/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +80 -0
  61. sglang-0.2.13/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +105 -0
  62. sglang-0.2.13/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +79 -0
  63. sglang-0.2.13/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +83 -0
  64. {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/sampling_params.py +31 -8
  65. {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/server.py +91 -29
  66. {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/server_args.py +32 -19
  67. {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/utils.py +32 -15
  68. {sglang-0.2.11 → sglang-0.2.13}/sglang/test/run_eval.py +10 -1
  69. {sglang-0.2.11 → sglang-0.2.13}/sglang/test/runners.py +81 -73
  70. {sglang-0.2.11 → sglang-0.2.13}/sglang/test/simple_eval_humaneval.py +2 -8
  71. sglang-0.2.13/sglang/test/simple_eval_mgsm.py +203 -0
  72. sglang-0.2.13/sglang/test/srt/sampling/penaltylib/utils.py +337 -0
  73. sglang-0.2.13/sglang/test/test_layernorm.py +60 -0
  74. {sglang-0.2.11 → sglang-0.2.13}/sglang/test/test_programs.py +36 -7
  75. {sglang-0.2.11 → sglang-0.2.13}/sglang/test/test_utils.py +24 -2
  76. {sglang-0.2.11 → sglang-0.2.13}/sglang/utils.py +0 -1
  77. sglang-0.2.13/sglang/version.py +1 -0
  78. {sglang-0.2.11 → sglang-0.2.13/sglang.egg-info}/PKG-INFO +33 -16
  79. {sglang-0.2.11 → sglang-0.2.13}/sglang.egg-info/SOURCES.txt +20 -10
  80. sglang-0.2.11/sglang/srt/layers/linear.py +0 -884
  81. sglang-0.2.11/sglang/srt/layers/quantization/__init__.py +0 -64
  82. sglang-0.2.11/sglang/srt/layers/quantization/fp8.py +0 -677
  83. sglang-0.2.11/sglang/srt/managers/policy_scheduler.py +0 -85
  84. sglang-0.2.11/sglang/srt/mem_cache/chunk_cache.py +0 -60
  85. sglang-0.2.11/sglang/srt/model_executor/forward_batch_info.py +0 -256
  86. sglang-0.2.11/sglang/srt/model_loader/model_loader.py +0 -292
  87. sglang-0.2.11/sglang/srt/model_loader/utils.py +0 -275
  88. sglang-0.2.11/sglang/srt/models/grok.py +0 -754
  89. sglang-0.2.11/sglang/srt/models/mixtral.py +0 -578
  90. sglang-0.2.11/sglang/version.py +0 -1
  91. {sglang-0.2.11 → sglang-0.2.13}/LICENSE +0 -0
  92. {sglang-0.2.11 → sglang-0.2.13}/setup.cfg +0 -0
  93. {sglang-0.2.11 → sglang-0.2.13}/sglang/__init__.py +0 -0
  94. {sglang-0.2.11 → sglang-0.2.13}/sglang/check_env.py +0 -0
  95. {sglang-0.2.11 → sglang-0.2.13}/sglang/lang/__init__.py +0 -0
  96. {sglang-0.2.11 → sglang-0.2.13}/sglang/lang/backend/__init__.py +0 -0
  97. {sglang-0.2.11 → sglang-0.2.13}/sglang/lang/backend/anthropic.py +0 -0
  98. {sglang-0.2.11 → sglang-0.2.13}/sglang/lang/backend/base_backend.py +0 -0
  99. {sglang-0.2.11 → sglang-0.2.13}/sglang/lang/backend/litellm.py +0 -0
  100. {sglang-0.2.11 → sglang-0.2.13}/sglang/lang/backend/openai.py +0 -0
  101. {sglang-0.2.11 → sglang-0.2.13}/sglang/lang/backend/vertexai.py +0 -0
  102. {sglang-0.2.11 → sglang-0.2.13}/sglang/lang/chat_template.py +0 -0
  103. {sglang-0.2.11 → sglang-0.2.13}/sglang/lang/choices.py +0 -0
  104. {sglang-0.2.11 → sglang-0.2.13}/sglang/lang/tracer.py +0 -0
  105. {sglang-0.2.11 → sglang-0.2.13}/sglang/launch_server.py +0 -0
  106. {sglang-0.2.11 → sglang-0.2.13}/sglang/launch_server_llavavid.py +0 -0
  107. {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/constrained/__init__.py +0 -0
  108. {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/conversation.py +0 -0
  109. {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/hf_transformers_utils.py +0 -0
  110. {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/managers/controller_multi.py +0 -0
  111. {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/managers/controller_single.py +0 -0
  112. {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/mem_cache/flush_cache.py +0 -0
  113. {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/mm_utils.py +0 -0
  114. {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/model_config.py +0 -0
  115. {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/models/dbrx.py +0 -0
  116. {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/models/gpt_bigcode.py +0 -0
  117. {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/models/llama_classification.py +0 -0
  118. {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/models/llava.py +0 -0
  119. {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/models/llavavid.py +0 -0
  120. {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/models/mistral.py +0 -0
  121. {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/models/yivl.py +0 -0
  122. {sglang-0.2.11 → sglang-0.2.13}/sglang/test/simple_eval_common.py +0 -0
  123. {sglang-0.2.11 → sglang-0.2.13}/sglang/test/simple_eval_gpqa.py +0 -0
  124. {sglang-0.2.11 → sglang-0.2.13}/sglang/test/simple_eval_math.py +0 -0
  125. {sglang-0.2.11 → sglang-0.2.13}/sglang/test/simple_eval_mmlu.py +0 -0
  126. {sglang-0.2.11 → sglang-0.2.13}/sglang.egg-info/dependency_links.txt +0 -0
  127. {sglang-0.2.11 → sglang-0.2.13}/sglang.egg-info/requires.txt +0 -0
  128. {sglang-0.2.11 → sglang-0.2.13}/sglang.egg-info/top_level.txt +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.2.11
+ Version: 0.2.13
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
  Version 2.0, January 2004
@@ -308,7 +308,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
  ### Method 2: From source
  ```
  # Use the last release branch
- git clone -b v0.2.11 https://github.com/sgl-project/sglang.git
+ git clone -b v0.2.13 https://github.com/sgl-project/sglang.git
  cd sglang

  pip install --upgrade pip
@@ -329,11 +329,19 @@ docker run --gpus all \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
- python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --host 0.0.0.0 --port 30000
+ python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
  ```

+ ### Method 4: Using docker compose
+
+ > This method is recommended if you plan to serve it as a service.
+ > A better approach is to use the [k8s-sglang-service.yaml](./docker/k8s-sglang-service.yaml).
+
+ 1. Copy the [compose.yml](./docker/compose.yaml) to your local machine
+ 2. Execute the command `docker compose up -d` in your terminal.
+
  ### Common Notes
- - If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
+ - [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is currently one of the dependencies that must be installed for SGLang. If you are using NVIDIA GPU devices below sm80, such as T4, you can't use SGLang for the time being. We expect to resolve this issue soon, so please stay tuned. If you encounter any FlashInfer-related issues on sm80+ devices (e.g., A100, L40S, H100), consider using Triton's kernel by `--disable-flashinfer --disable-flashinfer-sampling` and raise a issue.
  - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.

  ## Backend: SGLang Runtime (SRT)
@@ -392,23 +400,23 @@ print(response)
  It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).

  ### Additional Server Arguments
- - Add `--tp 2` to enable tensor parallelism. If it indicates `peer access is not supported between these two devices`, add `--enable-p2p-check` option.
+ - Add `--tp 2` to enable multi-GPU tensor parallelism. If it reports the error "peer access is not supported between these two devices", add `--enable-p2p-check` to the server launch command.
  ```
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --tp 2
  ```
- - Add `--dp 2` to enable data parallelism. It can also be used together with tp. Data parallelism is better for throughput if there is enough memory.
+ - Add `--dp 2` to enable multi-GPU data parallelism. It can also be used together with tensor parallelism. Data parallelism is better for throughput if there is enough memory.
  ```
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --dp 2 --tp 2
  ```
- - If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`.
+ - If you see out-of-memory errors during serving, try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`.
  ```
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7
  ```
- - If you see out-of-memory errors during prefill for long prompts on a model that supports long context, consider using chunked prefill.
+ - See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
+ - If you see out-of-memory errors during prefill for long prompts, try to set a smaller chunked prefill size.
  ```
- python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000 --chunked-prefill-size 8192
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --chunked-prefill-size 4096
  ```
- - See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
  - Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
  ```
  # Node 0
@@ -418,13 +426,13 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1
  ```
  - If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
- - To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
  - To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.
-
+ - To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
+
  ### Supported Models

  - Llama / Llama 2 / Llama 3 / Llama 3.1
- - Mistral / Mixtral
+ - Mistral / Mixtral / Mistral NeMo
  - Gemma / Gemma 2
  - Qwen / Qwen 2 / Qwen 2 MoE
  - DeepSeek / DeepSeek 2
@@ -442,11 +450,20 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  - Grok
  - ChatGLM
  - InternLM 2
- - Mistral NeMo

  Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).

- ### Run Llama 3.1 405B
+ #### Use Models From ModelScope
+ To use model from [ModelScope](https://www.modelscope.cn), setting environment variable SGLANG_USE_MODELSCOPE.
+ ```
+ export SGLANG_USE_MODELSCOPE=true
+ ```
+ Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct) Server
+ ```
+ SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
+ ```
+
+ #### Run Llama 3.1 405B

  ```bash
  ## Run 405B (fp8) on a single node
@@ -474,7 +491,7 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/
  ```

  ## Frontend: Structured Generation Language (SGLang)
- The frontend language can be used with local models or API models.
+ The frontend language can be used with local models or API models. It is an alternative to the OpenAI API. You may found it easier to use for complex prompting workflow.

  ### Quick Start
  The example below shows how to use sglang to answer a mulit-turn question.
README.md
@@ -55,7 +55,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
  ### Method 2: From source
  ```
  # Use the last release branch
- git clone -b v0.2.11 https://github.com/sgl-project/sglang.git
+ git clone -b v0.2.13 https://github.com/sgl-project/sglang.git
  cd sglang

  pip install --upgrade pip
@@ -76,11 +76,19 @@ docker run --gpus all \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
- python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --host 0.0.0.0 --port 30000
+ python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
  ```

+ ### Method 4: Using docker compose
+
+ > This method is recommended if you plan to serve it as a service.
+ > A better approach is to use the [k8s-sglang-service.yaml](./docker/k8s-sglang-service.yaml).
+
+ 1. Copy the [compose.yml](./docker/compose.yaml) to your local machine
+ 2. Execute the command `docker compose up -d` in your terminal.
+
  ### Common Notes
- - If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
+ - [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is currently one of the dependencies that must be installed for SGLang. If you are using NVIDIA GPU devices below sm80, such as T4, you can't use SGLang for the time being. We expect to resolve this issue soon, so please stay tuned. If you encounter any FlashInfer-related issues on sm80+ devices (e.g., A100, L40S, H100), consider using Triton's kernel by `--disable-flashinfer --disable-flashinfer-sampling` and raise a issue.
  - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.

  ## Backend: SGLang Runtime (SRT)
@@ -139,23 +147,23 @@ print(response)
  It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).

  ### Additional Server Arguments
- - Add `--tp 2` to enable tensor parallelism. If it indicates `peer access is not supported between these two devices`, add `--enable-p2p-check` option.
+ - Add `--tp 2` to enable multi-GPU tensor parallelism. If it reports the error "peer access is not supported between these two devices", add `--enable-p2p-check` to the server launch command.
  ```
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --tp 2
  ```
- - Add `--dp 2` to enable data parallelism. It can also be used together with tp. Data parallelism is better for throughput if there is enough memory.
+ - Add `--dp 2` to enable multi-GPU data parallelism. It can also be used together with tensor parallelism. Data parallelism is better for throughput if there is enough memory.
  ```
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --dp 2 --tp 2
  ```
- - If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`.
+ - If you see out-of-memory errors during serving, try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`.
  ```
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7
  ```
- - If you see out-of-memory errors during prefill for long prompts on a model that supports long context, consider using chunked prefill.
+ - See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
+ - If you see out-of-memory errors during prefill for long prompts, try to set a smaller chunked prefill size.
  ```
- python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000 --chunked-prefill-size 8192
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --chunked-prefill-size 4096
  ```
- - See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
  - Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
  ```
  # Node 0
@@ -165,13 +173,13 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1
  ```
  - If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
- - To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
  - To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.
-
+ - To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
+
  ### Supported Models

  - Llama / Llama 2 / Llama 3 / Llama 3.1
- - Mistral / Mixtral
+ - Mistral / Mixtral / Mistral NeMo
  - Gemma / Gemma 2
  - Qwen / Qwen 2 / Qwen 2 MoE
  - DeepSeek / DeepSeek 2
@@ -189,11 +197,20 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  - Grok
  - ChatGLM
  - InternLM 2
- - Mistral NeMo

  Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).

- ### Run Llama 3.1 405B
+ #### Use Models From ModelScope
+ To use model from [ModelScope](https://www.modelscope.cn), setting environment variable SGLANG_USE_MODELSCOPE.
+ ```
+ export SGLANG_USE_MODELSCOPE=true
+ ```
+ Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct) Server
+ ```
+ SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
+ ```
+
+ #### Run Llama 3.1 405B

  ```bash
  ## Run 405B (fp8) on a single node
@@ -221,7 +238,7 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/
  ```

  ## Frontend: Structured Generation Language (SGLang)
- The frontend language can be used with local models or API models.
+ The frontend language can be used with local models or API models. It is an alternative to the OpenAI API. You may found it easier to use for complex prompting workflow.

  ### Quick Start
  The example below shows how to use sglang to answer a mulit-turn question.
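The multi-turn quick-start example referenced in the last hunk is not part of this diff. For orientation, a minimal sketch of such a frontend program follows, assuming a local server already running at http://localhost:30000; the questions and `max_tokens` values are illustrative only.

```python
import sglang as sgl

# Sketch of the multi-turn quick start mentioned in the README hunks above.
# It assumes `python -m sglang.launch_server ... --port 30000` is already running.
@sgl.function
def multi_turn_question(s, question_1, question_2):
    s += sgl.system("You are a helpful assistant.")
    s += sgl.user(question_1)
    s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
    s += sgl.user(question_2)
    s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))

sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))

state = multi_turn_question.run(
    question_1="What is the capital of France?",
    question_2="List two landmarks in that city.",
)
print(state["answer_1"])
print(state["answer_2"])
```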
pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "sglang"
- version = "0.2.11"
+ version = "0.2.13"
  description = "SGLang is yet another fast serving framework for large language models and vision language models."
  readme = "README.md"
  requires-python = ">=3.8"
sglang/api.py
@@ -62,6 +62,7 @@ def gen(
  name: Optional[str] = None,
  max_tokens: Optional[int] = None,
  stop: Optional[Union[str, List[str]]] = None,
+ stop_token_ids: Optional[List[int]] = None,
  temperature: Optional[float] = None,
  top_p: Optional[float] = None,
  top_k: Optional[int] = None,
@@ -72,7 +73,7 @@ def gen(
  logprob_start_len: Optional[int] = None,
  top_logprobs_num: Optional[int] = None,
  return_text_in_logprobs: Optional[bool] = None,
- dtype: Optional[type] = None,
+ dtype: Optional[Union[type, str]] = None,
  choices: Optional[List[str]] = None,
  choices_method: Optional[ChoicesSamplingMethod] = None,
  regex: Optional[str] = None,
@@ -98,6 +99,7 @@ def gen(
  name,
  max_tokens,
  stop,
+ stop_token_ids,
  temperature,
  top_p,
  top_k,
@@ -117,6 +119,7 @@ def gen_int(
  name: Optional[str] = None,
  max_tokens: Optional[int] = None,
  stop: Optional[Union[str, List[str]]] = None,
+ stop_token_ids: Optional[List[int]] = None,
  temperature: Optional[float] = None,
  top_p: Optional[float] = None,
  top_k: Optional[int] = None,
@@ -132,6 +135,7 @@ def gen_int(
  name,
  max_tokens,
  stop,
+ stop_token_ids,
  temperature,
  top_p,
  top_k,
@@ -151,6 +155,7 @@ def gen_string(
  name: Optional[str] = None,
  max_tokens: Optional[int] = None,
  stop: Optional[Union[str, List[str]]] = None,
+ stop_token_ids: Optional[List[int]] = None,
  temperature: Optional[float] = None,
  top_p: Optional[float] = None,
  top_k: Optional[int] = None,
@@ -166,6 +171,7 @@ def gen_string(
  name,
  max_tokens,
  stop,
+ stop_token_ids,
  temperature,
  top_p,
  top_k,
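The sglang/api.py hunks above thread a new `stop_token_ids` argument through `gen`, `gen_int`, and `gen_string`. A minimal sketch of how a frontend program might pass it is shown below, assuming a running local server; the token id 128009 (Llama 3's `<|eot_id|>`) is only an illustrative choice.

```python
import sglang as sgl

@sgl.function
def short_answer(s, question):
    s += sgl.user(question)
    # stop_token_ids halts generation when any of the listed token ids is sampled;
    # 128009 is Llama 3's <|eot_id|> and is used here purely as an example value.
    s += sgl.assistant(sgl.gen("answer", max_tokens=64, stop_token_ids=[128009]))

sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
print(short_answer.run(question="Name one prime number.")["answer"])
```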
sglang/bench_latency.py
@@ -64,7 +64,7 @@ class BenchArgs:
  run_name: str = "before"
  batch_size: Tuple[int] = (1,)
  input_len: Tuple[int] = (1024,)
- output_len: Tuple[int] = (4,)
+ output_len: Tuple[int] = (16,)
  result_filename: str = ""
  correctness_test: bool = False
  # This is only used for correctness test
@@ -152,7 +152,7 @@ def prepare_inputs_for_correctness_test(bench_args, tokenizer):
  req = Req(rid=i, origin_input_text=prompts[i], origin_input_ids=tmp_input_ids)
  req.prefix_indices = []
  req.sampling_params = sampling_params
- req.input_ids = req.origin_input_ids
+ req.fill_ids = req.origin_input_ids
  reqs.append(req)

  return input_ids, reqs
@@ -163,7 +163,7 @@ def prepare_extend_inputs_for_correctness_test(
  ):
  for i in range(len(reqs)):
  req = reqs[i]
- req.input_ids += input_ids[i][bench_args.cut_len :]
+ req.fill_ids += input_ids[i][bench_args.cut_len :]
  req.prefix_indices = model_runner.req_to_token_pool.req_to_token[
  i, : bench_args.cut_len
  ]
@@ -182,7 +182,7 @@ def prepare_synthetic_inputs_for_latency_test(batch_size, input_len):
  req = Req(rid=i, origin_input_text="", origin_input_ids=list(input_ids[i]))
  req.prefix_indices = []
  req.sampling_params = sampling_params
- req.input_ids = req.origin_input_ids
+ req.fill_ids = req.origin_input_ids
  reqs.append(req)

  return reqs
@@ -195,7 +195,7 @@ def extend(reqs, model_runner):
  token_to_kv_pool=model_runner.token_to_kv_pool,
  tree_cache=None,
  )
- batch.prepare_for_extend(model_runner.model_config.vocab_size, None)
+ batch.prepare_for_extend(model_runner.model_config.vocab_size)
  output = model_runner.forward(batch, ForwardMode.EXTEND)
  next_token_ids = batch.sample(output.next_token_logits)
  return next_token_ids, output.next_token_logits, batch
@@ -221,6 +221,7 @@ def correctness_test(

  # Prepare inputs
  input_ids, reqs = prepare_inputs_for_correctness_test(bench_args, tokenizer)
+ rank_print(f"{input_ids=}")

  if bench_args.cut_len > 0:
  # Prefill
@@ -238,7 +239,7 @@ def correctness_test(

  # Decode
  output_ids = [input_ids[i] + [next_token_ids[i]] for i in range(len(input_ids))]
- for _ in range(bench_args.output_len):
+ for _ in range(bench_args.output_len[0]):
  next_token_ids, _ = decode(next_token_ids, batch, model_runner)
  for i in range(len(reqs)):
  output_ids[i].append(next_token_ids[i])
@@ -332,6 +333,7 @@ def latency_test(
  )

  # Warm up
+ rank_print("Warmup ...")
  latency_test_run_once(
  bench_args.run_name,
  model_runner,
@@ -341,6 +343,7 @@ def latency_test(
  bench_args.input_len[0],
  4, # shorter decoding to speed up the warmup
  )
+ rank_print("Benchmark ...")

  # Run the sweep
  result_list = []
sglang/bench_serving.py
@@ -24,7 +24,7 @@ import warnings
  from argparse import ArgumentParser
  from dataclasses import dataclass, field
  from datetime import datetime
- from typing import AsyncGenerator, List, Optional, Tuple, Union
+ from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union

  import aiohttp
  import numpy as np
@@ -39,6 +39,8 @@ from transformers import (

  AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)

+ global args
+

  @dataclass
  class RequestFuncInput:
@@ -47,6 +49,7 @@ class RequestFuncInput:
  prompt_len: int
  output_len: int
  model: str
+ extra_request_body: Dict[str, Any]


  @dataclass
@@ -84,6 +87,7 @@ async def async_request_trt_llm(
  "stream": True,
  "min_length": request_func_input.output_len,
  "end_id": 1048576,
+ **request_func_input.extra_request_body,
  }
  if args.disable_ignore_eos:
  del payload["min_length"]
@@ -154,6 +158,7 @@ async def async_request_openai_completions(
  "max_tokens": request_func_input.output_len,
  "stream": not args.disable_stream,
  "ignore_eos": not args.disable_ignore_eos,
+ **request_func_input.extra_request_body,
  }
  headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}

@@ -192,7 +197,8 @@ async def async_request_openai_completions(
  output.ttft = ttft

  # Decoding phase
- output.itl.append(timestamp - most_recent_timestamp)
+ else:
+ output.itl.append(timestamp - most_recent_timestamp)

  most_recent_timestamp = timestamp
  generated_text += data["choices"][0]["text"]
@@ -542,6 +548,7 @@ async def benchmark(
  request_rate: float,
  disable_tqdm: bool,
  enable_multi: bool,
+ extra_request_body: Dict[str, Any],
  ):
  if backend in ASYNC_REQUEST_FUNCS:
  request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -556,6 +563,7 @@ async def benchmark(
  api_url=api_url,
  prompt_len=test_prompt_len,
  output_len=test_output_len,
+ extra_request_body=extra_request_body,
  )
  test_output = await request_func(request_func_input=test_input)
  if not test_output.success:
@@ -578,6 +586,7 @@ async def benchmark(
  api_url=api_url,
  prompt_len=prompt_len,
  output_len=output_len,
+ extra_request_body=extra_request_body,
  )
  tasks.append(
  asyncio.create_task(
@@ -660,19 +669,20 @@ async def benchmark(
  "backend": args.backend,
  "dataset_name": args.dataset_name,
  "request_rate": request_rate,
- "total_input": metrics.total_input,
- "total_output": metrics.total_output,
- "total_output_retokenized": metrics.total_output_retokenized,
- "mean_e2e_latency": metrics.mean_e2e_latency_ms,
- "median_e2e_latency": metrics.median_e2e_latency_ms,
- "median_ttft": metrics.median_ttft_ms,
- "median_itl": metrics.median_itl_ms,
- "output_token_throughput": metrics.output_throughput,
+ "total_input_tokens": metrics.total_input,
+ "total_output_tokens": metrics.total_output,
+ "total_output_tokens_retokenized": metrics.total_output_retokenized,
+ "mean_e2e_latency_ms": metrics.mean_e2e_latency_ms,
+ "median_e2e_latency_ms": metrics.median_e2e_latency_ms,
+ "median_ttft_ms": metrics.median_ttft_ms,
+ "median_itl_ms": metrics.median_itl_ms,
+ "output_throughput": metrics.output_throughput,
  "sharegpt_output_len": args.sharegpt_output_len,
  "random_input_len": args.random_input_len,
  "random_output_len": args.random_output_len,
  "random_range_ratio": args.random_range_ratio,
- "benchmark_duration": benchmark_duration,
+ "duration": benchmark_duration,
+ "completed": metrics.completed,
  }
  else:
  print(f"Error running benchmark for request rate: {request_rate}")
@@ -742,10 +752,18 @@ def check_chat_template(model_path):
  return False


- def fire(args: argparse.Namespace):
+ def run_benchmark(args_: argparse.Namespace):
+ global args
+ args = args_
+
+ set_ulimit()
  random.seed(args.seed)
  np.random.seed(args.seed)

+ extra_request_body = {}
+ if args.extra_request_body:
+ extra_request_body = json.loads(args.extra_request_body)
+
  if args.port is None:
  args.port = {
  "sglang": 30000,
@@ -838,10 +856,11 @@ def fire(args: argparse.Namespace):
  request_rate=rate,
  disable_tqdm=args.disable_tqdm,
  enable_multi=args.multi,
+ extra_request_body=extra_request_body,
  )
  )
  else:
- asyncio.run(
+ return asyncio.run(
  benchmark(
  backend=backend,
  api_url=api_url,
@@ -851,6 +870,7 @@ def fire(args: argparse.Namespace):
  request_rate=args.request_rate,
  disable_tqdm=args.disable_tqdm,
  enable_multi=args.multi,
+ extra_request_body=extra_request_body,
  )
  )

@@ -949,11 +969,6 @@ if __name__ == "__main__":
  "Otherwise, we use Poisson process to synthesize the request arrival times. Default is 128.0.",
  )
  parser.add_argument("--seed", type=int, default=0, help="Default is 0.")
- parser.add_argument(
- "--disable-tqdm",
- action="store_true",
- help="Specify to disable tqdm progress bar.",
- )
  parser.add_argument(
  "--multi",
  action="store_true",
@@ -966,6 +981,11 @@ if __name__ == "__main__":
  help="Range of request rates in the format start,stop,step. Default is 2,34,2. It also supports a list of request rates, requiring the parameters to not equal three.",
  )
  parser.add_argument("--output-file", type=str, help="Output JSONL file name.")
+ parser.add_argument(
+ "--disable-tqdm",
+ action="store_true",
+ help="Specify to disable tqdm progress bar.",
+ )
  parser.add_argument(
  "--disable-stream",
  action="store_true",
@@ -976,8 +996,12 @@ if __name__ == "__main__":
  action="store_true",
  help="Disable ignoring EOS.",
  )
-
- set_ulimit()
-
+ parser.add_argument(
+ "--extra-request-body",
+ metavar='{"key1": "value1", "key2": "value2"}',
+ type=str,
+ help="Append given JSON object to the request payload. You can use this to specify"
+ "additional generate params like sampling params.",
+ )
  args = parser.parse_args()
- fire(args)
+ run_benchmark(args)
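The sglang/bench_serving.py hunks above add an `--extra-request-body` flag whose JSON value is parsed with `json.loads` and spliced into each request payload via dictionary unpacking. A standalone sketch of that merge step follows; the payload fields shown are illustrative rather than the script's exact ones.

```python
import json

# What a user might pass on the command line:
#   --extra-request-body '{"temperature": 0.0, "top_p": 1.0}'
raw = '{"temperature": 0.0, "top_p": 1.0}'
extra_request_body = json.loads(raw) if raw else {}

payload = {
    "model": "default",
    "prompt": "The capital of France is",
    "max_tokens": 32,
    "stream": True,
    # Keys from --extra-request-body are unpacked last, so they can add or override
    # sampling parameters, mirroring **request_func_input.extra_request_body above.
    **extra_request_body,
}
print(json.dumps(payload, indent=2))
```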
sglang/global_config.py
@@ -27,7 +27,7 @@ class GlobalConfig:
  # Runtime constants: others
  self.num_continue_decode_steps = 10
  self.retract_decode_steps = 20
- self.flashinfer_workspace_size = 192 * 1024 * 1024
+ self.flashinfer_workspace_size = 384 * 1024 * 1024

  # Output tokenization configs
  self.skip_special_tokens_in_output = True