sglang 0.1.18__tar.gz → 0.1.20__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. {sglang-0.1.18/sglang.egg-info → sglang-0.1.20}/PKG-INFO +19 -13
  2. {sglang-0.1.18 → sglang-0.1.20}/README.md +17 -11
  3. {sglang-0.1.18 → sglang-0.1.20}/pyproject.toml +2 -2
  4. {sglang-0.1.18 → sglang-0.1.20}/sglang/__init__.py +1 -1
  5. {sglang-0.1.18 → sglang-0.1.20}/sglang/api.py +26 -0
  6. {sglang-0.1.18 → sglang-0.1.20}/sglang/backend/runtime_endpoint.py +18 -14
  7. {sglang-0.1.18 → sglang-0.1.20}/sglang/bench_latency.py +40 -18
  8. {sglang-0.1.18 → sglang-0.1.20}/sglang/global_config.py +21 -16
  9. {sglang-0.1.18 → sglang-0.1.20}/sglang/lang/chat_template.py +41 -6
  10. {sglang-0.1.18 → sglang-0.1.20}/sglang/lang/interpreter.py +5 -1
  11. {sglang-0.1.18 → sglang-0.1.20}/sglang/lang/ir.py +61 -25
  12. {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/constrained/__init__.py +3 -2
  13. {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/hf_transformers_utils.py +7 -3
  14. {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/layers/extend_attention.py +2 -1
  15. {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/layers/fused_moe.py +181 -167
  16. {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/layers/logits_processor.py +55 -19
  17. {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/layers/radix_attention.py +33 -59
  18. {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/layers/token_attention.py +4 -8
  19. sglang-0.1.20/sglang/srt/managers/controller/cuda_graph_runner.py +172 -0
  20. {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/managers/controller/infer_batch.py +244 -36
  21. {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/managers/controller/manager_single.py +1 -1
  22. sglang-0.1.20/sglang/srt/managers/controller/model_runner.py +347 -0
  23. {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/managers/controller/tp_worker.py +39 -20
  24. {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/managers/detokenizer_manager.py +4 -2
  25. {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/managers/io_struct.py +1 -1
  26. {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/managers/tokenizer_manager.py +14 -13
  27. {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/memory_pool.py +33 -6
  28. {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/model_config.py +6 -0
  29. sglang-0.1.20/sglang/srt/models/gemma2.py +436 -0
  30. {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/models/llama2.py +3 -3
  31. {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/models/llama_classification.py +10 -7
  32. sglang-0.1.20/sglang/srt/models/minicpm.py +373 -0
  33. sglang-0.1.20/sglang/srt/models/qwen2_moe.py +454 -0
  34. {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/openai_api_adapter.py +2 -2
  35. {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/openai_protocol.py +1 -1
  36. {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/server.py +18 -8
  37. {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/server_args.py +24 -20
  38. {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/utils.py +68 -35
  39. {sglang-0.1.18 → sglang-0.1.20/sglang.egg-info}/PKG-INFO +19 -13
  40. {sglang-0.1.18 → sglang-0.1.20}/sglang.egg-info/SOURCES.txt +4 -0
  41. {sglang-0.1.18 → sglang-0.1.20}/sglang.egg-info/requires.txt +1 -1
  42. sglang-0.1.18/sglang/srt/managers/controller/model_runner.py +0 -562
  43. {sglang-0.1.18 → sglang-0.1.20}/LICENSE +0 -0
  44. {sglang-0.1.18 → sglang-0.1.20}/setup.cfg +0 -0
  45. {sglang-0.1.18 → sglang-0.1.20}/sglang/backend/__init__.py +0 -0
  46. {sglang-0.1.18 → sglang-0.1.20}/sglang/backend/anthropic.py +0 -0
  47. {sglang-0.1.18 → sglang-0.1.20}/sglang/backend/base_backend.py +0 -0
  48. {sglang-0.1.18 → sglang-0.1.20}/sglang/backend/litellm.py +0 -0
  49. {sglang-0.1.18 → sglang-0.1.20}/sglang/backend/openai.py +0 -0
  50. {sglang-0.1.18 → sglang-0.1.20}/sglang/backend/vertexai.py +0 -0
  51. {sglang-0.1.18 → sglang-0.1.20}/sglang/lang/__init__.py +0 -0
  52. {sglang-0.1.18 → sglang-0.1.20}/sglang/lang/compiler.py +0 -0
  53. {sglang-0.1.18 → sglang-0.1.20}/sglang/lang/tracer.py +0 -0
  54. {sglang-0.1.18 → sglang-0.1.20}/sglang/launch_server.py +0 -0
  55. {sglang-0.1.18 → sglang-0.1.20}/sglang/launch_server_llavavid.py +0 -0
  56. {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/constrained/base_cache.py +0 -0
  57. {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/constrained/fsm_cache.py +0 -0
  58. {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/constrained/jump_forward.py +0 -0
  59. {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/conversation.py +0 -0
  60. {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/flush_cache.py +0 -0
  61. {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/layers/context_flashattention_nopad.py +0 -0
  62. {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/managers/controller/dp_worker.py +0 -0
  63. {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/managers/controller/manager_multi.py +0 -0
  64. {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/managers/controller/radix_cache.py +0 -0
  65. {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/managers/controller/schedule_heuristic.py +0 -0
  66. {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/mm_utils.py +0 -0
  67. {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/models/chatglm.py +0 -0
  68. {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/models/commandr.py +0 -0
  69. {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/models/dbrx.py +0 -0
  70. {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/models/gemma.py +0 -0
  71. {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/models/grok.py +0 -0
  72. {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/models/llava.py +0 -0
  73. {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/models/llavavid.py +0 -0
  74. {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/models/mistral.py +0 -0
  75. {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/models/mixtral.py +0 -0
  76. {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/models/mixtral_quant.py +0 -0
  77. {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/models/qwen.py +0 -0
  78. {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/models/qwen2.py +0 -0
  79. {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/models/stablelm.py +0 -0
  80. {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/models/yivl.py +0 -0
  81. {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/sampling_params.py +0 -0
  82. {sglang-0.1.18 → sglang-0.1.20}/sglang/test/test_conversation.py +0 -0
  83. {sglang-0.1.18 → sglang-0.1.20}/sglang/test/test_openai_protocol.py +0 -0
  84. {sglang-0.1.18 → sglang-0.1.20}/sglang/test/test_programs.py +0 -0
  85. {sglang-0.1.18 → sglang-0.1.20}/sglang/test/test_utils.py +0 -0
  86. {sglang-0.1.18 → sglang-0.1.20}/sglang/utils.py +0 -0
  87. {sglang-0.1.18 → sglang-0.1.20}/sglang.egg-info/dependency_links.txt +0 -0
  88. {sglang-0.1.18 → sglang-0.1.20}/sglang.egg-info/top_level.txt +0 -0
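For readers who want to reproduce this comparison locally, here is a minimal sketch (assuming `pip` and a system `diff` are available; the directory names are illustrative):

```python
import pathlib
import subprocess
import tarfile

# Download the two sdists from PyPI (source only, no dependencies).
for version in ("0.1.18", "0.1.20"):
    subprocess.run(
        ["pip", "download", f"sglang=={version}",
         "--no-deps", "--no-binary", ":all:", "-d", "sdists"],
        check=True,
    )

# Unpack both archives into one directory.
for tar_path in pathlib.Path("sdists").glob("sglang-*.tar.gz"):
    with tarfile.open(tar_path) as tf:
        tf.extractall("sdists/unpacked")

# Recursive diff of the two source trees.
subprocess.run(
    ["diff", "-ru", "sdists/unpacked/sglang-0.1.18", "sdists/unpacked/sglang-0.1.20"]
)
```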
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.1.18
+ Version: 0.1.20
  Summary: A structured generation langauge for LLMs.
  License: Apache License
  Version 2.0, January 2004
@@ -229,7 +229,7 @@ Requires-Dist: torch; extra == "srt"
  Requires-Dist: uvicorn; extra == "srt"
  Requires-Dist: uvloop; extra == "srt"
  Requires-Dist: zmq; extra == "srt"
- Requires-Dist: vllm==0.5.0; extra == "srt"
+ Requires-Dist: vllm==0.5.1; extra == "srt"
  Requires-Dist: outlines>=0.0.44; extra == "srt"
  Provides-Extra: openai
  Requires-Dist: openai>=1.0; extra == "openai"
@@ -257,7 +257,7 @@ It makes your interaction with LLMs faster and more controllable by co-designing

  The core features include:
  - **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
- - **High-Performance Backend Runtime**: Features RadixAttention for accelerating complex LLM programs by reusing the KV cache across multiple calls. It can also serve as a standalone engine with all common techniques implemented (e.g., continuous batching and tensor parallelism).
+ - **High-Performance Backend Runtime**: Features RadixAttention for accelerating complex LLM programs by reusing the KV cache across multiple calls. It can also serve as a standalone inference engine with all common techniques implemented (e.g., continuous batching and tensor parallelism).

  ## News
  - [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
@@ -288,15 +288,21 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
  git clone https://github.com/sgl-project/sglang.git
  cd sglang

- pip install --upgrade pip
  pip install -e "python[all]"

  # Install FlashInfer CUDA kernels
  pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
  ```

- ### Notes
- - If you see errors from the Triton compiler, please install the [Triton Nightly](https://triton-lang.org/main/getting-started/installation.html).
+ ### Method 3: Using docker
+ The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags).
+
+ ### Common Notes
+ - If you see errors from the Triton compiler, please install the [Triton Nightly](https://triton-lang.org/main/getting-started/installation.html) by
+ ```
+ pip uninstall -y triton triton-nightly
+ pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/ triton-nightly
+ ```
  - If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
  - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.

@@ -518,8 +524,8 @@ for out in state.text_iter():
  ```

  ### Tips and Implementation Details
- - The `choices` argument in `sgl.gen` is implemented by computing the normalized log probabilities of all choices and selecting the one with the highest probability.
- - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex.
+ - The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
+ - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.

  ## Backend: SGLang Runtime (SRT)
  The SGLang Runtime (SRT) is designed to work best with the SGLang frontend.
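The reworded tip above points to token-length normalization. As a hypothetical illustration of what that means (not SGLang's internal code), selecting among choices by mean per-token log probability looks roughly like this:

```python
def pick_choice(choice_token_logprobs):
    """Pick the candidate with the highest length-normalized log probability.

    choice_token_logprobs maps each candidate string to the per-token log
    probabilities a model assigned to it (illustrative numbers below).
    """
    def normalized(lps):
        return sum(lps) / max(len(lps), 1)  # mean log prob per token

    return max(choice_token_logprobs, key=lambda c: normalized(choice_token_logprobs[c]))

# On total log prob the longer candidate would lose (-0.4 vs -0.2); with
# length normalization it wins because its per-token confidence is higher.
scores = {
    "Paris": [-0.2],
    "The city of Paris": [-0.1, -0.1, -0.1, -0.1],
}
print(pick_choice(scores))  # -> "The city of Paris" (mean -0.1 vs -0.2)
```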
@@ -576,7 +582,6 @@ response = client.chat.completions.create(
  print(response)
  ```

-
  By default, the server uses the chat template specified in the model tokenizer from Hugging Face. It should just work for most official models such as Llama-2/Llama-3.

  If needed, you can also override the chat template when launching the server:
@@ -605,7 +610,7 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
  ```

  ### Additional Arguments
- - Add `--tp 2` to enable tensor parallelism.
+ - Add `--tp 2` to enable tensor parallelism. If it indicates `peer access is not supported between these two devices`, add `--enable-p2p-check` option.
  ```
  python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --tp 2
  ```
@@ -623,9 +628,8 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
  - Llama
  - Mistral
  - Mixtral
- - Qwen / Qwen 2
- - Gemma
- - Please add a new flag `--attention-reduce-in-fp32` to avoid some precision errors.
+ - Qwen / Qwen 2 / Qwen 2 MoE
+ - Gemma / Gemma 2
  - `python -m sglang.launch_server --model-path google/gemma-7b-it --port 30000 --attention-reduce-in-fp32`
  - LLaVA
  - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
@@ -638,6 +642,8 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
  - StableLM
  - Command-R
  - DBRX
+ - Grok
+ - ChatGLM
  - AWQ/GPTQ/Marlin quantization

  Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).
@@ -11,7 +11,7 @@ It makes your interaction with LLMs faster and more controllable by co-designing

  The core features include:
  - **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
- - **High-Performance Backend Runtime**: Features RadixAttention for accelerating complex LLM programs by reusing the KV cache across multiple calls. It can also serve as a standalone engine with all common techniques implemented (e.g., continuous batching and tensor parallelism).
+ - **High-Performance Backend Runtime**: Features RadixAttention for accelerating complex LLM programs by reusing the KV cache across multiple calls. It can also serve as a standalone inference engine with all common techniques implemented (e.g., continuous batching and tensor parallelism).

  ## News
  - [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
@@ -42,15 +42,21 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
  git clone https://github.com/sgl-project/sglang.git
  cd sglang

- pip install --upgrade pip
  pip install -e "python[all]"

  # Install FlashInfer CUDA kernels
  pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
  ```

- ### Notes
- - If you see errors from the Triton compiler, please install the [Triton Nightly](https://triton-lang.org/main/getting-started/installation.html).
+ ### Method 3: Using docker
+ The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags).
+
+ ### Common Notes
+ - If you see errors from the Triton compiler, please install the [Triton Nightly](https://triton-lang.org/main/getting-started/installation.html) by
+ ```
+ pip uninstall -y triton triton-nightly
+ pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/ triton-nightly
+ ```
  - If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
  - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.

@@ -272,8 +278,8 @@ for out in state.text_iter():
  ```

  ### Tips and Implementation Details
- - The `choices` argument in `sgl.gen` is implemented by computing the normalized log probabilities of all choices and selecting the one with the highest probability.
- - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex.
+ - The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
+ - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.

  ## Backend: SGLang Runtime (SRT)
  The SGLang Runtime (SRT) is designed to work best with the SGLang frontend.
@@ -330,7 +336,6 @@ response = client.chat.completions.create(
  print(response)
  ```

-
  By default, the server uses the chat template specified in the model tokenizer from Hugging Face. It should just work for most official models such as Llama-2/Llama-3.

  If needed, you can also override the chat template when launching the server:
@@ -359,7 +364,7 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
  ```

  ### Additional Arguments
- - Add `--tp 2` to enable tensor parallelism.
+ - Add `--tp 2` to enable tensor parallelism. If it indicates `peer access is not supported between these two devices`, add `--enable-p2p-check` option.
  ```
  python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --tp 2
  ```
@@ -377,9 +382,8 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
  - Llama
  - Mistral
  - Mixtral
- - Qwen / Qwen 2
- - Gemma
- - Please add a new flag `--attention-reduce-in-fp32` to avoid some precision errors.
+ - Qwen / Qwen 2 / Qwen 2 MoE
+ - Gemma / Gemma 2
  - `python -m sglang.launch_server --model-path google/gemma-7b-it --port 30000 --attention-reduce-in-fp32`
  - LLaVA
  - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
@@ -392,6 +396,8 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
  - StableLM
  - Command-R
  - DBRX
+ - Grok
+ - ChatGLM
  - AWQ/GPTQ/Marlin quantization

  Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "sglang"
- version = "0.1.18"
+ version = "0.1.20"
  description = "A structured generation langauge for LLMs."
  readme = "README.md"
  requires-python = ">=3.8"
@@ -21,7 +21,7 @@ dependencies = [

  [project.optional-dependencies]
  srt = ["aiohttp", "fastapi", "hf_transfer", "huggingface_hub", "interegular", "packaging", "pillow",
- "psutil", "pydantic", "rpyc", "torch", "uvicorn", "uvloop", "zmq", "vllm==0.5.0", "outlines>=0.0.44"]
+ "psutil", "pydantic", "rpyc", "torch", "uvicorn", "uvloop", "zmq", "vllm==0.5.1", "outlines>=0.0.44"]
  openai = ["openai>=1.0", "tiktoken"]
  anthropic = ["anthropic>=0.20.0"]
  litellm = ["litellm>=1.0.0"]
@@ -1,4 +1,4 @@
- __version__ = "0.1.18"
+ __version__ = "0.1.20"

  # SGL API Components
  from sglang.api import (
@@ -67,10 +67,16 @@ def gen(
  frequency_penalty: Optional[float] = None,
  presence_penalty: Optional[float] = None,
  ignore_eos: Optional[bool] = None,
+ return_logprob: Optional[bool] = None,
+ logprob_start_len: Optional[int] = None,
+ top_logprobs_num: Optional[int] = None,
+ return_text_in_logprobs: Optional[bool] = None,
  dtype: Optional[type] = None,
  choices: Optional[List[str]] = None,
  regex: Optional[str] = None,
  ):
+ """Call the model to generate. See the meaning of the arguments in docs/sampling_params.md"""
+
  if choices:
  return SglSelect(name, choices, 0.0 if temperature is None else temperature)

@@ -91,6 +97,10 @@ def gen(
  frequency_penalty,
  presence_penalty,
  ignore_eos,
+ return_logprob,
+ logprob_start_len,
+ top_logprobs_num,
+ return_text_in_logprobs,
  dtype,
  regex,
  )
@@ -106,6 +116,10 @@ def gen_int(
  frequency_penalty: Optional[float] = None,
  presence_penalty: Optional[float] = None,
  ignore_eos: Optional[bool] = None,
+ return_logprob: Optional[bool] = None,
+ logprob_start_len: Optional[int] = None,
+ top_logprobs_num: Optional[int] = None,
+ return_text_in_logprobs: Optional[bool] = None,
  ):
  return SglGen(
  name,
@@ -117,6 +131,10 @@ def gen_int(
  frequency_penalty,
  presence_penalty,
  ignore_eos,
+ return_logprob,
+ logprob_start_len,
+ top_logprobs_num,
+ return_text_in_logprobs,
  int,
  None,
  )
@@ -132,6 +150,10 @@ def gen_string(
  frequency_penalty: Optional[float] = None,
  presence_penalty: Optional[float] = None,
  ignore_eos: Optional[bool] = None,
+ return_logprob: Optional[bool] = None,
+ logprob_start_len: Optional[int] = None,
+ top_logprobs_num: Optional[int] = None,
+ return_text_in_logprobs: Optional[bool] = None,
  ):
  return SglGen(
  name,
@@ -143,6 +165,10 @@ def gen_string(
  frequency_penalty,
  presence_penalty,
  ignore_eos,
+ return_logprob,
+ logprob_start_len,
+ top_logprobs_num,
+ return_text_in_logprobs,
  str,
  None,
  )
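For context on how these new logprob fields surface in the frontend, here is a hedged usage sketch. The argument names come straight from this diff; where the logprob output ends up (e.g., attached to the generation's meta info) is an assumption about the runtime, not something shown here:

```python
import sglang as sgl

@sgl.function
def qa(s, question):
    s += "Q: " + question + "\nA: "
    s += sgl.gen(
        "answer",
        max_tokens=64,
        return_logprob=True,           # new in 0.1.20
        top_logprobs_num=5,            # new in 0.1.20
        return_text_in_logprobs=True,  # new in 0.1.20
    )

# Assumes a local SRT server, e.g.
# python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
state = qa.run(question="What is the capital of France?")
print(state["answer"])
# Assumption: logprob details are returned alongside the generation's meta info.
print(state.get_meta_info("answer"))
```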
@@ -1,18 +1,18 @@
  import json
- from typing import Callable, List, Optional, Union
+ from typing import List, Optional

  import numpy as np
- import requests

  from sglang.backend.base_backend import BaseBackend
  from sglang.global_config import global_config
  from sglang.lang.chat_template import get_chat_template_by_model_path
  from sglang.lang.interpreter import StreamExecutor
- from sglang.lang.ir import SglArgument, SglSamplingParams
- from sglang.utils import encode_image_base64, find_printable_text, http_request
+ from sglang.lang.ir import SglSamplingParams
+ from sglang.utils import http_request


  class RuntimeEndpoint(BaseBackend):
+
  def __init__(
  self,
  base_url: str,
@@ -38,8 +38,7 @@ class RuntimeEndpoint(BaseBackend):
  self.model_info = res.json()

  self.chat_template = get_chat_template_by_model_path(
- self.model_info["model_path"]
- )
+ self.model_info["model_path"])

  def get_model_name(self):
  return self.model_info["model_path"]
@@ -125,6 +124,11 @@ class RuntimeEndpoint(BaseBackend):
  else:
  raise RuntimeError(f"Invalid dtype: {sampling_params.dtype}")

+ for item in ["return_logprob", "logprob_start_len", "top_logprobs_num", "return_text_in_logprobs"]:
+ value = getattr(sampling_params, item, None)
+ if value is not None:
+ data[item] = value
+
  self._add_images(s, data)

  res = http_request(
@@ -167,6 +171,11 @@ class RuntimeEndpoint(BaseBackend):
  else:
  raise RuntimeError(f"Invalid dtype: {sampling_params.dtype}")

+ for item in ["return_logprob", "logprob_start_len", "top_logprobs_num", "return_text_in_logprobs"]:
+ value = getattr(sampling_params, item, None)
+ if value is not None:
+ data[item] = value
+
  data["stream"] = True
  self._add_images(s, data)

@@ -181,21 +190,16 @@ class RuntimeEndpoint(BaseBackend):
  self._assert_success(res)
  pos = 0

- incomplete_text = ""
  for chunk in res.iter_lines(decode_unicode=False):
  chunk = chunk.decode("utf-8")
  if chunk and chunk.startswith("data:"):
  if chunk == "data: [DONE]":
  break
  data = json.loads(chunk[5:].strip("\n"))
- text = find_printable_text(data["text"][pos:])
+ chunk_text = data["text"][pos:]
  meta_info = data["meta_info"]
- pos += len(text)
- incomplete_text = data["text"][pos:]
- yield text, meta_info
-
- if len(incomplete_text) > 0:
- yield incomplete_text, meta_info
+ pos += len(chunk_text)
+ yield chunk_text, meta_info

  def select(
  self,
@@ -32,6 +32,7 @@ import logging
  import multiprocessing
  import time

+
  import numpy as np
  import torch
  import torch.distributed as dist
@@ -70,6 +71,7 @@ class BenchArgs:

  def load_model(server_args, tp_rank):
  suppress_other_loggers()
+ rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None

  model_config = ModelConfig(path=server_args.model_path)
  model_runner = ModelRunner(
@@ -81,7 +83,7 @@ def load_model(server_args, tp_rank):
  nccl_port=28888,
  server_args=server_args,
  )
- print(f"max_total_num_tokens={model_runner.max_total_num_tokens}")
+ rank_print(f"max_total_num_tokens={model_runner.max_total_num_tokens}")
  tokenizer = get_tokenizer(
  server_args.tokenizer_path,
  tokenizer_mode=server_args.tokenizer_mode,
@@ -108,7 +110,7 @@ def prepare_inputs(bench_args, tokenizer):
  for i in range(len(prompts)):
  assert len(input_ids[i]) > bench_args.cut_len

- tmp_input_ids = input_ids[i][:bench_args.cut_len]
+ tmp_input_ids = input_ids[i][: bench_args.cut_len]
  req = Req(rid=i, origin_input_text=prompts[i], origin_input_ids=tmp_input_ids)
  req.prefix_indices = []
  req.sampling_params = sampling_params
@@ -121,9 +123,9 @@ def prepare_inputs(bench_args, tokenizer):
  def prepare_extend_inputs(bench_args, input_ids, reqs, model_runner):
  for i in range(len(reqs)):
  req = reqs[i]
- req.input_ids += input_ids[i][bench_args.cut_len:]
+ req.input_ids += input_ids[i][bench_args.cut_len :]
  req.prefix_indices = model_runner.req_to_token_pool.req_to_token[
- i, :bench_args.cut_len
+ i, : bench_args.cut_len
  ]
  return reqs

@@ -151,7 +153,8 @@ def extend(reqs, model_runner):
  reqs=reqs,
  req_to_token_pool=model_runner.req_to_token_pool,
  token_to_kv_pool=model_runner.token_to_kv_pool,
- tree_cache=None)
+ tree_cache=None,
+ )
  batch.prepare_for_extend(model_runner.model_config.vocab_size, None)
  output = model_runner.forward(batch, ForwardMode.EXTEND)
  next_token_ids, _ = batch.sample(output.next_token_logits)
@@ -165,6 +168,7 @@ def decode(input_token_ids, batch, model_runner):
  return next_token_ids, output.next_token_logits


+ @torch.inference_mode()
  def correctness_test(
  server_args,
  bench_args,
@@ -178,9 +182,10 @@ def correctness_test(
  # Prepare inputs
  input_ids, reqs = prepare_inputs(bench_args, tokenizer)

- # Prefill
- next_token_ids, next_token_logits, batch = extend(reqs, model_runner)
- rank_print("prefill logits (first half)", next_token_logits)
+ if bench_args.cut_len > 0:
+ # Prefill
+ next_token_ids, next_token_logits, batch = extend(reqs, model_runner)
+ rank_print("prefill logits (first half)", next_token_logits)

  # Prepare extend inputs
  reqs = prepare_extend_inputs(bench_args, input_ids, reqs, model_runner)
@@ -190,7 +195,7 @@ def correctness_test(
  rank_print("prefill logits (final)", next_token_logits)

  # Decode
- output_ids = [list(req.input_ids) for req in reqs]
+ output_ids = [input_ids[i] + [next_token_ids[i]] for i in range(len(input_ids))]
  for _ in range(bench_args.output_len):
  next_token_ids, _ = decode(next_token_ids, batch, model_runner)
  for i in range(len(reqs)):
@@ -198,7 +203,7 @@

  # Print
  for i in range(len(reqs)):
- print(tokenizer.decode(output_ids[i]))
+ rank_print(tokenizer.decode(output_ids[i]))


  def latency_test(
@@ -210,7 +215,9 @@ def latency_test(

  # Load the model
  model_runner, tokenizer = load_model(server_args, tp_rank)
- print(f"max_batch_size={model_runner.max_total_num_tokens // (bench_args.input_len + bench_args.output_len)}")
+ rank_print(
+ f"max_batch_size={model_runner.max_total_num_tokens // (bench_args.input_len + bench_args.output_len)}"
+ )

  # Prepare inputs
  reqs = prepare_synthetic_inputs(bench_args, tokenizer)
@@ -230,7 +237,9 @@ def latency_test(
  prefill_latency = time.time() - tic
  tot_latency += prefill_latency
  throughput = bench_args.input_len * bench_args.batch_size / prefill_latency
- rank_print(f"Prefill. latency: {prefill_latency:6.5f} s, throughput: {throughput:9.2f} token/s")
+ rank_print(
+ f"Prefill. latency: {prefill_latency:6.5f} s, throughput: {throughput:9.2f} token/s"
+ )

  # Decode
  for i in range(output_len):
@@ -241,13 +250,24 @@
  latency = time.time() - tic
  tot_latency += latency
  throughput = bench_args.batch_size / latency
- if i < 5: rank_print(f"Decode. latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s")
+ if i < 5:
+ rank_print(
+ f"Decode. latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
+ )
  avg_decode_latency = (tot_latency - prefill_latency) / output_len
  avg_decode_throughput = bench_args.batch_size / avg_decode_latency
- rank_print(f"Decode. avg latency: {avg_decode_latency:6.5f} s, avg throughput: {avg_decode_throughput:9.2f} token/s")
-
- throughput = (bench_args.input_len + bench_args.output_len) * bench_args.batch_size / tot_latency
- rank_print(f"Total. latency: {tot_latency:6.3f} s, throughput: {throughput:9.2f} token/s")
+ rank_print(
+ f"Decode. avg latency: {avg_decode_latency:6.5f} s, avg throughput: {avg_decode_throughput:9.2f} token/s"
+ )
+
+ throughput = (
+ (bench_args.input_len + bench_args.output_len)
+ * bench_args.batch_size
+ / tot_latency
+ )
+ rank_print(
+ f"Total. latency: {tot_latency:6.3f} s, throughput: {throughput:9.2f} token/s"
+ )

  # Warm up
  run_once(4)
@@ -281,6 +301,8 @@ def main(server_args, bench_args):
  for proc in workers:
  proc.join()

+ proc.terminate()
+

  if __name__ == "__main__":
  parser = argparse.ArgumentParser()
@@ -296,4 +318,4 @@ if __name__ == "__main__":
  format="%(message)s",
  )

- main(server_args, bench_args)
+ main(server_args, bench_args)
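The recurring change in this file is routing console output through a rank-aware helper so that only tensor-parallel rank 0 prints. A standalone sketch of that idiom (hypothetical names, independent of the benchmark script):

```python
def make_rank_print(tp_rank):
    # Rank 0 keeps the real print; every other rank gets a no-op with the same signature.
    return print if tp_rank == 0 else (lambda *args, **kwargs: None)

rank_print = make_rank_print(tp_rank=0)
rank_print("max_total_num_tokens=65536")        # printed once, by rank 0
make_rank_print(tp_rank=1)("duplicate output")  # silently dropped on other ranks
```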
@@ -8,35 +8,40 @@ class GlobalConfig:
  # 2: output final text after every run
  self.verbosity = 0

+ # Default backend of the language
  self.default_backend = None

- # Output configs
+ # Runtime constants: Request dependency time due to network delay
+ self.request_dependency_delay = 0.02
+ self.wait_for_new_request_delay = 0.0006
+
+ # Runtime constants: New generation token ratio estimation
+ self.base_new_token_ratio = 0.4
+ self.base_min_new_token_ratio = 0.2
+ self.new_token_ratio_decay = 0.0001
+ self.new_token_ratio_recovery = 0.05
+
+ # Runtime constants: The threshold (number of tokens) to trigger layer-wise cuda sync.
+ # This can improve the speed for large batch sizes during prefill.
+ self.layer_sync_threshold = 8192
+
+ # Runtime constants: Flashinfer
+ self.flashinfer_workspace_size = 192 * 1024 * 1024
+
+ # Output tokenization configs
  self.skip_special_tokens_in_output = True
  self.spaces_between_special_tokens_in_out = True

- # Optimization configs
+ # Interpreter optimization configs
  self.eager_fill_image = False
  self.enable_precache_with_tracing = True
  self.enable_parallel_encoding = True
  self.enable_parallel_decoding = True

+ # Deprecated
  # Choices: ["no_adjust", "adjust_cache"]
  # no_adjust: Do not adjust the position embedding of KV cache.
  # adjust_cache: Adjust the position embedding of KV cache.
  self.concate_and_append_mode = "no_adjust"

- # Request dependency time due to network delay
- self.request_dependency_delay = 0.02
- self.wait_for_new_request_delay = 0.0006
-
- # New generation token ratio estimation
- self.base_new_token_ratio = 0.4
- self.base_min_new_token_ratio = 0.2
- self.new_token_ratio_decay = 0.0001
- self.new_token_ratio_recovery = 0.05
-
- # The threshold (number of tokens) to trigger layer-wise cuda sync.
- # This can improve the speed for large batch sizes during prefill.
- self.layer_sync_threshold = 8192
-
  global_config = GlobalConfig()
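Because these settings are plain attributes on a module-level singleton, the regrouped runtime constants can still be overridden from user code before the runtime reads them. A hedged sketch using only names visible in this diff (the values are illustrative, not recommended defaults):

```python
from sglang.global_config import global_config

# Runtime constants shown above; override early, before the runtime consumes them.
global_config.layer_sync_threshold = 4096                    # trigger layer-wise CUDA sync earlier
global_config.flashinfer_workspace_size = 256 * 1024 * 1024  # field added in 0.1.20
print(global_config.base_new_token_ratio)                    # 0.4 by default
```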
@@ -84,7 +84,7 @@ register_chat_template(
  "system": ("SYSTEM:", "\n"),
  "user": ("USER:", "\n"),
  "assistant": ("ASSISTANT:", "\n"),
- },
+ }
  )
  )

@@ -116,6 +116,23 @@ register_chat_template(
  )
  )

+ # There is default system prompt for qwen
+ # reference: https://modelscope.cn/models/qwen/Qwen2-72B-Instruct/file/view/master?fileName=tokenizer_config.json&status=1
+ # The chat template is: "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
+ register_chat_template(
+ ChatTemplate(
+ name="qwen",
+ default_system_prompt="You are a helpful assistant.",
+ role_prefix_and_suffix={
+ "system": ("<|im_start|>system\n", "<|im_end|>\n"),
+ "user": ("<|im_start|>user\n", "<|im_end|>\n"),
+ "assistant": ("<|im_start|>assistant\n", "<|im_end|>\n"),
+ },
+ style=ChatTemplateStyle.PLAIN,
+ stop_str=("<|im_end|>",),
+ )
+ )
+

  register_chat_template(
  ChatTemplate(
@@ -132,6 +149,7 @@ register_chat_template(
  )
  )

+ # Reference: https://github.com/lm-sys/FastChat/blob/main/docs/vicuna_weights_version.md#prompt-template
  register_chat_template(
  ChatTemplate(
  name="vicuna_v1.1",
@@ -148,6 +166,20 @@ register_chat_template(
  )
  )

+ # Reference: https://modelscope.cn/models/01ai/Yi-1.5-34B-Chat/file/view/master?fileName=tokenizer_config.json&status=1
+ register_chat_template(
+ ChatTemplate(
+ name="yi-1.5",
+ default_system_prompt=None,
+ role_prefix_and_suffix={
+ "system": ("", ""),
+ "user": ("<|im_start|>user\n", "<|im_end|>\n<|im_start|>assistant\n"),
+ "assistant": ("", "<|im_end|>\n"),
+ },
+ style=ChatTemplateStyle.PLAIN,
+ stop_str=("<|im_end|>",)
+ )
+ )

  register_chat_template(
  ChatTemplate(
@@ -187,7 +219,7 @@ register_chat_template(
  # Reference: https://github.com/01-ai/Yi/tree/main/VL#major-difference-with-llava
  register_chat_template(
  ChatTemplate(
- name="yi",
+ name="yi-vl",
  default_system_prompt=(
  "This is a chat between an inquisitive human and an AI assistant. Assume the role of the AI assistant. Read all the images carefully, and respond to the human's questions with informative, helpful, detailed and polite answers."
  "这是一个好奇的人类和一个人工智能助手之间的对话。假设你扮演这个AI助手的角色。仔细阅读所有的图像,并对人类的问题做出信息丰富、有帮助、详细的和礼貌的回答。"
@@ -289,8 +321,9 @@ def match_chat_ml(model_path: str):
  model_path = model_path.lower()
  if "tinyllama" in model_path:
  return get_chat_template("chatml")
- if "qwen" in model_path and "chat" in model_path:
- return get_chat_template("chatml")
+ # Now the suffix for qwen2 chat model is "instruct"
+ if "qwen" in model_path and ("chat" in model_path or "instruct" in model_path):
+ return get_chat_template("qwen")
  if (
  "llava-v1.6-34b" in model_path
  or "llava-v1.6-yi-34b" in model_path
@@ -302,8 +335,10 @@ def match_chat_ml(model_path: str):
  @register_chat_template_matching_function
  def match_chat_yi(model_path: str):
  model_path = model_path.lower()
- if "yi" in model_path and "llava" not in model_path:
- return get_chat_template("yi")
+ if "yi-vl" in model_path and "llava" not in model_path:
+ return get_chat_template("yi-vl")
+ elif "yi-1.5" in model_path and "chat" in model_path:
+ return get_chat_template("yi-1.5")


  @register_chat_template_matching_function
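To check which template a given model path now resolves to, the registry can be queried directly. A hedged sketch using the functions shown in this diff (the printed values reflect the registrations above; matching for other paths may differ):

```python
from sglang.lang.chat_template import (
    get_chat_template,
    get_chat_template_by_model_path,
)

# The newly registered "qwen" template carries a default system prompt.
qwen = get_chat_template("qwen")
print(qwen.default_system_prompt)  # "You are a helpful assistant."

# "instruct" suffixes now map to the qwen template instead of falling back to chatml,
# and Yi-1.5 chat models get their own template.
print(get_chat_template_by_model_path("Qwen/Qwen2-7B-Instruct").name)  # expected: qwen
print(get_chat_template_by_model_path("01-ai/Yi-1.5-34B-Chat").name)   # expected: yi-1.5
```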