sglang 0.2.5.tar.gz → 0.2.7.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103)
  1. {sglang-0.2.5/sglang.egg-info → sglang-0.2.7}/PKG-INFO +40 -12
  2. {sglang-0.2.5 → sglang-0.2.7}/README.md +39 -11
  3. {sglang-0.2.5 → sglang-0.2.7}/pyproject.toml +1 -1
  4. {sglang-0.2.5 → sglang-0.2.7}/sglang/__init__.py +33 -26
  5. {sglang-0.2.5 → sglang-0.2.7}/sglang/api.py +9 -1
  6. {sglang-0.2.5 → sglang-0.2.7}/sglang/bench_latency.py +2 -2
  7. {sglang-0.2.5 → sglang-0.2.7}/sglang/bench_serving.py +10 -1
  8. {sglang-0.2.5 → sglang-0.2.7}/sglang/check_env.py +1 -1
  9. {sglang-0.2.5 → sglang-0.2.7}/sglang/lang/backend/litellm.py +1 -1
  10. {sglang-0.2.5 → sglang-0.2.7}/sglang/lang/backend/openai.py +1 -1
  11. {sglang-0.2.5 → sglang-0.2.7}/sglang/lang/backend/runtime_endpoint.py +4 -4
  12. {sglang-0.2.5 → sglang-0.2.7}/sglang/lang/interpreter.py +24 -9
  13. {sglang-0.2.5 → sglang-0.2.7}/sglang/lang/ir.py +1 -1
  14. {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/constrained/__init__.py +15 -0
  15. {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/constrained/base_cache.py +15 -0
  16. sglang-0.2.7/sglang/srt/constrained/fsm_cache.py +66 -0
  17. {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/constrained/jump_forward.py +15 -0
  18. {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/conversation.py +26 -0
  19. {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/hf_transformers_utils.py +18 -1
  20. {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/layers/context_flashattention_nopad.py +15 -0
  21. {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/layers/extend_attention.py +15 -0
  22. {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/layers/fused_moe.py +15 -0
  23. {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/layers/linear.py +15 -0
  24. {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/layers/logits_processor.py +109 -72
  25. {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/layers/quantization/__init__.py +15 -0
  26. {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/layers/quantization/fp8.py +15 -0
  27. {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/layers/radix_attention.py +21 -3
  28. {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/layers/token_attention.py +16 -1
  29. sglang-0.2.5/sglang/srt/managers/controller/manager_multi.py → sglang-0.2.7/sglang/srt/managers/controller_multi.py +17 -2
  30. sglang-0.2.5/sglang/srt/managers/controller/manager_single.py → sglang-0.2.7/sglang/srt/managers/controller_single.py +17 -2
  31. {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/managers/detokenizer_manager.py +16 -1
  32. {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/managers/io_struct.py +38 -5
  33. sglang-0.2.5/sglang/srt/managers/controller/schedule_heuristic.py → sglang-0.2.7/sglang/srt/managers/policy_scheduler.py +37 -22
  34. sglang-0.2.5/sglang/srt/managers/controller/infer_batch.py → sglang-0.2.7/sglang/srt/managers/schedule_batch.py +85 -25
  35. {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/managers/tokenizer_manager.py +99 -57
  36. {sglang-0.2.5/sglang/srt/managers/controller → sglang-0.2.7/sglang/srt/managers}/tp_worker.py +177 -81
  37. sglang-0.2.7/sglang/srt/mem_cache/flush_cache.py +33 -0
  38. {sglang-0.2.5/sglang/srt → sglang-0.2.7/sglang/srt/mem_cache}/memory_pool.py +16 -1
  39. {sglang-0.2.5/sglang/srt/managers/controller → sglang-0.2.7/sglang/srt/mem_cache}/radix_cache.py +15 -0
  40. {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/mm_utils.py +15 -0
  41. {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/model_config.py +20 -0
  42. {sglang-0.2.5/sglang/srt/managers/controller → sglang-0.2.7/sglang/srt/model_executor}/cuda_graph_runner.py +42 -18
  43. {sglang-0.2.5/sglang/srt/managers/controller → sglang-0.2.7/sglang/srt/model_executor}/model_runner.py +51 -16
  44. {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/model_loader/model_loader.py +15 -0
  45. {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/model_loader/utils.py +16 -1
  46. {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/models/chatglm.py +16 -1
  47. {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/models/commandr.py +16 -1
  48. {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/models/dbrx.py +16 -1
  49. {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/models/deepseek.py +16 -1
  50. sglang-0.2.7/sglang/srt/models/deepseek_v2.py +532 -0
  51. {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/models/gemma.py +16 -1
  52. {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/models/gemma2.py +16 -1
  53. {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/models/gpt_bigcode.py +16 -1
  54. {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/models/grok.py +16 -1
  55. {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/models/internlm2.py +16 -1
  56. {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/models/llama2.py +16 -1
  57. {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/models/llama_classification.py +19 -4
  58. {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/models/llava.py +17 -2
  59. {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/models/llavavid.py +17 -2
  60. {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/models/minicpm.py +16 -1
  61. sglang-0.2.7/sglang/srt/models/mistral.py +26 -0
  62. {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/models/mixtral.py +16 -1
  63. {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/models/mixtral_quant.py +16 -1
  64. {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/models/qwen.py +16 -1
  65. {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/models/qwen2.py +16 -1
  66. {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/models/qwen2_moe.py +16 -1
  67. {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/models/stablelm.py +16 -1
  68. {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/models/yivl.py +15 -0
  69. sglang-0.2.7/sglang/srt/openai_api/adapter.py +822 -0
  70. {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/openai_api/protocol.py +65 -1
  71. {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/sampling_params.py +20 -4
  72. {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/server.py +90 -37
  73. {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/server_args.py +76 -17
  74. {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/utils.py +15 -0
  75. {sglang-0.2.5 → sglang-0.2.7}/sglang/test/test_programs.py +5 -1
  76. {sglang-0.2.5 → sglang-0.2.7}/sglang/utils.py +22 -0
  77. sglang-0.2.7/sglang/version.py +1 -0
  78. {sglang-0.2.5 → sglang-0.2.7/sglang.egg-info}/PKG-INFO +40 -12
  79. {sglang-0.2.5 → sglang-0.2.7}/sglang.egg-info/SOURCES.txt +11 -10
  80. sglang-0.2.5/sglang/srt/constrained/fsm_cache.py +0 -31
  81. sglang-0.2.5/sglang/srt/flush_cache.py +0 -18
  82. sglang-0.2.5/sglang/srt/models/mistral.py +0 -11
  83. sglang-0.2.5/sglang/srt/openai_api/adapter.py +0 -437
  84. sglang-0.2.5/sglang/version.py +0 -1
  85. {sglang-0.2.5 → sglang-0.2.7}/LICENSE +0 -0
  86. {sglang-0.2.5 → sglang-0.2.7}/setup.cfg +0 -0
  87. {sglang-0.2.5 → sglang-0.2.7}/sglang/global_config.py +0 -0
  88. {sglang-0.2.5 → sglang-0.2.7}/sglang/lang/__init__.py +0 -0
  89. {sglang-0.2.5 → sglang-0.2.7}/sglang/lang/backend/__init__.py +0 -0
  90. {sglang-0.2.5 → sglang-0.2.7}/sglang/lang/backend/anthropic.py +0 -0
  91. {sglang-0.2.5 → sglang-0.2.7}/sglang/lang/backend/base_backend.py +0 -0
  92. {sglang-0.2.5 → sglang-0.2.7}/sglang/lang/backend/vertexai.py +0 -0
  93. {sglang-0.2.5 → sglang-0.2.7}/sglang/lang/chat_template.py +0 -0
  94. {sglang-0.2.5 → sglang-0.2.7}/sglang/lang/compiler.py +0 -0
  95. {sglang-0.2.5 → sglang-0.2.7}/sglang/lang/tracer.py +0 -0
  96. {sglang-0.2.5 → sglang-0.2.7}/sglang/launch_server.py +0 -0
  97. {sglang-0.2.5 → sglang-0.2.7}/sglang/launch_server_llavavid.py +0 -0
  98. {sglang-0.2.5 → sglang-0.2.7}/sglang/test/test_conversation.py +0 -0
  99. {sglang-0.2.5 → sglang-0.2.7}/sglang/test/test_openai_protocol.py +0 -0
  100. {sglang-0.2.5 → sglang-0.2.7}/sglang/test/test_utils.py +0 -0
  101. {sglang-0.2.5 → sglang-0.2.7}/sglang.egg-info/dependency_links.txt +0 -0
  102. {sglang-0.2.5 → sglang-0.2.7}/sglang.egg-info/requires.txt +0 -0
  103. {sglang-0.2.5 → sglang-0.2.7}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.2.5/sglang.egg-info → sglang-0.2.7}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.2.5
+ Version: 0.2.7
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
  Version 2.0, January 2004
@@ -245,11 +245,18 @@ Requires-Dist: sglang[litellm]; extra == "all"

  <div align="center">
  <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
+
+ [![PyPI](https://img.shields.io/pypi/v/sglang)](https://pypi.org/project/sglang)
+ ![PyPI - Downloads](https://img.shields.io/pypi/dm/sglang)
+ [![license](https://img.shields.io/github/license/sgl-project/sglang.svg)](https://github.com/sgl-project/sglang/tree/main/LICENSE)
+ [![issue resolution](https://img.shields.io/github/issues-closed-raw/sgl-project/sglang)](https://github.com/sgl-project/sglang/issues)
+ [![open issues](https://img.shields.io/github/issues-raw/sgl-project/sglang)](https://github.com/sgl-project/sglang/issues)
+
  </div>

  --------------------------------------------------------------------------------

- | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) |
+ | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) |

  SGLang is a fast serving framework for large language models and vision language models.
  It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
@@ -292,7 +299,8 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/

  ### Method 2: From source
  ```
- git clone https://github.com/sgl-project/sglang.git
+ # Use the stable release branch
+ git clone -b release https://github.com/sgl-project/sglang.git
  cd sglang

  pip install --upgrade pip
@@ -341,7 +349,7 @@ curl http://localhost:30000/generate \
  }
  }'
  ```
- Learn more about the argument format [here](docs/sampling_params.md).
+ Learn more about the argument format [here](docs/en/sampling_params.md).

  ### OpenAI Compatible API
  In addition, the server supports OpenAI-compatible APIs.
@@ -388,7 +396,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  ```
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7
  ```
- - See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
+ - See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
  - Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
  ```
  # Node 0
@@ -397,23 +405,24 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  # Node 1
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1
  ```
- - If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/custom_chat_template.md).
+ - If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
  - To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
  - To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.

  ### Run Llama 3.1 405B

  ```bash
- # 2 nodes run 405B fp16
+ ## Run 405B (fp8) on a single node
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
+
+ ## Run 405B (fp16) on two nodes
  # replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
+
  # on the first node
  GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75

  # on the second
  GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
-
- # single node run 405B fp8
- python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
  ```

  ### Supported Models
@@ -422,6 +431,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instr
  - Mistral / Mixtral
  - Gemma / Gemma 2
  - Qwen / Qwen 2 / Qwen 2 MoE
+ - DeepSeek / DeepSeek 2
  - LLaVA 1.5 / 1.6
  - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
  - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
@@ -438,11 +448,11 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instr
  - InternLM 2
  - Mistral NeMo

- Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).
+ Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).

  ### Benchmark Performance

- - Benchmark a single static batch. Run the following command without launching a server. The arguments are the same as those for `launch_server.py`.
+ - Benchmark a single static batch by running the following command without launching a server. The arguments are the same as those for `launch_server.py`. This is not a dynamic batching server, so it may run out of memory for a batch size that can run successfully with a real server. This is because a real server will truncate the prefill into several batches/chunks, while this unit test does not do this.
  ```
  python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 32 --input-len 256 --output-len 32
  ```
@@ -669,6 +679,24 @@ for out in state.text_iter():
      print(out, end="", flush=True)
  ```

+ #### Roles
+
+ Use `sgl.system`, `sgl.user` and `sgl.assistant` to set roles when using Chat models. You can also define more complex role prompts using begin and end tokens.
+
+ ```python
+ @sgl.function
+ def chat_example(s):
+     s += sgl.system("You are a helpful assistant.")
+     # Same as: s += s.system("You are a helpful assistant.")
+
+     with s.user():
+         s += "Question: What is the capital of France?"
+
+     s += sgl.assistant_begin()
+     s += "Answer: " + sgl.gen(max_tokens=100, stop="\n")
+     s += sgl.assistant_end()
+ ```
+
  #### Tips and Implementation Details
  - The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
  - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.
{sglang-0.2.5 → sglang-0.2.7}/README.md

@@ -1,10 +1,17 @@
  <div align="center">
  <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
+
+ [![PyPI](https://img.shields.io/pypi/v/sglang)](https://pypi.org/project/sglang)
+ ![PyPI - Downloads](https://img.shields.io/pypi/dm/sglang)
+ [![license](https://img.shields.io/github/license/sgl-project/sglang.svg)](https://github.com/sgl-project/sglang/tree/main/LICENSE)
+ [![issue resolution](https://img.shields.io/github/issues-closed-raw/sgl-project/sglang)](https://github.com/sgl-project/sglang/issues)
+ [![open issues](https://img.shields.io/github/issues-raw/sgl-project/sglang)](https://github.com/sgl-project/sglang/issues)
+
  </div>

  --------------------------------------------------------------------------------

- | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) |
+ | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) |

  SGLang is a fast serving framework for large language models and vision language models.
  It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
@@ -47,7 +54,8 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/

  ### Method 2: From source
  ```
- git clone https://github.com/sgl-project/sglang.git
+ # Use the stable release branch
+ git clone -b release https://github.com/sgl-project/sglang.git
  cd sglang

  pip install --upgrade pip
@@ -96,7 +104,7 @@ curl http://localhost:30000/generate \
  }
  }'
  ```
- Learn more about the argument format [here](docs/sampling_params.md).
+ Learn more about the argument format [here](docs/en/sampling_params.md).

  ### OpenAI Compatible API
  In addition, the server supports OpenAI-compatible APIs.
@@ -143,7 +151,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  ```
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7
  ```
- - See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
+ - See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
  - Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
  ```
  # Node 0
@@ -152,23 +160,24 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  # Node 1
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1
  ```
- - If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/custom_chat_template.md).
+ - If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
  - To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
  - To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.

  ### Run Llama 3.1 405B

  ```bash
- # 2 nodes run 405B fp16
+ ## Run 405B (fp8) on a single node
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
+
+ ## Run 405B (fp16) on two nodes
  # replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
+
  # on the first node
  GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75

  # on the second
  GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
-
- # single node run 405B fp8
- python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
  ```

  ### Supported Models
@@ -177,6 +186,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instr
  - Mistral / Mixtral
  - Gemma / Gemma 2
  - Qwen / Qwen 2 / Qwen 2 MoE
+ - DeepSeek / DeepSeek 2
  - LLaVA 1.5 / 1.6
  - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
  - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
@@ -193,11 +203,11 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instr
  - InternLM 2
  - Mistral NeMo

- Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).
+ Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).

  ### Benchmark Performance

- - Benchmark a single static batch. Run the following command without launching a server. The arguments are the same as those for `launch_server.py`.
+ - Benchmark a single static batch by running the following command without launching a server. The arguments are the same as those for `launch_server.py`. This is not a dynamic batching server, so it may run out of memory for a batch size that can run successfully with a real server. This is because a real server will truncate the prefill into several batches/chunks, while this unit test does not do this.
  ```
  python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 32 --input-len 256 --output-len 32
  ```
@@ -424,6 +434,24 @@ for out in state.text_iter():
      print(out, end="", flush=True)
  ```

+ #### Roles
+
+ Use `sgl.system`, `sgl.user` and `sgl.assistant` to set roles when using Chat models. You can also define more complex role prompts using begin and end tokens.
+
+ ```python
+ @sgl.function
+ def chat_example(s):
+     s += sgl.system("You are a helpful assistant.")
+     # Same as: s += s.system("You are a helpful assistant.")
+
+     with s.user():
+         s += "Question: What is the capital of France?"
+
+     s += sgl.assistant_begin()
+     s += "Answer: " + sgl.gen(max_tokens=100, stop="\n")
+     s += sgl.assistant_end()
+ ```
+
  #### Tips and Implementation Details
  - The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
  - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.
{sglang-0.2.5 → sglang-0.2.7}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "sglang"
- version = "0.2.5"
+ version = "0.2.7"
  description = "SGLang is yet another fast serving framework for large language models and vision language models."
  readme = "README.md"
  requires-python = ">=3.8"
{sglang-0.2.5 → sglang-0.2.7}/sglang/__init__.py

@@ -1,4 +1,5 @@
  # SGL API Components
+
  from sglang.api import (
      Runtime,
      assistant,
@@ -14,48 +15,54 @@ from sglang.api import (
      select,
      set_default_backend,
      system,
+     system_begin,
+     system_end,
      user,
      user_begin,
      user_end,
      video,
  )

- # Global Configurations
- from sglang.global_config import global_config
-
- # SGL Backends
- from sglang.lang.backend.anthropic import Anthropic
- from sglang.lang.backend.litellm import LiteLLM
- from sglang.lang.backend.openai import OpenAI
- from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
- from sglang.lang.backend.vertexai import VertexAI
-
- from .version import __version__
-
- # public APIs management
+ # SGLang DSL APIs
  __all__ = [
-     "global_config",
-     "Anthropic",
-     "LiteLLM",
-     "OpenAI",
-     "RuntimeEndpoint",
-     "VertexAI",
-     "function",
      "Runtime",
-     "set_default_backend",
+     "assistant",
+     "assistant_begin",
+     "assistant_end",
      "flush_cache",
-     "get_server_args",
+     "function",
      "gen",
      "gen_int",
      "gen_string",
+     "get_server_args",
      "image",
-     "video",
      "select",
+     "set_default_backend",
      "system",
+     "system_begin",
+     "system_end",
      "user",
-     "assistant",
      "user_begin",
      "user_end",
-     "assistant_begin",
-     "assistant_end",
+     "video",
  ]
+
+ # Global Configurations
+ from sglang.global_config import global_config
+
+ __all__ += ["global_config"]
+
+ from sglang.version import __version__
+
+ __all__ += ["__version__"]
+
+ # SGL Backends
+ from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
+ from sglang.utils import LazyImport
+
+ Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")
+ LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM")
+ OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
+ VertexAI = LazyImport("sglang.lang.backend.vertexai", "VertexAI")
+
+ __all__ += ["Anthropic", "LiteLLM", "OpenAI", "VertexAI", "RuntimeEndpoint"]
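The `LazyImport` wrapper used above lives in `sglang/utils.py`, which is not shown in this diff. A minimal sketch of how such a deferred-import helper typically works (illustrative only, not the exact sglang implementation): the heavy backend modules are imported the first time the wrapped name is actually touched.

```python
import importlib


class LazyImport:
    """Illustrative sketch of a lazy module-attribute loader."""

    def __init__(self, module_name: str, attr_name: str):
        self.module_name = module_name
        self.attr_name = attr_name
        self._target = None

    def _load(self):
        # Import the module and resolve the attribute only on first use.
        if self._target is None:
            module = importlib.import_module(self.module_name)
            self._target = getattr(module, self.attr_name)
        return self._target

    def __getattr__(self, name):
        return getattr(self._load(), name)

    def __call__(self, *args, **kwargs):
        # Instantiating the wrapper instantiates the real backend class.
        return self._load()(*args, **kwargs)
```

Under this reading, the rewrite keeps `import sglang` cheap and avoids pulling in `openai`, `anthropic`, `litellm`, or `vertexai` unless the corresponding backend is actually constructed.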
{sglang-0.2.5 → sglang-0.2.7}/sglang/api.py

@@ -75,7 +75,7 @@ def gen(
      choices: Optional[List[str]] = None,
      regex: Optional[str] = None,
  ):
-     """Call the model to generate. See the meaning of the arguments in docs/sampling_params.md"""
+     """Call the model to generate. See the meaning of the arguments in docs/en/sampling_params.md"""

      if choices:
          return SglSelect(name, choices, 0.0 if temperature is None else temperature)
@@ -210,6 +210,14 @@ def assistant(expr: Optional[SglExpr] = None):
      return _role_common("assistant", expr)


+ def system_begin():
+     return SglRoleBegin("system")
+
+
+ def system_end():
+     return SglRoleEnd("system")
+
+
  def user_begin():
      return SglRoleBegin("user")

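For orientation, the new `system_begin`/`system_end` pair mirrors the existing `user_begin`/`user_end` and `assistant_begin`/`assistant_end` helpers, letting a program open and close the system role explicitly. A minimal usage sketch, following the role example in the README diff above (the function name and prompt strings are made up for illustration):

```python
import sglang as sgl


@sgl.function
def sketch(s):
    # Build the system prompt from explicit begin/end markers.
    s += sgl.system_begin()
    s += "You are a terse assistant."
    s += sgl.system_end()

    s += sgl.user("Question: What is the capital of France?")
    s += sgl.assistant(sgl.gen("answer", max_tokens=16))
```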
{sglang-0.2.5 → sglang-0.2.7}/sglang/bench_latency.py

@@ -37,9 +37,9 @@ import torch
  import torch.distributed as dist

  from sglang.srt.hf_transformers_utils import get_tokenizer
- from sglang.srt.managers.controller.infer_batch import Batch, ForwardMode, Req
- from sglang.srt.managers.controller.model_runner import ModelRunner
+ from sglang.srt.managers.schedule_batch import Batch, ForwardMode, Req
  from sglang.srt.model_config import ModelConfig
+ from sglang.srt.model_executor.model_runner import ModelRunner
  from sglang.srt.sampling_params import SamplingParams
  from sglang.srt.server_args import ServerArgs
  from sglang.srt.utils import suppress_other_loggers
{sglang-0.2.5 → sglang-0.2.7}/sglang/bench_serving.py

@@ -1,5 +1,6 @@
  # Adapted from https://github.com/vllm-project/vllm/blob/6366efc67b0aedd2c1721c14385370e50b297fb3/benchmarks/backend_request_func.py
  # Adapted from https://github.com/vllm-project/vllm/blob/6366efc67b0aedd2c1721c14385370e50b297fb3/benchmarks/benchmark_serving.py
+
  """
  Benchmark online serving.

@@ -84,6 +85,9 @@ async def async_request_trt_llm(
              "min_length": request_func_input.output_len,
              "end_id": 1048576,
          }
+         if args.disable_ignore_eos:
+             del payload["min_length"]
+             del payload["end_id"]
          output = RequestFuncOutput()
          output.prompt_len = request_func_input.prompt_len

@@ -149,7 +153,7 @@ async def async_request_openai_completions(
              "best_of": 1,
              "max_tokens": request_func_input.output_len,
              "stream": not args.disable_stream,
-             "ignore_eos": True,
+             "ignore_eos": not args.disable_ignore_eos,
          }
          headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}

@@ -969,6 +973,11 @@ if __name__ == "__main__":
          action="store_true",
          help="Disable streaming mode.",
      )
+     parser.add_argument(
+         "--disable-ignore-eos",
+         action="store_true",
+         help="Disable ignoring EOS.",
+     )

      set_ulimit()

{sglang-0.2.5 → sglang-0.2.7}/sglang/check_env.py

@@ -22,7 +22,7 @@ PACKAGE_LIST = [
      "huggingface_hub",
      "interegular",
      "packaging",
-     "pillow",
+     "PIL",
      "psutil",
      "pydantic",
      "uvicorn",
{sglang-0.2.5 → sglang-0.2.7}/sglang/lang/backend/litellm.py

@@ -61,7 +61,7 @@ class LiteLLM(BaseBackend):
              model=self.model_name,
              messages=messages,
              **self.client_params,
-             **sampling_params.to_anthropic_kwargs(),
+             **sampling_params.to_litellm_kwargs(),
          )
          comp = ret.choices[0].message.content

{sglang-0.2.5 → sglang-0.2.7}/sglang/lang/backend/openai.py

@@ -18,7 +18,7 @@ except ImportError as e:
      openai = tiktoken = e


- logger = logging.getLogger("openai")
+ logger = logging.getLogger(__name__)


  def create_logit_bias_int(tokenizer):
{sglang-0.2.5 → sglang-0.2.7}/sglang/lang/backend/runtime_endpoint.py

@@ -253,14 +253,14 @@ class RuntimeEndpoint(BaseBackend):
              r["meta_info"]["normalized_prompt_logprob"] for r in obj
          ]
          decision = choices[np.argmax(normalized_prompt_logprobs)]
-         prefill_token_logprobs = [r["meta_info"]["prefill_token_logprobs"] for r in obj]
-         decode_token_logprobs = [r["meta_info"]["decode_token_logprobs"] for r in obj]
+         input_token_logprobs = [r["meta_info"]["input_token_logprobs"] for r in obj]
+         output_token_logprobs = [r["meta_info"]["output_token_logprobs"] for r in obj]

          return (
              decision,
              normalized_prompt_logprobs,
-             prefill_token_logprobs,
-             decode_token_logprobs,
+             input_token_logprobs,
+             output_token_logprobs,
          )

      def concatenate_and_append(self, src_rids: List[str], dst_rid: str):
{sglang-0.2.5 → sglang-0.2.7}/sglang/lang/interpreter.py

@@ -541,18 +541,19 @@ class StreamExecutor:
          (
              decision,
              normalized_prompt_logprobs,
-             prefill_token_logprobs,
-             decode_token_logprobs,
+             input_token_logprobs,
+             output_token_logprobs,
          ) = self.backend.select(self, expr.choices, expr.temperature)
          if expr.name is not None:
              name = expr.name
              self.variables[name] = decision
              self.meta_info[name] = {
                  "normalized_prompt_logprobs": normalized_prompt_logprobs,
-                 "prefill_token_logprobs": prefill_token_logprobs,
-                 "decode_token_logprobs": decode_token_logprobs,
+                 "input_token_logprobs": input_token_logprobs,
+                 "output_token_logprobs": output_token_logprobs,
              }
              self.variable_event[name].set()
+             self.stream_var_event[name].set()
          self.text_ += decision

      def _execute_variable(self, expr: SglVariable):
@@ -705,9 +706,9 @@ class ProgramState:

      def _role_common(self, name: str, expr: Optional[SglExpr] = None):
          if expr is not None:
-             self.stream_executor.submit(
-                 SglExprList([SglRoleBegin(name), expr, SglRoleEnd(name)])
-             )
+             role_expr = SglExprList([SglRoleBegin(name), expr, SglRoleEnd(name)])
+             self.stream_executor.submit(role_expr)
+             return role_expr
          else:

              @contextmanager
@@ -778,7 +779,14 @@ class ProgramState:
                      if self.stream_executor.is_finished:
                          break
              else:
-                 event = self.stream_executor.stream_var_event[var_name]
+                 event = None
+                 while not event:
+                     if var_name in self.stream_executor.stream_var_event:
+                         event = self.stream_executor.stream_var_event[var_name]
+                     if self.stream_executor.is_finished:
+                         yield ""
+                         return
+
                  while True:
                      event.wait()
                      event.clear()
@@ -813,7 +821,14 @@ class ProgramState:
                      if self.stream_executor.is_finished:
                          break
              else:
-                 event = self.stream_executor.stream_var_event[var_name]
+                 event = None
+                 while not event:
+                     if var_name in self.stream_executor.stream_var_event:
+                         event = self.stream_executor.stream_var_event[var_name]
+                     if self.stream_executor.is_finished:
+                         yield ""
+                         return
+
                  while True:
                      await loop.run_in_executor(None, event.wait)
                      event.clear()
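The polling loop added above covers the case where `text_iter` / `text_async_iter` is asked for a variable whose stream event has not been registered yet. On the consumer side this is the usual streaming pattern already shown in the README; a small sketch, where `chat_example` and `"answer"` are placeholder names for a user-defined `@sgl.function` program and one of its generated variables:

```python
# Illustrative consumer of the streaming iterator.
state = chat_example.run(stream=True)
for chunk in state.text_iter("answer"):
    print(chunk, end="", flush=True)
```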
{sglang-0.2.5 → sglang-0.2.7}/sglang/lang/ir.py

@@ -410,7 +410,7 @@ class SglGen(SglExpr):
          dtype: Optional[type] = None,
          regex: Optional[str] = None,
      ):
-         """Call the model to generate. See the meaning of the arguments in docs/sampling_params.md"""
+         """Call the model to generate. See the meaning of the arguments in docs/en/sampling_params.md"""
          super().__init__()
          self.name = name
          self.sampling_params = SglSamplingParams(
{sglang-0.2.5 → sglang-0.2.7}/sglang/srt/constrained/__init__.py

@@ -1,3 +1,18 @@
+ """
+ Copyright 2023-2024 SGLang Team
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
  import json
  from typing import Dict, Optional, Union

{sglang-0.2.5 → sglang-0.2.7}/sglang/srt/constrained/base_cache.py

@@ -1,3 +1,18 @@
+ """
+ Copyright 2023-2024 SGLang Team
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
  """Base cache class."""

  import time
sglang-0.2.7/sglang/srt/constrained/fsm_cache.py

@@ -0,0 +1,66 @@
+ """
+ Copyright 2023-2024 SGLang Team
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
+ """Cache for the compressed finite state machine."""
+
+ from sglang.srt.constrained import RegexGuide, TransformerTokenizer
+ from sglang.srt.constrained.base_cache import BaseCache
+
+
+ class FSMCache(BaseCache):
+     def __init__(self, tokenizer_path, tokenizer_args_dict, enable=True):
+         super().__init__(enable=enable)
+
+         if tokenizer_path.endswith(".json") or tokenizer_path.endswith(".model"):
+             # Do not support TiktokenTokenizer or SentencePieceTokenizer
+             return
+
+         from importlib.metadata import version
+
+         if version("outlines") >= "0.0.35":
+             from transformers import AutoTokenizer
+
+             tokenizer_args_dict.setdefault("padding_side", "left")
+             tokenizer = AutoTokenizer.from_pretrained(
+                 tokenizer_path, **tokenizer_args_dict
+             )
+             try:
+                 self.outlines_tokenizer = TransformerTokenizer(tokenizer)
+             except AttributeError:
+                 # FIXME: tmp fix for chatglm2 & chatglm3 (pad_token_id=0)
+                 origin_pad_token_id = tokenizer.pad_token_id
+
+                 def fset(self, value):
+                     self._value = value
+
+                 type(tokenizer).pad_token_id = property(
+                     fget=type(tokenizer).pad_token_id.fget, fset=fset
+                 )
+                 self.outlines_tokenizer = TransformerTokenizer(tokenizer)
+                 self.outlines_tokenizer.tokenizer.pad_token_id = origin_pad_token_id
+                 self.outlines_tokenizer.pad_token_id = origin_pad_token_id
+                 self.outlines_tokenizer.pad_token = (
+                     self.outlines_tokenizer.tokenizer.pad_token
+                 )
+                 self.outlines_tokenizer.vocabulary = (
+                     self.outlines_tokenizer.tokenizer.get_vocab()
+                 )
+         else:
+             self.outlines_tokenizer = TransformerTokenizer(
+                 tokenizer_path, **tokenizer_args_dict
+             )
+
+     def init_value(self, regex):
+         return RegexGuide(regex, self.outlines_tokenizer)
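For context, a rough sketch of how this cache would be exercised: the SRT runtime builds one `FSMCache` per tokenizer and asks it for a compiled guide whenever a request carries a `regex` constraint. The snippet below calls `init_value` directly; the tokenizer path, tokenizer kwargs, and regex are illustrative, and the real server wiring (request handling, cache lookups through the base class) is omitted.

```python
# Illustrative only; in the server this goes through the SRT internals.
cache = FSMCache(
    "meta-llama/Meta-Llama-3-8B-Instruct",  # example tokenizer path
    {"trust_remote_code": False},           # example tokenizer kwargs
)
guide = cache.init_value(r"(yes|no)")  # compiled guide for constrained decoding
```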