sglang 0.3.3__tar.gz → 0.3.3.post1__tar.gz
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- {sglang-0.3.3/sglang.egg-info → sglang-0.3.3.post1}/PKG-INFO +13 -6
- {sglang-0.3.3 → sglang-0.3.3.post1}/README.md +12 -5
- {sglang-0.3.3 → sglang-0.3.3.post1}/pyproject.toml +1 -1
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/bench_latency.py +3 -3
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/conversation.py +11 -2
- sglang-0.3.3.post1/sglang/srt/managers/data_parallel_controller.py +177 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/managers/io_struct.py +7 -2
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/managers/schedule_batch.py +6 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/managers/scheduler.py +46 -5
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/managers/tokenizer_manager.py +9 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/model_executor/model_runner.py +40 -35
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/openai_api/adapter.py +5 -3
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/sampling/sampling_batch_info.py +19 -7
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/server.py +55 -20
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/server_args.py +14 -11
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/utils.py +26 -11
- sglang-0.3.3.post1/sglang/version.py +1 -0
- {sglang-0.3.3 → sglang-0.3.3.post1/sglang.egg-info}/PKG-INFO +13 -6
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang.egg-info/SOURCES.txt +1 -0
- sglang-0.3.3/sglang/version.py +0 -1
- {sglang-0.3.3 → sglang-0.3.3.post1}/LICENSE +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/setup.cfg +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/__init__.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/api.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/bench_server_latency.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/bench_serving.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/check_env.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/global_config.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/lang/__init__.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/lang/backend/__init__.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/lang/backend/anthropic.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/lang/backend/base_backend.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/lang/backend/litellm.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/lang/backend/openai.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/lang/backend/runtime_endpoint.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/lang/backend/vertexai.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/lang/chat_template.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/lang/choices.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/lang/compiler.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/lang/interpreter.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/lang/ir.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/lang/tracer.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/launch_server.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/launch_server_llavavid.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/configs/__init__.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/configs/exaone.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/configs/model_config.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/constrained/__init__.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/constrained/base_tool_cache.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/constrained/fsm_cache.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/constrained/jump_forward.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/hf_transformers_utils.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/activation.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/attention/__init__.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/attention/flashinfer_backend.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/attention/flashinfer_utils.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/attention/triton_backend.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/attention/triton_ops/extend_attention.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/fused_moe/__init__.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/fused_moe/fused_moe.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/fused_moe/layer.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/fused_moe/patch.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/layernorm.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/linear.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/logits_processor.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/pooler.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/quantization/__init__.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/quantization/base_config.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/radix_attention.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/sampler.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/layers/torchao_utils.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/lora/lora.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/lora/lora_config.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/lora/lora_manager.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/managers/detokenizer_manager.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/managers/image_processor.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/managers/schedule_policy.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/managers/tp_worker.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/mem_cache/chunk_cache.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/mem_cache/flush_cache.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/mem_cache/memory_pool.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/mem_cache/radix_cache.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/mm_utils.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/model_executor/cuda_graph_runner.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/model_executor/forward_batch_info.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/baichuan.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/chatglm.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/commandr.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/dbrx.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/deepseek.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/deepseek_v2.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/exaone.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/gemma.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/gemma2.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/gpt_bigcode.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/grok.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/internlm2.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/llama.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/llama_classification.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/llama_embedding.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/llama_reward.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/llava.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/llavavid.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/minicpm.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/minicpm3.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/mixtral.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/mixtral_quant.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/olmoe.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/qwen.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/qwen2.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/qwen2_moe.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/stablelm.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/torch_native_llama.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/xverse.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/xverse_moe.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/models/yivl.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/openai_api/protocol.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/sampling/sampling_params.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/test/few_shot_gsm8k.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/test/run_eval.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/test/runners.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/test/simple_eval_common.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/test/simple_eval_gpqa.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/test/simple_eval_humaneval.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/test/simple_eval_math.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/test/simple_eval_mgsm.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/test/simple_eval_mmlu.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/test/test_activation.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/test/test_layernorm.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/test/test_programs.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/test/test_utils.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang/utils.py +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang.egg-info/requires.txt +0 -0
- {sglang-0.3.3 → sglang-0.3.3.post1}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.3.3/sglang.egg-info → sglang-0.3.3.post1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.3.3
+Version: 0.3.3.post1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -257,8 +257,8 @@ Provides-Extra: dev
 Requires-Dist: sglang[all]; extra == "dev"
 Requires-Dist: sglang[test]; extra == "dev"
 
-<div align="center">
-<img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
+<div align="center" id="sglangtop">
+<img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400" margin="10px"></img>
 
 [![PyPI](https://img.shields.io/pypi/v/sglang)](https://pypi.org/project/sglang)
 ![PyPI - Downloads](https://img.shields.io/pypi/dm/sglang)
@@ -270,10 +270,9 @@ Requires-Dist: sglang[test]; extra == "dev"
 
 --------------------------------------------------------------------------------
 
-| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Bi-Weekly Development Meeting (Oct. 19)**](https://calendar.app.google/GYW7S8QGoanCuaxW6) |
+| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_dev_day_v2.pptx) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Bi-Weekly Development Meeting (Oct. 19)**](https://calendar.app.google/GYW7S8QGoanCuaxW6) |
 
 ## Upcoming Events
-- [Oct. 11, 2024] Invited talks at [AMD Advancing AI](https://www.amd.com/en/corporate/events/advancing-ai.html) Developer Day.
 - [Oct. 16, 2024] Online meetup for efficient LLM deployment and serving, co-hosted by SGLang, FlashInfer, and MLC LLM! Fill out the [Google form](https://forms.gle/B3YeedLxmrrhL1NM8) to receive the invite link.
 
 ## News
@@ -324,7 +323,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.3.3 https://github.com/sgl-project/sglang.git
+git clone -b v0.3.3.post1 https://github.com/sgl-project/sglang.git
 cd sglang
 
 pip install --upgrade pip
@@ -848,3 +847,11 @@ Learn more at this [blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/).
 ## Citation And Acknowledgment
 Please cite our paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
 We also learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
+
+
+
+<p align="center">
+  <a href="#sglangtop" target="_blank">
+    <bold>Back To Top </bold>
+  </a>
+</p>
{sglang-0.3.3 → sglang-0.3.3.post1}/README.md

@@ -1,5 +1,5 @@
-<div align="center">
-<img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
+<div align="center" id="sglangtop">
+<img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400" margin="10px"></img>
 
 [![PyPI](https://img.shields.io/pypi/v/sglang)](https://pypi.org/project/sglang)
 ![PyPI - Downloads](https://img.shields.io/pypi/dm/sglang)
@@ -11,10 +11,9 @@
 
 --------------------------------------------------------------------------------
 
-| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Bi-Weekly Development Meeting (Oct. 19)**](https://calendar.app.google/GYW7S8QGoanCuaxW6) |
+| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_dev_day_v2.pptx) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Bi-Weekly Development Meeting (Oct. 19)**](https://calendar.app.google/GYW7S8QGoanCuaxW6) |
 
 ## Upcoming Events
-- [Oct. 11, 2024] Invited talks at [AMD Advancing AI](https://www.amd.com/en/corporate/events/advancing-ai.html) Developer Day.
 - [Oct. 16, 2024] Online meetup for efficient LLM deployment and serving, co-hosted by SGLang, FlashInfer, and MLC LLM! Fill out the [Google form](https://forms.gle/B3YeedLxmrrhL1NM8) to receive the invite link.
 
 ## News
@@ -65,7 +64,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.3.3 https://github.com/sgl-project/sglang.git
+git clone -b v0.3.3.post1 https://github.com/sgl-project/sglang.git
 cd sglang
 
 pip install --upgrade pip
@@ -589,3 +588,11 @@ Learn more at this [blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/).
 ## Citation And Acknowledgment
 Please cite our paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
 We also learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
+
+
+
+<p align="center">
+  <a href="#sglangtop" target="_blank">
+    <bold>Back To Top </bold>
+  </a>
+</p>
{sglang-0.3.3 → sglang-0.3.3.post1}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.3.3"
+version = "0.3.3.post1"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
{sglang-0.3.3 → sglang-0.3.3.post1}/sglang/bench_latency.py

@@ -139,7 +139,7 @@ def load_model(server_args, port_args, tp_rank):
        gpu_id=tp_rank,
        tp_rank=tp_rank,
        tp_size=server_args.tp_size,
-       nccl_port=port_args.
+       nccl_port=port_args.nccl_port,
        server_args=server_args,
    )
    rank_print(f"max_total_num_tokens={model_runner.max_total_num_tokens}")
@@ -220,6 +220,7 @@ def prepare_synthetic_inputs_for_latency_test(batch_size, input_len):
    return reqs
 
 
+@torch.inference_mode()
 def extend(reqs, model_runner):
    batch = ScheduleBatch.init_new(
        reqs=reqs,
@@ -235,6 +236,7 @@ def extend(reqs, model_runner):
    return next_token_ids, logits_output.next_token_logits, batch
 
 
+@torch.inference_mode()
 def decode(input_token_ids, batch, model_runner):
    batch.prepare_for_decode(input_token_ids)
    model_worker_batch = batch.get_model_worker_batch()
@@ -244,7 +246,6 @@ def decode(input_token_ids, batch, model_runner):
    return next_token_ids, logits_output.next_token_logits
 
 
-@torch.inference_mode()
 def correctness_test(
    server_args,
    port_args,
@@ -287,7 +288,6 @@ def correctness_test(
        rank_print(tokenizer.decode(output_ids[i]), "\n")
 
 
-@torch.inference_mode()
 def latency_test_run_once(
    run_name, model_runner, rank_print, reqs, batch_size, input_len, output_len
 ):
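The bench_latency.py change above moves `@torch.inference_mode()` off the outer test drivers (`correctness_test`, `latency_test_run_once`) and onto the per-step `extend` and `decode` helpers, so only the forward passes run under inference mode. A minimal sketch of what the decorator buys (names here are illustrative, not sglang APIs):

```python
import torch

@torch.inference_mode()
def decode_step(x: torch.Tensor, w: torch.Tensor) -> torch.Tensor:
    # Under inference mode, autograd records nothing and tensor version
    # tracking is skipped, trimming per-call overhead in tight decode loops.
    return x @ w

x, w = torch.randn(4, 8), torch.randn(8, 8)
y = decode_step(x, w)
assert y.is_inference() and not y.requires_grad
```

Scoping the decorator to the innermost forward helpers keeps non-forward work (tokenization, timing, printing) outside inference mode.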
{sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/conversation.py

@@ -70,6 +70,9 @@ class Conversation:
     sep2: str = None
     # Stop criteria (the default one is EOS token)
     stop_str: Union[str, List[str]] = None
+    # The string that represents an image token in the prompt
+    image_token: str = "<image>"
+
     image_data: Optional[List[str]] = None
     modalities: Optional[List[str]] = None
 
@@ -334,6 +337,7 @@ class Conversation:
             sep=self.sep,
             sep2=self.sep2,
             stop_str=self.stop_str,
+            image_token=self.image_token,
         )
 
     def dict(self):
@@ -381,6 +385,7 @@ def generate_chat_conv(
         stop_str=conv.stop_str,
         image_data=[],
         modalities=[],
+        image_token=conv.image_token,
     )
 
     if isinstance(request.messages, str):
@@ -412,9 +417,13 @@ def generate_chat_conv(
                     num_image_url += 1
                     conv.modalities.append(content.modalities)
             if num_image_url > 1:
-                image_token = "<image>"
+                image_token = conv.image_token
             else:
-                image_token = "<image>\n"
+                image_token = (
+                    conv.image_token + "\n"
+                    if conv.name != "qwen2-vl"
+                    else conv.image_token
+                )
             for content in message.content:
                 if content.type == "text":
                     if num_image_url > 16:
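The conversation.py change above replaces a hardcoded image placeholder with a per-template `image_token` field, with a special case for `qwen2-vl`. A self-contained sketch of the resulting selection logic (the `Template` class is a stand-in for sglang's `Conversation`, not its real definition):

```python
from dataclasses import dataclass

@dataclass
class Template:
    name: str
    image_token: str = "<image>"  # default placeholder, overridable per template

def pick_image_token(t: Template, num_image_url: int) -> str:
    # Mirrors the diff: multiple images use the bare token; a single image
    # gets a trailing newline unless the template is "qwen2-vl".
    if num_image_url > 1:
        return t.image_token
    return t.image_token + "\n" if t.name != "qwen2-vl" else t.image_token

assert pick_image_token(Template("llava"), 1) == "<image>\n"
assert pick_image_token(Template("qwen2-vl"), 1) == "<image>"
assert pick_image_token(Template("llava"), 3) == "<image>"
```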
sglang-0.3.3.post1/sglang/srt/managers/data_parallel_controller.py (new file)

@@ -0,0 +1,177 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+"""A controller that dispatches requests to multiple data parallel workers."""
+
+import logging
+import multiprocessing as mp
+from enum import Enum, auto
+
+import zmq
+
+from sglang.srt.managers.io_struct import (
+    TokenizedEmbeddingReqInput,
+    TokenizedGenerateReqInput,
+    TokenizedRewardReqInput,
+)
+from sglang.srt.managers.scheduler import run_scheduler_process
+from sglang.srt.server_args import PortArgs, ServerArgs
+from sglang.srt.utils import (
+    configure_logger,
+    kill_parent_process,
+    suppress_other_loggers,
+)
+from sglang.utils import get_exception_traceback
+
+logger = logging.getLogger(__name__)
+
+
+class LoadBalanceMethod(Enum):
+    """Load balance method."""
+
+    ROUND_ROBIN = auto()
+    SHORTEST_QUEUE = auto()
+
+    @classmethod
+    def from_str(cls, method: str):
+        method = method.upper()
+        try:
+            return cls[method]
+        except KeyError as exc:
+            raise ValueError(f"Invalid load balance method: {method}") from exc
+
+
+class DataParallelController:
+    """A controller that dispatches requests to multiple data parallel workers."""
+
+    def __init__(self, server_args, port_args) -> None:
+        # Parse args
+        self.server_args = server_args
+        self.port_args = port_args
+        self.load_balance_method = LoadBalanceMethod.from_str(
+            server_args.load_balance_method
+        )
+
+        # Init inter-process communication
+        self.context = zmq.Context(1 + server_args.dp_size)
+        self.recv_from_tokenizer = self.context.socket(zmq.PULL)
+        self.recv_from_tokenizer.bind(f"ipc://{port_args.scheduler_input_ipc_name}")
+
+        # Dispatch method
+        self.round_robin_counter = 0
+        dispatch_lookup = {
+            LoadBalanceMethod.ROUND_ROBIN: self.round_robin_scheduler,
+            LoadBalanceMethod.SHORTEST_QUEUE: self.shortest_queue_scheduler,
+        }
+        self.dispatching = dispatch_lookup[self.load_balance_method]
+
+        # Start data parallel workers
+        base_gpu_id = 0
+        self.workers = []
+        for dp_rank in range(server_args.dp_size):
+            tmp_port_args = PortArgs.init_new(server_args)
+            tmp_port_args.detokenizer_ipc_name = port_args.detokenizer_ipc_name
+
+            send_to = self.launch_tensor_parallel_group(
+                server_args,
+                tmp_port_args,
+                base_gpu_id,
+                dp_rank,
+            )
+
+            self.workers.append(send_to)
+            base_gpu_id += server_args.tp_size
+
+    def launch_tensor_parallel_group(
+        self,
+        server_args: ServerArgs,
+        port_args: PortArgs,
+        base_gpu_id: int,
+        dp_rank: int,
+    ):
+        # Launch tensor parallel scheduler processes
+        scheduler_procs = []
+        scheduler_pipe_readers = []
+        tp_size_per_node = server_args.tp_size // server_args.nnodes
+        tp_rank_range = range(
+            tp_size_per_node * server_args.node_rank,
+            tp_size_per_node * (server_args.node_rank + 1),
+        )
+        for tp_rank in tp_rank_range:
+            reader, writer = mp.Pipe(duplex=False)
+            gpu_id = base_gpu_id + tp_rank % tp_size_per_node
+            proc = mp.Process(
+                target=run_scheduler_process,
+                args=(server_args, port_args, gpu_id, tp_rank, dp_rank, writer),
+            )
+            proc.start()
+            scheduler_procs.append(proc)
+            scheduler_pipe_readers.append(reader)
+
+        send_to = self.context.socket(zmq.PUSH)
+        send_to.connect(f"ipc://{port_args.scheduler_input_ipc_name}")
+
+        # Wait for model to finish loading
+        for i in range(len(scheduler_pipe_readers)):
+            scheduler_pipe_readers[i].recv()
+
+        return send_to
+
+    def round_robin_scheduler(self, req):
+        self.workers[self.round_robin_counter].send_pyobj(req)
+        self.round_robin_counter = (self.round_robin_counter + 1) % len(self.workers)
+
+    def shortest_queue_scheduler(self, input_requests):
+        raise NotImplementedError()
+
+    def event_loop(self):
+        while True:
+            while True:
+                try:
+                    recv_req = self.recv_from_tokenizer.recv_pyobj(zmq.NOBLOCK)
+                except zmq.ZMQError:
+                    break
+
+                if isinstance(
+                    recv_req,
+                    (
+                        TokenizedGenerateReqInput,
+                        TokenizedEmbeddingReqInput,
+                        TokenizedRewardReqInput,
+                    ),
+                ):
+                    self.dispatching(recv_req)
+                else:
+                    # Send other control messages to all workers
+                    for worker in self.workers:
+                        worker.queue.put(recv_req)
+
+
+def run_data_parallel_controller_process(
+    server_args: ServerArgs,
+    port_args: PortArgs,
+    pipe_writer,
+):
+    configure_logger(server_args)
+    suppress_other_loggers()
+
+    try:
+        controller = DataParallelController(server_args, port_args)
+        pipe_writer.send("ready")
+        controller.event_loop()
+    except Exception:
+        msg = get_exception_traceback()
+        logger.error(msg)
+        kill_parent_process()
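The new data_parallel_controller.py above fans tokenized requests out to `dp_size` scheduler groups, cycling a counter over ZeroMQ PUSH sockets. A minimal sketch of the round-robin dispatch policy, with the sockets replaced by plain lists (illustrative only):

```python
class RoundRobinDispatcher:
    def __init__(self, num_workers: int):
        self.queues = [[] for _ in range(num_workers)]  # stand-ins for PUSH sockets
        self.counter = 0

    def dispatch(self, req):
        # Each tokenized request goes to the next worker in cyclic order.
        self.queues[self.counter].append(req)
        self.counter = (self.counter + 1) % len(self.queues)

d = RoundRobinDispatcher(num_workers=2)
for i in range(5):
    d.dispatch(f"req-{i}")
assert [len(q) for q in d.queues] == [3, 2]
```

`SHORTEST_QUEUE` is declared in the enum but still raises `NotImplementedError`, so `ROUND_ROBIN` is the only working policy in this release.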
{sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/managers/io_struct.py

@@ -20,6 +20,7 @@ processes (TokenizerManager, DetokenizerManager, Controller).
 
 import uuid
 from dataclasses import dataclass
+from enum import Enum
 from typing import Dict, List, Optional, Union
 
 from sglang.srt.managers.schedule_batch import BaseFinishReason
@@ -119,8 +120,7 @@ class GenerateReqInput:
             elif not isinstance(self.image_data, list):
                 self.image_data = [self.image_data] * num
             elif isinstance(self.image_data, list):
-
-                self.image_data = self.image_data * num
+                pass
 
             if self.sampling_params is None:
                 self.sampling_params = [{}] * num
@@ -344,3 +344,8 @@ class UpdateWeightReqOutput:
 class AbortReq:
     # The request id
     rid: str
+
+
+class ProfileReq(Enum):
+    START_PROFILE = 1
+    STOP_PROFILE = 2
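The io_struct.py fix above changes how a batched request's `image_data` is normalized: a scalar is still broadcast to the batch size, but an explicit list is now left untouched instead of being replicated `num` times. A sketch of the before/after behavior:

```python
def normalize_image_data(image_data, num):
    if image_data is None:
        return [None] * num
    if not isinstance(image_data, list):
        return [image_data] * num  # broadcast a single value
    return image_data              # already a list: keep as-is ("pass" in the diff)

assert normalize_image_data(None, 2) == [None, None]
assert normalize_image_data("a.png", 2) == ["a.png", "a.png"]
# Previously this branch returned ["a.png", "b.png", "a.png", "b.png"]:
assert normalize_image_data(["a.png", "b.png"], 2) == ["a.png", "b.png"]
```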
{sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/managers/schedule_batch.py

@@ -423,6 +423,9 @@ class ScheduleBatch:
     # Stream
     has_stream: bool = False
 
+    # device
+    device: str = "cuda"
+
     # Has regex
     has_regex: bool = False
 
@@ -439,6 +442,7 @@ class ScheduleBatch:
             tree_cache=tree_cache,
             return_logprob=return_logprob,
             has_stream=has_stream,
+            device=req_to_token_pool.device,
             has_regex=has_regex,
         )
 
@@ -806,6 +810,8 @@ class ScheduleBatch:
             self.sampling_info.regex_fsm_states = [
                 req.regex_fsm_state for req in self.reqs
             ]
+        else:
+            self.sampling_info.regex_fsms = None
 
         return ModelWorkerBatch(
             forward_mode=self.forward_mode,
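The schedule_batch.py change above records the device of the request-to-token pool on the batch instead of assuming `"cuda"`. A minimal sketch of the pattern (simplified stand-in types, not the real `ScheduleBatch`):

```python
from dataclasses import dataclass

@dataclass
class TokenPool:
    device: str = "cuda"

@dataclass
class Batch:
    device: str = "cuda"

    @classmethod
    def init_new(cls, pool: TokenPool) -> "Batch":
        # Inherit the pool's device so later tensor allocations follow it.
        return cls(device=pool.device)

assert Batch.init_new(TokenPool(device="cpu")).device == "cpu"
```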
{sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/managers/scheduler.py

@@ -37,6 +37,7 @@ from sglang.srt.managers.io_struct import (
     BatchEmbeddingOut,
     BatchTokenIDOut,
     FlushCacheReq,
+    ProfileReq,
     TokenizedEmbeddingReqInput,
     TokenizedGenerateReqInput,
     TokenizedRewardReqInput,
@@ -141,7 +142,7 @@ class Scheduler:
             gpu_id=gpu_id,
             tp_rank=tp_rank,
             server_args=server_args,
-            nccl_port=port_args.
+            nccl_port=port_args.nccl_port,
         )
         self.tp_cpu_group = self.tp_worker.model_runner.tp_group.cpu_group
 
@@ -229,6 +230,22 @@ class Scheduler:
         self.new_token_ratio_decay = global_config.new_token_ratio_decay
         self.batch_is_full = False
 
+        if os.getenv("SGLANG_TORCH_PROFILER_DIR", "") == "":
+            self.profiler = None
+        else:
+            self.torch_profiler_trace_dir = os.getenv("SGLANG_TORCH_PROFILER_DIR")
+            logger.info(
+                "Profiling enabled. Traces will be saved to: %s",
+                self.torch_profiler_trace_dir,
+            )
+            self.profiler = torch.profiler.profile(
+                activities=[
+                    torch.profiler.ProfilerActivity.CPU,
+                    torch.profiler.ProfilerActivity.CUDA,
+                ],
+                with_stack=True,
+            )
+
     @torch.inference_mode()
     def event_loop(self):
         while True:
@@ -271,6 +288,11 @@ class Scheduler:
             elif isinstance(recv_req, UpdateWeightReqInput):
                 success, message = self.update_weights(recv_req)
                 self.out_pyobjs.append(UpdateWeightReqOutput(success, message))
+            elif isinstance(recv_req, ProfileReq):
+                if recv_req == ProfileReq.START_PROFILE:
+                    self.start_profile()
+                else:
+                    self.stop_profile()
             else:
                 raise ValueError(f"Invalid request: {recv_req}")
 
@@ -433,6 +455,9 @@ class Scheduler:
             result = self.run_batch(batch)
             self.process_batch_result(batch, result)
 
+            if self.running_batch.is_empty():
+                self.running_batch = None
+
             if self.running_batch is None:
                 break
 
@@ -772,9 +797,6 @@ class Scheduler:
         if self.tp_rank == 0 and self.decode_forward_ct % 40 == 0:
             self.print_decode_stats()
 
-        if self.running_batch.is_empty():
-            self.running_batch = None
-
     def add_logprob_return_values(
         self,
         i: int,
@@ -1000,15 +1022,34 @@ class Scheduler:
             logger.error(message)
         return success, message
 
+    def start_profile(self) -> None:
+        if self.profiler is None:
+            raise RuntimeError("Profiler is not enabled.")
+        self.profiler.start()
+
+    def stop_profile(self) -> None:
+        if self.profiler is None:
+            raise RuntimeError("Profiler is not enabled.")
+        self.profiler.stop()
+        self.profiler.export_chrome_trace(
+            self.torch_profiler_trace_dir + "/" + str(time.time()) + ".trace.json.gz"
+        )
+        logger.info("Profiler is done")
+
 
 def run_scheduler_process(
     server_args: ServerArgs,
     port_args: PortArgs,
     gpu_id: int,
     tp_rank: int,
+    dp_rank: Optional[int],
     pipe_writer,
 ):
-
+    if dp_rank is None:
+        configure_logger(server_args, prefix=f" TP{tp_rank}")
+    else:
+        configure_logger(server_args, prefix=f" DP{dp_rank} TP{tp_rank}")
+
     suppress_other_loggers()
 
     try:
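The scheduler.py additions above gate a `torch.profiler` instance behind the `SGLANG_TORCH_PROFILER_DIR` environment variable and expose `start_profile`/`stop_profile` hooks driven by `ProfileReq`. A standalone sketch of the same start/stop/export pattern (the matmul workload and fallback directory are placeholders):

```python
import os
import time

import torch

trace_dir = os.getenv("SGLANG_TORCH_PROFILER_DIR", "/tmp/traces")
os.makedirs(trace_dir, exist_ok=True)

activities = [torch.profiler.ProfilerActivity.CPU]
if torch.cuda.is_available():
    # The diff enables CUDA unconditionally; guarding keeps the sketch portable.
    activities.append(torch.profiler.ProfilerActivity.CUDA)

profiler = torch.profiler.profile(activities=activities, with_stack=True)

profiler.start()
torch.matmul(torch.randn(256, 256), torch.randn(256, 256))  # profiled work
profiler.stop()

# Chrome-trace output, viewable in chrome://tracing or Perfetto.
profiler.export_chrome_trace(os.path.join(trace_dir, f"{time.time()}.trace.json.gz"))
```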
{sglang-0.3.3 → sglang-0.3.3.post1}/sglang/srt/managers/tokenizer_manager.py

@@ -46,6 +46,7 @@ from sglang.srt.managers.io_struct import (
     EmbeddingReqInput,
     FlushCacheReq,
     GenerateReqInput,
+    ProfileReq,
     RewardReqInput,
     TokenizedEmbeddingReqInput,
     TokenizedGenerateReqInput,
@@ -512,6 +513,14 @@
         req = AbortReq(rid)
         self.send_to_scheduler.send_pyobj(req)
 
+    def start_profile(self):
+        req = ProfileReq.START_PROFILE
+        self.send_to_scheduler.send_pyobj(req)
+
+    def stop_profile(self):
+        req = ProfileReq.STOP_PROFILE
+        self.send_to_scheduler.send_pyobj(req)
+
     async def update_weights(
         self, obj: UpdateWeightReqInput, request: Optional[fastapi.Request] = None
     ):
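The tokenizer_manager.py hooks above forward `ProfileReq` values to the scheduler over the same ZeroMQ channel used for requests. A minimal sketch of the underlying `send_pyobj`/`recv_pyobj` pattern (TCP loopback here stands in for the ipc:// endpoints sglang uses):

```python
import zmq

ctx = zmq.Context()
pull = ctx.socket(zmq.PULL)
port = pull.bind_to_random_port("tcp://127.0.0.1")
push = ctx.socket(zmq.PUSH)
push.connect(f"tcp://127.0.0.1:{port}")

push.send_pyobj({"cmd": "START_PROFILE"})  # pickles any Python object
print(pull.recv_pyobj())                   # -> {'cmd': 'START_PROFILE'}
```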