sglang 0.3.1.post2__tar.gz → 0.3.1.post3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sglang-0.3.1.post2/sglang.egg-info → sglang-0.3.1.post3}/PKG-INFO +3 -2
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/README.md +2 -1
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/pyproject.toml +1 -1
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/bench_latency.py +8 -1
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/layers/activation.py +3 -2
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/layers/attention_backend.py +3 -1
- sglang-0.3.1.post3/sglang/srt/layers/linear.py +1133 -0
- sglang-0.3.1.post3/sglang/srt/layers/quantization/__init__.py +76 -0
- sglang-0.3.1.post3/sglang/srt/layers/quantization/base_config.py +122 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/baichuan.py +1 -1
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/chatglm.py +6 -6
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/commandr.py +7 -7
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/dbrx.py +7 -7
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/deepseek.py +7 -7
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/deepseek_v2.py +7 -7
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/exaone.py +6 -6
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/gemma.py +6 -6
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/gemma2.py +6 -6
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/gpt_bigcode.py +6 -6
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/grok.py +6 -6
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/internlm2.py +6 -6
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/llama.py +6 -6
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/llama_classification.py +1 -1
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/llava.py +1 -1
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/llavavid.py +1 -1
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/minicpm.py +6 -6
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/minicpm3.py +1 -1
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/mixtral.py +6 -6
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/mixtral_quant.py +6 -6
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/olmoe.py +1 -1
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/qwen.py +6 -6
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/qwen2.py +6 -6
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/qwen2_moe.py +7 -7
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/stablelm.py +6 -6
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/xverse.py +1 -1
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/xverse_moe.py +1 -1
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/yivl.py +1 -1
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/utils.py +21 -1
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/test/test_utils.py +4 -2
- sglang-0.3.1.post3/sglang/version.py +1 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3/sglang.egg-info}/PKG-INFO +3 -2
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang.egg-info/SOURCES.txt +3 -0
- sglang-0.3.1.post2/sglang/version.py +0 -1
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/LICENSE +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/setup.cfg +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/__init__.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/api.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/bench_server_latency.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/bench_serving.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/check_env.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/global_config.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/lang/__init__.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/lang/backend/__init__.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/lang/backend/anthropic.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/lang/backend/base_backend.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/lang/backend/litellm.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/lang/backend/openai.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/lang/backend/runtime_endpoint.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/lang/backend/vertexai.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/lang/chat_template.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/lang/choices.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/lang/compiler.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/lang/interpreter.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/lang/ir.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/lang/tracer.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/launch_server.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/launch_server_llavavid.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/configs/__init__.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/configs/exaone.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/configs/model_config.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/constrained/__init__.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/constrained/base_tool_cache.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/constrained/fsm_cache.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/constrained/jump_forward.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/conversation.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/hf_transformers_utils.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/layers/flashinfer_utils.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/layers/fused_moe/__init__.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/layers/fused_moe/fused_moe.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/layers/fused_moe/layer.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/layers/layernorm.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/layers/logits_processor.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/layers/pooler.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/layers/radix_attention.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/layers/sampler.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/layers/torchao_utils.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/layers/triton_attention/decode_attention.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/layers/triton_attention/extend_attention.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/layers/triton_attention/prefill_attention.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/lora/lora.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/lora/lora_config.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/lora/lora_manager.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/managers/controller_multi.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/managers/controller_single.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/managers/detokenizer_manager.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/managers/io_struct.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/managers/policy_scheduler.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/managers/schedule_batch.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/managers/tokenizer_manager.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/managers/tp_worker.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/mem_cache/chunk_cache.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/mem_cache/flush_cache.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/mem_cache/memory_pool.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/mem_cache/radix_cache.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/mm_utils.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/model_executor/cuda_graph_runner.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/model_executor/forward_batch_info.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/model_executor/model_runner.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/llama_embedding.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/openai_api/adapter.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/openai_api/protocol.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/sampling/sampling_batch_info.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/sampling/sampling_params.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/server.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/server_args.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/test/few_shot_gsm8k.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/test/run_eval.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/test/runners.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/test/simple_eval_common.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/test/simple_eval_gpqa.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/test/simple_eval_humaneval.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/test/simple_eval_math.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/test/simple_eval_mgsm.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/test/simple_eval_mmlu.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/test/test_activation.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/test/test_layernorm.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/test/test_programs.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/utils.py +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang.egg-info/requires.txt +0 -0
- {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: sglang
|
3
|
-
Version: 0.3.1.post2
|
3
|
+
Version: 0.3.1.post3
|
4
4
|
Summary: SGLang is yet another fast serving framework for large language models and vision language models.
|
5
5
|
License: Apache License
|
6
6
|
Version 2.0, January 2004
|
@@ -318,7 +318,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
|
|
318
318
|
### Method 2: From source
|
319
319
|
```
|
320
320
|
# Use the last release branch
|
321
|
-
git clone -b v0.3.1.post2 https://github.com/sgl-project/sglang.git
|
321
|
+
git clone -b v0.3.1.post3 https://github.com/sgl-project/sglang.git
|
322
322
|
cd sglang
|
323
323
|
|
324
324
|
pip install --upgrade pip
|
@@ -499,6 +499,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
|
|
499
499
|
- Llama / Llama 2 / Llama 3 / Llama 3.1
|
500
500
|
- Mistral / Mixtral / Mistral NeMo
|
501
501
|
- Gemma / Gemma 2
|
502
|
+
- OLMoE
|
502
503
|
- Qwen / Qwen 2 / Qwen 2 MoE
|
503
504
|
- DeepSeek / DeepSeek 2
|
504
505
|
- [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
|
@@ -60,7 +60,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
|
|
60
60
|
### Method 2: From source
|
61
61
|
```
|
62
62
|
# Use the last release branch
|
63
|
-
git clone -b v0.3.1.post2 https://github.com/sgl-project/sglang.git
|
63
|
+
git clone -b v0.3.1.post3 https://github.com/sgl-project/sglang.git
|
64
64
|
cd sglang
|
65
65
|
|
66
66
|
pip install --upgrade pip
|
@@ -241,6 +241,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
|
|
241
241
|
- Llama / Llama 2 / Llama 3 / Llama 3.1
|
242
242
|
- Mistral / Mixtral / Mistral NeMo
|
243
243
|
- Gemma / Gemma 2
|
244
|
+
- OLMoE
|
244
245
|
- Qwen / Qwen 2 / Qwen 2 MoE
|
245
246
|
- DeepSeek / DeepSeek 2
|
246
247
|
- [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
4
4
|
|
5
5
|
[project]
|
6
6
|
name = "sglang"
|
7
|
-
version = "0.3.1.post2"
|
7
|
+
version = "0.3.1.post3"
|
8
8
|
description = "SGLang is yet another fast serving framework for large language models and vision language models."
|
9
9
|
readme = "README.md"
|
10
10
|
requires-python = ">=3.8"
|
@@ -64,8 +64,13 @@ from sglang.srt.hf_transformers_utils import get_tokenizer
|
|
64
64
|
from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
|
65
65
|
from sglang.srt.model_executor.model_runner import ModelRunner
|
66
66
|
from sglang.srt.sampling.sampling_params import SamplingParams
|
67
|
+
from sglang.srt.server import _set_envs_and_config
|
67
68
|
from sglang.srt.server_args import ServerArgs
|
68
|
-
from sglang.srt.utils import kill_child_process, suppress_other_loggers
|
69
|
+
from sglang.srt.utils import (
|
70
|
+
configure_logger,
|
71
|
+
kill_child_process,
|
72
|
+
suppress_other_loggers,
|
73
|
+
)
|
69
74
|
|
70
75
|
|
71
76
|
@dataclasses.dataclass
|
@@ -341,6 +346,8 @@ def latency_test(
|
|
341
346
|
bench_args,
|
342
347
|
tp_rank,
|
343
348
|
):
|
349
|
+
configure_logger(server_args, prefix=f" TP{tp_rank}")
|
350
|
+
_set_envs_and_config(server_args)
|
344
351
|
rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
|
345
352
|
|
346
353
|
# Load the model
|
@@ -31,8 +31,9 @@ from vllm.distributed import (
|
|
31
31
|
get_tensor_model_parallel_world_size,
|
32
32
|
)
|
33
33
|
from vllm.model_executor.custom_op import CustomOp
|
34
|
-
|
35
|
-
from
|
34
|
+
|
35
|
+
from sglang.srt.layers.quantization.base_config import QuantizationConfig
|
36
|
+
from sglang.srt.utils import set_weight_attrs
|
36
37
|
|
37
38
|
logger = logging.getLogger(__name__)
|
38
39
|
|
@@ -346,7 +346,9 @@ class TritonAttnBackend(AttentionBackend):
|
|
346
346
|
|
347
347
|
self.decode_attention_fwd = decode_attention_fwd
|
348
348
|
self.extend_attention_fwd = extend_attention_fwd
|
349
|
-
self.num_head =
|
349
|
+
self.num_head = (
|
350
|
+
model_runner.model_config.num_attention_heads // model_runner.tp_size
|
351
|
+
)
|
350
352
|
|
351
353
|
if global_server_args_dict.get("triton_attention_reduce_in_fp32", False):
|
352
354
|
self.reduce_dtype = torch.float32
|