sglang 0.1.20__py3-none-any.whl → 0.1.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +8 -8
- sglang/api.py +1 -1
- sglang/backend/runtime_endpoint.py +14 -4
- sglang/backend/vertexai.py +5 -4
- sglang/bench.py +627 -0
- sglang/bench_latency.py +22 -20
- sglang/bench_serving.py +758 -0
- sglang/check_env.py +171 -0
- sglang/global_config.py +3 -1
- sglang/lang/backend/__init__.py +0 -0
- sglang/lang/backend/anthropic.py +77 -0
- sglang/lang/backend/base_backend.py +80 -0
- sglang/lang/backend/litellm.py +90 -0
- sglang/lang/backend/openai.py +438 -0
- sglang/lang/backend/runtime_endpoint.py +283 -0
- sglang/lang/backend/vertexai.py +149 -0
- sglang/lang/chat_template.py +2 -2
- sglang/lang/ir.py +3 -3
- sglang/lang/tracer.py +1 -1
- sglang/launch_server.py +1 -1
- sglang/launch_server_llavavid.py +1 -4
- sglang/srt/conversation.py +1 -1
- sglang/srt/layers/context_flashattention_nopad.py +0 -29
- sglang/srt/layers/extend_attention.py +0 -39
- sglang/srt/layers/linear.py +869 -0
- sglang/srt/layers/quantization/__init__.py +49 -0
- sglang/srt/layers/quantization/fp8.py +662 -0
- sglang/srt/layers/radix_attention.py +31 -5
- sglang/srt/layers/token_attention.py +1 -51
- sglang/srt/managers/controller/cuda_graph_runner.py +44 -18
- sglang/srt/managers/controller/infer_batch.py +76 -72
- sglang/srt/managers/controller/manager_multi.py +109 -98
- sglang/srt/managers/controller/manager_single.py +105 -50
- sglang/srt/managers/controller/model_runner.py +42 -18
- sglang/srt/managers/controller/radix_cache.py +4 -3
- sglang/srt/managers/controller/schedule_heuristic.py +4 -0
- sglang/srt/managers/controller/tp_worker.py +143 -156
- sglang/srt/managers/detokenizer_manager.py +49 -5
- sglang/srt/managers/io_struct.py +36 -17
- sglang/srt/managers/tokenizer_manager.py +228 -125
- sglang/srt/memory_pool.py +46 -58
- sglang/srt/model_loader/model_loader.py +277 -0
- sglang/srt/model_loader/utils.py +260 -0
- sglang/srt/models/chatglm.py +1 -0
- sglang/srt/models/dbrx.py +1 -0
- sglang/srt/models/grok.py +1 -0
- sglang/srt/models/internlm2.py +317 -0
- sglang/srt/models/llama2.py +65 -16
- sglang/srt/models/llama_classification.py +1 -0
- sglang/srt/models/llava.py +1 -0
- sglang/srt/models/llavavid.py +1 -0
- sglang/srt/models/minicpm.py +2 -8
- sglang/srt/models/mixtral.py +1 -0
- sglang/srt/models/mixtral_quant.py +1 -0
- sglang/srt/models/qwen.py +1 -0
- sglang/srt/models/qwen2.py +6 -0
- sglang/srt/models/qwen2_moe.py +130 -108
- sglang/srt/models/stablelm.py +1 -0
- sglang/srt/openai_api/adapter.py +432 -0
- sglang/srt/openai_api/api_adapter.py +432 -0
- sglang/srt/openai_api/openai_api_adapter.py +431 -0
- sglang/srt/openai_api/openai_protocol.py +207 -0
- sglang/srt/openai_api/protocol.py +208 -0
- sglang/srt/openai_protocol.py +17 -0
- sglang/srt/sampling_params.py +2 -0
- sglang/srt/server.py +114 -90
- sglang/srt/server_args.py +27 -17
- sglang/srt/utils.py +17 -118
- sglang/test/test_conversation.py +1 -1
- sglang/test/test_openai_protocol.py +1 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +2 -2
- {sglang-0.1.20.dist-info → sglang-0.1.22.dist-info}/METADATA +157 -159
- sglang-0.1.22.dist-info/RECORD +103 -0
- {sglang-0.1.20.dist-info → sglang-0.1.22.dist-info}/WHEEL +1 -1
- sglang-0.1.20.dist-info/RECORD +0 -82
- {sglang-0.1.20.dist-info → sglang-0.1.22.dist-info}/LICENSE +0 -0
- {sglang-0.1.20.dist-info → sglang-0.1.22.dist-info}/top_level.txt +0 -0
sglang/__init__.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
__version__ = "0.1.20"
|
1
|
+
__version__ = "0.1.22"
|
2
2
|
|
3
3
|
# SGL API Components
|
4
4
|
from sglang.api import (
|
@@ -22,16 +22,16 @@ from sglang.api import (
|
|
22
22
|
video,
|
23
23
|
)
|
24
24
|
|
25
|
-
# SGL Backends
|
26
|
-
from sglang.backend.anthropic import Anthropic
|
27
|
-
from sglang.backend.litellm import LiteLLM
|
28
|
-
from sglang.backend.openai import OpenAI
|
29
|
-
from sglang.backend.runtime_endpoint import RuntimeEndpoint
|
30
|
-
from sglang.backend.vertexai import VertexAI
|
31
|
-
|
32
25
|
# Global Configurations
|
33
26
|
from sglang.global_config import global_config
|
34
27
|
|
28
|
+
# SGL Backends
|
29
|
+
from sglang.lang.backend.anthropic import Anthropic
|
30
|
+
from sglang.lang.backend.litellm import LiteLLM
|
31
|
+
from sglang.lang.backend.openai import OpenAI
|
32
|
+
from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
|
33
|
+
from sglang.lang.backend.vertexai import VertexAI
|
34
|
+
|
35
35
|
# public APIs management
|
36
36
|
__all__ = [
|
37
37
|
"global_config",
|
sglang/api.py
CHANGED
@@ -4,8 +4,8 @@ import os
|
|
4
4
|
import re
|
5
5
|
from typing import Callable, List, Optional, Union
|
6
6
|
|
7
|
-
from sglang.backend.base_backend import BaseBackend
|
8
7
|
from sglang.global_config import global_config
|
8
|
+
from sglang.lang.backend.base_backend import BaseBackend
|
9
9
|
from sglang.lang.ir import (
|
10
10
|
SglExpr,
|
11
11
|
SglExprList,
|
sglang/backend/runtime_endpoint.py
CHANGED
@@ -12,7 +12,6 @@ from sglang.utils import http_request
|
|
12
12
|
|
13
13
|
|
14
14
|
class RuntimeEndpoint(BaseBackend):
|
15
|
-
|
16
15
|
def __init__(
|
17
16
|
self,
|
18
17
|
base_url: str,
|
@@ -38,7 +37,8 @@ class RuntimeEndpoint(BaseBackend):
|
|
38
37
|
self.model_info = res.json()
|
39
38
|
|
40
39
|
self.chat_template = get_chat_template_by_model_path(
|
41
|
-
self.model_info["model_path"])
|
40
|
+
self.model_info["model_path"]
|
41
|
+
)
|
42
42
|
|
43
43
|
def get_model_name(self):
|
44
44
|
return self.model_info["model_path"]
|
@@ -124,7 +124,12 @@ class RuntimeEndpoint(BaseBackend):
|
|
124
124
|
else:
|
125
125
|
raise RuntimeError(f"Invalid dtype: {sampling_params.dtype}")
|
126
126
|
|
127
|
-
for item in [
|
127
|
+
for item in [
|
128
|
+
"return_logprob",
|
129
|
+
"logprob_start_len",
|
130
|
+
"top_logprobs_num",
|
131
|
+
"return_text_in_logprobs",
|
132
|
+
]:
|
128
133
|
value = getattr(sampling_params, item, None)
|
129
134
|
if value is not None:
|
130
135
|
data[item] = value
|
@@ -171,7 +176,12 @@ class RuntimeEndpoint(BaseBackend):
|
|
171
176
|
else:
|
172
177
|
raise RuntimeError(f"Invalid dtype: {sampling_params.dtype}")
|
173
178
|
|
174
|
-
for item in [
|
179
|
+
for item in [
|
180
|
+
"return_logprob",
|
181
|
+
"logprob_start_len",
|
182
|
+
"top_logprobs_num",
|
183
|
+
"return_text_in_logprobs",
|
184
|
+
]:
|
175
185
|
value = getattr(sampling_params, item, None)
|
176
186
|
if value is not None:
|
177
187
|
data[item] = value
|
sglang/backend/vertexai.py
CHANGED
@@ -1,8 +1,6 @@
|
|
1
1
|
import os
|
2
2
|
import warnings
|
3
|
-
from typing import
|
4
|
-
|
5
|
-
import numpy as np
|
3
|
+
from typing import Optional
|
6
4
|
|
7
5
|
from sglang.backend.base_backend import BaseBackend
|
8
6
|
from sglang.lang.chat_template import get_chat_template
|
@@ -21,7 +19,7 @@ except ImportError as e:
|
|
21
19
|
|
22
20
|
|
23
21
|
class VertexAI(BaseBackend):
|
24
|
-
def __init__(self, model_name):
|
22
|
+
def __init__(self, model_name, safety_settings=None):
|
25
23
|
super().__init__()
|
26
24
|
|
27
25
|
if isinstance(GenerativeModel, Exception):
|
@@ -33,6 +31,7 @@ class VertexAI(BaseBackend):
|
|
33
31
|
|
34
32
|
self.model_name = model_name
|
35
33
|
self.chat_template = get_chat_template("default")
|
34
|
+
self.safety_settings = safety_settings
|
36
35
|
|
37
36
|
def get_chat_template(self):
|
38
37
|
return self.chat_template
|
@@ -54,6 +53,7 @@ class VertexAI(BaseBackend):
|
|
54
53
|
ret = GenerativeModel(self.model_name).generate_content(
|
55
54
|
prompt,
|
56
55
|
generation_config=GenerationConfig(**sampling_params.to_vertexai_kwargs()),
|
56
|
+
safety_settings=self.safety_settings,
|
57
57
|
)
|
58
58
|
|
59
59
|
comp = ret.text
|
@@ -78,6 +78,7 @@ class VertexAI(BaseBackend):
|
|
78
78
|
prompt,
|
79
79
|
stream=True,
|
80
80
|
generation_config=GenerationConfig(**sampling_params.to_vertexai_kwargs()),
|
81
|
+
safety_settings=self.safety_settings,
|
81
82
|
)
|
82
83
|
for ret in generator:
|
83
84
|
yield ret.text, {}
|