sglang 0.1.13__py3-none-any.whl → 0.1.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +1 -1
- sglang/backend/anthropic.py +18 -12
- sglang/lang/ir.py +1 -1
- sglang/srt/constrained/fsm_cache.py +14 -3
- sglang/srt/managers/router/model_runner.py +32 -10
- sglang/srt/models/stablelm.py +293 -0
- sglang/srt/server.py +8 -0
- sglang/test/test_utils.py +1 -1
- {sglang-0.1.13.dist-info → sglang-0.1.14.dist-info}/METADATA +2 -2
- {sglang-0.1.13.dist-info → sglang-0.1.14.dist-info}/RECORD +13 -12
- {sglang-0.1.13.dist-info → sglang-0.1.14.dist-info}/WHEEL +1 -1
- {sglang-0.1.13.dist-info → sglang-0.1.14.dist-info}/LICENSE +0 -0
- {sglang-0.1.13.dist-info → sglang-0.1.14.dist-info}/top_level.txt +0 -0
sglang/__init__.py
CHANGED
sglang/backend/anthropic.py
CHANGED
@@ -30,13 +30,17 @@ class Anthropic(BaseBackend):
         s: StreamExecutor,
         sampling_params: SglSamplingParams,
     ):
-        prompt = s.text_
-        ret = anthropic.Anthropic().completions.create(
+        if s.messages_:
+            messages = s.messages_
+        else:
+            messages = [{"role": "user", "content": s.text_}]
+
+        ret = anthropic.Anthropic().messages.create(
             model=self.model_name,
-            prompt=prompt,
+            messages=messages,
             **sampling_params.to_anthropic_kwargs(),
         )
-        comp = ret.completion
+        comp = ret.content[0].text

         return comp, {}

@@ -45,13 +49,15 @@ class Anthropic(BaseBackend):
         s: StreamExecutor,
         sampling_params: SglSamplingParams,
     ):
-        prompt = s.text_
-        generator = anthropic.Anthropic().completions.create(
+        if s.messages_:
+            messages = s.messages_
+        else:
+            messages = [{"role": "user", "content": s.text_}]
+
+        with anthropic.Anthropic().messages.stream(
             model=self.model_name,
-            prompt=prompt,
-            stream=True,
+            messages=messages,
             **sampling_params.to_anthropic_kwargs(),
-        )
-
-        for ret in generator:
-            yield ret.completion, {}
+        ) as stream:
+            for text in stream.text_stream:
+                yield text, {}
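The rewrite moves the backend from the legacy Completions API to the Messages API introduced in anthropic 0.20, which is why METADATA below pins anthropic >=0.20.0. A minimal sketch of the two call shapes the new code relies on, assuming ANTHROPIC_API_KEY is set in the environment and using an illustrative model name:

    import anthropic

    client = anthropic.Anthropic()

    # Non-streaming: the reply text now arrives as a list of content blocks.
    resp = client.messages.create(
        model="claude-3-haiku-20240307",  # illustrative model name
        max_tokens=64,
        messages=[{"role": "user", "content": "Say hello."}],
    )
    print(resp.content[0].text)

    # Streaming: a context manager replaces the old stream=True flag.
    with client.messages.stream(
        model="claude-3-haiku-20240307",
        max_tokens=64,
        messages=[{"role": "user", "content": "Count to three."}],
    ) as stream:
        for text in stream.text_stream:
            print(text, end="")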
sglang/lang/ir.py
CHANGED
@@ -73,7 +73,7 @@ class SglSamplingParams:
                 "Regular expression is not supported in the Anthropic backend."
             )
         return {
-            "max_tokens_to_sample": self.max_new_tokens,
+            "max_tokens": self.max_new_tokens,
             "stop_sequences": (
                 self.stop if isinstance(self.stop, (list, tuple)) else [self.stop]
             ),
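This tracks the same API migration: the Messages API replaced the Completions-era max_tokens_to_sample parameter with max_tokens.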
sglang/srt/constrained/fsm_cache.py
CHANGED
@@ -5,9 +5,20 @@ from sglang.srt.constrained.base_cache import BaseCache
 class FSMCache(BaseCache):
     def __init__(self, tokenizer_path, tokenizer_args_dict, enable=True):
         super().__init__(enable=enable)
-        self.outlines_tokenizer = TransformerTokenizer(
-            tokenizer_path, **tokenizer_args_dict
-        )
+
+        from importlib.metadata import version
+        if version("outlines") >= "0.0.35":
+            from transformers import AutoTokenizer
+
+            tokenizer_args_dict.setdefault("padding_side", "left")
+            tokenizer = AutoTokenizer.from_pretrained(
+                tokenizer_path, **tokenizer_args_dict
+            )
+            self.outlines_tokenizer = TransformerTokenizer(tokenizer)
+        else:
+            self.outlines_tokenizer = TransformerTokenizer(
+                tokenizer_path, **tokenizer_args_dict
+            )

     def init_value(self, regex):
         return RegexFSM(regex, self.outlines_tokenizer)
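One caveat: version("outlines") >= "0.0.35" compares version strings lexicographically, which can misorder releases (for example "0.0.100" < "0.0.35" as strings). A more robust sketch of the same gate, assuming the packaging library is available:

    from importlib.metadata import version
    from packaging.version import Version

    # Version objects compare numerically, so 0.0.100 > 0.0.35 as expected.
    if Version(version("outlines")) >= Version("0.0.35"):
        ...  # build the HF tokenizer object first, then wrap it
    else:
        ...  # pass the path straight to TransformerTokenizer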
sglang/srt/managers/router/model_runner.py
CHANGED
@@ -1,8 +1,10 @@
 import importlib
 import logging
+import inspect
 from dataclasses import dataclass
 from functools import lru_cache
 from pathlib import Path
+import importlib.resources

 import numpy as np
 import torch
@@ -12,12 +14,16 @@ from sglang.srt.utils import is_multimodal_model
 from sglang.utils import get_available_gpu_memory
 from vllm.model_executor.layers.quantization.awq import AWQConfig
 from vllm.model_executor.layers.quantization.gptq import GPTQConfig
+from vllm.model_executor.layers.quantization.marlin import MarlinConfig
 from vllm.model_executor.model_loader import _set_default_torch_dtype
 from vllm.model_executor.parallel_utils.parallel_state import initialize_model_parallel

+import importlib
+import pkgutil
+
 import sglang

-QUANTIONCONFIG_MAPPING = {"awq": AWQConfig, "gptq": GPTQConfig}
+QUANTIONCONFIG_MAPPING = {"awq": AWQConfig, "gptq": GPTQConfig, "marlin": MarlinConfig}

 logger = logging.getLogger("model_runner")
@@ -29,10 +35,13 @@ global_server_args_dict: dict = None
 @lru_cache()
 def import_model_classes():
     model_arch_name_to_cls = {}
-    for module_path in (Path(sglang.__file__).parent / "srt" / "models").glob("*.py"):
-        module = importlib.import_module(f"sglang.srt.models.{module_path.stem}")
-        if hasattr(module, "EntryClass"):
-            model_arch_name_to_cls[module.EntryClass.__name__] = module.EntryClass
+    package_name = "sglang.srt.models"
+    package = importlib.import_module(package_name)
+    for finder, name, ispkg in pkgutil.iter_modules(package.__path__, package_name + '.'):
+        if not ispkg:
+            module = importlib.import_module(name)
+            if hasattr(module, "EntryClass"):
+                model_arch_name_to_cls[module.EntryClass.__name__] = module.EntryClass
     return model_arch_name_to_cls
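With this change, any module under sglang.srt.models that defines an EntryClass attribute is registered automatically (pkgutil.iter_modules works whether the package is installed flat or zipped, unlike globbing *.py files), so new models such as stablelm.py below need no central registry edit. A hypothetical module the scan would pick up:

    # sglang/srt/models/mymodel.py (hypothetical file, for illustration only)
    from torch import nn

    class MyModelForCausalLM(nn.Module):
        """Would be registered as model_arch_name_to_cls["MyModelForCausalLM"]."""

    EntryClass = MyModelForCausalLM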
@@ -124,14 +133,21 @@ class InputMetadata:
             self.prefill_wrapper = BatchPrefillWithPagedKVCacheWrapper(
                 workspace_buffer, "NHD"
             )
-            self.prefill_wrapper.begin_forward(
+            args = [
                 self.qo_indptr,
                 self.kv_indptr,
                 self.kv_indices,
                 self.kv_last_page_len,
                 self.model_runner.model_config.num_attention_heads // tp_size,
                 self.model_runner.model_config.num_key_value_heads // tp_size,
-            )
+            ]
+
+            # flashinfer >= 0.0.3
+            # FIXME: Drop this when flashinfer updates to 0.0.4
+            if len(inspect.signature(self.prefill_wrapper.begin_forward).parameters) == 7:
+                args.append(self.model_runner.model_config.head_dim)
+
+            self.prefill_wrapper.begin_forward(*args)
         else:
             self.decode_wrapper = BatchDecodeWithPagedKVCacheWrapper(
                 workspace_buffer, "NHD"
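The inspect.signature check lets one call site span flashinfer versions whose begin_forward takes six or seven parameters. The same arity-dispatch pattern in isolation, with hypothetical function names:

    import inspect

    def call_with_optional_arg(fn, base_args, extra_arg):
        # Append the extra argument only if fn's signature has room for it.
        args = list(base_args)
        if len(inspect.signature(fn).parameters) == len(base_args) + 1:
            args.append(extra_arg)
        return fn(*args)

    old_api = lambda a, b: (a, b)        # older release: two parameters
    new_api = lambda a, b, c: (a, b, c)  # newer release: one extra parameter
    assert call_with_optional_arg(old_api, [1, 2], 99) == (1, 2)
    assert call_with_optional_arg(new_api, [1, 2], 99) == (1, 2, 99)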
@@ -288,9 +304,15 @@ class ModelRunner:
             self.model_config.hf_config, "quantization_config", None
         )
         if hf_quant_config is not None:
-            quant_config_class = QUANTIONCONFIG_MAPPING.get(
-                hf_quant_config["quant_method"]
-            )
+            hf_quant_method = hf_quant_config["quant_method"]
+
+            # compat: autogptq uses is_marlin_format within quant config
+            if (hf_quant_method == "gptq"
+                    and "is_marlin_format" in hf_quant_config
+                    and hf_quant_config["is_marlin_format"]):
+                hf_quant_method = "marlin"
+            quant_config_class = QUANTIONCONFIG_MAPPING.get(hf_quant_method)
+
             if quant_config_class is None:
                 raise ValueError(
                     f"Unsupported quantization method: {hf_quant_config['quant_method']}"
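Per the comment in the patch, AutoGPTQ marks marlin-layout checkpoints inside a GPTQ quantization_config rather than declaring a separate quant method, so such checkpoints are rerouted to the new MarlinConfig entry. A rough sketch with an illustrative config dict:

    # Illustrative quantization_config as AutoGPTQ might emit it.
    hf_quant_config = {"quant_method": "gptq", "bits": 4, "is_marlin_format": True}

    hf_quant_method = hf_quant_config["quant_method"]
    if hf_quant_method == "gptq" and hf_quant_config.get("is_marlin_format"):
        hf_quant_method = "marlin"  # select MarlinConfig instead of GPTQConfig

    assert hf_quant_method == "marlin"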
sglang/srt/models/stablelm.py
ADDED
@@ -0,0 +1,293 @@
+# This code is based on:
+# https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/stablelm.py
+"""Inference-only StableLM-2 (https://huggingface.co/stabilityai/stablelm-2-1_6b)
+model compatible with HuggingFace weights."""
+from typing import Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.managers.router.model_runner import InputMetadata
+from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.linear import (
+    LinearMethodBase,
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    VocabParallelEmbedding,
+    ParallelLMHead,
+)
+from vllm.model_executor.parallel_utils.parallel_state import (
+    get_tensor_model_parallel_world_size,
+)
+from vllm.model_executor.weight_utils import (
+    default_weight_loader,
+    hf_model_weights_iterator,
+)
+
+
+class StablelmMLP(nn.Module):
+    def __init__(
+        self, config: PretrainedConfig, linear_method: Optional[LinearMethodBase] = None
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_up_proj = MergedColumnParallelLinear(
+            config.hidden_size,
+            [config.intermediate_size] * 2,
+            bias=False,
+            linear_method=linear_method,
+        )
+        self.down_proj = RowParallelLinear(
+            config.intermediate_size, config.hidden_size, bias=False
+        )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class StablelmAttention(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int = 0,
+        linear_method: Optional[LinearMethodBase] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = config.num_attention_heads
+        self.num_heads = self.total_num_heads // tp_size
+
+        self.total_num_key_value_heads = config.num_key_value_heads
+        if self.total_num_key_value_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_key_value_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_key_value_heads == 0
+        self.num_key_value_heads = max(1, self.total_num_key_value_heads // tp_size)
+        self.head_dim = self.hidden_size // self.total_num_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        rope_pct = getattr(
+            config, "rope_pct", getattr(config, "partial_rotary_factor", 1)
+        )
+        self.rotary_ndims = int(self.head_dim * rope_pct)
+        self.scaling = self.head_dim**-0.5
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_key_value_heads * self.head_dim
+        self.qkv_bias = getattr(config, "use_qkv_bias", False)
+        if (self.head_dim * self.num_heads * tp_size) != self.hidden_size:
+            raise ValueError(
+                f"hidden_size must be divisible by num_heads "
+                f"(got `hidden_size`: {self.hidden_size}"
+                f" and `num_heads`: {self.num_heads})."
+            )
+
+        self.qkv_proj = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_key_value_heads,
+            self.qkv_bias,
+            linear_method=linear_method,
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            self.hidden_size,
+            bias=False,
+            linear_method=linear_method,
+        )
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.rotary_ndims,
+            max_position=self.config.max_position_embeddings,
+            base=self.config.rope_theta,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_key_value_heads,
+            layer_id=layer_id,
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, input_metadata)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class StablelmDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int = 0,
+        linear_method: Optional[LinearMethodBase] = None,
+    ) -> None:
+        super().__init__()
+        self.self_attn = StablelmAttention(config, layer_id=layer_id)
+        self.mlp = StablelmMLP(config, linear_method)
+        norm_eps = getattr(config, "norm_eps", getattr(config, "layer_norm_eps", 1e-05))
+        self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=norm_eps)
+        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=norm_eps)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        input_metadata: InputMetadata,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            input_metadata=input_metadata,
+        )
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+
+        return hidden_states, residual
+
+
+class StableLMEpochModel(nn.Module):
+    def __init__(
+        self, config: PretrainedConfig, linear_method: Optional[LinearMethodBase] = None
+    ) -> None:
+        super().__init__()
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+        )
+        self.layers = nn.ModuleList(
+            [
+                StablelmDecoderLayer(config, i, linear_method)
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+        norm_eps = getattr(config, "norm_eps", getattr(config, "layer_norm_eps", 1e-05))
+        self.norm = nn.LayerNorm(config.hidden_size, eps=norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        input_metadata: InputMetadata,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                input_metadata,
+            )
+        hidden_states = self.norm(hidden_states)
+        return hidden_states
+
+
+class StableLmForCausalLM(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        linear_method: Optional[LinearMethodBase] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.linear_method = linear_method
+        self.model = StableLMEpochModel(config, linear_method)
+        self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
+        self.logits_processor = LogitsProcessor(config)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        input_metadata: InputMetadata,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, input_metadata, input_embeds)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head.weight, input_metadata
+        )
+
+    def load_weights(
+        self,
+        model_name_or_path: str,
+        cache_dir: Optional[str] = None,
+        load_format: str = "auto",
+        revision: Optional[str] = None,
+    ):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in hf_model_weights_iterator(
+            model_name_or_path, cache_dir, load_format, revision
+        ):
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = StableLmForCausalLM
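Because the module ends with EntryClass = StableLmForCausalLM, the pkgutil scan above registers it without further wiring. A quick smoke test through the frontend, assuming the checkpoint downloads cleanly (a sketch using the public sgl API of this release):

    import sglang as sgl

    runtime = sgl.Runtime(model_path="stabilityai/stablelm-2-1_6b")
    sgl.set_default_backend(runtime)

    @sgl.function
    def qa(s, question):
        s += "Q: " + question + "\nA:"
        s += sgl.gen("answer", max_tokens=32)

    state = qa.run(question="What is StableLM?")
    print(state["answer"])
    runtime.shutdown()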
sglang/srt/server.py
CHANGED
@@ -587,6 +587,10 @@ class Runtime:
         attention_reduce_in_fp32: bool = False,
         random_seed: int = 42,
         log_level: str = "error",
+        disable_radix_cache: bool = False,
+        enable_flashinfer: bool = False,
+        disable_regex_jump_forward: bool = False,
+        disable_disk_cache: bool = False,
         api_key: str = "",
         port: Optional[int] = None,
         additional_ports: Optional[Union[List[int], int]] = None,
@@ -610,6 +614,10 @@ class Runtime:
             attention_reduce_in_fp32=attention_reduce_in_fp32,
             random_seed=random_seed,
             log_level=log_level,
+            disable_radix_cache=disable_radix_cache,
+            enable_flashinfer=enable_flashinfer,
+            disable_regex_jump_forward=disable_regex_jump_forward,
+            disable_disk_cache=disable_disk_cache,
             api_key=api_key,
         )
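Runtime now forwards these four options along with the rest of the server arguments, so embedded use can toggle them without going through the CLI. For example (a sketch; the model path is illustrative):

    from sglang.srt.server import Runtime

    runtime = Runtime(
        model_path="stabilityai/stablelm-2-1_6b",  # illustrative
        disable_radix_cache=True,   # turn off prefix-cache reuse
        enable_flashinfer=True,     # opt in to flashinfer kernels
    )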
sglang/test/test_utils.py
CHANGED
@@ -155,7 +155,7 @@ def select_sglang_backend(args):
         global_config.enable_parallel_decoding = False
         global_config.enable_parallel_encoding = False
         backend = RuntimeEndpoint(f"{args.host}:{args.port}")
-    elif args.backend.startswith("gpt"):
+    elif args.backend.startswith("gpt-"):
         backend = OpenAI(args.backend)
     else:
         raise ValueError(f"Invalid backend: {args.backend}")
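The tightened prefix keeps the OpenAI branch from capturing every backend name that merely begins with "gpt"; only explicit "gpt-*" model names match now.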
{sglang-0.1.13.dist-info → sglang-0.1.14.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.1.13
+Version: 0.1.14
 Summary: A structured generation langauge for LLMs.
 License: Apache License
                                 Version 2.0, January 2004
@@ -217,7 +217,7 @@ Requires-Dist: sglang[srt] ; extra == 'all'
 Requires-Dist: sglang[openai] ; extra == 'all'
 Requires-Dist: sglang[anthropic] ; extra == 'all'
 Provides-Extra: anthropic
-Requires-Dist: anthropic ; extra == 'anthropic'
+Requires-Dist: anthropic >=0.20.0 ; extra == 'anthropic'
 Requires-Dist: numpy ; extra == 'anthropic'
 Provides-Extra: openai
 Requires-Dist: openai >=1.0 ; extra == 'openai'
{sglang-0.1.13.dist-info → sglang-0.1.14.dist-info}/RECORD
CHANGED
@@ -1,10 +1,10 @@
-sglang/__init__.py,sha256=
+sglang/__init__.py,sha256=Nxa2M7XCh2-e6I7VrCg7OSBL6BvEW3gyRD14ZdykpRM,96
 sglang/api.py,sha256=0-Eh7c41hWKjPXrzzvLFdLAUVkvmPGJGLAsrG9evDTE,4576
 sglang/global_config.py,sha256=PAX7TWeFcq0HBzNUWyCONAOjqIokWqw8vT7I6sBSKTc,797
 sglang/launch_server.py,sha256=jKPZRDN5bUe8Wgz5eoDkqeePhmKa8DLD4DpXQLT5auo,294
 sglang/utils.py,sha256=2dUXLMPz9VhhzbIRQABmfZnVW5yz61F3UVtb6yKyevM,6237
 sglang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sglang/backend/anthropic.py,sha256=
+sglang/backend/anthropic.py,sha256=GJ_T1Jg0VOtajgkgczPKt5sjuVYdbAiWd2jXlJRNRmg,1677
 sglang/backend/base_backend.py,sha256=APiMht4WYECLCOGRPCEUF6lX-an1vjVe2dWoMSgymWY,1831
 sglang/backend/openai.py,sha256=nPdA88A5GISJTH88svJdww3qHWIHZcGG2NEn0XjMkLU,9578
 sglang/backend/runtime_endpoint.py,sha256=r7dTazselaudlFx8hqk-PQLYDHZhpbAKjyFF1zLuM_E,8022
@@ -13,7 +13,7 @@ sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sglang/lang/chat_template.py,sha256=MaCF0fvNky0nJC9OvmAeApeHYgM6Lr03mtRhF0lS31U,8000
 sglang/lang/compiler.py,sha256=wNn_UqV6Sxl22mv-PpzFUtRgiFFV-Y4OYpO4LshEoRM,7527
 sglang/lang/interpreter.py,sha256=ahRxuEJZ7b1Tts2Lr7wViWIqL-Z12T3anvgj0XdvMN8,26666
-sglang/lang/ir.py,sha256=
+sglang/lang/ir.py,sha256=8Ap-uEUz6K9eNQTOKtMixePuLwRFHFKcN0Z5Yn44nKk,13320
 sglang/lang/tracer.py,sha256=pFiSNzPSg0l7ZZIlGqJDLCmQALR-wyo2dFgJP73J4_Y,8260
 sglang/srt/backend_config.py,sha256=UIV6kIU2j-Xh0eoezn1aXcYIy0miftHsWFeAZwqpbGE,227
 sglang/srt/conversation.py,sha256=mTstD-SsXG5p_YhWQUPEWU-vzzDMF4RgQ7KmLkOOC7U,15496
@@ -22,12 +22,12 @@ sglang/srt/memory_pool.py,sha256=BMoX2wvicj214mV-xvcr_Iv_Je0qs3zTuzXfQVpV8u4,360
 sglang/srt/mm_utils.py,sha256=OptgAHDX-73Bk4jAdr2BOAJtiEXJNzPrMhaM-dy275c,8889
 sglang/srt/model_config.py,sha256=ned-odjmKBKBhVPo04FEpus9gJsUWxrFLrLxahLwSaw,1328
 sglang/srt/sampling_params.py,sha256=83Fp-4HWThC20TEh139XcIb_erBqfI7KZg5txdRBq7c,2896
-sglang/srt/server.py,sha256=
+sglang/srt/server.py,sha256=WLXissKuXQI7JFb2V8D47QSF-PPHnW-JZCiQm4YW0xE,24070
 sglang/srt/server_args.py,sha256=bvbi-Rb_JudqztFFfRsuXBYtUsG9hq4zMFt7X97uDhA,8954
 sglang/srt/utils.py,sha256=IEqpmWx_hl4eXn_KoHM0EPXmxeN2wKkgK7H01_t0x5Q,7355
 sglang/srt/constrained/__init__.py,sha256=BPRNDJnWtzYJ13X4urRS5aE6wFuwAVNBA9qeWIHF8rE,1236
 sglang/srt/constrained/base_cache.py,sha256=QQjmFEiT8jlOskJoZobhrDl2TKB-B4b1LPQo9JQCP_w,1405
-sglang/srt/constrained/fsm_cache.py,sha256=
+sglang/srt/constrained/fsm_cache.py,sha256=20mEgtDXU1Zeoicl5KBQC3arkg-RhRWiYnchJc00m1g,901
 sglang/srt/constrained/jump_forward.py,sha256=Z-pz2Jnvk1CxSEZA65OVq0GryqdiKuOkhhc13v5T6Lo,2482
 sglang/srt/layers/context_flashattention_nopad.py,sha256=TVYQ6IjftWVXORmKpEROMqQxDOnF6n2g0G1Ci4LquYM,5209
 sglang/srt/layers/extend_attention.py,sha256=KGqQOA5mel9qScXMAQP_3Qyhp3BNbiQ7Y_6wi38Lxcs,12622
@@ -41,7 +41,7 @@ sglang/srt/managers/tokenizer_manager.py,sha256=hgsR9AMj6ic9S3-2WiELh7Hnp8Xnb_bz
 sglang/srt/managers/router/infer_batch.py,sha256=U-Ckt9ad1WaOQF_dW6Eo9AMIRQoOJQ-Pm-MMXnEmPP8,18399
 sglang/srt/managers/router/manager.py,sha256=TNYs0IrkZGkPvZJViwL7BMUg0VlvzeyTjDMjuvRoMDI,2529
 sglang/srt/managers/router/model_rpc.py,sha256=VlwLNpHZ92bnteQl4PhVKoAXM0C8Y4_2LBBVaffeu3g,26766
-sglang/srt/managers/router/model_runner.py,sha256
+sglang/srt/managers/router/model_runner.py,sha256=-wWv00EbB_UkkLpio6VKGBTagfzxLHfY-eKDDQ0rZQc,18292
 sglang/srt/managers/router/radix_cache.py,sha256=XGUF5mxQTSCzD7GW_ltNP2p5aelEKrMXzdezufJ7NCQ,6484
 sglang/srt/managers/router/scheduler.py,sha256=V-LAnVSzgD2ddy2eXW3jWURCeq9Lv7YxCGk4kHyytfM,2818
 sglang/srt/models/gemma.py,sha256=8XlfHPtVixPYYjz5F9T4DOAuoordWFStmyFFWGfny1k,11582
@@ -51,13 +51,14 @@ sglang/srt/models/mistral.py,sha256=XSn7fiZqspyWVTYrpVAacAnWdwAybBtyn9-Sh9AvMTM,
 sglang/srt/models/mixtral.py,sha256=wqIwKfR90ih0gDiTZkFZcQD4PIYpZFD3CmzxRcuKIqw,13915
 sglang/srt/models/qwen.py,sha256=CvdbcF90aI1tJPSQ-3OMUaQGMuaxCGe0y29m5nU_Yj0,9225
 sglang/srt/models/qwen2.py,sha256=myPc0wvgf5ZzJyGhUGN49YjY-tMf4t8Jn_Imjg8D7Mk,11307
+sglang/srt/models/stablelm.py,sha256=vMZUNgwXKPGYr5FcdYHw5g3QifVu9owKqq51_-EBOY0,10817
 sglang/srt/models/yivl.py,sha256=Qvp-zQ93cOZGg3zVyaiQLhRsfXiLrQhxu9TyQP2FMm4,4414
 sglang/test/test_conversation.py,sha256=1zIrXcXiwEliPHgDAsqsQUA7JKzZ5fnQEU-U6L887FU,1592
 sglang/test/test_openai_protocol.py,sha256=eePzoskYR3PqfWczSVZvg8ja63qbT8TFUNEMyzDZpa8,1657
 sglang/test/test_programs.py,sha256=mrLhGuprwvx8ZJ-0Qe28E-iCw5Qv-9T0SAv1Jgo1AJw,11421
-sglang/test/test_utils.py,sha256=
-sglang-0.1.
-sglang-0.1.
-sglang-0.1.
-sglang-0.1.
-sglang-0.1.
+sglang/test/test_utils.py,sha256=6PhTRi8UnR-BRNjit6aGu0M5lO0RebNQwEcDt712hE4,4830
+sglang-0.1.14.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+sglang-0.1.14.dist-info/METADATA,sha256=C5N0VOYRHixdJcsf4dExIvP-Q099kYBMKs_dA4LBXSM,28809
+sglang-0.1.14.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+sglang-0.1.14.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+sglang-0.1.14.dist-info/RECORD,,
{sglang-0.1.13.dist-info → sglang-0.1.14.dist-info}/LICENSE
File without changes
{sglang-0.1.13.dist-info → sglang-0.1.14.dist-info}/top_level.txt
File without changes