sglang 0.1.13__py3-none-any.whl → 0.1.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sglang/__init__.py CHANGED
@@ -1,4 +1,4 @@
1
- __version__ = "0.1.13"
1
+ __version__ = "0.1.14"
2
2
 
3
3
  from sglang.api import *
4
4
  from sglang.global_config import global_config
@@ -30,13 +30,17 @@ class Anthropic(BaseBackend):
30
30
  s: StreamExecutor,
31
31
  sampling_params: SglSamplingParams,
32
32
  ):
33
- prompt = s.text_
34
- ret = anthropic.Anthropic().completions.create(
33
+ if s.messages_:
34
+ messages = s.messages_
35
+ else:
36
+ messages = [{"role": "user", "content": s.text_}]
37
+
38
+ ret = anthropic.Anthropic().messages.create(
35
39
  model=self.model_name,
36
- prompt=prompt,
40
+ messages=messages,
37
41
  **sampling_params.to_anthropic_kwargs(),
38
42
  )
39
- comp = ret.completion
43
+ comp = ret.content[0].text
40
44
 
41
45
  return comp, {}
42
46
 
@@ -45,13 +49,15 @@ class Anthropic(BaseBackend):
45
49
  s: StreamExecutor,
46
50
  sampling_params: SglSamplingParams,
47
51
  ):
48
- prompt = s.text_
49
- generator = anthropic.Anthropic().completions.create(
52
+ if s.messages_:
53
+ messages = s.messages_
54
+ else:
55
+ messages = [{"role": "user", "content": s.text_}]
56
+
57
+ with anthropic.Anthropic().messages.stream(
50
58
  model=self.model_name,
51
- prompt=prompt,
52
- stream=True,
59
+ messages=messages,
53
60
  **sampling_params.to_anthropic_kwargs(),
54
- )
55
-
56
- for ret in generator:
57
- yield ret.completion, {}
61
+ ) as stream:
62
+ for text in stream.text_stream:
63
+ yield text, {}
sglang/lang/ir.py CHANGED
@@ -73,7 +73,7 @@ class SglSamplingParams:
73
73
  "Regular expression is not supported in the Anthropic backend."
74
74
  )
75
75
  return {
76
- "max_tokens_to_sample": self.max_new_tokens,
76
+ "max_tokens": self.max_new_tokens,
77
77
  "stop_sequences": (
78
78
  self.stop if isinstance(self.stop, (list, tuple)) else [self.stop]
79
79
  ),
@@ -5,9 +5,20 @@ from sglang.srt.constrained.base_cache import BaseCache
5
5
  class FSMCache(BaseCache):
6
6
  def __init__(self, tokenizer_path, tokenizer_args_dict, enable=True):
7
7
  super().__init__(enable=enable)
8
- self.outlines_tokenizer = TransformerTokenizer(
9
- tokenizer_path, **tokenizer_args_dict
10
- )
8
+
9
+ from importlib.metadata import version
10
+ if version("outlines") >= "0.0.35":
11
+ from transformers import AutoTokenizer
12
+
13
+ tokenizer_args_dict.setdefault("padding_side", "left")
14
+ tokenizer = AutoTokenizer.from_pretrained(
15
+ tokenizer_path, **tokenizer_args_dict
16
+ )
17
+ self.outlines_tokenizer = TransformerTokenizer(tokenizer)
18
+ else:
19
+ self.outlines_tokenizer = TransformerTokenizer(
20
+ tokenizer_path, **tokenizer_args_dict
21
+ )
11
22
 
12
23
  def init_value(self, regex):
13
24
  return RegexFSM(regex, self.outlines_tokenizer)
@@ -1,8 +1,10 @@
1
1
  import importlib
2
2
  import logging
3
+ import inspect
3
4
  from dataclasses import dataclass
4
5
  from functools import lru_cache
5
6
  from pathlib import Path
7
+ import importlib.resources
6
8
 
7
9
  import numpy as np
8
10
  import torch
@@ -12,12 +14,16 @@ from sglang.srt.utils import is_multimodal_model
12
14
  from sglang.utils import get_available_gpu_memory
13
15
  from vllm.model_executor.layers.quantization.awq import AWQConfig
14
16
  from vllm.model_executor.layers.quantization.gptq import GPTQConfig
17
+ from vllm.model_executor.layers.quantization.marlin import MarlinConfig
15
18
  from vllm.model_executor.model_loader import _set_default_torch_dtype
16
19
  from vllm.model_executor.parallel_utils.parallel_state import initialize_model_parallel
17
20
 
21
+ import importlib
22
+ import pkgutil
23
+
18
24
  import sglang
19
25
 
20
- QUANTIONCONFIG_MAPPING = {"awq": AWQConfig, "gptq": GPTQConfig}
26
+ QUANTIONCONFIG_MAPPING = {"awq": AWQConfig, "gptq": GPTQConfig, "marlin": MarlinConfig}
21
27
 
22
28
  logger = logging.getLogger("model_runner")
23
29
 
@@ -29,10 +35,13 @@ global_server_args_dict: dict = None
29
35
  @lru_cache()
30
36
  def import_model_classes():
31
37
  model_arch_name_to_cls = {}
32
- for module_path in (Path(sglang.__file__).parent / "srt" / "models").glob("*.py"):
33
- module = importlib.import_module(f"sglang.srt.models.{module_path.stem}")
34
- if hasattr(module, "EntryClass"):
35
- model_arch_name_to_cls[module.EntryClass.__name__] = module.EntryClass
38
+ package_name = "sglang.srt.models"
39
+ package = importlib.import_module(package_name)
40
+ for finder, name, ispkg in pkgutil.iter_modules(package.__path__, package_name + '.'):
41
+ if not ispkg:
42
+ module = importlib.import_module(name)
43
+ if hasattr(module, "EntryClass"):
44
+ model_arch_name_to_cls[module.EntryClass.__name__] = module.EntryClass
36
45
  return model_arch_name_to_cls
37
46
 
38
47
 
@@ -124,14 +133,21 @@ class InputMetadata:
124
133
  self.prefill_wrapper = BatchPrefillWithPagedKVCacheWrapper(
125
134
  workspace_buffer, "NHD"
126
135
  )
127
- self.prefill_wrapper.begin_forward(
136
+ args = [
128
137
  self.qo_indptr,
129
138
  self.kv_indptr,
130
139
  self.kv_indices,
131
140
  self.kv_last_page_len,
132
141
  self.model_runner.model_config.num_attention_heads // tp_size,
133
142
  self.model_runner.model_config.num_key_value_heads // tp_size,
134
- )
143
+ ]
144
+
145
+ # flashinfer >= 0.0.3
146
+ # FIXME: Drop this when flashinfer updates to 0.0.4
147
+ if len(inspect.signature(self.prefill_wrapper.begin_forward).parameters) == 7:
148
+ args.append(self.model_runner.model_config.head_dim)
149
+
150
+ self.prefill_wrapper.begin_forward(*args)
135
151
  else:
136
152
  self.decode_wrapper = BatchDecodeWithPagedKVCacheWrapper(
137
153
  workspace_buffer, "NHD"
@@ -288,9 +304,15 @@ class ModelRunner:
288
304
  self.model_config.hf_config, "quantization_config", None
289
305
  )
290
306
  if hf_quant_config is not None:
291
- quant_config_class = QUANTIONCONFIG_MAPPING.get(
292
- hf_quant_config["quant_method"]
293
- )
307
+ hf_quant_method = hf_quant_config["quant_method"]
308
+
309
+ # compat: autogptq uses is_marlin_format within quant config
310
+ if (hf_quant_method == "gptq"
311
+ and "is_marlin_format" in hf_quant_config
312
+ and hf_quant_config["is_marlin_format"]):
313
+ hf_quant_method = "marlin"
314
+ quant_config_class = QUANTIONCONFIG_MAPPING.get(hf_quant_method)
315
+
294
316
  if quant_config_class is None:
295
317
  raise ValueError(
296
318
  f"Unsupported quantization method: {hf_quant_config['quant_method']}"
@@ -0,0 +1,293 @@
1
+ # This code is based on:
2
+ # https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/stablelm.py
3
+ """Inference-only StableLM-2 (https://huggingface.co/stabilityai/stablelm-2-1_6b)
4
+ model compatible with HuggingFace weights."""
5
+ from typing import Optional, Tuple
6
+
7
+ import torch
8
+ from torch import nn
9
+ from transformers import PretrainedConfig
10
+
11
+ from sglang.srt.layers.logits_processor import LogitsProcessor
12
+ from sglang.srt.layers.radix_attention import RadixAttention
13
+ from sglang.srt.managers.router.model_runner import InputMetadata
14
+ from vllm.model_executor.layers.activation import SiluAndMul
15
+ from vllm.model_executor.layers.linear import (
16
+ LinearMethodBase,
17
+ MergedColumnParallelLinear,
18
+ QKVParallelLinear,
19
+ RowParallelLinear,
20
+ )
21
+ from vllm.model_executor.layers.rotary_embedding import get_rope
22
+ from vllm.model_executor.layers.vocab_parallel_embedding import (
23
+ VocabParallelEmbedding,
24
+ ParallelLMHead,
25
+ )
26
+ from vllm.model_executor.parallel_utils.parallel_state import (
27
+ get_tensor_model_parallel_world_size,
28
+ )
29
+ from vllm.model_executor.weight_utils import (
30
+ default_weight_loader,
31
+ hf_model_weights_iterator,
32
+ )
33
+
34
+
35
+ class StablelmMLP(nn.Module):
36
+ def __init__(
37
+ self, config: PretrainedConfig, linear_method: Optional[LinearMethodBase] = None
38
+ ) -> None:
39
+ super().__init__()
40
+ self.config = config
41
+ self.hidden_size = config.hidden_size
42
+ self.intermediate_size = config.intermediate_size
43
+ self.gate_up_proj = MergedColumnParallelLinear(
44
+ config.hidden_size,
45
+ [config.intermediate_size] * 2,
46
+ bias=False,
47
+ linear_method=linear_method,
48
+ )
49
+ self.down_proj = RowParallelLinear(
50
+ config.intermediate_size, config.hidden_size, bias=False
51
+ )
52
+ self.act_fn = SiluAndMul()
53
+
54
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
55
+ gate_up, _ = self.gate_up_proj(x)
56
+ x = self.act_fn(gate_up)
57
+ x, _ = self.down_proj(x)
58
+ return x
59
+
60
+
61
+ class StablelmAttention(nn.Module):
62
+ def __init__(
63
+ self,
64
+ config: PretrainedConfig,
65
+ layer_id: int = 0,
66
+ linear_method: Optional[LinearMethodBase] = None,
67
+ ) -> None:
68
+ super().__init__()
69
+ self.config = config
70
+ self.hidden_size = config.hidden_size
71
+ tp_size = get_tensor_model_parallel_world_size()
72
+ self.total_num_heads = config.num_attention_heads
73
+ self.num_heads = self.total_num_heads // tp_size
74
+
75
+ self.total_num_key_value_heads = config.num_key_value_heads
76
+ if self.total_num_key_value_heads >= tp_size:
77
+ # Number of KV heads is greater than TP size, so we partition
78
+ # the KV heads across multiple tensor parallel GPUs.
79
+ assert self.total_num_key_value_heads % tp_size == 0
80
+ else:
81
+ # Number of KV heads is less than TP size, so we replicate
82
+ # the KV heads across multiple tensor parallel GPUs.
83
+ assert tp_size % self.total_num_key_value_heads == 0
84
+ self.num_key_value_heads = max(1, self.total_num_key_value_heads // tp_size)
85
+ self.head_dim = self.hidden_size // self.total_num_heads
86
+ self.max_position_embeddings = config.max_position_embeddings
87
+ rope_pct = getattr(
88
+ config, "rope_pct", getattr(config, "partial_rotary_factor", 1)
89
+ )
90
+ self.rotary_ndims = int(self.head_dim * rope_pct)
91
+ self.scaling = self.head_dim**-0.5
92
+ self.q_size = self.num_heads * self.head_dim
93
+ self.kv_size = self.num_key_value_heads * self.head_dim
94
+ self.qkv_bias = getattr(config, "use_qkv_bias", False)
95
+ if (self.head_dim * self.num_heads * tp_size) != self.hidden_size:
96
+ raise ValueError(
97
+ f"hidden_size must be divisible by num_heads "
98
+ f"(got `hidden_size`: {self.hidden_size}"
99
+ f" and `num_heads`: {self.num_heads})."
100
+ )
101
+
102
+ self.qkv_proj = QKVParallelLinear(
103
+ self.hidden_size,
104
+ self.head_dim,
105
+ self.total_num_heads,
106
+ self.total_num_key_value_heads,
107
+ self.qkv_bias,
108
+ linear_method=linear_method,
109
+ )
110
+ self.o_proj = RowParallelLinear(
111
+ self.total_num_heads * self.head_dim,
112
+ self.hidden_size,
113
+ bias=False,
114
+ linear_method=linear_method,
115
+ )
116
+ self.rotary_emb = get_rope(
117
+ self.head_dim,
118
+ rotary_dim=self.rotary_ndims,
119
+ max_position=self.config.max_position_embeddings,
120
+ base=self.config.rope_theta,
121
+ )
122
+ self.attn = RadixAttention(
123
+ self.num_heads,
124
+ self.head_dim,
125
+ self.scaling,
126
+ num_kv_heads=self.num_key_value_heads,
127
+ layer_id=layer_id,
128
+ )
129
+
130
+ def forward(
131
+ self,
132
+ positions: torch.Tensor,
133
+ hidden_states: torch.Tensor,
134
+ input_metadata: InputMetadata,
135
+ ) -> torch.Tensor:
136
+ qkv, _ = self.qkv_proj(hidden_states)
137
+ q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
138
+ q, k = self.rotary_emb(positions, q, k)
139
+ attn_output = self.attn(q, k, v, input_metadata)
140
+ output, _ = self.o_proj(attn_output)
141
+ return output
142
+
143
+
144
+ class StablelmDecoderLayer(nn.Module):
145
+ def __init__(
146
+ self,
147
+ config: PretrainedConfig,
148
+ layer_id: int = 0,
149
+ linear_method: Optional[LinearMethodBase] = None,
150
+ ) -> None:
151
+ super().__init__()
152
+ self.self_attn = StablelmAttention(config, layer_id=layer_id)
153
+ self.mlp = StablelmMLP(config, linear_method)
154
+ norm_eps = getattr(config, "norm_eps", getattr(config, "layer_norm_eps", 1e-05))
155
+ self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=norm_eps)
156
+ self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=norm_eps)
157
+
158
+ def forward(
159
+ self,
160
+ positions: torch.Tensor,
161
+ hidden_states: torch.Tensor,
162
+ input_metadata: InputMetadata,
163
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
164
+ # Self Attention
165
+ residual = hidden_states
166
+ hidden_states = self.input_layernorm(hidden_states)
167
+ hidden_states = self.self_attn(
168
+ positions=positions,
169
+ hidden_states=hidden_states,
170
+ input_metadata=input_metadata,
171
+ )
172
+ hidden_states = residual + hidden_states
173
+
174
+ # Fully Connected
175
+ residual = hidden_states
176
+ hidden_states = self.post_attention_layernorm(hidden_states)
177
+ hidden_states = self.mlp(hidden_states)
178
+ hidden_states = residual + hidden_states
179
+
180
+ return hidden_states, residual
181
+
182
+
183
+ class StableLMEpochModel(nn.Module):
184
+ def __init__(
185
+ self, config: PretrainedConfig, linear_method: Optional[LinearMethodBase] = None
186
+ ) -> None:
187
+ super().__init__()
188
+ self.embed_tokens = VocabParallelEmbedding(
189
+ config.vocab_size,
190
+ config.hidden_size,
191
+ )
192
+ self.layers = nn.ModuleList(
193
+ [
194
+ StablelmDecoderLayer(config, i, linear_method)
195
+ for i in range(config.num_hidden_layers)
196
+ ]
197
+ )
198
+ norm_eps = getattr(config, "norm_eps", getattr(config, "layer_norm_eps", 1e-05))
199
+ self.norm = nn.LayerNorm(config.hidden_size, eps=norm_eps)
200
+
201
+ def forward(
202
+ self,
203
+ input_ids: torch.Tensor,
204
+ positions: torch.Tensor,
205
+ input_metadata: InputMetadata,
206
+ input_embeds: torch.Tensor = None,
207
+ ) -> torch.Tensor:
208
+ if input_embeds is None:
209
+ hidden_states = self.embed_tokens(input_ids)
210
+ else:
211
+ hidden_states = input_embeds
212
+ for i in range(len(self.layers)):
213
+ layer = self.layers[i]
214
+ hidden_states, residual = layer(
215
+ positions,
216
+ hidden_states,
217
+ input_metadata,
218
+ )
219
+ hidden_states = self.norm(hidden_states)
220
+ return hidden_states
221
+
222
+
223
+ class StableLmForCausalLM(nn.Module):
224
+ def __init__(
225
+ self,
226
+ config: PretrainedConfig,
227
+ linear_method: Optional[LinearMethodBase] = None,
228
+ ) -> None:
229
+ super().__init__()
230
+ self.config = config
231
+ self.linear_method = linear_method
232
+ self.model = StableLMEpochModel(config, linear_method)
233
+ self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
234
+ self.logits_processor = LogitsProcessor(config)
235
+
236
+ def forward(
237
+ self,
238
+ input_ids: torch.Tensor,
239
+ positions: torch.Tensor,
240
+ input_metadata: InputMetadata,
241
+ input_embeds: torch.Tensor = None,
242
+ ) -> torch.Tensor:
243
+ hidden_states = self.model(input_ids, positions, input_metadata, input_embeds)
244
+ return self.logits_processor(
245
+ input_ids, hidden_states, self.lm_head.weight, input_metadata
246
+ )
247
+
248
+ def load_weights(
249
+ self,
250
+ model_name_or_path: str,
251
+ cache_dir: Optional[str] = None,
252
+ load_format: str = "auto",
253
+ revision: Optional[str] = None,
254
+ ):
255
+ stacked_params_mapping = [
256
+ # (param_name, shard_name, shard_id)
257
+ ("qkv_proj", "q_proj", "q"),
258
+ ("qkv_proj", "k_proj", "k"),
259
+ ("qkv_proj", "v_proj", "v"),
260
+ ("gate_up_proj", "gate_proj", 0),
261
+ ("gate_up_proj", "up_proj", 1),
262
+ ]
263
+ params_dict = dict(self.named_parameters())
264
+ for name, loaded_weight in hf_model_weights_iterator(
265
+ model_name_or_path, cache_dir, load_format, revision
266
+ ):
267
+ if "rotary_emb.inv_freq" in name:
268
+ continue
269
+ if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
270
+ # Models trained using ColossalAI may include these tensors in
271
+ # the checkpoint. Skip them.
272
+ continue
273
+ for param_name, weight_name, shard_id in stacked_params_mapping:
274
+ if weight_name not in name:
275
+ continue
276
+ name = name.replace(weight_name, param_name)
277
+ # Skip loading extra bias for GPTQ models.
278
+ if name.endswith(".bias") and name not in params_dict:
279
+ continue
280
+ param = params_dict[name]
281
+ weight_loader = param.weight_loader
282
+ weight_loader(param, loaded_weight, shard_id)
283
+ break
284
+ else:
285
+ # Skip loading extra bias for GPTQ models.
286
+ if name.endswith(".bias") and name not in params_dict:
287
+ continue
288
+ param = params_dict[name]
289
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
290
+ weight_loader(param, loaded_weight)
291
+
292
+
293
+ EntryClass = StableLmForCausalLM
sglang/srt/server.py CHANGED
@@ -587,6 +587,10 @@ class Runtime:
587
587
  attention_reduce_in_fp32: bool = False,
588
588
  random_seed: int = 42,
589
589
  log_level: str = "error",
590
+ disable_radix_cache: bool = False,
591
+ enable_flashinfer: bool = False,
592
+ disable_regex_jump_forward: bool = False,
593
+ disable_disk_cache: bool = False,
590
594
  api_key: str = "",
591
595
  port: Optional[int] = None,
592
596
  additional_ports: Optional[Union[List[int], int]] = None,
@@ -610,6 +614,10 @@ class Runtime:
610
614
  attention_reduce_in_fp32=attention_reduce_in_fp32,
611
615
  random_seed=random_seed,
612
616
  log_level=log_level,
617
+ disable_radix_cache=disable_radix_cache,
618
+ enable_flashinfer=enable_flashinfer,
619
+ disable_regex_jump_forward=disable_regex_jump_forward,
620
+ disable_disk_cache=disable_disk_cache,
613
621
  api_key=api_key,
614
622
  )
615
623
 
sglang/test/test_utils.py CHANGED
@@ -155,7 +155,7 @@ def select_sglang_backend(args):
155
155
  global_config.enable_parallel_decoding = False
156
156
  global_config.enable_parallel_encoding = False
157
157
  backend = RuntimeEndpoint(f"{args.host}:{args.port}")
158
- elif args.backend.startswith("gpt"):
158
+ elif args.backend.startswith("gpt-"):
159
159
  backend = OpenAI(args.backend)
160
160
  else:
161
161
  raise ValueError(f"Invalid backend: {args.backend}")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sglang
3
- Version: 0.1.13
3
+ Version: 0.1.14
4
4
  Summary: A structured generation langauge for LLMs.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -217,7 +217,7 @@ Requires-Dist: sglang[srt] ; extra == 'all'
217
217
  Requires-Dist: sglang[openai] ; extra == 'all'
218
218
  Requires-Dist: sglang[anthropic] ; extra == 'all'
219
219
  Provides-Extra: anthropic
220
- Requires-Dist: anthropic ; extra == 'anthropic'
220
+ Requires-Dist: anthropic >=0.20.0 ; extra == 'anthropic'
221
221
  Requires-Dist: numpy ; extra == 'anthropic'
222
222
  Provides-Extra: openai
223
223
  Requires-Dist: openai >=1.0 ; extra == 'openai'
@@ -1,10 +1,10 @@
1
- sglang/__init__.py,sha256=_FURrkSPmWc17ErPLtvNXIu760F-ChqEyi5wfn08WJA,96
1
+ sglang/__init__.py,sha256=Nxa2M7XCh2-e6I7VrCg7OSBL6BvEW3gyRD14ZdykpRM,96
2
2
  sglang/api.py,sha256=0-Eh7c41hWKjPXrzzvLFdLAUVkvmPGJGLAsrG9evDTE,4576
3
3
  sglang/global_config.py,sha256=PAX7TWeFcq0HBzNUWyCONAOjqIokWqw8vT7I6sBSKTc,797
4
4
  sglang/launch_server.py,sha256=jKPZRDN5bUe8Wgz5eoDkqeePhmKa8DLD4DpXQLT5auo,294
5
5
  sglang/utils.py,sha256=2dUXLMPz9VhhzbIRQABmfZnVW5yz61F3UVtb6yKyevM,6237
6
6
  sglang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
- sglang/backend/anthropic.py,sha256=y5TN9EDrJtOH4JEUxpXu-endloeYBy7xMUr3r7Ah3MA,1462
7
+ sglang/backend/anthropic.py,sha256=GJ_T1Jg0VOtajgkgczPKt5sjuVYdbAiWd2jXlJRNRmg,1677
8
8
  sglang/backend/base_backend.py,sha256=APiMht4WYECLCOGRPCEUF6lX-an1vjVe2dWoMSgymWY,1831
9
9
  sglang/backend/openai.py,sha256=nPdA88A5GISJTH88svJdww3qHWIHZcGG2NEn0XjMkLU,9578
10
10
  sglang/backend/runtime_endpoint.py,sha256=r7dTazselaudlFx8hqk-PQLYDHZhpbAKjyFF1zLuM_E,8022
@@ -13,7 +13,7 @@ sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
13
13
  sglang/lang/chat_template.py,sha256=MaCF0fvNky0nJC9OvmAeApeHYgM6Lr03mtRhF0lS31U,8000
14
14
  sglang/lang/compiler.py,sha256=wNn_UqV6Sxl22mv-PpzFUtRgiFFV-Y4OYpO4LshEoRM,7527
15
15
  sglang/lang/interpreter.py,sha256=ahRxuEJZ7b1Tts2Lr7wViWIqL-Z12T3anvgj0XdvMN8,26666
16
- sglang/lang/ir.py,sha256=QSx0vMepQ01SaQ4EQjUqbJknHSrF557CqHuosQi6otQ,13330
16
+ sglang/lang/ir.py,sha256=8Ap-uEUz6K9eNQTOKtMixePuLwRFHFKcN0Z5Yn44nKk,13320
17
17
  sglang/lang/tracer.py,sha256=pFiSNzPSg0l7ZZIlGqJDLCmQALR-wyo2dFgJP73J4_Y,8260
18
18
  sglang/srt/backend_config.py,sha256=UIV6kIU2j-Xh0eoezn1aXcYIy0miftHsWFeAZwqpbGE,227
19
19
  sglang/srt/conversation.py,sha256=mTstD-SsXG5p_YhWQUPEWU-vzzDMF4RgQ7KmLkOOC7U,15496
@@ -22,12 +22,12 @@ sglang/srt/memory_pool.py,sha256=BMoX2wvicj214mV-xvcr_Iv_Je0qs3zTuzXfQVpV8u4,360
22
22
  sglang/srt/mm_utils.py,sha256=OptgAHDX-73Bk4jAdr2BOAJtiEXJNzPrMhaM-dy275c,8889
23
23
  sglang/srt/model_config.py,sha256=ned-odjmKBKBhVPo04FEpus9gJsUWxrFLrLxahLwSaw,1328
24
24
  sglang/srt/sampling_params.py,sha256=83Fp-4HWThC20TEh139XcIb_erBqfI7KZg5txdRBq7c,2896
25
- sglang/srt/server.py,sha256=fiIhWV5a4J1dprgiq_56K29Y24hIdsWJvd7CckJ71GE,23674
25
+ sglang/srt/server.py,sha256=WLXissKuXQI7JFb2V8D47QSF-PPHnW-JZCiQm4YW0xE,24070
26
26
  sglang/srt/server_args.py,sha256=bvbi-Rb_JudqztFFfRsuXBYtUsG9hq4zMFt7X97uDhA,8954
27
27
  sglang/srt/utils.py,sha256=IEqpmWx_hl4eXn_KoHM0EPXmxeN2wKkgK7H01_t0x5Q,7355
28
28
  sglang/srt/constrained/__init__.py,sha256=BPRNDJnWtzYJ13X4urRS5aE6wFuwAVNBA9qeWIHF8rE,1236
29
29
  sglang/srt/constrained/base_cache.py,sha256=QQjmFEiT8jlOskJoZobhrDl2TKB-B4b1LPQo9JQCP_w,1405
30
- sglang/srt/constrained/fsm_cache.py,sha256=Q0J4St3XUOt2tKFVpj0B2KIZ6z3X6cIzTcjREVqy3pg,471
30
+ sglang/srt/constrained/fsm_cache.py,sha256=20mEgtDXU1Zeoicl5KBQC3arkg-RhRWiYnchJc00m1g,901
31
31
  sglang/srt/constrained/jump_forward.py,sha256=Z-pz2Jnvk1CxSEZA65OVq0GryqdiKuOkhhc13v5T6Lo,2482
32
32
  sglang/srt/layers/context_flashattention_nopad.py,sha256=TVYQ6IjftWVXORmKpEROMqQxDOnF6n2g0G1Ci4LquYM,5209
33
33
  sglang/srt/layers/extend_attention.py,sha256=KGqQOA5mel9qScXMAQP_3Qyhp3BNbiQ7Y_6wi38Lxcs,12622
@@ -41,7 +41,7 @@ sglang/srt/managers/tokenizer_manager.py,sha256=hgsR9AMj6ic9S3-2WiELh7Hnp8Xnb_bz
41
41
  sglang/srt/managers/router/infer_batch.py,sha256=U-Ckt9ad1WaOQF_dW6Eo9AMIRQoOJQ-Pm-MMXnEmPP8,18399
42
42
  sglang/srt/managers/router/manager.py,sha256=TNYs0IrkZGkPvZJViwL7BMUg0VlvzeyTjDMjuvRoMDI,2529
43
43
  sglang/srt/managers/router/model_rpc.py,sha256=VlwLNpHZ92bnteQl4PhVKoAXM0C8Y4_2LBBVaffeu3g,26766
44
- sglang/srt/managers/router/model_runner.py,sha256=wbNyctWZURvFyPA2FxpLrZoT4g60W7RB1fXiTaMsyeE,17396
44
+ sglang/srt/managers/router/model_runner.py,sha256=-wWv00EbB_UkkLpio6VKGBTagfzxLHfY-eKDDQ0rZQc,18292
45
45
  sglang/srt/managers/router/radix_cache.py,sha256=XGUF5mxQTSCzD7GW_ltNP2p5aelEKrMXzdezufJ7NCQ,6484
46
46
  sglang/srt/managers/router/scheduler.py,sha256=V-LAnVSzgD2ddy2eXW3jWURCeq9Lv7YxCGk4kHyytfM,2818
47
47
  sglang/srt/models/gemma.py,sha256=8XlfHPtVixPYYjz5F9T4DOAuoordWFStmyFFWGfny1k,11582
@@ -51,13 +51,14 @@ sglang/srt/models/mistral.py,sha256=XSn7fiZqspyWVTYrpVAacAnWdwAybBtyn9-Sh9AvMTM,
51
51
  sglang/srt/models/mixtral.py,sha256=wqIwKfR90ih0gDiTZkFZcQD4PIYpZFD3CmzxRcuKIqw,13915
52
52
  sglang/srt/models/qwen.py,sha256=CvdbcF90aI1tJPSQ-3OMUaQGMuaxCGe0y29m5nU_Yj0,9225
53
53
  sglang/srt/models/qwen2.py,sha256=myPc0wvgf5ZzJyGhUGN49YjY-tMf4t8Jn_Imjg8D7Mk,11307
54
+ sglang/srt/models/stablelm.py,sha256=vMZUNgwXKPGYr5FcdYHw5g3QifVu9owKqq51_-EBOY0,10817
54
55
  sglang/srt/models/yivl.py,sha256=Qvp-zQ93cOZGg3zVyaiQLhRsfXiLrQhxu9TyQP2FMm4,4414
55
56
  sglang/test/test_conversation.py,sha256=1zIrXcXiwEliPHgDAsqsQUA7JKzZ5fnQEU-U6L887FU,1592
56
57
  sglang/test/test_openai_protocol.py,sha256=eePzoskYR3PqfWczSVZvg8ja63qbT8TFUNEMyzDZpa8,1657
57
58
  sglang/test/test_programs.py,sha256=mrLhGuprwvx8ZJ-0Qe28E-iCw5Qv-9T0SAv1Jgo1AJw,11421
58
- sglang/test/test_utils.py,sha256=DyZAic3KIBQ0PmZeLc9uv1ckcM5jpEE5CirjHO48_sk,4829
59
- sglang-0.1.13.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
60
- sglang-0.1.13.dist-info/METADATA,sha256=eRFaM9_QZeLWLXQb4NmrC8zJMQLFnE9zt49O_jJ7ybA,28800
61
- sglang-0.1.13.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
62
- sglang-0.1.13.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
63
- sglang-0.1.13.dist-info/RECORD,,
59
+ sglang/test/test_utils.py,sha256=6PhTRi8UnR-BRNjit6aGu0M5lO0RebNQwEcDt712hE4,4830
60
+ sglang-0.1.14.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
61
+ sglang-0.1.14.dist-info/METADATA,sha256=C5N0VOYRHixdJcsf4dExIvP-Q099kYBMKs_dA4LBXSM,28809
62
+ sglang-0.1.14.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
63
+ sglang-0.1.14.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
64
+ sglang-0.1.14.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.42.0)
2
+ Generator: bdist_wheel (0.43.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5