sglang 0.2.15__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,7 +4,7 @@ from typing import List, Optional
 
 from sglang.global_config import global_config
 from sglang.lang.backend.base_backend import BaseBackend
-from sglang.lang.chat_template import get_chat_template_by_model_path
+from sglang.lang.chat_template import get_chat_template, get_chat_template_by_model_path
 from sglang.lang.choices import ChoicesDecision, ChoicesSamplingMethod
 from sglang.lang.interpreter import StreamExecutor
 from sglang.lang.ir import (
@@ -23,6 +23,7 @@ class RuntimeEndpoint(BaseBackend):
         base_url: str,
         api_key: Optional[str] = None,
         verify: Optional[str] = None,
+        chat_template_name: Optional[str] = None,
     ):
         super().__init__()
         self.support_concate_and_append = True
@@ -39,9 +40,12 @@ class RuntimeEndpoint(BaseBackend):
         self._assert_success(res)
         self.model_info = res.json()
 
-        self.chat_template = get_chat_template_by_model_path(
-            self.model_info["model_path"]
-        )
+        if chat_template_name:
+            self.chat_template = get_chat_template(chat_template_name)
+        else:
+            self.chat_template = get_chat_template_by_model_path(
+                self.model_info["model_path"]
+            )
 
     def get_model_name(self):
         return self.model_info["model_path"]
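The `chat_template_name` argument added above lets a client pin a specific chat template instead of relying on detection from `model_path`. A minimal usage sketch (illustrative only; it assumes a server already running on port 30000 and that `"llama-3"` is a template name registered in `sglang.lang.chat_template`):

```python
import sglang as sgl
from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint

# Pin the chat template explicitly instead of inferring it from the
# server's model_path. "llama-3" is just an example template name.
backend = RuntimeEndpoint("http://localhost:30000", chat_template_name="llama-3")
sgl.set_default_backend(backend)
```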
@@ -92,7 +92,7 @@ def get_context_length(config):
     """Get the context length of a model from a huggingface model configs."""
     rope_scaling = getattr(config, "rope_scaling", None)
     if rope_scaling:
-        rope_scaling_factor = config.rope_scaling["factor"]
+        rope_scaling_factor = config.rope_scaling.get("factor", 1)
         if "original_max_position_embeddings" in rope_scaling:
             rope_scaling_factor = 1
         if config.rope_scaling.get("rope_type", None) == "llama3":
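With `.get("factor", 1)`, a `rope_scaling` block that omits `"factor"` now falls back to a scaling factor of 1 instead of raising `KeyError`. A small sketch of the defaulting behavior (the config values are hypothetical):

```python
# A rope_scaling block without an explicit "factor" key.
rope_scaling = {"rope_type": "llama3", "original_max_position_embeddings": 8192}

rope_scaling_factor = rope_scaling.get("factor", 1)  # old code: rope_scaling["factor"] -> KeyError
if "original_max_position_embeddings" in rope_scaling:
    rope_scaling_factor = 1
print(rope_scaling_factor)  # 1
```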
@@ -1,6 +1,6 @@
 import dataclasses
 import logging
-from typing import Union
+from typing import Tuple, Union
 
 import torch
 from flashinfer.sampling import (
@@ -9,6 +9,7 @@ from flashinfer.sampling import (
     top_k_top_p_sampling_from_probs,
     top_p_renorm_prob,
 )
+from torch.library import custom_op as torch_custom_op
 from vllm.model_executor.custom_op import CustomOp
 
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
@@ -30,6 +31,9 @@ class SampleOutput:
 class Sampler(CustomOp):
     def __init__(self):
         super().__init__()
+        # FIXME: torch.multinomial has too many bugs
+        self.forward_native = self.forward_cuda
+        self.is_torch_compile = False
 
     def _apply_penalties(self, logits: torch.Tensor, sampling_info: SamplingBatchInfo):
         # min-token, presence, frequency
@@ -46,16 +50,11 @@ class Sampler(CustomOp):
 
         return logits
 
-    def _get_probs(
-        self,
-        logits: torch.Tensor,
-        sampling_info: SamplingBatchInfo,
-        is_torch_compile: bool = False,
-    ):
+    def _get_probs(self, logits: torch.Tensor, sampling_info: SamplingBatchInfo):
         # Post process logits
         logits = logits.contiguous()
         logits.div_(sampling_info.temperatures)
-        if is_torch_compile:
+        if self.is_torch_compile:
             # FIXME: Temporary workaround for unknown bugs in torch.compile
             logits.add_(0)
 
@@ -91,7 +90,7 @@ class Sampler(CustomOp):
                     probs, uniform_samples, sampling_info.min_ps
                 )
             else:
-                batch_next_token_ids, success = top_k_top_p_sampling_from_probs(
+                batch_next_token_ids, success = flashinfer_top_k_top_p(
                     probs, uniform_samples, sampling_info.top_ks, sampling_info.top_ps
                 )
         else:
@@ -110,7 +109,7 @@ class Sampler(CustomOp):
         if isinstance(logits, LogitsProcessorOutput):
            logits = logits.next_token_logits
 
-        probs = self._get_probs(logits, sampling_info, is_torch_compile=True)
+        probs = self._get_probs(logits, sampling_info)
 
         batch_next_token_ids, success = top_k_top_p_min_p_sampling_from_probs_torch(
             probs, sampling_info.top_ks, sampling_info.top_ps, sampling_info.min_ps
@@ -119,6 +118,31 @@ class Sampler(CustomOp):
         return SampleOutput(success, probs, batch_next_token_ids)
 
 
+@torch_custom_op("my_lib::flashinfer_top_k_top_p", mutates_args={})
+def flashinfer_top_k_top_p(
+    probs: torch.Tensor,
+    uniform_samples: torch.Tensor,
+    top_ks: torch.Tensor,
+    top_ps: torch.Tensor,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    # NOTE: we do not use min_p neither in CUDA nor in torch.compile
+    return top_k_top_p_sampling_from_probs(probs, uniform_samples, top_ks, top_ps)
+
+
+@flashinfer_top_k_top_p.register_fake
+def _(
+    probs: torch.Tensor,
+    uniform_samples: torch.Tensor,
+    top_ks: torch.Tensor,
+    top_ps: torch.Tensor,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    bs = probs.shape[0]
+    return (
+        torch.ones(bs, dtype=torch.bool, device=probs.device),
+        torch.zeros(bs, dtype=torch.int32, device=probs.device),
+    )
+
+
 def top_k_top_p_min_p_sampling_from_probs_torch(
     probs: torch.Tensor,
     top_ks: torch.Tensor,
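Registering the FlashInfer kernel as a `torch.library` custom op with a fake (meta) implementation is what allows `torch.compile` to trace the sampler: the fake only reports output shapes, dtypes, and devices, so no CUDA kernel runs during tracing. A generic sketch of the same pattern (illustrative only; `demo::scale` is a made-up op name, and this assumes PyTorch >= 2.4, where `torch.library.custom_op` and `register_fake` are available):

```python
import torch
from torch.library import custom_op


@custom_op("demo::scale", mutates_args={})
def scale(x: torch.Tensor, factor: float) -> torch.Tensor:
    # Real implementation: would normally wrap an opaque kernel.
    return x * factor


@scale.register_fake
def _(x: torch.Tensor, factor: float) -> torch.Tensor:
    # Fake implementation: only describes output metadata, which is all
    # torch.compile needs to build its graph.
    return torch.empty_like(x)


compiled = torch.compile(lambda t: scale(t, 2.0))
print(compiled(torch.ones(4)))  # tensor([2., 2., 2., 2.])
```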
@@ -178,19 +178,22 @@ class Req:
     def adjust_max_prefix_ids(self):
         self.fill_ids = self.origin_input_ids + self.output_ids
         input_len = len(self.fill_ids)
-        max_prefix_len = input_len
+
+        # FIXME: To work around some bugs in logprob computation, we need to ensure each
+        # request has at least one token. Later, we can relax this requirement and use `input_len`.
+        max_prefix_len = input_len - 1
 
         if self.sampling_params.max_new_tokens > 0:
             # Need at least one token to compute logits
             max_prefix_len = min(max_prefix_len, input_len - 1)
 
         if self.return_logprob:
-            max_prefix_len = min(max_prefix_len, self.logprob_start_len)
-
             if self.normalized_prompt_logprob is None:
                 # Need at least two tokens to compute normalized logprob
                 max_prefix_len = min(max_prefix_len, input_len - 2)
+            max_prefix_len = min(max_prefix_len, self.logprob_start_len)
 
+        max_prefix_len = max(max_prefix_len, 0)
         return self.fill_ids[:max_prefix_len]
 
     # Based on https://github.com/vllm-project/vllm/blob/7a64d24aad69e4d2548aa0bf528d9fe63428ab01/vllm/transformers_utils/detokenizer.py#L194-L313
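Taken together, the clamps above keep at least one token out of the reusable prefix, respect `logprob_start_len` when logprobs are requested, and never let the prefix length go negative. A worked example with hypothetical numbers:

```python
# A request with 6 filled tokens, logprobs requested from position 3,
# and no normalized prompt logprob computed yet.
input_len = 6
logprob_start_len = 3

max_prefix_len = input_len - 1                           # 5: keep at least one token
max_prefix_len = min(max_prefix_len, input_len - 1)      # 5: max_new_tokens > 0
max_prefix_len = min(max_prefix_len, input_len - 2)      # 4: normalized logprob needs 2 tokens
max_prefix_len = min(max_prefix_len, logprob_start_len)  # 3: respect logprob_start_len
max_prefix_len = max(max_prefix_len, 0)                  # clamp for very short inputs
print(max_prefix_len)  # 3
```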
@@ -86,8 +86,8 @@ class TokenizerManager:
         self.recv_from_detokenizer = context.socket(zmq.PULL)
         self.recv_from_detokenizer.bind(f"tcp://127.0.0.1:{port_args.tokenizer_port}")
 
-        self.send_to_router = context.socket(zmq.PUSH)
-        self.send_to_router.connect(f"tcp://127.0.0.1:{port_args.controller_port}")
+        self.send_to_controller = context.socket(zmq.PUSH)
+        self.send_to_controller.connect(f"tcp://127.0.0.1:{port_args.controller_port}")
 
         # Read model args
         self.model_path = server_args.model_path
@@ -271,7 +271,7 @@ class TokenizerManager:
             input_ids,
             sampling_params,
         )
-        self.send_to_router.send_pyobj(tokenized_obj)
+        self.send_to_controller.send_pyobj(tokenized_obj)
 
         # Recv results
         event = asyncio.Event()
@@ -367,7 +367,7 @@ class TokenizerManager:
             input_ids,
             sampling_params,
         )
-        self.send_to_router.send_pyobj(tokenized_obj)
+        self.send_to_controller.send_pyobj(tokenized_obj)
 
         event = asyncio.Event()
         state = ReqState([], False, event)
@@ -500,14 +500,14 @@ class TokenizerManager:
 
     def flush_cache(self):
         req = FlushCacheReq()
-        self.send_to_router.send_pyobj(req)
+        self.send_to_controller.send_pyobj(req)
 
     def abort_request(self, rid: str):
         if rid not in self.rid_to_state:
             return
         del self.rid_to_state[rid]
         req = AbortReq(rid)
-        self.send_to_router.send_pyobj(req)
+        self.send_to_controller.send_pyobj(req)
 
     async def update_weights(
         self, obj: UpdateWeightReqInput, request: Optional[fastapi.Request] = None
@@ -524,7 +524,7 @@ class TokenizerManager:
             # wait for the previous generation requests to finish
             while len(self.rid_to_state) > 0:
                 await asyncio.sleep(0)
-            self.send_to_router.send_pyobj(obj)
+            self.send_to_controller.send_pyobj(obj)
             self.model_update_result = asyncio.Future()
             result = await self.model_update_result
             if result.success:
@@ -46,8 +46,10 @@ def _to_torch(model: torch.nn.Module, reverse: bool = False):
         if isinstance(sub, CustomOp):
             if reverse:
                 sub._forward_method = sub.forward_cuda
+                setattr(sub, "is_torch_compile", False)
             else:
                 sub._forward_method = sub.forward_native
+                setattr(sub, "is_torch_compile", True)
         if isinstance(sub, torch.nn.Module):
             _to_torch(sub, reverse)
 
@@ -162,6 +162,7 @@ class ModelRunner:
         return min_per_gpu_memory
 
     def load_model(self):
+        torch.set_num_threads(1)
         logger.info(
             f"Load weight begin. avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB"
         )
@@ -523,7 +524,7 @@ class ModelRunner:
         if (
             self.cuda_graph_runner
            and self.cuda_graph_runner.can_run(len(batch.reqs))
-            and not batch.sampling_info.has_bias()
+            and batch.sampling_info.can_run_in_cuda_graph()
         ):
             return self.cuda_graph_runner.replay(batch)
 
@@ -606,16 +607,6 @@ def import_model_classes():
                 assert entry.__name__ not in model_arch_name_to_cls
                 model_arch_name_to_cls[entry.__name__] = entry
 
-            # compat: some models such as chatglm has incorrect class set in config.json
-            # usage: [ tuple("From_Entry_Class_Name": EntryClass), ]
-            if hasattr(module, "EntryClassRemapping") and isinstance(
-                module.EntryClassRemapping, list
-            ):
-                for remap in module.EntryClassRemapping:
-                    if isinstance(remap, tuple) and len(remap) == 2:
-                        assert remap[0] not in model_arch_name_to_cls
-                        model_arch_name_to_cls[remap[0]] = remap[1]
-
     return model_arch_name_to_cls
 
 
@@ -402,6 +402,8 @@ class ChatGLMForCausalLM(nn.Module):
             weight_loader(param, loaded_weight)
 
 
-EntryClass = ChatGLMForCausalLM
-# compat: glm model.config class == ChatGLMModel
-EntryClassRemapping = [("ChatGLMModel", ChatGLMForCausalLM)]
+class ChatGLMModel(ChatGLMForCausalLM):
+    pass
+
+
+EntryClass = [ChatGLMForCausalLM, ChatGLMModel]
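With `EntryClassRemapping` removed (see the `import_model_classes` hunk above), architecture aliases are now expressed as empty subclasses, and `EntryClass` can be either a single class or a list. A sketch of how a registry can accept both forms (this illustrates the pattern only; the actual 0.3.0 `import_model_classes` body is not shown in this diff):

```python
def register_entry_classes(module, model_arch_name_to_cls):
    """Accept EntryClass as a single class or a list of classes."""
    entry = getattr(module, "EntryClass", None)
    if entry is None:
        return
    entries = entry if isinstance(entry, list) else [entry]
    for cls in entries:
        # An alias subclass such as ChatGLMModel registers under its own
        # __name__, the architecture string found in config.json.
        assert cls.__name__ not in model_arch_name_to_cls
        model_arch_name_to_cls[cls.__name__] = cls
```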
@@ -297,7 +297,6 @@ class ExaoneForCausalLM(nn.Module):
         config,
         quant_config: Optional[QuantizationConfig] = None,
         cache_config: Optional[CacheConfig] = None,
-        efficient_weight_load=False,
     ) -> None:
         super().__init__()
         self.config = config
@@ -324,30 +323,7 @@ class ExaoneForCausalLM(nn.Module):
         sample_output = self.sampler(logits_output, input_metadata.sampling_info)
         return sample_output, logits_output
 
-    def get_module_name(self, name):
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id, num_shard)
-            ("qkv_proj", "q_proj", "q", 3),
-            ("qkv_proj", "k_proj", "k", 3),
-            ("qkv_proj", "v_proj", "v", 3),
-            ("gate_up_proj", "c_fc_0", 0, 2),
-            ("gate_up_proj", "c_fc_1", 1, 2),
-        ]
-        for param_name, weight_name, shard_id, num_shard in stacked_params_mapping:
-            if weight_name in name:
-                return (
-                    name.replace(weight_name, param_name)[: -len(".weight")],
-                    num_shard,
-                )
-        return name[: -len(".weight")], 1
-
-    def get_num_params(self):
-        params_dict = dict(self.named_parameters())
-        return len(params_dict)
-
-    def load_weights(
-        self, weights: Iterable[Tuple[str, torch.Tensor]], name=None, loaded_weight=None
-    ):
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -358,16 +334,17 @@ class ExaoneForCausalLM(nn.Module):
         ]
         params_dict = dict(self.named_parameters())
 
-        def load_weights_per_param(name, loaded_weight):
+        for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name or "projector" in name:
-                return
+                continue
             if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
                 # Models trained using ColossalAI may include these tensors in
                 # the checkpoint. Skip them.
-                return
+                continue
             if name.startswith("model.vision_tower") and name not in params_dict:
-                return
+                continue
 
+            name = name.replace("attn.attention", "self_attn")
             for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
@@ -382,18 +359,10 @@ class ExaoneForCausalLM(nn.Module):
             else:
                 # Skip loading extra bias for GPTQ models.
                 if name.endswith(".bias") and name not in params_dict:
-                    return
+                    continue
                 param = params_dict[name]
                 weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
 
-        if name is None or loaded_weight is None:
-            for name, loaded_weight in weights:
-                name = name.replace("attn.attention", "self_attn")
-                load_weights_per_param(name, loaded_weight)
-        else:
-            name = name.replace("attn.attention", "self_attn")
-            load_weights_per_param(name, loaded_weight)
-
 
 EntryClass = ExaoneForCausalLM
@@ -295,7 +295,6 @@ class LlamaForCausalLM(nn.Module):
         config: LlamaConfig,
         quant_config: Optional[QuantizationConfig] = None,
         cache_config: Optional[CacheConfig] = None,
-        efficient_weight_load=False,
     ) -> None:
         super().__init__()
         self.config = config
@@ -305,6 +304,8 @@ class LlamaForCausalLM(nn.Module):
         self.logits_processor = LogitsProcessor(config)
         self.sampler = Sampler()
 
+        self.param_dict = dict(self.named_parameters())
+
     @torch.no_grad()
     def forward(
         self,
@@ -320,49 +321,26 @@ class LlamaForCausalLM(nn.Module):
         sample_output = self.sampler(logits_output, input_metadata.sampling_info)
         return sample_output, logits_output
 
-    def get_module_name(self, name):
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id, num_shard)
-            ("qkv_proj", "q_proj", "q", 3),
-            ("qkv_proj", "k_proj", "k", 3),
-            ("qkv_proj", "v_proj", "v", 3),
-            ("gate_up_proj", "gate_proj", 0, 2),
-            ("gate_up_proj", "up_proj", 1, 2),
-        ]
-        for param_name, weight_name, shard_id, num_shard in stacked_params_mapping:
-            if weight_name in name:
-                return (
-                    name.replace(weight_name, param_name)[: -len(".weight")],
-                    num_shard,
-                )
-        return name[: -len(".weight")], 1
-
-    def get_num_params(self):
-        params_dict = dict(self.named_parameters())
-        return len(params_dict)
-
-    def load_weights(
-        self, weights: Iterable[Tuple[str, torch.Tensor]], name=None, loaded_weight=None
-    ):
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
-            ("qkv_proj", "q_proj", "q"),
-            ("qkv_proj", "k_proj", "k"),
-            ("qkv_proj", "v_proj", "v"),
-            ("gate_up_proj", "gate_proj", 0),
-            ("gate_up_proj", "up_proj", 1),
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            (".gate_up_proj", ".gate_proj", 0),
+            (".gate_up_proj", ".up_proj", 1),
         ]
-        params_dict = dict(self.named_parameters())
+        params_dict = self.param_dict
 
-        def load_weights_per_param(name, loaded_weight):
+        for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name or "projector" in name:
-                return
+                continue
             if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
                 # Models trained using ColossalAI may include these tensors in
                 # the checkpoint. Skip them.
-                return
+                continue
             if name.startswith("model.vision_tower") and name not in params_dict:
-                return
+                continue
 
             for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
@@ -378,16 +356,14 @@ class LlamaForCausalLM(nn.Module):
             else:
                 # Skip loading extra bias for GPTQ models.
                 if name.endswith(".bias") and name not in params_dict:
-                    return
+                    continue
                 param = params_dict[name]
                 weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
 
-        if name is None or loaded_weight is None:
-            for name, loaded_weight in weights:
-                load_weights_per_param(name, loaded_weight)
-        else:
-            load_weights_per_param(name, loaded_weight)
+
+class Phi3ForCausalLM(LlamaForCausalLM):
+    pass
 
 
-EntryClass = LlamaForCausalLM
+EntryClass = [LlamaForCausalLM, Phi3ForCausalLM]
@@ -16,17 +16,16 @@ limitations under the License.
 from typing import Iterable, Optional, Tuple
 
 import torch
-import tqdm
 from torch import nn
 from transformers import LlamaConfig
 from vllm.config import CacheConfig
-from vllm.distributed import get_tensor_model_parallel_rank
 from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
+from sglang.srt.layers.sampler import SampleOutput
 from sglang.srt.model_executor.forward_batch_info import InputMetadata
-from sglang.srt.models.llama2 import LlamaModel
+from sglang.srt.models.llama import LlamaForCausalLM, LlamaModel
 
 
 class LlamaForClassification(nn.Module):
@@ -42,10 +41,12 @@ class LlamaForClassification(nn.Module):
         self.model = LlamaModel(config, quant_config=quant_config)
 
         self.classification_head = nn.Linear(
-            config.hidden_size, config.classification_out_size
+            config.hidden_size, config.classification_out_size, bias=False
         )
         self.eos_token_id = config.eos_token_id
 
+        self.param_dict = dict(self.named_parameters())
+
     @torch.no_grad()
     def forward(
         self,
@@ -65,7 +66,7 @@ class LlamaForClassification(nn.Module):
             (input_metadata.batch_size, self.config.classification_out_size)
         ).to(input_ids.device)
 
-        return LogitsProcessorOutput(
+        logits_output = LogitsProcessorOutput(
             next_token_logits=scores,
             next_token_logprobs=scores,
             normalized_prompt_logprobs=scores,
@@ -74,46 +75,38 @@ class LlamaForClassification(nn.Module):
             output_top_logprobs=None,
         )
 
+        # A dummy to make this work
+        sample_output = SampleOutput(
+            success=torch.full(
+                size=(scores.shape[0],),
+                fill_value=True,
+                dtype=torch.bool,
+            ),
+            probs=torch.full(
+                size=(scores.shape[0], 1),
+                fill_value=1.0,
+                dtype=torch.float16,
+            ),
+            batch_next_token_ids=torch.full(
+                size=(scores.shape[0],),
+                fill_value=0,
+                dtype=torch.long,
+            ),
+        )
+        return sample_output, logits_output
+
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id)
-            ("qkv_proj", "q_proj", "q"),
-            ("qkv_proj", "k_proj", "k"),
-            ("qkv_proj", "v_proj", "v"),
-            ("gate_up_proj", "gate_proj", 0),
-            ("gate_up_proj", "up_proj", 1),
-        ]
-        params_dict = dict(self.named_parameters())
-        if get_tensor_model_parallel_rank() == 0:
-            weights = tqdm.tqdm(weights, total=int(len(params_dict) * 1.5))
-        for name, loaded_weight in weights:
-            if "rotary_emb.inv_freq" in name or "projector" in name:
-                continue
-            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
-                # Models trained using ColossalAI may include these tensors in
-                # the checkpoint. Skip them.
-                continue
-            if "lm_head" in name:
-                continue
+        params_dict = self.param_dict
 
-            for param_name, weight_name, shard_id in stacked_params_mapping:
-                if weight_name not in name:
-                    continue
-                name = name.replace(weight_name, param_name)
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                param = params_dict[name]
-                weight_loader = param.weight_loader
-                weight_loader(param, loaded_weight, shard_id)
-                break
-            else:
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
+        for name, loaded_weight in weights:
+            if "classification_head" in name:
                 param = params_dict[name]
                 weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
+            elif "lm_head" in name:
+                continue
+            else:
+                LlamaForCausalLM.load_weights(self, [(name, loaded_weight)])
 
 
 EntryClass = LlamaForClassification
@@ -1,4 +1,4 @@
-from typing import Iterable, Optional, Tuple
+from typing import Iterable, Tuple
 
 import torch
 from torch import nn
@@ -7,7 +7,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.pooler import EmbeddingPoolerOutput, Pooler, PoolingType
 from sglang.srt.model_executor.model_runner import InputMetadata
-from sglang.srt.models.llama2 import LlamaForCausalLM, LlamaModel
+from sglang.srt.models.llama import LlamaModel
 
 
 class LlamaEmbeddingModel(nn.Module):
@@ -16,7 +16,6 @@ class LlamaEmbeddingModel(nn.Module):
         config: LlamaConfig,
         quant_config=None,
         cache_config=None,
-        efficient_weight_load=False,
     ) -> None:
         super().__init__()
         self.model = LlamaModel(config, quant_config=quant_config)
@@ -86,6 +85,8 @@ class LlamaEmbeddingModel(nn.Module):
             load_weights_per_param(name, loaded_weight)
 
 
-EntryClass = LlamaEmbeddingModel
-# compat: e5-mistral model.config class == MistralModel
-EntryClassRemapping = [("MistralModel", LlamaEmbeddingModel)]
+class MistralModel(LlamaEmbeddingModel):
+    pass
+
+
+EntryClass = [LlamaEmbeddingModel, MistralModel]
@@ -41,7 +41,7 @@ from sglang.srt.mm_utils import (
     unpad_image_shape,
 )
 from sglang.srt.model_executor.forward_batch_info import ForwardMode, InputMetadata
-from sglang.srt.models.llama2 import LlamaForCausalLM
+from sglang.srt.models.llama import LlamaForCausalLM
 from sglang.srt.models.mistral import MistralForCausalLM
 from sglang.srt.models.qwen2 import Qwen2ForCausalLM
 
@@ -395,21 +395,19 @@ class LlavaBaseForCausalLM(nn.Module):
             "model.mm_projector.0": "multi_modal_projector.linear_1",
             "model.mm_projector.2": "multi_modal_projector.linear_2",
             "model.vision_tower.vision_tower": "vision_tower",  # Update the vision tower weights if we find them in the checkpoint (it may be finetuned).
+            "model.image_newline": "language_model.model.image_newline",
         }
         params_dict = dict(self.named_parameters())
-        weights = list(weights)
         for name, loaded_weight in weights:
-            # FIXME: why projector weights read two times?
-            if "projector" in name or "vision_tower" in name:
+            if "projector" in name or "vision_tower" in name or "image_newline" in name:
                 for weight_name, param_name in projector_weights.items():
                     if weight_name in name:
                         name = name.replace(weight_name, param_name)
                 param = params_dict[name]
                 weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
-
-        # load language model
-        self.language_model.load_weights(weights)
+            else:
+                self.language_model.load_weights([(name, loaded_weight)])
 
     @property
     def num_patches_per_side(self):
@@ -429,6 +427,7 @@ class LlavaLlamaForCausalLM(LlavaBaseForCausalLM):
         self.vision_tower = None
         self.config.vision_config.hidden_size = config.mm_hidden_size
         self.config.text_config.hidden_size = config.hidden_size
+
         self.multi_modal_projector = LlavaMultiModalProjector(config)
         self.language_model = LlamaForCausalLM(config, quant_config=quant_config)
         if "unpad" in getattr(config, "mm_patch_merge_type", ""):
@@ -448,9 +447,9 @@ class LlavaQwenForCausalLM(LlavaBaseForCausalLM):
 
         self.config = config
         self.vision_tower = None
+
         if getattr(self.config, "vision_config", None) is None:
             self.config.vision_config = CLIPVisionConfig(self.config.mm_vision_tower)
-
         if getattr(self.config, "text_config", None) is None:
             self.config.text_config = Qwen2Config(self.config._name_or_path)
 
@@ -459,7 +458,6 @@ class LlavaQwenForCausalLM(LlavaBaseForCausalLM):
 
         if getattr(self.config, "projector_hidden_act", None) is None:
             self.config.projector_hidden_act = "gelu"
-
         if getattr(self.config, "image_token_index", None) is None:
             self.config.image_token_index = 151646
 
@@ -482,9 +480,9 @@ class LlavaMistralForCausalLM(LlavaBaseForCausalLM):
 
         self.config = config
         self.vision_tower = None
+
         if getattr(self.config, "vision_config", None) is None:
             self.config.vision_config = CLIPVisionConfig(self.config.mm_vision_tower)
-
         if getattr(self.config, "text_config", None) is None:
             self.config.text_config = MistralConfig(self.config._name_or_path)
 
@@ -493,7 +491,6 @@ class LlavaMistralForCausalLM(LlavaBaseForCausalLM):
 
         if getattr(self.config, "projector_hidden_act", None) is None:
             self.config.projector_hidden_act = "gelu"
-
         if getattr(self.config, "image_token_index", None) is None:
             self.config.image_token_index = 32000
 
@@ -27,7 +27,7 @@ from vllm.model_executor.layers.quantization.base_config import QuantizationConf
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.model_executor.forward_batch_info import ForwardMode, InputMetadata
-from sglang.srt.models.llama2 import LlamaForCausalLM
+from sglang.srt.models.llama import LlamaForCausalLM
 
 
 class LlavaVidForCausalLM(nn.Module):
@@ -239,12 +239,12 @@ class LlavaVidForCausalLM(nn.Module):
             "model.vision_resampler.mm_projector.0": "multi_modal_projector.linear_1",
             "model.vision_resampler.mm_projector.2": "multi_modal_projector.linear_2",
             "model.vision_tower.vision_tower": "vision_tower",  # Update the vision tower weights if we find them in the checkpoint (it may be finetuned).
+            "model.image_newline": "language_model.model.image_newline",
         }
         params_dict = dict(self.named_parameters())
-        weights = list(weights)
         for name, loaded_weight in weights:
             # FIXME: why projector weights read two times?
-            if "projector" in name or "vision_tower" in name:
+            if "projector" in name or "vision_tower" in name or "image_newline" in name:
                 for weight_name, param_name in projector_weights.items():
                     if weight_name in name:
                         name = name.replace(weight_name, param_name)
@@ -255,9 +255,8 @@ class LlavaVidForCausalLM(nn.Module):
                     continue
                 weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
-
-        # load language model
-        self.language_model.load_weights(weights)
+            else:
+                self.language_model.load_weights([(name, loaded_weight)])
 
     @property
     def num_patches_per_side(self):
@@ -15,12 +15,11 @@ limitations under the License.
 
 """Inference-only Mistral model."""
 
-from sglang.srt.models.llama2 import LlamaForCausalLM
+from sglang.srt.models.llama import LlamaForCausalLM
 
 
 class MistralForCausalLM(LlamaForCausalLM):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
+    pass
 
 
 EntryClass = MistralForCausalLM
@@ -34,12 +34,14 @@ class SamplingBatchInfo:
     linear_penalties: torch.Tensor = None
     scaling_penalties: torch.Tensor = None
 
-    def has_bias(self):
+    def can_run_in_cuda_graph(self):
+        # Vocab bias and min_ps are not supported in CUDA graph
         return (
-            self.logit_bias is not None
-            or self.vocab_mask is not None
-            or self.linear_penalties is not None
-            or self.scaling_penalties is not None
+            self.logit_bias is None
+            and self.vocab_mask is None
+            and self.linear_penalties is None
+            and self.scaling_penalties is None
+            and not self.need_min_p_sampling
         )
 
     @classmethod
@@ -48,35 +50,29 @@ class SamplingBatchInfo:
         ret.temperatures = torch.ones((max_bs, 1), dtype=torch.float, device="cuda")
         ret.top_ps = torch.ones((max_bs,), dtype=torch.float, device="cuda")
         ret.top_ks = torch.ones((max_bs,), dtype=torch.int, device="cuda")
-        ret.min_ps = torch.zeros((max_bs,), dtype=torch.float, device="cuda")
         return ret
 
     def __getitem__(self, key):
         if isinstance(key, slice):
-            # NOTE: We do not use cuda graph when there is bias tensors
-            assert not self.has_bias()
+            # NOTE:This method is only used in CUDA graph
+            assert self.can_run_in_cuda_graph()
             return SamplingBatchInfo(
                 vocab_size=self.vocab_size,
                 temperatures=self.temperatures[key],
                 top_ps=self.top_ps[key],
                 top_ks=self.top_ks[key],
-                min_ps=self.min_ps[key],
-                need_min_p_sampling=self.need_min_p_sampling,
             )
         else:
             raise NotImplementedError
 
     def inplace_assign(self, bs: int, other: SamplingBatchInfo):
-        # NOTE: We do not use cuda graph when there is bias tensors
-        assert not self.has_bias()
+        # NOTE:This method is only used in CUDA graph
+        assert self.can_run_in_cuda_graph()
 
         self.vocab_size = other.vocab_size
-        self.need_min_p_sampling = other.need_min_p_sampling
-
         self.temperatures[:bs] = other.temperatures
         self.top_ps[:bs] = other.top_ps
         self.top_ks[:bs] = other.top_ks
-        self.min_ps[:bs] = other.min_ps
 
     @classmethod
     def from_schedule_batch(cls, batch: ScheduleBatch, vocab_size: int):
@@ -2,8 +2,12 @@
 
 import json
 import re
+import time
+
+import numpy as np
 
 import sglang as sgl
+from sglang.utils import fetch_and_cache_jsonl
 
 
 def test_few_shot_qa():
@@ -447,3 +451,67 @@ def test_chat_completion_speculative():
     )
 
     gen_character_spec().sync()
+
+
+def test_hellaswag_select():
+    """Benchmark the accuracy of sgl.select on the HellaSwag dataset."""
+
+    url = "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_val.jsonl"
+    lines = fetch_and_cache_jsonl(url)
+
+    # Construct prompts
+    def get_one_example(lines, i, include_answer):
+        ret = lines[i]["activity_label"] + ": " + lines[i]["ctx"] + " "
+        if include_answer:
+            ret += lines[i]["endings"][lines[i]["label"]]
+        return ret
+
+    def get_few_shot_examples(lines, k):
+        ret = ""
+        for i in range(k):
+            ret += get_one_example(lines, i, True) + "\n\n"
+        return ret
+
+    num_questions = 200
+    num_shots = 20
+    few_shot_examples = get_few_shot_examples(lines, num_shots)
+
+    questions = []
+    choices = []
+    labels = []
+    for i in range(len(lines[:num_questions])):
+        questions.append(get_one_example(lines, i, False))
+        choices.append(lines[i]["endings"])
+        labels.append(lines[i]["label"])
+    arguments = [{"question": q, "choices": c} for q, c in zip(questions, choices)]
+
+    #####################################
+    ######### SGL Program Begin #########
+    #####################################
+
+    import sglang as sgl
+
+    @sgl.function
+    def few_shot_hellaswag(s, question, choices):
+        s += few_shot_examples + question
+        s += sgl.select("answer", choices=choices)
+
+    #####################################
+    ########## SGL Program End ##########
+    #####################################
+
+    # Run requests
+    tic = time.time()
+    rets = few_shot_hellaswag.run_batch(
+        arguments,
+        temperature=0,
+        num_threads=64,
+        progress_bar=True,
+    )
+    preds = [choices[i].index(rets[i]["answer"]) for i in range(len(rets))]
+    latency = time.time() - tic
+
+    # Compute accuracy
+    accuracy = np.mean(np.array(preds) == np.array(labels))
+
+    return accuracy, latency
sglang/test/test_utils.py CHANGED
@@ -23,6 +23,10 @@ from sglang.utils import get_exception_traceback
 DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
 DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
 DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600
+DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Meta-Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
+DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Meta-Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct"
+DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8"
+DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8"
 
 if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
     DEFAULT_PORT_FOR_SRT_TEST_RUNNER = 5157
sglang/utils.py CHANGED
@@ -4,6 +4,7 @@ import base64
 import importlib
 import json
 import logging
+import os
 import signal
 import sys
 import traceback
@@ -15,6 +16,7 @@ from typing import Union
 
 import numpy as np
 import requests
+from tqdm import tqdm
 
 logger = logging.getLogger(__name__)
 
@@ -260,3 +262,40 @@ class LazyImport:
     def __call__(self, *args, **kwargs):
         module = self._load()
         return module(*args, **kwargs)
+
+
+def fetch_and_cache_jsonl(url, cache_file="cached_data.jsonl"):
+    """Read and cache a jsonl file from a url."""
+
+    # Check if the cache file already exists
+    if os.path.exists(cache_file):
+        print("Loading data from cache...")
+        with open(cache_file, "r") as f:
+            data = [json.loads(line) for line in f]
+    else:
+        print("Downloading data from URL...")
+        # Stream the response to show the progress bar
+        response = requests.get(url, stream=True)
+        response.raise_for_status()  # Check for request errors
+
+        # Total size of the file in bytes
+        total_size = int(response.headers.get("content-length", 0))
+        chunk_size = 1024  # Download in chunks of 1KB
+
+        # Use tqdm to display the progress bar
+        with open(cache_file, "wb") as f, tqdm(
+            desc=cache_file,
+            total=total_size,
+            unit="B",
+            unit_scale=True,
+            unit_divisor=1024,
+        ) as bar:
+            for chunk in response.iter_content(chunk_size=chunk_size):
+                f.write(chunk)
+                bar.update(len(chunk))
+
+        # Convert the data to a list of dictionaries
+        with open(cache_file, "r") as f:
+            data = [json.loads(line) for line in f]
+
+    return data
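Typical usage mirrors the new HellaSwag test above; a short sketch:

```python
from sglang.utils import fetch_and_cache_jsonl

url = "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_val.jsonl"
# The first call downloads with a tqdm progress bar and writes cached_data.jsonl;
# subsequent calls read the local cache instead.
lines = fetch_and_cache_jsonl(url)
print(len(lines))
```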
sglang/version.py CHANGED
@@ -1 +1 @@
-__version__ = "0.2.15"
+__version__ = "0.3.0"
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.2.15
+Version: 0.3.0
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -312,7 +312,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.2.15 https://github.com/sgl-project/sglang.git
+git clone -b v0.3.0 https://github.com/sgl-project/sglang.git
 cd sglang
 
 pip install --upgrade pip
@@ -461,7 +461,7 @@ It supports streaming, vision, and most features of the Chat/Completions/Models/
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --tp 2
 ```
-- Add `--dp 2` to enable multi-GPU data parallelism. It can also be used together with tensor parallelism. Data parallelism is better for throughput if there is enough memory.
+- Add `--dp 2` to enable multi-GPU data parallelism. Data parallelism is better for throughput if there is enough memory. It can also be used together with tensor parallelism. The following command uses 4 GPUs in total.
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --dp 2 --tp 2
 ```
@@ -495,6 +495,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - Qwen / Qwen 2 / Qwen 2 MoE
 - DeepSeek / DeepSeek 2
 - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
+  - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-7b-ov --port=30000 --chat-template=chatml-llava`
   - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava`
   - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
 - LLaVA 1.5 / 1.6 / NeXT
@@ -6,8 +6,8 @@ sglang/check_env.py,sha256=rGRABCgt-0SfUrow4px28b2P59aMn8eVTnN5eZc_a8s,5397
 sglang/global_config.py,sha256=nwOjUflwqLQySPUMvk8Hk63TIS6mknh_ODSW3CZ1rJw,1704
 sglang/launch_server.py,sha256=FODfO0DW546dh-u1qDlWtrhsmj6hxkarXXv3cIdgkj8,549
 sglang/launch_server_llavavid.py,sha256=xnpSILJxsrbvqkERav5P26bErCQnhoTFmoKeScJltUA,1034
-sglang/utils.py,sha256=zFYGkC4vOUR3sTv1TmQXcsOLZDtDBR3wnjqnDp3xMIs,8352
-sglang/version.py,sha256=ogr0x4sazo5ruMrKOQDYO_YrTwtaXZTE8fKnwCajH7I,23
+sglang/utils.py,sha256=zxHwQhVxW_lWf-IH0wUw_pBTRLHLPypdRiU5M4XosMM,9669
+sglang/version.py,sha256=VrXpHDu3erkzwl_WXrqINBm9xWkcyUy53IQOj042dOs,22
 sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sglang/lang/chat_template.py,sha256=uqI_I9zIKXGXg7-W-yjqvx1ZeS_TuwFCms6wkmC2QmY,13411
 sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
@@ -20,10 +20,10 @@ sglang/lang/backend/anthropic.py,sha256=EXRX7xJgA5KZszX7toSLVnKzFQ5EO0Loj-YjHFtx
 sglang/lang/backend/base_backend.py,sha256=Q5HdiDtyBewQeoYH0kDtBRVL8KFiEPNq9dw7XmauHQ8,1985
 sglang/lang/backend/litellm.py,sha256=ugmL7sfUxkUHVbHtwNzHgdQAEd4UCjNQboFuE3KThcY,2450
 sglang/lang/backend/openai.py,sha256=qM7eVH_kMxnDd2rpxOH0v76KxtOJFlAwgLgWIKvFGCI,15060
-sglang/lang/backend/runtime_endpoint.py,sha256=SDlp03EuQEK1eGK4_IaFySWgxlp4wCs3EPewZ6O640E,9549
+sglang/lang/backend/runtime_endpoint.py,sha256=hpezro0H6vG9KzLeKfYpPMwb4TaE0UanCIM0uG8Kdjw,9746
 sglang/lang/backend/vertexai.py,sha256=O-iBLD-y3vq80UxnrAoJri7bxpgd-_eakZ88Cf8bEGA,4855
 sglang/srt/conversation.py,sha256=2KDNe1suUPy6xqSkCx2xcO3pDPxTwqx5FaUxaqwCJ-M,19525
-sglang/srt/hf_transformers_utils.py,sha256=kNGJ5OfAth7dZrWfhpKpt7s2LQWvLH2d-v0GtcEs3R0,6078
+sglang/srt/hf_transformers_utils.py,sha256=5UXJ-LdP92Sk_T843M9BHdnxRrcyiYfWH2IEg3dWgKI,6085
 sglang/srt/mm_utils.py,sha256=zox644S3IHUWmADdK4MnIbdTS2DWHOy0_Dq0gCU38QQ,12273
 sglang/srt/model_config.py,sha256=68QQ8iUWQHPv01RBeH23mvay6iJg9DWmCogC_vUgFLk,6371
 sglang/srt/server.py,sha256=yi8prs9_M0P0dOInrQLkHKiZ-oTigk_uzW8otEHImbU,19846
@@ -43,7 +43,7 @@ sglang/srt/layers/logits_processor.py,sha256=Zx4eFAkFlThPrmz_-HuCN9SqGLanARm0wdZ
 sglang/srt/layers/pooler.py,sha256=qNMG3Ycvt2yf9mk1Lcs-2K7oPeCuVeDYoHAxkMu9b_Q,1610
 sglang/srt/layers/prefill_attention.py,sha256=y7vdcuX8lMa9Qf_jQYNDvQO9PVCBQSs3hb5LV2DFgpU,5256
 sglang/srt/layers/radix_attention.py,sha256=o5a8r3XQ-oRwaxBlAgzJGv7p3dMbu0LrYsDc4uvpPgA,8338
-sglang/srt/layers/sampler.py,sha256=YEDZrwzshX-fZZ5tkW57yBBIJRu2SPAUZzXhhrpQs4Q,5543
+sglang/srt/layers/sampler.py,sha256=zPVa3PHc-tjDM_oP-1XFeHSRIErx844SLoe6MG8Qef0,6418
 sglang/srt/layers/fused_moe/__init__.py,sha256=bWCrDdOy2ANEXTb8CHYO63O3Iu3eZnn0PJbgl0z5vvE,75
 sglang/srt/layers/fused_moe/fused_moe.py,sha256=1WM2cObWXcFWtqh_utGJFPnrT344rORwuQ9hJDaH2s0,23104
 sglang/srt/layers/fused_moe/layer.py,sha256=GT3r2UPx_PAufJd0SUMOXyh76ymAeYDubd0SM0H71bo,20977
@@ -52,35 +52,35 @@ sglang/srt/managers/controller_single.py,sha256=5brrZ8vZxjvrSJHWrm5H3qGEZShN4ERO
 sglang/srt/managers/detokenizer_manager.py,sha256=yQkL5gLomLiy1qc6e9HNz8hcj7JQFHm1AfIrzpXaWJE,6852
 sglang/srt/managers/io_struct.py,sha256=Bd91cydX9_960NNP2xngqK-lsIaDB3oMYd56QddN4_Q,10722
 sglang/srt/managers/policy_scheduler.py,sha256=7HNUxBKJE444s_bHcPpbnHCygsnH-NIXYNSC2q6mRmc,8584
-sglang/srt/managers/schedule_batch.py,sha256=D3NBNi_6_KEMfBTn_8XPrtCbXHjnUki0sOVhQ7kgqqM,26182
-sglang/srt/managers/tokenizer_manager.py,sha256=ung-uQrvtPn-vzpQMjpYW_jKWDJR_B8NL88WW3OWyy0,29435
+sglang/srt/managers/schedule_batch.py,sha256=i68O-e9I_gDlme96xSBDjA2xDF1p-XBKvJRiJ9CsgcY,26423
+sglang/srt/managers/tokenizer_manager.py,sha256=8aHR5h9nYZsfdZE80uBc9egDFOQgKvjxmp-30Ha4ELk,29463
 sglang/srt/managers/tp_worker.py,sha256=4UuaBLzV6NMsG4XEIcpa4xMcOKIFvTan51ynKz85HXg,36842
 sglang/srt/mem_cache/base_prefix_cache.py,sha256=qEQwEkG4E5rab2ZoTqcesf5pR_J4nV2jBxIHsBJHtIM,924
 sglang/srt/mem_cache/chunk_cache.py,sha256=CjZZYlqQzq7mYOiBMLWA5XNb6HIyh5lIMdY-K0OUZEc,2368
 sglang/srt/mem_cache/flush_cache.py,sha256=pTLKPRB17U6vl5RFJJvuJ4jCL2SyomgkUBNlkDpGRqo,978
 sglang/srt/mem_cache/memory_pool.py,sha256=4br3Ea2bfA-YsF_sPOVHlF2zQzYGd8fVaYTp197yZsE,7871
 sglang/srt/mem_cache/radix_cache.py,sha256=0AVr1BKKDOtTyybUkwxrz6PT8khDx-DpzgN5MgL27IE,10088
-sglang/srt/model_executor/cuda_graph_runner.py,sha256=qyKjW9TjSjZ-NZI3aspJwnmuKSKT6DX1MMTFwqJtNE8,12751
+sglang/srt/model_executor/cuda_graph_runner.py,sha256=4vIUqVQpnHNhwWrokMVmGM4Dp5JFPHyXIvpEQsi2pNU,12862
 sglang/srt/model_executor/forward_batch_info.py,sha256=fSLhatN8vCgxn0Mft9D-r0pNi3SN0EQSTJmgaOtrqJc,16471
-sglang/srt/model_executor/model_runner.py,sha256=9ard4FLjb_rz0EUS3KMrlDkos0zNGh5TQ6wlHSIsev4,24408
-sglang/srt/models/chatglm.py,sha256=BzLtDK_CsD1Pmn-sHnJuLulJCUuSbNm1q1fqCShRdQ8,13628
+sglang/srt/model_executor/model_runner.py,sha256=93YCStmZfdZlY0r-GGIVi0Xw66VwF77dEtGVmQf1VfU,23893
+sglang/srt/models/chatglm.py,sha256=PPOaeqipbkcsTUhMPbLb1HItWgW7KntefUfjEoMSxUM,13585
 sglang/srt/models/commandr.py,sha256=k86ykwWOlxLGaBbGUoMSaXngUxCbMVRbY5AoMOWpbU8,14377
 sglang/srt/models/dbrx.py,sha256=goLJ9Yt-9vxkwhCUFBidvP41H_dYTFsvrMZ4xm4FqGA,14875
 sglang/srt/models/deepseek.py,sha256=aYP6HUgxQbhcQGQEF4vX0ronBF8AirqIFG98EQn0YzY,16220
 sglang/srt/models/deepseek_v2.py,sha256=Htw_HDju9huYU5gBu2dqq6bKVao-AsifxfkGl2xRx-8,28521
-sglang/srt/models/exaone.py,sha256=58JELgg-dZl6CUNd2PEWR0ok9u4osOuE5QKSfX6MzhE,14480
+sglang/srt/models/exaone.py,sha256=ZFr0G0WITxg3dDfV_-vWqZpK_wMmiZi4r0vOT0gO9V4,13301
 sglang/srt/models/gemma.py,sha256=Ya_u2lKPKAc9iHEsW_HAEfCDgYTbxUOCzBI0LDuoOYs,12489
 sglang/srt/models/gemma2.py,sha256=MCmzzRAAafEQuQj6aGtB-TF4jH0RWrXcOPxSz6LRsXs,15137
 sglang/srt/models/gpt_bigcode.py,sha256=HEhMRO1Y37JfZtP7mDp0MexWj5h6XT9rKvxorOMKoQA,10409
 sglang/srt/models/grok.py,sha256=ZcJ4E11rKh-xo4k_j-H1XRreJWWv8yii-bMYC1lO2R8,15143
 sglang/srt/models/internlm2.py,sha256=VtWATs2eLIqbadYXTPY_vycFIstVk4zg3kxycA9H0Qw,12416
-sglang/srt/models/llama2.py,sha256=NriIElOdhhsiJFmNPc4bDXjxU_FgqfqdtoagSuIcnnc,14394
-sglang/srt/models/llama_classification.py,sha256=ClNlaLi3Z0ME1ETOwGxl8DtJy8VJu8kobVRFX9jKJqM,4704
-sglang/srt/models/llama_embedding.py,sha256=Z3FWGNEWrperMxnVqOhxv6vApNpChh-AaahlEqeYOrk,3574
-sglang/srt/models/llava.py,sha256=ypq0hWprqN73P-VuYfSAZ1_Otm48qDqEPA2YO583goM,23453
-sglang/srt/models/llavavid.py,sha256=Dx_wED6stC8lTASUrGt6B3c8wQ9lVrX-76-dNyyuVVg,11934
+sglang/srt/models/llama.py,sha256=MfDnlVWoJUG9DxgGYPiwhoU-0ZeRbhp6UmBR2ZAJSNk,13402
+sglang/srt/models/llama_classification.py,sha256=oSeROs633Gnak8vrbnWnCWDxfgP_zmKGO1A_43ukEQ4,4029
+sglang/srt/models/llama_embedding.py,sha256=RI2mpYheP5WwhuTINU-6IrU61usuMyCK9h2zDEyLW4g,3458
+sglang/srt/models/llava.py,sha256=OXmlOVIjFnMRKGwLweYB1N-xlfpZlTlZpqhsbwUCY6Y,23471
+sglang/srt/models/llavavid.py,sha256=4R2t8BZJKN85IrTLsLFb4yZuKVI2Cwp7kY8AJ-nEVoE,12012
 sglang/srt/models/minicpm.py,sha256=7RZEJ2TCqBL1JmMFVJ3J9DmZHRw0q90st49Wkh-sdL4,14039
-sglang/srt/models/mistral.py,sha256=jlrWBVNXbAUziAaIdHAjFcOJnKtn9Bl8rBd65ypJM-I,819
+sglang/srt/models/mistral.py,sha256=tiYoKjyYVzlQl52QUZ33odD2yCxj9dxcqln474VuZOw,744
 sglang/srt/models/mixtral.py,sha256=KIsvruhXNq3Fwrs4_YE7J6fx54ObfnMuRNxgScE3Bmo,13830
 sglang/srt/models/mixtral_quant.py,sha256=O_97UKDYZokFhIBnamWfw0HLhln9_BUk_KfQ-sQnd8s,14286
 sglang/srt/models/qwen.py,sha256=geK88AyEyPbbDvMHJNY8XMSNpsCeu8g9kxnKyiJBpK4,10168
@@ -90,7 +90,7 @@ sglang/srt/models/stablelm.py,sha256=9feHoiDEXSIe0WCrt4AfWXqxliJwRvr8w4XSnk6ipSI
 sglang/srt/models/yivl.py,sha256=B6MELthWIm5KdSzX3o2tbbpApY8XdjUdmcQSD4dQe_I,4835
 sglang/srt/openai_api/adapter.py,sha256=3EeqASZXogpUkOP4xj7Rg_LfOLiIMUrZ9uFdeAy_pcc,50144
 sglang/srt/openai_api/protocol.py,sha256=onhnCjXpXCysvx_dLgOEmXz5XHHYB1t772cvHcK1GlY,9538
-sglang/srt/sampling/sampling_batch_info.py,sha256=WO7fgURK7XqXU3jORWpkz7Tyx3FC34r--hPMKvkt4Iw,7735
+sglang/srt/sampling/sampling_batch_info.py,sha256=CIoD0SzHSWCe7Wc4jkJj5vIPHGnOdfbgkC6fG5KQxOw,7551
 sglang/srt/sampling/sampling_params.py,sha256=ggOXxafqfCD-xrGYcM57byLZ79CIeBP4AD5F44L_CW0,5635
 sglang/srt/sampling/penaltylib/__init__.py,sha256=5vQw0Y5DSzmsoFg1IdMIKLwFVhYZ5ArADHVBYbSmOec,513
 sglang/srt/sampling/penaltylib/orchestrator.py,sha256=WkTNeDhj9H9rtp2ZZeX6MS2sdKSGlLboE6FcuKrwUo0,10815
@@ -108,11 +108,11 @@ sglang/test/simple_eval_mgsm.py,sha256=wfbqJW9Rkc66vzq2fEMF6jchmoA8mw1OUiGU55cZ2
 sglang/test/simple_eval_mmlu.py,sha256=FkwamjGMjueTixymkedF-YiPloSLiy4ftILFUrKZ9XI,4357
 sglang/test/test_activation.py,sha256=jkdNRzJnbd5OgZliQaIXpxovlcky17UrweomcOcMxoE,1442
 sglang/test/test_layernorm.py,sha256=IacByD5d-stXjzBz8Ypamc7povlcedpKPbb_4JLgo3c,3720
-sglang/test/test_programs.py,sha256=V_-Bx3lLkw37P6gDyA7mZCqxlyNMaFLBkRrPMQQQqn4,14909
-sglang/test/test_utils.py,sha256=HD-9rcj7EFS_NX1GQFU5613ITQlZaTK2l9RmqA0F7x4,14380
+sglang/test/test_programs.py,sha256=l21J8N91QTMO9TOvXPWNvPZVT0DgxYxOPHh1pOoFV_k,16927
+sglang/test/test_utils.py,sha256=3tt-BBv-lx7BT3whbVTMyRz6sh5jIbdBEbLZ08m2Ms8,15132
 sglang/test/srt/sampling/penaltylib/utils.py,sha256=-0p0rV-P4lNo7xAe3rQSBHTubc50a-DFyOQmLGAkgkQ,12515
-sglang-0.2.15.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-sglang-0.2.15.dist-info/METADATA,sha256=bmiMZPX1vW_NYDBk92pG1u9_PZRcXanJ2KXtxBmaiF4,37211
-sglang-0.2.15.dist-info/WHEEL,sha256=UvcQYKBHoFqaQd6LKyqHw9fxEolWLQnlzP0h_LgJAfI,91
-sglang-0.2.15.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
-sglang-0.2.15.dist-info/RECORD,,
+sglang-0.3.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+sglang-0.3.0.dist-info/METADATA,sha256=muukBuN4kq_4mCG_r_RFY94pQliDcVh-WuXNMApXoak,37383
+sglang-0.3.0.dist-info/WHEEL,sha256=uCRv0ZEik_232NlR4YDw4Pv3Ajt5bKvMH13NUU7hFuI,91
+sglang-0.3.0.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+sglang-0.3.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (74.0.0)
+Generator: setuptools (74.1.1)
 Root-Is-Purelib: true
 Tag: py3-none-any
 