sglang-0.4.1-py3-none-any.whl → sglang-0.4.1.post1-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
sglang/bench_serving.py CHANGED
@@ -897,6 +897,7 @@ async def benchmark(
  else:
  raise ValueError(f"Unknown backend: {backend}")

+ # Limit concurrency
  # From https://github.com/vllm-project/vllm/pull/9390
  semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None

@@ -906,6 +907,7 @@ async def benchmark(
  async with semaphore:
  return await request_func(request_func_input=request_func_input, pbar=pbar)

+ # Warmup
  print("Starting initial single prompt test run...")
  test_prompt, test_prompt_len, test_output_len = input_requests[0]
  test_input = RequestFuncInput(
@@ -924,11 +926,15 @@ async def benchmark(
  f"are correctly specified. Error: {test_output.error}"
  )
  else:
- requests.post(base_url + "/flush_cache")
  print("Initial test run completed. Starting main benchmark run...")

- time.sleep(1.5)
+ # Flush cache
+ if "sglang" in backend:
+ requests.post(base_url + "/flush_cache")
+
+ time.sleep(1.0)

+ # Start profiler
  if profile:
  print("Starting profiler...")
  profile_output = await async_request_profile(
@@ -939,6 +945,7 @@ async def benchmark(

  pbar = None if disable_tqdm else tqdm(total=len(input_requests))

+ # Run all requests
  benchmark_start_time = time.perf_counter()
  tasks: List[asyncio.Task] = []
  async for request in get_request(input_requests, request_rate):
@@ -959,6 +966,7 @@ async def benchmark(
  )
  outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)

+ # Stop profiler
  if profile:
  print("Stopping profiler...")
  profile_output = await async_request_profile(api_url=base_url + "/stop_profile")
@@ -968,8 +976,8 @@ async def benchmark(
  if pbar is not None:
  pbar.close()

+ # Compute metrics and print results
  benchmark_duration = time.perf_counter() - benchmark_start_time
-
  metrics, output_lens = calculate_metrics(
  input_requests=input_requests,
  outputs=outputs,
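The new "Limit concurrency" block caps the number of in-flight requests with a shared semaphore. A minimal, self-contained sketch of the same pattern (all names here are illustrative, not the benchmark's own):

    import asyncio
    from typing import Optional

    async def fake_request(i: int) -> int:
        # Stand-in for request_func: pretend to call the server.
        await asyncio.sleep(0.01)
        return i

    async def run_all(n_requests: int, max_concurrency: Optional[int]):
        # Same shape as bench_serving: create a semaphore only if a cap is set.
        semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None

        async def limited_request_func(i: int) -> int:
            if semaphore is None:
                return await fake_request(i)
            async with semaphore:
                return await fake_request(i)

        return await asyncio.gather(*(limited_request_func(i) for i in range(n_requests)))

    print(asyncio.run(run_all(100, max_concurrency=8)))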
sglang/lang/backend/openai.py CHANGED
@@ -366,6 +366,11 @@ class OpenAI(BaseBackend):
  def openai_completion(
  client, token_usage, is_chat=None, retries=3, prompt=None, **kwargs
  ):
+ # if "ebnf" is in kwargs, warn and remove
+ if "ebnf" in kwargs:
+ warnings.warn("EBNF is not officially supported by OpenAI endpoints. Ignoring.")
+ del kwargs["ebnf"]
+
  for attempt in range(retries):
  try:
  if is_chat:
@@ -398,6 +403,11 @@ def openai_completion(
  def openai_completion_stream(
  client, token_usage, is_chat=None, retries=3, prompt=None, **kwargs
  ):
+ # if "ebnf" is in kwargs, warn and remove
+ if "ebnf" in kwargs:
+ warnings.warn("EBNF is not officially supported by OpenAI endpoints. Ignoring.")
+ del kwargs["ebnf"]
+
  for attempt in range(retries):
  try:
  if is_chat:
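Both helpers now drop the new "ebnf" parameter before forwarding kwargs, since official OpenAI endpoints cannot enforce an EBNF grammar. A small sketch of the same guard in isolation (the function and its arguments are illustrative):

    import warnings

    def sanitize_openai_kwargs(**kwargs):
        # Mirror of the guard above: warn about and drop unsupported grammar kwargs.
        if "ebnf" in kwargs:
            warnings.warn("EBNF is not officially supported by OpenAI endpoints. Ignoring.")
            del kwargs["ebnf"]
        return kwargs

    print(sanitize_openai_kwargs(temperature=0.0, ebnf='root ::= "yes" | "no"'))
    # -> {'temperature': 0.0}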
sglang/srt/constrained/xgrammar_backend.py CHANGED
@@ -126,6 +126,12 @@ class XGrammarGrammarBackend(BaseGrammarBackend):
  f"Skip invalid json_schema: json_schema={key_string}, {e=}"
  )
  return None
+ elif key_type == "ebnf":
+ try:
+ ctx = self.grammar_compiler.compile_grammar(key_string)
+ except RuntimeError as e:
+ logging.warning(f"Skip invalid ebnf: ebnf={key_string}, {e=}")
+ return None
  elif key_type == "regex":
  logger.warning(
  "regex hasn't been supported by xgrammar yet. This is skipped."
sglang/srt/layers/attention/triton_ops/extend_attention.py CHANGED
@@ -292,27 +292,33 @@ def extend_attention_fwd(
  BLOCK_DPE = 0
  BLOCK_DV = triton.next_power_of_2(Lv)

- if is_cuda_available and CUDA_CAPABILITY[0] >= 9:
- if Lq <= 256:
- BLOCK_M, BLOCK_N = (128, 64)
- else:
- BLOCK_M, BLOCK_N = (32, 64)
- elif is_cuda_available and CUDA_CAPABILITY[0] >= 8:
- if Lq <= 128:
- BLOCK_M, BLOCK_N = (128, 128)
- elif Lq <= 256:
- BLOCK_M, BLOCK_N = (64, 64)
- else:
- BLOCK_M, BLOCK_N = (32, 64)
+ if is_hip_:
+ BLOCK_M, BLOCK_N = (64, 64)
+ num_warps = 4
+
  else:
- BLOCK_M, BLOCK_N = (64, 64) if Lq <= 128 else (32, 32)
+ if is_cuda_available and CUDA_CAPABILITY[0] >= 9:
+ if Lq <= 256:
+ BLOCK_M, BLOCK_N = (128, 64)
+ else:
+ BLOCK_M, BLOCK_N = (32, 64)
+ elif is_cuda_available and CUDA_CAPABILITY[0] >= 8:
+ if Lq <= 128:
+ BLOCK_M, BLOCK_N = (128, 128)
+ elif Lq <= 256:
+ BLOCK_M, BLOCK_N = (64, 64)
+ else:
+ BLOCK_M, BLOCK_N = (32, 64)
+ else:
+ BLOCK_M, BLOCK_N = (64, 64) if Lq <= 128 else (32, 32)
+
+ num_warps = 4 if Lk <= 64 else 8

  sm_scale = sm_scale or 1.0 / (Lq**0.5)
  batch_size, head_num = b_seq_len.shape[0], q_extend.shape[1]
  kv_group_num = q_extend.shape[1] // k_extend.shape[1]

  grid = (batch_size, head_num, triton.cdiv(max_len_extend, BLOCK_M))
- num_warps = 4 if Lk <= 64 else 8
  num_stages = 1

  extra_kargs = {}
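The refactor routes AMD (HIP) GPUs to fixed (64, 64) tiles with 4 warps, while CUDA devices keep the capability-dependent tuning. A pure-Python restatement of the selection logic for reference (this sketches the branch above, not the kernel itself):

    def pick_launch_params(is_hip, is_cuda, capability_major, Lq, Lk):
        # HIP: fixed tile sizes and warp count.
        if is_hip:
            return (64, 64), 4
        # CUDA: tile sizes depend on compute capability and head dim Lq.
        if is_cuda and capability_major >= 9:
            block = (128, 64) if Lq <= 256 else (32, 64)
        elif is_cuda and capability_major >= 8:
            if Lq <= 128:
                block = (128, 128)
            elif Lq <= 256:
                block = (64, 64)
            else:
                block = (32, 64)
        else:
            block = (64, 64) if Lq <= 128 else (32, 32)
        return block, (4 if Lk <= 64 else 8)

    print(pick_launch_params(False, True, 9, 128, 64))  # ((128, 64), 4)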
sglang/srt/layers/moe/fused_moe_triton/fused_moe.py CHANGED
@@ -11,12 +11,17 @@ from typing import Any, Callable, Dict, List, Optional, Tuple
  import torch
  import triton
  import triton.language as tl
- from sgl_kernel import moe_align_block_size as sgl_moe_align_block_size
  from vllm import _custom_ops as ops

  from sglang.srt.layers.moe.topk import select_experts
  from sglang.srt.layers.quantization.fp8_kernel import per_token_group_quant_fp8
- from sglang.srt.utils import direct_register_custom_op, get_device_name
+ from sglang.srt.utils import direct_register_custom_op, get_device_name, is_hip
+
+ not_hip = False
+ if not is_hip():
+ from sgl_kernel import moe_align_block_size as sgl_moe_align_block_size
+
+ not_hip = True

  logger = logging.getLogger(__name__)
  padding_size = 128 if bool(int(os.getenv("MOE_PADDING", "0"))) else 0
@@ -267,8 +272,14 @@ def moe_align_block_size(
  (max_num_m_blocks,), dtype=torch.int32, device=topk_ids.device
  )
  num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device=topk_ids.device)
- # FIXME(zhyncs)
- if num_experts >= 256:
+ if not_hip and num_experts >= 224:
+ token_cnts_buffer = torch.empty(
+ (num_experts + 1) * num_experts, dtype=torch.int32, device=topk_ids.device
+ )
+ cumsum_buffer = torch.empty(
+ num_experts + 1, dtype=torch.int32, device=topk_ids.device
+ )
+
  sgl_moe_align_block_size(
  topk_ids,
  num_experts,
@@ -276,6 +287,8 @@ def moe_align_block_size(
  sorted_ids,
  expert_ids,
  num_tokens_post_pad,
+ token_cnts_buffer,
+ cumsum_buffer,
  )
  else:
  ops.moe_align_block_size(
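The sgl-kernel path (now CUDA-only and taken from 224 experts up instead of 256) preallocates two int32 scratch buffers whose sizes follow directly from the expert count. A small sketch of the sizing arithmetic, per the allocation above:

    # Scratch-buffer lengths for the sgl_moe_align_block_size path:
    num_experts = 256
    token_cnts_len = (num_experts + 1) * num_experts  # per-slot expert token counts
    cumsum_len = num_experts + 1                      # prefix sums over experts
    print(token_cnts_len, cumsum_len)                 # 65792 257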
sglang/srt/layers/moe/topk.py CHANGED
@@ -1,3 +1,17 @@
+ # Copyright 2024 SGLang Team
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # ==============================================================================
+
  from typing import Callable, Optional

  import torch
sglang/srt/layers/quantization/fp8_kernel.py CHANGED
@@ -1,3 +1,17 @@
+ # Copyright 2024 SGLang Team
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # ==============================================================================
+
  from typing import List, Tuple

  import torch
sglang/srt/managers/schedule_policy.py CHANGED
@@ -248,7 +248,7 @@ class PrefillAdder:
  self.can_run_list.append(req)

  self._prefill_one_req(
- len(req.prefix_indices),
+ 0,
  req.extend_input_len,
  (
  min(req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS_ESTIMATION)
sglang/srt/managers/scheduler.py CHANGED
@@ -468,9 +468,6 @@ class Scheduler:
  self.send_to_tokenizer.send_pyobj(
  UpdateWeightFromDiskReqOutput(success, message)
  )
- elif isinstance(recv_req, GetWeightsByNameReqInput):
- parameter = self.get_weights_by_name(recv_req)
- self.send_to_tokenizer.send_pyobj(GetWeightsByNameReqOutput(parameter))
  elif isinstance(recv_req, InitWeightsUpdateGroupReqInput):
  success, message = self.init_weights_update_group(recv_req)
  self.send_to_tokenizer.send_pyobj(
@@ -565,7 +562,7 @@ class Scheduler:

  if req.logprob_start_len == -1:
  # By default, only return the logprobs for output tokens
- req.logprob_start_len = len(recv_req.input_ids) - 1
+ req.logprob_start_len = len(req.origin_input_ids) - 1

  # Truncate prompts that are too long
  if len(req.origin_input_ids) > self.max_req_input_len:
@@ -589,12 +586,15 @@ class Scheduler:
  if (
  req.sampling_params.json_schema is not None
  or req.sampling_params.regex is not None
+ or req.sampling_params.ebnf is not None
  ):
  assert self.grammar_backend is not None
  if req.sampling_params.json_schema is not None:
  key = ("json", req.sampling_params.json_schema)
  elif req.sampling_params.regex is not None:
  key = ("regex", req.sampling_params.regex)
+ elif req.sampling_params.ebnf is not None:
+ key = ("ebnf", req.sampling_params.ebnf)

  req.grammar = self.grammar_backend.get_cached_value(key)
  if not req.grammar:
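With the scheduler and SamplingParams changes in this release, an EBNF constraint can be passed through the native /generate endpoint alongside regex and json_schema. A hypothetical request against a locally launched server (the address and grammar string are illustrative):

    import requests

    resp = requests.post(
        "http://localhost:30000/generate",
        json={
            "text": "Is the sky blue? Answer:",
            "sampling_params": {
                "max_new_tokens": 8,
                "ebnf": 'root ::= "yes" | "no"',  # cached under the ("ebnf", ...) key
            },
        },
    )
    print(resp.json())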
@@ -629,16 +629,13 @@ class Scheduler:
  self.waiting_queue.append(req)

  def log_prefill_stats(self, adder, can_run_list, running_bs, has_being_chunked):
- if isinstance(self.tree_cache, RadixCache):
- self.tree_cache_metrics["total"] += (
- adder.log_input_tokens + adder.log_hit_tokens
- ) / 10**9
- self.tree_cache_metrics["hit"] += (adder.log_hit_tokens) / 10**9
- tree_cache_hit_rate = (
- self.tree_cache_metrics["hit"] / self.tree_cache_metrics["total"]
- )
- else:
- tree_cache_hit_rate = 0.0
+ self.tree_cache_metrics["total"] += (
+ adder.log_input_tokens + adder.log_hit_tokens
+ ) / 10**9
+ self.tree_cache_metrics["hit"] += (adder.log_hit_tokens) / 10**9
+ tree_cache_hit_rate = (
+ self.tree_cache_metrics["hit"] / self.tree_cache_metrics["total"]
+ )

  num_used = self.max_total_num_tokens - (
  self.token_to_kv_pool.available_size() + self.tree_cache.evictable_size()
sglang/srt/managers/tokenizer_manager.py CHANGED
@@ -22,7 +22,7 @@ import signal
  import sys
  import time
  import uuid
- from typing import Any, Awaitable, Dict, List, Optional, Tuple, Union
+ from typing import Any, Awaitable, Dict, Generic, List, Optional, Tuple, TypeVar, Union

  import fastapi
  import uvloop
@@ -173,6 +173,15 @@ class TokenizerManager:

  # Others
  self.gracefully_exit = False
+ self.init_weights_update_group_communicator = _Communicator(
+ self.send_to_scheduler, server_args.dp_size
+ )
+ self.update_weights_from_distributed_communicator = _Communicator(
+ self.send_to_scheduler, server_args.dp_size
+ )
+ self.get_weights_by_name_communicator = _Communicator(
+ self.send_to_scheduler, server_args.dp_size
+ )

  # Metrics
  if self.enable_metrics:
@@ -190,8 +199,7 @@ class TokenizerManager:
  ):
  created_time = time.time()

- if self.to_create_loop:
- self.create_handle_loop()
+ self.auto_create_handle_loop()

  if isinstance(obj, EmbeddingReqInput) and self.is_generation:
  raise ValueError(
@@ -440,8 +448,7 @@ class TokenizerManager:
  obj: UpdateWeightFromDiskReqInput,
  request: Optional[fastapi.Request] = None,
  ) -> Tuple[bool, str]:
- if self.to_create_loop:
- self.create_handle_loop()
+ self.auto_create_handle_loop()

  # default the load format to the server_args
  if obj.load_format is None:
@@ -456,7 +463,7 @@ class TokenizerManager:

  async def _wait_for_model_update_from_disk(
  self, obj: UpdateWeightFromDiskReqInput
- ) -> Tuple[bool, str, int]:
+ ) -> Tuple[bool, str]:
  self.send_to_scheduler.send_pyobj(obj)
  self.model_update_result = asyncio.Future()
  if self.server_args.dp_size == 1:
@@ -485,15 +492,11 @@ class TokenizerManager:
  obj: InitWeightsUpdateGroupReqInput,
  request: Optional[fastapi.Request] = None,
  ) -> Tuple[bool, str]:
- if self.to_create_loop:
- self.create_handle_loop()
- self.send_to_scheduler.send_pyobj(obj)
-
- self.init_weights_update_group_result = asyncio.Future()
+ self.auto_create_handle_loop()
  assert (
  self.server_args.dp_size == 1
  ), "dp_size must be 1 for init parameter update group"
- result = await self.init_weights_update_group_result
+ result = (await self.init_weights_update_group_communicator(obj))[0]
  return result.success, result.message

  async def update_weights_from_distributed(
@@ -501,44 +504,32 @@ class TokenizerManager:
  obj: UpdateWeightsFromDistributedReqInput,
  request: Optional[fastapi.Request] = None,
  ) -> Tuple[bool, str]:
- if self.to_create_loop:
- self.create_handle_loop()
+ self.auto_create_handle_loop()
+ assert (
+ self.server_args.dp_size == 1
+ ), "dp_size must be for update weights from distributed"

  # This means that weight sync
  # cannot run while requests are in progress.
  async with self.model_update_lock.writer_lock:
- self.send_to_scheduler.send_pyobj(obj)
- self.parameter_update_result: Awaitable[
- UpdateWeightsFromDistributedReqOutput
- ] = asyncio.Future()
- assert (
- self.server_args.dp_size == 1
- ), "dp_size must be for update weights from distributed"
- result = await self.parameter_update_result
+ result = (await self.update_weights_from_distributed_communicator(obj))[0]
  return result.success, result.message

  async def get_weights_by_name(
  self, obj: GetWeightsByNameReqInput, request: Optional[fastapi.Request] = None
  ):
- if self.to_create_loop:
- self.create_handle_loop()
-
- self.send_to_scheduler.send_pyobj(obj)
- self.get_weights_by_name_result = asyncio.Future()
+ self.auto_create_handle_loop()
+ results = await self.get_weights_by_name_communicator(obj)
+ all_parameters = [r.parameter for r in results]
  if self.server_args.dp_size == 1:
- result = await self.get_weights_by_name_result
- return result.parameter
+ return all_parameters[0]
  else:
- self.get_weights_by_name_tmp = []
- result = await self.get_weights_by_name_result
- all_parameters = [r.parameter for r in result]
  return all_parameters

  async def open_session(
  self, obj: OpenSessionReqInput, request: Optional[fastapi.Request] = None
  ):
- if self.to_create_loop:
- self.create_handle_loop()
+ self.auto_create_handle_loop()

  session_id = uuid.uuid4().hex
  obj.session_id = session_id
@@ -568,7 +559,7 @@ class TokenizerManager:
  background_tasks.add_task(abort_request)
  return background_tasks

- def create_handle_loop(self):
+ def auto_create_handle_loop(self):
  if not self.to_create_loop:
  return

@@ -711,21 +702,14 @@ class TokenizerManager:
  assert (
  self.server_args.dp_size == 1
  ), "dp_size must be 1 for init parameter update group"
- self.init_weights_update_group_result.set_result(recv_obj)
+ self.init_weights_update_group_communicator.handle_recv(recv_obj)
  elif isinstance(recv_obj, UpdateWeightsFromDistributedReqOutput):
  assert (
  self.server_args.dp_size == 1
  ), "dp_size must be 1 for update weights from distributed"
- self.parameter_update_result.set_result(recv_obj)
+ self.update_weights_from_distributed_communicator.handle_recv(recv_obj)
  elif isinstance(recv_obj, GetWeightsByNameReqOutput):
- if self.server_args.dp_size == 1:
- self.get_weights_by_name_result.set_result(recv_obj)
- else:
- self.get_weights_by_name_tmp.append(recv_obj)
- if len(self.get_weights_by_name_tmp) == self.server_args.dp_size:
- self.get_weights_by_name_result.set_result(
- self.get_weights_by_name_tmp
- )
+ self.get_weights_by_name_communicator.handle_recv(recv_obj)
  else:
  raise ValueError(f"Invalid object: {recv_obj=}")

@@ -809,3 +793,28 @@ class SignalHandler:
  f"SIGTERM received. {signum=} {frame=}. Draining requests and shutting down..."
  )
  self.tokenizer_manager.gracefully_exit = True
+
+
+ T = TypeVar("T")
+
+
+ class _Communicator(Generic[T]):
+ def __init__(self, sender, fan_out: int):
+ self._sender = sender
+ self._fan_out = fan_out
+ self._result_future: Optional[asyncio.Future] = None
+ self._result_values: Optional[List[T]] = None
+
+ async def __call__(self, obj):
+ self._sender.send_pyobj(obj)
+ self._result_future = asyncio.Future()
+ self._result_values = []
+ await self._result_future
+ result_values = self._result_values
+ self._result_future = self._result_values = None
+ return result_values
+
+ def handle_recv(self, recv_obj: T):
+ self._result_values.append(recv_obj)
+ if len(self._result_values) == self._fan_out:
+ self._result_future.set_result(None)
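The new _Communicator generalizes the old per-call futures: it sends one object and resolves only after fan_out replies (one per data-parallel rank) have arrived. A minimal demo, assuming the _Communicator class above is in scope and using a dummy sender in place of the ZMQ socket:

    import asyncio

    class DummySender:
        # Stand-in for send_to_scheduler: just records sent objects.
        def send_pyobj(self, obj):
            print("sent:", obj)

    async def demo():
        comm = _Communicator(DummySender(), fan_out=2)  # e.g. dp_size == 2
        task = asyncio.create_task(comm("get_weights"))
        await asyncio.sleep(0)               # let the call send and start waiting
        comm.handle_recv("reply-from-dp0")   # first reply is buffered
        comm.handle_recv("reply-from-dp1")   # second reply resolves the future
        print(await task)                    # ['reply-from-dp0', 'reply-from-dp1']

    asyncio.run(demo())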
sglang/srt/model_executor/model_runner.py CHANGED
@@ -95,12 +95,6 @@ class ModelRunner:
  ):
  logger.info("MLA optimization is turned on. Use triton backend.")
  self.server_args.attention_backend = "triton"
- # FIXME(HandH1998)
- if (
- "DeepseekV3ForCausalLM" in self.model_config.hf_config.architectures
- and not self.server_args.disable_cuda_graph
- ):
- self.server_args.disable_cuda_graph = True

  if self.server_args.enable_double_sparsity:
  logger.info(
sglang/srt/model_loader/loader.py CHANGED
@@ -770,6 +770,21 @@ class BitsAndBytesModelLoader(BaseModelLoader):
  quant_state_dict,
  )

+ def _is_8bit_weight_name(self, weight_name: str):
+ quantized_suffix = {".scb", ".weight_format"}
+ return any(weight_name.lower().endswith(suffix) for suffix in quantized_suffix)
+
+ def _is_4bit_weight_name(self, weight_name: str):
+ quantized_suffix = {
+ "absmax",
+ "quant_map",
+ "nested_absmax",
+ "nested_quant_map",
+ "bitsandbytes",
+ }
+ suffix = weight_name.split(".")[-1]
+ return any(q_suffix in suffix for q_suffix in quantized_suffix)
+
  def _quantized_8bit_generator(
  self, hf_weights_files, use_safetensors, quant_state_dict
  ) -> Generator:
@@ -779,21 +794,18 @@ class BitsAndBytesModelLoader(BaseModelLoader):
  if not weight_name.lower().endswith(".scb"):
  continue

- weight_key = weight_name.lower().replace(".scb", ".qweight")
+ weight_key = weight_name.lower().replace(".scb", ".weight")
  quant_state_dict[weight_key] = weight_tensor

  for weight_name, weight_tensor in self._hf_weight_iter(
  hf_weights_files, use_safetensors
  ):
-
- if not weight_name.endswith((".weight", ".bias")):
+ if self._is_8bit_weight_name(weight_name):
  continue

- qweight_name = weight_name.replace(".weight", ".qweight")
-
- if qweight_name in quant_state_dict:
+ if weight_name in quant_state_dict:
  set_weight_attrs(weight_tensor, {"load_in_8bit": True})
- yield qweight_name, weight_tensor
+ yield weight_name, weight_tensor
  else:
  yield weight_name, weight_tensor

@@ -806,7 +818,7 @@ class BitsAndBytesModelLoader(BaseModelLoader):
  weight_iterator = self._hf_weight_iter(hf_weights_files, use_safetensors)
  temp_state_dict = {}
  for weight_name, weight_tensor in weight_iterator:
- if weight_name.endswith((".weight", ".bias")):
+ if not self._is_4bit_weight_name(weight_name):
  continue
  # bitsandbytes library requires
  # weight.quant_state.bitsandbytes__* in CPU
@@ -830,16 +842,15 @@ class BitsAndBytesModelLoader(BaseModelLoader):
  hf_weights_files, use_safetensors
  ):

- if not weight_name.endswith((".weight", ".bias")):
+ if self._is_4bit_weight_name(weight_name):
  continue

  if (f"{weight_name}.quant_state.bitsandbytes__nf4" in temp_state_dict) or (
  f"{weight_name}.quant_state.bitsandbytes__fp4" in temp_state_dict
  ):
  quant_state = _parse_quant_state(weight_name, temp_state_dict)
- weight_name = weight_name.replace(".weight", ".qweight")
  quant_state_dict[weight_name] = quant_state
- yield weight_name.replace(".weight", ".qweight"), weight_tensor
+ yield weight_name, weight_tensor
  else:
  yield weight_name, weight_tensor
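Instead of renaming weights to ".qweight", the loader now classifies tensors by their bitsandbytes suffixes. A standalone sketch of the 4-bit classifier's behavior (the body mirrors _is_4bit_weight_name above):

    def is_4bit_weight_name(weight_name: str) -> bool:
        # Quantization side-tensors carry one of these suffixes; real weights do not.
        quantized_suffix = {"absmax", "quant_map", "nested_absmax",
                            "nested_quant_map", "bitsandbytes"}
        suffix = weight_name.split(".")[-1]
        return any(q_suffix in suffix for q_suffix in quantized_suffix)

    print(is_4bit_weight_name("model.layers.0.q_proj.weight.absmax"))  # True
    print(is_4bit_weight_name("model.layers.0.q_proj.weight"))         # False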
sglang/srt/models/gemma2.py CHANGED
@@ -307,6 +307,25 @@ class Gemma2Model(nn.Module):


  class Gemma2ForCausalLM(nn.Module):
+ # BitandBytes specific attributes
+ default_bitsandbytes_target_modules = [
+ ".gate_proj.",
+ ".down_proj.",
+ ".up_proj.",
+ ".q_proj.",
+ ".k_proj.",
+ ".v_proj.",
+ ".o_proj.",
+ ]
+ bitsandbytes_stacked_params_mapping = {
+ # shard_name, weight_name, index
+ "q_proj": ("qkv_proj", 0),
+ "k_proj": ("qkv_proj", 1),
+ "v_proj": ("qkv_proj", 2),
+ "gate_proj": ("gate_up_proj", 0),
+ "up_proj": ("gate_up_proj", 1),
+ }
+
  packed_modules_mapping = {
  "qkv_proj": [
  "q_proj",
sglang/srt/models/llama.py CHANGED
@@ -325,8 +325,8 @@ class LlamaForCausalLM(nn.Module):
  self.config = config
  self.quant_config = quant_config
  self.model = LlamaModel(config, quant_config=quant_config)
- # Llama 3.2 1B Insturct set tie_word_embeddings to True
- # Llama 3.1 8B Insturct set tie_word_embeddings to False
+ # Llama 3.2 1B Instruct set tie_word_embeddings to True
+ # Llama 3.1 8B Instruct set tie_word_embeddings to False
  if self.config.tie_word_embeddings:
  self.lm_head = self.model.embed_tokens
  else:
sglang/srt/openai_api/adapter.py CHANGED
@@ -517,6 +517,7 @@ def v1_generate_request(
  "repetition_penalty": request.repetition_penalty,
  "regex": request.regex,
  "json_schema": request.json_schema,
+ "ebnf": request.ebnf,
  "n": request.n,
  "no_stop_trim": request.no_stop_trim,
  "ignore_eos": request.ignore_eos,
@@ -692,6 +693,14 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):

  async def v1_completions(tokenizer_manager, raw_request: Request):
  request_json = await raw_request.json()
+ if "extra_body" in request_json:
+ extra = request_json["extra_body"]
+ if "ebnf" in extra:
+ request_json["ebnf"] = extra["ebnf"]
+ if "regex" in extra:
+ request_json["regex"] = extra["regex"]
+ # remove extra_body to avoid pydantic conflict
+ del request_json["extra_body"]
  all_requests = [CompletionRequest(**request_json)]
  adapted_request, request = v1_generate_request(all_requests)

@@ -936,6 +945,7 @@ def v1_chat_generate_request(
  "frequency_penalty": request.frequency_penalty,
  "repetition_penalty": request.repetition_penalty,
  "regex": request.regex,
+ "ebnf": request.ebnf,
  "n": request.n,
  "no_stop_trim": request.no_stop_trim,
  "ignore_eos": request.ignore_eos,
@@ -1108,6 +1118,15 @@ def v1_chat_generate_response(request, ret, to_file=False, cache_report=False):

  async def v1_chat_completions(tokenizer_manager, raw_request: Request):
  request_json = await raw_request.json()
+ if "extra_body" in request_json:
+ extra = request_json["extra_body"]
+ # For example, if 'ebnf' is given:
+ if "ebnf" in extra:
+ request_json["ebnf"] = extra["ebnf"]
+ if "regex" in extra:
+ request_json["regex"] = extra["regex"]
+ # remove extra_body to avoid pydantic conflict
+ del request_json["extra_body"]
  all_requests = [ChatCompletionRequest(**request_json)]
  adapted_request, request = v1_chat_generate_request(all_requests, tokenizer_manager)
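Because OpenAI's Python client rejects unknown top-level parameters, the adapter now lifts "ebnf" and "regex" out of extra_body. A hypothetical client call against a locally launched server (address, model name, and grammar are illustrative):

    import openai

    client = openai.Client(base_url="http://localhost:30000/v1", api_key="None")
    response = client.chat.completions.create(
        model="default",
        messages=[{"role": "user", "content": "Answer yes or no."}],
        # extra_body fields are unpacked to top level by the adapter above.
        extra_body={"ebnf": 'root ::= "yes" | "no"'},
    )
    print(response.choices[0].message.content)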
sglang/srt/openai_api/protocol.py CHANGED
@@ -179,6 +179,7 @@ class CompletionRequest(BaseModel):
  ignore_eos: bool = False
  skip_special_tokens: bool = True
  lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
+ ebnf: Optional[str] = None


  class CompletionResponseChoice(BaseModel):
@@ -288,6 +289,7 @@ class ChatCompletionRequest(BaseModel):
  ignore_eos: bool = False
  skip_special_tokens: bool = True
  lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
+ ebnf: Optional[str] = None


  class ChatMessage(BaseModel):
sglang/srt/sampling/sampling_params.py CHANGED
@@ -36,6 +36,7 @@ class SamplingParams:
  regex: Optional[str] = None,
  n: int = 1,
  json_schema: Optional[str] = None,
+ ebnf: Optional[str] = None,
  no_stop_trim: bool = False,
  ignore_eos: bool = False,
  skip_special_tokens: bool = True,
@@ -60,6 +61,7 @@ class SamplingParams:
  self.regex = regex
  self.n = n
  self.json_schema = json_schema
+ self.ebnf = ebnf
  self.no_stop_trim = no_stop_trim

  # Process some special cases
@@ -111,8 +113,13 @@ class SamplingParams:
  f"min_new_tokens must be in (0, max_new_tokens({self.max_new_tokens})], got "
  f"{self.min_new_tokens}."
  )
- if self.regex is not None and self.json_schema is not None:
- raise ValueError("regex and json_schema cannot be both set.")
+ grammars = [
+ self.json_schema,
+ self.regex,
+ self.ebnf,
+ ] # since mutually exclusive, only one can be set
+ if sum(x is not None for x in grammars) > 1:
+ raise ValueError("Only one of regex, json_schema, or ebnf can be set.")

  def normalize(self, tokenizer):
  # Process stop strings
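The pairwise regex/json_schema check becomes a count over all three constraint types. A standalone restatement of the rule, runnable on its own:

    def validate_grammars(json_schema=None, regex=None, ebnf=None):
        # At most one structured-output constraint may be set at a time.
        grammars = [json_schema, regex, ebnf]
        if sum(x is not None for x in grammars) > 1:
            raise ValueError("Only one of regex, json_schema, or ebnf can be set.")

    validate_grammars(ebnf='root ::= "yes" | "no"')          # passes
    try:
        validate_grammars(regex="yes|no", ebnf='root ::= "yes"')
    except ValueError as e:
        print(e)                                             # raises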
sglang/srt/server.py CHANGED
@@ -245,16 +245,11 @@ async def get_weights_by_name(obj: GetWeightsByNameReqInput, request: Request):
  try:
  ret = await tokenizer_manager.get_weights_by_name(obj, request)
  if ret is None:
- return ORJSONResponse(
- {"error": {"message": "Get parameter by name failed"}},
- status_code=HTTPStatus.BAD_REQUEST,
- )
+ return _create_error_response("Get parameter by name failed")
  else:
  return ORJSONResponse(ret, status_code=200)
  except Exception as e:
- return ORJSONResponse(
- {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
- )
+ return _create_error_response(e)


  @app.api_route("/open_session", methods=["GET", "POST"])
@@ -264,9 +259,7 @@ async def open_session(obj: OpenSessionReqInput, request: Request):
  session_id = await tokenizer_manager.open_session(obj, request)
  return session_id
  except Exception as e:
- return ORJSONResponse(
- {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
- )
+ return _create_error_response(e)


  @app.api_route("/close_session", methods=["GET", "POST"])
@@ -276,9 +269,7 @@ async def close_session(obj: CloseSessionReqInput, request: Request):
  await tokenizer_manager.close_session(obj, request)
  return Response(status_code=200)
  except Exception as e:
- return ORJSONResponse(
- {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
- )
+ return _create_error_response(e)


  # fastapi implicitly converts json in the request to obj (dataclass)
@@ -312,9 +303,7 @@ async def generate_request(obj: GenerateReqInput, request: Request):
  return ret
  except ValueError as e:
  logger.error(f"Error: {e}")
- return ORJSONResponse(
- {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
- )
+ return _create_error_response(e)


  @app.api_route("/encode", methods=["POST", "PUT"])
@@ -325,9 +314,7 @@ async def encode_request(obj: EmbeddingReqInput, request: Request):
  ret = await tokenizer_manager.generate_request(obj, request).__anext__()
  return ret
  except ValueError as e:
- return ORJSONResponse(
- {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
- )
+ return _create_error_response(e)


  @app.api_route("/classify", methods=["POST", "PUT"])
@@ -338,9 +325,7 @@ async def classify_request(obj: EmbeddingReqInput, request: Request):
  ret = await tokenizer_manager.generate_request(obj, request).__anext__()
  return ret
  except ValueError as e:
- return ORJSONResponse(
- {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
- )
+ return _create_error_response(e)


  ##### OpenAI-compatible API endpoints #####
@@ -416,6 +401,12 @@ async def retrieve_file_content(file_id: str):
  return await v1_retrieve_file_content(file_id)


+ def _create_error_response(e):
+ return ORJSONResponse(
+ {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
+ )
+
+
  def launch_engine(
  server_args: ServerArgs,
  ):
@@ -849,12 +840,10 @@ class Engine:
  group_name=group_name,
  backend=backend,
  )
-
- async def _init_group():
- return await tokenizer_manager.init_weights_update_group(obj, None)
-
  loop = asyncio.get_event_loop()
- return loop.run_until_complete(_init_group())
+ return loop.run_until_complete(
+ tokenizer_manager.init_weights_update_group(obj, None)
+ )

  def update_weights_from_distributed(self, name, dtype, shape):
  """Update weights from distributed source."""
@@ -863,22 +852,16 @@ class Engine:
  dtype=dtype,
  shape=shape,
  )
-
- async def _update_weights():
- return await tokenizer_manager.update_weights_from_distributed(obj, None)
-
  loop = asyncio.get_event_loop()
- return loop.run_until_complete(_update_weights())
+ return loop.run_until_complete(
+ tokenizer_manager.update_weights_from_distributed(obj, None)
+ )

  def get_weights_by_name(self, name, truncate_size=100):
  """Get weights by parameter name."""
  obj = GetWeightsByNameReqInput(name=name, truncate_size=truncate_size)
-
- async def _get_weights():
- return await tokenizer_manager.get_weights_by_name(obj, None)
-
  loop = asyncio.get_event_loop()
- return loop.run_until_complete(_get_weights())
+ return loop.run_until_complete(tokenizer_manager.get_weights_by_name(obj, None))


  class Runtime:
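The Engine wrappers above now hand coroutines straight to loop.run_until_complete instead of defining single-use inner functions. Hypothetical offline usage of one of these synchronous wrappers (model path and parameter name are placeholders, assuming the sglang.Engine offline entry point):

    import sglang as sgl

    engine = sgl.Engine(model_path="meta-llama/Llama-3.1-8B-Instruct")
    # Blocks until the tokenizer manager's async call completes.
    weight = engine.get_weights_by_name("model.embed_tokens.weight", truncate_size=4)
    print(weight)
    engine.shutdown()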
sglang/version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.4.1"
+ __version__ = "0.4.1.post1"
sglang-0.4.1.dist-info/METADATA → sglang-0.4.1.post1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.4.1
+ Version: 0.4.1.post1
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
  Version 2.0, January 2004
@@ -243,7 +243,7 @@ Requires-Dist: torch; extra == "srt"
  Requires-Dist: vllm<=0.6.4.post1,>=0.6.3.post1; extra == "srt"
  Requires-Dist: cuda-python; extra == "srt"
  Requires-Dist: flashinfer==0.1.6; extra == "srt"
- Requires-Dist: sgl-kernel>=0.0.2.post8; extra == "srt"
+ Requires-Dist: sgl-kernel>=0.0.2.post10; extra == "srt"
  Provides-Extra: srt-hip
  Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
  Requires-Dist: torch; extra == "srt-hip"
@@ -358,8 +358,8 @@ Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
  [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)

  ## Adoption and Sponsorship
- The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI, 01.AI and DataCrunch.
+ The project is supported by (alphabetically): AMD, Baseten, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI, 01.AI.

  ## Acknowledgment and Citation
  We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
- Please cite our paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
+ Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
sglang-0.4.1.dist-info/RECORD → sglang-0.4.1.post1.dist-info/RECORD CHANGED
@@ -4,14 +4,14 @@ sglang/bench_latency.py,sha256=oZjSAzX7dUiSu-zdz0dkyUPo-qAX_lsXFH1gf03akgI,76
  sglang/bench_offline_throughput.py,sha256=iQiJCK3KQDCdwU1NVbIwbtthssWzBXiIsKUDA7Z_hO0,12510
  sglang/bench_one_batch.py,sha256=jkyMhK0lqn5dRCYgAh30qZrNHP4gAbXODymBMNXK86I,15859
  sglang/bench_one_batch_server.py,sha256=-fV9FTLNNcSIy0pgYeggXedPVK0fVsXZqVQswT8OMOY,5945
- sglang/bench_serving.py,sha256=3VQatM51v9f55aUQQ5crYMxxKHr1AbThicsWfBy_tjU,53190
+ sglang/bench_serving.py,sha256=YQiCZreejCPBTqMmZsCB99RMi1N-Jx-dZtaafcQ8-14,53377
  sglang/check_env.py,sha256=4OqpZaEJOfBM6-vtPILto5kqDmgiZM1Koc7lK78A7CI,8427
  sglang/global_config.py,sha256=fnT0U9vlHdGaQFKN9tYTnUF4-eVW4HYQURd5zvPtrg0,1286
  sglang/launch_server.py,sha256=4y2QeSj0wVNB9MJQZeahD4ahTDU6gwqo7MPUytyFop0,403
  sglang/launch_server_llavavid.py,sha256=tGc17S1vUfLwbi1GB26oOdXxTWr7gjlqpTrPnrMRNO8,1007
  sglang/llama3_eval.py,sha256=gWSboDchIGybIce88bJlrCG0yiLZ513mw4gcutJlzGM,10017
  sglang/utils.py,sha256=23jf4Mz8E5p5a6JOkjnfYZixdjZUk88F_mZ8rZcby5Q,11597
- sglang/version.py,sha256=pMtTmSUht-XtbR_7Doz6bsQqopJJd8rZ8I8zy2HwwoA,22
+ sglang/version.py,sha256=ARioq8ApVNckeQorLPVfHZeN9mlHMLbaNgLGNbGq-ys,28
  sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  sglang/lang/chat_template.py,sha256=cnfjjxIIcYRGRxXlJlOGnpFxFuhMHut7DS52LsOMKcA,15826
  sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
@@ -23,7 +23,7 @@ sglang/lang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSu
  sglang/lang/backend/anthropic.py,sha256=EXRX7xJgA5KZszX7toSLVnKzFQ5EO0Loj-YjHFtxSxg,2081
  sglang/lang/backend/base_backend.py,sha256=tdoh9YF3CyekY1BKiX9n7-aA4srDWIuA4RDJLM7q8qg,1985
  sglang/lang/backend/litellm.py,sha256=ugmL7sfUxkUHVbHtwNzHgdQAEd4UCjNQboFuE3KThcY,2450
- sglang/lang/backend/openai.py,sha256=qM7eVH_kMxnDd2rpxOH0v76KxtOJFlAwgLgWIKvFGCI,15060
+ sglang/lang/backend/openai.py,sha256=ha9a2P6T80TmSgYlyIwB1qYawWkjcOgiOptkktkqa1U,15436
  sglang/lang/backend/runtime_endpoint.py,sha256=dfs-yZ1ekKmnbpZLluQHWPmMeZJKbaaZRRGYRa9eBE8,10541
  sglang/lang/backend/vertexai.py,sha256=O-iBLD-y3vq80UxnrAoJri7bxpgd-_eakZ88Cf8bEGA,4855
  sglang/srt/_custom_ops.py,sha256=Y4gyTDGhWz-W2Igq25Ojm8XFiyvkawW9I-79iwYvxJ0,3574
@@ -32,7 +32,7 @@ sglang/srt/conversation.py,sha256=u9zFU8aMYzwHUbQRKU76B_T-jfLlPoxUcWG_nRbDM2I,21
  sglang/srt/hf_transformers_utils.py,sha256=38Ms0H2-VMerOS6jnczcFtZMS6lhw9B5rSWKAfxVUfQ,7945
  sglang/srt/mm_utils.py,sha256=1ScBunw_x4W8ebM_AcJ62-1T2mfT8NlMJqdAhkF1lb0,12367
  sglang/srt/model_parallel.py,sha256=eLXZhvJ4wG6dh0FontNCIdVZvHYdWgaeY-5cu7TD9tE,6078
- sglang/srt/server.py,sha256=E9YKKXpXv3vPvRy0-cgcy0-5UA-OZz42-32EZWKTicA,34661
+ sglang/srt/server.py,sha256=vDucJl6qtEK2swzPJ_wYitaJvsI4MigMagGlBlH5V54,34033
  sglang/srt/server_args.py,sha256=LgnQ-kBJZ3E7hMMZj9bSK0mn7Bhjk1nJHxLcxl-lGTM,34572
  sglang/srt/utils.py,sha256=J8kFl6kDBwFZCM6AKaVTiqdhJKRg0JOH0pNrD1ZeWmM,41726
  sglang/srt/configs/__init__.py,sha256=_usVIXHQjft4PAJ1Y-yGQOn2QNOv501GYMlQwpGXbns,208
@@ -45,7 +45,7 @@ sglang/srt/constrained/__init__.py,sha256=UWZNVLvOT5ZBX8M36sONgDmnKtkQ0cSfhQD2jO
  sglang/srt/constrained/base_grammar_backend.py,sha256=FhVm7PxhXDl0joV9NP5RjKgz7dR1dZvUAQnh0mdtvVY,2353
  sglang/srt/constrained/outlines_backend.py,sha256=CipNHNNXs8xtnJNVNe6FCwZUlSbIXbGmWVlZz3hUpFQ,6820
  sglang/srt/constrained/outlines_jump_forward.py,sha256=iZWXeR3gNYoMubLGyFmLPO4V2YsN5DiGjD71Xk9iFaE,6418
- sglang/srt/constrained/xgrammar_backend.py,sha256=4It9_GqU4UZFhxIw_7hkzpXaMPUtksk6Xfe0Agsfw7A,4620
+ sglang/srt/constrained/xgrammar_backend.py,sha256=76oUFXeB29bfnEVWa1-rIrwQm5jhuMlzAX10HtAq1fQ,4887
  sglang/srt/distributed/__init__.py,sha256=__tl9Frrf3PFrSyNYcn5i-y2rL-J4-Qn6RJwrsZ4xgc,83
  sglang/srt/distributed/communication_op.py,sha256=ZoIhboZyefiAwr-1K-wF3rAFSQ4Wt-RxXpsX443Gbt4,1157
  sglang/srt/distributed/parallel_state.py,sha256=HplRH5S0AWdwSdhoHYX9_UWQZlFjh2Z1LHaz68EXlpE,47555
@@ -77,20 +77,20 @@ sglang/srt/layers/attention/torch_native_backend.py,sha256=nQdeqWEMMH_wrod5wssDC
  sglang/srt/layers/attention/triton_backend.py,sha256=-TobyZHwlbJ5HhbFg-jgCqVOw4Y-opgEuFo-EusASQc,6264
  sglang/srt/layers/attention/triton_ops/decode_attention.py,sha256=oJ_UK1t229zF3hbTDiQe7t-X-IbM2dOxx4U2ch-vmjA,17847
  sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py,sha256=1pSXfY3EEaM7iRN_uElHnAfsrJMhTFbu9fj8Z0O2PbE,21480
- sglang/srt/layers/attention/triton_ops/extend_attention.py,sha256=tZJhzqcf1KKMT8z7_32eVk_D1NHP71c-S3UNxemfAHM,11542
+ sglang/srt/layers/attention/triton_ops/extend_attention.py,sha256=DWOZXSTVN5ZbcFjDjcqs-nPdUkxSwum0SVXhVKqwh2g,11688
  sglang/srt/layers/attention/triton_ops/prefill_attention.py,sha256=lojFXRZMLWkzS2Y8uxaolnQhXaWKG19mCAWaF5KQeiI,6087
  sglang/srt/layers/moe/fused_moe_native.py,sha256=8q-LFZMSCGLc2_Gltp2lH0gSb4A1WOuKQW3wo3rpj5g,1601
- sglang/srt/layers/moe/topk.py,sha256=YjIiFqMERvkChkwZUqTrL_xaQyzsYsZzVUe4PzAhRZI,6299
+ sglang/srt/layers/moe/topk.py,sha256=JpeIl_-CNk0yyG3k5fmmNbbmR2_9bkKC23UoLOlMkjw,6954
  sglang/srt/layers/moe/ep_moe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  sglang/srt/layers/moe/ep_moe/kernels.py,sha256=wb_S2qLxoWWgQu9coXy0XLNGvHzdZSdwXr0PGy4QySg,10940
  sglang/srt/layers/moe/ep_moe/layer.py,sha256=6iQU5ZjQ8IXGoQ8ZlBuJqyQxYTEem9vXI6rbVIWKlZw,22303
  sglang/srt/layers/moe/fused_moe_triton/__init__.py,sha256=h9yMFAL_bagUf-qBED8gSWdCOb7d8IdA-pE-L_nIg8E,842
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py,sha256=GVugCKapd3CvgkvPQ_FmQplC12-grv3n1FRkLJc6WhY,30790
+ sglang/srt/layers/moe/fused_moe_triton/fused_moe.py,sha256=zXwWUtthLa9E35EvlQ9A_mnIsQyA0_NYKsUBdJqONHo,31163
  sglang/srt/layers/moe/fused_moe_triton/layer.py,sha256=BclDj5JyCy-8Bfue4broL1-IG6a4dUyggE9WQLa06sg,20575
  sglang/srt/layers/quantization/__init__.py,sha256=VPYXShHvbvkOgVBlkIqic4RhdJ1y6EZ3r34T-nZMT1k,4606
  sglang/srt/layers/quantization/base_config.py,sha256=daK9p0aijMszLUm1W4Pc33FK87MdqYK1NoWFKif-j80,4599
  sglang/srt/layers/quantization/fp8.py,sha256=wNnpXLroIl7D98mlfCiXZPE9hrP5ricHrXY1WZBzEEo,30810
- sglang/srt/layers/quantization/fp8_kernel.py,sha256=v4-7hCQFyuUSZmeJS_5VDCu6a1-EGWXQ088FdPTjO_0,8137
+ sglang/srt/layers/quantization/fp8_kernel.py,sha256=eoO1enzD9jPC80id2oC3i8bt-LN6-4Ey223yOQ9yIPE,8792
  sglang/srt/layers/quantization/fp8_utils.py,sha256=HBJBaNcln1NrLxzw0ppUjMd6w-ryuGDDHCYJq7mRQac,4035
  sglang/srt/lora/lora.py,sha256=-o2mBmUvoVpdkgdAkWTARN4kfyep3UNEJLcg6moh0SU,15056
  sglang/srt/lora/lora_config.py,sha256=a2fTQESlCbG1xLiBYy4ptZ6c0Burcqyg1_6V1XSok-Y,1506
@@ -100,10 +100,10 @@ sglang/srt/managers/detokenizer_manager.py,sha256=nZkbwt4yty_oy8rvg4T7PbgyVLoBLo
  sglang/srt/managers/image_processor.py,sha256=Y8RgyrzbJjJTpjbnZDa5qiiG5wWjZ68rOXUPDi6kkFo,13698
  sglang/srt/managers/io_struct.py,sha256=_LWWqT3LNwZGaWhg2d3kTg1V2MTHKzRasCvxF9Nfpi4,15429
  sglang/srt/managers/schedule_batch.py,sha256=qryPWCdOTFzxomDa80U-5guShOb1K4kBUWcPCCchYB8,45762
- sglang/srt/managers/schedule_policy.py,sha256=cLNi__smbg02keWgUMfB_nEM3vllocPB0XyG1P5qO7I,15469
- sglang/srt/managers/scheduler.py,sha256=3Olw4Yf4Qtn1i4PqK3PT9hkXYGE8nemL2_Xjn8JLxAQ,61819
+ sglang/srt/managers/schedule_policy.py,sha256=QxjQ8-le062AMHHxool6CxkhvB4FIwhOQPzTX_JwL6U,15447
+ sglang/srt/managers/scheduler.py,sha256=Yh15uQFhJlku8a20-lhtIsiEHAcUmpL3BzL42kLVwiI,61637
  sglang/srt/managers/session_controller.py,sha256=Yp-IV3rXczACZxZXmF-QxW9CWICGy8KHQ9ttBGJ8WXA,2800
- sglang/srt/managers/tokenizer_manager.py,sha256=Vta7Lysvh4rPWqEB00shqAzpGUfv7GdPETDqFCU8RxA,31556
+ sglang/srt/managers/tokenizer_manager.py,sha256=uKiTt__lCFXG60zQhmM_K7dU7IuedVSIQHVw3x3y5-E,31758
  sglang/srt/managers/tp_worker.py,sha256=X1EwFX3FSsmXx7jeeX2tjZRocaujabQYWm-M-0CFEBE,7363
  sglang/srt/managers/tp_worker_overlap_thread.py,sha256=-QNBJRKxraa9Xt2WI1AFzZYdneIJ1eXv0GjFzDqXoE0,8926
  sglang/srt/mem_cache/base_prefix_cache.py,sha256=QC8HS8RC5DXu14kyXsxAgEUsn0f932p2DjqzbKjc6Bs,962
@@ -115,9 +115,9 @@ sglang/srt/metrics/collector.py,sha256=ZWoFx_FKN0sNMSZ8RJWUVQ0RFEYhIHxdw0d4TZTlu
  sglang/srt/metrics/func_timer.py,sha256=VFyNRrbnKVCwnQsrlLin1lITJfjQpf9m8sGPqL5LIsQ,3438
  sglang/srt/model_executor/cuda_graph_runner.py,sha256=1n5WxoE9-0B3unwkkcR355K_D290h2LGt_7EvH02DQM,16246
  sglang/srt/model_executor/forward_batch_info.py,sha256=L5mVoW5SaO6To-7nGk0TZM-FFB5_78cARpJ-aC2rwD0,12883
- sglang/srt/model_executor/model_runner.py,sha256=Bm3NWTS3xmOGXEJnucnJZQldpVOzu-DCEUfaJy_PTU0,30104
+ sglang/srt/model_executor/model_runner.py,sha256=MLYBcYIQihu2I3PBTUghiU2mSWsDMzlKzcnX7yHa9JU,29837
  sglang/srt/model_loader/__init__.py,sha256=zGZkOBz1zx-pkaIy47BasL3fjDlAcxAXUTjInOhXHAE,919
- sglang/srt/model_loader/loader.py,sha256=VBrY4W9CiVvS_D8yXhdkW9jReV9rSMSkJplabz0Fxgk,43528
+ sglang/srt/model_loader/loader.py,sha256=7OG_8-66vFDFZ9kVKGNK1BFBjZ6ql449dlyvdCbMqvE,43876
  sglang/srt/model_loader/utils.py,sha256=0NaMR67fESFopaklmsleiL27XH1QUrjZW246MUu1EJ0,1369
  sglang/srt/model_loader/weight_utils.py,sha256=kQo9KPThjH3HAOCfC_tdwdrshdWuWJOVpPR0skSyaRY,24193
  sglang/srt/models/baichuan.py,sha256=PzBOFcEAixakPEkQSaJwC0Xc1fu-yCsN9T0I67r8QmY,14919
@@ -128,7 +128,7 @@ sglang/srt/models/deepseek.py,sha256=_cVOvR6eSEgRf6TUBpTD5uMdijDWFw4sSt4lGzl8tbg
  sglang/srt/models/deepseek_v2.py,sha256=-v_OJr2c3gJ0NMxQjvT3Jknz1XPGkzKx0TVR3NIiC6A,37284
  sglang/srt/models/exaone.py,sha256=dkERTZVxrRroqu5AGLP7D4N6n8HvDqlNaDQUIe15mZY,13038
  sglang/srt/models/gemma.py,sha256=ydRqsG-7004r1fAiz01LHUmcj_6XN0Tn4xO1keJnMQk,12126
- sglang/srt/models/gemma2.py,sha256=41PlW8pMb4rMETdAni_JWDhZeIn_QsTQireAyUjsURA,15848
+ sglang/srt/models/gemma2.py,sha256=-bFN-Te3YWAunLCrF-XFk_6fJS7gHM4Ca6h6aesXUTM,16362
  sglang/srt/models/gemma2_reward.py,sha256=nJ01KfqLSJtqMLm3sG8p2mGZFK1xhhjh7I7Ccb-_Hq8,2494
  sglang/srt/models/gpt2.py,sha256=2je1kE09sGcaORWnJuGYAkcwwOrT9EK-KhQaoCKjCSA,9517
  sglang/srt/models/gpt_bigcode.py,sha256=tovyOdJu2x3LkzmkdFXX_iJdkxuyChIDxwgvPBy6UPo,9528
@@ -136,7 +136,7 @@ sglang/srt/models/granite.py,sha256=AeQY9Dxd1ZnwgCYBK0vSXXiMGM-yt9iaOVf_ruOUHXw,
  sglang/srt/models/grok.py,sha256=J9lgNbFebvXgF19nfZyHwlGPlGWY_m0LgP506YvOYrU,15668
  sglang/srt/models/internlm2.py,sha256=_xcKtd6YtEFUTozaN-yUb0xbSYckRpomfPSKcAk4j-Y,12127
  sglang/srt/models/internlm2_reward.py,sha256=8K26A9oIFFGx_9U2mF87j7FX8K87HGKMnVL3ht1Uc7I,2398
- sglang/srt/models/llama.py,sha256=S7nS05hhFGghXu0v-w9RZyBTY6OCEVF5Aaw4GX_E_9g,19929
+ sglang/srt/models/llama.py,sha256=o3FYyOhkZJirzugyYz1kxs6RpY84O_uKowWWmt3jv24,19929
  sglang/srt/models/llama_classification.py,sha256=DwboM1xHXdf3Fddf7xGnrfdOLJwXdiJs994cIpAPa2g,2984
  sglang/srt/models/llama_embedding.py,sha256=rh-AiczPY_pTpzcACHvSMVjh1hsV_MZBBwP0LQxPsGM,3130
  sglang/srt/models/llama_reward.py,sha256=oPxh5E2UkxLULNdR68dFvt2I7j33CJFN6nyA-8L2_cg,4516
@@ -162,10 +162,10 @@ sglang/srt/models/torch_native_llama.py,sha256=YeXHorFm6QfnczLXwPb5TG9a-He0uiA9R
  sglang/srt/models/xverse.py,sha256=Oq--KqvbYu2H4TMVGEHpSnJLEwXBpxlncR9ilsQeckc,13579
  sglang/srt/models/xverse_moe.py,sha256=7E60YIST4ELYwLRgjtHiLRI5Uyc7XqQTM7jQXiWaQs4,15541
  sglang/srt/models/yivl.py,sha256=88OubtuZ38Dxb2LzfV_MTPBI4wKhh4NJqFu--efbhFM,4809
- sglang/srt/openai_api/adapter.py,sha256=DbLA4-v-QrKJHYDH4fpDSXqmyz_vpcFE-1tnhh60m6o,54057
- sglang/srt/openai_api/protocol.py,sha256=ecRNNqkhwwKZaIoJlPhtp2VTcHxBJDbNN8lrKS7uBx8,10406
+ sglang/srt/openai_api/adapter.py,sha256=X0HLuNhg-chDQjcdsQIRpZijlImEwZLHum3G0JgU4Go,54834
+ sglang/srt/openai_api/protocol.py,sha256=RMzeDfh2tZITjhNwB2nX68wZwQe40N6HBuVebCzEWiU,10468
  sglang/srt/sampling/sampling_batch_info.py,sha256=s--zNjk-LErZ5lMqnZ7KiuJltaziKRbQAU5qYpKIxAc,8564
- sglang/srt/sampling/sampling_params.py,sha256=n7RbBg_bS5fYhsiWa8uJYnfoXy_i5DvtTBOkuFnHDNU,5286
+ sglang/srt/sampling/sampling_params.py,sha256=BkgCJAOSmQXwJrNXg26zSjKfMy0d5mMN6oHRk_ZuESI,5499
  sglang/srt/sampling/penaltylib/__init__.py,sha256=5vQw0Y5DSzmsoFg1IdMIKLwFVhYZ5ArADHVBYbSmOec,513
  sglang/srt/sampling/penaltylib/orchestrator.py,sha256=J-DEemZcKm1--o37kf3qDOE8SZ_6H3d5oex49Mgq2ZU,10762
  sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py,sha256=1Zp2aL6dD60mwD1tCcSG0x5IYo0v4z9ce-q_YwbJ9f8,2490
@@ -188,8 +188,8 @@ sglang/test/test_layernorm.py,sha256=IacByD5d-stXjzBz8Ypamc7povlcedpKPbb_4JLgo3c
  sglang/test/test_programs.py,sha256=1Z0umrsUu9pagzyGH5SrXl_qhKSyTfUv_kWC2mcn0qo,18208
  sglang/test/test_utils.py,sha256=HJG7kUQOk6n9FBbH89PDtQ41C3kt1cfJODhAEcFT0AQ,23823
  sglang/test/srt/sampling/penaltylib/utils.py,sha256=CjxHgywh0hx_87iynzQt_ztHu6zBVuE-YrZ-XPmW6U4,12906
- sglang-0.4.1.dist-info/LICENSE,sha256=FJXh51fvTQklojUFY89XVLsjxRcBqOxPs8XNy-2uZ0c,11346
- sglang-0.4.1.dist-info/METADATA,sha256=RlVEQtwr_CCGTs83vNPwWXQukutbFfBz9xBPlXSl6qc,22523
- sglang-0.4.1.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
- sglang-0.4.1.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
- sglang-0.4.1.dist-info/RECORD,,
+ sglang-0.4.1.post1.dist-info/LICENSE,sha256=FJXh51fvTQklojUFY89XVLsjxRcBqOxPs8XNy-2uZ0c,11346
+ sglang-0.4.1.post1.dist-info/METADATA,sha256=R2YDOrUU_49x5TEbNUODNlXvkSIzFqT7-hvInlSCs5k,22527
+ sglang-0.4.1.post1.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+ sglang-0.4.1.post1.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+ sglang-0.4.1.post1.dist-info/RECORD,,