sglang 0.3.1__py3-none-any.whl → 0.3.1.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. sglang/bench_latency.py +10 -3
  2. sglang/bench_server_latency.py +187 -0
  3. sglang/bench_serving.py +1 -1
  4. sglang/global_config.py +5 -13
  5. sglang/lang/interpreter.py +0 -3
  6. sglang/srt/constrained/fsm_cache.py +5 -1
  7. sglang/srt/layers/activation.py +16 -1
  8. sglang/srt/layers/attention_backend.py +12 -12
  9. sglang/srt/layers/fused_moe/layer.py +27 -7
  10. sglang/srt/layers/layernorm.py +21 -6
  11. sglang/srt/layers/sampler.py +40 -98
  12. sglang/srt/lora/lora_manager.py +11 -8
  13. sglang/srt/managers/io_struct.py +3 -0
  14. sglang/srt/managers/policy_scheduler.py +49 -93
  15. sglang/srt/managers/schedule_batch.py +2 -1
  16. sglang/srt/managers/tp_worker.py +19 -13
  17. sglang/srt/model_executor/cuda_graph_runner.py +25 -13
  18. sglang/srt/model_executor/model_runner.py +37 -46
  19. sglang/srt/models/deepseek_v2.py +8 -3
  20. sglang/srt/models/llama.py +1 -3
  21. sglang/srt/models/llama_classification.py +2 -3
  22. sglang/srt/models/minicpm3.py +7 -3
  23. sglang/srt/models/olmoe.py +415 -0
  24. sglang/srt/models/xverse.py +1 -3
  25. sglang/srt/models/xverse_moe.py +1 -4
  26. sglang/srt/sampling/sampling_batch_info.py +3 -50
  27. sglang/srt/server.py +6 -1
  28. sglang/srt/server_args.py +39 -10
  29. sglang/srt/utils.py +7 -51
  30. sglang/test/few_shot_gsm8k.py +8 -2
  31. sglang/test/test_utils.py +1 -1
  32. sglang/version.py +1 -1
  33. {sglang-0.3.1.dist-info → sglang-0.3.1.post2.dist-info}/METADATA +4 -5
  34. {sglang-0.3.1.dist-info → sglang-0.3.1.post2.dist-info}/RECORD +37 -35
  35. {sglang-0.3.1.dist-info → sglang-0.3.1.post2.dist-info}/WHEEL +1 -1
  36. {sglang-0.3.1.dist-info → sglang-0.3.1.post2.dist-info}/LICENSE +0 -0
  37. {sglang-0.3.1.dist-info → sglang-0.3.1.post2.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py CHANGED
@@ -21,6 +21,8 @@ import logging
21
21
  import random
22
22
  from typing import List, Optional, Union
23
23
 
24
+ from sglang.srt.utils import is_hip
25
+
24
26
  logger = logging.getLogger(__name__)
25
27
 
26
28
 
@@ -59,6 +61,7 @@ class ServerArgs:
59
61
  tp_size: int = 1
60
62
  stream_interval: int = 1
61
63
  random_seed: Optional[int] = None
64
+ constrained_json_whitespace_pattern: Optional[str] = None
62
65
 
63
66
  # Logging
64
67
  log_level: str = "info"
@@ -94,11 +97,12 @@ class ServerArgs:
94
97
  disable_cuda_graph_padding: bool = False
95
98
  disable_disk_cache: bool = False
96
99
  disable_custom_all_reduce: bool = False
100
+ disable_mla: bool = False
97
101
  enable_mixed_chunk: bool = False
98
102
  enable_torch_compile: bool = False
103
+ max_torch_compile_bs: int = 32
99
104
  torchao_config: str = ""
100
105
  enable_p2p_check: bool = False
101
- enable_mla: bool = False
102
106
  triton_attention_reduce_in_fp32: bool = False
103
107
 
104
108
  # LoRA
@@ -152,11 +156,12 @@ class ServerArgs:
152
156
  )
153
157
  self.sampling_backend = "pytorch"
154
158
 
155
- # Default kernel backends
156
- if self.enable_mla:
157
- logger.info("MLA optimization is tunred on. Use triton backend.")
159
+ # ROCm: flashinfer available later
160
+ if is_hip():
158
161
  self.attention_backend = "triton"
162
+ self.sampling_backend = "pytorch"
159
163
 
164
+ # Default kernel backends
160
165
  if self.attention_backend is None:
161
166
  self.attention_backend = "flashinfer"
162
167
 
@@ -359,6 +364,12 @@ class ServerArgs:
359
364
  default=ServerArgs.random_seed,
360
365
  help="The random seed.",
361
366
  )
367
+ parser.add_argument(
368
+ "--constrained-json-whitespace-pattern",
369
+ type=str,
370
+ default=ServerArgs.constrained_json_whitespace_pattern,
371
+ help=r"Regex pattern for syntactic whitespaces allowed in JSON constrained output. For example, to allow the model generate consecutive whitespaces, set the pattern to [\n\t ]*",
372
+ )
362
373
  parser.add_argument(
363
374
  "--log-level",
364
375
  type=str,
@@ -488,6 +499,11 @@ class ServerArgs:
488
499
  default=False,
489
500
  help="Disable the custom all-reduce kernel and fall back to NCCL.",
490
501
  )
502
+ parser.add_argument(
503
+ "--disable-mla",
504
+ action="store_true",
505
+ help="Disable Multi-head Latent Attention (MLA) for DeepSeek-V2.",
506
+ )
491
507
  parser.add_argument(
492
508
  "--enable-mixed-chunk",
493
509
  action="store_true",
@@ -498,6 +514,12 @@ class ServerArgs:
498
514
  action="store_true",
499
515
  help="Optimize the model with torch.compile. Experimental feature.",
500
516
  )
517
+ parser.add_argument(
518
+ "--max-torch-compile-bs",
519
+ type=int,
520
+ default=ServerArgs.max_torch_compile_bs,
521
+ help="Set the maximum batch size when using torch compile.",
522
+ )
501
523
  parser.add_argument(
502
524
  "--torchao-config",
503
525
  type=str,
@@ -509,11 +531,6 @@ class ServerArgs:
509
531
  action="store_true",
510
532
  help="Enable P2P check for GPU access, otherwise the p2p access is allowed by default.",
511
533
  )
512
- parser.add_argument(
513
- "--enable-mla",
514
- action="store_true",
515
- help="Enable Multi-head Latent Attention (MLA) for DeepSeek-V2.",
516
- )
517
534
  parser.add_argument(
518
535
  "--triton-attention-reduce-in-fp32",
519
536
  action="store_true",
@@ -532,7 +549,8 @@ class ServerArgs:
532
549
  type=str,
533
550
  nargs="*",
534
551
  default=None,
535
- help="The list of LoRA adapters.",
552
+ action=LoRAPathAction,
553
+ help="The list of LoRA adapters. You can provide a list of either path in str or renamed path in the format {name}={path}",
536
554
  )
537
555
  parser.add_argument(
538
556
  "--max-loras-per-batch",
@@ -590,3 +608,14 @@ class PortArgs:
590
608
  controller_port: int
591
609
  detokenizer_port: int
592
610
  nccl_ports: List[int]
611
+
612
+
613
+ class LoRAPathAction(argparse.Action):
614
+ def __call__(self, parser, namespace, values, option_string=None):
615
+ setattr(namespace, self.dest, {})
616
+ for lora_path in values:
617
+ if "=" in lora_path:
618
+ name, path = lora_path.split("=", 1)
619
+ getattr(namespace, self.dest)[name] = path
620
+ else:
621
+ getattr(namespace, self.dest)[lora_path] = lora_path
sglang/srt/utils.py CHANGED
@@ -51,6 +51,11 @@ show_time_cost = False
51
51
  time_infos = {}
52
52
 
53
53
 
54
+ # torch flag AMD GPU
55
+ def is_hip() -> bool:
56
+ return torch.version.hip is not None
57
+
58
+
54
59
  def enable_show_time_cost():
55
60
  global show_time_cost
56
61
  show_time_cost = True
@@ -187,7 +192,7 @@ def allocate_init_ports(
187
192
  cur_port += 1
188
193
 
189
194
  if port is not None and ret_ports[0] != port:
190
- logger.warn(
195
+ logger.warning(
191
196
  f"WARNING: Port {port} is not available. Use port {ret_ports[0]} instead."
192
197
  )
193
198
 
@@ -623,56 +628,7 @@ def set_ulimit(target_soft_limit=65535):
623
628
  try:
624
629
  resource.setrlimit(resource_type, (target_soft_limit, current_hard))
625
630
  except ValueError as e:
626
- logger.warn(f"Fail to set RLIMIT_NOFILE: {e}")
627
-
628
-
629
- def is_llama3_405b_fp8_head_16(model_config):
630
- """Return whether the model is meta-llama/Meta-Llama-3.1-405B-FP8 with 16 kv heads."""
631
- if (
632
- model_config.hf_config.architectures[0] == "LlamaForCausalLM"
633
- and model_config.hf_config.hidden_size == 16384
634
- and model_config.hf_config.intermediate_size == 53248
635
- and model_config.hf_config.num_hidden_layers == 126
636
- and model_config.hf_config.num_key_value_heads == 16
637
- and hasattr(model_config.hf_config, "quantization_config")
638
- and model_config.hf_config.quantization_config["quant_method"] == "fbgemm_fp8"
639
- ):
640
- return True
641
- return False
642
-
643
-
644
- def monkey_patch_vllm_qvk_linear_loader():
645
- """A temporary hack to fix the num_heads for meta-llama/Meta-Llama-3.1-405B-FP8 checkpoints."""
646
- from vllm.model_executor.layers.linear import QKVParallelLinear
647
-
648
- origin_weight_loader = QKVParallelLinear.weight_loader
649
-
650
- def get_original_weight(loaded_weight, head_dim):
651
- n_kv_head = loaded_weight.shape[0] // (2 * head_dim)
652
- dim = loaded_weight.shape[1]
653
- for i in range(n_kv_head):
654
- loaded_weight[i * head_dim : (i + 1) * head_dim, :] = loaded_weight[
655
- 2 * i * head_dim : (2 * i + 1) * head_dim, :
656
- ]
657
- original_kv_weight = loaded_weight[: n_kv_head * head_dim, :]
658
- assert original_kv_weight.shape == (n_kv_head * head_dim, dim)
659
- return original_kv_weight
660
-
661
- def weight_loader_srt(
662
- self,
663
- param: Parameter,
664
- loaded_weight: torch.Tensor,
665
- loaded_shard_id: Optional[str] = None,
666
- ):
667
- if (
668
- loaded_shard_id in ["k", "v"]
669
- and loaded_weight.shape[0] == self.head_size * self.total_num_kv_heads * 2
670
- ):
671
- loaded_weight = get_original_weight(loaded_weight, self.head_size)
672
-
673
- origin_weight_loader(self, param, loaded_weight, loaded_shard_id)
674
-
675
- setattr(QKVParallelLinear, "weight_loader", weight_loader_srt)
631
+ logger.warning(f"Fail to set RLIMIT_NOFILE: {e}")
676
632
 
677
633
 
678
634
  def add_api_key_middleware(app, api_key: str):
@@ -44,7 +44,7 @@ def get_answer_value(answer_str):
44
44
  return INVALID
45
45
 
46
46
 
47
- def main(args):
47
+ def run_eval(args):
48
48
  # Select backend
49
49
  set_default_backend(RuntimeEndpoint(f"{args.host}:{args.port}"))
50
50
 
@@ -119,6 +119,12 @@ def main(args):
119
119
  # Dump results
120
120
  dump_state_text("tmp_output_gsm8k.txt", states)
121
121
 
122
+ return {
123
+ "accuracy": acc,
124
+ "latency": latency,
125
+ "output_throughput": output_throughput,
126
+ }
127
+
122
128
 
123
129
  if __name__ == "__main__":
124
130
  parser = argparse.ArgumentParser()
@@ -129,4 +135,4 @@ if __name__ == "__main__":
129
135
  parser.add_argument("--host", type=str, default="http://127.0.0.1")
130
136
  parser.add_argument("--port", type=int, default=30000)
131
137
  args = parser.parse_args()
132
- main(args)
138
+ run_eval(args)
sglang/test/test_utils.py CHANGED
@@ -22,6 +22,7 @@ from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
22
22
  from sglang.srt.utils import kill_child_process
23
23
  from sglang.utils import get_exception_traceback
24
24
 
25
+ DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-FP8"
25
26
  DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
26
27
  DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
27
28
  DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600
@@ -304,7 +305,6 @@ def add_common_sglang_args_and_parse(parser: argparse.ArgumentParser):
304
305
  def select_sglang_backend(args: argparse.Namespace):
305
306
  if args.backend.startswith("srt"):
306
307
  if args.backend == "srt-no-parallel":
307
- global_config.enable_parallel_decoding = False
308
308
  global_config.enable_parallel_encoding = False
309
309
  backend = RuntimeEndpoint(f"{args.host}:{args.port}")
310
310
  elif args.backend.startswith("gpt-"):
sglang/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.3.1"
1
+ __version__ = "0.3.1.post2"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sglang
3
- Version: 0.3.1
3
+ Version: 0.3.1.post2
4
4
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -269,7 +269,7 @@ Requires-Dist: peft; extra == "test"
269
269
 
270
270
  --------------------------------------------------------------------------------
271
271
 
272
- | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) |
272
+ | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Weekly Development Meeting**](https://calendar.app.google/v2Tw3kuHkKYyp8VV7) |
273
273
 
274
274
  SGLang is a fast serving framework for large language models and vision language models.
275
275
  It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
@@ -278,7 +278,7 @@ The core features include:
278
278
  - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
279
279
  - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
280
280
  - **Extensive Model Support**: Supports a wide range of generative models (Llama 3, Gemma 2, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
281
- - **Active Community**: SGLang is open-source and backed by an active community with industry adoption, welcoming contributions to improve LLM and VLM serving.
281
+ - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
282
282
 
283
283
  ## News
284
284
  - [2024/09] 🔥 SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
@@ -318,7 +318,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
318
318
  ### Method 2: From source
319
319
  ```
320
320
  # Use the last release branch
321
- git clone -b v0.3.1 https://github.com/sgl-project/sglang.git
321
+ git clone -b v0.3.1.post2 https://github.com/sgl-project/sglang.git
322
322
  cd sglang
323
323
 
324
324
  pip install --upgrade pip
@@ -483,7 +483,6 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
483
483
  - To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes.
484
484
  - To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
485
485
  - To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
486
- - To enable DeepSeek MLA acceleration, add `--enable-mla`.
487
486
  - If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
488
487
  - To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
489
488
  ```
@@ -1,18 +1,19 @@
1
1
  sglang/__init__.py,sha256=T8MYdFfKFPZcgFKHMBpOCIlFbhjwmr77Nqm6mdE6bCY,1590
2
2
  sglang/api.py,sha256=pH4CjwOXUweL5MF1sIkFMddDxfnF7PyUxEHC5kvNVbI,6468
3
- sglang/bench_latency.py,sha256=EvmXpaREU-g25OTcOUTgAUPmA-txfnyjaqY-4hlq97w,16925
4
- sglang/bench_serving.py,sha256=6OM5JIDuoxJDg-VLE4ijGGcS8-6ViaidV05lIrZmSzo,36239
3
+ sglang/bench_latency.py,sha256=bA50iUYOxEnLjzY2S4AgwxtSAqujUbGfQFwbLZj5XNc,17160
4
+ sglang/bench_server_latency.py,sha256=KvFJgKQTSons7KOG0CBqnnOOx1gW29bBM1Z3GQO_6-E,5599
5
+ sglang/bench_serving.py,sha256=3gIJ1O2x51Fwd4wYJjgwluTbWKXL-azckQte7YC5zIc,36261
5
6
  sglang/check_env.py,sha256=rGRABCgt-0SfUrow4px28b2P59aMn8eVTnN5eZc_a8s,5397
6
- sglang/global_config.py,sha256=KWpXd4OCCWW2TRQo-dShvLs4jb15ej9Ejhxr_wggzBg,1535
7
+ sglang/global_config.py,sha256=38id86i3tRGCSOFZlN1LM01a3xt-V98xuNgKGG9boCk,1058
7
8
  sglang/launch_server.py,sha256=UnjNjYuZ8TtvmRtgYEsFImkbvCwvn_tQjk0V7cHy67E,450
8
9
  sglang/launch_server_llavavid.py,sha256=olPKyhozi1coCwoRMwBRYWsTFByrgus9CwPSeNmskgc,1002
9
10
  sglang/utils.py,sha256=NA_4xUrTI7KICQ3PEACfNWKE3nxSA5QvQZJNd4TQrDc,9395
10
- sglang/version.py,sha256=r4xAFihOf72W9TD-lpMi6ntWSTKTP2SlzKP1ytkjRbI,22
11
+ sglang/version.py,sha256=U9F0UlFDynnYN5dX-kxehylWCwXo9a6E6W4FfDusfRg,28
11
12
  sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
13
  sglang/lang/chat_template.py,sha256=uqI_I9zIKXGXg7-W-yjqvx1ZeS_TuwFCms6wkmC2QmY,13411
13
14
  sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
14
15
  sglang/lang/compiler.py,sha256=o1C6G3TzhjSlsH-doTPy5oiVehr57dxNTa5oZw5TTAI,7639
15
- sglang/lang/interpreter.py,sha256=M42SuOnijFaHWOe3Qyi-bNanRt-mYhSDa1wWn1J42Hw,30324
16
+ sglang/lang/interpreter.py,sha256=rOquFbMzxry7IItZlAn5TwtQfxMy718JPxOkiXO-yrg,30234
16
17
  sglang/lang/ir.py,sha256=W3UfZikcGeT86PDDjDjw-yNzrKY2e2UYO4DTatMCfm0,17704
17
18
  sglang/lang/tracer.py,sha256=borJmlSJOhg1RUndGRnilnR60eEZz2Y9aU7BpftsOxU,8287
18
19
  sglang/lang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -25,82 +26,83 @@ sglang/lang/backend/vertexai.py,sha256=O-iBLD-y3vq80UxnrAoJri7bxpgd-_eakZ88Cf8bE
25
26
  sglang/srt/conversation.py,sha256=S5w5V6G1xigNxa3UQoSxRcMpQLWWDT9EPBoHBvHkSAk,19663
26
27
  sglang/srt/hf_transformers_utils.py,sha256=6HlqcmGPIvnSGaEEICeuzwag1QylSoSGbXRVvUdIMDo,6016
27
28
  sglang/srt/mm_utils.py,sha256=zox644S3IHUWmADdK4MnIbdTS2DWHOy0_Dq0gCU38QQ,12273
28
- sglang/srt/server.py,sha256=FNmTpX7E9fVWj_NFzp4AtE5ODaA_rg5Xm8uZ0FB0X4o,20041
29
- sglang/srt/server_args.py,sha256=5OHH3gaO1s5Y2UQw2_FnFxwxrsqnUQ_WNqP1R1IWUAA,21877
30
- sglang/srt/utils.py,sha256=pckOt7gyQfJaV3-h8FPurWyrPij5_EBUX_Xp7x6y6YM,24229
29
+ sglang/srt/server.py,sha256=n4QRn36_t-HAH-lSME3tiZSCUGRQwqMUckgs0paHq5g,20179
30
+ sglang/srt/server_args.py,sha256=3XjDt6SSjTfbOe0HSXA--2aUvrpWSnQmAHYwmeS1-M0,23159
31
+ sglang/srt/utils.py,sha256=8yxiMRttCcfswynkNPWD3yZFNAGFz2P1PzSuxHCBGns,22340
31
32
  sglang/srt/configs/__init__.py,sha256=292SuEorST-lAq2Uvsv2M7yC28uYZlssVvRDsF-bZCQ,86
32
33
  sglang/srt/configs/exaone.py,sha256=Duxd4yQoKy8GWEzZD_kCY_OzmN_67CTJL_Kgn0eXk3g,10731
33
34
  sglang/srt/configs/model_config.py,sha256=OqHrucJQHbH-wxgkGj-Dcx_B888uUGASpLRjz40HaLY,6651
34
35
  sglang/srt/constrained/__init__.py,sha256=ze8awDPvwAzdeMwzJ-25kXOQ4nVWoaP55jBDt5UOS_4,2070
35
36
  sglang/srt/constrained/base_tool_cache.py,sha256=5sazBMHHDpHMoqOjuY6itCxwTmIFCflIWEDXMtmrPVs,2006
36
- sglang/srt/constrained/fsm_cache.py,sha256=jItSvCu_XrAgltfejwgvdltaiT98-8lJGBe_84cSnTk,2786
37
+ sglang/srt/constrained/fsm_cache.py,sha256=k7DRUAaiLTEX5UarfJ17gEYQ-QWQAGfvykjYFkM_Y2U,2982
37
38
  sglang/srt/constrained/jump_forward.py,sha256=9_HxmXtWjr5S6a5e0cBimbY3ZhiLiJC74V6jIqDXfuo,6575
38
- sglang/srt/layers/activation.py,sha256=JEXNTgqxoiU4N-gVm4XMjobhft4JKDcMrgTkfpsRUzM,4856
39
- sglang/srt/layers/attention_backend.py,sha256=39P3iMs7B1iEzCA3EHdqUp3BLafeIVFnFWGzpEhlTRk,18182
39
+ sglang/srt/layers/activation.py,sha256=i3omgj3GdUIZBqJNUjpdJsMc2UM3Lx07FT2J1WICrqA,5171
40
+ sglang/srt/layers/attention_backend.py,sha256=lqMsY4VaOO_szIWoTAinXf1DnP2UsbF32kzvwFySz9w,18119
40
41
  sglang/srt/layers/flashinfer_utils.py,sha256=jyaO7XiEisFZg_dfaCbfRCHSHSKYoM1wOzfHa0h1q14,7413
41
- sglang/srt/layers/layernorm.py,sha256=RXuS4UyksatqTF6lSK7VYyEiUEnBiNIBlEn8q4w84UA,3404
42
+ sglang/srt/layers/layernorm.py,sha256=p_7bnmSpJ_slpoP0Gk5wQPpHtLllUu3imSIRBqGqTP0,3737
42
43
  sglang/srt/layers/logits_processor.py,sha256=Js2qSk1Z3uPL2cYO1ARai51f2i8OedV3qdwByQVSJtI,12439
43
44
  sglang/srt/layers/pooler.py,sha256=qNMG3Ycvt2yf9mk1Lcs-2K7oPeCuVeDYoHAxkMu9b_Q,1610
44
45
  sglang/srt/layers/radix_attention.py,sha256=EcVO0fUSmgvE_9R-MlpgJq0O_uT8ACuHzbMi19bANYc,1874
45
- sglang/srt/layers/sampler.py,sha256=1BKsZbSLBGFVtTJo1LsThuoRjOSOnsL1AiwFxJNIXRs,5800
46
+ sglang/srt/layers/sampler.py,sha256=Y0o1bndTGRD713fHMbN5-LRUiyneBkb7bH_QlkkeqSs,3836
46
47
  sglang/srt/layers/torchao_utils.py,sha256=rTECwKSXhj_ylh_iSzfbopz9_lZOFHatquQrNJNLZlE,2703
47
48
  sglang/srt/layers/fused_moe/__init__.py,sha256=bWCrDdOy2ANEXTb8CHYO63O3Iu3eZnn0PJbgl0z5vvE,75
48
49
  sglang/srt/layers/fused_moe/fused_moe.py,sha256=1WM2cObWXcFWtqh_utGJFPnrT344rORwuQ9hJDaH2s0,23104
49
- sglang/srt/layers/fused_moe/layer.py,sha256=GT3r2UPx_PAufJd0SUMOXyh76ymAeYDubd0SM0H71bo,20977
50
+ sglang/srt/layers/fused_moe/layer.py,sha256=raFyvPzjYz-Fv8B3IcOxQYKKCWqXis5mXwg1GFE61y4,22243
50
51
  sglang/srt/layers/triton_attention/decode_attention.py,sha256=XCQTX0kUttT1AG5FRMgfQbiXgvoempYD0UR2r6D_vJg,16711
51
52
  sglang/srt/layers/triton_attention/extend_attention.py,sha256=XTUTMrE-5jfMEufQUifZ-8NJQABSPcF47qhnNT5Z1iI,11050
52
53
  sglang/srt/layers/triton_attention/prefill_attention.py,sha256=QkXPcT02c13zha2M4mBm2S5dh_sS-Gc4FkkrcywRqvc,5377
53
54
  sglang/srt/lora/lora.py,sha256=ksj866lgDul6zxO30Jm7Nrjv-mFAMrzdvP8sez3Pl6U,14938
54
55
  sglang/srt/lora/lora_config.py,sha256=paVB7F7SIuxr_vodvKf8zzAlH2fdVYHhXxcXV62D0Vo,1411
55
- sglang/srt/lora/lora_manager.py,sha256=Q7rk1SMEZ75wda68rAZDGVyX_o8ZdIW2I5Fo_llaqHs,9475
56
+ sglang/srt/lora/lora_manager.py,sha256=7J7cGmyy1Ph4HCvLdM-ViAizAbV1snZqD-S7JLWXasI,9561
56
57
  sglang/srt/managers/controller_multi.py,sha256=KolZDso2WqH1ZhQw9p1eTmlFRgo4bcvzBxE44_sNE_o,6300
57
58
  sglang/srt/managers/controller_single.py,sha256=DiZALP_iIPZQMRx09a-LwT5_Dg7p-WU8HXyMoxJ9sRA,4955
58
59
  sglang/srt/managers/detokenizer_manager.py,sha256=yQkL5gLomLiy1qc6e9HNz8hcj7JQFHm1AfIrzpXaWJE,6852
59
- sglang/srt/managers/io_struct.py,sha256=bqmL3NDPLqOn6Au3WLF0NOe8Dh7ECMN7BTHCkEZ_Edk,11247
60
- sglang/srt/managers/policy_scheduler.py,sha256=tiBUi2GJU5eQEBK6HfsO1_YjWtFkougo40954DIp4dM,13026
61
- sglang/srt/managers/schedule_batch.py,sha256=QfixWzh7ks60eYE52mZHfUseXqcb89h4ZO1Aur3weLU,27340
60
+ sglang/srt/managers/io_struct.py,sha256=yNV5BmeUzLPqv19j79kXQ50Iaqdk4vP-_TciiRf4OEE,11396
61
+ sglang/srt/managers/policy_scheduler.py,sha256=PVo0DV0-5ODNN7FkPkeF1Y8BQ6uuLldPETOlB_YvvL4,11560
62
+ sglang/srt/managers/schedule_batch.py,sha256=ns2qkaYAvzul-LCV1BEB6q1t5jKyftNsReMv62PC8M0,27386
62
63
  sglang/srt/managers/tokenizer_manager.py,sha256=ql-sObjl1oRigJwnLtqqTaaw-i7gPTDMoNXDEMftr40,29643
63
- sglang/srt/managers/tp_worker.py,sha256=Zbl_tFUAsD6Qv1fUEJCn_jyUc3JjDm33yI3Nmu1HY8w,39174
64
+ sglang/srt/managers/tp_worker.py,sha256=0Y0k-roDrBxWZxD0axv5CCvUUW8vsJ8n78TANHLzEFs,39503
64
65
  sglang/srt/mem_cache/base_prefix_cache.py,sha256=qEQwEkG4E5rab2ZoTqcesf5pR_J4nV2jBxIHsBJHtIM,924
65
66
  sglang/srt/mem_cache/chunk_cache.py,sha256=CjZZYlqQzq7mYOiBMLWA5XNb6HIyh5lIMdY-K0OUZEc,2368
66
67
  sglang/srt/mem_cache/flush_cache.py,sha256=pTLKPRB17U6vl5RFJJvuJ4jCL2SyomgkUBNlkDpGRqo,978
67
68
  sglang/srt/mem_cache/memory_pool.py,sha256=4br3Ea2bfA-YsF_sPOVHlF2zQzYGd8fVaYTp197yZsE,7871
68
69
  sglang/srt/mem_cache/radix_cache.py,sha256=0AVr1BKKDOtTyybUkwxrz6PT8khDx-DpzgN5MgL27IE,10088
69
- sglang/srt/model_executor/cuda_graph_runner.py,sha256=LngmwtBcvobJ_9G8lD966SihjmMJlgMgHe_ZogK1kDg,10090
70
+ sglang/srt/model_executor/cuda_graph_runner.py,sha256=gZ0Wukqz6u67MMIj4MC8JET9jcHdh0rotYzpuPlHruY,10512
70
71
  sglang/srt/model_executor/forward_batch_info.py,sha256=yvkhayY9Zu6gysoojcGT73lADGOtfHKkFKWdJLRyACI,6141
71
- sglang/srt/model_executor/model_runner.py,sha256=7jBSCdZxyDLWMOdwv1vRa7Oue-xbp8lA6I11ZPKFdAc,23457
72
+ sglang/srt/model_executor/model_runner.py,sha256=X7AG1k9AI_kqS8q1i5Bfv-kFysIdqJAVWMGGZoAPThY,22726
72
73
  sglang/srt/models/baichuan.py,sha256=NrG1rMJXhemkrUCEf8xKOSDQVsOD-nN8RQz6MWHOg84,15124
73
74
  sglang/srt/models/chatglm.py,sha256=KwxLHBEvK02McXDvBS0gnRxfIvOAu2QP7lgibrj9Nbc,13371
74
75
  sglang/srt/models/commandr.py,sha256=2rAXRZRb4PkJZ4NWEqP_rIgsjxbdZyHpuoMOarqTWzQ,14163
75
76
  sglang/srt/models/dbrx.py,sha256=N_0Ku_p1NCsc29NktUBNqPv7Z33XhYxOZK5xN7nzW4s,14661
76
77
  sglang/srt/models/deepseek.py,sha256=7UJgde1EV9ey6d-CKRcEyTKh1_WhZdatpZiltIuqpik,16006
77
- sglang/srt/models/deepseek_v2.py,sha256=3D9WtPvVOu8U40x_KOksnmWBLmLIcgtV958go8NSj5Q,28307
78
+ sglang/srt/models/deepseek_v2.py,sha256=1J0pt1jZRcBBGYbgt1wGiuxPcrdpfTEUEaGFqju6TVA,28431
78
79
  sglang/srt/models/exaone.py,sha256=3I5ZoiLotf7U-8c9QJRubpgf6JDx9I_z-ViXQlCC-x8,13087
79
80
  sglang/srt/models/gemma.py,sha256=GkwgGFHgGlXgBZN7s7Wooz5tMyCp1YtgLahU2NOo66M,12273
80
81
  sglang/srt/models/gemma2.py,sha256=sFfCNEm0_OOWElRSTDuroRv8wNMX8v_81Uko9m546KA,14923
81
82
  sglang/srt/models/gpt_bigcode.py,sha256=kzHYogeGXZF4KHpkXA-RGqvs016mA-6klWxD2QJTi9E,10195
82
83
  sglang/srt/models/grok.py,sha256=6I4OwQwNyAbh5GF24_SRm12XYBvM9iGWB-T4TSTJ0wU,14929
83
84
  sglang/srt/models/internlm2.py,sha256=6j7JH0p3yib8GZDH8Cmrs-pgwfH3eOlAK6V3Cq64O7w,12202
84
- sglang/srt/models/llama.py,sha256=tjdjlIxJr31vgbzGBP_el9RgYxw1kzvmqnVinnTVVUw,15259
85
- sglang/srt/models/llama_classification.py,sha256=A2ABTUD5u4XoWv1dsIPU7wcCQP3jhbDJblMhLgaiFBA,3402
85
+ sglang/srt/models/llama.py,sha256=nbJwRcG9DnurVNSGLKJjnmBmTXP1_5WZpudth_0PVpw,15216
86
+ sglang/srt/models/llama_classification.py,sha256=HF-69J9qIYdfX0R5wEtIgvafMzprKcXdvF3W_orl_kA,3394
86
87
  sglang/srt/models/llama_embedding.py,sha256=RI2mpYheP5WwhuTINU-6IrU61usuMyCK9h2zDEyLW4g,3458
87
88
  sglang/srt/models/llava.py,sha256=O4XGdl70Hh4tM_OHapFGHbReC82mbe9xLw6GELKWKhU,24881
88
89
  sglang/srt/models/llavavid.py,sha256=ou5uIuskBoBo0lXvqFFfDLBYYVfehx27n-Lu8X9gpLs,11992
89
90
  sglang/srt/models/minicpm.py,sha256=ioqCsTCE_oF8xqGF5fm5cK9dclK5Y0EQ1UJfyteIDDo,13825
90
- sglang/srt/models/minicpm3.py,sha256=S7bNeCAsfvL44Vn350KLaqX674SCb4CpUuDnhjLjr3U,25113
91
+ sglang/srt/models/minicpm3.py,sha256=McPWyy2fQqfHUhi9Nk36rkvvPAS8RmLOY7Vh4ah5c1w,25216
91
92
  sglang/srt/models/mistral.py,sha256=tiYoKjyYVzlQl52QUZ33odD2yCxj9dxcqln474VuZOw,744
92
93
  sglang/srt/models/mixtral.py,sha256=oRC7mKBrPJhvzkWSabrbeQQQac-jtF4EV6H2Sgjc5JY,13897
93
94
  sglang/srt/models/mixtral_quant.py,sha256=wMACJq78OTWj7HlqPDRNEh8cjrVAjKqJEsOG3CO5xow,14072
95
+ sglang/srt/models/olmoe.py,sha256=d0ECpU-IXXwGYg9tkVeMARUbqVcqEnWfpH3rrNiGKA0,15336
94
96
  sglang/srt/models/qwen.py,sha256=nqSRzkiZzpRVG6WGQ1MBUclQnXyw8jlvoOq-euM8j5s,9954
95
97
  sglang/srt/models/qwen2.py,sha256=9_M-VkHN1_T1XN-gsl_L636QMQ9BLF2WqvTcx_1L6aw,12432
96
98
  sglang/srt/models/qwen2_moe.py,sha256=s7b5XnSvsBYtZZUkjPp442m59CqPJ3HxGUIwXBVWsXw,17153
97
99
  sglang/srt/models/stablelm.py,sha256=30ngpc0Xq3VxzXJlf6svP1oax8Q3krMJkxM8PVKtZWU,11359
98
- sglang/srt/models/xverse.py,sha256=luhp_90ZNkTpXHDCURO4MZBy1vbvHTVCwSe4PYYLWBs,13701
99
- sglang/srt/models/xverse_moe.py,sha256=YR--WZ33G7XEMsS7ZJl1cQ62Q8PDo9gWqpvJBY_cb-M,15886
100
+ sglang/srt/models/xverse.py,sha256=L3g32-je_7JmzF2-hztaIVshHYCIv7jOM3oFs-fb2MY,13658
101
+ sglang/srt/models/xverse_moe.py,sha256=CgDD9cR83UVfTsPU6WcbHVYBrkYKv_kTdwncTIx7Q7U,15842
100
102
  sglang/srt/models/yivl.py,sha256=B6MELthWIm5KdSzX3o2tbbpApY8XdjUdmcQSD4dQe_I,4835
101
103
  sglang/srt/openai_api/adapter.py,sha256=CJ47YftRHAip1FMcHIhtCorBtzlIkv7F0Wz_JUcI4T4,51032
102
104
  sglang/srt/openai_api/protocol.py,sha256=rdSwUAoO5-KLemJOE50xwSUagxY4T1QIiNyCYsTtCi0,9868
103
- sglang/srt/sampling/sampling_batch_info.py,sha256=vkwy59Jt51FESYukmwDKwPbCM45WMb16dx_408B3oqc,7900
105
+ sglang/srt/sampling/sampling_batch_info.py,sha256=GewqyxCrW2PFwuzGHaCR59Pvw6j0n2dKGrlJWYQWwW4,6149
104
106
  sglang/srt/sampling/sampling_params.py,sha256=ggOXxafqfCD-xrGYcM57byLZ79CIeBP4AD5F44L_CW0,5635
105
107
  sglang/srt/sampling/penaltylib/__init__.py,sha256=5vQw0Y5DSzmsoFg1IdMIKLwFVhYZ5ArADHVBYbSmOec,513
106
108
  sglang/srt/sampling/penaltylib/orchestrator.py,sha256=WkTNeDhj9H9rtp2ZZeX6MS2sdKSGlLboE6FcuKrwUo0,10815
@@ -108,7 +110,7 @@ sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py,sha256=IvYioX53Vq
108
110
  sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py,sha256=XJZP0C4NFyXgcODbIWXxrgVEjmRgqLdZuVAtoN-LveY,3565
109
111
  sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py,sha256=0PlANTrR959foTA3Nj5qBE7ndaOZgG-9X6LhzlmEUc8,2533
110
112
  sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py,sha256=v9jOgA0-I31WcrhIydiFbpy2ZJPLytFLGM98NRPd2sU,2820
111
- sglang/test/few_shot_gsm8k.py,sha256=uSHEPvUFbAgWKtaqxkhBpQrQV_SlTk0HN9FhjNLpL4g,3731
113
+ sglang/test/few_shot_gsm8k.py,sha256=To7Sdg-DLF8poIQLwiOBYKbkz-1C_gn6H79vIbyPR-o,3860
112
114
  sglang/test/run_eval.py,sha256=NWxeLWmInBgkCvC9Jr_QzF7GfAiBve3Gf1JQrEOlNlU,3899
113
115
  sglang/test/runners.py,sha256=ZoWhT1TDXfLBVdbivXx1KUu9dhPlGjL_xrP18WLzVLo,11404
114
116
  sglang/test/simple_eval_common.py,sha256=r0G-9QLycs2ax3RMc44T_61fzMxlpTzv6pececC7lyY,12379
@@ -120,10 +122,10 @@ sglang/test/simple_eval_mmlu.py,sha256=FkwamjGMjueTixymkedF-YiPloSLiy4ftILFUrKZ9
120
122
  sglang/test/test_activation.py,sha256=jkdNRzJnbd5OgZliQaIXpxovlcky17UrweomcOcMxoE,1442
121
123
  sglang/test/test_layernorm.py,sha256=IacByD5d-stXjzBz8Ypamc7povlcedpKPbb_4JLgo3c,3720
122
124
  sglang/test/test_programs.py,sha256=3-XKnppQdCNWjaJb6jwib5Z9OSpgKvH8SFLJbE4J9qI,17001
123
- sglang/test/test_utils.py,sha256=iBs07MBFxOidipTG1-s2hrCvcURFJVXo7gg10pzAQX8,17168
125
+ sglang/test/test_utils.py,sha256=dsHRd1xLzcjlarxUnDIz2XEHfut7HvqVPwx2Fn7vf10,17179
124
126
  sglang/test/srt/sampling/penaltylib/utils.py,sha256=-0p0rV-P4lNo7xAe3rQSBHTubc50a-DFyOQmLGAkgkQ,12515
125
- sglang-0.3.1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
126
- sglang-0.3.1.dist-info/METADATA,sha256=QKZQ7PjuK22x_QlQy1LqPX6y4zLgJJ9FPoNNSkw3cEk,38125
127
- sglang-0.3.1.dist-info/WHEEL,sha256=cVxcB9AmuTcXqmwrtPhNK88dr7IR_b6qagTj0UvIEbY,91
128
- sglang-0.3.1.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
129
- sglang-0.3.1.dist-info/RECORD,,
127
+ sglang-0.3.1.post2.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
128
+ sglang-0.3.1.post2.dist-info/METADATA,sha256=WxMy8Ur_rjPxqVOoWSFoM3eBHWt0cKGyrtwOUfWL-Vc,38114
129
+ sglang-0.3.1.post2.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
130
+ sglang-0.3.1.post2.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
131
+ sglang-0.3.1.post2.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (74.1.2)
2
+ Generator: setuptools (75.1.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5