sglang 0.3.1.post1__py3-none-any.whl → 0.3.1.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. sglang/bench_latency.py +11 -2
  2. sglang/bench_server_latency.py +187 -0
  3. sglang/bench_serving.py +1 -1
  4. sglang/srt/layers/activation.py +8 -4
  5. sglang/srt/layers/attention_backend.py +3 -1
  6. sglang/srt/layers/layernorm.py +10 -7
  7. sglang/srt/layers/linear.py +1133 -0
  8. sglang/srt/layers/quantization/__init__.py +76 -0
  9. sglang/srt/layers/quantization/base_config.py +122 -0
  10. sglang/srt/layers/sampler.py +9 -2
  11. sglang/srt/managers/io_struct.py +3 -0
  12. sglang/srt/managers/policy_scheduler.py +49 -93
  13. sglang/srt/managers/schedule_batch.py +1 -1
  14. sglang/srt/managers/tp_worker.py +11 -6
  15. sglang/srt/model_executor/cuda_graph_runner.py +15 -14
  16. sglang/srt/model_executor/model_runner.py +13 -5
  17. sglang/srt/models/baichuan.py +1 -1
  18. sglang/srt/models/chatglm.py +6 -6
  19. sglang/srt/models/commandr.py +7 -7
  20. sglang/srt/models/dbrx.py +7 -7
  21. sglang/srt/models/deepseek.py +7 -7
  22. sglang/srt/models/deepseek_v2.py +9 -9
  23. sglang/srt/models/exaone.py +6 -6
  24. sglang/srt/models/gemma.py +6 -6
  25. sglang/srt/models/gemma2.py +6 -6
  26. sglang/srt/models/gpt_bigcode.py +6 -6
  27. sglang/srt/models/grok.py +6 -6
  28. sglang/srt/models/internlm2.py +6 -6
  29. sglang/srt/models/llama.py +7 -9
  30. sglang/srt/models/llama_classification.py +3 -4
  31. sglang/srt/models/llava.py +1 -1
  32. sglang/srt/models/llavavid.py +1 -1
  33. sglang/srt/models/minicpm.py +6 -6
  34. sglang/srt/models/minicpm3.py +3 -3
  35. sglang/srt/models/mixtral.py +6 -6
  36. sglang/srt/models/mixtral_quant.py +6 -6
  37. sglang/srt/models/olmoe.py +1 -1
  38. sglang/srt/models/qwen.py +6 -6
  39. sglang/srt/models/qwen2.py +6 -6
  40. sglang/srt/models/qwen2_moe.py +7 -7
  41. sglang/srt/models/stablelm.py +6 -6
  42. sglang/srt/models/xverse.py +2 -4
  43. sglang/srt/models/xverse_moe.py +2 -5
  44. sglang/srt/models/yivl.py +1 -1
  45. sglang/srt/server_args.py +17 -21
  46. sglang/srt/utils.py +21 -1
  47. sglang/test/few_shot_gsm8k.py +8 -2
  48. sglang/test/test_utils.py +5 -2
  49. sglang/version.py +1 -1
  50. {sglang-0.3.1.post1.dist-info → sglang-0.3.1.post3.dist-info}/METADATA +5 -5
  51. {sglang-0.3.1.post1.dist-info → sglang-0.3.1.post3.dist-info}/RECORD +54 -50
  52. {sglang-0.3.1.post1.dist-info → sglang-0.3.1.post3.dist-info}/LICENSE +0 -0
  53. {sglang-0.3.1.post1.dist-info → sglang-0.3.1.post3.dist-info}/WHEEL +0 -0
  54. {sglang-0.3.1.post1.dist-info → sglang-0.3.1.post3.dist-info}/top_level.txt +0 -0
@@ -34,7 +34,6 @@ from vllm.model_executor.layers.linear import (
34
34
  ReplicatedLinear,
35
35
  RowParallelLinear,
36
36
  )
37
- from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
38
37
  from vllm.model_executor.layers.rotary_embedding import get_rope
39
38
  from vllm.model_executor.layers.vocab_parallel_embedding import (
40
39
  ParallelLMHead,
@@ -43,6 +42,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
43
42
  from vllm.model_executor.model_loader.weight_utils import default_weight_loader
44
43
 
45
44
  from sglang.srt.layers.logits_processor import LogitsProcessor
45
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
46
46
  from sglang.srt.layers.radix_attention import RadixAttention
47
47
  from sglang.srt.model_executor.forward_batch_info import InputMetadata
48
48
 
@@ -383,8 +383,6 @@ class XverseMoeForCausalLM(nn.Module):
383
383
  )
384
384
  self.logits_processor = LogitsProcessor(config)
385
385
 
386
- self.param_dict = dict(self.named_parameters())
387
-
388
386
  @torch.no_grad()
389
387
  def forward(
390
388
  self,
@@ -406,8 +404,7 @@ class XverseMoeForCausalLM(nn.Module):
406
404
  ("gate_up_proj", "gate_proj", 0),
407
405
  ("gate_up_proj", "up_proj", 1),
408
406
  ]
409
-
410
- params_dict = self.param_dict
407
+ params_dict = dict(self.named_parameters())
411
408
 
412
409
  for name, loaded_weight in weights:
413
410
  if "rotary_emb.inv_freq" in name:
sglang/srt/models/yivl.py CHANGED
@@ -21,9 +21,9 @@ import torch
21
21
  import torch.nn as nn
22
22
  from transformers import CLIPVisionModel, LlavaConfig
23
23
  from vllm.config import CacheConfig
24
- from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
25
24
  from vllm.model_executor.model_loader.weight_utils import default_weight_loader
26
25
 
26
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
27
27
  from sglang.srt.models.llava import LlavaLlamaForCausalLM
28
28
 
29
29
 
sglang/srt/server_args.py CHANGED
@@ -26,17 +26,6 @@ from sglang.srt.utils import is_hip
26
26
  logger = logging.getLogger(__name__)
27
27
 
28
28
 
29
- class LoRAPathAction(argparse.Action):
30
- def __call__(self, parser, namespace, values, option_string=None):
31
- setattr(namespace, self.dest, {})
32
- for lora_path in values:
33
- if "=" in lora_path:
34
- name, path = lora_path.split("=", 1)
35
- getattr(namespace, self.dest)[name] = path
36
- else:
37
- getattr(namespace, self.dest)[lora_path] = lora_path
38
-
39
-
40
29
  @dataclasses.dataclass
41
30
  class ServerArgs:
42
31
  # Model and tokenizer
@@ -108,12 +97,12 @@ class ServerArgs:
108
97
  disable_cuda_graph_padding: bool = False
109
98
  disable_disk_cache: bool = False
110
99
  disable_custom_all_reduce: bool = False
100
+ disable_mla: bool = False
111
101
  enable_mixed_chunk: bool = False
112
102
  enable_torch_compile: bool = False
113
103
  max_torch_compile_bs: int = 32
114
104
  torchao_config: str = ""
115
105
  enable_p2p_check: bool = False
116
- enable_mla: bool = False
117
106
  triton_attention_reduce_in_fp32: bool = False
118
107
 
119
108
  # LoRA
@@ -173,10 +162,6 @@ class ServerArgs:
173
162
  self.sampling_backend = "pytorch"
174
163
 
175
164
  # Default kernel backends
176
- if self.enable_mla:
177
- logger.info("MLA optimization is tunred on. Use triton backend.")
178
- self.attention_backend = "triton"
179
-
180
165
  if self.attention_backend is None:
181
166
  self.attention_backend = "flashinfer"
182
167
 
@@ -514,6 +499,11 @@ class ServerArgs:
514
499
  default=False,
515
500
  help="Disable the custom all-reduce kernel and fall back to NCCL.",
516
501
  )
502
+ parser.add_argument(
503
+ "--disable-mla",
504
+ action="store_true",
505
+ help="Disable Multi-head Latent Attention (MLA) for DeepSeek-V2.",
506
+ )
517
507
  parser.add_argument(
518
508
  "--enable-mixed-chunk",
519
509
  action="store_true",
@@ -541,11 +531,6 @@ class ServerArgs:
541
531
  action="store_true",
542
532
  help="Enable P2P check for GPU access, otherwise the p2p access is allowed by default.",
543
533
  )
544
- parser.add_argument(
545
- "--enable-mla",
546
- action="store_true",
547
- help="Enable Multi-head Latent Attention (MLA) for DeepSeek-V2.",
548
- )
549
534
  parser.add_argument(
550
535
  "--triton-attention-reduce-in-fp32",
551
536
  action="store_true",
@@ -623,3 +608,14 @@ class PortArgs:
623
608
  controller_port: int
624
609
  detokenizer_port: int
625
610
  nccl_ports: List[int]
611
+
612
+
613
+ class LoRAPathAction(argparse.Action):
614
+ def __call__(self, parser, namespace, values, option_string=None):
615
+ setattr(namespace, self.dest, {})
616
+ for lora_path in values:
617
+ if "=" in lora_path:
618
+ name, path = lora_path.split("=", 1)
619
+ getattr(namespace, self.dest)[name] = path
620
+ else:
621
+ getattr(namespace, self.dest)[lora_path] = lora_path
sglang/srt/utils.py CHANGED
@@ -26,7 +26,7 @@ import struct
26
26
  import time
27
27
  from importlib.metadata import PackageNotFoundError, version
28
28
  from io import BytesIO
29
- from typing import List, Optional, Union
29
+ from typing import Any, Dict, List, Optional, Union
30
30
 
31
31
  import numpy as np
32
32
  import psutil
@@ -682,3 +682,23 @@ def replace_submodule(
682
682
  target_name = module_name.split(".")[-1]
683
683
  setattr(parent, target_name, new_module)
684
684
  return new_module
685
+
686
+
687
+ def set_weight_attrs(
688
+ weight: torch.Tensor,
689
+ weight_attrs: Optional[Dict[str, Any]],
690
+ ):
691
+ """Set attributes on a weight tensor.
692
+
693
+ This method is used to set attributes on a weight tensor. This method
694
+ will not overwrite existing attributes.
695
+
696
+ Args:
697
+ weight: The weight tensor.
698
+ weight_attrs: A dictionary of attributes to set on the weight tensor.
699
+ """
700
+ if weight_attrs is None:
701
+ return
702
+ for key, value in weight_attrs.items():
703
+ assert not hasattr(weight, key), f"Overwriting existing tensor attribute: {key}"
704
+ setattr(weight, key, value)
@@ -44,7 +44,7 @@ def get_answer_value(answer_str):
44
44
  return INVALID
45
45
 
46
46
 
47
- def main(args):
47
+ def run_eval(args):
48
48
  # Select backend
49
49
  set_default_backend(RuntimeEndpoint(f"{args.host}:{args.port}"))
50
50
 
@@ -119,6 +119,12 @@ def main(args):
119
119
  # Dump results
120
120
  dump_state_text("tmp_output_gsm8k.txt", states)
121
121
 
122
+ return {
123
+ "accuracy": acc,
124
+ "latency": latency,
125
+ "output_throughput": output_throughput,
126
+ }
127
+
122
128
 
123
129
  if __name__ == "__main__":
124
130
  parser = argparse.ArgumentParser()
@@ -129,4 +135,4 @@ if __name__ == "__main__":
129
135
  parser.add_argument("--host", type=str, default="http://127.0.0.1")
130
136
  parser.add_argument("--port", type=int, default=30000)
131
137
  args = parser.parse_args()
132
- main(args)
138
+ run_eval(args)
sglang/test/test_utils.py CHANGED
@@ -22,13 +22,16 @@ from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
22
22
  from sglang.srt.utils import kill_child_process
23
23
  from sglang.utils import get_exception_traceback
24
24
 
25
+ DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-FP8"
25
26
  DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
26
27
  DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
28
+ DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
27
29
  DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600
28
30
  DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Meta-Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
29
- DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Meta-Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct"
31
+ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Meta-Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
30
32
  DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8"
31
- DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8"
33
+ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
34
+ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"
32
35
 
33
36
 
34
37
  def is_in_ci():
sglang/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.3.1.post1"
1
+ __version__ = "0.3.1.post3"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sglang
3
- Version: 0.3.1.post1
3
+ Version: 0.3.1.post3
4
4
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -269,7 +269,7 @@ Requires-Dist: peft; extra == "test"
269
269
 
270
270
  --------------------------------------------------------------------------------
271
271
 
272
- | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) |
272
+ | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Weekly Development Meeting**](https://calendar.app.google/v2Tw3kuHkKYyp8VV7) |
273
273
 
274
274
  SGLang is a fast serving framework for large language models and vision language models.
275
275
  It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
@@ -278,7 +278,7 @@ The core features include:
278
278
  - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
279
279
  - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
280
280
  - **Extensive Model Support**: Supports a wide range of generative models (Llama 3, Gemma 2, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
281
- - **Active Community**: SGLang is open-source and backed by an active community with industry adoption, welcoming contributions to improve LLM and VLM serving.
281
+ - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
282
282
 
283
283
  ## News
284
284
  - [2024/09] 🔥 SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
@@ -318,7 +318,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
318
318
  ### Method 2: From source
319
319
  ```
320
320
  # Use the last release branch
321
- git clone -b v0.3.1.post1 https://github.com/sgl-project/sglang.git
321
+ git clone -b v0.3.1.post3 https://github.com/sgl-project/sglang.git
322
322
  cd sglang
323
323
 
324
324
  pip install --upgrade pip
@@ -483,7 +483,6 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
483
483
  - To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes.
484
484
  - To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
485
485
  - To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
486
- - To enable DeepSeek MLA acceleration, add `--enable-mla`.
487
486
  - If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
488
487
  - To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
489
488
  ```
@@ -500,6 +499,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
500
499
  - Llama / Llama 2 / Llama 3 / Llama 3.1
501
500
  - Mistral / Mixtral / Mistral NeMo
502
501
  - Gemma / Gemma 2
502
+ - OLMoE
503
503
  - Qwen / Qwen 2 / Qwen 2 MoE
504
504
  - DeepSeek / DeepSeek 2
505
505
  - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
@@ -1,13 +1,14 @@
1
1
  sglang/__init__.py,sha256=T8MYdFfKFPZcgFKHMBpOCIlFbhjwmr77Nqm6mdE6bCY,1590
2
2
  sglang/api.py,sha256=pH4CjwOXUweL5MF1sIkFMddDxfnF7PyUxEHC5kvNVbI,6468
3
- sglang/bench_latency.py,sha256=CDMrch4QwIyb2DTH2kBIgQ6Q8sGHwtrx3Cz49qZNfpU,17078
4
- sglang/bench_serving.py,sha256=6OM5JIDuoxJDg-VLE4ijGGcS8-6ViaidV05lIrZmSzo,36239
3
+ sglang/bench_latency.py,sha256=lyA_AwlhDbLMrH9Ca5_X3NUYQdwbHn_vpNbMyvqOZic,17342
4
+ sglang/bench_server_latency.py,sha256=KvFJgKQTSons7KOG0CBqnnOOx1gW29bBM1Z3GQO_6-E,5599
5
+ sglang/bench_serving.py,sha256=3gIJ1O2x51Fwd4wYJjgwluTbWKXL-azckQte7YC5zIc,36261
5
6
  sglang/check_env.py,sha256=rGRABCgt-0SfUrow4px28b2P59aMn8eVTnN5eZc_a8s,5397
6
7
  sglang/global_config.py,sha256=38id86i3tRGCSOFZlN1LM01a3xt-V98xuNgKGG9boCk,1058
7
8
  sglang/launch_server.py,sha256=UnjNjYuZ8TtvmRtgYEsFImkbvCwvn_tQjk0V7cHy67E,450
8
9
  sglang/launch_server_llavavid.py,sha256=olPKyhozi1coCwoRMwBRYWsTFByrgus9CwPSeNmskgc,1002
9
10
  sglang/utils.py,sha256=NA_4xUrTI7KICQ3PEACfNWKE3nxSA5QvQZJNd4TQrDc,9395
10
- sglang/version.py,sha256=83xK6WSmRR5ba-i5fDLUmoJT83Eg_dpsWgwcnsUhMpA,28
11
+ sglang/version.py,sha256=vtapUd7gvia5JFNpZOX5Q2A4TqgNWABeKFK66x_VeZU,28
11
12
  sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
13
  sglang/lang/chat_template.py,sha256=uqI_I9zIKXGXg7-W-yjqvx1ZeS_TuwFCms6wkmC2QmY,13411
13
14
  sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
@@ -26,8 +27,8 @@ sglang/srt/conversation.py,sha256=S5w5V6G1xigNxa3UQoSxRcMpQLWWDT9EPBoHBvHkSAk,19
26
27
  sglang/srt/hf_transformers_utils.py,sha256=6HlqcmGPIvnSGaEEICeuzwag1QylSoSGbXRVvUdIMDo,6016
27
28
  sglang/srt/mm_utils.py,sha256=zox644S3IHUWmADdK4MnIbdTS2DWHOy0_Dq0gCU38QQ,12273
28
29
  sglang/srt/server.py,sha256=n4QRn36_t-HAH-lSME3tiZSCUGRQwqMUckgs0paHq5g,20179
29
- sglang/srt/server_args.py,sha256=M1Bm9u2JRsEptne-kw-D-B_29Q-M6V4UpAM7K-JxXAc,23309
30
- sglang/srt/utils.py,sha256=8yxiMRttCcfswynkNPWD3yZFNAGFz2P1PzSuxHCBGns,22340
30
+ sglang/srt/server_args.py,sha256=3XjDt6SSjTfbOe0HSXA--2aUvrpWSnQmAHYwmeS1-M0,23159
31
+ sglang/srt/utils.py,sha256=Vly46zMM_rz__DaU15vbidYtS0Gh2s7TnAMj4WLyAO4,22954
31
32
  sglang/srt/configs/__init__.py,sha256=292SuEorST-lAq2Uvsv2M7yC28uYZlssVvRDsF-bZCQ,86
32
33
  sglang/srt/configs/exaone.py,sha256=Duxd4yQoKy8GWEzZD_kCY_OzmN_67CTJL_Kgn0eXk3g,10731
33
34
  sglang/srt/configs/model_config.py,sha256=OqHrucJQHbH-wxgkGj-Dcx_B888uUGASpLRjz40HaLY,6651
@@ -35,18 +36,21 @@ sglang/srt/constrained/__init__.py,sha256=ze8awDPvwAzdeMwzJ-25kXOQ4nVWoaP55jBDt5
35
36
  sglang/srt/constrained/base_tool_cache.py,sha256=5sazBMHHDpHMoqOjuY6itCxwTmIFCflIWEDXMtmrPVs,2006
36
37
  sglang/srt/constrained/fsm_cache.py,sha256=k7DRUAaiLTEX5UarfJ17gEYQ-QWQAGfvykjYFkM_Y2U,2982
37
38
  sglang/srt/constrained/jump_forward.py,sha256=9_HxmXtWjr5S6a5e0cBimbY3ZhiLiJC74V6jIqDXfuo,6575
38
- sglang/srt/layers/activation.py,sha256=awcwOODYcVdUtC2JxJ1TGsV8Tru0eACKcxYN6cWHbl4,5148
39
- sglang/srt/layers/attention_backend.py,sha256=lqMsY4VaOO_szIWoTAinXf1DnP2UsbF32kzvwFySz9w,18119
39
+ sglang/srt/layers/activation.py,sha256=tRWHxIjcIopkOremkb5Jy5O0rgdB1PAhHfIEONfyj6Y,5166
40
+ sglang/srt/layers/attention_backend.py,sha256=TMxsN1HwgqAURD1i77c-TN-3Xy53H9Kbg6HgpRHHoj0,18167
40
41
  sglang/srt/layers/flashinfer_utils.py,sha256=jyaO7XiEisFZg_dfaCbfRCHSHSKYoM1wOzfHa0h1q14,7413
41
- sglang/srt/layers/layernorm.py,sha256=-9Yph4nnMZYX_Q31MUGAimLajNclHXjgDkswpU2BTos,3694
42
+ sglang/srt/layers/layernorm.py,sha256=p_7bnmSpJ_slpoP0Gk5wQPpHtLllUu3imSIRBqGqTP0,3737
43
+ sglang/srt/layers/linear.py,sha256=9rjCiSb_QOn5RgpVjIhEKdReRvSYVfcTSjbWBEbApLI,45173
42
44
  sglang/srt/layers/logits_processor.py,sha256=Js2qSk1Z3uPL2cYO1ARai51f2i8OedV3qdwByQVSJtI,12439
43
45
  sglang/srt/layers/pooler.py,sha256=qNMG3Ycvt2yf9mk1Lcs-2K7oPeCuVeDYoHAxkMu9b_Q,1610
44
46
  sglang/srt/layers/radix_attention.py,sha256=EcVO0fUSmgvE_9R-MlpgJq0O_uT8ACuHzbMi19bANYc,1874
45
- sglang/srt/layers/sampler.py,sha256=Q4u46oYu66e34rBNzr50VoXO8FM-assYiCoROolq3Zs,3661
47
+ sglang/srt/layers/sampler.py,sha256=Y0o1bndTGRD713fHMbN5-LRUiyneBkb7bH_QlkkeqSs,3836
46
48
  sglang/srt/layers/torchao_utils.py,sha256=rTECwKSXhj_ylh_iSzfbopz9_lZOFHatquQrNJNLZlE,2703
47
49
  sglang/srt/layers/fused_moe/__init__.py,sha256=bWCrDdOy2ANEXTb8CHYO63O3Iu3eZnn0PJbgl0z5vvE,75
48
50
  sglang/srt/layers/fused_moe/fused_moe.py,sha256=1WM2cObWXcFWtqh_utGJFPnrT344rORwuQ9hJDaH2s0,23104
49
51
  sglang/srt/layers/fused_moe/layer.py,sha256=raFyvPzjYz-Fv8B3IcOxQYKKCWqXis5mXwg1GFE61y4,22243
52
+ sglang/srt/layers/quantization/__init__.py,sha256=wl9mIOeA6mtKIaW1LWUJABWPdqOb-2uZ-kSijWoxLtU,3095
53
+ sglang/srt/layers/quantization/base_config.py,sha256=vlpSPvSrFmUe65ETg4SoPocQ9bVNY6As3QuHdr_3Dr4,4023
50
54
  sglang/srt/layers/triton_attention/decode_attention.py,sha256=XCQTX0kUttT1AG5FRMgfQbiXgvoempYD0UR2r6D_vJg,16711
51
55
  sglang/srt/layers/triton_attention/extend_attention.py,sha256=XTUTMrE-5jfMEufQUifZ-8NJQABSPcF47qhnNT5Z1iI,11050
52
56
  sglang/srt/layers/triton_attention/prefill_attention.py,sha256=QkXPcT02c13zha2M4mBm2S5dh_sS-Gc4FkkrcywRqvc,5377
@@ -56,49 +60,49 @@ sglang/srt/lora/lora_manager.py,sha256=7J7cGmyy1Ph4HCvLdM-ViAizAbV1snZqD-S7JLWXa
56
60
  sglang/srt/managers/controller_multi.py,sha256=KolZDso2WqH1ZhQw9p1eTmlFRgo4bcvzBxE44_sNE_o,6300
57
61
  sglang/srt/managers/controller_single.py,sha256=DiZALP_iIPZQMRx09a-LwT5_Dg7p-WU8HXyMoxJ9sRA,4955
58
62
  sglang/srt/managers/detokenizer_manager.py,sha256=yQkL5gLomLiy1qc6e9HNz8hcj7JQFHm1AfIrzpXaWJE,6852
59
- sglang/srt/managers/io_struct.py,sha256=bqmL3NDPLqOn6Au3WLF0NOe8Dh7ECMN7BTHCkEZ_Edk,11247
60
- sglang/srt/managers/policy_scheduler.py,sha256=tiBUi2GJU5eQEBK6HfsO1_YjWtFkougo40954DIp4dM,13026
61
- sglang/srt/managers/schedule_batch.py,sha256=ppHYK65GP0dtuCEzpSbGm9uAne5rEoRmW8osLknXJpI,27384
63
+ sglang/srt/managers/io_struct.py,sha256=yNV5BmeUzLPqv19j79kXQ50Iaqdk4vP-_TciiRf4OEE,11396
64
+ sglang/srt/managers/policy_scheduler.py,sha256=PVo0DV0-5ODNN7FkPkeF1Y8BQ6uuLldPETOlB_YvvL4,11560
65
+ sglang/srt/managers/schedule_batch.py,sha256=ns2qkaYAvzul-LCV1BEB6q1t5jKyftNsReMv62PC8M0,27386
62
66
  sglang/srt/managers/tokenizer_manager.py,sha256=ql-sObjl1oRigJwnLtqqTaaw-i7gPTDMoNXDEMftr40,29643
63
- sglang/srt/managers/tp_worker.py,sha256=4Hhla9rfGYEdQtzGmxlIEqxt_WVkn2dkLLNQZHgpkf0,39270
67
+ sglang/srt/managers/tp_worker.py,sha256=0Y0k-roDrBxWZxD0axv5CCvUUW8vsJ8n78TANHLzEFs,39503
64
68
  sglang/srt/mem_cache/base_prefix_cache.py,sha256=qEQwEkG4E5rab2ZoTqcesf5pR_J4nV2jBxIHsBJHtIM,924
65
69
  sglang/srt/mem_cache/chunk_cache.py,sha256=CjZZYlqQzq7mYOiBMLWA5XNb6HIyh5lIMdY-K0OUZEc,2368
66
70
  sglang/srt/mem_cache/flush_cache.py,sha256=pTLKPRB17U6vl5RFJJvuJ4jCL2SyomgkUBNlkDpGRqo,978
67
71
  sglang/srt/mem_cache/memory_pool.py,sha256=4br3Ea2bfA-YsF_sPOVHlF2zQzYGd8fVaYTp197yZsE,7871
68
72
  sglang/srt/mem_cache/radix_cache.py,sha256=0AVr1BKKDOtTyybUkwxrz6PT8khDx-DpzgN5MgL27IE,10088
69
- sglang/srt/model_executor/cuda_graph_runner.py,sha256=ZeO-8Mg4Tf0iP-L9FXcyhHfNzGWpTPEDGeUoC2lzHTE,10418
73
+ sglang/srt/model_executor/cuda_graph_runner.py,sha256=gZ0Wukqz6u67MMIj4MC8JET9jcHdh0rotYzpuPlHruY,10512
70
74
  sglang/srt/model_executor/forward_batch_info.py,sha256=yvkhayY9Zu6gysoojcGT73lADGOtfHKkFKWdJLRyACI,6141
71
- sglang/srt/model_executor/model_runner.py,sha256=LoQ7OFVwOiK_BfdpRfitss1TfJ8qrysHgWM-xXu7n2Y,22433
72
- sglang/srt/models/baichuan.py,sha256=NrG1rMJXhemkrUCEf8xKOSDQVsOD-nN8RQz6MWHOg84,15124
73
- sglang/srt/models/chatglm.py,sha256=KwxLHBEvK02McXDvBS0gnRxfIvOAu2QP7lgibrj9Nbc,13371
74
- sglang/srt/models/commandr.py,sha256=2rAXRZRb4PkJZ4NWEqP_rIgsjxbdZyHpuoMOarqTWzQ,14163
75
- sglang/srt/models/dbrx.py,sha256=N_0Ku_p1NCsc29NktUBNqPv7Z33XhYxOZK5xN7nzW4s,14661
76
- sglang/srt/models/deepseek.py,sha256=7UJgde1EV9ey6d-CKRcEyTKh1_WhZdatpZiltIuqpik,16006
77
- sglang/srt/models/deepseek_v2.py,sha256=bPaGRL8ieBCXKIf-KY7-D9Rus7Qj3VGvvtERzAXAZWs,28421
78
- sglang/srt/models/exaone.py,sha256=3I5ZoiLotf7U-8c9QJRubpgf6JDx9I_z-ViXQlCC-x8,13087
79
- sglang/srt/models/gemma.py,sha256=GkwgGFHgGlXgBZN7s7Wooz5tMyCp1YtgLahU2NOo66M,12273
80
- sglang/srt/models/gemma2.py,sha256=sFfCNEm0_OOWElRSTDuroRv8wNMX8v_81Uko9m546KA,14923
81
- sglang/srt/models/gpt_bigcode.py,sha256=kzHYogeGXZF4KHpkXA-RGqvs016mA-6klWxD2QJTi9E,10195
82
- sglang/srt/models/grok.py,sha256=6I4OwQwNyAbh5GF24_SRm12XYBvM9iGWB-T4TSTJ0wU,14929
83
- sglang/srt/models/internlm2.py,sha256=6j7JH0p3yib8GZDH8Cmrs-pgwfH3eOlAK6V3Cq64O7w,12202
84
- sglang/srt/models/llama.py,sha256=tjdjlIxJr31vgbzGBP_el9RgYxw1kzvmqnVinnTVVUw,15259
85
- sglang/srt/models/llama_classification.py,sha256=A2ABTUD5u4XoWv1dsIPU7wcCQP3jhbDJblMhLgaiFBA,3402
75
+ sglang/srt/model_executor/model_runner.py,sha256=X7AG1k9AI_kqS8q1i5Bfv-kFysIdqJAVWMGGZoAPThY,22726
76
+ sglang/srt/models/baichuan.py,sha256=d2PFmyLBXjzS7X7FL9uz139_CpBPb5WYhzcHgF--gRE,15115
77
+ sglang/srt/models/chatglm.py,sha256=chDkgLTRU3bPxTUilhW_FGnsUWj_2fkvulCi9pdDxBY,13353
78
+ sglang/srt/models/commandr.py,sha256=FspSRkMRAXUjD3xzAkxkMiGiRg91czn9T5bagrf3l9M,14136
79
+ sglang/srt/models/dbrx.py,sha256=UmpbTCuf8rYe2Grut7YUPU1gEwsDhgNIs8vW4DNiaf0,14634
80
+ sglang/srt/models/deepseek.py,sha256=TWwfwKYvZZyu2UbimvimeyU_7u7HyIYZlRdlPtOCTfo,15988
81
+ sglang/srt/models/deepseek_v2.py,sha256=36iH4HrObMasOY801Tacub_40BR_0ImdqdKcJ6nHOD8,28413
82
+ sglang/srt/models/exaone.py,sha256=0OTgeAzyi_xvoQTx4TwYkCxRq8sMa-4EYL0_KJRmiAU,13069
83
+ sglang/srt/models/gemma.py,sha256=qo-4F602DKuv33zp4i4dayteFoVhnTYgVbFWKYms5Og,12255
84
+ sglang/srt/models/gemma2.py,sha256=8wGqNQPaPjuTtgHiKsUP4nowOukPvXwRywD4lkAW9Dg,14905
85
+ sglang/srt/models/gpt_bigcode.py,sha256=k_pZa4Sg5GEsr4ln0kjP765moGUPNs5a6iANPjE2W8U,10177
86
+ sglang/srt/models/grok.py,sha256=71Zx-4Q3wggNMtRYlXuPMA-auK-sHBYukI1Usn8LVrE,14911
87
+ sglang/srt/models/internlm2.py,sha256=nEr6MSHFkTjPLvWl1jQQdGFO7iOHex6YtE-I4rYuLao,12184
88
+ sglang/srt/models/llama.py,sha256=bdIt9IfZBgsg6CoZT3lvB-dqXhfxempdRHLkY3Su_VU,15198
89
+ sglang/srt/models/llama_classification.py,sha256=UpwYsgNVS1065t7Yjmi2XGbk9Or8bq2cF82zH1Yx2Mg,3385
86
90
  sglang/srt/models/llama_embedding.py,sha256=RI2mpYheP5WwhuTINU-6IrU61usuMyCK9h2zDEyLW4g,3458
87
- sglang/srt/models/llava.py,sha256=O4XGdl70Hh4tM_OHapFGHbReC82mbe9xLw6GELKWKhU,24881
88
- sglang/srt/models/llavavid.py,sha256=ou5uIuskBoBo0lXvqFFfDLBYYVfehx27n-Lu8X9gpLs,11992
89
- sglang/srt/models/minicpm.py,sha256=ioqCsTCE_oF8xqGF5fm5cK9dclK5Y0EQ1UJfyteIDDo,13825
90
- sglang/srt/models/minicpm3.py,sha256=_C96kO3qGK0KRctXZf8LBR9s0sEW0QXWSGU0Vf6OrI8,25206
91
+ sglang/srt/models/llava.py,sha256=1MG1JDDQb7xc67BSimDo98Gmvza6PmrHQHmKybsDui4,24872
92
+ sglang/srt/models/llavavid.py,sha256=RqOUFROt-gqTlFYqnySAVBXJO9g-NMU2yke-AW5cV6o,11983
93
+ sglang/srt/models/minicpm.py,sha256=Xvy99mkfwzRZCLOe3BhfmNSuJyDhGjjAJq0YOpepu_Q,13807
94
+ sglang/srt/models/minicpm3.py,sha256=yuiwWNfJeWvfUgwkbEfpuc9_uPB6odqBCbdYj8t9aDQ,25207
91
95
  sglang/srt/models/mistral.py,sha256=tiYoKjyYVzlQl52QUZ33odD2yCxj9dxcqln474VuZOw,744
92
- sglang/srt/models/mixtral.py,sha256=oRC7mKBrPJhvzkWSabrbeQQQac-jtF4EV6H2Sgjc5JY,13897
93
- sglang/srt/models/mixtral_quant.py,sha256=wMACJq78OTWj7HlqPDRNEh8cjrVAjKqJEsOG3CO5xow,14072
94
- sglang/srt/models/olmoe.py,sha256=d0ECpU-IXXwGYg9tkVeMARUbqVcqEnWfpH3rrNiGKA0,15336
95
- sglang/srt/models/qwen.py,sha256=nqSRzkiZzpRVG6WGQ1MBUclQnXyw8jlvoOq-euM8j5s,9954
96
- sglang/srt/models/qwen2.py,sha256=9_M-VkHN1_T1XN-gsl_L636QMQ9BLF2WqvTcx_1L6aw,12432
97
- sglang/srt/models/qwen2_moe.py,sha256=s7b5XnSvsBYtZZUkjPp442m59CqPJ3HxGUIwXBVWsXw,17153
98
- sglang/srt/models/stablelm.py,sha256=30ngpc0Xq3VxzXJlf6svP1oax8Q3krMJkxM8PVKtZWU,11359
99
- sglang/srt/models/xverse.py,sha256=luhp_90ZNkTpXHDCURO4MZBy1vbvHTVCwSe4PYYLWBs,13701
100
- sglang/srt/models/xverse_moe.py,sha256=YR--WZ33G7XEMsS7ZJl1cQ62Q8PDo9gWqpvJBY_cb-M,15886
101
- sglang/srt/models/yivl.py,sha256=B6MELthWIm5KdSzX3o2tbbpApY8XdjUdmcQSD4dQe_I,4835
96
+ sglang/srt/models/mixtral.py,sha256=QzWIhjk8gW9DquTvgQsWK3VK0ccdTMT0hCDDHI03KPI,13879
97
+ sglang/srt/models/mixtral_quant.py,sha256=e2x1AykUSVRqEVw6Pg7uKW1Uj8xyn4jZSfLJL4Kl5o8,14054
98
+ sglang/srt/models/olmoe.py,sha256=hGh2IlCg9kr1WIeGyRWwNpa1CfyZH163vq7eSx5d598,15327
99
+ sglang/srt/models/qwen.py,sha256=Vs6f8Jn1TswEzgiPS0G9qxeDU_DdC60JnhDeRDTH3FQ,9936
100
+ sglang/srt/models/qwen2.py,sha256=pamZrETUcaXbWN4tVTjObFPNjqaMu49-8g267NzxkFI,12414
101
+ sglang/srt/models/qwen2_moe.py,sha256=2BFsp1oPs7o_3uc8xvIGfGRNNU2TKkmKZY9P1qtgtlQ,17135
102
+ sglang/srt/models/stablelm.py,sha256=v67JM1SHb-LinrsX598WMsLVeyzjoKquW6G5G30X5fQ,11341
103
+ sglang/srt/models/xverse.py,sha256=VThXXKg3DzepcEP1JHcqSyhRBvq6yL14oh4uj5TJOEM,13649
104
+ sglang/srt/models/xverse_moe.py,sha256=BqmV-uk9ipp4nrj6-lnFfvkwUcuKmV7yfGAYB6Ob-UQ,15833
105
+ sglang/srt/models/yivl.py,sha256=N3noJ5M-FiZS-E_zfaJs4prQOu_ineRt11MWloYgOR8,4826
102
106
  sglang/srt/openai_api/adapter.py,sha256=CJ47YftRHAip1FMcHIhtCorBtzlIkv7F0Wz_JUcI4T4,51032
103
107
  sglang/srt/openai_api/protocol.py,sha256=rdSwUAoO5-KLemJOE50xwSUagxY4T1QIiNyCYsTtCi0,9868
104
108
  sglang/srt/sampling/sampling_batch_info.py,sha256=GewqyxCrW2PFwuzGHaCR59Pvw6j0n2dKGrlJWYQWwW4,6149
@@ -109,7 +113,7 @@ sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py,sha256=IvYioX53Vq
109
113
  sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py,sha256=XJZP0C4NFyXgcODbIWXxrgVEjmRgqLdZuVAtoN-LveY,3565
110
114
  sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py,sha256=0PlANTrR959foTA3Nj5qBE7ndaOZgG-9X6LhzlmEUc8,2533
111
115
  sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py,sha256=v9jOgA0-I31WcrhIydiFbpy2ZJPLytFLGM98NRPd2sU,2820
112
- sglang/test/few_shot_gsm8k.py,sha256=uSHEPvUFbAgWKtaqxkhBpQrQV_SlTk0HN9FhjNLpL4g,3731
116
+ sglang/test/few_shot_gsm8k.py,sha256=To7Sdg-DLF8poIQLwiOBYKbkz-1C_gn6H79vIbyPR-o,3860
113
117
  sglang/test/run_eval.py,sha256=NWxeLWmInBgkCvC9Jr_QzF7GfAiBve3Gf1JQrEOlNlU,3899
114
118
  sglang/test/runners.py,sha256=ZoWhT1TDXfLBVdbivXx1KUu9dhPlGjL_xrP18WLzVLo,11404
115
119
  sglang/test/simple_eval_common.py,sha256=r0G-9QLycs2ax3RMc44T_61fzMxlpTzv6pececC7lyY,12379
@@ -121,10 +125,10 @@ sglang/test/simple_eval_mmlu.py,sha256=FkwamjGMjueTixymkedF-YiPloSLiy4ftILFUrKZ9
121
125
  sglang/test/test_activation.py,sha256=jkdNRzJnbd5OgZliQaIXpxovlcky17UrweomcOcMxoE,1442
122
126
  sglang/test/test_layernorm.py,sha256=IacByD5d-stXjzBz8Ypamc7povlcedpKPbb_4JLgo3c,3720
123
127
  sglang/test/test_programs.py,sha256=3-XKnppQdCNWjaJb6jwib5Z9OSpgKvH8SFLJbE4J9qI,17001
124
- sglang/test/test_utils.py,sha256=NLiJqFRWnCeQ-gdCBe0ubNFCsig1CPb1EU-Ay9CtSfU,17109
128
+ sglang/test/test_utils.py,sha256=OnAFpTA94GmQCHCV5XpaYImn11U7Cg4yfSw0nC17GRs,17504
125
129
  sglang/test/srt/sampling/penaltylib/utils.py,sha256=-0p0rV-P4lNo7xAe3rQSBHTubc50a-DFyOQmLGAkgkQ,12515
126
- sglang-0.3.1.post1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
127
- sglang-0.3.1.post1.dist-info/METADATA,sha256=zswdq5UTi5aLVmpEyjnc7SzIi60yc4w2hlMhckdxmcU,38137
128
- sglang-0.3.1.post1.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
129
- sglang-0.3.1.post1.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
130
- sglang-0.3.1.post1.dist-info/RECORD,,
130
+ sglang-0.3.1.post3.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
131
+ sglang-0.3.1.post3.dist-info/METADATA,sha256=uhvB-z9UZsAafHaPfU9qYU6oKxrC6BLcyBspbtoFAY8,38122
132
+ sglang-0.3.1.post3.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
133
+ sglang-0.3.1.post3.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
134
+ sglang-0.3.1.post3.dist-info/RECORD,,