sglang 0.1.22__py3-none-any.whl → 0.1.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sglang/srt/models/llama2.py CHANGED
@@ -5,14 +5,10 @@
 from typing import Any, Dict, Iterable, Optional, Tuple
 
 import torch
-import tqdm
 from torch import nn
 from transformers import LlamaConfig
 from vllm.config import CacheConfig
-from vllm.distributed import (
-    get_tensor_model_parallel_rank,
-    get_tensor_model_parallel_world_size,
-)
+from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
@@ -39,6 +35,7 @@ class LlamaMLP(nn.Module):
         intermediate_size: int,
         hidden_act: str,
         quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
     ) -> None:
         super().__init__()
         self.gate_up_proj = MergedColumnParallelLinear(
@@ -46,12 +43,14 @@ class LlamaMLP(nn.Module):
             [intermediate_size] * 2,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj",
         )
         self.down_proj = RowParallelLinear(
             intermediate_size,
             hidden_size,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.down_proj",
         )
         if hidden_act != "silu":
             raise ValueError(
@@ -70,6 +69,7 @@ class LlamaMLP(nn.Module):
 class LlamaAttention(nn.Module):
     def __init__(
         self,
+        config: LlamaConfig,
         hidden_size: int,
         num_heads: int,
         num_kv_heads: int,
@@ -79,6 +79,7 @@ class LlamaAttention(nn.Module):
         rope_is_neox_style: bool = True,
         max_position_embeddings: int = 8192,
         quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
     ) -> None:
         super().__init__()
         self.hidden_size = hidden_size
@@ -96,7 +97,10 @@ class LlamaAttention(nn.Module):
         # the KV heads across multiple tensor parallel GPUs.
         assert tp_size % self.total_num_kv_heads == 0
         self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
-        self.head_dim = hidden_size // self.total_num_heads
+        # MistralConfig has an optional head_dim introduced by Mistral-Nemo
+        self.head_dim = getattr(
+            config, "head_dim", self.hidden_size // self.total_num_heads
+        )
         self.q_size = self.num_heads * self.head_dim
         self.kv_size = self.num_kv_heads * self.head_dim
         self.scaling = self.head_dim**-0.5
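The `head_dim` hunk above is what makes Mistral NeMo loadable (it is also added to the supported-model list in the README below): NeMo's config carries an explicit `head_dim` of 128, while the old derivation `hidden_size // num_attention_heads` would yield 5120 // 32 = 160. A minimal sketch of the fallback using stand-in config objects (the names here are illustrative, not sglang code):

```python
from types import SimpleNamespace

# Stand-ins for transformers configs; values mirror Llama-3-8B and Mistral-NeMo-12B.
llama_like = SimpleNamespace(hidden_size=4096, num_attention_heads=32)
nemo_like = SimpleNamespace(hidden_size=5120, num_attention_heads=32, head_dim=128)

def resolve_head_dim(config) -> int:
    # Same fallback as the hunk above: prefer an explicit head_dim.
    return getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)

assert resolve_head_dim(llama_like) == 128  # derived: 4096 // 32
assert resolve_head_dim(nemo_like) == 128   # explicit; the derivation would give 160
```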
@@ -110,12 +114,14 @@ class LlamaAttention(nn.Module):
             self.total_num_kv_heads,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
         )
         self.o_proj = RowParallelLinear(
             self.total_num_heads * self.head_dim,
             hidden_size,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
         )
 
         self.rotary_emb = get_rope(
@@ -154,6 +160,7 @@ class LlamaDecoderLayer(nn.Module):
         config: LlamaConfig,
         layer_id: int = 0,
         quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
     ) -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
@@ -168,6 +175,7 @@ class LlamaDecoderLayer(nn.Module):
         rope_is_neox_style = getattr(config, "rope_is_neox_style", True)
         max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
         self.self_attn = LlamaAttention(
+            config=config,
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
             num_kv_heads=config.num_key_value_heads,
@@ -177,12 +185,14 @@ class LlamaDecoderLayer(nn.Module):
             rope_is_neox_style=rope_is_neox_style,
             max_position_embeddings=max_position_embeddings,
             quant_config=quant_config,
+            prefix=f"{prefix}.self_attn",
         )
         self.mlp = LlamaMLP(
             hidden_size=self.hidden_size,
             intermediate_size=config.intermediate_size,
             hidden_act=config.hidden_act,
             quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
         )
         self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.post_attention_layernorm = RMSNorm(
@@ -230,7 +240,9 @@ class LlamaModel(nn.Module):
         )
         self.layers = nn.ModuleList(
             [
-                LlamaDecoderLayer(config, i, quant_config=quant_config)
+                LlamaDecoderLayer(
+                    config, i, quant_config=quant_config, prefix=f"model.layers.{i}"
+                )
                 for i in range(config.num_hidden_layers)
             ]
         )
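Threading `prefix` from `LlamaModel` down through `LlamaDecoderLayer` into the attention and MLP projections gives each parallel linear layer a fully qualified name matching the checkpoint layout, which per-layer quantization logic (e.g. fp8) can key on. For layer 0 the composed names look like this (illustrative sketch, not executed by the model code):

```python
layer_prefix = "model.layers.0"  # set by LlamaModel via prefix=f"model.layers.{i}"
composed = [
    f"{layer_prefix}.self_attn.qkv_proj",
    f"{layer_prefix}.self_attn.o_proj",
    f"{layer_prefix}.mlp.gate_up_proj",
    f"{layer_prefix}.mlp.down_proj",
]
print("\n".join(composed))
```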
@@ -370,9 +382,6 @@ class LlamaForCausalLM(nn.Module):
                 weight_loader(param, loaded_weight)
 
         if name is None or loaded_weight is None:
-            if get_tensor_model_parallel_rank() == 0:
-                weights = tqdm.tqdm(weights, total=int(len(params_dict) * 1.5))
-
             for name, loaded_weight in weights:
                 load_weights_per_param(name, loaded_weight)
         else:
sglang/srt/server.py CHANGED
@@ -157,6 +157,19 @@ def _set_global_server_args(server_args: ServerArgs):
     }
 
 
+def _set_torch_compile_config():
+    # The following configurations are for torch compile optimizations
+    import torch._dynamo.config
+    import torch._inductor.config
+
+    torch._inductor.config.coordinate_descent_tuning = True
+    torch._inductor.config.triton.unique_kernel_names = True
+    torch._inductor.config.fx_graph_cache = True  # Experimental feature to reduce compilation times, will be on by default in future
+
+    # FIXME: tmp workaround
+    torch._dynamo.config.accumulated_cache_size_limit = 256
+
+
 def launch_server(
     server_args: ServerArgs,
     model_overide_args: Optional[dict] = None,
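`_set_torch_compile_config` only sets standard `torch._dynamo` / `torch._inductor` options; notably, `fx_graph_cache` lets later compilations of an identical FX graph reuse cached artifacts. A self-contained sketch of the same knobs around a toy compiled function (`fused_silu_mul` is illustrative, not server code):

```python
import torch
import torch._dynamo.config
import torch._inductor.config

# Same knobs as _set_torch_compile_config, applied before the first compile.
torch._inductor.config.coordinate_descent_tuning = True
torch._inductor.config.triton.unique_kernel_names = True
torch._inductor.config.fx_graph_cache = True

@torch.compile
def fused_silu_mul(x: torch.Tensor) -> torch.Tensor:
    # Toy stand-in for a model forward pass.
    return torch.nn.functional.silu(x) * x

out = fused_silu_mul(torch.randn(16))
```

On the server this path is gated behind the new `--enable-torch-compile` flag added in `server_args.py` below.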
@@ -174,6 +187,7 @@ def launch_server(
     os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
     os.environ["NCCL_CUMEM_ENABLE"] = "0"
     os.environ["NCCL_NVLS_ENABLE"] = "0"
+    os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
     set_ulimit()
     if server_args.show_time_cost:
         enable_show_time_cost()
@@ -182,7 +196,7 @@
     if not server_args.disable_flashinfer:
         assert_pkg_version(
             "flashinfer",
-            "0.1.0",
+            "0.1.1",
             "Please uninstall the old version and "
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
@@ -190,6 +204,10 @@
     if server_args.chat_template:
         # TODO: replace this with huggingface transformers template
        load_chat_template_for_openai_api(server_args.chat_template)
+
+    if server_args.enable_torch_compile:
+        _set_torch_compile_config()
+
     _set_global_server_args(server_args)
 
     # Allocate ports
@@ -205,6 +223,7 @@
         detokenizer_port=ports[2],
         nccl_ports=ports[3:],
     )
+    logger.info(f"{server_args=}")
 
     # Handle multi-node tensor parallelism
     if server_args.nnodes > 1:
sglang/srt/server_args.py CHANGED
@@ -29,7 +29,7 @@ class ServerArgs:
     max_prefill_tokens: Optional[int] = None
     max_running_requests: Optional[int] = None
     schedule_heuristic: str = "lpm"
-    schedule_conservativeness: float = 0.8
+    schedule_conservativeness: float = 1.0
 
     # Other runtime options
     tp_size: int = 1
@@ -55,6 +55,7 @@
     disable_regex_jump_forward: bool = False
     disable_cuda_graph: bool = False
     disable_disk_cache: bool = False
+    enable_torch_compile: bool = False
     attention_reduce_in_fp32: bool = False
     enable_p2p_check: bool = False
     efficient_weight_load: bool = False
@@ -69,15 +70,15 @@
             self.tokenizer_path = self.model_path
         if self.mem_fraction_static is None:
             if self.tp_size >= 16:
-                self.mem_fraction_static = 0.74
+                self.mem_fraction_static = 0.80
             elif self.tp_size >= 8:
-                self.mem_fraction_static = 0.78
+                self.mem_fraction_static = 0.84
             elif self.tp_size >= 4:
-                self.mem_fraction_static = 0.82
+                self.mem_fraction_static = 0.86
             elif self.tp_size >= 2:
-                self.mem_fraction_static = 0.85
-            else:
                 self.mem_fraction_static = 0.88
+            else:
+                self.mem_fraction_static = 0.89
         if isinstance(self.additional_ports, int):
             self.additional_ports = [self.additional_ports]
         elif self.additional_ports is None:
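`mem_fraction_static` is the fraction of GPU memory set aside for static allocations (model weights and the KV cache pool); this release raises every tiered default. A sketch mirroring the selection logic above (`default_mem_fraction` is an illustrative helper, not part of `ServerArgs`):

```python
def default_mem_fraction(tp_size: int) -> float:
    # Mirrors the post-init defaults in the hunk above; larger tensor-parallel
    # groups get a smaller fraction, presumably leaving communication headroom.
    for threshold, fraction in ((16, 0.80), (8, 0.84), (4, 0.86), (2, 0.88)):
        if tp_size >= threshold:
            return fraction
    return 0.89

assert default_mem_fraction(16) == 0.80
assert default_mem_fraction(2) == 0.88
assert default_mem_fraction(1) == 0.89
```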
@@ -317,6 +318,11 @@
             action="store_true",
             help="Disable disk cache to avoid possible crashes related to file system or high concurrency.",
         )
+        parser.add_argument(
+            "--enable-torch-compile",
+            action="store_true",
+            help="Optimize the model with torch.compile, experimental feature.",
+        )
         parser.add_argument(
             "--attention-reduce-in-fp32",
             action="store_true",
sglang/srt/utils.py CHANGED
@@ -312,6 +312,9 @@ def suppress_other_loggers():
     logging.getLogger("vllm.distributed.device_communicators.pynccl").setLevel(
         logging.WARN
     )
+    logging.getLogger("vllm.distributed.device_communicators.shm_broadcast").setLevel(
+        logging.WARN
+    )
     logging.getLogger("vllm.selector").setLevel(logging.WARN)
     logging.getLogger("vllm.utils").setLevel(logging.WARN)
 
@@ -411,6 +414,52 @@ def monkey_patch_vllm_dummy_weight_loader():
     setattr(DummyModelLoader, "load_model", load_model)
 
 
+vllm_all_gather_backup = None
+
+
+def monkey_patch_vllm_all_gather(reverse: bool = False):
+    """Monkey patch all-gather to remove in-place operations."""
+    from torch.distributed import _functional_collectives as funcol
+    from vllm.distributed.parallel_state import GroupCoordinator
+
+    global vllm_all_gather_backup
+    if vllm_all_gather_backup is None:
+        vllm_all_gather_backup = GroupCoordinator.all_gather
+
+    def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor:
+        world_size = self.world_size
+        # Bypass the function if we are using only 1 GPU.
+        if world_size == 1:
+            return input_
+        assert (
+            -input_.dim() <= dim < input_.dim()
+        ), f"Invalid dim ({dim}) for input tensor with shape {input_.size()}"
+        if dim < 0:
+            # Convert negative dim to positive.
+            dim += input_.dim()
+        input_size = input_.size()
+        # Allocate output tensor.
+        output_tensor = torch.empty(
+            (world_size,) + input_size, dtype=input_.dtype, device=input_.device
+        )
+
+        output_tensor = funcol.all_gather_tensor(
+            input_, gather_dim=0, group=self.device_group
+        ).view((world_size,) + input_size)
+
+        # Reshape
+        output_tensor = output_tensor.movedim(0, dim)
+        output_tensor = output_tensor.reshape(
+            input_size[:dim] + (world_size * input_size[dim],) + input_size[dim + 1 :]
+        )
+        return output_tensor
+
+    if reverse:
+        setattr(GroupCoordinator, "all_gather", vllm_all_gather_backup)
+    else:
+        setattr(GroupCoordinator, "all_gather", all_gather)
+
+
 API_KEY_HEADER_NAME = "X-API-Key"
 
 
sglang-0.1.22.dist-info/METADATA → sglang-0.1.24.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.1.22
+Version: 0.1.24
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
 Version 2.0, January 2004
@@ -240,7 +240,7 @@ Requires-Dist: torch ; extra == 'srt'
 Requires-Dist: uvicorn ; extra == 'srt'
 Requires-Dist: uvloop ; extra == 'srt'
 Requires-Dist: zmq ; extra == 'srt'
-Requires-Dist: vllm ==0.5.1 ; extra == 'srt'
+Requires-Dist: vllm ==0.5.3.post1 ; extra == 'srt'
 Requires-Dist: outlines >=0.0.44 ; extra == 'srt'
 
 <div align="center">
@@ -282,6 +282,7 @@ The core features include:
 
 ### Method 1: With pip
 ```
+pip install --upgrade pip setuptools wheel
 pip install "sglang[all]"
 
 # Install FlashInfer CUDA kernels
@@ -293,6 +294,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 git clone https://github.com/sgl-project/sglang.git
 cd sglang
 
+pip install --upgrade pip
 pip install -e "python[all]"
 
 # Install FlashInfer CUDA kernels
@@ -390,15 +392,16 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7
 ```
 - See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
-- Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-1` be the hostname of the first node and `50000` be an available port.
+- Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
 ```
 # Node 0
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-1:50000 --nnodes 2 --node-rank 0
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 0
 
 # Node 1
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-1:50000 --nnodes 2 --node-rank 1
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1
 ```
 - If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/custom_chat_template.md).
+- To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 
 ### Supported Models
 
@@ -420,6 +423,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - Grok
 - ChatGLM
 - InternLM 2
+- Mistral NeMo
 
 Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).
 
sglang-0.1.22.dist-info/RECORD → sglang-0.1.24.dist-info/RECORD RENAMED
@@ -1,10 +1,10 @@
-sglang/__init__.py,sha256=7-tQgpOarxM1MfYy5nCbpqhqSKB_hKRAI4tekucmYz4,1141
+sglang/__init__.py,sha256=nMs6lYeKcQpYArIaZLQ2VGNleY1dVvdBFaHyG7fpOsA,1141
 sglang/api.py,sha256=1JARbc1wNYF6tODdUpgmNgTyLOvMnxdTBctLvEwzGTY,5565
 sglang/bench.py,sha256=p34wnfMRdiedOUf9GKGZkkNxehmyTzK6Q1O20q_SGjY,21841
 sglang/bench_latency.py,sha256=UPy6WhrddMTDX7HqIeHNhCn5vF0YMOKxJlQRvhMC8zU,10552
-sglang/bench_serving.py,sha256=IebHhb0AM_4FhA74Xu13QK1-KXpkRZ_k3ohwKiot9mU,26116
+sglang/bench_serving.py,sha256=zKGgVX3S-ggUvOxvEM4AszzXRPRVU6NGNnBG5vAAvRY,34577
 sglang/check_env.py,sha256=CscuPMlf68dkgZf0m-FiLpUisNNDoihMck4qhLOeV1Q,4124
-sglang/global_config.py,sha256=6WAMjRR1lDeGFdFu-18xUAbWVM2Vj0_L5ExvQ5wofus,1711
+sglang/global_config.py,sha256=QG-ABVJksKK_llvUx7fSZcmK4GGCs-hBUVcM4LCr7Nw,1749
 sglang/launch_server.py,sha256=Gg8CwNlTCCfg1dF65ZT9ePLxOT9LKtY79GhIPG6PCrU,358
 sglang/launch_server_llavavid.py,sha256=40uaazMsavKuk6YXFa5v37kdUpFGuealgJJeph1g8gU,1025
 sglang/utils.py,sha256=arJuwOAEX445M2NL9SAOi6jBNu0-cfU04PLAr-hIH3U,8168
@@ -18,7 +18,7 @@ sglang/backend/vertexai.py,sha256=98toR-L0OTi4dYHaSmmzJdlQ2qN_0lImoKZFlVgYLRE,48
 sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sglang/lang/chat_template.py,sha256=psIlhaDo70twgLrx5Lgln03metLEA3-FZuixeI0Y7Ao,13309
 sglang/lang/compiler.py,sha256=UiXUmPR9wBAPtnORrLcyQX8Uh0ZL0nKeV8ZgBozAJPw,7531
-sglang/lang/interpreter.py,sha256=0phpQs4PooVvVJCzzyNrTv2OFevI5fsU1FcN4roxqhY,29628
+sglang/lang/interpreter.py,sha256=27j7H9p7TY4uUfF9f5E17FxK1xCNeNju4aut_PaWCrQ,29693
 sglang/lang/ir.py,sha256=5VVK2JnbspdysrhcGgkmp_JlAprd2XqqRnS_GfP_XWc,16645
 sglang/lang/tracer.py,sha256=borJmlSJOhg1RUndGRnilnR60eEZz2Y9aU7BpftsOxU,8287
 sglang/lang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -30,16 +30,16 @@ sglang/lang/backend/runtime_endpoint.py,sha256=TZ0NV89or5_3MIZZFnc1JXAAjnv7tCfeQ
 sglang/lang/backend/vertexai.py,sha256=O-iBLD-y3vq80UxnrAoJri7bxpgd-_eakZ88Cf8bEGA,4855
 sglang/srt/conversation.py,sha256=Il7JJuu4o42k2xdBWVfONNmstTsAM-4idX6AcEOnrXQ,15526
 sglang/srt/flush_cache.py,sha256=SJsbZnmDhH-gb9ch3hIwnI_nuwaOLlKvlXADyLBGENk,403
-sglang/srt/hf_transformers_utils.py,sha256=H3YnLtx05q65A1tn1JWNZOUhMtq6jANRhhMo6JJr6mg,10728
-sglang/srt/memory_pool.py,sha256=rzJq9-kgO9ON5mgHcLT5GKiQCWBCFaczPE8-9M6ckaU,3680
+sglang/srt/hf_transformers_utils.py,sha256=94mOI93B2xOmXKqfJfEoGxqHgwwlWNbPHgsA47AQJK8,11245
+sglang/srt/memory_pool.py,sha256=FhJk5GtYortO3MJIsMMQ-o49agwDHVX1aEQH2LITq6c,3949
 sglang/srt/mm_utils.py,sha256=OptgAHDX-73Bk4jAdr2BOAJtiEXJNzPrMhaM-dy275c,8889
 sglang/srt/model_config.py,sha256=lZu1D-XLVMETHS6FBMoPn8Uowa9QFGe95d3SuWrr2q8,5282
 sglang/srt/openai_api_adapter.py,sha256=iw-FquXQeM2Z4nxOoYGFPjTkIdgA8rQkh_IcmJRy-R0,15143
 sglang/srt/openai_protocol.py,sha256=lGBhfxG6jmgUkMOh2NpBK9w9TUTRZKrsfHdW7XYhKKI,5700
 sglang/srt/sampling_params.py,sha256=OI11asr1Bd_E5soDjih614v4flgWxdMZU9HAF0aBafQ,3062
-sglang/srt/server.py,sha256=c0Ldp-10tvTroJI0msHWorrqObR90FuNK6SM4KP-qeU,13682
-sglang/srt/server_args.py,sha256=6pMKJN0S1QoTcVAstmxc5Laub2OAxMYpMykQky-Ym10,12959
-sglang/srt/utils.py,sha256=GFO0K-BnpAGi1_Cp4cSKOVTjfILz8qNltF-feZPR7yE,16804
+sglang/srt/server.py,sha256=JC6rs8mkWg2mWwriwZvYEZyO514_HJFOUNda-pu8U_4,14369
+sglang/srt/server_args.py,sha256=aF6L35mEB-FU3BL_ooKuCIcOXLhYLxA9-MjpaOTQRCo,13189
+sglang/srt/utils.py,sha256=ZB9WLlZ_GpKVpPJiETrYkqH10J8iWrN_4buxDnQoA88,18568
 sglang/srt/constrained/__init__.py,sha256=5LB3_mDTMW6wcRkFA5J2Rd5HPHHEKRyiELhe4gtlBYM,1472
 sglang/srt/constrained/base_cache.py,sha256=QQjmFEiT8jlOskJoZobhrDl2TKB-B4b1LPQo9JQCP_w,1405
 sglang/srt/constrained/fsm_cache.py,sha256=P4qNDHHxpKpTnYL_8V1R6OFXlUwbM6ZcBdzddpcBgb4,1135
@@ -48,33 +48,35 @@ sglang/srt/layers/context_flashattention_nopad.py,sha256=7ps_9W_ia9zikL9HqsSUwWH
 sglang/srt/layers/extend_attention.py,sha256=aYAAL9HZJpaSASp-ulMvbmSmyMcqdYUsgVQC-Lbm7_U,12008
 sglang/srt/layers/fused_moe.py,sha256=uyrbCaIHioq3G00xQUrCo53hYDoHzk5rep3Eji3oQiQ,20258
 sglang/srt/layers/linear.py,sha256=qLwFkOiRAljzE7LkAkLRdcCdVMk-t7b56jEjwQAuYDM,33953
-sglang/srt/layers/logits_processor.py,sha256=RCHjWxlKlB_Mc2iOMHQKvKN9gjqg4oqgodS6gr3qCbA,9672
-sglang/srt/layers/radix_attention.py,sha256=xdj4v0L5DEcQDDHSbfo_VFqdvHLAWpiT2oU8wKqz3Gk,6212
+sglang/srt/layers/logits_processor.py,sha256=KyRYANCiq9Cfu_VPjrIbSBAlqN_clcAgF3JrG9waU5k,9674
+sglang/srt/layers/radix_attention.py,sha256=A3J_wOlysjblFXHgehAqRHBQmpYAHLyUovyLFsrMJ7A,6386
 sglang/srt/layers/token_attention.py,sha256=EJ4gjbVLfshOZ_vr1iB-Eq8_B-4F26n_wPDj6e1Zrww,7386
 sglang/srt/layers/quantization/__init__.py,sha256=PQFzdPpul98DvywBA6YMBOnrMjtHE1LMlMpJ7FM8J3I,1971
 sglang/srt/layers/quantization/fp8.py,sha256=jaqgRFnHC--IL8iqB6Qygi-KXYPYBKKqt_j4Rk55_h4,24946
 sglang/srt/managers/detokenizer_manager.py,sha256=8rN2cdMr61LWy07lingEqLnNy0W5Rebdn14IsTQ9PCs,5049
 sglang/srt/managers/io_struct.py,sha256=Y6jW3p0cNg0jcrEQNki1H8MMEWxwWA4p6Y-xVgUVWaI,5404
 sglang/srt/managers/tokenizer_manager.py,sha256=SbivhFhZUR9HU9pLTe93MlYprAFAHzOU3KMBA2piQUk,19308
-sglang/srt/managers/controller/cuda_graph_runner.py,sha256=xWyLPg7vG2EAsgmSG1AI0aEk_AueyOD0-aNbK3Mt_DE,7043
+sglang/srt/managers/controller/cuda_graph_runner.py,sha256=0aRqA1_34oJ557Zn8PjpJecex5bBWJdnCmBlcDVvYO0,8509
 sglang/srt/managers/controller/dp_worker.py,sha256=ES3-jyxGfHzpgVoXub_3qjVygwfWYWpfN4vuVWU23Gs,3675
-sglang/srt/managers/controller/infer_batch.py,sha256=phXzANqBUFyqFwRVl06bd5yBnGK2hem6qzf5i0lrTq0,33086
+sglang/srt/managers/controller/infer_batch.py,sha256=SKwCwhnZ_CNlG0mVCEc4X0e4HNjJFke-c8zdWP3TzjQ,34186
 sglang/srt/managers/controller/manager_multi.py,sha256=DT8Y9RF5OyTxlrLEZYz4claNWir3UrVztdOZaVPiA6g,6077
 sglang/srt/managers/controller/manager_single.py,sha256=2xO_iWK6tWvc0B31nKbe2N3klxwQBJmPTnFhNjzhVSI,4566
-sglang/srt/managers/controller/model_runner.py,sha256=UBvaHShjBWWFMWSEKeDh2tNqd0zWTwtfK37BbYR7c6w,13864
+sglang/srt/managers/controller/model_runner.py,sha256=927tf6nJjLjEDgz2wCDj2kvpZ-E_rAVm8PVKFVfP4p8,13951
 sglang/srt/managers/controller/radix_cache.py,sha256=tx8LEQpqLxipw9UUVj4D1YQLMMDmWnjDYv8oDlOl-co,8210
-sglang/srt/managers/controller/schedule_heuristic.py,sha256=tw9WEiA_pzL4dkPnoS34SYhhQ3hJXBL6K03zRm2n_g8,2482
-sglang/srt/managers/controller/tp_worker.py,sha256=uyoAW4O08UPciRYxGBPK6U5jaVuwEOvKBjaeJNNAe8s,30531
+sglang/srt/managers/controller/schedule_heuristic.py,sha256=SQAGzPS3aB_TPj7rnPBhewwyR6W1sVwW4D3zG3JUY00,2714
+sglang/srt/managers/controller/tp_worker.py,sha256=yjz-Xzl0zEy4QSU-EYneZH5vi3oHtBuXTtYe4VuDp2g,30517
 sglang/srt/model_loader/model_loader.py,sha256=VS8VQL5ITN3akZ9eU_-uHWMan1axLMNG2_O12HzGysA,10132
 sglang/srt/model_loader/utils.py,sha256=I2PS5HIH5Cg-p7xKO_Cw_foK2vQ61xVc3zQv7CbeGEw,10120
 sglang/srt/models/chatglm.py,sha256=pH8g2Dj8qQLGPYpWVTb-IONfXsdfmpWi0-IEYNdSi4s,13296
 sglang/srt/models/commandr.py,sha256=hHsNQWi0X8rNL7_gpcoUxQxdhxtvx5_RVx8u6cLzqYQ,13606
 sglang/srt/models/dbrx.py,sha256=rRxOusGPu670ommeqXg62AllwB1apzE4yZoWc1fcr2M,14095
+sglang/srt/models/deepseek.py,sha256=YtoPmv4fKmiH_jsRMSab9Wxq3aOZga9pCPGnkCs3Vvs,15457
 sglang/srt/models/gemma.py,sha256=DweoalfWYhLL-ZWLAO5gl4SCZflWmejVeDG3Vky_WNo,11719
 sglang/srt/models/gemma2.py,sha256=x3Dua-TVwRm5fJjo5UDekdoWqwt9xYbMuB-ogfXyiT8,15860
+sglang/srt/models/gpt_bigcode.py,sha256=XHO1naPdXfiKYQRQ6uZe1fN3PBDhKH3-bchsaaZvfE4,9637
 sglang/srt/models/grok.py,sha256=611zrlIchvFaVfztRdBY7z97oU3KB-anykbOZy1hK6M,27295
 sglang/srt/models/internlm2.py,sha256=8MNcwxU5Th9IxWa314HqqmbCRlPUFScnfneBDs0riIU,11659
-sglang/srt/models/llama2.py,sha256=i97Ib4zq0-AbW7Wwp_ctFWnK528vipmlZVD_a7gB8L8,13819
+sglang/srt/models/llama2.py,sha256=OyAf_lun5aZEsT80WmrIYBF8QXTXRpW8sUlylr4AZIc,14204
 sglang/srt/models/llama_classification.py,sha256=foCPvNyP2bTZ0YcRBF-qkmBv-gT24lhLNCXP30Oq4VU,4370
 sglang/srt/models/llava.py,sha256=vBI6EEeOG_9o23Shi9h8k58rxTOHZnSKMmPl3B3Q3uc,17924
 sglang/srt/models/llavavid.py,sha256=SrNQ-U2wekHvP_up-ZXRkCSros2NzheHpPfXHrp0YBU,13050
@@ -96,8 +98,8 @@ sglang/test/test_conversation.py,sha256=gF_AyOxQgpPQBPnA57-kq-M0p_zFu-rBDMFgAq65
 sglang/test/test_openai_protocol.py,sha256=DVx3r6hrb8oRqbo5AYIleldxbqMBTtb-gtORM6t_Y1c,1661
 sglang/test/test_programs.py,sha256=uefeHUFKT2NJESOujj-CsnPXdw1aQQN2TzUbPCHJjGs,13654
 sglang/test/test_utils.py,sha256=kD_fQe3WroZ9Kc3NBRKPiZOFJ_JD2uEE9XIvPp6AD9Y,11048
-sglang-0.1.22.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-sglang-0.1.22.dist-info/METADATA,sha256=O1pihQWf_523B_fgluftctwOxcou6oj13_Wuquj7ztU,30691
-sglang-0.1.22.dist-info/WHEEL,sha256=rWxmBtp7hEUqVLOnTaDOPpR-cZpCDkzhhcBce-Zyd5k,91
-sglang-0.1.22.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
-sglang-0.1.22.dist-info/RECORD,,
+sglang-0.1.24.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+sglang-0.1.24.dist-info/METADATA,sha256=_HKFljParVedu-eht7OKKb_RpEkVcB-Wh_P_jRW3TJk,30933
+sglang-0.1.24.dist-info/WHEEL,sha256=Wyh-_nZ0DJYolHNn1_hMa4lM7uDedD_RGVwbmTjyItk,91
+sglang-0.1.24.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+sglang-0.1.24.dist-info/RECORD,,
sglang-0.1.22.dist-info/WHEEL → sglang-0.1.24.dist-info/WHEEL RENAMED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (71.0.4)
+Generator: setuptools (71.1.0)
 Root-Is-Purelib: true
 Tag: py3-none-any
 