sglang 0.1.22__py3-none-any.whl → 0.1.25__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. sglang/__init__.py +2 -2
  2. sglang/bench_serving.py +243 -25
  3. sglang/global_config.py +3 -2
  4. sglang/lang/interpreter.py +1 -0
  5. sglang/srt/hf_transformers_utils.py +13 -1
  6. sglang/srt/layers/logits_processor.py +4 -5
  7. sglang/srt/layers/radix_attention.py +38 -49
  8. sglang/srt/managers/controller/cuda_graph_runner.py +58 -16
  9. sglang/srt/managers/controller/infer_batch.py +51 -22
  10. sglang/srt/managers/controller/model_runner.py +58 -4
  11. sglang/srt/managers/controller/schedule_heuristic.py +8 -3
  12. sglang/srt/managers/controller/tp_worker.py +9 -11
  13. sglang/srt/memory_pool.py +13 -5
  14. sglang/srt/models/deepseek.py +430 -0
  15. sglang/srt/models/gpt_bigcode.py +282 -0
  16. sglang/srt/models/llama2.py +19 -10
  17. sglang/srt/server.py +26 -1
  18. sglang/srt/server_args.py +12 -6
  19. sglang/srt/utils.py +93 -1
  20. sglang/version.py +1 -0
  21. {sglang-0.1.22.dist-info → sglang-0.1.25.dist-info}/METADATA +10 -6
  22. {sglang-0.1.22.dist-info → sglang-0.1.25.dist-info}/RECORD +25 -36
  23. {sglang-0.1.22.dist-info → sglang-0.1.25.dist-info}/WHEEL +1 -1
  24. sglang/backend/__init__.py +0 -0
  25. sglang/backend/anthropic.py +0 -77
  26. sglang/backend/base_backend.py +0 -80
  27. sglang/backend/litellm.py +0 -90
  28. sglang/backend/openai.py +0 -438
  29. sglang/backend/runtime_endpoint.py +0 -283
  30. sglang/backend/vertexai.py +0 -149
  31. sglang/bench.py +0 -627
  32. sglang/srt/managers/controller/dp_worker.py +0 -113
  33. sglang/srt/openai_api/api_adapter.py +0 -432
  34. sglang/srt/openai_api/openai_api_adapter.py +0 -431
  35. sglang/srt/openai_api/openai_protocol.py +0 -207
  36. sglang/srt/openai_api_adapter.py +0 -411
  37. sglang/srt/openai_protocol.py +0 -207
  38. {sglang-0.1.22.dist-info → sglang-0.1.25.dist-info}/LICENSE +0 -0
  39. {sglang-0.1.22.dist-info → sglang-0.1.25.dist-info}/top_level.txt +0 -0
sglang/srt/models/llama2.py CHANGED
@@ -5,14 +5,10 @@
  from typing import Any, Dict, Iterable, Optional, Tuple

  import torch
- import tqdm
  from torch import nn
  from transformers import LlamaConfig
  from vllm.config import CacheConfig
- from vllm.distributed import (
-     get_tensor_model_parallel_rank,
-     get_tensor_model_parallel_world_size,
- )
+ from vllm.distributed import get_tensor_model_parallel_world_size
  from vllm.model_executor.layers.activation import SiluAndMul
  from vllm.model_executor.layers.layernorm import RMSNorm
  from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
@@ -39,6 +35,7 @@ class LlamaMLP(nn.Module):
          intermediate_size: int,
          hidden_act: str,
          quant_config: Optional[QuantizationConfig] = None,
+         prefix: str = "",
      ) -> None:
          super().__init__()
          self.gate_up_proj = MergedColumnParallelLinear(
@@ -46,12 +43,14 @@ class LlamaMLP(nn.Module):
              [intermediate_size] * 2,
              bias=False,
              quant_config=quant_config,
+             prefix=f"{prefix}.gate_up_proj",
          )
          self.down_proj = RowParallelLinear(
              intermediate_size,
              hidden_size,
              bias=False,
              quant_config=quant_config,
+             prefix=f"{prefix}.down_proj",
          )
          if hidden_act != "silu":
              raise ValueError(
@@ -70,6 +69,7 @@ class LlamaMLP(nn.Module):
  class LlamaAttention(nn.Module):
      def __init__(
          self,
+         config: LlamaConfig,
          hidden_size: int,
          num_heads: int,
          num_kv_heads: int,
@@ -79,6 +79,7 @@ class LlamaAttention(nn.Module):
          rope_is_neox_style: bool = True,
          max_position_embeddings: int = 8192,
          quant_config: Optional[QuantizationConfig] = None,
+         prefix: str = "",
      ) -> None:
          super().__init__()
          self.hidden_size = hidden_size
@@ -96,7 +97,10 @@ class LlamaAttention(nn.Module):
          # the KV heads across multiple tensor parallel GPUs.
          assert tp_size % self.total_num_kv_heads == 0
          self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
-         self.head_dim = hidden_size // self.total_num_heads
+         # MistralConfig has an optional head_dim introduced by Mistral-Nemo
+         self.head_dim = getattr(
+             config, "head_dim", self.hidden_size // self.total_num_heads
+         )
          self.q_size = self.num_heads * self.head_dim
          self.kv_size = self.num_kv_heads * self.head_dim
          self.scaling = self.head_dim**-0.5
@@ -110,12 +114,14 @@ class LlamaAttention(nn.Module):
              self.total_num_kv_heads,
              bias=False,
              quant_config=quant_config,
+             prefix=f"{prefix}.qkv_proj",
          )
          self.o_proj = RowParallelLinear(
              self.total_num_heads * self.head_dim,
              hidden_size,
              bias=False,
              quant_config=quant_config,
+             prefix=f"{prefix}.o_proj",
          )

          self.rotary_emb = get_rope(
@@ -154,6 +160,7 @@ class LlamaDecoderLayer(nn.Module):
          config: LlamaConfig,
          layer_id: int = 0,
          quant_config: Optional[QuantizationConfig] = None,
+         prefix: str = "",
      ) -> None:
          super().__init__()
          self.hidden_size = config.hidden_size
@@ -168,6 +175,7 @@ class LlamaDecoderLayer(nn.Module):
          rope_is_neox_style = getattr(config, "rope_is_neox_style", True)
          max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
          self.self_attn = LlamaAttention(
+             config=config,
              hidden_size=self.hidden_size,
              num_heads=config.num_attention_heads,
              num_kv_heads=config.num_key_value_heads,
@@ -177,12 +185,14 @@ class LlamaDecoderLayer(nn.Module):
              rope_is_neox_style=rope_is_neox_style,
              max_position_embeddings=max_position_embeddings,
              quant_config=quant_config,
+             prefix=f"{prefix}.self_attn",
          )
          self.mlp = LlamaMLP(
              hidden_size=self.hidden_size,
              intermediate_size=config.intermediate_size,
              hidden_act=config.hidden_act,
              quant_config=quant_config,
+             prefix=f"{prefix}.mlp",
          )
          self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
          self.post_attention_layernorm = RMSNorm(
@@ -230,7 +240,9 @@ class LlamaModel(nn.Module):
          )
          self.layers = nn.ModuleList(
              [
-                 LlamaDecoderLayer(config, i, quant_config=quant_config)
+                 LlamaDecoderLayer(
+                     config, i, quant_config=quant_config, prefix=f"model.layers.{i}"
+                 )
                  for i in range(config.num_hidden_layers)
              ]
          )
@@ -370,9 +382,6 @@ class LlamaForCausalLM(nn.Module):
              weight_loader(param, loaded_weight)

          if name is None or loaded_weight is None:
-             if get_tensor_model_parallel_rank() == 0:
-                 weights = tqdm.tqdm(weights, total=int(len(params_dict) * 1.5))
-
              for name, loaded_weight in weights:
                  load_weights_per_param(name, loaded_weight)
          else:
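The `head_dim` change above makes the attention layer honor an explicit `head_dim` field on the config instead of always deriving it from `hidden_size // num_attention_heads`, which is what Mistral-Nemo-style configs need. A minimal sketch of the fallback pattern, using hypothetical config values for illustration:

```
from types import SimpleNamespace

# Hypothetical config values: Mistral-Nemo-style models set head_dim explicitly,
# so hidden_size // num_attention_heads would give the wrong per-head width.
config = SimpleNamespace(hidden_size=5120, num_attention_heads=32, head_dim=128)

head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
print(head_dim)  # 128, not 160 == 5120 // 32
```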
sglang/srt/server.py CHANGED
@@ -52,6 +52,7 @@ from sglang.srt.utils import (
      allocate_init_ports,
      assert_pkg_version,
      enable_show_time_cost,
+     maybe_set_triton_cache_manager,
      set_ulimit,
  )
  from sglang.utils import get_exception_traceback
@@ -157,6 +158,19 @@ def _set_global_server_args(server_args: ServerArgs):
      }


+ def _set_torch_compile_config():
+     # The following configurations are for torch compile optimizations
+     import torch._dynamo.config
+     import torch._inductor.config
+
+     torch._inductor.config.coordinate_descent_tuning = True
+     torch._inductor.config.triton.unique_kernel_names = True
+     torch._inductor.config.fx_graph_cache = True  # Experimental feature to reduce compilation times, will be on by default in future
+
+     # FIXME: tmp workaround
+     torch._dynamo.config.accumulated_cache_size_limit = 256
+
+
  def launch_server(
      server_args: ServerArgs,
      model_overide_args: Optional[dict] = None,
@@ -174,6 +188,7 @@ def launch_server(
      os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
      os.environ["NCCL_CUMEM_ENABLE"] = "0"
      os.environ["NCCL_NVLS_ENABLE"] = "0"
+     os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
      set_ulimit()
      if server_args.show_time_cost:
          enable_show_time_cost()
@@ -182,14 +197,23 @@ def launch_server(
      if not server_args.disable_flashinfer:
          assert_pkg_version(
              "flashinfer",
-             "0.1.0",
+             "0.1.1",
              "Please uninstall the old version and "
              "reinstall the latest version by following the instructions "
              "at https://docs.flashinfer.ai/installation.html.",
          )
+
+     if server_args.tp_size // server_args.dp_size > 1:
+         # FIXME: remove this after https://github.com/triton-lang/triton/pull/4295 is used as a dependency.
+         maybe_set_triton_cache_manager()
+
      if server_args.chat_template:
          # TODO: replace this with huggingface transformers template
          load_chat_template_for_openai_api(server_args.chat_template)
+
+     if server_args.enable_torch_compile:
+         _set_torch_compile_config()
+
      _set_global_server_args(server_args)

      # Allocate ports
@@ -205,6 +229,7 @@ def launch_server(
          detokenizer_port=ports[2],
          nccl_ports=ports[3:],
      )
+     logger.info(f"{server_args=}")

      # Handle multi-node tensor parallelism
      if server_args.nnodes > 1:
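For context, the new `--enable-torch-compile` flag (wired up in `server_args.py` below) is what triggers `_set_torch_compile_config()` here. A minimal sketch of launching the server programmatically with the flag, assuming the `ServerArgs` fields shown in this diff and a model that is available locally or on the Hugging Face Hub:

```
from sglang.srt.server import launch_server
from sglang.srt.server_args import ServerArgs

# Hypothetical model choice; any model path supported by sglang works here.
server_args = ServerArgs(
    model_path="meta-llama/Meta-Llama-3-8B-Instruct",
    enable_torch_compile=True,  # new in this release; triggers _set_torch_compile_config()
)
launch_server(server_args)
```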
sglang/srt/server_args.py CHANGED
@@ -29,7 +29,7 @@ class ServerArgs:
      max_prefill_tokens: Optional[int] = None
      max_running_requests: Optional[int] = None
      schedule_heuristic: str = "lpm"
-     schedule_conservativeness: float = 0.8
+     schedule_conservativeness: float = 1.0

      # Other runtime options
      tp_size: int = 1
@@ -55,6 +55,7 @@ class ServerArgs:
      disable_regex_jump_forward: bool = False
      disable_cuda_graph: bool = False
      disable_disk_cache: bool = False
+     enable_torch_compile: bool = False
      attention_reduce_in_fp32: bool = False
      enable_p2p_check: bool = False
      efficient_weight_load: bool = False
@@ -69,15 +70,15 @@ class ServerArgs:
              self.tokenizer_path = self.model_path
          if self.mem_fraction_static is None:
              if self.tp_size >= 16:
-                 self.mem_fraction_static = 0.74
+                 self.mem_fraction_static = 0.80
              elif self.tp_size >= 8:
-                 self.mem_fraction_static = 0.78
+                 self.mem_fraction_static = 0.84
              elif self.tp_size >= 4:
-                 self.mem_fraction_static = 0.82
+                 self.mem_fraction_static = 0.86
              elif self.tp_size >= 2:
-                 self.mem_fraction_static = 0.85
-             else:
                  self.mem_fraction_static = 0.88
+             else:
+                 self.mem_fraction_static = 0.89
          if isinstance(self.additional_ports, int):
              self.additional_ports = [self.additional_ports]
          elif self.additional_ports is None:
@@ -317,6 +318,11 @@ class ServerArgs:
              action="store_true",
              help="Disable disk cache to avoid possible crashes related to file system or high concurrency.",
          )
+         parser.add_argument(
+             "--enable-torch-compile",
+             action="store_true",
+             help="Optimize the model with torch.compile, experimental feature.",
+         )
          parser.add_argument(
              "--attention-reduce-in-fp32",
              action="store_true",
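The `mem_fraction_static` defaults are now tiered by tensor-parallel size and are uniformly higher than in 0.1.22. A small sketch of the new mapping, written as a standalone helper for illustration only (values copied from the diff above):

```
def default_mem_fraction_static(tp_size: int) -> float:
    # Mirrors the new __post_init__ defaults; larger TP groups get a lower
    # fraction, presumably to leave more headroom per GPU.
    if tp_size >= 16:
        return 0.80
    elif tp_size >= 8:
        return 0.84
    elif tp_size >= 4:
        return 0.86
    elif tp_size >= 2:
        return 0.88
    return 0.89

assert default_mem_fraction_static(4) == 0.86  # was 0.82 in 0.1.22
```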
sglang/srt/utils.py CHANGED
@@ -18,10 +18,15 @@ import psutil
  import requests
  import torch
  import torch.distributed as dist
- import triton
  from fastapi.responses import JSONResponse
  from packaging import version as pkg_version
  from starlette.middleware.base import BaseHTTPMiddleware
+ from triton.runtime.cache import (
+     FileCacheManager,
+     default_cache_dir,
+     default_dump_dir,
+     default_override_dir,
+ )

  logger = logging.getLogger(__name__)

@@ -312,6 +317,9 @@ def suppress_other_loggers():
      logging.getLogger("vllm.distributed.device_communicators.pynccl").setLevel(
          logging.WARN
      )
+     logging.getLogger("vllm.distributed.device_communicators.shm_broadcast").setLevel(
+         logging.WARN
+     )
      logging.getLogger("vllm.selector").setLevel(logging.WARN)
      logging.getLogger("vllm.utils").setLevel(logging.WARN)

@@ -411,6 +419,90 @@ def monkey_patch_vllm_dummy_weight_loader():
      setattr(DummyModelLoader, "load_model", load_model)


+ vllm_all_gather_backup = None
+
+
+ def monkey_patch_vllm_all_gather(reverse: bool = False):
+     """Monkey patch all-gather to remove in-place operations."""
+     from torch.distributed import _functional_collectives as funcol
+     from vllm.distributed.parallel_state import GroupCoordinator
+
+     global vllm_all_gather_backup
+     if vllm_all_gather_backup is None:
+         vllm_all_gather_backup = GroupCoordinator.all_gather
+
+     def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor:
+         world_size = self.world_size
+         # Bypass the function if we are using only 1 GPU.
+         if world_size == 1:
+             return input_
+         assert (
+             -input_.dim() <= dim < input_.dim()
+         ), f"Invalid dim ({dim}) for input tensor with shape {input_.size()}"
+         if dim < 0:
+             # Convert negative dim to positive.
+             dim += input_.dim()
+         input_size = input_.size()
+         # Allocate output tensor.
+         output_tensor = torch.empty(
+             (world_size,) + input_size, dtype=input_.dtype, device=input_.device
+         )
+
+         output_tensor = funcol.all_gather_tensor(
+             input_, gather_dim=0, group=self.device_group
+         ).view((world_size,) + input_size)
+
+         # Reshape
+         output_tensor = output_tensor.movedim(0, dim)
+         output_tensor = output_tensor.reshape(
+             input_size[:dim] + (world_size * input_size[dim],) + input_size[dim + 1 :]
+         )
+         return output_tensor
+
+     if reverse:
+         setattr(GroupCoordinator, "all_gather", vllm_all_gather_backup)
+     else:
+         setattr(GroupCoordinator, "all_gather", all_gather)
+
+
+ def maybe_set_triton_cache_manager() -> None:
+     """Set environment variable to tell Triton to use a
+     custom cache manager"""
+     cache_manger = os.environ.get("TRITON_CACHE_MANAGER", None)
+     if cache_manger is None:
+         manager = "sglang.srt.utils:CustomCacheManager"
+         logger.info("Setting Triton cache manager to: %s", manager)
+         os.environ["TRITON_CACHE_MANAGER"] = manager
+
+
+ class CustomCacheManager(FileCacheManager):
+     # Adapted from: https://github.com/tdoublep/vllm/blob/3307522289fdfefe323b6c00d0db696651989a2f/vllm/triton_utils/custom_cache_manager.py
+     def __init__(self, key, override=False, dump=False):
+
+         self.key = key
+         self.lock_path = None
+         if dump:
+             self.cache_dir = default_dump_dir()
+             self.cache_dir = os.path.join(self.cache_dir, self.key)
+             self.lock_path = os.path.join(self.cache_dir, "lock")
+             os.makedirs(self.cache_dir, exist_ok=True)
+         elif override:
+             self.cache_dir = default_override_dir()
+             self.cache_dir = os.path.join(self.cache_dir, self.key)
+         else:
+             # create cache directory if it doesn't exist
+             self.cache_dir = (
+                 os.getenv("TRITON_CACHE_DIR", "").strip() or default_cache_dir()
+             )
+             if self.cache_dir:
+                 self.cache_dir = f"{self.cache_dir}_{os.getpid()}"
+                 self.cache_dir = os.path.join(self.cache_dir, self.key)
+                 self.lock_path = os.path.join(self.cache_dir, "lock")
+                 os.makedirs(self.cache_dir, exist_ok=True)
+             else:
+                 raise RuntimeError("Could not create or locate cache dir")
+
+
  API_KEY_HEADER_NAME = "X-API-Key"

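The new `CustomCacheManager` exists to give each process its own Triton cache directory (note the `_{os.getpid()}` suffix), so multiple tensor-parallel workers compiling the same kernels do not race on shared cache files. A quick sketch of how the opt-in works, assuming Triton reads `TRITON_CACHE_MANAGER` when it compiles kernels:

```
import os
from sglang.srt.utils import maybe_set_triton_cache_manager

# No-op if the user already set TRITON_CACHE_MANAGER themselves.
maybe_set_triton_cache_manager()
print(os.environ["TRITON_CACHE_MANAGER"])
# -> "sglang.srt.utils:CustomCacheManager"
# Triton then instantiates CustomCacheManager, whose cache_dir includes the
# current pid, isolating each worker process's compiled-kernel cache.
```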
sglang/version.py ADDED
@@ -0,0 +1 @@
+ __version__ = "0.1.25"
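This release also adds a dedicated version module, listed as a new file in the RECORD section below. A trivial usage sketch:

```
# The direct import is exactly what version.py provides; whether the package
# re-exports __version__ at the top level is not shown in this diff.
from sglang.version import __version__

print(__version__)  # "0.1.25"
```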
{sglang-0.1.22.dist-info → sglang-0.1.25.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.1.22
+ Version: 0.1.25
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
           Version 2.0, January 2004
@@ -240,7 +240,7 @@ Requires-Dist: torch ; extra == 'srt'
  Requires-Dist: uvicorn ; extra == 'srt'
  Requires-Dist: uvloop ; extra == 'srt'
  Requires-Dist: zmq ; extra == 'srt'
- Requires-Dist: vllm ==0.5.1 ; extra == 'srt'
+ Requires-Dist: vllm ==0.5.3.post1 ; extra == 'srt'
  Requires-Dist: outlines >=0.0.44 ; extra == 'srt'

  <div align="center">
@@ -282,6 +282,7 @@ The core features include:

  ### Method 1: With pip
  ```
+ pip install --upgrade pip
  pip install "sglang[all]"

  # Install FlashInfer CUDA kernels
@@ -293,6 +294,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
  git clone https://github.com/sgl-project/sglang.git
  cd sglang

+ pip install --upgrade pip
  pip install -e "python[all]"

  # Install FlashInfer CUDA kernels
@@ -390,19 +392,20 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7
  ```
  - See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
- - Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-1` be the hostname of the first node and `50000` be an available port.
+ - Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
  ```
  # Node 0
- python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-1:50000 --nnodes 2 --node-rank 0
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 0

  # Node 1
- python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-1:50000 --nnodes 2 --node-rank 1
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1
  ```
  - If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/custom_chat_template.md).
+ - To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.

  ### Supported Models

- - Llama / Llama 2 / Llama 3
+ - Llama / Llama 2 / Llama 3 / Llama 3.1
  - Mistral / Mixtral
  - Gemma / Gemma 2
  - Qwen / Qwen 2 / Qwen 2 MoE
@@ -420,6 +423,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  - Grok
  - ChatGLM
  - InternLM 2
+ - Mistral NeMo

  Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).

{sglang-0.1.22.dist-info → sglang-0.1.25.dist-info}/RECORD CHANGED
@@ -1,24 +1,17 @@
- sglang/__init__.py,sha256=7-tQgpOarxM1MfYy5nCbpqhqSKB_hKRAI4tekucmYz4,1141
+ sglang/__init__.py,sha256=UV7VlXhXrwi00Zg45iNB9KcnmrwLjdMtjMz06AiafY0,1151
  sglang/api.py,sha256=1JARbc1wNYF6tODdUpgmNgTyLOvMnxdTBctLvEwzGTY,5565
- sglang/bench.py,sha256=p34wnfMRdiedOUf9GKGZkkNxehmyTzK6Q1O20q_SGjY,21841
  sglang/bench_latency.py,sha256=UPy6WhrddMTDX7HqIeHNhCn5vF0YMOKxJlQRvhMC8zU,10552
- sglang/bench_serving.py,sha256=IebHhb0AM_4FhA74Xu13QK1-KXpkRZ_k3ohwKiot9mU,26116
+ sglang/bench_serving.py,sha256=zKGgVX3S-ggUvOxvEM4AszzXRPRVU6NGNnBG5vAAvRY,34577
  sglang/check_env.py,sha256=CscuPMlf68dkgZf0m-FiLpUisNNDoihMck4qhLOeV1Q,4124
- sglang/global_config.py,sha256=6WAMjRR1lDeGFdFu-18xUAbWVM2Vj0_L5ExvQ5wofus,1711
+ sglang/global_config.py,sha256=QG-ABVJksKK_llvUx7fSZcmK4GGCs-hBUVcM4LCr7Nw,1749
  sglang/launch_server.py,sha256=Gg8CwNlTCCfg1dF65ZT9ePLxOT9LKtY79GhIPG6PCrU,358
  sglang/launch_server_llavavid.py,sha256=40uaazMsavKuk6YXFa5v37kdUpFGuealgJJeph1g8gU,1025
  sglang/utils.py,sha256=arJuwOAEX445M2NL9SAOi6jBNu0-cfU04PLAr-hIH3U,8168
- sglang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- sglang/backend/anthropic.py,sha256=iJjXiDMZbtvX2XNG78MG9kM7SpZq9hmXVuzT_T18elw,2076
- sglang/backend/base_backend.py,sha256=APiMht4WYECLCOGRPCEUF6lX-an1vjVe2dWoMSgymWY,1831
- sglang/backend/litellm.py,sha256=ZqsEZXgxLge-Fh3SMr1XkVPU7z3FKntpRppNwd1a12s,2447
- sglang/backend/openai.py,sha256=Id4vDzfefG9R7AqJBMXqYmKHv2FMu0PBSYEGbK7Q510,14803
- sglang/backend/runtime_endpoint.py,sha256=PAdnQBj3yQNtgw8GH9F1ecGE7HhxGa2T7Tz_c--H2aE,9203
- sglang/backend/vertexai.py,sha256=98toR-L0OTi4dYHaSmmzJdlQ2qN_0lImoKZFlVgYLRE,4850
+ sglang/version.py,sha256=Ej7LsXg-6CASlaEHsZkUoLDpYEfHeFKdIeXMIM0esgA,23
  sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  sglang/lang/chat_template.py,sha256=psIlhaDo70twgLrx5Lgln03metLEA3-FZuixeI0Y7Ao,13309
  sglang/lang/compiler.py,sha256=UiXUmPR9wBAPtnORrLcyQX8Uh0ZL0nKeV8ZgBozAJPw,7531
- sglang/lang/interpreter.py,sha256=0phpQs4PooVvVJCzzyNrTv2OFevI5fsU1FcN4roxqhY,29628
+ sglang/lang/interpreter.py,sha256=27j7H9p7TY4uUfF9f5E17FxK1xCNeNju4aut_PaWCrQ,29693
  sglang/lang/ir.py,sha256=5VVK2JnbspdysrhcGgkmp_JlAprd2XqqRnS_GfP_XWc,16645
  sglang/lang/tracer.py,sha256=borJmlSJOhg1RUndGRnilnR60eEZz2Y9aU7BpftsOxU,8287
  sglang/lang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -30,16 +23,14 @@ sglang/lang/backend/runtime_endpoint.py,sha256=TZ0NV89or5_3MIZZFnc1JXAAjnv7tCfeQ
  sglang/lang/backend/vertexai.py,sha256=O-iBLD-y3vq80UxnrAoJri7bxpgd-_eakZ88Cf8bEGA,4855
  sglang/srt/conversation.py,sha256=Il7JJuu4o42k2xdBWVfONNmstTsAM-4idX6AcEOnrXQ,15526
  sglang/srt/flush_cache.py,sha256=SJsbZnmDhH-gb9ch3hIwnI_nuwaOLlKvlXADyLBGENk,403
- sglang/srt/hf_transformers_utils.py,sha256=H3YnLtx05q65A1tn1JWNZOUhMtq6jANRhhMo6JJr6mg,10728
- sglang/srt/memory_pool.py,sha256=rzJq9-kgO9ON5mgHcLT5GKiQCWBCFaczPE8-9M6ckaU,3680
+ sglang/srt/hf_transformers_utils.py,sha256=94mOI93B2xOmXKqfJfEoGxqHgwwlWNbPHgsA47AQJK8,11245
+ sglang/srt/memory_pool.py,sha256=FhJk5GtYortO3MJIsMMQ-o49agwDHVX1aEQH2LITq6c,3949
  sglang/srt/mm_utils.py,sha256=OptgAHDX-73Bk4jAdr2BOAJtiEXJNzPrMhaM-dy275c,8889
  sglang/srt/model_config.py,sha256=lZu1D-XLVMETHS6FBMoPn8Uowa9QFGe95d3SuWrr2q8,5282
- sglang/srt/openai_api_adapter.py,sha256=iw-FquXQeM2Z4nxOoYGFPjTkIdgA8rQkh_IcmJRy-R0,15143
- sglang/srt/openai_protocol.py,sha256=lGBhfxG6jmgUkMOh2NpBK9w9TUTRZKrsfHdW7XYhKKI,5700
  sglang/srt/sampling_params.py,sha256=OI11asr1Bd_E5soDjih614v4flgWxdMZU9HAF0aBafQ,3062
- sglang/srt/server.py,sha256=c0Ldp-10tvTroJI0msHWorrqObR90FuNK6SM4KP-qeU,13682
- sglang/srt/server_args.py,sha256=6pMKJN0S1QoTcVAstmxc5Laub2OAxMYpMykQky-Ym10,12959
- sglang/srt/utils.py,sha256=GFO0K-BnpAGi1_Cp4cSKOVTjfILz8qNltF-feZPR7yE,16804
+ sglang/srt/server.py,sha256=DXhcJt0V24a7yhydP1abPrK1qqV3qt7r8cyOMVOAI4M,14611
+ sglang/srt/server_args.py,sha256=aF6L35mEB-FU3BL_ooKuCIcOXLhYLxA9-MjpaOTQRCo,13189
+ sglang/srt/utils.py,sha256=bUp3SLzbDms0dvuETaccDPAGRHOIGW5A61pqH62XiT0,20370
  sglang/srt/constrained/__init__.py,sha256=5LB3_mDTMW6wcRkFA5J2Rd5HPHHEKRyiELhe4gtlBYM,1472
  sglang/srt/constrained/base_cache.py,sha256=QQjmFEiT8jlOskJoZobhrDl2TKB-B4b1LPQo9JQCP_w,1405
  sglang/srt/constrained/fsm_cache.py,sha256=P4qNDHHxpKpTnYL_8V1R6OFXlUwbM6ZcBdzddpcBgb4,1135
@@ -48,33 +39,34 @@ sglang/srt/layers/context_flashattention_nopad.py,sha256=7ps_9W_ia9zikL9HqsSUwWH
  sglang/srt/layers/extend_attention.py,sha256=aYAAL9HZJpaSASp-ulMvbmSmyMcqdYUsgVQC-Lbm7_U,12008
  sglang/srt/layers/fused_moe.py,sha256=uyrbCaIHioq3G00xQUrCo53hYDoHzk5rep3Eji3oQiQ,20258
  sglang/srt/layers/linear.py,sha256=qLwFkOiRAljzE7LkAkLRdcCdVMk-t7b56jEjwQAuYDM,33953
- sglang/srt/layers/logits_processor.py,sha256=RCHjWxlKlB_Mc2iOMHQKvKN9gjqg4oqgodS6gr3qCbA,9672
- sglang/srt/layers/radix_attention.py,sha256=xdj4v0L5DEcQDDHSbfo_VFqdvHLAWpiT2oU8wKqz3Gk,6212
+ sglang/srt/layers/logits_processor.py,sha256=KyRYANCiq9Cfu_VPjrIbSBAlqN_clcAgF3JrG9waU5k,9674
+ sglang/srt/layers/radix_attention.py,sha256=A3J_wOlysjblFXHgehAqRHBQmpYAHLyUovyLFsrMJ7A,6386
  sglang/srt/layers/token_attention.py,sha256=EJ4gjbVLfshOZ_vr1iB-Eq8_B-4F26n_wPDj6e1Zrww,7386
  sglang/srt/layers/quantization/__init__.py,sha256=PQFzdPpul98DvywBA6YMBOnrMjtHE1LMlMpJ7FM8J3I,1971
  sglang/srt/layers/quantization/fp8.py,sha256=jaqgRFnHC--IL8iqB6Qygi-KXYPYBKKqt_j4Rk55_h4,24946
  sglang/srt/managers/detokenizer_manager.py,sha256=8rN2cdMr61LWy07lingEqLnNy0W5Rebdn14IsTQ9PCs,5049
  sglang/srt/managers/io_struct.py,sha256=Y6jW3p0cNg0jcrEQNki1H8MMEWxwWA4p6Y-xVgUVWaI,5404
  sglang/srt/managers/tokenizer_manager.py,sha256=SbivhFhZUR9HU9pLTe93MlYprAFAHzOU3KMBA2piQUk,19308
- sglang/srt/managers/controller/cuda_graph_runner.py,sha256=xWyLPg7vG2EAsgmSG1AI0aEk_AueyOD0-aNbK3Mt_DE,7043
- sglang/srt/managers/controller/dp_worker.py,sha256=ES3-jyxGfHzpgVoXub_3qjVygwfWYWpfN4vuVWU23Gs,3675
- sglang/srt/managers/controller/infer_batch.py,sha256=phXzANqBUFyqFwRVl06bd5yBnGK2hem6qzf5i0lrTq0,33086
+ sglang/srt/managers/controller/cuda_graph_runner.py,sha256=0aRqA1_34oJ557Zn8PjpJecex5bBWJdnCmBlcDVvYO0,8509
+ sglang/srt/managers/controller/infer_batch.py,sha256=SKwCwhnZ_CNlG0mVCEc4X0e4HNjJFke-c8zdWP3TzjQ,34186
  sglang/srt/managers/controller/manager_multi.py,sha256=DT8Y9RF5OyTxlrLEZYz4claNWir3UrVztdOZaVPiA6g,6077
  sglang/srt/managers/controller/manager_single.py,sha256=2xO_iWK6tWvc0B31nKbe2N3klxwQBJmPTnFhNjzhVSI,4566
- sglang/srt/managers/controller/model_runner.py,sha256=UBvaHShjBWWFMWSEKeDh2tNqd0zWTwtfK37BbYR7c6w,13864
+ sglang/srt/managers/controller/model_runner.py,sha256=FwZ7FU7nhJsYhtoTNxYFc4e6oMEwSqOh8ohXOKtFPKc,15828
  sglang/srt/managers/controller/radix_cache.py,sha256=tx8LEQpqLxipw9UUVj4D1YQLMMDmWnjDYv8oDlOl-co,8210
- sglang/srt/managers/controller/schedule_heuristic.py,sha256=tw9WEiA_pzL4dkPnoS34SYhhQ3hJXBL6K03zRm2n_g8,2482
- sglang/srt/managers/controller/tp_worker.py,sha256=uyoAW4O08UPciRYxGBPK6U5jaVuwEOvKBjaeJNNAe8s,30531
+ sglang/srt/managers/controller/schedule_heuristic.py,sha256=SQAGzPS3aB_TPj7rnPBhewwyR6W1sVwW4D3zG3JUY00,2714
+ sglang/srt/managers/controller/tp_worker.py,sha256=yjz-Xzl0zEy4QSU-EYneZH5vi3oHtBuXTtYe4VuDp2g,30517
  sglang/srt/model_loader/model_loader.py,sha256=VS8VQL5ITN3akZ9eU_-uHWMan1axLMNG2_O12HzGysA,10132
  sglang/srt/model_loader/utils.py,sha256=I2PS5HIH5Cg-p7xKO_Cw_foK2vQ61xVc3zQv7CbeGEw,10120
  sglang/srt/models/chatglm.py,sha256=pH8g2Dj8qQLGPYpWVTb-IONfXsdfmpWi0-IEYNdSi4s,13296
  sglang/srt/models/commandr.py,sha256=hHsNQWi0X8rNL7_gpcoUxQxdhxtvx5_RVx8u6cLzqYQ,13606
  sglang/srt/models/dbrx.py,sha256=rRxOusGPu670ommeqXg62AllwB1apzE4yZoWc1fcr2M,14095
+ sglang/srt/models/deepseek.py,sha256=YtoPmv4fKmiH_jsRMSab9Wxq3aOZga9pCPGnkCs3Vvs,15457
  sglang/srt/models/gemma.py,sha256=DweoalfWYhLL-ZWLAO5gl4SCZflWmejVeDG3Vky_WNo,11719
  sglang/srt/models/gemma2.py,sha256=x3Dua-TVwRm5fJjo5UDekdoWqwt9xYbMuB-ogfXyiT8,15860
+ sglang/srt/models/gpt_bigcode.py,sha256=XHO1naPdXfiKYQRQ6uZe1fN3PBDhKH3-bchsaaZvfE4,9637
  sglang/srt/models/grok.py,sha256=611zrlIchvFaVfztRdBY7z97oU3KB-anykbOZy1hK6M,27295
  sglang/srt/models/internlm2.py,sha256=8MNcwxU5Th9IxWa314HqqmbCRlPUFScnfneBDs0riIU,11659
- sglang/srt/models/llama2.py,sha256=i97Ib4zq0-AbW7Wwp_ctFWnK528vipmlZVD_a7gB8L8,13819
+ sglang/srt/models/llama2.py,sha256=OyAf_lun5aZEsT80WmrIYBF8QXTXRpW8sUlylr4AZIc,14204
  sglang/srt/models/llama_classification.py,sha256=foCPvNyP2bTZ0YcRBF-qkmBv-gT24lhLNCXP30Oq4VU,4370
  sglang/srt/models/llava.py,sha256=vBI6EEeOG_9o23Shi9h8k58rxTOHZnSKMmPl3B3Q3uc,17924
  sglang/srt/models/llavavid.py,sha256=SrNQ-U2wekHvP_up-ZXRkCSros2NzheHpPfXHrp0YBU,13050
@@ -88,16 +80,13 @@ sglang/srt/models/qwen2_moe.py,sha256=oHNoo45myV5kitkls2GWVzuGt1Q4pRHN2nLlXEltFI
  sglang/srt/models/stablelm.py,sha256=Z_XCDSHY_QMz3lZwwkZdIZjEOizZjLYJU9GDi8o08qQ,10802
  sglang/srt/models/yivl.py,sha256=55KPrQ-dVplI0hh2WCSugjc1luE0J2UAafjZxu_7Xuc,4367
  sglang/srt/openai_api/adapter.py,sha256=eirFYVGIp5D-UrQLqW5dRJOQYKmzF9nmgCzFeUOb2z8,15737
- sglang/srt/openai_api/api_adapter.py,sha256=eirFYVGIp5D-UrQLqW5dRJOQYKmzF9nmgCzFeUOb2z8,15737
- sglang/srt/openai_api/openai_api_adapter.py,sha256=5pDaktIEteHxp3qN89U_U3ndd7N0FIfUZAM06YeziUY,15687
- sglang/srt/openai_api/openai_protocol.py,sha256=lGBhfxG6jmgUkMOh2NpBK9w9TUTRZKrsfHdW7XYhKKI,5700
  sglang/srt/openai_api/protocol.py,sha256=j7ifIR2SFQxTwaHAd9ksM096vfffcNltzTH4sg7H0RA,5739
  sglang/test/test_conversation.py,sha256=gF_AyOxQgpPQBPnA57-kq-M0p_zFu-rBDMFgAq655Rw,1596
  sglang/test/test_openai_protocol.py,sha256=DVx3r6hrb8oRqbo5AYIleldxbqMBTtb-gtORM6t_Y1c,1661
  sglang/test/test_programs.py,sha256=uefeHUFKT2NJESOujj-CsnPXdw1aQQN2TzUbPCHJjGs,13654
  sglang/test/test_utils.py,sha256=kD_fQe3WroZ9Kc3NBRKPiZOFJ_JD2uEE9XIvPp6AD9Y,11048
- sglang-0.1.22.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- sglang-0.1.22.dist-info/METADATA,sha256=O1pihQWf_523B_fgluftctwOxcou6oj13_Wuquj7ztU,30691
- sglang-0.1.22.dist-info/WHEEL,sha256=rWxmBtp7hEUqVLOnTaDOPpR-cZpCDkzhhcBce-Zyd5k,91
- sglang-0.1.22.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
- sglang-0.1.22.dist-info/RECORD,,
+ sglang-0.1.25.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ sglang-0.1.25.dist-info/METADATA,sha256=Ifwh2YdZqQXMe2UCOklWFIGeM0KLkfLjBQHv98gS8Pw,30928
+ sglang-0.1.25.dist-info/WHEEL,sha256=Wyh-_nZ0DJYolHNn1_hMa4lM7uDedD_RGVwbmTjyItk,91
+ sglang-0.1.25.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+ sglang-0.1.25.dist-info/RECORD,,
{sglang-0.1.22.dist-info → sglang-0.1.25.dist-info}/WHEEL CHANGED
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (71.0.4)
+ Generator: setuptools (71.1.0)
  Root-Is-Purelib: true
  Tag: py3-none-any

{sglang-0.1.22.dist-info → sglang-0.1.25.dist-info}/LICENSE and top_level.txt: renamed, file contents unchanged
sglang/backend/anthropic.py DELETED
@@ -1,77 +0,0 @@
- from typing import List, Optional, Union
-
- import numpy as np
-
- from sglang.backend.base_backend import BaseBackend
- from sglang.lang.chat_template import get_chat_template
- from sglang.lang.interpreter import StreamExecutor
- from sglang.lang.ir import SglSamplingParams
-
- try:
-     import anthropic
- except ImportError as e:
-     anthropic = e
-
-
- class Anthropic(BaseBackend):
-     def __init__(self, model_name, *args, **kwargs):
-         super().__init__()
-
-         if isinstance(anthropic, Exception):
-             raise anthropic
-
-         self.model_name = model_name
-         self.chat_template = get_chat_template("claude")
-         self.client = anthropic.Anthropic(*args, **kwargs)
-
-     def get_chat_template(self):
-         return self.chat_template
-
-     def generate(
-         self,
-         s: StreamExecutor,
-         sampling_params: SglSamplingParams,
-     ):
-         if s.messages_:
-             messages = s.messages_
-         else:
-             messages = [{"role": "user", "content": s.text_}]
-
-         if messages and messages[0]["role"] == "system":
-             system = messages.pop(0)["content"]
-         else:
-             system = ""
-
-         ret = self.client.messages.create(
-             model=self.model_name,
-             system=system,
-             messages=messages,
-             **sampling_params.to_anthropic_kwargs(),
-         )
-         comp = ret.content[0].text
-
-         return comp, {}
-
-     def generate_stream(
-         self,
-         s: StreamExecutor,
-         sampling_params: SglSamplingParams,
-     ):
-         if s.messages_:
-             messages = s.messages_
-         else:
-             messages = [{"role": "user", "content": s.text_}]
-
-         if messages and messages[0]["role"] == "system":
-             system = messages.pop(0)["content"]
-         else:
-             system = ""
-
-         with self.client.messages.stream(
-             model=self.model_name,
-             system=system,
-             messages=messages,
-             **sampling_params.to_anthropic_kwargs(),
-         ) as stream:
-             for text in stream.text_stream:
-                 yield text, {}
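The entire legacy `sglang/backend/` package (Anthropic, OpenAI, LiteLLM, VertexAI, runtime endpoint) is removed in 0.1.25; the RECORD section above shows the surviving copies under `sglang/lang/backend/`. A hedged migration sketch, assuming the class names under the new package match the old ones:

```
# Old (removed in 0.1.25):
#   from sglang.backend.runtime_endpoint import RuntimeEndpoint
# New location (module path confirmed by the RECORD; class name assumed unchanged):
from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint

backend = RuntimeEndpoint("http://localhost:30000")
```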