sglang 0.1.22__py3-none-any.whl → 0.1.25__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -2
- sglang/bench_serving.py +243 -25
- sglang/global_config.py +3 -2
- sglang/lang/interpreter.py +1 -0
- sglang/srt/hf_transformers_utils.py +13 -1
- sglang/srt/layers/logits_processor.py +4 -5
- sglang/srt/layers/radix_attention.py +38 -49
- sglang/srt/managers/controller/cuda_graph_runner.py +58 -16
- sglang/srt/managers/controller/infer_batch.py +51 -22
- sglang/srt/managers/controller/model_runner.py +58 -4
- sglang/srt/managers/controller/schedule_heuristic.py +8 -3
- sglang/srt/managers/controller/tp_worker.py +9 -11
- sglang/srt/memory_pool.py +13 -5
- sglang/srt/models/deepseek.py +430 -0
- sglang/srt/models/gpt_bigcode.py +282 -0
- sglang/srt/models/llama2.py +19 -10
- sglang/srt/server.py +26 -1
- sglang/srt/server_args.py +12 -6
- sglang/srt/utils.py +93 -1
- sglang/version.py +1 -0
- {sglang-0.1.22.dist-info → sglang-0.1.25.dist-info}/METADATA +10 -6
- {sglang-0.1.22.dist-info → sglang-0.1.25.dist-info}/RECORD +25 -36
- {sglang-0.1.22.dist-info → sglang-0.1.25.dist-info}/WHEEL +1 -1
- sglang/backend/__init__.py +0 -0
- sglang/backend/anthropic.py +0 -77
- sglang/backend/base_backend.py +0 -80
- sglang/backend/litellm.py +0 -90
- sglang/backend/openai.py +0 -438
- sglang/backend/runtime_endpoint.py +0 -283
- sglang/backend/vertexai.py +0 -149
- sglang/bench.py +0 -627
- sglang/srt/managers/controller/dp_worker.py +0 -113
- sglang/srt/openai_api/api_adapter.py +0 -432
- sglang/srt/openai_api/openai_api_adapter.py +0 -431
- sglang/srt/openai_api/openai_protocol.py +0 -207
- sglang/srt/openai_api_adapter.py +0 -411
- sglang/srt/openai_protocol.py +0 -207
- {sglang-0.1.22.dist-info → sglang-0.1.25.dist-info}/LICENSE +0 -0
- {sglang-0.1.22.dist-info → sglang-0.1.25.dist-info}/top_level.txt +0 -0
sglang/srt/models/llama2.py
CHANGED
@@ -5,14 +5,10 @@
 from typing import Any, Dict, Iterable, Optional, Tuple

 import torch
-import tqdm
 from torch import nn
 from transformers import LlamaConfig
 from vllm.config import CacheConfig
-from vllm.distributed import (
-    get_tensor_model_parallel_rank,
-    get_tensor_model_parallel_world_size,
-)
+from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
@@ -39,6 +35,7 @@ class LlamaMLP(nn.Module):
         intermediate_size: int,
         hidden_act: str,
         quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
     ) -> None:
         super().__init__()
         self.gate_up_proj = MergedColumnParallelLinear(
@@ -46,12 +43,14 @@ class LlamaMLP(nn.Module):
             [intermediate_size] * 2,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj",
         )
         self.down_proj = RowParallelLinear(
             intermediate_size,
             hidden_size,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.down_proj",
         )
         if hidden_act != "silu":
             raise ValueError(
@@ -70,6 +69,7 @@ class LlamaMLP(nn.Module):
 class LlamaAttention(nn.Module):
     def __init__(
         self,
+        config: LlamaConfig,
         hidden_size: int,
         num_heads: int,
         num_kv_heads: int,
@@ -79,6 +79,7 @@ class LlamaAttention(nn.Module):
         rope_is_neox_style: bool = True,
         max_position_embeddings: int = 8192,
         quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
     ) -> None:
         super().__init__()
         self.hidden_size = hidden_size
@@ -96,7 +97,10 @@ class LlamaAttention(nn.Module):
         # the KV heads across multiple tensor parallel GPUs.
         assert tp_size % self.total_num_kv_heads == 0
         self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
-        self.head_dim = hidden_size // self.total_num_heads
+        # MistralConfig has an optional head_dim introduced by Mistral-Nemo
+        self.head_dim = getattr(
+            config, "head_dim", self.hidden_size // self.total_num_heads
+        )
         self.q_size = self.num_heads * self.head_dim
         self.kv_size = self.num_kv_heads * self.head_dim
         self.scaling = self.head_dim**-0.5
@@ -110,12 +114,14 @@ class LlamaAttention(nn.Module):
             self.total_num_kv_heads,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
         )
         self.o_proj = RowParallelLinear(
             self.total_num_heads * self.head_dim,
             hidden_size,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
         )

         self.rotary_emb = get_rope(
@@ -154,6 +160,7 @@ class LlamaDecoderLayer(nn.Module):
         config: LlamaConfig,
         layer_id: int = 0,
         quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
     ) -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
@@ -168,6 +175,7 @@ class LlamaDecoderLayer(nn.Module):
         rope_is_neox_style = getattr(config, "rope_is_neox_style", True)
         max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
         self.self_attn = LlamaAttention(
+            config=config,
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
             num_kv_heads=config.num_key_value_heads,
@@ -177,12 +185,14 @@ class LlamaDecoderLayer(nn.Module):
             rope_is_neox_style=rope_is_neox_style,
             max_position_embeddings=max_position_embeddings,
             quant_config=quant_config,
+            prefix=f"{prefix}.self_attn",
         )
         self.mlp = LlamaMLP(
             hidden_size=self.hidden_size,
             intermediate_size=config.intermediate_size,
             hidden_act=config.hidden_act,
             quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
         )
         self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.post_attention_layernorm = RMSNorm(
@@ -230,7 +240,9 @@ class LlamaModel(nn.Module):
         )
         self.layers = nn.ModuleList(
             [
-                LlamaDecoderLayer(config, i, quant_config=quant_config)
+                LlamaDecoderLayer(
+                    config, i, quant_config=quant_config, prefix=f"model.layers.{i}"
+                )
                 for i in range(config.num_hidden_layers)
             ]
         )
@@ -370,9 +382,6 @@ class LlamaForCausalLM(nn.Module):
             weight_loader(param, loaded_weight)

         if name is None or loaded_weight is None:
-            if get_tensor_model_parallel_rank() == 0:
-                weights = tqdm.tqdm(weights, total=int(len(params_dict) * 1.5))
-
             for name, loaded_weight in weights:
                 load_weights_per_param(name, loaded_weight)
         else:
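The functional change in this file is the `head_dim` fallback, which lets checkpoints such as Mistral-Nemo declare a head dimension that differs from `hidden_size // num_heads`; the `prefix` plumbing only threads layer names into the quantized linear layers. A minimal, self-contained sketch of the fallback pattern (the `DummyConfig` object below is hypothetical and not part of the diff):

```python
from dataclasses import dataclass


@dataclass
class DummyConfig:
    # Mistral-Nemo-style values: head_dim is set explicitly and does not equal
    # hidden_size // num_attention_heads (5120 // 32 == 160).
    hidden_size: int = 5120
    num_attention_heads: int = 32
    head_dim: int = 128


def resolve_head_dim(config) -> int:
    # Fall back to the classic Llama formula when the config carries no head_dim attribute.
    return getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)


print(resolve_head_dim(DummyConfig()))  # 128, not 160
```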
sglang/srt/server.py
CHANGED
@@ -52,6 +52,7 @@ from sglang.srt.utils import (
     allocate_init_ports,
     assert_pkg_version,
     enable_show_time_cost,
+    maybe_set_triton_cache_manager,
     set_ulimit,
 )
 from sglang.utils import get_exception_traceback
@@ -157,6 +158,19 @@ def _set_global_server_args(server_args: ServerArgs):
     }


+def _set_torch_compile_config():
+    # The following configurations are for torch compile optimizations
+    import torch._dynamo.config
+    import torch._inductor.config
+
+    torch._inductor.config.coordinate_descent_tuning = True
+    torch._inductor.config.triton.unique_kernel_names = True
+    torch._inductor.config.fx_graph_cache = True  # Experimental feature to reduce compilation times, will be on by default in future
+
+    # FIXME: tmp workaround
+    torch._dynamo.config.accumulated_cache_size_limit = 256
+
+
 def launch_server(
     server_args: ServerArgs,
     model_overide_args: Optional[dict] = None,
@@ -174,6 +188,7 @@ def launch_server(
     os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
     os.environ["NCCL_CUMEM_ENABLE"] = "0"
     os.environ["NCCL_NVLS_ENABLE"] = "0"
+    os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
     set_ulimit()
     if server_args.show_time_cost:
         enable_show_time_cost()
@@ -182,14 +197,23 @@
     if not server_args.disable_flashinfer:
         assert_pkg_version(
             "flashinfer",
-            "0.1.
+            "0.1.1",
             "Please uninstall the old version and "
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
         )
+
+    if server_args.tp_size // server_args.dp_size > 1:
+        # FIXME: remove this after https://github.com/triton-lang/triton/pull/4295 is used as a dependency.
+        maybe_set_triton_cache_manager()
+
     if server_args.chat_template:
         # TODO: replace this with huggingface transformers template
         load_chat_template_for_openai_api(server_args.chat_template)
+
+    if server_args.enable_torch_compile:
+        _set_torch_compile_config()
+
     _set_global_server_args(server_args)

     # Allocate ports
@@ -205,6 +229,7 @@
         detokenizer_port=ports[2],
         nccl_ports=ports[3:],
     )
+    logger.info(f"{server_args=}")

     # Handle multi-node tensor parallelism
     if server_args.nnodes > 1:
sglang/srt/server_args.py
CHANGED
@@ -29,7 +29,7 @@ class ServerArgs:
     max_prefill_tokens: Optional[int] = None
     max_running_requests: Optional[int] = None
     schedule_heuristic: str = "lpm"
-    schedule_conservativeness: float = 0
+    schedule_conservativeness: float = 1.0

     # Other runtime options
     tp_size: int = 1
@@ -55,6 +55,7 @@ class ServerArgs:
     disable_regex_jump_forward: bool = False
     disable_cuda_graph: bool = False
     disable_disk_cache: bool = False
+    enable_torch_compile: bool = False
     attention_reduce_in_fp32: bool = False
     enable_p2p_check: bool = False
     efficient_weight_load: bool = False
@@ -69,15 +70,15 @@ class ServerArgs:
             self.tokenizer_path = self.model_path
         if self.mem_fraction_static is None:
             if self.tp_size >= 16:
-                self.mem_fraction_static = 0.
+                self.mem_fraction_static = 0.80
             elif self.tp_size >= 8:
-                self.mem_fraction_static = 0.
+                self.mem_fraction_static = 0.84
             elif self.tp_size >= 4:
-                self.mem_fraction_static = 0.
+                self.mem_fraction_static = 0.86
             elif self.tp_size >= 2:
-                self.mem_fraction_static = 0.85
-            else:
                 self.mem_fraction_static = 0.88
+            else:
+                self.mem_fraction_static = 0.89
         if isinstance(self.additional_ports, int):
             self.additional_ports = [self.additional_ports]
         elif self.additional_ports is None:
@@ -317,6 +318,11 @@ class ServerArgs:
             action="store_true",
             help="Disable disk cache to avoid possible crashes related to file system or high concurrency.",
         )
+        parser.add_argument(
+            "--enable-torch-compile",
+            action="store_true",
+            help="Optimize the model with torch.compile, experimental feature.",
+        )
         parser.add_argument(
             "--attention-reduce-in-fp32",
             action="store_true",
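The new defaults reduce to a simple tp_size-to-fraction mapping. A small sketch (not the actual `ServerArgs.__post_init__` code) that mirrors the post-change behavior when `--mem-fraction-static` is left unset:

```python
def default_mem_fraction_static(tp_size: int) -> float:
    # Mirrors the updated defaults: the static memory pool takes a smaller
    # share of GPU memory as the tensor-parallel group grows.
    if tp_size >= 16:
        return 0.80
    if tp_size >= 8:
        return 0.84
    if tp_size >= 4:
        return 0.86
    if tp_size >= 2:
        return 0.88
    return 0.89


for tp in (1, 2, 4, 8, 16):
    print(f"tp={tp}: mem_fraction_static={default_mem_fraction_static(tp)}")
```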
sglang/srt/utils.py
CHANGED
@@ -18,10 +18,15 @@ import psutil
 import requests
 import torch
 import torch.distributed as dist
-import triton
 from fastapi.responses import JSONResponse
 from packaging import version as pkg_version
 from starlette.middleware.base import BaseHTTPMiddleware
+from triton.runtime.cache import (
+    FileCacheManager,
+    default_cache_dir,
+    default_dump_dir,
+    default_override_dir,
+)

 logger = logging.getLogger(__name__)

@@ -312,6 +317,9 @@ def suppress_other_loggers():
     logging.getLogger("vllm.distributed.device_communicators.pynccl").setLevel(
         logging.WARN
     )
+    logging.getLogger("vllm.distributed.device_communicators.shm_broadcast").setLevel(
+        logging.WARN
+    )
     logging.getLogger("vllm.selector").setLevel(logging.WARN)
     logging.getLogger("vllm.utils").setLevel(logging.WARN)

@@ -411,6 +419,90 @@ def monkey_patch_vllm_dummy_weight_loader():
     setattr(DummyModelLoader, "load_model", load_model)


+vllm_all_gather_backup = None
+
+
+def monkey_patch_vllm_all_gather(reverse: bool = False):
+    """Monkey patch all-gather to remove in-place operations."""
+    from torch.distributed import _functional_collectives as funcol
+    from vllm.distributed.parallel_state import GroupCoordinator
+
+    global vllm_all_gather_backup
+    if vllm_all_gather_backup is None:
+        vllm_all_gather_backup = GroupCoordinator.all_gather
+
+    def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor:
+        world_size = self.world_size
+        # Bypass the function if we are using only 1 GPU.
+        if world_size == 1:
+            return input_
+        assert (
+            -input_.dim() <= dim < input_.dim()
+        ), f"Invalid dim ({dim}) for input tensor with shape {input_.size()}"
+        if dim < 0:
+            # Convert negative dim to positive.
+            dim += input_.dim()
+        input_size = input_.size()
+        # Allocate output tensor.
+        output_tensor = torch.empty(
+            (world_size,) + input_size, dtype=input_.dtype, device=input_.device
+        )
+
+        output_tensor = funcol.all_gather_tensor(
+            input_, gather_dim=0, group=self.device_group
+        ).view((world_size,) + input_size)
+
+        # Reshape
+        output_tensor = output_tensor.movedim(0, dim)
+        output_tensor = output_tensor.reshape(
+            input_size[:dim] + (world_size * input_size[dim],) + input_size[dim + 1 :]
+        )
+        return output_tensor
+
+    if reverse:
+        setattr(GroupCoordinator, "all_gather", vllm_all_gather_backup)
+    else:
+        setattr(GroupCoordinator, "all_gather", all_gather)
+
+
+def maybe_set_triton_cache_manager() -> None:
+    """Set environment variable to tell Triton to use a
+    custom cache manager"""
+    cache_manger = os.environ.get("TRITON_CACHE_MANAGER", None)
+    if cache_manger is None:
+        manager = "sglang.srt.utils:CustomCacheManager"
+        logger.info("Setting Triton cache manager to: %s", manager)
+        os.environ["TRITON_CACHE_MANAGER"] = manager
+
+
+class CustomCacheManager(FileCacheManager):
+    # Adapted from: https://github.com/tdoublep/vllm/blob/3307522289fdfefe323b6c00d0db696651989a2f/vllm/triton_utils/custom_cache_manager.py
+    def __init__(self, key, override=False, dump=False):
+
+        self.key = key
+        self.lock_path = None
+        if dump:
+            self.cache_dir = default_dump_dir()
+            self.cache_dir = os.path.join(self.cache_dir, self.key)
+            self.lock_path = os.path.join(self.cache_dir, "lock")
+            os.makedirs(self.cache_dir, exist_ok=True)
+        elif override:
+            self.cache_dir = default_override_dir()
+            self.cache_dir = os.path.join(self.cache_dir, self.key)
+        else:
+            # create cache directory if it doesn't exist
+            self.cache_dir = (
+                os.getenv("TRITON_CACHE_DIR", "").strip() or default_cache_dir()
+            )
+            if self.cache_dir:
+                self.cache_dir = f"{self.cache_dir}_{os.getpid()}"
+                self.cache_dir = os.path.join(self.cache_dir, self.key)
+                self.lock_path = os.path.join(self.cache_dir, "lock")
+                os.makedirs(self.cache_dir, exist_ok=True)
+            else:
+                raise RuntimeError("Could not create or locate cache dir")
+
+
 API_KEY_HEADER_NAME = "X-API-Key"

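The `CustomCacheManager` above exists so that multiple worker processes do not race on a shared Triton kernel cache: it derives a per-process cache directory by appending the PID. A minimal illustration of that naming scheme (the helper below is hypothetical; the real selection happens through the `TRITON_CACHE_MANAGER` environment variable set by `maybe_set_triton_cache_manager()`):

```python
import os


def per_process_cache_dir(base_dir: str, key: str) -> str:
    # Suffix the base cache directory with the current PID so that two worker
    # processes never write partial kernel artifacts into the same directory.
    return os.path.join(f"{base_dir}_{os.getpid()}", key)


print(per_process_cache_dir(os.path.expanduser("~/.triton/cache"), "kernel-abc123"))
```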
sglang/version.py
ADDED
@@ -0,0 +1 @@
+__version__ = "0.1.25"
{sglang-0.1.22.dist-info → sglang-0.1.25.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.1.22
+Version: 0.1.25
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
 Version 2.0, January 2004
@@ -240,7 +240,7 @@ Requires-Dist: torch ; extra == 'srt'
 Requires-Dist: uvicorn ; extra == 'srt'
 Requires-Dist: uvloop ; extra == 'srt'
 Requires-Dist: zmq ; extra == 'srt'
-Requires-Dist: vllm ==0.5.
+Requires-Dist: vllm ==0.5.3.post1 ; extra == 'srt'
 Requires-Dist: outlines >=0.0.44 ; extra == 'srt'

 <div align="center">
@@ -282,6 +282,7 @@ The core features include:

 ### Method 1: With pip
 ```
+pip install --upgrade pip
 pip install "sglang[all]"

 # Install FlashInfer CUDA kernels
@@ -293,6 +294,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 git clone https://github.com/sgl-project/sglang.git
 cd sglang

+pip install --upgrade pip
 pip install -e "python[all]"

 # Install FlashInfer CUDA kernels
@@ -390,19 +392,20 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7
 ```
 - See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
-- Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-
+- Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
 ```
 # Node 0
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 0

 # Node 1
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1
 ```
 - If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/custom_chat_template.md).
+- To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.

 ### Supported Models

-- Llama / Llama 2 / Llama 3
+- Llama / Llama 2 / Llama 3 / Llama 3.1
 - Mistral / Mixtral
 - Gemma / Gemma 2
 - Qwen / Qwen 2 / Qwen 2 MoE
@@ -420,6 +423,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - Grok
 - ChatGLM
 - InternLM 2
+- Mistral NeMo

 Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).

{sglang-0.1.22.dist-info → sglang-0.1.25.dist-info}/RECORD
CHANGED
@@ -1,24 +1,17 @@
-sglang/__init__.py,sha256=
+sglang/__init__.py,sha256=UV7VlXhXrwi00Zg45iNB9KcnmrwLjdMtjMz06AiafY0,1151
 sglang/api.py,sha256=1JARbc1wNYF6tODdUpgmNgTyLOvMnxdTBctLvEwzGTY,5565
-sglang/bench.py,sha256=p34wnfMRdiedOUf9GKGZkkNxehmyTzK6Q1O20q_SGjY,21841
 sglang/bench_latency.py,sha256=UPy6WhrddMTDX7HqIeHNhCn5vF0YMOKxJlQRvhMC8zU,10552
-sglang/bench_serving.py,sha256=
+sglang/bench_serving.py,sha256=zKGgVX3S-ggUvOxvEM4AszzXRPRVU6NGNnBG5vAAvRY,34577
 sglang/check_env.py,sha256=CscuPMlf68dkgZf0m-FiLpUisNNDoihMck4qhLOeV1Q,4124
-sglang/global_config.py,sha256=
+sglang/global_config.py,sha256=QG-ABVJksKK_llvUx7fSZcmK4GGCs-hBUVcM4LCr7Nw,1749
 sglang/launch_server.py,sha256=Gg8CwNlTCCfg1dF65ZT9ePLxOT9LKtY79GhIPG6PCrU,358
 sglang/launch_server_llavavid.py,sha256=40uaazMsavKuk6YXFa5v37kdUpFGuealgJJeph1g8gU,1025
 sglang/utils.py,sha256=arJuwOAEX445M2NL9SAOi6jBNu0-cfU04PLAr-hIH3U,8168
-sglang/
-sglang/backend/anthropic.py,sha256=iJjXiDMZbtvX2XNG78MG9kM7SpZq9hmXVuzT_T18elw,2076
-sglang/backend/base_backend.py,sha256=APiMht4WYECLCOGRPCEUF6lX-an1vjVe2dWoMSgymWY,1831
-sglang/backend/litellm.py,sha256=ZqsEZXgxLge-Fh3SMr1XkVPU7z3FKntpRppNwd1a12s,2447
-sglang/backend/openai.py,sha256=Id4vDzfefG9R7AqJBMXqYmKHv2FMu0PBSYEGbK7Q510,14803
-sglang/backend/runtime_endpoint.py,sha256=PAdnQBj3yQNtgw8GH9F1ecGE7HhxGa2T7Tz_c--H2aE,9203
-sglang/backend/vertexai.py,sha256=98toR-L0OTi4dYHaSmmzJdlQ2qN_0lImoKZFlVgYLRE,4850
+sglang/version.py,sha256=Ej7LsXg-6CASlaEHsZkUoLDpYEfHeFKdIeXMIM0esgA,23
 sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sglang/lang/chat_template.py,sha256=psIlhaDo70twgLrx5Lgln03metLEA3-FZuixeI0Y7Ao,13309
 sglang/lang/compiler.py,sha256=UiXUmPR9wBAPtnORrLcyQX8Uh0ZL0nKeV8ZgBozAJPw,7531
-sglang/lang/interpreter.py,sha256=
+sglang/lang/interpreter.py,sha256=27j7H9p7TY4uUfF9f5E17FxK1xCNeNju4aut_PaWCrQ,29693
 sglang/lang/ir.py,sha256=5VVK2JnbspdysrhcGgkmp_JlAprd2XqqRnS_GfP_XWc,16645
 sglang/lang/tracer.py,sha256=borJmlSJOhg1RUndGRnilnR60eEZz2Y9aU7BpftsOxU,8287
 sglang/lang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -30,16 +23,14 @@ sglang/lang/backend/runtime_endpoint.py,sha256=TZ0NV89or5_3MIZZFnc1JXAAjnv7tCfeQ
 sglang/lang/backend/vertexai.py,sha256=O-iBLD-y3vq80UxnrAoJri7bxpgd-_eakZ88Cf8bEGA,4855
 sglang/srt/conversation.py,sha256=Il7JJuu4o42k2xdBWVfONNmstTsAM-4idX6AcEOnrXQ,15526
 sglang/srt/flush_cache.py,sha256=SJsbZnmDhH-gb9ch3hIwnI_nuwaOLlKvlXADyLBGENk,403
-sglang/srt/hf_transformers_utils.py,sha256=
-sglang/srt/memory_pool.py,sha256=
+sglang/srt/hf_transformers_utils.py,sha256=94mOI93B2xOmXKqfJfEoGxqHgwwlWNbPHgsA47AQJK8,11245
+sglang/srt/memory_pool.py,sha256=FhJk5GtYortO3MJIsMMQ-o49agwDHVX1aEQH2LITq6c,3949
 sglang/srt/mm_utils.py,sha256=OptgAHDX-73Bk4jAdr2BOAJtiEXJNzPrMhaM-dy275c,8889
 sglang/srt/model_config.py,sha256=lZu1D-XLVMETHS6FBMoPn8Uowa9QFGe95d3SuWrr2q8,5282
-sglang/srt/openai_api_adapter.py,sha256=iw-FquXQeM2Z4nxOoYGFPjTkIdgA8rQkh_IcmJRy-R0,15143
-sglang/srt/openai_protocol.py,sha256=lGBhfxG6jmgUkMOh2NpBK9w9TUTRZKrsfHdW7XYhKKI,5700
 sglang/srt/sampling_params.py,sha256=OI11asr1Bd_E5soDjih614v4flgWxdMZU9HAF0aBafQ,3062
-sglang/srt/server.py,sha256=
-sglang/srt/server_args.py,sha256=
-sglang/srt/utils.py,sha256=
+sglang/srt/server.py,sha256=DXhcJt0V24a7yhydP1abPrK1qqV3qt7r8cyOMVOAI4M,14611
+sglang/srt/server_args.py,sha256=aF6L35mEB-FU3BL_ooKuCIcOXLhYLxA9-MjpaOTQRCo,13189
+sglang/srt/utils.py,sha256=bUp3SLzbDms0dvuETaccDPAGRHOIGW5A61pqH62XiT0,20370
 sglang/srt/constrained/__init__.py,sha256=5LB3_mDTMW6wcRkFA5J2Rd5HPHHEKRyiELhe4gtlBYM,1472
 sglang/srt/constrained/base_cache.py,sha256=QQjmFEiT8jlOskJoZobhrDl2TKB-B4b1LPQo9JQCP_w,1405
 sglang/srt/constrained/fsm_cache.py,sha256=P4qNDHHxpKpTnYL_8V1R6OFXlUwbM6ZcBdzddpcBgb4,1135
@@ -48,33 +39,34 @@ sglang/srt/layers/context_flashattention_nopad.py,sha256=7ps_9W_ia9zikL9HqsSUwWH
 sglang/srt/layers/extend_attention.py,sha256=aYAAL9HZJpaSASp-ulMvbmSmyMcqdYUsgVQC-Lbm7_U,12008
 sglang/srt/layers/fused_moe.py,sha256=uyrbCaIHioq3G00xQUrCo53hYDoHzk5rep3Eji3oQiQ,20258
 sglang/srt/layers/linear.py,sha256=qLwFkOiRAljzE7LkAkLRdcCdVMk-t7b56jEjwQAuYDM,33953
-sglang/srt/layers/logits_processor.py,sha256=
-sglang/srt/layers/radix_attention.py,sha256=
+sglang/srt/layers/logits_processor.py,sha256=KyRYANCiq9Cfu_VPjrIbSBAlqN_clcAgF3JrG9waU5k,9674
+sglang/srt/layers/radix_attention.py,sha256=A3J_wOlysjblFXHgehAqRHBQmpYAHLyUovyLFsrMJ7A,6386
 sglang/srt/layers/token_attention.py,sha256=EJ4gjbVLfshOZ_vr1iB-Eq8_B-4F26n_wPDj6e1Zrww,7386
 sglang/srt/layers/quantization/__init__.py,sha256=PQFzdPpul98DvywBA6YMBOnrMjtHE1LMlMpJ7FM8J3I,1971
 sglang/srt/layers/quantization/fp8.py,sha256=jaqgRFnHC--IL8iqB6Qygi-KXYPYBKKqt_j4Rk55_h4,24946
 sglang/srt/managers/detokenizer_manager.py,sha256=8rN2cdMr61LWy07lingEqLnNy0W5Rebdn14IsTQ9PCs,5049
 sglang/srt/managers/io_struct.py,sha256=Y6jW3p0cNg0jcrEQNki1H8MMEWxwWA4p6Y-xVgUVWaI,5404
 sglang/srt/managers/tokenizer_manager.py,sha256=SbivhFhZUR9HU9pLTe93MlYprAFAHzOU3KMBA2piQUk,19308
-sglang/srt/managers/controller/cuda_graph_runner.py,sha256=
-sglang/srt/managers/controller/
-sglang/srt/managers/controller/infer_batch.py,sha256=phXzANqBUFyqFwRVl06bd5yBnGK2hem6qzf5i0lrTq0,33086
+sglang/srt/managers/controller/cuda_graph_runner.py,sha256=0aRqA1_34oJ557Zn8PjpJecex5bBWJdnCmBlcDVvYO0,8509
+sglang/srt/managers/controller/infer_batch.py,sha256=SKwCwhnZ_CNlG0mVCEc4X0e4HNjJFke-c8zdWP3TzjQ,34186
 sglang/srt/managers/controller/manager_multi.py,sha256=DT8Y9RF5OyTxlrLEZYz4claNWir3UrVztdOZaVPiA6g,6077
 sglang/srt/managers/controller/manager_single.py,sha256=2xO_iWK6tWvc0B31nKbe2N3klxwQBJmPTnFhNjzhVSI,4566
-sglang/srt/managers/controller/model_runner.py,sha256=
+sglang/srt/managers/controller/model_runner.py,sha256=FwZ7FU7nhJsYhtoTNxYFc4e6oMEwSqOh8ohXOKtFPKc,15828
 sglang/srt/managers/controller/radix_cache.py,sha256=tx8LEQpqLxipw9UUVj4D1YQLMMDmWnjDYv8oDlOl-co,8210
-sglang/srt/managers/controller/schedule_heuristic.py,sha256=
-sglang/srt/managers/controller/tp_worker.py,sha256=
+sglang/srt/managers/controller/schedule_heuristic.py,sha256=SQAGzPS3aB_TPj7rnPBhewwyR6W1sVwW4D3zG3JUY00,2714
+sglang/srt/managers/controller/tp_worker.py,sha256=yjz-Xzl0zEy4QSU-EYneZH5vi3oHtBuXTtYe4VuDp2g,30517
 sglang/srt/model_loader/model_loader.py,sha256=VS8VQL5ITN3akZ9eU_-uHWMan1axLMNG2_O12HzGysA,10132
 sglang/srt/model_loader/utils.py,sha256=I2PS5HIH5Cg-p7xKO_Cw_foK2vQ61xVc3zQv7CbeGEw,10120
 sglang/srt/models/chatglm.py,sha256=pH8g2Dj8qQLGPYpWVTb-IONfXsdfmpWi0-IEYNdSi4s,13296
 sglang/srt/models/commandr.py,sha256=hHsNQWi0X8rNL7_gpcoUxQxdhxtvx5_RVx8u6cLzqYQ,13606
 sglang/srt/models/dbrx.py,sha256=rRxOusGPu670ommeqXg62AllwB1apzE4yZoWc1fcr2M,14095
+sglang/srt/models/deepseek.py,sha256=YtoPmv4fKmiH_jsRMSab9Wxq3aOZga9pCPGnkCs3Vvs,15457
 sglang/srt/models/gemma.py,sha256=DweoalfWYhLL-ZWLAO5gl4SCZflWmejVeDG3Vky_WNo,11719
 sglang/srt/models/gemma2.py,sha256=x3Dua-TVwRm5fJjo5UDekdoWqwt9xYbMuB-ogfXyiT8,15860
+sglang/srt/models/gpt_bigcode.py,sha256=XHO1naPdXfiKYQRQ6uZe1fN3PBDhKH3-bchsaaZvfE4,9637
 sglang/srt/models/grok.py,sha256=611zrlIchvFaVfztRdBY7z97oU3KB-anykbOZy1hK6M,27295
 sglang/srt/models/internlm2.py,sha256=8MNcwxU5Th9IxWa314HqqmbCRlPUFScnfneBDs0riIU,11659
-sglang/srt/models/llama2.py,sha256=
+sglang/srt/models/llama2.py,sha256=OyAf_lun5aZEsT80WmrIYBF8QXTXRpW8sUlylr4AZIc,14204
 sglang/srt/models/llama_classification.py,sha256=foCPvNyP2bTZ0YcRBF-qkmBv-gT24lhLNCXP30Oq4VU,4370
 sglang/srt/models/llava.py,sha256=vBI6EEeOG_9o23Shi9h8k58rxTOHZnSKMmPl3B3Q3uc,17924
 sglang/srt/models/llavavid.py,sha256=SrNQ-U2wekHvP_up-ZXRkCSros2NzheHpPfXHrp0YBU,13050
@@ -88,16 +80,13 @@ sglang/srt/models/qwen2_moe.py,sha256=oHNoo45myV5kitkls2GWVzuGt1Q4pRHN2nLlXEltFI
 sglang/srt/models/stablelm.py,sha256=Z_XCDSHY_QMz3lZwwkZdIZjEOizZjLYJU9GDi8o08qQ,10802
 sglang/srt/models/yivl.py,sha256=55KPrQ-dVplI0hh2WCSugjc1luE0J2UAafjZxu_7Xuc,4367
 sglang/srt/openai_api/adapter.py,sha256=eirFYVGIp5D-UrQLqW5dRJOQYKmzF9nmgCzFeUOb2z8,15737
-sglang/srt/openai_api/api_adapter.py,sha256=eirFYVGIp5D-UrQLqW5dRJOQYKmzF9nmgCzFeUOb2z8,15737
-sglang/srt/openai_api/openai_api_adapter.py,sha256=5pDaktIEteHxp3qN89U_U3ndd7N0FIfUZAM06YeziUY,15687
-sglang/srt/openai_api/openai_protocol.py,sha256=lGBhfxG6jmgUkMOh2NpBK9w9TUTRZKrsfHdW7XYhKKI,5700
 sglang/srt/openai_api/protocol.py,sha256=j7ifIR2SFQxTwaHAd9ksM096vfffcNltzTH4sg7H0RA,5739
 sglang/test/test_conversation.py,sha256=gF_AyOxQgpPQBPnA57-kq-M0p_zFu-rBDMFgAq655Rw,1596
 sglang/test/test_openai_protocol.py,sha256=DVx3r6hrb8oRqbo5AYIleldxbqMBTtb-gtORM6t_Y1c,1661
 sglang/test/test_programs.py,sha256=uefeHUFKT2NJESOujj-CsnPXdw1aQQN2TzUbPCHJjGs,13654
 sglang/test/test_utils.py,sha256=kD_fQe3WroZ9Kc3NBRKPiZOFJ_JD2uEE9XIvPp6AD9Y,11048
-sglang-0.1.
-sglang-0.1.
-sglang-0.1.
-sglang-0.1.
-sglang-0.1.
+sglang-0.1.25.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+sglang-0.1.25.dist-info/METADATA,sha256=Ifwh2YdZqQXMe2UCOklWFIGeM0KLkfLjBQHv98gS8Pw,30928
+sglang-0.1.25.dist-info/WHEEL,sha256=Wyh-_nZ0DJYolHNn1_hMa4lM7uDedD_RGVwbmTjyItk,91
+sglang-0.1.25.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+sglang-0.1.25.dist-info/RECORD,,
sglang/backend/__init__.py
DELETED
File without changes
sglang/backend/anthropic.py
DELETED
@@ -1,77 +0,0 @@
-from typing import List, Optional, Union
-
-import numpy as np
-
-from sglang.backend.base_backend import BaseBackend
-from sglang.lang.chat_template import get_chat_template
-from sglang.lang.interpreter import StreamExecutor
-from sglang.lang.ir import SglSamplingParams
-
-try:
-    import anthropic
-except ImportError as e:
-    anthropic = e
-
-
-class Anthropic(BaseBackend):
-    def __init__(self, model_name, *args, **kwargs):
-        super().__init__()
-
-        if isinstance(anthropic, Exception):
-            raise anthropic
-
-        self.model_name = model_name
-        self.chat_template = get_chat_template("claude")
-        self.client = anthropic.Anthropic(*args, **kwargs)
-
-    def get_chat_template(self):
-        return self.chat_template
-
-    def generate(
-        self,
-        s: StreamExecutor,
-        sampling_params: SglSamplingParams,
-    ):
-        if s.messages_:
-            messages = s.messages_
-        else:
-            messages = [{"role": "user", "content": s.text_}]
-
-        if messages and messages[0]["role"] == "system":
-            system = messages.pop(0)["content"]
-        else:
-            system = ""
-
-        ret = self.client.messages.create(
-            model=self.model_name,
-            system=system,
-            messages=messages,
-            **sampling_params.to_anthropic_kwargs(),
-        )
-        comp = ret.content[0].text
-
-        return comp, {}
-
-    def generate_stream(
-        self,
-        s: StreamExecutor,
-        sampling_params: SglSamplingParams,
-    ):
-        if s.messages_:
-            messages = s.messages_
-        else:
-            messages = [{"role": "user", "content": s.text_}]
-
-        if messages and messages[0]["role"] == "system":
-            system = messages.pop(0)["content"]
-        else:
-            system = ""
-
-        with self.client.messages.stream(
-            model=self.model_name,
-            system=system,
-            messages=messages,
-            **sampling_params.to_anthropic_kwargs(),
-        ) as stream:
-            for text in stream.text_stream:
-                yield text, {}