sglang 0.3.6.post1__py3-none-any.whl → 0.3.6.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +2 -3
- sglang/bench_one_batch_server.py +2 -2
- sglang/check_env.py +7 -1
- sglang/lang/tracer.py +1 -1
- sglang/launch_server.py +0 -3
- sglang/srt/configs/model_config.py +2 -6
- sglang/srt/layers/attention/flashinfer_backend.py +3 -3
- sglang/srt/layers/sampler.py +1 -1
- sglang/srt/managers/detokenizer_manager.py +0 -2
- sglang/srt/managers/image_processor.py +6 -9
- sglang/srt/managers/schedule_batch.py +37 -1
- sglang/srt/managers/scheduler.py +8 -5
- sglang/srt/managers/session_controller.py +15 -4
- sglang/srt/models/llava.py +7 -1
- sglang/srt/server.py +2 -1
- sglang/srt/utils.py +14 -4
- sglang/test/test_utils.py +2 -2
- sglang/version.py +1 -1
- {sglang-0.3.6.post1.dist-info → sglang-0.3.6.post2.dist-info}/METADATA +3 -2
- {sglang-0.3.6.post1.dist-info → sglang-0.3.6.post2.dist-info}/RECORD +23 -23
- {sglang-0.3.6.post1.dist-info → sglang-0.3.6.post2.dist-info}/LICENSE +0 -0
- {sglang-0.3.6.post1.dist-info → sglang-0.3.6.post2.dist-info}/WHEEL +0 -0
- {sglang-0.3.6.post1.dist-info → sglang-0.3.6.post2.dist-info}/top_level.txt +0 -0
sglang/bench_one_batch.py
CHANGED
sglang/bench_one_batch_server.py
CHANGED
@@ -5,9 +5,9 @@ This script launches a server and uses the HTTP interface.
|
|
5
5
|
It accepts server arguments (the same as launch_server.py) and benchmark arguments (e.g., batch size, input lengths).
|
6
6
|
|
7
7
|
Usage:
|
8
|
-
python3 -m sglang.
|
8
|
+
python3 -m sglang.bench_one_batch_server --model meta-llama/Meta-Llama-3.1-8B --batch-size 1 16 64 --input-len 1024 --output-len 8
|
9
9
|
|
10
|
-
python3 -m sglang.
|
10
|
+
python3 -m sglang.bench_one_batch_server --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8
|
11
11
|
"""
|
12
12
|
|
13
13
|
import argparse
|
sglang/check_env.py
CHANGED
@@ -22,18 +22,24 @@ PACKAGE_LIST = [
|
|
22
22
|
"hf_transfer",
|
23
23
|
"huggingface_hub",
|
24
24
|
"interegular",
|
25
|
+
"modelscope",
|
26
|
+
"orjson",
|
27
|
+
"outlines",
|
28
|
+
"packaging",
|
25
29
|
"psutil",
|
26
30
|
"pydantic",
|
27
31
|
"multipart",
|
28
32
|
"zmq",
|
33
|
+
"torchao",
|
29
34
|
"uvicorn",
|
30
35
|
"uvloop",
|
31
36
|
"vllm",
|
32
|
-
"
|
37
|
+
"xgrammar",
|
33
38
|
"openai",
|
34
39
|
"tiktoken",
|
35
40
|
"anthropic",
|
36
41
|
"litellm",
|
42
|
+
"decord",
|
37
43
|
]
|
38
44
|
|
39
45
|
|
sglang/lang/tracer.py
CHANGED
sglang/launch_server.py
CHANGED
@@ -1,6 +1,5 @@
|
|
1
1
|
"""Launch the inference server."""
|
2
2
|
|
3
|
-
import os
|
4
3
|
import sys
|
5
4
|
|
6
5
|
from sglang.srt.server import launch_server
|
@@ -12,7 +11,5 @@ if __name__ == "__main__":
|
|
12
11
|
|
13
12
|
try:
|
14
13
|
launch_server(server_args)
|
15
|
-
except Exception as e:
|
16
|
-
raise e
|
17
14
|
finally:
|
18
15
|
kill_child_process()
|
@@ -14,13 +14,13 @@
|
|
14
14
|
|
15
15
|
import json
|
16
16
|
import logging
|
17
|
-
import os
|
18
17
|
from enum import IntEnum, auto
|
19
18
|
from typing import List, Optional
|
20
19
|
|
21
20
|
from transformers import PretrainedConfig
|
22
21
|
|
23
22
|
from sglang.srt.hf_transformers_utils import get_config, get_context_length
|
23
|
+
from sglang.srt.utils import get_bool_env_var
|
24
24
|
|
25
25
|
logger = logging.getLogger(__name__)
|
26
26
|
|
@@ -59,13 +59,9 @@ class ModelConfig:
|
|
59
59
|
|
60
60
|
# Derive context length
|
61
61
|
derived_context_len = get_context_length(self.hf_text_config)
|
62
|
-
allow_long_context = os.environ.get(
|
63
|
-
"SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN", None
|
64
|
-
)
|
65
|
-
|
66
62
|
if context_length is not None:
|
67
63
|
if context_length > derived_context_len:
|
68
|
-
if
|
64
|
+
if get_bool_env_var("SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN"):
|
69
65
|
logger.warning(
|
70
66
|
f"Warning: User-specified context_length ({context_length}) is greater than the derived context_length ({derived_context_len}). "
|
71
67
|
f"This may lead to incorrect model outputs or CUDA errors."
|
@@ -18,7 +18,7 @@ import triton.language as tl
|
|
18
18
|
from sglang.global_config import global_config
|
19
19
|
from sglang.srt.layers.attention import AttentionBackend
|
20
20
|
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
|
21
|
-
from sglang.srt.utils import is_flashinfer_available
|
21
|
+
from sglang.srt.utils import get_bool_env_var, is_flashinfer_available
|
22
22
|
|
23
23
|
if TYPE_CHECKING:
|
24
24
|
from sglang.srt.layers.radix_attention import RadixAttention
|
@@ -47,8 +47,8 @@ class FlashInferAttnBackend(AttentionBackend):
|
|
47
47
|
|
48
48
|
# Parse constants
|
49
49
|
if "SGLANG_FLASHINFER_USE_TENSOR_CORE" in os.environ:
|
50
|
-
self.decode_use_tensor_cores = (
|
51
|
-
|
50
|
+
self.decode_use_tensor_cores = get_bool_env_var(
|
51
|
+
"SGLANG_FLASHINFER_USE_TENSOR_CORE"
|
52
52
|
)
|
53
53
|
else:
|
54
54
|
if not _grouped_size_compiled_for_decode_kernels(
|
sglang/srt/layers/sampler.py
CHANGED
@@ -74,7 +74,7 @@ class Sampler(nn.Module):
|
|
74
74
|
filter_apply_order="joint",
|
75
75
|
)
|
76
76
|
|
77
|
-
if not torch.all(success):
|
77
|
+
if self.use_nan_detectioin and not torch.all(success):
|
78
78
|
logger.warning("Detected errors during sampling!")
|
79
79
|
batch_next_token_ids = torch.zeros_like(batch_next_token_ids)
|
80
80
|
elif global_server_args_dict["sampling_backend"] == "pytorch":
|
@@ -25,8 +25,6 @@ from sglang.srt.managers.io_struct import (
|
|
25
25
|
BatchEmbeddingOut,
|
26
26
|
BatchStrOut,
|
27
27
|
BatchTokenIDOut,
|
28
|
-
GetMemPoolSizeReqOutput,
|
29
|
-
UpdateWeightReqOutput,
|
30
28
|
)
|
31
29
|
from sglang.srt.managers.schedule_batch import FINISH_MATCHED_STR, FINISH_MATCHED_TOKEN
|
32
30
|
from sglang.srt.server_args import PortArgs, ServerArgs
|
@@ -131,6 +131,7 @@ class LlavaImageProcessor(BaseImageProcessor):
|
|
131
131
|
if not image_data:
|
132
132
|
return None
|
133
133
|
|
134
|
+
modalities = request_obj.modalities or ["image"]
|
134
135
|
aspect_ratio = getattr(self.hf_config, "image_aspect_ratio", None)
|
135
136
|
grid_pinpoints = (
|
136
137
|
self.hf_config.image_grid_pinpoints
|
@@ -139,9 +140,12 @@ class LlavaImageProcessor(BaseImageProcessor):
|
|
139
140
|
else None
|
140
141
|
)
|
141
142
|
|
143
|
+
if isinstance(image_data, str):
|
144
|
+
image_data = [image_data]
|
145
|
+
|
142
146
|
if isinstance(image_data, list) and len(image_data) > 0:
|
143
|
-
|
144
|
-
|
147
|
+
if "multi-images" in modalities or "video" in modalities:
|
148
|
+
# Multiple images
|
145
149
|
aspect_ratio = "pad" # LLaVA OneVision Handling: more than one image --> interleaved image mode or video mode. We do not use anyres
|
146
150
|
pixel_values, image_hashes, image_sizes = [], [], []
|
147
151
|
res = []
|
@@ -166,13 +170,6 @@ class LlavaImageProcessor(BaseImageProcessor):
|
|
166
170
|
)
|
167
171
|
image_hashes = [image_hash]
|
168
172
|
image_sizes = [image_size]
|
169
|
-
elif isinstance(image_data, str):
|
170
|
-
# A single image
|
171
|
-
pixel_values, image_hash, image_size = await self._process_single_image(
|
172
|
-
image_data, aspect_ratio, grid_pinpoints
|
173
|
-
)
|
174
|
-
image_hashes = [image_hash]
|
175
|
-
image_sizes = [image_size]
|
176
173
|
else:
|
177
174
|
raise ValueError(f"Invalid image data: {image_data}")
|
178
175
|
|
@@ -31,6 +31,7 @@ import dataclasses
|
|
31
31
|
import logging
|
32
32
|
from typing import List, Optional, Tuple, Union
|
33
33
|
|
34
|
+
import numpy as np
|
34
35
|
import torch
|
35
36
|
import triton
|
36
37
|
import triton.language as tl
|
@@ -167,6 +168,30 @@ class ImageInputs:
|
|
167
168
|
|
168
169
|
return ret
|
169
170
|
|
171
|
+
def merge(self, other, vocab_size):
|
172
|
+
assert self.pixel_values.shape[1:] == other.pixel_values.shape[1:]
|
173
|
+
self.pixel_values = np.concatenate([self.pixel_values, other.pixel_values])
|
174
|
+
self.image_hashes += other.image_hashes
|
175
|
+
|
176
|
+
self.pad_values = [
|
177
|
+
(self.image_hashes) % vocab_size,
|
178
|
+
(self.image_hashes >> 16) % vocab_size,
|
179
|
+
(self.image_hashes >> 32) % vocab_size,
|
180
|
+
(self.image_hashes >> 64) % vocab_size,
|
181
|
+
]
|
182
|
+
|
183
|
+
optional_args = [
|
184
|
+
"image_sizes",
|
185
|
+
"image_offsets",
|
186
|
+
# "modalities", # modalities should be ["multi-images"] (one entry) even for multiple images
|
187
|
+
"aspect_ratio_ids",
|
188
|
+
"aspect_ratio_mask",
|
189
|
+
"image_grid_thws",
|
190
|
+
]
|
191
|
+
for arg in optional_args:
|
192
|
+
if getattr(self, arg, None) is not None:
|
193
|
+
setattr(self, arg, getattr(self, arg) + getattr(other, arg))
|
194
|
+
|
170
195
|
|
171
196
|
class Req:
|
172
197
|
"""The input and output status of a request."""
|
@@ -177,6 +202,7 @@ class Req:
|
|
177
202
|
origin_input_text: str,
|
178
203
|
origin_input_ids: Tuple[int],
|
179
204
|
sampling_params: SamplingParams,
|
205
|
+
origin_input_ids_unpadded: Optional[Tuple[int]] = None,
|
180
206
|
lora_path: Optional[str] = None,
|
181
207
|
input_embeds: Optional[List[List[float]]] = None,
|
182
208
|
session_id: Optional[str] = None,
|
@@ -184,7 +210,11 @@ class Req:
|
|
184
210
|
# Input and output info
|
185
211
|
self.rid = rid
|
186
212
|
self.origin_input_text = origin_input_text
|
187
|
-
self.origin_input_ids_unpadded =
|
213
|
+
self.origin_input_ids_unpadded = (
|
214
|
+
origin_input_ids_unpadded
|
215
|
+
if origin_input_ids_unpadded
|
216
|
+
else origin_input_ids # Before image padding
|
217
|
+
)
|
188
218
|
self.origin_input_ids = origin_input_ids
|
189
219
|
self.output_ids = [] # Each decode stage's output ids
|
190
220
|
self.fill_ids = None # fill_ids = origin_input_ids + output_ids
|
@@ -260,6 +290,12 @@ class Req:
|
|
260
290
|
# The number of cached tokens, that were already cached in the KV cache
|
261
291
|
self.cached_tokens = 0
|
262
292
|
|
293
|
+
def extend_image_inputs(self, image_inputs, vocab_size):
|
294
|
+
if self.image_inputs is None:
|
295
|
+
self.image_inputs = image_inputs
|
296
|
+
else:
|
297
|
+
self.image_inputs.merge(image_inputs, vocab_size)
|
298
|
+
|
263
299
|
# whether request reached finished condition
|
264
300
|
def finished(self) -> bool:
|
265
301
|
return self.finished_reason is not None
|
sglang/srt/managers/scheduler.py
CHANGED
@@ -71,9 +71,10 @@ from sglang.srt.utils import (
|
|
71
71
|
broadcast_pyobj,
|
72
72
|
configure_logger,
|
73
73
|
crash_on_warnings,
|
74
|
+
get_bool_env_var,
|
74
75
|
get_zmq_socket,
|
75
|
-
gpu_proc_affinity,
|
76
76
|
kill_parent_process,
|
77
|
+
set_gpu_proc_affinity,
|
77
78
|
set_random_seed,
|
78
79
|
suppress_other_loggers,
|
79
80
|
)
|
@@ -82,7 +83,7 @@ from sglang.utils import get_exception_traceback
|
|
82
83
|
logger = logging.getLogger(__name__)
|
83
84
|
|
84
85
|
# Test retract decode
|
85
|
-
test_retract =
|
86
|
+
test_retract = get_bool_env_var("SGLANG_TEST_RETRACT")
|
86
87
|
|
87
88
|
|
88
89
|
class Scheduler:
|
@@ -559,12 +560,13 @@ class Scheduler:
|
|
559
560
|
|
560
561
|
# Image inputs
|
561
562
|
if recv_req.image_inputs is not None:
|
562
|
-
|
563
|
+
image_inputs = ImageInputs.from_dict(
|
563
564
|
recv_req.image_inputs, self.model_config.vocab_size
|
564
565
|
)
|
565
566
|
req.origin_input_ids = self.pad_input_ids_func(
|
566
|
-
req.
|
567
|
+
req.origin_input_ids, image_inputs
|
567
568
|
)
|
569
|
+
req.extend_image_inputs(image_inputs, self.model_config.vocab_size)
|
568
570
|
|
569
571
|
if len(req.origin_input_ids) > self.max_req_input_len:
|
570
572
|
req.finished_reason = FINISH_ABORT(
|
@@ -1404,7 +1406,8 @@ def run_scheduler_process(
|
|
1404
1406
|
pipe_writer,
|
1405
1407
|
):
|
1406
1408
|
# set cpu affinity to this gpu process
|
1407
|
-
|
1409
|
+
if get_bool_env_var("SGLANG_SET_CPU_AFFINITY"):
|
1410
|
+
set_gpu_proc_affinity(server_args.tp_size, server_args.nnodes, gpu_id)
|
1408
1411
|
|
1409
1412
|
# [For Router] if env var "DP_RANK" exist, set dp_rank to the value of the env var
|
1410
1413
|
if dp_rank is None and "DP_RANK" in os.environ:
|
@@ -41,16 +41,27 @@ class Session:
|
|
41
41
|
]
|
42
42
|
+ req.input_ids
|
43
43
|
)
|
44
|
+
input_ids_unpadded = (
|
45
|
+
self.reqs[-1].origin_input_ids_unpadded
|
46
|
+
+ self.reqs[-1].output_ids[
|
47
|
+
: self.reqs[-1].sampling_params.max_new_tokens
|
48
|
+
]
|
49
|
+
+ req.input_ids
|
50
|
+
)
|
44
51
|
else:
|
45
52
|
input_ids = req.input_ids
|
53
|
+
input_ids_unpadded = req.input_ids
|
46
54
|
new_req = Req(
|
47
|
-
req.rid,
|
48
|
-
None,
|
49
|
-
input_ids,
|
50
|
-
|
55
|
+
rid=req.rid,
|
56
|
+
origin_input_text=None,
|
57
|
+
origin_input_ids=input_ids,
|
58
|
+
origin_input_ids_unpadded=input_ids_unpadded,
|
59
|
+
sampling_params=req.sampling_params,
|
51
60
|
lora_path=req.lora_path,
|
52
61
|
session_id=self.session_id,
|
53
62
|
)
|
63
|
+
if len(self.reqs) > 0:
|
64
|
+
new_req.image_inputs = self.reqs[-1].image_inputs
|
54
65
|
new_req.tokenizer = tokenizer
|
55
66
|
if req.session_rid is not None and len(self.reqs) == 0:
|
56
67
|
new_req.finished_reason = FINISH_ABORT(
|
sglang/srt/models/llava.py
CHANGED
@@ -49,7 +49,13 @@ class LlavaBaseForCausalLM(nn.Module):
|
|
49
49
|
image_sizes, pad_values = image_inputs.image_sizes, image_inputs.pad_values
|
50
50
|
|
51
51
|
# hardcode for spatial_unpad + anyres
|
52
|
-
|
52
|
+
if image_inputs.modalities is not None and (
|
53
|
+
"multi-images" in image_inputs.modalities
|
54
|
+
or "video" in image_inputs.modalities
|
55
|
+
):
|
56
|
+
image_aspect_ratio = "pad"
|
57
|
+
else:
|
58
|
+
image_aspect_ratio = "anyres"
|
53
59
|
offset_list = []
|
54
60
|
for image_s in image_sizes:
|
55
61
|
if len(image_sizes) > 16:
|
sglang/srt/server.py
CHANGED
@@ -86,6 +86,7 @@ from sglang.srt.utils import (
|
|
86
86
|
set_ulimit,
|
87
87
|
)
|
88
88
|
from sglang.utils import get_exception_traceback
|
89
|
+
from sglang.version import __version__
|
89
90
|
|
90
91
|
logger = logging.getLogger(__name__)
|
91
92
|
|
@@ -455,7 +456,6 @@ def launch_engine(
|
|
455
456
|
data = scheduler_pipe_readers[i].recv()
|
456
457
|
|
457
458
|
if data["status"] != "ready":
|
458
|
-
self.shutdown()
|
459
459
|
raise RuntimeError(
|
460
460
|
"Initialization failed. Please see the error messages above."
|
461
461
|
)
|
@@ -528,6 +528,7 @@ async def _get_server_info():
|
|
528
528
|
**dataclasses.asdict(tokenizer_manager.server_args), # server args
|
529
529
|
"memory_pool_size": await tokenizer_manager.get_memory_pool_size(), # memory pool size
|
530
530
|
"max_total_num_tokens": _max_total_num_tokens, # max total num tokens
|
531
|
+
"version": __version__,
|
531
532
|
}
|
532
533
|
|
533
534
|
|
sglang/srt/utils.py
CHANGED
@@ -72,7 +72,7 @@ def is_flashinfer_available():
|
|
72
72
|
Check whether flashinfer is available.
|
73
73
|
As of Oct. 6, 2024, it is only available on NVIDIA GPUs.
|
74
74
|
"""
|
75
|
-
if
|
75
|
+
if not get_bool_env_var("SGLANG_IS_FLASHINFER_AVAILABLE", default="true"):
|
76
76
|
return False
|
77
77
|
return torch.cuda.is_available() and not is_hip()
|
78
78
|
|
@@ -517,6 +517,11 @@ def monkey_patch_vllm_p2p_access_check(gpu_id: int):
|
|
517
517
|
|
518
518
|
setattr(tgt, "gpu_p2p_access_check", lambda *arg, **kwargs: True)
|
519
519
|
|
520
|
+
# Suppress the warnings from this delete function when using sglang.bench_one_batch
|
521
|
+
from vllm.distributed.device_communicators.custom_all_reduce import CustomAllreduce
|
522
|
+
|
523
|
+
setattr(CustomAllreduce, "__del__", lambda *args, **kwargs: None)
|
524
|
+
|
520
525
|
|
521
526
|
vllm_all_gather_backup = None
|
522
527
|
|
@@ -626,7 +631,7 @@ def add_api_key_middleware(app, api_key: str):
|
|
626
631
|
|
627
632
|
|
628
633
|
def prepare_model_and_tokenizer(model_path: str, tokenizer_path: str):
|
629
|
-
if "SGLANG_USE_MODELSCOPE"
|
634
|
+
if get_bool_env_var("SGLANG_USE_MODELSCOPE"):
|
630
635
|
if not os.path.exists(model_path):
|
631
636
|
from modelscope import snapshot_download
|
632
637
|
|
@@ -931,7 +936,7 @@ def get_nvgpu_memory_capacity():
|
|
931
936
|
|
932
937
|
def crash_on_warnings():
|
933
938
|
# Crash on warning if we are running CI tests
|
934
|
-
return
|
939
|
+
return get_bool_env_var("SGLANG_IS_IN_CI")
|
935
940
|
|
936
941
|
|
937
942
|
def get_device_name(device_id: int = 0) -> str:
|
@@ -990,7 +995,7 @@ def direct_register_custom_op(
|
|
990
995
|
my_lib._register_fake(op_name, fake_impl)
|
991
996
|
|
992
997
|
|
993
|
-
def
|
998
|
+
def set_gpu_proc_affinity(
|
994
999
|
tp_size: int,
|
995
1000
|
nnodes: int,
|
996
1001
|
gpu_id: int,
|
@@ -1022,3 +1027,8 @@ def gpu_proc_affinity(
|
|
1022
1027
|
# set cpu_affinity to current process
|
1023
1028
|
p.cpu_affinity(bind_cpu_ids)
|
1024
1029
|
logger.info(f"Process {pid} gpu_id {gpu_id} is running on CPUs: {p.cpu_affinity()}")
|
1030
|
+
|
1031
|
+
|
1032
|
+
def get_bool_env_var(name: str, default: str = "false") -> bool:
|
1033
|
+
value = os.getenv(name, default)
|
1034
|
+
return value.lower() in ("true", "1")
|
sglang/test/test_utils.py
CHANGED
@@ -22,7 +22,7 @@ from sglang.bench_serving import run_benchmark
|
|
22
22
|
from sglang.global_config import global_config
|
23
23
|
from sglang.lang.backend.openai import OpenAI
|
24
24
|
from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
|
25
|
-
from sglang.srt.utils import kill_child_process
|
25
|
+
from sglang.srt.utils import get_bool_env_var, kill_child_process
|
26
26
|
from sglang.test.run_eval import run_eval
|
27
27
|
from sglang.utils import get_exception_traceback
|
28
28
|
|
@@ -44,7 +44,7 @@ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8
|
|
44
44
|
|
45
45
|
def is_in_ci():
|
46
46
|
"""Return whether it is in CI runner."""
|
47
|
-
return
|
47
|
+
return get_bool_env_var("SGLANG_IS_IN_CI")
|
48
48
|
|
49
49
|
|
50
50
|
if is_in_ci():
|
sglang/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.3.6.
|
1
|
+
__version__ = "0.3.6.post2"
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: sglang
|
3
|
-
Version: 0.3.6.
|
3
|
+
Version: 0.3.6.post2
|
4
4
|
Summary: SGLang is yet another fast serving framework for large language models and vision language models.
|
5
5
|
License: Apache License
|
6
6
|
Version 2.0, January 2004
|
@@ -240,6 +240,7 @@ Provides-Extra: srt
|
|
240
240
|
Requires-Dist: sglang[runtime_common]; extra == "srt"
|
241
241
|
Requires-Dist: torch; extra == "srt"
|
242
242
|
Requires-Dist: vllm>=0.6.3.post1; extra == "srt"
|
243
|
+
Requires-Dist: cuda-python; extra == "srt"
|
243
244
|
Provides-Extra: srt-hip
|
244
245
|
Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
|
245
246
|
Requires-Dist: torch; extra == "srt-hip"
|
@@ -350,7 +351,7 @@ Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
|
|
350
351
|
[Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
|
351
352
|
|
352
353
|
## Adoption and Sponsorship
|
353
|
-
The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, NVIDIA, RunPod, Stanford, UC Berkeley, and
|
354
|
+
The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, NVIDIA, RunPod, Stanford, UC Berkeley, xAI and 01.AI.
|
354
355
|
|
355
356
|
## Acknowledgment and Citation
|
356
357
|
We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
|
@@ -2,22 +2,22 @@ sglang/__init__.py,sha256=3M0oz0ZA8fULhV5LwQ4hxh-MRdHsOJRD1D63C60pdG4,1616
|
|
2
2
|
sglang/api.py,sha256=NdO6cYnklnEBQBKqQjlqI8-P1EownKQ71t5ibCGhEVo,6953
|
3
3
|
sglang/bench_latency.py,sha256=oZjSAzX7dUiSu-zdz0dkyUPo-qAX_lsXFH1gf03akgI,76
|
4
4
|
sglang/bench_offline_throughput.py,sha256=z6uA6Gxa_nFZa0cOXi7MJDuX82xcqk5WfqBMavd8a-s,10929
|
5
|
-
sglang/bench_one_batch.py,sha256=
|
6
|
-
sglang/bench_one_batch_server.py,sha256=
|
5
|
+
sglang/bench_one_batch.py,sha256=AVMpCBWEsMI2TlMK55JPgPJu0kHg8DI0WV_Bhd4pJgc,15668
|
6
|
+
sglang/bench_one_batch_server.py,sha256=hYc3r9JQOLrfqmKgKPOmP0Kr63Sya9wPV_dHzMRZ2Dw,5924
|
7
7
|
sglang/bench_serving.py,sha256=hI7FjaERyqKBrYtKewDU6E4rSufKxqsUPyUgtWtTKSI,52545
|
8
|
-
sglang/check_env.py,sha256=
|
8
|
+
sglang/check_env.py,sha256=rE4ZAG0e6M-Xd-qdHcKclN8Qav6b9gEh4yvlV_TbOg0,5450
|
9
9
|
sglang/global_config.py,sha256=fnT0U9vlHdGaQFKN9tYTnUF4-eVW4HYQURd5zvPtrg0,1286
|
10
|
-
sglang/launch_server.py,sha256=
|
10
|
+
sglang/launch_server.py,sha256=U17c44CbbpMBm2JQxVLaz1mfUKk7PgBDhTLAFNeJEvI,362
|
11
11
|
sglang/launch_server_llavavid.py,sha256=tGc17S1vUfLwbi1GB26oOdXxTWr7gjlqpTrPnrMRNO8,1007
|
12
12
|
sglang/utils.py,sha256=eCvD3fZCALr-MuyZxJL7HAeeqqpxAxf4LJrf7OiCbco,11547
|
13
|
-
sglang/version.py,sha256=
|
13
|
+
sglang/version.py,sha256=_Aams_yVBpGe9-85k-kF3qpgcd3D_AsWkVfMFmCWh3c,28
|
14
14
|
sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
15
15
|
sglang/lang/chat_template.py,sha256=jprS3-In2FTUoedKwZg-HYvDwU8RTIYntOlf2zoN2sU,14814
|
16
16
|
sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
|
17
17
|
sglang/lang/compiler.py,sha256=o1C6G3TzhjSlsH-doTPy5oiVehr57dxNTa5oZw5TTAI,7639
|
18
18
|
sglang/lang/interpreter.py,sha256=SBjejhLhTKzNM0HbjtTg5r17WPJ64WFSk6lcM_SCWKs,30717
|
19
19
|
sglang/lang/ir.py,sha256=zpzzAO1YVldhE95Vwz5hU_TQltu-xt8A6rfFr0PuIDA,18410
|
20
|
-
sglang/lang/tracer.py,sha256=
|
20
|
+
sglang/lang/tracer.py,sha256=o-jLAPPSuy2vBfsGGrTAnbuWtORzQ50B4C_P5zvYkx8,8291
|
21
21
|
sglang/lang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
22
22
|
sglang/lang/backend/anthropic.py,sha256=EXRX7xJgA5KZszX7toSLVnKzFQ5EO0Loj-YjHFtxSxg,2081
|
23
23
|
sglang/lang/backend/base_backend.py,sha256=tdoh9YF3CyekY1BKiX9n7-aA4srDWIuA4RDJLM7q8qg,1985
|
@@ -29,12 +29,12 @@ sglang/srt/conversation.py,sha256=u9zFU8aMYzwHUbQRKU76B_T-jfLlPoxUcWG_nRbDM2I,21
|
|
29
29
|
sglang/srt/hf_transformers_utils.py,sha256=sUUCpjbTHuYDMuwOaz00nH5fataXKjliD8gCxXU64sw,6712
|
30
30
|
sglang/srt/mm_utils.py,sha256=1ScBunw_x4W8ebM_AcJ62-1T2mfT8NlMJqdAhkF1lb0,12367
|
31
31
|
sglang/srt/model_parallel.py,sha256=QR-Alqo0sElDXPJ79N1PhUHHKiEHPQn3dyXduMP-SHQ,3664
|
32
|
-
sglang/srt/server.py,sha256=
|
32
|
+
sglang/srt/server.py,sha256=tH_22tnksy3bbhYu_njjx5L59pb9lJ7tU40Z2BLoiaI,30894
|
33
33
|
sglang/srt/server_args.py,sha256=CfmpU6_EDnxJzpJiRx2n6AhOPCtrHPOf-7wEtTF__L0,30834
|
34
|
-
sglang/srt/utils.py,sha256=
|
34
|
+
sglang/srt/utils.py,sha256=QXc01TOB7abpL6p3KzfP7u2xFZohQ-ThbI5DAJGoHeI,33894
|
35
35
|
sglang/srt/configs/__init__.py,sha256=_usVIXHQjft4PAJ1Y-yGQOn2QNOv501GYMlQwpGXbns,208
|
36
36
|
sglang/srt/configs/exaone.py,sha256=Duxd4yQoKy8GWEzZD_kCY_OzmN_67CTJL_Kgn0eXk3g,10731
|
37
|
-
sglang/srt/configs/model_config.py,sha256=
|
37
|
+
sglang/srt/configs/model_config.py,sha256=r5N_OO4w3_R3kZ80P-ZPECscXmspI41d1vc6uEE9ixM,9526
|
38
38
|
sglang/srt/configs/qwen2vl.py,sha256=AYHuFgJ0bwhWYkD7S6fvP7yJejJnuhy4xp5Q2W-O6ps,4424
|
39
39
|
sglang/srt/constrained/__init__.py,sha256=UWZNVLvOT5ZBX8M36sONgDmnKtkQ0cSfhQD2jO0ATuk,786
|
40
40
|
sglang/srt/constrained/base_grammar_backend.py,sha256=FhVm7PxhXDl0joV9NP5RjKgz7dR1dZvUAQnh0mdtvVY,2353
|
@@ -50,12 +50,12 @@ sglang/srt/layers/logits_processor.py,sha256=V8fHxeQK8lzUhGD2Xc7MY1Y9qBhzFyh6hqp
|
|
50
50
|
sglang/srt/layers/pooler.py,sha256=rj2lygvleBnyLCBZ8I11HGMgpfIDsT0l3PIkshJwdu4,1606
|
51
51
|
sglang/srt/layers/radix_attention.py,sha256=C_mK4mfmKlxMRNeKYP9E5R3PRd3eT-OcE_g3mo36dJM,2058
|
52
52
|
sglang/srt/layers/rotary_embedding.py,sha256=29tx3JNR40AoXqBa2cFGBjva9vU2xgFipETlpMaaZas,3985
|
53
|
-
sglang/srt/layers/sampler.py,sha256=
|
53
|
+
sglang/srt/layers/sampler.py,sha256=_enfER8MSxsCYrR6_NgyFxKA_XqKtii_asOZUFUUsd8,4580
|
54
54
|
sglang/srt/layers/torchao_utils.py,sha256=v0hyr4hLsM42QwOPCdKb-ftRTjVokBZbqvRj4O4C-Nw,3415
|
55
55
|
sglang/srt/layers/vocab_parallel_embedding.py,sha256=RmaZbgXbFnGKX1eGYxlmiko-6JwaJX6seHupUSCtAm8,21583
|
56
56
|
sglang/srt/layers/attention/__init__.py,sha256=EL1o6Q5vLgViN3pOr2A7F6K9FlNEpMdBypFAVMeq_HA,2445
|
57
57
|
sglang/srt/layers/attention/double_sparsity_backend.py,sha256=BlX7uXteQpnoOnKsdBKh8h20zMVMEiibB5F_PkZSlNI,10706
|
58
|
-
sglang/srt/layers/attention/flashinfer_backend.py,sha256=
|
58
|
+
sglang/srt/layers/attention/flashinfer_backend.py,sha256=ENnNbsA8bY--eFe-Ecqa2RRklH2-a7SV_yZRzpDKnDQ,24879
|
59
59
|
sglang/srt/layers/attention/triton_backend.py,sha256=gjxed2cvc2-8QEHkzyTVv6ui7oYOp2b_vgIUQVD1XuM,6538
|
60
60
|
sglang/srt/layers/attention/triton_ops/decode_attention.py,sha256=BE63WhKiutSNkhJLsRwvfsRy-ExvuAv7FZyoWv73ul8,18744
|
61
61
|
sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py,sha256=1pSXfY3EEaM7iRN_uElHnAfsrJMhTFbu9fj8Z0O2PbE,21480
|
@@ -73,13 +73,13 @@ sglang/srt/lora/lora.py,sha256=KhhO9aKCyFWvJnhI07lZKANIvNjtt882HrTYFNBZMv0,15065
|
|
73
73
|
sglang/srt/lora/lora_config.py,sha256=a2fTQESlCbG1xLiBYy4ptZ6c0Burcqyg1_6V1XSok-Y,1506
|
74
74
|
sglang/srt/lora/lora_manager.py,sha256=DHiqdl0_4wQ5PxZBZtlCpP14515mDV2_H9tzL3Rdss8,12886
|
75
75
|
sglang/srt/managers/data_parallel_controller.py,sha256=JxRtJJTVn1FU2iD292rLZPftAsR4_8j4d3yF8j0dvBc,8327
|
76
|
-
sglang/srt/managers/detokenizer_manager.py,sha256=
|
77
|
-
sglang/srt/managers/image_processor.py,sha256=
|
76
|
+
sglang/srt/managers/detokenizer_manager.py,sha256=oWquBe0yvSwILwllMBJFJUEgBt1NEM_3KluAc0T6Pnw,7333
|
77
|
+
sglang/srt/managers/image_processor.py,sha256=foLv3QVW_A8IRjRcHOKn0_HC771JbPEz8ML1mGqYKYw,13685
|
78
78
|
sglang/srt/managers/io_struct.py,sha256=WLXz-tyn0jR7zNO9feRBXgyjphVa8qR55OoEOUdzoVI,13751
|
79
|
-
sglang/srt/managers/schedule_batch.py,sha256
|
79
|
+
sglang/srt/managers/schedule_batch.py,sha256=jBABHbL7gyrKdrFrzScJ76MtvG2D9Y5HDx74qsclo80,44470
|
80
80
|
sglang/srt/managers/schedule_policy.py,sha256=ayFz4iPLIlG8mx5i1glTCAMHJPGpFedMP9UgRtqkNhA,12526
|
81
|
-
sglang/srt/managers/scheduler.py,sha256=
|
82
|
-
sglang/srt/managers/session_controller.py,sha256=
|
81
|
+
sglang/srt/managers/scheduler.py,sha256=JVxV3Y5AU0OOOfePVM5dVPuuN_Kd9nwV3p3vH3CHQps,56059
|
82
|
+
sglang/srt/managers/session_controller.py,sha256=hajOnkNZ_JpP4E-GKMVGzyJSK4sc9uF9t229uFuxkVs,2874
|
83
83
|
sglang/srt/managers/tokenizer_manager.py,sha256=zYbKEKNuM1B3PXzA7jnDpxew-0rZXSX-7dHmVLWG3e4,26477
|
84
84
|
sglang/srt/managers/tp_worker.py,sha256=1SQJ60iKS9e5vGY555fT1iZ4OtLumXzeWfB08fSWKbk,6176
|
85
85
|
sglang/srt/managers/tp_worker_overlap_thread.py,sha256=7vhPebaOS4JamaS08CGf_hwxnUO7Gy_SXZXEPwNHKoY,7621
|
@@ -112,7 +112,7 @@ sglang/srt/models/llama.py,sha256=FSGuM3BamhuT5h2jedh5cSFwFYduOJwkAZJJ672awRw,16
|
|
112
112
|
sglang/srt/models/llama_classification.py,sha256=c8WZ1ADa3f6s2IJVoP10ouVgeCwv_ndns_qMgLrC6QI,3413
|
113
113
|
sglang/srt/models/llama_embedding.py,sha256=2ex2jrz31osaAd9V8sJeN0qyxmk-L5NgOBkXL1puGhI,3166
|
114
114
|
sglang/srt/models/llama_reward.py,sha256=prhHDPpf1k6tlQtGE6zq5gx0uSZAD3W5v7W28bdgy4U,4619
|
115
|
-
sglang/srt/models/llava.py,sha256=
|
115
|
+
sglang/srt/models/llava.py,sha256=HjC2TDLngpaN8HMYyGp5doEK32HeQN8iT2tYE_Slrtg,25130
|
116
116
|
sglang/srt/models/llavavid.py,sha256=DeWqGSmXgIYGuLyy2ZrxjM9WqbRjueP4chNmXt7Bnus,12221
|
117
117
|
sglang/srt/models/minicpm.py,sha256=KbiTf-kaDAJxSo9Z4IGMTrs9WrYYji1KXO1kA2iy-as,13816
|
118
118
|
sglang/srt/models/minicpm3.py,sha256=C43mTr2Qjccj4sXuTDgzbfZhvCNbsEHNggMRXQ7SrWs,25108
|
@@ -155,10 +155,10 @@ sglang/test/simple_eval_mmlu.py,sha256=FkwamjGMjueTixymkedF-YiPloSLiy4ftILFUrKZ9
|
|
155
155
|
sglang/test/test_activation.py,sha256=jkdNRzJnbd5OgZliQaIXpxovlcky17UrweomcOcMxoE,1442
|
156
156
|
sglang/test/test_layernorm.py,sha256=IacByD5d-stXjzBz8Ypamc7povlcedpKPbb_4JLgo3c,3720
|
157
157
|
sglang/test/test_programs.py,sha256=1Z0umrsUu9pagzyGH5SrXl_qhKSyTfUv_kWC2mcn0qo,18208
|
158
|
-
sglang/test/test_utils.py,sha256=
|
158
|
+
sglang/test/test_utils.py,sha256=NBEGQC_wtMqODQQZWrxdwmsoLFSZfDlQzIbsQ1kE_Yc,23468
|
159
159
|
sglang/test/srt/sampling/penaltylib/utils.py,sha256=CjxHgywh0hx_87iynzQt_ztHu6zBVuE-YrZ-XPmW6U4,12906
|
160
|
-
sglang-0.3.6.
|
161
|
-
sglang-0.3.6.
|
162
|
-
sglang-0.3.6.
|
163
|
-
sglang-0.3.6.
|
164
|
-
sglang-0.3.6.
|
160
|
+
sglang-0.3.6.post2.dist-info/LICENSE,sha256=FJXh51fvTQklojUFY89XVLsjxRcBqOxPs8XNy-2uZ0c,11346
|
161
|
+
sglang-0.3.6.post2.dist-info/METADATA,sha256=3ekB4UX6bNwXzqlRChfxG0R8sme-x0FQAImcw0gpfM8,22122
|
162
|
+
sglang-0.3.6.post2.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
|
163
|
+
sglang-0.3.6.post2.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
|
164
|
+
sglang-0.3.6.post2.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|