sglang 0.4.4.post2__py3-none-any.whl → 0.4.4.post3__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
- sglang/bench_serving.py +23 -3
- sglang/srt/configs/deepseekvl2.py +10 -1
- sglang/srt/configs/model_config.py +5 -16
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -1
- sglang/srt/distributed/parallel_state.py +32 -5
- sglang/srt/entrypoints/http_server.py +7 -1
- sglang/srt/entrypoints/verl_engine.py +2 -0
- sglang/srt/function_call_parser.py +0 -1
- sglang/srt/layers/attention/flashattention_backend.py +218 -79
- sglang/srt/layers/dp_attention.py +12 -1
- sglang/srt/layers/moe/topk.py +30 -3
- sglang/srt/layers/quantization/__init__.py +134 -165
- sglang/srt/layers/quantization/awq.py +200 -0
- sglang/srt/layers/quantization/fp8_kernel.py +2 -1
- sglang/srt/layers/quantization/gptq.py +30 -40
- sglang/srt/layers/quantization/w8a8_fp8.py +1 -1
- sglang/srt/layers/rotary_embedding.py +12 -0
- sglang/srt/lora/backend/base_backend.py +4 -4
- sglang/srt/lora/backend/flashinfer_backend.py +12 -9
- sglang/srt/lora/backend/triton_backend.py +5 -8
- sglang/srt/lora/layers.py +19 -33
- sglang/srt/lora/lora_manager.py +20 -7
- sglang/srt/lora/mem_pool.py +12 -6
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +10 -4
- sglang/srt/lora/triton_ops/qkv_lora_b.py +8 -3
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +16 -5
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +11 -6
- sglang/srt/lora/utils.py +6 -0
- sglang/srt/managers/io_struct.py +4 -2
- sglang/srt/managers/multimodal_processors/clip.py +63 -0
- sglang/srt/managers/schedule_batch.py +1 -0
- sglang/srt/managers/scheduler.py +25 -19
- sglang/srt/managers/tokenizer_manager.py +0 -1
- sglang/srt/managers/tp_worker.py +3 -0
- sglang/srt/model_executor/cuda_graph_runner.py +9 -8
- sglang/srt/model_executor/model_runner.py +9 -6
- sglang/srt/model_loader/loader.py +11 -1
- sglang/srt/model_loader/weight_utils.py +6 -3
- sglang/srt/models/clip.py +563 -0
- sglang/srt/models/deepseek_janus_pro.py +2 -2
- sglang/srt/models/deepseek_v2.py +151 -26
- sglang/srt/models/gemma3_causal.py +12 -2
- sglang/srt/models/gemma3_mm.py +6 -0
- sglang/srt/openai_api/adapter.py +88 -87
- sglang/srt/openai_api/protocol.py +10 -5
- sglang/srt/patch_torch.py +71 -0
- sglang/srt/server_args.py +21 -11
- sglang/srt/speculative/eagle_worker.py +1 -1
- sglang/srt/utils.py +33 -0
- sglang/test/runners.py +27 -2
- sglang/test/test_utils.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post3.dist-info}/METADATA +8 -4
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post3.dist-info}/RECORD +57 -53
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post3.dist-info}/WHEEL +0 -0
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post3.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post3.dist-info}/top_level.txt +0 -0
sglang/srt/openai_api/protocol.py
CHANGED
@@ -28,6 +28,7 @@ class ModelCard(BaseModel):
     created: int = Field(default_factory=lambda: int(time.time()))
     owned_by: str = "sglang"
     root: Optional[str] = None
+    max_model_len: Optional[int] = None


 class ModelList(BaseModel):
@@ -187,7 +188,7 @@ class CompletionResponseChoice(BaseModel):
     index: int
     text: str
     logprobs: Optional[LogProbs] = None
-    finish_reason:
+    finish_reason: Literal["stop", "length", "content_filter"]
     matched_stop: Union[None, int, str] = None


@@ -204,7 +205,7 @@ class CompletionResponseStreamChoice(BaseModel):
     index: int
     text: str
     logprobs: Optional[LogProbs] = None
-    finish_reason: Optional[
+    finish_reason: Optional[Literal["stop", "length", "content_filter"]] = None
     matched_stop: Union[None, int, str] = None


@@ -322,7 +323,7 @@ class ChatCompletionRequest(BaseModel):
     max_tokens: Optional[int] = None
     n: int = 1
     presence_penalty: float = 0.0
-    response_format: Union[ResponseFormat, StructuralTagResponseFormat] = None
+    response_format: Optional[Union[ResponseFormat, StructuralTagResponseFormat]] = None
     seed: Optional[int] = None
     stop: Optional[Union[str, List[str]]] = None
     stream: bool = False
@@ -387,7 +388,9 @@ class ChatCompletionResponseChoice(BaseModel):
     index: int
     message: ChatMessage
     logprobs: Optional[Union[LogProbs, ChoiceLogprobs]] = None
-    finish_reason:
+    finish_reason: Literal[
+        "stop", "length", "tool_calls", "content_filter", "function_call"
+    ]
     matched_stop: Union[None, int, str] = None


@@ -411,7 +414,9 @@ class ChatCompletionResponseStreamChoice(BaseModel):
     index: int
     delta: DeltaMessage
     logprobs: Optional[Union[LogProbs, ChoiceLogprobs]] = None
-    finish_reason: Optional[
+    finish_reason: Optional[
+        Literal["stop", "length", "tool_calls", "content_filter", "function_call"]
+    ] = None
     matched_stop: Union[None, int, str] = None
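Note: the hunks above tighten the OpenAI-compatible response schemas: finish_reason gains an explicit Literal value set, and response_format becomes Optional. A minimal sketch of what the Literal annotation buys (illustrative only; DemoChoice is not a class from the package):

from typing import Literal, Optional

from pydantic import BaseModel, ValidationError


class DemoChoice(BaseModel):
    # Mirrors the new CompletionResponseStreamChoice.finish_reason annotation.
    finish_reason: Optional[Literal["stop", "length", "content_filter"]] = None


print(DemoChoice(finish_reason="stop"))  # accepted
try:
    DemoChoice(finish_reason="eos")  # not in the allowed set
except ValidationError as e:
    print(e.errors()[0]["type"])  # "literal_error" in pydantic v2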
sglang/srt/patch_torch.py
ADDED
@@ -0,0 +1,71 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from typing import Callable, Union
+
+import torch
+from torch.multiprocessing import reductions
+
+
+def monkey_patch_torch_reductions():
+    """Monkey patching before Torch https://github.com/pytorch/pytorch/pull/149248 is fixed"""
+
+    if hasattr(reductions, "_reduce_tensor_original"):
+        return
+
+    reductions._reduce_tensor_original = reductions.reduce_tensor
+    reductions._rebuild_cuda_tensor_original = reductions.rebuild_cuda_tensor
+
+    reductions.reduce_tensor = _reduce_tensor_modified
+    reductions.rebuild_cuda_tensor = _rebuild_cuda_tensor_modified
+
+    reductions.init_reductions()
+
+
+# The signature has not been changed for years, and we will not need this when the next version is released,
+# so it looks safe to use a constant.
+_REDUCE_TENSOR_ARG_DEVICE_INDEX = 6
+
+
+def _reduce_tensor_modified(*args, **kwargs):
+    output_fn, output_args = reductions._reduce_tensor_original(*args, **kwargs)
+    output_args = _modify_tuple(
+        output_args, _REDUCE_TENSOR_ARG_DEVICE_INDEX, _device_to_uuid
+    )
+    return output_fn, output_args
+
+
+def _rebuild_cuda_tensor_modified(*args):
+    args = _modify_tuple(args, _REDUCE_TENSOR_ARG_DEVICE_INDEX, _device_from_maybe_uuid)
+    return reductions._rebuild_cuda_tensor_original(*args)
+
+
+def _device_to_uuid(device: int) -> str:
+    return str(torch.cuda.get_device_properties(device).uuid)
+
+
+def _device_from_maybe_uuid(device_maybe_uuid: Union[int, str]) -> int:
+    if isinstance(device_maybe_uuid, int):
+        return device_maybe_uuid
+
+    if isinstance(device_maybe_uuid, str):
+        for device in range(torch.cuda.device_count()):
+            if str(torch.cuda.get_device_properties(device).uuid) == device_maybe_uuid:
+                return device
+        raise Exception("Invalid device_uuid=" + device_maybe_uuid)
+
+    raise Exception(f"Unknown type: {device_maybe_uuid=}")
+
+
+def _modify_tuple(t, index: int, modifier: Callable):
+    return *t[:index], modifier(t[index]), *t[index + 1 :]
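Note: the new patch_torch.py works around the issue addressed by https://github.com/pytorch/pytorch/pull/149248 by swapping the device index in torch.multiprocessing's tensor-reduction args for the GPU's UUID, so a shared CUDA tensor is rebuilt on the same physical device even if device ordinals differ between processes. A minimal usage sketch (assumes a CUDA machine; the queue plumbing is illustrative, not from the package):

import torch
import torch.multiprocessing as mp

from sglang.srt.patch_torch import monkey_patch_torch_reductions


def consumer(q):
    monkey_patch_torch_reductions()  # patch the child too, so the UUID is mapped back
    t = q.get()  # rebuilt via the patched rebuild_cuda_tensor
    print(t.device, t.sum().item())


if __name__ == "__main__":
    monkey_patch_torch_reductions()
    mp.set_start_method("spawn", force=True)
    q = mp.Queue()
    p = mp.Process(target=consumer, args=(q,))
    p.start()
    shared = torch.ones(4, device="cuda")  # keep a reference until the child reads it
    q.put(shared)
    p.join()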
sglang/srt/server_args.py
CHANGED
@@ -24,6 +24,7 @@ from typing import List, Optional
 from sglang.srt.hf_transformers_utils import check_gguf_file
 from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
+    configure_ipv6,
     get_amdgpu_memory_capacity,
     get_device,
     get_hpu_memory_capacity,
@@ -52,7 +53,7 @@ class ServerArgs:
     dtype: str = "auto"
     kv_cache_dtype: str = "auto"
     quantization: Optional[str] = None
-    quantization_param_path:
+    quantization_param_path: Optional[str] = None
     context_length: Optional[int] = None
     device: Optional[str] = None
     served_model_name: Optional[str] = None
@@ -140,7 +141,7 @@ class ServerArgs:

     # Double Sparsity
     enable_double_sparsity: bool = False
-    ds_channel_config_path: str = None
+    ds_channel_config_path: Optional[str] = None
     ds_heavy_channel_num: int = 32
     ds_heavy_token_num: int = 256
     ds_heavy_channel_type: str = "qk"
@@ -173,7 +174,7 @@ class ServerArgs:
     enable_memory_saver: bool = False
     allow_auto_truncate: bool = False
     enable_custom_logit_processor: bool = False
-    tool_call_parser: str = None
+    tool_call_parser: Optional[str] = None
     enable_hierarchical_cache: bool = False
     hicache_ratio: float = 2.0
     enable_flashinfer_mla: bool = False
@@ -290,12 +291,17 @@ class ServerArgs:
             logger.warning(
                 f"DP attention is enabled. The chunked prefill size is adjusted to {self.chunked_prefill_size} to avoid MoE kernel issues. "
             )
-
-
-
-
-
-
+
+        self.enable_sp_layernorm = False
+        # DeepEP MoE
+        if self.enable_deepep_moe:
+            self.ep_size = self.tp_size
+            self.enable_sp_layernorm = (
+                self.dp_size < self.tp_size if self.enable_dp_attention else True
+            )
+            logger.info(
+                f"DeepEP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
+            )

         # Speculative Decoding
         if self.speculative_algorithm == "NEXTN":
@@ -1200,8 +1206,12 @@ class PortArgs:
         # DP attention. Use TCP + port to handle both single-node and multi-node.
         if server_args.nnodes == 1 and server_args.dist_init_addr is None:
             dist_init_addr = ("127.0.0.1", server_args.port + ZMQ_TCP_PORT_DELTA)
+        elif server_args.dist_init_addr.startswith("["):  # ipv6 address
+            port_num, host = configure_ipv6(server_args.dist_init_addr)
+            dist_init_addr = (host, str(port_num))
         else:
             dist_init_addr = server_args.dist_init_addr.split(":")
+
         assert (
             len(dist_init_addr) == 2
         ), "please provide --dist-init-addr as host:port of head node"
@@ -1210,10 +1220,10 @@ class PortArgs:
         port_base = int(dist_init_port) + 1
         if dp_rank is None:
             scheduler_input_port = (
-                port_base +
+                port_base + 3
             )  # TokenizerManager to DataParallelController
         else:
-            scheduler_input_port = port_base +
+            scheduler_input_port = port_base + 3 + 1 + dp_rank

         return PortArgs(
             tokenizer_ipc_name=f"tcp://{dist_init_host}:{port_base}",
sglang/srt/speculative/eagle_worker.py
CHANGED
@@ -586,5 +586,5 @@ def load_token_map(token_map_path: str) -> List[int]:
            ignore_patterns=["*.bin", "*.safetensors"],
        )
        token_map_path = os.path.join(cache_dir, os.path.basename(token_map_path))
-    hot_token_id = torch.load(token_map_path)
+    hot_token_id = torch.load(token_map_path, weights_only=True)
     return torch.tensor(hot_token_id, dtype=torch.int32)
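Note: weights_only=True makes torch.load use a restricted unpickler that only reconstructs tensors and a small allowlist of primitive types (lists, dicts, ints, ...), so loading an untrusted token-map file can no longer execute arbitrary pickle code. A self-contained sketch (not package code):

import torch

torch.save([10, 42, 7], "/tmp/token_map.pt")
hot_token_id = torch.load("/tmp/token_map.pt", weights_only=True)
print(torch.tensor(hot_token_id, dtype=torch.int32))  # tensor([10, 42,  7], dtype=torch.int32)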
sglang/srt/utils.py
CHANGED
@@ -1602,6 +1602,7 @@ def get_ip() -> str:
 def get_open_port() -> int:
     port = os.getenv("SGLANG_PORT")
     if port is not None:
+        port = int(port)
         while True:
             try:
                 with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
@@ -1630,6 +1631,38 @@ def is_valid_ipv6_address(address: str) -> bool:
     return False


+def configure_ipv6(dist_init_addr):
+    addr = dist_init_addr
+    end = addr.find("]")
+    if end == -1:
+        raise ValueError("invalid IPv6 address format: missing ']'")
+
+    host = addr[: end + 1]
+
+    # this only validates the address without brackets: we still need the below checks.
+    # if it's invalid, immediately raise an error so we know it's not formatting issues.
+    if not is_valid_ipv6_address(host[1:end]):
+        raise ValueError(f"invalid IPv6 address: {host}")
+
+    port_str = None
+    if len(addr) > end + 1:
+        if addr[end + 1] == ":":
+            port_str = addr[end + 2 :]
+        else:
+            raise ValueError("received IPv6 address format: expected ':' after ']'")
+
+    if not port_str:
+        raise ValueError(
+            "a port must be specified in IPv6 address (format: [ipv6]:port)"
+        )
+
+    try:
+        port = int(port_str)
+    except ValueError:
+        raise ValueError(f"invalid port in IPv6 address: '{port_str}'")
+    return port, host
+
+
 def rank0_print(msg: str):
     from sglang.srt.distributed import get_tensor_model_parallel_rank

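Note: configure_ipv6 backs the new elif branch in PortArgs (server_args.py above): it validates a bracketed [ipv6]:port string and returns the port as an int plus the still-bracketed host. A quick check of that contract, assuming an install that includes this change:

from sglang.srt.utils import configure_ipv6

port, host = configure_ipv6("[2001:db8::1]:30000")
print(port, host)  # 30000 [2001:db8::1]

configure_ipv6("[2001:db8::1]")  # raises ValueError: a port must be specified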
sglang/test/runners.py
CHANGED
@@ -19,10 +19,16 @@ from typing import List, Optional, Tuple, Union

 import torch
 import torch.nn.functional as F
-from transformers import
+from transformers import (
+    AutoModel,
+    AutoModelForCausalLM,
+    AutoModelForVision2Seq,
+    AutoProcessor,
+)

 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.server import Engine
+from sglang.srt.utils import load_image
 from sglang.test.test_utils import DEFAULT_PORT_FOR_SRT_TEST_RUNNER, calculate_rouge_l

 DEFAULT_PROMPTS = [
@@ -140,7 +146,6 @@ class HFRunner:
     def _get_gme_qwen2_vl_embeddings(
         self, prompts, image_data: Optional[List[str]] = None
     ):
-        from sglang.srt.utils import load_image

         images = None
         if image_data is not None:
@@ -226,6 +231,9 @@ class HFRunner:
                 low_cpu_mem_usage=True,
             ).cuda()
             self.processor = AutoProcessor.from_pretrained(model_path)
+        elif "clip" in model_path.lower():
+            self.model = AutoModel.from_pretrained(model_path).cuda()
+            self.processor = AutoProcessor.from_pretrained(model_path)
         else:
             self.model = _get_sentence_transformer_embedding_model(
                 model_path, torch_dtype
@@ -272,6 +280,23 @@ class HFRunner:
             assert not self.output_str_only
             if "gme-qwen2-vl" in model_path.lower():
                 logits = self._get_gme_qwen2_vl_embeddings(prompts, image_data)
+            elif "clip" in model_path.lower():
+                if image_data is not None:
+                    image = load_image(image_data)
+                    inputs = self.processor(
+                        images=image[0], return_tensors="pt"
+                    )
+                    logits = self.model.get_image_features(
+                        pixel_values=inputs.data["pixel_values"].cuda(),
+                    ).tolist()
+                else:
+                    inputs = self.tokenizer(
+                        prompts, padding=True, return_tensors="pt"
+                    )
+                    logits = self.model.get_text_features(
+                        input_ids=inputs.data["input_ids"].cuda(),
+                        attention_mask=inputs.data["attention_mask"].cuda(),
+                    ).tolist()
             else:
                 logits = self.model.encode(prompts).tolist()
             out_queue.put(ModelOutput(embed_logits=logits))
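Note: the runners.py hunks route any model path containing "clip" through transformers' AutoModel and use get_text_features / get_image_features for embeddings. A standalone sketch of the text path (the checkpoint name is an example, not pinned by the package):

from transformers import AutoModel, AutoTokenizer

model = AutoModel.from_pretrained("openai/clip-vit-base-patch32")
tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

inputs = tokenizer(["a photo of a cat"], padding=True, return_tensors="pt")
text_embeds = model.get_text_features(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
)
print(text_embeds.shape)  # torch.Size([1, 512]) for this checkpoint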
sglang/test/test_utils.py
CHANGED
@@ -29,7 +29,7 @@ from sglang.srt.utils import get_bool_env_var, kill_process_tree
 from sglang.test.run_eval import run_eval
 from sglang.utils import get_exception_traceback

-DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-FP8"
+DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"
 DEFAULT_FP8_MODEL_NAME_FOR_ACCURACY_TEST = "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
 DEFAULT_FP8_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST = (
     "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8-dynamic"
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.4.4.post2"
+__version__ = "0.4.4.post3"
{sglang-0.4.4.post2.dist-info → sglang-0.4.4.post3.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.4.4.post2
+Version: 0.4.4.post3
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                         Version 2.0, January 2004
@@ -218,6 +218,7 @@ Requires-Dist: numpy
 Requires-Dist: IPython
 Requires-Dist: setproctitle
 Provides-Extra: runtime-common
+Requires-Dist: compressed-tensors; extra == "runtime-common"
 Requires-Dist: datasets; extra == "runtime-common"
 Requires-Dist: decord; extra == "runtime-common"
 Requires-Dist: fastapi; extra == "runtime-common"
@@ -240,14 +241,17 @@ Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
 Requires-Dist: transformers==4.50.0; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist:
+Requires-Dist: compressed-tensors; extra == "runtime-common"
+Requires-Dist: xgrammar==0.1.17; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.0.5.
+Requires-Dist: sgl-kernel==0.0.5.post4; extra == "srt"
 Requires-Dist: flashinfer_python==0.2.3; extra == "srt"
 Requires-Dist: torch==2.5.1; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
 Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt"
+Requires-Dist: partial_json_parser; extra == "srt"
+Requires-Dist: einops; extra == "srt"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
@@ -271,7 +275,7 @@ Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
 Provides-Extra: litellm
 Requires-Dist: litellm>=1.0.0; extra == "litellm"
 Provides-Extra: torch-memory-saver
-Requires-Dist: torch_memory_saver>=0.0.
+Requires-Dist: torch_memory_saver>=0.0.4; extra == "torch-memory-saver"
 Provides-Extra: test
 Requires-Dist: jsonlines; extra == "test"
 Requires-Dist: matplotlib; extra == "test"