sglang 0.4.1.post1__py3-none-any.whl → 0.4.1.post2__py3-none-any.whl
This diff covers two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects the changes between those versions.
- sglang/bench_offline_throughput.py +1 -0
- sglang/srt/configs/model_config.py +11 -2
- sglang/srt/layers/attention/__init__.py +0 -1
- sglang/srt/layers/attention/flashinfer_backend.py +54 -41
- sglang/srt/layers/logits_processor.py +30 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +46 -26
- sglang/srt/layers/quantization/fp8.py +42 -2
- sglang/srt/layers/quantization/fp8_kernel.py +77 -18
- sglang/srt/layers/quantization/fp8_utils.py +8 -2
- sglang/srt/managers/io_struct.py +29 -8
- sglang/srt/managers/schedule_batch.py +22 -15
- sglang/srt/managers/scheduler.py +60 -20
- sglang/srt/managers/session_controller.py +102 -27
- sglang/srt/managers/tokenizer_manager.py +41 -10
- sglang/srt/managers/tp_worker.py +7 -0
- sglang/srt/managers/tp_worker_overlap_thread.py +5 -0
- sglang/srt/model_executor/forward_batch_info.py +42 -3
- sglang/srt/model_executor/model_runner.py +4 -0
- sglang/srt/models/llama.py +11 -0
- sglang/srt/models/llama_eagle.py +132 -0
- sglang/srt/openai_api/adapter.py +60 -2
- sglang/srt/openai_api/protocol.py +48 -0
- sglang/srt/server.py +26 -3
- sglang/srt/server_args.py +17 -30
- sglang/srt/speculative/spec_info.py +19 -0
- sglang/srt/utils.py +62 -0
- sglang/version.py +1 -1
- {sglang-0.4.1.post1.dist-info → sglang-0.4.1.post2.dist-info}/METADATA +3 -3
- {sglang-0.4.1.post1.dist-info → sglang-0.4.1.post2.dist-info}/RECORD +32 -30
- {sglang-0.4.1.post1.dist-info → sglang-0.4.1.post2.dist-info}/LICENSE +0 -0
- {sglang-0.4.1.post1.dist-info → sglang-0.4.1.post2.dist-info}/WHEEL +0 -0
- {sglang-0.4.1.post1.dist-info → sglang-0.4.1.post2.dist-info}/top_level.txt +0 -0
@@ -257,6 +257,34 @@ class ResponseFormat(BaseModel):
     json_schema: Optional[JsonSchemaResponseFormat] = None
 
 
+class Function(BaseModel):
+    """Function descriptions."""
+
+    description: Optional[str] = Field(default=None, examples=[None])
+    name: str
+    parameters: Optional[object] = None
+
+
+class Tool(BaseModel):
+    """Function wrapper."""
+
+    type: str = Field(default="function", examples=["function"])
+    function: Function
+
+
+class ToolChoiceFuncName(BaseModel):
+    """The name of tool choice function."""
+
+    name: str
+
+
+class ToolChoice(BaseModel):
+    """The tool choice definition."""
+
+    function: ToolChoiceFuncName
+    type: Literal["function"] = Field(default="function", examples=["function"])
+
+
 class ChatCompletionRequest(BaseModel):
     # Ordered by official OpenAI API documentation
     # https://platform.openai.com/docs/api-reference/chat/create
@@ -277,6 +305,10 @@ class ChatCompletionRequest(BaseModel):
     temperature: float = 0.7
     top_p: float = 1.0
     user: Optional[str] = None
+    tools: Optional[List[Tool]] = Field(default=None, examples=[None])
+    tool_choice: Union[ToolChoice, Literal["auto", "required", "none"]] = Field(
+        default="auto", examples=["none"]
+    )  # noqa
 
     # Extra parameters for SRT backend only and will be ignored by OpenAI models.
     top_k: int = -1
@@ -292,9 +324,25 @@ class ChatCompletionRequest(BaseModel):
     ebnf: Optional[str] = None
 
 
+class FunctionResponse(BaseModel):
+    """Function response."""
+
+    name: str
+    arguments: str
+
+
+class ToolCall(BaseModel):
+    """Tool call response."""
+
+    id: str
+    type: Literal["function"] = "function"
+    function: FunctionResponse
+
+
 class ChatMessage(BaseModel):
     role: Optional[str] = None
     content: Optional[str] = None
+    tool_calls: Optional[List[ToolCall]] = Field(default=None, examples=[None])
 
 
 class ChatCompletionResponseChoice(BaseModel):
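These Pydantic models wire OpenAI-style tool calling into the chat-completions schema: `tools` and `tool_choice` on the request, `tool_calls` on the returned message. A minimal client-side sketch of those fields, assuming a local server on port 30000; the model name, the `get_weather` tool, and whether the chosen model actually emits a tool call are illustrative assumptions, not guarantees from this diff:

```python
import json

import requests

# A hypothetical weather tool, shaped like the Tool/Function models above.
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a city.",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    }
]

payload = {
    "model": "meta-llama/Llama-3.1-8B-Instruct",  # illustrative model name
    "messages": [{"role": "user", "content": "What's the weather in Paris?"}],
    "tools": tools,
    "tool_choice": "auto",  # or {"type": "function", "function": {"name": "get_weather"}}
}

resp = requests.post("http://localhost:30000/v1/chat/completions", json=payload)
message = resp.json()["choices"][0]["message"]

# If the model emitted a tool call, it is surfaced via ChatMessage.tool_calls.
for call in message.get("tool_calls") or []:
    print(call["id"], call["function"]["name"], json.loads(call["function"]["arguments"]))
```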
sglang/srt/server.py
CHANGED
@@ -57,6 +57,7 @@ from sglang.srt.managers.io_struct import (
     OpenSessionReqInput,
     UpdateWeightFromDiskReqInput,
     UpdateWeightsFromDistributedReqInput,
+    UpdateWeightsFromTensorReqInput,
 )
 from sglang.srt.managers.scheduler import run_scheduler_process
 from sglang.srt.managers.tokenizer_manager import TokenizerManager
@@ -109,6 +110,7 @@ app.add_middleware(
 tokenizer_manager: TokenizerManager = None
 scheduler_info: Dict = None
 
+
 ##### Native API endpoints #####
 
 
@@ -257,6 +259,10 @@ async def open_session(obj: OpenSessionReqInput, request: Request):
     """Open a session, and return its unique session id."""
     try:
         session_id = await tokenizer_manager.open_session(obj, request)
+        if session_id is None:
+            raise Exception(
+                "Failed to open the session. Check if a session with the same id is still open."
+            )
         return session_id
     except Exception as e:
         return _create_error_response(e)
@@ -484,7 +490,16 @@ def launch_engine(
     # Wait for model to finish loading
     scheduler_infos = []
     for i in range(len(scheduler_pipe_readers)):
-        data = scheduler_pipe_readers[i].recv()
+        try:
+            data = scheduler_pipe_readers[i].recv()
+        except EOFError as e:
+            logger.exception(e)
+            logger.error(
+                f"Rank {i} scheduler is dead. Please check if there are relevant logs."
+            )
+            scheduler_procs[i].join()
+            logger.error(f"Exit code: {scheduler_procs[i].exitcode}")
+            raise
 
         if data["status"] != "ready":
             raise RuntimeError(
@@ -492,7 +507,7 @@ def launch_engine(
         )
         scheduler_infos.append(data)
 
-    # Assume all schedulers have same
+    # Assume all schedulers have same scheduler_info
     scheduler_info = scheduler_infos[0]
 
 
@@ -857,6 +872,14 @@ class Engine:
             tokenizer_manager.update_weights_from_distributed(obj, None)
         )
 
+    def update_weights_from_tensor(self, name, tensor):
+        """Update weights from distributed source."""
+        obj = UpdateWeightsFromTensorReqInput(name=name, tensor=tensor)
+        loop = asyncio.get_event_loop()
+        return loop.run_until_complete(
+            tokenizer_manager.update_weights_from_tensor(obj, None)
+        )
+
     def get_weights_by_name(self, name, truncate_size=100):
         """Get weights by parameter name."""
         obj = GetWeightsByNameReqInput(name=name, truncate_size=truncate_size)
@@ -871,7 +894,7 @@ class Runtime:
     using the commond line interface.
 
     It is mainly used for the frontend language.
-    You should use the Engine class if you want to do normal offline processing.
+    You should use the Engine class above if you want to do normal offline processing.
     """
 
     def __init__(
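The new `Engine.update_weights_from_tensor` follows the same pattern as the existing weight-update entry points: build an `UpdateWeightsFromTensorReqInput` and drive the `TokenizerManager` on the event loop. A rough offline usage sketch; the model path, parameter name, and tensor shape/dtype are assumptions chosen for illustration, not taken from this diff:

```python
import torch

import sglang as sgl

# Offline engine; the model path is illustrative.
engine = sgl.Engine(model_path="meta-llama/Llama-3.1-8B-Instruct")

# Inspect a parameter first (name assumed to exist in the loaded model).
name = "model.layers.0.mlp.gate_proj.weight"
print("before:", engine.get_weights_by_name(name, truncate_size=5))

# Push a replacement tensor in-process. It must match the parameter's real
# shape and dtype; the values below assume Llama-3.1-8B's gate_proj layout.
new_tensor = torch.zeros(14336, 4096, dtype=torch.bfloat16)
engine.update_weights_from_tensor(name, new_tensor)

engine.shutdown()
```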
sglang/srt/server_args.py
CHANGED
@@ -55,7 +55,7 @@ class ServerArgs:
     is_embedding: bool = False
     revision: Optional[str] = None
 
-    # Port
+    # Port for the HTTP server
     host: str = "127.0.0.1"
     port: int = 30000
 
@@ -68,6 +68,7 @@ class ServerArgs:
     schedule_policy: str = "lpm"
     schedule_conservativeness: float = 1.0
     cpu_offload_gb: int = 0
+    prefill_only_one_req: bool = False
 
     # Other runtime options
     tp_size: int = 1
@@ -94,6 +95,7 @@ class ServerArgs:
     # Data parallelism
     dp_size: int = 1
     load_balance_method: str = "round_robin"
+
     # Expert parallelism
     ep_size: int = 1
 
@@ -217,6 +219,13 @@ class ServerArgs:
             )
             self.disable_cuda_graph = True
 
+        # Expert parallelism
+        if self.enable_ep_moe:
+            self.ep_size = self.tp_size
+            logger.info(
+                f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
+            )
+
         # Others
         if self.enable_dp_attention:
             self.dp_size = self.tp_size
@@ -229,12 +238,6 @@ class ServerArgs:
                 "Data parallel size is adjusted to be the same as tensor parallel size. "
                 "Overlap scheduler is disabled."
             )
-        # Expert parallelism
-        if self.enable_ep_moe:
-            self.ep_size = self.tp_size
-            logger.info(
-                f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
-            )
 
         # GGUF
         if (
@@ -430,13 +433,18 @@ class ServerArgs:
             default=ServerArgs.schedule_conservativeness,
             help="How conservative the schedule policy is. A larger value means more conservative scheduling. Use a larger value if you see requests being retracted frequently.",
         )
-
         parser.add_argument(
             "--cpu-offload-gb",
             type=int,
             default=ServerArgs.cpu_offload_gb,
             help="How many GBs of RAM to reserve for CPU offloading",
         )
+        parser.add_argument(
+            "--prefill-only-one-req",
+            type=bool,
+            help="If true, we only prefill one request at one prefill batch",
+            default=ServerArgs.prefill_only_one_req,
+        )
 
         # Other runtime options
         parser.add_argument(
@@ -555,6 +563,7 @@ class ServerArgs:
                 "shortest_queue",
             ],
         )
+
         # Expert parallelism
         parser.add_argument(
             "--expert-parallel-size",
@@ -777,28 +786,6 @@ class ServerArgs:
             help="Delete the model checkpoint after loading the model.",
         )
 
-        # Deprecated arguments
-        parser.add_argument(
-            "--enable-overlap-schedule",
-            action=DeprecatedAction,
-            help="'--enable-overlap-schedule' is deprecated. It is enabled by default now. Please drop this argument.",
-        )
-        parser.add_argument(
-            "--disable-flashinfer",
-            action=DeprecatedAction,
-            help="'--disable-flashinfer' is deprecated. Please use '--attention-backend triton' instead.",
-        )
-        parser.add_argument(
-            "--disable-flashinfer-sampling",
-            action=DeprecatedAction,
-            help="'--disable-flashinfer-sampling' is deprecated. Please use '--sampling-backend pytroch' instead.",
-        )
-        parser.add_argument(
-            "--disable-disk-cache",
-            action=DeprecatedAction,
-            help="'--disable-disk-cache' is deprecated. Please use '--disable-outlines-disk-cache' instead.",
-        )
-
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
         args.tp_size = args.tensor_parallel_size
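Two behavioral notes fall out of these hunks: the EP-MoE override now runs in `__post_init__` ahead of the DP-attention handling, and the new `--prefill-only-one-req` flag is registered with `type=bool`, so argparse converts any non-empty string (including "False") to `True`. A rough sketch of the dataclass-level effect, assuming `ServerArgs` can be constructed offline and with an illustrative model path:

```python
from sglang.srt.server_args import ServerArgs

# enable_ep_moe forces ep_size to follow tp_size during __post_init__,
# regardless of what ep_size was set to.
args = ServerArgs(
    model_path="meta-llama/Llama-3.1-8B-Instruct",  # illustrative
    tp_size=4,
    enable_ep_moe=True,
)
print(args.ep_size)  # 4, copied from tp_size

# CLI note: because --prefill-only-one-req uses type=bool, passing
# "--prefill-only-one-req False" still parses as True; omit the flag to keep
# the default of False.
```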
sglang/srt/speculative/spec_info.py
ADDED
@@ -0,0 +1,19 @@
+from enum import IntEnum, auto
+
+
+class SpeculativeAlgorithm(IntEnum):
+    EAGLE = auto()
+
+    def is_eagle(self):
+        return self == SpeculativeAlgorithm.EAGLE
+
+    @staticmethod
+    def from_string(name: str):
+        name_map = {
+            "EAGLE": SpeculativeAlgorithm.EAGLE,
+        }
+        return name_map[name]
+
+
+class SpecInfo:
+    pass
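The new enum is a thin name registry: `from_string` is a plain dict lookup keyed by upper-case name, so unknown algorithms raise `KeyError`. A minimal sketch of how calling code can consume it:

```python
from sglang.srt.speculative.spec_info import SpeculativeAlgorithm

algo = SpeculativeAlgorithm.from_string("EAGLE")
assert algo.is_eagle()

# Anything not in the name map raises KeyError.
try:
    SpeculativeAlgorithm.from_string("MEDUSA")
except KeyError:
    print("unsupported speculative algorithm")
```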
sglang/srt/utils.py
CHANGED
@@ -1273,3 +1273,65 @@ def dataclass_to_string_truncated(data, max_length=2048):
         )
     else:
         return str(data)
+
+
+TOOLS_TAG_LIST = ["<|plugin|>", "<function=", "<tool_call>", "<|python_tag|>"]
+
+
+def parse_tool_response(text, tools, **kwargs):
+    """Parse model response containing tool information.
+
+    Args:
+        text(str): model response in string format
+        tools(List): tools from user request
+    """
+    if "<|plugin|>" in text:  # internlm2
+        text, action = text.split("<|action_start|><|plugin|>")
+        action = action.split("<|action_end|>".strip())[0]
+        action = action[action.find("{") :]
+        action = json.loads(action)
+        name, parameters = action["name"], json.dumps(
+            action.get("parameters", action.get("arguments", {})), ensure_ascii=False
+        )
+        call_info_list = [(name, parameters)]
+    elif "<function=" in text:  # llama3.1
+        action, _ = text.split("</function>")
+        parameters = action[action.find("{") :]
+        name = action.split("<function=")[1].split(">{")[0]
+        call_info_list = [(name, parameters)]
+    elif "<tool_call>" in text and "</tool_call>" in text:  # qwen2.5
+        # get tool_call in text
+        pattern = r"<tool_call>(.*?)</tool_call>"
+        match_result_list = re.findall(pattern, text, re.DOTALL)
+        call_info_list = []
+        for match_result in match_result_list:
+            action = json.loads(match_result)
+            call_info_list.append(
+                (action["name"], json.dumps(action["arguments"], ensure_ascii=False))
+            )
+        # get text outside of tags
+        if not text.startswith("<tool_call>"):
+            text = text[: text.find("<tool_call>")]
+        elif not text.endswith("</tool_call>"):
+            text = text[text.rfind("</tool_call>") + len("</tool_call>") :]
+        else:
+            text = ""
+    elif "<|python_tag|>" in text:  # llama3.2
+        _, action = text.split("<|python_tag|>")
+        action = json.loads(action)
+        name, parameters = action["name"], json.dumps(
+            action.get("parameters", action.get("arguments", {})), ensure_ascii=False
+        )
+        call_info_list = [(name, parameters)]
+    else:
+        raise RuntimeError(f"Unexpected model response: {text}")
+
+    call_info_list = [
+        (
+            [tool.function.name for tool in tools].index(call_info[0]),
+            call_info[0],
+            call_info[1],
+        )
+        for call_info in call_info_list
+    ]
+    return text, call_info_list
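`parse_tool_response` returns the assistant text outside the tool tags plus a list of `(tool_index, name, json_arguments)` tuples, where `tool_index` is the position of the matched tool in the request's tool list (an unknown name raises `ValueError` from `list.index`). A small sketch against the qwen2.5-style branch; the tool object only needs a `.function.name` attribute, so a `SimpleNamespace` stands in here for the request-side Tool model:

```python
from types import SimpleNamespace

from sglang.srt.utils import parse_tool_response

# Hypothetical tool list, shaped like the request-side Tool/Function models.
tools = [SimpleNamespace(function=SimpleNamespace(name="get_weather"))]

response = (
    "Let me check that.\n"
    '<tool_call>{"name": "get_weather", "arguments": {"city": "Paris"}}</tool_call>'
)

text, calls = parse_tool_response(response, tools)
print(text)   # "Let me check that.\n"
print(calls)  # [(0, 'get_weather', '{"city": "Paris"}')]
```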
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.4.1.post1"
+__version__ = "0.4.1.post2"
{sglang-0.4.1.post1.dist-info → sglang-0.4.1.post2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.4.1.post1
+Version: 0.4.1.post2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
          Version 2.0, January 2004
@@ -351,14 +351,14 @@ The core features include:
 - [Backend: SGLang Runtime (SRT)](https://sgl-project.github.io/backend/backend.html)
 - [Frontend: Structured Generation Language (SGLang)](https://sgl-project.github.io/frontend/frontend.html)
 
-## Benchmark
+## Benchmark and Performance
 Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/), [v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)
 
 ## Roadmap
 [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
 
 ## Adoption and Sponsorship
-The project is supported by (alphabetically): AMD, Baseten, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI, 01.AI.
+The project is supported by (alphabetically): AMD, Baseten, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS.org, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.
 
 ## Acknowledgment and Citation
 We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
{sglang-0.4.1.post1.dist-info → sglang-0.4.1.post2.dist-info}/RECORD
CHANGED
@@ -1,7 +1,7 @@
 sglang/__init__.py,sha256=b2oIdWzp5P8SzieeOs2TzJoN3Do3tfJbV8gZS_imVcs,1619
 sglang/api.py,sha256=NdO6cYnklnEBQBKqQjlqI8-P1EownKQ71t5ibCGhEVo,6953
 sglang/bench_latency.py,sha256=oZjSAzX7dUiSu-zdz0dkyUPo-qAX_lsXFH1gf03akgI,76
-sglang/bench_offline_throughput.py,sha256=
+sglang/bench_offline_throughput.py,sha256=r-uBvpnx-30mAnVwQB4WlqiXxy2fn5a1NUARwZcaIo4,12533
 sglang/bench_one_batch.py,sha256=jkyMhK0lqn5dRCYgAh30qZrNHP4gAbXODymBMNXK86I,15859
 sglang/bench_one_batch_server.py,sha256=-fV9FTLNNcSIy0pgYeggXedPVK0fVsXZqVQswT8OMOY,5945
 sglang/bench_serving.py,sha256=YQiCZreejCPBTqMmZsCB99RMi1N-Jx-dZtaafcQ8-14,53377
@@ -11,7 +11,7 @@ sglang/launch_server.py,sha256=4y2QeSj0wVNB9MJQZeahD4ahTDU6gwqo7MPUytyFop0,403
 sglang/launch_server_llavavid.py,sha256=tGc17S1vUfLwbi1GB26oOdXxTWr7gjlqpTrPnrMRNO8,1007
 sglang/llama3_eval.py,sha256=gWSboDchIGybIce88bJlrCG0yiLZ513mw4gcutJlzGM,10017
 sglang/utils.py,sha256=23jf4Mz8E5p5a6JOkjnfYZixdjZUk88F_mZ8rZcby5Q,11597
-sglang/version.py,sha256=
+sglang/version.py,sha256=1g4t88smWYt1DD5SJZdzXI13OPbsQPOEMX9twA4wq6A,28
 sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sglang/lang/chat_template.py,sha256=cnfjjxIIcYRGRxXlJlOGnpFxFuhMHut7DS52LsOMKcA,15826
 sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
@@ -32,14 +32,14 @@ sglang/srt/conversation.py,sha256=u9zFU8aMYzwHUbQRKU76B_T-jfLlPoxUcWG_nRbDM2I,21
 sglang/srt/hf_transformers_utils.py,sha256=38Ms0H2-VMerOS6jnczcFtZMS6lhw9B5rSWKAfxVUfQ,7945
 sglang/srt/mm_utils.py,sha256=1ScBunw_x4W8ebM_AcJ62-1T2mfT8NlMJqdAhkF1lb0,12367
 sglang/srt/model_parallel.py,sha256=eLXZhvJ4wG6dh0FontNCIdVZvHYdWgaeY-5cu7TD9tE,6078
-sglang/srt/server.py,sha256=
-sglang/srt/server_args.py,sha256=
-sglang/srt/utils.py,sha256=
+sglang/srt/server.py,sha256=sDERAZlRa6OTaUk-SfW5aKJbPui1COpPG34HDlMHMNc,34916
+sglang/srt/server_args.py,sha256=lBMOME7OSqG0Opinsin-QsioNrWpp2M0ZosB8cXoRrY,33917
+sglang/srt/utils.py,sha256=i8MjcaSQjPPfPZ0txufTtqLr4Q7YhHQ86L1i9j-y5yY,44131
 sglang/srt/configs/__init__.py,sha256=_usVIXHQjft4PAJ1Y-yGQOn2QNOv501GYMlQwpGXbns,208
 sglang/srt/configs/device_config.py,sha256=dResqHjkg_dq10v6rnVpbXpvABZRB0jylOm-2_JAnx0,428
 sglang/srt/configs/exaone.py,sha256=Duxd4yQoKy8GWEzZD_kCY_OzmN_67CTJL_Kgn0eXk3g,10731
 sglang/srt/configs/load_config.py,sha256=TcPi_HY6xu5SiVZsxPOoB5pGeDUNebOk7muoUH9VBDg,3083
-sglang/srt/configs/model_config.py,sha256=
+sglang/srt/configs/model_config.py,sha256=QP_6WaWMrE4NNF-XODRomiQPO0FABmVZIj5A-qJfnYg,16427
 sglang/srt/configs/qwen2vl.py,sha256=ZjLy9v2eZY4wptUfY3CWgYKg2B5DDrkfCSyTy_Zf_bg,4351
 sglang/srt/constrained/__init__.py,sha256=UWZNVLvOT5ZBX8M36sONgDmnKtkQ0cSfhQD2jO0ATuk,786
 sglang/srt/constrained/base_grammar_backend.py,sha256=FhVm7PxhXDl0joV9NP5RjKgz7dR1dZvUAQnh0mdtvVY,2353
@@ -63,16 +63,16 @@ sglang/srt/layers/activation.py,sha256=EboMjT9HV2tNHQ6rzpojtlkzev1lAFbhQlxMg9hwx
 sglang/srt/layers/custom_op_util.py,sha256=0vu-yX2wwonmO1L_o5G7SA6C-8XuhDIh9rPDvNeLhoc,922
 sglang/srt/layers/layernorm.py,sha256=nRQ1w1xSUcU-zlqVC61BnGG6otS5W1w9VaSzeXizrx4,4037
 sglang/srt/layers/linear.py,sha256=KyRFU0VcoNuN-hnQB9QQcBN9NCpeqPtLzzufIHUpV6w,47064
-sglang/srt/layers/logits_processor.py,sha256=
+sglang/srt/layers/logits_processor.py,sha256=Imh-qY1D9J80DZVSVV0LfTiHMEw6oQ3JbY9lXxPZAXE,15656
 sglang/srt/layers/pooler.py,sha256=rj2lygvleBnyLCBZ8I11HGMgpfIDsT0l3PIkshJwdu4,1606
 sglang/srt/layers/radix_attention.py,sha256=E4cmvkcCdCtb6VyLNrCKy1D6VwHQ063oH3JQXPaRy6w,2178
 sglang/srt/layers/rotary_embedding.py,sha256=29tx3JNR40AoXqBa2cFGBjva9vU2xgFipETlpMaaZas,3985
 sglang/srt/layers/sampler.py,sha256=k4Op_HMkQfT7t9wgQwBVotfTUXEocrzRyQqEFnff1pc,5511
 sglang/srt/layers/torchao_utils.py,sha256=dQVuWNXxAvOPjr2G5BBMWqC2oKcS2B52rx-fEc_elmc,3545
 sglang/srt/layers/vocab_parallel_embedding.py,sha256=slGwLiWjuFLCUdRe-GTlfumyZpqVX9VF6No_UGOT-hA,21624
-sglang/srt/layers/attention/__init__.py,sha256=
+sglang/srt/layers/attention/__init__.py,sha256=lNLfWqePc5NMej-AcXl97vxVXsxQOgP7dNNb2ibyUWI,2562
 sglang/srt/layers/attention/double_sparsity_backend.py,sha256=RQdEKRykSLf9ilnaHmR6T7RFqh4emH_adfB3aJN2BUU,10920
-sglang/srt/layers/attention/flashinfer_backend.py,sha256=
+sglang/srt/layers/attention/flashinfer_backend.py,sha256=8nH4EIEXvNk9yZVl7mSn78w5Dli5UiWL-ZCeYykG9HI,27280
 sglang/srt/layers/attention/torch_native_backend.py,sha256=nQdeqWEMMH_wrod5wssDCJG-uPKm0uslvkALKqPRPQ8,10509
 sglang/srt/layers/attention/triton_backend.py,sha256=-TobyZHwlbJ5HhbFg-jgCqVOw4Y-opgEuFo-EusASQc,6264
 sglang/srt/layers/attention/triton_ops/decode_attention.py,sha256=oJ_UK1t229zF3hbTDiQe7t-X-IbM2dOxx4U2ch-vmjA,17847
@@ -85,27 +85,27 @@ sglang/srt/layers/moe/ep_moe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
 sglang/srt/layers/moe/ep_moe/kernels.py,sha256=wb_S2qLxoWWgQu9coXy0XLNGvHzdZSdwXr0PGy4QySg,10940
 sglang/srt/layers/moe/ep_moe/layer.py,sha256=6iQU5ZjQ8IXGoQ8ZlBuJqyQxYTEem9vXI6rbVIWKlZw,22303
 sglang/srt/layers/moe/fused_moe_triton/__init__.py,sha256=h9yMFAL_bagUf-qBED8gSWdCOb7d8IdA-pE-L_nIg8E,842
-sglang/srt/layers/moe/fused_moe_triton/fused_moe.py,sha256=
+sglang/srt/layers/moe/fused_moe_triton/fused_moe.py,sha256=KvOy544x_4nRqg50o5YHQpHvF8TUD7q9LXDAWPGJlAA,31796
 sglang/srt/layers/moe/fused_moe_triton/layer.py,sha256=BclDj5JyCy-8Bfue4broL1-IG6a4dUyggE9WQLa06sg,20575
 sglang/srt/layers/quantization/__init__.py,sha256=VPYXShHvbvkOgVBlkIqic4RhdJ1y6EZ3r34T-nZMT1k,4606
 sglang/srt/layers/quantization/base_config.py,sha256=daK9p0aijMszLUm1W4Pc33FK87MdqYK1NoWFKif-j80,4599
-sglang/srt/layers/quantization/fp8.py,sha256=
-sglang/srt/layers/quantization/fp8_kernel.py,sha256=
-sglang/srt/layers/quantization/fp8_utils.py,sha256=
+sglang/srt/layers/quantization/fp8.py,sha256=k4mw-iKxlaEWRkGgaoxCLzZ_dYydyRj0y1N1B_umMwU,32668
+sglang/srt/layers/quantization/fp8_kernel.py,sha256=cYF4ckqrUyhCO9Ha7zi05R8EhRaqSa8rFpYisz-9Ed0,10743
+sglang/srt/layers/quantization/fp8_utils.py,sha256=qBVJXxbxqmf8-Juq0t-IXWjlaZoePJqFNYcs9-oT5Yo,4150
 sglang/srt/lora/lora.py,sha256=-o2mBmUvoVpdkgdAkWTARN4kfyep3UNEJLcg6moh0SU,15056
 sglang/srt/lora/lora_config.py,sha256=a2fTQESlCbG1xLiBYy4ptZ6c0Burcqyg1_6V1XSok-Y,1506
 sglang/srt/lora/lora_manager.py,sha256=DHiqdl0_4wQ5PxZBZtlCpP14515mDV2_H9tzL3Rdss8,12886
 sglang/srt/managers/data_parallel_controller.py,sha256=psI4FAuBGjtdnEuwagnGdtRqvqSSxOROfNKQqVDqlVA,8382
 sglang/srt/managers/detokenizer_manager.py,sha256=nZkbwt4yty_oy8rvg4T7PbgyVLoBLohvHl25xlQpBoo,8439
 sglang/srt/managers/image_processor.py,sha256=Y8RgyrzbJjJTpjbnZDa5qiiG5wWjZ68rOXUPDi6kkFo,13698
-sglang/srt/managers/io_struct.py,sha256=
-sglang/srt/managers/schedule_batch.py,sha256=
+sglang/srt/managers/io_struct.py,sha256=Gru7LEyc3tcM_LewoteCb7GXIrh-OYhA2CnEvjc1Cis,15769
+sglang/srt/managers/schedule_batch.py,sha256=KnoVuWgINnyard-BOXCo0jm3IMdXN9wIwnftMKcag-s,46097
 sglang/srt/managers/schedule_policy.py,sha256=QxjQ8-le062AMHHxool6CxkhvB4FIwhOQPzTX_JwL6U,15447
-sglang/srt/managers/scheduler.py,sha256=
-sglang/srt/managers/session_controller.py,sha256=
-sglang/srt/managers/tokenizer_manager.py,sha256=
-sglang/srt/managers/tp_worker.py,sha256=
-sglang/srt/managers/tp_worker_overlap_thread.py,sha256
+sglang/srt/managers/scheduler.py,sha256=Z1_wf6OCC8Hevc7y0D4Rt4EW5Et9bgTL9oJOkmNmDjo,63490
+sglang/srt/managers/session_controller.py,sha256=3laMRIXEYWDjfytCjPs0vw_Tw__k-nKBY-bYzycYbfc,5482
+sglang/srt/managers/tokenizer_manager.py,sha256=SXvVZHFMBCtcLkKnq-O3uzwrEhfVqk6Y1fzeBEFNq0E,33010
+sglang/srt/managers/tp_worker.py,sha256=8RVBLQaS3TnX7Z4J35RVrFN0M6PVnRBhct3sczBL4dY,7644
+sglang/srt/managers/tp_worker_overlap_thread.py,sha256=JQfrVPeE56ZGJ3nozkhZR-RSb2oePsY7iuedM7XCtdQ,9157
 sglang/srt/mem_cache/base_prefix_cache.py,sha256=QC8HS8RC5DXu14kyXsxAgEUsn0f932p2DjqzbKjc6Bs,962
 sglang/srt/mem_cache/chunk_cache.py,sha256=R2gHAuqKd5ayQW3NnsgoGUH31---Z5izCDyCqLL0FjQ,2524
 sglang/srt/mem_cache/flush_cache.py,sha256=GYcxmNXh4hsMpFfNOuCTpKilW7guZwTtAg_usVeM3J0,979
@@ -114,8 +114,8 @@ sglang/srt/mem_cache/radix_cache.py,sha256=c5voySV5L855c0G9cBEc9iQ4nR7PDDmg0V6fW
 sglang/srt/metrics/collector.py,sha256=ZWoFx_FKN0sNMSZ8RJWUVQ0RFEYhIHxdw0d4TZTluMU,6861
 sglang/srt/metrics/func_timer.py,sha256=VFyNRrbnKVCwnQsrlLin1lITJfjQpf9m8sGPqL5LIsQ,3438
 sglang/srt/model_executor/cuda_graph_runner.py,sha256=1n5WxoE9-0B3unwkkcR355K_D290h2LGt_7EvH02DQM,16246
-sglang/srt/model_executor/forward_batch_info.py,sha256=
-sglang/srt/model_executor/model_runner.py,sha256=
+sglang/srt/model_executor/forward_batch_info.py,sha256=vqF8XrHQPk3ZL7HqPvvkfP53oqBx0Fajb5lAIkdifBo,13961
+sglang/srt/model_executor/model_runner.py,sha256=TjvAwwr7EqZdmE-5HbuQMeEa0e0FqY6LeqqzEAHXMPU,30012
 sglang/srt/model_loader/__init__.py,sha256=zGZkOBz1zx-pkaIy47BasL3fjDlAcxAXUTjInOhXHAE,919
 sglang/srt/model_loader/loader.py,sha256=7OG_8-66vFDFZ9kVKGNK1BFBjZ6ql449dlyvdCbMqvE,43876
 sglang/srt/model_loader/utils.py,sha256=0NaMR67fESFopaklmsleiL27XH1QUrjZW246MUu1EJ0,1369
@@ -136,8 +136,9 @@ sglang/srt/models/granite.py,sha256=AeQY9Dxd1ZnwgCYBK0vSXXiMGM-yt9iaOVf_ruOUHXw,
 sglang/srt/models/grok.py,sha256=J9lgNbFebvXgF19nfZyHwlGPlGWY_m0LgP506YvOYrU,15668
 sglang/srt/models/internlm2.py,sha256=_xcKtd6YtEFUTozaN-yUb0xbSYckRpomfPSKcAk4j-Y,12127
 sglang/srt/models/internlm2_reward.py,sha256=8K26A9oIFFGx_9U2mF87j7FX8K87HGKMnVL3ht1Uc7I,2398
-sglang/srt/models/llama.py,sha256=
+sglang/srt/models/llama.py,sha256=4UPKF7erp7qqBD11uvvQkO1Fo_wDs71BmA8Y2csXRcA,20302
 sglang/srt/models/llama_classification.py,sha256=DwboM1xHXdf3Fddf7xGnrfdOLJwXdiJs994cIpAPa2g,2984
+sglang/srt/models/llama_eagle.py,sha256=88DzR54DKBIKJ1h-bkIa8mc1qJnlkdZ1eGYY3c5mpBY,4442
 sglang/srt/models/llama_embedding.py,sha256=rh-AiczPY_pTpzcACHvSMVjh1hsV_MZBBwP0LQxPsGM,3130
 sglang/srt/models/llama_reward.py,sha256=oPxh5E2UkxLULNdR68dFvt2I7j33CJFN6nyA-8L2_cg,4516
 sglang/srt/models/llava.py,sha256=xrkg8sht8tBOID7427IEZtHL-KKWfEivDe2NqGjTSAs,26373
@@ -162,8 +163,8 @@ sglang/srt/models/torch_native_llama.py,sha256=YeXHorFm6QfnczLXwPb5TG9a-He0uiA9R
 sglang/srt/models/xverse.py,sha256=Oq--KqvbYu2H4TMVGEHpSnJLEwXBpxlncR9ilsQeckc,13579
 sglang/srt/models/xverse_moe.py,sha256=7E60YIST4ELYwLRgjtHiLRI5Uyc7XqQTM7jQXiWaQs4,15541
 sglang/srt/models/yivl.py,sha256=88OubtuZ38Dxb2LzfV_MTPBI4wKhh4NJqFu--efbhFM,4809
-sglang/srt/openai_api/adapter.py,sha256=
-sglang/srt/openai_api/protocol.py,sha256=
+sglang/srt/openai_api/adapter.py,sha256=HvgeFPWv-v8LOiYF2iNCo-14BIZLAPznNTCUbubB2Rg,57091
+sglang/srt/openai_api/protocol.py,sha256=anWGr2Br8gVYm6Z0yvDwjXLaPCPuvJZ28gr5rV2dhVQ,11613
 sglang/srt/sampling/sampling_batch_info.py,sha256=s--zNjk-LErZ5lMqnZ7KiuJltaziKRbQAU5qYpKIxAc,8564
 sglang/srt/sampling/sampling_params.py,sha256=BkgCJAOSmQXwJrNXg26zSjKfMy0d5mMN6oHRk_ZuESI,5499
 sglang/srt/sampling/penaltylib/__init__.py,sha256=5vQw0Y5DSzmsoFg1IdMIKLwFVhYZ5ArADHVBYbSmOec,513
@@ -172,6 +173,7 @@ sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py,sha256=1Zp2aL6dD6
 sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py,sha256=_Nxv0XgUPirZjw2SEJYp_Cd9ZcLwmt7h6JE6J4hhFq4,3629
 sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py,sha256=5tOgCg7OvE9kSN9VMCpH1hwqo1YMxt9iS5PVpct9HpU,2468
 sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py,sha256=m22Rfn1RuB1HpImBDECsiJ2VooBYpsFADAwnk1EPzk0,2751
+sglang/srt/speculative/spec_info.py,sha256=d-82uWEC-QBqAgv3XGDNDW8DlHv4MtUsZghFqzGwV7U,352
 sglang/test/few_shot_gsm8k.py,sha256=7yDbEQe49gZeJhz2wFFX-gf_59ThDKsCS1xwfogNc7k,4034
 sglang/test/few_shot_gsm8k_engine.py,sha256=QQbrwOX6-cJDD3RZC_e7zPnt6aSo8JdF8X_lRHSjdDM,3886
 sglang/test/run_eval.py,sha256=9yO0hXZOcn4abEOs96T-XPguDEklK16Ltco0pGF3zCg,4020
@@ -188,8 +190,8 @@ sglang/test/test_layernorm.py,sha256=IacByD5d-stXjzBz8Ypamc7povlcedpKPbb_4JLgo3c
 sglang/test/test_programs.py,sha256=1Z0umrsUu9pagzyGH5SrXl_qhKSyTfUv_kWC2mcn0qo,18208
 sglang/test/test_utils.py,sha256=HJG7kUQOk6n9FBbH89PDtQ41C3kt1cfJODhAEcFT0AQ,23823
 sglang/test/srt/sampling/penaltylib/utils.py,sha256=CjxHgywh0hx_87iynzQt_ztHu6zBVuE-YrZ-XPmW6U4,12906
-sglang-0.4.1.
-sglang-0.4.1.
-sglang-0.4.1.
-sglang-0.4.1.
-sglang-0.4.1.
+sglang-0.4.1.post2.dist-info/LICENSE,sha256=FJXh51fvTQklojUFY89XVLsjxRcBqOxPs8XNy-2uZ0c,11346
+sglang-0.4.1.post2.dist-info/METADATA,sha256=eORQMKMQDt_eTOh6PxMzIvyjNbg2FllxXss2Z9jU3Ug,22544
+sglang-0.4.1.post2.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+sglang-0.4.1.post2.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+sglang-0.4.1.post2.dist-info/RECORD,,
{sglang-0.4.1.post1.dist-info → sglang-0.4.1.post2.dist-info}/LICENSE
File without changes
{sglang-0.4.1.post1.dist-info → sglang-0.4.1.post2.dist-info}/WHEEL
File without changes
{sglang-0.4.1.post1.dist-info → sglang-0.4.1.post2.dist-info}/top_level.txt
File without changes