sglang 0.4.1__py3-none-any.whl → 0.4.1.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_serving.py +11 -3
- sglang/lang/backend/openai.py +10 -0
- sglang/srt/constrained/xgrammar_backend.py +6 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +20 -14
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +17 -4
- sglang/srt/layers/moe/topk.py +14 -0
- sglang/srt/layers/quantization/fp8_kernel.py +14 -0
- sglang/srt/managers/schedule_policy.py +1 -1
- sglang/srt/managers/scheduler.py +11 -14
- sglang/srt/managers/tokenizer_manager.py +54 -45
- sglang/srt/model_executor/model_runner.py +0 -6
- sglang/srt/model_loader/loader.py +22 -11
- sglang/srt/models/gemma2.py +19 -0
- sglang/srt/models/llama.py +2 -2
- sglang/srt/openai_api/adapter.py +19 -0
- sglang/srt/openai_api/protocol.py +2 -0
- sglang/srt/sampling/sampling_params.py +9 -2
- sglang/srt/server.py +20 -37
- sglang/version.py +1 -1
- {sglang-0.4.1.dist-info → sglang-0.4.1.post1.dist-info}/METADATA +4 -4
- {sglang-0.4.1.dist-info → sglang-0.4.1.post1.dist-info}/RECORD +24 -24
- {sglang-0.4.1.dist-info → sglang-0.4.1.post1.dist-info}/LICENSE +0 -0
- {sglang-0.4.1.dist-info → sglang-0.4.1.post1.dist-info}/WHEEL +0 -0
- {sglang-0.4.1.dist-info → sglang-0.4.1.post1.dist-info}/top_level.txt +0 -0
sglang/bench_serving.py
CHANGED
@@ -897,6 +897,7 @@ async def benchmark(
     else:
         raise ValueError(f"Unknown backend: {backend}")
 
+    # Limit concurrency
     # From https://github.com/vllm-project/vllm/pull/9390
     semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
 
@@ -906,6 +907,7 @@ async def benchmark(
         async with semaphore:
             return await request_func(request_func_input=request_func_input, pbar=pbar)
 
+    # Warmup
     print("Starting initial single prompt test run...")
     test_prompt, test_prompt_len, test_output_len = input_requests[0]
     test_input = RequestFuncInput(
@@ -924,11 +926,15 @@ async def benchmark(
             f"are correctly specified. Error: {test_output.error}"
         )
     else:
-        requests.post(base_url + "/flush_cache")
         print("Initial test run completed. Starting main benchmark run...")
 
-
+    # Flush cache
+    if "sglang" in backend:
+        requests.post(base_url + "/flush_cache")
+
+    time.sleep(1.0)
 
+    # Start profiler
     if profile:
         print("Starting profiler...")
         profile_output = await async_request_profile(
@@ -939,6 +945,7 @@ async def benchmark(
 
     pbar = None if disable_tqdm else tqdm(total=len(input_requests))
 
+    # Run all requests
    benchmark_start_time = time.perf_counter()
    tasks: List[asyncio.Task] = []
    async for request in get_request(input_requests, request_rate):
@@ -959,6 +966,7 @@ async def benchmark(
         )
     outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
 
+    # Stop profiler
     if profile:
         print("Stopping profiler...")
         profile_output = await async_request_profile(api_url=base_url + "/stop_profile")
@@ -968,8 +976,8 @@ async def benchmark(
     if pbar is not None:
         pbar.close()
 
+    # Compute metrics and print results
     benchmark_duration = time.perf_counter() - benchmark_start_time
-
     metrics, output_lens = calculate_metrics(
         input_requests=input_requests,
         outputs=outputs,
sglang/lang/backend/openai.py
CHANGED
@@ -366,6 +366,11 @@ class OpenAI(BaseBackend):
 def openai_completion(
     client, token_usage, is_chat=None, retries=3, prompt=None, **kwargs
 ):
+    # if "ebnf" is in kwargs, warn and remove
+    if "ebnf" in kwargs:
+        warnings.warn("EBNF is not officially supported by OpenAI endpoints. Ignoring.")
+        del kwargs["ebnf"]
+
     for attempt in range(retries):
         try:
             if is_chat:
@@ -398,6 +403,11 @@ def openai_completion(
 def openai_completion_stream(
     client, token_usage, is_chat=None, retries=3, prompt=None, **kwargs
 ):
+    # if "ebnf" is in kwargs, warn and remove
+    if "ebnf" in kwargs:
+        warnings.warn("EBNF is not officially supported by OpenAI endpoints. Ignoring.")
+        del kwargs["ebnf"]
+
     for attempt in range(retries):
         try:
             if is_chat:
sglang/srt/constrained/xgrammar_backend.py
CHANGED
@@ -126,6 +126,12 @@ class XGrammarGrammarBackend(BaseGrammarBackend):
                     f"Skip invalid json_schema: json_schema={key_string}, {e=}"
                 )
                 return None
+        elif key_type == "ebnf":
+            try:
+                ctx = self.grammar_compiler.compile_grammar(key_string)
+            except RuntimeError as e:
+                logging.warning(f"Skip invalid ebnf: ebnf={key_string}, {e=}")
+                return None
         elif key_type == "regex":
             logger.warning(
                 "regex hasn't been supported by xgrammar yet. This is skipped."
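
With this branch, the xgrammar backend compiles and caches `("ebnf", grammar)` keys just like json_schema keys. A hedged end-to-end sketch of how the new path can be exercised over the native /generate API (URL, prompt, and grammar are illustrative; assumes a server running this release with the xgrammar grammar backend):

```python
# Sketch: request EBNF-constrained generation from a running sglang server.
import requests

ebnf = 'root ::= "yes" | "no"'  # toy grammar: the answer must be yes or no

resp = requests.post(
    "http://localhost:30000/generate",  # placeholder URL
    json={
        "text": "Is the sky blue on a clear day? Answer:",
        "sampling_params": {"max_new_tokens": 8, "ebnf": ebnf},
    },
)
print(resp.json())
```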
sglang/srt/layers/attention/triton_ops/extend_attention.py
CHANGED
@@ -292,27 +292,33 @@ def extend_attention_fwd(
     BLOCK_DPE = 0
     BLOCK_DV = triton.next_power_of_2(Lv)
 
-    if is_cuda_available and CUDA_CAPABILITY[0] >= 9:
-        if Lq <= 256:
-            BLOCK_M, BLOCK_N = (128, 64)
-        else:
-            BLOCK_M, BLOCK_N = (32, 64)
-    elif is_cuda_available and CUDA_CAPABILITY[0] >= 8:
-        if Lq <= 128:
-            BLOCK_M, BLOCK_N = (128, 128)
-        elif Lq <= 256:
-            BLOCK_M, BLOCK_N = (64, 64)
-        else:
-            BLOCK_M, BLOCK_N = (32, 64)
+    if is_hip_:
+        BLOCK_M, BLOCK_N = (64, 64)
+        num_warps = 4
+
     else:
-        BLOCK_M, BLOCK_N = (64, 64) if Lq <= 128 else (32, 32)
+        if is_cuda_available and CUDA_CAPABILITY[0] >= 9:
+            if Lq <= 256:
+                BLOCK_M, BLOCK_N = (128, 64)
+            else:
+                BLOCK_M, BLOCK_N = (32, 64)
+        elif is_cuda_available and CUDA_CAPABILITY[0] >= 8:
+            if Lq <= 128:
+                BLOCK_M, BLOCK_N = (128, 128)
+            elif Lq <= 256:
+                BLOCK_M, BLOCK_N = (64, 64)
+            else:
+                BLOCK_M, BLOCK_N = (32, 64)
+        else:
+            BLOCK_M, BLOCK_N = (64, 64) if Lq <= 128 else (32, 32)
+
+        num_warps = 4 if Lk <= 64 else 8
 
     sm_scale = sm_scale or 1.0 / (Lq**0.5)
     batch_size, head_num = b_seq_len.shape[0], q_extend.shape[1]
     kv_group_num = q_extend.shape[1] // k_extend.shape[1]
 
     grid = (batch_size, head_num, triton.cdiv(max_len_extend, BLOCK_M))
-    num_warps = 4 if Lk <= 64 else 8
     num_stages = 1
 
     extra_kargs = {}
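
The net effect of the hunk is a dedicated HIP path with fixed tile sizes and warp count, while the CUDA logic is unchanged but nested one level deeper. Restated as a pure function for readability (argument names mirror the variables in the hunk; this is a paraphrase, not the shipped code):

```python
# Tile-size selection from extend_attention_fwd, restated standalone.
def pick_block_sizes(is_hip: bool, cuda_capability_major: int, Lq: int, Lk: int):
    if is_hip:
        return (64, 64), 4                      # (BLOCK_M, BLOCK_N), num_warps
    if cuda_capability_major >= 9:              # Hopper
        block = (128, 64) if Lq <= 256 else (32, 64)
    elif cuda_capability_major >= 8:            # Ampere
        if Lq <= 128:
            block = (128, 128)
        elif Lq <= 256:
            block = (64, 64)
        else:
            block = (32, 64)
    else:                                       # older GPUs
        block = (64, 64) if Lq <= 128 else (32, 32)
    return block, (4 if Lk <= 64 else 8)

print(pick_block_sizes(False, 9, 128, 128))     # ((128, 64), 8)
```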
sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
CHANGED
@@ -11,12 +11,17 @@ from typing import Any, Callable, Dict, List, Optional, Tuple
 import torch
 import triton
 import triton.language as tl
-from sgl_kernel import moe_align_block_size as sgl_moe_align_block_size
 from vllm import _custom_ops as ops
 
 from sglang.srt.layers.moe.topk import select_experts
 from sglang.srt.layers.quantization.fp8_kernel import per_token_group_quant_fp8
-from sglang.srt.utils import direct_register_custom_op, get_device_name
+from sglang.srt.utils import direct_register_custom_op, get_device_name, is_hip
+
+not_hip = False
+if not is_hip():
+    from sgl_kernel import moe_align_block_size as sgl_moe_align_block_size
+
+    not_hip = True
 
 logger = logging.getLogger(__name__)
 padding_size = 128 if bool(int(os.getenv("MOE_PADDING", "0"))) else 0
@@ -267,8 +272,14 @@ def moe_align_block_size(
         (max_num_m_blocks,), dtype=torch.int32, device=topk_ids.device
     )
     num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device=topk_ids.device)
-
-    if num_experts >= 224:
+    if not_hip and num_experts >= 224:
+        token_cnts_buffer = torch.empty(
+            (num_experts + 1) * num_experts, dtype=torch.int32, device=topk_ids.device
+        )
+        cumsum_buffer = torch.empty(
+            num_experts + 1, dtype=torch.int32, device=topk_ids.device
+        )
+
         sgl_moe_align_block_size(
             topk_ids,
             num_experts,
@@ -276,6 +287,8 @@ def moe_align_block_size(
             sorted_ids,
             expert_ids,
             num_tokens_post_pad,
+            token_cnts_buffer,
+            cumsum_buffer,
         )
     else:
         ops.moe_align_block_size(
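
The two new tensors suggest the sgl-kernel variant of moe_align_block_size now expects caller-allocated scratch buffers rather than allocating them internally; the HIP path avoids sgl_kernel entirely via the not_hip guard. Buffer shapes restated from the hunk (E is an example expert count; the diff allocates on topk_ids.device, CPU is used here only for the sketch):

```python
import torch

E = 256  # example num_experts; the sgl-kernel path is taken when E >= 224
token_cnts_buffer = torch.empty((E + 1) * E, dtype=torch.int32)  # token-count scratch
cumsum_buffer = torch.empty(E + 1, dtype=torch.int32)            # prefix-sum scratch
print(token_cnts_buffer.numel(), cumsum_buffer.numel())          # 65792 257
```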
sglang/srt/layers/moe/topk.py
CHANGED
@@ -1,3 +1,17 @@
+# Copyright 2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
 from typing import Callable, Optional
 
 import torch
sglang/srt/layers/quantization/fp8_kernel.py
CHANGED
@@ -1,3 +1,17 @@
+# Copyright 2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
 from typing import List, Tuple
 
 import torch
sglang/srt/managers/scheduler.py
CHANGED
@@ -468,9 +468,6 @@ class Scheduler:
             self.send_to_tokenizer.send_pyobj(
                 UpdateWeightFromDiskReqOutput(success, message)
             )
-        elif isinstance(recv_req, GetWeightsByNameReqInput):
-            parameter = self.get_weights_by_name(recv_req)
-            self.send_to_tokenizer.send_pyobj(GetWeightsByNameReqOutput(parameter))
         elif isinstance(recv_req, InitWeightsUpdateGroupReqInput):
             success, message = self.init_weights_update_group(recv_req)
             self.send_to_tokenizer.send_pyobj(
@@ -565,7 +562,7 @@ class Scheduler:
 
         if req.logprob_start_len == -1:
             # By default, only return the logprobs for output tokens
-            req.logprob_start_len = len(
+            req.logprob_start_len = len(req.origin_input_ids) - 1
 
         # Truncate prompts that are too long
         if len(req.origin_input_ids) > self.max_req_input_len:
@@ -589,12 +586,15 @@ class Scheduler:
         if (
             req.sampling_params.json_schema is not None
             or req.sampling_params.regex is not None
+            or req.sampling_params.ebnf is not None
         ):
             assert self.grammar_backend is not None
             if req.sampling_params.json_schema is not None:
                 key = ("json", req.sampling_params.json_schema)
             elif req.sampling_params.regex is not None:
                 key = ("regex", req.sampling_params.regex)
+            elif req.sampling_params.ebnf is not None:
+                key = ("ebnf", req.sampling_params.ebnf)
 
             req.grammar = self.grammar_backend.get_cached_value(key)
             if not req.grammar:
@@ -629,16 +629,13 @@ class Scheduler:
         self.waiting_queue.append(req)
 
     def log_prefill_stats(self, adder, can_run_list, running_bs, has_being_chunked):
-        if isinstance(self.tree_cache, RadixCache):
-            self.tree_cache_metrics["total"] += (
-                adder.log_input_tokens + adder.log_hit_tokens
-            ) / 10**9
-            self.tree_cache_metrics["hit"] += (adder.log_hit_tokens) / 10**9
-            tree_cache_hit_rate = (
-                self.tree_cache_metrics["hit"] / self.tree_cache_metrics["total"]
-            )
-        else:
-            tree_cache_hit_rate = 0.0
+        self.tree_cache_metrics["total"] += (
+            adder.log_input_tokens + adder.log_hit_tokens
+        ) / 10**9
+        self.tree_cache_metrics["hit"] += (adder.log_hit_tokens) / 10**9
+        tree_cache_hit_rate = (
+            self.tree_cache_metrics["hit"] / self.tree_cache_metrics["total"]
+        )
 
         num_used = self.max_total_num_tokens - (
             self.token_to_kv_pool.available_size() + self.tree_cache.evictable_size()
sglang/srt/managers/tokenizer_manager.py
CHANGED
@@ -22,7 +22,7 @@ import signal
 import sys
 import time
 import uuid
-from typing import Any, Awaitable, Dict, List, Optional, Tuple, Union
+from typing import Any, Awaitable, Dict, Generic, List, Optional, Tuple, TypeVar, Union
 
 import fastapi
 import uvloop
@@ -173,6 +173,15 @@ class TokenizerManager:
 
         # Others
         self.gracefully_exit = False
+        self.init_weights_update_group_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.update_weights_from_distributed_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.get_weights_by_name_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
 
         # Metrics
         if self.enable_metrics:
@@ -190,8 +199,7 @@ class TokenizerManager:
     ):
         created_time = time.time()
 
-
-        self.create_handle_loop()
+        self.auto_create_handle_loop()
 
         if isinstance(obj, EmbeddingReqInput) and self.is_generation:
             raise ValueError(
@@ -440,8 +448,7 @@ class TokenizerManager:
         obj: UpdateWeightFromDiskReqInput,
         request: Optional[fastapi.Request] = None,
     ) -> Tuple[bool, str]:
-
-        self.create_handle_loop()
+        self.auto_create_handle_loop()
 
         # default the load format to the server_args
         if obj.load_format is None:
@@ -456,7 +463,7 @@ class TokenizerManager:
 
     async def _wait_for_model_update_from_disk(
         self, obj: UpdateWeightFromDiskReqInput
-    ) -> Tuple[bool, str
+    ) -> Tuple[bool, str]:
         self.send_to_scheduler.send_pyobj(obj)
         self.model_update_result = asyncio.Future()
         if self.server_args.dp_size == 1:
@@ -485,15 +492,11 @@ class TokenizerManager:
         obj: InitWeightsUpdateGroupReqInput,
         request: Optional[fastapi.Request] = None,
     ) -> Tuple[bool, str]:
-
-        self.create_handle_loop()
-        self.send_to_scheduler.send_pyobj(obj)
-
-        self.init_weights_update_group_result = asyncio.Future()
+        self.auto_create_handle_loop()
         assert (
             self.server_args.dp_size == 1
         ), "dp_size must be 1 for init parameter update group"
-        result = await self.init_weights_update_group_result
+        result = (await self.init_weights_update_group_communicator(obj))[0]
         return result.success, result.message
 
     async def update_weights_from_distributed(
@@ -501,44 +504,32 @@ class TokenizerManager:
         obj: UpdateWeightsFromDistributedReqInput,
         request: Optional[fastapi.Request] = None,
     ) -> Tuple[bool, str]:
-
-        self.create_handle_loop()
+        self.auto_create_handle_loop()
+        assert (
+            self.server_args.dp_size == 1
+        ), "dp_size must be for update weights from distributed"
 
         # This means that weight sync
         # cannot run while requests are in progress.
         async with self.model_update_lock.writer_lock:
-            self.send_to_scheduler.send_pyobj(obj)
-            self.parameter_update_result: Awaitable[
-                UpdateWeightsFromDistributedReqOutput
-            ] = asyncio.Future()
-            assert (
-                self.server_args.dp_size == 1
-            ), "dp_size must be for update weights from distributed"
-            result = await self.parameter_update_result
+            result = (await self.update_weights_from_distributed_communicator(obj))[0]
             return result.success, result.message
 
     async def get_weights_by_name(
         self, obj: GetWeightsByNameReqInput, request: Optional[fastapi.Request] = None
     ):
-
-        self.create_handle_loop()
-
-        self.send_to_scheduler.send_pyobj(obj)
-        self.get_weights_by_name_result = asyncio.Future()
+        self.auto_create_handle_loop()
+        results = await self.get_weights_by_name_communicator(obj)
+        all_parameters = [r.parameter for r in results]
         if self.server_args.dp_size == 1:
-            result = await self.get_weights_by_name_result
-            return result.parameter
+            return all_parameters[0]
         else:
-            self.get_weights_by_name_tmp = []
-            result = await self.get_weights_by_name_result
-            all_parameters = [r.parameter for r in result]
             return all_parameters
 
     async def open_session(
         self, obj: OpenSessionReqInput, request: Optional[fastapi.Request] = None
     ):
-
-        self.create_handle_loop()
+        self.auto_create_handle_loop()
 
         session_id = uuid.uuid4().hex
         obj.session_id = session_id
@@ -568,7 +559,7 @@ class TokenizerManager:
             background_tasks.add_task(abort_request)
         return background_tasks
 
-    def create_handle_loop(self):
+    def auto_create_handle_loop(self):
         if not self.to_create_loop:
             return
 
@@ -711,21 +702,14 @@ class TokenizerManager:
             assert (
                 self.server_args.dp_size == 1
             ), "dp_size must be 1 for init parameter update group"
-            self.init_weights_update_group_result.set_result(recv_obj)
+            self.init_weights_update_group_communicator.handle_recv(recv_obj)
         elif isinstance(recv_obj, UpdateWeightsFromDistributedReqOutput):
             assert (
                 self.server_args.dp_size == 1
             ), "dp_size must be 1 for update weights from distributed"
-            self.parameter_update_result.set_result(recv_obj)
+            self.update_weights_from_distributed_communicator.handle_recv(recv_obj)
         elif isinstance(recv_obj, GetWeightsByNameReqOutput):
-            if self.server_args.dp_size == 1:
-                self.get_weights_by_name_result.set_result(recv_obj)
-            else:
-                self.get_weights_by_name_tmp.append(recv_obj)
-                if len(self.get_weights_by_name_tmp) == self.server_args.dp_size:
-                    self.get_weights_by_name_result.set_result(
-                        self.get_weights_by_name_tmp
-                    )
+            self.get_weights_by_name_communicator.handle_recv(recv_obj)
         else:
             raise ValueError(f"Invalid object: {recv_obj=}")
 
@@ -809,3 +793,28 @@ class SignalHandler:
             f"SIGTERM received. {signum=} {frame=}. Draining requests and shutting down..."
         )
         self.tokenizer_manager.gracefully_exit = True
+
+
+T = TypeVar("T")
+
+
+class _Communicator(Generic[T]):
+    def __init__(self, sender, fan_out: int):
+        self._sender = sender
+        self._fan_out = fan_out
+        self._result_future: Optional[asyncio.Future] = None
+        self._result_values: Optional[List[T]] = None
+
+    async def __call__(self, obj):
+        self._sender.send_pyobj(obj)
+        self._result_future = asyncio.Future()
+        self._result_values = []
+        await self._result_future
+        result_values = self._result_values
+        self._result_future = self._result_values = None
+        return result_values
+
+    def handle_recv(self, recv_obj: T):
+        self._result_values.append(recv_obj)
+        if len(self._result_values) == self._fan_out:
+            self._result_future.set_result(None)
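
The new `_Communicator` replaces three sets of hand-rolled Future attributes (`init_weights_update_group_result`, `parameter_update_result`, `get_weights_by_name_result`/`_tmp`) with one reusable fan-out pattern: send one object, then park until exactly `fan_out` replies (one per data-parallel rank) have arrived. A self-contained sketch (the class body mirrors the hunk; FakeSender and the driver are illustrative):

```python
import asyncio
from typing import Generic, List, Optional, TypeVar

T = TypeVar("T")


class _Communicator(Generic[T]):
    def __init__(self, sender, fan_out: int):
        self._sender = sender
        self._fan_out = fan_out
        self._result_future: Optional[asyncio.Future] = None
        self._result_values: Optional[List[T]] = None

    async def __call__(self, obj):
        self._sender.send_pyobj(obj)       # fire the request
        self._result_future = asyncio.Future()
        self._result_values = []
        await self._result_future          # parked until fan_out replies arrive
        result_values = self._result_values
        self._result_future = self._result_values = None
        return result_values

    def handle_recv(self, recv_obj: T):
        self._result_values.append(recv_obj)
        if len(self._result_values) == self._fan_out:
            self._result_future.set_result(None)


class FakeSender:
    def send_pyobj(self, obj):             # stands in for the ZMQ socket
        print(f"sent: {obj!r}")


async def main():
    comm = _Communicator(FakeSender(), fan_out=2)   # as if dp_size == 2
    task = asyncio.ensure_future(comm("get_weights"))
    await asyncio.sleep(0)                 # let the call send and park
    comm.handle_recv("reply-from-rank-0")  # not resolved yet
    comm.handle_recv("reply-from-rank-1")  # fan_out reached -> resolves
    print(await task)                      # both replies, in arrival order

asyncio.run(main())
```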
sglang/srt/model_executor/model_runner.py
CHANGED
@@ -95,12 +95,6 @@ class ModelRunner:
         ):
             logger.info("MLA optimization is turned on. Use triton backend.")
             self.server_args.attention_backend = "triton"
-            # FIXME(HandH1998)
-            if (
-                "DeepseekV3ForCausalLM" in self.model_config.hf_config.architectures
-                and not self.server_args.disable_cuda_graph
-            ):
-                self.server_args.disable_cuda_graph = True
 
         if self.server_args.enable_double_sparsity:
             logger.info(
sglang/srt/model_loader/loader.py
CHANGED
@@ -770,6 +770,21 @@ class BitsAndBytesModelLoader(BaseModelLoader):
             quant_state_dict,
         )
 
+    def _is_8bit_weight_name(self, weight_name: str):
+        quantized_suffix = {".scb", ".weight_format"}
+        return any(weight_name.lower().endswith(suffix) for suffix in quantized_suffix)
+
+    def _is_4bit_weight_name(self, weight_name: str):
+        quantized_suffix = {
+            "absmax",
+            "quant_map",
+            "nested_absmax",
+            "nested_quant_map",
+            "bitsandbytes",
+        }
+        suffix = weight_name.split(".")[-1]
+        return any(q_suffix in suffix for q_suffix in quantized_suffix)
+
     def _quantized_8bit_generator(
         self, hf_weights_files, use_safetensors, quant_state_dict
     ) -> Generator:
@@ -779,21 +794,18 @@ class BitsAndBytesModelLoader(BaseModelLoader):
             if not weight_name.lower().endswith(".scb"):
                 continue
 
-            weight_key = weight_name.lower().replace(".scb", ".qweight")
+            weight_key = weight_name.lower().replace(".scb", ".weight")
             quant_state_dict[weight_key] = weight_tensor
 
         for weight_name, weight_tensor in self._hf_weight_iter(
             hf_weights_files, use_safetensors
         ):
-
-            if not weight_name.endswith((".weight", ".bias")):
+            if self._is_8bit_weight_name(weight_name):
                 continue
 
-            qweight_name = weight_name.replace(".weight", ".qweight")
-
-            if qweight_name in quant_state_dict:
+            if weight_name in quant_state_dict:
                 set_weight_attrs(weight_tensor, {"load_in_8bit": True})
-                yield qweight_name, weight_tensor
+                yield weight_name, weight_tensor
             else:
                 yield weight_name, weight_tensor
 
@@ -806,7 +818,7 @@ class BitsAndBytesModelLoader(BaseModelLoader):
         weight_iterator = self._hf_weight_iter(hf_weights_files, use_safetensors)
         temp_state_dict = {}
         for weight_name, weight_tensor in weight_iterator:
-            if weight_name.endswith((".weight", ".bias")):
+            if not self._is_4bit_weight_name(weight_name):
                 continue
             # bitsandbytes library requires
             # weight.quant_state.bitsandbytes__* in CPU
@@ -830,16 +842,15 @@ class BitsAndBytesModelLoader(BaseModelLoader):
             hf_weights_files, use_safetensors
         ):
 
-            if not weight_name.endswith((".weight", ".bias")):
+            if self._is_4bit_weight_name(weight_name):
                 continue
 
             if (f"{weight_name}.quant_state.bitsandbytes__nf4" in temp_state_dict) or (
                 f"{weight_name}.quant_state.bitsandbytes__fp4" in temp_state_dict
             ):
                 quant_state = _parse_quant_state(weight_name, temp_state_dict)
-                weight_name = weight_name.replace(".weight", ".qweight")
                 quant_state_dict[weight_name] = quant_state
-                yield weight_name, weight_tensor
+                yield weight_name, weight_tensor
             else:
                 yield weight_name, weight_tensor
 
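
The two predicates centralize suffix checks that were previously inlined (and inconsistent) across the 8-bit and 4-bit generators. Exercised standalone on typical bitsandbytes checkpoint key names (the key strings are illustrative):

```python
def _is_8bit_weight_name(weight_name: str):
    quantized_suffix = {".scb", ".weight_format"}
    return any(weight_name.lower().endswith(suffix) for suffix in quantized_suffix)

def _is_4bit_weight_name(weight_name: str):
    quantized_suffix = {"absmax", "quant_map", "nested_absmax",
                        "nested_quant_map", "bitsandbytes"}
    suffix = weight_name.split(".")[-1]
    return any(q_suffix in suffix for q_suffix in quantized_suffix)

print(_is_8bit_weight_name("model.layers.0.mlp.up_proj.SCB"))           # True
print(_is_4bit_weight_name("model.layers.0.mlp.up_proj.weight.absmax")) # True
print(_is_4bit_weight_name("model.layers.0.mlp.up_proj.weight"))        # False
```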
sglang/srt/models/gemma2.py
CHANGED
@@ -307,6 +307,25 @@ class Gemma2Model(nn.Module):
 
 
 class Gemma2ForCausalLM(nn.Module):
+    # BitandBytes specific attributes
+    default_bitsandbytes_target_modules = [
+        ".gate_proj.",
+        ".down_proj.",
+        ".up_proj.",
+        ".q_proj.",
+        ".k_proj.",
+        ".v_proj.",
+        ".o_proj.",
+    ]
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        "q_proj": ("qkv_proj", 0),
+        "k_proj": ("qkv_proj", 1),
+        "v_proj": ("qkv_proj", 2),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+    }
+
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
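
What the new class attributes encode: when a bitsandbytes checkpoint is loaded, per-projection shards (q/k/v, gate/up) must be routed into Gemma2's fused qkv_proj and gate_up_proj parameters at the listed shard indices. A minimal lookup sketch (the checkpoint key is illustrative):

```python
bitsandbytes_stacked_params_mapping = {
    # shard_name: (fused weight_name, shard index)
    "q_proj": ("qkv_proj", 0),
    "k_proj": ("qkv_proj", 1),
    "v_proj": ("qkv_proj", 2),
    "gate_proj": ("gate_up_proj", 0),
    "up_proj": ("gate_up_proj", 1),
}

checkpoint_key = "model.layers.0.self_attn.k_proj.weight"
for shard_name, (fused_name, shard_id) in bitsandbytes_stacked_params_mapping.items():
    if f".{shard_name}." in checkpoint_key:
        print(checkpoint_key.replace(shard_name, fused_name), "-> shard", shard_id)
        # model.layers.0.self_attn.qkv_proj.weight -> shard 1
```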
sglang/srt/models/llama.py
CHANGED
@@ -325,8 +325,8 @@ class LlamaForCausalLM(nn.Module):
         self.config = config
         self.quant_config = quant_config
         self.model = LlamaModel(config, quant_config=quant_config)
-        # Llama 3.2 1B
-        # Llama 3.1 8B
+        # Llama 3.2 1B Instruct set tie_word_embeddings to True
+        # Llama 3.1 8B Instruct set tie_word_embeddings to False
         if self.config.tie_word_embeddings:
             self.lm_head = self.model.embed_tokens
         else:
sglang/srt/openai_api/adapter.py
CHANGED
@@ -517,6 +517,7 @@ def v1_generate_request(
         "repetition_penalty": request.repetition_penalty,
         "regex": request.regex,
         "json_schema": request.json_schema,
+        "ebnf": request.ebnf,
         "n": request.n,
         "no_stop_trim": request.no_stop_trim,
         "ignore_eos": request.ignore_eos,
@@ -692,6 +693,14 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
 
 async def v1_completions(tokenizer_manager, raw_request: Request):
     request_json = await raw_request.json()
+    if "extra_body" in request_json:
+        extra = request_json["extra_body"]
+        if "ebnf" in extra:
+            request_json["ebnf"] = extra["ebnf"]
+        if "regex" in extra:
+            request_json["regex"] = extra["regex"]
+        # remove extra_body to avoid pydantic conflict
+        del request_json["extra_body"]
     all_requests = [CompletionRequest(**request_json)]
     adapted_request, request = v1_generate_request(all_requests)
 
@@ -936,6 +945,7 @@ def v1_chat_generate_request(
         "frequency_penalty": request.frequency_penalty,
         "repetition_penalty": request.repetition_penalty,
         "regex": request.regex,
+        "ebnf": request.ebnf,
         "n": request.n,
         "no_stop_trim": request.no_stop_trim,
         "ignore_eos": request.ignore_eos,
@@ -1108,6 +1118,15 @@ def v1_chat_generate_response(request, ret, to_file=False, cache_report=False):
 
 async def v1_chat_completions(tokenizer_manager, raw_request: Request):
     request_json = await raw_request.json()
+    if "extra_body" in request_json:
+        extra = request_json["extra_body"]
+        # For example, if 'ebnf' is given:
+        if "ebnf" in extra:
+            request_json["ebnf"] = extra["ebnf"]
+        if "regex" in extra:
+            request_json["regex"] = extra["regex"]
+        # remove extra_body to avoid pydantic conflict
+        del request_json["extra_body"]
     all_requests = [ChatCompletionRequest(**request_json)]
     adapted_request, request = v1_chat_generate_request(all_requests, tokenizer_manager)
 
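
This is the server half of the feature; on the client side the OpenAI Python SDK forwards `extra_body` keys verbatim, and the adapter above lifts "ebnf" (or "regex") to the top level before pydantic validation. A hedged client sketch (model name, URL, and grammar are placeholders):

```python
import openai

client = openai.Client(base_url="http://localhost:30000/v1", api_key="EMPTY")
response = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "Answer yes or no: is 7 prime?"}],
    max_tokens=8,
    extra_body={"ebnf": 'root ::= "yes" | "no"'},  # lifted by the adapter above
)
print(response.choices[0].message.content)
```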
sglang/srt/openai_api/protocol.py
CHANGED
@@ -179,6 +179,7 @@ class CompletionRequest(BaseModel):
     ignore_eos: bool = False
     skip_special_tokens: bool = True
     lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
+    ebnf: Optional[str] = None
 
 
 class CompletionResponseChoice(BaseModel):
@@ -288,6 +289,7 @@ class ChatCompletionRequest(BaseModel):
     ignore_eos: bool = False
     skip_special_tokens: bool = True
     lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
+    ebnf: Optional[str] = None
 
 
 class ChatMessage(BaseModel):
sglang/srt/sampling/sampling_params.py
CHANGED
@@ -36,6 +36,7 @@ class SamplingParams:
         regex: Optional[str] = None,
         n: int = 1,
         json_schema: Optional[str] = None,
+        ebnf: Optional[str] = None,
         no_stop_trim: bool = False,
         ignore_eos: bool = False,
         skip_special_tokens: bool = True,
@@ -60,6 +61,7 @@ class SamplingParams:
         self.regex = regex
         self.n = n
         self.json_schema = json_schema
+        self.ebnf = ebnf
         self.no_stop_trim = no_stop_trim
 
         # Process some special cases
@@ -111,8 +113,13 @@ class SamplingParams:
                 f"min_new_tokens must be in (0, max_new_tokens({self.max_new_tokens})], got "
                 f"{self.min_new_tokens}."
             )
-
-
+        grammars = [
+            self.json_schema,
+            self.regex,
+            self.ebnf,
+        ]  # since mutually exclusive, only one can be set
+        if sum(x is not None for x in grammars) > 1:
+            raise ValueError("Only one of regex, json_schema, or ebnf can be set.")
 
     def normalize(self, tokenizer):
         # Process stop strings
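
The added check makes the three grammar constraints mutually exclusive. Restated standalone (same logic and message as the hunk, outside the class for brevity):

```python
def check_grammars(json_schema=None, regex=None, ebnf=None):
    grammars = [json_schema, regex, ebnf]  # mutually exclusive, only one can be set
    if sum(x is not None for x in grammars) > 1:
        raise ValueError("Only one of regex, json_schema, or ebnf can be set.")

check_grammars(ebnf='root ::= "a"')  # fine: a single grammar
try:
    check_grammars(regex="a+", ebnf='root ::= "a"')
except ValueError as e:
    print(e)  # Only one of regex, json_schema, or ebnf can be set.
```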
sglang/srt/server.py
CHANGED
@@ -245,16 +245,11 @@ async def get_weights_by_name(obj: GetWeightsByNameReqInput, request: Request):
     try:
         ret = await tokenizer_manager.get_weights_by_name(obj, request)
         if ret is None:
-            return ORJSONResponse(
-                {"error": {"message": "Get parameter by name failed"}},
-                status_code=HTTPStatus.BAD_REQUEST,
-            )
+            return _create_error_response("Get parameter by name failed")
         else:
             return ORJSONResponse(ret, status_code=200)
     except Exception as e:
-        return ORJSONResponse(
-            {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
-        )
+        return _create_error_response(e)
 
 
 @app.api_route("/open_session", methods=["GET", "POST"])
@@ -264,9 +259,7 @@ async def open_session(obj: OpenSessionReqInput, request: Request):
         session_id = await tokenizer_manager.open_session(obj, request)
         return session_id
     except Exception as e:
-        return ORJSONResponse(
-            {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
-        )
+        return _create_error_response(e)
 
 
 @app.api_route("/close_session", methods=["GET", "POST"])
@@ -276,9 +269,7 @@ async def close_session(obj: CloseSessionReqInput, request: Request):
         await tokenizer_manager.close_session(obj, request)
         return Response(status_code=200)
     except Exception as e:
-        return ORJSONResponse(
-            {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
-        )
+        return _create_error_response(e)
 
 
 # fastapi implicitly converts json in the request to obj (dataclass)
@@ -312,9 +303,7 @@ async def generate_request(obj: GenerateReqInput, request: Request):
         return ret
     except ValueError as e:
         logger.error(f"Error: {e}")
-        return ORJSONResponse(
-            {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
-        )
+        return _create_error_response(e)
 
 
 @app.api_route("/encode", methods=["POST", "PUT"])
@@ -325,9 +314,7 @@ async def encode_request(obj: EmbeddingReqInput, request: Request):
         ret = await tokenizer_manager.generate_request(obj, request).__anext__()
         return ret
     except ValueError as e:
-        return ORJSONResponse(
-            {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
-        )
+        return _create_error_response(e)
 
 
 @app.api_route("/classify", methods=["POST", "PUT"])
@@ -338,9 +325,7 @@ async def classify_request(obj: EmbeddingReqInput, request: Request):
         ret = await tokenizer_manager.generate_request(obj, request).__anext__()
         return ret
     except ValueError as e:
-        return ORJSONResponse(
-            {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
-        )
+        return _create_error_response(e)
 
 
 ##### OpenAI-compatible API endpoints #####
@@ -416,6 +401,12 @@ async def retrieve_file_content(file_id: str):
     return await v1_retrieve_file_content(file_id)
 
 
+def _create_error_response(e):
+    return ORJSONResponse(
+        {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
+    )
+
+
 def launch_engine(
     server_args: ServerArgs,
 ):
@@ -849,12 +840,10 @@ class Engine:
             group_name=group_name,
             backend=backend,
         )
-
-        async def _init_group():
-            return await tokenizer_manager.init_weights_update_group(obj, None)
-
         loop = asyncio.get_event_loop()
-        return loop.run_until_complete(_init_group())
+        return loop.run_until_complete(
+            tokenizer_manager.init_weights_update_group(obj, None)
+        )
 
     def update_weights_from_distributed(self, name, dtype, shape):
         """Update weights from distributed source."""
@@ -863,22 +852,16 @@ class Engine:
             dtype=dtype,
             shape=shape,
         )
-
-        async def _update_weights():
-            return await tokenizer_manager.update_weights_from_distributed(obj, None)
-
         loop = asyncio.get_event_loop()
-        return loop.run_until_complete(_update_weights())
+        return loop.run_until_complete(
+            tokenizer_manager.update_weights_from_distributed(obj, None)
+        )
 
     def get_weights_by_name(self, name, truncate_size=100):
         """Get weights by parameter name."""
         obj = GetWeightsByNameReqInput(name=name, truncate_size=truncate_size)
-
-        async def _get_weights():
-            return await tokenizer_manager.get_weights_by_name(obj, None)
-
         loop = asyncio.get_event_loop()
-        return loop.run_until_complete(_get_weights())
+        return loop.run_until_complete(tokenizer_manager.get_weights_by_name(obj, None))
 
 
 class Runtime:
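
A note on the Engine changes: `loop.run_until_complete` accepts a coroutine object directly, so the removed one-line `async def` wrappers were pure indirection. Toy demonstration:

```python
import asyncio

async def fetch(x):
    return x * 2

loop = asyncio.new_event_loop()
print(loop.run_until_complete(fetch(21)))  # 42, no wrapper coroutine needed
loop.close()
```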
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.4.1"
+__version__ = "0.4.1.post1"
{sglang-0.4.1.dist-info → sglang-0.4.1.post1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.4.1
+Version: 0.4.1.post1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                    Version 2.0, January 2004
@@ -243,7 +243,7 @@ Requires-Dist: torch; extra == "srt"
 Requires-Dist: vllm<=0.6.4.post1,>=0.6.3.post1; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
 Requires-Dist: flashinfer==0.1.6; extra == "srt"
-Requires-Dist: sgl-kernel>=0.0.2.
+Requires-Dist: sgl-kernel>=0.0.2.post10; extra == "srt"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
@@ -358,8 +358,8 @@ Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
 
 ## Adoption and Sponsorship
-The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI, 01.AI
+The project is supported by (alphabetically): AMD, Baseten, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI, 01.AI.
 
 ## Acknowledgment and Citation
 We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
-Please cite
+Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
{sglang-0.4.1.dist-info → sglang-0.4.1.post1.dist-info}/RECORD
CHANGED
@@ -4,14 +4,14 @@ sglang/bench_latency.py,sha256=oZjSAzX7dUiSu-zdz0dkyUPo-qAX_lsXFH1gf03akgI,76
 sglang/bench_offline_throughput.py,sha256=iQiJCK3KQDCdwU1NVbIwbtthssWzBXiIsKUDA7Z_hO0,12510
 sglang/bench_one_batch.py,sha256=jkyMhK0lqn5dRCYgAh30qZrNHP4gAbXODymBMNXK86I,15859
 sglang/bench_one_batch_server.py,sha256=-fV9FTLNNcSIy0pgYeggXedPVK0fVsXZqVQswT8OMOY,5945
-sglang/bench_serving.py,sha256=
+sglang/bench_serving.py,sha256=YQiCZreejCPBTqMmZsCB99RMi1N-Jx-dZtaafcQ8-14,53377
 sglang/check_env.py,sha256=4OqpZaEJOfBM6-vtPILto5kqDmgiZM1Koc7lK78A7CI,8427
 sglang/global_config.py,sha256=fnT0U9vlHdGaQFKN9tYTnUF4-eVW4HYQURd5zvPtrg0,1286
 sglang/launch_server.py,sha256=4y2QeSj0wVNB9MJQZeahD4ahTDU6gwqo7MPUytyFop0,403
 sglang/launch_server_llavavid.py,sha256=tGc17S1vUfLwbi1GB26oOdXxTWr7gjlqpTrPnrMRNO8,1007
 sglang/llama3_eval.py,sha256=gWSboDchIGybIce88bJlrCG0yiLZ513mw4gcutJlzGM,10017
 sglang/utils.py,sha256=23jf4Mz8E5p5a6JOkjnfYZixdjZUk88F_mZ8rZcby5Q,11597
-sglang/version.py,sha256=
+sglang/version.py,sha256=ARioq8ApVNckeQorLPVfHZeN9mlHMLbaNgLGNbGq-ys,28
 sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sglang/lang/chat_template.py,sha256=cnfjjxIIcYRGRxXlJlOGnpFxFuhMHut7DS52LsOMKcA,15826
 sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
@@ -23,7 +23,7 @@ sglang/lang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSu
 sglang/lang/backend/anthropic.py,sha256=EXRX7xJgA5KZszX7toSLVnKzFQ5EO0Loj-YjHFtxSxg,2081
 sglang/lang/backend/base_backend.py,sha256=tdoh9YF3CyekY1BKiX9n7-aA4srDWIuA4RDJLM7q8qg,1985
 sglang/lang/backend/litellm.py,sha256=ugmL7sfUxkUHVbHtwNzHgdQAEd4UCjNQboFuE3KThcY,2450
-sglang/lang/backend/openai.py,sha256=
+sglang/lang/backend/openai.py,sha256=ha9a2P6T80TmSgYlyIwB1qYawWkjcOgiOptkktkqa1U,15436
 sglang/lang/backend/runtime_endpoint.py,sha256=dfs-yZ1ekKmnbpZLluQHWPmMeZJKbaaZRRGYRa9eBE8,10541
 sglang/lang/backend/vertexai.py,sha256=O-iBLD-y3vq80UxnrAoJri7bxpgd-_eakZ88Cf8bEGA,4855
 sglang/srt/_custom_ops.py,sha256=Y4gyTDGhWz-W2Igq25Ojm8XFiyvkawW9I-79iwYvxJ0,3574
@@ -32,7 +32,7 @@ sglang/srt/conversation.py,sha256=u9zFU8aMYzwHUbQRKU76B_T-jfLlPoxUcWG_nRbDM2I,21
 sglang/srt/hf_transformers_utils.py,sha256=38Ms0H2-VMerOS6jnczcFtZMS6lhw9B5rSWKAfxVUfQ,7945
 sglang/srt/mm_utils.py,sha256=1ScBunw_x4W8ebM_AcJ62-1T2mfT8NlMJqdAhkF1lb0,12367
 sglang/srt/model_parallel.py,sha256=eLXZhvJ4wG6dh0FontNCIdVZvHYdWgaeY-5cu7TD9tE,6078
-sglang/srt/server.py,sha256=
+sglang/srt/server.py,sha256=vDucJl6qtEK2swzPJ_wYitaJvsI4MigMagGlBlH5V54,34033
 sglang/srt/server_args.py,sha256=LgnQ-kBJZ3E7hMMZj9bSK0mn7Bhjk1nJHxLcxl-lGTM,34572
 sglang/srt/utils.py,sha256=J8kFl6kDBwFZCM6AKaVTiqdhJKRg0JOH0pNrD1ZeWmM,41726
 sglang/srt/configs/__init__.py,sha256=_usVIXHQjft4PAJ1Y-yGQOn2QNOv501GYMlQwpGXbns,208
@@ -45,7 +45,7 @@ sglang/srt/constrained/__init__.py,sha256=UWZNVLvOT5ZBX8M36sONgDmnKtkQ0cSfhQD2jO
 sglang/srt/constrained/base_grammar_backend.py,sha256=FhVm7PxhXDl0joV9NP5RjKgz7dR1dZvUAQnh0mdtvVY,2353
 sglang/srt/constrained/outlines_backend.py,sha256=CipNHNNXs8xtnJNVNe6FCwZUlSbIXbGmWVlZz3hUpFQ,6820
 sglang/srt/constrained/outlines_jump_forward.py,sha256=iZWXeR3gNYoMubLGyFmLPO4V2YsN5DiGjD71Xk9iFaE,6418
-sglang/srt/constrained/xgrammar_backend.py,sha256=
+sglang/srt/constrained/xgrammar_backend.py,sha256=76oUFXeB29bfnEVWa1-rIrwQm5jhuMlzAX10HtAq1fQ,4887
 sglang/srt/distributed/__init__.py,sha256=__tl9Frrf3PFrSyNYcn5i-y2rL-J4-Qn6RJwrsZ4xgc,83
 sglang/srt/distributed/communication_op.py,sha256=ZoIhboZyefiAwr-1K-wF3rAFSQ4Wt-RxXpsX443Gbt4,1157
 sglang/srt/distributed/parallel_state.py,sha256=HplRH5S0AWdwSdhoHYX9_UWQZlFjh2Z1LHaz68EXlpE,47555
@@ -77,20 +77,20 @@ sglang/srt/layers/attention/torch_native_backend.py,sha256=nQdeqWEMMH_wrod5wssDC
 sglang/srt/layers/attention/triton_backend.py,sha256=-TobyZHwlbJ5HhbFg-jgCqVOw4Y-opgEuFo-EusASQc,6264
 sglang/srt/layers/attention/triton_ops/decode_attention.py,sha256=oJ_UK1t229zF3hbTDiQe7t-X-IbM2dOxx4U2ch-vmjA,17847
 sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py,sha256=1pSXfY3EEaM7iRN_uElHnAfsrJMhTFbu9fj8Z0O2PbE,21480
-sglang/srt/layers/attention/triton_ops/extend_attention.py,sha256=
+sglang/srt/layers/attention/triton_ops/extend_attention.py,sha256=DWOZXSTVN5ZbcFjDjcqs-nPdUkxSwum0SVXhVKqwh2g,11688
 sglang/srt/layers/attention/triton_ops/prefill_attention.py,sha256=lojFXRZMLWkzS2Y8uxaolnQhXaWKG19mCAWaF5KQeiI,6087
 sglang/srt/layers/moe/fused_moe_native.py,sha256=8q-LFZMSCGLc2_Gltp2lH0gSb4A1WOuKQW3wo3rpj5g,1601
-sglang/srt/layers/moe/topk.py,sha256=
+sglang/srt/layers/moe/topk.py,sha256=JpeIl_-CNk0yyG3k5fmmNbbmR2_9bkKC23UoLOlMkjw,6954
 sglang/srt/layers/moe/ep_moe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sglang/srt/layers/moe/ep_moe/kernels.py,sha256=wb_S2qLxoWWgQu9coXy0XLNGvHzdZSdwXr0PGy4QySg,10940
 sglang/srt/layers/moe/ep_moe/layer.py,sha256=6iQU5ZjQ8IXGoQ8ZlBuJqyQxYTEem9vXI6rbVIWKlZw,22303
 sglang/srt/layers/moe/fused_moe_triton/__init__.py,sha256=h9yMFAL_bagUf-qBED8gSWdCOb7d8IdA-pE-L_nIg8E,842
-sglang/srt/layers/moe/fused_moe_triton/fused_moe.py,sha256=
+sglang/srt/layers/moe/fused_moe_triton/fused_moe.py,sha256=zXwWUtthLa9E35EvlQ9A_mnIsQyA0_NYKsUBdJqONHo,31163
 sglang/srt/layers/moe/fused_moe_triton/layer.py,sha256=BclDj5JyCy-8Bfue4broL1-IG6a4dUyggE9WQLa06sg,20575
 sglang/srt/layers/quantization/__init__.py,sha256=VPYXShHvbvkOgVBlkIqic4RhdJ1y6EZ3r34T-nZMT1k,4606
 sglang/srt/layers/quantization/base_config.py,sha256=daK9p0aijMszLUm1W4Pc33FK87MdqYK1NoWFKif-j80,4599
 sglang/srt/layers/quantization/fp8.py,sha256=wNnpXLroIl7D98mlfCiXZPE9hrP5ricHrXY1WZBzEEo,30810
-sglang/srt/layers/quantization/fp8_kernel.py,sha256=
+sglang/srt/layers/quantization/fp8_kernel.py,sha256=eoO1enzD9jPC80id2oC3i8bt-LN6-4Ey223yOQ9yIPE,8792
 sglang/srt/layers/quantization/fp8_utils.py,sha256=HBJBaNcln1NrLxzw0ppUjMd6w-ryuGDDHCYJq7mRQac,4035
 sglang/srt/lora/lora.py,sha256=-o2mBmUvoVpdkgdAkWTARN4kfyep3UNEJLcg6moh0SU,15056
 sglang/srt/lora/lora_config.py,sha256=a2fTQESlCbG1xLiBYy4ptZ6c0Burcqyg1_6V1XSok-Y,1506
@@ -100,10 +100,10 @@ sglang/srt/managers/detokenizer_manager.py,sha256=nZkbwt4yty_oy8rvg4T7PbgyVLoBLo
 sglang/srt/managers/image_processor.py,sha256=Y8RgyrzbJjJTpjbnZDa5qiiG5wWjZ68rOXUPDi6kkFo,13698
 sglang/srt/managers/io_struct.py,sha256=_LWWqT3LNwZGaWhg2d3kTg1V2MTHKzRasCvxF9Nfpi4,15429
 sglang/srt/managers/schedule_batch.py,sha256=qryPWCdOTFzxomDa80U-5guShOb1K4kBUWcPCCchYB8,45762
-sglang/srt/managers/schedule_policy.py,sha256=
-sglang/srt/managers/scheduler.py,sha256=
+sglang/srt/managers/schedule_policy.py,sha256=QxjQ8-le062AMHHxool6CxkhvB4FIwhOQPzTX_JwL6U,15447
+sglang/srt/managers/scheduler.py,sha256=Yh15uQFhJlku8a20-lhtIsiEHAcUmpL3BzL42kLVwiI,61637
 sglang/srt/managers/session_controller.py,sha256=Yp-IV3rXczACZxZXmF-QxW9CWICGy8KHQ9ttBGJ8WXA,2800
-sglang/srt/managers/tokenizer_manager.py,sha256=
+sglang/srt/managers/tokenizer_manager.py,sha256=uKiTt__lCFXG60zQhmM_K7dU7IuedVSIQHVw3x3y5-E,31758
 sglang/srt/managers/tp_worker.py,sha256=X1EwFX3FSsmXx7jeeX2tjZRocaujabQYWm-M-0CFEBE,7363
 sglang/srt/managers/tp_worker_overlap_thread.py,sha256=-QNBJRKxraa9Xt2WI1AFzZYdneIJ1eXv0GjFzDqXoE0,8926
 sglang/srt/mem_cache/base_prefix_cache.py,sha256=QC8HS8RC5DXu14kyXsxAgEUsn0f932p2DjqzbKjc6Bs,962
@@ -115,9 +115,9 @@ sglang/srt/metrics/collector.py,sha256=ZWoFx_FKN0sNMSZ8RJWUVQ0RFEYhIHxdw0d4TZTlu
 sglang/srt/metrics/func_timer.py,sha256=VFyNRrbnKVCwnQsrlLin1lITJfjQpf9m8sGPqL5LIsQ,3438
 sglang/srt/model_executor/cuda_graph_runner.py,sha256=1n5WxoE9-0B3unwkkcR355K_D290h2LGt_7EvH02DQM,16246
 sglang/srt/model_executor/forward_batch_info.py,sha256=L5mVoW5SaO6To-7nGk0TZM-FFB5_78cARpJ-aC2rwD0,12883
-sglang/srt/model_executor/model_runner.py,sha256=
+sglang/srt/model_executor/model_runner.py,sha256=MLYBcYIQihu2I3PBTUghiU2mSWsDMzlKzcnX7yHa9JU,29837
 sglang/srt/model_loader/__init__.py,sha256=zGZkOBz1zx-pkaIy47BasL3fjDlAcxAXUTjInOhXHAE,919
-sglang/srt/model_loader/loader.py,sha256=
+sglang/srt/model_loader/loader.py,sha256=7OG_8-66vFDFZ9kVKGNK1BFBjZ6ql449dlyvdCbMqvE,43876
 sglang/srt/model_loader/utils.py,sha256=0NaMR67fESFopaklmsleiL27XH1QUrjZW246MUu1EJ0,1369
 sglang/srt/model_loader/weight_utils.py,sha256=kQo9KPThjH3HAOCfC_tdwdrshdWuWJOVpPR0skSyaRY,24193
 sglang/srt/models/baichuan.py,sha256=PzBOFcEAixakPEkQSaJwC0Xc1fu-yCsN9T0I67r8QmY,14919
@@ -128,7 +128,7 @@ sglang/srt/models/deepseek.py,sha256=_cVOvR6eSEgRf6TUBpTD5uMdijDWFw4sSt4lGzl8tbg
 sglang/srt/models/deepseek_v2.py,sha256=-v_OJr2c3gJ0NMxQjvT3Jknz1XPGkzKx0TVR3NIiC6A,37284
 sglang/srt/models/exaone.py,sha256=dkERTZVxrRroqu5AGLP7D4N6n8HvDqlNaDQUIe15mZY,13038
 sglang/srt/models/gemma.py,sha256=ydRqsG-7004r1fAiz01LHUmcj_6XN0Tn4xO1keJnMQk,12126
-sglang/srt/models/gemma2.py,sha256
+sglang/srt/models/gemma2.py,sha256=-bFN-Te3YWAunLCrF-XFk_6fJS7gHM4Ca6h6aesXUTM,16362
 sglang/srt/models/gemma2_reward.py,sha256=nJ01KfqLSJtqMLm3sG8p2mGZFK1xhhjh7I7Ccb-_Hq8,2494
 sglang/srt/models/gpt2.py,sha256=2je1kE09sGcaORWnJuGYAkcwwOrT9EK-KhQaoCKjCSA,9517
 sglang/srt/models/gpt_bigcode.py,sha256=tovyOdJu2x3LkzmkdFXX_iJdkxuyChIDxwgvPBy6UPo,9528
@@ -136,7 +136,7 @@ sglang/srt/models/granite.py,sha256=AeQY9Dxd1ZnwgCYBK0vSXXiMGM-yt9iaOVf_ruOUHXw,
 sglang/srt/models/grok.py,sha256=J9lgNbFebvXgF19nfZyHwlGPlGWY_m0LgP506YvOYrU,15668
 sglang/srt/models/internlm2.py,sha256=_xcKtd6YtEFUTozaN-yUb0xbSYckRpomfPSKcAk4j-Y,12127
 sglang/srt/models/internlm2_reward.py,sha256=8K26A9oIFFGx_9U2mF87j7FX8K87HGKMnVL3ht1Uc7I,2398
-sglang/srt/models/llama.py,sha256=
+sglang/srt/models/llama.py,sha256=o3FYyOhkZJirzugyYz1kxs6RpY84O_uKowWWmt3jv24,19929
 sglang/srt/models/llama_classification.py,sha256=DwboM1xHXdf3Fddf7xGnrfdOLJwXdiJs994cIpAPa2g,2984
 sglang/srt/models/llama_embedding.py,sha256=rh-AiczPY_pTpzcACHvSMVjh1hsV_MZBBwP0LQxPsGM,3130
 sglang/srt/models/llama_reward.py,sha256=oPxh5E2UkxLULNdR68dFvt2I7j33CJFN6nyA-8L2_cg,4516
@@ -162,10 +162,10 @@ sglang/srt/models/torch_native_llama.py,sha256=YeXHorFm6QfnczLXwPb5TG9a-He0uiA9R
 sglang/srt/models/xverse.py,sha256=Oq--KqvbYu2H4TMVGEHpSnJLEwXBpxlncR9ilsQeckc,13579
 sglang/srt/models/xverse_moe.py,sha256=7E60YIST4ELYwLRgjtHiLRI5Uyc7XqQTM7jQXiWaQs4,15541
 sglang/srt/models/yivl.py,sha256=88OubtuZ38Dxb2LzfV_MTPBI4wKhh4NJqFu--efbhFM,4809
-sglang/srt/openai_api/adapter.py,sha256=
-sglang/srt/openai_api/protocol.py,sha256=
+sglang/srt/openai_api/adapter.py,sha256=X0HLuNhg-chDQjcdsQIRpZijlImEwZLHum3G0JgU4Go,54834
+sglang/srt/openai_api/protocol.py,sha256=RMzeDfh2tZITjhNwB2nX68wZwQe40N6HBuVebCzEWiU,10468
 sglang/srt/sampling/sampling_batch_info.py,sha256=s--zNjk-LErZ5lMqnZ7KiuJltaziKRbQAU5qYpKIxAc,8564
-sglang/srt/sampling/sampling_params.py,sha256=
+sglang/srt/sampling/sampling_params.py,sha256=BkgCJAOSmQXwJrNXg26zSjKfMy0d5mMN6oHRk_ZuESI,5499
 sglang/srt/sampling/penaltylib/__init__.py,sha256=5vQw0Y5DSzmsoFg1IdMIKLwFVhYZ5ArADHVBYbSmOec,513
 sglang/srt/sampling/penaltylib/orchestrator.py,sha256=J-DEemZcKm1--o37kf3qDOE8SZ_6H3d5oex49Mgq2ZU,10762
 sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py,sha256=1Zp2aL6dD60mwD1tCcSG0x5IYo0v4z9ce-q_YwbJ9f8,2490
@@ -188,8 +188,8 @@ sglang/test/test_layernorm.py,sha256=IacByD5d-stXjzBz8Ypamc7povlcedpKPbb_4JLgo3c
 sglang/test/test_programs.py,sha256=1Z0umrsUu9pagzyGH5SrXl_qhKSyTfUv_kWC2mcn0qo,18208
 sglang/test/test_utils.py,sha256=HJG7kUQOk6n9FBbH89PDtQ41C3kt1cfJODhAEcFT0AQ,23823
 sglang/test/srt/sampling/penaltylib/utils.py,sha256=CjxHgywh0hx_87iynzQt_ztHu6zBVuE-YrZ-XPmW6U4,12906
-sglang-0.4.1.dist-info/LICENSE,sha256=FJXh51fvTQklojUFY89XVLsjxRcBqOxPs8XNy-2uZ0c,11346
-sglang-0.4.1.dist-info/METADATA,sha256=
-sglang-0.4.1.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
-sglang-0.4.1.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
-sglang-0.4.1.dist-info/RECORD,,
+sglang-0.4.1.post1.dist-info/LICENSE,sha256=FJXh51fvTQklojUFY89XVLsjxRcBqOxPs8XNy-2uZ0c,11346
+sglang-0.4.1.post1.dist-info/METADATA,sha256=R2YDOrUU_49x5TEbNUODNlXvkSIzFqT7-hvInlSCs5k,22527
+sglang-0.4.1.post1.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+sglang-0.4.1.post1.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+sglang-0.4.1.post1.dist-info/RECORD,,
{sglang-0.4.1.dist-info → sglang-0.4.1.post1.dist-info}/LICENSE
File without changes
{sglang-0.4.1.dist-info → sglang-0.4.1.post1.dist-info}/WHEEL
File without changes
{sglang-0.4.1.dist-info → sglang-0.4.1.post1.dist-info}/top_level.txt
File without changes