sglang 0.3.6.post2__py3-none-any.whl → 0.3.6.post3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +55 -2
- sglang/bench_one_batch.py +3 -6
- sglang/bench_one_batch_server.py +4 -3
- sglang/launch_server.py +3 -2
- sglang/srt/managers/data_parallel_controller.py +7 -11
- sglang/srt/managers/detokenizer_manager.py +7 -4
- sglang/srt/managers/image_processor.py +1 -1
- sglang/srt/managers/io_struct.py +0 -10
- sglang/srt/managers/schedule_batch.py +24 -22
- sglang/srt/managers/scheduler.py +35 -26
- sglang/srt/managers/session_controller.py +0 -3
- sglang/srt/managers/tokenizer_manager.py +4 -33
- sglang/srt/managers/tp_worker_overlap_thread.py +11 -2
- sglang/srt/models/grok.py +11 -48
- sglang/srt/models/llava.py +9 -8
- sglang/srt/models/olmo2.py +392 -0
- sglang/srt/models/qwen2_vl.py +10 -3
- sglang/srt/openai_api/adapter.py +1 -1
- sglang/srt/server.py +46 -44
- sglang/srt/server_args.py +1 -1
- sglang/srt/utils.py +8 -20
- sglang/test/test_utils.py +20 -7
- sglang/utils.py +2 -2
- sglang/version.py +1 -1
- {sglang-0.3.6.post2.dist-info → sglang-0.3.6.post3.dist-info}/METADATA +2 -1
- {sglang-0.3.6.post2.dist-info → sglang-0.3.6.post3.dist-info}/RECORD +29 -31
- sglang/srt/layers/fused_moe_grok/__init__.py +0 -1
- sglang/srt/layers/fused_moe_grok/fused_moe.py +0 -692
- sglang/srt/layers/fused_moe_grok/layer.py +0 -630
- {sglang-0.3.6.post2.dist-info → sglang-0.3.6.post3.dist-info}/LICENSE +0 -0
- {sglang-0.3.6.post2.dist-info → sglang-0.3.6.post3.dist-info}/WHEEL +0 -0
- {sglang-0.3.6.post2.dist-info → sglang-0.3.6.post3.dist-info}/top_level.txt +0 -0
sglang/srt/server.py
CHANGED
@@ -23,6 +23,7 @@ import json
|
|
23
23
|
import logging
|
24
24
|
import multiprocessing as mp
|
25
25
|
import os
|
26
|
+
import signal
|
26
27
|
import threading
|
27
28
|
import time
|
28
29
|
from http import HTTPStatus
|
@@ -79,7 +80,7 @@ from sglang.srt.utils import (
|
|
79
80
|
configure_logger,
|
80
81
|
delete_directory,
|
81
82
|
is_port_available,
|
82
|
-
|
83
|
+
kill_process_tree,
|
83
84
|
maybe_set_triton_cache_manager,
|
84
85
|
prepare_model_and_tokenizer,
|
85
86
|
set_prometheus_multiproc_dir,
|
@@ -92,7 +93,7 @@ logger = logging.getLogger(__name__)
|
|
92
93
|
|
93
94
|
asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
|
94
95
|
|
95
|
-
|
96
|
+
# Fast API
|
96
97
|
app = FastAPI()
|
97
98
|
app.add_middleware(
|
98
99
|
CORSMiddleware,
|
@@ -103,7 +104,7 @@ app.add_middleware(
|
|
103
104
|
)
|
104
105
|
|
105
106
|
tokenizer_manager: TokenizerManager = None
|
106
|
-
|
107
|
+
scheduler_info: Dict = None
|
107
108
|
|
108
109
|
##### Native API endpoints #####
|
109
110
|
|
@@ -171,7 +172,7 @@ async def flush_cache():
|
|
171
172
|
|
172
173
|
@app.get("/start_profile")
|
173
174
|
@app.post("/start_profile")
|
174
|
-
async def
|
175
|
+
async def start_profile_async():
|
175
176
|
"""Start profiling."""
|
176
177
|
tokenizer_manager.start_profile()
|
177
178
|
return Response(
|
@@ -182,7 +183,7 @@ async def start_profile():
|
|
182
183
|
|
183
184
|
@app.get("/stop_profile")
|
184
185
|
@app.post("/stop_profile")
|
185
|
-
async def
|
186
|
+
async def stop_profile_async():
|
186
187
|
"""Stop profiling."""
|
187
188
|
tokenizer_manager.stop_profile()
|
188
189
|
return Response(
|
@@ -233,6 +234,8 @@ async def close_session(obj: CloseSessionReqInput, request: Request):
|
|
233
234
|
)
|
234
235
|
|
235
236
|
|
237
|
+
# fastapi implicitly converts json in the request to obj (dataclass)
|
238
|
+
@app.api_route("/generate", methods=["POST", "PUT"])
|
236
239
|
@time_func_latency
|
237
240
|
async def generate_request(obj: GenerateReqInput, request: Request):
|
238
241
|
"""Handle a generate request."""
|
@@ -266,11 +269,7 @@ async def generate_request(obj: GenerateReqInput, request: Request):
|
|
266
269
|
)
|
267
270
|
|
268
271
|
|
269
|
-
|
270
|
-
app.post("/generate")(generate_request)
|
271
|
-
app.put("/generate")(generate_request)
|
272
|
-
|
273
|
-
|
272
|
+
@app.api_route("/encode", methods=["POST", "PUT"])
|
274
273
|
@time_func_latency
|
275
274
|
async def encode_request(obj: EmbeddingReqInput, request: Request):
|
276
275
|
"""Handle an embedding request."""
|
@@ -283,10 +282,7 @@ async def encode_request(obj: EmbeddingReqInput, request: Request):
|
|
283
282
|
)
|
284
283
|
|
285
284
|
|
286
|
-
app.
|
287
|
-
app.put("/encode")(encode_request)
|
288
|
-
|
289
|
-
|
285
|
+
@app.api_route("/encode", methods=["POST", "PUT"])
|
290
286
|
@time_func_latency
|
291
287
|
async def classify_request(obj: EmbeddingReqInput, request: Request):
|
292
288
|
"""Handle a reward model request. Now the arguments and return values are the same as embedding models."""
|
@@ -299,10 +295,6 @@ async def classify_request(obj: EmbeddingReqInput, request: Request):
|
|
299
295
|
)
|
300
296
|
|
301
297
|
|
302
|
-
app.post("/classify")(classify_request)
|
303
|
-
app.put("/classify")(classify_request)
|
304
|
-
|
305
|
-
|
306
298
|
##### OpenAI-compatible API endpoints #####
|
307
299
|
|
308
300
|
|
@@ -380,11 +372,11 @@ def launch_engine(
|
|
380
372
|
server_args: ServerArgs,
|
381
373
|
):
|
382
374
|
"""
|
383
|
-
Launch the
|
375
|
+
Launch the TokenizerManager in the main process, the Scheduler in a subprocess, and the DetokenizerManager in another subprocess.
|
384
376
|
"""
|
385
377
|
|
386
378
|
global tokenizer_manager
|
387
|
-
global
|
379
|
+
global scheduler_info
|
388
380
|
|
389
381
|
# Configure global environment
|
390
382
|
configure_logger(server_args)
|
@@ -450,8 +442,8 @@ def launch_engine(
|
|
450
442
|
if server_args.chat_template:
|
451
443
|
load_chat_template_for_openai_api(tokenizer_manager, server_args.chat_template)
|
452
444
|
|
453
|
-
# Wait for model to finish loading
|
454
|
-
|
445
|
+
# Wait for model to finish loading
|
446
|
+
scheduler_infos = []
|
455
447
|
for i in range(len(scheduler_pipe_readers)):
|
456
448
|
data = scheduler_pipe_readers[i].recv()
|
457
449
|
|
@@ -459,10 +451,10 @@ def launch_engine(
|
|
459
451
|
raise RuntimeError(
|
460
452
|
"Initialization failed. Please see the error messages above."
|
461
453
|
)
|
462
|
-
|
454
|
+
scheduler_infos.append(data)
|
463
455
|
|
464
456
|
# Assume all schedulers have same max_total_num_tokens
|
465
|
-
|
457
|
+
scheduler_info = scheduler_infos[0]
|
466
458
|
|
467
459
|
|
468
460
|
def launch_server(
|
@@ -476,12 +468,12 @@ def launch_server(
|
|
476
468
|
|
477
469
|
1. HTTP server: A FastAPI server that routes requests to the engine.
|
478
470
|
2. SRT engine:
|
479
|
-
1.
|
471
|
+
1. TokenizerManager: Tokenizes the requests and sends them to the scheduler.
|
480
472
|
2. Scheduler (subprocess): Receives requests from the Tokenizer Manager, schedules batches, forwards them, and sends the output tokens to the Detokenizer Manager.
|
481
|
-
3.
|
473
|
+
3. DetokenizerManager (subprocess): Detokenizes the output tokens and sends the result back to the Tokenizer Manager.
|
482
474
|
|
483
475
|
Note:
|
484
|
-
1. The HTTP server and
|
476
|
+
1. The HTTP server and TokenizerManager both run in the main process.
|
485
477
|
2. Inter-process communication is done through IPC (each process uses a different port) via the ZMQ library.
|
486
478
|
"""
|
487
479
|
launch_engine(server_args=server_args)
|
@@ -490,7 +482,7 @@ def launch_server(
|
|
490
482
|
if server_args.api_key:
|
491
483
|
add_api_key_middleware(app, server_args.api_key)
|
492
484
|
|
493
|
-
#
|
485
|
+
# Add prometheus middleware
|
494
486
|
if server_args.enable_metrics:
|
495
487
|
add_prometheus_middleware(app)
|
496
488
|
enable_func_timer()
|
@@ -502,7 +494,7 @@ def launch_server(
|
|
502
494
|
t.start()
|
503
495
|
|
504
496
|
try:
|
505
|
-
#
|
497
|
+
# Update logging configs
|
506
498
|
LOGGING_CONFIG["formatters"]["default"][
|
507
499
|
"fmt"
|
508
500
|
] = "[%(asctime)s] %(levelprefix)s %(message)s"
|
@@ -511,6 +503,8 @@ def launch_server(
|
|
511
503
|
"fmt"
|
512
504
|
] = '[%(asctime)s] %(levelprefix)s %(client_addr)s - "%(request_line)s" %(status_code)s'
|
513
505
|
LOGGING_CONFIG["formatters"]["access"]["datefmt"] = "%Y-%m-%d %H:%M:%S"
|
506
|
+
|
507
|
+
# Listen for HTTP requests
|
514
508
|
uvicorn.run(
|
515
509
|
app,
|
516
510
|
host=server_args.host,
|
@@ -526,8 +520,7 @@ def launch_server(
|
|
526
520
|
async def _get_server_info():
|
527
521
|
return {
|
528
522
|
**dataclasses.asdict(tokenizer_manager.server_args), # server args
|
529
|
-
|
530
|
-
"max_total_num_tokens": _max_total_num_tokens, # max total num tokens
|
523
|
+
**scheduler_info,
|
531
524
|
"version": __version__,
|
532
525
|
}
|
533
526
|
|
@@ -562,6 +555,15 @@ def _set_envs_and_config(server_args: ServerArgs):
|
|
562
555
|
"at https://docs.flashinfer.ai/installation.html.",
|
563
556
|
)
|
564
557
|
|
558
|
+
# Register the signal handler.
|
559
|
+
# The child processes will send SIGQUIT to this process when any error happens
|
560
|
+
# This process then cleans up the whole process tree.
|
561
|
+
def sigquit_handler(signum, frame):
|
562
|
+
kill_process_tree(os.getpid())
|
563
|
+
|
564
|
+
signal.signal(signal.SIGQUIT, sigquit_handler)
|
565
|
+
|
566
|
+
# Set mp start method
|
565
567
|
mp.set_start_method("spawn", force=True)
|
566
568
|
|
567
569
|
|
@@ -588,7 +590,7 @@ def _wait_and_warmup(server_args, pipe_finish_writer):
|
|
588
590
|
if pipe_finish_writer is not None:
|
589
591
|
pipe_finish_writer.send(last_traceback)
|
590
592
|
logger.error(f"Initialization failed. warmup error: {last_traceback}")
|
591
|
-
|
593
|
+
kill_process_tree(os.getpid())
|
592
594
|
return
|
593
595
|
|
594
596
|
model_info = res.json()
|
@@ -621,9 +623,10 @@ def _wait_and_warmup(server_args, pipe_finish_writer):
|
|
621
623
|
if pipe_finish_writer is not None:
|
622
624
|
pipe_finish_writer.send(last_traceback)
|
623
625
|
logger.error(f"Initialization failed. warmup error: {last_traceback}")
|
624
|
-
|
626
|
+
kill_process_tree(os.getpid())
|
625
627
|
return
|
626
628
|
|
629
|
+
# Debug print
|
627
630
|
# logger.info(f"{res.json()=}")
|
628
631
|
|
629
632
|
logger.info("The server is fired up and ready to roll!")
|
@@ -690,7 +693,7 @@ class Runtime:
|
|
690
693
|
|
691
694
|
def shutdown(self):
|
692
695
|
if self.pid is not None:
|
693
|
-
|
696
|
+
kill_process_tree(self.pid)
|
694
697
|
self.pid = None
|
695
698
|
|
696
699
|
def cache_prefix(self, prefix: str):
|
@@ -800,18 +803,11 @@ class Engine:
|
|
800
803
|
launching the HTTP server adds unnecessary complexity or overhead,
|
801
804
|
"""
|
802
805
|
|
803
|
-
def __init__(self, *args, **kwargs):
|
804
|
-
|
806
|
+
def __init__(self, log_level: str = "error", *args, **kwargs):
|
805
807
|
# before python program terminates, call shutdown implicitly. Therefore, users don't have to explicitly call .shutdown()
|
806
808
|
atexit.register(self.shutdown)
|
807
809
|
|
808
|
-
|
809
|
-
# offline engine works in scripts, so we set it to error
|
810
|
-
|
811
|
-
if "log_level" not in kwargs:
|
812
|
-
kwargs["log_level"] = "error"
|
813
|
-
|
814
|
-
server_args = ServerArgs(*args, **kwargs)
|
810
|
+
server_args = ServerArgs(*args, log_level=log_level, **kwargs)
|
815
811
|
launch_engine(server_args=server_args)
|
816
812
|
|
817
813
|
def generate(
|
@@ -914,7 +910,7 @@ class Engine:
|
|
914
910
|
return ret
|
915
911
|
|
916
912
|
def shutdown(self):
|
917
|
-
|
913
|
+
kill_process_tree(os.getpid(), include_parent=False)
|
918
914
|
|
919
915
|
def get_tokenizer(self):
|
920
916
|
global tokenizer_manager
|
@@ -934,5 +930,11 @@ class Engine:
|
|
934
930
|
loop = asyncio.get_event_loop()
|
935
931
|
return loop.run_until_complete(encode_request(obj, None))
|
936
932
|
|
933
|
+
def start_profile(self):
|
934
|
+
tokenizer_manager.start_profile()
|
935
|
+
|
936
|
+
def stop_profile(self):
|
937
|
+
tokenizer_manager.stop_profile()
|
938
|
+
|
937
939
|
async def get_server_info(self):
|
938
940
|
return await _get_server_info()
|
sglang/srt/server_args.py
CHANGED
@@ -144,7 +144,7 @@ class ServerArgs:
|
|
144
144
|
if self.served_model_name is None:
|
145
145
|
self.served_model_name = self.model_path
|
146
146
|
|
147
|
-
if self.chunked_prefill_size <= 0:
|
147
|
+
if self.chunked_prefill_size is not None and self.chunked_prefill_size <= 0:
|
148
148
|
# Disable chunked prefill
|
149
149
|
self.chunked_prefill_size = None
|
150
150
|
|
sglang/srt/utils.py
CHANGED
@@ -443,26 +443,14 @@ def assert_pkg_version(pkg: str, min_version: str, message: str):
|
|
443
443
|
)
|
444
444
|
|
445
445
|
|
446
|
-
def
|
447
|
-
"""Kill the
|
448
|
-
|
449
|
-
|
450
|
-
|
451
|
-
parent_process.pid, include_self=True, skip_pid=current_process.pid
|
452
|
-
)
|
453
|
-
try:
|
454
|
-
current_process.kill()
|
455
|
-
except psutil.NoSuchProcess:
|
456
|
-
pass
|
457
|
-
|
458
|
-
|
459
|
-
def kill_child_process(pid=None, include_self=False, skip_pid=None):
|
460
|
-
"""Kill the process and all its children process."""
|
461
|
-
if pid is None:
|
462
|
-
pid = os.getpid()
|
446
|
+
def kill_process_tree(parent_pid, include_parent: bool = True, skip_pid: int = None):
|
447
|
+
"""Kill the process and all its child processes."""
|
448
|
+
if parent_pid is None:
|
449
|
+
parent_pid = os.getpid()
|
450
|
+
include_parent = False
|
463
451
|
|
464
452
|
try:
|
465
|
-
itself = psutil.Process(
|
453
|
+
itself = psutil.Process(parent_pid)
|
466
454
|
except psutil.NoSuchProcess:
|
467
455
|
return
|
468
456
|
|
@@ -475,13 +463,13 @@ def kill_child_process(pid=None, include_self=False, skip_pid=None):
|
|
475
463
|
except psutil.NoSuchProcess:
|
476
464
|
pass
|
477
465
|
|
478
|
-
if
|
466
|
+
if include_parent:
|
479
467
|
try:
|
480
468
|
itself.kill()
|
481
469
|
|
482
470
|
# Sometimes processes cannot be killed with SIGKILL (e.g., PID=1 launched by Kubernetes),
|
483
471
|
# so we send an additional signal to kill them.
|
484
|
-
itself.send_signal(signal.
|
472
|
+
itself.send_signal(signal.SIGQUIT)
|
485
473
|
except psutil.NoSuchProcess:
|
486
474
|
pass
|
487
475
|
|
sglang/test/test_utils.py
CHANGED
@@ -22,7 +22,7 @@ from sglang.bench_serving import run_benchmark
|
|
22
22
|
from sglang.global_config import global_config
|
23
23
|
from sglang.lang.backend.openai import OpenAI
|
24
24
|
from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
|
25
|
-
from sglang.srt.utils import get_bool_env_var,
|
25
|
+
from sglang.srt.utils import get_bool_env_var, kill_process_tree
|
26
26
|
from sglang.test.run_eval import run_eval
|
27
27
|
from sglang.utils import get_exception_traceback
|
28
28
|
|
@@ -504,7 +504,7 @@ def run_unittest_files(files: List[str], timeout_per_file: float):
|
|
504
504
|
)
|
505
505
|
assert ret_code == 0
|
506
506
|
except TimeoutError:
|
507
|
-
|
507
|
+
kill_process_tree(process.pid)
|
508
508
|
time.sleep(5)
|
509
509
|
print(
|
510
510
|
f"\nTimeout after {timeout_per_file} seconds when running {filename}\n",
|
@@ -578,7 +578,7 @@ def run_bench_serving(
|
|
578
578
|
run_benchmark(warmup_args)
|
579
579
|
res = run_benchmark(args)
|
580
580
|
finally:
|
581
|
-
|
581
|
+
kill_process_tree(process.pid)
|
582
582
|
|
583
583
|
assert res["completed"] == num_prompts
|
584
584
|
return res
|
@@ -611,7 +611,7 @@ def run_bench_one_batch(model, other_args):
|
|
611
611
|
lastline = output.split("\n")[-3]
|
612
612
|
output_throughput = float(lastline.split(" ")[-2])
|
613
613
|
finally:
|
614
|
-
|
614
|
+
kill_process_tree(process.pid)
|
615
615
|
|
616
616
|
return output_throughput
|
617
617
|
|
@@ -677,8 +677,14 @@ def run_and_check_memory_leak(
|
|
677
677
|
enable_mixed_chunk,
|
678
678
|
disable_overlap,
|
679
679
|
chunked_prefill_size,
|
680
|
+
assert_has_abort,
|
680
681
|
):
|
681
|
-
other_args = [
|
682
|
+
other_args = [
|
683
|
+
"--chunked-prefill-size",
|
684
|
+
str(chunked_prefill_size),
|
685
|
+
"--log-level",
|
686
|
+
"debug",
|
687
|
+
]
|
682
688
|
if disable_radix_cache:
|
683
689
|
other_args += ["--disable-radix-cache"]
|
684
690
|
if enable_mixed_chunk:
|
@@ -710,8 +716,8 @@ def run_and_check_memory_leak(
|
|
710
716
|
workload_func(base_url, model)
|
711
717
|
|
712
718
|
# Clean up everything
|
713
|
-
|
714
|
-
|
719
|
+
kill_process_tree(process.pid)
|
720
|
+
kill_process_tree(process.pid)
|
715
721
|
stdout.close()
|
716
722
|
stderr.close()
|
717
723
|
if os.path.exists(STDOUT_FILENAME):
|
@@ -723,14 +729,19 @@ def run_and_check_memory_leak(
|
|
723
729
|
# Assert success
|
724
730
|
has_new_server = False
|
725
731
|
has_leak = False
|
732
|
+
has_abort = False
|
726
733
|
for line in output_lines:
|
727
734
|
if "The server is fired" in line:
|
728
735
|
has_new_server = True
|
729
736
|
if "leak" in line:
|
730
737
|
has_leak = True
|
738
|
+
if "Abort" in line:
|
739
|
+
has_abort = True
|
731
740
|
|
732
741
|
assert has_new_server
|
733
742
|
assert not has_leak
|
743
|
+
if assert_has_abort:
|
744
|
+
assert has_abort
|
734
745
|
|
735
746
|
|
736
747
|
def run_mmlu_test(
|
@@ -761,6 +772,7 @@ def run_mmlu_test(
|
|
761
772
|
enable_mixed_chunk,
|
762
773
|
disable_overlap,
|
763
774
|
chunked_prefill_size,
|
775
|
+
assert_has_abort=False,
|
764
776
|
)
|
765
777
|
|
766
778
|
|
@@ -800,4 +812,5 @@ def run_mulit_request_test(
|
|
800
812
|
enable_mixed_chunk,
|
801
813
|
enable_overlap,
|
802
814
|
chunked_prefill_size,
|
815
|
+
assert_has_abort=False,
|
803
816
|
)
|
sglang/utils.py
CHANGED
@@ -348,9 +348,9 @@ def wait_for_server(base_url: str, timeout: int = None) -> None:
|
|
348
348
|
|
349
349
|
|
350
350
|
def terminate_process(process):
|
351
|
-
from sglang.srt.utils import
|
351
|
+
from sglang.srt.utils import kill_process_tree
|
352
352
|
|
353
|
-
|
353
|
+
kill_process_tree(process.pid)
|
354
354
|
|
355
355
|
|
356
356
|
def print_highlight(html_content: str):
|
sglang/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.3.6.
|
1
|
+
__version__ = "0.3.6.post3"
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: sglang
|
3
|
-
Version: 0.3.6.
|
3
|
+
Version: 0.3.6.post3
|
4
4
|
Summary: SGLang is yet another fast serving framework for large language models and vision language models.
|
5
5
|
License: Apache License
|
6
6
|
Version 2.0, January 2004
|
@@ -241,6 +241,7 @@ Requires-Dist: sglang[runtime_common]; extra == "srt"
|
|
241
241
|
Requires-Dist: torch; extra == "srt"
|
242
242
|
Requires-Dist: vllm>=0.6.3.post1; extra == "srt"
|
243
243
|
Requires-Dist: cuda-python; extra == "srt"
|
244
|
+
Requires-Dist: flashinfer>=0.1.6; extra == "srt"
|
244
245
|
Provides-Extra: srt-hip
|
245
246
|
Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
|
246
247
|
Requires-Dist: torch; extra == "srt-hip"
|
@@ -1,16 +1,16 @@
|
|
1
1
|
sglang/__init__.py,sha256=3M0oz0ZA8fULhV5LwQ4hxh-MRdHsOJRD1D63C60pdG4,1616
|
2
2
|
sglang/api.py,sha256=NdO6cYnklnEBQBKqQjlqI8-P1EownKQ71t5ibCGhEVo,6953
|
3
3
|
sglang/bench_latency.py,sha256=oZjSAzX7dUiSu-zdz0dkyUPo-qAX_lsXFH1gf03akgI,76
|
4
|
-
sglang/bench_offline_throughput.py,sha256=
|
5
|
-
sglang/bench_one_batch.py,sha256=
|
6
|
-
sglang/bench_one_batch_server.py,sha256
|
4
|
+
sglang/bench_offline_throughput.py,sha256=3OrFI26PmoVTU3pQrBFC50AZI7HpKKuk4vYycbkDjhY,12428
|
5
|
+
sglang/bench_one_batch.py,sha256=iSev0LruPdfJ49mVeCJNFREmgex2omDSpTgwHgRDNIo,15692
|
6
|
+
sglang/bench_one_batch_server.py,sha256=-fV9FTLNNcSIy0pgYeggXedPVK0fVsXZqVQswT8OMOY,5945
|
7
7
|
sglang/bench_serving.py,sha256=hI7FjaERyqKBrYtKewDU6E4rSufKxqsUPyUgtWtTKSI,52545
|
8
8
|
sglang/check_env.py,sha256=rE4ZAG0e6M-Xd-qdHcKclN8Qav6b9gEh4yvlV_TbOg0,5450
|
9
9
|
sglang/global_config.py,sha256=fnT0U9vlHdGaQFKN9tYTnUF4-eVW4HYQURd5zvPtrg0,1286
|
10
|
-
sglang/launch_server.py,sha256=
|
10
|
+
sglang/launch_server.py,sha256=4y2QeSj0wVNB9MJQZeahD4ahTDU6gwqo7MPUytyFop0,403
|
11
11
|
sglang/launch_server_llavavid.py,sha256=tGc17S1vUfLwbi1GB26oOdXxTWr7gjlqpTrPnrMRNO8,1007
|
12
|
-
sglang/utils.py,sha256=
|
13
|
-
sglang/version.py,sha256=
|
12
|
+
sglang/utils.py,sha256=r4Dw-xffcrTRposls-gqyoYxjgJNYhVduK_6bDN_Vj4,11526
|
13
|
+
sglang/version.py,sha256=pyFIjLZBxCp2AwTbeLFaYhEL1dFXGzpYW00Vkg2755Y,28
|
14
14
|
sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
15
15
|
sglang/lang/chat_template.py,sha256=jprS3-In2FTUoedKwZg-HYvDwU8RTIYntOlf2zoN2sU,14814
|
16
16
|
sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
|
@@ -29,9 +29,9 @@ sglang/srt/conversation.py,sha256=u9zFU8aMYzwHUbQRKU76B_T-jfLlPoxUcWG_nRbDM2I,21
|
|
29
29
|
sglang/srt/hf_transformers_utils.py,sha256=sUUCpjbTHuYDMuwOaz00nH5fataXKjliD8gCxXU64sw,6712
|
30
30
|
sglang/srt/mm_utils.py,sha256=1ScBunw_x4W8ebM_AcJ62-1T2mfT8NlMJqdAhkF1lb0,12367
|
31
31
|
sglang/srt/model_parallel.py,sha256=QR-Alqo0sElDXPJ79N1PhUHHKiEHPQn3dyXduMP-SHQ,3664
|
32
|
-
sglang/srt/server.py,sha256=
|
33
|
-
sglang/srt/server_args.py,sha256=
|
34
|
-
sglang/srt/utils.py,sha256=
|
32
|
+
sglang/srt/server.py,sha256=1A_RdzTgeVPKcoZvsLs0dH9U3ZOY2MWjS6X3EUmwzPs,31011
|
33
|
+
sglang/srt/server_args.py,sha256=PbkhdNkr46Ngv3_JPplo5jLw78pebRxNVTiIb-9uPVA,30876
|
34
|
+
sglang/srt/utils.py,sha256=TWeASu4TOqIbvb-rJ0CYvFcMyk67hPJxQZnvyqrKu8k,33585
|
35
35
|
sglang/srt/configs/__init__.py,sha256=_usVIXHQjft4PAJ1Y-yGQOn2QNOv501GYMlQwpGXbns,208
|
36
36
|
sglang/srt/configs/exaone.py,sha256=Duxd4yQoKy8GWEzZD_kCY_OzmN_67CTJL_Kgn0eXk3g,10731
|
37
37
|
sglang/srt/configs/model_config.py,sha256=r5N_OO4w3_R3kZ80P-ZPECscXmspI41d1vc6uEE9ixM,9526
|
@@ -61,9 +61,6 @@ sglang/srt/layers/attention/triton_ops/decode_attention.py,sha256=BE63WhKiutSNkh
|
|
61
61
|
sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py,sha256=1pSXfY3EEaM7iRN_uElHnAfsrJMhTFbu9fj8Z0O2PbE,21480
|
62
62
|
sglang/srt/layers/attention/triton_ops/extend_attention.py,sha256=Gfct-0_l-S2ZrP4F-zkzNiFbmd3C3f7uJovacOuDxaA,11472
|
63
63
|
sglang/srt/layers/attention/triton_ops/prefill_attention.py,sha256=lojFXRZMLWkzS2Y8uxaolnQhXaWKG19mCAWaF5KQeiI,6087
|
64
|
-
sglang/srt/layers/fused_moe_grok/__init__.py,sha256=rj_JBzcP--eaaM6LGQ-u580uQvqLisp5JtGBAs1fVYc,80
|
65
|
-
sglang/srt/layers/fused_moe_grok/fused_moe.py,sha256=bxRcjdALxeY3FDnKivGOoNr6Er1kh6CCPtlAp7pjz50,23844
|
66
|
-
sglang/srt/layers/fused_moe_grok/layer.py,sha256=v-o5YHYEU2HIEZwouyuc3UyfNj7YQrEYOO_BXKELU7Y,23453
|
67
64
|
sglang/srt/layers/fused_moe_triton/__init__.py,sha256=PHKFqd2hPOO-g9kSMseg2g76lpg9OGXQDThWU6bt9vs,902
|
68
65
|
sglang/srt/layers/fused_moe_triton/fused_moe.py,sha256=qwfRBOeY5DT48Q6z71Eh9cjFehvs_K6eLIVWNL044Ug,28363
|
69
66
|
sglang/srt/layers/fused_moe_triton/layer.py,sha256=URDkTt8xEqnqpO5tb_3L7JlhlO53VWfqDDNSRYEu-LY,21545
|
@@ -72,17 +69,17 @@ sglang/srt/layers/quantization/base_config.py,sha256=daK9p0aijMszLUm1W4Pc33FK87M
|
|
72
69
|
sglang/srt/lora/lora.py,sha256=KhhO9aKCyFWvJnhI07lZKANIvNjtt882HrTYFNBZMv0,15065
|
73
70
|
sglang/srt/lora/lora_config.py,sha256=a2fTQESlCbG1xLiBYy4ptZ6c0Burcqyg1_6V1XSok-Y,1506
|
74
71
|
sglang/srt/lora/lora_manager.py,sha256=DHiqdl0_4wQ5PxZBZtlCpP14515mDV2_H9tzL3Rdss8,12886
|
75
|
-
sglang/srt/managers/data_parallel_controller.py,sha256=
|
76
|
-
sglang/srt/managers/detokenizer_manager.py,sha256=
|
77
|
-
sglang/srt/managers/image_processor.py,sha256=
|
78
|
-
sglang/srt/managers/io_struct.py,sha256=
|
79
|
-
sglang/srt/managers/schedule_batch.py,sha256=
|
72
|
+
sglang/srt/managers/data_parallel_controller.py,sha256=psI4FAuBGjtdnEuwagnGdtRqvqSSxOROfNKQqVDqlVA,8382
|
73
|
+
sglang/srt/managers/detokenizer_manager.py,sha256=TtrtE37XT5XcJzk8-R5rHZ16NHTPd5XZi8hf3h-sB2A,7462
|
74
|
+
sglang/srt/managers/image_processor.py,sha256=Y8RgyrzbJjJTpjbnZDa5qiiG5wWjZ68rOXUPDi6kkFo,13698
|
75
|
+
sglang/srt/managers/io_struct.py,sha256=bvhHIRSBpaCXFQqRBTpxy-hjvNtDxSfoDJ5XWCHoy6g,13646
|
76
|
+
sglang/srt/managers/schedule_batch.py,sha256=dTeM0U1xvUq_GlHy8SQft6-pP76cSubPKzCClsQ9MgM,44801
|
80
77
|
sglang/srt/managers/schedule_policy.py,sha256=ayFz4iPLIlG8mx5i1glTCAMHJPGpFedMP9UgRtqkNhA,12526
|
81
|
-
sglang/srt/managers/scheduler.py,sha256=
|
82
|
-
sglang/srt/managers/session_controller.py,sha256=
|
83
|
-
sglang/srt/managers/tokenizer_manager.py,sha256=
|
78
|
+
sglang/srt/managers/scheduler.py,sha256=aEU-6_0w-HbpFTMSoiDqf3mj_UfedjugCUvnQBmhgoU,56571
|
79
|
+
sglang/srt/managers/session_controller.py,sha256=Yp-IV3rXczACZxZXmF-QxW9CWICGy8KHQ9ttBGJ8WXA,2800
|
80
|
+
sglang/srt/managers/tokenizer_manager.py,sha256=jLzoEIhQWzZX7rcLZ290vfnPY2ghxWdYhf7YJQtUC3s,25339
|
84
81
|
sglang/srt/managers/tp_worker.py,sha256=1SQJ60iKS9e5vGY555fT1iZ4OtLumXzeWfB08fSWKbk,6176
|
85
|
-
sglang/srt/managers/tp_worker_overlap_thread.py,sha256=
|
82
|
+
sglang/srt/managers/tp_worker_overlap_thread.py,sha256=pLQOHj-nFrqHyVFP-JvrU--tjh1X1yET_NJIFHp0H0I,7990
|
86
83
|
sglang/srt/mem_cache/base_prefix_cache.py,sha256=qEQwEkG4E5rab2ZoTqcesf5pR_J4nV2jBxIHsBJHtIM,924
|
87
84
|
sglang/srt/mem_cache/chunk_cache.py,sha256=VcCpyrf5FOQ5xoKeOouCI5ZQLkZo_pgY1SPbDDkagGg,2492
|
88
85
|
sglang/srt/mem_cache/flush_cache.py,sha256=GYcxmNXh4hsMpFfNOuCTpKilW7guZwTtAg_usVeM3J0,979
|
@@ -105,14 +102,14 @@ sglang/srt/models/gemma2.py,sha256=lbfQhQpUhf1MAEB_00Uo6rp20k4Hr353UbPKKuMsxec,1
|
|
105
102
|
sglang/srt/models/gemma2_reward.py,sha256=cQawatbsfBuWQTueivYHl_17ZoQUHEelI1sr1y5pvfY,2556
|
106
103
|
sglang/srt/models/gpt2.py,sha256=Th7_Dnkw82GFBOuMOTrHtA44JBPHRUtY3Qd73rQwzMc,9741
|
107
104
|
sglang/srt/models/gpt_bigcode.py,sha256=lYo4ajy49VvvPkaduaFtOaCRT_ItqyNUE158S-BI5QA,10136
|
108
|
-
sglang/srt/models/grok.py,sha256=
|
105
|
+
sglang/srt/models/grok.py,sha256=d6qvj_i_Pam4dV_WInUvw2cIH3s6hCj-skbgyvLld1E,13923
|
109
106
|
sglang/srt/models/internlm2.py,sha256=DxbA15d9QR0tLOczpC6DkB8QyNHXJRdZatY6Nskwv1k,12170
|
110
107
|
sglang/srt/models/internlm2_reward.py,sha256=Lr-JA0vfTQJt9q5oDMiopGuoXAevyEv5PAoDe2rsTJk,2425
|
111
108
|
sglang/srt/models/llama.py,sha256=FSGuM3BamhuT5h2jedh5cSFwFYduOJwkAZJJ672awRw,16423
|
112
109
|
sglang/srt/models/llama_classification.py,sha256=c8WZ1ADa3f6s2IJVoP10ouVgeCwv_ndns_qMgLrC6QI,3413
|
113
110
|
sglang/srt/models/llama_embedding.py,sha256=2ex2jrz31osaAd9V8sJeN0qyxmk-L5NgOBkXL1puGhI,3166
|
114
111
|
sglang/srt/models/llama_reward.py,sha256=prhHDPpf1k6tlQtGE6zq5gx0uSZAD3W5v7W28bdgy4U,4619
|
115
|
-
sglang/srt/models/llava.py,sha256=
|
112
|
+
sglang/srt/models/llava.py,sha256=G6EcAJ84FvV4sae3Rrmdp-Bm-cczynSWSR16Ig-QiSw,25319
|
116
113
|
sglang/srt/models/llavavid.py,sha256=DeWqGSmXgIYGuLyy2ZrxjM9WqbRjueP4chNmXt7Bnus,12221
|
117
114
|
sglang/srt/models/minicpm.py,sha256=KbiTf-kaDAJxSo9Z4IGMTrs9WrYYji1KXO1kA2iy-as,13816
|
118
115
|
sglang/srt/models/minicpm3.py,sha256=C43mTr2Qjccj4sXuTDgzbfZhvCNbsEHNggMRXQ7SrWs,25108
|
@@ -121,18 +118,19 @@ sglang/srt/models/mixtral.py,sha256=E3d8I7V3Dp1nCEHRbhh-PKBG8UaVK5XOHwl9QyIjcX0,
|
|
121
118
|
sglang/srt/models/mixtral_quant.py,sha256=o-oTG8BGtWuNu-o6muHSarMNBQwrjQowyBFOQhuclZ8,14065
|
122
119
|
sglang/srt/models/mllama.py,sha256=pET1x8wY04yoS8HMCncKx0tFPqGp78K8rlA7Eq7XioE,37889
|
123
120
|
sglang/srt/models/olmo.py,sha256=DEUPNDM0z83N-Qdhkj2WJMtbiz5JNbSBMIjUaYZN9RM,12068
|
121
|
+
sglang/srt/models/olmo2.py,sha256=NriLbVKNGSR9bs0V8feeEorkRSr9BjlYbv50AReo2s4,13469
|
124
122
|
sglang/srt/models/olmoe.py,sha256=jVKrjqQQrWLdlkGSGUaMPdT9PHzNH4X-RVwON29eaGw,15412
|
125
123
|
sglang/srt/models/phi3_small.py,sha256=fxqGU0xphJzTeuBW38SRRYpRb2rcsg53JxuObK0pZig,15141
|
126
124
|
sglang/srt/models/qwen.py,sha256=P9zcFnz_Tsz73tVtLRwZ8uWzCtMxWOrzlv2o9Ys_Gck,9947
|
127
125
|
sglang/srt/models/qwen2.py,sha256=ApFFASNwvrkDXi-KkCNA7fTk4uLMuJWoMg15zCaAKdA,12514
|
128
126
|
sglang/srt/models/qwen2_moe.py,sha256=1oxDsKDq3jlHKx9jMi1SfHOqCRVyN5n76uw3M-CUODE,17048
|
129
|
-
sglang/srt/models/qwen2_vl.py,sha256=
|
127
|
+
sglang/srt/models/qwen2_vl.py,sha256=wFKBq52nZ5Q1sloDNh9YcYIoJ-4QpGVA15StxRMBuYE,26785
|
130
128
|
sglang/srt/models/stablelm.py,sha256=jpmsyWMJo_9JapOESnuV7ObNCh78BRznXY0iFvvIbZE,11354
|
131
129
|
sglang/srt/models/torch_native_llama.py,sha256=vNQxsnbVAY1bdyMCCWDZAtWdbaFIiJXhmVxHjk5BB9Y,19400
|
132
130
|
sglang/srt/models/xverse.py,sha256=LGe0ma0wOir3x-OLBT_cRocw8JEo9d3AYNxgA2OcLrk,13659
|
133
131
|
sglang/srt/models/xverse_moe.py,sha256=YqbzkSsnTFt-8-aI8YobF9qJA70qrBjbS1Kjn1KNqVY,15766
|
134
132
|
sglang/srt/models/yivl.py,sha256=yj4aWsOBVGQBLurSrLmYXVC7zGIPH7EYHHtAaAZ7Liw,4859
|
135
|
-
sglang/srt/openai_api/adapter.py,sha256=
|
133
|
+
sglang/srt/openai_api/adapter.py,sha256=Rit_XJ4h-O1-_fwivIkcGHt1hLLz3Y3XdTtEtDTnBcU,53634
|
136
134
|
sglang/srt/openai_api/protocol.py,sha256=vBgrbTqtECsZ5dG0rgP1FHsTBt4eR9zbDX3FBIN-rz4,10172
|
137
135
|
sglang/srt/sampling/sampling_batch_info.py,sha256=YC-KPyDWyLGNPL4YVcst4xwP8Wlz2zcCNJHB_5zljXQ,8470
|
138
136
|
sglang/srt/sampling/sampling_params.py,sha256=n7RbBg_bS5fYhsiWa8uJYnfoXy_i5DvtTBOkuFnHDNU,5286
|
@@ -155,10 +153,10 @@ sglang/test/simple_eval_mmlu.py,sha256=FkwamjGMjueTixymkedF-YiPloSLiy4ftILFUrKZ9
|
|
155
153
|
sglang/test/test_activation.py,sha256=jkdNRzJnbd5OgZliQaIXpxovlcky17UrweomcOcMxoE,1442
|
156
154
|
sglang/test/test_layernorm.py,sha256=IacByD5d-stXjzBz8Ypamc7povlcedpKPbb_4JLgo3c,3720
|
157
155
|
sglang/test/test_programs.py,sha256=1Z0umrsUu9pagzyGH5SrXl_qhKSyTfUv_kWC2mcn0qo,18208
|
158
|
-
sglang/test/test_utils.py,sha256=
|
156
|
+
sglang/test/test_utils.py,sha256=mPRTn1ORMiJODa_wWpH8QQG-IuZuQYZp9nGjyIcIHHU,23645
|
159
157
|
sglang/test/srt/sampling/penaltylib/utils.py,sha256=CjxHgywh0hx_87iynzQt_ztHu6zBVuE-YrZ-XPmW6U4,12906
|
160
|
-
sglang-0.3.6.
|
161
|
-
sglang-0.3.6.
|
162
|
-
sglang-0.3.6.
|
163
|
-
sglang-0.3.6.
|
164
|
-
sglang-0.3.6.
|
158
|
+
sglang-0.3.6.post3.dist-info/LICENSE,sha256=FJXh51fvTQklojUFY89XVLsjxRcBqOxPs8XNy-2uZ0c,11346
|
159
|
+
sglang-0.3.6.post3.dist-info/METADATA,sha256=2EeNTAznmmP399LIX3w0evy2Q2x6IqOSMoRKXJZLrSM,22171
|
160
|
+
sglang-0.3.6.post3.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
|
161
|
+
sglang-0.3.6.post3.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
|
162
|
+
sglang-0.3.6.post3.dist-info/RECORD,,
|
@@ -1 +0,0 @@
|
|
1
|
-
from sglang.srt.layers.fused_moe_grok.layer import FusedMoE, FusedMoEMethodBase
|