sglang 0.3.6.post2__py3-none-any.whl → 0.3.6.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sglang/srt/server.py CHANGED
@@ -23,6 +23,7 @@ import json
23
23
  import logging
24
24
  import multiprocessing as mp
25
25
  import os
26
+ import signal
26
27
  import threading
27
28
  import time
28
29
  from http import HTTPStatus
@@ -79,7 +80,7 @@ from sglang.srt.utils import (
79
80
  configure_logger,
80
81
  delete_directory,
81
82
  is_port_available,
82
- kill_child_process,
83
+ kill_process_tree,
83
84
  maybe_set_triton_cache_manager,
84
85
  prepare_model_and_tokenizer,
85
86
  set_prometheus_multiproc_dir,
@@ -92,7 +93,7 @@ logger = logging.getLogger(__name__)
92
93
 
93
94
  asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
94
95
 
95
-
96
+ # Fast API
96
97
  app = FastAPI()
97
98
  app.add_middleware(
98
99
  CORSMiddleware,
@@ -103,7 +104,7 @@ app.add_middleware(
103
104
  )
104
105
 
105
106
  tokenizer_manager: TokenizerManager = None
106
- _max_total_num_tokens = None
107
+ scheduler_info: Dict = None
107
108
 
108
109
  ##### Native API endpoints #####
109
110
 
@@ -171,7 +172,7 @@ async def flush_cache():
171
172
 
172
173
  @app.get("/start_profile")
173
174
  @app.post("/start_profile")
174
- async def start_profile():
175
+ async def start_profile_async():
175
176
  """Start profiling."""
176
177
  tokenizer_manager.start_profile()
177
178
  return Response(
@@ -182,7 +183,7 @@ async def start_profile():
182
183
 
183
184
  @app.get("/stop_profile")
184
185
  @app.post("/stop_profile")
185
- async def stop_profile():
186
+ async def stop_profile_async():
186
187
  """Stop profiling."""
187
188
  tokenizer_manager.stop_profile()
188
189
  return Response(
@@ -233,6 +234,8 @@ async def close_session(obj: CloseSessionReqInput, request: Request):
233
234
  )
234
235
 
235
236
 
237
+ # fastapi implicitly converts json in the request to obj (dataclass)
238
+ @app.api_route("/generate", methods=["POST", "PUT"])
236
239
  @time_func_latency
237
240
  async def generate_request(obj: GenerateReqInput, request: Request):
238
241
  """Handle a generate request."""
@@ -266,11 +269,7 @@ async def generate_request(obj: GenerateReqInput, request: Request):
266
269
  )
267
270
 
268
271
 
269
- # fastapi implicitly converts json in the request to obj (dataclass)
270
- app.post("/generate")(generate_request)
271
- app.put("/generate")(generate_request)
272
-
273
-
272
+ @app.api_route("/encode", methods=["POST", "PUT"])
274
273
  @time_func_latency
275
274
  async def encode_request(obj: EmbeddingReqInput, request: Request):
276
275
  """Handle an embedding request."""
@@ -283,10 +282,7 @@ async def encode_request(obj: EmbeddingReqInput, request: Request):
283
282
  )
284
283
 
285
284
 
286
- app.post("/encode")(encode_request)
287
- app.put("/encode")(encode_request)
288
-
289
-
285
+ @app.api_route("/encode", methods=["POST", "PUT"])
290
286
  @time_func_latency
291
287
  async def classify_request(obj: EmbeddingReqInput, request: Request):
292
288
  """Handle a reward model request. Now the arguments and return values are the same as embedding models."""
@@ -299,10 +295,6 @@ async def classify_request(obj: EmbeddingReqInput, request: Request):
299
295
  )
300
296
 
301
297
 
302
- app.post("/classify")(classify_request)
303
- app.put("/classify")(classify_request)
304
-
305
-
306
298
  ##### OpenAI-compatible API endpoints #####
307
299
 
308
300
 
@@ -380,11 +372,11 @@ def launch_engine(
380
372
  server_args: ServerArgs,
381
373
  ):
382
374
  """
383
- Launch the Tokenizer Manager in the main process, the Scheduler in a subprocess, and the Detokenizer Manager in another subprocess.
375
+ Launch the TokenizerManager in the main process, the Scheduler in a subprocess, and the DetokenizerManager in another subprocess.
384
376
  """
385
377
 
386
378
  global tokenizer_manager
387
- global _max_total_num_tokens
379
+ global scheduler_info
388
380
 
389
381
  # Configure global environment
390
382
  configure_logger(server_args)
@@ -450,8 +442,8 @@ def launch_engine(
450
442
  if server_args.chat_template:
451
443
  load_chat_template_for_openai_api(tokenizer_manager, server_args.chat_template)
452
444
 
453
- # Wait for model to finish loading & get max token nums
454
- scheduler_info = []
445
+ # Wait for model to finish loading
446
+ scheduler_infos = []
455
447
  for i in range(len(scheduler_pipe_readers)):
456
448
  data = scheduler_pipe_readers[i].recv()
457
449
 
@@ -459,10 +451,10 @@ def launch_engine(
459
451
  raise RuntimeError(
460
452
  "Initialization failed. Please see the error messages above."
461
453
  )
462
- scheduler_info.append(data)
454
+ scheduler_infos.append(data)
463
455
 
464
456
  # Assume all schedulers have same max_total_num_tokens
465
- _max_total_num_tokens = scheduler_info[0]["max_total_num_tokens"]
457
+ scheduler_info = scheduler_infos[0]
466
458
 
467
459
 
468
460
  def launch_server(
@@ -476,12 +468,12 @@ def launch_server(
476
468
 
477
469
  1. HTTP server: A FastAPI server that routes requests to the engine.
478
470
  2. SRT engine:
479
- 1. Tokenizer Manager: Tokenizes the requests and sends them to the scheduler.
471
+ 1. TokenizerManager: Tokenizes the requests and sends them to the scheduler.
480
472
  2. Scheduler (subprocess): Receives requests from the Tokenizer Manager, schedules batches, forwards them, and sends the output tokens to the Detokenizer Manager.
481
- 3. Detokenizer Manager (subprocess): Detokenizes the output tokens and sends the result back to the Tokenizer Manager.
473
+ 3. DetokenizerManager (subprocess): Detokenizes the output tokens and sends the result back to the Tokenizer Manager.
482
474
 
483
475
  Note:
484
- 1. The HTTP server and Tokenizer Manager both run in the main process.
476
+ 1. The HTTP server and TokenizerManager both run in the main process.
485
477
  2. Inter-process communication is done through IPC (each process uses a different port) via the ZMQ library.
486
478
  """
487
479
  launch_engine(server_args=server_args)
@@ -490,7 +482,7 @@ def launch_server(
490
482
  if server_args.api_key:
491
483
  add_api_key_middleware(app, server_args.api_key)
492
484
 
493
- # add prometheus middleware
485
+ # Add prometheus middleware
494
486
  if server_args.enable_metrics:
495
487
  add_prometheus_middleware(app)
496
488
  enable_func_timer()
@@ -502,7 +494,7 @@ def launch_server(
502
494
  t.start()
503
495
 
504
496
  try:
505
- # Listen for HTTP requests
497
+ # Update logging configs
506
498
  LOGGING_CONFIG["formatters"]["default"][
507
499
  "fmt"
508
500
  ] = "[%(asctime)s] %(levelprefix)s %(message)s"
@@ -511,6 +503,8 @@ def launch_server(
511
503
  "fmt"
512
504
  ] = '[%(asctime)s] %(levelprefix)s %(client_addr)s - "%(request_line)s" %(status_code)s'
513
505
  LOGGING_CONFIG["formatters"]["access"]["datefmt"] = "%Y-%m-%d %H:%M:%S"
506
+
507
+ # Listen for HTTP requests
514
508
  uvicorn.run(
515
509
  app,
516
510
  host=server_args.host,
@@ -526,8 +520,7 @@ def launch_server(
526
520
  async def _get_server_info():
527
521
  return {
528
522
  **dataclasses.asdict(tokenizer_manager.server_args), # server args
529
- "memory_pool_size": await tokenizer_manager.get_memory_pool_size(), # memory pool size
530
- "max_total_num_tokens": _max_total_num_tokens, # max total num tokens
523
+ **scheduler_info,
531
524
  "version": __version__,
532
525
  }
533
526
 
@@ -562,6 +555,15 @@ def _set_envs_and_config(server_args: ServerArgs):
562
555
  "at https://docs.flashinfer.ai/installation.html.",
563
556
  )
564
557
 
558
+ # Register the signal handler.
559
+ # The child processes will send SIGQUIT to this process when any error happens
560
+ # This process then cleans up the whole process tree
561
+ def sigquit_handler(signum, frame):
562
+ kill_process_tree(os.getpid())
563
+
564
+ signal.signal(signal.SIGQUIT, sigquit_handler)
565
+
566
+ # Set mp start method
565
567
  mp.set_start_method("spawn", force=True)
566
568
 
567
569
 
@@ -588,7 +590,7 @@ def _wait_and_warmup(server_args, pipe_finish_writer):
588
590
  if pipe_finish_writer is not None:
589
591
  pipe_finish_writer.send(last_traceback)
590
592
  logger.error(f"Initialization failed. warmup error: {last_traceback}")
591
- kill_child_process(include_self=True)
593
+ kill_process_tree(os.getpid())
592
594
  return
593
595
 
594
596
  model_info = res.json()
@@ -621,9 +623,10 @@ def _wait_and_warmup(server_args, pipe_finish_writer):
621
623
  if pipe_finish_writer is not None:
622
624
  pipe_finish_writer.send(last_traceback)
623
625
  logger.error(f"Initialization failed. warmup error: {last_traceback}")
624
- kill_child_process(include_self=True)
626
+ kill_process_tree(os.getpid())
625
627
  return
626
628
 
629
+ # Debug print
627
630
  # logger.info(f"{res.json()=}")
628
631
 
629
632
  logger.info("The server is fired up and ready to roll!")
@@ -690,7 +693,7 @@ class Runtime:
690
693
 
691
694
  def shutdown(self):
692
695
  if self.pid is not None:
693
- kill_child_process(self.pid, include_self=True)
696
+ kill_process_tree(self.pid)
694
697
  self.pid = None
695
698
 
696
699
  def cache_prefix(self, prefix: str):
@@ -800,18 +803,11 @@ class Engine:
800
803
  launching the HTTP server adds unnecessary complexity or overhead,
801
804
  """
802
805
 
803
- def __init__(self, *args, **kwargs):
804
-
806
+ def __init__(self, log_level: str = "error", *args, **kwargs):
805
807
  # before python program terminates, call shutdown implicitly. Therefore, users don't have to explicitly call .shutdown()
806
808
  atexit.register(self.shutdown)
807
809
 
808
- # runtime server default log level is log
809
- # offline engine works in scripts, so we set it to error
810
-
811
- if "log_level" not in kwargs:
812
- kwargs["log_level"] = "error"
813
-
814
- server_args = ServerArgs(*args, **kwargs)
810
+ server_args = ServerArgs(*args, log_level=log_level, **kwargs)
815
811
  launch_engine(server_args=server_args)
816
812
 
817
813
  def generate(
@@ -914,7 +910,7 @@ class Engine:
914
910
  return ret
915
911
 
916
912
  def shutdown(self):
917
- kill_child_process()
913
+ kill_process_tree(os.getpid(), include_parent=False)
918
914
 
919
915
  def get_tokenizer(self):
920
916
  global tokenizer_manager
@@ -934,5 +930,11 @@ class Engine:
934
930
  loop = asyncio.get_event_loop()
935
931
  return loop.run_until_complete(encode_request(obj, None))
936
932
 
933
+ def start_profile(self):
934
+ tokenizer_manager.start_profile()
935
+
936
+ def stop_profile(self):
937
+ tokenizer_manager.stop_profile()
938
+
937
939
  async def get_server_info(self):
938
940
  return await _get_server_info()
sglang/srt/server_args.py CHANGED
@@ -144,7 +144,7 @@ class ServerArgs:
144
144
  if self.served_model_name is None:
145
145
  self.served_model_name = self.model_path
146
146
 
147
- if self.chunked_prefill_size <= 0:
147
+ if self.chunked_prefill_size is not None and self.chunked_prefill_size <= 0:
148
148
  # Disable chunked prefill
149
149
  self.chunked_prefill_size = None
150
150
 
sglang/srt/utils.py CHANGED
@@ -443,26 +443,14 @@ def assert_pkg_version(pkg: str, min_version: str, message: str):
443
443
  )
444
444
 
445
445
 
446
- def kill_parent_process():
447
- """Kill the parent process and all children of the parent process."""
448
- current_process = psutil.Process()
449
- parent_process = current_process.parent()
450
- kill_child_process(
451
- parent_process.pid, include_self=True, skip_pid=current_process.pid
452
- )
453
- try:
454
- current_process.kill()
455
- except psutil.NoSuchProcess:
456
- pass
457
-
458
-
459
- def kill_child_process(pid=None, include_self=False, skip_pid=None):
460
- """Kill the process and all its children process."""
461
- if pid is None:
462
- pid = os.getpid()
446
+ def kill_process_tree(parent_pid, include_parent: bool = True, skip_pid: int = None):
447
+ """Kill the process and all its child processes."""
448
+ if parent_pid is None:
449
+ parent_pid = os.getpid()
450
+ include_parent = False
463
451
 
464
452
  try:
465
- itself = psutil.Process(pid)
453
+ itself = psutil.Process(parent_pid)
466
454
  except psutil.NoSuchProcess:
467
455
  return
468
456
 
@@ -475,13 +463,13 @@ def kill_child_process(pid=None, include_self=False, skip_pid=None):
475
463
  except psutil.NoSuchProcess:
476
464
  pass
477
465
 
478
- if include_self:
466
+ if include_parent:
479
467
  try:
480
468
  itself.kill()
481
469
 
482
470
  # Sometimes processes cannot be killed with SIGKILL (e.g., PID=1 launched by Kubernetes),
483
471
  # so we send an additional signal to kill them.
484
- itself.send_signal(signal.SIGINT)
472
+ itself.send_signal(signal.SIGQUIT)
485
473
  except psutil.NoSuchProcess:
486
474
  pass
487
475
 
sglang/test/test_utils.py CHANGED
@@ -22,7 +22,7 @@ from sglang.bench_serving import run_benchmark
22
22
  from sglang.global_config import global_config
23
23
  from sglang.lang.backend.openai import OpenAI
24
24
  from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
25
- from sglang.srt.utils import get_bool_env_var, kill_child_process
25
+ from sglang.srt.utils import get_bool_env_var, kill_process_tree
26
26
  from sglang.test.run_eval import run_eval
27
27
  from sglang.utils import get_exception_traceback
28
28
 
@@ -504,7 +504,7 @@ def run_unittest_files(files: List[str], timeout_per_file: float):
504
504
  )
505
505
  assert ret_code == 0
506
506
  except TimeoutError:
507
- kill_child_process(process.pid, include_self=True)
507
+ kill_process_tree(process.pid)
508
508
  time.sleep(5)
509
509
  print(
510
510
  f"\nTimeout after {timeout_per_file} seconds when running {filename}\n",
@@ -578,7 +578,7 @@ def run_bench_serving(
578
578
  run_benchmark(warmup_args)
579
579
  res = run_benchmark(args)
580
580
  finally:
581
- kill_child_process(process.pid, include_self=True)
581
+ kill_process_tree(process.pid)
582
582
 
583
583
  assert res["completed"] == num_prompts
584
584
  return res
@@ -611,7 +611,7 @@ def run_bench_one_batch(model, other_args):
611
611
  lastline = output.split("\n")[-3]
612
612
  output_throughput = float(lastline.split(" ")[-2])
613
613
  finally:
614
- kill_child_process(process.pid, include_self=True)
614
+ kill_process_tree(process.pid)
615
615
 
616
616
  return output_throughput
617
617
 
@@ -677,8 +677,14 @@ def run_and_check_memory_leak(
677
677
  enable_mixed_chunk,
678
678
  disable_overlap,
679
679
  chunked_prefill_size,
680
+ assert_has_abort,
680
681
  ):
681
- other_args = ["--chunked-prefill-size", str(chunked_prefill_size)]
682
+ other_args = [
683
+ "--chunked-prefill-size",
684
+ str(chunked_prefill_size),
685
+ "--log-level",
686
+ "debug",
687
+ ]
682
688
  if disable_radix_cache:
683
689
  other_args += ["--disable-radix-cache"]
684
690
  if enable_mixed_chunk:
@@ -710,8 +716,8 @@ def run_and_check_memory_leak(
710
716
  workload_func(base_url, model)
711
717
 
712
718
  # Clean up everything
713
- kill_child_process(process.pid, include_self=True)
714
- kill_child_process(process.pid, include_self=True)
719
+ kill_process_tree(process.pid)
720
+ kill_process_tree(process.pid)
715
721
  stdout.close()
716
722
  stderr.close()
717
723
  if os.path.exists(STDOUT_FILENAME):
@@ -723,14 +729,19 @@ def run_and_check_memory_leak(
723
729
  # Assert success
724
730
  has_new_server = False
725
731
  has_leak = False
732
+ has_abort = False
726
733
  for line in output_lines:
727
734
  if "The server is fired" in line:
728
735
  has_new_server = True
729
736
  if "leak" in line:
730
737
  has_leak = True
738
+ if "Abort" in line:
739
+ has_abort = True
731
740
 
732
741
  assert has_new_server
733
742
  assert not has_leak
743
+ if assert_has_abort:
744
+ assert has_abort
734
745
 
735
746
 
736
747
  def run_mmlu_test(
@@ -761,6 +772,7 @@ def run_mmlu_test(
761
772
  enable_mixed_chunk,
762
773
  disable_overlap,
763
774
  chunked_prefill_size,
775
+ assert_has_abort=False,
764
776
  )
765
777
 
766
778
 
@@ -800,4 +812,5 @@ def run_mulit_request_test(
800
812
  enable_mixed_chunk,
801
813
  enable_overlap,
802
814
  chunked_prefill_size,
815
+ assert_has_abort=False,
803
816
  )
sglang/utils.py CHANGED
@@ -348,9 +348,9 @@ def wait_for_server(base_url: str, timeout: int = None) -> None:
348
348
 
349
349
 
350
350
  def terminate_process(process):
351
- from sglang.srt.utils import kill_child_process
351
+ from sglang.srt.utils import kill_process_tree
352
352
 
353
- kill_child_process(process.pid, include_self=True)
353
+ kill_process_tree(process.pid)
354
354
 
355
355
 
356
356
  def print_highlight(html_content: str):
sglang/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.3.6.post2"
1
+ __version__ = "0.3.6.post3"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sglang
3
- Version: 0.3.6.post2
3
+ Version: 0.3.6.post3
4
4
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -241,6 +241,7 @@ Requires-Dist: sglang[runtime_common]; extra == "srt"
241
241
  Requires-Dist: torch; extra == "srt"
242
242
  Requires-Dist: vllm>=0.6.3.post1; extra == "srt"
243
243
  Requires-Dist: cuda-python; extra == "srt"
244
+ Requires-Dist: flashinfer>=0.1.6; extra == "srt"
244
245
  Provides-Extra: srt-hip
245
246
  Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
246
247
  Requires-Dist: torch; extra == "srt-hip"
@@ -1,16 +1,16 @@
1
1
  sglang/__init__.py,sha256=3M0oz0ZA8fULhV5LwQ4hxh-MRdHsOJRD1D63C60pdG4,1616
2
2
  sglang/api.py,sha256=NdO6cYnklnEBQBKqQjlqI8-P1EownKQ71t5ibCGhEVo,6953
3
3
  sglang/bench_latency.py,sha256=oZjSAzX7dUiSu-zdz0dkyUPo-qAX_lsXFH1gf03akgI,76
4
- sglang/bench_offline_throughput.py,sha256=z6uA6Gxa_nFZa0cOXi7MJDuX82xcqk5WfqBMavd8a-s,10929
5
- sglang/bench_one_batch.py,sha256=AVMpCBWEsMI2TlMK55JPgPJu0kHg8DI0WV_Bhd4pJgc,15668
6
- sglang/bench_one_batch_server.py,sha256=hYc3r9JQOLrfqmKgKPOmP0Kr63Sya9wPV_dHzMRZ2Dw,5924
4
+ sglang/bench_offline_throughput.py,sha256=3OrFI26PmoVTU3pQrBFC50AZI7HpKKuk4vYycbkDjhY,12428
5
+ sglang/bench_one_batch.py,sha256=iSev0LruPdfJ49mVeCJNFREmgex2omDSpTgwHgRDNIo,15692
6
+ sglang/bench_one_batch_server.py,sha256=-fV9FTLNNcSIy0pgYeggXedPVK0fVsXZqVQswT8OMOY,5945
7
7
  sglang/bench_serving.py,sha256=hI7FjaERyqKBrYtKewDU6E4rSufKxqsUPyUgtWtTKSI,52545
8
8
  sglang/check_env.py,sha256=rE4ZAG0e6M-Xd-qdHcKclN8Qav6b9gEh4yvlV_TbOg0,5450
9
9
  sglang/global_config.py,sha256=fnT0U9vlHdGaQFKN9tYTnUF4-eVW4HYQURd5zvPtrg0,1286
10
- sglang/launch_server.py,sha256=U17c44CbbpMBm2JQxVLaz1mfUKk7PgBDhTLAFNeJEvI,362
10
+ sglang/launch_server.py,sha256=4y2QeSj0wVNB9MJQZeahD4ahTDU6gwqo7MPUytyFop0,403
11
11
  sglang/launch_server_llavavid.py,sha256=tGc17S1vUfLwbi1GB26oOdXxTWr7gjlqpTrPnrMRNO8,1007
12
- sglang/utils.py,sha256=eCvD3fZCALr-MuyZxJL7HAeeqqpxAxf4LJrf7OiCbco,11547
13
- sglang/version.py,sha256=_Aams_yVBpGe9-85k-kF3qpgcd3D_AsWkVfMFmCWh3c,28
12
+ sglang/utils.py,sha256=r4Dw-xffcrTRposls-gqyoYxjgJNYhVduK_6bDN_Vj4,11526
13
+ sglang/version.py,sha256=pyFIjLZBxCp2AwTbeLFaYhEL1dFXGzpYW00Vkg2755Y,28
14
14
  sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
15
  sglang/lang/chat_template.py,sha256=jprS3-In2FTUoedKwZg-HYvDwU8RTIYntOlf2zoN2sU,14814
16
16
  sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
@@ -29,9 +29,9 @@ sglang/srt/conversation.py,sha256=u9zFU8aMYzwHUbQRKU76B_T-jfLlPoxUcWG_nRbDM2I,21
29
29
  sglang/srt/hf_transformers_utils.py,sha256=sUUCpjbTHuYDMuwOaz00nH5fataXKjliD8gCxXU64sw,6712
30
30
  sglang/srt/mm_utils.py,sha256=1ScBunw_x4W8ebM_AcJ62-1T2mfT8NlMJqdAhkF1lb0,12367
31
31
  sglang/srt/model_parallel.py,sha256=QR-Alqo0sElDXPJ79N1PhUHHKiEHPQn3dyXduMP-SHQ,3664
32
- sglang/srt/server.py,sha256=tH_22tnksy3bbhYu_njjx5L59pb9lJ7tU40Z2BLoiaI,30894
33
- sglang/srt/server_args.py,sha256=CfmpU6_EDnxJzpJiRx2n6AhOPCtrHPOf-7wEtTF__L0,30834
34
- sglang/srt/utils.py,sha256=QXc01TOB7abpL6p3KzfP7u2xFZohQ-ThbI5DAJGoHeI,33894
32
+ sglang/srt/server.py,sha256=1A_RdzTgeVPKcoZvsLs0dH9U3ZOY2MWjS6X3EUmwzPs,31011
33
+ sglang/srt/server_args.py,sha256=PbkhdNkr46Ngv3_JPplo5jLw78pebRxNVTiIb-9uPVA,30876
34
+ sglang/srt/utils.py,sha256=TWeASu4TOqIbvb-rJ0CYvFcMyk67hPJxQZnvyqrKu8k,33585
35
35
  sglang/srt/configs/__init__.py,sha256=_usVIXHQjft4PAJ1Y-yGQOn2QNOv501GYMlQwpGXbns,208
36
36
  sglang/srt/configs/exaone.py,sha256=Duxd4yQoKy8GWEzZD_kCY_OzmN_67CTJL_Kgn0eXk3g,10731
37
37
  sglang/srt/configs/model_config.py,sha256=r5N_OO4w3_R3kZ80P-ZPECscXmspI41d1vc6uEE9ixM,9526
@@ -61,9 +61,6 @@ sglang/srt/layers/attention/triton_ops/decode_attention.py,sha256=BE63WhKiutSNkh
61
61
  sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py,sha256=1pSXfY3EEaM7iRN_uElHnAfsrJMhTFbu9fj8Z0O2PbE,21480
62
62
  sglang/srt/layers/attention/triton_ops/extend_attention.py,sha256=Gfct-0_l-S2ZrP4F-zkzNiFbmd3C3f7uJovacOuDxaA,11472
63
63
  sglang/srt/layers/attention/triton_ops/prefill_attention.py,sha256=lojFXRZMLWkzS2Y8uxaolnQhXaWKG19mCAWaF5KQeiI,6087
64
- sglang/srt/layers/fused_moe_grok/__init__.py,sha256=rj_JBzcP--eaaM6LGQ-u580uQvqLisp5JtGBAs1fVYc,80
65
- sglang/srt/layers/fused_moe_grok/fused_moe.py,sha256=bxRcjdALxeY3FDnKivGOoNr6Er1kh6CCPtlAp7pjz50,23844
66
- sglang/srt/layers/fused_moe_grok/layer.py,sha256=v-o5YHYEU2HIEZwouyuc3UyfNj7YQrEYOO_BXKELU7Y,23453
67
64
  sglang/srt/layers/fused_moe_triton/__init__.py,sha256=PHKFqd2hPOO-g9kSMseg2g76lpg9OGXQDThWU6bt9vs,902
68
65
  sglang/srt/layers/fused_moe_triton/fused_moe.py,sha256=qwfRBOeY5DT48Q6z71Eh9cjFehvs_K6eLIVWNL044Ug,28363
69
66
  sglang/srt/layers/fused_moe_triton/layer.py,sha256=URDkTt8xEqnqpO5tb_3L7JlhlO53VWfqDDNSRYEu-LY,21545
@@ -72,17 +69,17 @@ sglang/srt/layers/quantization/base_config.py,sha256=daK9p0aijMszLUm1W4Pc33FK87M
72
69
  sglang/srt/lora/lora.py,sha256=KhhO9aKCyFWvJnhI07lZKANIvNjtt882HrTYFNBZMv0,15065
73
70
  sglang/srt/lora/lora_config.py,sha256=a2fTQESlCbG1xLiBYy4ptZ6c0Burcqyg1_6V1XSok-Y,1506
74
71
  sglang/srt/lora/lora_manager.py,sha256=DHiqdl0_4wQ5PxZBZtlCpP14515mDV2_H9tzL3Rdss8,12886
75
- sglang/srt/managers/data_parallel_controller.py,sha256=JxRtJJTVn1FU2iD292rLZPftAsR4_8j4d3yF8j0dvBc,8327
76
- sglang/srt/managers/detokenizer_manager.py,sha256=oWquBe0yvSwILwllMBJFJUEgBt1NEM_3KluAc0T6Pnw,7333
77
- sglang/srt/managers/image_processor.py,sha256=foLv3QVW_A8IRjRcHOKn0_HC771JbPEz8ML1mGqYKYw,13685
78
- sglang/srt/managers/io_struct.py,sha256=WLXz-tyn0jR7zNO9feRBXgyjphVa8qR55OoEOUdzoVI,13751
79
- sglang/srt/managers/schedule_batch.py,sha256=jBABHbL7gyrKdrFrzScJ76MtvG2D9Y5HDx74qsclo80,44470
72
+ sglang/srt/managers/data_parallel_controller.py,sha256=psI4FAuBGjtdnEuwagnGdtRqvqSSxOROfNKQqVDqlVA,8382
73
+ sglang/srt/managers/detokenizer_manager.py,sha256=TtrtE37XT5XcJzk8-R5rHZ16NHTPd5XZi8hf3h-sB2A,7462
74
+ sglang/srt/managers/image_processor.py,sha256=Y8RgyrzbJjJTpjbnZDa5qiiG5wWjZ68rOXUPDi6kkFo,13698
75
+ sglang/srt/managers/io_struct.py,sha256=bvhHIRSBpaCXFQqRBTpxy-hjvNtDxSfoDJ5XWCHoy6g,13646
76
+ sglang/srt/managers/schedule_batch.py,sha256=dTeM0U1xvUq_GlHy8SQft6-pP76cSubPKzCClsQ9MgM,44801
80
77
  sglang/srt/managers/schedule_policy.py,sha256=ayFz4iPLIlG8mx5i1glTCAMHJPGpFedMP9UgRtqkNhA,12526
81
- sglang/srt/managers/scheduler.py,sha256=JVxV3Y5AU0OOOfePVM5dVPuuN_Kd9nwV3p3vH3CHQps,56059
82
- sglang/srt/managers/session_controller.py,sha256=hajOnkNZ_JpP4E-GKMVGzyJSK4sc9uF9t229uFuxkVs,2874
83
- sglang/srt/managers/tokenizer_manager.py,sha256=zYbKEKNuM1B3PXzA7jnDpxew-0rZXSX-7dHmVLWG3e4,26477
78
+ sglang/srt/managers/scheduler.py,sha256=aEU-6_0w-HbpFTMSoiDqf3mj_UfedjugCUvnQBmhgoU,56571
79
+ sglang/srt/managers/session_controller.py,sha256=Yp-IV3rXczACZxZXmF-QxW9CWICGy8KHQ9ttBGJ8WXA,2800
80
+ sglang/srt/managers/tokenizer_manager.py,sha256=jLzoEIhQWzZX7rcLZ290vfnPY2ghxWdYhf7YJQtUC3s,25339
84
81
  sglang/srt/managers/tp_worker.py,sha256=1SQJ60iKS9e5vGY555fT1iZ4OtLumXzeWfB08fSWKbk,6176
85
- sglang/srt/managers/tp_worker_overlap_thread.py,sha256=7vhPebaOS4JamaS08CGf_hwxnUO7Gy_SXZXEPwNHKoY,7621
82
+ sglang/srt/managers/tp_worker_overlap_thread.py,sha256=pLQOHj-nFrqHyVFP-JvrU--tjh1X1yET_NJIFHp0H0I,7990
86
83
  sglang/srt/mem_cache/base_prefix_cache.py,sha256=qEQwEkG4E5rab2ZoTqcesf5pR_J4nV2jBxIHsBJHtIM,924
87
84
  sglang/srt/mem_cache/chunk_cache.py,sha256=VcCpyrf5FOQ5xoKeOouCI5ZQLkZo_pgY1SPbDDkagGg,2492
88
85
  sglang/srt/mem_cache/flush_cache.py,sha256=GYcxmNXh4hsMpFfNOuCTpKilW7guZwTtAg_usVeM3J0,979
@@ -105,14 +102,14 @@ sglang/srt/models/gemma2.py,sha256=lbfQhQpUhf1MAEB_00Uo6rp20k4Hr353UbPKKuMsxec,1
105
102
  sglang/srt/models/gemma2_reward.py,sha256=cQawatbsfBuWQTueivYHl_17ZoQUHEelI1sr1y5pvfY,2556
106
103
  sglang/srt/models/gpt2.py,sha256=Th7_Dnkw82GFBOuMOTrHtA44JBPHRUtY3Qd73rQwzMc,9741
107
104
  sglang/srt/models/gpt_bigcode.py,sha256=lYo4ajy49VvvPkaduaFtOaCRT_ItqyNUE158S-BI5QA,10136
108
- sglang/srt/models/grok.py,sha256=rDIH_SFzauuEHcL_vCOSrYLjdBC3i3o_AcceL3amsJw,14927
105
+ sglang/srt/models/grok.py,sha256=d6qvj_i_Pam4dV_WInUvw2cIH3s6hCj-skbgyvLld1E,13923
109
106
  sglang/srt/models/internlm2.py,sha256=DxbA15d9QR0tLOczpC6DkB8QyNHXJRdZatY6Nskwv1k,12170
110
107
  sglang/srt/models/internlm2_reward.py,sha256=Lr-JA0vfTQJt9q5oDMiopGuoXAevyEv5PAoDe2rsTJk,2425
111
108
  sglang/srt/models/llama.py,sha256=FSGuM3BamhuT5h2jedh5cSFwFYduOJwkAZJJ672awRw,16423
112
109
  sglang/srt/models/llama_classification.py,sha256=c8WZ1ADa3f6s2IJVoP10ouVgeCwv_ndns_qMgLrC6QI,3413
113
110
  sglang/srt/models/llama_embedding.py,sha256=2ex2jrz31osaAd9V8sJeN0qyxmk-L5NgOBkXL1puGhI,3166
114
111
  sglang/srt/models/llama_reward.py,sha256=prhHDPpf1k6tlQtGE6zq5gx0uSZAD3W5v7W28bdgy4U,4619
115
- sglang/srt/models/llava.py,sha256=HjC2TDLngpaN8HMYyGp5doEK32HeQN8iT2tYE_Slrtg,25130
112
+ sglang/srt/models/llava.py,sha256=G6EcAJ84FvV4sae3Rrmdp-Bm-cczynSWSR16Ig-QiSw,25319
116
113
  sglang/srt/models/llavavid.py,sha256=DeWqGSmXgIYGuLyy2ZrxjM9WqbRjueP4chNmXt7Bnus,12221
117
114
  sglang/srt/models/minicpm.py,sha256=KbiTf-kaDAJxSo9Z4IGMTrs9WrYYji1KXO1kA2iy-as,13816
118
115
  sglang/srt/models/minicpm3.py,sha256=C43mTr2Qjccj4sXuTDgzbfZhvCNbsEHNggMRXQ7SrWs,25108
@@ -121,18 +118,19 @@ sglang/srt/models/mixtral.py,sha256=E3d8I7V3Dp1nCEHRbhh-PKBG8UaVK5XOHwl9QyIjcX0,
121
118
  sglang/srt/models/mixtral_quant.py,sha256=o-oTG8BGtWuNu-o6muHSarMNBQwrjQowyBFOQhuclZ8,14065
122
119
  sglang/srt/models/mllama.py,sha256=pET1x8wY04yoS8HMCncKx0tFPqGp78K8rlA7Eq7XioE,37889
123
120
  sglang/srt/models/olmo.py,sha256=DEUPNDM0z83N-Qdhkj2WJMtbiz5JNbSBMIjUaYZN9RM,12068
121
+ sglang/srt/models/olmo2.py,sha256=NriLbVKNGSR9bs0V8feeEorkRSr9BjlYbv50AReo2s4,13469
124
122
  sglang/srt/models/olmoe.py,sha256=jVKrjqQQrWLdlkGSGUaMPdT9PHzNH4X-RVwON29eaGw,15412
125
123
  sglang/srt/models/phi3_small.py,sha256=fxqGU0xphJzTeuBW38SRRYpRb2rcsg53JxuObK0pZig,15141
126
124
  sglang/srt/models/qwen.py,sha256=P9zcFnz_Tsz73tVtLRwZ8uWzCtMxWOrzlv2o9Ys_Gck,9947
127
125
  sglang/srt/models/qwen2.py,sha256=ApFFASNwvrkDXi-KkCNA7fTk4uLMuJWoMg15zCaAKdA,12514
128
126
  sglang/srt/models/qwen2_moe.py,sha256=1oxDsKDq3jlHKx9jMi1SfHOqCRVyN5n76uw3M-CUODE,17048
129
- sglang/srt/models/qwen2_vl.py,sha256=G3FNa_N2-CzB56LVrukwBtJazxMrDC_GPNjK6Wqxc4s,26415
127
+ sglang/srt/models/qwen2_vl.py,sha256=wFKBq52nZ5Q1sloDNh9YcYIoJ-4QpGVA15StxRMBuYE,26785
130
128
  sglang/srt/models/stablelm.py,sha256=jpmsyWMJo_9JapOESnuV7ObNCh78BRznXY0iFvvIbZE,11354
131
129
  sglang/srt/models/torch_native_llama.py,sha256=vNQxsnbVAY1bdyMCCWDZAtWdbaFIiJXhmVxHjk5BB9Y,19400
132
130
  sglang/srt/models/xverse.py,sha256=LGe0ma0wOir3x-OLBT_cRocw8JEo9d3AYNxgA2OcLrk,13659
133
131
  sglang/srt/models/xverse_moe.py,sha256=YqbzkSsnTFt-8-aI8YobF9qJA70qrBjbS1Kjn1KNqVY,15766
134
132
  sglang/srt/models/yivl.py,sha256=yj4aWsOBVGQBLurSrLmYXVC7zGIPH7EYHHtAaAZ7Liw,4859
135
- sglang/srt/openai_api/adapter.py,sha256=MhOcWZjcLv4_OuvLvDMcAu6K_u2joJvhaZxaKm0hi3M,53634
133
+ sglang/srt/openai_api/adapter.py,sha256=Rit_XJ4h-O1-_fwivIkcGHt1hLLz3Y3XdTtEtDTnBcU,53634
136
134
  sglang/srt/openai_api/protocol.py,sha256=vBgrbTqtECsZ5dG0rgP1FHsTBt4eR9zbDX3FBIN-rz4,10172
137
135
  sglang/srt/sampling/sampling_batch_info.py,sha256=YC-KPyDWyLGNPL4YVcst4xwP8Wlz2zcCNJHB_5zljXQ,8470
138
136
  sglang/srt/sampling/sampling_params.py,sha256=n7RbBg_bS5fYhsiWa8uJYnfoXy_i5DvtTBOkuFnHDNU,5286
@@ -155,10 +153,10 @@ sglang/test/simple_eval_mmlu.py,sha256=FkwamjGMjueTixymkedF-YiPloSLiy4ftILFUrKZ9
155
153
  sglang/test/test_activation.py,sha256=jkdNRzJnbd5OgZliQaIXpxovlcky17UrweomcOcMxoE,1442
156
154
  sglang/test/test_layernorm.py,sha256=IacByD5d-stXjzBz8Ypamc7povlcedpKPbb_4JLgo3c,3720
157
155
  sglang/test/test_programs.py,sha256=1Z0umrsUu9pagzyGH5SrXl_qhKSyTfUv_kWC2mcn0qo,18208
158
- sglang/test/test_utils.py,sha256=NBEGQC_wtMqODQQZWrxdwmsoLFSZfDlQzIbsQ1kE_Yc,23468
156
+ sglang/test/test_utils.py,sha256=mPRTn1ORMiJODa_wWpH8QQG-IuZuQYZp9nGjyIcIHHU,23645
159
157
  sglang/test/srt/sampling/penaltylib/utils.py,sha256=CjxHgywh0hx_87iynzQt_ztHu6zBVuE-YrZ-XPmW6U4,12906
160
- sglang-0.3.6.post2.dist-info/LICENSE,sha256=FJXh51fvTQklojUFY89XVLsjxRcBqOxPs8XNy-2uZ0c,11346
161
- sglang-0.3.6.post2.dist-info/METADATA,sha256=3ekB4UX6bNwXzqlRChfxG0R8sme-x0FQAImcw0gpfM8,22122
162
- sglang-0.3.6.post2.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
163
- sglang-0.3.6.post2.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
164
- sglang-0.3.6.post2.dist-info/RECORD,,
158
+ sglang-0.3.6.post3.dist-info/LICENSE,sha256=FJXh51fvTQklojUFY89XVLsjxRcBqOxPs8XNy-2uZ0c,11346
159
+ sglang-0.3.6.post3.dist-info/METADATA,sha256=2EeNTAznmmP399LIX3w0evy2Q2x6IqOSMoRKXJZLrSM,22171
160
+ sglang-0.3.6.post3.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
161
+ sglang-0.3.6.post3.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
162
+ sglang-0.3.6.post3.dist-info/RECORD,,
@@ -1 +0,0 @@
1
- from sglang.srt.layers.fused_moe_grok.layer import FusedMoE, FusedMoEMethodBase