sglang 0.4.6.post2__py3-none-any.whl → 0.4.6.post4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150)
  1. sglang/bench_offline_throughput.py +4 -2
  2. sglang/bench_one_batch.py +3 -13
  3. sglang/bench_one_batch_server.py +143 -15
  4. sglang/bench_serving.py +158 -8
  5. sglang/compile_deep_gemm.py +1 -1
  6. sglang/eval/loogle_eval.py +157 -0
  7. sglang/lang/chat_template.py +119 -75
  8. sglang/lang/tracer.py +1 -1
  9. sglang/srt/code_completion_parser.py +1 -1
  10. sglang/srt/configs/deepseekvl2.py +5 -2
  11. sglang/srt/configs/device_config.py +1 -1
  12. sglang/srt/configs/internvl.py +696 -0
  13. sglang/srt/configs/janus_pro.py +3 -0
  14. sglang/srt/configs/model_config.py +18 -0
  15. sglang/srt/constrained/base_grammar_backend.py +55 -72
  16. sglang/srt/constrained/llguidance_backend.py +25 -21
  17. sglang/srt/constrained/outlines_backend.py +27 -26
  18. sglang/srt/constrained/reasoner_grammar_backend.py +22 -33
  19. sglang/srt/constrained/xgrammar_backend.py +71 -53
  20. sglang/srt/conversation.py +78 -46
  21. sglang/srt/disaggregation/base/conn.py +1 -0
  22. sglang/srt/disaggregation/decode.py +11 -3
  23. sglang/srt/disaggregation/fake/conn.py +1 -1
  24. sglang/srt/disaggregation/mini_lb.py +74 -23
  25. sglang/srt/disaggregation/mooncake/conn.py +236 -138
  26. sglang/srt/disaggregation/nixl/conn.py +242 -71
  27. sglang/srt/disaggregation/prefill.py +7 -4
  28. sglang/srt/disaggregation/utils.py +51 -2
  29. sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -8
  30. sglang/srt/distributed/device_communicators/npu_communicator.py +39 -0
  31. sglang/srt/distributed/device_communicators/pynccl.py +2 -1
  32. sglang/srt/distributed/device_communicators/shm_broadcast.py +2 -1
  33. sglang/srt/distributed/parallel_state.py +22 -1
  34. sglang/srt/entrypoints/engine.py +31 -4
  35. sglang/srt/entrypoints/http_server.py +45 -3
  36. sglang/srt/entrypoints/verl_engine.py +3 -2
  37. sglang/srt/function_call_parser.py +2 -2
  38. sglang/srt/hf_transformers_utils.py +20 -1
  39. sglang/srt/layers/attention/flashattention_backend.py +147 -51
  40. sglang/srt/layers/attention/flashinfer_backend.py +23 -13
  41. sglang/srt/layers/attention/flashinfer_mla_backend.py +62 -15
  42. sglang/srt/layers/attention/merge_state.py +46 -0
  43. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +1 -1
  44. sglang/srt/layers/attention/triton_ops/merge_state.py +96 -0
  45. sglang/srt/layers/attention/utils.py +4 -2
  46. sglang/srt/layers/attention/vision.py +290 -163
  47. sglang/srt/layers/dp_attention.py +71 -21
  48. sglang/srt/layers/layernorm.py +1 -1
  49. sglang/srt/layers/logits_processor.py +46 -11
  50. sglang/srt/layers/moe/ep_moe/kernels.py +343 -8
  51. sglang/srt/layers/moe/ep_moe/layer.py +121 -2
  52. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +97 -54
  53. sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  54. sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  55. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  56. sglang/srt/layers/moe/topk.py +1 -1
  57. sglang/srt/layers/quantization/__init__.py +1 -1
  58. sglang/srt/layers/quantization/blockwise_int8.py +2 -2
  59. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2 -4
  60. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +2 -1
  61. sglang/srt/layers/quantization/deep_gemm.py +77 -71
  62. sglang/srt/layers/quantization/fp8.py +110 -97
  63. sglang/srt/layers/quantization/fp8_kernel.py +81 -62
  64. sglang/srt/layers/quantization/fp8_utils.py +71 -23
  65. sglang/srt/layers/quantization/int8_kernel.py +2 -2
  66. sglang/srt/layers/quantization/kv_cache.py +3 -10
  67. sglang/srt/layers/quantization/utils.py +0 -5
  68. sglang/srt/layers/quantization/w8a8_fp8.py +8 -10
  69. sglang/srt/layers/sampler.py +0 -4
  70. sglang/srt/layers/vocab_parallel_embedding.py +18 -7
  71. sglang/srt/lora/lora_manager.py +11 -14
  72. sglang/srt/lora/mem_pool.py +4 -4
  73. sglang/srt/lora/triton_ops/gate_up_lora_b.py +1 -1
  74. sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
  75. sglang/srt/lora/triton_ops/sgemm_lora_a.py +1 -1
  76. sglang/srt/lora/triton_ops/sgemm_lora_b.py +1 -1
  77. sglang/srt/lora/utils.py +1 -1
  78. sglang/srt/managers/cache_controller.py +115 -119
  79. sglang/srt/managers/data_parallel_controller.py +3 -3
  80. sglang/srt/managers/detokenizer_manager.py +21 -8
  81. sglang/srt/managers/io_struct.py +13 -1
  82. sglang/srt/managers/mm_utils.py +1 -1
  83. sglang/srt/managers/multimodal_processors/base_processor.py +5 -0
  84. sglang/srt/managers/multimodal_processors/internvl.py +232 -0
  85. sglang/srt/managers/multimodal_processors/llava.py +46 -0
  86. sglang/srt/managers/multimodal_processors/pixtral.py +127 -0
  87. sglang/srt/managers/schedule_batch.py +93 -23
  88. sglang/srt/managers/schedule_policy.py +11 -8
  89. sglang/srt/managers/scheduler.py +140 -100
  90. sglang/srt/managers/scheduler_output_processor_mixin.py +124 -55
  91. sglang/srt/managers/tokenizer_manager.py +157 -47
  92. sglang/srt/managers/tp_worker.py +21 -21
  93. sglang/srt/managers/tp_worker_overlap_thread.py +22 -11
  94. sglang/srt/mem_cache/chunk_cache.py +2 -0
  95. sglang/srt/mem_cache/memory_pool.py +4 -2
  96. sglang/srt/metrics/collector.py +312 -37
  97. sglang/srt/model_executor/cuda_graph_runner.py +10 -11
  98. sglang/srt/model_executor/forward_batch_info.py +1 -1
  99. sglang/srt/model_executor/model_runner.py +57 -41
  100. sglang/srt/model_loader/loader.py +18 -11
  101. sglang/srt/models/clip.py +4 -4
  102. sglang/srt/models/deepseek_janus_pro.py +3 -3
  103. sglang/srt/models/deepseek_nextn.py +1 -20
  104. sglang/srt/models/deepseek_v2.py +77 -39
  105. sglang/srt/models/gemma3_mm.py +1 -1
  106. sglang/srt/models/internlm2.py +3 -0
  107. sglang/srt/models/internvl.py +670 -0
  108. sglang/srt/models/llama.py +3 -1
  109. sglang/srt/models/llama4.py +58 -13
  110. sglang/srt/models/llava.py +248 -5
  111. sglang/srt/models/minicpmv.py +1 -1
  112. sglang/srt/models/mixtral.py +98 -34
  113. sglang/srt/models/mllama.py +1 -1
  114. sglang/srt/models/phi3_small.py +16 -2
  115. sglang/srt/models/pixtral.py +467 -0
  116. sglang/srt/models/qwen2_5_vl.py +8 -4
  117. sglang/srt/models/qwen2_vl.py +4 -4
  118. sglang/srt/models/roberta.py +1 -1
  119. sglang/srt/models/torch_native_llama.py +1 -1
  120. sglang/srt/models/xiaomi_mimo.py +171 -0
  121. sglang/srt/openai_api/adapter.py +52 -42
  122. sglang/srt/openai_api/protocol.py +20 -16
  123. sglang/srt/reasoning_parser.py +1 -1
  124. sglang/srt/sampling/custom_logit_processor.py +18 -3
  125. sglang/srt/sampling/sampling_batch_info.py +2 -2
  126. sglang/srt/sampling/sampling_params.py +2 -0
  127. sglang/srt/server_args.py +64 -10
  128. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -3
  129. sglang/srt/speculative/eagle_utils.py +7 -7
  130. sglang/srt/speculative/eagle_worker.py +22 -19
  131. sglang/srt/utils.py +41 -6
  132. sglang/test/few_shot_gsm8k.py +2 -2
  133. sglang/test/few_shot_gsm8k_engine.py +2 -2
  134. sglang/test/run_eval.py +2 -2
  135. sglang/test/runners.py +8 -1
  136. sglang/test/send_one.py +13 -3
  137. sglang/test/simple_eval_common.py +1 -1
  138. sglang/test/simple_eval_humaneval.py +1 -1
  139. sglang/test/test_block_fp8.py +2 -2
  140. sglang/test/test_deepep_utils.py +219 -0
  141. sglang/test/test_programs.py +5 -5
  142. sglang/test/test_utils.py +92 -15
  143. sglang/utils.py +1 -1
  144. sglang/version.py +1 -1
  145. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/METADATA +18 -9
  146. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/RECORD +150 -137
  147. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/WHEEL +1 -1
  148. /sglang/{llama3_eval.py → eval/llama3_eval.py} +0 -0
  149. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/licenses/LICENSE +0 -0
  150. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/top_level.txt +0 -0
@@ -370,7 +370,7 @@ def test_dtype_gen():
370
370
  @sgl.function
371
371
  def dtype_gen(s):
372
372
  s += "Q: What is the full name of DNS?\n"
373
- s += "A: The full nams is " + sgl.gen("str_res", dtype=str, stop="\n") + "\n"
373
+ s += "A: The full names is " + sgl.gen("str_res", dtype=str, stop="\n") + "\n"
374
374
  s += "Q: Which year was DNS invented?\n"
375
375
  s += "A: " + sgl.gen("int_res", dtype=int) + "\n"
376
376
  s += "Q: What is the value of pi?\n"
@@ -503,7 +503,7 @@ def test_hellaswag_select():
503
503
  #####################################
504
504
 
505
505
  # Run requests
506
- tic = time.time()
506
+ tic = time.perf_counter()
507
507
  rets = few_shot_hellaswag.run_batch(
508
508
  arguments,
509
509
  temperature=0,
@@ -514,13 +514,13 @@ def test_hellaswag_select():
514
514
  preds = []
515
515
  for i, ret in enumerate(rets):
516
516
  preds.append(choices[i].index(ret["answer"]))
517
- latency = time.time() - tic
517
+ latency = time.perf_counter() - tic
518
518
 
519
519
  # Compute accuracy
520
520
  accuracy = np.mean(np.array(preds) == np.array(labels))
521
521
 
522
522
  # Test generator style of run_batch
523
- tic = time.time()
523
+ tic = time.perf_counter()
524
524
  rets = few_shot_hellaswag.run_batch(
525
525
  arguments,
526
526
  temperature=0,
@@ -531,7 +531,7 @@ def test_hellaswag_select():
531
531
  preds_gen = []
532
532
  for i, ret in enumerate(rets):
533
533
  preds_gen.append(choices[i].index(ret["answer"]))
534
- latency_gen = time.time() - tic
534
+ latency_gen = time.perf_counter() - tic
535
535
 
536
536
  # Compute accuracy
537
537
  accuracy_gen = np.mean(np.array(preds_gen) == np.array(labels))
sglang/test/test_utils.py CHANGED
@@ -66,6 +66,7 @@ DEFAULT_MODEL_NAME_FOR_TEST_LOCAL_ATTENTION = (
66
66
  )
67
67
  DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
68
68
  DEFAULT_REASONING_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
69
+ DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-V3-0324"
69
70
  DEFAULT_AWQ_MOE_MODEL_NAME_FOR_TEST = (
70
71
  "hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
71
72
  )
@@ -78,7 +79,8 @@ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Ins
78
79
  DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
79
80
  DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4,hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
80
81
  DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN = "Qwen/Qwen2.5-1.5B-Instruct"
81
- DEFAULT_SMALL_VLM_MODEL_NAME = "Qwen/Qwen2-VL-2B"
82
+ DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST = "Qwen/Qwen2.5-VL-3B-Instruct"
83
+ DEFAULT_VLM_CHAT_TEMPLATE_FOR_TEST = "qwen2-vl"
82
84
 
83
85
  DEFAULT_IMAGE_URL = "https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true"
84
86
  DEFAULT_VIDEO_URL = "https://raw.githubusercontent.com/EvolvingLMMs-Lab/sglang/dev/onevision_local/assets/jobs.mp4"
@@ -393,12 +395,12 @@ def popen_launch_server(
393
395
  other_args: list[str] = (),
394
396
  env: Optional[dict] = None,
395
397
  return_stdout_stderr: Optional[tuple] = None,
396
- pd_seperated: bool = False,
398
+ pd_separated: bool = False,
397
399
  ):
398
400
  _, host, port = base_url.split(":")
399
401
  host = host[2:]
400
402
 
401
- if pd_seperated:
403
+ if pd_separated:
402
404
  command = "sglang.launch_pd_server"
403
405
  else:
404
406
  command = "sglang.launch_server"
@@ -412,7 +414,7 @@ def popen_launch_server(
412
414
  *[str(x) for x in other_args],
413
415
  ]
414
416
 
415
- if pd_seperated:
417
+ if pd_separated:
416
418
  command.extend(
417
419
  [
418
420
  "--lb-host",
@@ -447,9 +449,9 @@ def popen_launch_server(
447
449
  else:
448
450
  process = subprocess.Popen(command, stdout=None, stderr=None, env=env)
449
451
 
450
- start_time = time.time()
452
+ start_time = time.perf_counter()
451
453
  with requests.Session() as session:
452
- while time.time() - start_time < timeout:
454
+ while time.perf_counter() - start_time < timeout:
453
455
  try:
454
456
  headers = {
455
457
  "Content-Type": "application/json; charset=utf-8",
@@ -476,6 +478,81 @@ def popen_launch_server(
476
478
  raise TimeoutError("Server failed to start within the timeout period.")
477
479
 
478
480
 
481
+ def popen_launch_pd_server(
482
+ model: str,
483
+ base_url: str,
484
+ timeout: float,
485
+ api_key: Optional[str] = None,
486
+ other_args: list[str] = (),
487
+ env: Optional[dict] = None,
488
+ return_stdout_stderr: Optional[tuple] = None,
489
+ ):
490
+ _, host, port = base_url.split(":")
491
+ host = host[2:]
492
+
493
+ command = "sglang.launch_server"
494
+
495
+ command = [
496
+ "python3",
497
+ "-m",
498
+ command,
499
+ "--model-path",
500
+ model,
501
+ *[str(x) for x in other_args],
502
+ ]
503
+
504
+ command.extend(
505
+ [
506
+ "--host",
507
+ host,
508
+ "--port",
509
+ port,
510
+ ]
511
+ )
512
+
513
+ if api_key:
514
+ command += ["--api-key", api_key]
515
+
516
+ print(f"command={' '.join(command)}")
517
+
518
+ if return_stdout_stderr:
519
+ process = subprocess.Popen(
520
+ command,
521
+ stdout=return_stdout_stderr[0],
522
+ stderr=return_stdout_stderr[1],
523
+ env=env,
524
+ text=True,
525
+ )
526
+ else:
527
+ process = subprocess.Popen(command, stdout=None, stderr=None, env=env)
528
+
529
+ start_time = time.time()
530
+ with requests.Session() as session:
531
+ while time.time() - start_time < timeout:
532
+ try:
533
+ headers = {
534
+ "Content-Type": "application/json; charset=utf-8",
535
+ "Authorization": f"Bearer {api_key}",
536
+ }
537
+ response = session.get(
538
+ f"{base_url}/health",
539
+ headers=headers,
540
+ )
541
+ if response.status_code == 200:
542
+ return process
543
+ except requests.RequestException:
544
+ pass
545
+
546
+ return_code = process.poll()
547
+ if return_code is not None:
548
+ raise Exception(f"Server unexpectedly exits ({return_code=}).")
549
+
550
+ time.sleep(10)
551
+
552
+ kill_process_tree(process.pid)
553
+ raise TimeoutError("Server failed to start within the timeout period.")
554
+
555
+
479
556
  def run_with_timeout(
480
557
  func: Callable,
481
558
  args: tuple = (),
@@ -507,7 +584,7 @@ class TestFile:
507
584
 
508
585
 
509
586
  def run_unittest_files(files: List[TestFile], timeout_per_file: float):
510
- tic = time.time()
587
+ tic = time.perf_counter()
511
588
  success = True
512
589
 
513
590
  for i, file in enumerate(files):
@@ -522,13 +599,13 @@ def run_unittest_files(files: List[TestFile], timeout_per_file: float):
522
599
  f".\n.\nBegin ({i}/{len(files) - 1}):\npython3 {filename}\n.\n.\n",
523
600
  flush=True,
524
601
  )
525
- tic = time.time()
602
+ tic = time.perf_counter()
526
603
 
527
604
  process = subprocess.Popen(
528
605
  ["python3", filename], stdout=None, stderr=None, env=os.environ
529
606
  )
530
607
  process.wait()
531
- elapsed = time.time() - tic
608
+ elapsed = time.perf_counter() - tic
532
609
 
533
610
  print(
534
611
  f".\n.\nEnd ({i}/{len(files) - 1}):\n{filename=}, {elapsed=:.0f}, {estimated_time=}\n.\n.\n",
@@ -554,9 +631,9 @@ def run_unittest_files(files: List[TestFile], timeout_per_file: float):
554
631
  break
555
632
 
556
633
  if success:
557
- print(f"Success. Time elapsed: {time.time() - tic:.2f}s", flush=True)
634
+ print(f"Success. Time elapsed: {time.perf_counter() - tic:.2f}s", flush=True)
558
635
  else:
559
- print(f"Fail. Time elapsed: {time.time() - tic:.2f}s", flush=True)
636
+ print(f"Fail. Time elapsed: {time.perf_counter() - tic:.2f}s", flush=True)
560
637
 
561
638
  return 0 if success else -1
562
639
 
@@ -579,7 +656,7 @@ def get_benchmark_args(
579
656
  disable_stream=False,
580
657
  disable_ignore_eos=False,
581
658
  seed: int = 0,
582
- pd_seperated: bool = False,
659
+ pd_separated: bool = False,
583
660
  ):
584
661
  return SimpleNamespace(
585
662
  backend="sglang",
@@ -609,7 +686,7 @@ def get_benchmark_args(
609
686
  profile=None,
610
687
  lora_name=None,
611
688
  prompt_suffix="",
612
- pd_seperated=pd_seperated,
689
+ pd_separated=pd_separated,
613
690
  )
614
691
 
615
692
 
@@ -673,7 +750,7 @@ def run_bench_serving_multi(
673
750
  other_server_args,
674
751
  benchmark_args,
675
752
  need_warmup=False,
676
- pd_seperated=False,
753
+ pd_separated=False,
677
754
  ):
678
755
  # Launch the server
679
756
  process = popen_launch_server(
@@ -681,7 +758,7 @@ def run_bench_serving_multi(
681
758
  base_url,
682
759
  timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
683
760
  other_args=other_server_args,
684
- pd_seperated=pd_seperated,
761
+ pd_separated=pd_separated,
685
762
  )
686
763
 
687
764
  # run benchmark for all
sglang/utils.py CHANGED
@@ -278,7 +278,7 @@ def graceful_registry(sub_module_name: str):
278
278
  f"{sub_module_name} Received signal to shutdown. Performing graceful shutdown..."
279
279
  )
280
280
  if signum == signal.SIGTERM:
281
- logger.info(f"{sub_module_name} recive sigterm")
281
+ logger.info(f"{sub_module_name} receive sigterm")
282
282
 
283
283
  signal.signal(signal.SIGTERM, graceful_shutdown)
284
284
 
sglang/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.4.6.post2"
1
+ __version__ = "0.4.6.post4"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sglang
3
- Version: 0.4.6.post2
3
+ Version: 0.4.6.post4
4
4
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -230,6 +230,7 @@ Requires-Dist: modelscope; extra == "runtime-common"
230
230
  Requires-Dist: ninja; extra == "runtime-common"
231
231
  Requires-Dist: orjson; extra == "runtime-common"
232
232
  Requires-Dist: packaging; extra == "runtime-common"
233
+ Requires-Dist: partial_json_parser; extra == "runtime-common"
233
234
  Requires-Dist: pillow; extra == "runtime-common"
234
235
  Requires-Dist: prometheus-client>=0.20.0; extra == "runtime-common"
235
236
  Requires-Dist: psutil; extra == "runtime-common"
@@ -242,17 +243,16 @@ Requires-Dist: torchao>=0.9.0; extra == "runtime-common"
242
243
  Requires-Dist: transformers==4.51.1; extra == "runtime-common"
243
244
  Requires-Dist: uvicorn; extra == "runtime-common"
244
245
  Requires-Dist: uvloop; extra == "runtime-common"
245
- Requires-Dist: xgrammar==0.1.17; extra == "runtime-common"
246
+ Requires-Dist: xgrammar==0.1.19; extra == "runtime-common"
246
247
  Requires-Dist: blobfile==3.0.0; extra == "runtime-common"
247
248
  Provides-Extra: srt
248
249
  Requires-Dist: sglang[runtime_common]; extra == "srt"
249
- Requires-Dist: sgl-kernel==0.1.1; extra == "srt"
250
+ Requires-Dist: sgl-kernel==0.1.2.post1; extra == "srt"
250
251
  Requires-Dist: flashinfer_python==0.2.5; extra == "srt"
251
252
  Requires-Dist: torch==2.6.0; extra == "srt"
252
253
  Requires-Dist: torchvision==0.21.0; extra == "srt"
253
254
  Requires-Dist: cuda-python; extra == "srt"
254
255
  Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt"
255
- Requires-Dist: partial_json_parser; extra == "srt"
256
256
  Requires-Dist: einops; extra == "srt"
257
257
  Provides-Extra: blackwell
258
258
  Requires-Dist: sglang[runtime_common]; extra == "blackwell"
@@ -261,7 +261,6 @@ Requires-Dist: torch; extra == "blackwell"
261
261
  Requires-Dist: torchvision; extra == "blackwell"
262
262
  Requires-Dist: cuda-python; extra == "blackwell"
263
263
  Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "blackwell"
264
- Requires-Dist: partial_json_parser; extra == "blackwell"
265
264
  Requires-Dist: einops; extra == "blackwell"
266
265
  Provides-Extra: srt-hip
267
266
  Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
@@ -278,6 +277,9 @@ Provides-Extra: srt-cpu
278
277
  Requires-Dist: sglang[runtime_common]; extra == "srt-cpu"
279
278
  Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt-cpu"
280
279
  Requires-Dist: torch; extra == "srt-cpu"
280
+ Provides-Extra: srt-npu
281
+ Requires-Dist: sglang[runtime_common]; extra == "srt-npu"
282
+ Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt-npu"
281
283
  Provides-Extra: openai
282
284
  Requires-Dist: openai>=1.0; extra == "openai"
283
285
  Requires-Dist: tiktoken; extra == "openai"
@@ -299,6 +301,7 @@ Requires-Dist: sglang[srt]; extra == "all"
299
301
  Requires-Dist: sglang[openai]; extra == "all"
300
302
  Requires-Dist: sglang[anthropic]; extra == "all"
301
303
  Requires-Dist: sglang[litellm]; extra == "all"
304
+ Requires-Dist: sglang[torch_memory_saver]; extra == "all"
302
305
  Provides-Extra: all-hip
303
306
  Requires-Dist: sglang[srt_hip]; extra == "all-hip"
304
307
  Requires-Dist: sglang[openai]; extra == "all-hip"
@@ -319,6 +322,11 @@ Requires-Dist: sglang[srt_cpu]; extra == "all-cpu"
319
322
  Requires-Dist: sglang[openai]; extra == "all-cpu"
320
323
  Requires-Dist: sglang[anthropic]; extra == "all-cpu"
321
324
  Requires-Dist: sglang[litellm]; extra == "all-cpu"
325
+ Provides-Extra: all-npu
326
+ Requires-Dist: sglang[srt_npu]; extra == "all-npu"
327
+ Requires-Dist: sglang[openai]; extra == "all-npu"
328
+ Requires-Dist: sglang[anthropic]; extra == "all-npu"
329
+ Requires-Dist: sglang[litellm]; extra == "all-npu"
322
330
  Provides-Extra: dev
323
331
  Requires-Dist: sglang[all]; extra == "dev"
324
332
  Requires-Dist: sglang[test]; extra == "dev"
@@ -358,18 +366,19 @@ Dynamic: license-file
358
366
  | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
359
367
 
360
368
  ## News
369
+ - [2025/05] 🔥 Deploying DeepSeek with PD Disaggregation and Large-scale Expert Parallelism on 96 H100 GPUs ([blog](https://lmsys.org/blog/2025-05-05-large-scale-ep/)).
361
370
  - [2025/03] Supercharge DeepSeek-R1 Inference on AMD Instinct MI300X ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1-Part2/README.html))
362
371
  - [2025/03] SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine ([PyTorch blog](https://pytorch.org/blog/sglang-joins-pytorch/))
363
- - [2025/02] Unlock DeepSeek-R1 Inference Performance on AMD Instinct™ MI300X GPU ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1_Perf/README.html))
364
372
  - [2025/01] 🔥 SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeepSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html), [10+ other companies](https://x.com/lmsysorg/status/1887262321636221412))
365
373
  - [2024/12] 🔥 v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
366
- - [2024/09] v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
367
374
  - [2024/07] v0.2 Release: Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
368
375
 
369
376
  <details>
370
377
  <summary>More</summary>
371
378
 
379
+ - [2025/02] Unlock DeepSeek-R1 Inference Performance on AMD Instinct™ MI300X GPU ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1_Perf/README.html))
372
380
  - [2024/10] The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
381
+ - [2024/09] v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
373
382
  - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
374
383
  - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
375
384
  - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
@@ -383,7 +392,7 @@ The core features include:
383
392
 
384
393
  - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, continuous batching, token attention (paged attention), speculative decoding, tensor parallelism, chunked prefill, structured outputs, quantization (FP8/INT4/AWQ/GPTQ), and multi-lora batching.
385
394
  - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
386
- - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
395
+ - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, Qwen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
387
396
  - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
388
397
 
389
398
  ## Getting Started
@@ -401,7 +410,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
401
410
 
402
411
  ## Adoption and Sponsorship
403
412
  The project has been deployed to large-scale production, generating trillions of tokens every day.
404
- It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Iflytek, Jam & Tea Studios, LinkedIn, LMSYS, Meituan, Nebius, Novita AI, NVIDIA, Oracle, RunPod, Stanford, UC Berkeley, UCLA, xAI, and 01.AI.
413
+ It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Google Cloud, Hyperbolic, Iflytek, InnoMatrix, Jam & Tea Studios, LinkedIn, LMSYS, Meituan, Nebius, Novita AI, NVIDIA, Oracle, RunPod, Stanford, UC Berkeley, UCLA, xAI, and 01.AI.
405
414
 
406
415
  <img src="https://raw.githubusercontent.com/sgl-project/sgl-learning-materials/main/slides/adoption.png" alt="logo" width="800" margin="10px"></img>
407
416