sglang 0.4.4__py3-none-any.whl → 0.4.4.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (176)
  1. sglang/__init__.py +2 -0
  2. sglang/api.py +6 -0
  3. sglang/bench_one_batch.py +1 -1
  4. sglang/bench_one_batch_server.py +1 -1
  5. sglang/bench_serving.py +3 -1
  6. sglang/check_env.py +3 -4
  7. sglang/lang/backend/openai.py +18 -5
  8. sglang/lang/chat_template.py +28 -7
  9. sglang/lang/interpreter.py +7 -3
  10. sglang/lang/ir.py +10 -0
  11. sglang/srt/_custom_ops.py +1 -1
  12. sglang/srt/code_completion_parser.py +174 -0
  13. sglang/srt/configs/__init__.py +2 -6
  14. sglang/srt/configs/deepseekvl2.py +667 -0
  15. sglang/srt/configs/janus_pro.py +3 -4
  16. sglang/srt/configs/load_config.py +1 -0
  17. sglang/srt/configs/model_config.py +63 -11
  18. sglang/srt/configs/utils.py +25 -0
  19. sglang/srt/connector/__init__.py +51 -0
  20. sglang/srt/connector/base_connector.py +112 -0
  21. sglang/srt/connector/redis.py +85 -0
  22. sglang/srt/connector/s3.py +122 -0
  23. sglang/srt/connector/serde/__init__.py +31 -0
  24. sglang/srt/connector/serde/safe_serde.py +29 -0
  25. sglang/srt/connector/serde/serde.py +43 -0
  26. sglang/srt/connector/utils.py +35 -0
  27. sglang/srt/conversation.py +88 -0
  28. sglang/srt/disaggregation/conn.py +81 -0
  29. sglang/srt/disaggregation/decode.py +495 -0
  30. sglang/srt/disaggregation/mini_lb.py +285 -0
  31. sglang/srt/disaggregation/prefill.py +249 -0
  32. sglang/srt/disaggregation/utils.py +44 -0
  33. sglang/srt/distributed/parallel_state.py +10 -3
  34. sglang/srt/entrypoints/engine.py +55 -5
  35. sglang/srt/entrypoints/http_server.py +71 -12
  36. sglang/srt/function_call_parser.py +164 -54
  37. sglang/srt/hf_transformers_utils.py +28 -3
  38. sglang/srt/layers/activation.py +4 -2
  39. sglang/srt/layers/attention/base_attn_backend.py +1 -1
  40. sglang/srt/layers/attention/flashattention_backend.py +295 -0
  41. sglang/srt/layers/attention/flashinfer_backend.py +1 -1
  42. sglang/srt/layers/attention/flashmla_backend.py +284 -0
  43. sglang/srt/layers/attention/triton_backend.py +171 -38
  44. sglang/srt/layers/attention/triton_ops/decode_attention.py +94 -31
  45. sglang/srt/layers/attention/triton_ops/extend_attention.py +14 -5
  46. sglang/srt/layers/attention/utils.py +53 -0
  47. sglang/srt/layers/attention/vision.py +9 -28
  48. sglang/srt/layers/dp_attention.py +62 -23
  49. sglang/srt/layers/elementwise.py +411 -0
  50. sglang/srt/layers/layernorm.py +24 -2
  51. sglang/srt/layers/linear.py +17 -5
  52. sglang/srt/layers/logits_processor.py +26 -7
  53. sglang/srt/layers/moe/ep_moe/kernels.py +110 -11
  54. sglang/srt/layers/moe/ep_moe/layer.py +273 -1
  55. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +416 -0
  56. sglang/srt/layers/moe/fused_moe_native.py +2 -1
  57. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json +146 -0
  58. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json +146 -0
  59. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  60. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  61. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +23 -32
  62. sglang/srt/layers/moe/fused_moe_triton/layer.py +1 -2
  63. sglang/srt/layers/moe/router.py +342 -0
  64. sglang/srt/layers/moe/topk.py +31 -18
  65. sglang/srt/layers/parameter.py +1 -1
  66. sglang/srt/layers/quantization/__init__.py +184 -126
  67. sglang/srt/layers/quantization/base_config.py +5 -0
  68. sglang/srt/layers/quantization/blockwise_int8.py +1 -1
  69. sglang/srt/layers/quantization/compressed_tensors/__init__.py +0 -0
  70. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +652 -0
  71. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +658 -0
  72. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +9 -0
  73. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +56 -0
  74. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +162 -0
  75. sglang/srt/layers/quantization/compressed_tensors/utils.py +218 -0
  76. sglang/srt/layers/quantization/fp8.py +76 -34
  77. sglang/srt/layers/quantization/fp8_kernel.py +24 -8
  78. sglang/srt/layers/quantization/fp8_utils.py +284 -28
  79. sglang/srt/layers/quantization/gptq.py +36 -9
  80. sglang/srt/layers/quantization/kv_cache.py +98 -0
  81. sglang/srt/layers/quantization/modelopt_quant.py +9 -7
  82. sglang/srt/layers/quantization/utils.py +153 -0
  83. sglang/srt/layers/quantization/w8a8_fp8.py +70 -19
  84. sglang/srt/layers/rotary_embedding.py +66 -87
  85. sglang/srt/layers/sampler.py +1 -1
  86. sglang/srt/lora/layers.py +68 -0
  87. sglang/srt/lora/lora.py +2 -22
  88. sglang/srt/lora/lora_manager.py +47 -23
  89. sglang/srt/lora/mem_pool.py +110 -51
  90. sglang/srt/lora/utils.py +12 -1
  91. sglang/srt/managers/cache_controller.py +4 -5
  92. sglang/srt/managers/data_parallel_controller.py +31 -9
  93. sglang/srt/managers/expert_distribution.py +81 -0
  94. sglang/srt/managers/io_struct.py +39 -3
  95. sglang/srt/managers/mm_utils.py +373 -0
  96. sglang/srt/managers/multimodal_processor.py +68 -0
  97. sglang/srt/managers/multimodal_processors/base_processor.py +275 -0
  98. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +119 -0
  99. sglang/srt/managers/multimodal_processors/gemma3.py +83 -0
  100. sglang/srt/managers/{image_processors → multimodal_processors}/janus_pro.py +20 -15
  101. sglang/srt/managers/{image_processors → multimodal_processors}/llava.py +10 -15
  102. sglang/srt/managers/multimodal_processors/minicpm.py +167 -0
  103. sglang/srt/managers/{image_processors → multimodal_processors}/mlama.py +7 -8
  104. sglang/srt/managers/{image_processors → multimodal_processors}/qwen_vl.py +28 -22
  105. sglang/srt/managers/schedule_batch.py +134 -31
  106. sglang/srt/managers/scheduler.py +325 -38
  107. sglang/srt/managers/scheduler_output_processor_mixin.py +4 -1
  108. sglang/srt/managers/session_controller.py +1 -1
  109. sglang/srt/managers/tokenizer_manager.py +59 -23
  110. sglang/srt/managers/tp_worker.py +1 -1
  111. sglang/srt/managers/tp_worker_overlap_thread.py +3 -3
  112. sglang/srt/managers/utils.py +6 -1
  113. sglang/srt/mem_cache/hiradix_cache.py +27 -8
  114. sglang/srt/mem_cache/memory_pool.py +258 -98
  115. sglang/srt/mem_cache/paged_allocator.py +2 -2
  116. sglang/srt/mem_cache/radix_cache.py +4 -4
  117. sglang/srt/model_executor/cuda_graph_runner.py +85 -28
  118. sglang/srt/model_executor/forward_batch_info.py +81 -15
  119. sglang/srt/model_executor/model_runner.py +70 -6
  120. sglang/srt/model_loader/loader.py +160 -2
  121. sglang/srt/model_loader/weight_utils.py +45 -0
  122. sglang/srt/models/deepseek_janus_pro.py +29 -86
  123. sglang/srt/models/deepseek_nextn.py +22 -10
  124. sglang/srt/models/deepseek_v2.py +326 -192
  125. sglang/srt/models/deepseek_vl2.py +358 -0
  126. sglang/srt/models/gemma3_causal.py +684 -0
  127. sglang/srt/models/gemma3_mm.py +462 -0
  128. sglang/srt/models/grok.py +374 -119
  129. sglang/srt/models/llama.py +47 -7
  130. sglang/srt/models/llama_eagle.py +1 -0
  131. sglang/srt/models/llama_eagle3.py +196 -0
  132. sglang/srt/models/llava.py +3 -3
  133. sglang/srt/models/llavavid.py +3 -3
  134. sglang/srt/models/minicpmo.py +1995 -0
  135. sglang/srt/models/minicpmv.py +62 -137
  136. sglang/srt/models/mllama.py +4 -4
  137. sglang/srt/models/phi3_small.py +1 -1
  138. sglang/srt/models/qwen2.py +3 -0
  139. sglang/srt/models/qwen2_5_vl.py +68 -146
  140. sglang/srt/models/qwen2_classification.py +75 -0
  141. sglang/srt/models/qwen2_moe.py +9 -1
  142. sglang/srt/models/qwen2_vl.py +25 -63
  143. sglang/srt/openai_api/adapter.py +145 -47
  144. sglang/srt/openai_api/protocol.py +23 -2
  145. sglang/srt/sampling/sampling_batch_info.py +1 -1
  146. sglang/srt/sampling/sampling_params.py +6 -6
  147. sglang/srt/server_args.py +104 -14
  148. sglang/srt/speculative/build_eagle_tree.py +7 -347
  149. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +41 -5
  150. sglang/srt/speculative/eagle_utils.py +208 -252
  151. sglang/srt/speculative/eagle_worker.py +139 -53
  152. sglang/srt/speculative/spec_info.py +6 -1
  153. sglang/srt/torch_memory_saver_adapter.py +22 -0
  154. sglang/srt/utils.py +182 -21
  155. sglang/test/__init__.py +0 -0
  156. sglang/test/attention/__init__.py +0 -0
  157. sglang/test/attention/test_flashattn_backend.py +312 -0
  158. sglang/test/runners.py +2 -0
  159. sglang/test/test_activation.py +2 -1
  160. sglang/test/test_block_fp8.py +5 -4
  161. sglang/test/test_block_fp8_ep.py +2 -1
  162. sglang/test/test_dynamic_grad_mode.py +58 -0
  163. sglang/test/test_layernorm.py +3 -2
  164. sglang/test/test_utils.py +55 -4
  165. sglang/utils.py +31 -0
  166. sglang/version.py +1 -1
  167. {sglang-0.4.4.dist-info → sglang-0.4.4.post2.dist-info}/METADATA +12 -8
  168. {sglang-0.4.4.dist-info → sglang-0.4.4.post2.dist-info}/RECORD +171 -125
  169. {sglang-0.4.4.dist-info → sglang-0.4.4.post2.dist-info}/WHEEL +1 -1
  170. sglang/srt/configs/qwen2_5_vl_config.py +0 -1006
  171. sglang/srt/managers/image_processor.py +0 -55
  172. sglang/srt/managers/image_processors/base_image_processor.py +0 -219
  173. sglang/srt/managers/image_processors/minicpmv.py +0 -86
  174. sglang/srt/managers/multi_modality_padding.py +0 -134
  175. {sglang-0.4.4.dist-info → sglang-0.4.4.post2.dist-info/licenses}/LICENSE +0 -0
  176. {sglang-0.4.4.dist-info → sglang-0.4.4.post2.dist-info}/top_level.txt +0 -0
sglang/test/test_block_fp8.py CHANGED
@@ -11,6 +11,7 @@ from sglang.srt.layers.quantization.fp8_kernel import (
     static_quant_fp8,
     w8a8_block_fp8_matmul,
 )
+from sglang.test.test_utils import CustomTestCase
 
 _is_cuda = torch.cuda.is_available() and torch.version.cuda
 
@@ -44,7 +45,7 @@ def native_per_token_group_quant_fp8(
     return x_q, x_s
 
 
-class TestPerTokenGroupQuantFP8(unittest.TestCase):
+class TestPerTokenGroupQuantFP8(CustomTestCase):
     DTYPES = [torch.half, torch.bfloat16, torch.float32]
     NUM_TOKENS = [7, 83, 2048]
     D = [512, 4096, 5120, 13824]
@@ -111,7 +112,7 @@ def native_static_quant_fp8(x, x_s, dtype=torch.float8_e4m3fn):
     return x_q, x_s
 
 
-class TestStaticQuantFP8(unittest.TestCase):
+class TestStaticQuantFP8(CustomTestCase):
     DTYPES = [torch.half, torch.bfloat16, torch.float32]
     NUM_TOKENS = [7, 83, 2048]
     D = [512, 4096, 5120, 13824]
@@ -210,7 +211,7 @@ def native_w8a8_block_fp8_matmul(A, B, As, Bs, block_size, output_dtype=torch.fl
     return C
 
 
-class TestW8A8BlockFP8Matmul(unittest.TestCase):
+class TestW8A8BlockFP8Matmul(CustomTestCase):
 
     if not _is_cuda:
         OUT_DTYPES = [torch.float32, torch.half, torch.bfloat16]
@@ -331,7 +332,7 @@ def torch_w8a8_block_fp8_moe(a, w1, w2, w1_s, w2_s, score, topk, block_shape):
     ).sum(dim=1)
 
 
-class TestW8A8BlockFP8FusedMoE(unittest.TestCase):
+class TestW8A8BlockFP8FusedMoE(CustomTestCase):
     DTYPES = [torch.float32, torch.half, torch.bfloat16]
     M = [1, 33, 64, 222, 1024 * 128]
     N = [128, 1024, 2048]
sglang/test/test_block_fp8_ep.py CHANGED
@@ -13,6 +13,7 @@ from sglang.srt.layers.moe.ep_moe.kernels import (
     silu_and_mul_triton_kernel,
 )
 from sglang.srt.layers.moe.topk import select_experts
+from sglang.test.test_utils import CustomTestCase
 
 
 # For test
@@ -232,7 +233,7 @@ def block_dequant(
     return x_dq_block
 
 
-class TestW8A8BlockFP8EPMoE(unittest.TestCase):
+class TestW8A8BlockFP8EPMoE(CustomTestCase):
     DTYPES = [torch.half, torch.bfloat16]
     M = [1, 222, 1024, 2048]
     N = [128, 1024, 2048]
sglang/test/test_dynamic_grad_mode.py ADDED
@@ -0,0 +1,58 @@
+import unittest
+
+import torch
+
+from sglang.srt.utils import DynamicGradMode
+from sglang.test.test_utils import CustomTestCase
+
+
+class TestDynamicGradMode(CustomTestCase):
+    def test_inference(self):
+        # Test inference_mode
+        DynamicGradMode.set_inference_mode(True)
+
+        @DynamicGradMode()
+        def create_tensor_x():
+            return torch.empty(0)
+
+        X = create_tensor_x()
+        self.assertTrue(not X.requires_grad and X.is_inference())
+
+    def test_no_grad(self):
+        # Test no_grad
+        DynamicGradMode.set_inference_mode(False)
+
+        @DynamicGradMode()
+        def create_tensor_y():
+            return torch.empty(0)
+
+        Y = create_tensor_y()
+        self.assertTrue(not Y.requires_grad and not Y.is_inference())
+
+    def test_nested_inference(self):
+        # Test inference_mode nested inside no_grad; inference_mode should take priority
+        DynamicGradMode.set_inference_mode(False)
+
+        @DynamicGradMode()
+        def create_tensor_z():
+            with torch.inference_mode():
+                return torch.empty(0)
+
+        Z = create_tensor_z()
+        self.assertTrue(not Z.requires_grad and Z.is_inference())
+
+    def test_nested_no_grad(self):
+        # Test no_grad nested inside inference_mode; inference_mode should take priority
+        DynamicGradMode.set_inference_mode(True)
+
+        @DynamicGradMode()
+        def create_tensor_w():
+            with torch.no_grad():
+                return torch.empty(0)
+
+        W = create_tensor_w()
+        self.assertTrue(not W.requires_grad and W.is_inference())
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)
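For readers unfamiliar with DynamicGradMode, here is a minimal sketch of the semantics the new tests exercise, assuming only what the tests themselves show (a process-wide flag set via set_inference_mode, instances used as decorators). It is an illustration, not the real implementation in sglang/srt/utils.py:

import torch

class DynamicGradModeSketch:
    # Sketch only (NOT sglang.srt.utils.DynamicGradMode): a decorator that
    # wraps the function in torch.inference_mode() or torch.no_grad()
    # depending on a process-wide flag, matching the assertions above.
    _use_inference_mode = True  # assumed default; the tests always set it explicitly

    @classmethod
    def set_inference_mode(cls, enabled: bool):
        cls._use_inference_mode = enabled

    def __call__(self, fn):
        def wrapper(*args, **kwargs):
            if type(self)._use_inference_mode:
                ctx = torch.inference_mode()
            else:
                ctx = torch.no_grad()
            with ctx:
                return fn(*args, **kwargs)

        return wrapper

The two nesting tests pass because torch.inference_mode() wins whenever it is active anywhere on the stack: tensors created under it are inference tensors even inside an enclosing or enclosed no_grad block.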
sglang/test/test_layernorm.py CHANGED
@@ -4,9 +4,10 @@ import unittest
 import torch
 
 from sglang.srt.layers.layernorm import GemmaRMSNorm, RMSNorm
+from sglang.test.test_utils import CustomTestCase
 
 
-class TestRMSNorm(unittest.TestCase):
+class TestRMSNorm(CustomTestCase):
     DTYPES = [torch.half, torch.bfloat16]
     NUM_TOKENS = [7, 83, 4096]
     HIDDEN_SIZES = [768, 769, 770, 771, 5120, 5124, 5125, 5126, 8192, 8199]
@@ -56,7 +57,7 @@ class TestRMSNorm(unittest.TestCase):
         self._run_rms_norm_test(*params)
 
 
-class TestGemmaRMSNorm(unittest.TestCase):
+class TestGemmaRMSNorm(CustomTestCase):
     DTYPES = [torch.half, torch.bfloat16]
     NUM_TOKENS = [7, 83, 4096]
     HIDDEN_SIZES = [768, 769, 770, 771, 5120, 5124, 5125, 5126, 8192, 8199]
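The four test modules above make the same mechanical change, so it is worth spelling out once: the unittest.TestCase base class is swapped for CustomTestCase, which the next file in this diff (sglang/test/test_utils.py) introduces. The shape of the change, using TestRMSNorm as the example:

# Before (0.4.4)
import unittest

class TestRMSNorm(unittest.TestCase):
    ...

# After (0.4.4.post2): same test body, retry-aware base class
from sglang.test.test_utils import CustomTestCase

class TestRMSNorm(CustomTestCase):
    ...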
sglang/test/test_utils.py CHANGED
@@ -1,15 +1,17 @@
 """Common utilities for testing and benchmarking"""
 
 import argparse
-import asyncio
 import copy
+import logging
 import os
 import random
 import subprocess
 import threading
 import time
+import traceback
 import unittest
 from concurrent.futures import ThreadPoolExecutor
+from dataclasses import dataclass
 from functools import partial
 from types import SimpleNamespace
 from typing import Callable, List, Optional, Tuple
@@ -32,6 +34,15 @@ DEFAULT_FP8_MODEL_NAME_FOR_ACCURACY_TEST = "neuralmagic/Meta-Llama-3-8B-Instruct
 DEFAULT_FP8_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST = (
     "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8-dynamic"
 )
+DEFAULT_FP8_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST = (
+    "nvidia/Llama-3.1-8B-Instruct-FP8"
+)
+# TODO(yundai424): right now specifying to an older revision since the latest one
+# carries kv cache quantization which doesn't work yet
+DEFAULT_FP8_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST_REVISION = (
+    "13858565416dbdc0b4e7a4a677fadfbd5b9e5bb9"
+)
+
 DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
 DEFAULT_SMALL_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
 DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
@@ -52,7 +63,6 @@ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8
 DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN = "Qwen/Qwen2.5-1.5B-Instruct"
 DEFAULT_SMALL_VLM_MODEL_NAME = "Qwen/Qwen2-VL-2B"
 
-
 DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf"
 DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST = "lmsys/sglang-EAGLE-llama2-chat-7B"
 
@@ -423,6 +433,11 @@ def popen_launch_server(
             return process
         except requests.RequestException:
             pass
+
+        return_code = process.poll()
+        if return_code is not None:
+            raise Exception(f"Server unexpectedly exits ({return_code=}).")
+
         time.sleep(10)
 
     kill_process_tree(process.pid)
@@ -453,7 +468,13 @@
     return ret_value[0]
 
 
-def run_unittest_files(files: List, timeout_per_file: float):
+@dataclass
+class TestFile:
+    name: str
+    estimated_time: float = 60
+
+
+def run_unittest_files(files: List[TestFile], timeout_per_file: float):
     tic = time.time()
     success = True
 
@@ -870,7 +891,6 @@ def run_mulit_request_test(
     enable_overlap=False,
     chunked_prefill_size=32,
 ):
-
     def workload_func(base_url, model):
         def run_one(_):
             prompt = """
@@ -905,6 +925,10 @@
 
 
 def write_github_step_summary(content):
+    if not os.environ.get("GITHUB_STEP_SUMMARY"):
+        logging.warning("GITHUB_STEP_SUMMARY environment variable not set")
+        return
+
     with open(os.environ["GITHUB_STEP_SUMMARY"], "a") as f:
         f.write(content)
 
@@ -982,3 +1006,30 @@ def run_logprob_check(self: unittest.TestCase, arg: Tuple):
             rank += 1
         else:
             raise
+
+
+class CustomTestCase(unittest.TestCase):
+    def _callTestMethod(self, method):
+        _retry_execution(
+            lambda: super(CustomTestCase, self)._callTestMethod(method),
+            max_retry=_get_max_retry(),
+        )
+
+
+def _get_max_retry():
+    return int(os.environ.get("SGLANG_TEST_MAX_RETRY", "2" if is_in_ci() else "0"))
+
+
+def _retry_execution(fn, max_retry: int):
+    if max_retry == 0:
+        fn()
+        return
+
+    try:
+        fn()
+    except Exception as e:
+        print(
+            f"retry_execution failed once and will retry. This may be an error or a flaky test. Error: {e}"
+        )
+        traceback.print_exc()
+        _retry_execution(fn, max_retry=max_retry - 1)
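A short usage sketch for the retry machinery added above: CustomTestCase._callTestMethod re-runs a failing test method up to SGLANG_TEST_MAX_RETRY times (per _get_max_retry, the default is 2 in CI and 0 elsewhere). The environment variable name and defaults come straight from the diff; the test class below is hypothetical:

import os
import unittest

# _get_max_retry() reads this lazily, so it must be set before the test runs.
os.environ["SGLANG_TEST_MAX_RETRY"] = "1"  # allow one retry even outside CI

from sglang.test.test_utils import CustomTestCase


class TestPossiblyFlaky(CustomTestCase):  # hypothetical, for illustration only
    def test_once_flaky(self):
        # On failure, _retry_execution prints the error and traceback, then
        # re-runs the method; the exception only propagates (failing the test)
        # once all retries are exhausted.
        self.assertEqual(1 + 1, 2)


if __name__ == "__main__":
    unittest.main(verbosity=2)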
sglang/utils.py CHANGED
@@ -22,6 +22,7 @@ from typing import Any, Callable, List, Optional, Tuple, Type, Union
 import numpy as np
 import requests
 from IPython.display import HTML, display
+from pydantic import BaseModel
 from tqdm import tqdm
 
 from sglang.srt.utils import kill_process_tree
@@ -29,6 +30,36 @@ from sglang.srt.utils import kill_process_tree
 logger = logging.getLogger(__name__)
 
 
+def convert_json_schema_to_str(json_schema: Union[dict, str, Type[BaseModel]]) -> str:
+    """Convert a JSON schema to a string.
+    Parameters
+    ----------
+    json_schema
+        The JSON schema.
+    Returns
+    -------
+    str
+        The JSON schema converted to a string.
+    Raises
+    ------
+    ValueError
+        If the schema is not a dictionary, a string or a Pydantic class.
+    """
+    if isinstance(json_schema, dict):
+        schema_str = json.dumps(json_schema)
+    elif isinstance(json_schema, str):
+        schema_str = json_schema
+    elif issubclass(json_schema, BaseModel):
+        schema_str = json.dumps(json_schema.model_json_schema())
+    else:
+        raise ValueError(
+            f"Cannot parse schema {json_schema}. The schema must be either "
+            + "a Pydantic class, a dictionary or a string that contains the JSON "
+            + "schema specification"
+        )
+    return schema_str
+
+
 def get_exception_traceback():
     etype, value, tb = sys.exc_info()
     err_str = "".join(traceback.format_exception(etype, value, tb))
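A usage sketch for the new helper (the Answer model is hypothetical): all three accepted forms, a Pydantic class, a dict, or a pre-serialized string, normalize to the same JSON schema string.

import json

from pydantic import BaseModel

from sglang.utils import convert_json_schema_to_str


class Answer(BaseModel):  # hypothetical schema, for illustration only
    text: str
    score: float


from_model = convert_json_schema_to_str(Answer)
from_dict = convert_json_schema_to_str(Answer.model_json_schema())
from_str = convert_json_schema_to_str(json.dumps(Answer.model_json_schema()))

assert json.loads(from_model) == json.loads(from_dict) == json.loads(from_str)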
sglang/version.py CHANGED
@@ -1 +1 @@
-__version__ = "0.4.4"
+__version__ = "0.4.4.post2"
{sglang-0.4.4.dist-info → sglang-0.4.4.post2.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
-Metadata-Version: 2.2
+Metadata-Version: 2.4
 Name: sglang
-Version: 0.4.4
+Version: 0.4.4.post2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -235,17 +235,17 @@ Requires-Dist: psutil; extra == "runtime-common"
 Requires-Dist: pydantic; extra == "runtime-common"
 Requires-Dist: python-multipart; extra == "runtime-common"
 Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
+Requires-Dist: soundfile==0.13.1; extra == "runtime-common"
 Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
-Requires-Dist: transformers==4.48.3; extra == "runtime-common"
+Requires-Dist: transformers==4.50.0; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist: xgrammar==0.1.15; extra == "runtime-common"
+Requires-Dist: xgrammar==0.1.16; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.0.5; extra == "srt"
+Requires-Dist: sgl-kernel==0.0.5.post3; extra == "srt"
 Requires-Dist: flashinfer_python==0.2.3; extra == "srt"
 Requires-Dist: torch==2.5.1; extra == "srt"
-Requires-Dist: vllm<=0.7.2,>=0.6.4.post1; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
 Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt"
 Provides-Extra: srt-hip
@@ -271,7 +271,7 @@ Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
 Provides-Extra: litellm
 Requires-Dist: litellm>=1.0.0; extra == "litellm"
 Provides-Extra: torch-memory-saver
-Requires-Dist: torch_memory_saver; extra == "torch-memory-saver"
+Requires-Dist: torch_memory_saver>=0.0.3; extra == "torch-memory-saver"
 Provides-Extra: test
 Requires-Dist: jsonlines; extra == "test"
 Requires-Dist: matplotlib; extra == "test"
@@ -319,6 +319,7 @@ Requires-Dist: sglang[test]; extra == "dev-hpu"
 Provides-Extra: dev-cpu
 Requires-Dist: sglang[all_cpu]; extra == "dev-cpu"
 Requires-Dist: sglang[test]; extra == "dev-cpu"
+Dynamic: license-file
 
 <div align="center" id="sglangtop">
 <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400" margin="10px"></img>
@@ -342,6 +343,9 @@ Requires-Dist: sglang[test]; extra == "dev-cpu"
 | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 
 ## News
+- [2025/03] Supercharge DeepSeek-R1 Inference on AMD Instinct MI300X ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1-Part2/README.html))
+- [2025/03] SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine ([PyTorch blog](https://pytorch.org/blog/sglang-joins-pytorch/))
+- [2025/02] Unlock DeepSeek-R1 Inference Performance on AMD Instinct™ MI300X GPU ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1_Perf/README.html))
 - [2025/01] 🔥 SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeepSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html), [10+ other companies](https://x.com/lmsysorg/status/1887262321636221412))
 - [2024/12] 🔥 v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
 - [2024/09] v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
@@ -362,7 +366,7 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
 The core features include:
 
-- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, overhead-free CPU scheduler, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (FP8/INT4/AWQ/GPTQ).
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, continuous batching, token attention (paged attention), speculative decoding, tensor parallelism, chunked prefill, structured outputs, and quantization (FP8/INT4/AWQ/GPTQ).
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
 - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.