sglang 0.4.0.post1__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. sglang/bench_offline_throughput.py +6 -6
  2. sglang/bench_one_batch.py +1 -0
  3. sglang/bench_serving.py +9 -1
  4. sglang/check_env.py +140 -48
  5. sglang/lang/backend/runtime_endpoint.py +1 -0
  6. sglang/lang/chat_template.py +32 -0
  7. sglang/llama3_eval.py +316 -0
  8. sglang/srt/aio_rwlock.py +100 -0
  9. sglang/srt/configs/model_config.py +8 -1
  10. sglang/srt/constrained/xgrammar_backend.py +4 -1
  11. sglang/srt/layers/attention/flashinfer_backend.py +51 -5
  12. sglang/srt/layers/attention/triton_backend.py +16 -25
  13. sglang/srt/layers/attention/triton_ops/decode_attention.py +305 -350
  14. sglang/srt/layers/linear.py +20 -2
  15. sglang/srt/layers/logits_processor.py +133 -95
  16. sglang/srt/layers/{ep_moe → moe/ep_moe}/layer.py +18 -39
  17. sglang/srt/layers/moe/fused_moe_native.py +46 -0
  18. sglang/srt/layers/{fused_moe_triton → moe/fused_moe_triton}/__init__.py +3 -7
  19. sglang/srt/layers/{fused_moe_triton → moe/fused_moe_triton}/fused_moe.py +174 -119
  20. sglang/srt/layers/{fused_moe_triton → moe/fused_moe_triton}/layer.py +17 -49
  21. sglang/srt/layers/moe/topk.py +191 -0
  22. sglang/srt/layers/quantization/__init__.py +5 -50
  23. sglang/srt/layers/quantization/fp8.py +221 -36
  24. sglang/srt/layers/quantization/fp8_kernel.py +278 -0
  25. sglang/srt/layers/quantization/fp8_utils.py +90 -1
  26. sglang/srt/layers/radix_attention.py +8 -1
  27. sglang/srt/layers/sampler.py +27 -5
  28. sglang/srt/layers/torchao_utils.py +31 -0
  29. sglang/srt/managers/detokenizer_manager.py +37 -17
  30. sglang/srt/managers/io_struct.py +39 -10
  31. sglang/srt/managers/schedule_batch.py +54 -34
  32. sglang/srt/managers/schedule_policy.py +64 -5
  33. sglang/srt/managers/scheduler.py +171 -136
  34. sglang/srt/managers/tokenizer_manager.py +184 -133
  35. sglang/srt/mem_cache/base_prefix_cache.py +2 -2
  36. sglang/srt/mem_cache/chunk_cache.py +2 -2
  37. sglang/srt/mem_cache/memory_pool.py +15 -8
  38. sglang/srt/mem_cache/radix_cache.py +12 -2
  39. sglang/srt/model_executor/cuda_graph_runner.py +25 -11
  40. sglang/srt/model_executor/model_runner.py +28 -14
  41. sglang/srt/model_parallel.py +66 -5
  42. sglang/srt/models/dbrx.py +1 -1
  43. sglang/srt/models/deepseek.py +1 -1
  44. sglang/srt/models/deepseek_v2.py +67 -18
  45. sglang/srt/models/gemma2.py +34 -0
  46. sglang/srt/models/gemma2_reward.py +0 -1
  47. sglang/srt/models/granite.py +517 -0
  48. sglang/srt/models/grok.py +73 -9
  49. sglang/srt/models/llama.py +22 -0
  50. sglang/srt/models/llama_classification.py +11 -23
  51. sglang/srt/models/llama_reward.py +0 -2
  52. sglang/srt/models/llava.py +37 -14
  53. sglang/srt/models/mixtral.py +2 -2
  54. sglang/srt/models/olmoe.py +1 -1
  55. sglang/srt/models/qwen2.py +20 -0
  56. sglang/srt/models/qwen2_moe.py +1 -1
  57. sglang/srt/models/xverse_moe.py +1 -1
  58. sglang/srt/openai_api/adapter.py +8 -0
  59. sglang/srt/openai_api/protocol.py +9 -4
  60. sglang/srt/server.py +2 -1
  61. sglang/srt/server_args.py +19 -9
  62. sglang/srt/utils.py +40 -54
  63. sglang/test/test_block_fp8.py +341 -0
  64. sglang/test/test_utils.py +3 -2
  65. sglang/utils.py +10 -3
  66. sglang/version.py +1 -1
  67. {sglang-0.4.0.post1.dist-info → sglang-0.4.1.dist-info}/METADATA +12 -7
  68. {sglang-0.4.0.post1.dist-info → sglang-0.4.1.dist-info}/RECORD +73 -67
  69. sglang/srt/layers/fused_moe_patch.py +0 -133
  70. /sglang/srt/layers/{ep_moe → moe/ep_moe}/__init__.py +0 -0
  71. /sglang/srt/layers/{ep_moe → moe/ep_moe}/kernels.py +0 -0
  72. {sglang-0.4.0.post1.dist-info → sglang-0.4.1.dist-info}/LICENSE +0 -0
  73. {sglang-0.4.0.post1.dist-info → sglang-0.4.1.dist-info}/WHEEL +0 -0
  74. {sglang-0.4.0.post1.dist-info → sglang-0.4.1.dist-info}/top_level.txt +0 -0
sglang/bench_offline_throughput.py CHANGED
@@ -201,18 +201,17 @@ def throughput_test_once(
         for r in reqs
     ]
 
-    st = time.perf_counter()
     if profile:
         backend.start_profile()
 
+    st = time.perf_counter()
     gen_out = backend.generate(prompt=prompt, sampling_params=sampling_params)
+    latency = time.perf_counter() - st
 
     if profile:
         backend.stop_profile()
         monitor_trace_file(os.getenv("SGLANG_TORCH_PROFILER_DIR"))
 
-    latency = time.perf_counter() - st
-
     if backend_name == "runtime":
         gen_out = json.loads(gen_out)
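Note on the hunk above: the timer now starts after backend.start_profile() and stops immediately after generate(), so profiler start/stop and trace-file monitoring no longer inflate the measured latency. A minimal sketch of the same pattern (function and variable names here are illustrative, not taken from the diff):

    import time

    def timed_generate(backend, prompt, sampling_params):
        # Time only the generation call; profiler setup and teardown
        # around it stay outside the measured region.
        st = time.perf_counter()
        out = backend.generate(prompt=prompt, sampling_params=sampling_params)
        latency = time.perf_counter() - st
        return out, latency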
 
@@ -285,7 +284,7 @@ def throughput_test(
     else:
         raise ValueError('Please set backend to either "engine" or "runtime"')
 
-    tokenizer_id = server_args.model_path
+    tokenizer_id = server_args.tokenizer_path or server_args.model_path
     tokenizer = get_tokenizer(tokenizer_id)
 
     # Set global environmnets
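The tokenizer is now resolved from the tokenizer path first and only falls back to the model path when none was given, which matters for models whose tokenizer lives in a separate repository. A tiny illustration of the fallback (the paths are placeholders):

    tokenizer_path = None
    model_path = "org/model"  # placeholder
    assert (tokenizer_path or model_path) == "org/model"

    tokenizer_path = "org/custom-tokenizer"  # placeholder
    assert (tokenizer_path or model_path) == "org/custom-tokenizer"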
@@ -304,8 +303,8 @@ def throughput_test(
     warmup_requests = sample_random_requests(
         input_len=256,
         output_len=16,
-        num_prompts=16,
-        range_ratio=0.8,
+        num_prompts=min(bench_args.num_prompts, 16),
+        range_ratio=1.0,
         tokenizer=tokenizer,
         dataset_path=bench_args.dataset_path,
     )
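The warmup batch is now capped by the benchmark size and uses a fixed length range: with --num-prompts 4, the warmup issues min(4, 16) = 4 requests instead of always 16, and range_ratio=1.0 keeps every warmup request at the full input_len=256 / output_len=16 rather than sampling shorter lengths. Illustrative one-liner:

    num_warmup = min(bench_args.num_prompts, 16)  # e.g. min(4, 16) == 4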
@@ -321,6 +320,7 @@ def throughput_test(
         extra_request_body=extra_request_body,
         profile=False,
     )
+    time.sleep(0.5)
 
     logging.info("\nBenchmark...")
     result = throughput_test_once(
sglang/bench_one_batch.py CHANGED
@@ -385,6 +385,7 @@ def latency_test(
         8,  # shorter decoding to speed up the warmup
         server_args.device,
     )
+
     rank_print("Benchmark ...")
 
     # Run the sweep
sglang/bench_serving.py CHANGED
@@ -321,6 +321,8 @@ async def async_request_sglang_generate(
             },
             "stream": not args.disable_stream,
             "lora_path": request_func_input.lora_name,
+            "return_logprob": args.return_logprob,
+            "logprob_start_len": -1,
             **request_func_input.extra_request_body,
         }
         headers = {}
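For reference, the request body the benchmark now sends to the native /generate endpoint carries the two logprob fields shown above. A hedged standalone equivalent (URL, prompt, and sampling values are placeholders; only return_logprob and logprob_start_len come from this diff):

    import requests

    payload = {
        "text": "The capital of France is",        # placeholder prompt
        "sampling_params": {"max_new_tokens": 8},  # placeholder settings
        "stream": False,
        "return_logprob": True,                    # new field sent by the benchmark
        "logprob_start_len": -1,                   # same value the benchmark sends
    }
    resp = requests.post("http://localhost:30000/generate", json=payload)
    print(resp.json())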
@@ -911,7 +913,7 @@ async def benchmark(
         prompt=test_prompt,
         api_url=api_url,
         prompt_len=test_prompt_len,
-        output_len=test_output_len,
+        output_len=min(test_output_len, 32),
         lora_name=lora_name,
         extra_request_body=extra_request_body,
     )
@@ -922,6 +924,7 @@
             f"are correctly specified. Error: {test_output.error}"
         )
     else:
+        requests.post(base_url + "/flush_cache")
         print("Initial test run completed. Starting main benchmark run...")
 
     time.sleep(1.5)
@@ -1413,6 +1416,11 @@ if __name__ == "__main__":
         action="store_true",
         help="Disable ignoring EOS.",
     )
+    parser.add_argument(
+        "--return-logprob",
+        action="store_true",
+        help="Return logprob.",
+    )
     parser.add_argument(
         "--extra-request-body",
         metavar='{"key1": "value1", "key2": "value2"}',
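Usage-wise the new flag is opt-in; a run against a local server might look like this (model, port, and request count are placeholders):

    python3 -m sglang.bench_serving --backend sglang --num-prompts 10 --return-logprob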
sglang/check_env.py CHANGED
@@ -9,6 +9,13 @@ from collections import OrderedDict, defaultdict
 
 import torch
 
+from sglang.srt.utils import is_hip
+
+
+def is_cuda_v2():
+    return torch.version.cuda is not None
+
+
 # List of packages to check versions
 PACKAGE_LIST = [
     "sglang",
@@ -63,13 +70,22 @@ def get_cuda_info():
     """
     Get CUDA-related information if available.
     """
-    cuda_info = {"CUDA available": torch.cuda.is_available()}
+    if is_cuda_v2():
+        cuda_info = {"CUDA available": torch.cuda.is_available()}
+
+        if cuda_info["CUDA available"]:
+            cuda_info.update(_get_gpu_info())
+            cuda_info.update(_get_cuda_version_info())
+
+        return cuda_info
+    elif is_hip():
+        cuda_info = {"ROCM available": torch.cuda.is_available()}
 
-    if cuda_info["CUDA available"]:
-        cuda_info.update(_get_gpu_info())
-        cuda_info.update(_get_cuda_version_info())
+        if cuda_info["ROCM available"]:
+            cuda_info.update(_get_gpu_info())
+            cuda_info.update(_get_cuda_version_info())
 
-    return cuda_info
+        return cuda_info
 
 
 def _get_gpu_info():
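The new is_cuda_v2() helper (added in the import hunk above) keys off which version attribute of the PyTorch build is populated, while is_hip() comes from sglang.srt.utils. A quick illustrative probe, not part of the diff:

    import torch

    # A CUDA wheel sets torch.version.cuda (e.g. "12.1") and leaves torch.version.hip as None;
    # a ROCm wheel does the opposite.
    print("CUDA build:", torch.version.cuda is not None)
    print("ROCm build:", torch.version.hip is not None)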
@@ -103,34 +119,72 @@ def _get_cuda_version_info():
     """
     Get CUDA version information.
     """
-    from torch.utils.cpp_extension import CUDA_HOME
+    if is_cuda_v2():
+        from torch.utils.cpp_extension import CUDA_HOME
 
-    cuda_info = {"CUDA_HOME": CUDA_HOME}
+        cuda_info = {"CUDA_HOME": CUDA_HOME}
 
-    if CUDA_HOME and os.path.isdir(CUDA_HOME):
-        cuda_info.update(_get_nvcc_info())
-        cuda_info.update(_get_cuda_driver_version())
+        if CUDA_HOME and os.path.isdir(CUDA_HOME):
+            cuda_info.update(_get_nvcc_info())
+            cuda_info.update(_get_cuda_driver_version())
 
-    return cuda_info
+        return cuda_info
+    elif is_hip():
+        from torch.utils.cpp_extension import ROCM_HOME as ROCM_HOME
+
+        cuda_info = {"ROCM_HOME": ROCM_HOME}
+
+        if ROCM_HOME and os.path.isdir(ROCM_HOME):
+            cuda_info.update(_get_nvcc_info())
+            cuda_info.update(_get_cuda_driver_version())
+
+        return cuda_info
+    else:
+        cuda_info = {"CUDA_HOME": ""}
+        return cuda_info
 
 
 def _get_nvcc_info():
     """
     Get NVCC version information.
     """
-    from torch.utils.cpp_extension import CUDA_HOME
+    if is_cuda_v2():
+        from torch.utils.cpp_extension import CUDA_HOME
 
-    try:
-        nvcc = os.path.join(CUDA_HOME, "bin/nvcc")
-        nvcc_output = (
-            subprocess.check_output(f'"{nvcc}" -V', shell=True).decode("utf-8").strip()
-        )
-        return {
-            "NVCC": nvcc_output[
-                nvcc_output.rfind("Cuda compilation tools") : nvcc_output.rfind("Build")
-            ].strip()
-        }
-    except subprocess.SubprocessError:
+        try:
+            nvcc = os.path.join(CUDA_HOME, "bin/nvcc")
+            nvcc_output = (
+                subprocess.check_output(f'"{nvcc}" -V', shell=True)
+                .decode("utf-8")
+                .strip()
+            )
+            return {
+                "NVCC": nvcc_output[
+                    nvcc_output.rfind("Cuda compilation tools") : nvcc_output.rfind(
+                        "Build"
+                    )
+                ].strip()
+            }
+        except subprocess.SubprocessError:
+            return {"NVCC": "Not Available"}
+    elif is_hip():
+        from torch.utils.cpp_extension import ROCM_HOME
+
+        try:
+            hipcc = os.path.join(ROCM_HOME, "bin/hipcc")
+            hipcc_output = (
+                subprocess.check_output(f'"{hipcc}" --version', shell=True)
+                .decode("utf-8")
+                .strip()
+            )
+            return {
+                "HIPCC": hipcc_output[
+                    hipcc_output.rfind("HIP version") : hipcc_output.rfind("AMD clang")
+                ].strip()
+            }
+        except subprocess.SubprocessError:
+            return {"HIPCC": "Not Available"}
+    else:
         return {"NVCC": "Not Available"}
 
 
@@ -139,20 +193,40 @@ def _get_cuda_driver_version():
     Get CUDA driver version.
     """
     versions = set()
-    try:
-        output = subprocess.check_output(
-            [
-                "nvidia-smi",
-                "--query-gpu=driver_version",
-                "--format=csv,noheader,nounits",
-            ]
-        )
-        versions = set(output.decode().strip().split("\n"))
-        if len(versions) == 1:
-            return {"CUDA Driver Version": versions.pop()}
-        else:
-            return {"CUDA Driver Versions": ", ".join(sorted(versions))}
-    except subprocess.SubprocessError:
+    if is_cuda_v2():
+        try:
+            output = subprocess.check_output(
+                [
+                    "nvidia-smi",
+                    "--query-gpu=driver_version",
+                    "--format=csv,noheader,nounits",
+                ]
+            )
+            versions = set(output.decode().strip().split("\n"))
+            if len(versions) == 1:
+                return {"CUDA Driver Version": versions.pop()}
+            else:
+                return {"CUDA Driver Versions": ", ".join(sorted(versions))}
+        except subprocess.SubprocessError:
+            return {"CUDA Driver Version": "Not Available"}
+    elif is_hip():
+        try:
+            output = subprocess.check_output(
+                [
+                    "rocm-smi",
+                    "--showdriverversion",
+                    "--csv",
+                ]
+            )
+            versions = set(output.decode().strip().split("\n"))
+            versions.discard("name, value")
+            ver = versions.pop()
+            ver = ver.replace('"Driver version", ', "").replace('"', "")
+
+            return {"ROCM Driver Version": ver}
+        except subprocess.SubprocessError:
+            return {"ROCM Driver Version": "Not Available"}
+    else:
         return {"CUDA Driver Version": "Not Available"}
 
 
@@ -160,16 +234,31 @@ def get_gpu_topology():
     """
     Get GPU topology information.
    """
-    try:
-        result = subprocess.run(
-            ["nvidia-smi", "topo", "-m"],
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            text=True,
-            check=True,
-        )
-        return "\n" + result.stdout if result.returncode == 0 else None
-    except subprocess.SubprocessError:
+    if is_cuda_v2():
+        try:
+            result = subprocess.run(
+                ["nvidia-smi", "topo", "-m"],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+                check=True,
+            )
+            return "\n" + result.stdout if result.returncode == 0 else None
+        except subprocess.SubprocessError:
+            return None
+    elif is_hip():
+        try:
+            result = subprocess.run(
+                ["rocm-smi", "--showtopotype"],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+                check=True,
+            )
+            return "\n" + result.stdout if result.returncode == 0 else None
+        except subprocess.SubprocessError:
+            return None
+    else:
         return None
 
 
@@ -196,7 +285,10 @@ def check_env():
 
     gpu_topo = get_gpu_topology()
     if gpu_topo:
-        env_info["NVIDIA Topology"] = gpu_topo
+        if is_cuda_v2():
+            env_info["NVIDIA Topology"] = gpu_topo
+        elif is_hip():
+            env_info["AMD Topology"] = gpu_topo
 
     hypervisor_vendor = get_hypervisor_vendor()
     if hypervisor_vendor:
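With these changes the environment checker reports ROCm details (HIPCC, driver version, AMD topology) on AMD GPUs instead of falling back to empty CUDA fields. It is typically invoked as a module:

    python3 -m sglang.check_env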
sglang/lang/backend/runtime_endpoint.py CHANGED
@@ -55,6 +55,7 @@ class RuntimeEndpoint(BaseBackend):
             self.base_url + "/flush_cache",
             api_key=self.api_key,
             verify=self.verify,
+            method="POST",
         )
         self._assert_success(res)
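This mirrors the bench_serving.py change above, where the cache is flushed with an explicit POST before the main run. A hedged standalone equivalent (the base URL is a placeholder for a locally running server):

    import requests

    # The runtime-endpoint client now issues /flush_cache as a POST (see the hunk above).
    requests.post("http://localhost:30000/flush_cache")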
 
sglang/lang/chat_template.py CHANGED
@@ -320,6 +320,28 @@ register_chat_template(
     )
 )
 
+register_chat_template(
+    ChatTemplate(
+        name="granite-3-instruct",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": (
+                "<|start_of_role|>system<|end_of_role|>",
+                "<|end_of_text|>",
+            ),
+            "user": (
+                "<|start_of_role|>user<|end_of_role|>",
+                "<|end_of_text|>",
+            ),
+            "assistant": (
+                "<|start_of_role|>assistant<|end_of_role|>",
+                "<|end_of_text|>",
+            ),
+        },
+        stop_str=("<|end_of_text|>",),
+    )
+)
+
 
 @register_chat_template_matching_function
 def match_dbrx(model_path: str):
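From the role prefixes and suffixes registered above, a user turn followed by an assistant reply would be laid out roughly as follows (assembled by hand from the template fields, not captured server output, and ignoring any whitespace the renderer may add between turns):

    <|start_of_role|>user<|end_of_role|>What is 2 + 2?<|end_of_text|>
    <|start_of_role|>assistant<|end_of_role|>4<|end_of_text|>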
@@ -402,6 +424,16 @@ def match_c4ai_command_r(model_path: str):
     return get_chat_template("c4ai-command-r")
 
 
+@register_chat_template_matching_function
+def match_granite_instruct(model_path: str):
+    model_path = model_path.lower()
+    # When future versions of Granite are released, this code may
+    # need to be updated. For now, assume that the Granite 3.0
+    # template works across the board.
+    if "granite" in model_path and "instruct" in model_path:
+        return get_chat_template("granite-3-instruct")
+
+
 if __name__ == "__main__":
     messages = [
         {"role": "system", "content": None},  # None means default