sglang 0.4.6.post3__py3-none-any.whl → 0.4.6.post4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. sglang/bench_offline_throughput.py +4 -2
  2. sglang/bench_one_batch.py +2 -2
  3. sglang/bench_one_batch_server.py +143 -15
  4. sglang/bench_serving.py +9 -7
  5. sglang/compile_deep_gemm.py +1 -1
  6. sglang/eval/loogle_eval.py +157 -0
  7. sglang/lang/chat_template.py +78 -78
  8. sglang/lang/tracer.py +1 -1
  9. sglang/srt/code_completion_parser.py +1 -1
  10. sglang/srt/configs/deepseekvl2.py +2 -2
  11. sglang/srt/configs/model_config.py +1 -0
  12. sglang/srt/constrained/base_grammar_backend.py +55 -72
  13. sglang/srt/constrained/llguidance_backend.py +25 -21
  14. sglang/srt/constrained/outlines_backend.py +27 -26
  15. sglang/srt/constrained/reasoner_grammar_backend.py +22 -33
  16. sglang/srt/constrained/xgrammar_backend.py +69 -43
  17. sglang/srt/conversation.py +48 -43
  18. sglang/srt/disaggregation/base/conn.py +1 -0
  19. sglang/srt/disaggregation/decode.py +7 -2
  20. sglang/srt/disaggregation/fake/conn.py +1 -1
  21. sglang/srt/disaggregation/mooncake/conn.py +227 -120
  22. sglang/srt/disaggregation/nixl/conn.py +1 -0
  23. sglang/srt/disaggregation/prefill.py +7 -4
  24. sglang/srt/disaggregation/utils.py +7 -1
  25. sglang/srt/entrypoints/engine.py +17 -2
  26. sglang/srt/entrypoints/http_server.py +17 -2
  27. sglang/srt/function_call_parser.py +2 -2
  28. sglang/srt/layers/attention/flashattention_backend.py +1 -1
  29. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +1 -1
  30. sglang/srt/layers/attention/utils.py +4 -2
  31. sglang/srt/layers/dp_attention.py +71 -21
  32. sglang/srt/layers/layernorm.py +1 -1
  33. sglang/srt/layers/logits_processor.py +46 -11
  34. sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
  35. sglang/srt/layers/moe/ep_moe/layer.py +1 -1
  36. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +1 -1
  37. sglang/srt/layers/moe/topk.py +1 -1
  38. sglang/srt/layers/quantization/__init__.py +1 -1
  39. sglang/srt/layers/quantization/blockwise_int8.py +2 -2
  40. sglang/srt/layers/quantization/deep_gemm.py +72 -71
  41. sglang/srt/layers/quantization/fp8.py +2 -2
  42. sglang/srt/layers/quantization/fp8_kernel.py +3 -3
  43. sglang/srt/layers/quantization/int8_kernel.py +2 -2
  44. sglang/srt/layers/sampler.py +0 -4
  45. sglang/srt/layers/vocab_parallel_embedding.py +18 -7
  46. sglang/srt/lora/lora_manager.py +1 -1
  47. sglang/srt/lora/mem_pool.py +4 -4
  48. sglang/srt/lora/triton_ops/gate_up_lora_b.py +1 -1
  49. sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
  50. sglang/srt/lora/triton_ops/sgemm_lora_a.py +1 -1
  51. sglang/srt/lora/triton_ops/sgemm_lora_b.py +1 -1
  52. sglang/srt/lora/utils.py +1 -1
  53. sglang/srt/managers/data_parallel_controller.py +3 -3
  54. sglang/srt/managers/detokenizer_manager.py +21 -8
  55. sglang/srt/managers/io_struct.py +3 -1
  56. sglang/srt/managers/mm_utils.py +1 -1
  57. sglang/srt/managers/multimodal_processors/llava.py +46 -0
  58. sglang/srt/managers/multimodal_processors/pixtral.py +127 -0
  59. sglang/srt/managers/schedule_batch.py +76 -24
  60. sglang/srt/managers/schedule_policy.py +0 -3
  61. sglang/srt/managers/scheduler.py +113 -88
  62. sglang/srt/managers/scheduler_output_processor_mixin.py +124 -55
  63. sglang/srt/managers/tokenizer_manager.py +133 -34
  64. sglang/srt/managers/tp_worker.py +12 -9
  65. sglang/srt/managers/tp_worker_overlap_thread.py +22 -11
  66. sglang/srt/mem_cache/memory_pool.py +2 -0
  67. sglang/srt/metrics/collector.py +312 -37
  68. sglang/srt/model_executor/cuda_graph_runner.py +10 -11
  69. sglang/srt/model_executor/forward_batch_info.py +1 -1
  70. sglang/srt/model_executor/model_runner.py +19 -14
  71. sglang/srt/models/deepseek_janus_pro.py +2 -2
  72. sglang/srt/models/deepseek_v2.py +23 -20
  73. sglang/srt/models/llama.py +2 -0
  74. sglang/srt/models/llama4.py +5 -6
  75. sglang/srt/models/llava.py +248 -5
  76. sglang/srt/models/mixtral.py +98 -34
  77. sglang/srt/models/pixtral.py +467 -0
  78. sglang/srt/models/roberta.py +1 -1
  79. sglang/srt/models/torch_native_llama.py +1 -1
  80. sglang/srt/openai_api/adapter.py +30 -4
  81. sglang/srt/openai_api/protocol.py +0 -8
  82. sglang/srt/reasoning_parser.py +3 -3
  83. sglang/srt/sampling/custom_logit_processor.py +18 -3
  84. sglang/srt/sampling/sampling_batch_info.py +4 -56
  85. sglang/srt/sampling/sampling_params.py +2 -2
  86. sglang/srt/server_args.py +34 -4
  87. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -3
  88. sglang/srt/speculative/eagle_utils.py +7 -7
  89. sglang/srt/speculative/eagle_worker.py +22 -19
  90. sglang/srt/utils.py +6 -5
  91. sglang/test/few_shot_gsm8k.py +2 -2
  92. sglang/test/few_shot_gsm8k_engine.py +2 -2
  93. sglang/test/run_eval.py +2 -2
  94. sglang/test/runners.py +8 -1
  95. sglang/test/send_one.py +13 -3
  96. sglang/test/simple_eval_common.py +1 -1
  97. sglang/test/simple_eval_humaneval.py +1 -1
  98. sglang/test/test_programs.py +5 -5
  99. sglang/test/test_utils.py +89 -14
  100. sglang/utils.py +1 -1
  101. sglang/version.py +1 -1
  102. {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post4.dist-info}/METADATA +6 -5
  103. {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post4.dist-info}/RECORD +107 -104
  104. /sglang/{llama3_eval.py → eval/llama3_eval.py} +0 -0
  105. {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post4.dist-info}/WHEEL +0 -0
  106. {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post4.dist-info}/licenses/LICENSE +0 -0
  107. {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post4.dist-info}/top_level.txt +0 -0
sglang/test/run_eval.py CHANGED
@@ -71,9 +71,9 @@ def run_eval(args):
71
71
  )
72
72
 
73
73
  # Run eval
74
- tic = time.time()
74
+ tic = time.perf_counter()
75
75
  result = eval_obj(sampler)
76
- latency = time.time() - tic
76
+ latency = time.perf_counter() - tic
77
77
 
78
78
  # Dump reports
79
79
  metrics = result.metrics | {"score": result.score}
sglang/test/runners.py CHANGED
@@ -19,7 +19,9 @@ from typing import List, Optional, Tuple, Union
19
19
 
20
20
  import torch
21
21
  import torch.nn.functional as F
22
+ import transformers
22
23
  from transformers import (
24
+ AutoConfig,
23
25
  AutoModel,
24
26
  AutoModelForCausalLM,
25
27
  AutoModelForVision2Seq,
@@ -211,7 +213,12 @@ class HFRunner:
211
213
 
212
214
  # Load the model and tokenizer
213
215
  if self.model_type == "generation":
214
- self.base_model = AutoModelForCausalLM.from_pretrained(
216
+ config = AutoConfig.from_pretrained(model_path)
217
+ if model_archs := getattr(config, "architectures"):
218
+ model_cls = getattr(transformers, model_archs[0])
219
+ else:
220
+ model_cls = AutoModelForCausalLM
221
+ self.base_model = model_cls.from_pretrained(
215
222
  model_path,
216
223
  torch_dtype=torch_dtype,
217
224
  trust_remote_code=self.trust_remote_code,
sglang/test/send_one.py CHANGED
@@ -27,6 +27,7 @@ class BenchArgs:
27
27
  "Human: Give me a fully functional FastAPI server. Show the python code.\n\nAssistant:"
28
28
  )
29
29
  image: bool = False
30
+ many_images: bool = False
30
31
  stream: bool = False
31
32
 
32
33
  @staticmethod
@@ -48,6 +49,7 @@ class BenchArgs:
48
49
  parser.add_argument("--return-logprob", action="store_true")
49
50
  parser.add_argument("--prompt", type=str, default=BenchArgs.prompt)
50
51
  parser.add_argument("--image", action="store_true")
52
+ parser.add_argument("--many-images", action="store_true")
51
53
  parser.add_argument("--stream", action="store_true")
52
54
 
53
55
  @classmethod
@@ -62,6 +64,17 @@ def send_one_prompt(args):
62
64
  "Human: Describe this image in a very short sentence.\n\nAssistant:"
63
65
  )
64
66
  image_data = "https://raw.githubusercontent.com/sgl-project/sglang/main/test/lang/example_image.png"
67
+ elif args.many_images:
68
+ args.prompt = (
69
+ "Human: I have one reference image and many images."
70
+ "Describe their relationship in a very short sentence.\n\nAssistant:"
71
+ )
72
+ image_data = [
73
+ "https://raw.githubusercontent.com/sgl-project/sglang/main/test/lang/example_image.png",
74
+ "https://raw.githubusercontent.com/sgl-project/sglang/main/test/lang/example_image.png",
75
+ "https://raw.githubusercontent.com/sgl-project/sglang/main/test/lang/example_image.png",
76
+ "https://raw.githubusercontent.com/sgl-project/sglang/main/test/lang/example_image.png",
77
+ ]
65
78
  else:
66
79
  image_data = None
67
80
 
@@ -74,9 +87,6 @@ def send_one_prompt(args):
74
87
  "Write in a format of json.\nAssistant:"
75
88
  )
76
89
  json_schema = "$$ANY$$"
77
- json_schema = (
78
- '{"type": "object", "properties": {"population": {"type": "integer"}}}'
79
- )
80
90
  else:
81
91
  json_schema = None
82
92
 
sglang/test/simple_eval_common.py CHANGED
@@ -140,7 +140,7 @@ class ChatCompletionSampler(SamplerBase):
140
140
  max_tokens=self.max_tokens,
141
141
  )
142
142
  return response.choices[0].message.content
143
- # NOTE: BadRequestError is triggered once for MMMU, please uncomment if you are reruning MMMU
143
+ # NOTE: BadRequestError is triggered once for MMMU, please uncomment if you are rerunning MMMU
144
144
  except openai.BadRequestError as e:
145
145
  print("Bad Request Error", e)
146
146
  return ""
sglang/test/simple_eval_humaneval.py CHANGED
@@ -121,7 +121,7 @@ class HumanEval(Eval):
121
121
  convo=convo,
122
122
  metrics={
123
123
  f"pass@{k}": estimate_pass_at_k([total], [correct], k)
124
- # this will be aggrated so no need of .mean()
124
+ # this will be aggregated so no need of .mean()
125
125
  for k in self._ks_passes
126
126
  if total >= k
127
127
  },
sglang/test/test_programs.py CHANGED
@@ -370,7 +370,7 @@ def test_dtype_gen():
370
370
  @sgl.function
371
371
  def dtype_gen(s):
372
372
  s += "Q: What is the full name of DNS?\n"
373
- s += "A: The full nams is " + sgl.gen("str_res", dtype=str, stop="\n") + "\n"
373
+ s += "A: The full names is " + sgl.gen("str_res", dtype=str, stop="\n") + "\n"
374
374
  s += "Q: Which year was DNS invented?\n"
375
375
  s += "A: " + sgl.gen("int_res", dtype=int) + "\n"
376
376
  s += "Q: What is the value of pi?\n"
@@ -503,7 +503,7 @@ def test_hellaswag_select():
503
503
  #####################################
504
504
 
505
505
  # Run requests
506
- tic = time.time()
506
+ tic = time.perf_counter()
507
507
  rets = few_shot_hellaswag.run_batch(
508
508
  arguments,
509
509
  temperature=0,
@@ -514,13 +514,13 @@ def test_hellaswag_select():
514
514
  preds = []
515
515
  for i, ret in enumerate(rets):
516
516
  preds.append(choices[i].index(ret["answer"]))
517
- latency = time.time() - tic
517
+ latency = time.perf_counter() - tic
518
518
 
519
519
  # Compute accuracy
520
520
  accuracy = np.mean(np.array(preds) == np.array(labels))
521
521
 
522
522
  # Test generator style of run_batch
523
- tic = time.time()
523
+ tic = time.perf_counter()
524
524
  rets = few_shot_hellaswag.run_batch(
525
525
  arguments,
526
526
  temperature=0,
@@ -531,7 +531,7 @@ def test_hellaswag_select():
531
531
  preds_gen = []
532
532
  for i, ret in enumerate(rets):
533
533
  preds_gen.append(choices[i].index(ret["answer"]))
534
- latency_gen = time.time() - tic
534
+ latency_gen = time.perf_counter() - tic
535
535
 
536
536
  # Compute accuracy
537
537
  accuracy_gen = np.mean(np.array(preds_gen) == np.array(labels))
sglang/test/test_utils.py CHANGED
@@ -395,12 +395,12 @@ def popen_launch_server(
395
395
  other_args: list[str] = (),
396
396
  env: Optional[dict] = None,
397
397
  return_stdout_stderr: Optional[tuple] = None,
398
- pd_seperated: bool = False,
398
+ pd_separated: bool = False,
399
399
  ):
400
400
  _, host, port = base_url.split(":")
401
401
  host = host[2:]
402
402
 
403
- if pd_seperated:
403
+ if pd_separated:
404
404
  command = "sglang.launch_pd_server"
405
405
  else:
406
406
  command = "sglang.launch_server"
@@ -414,7 +414,7 @@ def popen_launch_server(
414
414
  *[str(x) for x in other_args],
415
415
  ]
416
416
 
417
- if pd_seperated:
417
+ if pd_separated:
418
418
  command.extend(
419
419
  [
420
420
  "--lb-host",
@@ -449,9 +449,9 @@ def popen_launch_server(
449
449
  else:
450
450
  process = subprocess.Popen(command, stdout=None, stderr=None, env=env)
451
451
 
452
- start_time = time.time()
452
+ start_time = time.perf_counter()
453
453
  with requests.Session() as session:
454
- while time.time() - start_time < timeout:
454
+ while time.perf_counter() - start_time < timeout:
455
455
  try:
456
456
  headers = {
457
457
  "Content-Type": "application/json; charset=utf-8",
@@ -478,6 +478,81 @@ def popen_launch_server(
478
478
  raise TimeoutError("Server failed to start within the timeout period.")
479
479
 
480
480
 
481
+ def popen_launch_pd_server(
482
+ model: str,
483
+ base_url: str,
484
+ timeout: float,
485
+ api_key: Optional[str] = None,
486
+ other_args: list[str] = (),
487
+ env: Optional[dict] = None,
488
+ return_stdout_stderr: Optional[tuple] = None,
489
+ ):
490
+ _, host, port = base_url.split(":")
491
+ host = host[2:]
492
+
493
+ command = "sglang.launch_server"
494
+
495
+ command = [
496
+ "python3",
497
+ "-m",
498
+ command,
499
+ "--model-path",
500
+ model,
501
+ *[str(x) for x in other_args],
502
+ ]
503
+
504
+ command.extend(
505
+ [
506
+ "--host",
507
+ host,
508
+ "--port",
509
+ port,
510
+ ]
511
+ )
512
+
513
+ if api_key:
514
+ command += ["--api-key", api_key]
515
+
516
+ print(f"command={' '.join(command)}")
517
+
518
+ if return_stdout_stderr:
519
+ process = subprocess.Popen(
520
+ command,
521
+ stdout=return_stdout_stderr[0],
522
+ stderr=return_stdout_stderr[1],
523
+ env=env,
524
+ text=True,
525
+ )
526
+ else:
527
+ process = subprocess.Popen(command, stdout=None, stderr=None, env=env)
528
+
529
+ start_time = time.time()
530
+ with requests.Session() as session:
531
+ while time.time() - start_time < timeout:
532
+ try:
533
+ headers = {
534
+ "Content-Type": "application/json; charset=utf-8",
535
+ "Authorization": f"Bearer {api_key}",
536
+ }
537
+ response = session.get(
538
+ f"{base_url}/health",
539
+ headers=headers,
540
+ )
541
+ if response.status_code == 200:
542
+ return process
543
+ except requests.RequestException:
544
+ pass
545
+
546
+ return_code = process.poll()
547
+ if return_code is not None:
548
+ raise Exception(f"Server unexpectedly exits ({return_code=}).")
549
+
550
+ time.sleep(10)
551
+
552
+ kill_process_tree(process.pid)
553
+ raise TimeoutError("Server failed to start within the timeout period.")
554
+
555
+
481
556
  def run_with_timeout(
482
557
  func: Callable,
483
558
  args: tuple = (),
@@ -509,7 +584,7 @@ class TestFile:
509
584
 
510
585
 
511
586
  def run_unittest_files(files: List[TestFile], timeout_per_file: float):
512
- tic = time.time()
587
+ tic = time.perf_counter()
513
588
  success = True
514
589
 
515
590
  for i, file in enumerate(files):
@@ -524,13 +599,13 @@ def run_unittest_files(files: List[TestFile], timeout_per_file: float):
524
599
  f".\n.\nBegin ({i}/{len(files) - 1}):\npython3 {filename}\n.\n.\n",
525
600
  flush=True,
526
601
  )
527
- tic = time.time()
602
+ tic = time.perf_counter()
528
603
 
529
604
  process = subprocess.Popen(
530
605
  ["python3", filename], stdout=None, stderr=None, env=os.environ
531
606
  )
532
607
  process.wait()
533
- elapsed = time.time() - tic
608
+ elapsed = time.perf_counter() - tic
534
609
 
535
610
  print(
536
611
  f".\n.\nEnd ({i}/{len(files) - 1}):\n{filename=}, {elapsed=:.0f}, {estimated_time=}\n.\n.\n",
@@ -556,9 +631,9 @@ def run_unittest_files(files: List[TestFile], timeout_per_file: float):
556
631
  break
557
632
 
558
633
  if success:
559
- print(f"Success. Time elapsed: {time.time() - tic:.2f}s", flush=True)
634
+ print(f"Success. Time elapsed: {time.perf_counter() - tic:.2f}s", flush=True)
560
635
  else:
561
- print(f"Fail. Time elapsed: {time.time() - tic:.2f}s", flush=True)
636
+ print(f"Fail. Time elapsed: {time.perf_counter() - tic:.2f}s", flush=True)
562
637
 
563
638
  return 0 if success else -1
564
639
 
@@ -581,7 +656,7 @@ def get_benchmark_args(
581
656
  disable_stream=False,
582
657
  disable_ignore_eos=False,
583
658
  seed: int = 0,
584
- pd_seperated: bool = False,
659
+ pd_separated: bool = False,
585
660
  ):
586
661
  return SimpleNamespace(
587
662
  backend="sglang",
@@ -611,7 +686,7 @@ def get_benchmark_args(
611
686
  profile=None,
612
687
  lora_name=None,
613
688
  prompt_suffix="",
614
- pd_seperated=pd_seperated,
689
+ pd_separated=pd_separated,
615
690
  )
616
691
 
617
692
 
@@ -675,7 +750,7 @@ def run_bench_serving_multi(
675
750
  other_server_args,
676
751
  benchmark_args,
677
752
  need_warmup=False,
678
- pd_seperated=False,
753
+ pd_separated=False,
679
754
  ):
680
755
  # Launch the server
681
756
  process = popen_launch_server(
@@ -683,7 +758,7 @@ def run_bench_serving_multi(
683
758
  base_url,
684
759
  timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
685
760
  other_args=other_server_args,
686
- pd_seperated=pd_seperated,
761
+ pd_separated=pd_separated,
687
762
  )
688
763
 
689
764
  # run benchmark for all
sglang/utils.py CHANGED
@@ -278,7 +278,7 @@ def graceful_registry(sub_module_name: str):
278
278
  f"{sub_module_name} Received signal to shutdown. Performing graceful shutdown..."
279
279
  )
280
280
  if signum == signal.SIGTERM:
281
- logger.info(f"{sub_module_name} recive sigterm")
281
+ logger.info(f"{sub_module_name} receive sigterm")
282
282
 
283
283
  signal.signal(signal.SIGTERM, graceful_shutdown)
284
284
 
sglang/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.4.6.post3"
1
+ __version__ = "0.4.6.post4"
{sglang-0.4.6.post3.dist-info → sglang-0.4.6.post4.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sglang
3
- Version: 0.4.6.post3
3
+ Version: 0.4.6.post4
4
4
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -247,7 +247,7 @@ Requires-Dist: xgrammar==0.1.19; extra == "runtime-common"
247
247
  Requires-Dist: blobfile==3.0.0; extra == "runtime-common"
248
248
  Provides-Extra: srt
249
249
  Requires-Dist: sglang[runtime_common]; extra == "srt"
250
- Requires-Dist: sgl-kernel==0.1.1; extra == "srt"
250
+ Requires-Dist: sgl-kernel==0.1.2.post1; extra == "srt"
251
251
  Requires-Dist: flashinfer_python==0.2.5; extra == "srt"
252
252
  Requires-Dist: torch==2.6.0; extra == "srt"
253
253
  Requires-Dist: torchvision==0.21.0; extra == "srt"
@@ -301,6 +301,7 @@ Requires-Dist: sglang[srt]; extra == "all"
301
301
  Requires-Dist: sglang[openai]; extra == "all"
302
302
  Requires-Dist: sglang[anthropic]; extra == "all"
303
303
  Requires-Dist: sglang[litellm]; extra == "all"
304
+ Requires-Dist: sglang[torch_memory_saver]; extra == "all"
304
305
  Provides-Extra: all-hip
305
306
  Requires-Dist: sglang[srt_hip]; extra == "all-hip"
306
307
  Requires-Dist: sglang[openai]; extra == "all-hip"
@@ -368,16 +369,16 @@ Dynamic: license-file
368
369
  - [2025/05] 🔥 Deploying DeepSeek with PD Disaggregation and Large-scale Expert Parallelism on 96 H100 GPUs ([blog](https://lmsys.org/blog/2025-05-05-large-scale-ep/)).
369
370
  - [2025/03] Supercharge DeepSeek-R1 Inference on AMD Instinct MI300X ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1-Part2/README.html))
370
371
  - [2025/03] SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine ([PyTorch blog](https://pytorch.org/blog/sglang-joins-pytorch/))
371
- - [2025/02] Unlock DeepSeek-R1 Inference Performance on AMD Instinct™ MI300X GPU ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1_Perf/README.html))
372
372
  - [2025/01] 🔥 SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeepSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html), [10+ other companies](https://x.com/lmsysorg/status/1887262321636221412))
373
373
  - [2024/12] 🔥 v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
374
- - [2024/09] v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
375
374
  - [2024/07] v0.2 Release: Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
376
375
 
377
376
  <details>
378
377
  <summary>More</summary>
379
378
 
379
+ - [2025/02] Unlock DeepSeek-R1 Inference Performance on AMD Instinct™ MI300X GPU ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1_Perf/README.html))
380
380
  - [2024/10] The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
381
+ - [2024/09] v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
381
382
  - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
382
383
  - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
383
384
  - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
@@ -409,7 +410,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
409
410
 
410
411
  ## Adoption and Sponsorship
411
412
  The project has been deployed to large-scale production, generating trillions of tokens every day.
412
- It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Google Cloud, Hyperbolic, Iflytek, Jam & Tea Studios, LinkedIn, LMSYS, Meituan, Nebius, Novita AI, NVIDIA, Oracle, RunPod, Stanford, UC Berkeley, UCLA, xAI, and 01.AI.
413
+ It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Google Cloud, Hyperbolic, Iflytek, InnoMatrix, Jam & Tea Studios, LinkedIn, LMSYS, Meituan, Nebius, Novita AI, NVIDIA, Oracle, RunPod, Stanford, UC Berkeley, UCLA, xAI, and 01.AI.
413
414
 
414
415
  <img src="https://raw.githubusercontent.com/sgl-project/sgl-learning-materials/main/slides/adoption.png" alt="logo" width="800" margin="10px"></img>
415
416