sglang 0.5.4.post1__py3-none-any.whl → 0.5.4.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150)
  1. sglang/bench_one_batch.py +149 -34
  2. sglang/bench_serving.py +18 -3
  3. sglang/compile_deep_gemm.py +13 -7
  4. sglang/srt/batch_invariant_ops/__init__.py +2 -0
  5. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +120 -0
  6. sglang/srt/checkpoint_engine/__init__.py +9 -0
  7. sglang/srt/checkpoint_engine/update.py +317 -0
  8. sglang/srt/configs/__init__.py +2 -0
  9. sglang/srt/configs/deepseek_ocr.py +542 -10
  10. sglang/srt/configs/deepseekvl2.py +95 -194
  11. sglang/srt/configs/kimi_linear.py +160 -0
  12. sglang/srt/configs/mamba_utils.py +66 -0
  13. sglang/srt/configs/model_config.py +25 -2
  14. sglang/srt/constants.py +7 -0
  15. sglang/srt/debug_utils/tensor_dump_forward_hook.py +149 -0
  16. sglang/srt/disaggregation/decode.py +34 -6
  17. sglang/srt/disaggregation/nixl/conn.py +2 -2
  18. sglang/srt/disaggregation/prefill.py +25 -3
  19. sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -1
  20. sglang/srt/distributed/parallel_state.py +9 -5
  21. sglang/srt/entrypoints/engine.py +13 -5
  22. sglang/srt/entrypoints/http_server.py +22 -3
  23. sglang/srt/entrypoints/openai/protocol.py +7 -1
  24. sglang/srt/entrypoints/openai/serving_chat.py +42 -0
  25. sglang/srt/entrypoints/openai/serving_completions.py +10 -0
  26. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  27. sglang/srt/environ.py +7 -0
  28. sglang/srt/eplb/expert_distribution.py +34 -1
  29. sglang/srt/eplb/expert_location.py +106 -36
  30. sglang/srt/grpc/compile_proto.py +3 -0
  31. sglang/srt/layers/attention/ascend_backend.py +233 -5
  32. sglang/srt/layers/attention/attention_registry.py +3 -0
  33. sglang/srt/layers/attention/fla/chunk_delta_h.py +61 -32
  34. sglang/srt/layers/attention/fla/fused_recurrent.py +17 -4
  35. sglang/srt/layers/attention/fla/kda.py +1359 -0
  36. sglang/srt/layers/attention/fla/layernorm_gated.py +7 -1
  37. sglang/srt/layers/attention/flashattention_backend.py +7 -6
  38. sglang/srt/layers/attention/flashinfer_mla_backend.py +3 -1
  39. sglang/srt/layers/attention/flashmla_backend.py +1 -1
  40. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +223 -0
  41. sglang/srt/layers/attention/mamba/mamba.py +20 -11
  42. sglang/srt/layers/attention/nsa/dequant_k_cache.py +138 -6
  43. sglang/srt/layers/attention/nsa/nsa_indexer.py +45 -22
  44. sglang/srt/layers/attention/nsa/quant_k_cache.py +44 -12
  45. sglang/srt/layers/attention/nsa/transform_index.py +1 -1
  46. sglang/srt/layers/attention/nsa_backend.py +157 -23
  47. sglang/srt/layers/attention/triton_backend.py +4 -1
  48. sglang/srt/layers/attention/trtllm_mha_backend.py +10 -4
  49. sglang/srt/layers/attention/trtllm_mla_backend.py +10 -2
  50. sglang/srt/layers/communicator.py +23 -1
  51. sglang/srt/layers/layernorm.py +16 -2
  52. sglang/srt/layers/logits_processor.py +4 -20
  53. sglang/srt/layers/moe/ep_moe/layer.py +0 -18
  54. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  55. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128]_down.json +164 -0
  56. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +68 -22
  57. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +43 -3
  58. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +106 -26
  59. sglang/srt/layers/moe/moe_runner/deep_gemm.py +53 -33
  60. sglang/srt/layers/moe/token_dispatcher/deepep.py +12 -9
  61. sglang/srt/layers/moe/topk.py +31 -6
  62. sglang/srt/layers/pooler.py +21 -2
  63. sglang/srt/layers/quantization/__init__.py +9 -78
  64. sglang/srt/layers/quantization/auto_round.py +394 -0
  65. sglang/srt/layers/quantization/fp8_kernel.py +1 -1
  66. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  67. sglang/srt/layers/quantization/modelopt_quant.py +168 -11
  68. sglang/srt/layers/rotary_embedding.py +117 -45
  69. sglang/srt/lora/lora_registry.py +9 -0
  70. sglang/srt/managers/async_mm_data_processor.py +122 -0
  71. sglang/srt/managers/data_parallel_controller.py +30 -3
  72. sglang/srt/managers/detokenizer_manager.py +3 -0
  73. sglang/srt/managers/io_struct.py +26 -4
  74. sglang/srt/managers/multi_tokenizer_mixin.py +5 -0
  75. sglang/srt/managers/schedule_batch.py +74 -15
  76. sglang/srt/managers/scheduler.py +164 -129
  77. sglang/srt/managers/scheduler_output_processor_mixin.py +40 -3
  78. sglang/srt/managers/scheduler_pp_mixin.py +7 -2
  79. sglang/srt/managers/scheduler_runtime_checker_mixin.py +45 -0
  80. sglang/srt/managers/scheduler_update_weights_mixin.py +18 -3
  81. sglang/srt/managers/session_controller.py +6 -5
  82. sglang/srt/managers/tokenizer_manager.py +154 -59
  83. sglang/srt/managers/tp_worker.py +24 -1
  84. sglang/srt/mem_cache/base_prefix_cache.py +23 -4
  85. sglang/srt/mem_cache/common.py +1 -0
  86. sglang/srt/mem_cache/memory_pool.py +171 -57
  87. sglang/srt/mem_cache/memory_pool_host.py +12 -5
  88. sglang/srt/mem_cache/radix_cache.py +4 -0
  89. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +1 -1
  90. sglang/srt/metrics/collector.py +46 -3
  91. sglang/srt/model_executor/cuda_graph_runner.py +15 -3
  92. sglang/srt/model_executor/forward_batch_info.py +11 -11
  93. sglang/srt/model_executor/model_runner.py +76 -21
  94. sglang/srt/model_executor/npu_graph_runner.py +7 -3
  95. sglang/srt/model_loader/weight_utils.py +1 -1
  96. sglang/srt/models/bailing_moe.py +9 -2
  97. sglang/srt/models/deepseek_nextn.py +11 -2
  98. sglang/srt/models/deepseek_v2.py +149 -34
  99. sglang/srt/models/glm4.py +391 -77
  100. sglang/srt/models/glm4v.py +196 -55
  101. sglang/srt/models/glm4v_moe.py +0 -1
  102. sglang/srt/models/gpt_oss.py +1 -10
  103. sglang/srt/models/kimi_linear.py +678 -0
  104. sglang/srt/models/llama4.py +1 -1
  105. sglang/srt/models/llama_eagle3.py +11 -1
  106. sglang/srt/models/longcat_flash.py +2 -2
  107. sglang/srt/models/minimax_m2.py +1 -1
  108. sglang/srt/models/qwen2.py +1 -1
  109. sglang/srt/models/qwen2_moe.py +30 -15
  110. sglang/srt/models/qwen3.py +1 -1
  111. sglang/srt/models/qwen3_moe.py +16 -8
  112. sglang/srt/models/qwen3_next.py +7 -0
  113. sglang/srt/multimodal/customized_mm_processor_utils.py +35 -0
  114. sglang/srt/multiplex/multiplexing_mixin.py +209 -0
  115. sglang/srt/multiplex/pdmux_context.py +164 -0
  116. sglang/srt/parser/conversation.py +7 -1
  117. sglang/srt/sampling/custom_logit_processor.py +67 -1
  118. sglang/srt/sampling/penaltylib/frequency_penalty.py +6 -8
  119. sglang/srt/sampling/penaltylib/min_new_tokens.py +7 -8
  120. sglang/srt/sampling/penaltylib/orchestrator.py +43 -3
  121. sglang/srt/sampling/penaltylib/presence_penalty.py +6 -8
  122. sglang/srt/server_args.py +103 -22
  123. sglang/srt/single_batch_overlap.py +4 -1
  124. sglang/srt/speculative/draft_utils.py +16 -0
  125. sglang/srt/speculative/eagle_info.py +42 -36
  126. sglang/srt/speculative/eagle_info_v2.py +68 -25
  127. sglang/srt/speculative/eagle_utils.py +261 -16
  128. sglang/srt/speculative/eagle_worker.py +11 -3
  129. sglang/srt/speculative/eagle_worker_v2.py +15 -9
  130. sglang/srt/speculative/spec_info.py +305 -31
  131. sglang/srt/speculative/spec_utils.py +44 -8
  132. sglang/srt/tracing/trace.py +121 -12
  133. sglang/srt/utils/common.py +55 -32
  134. sglang/srt/utils/hf_transformers_utils.py +38 -16
  135. sglang/srt/utils/torch_memory_saver_adapter.py +20 -0
  136. sglang/test/kits/radix_cache_server_kit.py +50 -0
  137. sglang/test/runners.py +31 -7
  138. sglang/test/simple_eval_common.py +5 -3
  139. sglang/test/simple_eval_humaneval.py +1 -0
  140. sglang/test/simple_eval_math.py +1 -0
  141. sglang/test/simple_eval_mmlu.py +1 -0
  142. sglang/test/simple_eval_mmmu_vlm.py +1 -0
  143. sglang/test/test_utils.py +7 -1
  144. sglang/version.py +1 -1
  145. {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/METADATA +10 -24
  146. {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/RECORD +150 -136
  147. /sglang/test/{kit_matched_stop.py → kits/matched_stop_kit.py} +0 -0
  148. {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/WHEEL +0 -0
  149. {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/licenses/LICENSE +0 -0
  150. {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/top_level.txt +0 -0
sglang/test/runners.py CHANGED
@@ -12,10 +12,11 @@
 # limitations under the License.
 # ==============================================================================
 
+import json
 import multiprocessing as mp
 import os
 from dataclasses import dataclass
-from typing import List, Optional, Tuple, Union
+from typing import Any, List, Optional, Tuple, Union
 
 import torch
 import torch.nn.functional as F
@@ -89,7 +90,9 @@ def get_token_ids_logprobs(logits, token_ids):
     return logprobs
 
 
-def _get_sentence_transformer_embedding_model(model_path, torch_dtype):
+def _get_sentence_transformer_embedding_model(
+    model_path, torch_dtype, matryoshka_dim: Optional[int] = None
+):
     from sentence_transformers import SentenceTransformer
     from sentence_transformers.util import is_sentence_transformer_model
 
@@ -97,6 +100,7 @@ def _get_sentence_transformer_embedding_model(model_path, torch_dtype):
         model = SentenceTransformer(
             model_path,
             model_kwargs={"torch_dtype": torch_dtype},
+            truncate_dim=matryoshka_dim,
         )
     else:  # if no pre-trained sentence-transformers model
         from sentence_transformers import models
@@ -106,7 +110,9 @@ def _get_sentence_transformer_embedding_model(model_path, torch_dtype):
             word_embedding_model.get_word_embedding_dimension(),
             pooling_mode="lasttoken",
         )
-        model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
+        model = SentenceTransformer(
+            modules=[word_embedding_model, pooling_model], truncate_dim=matryoshka_dim
+        )
 
     return model.cuda()
 
@@ -135,6 +141,7 @@ class HFRunner:
         output_str_only: bool = False,
         trust_remote_code: bool = False,
         patch_model_do_sample_false: bool = False,
+        matryoshka_dim: Optional[int] = None,
     ):
         self.model_type = model_type
         self.output_str_only = output_str_only
@@ -151,6 +158,7 @@ class HFRunner:
                 self.out_queue,
                 model_path,
                 torch_dtype,
+                matryoshka_dim,
             ),
         )
         self.model_proc.start()
@@ -225,7 +233,14 @@ class HFRunner:
             embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
             return embeddings.contiguous()
 
-    def start_model_process(self, in_queue, out_queue, model_path, torch_dtype):
+    def start_model_process(
+        self,
+        in_queue,
+        out_queue,
+        model_path,
+        torch_dtype,
+        matryoshka_dim: Optional[int] = None,
+    ):
         # Apply model-specific patches
         monkey_patch_gemma2_sdpa()
 
@@ -259,7 +274,7 @@ class HFRunner:
                 self.processor = AutoProcessor.from_pretrained(model_path)
             else:
                 self.model = _get_sentence_transformer_embedding_model(
-                    model_path, torch_dtype
+                    model_path, torch_dtype, matryoshka_dim=matryoshka_dim
                 )
         elif self.model_type == "reward" or self.model_type == "cross_encoder":
             from transformers import AutoModelForSequenceClassification
@@ -496,7 +511,7 @@ class SRTRunner:
         attention_backend: Optional[str] = None,
         prefill_attention_backend: Optional[str] = None,
         decode_attention_backend: Optional[str] = None,
-        lora_backend: str = "triton",
+        lora_backend: str = "csgmv",
         disable_cuda_graph: bool = False,
         disable_radix_cache: bool = False,
         chunked_prefill_size: Optional[int] = None,
@@ -519,6 +534,7 @@ class SRTRunner:
         lora_target_modules: Optional[List[str]] = None,
         enable_lora: Optional[bool] = None,
         max_loaded_loras: Optional[int] = None,
+        json_model_override_args: Optional[dict[str, Any]] = None,
         lora_eviction_policy: str = "lru",
     ):
         self.model_type = model_type
@@ -566,6 +582,11 @@ class SRTRunner:
             lora_target_modules=lora_target_modules,
             enable_lora=enable_lora,
             max_loaded_loras=max_loaded_loras,
+            json_model_override_args=(
+                json.dumps(json_model_override_args)
+                if json_model_override_args
+                else "{}"
+            ),
             lora_eviction_policy=lora_eviction_policy,
             **spec_kwargs,
         )
@@ -594,6 +615,7 @@ class SRTRunner:
         logprob_start_len: int = 0,
         top_k: Optional[int] = None,
         token_ids_logprob: Optional[List[int]] = None,
+        dimensions: Optional[int] = None,
     ):
         if self.is_generation:
             return self.forward_generation_raw(
@@ -607,7 +629,9 @@ class SRTRunner:
             )
         else:
             if self.model_type == "embedding":
-                response = self.engine.encode(prompt=prompts, image_data=image_data)
+                response = self.engine.encode(
+                    prompt=prompts, image_data=image_data, dimensions=dimensions
+                )
                 if isinstance(response, list):
                     logits = [x["embedding"] for x in response]
                 else:
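Note on the embedding changes above: HFRunner threads `matryoshka_dim` through to sentence-transformers' `truncate_dim`, while SRTRunner forwards a per-request `dimensions` argument to `engine.encode`, so tests can compare truncated (Matryoshka) embeddings from both runners. A minimal sketch of the sentence-transformers side; the model name is a placeholder for any Matryoshka-trained embedding model:

```python
from sentence_transformers import SentenceTransformer

# truncate_dim keeps only the first N components of every embedding,
# which Matryoshka-trained models are designed to tolerate.
model = SentenceTransformer("my-org/matryoshka-embed", truncate_dim=256)

embeddings = model.encode(["hello world"])
assert embeddings.shape[-1] == 256  # truncated from the model's full width

# The SRT side of such a test would request the same truncation per call,
# roughly: SRTRunner(..., model_type="embedding").forward(prompts, dimensions=256)
```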
sglang/test/simple_eval_common.py CHANGED
@@ -148,7 +148,7 @@ class ChatCompletionSampler(SamplerBase):
                     reasoning_effort=self.reasoning_effort,
                     extra_body=self.extra_body,
                 )
-                return response.choices[0].message.content
+                return response.choices[0].message.content or ""
             # NOTE: BadRequestError is triggered once for MMMU, please uncomment if you are rerunning MMMU
             except openai.BadRequestError as e:
                 print("Bad Request Error", e)
@@ -161,7 +161,9 @@ class ChatCompletionSampler(SamplerBase):
                 )
                 time.sleep(exception_backoff)
                 trial += 1
-        # unknown error shall throw exception
+        # If all retries are exhausted, return empty string instead of None
+        print(f"All retry attempts exhausted for request. Returning empty response.")
+        return ""
 
 
 QUERY_TEMPLATE_MULTICHOICE = """
@@ -261,7 +263,7 @@ def format_multichoice_question(row):
 def check_equality(sampler: SamplerBase, expr1: str, expr2: str):
     prompt = EQUALITY_TEMPLATE % {"expression1": expr1, "expression2": expr2}
     response = sampler([dict(content=prompt, role="user")])
-    return response.lower().strip() == "yes"
+    return (response or "").lower().strip() == "yes"
 
 
 def _compute_stat(values: list, stat: str):
sglang/test/simple_eval_humaneval.py CHANGED
@@ -80,6 +80,7 @@ class HumanEval(Eval):
         instruction = "Read the following function signature and docstring, and fully implement the function described. Your response should only contain the code for this function.\n"
 
         def find_code(completion):
+            completion = completion or ""
             pattern = re.compile(r"```python\n(.*?)```", re.DOTALL)
             matches = pattern.findall(completion)
             extracted_answer = matches[0] if len(matches) >= 1 else completion
sglang/test/simple_eval_math.py CHANGED
@@ -54,6 +54,7 @@ class MathEval(Eval):
                 sampler._pack_message(content=QUERY_TEMPLATE.format(**row), role="user")
             ]
             response_text = sampler(prompt_messages)
+            response_text = response_text or ""
             match = re.search(ANSWER_PATTERN, response_text)
             extracted_answer = match.group(1) if match else None
             score = float(
sglang/test/simple_eval_mmlu.py CHANGED
@@ -101,6 +101,7 @@ class MMLUEval(Eval):
                 )
             ]
             response_text = sampler(prompt_messages)
+            response_text = response_text or ""
             match = re.search(ANSWER_PATTERN_MULTICHOICE, response_text)
             extracted_answer = match.group(1) if match else None
             score = 1.0 if extracted_answer == row["Answer"] else 0.0
sglang/test/simple_eval_mmmu_vlm.py CHANGED
@@ -204,6 +204,7 @@ class MMMUVLMEval(Eval):
 
         # Sample
         response_text = sampler(prompt_messages)
+        response_text = response_text or ""
 
         # Parse and score
         gold = sample["answer"]
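The five simple_eval changes above all harden the harness against `None`: with the openai 2.x client, `message.content` is optional and can be `None` (for example on refusals), and the sampler now returns `""` after exhausting retries instead of falling off the end of the loop. A minimal sketch of the shared coalescing pattern; the pattern string here is illustrative, not the eval's actual one:

```python
import re
from typing import Optional

ANSWER_PATTERN = r"Answer\s*:\s*([A-D])"  # illustrative, not the eval's real pattern

def extract_answer(response_text: Optional[str]) -> Optional[str]:
    # Coalesce None to "" so regex and string ops never raise TypeError.
    response_text = response_text or ""
    match = re.search(ANSWER_PATTERN, response_text)
    return match.group(1) if match else None

assert extract_answer(None) is None   # re.search(..., None) would raise TypeError
assert extract_answer("Answer: B") == "B"
```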
sglang/test/test_utils.py CHANGED
@@ -84,6 +84,8 @@ DEFAULT_MODEL_NAME_FOR_TEST_AWQ_INT4 = (
 DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf"
 DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST = "lmsys/sglang-EAGLE-llama2-chat-7B"
 DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST_EAGLE3 = "meta-llama/Llama-3.1-8B-Instruct"
+DEFAULT_EAGLE_DP_ATTENTION_TARGET_MODEL_FOR_TEST = "Qwen/Qwen3-30B-A3B"
+DEFAULT_EAGLE_DP_ATTENTION_DRAFT_MODEL_FOR_TEST = "Tengyunw/qwen3_30b_moe_eagle3"
 DEFAULT_MODEL_NAME_FOR_TEST_EAGLE3 = "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B"
 DEFAULT_STANDALONE_SPECULATIVE_TARGET_MODEL_FOR_TEST = (
     "meta-llama/Llama-3.1-8B-Instruct"
@@ -92,6 +94,10 @@ DEFAULT_STANDALONE_SPECULATIVE_DRAFT_MODEL_FOR_TEST = "meta-llama/Llama-3.2-1B-I
 DEFAULT_NGRAM_SPECULATIVE_TARGET_MODEL_FOR_TEST = "Qwen/Qwen2.5-Coder-7B-Instruct"
 
 # Other use cases
+DEFAULT_AUTOROUND_MODEL_NAME_FOR_TEST = (
+    "OPEA/Qwen2.5-0.5B-Instruct-int4-sym-inc",  # auto_round:auto_gptq
+    "Intel/Qwen2-0.5B-Instruct-int4-sym-AutoRound",  # auto_round:auto_awq
+)
 DEFAULT_MODEL_NAME_FOR_TEST_LOCAL_ATTENTION = (
     "meta-llama/Llama-4-Scout-17B-16E-Instruct"
 )
@@ -145,7 +151,7 @@ def _use_cached_default_models(model_repo: str):
 
 if is_in_ci():
     DEFAULT_PORT_FOR_SRT_TEST_RUNNER = (
-        10000 + int(os.environ.get("CUDA_VISIBLE_DEVICES", "0")[0]) * 1000
+        10000 + int(os.environ.get("CUDA_VISIBLE_DEVICES", "0")[0]) * 2000
     )
 else:
     DEFAULT_PORT_FOR_SRT_TEST_RUNNER = (
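The last hunk widens the CI port spacing per GPU group from 1000 to 2000, presumably to reduce port collisions between test suites running side by side. The expression keys off the first character of CUDA_VISIBLE_DEVICES; a concrete evaluation:

```python
import os

# Same expression as the updated test_utils.py, evaluated for one example value.
os.environ["CUDA_VISIBLE_DEVICES"] = "4,5"
port = 10000 + int(os.environ.get("CUDA_VISIBLE_DEVICES", "0")[0]) * 2000
print(port)  # 18000 (the previous 1000 spacing yielded 14000)
```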
sglang/version.py CHANGED
@@ -1 +1 @@
-__version__ = "0.5.4.post1"
+__version__ = "0.5.4.post2"
{sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.5.4.post1
+Version: 0.5.4.post2
 Summary: SGLang is a fast serving framework for large language models and vision language models.
 License: Apache License
                                Version 2.0, January 2004
@@ -234,7 +234,7 @@ Requires-Dist: ninja
 Requires-Dist: numpy
 Requires-Dist: nvidia-cutlass-dsl==4.2.1
 Requires-Dist: openai-harmony==0.0.4
-Requires-Dist: openai==1.99.1
+Requires-Dist: openai==2.6.1
 Requires-Dist: orjson
 Requires-Dist: outlines==0.1.11
 Requires-Dist: packaging
@@ -256,11 +256,11 @@ Requires-Dist: sgl-kernel==0.3.16.post4
 Requires-Dist: soundfile==0.13.1
 Requires-Dist: tiktoken
 Requires-Dist: timm==1.0.16
-Requires-Dist: torch==2.8.0
 Requires-Dist: torch_memory_saver==0.0.9
-Requires-Dist: torchao==0.9.0
+Requires-Dist: torch==2.8.0
 Requires-Dist: torchaudio==2.8.0
 Requires-Dist: torchvision
+Requires-Dist: torchao==0.9.0
 Requires-Dist: tqdm
 Requires-Dist: transformers==4.57.1
 Requires-Dist: uvicorn
@@ -270,8 +270,8 @@ Requires-Dist: grpcio==1.75.1
 Requires-Dist: grpcio-tools==1.75.1
 Requires-Dist: grpcio-reflection==1.75.1
 Requires-Dist: grpcio-health-checking==1.75.1
-Provides-Extra: modelopt
-Requires-Dist: nvidia-modelopt; extra == "modelopt"
+Provides-Extra: checkpoint-engine
+Requires-Dist: checkpoint-engine==0.1.2; extra == "checkpoint-engine"
 Provides-Extra: test
 Requires-Dist: accelerate; extra == "test"
 Requires-Dist: expecttest; extra == "test"
@@ -282,28 +282,13 @@ Requires-Dist: peft; extra == "test"
 Requires-Dist: pytest; extra == "test"
 Requires-Dist: sentence_transformers; extra == "test"
 Requires-Dist: tabulate; extra == "test"
-Provides-Extra: checkpoint-engine
-Requires-Dist: checkpoint-engine==0.1.2; extra == "checkpoint-engine"
-Provides-Extra: all
 Provides-Extra: dev
 Requires-Dist: sglang[test]; extra == "dev"
-Provides-Extra: cu130
-Requires-Dist: torch==2.9.0; extra == "cu130"
-Requires-Dist: torchaudio==2.9.0; extra == "cu130"
-Requires-Dist: torchvision==0.24.0; extra == "cu130"
-Provides-Extra: cu130-all
-Requires-Dist: sglang[test]; extra == "cu130-all"
-Requires-Dist: sglang[decord]; extra == "cu130-all"
-Requires-Dist: sglang[cu130]; extra == "cu130-all"
 Provides-Extra: tracing
 Requires-Dist: opentelemetry-api; extra == "tracing"
 Requires-Dist: opentelemetry-exporter-otlp; extra == "tracing"
 Requires-Dist: opentelemetry-exporter-otlp-proto-grpc; extra == "tracing"
 Requires-Dist: opentelemetry-sdk; extra == "tracing"
-Provides-Extra: blackwell
-Requires-Dist: sglang[dev]; extra == "blackwell"
-Provides-Extra: blackwell-aarch64
-Requires-Dist: sglang[dev]; extra == "blackwell-aarch64"
 Dynamic: license-file
 
 <div align="center" id="sglangtop">
@@ -328,14 +313,14 @@ Dynamic: license-file
 | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 
 ## News
-- [2025/10] 🔥 AMD AI Dev Day 2025 SGLang ([slide](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/sglang_amd_ai_devday_2025.pdf)), PyTorch Conference 2025 SGLang ([slide](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/sglang_pytorch_2025.pdf)).
+- [2025/10] 🔥 SGLang now runs natively on TPU with the SGLang-Jax backend ([blog](https://lmsys.org/blog/2025-10-29-sglang-jax/)).
+- [2025/10] AMD AI Dev Day 2025 SGLang ([slide](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/sglang_amd_ai_devday_2025.pdf)), PyTorch Conference 2025 SGLang ([slide](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/sglang_pytorch_2025.pdf)).
 - [2025/09] 🔥 Deploying DeepSeek on GB200 NVL72 with PD and Large Scale EP (Part II): 3.8x Prefill, 4.8x Decode Throughput ([blog](https://lmsys.org/blog/2025-09-25-gb200-part-2/)).
 - [2025/09] SGLang Day 0 Support for DeepSeek-V3.2 with Sparse Attention ([blog](https://lmsys.org/blog/2025-09-29-deepseek-V32/)).
 - [2025/08] SGLang x AMD SF Meetup on 8/22: Hands-on GPU workshop, tech talks by AMD/xAI/SGLang, and networking ([Roadmap](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_roadmap.pdf), [Large-scale EP](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_ep.pdf), [Highlights](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_highlights.pdf), [AITER/MoRI](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_aiter_mori.pdf), [Wave](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_wave.pdf)).
 - [2025/08] SGLang provides day-0 support for OpenAI gpt-oss model ([instructions](https://github.com/sgl-project/sglang/issues/8833))
 - [2025/05] Deploying DeepSeek with PD Disaggregation and Large-scale Expert Parallelism on 96 H100 GPUs ([blog](https://lmsys.org/blog/2025-05-05-large-scale-ep/)).
 - [2025/03] SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine ([PyTorch blog](https://pytorch.org/blog/sglang-joins-pytorch/))
-- [2024/12] v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
 
 <details>
 <summary>More</summary>
@@ -345,6 +330,7 @@ Dynamic: license-file
 - [2025/03] Supercharge DeepSeek-R1 Inference on AMD Instinct MI300X ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1-Part2/README.html))
 - [2025/02] Unlock DeepSeek-R1 Inference Performance on AMD Instinct™ MI300X GPU ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1_Perf/README.html))
 - [2025/01] SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeepSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html), [10+ other companies](https://x.com/lmsysorg/status/1887262321636221412))
+- [2024/12] v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
 - [2024/10] The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
 - [2024/09] v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
 - [2024/07] v0.2 Release: Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
@@ -385,7 +371,7 @@ SGLang is currently hosted under the non-profit open-source organization [LMSYS]
 <img src="https://raw.githubusercontent.com/sgl-project/sgl-learning-materials/refs/heads/main/slides/adoption.png" alt="logo" width="800" margin="10px"></img>
 
 ## Contact Us
-For enterprises interested in adopting or deploying SGLang at scale, including technical consulting, sponsorship opportunities, or partnership inquiries, please contact us at contact@sglang.ai.
+For enterprises interested in adopting or deploying SGLang at scale, including technical consulting, sponsorship opportunities, or partnership inquiries, please contact us at sglang@lmsys.org
 
 ## Acknowledgment
 We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).