sglang 0.4.5.post3__py3-none-any.whl → 0.4.6.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. sglang/bench_one_batch.py +19 -3
  2. sglang/bench_serving.py +8 -9
  3. sglang/compile_deep_gemm.py +45 -4
  4. sglang/srt/code_completion_parser.py +1 -1
  5. sglang/srt/configs/deepseekvl2.py +1 -1
  6. sglang/srt/configs/model_config.py +9 -3
  7. sglang/srt/constrained/llguidance_backend.py +78 -61
  8. sglang/srt/conversation.py +34 -1
  9. sglang/srt/disaggregation/decode.py +67 -13
  10. sglang/srt/disaggregation/fake/__init__.py +1 -0
  11. sglang/srt/disaggregation/fake/conn.py +88 -0
  12. sglang/srt/disaggregation/mini_lb.py +45 -8
  13. sglang/srt/disaggregation/mooncake/conn.py +198 -31
  14. sglang/srt/disaggregation/prefill.py +36 -12
  15. sglang/srt/disaggregation/utils.py +16 -2
  16. sglang/srt/entrypoints/engine.py +9 -0
  17. sglang/srt/entrypoints/http_server.py +35 -4
  18. sglang/srt/function_call_parser.py +77 -5
  19. sglang/srt/layers/attention/base_attn_backend.py +3 -0
  20. sglang/srt/layers/attention/cutlass_mla_backend.py +278 -0
  21. sglang/srt/layers/attention/flashattention_backend.py +28 -10
  22. sglang/srt/layers/attention/flashmla_backend.py +8 -11
  23. sglang/srt/layers/attention/utils.py +1 -1
  24. sglang/srt/layers/attention/vision.py +2 -0
  25. sglang/srt/layers/layernorm.py +38 -16
  26. sglang/srt/layers/logits_processor.py +2 -2
  27. sglang/srt/layers/moe/fused_moe_native.py +2 -4
  28. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  29. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  30. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  31. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  32. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  33. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  34. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  35. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  36. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  37. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  38. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  39. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  40. sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +41 -41
  41. sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  42. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +20 -17
  43. sglang/srt/layers/moe/fused_moe_triton/layer.py +15 -17
  44. sglang/srt/layers/pooler.py +6 -0
  45. sglang/srt/layers/quantization/awq.py +5 -1
  46. sglang/srt/layers/quantization/deep_gemm.py +17 -10
  47. sglang/srt/layers/quantization/fp8.py +20 -22
  48. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  49. sglang/srt/layers/quantization/int8_kernel.py +32 -1
  50. sglang/srt/layers/radix_attention.py +13 -3
  51. sglang/srt/layers/rotary_embedding.py +170 -126
  52. sglang/srt/managers/data_parallel_controller.py +10 -3
  53. sglang/srt/managers/io_struct.py +7 -0
  54. sglang/srt/managers/mm_utils.py +85 -28
  55. sglang/srt/managers/multimodal_processors/base_processor.py +14 -1
  56. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +9 -2
  57. sglang/srt/managers/multimodal_processors/gemma3.py +2 -5
  58. sglang/srt/managers/multimodal_processors/janus_pro.py +2 -2
  59. sglang/srt/managers/multimodal_processors/minicpm.py +4 -3
  60. sglang/srt/managers/multimodal_processors/qwen_vl.py +38 -13
  61. sglang/srt/managers/schedule_batch.py +38 -12
  62. sglang/srt/managers/scheduler.py +41 -28
  63. sglang/srt/managers/scheduler_output_processor_mixin.py +25 -9
  64. sglang/srt/managers/tokenizer_manager.py +5 -1
  65. sglang/srt/managers/tp_worker.py +3 -3
  66. sglang/srt/managers/tp_worker_overlap_thread.py +9 -4
  67. sglang/srt/mem_cache/memory_pool.py +87 -0
  68. sglang/srt/model_executor/cuda_graph_runner.py +4 -3
  69. sglang/srt/model_executor/forward_batch_info.py +51 -95
  70. sglang/srt/model_executor/model_runner.py +19 -25
  71. sglang/srt/models/deepseek.py +12 -2
  72. sglang/srt/models/deepseek_nextn.py +101 -6
  73. sglang/srt/models/deepseek_v2.py +144 -70
  74. sglang/srt/models/deepseek_vl2.py +9 -4
  75. sglang/srt/models/gemma3_causal.py +1 -1
  76. sglang/srt/models/llama4.py +0 -1
  77. sglang/srt/models/minicpmo.py +5 -1
  78. sglang/srt/models/mllama4.py +2 -2
  79. sglang/srt/models/qwen2_5_vl.py +3 -6
  80. sglang/srt/models/qwen2_vl.py +3 -7
  81. sglang/srt/models/roberta.py +178 -0
  82. sglang/srt/openai_api/adapter.py +50 -11
  83. sglang/srt/openai_api/protocol.py +2 -0
  84. sglang/srt/reasoning_parser.py +25 -1
  85. sglang/srt/server_args.py +31 -24
  86. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -3
  87. sglang/srt/torch_memory_saver_adapter.py +10 -1
  88. sglang/srt/utils.py +5 -1
  89. sglang/test/runners.py +6 -13
  90. sglang/test/send_one.py +84 -28
  91. sglang/test/test_utils.py +74 -18
  92. sglang/version.py +1 -1
  93. {sglang-0.4.5.post3.dist-info → sglang-0.4.6.post1.dist-info}/METADATA +5 -6
  94. {sglang-0.4.5.post3.dist-info → sglang-0.4.6.post1.dist-info}/RECORD +97 -80
  95. {sglang-0.4.5.post3.dist-info → sglang-0.4.6.post1.dist-info}/WHEEL +1 -1
  96. {sglang-0.4.5.post3.dist-info → sglang-0.4.6.post1.dist-info}/licenses/LICENSE +0 -0
  97. {sglang-0.4.5.post3.dist-info → sglang-0.4.6.post1.dist-info}/top_level.txt +0 -0
sglang/test/send_one.py CHANGED
@@ -6,11 +6,56 @@ python3 -m sglang.test.send_one
 """
 
 import argparse
+import dataclasses
 import json
 
 import requests
 
 
+@dataclasses.dataclass
+class BenchArgs:
+    host: str = "localhost"
+    port: int = 30000
+    batch_size: int = 1
+    temperature: float = 0.0
+    max_new_tokens: int = 512
+    frequency_penalty: float = 0.0
+    presence_penalty: float = 0.0
+    json: bool = False
+    return_logprob: bool = False
+    prompt: str = (
+        "Human: Give me a fully functional FastAPI server. Show the python code.\n\nAssistant:"
+    )
+    image: bool = False
+    stream: bool = False
+
+    @staticmethod
+    def add_cli_args(parser: argparse.ArgumentParser):
+        parser.add_argument("--host", type=str, default=BenchArgs.host)
+        parser.add_argument("--port", type=int, default=BenchArgs.port)
+        parser.add_argument("--batch-size", type=int, default=BenchArgs.batch_size)
+        parser.add_argument("--temperature", type=float, default=BenchArgs.temperature)
+        parser.add_argument(
+            "--max-new-tokens", type=int, default=BenchArgs.max_new_tokens
+        )
+        parser.add_argument(
+            "--frequency-penalty", type=float, default=BenchArgs.frequency_penalty
+        )
+        parser.add_argument(
+            "--presence-penalty", type=float, default=BenchArgs.presence_penalty
+        )
+        parser.add_argument("--json", action="store_true")
+        parser.add_argument("--return-logprob", action="store_true")
+        parser.add_argument("--prompt", type=str, default=BenchArgs.prompt)
+        parser.add_argument("--image", action="store_true")
+        parser.add_argument("--stream", action="store_true")
+
+    @classmethod
+    def from_cli_args(cls, args: argparse.Namespace):
+        attrs = [attr.name for attr in dataclasses.fields(cls)]
+        return cls(**{attr: getattr(args, attr) for attr in attrs})
+
+
 def send_one_prompt(args):
     if args.image:
         args.prompt = (
@@ -20,20 +65,42 @@ def send_one_prompt(args):
     else:
         image_data = None
 
-    response = requests.post(
-        "http://localhost:30000/generate",
-        json={
-            "text": args.prompt,
-            "image_data": image_data,
-            "sampling_params": {
-                "temperature": args.temperature,
-                "max_new_tokens": args.max_new_tokens,
-                "frequency_penalty": args.frequency_penalty,
-                "presence_penalty": args.presence_penalty,
-            },
-            "return_logprob": args.return_logprob,
-            "stream": args.stream,
+    prompt = args.prompt
+
+    if args.json:
+        prompt = (
+            "Human: What is the capital of France and how is that city like. "
+            "Give me 3 trivial information about that city. "
+            "Write in a format of json.\nAssistant:"
+        )
+        json_schema = "$$ANY$$"
+        json_schema = (
+            '{"type": "object", "properties": {"population": {"type": "integer"}}}'
+        )
+    else:
+        json_schema = None
+
+    if args.batch_size > 1:
+        prompt = [prompt] * args.batch_size
+
+    json_data = {
+        "text": prompt,
+        "image_data": image_data,
+        "sampling_params": {
+            "temperature": args.temperature,
+            "max_new_tokens": args.max_new_tokens,
+            "frequency_penalty": args.frequency_penalty,
+            "presence_penalty": args.presence_penalty,
+            "json_schema": json_schema,
+            "stop": ["Question", "Assistant:", "<|separator|>", "<|eos|>"],
         },
+        "return_logprob": args.return_logprob,
+        "stream": args.stream,
+    }
+
+    response = requests.post(
+        f"http://{args.host}:{args.port}/generate",
+        json=json_data,
         stream=args.stream,
     )
 
@@ -47,6 +114,9 @@ def send_one_prompt(args):
     else:
        ret = response.json()
 
+    if args.batch_size > 1:
+        ret = ret[0]
+
     latency = ret["meta_info"]["e2e_latency"]
 
     if "spec_verify_ct" in ret["meta_info"]:
@@ -68,21 +138,7 @@ def send_one_prompt(args):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("--temperature", type=float, default=0.0)
-    parser.add_argument("--max-new-tokens", type=int, default=512)
-    parser.add_argument("--frequency-penalty", type=float, default=0.0)
-    parser.add_argument("--presence-penalty", type=float, default=0.0)
-    parser.add_argument("--return-logprob", action="store_true")
-    parser.add_argument(
-        "--prompt",
-        type=str,
-        default="Human: Give me a fully functional FastAPI server. Show the python code.\n\nAssistant:",
-    )
-    parser.add_argument(
-        "--image",
-        action="store_true",
-    )
-    parser.add_argument("--stream", action="store_true")
+    BenchArgs.add_cli_args(parser)
     args = parser.parse_args()
 
     send_one_prompt(args)
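
The refactor above moves every CLI flag onto the BenchArgs dataclass, so the script can also be driven as a library. A minimal sketch, assuming an SGLang server is already listening on the default localhost:30000 (nothing below is part of the diff itself):

```python
# Hedged sketch: construct BenchArgs directly instead of going through argparse.
# The field names mirror the dataclass added in this diff; a running server is assumed.
from sglang.test.send_one import BenchArgs, send_one_prompt

# Equivalent to: python3 -m sglang.test.send_one --batch-size 4 --json
args = BenchArgs(batch_size=4, json=True)

# Posts to http://{args.host}:{args.port}/generate with the json_schema constraint
# and stop strings shown above, then reads e2e_latency from the response meta_info.
send_one_prompt(args)
```

Since from_cli_args iterates dataclasses.fields(cls), a new option only has to be declared once on the dataclass and registered in add_cli_args.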
sglang/test/test_utils.py CHANGED
@@ -8,7 +8,6 @@ import random
 import subprocess
 import threading
 import time
-import traceback
 import unittest
 from concurrent.futures import ThreadPoolExecutor
 from dataclasses import dataclass
@@ -34,27 +33,44 @@ from sglang.srt.utils import (
 from sglang.test.run_eval import run_eval
 from sglang.utils import get_exception_traceback
 
-DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"
-DEFAULT_FP8_MODEL_NAME_FOR_ACCURACY_TEST = "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
-DEFAULT_FP8_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST = (
-    "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8-dynamic"
-)
-DEFAULT_FP8_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST = (
-    "nvidia/Llama-3.1-8B-Instruct-FP8"
-)
-
+# General test models
 DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
 DEFAULT_SMALL_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
 DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
 DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST = "Qwen/Qwen1.5-MoE-A2.7B"
-DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
+
+# MLA test models
 DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
 DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
+DEFAULT_MODEL_NAME_FOR_TEST_MLA = "lmsys/sglang-ci-dsv3-test"
+DEFAULT_MODEL_NAME_FOR_TEST_MLA_NEXTN = "lmsys/sglang-ci-dsv3-test-NextN"
+
+# FP8 models
+DEFAULT_MODEL_NAME_FOR_TEST_FP8 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"
+DEFAULT_MODEL_NAME_FOR_ACCURACY_TEST_FP8 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"
+DEFAULT_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST_FP8 = (
+    "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8-dynamic"
+)
+DEFAULT_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST_FP8 = (
+    "nvidia/Llama-3.1-8B-Instruct-FP8"
+)
+
+# EAGLE
+DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf"
+DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST = "lmsys/sglang-EAGLE-llama2-chat-7B"
+DEFAULT_MODEL_NAME_FOR_TEST_EAGLE3 = "jamesliu1/sglang-EAGLE3-Llama-3.1-Instruct-8B"
+
+# Other use cases
+DEFAULT_MODEL_NAME_FOR_TEST_LOCAL_ATTENTION = (
+    "meta-llama/Llama-4-Scout-17B-16E-Instruct"
+)
+DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
 DEFAULT_REASONING_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
 DEFAULT_AWQ_MOE_MODEL_NAME_FOR_TEST = (
     "hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
 )
-DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 1000
+
+# Nightly tests
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8"
@@ -63,12 +79,11 @@ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8
 DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN = "Qwen/Qwen2.5-1.5B-Instruct"
 DEFAULT_SMALL_VLM_MODEL_NAME = "Qwen/Qwen2-VL-2B"
 
-DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf"
-DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST = "lmsys/sglang-EAGLE-llama2-chat-7B"
-
 DEFAULT_IMAGE_URL = "https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true"
 DEFAULT_VIDEO_URL = "https://raw.githubusercontent.com/EvolvingLMMs-Lab/sglang/dev/onevision_local/assets/jobs.mp4"
 
+DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 1000
+
 
 def is_in_ci():
     """Return whether it is in CI runner."""
@@ -494,7 +509,7 @@ def run_unittest_files(files: List[TestFile], timeout_per_file: float):
     tic = time.time()
     success = True
 
-    for file in files:
+    for i, file in enumerate(files):
         filename, estimated_time = file.name, file.estimated_time
         process = None
 
@@ -502,7 +517,10 @@
         nonlocal process
 
         filename = os.path.join(os.getcwd(), filename)
-        print(f".\n.\nBegin:\npython3 {filename}\n.\n.\n", flush=True)
+        print(
+            f".\n.\nBegin ({i}/{len(files) - 1}):\npython3 {filename}\n.\n.\n",
+            flush=True,
+        )
         tic = time.time()
 
         process = subprocess.Popen(
@@ -512,7 +530,7 @@
         elapsed = time.time() - tic
 
         print(
-            f".\n.\nEnd:\n{filename=}, {elapsed=:.0f}, {estimated_time=}\n.\n.\n",
+            f".\n.\nEnd ({i}/{len(files) - 1}):\n{filename=}, {elapsed=:.0f}, {estimated_time=}\n.\n.\n",
            flush=True,
        )
        return process.returncode
@@ -714,6 +732,44 @@ def run_bench_one_batch(model, other_args):
     return output_throughput
 
 
+def run_bench_offline_throughput(model, other_args):
+    command = [
+        "python3",
+        "-m",
+        "sglang.bench_offline_throughput",
+        "--num-prompts",
+        "1",
+        "--dataset-name",
+        "random",
+        "--random-input-len",
+        "256",
+        "--random-output-len",
+        "256",
+        "--model-path",
+        model,
+        *[str(x) for x in other_args],
+    ]
+
+    print(f"{command=}")
+    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+    try:
+        stdout, stderr = process.communicate()
+        output = stdout.decode()
+        error = stderr.decode()
+        print(f"Output: {output}", flush=True)
+        print(f"Error: {error}", flush=True)
+
+        output_throughput = -1
+        for line in output.split("\n"):
+            if "Last generation throughput (tok/s):" in line:
+                output_throughput = float(line.split(":")[-1])
+    finally:
+        kill_process_tree(process.pid)
+
+    return output_throughput
+
+
 def lcs(X, Y):
     m = len(X)
     n = len(Y)
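
The new run_bench_offline_throughput helper mirrors the existing run_bench_one_batch: it shells out to sglang.bench_offline_throughput, echoes stdout/stderr, and scrapes the "Last generation throughput (tok/s):" line, returning -1 if that line never appears. A hedged usage sketch follows; the empty other_args list is illustrative and a working local install with a GPU is assumed:

```python
# Hedged sketch (not part of the diff): call the new benchmark helper against the
# small default test model and check that a throughput figure was parsed from stdout.
from sglang.test.test_utils import (
    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
    run_bench_offline_throughput,
)

throughput = run_bench_offline_throughput(
    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,  # "meta-llama/Llama-3.2-1B-Instruct"
    other_args=[],  # any extra CLI flags are forwarded verbatim to the benchmark command
)
assert throughput > 0, "no 'Last generation throughput (tok/s):' line in benchmark output"
```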
sglang/version.py CHANGED
@@ -1 +1 @@
-__version__ = "0.4.5.post3"
+__version__ = "0.4.6.post1"
{sglang-0.4.5.post3.dist-info → sglang-0.4.6.post1.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.4.5.post3
+Version: 0.4.6.post1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -225,7 +225,7 @@ Requires-Dist: fastapi; extra == "runtime-common"
 Requires-Dist: hf_transfer; extra == "runtime-common"
 Requires-Dist: huggingface_hub; extra == "runtime-common"
 Requires-Dist: interegular; extra == "runtime-common"
-Requires-Dist: llguidance>=0.6.15; extra == "runtime-common"
+Requires-Dist: llguidance<0.8.0,>=0.7.11; extra == "runtime-common"
 Requires-Dist: modelscope; extra == "runtime-common"
 Requires-Dist: ninja; extra == "runtime-common"
 Requires-Dist: orjson; extra == "runtime-common"
@@ -242,11 +242,10 @@ Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
 Requires-Dist: transformers==4.51.1; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist: compressed-tensors; extra == "runtime-common"
 Requires-Dist: xgrammar==0.1.17; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.0.9.post2; extra == "srt"
+Requires-Dist: sgl-kernel==0.1.0; extra == "srt"
 Requires-Dist: flashinfer_python==0.2.3; extra == "srt"
 Requires-Dist: torch==2.6.0; extra == "srt"
 Requires-Dist: torchvision==0.21.0; extra == "srt"
@@ -409,5 +408,5 @@ It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor
 
 For enterprises interested in adopting or deploying SGLang at scale, including technical consulting, sponsorship opportunities, or partnership inquiries, please contact us at contact@sglang.ai.
 
-## Acknowledgment and Citation
-We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql). Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
+## Acknowledgment
+ We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).