sglang 0.1.21__py3-none-any.whl → 0.1.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. sglang/__init__.py +8 -8
  2. sglang/api.py +1 -1
  3. sglang/backend/vertexai.py +5 -4
  4. sglang/bench.py +627 -0
  5. sglang/bench_latency.py +22 -19
  6. sglang/bench_serving.py +976 -0
  7. sglang/check_env.py +171 -0
  8. sglang/global_config.py +3 -2
  9. sglang/lang/backend/__init__.py +0 -0
  10. sglang/lang/backend/anthropic.py +77 -0
  11. sglang/lang/backend/base_backend.py +80 -0
  12. sglang/lang/backend/litellm.py +90 -0
  13. sglang/lang/backend/openai.py +438 -0
  14. sglang/lang/backend/runtime_endpoint.py +283 -0
  15. sglang/lang/backend/vertexai.py +149 -0
  16. sglang/lang/interpreter.py +1 -0
  17. sglang/lang/tracer.py +1 -1
  18. sglang/launch_server.py +1 -1
  19. sglang/launch_server_llavavid.py +1 -4
  20. sglang/srt/conversation.py +1 -1
  21. sglang/srt/hf_transformers_utils.py +13 -1
  22. sglang/srt/layers/context_flashattention_nopad.py +0 -29
  23. sglang/srt/layers/extend_attention.py +0 -39
  24. sglang/srt/layers/linear.py +869 -0
  25. sglang/srt/layers/logits_processor.py +4 -5
  26. sglang/srt/layers/quantization/__init__.py +49 -0
  27. sglang/srt/layers/quantization/fp8.py +662 -0
  28. sglang/srt/layers/radix_attention.py +39 -24
  29. sglang/srt/layers/token_attention.py +1 -51
  30. sglang/srt/managers/controller/cuda_graph_runner.py +72 -28
  31. sglang/srt/managers/controller/infer_batch.py +90 -63
  32. sglang/srt/managers/controller/manager_multi.py +107 -100
  33. sglang/srt/managers/controller/manager_single.py +76 -96
  34. sglang/srt/managers/controller/model_runner.py +41 -26
  35. sglang/srt/managers/controller/schedule_heuristic.py +8 -3
  36. sglang/srt/managers/controller/tp_worker.py +136 -149
  37. sglang/srt/managers/detokenizer_manager.py +49 -5
  38. sglang/srt/managers/io_struct.py +36 -17
  39. sglang/srt/managers/tokenizer_manager.py +228 -125
  40. sglang/srt/memory_pool.py +32 -11
  41. sglang/srt/model_loader/model_loader.py +277 -0
  42. sglang/srt/model_loader/utils.py +260 -0
  43. sglang/srt/models/chatglm.py +1 -0
  44. sglang/srt/models/dbrx.py +1 -0
  45. sglang/srt/models/deepseek.py +430 -0
  46. sglang/srt/models/gpt_bigcode.py +282 -0
  47. sglang/srt/models/grok.py +1 -0
  48. sglang/srt/models/internlm2.py +317 -0
  49. sglang/srt/models/llama2.py +81 -23
  50. sglang/srt/models/llama_classification.py +1 -0
  51. sglang/srt/models/llava.py +1 -0
  52. sglang/srt/models/llavavid.py +1 -0
  53. sglang/srt/models/minicpm.py +1 -0
  54. sglang/srt/models/mixtral.py +1 -0
  55. sglang/srt/models/mixtral_quant.py +1 -0
  56. sglang/srt/models/qwen.py +1 -0
  57. sglang/srt/models/qwen2.py +6 -0
  58. sglang/srt/models/qwen2_moe.py +7 -4
  59. sglang/srt/models/stablelm.py +1 -0
  60. sglang/srt/openai_api/adapter.py +432 -0
  61. sglang/srt/openai_api/api_adapter.py +432 -0
  62. sglang/srt/openai_api/openai_api_adapter.py +431 -0
  63. sglang/srt/openai_api/openai_protocol.py +207 -0
  64. sglang/srt/openai_api/protocol.py +208 -0
  65. sglang/srt/openai_protocol.py +17 -0
  66. sglang/srt/sampling_params.py +2 -0
  67. sglang/srt/server.py +132 -84
  68. sglang/srt/server_args.py +35 -21
  69. sglang/srt/utils.py +65 -117
  70. sglang/test/test_conversation.py +1 -1
  71. sglang/test/test_openai_protocol.py +1 -1
  72. sglang/test/test_programs.py +1 -1
  73. sglang/test/test_utils.py +2 -2
  74. {sglang-0.1.21.dist-info → sglang-0.1.24.dist-info}/METADATA +162 -168
  75. sglang-0.1.24.dist-info/RECORD +105 -0
  76. {sglang-0.1.21.dist-info → sglang-0.1.24.dist-info}/WHEEL +1 -1
  77. sglang-0.1.21.dist-info/RECORD +0 -82
  78. {sglang-0.1.21.dist-info → sglang-0.1.24.dist-info}/LICENSE +0 -0
  79. {sglang-0.1.21.dist-info → sglang-0.1.24.dist-info}/top_level.txt +0 -0
sglang/check_env.py ADDED
@@ -0,0 +1,171 @@
+"""Check environment configurations and dependency versions."""
+
+import importlib
+import os
+import resource
+import subprocess
+import sys
+from collections import OrderedDict, defaultdict
+
+import torch
+
+# List of packages to check versions for
+PACKAGE_LIST = [
+    "sglang",
+    "flashinfer",
+    "requests",
+    "tqdm",
+    "numpy",
+    "aiohttp",
+    "fastapi",
+    "hf_transfer",
+    "huggingface_hub",
+    "interegular",
+    "packaging",
+    "pillow",
+    "psutil",
+    "pydantic",
+    "uvicorn",
+    "uvloop",
+    "zmq",
+    "vllm",
+    "outlines",
+    "openai",
+    "tiktoken",
+    "anthropic",
+    "litellm",
+]
+
+
+def get_package_versions(packages):
+    """
+    Get versions of specified packages.
+    """
+    versions = {}
+    for package in packages:
+        package_name = package.split("==")[0].split(">=")[0].split("<=")[0]
+        try:
+            module = importlib.import_module(package_name)
+            if hasattr(module, "__version__"):
+                versions[package_name] = module.__version__
+        except ModuleNotFoundError:
+            versions[package_name] = "Module Not Found"
+    return versions
+
+
+def get_cuda_info():
+    """
+    Get CUDA-related information if available.
+    """
+    cuda_info = {"CUDA available": torch.cuda.is_available()}
+
+    if cuda_info["CUDA available"]:
+        cuda_info.update(_get_gpu_info())
+        cuda_info.update(_get_cuda_version_info())
+
+    return cuda_info
+
+
+def _get_gpu_info():
+    """
+    Get information about available GPUs.
+    """
+    devices = defaultdict(list)
+    for k in range(torch.cuda.device_count()):
+        devices[torch.cuda.get_device_name(k)].append(str(k))
+
+    return {f"GPU {','.join(device_ids)}": name for name, device_ids in devices.items()}
+
+
+def _get_cuda_version_info():
+    """
+    Get CUDA version information.
+    """
+    from torch.utils.cpp_extension import CUDA_HOME
+
+    cuda_info = {"CUDA_HOME": CUDA_HOME}
+
+    if CUDA_HOME and os.path.isdir(CUDA_HOME):
+        cuda_info.update(_get_nvcc_info())
+        cuda_info.update(_get_cuda_driver_version())
+
+    return cuda_info
+
+
+def _get_nvcc_info():
+    """
+    Get NVCC version information.
+    """
+    from torch.utils.cpp_extension import CUDA_HOME
+
+    try:
+        nvcc = os.path.join(CUDA_HOME, "bin/nvcc")
+        nvcc_output = (
+            subprocess.check_output(f'"{nvcc}" -V', shell=True).decode("utf-8").strip()
+        )
+        return {
+            "NVCC": nvcc_output[
+                nvcc_output.rfind("Cuda compilation tools") : nvcc_output.rfind("Build")
+            ].strip()
+        }
+    except subprocess.SubprocessError:
+        return {"NVCC": "Not Available"}
+
+
+def _get_cuda_driver_version():
+    """
+    Get CUDA driver version.
+    """
+    try:
+        output = subprocess.check_output(
+            [
+                "nvidia-smi",
+                "--query-gpu=driver_version",
+                "--format=csv,noheader,nounits",
+            ]
+        )
+        return {"CUDA Driver Version": output.decode().strip()}
+    except subprocess.SubprocessError:
+        return {"CUDA Driver Version": "Not Available"}
+
+
+def get_gpu_topology():
+    """
+    Get GPU topology information.
+    """
+    try:
+        result = subprocess.run(
+            ["nvidia-smi", "topo", "-m"],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,
+            check=True,
+        )
+        return "\n" + result.stdout if result.returncode == 0 else None
+    except subprocess.SubprocessError:
+        return None
+
+
+def check_env():
+    """
+    Check and print environment information.
+    """
+    env_info = OrderedDict()
+    env_info["Python"] = sys.version.replace("\n", "")
+    env_info.update(get_cuda_info())
+    env_info["PyTorch"] = torch.__version__
+    env_info.update(get_package_versions(PACKAGE_LIST))
+
+    gpu_topo = get_gpu_topology()
+    if gpu_topo:
+        env_info["NVIDIA Topology"] = gpu_topo
+
+    ulimit_soft, _ = resource.getrlimit(resource.RLIMIT_NOFILE)
+    env_info["ulimit soft"] = ulimit_soft
+
+    for k, v in env_info.items():
+        print(f"{k}: {v}")
+
+
+if __name__ == "__main__":
+    check_env()
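
Not part of the diff: a quick usage sketch for the new script. The import path follows from the file location above, and running it as a module relies on the `__main__` guard shown in the hunk.

    # Print the environment report programmatically
    # (equivalent to running `python -m sglang.check_env`).
    from sglang.check_env import check_env

    check_env()  # prints Python, CUDA/GPU, PyTorch, and dependency versions line by line
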
sglang/global_config.py CHANGED
@@ -16,9 +16,9 @@ class GlobalConfig:
         self.wait_for_new_request_delay = 0.0006

         # Runtime constants: New generation token ratio estimation
-        self.base_new_token_ratio = 0.4
+        self.init_new_token_ratio = 0.7
         self.base_min_new_token_ratio = 0.2
-        self.new_token_ratio_decay = 0.0001
+        self.new_token_ratio_decay = 0.001
         self.new_token_ratio_recovery = 0.05

         # Runtime constants: The threshold (number of tokens) to trigger layer-wise cuda sync.
@@ -27,6 +27,7 @@ class GlobalConfig:

         # Runtime constants: others
         self.num_continue_decode_steps = 10
+        self.retract_decode_steps = 20
         self.flashinfer_workspace_size = 192 * 1024 * 1024

         # Output tokenization configs
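
For context, a minimal sketch of how these retuned scheduler constants are read at runtime. The module-level `global_config` instance is an assumption (it is not shown in this diff); the attribute names and values come from the hunks above.

    from sglang.global_config import global_config  # assumed singleton, not shown in the diff

    print(global_config.init_new_token_ratio)   # 0.7, replaces base_new_token_ratio = 0.4
    print(global_config.new_token_ratio_decay)  # 0.001, was 0.0001
    print(global_config.retract_decode_steps)   # 20, newly added knob
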
sglang/lang/backend/__init__.py ADDED
File without changes
sglang/lang/backend/anthropic.py ADDED
@@ -0,0 +1,77 @@
+from typing import List, Optional, Union
+
+import numpy as np
+
+from sglang.lang.backend.base_backend import BaseBackend
+from sglang.lang.chat_template import get_chat_template
+from sglang.lang.interpreter import StreamExecutor
+from sglang.lang.ir import SglSamplingParams
+
+try:
+    import anthropic
+except ImportError as e:
+    anthropic = e
+
+
+class Anthropic(BaseBackend):
+    def __init__(self, model_name, *args, **kwargs):
+        super().__init__()
+
+        if isinstance(anthropic, Exception):
+            raise anthropic
+
+        self.model_name = model_name
+        self.chat_template = get_chat_template("claude")
+        self.client = anthropic.Anthropic(*args, **kwargs)
+
+    def get_chat_template(self):
+        return self.chat_template
+
+    def generate(
+        self,
+        s: StreamExecutor,
+        sampling_params: SglSamplingParams,
+    ):
+        if s.messages_:
+            messages = s.messages_
+        else:
+            messages = [{"role": "user", "content": s.text_}]
+
+        if messages and messages[0]["role"] == "system":
+            system = messages.pop(0)["content"]
+        else:
+            system = ""
+
+        ret = self.client.messages.create(
+            model=self.model_name,
+            system=system,
+            messages=messages,
+            **sampling_params.to_anthropic_kwargs(),
+        )
+        comp = ret.content[0].text
+
+        return comp, {}
+
+    def generate_stream(
+        self,
+        s: StreamExecutor,
+        sampling_params: SglSamplingParams,
+    ):
+        if s.messages_:
+            messages = s.messages_
+        else:
+            messages = [{"role": "user", "content": s.text_}]
+
+        if messages and messages[0]["role"] == "system":
+            system = messages.pop(0)["content"]
+        else:
+            system = ""
+
+        with self.client.messages.stream(
+            model=self.model_name,
+            system=system,
+            messages=messages,
+            **sampling_params.to_anthropic_kwargs(),
+        ) as stream:
+            for text in stream.text_stream:
+                yield text, {}
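
Not part of the diff: a hedged usage sketch for the relocated Anthropic backend. The frontend calls (`sgl.function`, `sgl.gen`, `set_default_backend`) are the existing sglang API; the model name is illustrative and an ANTHROPIC_API_KEY is assumed to be set in the environment.

    import sglang as sgl
    from sglang.lang.backend.anthropic import Anthropic

    sgl.set_default_backend(Anthropic("claude-3-haiku-20240307"))  # model name is illustrative

    @sgl.function
    def greet(s):
        s += sgl.user("Say hello in one short sentence.")
        s += sgl.assistant(sgl.gen("reply", max_tokens=32))

    state = greet.run()
    print(state["reply"])
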
sglang/lang/backend/base_backend.py ADDED
@@ -0,0 +1,80 @@
+from typing import Callable, List, Optional, Union
+
+from sglang.lang.chat_template import get_chat_template
+from sglang.lang.interpreter import StreamExecutor
+from sglang.lang.ir import SglSamplingParams
+
+
+class BaseBackend:
+    def __init__(self) -> None:
+        self.support_concate_and_append = False
+        self.chat_template = get_chat_template("default")
+
+    def get_model_name(self):
+        raise NotImplementedError()
+
+    def get_chat_template(self):
+        return self.chat_template
+
+    def cache_prefix(self, prefix_str: str):
+        pass
+
+    def uncache_prefix(self, rid: str):
+        pass
+
+    def end_request(self, rid: Union[str, List[str]]):
+        pass
+
+    def begin_program(self, s: StreamExecutor):
+        pass
+
+    def end_program(self, s: Union[StreamExecutor, List[StreamExecutor]]):
+        pass
+
+    def commit_lazy_operations(self, s: StreamExecutor):
+        pass
+
+    def fork_program(
+        self,
+        src: StreamExecutor,
+        dst: List[StreamExecutor],
+        position_ids_offset: Optional[List[int]] = None,
+    ):
+        pass
+
+    def fill_image(self, s: StreamExecutor):
+        pass
+
+    def generate(
+        self,
+        s: StreamExecutor,
+        sampling_params: SglSamplingParams,
+    ):
+        raise NotImplementedError()
+
+    def generate_stream(
+        self,
+        s: StreamExecutor,
+        sampling_params: SglSamplingParams,
+    ):
+        raise NotImplementedError()
+
+    def select(
+        self,
+        s: StreamExecutor,
+        choices: List[str],
+        temperature: float,
+    ):
+        raise NotImplementedError()
+
+    def concatenate_and_append(self, src_rids: List[str], dst_rid: str):
+        raise NotImplementedError()
+
+    def shutdown(self):
+        pass
+
+    def flush_cache(self):
+        pass
+
+    def get_server_args(self):
+        pass
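
A sketch of the extension point this class defines: a toy backend (not part of the package) only needs to override `generate()`, while the remaining hooks keep their no-op defaults.

    from sglang.lang.backend.base_backend import BaseBackend
    from sglang.lang.interpreter import StreamExecutor
    from sglang.lang.ir import SglSamplingParams


    class EchoBackend(BaseBackend):
        """Toy backend that echoes the prompt instead of calling a model."""

        def generate(self, s: StreamExecutor, sampling_params: SglSamplingParams):
            # Return (completion_text, meta_info), the same contract used by the
            # Anthropic and LiteLLM backends in this diff.
            return f"[echo] {s.text_}", {}
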
sglang/lang/backend/litellm.py ADDED
@@ -0,0 +1,90 @@
+from typing import Mapping, Optional
+
+from sglang.lang.backend.base_backend import BaseBackend
+from sglang.lang.chat_template import get_chat_template_by_model_path
+from sglang.lang.interpreter import StreamExecutor
+from sglang.lang.ir import SglSamplingParams
+
+try:
+    import litellm
+except ImportError as e:
+    litellm = e
+litellm.num_retries = 1
+
+
+class LiteLLM(BaseBackend):
+    def __init__(
+        self,
+        model_name,
+        chat_template=None,
+        api_key=None,
+        organization: Optional[str] = None,
+        base_url: Optional[str] = None,
+        timeout: Optional[float] = 600,
+        max_retries: Optional[int] = litellm.num_retries,
+        default_headers: Optional[Mapping[str, str]] = None,
+    ):
+        super().__init__()
+
+        if isinstance(litellm, Exception):
+            raise litellm
+
+        self.model_name = model_name
+
+        self.chat_template = chat_template or get_chat_template_by_model_path(
+            model_name
+        )
+
+        self.client_params = {
+            "api_key": api_key,
+            "organization": organization,
+            "base_url": base_url,
+            "timeout": timeout,
+            "max_retries": max_retries,
+            "default_headers": default_headers,
+        }
+
+    def get_chat_template(self):
+        return self.chat_template
+
+    def generate(
+        self,
+        s: StreamExecutor,
+        sampling_params: SglSamplingParams,
+    ):
+        if s.messages_:
+            messages = s.messages_
+        else:
+            messages = [{"role": "user", "content": s.text_}]
+
+        ret = litellm.completion(
+            model=self.model_name,
+            messages=messages,
+            **self.client_params,
+            **sampling_params.to_anthropic_kwargs(),
+        )
+        comp = ret.choices[0].message.content
+
+        return comp, {}
+
+    def generate_stream(
+        self,
+        s: StreamExecutor,
+        sampling_params: SglSamplingParams,
+    ):
+        if s.messages_:
+            messages = s.messages_
+        else:
+            messages = [{"role": "user", "content": s.text_}]
+
+        ret = litellm.completion(
+            model=self.model_name,
+            messages=messages,
+            stream=True,
+            **self.client_params,
+            **sampling_params.to_litellm_kwargs(),
+        )
+        for chunk in ret:
+            text = chunk.choices[0].delta.content
+            if text is not None:
+                yield text, {}
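
The LiteLLM backend drops in the same way as the Anthropic sketch above; only the constructor differs. The model string below is illustrative, and LiteLLM expects the matching provider key (e.g. OPENAI_API_KEY) to be set in the environment.

    import sglang as sgl
    from sglang.lang.backend.litellm import LiteLLM

    sgl.set_default_backend(LiteLLM("gpt-3.5-turbo", timeout=30))
    # Any sglang program (see the Anthropic sketch above) can now run against it.
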