sglang 0.4.1.post6__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141)
  1. sglang/__init__.py +21 -23
  2. sglang/api.py +2 -7
  3. sglang/bench_offline_throughput.py +41 -27
  4. sglang/bench_one_batch.py +60 -4
  5. sglang/bench_one_batch_server.py +1 -1
  6. sglang/bench_serving.py +83 -71
  7. sglang/lang/backend/runtime_endpoint.py +183 -4
  8. sglang/lang/chat_template.py +46 -4
  9. sglang/launch_server.py +1 -1
  10. sglang/srt/_custom_ops.py +80 -42
  11. sglang/srt/configs/device_config.py +1 -1
  12. sglang/srt/configs/load_config.py +1 -0
  13. sglang/srt/configs/model_config.py +1 -0
  14. sglang/srt/constrained/base_grammar_backend.py +21 -0
  15. sglang/srt/constrained/xgrammar_backend.py +8 -4
  16. sglang/srt/conversation.py +14 -1
  17. sglang/srt/distributed/__init__.py +3 -3
  18. sglang/srt/distributed/communication_op.py +2 -1
  19. sglang/srt/distributed/device_communicators/cuda_wrapper.py +2 -1
  20. sglang/srt/distributed/device_communicators/custom_all_reduce.py +112 -42
  21. sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
  22. sglang/srt/distributed/device_communicators/hpu_communicator.py +2 -1
  23. sglang/srt/distributed/device_communicators/pynccl.py +80 -1
  24. sglang/srt/distributed/device_communicators/pynccl_wrapper.py +112 -2
  25. sglang/srt/distributed/device_communicators/shm_broadcast.py +5 -72
  26. sglang/srt/distributed/device_communicators/xpu_communicator.py +2 -1
  27. sglang/srt/distributed/parallel_state.py +1 -1
  28. sglang/srt/distributed/utils.py +2 -1
  29. sglang/srt/entrypoints/engine.py +452 -0
  30. sglang/srt/entrypoints/http_server.py +603 -0
  31. sglang/srt/function_call_parser.py +494 -0
  32. sglang/srt/layers/activation.py +8 -8
  33. sglang/srt/layers/attention/flashinfer_backend.py +10 -9
  34. sglang/srt/layers/attention/triton_backend.py +4 -6
  35. sglang/srt/layers/attention/vision.py +204 -0
  36. sglang/srt/layers/dp_attention.py +71 -0
  37. sglang/srt/layers/layernorm.py +5 -5
  38. sglang/srt/layers/linear.py +65 -14
  39. sglang/srt/layers/logits_processor.py +49 -64
  40. sglang/srt/layers/moe/ep_moe/layer.py +24 -16
  41. sglang/srt/layers/moe/fused_moe_native.py +84 -1
  42. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  43. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +27 -7
  44. sglang/srt/layers/moe/fused_moe_triton/layer.py +38 -5
  45. sglang/srt/layers/parameter.py +18 -8
  46. sglang/srt/layers/quantization/__init__.py +20 -23
  47. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  48. sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  49. sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  50. sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  51. sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  52. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  53. sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  54. sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  55. sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  56. sglang/srt/layers/quantization/fp8.py +10 -4
  57. sglang/srt/layers/quantization/modelopt_quant.py +1 -2
  58. sglang/srt/layers/quantization/w8a8_int8.py +1 -1
  59. sglang/srt/layers/radix_attention.py +2 -2
  60. sglang/srt/layers/rotary_embedding.py +1184 -31
  61. sglang/srt/layers/sampler.py +64 -6
  62. sglang/srt/layers/torchao_utils.py +12 -6
  63. sglang/srt/layers/vocab_parallel_embedding.py +2 -2
  64. sglang/srt/lora/lora.py +1 -9
  65. sglang/srt/managers/configure_logging.py +3 -0
  66. sglang/srt/managers/data_parallel_controller.py +79 -72
  67. sglang/srt/managers/detokenizer_manager.py +24 -6
  68. sglang/srt/managers/image_processor.py +158 -2
  69. sglang/srt/managers/io_struct.py +57 -3
  70. sglang/srt/managers/schedule_batch.py +78 -45
  71. sglang/srt/managers/schedule_policy.py +26 -12
  72. sglang/srt/managers/scheduler.py +326 -201
  73. sglang/srt/managers/session_controller.py +1 -0
  74. sglang/srt/managers/tokenizer_manager.py +210 -121
  75. sglang/srt/managers/tp_worker.py +6 -4
  76. sglang/srt/managers/tp_worker_overlap_thread.py +5 -8
  77. sglang/srt/managers/utils.py +44 -0
  78. sglang/srt/mem_cache/memory_pool.py +10 -32
  79. sglang/srt/metrics/collector.py +15 -6
  80. sglang/srt/model_executor/cuda_graph_runner.py +26 -30
  81. sglang/srt/model_executor/forward_batch_info.py +5 -7
  82. sglang/srt/model_executor/model_runner.py +44 -19
  83. sglang/srt/model_loader/loader.py +83 -6
  84. sglang/srt/model_loader/weight_utils.py +145 -6
  85. sglang/srt/models/baichuan.py +6 -6
  86. sglang/srt/models/chatglm.py +2 -2
  87. sglang/srt/models/commandr.py +17 -5
  88. sglang/srt/models/dbrx.py +13 -5
  89. sglang/srt/models/deepseek.py +3 -3
  90. sglang/srt/models/deepseek_v2.py +11 -11
  91. sglang/srt/models/exaone.py +2 -2
  92. sglang/srt/models/gemma.py +2 -2
  93. sglang/srt/models/gemma2.py +15 -25
  94. sglang/srt/models/gpt2.py +3 -5
  95. sglang/srt/models/gpt_bigcode.py +1 -1
  96. sglang/srt/models/granite.py +2 -2
  97. sglang/srt/models/grok.py +4 -3
  98. sglang/srt/models/internlm2.py +2 -2
  99. sglang/srt/models/llama.py +7 -5
  100. sglang/srt/models/minicpm.py +2 -2
  101. sglang/srt/models/minicpm3.py +9 -9
  102. sglang/srt/models/minicpmv.py +1238 -0
  103. sglang/srt/models/mixtral.py +3 -3
  104. sglang/srt/models/mixtral_quant.py +3 -3
  105. sglang/srt/models/mllama.py +2 -2
  106. sglang/srt/models/olmo.py +3 -3
  107. sglang/srt/models/olmo2.py +4 -4
  108. sglang/srt/models/olmoe.py +7 -13
  109. sglang/srt/models/phi3_small.py +2 -2
  110. sglang/srt/models/qwen.py +2 -2
  111. sglang/srt/models/qwen2.py +41 -4
  112. sglang/srt/models/qwen2_moe.py +3 -3
  113. sglang/srt/models/qwen2_vl.py +22 -122
  114. sglang/srt/models/stablelm.py +2 -2
  115. sglang/srt/models/torch_native_llama.py +20 -7
  116. sglang/srt/models/xverse.py +6 -6
  117. sglang/srt/models/xverse_moe.py +6 -6
  118. sglang/srt/openai_api/adapter.py +139 -37
  119. sglang/srt/openai_api/protocol.py +7 -4
  120. sglang/srt/sampling/custom_logit_processor.py +38 -0
  121. sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +11 -14
  122. sglang/srt/sampling/sampling_batch_info.py +143 -18
  123. sglang/srt/sampling/sampling_params.py +3 -1
  124. sglang/srt/server.py +4 -1090
  125. sglang/srt/server_args.py +77 -15
  126. sglang/srt/speculative/eagle_utils.py +37 -15
  127. sglang/srt/speculative/eagle_worker.py +11 -13
  128. sglang/srt/utils.py +164 -129
  129. sglang/test/runners.py +8 -13
  130. sglang/test/test_programs.py +2 -1
  131. sglang/test/test_utils.py +83 -22
  132. sglang/utils.py +12 -2
  133. sglang/version.py +1 -1
  134. {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/METADATA +21 -10
  135. {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/RECORD +138 -123
  136. sglang/launch_server_llavavid.py +0 -25
  137. sglang/srt/constrained/__init__.py +0 -16
  138. sglang/srt/distributed/device_communicators/__init__.py +0 -0
  139. {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/LICENSE +0 -0
  140. {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/WHEEL +0 -0
  141. {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/top_level.txt +0 -0
sglang/__init__.py CHANGED
@@ -1,5 +1,6 @@
-# SGL API Components
+# SGLang public APIs
 
+# Frontend Language APIs
 from sglang.api import (
     Engine,
     Runtime,
@@ -23,16 +24,26 @@ from sglang.api import (
     user_end,
     video,
 )
+from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.lang.choices import (
     greedy_token_selection,
     token_length_normalized,
     unconditional_likelihood_normalized,
 )
+from sglang.utils import LazyImport
+
+Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")
+LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM")
+OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
+VertexAI = LazyImport("sglang.lang.backend.vertexai", "VertexAI")
+
+# Other configs
+from sglang.global_config import global_config
+from sglang.version import __version__
 
-# SGLang DSL APIs
 __all__ = [
-    "Runtime",
     "Engine",
+    "Runtime",
     "assistant",
     "assistant_begin",
     "assistant_end",
@@ -52,27 +63,14 @@ __all__ = [
     "user_begin",
     "user_end",
     "video",
+    "RuntimeEndpoint",
     "greedy_token_selection",
     "token_length_normalized",
    "unconditional_likelihood_normalized",
+    "Anthropic",
+    "LiteLLM",
+    "OpenAI",
+    "VertexAI",
+    "global_config",
+    "__version__",
 ]
-
-# Global Configurations
-from sglang.global_config import global_config
-
-__all__ += ["global_config"]
-
-from sglang.version import __version__
-
-__all__ += ["__version__"]
-
-# SGLang Backends
-from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
-from sglang.utils import LazyImport
-
-Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")
-LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM")
-OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
-VertexAI = LazyImport("sglang.lang.backend.vertexai", "VertexAI")
-
-__all__ += ["Anthropic", "LiteLLM", "OpenAI", "VertexAI", "RuntimeEndpoint"]
sglang/api.py CHANGED
@@ -1,6 +1,5 @@
 """Public APIs of the language."""
 
-import os
 import re
 from typing import Callable, List, Optional, Union
 
@@ -33,19 +32,15 @@ def function(
 
 
 def Runtime(*args, **kwargs):
-    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
-
     # Avoid importing unnecessary dependency
-    from sglang.srt.server import Runtime
+    from sglang.lang.backend.runtime_endpoint import Runtime
 
     return Runtime(*args, **kwargs)
 
 
 def Engine(*args, **kwargs):
-    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
-
     # Avoid importing unnecessary dependency
-    from sglang.srt.server import Engine
+    from sglang.srt.entrypoints.engine import Engine
 
     return Engine(*args, **kwargs)
 
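Both factories keep the deferred-import shape: nothing under sglang.srt or the runtime endpoint is imported until the factory is called, which keeps `import sglang` light. A usage sketch (the model path is illustrative, and it is an assumption that keyword arguments are forwarded to the server arguments as in earlier releases):

    import sglang  # cheap: no srt runtime imported yet

    # The heavy sglang.srt.entrypoints.engine import happens inside this call.
    engine = sglang.Engine(model_path="meta-llama/Meta-Llama-3-8B-Instruct")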
sglang/bench_offline_throughput.py CHANGED
@@ -27,7 +27,8 @@ from sglang.bench_serving import (
     sample_random_requests,
     set_ulimit,
 )
-from sglang.srt.server import Engine, Runtime
+from sglang.lang.backend.runtime_endpoint import Runtime
+from sglang.srt.entrypoints.engine import Engine
 from sglang.srt.server_args import ServerArgs
 
 
@@ -39,20 +40,22 @@ class BenchArgs:
     dataset_path: str = ""
     num_prompts: int = 1000
     sharegpt_output_len: Optional[int] = None
+    sharegpt_context_len: Optional[int] = None
     random_input_len: int = 1024
     random_output_len: int = 1024
     random_range_ratio: float = 0.0
-    gen_num_groups: int = 64
-    gen_prompts_per_group: int = 16
-    gen_system_prompt_len: int = 2048
-    gen_question_len: int = 128
-    gen_output_len: int = 256
+    gsp_num_groups: int = 64
+    gsp_prompts_per_group: int = 16
+    gsp_system_prompt_len: int = 2048
+    gsp_question_len: int = 128
+    gsp_output_len: int = 256
+    seed: int = 1
     disable_ignore_eos: bool = False
     extra_request_body: Optional[str] = None
-    seed: int = 1
+    apply_chat_template: bool = False
+    profile: bool = False
     skip_warmup: bool = False
     do_not_exit: bool = False
-    profile: bool = False
 
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
@@ -82,6 +85,12 @@ class BenchArgs:
             default=BenchArgs.sharegpt_output_len,
             help="Output length for each request. Overrides the output length from the ShareGPT dataset.",
         )
+        parser.add_argument(
+            "--sharegpt-context-len",
+            type=int,
+            default=BenchArgs.sharegpt_context_len,
+            help="The context length of the model for the ShareGPT dataset. Requests longer than the context length will be dropped.",
+        )
         parser.add_argument(
             "--random-input-len",
             type=int,
@@ -102,51 +111,62 @@ class BenchArgs:
             "used only for random dataset.",
         )
         parser.add_argument(
-            "--gen-num-groups",
+            "--gsp-num-groups",
             type=int,
-            default=BenchArgs.gen_num_groups,
+            default=BenchArgs.gsp_num_groups,
             help="Number of groups with shared prefix, used"
             "only for generate-shared-prefix",
         )
         parser.add_argument(
-            "--gen-prompts-per-group",
+            "--gsp-prompts-per-group",
             type=int,
-            default=BenchArgs.gen_prompts_per_group,
+            default=BenchArgs.gsp_prompts_per_group,
             help="Number of prompts per group of shared prefix, used"
             "only for generate-shared-prefix",
         )
         parser.add_argument(
-            "--gen-system-prompt-len",
+            "--gsp-system-prompt-len",
             type=int,
-            default=BenchArgs.gen_system_prompt_len,
+            default=BenchArgs.gsp_system_prompt_len,
             help="System prompt length, used" "only for generate-shared-prefix",
         )
         parser.add_argument(
-            "--gen-question-len",
+            "--gsp-question-len",
             type=int,
-            default=BenchArgs.gen_question_len,
+            default=BenchArgs.gsp_question_len,
             help="Question length, used" "only for generate-shared-prefix",
         )
         parser.add_argument(
-            "--gen-output-len",
+            "--gsp-output-len",
             type=int,
-            default=BenchArgs.gen_output_len,
+            default=BenchArgs.gsp_output_len,
             help="Target length in tokens for outputs in generated-shared-prefix dataset",
         )
+        parser.add_argument("--seed", type=int, default=1, help="The random seed.")
         parser.add_argument(
             "--disable-ignore-eos",
-            type=bool,
-            default=BenchArgs.disable_ignore_eos,
+            action="store_true",
             help="Disable ignore EOS token",
         )
         parser.add_argument(
             "--extra-request-body",
             metavar='{"key1": "value1", "key2": "value2"}',
             type=str,
+            default=BenchArgs.extra_request_body,
             help="Append given JSON object to the request payload. You can use this to specify"
             "additional generate params like sampling params.",
         )
-        parser.add_argument("--seed", type=int, default=1, help="The random seed.")
+        parser.add_argument(
+            "--apply-chat-template",
+            action="store_true",
+            help="Apply chat template",
+        )
+        parser.add_argument(
+            "--profile",
+            action="store_true",
+            help="Use Torch Profiler. The endpoint must be launched with "
+            "SGLANG_TORCH_PROFILER_DIR to enable profiler.",
+        )
         parser.add_argument(
             "--skip-warmup",
             action="store_true",
@@ -157,12 +177,6 @@ class BenchArgs:
             action="store_true",
             help="Do not exit the program. This is useful for nsys profile with --duration and --delay.",
         )
-        parser.add_argument(
-            "--profile",
-            action="store_true",
-            help="Use Torch Profiler. The endpoint must be launched with "
-            "SGLANG_TORCH_PROFILER_DIR to enable profiler.",
-        )
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
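With the gen_* to gsp_* renames applied, a shared-prefix benchmark invocation looks roughly like this (the flag names come from the hunks above; the model path and the `generate-shared-prefix` dataset name, which the help strings reference, are assumptions):

    python -m sglang.bench_offline_throughput \
        --model-path meta-llama/Meta-Llama-3-8B-Instruct \
        --dataset-name generate-shared-prefix \
        --gsp-num-groups 64 \
        --gsp-prompts-per-group 16 \
        --gsp-system-prompt-len 2048 \
        --gsp-question-len 128 \
        --gsp-output-len 256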
sglang/bench_one_batch.py CHANGED
@@ -9,7 +9,8 @@ It accepts server arguments (the same as launch_server.py) and benchmark arguments
 python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --load-format dummy
 ## sweep through multiple data points and store (append) the results in a jsonl file:
 python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --output-len 32 256 --run-name test_run
-
+## run with profiling:
+python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --profile
 # Usage (correctness test):
 python -m sglang.bench_one_batch --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --correct
 
@@ -56,15 +57,21 @@ import torch
 import torch.distributed as dist
 
 from sglang.srt.configs.model_config import ModelConfig
+from sglang.srt.entrypoints.engine import _set_envs_and_config
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_executor.model_runner import ModelRunner
 from sglang.srt.sampling.sampling_params import SamplingParams
-from sglang.srt.server import _set_envs_and_config
 from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
-from sglang.srt.utils import configure_logger, kill_process_tree, suppress_other_loggers
+from sglang.srt.utils import (
+    configure_logger,
+    get_bool_env_var,
+    kill_process_tree,
+    set_gpu_proc_affinity,
+    suppress_other_loggers,
+)
 
 
 @dataclasses.dataclass
@@ -77,6 +84,8 @@ class BenchArgs:
     correctness_test: bool = False
     # This is only used for correctness test
     cut_len: int = 4
+    profile: bool = False
+    profile_filename_prefix: str = "profile"
 
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
@@ -95,6 +104,16 @@ class BenchArgs:
         )
         parser.add_argument("--correctness-test", action="store_true")
         parser.add_argument("--cut-len", type=int, default=BenchArgs.cut_len)
+        parser.add_argument(
+            "--profile", action="store_true", help="Use Torch Profiler."
+        )
+        parser.add_argument(
+            "--profile-filename-prefix",
+            type=str,
+            default=BenchArgs.profile_filename_prefix,
+            help="Prefix of the profiling file names. The full profiling result file(s) be "
+            '"[profile_filename_prefix]_batch[batch_size]_input[input_len]_output[output_len].trace.json.gz"',
+        )
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
@@ -216,6 +235,7 @@ def extend(reqs, model_runner):
         model_config=model_runner.model_config,
         enable_overlap=False,
         spec_algorithm=SpeculativeAlgorithm.NONE,
+        enable_custom_logit_processor=False,
     )
     batch.prepare_for_extend()
     model_worker_batch = batch.get_model_worker_batch()
@@ -286,7 +306,16 @@ def synchronize(device):
 
 
 def latency_test_run_once(
-    run_name, model_runner, rank_print, reqs, batch_size, input_len, output_len, device
+    run_name,
+    model_runner,
+    rank_print,
+    reqs,
+    batch_size,
+    input_len,
+    output_len,
+    device,
+    profile,
+    profile_filename_prefix,
 ):
     max_batch_size = model_runner.max_total_num_tokens // (input_len + output_len)
     if batch_size > max_batch_size:
@@ -308,6 +337,17 @@
 
     tot_latency = 0
 
+    profiler = None
+    if profile:
+        profiler = torch.profiler.profile(
+            activities=[
+                torch.profiler.ProfilerActivity.CPU,
+                torch.profiler.ProfilerActivity.CUDA,
+            ],
+            with_stack=True,
+        )
+        profiler.start()
+
     # Prefill
     synchronize(device)
     tic = time.time()
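For reference, the start()/stop() pairing above (needed here because prefill and decode are timed separately) can also be written as a context manager. A minimal standalone sketch of the same pattern (assumes a CUDA-capable machine; `with_stack=True` records Python stacks at some extra overhead):

    import torch

    # Context-manager form of the profiling pattern added in this hunk.
    with torch.profiler.profile(
        activities=[
            torch.profiler.ProfilerActivity.CPU,
            torch.profiler.ProfilerActivity.CUDA,
        ],
        with_stack=True,
    ) as prof:
        x = torch.randn(1024, 1024, device="cuda")
        y = x @ x  # the region being profiled

    # A ".gz" suffix makes export_chrome_trace write a gzip-compressed trace.
    prof.export_chrome_trace("matmul.trace.json.gz")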
@@ -338,6 +378,14 @@
         f"Decode. latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
     )
 
+    if profile:
+        profiler.stop()
+        profile_filename = f"{profile_filename_prefix}_batch{batch_size}_input{input_len}_output{output_len}.trace.json.gz"
+        parent_dir = os.path.dirname(os.path.abspath(profile_filename))
+        os.makedirs(parent_dir, exist_ok=True)
+        profiler.export_chrome_trace(profile_filename)
+        rank_print(f"torch profiler chrome trace saved to {profile_filename}")
+
     # Record decode timing from 2nd output
     if output_len > 1:
         med_decode_latency = np.median(decode_latencies)
@@ -363,6 +411,10 @@ def latency_test(
    bench_args,
    tp_rank,
 ):
+    # Set CPU affinity
+    if get_bool_env_var("SGLANG_SET_CPU_AFFINITY"):
+        set_gpu_proc_affinity(server_args.tp_size, server_args.nnodes, tp_rank)
+
     # Configure the logger
     configure_logger(server_args, prefix=f" TP{tp_rank}")
     rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
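The pinning is opt-in through the SGLANG_SET_CPU_AFFINITY environment variable (the name comes from the hunk above); for example:

    SGLANG_SET_CPU_AFFINITY=1 python -m sglang.bench_one_batch \
        --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 16 --input-len 512 --output-len 32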
@@ -386,6 +438,8 @@
         bench_args.input_len[0],
         8,  # shorter decoding to speed up the warmup
         server_args.device,
+        profile=False,
+        profile_filename_prefix="",  # not used
     )
 
     rank_print("Benchmark ...")
@@ -405,6 +459,8 @@
                 il,
                 ol,
                 server_args.device,
+                bench_args.profile if tp_rank == 0 else None,
+                bench_args.profile_filename_prefix,
             )
             if ret is not None:
                 result_list.append(ret)
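Putting the new flags together, a profiled run might look like the following (model path illustrative; the output file name follows the `--profile-filename-prefix` help string above). The resulting `.trace.json.gz` traces can be opened in chrome://tracing or https://ui.perfetto.dev:

    python -m sglang.bench_one_batch \
        --model-path meta-llama/Meta-Llama-3-8B-Instruct \
        --batch 16 --input-len 512 --output-len 32 \
        --profile --profile-filename-prefix traces/llama3_8b
    # writes traces/llama3_8b_batch16_input512_output32.trace.json.gz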
sglang/bench_one_batch_server.py CHANGED
@@ -22,7 +22,7 @@ from typing import Tuple
 import numpy as np
 import requests
 
-from sglang.srt.server import launch_server
+from sglang.srt.entrypoints.http_server import launch_server
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import kill_process_tree
 
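This one-line change reflects the larger restructuring visible in the file list: sglang/srt/server.py shrinks by roughly 1090 lines while the new entrypoints/engine.py (+452) and entrypoints/http_server.py (+603) take over its responsibilities. The 0.4.2 import locations, as shown by the hunks in this diff:

    # Previously both names lived in sglang.srt.server.
    from sglang.srt.entrypoints.engine import Engine
    from sglang.srt.entrypoints.http_server import launch_server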