sglang 0.3.6__py3-none-any.whl → 0.3.6.post1__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (102)
  1. sglang/__init__.py +2 -2
  2. sglang/api.py +2 -2
  3. sglang/bench_one_batch.py +2 -4
  4. sglang/bench_serving.py +75 -26
  5. sglang/lang/backend/base_backend.py +1 -1
  6. sglang/lang/backend/runtime_endpoint.py +2 -2
  7. sglang/srt/configs/model_config.py +13 -14
  8. sglang/srt/constrained/__init__.py +13 -14
  9. sglang/srt/constrained/base_grammar_backend.py +13 -15
  10. sglang/srt/constrained/outlines_backend.py +13 -15
  11. sglang/srt/constrained/outlines_jump_forward.py +13 -15
  12. sglang/srt/constrained/xgrammar_backend.py +38 -57
  13. sglang/srt/conversation.py +13 -15
  14. sglang/srt/hf_transformers_utils.py +13 -15
  15. sglang/srt/layers/activation.py +13 -13
  16. sglang/srt/layers/attention/flashinfer_backend.py +13 -6
  17. sglang/srt/layers/attention/triton_ops/decode_attention.py +51 -55
  18. sglang/srt/layers/attention/triton_ops/extend_attention.py +16 -16
  19. sglang/srt/layers/attention/triton_ops/prefill_attention.py +13 -15
  20. sglang/srt/layers/custom_op_util.py +13 -14
  21. sglang/srt/layers/fused_moe_grok/__init__.py +1 -0
  22. sglang/srt/layers/{fused_moe → fused_moe_grok}/layer.py +4 -9
  23. sglang/srt/layers/{fused_moe/patch.py → fused_moe_patch.py} +5 -0
  24. sglang/srt/layers/fused_moe_triton/__init__.py +44 -0
  25. sglang/srt/layers/fused_moe_triton/fused_moe.py +861 -0
  26. sglang/srt/layers/fused_moe_triton/layer.py +633 -0
  27. sglang/srt/layers/layernorm.py +13 -15
  28. sglang/srt/layers/logits_processor.py +13 -15
  29. sglang/srt/layers/quantization/__init__.py +77 -17
  30. sglang/srt/layers/radix_attention.py +13 -15
  31. sglang/srt/layers/rotary_embedding.py +13 -13
  32. sglang/srt/lora/lora.py +13 -14
  33. sglang/srt/lora/lora_config.py +13 -14
  34. sglang/srt/lora/lora_manager.py +22 -24
  35. sglang/srt/managers/data_parallel_controller.py +25 -19
  36. sglang/srt/managers/detokenizer_manager.py +13 -16
  37. sglang/srt/managers/io_struct.py +43 -28
  38. sglang/srt/managers/schedule_batch.py +55 -26
  39. sglang/srt/managers/schedule_policy.py +13 -15
  40. sglang/srt/managers/scheduler.py +89 -70
  41. sglang/srt/managers/session_controller.py +14 -15
  42. sglang/srt/managers/tokenizer_manager.py +29 -22
  43. sglang/srt/managers/tp_worker.py +13 -15
  44. sglang/srt/managers/tp_worker_overlap_thread.py +13 -15
  45. sglang/srt/metrics/collector.py +13 -15
  46. sglang/srt/metrics/func_timer.py +13 -15
  47. sglang/srt/mm_utils.py +13 -14
  48. sglang/srt/model_executor/cuda_graph_runner.py +20 -19
  49. sglang/srt/model_executor/forward_batch_info.py +19 -17
  50. sglang/srt/model_executor/model_runner.py +42 -30
  51. sglang/srt/models/chatglm.py +15 -16
  52. sglang/srt/models/commandr.py +15 -16
  53. sglang/srt/models/dbrx.py +15 -16
  54. sglang/srt/models/deepseek.py +15 -15
  55. sglang/srt/models/deepseek_v2.py +15 -15
  56. sglang/srt/models/exaone.py +14 -15
  57. sglang/srt/models/gemma.py +14 -14
  58. sglang/srt/models/gemma2.py +24 -19
  59. sglang/srt/models/gemma2_reward.py +13 -14
  60. sglang/srt/models/gpt_bigcode.py +14 -14
  61. sglang/srt/models/grok.py +15 -15
  62. sglang/srt/models/internlm2.py +13 -15
  63. sglang/srt/models/internlm2_reward.py +13 -14
  64. sglang/srt/models/llama.py +21 -21
  65. sglang/srt/models/llama_classification.py +13 -14
  66. sglang/srt/models/llama_reward.py +13 -14
  67. sglang/srt/models/llava.py +13 -15
  68. sglang/srt/models/llavavid.py +13 -15
  69. sglang/srt/models/minicpm.py +13 -15
  70. sglang/srt/models/minicpm3.py +13 -15
  71. sglang/srt/models/mistral.py +13 -15
  72. sglang/srt/models/mixtral.py +15 -15
  73. sglang/srt/models/mixtral_quant.py +14 -14
  74. sglang/srt/models/olmo.py +21 -19
  75. sglang/srt/models/olmoe.py +23 -20
  76. sglang/srt/models/qwen.py +14 -14
  77. sglang/srt/models/qwen2.py +22 -19
  78. sglang/srt/models/qwen2_moe.py +17 -18
  79. sglang/srt/models/stablelm.py +18 -16
  80. sglang/srt/models/torch_native_llama.py +15 -17
  81. sglang/srt/models/xverse.py +13 -14
  82. sglang/srt/models/xverse_moe.py +15 -16
  83. sglang/srt/models/yivl.py +13 -15
  84. sglang/srt/openai_api/adapter.py +13 -15
  85. sglang/srt/openai_api/protocol.py +13 -15
  86. sglang/srt/sampling/sampling_batch_info.py +4 -1
  87. sglang/srt/sampling/sampling_params.py +13 -15
  88. sglang/srt/server.py +59 -34
  89. sglang/srt/server_args.py +22 -22
  90. sglang/srt/utils.py +196 -17
  91. sglang/test/few_shot_gsm8k.py +8 -4
  92. sglang/test/runners.py +13 -14
  93. sglang/test/test_utils.py +1 -1
  94. sglang/version.py +1 -1
  95. {sglang-0.3.6.dist-info → sglang-0.3.6.post1.dist-info}/LICENSE +1 -1
  96. {sglang-0.3.6.dist-info → sglang-0.3.6.post1.dist-info}/METADATA +24 -15
  97. sglang-0.3.6.post1.dist-info/RECORD +164 -0
  98. sglang/srt/layers/fused_moe/__init__.py +0 -1
  99. sglang-0.3.6.dist-info/RECORD +0 -161
  100. /sglang/srt/layers/{fused_moe → fused_moe_grok}/fused_moe.py +0 -0
  101. {sglang-0.3.6.dist-info → sglang-0.3.6.post1.dist-info}/WHEEL +0 -0
  102. {sglang-0.3.6.dist-info → sglang-0.3.6.post1.dist-info}/top_level.txt +0 -0
@@ -1,17 +1,16 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 
 # Adapted from
 # https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/xverse.py#L1
sglang/srt/models/xverse_moe.py CHANGED
@@ -1,19 +1,18 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
-
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """Inference-only XVERSE MoE model."""
+
 from typing import Any, Dict, Iterable, Optional, Tuple
 
 import torch
@@ -25,7 +24,6 @@ from vllm.distributed import (
     tensor_model_parallel_all_reduce,
 )
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.fused_moe import fused_moe
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
@@ -36,6 +34,7 @@ from vllm.model_executor.layers.linear import (
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
+from sglang.srt.layers.fused_moe_triton import fused_moe
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
sglang/srt/models/yivl.py CHANGED
@@ -1,18 +1,16 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
-
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """Inference-only Yi-VL model."""
 
 from typing import Iterable, Optional, Tuple
sglang/srt/openai_api/adapter.py CHANGED
@@ -1,18 +1,16 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
-
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """Conversion between OpenAI APIs and native SRT APIs"""
 
 import asyncio
sglang/srt/openai_api/protocol.py CHANGED
@@ -1,18 +1,16 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
-
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """Pydantic models for OpenAI API protocol"""
 
 import time
sglang/srt/sampling/sampling_batch_info.py CHANGED
@@ -170,7 +170,10 @@ class SamplingBatchInfo:
 
         for i, grammar in enumerate(self.grammars):
             if grammar is not None:
-                grammar.fill_vocab_mask(self.vocab_mask, i)
+                try:
+                    grammar.fill_vocab_mask(self.vocab_mask, i)
+                except RuntimeError:
+                    continue
 
     def filter_batch(self, unfinished_indices: List[int], new_indices: torch.Tensor):
         self.penalizer_orchestrator.filter(unfinished_indices, new_indices)
sglang/srt/sampling/sampling_params.py CHANGED
@@ -1,18 +1,16 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
-
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """Sampling parameters for text generation."""
 
 from typing import List, Optional, Union
sglang/srt/server.py CHANGED
@@ -1,18 +1,16 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
-
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """
 The entry point of inference server.
 SRT = SGLang Runtime.
@@ -104,6 +102,7 @@ app.add_middleware(
 )
 
 tokenizer_manager: TokenizerManager = None
+_max_total_num_tokens = None
 
 ##### Native API endpoints #####
 
@@ -147,10 +146,15 @@ async def get_model_info():
     return result
 
 
-@app.get("/get_server_args")
-async def get_server_args():
-    """Get the server arguments."""
-    return dataclasses.asdict(tokenizer_manager.server_args)
+@app.get("/get_server_info")
+async def get_server_info():
+    try:
+        return await _get_server_info()
+
+    except Exception as e:
+        return ORJSONResponse(
+            {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
+        )
 
 
 @app.post("/flush_cache")
@@ -186,19 +190,6 @@ async def stop_profile():
     )
 
 
-@app.api_route("/get_memory_pool_size", methods=["GET", "POST"])
-async def get_memory_pool_size():
-    """Get the memory pool size in number of tokens"""
-    try:
-        ret = await tokenizer_manager.get_memory_pool_size()
-
-        return ret
-    except Exception as e:
-        return ORJSONResponse(
-            {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
-        )
-
-
 @app.post("/update_weights")
 @time_func_latency
 async def update_weights(obj: UpdateWeightReqInput, request: Request):
@@ -392,6 +383,7 @@ def launch_engine(
     """
 
     global tokenizer_manager
+    global _max_total_num_tokens
 
     # Configure global environment
     configure_logger(server_args)
@@ -457,9 +449,20 @@
     if server_args.chat_template:
         load_chat_template_for_openai_api(tokenizer_manager, server_args.chat_template)
 
-    # Wait for model to finish loading
+    # Wait for model to finish loading & get max token nums
+    scheduler_info = []
     for i in range(len(scheduler_pipe_readers)):
-        scheduler_pipe_readers[i].recv()
+        data = scheduler_pipe_readers[i].recv()
+
+        if data["status"] != "ready":
+            self.shutdown()
+            raise RuntimeError(
+                "Initialization failed. Please see the error messages above."
+            )
+        scheduler_info.append(data)
+
+    # Assume all schedulers have same max_total_num_tokens
+    _max_total_num_tokens = scheduler_info[0]["max_total_num_tokens"]
 
 
 def launch_server(
@@ -520,6 +523,14 @@ def launch_server(
         t.join()
 
 
+async def _get_server_info():
+    return {
+        **dataclasses.asdict(tokenizer_manager.server_args),  # server args
+        "memory_pool_size": await tokenizer_manager.get_memory_pool_size(),  # memory pool size
+        "max_total_num_tokens": _max_total_num_tokens,  # max total num tokens
+    }
+
+
 def _set_envs_and_config(server_args: ServerArgs):
     # Set global environments
     os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
@@ -761,6 +772,17 @@ class Runtime:
         response = requests.post(self.url + "/encode", json=json_data)
         return json.dumps(response.json())
 
+    async def get_server_info(self):
+        async with aiohttp.ClientSession() as session:
+            async with session.get(f"{self.url}/get_server_info") as response:
+                if response.status == 200:
+                    return await response.json()
+                else:
+                    error_data = await response.json()
+                    raise RuntimeError(
+                        f"Failed to get server info. {error_data['error']['message']}"
+                    )
+
     def __del__(self):
         self.shutdown()
 
@@ -910,3 +932,6 @@ class Engine:
         # get the current event loop
         loop = asyncio.get_event_loop()
         return loop.run_until_complete(encode_request(obj, None))
+
+    async def get_server_info(self):
+        return await _get_server_info()
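
Note: taken together, the server.py changes fold the old /get_server_args and /get_memory_pool_size endpoints into a single /get_server_info endpoint that also reports max_total_num_tokens. A minimal client-side sketch of querying it, assuming a server is already running at a placeholder address (localhost:30000):

import requests

# Query the consolidated endpoint introduced in 0.3.6.post1.
resp = requests.get("http://localhost:30000/get_server_info")  # placeholder host/port
resp.raise_for_status()
info = resp.json()

# Per the diff, the response merges the server args with these two fields.
print(info["memory_pool_size"])
print(info["max_total_num_tokens"])
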
sglang/srt/server_args.py CHANGED
@@ -1,18 +1,16 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
-
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """The arguments of the server."""
 
 import argparse
@@ -64,6 +62,7 @@ class ServerArgs:
     max_prefill_tokens: int = 16384
     schedule_policy: str = "lpm"
    schedule_conservativeness: float = 1.0
+    cpu_offload_gb: int = 0
 
     # Other runtime options
     tp_size: int = 1
@@ -200,12 +199,6 @@
                 "Overlap schedule is disabled."
             )
 
-        if self.enable_mixed_chunk:
-            logger.info(
-                "Overlap schedule is disabled because mixed-style chunked prefill is enabled."
-            )
-            self.disable_overlap_schedule = True
-
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
         # Model and port args
@@ -308,7 +301,7 @@
            "--device",
            type=str,
            default="cuda",
-            choices=["cuda", "xpu"],
+            choices=["cuda", "xpu", "hpu"],
            help="The device type.",
        )
        parser.add_argument(
@@ -375,6 +368,13 @@
            help="How conservative the schedule policy is. A larger value means more conservative scheduling. Use a larger value if you see requests being retracted frequently.",
        )
 
+        parser.add_argument(
+            "--cpu-offload-gb",
+            type=int,
+            default=ServerArgs.cpu_offload_gb,
+            help="How many GBs of RAM to reserve for CPU offloading",
+        )
+
        # Other runtime options
        parser.add_argument(
            "--tensor-parallel-size",