sglang 0.2.6__py3-none-any.whl → 0.2.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. sglang/__init__.py +33 -26
  2. sglang/api.py +9 -1
  3. sglang/bench_latency.py +2 -2
  4. sglang/bench_serving.py +10 -1
  5. sglang/check_env.py +1 -1
  6. sglang/lang/backend/litellm.py +1 -1
  7. sglang/lang/backend/openai.py +1 -1
  8. sglang/lang/interpreter.py +20 -5
  9. sglang/lang/ir.py +1 -1
  10. sglang/srt/constrained/__init__.py +15 -0
  11. sglang/srt/constrained/base_cache.py +15 -0
  12. sglang/srt/constrained/fsm_cache.py +15 -0
  13. sglang/srt/constrained/jump_forward.py +15 -0
  14. sglang/srt/conversation.py +26 -0
  15. sglang/srt/hf_transformers_utils.py +15 -0
  16. sglang/srt/layers/context_flashattention_nopad.py +15 -0
  17. sglang/srt/layers/extend_attention.py +15 -0
  18. sglang/srt/layers/fused_moe.py +15 -0
  19. sglang/srt/layers/linear.py +15 -0
  20. sglang/srt/layers/logits_processor.py +41 -13
  21. sglang/srt/layers/quantization/__init__.py +15 -0
  22. sglang/srt/layers/quantization/fp8.py +15 -0
  23. sglang/srt/layers/radix_attention.py +17 -2
  24. sglang/srt/layers/token_attention.py +16 -1
  25. sglang/srt/managers/{controller/manager_multi.py → controller_multi.py} +17 -2
  26. sglang/srt/managers/{controller/manager_single.py → controller_single.py} +17 -2
  27. sglang/srt/managers/detokenizer_manager.py +16 -1
  28. sglang/srt/managers/io_struct.py +36 -3
  29. sglang/srt/managers/{controller/schedule_heuristic.py → policy_scheduler.py} +37 -22
  30. sglang/srt/managers/{controller/infer_batch.py → schedule_batch.py} +31 -12
  31. sglang/srt/managers/tokenizer_manager.py +39 -16
  32. sglang/srt/managers/{controller/tp_worker.py → tp_worker.py} +130 -40
  33. sglang/srt/mem_cache/flush_cache.py +33 -0
  34. sglang/srt/{memory_pool.py → mem_cache/memory_pool.py} +16 -1
  35. sglang/srt/{managers/controller → mem_cache}/radix_cache.py +15 -0
  36. sglang/srt/mm_utils.py +15 -0
  37. sglang/srt/model_config.py +15 -0
  38. sglang/srt/{managers/controller → model_executor}/cuda_graph_runner.py +16 -1
  39. sglang/srt/{managers/controller → model_executor}/model_runner.py +32 -12
  40. sglang/srt/model_loader/model_loader.py +15 -0
  41. sglang/srt/model_loader/utils.py +16 -1
  42. sglang/srt/models/chatglm.py +16 -1
  43. sglang/srt/models/commandr.py +16 -1
  44. sglang/srt/models/dbrx.py +16 -1
  45. sglang/srt/models/deepseek.py +16 -1
  46. sglang/srt/models/deepseek_v2.py +16 -1
  47. sglang/srt/models/gemma.py +16 -1
  48. sglang/srt/models/gemma2.py +16 -1
  49. sglang/srt/models/gpt_bigcode.py +16 -1
  50. sglang/srt/models/grok.py +16 -1
  51. sglang/srt/models/internlm2.py +16 -1
  52. sglang/srt/models/llama2.py +16 -1
  53. sglang/srt/models/llama_classification.py +16 -1
  54. sglang/srt/models/llava.py +17 -2
  55. sglang/srt/models/llavavid.py +17 -2
  56. sglang/srt/models/minicpm.py +16 -1
  57. sglang/srt/models/mistral.py +15 -0
  58. sglang/srt/models/mixtral.py +16 -1
  59. sglang/srt/models/mixtral_quant.py +16 -1
  60. sglang/srt/models/qwen.py +16 -1
  61. sglang/srt/models/qwen2.py +16 -1
  62. sglang/srt/models/qwen2_moe.py +16 -1
  63. sglang/srt/models/stablelm.py +16 -1
  64. sglang/srt/models/yivl.py +15 -0
  65. sglang/srt/openai_api/adapter.py +520 -135
  66. sglang/srt/openai_api/protocol.py +64 -0
  67. sglang/srt/sampling_params.py +15 -0
  68. sglang/srt/server.py +89 -23
  69. sglang/srt/server_args.py +49 -11
  70. sglang/srt/utils.py +15 -0
  71. sglang/utils.py +22 -0
  72. sglang/version.py +1 -1
  73. {sglang-0.2.6.dist-info → sglang-0.2.7.dist-info}/METADATA +32 -6
  74. sglang-0.2.7.dist-info/RECORD +93 -0
  75. {sglang-0.2.6.dist-info → sglang-0.2.7.dist-info}/WHEEL +1 -1
  76. sglang/srt/flush_cache.py +0 -18
  77. sglang-0.2.6.dist-info/RECORD +0 -93
  78. {sglang-0.2.6.dist-info → sglang-0.2.7.dist-info}/LICENSE +0 -0
  79. {sglang-0.2.6.dist-info → sglang-0.2.7.dist-info}/top_level.txt +0 -0
sglang/srt/openai_api/protocol.py CHANGED
@@ -1,3 +1,18 @@
+ """
+ Copyright 2023-2024 SGLang Team
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
  """Pydantic models for OpenAI API protocol"""

  import time
@@ -45,6 +60,55 @@ class UsageInfo(BaseModel):
      completion_tokens: Optional[int] = 0


+ class FileRequest(BaseModel):
+     # https://platform.openai.com/docs/api-reference/files/create
+     file: bytes  # The File object (not file name) to be uploaded
+     purpose: str = (
+         "batch"  # The intended purpose of the uploaded file, default is "batch"
+     )
+
+
+ class FileResponse(BaseModel):
+     id: str
+     object: str = "file"
+     bytes: int
+     created_at: int
+     filename: str
+     purpose: str
+
+
+ class BatchRequest(BaseModel):
+     input_file_id: (
+         str  # The ID of an uploaded file that contains requests for the new batch
+     )
+     endpoint: str  # The endpoint to be used for all requests in the batch
+     completion_window: str  # The time frame within which the batch should be processed
+     metadata: Optional[dict] = None  # Optional custom metadata for the batch
+
+
+ class BatchResponse(BaseModel):
+     id: str
+     object: str = "batch"
+     endpoint: str
+     errors: Optional[dict] = None
+     input_file_id: str
+     completion_window: str
+     status: str = "validating"
+     output_file_id: Optional[str] = None
+     error_file_id: Optional[str] = None
+     created_at: int
+     in_progress_at: Optional[int] = None
+     expires_at: Optional[int] = None
+     finalizing_at: Optional[int] = None
+     completed_at: Optional[int] = None
+     failed_at: Optional[int] = None
+     expired_at: Optional[int] = None
+     cancelling_at: Optional[int] = None
+     cancelled_at: Optional[int] = None
+     request_counts: dict = {"total": 0, "completed": 0, "failed": 0}
+     metadata: Optional[dict] = None
+
+
  class CompletionRequest(BaseModel):
      # Ordered by official OpenAI API documentation
      # https://platform.openai.com/docs/api-reference/completions/create
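The new models mirror OpenAI's Files and Batches schemas. As a minimal sketch of how they fit together (the field values below are illustrative placeholders, not taken from the package):

```python
from sglang.srt.openai_api.protocol import BatchRequest, BatchResponse

# Hypothetical payload; "file-abc123" is a placeholder file ID.
req = BatchRequest(
    input_file_id="file-abc123",
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={"job": "nightly-eval"},
)

# A fresh BatchResponse starts in the "validating" state with zeroed counts.
resp = BatchResponse(
    id="batch-1",
    endpoint=req.endpoint,
    input_file_id=req.input_file_id,
    completion_window=req.completion_window,
    created_at=0,
)
print(resp.status)          # "validating"
print(resp.request_counts)  # {"total": 0, "completed": 0, "failed": 0}
```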
sglang/srt/sampling_params.py CHANGED
@@ -1,3 +1,18 @@
+ """
+ Copyright 2023-2024 SGLang Team
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
  """Sampling parameters for text generation."""

  from typing import List, Optional, Union
sglang/srt/server.py CHANGED
@@ -1,3 +1,18 @@
+ """
+ Copyright 2023-2024 SGLang Team
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
  """
  The entry point of inference server.
  SRT = SGLang Runtime.
@@ -23,17 +38,17 @@ import psutil
  import requests
  import uvicorn
  import uvloop
- from fastapi import FastAPI, Request
+ from fastapi import FastAPI, File, Form, Request, UploadFile
  from fastapi.responses import JSONResponse, Response, StreamingResponse

  from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
  from sglang.srt.constrained import disable_cache
  from sglang.srt.hf_transformers_utils import get_tokenizer
- from sglang.srt.managers.controller.manager_multi import (
+ from sglang.srt.managers.controller_multi import (
      start_controller_process as start_controller_process_multi,
  )
- from sglang.srt.managers.controller.manager_single import launch_tp_servers
- from sglang.srt.managers.controller.manager_single import (
+ from sglang.srt.managers.controller_single import launch_tp_servers
+ from sglang.srt.managers.controller_single import (
      start_controller_process as start_controller_process_single,
  )
  from sglang.srt.managers.detokenizer_manager import start_detokenizer_process
@@ -41,8 +56,13 @@ from sglang.srt.managers.io_struct import GenerateReqInput
  from sglang.srt.managers.tokenizer_manager import TokenizerManager
  from sglang.srt.openai_api.adapter import (
      load_chat_template_for_openai_api,
+     v1_batches,
      v1_chat_completions,
      v1_completions,
+     v1_files_create,
+     v1_retrieve_batch,
+     v1_retrieve_file,
+     v1_retrieve_file_content,
  )
  from sglang.srt.openai_api.protocol import ModelCard, ModelList
  from sglang.srt.server_args import PortArgs, ServerArgs
@@ -137,6 +157,35 @@ async def openai_v1_chat_completions(raw_request: Request):
      return await v1_chat_completions(tokenizer_manager, raw_request)


+ @app.post("/v1/files")
+ async def openai_v1_files(file: UploadFile = File(...), purpose: str = Form("batch")):
+     return await v1_files_create(
+         file, purpose, tokenizer_manager.server_args.file_storage_pth
+     )
+
+
+ @app.post("/v1/batches")
+ async def openai_v1_batches(raw_request: Request):
+     return await v1_batches(tokenizer_manager, raw_request)
+
+
+ @app.get("/v1/batches/{batch_id}")
+ async def retrieve_batch(batch_id: str):
+     return await v1_retrieve_batch(batch_id)
+
+
+ @app.get("/v1/files/{file_id}")
+ async def retrieve_file(file_id: str):
+     # https://platform.openai.com/docs/api-reference/files/retrieve
+     return await v1_retrieve_file(file_id)
+
+
+ @app.get("/v1/files/{file_id}/content")
+ async def retrieve_file_content(file_id: str):
+     # https://platform.openai.com/docs/api-reference/files/retrieve-contents
+     return await v1_retrieve_file_content(file_id)
+
+
  @app.get("/v1/models")
  def available_models():
      """Show available models."""
@@ -160,6 +209,39 @@ def _set_torch_compile_config():
      torch._dynamo.config.accumulated_cache_size_limit = 256


+ def set_envs_and_config(server_args: ServerArgs):
+     # Set global environments
+     os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
+     os.environ["NCCL_CUMEM_ENABLE"] = "0"
+     os.environ["NCCL_NVLS_ENABLE"] = "0"
+     os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
+
+     # Set ulimit
+     set_ulimit()
+
+     # Enable show time cost for debugging
+     if server_args.show_time_cost:
+         enable_show_time_cost()
+
+     # Disable disk cache
+     if server_args.disable_disk_cache:
+         disable_cache()
+
+     # Fix triton bugs
+     if server_args.tp_size * server_args.dp_size > 1:
+         # FIXME: remove this after https://github.com/triton-lang/triton/pull/4295 is used as a dependency.
+         maybe_set_triton_cache_manager()
+
+     # Set torch compile config
+     if server_args.enable_torch_compile:
+         _set_torch_compile_config()
+
+     # Set global chat template
+     if server_args.chat_template:
+         # TODO: replace this with huggingface transformers template
+         load_chat_template_for_openai_api(server_args.chat_template)
+
+
  def launch_server(
      server_args: ServerArgs,
      model_overide_args: Optional[dict] = None,
@@ -175,32 +257,16 @@
          format="%(message)s",
      )

-     # Set global environments
-     os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
-     os.environ["NCCL_CUMEM_ENABLE"] = "0"
-     os.environ["NCCL_NVLS_ENABLE"] = "0"
-     os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
-     set_ulimit()
-     if server_args.show_time_cost:
-         enable_show_time_cost()
-     if server_args.disable_disk_cache:
-         disable_cache()
      if not server_args.disable_flashinfer:
          assert_pkg_version(
              "flashinfer",
-             "0.1.1",
+             "0.1.2",
              "Please uninstall the old version and "
              "reinstall the latest version by following the instructions "
              "at https://docs.flashinfer.ai/installation.html.",
          )
-     if server_args.tp_size * server_args.dp_size > 1:
-         # FIXME: remove this after https://github.com/triton-lang/triton/pull/4295 is used as a dependency.
-         maybe_set_triton_cache_manager()
-     if server_args.chat_template:
-         # TODO: replace this with huggingface transformers template
-         load_chat_template_for_openai_api(server_args.chat_template)
-     if server_args.enable_torch_compile:
-         _set_torch_compile_config()
+
+     set_envs_and_config(server_args)

      # Allocate ports
      server_args.port, server_args.additional_ports = allocate_init_ports(
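Taken together, the new routes expose an OpenAI-compatible file/batch workflow. A minimal client sketch against a local server, assuming it runs on the default port 30000, that the input file follows OpenAI's JSONL batch layout, and that the handlers return the FileResponse/BatchResponse schemas from protocol.py ("requests.jsonl" is a placeholder):

```python
import requests

base = "http://localhost:30000"

# Upload a JSONL file of requests; purpose defaults to "batch" server-side.
with open("requests.jsonl", "rb") as f:
    file_id = requests.post(f"{base}/v1/files", files={"file": f}).json()["id"]

# Create a batch over the uploaded file.
batch = requests.post(f"{base}/v1/batches", json={
    "input_file_id": file_id,
    "endpoint": "/v1/chat/completions",
    "completion_window": "24h",
}).json()

# Poll the batch, then fetch the output file's content once it exists.
status = requests.get(f"{base}/v1/batches/{batch['id']}").json()
if status.get("output_file_id"):
    out = requests.get(f"{base}/v1/files/{status['output_file_id']}/content")
    print(out.text)
```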
sglang/srt/server_args.py CHANGED
@@ -1,3 +1,18 @@
+ """
+ Copyright 2023-2024 SGLang Team
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
  """The arguments of the server."""

  import argparse
@@ -29,7 +44,7 @@ class ServerArgs:
      max_prefill_tokens: Optional[int] = None
      max_running_requests: Optional[int] = None
      max_num_reqs: Optional[int] = None
-     schedule_heuristic: str = "lpm"
+     schedule_policy: str = "lpm"
      schedule_conservativeness: float = 1.0

      # Other runtime options
@@ -45,11 +60,15 @@ class ServerArgs:

      # Other
      api_key: str = ""
+     file_storage_pth: str = "SGlang_storage"

      # Data parallelism
      dp_size: int = 1
      load_balance_method: str = "round_robin"

+     # Chunked Prefill
+     chunked_prefill_size: Optional[int] = None
+
      # Optimization/debug options
      disable_flashinfer: bool = False
      disable_flashinfer_sampling: bool = False
@@ -72,15 +91,15 @@ class ServerArgs:
              self.tokenizer_path = self.model_path
          if self.mem_fraction_static is None:
              if self.tp_size >= 16:
-                 self.mem_fraction_static = 0.80
+                 self.mem_fraction_static = 0.79
              elif self.tp_size >= 8:
-                 self.mem_fraction_static = 0.84
+                 self.mem_fraction_static = 0.83
              elif self.tp_size >= 4:
-                 self.mem_fraction_static = 0.86
+                 self.mem_fraction_static = 0.85
              elif self.tp_size >= 2:
-                 self.mem_fraction_static = 0.88
+                 self.mem_fraction_static = 0.87
              else:
-                 self.mem_fraction_static = 0.89
+                 self.mem_fraction_static = 0.88
          if isinstance(self.additional_ports, int):
              self.additional_ports = [self.additional_ports]
          elif self.additional_ports is None:
@@ -176,6 +195,7 @@ class ServerArgs:
              "gptq",
              "marlin",
              "gptq_marlin",
+             "awq_marlin",
              "squeezellm",
              "bitsandbytes",
          ],
@@ -208,15 +228,15 @@ class ServerArgs:
          parser.add_argument(
              "--max-num-reqs",
              type=int,
-             default=None,
+             default=ServerArgs.max_num_reqs,
              help="The maximum number of requests to serve in the memory pool. If the model have a large context length, you may need to decrease this value to avoid out-of-memory errors.",
          )
          parser.add_argument(
-             "--schedule-heuristic",
+             "--schedule-policy",
              type=str,
-             default=ServerArgs.schedule_heuristic,
+             default=ServerArgs.schedule_policy,
              choices=["lpm", "random", "fcfs", "dfs-weight"],
-             help="The scheduling heuristic.",
+             help="The scheduling policy of the requests.",
          )
          parser.add_argument(
              "--schedule-conservativeness",
@@ -270,6 +290,12 @@ class ServerArgs:
              default=ServerArgs.api_key,
              help="Set API key of the server.",
          )
+         parser.add_argument(
+             "--file-storage-pth",
+             type=str,
+             default=ServerArgs.file_storage_pth,
+             help="The path of the file storage in backend.",
+         )

          # Data parallelism
          parser.add_argument(
@@ -296,10 +322,18 @@ class ServerArgs:
              help="The nccl init address of multi-node server.",
          )
          parser.add_argument(
-             "--nnodes", type=int, default=1, help="The number of nodes."
+             "--nnodes", type=int, default=ServerArgs.nnodes, help="The number of nodes."
          )
          parser.add_argument("--node-rank", type=int, help="The node rank.")

+         # Chunked prefill
+         parser.add_argument(
+             "--chunked-prefill-size",
+             type=int,
+             default=ServerArgs.chunked_prefill_size,
+             help="The size of the chunked prefill.",
+         )
+
          # Optimization/debug options
          parser.add_argument(
              "--disable-flashinfer",
@@ -378,6 +412,10 @@ class ServerArgs:
              self.dp_size > 1 and self.node_rank is not None
          ), "multi-node data parallel is not supported"

+         assert not (
+             self.chunked_prefill_size is not None and self.disable_radix_cache
+         ), "chunked prefill is not supported with radix cache disabled currently"
+

  @dataclasses.dataclass
  class PortArgs:
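The renamed `--schedule-policy` flag, the new `--chunked-prefill-size` and `--file-storage-pth` options, and the slightly lowered `mem_fraction_static` defaults all surface through the `ServerArgs` dataclass. A minimal sketch, assuming the dataclass can be constructed with only a model path (the path and values below are illustrative):

```python
from sglang.srt.server_args import ServerArgs

# Hypothetical configuration exercising the 0.2.7 options.
args = ServerArgs(
    model_path="meta-llama/Meta-Llama-3-8B-Instruct",
    schedule_policy="lpm",              # was schedule_heuristic in 0.2.6
    chunked_prefill_size=4096,          # new in 0.2.7; radix cache must stay enabled
    file_storage_pth="SGlang_storage",  # backing store for /v1/files uploads
)
print(args.mem_fraction_static)  # 0.88 for tp_size == 1 under the new defaults
```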
sglang/srt/utils.py CHANGED
@@ -1,3 +1,18 @@
+ """
+ Copyright 2023-2024 SGLang Team
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
  """Common utilities."""

  import base64
sglang/utils.py CHANGED
@@ -1,6 +1,7 @@
  """Common utilities."""

  import base64
+ import importlib
  import json
  import logging
  import signal
@@ -261,3 +262,24 @@ def graceful_registry(sub_module_name):
          logger.info(f"{sub_module_name} recive sigterm")

      signal.signal(signal.SIGTERM, graceful_shutdown)
+
+
+ class LazyImport:
+     def __init__(self, module_name, class_name):
+         self.module_name = module_name
+         self.class_name = class_name
+         self._module = None
+
+     def _load(self):
+         if self._module is None:
+             module = importlib.import_module(self.module_name)
+             self._module = getattr(module, self.class_name)
+         return self._module
+
+     def __getattr__(self, name):
+         module = self._load()
+         return getattr(module, name)
+
+     def __call__(self, *args, **kwargs):
+         module = self._load()
+         return module(*args, **kwargs)
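`LazyImport` defers the actual module import until the wrapped class is first accessed or called, which keeps optional heavy dependencies off the package's import path. A small usage sketch (the backend module and constructor argument are assumptions for illustration, not confirmed by this diff):

```python
from sglang.utils import LazyImport

# Nothing is imported yet; the real import happens on first attribute
# access or call.
Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")

backend = Anthropic("claude-3-haiku-20240307")  # triggers the actual import
```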
sglang/version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.2.6"
+ __version__ = "0.2.7"
{sglang-0.2.6.dist-info → sglang-0.2.7.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.2.6
+ Version: 0.2.7
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
      Version 2.0, January 2004
@@ -245,6 +245,13 @@ Requires-Dist: outlines >=0.0.44 ; extra == 'srt'

  <div align="center">
  <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
+
+ [![PyPI](https://img.shields.io/pypi/v/sglang)](https://pypi.org/project/sglang)
+ ![PyPI - Downloads](https://img.shields.io/pypi/dm/sglang)
+ [![license](https://img.shields.io/github/license/sgl-project/sglang.svg)](https://github.com/sgl-project/sglang/tree/main/LICENSE)
+ [![issue resolution](https://img.shields.io/github/issues-closed-raw/sgl-project/sglang)](https://github.com/sgl-project/sglang/issues)
+ [![open issues](https://img.shields.io/github/issues-raw/sgl-project/sglang)](https://github.com/sgl-project/sglang/issues)
+
  </div>

  --------------------------------------------------------------------------------
@@ -292,7 +299,8 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/

  ### Method 2: From source
  ```
- git clone https://github.com/sgl-project/sglang.git
+ # Use the stable release branch
+ git clone -b release https://github.com/sgl-project/sglang.git
  cd sglang

  pip install --upgrade pip
@@ -341,7 +349,7 @@ curl http://localhost:30000/generate \
    }
  }'
  ```
- Learn more about the argument format [here](docs/sampling_params.md).
+ Learn more about the argument format [here](docs/en/sampling_params.md).

  ### OpenAI Compatible API
  In addition, the server supports OpenAI-compatible APIs.
@@ -388,7 +396,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  ```
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7
  ```
- - See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
+ - See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
  - Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
  ```
  # Node 0
@@ -397,7 +405,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  # Node 1
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1
  ```
- - If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/custom_chat_template.md).
+ - If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
  - To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
  - To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.
@@ -440,7 +448,7 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/
  - InternLM 2
  - Mistral NeMo

- Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).
+ Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).

  ### Benchmark Performance

@@ -671,6 +679,24 @@ for out in state.text_iter():
      print(out, end="", flush=True)
  ```

+ #### Roles
+
+ Use `sgl.system`, `sgl.user` and `sgl.assistant` to set roles when using Chat models. You can also define more complex role prompts using begin and end tokens.
+
+ ```python
+ @sgl.function
+ def chat_example(s):
+     s += sgl.system("You are a helpful assistant.")
+     # Same as: s += s.system("You are a helpful assistant.")
+
+     with s.user():
+         s += "Question: What is the capital of France?"
+
+     s += sgl.assistant_begin()
+     s += "Answer: " + sgl.gen(max_tokens=100, stop="\n")
+     s += sgl.assistant_end()
+ ```
+
  #### Tips and Implementation Details
  - The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
  - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.
sglang-0.2.7.dist-info/RECORD ADDED
@@ -0,0 +1,93 @@
+ sglang/__init__.py,sha256=ECjvAWlxIwKtUIXGchfkoCIbF-iqLjH-Q0o8xHTlVNY,1352
+ sglang/api.py,sha256=s_P8BvGDCQ0PiqOapr2TLFge1NA7QmKqUx6bFQ8Q5GQ,5676
+ sglang/bench_latency.py,sha256=JPatRvstM3nXb-ViVgtR-TaRrFHpcHzqoDG7BQmRYK8,10539
+ sglang/bench_serving.py,sha256=6DK6Ps8y6-Eb9QlbGBRlhPRTseDqVIRoDreO5GDHZ64,34846
+ sglang/check_env.py,sha256=Eeb_20VetnlEFYSRcHFlNqt85lYUQN60NEtkoX7ahPA,4121
+ sglang/global_config.py,sha256=CyhGL7PE-KlMcg7IHWykzImU1y4NQlpeIlh9lHA77uo,1749
+ sglang/launch_server.py,sha256=Gg8CwNlTCCfg1dF65ZT9ePLxOT9LKtY79GhIPG6PCrU,358
+ sglang/launch_server_llavavid.py,sha256=40uaazMsavKuk6YXFa5v37kdUpFGuealgJJeph1g8gU,1025
+ sglang/utils.py,sha256=r0Z7hY_bFFk-b6WeQJir9br-hCW2-p7n5E7Et2WziaQ,8776
+ sglang/version.py,sha256=XHypfHSPdgXFKmOdoewn7czU670gt8InhHhzlP5j_aA,22
+ sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sglang/lang/chat_template.py,sha256=psIlhaDo70twgLrx5Lgln03metLEA3-FZuixeI0Y7Ao,13309
+ sglang/lang/compiler.py,sha256=UiXUmPR9wBAPtnORrLcyQX8Uh0ZL0nKeV8ZgBozAJPw,7531
+ sglang/lang/interpreter.py,sha256=dt_NAAMv2oSYxwSMjhMr2pIGTe5_d12cSR91SUWvpCQ,30298
+ sglang/lang/ir.py,sha256=THa6hwnuTVXVYxnovNQP_o7A9v5O8uXE4eLXH9vDRLA,16648
+ sglang/lang/tracer.py,sha256=borJmlSJOhg1RUndGRnilnR60eEZz2Y9aU7BpftsOxU,8287
+ sglang/lang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sglang/lang/backend/anthropic.py,sha256=EXRX7xJgA5KZszX7toSLVnKzFQ5EO0Loj-YjHFtxSxg,2081
+ sglang/lang/backend/base_backend.py,sha256=APiMht4WYECLCOGRPCEUF6lX-an1vjVe2dWoMSgymWY,1831
+ sglang/lang/backend/litellm.py,sha256=ugmL7sfUxkUHVbHtwNzHgdQAEd4UCjNQboFuE3KThcY,2450
+ sglang/lang/backend/openai.py,sha256=6ww2rwKouWgtmjaCf4hk-kXXJ6bY6n9Xnbm3UTFZvl4,14808
+ sglang/lang/backend/runtime_endpoint.py,sha256=6iW1S62KmYyQGiWsHJFhZidK01vlIE55IsYN2tP38WQ,9202
+ sglang/lang/backend/vertexai.py,sha256=O-iBLD-y3vq80UxnrAoJri7bxpgd-_eakZ88Cf8bEGA,4855
+ sglang/srt/conversation.py,sha256=V5YuoeO6-aLqGv0p3J2qx8TnBJbN1oTopYFutNul3GQ,16491
+ sglang/srt/hf_transformers_utils.py,sha256=Fg-3panb6lsqOhHmAYA0ivkXyBjdnvY5mqvilDv2xF4,11919
+ sglang/srt/mm_utils.py,sha256=n7_GmbOM_0IWVXovpM34rKIBw0Py9yb_NXSQw27u4OA,9454
+ sglang/srt/model_config.py,sha256=DO7m84WiT3dzPWmyKz_UXDAHEdqEjq8Lq5wCjzjYMME,6023
+ sglang/srt/sampling_params.py,sha256=uZFDlTUPnNR5_3IDH-INDeN-tm6LlRkC2KT-B3njxJs,3687
+ sglang/srt/server.py,sha256=2qgluP7_6-e36PDK_mr-rLK9us3_9KvXLG3255h-tS4,16022
+ sglang/srt/server_args.py,sha256=0cV-r5QTV_9Arl3hVf9mc20BlOhYhWSkICU0T3dS180,15412
+ sglang/srt/utils.py,sha256=uIatocIFzqi6fWSscz2MjF3jUcIRBJlqLgYeicM_W9s,22950
+ sglang/srt/constrained/__init__.py,sha256=NLpZGj9RIx83ejDrM_pfaRtqGgaPq_ggJszPQENUJ2E,2037
+ sglang/srt/constrained/base_cache.py,sha256=Aeu2HMPhXMPNQNEwPJ19sECN0PYPZKjisrZiCcocHiw,1970
+ sglang/srt/constrained/fsm_cache.py,sha256=Q7wfGx7VOghErqcC_0kK4aI-lBEO9TxoFPyUiBxEGVE,2626
+ sglang/srt/constrained/jump_forward.py,sha256=SYKj5Pd3d7oym5fAI8zUzj3zKk-lV30m_ksAy0ubgO8,6180
+ sglang/srt/layers/context_flashattention_nopad.py,sha256=r_TpHuYAVgq1pN81PiWe1bebtY-p9MBndBaoIE2VXrk,5180
+ sglang/srt/layers/extend_attention.py,sha256=zuNnAdL_wF6BX0Mwn1dgDJvh3YJjYwqa5Fbzp8muOVc,12573
+ sglang/srt/layers/fused_moe.py,sha256=KmyXwau2OOZpQimGIQrHptzGNs1trIud5AKEEKXdzPU,20823
+ sglang/srt/layers/linear.py,sha256=3Se2FRXyqXcd-uvNx2b7s-jolsUTEVeYBMYHmV82wPw,34518
+ sglang/srt/layers/logits_processor.py,sha256=JE0NYlQniy9wmPeIKs3QbYbpaXqAoNtVdEPkV_qt59I,11076
+ sglang/srt/layers/radix_attention.py,sha256=tdA-kdd9LQY1wbw3iYuy-9cikVJYmy3EctwAlUfN-Uo,6945
+ sglang/srt/layers/token_attention.py,sha256=ylUqUnozJCCohxTGAiiP3sxgUrcXfEVic8-qgcHYDj4,7968
+ sglang/srt/layers/quantization/__init__.py,sha256=JMlgE-FWS759lfQ9Uc6mGFqBbTFLlvKeVEFpZLATe14,2536
+ sglang/srt/layers/quantization/fp8.py,sha256=GQOLeGbrcUfwO-7oClzDda0RXGPHR70ZXUHArZsa174,25511
+ sglang/srt/managers/controller_multi.py,sha256=LYI-XE9h57DW8Uh4gpd8upsC3p2dd5weKzddEH274jg,6626
+ sglang/srt/managers/controller_single.py,sha256=CdQ9_XPZdcWF5jArDmVR8K-WZ9_8Gpgk4SwANKxTX-Y,5112
+ sglang/srt/managers/detokenizer_manager.py,sha256=GXWdW4n2N-otL3zcgdr0t1PcEe2EmQJA8AElntiNV1o,5606
+ sglang/srt/managers/io_struct.py,sha256=Rz7Ur9Yw6prDGdy6XjsSiUmVBccS6cef-G_9TW7HA_4,7105
+ sglang/srt/managers/policy_scheduler.py,sha256=ajSB-gCC6VJkXvnKU8FYU3Kgcigozp2pMTwF84Wp14o,3138
+ sglang/srt/managers/schedule_batch.py,sha256=tbos5i4KSfk1K8VH5HCNm2pQGlJMKVAE_mZ8haVMelc,36620
+ sglang/srt/managers/tokenizer_manager.py,sha256=tEct3shjjw_7ickj_cmt9IxoBHfgbryQHI7DZS0m4TA,20511
+ sglang/srt/managers/tp_worker.py,sha256=91gbWi7hSuyTC3Qvo7EXKmHM6GKWTK0Nqpda001jOw0,34349
+ sglang/srt/mem_cache/flush_cache.py,sha256=pTLKPRB17U6vl5RFJJvuJ4jCL2SyomgkUBNlkDpGRqo,978
+ sglang/srt/mem_cache/memory_pool.py,sha256=wkhjyYLbAZrl2FB5i4ODkxgMufBuDpe4N0kbXhu6ZO0,4509
+ sglang/srt/mem_cache/radix_cache.py,sha256=Xk0c8nwyPHEUsobVJQrr7edwyzUMk9MBYTQBprN8a0Y,8775
+ sglang/srt/model_executor/cuda_graph_runner.py,sha256=OdmO6R7nHWrRJCtZOxYkt0KNdGoX7Md4knsypwPYjaQ,9365
+ sglang/srt/model_executor/model_runner.py,sha256=WyPsO73MD3ziKAk76j4HemePYZluXjs9WGYeajUgfQA,15507
+ sglang/srt/model_loader/model_loader.py,sha256=QmZUhHh1nmWrfYlunfnxMcTsIvip1l6aMIlrXoCED4I,10697
+ sglang/srt/model_loader/utils.py,sha256=0AoWXX9uV5rKRYXJ4HduSnvdeerytI4ONCLCH6X4XFQ,10675
+ sglang/srt/models/chatglm.py,sha256=vYWooqyPmcSFZNjxj_g5I_FgHJlDytbEiz6vyv3JBNM,13856
+ sglang/srt/models/commandr.py,sha256=gaTI77hgitPlcUNyxMEdGu_XZQj2DuAMnh3KbZQ9HFg,14166
+ sglang/srt/models/dbrx.py,sha256=LQu7I2KH-XzY9iBlaK7IQsM1o3kzsuI1vTCspK2C19o,14655
+ sglang/srt/models/deepseek.py,sha256=adr57ZX6aPOBOpmvm7YIvoqo6u0jdrKJPZ8SGcVXAh8,16014
+ sglang/srt/models/deepseek_v2.py,sha256=9CORl-IroSguYPX3wz_aGe7mFoUE7cQRMs7CgbkBYLk,20087
+ sglang/srt/models/gemma.py,sha256=PMPI1-WLuLdk6e7u6I9d_LoCkauLkWY3aOP8MFEZ-sI,12279
+ sglang/srt/models/gemma2.py,sha256=kTjZcsptgtYaO8BL_NlygjVSMSloq2Mc4Rf3FKvEhbs,16420
+ sglang/srt/models/gpt_bigcode.py,sha256=U7GmHKywSu12D-EwvuWv3RwHkx6bPawaRIjlFIpQkfs,10194
+ sglang/srt/models/grok.py,sha256=NfZdsRVErDIUWFqjhtNf2pqC9G4cRdYHBFpgDq1IZ2A,27855
+ sglang/srt/models/internlm2.py,sha256=Ld2GUxZeqqqJ2vd4QiX2s1y2AceJLA1nVnUYY88GMQk,12219
+ sglang/srt/models/llama2.py,sha256=zhoCUh_3dNC7FOzDnaoHcHF3-y7vTVYDZzHKqIsUJgs,14764
+ sglang/srt/models/llama_classification.py,sha256=4r_orFZqBR3U_yC4bus1K3Z3-ADscYGSzgA82_VDN0g,4926
+ sglang/srt/models/llava.py,sha256=BJphgyQGdo7uTpJcKGEfWwdpH9GTMDnyiznLSSgmvm8,18476
+ sglang/srt/models/llavavid.py,sha256=-7vaVqaIfukCvMkNakEPblpwjIHC6ezrAvmpE5RzlUY,13602
+ sglang/srt/models/minicpm.py,sha256=Mj-dbhfN7li7cTEP-0sV7i5PSYkMGIaYCqRU7eDc-BY,13837
+ sglang/srt/models/mistral.py,sha256=jlrWBVNXbAUziAaIdHAjFcOJnKtn9Bl8rBd65ypJM-I,819
+ sglang/srt/models/mixtral.py,sha256=QiswCUdZ4VwMghtrr_vGP_dkzxSCrcUIcBgjlOZh_Ao,21391
+ sglang/srt/models/mixtral_quant.py,sha256=I1sIdistZHw7GO35qvlteA16DGVtME5rvEVV86v0-7Y,14216
+ sglang/srt/models/qwen.py,sha256=xAtlWyhMkcfwocRqzZoH01qKbkohXxAf4tnkPh0xtpM,10000
+ sglang/srt/models/qwen2.py,sha256=mXlVd6UTCXY3VdgodFpQnlaY-NYLIbA-SknxdA9R13w,12278
+ sglang/srt/models/qwen2_moe.py,sha256=YYdJEezic7GyW-_bXlNIaqBa0C4IHQpz_vuRBLxms4k,18141
+ sglang/srt/models/stablelm.py,sha256=b3d-ZwLQoLjZ6CupnkIq7d-z9tzGSxAyIcgSmZiZxZw,11362
+ sglang/srt/models/yivl.py,sha256=p4s_D_m4H2exP4b91Y-CTkq8T-eIG3DJsFy9pB0e7TM,4932
+ sglang/srt/openai_api/adapter.py,sha256=Jn8Awi93zkb3Wq5gqK698kOhmqYdtxZlRePciA50Ud4,30213
+ sglang/srt/openai_api/protocol.py,sha256=_mBNdxb_4ZRIeP0wmW8tMTc2x7zu4foVxBDCuCWkaiw,7822
+ sglang/test/test_conversation.py,sha256=gF_AyOxQgpPQBPnA57-kq-M0p_zFu-rBDMFgAq655Rw,1596
+ sglang/test/test_openai_protocol.py,sha256=DVx3r6hrb8oRqbo5AYIleldxbqMBTtb-gtORM6t_Y1c,1661
+ sglang/test/test_programs.py,sha256=s4WGpTmYP4Yx5g8JYZpbkeF9RN5iUnlKdi8FGAZovTc,13756
+ sglang/test/test_utils.py,sha256=kD_fQe3WroZ9Kc3NBRKPiZOFJ_JD2uEE9XIvPp6AD9Y,11048
+ sglang-0.2.7.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ sglang-0.2.7.dist-info/METADATA,sha256=NU4S55-t6q87AKPkgbDORvX_Om0XbAJ9K67_p30JnQ0,33216
+ sglang-0.2.7.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
+ sglang-0.2.7.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+ sglang-0.2.7.dist-info/RECORD,,
{sglang-0.2.6.dist-info → sglang-0.2.7.dist-info}/WHEEL CHANGED
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (71.1.0)
+ Generator: setuptools (72.1.0)
  Root-Is-Purelib: true
  Tag: py3-none-any

sglang/srt/flush_cache.py DELETED
@@ -1,18 +0,0 @@
- """
- Flush the KV cache.
-
- Usage:
- python3 -m sglang.srt.flush_cache --url http://localhost:30000
- """
-
- import argparse
-
- import requests
-
- if __name__ == "__main__":
-     parser = argparse.ArgumentParser()
-     parser.add_argument("--url", type=str, default="http://localhost:30000")
-     args = parser.parse_args()
-
-     response = requests.get(args.url + "/flush_cache")
-     assert response.status_code == 200
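Per entry 33 in the file list, this utility was not removed outright but relocated to sglang/srt/mem_cache/flush_cache.py, so the invocation presumably becomes `python3 -m sglang.srt.mem_cache.flush_cache --url http://localhost:30000`. The script is a thin wrapper over an HTTP GET, so the same effect can be had directly (a sketch assuming a server on the default port):

```python
import requests

# Flush the KV cache of a running server; mirrors the relocated script's logic.
response = requests.get("http://localhost:30000/flush_cache")
assert response.status_code == 200
```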