sglang 0.4.1.post1__py3-none-any.whl → 0.4.1.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. sglang/bench_offline_throughput.py +1 -0
  2. sglang/srt/configs/model_config.py +11 -2
  3. sglang/srt/layers/attention/__init__.py +0 -1
  4. sglang/srt/layers/attention/flashinfer_backend.py +54 -41
  5. sglang/srt/layers/logits_processor.py +30 -2
  6. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +46 -26
  7. sglang/srt/layers/quantization/fp8.py +42 -2
  8. sglang/srt/layers/quantization/fp8_kernel.py +77 -18
  9. sglang/srt/layers/quantization/fp8_utils.py +8 -2
  10. sglang/srt/managers/io_struct.py +29 -8
  11. sglang/srt/managers/schedule_batch.py +22 -15
  12. sglang/srt/managers/scheduler.py +60 -20
  13. sglang/srt/managers/session_controller.py +102 -27
  14. sglang/srt/managers/tokenizer_manager.py +41 -10
  15. sglang/srt/managers/tp_worker.py +7 -0
  16. sglang/srt/managers/tp_worker_overlap_thread.py +5 -0
  17. sglang/srt/model_executor/forward_batch_info.py +42 -3
  18. sglang/srt/model_executor/model_runner.py +4 -0
  19. sglang/srt/models/llama.py +11 -0
  20. sglang/srt/models/llama_eagle.py +132 -0
  21. sglang/srt/openai_api/adapter.py +60 -2
  22. sglang/srt/openai_api/protocol.py +48 -0
  23. sglang/srt/server.py +26 -3
  24. sglang/srt/server_args.py +17 -30
  25. sglang/srt/speculative/spec_info.py +19 -0
  26. sglang/srt/utils.py +62 -0
  27. sglang/version.py +1 -1
  28. {sglang-0.4.1.post1.dist-info → sglang-0.4.1.post2.dist-info}/METADATA +3 -3
  29. {sglang-0.4.1.post1.dist-info → sglang-0.4.1.post2.dist-info}/RECORD +32 -30
  30. {sglang-0.4.1.post1.dist-info → sglang-0.4.1.post2.dist-info}/LICENSE +0 -0
  31. {sglang-0.4.1.post1.dist-info → sglang-0.4.1.post2.dist-info}/WHEEL +0 -0
  32. {sglang-0.4.1.post1.dist-info → sglang-0.4.1.post2.dist-info}/top_level.txt +0 -0
@@ -257,6 +257,34 @@ class ResponseFormat(BaseModel):
257
257
  json_schema: Optional[JsonSchemaResponseFormat] = None
258
258
 
259
259
 
260
+ class Function(BaseModel):
261
+ """Function descriptions."""
262
+
263
+ description: Optional[str] = Field(default=None, examples=[None])
264
+ name: str
265
+ parameters: Optional[object] = None
266
+
267
+
268
+ class Tool(BaseModel):
269
+ """Function wrapper."""
270
+
271
+ type: str = Field(default="function", examples=["function"])
272
+ function: Function
273
+
274
+
275
+ class ToolChoiceFuncName(BaseModel):
276
+ """The name of tool choice function."""
277
+
278
+ name: str
279
+
280
+
281
+ class ToolChoice(BaseModel):
282
+ """The tool choice definition."""
283
+
284
+ function: ToolChoiceFuncName
285
+ type: Literal["function"] = Field(default="function", examples=["function"])
286
+
287
+
260
288
  class ChatCompletionRequest(BaseModel):
261
289
  # Ordered by official OpenAI API documentation
262
290
  # https://platform.openai.com/docs/api-reference/chat/create
@@ -277,6 +305,10 @@ class ChatCompletionRequest(BaseModel):
277
305
  temperature: float = 0.7
278
306
  top_p: float = 1.0
279
307
  user: Optional[str] = None
308
+ tools: Optional[List[Tool]] = Field(default=None, examples=[None])
309
+ tool_choice: Union[ToolChoice, Literal["auto", "required", "none"]] = Field(
310
+ default="auto", examples=["none"]
311
+ ) # noqa
280
312
 
281
313
  # Extra parameters for SRT backend only and will be ignored by OpenAI models.
282
314
  top_k: int = -1
@@ -292,9 +324,25 @@ class ChatCompletionRequest(BaseModel):
292
324
  ebnf: Optional[str] = None
293
325
 
294
326
 
327
+ class FunctionResponse(BaseModel):
328
+ """Function response."""
329
+
330
+ name: str
331
+ arguments: str
332
+
333
+
334
+ class ToolCall(BaseModel):
335
+ """Tool call response."""
336
+
337
+ id: str
338
+ type: Literal["function"] = "function"
339
+ function: FunctionResponse
340
+
341
+
295
342
  class ChatMessage(BaseModel):
296
343
  role: Optional[str] = None
297
344
  content: Optional[str] = None
345
+ tool_calls: Optional[List[ToolCall]] = Field(default=None, examples=[None])
298
346
 
299
347
 
300
348
  class ChatCompletionResponseChoice(BaseModel):
sglang/srt/server.py CHANGED
@@ -57,6 +57,7 @@ from sglang.srt.managers.io_struct import (
57
57
  OpenSessionReqInput,
58
58
  UpdateWeightFromDiskReqInput,
59
59
  UpdateWeightsFromDistributedReqInput,
60
+ UpdateWeightsFromTensorReqInput,
60
61
  )
61
62
  from sglang.srt.managers.scheduler import run_scheduler_process
62
63
  from sglang.srt.managers.tokenizer_manager import TokenizerManager
@@ -109,6 +110,7 @@ app.add_middleware(
109
110
  tokenizer_manager: TokenizerManager = None
110
111
  scheduler_info: Dict = None
111
112
 
113
+
112
114
  ##### Native API endpoints #####
113
115
 
114
116
 
@@ -257,6 +259,10 @@ async def open_session(obj: OpenSessionReqInput, request: Request):
257
259
  """Open a session, and return its unique session id."""
258
260
  try:
259
261
  session_id = await tokenizer_manager.open_session(obj, request)
262
+ if session_id is None:
263
+ raise Exception(
264
+ "Failed to open the session. Check if a session with the same id is still open."
265
+ )
260
266
  return session_id
261
267
  except Exception as e:
262
268
  return _create_error_response(e)
@@ -484,7 +490,16 @@ def launch_engine(
484
490
  # Wait for model to finish loading
485
491
  scheduler_infos = []
486
492
  for i in range(len(scheduler_pipe_readers)):
487
- data = scheduler_pipe_readers[i].recv()
493
+ try:
494
+ data = scheduler_pipe_readers[i].recv()
495
+ except EOFError as e:
496
+ logger.exception(e)
497
+ logger.error(
498
+ f"Rank {i} scheduler is dead. Please check if there are relevant logs."
499
+ )
500
+ scheduler_procs[i].join()
501
+ logger.error(f"Exit code: {scheduler_procs[i].exitcode}")
502
+ raise
488
503
 
489
504
  if data["status"] != "ready":
490
505
  raise RuntimeError(
@@ -492,7 +507,7 @@ def launch_engine(
492
507
  )
493
508
  scheduler_infos.append(data)
494
509
 
495
- # Assume all schedulers have same max_total_num_tokens
510
+ # Assume all schedulers have same scheduler_info
496
511
  scheduler_info = scheduler_infos[0]
497
512
 
498
513
 
@@ -857,6 +872,14 @@ class Engine:
857
872
  tokenizer_manager.update_weights_from_distributed(obj, None)
858
873
  )
859
874
 
875
+ def update_weights_from_tensor(self, name, tensor):
876
+ """Update weights from a tensor."""
877
+ obj = UpdateWeightsFromTensorReqInput(name=name, tensor=tensor)
878
+ loop = asyncio.get_event_loop()
879
+ return loop.run_until_complete(
880
+ tokenizer_manager.update_weights_from_tensor(obj, None)
881
+ )
882
+
860
883
  def get_weights_by_name(self, name, truncate_size=100):
861
884
  """Get weights by parameter name."""
862
885
  obj = GetWeightsByNameReqInput(name=name, truncate_size=truncate_size)
@@ -871,7 +894,7 @@ class Runtime:
871
894
  using the command line interface.
872
895
 
873
896
  It is mainly used for the frontend language.
874
- You should use the Engine class if you want to do normal offline processing.
897
+ You should use the Engine class above if you want to do normal offline processing.
875
898
  """
876
899
 
877
900
  def __init__(
sglang/srt/server_args.py CHANGED
@@ -55,7 +55,7 @@ class ServerArgs:
55
55
  is_embedding: bool = False
56
56
  revision: Optional[str] = None
57
57
 
58
- # Port
58
+ # Port for the HTTP server
59
59
  host: str = "127.0.0.1"
60
60
  port: int = 30000
61
61
 
@@ -68,6 +68,7 @@ class ServerArgs:
68
68
  schedule_policy: str = "lpm"
69
69
  schedule_conservativeness: float = 1.0
70
70
  cpu_offload_gb: int = 0
71
+ prefill_only_one_req: bool = False
71
72
 
72
73
  # Other runtime options
73
74
  tp_size: int = 1
@@ -94,6 +95,7 @@ class ServerArgs:
94
95
  # Data parallelism
95
96
  dp_size: int = 1
96
97
  load_balance_method: str = "round_robin"
98
+
97
99
  # Expert parallelism
98
100
  ep_size: int = 1
99
101
 
@@ -217,6 +219,13 @@ class ServerArgs:
217
219
  )
218
220
  self.disable_cuda_graph = True
219
221
 
222
+ # Expert parallelism
223
+ if self.enable_ep_moe:
224
+ self.ep_size = self.tp_size
225
+ logger.info(
226
+ f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
227
+ )
228
+
220
229
  # Others
221
230
  if self.enable_dp_attention:
222
231
  self.dp_size = self.tp_size
@@ -229,12 +238,6 @@ class ServerArgs:
229
238
  "Data parallel size is adjusted to be the same as tensor parallel size. "
230
239
  "Overlap scheduler is disabled."
231
240
  )
232
- # Expert parallelism
233
- if self.enable_ep_moe:
234
- self.ep_size = self.tp_size
235
- logger.info(
236
- f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
237
- )
238
241
 
239
242
  # GGUF
240
243
  if (
@@ -430,13 +433,18 @@ class ServerArgs:
430
433
  default=ServerArgs.schedule_conservativeness,
431
434
  help="How conservative the schedule policy is. A larger value means more conservative scheduling. Use a larger value if you see requests being retracted frequently.",
432
435
  )
433
-
434
436
  parser.add_argument(
435
437
  "--cpu-offload-gb",
436
438
  type=int,
437
439
  default=ServerArgs.cpu_offload_gb,
438
440
  help="How many GBs of RAM to reserve for CPU offloading",
439
441
  )
442
+ parser.add_argument(
443
+ "--prefill-only-one-req",
444
+ type=bool,
445
+ help="If true, we only prefill one request at one prefill batch",
446
+ default=ServerArgs.prefill_only_one_req,
447
+ )
440
448
 
441
449
  # Other runtime options
442
450
  parser.add_argument(
@@ -555,6 +563,7 @@ class ServerArgs:
555
563
  "shortest_queue",
556
564
  ],
557
565
  )
566
+
558
567
  # Expert parallelism
559
568
  parser.add_argument(
560
569
  "--expert-parallel-size",
@@ -777,28 +786,6 @@ class ServerArgs:
777
786
  help="Delete the model checkpoint after loading the model.",
778
787
  )
779
788
 
780
- # Deprecated arguments
781
- parser.add_argument(
782
- "--enable-overlap-schedule",
783
- action=DeprecatedAction,
784
- help="'--enable-overlap-schedule' is deprecated. It is enabled by default now. Please drop this argument.",
785
- )
786
- parser.add_argument(
787
- "--disable-flashinfer",
788
- action=DeprecatedAction,
789
- help="'--disable-flashinfer' is deprecated. Please use '--attention-backend triton' instead.",
790
- )
791
- parser.add_argument(
792
- "--disable-flashinfer-sampling",
793
- action=DeprecatedAction,
794
- help="'--disable-flashinfer-sampling' is deprecated. Please use '--sampling-backend pytroch' instead.",
795
- )
796
- parser.add_argument(
797
- "--disable-disk-cache",
798
- action=DeprecatedAction,
799
- help="'--disable-disk-cache' is deprecated. Please use '--disable-outlines-disk-cache' instead.",
800
- )
801
-
802
789
  @classmethod
803
790
  def from_cli_args(cls, args: argparse.Namespace):
804
791
  args.tp_size = args.tensor_parallel_size
@@ -0,0 +1,19 @@
1
+ from enum import IntEnum, auto
2
+
3
+
4
+ class SpeculativeAlgorithm(IntEnum):
5
+ EAGLE = auto()
6
+
7
+ def is_eagle(self):
8
+ return self == SpeculativeAlgorithm.EAGLE
9
+
10
+ @staticmethod
11
+ def from_string(name: str):
12
+ name_map = {
13
+ "EAGLE": SpeculativeAlgorithm.EAGLE,
14
+ }
15
+ return name_map[name]
16
+
17
+
18
+ class SpecInfo:
19
+ pass
sglang/srt/utils.py CHANGED
@@ -1273,3 +1273,65 @@ def dataclass_to_string_truncated(data, max_length=2048):
1273
1273
  )
1274
1274
  else:
1275
1275
  return str(data)
1276
+
1277
+
1278
+ TOOLS_TAG_LIST = ["<|plugin|>", "<function=", "<tool_call>", "<|python_tag|>"]
1279
+
1280
+
1281
+ def parse_tool_response(text, tools, **kwargs):
1282
+ """Parse model response containing tool information.
1283
+
1284
+ Args:
1285
+ text(str): model response in string format
1286
+ tools(List): tools from user request
1287
+ """
1288
+ if "<|plugin|>" in text: # internlm2
1289
+ text, action = text.split("<|action_start|><|plugin|>")
1290
+ action = action.split("<|action_end|>".strip())[0]
1291
+ action = action[action.find("{") :]
1292
+ action = json.loads(action)
1293
+ name, parameters = action["name"], json.dumps(
1294
+ action.get("parameters", action.get("arguments", {})), ensure_ascii=False
1295
+ )
1296
+ call_info_list = [(name, parameters)]
1297
+ elif "<function=" in text: # llama3.1
1298
+ action, _ = text.split("</function>")
1299
+ parameters = action[action.find("{") :]
1300
+ name = action.split("<function=")[1].split(">{")[0]
1301
+ call_info_list = [(name, parameters)]
1302
+ elif "<tool_call>" in text and "</tool_call>" in text: # qwen2.5
1303
+ # get tool_call in text
1304
+ pattern = r"<tool_call>(.*?)</tool_call>"
1305
+ match_result_list = re.findall(pattern, text, re.DOTALL)
1306
+ call_info_list = []
1307
+ for match_result in match_result_list:
1308
+ action = json.loads(match_result)
1309
+ call_info_list.append(
1310
+ (action["name"], json.dumps(action["arguments"], ensure_ascii=False))
1311
+ )
1312
+ # get text outside of tags
1313
+ if not text.startswith("<tool_call>"):
1314
+ text = text[: text.find("<tool_call>")]
1315
+ elif not text.endswith("</tool_call>"):
1316
+ text = text[text.rfind("</tool_call>") + len("</tool_call>") :]
1317
+ else:
1318
+ text = ""
1319
+ elif "<|python_tag|>" in text: # llama3.2
1320
+ _, action = text.split("<|python_tag|>")
1321
+ action = json.loads(action)
1322
+ name, parameters = action["name"], json.dumps(
1323
+ action.get("parameters", action.get("arguments", {})), ensure_ascii=False
1324
+ )
1325
+ call_info_list = [(name, parameters)]
1326
+ else:
1327
+ raise RuntimeError(f"Unexpected model response: {text}")
1328
+
1329
+ call_info_list = [
1330
+ (
1331
+ [tool.function.name for tool in tools].index(call_info[0]),
1332
+ call_info[0],
1333
+ call_info[1],
1334
+ )
1335
+ for call_info in call_info_list
1336
+ ]
1337
+ return text, call_info_list
sglang/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.4.1.post1"
1
+ __version__ = "0.4.1.post2"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sglang
3
- Version: 0.4.1.post1
3
+ Version: 0.4.1.post2
4
4
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -351,14 +351,14 @@ The core features include:
351
351
  - [Backend: SGLang Runtime (SRT)](https://sgl-project.github.io/backend/backend.html)
352
352
  - [Frontend: Structured Generation Language (SGLang)](https://sgl-project.github.io/frontend/frontend.html)
353
353
 
354
- ## Benchmark And Performance
354
+ ## Benchmark and Performance
355
355
  Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/), [v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)
356
356
 
357
357
  ## Roadmap
358
358
  [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
359
359
 
360
360
  ## Adoption and Sponsorship
361
- The project is supported by (alphabetically): AMD, Baseten, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI, 01.AI.
361
+ The project is supported by (alphabetically): AMD, Baseten, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS.org, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.
362
362
 
363
363
  ## Acknowledgment and Citation
364
364
  We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
@@ -1,7 +1,7 @@
1
1
  sglang/__init__.py,sha256=b2oIdWzp5P8SzieeOs2TzJoN3Do3tfJbV8gZS_imVcs,1619
2
2
  sglang/api.py,sha256=NdO6cYnklnEBQBKqQjlqI8-P1EownKQ71t5ibCGhEVo,6953
3
3
  sglang/bench_latency.py,sha256=oZjSAzX7dUiSu-zdz0dkyUPo-qAX_lsXFH1gf03akgI,76
4
- sglang/bench_offline_throughput.py,sha256=iQiJCK3KQDCdwU1NVbIwbtthssWzBXiIsKUDA7Z_hO0,12510
4
+ sglang/bench_offline_throughput.py,sha256=r-uBvpnx-30mAnVwQB4WlqiXxy2fn5a1NUARwZcaIo4,12533
5
5
  sglang/bench_one_batch.py,sha256=jkyMhK0lqn5dRCYgAh30qZrNHP4gAbXODymBMNXK86I,15859
6
6
  sglang/bench_one_batch_server.py,sha256=-fV9FTLNNcSIy0pgYeggXedPVK0fVsXZqVQswT8OMOY,5945
7
7
  sglang/bench_serving.py,sha256=YQiCZreejCPBTqMmZsCB99RMi1N-Jx-dZtaafcQ8-14,53377
@@ -11,7 +11,7 @@ sglang/launch_server.py,sha256=4y2QeSj0wVNB9MJQZeahD4ahTDU6gwqo7MPUytyFop0,403
11
11
  sglang/launch_server_llavavid.py,sha256=tGc17S1vUfLwbi1GB26oOdXxTWr7gjlqpTrPnrMRNO8,1007
12
12
  sglang/llama3_eval.py,sha256=gWSboDchIGybIce88bJlrCG0yiLZ513mw4gcutJlzGM,10017
13
13
  sglang/utils.py,sha256=23jf4Mz8E5p5a6JOkjnfYZixdjZUk88F_mZ8rZcby5Q,11597
14
- sglang/version.py,sha256=ARioq8ApVNckeQorLPVfHZeN9mlHMLbaNgLGNbGq-ys,28
14
+ sglang/version.py,sha256=1g4t88smWYt1DD5SJZdzXI13OPbsQPOEMX9twA4wq6A,28
15
15
  sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
16
  sglang/lang/chat_template.py,sha256=cnfjjxIIcYRGRxXlJlOGnpFxFuhMHut7DS52LsOMKcA,15826
17
17
  sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
@@ -32,14 +32,14 @@ sglang/srt/conversation.py,sha256=u9zFU8aMYzwHUbQRKU76B_T-jfLlPoxUcWG_nRbDM2I,21
32
32
  sglang/srt/hf_transformers_utils.py,sha256=38Ms0H2-VMerOS6jnczcFtZMS6lhw9B5rSWKAfxVUfQ,7945
33
33
  sglang/srt/mm_utils.py,sha256=1ScBunw_x4W8ebM_AcJ62-1T2mfT8NlMJqdAhkF1lb0,12367
34
34
  sglang/srt/model_parallel.py,sha256=eLXZhvJ4wG6dh0FontNCIdVZvHYdWgaeY-5cu7TD9tE,6078
35
- sglang/srt/server.py,sha256=vDucJl6qtEK2swzPJ_wYitaJvsI4MigMagGlBlH5V54,34033
36
- sglang/srt/server_args.py,sha256=LgnQ-kBJZ3E7hMMZj9bSK0mn7Bhjk1nJHxLcxl-lGTM,34572
37
- sglang/srt/utils.py,sha256=J8kFl6kDBwFZCM6AKaVTiqdhJKRg0JOH0pNrD1ZeWmM,41726
35
+ sglang/srt/server.py,sha256=sDERAZlRa6OTaUk-SfW5aKJbPui1COpPG34HDlMHMNc,34916
36
+ sglang/srt/server_args.py,sha256=lBMOME7OSqG0Opinsin-QsioNrWpp2M0ZosB8cXoRrY,33917
37
+ sglang/srt/utils.py,sha256=i8MjcaSQjPPfPZ0txufTtqLr4Q7YhHQ86L1i9j-y5yY,44131
38
38
  sglang/srt/configs/__init__.py,sha256=_usVIXHQjft4PAJ1Y-yGQOn2QNOv501GYMlQwpGXbns,208
39
39
  sglang/srt/configs/device_config.py,sha256=dResqHjkg_dq10v6rnVpbXpvABZRB0jylOm-2_JAnx0,428
40
40
  sglang/srt/configs/exaone.py,sha256=Duxd4yQoKy8GWEzZD_kCY_OzmN_67CTJL_Kgn0eXk3g,10731
41
41
  sglang/srt/configs/load_config.py,sha256=TcPi_HY6xu5SiVZsxPOoB5pGeDUNebOk7muoUH9VBDg,3083
42
- sglang/srt/configs/model_config.py,sha256=vVarlLTw9Ged1PXIwRP-R8UhiG6oaezNIZhTNuF0eQc,16070
42
+ sglang/srt/configs/model_config.py,sha256=QP_6WaWMrE4NNF-XODRomiQPO0FABmVZIj5A-qJfnYg,16427
43
43
  sglang/srt/configs/qwen2vl.py,sha256=ZjLy9v2eZY4wptUfY3CWgYKg2B5DDrkfCSyTy_Zf_bg,4351
44
44
  sglang/srt/constrained/__init__.py,sha256=UWZNVLvOT5ZBX8M36sONgDmnKtkQ0cSfhQD2jO0ATuk,786
45
45
  sglang/srt/constrained/base_grammar_backend.py,sha256=FhVm7PxhXDl0joV9NP5RjKgz7dR1dZvUAQnh0mdtvVY,2353
@@ -63,16 +63,16 @@ sglang/srt/layers/activation.py,sha256=EboMjT9HV2tNHQ6rzpojtlkzev1lAFbhQlxMg9hwx
63
63
  sglang/srt/layers/custom_op_util.py,sha256=0vu-yX2wwonmO1L_o5G7SA6C-8XuhDIh9rPDvNeLhoc,922
64
64
  sglang/srt/layers/layernorm.py,sha256=nRQ1w1xSUcU-zlqVC61BnGG6otS5W1w9VaSzeXizrx4,4037
65
65
  sglang/srt/layers/linear.py,sha256=KyRFU0VcoNuN-hnQB9QQcBN9NCpeqPtLzzufIHUpV6w,47064
66
- sglang/srt/layers/logits_processor.py,sha256=JlOU0x8vBGIuTwHSdjR6Kly9_uzilBMv0NE_rvUx0W4,14747
66
+ sglang/srt/layers/logits_processor.py,sha256=Imh-qY1D9J80DZVSVV0LfTiHMEw6oQ3JbY9lXxPZAXE,15656
67
67
  sglang/srt/layers/pooler.py,sha256=rj2lygvleBnyLCBZ8I11HGMgpfIDsT0l3PIkshJwdu4,1606
68
68
  sglang/srt/layers/radix_attention.py,sha256=E4cmvkcCdCtb6VyLNrCKy1D6VwHQ063oH3JQXPaRy6w,2178
69
69
  sglang/srt/layers/rotary_embedding.py,sha256=29tx3JNR40AoXqBa2cFGBjva9vU2xgFipETlpMaaZas,3985
70
70
  sglang/srt/layers/sampler.py,sha256=k4Op_HMkQfT7t9wgQwBVotfTUXEocrzRyQqEFnff1pc,5511
71
71
  sglang/srt/layers/torchao_utils.py,sha256=dQVuWNXxAvOPjr2G5BBMWqC2oKcS2B52rx-fEc_elmc,3545
72
72
  sglang/srt/layers/vocab_parallel_embedding.py,sha256=slGwLiWjuFLCUdRe-GTlfumyZpqVX9VF6No_UGOT-hA,21624
73
- sglang/srt/layers/attention/__init__.py,sha256=KIJhzOJWYioQE7Va4D83-V-ZUZVMZcczuNgDC3dlSRo,2583
73
+ sglang/srt/layers/attention/__init__.py,sha256=lNLfWqePc5NMej-AcXl97vxVXsxQOgP7dNNb2ibyUWI,2562
74
74
  sglang/srt/layers/attention/double_sparsity_backend.py,sha256=RQdEKRykSLf9ilnaHmR6T7RFqh4emH_adfB3aJN2BUU,10920
75
- sglang/srt/layers/attention/flashinfer_backend.py,sha256=umD1E2zvMnPbbgvx2Ex5LQB6a4a41brjsks1M0gFMMU,26357
75
+ sglang/srt/layers/attention/flashinfer_backend.py,sha256=8nH4EIEXvNk9yZVl7mSn78w5Dli5UiWL-ZCeYykG9HI,27280
76
76
  sglang/srt/layers/attention/torch_native_backend.py,sha256=nQdeqWEMMH_wrod5wssDCJG-uPKm0uslvkALKqPRPQ8,10509
77
77
  sglang/srt/layers/attention/triton_backend.py,sha256=-TobyZHwlbJ5HhbFg-jgCqVOw4Y-opgEuFo-EusASQc,6264
78
78
  sglang/srt/layers/attention/triton_ops/decode_attention.py,sha256=oJ_UK1t229zF3hbTDiQe7t-X-IbM2dOxx4U2ch-vmjA,17847
@@ -85,27 +85,27 @@ sglang/srt/layers/moe/ep_moe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
85
85
  sglang/srt/layers/moe/ep_moe/kernels.py,sha256=wb_S2qLxoWWgQu9coXy0XLNGvHzdZSdwXr0PGy4QySg,10940
86
86
  sglang/srt/layers/moe/ep_moe/layer.py,sha256=6iQU5ZjQ8IXGoQ8ZlBuJqyQxYTEem9vXI6rbVIWKlZw,22303
87
87
  sglang/srt/layers/moe/fused_moe_triton/__init__.py,sha256=h9yMFAL_bagUf-qBED8gSWdCOb7d8IdA-pE-L_nIg8E,842
88
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py,sha256=zXwWUtthLa9E35EvlQ9A_mnIsQyA0_NYKsUBdJqONHo,31163
88
+ sglang/srt/layers/moe/fused_moe_triton/fused_moe.py,sha256=KvOy544x_4nRqg50o5YHQpHvF8TUD7q9LXDAWPGJlAA,31796
89
89
  sglang/srt/layers/moe/fused_moe_triton/layer.py,sha256=BclDj5JyCy-8Bfue4broL1-IG6a4dUyggE9WQLa06sg,20575
90
90
  sglang/srt/layers/quantization/__init__.py,sha256=VPYXShHvbvkOgVBlkIqic4RhdJ1y6EZ3r34T-nZMT1k,4606
91
91
  sglang/srt/layers/quantization/base_config.py,sha256=daK9p0aijMszLUm1W4Pc33FK87MdqYK1NoWFKif-j80,4599
92
- sglang/srt/layers/quantization/fp8.py,sha256=wNnpXLroIl7D98mlfCiXZPE9hrP5ricHrXY1WZBzEEo,30810
93
- sglang/srt/layers/quantization/fp8_kernel.py,sha256=eoO1enzD9jPC80id2oC3i8bt-LN6-4Ey223yOQ9yIPE,8792
94
- sglang/srt/layers/quantization/fp8_utils.py,sha256=HBJBaNcln1NrLxzw0ppUjMd6w-ryuGDDHCYJq7mRQac,4035
92
+ sglang/srt/layers/quantization/fp8.py,sha256=k4mw-iKxlaEWRkGgaoxCLzZ_dYydyRj0y1N1B_umMwU,32668
93
+ sglang/srt/layers/quantization/fp8_kernel.py,sha256=cYF4ckqrUyhCO9Ha7zi05R8EhRaqSa8rFpYisz-9Ed0,10743
94
+ sglang/srt/layers/quantization/fp8_utils.py,sha256=qBVJXxbxqmf8-Juq0t-IXWjlaZoePJqFNYcs9-oT5Yo,4150
95
95
  sglang/srt/lora/lora.py,sha256=-o2mBmUvoVpdkgdAkWTARN4kfyep3UNEJLcg6moh0SU,15056
96
96
  sglang/srt/lora/lora_config.py,sha256=a2fTQESlCbG1xLiBYy4ptZ6c0Burcqyg1_6V1XSok-Y,1506
97
97
  sglang/srt/lora/lora_manager.py,sha256=DHiqdl0_4wQ5PxZBZtlCpP14515mDV2_H9tzL3Rdss8,12886
98
98
  sglang/srt/managers/data_parallel_controller.py,sha256=psI4FAuBGjtdnEuwagnGdtRqvqSSxOROfNKQqVDqlVA,8382
99
99
  sglang/srt/managers/detokenizer_manager.py,sha256=nZkbwt4yty_oy8rvg4T7PbgyVLoBLohvHl25xlQpBoo,8439
100
100
  sglang/srt/managers/image_processor.py,sha256=Y8RgyrzbJjJTpjbnZDa5qiiG5wWjZ68rOXUPDi6kkFo,13698
101
- sglang/srt/managers/io_struct.py,sha256=_LWWqT3LNwZGaWhg2d3kTg1V2MTHKzRasCvxF9Nfpi4,15429
102
- sglang/srt/managers/schedule_batch.py,sha256=qryPWCdOTFzxomDa80U-5guShOb1K4kBUWcPCCchYB8,45762
101
+ sglang/srt/managers/io_struct.py,sha256=Gru7LEyc3tcM_LewoteCb7GXIrh-OYhA2CnEvjc1Cis,15769
102
+ sglang/srt/managers/schedule_batch.py,sha256=KnoVuWgINnyard-BOXCo0jm3IMdXN9wIwnftMKcag-s,46097
103
103
  sglang/srt/managers/schedule_policy.py,sha256=QxjQ8-le062AMHHxool6CxkhvB4FIwhOQPzTX_JwL6U,15447
104
- sglang/srt/managers/scheduler.py,sha256=Yh15uQFhJlku8a20-lhtIsiEHAcUmpL3BzL42kLVwiI,61637
105
- sglang/srt/managers/session_controller.py,sha256=Yp-IV3rXczACZxZXmF-QxW9CWICGy8KHQ9ttBGJ8WXA,2800
106
- sglang/srt/managers/tokenizer_manager.py,sha256=uKiTt__lCFXG60zQhmM_K7dU7IuedVSIQHVw3x3y5-E,31758
107
- sglang/srt/managers/tp_worker.py,sha256=X1EwFX3FSsmXx7jeeX2tjZRocaujabQYWm-M-0CFEBE,7363
108
- sglang/srt/managers/tp_worker_overlap_thread.py,sha256=-QNBJRKxraa9Xt2WI1AFzZYdneIJ1eXv0GjFzDqXoE0,8926
104
+ sglang/srt/managers/scheduler.py,sha256=Z1_wf6OCC8Hevc7y0D4Rt4EW5Et9bgTL9oJOkmNmDjo,63490
105
+ sglang/srt/managers/session_controller.py,sha256=3laMRIXEYWDjfytCjPs0vw_Tw__k-nKBY-bYzycYbfc,5482
106
+ sglang/srt/managers/tokenizer_manager.py,sha256=SXvVZHFMBCtcLkKnq-O3uzwrEhfVqk6Y1fzeBEFNq0E,33010
107
+ sglang/srt/managers/tp_worker.py,sha256=8RVBLQaS3TnX7Z4J35RVrFN0M6PVnRBhct3sczBL4dY,7644
108
+ sglang/srt/managers/tp_worker_overlap_thread.py,sha256=JQfrVPeE56ZGJ3nozkhZR-RSb2oePsY7iuedM7XCtdQ,9157
109
109
  sglang/srt/mem_cache/base_prefix_cache.py,sha256=QC8HS8RC5DXu14kyXsxAgEUsn0f932p2DjqzbKjc6Bs,962
110
110
  sglang/srt/mem_cache/chunk_cache.py,sha256=R2gHAuqKd5ayQW3NnsgoGUH31---Z5izCDyCqLL0FjQ,2524
111
111
  sglang/srt/mem_cache/flush_cache.py,sha256=GYcxmNXh4hsMpFfNOuCTpKilW7guZwTtAg_usVeM3J0,979
@@ -114,8 +114,8 @@ sglang/srt/mem_cache/radix_cache.py,sha256=c5voySV5L855c0G9cBEc9iQ4nR7PDDmg0V6fW
114
114
  sglang/srt/metrics/collector.py,sha256=ZWoFx_FKN0sNMSZ8RJWUVQ0RFEYhIHxdw0d4TZTluMU,6861
115
115
  sglang/srt/metrics/func_timer.py,sha256=VFyNRrbnKVCwnQsrlLin1lITJfjQpf9m8sGPqL5LIsQ,3438
116
116
  sglang/srt/model_executor/cuda_graph_runner.py,sha256=1n5WxoE9-0B3unwkkcR355K_D290h2LGt_7EvH02DQM,16246
117
- sglang/srt/model_executor/forward_batch_info.py,sha256=L5mVoW5SaO6To-7nGk0TZM-FFB5_78cARpJ-aC2rwD0,12883
118
- sglang/srt/model_executor/model_runner.py,sha256=MLYBcYIQihu2I3PBTUghiU2mSWsDMzlKzcnX7yHa9JU,29837
117
+ sglang/srt/model_executor/forward_batch_info.py,sha256=vqF8XrHQPk3ZL7HqPvvkfP53oqBx0Fajb5lAIkdifBo,13961
118
+ sglang/srt/model_executor/model_runner.py,sha256=TjvAwwr7EqZdmE-5HbuQMeEa0e0FqY6LeqqzEAHXMPU,30012
119
119
  sglang/srt/model_loader/__init__.py,sha256=zGZkOBz1zx-pkaIy47BasL3fjDlAcxAXUTjInOhXHAE,919
120
120
  sglang/srt/model_loader/loader.py,sha256=7OG_8-66vFDFZ9kVKGNK1BFBjZ6ql449dlyvdCbMqvE,43876
121
121
  sglang/srt/model_loader/utils.py,sha256=0NaMR67fESFopaklmsleiL27XH1QUrjZW246MUu1EJ0,1369
@@ -136,8 +136,9 @@ sglang/srt/models/granite.py,sha256=AeQY9Dxd1ZnwgCYBK0vSXXiMGM-yt9iaOVf_ruOUHXw,
136
136
  sglang/srt/models/grok.py,sha256=J9lgNbFebvXgF19nfZyHwlGPlGWY_m0LgP506YvOYrU,15668
137
137
  sglang/srt/models/internlm2.py,sha256=_xcKtd6YtEFUTozaN-yUb0xbSYckRpomfPSKcAk4j-Y,12127
138
138
  sglang/srt/models/internlm2_reward.py,sha256=8K26A9oIFFGx_9U2mF87j7FX8K87HGKMnVL3ht1Uc7I,2398
139
- sglang/srt/models/llama.py,sha256=o3FYyOhkZJirzugyYz1kxs6RpY84O_uKowWWmt3jv24,19929
139
+ sglang/srt/models/llama.py,sha256=4UPKF7erp7qqBD11uvvQkO1Fo_wDs71BmA8Y2csXRcA,20302
140
140
  sglang/srt/models/llama_classification.py,sha256=DwboM1xHXdf3Fddf7xGnrfdOLJwXdiJs994cIpAPa2g,2984
141
+ sglang/srt/models/llama_eagle.py,sha256=88DzR54DKBIKJ1h-bkIa8mc1qJnlkdZ1eGYY3c5mpBY,4442
141
142
  sglang/srt/models/llama_embedding.py,sha256=rh-AiczPY_pTpzcACHvSMVjh1hsV_MZBBwP0LQxPsGM,3130
142
143
  sglang/srt/models/llama_reward.py,sha256=oPxh5E2UkxLULNdR68dFvt2I7j33CJFN6nyA-8L2_cg,4516
143
144
  sglang/srt/models/llava.py,sha256=xrkg8sht8tBOID7427IEZtHL-KKWfEivDe2NqGjTSAs,26373
@@ -162,8 +163,8 @@ sglang/srt/models/torch_native_llama.py,sha256=YeXHorFm6QfnczLXwPb5TG9a-He0uiA9R
162
163
  sglang/srt/models/xverse.py,sha256=Oq--KqvbYu2H4TMVGEHpSnJLEwXBpxlncR9ilsQeckc,13579
163
164
  sglang/srt/models/xverse_moe.py,sha256=7E60YIST4ELYwLRgjtHiLRI5Uyc7XqQTM7jQXiWaQs4,15541
164
165
  sglang/srt/models/yivl.py,sha256=88OubtuZ38Dxb2LzfV_MTPBI4wKhh4NJqFu--efbhFM,4809
165
- sglang/srt/openai_api/adapter.py,sha256=X0HLuNhg-chDQjcdsQIRpZijlImEwZLHum3G0JgU4Go,54834
166
- sglang/srt/openai_api/protocol.py,sha256=RMzeDfh2tZITjhNwB2nX68wZwQe40N6HBuVebCzEWiU,10468
166
+ sglang/srt/openai_api/adapter.py,sha256=HvgeFPWv-v8LOiYF2iNCo-14BIZLAPznNTCUbubB2Rg,57091
167
+ sglang/srt/openai_api/protocol.py,sha256=anWGr2Br8gVYm6Z0yvDwjXLaPCPuvJZ28gr5rV2dhVQ,11613
167
168
  sglang/srt/sampling/sampling_batch_info.py,sha256=s--zNjk-LErZ5lMqnZ7KiuJltaziKRbQAU5qYpKIxAc,8564
168
169
  sglang/srt/sampling/sampling_params.py,sha256=BkgCJAOSmQXwJrNXg26zSjKfMy0d5mMN6oHRk_ZuESI,5499
169
170
  sglang/srt/sampling/penaltylib/__init__.py,sha256=5vQw0Y5DSzmsoFg1IdMIKLwFVhYZ5ArADHVBYbSmOec,513
@@ -172,6 +173,7 @@ sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py,sha256=1Zp2aL6dD6
172
173
  sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py,sha256=_Nxv0XgUPirZjw2SEJYp_Cd9ZcLwmt7h6JE6J4hhFq4,3629
173
174
  sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py,sha256=5tOgCg7OvE9kSN9VMCpH1hwqo1YMxt9iS5PVpct9HpU,2468
174
175
  sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py,sha256=m22Rfn1RuB1HpImBDECsiJ2VooBYpsFADAwnk1EPzk0,2751
176
+ sglang/srt/speculative/spec_info.py,sha256=d-82uWEC-QBqAgv3XGDNDW8DlHv4MtUsZghFqzGwV7U,352
175
177
  sglang/test/few_shot_gsm8k.py,sha256=7yDbEQe49gZeJhz2wFFX-gf_59ThDKsCS1xwfogNc7k,4034
176
178
  sglang/test/few_shot_gsm8k_engine.py,sha256=QQbrwOX6-cJDD3RZC_e7zPnt6aSo8JdF8X_lRHSjdDM,3886
177
179
  sglang/test/run_eval.py,sha256=9yO0hXZOcn4abEOs96T-XPguDEklK16Ltco0pGF3zCg,4020
@@ -188,8 +190,8 @@ sglang/test/test_layernorm.py,sha256=IacByD5d-stXjzBz8Ypamc7povlcedpKPbb_4JLgo3c
188
190
  sglang/test/test_programs.py,sha256=1Z0umrsUu9pagzyGH5SrXl_qhKSyTfUv_kWC2mcn0qo,18208
189
191
  sglang/test/test_utils.py,sha256=HJG7kUQOk6n9FBbH89PDtQ41C3kt1cfJODhAEcFT0AQ,23823
190
192
  sglang/test/srt/sampling/penaltylib/utils.py,sha256=CjxHgywh0hx_87iynzQt_ztHu6zBVuE-YrZ-XPmW6U4,12906
191
- sglang-0.4.1.post1.dist-info/LICENSE,sha256=FJXh51fvTQklojUFY89XVLsjxRcBqOxPs8XNy-2uZ0c,11346
192
- sglang-0.4.1.post1.dist-info/METADATA,sha256=R2YDOrUU_49x5TEbNUODNlXvkSIzFqT7-hvInlSCs5k,22527
193
- sglang-0.4.1.post1.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
194
- sglang-0.4.1.post1.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
195
- sglang-0.4.1.post1.dist-info/RECORD,,
193
+ sglang-0.4.1.post2.dist-info/LICENSE,sha256=FJXh51fvTQklojUFY89XVLsjxRcBqOxPs8XNy-2uZ0c,11346
194
+ sglang-0.4.1.post2.dist-info/METADATA,sha256=eORQMKMQDt_eTOh6PxMzIvyjNbg2FllxXss2Z9jU3Ug,22544
195
+ sglang-0.4.1.post2.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
196
+ sglang-0.4.1.post2.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
197
+ sglang-0.4.1.post2.dist-info/RECORD,,