sglang 0.4.6.post1__py3-none-any.whl → 0.4.6.post2__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (61)
  1. sglang/bench_one_batch.py +2 -0
  2. sglang/check_env.py +3 -3
  3. sglang/srt/configs/__init__.py +4 -0
  4. sglang/srt/configs/kimi_vl.py +38 -0
  5. sglang/srt/configs/kimi_vl_moonvit.py +32 -0
  6. sglang/srt/configs/model_config.py +15 -0
  7. sglang/srt/conversation.py +122 -1
  8. sglang/srt/entrypoints/engine.py +44 -22
  9. sglang/srt/function_call_parser.py +97 -0
  10. sglang/srt/hf_transformers_utils.py +2 -0
  11. sglang/srt/layers/attention/cutlass_mla_backend.py +1 -1
  12. sglang/srt/layers/attention/flashinfer_backend.py +107 -82
  13. sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -16
  14. sglang/srt/layers/attention/flashmla_backend.py +3 -0
  15. sglang/srt/layers/dp_attention.py +5 -2
  16. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +1 -3
  17. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  18. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  19. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  20. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  21. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +8 -6
  22. sglang/srt/layers/quantization/__init__.py +2 -2
  23. sglang/srt/layers/quantization/deep_gemm.py +1 -1
  24. sglang/srt/layers/utils.py +35 -0
  25. sglang/srt/lora/layers.py +35 -9
  26. sglang/srt/lora/lora_manager.py +84 -35
  27. sglang/srt/managers/data_parallel_controller.py +52 -34
  28. sglang/srt/managers/multimodal_processors/kimi_vl.py +73 -0
  29. sglang/srt/managers/schedule_batch.py +25 -15
  30. sglang/srt/managers/scheduler.py +263 -59
  31. sglang/srt/managers/scheduler_output_processor_mixin.py +1 -1
  32. sglang/srt/managers/tp_worker.py +51 -16
  33. sglang/srt/managers/tp_worker_overlap_thread.py +9 -3
  34. sglang/srt/mem_cache/memory_pool.py +70 -36
  35. sglang/srt/model_executor/cuda_graph_runner.py +82 -19
  36. sglang/srt/model_executor/forward_batch_info.py +31 -1
  37. sglang/srt/model_executor/model_runner.py +115 -57
  38. sglang/srt/models/deepseek_nextn.py +1 -257
  39. sglang/srt/models/deepseek_v2.py +78 -18
  40. sglang/srt/models/kimi_vl.py +308 -0
  41. sglang/srt/models/kimi_vl_moonvit.py +639 -0
  42. sglang/srt/models/llama.py +92 -30
  43. sglang/srt/models/llama4.py +2 -1
  44. sglang/srt/models/llama_eagle.py +4 -1
  45. sglang/srt/models/llama_eagle3.py +4 -1
  46. sglang/srt/models/qwen2_moe.py +8 -3
  47. sglang/srt/models/qwen2_vl.py +0 -12
  48. sglang/srt/models/qwen3_moe.py +8 -3
  49. sglang/srt/openai_api/adapter.py +34 -22
  50. sglang/srt/openai_api/protocol.py +11 -1
  51. sglang/srt/server_args.py +67 -22
  52. sglang/srt/speculative/eagle_worker.py +3 -2
  53. sglang/srt/utils.py +88 -9
  54. sglang/test/runners.py +4 -0
  55. sglang/test/test_utils.py +29 -0
  56. sglang/version.py +1 -1
  57. {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post2.dist-info}/METADATA +5 -4
  58. {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post2.dist-info}/RECORD +61 -51
  59. {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post2.dist-info}/WHEEL +1 -1
  60. {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post2.dist-info}/licenses/LICENSE +0 -0
  61. {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post2.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py CHANGED
@@ -22,7 +22,7 @@ import random
 import tempfile
 from typing import List, Literal, Optional
 
-from sglang.srt.hf_transformers_utils import check_gguf_file
+from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
 from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
     configure_ipv6,
@@ -78,6 +78,8 @@ class ServerArgs:
 
     # Other runtime options
     tp_size: int = 1
+    pp_size: int = 1
+    max_micro_batch_size: Optional[int] = None
     stream_interval: int = 1
     stream_output: bool = False
     random_seed: Optional[int] = None
@@ -222,25 +224,34 @@
 
         # Set mem fraction static, which depends on the tensor parallelism size
        if self.mem_fraction_static is None:
+            parallel_size = self.tp_size * self.pp_size
            if gpu_mem <= 81920:
-                if self.tp_size >= 16:
+                if parallel_size >= 16:
                     self.mem_fraction_static = 0.79
-                elif self.tp_size >= 8:
+                elif parallel_size >= 8:
                     self.mem_fraction_static = 0.81
-                elif self.tp_size >= 4:
+                elif parallel_size >= 4:
                     self.mem_fraction_static = 0.85
-                elif self.tp_size >= 2:
+                elif parallel_size >= 2:
                     self.mem_fraction_static = 0.87
                 else:
                     self.mem_fraction_static = 0.88
             else:
-                # FIXME: more fine grained auto-selection polices
-                self.mem_fraction_static = (gpu_mem - 1024 * 13) / gpu_mem
+                self.mem_fraction_static = 0.88
+                if gpu_mem > 96 * 1024:
+                    mem_fraction = self.mem_fraction_static
+                    self.mem_fraction_static = min(
+                        mem_fraction + 48 * 1024 * (1 - mem_fraction) / gpu_mem,
+                        (gpu_mem - 1024 * 18)
+                        / gpu_mem, # 15 GB + additional 3GB for cuda graph
+                    )
 
         # Set chunked prefill size, which depends on the gpu memory capacity
         if self.chunked_prefill_size is None:
             if gpu_mem is not None and gpu_mem < 25_000:
                 self.chunked_prefill_size = 2048
+            elif self.disaggregation_mode != "null":
+                self.chunked_prefill_size = 16384
             else:
                 self.chunked_prefill_size = 8192
         assert self.chunked_prefill_size % self.page_size == 0
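For reference, the new auto-selection above can be reproduced in isolation. The sketch below restates the logic of this hunk with an illustrative helper name and device sizes (gpu_mem appears to be in MiB, as the 81920 and 96 * 1024 thresholds suggest); it is not part of the package.

# Standalone restatement of the selection logic above (illustrative only).
def pick_mem_fraction_static(gpu_mem: float, tp_size: int = 1, pp_size: int = 1) -> float:
    parallel_size = tp_size * pp_size
    if gpu_mem <= 81920:  # up to 80 GiB: tiered by total model-parallel size
        for threshold, frac in ((16, 0.79), (8, 0.81), (4, 0.85), (2, 0.87)):
            if parallel_size >= threshold:
                return frac
        return 0.88
    mem_fraction = 0.88
    if gpu_mem > 96 * 1024:  # very large GPUs: raise the fraction, keep ~18 GiB of headroom
        mem_fraction = min(
            mem_fraction + 48 * 1024 * (1 - mem_fraction) / gpu_mem,
            (gpu_mem - 1024 * 18) / gpu_mem,
        )
    return mem_fraction

print(pick_mem_fraction_static(80 * 1024, tp_size=8))   # 0.81
print(pick_mem_fraction_static(141 * 1024))             # ~0.872 (the headroom term wins)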
@@ -333,6 +344,14 @@
                 "eagle speculative decoding."
             )
 
+            model_arch = get_model_arch(self)
+
+            # Auto set draft_model_path DeepSeek-V3/R1
+            if self.speculative_draft_model_path is None and model_arch in [
+                "DeepseekV3ForCausalLM"
+            ]:
+                self.speculative_draft_model_path = self.model_path
+
             # Auto choose parameters
             if self.speculative_num_steps is None:
                 assert (
@@ -343,7 +362,7 @@
                     self.speculative_num_steps,
                     self.speculative_eagle_topk,
                     self.speculative_num_draft_tokens,
-                ) = auto_choose_speculative_params(self)
+                ) = auto_choose_speculative_params(model_arch)
 
             if self.page_size > 1 and self.speculative_eagle_topk > 1:
                 self.speculative_eagle_topk = 1
@@ -632,6 +651,19 @@
             default=ServerArgs.tp_size,
             help="The tensor parallelism size.",
         )
+        parser.add_argument(
+            "--pipeline-parallel-size",
+            "--pp-size",
+            type=int,
+            default=ServerArgs.pp_size,
+            help="The pipeline parallelism size.",
+        )
+        parser.add_argument(
+            "--max-micro-batch-size",
+            type=int,
+            default=ServerArgs.max_micro_batch_size,
+            help="The maximum micro batch size in pipeline parallelism.",
+        )
         parser.add_argument(
             "--stream-interval",
             type=int,
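The two arguments above are the user-facing entry points for the new pipeline parallelism support. A hypothetical launch using them might look like the following sketch (the model path and sizes are placeholders; per check_server_args in this release, pp_size > 1 also turns off the overlap scheduler and is incompatible with speculative decoding and mixed chunked prefill).

import subprocess

# Placeholder model and parallel sizes; --pp-size and --max-micro-batch-size are new in 0.4.6.post2.
subprocess.run(
    [
        "python", "-m", "sglang.launch_server",
        "--model-path", "meta-llama/Llama-3.1-8B-Instruct",
        "--tp-size", "2",
        "--pp-size", "2",
        "--max-micro-batch-size", "8",
    ],
    check=True,
)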
@@ -1096,9 +1128,9 @@
         parser.add_argument(
             "--tool-call-parser",
             type=str,
-            choices=["qwen25", "mistral", "llama3", "deepseekv3"],
+            choices=["qwen25", "mistral", "llama3", "deepseekv3", "pythonic"],
             default=ServerArgs.tool_call_parser,
-            help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', and 'llama3'.",
+            help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', and 'pythonic'.",
         )
         parser.add_argument(
             "--enable-hierarchical-cache",
@@ -1221,6 +1253,7 @@
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
         args.tp_size = args.tensor_parallel_size
+        args.pp_size = args.pipeline_parallel_size
         args.dp_size = args.data_parallel_size
         args.ep_size = args.expert_parallel_size
         attrs = [attr.name for attr in dataclasses.fields(cls)]
@@ -1234,15 +1267,25 @@
 
     def check_server_args(self):
         assert (
-            self.tp_size % self.nnodes == 0
-        ), "tp_size must be divisible by number of nodes"
+            self.tp_size * self.pp_size
+        ) % self.nnodes == 0, "tp_size must be divisible by number of nodes"
+
+        # FIXME pp constraints
+        if self.pp_size > 1:
+            logger.warning(f"Turn off overlap scheule for pipeline parallelism.")
+            self.disable_overlap_schedule = True
+            assert (
+                self.disable_overlap_schedule
+                and self.speculative_algorithm is None
+                and not self.enable_mixed_chunk
+            ), "Pipeline parallelism is not compatible with overlap schedule, speculative decoding, mixed chunked prefill."
+
         assert not (
             self.dp_size > 1 and self.nnodes != 1 and not self.enable_dp_attention
         ), "multi-node data parallel is not supported unless dp attention!"
         assert (
             self.max_loras_per_batch > 0
             # FIXME
-            and (self.lora_paths is None or self.disable_cuda_graph)
             and (self.lora_paths is None or self.disable_radix_cache)
         ), "compatibility of lora and cuda graph and radix attention is in progress"
         assert self.base_gpu_id >= 0, "base_gpu_id must be non-negative"
@@ -1368,20 +1411,22 @@ class DeprecatedAction(argparse.Action):
         raise ValueError(self.help)
 
 
-def auto_choose_speculative_params(self: ServerArgs):
+def get_model_arch(args: ServerArgs):
+    hf_config = get_config(
+        args.model_path,
+        trust_remote_code=args.trust_remote_code,
+        revision=args.revision,
+        model_override_args=json.loads(args.json_model_override_args),
+    )
+    return hf_config.architectures[0]
+
+
+def auto_choose_speculative_params(arch: str):
     """
     Automatically choose the parameters for speculative decoding.
 
     You can tune them on your own models and prompts with scripts/playground/bench_speculative.py
     """
-    config_path = os.path.join(self.model_path, "config.json")
-    if not os.path.exists(config_path):
-        raise ValueError(f"{config_path} is not found.")
-
-    config = json.load(open(config_path))
-
-    arch = config.get("architectures", ["Unknown"])[0]
-
     if arch in ["LlamaForCausalLM"]:
         # The default value for llama
         return (5, 4, 8)
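The refactor above replaces the hand-rolled config.json read with get_config, so architecture detection also works for Hub IDs, revisions, and override args. Conceptually it boils down to reading architectures[0] from the Hugging Face config; a sketch using transformers directly rather than sglang's wrapper (the model ID is only an example):

from transformers import AutoConfig

# get_config layers revision/override handling on top of this basic lookup.
cfg = AutoConfig.from_pretrained("deepseek-ai/DeepSeek-V3", trust_remote_code=True)
print(cfg.architectures[0])  # "DeepseekV3ForCausalLM" -> speculative_draft_model_path defaults to model_path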
sglang/srt/speculative/eagle_worker.py CHANGED
@@ -106,11 +106,12 @@ class EAGLEWorker(TpModelWorker):
         # Init draft worker
         with empty_context():
             super().__init__(
+                server_args=server_args,
                 gpu_id=gpu_id,
                 tp_rank=tp_rank,
-                server_args=server_args,
-                nccl_port=nccl_port,
+                pp_rank=0, # FIXME
                 dp_rank=dp_rank,
+                nccl_port=nccl_port,
                 is_draft_worker=True,
                 req_to_token_pool=self.req_to_token_pool,
                 token_to_kv_pool_allocator=self.token_to_kv_pool_allocator,
sglang/srt/utils.py CHANGED
@@ -12,6 +12,7 @@
 # limitations under the License.
 # ==============================================================================
 """Common utilities."""
+
 import base64
 import builtins
 import ctypes
@@ -414,16 +415,40 @@ class LayerFn(Protocol):
 def make_layers(
     num_hidden_layers: int,
     layer_fn: LayerFn,
+    pp_rank: Optional[int] = None,
+    pp_size: Optional[int] = None,
     prefix: str = "",
+    return_tuple: bool = False,
 ) -> Tuple[int, int, torch.nn.ModuleList]:
     """Make a list of layers with the given layer function"""
+    # circula imports
+    from sglang.srt.distributed import get_pp_indices
+    from sglang.srt.layers.utils import PPMissingLayer
+
+    assert not pp_size or num_hidden_layers >= pp_size
+    start_layer, end_layer = (
+        get_pp_indices(
+            num_hidden_layers,
+            pp_rank,
+            pp_size,
+        )
+        if pp_rank is not None and pp_size is not None
+        else (0, num_hidden_layers)
+    )
     modules = torch.nn.ModuleList(
-        [
+        [PPMissingLayer(return_tuple=return_tuple) for _ in range(start_layer)]
+        + [
             maybe_offload_to_cpu(layer_fn(idx=idx, prefix=add_prefix(idx, prefix)))
-            for idx in range(num_hidden_layers)
+            for idx in range(start_layer, end_layer)
+        ]
+        + [
+            PPMissingLayer(return_tuple=return_tuple)
+            for _ in range(end_layer, num_hidden_layers)
         ]
     )
-    return modules
+    if pp_rank is None or pp_size is None:
+        return modules
+    return modules, start_layer, end_layer
 
 
 def set_random_seed(seed: int) -> None:
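make_layers now materializes only the slice of layers owned by the current pipeline rank and fills the rest with PPMissingLayer placeholders. The split itself comes from sglang.srt.distributed.get_pp_indices; the helper below only illustrates an even split and is an assumption, not the library's implementation (which may place remainder layers differently).

# Illustration of an even layer split across pipeline ranks (assumed behavior, not sglang code).
def even_pp_indices(num_hidden_layers: int, pp_rank: int, pp_size: int) -> tuple[int, int]:
    per_rank = num_hidden_layers // pp_size
    remainder = num_hidden_layers % pp_size
    # Give one extra layer to the first `remainder` ranks.
    start = pp_rank * per_rank + min(pp_rank, remainder)
    end = start + per_rank + (1 if pp_rank < remainder else 0)
    return start, end

# E.g. 32 layers over 4 ranks -> (0, 8), (8, 16), (16, 24), (24, 32);
# each rank wraps the layers it does not own in PPMissingLayer placeholders.
print([even_pp_indices(32, r, 4) for r in range(4)])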
@@ -877,7 +902,7 @@ def broadcast_pyobj(
         "cuda" if torch.cuda.is_available() and not force_cpu_device else "cpu"
     )
 
-    if rank == 0:
+    if rank == src:
         if len(data) == 0:
             tensor_size = torch.tensor([0], dtype=torch.long, device=device)
             dist.broadcast(tensor_size, src=src, group=dist_group)
@@ -909,6 +934,50 @@
     return data
 
 
+def point_to_point_pyobj(
+    data: List[Any],
+    rank: int,
+    group: Optional[torch.distributed.ProcessGroup] = None,
+    src: int = 0,
+    dst: int = 1,
+):
+    """Send data from src to dst in group."""
+
+    if rank == src:
+        if len(data) == 0:
+            tensor_size = torch.tensor([0], dtype=torch.long)
+            dist.send(tensor_size, dst=dst, group=group)
+        else:
+            serialized_data = pickle.dumps(data)
+            size = len(serialized_data)
+            tensor_data = torch.ByteTensor(
+                np.frombuffer(serialized_data, dtype=np.uint8)
+            )
+            tensor_size = torch.tensor([size], dtype=torch.long)
+
+            dist.send(tensor_size, dst=dst, group=group)
+            dist.send(tensor_data, dst=dst, group=group)
+        return data
+
+    elif rank == dst:
+        tensor_size = torch.tensor([0], dtype=torch.long)
+        dist.recv(tensor_size, src=src, group=group)
+        size = tensor_size.item()
+
+        if size == 0:
+            return []
+
+        tensor_data = torch.empty(size, dtype=torch.uint8)
+        dist.recv(tensor_data, src=src, group=group)
+
+        serialized_data = bytes(tensor_data.cpu().numpy())
+        data = pickle.loads(serialized_data)
+        return data
+
+    # Other ranks in pp_group do nothing
+    return []
+
+
 step_counter = 0
 
 
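point_to_point_pyobj is the pipeline-parallel counterpart of broadcast_pyobj: it pickles a Python list, sends its byte size and payload from src to dst, and returns an empty list on uninvolved ranks. A minimal usage sketch, assuming a two-process group launched with torchrun and a CPU-capable backend such as gloo:

import torch.distributed as dist
from sglang.srt.utils import point_to_point_pyobj

dist.init_process_group(backend="gloo")  # e.g. torchrun --nproc_per_node=2 this_script.py
rank = dist.get_rank()

if rank == 0:
    # Sender: the list is pickled and shipped as a size tensor plus a byte tensor.
    point_to_point_pyobj([{"req_id": 1, "prompt": "hello"}], rank, src=0, dst=1)
elif rank == 1:
    # Receiver: the data argument is ignored; the deserialized list is returned.
    received = point_to_point_pyobj([], rank, src=0, dst=1)
    print(received)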
@@ -1732,6 +1801,13 @@ def configure_ipv6(dist_init_addr):
     return port, host
 
 
+def rank0_log(msg: str):
+    from sglang.srt.distributed import get_tensor_model_parallel_rank
+
+    if get_tensor_model_parallel_rank() == 0:
+        logger.info(msg)
+
+
 def rank0_print(msg: str):
     from sglang.srt.distributed import get_tensor_model_parallel_rank
 
@@ -1905,13 +1981,16 @@ def fast_topk(values, topk, dim):
     return torch.topk(values, topk, dim=dim)
 
 
-def is_hopper_with_cuda_12_3():
+def _check(cc_major):
     if not is_cuda():
         return False
-    is_hopper = torch.cuda.get_device_capability()[0] == 9
-    cuda_version = torch.version.cuda.split(".")
-    is_cuda_compatible = int(cuda_version[0]) == 12 and int(cuda_version[1]) >= 3
-    return is_hopper and is_cuda_compatible
+    return torch.cuda.get_device_capability()[0] == cc_major and tuple(
+        map(int, torch.version.cuda.split(".")[:2])
+    ) >= (12, 3)
+
+
+is_ampere_with_cuda_12_3 = lambda: _check(8)
+is_hopper_with_cuda_12_3 = lambda: _check(9)
 
 
 def get_free_port():
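The refactor collapses the old Hopper-only check into a generic helper: the compute capability major must match (8 for Ampere, 9 for Hopper) and the CUDA version, compared as a tuple, must be at least 12.3. The tuple comparison behaves as follows (version strings are made up for illustration):

for version in ("11.8", "12.1", "12.4"):
    print(version, tuple(map(int, version.split(".")[:2])) >= (12, 3))
# 11.8 False, 12.1 False, 12.4 True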
sglang/test/runners.py CHANGED
@@ -423,6 +423,10 @@ class HFRunner:
             )
             del input_logits
 
+            if lora_paths is not None and lora_paths[i] is not None:
+                # Unload the LoRA adapter if it is used
+                model.unload()
+
         return ModelOutput(
             output_strs=output_strs,
             top_input_logprobs=top_input_logprobs,
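The HFRunner change unloads the LoRA adapter once it has been used so it does not carry over into later runs. Assuming the runner wraps the base model with peft (which the unload() call suggests), the pattern looks roughly like this; the model and adapter names are placeholders:

from peft import PeftModel
from transformers import AutoModelForCausalLM

base = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
model = PeftModel.from_pretrained(base, "some-org/some-lora-adapter")

# ... run generation with the adapter active ...

base = model.unload()  # strips the LoRA layers without merging and returns the base model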
sglang/test/test_utils.py CHANGED
@@ -69,6 +69,7 @@ DEFAULT_REASONING_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
 DEFAULT_AWQ_MOE_MODEL_NAME_FOR_TEST = (
     "hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
 )
+DEFAULT_ENABLE_THINKING_MODEL_NAME_FOR_TEST = "Qwen/Qwen3-30B-A3B"
 
 # Nightly tests
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
@@ -770,6 +771,34 @@ def run_bench_offline_throughput(model, other_args):
     return output_throughput
 
 
+def run_bench_one_batch_server(
+    model,
+    base_url,
+    server_args,
+    bench_args,
+    other_server_args,
+    simulate_spec_acc_lens=None,
+):
+    from sglang.bench_one_batch_server import run_benchmark
+
+    if simulate_spec_acc_lens is not None:
+        env = {**os.environ, "SIMULATE_ACC_LEN": str(simulate_spec_acc_lens)}
+    else:
+        env = None
+
+    process = popen_launch_server(
+        model,
+        base_url,
+        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+        other_args=other_server_args,
+        env=env,
+    )
+    try:
+        run_benchmark(server_args=server_args, bench_args=bench_args)
+    finally:
+        kill_process_tree(process.pid)
+
+
 def lcs(X, Y):
     m = len(X)
     n = len(Y)
sglang/version.py CHANGED
@@ -1 +1 @@
-__version__ = "0.4.6.post1"
+__version__ = "0.4.6.post2"
{sglang-0.4.6.post1.dist-info → sglang-0.4.6.post2.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.4.6.post1
+Version: 0.4.6.post2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                            Version 2.0, January 2004
@@ -238,15 +238,16 @@ Requires-Dist: pynvml; extra == "runtime-common"
 Requires-Dist: python-multipart; extra == "runtime-common"
 Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
 Requires-Dist: soundfile==0.13.1; extra == "runtime-common"
-Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
+Requires-Dist: torchao>=0.9.0; extra == "runtime-common"
 Requires-Dist: transformers==4.51.1; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
 Requires-Dist: xgrammar==0.1.17; extra == "runtime-common"
+Requires-Dist: blobfile==3.0.0; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.1.0; extra == "srt"
-Requires-Dist: flashinfer_python==0.2.3; extra == "srt"
+Requires-Dist: sgl-kernel==0.1.1; extra == "srt"
+Requires-Dist: flashinfer_python==0.2.5; extra == "srt"
 Requires-Dist: torch==2.6.0; extra == "srt"
 Requires-Dist: torchvision==0.21.0; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"