dgenerate-ultralytics-headless 8.3.194-py3-none-any.whl → 8.3.195-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. {dgenerate_ultralytics_headless-8.3.194.dist-info → dgenerate_ultralytics_headless-8.3.195.dist-info}/METADATA +1 -2
  2. {dgenerate_ultralytics_headless-8.3.194.dist-info → dgenerate_ultralytics_headless-8.3.195.dist-info}/RECORD +97 -96
  3. tests/test_python.py +1 -1
  4. ultralytics/__init__.py +1 -1
  5. ultralytics/cfg/__init__.py +8 -8
  6. ultralytics/data/annotator.py +1 -1
  7. ultralytics/data/augment.py +75 -75
  8. ultralytics/data/base.py +12 -12
  9. ultralytics/data/converter.py +4 -4
  10. ultralytics/data/dataset.py +7 -7
  11. ultralytics/data/loaders.py +15 -15
  12. ultralytics/data/split_dota.py +10 -10
  13. ultralytics/data/utils.py +12 -12
  14. ultralytics/engine/model.py +13 -13
  15. ultralytics/engine/predictor.py +13 -13
  16. ultralytics/engine/results.py +21 -21
  17. ultralytics/hub/google/__init__.py +2 -2
  18. ultralytics/hub/session.py +7 -7
  19. ultralytics/models/fastsam/model.py +5 -5
  20. ultralytics/models/fastsam/predict.py +11 -11
  21. ultralytics/models/nas/model.py +1 -1
  22. ultralytics/models/rtdetr/predict.py +2 -2
  23. ultralytics/models/rtdetr/val.py +4 -4
  24. ultralytics/models/sam/amg.py +6 -6
  25. ultralytics/models/sam/build.py +9 -9
  26. ultralytics/models/sam/model.py +7 -7
  27. ultralytics/models/sam/modules/blocks.py +6 -6
  28. ultralytics/models/sam/modules/decoders.py +1 -1
  29. ultralytics/models/sam/modules/encoders.py +27 -27
  30. ultralytics/models/sam/modules/sam.py +4 -4
  31. ultralytics/models/sam/modules/tiny_encoder.py +18 -18
  32. ultralytics/models/sam/modules/utils.py +8 -8
  33. ultralytics/models/sam/predict.py +63 -63
  34. ultralytics/models/utils/loss.py +22 -22
  35. ultralytics/models/utils/ops.py +8 -8
  36. ultralytics/models/yolo/classify/predict.py +2 -2
  37. ultralytics/models/yolo/classify/train.py +8 -8
  38. ultralytics/models/yolo/classify/val.py +4 -4
  39. ultralytics/models/yolo/detect/predict.py +3 -3
  40. ultralytics/models/yolo/detect/train.py +6 -6
  41. ultralytics/models/yolo/detect/val.py +32 -32
  42. ultralytics/models/yolo/model.py +6 -6
  43. ultralytics/models/yolo/obb/train.py +1 -1
  44. ultralytics/models/yolo/obb/val.py +13 -13
  45. ultralytics/models/yolo/pose/val.py +11 -11
  46. ultralytics/models/yolo/segment/predict.py +4 -4
  47. ultralytics/models/yolo/segment/train.py +1 -1
  48. ultralytics/models/yolo/segment/val.py +14 -14
  49. ultralytics/models/yolo/world/train.py +9 -9
  50. ultralytics/models/yolo/world/train_world.py +1 -1
  51. ultralytics/models/yolo/yoloe/predict.py +4 -4
  52. ultralytics/models/yolo/yoloe/train.py +4 -4
  53. ultralytics/nn/autobackend.py +2 -2
  54. ultralytics/nn/modules/block.py +6 -6
  55. ultralytics/nn/modules/conv.py +2 -2
  56. ultralytics/nn/modules/head.py +4 -4
  57. ultralytics/nn/tasks.py +13 -13
  58. ultralytics/nn/text_model.py +3 -3
  59. ultralytics/solutions/ai_gym.py +2 -2
  60. ultralytics/solutions/analytics.py +3 -3
  61. ultralytics/solutions/config.py +5 -5
  62. ultralytics/solutions/distance_calculation.py +2 -2
  63. ultralytics/solutions/heatmap.py +1 -1
  64. ultralytics/solutions/instance_segmentation.py +4 -4
  65. ultralytics/solutions/object_counter.py +4 -4
  66. ultralytics/solutions/parking_management.py +7 -7
  67. ultralytics/solutions/queue_management.py +3 -3
  68. ultralytics/solutions/region_counter.py +4 -4
  69. ultralytics/solutions/similarity_search.py +2 -2
  70. ultralytics/solutions/solutions.py +48 -48
  71. ultralytics/solutions/streamlit_inference.py +1 -1
  72. ultralytics/solutions/trackzone.py +4 -4
  73. ultralytics/solutions/vision_eye.py +1 -1
  74. ultralytics/trackers/byte_tracker.py +11 -11
  75. ultralytics/trackers/utils/gmc.py +3 -3
  76. ultralytics/trackers/utils/matching.py +5 -5
  77. ultralytics/utils/autodevice.py +2 -2
  78. ultralytics/utils/benchmarks.py +10 -10
  79. ultralytics/utils/callbacks/clearml.py +1 -1
  80. ultralytics/utils/callbacks/comet.py +5 -5
  81. ultralytics/utils/checks.py +5 -5
  82. ultralytics/utils/cpu.py +90 -0
  83. ultralytics/utils/dist.py +1 -1
  84. ultralytics/utils/downloads.py +2 -2
  85. ultralytics/utils/export.py +5 -5
  86. ultralytics/utils/instance.py +2 -2
  87. ultralytics/utils/metrics.py +35 -35
  88. ultralytics/utils/nms.py +4 -4
  89. ultralytics/utils/ops.py +1 -1
  90. ultralytics/utils/patches.py +2 -2
  91. ultralytics/utils/plotting.py +9 -9
  92. ultralytics/utils/torch_utils.py +2 -6
  93. ultralytics/utils/triton.py +5 -5
  94. {dgenerate_ultralytics_headless-8.3.194.dist-info → dgenerate_ultralytics_headless-8.3.195.dist-info}/WHEEL +0 -0
  95. {dgenerate_ultralytics_headless-8.3.194.dist-info → dgenerate_ultralytics_headless-8.3.195.dist-info}/entry_points.txt +0 -0
  96. {dgenerate_ultralytics_headless-8.3.194.dist-info → dgenerate_ultralytics_headless-8.3.195.dist-info}/licenses/LICENSE +0 -0
  97. {dgenerate_ultralytics_headless-8.3.194.dist-info → dgenerate_ultralytics_headless-8.3.195.dist-info}/top_level.txt +0 -0
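Nearly all of the source changes in this release are docstring type annotations moved from typing-module generics (Tuple, List, Dict) to the built-in generics standardized by PEP 585, as the hunks below show. The following sketch is purely illustrative, a hypothetical function rather than code from the package, written in the annotation style the 8.3.195 docstrings adopt:

from __future__ import annotations  # lets built-in generics appear in annotations on Python < 3.9

def make_grid(size: tuple[int, int], channels: list[int]) -> dict[str, tuple[int, ...]]:
    """Describe a hypothetical feature pyramid (illustration only, not package code).

    Args:
        size (tuple[int, int]): Spatial size as (H, W).
        channels (list[int]): Channel dimension of each pyramid level.

    Returns:
        (dict[str, tuple[int, ...]]): Mapping of level name to (C, H, W) shape.
    """
    h, w = size
    return {f"level{i}": (c, h // 2**i, w // 2**i) for i, c in enumerate(channels)}

In the hunks shown here the change appears only inside docstrings, so it affects rendered documentation rather than runtime behavior.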
@@ -83,7 +83,7 @@ class ImageEncoderViT(nn.Module):
  use_rel_pos (bool): If True, adds relative positional embeddings to attention maps.
  rel_pos_zero_init (bool): If True, initializes relative positional parameters to zero.
  window_size (int): Size of attention window for windowed attention blocks.
- global_attn_indexes (Tuple[int, ...]): Indices of blocks that use global attention.
+ global_attn_indexes (tuple[int, ...]): Indices of blocks that use global attention.

  Examples:
  >>> encoder = ImageEncoderViT(img_size=224, patch_size=16, embed_dim=768, depth=12, num_heads=12)
@@ -161,13 +161,13 @@ class PromptEncoder(nn.Module):

  Attributes:
  embed_dim (int): Dimension of the embeddings.
- input_image_size (Tuple[int, int]): Size of the input image as (H, W).
- image_embedding_size (Tuple[int, int]): Spatial size of the image embedding as (H, W).
+ input_image_size (tuple[int, int]): Size of the input image as (H, W).
+ image_embedding_size (tuple[int, int]): Spatial size of the image embedding as (H, W).
  pe_layer (PositionEmbeddingRandom): Module for random position embedding.
  num_point_embeddings (int): Number of point embeddings for different types of points.
  point_embeddings (nn.ModuleList): List of point embeddings.
  not_a_point_embed (nn.Embedding): Embedding for points that are not part of any label.
- mask_input_size (Tuple[int, int]): Size of the input mask.
+ mask_input_size (tuple[int, int]): Size of the input mask.
  mask_downscaling (nn.Sequential): Neural network for downscaling the mask.
  no_mask_embed (nn.Embedding): Embedding for cases where no mask is provided.

@@ -198,8 +198,8 @@ class PromptEncoder(nn.Module):

  Args:
  embed_dim (int): The dimension of the embeddings.
- image_embedding_size (Tuple[int, int]): The spatial size of the image embedding as (H, W).
- input_image_size (Tuple[int, int]): The padded size of the input image as (H, W).
+ image_embedding_size (tuple[int, int]): The spatial size of the image embedding as (H, W).
+ input_image_size (tuple[int, int]): The padded size of the input image as (H, W).
  mask_in_chans (int): The number of hidden channels used for encoding input masks.
  activation (Type[nn.Module]): The activation function to use when encoding input masks.

@@ -310,7 +310,7 @@ class PromptEncoder(nn.Module):
  Embed different types of prompts, returning both sparse and dense embeddings.

  Args:
- points (Tuple[torch.Tensor, torch.Tensor] | None): Point coordinates and labels to embed. The first
+ points (tuple[torch.Tensor, torch.Tensor] | None): Point coordinates and labels to embed. The first
  tensor contains coordinates with shape (B, N, 2), and the second tensor contains labels with
  shape (B, N).
  boxes (torch.Tensor | None): Boxes to embed with shape (B, M, 2, 2), where M is the number of boxes.
@@ -522,10 +522,10 @@ class FpnNeck(nn.Module):
  Attributes:
  position_encoding (PositionEmbeddingSine): Sinusoidal positional encoding module.
  convs (nn.ModuleList): List of convolutional layers for each backbone level.
- backbone_channel_list (List[int]): List of channel dimensions from the backbone.
+ backbone_channel_list (list[int]): List of channel dimensions from the backbone.
  fpn_interp_model (str): Interpolation mode for FPN feature resizing.
  fuse_type (str): Type of feature fusion, either 'sum' or 'avg'.
- fpn_top_down_levels (List[int]): Levels to have top-down features in outputs.
+ fpn_top_down_levels (list[int]): Levels to have top-down features in outputs.

  Methods:
  forward: Perform forward pass through the FPN neck.
@@ -558,13 +558,13 @@ class FpnNeck(nn.Module):

  Args:
  d_model (int): Dimension of the model.
- backbone_channel_list (List[int]): List of channel dimensions from the backbone.
+ backbone_channel_list (list[int]): List of channel dimensions from the backbone.
  kernel_size (int): Kernel size for the convolutional layers.
  stride (int): Stride for the convolutional layers.
  padding (int): Padding for the convolutional layers.
  fpn_interp_model (str): Interpolation mode for FPN feature resizing.
  fuse_type (str): Type of feature fusion, either 'sum' or 'avg'.
- fpn_top_down_levels (Optional[List[int]]): Levels to have top-down features in outputs.
+ fpn_top_down_levels (Optional[list[int]]): Levels to have top-down features in outputs.

  Examples:
  >>> backbone_channels = [64, 128, 256, 512]
@@ -610,12 +610,12 @@ class FpnNeck(nn.Module):
  and top-down feature fusion. It generates output feature maps and corresponding positional encodings.

  Args:
- xs (List[torch.Tensor]): List of input tensors from the backbone, each with shape (B, C, H, W).
+ xs (list[torch.Tensor]): List of input tensors from the backbone, each with shape (B, C, H, W).

  Returns:
- out (List[torch.Tensor]): List of output feature maps after FPN processing, each with shape
+ out (list[torch.Tensor]): List of output feature maps after FPN processing, each with shape
  (B, d_model, H, W).
- pos (List[torch.Tensor]): List of positional encodings corresponding to each output feature map.
+ pos (list[torch.Tensor]): List of positional encodings corresponding to each output feature map.

  Examples:
  >>> fpn_neck = FpnNeck(d_model=256, backbone_channel_list=[64, 128, 256, 512])
@@ -664,18 +664,18 @@ class Hiera(nn.Module):
  with optional pooling and global attention mechanisms.

  Attributes:
- window_spec (Tuple[int, ...]): Window sizes for each stage.
- q_stride (Tuple[int, int]): Downsampling stride between stages.
- stage_ends (List[int]): Indices of the last block in each stage.
- q_pool_blocks (List[int]): Indices of blocks where pooling is applied.
+ window_spec (tuple[int, ...]): Window sizes for each stage.
+ q_stride (tuple[int, int]): Downsampling stride between stages.
+ stage_ends (list[int]): Indices of the last block in each stage.
+ q_pool_blocks (list[int]): Indices of blocks where pooling is applied.
  return_interm_layers (bool): Whether to return intermediate layer outputs.
  patch_embed (PatchEmbed): Module for patch embedding.
- global_att_blocks (Tuple[int, ...]): Indices of blocks with global attention.
- window_pos_embed_bkg_spatial_size (Tuple[int, int]): Spatial size for window positional embedding background.
+ global_att_blocks (tuple[int, ...]): Indices of blocks with global attention.
+ window_pos_embed_bkg_spatial_size (tuple[int, int]): Spatial size for window positional embedding background.
  pos_embed (nn.Parameter): Positional embedding for the background.
  pos_embed_window (nn.Parameter): Positional embedding for the window.
  blocks (nn.ModuleList): List of MultiScaleBlock modules.
- channel_list (List[int]): List of output channel dimensions for each stage.
+ channel_list (list[int]): List of output channel dimensions for each stage.

  Methods:
  _get_pos_embed: Generate positional embeddings by interpolating and combining window and background embeddings.
@@ -727,13 +727,13 @@ class Hiera(nn.Module):
  num_heads (int): Initial number of attention heads.
  drop_path_rate (float): Stochastic depth rate.
  q_pool (int): Number of query pooling stages.
- q_stride (Tuple[int, int]): Downsampling stride between stages.
- stages (Tuple[int, ...]): Number of blocks per stage.
+ q_stride (tuple[int, int]): Downsampling stride between stages.
+ stages (tuple[int, ...]): Number of blocks per stage.
  dim_mul (float): Dimension multiplier factor at stage transitions.
  head_mul (float): Head multiplier factor at stage transitions.
- window_pos_embed_bkg_spatial_size (Tuple[int, int]): Spatial size for window positional embedding background.
- window_spec (Tuple[int, ...]): Window sizes for each stage when not using global attention.
- global_att_blocks (Tuple[int, ...]): Indices of blocks that use global attention.
+ window_pos_embed_bkg_spatial_size (tuple[int, int]): Spatial size for window positional embedding background.
+ window_spec (tuple[int, ...]): Window sizes for each stage when not using global attention.
+ global_att_blocks (tuple[int, ...]): Indices of blocks that use global attention.
  return_interm_layers (bool): Whether to return intermediate layer outputs.

  Examples:
@@ -823,7 +823,7 @@ class Hiera(nn.Module):
  x (torch.Tensor): Input tensor with shape (B, C, H, W) representing a batch of images.

  Returns:
- (List[torch.Tensor]): List of feature maps at different scales, each with shape (B, C_i, H_i, W_i), where
+ (list[torch.Tensor]): List of feature maps at different scales, each with shape (B, C_i, H_i, W_i), where
  C_i is the channel dimension and H_i, W_i are the spatial dimensions at scale i. The list is ordered
  from highest resolution (fine features) to lowest resolution (coarse features) if return_interm_layers
  is True, otherwise contains only the final output.
@@ -68,8 +68,8 @@ class SAMModel(nn.Module):
  image_encoder (ImageEncoderViT): The backbone used to encode the image into image embeddings.
  prompt_encoder (PromptEncoder): Encodes various types of input prompts.
  mask_decoder (MaskDecoder): Predicts masks from the image embeddings and encoded prompts.
- pixel_mean (List[float]): Mean values for normalizing pixels in the input image.
- pixel_std (List[float]): Standard deviation values for normalizing pixels in the input image.
+ pixel_mean (list[float]): Mean values for normalizing pixels in the input image.
+ pixel_std (list[float]): Standard deviation values for normalizing pixels in the input image.

  Examples:
  >>> image_encoder = ImageEncoderViT(...)
@@ -435,14 +435,14 @@ class SAM2Model(torch.nn.Module):

  Args:
  backbone_features (torch.Tensor): Image features with shape (B, C, H, W).
- point_inputs (Dict[str, torch.Tensor] | None): Dictionary containing point prompts.
+ point_inputs (dict[str, torch.Tensor] | None): Dictionary containing point prompts.
  'point_coords': Tensor of shape (B, P, 2) with float32 dtype, containing absolute
  pixel-unit coordinates in (x, y) format for P input points.
  'point_labels': Tensor of shape (B, P) with int32 dtype, where 1 means positive clicks,
  0 means negative clicks, and -1 means padding.
  mask_inputs (torch.Tensor | None): Mask of shape (B, 1, H*16, W*16), float or bool, with the
  same spatial size as the image.
- high_res_features (List[torch.Tensor] | None): List of two feature maps with shapes
+ high_res_features (list[torch.Tensor] | None): List of two feature maps with shapes
  (B, C, 4*H, 4*W) and (B, C, 2*H, 2*W) respectively, used as high-resolution feature maps
  for SAM decoder.
  multimask_output (bool): If True, output 3 candidate masks and their IoU estimates; if False,
@@ -81,7 +81,7 @@ class PatchEmbed(nn.Module):
  effectively downsampling the spatial dimensions while increasing the channel dimension.

  Attributes:
- patches_resolution (Tuple[int, int]): Resolution of the patches after embedding.
+ patches_resolution (tuple[int, int]): Resolution of the patches after embedding.
  num_patches (int): Total number of patches.
  in_chans (int): Number of input channels.
  embed_dim (int): Dimension of the embedding.
@@ -203,7 +203,7 @@ class PatchMerging(nn.Module):
  resolution while potentially increasing channel dimensions.

  Attributes:
- input_resolution (Tuple[int, int]): The input resolution (height, width) of the feature map.
+ input_resolution (tuple[int, int]): The input resolution (height, width) of the feature map.
  dim (int): The input dimension of the feature map.
  out_dim (int): The output dimension after merging and projection.
  act (nn.Module): The activation function used between convolutions.
@@ -225,7 +225,7 @@ class PatchMerging(nn.Module):
  Initialize the PatchMerging module for merging and projecting neighboring patches in feature maps.

  Args:
- input_resolution (Tuple[int, int]): The input resolution (height, width) of the feature map.
+ input_resolution (tuple[int, int]): The input resolution (height, width) of the feature map.
  dim (int): The input dimension of the feature map.
  out_dim (int): The output dimension after merging and projection.
  activation (nn.Module): The activation function used between convolutions.
@@ -267,7 +267,7 @@ class ConvLayer(nn.Module):

  Attributes:
  dim (int): Dimensionality of the input and output.
- input_resolution (Tuple[int, int]): Resolution of the input image.
+ input_resolution (tuple[int, int]): Resolution of the input image.
  depth (int): Number of MBConv layers in the block.
  use_checkpoint (bool): Whether to use gradient checkpointing to save memory.
  blocks (nn.ModuleList): List of MBConv layers.
@@ -301,10 +301,10 @@ class ConvLayer(nn.Module):

  Args:
  dim (int): The dimensionality of the input and output.
- input_resolution (Tuple[int, int]): The resolution of the input image.
+ input_resolution (tuple[int, int]): The resolution of the input image.
  depth (int): The number of MBConv layers in the block.
  activation (nn.Module): Activation function applied after each convolution.
- drop_path (float | List[float], optional): Drop path rate. Single float or a list of floats for each MBConv.
+ drop_path (float | list[float], optional): Drop path rate. Single float or a list of floats for each MBConv.
  downsample (Optional[nn.Module], optional): Function for downsampling the output. None to skip downsampling.
  use_checkpoint (bool, optional): Whether to use gradient checkpointing to save memory.
  out_dim (Optional[int], optional): The dimensionality of the output. None means it will be the same as `dim`.
@@ -456,7 +456,7 @@ class Attention(torch.nn.Module):
  key_dim (int): The dimensionality of the keys and queries.
  num_heads (int, optional): Number of attention heads.
  attn_ratio (float, optional): Attention ratio, affecting the dimensions of the value vectors.
- resolution (Tuple[int, int], optional): Spatial resolution of the input feature map.
+ resolution (tuple[int, int], optional): Spatial resolution of the input feature map.
  """
  super().__init__()

@@ -530,7 +530,7 @@ class TinyViTBlock(nn.Module):

  Attributes:
  dim (int): The dimensionality of the input and output.
- input_resolution (Tuple[int, int]): Spatial resolution of the input feature map.
+ input_resolution (tuple[int, int]): Spatial resolution of the input feature map.
  num_heads (int): Number of attention heads.
  window_size (int): Size of the attention window.
  mlp_ratio (float): Ratio of MLP hidden dimension to embedding dimension.
@@ -567,7 +567,7 @@ class TinyViTBlock(nn.Module):

  Args:
  dim (int): Dimensionality of the input and output features.
- input_resolution (Tuple[int, int]): Spatial resolution of the input feature map (height, width).
+ input_resolution (tuple[int, int]): Spatial resolution of the input feature map (height, width).
  num_heads (int): Number of attention heads.
  window_size (int, optional): Size of the attention window. Must be greater than 0.
  mlp_ratio (float, optional): Ratio of MLP hidden dimension to embedding dimension.
@@ -674,7 +674,7 @@ class BasicLayer(nn.Module):

  Attributes:
  dim (int): The dimensionality of the input and output features.
- input_resolution (Tuple[int, int]): Spatial resolution of the input feature map.
+ input_resolution (tuple[int, int]): Spatial resolution of the input feature map.
  depth (int): Number of TinyViT blocks in this layer.
  use_checkpoint (bool): Whether to use gradient checkpointing to save memory.
  blocks (nn.ModuleList): List of TinyViT blocks that make up this layer.
@@ -712,13 +712,13 @@ class BasicLayer(nn.Module):

  Args:
  dim (int): Dimensionality of the input and output features.
- input_resolution (Tuple[int, int]): Spatial resolution of the input feature map (height, width).
+ input_resolution (tuple[int, int]): Spatial resolution of the input feature map (height, width).
  depth (int): Number of TinyViT blocks in this layer.
  num_heads (int): Number of attention heads in each TinyViT block.
  window_size (int): Size of the local window for attention computation.
  mlp_ratio (float, optional): Ratio of MLP hidden dimension to embedding dimension.
  drop (float, optional): Dropout rate.
- drop_path (float | List[float], optional): Stochastic depth rate. Can be a float or a list of floats for each block.
+ drop_path (float | list[float], optional): Stochastic depth rate. Can be a float or a list of floats for each block.
  downsample (nn.Module | None, optional): Downsampling layer at the end of the layer. None to skip downsampling.
  use_checkpoint (bool, optional): Whether to use gradient checkpointing to save memory.
  local_conv_size (int, optional): Kernel size for the local convolution in each TinyViT block.
@@ -778,11 +778,11 @@ class TinyViT(nn.Module):
  Attributes:
  img_size (int): Input image size.
  num_classes (int): Number of classification classes.
- depths (Tuple[int, int, int, int]): Number of blocks in each stage.
+ depths (tuple[int, int, int, int]): Number of blocks in each stage.
  num_layers (int): Total number of layers in the network.
  mlp_ratio (float): Ratio of MLP hidden dimension to embedding dimension.
  patch_embed (PatchEmbed): Module for patch embedding.
- patches_resolution (Tuple[int, int]): Resolution of embedded patches.
+ patches_resolution (tuple[int, int]): Resolution of embedded patches.
  layers (nn.ModuleList): List of network layers.
  norm_head (nn.LayerNorm): Layer normalization for the classifier head.
  head (nn.Linear): Linear layer for final classification.
@@ -823,10 +823,10 @@ class TinyViT(nn.Module):
  img_size (int, optional): Size of the input image.
  in_chans (int, optional): Number of input channels.
  num_classes (int, optional): Number of classes for classification.
- embed_dims (Tuple[int, int, int, int], optional): Embedding dimensions for each stage.
- depths (Tuple[int, int, int, int], optional): Number of blocks in each stage.
- num_heads (Tuple[int, int, int, int], optional): Number of attention heads in each stage.
- window_sizes (Tuple[int, int, int, int], optional): Window sizes for each stage.
+ embed_dims (tuple[int, int, int, int], optional): Embedding dimensions for each stage.
+ depths (tuple[int, int, int, int], optional): Number of blocks in each stage.
+ num_heads (tuple[int, int, int, int], optional): Number of attention heads in each stage.
+ window_sizes (tuple[int, int, int, int], optional): Window sizes for each stage.
  mlp_ratio (float, optional): Ratio of MLP hidden dim to embedding dim.
  drop_rate (float, optional): Dropout rate.
  drop_path_rate (float, optional): Stochastic depth rate.
@@ -14,12 +14,12 @@ def select_closest_cond_frames(frame_idx: int, cond_frame_outputs: dict[int, Any

  Args:
  frame_idx (int): Current frame index.
- cond_frame_outputs (Dict[int, Any]): Dictionary of conditioning frame outputs keyed by frame indices.
+ cond_frame_outputs (dict[int, Any]): Dictionary of conditioning frame outputs keyed by frame indices.
  max_cond_frame_num (int): Maximum number of conditioning frames to select.

  Returns:
- selected_outputs (Dict[int, Any]): Selected items from cond_frame_outputs.
- unselected_outputs (Dict[int, Any]): Items not selected from cond_frame_outputs.
+ selected_outputs (dict[int, Any]): Selected items from cond_frame_outputs.
+ unselected_outputs (dict[int, Any]): Items not selected from cond_frame_outputs.

  Examples:
  >>> frame_idx = 5
@@ -229,7 +229,7 @@ def window_partition(x: torch.Tensor, window_size: int):

  Returns:
  windows (torch.Tensor): Partitioned windows with shape (B * num_windows, window_size, window_size, C).
- padded_h_w (Tuple[int, int]): Padded height and width before partition.
+ padded_h_w (tuple[int, int]): Padded height and width before partition.

  Examples:
  >>> x = torch.randn(1, 16, 16, 3)
@@ -262,8 +262,8 @@ def window_unpartition(windows: torch.Tensor, window_size: int, pad_hw: tuple[in
  window_size, C), where B is the batch size, num_windows is the number of windows, window_size is
  the size of each window, and C is the number of channels.
  window_size (int): Size of each window.
- pad_hw (Tuple[int, int]): Padded height and width (Hp, Wp) of the input before windowing.
- hw (Tuple[int, int]): Original height and width (H, W) of the input before padding and windowing.
+ pad_hw (tuple[int, int]): Padded height and width (Hp, Wp) of the input before windowing.
+ hw (tuple[int, int]): Original height and width (H, W) of the input before padding and windowing.

  Returns:
  (torch.Tensor): Unpartitioned sequences with shape (B, H, W, C), where B is the batch size, H and W
@@ -350,8 +350,8 @@ def add_decomposed_rel_pos(
  q (torch.Tensor): Query tensor in the attention layer with shape (B, q_h * q_w, C).
  rel_pos_h (torch.Tensor): Relative position embeddings for height axis with shape (Lh, C).
  rel_pos_w (torch.Tensor): Relative position embeddings for width axis with shape (Lw, C).
- q_size (Tuple[int, int]): Spatial sequence size of query q as (q_h, q_w).
- k_size (Tuple[int, int]): Spatial sequence size of key k as (k_h, k_w).
+ q_size (tuple[int, int]): Spatial sequence size of query q as (q_h, q_w).
+ k_size (tuple[int, int]): Spatial sequence size of key k as (k_h, k_w).

  Returns:
  (torch.Tensor): Updated attention map with added relative positional embeddings, shape