dgenerate-ultralytics-headless 8.3.193-py3-none-any.whl → 8.3.195-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dgenerate_ultralytics_headless-8.3.193.dist-info → dgenerate_ultralytics_headless-8.3.195.dist-info}/METADATA +1 -2
- {dgenerate_ultralytics_headless-8.3.193.dist-info → dgenerate_ultralytics_headless-8.3.195.dist-info}/RECORD +104 -102
- tests/test_exports.py +8 -5
- tests/test_python.py +2 -2
- ultralytics/__init__.py +1 -1
- ultralytics/cfg/__init__.py +8 -8
- ultralytics/data/annotator.py +1 -1
- ultralytics/data/augment.py +75 -75
- ultralytics/data/base.py +12 -12
- ultralytics/data/converter.py +4 -4
- ultralytics/data/dataset.py +7 -7
- ultralytics/data/loaders.py +15 -15
- ultralytics/data/split_dota.py +10 -10
- ultralytics/data/utils.py +12 -12
- ultralytics/engine/exporter.py +4 -4
- ultralytics/engine/model.py +13 -13
- ultralytics/engine/predictor.py +13 -13
- ultralytics/engine/results.py +21 -21
- ultralytics/hub/__init__.py +1 -2
- ultralytics/hub/google/__init__.py +2 -2
- ultralytics/hub/session.py +7 -7
- ultralytics/hub/utils.py +0 -101
- ultralytics/models/fastsam/model.py +5 -5
- ultralytics/models/fastsam/predict.py +11 -11
- ultralytics/models/nas/model.py +1 -1
- ultralytics/models/rtdetr/predict.py +2 -2
- ultralytics/models/rtdetr/val.py +4 -4
- ultralytics/models/sam/amg.py +6 -6
- ultralytics/models/sam/build.py +9 -9
- ultralytics/models/sam/model.py +7 -7
- ultralytics/models/sam/modules/blocks.py +6 -6
- ultralytics/models/sam/modules/decoders.py +1 -1
- ultralytics/models/sam/modules/encoders.py +27 -27
- ultralytics/models/sam/modules/sam.py +4 -4
- ultralytics/models/sam/modules/tiny_encoder.py +18 -18
- ultralytics/models/sam/modules/utils.py +8 -8
- ultralytics/models/sam/predict.py +66 -66
- ultralytics/models/utils/loss.py +22 -22
- ultralytics/models/utils/ops.py +8 -8
- ultralytics/models/yolo/classify/predict.py +2 -2
- ultralytics/models/yolo/classify/train.py +8 -8
- ultralytics/models/yolo/classify/val.py +4 -4
- ultralytics/models/yolo/detect/predict.py +3 -3
- ultralytics/models/yolo/detect/train.py +6 -6
- ultralytics/models/yolo/detect/val.py +32 -32
- ultralytics/models/yolo/model.py +6 -6
- ultralytics/models/yolo/obb/train.py +1 -1
- ultralytics/models/yolo/obb/val.py +13 -13
- ultralytics/models/yolo/pose/val.py +11 -11
- ultralytics/models/yolo/segment/predict.py +4 -4
- ultralytics/models/yolo/segment/train.py +1 -1
- ultralytics/models/yolo/segment/val.py +14 -14
- ultralytics/models/yolo/world/train.py +9 -9
- ultralytics/models/yolo/world/train_world.py +1 -1
- ultralytics/models/yolo/yoloe/predict.py +4 -4
- ultralytics/models/yolo/yoloe/train.py +4 -4
- ultralytics/nn/autobackend.py +10 -13
- ultralytics/nn/modules/block.py +6 -6
- ultralytics/nn/modules/conv.py +2 -2
- ultralytics/nn/modules/head.py +4 -4
- ultralytics/nn/tasks.py +13 -13
- ultralytics/nn/text_model.py +3 -3
- ultralytics/solutions/ai_gym.py +2 -2
- ultralytics/solutions/analytics.py +3 -3
- ultralytics/solutions/config.py +5 -5
- ultralytics/solutions/distance_calculation.py +2 -2
- ultralytics/solutions/heatmap.py +1 -1
- ultralytics/solutions/instance_segmentation.py +4 -4
- ultralytics/solutions/object_counter.py +4 -4
- ultralytics/solutions/parking_management.py +7 -7
- ultralytics/solutions/queue_management.py +3 -3
- ultralytics/solutions/region_counter.py +4 -4
- ultralytics/solutions/similarity_search.py +2 -2
- ultralytics/solutions/solutions.py +48 -48
- ultralytics/solutions/streamlit_inference.py +1 -1
- ultralytics/solutions/trackzone.py +4 -4
- ultralytics/solutions/vision_eye.py +1 -1
- ultralytics/trackers/byte_tracker.py +11 -11
- ultralytics/trackers/utils/gmc.py +3 -3
- ultralytics/trackers/utils/matching.py +5 -5
- ultralytics/utils/__init__.py +1 -2
- ultralytics/utils/autodevice.py +2 -2
- ultralytics/utils/benchmarks.py +10 -10
- ultralytics/utils/callbacks/clearml.py +1 -1
- ultralytics/utils/callbacks/comet.py +5 -5
- ultralytics/utils/callbacks/hub.py +2 -1
- ultralytics/utils/checks.py +5 -5
- ultralytics/utils/cpu.py +90 -0
- ultralytics/utils/dist.py +1 -1
- ultralytics/utils/downloads.py +2 -2
- ultralytics/utils/events.py +115 -0
- ultralytics/utils/export.py +5 -5
- ultralytics/utils/instance.py +2 -2
- ultralytics/utils/metrics.py +35 -35
- ultralytics/utils/nms.py +4 -4
- ultralytics/utils/ops.py +4 -2
- ultralytics/utils/patches.py +2 -2
- ultralytics/utils/plotting.py +9 -9
- ultralytics/utils/torch_utils.py +2 -6
- ultralytics/utils/triton.py +5 -5
- {dgenerate_ultralytics_headless-8.3.193.dist-info → dgenerate_ultralytics_headless-8.3.195.dist-info}/WHEEL +0 -0
- {dgenerate_ultralytics_headless-8.3.193.dist-info → dgenerate_ultralytics_headless-8.3.195.dist-info}/entry_points.txt +0 -0
- {dgenerate_ultralytics_headless-8.3.193.dist-info → dgenerate_ultralytics_headless-8.3.195.dist-info}/licenses/LICENSE +0 -0
- {dgenerate_ultralytics_headless-8.3.193.dist-info → dgenerate_ultralytics_headless-8.3.195.dist-info}/top_level.txt +0 -0
@@ -83,7 +83,7 @@ class ImageEncoderViT(nn.Module):
      use_rel_pos (bool): If True, adds relative positional embeddings to attention maps.
      rel_pos_zero_init (bool): If True, initializes relative positional parameters to zero.
      window_size (int): Size of attention window for windowed attention blocks.
-     global_attn_indexes (
+     global_attn_indexes (tuple[int, ...]): Indices of blocks that use global attention.

  Examples:
      >>> encoder = ImageEncoderViT(img_size=224, patch_size=16, embed_dim=768, depth=12, num_heads=12)
@@ -161,13 +161,13 @@ class PromptEncoder(nn.Module):

  Attributes:
      embed_dim (int): Dimension of the embeddings.
-     input_image_size (
-     image_embedding_size (
+     input_image_size (tuple[int, int]): Size of the input image as (H, W).
+     image_embedding_size (tuple[int, int]): Spatial size of the image embedding as (H, W).
      pe_layer (PositionEmbeddingRandom): Module for random position embedding.
      num_point_embeddings (int): Number of point embeddings for different types of points.
      point_embeddings (nn.ModuleList): List of point embeddings.
      not_a_point_embed (nn.Embedding): Embedding for points that are not part of any label.
-     mask_input_size (
+     mask_input_size (tuple[int, int]): Size of the input mask.
      mask_downscaling (nn.Sequential): Neural network for downscaling the mask.
      no_mask_embed (nn.Embedding): Embedding for cases where no mask is provided.

@@ -198,8 +198,8 @@ class PromptEncoder(nn.Module):

  Args:
      embed_dim (int): The dimension of the embeddings.
-     image_embedding_size (
-     input_image_size (
+     image_embedding_size (tuple[int, int]): The spatial size of the image embedding as (H, W).
+     input_image_size (tuple[int, int]): The padded size of the input image as (H, W).
      mask_in_chans (int): The number of hidden channels used for encoding input masks.
      activation (Type[nn.Module]): The activation function to use when encoding input masks.

@@ -310,7 +310,7 @@ class PromptEncoder(nn.Module):
  Embed different types of prompts, returning both sparse and dense embeddings.

  Args:
-     points (
+     points (tuple[torch.Tensor, torch.Tensor] | None): Point coordinates and labels to embed. The first
          tensor contains coordinates with shape (B, N, 2), and the second tensor contains labels with
          shape (B, N).
      boxes (torch.Tensor | None): Boxes to embed with shape (B, M, 2, 2), where M is the number of boxes.
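The `points` prompt documented above is a (coords, labels) pair. As a hedged sketch of those shapes (the constructor values below are illustrative assumptions, not taken from this diff):

```python
import torch
from ultralytics.models.sam.modules.encoders import PromptEncoder

encoder = PromptEncoder(
    embed_dim=256,
    image_embedding_size=(64, 64),   # (H, W) of the image embedding
    input_image_size=(1024, 1024),   # padded input image size (H, W)
    mask_in_chans=16,
)
coords = torch.rand(1, 2, 2) * 1024      # (B, N, 2) absolute pixel coordinates
labels = torch.tensor([[1, 0]])          # (B, N) labels: 1 = foreground, 0 = background
sparse, dense = encoder(points=(coords, labels), boxes=None, masks=None)
print(sparse.shape, dense.shape)         # sparse (B, N', 256), dense (B, 256, 64, 64)
```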
@@ -522,10 +522,10 @@ class FpnNeck(nn.Module):
  Attributes:
      position_encoding (PositionEmbeddingSine): Sinusoidal positional encoding module.
      convs (nn.ModuleList): List of convolutional layers for each backbone level.
-     backbone_channel_list (
+     backbone_channel_list (list[int]): List of channel dimensions from the backbone.
      fpn_interp_model (str): Interpolation mode for FPN feature resizing.
      fuse_type (str): Type of feature fusion, either 'sum' or 'avg'.
-     fpn_top_down_levels (
+     fpn_top_down_levels (list[int]): Levels to have top-down features in outputs.

  Methods:
      forward: Perform forward pass through the FPN neck.
@@ -558,13 +558,13 @@ class FpnNeck(nn.Module):

  Args:
      d_model (int): Dimension of the model.
-     backbone_channel_list (
+     backbone_channel_list (list[int]): List of channel dimensions from the backbone.
      kernel_size (int): Kernel size for the convolutional layers.
      stride (int): Stride for the convolutional layers.
      padding (int): Padding for the convolutional layers.
      fpn_interp_model (str): Interpolation mode for FPN feature resizing.
      fuse_type (str): Type of feature fusion, either 'sum' or 'avg'.
-     fpn_top_down_levels (Optional[
+     fpn_top_down_levels (Optional[list[int]]): Levels to have top-down features in outputs.

  Examples:
      >>> backbone_channels = [64, 128, 256, 512]
@@ -610,12 +610,12 @@ class FpnNeck(nn.Module):
  and top-down feature fusion. It generates output feature maps and corresponding positional encodings.

  Args:
-     xs (
+     xs (list[torch.Tensor]): List of input tensors from the backbone, each with shape (B, C, H, W).

  Returns:
-     out (
+     out (list[torch.Tensor]): List of output feature maps after FPN processing, each with shape
          (B, d_model, H, W).
-     pos (
+     pos (list[torch.Tensor]): List of positional encodings corresponding to each output feature map.

  Examples:
      >>> fpn_neck = FpnNeck(d_model=256, backbone_channel_list=[64, 128, 256, 512])
@@ -664,18 +664,18 @@ class Hiera(nn.Module):
  with optional pooling and global attention mechanisms.

  Attributes:
-     window_spec (
-     q_stride (
-     stage_ends (
-     q_pool_blocks (
+     window_spec (tuple[int, ...]): Window sizes for each stage.
+     q_stride (tuple[int, int]): Downsampling stride between stages.
+     stage_ends (list[int]): Indices of the last block in each stage.
+     q_pool_blocks (list[int]): Indices of blocks where pooling is applied.
      return_interm_layers (bool): Whether to return intermediate layer outputs.
      patch_embed (PatchEmbed): Module for patch embedding.
-     global_att_blocks (
-     window_pos_embed_bkg_spatial_size (
+     global_att_blocks (tuple[int, ...]): Indices of blocks with global attention.
+     window_pos_embed_bkg_spatial_size (tuple[int, int]): Spatial size for window positional embedding background.
      pos_embed (nn.Parameter): Positional embedding for the background.
      pos_embed_window (nn.Parameter): Positional embedding for the window.
      blocks (nn.ModuleList): List of MultiScaleBlock modules.
-     channel_list (
+     channel_list (list[int]): List of output channel dimensions for each stage.

  Methods:
      _get_pos_embed: Generate positional embeddings by interpolating and combining window and background embeddings.
@@ -727,13 +727,13 @@ class Hiera(nn.Module):
      num_heads (int): Initial number of attention heads.
      drop_path_rate (float): Stochastic depth rate.
      q_pool (int): Number of query pooling stages.
-     q_stride (
-     stages (
+     q_stride (tuple[int, int]): Downsampling stride between stages.
+     stages (tuple[int, ...]): Number of blocks per stage.
      dim_mul (float): Dimension multiplier factor at stage transitions.
      head_mul (float): Head multiplier factor at stage transitions.
-     window_pos_embed_bkg_spatial_size (
-     window_spec (
-     global_att_blocks (
+     window_pos_embed_bkg_spatial_size (tuple[int, int]): Spatial size for window positional embedding background.
+     window_spec (tuple[int, ...]): Window sizes for each stage when not using global attention.
+     global_att_blocks (tuple[int, ...]): Indices of blocks that use global attention.
      return_interm_layers (bool): Whether to return intermediate layer outputs.

  Examples:
@@ -823,7 +823,7 @@ class Hiera(nn.Module):
      x (torch.Tensor): Input tensor with shape (B, C, H, W) representing a batch of images.

  Returns:
-     (
+     (list[torch.Tensor]): List of feature maps at different scales, each with shape (B, C_i, H_i, W_i), where
          C_i is the channel dimension and H_i, W_i are the spatial dimensions at scale i. The list is ordered
          from highest resolution (fine features) to lowest resolution (coarse features) if return_interm_layers
          is True, otherwise contains only the final output.
@@ -68,8 +68,8 @@ class SAMModel(nn.Module):
      image_encoder (ImageEncoderViT): The backbone used to encode the image into image embeddings.
      prompt_encoder (PromptEncoder): Encodes various types of input prompts.
      mask_decoder (MaskDecoder): Predicts masks from the image embeddings and encoded prompts.
-     pixel_mean (
-     pixel_std (
+     pixel_mean (list[float]): Mean values for normalizing pixels in the input image.
+     pixel_std (list[float]): Standard deviation values for normalizing pixels in the input image.

  Examples:
      >>> image_encoder = ImageEncoderViT(...)
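For context, `pixel_mean` and `pixel_std` are plain per-channel lists applied to the input image. A minimal sketch of that normalization; the specific values below are SAM's commonly quoted defaults and are an assumption here, not part of this diff:

```python
import torch

pixel_mean = [123.675, 116.28, 103.53]   # assumed per-channel means (0-255 scale)
pixel_std = [58.395, 57.12, 57.375]      # assumed per-channel standard deviations

img = torch.rand(3, 1024, 1024) * 255                # (C, H, W) image in 0-255 range
mean = torch.tensor(pixel_mean).view(-1, 1, 1)
std = torch.tensor(pixel_std).view(-1, 1, 1)
normalized = (img - mean) / std                      # per-channel pixel normalization
```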
@@ -435,14 +435,14 @@ class SAM2Model(torch.nn.Module):

  Args:
      backbone_features (torch.Tensor): Image features with shape (B, C, H, W).
-     point_inputs (
+     point_inputs (dict[str, torch.Tensor] | None): Dictionary containing point prompts.
          'point_coords': Tensor of shape (B, P, 2) with float32 dtype, containing absolute
              pixel-unit coordinates in (x, y) format for P input points.
          'point_labels': Tensor of shape (B, P) with int32 dtype, where 1 means positive clicks,
              0 means negative clicks, and -1 means padding.
      mask_inputs (torch.Tensor | None): Mask of shape (B, 1, H*16, W*16), float or bool, with the
          same spatial size as the image.
-     high_res_features (
+     high_res_features (list[torch.Tensor] | None): List of two feature maps with shapes
          (B, C, 4*H, 4*W) and (B, C, 2*H, 2*W) respectively, used as high-resolution feature maps
          for SAM decoder.
      multimask_output (bool): If True, output 3 candidate masks and their IoU estimates; if False,
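A short sketch of the `point_inputs` dictionary in exactly the format documented above (the coordinates and labels themselves are illustrative):

```python
import torch

point_inputs = {
    # (B, P, 2) float32 absolute pixel coordinates in (x, y) order
    "point_coords": torch.tensor([[[320.0, 240.0], [100.0, 80.0], [0.0, 0.0]]], dtype=torch.float32),
    # (B, P) int32 labels: 1 = positive click, 0 = negative click, -1 = padding
    "point_labels": torch.tensor([[1, 0, -1]], dtype=torch.int32),
}
assert point_inputs["point_coords"].shape == (1, 3, 2)
assert point_inputs["point_labels"].shape == (1, 3)
```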
@@ -81,7 +81,7 @@ class PatchEmbed(nn.Module):
  effectively downsampling the spatial dimensions while increasing the channel dimension.

  Attributes:
-     patches_resolution (
+     patches_resolution (tuple[int, int]): Resolution of the patches after embedding.
      num_patches (int): Total number of patches.
      in_chans (int): Number of input channels.
      embed_dim (int): Dimension of the embedding.
@@ -203,7 +203,7 @@ class PatchMerging(nn.Module):
  resolution while potentially increasing channel dimensions.

  Attributes:
-     input_resolution (
+     input_resolution (tuple[int, int]): The input resolution (height, width) of the feature map.
      dim (int): The input dimension of the feature map.
      out_dim (int): The output dimension after merging and projection.
      act (nn.Module): The activation function used between convolutions.
@@ -225,7 +225,7 @@ class PatchMerging(nn.Module):
  Initialize the PatchMerging module for merging and projecting neighboring patches in feature maps.

  Args:
-     input_resolution (
+     input_resolution (tuple[int, int]): The input resolution (height, width) of the feature map.
      dim (int): The input dimension of the feature map.
      out_dim (int): The output dimension after merging and projection.
      activation (nn.Module): The activation function used between convolutions.
@@ -267,7 +267,7 @@ class ConvLayer(nn.Module):

  Attributes:
      dim (int): Dimensionality of the input and output.
-     input_resolution (
+     input_resolution (tuple[int, int]): Resolution of the input image.
      depth (int): Number of MBConv layers in the block.
      use_checkpoint (bool): Whether to use gradient checkpointing to save memory.
      blocks (nn.ModuleList): List of MBConv layers.
@@ -301,10 +301,10 @@ class ConvLayer(nn.Module):

  Args:
      dim (int): The dimensionality of the input and output.
-     input_resolution (
+     input_resolution (tuple[int, int]): The resolution of the input image.
      depth (int): The number of MBConv layers in the block.
      activation (nn.Module): Activation function applied after each convolution.
-     drop_path (float |
+     drop_path (float | list[float], optional): Drop path rate. Single float or a list of floats for each MBConv.
      downsample (Optional[nn.Module], optional): Function for downsampling the output. None to skip downsampling.
      use_checkpoint (bool, optional): Whether to use gradient checkpointing to save memory.
      out_dim (Optional[int], optional): The dimensionality of the output. None means it will be the same as `dim`.
@@ -456,7 +456,7 @@ class Attention(torch.nn.Module):
      key_dim (int): The dimensionality of the keys and queries.
      num_heads (int, optional): Number of attention heads.
      attn_ratio (float, optional): Attention ratio, affecting the dimensions of the value vectors.
-     resolution (
+     resolution (tuple[int, int], optional): Spatial resolution of the input feature map.
  """
  super().__init__()

@@ -530,7 +530,7 @@ class TinyViTBlock(nn.Module):

  Attributes:
      dim (int): The dimensionality of the input and output.
-     input_resolution (
+     input_resolution (tuple[int, int]): Spatial resolution of the input feature map.
      num_heads (int): Number of attention heads.
      window_size (int): Size of the attention window.
      mlp_ratio (float): Ratio of MLP hidden dimension to embedding dimension.
@@ -567,7 +567,7 @@ class TinyViTBlock(nn.Module):

  Args:
      dim (int): Dimensionality of the input and output features.
-     input_resolution (
+     input_resolution (tuple[int, int]): Spatial resolution of the input feature map (height, width).
      num_heads (int): Number of attention heads.
      window_size (int, optional): Size of the attention window. Must be greater than 0.
      mlp_ratio (float, optional): Ratio of MLP hidden dimension to embedding dimension.
@@ -674,7 +674,7 @@ class BasicLayer(nn.Module):

  Attributes:
      dim (int): The dimensionality of the input and output features.
-     input_resolution (
+     input_resolution (tuple[int, int]): Spatial resolution of the input feature map.
      depth (int): Number of TinyViT blocks in this layer.
      use_checkpoint (bool): Whether to use gradient checkpointing to save memory.
      blocks (nn.ModuleList): List of TinyViT blocks that make up this layer.
@@ -712,13 +712,13 @@ class BasicLayer(nn.Module):

  Args:
      dim (int): Dimensionality of the input and output features.
-     input_resolution (
+     input_resolution (tuple[int, int]): Spatial resolution of the input feature map (height, width).
      depth (int): Number of TinyViT blocks in this layer.
      num_heads (int): Number of attention heads in each TinyViT block.
      window_size (int): Size of the local window for attention computation.
      mlp_ratio (float, optional): Ratio of MLP hidden dimension to embedding dimension.
      drop (float, optional): Dropout rate.
-     drop_path (float |
+     drop_path (float | list[float], optional): Stochastic depth rate. Can be a float or a list of floats for each block.
      downsample (nn.Module | None, optional): Downsampling layer at the end of the layer. None to skip downsampling.
      use_checkpoint (bool, optional): Whether to use gradient checkpointing to save memory.
      local_conv_size (int, optional): Kernel size for the local convolution in each TinyViT block.
@@ -778,11 +778,11 @@ class TinyViT(nn.Module):
  Attributes:
      img_size (int): Input image size.
      num_classes (int): Number of classification classes.
-     depths (
+     depths (tuple[int, int, int, int]): Number of blocks in each stage.
      num_layers (int): Total number of layers in the network.
      mlp_ratio (float): Ratio of MLP hidden dimension to embedding dimension.
      patch_embed (PatchEmbed): Module for patch embedding.
-     patches_resolution (
+     patches_resolution (tuple[int, int]): Resolution of embedded patches.
      layers (nn.ModuleList): List of network layers.
      norm_head (nn.LayerNorm): Layer normalization for the classifier head.
      head (nn.Linear): Linear layer for final classification.
@@ -823,10 +823,10 @@ class TinyViT(nn.Module):
      img_size (int, optional): Size of the input image.
      in_chans (int, optional): Number of input channels.
      num_classes (int, optional): Number of classes for classification.
-     embed_dims (
-     depths (
-     num_heads (
-     window_sizes (
+     embed_dims (tuple[int, int, int, int], optional): Embedding dimensions for each stage.
+     depths (tuple[int, int, int, int], optional): Number of blocks in each stage.
+     num_heads (tuple[int, int, int, int], optional): Number of attention heads in each stage.
+     window_sizes (tuple[int, int, int, int], optional): Window sizes for each stage.
      mlp_ratio (float, optional): Ratio of MLP hidden dim to embedding dim.
      drop_rate (float, optional): Dropout rate.
      drop_path_rate (float, optional): Stochastic depth rate.
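The stage-wise hyperparameters documented above are 4-tuples, one entry per stage. A hedged construction sketch (the concrete values are illustrative assumptions, not defaults asserted by this diff):

```python
from ultralytics.models.sam.modules.tiny_encoder import TinyViT

model = TinyViT(
    img_size=224,
    in_chans=3,
    num_classes=1000,
    embed_dims=(96, 192, 384, 576),   # embedding dimension per stage
    depths=(2, 2, 6, 2),              # number of blocks per stage
    num_heads=(3, 6, 12, 18),         # attention heads per stage
    window_sizes=(7, 7, 14, 7),       # attention window size per stage
)
print(sum(p.numel() for p in model.parameters()))  # rough parameter count
```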
@@ -14,12 +14,12 @@ def select_closest_cond_frames(frame_idx: int, cond_frame_outputs: dict[int, Any

  Args:
      frame_idx (int): Current frame index.
-     cond_frame_outputs (
+     cond_frame_outputs (dict[int, Any]): Dictionary of conditioning frame outputs keyed by frame indices.
      max_cond_frame_num (int): Maximum number of conditioning frames to select.

  Returns:
-     selected_outputs (
-     unselected_outputs (
+     selected_outputs (dict[int, Any]): Selected items from cond_frame_outputs.
+     unselected_outputs (dict[int, Any]): Items not selected from cond_frame_outputs.

  Examples:
      >>> frame_idx = 5
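A small usage sketch of `select_closest_cond_frames` with the dict[int, Any] input documented above (frame indices and values are illustrative):

```python
from ultralytics.models.sam.modules.utils import select_closest_cond_frames

cond_frame_outputs = {1: "out1", 7: "out7", 230: "out230", 335: "out335"}
selected, unselected = select_closest_cond_frames(
    frame_idx=300, cond_frame_outputs=cond_frame_outputs, max_cond_frame_num=2
)
print(sorted(selected))    # the two frame indices closest to frame 300
print(sorted(unselected))  # everything else
```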
@@ -229,7 +229,7 @@ def window_partition(x: torch.Tensor, window_size: int):

  Returns:
      windows (torch.Tensor): Partitioned windows with shape (B * num_windows, window_size, window_size, C).
-     padded_h_w (
+     padded_h_w (tuple[int, int]): Padded height and width before partition.

  Examples:
      >>> x = torch.randn(1, 16, 16, 3)
@@ -262,8 +262,8 @@ def window_unpartition(windows: torch.Tensor, window_size: int, pad_hw: tuple[in
          window_size, C), where B is the batch size, num_windows is the number of windows, window_size is
          the size of each window, and C is the number of channels.
      window_size (int): Size of each window.
-     pad_hw (
-     hw (
+     pad_hw (tuple[int, int]): Padded height and width (Hp, Wp) of the input before windowing.
+     hw (tuple[int, int]): Original height and width (H, W) of the input before padding and windowing.

  Returns:
      (torch.Tensor): Unpartitioned sequences with shape (B, H, W, C), where B is the batch size, H and W
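A round-trip sketch of `window_partition`/`window_unpartition` using the shapes and return values documented above:

```python
import torch
from ultralytics.models.sam.modules.utils import window_partition, window_unpartition

x = torch.randn(1, 16, 16, 3)                          # (B, H, W, C)
windows, pad_hw = window_partition(x, window_size=7)   # pads 16 -> 21, then tiles 7x7 windows
restored = window_unpartition(windows, 7, pad_hw, (16, 16))
assert restored.shape == x.shape                        # padding is removed on the way back
assert torch.allclose(restored, x)
```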
@@ -350,8 +350,8 @@ def add_decomposed_rel_pos(
      q (torch.Tensor): Query tensor in the attention layer with shape (B, q_h * q_w, C).
      rel_pos_h (torch.Tensor): Relative position embeddings for height axis with shape (Lh, C).
      rel_pos_w (torch.Tensor): Relative position embeddings for width axis with shape (Lw, C).
-     q_size (
-     k_size (
+     q_size (tuple[int, int]): Spatial sequence size of query q as (q_h, q_w).
+     k_size (tuple[int, int]): Spatial sequence size of key k as (k_h, k_w).

  Returns:
      (torch.Tensor): Updated attention map with added relative positional embeddings, shape