ultralytics 8.3.142__py3-none-any.whl → 8.3.144__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148)
  1. tests/conftest.py +7 -24
  2. tests/test_cli.py +1 -1
  3. tests/test_cuda.py +7 -2
  4. tests/test_engine.py +7 -8
  5. tests/test_exports.py +16 -16
  6. tests/test_integrations.py +1 -1
  7. tests/test_solutions.py +12 -12
  8. ultralytics/__init__.py +1 -1
  9. ultralytics/cfg/__init__.py +16 -13
  10. ultralytics/data/annotator.py +6 -5
  11. ultralytics/data/augment.py +127 -126
  12. ultralytics/data/base.py +54 -51
  13. ultralytics/data/build.py +47 -23
  14. ultralytics/data/converter.py +47 -43
  15. ultralytics/data/dataset.py +51 -50
  16. ultralytics/data/loaders.py +77 -44
  17. ultralytics/data/split.py +22 -9
  18. ultralytics/data/split_dota.py +63 -39
  19. ultralytics/data/utils.py +59 -39
  20. ultralytics/engine/exporter.py +79 -27
  21. ultralytics/engine/model.py +39 -39
  22. ultralytics/engine/predictor.py +37 -28
  23. ultralytics/engine/results.py +187 -157
  24. ultralytics/engine/trainer.py +36 -19
  25. ultralytics/engine/tuner.py +12 -9
  26. ultralytics/engine/validator.py +7 -9
  27. ultralytics/hub/__init__.py +11 -13
  28. ultralytics/hub/auth.py +22 -2
  29. ultralytics/hub/google/__init__.py +19 -19
  30. ultralytics/hub/session.py +37 -51
  31. ultralytics/hub/utils.py +19 -5
  32. ultralytics/models/fastsam/model.py +30 -12
  33. ultralytics/models/fastsam/predict.py +5 -6
  34. ultralytics/models/fastsam/utils.py +3 -3
  35. ultralytics/models/fastsam/val.py +10 -6
  36. ultralytics/models/nas/model.py +9 -5
  37. ultralytics/models/nas/predict.py +6 -6
  38. ultralytics/models/nas/val.py +3 -3
  39. ultralytics/models/rtdetr/model.py +7 -6
  40. ultralytics/models/rtdetr/predict.py +14 -7
  41. ultralytics/models/rtdetr/train.py +10 -4
  42. ultralytics/models/rtdetr/val.py +36 -9
  43. ultralytics/models/sam/amg.py +30 -12
  44. ultralytics/models/sam/build.py +22 -22
  45. ultralytics/models/sam/model.py +10 -9
  46. ultralytics/models/sam/modules/blocks.py +76 -80
  47. ultralytics/models/sam/modules/decoders.py +6 -8
  48. ultralytics/models/sam/modules/encoders.py +23 -26
  49. ultralytics/models/sam/modules/memory_attention.py +13 -1
  50. ultralytics/models/sam/modules/sam.py +57 -26
  51. ultralytics/models/sam/modules/tiny_encoder.py +232 -237
  52. ultralytics/models/sam/modules/transformer.py +13 -13
  53. ultralytics/models/sam/modules/utils.py +11 -19
  54. ultralytics/models/sam/predict.py +114 -101
  55. ultralytics/models/utils/loss.py +98 -77
  56. ultralytics/models/utils/ops.py +116 -67
  57. ultralytics/models/yolo/classify/predict.py +5 -5
  58. ultralytics/models/yolo/classify/train.py +32 -28
  59. ultralytics/models/yolo/classify/val.py +7 -8
  60. ultralytics/models/yolo/detect/predict.py +1 -0
  61. ultralytics/models/yolo/detect/train.py +15 -14
  62. ultralytics/models/yolo/detect/val.py +37 -36
  63. ultralytics/models/yolo/model.py +106 -23
  64. ultralytics/models/yolo/obb/predict.py +3 -4
  65. ultralytics/models/yolo/obb/train.py +14 -6
  66. ultralytics/models/yolo/obb/val.py +29 -23
  67. ultralytics/models/yolo/pose/predict.py +9 -8
  68. ultralytics/models/yolo/pose/train.py +24 -16
  69. ultralytics/models/yolo/pose/val.py +44 -26
  70. ultralytics/models/yolo/segment/predict.py +5 -5
  71. ultralytics/models/yolo/segment/train.py +11 -7
  72. ultralytics/models/yolo/segment/val.py +2 -2
  73. ultralytics/models/yolo/world/train.py +33 -23
  74. ultralytics/models/yolo/world/train_world.py +11 -3
  75. ultralytics/models/yolo/yoloe/predict.py +11 -11
  76. ultralytics/models/yolo/yoloe/train.py +73 -21
  77. ultralytics/models/yolo/yoloe/train_seg.py +10 -7
  78. ultralytics/models/yolo/yoloe/val.py +42 -18
  79. ultralytics/nn/autobackend.py +59 -15
  80. ultralytics/nn/modules/__init__.py +4 -4
  81. ultralytics/nn/modules/activation.py +4 -1
  82. ultralytics/nn/modules/block.py +178 -111
  83. ultralytics/nn/modules/conv.py +6 -5
  84. ultralytics/nn/modules/head.py +469 -121
  85. ultralytics/nn/modules/transformer.py +147 -58
  86. ultralytics/nn/tasks.py +227 -20
  87. ultralytics/nn/text_model.py +30 -33
  88. ultralytics/solutions/ai_gym.py +1 -1
  89. ultralytics/solutions/analytics.py +7 -4
  90. ultralytics/solutions/config.py +10 -10
  91. ultralytics/solutions/distance_calculation.py +11 -10
  92. ultralytics/solutions/heatmap.py +1 -1
  93. ultralytics/solutions/instance_segmentation.py +6 -3
  94. ultralytics/solutions/object_blurrer.py +3 -3
  95. ultralytics/solutions/object_counter.py +16 -8
  96. ultralytics/solutions/object_cropper.py +12 -5
  97. ultralytics/solutions/parking_management.py +29 -28
  98. ultralytics/solutions/queue_management.py +6 -6
  99. ultralytics/solutions/region_counter.py +10 -3
  100. ultralytics/solutions/security_alarm.py +3 -3
  101. ultralytics/solutions/similarity_search.py +85 -24
  102. ultralytics/solutions/solutions.py +215 -85
  103. ultralytics/solutions/speed_estimation.py +28 -22
  104. ultralytics/solutions/streamlit_inference.py +17 -12
  105. ultralytics/solutions/trackzone.py +4 -4
  106. ultralytics/trackers/basetrack.py +16 -23
  107. ultralytics/trackers/bot_sort.py +30 -20
  108. ultralytics/trackers/byte_tracker.py +70 -64
  109. ultralytics/trackers/track.py +4 -8
  110. ultralytics/trackers/utils/gmc.py +31 -58
  111. ultralytics/trackers/utils/kalman_filter.py +37 -37
  112. ultralytics/trackers/utils/matching.py +1 -1
  113. ultralytics/utils/__init__.py +105 -89
  114. ultralytics/utils/autobatch.py +16 -3
  115. ultralytics/utils/autodevice.py +54 -24
  116. ultralytics/utils/benchmarks.py +42 -28
  117. ultralytics/utils/callbacks/base.py +3 -3
  118. ultralytics/utils/callbacks/clearml.py +9 -9
  119. ultralytics/utils/callbacks/comet.py +67 -25
  120. ultralytics/utils/callbacks/dvc.py +7 -10
  121. ultralytics/utils/callbacks/mlflow.py +2 -5
  122. ultralytics/utils/callbacks/neptune.py +7 -13
  123. ultralytics/utils/callbacks/raytune.py +1 -1
  124. ultralytics/utils/callbacks/tensorboard.py +5 -6
  125. ultralytics/utils/callbacks/wb.py +14 -14
  126. ultralytics/utils/checks.py +14 -13
  127. ultralytics/utils/dist.py +5 -5
  128. ultralytics/utils/downloads.py +94 -67
  129. ultralytics/utils/errors.py +5 -5
  130. ultralytics/utils/export.py +61 -47
  131. ultralytics/utils/files.py +23 -22
  132. ultralytics/utils/instance.py +48 -52
  133. ultralytics/utils/loss.py +78 -40
  134. ultralytics/utils/metrics.py +186 -130
  135. ultralytics/utils/ops.py +186 -190
  136. ultralytics/utils/patches.py +15 -17
  137. ultralytics/utils/plotting.py +71 -27
  138. ultralytics/utils/tal.py +21 -15
  139. ultralytics/utils/torch_utils.py +53 -50
  140. ultralytics/utils/triton.py +5 -4
  141. ultralytics/utils/tuner.py +5 -5
  142. {ultralytics-8.3.142.dist-info → ultralytics-8.3.144.dist-info}/METADATA +1 -1
  143. ultralytics-8.3.144.dist-info/RECORD +272 -0
  144. ultralytics-8.3.142.dist-info/RECORD +0 -272
  145. {ultralytics-8.3.142.dist-info → ultralytics-8.3.144.dist-info}/WHEEL +0 -0
  146. {ultralytics-8.3.142.dist-info → ultralytics-8.3.144.dist-info}/entry_points.txt +0 -0
  147. {ultralytics-8.3.142.dist-info → ultralytics-8.3.144.dist-info}/licenses/LICENSE +0 -0
  148. {ultralytics-8.3.142.dist-info → ultralytics-8.3.144.dist-info}/top_level.txt +0 -0
ultralytics/nn/tasks.py CHANGED
@@ -94,7 +94,30 @@ from ultralytics.utils.torch_utils import (
 
 
 class BaseModel(torch.nn.Module):
-    """The BaseModel class serves as a base class for all the models in the Ultralytics YOLO family."""
+    """
+    Base class for all YOLO models in the Ultralytics family.
+
+    This class provides common functionality for YOLO models including forward pass handling, model fusion,
+    information display, and weight loading capabilities.
+
+    Attributes:
+        model (torch.nn.Module): The neural network model.
+        save (list): List of layer indices to save outputs from.
+        stride (torch.Tensor): Model stride values.
+
+    Methods:
+        forward: Perform forward pass for training or inference.
+        predict: Perform inference on input tensor.
+        fuse: Fuse Conv2d and BatchNorm2d layers for optimization.
+        info: Print model information.
+        load: Load weights into the model.
+        loss: Compute loss for training.
+
+    Examples:
+        Create a BaseModel instance
+        >>> model = BaseModel()
+        >>> model.info()  # Display model information
+    """
 
     def forward(self, x, *args, **kwargs):
         """
@@ -319,7 +342,33 @@ class BaseModel(torch.nn.Module):
 
 
 class DetectionModel(BaseModel):
-    """YOLO detection model."""
+    """
+    YOLO detection model.
+
+    This class implements the YOLO detection architecture, handling model initialization, forward pass,
+    augmented inference, and loss computation for object detection tasks.
+
+    Attributes:
+        yaml (dict): Model configuration dictionary.
+        model (torch.nn.Sequential): The neural network model.
+        save (list): List of layer indices to save outputs from.
+        names (dict): Class names dictionary.
+        inplace (bool): Whether to use inplace operations.
+        end2end (bool): Whether the model uses end-to-end detection.
+        stride (torch.Tensor): Model stride values.
+
+    Methods:
+        __init__: Initialize the YOLO detection model.
+        _predict_augment: Perform augmented inference.
+        _descale_pred: De-scale predictions following augmented inference.
+        _clip_augmented: Clip YOLO augmented inference tails.
+        init_criterion: Initialize the loss criterion.
+
+    Examples:
+        Initialize a detection model
+        >>> model = DetectionModel("yolo11n.yaml", ch=3, nc=80)
+        >>> results = model.predict(image_tensor)
+    """
 
     def __init__(self, cfg="yolo11n.yaml", ch=3, nc=None, verbose=True):
        """
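For context, the constructor and predict call documented in the new DetectionModel docstring can be exercised directly; a minimal sketch (the dummy 640x640 input shape is only illustrative):

    import torch
    from ultralytics.nn.tasks import DetectionModel

    # Build a detection model from a bundled YAML config (3 input channels, 80 classes)
    model = DetectionModel("yolo11n.yaml", ch=3, nc=80, verbose=False)
    model.eval()

    # Plain inference forward pass on a dummy image tensor
    image_tensor = torch.zeros(1, 3, 640, 640)
    with torch.no_grad():
        preds = model.predict(image_tensor)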
@@ -447,7 +496,21 @@ class DetectionModel(BaseModel):
 
 
 class OBBModel(DetectionModel):
-    """YOLO Oriented Bounding Box (OBB) model."""
+    """
+    YOLO Oriented Bounding Box (OBB) model.
+
+    This class extends DetectionModel to handle oriented bounding box detection tasks, providing specialized
+    loss computation for rotated object detection.
+
+    Methods:
+        __init__: Initialize YOLO OBB model.
+        init_criterion: Initialize the loss criterion for OBB detection.
+
+    Examples:
+        Initialize an OBB model
+        >>> model = OBBModel("yolo11n-obb.yaml", ch=3, nc=80)
+        >>> results = model.predict(image_tensor)
+    """
 
     def __init__(self, cfg="yolo11n-obb.yaml", ch=3, nc=None, verbose=True):
         """
@@ -467,7 +530,21 @@ class OBBModel(DetectionModel):
 
 
 class SegmentationModel(DetectionModel):
-    """YOLO segmentation model."""
+    """
+    YOLO segmentation model.
+
+    This class extends DetectionModel to handle instance segmentation tasks, providing specialized
+    loss computation for pixel-level object detection and segmentation.
+
+    Methods:
+        __init__: Initialize YOLO segmentation model.
+        init_criterion: Initialize the loss criterion for segmentation.
+
+    Examples:
+        Initialize a segmentation model
+        >>> model = SegmentationModel("yolo11n-seg.yaml", ch=3, nc=80)
+        >>> results = model.predict(image_tensor)
+    """
 
     def __init__(self, cfg="yolo11n-seg.yaml", ch=3, nc=None, verbose=True):
         """
@@ -487,7 +564,24 @@ class SegmentationModel(DetectionModel):
 
 
 class PoseModel(DetectionModel):
-    """YOLO pose model."""
+    """
+    YOLO pose model.
+
+    This class extends DetectionModel to handle human pose estimation tasks, providing specialized
+    loss computation for keypoint detection and pose estimation.
+
+    Attributes:
+        kpt_shape (tuple): Shape of keypoints data (num_keypoints, num_dimensions).
+
+    Methods:
+        __init__: Initialize YOLO pose model.
+        init_criterion: Initialize the loss criterion for pose estimation.
+
+    Examples:
+        Initialize a pose model
+        >>> model = PoseModel("yolo11n-pose.yaml", ch=3, nc=1, data_kpt_shape=(17, 3))
+        >>> results = model.predict(image_tensor)
+    """
 
     def __init__(self, cfg="yolo11n-pose.yaml", ch=3, nc=None, data_kpt_shape=(None, None), verbose=True):
         """
@@ -513,7 +607,29 @@ class PoseModel(DetectionModel):
 
 
 class ClassificationModel(BaseModel):
-    """YOLO classification model."""
+    """
+    YOLO classification model.
+
+    This class implements the YOLO classification architecture for image classification tasks,
+    providing model initialization, configuration, and output reshaping capabilities.
+
+    Attributes:
+        yaml (dict): Model configuration dictionary.
+        model (torch.nn.Sequential): The neural network model.
+        stride (torch.Tensor): Model stride values.
+        names (dict): Class names dictionary.
+
+    Methods:
+        __init__: Initialize ClassificationModel.
+        _from_yaml: Set model configurations and define architecture.
+        reshape_outputs: Update model to specified class count.
+        init_criterion: Initialize the loss criterion.
+
+    Examples:
+        Initialize a classification model
+        >>> model = ClassificationModel("yolo11n-cls.yaml", ch=3, nc=1000)
+        >>> results = model.predict(image_tensor)
+    """
 
     def __init__(self, cfg="yolo11n-cls.yaml", ch=3, nc=None, verbose=True):
         """
@@ -594,10 +710,20 @@ class RTDETRDetectionModel(DetectionModel):
     the training and inference processes. RTDETR is an object detection and tracking model that extends from the
     DetectionModel base class.
 
+    Attributes:
+        nc (int): Number of classes for detection.
+        criterion (RTDETRDetectionLoss): Loss function for training.
+
     Methods:
-        init_criterion: Initializes the criterion used for loss calculation.
-        loss: Computes and returns the loss during training.
-        predict: Performs a forward pass through the network and returns the output.
+        __init__: Initialize the RTDETRDetectionModel.
+        init_criterion: Initialize the loss criterion.
+        loss: Compute loss for training.
+        predict: Perform forward pass through the model.
+
+    Examples:
+        Initialize an RTDETR model
+        >>> model = RTDETRDetectionModel("rtdetr-l.yaml", ch=3, nc=80)
+        >>> results = model.predict(image_tensor)
     """
 
     def __init__(self, cfg="rtdetr-l.yaml", ch=3, nc=None, verbose=True):
@@ -627,7 +753,8 @@ class RTDETRDetectionModel(DetectionModel):
             preds (torch.Tensor, optional): Precomputed model predictions.
 
         Returns:
-            (tuple): A tuple containing the total loss and main three losses in a tensor.
+            loss_sum (torch.Tensor): Total loss value.
+            loss_items (torch.Tensor): Main three losses in a tensor.
         """
         if not hasattr(self, "criterion"):
             self.criterion = self.init_criterion()
@@ -700,7 +827,29 @@ class RTDETRDetectionModel(DetectionModel):
 
 
 class WorldModel(DetectionModel):
-    """YOLOv8 World Model."""
+    """
+    YOLOv8 World Model.
+
+    This class implements the YOLOv8 World model for open-vocabulary object detection, supporting text-based
+    class specification and CLIP model integration for zero-shot detection capabilities.
+
+    Attributes:
+        txt_feats (torch.Tensor): Text feature embeddings for classes.
+        clip_model (torch.nn.Module): CLIP model for text encoding.
+
+    Methods:
+        __init__: Initialize YOLOv8 world model.
+        set_classes: Set classes for offline inference.
+        get_text_pe: Get text positional embeddings.
+        predict: Perform forward pass with text features.
+        loss: Compute loss with text features.
+
+    Examples:
+        Initialize a world model
+        >>> model = WorldModel("yolov8s-world.yaml", ch=3, nc=80)
+        >>> model.set_classes(["person", "car", "bicycle"])
+        >>> results = model.predict(image_tensor)
+    """
 
     def __init__(self, cfg="yolov8s-world.yaml", ch=3, nc=None, verbose=True):
        """
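The open-vocabulary workflow described in the WorldModel docstring (set text classes, then detect) is normally driven through the high-level YOLO wrapper; a minimal sketch, assuming local YOLO World weights and a placeholder image path:

    from ultralytics import YOLO

    # Load YOLO World weights and restrict the vocabulary to a custom class list
    model = YOLO("yolov8s-world.pt")
    model.set_classes(["person", "car", "bicycle"])

    # Zero-shot detection on an image
    results = model.predict("image.jpg")
    results[0].show()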
@@ -815,7 +964,32 @@ class WorldModel(DetectionModel):
 
 
 class YOLOEModel(DetectionModel):
-    """YOLOE detection model."""
+    """
+    YOLOE detection model.
+
+    This class implements the YOLOE architecture for efficient object detection with text and visual prompts,
+    supporting both prompt-based and prompt-free inference modes.
+
+    Attributes:
+        pe (torch.Tensor): Prompt embeddings for classes.
+        clip_model (torch.nn.Module): CLIP model for text encoding.
+
+    Methods:
+        __init__: Initialize YOLOE model.
+        get_text_pe: Get text positional embeddings.
+        get_visual_pe: Get visual embeddings.
+        set_vocab: Set vocabulary for prompt-free model.
+        get_vocab: Get fused vocabulary layer.
+        set_classes: Set classes for offline inference.
+        get_cls_pe: Get class positional embeddings.
+        predict: Perform forward pass with prompts.
+        loss: Compute loss with prompts.
+
+    Examples:
+        Initialize a YOLOE model
+        >>> model = YOLOEModel("yoloe-v8s.yaml", ch=3, nc=80)
+        >>> results = model.predict(image_tensor, tpe=text_embeddings)
+    """
 
     def __init__(self, cfg="yoloe-v8s.yaml", ch=3, nc=None, verbose=True):
         """
@@ -861,7 +1035,7 @@ class YOLOEModel(DetectionModel):
         assert not self.training
         head = self.model[-1]
         assert isinstance(head, YOLOEDetect)
-        return head.get_tpe(txt_feats)  # run axuiliary text head
+        return head.get_tpe(txt_feats)  # run auxiliary text head
 
     @smart_inference_mode()
     def get_visual_pe(self, img, visual):
@@ -1040,7 +1214,21 @@
 
 
 class YOLOESegModel(YOLOEModel, SegmentationModel):
-    """YOLOE segmentation model."""
+    """
+    YOLOE segmentation model.
+
+    This class extends YOLOEModel to handle instance segmentation tasks with text and visual prompts,
+    providing specialized loss computation for pixel-level object detection and segmentation.
+
+    Methods:
+        __init__: Initialize YOLOE segmentation model.
+        loss: Compute loss with prompts for segmentation.
+
+    Examples:
+        Initialize a YOLOE segmentation model
+        >>> model = YOLOESegModel("yoloe-v8s-seg.yaml", ch=3, nc=80)
+        >>> results = model.predict(image_tensor, tpe=text_embeddings)
+    """
 
     def __init__(self, cfg="yoloe-v8s-seg.yaml", ch=3, nc=None, verbose=True):
         """
@@ -1074,7 +1262,23 @@
 
 
 class Ensemble(torch.nn.ModuleList):
-    """Ensemble of models."""
+    """
+    Ensemble of models.
+
+    This class allows combining multiple YOLO models into an ensemble for improved performance through
+    model averaging or other ensemble techniques.
+
+    Methods:
+        __init__: Initialize an ensemble of models.
+        forward: Generate predictions from all models in the ensemble.
+
+    Examples:
+        Create an ensemble of models
+        >>> ensemble = Ensemble()
+        >>> ensemble.append(model1)
+        >>> ensemble.append(model2)
+        >>> results = ensemble(image_tensor)
+    """
 
     def __init__(self):
         """Initialize an ensemble of models."""
@@ -1091,7 +1295,8 @@ class Ensemble(torch.nn.ModuleList):
             visualize (bool): Whether to visualize the features.
 
         Returns:
-            (tuple): Tuple containing the concatenated predictions and None.
+            y (torch.Tensor): Concatenated predictions from all models.
+            train_out (None): Always None for ensemble inference.
         """
         y = [module(x, augment, profile, visualize)[0] for module in self]
         # y = torch.stack(y).max(0)[0]  # max ensemble
@@ -1195,7 +1400,7 @@ class SafeUnpickler(pickle.Unpickler):
 
 def torch_safe_load(weight, safe_only=False):
     """
-    Attempts to load a PyTorch model with the torch.load() function. If a ModuleNotFoundError is raised, it catches the
+    Attempt to load a PyTorch model with the torch.load() function. If a ModuleNotFoundError is raised, it catches the
    error, logs a warning message, and attempts to install the missing module via the check_requirements() function.
    After installation, the function again attempts to load the model using torch.load().
 
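The load-retry behaviour this docstring describes follows a common pattern; a generic sketch of the idea, not the actual torch_safe_load implementation (the pip-install step is only illustrative):

    import subprocess
    import sys

    import torch

    def load_checkpoint_with_retry(path):
        """Try torch.load; if a module named in the pickle is missing, install it and retry once."""
        try:
            return torch.load(path, map_location="cpu")
        except ModuleNotFoundError as e:
            # e.name is the module torch.load failed to import while unpickling
            subprocess.run([sys.executable, "-m", "pip", "install", e.name], check=True)
            return torch.load(path, map_location="cpu")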
@@ -1329,7 +1534,8 @@ def attempt_load_one_weight(weight, device=None, inplace=True, fuse=False):
        fuse (bool): Whether to fuse model.
 
    Returns:
-        (tuple): Tuple containing the model and checkpoint.
+        model (torch.nn.Module): Loaded model.
+        ckpt (dict): Model checkpoint dictionary.
    """
    ckpt, weight = torch_safe_load(weight)  # load ckpt
    args = {**DEFAULT_CFG_DICT, **(ckpt.get("train_args", {}))}  # combine model and default args, preferring model args
@@ -1355,7 +1561,7 @@ def attempt_load_one_weight(weight, device=None, inplace=True, fuse=False):
     return model, ckpt
 
 
-def parse_model(d, ch, verbose=True):  # model_dict, input_channels(3)
+def parse_model(d, ch, verbose=True):
     """
     Parse a YOLO model.yaml dictionary into a PyTorch model.
 
@@ -1365,7 +1571,8 @@ def parse_model(d, ch, verbose=True):
        verbose (bool): Whether to print model details.
 
    Returns:
-        (tuple): Tuple containing the PyTorch model and sorted list of output layers.
+        model (torch.nn.Sequential): PyTorch model.
+        save (list): Sorted list of output layers.
    """
    import ast
 
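For the loader return values documented above, a minimal usage sketch (assuming a local yolo11n.pt checkpoint is available):

    from ultralytics.nn.tasks import attempt_load_one_weight

    # Returns the instantiated model plus the raw checkpoint dict it was built from
    model, ckpt = attempt_load_one_weight("yolo11n.pt", device="cpu")
    print(type(model).__name__)  # e.g. DetectionModel
    print(sorted(ckpt.keys()))   # checkpoint metadata such as train_args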
ultralytics/nn/text_model.py CHANGED
@@ -2,6 +2,7 @@
 
 from abc import abstractmethod
 from pathlib import Path
+from typing import List, Union
 
 import torch
 import torch.nn as nn
@@ -21,11 +22,11 @@ class TextModel(nn.Module):
     Abstract base class for text encoding models.
 
     This class defines the interface for text encoding models used in vision-language tasks. Subclasses must implement
-    the tokenize and encode_text methods.
+    the tokenize and encode_text methods to provide text tokenization and encoding functionality.
 
     Methods:
-        tokenize: Convert input texts to tokens.
-        encode_text: Encode tokenized texts into feature vectors.
+        tokenize: Convert input texts to tokens for model processing.
+        encode_text: Encode tokenized texts into normalized feature vectors.
     """
 
     def __init__(self):
@@ -33,12 +34,12 @@ class TextModel(nn.Module):
         super().__init__()
 
     @abstractmethod
-    def tokenize(texts):
+    def tokenize(self, texts):
         """Convert input texts to tokens for model processing."""
         pass
 
     @abstractmethod
-    def encode_text(texts, dtype):
+    def encode_text(self, texts, dtype):
         """Encode tokenized texts into normalized feature vectors."""
         pass
 
@@ -59,7 +60,6 @@ class CLIP(TextModel):
         encode_text: Encode tokenized texts into normalized feature vectors.
 
     Examples:
-        >>> from ultralytics.models.sam import CLIP
         >>> import torch
         >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         >>> clip_model = CLIP(size="ViT-B/32", device=device)
@@ -68,7 +68,7 @@ class CLIP(TextModel):
         >>> print(text_features.shape)
     """
 
-    def __init__(self, size, device):
+    def __init__(self, size: str, device: torch.device):
         """
         Initialize the CLIP text encoder.
 
@@ -81,7 +81,6 @@ class CLIP(TextModel):
 
         Examples:
             >>> import torch
-            >>> from ultralytics.models.sam.modules.clip import CLIP
            >>> clip_model = CLIP("ViT-B/32", device=torch.device("cuda:0"))
            >>> text_features = clip_model.encode_text(["a photo of a cat", "a photo of a dog"])
        """
@@ -91,7 +90,7 @@ class CLIP(TextModel):
         self.device = device
         self.eval()
 
-    def tokenize(self, texts):
+    def tokenize(self, texts: Union[str, List[str]]):
         """
         Convert input texts to CLIP tokens.
 
@@ -109,7 +108,7 @@ class CLIP(TextModel):
         return clip.tokenize(texts).to(self.device)
 
     @smart_inference_mode()
-    def encode_text(self, texts, dtype=torch.float32):
+    def encode_text(self, texts: torch.Tensor, dtype: torch.dtype = torch.float32):
         """
         Encode tokenized texts into normalized feature vectors.
 
@@ -118,7 +117,7 @@ class CLIP(TextModel):
 
         Args:
             texts (torch.Tensor): Tokenized text inputs, typically created using the tokenize() method.
-            dtype (torch.dtype, optional): Data type for output features. Default is torch.float32.
+            dtype (torch.dtype, optional): Data type for output features.
 
         Returns:
            (torch.Tensor): Normalized text feature vectors with unit length (L2 norm = 1).
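The tokenize/encode_text flow shown in the CLIP docstrings above, collected into one runnable sketch (requires the optional clip dependency; the ViT-B/32 weights download on first use):

    import torch
    from ultralytics.nn.text_model import CLIP

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    clip_model = CLIP(size="ViT-B/32", device=device)

    # Tokenize prompts, then encode them into unit-length feature vectors
    tokens = clip_model.tokenize(["a photo of a cat", "a photo of a dog"])
    text_features = clip_model.encode_text(tokens)
    print(text_features.shape)  # (2, 512) for ViT-B/32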
@@ -140,7 +139,7 @@ class MobileCLIP(TextModel):
     Implement Apple's MobileCLIP text encoder for efficient text encoding.
 
     This class implements the TextModel interface using Apple's MobileCLIP model, providing efficient text encoding
-    capabilities for vision-language tasks.
+    capabilities for vision-language tasks with reduced computational requirements compared to standard CLIP models.
 
     Attributes:
         model (mobileclip.model.MobileCLIP): The loaded MobileCLIP model.
@@ -161,7 +160,7 @@ class MobileCLIP(TextModel):
 
     config_size_map = {"s0": "s0", "s1": "s1", "s2": "s2", "b": "b", "blt": "b"}
 
-    def __init__(self, size, device):
+    def __init__(self, size: str, device: torch.device):
         """
         Initialize the MobileCLIP text encoder.
 
@@ -172,7 +171,6 @@ class MobileCLIP(TextModel):
            device (torch.device): Device to load the model on.
 
        Examples:
-            >>> from ultralytics.nn.modules import MobileCLIP
            >>> import torch
            >>> model = MobileCLIP("s0", device=torch.device("cpu"))
            >>> tokens = model.tokenize(["a photo of a cat", "a photo of a dog"])
@@ -203,12 +201,12 @@ class MobileCLIP(TextModel):
         self.device = device
         self.eval()
 
-    def tokenize(self, texts):
+    def tokenize(self, texts: List[str]):
         """
         Convert input texts to MobileCLIP tokens.
 
         Args:
-            texts (list[str]): List of text strings to tokenize.
+            texts (List[str]): List of text strings to tokenize.
 
         Returns:
             (torch.Tensor): Tokenized text inputs with shape (batch_size, sequence_length).
@@ -220,7 +218,7 @@ class MobileCLIP(TextModel):
         return self.tokenizer(texts).to(self.device)
 
     @smart_inference_mode()
-    def encode_text(self, texts, dtype=torch.float32):
+    def encode_text(self, texts: torch.Tensor, dtype: torch.dtype = torch.float32):
         """
         Encode tokenized texts into normalized feature vectors.
 
@@ -247,11 +245,11 @@ class MobileCLIPTS(TextModel):
     """
     Load a TorchScript traced version of MobileCLIP.
 
-    This class implements the TextModel interface using Apple's MobileCLIP model, providing efficient text encoding
-    capabilities for vision-language tasks.
+    This class implements the TextModel interface using Apple's MobileCLIP model in TorchScript format, providing
+    efficient text encoding capabilities for vision-language tasks with optimized inference performance.
 
     Attributes:
-        encoder (mobileclip.model.MobileCLIP): The loaded MobileCLIP text encoder.
+        encoder (torch.jit.ScriptModule): The loaded TorchScript MobileCLIP text encoder.
         tokenizer (callable): Tokenizer function for processing text inputs.
         device (torch.device): Device where the model is loaded.
 
@@ -261,24 +259,23 @@ class MobileCLIPTS(TextModel):
 
     Examples:
         >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        >>> text_encoder = MobileCLIP(device=device)
+        >>> text_encoder = MobileCLIPTS(device=device)
         >>> tokens = text_encoder.tokenize(["a photo of a cat", "a photo of a dog"])
         >>> features = text_encoder.encode_text(tokens)
     """
 
-    def __init__(self, device):
+    def __init__(self, device: torch.device):
         """
-        Initialize the MobileCLIP text encoder.
+        Initialize the MobileCLIP TorchScript text encoder.
 
-        This class implements the TextModel interface using Apple's MobileCLIP model for efficient text encoding.
+        This class implements the TextModel interface using Apple's MobileCLIP model in TorchScript format for
+        efficient text encoding with optimized inference performance.
 
         Args:
            device (torch.device): Device to load the model on.
 
        Examples:
-            >>> from ultralytics.nn.modules import MobileCLIP
-            >>> import torch
-            >>> model = MobileCLIP(device=torch.device("cpu"))
+            >>> model = MobileCLIPTS(device=torch.device("cpu"))
            >>> tokens = model.tokenize(["a photo of a cat", "a photo of a dog"])
            >>> features = model.encode_text(tokens)
        """
@@ -289,24 +286,24 @@ class MobileCLIPTS(TextModel):
         self.tokenizer = clip.clip.tokenize
         self.device = device
 
-    def tokenize(self, texts):
+    def tokenize(self, texts: List[str]):
         """
         Convert input texts to MobileCLIP tokens.
 
         Args:
-            texts (list[str]): List of text strings to tokenize.
+            texts (List[str]): List of text strings to tokenize.
 
         Returns:
             (torch.Tensor): Tokenized text inputs with shape (batch_size, sequence_length).
 
         Examples:
-            >>> model = MobileCLIP("cpu")
+            >>> model = MobileCLIPTS("cpu")
             >>> tokens = model.tokenize(["a photo of a cat", "a photo of a dog"])
         """
         return self.tokenizer(texts).to(self.device)
 
     @smart_inference_mode()
-    def encode_text(self, texts, dtype=torch.float32):
+    def encode_text(self, texts: torch.Tensor, dtype: torch.dtype = torch.float32):
         """
         Encode tokenized texts into normalized feature vectors.
 
@@ -318,7 +315,7 @@ class MobileCLIPTS(TextModel):
            (torch.Tensor): Normalized text feature vectors with L2 normalization applied.
 
        Examples:
-            >>> model = MobileCLIP(device="cpu")
+            >>> model = MobileCLIPTS(device="cpu")
            >>> tokens = model.tokenize(["a photo of a cat", "a photo of a dog"])
            >>> features = model.encode_text(tokens)
            >>> features.shape
@@ -328,7 +325,7 @@ class MobileCLIPTS(TextModel):
         return self.encoder(texts)
 
 
-def build_text_model(variant, device=None):
+def build_text_model(variant: str, device: torch.device = None):
     """
     Build a text encoding model based on the specified variant.
 
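build_text_model dispatches to one of the encoders above based on the variant string; a hedged sketch of a typical call (the "clip:ViT-B/32" spelling is an assumption about the expected variant format, not confirmed by this diff):

    import torch
    from ultralytics.nn.text_model import build_text_model

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    text_model = build_text_model("clip:ViT-B/32", device=device)  # assumed variant spelling
    feats = text_model.encode_text(text_model.tokenize(["a photo of a cat"]))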
ultralytics/solutions/ai_gym.py CHANGED
@@ -19,7 +19,7 @@ class AIGym(BaseSolution):
         kpts (List[int]): Indices of keypoints used for angle calculation.
 
     Methods:
-        process: Processes a frame to detect poses, calculate angles, and count repetitions.
+        process: Process a frame to detect poses, calculate angles, and count repetitions.
 
     Examples:
         >>> gym = AIGym(model="yolo11n-pose.pt")
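A minimal sketch of the AIGym workflow this docstring refers to, assuming a BGR frame read with OpenCV (the plot_im attribute on the returned results is an assumption based on the solutions API):

    import cv2
    from ultralytics import solutions

    gym = solutions.AIGym(model="yolo11n-pose.pt")

    frame = cv2.imread("frame.jpg")  # placeholder image path
    results = gym.process(frame)     # detect poses, compute joint angles, count reps
    cv2.imwrite("workout.jpg", results.plot_im)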
ultralytics/solutions/analytics.py CHANGED
@@ -1,6 +1,7 @@
 # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
 
 from itertools import cycle
+from typing import Dict, Optional
 
 import cv2
 import numpy as np
@@ -86,7 +87,7 @@ class Analytics(BaseSolution):
         if self.type == "pie":  # Ensure pie chart is circular
             self.ax.axis("equal")
 
-    def process(self, im0, frame_number):
+    def process(self, im0: np.ndarray, frame_number: int) -> SolutionResults:
         """
         Process image data and run object tracking to update analytics charts.
 
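Given that process now returns a SolutionResults carrying the rendered chart, a per-frame loop might look like this sketch (video path is a placeholder; constructor keyword names follow the public solutions API):

    import cv2
    from ultralytics import solutions

    cap = cv2.VideoCapture("traffic.mp4")  # placeholder video
    analytics = solutions.Analytics(analytics_type="line", model="yolo11n.pt", show=False)

    frame_idx = 0
    while cap.isOpened():
        ok, frame = cap.read()
        if not ok:
            break
        frame_idx += 1
        results = analytics.process(frame, frame_idx)  # update chart, return SolutionResults
        chart = results.plot_im                        # rendered analytics chart for this frame
    cap.release()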
@@ -126,14 +127,16 @@ class Analytics(BaseSolution):
         # return output dictionary with summary for more usage
         return SolutionResults(plot_im=plot_im, total_tracks=len(self.track_ids), classwise_count=self.clswise_count)
 
-    def update_graph(self, frame_number, count_dict=None, plot="line"):
+    def update_graph(
+        self, frame_number: int, count_dict: Optional[Dict[str, int]] = None, plot: str = "line"
+    ) -> np.ndarray:
         """
         Update the graph with new data for single or multiple classes.
 
         Args:
             frame_number (int): The current frame number.
-            count_dict (Dict[str, int] | None): Dictionary with class names as keys and counts as values for multiple
-                classes. If None, updates a single line graph.
+            count_dict (Dict[str, int], optional): Dictionary with class names as keys and counts as values for
+                multiple classes. If None, updates a single line graph.
             plot (str): Type of the plot. Options are 'line', 'bar', 'pie', or 'area'.
 
         Returns: