ultralytics 8.3.142__py3-none-any.whl → 8.3.144__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/conftest.py +7 -24
- tests/test_cli.py +1 -1
- tests/test_cuda.py +7 -2
- tests/test_engine.py +7 -8
- tests/test_exports.py +16 -16
- tests/test_integrations.py +1 -1
- tests/test_solutions.py +12 -12
- ultralytics/__init__.py +1 -1
- ultralytics/cfg/__init__.py +16 -13
- ultralytics/data/annotator.py +6 -5
- ultralytics/data/augment.py +127 -126
- ultralytics/data/base.py +54 -51
- ultralytics/data/build.py +47 -23
- ultralytics/data/converter.py +47 -43
- ultralytics/data/dataset.py +51 -50
- ultralytics/data/loaders.py +77 -44
- ultralytics/data/split.py +22 -9
- ultralytics/data/split_dota.py +63 -39
- ultralytics/data/utils.py +59 -39
- ultralytics/engine/exporter.py +79 -27
- ultralytics/engine/model.py +39 -39
- ultralytics/engine/predictor.py +37 -28
- ultralytics/engine/results.py +187 -157
- ultralytics/engine/trainer.py +36 -19
- ultralytics/engine/tuner.py +12 -9
- ultralytics/engine/validator.py +7 -9
- ultralytics/hub/__init__.py +11 -13
- ultralytics/hub/auth.py +22 -2
- ultralytics/hub/google/__init__.py +19 -19
- ultralytics/hub/session.py +37 -51
- ultralytics/hub/utils.py +19 -5
- ultralytics/models/fastsam/model.py +30 -12
- ultralytics/models/fastsam/predict.py +5 -6
- ultralytics/models/fastsam/utils.py +3 -3
- ultralytics/models/fastsam/val.py +10 -6
- ultralytics/models/nas/model.py +9 -5
- ultralytics/models/nas/predict.py +6 -6
- ultralytics/models/nas/val.py +3 -3
- ultralytics/models/rtdetr/model.py +7 -6
- ultralytics/models/rtdetr/predict.py +14 -7
- ultralytics/models/rtdetr/train.py +10 -4
- ultralytics/models/rtdetr/val.py +36 -9
- ultralytics/models/sam/amg.py +30 -12
- ultralytics/models/sam/build.py +22 -22
- ultralytics/models/sam/model.py +10 -9
- ultralytics/models/sam/modules/blocks.py +76 -80
- ultralytics/models/sam/modules/decoders.py +6 -8
- ultralytics/models/sam/modules/encoders.py +23 -26
- ultralytics/models/sam/modules/memory_attention.py +13 -1
- ultralytics/models/sam/modules/sam.py +57 -26
- ultralytics/models/sam/modules/tiny_encoder.py +232 -237
- ultralytics/models/sam/modules/transformer.py +13 -13
- ultralytics/models/sam/modules/utils.py +11 -19
- ultralytics/models/sam/predict.py +114 -101
- ultralytics/models/utils/loss.py +98 -77
- ultralytics/models/utils/ops.py +116 -67
- ultralytics/models/yolo/classify/predict.py +5 -5
- ultralytics/models/yolo/classify/train.py +32 -28
- ultralytics/models/yolo/classify/val.py +7 -8
- ultralytics/models/yolo/detect/predict.py +1 -0
- ultralytics/models/yolo/detect/train.py +15 -14
- ultralytics/models/yolo/detect/val.py +37 -36
- ultralytics/models/yolo/model.py +106 -23
- ultralytics/models/yolo/obb/predict.py +3 -4
- ultralytics/models/yolo/obb/train.py +14 -6
- ultralytics/models/yolo/obb/val.py +29 -23
- ultralytics/models/yolo/pose/predict.py +9 -8
- ultralytics/models/yolo/pose/train.py +24 -16
- ultralytics/models/yolo/pose/val.py +44 -26
- ultralytics/models/yolo/segment/predict.py +5 -5
- ultralytics/models/yolo/segment/train.py +11 -7
- ultralytics/models/yolo/segment/val.py +2 -2
- ultralytics/models/yolo/world/train.py +33 -23
- ultralytics/models/yolo/world/train_world.py +11 -3
- ultralytics/models/yolo/yoloe/predict.py +11 -11
- ultralytics/models/yolo/yoloe/train.py +73 -21
- ultralytics/models/yolo/yoloe/train_seg.py +10 -7
- ultralytics/models/yolo/yoloe/val.py +42 -18
- ultralytics/nn/autobackend.py +59 -15
- ultralytics/nn/modules/__init__.py +4 -4
- ultralytics/nn/modules/activation.py +4 -1
- ultralytics/nn/modules/block.py +178 -111
- ultralytics/nn/modules/conv.py +6 -5
- ultralytics/nn/modules/head.py +469 -121
- ultralytics/nn/modules/transformer.py +147 -58
- ultralytics/nn/tasks.py +227 -20
- ultralytics/nn/text_model.py +30 -33
- ultralytics/solutions/ai_gym.py +1 -1
- ultralytics/solutions/analytics.py +7 -4
- ultralytics/solutions/config.py +10 -10
- ultralytics/solutions/distance_calculation.py +11 -10
- ultralytics/solutions/heatmap.py +1 -1
- ultralytics/solutions/instance_segmentation.py +6 -3
- ultralytics/solutions/object_blurrer.py +3 -3
- ultralytics/solutions/object_counter.py +16 -8
- ultralytics/solutions/object_cropper.py +12 -5
- ultralytics/solutions/parking_management.py +29 -28
- ultralytics/solutions/queue_management.py +6 -6
- ultralytics/solutions/region_counter.py +10 -3
- ultralytics/solutions/security_alarm.py +3 -3
- ultralytics/solutions/similarity_search.py +85 -24
- ultralytics/solutions/solutions.py +215 -85
- ultralytics/solutions/speed_estimation.py +28 -22
- ultralytics/solutions/streamlit_inference.py +17 -12
- ultralytics/solutions/trackzone.py +4 -4
- ultralytics/trackers/basetrack.py +16 -23
- ultralytics/trackers/bot_sort.py +30 -20
- ultralytics/trackers/byte_tracker.py +70 -64
- ultralytics/trackers/track.py +4 -8
- ultralytics/trackers/utils/gmc.py +31 -58
- ultralytics/trackers/utils/kalman_filter.py +37 -37
- ultralytics/trackers/utils/matching.py +1 -1
- ultralytics/utils/__init__.py +105 -89
- ultralytics/utils/autobatch.py +16 -3
- ultralytics/utils/autodevice.py +54 -24
- ultralytics/utils/benchmarks.py +42 -28
- ultralytics/utils/callbacks/base.py +3 -3
- ultralytics/utils/callbacks/clearml.py +9 -9
- ultralytics/utils/callbacks/comet.py +67 -25
- ultralytics/utils/callbacks/dvc.py +7 -10
- ultralytics/utils/callbacks/mlflow.py +2 -5
- ultralytics/utils/callbacks/neptune.py +7 -13
- ultralytics/utils/callbacks/raytune.py +1 -1
- ultralytics/utils/callbacks/tensorboard.py +5 -6
- ultralytics/utils/callbacks/wb.py +14 -14
- ultralytics/utils/checks.py +14 -13
- ultralytics/utils/dist.py +5 -5
- ultralytics/utils/downloads.py +94 -67
- ultralytics/utils/errors.py +5 -5
- ultralytics/utils/export.py +61 -47
- ultralytics/utils/files.py +23 -22
- ultralytics/utils/instance.py +48 -52
- ultralytics/utils/loss.py +78 -40
- ultralytics/utils/metrics.py +186 -130
- ultralytics/utils/ops.py +186 -190
- ultralytics/utils/patches.py +15 -17
- ultralytics/utils/plotting.py +71 -27
- ultralytics/utils/tal.py +21 -15
- ultralytics/utils/torch_utils.py +53 -50
- ultralytics/utils/triton.py +5 -4
- ultralytics/utils/tuner.py +5 -5
- {ultralytics-8.3.142.dist-info → ultralytics-8.3.144.dist-info}/METADATA +1 -1
- ultralytics-8.3.144.dist-info/RECORD +272 -0
- ultralytics-8.3.142.dist-info/RECORD +0 -272
- {ultralytics-8.3.142.dist-info → ultralytics-8.3.144.dist-info}/WHEEL +0 -0
- {ultralytics-8.3.142.dist-info → ultralytics-8.3.144.dist-info}/entry_points.txt +0 -0
- {ultralytics-8.3.142.dist-info → ultralytics-8.3.144.dist-info}/licenses/LICENSE +0 -0
- {ultralytics-8.3.142.dist-info → ultralytics-8.3.144.dist-info}/top_level.txt +0 -0
ultralytics/nn/tasks.py
CHANGED
@@ -94,7 +94,30 @@ from ultralytics.utils.torch_utils import (
 
 
 class BaseModel(torch.nn.Module):
-    """
+    """
+    Base class for all YOLO models in the Ultralytics family.
+
+    This class provides common functionality for YOLO models including forward pass handling, model fusion,
+    information display, and weight loading capabilities.
+
+    Attributes:
+        model (torch.nn.Module): The neural network model.
+        save (list): List of layer indices to save outputs from.
+        stride (torch.Tensor): Model stride values.
+
+    Methods:
+        forward: Perform forward pass for training or inference.
+        predict: Perform inference on input tensor.
+        fuse: Fuse Conv2d and BatchNorm2d layers for optimization.
+        info: Print model information.
+        load: Load weights into the model.
+        loss: Compute loss for training.
+
+    Examples:
+        Create a BaseModel instance
+        >>> model = BaseModel()
+        >>> model.info()  # Display model information
+    """
 
     def forward(self, x, *args, **kwargs):
         """
@@ -319,7 +342,33 @@ class BaseModel(torch.nn.Module):
 
 
 class DetectionModel(BaseModel):
-    """
+    """
+    YOLO detection model.
+
+    This class implements the YOLO detection architecture, handling model initialization, forward pass,
+    augmented inference, and loss computation for object detection tasks.
+
+    Attributes:
+        yaml (dict): Model configuration dictionary.
+        model (torch.nn.Sequential): The neural network model.
+        save (list): List of layer indices to save outputs from.
+        names (dict): Class names dictionary.
+        inplace (bool): Whether to use inplace operations.
+        end2end (bool): Whether the model uses end-to-end detection.
+        stride (torch.Tensor): Model stride values.
+
+    Methods:
+        __init__: Initialize the YOLO detection model.
+        _predict_augment: Perform augmented inference.
+        _descale_pred: De-scale predictions following augmented inference.
+        _clip_augmented: Clip YOLO augmented inference tails.
+        init_criterion: Initialize the loss criterion.
+
+    Examples:
+        Initialize a detection model
+        >>> model = DetectionModel("yolo11n.yaml", ch=3, nc=80)
+        >>> results = model.predict(image_tensor)
+    """
 
     def __init__(self, cfg="yolo11n.yaml", ch=3, nc=None, verbose=True):
         """
@@ -447,7 +496,21 @@ class DetectionModel(BaseModel):
 
 
 class OBBModel(DetectionModel):
-    """
+    """
+    YOLO Oriented Bounding Box (OBB) model.
+
+    This class extends DetectionModel to handle oriented bounding box detection tasks, providing specialized
+    loss computation for rotated object detection.
+
+    Methods:
+        __init__: Initialize YOLO OBB model.
+        init_criterion: Initialize the loss criterion for OBB detection.
+
+    Examples:
+        Initialize an OBB model
+        >>> model = OBBModel("yolo11n-obb.yaml", ch=3, nc=80)
+        >>> results = model.predict(image_tensor)
+    """
 
     def __init__(self, cfg="yolo11n-obb.yaml", ch=3, nc=None, verbose=True):
         """
@@ -467,7 +530,21 @@ class OBBModel(DetectionModel):
 
 
 class SegmentationModel(DetectionModel):
-    """
+    """
+    YOLO segmentation model.
+
+    This class extends DetectionModel to handle instance segmentation tasks, providing specialized
+    loss computation for pixel-level object detection and segmentation.
+
+    Methods:
+        __init__: Initialize YOLO segmentation model.
+        init_criterion: Initialize the loss criterion for segmentation.
+
+    Examples:
+        Initialize a segmentation model
+        >>> model = SegmentationModel("yolo11n-seg.yaml", ch=3, nc=80)
+        >>> results = model.predict(image_tensor)
+    """
 
     def __init__(self, cfg="yolo11n-seg.yaml", ch=3, nc=None, verbose=True):
         """
@@ -487,7 +564,24 @@ class SegmentationModel(DetectionModel):
 
 
 class PoseModel(DetectionModel):
-    """
+    """
+    YOLO pose model.
+
+    This class extends DetectionModel to handle human pose estimation tasks, providing specialized
+    loss computation for keypoint detection and pose estimation.
+
+    Attributes:
+        kpt_shape (tuple): Shape of keypoints data (num_keypoints, num_dimensions).
+
+    Methods:
+        __init__: Initialize YOLO pose model.
+        init_criterion: Initialize the loss criterion for pose estimation.
+
+    Examples:
+        Initialize a pose model
+        >>> model = PoseModel("yolo11n-pose.yaml", ch=3, nc=1, data_kpt_shape=(17, 3))
+        >>> results = model.predict(image_tensor)
+    """
 
     def __init__(self, cfg="yolo11n-pose.yaml", ch=3, nc=None, data_kpt_shape=(None, None), verbose=True):
         """
@@ -513,7 +607,29 @@ class PoseModel(DetectionModel):
 
 
 class ClassificationModel(BaseModel):
-    """
+    """
+    YOLO classification model.
+
+    This class implements the YOLO classification architecture for image classification tasks,
+    providing model initialization, configuration, and output reshaping capabilities.
+
+    Attributes:
+        yaml (dict): Model configuration dictionary.
+        model (torch.nn.Sequential): The neural network model.
+        stride (torch.Tensor): Model stride values.
+        names (dict): Class names dictionary.
+
+    Methods:
+        __init__: Initialize ClassificationModel.
+        _from_yaml: Set model configurations and define architecture.
+        reshape_outputs: Update model to specified class count.
+        init_criterion: Initialize the loss criterion.
+
+    Examples:
+        Initialize a classification model
+        >>> model = ClassificationModel("yolo11n-cls.yaml", ch=3, nc=1000)
+        >>> results = model.predict(image_tensor)
+    """
 
     def __init__(self, cfg="yolo11n-cls.yaml", ch=3, nc=None, verbose=True):
         """
@@ -594,10 +710,20 @@ class RTDETRDetectionModel(DetectionModel):
     the training and inference processes. RTDETR is an object detection and tracking model that extends from the
     DetectionModel base class.
 
+    Attributes:
+        nc (int): Number of classes for detection.
+        criterion (RTDETRDetectionLoss): Loss function for training.
+
     Methods:
-
-
-
+        __init__: Initialize the RTDETRDetectionModel.
+        init_criterion: Initialize the loss criterion.
+        loss: Compute loss for training.
+        predict: Perform forward pass through the model.
+
+    Examples:
+        Initialize an RTDETR model
+        >>> model = RTDETRDetectionModel("rtdetr-l.yaml", ch=3, nc=80)
+        >>> results = model.predict(image_tensor)
     """
 
     def __init__(self, cfg="rtdetr-l.yaml", ch=3, nc=None, verbose=True):
@@ -627,7 +753,8 @@ class RTDETRDetectionModel(DetectionModel):
             preds (torch.Tensor, optional): Precomputed model predictions.
 
         Returns:
-            (
+            loss_sum (torch.Tensor): Total loss value.
+            loss_items (torch.Tensor): Main three losses in a tensor.
         """
         if not hasattr(self, "criterion"):
             self.criterion = self.init_criterion()
@@ -700,7 +827,29 @@ class RTDETRDetectionModel(DetectionModel):
 
 
 class WorldModel(DetectionModel):
-    """
+    """
+    YOLOv8 World Model.
+
+    This class implements the YOLOv8 World model for open-vocabulary object detection, supporting text-based
+    class specification and CLIP model integration for zero-shot detection capabilities.
+
+    Attributes:
+        txt_feats (torch.Tensor): Text feature embeddings for classes.
+        clip_model (torch.nn.Module): CLIP model for text encoding.
+
+    Methods:
+        __init__: Initialize YOLOv8 world model.
+        set_classes: Set classes for offline inference.
+        get_text_pe: Get text positional embeddings.
+        predict: Perform forward pass with text features.
+        loss: Compute loss with text features.
+
+    Examples:
+        Initialize a world model
+        >>> model = WorldModel("yolov8s-world.yaml", ch=3, nc=80)
+        >>> model.set_classes(["person", "car", "bicycle"])
+        >>> results = model.predict(image_tensor)
+    """
 
     def __init__(self, cfg="yolov8s-world.yaml", ch=3, nc=None, verbose=True):
         """
@@ -815,7 +964,32 @@ class WorldModel(DetectionModel):
 
 
 class YOLOEModel(DetectionModel):
-    """
+    """
+    YOLOE detection model.
+
+    This class implements the YOLOE architecture for efficient object detection with text and visual prompts,
+    supporting both prompt-based and prompt-free inference modes.
+
+    Attributes:
+        pe (torch.Tensor): Prompt embeddings for classes.
+        clip_model (torch.nn.Module): CLIP model for text encoding.
+
+    Methods:
+        __init__: Initialize YOLOE model.
+        get_text_pe: Get text positional embeddings.
+        get_visual_pe: Get visual embeddings.
+        set_vocab: Set vocabulary for prompt-free model.
+        get_vocab: Get fused vocabulary layer.
+        set_classes: Set classes for offline inference.
+        get_cls_pe: Get class positional embeddings.
+        predict: Perform forward pass with prompts.
+        loss: Compute loss with prompts.
+
+    Examples:
+        Initialize a YOLOE model
+        >>> model = YOLOEModel("yoloe-v8s.yaml", ch=3, nc=80)
+        >>> results = model.predict(image_tensor, tpe=text_embeddings)
+    """
 
     def __init__(self, cfg="yoloe-v8s.yaml", ch=3, nc=None, verbose=True):
         """
@@ -861,7 +1035,7 @@ class YOLOEModel(DetectionModel):
         assert not self.training
         head = self.model[-1]
         assert isinstance(head, YOLOEDetect)
-        return head.get_tpe(txt_feats)  # run
+        return head.get_tpe(txt_feats)  # run auxiliary text head
 
     @smart_inference_mode()
     def get_visual_pe(self, img, visual):
@@ -1040,7 +1214,21 @@ class YOLOEModel(DetectionModel):
 
 
 class YOLOESegModel(YOLOEModel, SegmentationModel):
-    """
+    """
+    YOLOE segmentation model.
+
+    This class extends YOLOEModel to handle instance segmentation tasks with text and visual prompts,
+    providing specialized loss computation for pixel-level object detection and segmentation.
+
+    Methods:
+        __init__: Initialize YOLOE segmentation model.
+        loss: Compute loss with prompts for segmentation.
+
+    Examples:
+        Initialize a YOLOE segmentation model
+        >>> model = YOLOESegModel("yoloe-v8s-seg.yaml", ch=3, nc=80)
+        >>> results = model.predict(image_tensor, tpe=text_embeddings)
+    """
 
     def __init__(self, cfg="yoloe-v8s-seg.yaml", ch=3, nc=None, verbose=True):
         """
@@ -1074,7 +1262,23 @@ class YOLOESegModel(YOLOEModel, SegmentationModel):
 
 
 class Ensemble(torch.nn.ModuleList):
-    """
+    """
+    Ensemble of models.
+
+    This class allows combining multiple YOLO models into an ensemble for improved performance through
+    model averaging or other ensemble techniques.
+
+    Methods:
+        __init__: Initialize an ensemble of models.
+        forward: Generate predictions from all models in the ensemble.
+
+    Examples:
+        Create an ensemble of models
+        >>> ensemble = Ensemble()
+        >>> ensemble.append(model1)
+        >>> ensemble.append(model2)
+        >>> results = ensemble(image_tensor)
+    """
 
     def __init__(self):
         """Initialize an ensemble of models."""
@@ -1091,7 +1295,8 @@ class Ensemble(torch.nn.ModuleList):
             visualize (bool): Whether to visualize the features.
 
         Returns:
-            (
+            y (torch.Tensor): Concatenated predictions from all models.
+            train_out (None): Always None for ensemble inference.
         """
         y = [module(x, augment, profile, visualize)[0] for module in self]
         # y = torch.stack(y).max(0)[0]  # max ensemble
@@ -1195,7 +1400,7 @@ class SafeUnpickler(pickle.Unpickler):
 
 def torch_safe_load(weight, safe_only=False):
     """
-
+    Attempt to load a PyTorch model with the torch.load() function. If a ModuleNotFoundError is raised, it catches the
     error, logs a warning message, and attempts to install the missing module via the check_requirements() function.
     After installation, the function again attempts to load the model using torch.load().
 
@@ -1329,7 +1534,8 @@ def attempt_load_one_weight(weight, device=None, inplace=True, fuse=False):
        fuse (bool): Whether to fuse model.
 
    Returns:
-        (
+        model (torch.nn.Module): Loaded model.
+        ckpt (dict): Model checkpoint dictionary.
    """
    ckpt, weight = torch_safe_load(weight)  # load ckpt
    args = {**DEFAULT_CFG_DICT, **(ckpt.get("train_args", {}))}  # combine model and default args, preferring model args
@@ -1355,7 +1561,7 @@ def attempt_load_one_weight(weight, device=None, inplace=True, fuse=False):
    return model, ckpt
 
 
-def parse_model(d, ch, verbose=True):  # model_dict, input_channels(3)
+def parse_model(d, ch, verbose=True):
    """
    Parse a YOLO model.yaml dictionary into a PyTorch model.
 
@@ -1365,7 +1571,8 @@ def parse_model(d, ch, verbose=True): # model_dict, input_channels(3)
        verbose (bool): Whether to print model details.
 
    Returns:
-        (
+        model (torch.nn.Sequential): PyTorch model.
+        save (list): Sorted list of output layers.
    """
    import ast
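The tasks.py hunks above are almost entirely documentation: new class-level docstrings and sharper Returns sections (for example, attempt_load_one_weight now documents a (model, ckpt) pair and parse_model a (model, save) pair). As a reading aid only, here is a minimal sketch of the calls those docstrings describe; it is not part of the diff and assumes a standard ultralytics install where the yolo11n.yaml config resolves and the yolo11n.pt weights can be downloaded.

import torch
from ultralytics.nn.tasks import DetectionModel, attempt_load_one_weight

# Build a detection model from YAML, as in the new DetectionModel docstring example.
model = DetectionModel("yolo11n.yaml", ch=3, nc=80)
preds = model.predict(torch.zeros(1, 3, 640, 640))  # dummy image tensor; shape is illustrative only

# attempt_load_one_weight is documented above as returning (model, ckpt).
loaded_model, ckpt = attempt_load_one_weight("yolo11n.pt")  # may download weights on first use
print(type(loaded_model).__name__, sorted(ckpt)[:3])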
ultralytics/nn/text_model.py
CHANGED
@@ -2,6 +2,7 @@
 
 from abc import abstractmethod
 from pathlib import Path
+from typing import List, Union
 
 import torch
 import torch.nn as nn
@@ -21,11 +22,11 @@ class TextModel(nn.Module):
     Abstract base class for text encoding models.
 
     This class defines the interface for text encoding models used in vision-language tasks. Subclasses must implement
-    the tokenize and encode_text methods.
+    the tokenize and encode_text methods to provide text tokenization and encoding functionality.
 
     Methods:
-        tokenize: Convert input texts to tokens.
-        encode_text: Encode tokenized texts into feature vectors.
+        tokenize: Convert input texts to tokens for model processing.
+        encode_text: Encode tokenized texts into normalized feature vectors.
     """
 
     def __init__(self):
@@ -33,12 +34,12 @@ class TextModel(nn.Module):
         super().__init__()
 
     @abstractmethod
-    def tokenize(texts):
+    def tokenize(self, texts):
         """Convert input texts to tokens for model processing."""
         pass
 
     @abstractmethod
-    def encode_text(texts, dtype):
+    def encode_text(self, texts, dtype):
         """Encode tokenized texts into normalized feature vectors."""
         pass
 
@@ -59,7 +60,6 @@ class CLIP(TextModel):
         encode_text: Encode tokenized texts into normalized feature vectors.
 
     Examples:
-        >>> from ultralytics.models.sam import CLIP
         >>> import torch
         >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         >>> clip_model = CLIP(size="ViT-B/32", device=device)
@@ -68,7 +68,7 @@ class CLIP(TextModel):
         >>> print(text_features.shape)
     """
 
-    def __init__(self, size, device):
+    def __init__(self, size: str, device: torch.device):
         """
         Initialize the CLIP text encoder.
 
@@ -81,7 +81,6 @@ class CLIP(TextModel):
 
         Examples:
             >>> import torch
-            >>> from ultralytics.models.sam.modules.clip import CLIP
             >>> clip_model = CLIP("ViT-B/32", device=torch.device("cuda:0"))
             >>> text_features = clip_model.encode_text(["a photo of a cat", "a photo of a dog"])
         """
@@ -91,7 +90,7 @@ class CLIP(TextModel):
         self.device = device
         self.eval()
 
-    def tokenize(self, texts):
+    def tokenize(self, texts: Union[str, List[str]]):
         """
         Convert input texts to CLIP tokens.
 
@@ -109,7 +108,7 @@ class CLIP(TextModel):
         return clip.tokenize(texts).to(self.device)
 
     @smart_inference_mode()
-    def encode_text(self, texts, dtype=torch.float32):
+    def encode_text(self, texts: torch.Tensor, dtype: torch.dtype = torch.float32):
         """
         Encode tokenized texts into normalized feature vectors.
 
@@ -118,7 +117,7 @@ class CLIP(TextModel):
 
         Args:
             texts (torch.Tensor): Tokenized text inputs, typically created using the tokenize() method.
-            dtype (torch.dtype, optional): Data type for output features.
+            dtype (torch.dtype, optional): Data type for output features.
 
         Returns:
             (torch.Tensor): Normalized text feature vectors with unit length (L2 norm = 1).
@@ -140,7 +139,7 @@ class MobileCLIP(TextModel):
     Implement Apple's MobileCLIP text encoder for efficient text encoding.
 
     This class implements the TextModel interface using Apple's MobileCLIP model, providing efficient text encoding
-    capabilities for vision-language tasks.
+    capabilities for vision-language tasks with reduced computational requirements compared to standard CLIP models.
 
     Attributes:
         model (mobileclip.model.MobileCLIP): The loaded MobileCLIP model.
@@ -161,7 +160,7 @@ class MobileCLIP(TextModel):
 
     config_size_map = {"s0": "s0", "s1": "s1", "s2": "s2", "b": "b", "blt": "b"}
 
-    def __init__(self, size, device):
+    def __init__(self, size: str, device: torch.device):
         """
         Initialize the MobileCLIP text encoder.
 
@@ -172,7 +171,6 @@ class MobileCLIP(TextModel):
             device (torch.device): Device to load the model on.
 
         Examples:
-            >>> from ultralytics.nn.modules import MobileCLIP
             >>> import torch
             >>> model = MobileCLIP("s0", device=torch.device("cpu"))
             >>> tokens = model.tokenize(["a photo of a cat", "a photo of a dog"])
@@ -203,12 +201,12 @@ class MobileCLIP(TextModel):
         self.device = device
         self.eval()
 
-    def tokenize(self, texts):
+    def tokenize(self, texts: List[str]):
         """
         Convert input texts to MobileCLIP tokens.
 
         Args:
-            texts (
+            texts (List[str]): List of text strings to tokenize.
 
         Returns:
             (torch.Tensor): Tokenized text inputs with shape (batch_size, sequence_length).
@@ -220,7 +218,7 @@ class MobileCLIP(TextModel):
         return self.tokenizer(texts).to(self.device)
 
     @smart_inference_mode()
-    def encode_text(self, texts, dtype=torch.float32):
+    def encode_text(self, texts: torch.Tensor, dtype: torch.dtype = torch.float32):
         """
         Encode tokenized texts into normalized feature vectors.
 
@@ -247,11 +245,11 @@ class MobileCLIPTS(TextModel):
     """
     Load a TorchScript traced version of MobileCLIP.
 
-    This class implements the TextModel interface using Apple's MobileCLIP model
-    capabilities for vision-language tasks.
+    This class implements the TextModel interface using Apple's MobileCLIP model in TorchScript format, providing
+    efficient text encoding capabilities for vision-language tasks with optimized inference performance.
 
     Attributes:
-        encoder (
+        encoder (torch.jit.ScriptModule): The loaded TorchScript MobileCLIP text encoder.
         tokenizer (callable): Tokenizer function for processing text inputs.
         device (torch.device): Device where the model is loaded.
 
@@ -261,24 +259,23 @@ class MobileCLIPTS(TextModel):
 
     Examples:
         >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        >>> text_encoder =
+        >>> text_encoder = MobileCLIPTS(device=device)
         >>> tokens = text_encoder.tokenize(["a photo of a cat", "a photo of a dog"])
         >>> features = text_encoder.encode_text(tokens)
     """
 
-    def __init__(self, device):
+    def __init__(self, device: torch.device):
         """
-        Initialize the MobileCLIP text encoder.
+        Initialize the MobileCLIP TorchScript text encoder.
 
-        This class implements the TextModel interface using Apple's MobileCLIP model
+        This class implements the TextModel interface using Apple's MobileCLIP model in TorchScript format for
+        efficient text encoding with optimized inference performance.
 
         Args:
             device (torch.device): Device to load the model on.
 
         Examples:
-            >>>
-            >>> import torch
-            >>> model = MobileCLIP(device=torch.device("cpu"))
+            >>> model = MobileCLIPTS(device=torch.device("cpu"))
             >>> tokens = model.tokenize(["a photo of a cat", "a photo of a dog"])
             >>> features = model.encode_text(tokens)
         """
@@ -289,24 +286,24 @@ class MobileCLIPTS(TextModel):
         self.tokenizer = clip.clip.tokenize
         self.device = device
 
-    def tokenize(self, texts):
+    def tokenize(self, texts: List[str]):
         """
         Convert input texts to MobileCLIP tokens.
 
         Args:
-            texts (
+            texts (List[str]): List of text strings to tokenize.
 
         Returns:
             (torch.Tensor): Tokenized text inputs with shape (batch_size, sequence_length).
 
         Examples:
-            >>> model =
+            >>> model = MobileCLIPTS("cpu")
             >>> tokens = model.tokenize(["a photo of a cat", "a photo of a dog"])
         """
         return self.tokenizer(texts).to(self.device)
 
     @smart_inference_mode()
-    def encode_text(self, texts, dtype=torch.float32):
+    def encode_text(self, texts: torch.Tensor, dtype: torch.dtype = torch.float32):
         """
         Encode tokenized texts into normalized feature vectors.
 
@@ -318,7 +315,7 @@ class MobileCLIPTS(TextModel):
             (torch.Tensor): Normalized text feature vectors with L2 normalization applied.
 
         Examples:
-            >>> model =
+            >>> model = MobileCLIPTS(device="cpu")
             >>> tokens = model.tokenize(["a photo of a cat", "a photo of a dog"])
             >>> features = model.encode_text(tokens)
             >>> features.shape
@@ -328,7 +325,7 @@ class MobileCLIPTS(TextModel):
         return self.encoder(texts)
 
 
-def build_text_model(variant, device=None):
+def build_text_model(variant: str, device: torch.device = None):
     """
     Build a text encoding model based on the specified variant.
 
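The text_model.py changes add type hints and tidy the CLIP/MobileCLIP docstrings without altering behavior. For orientation, a minimal sketch mirroring the CLIP docstring example shown in the diff above; it is not part of the diff and assumes the optional OpenAI CLIP dependency is installed and the ViT-B/32 weights can be downloaded on first use.

import torch
from ultralytics.nn.text_model import CLIP

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
clip_model = CLIP(size="ViT-B/32", device=device)  # signature now annotated as (size: str, device: torch.device)
tokens = clip_model.tokenize(["a photo of a cat", "a photo of a dog"])
text_features = clip_model.encode_text(tokens)  # unit-L2-normalized feature vectors
print(text_features.shape)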
ultralytics/solutions/ai_gym.py
CHANGED
@@ -19,7 +19,7 @@ class AIGym(BaseSolution):
         kpts (List[int]): Indices of keypoints used for angle calculation.
 
     Methods:
-        process:
+        process: Process a frame to detect poses, calculate angles, and count repetitions.
 
     Examples:
         >>> gym = AIGym(model="yolo11n-pose.pt")
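The ai_gym.py change only completes the Methods entry for process. A minimal usage sketch of that workflow, not taken from the diff; the video path "workout.mp4" is a hypothetical placeholder and the yolo11n-pose.pt weights are assumed downloadable.

import cv2
from ultralytics.solutions import AIGym

gym = AIGym(model="yolo11n-pose.pt")  # as in the Examples block above
cap = cv2.VideoCapture("workout.mp4")  # hypothetical input video
while cap.isOpened():
    ok, frame = cap.read()
    if not ok:
        break
    results = gym.process(frame)  # detect poses, compute joint angles, count repetitions
cap.release()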
ultralytics/solutions/analytics.py
CHANGED
@@ -1,6 +1,7 @@
 # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
 
 from itertools import cycle
+from typing import Dict, Optional
 
 import cv2
 import numpy as np
@@ -86,7 +87,7 @@ class Analytics(BaseSolution):
         if self.type == "pie":  # Ensure pie chart is circular
             self.ax.axis("equal")
 
-    def process(self, im0, frame_number):
+    def process(self, im0: np.ndarray, frame_number: int) -> SolutionResults:
         """
         Process image data and run object tracking to update analytics charts.
 
@@ -126,14 +127,16 @@ class Analytics(BaseSolution):
         # return output dictionary with summary for more usage
         return SolutionResults(plot_im=plot_im, total_tracks=len(self.track_ids), classwise_count=self.clswise_count)
 
-    def update_graph(
+    def update_graph(
+        self, frame_number: int, count_dict: Optional[Dict[str, int]] = None, plot: str = "line"
+    ) -> np.ndarray:
         """
         Update the graph with new data for single or multiple classes.
 
         Args:
             frame_number (int): The current frame number.
-            count_dict (Dict[str, int]
-                classes. If None, updates a single line graph.
+            count_dict (Dict[str, int], optional): Dictionary with class names as keys and counts as values for
+                multiple classes. If None, updates a single line graph.
             plot (str): Type of the plot. Options are 'line', 'bar', 'pie', or 'area'.
 
         Returns: