supervisely-6.73.427-py3-none-any.whl → supervisely-6.73.429-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,264 @@
+ import numpy as np
+ from collections import defaultdict
+ from typing import Dict, List, Union
+
+ from scipy.optimize import linear_sum_assignment  # pylint: disable=import-error
+
+ import supervisely as sly
+ from supervisely.video_annotation.video_annotation import VideoAnnotation
+
+ import motmetrics as mm  # pylint: disable=import-error
+
+ class TrackingEvaluator:
+     """
+     Evaluator for video tracking metrics including MOTA, MOTP, IDF1.
+     """
+
+     def __init__(self, iou_threshold: float = 0.5):
+         """Initialize evaluator with IoU threshold for matching."""
+         from supervisely.nn.tracker import TRACKING_LIBS_INSTALLED
+         if not TRACKING_LIBS_INSTALLED:
+             raise ImportError(
+                 "Tracking dependencies are not installed. "
+                 "Please install supervisely with `pip install supervisely[tracking]`."
+             )
+
+         if not 0.0 <= iou_threshold <= 1.0:
+             raise ValueError("iou_threshold must be in [0.0, 1.0]")
+         self.iou_threshold = iou_threshold
+
+     def evaluate(
+         self,
+         gt_annotation: VideoAnnotation,
+         pred_annotation: VideoAnnotation,
+     ) -> Dict[str, Union[float, int]]:
+         """Main entry: extract tracks from annotations, compute basic and MOT metrics, return results."""
+         self._validate_annotations(gt_annotation, pred_annotation)
+         self.img_height, self.img_width = gt_annotation.img_size
+
+         gt_tracks = self._extract_tracks(gt_annotation)
+         pred_tracks = self._extract_tracks(pred_annotation)
+
+         basic = self._compute_basic_metrics(gt_tracks, pred_tracks)
+         mot = self._compute_mot_metrics(gt_tracks, pred_tracks)
+
+         results = {
+             # basic detection
+             "precision": basic["precision"],
+             "recall": basic["recall"],
+             "f1": basic["f1"],
+             "avg_iou": basic["avg_iou"],
+             "true_positives": basic["tp"],
+             "false_positives": basic["fp"],
+             "false_negatives": basic["fn"],
+             "total_gt_objects": basic["total_gt"],
+             "total_pred_objects": basic["total_pred"],
+
+             # motmetrics
+             "mota": mot["mota"],
+             "motp": mot["motp"],
+             "idf1": mot["idf1"],
+             "id_switches": mot["id_switches"],
+             "fragmentations": mot["fragmentations"],
+             "num_misses": mot["num_misses"],
+             "num_false_positives": mot["num_false_positives"],
+
+             # config
+             "iou_threshold": self.iou_threshold,
+         }
+         return results
+
+     def _validate_annotations(self, gt: VideoAnnotation, pred: VideoAnnotation):
+         """Minimal type validation for annotations."""
+         if not isinstance(gt, VideoAnnotation) or not isinstance(pred, VideoAnnotation):
+             raise TypeError("gt_annotation and pred_annotation must be VideoAnnotation instances")
+
+     def _extract_tracks(self, annotation: VideoAnnotation) -> Dict[int, List[Dict]]:
+         """
+         Extract tracks from a VideoAnnotation into a dict keyed by frame index.
+         Each element is a dict: {'track_id': int, 'bbox': [x1, y1, x2, y2], 'confidence': float, 'class_name': str}
+         """
+         frames_to_tracks = defaultdict(list)
+
+         for frame in annotation.frames:
+             frame_idx = frame.index
+             for figure in frame.figures:
+                 # use track_id if present, otherwise fall back to the object's integer key
+                 track_id = int(figure.track_id) if figure.track_id is not None else figure.video_object.key().int
+
+                 bbox = figure.geometry
+                 if not isinstance(bbox, sly.Rectangle):
+                     bbox = bbox.to_bbox()
+
+                 x1 = float(bbox.left)
+                 y1 = float(bbox.top)
+                 x2 = float(bbox.right)
+                 y2 = float(bbox.bottom)
+
+                 frames_to_tracks[frame_idx].append({
+                     "track_id": track_id,
+                     "bbox": [x1, y1, x2, y2],
+                     "confidence": float(getattr(figure, "confidence", 1.0)),
+                     "class_name": figure.video_object.obj_class.name,
+                 })
+
+         return dict(frames_to_tracks)
+
+     def _compute_basic_metrics(self, gt_tracks: Dict[int, List[Dict]], pred_tracks: Dict[int, List[Dict]]):
+         """
+         Compute per-frame true positives / false positives / false negatives and average IoU.
+         Matching is performed with the Hungarian algorithm (scipy). Matches with IoU < threshold are discarded.
+         """
+         tp = fp = fn = 0
+         total_iou = 0.0
+         iou_count = 0
+
+         frames = sorted(set(list(gt_tracks.keys()) + list(pred_tracks.keys())))
+         for f in frames:
+             gts = gt_tracks.get(f, [])
+             preds = pred_tracks.get(f, [])
+
+             if not gts and not preds:
+                 continue
+             if not gts:
+                 fp += len(preds)
+                 continue
+             if not preds:
+                 fn += len(gts)
+                 continue
+
+             gt_boxes = np.array([g["bbox"] for g in gts], dtype=float)
+             pred_boxes = np.array([p["bbox"] for p in preds], dtype=float)
+
+             # motmetrics expects boxes as (x, y, w, h); the extracted bboxes are [x1, y1, x2, y2]
+             gt_xywh = gt_boxes.copy()
+             gt_xywh[:, 2:] -= gt_xywh[:, :2]
+             pred_xywh = pred_boxes.copy()
+             pred_xywh[:, 2:] -= pred_xywh[:, :2]
+
+             # get cost matrix from motmetrics (cost = 1 - IoU)
+             cost_mat = mm.distances.iou_matrix(gt_xywh, pred_xywh, max_iou=1.0)
+             # replace NaNs (if any) with a large cost so Hungarian will avoid them
+             cost_for_assignment = np.where(np.isnan(cost_mat), 1e6, cost_mat)
+
+             # Hungarian assignment (minimize cost -> maximize IoU)
+             row_idx, col_idx = linear_sum_assignment(cost_for_assignment)
+
+             matched_gt = set()
+             matched_pred = set()
+             for r, c in zip(row_idx, col_idx):
+                 if r < cost_mat.shape[0] and c < cost_mat.shape[1]:
+                     # IoU = 1 - cost
+                     cost_val = cost_mat[r, c]
+                     if np.isnan(cost_val):
+                         continue
+                     iou_val = 1.0 - float(cost_val)
+                     if iou_val >= self.iou_threshold:
+                         matched_gt.add(r)
+                         matched_pred.add(c)
+                         total_iou += iou_val
+                         iou_count += 1
+
+             frame_tp = len(matched_gt)
+             frame_fp = len(preds) - len(matched_pred)
+             frame_fn = len(gts) - len(matched_gt)
+
+             tp += frame_tp
+             fp += frame_fp
+             fn += frame_fn
+
+         precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
+         recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
+         f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
+         avg_iou = total_iou / iou_count if iou_count > 0 else 0.0
+
+         total_gt = sum(len(v) for v in gt_tracks.values())
+         total_pred = sum(len(v) for v in pred_tracks.values())
+
+         return {
+             "precision": precision,
+             "recall": recall,
+             "f1": f1,
+             "avg_iou": avg_iou,
+             "tp": tp,
+             "fp": fp,
+             "fn": fn,
+             "total_gt": total_gt,
+             "total_pred": total_pred,
+         }
+
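For illustration, a minimal standalone sketch of this matching step on two toy boxes (values are only illustrative and not taken from the package): the boxes are given as (x, y, w, h), the format `mm.distances.iou_matrix` expects, and the Hungarian assignment pairs ground truth with predictions by lowest 1 - IoU cost.

    import numpy as np
    import motmetrics as mm
    from scipy.optimize import linear_sum_assignment

    gt = np.array([[10, 10, 50, 80], [200, 40, 60, 60]], dtype=float)     # (x, y, w, h)
    pred = np.array([[12, 14, 50, 78], [400, 400, 30, 30]], dtype=float)  # (x, y, w, h)

    cost = mm.distances.iou_matrix(gt, pred, max_iou=1.0)                 # cost = 1 - IoU
    rows, cols = linear_sum_assignment(np.where(np.isnan(cost), 1e6, cost))
    for r, c in zip(rows, cols):
        iou = 1.0 - cost[r, c]
        print(r, c, round(iou, 3))  # only pairs with iou >= iou_threshold count as TP
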
+     def _compute_mot_metrics(self, gt_tracks: Dict[int, List[Dict]], pred_tracks: Dict[int, List[Dict]]):
+         """
+         Use motmetrics.MOTAccumulator to collect associations per frame and compute common MOT metrics.
+         The distance matrix comes from motmetrics.distances.iou_matrix (which returns 1 - IoU).
+         Pairs with distance > (1 - iou_threshold) are set to infinity to exclude them from matching.
+         """
+         acc = mm.MOTAccumulator(auto_id=True)
+
+         frames = sorted(set(list(gt_tracks.keys()) + list(pred_tracks.keys())))
+         for f in frames:
+             gts = gt_tracks.get(f, [])
+             preds = pred_tracks.get(f, [])
+
+             gt_ids = [g["track_id"] for g in gts]
+             pred_ids = [p["track_id"] for p in preds]
+
+             if gts and preds:
+                 gt_boxes = np.array([g["bbox"] for g in gts], dtype=float)
+                 pred_boxes = np.array([p["bbox"] for p in preds], dtype=float)
+
+                 # convert [x1, y1, x2, y2] to the (x, y, w, h) format expected by motmetrics
+                 gt_xywh = gt_boxes.copy()
+                 gt_xywh[:, 2:] -= gt_xywh[:, :2]
+                 pred_xywh = pred_boxes.copy()
+                 pred_xywh[:, 2:] -= pred_xywh[:, :2]
+
+                 # motmetrics provides a distance matrix (1 - IoU)
+                 dist_mat = mm.distances.iou_matrix(gt_xywh, pred_xywh, max_iou=1.0)
+                 # exclude pairs with IoU < threshold => distance > 1 - threshold
+                 dist_mat = np.array(dist_mat, dtype=float)
+                 dist_mat[np.isnan(dist_mat)] = np.inf
+                 dist_mat[dist_mat > (1.0 - self.iou_threshold)] = np.inf
+             else:
+                 dist_mat = np.full((len(gts), len(preds)), np.inf)
+
+             acc.update(gt_ids, pred_ids, dist_mat)
+
+         mh = mm.metrics.create()
+         summary = mh.compute(
+             acc,
+             metrics=[
+                 "mota",
+                 "motp",
+                 "idf1",
+                 "num_switches",
+                 "num_fragmentations",
+                 "num_misses",
+                 "num_false_positives",
+             ],
+             name="eval",
+         )
+
+         def get_val(col: str, default=0.0):
+             if summary.empty or col not in summary.columns:
+                 return float(default)
+             v = summary.iloc[0][col]
+             return float(v) if not np.isnan(v) else float(default)
+
+         return {
+             "mota": get_val("mota", 0.0),
+             "motp": get_val("motp", 0.0),
+             "idf1": get_val("idf1", 0.0),
+             "id_switches": int(get_val("num_switches", 0.0)),
+             "fragmentations": int(get_val("num_fragmentations", 0.0)),
+             "num_misses": int(get_val("num_misses", 0.0)),
+             "num_false_positives": int(get_val("num_false_positives", 0.0)),
+         }
+
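For reference, the headline numbers returned here follow the standard motmetrics definitions: MOTA = 1 - (num_misses + num_false_positives + num_switches) / total ground-truth objects; MOTP is the mean matching distance over all matched pairs, which with the IoU distance used above equals 1 - (average IoU of matches), so lower is better; IDF1 = 2 * IDTP / (2 * IDTP + IDFP + IDFN), i.e. the F1 score of identity-preserving matches.
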
+
+ def evaluate(
+     gt_annotation: VideoAnnotation,
+     pred_annotation: VideoAnnotation,
+     iou_threshold: float = 0.5,
+ ) -> Dict[str, Union[float, int]]:
+     """
+     Evaluate tracking predictions against ground truth.
+
+     Args:
+         gt_annotation: Ground-truth supervisely VideoAnnotation containing the reference object tracks.
+         pred_annotation: Predicted supervisely VideoAnnotation to be compared against the ground truth.
+         iou_threshold: Minimum Intersection-over-Union required for a detection to be considered a valid match.
+
+     Returns:
+         dict: JSON-serializable dict with evaluation metrics.
+     """
+     evaluator = TrackingEvaluator(iou_threshold=iou_threshold)
+     return evaluator.evaluate(gt_annotation, pred_annotation)
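A minimal usage sketch, not part of the package: `gt_ann` and `pred_ann` stand in for two VideoAnnotation objects obtained elsewhere (for example downloaded from a Supervisely project or built with the converters in the next file).

    metrics = evaluate(gt_ann, pred_ann, iou_threshold=0.5)
    print(metrics["precision"], metrics["recall"], metrics["mota"], metrics["idf1"])
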
@@ -0,0 +1,274 @@
+
+ from typing import List, Union, Dict, Tuple
+ from pathlib import Path
+ from collections import defaultdict
+ import numpy as np
+
+ import supervisely as sly
+ from supervisely.nn.model.prediction import Prediction
+ from supervisely import VideoAnnotation
+ from supervisely import logger
+
+
+ def predictions_to_video_annotation(
+     predictions: List[Prediction],
+ ) -> VideoAnnotation:
+     """
+     Convert a list of Prediction objects to a VideoAnnotation.
+
+     Args:
+         predictions: List of Prediction objects, one per frame.
+
+     Returns:
+         VideoAnnotation object with tracked objects.
+     """
+
+     if not predictions:
+         raise ValueError("Empty predictions list provided")
+
+     frame_shape = predictions[0].annotation.img_size
+     img_h, img_w = frame_shape
+     video_objects = {}
+     frames = []
+
+     for pred in predictions:
+         frame_figures = []
+         frame_idx = pred.frame_index
+
+         # Get data using public properties
+         boxes = pred.boxes  # Public property - np.array (N, 4) in tlbr format
+         classes = pred.classes  # Public property - list of class names
+         track_ids = pred.track_ids  # Public property - can be None
+
+         # Skip frame if no detections
+         if len(boxes) == 0:
+             frames.append(sly.Frame(frame_idx, []))
+             continue
+
+         # Track ids are required to build VideoObjects; fail early if they are missing
+         if track_ids is None:
+             raise ValueError(f"Prediction for frame {frame_idx} has no track ids")
+
+         for bbox, class_name, track_id in zip(boxes, classes, track_ids):
+             # Clip bbox to image boundaries
+             # Note: pred.boxes returns tlbr format (top, left, bottom, right)
+             top, left, bottom, right = bbox
+             dims = np.array([img_h, img_w, img_h, img_w]) - 1
+             top, left, bottom, right = np.clip([top, left, bottom, right], 0, dims)
+
+             # Convert to integer coordinates
+             top, left, bottom, right = int(top), int(left), int(bottom), int(right)
+
+             # Get or create VideoObject
+             if track_id not in video_objects:
+                 # Find obj_class from prediction annotation
+                 obj_class = None
+                 for label in pred.annotation.labels:
+                     if label.obj_class.name == class_name:
+                         obj_class = label.obj_class
+                         break
+
+                 if obj_class is None:
+                     # Create obj_class if not found (fallback)
+                     obj_class = sly.ObjClass(class_name, sly.Rectangle)
+
+                 video_objects[track_id] = sly.VideoObject(obj_class)
+
+             video_object = video_objects[track_id]
+             rect = sly.Rectangle(top=top, left=left, bottom=bottom, right=right)
+             frame_figures.append(sly.VideoFigure(video_object, rect, frame_idx, track_id=str(track_id)))
+
+         frames.append(sly.Frame(frame_idx, frame_figures))
+
+     objects = list(video_objects.values())
+
+     return VideoAnnotation(
+         img_size=frame_shape,
+         frames_count=len(predictions),
+         objects=sly.VideoObjectCollection(objects),
+         frames=sly.FrameCollection(frames),
+     )
+
+ def video_annotation_to_mot(
+     annotation: VideoAnnotation,
+     output_path: Union[str, Path] = None,
+     class_to_id_mapping: Dict[str, int] = None,
+ ) -> Union[str, List[str]]:
+     """
+     Convert Supervisely VideoAnnotation to MOT format.
+     MOT format: frame_id,track_id,left,top,width,height,confidence,class_id,visibility
+     """
+     mot_lines = []
+
+     # Create default class mapping if not provided
+     if class_to_id_mapping is None:
+         unique_classes = set()
+         for frame in annotation.frames:
+             for figure in frame.figures:
+                 unique_classes.add(figure.video_object.obj_class.name)
+         class_to_id_mapping = {cls_name: idx + 1 for idx, cls_name in enumerate(sorted(unique_classes))}
+
+     # Extract tracks
+     for frame in annotation.frames:
+         frame_id = frame.index + 1  # MOT uses 1-based frame indexing
+
+         for figure in frame.figures:
+             # Get track ID from VideoFigure.track_id (official API)
+             if figure.track_id is not None:
+                 track_id = int(figure.track_id)
+             else:
+                 track_id = figure.video_object.key().int
+
+             # Get bounding box
+             if isinstance(figure.geometry, sly.Rectangle):
+                 bbox = figure.geometry
+             else:
+                 bbox = figure.geometry.to_bbox()
+
+             left = bbox.left
+             top = bbox.top
+             width = bbox.width
+             height = bbox.height
+
+             # Get class ID
+             class_name = figure.video_object.obj_class.name
+             class_id = class_to_id_mapping.get(class_name, 1)
+
+             # Get confidence (default)
+             confidence = 1.0
+
+             # Visibility (assume visible)
+             visibility = 1
+
+             # Create MOT line
+             mot_line = f"{frame_id},{track_id},{left:.2f},{top:.2f},{width:.2f},{height:.2f},{confidence:.3f},{class_id},{visibility}"
+             mot_lines.append(mot_line)
+
+     # Save to file if path provided
+     if output_path:
+         output_path = Path(output_path)
+         output_path.parent.mkdir(parents=True, exist_ok=True)
+
+         with open(output_path, 'w') as f:
+             for line in mot_lines:
+                 f.write(line + '\n')
+
+         logger.info(f"Saved MOT format to: {output_path} ({len(mot_lines)} detections)")
+         return str(output_path)
+
+     return mot_lines
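As a concrete illustration (the variable name and path are hypothetical), the calls below produce lines such as `1,3,25.00,40.00,120.00,260.00,1.000,1,1`, i.e. frame 1, track 3, a 120 x 260 box with its top-left corner at (25, 40), confidence 1.0, class id 1, fully visible.

    mot_lines = video_annotation_to_mot(video_ann)               # returns the lines as List[str]
    gt_path = video_annotation_to_mot(video_ann, "mot/gt.txt")   # writes the file and returns its path
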
+
+ def mot_to_video_annotation(
+     mot_file_path: Union[str, Path],
+     img_size: Tuple[int, int] = (1080, 1920),
+     class_mapping: Dict[int, str] = None,
+     default_class_name: str = "person",
+ ) -> VideoAnnotation:
+     """
+     Convert MOT format tracking data to Supervisely VideoAnnotation.
+     MOT format: frame_id,track_id,left,top,width,height,confidence,class_id,visibility
+     """
+     mot_file_path = Path(mot_file_path)
+
+     if not mot_file_path.exists():
+         raise FileNotFoundError(f"MOT file not found: {mot_file_path}")
+
+     logger.info(f"Loading MOT data from: {mot_file_path}")
+     logger.info(f"Image size: {img_size} (height, width)")
+
+     # Default class mapping
+     if class_mapping is None:
+         class_mapping = {1: default_class_name}
+
+     # Parse MOT file
+     video_objects = {}  # track_id -> VideoObject
+     frames_data = defaultdict(list)  # frame_idx -> list of figures
+     max_frame_idx = 0
+     img_h, img_w = img_size
+
+     with open(mot_file_path, 'r') as f:
+         for line_num, line in enumerate(f, 1):
+             line = line.strip()
+             if not line or line.startswith('#'):
+                 continue
+
+             try:
+                 parts = line.split(',')
+                 if len(parts) < 6:  # Minimum required fields
+                     continue
+
+                 frame_id = int(parts[0])
+                 track_id = int(parts[1])
+                 left = float(parts[2])
+                 top = float(parts[3])
+                 width = float(parts[4])
+                 height = float(parts[5])
+
+                 # Optional fields
+                 confidence = float(parts[6]) if len(parts) > 6 and parts[6] != '-1' else 1.0
+                 class_id = int(parts[7]) if len(parts) > 7 and parts[7] != '-1' else 1
+                 visibility = float(parts[8]) if len(parts) > 8 and parts[8] != '-1' else 1.0
+
+                 frame_idx = frame_id - 1  # Convert to 0-based indexing
+                 max_frame_idx = max(max_frame_idx, frame_idx)
+
+                 # Skip low confidence detections
+                 if confidence < 0.1:
+                     continue
+
+                 # Calculate corner coordinates
+                 right = left + width
+                 bottom = top + height
+
+                 # Clip to image boundaries
+                 left = max(0, int(left))
+                 top = max(0, int(top))
+                 right = min(int(right), img_w - 1)
+                 bottom = min(int(bottom), img_h - 1)
+
+                 # Skip invalid boxes
+                 if right <= left or bottom <= top:
+                     continue
+
+                 # Get class name
+                 class_name = class_mapping.get(class_id, default_class_name)
+
+                 # Create VideoObject if not exists
+                 if track_id not in video_objects:
+                     obj_class = sly.ObjClass(class_name, sly.Rectangle)
+                     video_objects[track_id] = sly.VideoObject(obj_class)
+
+                 video_object = video_objects[track_id]
+
+                 # Create rectangle and figure with track_id
+                 rect = sly.Rectangle(top=top, left=left, bottom=bottom, right=right)
+                 figure = sly.VideoFigure(video_object, rect, frame_idx, track_id=str(track_id))
+
+                 frames_data[frame_idx].append(figure)
+
+             except (ValueError, IndexError) as e:
+                 logger.warning(f"Skipped invalid MOT line {line_num}: {line} - {e}")
+                 continue
+
+     # Create frames
+     frames = []
+     if frames_data:
+         frames_count = max(frames_data.keys()) + 1
+
+         for frame_idx in range(frames_count):
+             figures = frames_data.get(frame_idx, [])
+             frames.append(sly.Frame(frame_idx, figures))
+     else:
+         frames_count = 1
+         frames = [sly.Frame(0, [])]
+
+     # Create VideoAnnotation
+     objects = list(video_objects.values())
+
+     annotation = VideoAnnotation(
+         img_size=img_size,
+         frames_count=frames_count,
+         objects=sly.VideoObjectCollection(objects),
+         frames=sly.FrameCollection(frames),
+     )
+
+     logger.info(f"Created VideoAnnotation with {len(objects)} tracks and {frames_count} frames")
+
+     return annotation
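Taken together with the evaluator from the first file, one possible end-to-end sketch (the file name, the `predictions` list and the `evaluate` import are assumptions, since the diff does not show module paths):

    pred_ann = predictions_to_video_annotation(predictions)      # predictions: List[Prediction] from a tracker
    gt_ann = mot_to_video_annotation("gt/gt.txt", img_size=pred_ann.img_size, class_mapping={1: "person"})
    metrics = evaluate(gt_ann, pred_ann, iou_threshold=0.5)      # evaluate() from the first file in this diff
    print({k: metrics[k] for k in ("mota", "motp", "idf1", "id_switches")})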