bplusplus 1.2.2__py3-none-any.whl → 1.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of bplusplus might be problematic. Click here for more details.

@@ -23,7 +23,7 @@ import sys
23
23
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
24
24
  logger = logging.getLogger(__name__)
25
25
 
26
- def test_multitask(species_list, test_set, yolo_weights, hierarchical_weights, output_dir="."):
26
+ def test(species_list, test_set, yolo_weights, hierarchical_weights, output_dir="."):
27
27
  """
28
28
  Run the two-stage classifier on a test set.
29
29
 
@@ -243,8 +243,15 @@ class TestTwoStage:
243
243
  if "species_list" in checkpoint:
244
244
  saved_species = checkpoint["species_list"]
245
245
  print(f"Saved model was trained on: {', '.join(saved_species)}")
246
-
247
- taxonomy, species_to_genus, genus_to_family = get_taxonomy(species_names)
246
+
247
+ # Use saved taxonomy mappings if available
248
+ if "species_to_genus" in checkpoint and "genus_to_family" in checkpoint:
249
+ species_to_genus = checkpoint["species_to_genus"]
250
+ genus_to_family = checkpoint["genus_to_family"]
251
+ else:
252
+ # Fallback: fetch from GBIF but this may cause index mismatches
253
+ print("Warning: No taxonomy mappings in checkpoint, fetching from GBIF")
254
+ _, species_to_genus, genus_to_family = get_taxonomy(species_names)
248
255
  else:
249
256
  taxonomy, species_to_genus, genus_to_family = get_taxonomy(species_names)
250
257
  else:
@@ -285,8 +292,6 @@ class TestTwoStage:
285
292
  self.classification_model.eval()
286
293
 
287
294
  self.classification_transform = transforms.Compose([
288
- transforms.Resize((768, 768)), # Fixed size for all validation images
289
- transforms.CenterCrop(640),
290
295
  transforms.ToTensor(),
291
296
  transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
292
297
  ])
@@ -467,10 +472,18 @@ class TestTwoStage:
467
472
  predicted_genus_frames, true_genus_frames,
468
473
  predicted_family_frames, true_family_frames):
469
474
  """Calculate metrics at all taxonomic levels"""
470
- # Get list of species, families and genera
475
+ # Get list of species, families and genera using the same order as model training
471
476
  species_list = self.species_names
472
- genus_list = sorted(list(set(self.species_to_genus.values())))
473
- family_list = sorted(list(set(self.genus_to_family.values())))
477
+
478
+ # Use the index mappings from the model to ensure consistency
479
+ if 1 in self.idx_to_level and 2 in self.idx_to_level:
480
+ family_list = [self.idx_to_level[1][i] for i in sorted(self.idx_to_level[1].keys())]
481
+ genus_list = [self.idx_to_level[2][i] for i in sorted(self.idx_to_level[2].keys())]
482
+ else:
483
+ # Fallback to sorted lists (may cause issues)
484
+ print("Warning: Using fallback sorted lists for taxonomy - this may cause index mismatches")
485
+ genus_list = sorted(list(set(self.species_to_genus.values())))
486
+ family_list = sorted(list(set(self.genus_to_family.values())))
474
487
 
475
488
  # Print the index mappings we're using for evaluation
476
489
  print("\nUsing the following index mappings for evaluation:")
@@ -665,4 +678,4 @@ if __name__ == "__main__":
665
678
  hierarchical_model_path = "/mnt/nvme0n1p1/mit/two-stage-detection/hierarchical/hierarchical-weights.pth"
666
679
  output_directory = "./output"
667
680
 
668
- test_multitask(species_names, test_directory, yolo_model_path, hierarchical_model_path, output_directory)
681
+ test(species_names, test_directory, yolo_model_path, hierarchical_model_path, output_directory)
bplusplus/tracker.py ADDED
@@ -0,0 +1,261 @@
1
import numpy as np
import uuid
from scipy.optimize import linear_sum_assignment
from collections import deque


class BoundingBox:
    """Axis-aligned detection box in (x, y, width, height) form.

    Stores the frame the box was observed in and, once the tracker has
    matched it, the id of the track it belongs to.
    """

    def __init__(self, x, y, width, height, frame_id, track_id=None):
        self.x = x
        self.y = y
        self.width = width
        self.height = height
        self.area = width * height
        self.frame_id = frame_id
        self.track_id = track_id

    def center(self):
        """Return the (cx, cy) midpoint of the box."""
        return (self.x + self.width / 2, self.y + self.height / 2)

    @classmethod
    def from_xyxy(cls, x1, y1, x2, y2, frame_id, track_id=None):
        """Create a BoundingBox from (x1, y1, x2, y2) corner coordinates."""
        width = x2 - x1
        height = y2 - y1
        return cls(x1, y1, width, height, frame_id, track_id)


class InsectTracker:
    """Multi-object tracker that links per-frame detections into tracks.

    Matching uses the Hungarian algorithm on a cost combining normalized
    center distance and area dissimilarity (equations 1-4 referenced in
    the method comments). Tracks that vanish are kept in a "lost" pool for
    up to ``track_memory_frames`` frames so they can be recovered when the
    insect reappears.
    """

    def __init__(self, image_height, image_width, max_frames=30, w_dist=0.7, w_area=0.3, cost_threshold=0.8, track_memory_frames=None, debug=False):
        """
        Args:
            image_height, image_width: Frame dimensions, used to normalize distances.
            max_frames: Length of the per-frame tracking-history window.
            w_dist: Weight of the normalized-distance term in the matching cost.
            w_area: Weight of the area-dissimilarity term in the matching cost.
            cost_threshold: Maximum cost for a detection to continue a track.
            track_memory_frames: How many frames a lost track is remembered
                before being dropped; defaults to ``max_frames``.
            debug: When True, print verbose matching decisions.
        """
        self.image_height = image_height
        self.image_width = image_width
        # Image diagonal: the largest possible center distance (equation 2).
        self.max_dist = np.sqrt(image_height**2 + image_width**2)
        self.max_frames = max_frames
        self.w_dist = w_dist
        self.w_area = w_area
        self.cost_threshold = cost_threshold
        self.debug = debug

        # If track_memory_frames not specified, use max_frames (full history window)
        self.track_memory_frames = track_memory_frames if track_memory_frames is not None else max_frames
        if self.debug:
            print(f"DEBUG: Tracker initialized with max_frames={max_frames}, track_memory_frames={self.track_memory_frames}")

        self.tracking_history = deque(maxlen=max_frames)
        self.current_tracks = []
        self.lost_tracks = {}  # track_id -> {'box': BoundingBox, 'frames_lost': int}

    def _generate_track_id(self):
        """Generate a unique UUID for a new track"""
        return str(uuid.uuid4())

    def calculate_cost(self, box1, box2):
        """Calculate cost between two bounding boxes as per equation (4)"""
        cx1, cy1 = box1.center()
        cx2, cy2 = box2.center()

        # Euclidean distance (equation 1)
        dist = np.sqrt((cx2 - cx1)**2 + (cy2 - cy1)**2)

        # Normalized distance (equation 2 used for normalization)
        norm_dist = dist / self.max_dist

        # Area cost (equation 3): ratio of smaller to larger area; 1.0 when
        # either box is degenerate so such pairs are not penalized on area.
        min_area = min(box1.area, box2.area)
        max_area = max(box1.area, box2.area)
        area_cost = min_area / max_area if max_area > 0 else 1.0

        # Final cost (equation 4)
        return (norm_dist * self.w_dist) + ((1 - area_cost) * self.w_area)

    def build_cost_matrix(self, prev_boxes, curr_boxes):
        """Build a square cost matrix for the Hungarian algorithm.

        Dummy rows/columns (padding up to the larger of the two box counts)
        carry a prohibitive cost of 999.0 so real pairs are preferred.

        Returns:
            (cost_matrix, n_prev, n_curr)
        """
        n_prev = len(prev_boxes)
        n_curr = len(curr_boxes)
        n = max(n_prev, n_curr)

        cost_matrix = np.ones((n, n)) * 999.0
        for i in range(n_prev):
            for j in range(n_curr):
                cost_matrix[i, j] = self.calculate_cost(prev_boxes[i], curr_boxes[j])

        return cost_matrix, n_prev, n_curr

    def update(self, new_detections, frame_id):
        """
        Update tracking with new detections from YOLO

        Args:
            new_detections: List of YOLO detection boxes (x1, y1, x2, y2 format)
            frame_id: Current frame number

        Returns:
            List of track IDs corresponding to each detection
        """
        # Handle empty detection list (no detections in this frame)
        if not new_detections:
            if self.debug:
                print(f"DEBUG: Frame {frame_id} has no detections")
            for track in self.current_tracks:
                if track.track_id not in self.lost_tracks:
                    # BUG FIX: start at 0, not 1 — _age_lost_tracks() below
                    # increments every lost track, so after this call the
                    # counter correctly reads one lost frame. The old code
                    # double-aged newly lost tracks, expiring them a frame
                    # earlier than track_memory_frames promises.
                    self.lost_tracks[track.track_id] = {
                        'box': track,
                        'frames_lost': 0
                    }
                    if self.debug:
                        print(f"DEBUG: Moved track {track.track_id} to lost tracks")
                # Tracks already in the lost pool are aged exactly once by
                # _age_lost_tracks(); the old code also incremented them
                # here, counting each empty frame twice.

            # Age lost tracks and remove old ones
            self._age_lost_tracks()

            self.current_tracks = []
            self.tracking_history.append([])
            return []

        # Convert YOLO detections to BoundingBox objects
        new_boxes = [BoundingBox.from_xyxy(*det[:4], frame_id) for det in new_detections]

        # First frame (or nothing to match against): assign fresh IDs to all
        if not self.current_tracks and not self.lost_tracks:
            track_ids = []
            for box in new_boxes:
                box.track_id = self._generate_track_id()
                track_ids.append(box.track_id)
                if self.debug:
                    print(f"DEBUG: FIRST FRAME - Assigned track ID {box.track_id} to new detection")
            self.current_tracks = new_boxes
            self.tracking_history.append(new_boxes)
            return track_ids

        # Combine current and lost tracks so lost insects can be re-acquired
        all_previous_tracks = self.current_tracks.copy()
        for track_id, lost_info in self.lost_tracks.items():
            lost_box = lost_info['box']
            lost_box.track_id = track_id  # Ensure track_id is preserved
            all_previous_tracks.append(lost_box)

        if not all_previous_tracks:
            # No previous tracks at all, assign new IDs
            track_ids = []
            for box in new_boxes:
                box.track_id = self._generate_track_id()
                track_ids.append(box.track_id)
                if self.debug:
                    print(f"DEBUG: No previous tracks - Assigned track ID {box.track_id} to new detection")
            self.current_tracks = new_boxes
            self.tracking_history.append(new_boxes)
            return track_ids

        # Build cost matrix including lost tracks and solve the assignment
        cost_matrix, n_prev, n_curr = self.build_cost_matrix(all_previous_tracks, new_boxes)
        row_indices, col_indices = linear_sum_assignment(cost_matrix)

        assigned_curr_indices = set()
        track_ids = [None] * len(new_boxes)
        recovered_tracks = set()  # Track IDs that were recovered from lost tracks

        if self.debug:
            print(f"DEBUG: Hungarian assignment - rows: {row_indices}, cols: {col_indices}")
            print(f"DEBUG: Cost threshold: {self.cost_threshold}")
            print(f"DEBUG: Current tracks: {len(self.current_tracks)}, Lost tracks: {len(self.lost_tracks)}")

        for i, j in zip(row_indices, col_indices):
            # Skip dummy rows/columns introduced by square padding
            if i >= n_prev or j >= n_curr:
                continue
            cost = cost_matrix[i, j]
            if self.debug:
                print(f"DEBUG: Checking assignment {i}->{j}, cost: {cost:.3f}")
            if cost < self.cost_threshold:
                # Carry the previous track's ID over to the matched detection
                prev_track_id = all_previous_tracks[i].track_id
                new_boxes[j].track_id = prev_track_id
                track_ids[j] = prev_track_id
                assigned_curr_indices.add(j)

                # Check if this was a lost track being recovered
                if prev_track_id in self.lost_tracks:
                    recovered_tracks.add(prev_track_id)
                    if self.debug:
                        print(f"DEBUG: RECOVERED lost track ID {prev_track_id} for detection {j} (was lost for {self.lost_tracks[prev_track_id]['frames_lost']} frames)")
                elif self.debug:
                    print(f"DEBUG: Continued track ID {prev_track_id} for detection {j}")
            elif self.debug:
                print(f"DEBUG: Cost {cost:.3f} above threshold {self.cost_threshold}, not assigning")

        # Remove recovered tracks from lost tracks
        for track_id in recovered_tracks:
            del self.lost_tracks[track_id]

        # Assign new track IDs to unassigned current boxes (new insects)
        for j in range(n_curr):
            if j not in assigned_curr_indices:
                new_boxes[j].track_id = self._generate_track_id()
                track_ids[j] = new_boxes[j].track_id
                if self.debug:
                    print(f"DEBUG: Assigned NEW track ID {new_boxes[j].track_id} to detection {j}")

        # Move unmatched current tracks to lost tracks (tracks that disappeared this frame)
        matched_track_ids = {track_ids[j] for j in assigned_curr_indices if track_ids[j] is not None}
        for track in self.current_tracks:
            if track.track_id not in matched_track_ids and track.track_id not in recovered_tracks:
                if track.track_id not in self.lost_tracks:
                    # BUG FIX: initialize to 0 — _age_lost_tracks() below
                    # immediately bumps this to 1, the true lost-frame count.
                    self.lost_tracks[track.track_id] = {
                        'box': track,
                        'frames_lost': 0
                    }
                    if self.debug:
                        print(f"DEBUG: Track {track.track_id} disappeared, moved to lost tracks")

        # Age lost tracks and remove old ones
        self._age_lost_tracks()

        # Update current tracks and append this frame to the history window
        self.current_tracks = new_boxes
        self.tracking_history.append(new_boxes)

        return track_ids

    def _age_lost_tracks(self):
        """Age lost tracks and remove those that have been lost too long"""
        tracks_to_remove = []
        for track_id, lost_info in self.lost_tracks.items():
            lost_info['frames_lost'] += 1
            if lost_info['frames_lost'] > self.track_memory_frames:
                tracks_to_remove.append(track_id)
                if self.debug:
                    print(f"DEBUG: Permanently removing track {track_id} (lost for {lost_info['frames_lost']} frames)")

        for track_id in tracks_to_remove:
            del self.lost_tracks[track_id]

    def get_tracking_stats(self):
        """Get current tracking statistics for debugging/monitoring"""
        return {
            'active_tracks': len(self.current_tracks),
            'lost_tracks': len(self.lost_tracks),
            'active_track_ids': [track.track_id for track in self.current_tracks],
            'lost_track_ids': list(self.lost_tracks.keys()),
            'total_history_frames': len(self.tracking_history)
        }
@@ -14,7 +14,7 @@ import logging
14
14
  from tqdm import tqdm
15
15
  import sys
16
16
 
17
- def train_multitask(batch_size=4, epochs=30, patience=3, img_size=640, data_dir='/mnt/nvme0n1p1/datasets/insect/bjerge-train2', output_dir='./output', species_list=None):
17
+ def train(batch_size=4, epochs=30, patience=3, img_size=640, data_dir='input', output_dir='./output', species_list=None):
18
18
  """
19
19
  Main function to run the entire training pipeline.
20
20
  Sets up datasets, model, training process and handles errors.
@@ -0,0 +1,101 @@
1
+ Metadata-Version: 2.3
2
+ Name: bplusplus
3
+ Version: 1.2.3
4
+ Summary: A simple method to create AI models for biodiversity, with collect and prepare pipeline
5
+ License: MIT
6
+ Author: Titus Venverloo
7
+ Author-email: tvenver@mit.edu
8
+ Requires-Python: >=3.9.0,<4.0.0
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.9
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Programming Language :: Python :: 3.13
16
+ Requires-Dist: numpy
17
+ Requires-Dist: pandas (==2.1.4)
18
+ Requires-Dist: pillow
19
+ Requires-Dist: prettytable (==3.7.0)
20
+ Requires-Dist: pygbif (>=0.6.4,<0.7.0)
21
+ Requires-Dist: pyyaml (==6.0.1)
22
+ Requires-Dist: requests (==2.25.1)
23
+ Requires-Dist: scikit-learn
24
+ Requires-Dist: tabulate (>=0.9.0,<0.10.0)
25
+ Requires-Dist: torch (>=2.5.0,<3.0.0)
26
+ Requires-Dist: torchvision
27
+ Requires-Dist: tqdm (==4.66.4)
28
+ Requires-Dist: ultralytics (>=8.3.0)
29
+ Requires-Dist: validators (>=0.33.0,<0.34.0)
30
+ Description-Content-Type: text/markdown
31
+
32
+ # Domain-Agnostic Insect Classification Pipeline
33
+
34
+ This project provides a complete, end-to-end pipeline for building a custom insect classification system. The framework is designed to be **domain-agnostic**, allowing you to train a powerful detection and classification model for **any insect species** by simply providing a list of names.
35
+
36
+ Using the `Bplusplus` library, this pipeline automates the entire machine learning workflow, from data collection to video inference.
37
+
38
+ ## Key Features
39
+
40
+ - **Automated Data Collection**: Downloads hundreds of images for any species from the GBIF database.
41
+ - **Intelligent Data Preparation**: Uses a pre-trained model to automatically find, crop, and resize insects from raw images, ensuring high-quality training data.
42
+ - **Hierarchical Classification**: Trains a model to identify insects at three taxonomic levels: **family, genus, and species**.
43
+ - **Video Inference & Tracking**: Processes video files to detect, classify, and track individual insects over time, providing aggregated predictions.
44
+ ## Pipeline Overview
45
+
46
+ The process is broken down into six main steps, all detailed in the `full_pipeline.ipynb` notebook:
47
+
48
+ 1. **Collect Data**: Select your target species and fetch raw insect images from the web.
49
+ 2. **Prepare Data**: Filter, clean, and prepare images for training.
50
+ 3. **Train Model**: Train the hierarchical classification model.
51
+ 4. **Download Weights**: Fetch pre-trained weights for the detection model.
52
+ 5. **Test Model**: Evaluate the performance of the trained model.
53
+ 6. **Run Inference**: Run the full pipeline on a video file for real-world application.
54
+
55
+ ## How to Use
56
+
57
+ ### Prerequisites
58
+
59
+ - Python 3.9+ (the package declares `Requires-Python >=3.9.0`)
60
+ - `venv` for creating a virtual environment (recommended)
61
+
62
+ ### Setup
63
+
64
+ 1. **Create and activate a virtual environment:**
65
+ ```bash
66
+ python3 -m venv venv
67
+ source venv/bin/activate
68
+ ```
69
+
70
+ 2. **Install the required packages:**
71
+ ```bash
72
+ pip install bplusplus
73
+ ```
74
+
75
+ ### Running the Pipeline
76
+
77
+ The entire workflow is contained within **`full_pipeline.ipynb`**. Open it with a Jupyter Notebook or JupyterLab environment and run the cells sequentially to execute the full pipeline.
78
+
79
+ ### Customization
80
+
81
+ To train the model on different insect species, simply modify the `names` list in **Step 1** of the notebook:
82
+
83
+ ```python
84
+ # a/full_pipeline.ipynb
85
+
86
+ # To use your own species, change the names in this list
87
+ names = [
88
+ "Vespa crabro", "Vespula vulgaris", "Dolichovespula media"
89
+ ]
90
+ ```
91
+
92
+ The pipeline will automatically handle the rest, from data collection to training, for your new set of species.
93
+
94
+ ## Directory Structure
95
+
96
+ The pipeline will create the following directories to store artifacts:
97
+
98
+ - `GBIF_data/`: Stores the raw images downloaded from GBIF.
99
+ - `prepared_data/`: Contains the cleaned, cropped, and resized images ready for training.
100
+ - `trained_model/`: Saves the trained model weights (`best_multitask.pt`) and pre-trained detection weights.
101
+
@@ -0,0 +1,11 @@
1
+ bplusplus/__init__.py,sha256=GLqIx6Ln3Jt_Q95zkqXglKaSF3dbw-awax4dYry3tw0,145
2
+ bplusplus/collect.py,sha256=lEJHXPpOo4DALBw6zemdmFuqAXZ12-BKwgesvq5ACYs,7135
3
+ bplusplus/inference.py,sha256=3XmwzEfVTw5OFiMbMVgiuEa-r22HvMUWHVXESZsTIzo,37708
4
+ bplusplus/prepare.py,sha256=pdXUVAzY030tM6f0Gf_zml8I26lS38wuvH13R2F00Do,25829
5
+ bplusplus/test.py,sha256=kKjrsb3iCfljtRjot_kiVB5hopMkApoW9yvMcuI2O_U,30545
6
+ bplusplus/tracker.py,sha256=JixV1ICGywGhVMTvkq3hrk4MLUUWDh3XJW4VLm4JdO0,11250
7
+ bplusplus/train.py,sha256=wkHnKbTdZAFn2voJS7gSYXU7B9UVYVYmbTJCR0tFzs4,28058
8
+ bplusplus-1.2.3.dist-info/LICENSE,sha256=rRkeHptDnlmviR0_WWgNT9t696eys_cjfVUU8FEO4k4,1071
9
+ bplusplus-1.2.3.dist-info/METADATA,sha256=IDnokwF2CEyM_3xLmlRL30k2P9NglDjdjbxC7-UZoc4,4046
10
+ bplusplus-1.2.3.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
11
+ bplusplus-1.2.3.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: poetry-core 2.1.2
2
+ Generator: poetry-core 2.1.3
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any