bplusplus 1.2.1-py3-none-any.whl → 1.2.3-py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of bplusplus has been flagged as potentially problematic.
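For orientation before the diff: the public entry point in bplusplus/prepare.py changes its keyword arguments between these versions. A hedged usage sketch, based only on the signatures visible in the hunks below (the import path is assumed from the file name and directory names are illustrative):

    from bplusplus.prepare import prepare  # import path assumed from bplusplus/prepare.py

    # 1.2.1-style call: detection-oriented options
    # prepare("raw_images", "dataset", one_stage=True, with_background=False,
    #         size_filter=False, sizes=None)

    # 1.2.3-style call: classification-oriented preparation; only img_size remains,
    # controlling the smallest dimension of the cropped images (default 40 px)
    prepare("raw_images", "dataset", img_size=40)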

bplusplus/prepare.py CHANGED
@@ -2,17 +2,12 @@ import os
2
2
  import random
3
3
  import shutil
4
4
  import tempfile
5
- from collections import defaultdict
6
5
  from pathlib import Path
7
- from typing import Any, Optional
6
+ from typing import Optional
8
7
 
9
- import matplotlib.pyplot as plt
10
- import numpy as np
11
8
  import requests
12
9
  import torch
13
- import yaml
14
- from PIL import Image, ImageDraw, ImageFont
15
- from prettytable import PrettyTable
10
+ from PIL import Image
16
11
  from torch import serialization
17
12
  from torch.nn import Module, ModuleDict, ModuleList
18
13
  from torch.nn.modules.activation import LeakyReLU, ReLU, SiLU
@@ -31,272 +26,290 @@ from ultralytics.nn.modules.block import DFL
31
26
  from ultralytics.nn.modules.conv import Conv
32
27
  from ultralytics.nn.tasks import DetectionModel
33
28
 
34
- from .collect import Group, collect
35
-
36
-
37
- def prepare(input_directory: str, output_directory: str, one_stage: bool = False, with_background: bool = False, size_filter: bool = False, sizes: list = None):
38
29
 
30
+ def prepare(input_directory: str, output_directory: str, img_size: int = 40):
39
31
  """
40
- Prepares the dataset for training by performing the following steps:
41
- 1. Copies images from the input directory to a temporary directory.
42
- 2. Deletes corrupted images.
43
- 3. Downloads YOLOv5 weights if not already present.
44
- 4. Runs YOLOv5 inference to generate labels for the images.
45
- 5. Deletes orphaned images and inferences.
46
- 6. Updates labels based on class mapping.
47
- 7. Splits the data into train, test, and validation sets.
48
- 8. Counts the total number of images across all splits.
49
- 9. Makes a YAML configuration file for YOLOv8.
32
+ Prepares a YOLO classification dataset by performing the following steps:
33
+ 1. Copies images from input directory to temporary directory and creates class mapping.
34
+ 2. Deletes corrupted images and downloads YOLO model weights if not present.
35
+ 3. Runs YOLO inference to generate detection labels (bounding boxes) for the images.
36
+ 4. Cleans up orphaned images, invalid labels, and updates labels with class indices.
37
+ 5. Crops detected objects from images based on bounding boxes and resizes them.
38
+ 6. Splits data into train/valid sets with classification folder structure (train/class_name/image.jpg).
50
39
 
51
40
  Args:
52
41
  input_directory (str): The path to the input directory containing the images.
53
- output_directory (str): The path to the output directory where the prepared dataset will be saved.
42
+ output_directory (str): The path to the output directory where the prepared classification dataset will be saved.
43
+ img_size (int, optional): The target size for the smallest dimension of cropped images. Defaults to 40.
54
44
  """
55
-
56
45
  input_directory = Path(input_directory)
57
46
  output_directory = Path(output_directory)
58
47
 
59
- class_mapping={}
48
+ print("="*60)
49
+ print("STARTING BPLUSPLUS DATASET PREPARATION")
50
+ print("="*60)
51
+ print(f"Input directory: {input_directory}")
52
+ print(f"Output directory: {output_directory}")
53
+ print(f"Target image size: {img_size}px (smallest dimension)")
54
+ print()
60
55
 
61
56
  with tempfile.TemporaryDirectory() as temp_dir:
62
-
63
57
  temp_dir_path = Path(temp_dir)
64
- images_path = temp_dir_path / "images"
65
-
66
- images_path.mkdir(parents=True, exist_ok=True)
67
-
68
- for folder_directory in input_directory.iterdir():
69
- images_names = []
70
- if folder_directory.is_dir():
71
- folder_name = folder_directory.name
72
- for image_file in folder_directory.glob("*.jpg"):
73
- shutil.copy(image_file, images_path)
74
- image_name = image_file.name
75
- images_names.append(image_name)
76
-
77
- class_mapping[folder_name] = images_names
78
-
79
- original_image_count = len(list(images_path.glob("*.jpg"))) + len(list(images_path.glob("*.jpeg")))
80
-
81
- __delete_corrupted_images(images_path)
82
-
83
- current_dir = Path(__file__).resolve().parent
84
-
85
- weights_path = current_dir / 'small-generic.pt'
86
-
87
- github_release_url = 'https://github.com/orlandocloss/TwoStageInsectDetection/releases/download/models/small-generic.pt'
88
-
89
- if not weights_path.exists():
90
- __download_file_from_github_release(github_release_url, weights_path)
91
-
92
- # Add all required classes to safe globals
93
- serialization.add_safe_globals([
94
- DetectionModel, Sequential, Conv, Conv2d, BatchNorm2d,
95
- SiLU, ReLU, LeakyReLU, MaxPool2d, Linear, Dropout, Upsample,
96
- Module, ModuleList, ModuleDict,
97
- Bottleneck, C2f, SPPF, Detect, Concat, DFL
98
- ])
58
+ print(f"Using temporary directory: {temp_dir_path}")
59
+ print()
99
60
 
100
- model = YOLO(weights_path)
101
- model.predict(images_path, conf=0.25, save=True, save_txt=True, project=temp_dir_path)
102
- labels_path = temp_dir_path / "predict" / "labels"
103
-
104
- if size_filter and len(sizes) <= 2:
105
- filtered=filter_by_size(images_path, labels_path, sizes)
106
- print(f"\nFiltered {len(list(images_path.glob('*.jpg')))} images by size out of {original_image_count} input images.\n NOTE: Some images may be filtered due to corruption or inaccurate labels.")
107
-
108
- if one_stage:
109
-
110
- __delete_orphaned_images_and_inferences(images_path, labels_path)
111
- __delete_invalid_txt_files(images_path, labels_path)
112
- class_idxs = update_labels(class_mapping, labels_path)
113
- __split_data(class_mapping, temp_dir_path, output_directory)
114
-
115
- # __save_class_idx_to_file(class_idxs, output_directory)
116
- final_image_count = count_images_across_splits(output_directory)
117
- print(f"\nOut of {original_image_count} input images, {final_image_count} are eligible for detection. \nThese are saved across train, test and valid split in {output_directory}.")
118
- __generate_sample_images_with_detections(output_directory, class_idxs)
119
-
120
- if with_background:
121
- print("\nCollecting and splitting background images.")
122
-
123
- bg_images=int(final_image_count*0.06)
124
-
125
- search: dict[str, Any] = {
126
- "scientificName": ["Plantae"]
127
- }
128
-
129
- collect(
130
- group_by_key=Group.scientificName,
131
- search_parameters=search,
132
- images_per_group=bg_images,
133
- output_directory=temp_dir_path,
134
- num_threads=3
135
- )
136
-
137
- __delete_corrupted_images(temp_dir_path / "Plantae")
138
-
139
- __split_background_images(temp_dir_path / "Plantae", output_directory)
140
-
141
- __count_classes_and_output_table(output_directory, class_idxs)
61
+ # Step 1: Setup directories and copy images
62
+ print("STEP 1: Setting up directories and copying images...")
63
+ print("-" * 50)
64
+ class_mapping, original_image_count = _setup_directories_and_copy_images(
65
+ input_directory, temp_dir_path
66
+ )
67
+ print(f" Step 1 completed: {original_image_count} images copied from {len(class_mapping)} classes")
68
+ print()
69
+
70
+ # Step 2-3: Clean images and setup model
71
+ print("STEP 2: Cleaning images and setting up YOLO model...")
72
+ print("-" * 50)
73
+ weights_path = _prepare_model_and_clean_images(temp_dir_path)
74
+ print(f"✓ Step 2 completed: Model ready at {weights_path}")
75
+ print()
76
+
77
+ # Step 4: Run YOLO inference
78
+ print("STEP 3: Running YOLO inference to detect objects...")
79
+ print("-" * 50)
80
+ labels_path = _run_yolo_inference(temp_dir_path, weights_path)
81
+ print(f"✓ Step 3 completed: Labels generated at {labels_path}")
82
+ print()
83
+
84
+ # Step 5-6: Clean up labels and update class mapping
85
+ print("STEP 4: Cleaning up orphaned files and processing labels...")
86
+ print("-" * 50)
87
+ class_idxs = _cleanup_and_process_labels(
88
+ temp_dir_path, labels_path, class_mapping
89
+ )
90
+ print(f"✓ Step 4 completed: Processed {len(class_idxs)} classes")
91
+ print()
92
+
93
+ # Step 7-9: Finalize dataset
94
+ print("STEP 5: Creating classification dataset with cropped images...")
95
+ print("-" * 50)
96
+ _finalize_dataset(
97
+ class_mapping, temp_dir_path, output_directory,
98
+ class_idxs, original_image_count, img_size
99
+ )
100
+ print("✓ Step 5 completed: Classification dataset ready!")
101
+ print()
102
+
103
+ print("="*60)
104
+ print("BPLUSPLUS DATASET PREPARATION COMPLETED SUCCESSFULLY!")
105
+ print("="*60)
142
106
 
143
- __make_yaml_file(output_directory, class_idxs)
144
- else:
145
- try:
146
- sized_dir = temp_dir_path / "sized"
147
- sized_dir.mkdir(parents=True, exist_ok=True)
148
- __two_stage_update(class_mapping, filtered, sized_dir, images_path)
149
- __classification_split(sized_dir, output_directory)
150
- __count_classification_split(output_directory, class_mapping)
151
- except:
152
- __classification_split(images_path, output_directory)
153
- __count_classification_split(output_directory, class_mapping)
154
-
155
- def __count_classification_split(output_directory: str, class_mapping: dict):
107
+ def _setup_directories_and_copy_images(input_directory: Path, temp_dir_path: Path):
156
108
  """
157
- Counts the number of images in the train and valid splits for each class.
158
-
159
- Args:
160
- output_directory (str): Path to the output directory containing train and valid splits.
161
- class_mapping (dict): Dictionary mapping class names to image file names.
109
+ Sets up temporary directories and copies images from input directory.
110
+
111
+ Returns:
112
+ tuple: (class_mapping dict, original_image_count int)
162
113
  """
163
- class_counts = {}
164
- train_counts = {}
165
- valid_counts = {}
114
+ images_path = temp_dir_path / "images"
115
+ images_path.mkdir(parents=True, exist_ok=True)
116
+ print(f" Created temporary images directory: {images_path}")
166
117
 
167
- for class_name in class_mapping.keys():
168
- train_dir = output_directory / 'train' / class_name
169
- valid_dir = output_directory / 'valid' / class_name
170
-
171
- train_count = len(list(train_dir.glob("*.jpg"))) if train_dir.exists() else 0
172
- valid_count = len(list(valid_dir.glob("*.jpg"))) if valid_dir.exists() else 0
173
- total_count = train_count + valid_count
174
-
175
- class_counts[class_name] = total_count
176
- train_counts[class_name] = train_count
177
- valid_counts[class_name] = valid_count
178
-
179
- table = PrettyTable()
180
- table.field_names = ["Class", "Train", "Valid", "Total"]
181
- for class_name in class_mapping.keys():
182
- table.add_row([
183
- class_name,
184
- train_counts[class_name],
185
- valid_counts[class_name],
186
- class_counts[class_name]
187
- ])
188
- print(table)
189
- print(f"Saved in {output_directory}")
190
-
191
- def __classification_split(input_directory: str, output_directory: str):
118
+ class_mapping = {}
119
+ total_copied = 0
120
+
121
+ print(" Scanning input directory for class folders...")
122
+ class_folders = [d for d in input_directory.iterdir() if d.is_dir()]
123
+ print(f" Found {len(class_folders)} class folders")
124
+
125
+ for folder_directory in class_folders:
126
+ images_names = []
127
+ if folder_directory.is_dir():
128
+ folder_name = folder_directory.name
129
+ image_files = list(folder_directory.glob("*.jpg"))
130
+ print(f" Copying {len(image_files)} images from class '{folder_name}'...")
131
+
132
+ for image_file in image_files:
133
+ shutil.copy(image_file, images_path)
134
+ image_name = image_file.name
135
+ images_names.append(image_name)
136
+ total_copied += 1
137
+
138
+ class_mapping[folder_name] = images_names
139
+ print(f" ✓ {len(images_names)} images copied for class '{folder_name}'")
140
+
141
+ original_image_count = len(list(images_path.glob("*.jpg"))) + len(list(images_path.glob("*.jpeg")))
142
+ print(f" Total images in temporary directory: {original_image_count}")
143
+
144
+ return class_mapping, original_image_count
145
+
146
+ def _prepare_model_and_clean_images(temp_dir_path: Path):
192
147
  """
193
- Splits the data into train and validation sets for classification tasks.
148
+ Cleans corrupted images and downloads/prepares the YOLO model.
194
149
 
195
- Args:
196
- input_directory (str): Path to the input directory containing subdirectories of class names.
197
- output_directory (str): Path to the output directory where train and valid splits will be created.
150
+ Returns:
151
+ Path: weights_path for the YOLO model
198
152
  """
199
- input_directory = Path(input_directory)
200
- output_directory = Path(output_directory)
153
+ images_path = temp_dir_path / "images"
201
154
 
202
- # Create train and valid directories
203
- train_dir = output_directory / 'train'
204
- valid_dir = output_directory / 'valid'
155
+ # Clean corrupted images
156
+ print(" Checking for corrupted images...")
157
+ images_before = len(list(images_path.glob("*.jpg")))
158
+ __delete_corrupted_images(images_path)
159
+ images_after = len(list(images_path.glob("*.jpg")))
160
+ deleted_count = images_before - images_after
161
+ print(f" ✓ Cleaned {deleted_count} corrupted images ({images_after} images remain)")
205
162
 
206
- train_dir.mkdir(parents=True, exist_ok=True)
207
- valid_dir.mkdir(parents=True, exist_ok=True)
163
+ # Setup model weights
164
+ current_dir = Path(__file__).resolve().parent
165
+ weights_path = current_dir / 'v11small-generic.pt'
166
+ github_release_url = 'https://github.com/Tvenver/Bplusplus/releases/download/v1.2.3/v11small-generic.pt'
208
167
 
209
- # Process each class directory
210
- for class_dir in input_directory.iterdir():
211
- if not class_dir.is_dir():
212
- continue
213
-
214
- class_name = class_dir.name
215
- print(f"Processing class: {class_name}")
168
+ print(f" Checking for YOLO model weights at: {weights_path}")
169
+ if not weights_path.exists():
170
+ print(" Model weights not found, downloading from GitHub...")
171
+ __download_file_from_github_release(github_release_url, weights_path)
172
+ print(f" ✓ Model weights downloaded successfully")
173
+ else:
174
+ print(" Model weights already exist")
175
+
176
+ # Add all required classes to safe globals
177
+ serialization.add_safe_globals([
178
+ DetectionModel, Sequential, Conv, Conv2d, BatchNorm2d,
179
+ SiLU, ReLU, LeakyReLU, MaxPool2d, Linear, Dropout, Upsample,
180
+ Module, ModuleList, ModuleDict,
181
+ Bottleneck, C2f, SPPF, Detect, Concat, DFL,
182
+ # Add torch internal classes
183
+ torch.nn.parameter.Parameter,
184
+ torch.Tensor,
185
+ torch._utils._rebuild_tensor_v2,
186
+ torch._utils._rebuild_parameter
187
+ ])
188
+
189
+ return weights_path
190
+
191
+ def _run_yolo_inference(temp_dir_path: Path, weights_path: Path):
192
+ """
193
+ Runs YOLO inference on all images to generate labels.
194
+
195
+ Returns:
196
+ Path: labels_path where the generated labels are stored
197
+ """
198
+ images_path = temp_dir_path / "images"
199
+ labels_path = temp_dir_path / "predict" / "labels"
200
+
201
+ try:
202
+ print(f" Loading YOLO model from: {weights_path}")
203
+ model = YOLO(weights_path)
204
+ print(" ✓ YOLO model loaded successfully")
216
205
 
217
- # Create corresponding class directories in train and valid
218
- (train_dir / class_name).mkdir(exist_ok=True)
219
- (valid_dir / class_name).mkdir(exist_ok=True)
206
+ # Get list of all image files
207
+ image_files = list(images_path.glob('*.jpg'))
208
+ print(f" Found {len(image_files)} images to process with YOLO")
220
209
 
221
- # Get all image files
222
- image_files = list(class_dir.glob('*.jpg')) + list(class_dir.glob('*.jpeg')) + list(class_dir.glob('*.png'))
210
+ # Ensure predict directory exists
211
+ predict_dir = temp_dir_path / "predict"
212
+ predict_dir.mkdir(exist_ok=True)
213
+ labels_path.mkdir(parents=True, exist_ok=True)
214
+ print(f" Created prediction output directory: {predict_dir}")
223
215
 
224
- if not image_files:
225
- print(f"Warning: No images found in {class_dir}")
226
- continue
227
-
228
- # Shuffle the files to ensure random distribution
229
- np.random.shuffle(image_files)
216
+ result_count = 0
217
+ error_count = 0
230
218
 
231
- # Split into train (90%) and valid (10%)
232
- split_idx = int(len(image_files) * 0.9)
233
- train_files = image_files[:split_idx]
234
- valid_files = image_files[split_idx:]
219
+ print(" Starting YOLO inference...")
220
+ print(f" Progress: 0/{len(image_files)} images processed", end="", flush=True)
235
221
 
236
- # Copy files to respective directories
237
- for img_file in train_files:
238
- shutil.copy(img_file, train_dir / class_name / img_file.name)
239
-
240
- for img_file in valid_files:
241
- shutil.copy(img_file, valid_dir / class_name / img_file.name)
222
+ for i, img_path in enumerate(image_files, 1):
223
+ try:
224
+ results = model.predict(
225
+ source=str(img_path),
226
+ conf=0.35,
227
+ save=True,
228
+ save_txt=True,
229
+ project=temp_dir_path,
230
+ name="predict",
231
+ exist_ok=True,
232
+ verbose=False # Set to False to reduce YOLO's own output
233
+ )
234
+
235
+ result_count += 1
236
+
237
+ # Update progress every 10% or every 100 images, whichever is smaller
238
+ update_interval = max(1, min(100, len(image_files) // 10))
239
+ if i % update_interval == 0 or i == len(image_files):
240
+ print(f"\r Progress: {i}/{len(image_files)} images processed", end="", flush=True)
241
+
242
+ except Exception as e:
243
+ error_count += 1
244
+ print(f"\n Error processing {img_path.name}: {e}")
245
+ continue
242
246
 
243
- print(f" - {len(train_files)} images in train, {len(valid_files)} images in valid")
247
+ print() # New line after progress
248
+ print(f" ✓ YOLO inference completed: {result_count} successful, {error_count} failed")
249
+
250
+ # Verify labels were created
251
+ label_files = list(labels_path.glob("*.txt"))
252
+ print(f" Generated {len(label_files)} label files")
253
+
254
+ if len(label_files) == 0:
255
+ print("WARNING: No label files were created by the model prediction!")
256
+
257
+ except Exception as e:
258
+ print(f"Error during model prediction setup: {e}")
259
+ import traceback
260
+ traceback.print_exc()
244
261
 
245
- print(f"\nData split complete. Train and validation sets created in {output_directory}")
262
+ return labels_path
246
263
 
247
- def filter_by_size(images_path: Path, labels_path: Path, sizes: list):
264
+ def _cleanup_and_process_labels(temp_dir_path: Path, labels_path: Path, class_mapping: dict):
248
265
  """
249
- Filters images by size and updates labels accordingly.
250
-
251
- Args:
252
- images_path (Path): The path to the directory containing images.
253
- labels_path (Path): The path to the directory containing labels.
254
- sizes (list): A list of sizes to filter by.
266
+ Cleans up orphaned images and invalid labels, then creates class index mapping.
267
+
268
+ Returns:
269
+ dict: class_idxs mapping class indices to class names
255
270
  """
256
- size_map={
257
- "small": [0, 0.15],
258
- "medium": [0.15, 0.3],
259
- "large": [0.3, 1],
260
- }
261
-
262
- filtered_images = []
263
- for image_file in images_path.glob("*.jpg"):
264
- label_file = labels_path / (image_file.stem + ".txt")
265
- image_name = image_file.name
266
-
267
- if label_file.exists():
268
- with open(label_file, 'r') as file:
269
- lines = file.readlines()
270
- if len(lines) != 1:
271
- continue
272
- else:
273
- parts = lines[0].split()
274
- _, _, width, height = map(float, parts[1:])
275
- for size in sizes:
276
- if width < size_map[size][1] and width >= size_map[size][0] and height < size_map[size][1] and height >= size_map[size][0]:
277
- filtered_images.append(image_name)
271
+ images_path = temp_dir_path / "images"
278
272
 
279
- for image_file in images_path.glob("*.jpg"):
280
- label_file = labels_path / (image_file.stem + ".txt")
281
- image_name = image_file.name
282
- if image_name not in filtered_images:
283
- image_file.unlink()
284
- try:
285
- label_file.unlink()
286
- except FileNotFoundError:
287
- pass
288
- return filtered_images
273
+ print(" Cleaning up orphaned images and labels...")
274
+ images_before = len(list(images_path.glob("*.jpg")))
275
+ labels_before = len(list(labels_path.glob("*.txt")))
276
+
277
+ __delete_orphaned_images_and_inferences(images_path, labels_path)
278
+ __delete_invalid_txt_files(images_path, labels_path)
279
+
280
+ images_after = len(list(images_path.glob("*.jpg")))
281
+ labels_after = len(list(labels_path.glob("*.txt")))
282
+
283
+ deleted_images = images_before - images_after
284
+ deleted_labels = labels_before - labels_after
285
+ print(f" ✓ Cleaned up {deleted_images} orphaned images and {deleted_labels} invalid labels")
286
+ print(f" Final counts: {images_after} images, {labels_after} valid labels")
287
+
288
+ # Create class index mapping for classification
289
+ class_idxs = {}
290
+ for idx, class_name in enumerate(class_mapping.keys()):
291
+ class_idxs[idx] = class_name
292
+
293
+ print(f" Created class mapping for {len(class_idxs)} classes: {list(class_idxs.values())}")
294
+
295
+ return class_idxs
289
296
 
290
- def __two_stage_update(class_mapping: dict, filtered_images: Path, output_directory: Path, images_path: Path):
297
+ def _finalize_dataset(class_mapping: dict, temp_dir_path: Path, output_directory: Path,
298
+ class_idxs: dict, original_image_count: int, img_size: int):
291
299
  """
292
- Prepares folders with class name containing filtered images.
300
+ Finalizes the dataset by creating cropped classification images and splitting into train/valid sets.
293
301
  """
294
-
295
- for class_name, images in class_mapping.items():
296
- for image_name in images:
297
- if image_name in filtered_images:
298
- (output_directory / class_name).mkdir(parents=True, exist_ok=True)
299
- shutil.copy(images_path / image_name, output_directory / class_name / image_name)
302
+ # Split data into train/valid with cropped classification images
303
+ __classification_split(class_mapping, temp_dir_path, output_directory, img_size)
304
+
305
+ # Generate final report
306
+ print(" Generating final statistics...")
307
+ final_image_count = count_images_across_splits(output_directory)
308
+ print(f" Dataset Statistics:")
309
+ print(f" - Original images: {original_image_count}")
310
+ print(f" - Final cropped images: {final_image_count}")
311
+ print(f" - Success rate: {final_image_count/original_image_count*100:.1f}%")
312
+ print(f" - Output directory: {output_directory}")
300
313
 
301
314
  def __delete_corrupted_images(images_path: Path):
302
315
 
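The hunk above also registers every class pickled inside the checkpoint with torch.serialization.add_safe_globals so the weights can be deserialized under PyTorch's weights_only loading. A minimal standalone sketch of that pattern, with the allow-list abbreviated (a real checkpoint may need the full list shown in the diff; assumes PyTorch ≥ 2.4 and ultralytics installed, and the image path is illustrative):

    import torch
    from torch import serialization
    from ultralytics import YOLO
    from ultralytics.nn.tasks import DetectionModel

    # Allow-list the classes stored in the pickled checkpoint so that
    # torch.load(..., weights_only=True) is able to rebuild them.
    serialization.add_safe_globals([
        DetectionModel,
        torch.nn.parameter.Parameter,
        torch.Tensor,
        torch._utils._rebuild_tensor_v2,
        torch._utils._rebuild_parameter,
    ])

    model = YOLO("v11small-generic.pt")  # weights file name as downloaded in the diff
    results = model.predict("some_image.jpg", conf=0.35, save_txt=True)  # illustrative call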
@@ -367,7 +380,7 @@ def __delete_orphaned_images_and_inferences(images_path: Path, labels_path: Path
367
380
  image_file_jpeg = images_path / (txt_file.stem + ".jpeg")
368
381
 
369
382
  if not (image_file_jpg.exists() or image_file_jpeg.exists()):
370
- print(f"Deleting {txt_file.name} - No corresponding image file")
383
+ # print(f"Deleting {txt_file.name} - No corresponding image file")
371
384
  txt_file.unlink()
372
385
 
373
386
  label_stems = {txt_file.stem for txt_file in labels_path.glob("*.txt")}
@@ -375,10 +388,10 @@ def __delete_orphaned_images_and_inferences(images_path: Path, labels_path: Path
375
388
 
376
389
  for image_file in image_files:
377
390
  if image_file.stem not in label_stems:
378
- print(f"Deleting orphaned image: {image_file.name}")
391
+ # print(f"Deleting orphaned image: {image_file.name}")
379
392
  image_file.unlink()
380
393
 
381
- print("Orphaned images files without corresponding labels have been deleted.")
394
+
382
395
 
383
396
  def __delete_invalid_txt_files(images_path: Path, labels_path: Path):
384
397
 
@@ -400,7 +413,7 @@ def __delete_invalid_txt_files(images_path: Path, labels_path: Path):
400
413
  lines = file.readlines()
401
414
 
402
415
  if len(lines) == 0 or len(lines) > 1:
403
- print(f"Deleting {txt_file.name} - Invalid file")
416
+ # print(f"Deleting {txt_file.name} - Invalid file")
404
417
  txt_file.unlink()
405
418
 
406
419
  image_file_jpg = images_path / (txt_file.stem + ".jpg")
@@ -408,296 +421,193 @@ def __delete_invalid_txt_files(images_path: Path, labels_path: Path):
408
421
 
409
422
  if image_file_jpg.exists():
410
423
  image_file_jpg.unlink()
411
- print(f"Deleted corresponding image file: {image_file_jpg.name}")
424
+ # print(f"Deleted corresponding image file: {image_file_jpg.name}")
412
425
  elif image_file_jpeg.exists():
413
426
  image_file_jpeg.unlink()
414
- print(f"Deleted corresponding image file: {image_file_jpeg.name}")
427
+ # print(f"Deleted corresponding image file: {image_file_jpeg.name}")
415
428
 
416
- print("Invalid text files and their corresponding images files have been deleted.")
417
429
 
418
430
 
419
- def __split_data(class_mapping: dict, temp_dir_path: Path, output_directory: Path):
420
- """
421
- Splits the data into train, test, and validation sets.
422
431
 
432
+ def __classification_split(class_mapping: dict, temp_dir_path: Path, output_directory: Path, img_size: int):
433
+ """
434
+ Splits the data into train and validation sets for classification tasks,
435
+ cropping images according to their YOLO labels but preserving original class structure.
436
+
423
437
  Args:
424
438
  class_mapping (dict): A dictionary mapping class names to image file names.
425
439
  temp_dir_path (Path): The path to the temporary directory containing the images.
426
- output_directory (Path): The path to the output directory where the split data will be saved.
440
+ output_directory (Path): The path to the output directory where train and valid splits will be created.
441
+ img_size (int): The target size for the smallest dimension of cropped images.
427
442
  """
428
443
  images_dir = temp_dir_path / "images"
429
444
  labels_dir = temp_dir_path / "predict" / "labels"
430
-
431
- def create_dirs(split):
432
- (output_directory / split).mkdir(parents=True, exist_ok=True)
433
- (output_directory / split / "images").mkdir(parents=True, exist_ok=True)
434
- (output_directory / split / "labels").mkdir(parents=True, exist_ok=True)
435
-
436
- def copy_files(file_list, split):
437
- for image_file in file_list:
438
- image_file_path = images_dir / image_file
439
-
440
- if not image_file_path.exists():
441
- continue
442
-
443
- shutil.copy(image_file_path, output_directory / split / "images" / image_file_path.name)
444
-
445
- label_file = labels_dir / (image_file_path.stem + ".txt")
446
- if label_file.exists():
447
- shutil.copy(label_file, output_directory / split / "labels" / label_file.name)
448
-
449
- for split in ["train", "test", "valid"]:
450
- create_dirs(split)
451
-
452
- for _, files in class_mapping.items():
453
- random.shuffle(files)
454
- num_files = len(files)
455
-
456
- train_count = int(0.8 * num_files)
457
- test_count = int(0.1 * num_files)
458
- valid_count = num_files - train_count - test_count
459
-
460
- train_files = files[:train_count]
461
- test_files = files[train_count:train_count + test_count]
462
- valid_files = files[train_count + test_count:]
463
-
464
- copy_files(train_files, "train")
465
- copy_files(test_files, "test")
466
- copy_files(valid_files, "valid")
467
-
468
- print("Data has been split into train, test, and valid.")
469
-
470
- def __save_class_idx_to_file(class_idxs: dict, output_directory: Path):
471
- """
472
- Saves the class indices to a file.
473
-
474
- Args:
475
- class_idxs (dict): A dictionary mapping class names to class indices.
476
- output_directory (Path): The path to the output directory where the class index file will be saved.
477
- """
478
- class_idx_file = output_directory / "class_idx.txt"
479
- with open(class_idx_file, 'w') as f:
480
- for class_name, idx in class_idxs.items():
481
- f.write(f"{class_name}: {idx}\n")
482
- print(f"Class indices have been saved to {class_idx_file}")
483
-
484
- def __generate_sample_images_with_detections(main_dir: Path, class_idxs: dict):
485
-
486
- """
487
- Generates one sample image with multiple detections for each of train, test, valid, combining up to 6 images in one output.
488
-
489
- Args:
490
- main_dir (str): The main directory containing the train, test, and valid splits.
491
- """
492
-
493
- def resize_and_contain(image, target_size):
494
- image.thumbnail(target_size, Image.LANCZOS)
495
- new_image = Image.new("RGB", target_size, (0, 0, 0))
496
- new_image.paste(image, ((target_size[0] - image.width) // 2, (target_size[1] - image.height) // 2))
497
- return new_image
498
-
499
- def draw_bounding_boxes(image, labels_path, class_mapping, color_map):
500
- draw = ImageDraw.Draw(image)
501
- img_width, img_height = image.size
502
- try:
503
- font = ImageFont.truetype("DejaVuSans-Bold.ttf", 20)
504
- except IOError:
505
- font = ImageFont.load_default()
506
-
507
- if labels_path.exists():
508
- with open(labels_path, 'r') as label_file:
509
- for line in label_file.readlines():
510
- parts = line.strip().split()
511
- class_idx = int(parts[0])
512
- center_x, center_y, width, height = map(float, parts[1:])
513
- x_min = int((center_x - width / 2) * img_width)
514
- y_min = int((center_y - height / 2) * img_height)
515
- x_max = int((center_x + width / 2) * img_width)
516
- y_max = int((center_y + height / 2) * img_height)
517
- class_name = class_mapping.get(class_idx, str(class_idx))
518
- color = color_map[class_idx]
519
- draw.rectangle([x_min, y_min, x_max, y_max], outline=color, width=3)
520
- draw.text((x_min, y_min - 20), class_name, fill=color, font=font)
521
- return image
522
-
523
- def combine_images(images, grid_size=(3, 2), target_size=(416, 416)):
524
- resized_images = [resize_and_contain(img, target_size) for img in images]
525
- width, height = target_size
526
- combined_image = Image.new('RGB', (width * grid_size[0], height * grid_size[1]))
527
-
528
- for i, img in enumerate(resized_images):
529
- row = i // grid_size[0]
530
- col = i % grid_size[0]
531
- combined_image.paste(img, (col * width, row * height))
445
+
446
+ # Create train and valid directories
447
+ train_dir = output_directory / 'train'
448
+ valid_dir = output_directory / 'valid'
449
+
450
+ train_dir.mkdir(parents=True, exist_ok=True)
451
+ valid_dir.mkdir(parents=True, exist_ok=True)
452
+
453
+ # Create class directories based on class_mapping
454
+ print(f" Creating train and validation directories for {len(class_mapping)} classes...")
455
+ for class_name in class_mapping:
456
+ (train_dir / class_name).mkdir(exist_ok=True)
457
+ (valid_dir / class_name).mkdir(exist_ok=True)
458
+ print(f" ✓ Created directories for class: {class_name}")
459
+
460
+ # Process each class folder and its images
461
+ valid_images = []
462
+
463
+ # First, collect all valid label files
464
+ valid_label_stems = {label_file.stem for label_file in labels_dir.glob("*.txt")
465
+ if label_file.exists() and os.path.getsize(label_file) > 0}
466
+
467
+ print(f" Found {len(valid_label_stems)} valid label files for cropping")
468
+
469
+ print(" Starting image cropping and resizing...")
470
+ total_processed = 0
471
+ total_valid = 0
472
+
473
+ for class_name, image_names in class_mapping.items():
474
+ print(f" Processing class '{class_name}' ({len(image_names)} images)...")
475
+ class_processed = 0
476
+ class_valid = 0
477
+
478
+ for image_name in image_names:
479
+ # Check if the image exists in the images directory
480
+ image_path = images_dir / image_name
481
+ class_processed += 1
482
+ total_processed += 1
483
+
484
+ if not image_path.exists():
485
+ continue
486
+
487
+ # Skip images that don't have a valid label
488
+ if image_path.stem not in valid_label_stems:
489
+ continue
490
+
491
+ label_file = labels_dir / (image_path.stem + '.txt')
492
+
493
+ try:
494
+ img = Image.open(image_path)
495
+
496
+ if label_file.exists():
497
+ # If label exists, crop the image
498
+ with open(label_file, 'r') as f:
499
+ lines = f.readlines()
500
+ if lines:
501
+ parts = lines[0].strip().split()
502
+ if len(parts) >= 5:
503
+ x_center, y_center, width, height = map(float, parts[1:5])
504
+
505
+ img_width, img_height = img.size
506
+ x_min = int((x_center - width/2) * img_width)
507
+ y_min = int((y_center - height/2) * img_height)
508
+ x_max = int((x_center + width/2) * img_width)
509
+ y_max = int((y_center + height/2) * img_height)
510
+
511
+ x_min = max(0, x_min)
512
+ y_min = max(0, y_min)
513
+ x_max = min(img_width, x_max)
514
+ y_max = min(img_height, y_max)
515
+
516
+ img = img.crop((x_min, y_min, x_max, y_max))
517
+
518
+ img_width, img_height = img.size
519
+ if img_width < img_height:
520
+ # Width is smaller, set to img_size
521
+ new_width = img_size
522
+ new_height = int((img_height / img_width) * img_size)
523
+ else:
524
+ # Height is smaller, set to img_size
525
+ new_height = img_size
526
+ new_width = int((img_width / img_height) * img_size)
527
+
528
+ # Resize the image
529
+ img = img.resize((new_width, new_height), Image.LANCZOS)
530
+
531
+ valid_images.append((image_path, img, class_name))
532
+ class_valid += 1
533
+ total_valid += 1
534
+ except Exception as e:
535
+ print(f" Error processing {image_path}: {e}")
532
536
 
533
- return combined_image
534
-
535
- def generate_color_map(class_mapping):
536
- colors = ['red', 'blue', 'green', 'purple', 'orange', 'yellow', 'pink', 'cyan', 'magenta']
537
- color_map = {idx: random.choice(colors) for idx in class_mapping.keys()}
538
- return color_map
539
-
540
- splits = ['train', 'test', 'valid']
541
- class_mapping = class_idxs
542
- color_map = generate_color_map(class_mapping)
543
-
544
- for split in splits:
545
- images_dir = Path(main_dir) / split / 'images'
546
- labels_dir = Path(main_dir) / split / 'labels'
547
- image_files = list(images_dir.glob("*.jpg"))
548
- if not image_files:
549
- continue
537
+ print(f" ✓ Class '{class_name}': {class_valid} valid images from {class_processed} processed")
538
+
539
+ print(f" ✓ Successfully processed {total_valid} valid images from {total_processed} total images")
540
+
541
+ # Shuffle and split images
542
+ print(" Shuffling and splitting images into train/validation sets...")
543
+ random.shuffle(valid_images)
544
+ split_idx = int(len(valid_images) * 0.9)
545
+ train_images = valid_images[:split_idx]
546
+ valid_images_split = valid_images[split_idx:]
547
+
548
+ print(f" Split: {len(train_images)} training images, {len(valid_images_split)} validation images")
549
+
550
+ # Save images to train/valid directories
551
+ print(" Saving cropped and resized images...")
552
+ saved_train = 0
553
+ saved_valid = 0
554
+
555
+ for image_set, dest_dir, split_name in [(train_images, train_dir, "train"), (valid_images_split, valid_dir, "valid")]:
556
+ print(f" Saving {len(image_set)} images to {split_name} set...")
557
+ for orig_file, img, class_name in image_set:
558
+ output_path = dest_dir / class_name / (orig_file.stem + '.jpg')
559
+
560
+ # Convert any non-RGB mode to RGB before saving
561
+ if img.mode != 'RGB':
562
+ img = img.convert('RGB')
563
+
564
+ img.save(output_path, format='JPEG', quality=95)
565
+
566
+ if split_name == "train":
567
+ saved_train += 1
568
+ else:
569
+ saved_valid += 1
570
+
571
+ print(f" ✓ Saved {saved_train} train images and {saved_valid} validation images")
572
+
573
+ # Print detailed summary table
574
+ print(f" Final dataset summary:")
575
+ print()
576
+
577
+ # Calculate column widths for proper alignment
578
+ max_class_name_length = max(len(class_name) for class_name in class_mapping.keys())
579
+ class_col_width = max(max_class_name_length, len("Class"))
580
+
581
+ # Print table header
582
+ print(f" {'Class':<{class_col_width}} | {'Train':<7} | {'Valid':<7} | {'Total':<7}")
583
+ print(f" {'-' * class_col_width}-+-{'-' * 7}-+-{'-' * 7}-+-{'-' * 7}")
584
+
585
+ # Print data for each class and calculate totals
586
+ total_train = 0
587
+ total_valid = 0
588
+ total_overall = 0
589
+
590
+ for class_name in sorted(class_mapping.keys()): # Sort for consistent output
591
+ train_count = len(list((train_dir / class_name).glob('*.*')))
592
+ valid_count = len(list((valid_dir / class_name).glob('*.*')))
593
+ class_total = train_count + valid_count
550
594
 
551
- sample_images = []
552
- for image_file in image_files[:6]:
553
- label_file = labels_dir / (image_file.stem + '.txt')
554
- image = Image.open(image_file)
555
- image_with_boxes = draw_bounding_boxes(image, label_file, class_mapping, color_map)
556
- sample_images.append(image_with_boxes)
595
+ print(f" {class_name:<{class_col_width}} | {train_count:<7} | {valid_count:<7} | {class_total:<7}")
557
596
 
558
- if sample_images:
559
- combined_image = combine_images(sample_images, grid_size=(3, 2), target_size=(416, 416))
560
- combined_image_path = Path(main_dir) / split / f"{split}_sample_with_detections.jpg"
561
- combined_image.save(combined_image_path)
597
+ total_train += train_count
598
+ total_valid += valid_count
599
+ total_overall += class_total
562
600
 
563
-
564
- def __split_background_images(background_dir: Path, output_directory: Path):
565
- """
566
- Splits the background images into train, test, and validation sets.
567
-
568
- Args:
569
- temp_dir_path (Path): The path to the temporary directory containing the background images.
570
- output_directory (Path): The path to the output directory where the split background images will be saved.
571
- """
572
-
573
- image_files = list(Path(background_dir).glob("*.jpg"))
574
- random.shuffle(image_files)
575
-
576
- num_images = len(image_files)
577
- train_split = int(0.8 * num_images)
578
- valid_split = int(0.1 * num_images)
579
-
580
- train_files = image_files[:train_split]
581
- valid_files = image_files[train_split:train_split + valid_split]
582
- test_files = image_files[train_split + valid_split:]
583
-
584
- def copy_files(image_list, split):
585
- for image_file in image_list:
586
- shutil.copy(image_file, Path(output_directory) / split / 'images' / image_file.name)
587
-
588
- label_file = Path(output_directory) / split / 'labels' / (image_file.stem + ".txt")
589
- label_file.touch()
590
-
591
- copy_files(train_files, 'train')
592
- copy_files(valid_files, 'valid')
593
- copy_files(test_files, 'test')
594
-
595
- print(f"Background data has been split: {len(train_files)} train, {len(valid_files)} valid, {len(test_files)} test")
601
+ # Print totals row
602
+ print(f" {'-' * class_col_width}-+-{'-' * 7}-+-{'-' * 7}-+-{'-' * 7}")
603
+ print(f" {'TOTAL':<{class_col_width}} | {total_train:<7} | {total_valid:<7} | {total_overall:<7}")
604
+ print()
596
605
 
597
-
598
- def __count_classes_and_output_table(output_directory: Path, class_idxs: dict):
599
- """
600
- Counts the number of images per class and outputs a table.
601
-
602
- Args:
603
- output_directory (Path): The path to the output directory containing the split data.
604
- class_idxs (dict): A dictionary mapping class indices to class names.
605
- """
606
-
607
- def count_classes_in_split(labels_dir):
608
- class_counts = defaultdict(int)
609
- for label_file in os.listdir(labels_dir):
610
- if label_file.endswith(".txt"):
611
- label_path = os.path.join(labels_dir, label_file)
612
- with open(label_path, 'r') as f:
613
- lines = f.readlines()
614
- if not lines:
615
- # Count empty files as 'null' class (background images)
616
- class_counts['null'] += 1
617
- else:
618
- for line in lines:
619
- class_index = int(line.split()[0])
620
- class_counts[class_index] += 1
621
- return class_counts
622
-
623
- splits = ['train', 'test', 'valid']
624
- total_counts = defaultdict(int)
625
-
626
- table = PrettyTable()
627
- table.field_names = ["Class", "Class Index", "Train Count", "Test Count", "Valid Count", "Total"]
628
-
629
- split_counts = {split: defaultdict(int) for split in splits}
630
-
631
- for split in splits:
632
- labels_dir = output_directory / split / 'labels'
633
- if not os.path.exists(labels_dir):
634
- print(f"Warning: {labels_dir} does not exist, skipping {split}.")
635
- continue
636
-
637
- class_counts = count_classes_in_split(labels_dir)
638
- for class_index, count in class_counts.items():
639
- split_counts[split][class_index] = count
640
- total_counts[class_index] += count
641
-
642
- for class_index, total in total_counts.items():
643
- class_name = class_idxs.get(class_index, "Background" if class_index == 'null' else f"Class {class_index}")
644
- train_count = split_counts['train'].get(class_index, 0)
645
- test_count = split_counts['test'].get(class_index, 0)
646
- valid_count = split_counts['valid'].get(class_index, 0)
647
- table.add_row([class_name, class_index, train_count, test_count, valid_count, total])
648
-
649
- print(table)
650
-
651
- def update_labels(class_mapping: dict, labels_path: Path) -> dict:
652
- """
653
- Updates the labels based on the class mapping.
654
-
655
- Args:
656
- class_mapping (dict): A dictionary mapping class names to image file names.
657
- labels_path (Path): The path to the directory containing the label files.
658
-
659
- Returns:
660
- dict: A dictionary mapping class names to class indices.
661
- """
662
- class_index_mapping = {}
663
- class_index_definition = {}
664
-
665
- for idx, (class_name, images) in enumerate(class_mapping.items()):
666
- class_index_definition[idx] = class_name
667
- for image_name in images:
668
- class_index_mapping[image_name] = idx
669
-
670
- for txt_file in labels_path.glob("*.txt"):
671
- image_name_jpg = txt_file.stem + ".jpg"
672
- image_name_jpeg = txt_file.stem + ".jpeg"
673
-
674
- if image_name_jpg in class_index_mapping:
675
- class_index = class_index_mapping[image_name_jpg]
676
- elif image_name_jpeg in class_index_mapping:
677
- class_index = class_index_mapping[image_name_jpeg]
678
- else:
679
- print(f"Warning: No corresponding image found for {txt_file.name}")
680
- continue
681
-
682
- with open(txt_file, 'r') as file:
683
- lines = file.readlines()
684
-
685
- updated_lines = []
686
- for line in lines:
687
- parts = line.split()
688
- if len(parts) > 0:
689
- parts[0] = str(class_index)
690
- updated_lines.append(" ".join(parts))
691
-
692
- with open(txt_file, 'w') as file:
693
- file.write("\n".join(updated_lines))
694
-
695
- print(f"Labels updated successfully")
696
- return class_index_definition
606
+ print(f" ✓ Classification dataset created successfully at: {output_directory}")
697
607
 
698
608
  def count_images_across_splits(output_directory: Path) -> int:
699
609
  """
700
- Counts the total number of images across train, test, and validation splits.
610
+ Counts the total number of images across train and validation splits for classification dataset.
701
611
 
702
612
  Args:
703
613
  output_directory (Path): The path to the output directory containing the split data.
@@ -706,33 +616,12 @@ def count_images_across_splits(output_directory: Path) -> int:
706
616
  int: The total number of images across all splits.
707
617
  """
708
618
  total_images = 0
709
- for split in ['train', 'test', 'valid']:
710
- split_dir = output_directory / split / 'images'
711
- total_images += len(list(split_dir.glob("*.jpg"))) + len(list(split_dir.glob("*.jpeg")))
712
-
713
- return total_images
714
-
715
- def __make_yaml_file(output_directory: Path, class_idxs: dict):
716
- """
717
- Creates a YAML configuration file for YOLOv8.
718
-
719
- Args:
720
- output_directory (Path): The path to the output directory where the YAML file will be saved.
721
- class_idxs (dict): A dictionary mapping class indices to class names.
722
- """
723
-
724
- # Define the structure of the YAML file
725
- yaml_content = {
726
- 'path': str(output_directory.resolve()),
727
- 'train': 'train/images',
728
- 'val': 'valid/images',
729
- 'test': 'test/images',
730
- 'names': {idx: name for idx, name in class_idxs.items()}
731
- }
732
-
733
- # Write the YAML content to a file
734
- yaml_file_path = output_directory / 'dataset.yaml'
735
- with open(yaml_file_path, 'w') as yaml_file:
736
- yaml.dump(yaml_content, yaml_file, default_flow_style=False, sort_keys=False)
737
-
738
- print(f"YOLOv8 YAML file created at {yaml_file_path}")
619
+ for split in ['train', 'valid']:
620
+ split_dir = output_directory / split
621
+ if split_dir.exists():
622
+ # Count all images in all class subdirectories
623
+ for class_dir in split_dir.iterdir():
624
+ if class_dir.is_dir():
625
+ total_images += len(list(class_dir.glob("*.jpg"))) + len(list(class_dir.glob("*.jpeg")))
626
+
627
+ return total_images
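For readers following the new __classification_split logic above: each YOLO label line stores a normalized box (class x_center y_center width height); the code converts it to pixel coordinates, crops the detection, and resizes the crop so its smaller side equals img_size. A standalone sketch of that arithmetic (file name and label line are illustrative, not taken from the package):

    from PIL import Image

    def crop_and_resize(image_path, label_line, img_size=40):
        """Crop one detection out of an image and scale its smallest side to img_size."""
        img = Image.open(image_path)
        img_w, img_h = img.size

        # YOLO label format: class x_center y_center width height (all normalized 0..1)
        _, xc, yc, w, h = map(float, label_line.split()[:5])
        x_min = max(0, int((xc - w / 2) * img_w))
        y_min = max(0, int((yc - h / 2) * img_h))
        x_max = min(img_w, int((xc + w / 2) * img_w))
        y_max = min(img_h, int((yc + h / 2) * img_h))
        crop = img.crop((x_min, y_min, x_max, y_max))

        # Scale so the smaller dimension becomes img_size, preserving aspect ratio.
        cw, ch = crop.size
        if cw < ch:
            new_size = (img_size, int(ch / cw * img_size))
        else:
            new_size = (int(cw / ch * img_size), img_size)
        return crop.resize(new_size, Image.LANCZOS)

    # Example: a box centred in the image covering half of each dimension.
    # crop_and_resize("some_image.jpg", "0 0.5 0.5 0.5 0.5", img_size=40)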