bplusplus 1.2.1.tar.gz → 1.2.2.tar.gz

This diff shows the content changes between publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the versions as they appear in their public registry.

Potentially problematic release: this version of bplusplus might be problematic.

@@ -1,6 +1,6 @@
- Metadata-Version: 2.1
+ Metadata-Version: 2.3
  Name: bplusplus
- Version: 1.2.1
+ Version: 1.2.2
  Summary: A simple method to create AI models for biodiversity, with collect and prepare pipeline
  License: MIT
  Author: Titus Venverloo
@@ -12,6 +12,7 @@ Classifier: Programming Language :: Python :: 3.9
  Classifier: Programming Language :: Python :: 3.10
  Classifier: Programming Language :: Python :: 3.11
  Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
  Requires-Dist: prettytable (==3.7.0)
  Requires-Dist: pygbif (>=0.6.4,<0.7.0)
  Requires-Dist: requests (==2.25.1)
@@ -111,14 +112,21 @@ This function takes three arguments:
  - **sizes: list = None** - List of sizes to filter by. If None, all sizes will be used, ["large", "medium", "small"].

  ```python
- # Prepare data
+ # Prepare data (one stage small insects)
  bplusplus.prepare(
      input_directory='/dataset/selected-species',
      output_directory='/dataset/prepared-data',
-     with_background=False,
-     one_stage=False,
+     with_background=True,
+     one_stage=True,
      size_filter=True,
-     sizes=["large"]
+     sizes=["small"]
+ )
+
+ # Prepare data (two stage)
+ bplusplus.prepare(
+     input_directory='/dataset/selected-species',
+     output_directory='/dataset/prepared-data',
+     one_stage=False
  )
  ```

@@ -87,14 +87,21 @@ This function takes three arguments:
  - **sizes: list = None** - List of sizes to filter by. If None, all sizes will be used, ["large", "medium", "small"].

  ```python
- # Prepare data
+ # Prepare data (one stage small insects)
  bplusplus.prepare(
      input_directory='/dataset/selected-species',
      output_directory='/dataset/prepared-data',
-     with_background=False,
-     one_stage=False,
+     with_background=True,
+     one_stage=True,
      size_filter=True,
-     sizes=["large"]
+     sizes=["small"]
+ )
+
+ # Prepare data (two stage)
+ bplusplus.prepare(
+     input_directory='/dataset/selected-species',
+     output_directory='/dataset/prepared-data',
+     one_stage=False
  )
  ```

@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "bplusplus"
- version = "1.2.1"
+ version = "1.2.2"
  description = "A simple method to create AI models for biodiversity, with collect and prepare pipeline"
  authors = ["Titus Venverloo <tvenver@mit.edu>", "Deniz Aydemir <deniz@aydemir.us>", "Orlando Closs <orlando.closs@wur.nl>", "Ase Hatveit <aase@mit.edu>"]
  license = "MIT"
@@ -114,6 +114,8 @@ def __next_batch(parameters: dict[str, Any], total_limit: int, offset: int, curr
      parameters["limit"] = total_limit
      parameters["offset"] = offset
      parameters["mediaType"] = ["StillImage"]
+     parameters["basisOfRecord"] = ["HUMAN_OBSERVATION", "LIVING_SPECIMEN", "MACHINE_OBSERVATION", "OBSERVATION", "OCCURRENCE"]
+     parameters["lifeStage"] = ["Adult"]
      search = pygbif.occurrences.search(**parameters)
      occurrences = search["results"]
      if search["endOfRecords"] or len(current) >= total_limit:
@@ -115,17 +115,17 @@ class HierarchicalInsectClassifier(nn.Module):
  def get_taxonomy(species_list):
      """
      Retrieves taxonomic information for a list of species from GBIF API.
-     Creates a hierarchical taxonomy dictionary with order, family, and species relationships.
+     Creates a hierarchical taxonomy dictionary with family, genus, and species relationships.
      """
      taxonomy = {1: [], 2: {}, 3: {}}
-     species_to_family = {}
-     family_to_order = {}
+     species_to_genus = {}
+     genus_to_family = {}

      logger.info(f"Building taxonomy from GBIF for {len(species_list)} species")

      print("\nTaxonomy Results:")
      print("-" * 80)
-     print(f"{'Species':<30} {'Order':<20} {'Family':<20} {'Status'}")
+     print(f"{'Species':<30} {'Family':<20} {'Genus':<20} {'Status'}")
      print("-" * 80)

      for species_name in species_list:
@@ -136,23 +136,23 @@ def get_taxonomy(species_list):
      if data.get('status') == 'ACCEPTED' or data.get('status') == 'SYNONYM':
          family = data.get('family')
-         order = data.get('order')
+         genus = data.get('genus')

-         if family and order:
+         if family and genus:
              status = "OK"

-             print(f"{species_name:<30} {order:<20} {family:<20} {status}")
+             print(f"{species_name:<30} {family:<20} {genus:<20} {status}")

-             species_to_family[species_name] = family
-             family_to_order[family] = order
+             species_to_genus[species_name] = genus
+             genus_to_family[genus] = family

-             if order not in taxonomy[1]:
-                 taxonomy[1].append(order)
+             if family not in taxonomy[1]:
+                 taxonomy[1].append(family)

-             taxonomy[2][family] = order
-             taxonomy[3][species_name] = family
+             taxonomy[2][genus] = family
+             taxonomy[3][species_name] = genus
      else:
-         error_msg = f"Species '{species_name}' found in GBIF but family and order not found, could be spelling error in species, check GBIF"
+         error_msg = f"Species '{species_name}' found in GBIF but family and genus not found, could be spelling error in species, check GBIF"
          logger.error(error_msg)
          print(f"{species_name:<30} {'Not found':<20} {'Not found':<20} ERROR")
          print(f"Error: {error_msg}")
@@ -174,24 +174,24 @@ def get_taxonomy(species_list):
      taxonomy[1] = sorted(list(set(taxonomy[1])))
      print("-" * 80)

-     num_orders = len(taxonomy[1])
-     num_families = len(taxonomy[2])
+     num_families = len(taxonomy[1])
+     num_genera = len(taxonomy[2])
      num_species = len(taxonomy[3])

-     print("\nOrder indices:")
-     for i, order in enumerate(taxonomy[1]):
-         print(f" {i}: {order}")
-
      print("\nFamily indices:")
-     for i, family in enumerate(taxonomy[2].keys()):
+     for i, family in enumerate(taxonomy[1]):
          print(f" {i}: {family}")

+     print("\nGenus indices:")
+     for i, genus in enumerate(taxonomy[2].keys()):
+         print(f" {i}: {genus}")
+
      print("\nSpecies indices:")
      for i, species in enumerate(species_list):
          print(f" {i}: {species}")

-     logger.info(f"Taxonomy built: {num_orders} orders, {num_families} families, {num_species} species")
-     return taxonomy, species_to_family, family_to_order
+     logger.info(f"Taxonomy built: {num_families} families, {num_genera} genera, {num_species} species")
+     return taxonomy, species_to_genus, genus_to_family

  def create_mappings(taxonomy):
      """Create index mappings from taxonomy"""
@@ -244,12 +244,12 @@ class TestTwoStage:
              saved_species = checkpoint["species_list"]
              print(f"Saved model was trained on: {', '.join(saved_species)}")

-             taxonomy, species_to_family, family_to_order = get_taxonomy(species_names)
+             taxonomy, species_to_genus, genus_to_family = get_taxonomy(species_names)
          else:
-             taxonomy, species_to_family, family_to_order = get_taxonomy(species_names)
+             taxonomy, species_to_genus, genus_to_family = get_taxonomy(species_names)
      else:
          state_dict = checkpoint
-         taxonomy, species_to_family, family_to_order = get_taxonomy(species_names)
+         taxonomy, species_to_genus, genus_to_family = get_taxonomy(species_names)

      level_to_idx, idx_to_level = create_mappings(taxonomy)

@@ -259,8 +259,6 @@ class TestTwoStage:
      if hasattr(taxonomy, "items"):
          num_classes_per_level = [len(classes) if isinstance(classes, list) else len(classes.keys())
                                   for level, classes in taxonomy.items()]
-     else:
-         num_classes_per_level = [4, 5, 9] # Example values, adjust as needed

      print(f"Using model with class counts: {num_classes_per_level}")

@@ -296,8 +294,8 @@ class TestTwoStage:
      print("Model successfully loaded")
      print(f"Using species: {', '.join(species_names)}")

-     self.species_to_family = species_to_family
-     self.family_to_order = family_to_order
+     self.species_to_genus = species_to_genus
+     self.genus_to_family = genus_to_family

  def get_frames(self, test_dir):
      image_dir = os.path.join(test_dir, "images")
@@ -305,10 +303,10 @@ class TestTwoStage:
      predicted_frames = []
      predicted_family_frames = []
-     predicted_order_frames = []
+     predicted_genus_frames = []
      true_species_frames = []
      true_family_frames = []
-     true_order_frames = []
+     true_genus_frames = []
      image_names = []

      start_time = time.time() # Start timing
@@ -326,7 +324,7 @@ class TestTwoStage:
      detections = results[0].boxes
      predicted_frame = []
      predicted_family_frame = []
-     predicted_order_frame = []
+     predicted_genus_frame = []

      if detections:
          for box in detections:
@@ -346,13 +344,13 @@ class TestTwoStage:
      outputs = self.classification_model(input_tensor)

      # Get all taxonomic level predictions
-     order_output = outputs[0] # First output is order (level 1)
-     family_output = outputs[1] # Second output is family (level 2)
+     family_output = outputs[0] # First output is family (level 1)
+     genus_output = outputs[1] # Second output is genus (level 2)
      species_output = outputs[2] # Third output is species (level 3)

      # Get prediction indices
-     order_idx = order_output.argmax(dim=1).item()
      family_idx = family_output.argmax(dim=1).item()
+     genus_idx = genus_output.argmax(dim=1).item()
      species_idx = species_output.argmax(dim=1).item()

      img_height, img_width, _ = frame.shape
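The indexing above assumes the classifier exposes one output head per taxonomic level, ordered family, genus, species. A minimal sketch of that layout (a hypothetical stand-in; the real `HierarchicalInsectClassifier` is not shown in this diff):

```python
import torch
import torch.nn as nn

class ThreeHeadSketch(nn.Module):
    """Hypothetical three-head classifier matching the output order above."""
    def __init__(self, feat_dim: int, n_family: int, n_genus: int, n_species: int):
        super().__init__()
        self.backbone = nn.Sequential(nn.Flatten(), nn.LazyLinear(feat_dim), nn.ReLU())
        self.family_head = nn.Linear(feat_dim, n_family)    # outputs[0]
        self.genus_head = nn.Linear(feat_dim, n_genus)      # outputs[1]
        self.species_head = nn.Linear(feat_dim, n_species)  # outputs[2]

    def forward(self, x):
        h = self.backbone(x)
        return self.family_head(h), self.genus_head(h), self.species_head(h)

model = ThreeHeadSketch(feat_dim=128, n_family=2, n_genus=3, n_species=5)
outputs = model(torch.randn(1, 3, 64, 64))
family_idx = outputs[0].argmax(dim=1).item()  # same indexing as the diff
```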
@@ -367,15 +365,15 @@ class TestTwoStage:
      # Add predictions for each taxonomic level
      predicted_frame.append([species_idx] + box_coords)
      predicted_family_frame.append([family_idx] + box_coords)
-     predicted_order_frame.append([order_idx] + box_coords)
+     predicted_genus_frame.append([genus_idx] + box_coords)

      predicted_frames.append(predicted_frame if predicted_frame else [])
      predicted_family_frames.append(predicted_family_frame if predicted_family_frame else [])
-     predicted_order_frames.append(predicted_order_frame if predicted_order_frame else [])
+     predicted_genus_frames.append(predicted_genus_frame if predicted_genus_frame else [])

      true_species_frame = []
      true_family_frame = []
-     true_order_frame = []
+     true_genus_frame = []

      if os.path.exists(label_path) and os.path.getsize(label_path) > 0:
          with open(label_path, 'r') as f:
@@ -389,22 +387,22 @@ class TestTwoStage:
      if species_idx < len(self.species_names):
          species_name = self.species_names[species_idx]

-         if species_name in self.species_to_family:
-             family_name = self.species_to_family[species_name]
-             # Get the index of the family in the level_to_idx mapping
-             if 2 in self.level_to_idx and family_name in self.level_to_idx[2]:
-                 family_idx = self.level_to_idx[2][family_name]
-                 true_family_frame.append([family_idx] + box_coords)
+         if species_name in self.species_to_genus:
+             genus_name = self.species_to_genus[species_name]
+             # Get the index of the genus in the level_to_idx mapping
+             if 2 in self.level_to_idx and genus_name in self.level_to_idx[2]:
+                 genus_idx = self.level_to_idx[2][genus_name]
+                 true_genus_frame.append([genus_idx] + box_coords)

-             if family_name in self.family_to_order:
-                 order_name = self.family_to_order[family_name]
-                 if 1 in self.level_to_idx and order_name in self.level_to_idx[1]:
-                     order_idx = self.level_to_idx[1][order_name]
-                     true_order_frame.append([order_idx] + box_coords)
+             if genus_name in self.genus_to_family:
+                 family_name = self.genus_to_family[genus_name]
+                 if 1 in self.level_to_idx and family_name in self.level_to_idx[1]:
+                     family_idx = self.level_to_idx[1][family_name]
+                     true_family_frame.append([family_idx] + box_coords)

      true_species_frames.append(true_species_frame if true_species_frame else [])
      true_family_frames.append(true_family_frame if true_family_frame else [])
-     true_order_frames.append(true_order_frame if true_order_frame else [])
+     true_genus_frames.append(true_genus_frame if true_genus_frame else [])

      end_time = time.time() # End timing

@@ -416,42 +414,42 @@ class TestTwoStage:
      writer.writerow([
          "Image Name",
          "True Species Detections",
+         "True Genus Detections",
          "True Family Detections",
-         "True Order Detections",
          "Species Detections",
-         "Family Detections",
-         "Order Detections"
+         "Genus Detections",
+         "Family Detections"
      ])

-     for image_name, true_species, true_family, true_order, species_pred, family_pred, order_pred in zip(
+     for image_name, true_species, true_genus, true_family, species_pred, genus_pred, family_pred in zip(
          image_names,
          true_species_frames,
+         true_genus_frames,
          true_family_frames,
-         true_order_frames,
          predicted_frames,
-         predicted_family_frames,
-         predicted_order_frames
+         predicted_genus_frames,
+         predicted_family_frames
      ):
          writer.writerow([
              image_name,
              true_species,
+             true_genus,
              true_family,
-             true_order,
              species_pred,
-             family_pred,
-             order_pred
+             genus_pred,
+             family_pred
          ])

      print(f"Results saved to {output_file}")
-     return predicted_frames, true_species_frames, end_time - start_time, predicted_family_frames, predicted_order_frames, true_family_frames, true_order_frames
+     return predicted_frames, true_species_frames, end_time - start_time, predicted_genus_frames, predicted_family_frames, true_genus_frames, true_family_frames

  def run(self, test_dir):
      results = self.get_frames(test_dir)
      predicted_frames, true_species_frames, total_time = results[0], results[1], results[2]
-     predicted_family_frames = results[3]
-     predicted_order_frames = results[4]
-     true_family_frames = results[5]
-     true_order_frames = results[6]
+     predicted_genus_frames = results[3]
+     predicted_family_frames = results[4]
+     true_genus_frames = results[5]
+     true_family_frames = results[6]

      num_frames = len(os.listdir(os.path.join(test_dir, 'images')))
      avg_time_per_frame = total_time / num_frames
@@ -461,29 +459,29 @@ class TestTwoStage:
      self.calculate_metrics(
          predicted_frames, true_species_frames,
-         predicted_family_frames, true_family_frames,
-         predicted_order_frames, true_order_frames
+         predicted_genus_frames, true_genus_frames,
+         predicted_family_frames, true_family_frames
      )

  def calculate_metrics(self, predicted_species_frames, true_species_frames,
-                       predicted_family_frames, true_family_frames,
-                       predicted_order_frames, true_order_frames):
+                       predicted_genus_frames, true_genus_frames,
+                       predicted_family_frames, true_family_frames):
      """Calculate metrics at all taxonomic levels"""
-     # Get list of species, families and orders
+     # Get list of species, families and genera
      species_list = self.species_names
-     family_list = sorted(list(set(self.species_to_family.values())))
-     order_list = sorted(list(set(self.family_to_order.values())))
+     genus_list = sorted(list(set(self.species_to_genus.values())))
+     family_list = sorted(list(set(self.genus_to_family.values())))

      # Print the index mappings we're using for evaluation
      print("\nUsing the following index mappings for evaluation:")
-     print("\nOrder indices:")
-     for i, order in enumerate(order_list):
-         print(f" {i}: {order}")
-
      print("\nFamily indices:")
      for i, family in enumerate(family_list):
          print(f" {i}: {family}")

+     print("\nGenus indices:")
+     for i, genus in enumerate(genus_list):
+         print(f" {i}: {genus}")
+
      print("\nSpecies indices:")
      for i, species in enumerate(species_list):
          print(f" {i}: {species}")
@@ -491,11 +489,11 @@ class TestTwoStage:
      # Dictionary to track prediction category counts for debugging
      prediction_counts = {
          "true_species_boxes": sum(len(frame) for frame in true_species_frames),
+         "true_genus_boxes": sum(len(frame) for frame in true_genus_frames),
          "true_family_boxes": sum(len(frame) for frame in true_family_frames),
-         "true_order_boxes": sum(len(frame) for frame in true_order_frames),
          "predicted_species": sum(len(frame) for frame in predicted_species_frames),
-         "predicted_family": sum(len(frame) for frame in predicted_family_frames),
-         "predicted_order": sum(len(frame) for frame in predicted_order_frames)
+         "predicted_genus": sum(len(frame) for frame in predicted_genus_frames),
+         "predicted_family": sum(len(frame) for frame in predicted_family_frames)
      }

      print(f"Prediction counts: {prediction_counts}")
@@ -504,11 +502,11 @@ class TestTwoStage:
      print("\n=== Species-level Metrics ===")
      self.get_metrics(predicted_species_frames, true_species_frames, species_list)

+     print("\n=== Genus-level Metrics ===")
+     self.get_metrics(predicted_genus_frames, true_genus_frames, genus_list)
+
      print("\n=== Family-level Metrics ===")
      self.get_metrics(predicted_family_frames, true_family_frames, family_list)
-
-     print("\n=== Order-level Metrics ===")
-     self.get_metrics(predicted_order_frames, true_order_frames, order_list)

  def get_metrics(self, predicted_frames, true_frames, labels):
      """Calculate metrics for object detection predictions"""
@@ -144,17 +144,17 @@ def train_multitask(batch_size=4, epochs=30, patience=3, img_size=640, data_dir=
  def get_taxonomy(species_list):
      """
      Retrieves taxonomic information for a list of species from GBIF API.
-     Creates a hierarchical taxonomy dictionary with order, family, and species relationships.
+     Creates a hierarchical taxonomy dictionary with family, genus, and species relationships.
      """
      taxonomy = {1: [], 2: {}, 3: {}}
-     species_to_family = {}
-     family_to_order = {}
+     species_to_genus = {}
+     genus_to_family = {}

      logger.info(f"Building taxonomy from GBIF for {len(species_list)} species")

      print("\nTaxonomy Results:")
      print("-" * 80)
-     print(f"{'Species':<30} {'Order':<20} {'Family':<20} {'Status'}")
+     print(f"{'Species':<30} {'Family':<20} {'Genus':<20} {'Status'}")
      print("-" * 80)

      for species_name in species_list:
@@ -165,23 +165,23 @@ def get_taxonomy(species_list):
      if data.get('status') == 'ACCEPTED' or data.get('status') == 'SYNONYM':
          family = data.get('family')
-         order = data.get('order')
+         genus = data.get('genus')

-         if family and order:
+         if family and genus:
              status = "OK"

-             print(f"{species_name:<30} {order:<20} {family:<20} {status}")
+             print(f"{species_name:<30} {family:<20} {genus:<20} {status}")

-             species_to_family[species_name] = family
-             family_to_order[family] = order
+             species_to_genus[species_name] = genus
+             genus_to_family[genus] = family

-             if order not in taxonomy[1]:
-                 taxonomy[1].append(order)
+             if family not in taxonomy[1]:
+                 taxonomy[1].append(family)

-             taxonomy[2][family] = order
-             taxonomy[3][species_name] = family
+             taxonomy[2][genus] = family
+             taxonomy[3][species_name] = genus
      else:
-         error_msg = f"Species '{species_name}' found in GBIF but family and order not found, could be spelling error in species, check GBIF"
+         error_msg = f"Species '{species_name}' found in GBIF but family and genus not found, could be spelling error in species, check GBIF"
          logger.error(error_msg)
          print(f"{species_name:<30} {'Not found':<20} {'Not found':<20} ERROR")
          print(f"Error: {error_msg}")
@@ -203,23 +203,23 @@ def get_taxonomy(species_list):
      taxonomy[1] = sorted(list(set(taxonomy[1])))
      print("-" * 80)

-     num_orders = len(taxonomy[1])
-     num_families = len(taxonomy[2])
+     num_families = len(taxonomy[1])
+     num_genera = len(taxonomy[2])
      num_species = len(taxonomy[3])

-     print("\nOrder indices:")
-     for i, order in enumerate(taxonomy[1]):
-         print(f" {i}: {order}")
-
      print("\nFamily indices:")
-     for i, family in enumerate(taxonomy[2].keys()):
+     for i, family in enumerate(taxonomy[1]):
          print(f" {i}: {family}")

+     print("\nGenus indices:")
+     for i, genus in enumerate(taxonomy[2].keys()):
+         print(f" {i}: {genus}")
+
      print("\nSpecies indices:")
      for i, species in enumerate(species_list):
          print(f" {i}: {species}")

-     logger.info(f"Taxonomy built: {num_orders} orders, {num_families} families, {num_species} species")
+     logger.info(f"Taxonomy built: {num_families} families, {num_genera} genera, {num_species} species")
      return taxonomy

  def get_species_from_directory(train_dir):
@@ -276,15 +276,15 @@ class InsectDataset(Dataset):
      self.level_to_idx = level_to_idx
      self.samples = []

-     species_to_family = {species: family for species, family in taxonomy[3].items()}
-     family_to_order = {family: order for family, order in taxonomy[2].items()}
+     species_to_genus = {species: genus for species, genus in taxonomy[3].items()}
+     genus_to_family = {genus: family for genus, family in taxonomy[2].items()}

      for species_name in os.listdir(root_dir):
          species_path = os.path.join(root_dir, species_name)
          if os.path.isdir(species_path):
-             if species_name in species_to_family:
-                 family_name = species_to_family[species_name]
-                 order_name = family_to_order[family_name]
+             if species_name in species_to_genus:
+                 genus_name = species_to_genus[species_name]
+                 family_name = genus_to_family[genus_name]

                  for img_file in os.listdir(species_path):
                      if img_file.endswith(('.jpg', '.png', '.jpeg')):
@@ -296,7 +296,7 @@ class InsectDataset(Dataset):
      # Only add valid images to samples
      self.samples.append({
          'image_path': img_path,
-         'labels': [order_name, family_name, species_name]
+         'labels': [family_name, genus_name, species_name]
      })

  except Exception as e:
@@ -94,19 +94,73 @@ def prepare(input_directory: str, output_directory: str, one_stage: bool = False
      DetectionModel, Sequential, Conv, Conv2d, BatchNorm2d,
      SiLU, ReLU, LeakyReLU, MaxPool2d, Linear, Dropout, Upsample,
      Module, ModuleList, ModuleDict,
-     Bottleneck, C2f, SPPF, Detect, Concat, DFL
+     Bottleneck, C2f, SPPF, Detect, Concat, DFL,
+     # Add torch internal classes
+     torch.nn.parameter.Parameter,
+     torch.Tensor,
+     torch._utils._rebuild_tensor_v2,
+     torch._utils._rebuild_parameter
  ])
-
- model = YOLO(weights_path)
- model.predict(images_path, conf=0.25, save=True, save_txt=True, project=temp_dir_path)
- labels_path = temp_dir_path / "predict" / "labels"

- if size_filter and len(sizes) <= 2:
-     filtered=filter_by_size(images_path, labels_path, sizes)
-     print(f"\nFiltered {len(list(images_path.glob('*.jpg')))} images by size out of {original_image_count} input images.\n NOTE: Some images may be filtered due to corruption or inaccurate labels.")
+ labels_path = temp_dir_path / "predict" / "labels"
+
+ try:
+     print(f"Loading YOLO model from {weights_path}")
+     model = YOLO(weights_path)
+
+     # Get list of all image files
+     image_files = list(images_path.glob('*.jpg'))
+     print(f"Found {len(image_files)} images to process")
+
+     # Ensure predict directory exists
+     predict_dir = temp_dir_path / "predict"
+     predict_dir.mkdir(exist_ok=True)
+     labels_path.mkdir(parents=True, exist_ok=True)
+
+     result_count = 0
+     error_count = 0
+
+     for img_path in image_files:
+         try:
+             results = model.predict(
+                 source=str(img_path),
+                 conf=0.5,
+                 save=True,
+                 save_txt=True,
+                 project=temp_dir_path,
+                 name="predict",
+                 exist_ok=True,
+                 verbose=True
+             )
+
+             result_count += 1
+
+         except Exception as e:
+             error_count += 1
+             print(f"Error processing {img_path.name}: {e}")
+             continue
+
+     print(f"Model prediction completed: {result_count} successful, {error_count} failed")
+     print(f"Checking for labels in {labels_path}")
+
+     # Verify labels were created
+     label_files = list(labels_path.glob("*.txt"))
+     print(f"Found {len(label_files)} label files")
+
+     if len(label_files) == 0:
+         print("WARNING: No label files were created by the model prediction!")
+
+ except Exception as e:
+     print(f"Error during model prediction setup: {e}")
+     import traceback
+     traceback.print_exc()

  if one_stage:

+     if size_filter and len(sizes) <= 2:
+         __filter_by_size(images_path, labels_path, sizes)
+         print(f"\nFiltered {len(list(images_path.glob('*.jpg')))} images by size out of {original_image_count} input images.\n NOTE: Some images may be filtered due to corruption or inaccurate labels.")
+
      __delete_orphaned_images_and_inferences(images_path, labels_path)
      __delete_invalid_txt_files(images_path, labels_path)
      class_idxs = update_labels(class_mapping, labels_path)
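The expanded allowlist at the top of this hunk is presumably feeding `torch.serialization.add_safe_globals` (the call itself sits outside the hunk). Since PyTorch 2.6, `torch.load` defaults to `weights_only=True`, so non-tensor classes pickled into a checkpoint must be allowlisted before loading. A hedged sketch of the mechanism (the checkpoint path is hypothetical):

```python
import torch

# Sketch only: allowlisting the torch internals named in the diff so that a
# weights_only load of a YOLO-style checkpoint can unpickle them.
torch.serialization.add_safe_globals([
    torch.nn.parameter.Parameter,
    torch.Tensor,
    torch._utils._rebuild_tensor_v2,
    torch._utils._rebuild_parameter,
])
checkpoint = torch.load("weights/model.pt", weights_only=True)  # hypothetical path
```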
@@ -142,15 +196,17 @@ def prepare(input_directory: str, output_directory: str, one_stage: bool = False
          __make_yaml_file(output_directory, class_idxs)
      else:
-         try:
-             sized_dir = temp_dir_path / "sized"
-             sized_dir.mkdir(parents=True, exist_ok=True)
-             __two_stage_update(class_mapping, filtered, sized_dir, images_path)
-             __classification_split(sized_dir, output_directory)
-             __count_classification_split(output_directory, class_mapping)
-         except:
-             __classification_split(images_path, output_directory)
-             __count_classification_split(output_directory, class_mapping)
+         # try:
+         #     sized_dir = temp_dir_path / "sized"
+         #     sized_dir.mkdir(parents=True, exist_ok=True)
+         #     __two_stage_update(class_mapping, filtered, sized_dir, images_path)
+         #     __classification_split(sized_dir, output_directory)
+         #     __count_classification_split(output_directory, class_mapping)
+         # except:
+         __delete_orphaned_images_and_inferences(images_path, labels_path)
+         __delete_invalid_txt_files(images_path, labels_path)
+         __classification_split(images_path, labels_path, output_directory, class_mapping)
+         __count_classification_split(output_directory, class_mapping)

  def __count_classification_split(output_directory: str, class_mapping: dict):
      """
@@ -186,17 +242,21 @@ def __count_classification_split(output_directory: str, class_mapping: dict):
              class_counts[class_name]
          ])
      print(table)
-     print(f"Saved in {output_directory}")
+     # print(f"Saved in {output_directory}")

- def __classification_split(input_directory: str, output_directory: str):
+ def __classification_split(input_directory: str, labels_directory: str, output_directory: str, class_mapping: dict):
      """
-     Splits the data into train and validation sets for classification tasks.
+     Splits the data into train and validation sets for classification tasks,
+     cropping images according to their YOLO labels but preserving original class structure.

      Args:
-         input_directory (str): Path to the input directory containing subdirectories of class names.
+         input_directory (str): Path to the input directory containing images.
+         labels_directory (str): Path to the directory containing YOLO label files.
          output_directory (str): Path to the output directory where train and valid splits will be created.
+         class_mapping (dict): Dictionary mapping class names to image file names.
      """
      input_directory = Path(input_directory)
+     labels_directory = Path(labels_directory)
      output_directory = Path(output_directory)

      # Create train and valid directories
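The rewritten `__classification_split` in the next hunk crops each image to its YOLO detection box before splitting. YOLO labels store normalized center coordinates, and a quick worked sketch of the conversion the new code performs follows (the image size and box values here are made up):

```python
# Sketch of the YOLO -> pixel-box conversion used by __classification_split.
img_width, img_height = 640, 480
x_center, y_center, width, height = 0.5, 0.5, 0.25, 0.5  # normalized label values

x_min = int((x_center - width / 2) * img_width)    # 240
y_min = int((y_center - height / 2) * img_height)  # 120
x_max = int((x_center + width / 2) * img_width)    # 400
y_max = int((y_center + height / 2) * img_height)  # 360

# Clamp to the image bounds, exactly as the diff does
x_min, y_min = max(0, x_min), max(0, y_min)
x_max, y_max = min(img_width, x_max), min(img_height, y_max)
print((x_min, y_min, x_max, y_max))  # (240, 120, 400, 360)
```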
@@ -206,45 +266,108 @@ def __classification_split(input_directory: str, output_directory: str):
      train_dir.mkdir(parents=True, exist_ok=True)
      valid_dir.mkdir(parents=True, exist_ok=True)

-     # Process each class directory
-     for class_dir in input_directory.iterdir():
-         if not class_dir.is_dir():
-             continue
-
-         class_name = class_dir.name
-         print(f"Processing class: {class_name}")
-
-         # Create corresponding class directories in train and valid
+     # Create class directories based on class_mapping
+     for class_name in class_mapping:
          (train_dir / class_name).mkdir(exist_ok=True)
          (valid_dir / class_name).mkdir(exist_ok=True)
+         print(f"Created directory for class: {class_name}")
+
+     # Process each class folder and its images
+     valid_images = []
+
+     # First, collect all valid label files
+     valid_label_stems = {label_file.stem for label_file in labels_directory.glob("*.txt")
+                          if label_file.exists() and os.path.getsize(label_file) > 0}
+
+     print(f"Found {len(valid_label_stems)} valid label files")
+
+     for class_name, image_names in class_mapping.items():
+         print(f"Processing class: {class_name} with {len(image_names)} images")

-         # Get all image files
-         image_files = list(class_dir.glob('*.jpg')) + list(class_dir.glob('*.jpeg')) + list(class_dir.glob('*.png'))
-
-         if not image_files:
-             print(f"Warning: No images found in {class_dir}")
-             continue
+         for image_name in image_names:
+             # Check if the image exists directly in the input directory
+             image_path = input_directory / image_name

-         # Shuffle the files to ensure random distribution
-         np.random.shuffle(image_files)
-
-         # Split into train (90%) and valid (10%)
-         split_idx = int(len(image_files) * 0.9)
-         train_files = image_files[:split_idx]
-         valid_files = image_files[split_idx:]
-
-         # Copy files to respective directories
-         for img_file in train_files:
-             shutil.copy(img_file, train_dir / class_name / img_file.name)
+             if not image_path.exists():
+                 continue
+
+             # Skip images that don't have a valid label
+             if image_path.stem not in valid_label_stems:
+                 continue
+
+             label_file = labels_directory / (image_path.stem + '.txt')

-         for img_file in valid_files:
-             shutil.copy(img_file, valid_dir / class_name / img_file.name)
-
-         print(f" - {len(train_files)} images in train, {len(valid_files)} images in valid")
+             try:
+                 img = Image.open(image_path)
+
+                 if label_file.exists():
+                     # If label exists, crop the image
+                     with open(label_file, 'r') as f:
+                         lines = f.readlines()
+                         if lines:
+                             parts = lines[0].strip().split()
+                             if len(parts) >= 5:
+                                 x_center, y_center, width, height = map(float, parts[1:5])
+
+                                 img_width, img_height = img.size
+                                 x_min = int((x_center - width/2) * img_width)
+                                 y_min = int((y_center - height/2) * img_height)
+                                 x_max = int((x_center + width/2) * img_width)
+                                 y_max = int((y_center + height/2) * img_height)
+
+                                 x_min = max(0, x_min)
+                                 y_min = max(0, y_min)
+                                 x_max = min(img_width, x_max)
+                                 y_max = min(img_height, y_max)
+
+                                 img = img.crop((x_min, y_min, x_max, y_max))
+
+                                 img_width, img_height = img.size
+                                 if img_width < img_height:
+                                     # Width is smaller, make it 40
+                                     new_width = 40
+                                     new_height = int((img_height / img_width) * 40)
+                                 else:
+                                     # Height is smaller, make it 40
+                                     new_height = 40
+                                     new_width = int((img_width / img_height) * 40)
+
+                                 #blur the image
+                                 img = img.resize((new_width, new_height), Image.LANCZOS)
+
+                 valid_images.append((image_path, img, class_name))
+             except Exception as e:
+                 print(f"Error processing {image_path}: {e}")
+
+     print(f"Successfully processed {len(valid_images)} valid images for classification")
+
+     # Shuffle and split images
+     random.shuffle(valid_images)
+     split_idx = int(len(valid_images) * 0.9)
+     train_images = valid_images[:split_idx]
+     valid_images = valid_images[split_idx:]
+
+     print(f"Split into {len(train_images)} training images and {len(valid_images)} validation images")
+
+     # Save images to train/valid directories
+     for image_set, dest_dir in [(train_images, train_dir), (valid_images, valid_dir)]:
+         for orig_file, img, class_name in image_set:
+             output_path = dest_dir / class_name / (orig_file.stem + '.jpg')
+
+             # Convert any non-RGB mode to RGB before saving
+             if img.mode != 'RGB':
+                 img = img.convert('RGB')
+
+             img.save(output_path, format='JPEG', quality=95)

-     print(f"\nData split complete. Train and validation sets created in {output_directory}")
+     # Print summary
+     print(f"\nData split complete. Images saved to train and validation sets in {output_directory}")
+     for class_name in class_mapping:
+         train_count = len(list((train_dir / class_name).glob('*.*')))
+         valid_count = len(list((valid_dir / class_name).glob('*.*')))
+         print(f" - {class_name}: {train_count} images in train, {valid_count} images in valid")

- def filter_by_size(images_path: Path, labels_path: Path, sizes: list):
+ def __filter_by_size(images_path: Path, labels_path: Path, sizes: list):
      """
      Filters images by size and updates labels accordingly.

@@ -285,7 +408,6 @@ def filter_by_size(images_path: Path, labels_path: Path, sizes: list):
          label_file.unlink()
      except FileNotFoundError:
          pass
-     return filtered_images

  def __two_stage_update(class_mapping: dict, filtered_images: Path, output_directory: Path, images_path: Path):
      """
@@ -367,7 +489,7 @@ def __delete_orphaned_images_and_inferences(images_path: Path, labels_path: Path
      image_file_jpeg = images_path / (txt_file.stem + ".jpeg")

      if not (image_file_jpg.exists() or image_file_jpeg.exists()):
-         print(f"Deleting {txt_file.name} - No corresponding image file")
+         # print(f"Deleting {txt_file.name} - No corresponding image file")
          txt_file.unlink()

  label_stems = {txt_file.stem for txt_file in labels_path.glob("*.txt")}
@@ -375,7 +497,7 @@ def __delete_orphaned_images_and_inferences(images_path: Path, labels_path: Path

  for image_file in image_files:
      if image_file.stem not in label_stems:
-         print(f"Deleting orphaned image: {image_file.name}")
+         # print(f"Deleting orphaned image: {image_file.name}")
          image_file.unlink()

  print("Orphaned images files without corresponding labels have been deleted.")
@@ -400,7 +522,7 @@ def __delete_invalid_txt_files(images_path: Path, labels_path: Path):
      lines = file.readlines()

      if len(lines) == 0 or len(lines) > 1:
-         print(f"Deleting {txt_file.name} - Invalid file")
+         # print(f"Deleting {txt_file.name} - Invalid file")
          txt_file.unlink()

      image_file_jpg = images_path / (txt_file.stem + ".jpg")
@@ -408,10 +530,10 @@ def __delete_invalid_txt_files(images_path: Path, labels_path: Path):

      if image_file_jpg.exists():
          image_file_jpg.unlink()
-         print(f"Deleted corresponding image file: {image_file_jpg.name}")
+         # print(f"Deleted corresponding image file: {image_file_jpg.name}")
      elif image_file_jpeg.exists():
          image_file_jpeg.unlink()
-         print(f"Deleted corresponding image file: {image_file_jpeg.name}")
+         # print(f"Deleted corresponding image file: {image_file_jpeg.name}")

  print("Invalid text files and their corresponding images files have been deleted.")
