bplusplus-2.0.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
bplusplus/__init__.py ADDED
@@ -0,0 +1,15 @@
+ try:
+     import torch
+     import torchvision
+ except ImportError:
+     raise ImportError(
+         "PyTorch and Torchvision are not installed. "
+         "Please install them before using bplusplus by following the instructions "
+         "on the official PyTorch website: https://pytorch.org/get-started/locally/"
+     )
+
+ from .collect import Group, collect
+ from .prepare import prepare
+ from .train import train
+ from .inference import inference
+ from .validation import validate
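The guarded import above makes the torch/torchvision dependency explicit at import time, and the remaining lines re-export the package's public API. A small consumer-side sketch of what that means in practice (hypothetical script, not part of the package):

# Hypothetical consumer script: importing bplusplus without PyTorch installed
# raises the ImportError above, which points at the PyTorch install instructions.
try:
    import bplusplus
except ImportError as exc:
    raise SystemExit(f"Set up PyTorch first: {exc}")

# Once importable, the pipeline entry points are available at the top level:
# bplusplus.collect, bplusplus.prepare, bplusplus.train,
# bplusplus.inference, bplusplus.validate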
bplusplus/collect.py ADDED
@@ -0,0 +1,523 @@
+ import os
+ import random
+ import signal
+ import sys
+ import threading
+ import time
+ import atexit
+ from enum import Enum
+ from typing import Any, Dict, List, Optional, Set
+
+ import pygbif
+ import requests
+ import validators
+ from tqdm import tqdm
+
+
+ # Currently supported groupings, more can be added with proper testing
+ class Group(str, Enum):
+     scientificName = "scientificName"
+
+
+ # ============================================================================
+ # PROGRESS TRACKING
+ # ============================================================================
+
+ class CollectionProgress:
+     """Thread-safe tracker for collection progress across species."""
+
+     def __init__(self):
+         self._lock = threading.Lock()
+         self._pending: Set[str] = set()
+         self._completed: Set[str] = set()
+         self._failed: Dict[str, str] = {}  # species -> error message
+         self._active = False
+
+     def start(self, groups: List[str]):
+         """Initialize tracking for a collection run."""
+         with self._lock:
+             self._pending = set(groups)
+             self._completed = set()
+             self._failed = {}
+             self._active = True
+
+     def mark_completed(self, group: str):
+         """Mark a species as successfully completed."""
+         with self._lock:
+             self._pending.discard(group)
+             self._completed.add(group)
+             if group in self._failed:
+                 del self._failed[group]
+
+     def mark_failed(self, group: str, error: str):
+         """Mark a species as failed with error message."""
+         with self._lock:
+             self._failed[group] = error
+
+     def get_incomplete(self) -> List[str]:
+         """Get list of species not yet completed."""
+         with self._lock:
+             return list(self._pending - self._completed)
+
+     def get_failed(self) -> Dict[str, str]:
+         """Get dict of failed species and their errors."""
+         with self._lock:
+             return dict(self._failed)
+
+     def is_active(self) -> bool:
+         """Check if collection is active."""
+         with self._lock:
+             return self._active
+
+     def finish(self):
+         """Mark collection as finished."""
+         with self._lock:
+             self._active = False
+
+     def print_status(self):
+         """Print current collection status."""
+         with self._lock:
+             incomplete = list(self._pending - self._completed)
+             if not incomplete and not self._failed:
+                 return
+
+             print("\n" + "=" * 60)
+             print("COLLECTION STATUS")
+             print("=" * 60)
+             print(f"Completed: {len(self._completed)}")
+             print(f"Incomplete: {len(incomplete)}")
+             print(f"Failed: {len(self._failed)}")
+
+             if incomplete:
+                 print("\n⚠️ INCOMPLETE SPECIES (not yet processed):")
+                 for species in sorted(incomplete):
+                     print(f" - {species}")
+
+             if self._failed:
+                 print("\n❌ FAILED SPECIES (errors encountered):")
+                 for species, error in sorted(self._failed.items()):
+                     print(f" - {species}: {error}")
+
+             print("=" * 60)
+
+
+ # Global progress tracker
+ _progress = CollectionProgress()
+
+
+ def _print_status_on_exit():
+     """Print incomplete species on exit."""
+     if _progress.is_active():
+         _progress.print_status()
+
+
+ def _signal_handler(signum, frame):
+     """Handle interrupt signals gracefully."""
+     print("\n\n⚠️ Collection interrupted by user!")
+     _progress.print_status()
+     sys.exit(1)
+
+
+ # Register exit handlers
+ atexit.register(_print_status_on_exit)
+ signal.signal(signal.SIGINT, _signal_handler)
+ signal.signal(signal.SIGTERM, _signal_handler)
+
+
+ # ============================================================================
+ # RETRY CONFIGURATION
+ # ============================================================================
+
+ DEFAULT_RETRY_CONFIG = {
+     "max_retries": 5,
+     "initial_wait": 10,  # seconds
+     "max_wait": 300,  # 5 minutes max
+     "backoff_factor": 2,  # exponential backoff
+ }
+
+
+ # Default quality filters for high-quality training data from GBIF
+ # Reference: https://www.gbif.org/developer/occurrence
+ DEFAULT_QUALITY_FILTERS = {
+     # Media filters
+     "mediaType": ["StillImage"],
+
+     # Basis of record - observation types most likely to have quality images
+     "basisOfRecord": [
+         "HUMAN_OBSERVATION",
+         "MACHINE_OBSERVATION",
+         "OBSERVATION",
+     ],
+
+     # Life stage - adult insects for consistent morphology
+     "lifeStage": ["Adult"],
+
+     # Occurrence status - only confirmed presence records
+     "occurrenceStatus": "PRESENT",
+
+     # # Geospatial quality - ensure valid coordinates without issues
+     # "hasCoordinate": True,
+     # "hasGeospatialIssue": False,
+
+     # # Coordinate precision - max 1km uncertainty for reliable location
+     # "coordinateUncertaintyInMeters": "0,1000",
+
+     # # License - permissive licenses for research/training use
+     # # CC0_1_0: Public domain, CC_BY_4_0: Attribution only, CC_BY_NC_4_0: Non-commercial
+     # "license": ["CC0_1_0", "CC_BY_4_0", "CC_BY_NC_4_0"],
+
+     # Year range - recent records tend to have better quality images
+     # Can be overridden by user
+     "year": "2010,2025",
+ }
+
+ def collect(
+     group_by_key: Group,
+     search_parameters: dict[str, Any],
+     images_per_group: int,
+     output_directory: str,
+     num_threads: int,
+     use_quality_filters: bool = True,
+     quality_filter_overrides: Optional[dict[str, Any]] = None,
+     max_retries: int = 5,
+     initial_wait: int = 10,
+ ):
+     """
+     Collect images from GBIF for training data.
+
+     Args:
+         group_by_key: How to group occurrences (e.g., by scientificName)
+         search_parameters: GBIF search parameters including species list
+         images_per_group: Number of images to download per group
+         output_directory: Directory to save downloaded images
+         num_threads: Number of parallel download threads
+         use_quality_filters: Apply default quality filters for training data
+         quality_filter_overrides: Override specific quality filter values
+         max_retries: Maximum retry attempts for failed API calls
+         initial_wait: Initial wait time in seconds before retry (doubles each retry)
+
+     On interruption or failure, prints the list of incomplete species.
+     """
+     groups: list[str] = search_parameters[group_by_key.value]
+
+     # Initialize progress tracking
+     _progress.start(groups)
+
+     # Build retry config
+     retry_config = {
+         "max_retries": max_retries,
+         "initial_wait": initial_wait,
+         "max_wait": 300,
+         "backoff_factor": 2,
+     }
+
+     # Build quality filters
+     quality_filters = {}
+     if use_quality_filters:
+         quality_filters = DEFAULT_QUALITY_FILTERS.copy()
+         if quality_filter_overrides:
+             quality_filters.update(quality_filter_overrides)
+         print("Quality filters enabled:")
+         for key, value in quality_filters.items():
+             print(f" {key}: {value}")
+
+     print(f"\nStarting collection for {len(groups)} species...")
+     print(f"Retry config: max_retries={max_retries}, initial_wait={initial_wait}s\n")
+
+     try:
+         # Check if user wants to parallelize the process
+         if num_threads > 1:
+             __threaded_collect(
+                 images_per_group=images_per_group,
+                 output_directory=output_directory,
+                 num_threads=num_threads,
+                 groups=groups,
+                 quality_filters=quality_filters,
+                 retry_config=retry_config,
+             )
+         else:
+             __single_collect(
+                 search_parameters=search_parameters,
+                 images_per_group=images_per_group,
+                 output_directory=output_directory,
+                 group_by_key=group_by_key,
+                 groups=groups,
+                 quality_filters=quality_filters,
+                 retry_config=retry_config,
+             )
+     finally:
+         _progress.finish()
+         _progress.print_status()
+
+ def __single_collect(
+     group_by_key: Group,
+     search_parameters: dict[str, Any],
+     images_per_group: int,
+     output_directory: str,
+     groups: list[str],
+     quality_filters: dict[str, Any],
+     retry_config: dict[str, Any],
+ ):
+     """Single-threaded collection of images with retry logic."""
+     __create_folders(names=groups, directory=output_directory)
+
+     print("Beginning to collect images from GBIF...")
+     for group in groups:
+         success = __collect_single_group(
+             group=group,
+             group_by_key=group_by_key,
+             search_parameters=search_parameters.copy(),
+             images_per_group=images_per_group,
+             output_directory=output_directory,
+             quality_filters=quality_filters,
+             retry_config=retry_config,
+         )
+         if success:
+             _progress.mark_completed(group)
+
+     print("Finished collecting images.")
+
+
+ def __collect_single_group(
+     group: str,
+     group_by_key: Group,
+     search_parameters: dict[str, Any],
+     images_per_group: int,
+     output_directory: str,
+     quality_filters: dict[str, Any],
+     retry_config: dict[str, Any],
+ ) -> bool:
+     """
+     Collect images for a single group with retry logic.
+
+     Returns:
+         bool: True if successful, False if all retries exhausted
+     """
+     max_retries = retry_config["max_retries"]
+     initial_wait = retry_config["initial_wait"]
+     max_wait = retry_config["max_wait"]
+     backoff_factor = retry_config["backoff_factor"]
+
+     for attempt in range(max_retries + 1):
+         try:
+             # Fetch occurrences
+             occurrences_json = _fetch_occurrences(
+                 group_key=group_by_key.value,
+                 group_value=group,
+                 parameters=search_parameters.copy(),
+                 quality_filters=quality_filters,
+                 totalLimit=10000,
+             )
+             optional_occurrences = map(lambda x: __parse_occurrence(x), occurrences_json)
+             occurrences = list(filter(None, optional_occurrences))
+
+             if not occurrences:
+                 print(f"⚠️ No valid occurrences found for {group}")
+                 return True  # Not a failure, just no data
+
+             random.seed(42)  # for reproducibility
+             sampled_occurrences = random.sample(occurrences, min(images_per_group, len(occurrences)))
+
+             print(f"Downloading {len(sampled_occurrences)} images into the {group} folder...")
+
+             # Download images with individual retry
+             for occurrence in tqdm(sampled_occurrences, desc=f"{group}", unit="img"):
+                 __download_with_retry(
+                     url=occurrence.image_url,
+                     group=group,
+                     ID_name=occurrence.key,
+                     folder=output_directory,
+                     max_retries=3,
+                 )
+
+             print(f"✓ Completed: {group}")
+             return True
+
+         # requests' Timeout, ConnectionError and HTTPError are all subclasses of
+         # Exception, so a single broad handler covers network failures as well as
+         # any other unexpected error; listing them separately would be redundant.
+         except Exception as e:
+
+             error_msg = str(e)[:100]
+             _progress.mark_failed(group, error_msg)
+
+             if attempt < max_retries:
+                 wait_time = min(initial_wait * (backoff_factor ** attempt), max_wait)
+                 print(f"\n⚠️ Error for {group}: {error_msg}")
+                 print(f" Retry {attempt + 1}/{max_retries} in {wait_time}s...")
+                 time.sleep(wait_time)
+             else:
+                 print(f"\n❌ Failed after {max_retries} retries: {group}")
+                 print(f" Error: {error_msg}")
+                 return False
+
+     return False
+
+
+ def __download_with_retry(url: str, group: str, ID_name: str, folder: str, max_retries: int = 3):
+     """Download a single image with retry logic."""
+     for attempt in range(max_retries + 1):
+         try:
+             __down_image(url=url, group=group, ID_name=ID_name, folder=folder)
+             return
+         except Exception as e:
+             if attempt < max_retries:
+                 time.sleep(2 ** attempt)  # Quick exponential backoff for images
+             else:
+                 # Silent fail for individual images - don't halt the whole process
+                 pass
+
+ def __threaded_collect(
+     images_per_group: int,
+     output_directory: str,
+     num_threads: int,
+     groups: list[str],
+     quality_filters: dict[str, Any],
+     retry_config: dict[str, Any],
+ ):
+     """Parallelize the collection of images across multiple threads."""
+     # Handle edge case where num_threads is greater than number of groups
+     if num_threads >= len(groups):
+         num_threads = len(groups)
+
+     # Divide the species list into at most num_threads chunks (ceiling division)
+     chunk_size = (len(groups) + num_threads - 1) // num_threads
+     species_chunks = [
+         groups[i : i + chunk_size] for i in range(0, len(groups), chunk_size)
+     ]
+
+     # Ensure we have exactly num_threads chunks
+     while len(species_chunks) < num_threads:
+         species_chunks.append([])
+
+     threads = []
+     for i, chunk in enumerate(species_chunks):
+         thread = threading.Thread(
+             target=__collect_subset,
+             args=(chunk, images_per_group, output_directory, i, quality_filters, retry_config),
+         )
+         threads.append(thread)
+         thread.start()
+
+     # Wait for all threads to complete
+     for thread in threads:
+         thread.join()
+
+     print("All collection threads have finished.")
+
+
+ def _fetch_occurrences(
+     group_key: str,
+     group_value: str,
+     parameters: dict[str, Any],
+     quality_filters: dict[str, Any],
+     totalLimit: int,
+ ) -> list[dict[str, Any]]:
+     """Fetch occurrences from GBIF with quality filters applied."""
+     parameters[group_key] = group_value
+     return __next_batch(
+         parameters=parameters,
+         quality_filters=quality_filters,
+         total_limit=totalLimit,
+         offset=0,
+         current=[],
+     )
+
+ def __next_batch(
+     parameters: dict[str, Any],
+     quality_filters: dict[str, Any],
+     total_limit: int,
+     offset: int,
+     current: list[dict[str, Any]],
+ ) -> list[dict[str, Any]]:
+     """Recursively fetch batches of occurrences from GBIF."""
+     # Build search parameters
+     search_params = {**parameters}
+     search_params["limit"] = total_limit
+     search_params["offset"] = offset
+
+     # Apply quality filters
+     search_params.update(quality_filters)
+
+     search = pygbif.occurrences.search(**search_params)
+     occurrences = search["results"]
+
+     if search["endOfRecords"] or len(current) >= total_limit:
+         return current + occurrences
+     else:
+         new_offset = search["offset"]
+         count = search["limit"]
+         return __next_batch(
+             parameters=parameters,
+             quality_filters=quality_filters,
+             total_limit=total_limit,
+             offset=new_offset + count,
+             current=current + occurrences,
+         )
+
458
+ # Function to download insect images
459
+ def __down_image(url: str, group: str, ID_name: str, folder: str, timeout: int = 30):
460
+ """Download a single image with timeout."""
461
+ directory = os.path.join(folder, f"{group}")
462
+ os.makedirs(directory, exist_ok=True)
463
+ image_response = requests.get(url, timeout=timeout)
464
+ image_response.raise_for_status() # Raise on bad status codes
465
+ image_name = f"{group}{ID_name}.jpg"
466
+ image_path = os.path.join(directory, image_name)
467
+ with open(image_path, "wb") as f:
468
+ f.write(image_response.content)
469
+
470
+ def __create_folders(names: list[str], directory: str):
471
+ print("Creating folders for images...")
472
+ # Check if the folder path exists, if not, create it
473
+ if not os.path.exists(directory):
474
+ os.makedirs(directory)
475
+
476
+ for name in names:
477
+ folder_name = os.path.join(directory, name)
478
+ # Create a folder using the group name
479
+ os.makedirs(folder_name, exist_ok=True)
480
+
481
+ def __collect_subset(
482
+ species_subset: List[str],
483
+ images_per_group: int,
484
+ output_directory: str,
485
+ thread_id: int,
486
+ quality_filters: Dict[str, Any],
487
+ retry_config: Dict[str, Any],
488
+ ):
489
+ """Worker function for threaded collection."""
490
+ search_subset: Dict[str, Any] = {"scientificName": species_subset}
491
+
492
+ print(f"Thread {thread_id} starting collection for {len(species_subset)} species.")
493
+
494
+ __single_collect(
495
+ search_parameters=search_subset,
496
+ images_per_group=images_per_group,
497
+ output_directory=output_directory,
498
+ group_by_key=Group.scientificName,
499
+ groups=species_subset,
500
+ quality_filters=quality_filters,
501
+ retry_config=retry_config,
502
+ )
503
+
504
+ print(f"Thread {thread_id} finished collection.")
505
+
506
+
507
+
508
+
509
+ class Occurrence:
510
+
511
+ def __init__(self, key: str, image_url: str) -> None:
512
+ self.key = key
513
+ self.image_url = image_url
514
+
515
+
+ def __parse_occurrence(json: dict[str, Any]) -> Optional[Occurrence]:
+     """Extract the occurrence key and first image URL, if both are present and valid."""
+     key = json.get("key")
+     media = json.get("media") or []
+     image_url = media[0].get("identifier") if media else None
+     if key is not None and image_url is not None and validators.url(image_url):
+         return Occurrence(key=key, image_url=image_url)
+     return None
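Taken together, collect() drives the whole flow above: it seeds the progress tracker, layers any overrides onto DEFAULT_QUALITY_FILTERS, then fans out to __threaded_collect or __single_collect, printing a status summary of incomplete and failed species on exit. A minimal usage sketch based on the signature shown in this diff (the species names, paths, and override values are hypothetical placeholders, not part of the package):

import bplusplus

# Hypothetical example values; any real species list and output path would do.
bplusplus.collect(
    group_by_key=bplusplus.Group.scientificName,
    search_parameters={"scientificName": ["Apis mellifera", "Bombus terrestris"]},
    images_per_group=100,
    output_directory="./gbif_images",
    num_threads=2,
    quality_filter_overrides={"year": "2015,2025"},  # overrides the default "year" filter
)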