bplusplus-2.0.4-py3-none-any.whl
This diff shows the content of a publicly available package version released to one of the supported registries. It is provided for informational purposes only and reflects the package files as they appear in their public registry.
- bplusplus/__init__.py +15 -0
- bplusplus/collect.py +523 -0
- bplusplus/detector.py +376 -0
- bplusplus/inference.py +1337 -0
- bplusplus/prepare.py +706 -0
- bplusplus/tracker.py +261 -0
- bplusplus/train.py +913 -0
- bplusplus/validation.py +580 -0
- bplusplus-2.0.4.dist-info/LICENSE +21 -0
- bplusplus-2.0.4.dist-info/METADATA +259 -0
- bplusplus-2.0.4.dist-info/RECORD +12 -0
- bplusplus-2.0.4.dist-info/WHEEL +4 -0
bplusplus/__init__.py
ADDED
@@ -0,0 +1,15 @@
try:
    import torch
    import torchvision
except ImportError:
    raise ImportError(
        "PyTorch and Torchvision are not installed. "
        "Please install them before using bplusplus by following the instructions "
        "on the official PyTorch website: https://pytorch.org/get-started/locally/"
    )

from .collect import Group, collect
from .prepare import prepare
from .train import train
from .inference import inference
from .validation import validate
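__init__.py checks that PyTorch and Torchvision are importable before re-exporting the package's five public entry points. A minimal import sketch, assuming bplusplus and PyTorch are already installed (it uses only the names re-exported above):

    from bplusplus import Group, collect, prepare, train, inference, validate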
bplusplus/collect.py
ADDED
@@ -0,0 +1,523 @@
import os
import random
import signal
import sys
import threading
import time
import atexit
from enum import Enum
from typing import Any, Dict, List, Optional, Set

import pygbif
import requests
import validators
from tqdm import tqdm


# Currently supported groupings, more can be added with proper testing
class Group(str, Enum):
    scientificName = "scientificName"


# ============================================================================
# PROGRESS TRACKING
# ============================================================================

class CollectionProgress:
    """Thread-safe tracker for collection progress across species."""

    def __init__(self):
        self._lock = threading.Lock()
        self._pending: Set[str] = set()
        self._completed: Set[str] = set()
        self._failed: Dict[str, str] = {}  # species -> error message
        self._active = False

    def start(self, groups: List[str]):
        """Initialize tracking for a collection run."""
        with self._lock:
            self._pending = set(groups)
            self._completed = set()
            self._failed = {}
            self._active = True

    def mark_completed(self, group: str):
        """Mark a species as successfully completed."""
        with self._lock:
            self._pending.discard(group)
            self._completed.add(group)
            if group in self._failed:
                del self._failed[group]

    def mark_failed(self, group: str, error: str):
        """Mark a species as failed with error message."""
        with self._lock:
            self._failed[group] = error

    def get_incomplete(self) -> List[str]:
        """Get list of species not yet completed."""
        with self._lock:
            return list(self._pending - self._completed)

    def get_failed(self) -> Dict[str, str]:
        """Get dict of failed species and their errors."""
        with self._lock:
            return dict(self._failed)

    def is_active(self) -> bool:
        """Check if collection is active."""
        with self._lock:
            return self._active

    def finish(self):
        """Mark collection as finished."""
        with self._lock:
            self._active = False

    def print_status(self):
        """Print current collection status."""
        with self._lock:
            incomplete = list(self._pending - self._completed)
            if not incomplete and not self._failed:
                return

            print("\n" + "=" * 60)
            print("COLLECTION STATUS")
            print("=" * 60)
            print(f"Completed: {len(self._completed)}")
            print(f"Incomplete: {len(incomplete)}")
            print(f"Failed: {len(self._failed)}")

            if incomplete:
                print("\n⚠️ INCOMPLETE SPECIES (not yet processed):")
                for species in sorted(incomplete):
                    print(f" - {species}")

            if self._failed:
                print("\n❌ FAILED SPECIES (errors encountered):")
                for species, error in sorted(self._failed.items()):
                    print(f" - {species}: {error}")

            print("=" * 60)


# Global progress tracker
_progress = CollectionProgress()


def _print_status_on_exit():
    """Print incomplete species on exit."""
    if _progress.is_active():
        _progress.print_status()


def _signal_handler(signum, frame):
    """Handle interrupt signals gracefully."""
    print("\n\n⚠️ Collection interrupted by user!")
    _progress.print_status()
    sys.exit(1)


# Register exit handlers
atexit.register(_print_status_on_exit)
signal.signal(signal.SIGINT, _signal_handler)
signal.signal(signal.SIGTERM, _signal_handler)


# ============================================================================
# RETRY CONFIGURATION
# ============================================================================

DEFAULT_RETRY_CONFIG = {
    "max_retries": 5,
    "initial_wait": 10,  # seconds
    "max_wait": 300,  # 5 minutes max
    "backoff_factor": 2,  # exponential backoff
}


# Default quality filters for high-quality training data from GBIF
# Reference: https://www.gbif.org/developer/occurrence
DEFAULT_QUALITY_FILTERS = {
    # Media filters
    "mediaType": ["StillImage"],

    # Basis of record - observation types most likely to have quality images
    "basisOfRecord": [
        "HUMAN_OBSERVATION",
        "MACHINE_OBSERVATION",
        "OBSERVATION",
    ],

    # Life stage - adult insects for consistent morphology
    "lifeStage": ["Adult"],

    # Occurrence status - only confirmed presence records
    "occurrenceStatus": "PRESENT",

    # # Geospatial quality - ensure valid coordinates without issues
    # "hasCoordinate": True,
    # "hasGeospatialIssue": False,

    # # Coordinate precision - max 1km uncertainty for reliable location
    # "coordinateUncertaintyInMeters": "0,1000",

    # # License - permissive licenses for research/training use
    # # CC0_1_0: Public domain, CC_BY_4_0: Attribution only, CC_BY_NC_4_0: Non-commercial
    # "license": ["CC0_1_0", "CC_BY_4_0", "CC_BY_NC_4_0"],

    # Year range - recent records tend to have better quality images
    # Can be overridden by user
    "year": "2010,2025",
}


def collect(
    group_by_key: Group,
    search_parameters: dict[str, Any],
    images_per_group: int,
    output_directory: str,
    num_threads: int,
    use_quality_filters: bool = True,
    quality_filter_overrides: Optional[dict[str, Any]] = None,
    max_retries: int = 5,
    initial_wait: int = 10,
):
    """
    Collect images from GBIF for training data.

    Args:
        group_by_key: How to group occurrences (e.g., by scientificName)
        search_parameters: GBIF search parameters including species list
        images_per_group: Number of images to download per group
        output_directory: Directory to save downloaded images
        num_threads: Number of parallel download threads
        use_quality_filters: Apply default quality filters for training data
        quality_filter_overrides: Override specific quality filter values
        max_retries: Maximum retry attempts for failed API calls
        initial_wait: Initial wait time in seconds before retry (doubles each retry)

    On interruption or failure, prints list of incomplete species.
    """
    groups: list[str] = search_parameters[group_by_key.value]

    # Initialize progress tracking
    _progress.start(groups)

    # Build retry config
    retry_config = {
        "max_retries": max_retries,
        "initial_wait": initial_wait,
        "max_wait": 300,
        "backoff_factor": 2,
    }

    # Build quality filters
    quality_filters = {}
    if use_quality_filters:
        quality_filters = DEFAULT_QUALITY_FILTERS.copy()
        if quality_filter_overrides:
            quality_filters.update(quality_filter_overrides)
        print("Quality filters enabled:")
        for key, value in quality_filters.items():
            print(f"  {key}: {value}")

    print(f"\nStarting collection for {len(groups)} species...")
    print(f"Retry config: max_retries={max_retries}, initial_wait={initial_wait}s\n")

    try:
        # Check if user wants to parallelize the process
        if num_threads > 1:
            __threaded_collect(
                images_per_group=images_per_group,
                output_directory=output_directory,
                num_threads=num_threads,
                groups=groups,
                quality_filters=quality_filters,
                retry_config=retry_config,
            )
        else:
            __single_collect(
                search_parameters=search_parameters,
                images_per_group=images_per_group,
                output_directory=output_directory,
                group_by_key=group_by_key,
                groups=groups,
                quality_filters=quality_filters,
                retry_config=retry_config,
            )
    finally:
        _progress.finish()
        _progress.print_status()


def __single_collect(
    group_by_key: Group,
    search_parameters: dict[str, Any],
    images_per_group: int,
    output_directory: str,
    groups: list[str],
    quality_filters: dict[str, Any],
    retry_config: dict[str, Any],
):
    """Single-threaded collection of images with retry logic."""
    __create_folders(names=groups, directory=output_directory)

    print("Beginning to collect images from GBIF...")
    for group in groups:
        success = __collect_single_group(
            group=group,
            group_by_key=group_by_key,
            search_parameters=search_parameters.copy(),
            images_per_group=images_per_group,
            output_directory=output_directory,
            quality_filters=quality_filters,
            retry_config=retry_config,
        )
        if success:
            _progress.mark_completed(group)

    print("Finished collecting images.")


def __collect_single_group(
    group: str,
    group_by_key: Group,
    search_parameters: dict[str, Any],
    images_per_group: int,
    output_directory: str,
    quality_filters: dict[str, Any],
    retry_config: dict[str, Any],
) -> bool:
    """
    Collect images for a single group with retry logic.

    Returns:
        bool: True if successful, False if all retries exhausted
    """
    max_retries = retry_config["max_retries"]
    initial_wait = retry_config["initial_wait"]
    max_wait = retry_config["max_wait"]
    backoff_factor = retry_config["backoff_factor"]

    for attempt in range(max_retries + 1):
        try:
            # Fetch occurrences
            occurrences_json = _fetch_occurrences(
                group_key=group_by_key,
                group_value=group,
                parameters=search_parameters.copy(),
                quality_filters=quality_filters,
                totalLimit=10000,
            )
            optional_occurrences = map(lambda x: __parse_occurrence(x), occurrences_json)
            occurrences = list(filter(None, optional_occurrences))

            if not occurrences:
                print(f"⚠️ No valid occurrences found for {group}")
                return True  # Not a failure, just no data

            random.seed(42)  # for reproducibility
            sampled_occurrences = random.sample(occurrences, min(images_per_group, len(occurrences)))

            print(f"Downloading {len(sampled_occurrences)} images into the {group} folder...")

            # Download images with individual retry
            for occurrence in tqdm(sampled_occurrences, desc=f"{group}", unit="img"):
                __download_with_retry(
                    url=occurrence.image_url,
                    group=group,
                    ID_name=occurrence.key,
                    folder=output_directory,
                    max_retries=3,
                )

            print(f"✓ Completed: {group}")
            return True

        except (requests.exceptions.Timeout,
                requests.exceptions.ConnectionError,
                requests.exceptions.HTTPError,
                Exception) as e:

            error_msg = str(e)[:100]
            _progress.mark_failed(group, error_msg)

            if attempt < max_retries:
                wait_time = min(initial_wait * (backoff_factor ** attempt), max_wait)
                print(f"\n⚠️ Error for {group}: {error_msg}")
                print(f" Retry {attempt + 1}/{max_retries} in {wait_time}s...")
                time.sleep(wait_time)
            else:
                print(f"\n❌ Failed after {max_retries} retries: {group}")
                print(f" Error: {error_msg}")
                return False

    return False


def __download_with_retry(url: str, group: str, ID_name: str, folder: str, max_retries: int = 3):
    """Download a single image with retry logic."""
    for attempt in range(max_retries + 1):
        try:
            __down_image(url=url, group=group, ID_name=ID_name, folder=folder)
            return
        except Exception as e:
            if attempt < max_retries:
                time.sleep(2 ** attempt)  # Quick exponential backoff for images
            else:
                # Silent fail for individual images - don't halt the whole process
                pass


def __threaded_collect(
    images_per_group: int,
    output_directory: str,
    num_threads: int,
    groups: list[str],
    quality_filters: dict[str, Any],
    retry_config: dict[str, Any],
):
    """Parallelize the collection of images across multiple threads."""
    # Handle edge case where num_threads is greater than number of groups
    if num_threads >= len(groups):
        num_threads = len(groups)

    # Divide the species list into num_threads parts
    chunk_size = len(groups) // num_threads
    species_chunks = [
        groups[i : i + chunk_size] for i in range(0, len(groups), chunk_size)
    ]

    # Ensure we have exactly num_threads chunks
    while len(species_chunks) < num_threads:
        species_chunks.append([])

    threads = []
    for i, chunk in enumerate(species_chunks):
        thread = threading.Thread(
            target=__collect_subset,
            args=(chunk, images_per_group, output_directory, i, quality_filters, retry_config),
        )
        threads.append(thread)
        thread.start()

    # Wait for all threads to complete
    for thread in threads:
        thread.join()

    print("All collection threads have finished.")


def _fetch_occurrences(
    group_key: str,
    group_value: str,
    parameters: dict[str, Any],
    quality_filters: dict[str, Any],
    totalLimit: int,
) -> list[dict[str, Any]]:
    """Fetch occurrences from GBIF with quality filters applied."""
    parameters[group_key] = group_value
    return __next_batch(
        parameters=parameters,
        quality_filters=quality_filters,
        total_limit=totalLimit,
        offset=0,
        current=[],
    )


def __next_batch(
    parameters: dict[str, Any],
    quality_filters: dict[str, Any],
    total_limit: int,
    offset: int,
    current: list[dict[str, Any]],
) -> list[dict[str, Any]]:
    """Recursively fetch batches of occurrences from GBIF."""
    # Build search parameters
    search_params = {**parameters}
    search_params["limit"] = total_limit
    search_params["offset"] = offset

    # Apply quality filters
    search_params.update(quality_filters)

    search = pygbif.occurrences.search(**search_params)
    occurrences = search["results"]

    if search["endOfRecords"] or len(current) >= total_limit:
        return current + occurrences
    else:
        new_offset = search["offset"]
        count = search["limit"]
        return __next_batch(
            parameters=parameters,
            quality_filters=quality_filters,
            total_limit=total_limit,
            offset=new_offset + count,
            current=current + occurrences,
        )


# Function to download insect images
def __down_image(url: str, group: str, ID_name: str, folder: str, timeout: int = 30):
    """Download a single image with timeout."""
    directory = os.path.join(folder, f"{group}")
    os.makedirs(directory, exist_ok=True)
    image_response = requests.get(url, timeout=timeout)
    image_response.raise_for_status()  # Raise on bad status codes
    image_name = f"{group}{ID_name}.jpg"
    image_path = os.path.join(directory, image_name)
    with open(image_path, "wb") as f:
        f.write(image_response.content)


def __create_folders(names: list[str], directory: str):
    print("Creating folders for images...")
    # Check if the folder path exists, if not, create it
    if not os.path.exists(directory):
        os.makedirs(directory)

    for name in names:
        folder_name = os.path.join(directory, name)
        # Create a folder using the group name
        os.makedirs(folder_name, exist_ok=True)


def __collect_subset(
    species_subset: List[str],
    images_per_group: int,
    output_directory: str,
    thread_id: int,
    quality_filters: Dict[str, Any],
    retry_config: Dict[str, Any],
):
    """Worker function for threaded collection."""
    search_subset: Dict[str, Any] = {"scientificName": species_subset}

    print(f"Thread {thread_id} starting collection for {len(species_subset)} species.")

    __single_collect(
        search_parameters=search_subset,
        images_per_group=images_per_group,
        output_directory=output_directory,
        group_by_key=Group.scientificName,
        groups=species_subset,
        quality_filters=quality_filters,
        retry_config=retry_config,
    )

    print(f"Thread {thread_id} finished collection.")


class Occurrence:

    def __init__(self, key: str, image_url: str) -> None:
        self.key = key
        self.image_url = image_url


def __parse_occurrence(json: dict[str, Any]) -> Optional[Occurrence]:
    if (key := json.get("key", str)) is not None \
        and (image_url := json.get("media", {})[0].get("identifier", str)) is not None \
        and validators.url(image_url):

        return Occurrence(key=key, image_url=image_url)
    else:
        return None
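Based on the collect() signature and docstring above, a minimal call sketch might look like the following; the species names, counts, and paths are illustrative placeholders, not values shipped with the package:

    from bplusplus import Group, collect

    collect(
        group_by_key=Group.scientificName,
        search_parameters={"scientificName": ["Apis mellifera", "Bombus terrestris"]},  # hypothetical species list
        images_per_group=100,            # illustrative count
        output_directory="gbif_images",  # illustrative path
        num_threads=2,                   # >1 switches to the threaded path
        quality_filter_overrides={"year": "2015,2025"},  # optional override of the default filters
    )

With the defaults defined above (initial_wait=10, backoff_factor=2, max_wait=300, max_retries=5), a failing GBIF request for a species is retried after roughly 10, 20, 40, 80, and 160 seconds before that species is reported in the failed list printed at the end of the run.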