bplusplus 1.1.0__tar.gz → 1.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of bplusplus might be problematic. Click here for more details.

Files changed (99) hide show
  1. bplusplus-1.2.1/PKG-INFO +252 -0
  2. bplusplus-1.2.1/README.md +227 -0
  3. {bplusplus-1.1.0 → bplusplus-1.2.1}/pyproject.toml +4 -1
  4. bplusplus-1.2.1/src/bplusplus/__init__.py +7 -0
  5. {bplusplus-1.1.0 → bplusplus-1.2.1}/src/bplusplus/collect.py +72 -3
  6. bplusplus-1.2.1/src/bplusplus/hierarchical/test.py +670 -0
  7. bplusplus-1.2.1/src/bplusplus/hierarchical/train.py +676 -0
  8. {bplusplus-1.1.0 → bplusplus-1.2.1}/src/bplusplus/prepare.py +236 -71
  9. bplusplus-1.2.1/src/bplusplus/resnet/test.py +473 -0
  10. bplusplus-1.2.1/src/bplusplus/resnet/train.py +329 -0
  11. bplusplus-1.1.0/PKG-INFO +0 -179
  12. bplusplus-1.1.0/README.md +0 -157
  13. bplusplus-1.1.0/src/bplusplus/__init__.py +0 -5
  14. bplusplus-1.1.0/src/bplusplus/yolov5detect/__init__.py +0 -1
  15. bplusplus-1.1.0/src/bplusplus/yolov5detect/detect.py +0 -444
  16. bplusplus-1.1.0/src/bplusplus/yolov5detect/export.py +0 -1530
  17. bplusplus-1.1.0/src/bplusplus/yolov5detect/insect.yaml +0 -8
  18. bplusplus-1.1.0/src/bplusplus/yolov5detect/models/__init__.py +0 -0
  19. bplusplus-1.1.0/src/bplusplus/yolov5detect/models/common.py +0 -1109
  20. bplusplus-1.1.0/src/bplusplus/yolov5detect/models/experimental.py +0 -130
  21. bplusplus-1.1.0/src/bplusplus/yolov5detect/models/hub/anchors.yaml +0 -56
  22. bplusplus-1.1.0/src/bplusplus/yolov5detect/models/hub/yolov3-spp.yaml +0 -52
  23. bplusplus-1.1.0/src/bplusplus/yolov5detect/models/hub/yolov3-tiny.yaml +0 -42
  24. bplusplus-1.1.0/src/bplusplus/yolov5detect/models/hub/yolov3.yaml +0 -52
  25. bplusplus-1.1.0/src/bplusplus/yolov5detect/models/hub/yolov5-bifpn.yaml +0 -49
  26. bplusplus-1.1.0/src/bplusplus/yolov5detect/models/hub/yolov5-fpn.yaml +0 -43
  27. bplusplus-1.1.0/src/bplusplus/yolov5detect/models/hub/yolov5-p2.yaml +0 -55
  28. bplusplus-1.1.0/src/bplusplus/yolov5detect/models/hub/yolov5-p34.yaml +0 -42
  29. bplusplus-1.1.0/src/bplusplus/yolov5detect/models/hub/yolov5-p6.yaml +0 -57
  30. bplusplus-1.1.0/src/bplusplus/yolov5detect/models/hub/yolov5-p7.yaml +0 -68
  31. bplusplus-1.1.0/src/bplusplus/yolov5detect/models/hub/yolov5-panet.yaml +0 -49
  32. bplusplus-1.1.0/src/bplusplus/yolov5detect/models/hub/yolov5l6.yaml +0 -61
  33. bplusplus-1.1.0/src/bplusplus/yolov5detect/models/hub/yolov5m6.yaml +0 -61
  34. bplusplus-1.1.0/src/bplusplus/yolov5detect/models/hub/yolov5n6.yaml +0 -61
  35. bplusplus-1.1.0/src/bplusplus/yolov5detect/models/hub/yolov5s-LeakyReLU.yaml +0 -50
  36. bplusplus-1.1.0/src/bplusplus/yolov5detect/models/hub/yolov5s-ghost.yaml +0 -49
  37. bplusplus-1.1.0/src/bplusplus/yolov5detect/models/hub/yolov5s-transformer.yaml +0 -49
  38. bplusplus-1.1.0/src/bplusplus/yolov5detect/models/hub/yolov5s6.yaml +0 -61
  39. bplusplus-1.1.0/src/bplusplus/yolov5detect/models/hub/yolov5x6.yaml +0 -61
  40. bplusplus-1.1.0/src/bplusplus/yolov5detect/models/segment/yolov5l-seg.yaml +0 -49
  41. bplusplus-1.1.0/src/bplusplus/yolov5detect/models/segment/yolov5m-seg.yaml +0 -49
  42. bplusplus-1.1.0/src/bplusplus/yolov5detect/models/segment/yolov5n-seg.yaml +0 -49
  43. bplusplus-1.1.0/src/bplusplus/yolov5detect/models/segment/yolov5s-seg.yaml +0 -49
  44. bplusplus-1.1.0/src/bplusplus/yolov5detect/models/segment/yolov5x-seg.yaml +0 -49
  45. bplusplus-1.1.0/src/bplusplus/yolov5detect/models/tf.py +0 -797
  46. bplusplus-1.1.0/src/bplusplus/yolov5detect/models/yolo.py +0 -495
  47. bplusplus-1.1.0/src/bplusplus/yolov5detect/models/yolov5l.yaml +0 -49
  48. bplusplus-1.1.0/src/bplusplus/yolov5detect/models/yolov5m.yaml +0 -49
  49. bplusplus-1.1.0/src/bplusplus/yolov5detect/models/yolov5n.yaml +0 -49
  50. bplusplus-1.1.0/src/bplusplus/yolov5detect/models/yolov5s.yaml +0 -49
  51. bplusplus-1.1.0/src/bplusplus/yolov5detect/models/yolov5x.yaml +0 -49
  52. bplusplus-1.1.0/src/bplusplus/yolov5detect/utils/__init__.py +0 -97
  53. bplusplus-1.1.0/src/bplusplus/yolov5detect/utils/activations.py +0 -134
  54. bplusplus-1.1.0/src/bplusplus/yolov5detect/utils/augmentations.py +0 -448
  55. bplusplus-1.1.0/src/bplusplus/yolov5detect/utils/autoanchor.py +0 -175
  56. bplusplus-1.1.0/src/bplusplus/yolov5detect/utils/autobatch.py +0 -70
  57. bplusplus-1.1.0/src/bplusplus/yolov5detect/utils/aws/__init__.py +0 -0
  58. bplusplus-1.1.0/src/bplusplus/yolov5detect/utils/aws/mime.sh +0 -26
  59. bplusplus-1.1.0/src/bplusplus/yolov5detect/utils/aws/resume.py +0 -41
  60. bplusplus-1.1.0/src/bplusplus/yolov5detect/utils/aws/userdata.sh +0 -27
  61. bplusplus-1.1.0/src/bplusplus/yolov5detect/utils/callbacks.py +0 -72
  62. bplusplus-1.1.0/src/bplusplus/yolov5detect/utils/dataloaders.py +0 -1385
  63. bplusplus-1.1.0/src/bplusplus/yolov5detect/utils/docker/Dockerfile +0 -73
  64. bplusplus-1.1.0/src/bplusplus/yolov5detect/utils/docker/Dockerfile-arm64 +0 -40
  65. bplusplus-1.1.0/src/bplusplus/yolov5detect/utils/docker/Dockerfile-cpu +0 -42
  66. bplusplus-1.1.0/src/bplusplus/yolov5detect/utils/downloads.py +0 -136
  67. bplusplus-1.1.0/src/bplusplus/yolov5detect/utils/flask_rest_api/README.md +0 -70
  68. bplusplus-1.1.0/src/bplusplus/yolov5detect/utils/flask_rest_api/example_request.py +0 -17
  69. bplusplus-1.1.0/src/bplusplus/yolov5detect/utils/flask_rest_api/restapi.py +0 -49
  70. bplusplus-1.1.0/src/bplusplus/yolov5detect/utils/general.py +0 -1294
  71. bplusplus-1.1.0/src/bplusplus/yolov5detect/utils/google_app_engine/Dockerfile +0 -25
  72. bplusplus-1.1.0/src/bplusplus/yolov5detect/utils/google_app_engine/additional_requirements.txt +0 -6
  73. bplusplus-1.1.0/src/bplusplus/yolov5detect/utils/google_app_engine/app.yaml +0 -16
  74. bplusplus-1.1.0/src/bplusplus/yolov5detect/utils/loggers/__init__.py +0 -476
  75. bplusplus-1.1.0/src/bplusplus/yolov5detect/utils/loggers/clearml/README.md +0 -222
  76. bplusplus-1.1.0/src/bplusplus/yolov5detect/utils/loggers/clearml/__init__.py +0 -0
  77. bplusplus-1.1.0/src/bplusplus/yolov5detect/utils/loggers/clearml/clearml_utils.py +0 -230
  78. bplusplus-1.1.0/src/bplusplus/yolov5detect/utils/loggers/clearml/hpo.py +0 -90
  79. bplusplus-1.1.0/src/bplusplus/yolov5detect/utils/loggers/comet/README.md +0 -250
  80. bplusplus-1.1.0/src/bplusplus/yolov5detect/utils/loggers/comet/__init__.py +0 -551
  81. bplusplus-1.1.0/src/bplusplus/yolov5detect/utils/loggers/comet/comet_utils.py +0 -151
  82. bplusplus-1.1.0/src/bplusplus/yolov5detect/utils/loggers/comet/hpo.py +0 -126
  83. bplusplus-1.1.0/src/bplusplus/yolov5detect/utils/loggers/comet/optimizer_config.json +0 -135
  84. bplusplus-1.1.0/src/bplusplus/yolov5detect/utils/loggers/wandb/__init__.py +0 -0
  85. bplusplus-1.1.0/src/bplusplus/yolov5detect/utils/loggers/wandb/wandb_utils.py +0 -210
  86. bplusplus-1.1.0/src/bplusplus/yolov5detect/utils/loss.py +0 -259
  87. bplusplus-1.1.0/src/bplusplus/yolov5detect/utils/metrics.py +0 -381
  88. bplusplus-1.1.0/src/bplusplus/yolov5detect/utils/plots.py +0 -517
  89. bplusplus-1.1.0/src/bplusplus/yolov5detect/utils/segment/__init__.py +0 -0
  90. bplusplus-1.1.0/src/bplusplus/yolov5detect/utils/segment/augmentations.py +0 -100
  91. bplusplus-1.1.0/src/bplusplus/yolov5detect/utils/segment/dataloaders.py +0 -366
  92. bplusplus-1.1.0/src/bplusplus/yolov5detect/utils/segment/general.py +0 -160
  93. bplusplus-1.1.0/src/bplusplus/yolov5detect/utils/segment/loss.py +0 -198
  94. bplusplus-1.1.0/src/bplusplus/yolov5detect/utils/segment/metrics.py +0 -225
  95. bplusplus-1.1.0/src/bplusplus/yolov5detect/utils/segment/plots.py +0 -152
  96. bplusplus-1.1.0/src/bplusplus/yolov5detect/utils/torch_utils.py +0 -482
  97. bplusplus-1.1.0/src/bplusplus/yolov5detect/utils/triton.py +0 -90
  98. {bplusplus-1.1.0 → bplusplus-1.2.1}/LICENSE +0 -0
  99. {bplusplus-1.1.0 → bplusplus-1.2.1}/src/bplusplus/train_validate.py +0 -0
@@ -0,0 +1,252 @@
1
+ Metadata-Version: 2.1
2
+ Name: bplusplus
3
+ Version: 1.2.1
4
+ Summary: A simple method to create AI models for biodiversity, with collect and prepare pipeline
5
+ License: MIT
6
+ Author: Titus Venverloo
7
+ Author-email: tvenver@mit.edu
8
+ Requires-Python: >=3.9.0,<4.0.0
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.9
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Requires-Dist: prettytable (==3.7.0)
16
+ Requires-Dist: pygbif (>=0.6.4,<0.7.0)
17
+ Requires-Dist: requests (==2.25.1)
18
+ Requires-Dist: scikit-learn (>=1.6.1,<2.0.0)
19
+ Requires-Dist: tabulate (>=0.9.0,<0.10.0)
20
+ Requires-Dist: torch (==2.5.0)
21
+ Requires-Dist: ultralytics (==8.0.195)
22
+ Requires-Dist: validators (>=0.33.0,<0.34.0)
23
+ Description-Content-Type: text/markdown
24
+
25
+ # B++ repository
26
+
27
+ [![DOI](https://zenodo.org/badge/765250194.svg)](https://zenodo.org/badge/latestdoi/765250194)
28
+ [![PyPi version](https://img.shields.io/pypi/v/bplusplus.svg)](https://pypi.org/project/bplusplus/)
29
+ [![Python versions](https://img.shields.io/pypi/pyversions/bplusplus.svg)](https://pypi.org/project/bplusplus/)
30
+ [![License](https://img.shields.io/pypi/l/bplusplus.svg)](https://pypi.org/project/bplusplus/)
31
+ [![Downloads](https://static.pepy.tech/badge/bplusplus)](https://pepy.tech/project/bplusplus)
32
+ [![Downloads](https://static.pepy.tech/badge/bplusplus/month)](https://pepy.tech/project/bplusplus)
33
+ [![Downloads](https://static.pepy.tech/badge/bplusplus/week)](https://pepy.tech/project/bplusplus)
34
+
35
+ This repo can be used to quickly generate models for biodiversity monitoring, relying on the GBIF dataset.
36
+
37
+ # Three pipeline options
38
+
39
+ ## One stage YOLO
40
+
41
+ For the one stage pipeline, we first collect `collect()` the data from GBIF, then prepare the data for training by running the `prepare()` function, which adds bounding boxes to the images using a pretrained YOLO model. We then train the model with YOLOv8 using the `train()` function.
42
+
43
+ ## Two stage YOLO/Resnet
44
+
45
+ For the two stage pipeline, we first collect `collect()` the data from GBIF, then prepare `prepare()` this (classification) data for training by either size filtering (recommended "large") which also splits the data into train and valid. We then train the model with resnet using the `train_resnet()` function. The trained model is a resnet classification model which will then be paired with a pretrained YOLOv8 insect detection model (hence two stage).
46
+
47
+ ## Two stage YOLO/Multitask-Resnet
48
+
49
+ For the two stage pipeline, we first collect `collect()` the data from GBIF, then prepare `prepare()` this (classification) data for training by either size filtering (recommended "large") which also splits the data into train and valid. We then train the model with resnet using the `train_multitask()` function. The difference here is that it is training for species, order and family simultaneously. The trained model is a resnet classification model which will then be paired with a pretrained YOLOv8 insect detection model (hence two stage).
50
+
51
+ # Setup
52
+
53
+ ### Install package
54
+
55
+ ```python
56
+ pip install bplusplus
57
+ ```
58
+
59
+ ### bplusplus.collect() (All pipelines)
60
+
61
+ This function takes four arguments:
62
+ - **search_parameters: dict[str, Any]** - List of scientific names of the species you want to collect from the GBIF database
63
+ - **images_per_group: int** - Number of images per species collected for training. Max 9000.
64
+ - **output_directory: str** - Directory to store collected images
65
+ - **num_threads: int** - Number of threads you want to run for collecting images. We recommend using a moderate number (3-5) to avoid overwhelming the API server.
66
+
67
+ Example run:
68
+ ```python
69
+ import bplusplus
70
+
71
+ species_list=[ "Vanessa atalanta", "Gonepteryx rhamni", "Bombus hortorum"]
72
+ # convert to dict
73
+ search: dict[str, any] = {
74
+ "scientificName": species_list
75
+ }
76
+
77
+ images_per_group=20
78
+ output_directory="/dataset/selected-species"
79
+ num_threads=2
80
+
81
+
82
+ # Collect data from GBIF
83
+ bplusplus.collect(
84
+ search_parameters=search,
85
+ images_per_group=images_per_group,
86
+ output_directory=output_directory,
87
+ group_by_key=bplusplus.Group.scientificName,
88
+ num_threads=num_threads
89
+ )
90
+ ```
91
+
92
+ ### bplusplus.prepare() (All pipelines)
93
+
94
+ Prepares the dataset for training by performing the following steps:
95
+ 1. Copies images from the input directory to a temporary directory.
96
+ 2. Deletes corrupted images.
97
+ 3. Downloads YOLOv5 weights for *insect detection* if not already present.
98
+ 4. Runs YOLOv5 inference to generate labels for the images.
99
+ 5. Deletes orphaned images and inferences.
100
+ 6. Updates labels based on class mapping.
101
+ 7. Splits the data into train, test, and validation sets.
102
+ 8. Counts the total number of images across all splits.
103
+ 9. Makes a YAML configuration file for YOLOv8.
104
+
105
+ This function takes six arguments:
106
+ - **input_directory: str** - The path to the input directory containing the images.
107
+ - **output_directory: str** - The path to the output directory where the prepared dataset will be saved.
108
+ - **with_background: bool = False** - Set to False if you don't want to include/download background images
109
+ - **one_stage: bool = False** - Set to True if you want to train a one stage model
110
+ - **size_filter: bool = False** - Set to True if you want to filter by size of insect
111
+ - **sizes: list = None** - List of sizes to filter by. If None, all sizes will be used, ["large", "medium", "small"].
112
+
113
+ ```python
114
+ # Prepare data
115
+ bplusplus.prepare(
116
+ input_directory='/dataset/selected-species',
117
+ output_directory='/dataset/prepared-data',
118
+ with_background=False,
119
+ one_stage=False,
120
+ size_filter=True,
121
+ sizes=["large"]
122
+ )
123
+ ```
124
+
125
+ ### bplusplus.train() (One stage pipeline)
126
+
127
+ This function takes five arguments:
128
+ - **input_yaml: str** - yaml file created to train the model
129
+ - **output_directory: str**
130
+ - **epochs: int = 30** - Number of epochs to train the model
131
+ - **imgsz: int = 640** - Image size
132
+ - **batch: int = 16** - Batch size for training
133
+
134
+ ```python
135
+ # Train model
136
+ model = bplusplus.train(
137
+ input_yaml="/dataset/prepared-data/dataset.yaml", # Make sure to add the correct path
138
+ output_directory="trained-model",
139
+ epochs=30,
140
+ batch=16
141
+ )
142
+ ```
143
+
144
+ ### bplusplus.train_resnet() (Two stage (standard resnet) pipeline)
145
+
146
+ This function takes eight arguments:
147
+ - **species_list: list** - List of species to train the model on
148
+ - **model_type: str** - The type of resnet model to train. Options are "resnet50", "resnet152"
149
+ - **batch_size: int** - The batch size for training
150
+ - **num_epochs: int** - The number of epochs to train the model
151
+ - **patience: int** - The number of epochs to wait before early stopping
152
+ - **output_dir: str** - The path to the output directory where the trained model will be saved
153
+ - **data_dir: str** - The path to the directory containing the prepared data
154
+ - **img_size: int** - The size of the images to train the model on
155
+
156
+ ```python
157
+ # Train resnet model
158
+ bplusplus.train_resnet(
159
+ species_list=["Vanessa atalanta", "Gonepteryx rhamni", "Bombus hortorum"],
160
+ model_type="resnet50",
161
+ batch_size=16,
162
+ num_epochs=30,
163
+ patience=5,
164
+ output_dir="trained-model",
165
+ data_dir="prepared-data",
166
+ img_size=256
167
+ )
168
+ ```
169
+
170
+ ### bplusplus.train_multitask() (Two stage (multitask resnet) pipeline)
171
+
172
+ This function takes seven arguments:
173
+ - **batch_size: int** - The batch size for training
174
+ - **epochs: int** - The number of epochs to train the model
175
+ - **patience: int** - The number of epochs to wait before early stopping
176
+ - **img_size: int** - The size of the images to train the model on
177
+ - **data_dir: str** - The path to the directory containing the prepared data
178
+ - **output_dir: str** - The path to the output directory where the trained model will be saved
179
+ - **species_list: list** - List of species to train the model on
180
+
181
+ ```python
182
+ # Train multitask model
183
+ bplusplus.train_multitask(
184
+ batch_size=16,
185
+ epochs=30,
186
+ patience=5,
187
+ img_size=256,
188
+ data_dir="prepared-data",
189
+ output_dir="trained-model",
190
+ species_list=["Vanessa atalanta", "Gonepteryx rhamni", "Bombus hortorum"]
191
+ )
192
+ ```
193
+
194
+
195
+ ### bplusplus.validate() (One stage pipeline)
196
+
197
+ This function takes two arguments:
198
+ - **model** - The trained YOLO model
199
+ - **Path to yaml file**
200
+
201
+ ```python
202
+ metrics = bplusplus.validate(model, '/dataset/prepared-data/dataset.yaml')
203
+ print(metrics)
204
+ ```
205
+
206
+ ### bplusplus.test_resnet() (Two stage (standard resnet) pipeline)
207
+
208
+ This function takes six arguments:
209
+ - **data_path: str** - The path to the directory containing the test data
210
+ - **yolo_weights: str** - The path to the YOLO weights
211
+ - **resnet_weights: str** - The path to the resnet weights
212
+ - **model: str** - The type of resnet model to use
213
+ - **species_names: list** - The list of species names
214
+ - **output_dir: str** - The path to the output directory where the test results will be saved
215
+
216
+ ```python
217
+
218
+ bplusplus.test_resnet(
219
+ data_path=TEST_DATA_DIR,
220
+ yolo_weights=YOLO_WEIGHTS,
221
+ resnet_weights=RESNET_WEIGHTS,
222
+ model="resnet50",
223
+ species_names=species_list,
224
+ output_dir=TRAINED_MODEL_DIR
225
+ )
226
+ ```
227
+
228
+ ### bplusplus.test_multitask() (Two stage (multitask resnet) pipeline)
229
+
230
+ This function takes five arguments:
231
+ - **species_list: list** - List of species to test the model on
232
+ - **test_set: str** - The path to the directory containing the test data
233
+ - **yolo_weights: str** - The path to the YOLO weights
234
+ - **hierarchical_weights: str** - The path to the hierarchical weights
235
+ - **output_dir: str** - The path to the output directory where the test results will be saved
236
+
237
+
238
+ ```python
239
+ bplusplus.test_multitask(
240
+ species_list,
241
+ test_set=TEST_DATA_DIR,
242
+ yolo_weights=YOLO_WEIGHTS,
243
+ hierarchical_weights=RESNET_MULTITASK_WEIGHTS,
244
+ output_dir=TRAINED_MODEL_DIR
245
+ )
246
+ ```
247
+ # Citation
248
+
249
+ All information in this GitHub is available under MIT license, as long as credit is given to the authors.
250
+
251
+ **Venverloo, T., Duarte, F., B++: Towards Real-Time Monitoring of Insect Species. MIT Senseable City Laboratory, AMS Institute.**
252
+
@@ -0,0 +1,227 @@
1
+ # B++ repository
2
+
3
+ [![DOI](https://zenodo.org/badge/765250194.svg)](https://zenodo.org/badge/latestdoi/765250194)
4
+ [![PyPi version](https://img.shields.io/pypi/v/bplusplus.svg)](https://pypi.org/project/bplusplus/)
5
+ [![Python versions](https://img.shields.io/pypi/pyversions/bplusplus.svg)](https://pypi.org/project/bplusplus/)
6
+ [![License](https://img.shields.io/pypi/l/bplusplus.svg)](https://pypi.org/project/bplusplus/)
7
+ [![Downloads](https://static.pepy.tech/badge/bplusplus)](https://pepy.tech/project/bplusplus)
8
+ [![Downloads](https://static.pepy.tech/badge/bplusplus/month)](https://pepy.tech/project/bplusplus)
9
+ [![Downloads](https://static.pepy.tech/badge/bplusplus/week)](https://pepy.tech/project/bplusplus)
10
+
11
+ This repo can be used to quickly generate models for biodiversity monitoring, relying on the GBIF dataset.
12
+
13
+ # Three pipeline options
14
+
15
+ ## One stage YOLO
16
+
17
+ For the one stage pipeline, we first collect `collect()` the data from GBIF, then prepare the data for training by running the `prepare()` function, which adds bounding boxes to the images using a pretrained YOLO model. We then train the model with YOLOv8 using the `train()` function.
18
+
19
+ ## Two stage YOLO/Resnet
20
+
21
+ For the two stage pipeline, we first collect `collect()` the data from GBIF, then prepare `prepare()` this (classification) data for training by either size filtering (recommended "large") which also splits the data into train and valid. We then train the model with resnet using the `train_resnet()` function. The trained model is a resnet classification model which will then be paired with a pretrained YOLOv8 insect detection model (hence two stage).
22
+
23
+ ## Two stage YOLO/Multitask-Resnet
24
+
25
+ For the two stage pipeline, we first collect `collect()` the data from GBIF, then prepare `prepare()` this (classification) data for training by either size filtering (recommended "large") which also splits the data into train and valid. We then train the model with resnet using the `train_multitask()` function. The difference here is that it is training for species, order and family simultaneously. The trained model is a resnet classification model which will then be paired with a pretrained YOLOv8 insect detection model (hence two stage).
26
+
27
+ # Setup
28
+
29
+ ### Install package
30
+
31
+ ```python
32
+ pip install bplusplus
33
+ ```
34
+
35
+ ### bplusplus.collect() (All pipelines)
36
+
37
+ This function takes four arguments:
38
+ - **search_parameters: dict[str, Any]** - List of scientific names of the species you want to collect from the GBIF database
39
+ - **images_per_group: int** - Number of images per species collected for training. Max 9000.
40
+ - **output_directory: str** - Directory to store collected images
41
+ - **num_threads: int** - Number of threads you want to run for collecting images. We recommend using a moderate number (3-5) to avoid overwhelming the API server.
42
+
43
+ Example run:
44
+ ```python
45
+ import bplusplus
46
+
47
+ species_list=[ "Vanessa atalanta", "Gonepteryx rhamni", "Bombus hortorum"]
48
+ # convert to dict
49
+ search: dict[str, any] = {
50
+ "scientificName": species_list
51
+ }
52
+
53
+ images_per_group=20
54
+ output_directory="/dataset/selected-species"
55
+ num_threads=2
56
+
57
+
58
+ # Collect data from GBIF
59
+ bplusplus.collect(
60
+ search_parameters=search,
61
+ images_per_group=images_per_group,
62
+ output_directory=output_directory,
63
+ group_by_key=bplusplus.Group.scientificName,
64
+ num_threads=num_threads
65
+ )
66
+ ```
67
+
68
+ ### bplusplus.prepare() (All pipelines)
69
+
70
+ Prepares the dataset for training by performing the following steps:
71
+ 1. Copies images from the input directory to a temporary directory.
72
+ 2. Deletes corrupted images.
73
+ 3. Downloads YOLOv5 weights for *insect detection* if not already present.
74
+ 4. Runs YOLOv5 inference to generate labels for the images.
75
+ 5. Deletes orphaned images and inferences.
76
+ 6. Updates labels based on class mapping.
77
+ 7. Splits the data into train, test, and validation sets.
78
+ 8. Counts the total number of images across all splits.
79
+ 9. Makes a YAML configuration file for YOLOv8.
80
+
81
+ This function takes six arguments:
82
+ - **input_directory: str** - The path to the input directory containing the images.
83
+ - **output_directory: str** - The path to the output directory where the prepared dataset will be saved.
84
+ - **with_background: bool = False** - Set to False if you don't want to include/download background images
85
+ - **one_stage: bool = False** - Set to True if you want to train a one stage model
86
+ - **size_filter: bool = False** - Set to True if you want to filter by size of insect
87
+ - **sizes: list = None** - List of sizes to filter by. If None, all sizes will be used, ["large", "medium", "small"].
88
+
89
+ ```python
90
+ # Prepare data
91
+ bplusplus.prepare(
92
+ input_directory='/dataset/selected-species',
93
+ output_directory='/dataset/prepared-data',
94
+ with_background=False,
95
+ one_stage=False,
96
+ size_filter=True,
97
+ sizes=["large"]
98
+ )
99
+ ```
100
+
101
+ ### bplusplus.train() (One stage pipeline)
102
+
103
+ This function takes five arguments:
104
+ - **input_yaml: str** - yaml file created to train the model
105
+ - **output_directory: str**
106
+ - **epochs: int = 30** - Number of epochs to train the model
107
+ - **imgsz: int = 640** - Image size
108
+ - **batch: int = 16** - Batch size for training
109
+
110
+ ```python
111
+ # Train model
112
+ model = bplusplus.train(
113
+ input_yaml="/dataset/prepared-data/dataset.yaml", # Make sure to add the correct path
114
+ output_directory="trained-model",
115
+ epochs=30,
116
+ batch=16
117
+ )
118
+ ```
119
+
120
+ ### bplusplus.train_resnet() (Two stage (standard resnet) pipeline)
121
+
122
+ This function takes eight arguments:
123
+ - **species_list: list** - List of species to train the model on
124
+ - **model_type: str** - The type of resnet model to train. Options are "resnet50", "resnet152"
125
+ - **batch_size: int** - The batch size for training
126
+ - **num_epochs: int** - The number of epochs to train the model
127
+ - **patience: int** - The number of epochs to wait before early stopping
128
+ - **output_dir: str** - The path to the output directory where the trained model will be saved
129
+ - **data_dir: str** - The path to the directory containing the prepared data
130
+ - **img_size: int** - The size of the images to train the model on
131
+
132
+ ```python
133
+ # Train resnet model
134
+ bplusplus.train_resnet(
135
+ species_list=["Vanessa atalanta", "Gonepteryx rhamni", "Bombus hortorum"],
136
+ model_type="resnet50",
137
+ batch_size=16,
138
+ num_epochs=30,
139
+ patience=5,
140
+ output_dir="trained-model",
141
+ data_dir="prepared-data",
142
+ img_size=256
143
+ )
144
+ ```
145
+
146
+ ### bplusplus.train_multitask() (Two stage (multitask resnet) pipeline)
147
+
148
+ This function takes seven arguments:
149
+ - **batch_size: int** - The batch size for training
150
+ - **epochs: int** - The number of epochs to train the model
151
+ - **patience: int** - The number of epochs to wait before early stopping
152
+ - **img_size: int** - The size of the images to train the model on
153
+ - **data_dir: str** - The path to the directory containing the prepared data
154
+ - **output_dir: str** - The path to the output directory where the trained model will be saved
155
+ - **species_list: list** - List of species to train the model on
156
+
157
+ ```python
158
+ # Train multitask model
159
+ bplusplus.train_multitask(
160
+ batch_size=16,
161
+ epochs=30,
162
+ patience=5,
163
+ img_size=256,
164
+ data_dir="prepared-data",
165
+ output_dir="trained-model",
166
+ species_list=["Vanessa atalanta", "Gonepteryx rhamni", "Bombus hortorum"]
167
+ )
168
+ ```
169
+
170
+
171
+ ### bplusplus.validate() (One stage pipeline)
172
+
173
+ This function takes two arguments:
174
+ - **model** - The trained YOLO model
175
+ - **Path to yaml file**
176
+
177
+ ```python
178
+ metrics = bplusplus.validate(model, '/dataset/prepared-data/dataset.yaml')
179
+ print(metrics)
180
+ ```
181
+
182
+ ### bplusplus.test_resnet() (Two stage (standard resnet) pipeline)
183
+
184
+ This function takes six arguments:
185
+ - **data_path: str** - The path to the directory containing the test data
186
+ - **yolo_weights: str** - The path to the YOLO weights
187
+ - **resnet_weights: str** - The path to the resnet weights
188
+ - **model: str** - The type of resnet model to use
189
+ - **species_names: list** - The list of species names
190
+ - **output_dir: str** - The path to the output directory where the test results will be saved
191
+
192
+ ```python
193
+
194
+ bplusplus.test_resnet(
195
+ data_path=TEST_DATA_DIR,
196
+ yolo_weights=YOLO_WEIGHTS,
197
+ resnet_weights=RESNET_WEIGHTS,
198
+ model="resnet50",
199
+ species_names=species_list,
200
+ output_dir=TRAINED_MODEL_DIR
201
+ )
202
+ ```
203
+
204
+ ### bplusplus.test_multitask() (Two stage (multitask resnet) pipeline)
205
+
206
+ This function takes five arguments:
207
+ - **species_list: list** - List of species to test the model on
208
+ - **test_set: str** - The path to the directory containing the test data
209
+ - **yolo_weights: str** - The path to the YOLO weights
210
+ - **hierarchical_weights: str** - The path to the hierarchical weights
211
+ - **output_dir: str** - The path to the output directory where the test results will be saved
212
+
213
+
214
+ ```python
215
+ bplusplus.test_multitask(
216
+ species_list,
217
+ test_set=TEST_DATA_DIR,
218
+ yolo_weights=YOLO_WEIGHTS,
219
+ hierarchical_weights=RESNET_MULTITASK_WEIGHTS,
220
+ output_dir=TRAINED_MODEL_DIR
221
+ )
222
+ ```
223
+ # Citation
224
+
225
+ All information in this GitHub is available under MIT license, as long as credit is given to the authors.
226
+
227
+ **Venverloo, T., Duarte, F., B++: Towards Real-Time Monitoring of Insect Species. MIT Senseable City Laboratory, AMS Institute.**
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "bplusplus"
3
- version = "1.1.0"
3
+ version = "1.2.1"
4
4
  description = "A simple method to create AI models for biodiversity, with collect and prepare pipeline"
5
5
  authors = ["Titus Venverloo <tvenver@mit.edu>", "Deniz Aydemir <deniz@aydemir.us>", "Orlando Closs <orlando.closs@wur.nl>", "Ase Hatveit <aase@mit.edu>"]
6
6
  license = "MIT"
@@ -13,6 +13,9 @@ ultralytics = "8.0.195"
13
13
  pygbif = "^0.6.4"
14
14
  validators = "^0.33.0"
15
15
  prettytable = "3.7.0"
16
+ scikit-learn = "^1.6.1"
17
+ tabulate = "^0.9.0"
18
+ torch = "2.5.0"
16
19
 
17
20
  [tool.poetry.group.dev.dependencies]
18
21
  jupyter = "^1.0.0"
@@ -0,0 +1,7 @@
1
+ from .collect import Group, collect
2
+ from .train_validate import train, validate
3
+ from .prepare import prepare
4
+ from .resnet.train import train_resnet
5
+ from .resnet.test import test_resnet
6
+ from .hierarchical.train import train_multitask
7
+ from .hierarchical.test import test_multitask
@@ -1,11 +1,13 @@
1
1
  import os
2
2
  import random
3
+ import threading
3
4
  from enum import Enum
4
- from typing import Any, Optional
5
+ from typing import Any, Dict, List, Optional
5
6
 
6
7
  import pygbif
7
8
  import requests
8
9
  import validators
10
+ from tqdm import tqdm
9
11
 
10
12
 
11
13
  #this lists currently supported groupings, more can be added with proper testing
@@ -13,10 +15,28 @@ class Group(str, Enum):
13
15
  scientificName="scientificName"
14
16
 
15
17
  #TODO add back support for fetching from dataset (or csvs)
16
- def collect(group_by_key: Group, search_parameters: dict[str, Any], images_per_group: int, output_directory: str):
18
+ def collect(group_by_key: Group, search_parameters: dict[str, Any], images_per_group: int, output_directory: str, num_threads: int):
17
19
 
18
20
  groups: list[str] = search_parameters[group_by_key.value]
19
21
 
22
+ # check if user wants to parallelize the process
23
+ if num_threads > 1:
24
+ __threaded_collect(
25
+ images_per_group=images_per_group,
26
+ output_directory=output_directory,
27
+ num_threads=num_threads,
28
+ groups=groups)
29
+ else:
30
+ __single_collect(
31
+ search_parameters=search_parameters,
32
+ images_per_group=images_per_group,
33
+ output_directory=output_directory,
34
+ group_by_key=group_by_key,
35
+ groups=groups,
36
+ )
37
+
38
+ def __single_collect(group_by_key: Group, search_parameters: dict[str, Any], images_per_group: int, output_directory: str, groups: list[str]):
39
+
20
40
  #TODO throw error if groups is not a str list
21
41
 
22
42
  __create_folders(
@@ -37,7 +57,7 @@ def collect(group_by_key: Group, search_parameters: dict[str, Any], images_per_g
37
57
  sampled_occurrences = random.sample(occurrences, min(images_per_group, len(occurrences)))
38
58
 
39
59
  print(f"Downloading {len(sampled_occurrences)} images into the {group} folder...")
40
- for occurrence in sampled_occurrences:
60
+ for occurrence in tqdm(sampled_occurrences, desc=f"Downloading images for {group}", unit="image"):
41
61
  # image_url = occurrence.image_url.replace("original", "large") # hack to get max 1024px image
42
62
 
43
63
  __down_image(
@@ -49,6 +69,38 @@ def collect(group_by_key: Group, search_parameters: dict[str, Any], images_per_g
49
69
 
50
70
  print("Finished collecting images.")
51
71
 
72
+ # threaded_collect: parallelize the collection of images
73
+ def __threaded_collect(images_per_group: int, output_directory: str, num_threads: int, groups: list[str]):
74
+ # Handle edge case where num_threads is greater than number of groups
75
+ if num_threads >= len(groups):
76
+ num_threads = len(groups)
77
+
78
+ # Divide the species list into num_threads parts
79
+ chunk_size = len(groups) // num_threads
80
+ species_chunks = [
81
+ groups[i:i + chunk_size] for i in range(0, len(groups), chunk_size)
82
+ ]
83
+
84
+ # Ensure we have exactly num_threads chunks (the last chunk might be larger if len(species_list) % num_threads != 0)
85
+ while len(species_chunks) < num_threads:
86
+ species_chunks.append([])
87
+
88
+ threads = []
89
+ for i, chunk in enumerate(species_chunks):
90
+ thread = threading.Thread(
91
+ target=__collect_subset,
92
+ args=(chunk, images_per_group, output_directory, i)
93
+ )
94
+ threads.append(thread)
95
+ thread.start()
96
+
97
+ # Wait for all threads to complete
98
+ for thread in threads:
99
+ thread.join()
100
+
101
+ print("All collection threads have finished.")
102
+
103
+
52
104
  def _fetch_occurrences(group_key: str, group_value: str, parameters: dict[str, Any], totalLimit: int) -> list[dict[str, Any]]:
53
105
  parameters[group_key] = group_value
54
106
  return __next_batch(
@@ -98,6 +150,23 @@ def __create_folders(names: list[str], directory: str):
98
150
  # Create a folder using the group name
99
151
  os.makedirs(folder_name, exist_ok=True)
100
152
 
153
+ def __collect_subset(species_subset: List[str], images_per_group: int, output_directory: str, thread_id: int):
154
+ search_subset: Dict[str, Any] = {
155
+ "scientificName": species_subset
156
+ }
157
+
158
+ print(f"Thread {thread_id} starting collection for {len(species_subset)} species.")
159
+
160
+ __single_collect(
161
+ search_parameters=search_subset,
162
+ images_per_group=images_per_group,
163
+ output_directory=output_directory,
164
+ group_by_key=Group.scientificName,
165
+ groups=species_subset
166
+ )
167
+
168
+ print(f"Thread {thread_id} finished collection.")
169
+
101
170
 
102
171
 
103
172