bplusplus 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of bplusplus might be problematic. Click here for more details.

@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Titus Venverloo
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,91 @@
1
+ Metadata-Version: 2.1
2
+ Name: bplusplus
3
+ Version: 0.1.0
4
+ Summary: A simple method to create AI models for biodiversity
5
+ License: MIT
6
+ Author: Titus Venverloo
7
+ Author-email: tvenver@mit.edu
8
+ Requires-Python: >=3.9.0,<4.0.0
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.9
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Requires-Dist: pygbif (>=0.6.4,<0.7.0)
16
+ Requires-Dist: requests (==2.25.1)
17
+ Requires-Dist: ultralytics (==8.0.195)
18
+ Requires-Dist: validators (>=0.33.0,<0.34.0)
19
+ Description-Content-Type: text/markdown
20
+
21
+ # B++ repository
22
+
23
+ This repo can be used to quickly generate YOLOv8 models for biodiversity monitoring, relying on Ultralytics and a GBIF dataset.
24
+
25
+ All code is tested on macOS and Python 3.12, without a GPU. A GPU would obviously accelerate the steps below; Ultralytics should automatically select an available GPU if there is one.
26
+
27
+
28
+ # How does it work?
29
+
30
+ ![Figure 9](https://github.com/user-attachments/assets/a01f513b-0609-412d-a633-3aee1e5dded6)
31
+
32
+ 1. Select scientific names you want to train your model on. For now, only scientific names are supported as training categories.
33
+ 2. Select the parameters you want to use to filter your dataset (using the [parameters available in the GBIF Occurrence Search API](https://techdocs.gbif.org/en/openapi/v1/occurrence)).
34
+ 3. Decide how many images you want to use for training and validation per category.
35
+ 4. Select a directory to output the model information.
36
+ 5. Pass the above information to the `build_model` function.
37
+
38
+ You have created a YOLOv8 model for bug classification.
39
+
40
+ The training and validation is done using Ultralytics. Please visit the Ultralytics YOLOv8 documentation for more information.
41
+
42
+ # Pretrained Model
43
+
44
+ There is also a pretrained YOLOv8 classification model, containing 2584 species, included in this repo under B++ CV Model. The included species are listed in a separate file.
45
+ 1. Download the pretrained model from the Google Drive link listed in the folder B++ CV Model
46
+ 2. Take the notebooks/run_model.py script, specify the path to the downloaded .pt file, and run the model.
47
+
48
+ # Example Usage
49
+ ## Using search options
50
+ ```python
51
+ import os
52
+ import bplusplus
53
+ from typing import Any
54
+
55
+ names = [
56
+ "Nabis rugosus",
57
+ "Forficula auricularia",
58
+ "Calosoma inquisitor",
59
+ "Bombus veteranus",
60
+ "Glyphotaelius pellucidus",
61
+ "Notoxus monoceros",
62
+ "Cacoxenus indagator",
63
+ "Chorthippus mollis",
64
+ "Trioza remota"
65
+ ]
66
+
67
+ search: dict[str, Any] = {
68
+ "scientificName": names,
69
+ "country": ["US", "NL"]
70
+ }
71
+
72
+ bplusplus.build_model(
73
+ group_by_key=bplusplus.Group.scientificName,
74
+ search_parameters=search,
75
+ images_per_group=150,
76
+ model_output_folder=os.path.join('model')
77
+ )
78
+ ```
79
+
80
+ # Pending Improvements
81
+
82
+ * The Ultralytics parameters should be surfaced to the user of the package so they have more control over the training process.
83
+ * The GBIF API documentation claims that you can filter on a dataset in your search; however, this does not work in my current testing. It would be nice to let users create datasets on the GBIF website and then pass the dataset DOI directly here, so this may warrant a closer look.
84
+
85
+
86
+ # Citation
87
+
88
+ All information in this GitHub is available under MIT license, as long as credit is given to the authors.
89
+
90
+ **Venverloo, T., Duarte, F., B++: Towards Real-Time Monitoring of Insect Species. MIT Senseable City Laboratory, AMS Institute.**
91
+
@@ -0,0 +1,70 @@
1
+ # B++ repository
2
+
3
+ This repo can be used to quickly generate YOLOv8 models for biodiversity monitoring, relying on Ultralytics and a GBIF dataset.
4
+
5
+ All code is tested on macOS and Python 3.12, without a GPU. A GPU would obviously accelerate the steps below; Ultralytics should automatically select an available GPU if there is one.
6
+
7
+
8
+ # How does it work?
9
+
10
+ ![Figure 9](https://github.com/user-attachments/assets/a01f513b-0609-412d-a633-3aee1e5dded6)
11
+
12
+ 1. Select scientific names you want to train your model on. For now, only scientific names are supported as training categories.
13
+ 2. Select the parameters you want to use to filter your dataset (using the [parameters available in the GBIF Occurrence Search API](https://techdocs.gbif.org/en/openapi/v1/occurrence)).
14
+ 3. Decide how many images you want to use for training and validation per category.
15
+ 4. Select a directory to output the model information.
16
+ 5. Pass the above information to the `build_model` function.
17
+
18
+ You have created a YOLOv8 model for bug classification.
19
+
20
+ The training and validation is done using Ultralytics. Please visit the Ultralytics YOLOv8 documentation for more information.
21
+
22
+ # Pretrained Model
23
+
24
+ There is also a pretrained YOLOv8 classification model, containing 2584 species, included in this repo under B++ CV Model. The included species are listed in a separate file.
25
+ 1. Download the pretrained model from the Google Drive link listed in the folder B++ CV Model
26
+ 2. Take the notebooks/run_model.py script, specify the path to the downloaded .pt file, and run the model.
27
+
28
+ # Example Usage
29
+ ## Using search options
30
+ ```python
31
+ import os
32
+ import bplusplus
33
+ from typing import Any
34
+
35
+ names = [
36
+ "Nabis rugosus",
37
+ "Forficula auricularia",
38
+ "Calosoma inquisitor",
39
+ "Bombus veteranus",
40
+ "Glyphotaelius pellucidus",
41
+ "Notoxus monoceros",
42
+ "Cacoxenus indagator",
43
+ "Chorthippus mollis",
44
+ "Trioza remota"
45
+ ]
46
+
47
+ search: dict[str, Any] = {
48
+ "scientificName": names,
49
+ "country": ["US", "NL"]
50
+ }
51
+
52
+ bplusplus.build_model(
53
+ group_by_key=bplusplus.Group.scientificName,
54
+ search_parameters=search,
55
+ images_per_group=150,
56
+ model_output_folder=os.path.join('model')
57
+ )
58
+ ```
59
+
60
+ # Pending Improvements
61
+
62
+ * The Ultralytics parameters should be surfaced to the user of the package so they have more control over the training process.
63
+ * The GBIF API documentation claims that you can filter on a dataset in your search; however, this does not work in my current testing. It would be nice to let users create datasets on the GBIF website and then pass the dataset DOI directly here, so this may warrant a closer look.
64
+
65
+
66
+ # Citation
67
+
68
+ All information in this GitHub is available under MIT license, as long as credit is given to the authors.
69
+
70
+ **Venverloo, T., Duarte, F., B++: Towards Real-Time Monitoring of Insect Species. MIT Senseable City Laboratory, AMS Institute.**
@@ -0,0 +1,23 @@
1
+ [tool.poetry]
2
+ name = "bplusplus"
3
+ version = "0.1.0"
4
+ description = "A simple method to create AI models for biodiversity"
5
+ authors = ["Titus Venverloo <tvenver@mit.edu>", "Deniz Aydemir <deniz@aydemir.us>"]
6
+ license = "MIT"
7
+ readme = "README.md"
8
+
9
+ [tool.poetry.dependencies]
10
+ python = "^3.9.0"
11
+ requests = "2.25.1"
12
+ ultralytics = "8.0.195"
13
+ pygbif = "^0.6.4"
14
+ validators = "^0.33.0"
15
+
16
+
17
+ [tool.poetry.group.dev.dependencies]
18
+ jupyter = "^1.0.0"
19
+ ipykernel = "^6.29.5"
20
+
21
+ [build-system]
22
+ requires = ["poetry-core"]
23
+ build-backend = "poetry.core.masonry.api"
@@ -0,0 +1,3 @@
1
+ from .build_model import build_model
2
+ from .collect_images import Group, collect_images
3
+ from .train_validate import train_validate
@@ -0,0 +1,38 @@
1
+ import os
2
+ import shutil
3
+ import tempfile
4
+ from typing import Any
5
+
6
+ from .collect_images import Group, collect_images
7
+ from .train_validate import train_validate
8
+
9
+
10
def build_model(group_by_key: Group, search_parameters: dict[str, Any], images_per_group: int, model_output_folder: str):
    """Build a YOLOv8 classification model from GBIF occurrence images.

    Downloads up to ``images_per_group`` images for every group named in
    ``search_parameters[group_by_key.value]`` into a temporary directory,
    then trains and validates a model, writing results into
    ``model_output_folder``. The temporary image directory is always
    cleaned up, even when collection or training fails.
    """
    # Initialised before the try block so the finally clause cannot hit a
    # NameError if tempfile.mkdtemp() itself raises.
    temp_dir = None
    try:
        # Create a temporary directory to hold the downloaded dataset.
        temp_dir = tempfile.mkdtemp()
        print(f"Temporary directory path: {temp_dir}")

        # Default to an empty list when the grouping key is absent.
        # (Previously the *type object* ``list[str]`` was passed as the
        # default, which is not a usable value.)
        groups = search_parameters.get(group_by_key.value, [])

        collect_images(
            group_by_key=group_by_key,
            search_parameters=search_parameters,
            images_per_group=images_per_group,
            output_directory=temp_dir
        )

        train_validate(
            groups=groups,
            dataset_path=temp_dir,
            output_directory=model_output_folder
        )

    finally:
        # Clean up the temporary directory.
        if temp_dir and os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)
            print(f"Cleaned up temporary directory: {temp_dir}")
@@ -0,0 +1,118 @@
1
+ import os
2
+ import random
3
+ from enum import Enum
4
+ from typing import Any, Optional
5
+
6
+ import pygbif
7
+ import requests
8
+ import validators
9
+
10
+
11
# Groupings currently supported as training categories; add new members
# only after proper testing against the GBIF occurrence search API.
class Group(str, Enum):
    """Occurrence-search keys that images can be grouped (and labelled) by."""

    scientificName = "scientificName"
14
+
15
#TODO add back support for fetching from dataset (or csvs)
def collect_images(group_by_key: Group, search_parameters: dict[str, Any], images_per_group: int, output_directory: str):
    """Download a sample of GBIF occurrence images for every group.

    One sub-folder of ``output_directory`` is created per group name found
    under ``search_parameters[group_by_key.value]``, and up to
    ``images_per_group`` images are downloaded into each.
    """
    group_names: list[str] = search_parameters[group_by_key.value]

    #TODO throw error if groups is not a str list

    __create_folders(
        names=group_names,
        directory=output_directory
    )

    print("Beginning to collect images from GBIF...")
    for name in group_names:
        print(f"Collecting images for {name}...")
        raw_records = _fetch_occurrences(group_key=group_by_key, group_value=name, parameters=search_parameters, totalLimit=10000)
        # Keep only records that parse into a usable Occurrence.
        parsed = [occ for occ in (__parse_occurrence(record) for record in raw_records) if occ]

        print(f"{name} : {len(parsed)} parseable occurrences fetched, will sample for {images_per_group}")

        random.seed(42)  # for reproducibility
        sampled = random.sample(parsed, min(images_per_group, len(parsed)))

        print(f"Downloading {len(sampled)} images into the {name} folder...")
        for occurrence in sampled:
            # NOTE: replacing "original" with "large" in the URL is a known
            # hack to cap the image at 1024px, currently unused.
            __down_image(
                url=occurrence.image_url,
                group=name,
                ID_name=occurrence.key,
                folder=output_directory
            )

    print("Finished collecting images.")
52
def _fetch_occurrences(group_key: str, group_value: str, parameters: dict[str, Any], totalLimit: int) -> list[dict[str, Any]]:
    """Fetch up to ``totalLimit`` occurrence records for one group value.

    A shallow copy of ``parameters`` is taken so the caller's search
    dictionary is not polluted with the group filter or with the
    pagination/media keys that __next_batch adds while paging.
    """
    query = dict(parameters)
    query[group_key] = group_value
    return __next_batch(
        parameters=query,
        total_limit=totalLimit,
        offset=0,
        current=[]
    )
60
+
61
def __next_batch(parameters: dict[str, Any], total_limit: int, offset: int, current: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Page through GBIF occurrence search results and accumulate them.

    Iterative rather than recursive so that very large result sets cannot
    hit the interpreter recursion limit; the per-iteration logic is
    otherwise identical to the original recursive version. Returns the
    accumulated records once GBIF reports the end of records or at least
    ``total_limit`` records have been gathered.
    """
    parameters["mediaType"] = ["StillImage"]
    while True:
        parameters["limit"] = total_limit
        parameters["offset"] = offset
        search = pygbif.occurrences.search(**parameters)
        occurrences = search["results"]
        if search["endOfRecords"] or len(current) >= total_limit:
            return current + occurrences
        # "limit" in the response seems to be the count of records actually
        # returned for this page, while "count" appears to be the total
        # number of results matched by the search.
        offset = search["offset"] + search["limit"]
        current = current + occurrences
78
+
79
#function to download insect images
def __down_image(url: str, group: str, ID_name: str, folder: str):
    """Download one occurrence image into ``folder/<group>/``.

    Best effort: a failed or slow download is reported and skipped rather
    than aborting the whole collection run.
    """
    directory = os.path.join(folder, f"{group}")
    os.makedirs(directory, exist_ok=True)
    try:
        # timeout prevents a dead media host from hanging the run forever.
        image_response = requests.get(url, timeout=30)
        # Without this check an error page (e.g. 404 HTML) would be written
        # to disk as a .jpg and silently poison the training set.
        image_response.raise_for_status()
    except requests.RequestException as error:
        print(f"Failed to download {url}: {error}")
        return
    image_name = f"{group}{ID_name}.jpg"  # You can modify the naming convention as per your requirements
    image_path = os.path.join(directory, image_name)
    with open(image_path, "wb") as f:
        f.write(image_response.content)
89
+
90
def __create_folders(names: list[str], directory: str):
    """Create ``directory`` plus one sub-folder per group name."""
    print("Creating folders for images...")
    # exist_ok covers both a pre-existing root directory and
    # pre-existing group folders in a single idiom.
    os.makedirs(directory, exist_ok=True)
    for label in names:
        os.makedirs(os.path.join(directory, label), exist_ok=True)
100
+
101
+
102
+
103
+
104
class Occurrence:
    """A GBIF occurrence record reduced to what image download needs."""

    def __init__(self, key: str, image_url: str) -> None:
        # GBIF occurrence key; used to build a unique image filename.
        self.key = key
        # URL of the occurrence's first still image.
        self.image_url = image_url
109
+
110
+
111
def __parse_occurrence(json: dict[str, Any]) -> Optional[Occurrence]:
    """Convert a raw GBIF occurrence dict into an Occurrence, or None.

    Returns None when the record lacks a key, has no media entries, or its
    first media identifier is missing or not a valid URL. The original
    version used ``json.get("key", str)`` (the *type* ``str`` as default,
    which is truthy, so missing keys slipped through) and indexed
    ``media[0]`` unguarded, raising KeyError/IndexError on records with
    missing or empty media instead of skipping them.
    """
    key = json.get("key")
    if key is None:
        return None
    media = json.get("media")
    if not media:
        # Absent or empty media list: record is unusable, skip it.
        return None
    image_url = media[0].get("identifier")
    if image_url is None or not validators.url(image_url):
        return None
    return Occurrence(key=key, image_url=image_url)
@@ -0,0 +1,67 @@
1
+ import os
2
+ import random
3
+ import shutil
4
+
5
+ from ultralytics import YOLO
6
+
7
+
8
#split ratio defaults to 80% training 20% validation
def train_validate(groups: list[str], dataset_path: str, output_directory: str, split_ratio: float = 0.8, epochs: int = 5, batch: int = 16, imgsz: int = 224):
    """Split the downloaded dataset and train/validate a YOLOv8 classifier.

    Parameters:
        groups: class labels; each must be a sub-folder of ``dataset_path``.
        dataset_path: folder holding one image sub-folder per group.
        output_directory: destination for the train/val split and model runs.
        split_ratio: fraction of each group's images used for training.
        epochs, batch, imgsz: Ultralytics training parameters, surfaced so
            callers can tune the run; the defaults preserve the previously
            hard-coded values. Be aware of system limits (memory, CPU,
            GPU, ...) when raising them.
    """
    train_path = os.path.join(output_directory, 'train')  # Path to the training folder
    val_path = os.path.join(output_directory, 'val')  # Path to the validation folder

    # Create training and validation directories if they don't exist
    os.makedirs(train_path, exist_ok=True)
    os.makedirs(val_path, exist_ok=True)

    # Move every group's images into train/<group> and val/<group>.
    for group in groups:
        __split_group(
            group=group,
            dataset_path=dataset_path,
            train_path=train_path,
            val_path=val_path,
            split_ratio=split_ratio
        )

    print("Dataset splitting completed successfully.")

    # Create a new YOLO model from scratch (the .pt weights are expected
    # in output_directory — TODO confirm Ultralytics downloads them there
    # when absent).
    model = YOLO(os.path.join(output_directory, 'yolov8n-cls.pt'))

    # data is the folder containing the train/ and val/ sub-folders.
    model.train(data=output_directory, epochs=epochs, batch=batch, imgsz=imgsz, project=output_directory)

    # batch is adjusted to 1 to prevent a resizing bug - in training this
    # bug doesn't emerge. A work around for larger batch size could be a
    # resizing step in advance.
    model.val(batch=1)


def __split_group(group: str, dataset_path: str, train_path: str, val_path: str, split_ratio: float):
    """Shuffle one group's images and move them into train/val sub-folders."""
    dataset_folder = os.path.join(dataset_path, group)
    images = __files_in_folder(folder=dataset_folder)

    # Shuffle so the split is random rather than directory-order dependent.
    random.shuffle(images)

    # Everything before the split index trains; the rest validates.
    split_index = int(len(images) * split_ratio)
    train_images = images[:split_index]
    val_images = images[split_index:]

    # Create destination folders if they don't exist
    train_label_path = os.path.join(train_path, group)
    val_label_path = os.path.join(val_path, group)
    os.makedirs(train_label_path, exist_ok=True)
    os.makedirs(val_label_path, exist_ok=True)

    # Move (not copy) images so the source dataset folder is drained.
    for image in train_images:
        shutil.move(os.path.join(dataset_folder, image), os.path.join(train_label_path, image))
    for image in val_images:
        shutil.move(os.path.join(dataset_folder, image), os.path.join(val_label_path, image))
63
+
64
+
65
+
66
def __files_in_folder(folder: str) -> list[str]:
    """Return the names (not paths) of regular files directly inside ``folder``."""
    entries = os.listdir(folder)
    return [name for name in entries if os.path.isfile(os.path.join(folder, name))]