bplusplus 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of bplusplus might be problematic. Click here for more details.

@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Titus Venverloo
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,91 @@
1
+ Metadata-Version: 2.1
2
+ Name: bplusplus
3
+ Version: 0.1.0
4
+ Summary: A simple method to create AI models for biodiversity
5
+ License: MIT
6
+ Author: Titus Venverloo
7
+ Author-email: tvenver@mit.edu
8
+ Requires-Python: >=3.9.0,<4.0.0
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.9
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Requires-Dist: pygbif (>=0.6.4,<0.7.0)
16
+ Requires-Dist: requests (==2.25.1)
17
+ Requires-Dist: ultralytics (==8.0.195)
18
+ Requires-Dist: validators (>=0.33.0,<0.34.0)
19
+ Description-Content-Type: text/markdown
20
+
21
+ # B++ repository
22
+
23
+ This repo can be used to quickly generate YOLOv8 models for biodiversity monitoring, relying on Ultralytics and a GBIF dataset.
24
+
25
+ All code is tested on macOS and Python 3.12, without a GPU. A GPU would obviously accelerate the steps below; Ultralytics should automatically select an available GPU if there is one.
26
+
27
+
28
+ # How does it work?
29
+
30
+ ![Figure 9](https://github.com/user-attachments/assets/a01f513b-0609-412d-a633-3aee1e5dded6)
31
+
32
+ 1. Select scientific names you want to train your model on. For now, only scientific names are supported as training categories.
33
+ 2. Select the parameters you want to use to filter your dataset (using the [parameters available in the GBIF Occurrence Search API](https://techdocs.gbif.org/en/openapi/v1/occurrence)).
34
+ 3. Decide how many images you want to use for training and validation per category.
35
+ 4. Select a directory to output the model information.
36
+ 5. Pass the above information to the `build_model` function.
37
+
38
+ You have created a YOLOv8 model for bug classification.
39
+
40
+ The training and validation is done using Ultralytics. Please visit the Ultralytics YOLOv8 documentation for more information.
41
+
42
+ # Pretrained Model
43
+
44
+ There is also a pretrained YOLOv8 classification model, containing 2584 species, included in this repo under B++ CV Model. The included species are listed in a separate file.
45
+ 1. Download the pretrained model from the Google Drive link listed in the folder B++ CV Model
46
+ 2. Take the notebooks/run_model.py script, specify the path to the downloaded .pt file, and run the model.
47
+
48
+ # Example Usage
49
+ ## Using search options
50
+ ```python
51
+ import os
52
+ import bplusplus
53
+ from typing import Any
54
+
55
+ names = [
56
+ "Nabis rugosus",
57
+ "Forficula auricularia",
58
+ "Calosoma inquisitor",
59
+ "Bombus veteranus",
60
+ "Glyphotaelius pellucidus",
61
+ "Notoxus monoceros",
62
+ "Cacoxenus indagator",
63
+ "Chorthippus mollis",
64
+ "Trioza remota"
65
+ ]
66
+
67
+ search: dict[str, Any] = {
68
+ "scientificName": names,
69
+ "country": ["US", "NL"]
70
+ }
71
+
72
+ bplusplus.build_model(
73
+ group_by_key=bplusplus.Group.scientificName,
74
+ search_parameters=search,
75
+ images_per_group=150,
76
+ model_output_folder=os.path.join('model')
77
+ )
78
+ ```
79
+
80
+ # Pending Improvements
81
+
82
+ * The Ultralytics parameters should be surfaced to the user of the package so they have more control over the training process.
83
+ * The GBIF API documentation claims that you can filter on a dataset in your search; however, this does not work in my current testing. It would be nice to let users create datasets on the GBIF website and then pass the dataset DOI directly here, so this may warrant a closer look.
84
+
85
+
86
+ # Citation
87
+
88
+ All information in this GitHub is available under MIT license, as long as credit is given to the authors.
89
+
90
+ **Venverloo, T., Duarte, F., B++: Towards Real-Time Monitoring of Insect Species. MIT Senseable City Laboratory, AMS Institute.**
91
+
@@ -0,0 +1,70 @@
1
+ # B++ repository
2
+
3
+ This repo can be used to quickly generate YOLOv8 models for biodiversity monitoring, relying on Ultralytics and a GBIF dataset.
4
+
5
+ All code is tested on macOS and Python 3.12, without a GPU. A GPU would obviously accelerate the steps below; Ultralytics should automatically select an available GPU if there is one.
6
+
7
+
8
+ # How does it work?
9
+
10
+ ![Figure 9](https://github.com/user-attachments/assets/a01f513b-0609-412d-a633-3aee1e5dded6)
11
+
12
+ 1. Select scientific names you want to train your model on. For now, only scientific names are supported as training categories.
13
+ 2. Select the parameters you want to use to filter your dataset (using the [parameters available in the GBIF Occurrence Search API](https://techdocs.gbif.org/en/openapi/v1/occurrence)).
14
+ 3. Decide how many images you want to use for training and validation per category.
15
+ 4. Select a directory to output the model information.
16
+ 5. Pass the above information to the `build_model` function.
17
+
18
+ You have created a YOLOv8 model for bug classification.
19
+
20
+ The training and validation is done using Ultralytics. Please visit the Ultralytics YOLOv8 documentation for more information.
21
+
22
+ # Pretrained Model
23
+
24
+ There is also a pretrained YOLOv8 classification model, containing 2584 species, included in this repo under B++ CV Model. The included species are listed in a separate file.
25
+ 1. Download the pretrained model from the Google Drive link listed in the folder B++ CV Model
26
+ 2. Take the notebooks/run_model.py script, specify the path to the downloaded .pt file, and run the model.
27
+
28
+ # Example Usage
29
+ ## Using search options
30
+ ```python
31
+ import os
32
+ import bplusplus
33
+ from typing import Any
34
+
35
+ names = [
36
+ "Nabis rugosus",
37
+ "Forficula auricularia",
38
+ "Calosoma inquisitor",
39
+ "Bombus veteranus",
40
+ "Glyphotaelius pellucidus",
41
+ "Notoxus monoceros",
42
+ "Cacoxenus indagator",
43
+ "Chorthippus mollis",
44
+ "Trioza remota"
45
+ ]
46
+
47
+ search: dict[str, Any] = {
48
+ "scientificName": names,
49
+ "country": ["US", "NL"]
50
+ }
51
+
52
+ bplusplus.build_model(
53
+ group_by_key=bplusplus.Group.scientificName,
54
+ search_parameters=search,
55
+ images_per_group=150,
56
+ model_output_folder=os.path.join('model')
57
+ )
58
+ ```
59
+
60
+ # Pending Improvements
61
+
62
+ * The Ultralytics parameters should be surfaced to the user of the package so they have more control over the training process.
63
+ * The GBIF API documentation claims that you can filter on a dataset in your search; however, this does not work in my current testing. It would be nice to let users create datasets on the GBIF website and then pass the dataset DOI directly here, so this may warrant a closer look.
64
+
65
+
66
+ # Citation
67
+
68
+ All information in this GitHub is available under MIT license, as long as credit is given to the authors.
69
+
70
+ **Venverloo, T., Duarte, F., B++: Towards Real-Time Monitoring of Insect Species. MIT Senseable City Laboratory, AMS Institute.**
@@ -0,0 +1,23 @@
1
+ [tool.poetry]
2
+ name = "bplusplus"
3
+ version = "0.1.0"
4
+ description = "A simple method to create AI models for biodiversity"
5
+ authors = ["Titus Venverloo <tvenver@mit.edu>", "Deniz Aydemir <deniz@aydemir.us>"]
6
+ license = "MIT"
7
+ readme = "README.md"
8
+
9
+ [tool.poetry.dependencies]
10
+ python = "^3.9.0"
11
+ requests = "2.25.1"
12
+ ultralytics = "8.0.195"
13
+ pygbif = "^0.6.4"
14
+ validators = "^0.33.0"
15
+
16
+
17
+ [tool.poetry.group.dev.dependencies]
18
+ jupyter = "^1.0.0"
19
+ ipykernel = "^6.29.5"
20
+
21
+ [build-system]
22
+ requires = ["poetry-core"]
23
+ build-backend = "poetry.core.masonry.api"
@@ -0,0 +1,3 @@
1
+ from .build_model import build_model
2
+ from .collect_images import Group, collect_images
3
+ from .train_validate import train_validate
@@ -0,0 +1,38 @@
1
+ import os
2
+ import shutil
3
+ import tempfile
4
+ from typing import Any
5
+
6
+ from .collect_images import Group, collect_images
7
+ from .train_validate import train_validate
8
+
9
+
10
def build_model(group_by_key: Group, search_parameters: dict[str, Any], images_per_group: int, model_output_folder: str):
    """Build a YOLOv8 classification model from GBIF occurrence images.

    Downloads up to ``images_per_group`` images for every group named in
    ``search_parameters[group_by_key.value]`` into a temporary directory,
    then trains and validates a model, writing results into
    ``model_output_folder``. The temporary image directory is always
    cleaned up, even when collection or training fails.
    """
    # Initialised before the try block so the finally clause cannot hit a
    # NameError if tempfile.mkdtemp() itself raises.
    temp_dir = None
    try:
        # Create a temporary directory to hold the downloaded dataset.
        temp_dir = tempfile.mkdtemp()
        print(f"Temporary directory path: {temp_dir}")

        # Default to an empty list when the grouping key is absent.
        # (Previously the *type object* ``list[str]`` was passed as the
        # default, which is not a usable value.)
        groups = search_parameters.get(group_by_key.value, [])

        collect_images(
            group_by_key=group_by_key,
            search_parameters=search_parameters,
            images_per_group=images_per_group,
            output_directory=temp_dir
        )

        train_validate(
            groups=groups,
            dataset_path=temp_dir,
            output_directory=model_output_folder
        )

    finally:
        # Clean up the temporary directory.
        if temp_dir and os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)
            print(f"Cleaned up temporary directory: {temp_dir}")
@@ -0,0 +1,118 @@
1
+ import os
2
+ import random
3
+ from enum import Enum
4
+ from typing import Any, Optional
5
+
6
+ import pygbif
7
+ import requests
8
+ import validators
9
+
10
+
11
# Groupings currently supported as training categories; add new members
# only after proper testing against the GBIF occurrence search API.
class Group(str, Enum):
    """Occurrence-search keys that images can be grouped (and labelled) by."""

    scientificName = "scientificName"
14
+
15
#TODO add back support for fetching from dataset (or csvs)
def collect_images(group_by_key: Group, search_parameters: dict[str, Any], images_per_group: int, output_directory: str):
    """Download a sample of GBIF occurrence images for every group.

    One sub-folder of ``output_directory`` is created per group name found
    under ``search_parameters[group_by_key.value]``, and up to
    ``images_per_group`` images are downloaded into each.
    """
    group_names: list[str] = search_parameters[group_by_key.value]

    #TODO throw error if groups is not a str list

    __create_folders(
        names=group_names,
        directory=output_directory
    )

    print("Beginning to collect images from GBIF...")
    for name in group_names:
        print(f"Collecting images for {name}...")
        raw_records = _fetch_occurrences(group_key=group_by_key, group_value=name, parameters=search_parameters, totalLimit=10000)
        # Keep only records that parse into a usable Occurrence.
        parsed = [occ for occ in (__parse_occurrence(record) for record in raw_records) if occ]

        print(f"{name} : {len(parsed)} parseable occurrences fetched, will sample for {images_per_group}")

        random.seed(42)  # for reproducibility
        sampled = random.sample(parsed, min(images_per_group, len(parsed)))

        print(f"Downloading {len(sampled)} images into the {name} folder...")
        for occurrence in sampled:
            # NOTE: replacing "original" with "large" in the URL is a known
            # hack to cap the image at 1024px, currently unused.
            __down_image(
                url=occurrence.image_url,
                group=name,
                ID_name=occurrence.key,
                folder=output_directory
            )

    print("Finished collecting images.")
52
def _fetch_occurrences(group_key: str, group_value: str, parameters: dict[str, Any], totalLimit: int) -> list[dict[str, Any]]:
    """Fetch up to ``totalLimit`` occurrence records for one group value.

    A shallow copy of ``parameters`` is taken so the caller's search
    dictionary is not polluted with the group filter or with the
    pagination/media keys that __next_batch adds while paging.
    """
    query = dict(parameters)
    query[group_key] = group_value
    return __next_batch(
        parameters=query,
        total_limit=totalLimit,
        offset=0,
        current=[]
    )
60
+
61
def __next_batch(parameters: dict[str, Any], total_limit: int, offset: int, current: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Page through GBIF occurrence search results and accumulate them.

    Iterative rather than recursive so that very large result sets cannot
    hit the interpreter recursion limit; the per-iteration logic is
    otherwise identical to the original recursive version. Returns the
    accumulated records once GBIF reports the end of records or at least
    ``total_limit`` records have been gathered.
    """
    parameters["mediaType"] = ["StillImage"]
    while True:
        parameters["limit"] = total_limit
        parameters["offset"] = offset
        search = pygbif.occurrences.search(**parameters)
        occurrences = search["results"]
        if search["endOfRecords"] or len(current) >= total_limit:
            return current + occurrences
        # "limit" in the response seems to be the count of records actually
        # returned for this page, while "count" appears to be the total
        # number of results matched by the search.
        offset = search["offset"] + search["limit"]
        current = current + occurrences
78
+
79
#function to download insect images
def __down_image(url: str, group: str, ID_name: str, folder: str):
    """Download one occurrence image into ``folder/<group>/``.

    Best effort: a failed or slow download is reported and skipped rather
    than aborting the whole collection run.
    """
    directory = os.path.join(folder, f"{group}")
    os.makedirs(directory, exist_ok=True)
    try:
        # timeout prevents a dead media host from hanging the run forever.
        image_response = requests.get(url, timeout=30)
        # Without this check an error page (e.g. 404 HTML) would be written
        # to disk as a .jpg and silently poison the training set.
        image_response.raise_for_status()
    except requests.RequestException as error:
        print(f"Failed to download {url}: {error}")
        return
    image_name = f"{group}{ID_name}.jpg"  # You can modify the naming convention as per your requirements
    image_path = os.path.join(directory, image_name)
    with open(image_path, "wb") as f:
        f.write(image_response.content)
89
+
90
def __create_folders(names: list[str], directory: str):
    """Create ``directory`` plus one sub-folder per group name."""
    print("Creating folders for images...")
    # exist_ok covers both a pre-existing root directory and
    # pre-existing group folders in a single idiom.
    os.makedirs(directory, exist_ok=True)
    for label in names:
        os.makedirs(os.path.join(directory, label), exist_ok=True)
100
+
101
+
102
+
103
+
104
class Occurrence:
    """A GBIF occurrence record reduced to what image download needs."""

    def __init__(self, key: str, image_url: str) -> None:
        # GBIF occurrence key; used to build a unique image filename.
        self.key = key
        # URL of the occurrence's first still image.
        self.image_url = image_url
109
+
110
+
111
def __parse_occurrence(json: dict[str, Any]) -> Optional[Occurrence]:
    """Convert a raw GBIF occurrence dict into an Occurrence, or None.

    Returns None when the record lacks a key, has no media entries, or its
    first media identifier is missing or not a valid URL. The original
    version used ``json.get("key", str)`` (the *type* ``str`` as default,
    which is truthy, so missing keys slipped through) and indexed
    ``media[0]`` unguarded, raising KeyError/IndexError on records with
    missing or empty media instead of skipping them.
    """
    key = json.get("key")
    if key is None:
        return None
    media = json.get("media")
    if not media:
        # Absent or empty media list: record is unusable, skip it.
        return None
    image_url = media[0].get("identifier")
    if image_url is None or not validators.url(image_url):
        return None
    return Occurrence(key=key, image_url=image_url)
@@ -0,0 +1,67 @@
1
+ import os
2
+ import random
3
+ import shutil
4
+
5
+ from ultralytics import YOLO
6
+
7
+
8
#split ratio defaults to 80% training 20% validation
def train_validate(groups: list[str], dataset_path: str, output_directory: str, split_ratio: float = 0.8, epochs: int = 5, batch: int = 16, imgsz: int = 224):
    """Split the downloaded dataset and train/validate a YOLOv8 classifier.

    Parameters:
        groups: class labels; each must be a sub-folder of ``dataset_path``.
        dataset_path: folder holding one image sub-folder per group.
        output_directory: destination for the train/val split and model runs.
        split_ratio: fraction of each group's images used for training.
        epochs, batch, imgsz: Ultralytics training parameters, surfaced so
            callers can tune the run; the defaults preserve the previously
            hard-coded values. Be aware of system limits (memory, CPU,
            GPU, ...) when raising them.
    """
    train_path = os.path.join(output_directory, 'train')  # Path to the training folder
    val_path = os.path.join(output_directory, 'val')  # Path to the validation folder

    # Create training and validation directories if they don't exist
    os.makedirs(train_path, exist_ok=True)
    os.makedirs(val_path, exist_ok=True)

    # Move every group's images into train/<group> and val/<group>.
    for group in groups:
        __split_group(
            group=group,
            dataset_path=dataset_path,
            train_path=train_path,
            val_path=val_path,
            split_ratio=split_ratio
        )

    print("Dataset splitting completed successfully.")

    # Create a new YOLO model from scratch (the .pt weights are expected
    # in output_directory — TODO confirm Ultralytics downloads them there
    # when absent).
    model = YOLO(os.path.join(output_directory, 'yolov8n-cls.pt'))

    # data is the folder containing the train/ and val/ sub-folders.
    model.train(data=output_directory, epochs=epochs, batch=batch, imgsz=imgsz, project=output_directory)

    # batch is adjusted to 1 to prevent a resizing bug - in training this
    # bug doesn't emerge. A work around for larger batch size could be a
    # resizing step in advance.
    model.val(batch=1)


def __split_group(group: str, dataset_path: str, train_path: str, val_path: str, split_ratio: float):
    """Shuffle one group's images and move them into train/val sub-folders."""
    dataset_folder = os.path.join(dataset_path, group)
    images = __files_in_folder(folder=dataset_folder)

    # Shuffle so the split is random rather than directory-order dependent.
    random.shuffle(images)

    # Everything before the split index trains; the rest validates.
    split_index = int(len(images) * split_ratio)
    train_images = images[:split_index]
    val_images = images[split_index:]

    # Create destination folders if they don't exist
    train_label_path = os.path.join(train_path, group)
    val_label_path = os.path.join(val_path, group)
    os.makedirs(train_label_path, exist_ok=True)
    os.makedirs(val_label_path, exist_ok=True)

    # Move (not copy) images so the source dataset folder is drained.
    for image in train_images:
        shutil.move(os.path.join(dataset_folder, image), os.path.join(train_label_path, image))
    for image in val_images:
        shutil.move(os.path.join(dataset_folder, image), os.path.join(val_label_path, image))
63
+
64
+
65
+
66
def __files_in_folder(folder: str) -> list[str]:
    """Return the names (not paths) of regular files directly inside ``folder``."""
    entries = os.listdir(folder)
    return [name for name in entries if os.path.isfile(os.path.join(folder, name))]