bplusplus 0.1.1__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of bplusplus might be problematic.

bplusplus/__init__.py CHANGED
@@ -1,3 +1,7 @@
-from .build_model import build_model
-from .collect_images import Group, collect_images
-from .train_validate import train_validate
+from .collect import Group, collect
+from .train_validate import train, validate
+from .prepare import prepare
+from .resnet.train import train_resnet
+from .resnet.test import test_resnet
+from .hierarchical.train import train_multitask
+from .hierarchical.test import test_multitask
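
For orientation, here is a minimal sketch of how an import site might migrate across this change; the lines below are illustrative assumptions built only from the exports shown in the hunk above, not code taken from the package:

    # bplusplus 0.1.1 (old exports, removed above)
    # from bplusplus import build_model, collect_images, train_validate

    # bplusplus 1.2.0 (new exports, added above)
    from bplusplus import Group, collect, prepare, train, validate
    from bplusplus import train_resnet, test_resnet, train_multitask, test_multitask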
@@ -1,22 +1,41 @@
 import os
 import random
+import threading
 from enum import Enum
-from typing import Any, Optional
-
+from typing import Any, Optional, List, Dict
+from tqdm import tqdm
+import random
 import pygbif
 import requests
 import validators
 
-
 #this lists currently supported groupings, more can be added with proper testing
 class Group(str, Enum):
     scientificName="scientificName"
 
 #TODO add back support for fetching from dataset (or csvs)
-def collect_images(group_by_key: Group, search_parameters: dict[str, Any], images_per_group: int, output_directory: str):
+def collect(group_by_key: Group, search_parameters: dict[str, Any], images_per_group: int, output_directory: str, num_threads: int):
 
     groups: list[str] = search_parameters[group_by_key.value]
 
+    # check if user wants to parallelize the process
+    if num_threads > 1:
+        __threaded_collect(
+            images_per_group=images_per_group,
+            output_directory=output_directory,
+            num_threads=num_threads,
+            groups=groups)
+    else:
+        __single_collect(
+            search_parameters=search_parameters,
+            images_per_group=images_per_group,
+            output_directory=output_directory,
+            group_by_key=group_by_key,
+            groups=groups,
+        )
+
+def __single_collect(group_by_key: Group, search_parameters: dict[str, Any], images_per_group: int, output_directory: str, groups: list[str]):
+
     #TODO throw error if groups is not a str list
 
     __create_folders(
@@ -26,18 +45,18 @@ def collect_images(group_by_key: Group, search_parameters: dict[str, Any], image
 
     print("Beginning to collect images from GBIF...")
     for group in groups:
-        print(f"Collecting images for {group}...")
+        # print(f"Collecting images for {group}...")
         occurrences_json = _fetch_occurrences(group_key=group_by_key, group_value=group, parameters=search_parameters, totalLimit=10000)
         optional_occurrences = map(lambda x: __parse_occurrence(x), occurrences_json)
         occurrences = list(filter(None, optional_occurrences))
 
-        print(f"{group} : {len(occurrences)} parseable occurrences fetched, will sample for {images_per_group}")
+        # print(f"{group} : {len(occurrences)} parseable occurrences fetched, will sample for {images_per_group}")
 
         random.seed(42) # for reproducibility
         sampled_occurrences = random.sample(occurrences, min(images_per_group, len(occurrences)))
 
         print(f"Downloading {len(sampled_occurrences)} images into the {group} folder...")
-        for occurrence in sampled_occurrences:
+        for occurrence in tqdm(sampled_occurrences, desc=f"Downloading images for {group}", unit="image"):
             # image_url = occurrence.image_url.replace("original", "large") # hack to get max 1024px image
 
             __down_image(
@@ -49,6 +68,34 @@ def collect_images(group_by_key: Group, search_parameters: dict[str, Any], image
 
     print("Finished collecting images.")
 
+# threaded_collect: paralellize the collection of images
+def __threaded_collect(images_per_group: int, output_directory: str, num_threads: int, groups: list[str]):
+    # Divide the species list into num_threads parts
+    chunk_size = len(groups) // num_threads
+    species_chunks = [
+        groups[i:i + chunk_size] for i in range(0, len(groups), chunk_size)
+    ]
+
+    # Ensure we have exactly num_threads chunks (the last chunk might be larger if len(species_list) % num_threads != 0)
+    while len(species_chunks) < num_threads:
+        species_chunks.append([])
+
+    threads = []
+    for i, chunk in enumerate(species_chunks):
+        thread = threading.Thread(
+            target=__collect_subset,
+            args=(chunk, images_per_group, output_directory, i)
+        )
+        threads.append(thread)
+        thread.start()
+
+    # Wait for all threads to complete
+    for thread in threads:
+        thread.join()
+
+    print("All collection threads have finished.")
+
+
 def _fetch_occurrences(group_key: str, group_value: str, parameters: dict[str, Any], totalLimit: int) -> list[dict[str, Any]]:
     parameters[group_key] = group_value
     return __next_batch(
@@ -98,6 +145,23 @@ def __create_folders(names: list[str], directory: str):
         # Create a folder using the group name
         os.makedirs(folder_name, exist_ok=True)
 
+def __collect_subset(species_subset: List[str], images_per_group: int, output_directory: str, thread_id: int):
+    search_subset: Dict[str, Any] = {
+        "scientificName": species_subset
+    }
+
+    print(f"Thread {thread_id} starting collection for {len(species_subset)} species.")
+
+    __single_collect(
+        search_parameters=search_subset,
+        images_per_group=images_per_group,
+        output_directory=output_directory,
+        group_by_key=Group.scientificName,
+        groups=species_subset
+    )
+
+    print(f"Thread {thread_id} finished collection.")
+
 
 
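
As a quick check of the chunk-splitting arithmetic that __threaded_collect introduces above, the standalone snippet below reproduces just those lines with made-up inputs (ten hypothetical species names and num_threads=3); it is a sketch for illustration, not code shipped in the package:

    # Standalone sketch of the chunking logic from __threaded_collect above.
    # The species names and thread count are illustrative assumptions.
    groups = [f"species_{i}" for i in range(10)]
    num_threads = 3

    chunk_size = len(groups) // num_threads  # 10 // 3 = 3
    species_chunks = [
        groups[i:i + chunk_size] for i in range(0, len(groups), chunk_size)
    ]
    # Pad with empty chunks only if fewer chunks than requested threads were produced
    while len(species_chunks) < num_threads:
        species_chunks.append([])

    print([len(chunk) for chunk in species_chunks])  # [3, 3, 3, 1]

When len(groups) is not an exact multiple of num_threads, this slicing yields one extra chunk (and therefore one extra worker thread) rather than a larger final chunk; and if there are fewer groups than threads, chunk_size becomes 0 and range() raises a ValueError.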