megadetector 5.0.11__py3-none-any.whl → 5.0.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of megadetector might be problematic.

Files changed (203)
  1. megadetector/api/__init__.py +0 -0
  2. megadetector/api/batch_processing/__init__.py +0 -0
  3. megadetector/api/batch_processing/api_core/__init__.py +0 -0
  4. megadetector/api/batch_processing/api_core/batch_service/__init__.py +0 -0
  5. megadetector/api/batch_processing/api_core/batch_service/score.py +439 -0
  6. megadetector/api/batch_processing/api_core/server.py +294 -0
  7. megadetector/api/batch_processing/api_core/server_api_config.py +97 -0
  8. megadetector/api/batch_processing/api_core/server_app_config.py +55 -0
  9. megadetector/api/batch_processing/api_core/server_batch_job_manager.py +220 -0
  10. megadetector/api/batch_processing/api_core/server_job_status_table.py +149 -0
  11. megadetector/api/batch_processing/api_core/server_orchestration.py +360 -0
  12. megadetector/api/batch_processing/api_core/server_utils.py +88 -0
  13. megadetector/api/batch_processing/api_core_support/__init__.py +0 -0
  14. megadetector/api/batch_processing/api_core_support/aggregate_results_manually.py +46 -0
  15. megadetector/api/batch_processing/api_support/__init__.py +0 -0
  16. megadetector/api/batch_processing/api_support/summarize_daily_activity.py +152 -0
  17. megadetector/api/batch_processing/data_preparation/__init__.py +0 -0
  18. megadetector/api/batch_processing/integration/digiKam/setup.py +6 -0
  19. megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +465 -0
  20. megadetector/api/batch_processing/integration/eMammal/test_scripts/config_template.py +5 -0
  21. megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +125 -0
  22. megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +55 -0
  23. megadetector/api/synchronous/__init__.py +0 -0
  24. megadetector/api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
  25. megadetector/api/synchronous/api_core/animal_detection_api/api_backend.py +152 -0
  26. megadetector/api/synchronous/api_core/animal_detection_api/api_frontend.py +263 -0
  27. megadetector/api/synchronous/api_core/animal_detection_api/config.py +35 -0
  28. megadetector/api/synchronous/api_core/tests/__init__.py +0 -0
  29. megadetector/api/synchronous/api_core/tests/load_test.py +110 -0
  30. megadetector/classification/__init__.py +0 -0
  31. megadetector/classification/aggregate_classifier_probs.py +108 -0
  32. megadetector/classification/analyze_failed_images.py +227 -0
  33. megadetector/classification/cache_batchapi_outputs.py +198 -0
  34. megadetector/classification/create_classification_dataset.py +627 -0
  35. megadetector/classification/crop_detections.py +516 -0
  36. megadetector/classification/csv_to_json.py +226 -0
  37. megadetector/classification/detect_and_crop.py +855 -0
  38. megadetector/classification/efficientnet/__init__.py +9 -0
  39. megadetector/classification/efficientnet/model.py +415 -0
  40. megadetector/classification/efficientnet/utils.py +607 -0
  41. megadetector/classification/evaluate_model.py +520 -0
  42. megadetector/classification/identify_mislabeled_candidates.py +152 -0
  43. megadetector/classification/json_to_azcopy_list.py +63 -0
  44. megadetector/classification/json_validator.py +699 -0
  45. megadetector/classification/map_classification_categories.py +276 -0
  46. megadetector/classification/merge_classification_detection_output.py +506 -0
  47. megadetector/classification/prepare_classification_script.py +194 -0
  48. megadetector/classification/prepare_classification_script_mc.py +228 -0
  49. megadetector/classification/run_classifier.py +287 -0
  50. megadetector/classification/save_mislabeled.py +110 -0
  51. megadetector/classification/train_classifier.py +827 -0
  52. megadetector/classification/train_classifier_tf.py +725 -0
  53. megadetector/classification/train_utils.py +323 -0
  54. megadetector/data_management/__init__.py +0 -0
  55. megadetector/data_management/annotations/__init__.py +0 -0
  56. megadetector/data_management/annotations/annotation_constants.py +34 -0
  57. megadetector/data_management/camtrap_dp_to_coco.py +237 -0
  58. megadetector/data_management/cct_json_utils.py +404 -0
  59. megadetector/data_management/cct_to_md.py +176 -0
  60. megadetector/data_management/cct_to_wi.py +289 -0
  61. megadetector/data_management/coco_to_labelme.py +283 -0
  62. megadetector/data_management/coco_to_yolo.py +662 -0
  63. megadetector/data_management/databases/__init__.py +0 -0
  64. megadetector/data_management/databases/add_width_and_height_to_db.py +33 -0
  65. megadetector/data_management/databases/combine_coco_camera_traps_files.py +206 -0
  66. megadetector/data_management/databases/integrity_check_json_db.py +493 -0
  67. megadetector/data_management/databases/subset_json_db.py +115 -0
  68. megadetector/data_management/generate_crops_from_cct.py +149 -0
  69. megadetector/data_management/get_image_sizes.py +189 -0
  70. megadetector/data_management/importers/add_nacti_sizes.py +52 -0
  71. megadetector/data_management/importers/add_timestamps_to_icct.py +79 -0
  72. megadetector/data_management/importers/animl_results_to_md_results.py +158 -0
  73. megadetector/data_management/importers/auckland_doc_test_to_json.py +373 -0
  74. megadetector/data_management/importers/auckland_doc_to_json.py +201 -0
  75. megadetector/data_management/importers/awc_to_json.py +191 -0
  76. megadetector/data_management/importers/bellevue_to_json.py +273 -0
  77. megadetector/data_management/importers/cacophony-thermal-importer.py +793 -0
  78. megadetector/data_management/importers/carrizo_shrubfree_2018.py +269 -0
  79. megadetector/data_management/importers/carrizo_trail_cam_2017.py +289 -0
  80. megadetector/data_management/importers/cct_field_adjustments.py +58 -0
  81. megadetector/data_management/importers/channel_islands_to_cct.py +913 -0
  82. megadetector/data_management/importers/eMammal/copy_and_unzip_emammal.py +180 -0
  83. megadetector/data_management/importers/eMammal/eMammal_helpers.py +249 -0
  84. megadetector/data_management/importers/eMammal/make_eMammal_json.py +223 -0
  85. megadetector/data_management/importers/ena24_to_json.py +276 -0
  86. megadetector/data_management/importers/filenames_to_json.py +386 -0
  87. megadetector/data_management/importers/helena_to_cct.py +283 -0
  88. megadetector/data_management/importers/idaho-camera-traps.py +1407 -0
  89. megadetector/data_management/importers/idfg_iwildcam_lila_prep.py +294 -0
  90. megadetector/data_management/importers/jb_csv_to_json.py +150 -0
  91. megadetector/data_management/importers/mcgill_to_json.py +250 -0
  92. megadetector/data_management/importers/missouri_to_json.py +490 -0
  93. megadetector/data_management/importers/nacti_fieldname_adjustments.py +79 -0
  94. megadetector/data_management/importers/noaa_seals_2019.py +181 -0
  95. megadetector/data_management/importers/pc_to_json.py +365 -0
  96. megadetector/data_management/importers/plot_wni_giraffes.py +123 -0
  97. megadetector/data_management/importers/prepare-noaa-fish-data-for-lila.py +359 -0
  98. megadetector/data_management/importers/prepare_zsl_imerit.py +131 -0
  99. megadetector/data_management/importers/rspb_to_json.py +356 -0
  100. megadetector/data_management/importers/save_the_elephants_survey_A.py +320 -0
  101. megadetector/data_management/importers/save_the_elephants_survey_B.py +329 -0
  102. megadetector/data_management/importers/snapshot_safari_importer.py +758 -0
  103. megadetector/data_management/importers/snapshot_safari_importer_reprise.py +665 -0
  104. megadetector/data_management/importers/snapshot_serengeti_lila.py +1067 -0
  105. megadetector/data_management/importers/snapshotserengeti/make_full_SS_json.py +150 -0
  106. megadetector/data_management/importers/snapshotserengeti/make_per_season_SS_json.py +153 -0
  107. megadetector/data_management/importers/sulross_get_exif.py +65 -0
  108. megadetector/data_management/importers/timelapse_csv_set_to_json.py +490 -0
  109. megadetector/data_management/importers/ubc_to_json.py +399 -0
  110. megadetector/data_management/importers/umn_to_json.py +507 -0
  111. megadetector/data_management/importers/wellington_to_json.py +263 -0
  112. megadetector/data_management/importers/wi_to_json.py +442 -0
  113. megadetector/data_management/importers/zamba_results_to_md_results.py +181 -0
  114. megadetector/data_management/labelme_to_coco.py +547 -0
  115. megadetector/data_management/labelme_to_yolo.py +272 -0
  116. megadetector/data_management/lila/__init__.py +0 -0
  117. megadetector/data_management/lila/add_locations_to_island_camera_traps.py +97 -0
  118. megadetector/data_management/lila/add_locations_to_nacti.py +147 -0
  119. megadetector/data_management/lila/create_lila_blank_set.py +558 -0
  120. megadetector/data_management/lila/create_lila_test_set.py +152 -0
  121. megadetector/data_management/lila/create_links_to_md_results_files.py +106 -0
  122. megadetector/data_management/lila/download_lila_subset.py +178 -0
  123. megadetector/data_management/lila/generate_lila_per_image_labels.py +516 -0
  124. megadetector/data_management/lila/get_lila_annotation_counts.py +170 -0
  125. megadetector/data_management/lila/get_lila_image_counts.py +112 -0
  126. megadetector/data_management/lila/lila_common.py +300 -0
  127. megadetector/data_management/lila/test_lila_metadata_urls.py +132 -0
  128. megadetector/data_management/ocr_tools.py +870 -0
  129. megadetector/data_management/read_exif.py +809 -0
  130. megadetector/data_management/remap_coco_categories.py +84 -0
  131. megadetector/data_management/remove_exif.py +66 -0
  132. megadetector/data_management/rename_images.py +187 -0
  133. megadetector/data_management/resize_coco_dataset.py +189 -0
  134. megadetector/data_management/wi_download_csv_to_coco.py +247 -0
  135. megadetector/data_management/yolo_output_to_md_output.py +446 -0
  136. megadetector/data_management/yolo_to_coco.py +676 -0
  137. megadetector/detection/__init__.py +0 -0
  138. megadetector/detection/detector_training/__init__.py +0 -0
  139. megadetector/detection/detector_training/model_main_tf2.py +114 -0
  140. megadetector/detection/process_video.py +846 -0
  141. megadetector/detection/pytorch_detector.py +355 -0
  142. megadetector/detection/run_detector.py +779 -0
  143. megadetector/detection/run_detector_batch.py +1219 -0
  144. megadetector/detection/run_inference_with_yolov5_val.py +1087 -0
  145. megadetector/detection/run_tiled_inference.py +934 -0
  146. megadetector/detection/tf_detector.py +192 -0
  147. megadetector/detection/video_utils.py +698 -0
  148. megadetector/postprocessing/__init__.py +0 -0
  149. megadetector/postprocessing/add_max_conf.py +64 -0
  150. megadetector/postprocessing/categorize_detections_by_size.py +165 -0
  151. megadetector/postprocessing/classification_postprocessing.py +716 -0
  152. megadetector/postprocessing/combine_api_outputs.py +249 -0
  153. megadetector/postprocessing/compare_batch_results.py +966 -0
  154. megadetector/postprocessing/convert_output_format.py +396 -0
  155. megadetector/postprocessing/load_api_results.py +195 -0
  156. megadetector/postprocessing/md_to_coco.py +310 -0
  157. megadetector/postprocessing/md_to_labelme.py +330 -0
  158. megadetector/postprocessing/merge_detections.py +412 -0
  159. megadetector/postprocessing/postprocess_batch_results.py +1908 -0
  160. megadetector/postprocessing/remap_detection_categories.py +170 -0
  161. megadetector/postprocessing/render_detection_confusion_matrix.py +660 -0
  162. megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +211 -0
  163. megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +83 -0
  164. megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +1635 -0
  165. megadetector/postprocessing/separate_detections_into_folders.py +730 -0
  166. megadetector/postprocessing/subset_json_detector_output.py +700 -0
  167. megadetector/postprocessing/top_folders_to_bottom.py +223 -0
  168. megadetector/taxonomy_mapping/__init__.py +0 -0
  169. megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +491 -0
  170. megadetector/taxonomy_mapping/map_new_lila_datasets.py +150 -0
  171. megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +142 -0
  172. megadetector/taxonomy_mapping/preview_lila_taxonomy.py +588 -0
  173. megadetector/taxonomy_mapping/retrieve_sample_image.py +71 -0
  174. megadetector/taxonomy_mapping/simple_image_download.py +219 -0
  175. megadetector/taxonomy_mapping/species_lookup.py +834 -0
  176. megadetector/taxonomy_mapping/taxonomy_csv_checker.py +159 -0
  177. megadetector/taxonomy_mapping/taxonomy_graph.py +346 -0
  178. megadetector/taxonomy_mapping/validate_lila_category_mappings.py +83 -0
  179. megadetector/utils/__init__.py +0 -0
  180. megadetector/utils/azure_utils.py +178 -0
  181. megadetector/utils/ct_utils.py +613 -0
  182. megadetector/utils/directory_listing.py +246 -0
  183. megadetector/utils/md_tests.py +1164 -0
  184. megadetector/utils/path_utils.py +1045 -0
  185. megadetector/utils/process_utils.py +160 -0
  186. megadetector/utils/sas_blob_utils.py +509 -0
  187. megadetector/utils/split_locations_into_train_val.py +228 -0
  188. megadetector/utils/string_utils.py +92 -0
  189. megadetector/utils/url_utils.py +323 -0
  190. megadetector/utils/write_html_image_list.py +225 -0
  191. megadetector/visualization/__init__.py +0 -0
  192. megadetector/visualization/plot_utils.py +293 -0
  193. megadetector/visualization/render_images_with_thumbnails.py +275 -0
  194. megadetector/visualization/visualization_utils.py +1536 -0
  195. megadetector/visualization/visualize_db.py +552 -0
  196. megadetector/visualization/visualize_detector_output.py +405 -0
  197. {megadetector-5.0.11.dist-info → megadetector-5.0.13.dist-info}/LICENSE +0 -0
  198. {megadetector-5.0.11.dist-info → megadetector-5.0.13.dist-info}/METADATA +2 -2
  199. megadetector-5.0.13.dist-info/RECORD +201 -0
  200. megadetector-5.0.13.dist-info/top_level.txt +1 -0
  201. megadetector-5.0.11.dist-info/RECORD +0 -5
  202. megadetector-5.0.11.dist-info/top_level.txt +0 -1
  203. {megadetector-5.0.11.dist-info → megadetector-5.0.13.dist-info}/WHEEL +0 -0
megadetector/api/synchronous/api_core/tests/load_test.py
@@ -0,0 +1,110 @@
+
+ import os
+ import json
+ import io
+ import random
+ import requests
+
+ from PIL import Image
+ from multiprocessing import Pool
+ from datetime import datetime
+ from requests_toolbelt import MultipartEncoder
+ from requests_toolbelt.multipart import decoder
+
+
+ ip_address = '100.100.200.200'
+ port = 5050
+
+ base_url = 'http://{}:{}/v1/camera-trap/sync/'.format(ip_address, port)
+
+
+ def call_api(args):
+     start = datetime.now()
+
+     index, url, params, data, headers = args['index'], args['url'], args['params'], args['data'], args['headers']
+     print('calling api: {} starttime: {}'.format(index, start))
+
+     response = requests.post(url, params=params, data=data, headers=headers)
+     elapsed_time = datetime.now() - start
+     print('\napi {} status code: {}, elapsed time in seconds {}'.format(index, response.status_code, elapsed_time.total_seconds()))
+
+     get_detections(response)
+     return response
+
+ def get_detections(response):
+     results = decoder.MultipartDecoder.from_response(response)
+     text_results = {}
+     images = {}
+     for part in results.parts:
+         # part is a BodyPart object with b'Content-Type' and b'Content-Disposition' headers; the latter includes 'name' and 'filename' info
+         headers = {}
+         for k, v in part.headers.items():
+             headers[k.decode(part.encoding)] = v.decode(part.encoding)
+
+         if headers.get('Content-Type', None) == 'application/json':
+             text_result = json.loads(part.content.decode())
+
+             print(text_result)
+
+
+ def test_load(num_requests, params, max_images=1):
+     requests = []
+
+     # read the images anew for each request
+     index = 0
+     for i in range(num_requests):
+         index += 1
+         files = {}
+         sample_input_dir = '../../../api/synchronous/sample_input/test_images'
+
+         image_files = os.listdir(sample_input_dir)
+         random.shuffle(image_files)
+
+         num_images = 0
+         for i, image_name in enumerate(image_files):
+             if not image_name.lower().endswith('.jpg'):
+                 continue
+
+             if num_images >= max_images:
+                 break
+             else:
+                 num_images += 1
+
+             img_path = os.path.join(sample_input_dir, image_name)
+             with open(img_path, 'rb') as f:
+                 content = f.read()
+             files[image_name] = (image_name, content, 'image/jpeg')
+
+         m = MultipartEncoder(fields=files)
+         args = {
+             'index': index,
+             'url': base_url + 'detect',
+             'params': params,
+             'data': m,
+             'headers': {'Content-Type': m.content_type}
+         }
+         requests.append(args)
+
+     print('starting', num_requests, 'threads...')
+     # images are read and included in each request by the time we call the API in map()
+     with Pool(num_requests) as pool:
+         results = pool.map(call_api, requests)
+
+     return results
+
+
+ if __name__ == "__main__":
+     params = {
+         'min_confidence': 0.05,
+         'min_rendering_confidence': 0.2,
+         'render': True
+     }
+
+     num_requests = 10
+     max_images = 1
+
+     start = datetime.now()
+     responses = test_load(num_requests, params, max_images=max_images)
+     end = datetime.now()
+     total_time = end - start
+     print('Total time for {} requests: {}'.format(num_requests, total_time))
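For reference, a single request against the synchronous API above can be issued without the multiprocessing pool. The sketch below is a minimal, non-authoritative example assuming the same base_url, 'detect' endpoint, parameter names, and multipart response handling used in load_test.py; the image path is hypothetical.

    import json
    import requests
    from requests_toolbelt import MultipartEncoder
    from requests_toolbelt.multipart import decoder

    base_url = 'http://100.100.200.200:5050/v1/camera-trap/sync/'

    # attach one image as a multipart field: (filename, bytes, content type)
    with open('test_images/example.jpg', 'rb') as f:  # hypothetical image path
        fields = {'example.jpg': ('example.jpg', f.read(), 'image/jpeg')}
    m = MultipartEncoder(fields=fields)

    params = {'min_confidence': 0.05, 'min_rendering_confidence': 0.2, 'render': True}
    response = requests.post(base_url + 'detect', params=params, data=m,
                             headers={'Content-Type': m.content_type})

    # the response is multipart; JSON parts carry the detection results
    for part in decoder.MultipartDecoder.from_response(response).parts:
        headers = {k.decode(part.encoding): v.decode(part.encoding)
                   for k, v in part.headers.items()}
        if headers.get('Content-Type') == 'application/json':
            print(json.loads(part.content.decode()))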
File without changes
megadetector/classification/aggregate_classifier_probs.py
@@ -0,0 +1,108 @@
+ """
+
+ aggregate_classifier_probs.py
+
+ Aggregate probabilities from a classifier's outputs according to a mapping
+ from the desired (target) categories to the classifier's categories.
+
+ Using the mapping, create a new version of the classifier output CSV with
+ probabilities summed within each target category. Also output a new
+ "index-to-name" JSON file which identifies the sequential order of the target
+ categories.
+
+ """
+
+ #%% Imports
+
+ from __future__ import annotations
+
+ import argparse
+ import json
+
+ import pandas as pd
+ from tqdm import tqdm
+
+ #%% Example usage
+
+ """
+ python aggregate_classifier_probs.py \
+     classifier_output.csv.gz \
+     --target-mapping target_to_classifier_labels.json \
+     --output-csv classifier_output_remapped.csv.gz \
+     --output-label-index label_index_remapped.json
+ """
+
+ #%% Main function
+
+ def main(classifier_results_csv_path: str,
+          target_mapping_json_path: str,
+          output_csv_path: str,
+          output_label_index_json_path: str) -> None:
+     """
+     Main function.
+
+     Because the output CSV is often very large, we process it in chunks of 1000
+     rows at a time.
+     """
+
+     chunked_df_iterator = pd.read_csv(
+         classifier_results_csv_path, chunksize=1000, float_precision='high',
+         index_col='path')
+
+     with open(target_mapping_json_path, 'r') as f:
+         target_mapping = json.load(f)
+     target_names = sorted(target_mapping.keys())
+
+     all_classifier_labels: set[str] = set()
+     for classifier_labels in target_mapping.values():
+         assert all_classifier_labels.isdisjoint(classifier_labels)
+         all_classifier_labels.update(classifier_labels)
+
+     for i, chunk_df in tqdm(enumerate(chunked_df_iterator)):
+         if i == 0:
+             assert set(chunk_df.columns) == all_classifier_labels
+             header, mode = True, 'w'
+         else:
+             header, mode = False, 'a'
+
+         agg_df = pd.DataFrame(
+             data=0., index=chunk_df.index, columns=target_names)
+         for target in target_names:
+             classifier_labels = target_mapping[target]
+             agg_df[target] = chunk_df[classifier_labels].sum(axis=1)
+
+         agg_df.to_csv(output_csv_path, index=True, header=header, mode=mode)
+
+     with open(output_label_index_json_path, 'w') as f:
+         json.dump(dict(enumerate(target_names)), f, indent=1)
+
+
+ #%% Command-line driver
+
+ def _parse_args() -> argparse.Namespace:
+
+     parser = argparse.ArgumentParser(
+         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+         description='Aggregate classifier probabilities to target classes.')
+     parser.add_argument(
+         'classifier_results_csv',
+         help='path to CSV with classifier probabilities')
+     parser.add_argument(
+         '-t', '--target-mapping', required=True,
+         help='path to JSON file mapping target categories to classifier labels')
+     parser.add_argument(
+         '-o', '--output-csv', required=True,
+         help='path to save output CSV with aggregated probabilities')
+     parser.add_argument(
+         '-i', '--output-label-index', required=True,
+         help='path to save output label index JSON')
+     return parser.parse_args()
+
+
+ if __name__ == '__main__':
+
+     args = _parse_args()
+     main(classifier_results_csv_path=args.classifier_results_csv,
+          target_mapping_json_path=args.target_mapping,
+          output_csv_path=args.output_csv,
+          output_label_index_json_path=args.output_label_index)
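To make the aggregation concrete, the sketch below walks through one chunk of main() with hypothetical category names; the target mapping dict stands in for the file passed as --target-mapping, and the index-to-name dict mirrors what main() writes at the end.

    import json
    import pandas as pd

    # hypothetical target-to-classifier-label mapping (the --target-mapping JSON);
    # classifier labels may not be shared between targets (enforced by the isdisjoint assert)
    target_mapping = {
        'deer': ['white_tailed_deer', 'mule_deer'],
        'bird': ['crow', 'raven'],
    }

    # a tiny stand-in for one chunk of the classifier output CSV, indexed by 'path',
    # with one probability column per classifier label
    chunk_df = pd.DataFrame(
        {'white_tailed_deer': [0.6, 0.1], 'mule_deer': [0.2, 0.1],
         'crow': [0.1, 0.5], 'raven': [0.1, 0.3]},
        index=pd.Index(['img1.jpg', 'img2.jpg'], name='path'))

    # probabilities are summed within each target category, as in main()
    target_names = sorted(target_mapping.keys())
    agg_df = pd.DataFrame(data=0., index=chunk_df.index, columns=target_names)
    for target in target_names:
        agg_df[target] = chunk_df[target_mapping[target]].sum(axis=1)

    print(agg_df)                                      # bird/deer columns; each row still sums to 1.0
    print(json.dumps(dict(enumerate(target_names))))   # index-to-name JSON: {"0": "bird", "1": "deer"}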
megadetector/classification/analyze_failed_images.py
@@ -0,0 +1,227 @@
+ """
+
+ analyze_failed_images.py
+
+ """
+
+ #%% Imports and constants
+
+ import argparse
+ from collections.abc import Mapping, Sequence
+ from concurrent import futures
+ import json
+ from pprint import pprint
+ import threading
+ from typing import Any, Optional
+
+ from PIL import Image, ImageFile
+ import requests
+ from tqdm import tqdm
+
+ from megadetector.data_management.megadb.megadb_utils import MegadbUtils
+ from megadetector.utils import path_utils
+ from megadetector.utils import sas_blob_utils
+
+
+ #%% Example usage
+
+ """
+ python analyze_failed_images.py failed.json \
+     -a ACCOUNT -c CONTAINER -s SAS_TOKEN
+ """
+
+ ImageFile.LOAD_TRUNCATED_IMAGES = False
+
+
+ #%% Support functions
+
+ def check_image_condition(img_path: str,
+                           truncated_images_lock: threading.Lock,
+                           account: Optional[str] = None,
+                           container: Optional[str] = None,
+                           sas_token: Optional[str] = None,
+                           datasets_table: Optional[Mapping[str, Any]] = None
+                           ) -> tuple[str, str]:
+     """
+     Args:
+         img_path: str, either <blob_name> if datasets_table is None, or
+             <dataset>/<blob_name> if datasets_table is given
+         account: str, name of Azure Blob Storage account
+         container: str, name of Azure Blob Storage container
+         sas_token: str, optional SAS token (without leading '?') if the
+             container is not publicly accessible
+         datasets_table: dict, maps dataset name to dict of information
+
+     Returns: (img_file, status) tuple, where status is one of
+         'nonexistent': blob does not exist in the container
+         'non_image': img_file does not have valid file extension
+         'good': image exists and is able to be opened without setting
+             ImageFile.LOAD_TRUNCATED_IMAGES=True
+         'truncated': image exists but can only be opened by setting
+             ImageFile.LOAD_TRUNCATED_IMAGES=True
+         'bad': image exists, but cannot be opened even when setting
+             ImageFile.LOAD_TRUNCATED_IMAGES=True
+     """
+
+     if (account is None) or (container is None) or (datasets_table is not None):
+         assert account is None
+         assert container is None
+         assert sas_token is None
+         assert datasets_table is not None
+
+         dataset, img_file = img_path.split('/', maxsplit=1)
+         account = datasets_table[dataset]['storage_account']
+         container = datasets_table[dataset]['container']
+         sas_token = datasets_table[dataset]['container_sas_key']
+         if sas_token[0] == '?':  # strip leading '?' from SAS token
+             sas_token = sas_token[1:]
+     else:
+         img_file = img_path
+
+     if not path_utils.is_image_file(img_file):
+         return img_file, 'non_image'
+
+     blob_url = sas_blob_utils.build_azure_storage_uri(
+         account=account, container=container, sas_token=sas_token,
+         blob=img_file)
+     blob_exists = sas_blob_utils.check_blob_exists(blob_url)
+     if not blob_exists:
+         return img_file, 'nonexistent'
+
+     stream, _ = sas_blob_utils.download_blob_to_stream(blob_url)
+     stream.seek(0)
+     try:
+         with truncated_images_lock:
+             ImageFile.LOAD_TRUNCATED_IMAGES = False
+             with Image.open(stream) as img:
+                 img.load()
+             return img_file, 'good'
+     except OSError:  # PIL.UnidentifiedImageError is a subclass of OSError
+         try:
+             stream.seek(0)
+             with truncated_images_lock:
+                 ImageFile.LOAD_TRUNCATED_IMAGES = True
+                 with Image.open(stream) as img:
+                     img.load()
+                 return img_file, 'truncated'
+         except Exception as e:  # pylint: disable=broad-except
+             exception_type = type(e).__name__
+             tqdm.write(f'Unable to load {img_file}. {exception_type}: {e}.')
+             return img_file, 'bad'
+
+
+ #%% Main function
+
+ def analyze_images(url_or_path: str, json_keys: Optional[Sequence[str]] = None,
+                    account: Optional[str] = None,
+                    container: Optional[str] = None,
+                    sas_token: Optional[str] = None) -> None:
+     """
+     Args:
+         url_or_path: str, URL or local path to a file containing a list
+             of image paths. Each image path is either <blob_name> if account and
+             container are given, or <dataset>/<blob_name> if account and
+             container are None. File can either be a list of image paths, or a
+             JSON file containing image paths.
+         json_keys: optional list of str, only relevant if url_or_path is a JSON
+             file. If json_keys=None, then the JSON file at url_or_path is
+             assumed to be a JSON list of image paths. If json_keys is not None,
+             then the JSON file should be a dict, whose values corresponding to
+             json_keys are lists of image paths.
+         account: str, name of Azure Blob Storage account
+         container: str, name of Azure Blob Storage container
+         sas_token: str, optional SAS token (without leading '?') if the
+             container is not publicly accessible
+     """
+
+     datasets_table = None
+     if (account is None) or (container is None):
+         assert account is None
+         assert container is None
+         assert sas_token is None
+         datasets_table = MegadbUtils().get_datasets_table()
+
+     is_json = ('.json' in url_or_path)
+     if url_or_path.startswith(('http://', 'https://')):
+         r = requests.get(url_or_path)
+         if is_json:
+             img_paths = r.json()
+         else:
+             img_paths = r.text.splitlines()
+     else:
+         with open(url_or_path, 'r') as f:
+             if is_json:
+                 img_paths = json.load(f)
+             else:
+                 img_paths = f.readlines()
+
+     if is_json and json_keys is not None:
+         img_paths_json = img_paths
+         img_paths = []
+         for k in json_keys:
+             img_paths += img_paths_json[k]
+
+     mapping: dict[str, list[str]] = {
+         status: []
+         for status in ['good', 'nonexistent', 'non_image', 'truncated', 'bad']
+     }
+
+     pool = futures.ThreadPoolExecutor(max_workers=100)
+
+     # lock before changing ImageFile.LOAD_TRUNCATED_IMAGES
+     truncated_images_lock = threading.Lock()
+
+     futures_list = []
+     for img_path in tqdm(img_paths):
+         future = pool.submit(
+             check_image_condition, img_path, truncated_images_lock, account,
+             container, sas_token, datasets_table)
+         futures_list.append(future)
+
+     total = len(futures_list)
+     for future in tqdm(futures.as_completed(futures_list), total=total):
+         img_file, status = future.result()
+         mapping[status].append(img_file)
+
+     for status, img_list in mapping.items():
+         print(f'{status}: {len(img_list)}')
+         pprint(sorted(img_list))
+
+
+ #%% Command-line driver
+
+ def _parse_args() -> argparse.Namespace:
+
+     parser = argparse.ArgumentParser(
+         description='Analyze a list of images that failed to download or crop.')
+     parser.add_argument(
+         'failed_images', metavar='URL_OR_PATH',
+         help='URL or path to text or JSON file containing list of image paths')
+     parser.add_argument(
+         '-k', '--json-keys', nargs='*',
+         help='list of keys in JSON file containing image paths')
+     parser.add_argument(
+         '-a', '--account',
+         help='name of Azure Blob Storage account. If not given, then image '
+              'paths are assumed to start with the dataset name, so we can look '
+              'up the account from MegaDB.')
+     parser.add_argument(
+         '-c', '--container',
+         help='name of Azure Blob Storage container. If not given, then image '
+              'paths are assumed to start with the dataset name, so we can look '
+              'up the container from MegaDB.')
+     parser.add_argument(
+         '-s', '--sas-token',
+         help='optional SAS token (without leading "?") if the container is not '
+              'publicly accessible. If account and container not given, then '
+              'image paths are assumed to start with the dataset name, so we '
+              'can look up the SAS Token from MegaDB.')
+     return parser.parse_args()
+
+
+ if __name__ == '__main__':
+
+     args = _parse_args()
+     analyze_images(url_or_path=args.failed_images, json_keys=args.json_keys,
+                    account=args.account, container=args.container,
+                    sas_token=args.sas_token)
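The good/truncated/bad distinction made by check_image_condition() can be reproduced on a local file without the Azure blob plumbing or the thread lock; a minimal single-threaded sketch, assuming a hypothetical local path:

    from PIL import Image, ImageFile

    def classify_local_image(path):
        # mirror check_image_condition: first a strict load, then retry allowing truncated files
        try:
            ImageFile.LOAD_TRUNCATED_IMAGES = False
            with Image.open(path) as img:
                img.load()
            return 'good'
        except OSError:  # PIL.UnidentifiedImageError is a subclass of OSError
            try:
                ImageFile.LOAD_TRUNCATED_IMAGES = True
                with Image.open(path) as img:
                    img.load()
                return 'truncated'
            except Exception:
                return 'bad'
            finally:
                ImageFile.LOAD_TRUNCATED_IMAGES = False

    print(classify_local_image('example.jpg'))  # hypothetical path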
megadetector/classification/cache_batchapi_outputs.py
@@ -0,0 +1,198 @@
+ """
+
+ cache_batchapi_outputs.py
+
+ Script to cache Batch Detection API outputs.
+
+ This script can handle either the Batch Detection API JSON Response or the
+ detections JSON.
+
+ Batch Detection API Response format:
+
+ {
+     "Status": {
+         "request_status": "completed",
+         "message": {
+             "num_failed_shards": 0,
+             "output_file_urls": {
+                 "detections": "https://url/to/detections.json",
+                 "failed_images": "https://url/to/failed_images.json",
+                 "images": "https://url/to/images.json"
+             }
+         }
+     },
+     "Endpoint": "/v3/camera-trap/detection-batch/request_detections",
+     "TaskId": "ea26326e-7e0d-4524-a9ea-f57a5799d4ba"
+ }
+
+ Detections JSON format:
+
+ {
+     "info": {...},
+     "detection_categories": {...},
+     "classification_categories": {...},
+     "images": [
+         {
+             "file": "path/from/base/dir/image1.jpg",
+             "max_detection_conf": 0.926,
+             "detections": [{
+                 "category": "1",
+                 "conf": 0.061,
+                 "bbox": [0.0451, 0.1849, 0.3642, 0.4636]
+             }]
+         }
+     ]
+ }
+
+ Batch Detection API Output Format:
+
+ github.com/agentmorris/MegaDetector/tree/main/megadetector/api/batch_processing#api-outputs
+
+ """
+
+ #%% Imports
+
+ from __future__ import annotations
+
+ import argparse
+ from collections.abc import Mapping
+ import json
+ import os
+ from typing import Any, Optional
+
+ import requests
+
+ from api.batch_processing.data_preparation.prepare_api_submission import (
+     TaskStatus, Task)
+ from api.batch_processing.postprocessing.combine_api_outputs import (
+     combine_api_output_dictionaries)
+
+
+ #%% Support functions
+
+ def cache_json(json_path: str,
+                is_detections: bool,
+                dataset: str,
+                detector_output_cache_base_dir: str,
+                detector_version: Optional[str]) -> None:
+     """
+     Args:
+         json_path: str, path to JSON file
+         is_detections: bool, True if <json_path> is a detections JSON file,
+             False if <json_path> is an API response JSON file
+         dataset: str
+         detector_output_cache_base_dir: str
+         detector_version: str
+     """
+
+     with open(json_path, 'r') as f:
+         js = json.load(f)
+
+     if is_detections:
+         detections = js
+
+     else:
+         response = js
+
+         # task finished successfully
+         status = TaskStatus(response['Status']['request_status'])
+         assert status == TaskStatus.COMPLETED
+
+         # parse the task ID
+         task_id = response['TaskId']
+
+         message = response['Status']['message']
+         detections_url = message['output_file_urls']['detections']
+         assert detections_url.split('/')[-2] == task_id
+
+         # print info about missing and failed images
+         task = Task(name=task_id, task_id=task_id)
+         task.response = response
+         task.status = status
+         task.get_missing_images(verbose=True)
+
+         # get the detections
+         detections = requests.get(detections_url).json()
+
+     # add detections to the detections cache
+     api_det_version = detections['info']['detector'].rsplit('v', maxsplit=1)[1]
+     if detector_version is not None:
+         assert api_det_version == detector_version
+     detector_output_cache_dir = os.path.join(
+         detector_output_cache_base_dir, f'v{api_det_version}')
+     msg = cache_detections(
+         detections=detections, dataset=dataset,
+         detector_output_cache_dir=detector_output_cache_dir)
+     print(msg)
+
+
+ def cache_detections(detections: Mapping[str, Any], dataset: str,
+                      detector_output_cache_dir: str) -> str:
+     """
+     Args:
+         detections: dict, represents JSON output of detector
+         dataset: str, name of dataset
+         detector_output_cache_dir: str, path to folder where detector outputs
+             are cached, stored as 1 JSON file per dataset, directory must
+             already exist
+
+     Returns: str, message
+     """
+
+     # combine detections with cache
+     dataset_cache_path = os.path.join(
+         detector_output_cache_dir, f'{dataset}.json')
+     merged_dataset_cache: Mapping[str, Any]
+     if os.path.exists(dataset_cache_path):
+         with open(dataset_cache_path, 'r') as f:
+             dataset_cache = json.load(f)
+         merged_dataset_cache = combine_api_output_dictionaries(
+             input_dicts=[dataset_cache, detections], require_uniqueness=False)
+         msg = f'Merging detection output with {dataset_cache_path}'
+     else:
+         merged_dataset_cache = detections
+         msg = ('No cached detection outputs found. Saving detection output to '
+                f'{dataset_cache_path}')
+
+     # write combined detections back out to cache
+     with open(dataset_cache_path, 'w') as f:
+         json.dump(merged_dataset_cache, f, indent=1)
+     return msg
+
+
+ #%% Command-line driver
+
+ def _parse_args() -> argparse.Namespace:
+
+     parser = argparse.ArgumentParser(
+         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+         description='Caches detector outputs.')
+     parser.add_argument(
+         'json_file',
+         help='path to JSON file containing response of Batch Detection API')
+     parser.add_argument(
+         '-f', '--format', choices=['response', 'detections'], required=True,
+         help='(required) whether <json_file> is a Batch API response or a '
+              'detections JSON file')
+     parser.add_argument(
+         '-d', '--dataset', required=True,
+         help='(required) name of dataset corresponding to the API task')
+     parser.add_argument(
+         '-c', '--detector-output-cache-dir', required=True,
+         help='(required) path to directory where detector outputs are cached')
+     parser.add_argument(
+         '-v', '--detector-version',
+         help='detector version string, e.g., "4.1", inferred from detections '
+              'file if not given')
+     return parser.parse_args()
+
+
+ if __name__ == '__main__':
+
+     args = _parse_args()
+     cache_json(
+         json_path=args.json_file,
+         is_detections=(args.format == 'detections'),
+         dataset=args.dataset,
+         detector_output_cache_base_dir=args.detector_output_cache_dir,
+         detector_version=args.detector_version)
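Following the "#%% Example usage" convention used by the other scripts in this release, a hedged invocation of cache_batchapi_outputs.py in 'detections' mode; the file names and cache directory are hypothetical, and the flags are the ones defined in _parse_args() above.

    python cache_batchapi_outputs.py detections.json \
        --format detections \
        --dataset my_dataset \
        --detector-output-cache-dir /path/to/detector_output_cache \
        --detector-version 4.1

With these arguments, cache_detections() merges the new results into (or creates) /path/to/detector_output_cache/v4.1/my_dataset.json, keeping one cached JSON file per dataset.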