megadetector 5.0.9-py3-none-any.whl → 5.0.11-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of megadetector has been flagged as potentially problematic.

Files changed (226)
  1. {megadetector-5.0.9.dist-info → megadetector-5.0.11.dist-info}/LICENSE +0 -0
  2. {megadetector-5.0.9.dist-info → megadetector-5.0.11.dist-info}/METADATA +12 -11
  3. megadetector-5.0.11.dist-info/RECORD +5 -0
  4. megadetector-5.0.11.dist-info/top_level.txt +1 -0
  5. api/__init__.py +0 -0
  6. api/batch_processing/__init__.py +0 -0
  7. api/batch_processing/api_core/__init__.py +0 -0
  8. api/batch_processing/api_core/batch_service/__init__.py +0 -0
  9. api/batch_processing/api_core/batch_service/score.py +0 -439
  10. api/batch_processing/api_core/server.py +0 -294
  11. api/batch_processing/api_core/server_api_config.py +0 -98
  12. api/batch_processing/api_core/server_app_config.py +0 -55
  13. api/batch_processing/api_core/server_batch_job_manager.py +0 -220
  14. api/batch_processing/api_core/server_job_status_table.py +0 -152
  15. api/batch_processing/api_core/server_orchestration.py +0 -360
  16. api/batch_processing/api_core/server_utils.py +0 -92
  17. api/batch_processing/api_core_support/__init__.py +0 -0
  18. api/batch_processing/api_core_support/aggregate_results_manually.py +0 -46
  19. api/batch_processing/api_support/__init__.py +0 -0
  20. api/batch_processing/api_support/summarize_daily_activity.py +0 -152
  21. api/batch_processing/data_preparation/__init__.py +0 -0
  22. api/batch_processing/data_preparation/manage_local_batch.py +0 -2391
  23. api/batch_processing/data_preparation/manage_video_batch.py +0 -327
  24. api/batch_processing/integration/digiKam/setup.py +0 -6
  25. api/batch_processing/integration/digiKam/xmp_integration.py +0 -465
  26. api/batch_processing/integration/eMammal/test_scripts/config_template.py +0 -5
  27. api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +0 -126
  28. api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +0 -55
  29. api/batch_processing/postprocessing/__init__.py +0 -0
  30. api/batch_processing/postprocessing/add_max_conf.py +0 -64
  31. api/batch_processing/postprocessing/categorize_detections_by_size.py +0 -163
  32. api/batch_processing/postprocessing/combine_api_outputs.py +0 -249
  33. api/batch_processing/postprocessing/compare_batch_results.py +0 -958
  34. api/batch_processing/postprocessing/convert_output_format.py +0 -397
  35. api/batch_processing/postprocessing/load_api_results.py +0 -195
  36. api/batch_processing/postprocessing/md_to_coco.py +0 -310
  37. api/batch_processing/postprocessing/md_to_labelme.py +0 -330
  38. api/batch_processing/postprocessing/merge_detections.py +0 -401
  39. api/batch_processing/postprocessing/postprocess_batch_results.py +0 -1904
  40. api/batch_processing/postprocessing/remap_detection_categories.py +0 -170
  41. api/batch_processing/postprocessing/render_detection_confusion_matrix.py +0 -661
  42. api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +0 -211
  43. api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +0 -82
  44. api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +0 -1631
  45. api/batch_processing/postprocessing/separate_detections_into_folders.py +0 -731
  46. api/batch_processing/postprocessing/subset_json_detector_output.py +0 -696
  47. api/batch_processing/postprocessing/top_folders_to_bottom.py +0 -223
  48. api/synchronous/__init__.py +0 -0
  49. api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
  50. api/synchronous/api_core/animal_detection_api/api_backend.py +0 -152
  51. api/synchronous/api_core/animal_detection_api/api_frontend.py +0 -266
  52. api/synchronous/api_core/animal_detection_api/config.py +0 -35
  53. api/synchronous/api_core/animal_detection_api/data_management/annotations/annotation_constants.py +0 -47
  54. api/synchronous/api_core/animal_detection_api/detection/detector_training/copy_checkpoints.py +0 -43
  55. api/synchronous/api_core/animal_detection_api/detection/detector_training/model_main_tf2.py +0 -114
  56. api/synchronous/api_core/animal_detection_api/detection/process_video.py +0 -543
  57. api/synchronous/api_core/animal_detection_api/detection/pytorch_detector.py +0 -304
  58. api/synchronous/api_core/animal_detection_api/detection/run_detector.py +0 -627
  59. api/synchronous/api_core/animal_detection_api/detection/run_detector_batch.py +0 -1029
  60. api/synchronous/api_core/animal_detection_api/detection/run_inference_with_yolov5_val.py +0 -581
  61. api/synchronous/api_core/animal_detection_api/detection/run_tiled_inference.py +0 -754
  62. api/synchronous/api_core/animal_detection_api/detection/tf_detector.py +0 -165
  63. api/synchronous/api_core/animal_detection_api/detection/video_utils.py +0 -495
  64. api/synchronous/api_core/animal_detection_api/md_utils/azure_utils.py +0 -174
  65. api/synchronous/api_core/animal_detection_api/md_utils/ct_utils.py +0 -262
  66. api/synchronous/api_core/animal_detection_api/md_utils/directory_listing.py +0 -251
  67. api/synchronous/api_core/animal_detection_api/md_utils/matlab_porting_tools.py +0 -97
  68. api/synchronous/api_core/animal_detection_api/md_utils/path_utils.py +0 -416
  69. api/synchronous/api_core/animal_detection_api/md_utils/process_utils.py +0 -110
  70. api/synchronous/api_core/animal_detection_api/md_utils/sas_blob_utils.py +0 -509
  71. api/synchronous/api_core/animal_detection_api/md_utils/string_utils.py +0 -59
  72. api/synchronous/api_core/animal_detection_api/md_utils/url_utils.py +0 -144
  73. api/synchronous/api_core/animal_detection_api/md_utils/write_html_image_list.py +0 -226
  74. api/synchronous/api_core/animal_detection_api/md_visualization/visualization_utils.py +0 -841
  75. api/synchronous/api_core/tests/__init__.py +0 -0
  76. api/synchronous/api_core/tests/load_test.py +0 -110
  77. classification/__init__.py +0 -0
  78. classification/aggregate_classifier_probs.py +0 -108
  79. classification/analyze_failed_images.py +0 -227
  80. classification/cache_batchapi_outputs.py +0 -198
  81. classification/create_classification_dataset.py +0 -627
  82. classification/crop_detections.py +0 -516
  83. classification/csv_to_json.py +0 -226
  84. classification/detect_and_crop.py +0 -855
  85. classification/efficientnet/__init__.py +0 -9
  86. classification/efficientnet/model.py +0 -415
  87. classification/efficientnet/utils.py +0 -610
  88. classification/evaluate_model.py +0 -520
  89. classification/identify_mislabeled_candidates.py +0 -152
  90. classification/json_to_azcopy_list.py +0 -63
  91. classification/json_validator.py +0 -695
  92. classification/map_classification_categories.py +0 -276
  93. classification/merge_classification_detection_output.py +0 -506
  94. classification/prepare_classification_script.py +0 -194
  95. classification/prepare_classification_script_mc.py +0 -228
  96. classification/run_classifier.py +0 -286
  97. classification/save_mislabeled.py +0 -110
  98. classification/train_classifier.py +0 -825
  99. classification/train_classifier_tf.py +0 -724
  100. classification/train_utils.py +0 -322
  101. data_management/__init__.py +0 -0
  102. data_management/annotations/__init__.py +0 -0
  103. data_management/annotations/annotation_constants.py +0 -34
  104. data_management/camtrap_dp_to_coco.py +0 -238
  105. data_management/cct_json_utils.py +0 -395
  106. data_management/cct_to_md.py +0 -176
  107. data_management/cct_to_wi.py +0 -289
  108. data_management/coco_to_labelme.py +0 -272
  109. data_management/coco_to_yolo.py +0 -662
  110. data_management/databases/__init__.py +0 -0
  111. data_management/databases/add_width_and_height_to_db.py +0 -33
  112. data_management/databases/combine_coco_camera_traps_files.py +0 -206
  113. data_management/databases/integrity_check_json_db.py +0 -477
  114. data_management/databases/subset_json_db.py +0 -115
  115. data_management/generate_crops_from_cct.py +0 -149
  116. data_management/get_image_sizes.py +0 -188
  117. data_management/importers/add_nacti_sizes.py +0 -52
  118. data_management/importers/add_timestamps_to_icct.py +0 -79
  119. data_management/importers/animl_results_to_md_results.py +0 -158
  120. data_management/importers/auckland_doc_test_to_json.py +0 -372
  121. data_management/importers/auckland_doc_to_json.py +0 -200
  122. data_management/importers/awc_to_json.py +0 -189
  123. data_management/importers/bellevue_to_json.py +0 -273
  124. data_management/importers/cacophony-thermal-importer.py +0 -796
  125. data_management/importers/carrizo_shrubfree_2018.py +0 -268
  126. data_management/importers/carrizo_trail_cam_2017.py +0 -287
  127. data_management/importers/cct_field_adjustments.py +0 -57
  128. data_management/importers/channel_islands_to_cct.py +0 -913
  129. data_management/importers/eMammal/copy_and_unzip_emammal.py +0 -180
  130. data_management/importers/eMammal/eMammal_helpers.py +0 -249
  131. data_management/importers/eMammal/make_eMammal_json.py +0 -223
  132. data_management/importers/ena24_to_json.py +0 -275
  133. data_management/importers/filenames_to_json.py +0 -385
  134. data_management/importers/helena_to_cct.py +0 -282
  135. data_management/importers/idaho-camera-traps.py +0 -1407
  136. data_management/importers/idfg_iwildcam_lila_prep.py +0 -294
  137. data_management/importers/jb_csv_to_json.py +0 -150
  138. data_management/importers/mcgill_to_json.py +0 -250
  139. data_management/importers/missouri_to_json.py +0 -489
  140. data_management/importers/nacti_fieldname_adjustments.py +0 -79
  141. data_management/importers/noaa_seals_2019.py +0 -181
  142. data_management/importers/pc_to_json.py +0 -365
  143. data_management/importers/plot_wni_giraffes.py +0 -123
  144. data_management/importers/prepare-noaa-fish-data-for-lila.py +0 -359
  145. data_management/importers/prepare_zsl_imerit.py +0 -131
  146. data_management/importers/rspb_to_json.py +0 -356
  147. data_management/importers/save_the_elephants_survey_A.py +0 -320
  148. data_management/importers/save_the_elephants_survey_B.py +0 -332
  149. data_management/importers/snapshot_safari_importer.py +0 -758
  150. data_management/importers/snapshot_safari_importer_reprise.py +0 -665
  151. data_management/importers/snapshot_serengeti_lila.py +0 -1067
  152. data_management/importers/snapshotserengeti/make_full_SS_json.py +0 -150
  153. data_management/importers/snapshotserengeti/make_per_season_SS_json.py +0 -153
  154. data_management/importers/sulross_get_exif.py +0 -65
  155. data_management/importers/timelapse_csv_set_to_json.py +0 -490
  156. data_management/importers/ubc_to_json.py +0 -399
  157. data_management/importers/umn_to_json.py +0 -507
  158. data_management/importers/wellington_to_json.py +0 -263
  159. data_management/importers/wi_to_json.py +0 -441
  160. data_management/importers/zamba_results_to_md_results.py +0 -181
  161. data_management/labelme_to_coco.py +0 -548
  162. data_management/labelme_to_yolo.py +0 -272
  163. data_management/lila/__init__.py +0 -0
  164. data_management/lila/add_locations_to_island_camera_traps.py +0 -97
  165. data_management/lila/add_locations_to_nacti.py +0 -147
  166. data_management/lila/create_lila_blank_set.py +0 -557
  167. data_management/lila/create_lila_test_set.py +0 -151
  168. data_management/lila/create_links_to_md_results_files.py +0 -106
  169. data_management/lila/download_lila_subset.py +0 -177
  170. data_management/lila/generate_lila_per_image_labels.py +0 -515
  171. data_management/lila/get_lila_annotation_counts.py +0 -170
  172. data_management/lila/get_lila_image_counts.py +0 -111
  173. data_management/lila/lila_common.py +0 -300
  174. data_management/lila/test_lila_metadata_urls.py +0 -132
  175. data_management/ocr_tools.py +0 -874
  176. data_management/read_exif.py +0 -681
  177. data_management/remap_coco_categories.py +0 -84
  178. data_management/remove_exif.py +0 -66
  179. data_management/resize_coco_dataset.py +0 -189
  180. data_management/wi_download_csv_to_coco.py +0 -246
  181. data_management/yolo_output_to_md_output.py +0 -441
  182. data_management/yolo_to_coco.py +0 -676
  183. detection/__init__.py +0 -0
  184. detection/detector_training/__init__.py +0 -0
  185. detection/detector_training/model_main_tf2.py +0 -114
  186. detection/process_video.py +0 -703
  187. detection/pytorch_detector.py +0 -337
  188. detection/run_detector.py +0 -779
  189. detection/run_detector_batch.py +0 -1219
  190. detection/run_inference_with_yolov5_val.py +0 -917
  191. detection/run_tiled_inference.py +0 -935
  192. detection/tf_detector.py +0 -188
  193. detection/video_utils.py +0 -606
  194. docs/source/conf.py +0 -43
  195. md_utils/__init__.py +0 -0
  196. md_utils/azure_utils.py +0 -174
  197. md_utils/ct_utils.py +0 -612
  198. md_utils/directory_listing.py +0 -246
  199. md_utils/md_tests.py +0 -968
  200. md_utils/path_utils.py +0 -1044
  201. md_utils/process_utils.py +0 -157
  202. md_utils/sas_blob_utils.py +0 -509
  203. md_utils/split_locations_into_train_val.py +0 -228
  204. md_utils/string_utils.py +0 -92
  205. md_utils/url_utils.py +0 -323
  206. md_utils/write_html_image_list.py +0 -225
  207. md_visualization/__init__.py +0 -0
  208. md_visualization/plot_utils.py +0 -293
  209. md_visualization/render_images_with_thumbnails.py +0 -275
  210. md_visualization/visualization_utils.py +0 -1537
  211. md_visualization/visualize_db.py +0 -551
  212. md_visualization/visualize_detector_output.py +0 -406
  213. megadetector-5.0.9.dist-info/RECORD +0 -224
  214. megadetector-5.0.9.dist-info/top_level.txt +0 -8
  215. taxonomy_mapping/__init__.py +0 -0
  216. taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +0 -491
  217. taxonomy_mapping/map_new_lila_datasets.py +0 -154
  218. taxonomy_mapping/prepare_lila_taxonomy_release.py +0 -142
  219. taxonomy_mapping/preview_lila_taxonomy.py +0 -591
  220. taxonomy_mapping/retrieve_sample_image.py +0 -71
  221. taxonomy_mapping/simple_image_download.py +0 -218
  222. taxonomy_mapping/species_lookup.py +0 -834
  223. taxonomy_mapping/taxonomy_csv_checker.py +0 -159
  224. taxonomy_mapping/taxonomy_graph.py +0 -346
  225. taxonomy_mapping/validate_lila_category_mappings.py +0 -83
  226. {megadetector-5.0.9.dist-info → megadetector-5.0.11.dist-info}/WHEEL +0 -0
@@ -1,627 +0,0 @@
- """
-
- create_classification_dataset.py
-
- Creates a classification dataset CSV with a corresponding JSON file determining
- the train/val/test split.
-
- This script takes as input a "queried images" JSON file whose keys are paths to
- images and values are dictionaries containing information relevant for training
- a classifier, including labels and (optionally) ground-truth bounding boxes.
- The image paths are in the format `<dataset-name>/<blob-name>` where we assume
- that the dataset name does not contain '/'.
-
- {
-     "caltech/cct_images/59f79901-23d2-11e8-a6a3-ec086b02610b.jpg": {
-         "dataset": "caltech",
-         "location": 13,
-         "class": "mountain_lion",  # class from dataset
-         "bbox": [{"category": "animal",
-                   "bbox": [0, 0.347, 0.237, 0.257]}],  # ground-truth bbox
-         "label": ["mountain_lion"]  # labels to use in classifier
-     },
-     "caltech/cct_images/59f5fe2b-23d2-11e8-a6a3-ec086b02610b.jpg": {
-         "dataset": "caltech",
-         "location": 13,
-         "class": "mountain_lion",  # class from dataset
-         "label": ["mountain_lion"]  # labels to use in classifier
-     },
-     ...
- }
-
- We assume that the tuple (dataset, location) identifies a unique location. In
- other words, we assume that no two datasets have overlapping locations. This
- probably isn't 100% true, but it's pretty much the best we can do in terms of
- avoiding overlapping locations between the train/val/test splits.
-
- This script outputs 3 files to <output_dir>:
-
- 1) classification_ds.csv, contains columns:
-
-    - 'path': str, path to cropped images
-    - 'dataset': str, name of dataset
-    - 'location': str, location that image was taken, as saved in MegaDB
-    - 'dataset_class': str, original class assigned to image, as saved in MegaDB
-    - 'confidence': float, confidence that this crop is of an actual animal,
-      1.0 if the crop is a "ground truth bounding box" (i.e., from MegaDB),
-      <= 1.0 if the bounding box was detected by MegaDetector
-    - 'label': str, comma-separated list of label(s) assigned to this crop for
-      the sake of classification
-
- 2) label_index.json: maps integer to label name
-
-    - keys are string representations of Python integers (JSON requires keys to
-      be strings), numbered from 0 to num_labels-1
-    - values are strings, label names
-
- 3) splits.json: serialization of a Python dict that maps each split
-    ['train', 'val', 'test'] to a list of length-2 lists, where each inner list
-    is [<dataset>, <location>]
-
- """
-
- #%% Imports and constants
-
- from __future__ import annotations
-
- import argparse
- from collections.abc import Container, MutableMapping
- import json
- import os
- from typing import Optional
-
- import numpy as np
- import pandas as pd
- from tqdm import tqdm
-
- from classification import detect_and_crop
-
-
- #%% Example usage
-
- """
- python create_classification_dataset.py \
-     run_idfg2 \
-     --queried-images-json run_idfg2/queried_images.json \
-     --cropped-images-dir /ssd/crops_sq \
-     -d $HOME/classifier-training/mdcache -v "4.1" -t 0.8
- """
-
-
- DATASET_FILENAME = 'classification_ds.csv'
- LABEL_INDEX_FILENAME = 'label_index.json'
- SPLITS_FILENAME = 'splits.json'
-
-
- #%% Main function
-
- def main(output_dir: str,
-          mode: list[str],
-          match_test: Optional[list[str]],
-          queried_images_json_path: Optional[str],
-          cropped_images_dir: Optional[str],
-          detector_version: Optional[str],
-          detector_output_cache_base_dir: Optional[str],
-          confidence_threshold: Optional[float],
-          min_locs: Optional[int],
-          val_frac: Optional[float],
-          test_frac: Optional[float],
-          splits_method: Optional[str],
-          label_spec_json_path: Optional[str]) -> None:
-
-     # input validation
-     assert set(mode) <= {'csv', 'splits'}
-     if label_spec_json_path is not None:
-         assert splits_method == 'smallest_first'
-
-     test_set_locs = None  # set of (dataset, location) tuples
-     test_set_df = None
-     if match_test is not None:
-         match_test_csv_path, match_test_splits_path = match_test
-         match_df = pd.read_csv(match_test_csv_path, index_col=False,
-                                float_precision='high')
-         with open(match_test_splits_path, 'r') as f:
-             match_splits = json.load(f)
-         test_set_locs = set((loc[0], loc[1]) for loc in match_splits['test'])
-         ds_locs = pd.Series(zip(match_df['dataset'], match_df['location']))
-         test_set_df = match_df[ds_locs.isin(test_set_locs)]
-
-     dataset_path = os.path.join(output_dir, DATASET_FILENAME)
-
-     if 'csv' in mode:
-         assert queried_images_json_path is not None
-         assert cropped_images_dir is not None
-         assert detector_version is not None
-         assert detector_output_cache_base_dir is not None
-         assert confidence_threshold is not None
-
-         if not os.path.exists(output_dir):
-             os.makedirs(output_dir)
-             print(f'Created {output_dir}')
-
-         df, log = create_classification_csv(
-             queried_images_json_path=queried_images_json_path,
-             detector_output_cache_base_dir=detector_output_cache_base_dir,
-             detector_version=detector_version,
-             cropped_images_dir=cropped_images_dir,
-             confidence_threshold=confidence_threshold,
-             min_locs=min_locs,
-             append_df=test_set_df,
-             exclude_locs=test_set_locs)
-         print('Saving classification dataset CSV')
-         df.to_csv(dataset_path, index=False)
-         for msg, img_list in log.items():
-             print(f'{msg}:', len(img_list))
-
-         # create label index JSON
-         labels = df['label']
-         if any(labels.str.contains(',')):
-             print('multi-label!')
-             labels = labels.map(lambda x: x.split(',')).explode()
-             # look into sklearn.preprocessing.MultiLabelBinarizer
-         label_names = sorted(labels.unique())
-         with open(os.path.join(output_dir, LABEL_INDEX_FILENAME), 'w') as f:
-             # Note: JSON always saves keys as strings!
-             json.dump(dict(enumerate(label_names)), f, indent=1)
-
-     if 'splits' in mode:
-         assert splits_method is not None
-         assert val_frac is not None
-         assert (match_test is None) != (test_frac is None)
-         if test_frac is None:
-             test_frac = 0
-
-         print(f'Creating splits using "{splits_method}" method...')
-         df = pd.read_csv(dataset_path, index_col=False, float_precision='high')
-
-         if splits_method == 'random':
-             split_to_locs = create_splits_random(
-                 df, val_frac, test_frac, test_split=test_set_locs)
-         else:
-             split_to_locs = create_splits_smallest_label_first(
-                 df, val_frac, test_frac, test_split=test_set_locs,
-                 label_spec_json_path=label_spec_json_path)
-         with open(os.path.join(output_dir, SPLITS_FILENAME), 'w') as f:
-             json.dump(split_to_locs, f, indent=1)
-
-
- #%% Support functions
-
- def create_classification_csv(
-         queried_images_json_path: str,
-         detector_output_cache_base_dir: str,
-         detector_version: str,
-         cropped_images_dir: str,
-         confidence_threshold: float,
-         min_locs: Optional[int] = None,
-         append_df: Optional[pd.DataFrame] = None,
-         exclude_locs: Optional[Container[tuple[str, str]]] = None
-         ) -> tuple[pd.DataFrame, dict[str, list]]:
-     """
-     Creates a classification dataset.
-
-     The classification dataset is a pd.DataFrame with columns:
-     - path: str, <dataset>/<crop-filename>
-     - dataset: str, name of camera trap dataset
-     - location: str, location of image, provided by the camera trap dataset
-     - dataset_class: image class, as provided by the camera trap dataset
-     - confidence: float, confidence of bounding box, 1 if ground truth
-     - label: str, comma-separated list of classification labels
-
-     Args:
-         queried_images_json_path: str, path to output of json_validator.py
-         detector_version: str, detector version string, e.g., '4.1',
-             see {batch_detection_api_url}/supported_model_versions,
-             determines the subfolder of detector_output_cache_base_dir in
-             which to find and save detector outputs
-         detector_output_cache_base_dir: str, path to local directory
-             where detector outputs are cached, 1 JSON file per dataset
-         cropped_images_dir: str, path to local directory for saving crops of
-             bounding boxes
-         confidence_threshold: float, only crop bounding boxes above this value
-         min_locs: optional int, minimum # of locations that each label must
-             have in order to be included
-         append_df: optional pd.DataFrame, existing DataFrame that is appended to
-             the classification CSV
-         exclude_locs: optional set of (dataset, location) tuples, crops from
-             these locations are excluded (does not affect append_df)
-
-     Returns:
-         df: pd.DataFrame, the classification dataset
-         log: dict, with the following keys
-             'images missing detections': list of str, images without ground
-                 truth bboxes and not in detection cache
-             'images without confident detections': list of str, images in
-                 detection cache with no bboxes above the confidence threshold
-             'missing crops': list of tuple (img_path, i), where i is the
-                 i-th crop index
-     """
-
-     assert 0 <= confidence_threshold <= 1
-
-     columns = [
-         'path', 'dataset', 'location', 'dataset_class', 'confidence', 'label']
-     if append_df is not None:
-         assert (append_df.columns == columns).all()
-
-     with open(queried_images_json_path, 'r') as f:
-         js = json.load(f)
-
-     print('loading detection cache...', end='')
-     detector_output_cache_dir = os.path.join(
-         detector_output_cache_base_dir, f'v{detector_version}')
-     datasets = set(img_path[:img_path.find('/')] for img_path in js)
-     detection_cache, cat_id_to_name = detect_and_crop.load_detection_cache(
-         detector_output_cache_dir=detector_output_cache_dir, datasets=datasets)
-     print('done!')
-
-     missing_detections = []  # no cached detections or ground truth bboxes
-     images_no_confident_detections = []  # cached detections contain 0 bboxes
-     images_missing_crop = []  # tuples: (img_path, crop_index)
-     all_rows = []
-
-     # True for ground truth, False for MegaDetector
-     # always save as .jpg for consistency
-     crop_path_template = {
-         True: '{img_path}___crop{n:>02d}.jpg',
-         False: '{img_path}___crop{n:>02d}_' + f'mdv{detector_version}.jpg'
-     }
-
-     for img_path, img_info in tqdm(js.items()):
-         ds, img_file = img_path.split('/', maxsplit=1)
-
-         # get bounding boxes
-         if 'bbox' in img_info:  # ground-truth bounding boxes
-             bbox_dicts = img_info['bbox']
-             is_ground_truth = True
-         else:  # get bounding boxes from detector cache
-             if img_file in detection_cache[ds]:
-                 bbox_dicts = detection_cache[ds][img_file]['detections']
-                 # convert from category ID to category name
-                 for d in bbox_dicts:
-                     d['category'] = cat_id_to_name[d['category']]
-             else:
-                 missing_detections.append(img_path)
-                 continue
-             is_ground_truth = False
-
-         # check if crops are already downloaded, and ignore bboxes below the
-         # confidence threshold
-         rows = []
-         for i, bbox_dict in enumerate(bbox_dicts):
-             conf = 1.0 if is_ground_truth else bbox_dict['conf']
-             if conf < confidence_threshold:
-                 continue
-             if bbox_dict['category'] != 'animal':
-                 tqdm.write(f'Bbox {i} of {img_path} is non-animal. Skipping.')
-                 continue
-             crop_path = crop_path_template[is_ground_truth].format(
-                 img_path=img_path, n=i)
-             full_crop_path = os.path.join(cropped_images_dir, crop_path)
-             if not os.path.exists(full_crop_path):
-                 images_missing_crop.append((img_path, i))
-             else:
-                 # assign all images without location info to 'unknown_location'
-                 img_loc = img_info.get('location', 'unknown_location')
-                 row = [crop_path, ds, img_loc, img_info['class'],
-                        conf, ','.join(img_info['label'])]
-                 rows.append(row)
-         if len(rows) == 0:
-             images_no_confident_detections.append(img_path)
-             continue
-         all_rows += rows
-
-     df = pd.DataFrame(data=all_rows, columns=columns)
-
-     # remove images from labels that have fewer than min_locs locations
-     if min_locs is not None:
-         nlocs_per_label = df.groupby('label').apply(
-             lambda xdf: len(xdf[['dataset', 'location']].drop_duplicates()))
-         valid_labels_mask = (nlocs_per_label >= min_locs)
-         valid_labels = nlocs_per_label.index[valid_labels_mask]
-         invalid_labels = nlocs_per_label.index[~valid_labels_mask]
-         orig = len(df)
-         df = df[df['label'].isin(valid_labels)]
-         print(f'Excluding {orig - len(df)} crops from {len(invalid_labels)} '
-               'labels:', invalid_labels.tolist())
-
-     if exclude_locs is not None:
-         mask = ~pd.Series(zip(df['dataset'], df['location'])).isin(exclude_locs)
-         print(f'Excluding {(~mask).sum()} crops from CSV')
-         df = df[mask]
-     if append_df is not None:
-         print(f'Appending {len(append_df)} rows to CSV')
-         df = pd.concat([df, append_df])
-
-     log = {
-         'images missing detections': missing_detections,
-         'images without confident detections': images_no_confident_detections,
-         'missing crops': images_missing_crop
-     }
-     return df, log
-
-
- def create_splits_random(df: pd.DataFrame, val_frac: float,
-                          test_frac: float = 0.,
-                          test_split: Optional[set[tuple[str, str]]] = None,
-                          ) -> dict[str, list[tuple[str, str]]]:
-     """
-     Args:
-         df: pd.DataFrame, contains columns ['dataset', 'location', 'label']
-             each row is a single image
-             assumes each image is assigned exactly 1 label
-         val_frac: float, desired fraction of dataset to use for val set
-         test_frac: float, desired fraction of dataset to use for test set,
-             must be 0 if test_split is given
-         test_split: optional set of (dataset, location) tuples to use as test
-             split
-
-     Returns: dict, keys are ['train', 'val', 'test'], values are lists of locs,
-         where each loc is a tuple (dataset, location)
-     """
-
-     if test_split is not None:
-         assert test_frac == 0
-     train_frac = 1. - val_frac - test_frac
-     targets = {'train': train_frac, 'val': val_frac, 'test': test_frac}
-
-     # merge dataset and location into a single string '<dataset>/<location>'
-     df['dataset_location'] = df['dataset'] + '/' + df['location']
-
-     # create DataFrame of counts. rows = locations, columns = labels
-     loc_label_counts = (df.groupby(['label', 'dataset_location']).size()
-                         .unstack('label', fill_value=0))
-     num_locs = len(loc_label_counts)
-
-     # label_count: label => number of examples
-     # loc_count: label => number of locs containing that label
-     label_count = loc_label_counts.sum()
-     loc_count = (loc_label_counts > 0).sum()
-
-     best_score = np.inf  # lower is better
-     best_splits = None
-     for _ in tqdm(range(10_000)):
-
-         # generate a new split
-         num_train = int(num_locs * (train_frac + np.random.uniform(-.03, .03)))
-         if test_frac > 0:
-             num_val = int(num_locs * (val_frac + np.random.uniform(-.03, .03)))
-         else:
-             num_val = num_locs - num_train
-         permuted_locs = loc_label_counts.index[np.random.permutation(num_locs)]
-         split_to_locs = {'train': permuted_locs[:num_train],
-                          'val': permuted_locs[num_train:num_train + num_val]}
-         if test_frac > 0:
-             split_to_locs['test'] = permuted_locs[num_train + num_val:]
-
-         # score the split
-         score = 0.
-         for split, locs in split_to_locs.items():
-             split_df = loc_label_counts.loc[locs]
-             target = targets[split]
-
-             # SSE for # of images per label (with 2x weight)
-             crop_frac = split_df.sum() / label_count
-             score += 2 * ((crop_frac - target) ** 2).sum()
-
-             # SSE for # of locs per label
-             loc_frac = (split_df > 0).sum() / loc_count
-             score += ((loc_frac - target) ** 2).sum()
-
-         if score < best_score:
-             tqdm.write(f'New lowest score: {score}')
-             best_score = score
-             best_splits = split_to_locs
-
-     assert best_splits is not None
-     split_to_locs = {
-         s: sorted(locs.map(lambda x: tuple(x.split('/', maxsplit=1))))
-         for s, locs in best_splits.items()
-     }
-     if test_split is not None:
-         split_to_locs['test'] = sorted(test_split)
-     return split_to_locs
-
-
- def create_splits_smallest_label_first(
-         df: pd.DataFrame,
-         val_frac: float,
-         test_frac: float = 0.,
-         label_spec_json_path: Optional[str] = None,
-         test_split: Optional[set[tuple[str, str]]] = None,
-         ) -> dict[str, list[tuple[str, str]]]:
-     """
-     Args:
-         df: pd.DataFrame, contains columns ['dataset', 'location', 'label']
-             each row is a single image
-             assumes each image is assigned exactly 1 label
-         val_frac: float, desired fraction of dataset to use for val set
-         test_frac: float, desired fraction of dataset to use for test set,
-             must be 0 if test_split is given
-         label_spec_json_path: optional str, path to label spec JSON
-         test_split: optional set of (dataset, location) tuples to use as test
-             split
-
-     Returns: dict, keys are ['train', 'val', 'test'], values are lists of locs,
-         where each loc is a tuple (dataset, location)
-     """
-
-     # label => list of datasets to prioritize for test and validation sets
-     prioritize = {}
-     if label_spec_json_path is not None:
-         with open(label_spec_json_path, 'r') as f:
-             label_spec_js = json.load(f)
-         for label, label_spec in label_spec_js.items():
-             if 'prioritize' in label_spec:
-                 datasets = []
-                 for level in label_spec['prioritize']:
-                     datasets += level
-                 prioritize[label] = datasets
-
-     # merge dataset and location into a tuple (dataset, location)
-     df['dataset_location'] = list(zip(df['dataset'], df['location']))
-     loc_to_label_sizes = df.groupby(['dataset_location', 'label']).size()
-
-     seen_locs = set()
-     split_to_locs: dict[str, list[tuple[str, str]]] = dict(
-         train=[], val=[], test=[])
-     label_sizes_by_split = {
-         label: dict(train=0, val=0, test=0)
-         for label in df['label'].unique()
-     }
-     if test_split is not None:
-         assert test_frac == 0
-         split_to_locs['test'] = list(test_split)
-         seen_locs.update(test_split)
-
-     def add_loc_to_split(loc: tuple[str, str], split: str) -> None:
-         split_to_locs[split].append(loc)
-         for label, label_size in loc_to_label_sizes[loc].items():
-             label_sizes_by_split[label][split] += label_size
-
-     # sorted smallest to largest
-     ordered_labels = df.groupby('label').size().sort_values()
-     for label, label_size in tqdm(ordered_labels.items()):
-
-         split_sizes = label_sizes_by_split[label]
-         test_thresh = test_frac * label_size
-         val_thresh = val_frac * label_size
-
-         mask = df['label'] == label
-         ordered_locs = sort_locs_by_size(
-             loc_to_size=df[mask].groupby('dataset_location').size().to_dict(),
-             prioritize=prioritize.get(label, None))
-         ordered_locs = [loc for loc in ordered_locs if loc not in seen_locs]
-
-         for loc in ordered_locs:
-             seen_locs.add(loc)
-             # greedily add to the test set until it holds test_frac of this label's images
-             if split_sizes['test'] < test_thresh:
-                 split = 'test'
-             elif split_sizes['val'] < val_thresh:
-                 split = 'val'
-             else:
-                 split = 'train'
-             add_loc_to_split(loc, split)
-         seen_locs.update(ordered_locs)
-
-     # sort the resulting locs
-     split_to_locs = {s: sorted(locs) for s, locs in split_to_locs.items()}
-     return split_to_locs
-
-
- def sort_locs_by_size(loc_to_size: MutableMapping[tuple[str, str], int],
-                       prioritize: Optional[Container[str]] = None
-                       ) -> list[tuple[str, str]]:
-     """
-     Sorts locations by size, optionally prioritizing locations from certain
-     datasets first.
-
-     Args:
-         loc_to_size: dict, maps each (dataset, location) tuple to its size,
-             modified in-place
-         prioritize: optional list of str, datasets to prioritize
-
-     Returns: list of (dataset, location) tuples, ordered from smallest size to
-         largest. Locations from prioritized datasets come first.
-     """
-
-     result = []
-     if prioritize is not None:
-         # modify loc_to_size in place, so copy its keys before iterating
-         prioritized_loc_to_size = {
-             loc: loc_to_size.pop(loc) for loc in list(loc_to_size.keys())
-             if loc[0] in prioritize
-         }
-         result = sort_locs_by_size(prioritized_loc_to_size)
-
-     result += sorted(loc_to_size, key=loc_to_size.__getitem__)
-     return result
-
-
- #%% Command-line driver
-
- def _parse_args() -> argparse.Namespace:
-
-     parser = argparse.ArgumentParser(
-         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-         description='Creates classification dataset.')
-
-     # arguments relevant to both creating the dataset CSV and splits.json
-     parser.add_argument(
-         'output_dir',
-         help='path to directory where the 3 output files should be '
-              'saved: 1) dataset CSV, 2) label index JSON, 3) splits JSON')
-     parser.add_argument(
-         '--mode', nargs='+', choices=['csv', 'splits'],
-         default=['csv', 'splits'],
-         help='whether to generate only a CSV, only a splits.json file (based '
-              'on an existing classification_ds.csv), or both')
-     parser.add_argument(
-         '--match-test', nargs=2, metavar=('CLASSIFICATION_CSV', 'SPLITS_JSON'),
-         help='path to an existing classification CSV and path to an existing '
-              'splits JSON file from which to match test set')
-
-     # arguments only relevant for creating the dataset CSV
-     csv_group = parser.add_argument_group(
-         'arguments for creating classification CSV')
-     csv_group.add_argument(
-         '-q', '--queried-images-json',
-         help='path to JSON file containing image paths and classification info')
-     csv_group.add_argument(
-         '-c', '--cropped-images-dir',
-         help='path to local directory for saving crops of bounding boxes')
-     csv_group.add_argument(
-         '-d', '--detector-output-cache-dir',
-         help='(required) path to directory where detector outputs are cached')
-     csv_group.add_argument(
-         '-v', '--detector-version',
-         help='(required) detector version string, e.g., "4.1"')
-     csv_group.add_argument(
-         '-t', '--threshold', type=float, default=0.8,
-         help='confidence threshold above which to crop bounding boxes')
-     csv_group.add_argument(
-         '--min-locs', type=int,
-         help='minimum number of locations that each label must have in order '
-              'to be included (does not apply to match-test-splits)')
-
-     # arguments only relevant for creating the splits JSON
-     splits_group = parser.add_argument_group(
-         'arguments for creating train/val/test splits')
-     splits_group.add_argument(
-         '--val-frac', type=float,
-         help='(required) fraction of data to use for validation split')
-     splits_group.add_argument(
-         '--test-frac', type=float,
-         help='fraction of data to use for test split, must be provided if '
-              '--match-test is not given')
-     splits_group.add_argument(
-         '--method', choices=['random', 'smallest_first'], default='random',
-         help='"random": randomly tries up to 10,000 different train/val/test '
-              'splits and chooses the one that best meets the scoring criteria, '
-              'does not support --label-spec. '
-              '"smallest_first": greedily divides locations into splits '
-              'starting with the smallest class first. Supports --label-spec.')
-     splits_group.add_argument(
-         '--label-spec',
-         help='optional path to label specification JSON file, if specifying '
-              'dataset priority. Requires --method=smallest_first.')
-     return parser.parse_args()
-
-
- if __name__ == '__main__':
-
-     args = _parse_args()
-     main(output_dir=args.output_dir,
-          mode=args.mode,
-          match_test=args.match_test,
-          queried_images_json_path=args.queried_images_json,
-          cropped_images_dir=args.cropped_images_dir,
-          detector_version=args.detector_version,
-          detector_output_cache_base_dir=args.detector_output_cache_dir,
-          confidence_threshold=args.threshold,
-          min_locs=args.min_locs,
-          val_frac=args.val_frac,
-          test_frac=args.test_frac,
-          splits_method=args.method,
-          label_spec_json_path=args.label_spec)
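
Besides the CSV-plus-splits invocation shown in the module's example-usage block, the command-line driver above supports regenerating only splits.json against an existing dataset CSV. A hypothetical invocation under those flags (all paths are placeholders, and output_dir must already contain classification_ds.csv in this mode) might look like:

python create_classification_dataset.py run_idfg2 \
    --mode splits \
    --match-test old_run/classification_ds.csv old_run/splits.json \
    --val-frac 0.2 \
    --method smallest_first --label-spec label_spec.json

Note that --test-frac is deliberately omitted here: main() asserts that exactly one of --match-test and --test-frac is supplied, and --label-spec requires --method=smallest_first.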
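
The module docstring fully specifies the contract of the three output files. The following is a minimal consumer sketch (not part of the package; output_dir is a placeholder matching the example usage) that loads them and verifies the location-level split invariant described in the docstring:

import json
import os

import pandas as pd

output_dir = 'run_idfg2'  # hypothetical output directory

# crop-level dataset: one row per cropped image
df = pd.read_csv(os.path.join(output_dir, 'classification_ds.csv'),
                 index_col=False, float_precision='high')

# label_index.json: keys are stringified ints '0'..'num_labels-1'
with open(os.path.join(output_dir, 'label_index.json'), 'r') as f:
    label_to_idx = {name: int(idx) for idx, name in json.load(f).items()}

# splits.json: each split maps to a list of [dataset, location] pairs;
# normalize both sides to str, since JSON may store numeric locations as ints
with open(os.path.join(output_dir, 'splits.json'), 'r') as f:
    split_to_locs = {s: {(str(ds), str(loc)) for ds, loc in locs}
                     for s, locs in json.load(f).items()}

# splits are location-level, so no (dataset, location) may appear in two splits
assert not (split_to_locs['train'] & split_to_locs['val'])
assert not (split_to_locs['train'] & split_to_locs['test'])
assert not (split_to_locs['val'] & split_to_locs['test'])

# recover each crop's split via its (dataset, location) tuple
crop_locs = pd.Series(
    zip(df['dataset'].astype(str), df['location'].astype(str)))
for split, locs in split_to_locs.items():
    print(split, int(crop_locs.isin(locs).sum()), 'crops')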