megadetector-5.0.28-py3-none-any.whl → megadetector-10.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +2 -2
- megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +1 -1
- megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +1 -1
- megadetector/classification/aggregate_classifier_probs.py +3 -3
- megadetector/classification/analyze_failed_images.py +5 -5
- megadetector/classification/cache_batchapi_outputs.py +5 -5
- megadetector/classification/create_classification_dataset.py +11 -12
- megadetector/classification/crop_detections.py +10 -10
- megadetector/classification/csv_to_json.py +8 -8
- megadetector/classification/detect_and_crop.py +13 -15
- megadetector/classification/efficientnet/model.py +8 -8
- megadetector/classification/efficientnet/utils.py +6 -5
- megadetector/classification/evaluate_model.py +7 -7
- megadetector/classification/identify_mislabeled_candidates.py +6 -6
- megadetector/classification/json_to_azcopy_list.py +1 -1
- megadetector/classification/json_validator.py +29 -32
- megadetector/classification/map_classification_categories.py +9 -9
- megadetector/classification/merge_classification_detection_output.py +12 -9
- megadetector/classification/prepare_classification_script.py +19 -19
- megadetector/classification/prepare_classification_script_mc.py +26 -26
- megadetector/classification/run_classifier.py +4 -4
- megadetector/classification/save_mislabeled.py +6 -6
- megadetector/classification/train_classifier.py +1 -1
- megadetector/classification/train_classifier_tf.py +9 -9
- megadetector/classification/train_utils.py +10 -10
- megadetector/data_management/annotations/annotation_constants.py +1 -2
- megadetector/data_management/camtrap_dp_to_coco.py +79 -46
- megadetector/data_management/cct_json_utils.py +103 -103
- megadetector/data_management/cct_to_md.py +49 -49
- megadetector/data_management/cct_to_wi.py +33 -33
- megadetector/data_management/coco_to_labelme.py +75 -75
- megadetector/data_management/coco_to_yolo.py +210 -193
- megadetector/data_management/databases/add_width_and_height_to_db.py +86 -12
- megadetector/data_management/databases/combine_coco_camera_traps_files.py +40 -40
- megadetector/data_management/databases/integrity_check_json_db.py +228 -200
- megadetector/data_management/databases/subset_json_db.py +33 -33
- megadetector/data_management/generate_crops_from_cct.py +88 -39
- megadetector/data_management/get_image_sizes.py +54 -49
- megadetector/data_management/labelme_to_coco.py +133 -125
- megadetector/data_management/labelme_to_yolo.py +159 -73
- megadetector/data_management/lila/create_lila_blank_set.py +81 -83
- megadetector/data_management/lila/create_lila_test_set.py +32 -31
- megadetector/data_management/lila/create_links_to_md_results_files.py +18 -18
- megadetector/data_management/lila/download_lila_subset.py +21 -24
- megadetector/data_management/lila/generate_lila_per_image_labels.py +365 -107
- megadetector/data_management/lila/get_lila_annotation_counts.py +35 -33
- megadetector/data_management/lila/get_lila_image_counts.py +22 -22
- megadetector/data_management/lila/lila_common.py +73 -70
- megadetector/data_management/lila/test_lila_metadata_urls.py +28 -19
- megadetector/data_management/mewc_to_md.py +344 -340
- megadetector/data_management/ocr_tools.py +262 -255
- megadetector/data_management/read_exif.py +249 -227
- megadetector/data_management/remap_coco_categories.py +90 -28
- megadetector/data_management/remove_exif.py +81 -21
- megadetector/data_management/rename_images.py +187 -187
- megadetector/data_management/resize_coco_dataset.py +588 -120
- megadetector/data_management/speciesnet_to_md.py +41 -41
- megadetector/data_management/wi_download_csv_to_coco.py +55 -55
- megadetector/data_management/yolo_output_to_md_output.py +248 -122
- megadetector/data_management/yolo_to_coco.py +333 -191
- megadetector/detection/change_detection.py +832 -0
- megadetector/detection/process_video.py +340 -337
- megadetector/detection/pytorch_detector.py +358 -278
- megadetector/detection/run_detector.py +399 -186
- megadetector/detection/run_detector_batch.py +404 -377
- megadetector/detection/run_inference_with_yolov5_val.py +340 -327
- megadetector/detection/run_tiled_inference.py +257 -249
- megadetector/detection/tf_detector.py +24 -24
- megadetector/detection/video_utils.py +332 -295
- megadetector/postprocessing/add_max_conf.py +19 -11
- megadetector/postprocessing/categorize_detections_by_size.py +45 -45
- megadetector/postprocessing/classification_postprocessing.py +468 -433
- megadetector/postprocessing/combine_batch_outputs.py +23 -23
- megadetector/postprocessing/compare_batch_results.py +590 -525
- megadetector/postprocessing/convert_output_format.py +106 -102
- megadetector/postprocessing/create_crop_folder.py +347 -147
- megadetector/postprocessing/detector_calibration.py +173 -168
- megadetector/postprocessing/generate_csv_report.py +508 -499
- megadetector/postprocessing/load_api_results.py +48 -27
- megadetector/postprocessing/md_to_coco.py +133 -102
- megadetector/postprocessing/md_to_labelme.py +107 -90
- megadetector/postprocessing/md_to_wi.py +40 -40
- megadetector/postprocessing/merge_detections.py +92 -114
- megadetector/postprocessing/postprocess_batch_results.py +319 -301
- megadetector/postprocessing/remap_detection_categories.py +91 -38
- megadetector/postprocessing/render_detection_confusion_matrix.py +214 -205
- megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +57 -57
- megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +27 -28
- megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +704 -679
- megadetector/postprocessing/separate_detections_into_folders.py +226 -211
- megadetector/postprocessing/subset_json_detector_output.py +265 -262
- megadetector/postprocessing/top_folders_to_bottom.py +45 -45
- megadetector/postprocessing/validate_batch_results.py +70 -70
- megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +52 -52
- megadetector/taxonomy_mapping/map_new_lila_datasets.py +18 -19
- megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +54 -33
- megadetector/taxonomy_mapping/preview_lila_taxonomy.py +67 -67
- megadetector/taxonomy_mapping/retrieve_sample_image.py +16 -16
- megadetector/taxonomy_mapping/simple_image_download.py +8 -8
- megadetector/taxonomy_mapping/species_lookup.py +156 -74
- megadetector/taxonomy_mapping/taxonomy_csv_checker.py +14 -14
- megadetector/taxonomy_mapping/taxonomy_graph.py +10 -10
- megadetector/taxonomy_mapping/validate_lila_category_mappings.py +13 -13
- megadetector/utils/ct_utils.py +1049 -211
- megadetector/utils/directory_listing.py +21 -77
- megadetector/utils/gpu_test.py +22 -22
- megadetector/utils/md_tests.py +632 -529
- megadetector/utils/path_utils.py +1520 -431
- megadetector/utils/process_utils.py +41 -41
- megadetector/utils/split_locations_into_train_val.py +62 -62
- megadetector/utils/string_utils.py +148 -27
- megadetector/utils/url_utils.py +489 -176
- megadetector/utils/wi_utils.py +2658 -2526
- megadetector/utils/write_html_image_list.py +137 -137
- megadetector/visualization/plot_utils.py +34 -30
- megadetector/visualization/render_images_with_thumbnails.py +39 -74
- megadetector/visualization/visualization_utils.py +487 -435
- megadetector/visualization/visualize_db.py +232 -198
- megadetector/visualization/visualize_detector_output.py +82 -76
- {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/METADATA +5 -2
- megadetector-10.0.0.dist-info/RECORD +139 -0
- {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/WHEEL +1 -1
- megadetector/api/batch_processing/api_core/__init__.py +0 -0
- megadetector/api/batch_processing/api_core/batch_service/__init__.py +0 -0
- megadetector/api/batch_processing/api_core/batch_service/score.py +0 -439
- megadetector/api/batch_processing/api_core/server.py +0 -294
- megadetector/api/batch_processing/api_core/server_api_config.py +0 -97
- megadetector/api/batch_processing/api_core/server_app_config.py +0 -55
- megadetector/api/batch_processing/api_core/server_batch_job_manager.py +0 -220
- megadetector/api/batch_processing/api_core/server_job_status_table.py +0 -149
- megadetector/api/batch_processing/api_core/server_orchestration.py +0 -360
- megadetector/api/batch_processing/api_core/server_utils.py +0 -88
- megadetector/api/batch_processing/api_core_support/__init__.py +0 -0
- megadetector/api/batch_processing/api_core_support/aggregate_results_manually.py +0 -46
- megadetector/api/batch_processing/api_support/__init__.py +0 -0
- megadetector/api/batch_processing/api_support/summarize_daily_activity.py +0 -152
- megadetector/api/batch_processing/data_preparation/__init__.py +0 -0
- megadetector/api/synchronous/__init__.py +0 -0
- megadetector/api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
- megadetector/api/synchronous/api_core/animal_detection_api/api_backend.py +0 -151
- megadetector/api/synchronous/api_core/animal_detection_api/api_frontend.py +0 -263
- megadetector/api/synchronous/api_core/animal_detection_api/config.py +0 -35
- megadetector/api/synchronous/api_core/tests/__init__.py +0 -0
- megadetector/api/synchronous/api_core/tests/load_test.py +0 -110
- megadetector/data_management/importers/add_nacti_sizes.py +0 -52
- megadetector/data_management/importers/add_timestamps_to_icct.py +0 -79
- megadetector/data_management/importers/animl_results_to_md_results.py +0 -158
- megadetector/data_management/importers/auckland_doc_test_to_json.py +0 -373
- megadetector/data_management/importers/auckland_doc_to_json.py +0 -201
- megadetector/data_management/importers/awc_to_json.py +0 -191
- megadetector/data_management/importers/bellevue_to_json.py +0 -272
- megadetector/data_management/importers/cacophony-thermal-importer.py +0 -793
- megadetector/data_management/importers/carrizo_shrubfree_2018.py +0 -269
- megadetector/data_management/importers/carrizo_trail_cam_2017.py +0 -289
- megadetector/data_management/importers/cct_field_adjustments.py +0 -58
- megadetector/data_management/importers/channel_islands_to_cct.py +0 -913
- megadetector/data_management/importers/eMammal/copy_and_unzip_emammal.py +0 -180
- megadetector/data_management/importers/eMammal/eMammal_helpers.py +0 -249
- megadetector/data_management/importers/eMammal/make_eMammal_json.py +0 -223
- megadetector/data_management/importers/ena24_to_json.py +0 -276
- megadetector/data_management/importers/filenames_to_json.py +0 -386
- megadetector/data_management/importers/helena_to_cct.py +0 -283
- megadetector/data_management/importers/idaho-camera-traps.py +0 -1407
- megadetector/data_management/importers/idfg_iwildcam_lila_prep.py +0 -294
- megadetector/data_management/importers/import_desert_lion_conservation_camera_traps.py +0 -387
- megadetector/data_management/importers/jb_csv_to_json.py +0 -150
- megadetector/data_management/importers/mcgill_to_json.py +0 -250
- megadetector/data_management/importers/missouri_to_json.py +0 -490
- megadetector/data_management/importers/nacti_fieldname_adjustments.py +0 -79
- megadetector/data_management/importers/noaa_seals_2019.py +0 -181
- megadetector/data_management/importers/osu-small-animals-to-json.py +0 -364
- megadetector/data_management/importers/pc_to_json.py +0 -365
- megadetector/data_management/importers/plot_wni_giraffes.py +0 -123
- megadetector/data_management/importers/prepare_zsl_imerit.py +0 -131
- megadetector/data_management/importers/raic_csv_to_md_results.py +0 -416
- megadetector/data_management/importers/rspb_to_json.py +0 -356
- megadetector/data_management/importers/save_the_elephants_survey_A.py +0 -320
- megadetector/data_management/importers/save_the_elephants_survey_B.py +0 -329
- megadetector/data_management/importers/snapshot_safari_importer.py +0 -758
- megadetector/data_management/importers/snapshot_serengeti_lila.py +0 -1067
- megadetector/data_management/importers/snapshotserengeti/make_full_SS_json.py +0 -150
- megadetector/data_management/importers/snapshotserengeti/make_per_season_SS_json.py +0 -153
- megadetector/data_management/importers/sulross_get_exif.py +0 -65
- megadetector/data_management/importers/timelapse_csv_set_to_json.py +0 -490
- megadetector/data_management/importers/ubc_to_json.py +0 -399
- megadetector/data_management/importers/umn_to_json.py +0 -507
- megadetector/data_management/importers/wellington_to_json.py +0 -263
- megadetector/data_management/importers/wi_to_json.py +0 -442
- megadetector/data_management/importers/zamba_results_to_md_results.py +0 -180
- megadetector/data_management/lila/add_locations_to_island_camera_traps.py +0 -101
- megadetector/data_management/lila/add_locations_to_nacti.py +0 -151
- megadetector/utils/azure_utils.py +0 -178
- megadetector/utils/sas_blob_utils.py +0 -509
- megadetector-5.0.28.dist-info/RECORD +0 -209
- /megadetector/{api/batch_processing/__init__.py → __init__.py} +0 -0
- {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/licenses/LICENSE +0 -0
- {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/top_level.txt +0 -0
@@ -35,6 +35,7 @@ from megadetector.data_management.lila.lila_common import \
 from megadetector.utils import write_html_image_list
 from megadetector.utils.path_utils import zip_file
 from megadetector.utils.path_utils import open_file
+from megadetector.utils.url_utils import parallel_download_urls
 
 # We'll write images, metadata downloads, and temporary files here
 lila_local_base = os.path.expanduser('~/lila')
@@ -47,7 +48,7 @@ os.makedirs(metadata_dir,exist_ok=True)
 
 output_file = os.path.join(lila_local_base,'lila_image_urls_and_labels.csv')
 
-# Some datasets don't have "sequence_level_annotation" fields populated, but we know their
+# Some datasets don't have "sequence_level_annotation" fields populated, but we know their
 # annotation level
 ds_name_to_annotation_level = {}
 ds_name_to_annotation_level['Caltech Camera Traps'] = 'image'
@@ -66,6 +67,18 @@ if debug_max_images_per_dataset > 0:
     print('Running in debug mode')
     output_file = output_file.replace('.csv','_debug.csv')
 
+taxonomy_levels_to_include = \
+    ['kingdom','phylum','subphylum','superclass','class','subclass','infraclass','superorder','order',
+     'suborder','infraorder','superfamily','family','subfamily','tribe','genus','subgenus',
+     'species','subspecies','variety']
+
+def _clearnan(v):
+    if isinstance(v,float):
+        assert np.isnan(v)
+        v = ''
+    assert isinstance(v,str)
+    return v
+
 
 #%% Download and parse the metadata file
 
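The _clearnan helper added in the hunk above exists because pandas reads empty cells in the taxonomy table as float NaN rather than as empty strings. A minimal standalone usage sketch (the sample values are invented for illustration, not part of the package):

    import numpy as np

    # Behavior of the _clearnan helper added above: empty cells come back from
    # pandas as float NaN; normalize them to empty strings, pass strings through.
    def _clearnan(v):
        if isinstance(v, float):
            assert np.isnan(v)
            v = ''
        assert isinstance(v, str)
        return v

    assert _clearnan(float('nan')) == ''
    assert _clearnan('puma concolor') == 'puma concolor'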
@@ -79,14 +92,14 @@ if False:
 
 #%% Download and extract metadata for each dataset
 
-for ds_name in metadata_table.keys():
+for ds_name in metadata_table.keys():
     metadata_table[ds_name]['metadata_filename'] = read_metadata_file_for_dataset(ds_name=ds_name,
                                                        metadata_dir=metadata_dir,
                                                        metadata_table=metadata_table)
-
+
 #%% Load taxonomy data
 
-taxonomy_df = read_lila_taxonomy_mapping(metadata_dir)
+taxonomy_df = read_lila_taxonomy_mapping(metadata_dir, force_download=True)
 
 
 #%% Build a dictionary that maps each [dataset,query] pair to the full taxonomic label set
@@ -95,12 +108,12 @@ ds_label_to_taxonomy = {}
 
 # i_row = 0; row = taxonomy_df.iloc[i_row]
 for i_row,row in taxonomy_df.iterrows():
-
+
     ds_label = row['dataset_name'] + ':' + row['query']
     assert ds_label.strip() == ds_label
     assert ds_label not in ds_label_to_taxonomy
     ds_label_to_taxonomy[ds_label] = row.to_dict()
-
+
 
 #%% Process annotations for each dataset
 
@@ -112,74 +125,62 @@ header = ['dataset_name','url_gcp','url_aws','url_azure',
           'image_id','sequence_id','location_id','frame_num',
           'original_label','scientific_name','common_name','datetime','annotation_level']
 
-taxonomy_levels_to_include = \
-    ['kingdom','phylum','subphylum','superclass','class','subclass','infraclass','superorder','order',
-     'suborder','infraorder','superfamily','family','subfamily','tribe','genus','species','subspecies',
-     'variety']
-
 header.extend(taxonomy_levels_to_include)
 
 missing_annotations = set()
 
-def clearnan(v):
-    if isinstance(v,float):
-        assert np.isnan(v)
-        v = ''
-    assert isinstance(v,str)
-    return v
-
 with open(output_file,'w',encoding='utf-8',newline='') as f:
-
+
     csv_writer = csv.writer(f)
     csv_writer.writerow(header)
-
+
     # ds_name = list(metadata_table.keys())[0]
     for ds_name in metadata_table.keys():
-
+
         if 'bbox' in ds_name:
             print('Skipping bbox dataset {}'.format(ds_name))
            continue
-
+
         print('Processing dataset {}'.format(ds_name))
-
+
         json_filename = metadata_table[ds_name]['metadata_filename']
         with open(json_filename, 'r') as f:
             data = json.load(f)
-
+
         categories = data['categories']
         category_ids = [c['id'] for c in categories]
         for c in categories:
            category_id_to_name = {c['id']:c['name'] for c in categories}
-
+
         annotations = data['annotations']
         images = data['images']
-
+
         image_id_to_annotations = defaultdict(list)
-
+
         # Go through annotations, marking each image with the categories that are present
         #
         # ann = annotations[0]
-        for ann in annotations:
+        for ann in annotations:
            image_id_to_annotations[ann['image_id']].append(ann)
-
+
         unannotated_images = []
-
+
         found_date = False
         found_location = False
         found_annotation_level = False
-
+
         if ds_name in ds_name_to_annotation_level:
            expected_annotation_level = ds_name_to_annotation_level[ds_name]
         else:
            expected_annotation_level = None
-
+
         # im = images[10]
         for i_image,im in tqdm(enumerate(images),total=len(images)):
-
+
             if (debug_max_images_per_dataset is not None) and (debug_max_images_per_dataset > 0) \
                 and (i_image >= debug_max_images_per_dataset):
                 break
-
+
             file_name = im['file_name'].replace('\\','/')
             base_url_gcp = metadata_table[ds_name]['image_base_url_gcp']
             base_url_aws = metadata_table[ds_name]['image_base_url_aws']
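The loop above builds image_id_to_annotations so each image's categories can be looked up without rescanning the annotation list. A small, self-contained sketch of that indexing pattern (the demo records are invented for illustration):

    from collections import defaultdict

    # Group COCO-style annotations by image_id, as in the hunk above.
    annotations_demo = [{'image_id': 'img1', 'category_id': 1},
                        {'image_id': 'img1', 'category_id': 2},
                        {'image_id': 'img2', 'category_id': 1}]

    image_id_to_annotations = defaultdict(list)
    for ann in annotations_demo:
        image_id_to_annotations[ann['image_id']].append(ann)

    assert len(image_id_to_annotations['img1']) == 2
    assert len(image_id_to_annotations['img2']) == 1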
@@ -187,21 +188,21 @@ with open(output_file,'w',encoding='utf-8',newline='') as f:
             assert not base_url_gcp.endswith('/')
             assert not base_url_aws.endswith('/')
             assert not base_url_azure.endswith('/')
-
+
             url_gcp = base_url_gcp + '/' + file_name
             url_aws = base_url_aws + '/' + file_name
             url_azure = base_url_azure + '/' + file_name
-
+
             for k in im.keys():
                 if ('date' in k or 'time' in k) and (k not in ['datetime','date_captured']):
                     raise ValueError('Unrecognized datetime field')
-
+
             # This field name was only used for Caltech Camera Traps
             if 'date_captured' in im:
                 assert ds_name == 'Caltech Camera Traps'
                 im['datetime'] = im['date_captured']
-
-            def has_valid_datetime(im):
+
+            def _has_valid_datetime(im):
                 if 'datetime' not in im:
                     return False
                 v = im['datetime']
@@ -212,29 +213,29 @@ with open(output_file,'w',encoding='utf-8',newline='') as f:
                 else:
                     assert isinstance(v,float) and np.isnan(v)
                     return False
-
-            dt_string = ''
-            if (has_valid_datetime(im)):
-
+
+            dt_string = ''
+            if (_has_valid_datetime(im)):
+
                 dt = dateparser.parse(im['datetime'])
-
+
                 if dt is None or dt.year < 1990 or dt.year > 2025:
-
+
                     # raise ValueError('Suspicious date parsing result')
-
-                    # Special case we don't want to print a warning about... this is
+
+                    # Special case we don't want to print a warning about... this is
                     # in invalid date that very likely originates on the camera, not at
                     # some intermediate processing step.
                     #
                     # print('Suspicious date for image {}: {} ({})'.format(
                     #     im['id'], im['datetime'], ds_name))
-                    pass
-
+                    pass
+
                 else:
-
+
                     found_date = True
                     dt_string = dt.strftime("%m-%d-%Y %H:%M:%S")
-
+
             # Location, sequence, and image IDs are only guaranteed to be unique within
             # a dataset, so for the output .csv file, include both
             if 'location' in im:
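The date handling in this hunk parses each image's datetime string with dateparser, treats years outside 1990-2025 as suspicious, and re-serializes valid dates in the format written to the .csv. A short sketch of that flow, assuming the dateparser package the script already uses (the timestamp string is invented):

    import dateparser

    dt = dateparser.parse('2013-01-15 08:00:00')
    if dt is None or dt.year < 1990 or dt.year > 2025:
        dt_string = ''  # unparseable or suspicious date
    else:
        dt_string = dt.strftime('%m-%d-%Y %H:%M:%S')

    assert dt_string == '01-15-2013 08:00:00'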
@@ -242,25 +243,25 @@ with open(output_file,'w',encoding='utf-8',newline='') as f:
                 location_id = ds_name + ' : ' + str(im['location'])
             else:
                 location_id = ds_name
-
+
             image_id = ds_name + ' : ' + str(im['id'])
-
+
             if 'seq_id' in im:
                 sequence_id = ds_name + ' : ' + str(im['seq_id'])
             else:
                 sequence_id = ds_name + ' : ' + 'unknown'
-
+
             if 'frame_num' in im:
                 frame_num = im['frame_num']
             else:
                 frame_num = -1
-
+
             annotations_this_image = image_id_to_annotations[im['id']]
-
+
             categories_this_image = set()
-
+
             annotation_level = 'unknown'
-
+
             for ann in annotations_this_image:
                 assert ann['image_id'] == im['id']
                 categories_this_image.add(category_id_to_name[ann['category_id']])
@@ -275,35 +276,35 @@ with open(output_file,'w',encoding='utf-8',newline='') as f:
                     'Unexpected annotation level'
             elif expected_annotation_level is not None:
                 annotation_level = expected_annotation_level
-
+
             if len(categories_this_image) == 0:
                 unannotated_images.append(im)
                 continue
-
+
             # category_name = list(categories_this_image)[0]
             for category_name in categories_this_image:
-
+
                 ds_label = ds_name + ':' + category_name.lower()
-
+
                 if ds_label not in ds_label_to_taxonomy:
-
+
                     assert ds_label in known_unmapped_labels
-
+
                     # Only print a warning the first time we see an unmapped label
                     if ds_label not in missing_annotations:
                         print('Warning: {} not in taxonomy file'.format(ds_label))
                         missing_annotations.add(ds_label)
                     continue
-
+
                 taxonomy_labels = ds_label_to_taxonomy[ds_label]
-
+
                 """
-                header =
+                header =
                 ['dataset_name','url','image_id','sequence_id','location_id',
                 'frame_num','original_label','scientific_name','common_name',
                 'datetime','annotation_level']
                 """
-
+
                 row = []
                 row.append(ds_name)
                 row.append(url_gcp)
@@ -314,37 +315,37 @@ with open(output_file,'w',encoding='utf-8',newline='') as f:
                 row.append(location_id)
                 row.append(frame_num)
                 row.append(taxonomy_labels['query'])
-                row.append(clearnan(taxonomy_labels['scientific_name']))
-                row.append(clearnan(taxonomy_labels['common_name']))
+                row.append(_clearnan(taxonomy_labels['scientific_name']))
+                row.append(_clearnan(taxonomy_labels['common_name']))
                 row.append(dt_string)
                 row.append(annotation_level)
-
+
                 for s in taxonomy_levels_to_include:
-                    row.append(clearnan(taxonomy_labels[s]))
-
+                    row.append(_clearnan(taxonomy_labels[s]))
+
                 assert len(row) == len(header)
-
+
                 csv_writer.writerow(row)
-
+
             # ...for each category that was applied at least once to this image
-
+
         # ...for each image in this dataset
-
+
         if not found_date:
             pass
             # print('Warning: no date information available for this dataset')
-
+
         if not found_location:
             pass
             # print('Warning: no location information available for this dataset')
-
+
         if not found_annotation_level and (ds_name not in ds_name_to_annotation_level):
             print('Warning: no annotation level information available for this dataset')
-
+
         if len(unannotated_images) > 0:
             print('Warning: {} of {} images are un-annotated\n'.\
                   format(len(unannotated_images),len(images)))
-
+
     # ...for each dataset
 
 # ...with open()
@@ -360,11 +361,14 @@ print('Read {} rows from {}'.format(len(df),output_file))
 
 #%% Do some post-hoc integrity checking
 
-# Takes ~10 minutes without
+# Takes ~5 minutes with apply(), or ~10 minutes without apply()
+#
+# Using apply() is faster, but more annoying to debug.
+use_pandas_apply_for_integrity_checking = True
 
 tqdm.pandas()
 
-def isint(v):
+def _isint(v):
     return isinstance(v,int) or isinstance(v,np.int64)
 
 valid_annotation_levels = set(['sequence','image','unknown'])
@@ -373,8 +377,8 @@ valid_annotation_levels = set(['sequence','image','unknown'])
 # in the next cell to look for datasets that only have a single location
 dataset_name_to_locations = defaultdict(set)
 
-def check_row(row):
-
+def _check_row(row):
+
     assert row['dataset_name'] in metadata_table.keys()
     for url_column in ['url_gcp','url_aws','url_azure']:
         assert row[url_column].startswith('https://') or row[url_column].startswith('http://')
@@ -387,21 +391,20 @@ def check_row(row):
         assert np.isnan(row['frame_num'])
     else:
         # -1 is sometimes used for sequences of unknown length
-        assert isint(row['frame_num']) and row['frame_num'] >= -1
+        assert _isint(row['frame_num']) and row['frame_num'] >= -1
 
     ds_name = row['dataset_name']
     dataset_name_to_locations[ds_name].add(row['location_id'])
-
-
-
-
-    df.progress_apply(check_row, axis=1)
+
+if use_pandas_apply_for_integrity_checking:
+
+    df.progress_apply(_check_row, axis=1)
 
 else:
-
+
     # i_row = 0; row = df.iloc[i_row]
     for i_row,row in tqdm(df.iterrows(),total=len(df)):
-        check_row(row)
+        _check_row(row)
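The change in this hunk puts the row-level integrity check behind a use_pandas_apply_for_integrity_checking flag: tqdm's progress_apply() is faster, while the explicit iterrows() loop is easier to step through in a debugger. A self-contained sketch of the two paths (the demo DataFrame and helper are invented; only the tqdm.pandas()/progress_apply pattern mirrors the script):

    import numpy as np
    import pandas as pd
    from tqdm import tqdm

    df_demo = pd.DataFrame({'frame_num': [0, 1, -1],
                            'location_id': ['loc-a', 'loc-b', 'loc-c']})

    def _check_row_demo(row):
        # Same shape of check as _check_row above, on made-up columns
        assert isinstance(row['frame_num'], (int, np.int64)) and row['frame_num'] >= -1

    tqdm.pandas()  # registers DataFrame.progress_apply

    use_apply = True
    if use_apply:
        df_demo.progress_apply(_check_row_demo, axis=1)
    else:
        for _, row in tqdm(df_demo.iterrows(), total=len(df_demo)):
            _check_row_demo(row)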
@@ -428,31 +431,32 @@ images_to_download = []
 
 # ds_name = list(metadata_table.keys())[2]
 for ds_name in metadata_table.keys():
-
+
     if 'bbox' in ds_name:
         continue
-
+
     # Find all rows for this dataset
     ds_rows = df.loc[df['dataset_name'] == ds_name]
-
+
     print('{} rows available for {}'.format(len(ds_rows),ds_name))
     assert len(ds_rows) > 0
-
+
     empty_rows = ds_rows[ds_rows['scientific_name'].isnull()]
     non_empty_rows = ds_rows[~ds_rows['scientific_name'].isnull()]
-
+
     if len(empty_rows) == 0:
         print('No empty images available for {}'.format(ds_name))
     elif len(empty_rows) > n_empty_images_per_dataset:
         empty_rows = empty_rows.sample(n=n_empty_images_per_dataset)
     images_to_download.extend(empty_rows.to_dict('records'))
 
+    # All LILA datasets have non-empty images
     if len(non_empty_rows) == 0:
-
+        raise ValueError('No non-empty images available for {}'.format(ds_name))
     elif len(non_empty_rows) > n_non_empty_images_per_dataset:
         non_empty_rows = non_empty_rows.sample(n=n_non_empty_images_per_dataset)
     images_to_download.extend(non_empty_rows.to_dict('records'))
-
+
 # ...for each dataset
 
 print('Selected {} total images'.format(len(images_to_download)))
@@ -462,13 +466,13 @@ print('Selected {} total images'.format(len(images_to_download)))
 
 # Expect a few errors for images with human or vehicle labels (or things like "ignore" that *could* be humans)
 
-preferred_cloud = '
+preferred_cloud = 'gcp'
 
 url_to_target_file = {}
 
 # i_image = 10; image = images_to_download[i_image]
 for i_image,image in tqdm(enumerate(images_to_download),total=len(images_to_download)):
-
+
     url = image['url_' + preferred_cloud]
     ext = os.path.splitext(url)[1]
     fn_relative = 'image_{}'.format(str(i_image).zfill(4)) + ext
@@ -476,14 +480,26 @@ for i_image,image in tqdm(enumerate(images_to_download),total=len(images_to_down
     image['relative_file'] = fn_relative
     image['url'] = url
     url_to_target_file[url] = fn_abs
-
+
 
 #%% Download images (execution)
 
-from megadetector.utils.url_utils import parallel_download_urls
 download_results = parallel_download_urls(url_to_target_file,verbose=False,overwrite=True,
                                           n_workers=20,pool_type='thread')
 
+# 10-20 errors is normal; they should all be images that are labeled as "human"
+errors = []
+
+for r in download_results:
+    if r['status'] != 'success':
+        errors.append(r)
+
+assert len(download_results) == len(url_to_target_file)
+print('Errors on {} of {} downloads:\n'.format(len(errors),len(download_results)))
+
+for err in errors:
+    print(err['url'])
+
 
 
 #%% Write preview HTML
@@ -493,10 +509,10 @@ html_images = []
 
 # im = images_to_download[0]
 for im in images_to_download:
-
+
     if im['relative_file'] is None:
         continue
-
+
     output_im = {}
     output_im['filename'] = im['relative_file']
     output_im['linkTarget'] = im['url']
@@ -504,7 +520,7 @@ for im in images_to_download:
     output_im['imageStyle'] = 'width:600px;'
     output_im['textStyle'] = 'font-weight:normal;font-size:100%;'
     html_images.append(output_im)
-
+
 write_html_image_list.write_html_image_list(html_filename,html_images)
 
 open_file(html_filename)
@@ -515,3 +531,245 @@ open_file(html_filename)
 zipped_output_file = zip_file(output_file,verbose=True,overwrite=True)
 
 print('Zipped {} to {}'.format(output_file,zipped_output_file))
+
+
+#%% Convert to .json
+
+"""
+The .csv file "output_file" (already loaded into the variable "df" at this point) has the following columns:
+
+dataset_name,url_gcp,url_aws,url_azure,image_id,sequence_id,location_id,frame_num,original_label,scientific_name,common_name,datetime,annotation_level,kingdom,phylum,subphylum,superclass,class,subclass,infraclass,superorder,order,suborder,infraorder,superfamily,family,subfamily,tribe,genus,subgenus,species,subspecies,variety
+
+Each row in the .csv represents an image. The URL columns represent the location of that
+image on three different clouds; for a given image, the value of those columns differs only
+in the prefix. The columns starting with "kingdom" represent a taxonomic wildlife identifier. Not
+all rows have values in all of these columns; some rows represent non-wildlife images where all of these
+columns are blank.
+
+This cell converts this to a .json dictionary, with the following top-level keys:
+
+## datasets (dict)
+
+A dict mapping integer IDs to strings.
+
+Each unique value in the "dataset_name" column should become an element in this dict with a unique ID.
+
+## sequences (dict)
+
+A dict mapping integer IDs to strings.
+
+Each unique value in the "sequence_id" column should become an element in this dict with a unique ID.
+
+## locations (dict)
+
+A dict mapping integer IDs to strings.
+
+Each unique value in the "location_id" column should become an element in this dict with a unique ID.
+
+## base_urls (dict)
+
+This key should point to the following dict:
+
+{
+    "gcp": "https://storage.googleapis.com/public-datasets-lila/",
+    "aws": "http://us-west-2.opendata.source.coop.s3.amazonaws.com/agentmorris/lila-wildlife/",
+    "azure": "https://lilawildlife.blob.core.windows.net/lila-wildlife/",
+}
+
+All values in the url_gcp, url_aws, and url_azure columns start with these values, respectively.
+
+## taxa (dict)
+
+A dict mapping integer IDs to dicts, where each dict has the fields:
+
+kingdom,phylum,subphylum,superclass,class,subclass,infraclass,superorder,order,suborder,infraorder,superfamily,family,subfamily,tribe,genus,subgenus,species,subspecies,variety
+
+The value of each of these fields in each row is either a string or None.
+
+## images (list)
+
+A list of images, where each image is a dict with the following fields:
+
+### dataset (int)
+
+The integer ID corresponding to the dataset_name column for this image
+
+### path (str)
+
+The suffix for this image's URL, which should be the same across the three URL columns.
+
+### seq (int)
+
+The integer ID corresponding to the sequence_id column for this image
+
+### loc (int)
+
+The integer ID corresponding to the location_id column for this image
+
+### frame_num
+
+The value of the frame_num column for this image, unless the original value was -1,
+in which case this is omitted.
+
+### original_label
+
+The value of the original_label column for this image
+
+### common_name
+
+The value of the common_name column for this image, if not empty
+
+### datetime
+
+The value of the datetime column for this image
+
+### ann_level
+
+The value of the annotation_level column for this image
+
+### taxon
+
+The integer ID corresponding to the taxonomic identifier columns for this image
+
+--
+
+The original .csv file is large (~15GB); this may impact the implementation of the .json conversion. Speed of
+conversion is not a priority.
+
+"""
+
+print('Converting to JSON...')
+
+output_json_file = output_file.replace('.csv', '.json')
+
+json_data = {}
+
+# Create mappings for datasets, sequences, and locations
+dataset_to_id = {}
+sequence_to_id = {}
+location_to_id = {}
+taxa_to_id = {}
+
+next_dataset_id = 0
+next_sequence_id = 0
+next_location_id = 0
+next_taxa_id = 0
+
+json_data['datasets'] = {}
+json_data['sequences'] = {}
+json_data['locations'] = {}
+json_data['taxa'] = {}
+
+json_data['base_urls'] = {
+    "gcp": "https://storage.googleapis.com/public-datasets-lila/",
+    "aws": "http://us-west-2.opendata.source.coop.s3.amazonaws.com/agentmorris/lila-wildlife/",
+    "azure": "https://lilawildlife.blob.core.windows.net/lila-wildlife/",
+}
+
+json_data['images'] = []
+
+debug_max_json_conversion_rows = None
+
+print('Counting rows in .csv file...')
+
+# Get total number of lines for progress bar (optional, but helpful for large files)
+def _count_lines(filename):
+    with open(filename, 'r', encoding='utf-8') as f:
+        return sum(1 for line in f) - 1
+
+total_rows = _count_lines(output_file)
+print('Total rows to process: {}'.format(total_rows))
+
+# Read CSV file line by line
+with open(output_file, 'r', encoding='utf-8') as csvfile:
+
+    reader = csv.DictReader(csvfile)
+
+    # Process each row
+    for i_row, row in enumerate(tqdm(reader, total=total_rows, desc="Processing rows")):
+
+        if (debug_max_json_conversion_rows is not None) and (i_row >= debug_max_json_conversion_rows):
+            break
+
+        # Datasets
+        dataset_name = row['dataset_name']
+        if dataset_name not in dataset_to_id:
+            dataset_to_id[dataset_name] = next_dataset_id
+            json_data['datasets'][str(next_dataset_id)] = dataset_name
+            next_dataset_id += 1
+        dataset_id = dataset_to_id[dataset_name]
+
+        # Sequences
+        sequence_id_str = row['sequence_id']
+        assert sequence_id_str.startswith(dataset_name + ' : ')
+        if sequence_id_str not in sequence_to_id:
+            sequence_to_id[sequence_id_str] = next_sequence_id
+            json_data['sequences'][str(next_sequence_id)] = sequence_id_str
+            next_sequence_id += 1
+        sequence_id = sequence_to_id[sequence_id_str]
+
+        # Locations
+        location_id_str = row['location_id']
+        assert location_id_str.startswith(dataset_name) # + ' : ')
+        if location_id_str not in location_to_id:
+            location_to_id[location_id_str] = next_location_id
+            json_data['locations'][str(next_location_id)] = location_id_str
+            next_location_id += 1
+        location_id = location_to_id[location_id_str]
+
+        # Taxa
+        taxa_data = {level: _clearnan(row[level]) for level in taxonomy_levels_to_include}
+        taxa_tuple = tuple(taxa_data.items()) # use tuple for hashable key
+        if taxa_tuple not in taxa_to_id:
+            taxa_to_id[taxa_tuple] = next_taxa_id
+            json_data['taxa'][str(next_taxa_id)] = taxa_data
+            next_taxa_id += 1
+        taxa_id = taxa_to_id[taxa_tuple]
+
+        # Image path
+        url_gcp = row['url_gcp']
+        assert url_gcp.startswith(json_data['base_urls']['gcp'])
+        path = url_gcp.replace(json_data['base_urls']['gcp'], '')
+
+        common_name = _clearnan(row['common_name'])
+
+        frame_num = int(row['frame_num'])
+
+        # Image data
+        image_entry = {
+            'dataset': dataset_id,
+            'path': path,
+            'seq': sequence_id,
+            'loc': location_id,
+            'ann_level': row['annotation_level'],
+            'original_label': row['original_label'],
+            'datetime': row['datetime'],
+            'taxon': taxa_id
+        }
+
+        if frame_num >= 0:
+            image_entry['frame_num'] = frame_num
+
+        if len(common_name) > 0:
+            image_entry['common_name'] = common_name
+
+        json_data['images'].append(image_entry)
+
+    # ...for each line
+
+# ...with open(...)
+
+# Save the JSON data
+print('Saving JSON file...')
+with open(output_json_file, 'w', encoding='utf-8') as f:
+    json.dump(json_data, f, indent=1)
+
+print(f'Converted to JSON and saved to {output_json_file}')
+print(f'JSON file size: {os.path.getsize(output_json_file)/(1024*1024*1024):.2f} GB')
+
+# Print summary statistics
+print(f'Total datasets: {len(json_data["datasets"])}')
+print(f'Total sequences: {len(json_data["sequences"])}')
+print(f'Total locations: {len(json_data["locations"])}')
+print(f'Total taxa: {len(json_data["taxa"])}')
+print(f'Total images: {len(json_data["images"])}')