megadetector-5.0.27-py3-none-any.whl → megadetector-5.0.29-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- megadetector/api/batch_processing/api_core/batch_service/score.py +4 -5
- megadetector/api/batch_processing/api_core_support/aggregate_results_manually.py +1 -1
- megadetector/api/batch_processing/api_support/summarize_daily_activity.py +1 -1
- megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +2 -2
- megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +1 -1
- megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +1 -1
- megadetector/api/synchronous/api_core/tests/load_test.py +2 -3
- megadetector/classification/aggregate_classifier_probs.py +3 -3
- megadetector/classification/analyze_failed_images.py +5 -5
- megadetector/classification/cache_batchapi_outputs.py +5 -5
- megadetector/classification/create_classification_dataset.py +11 -12
- megadetector/classification/crop_detections.py +10 -10
- megadetector/classification/csv_to_json.py +8 -8
- megadetector/classification/detect_and_crop.py +13 -15
- megadetector/classification/evaluate_model.py +7 -7
- megadetector/classification/identify_mislabeled_candidates.py +6 -6
- megadetector/classification/json_to_azcopy_list.py +1 -1
- megadetector/classification/json_validator.py +29 -32
- megadetector/classification/map_classification_categories.py +9 -9
- megadetector/classification/merge_classification_detection_output.py +12 -9
- megadetector/classification/prepare_classification_script.py +19 -19
- megadetector/classification/prepare_classification_script_mc.py +23 -23
- megadetector/classification/run_classifier.py +4 -4
- megadetector/classification/save_mislabeled.py +6 -6
- megadetector/classification/train_classifier.py +1 -1
- megadetector/classification/train_classifier_tf.py +9 -9
- megadetector/classification/train_utils.py +10 -10
- megadetector/data_management/annotations/annotation_constants.py +1 -1
- megadetector/data_management/camtrap_dp_to_coco.py +45 -45
- megadetector/data_management/cct_json_utils.py +101 -101
- megadetector/data_management/cct_to_md.py +49 -49
- megadetector/data_management/cct_to_wi.py +33 -33
- megadetector/data_management/coco_to_labelme.py +75 -75
- megadetector/data_management/coco_to_yolo.py +189 -189
- megadetector/data_management/databases/add_width_and_height_to_db.py +3 -2
- megadetector/data_management/databases/combine_coco_camera_traps_files.py +38 -38
- megadetector/data_management/databases/integrity_check_json_db.py +202 -188
- megadetector/data_management/databases/subset_json_db.py +33 -33
- megadetector/data_management/generate_crops_from_cct.py +38 -38
- megadetector/data_management/get_image_sizes.py +54 -49
- megadetector/data_management/labelme_to_coco.py +130 -124
- megadetector/data_management/labelme_to_yolo.py +78 -72
- megadetector/data_management/lila/create_lila_blank_set.py +81 -83
- megadetector/data_management/lila/create_lila_test_set.py +32 -31
- megadetector/data_management/lila/create_links_to_md_results_files.py +18 -18
- megadetector/data_management/lila/download_lila_subset.py +21 -24
- megadetector/data_management/lila/generate_lila_per_image_labels.py +91 -91
- megadetector/data_management/lila/get_lila_annotation_counts.py +30 -30
- megadetector/data_management/lila/get_lila_image_counts.py +22 -22
- megadetector/data_management/lila/lila_common.py +70 -70
- megadetector/data_management/lila/test_lila_metadata_urls.py +13 -14
- megadetector/data_management/mewc_to_md.py +339 -340
- megadetector/data_management/ocr_tools.py +258 -252
- megadetector/data_management/read_exif.py +232 -223
- megadetector/data_management/remap_coco_categories.py +26 -26
- megadetector/data_management/remove_exif.py +31 -20
- megadetector/data_management/rename_images.py +187 -187
- megadetector/data_management/resize_coco_dataset.py +41 -41
- megadetector/data_management/speciesnet_to_md.py +41 -41
- megadetector/data_management/wi_download_csv_to_coco.py +55 -55
- megadetector/data_management/yolo_output_to_md_output.py +117 -120
- megadetector/data_management/yolo_to_coco.py +195 -188
- megadetector/detection/change_detection.py +831 -0
- megadetector/detection/process_video.py +341 -338
- megadetector/detection/pytorch_detector.py +308 -266
- megadetector/detection/run_detector.py +186 -166
- megadetector/detection/run_detector_batch.py +366 -364
- megadetector/detection/run_inference_with_yolov5_val.py +328 -325
- megadetector/detection/run_tiled_inference.py +312 -253
- megadetector/detection/tf_detector.py +24 -24
- megadetector/detection/video_utils.py +291 -283
- megadetector/postprocessing/add_max_conf.py +15 -11
- megadetector/postprocessing/categorize_detections_by_size.py +44 -44
- megadetector/postprocessing/classification_postprocessing.py +808 -311
- megadetector/postprocessing/combine_batch_outputs.py +20 -21
- megadetector/postprocessing/compare_batch_results.py +528 -517
- megadetector/postprocessing/convert_output_format.py +97 -97
- megadetector/postprocessing/create_crop_folder.py +220 -147
- megadetector/postprocessing/detector_calibration.py +173 -168
- megadetector/postprocessing/generate_csv_report.py +508 -0
- megadetector/postprocessing/load_api_results.py +25 -22
- megadetector/postprocessing/md_to_coco.py +129 -98
- megadetector/postprocessing/md_to_labelme.py +89 -83
- megadetector/postprocessing/md_to_wi.py +40 -40
- megadetector/postprocessing/merge_detections.py +87 -114
- megadetector/postprocessing/postprocess_batch_results.py +319 -302
- megadetector/postprocessing/remap_detection_categories.py +36 -36
- megadetector/postprocessing/render_detection_confusion_matrix.py +205 -199
- megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +57 -57
- megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +27 -28
- megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +702 -677
- megadetector/postprocessing/separate_detections_into_folders.py +226 -211
- megadetector/postprocessing/subset_json_detector_output.py +265 -262
- megadetector/postprocessing/top_folders_to_bottom.py +45 -45
- megadetector/postprocessing/validate_batch_results.py +70 -70
- megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +52 -52
- megadetector/taxonomy_mapping/map_new_lila_datasets.py +15 -15
- megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +14 -14
- megadetector/taxonomy_mapping/preview_lila_taxonomy.py +66 -69
- megadetector/taxonomy_mapping/retrieve_sample_image.py +16 -16
- megadetector/taxonomy_mapping/simple_image_download.py +8 -8
- megadetector/taxonomy_mapping/species_lookup.py +33 -33
- megadetector/taxonomy_mapping/taxonomy_csv_checker.py +14 -14
- megadetector/taxonomy_mapping/taxonomy_graph.py +11 -11
- megadetector/taxonomy_mapping/validate_lila_category_mappings.py +13 -13
- megadetector/utils/azure_utils.py +22 -22
- megadetector/utils/ct_utils.py +1019 -200
- megadetector/utils/directory_listing.py +21 -77
- megadetector/utils/gpu_test.py +22 -22
- megadetector/utils/md_tests.py +541 -518
- megadetector/utils/path_utils.py +1511 -406
- megadetector/utils/process_utils.py +41 -41
- megadetector/utils/sas_blob_utils.py +53 -49
- megadetector/utils/split_locations_into_train_val.py +73 -60
- megadetector/utils/string_utils.py +147 -26
- megadetector/utils/url_utils.py +463 -173
- megadetector/utils/wi_utils.py +2629 -2868
- megadetector/utils/write_html_image_list.py +137 -137
- megadetector/visualization/plot_utils.py +21 -21
- megadetector/visualization/render_images_with_thumbnails.py +37 -73
- megadetector/visualization/visualization_utils.py +424 -404
- megadetector/visualization/visualize_db.py +197 -190
- megadetector/visualization/visualize_detector_output.py +126 -98
- {megadetector-5.0.27.dist-info → megadetector-5.0.29.dist-info}/METADATA +6 -3
- megadetector-5.0.29.dist-info/RECORD +163 -0
- {megadetector-5.0.27.dist-info → megadetector-5.0.29.dist-info}/WHEEL +1 -1
- megadetector/data_management/importers/add_nacti_sizes.py +0 -52
- megadetector/data_management/importers/add_timestamps_to_icct.py +0 -79
- megadetector/data_management/importers/animl_results_to_md_results.py +0 -158
- megadetector/data_management/importers/auckland_doc_test_to_json.py +0 -373
- megadetector/data_management/importers/auckland_doc_to_json.py +0 -201
- megadetector/data_management/importers/awc_to_json.py +0 -191
- megadetector/data_management/importers/bellevue_to_json.py +0 -272
- megadetector/data_management/importers/cacophony-thermal-importer.py +0 -793
- megadetector/data_management/importers/carrizo_shrubfree_2018.py +0 -269
- megadetector/data_management/importers/carrizo_trail_cam_2017.py +0 -289
- megadetector/data_management/importers/cct_field_adjustments.py +0 -58
- megadetector/data_management/importers/channel_islands_to_cct.py +0 -913
- megadetector/data_management/importers/eMammal/copy_and_unzip_emammal.py +0 -180
- megadetector/data_management/importers/eMammal/eMammal_helpers.py +0 -249
- megadetector/data_management/importers/eMammal/make_eMammal_json.py +0 -223
- megadetector/data_management/importers/ena24_to_json.py +0 -276
- megadetector/data_management/importers/filenames_to_json.py +0 -386
- megadetector/data_management/importers/helena_to_cct.py +0 -283
- megadetector/data_management/importers/idaho-camera-traps.py +0 -1407
- megadetector/data_management/importers/idfg_iwildcam_lila_prep.py +0 -294
- megadetector/data_management/importers/import_desert_lion_conservation_camera_traps.py +0 -387
- megadetector/data_management/importers/jb_csv_to_json.py +0 -150
- megadetector/data_management/importers/mcgill_to_json.py +0 -250
- megadetector/data_management/importers/missouri_to_json.py +0 -490
- megadetector/data_management/importers/nacti_fieldname_adjustments.py +0 -79
- megadetector/data_management/importers/noaa_seals_2019.py +0 -181
- megadetector/data_management/importers/osu-small-animals-to-json.py +0 -364
- megadetector/data_management/importers/pc_to_json.py +0 -365
- megadetector/data_management/importers/plot_wni_giraffes.py +0 -123
- megadetector/data_management/importers/prepare_zsl_imerit.py +0 -131
- megadetector/data_management/importers/raic_csv_to_md_results.py +0 -416
- megadetector/data_management/importers/rspb_to_json.py +0 -356
- megadetector/data_management/importers/save_the_elephants_survey_A.py +0 -320
- megadetector/data_management/importers/save_the_elephants_survey_B.py +0 -329
- megadetector/data_management/importers/snapshot_safari_importer.py +0 -758
- megadetector/data_management/importers/snapshot_serengeti_lila.py +0 -1067
- megadetector/data_management/importers/snapshotserengeti/make_full_SS_json.py +0 -150
- megadetector/data_management/importers/snapshotserengeti/make_per_season_SS_json.py +0 -153
- megadetector/data_management/importers/sulross_get_exif.py +0 -65
- megadetector/data_management/importers/timelapse_csv_set_to_json.py +0 -490
- megadetector/data_management/importers/ubc_to_json.py +0 -399
- megadetector/data_management/importers/umn_to_json.py +0 -507
- megadetector/data_management/importers/wellington_to_json.py +0 -263
- megadetector/data_management/importers/wi_to_json.py +0 -442
- megadetector/data_management/importers/zamba_results_to_md_results.py +0 -180
- megadetector/data_management/lila/add_locations_to_island_camera_traps.py +0 -101
- megadetector/data_management/lila/add_locations_to_nacti.py +0 -151
- megadetector-5.0.27.dist-info/RECORD +0 -208
- {megadetector-5.0.27.dist-info → megadetector-5.0.29.dist-info}/licenses/LICENSE +0 -0
- {megadetector-5.0.27.dist-info → megadetector-5.0.29.dist-info}/top_level.txt +0 -0
megadetector/taxonomy_mapping/species_lookup.py

@@ -114,7 +114,7 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
     if (not force_init) and (inat_taxonomy is not None):
         print('Skipping taxonomy re-init')
         return
-
+
     if (not force_init) and (os.path.isfile(serialized_structures_file)):

         print(f'De-serializing taxonomy data from {serialized_structures_file}')
@@ -135,7 +135,7 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
             gbif_vernacular_to_taxon_id,\
             gbif_taxon_id_to_scientific,\
             gbif_scientific_to_taxon_id = structures_to_serialize
-
+
         return


@@ -146,7 +146,7 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
     for taxonomy_name, zip_url in taxonomy_urls.items():

         need_to_download = False
-
+
         if force_init:
             need_to_download = True

@@ -267,7 +267,7 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
     # Build iNat dictionaries

     print('Building lookup dictionaries for iNat taxonomy')
-
+
     for i_row, row in tqdm(inat_taxonomy.iterrows(), total=len(inat_taxonomy)):

         taxon_id = row['taxonID']
@@ -286,7 +286,7 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
     # Build GBIF dictionaries

     print('Building lookup dictionaries for GBIF taxonomy')
-
+
     for i_row, row in tqdm(gbif_taxonomy.iterrows(), total=len(gbif_taxonomy)):

         taxon_id = row['taxonID']
@@ -596,21 +596,21 @@ class TaxonomicMatch:


 hyphenated_terms = ['crowned', 'backed', 'throated', 'tailed', 'headed', 'cheeked',
-                    'ruffed', 'browed', 'eating', 'striped', 'shanked',
+                    'ruffed', 'browed', 'eating', 'striped', 'shanked',
                     'fronted', 'bellied', 'spotted', 'eared', 'collared', 'breasted',
                     'necked']

 def get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat', retry=True) -> TaxonomicMatch:
     """
-    Wrapper for _get_preferred_taxonomic_match, but expressing a variety of heuristics
+    Wrapper for _get_preferred_taxonomic_match, but expressing a variety of heuristics
     and preferences that are specific to our scenario.
-
+
     Args:
         query (str): The common or scientific name we want to look up
         taxonomy_preference (str, optional): 'inat' or 'gbif'
-        retry (bool, optional): if the initial lookup fails, should we try heuristic
+        retry (bool, optional): if the initial lookup fails, should we try heuristic
             substitutions, e.g. replacing "_" with " ", or "spp" with "species"?
-
+
     Returns:
         TaxonomicMatch: the best taxonomic match, or None
     """
@@ -618,31 +618,31 @@ def get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat', retry=True) -> TaxonomicMatch:
     m,query = _get_preferred_taxonomic_match(query=query,taxonomy_preference=taxonomy_preference)
     if (len(m.scientific_name) > 0) or (not retry):
         return m
-
+
     for s in hyphenated_terms:
         query = query.replace(' ' + s,'-' + s)
     m,query = _get_preferred_taxonomic_match(query=query,taxonomy_preference=taxonomy_preference)
     return m
-
-
+
+
 def validate_and_convert(data):
     """
     Recursively validates that all elements in the nested structure are only
     tuples, lists, ints, or np.int64, and converts np.int64 to int.
-
+
     Args:
         data: The nested structure to validate and convert
-
+
     Returns:
         The validated and converted structure
-
+
     Raises:
         TypeError: If an invalid type is encountered
     """
-
-    if isinstance(data, np.int64):
+
+    if isinstance(data, np.int64):
         return int(data)
-    elif isinstance(data, int) or isinstance(data, str):
+    elif isinstance(data, int) or isinstance(data, str):
         return data
     elif isinstance(data, (list, tuple)):
         # Process lists and tuples recursively
@@ -654,17 +654,17 @@ def validate_and_convert(data):

 # ...def validate_and_convert(...)

-
+
 def _get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat') -> TaxonomicMatch:
-
+
     query = query.lower().strip().replace('_', ' ')
     query = query.replace('unidentified','')
     query = query.replace('unknown','')
     if query.endswith(' sp'):
         query = query.replace(' sp','')
     if query.endswith(' group'):
-        query = query.replace(' group','')
-
+        query = query.replace(' group','')
+
     query = query.strip()

     # query = 'person'
@@ -686,17 +686,17 @@ def _get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat') -> TaxonomicMatch:

     n_inat_matches = len(inat_matches)
     n_gbif_matches = len(gbif_matches)
-
+
     selected_matches = None
-
+
     assert taxonomy_preference in ['gbif','inat'],\
         'Unrecognized taxonomy preference: {}'.format(taxonomy_preference)
-
+
     if n_inat_matches > 0 and taxonomy_preference == 'inat':
         selected_matches = 'inat'
     elif n_gbif_matches > 0:
         selected_matches = 'gbif'
-
+
     if selected_matches == 'inat':

         i_match = 0
@@ -802,7 +802,7 @@ def _get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat') -> TaxonomicMatch:
     # Convert np.int64's to ints
     if match is not None:
         match = validate_and_convert(match)
-
+
     taxonomy_string = str(match)

     return TaxonomicMatch(scientific_name, common_name, taxonomic_level, source,
@@ -828,15 +828,15 @@ if False:
     # print(matches)

     print_taxonomy_matches(matches,verbose=True)
-
+
     print('\n\n')
-
+
     # Print the taxonomy in the taxonomy spreadsheet format
     assert matches[1]['source'] == 'inat'
     t = str(matches[1]['taxonomy'])
     print(t)
     import clipboard; clipboard.copy(t)
-
+

     #%% Directly access the taxonomy tables

@@ -848,12 +848,12 @@ if False:

 #%% Command-line driver

-def main():
+def main(): # noqa

     # Read command line inputs (absolute path)
     parser = argparse.ArgumentParser()
     parser.add_argument('input_file')
-
+
     if len(sys.argv[1:]) == 0:
         parser.print_help()
         parser.exit()
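The species_lookup.py hunks above are whitespace and docstring cleanup plus a # noqa marker on main(); the public interface of get_preferred_taxonomic_match is unchanged between 5.0.27 and 5.0.29. A minimal usage sketch, assuming the module path implied by the file list above and that initialize_taxonomy_lookup can download or de-serialize the taxonomy archives locally:

# Hedged sketch: module path inferred from the file list; attribute names
# (scientific_name, taxonomic_level, source) are taken from the hunks above
from megadetector.taxonomy_mapping.species_lookup import (
    initialize_taxonomy_lookup, get_preferred_taxonomic_match)

# Build (or de-serialize) the iNat/GBIF lookup dictionaries
initialize_taxonomy_lookup(force_init=False)

# Look up a common or scientific name, preferring the iNat taxonomy and
# retrying with heuristic substitutions (e.g. hyphenating 'tailed', 'headed',
# etc.) if the first lookup fails
match = get_preferred_taxonomic_match('mountain lion',
                                      taxonomy_preference='inat',
                                      retry=True)
if match is not None and len(match.scientific_name) > 0:
    print(match.scientific_name, match.taxonomic_level, match.source)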
megadetector/taxonomy_mapping/taxonomy_csv_checker.py

@@ -36,7 +36,7 @@ def check_taxonomy_csv(csv_path: str) -> None:
     """
     See module docstring.
     """
-
+
     taxonomy_df = pd.read_csv(csv_path)

     graph = nx.DiGraph()
@@ -46,12 +46,12 @@ def check_taxonomy_csv(csv_path: str) -> None:
     num_scientific_name_errors = 0

     for i_row, row in taxonomy_df.iterrows():
-
+
         ds = row['dataset_name']
         ds_label = row['query']
         scientific_name = row['scientific_name']
         level = row['taxonomy_level']
-
+
         # This used to represent the source of the mapping: iNat, gbif, or manual. We've
         # stopped tracking this, so this is now vestigial.
         id_source = 0 # row['source']
@@ -95,8 +95,8 @@ def check_taxonomy_csv(csv_path: str) -> None:
             num_scientific_name_errors += 1

         taxon_child = node
-
-    # ...for each row in the taxonomy file
+
+    # ...for each row in the taxonomy file

     assert nx.is_directed_acyclic_graph(graph)

@@ -124,36 +124,36 @@ def check_taxonomy_csv(csv_path: str) -> None:
         print(f'At least one node has unresolved ambiguous parents: {e}')

     print('Processed {} rows from {}'.format(len(taxonomy_df),csv_path))
-
+
     print('num taxon level errors:', num_taxon_level_errors)
     print('num scientific name errors:', num_scientific_name_errors)


 #%% Command-line driver
-
+
 if __name__ == '__main__':
-
+
     parser = argparse.ArgumentParser()
     parser.add_argument(
         'taxonomy_csv_path',
         help='path to taxonomy CSV file')
-
+
     if len(sys.argv[1:]) == 0:
         parser.print_help()
         parser.exit()
-
+
     args = parser.parse_args()

     check_taxonomy_csv(args.taxonomy_csv_path)


 #%% Interactive driver
-
+
 if False:
-
+
     #%%
-
+
     import os
     csv_path = os.path.expanduser('~/lila/lila-taxonomy-mapping_release.csv')
     check_taxonomy_csv(csv_path)
-
+
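The taxonomy_csv_checker.py changes are blank-line and trailing-whitespace cleanup only; check_taxonomy_csv still takes a single CSV path, as the command-line and interactive drivers above show. A usage sketch mirroring the interactive driver (the CSV path is a placeholder, and the module path is inferred from the file list):

# Hedged sketch based on the interactive driver in the hunk above
import os
from megadetector.taxonomy_mapping.taxonomy_csv_checker import check_taxonomy_csv

# Validates the taxonomy mapping: builds a networkx DiGraph from the CSV rows,
# asserts it is acyclic, and prints taxon-level / scientific-name error counts
csv_path = os.path.expanduser('~/lila/lila-taxonomy-mapping_release.csv')
check_taxonomy_csv(csv_path)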
megadetector/taxonomy_mapping/taxonomy_graph.py

@@ -1,4 +1,4 @@
-"""
+r"""

 taxonomy_graph.py

@@ -69,7 +69,7 @@ class TaxonNode:
     By default, we support multiple parents for each TaxonNode. See discussion
     in module docstring above.
     """
-
+
     # class variables
     single_parent_only: ClassVar[bool] = False

@@ -82,7 +82,7 @@ class TaxonNode:

     def __init__(self, level: str, name: str,
                  graph: Optional[nx.DiGraph] = None):
-
+
         self.level = level
         self.name = name
         self.graph = graph
@@ -131,7 +131,7 @@ class TaxonNode:
         Args:
             parent: TaxonNode, must be higher in the taxonomical hierarchy
         """
-
+
         assert self.graph is not None
         parents = self.parents
         if TaxonNode.single_parent_only and len(parents) > 0:
@@ -150,7 +150,7 @@ class TaxonNode:
         Args:
             child: TaxonNode, must be lower in the taxonomical hierarchy
         """
-
+
         assert self.graph is not None
         self.graph.add_edge(self, child)

@@ -160,7 +160,7 @@ class TaxonNode:
            ds: str, name of dataset
            ds_label: str, name of label used by that dataset
        """
-
+
        self.dataset_labels.add((ds, ds_label))

     def get_dataset_labels(self,
@@ -176,7 +176,7 @@ class TaxonNode:

        Returns: set of (ds, ds_label) tuples
        """
-
+
        result = self.dataset_labels
        if include_datasets is not None:
            result = set(tup for tup in result if tup[0] in include_datasets)
@@ -199,7 +199,7 @@ class TaxonNode:

        Returns: TaxonNode, the LCA if it exists, or None if no LCA exists
        """
-
+
        paths = []
        for node in nodes:
            # get path to root
@@ -242,7 +242,7 @@ def build_taxonomy_graph(taxonomy_df: pd.DataFrame
            TaxonNode node in the tree that contains the label,
            keys are all lowercase
     """
-
+
     graph = nx.DiGraph()
     taxon_to_node = {} # maps (taxon_level, taxon_name) to a TaxonNode
     label_to_node = {} # maps (dataset_name, dataset_label) to a TaxonNode
@@ -303,12 +303,12 @@ def dag_to_tree(graph: nx.DiGraph,
     component separately.

     Args:
-        graph: nx.DiGraph, DAG representation of taxonomy
+        graph: nx.DiGraph, DAG representation of taxonomy hierarchy
         taxon_to_node: dict, maps (taxon_level, taxon_name) to a TaxonNode

     Returns: nx.DiGraph, a tree-structured graph
     """
-
+
     tree = nx.DiGraph()
     for node in graph.nodes:
         tree.add_node(node)
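Aside from whitespace cleanup, the only substantive edits to taxonomy_graph.py are the raw-string docstring prefix ("""  → r""") and the reworded dag_to_tree docstring. A hedged sketch of the two module-level helpers visible above; the three-tuple return of build_taxonomy_graph is an assumption based on the names in its body (graph, taxon_to_node, label_to_node):

# Hedged sketch: function names and parameter types come from the hunks above;
# the return order of build_taxonomy_graph is assumed
import pandas as pd
from megadetector.taxonomy_mapping.taxonomy_graph import build_taxonomy_graph, dag_to_tree

taxonomy_df = pd.read_csv('lila-taxonomy-mapping_release.csv')  # placeholder path

# DAG of TaxonNode objects, plus lookup dicts keyed by (taxon_level, taxon_name)
# and (dataset_name, dataset_label); keys are all lowercase
graph, taxon_to_node, label_to_node = build_taxonomy_graph(taxonomy_df)

# Collapse the (possibly multi-parent) DAG into a tree-structured nx.DiGraph
tree = dag_to_tree(graph, taxon_to_node)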
megadetector/taxonomy_mapping/validate_lila_category_mappings.py

@@ -17,9 +17,9 @@ from megadetector.data_management.lila.lila_common import read_lila_taxonomy_mapping
 #%% Prevent execution during infrastructural imports

 if False:
-
+
     #%% Constants
-
+
     lila_local_base = os.path.expanduser('~/lila')

     metadata_dir = os.path.join(lila_local_base,'metadata')
@@ -30,7 +30,7 @@ if False:
     lila_dataset_to_categories_file = os.path.join(category_list_dir,'lila_dataset_to_categories.json')

     assert os.path.isfile(lila_dataset_to_categories_file)
-
+

     #%% Load category and taxonomy files

@@ -48,36 +48,36 @@ if False:

     # i_row = 1; row = taxonomy_df.iloc[i_row]; row
     for i_row,row in taxonomy_df.iterrows():
-
+
         ds_query = row['dataset_name'] + ':' + row['query']
         ds_query = ds_query.lower()
-
+
         if not isinstance(row['scientific_name'],str):
             unmapped_queries.add(ds_query)
             ds_query_to_scientific_name[ds_query] = 'unmapped'
             continue
-
+
         ds_query_to_scientific_name[ds_query] = row['scientific_name']
-
-
+
+
     #%% For each dataset, make sure we can map every category to the taxonomy

     # dataset_name = list(lila_dataset_to_categories.keys())[0]
     for _dataset_name in lila_dataset_to_categories.keys():
-
+
         if '_bbox' in _dataset_name:
             dataset_name = _dataset_name.replace('_bbox','')
         else:
             dataset_name = _dataset_name
-
+
         categories = lila_dataset_to_categories[dataset_name]
-
+
         # c = categories[0]
         for c in categories:
             ds_query = dataset_name + ':' + c['name']
             ds_query = ds_query.lower()
-
+
             if ds_query not in ds_query_to_scientific_name:
-                print('Could not find mapping for {}'.format(ds_query))
+                print('Could not find mapping for {}'.format(ds_query))
             else:
                 scientific_name = ds_query_to_scientific_name[ds_query]
megadetector/utils/azure_utils.py

@@ -12,8 +12,8 @@ Requires azure-storage-blob>=12.4.0

 import json

-from typing import Any, Iterable, List, Optional, Tuple, Union
-from azure.storage.blob import BlobPrefix, ContainerClient
+from typing import Any, Iterable, Optional, Union
+from azure.storage.blob import BlobPrefix, ContainerClient # type: ignore

 from megadetector.utils import path_utils
 from megadetector.utils import sas_blob_utils
@@ -26,20 +26,20 @@ def walk_container(container_client: ContainerClient,
                    prefix: str = '',
                    store_folders: bool = True,
                    store_blobs: bool = True,
-                   debug_max_items: int = -1) -> Tuple[List[str], List[str]]:
+                   debug_max_items: int = -1) -> tuple[list[str], list[str]]:
     """
     Recursively walk folders a Azure Blob Storage container.

     Based on:
     https://github.com/Azure/azure-sdk-for-python/blob/master/sdk/storage/azure-storage-blob/samples/blob_samples_walk_blob_hierarchy.py
     """
-
+
     depth = 1

     def walk_blob_hierarchy(prefix: str,
-                            folders: Optional[List[str]] = None,
-                            blobs: Optional[List[str]] = None
-                            ) -> Tuple[List[str], List[str]]:
+                            folders: Optional[list[str]] = None,
+                            blobs: Optional[list[str]] = None
+                            ) -> tuple[list[str], list[str]]:
         if folders is None:
             folders = []
         if blobs is None:
@@ -76,11 +76,11 @@ def walk_container(container_client: ContainerClient,
     return folders, blobs


-def list_top_level_blob_folders(container_client: ContainerClient) -> List[str]:
+def list_top_level_blob_folders(container_client: ContainerClient) -> list[str]:
     """
     List all top-level folders in a container.
     """
-
+
     top_level_folders, _ = walk_container(
         container_client, max_depth=1, store_blobs=False)
     return top_level_folders
@@ -88,13 +88,13 @@ def list_top_level_blob_folders(container_client: ContainerClient) -> List[str]:

 def concatenate_json_lists(input_files: Iterable[str],
                            output_file: Optional[str] = None
-                           ) -> List[Any]:
+                           ) -> list[Any]:
     """
     Given a list of JSON files that contain lists (typically string
     filenames), concatenates the lists into a single list and optionally
     writes out this list to a new output JSON file.
     """
-
+
     output_list = []
     for fn in input_files:
         with open(fn, 'r') as f:
@@ -116,12 +116,12 @@ def upload_file_to_blob(account_name: str,
     Uploads a local file to Azure Blob Storage and returns the uploaded
     blob URI with SAS token.
     """
-
+
     container_uri = sas_blob_utils.build_azure_storage_uri(
         account=account_name, container=container_name, sas_token=sas_token)
     with open(local_path, 'rb') as data:
         return sas_blob_utils.upload_blob(
-            container_uri=container_uri, blob_name=blob_name, data=data,
+            container_uri=container_uri, blob_name=blob_name, data=data,
             overwrite=overwrite)


@@ -131,11 +131,11 @@ def enumerate_blobs_to_file(
         container_name: str,
         sas_token: Optional[str] = None,
         blob_prefix: Optional[str] = None,
-        blob_suffix: Optional[Union[str, Tuple[str]]] = None,
+        blob_suffix: Optional[Union[str, tuple[str]]] = None,
         rsearch: Optional[str] = None,
         limit: Optional[int] = None,
         verbose: Optional[bool] = True
-        ) -> List[str]:
+        ) -> list[str]:
     """
     Enumerates blobs in a container, and writes the blob names to an output
     file.
@@ -143,7 +143,7 @@ def enumerate_blobs_to_file(
     Args:
         output_file: str, path to save list of files in container
             If ends in '.json', writes a JSON string. Otherwise, writes a
-            newline-delimited list. Can be None, in which case this is just a
+            newline-delimited list. Can be None, in which case this is just a
             convenient wrapper for blob enumeration.
         account_name: str, Azure Storage account name
         container_name: str, Azure Blob Storage container name
@@ -155,24 +155,24 @@ def enumerate_blobs_to_file(
             be lowercased first before comparing with the suffix(es).
         rsearch: optional str, returned results will only contain blob names
             that match this regex. Can also be a list of regexes, in which case
-            blobs matching *any* of the regex's will be returned.
+            blobs matching *any* of the regex's will be returned.
         limit: int, maximum # of blob names to list
             if None, then returns all blob names

     Returns: list of str, sorted blob names, of length limit or shorter.
     """
-
+
     if sas_token is not None and len(sas_token) > 9 and sas_token[0] == '?':
         sas_token = sas_token[1:]
-
+
     container_uri = sas_blob_utils.build_azure_storage_uri(
         account=account_name, container=container_name, sas_token=sas_token)
-
+
     matched_blobs = sas_blob_utils.list_blobs_in_container(
         container_uri=container_uri, blob_prefix=blob_prefix,
         blob_suffix=blob_suffix, rsearch=rsearch, limit=limit, verbose=verbose)
-
+
     if output_file is not None:
         path_utils.write_list_to_file(output_file, matched_blobs)
-
+
     return matched_blobs
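The azure_utils.py changes migrate annotations from typing.List/Tuple to the builtin list/tuple generics, add a # type: ignore to the azure-storage-blob import, and clean up whitespace; runtime behavior is unchanged. A hedged usage sketch of enumerate_blobs_to_file as documented in the hunks above (account, container, and filter values are placeholders):

# Hedged sketch: parameter names, defaults, and behavior notes are taken from the
# signature and docstring shown above; the storage account details are placeholders
from megadetector.utils.azure_utils import enumerate_blobs_to_file

matched_blobs = enumerate_blobs_to_file(
    output_file='blobs.json',   # '.json' -> JSON list; other extensions -> newline-delimited text
    account_name='myaccount',
    container_name='mycontainer',
    sas_token=None,             # optional; a leading '?' is stripped automatically
    blob_prefix='images/',
    blob_suffix='.jpg',         # str or tuple of str; blob names are lowercased before comparison
    limit=1000)                 # None to enumerate everything

print('Matched {} blobs'.format(len(matched_blobs)))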