megadetector-10.0.15-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147)
  1. megadetector/__init__.py +0 -0
  2. megadetector/api/__init__.py +0 -0
  3. megadetector/api/batch_processing/integration/digiKam/setup.py +6 -0
  4. megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +465 -0
  5. megadetector/api/batch_processing/integration/eMammal/test_scripts/config_template.py +5 -0
  6. megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +125 -0
  7. megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +55 -0
  8. megadetector/classification/__init__.py +0 -0
  9. megadetector/classification/aggregate_classifier_probs.py +108 -0
  10. megadetector/classification/analyze_failed_images.py +227 -0
  11. megadetector/classification/cache_batchapi_outputs.py +198 -0
  12. megadetector/classification/create_classification_dataset.py +626 -0
  13. megadetector/classification/crop_detections.py +516 -0
  14. megadetector/classification/csv_to_json.py +226 -0
  15. megadetector/classification/detect_and_crop.py +853 -0
  16. megadetector/classification/efficientnet/__init__.py +9 -0
  17. megadetector/classification/efficientnet/model.py +415 -0
  18. megadetector/classification/efficientnet/utils.py +608 -0
  19. megadetector/classification/evaluate_model.py +520 -0
  20. megadetector/classification/identify_mislabeled_candidates.py +152 -0
  21. megadetector/classification/json_to_azcopy_list.py +63 -0
  22. megadetector/classification/json_validator.py +696 -0
  23. megadetector/classification/map_classification_categories.py +276 -0
  24. megadetector/classification/merge_classification_detection_output.py +509 -0
  25. megadetector/classification/prepare_classification_script.py +194 -0
  26. megadetector/classification/prepare_classification_script_mc.py +228 -0
  27. megadetector/classification/run_classifier.py +287 -0
  28. megadetector/classification/save_mislabeled.py +110 -0
  29. megadetector/classification/train_classifier.py +827 -0
  30. megadetector/classification/train_classifier_tf.py +725 -0
  31. megadetector/classification/train_utils.py +323 -0
  32. megadetector/data_management/__init__.py +0 -0
  33. megadetector/data_management/animl_to_md.py +161 -0
  34. megadetector/data_management/annotations/__init__.py +0 -0
  35. megadetector/data_management/annotations/annotation_constants.py +33 -0
  36. megadetector/data_management/camtrap_dp_to_coco.py +270 -0
  37. megadetector/data_management/cct_json_utils.py +566 -0
  38. megadetector/data_management/cct_to_md.py +184 -0
  39. megadetector/data_management/cct_to_wi.py +293 -0
  40. megadetector/data_management/coco_to_labelme.py +284 -0
  41. megadetector/data_management/coco_to_yolo.py +701 -0
  42. megadetector/data_management/databases/__init__.py +0 -0
  43. megadetector/data_management/databases/add_width_and_height_to_db.py +107 -0
  44. megadetector/data_management/databases/combine_coco_camera_traps_files.py +210 -0
  45. megadetector/data_management/databases/integrity_check_json_db.py +563 -0
  46. megadetector/data_management/databases/subset_json_db.py +195 -0
  47. megadetector/data_management/generate_crops_from_cct.py +200 -0
  48. megadetector/data_management/get_image_sizes.py +164 -0
  49. megadetector/data_management/labelme_to_coco.py +559 -0
  50. megadetector/data_management/labelme_to_yolo.py +349 -0
  51. megadetector/data_management/lila/__init__.py +0 -0
  52. megadetector/data_management/lila/create_lila_blank_set.py +556 -0
  53. megadetector/data_management/lila/create_lila_test_set.py +192 -0
  54. megadetector/data_management/lila/create_links_to_md_results_files.py +106 -0
  55. megadetector/data_management/lila/download_lila_subset.py +182 -0
  56. megadetector/data_management/lila/generate_lila_per_image_labels.py +777 -0
  57. megadetector/data_management/lila/get_lila_annotation_counts.py +174 -0
  58. megadetector/data_management/lila/get_lila_image_counts.py +112 -0
  59. megadetector/data_management/lila/lila_common.py +319 -0
  60. megadetector/data_management/lila/test_lila_metadata_urls.py +164 -0
  61. megadetector/data_management/mewc_to_md.py +344 -0
  62. megadetector/data_management/ocr_tools.py +873 -0
  63. megadetector/data_management/read_exif.py +964 -0
  64. megadetector/data_management/remap_coco_categories.py +195 -0
  65. megadetector/data_management/remove_exif.py +156 -0
  66. megadetector/data_management/rename_images.py +194 -0
  67. megadetector/data_management/resize_coco_dataset.py +665 -0
  68. megadetector/data_management/speciesnet_to_md.py +41 -0
  69. megadetector/data_management/wi_download_csv_to_coco.py +247 -0
  70. megadetector/data_management/yolo_output_to_md_output.py +594 -0
  71. megadetector/data_management/yolo_to_coco.py +984 -0
  72. megadetector/data_management/zamba_to_md.py +188 -0
  73. megadetector/detection/__init__.py +0 -0
  74. megadetector/detection/change_detection.py +840 -0
  75. megadetector/detection/process_video.py +479 -0
  76. megadetector/detection/pytorch_detector.py +1451 -0
  77. megadetector/detection/run_detector.py +1267 -0
  78. megadetector/detection/run_detector_batch.py +2172 -0
  79. megadetector/detection/run_inference_with_yolov5_val.py +1314 -0
  80. megadetector/detection/run_md_and_speciesnet.py +1604 -0
  81. megadetector/detection/run_tiled_inference.py +1044 -0
  82. megadetector/detection/tf_detector.py +209 -0
  83. megadetector/detection/video_utils.py +1379 -0
  84. megadetector/postprocessing/__init__.py +0 -0
  85. megadetector/postprocessing/add_max_conf.py +72 -0
  86. megadetector/postprocessing/categorize_detections_by_size.py +166 -0
  87. megadetector/postprocessing/classification_postprocessing.py +1943 -0
  88. megadetector/postprocessing/combine_batch_outputs.py +249 -0
  89. megadetector/postprocessing/compare_batch_results.py +2110 -0
  90. megadetector/postprocessing/convert_output_format.py +403 -0
  91. megadetector/postprocessing/create_crop_folder.py +629 -0
  92. megadetector/postprocessing/detector_calibration.py +570 -0
  93. megadetector/postprocessing/generate_csv_report.py +522 -0
  94. megadetector/postprocessing/load_api_results.py +223 -0
  95. megadetector/postprocessing/md_to_coco.py +428 -0
  96. megadetector/postprocessing/md_to_labelme.py +351 -0
  97. megadetector/postprocessing/md_to_wi.py +41 -0
  98. megadetector/postprocessing/merge_detections.py +392 -0
  99. megadetector/postprocessing/postprocess_batch_results.py +2140 -0
  100. megadetector/postprocessing/remap_detection_categories.py +226 -0
  101. megadetector/postprocessing/render_detection_confusion_matrix.py +677 -0
  102. megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +206 -0
  103. megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +82 -0
  104. megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +1665 -0
  105. megadetector/postprocessing/separate_detections_into_folders.py +795 -0
  106. megadetector/postprocessing/subset_json_detector_output.py +964 -0
  107. megadetector/postprocessing/top_folders_to_bottom.py +238 -0
  108. megadetector/postprocessing/validate_batch_results.py +332 -0
  109. megadetector/taxonomy_mapping/__init__.py +0 -0
  110. megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +491 -0
  111. megadetector/taxonomy_mapping/map_new_lila_datasets.py +211 -0
  112. megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +165 -0
  113. megadetector/taxonomy_mapping/preview_lila_taxonomy.py +543 -0
  114. megadetector/taxonomy_mapping/retrieve_sample_image.py +71 -0
  115. megadetector/taxonomy_mapping/simple_image_download.py +231 -0
  116. megadetector/taxonomy_mapping/species_lookup.py +1008 -0
  117. megadetector/taxonomy_mapping/taxonomy_csv_checker.py +159 -0
  118. megadetector/taxonomy_mapping/taxonomy_graph.py +346 -0
  119. megadetector/taxonomy_mapping/validate_lila_category_mappings.py +83 -0
  120. megadetector/tests/__init__.py +0 -0
  121. megadetector/tests/test_nms_synthetic.py +335 -0
  122. megadetector/utils/__init__.py +0 -0
  123. megadetector/utils/ct_utils.py +1857 -0
  124. megadetector/utils/directory_listing.py +199 -0
  125. megadetector/utils/extract_frames_from_video.py +307 -0
  126. megadetector/utils/gpu_test.py +125 -0
  127. megadetector/utils/md_tests.py +2072 -0
  128. megadetector/utils/path_utils.py +2872 -0
  129. megadetector/utils/process_utils.py +172 -0
  130. megadetector/utils/split_locations_into_train_val.py +237 -0
  131. megadetector/utils/string_utils.py +234 -0
  132. megadetector/utils/url_utils.py +825 -0
  133. megadetector/utils/wi_platform_utils.py +968 -0
  134. megadetector/utils/wi_taxonomy_utils.py +1766 -0
  135. megadetector/utils/write_html_image_list.py +239 -0
  136. megadetector/visualization/__init__.py +0 -0
  137. megadetector/visualization/plot_utils.py +309 -0
  138. megadetector/visualization/render_images_with_thumbnails.py +243 -0
  139. megadetector/visualization/visualization_utils.py +1973 -0
  140. megadetector/visualization/visualize_db.py +630 -0
  141. megadetector/visualization/visualize_detector_output.py +498 -0
  142. megadetector/visualization/visualize_video_output.py +705 -0
  143. megadetector-10.0.15.dist-info/METADATA +115 -0
  144. megadetector-10.0.15.dist-info/RECORD +147 -0
  145. megadetector-10.0.15.dist-info/WHEEL +5 -0
  146. megadetector-10.0.15.dist-info/licenses/LICENSE +19 -0
  147. megadetector-10.0.15.dist-info/top_level.txt +1 -0
megadetector/data_management/lila/get_lila_annotation_counts.py
@@ -0,0 +1,174 @@
+ """
+
+ get_lila_annotation_counts.py
+
+ Generates a .json-formatted dictionary mapping each LILA dataset to all categories
+ that exist for that dataset, with counts for the number of occurrences of each category
+ (the number of *annotations* for each category, not the number of *images*).
+
+ Also loads the taxonomy mapping file, to include scientific names for each category.
+
+ get_lila_image_counts.py counts the number of *images* for each category in each dataset.
+
+ """
+
+ #%% Constants and imports
+
+ import json
+ import os
+
+ from collections import defaultdict
+
+ from megadetector.data_management.lila.lila_common import \
+     read_lila_metadata, read_metadata_file_for_dataset, read_lila_taxonomy_mapping
+ from megadetector.utils import ct_utils
+
+ # cloud provider to use for downloading images; options are 'gcp', 'azure', or 'aws'
+ preferred_cloud = 'gcp'
+
+ # array to fill for output
+ category_list = []
+
+ # We'll write images, metadata downloads, and temporary files here
+ lila_local_base = os.path.expanduser('~/lila')
+
+ output_dir = os.path.join(lila_local_base,'lila_categories_list')
+ os.makedirs(output_dir,exist_ok=True)
+
+ metadata_dir = os.path.join(lila_local_base,'metadata')
+ os.makedirs(metadata_dir,exist_ok=True)
+
+ output_file = os.path.join(output_dir,'lila_dataset_to_categories.json')
+
+
+ #%% Load category and taxonomy files
+
+ taxonomy_df = read_lila_taxonomy_mapping(metadata_dir)
+
+
+ #%% Map dataset names and category names to scientific names
+
+ ds_query_to_scientific_name = {}
+
+ unmapped_queries = set()
+
+ datasets_with_taxonomy_mapping = set()
+
+ # i_row = 1; row = taxonomy_df.iloc[i_row]; row
+ for i_row,row in taxonomy_df.iterrows():
+
+     datasets_with_taxonomy_mapping.add(row['dataset_name'])
+
+     ds_query = row['dataset_name'] + ':' + row['query']
+     ds_query = ds_query.lower()
+
+     if not isinstance(row['scientific_name'],str):
+         unmapped_queries.add(ds_query)
+         ds_query_to_scientific_name[ds_query] = 'unmapped'
+         continue
+
+     ds_query_to_scientific_name[ds_query] = row['scientific_name']
+
+ print('Loaded taxonomy mappings for {} datasets'.format(len(datasets_with_taxonomy_mapping)))
+
+
+ #%% Download and parse the metadata file
+
+ metadata_table = read_lila_metadata(metadata_dir)
+
+ print('Loaded metadata URLs for {} datasets'.format(len(metadata_table)))
+
+
+ #%% Download and extract metadata for each dataset
+
+ for ds_name in metadata_table.keys():
+     metadata_table[ds_name]['json_filename'] = \
+         read_metadata_file_for_dataset(ds_name=ds_name,
+                                        metadata_dir=metadata_dir,
+                                        metadata_table=metadata_table,
+                                        preferred_cloud=preferred_cloud)
+
+
+ #%% Get category names and counts for each dataset
+
+ # Takes ~5 minutes
+
+ dataset_to_categories = {}
+
+ # ds_name = 'NACTI'
+ for ds_name in metadata_table.keys():
+
+     taxonomy_mapping_available = (ds_name in datasets_with_taxonomy_mapping)
+
+     if not taxonomy_mapping_available:
+         print('Warning: taxonomy mapping not available for {}'.format(ds_name))
+
+     print('Finding categories in {}'.format(ds_name))
+
+     json_filename = metadata_table[ds_name]['json_filename']
+     base_url = metadata_table[ds_name]['image_base_url_' + preferred_cloud]
+     assert not base_url.endswith('/')
+
+     # Open the metadata file
+     with open(json_filename, 'r') as f:
+         data = json.load(f)
+
+     # Collect list of categories and mappings to category name
+     categories = data['categories']
+
+     category_id_to_count = defaultdict(int)
+     annotations = data['annotations']
+
+     # ann = annotations[0]
+     for ann in annotations:
+         category_id_to_count[ann['category_id']] += 1
+
+     # c = categories[0]
+     for c in categories:
+         count = category_id_to_count[c['id']]
+         if 'count' in c:
+             assert 'bbox' in ds_name or c['count'] == count
+         c['count'] = count
+
+         # Don't do taxonomy mapping for bbox data sets, which are sometimes just binary and are
+         # always redundant with the class-level data sets.
+         if 'bbox' in ds_name:
+             c['scientific_name_from_taxonomy_mapping'] = None
+         elif not taxonomy_mapping_available:
+             c['scientific_name_from_taxonomy_mapping'] = None
+         else:
+             taxonomy_query_string = ds_name.lower().strip() + ':' + c['name'].lower()
+             if taxonomy_query_string not in ds_query_to_scientific_name:
+                 print('No match for query string {}'.format(taxonomy_query_string))
+                 # As of right now, this is the only quirky case
+                 assert '#ref!' in taxonomy_query_string and 'wcs' in ds_name.lower()
+                 c['scientific_name_from_taxonomy_mapping'] = None
+             else:
+                 sn = ds_query_to_scientific_name[taxonomy_query_string]
+                 assert sn is not None and len(sn) > 0
+                 c['scientific_name_from_taxonomy_mapping'] = sn
+
+     dataset_to_categories[ds_name] = categories
+
+ # ...for each dataset
+
+
+ #%% Print the results
+
+ # ds_name = list(dataset_to_categories.keys())[0]
+ for ds_name in dataset_to_categories:
+
+     print('\n** Category counts for {} **\n'.format(ds_name))
+
+     categories = dataset_to_categories[ds_name]
+     categories = sorted(categories, key=lambda x: x['count'], reverse=True)
+
+     for c in categories:
+         print('{} ({}): {}'.format(c['name'],c['scientific_name_from_taxonomy_mapping'],c['count']))
+
+ # ...for each dataset
+
+
+ #%% Save the results
+
+ ct_utils.write_json(output_file, dataset_to_categories)
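
The script above ends by writing lila_dataset_to_categories.json via ct_utils.write_json. A minimal sketch of how that output might be consumed, assuming ct_utils.write_json emits standard JSON; the dataset name, category, and counts in the comments are illustrative, not taken from a real run:

    import json
    import os

    # Path matches output_dir/output_file as defined in the script above
    output_file = os.path.expanduser('~/lila/lila_categories_list/lila_dataset_to_categories.json')

    with open(output_file, 'r') as f:
        dataset_to_categories = json.load(f)

    # Each value is a COCO-style category list with added fields, e.g. (illustrative):
    #
    # dataset_to_categories['Caltech Camera Traps'][0] ==
    #     {'id': 6, 'name': 'bobcat', 'count': 1234,
    #      'scientific_name_from_taxonomy_mapping': 'lynx rufus'}
    for ds_name, categories in dataset_to_categories.items():
        n_annotations = sum(c['count'] for c in categories)
        print('{}: {} categories, {} annotations'.format(ds_name, len(categories), n_annotations))
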
megadetector/data_management/lila/get_lila_image_counts.py
@@ -0,0 +1,112 @@
+ """
+
+ get_lila_image_counts.py
+
+ Count the number of images and bounding boxes with each label in one or more LILA datasets.
+
+ This script doesn't write these counts out anywhere other than the console, it's just intended
+ as a template for doing operations like this on LILA data. get_lila_annotation_counts.py writes
+ information out to a .json file, but it counts *annotations*, not *images*, for each category.
+
+ """
+
+ #%% Constants and imports
+
+ import json
+ import os
+
+ from collections import defaultdict
+
+ from megadetector.data_management.lila.lila_common import \
+     read_lila_metadata, read_metadata_file_for_dataset
+
+ # If None, will use all datasets
+ datasets_of_interest = None
+
+ # We'll write images, metadata downloads, and temporary files here
+ lila_local_base = os.path.expanduser('~/lila')
+
+ metadata_dir = os.path.join(lila_local_base,'metadata')
+ os.makedirs(metadata_dir,exist_ok=True)
+
+
+ #%% Download and parse the metadata file
+
+ metadata_table = read_lila_metadata(metadata_dir)
+
+
+ #%% Download and extract metadata for the datasets we're interested in
+
+ if datasets_of_interest is None:
+     datasets_of_interest = list(metadata_table.keys())
+
+ for ds_name in datasets_of_interest:
+     metadata_table[ds_name]['json_filename'] = read_metadata_file_for_dataset(ds_name=ds_name,
+                                                    metadata_dir=metadata_dir,
+                                                    metadata_table=metadata_table)
+
+
+ #%% Count categories
+
+ ds_name_to_category_counts = {}
+
+ # ds_name = datasets_of_interest[0]
+ for ds_name in datasets_of_interest:
+
+     category_to_image_count = {}
+     category_to_bbox_count = {}
+
+     print('Counting categories in: ' + ds_name)
+
+     json_filename = metadata_table[ds_name]['json_filename']
+     with open(json_filename, 'r') as f:
+         data = json.load(f)
+
+     categories = data['categories']
+     category_ids = [c['id'] for c in categories]
+     # Build the category ID --> name mapping once for this dataset
+     category_id_to_name = {c['id']:c['name'] for c in categories}
+     annotations = data['annotations']
+     images = data['images']
+
+     for category_id in category_ids:
+         category_name = category_id_to_name[category_id]
+         category_to_image_count[category_name] = 0
+         category_to_bbox_count[category_name] = 0
+
+     image_id_to_category_names = defaultdict(set)
+
+     # Go through annotations, marking each image with the categories that are present
+     #
+     # ann = annotations[0]
+     for ann in annotations:
+
+         category_name = category_id_to_name[ann['category_id']]
+         image_id_to_category_names[ann['image_id']].add(category_name)
+
+     # Now go through images and count categories
+     category_to_count = defaultdict(int)
+
+     # im = images[0]
+     for im in images:
+         categories_this_image = image_id_to_category_names[im['id']]
+         for category_name in categories_this_image:
+             category_to_count[category_name] += 1
+
+     ds_name_to_category_counts[ds_name] = category_to_count
+
+ # ...for each dataset
+
+
+ #%% Print the results
+
+ for ds_name in ds_name_to_category_counts:
+
+     print('\n** Category counts for {} **\n'.format(ds_name))
+
+     category_to_count = ds_name_to_category_counts[ds_name]
+     category_to_count = {k: v for k, v in sorted(category_to_count.items(), reverse=True,
+                                                  key=lambda item: item[1])}
+
+     for category_name in category_to_count.keys():
+         print('{}: {}'.format(category_name,category_to_count[category_name]))
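
Because this script is explicitly a template, a natural follow-on is to gather the per-dataset counts into a single table rather than only printing them. A hedged sketch, assuming the cells above have already populated ds_name_to_category_counts (pandas is not imported by this script, but it is already a dependency of lila_common):

    import pandas as pd

    # Flatten the per-dataset counts computed above into one long-format table
    rows = []
    for ds_name, category_to_count in ds_name_to_category_counts.items():
        for category_name, count in category_to_count.items():
            rows.append({'dataset': ds_name,
                         'category': category_name,
                         'image_count': count})

    counts_df = pd.DataFrame(rows)

    # E.g., the ten most common (dataset, category) pairs across the datasets counted
    print(counts_df.sort_values('image_count', ascending=False).head(10))
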
megadetector/data_management/lila/lila_common.py
@@ -0,0 +1,319 @@
+ """
+
+ lila_common.py
+
+ Common constants and functions related to LILA data management/retrieval.
+
+ """
+
+ #%% Imports and constants
+
+ import os
+ import json
+ import zipfile
+ import pandas as pd
+
+ from urllib.parse import urlparse
+
+ from megadetector.utils.url_utils import download_url
+ from megadetector.utils.path_utils import unzip_file
+ from megadetector.utils.ct_utils import is_empty
+
+ # LILA camera trap primary metadata file
+ lila_metadata_url = 'http://lila.science/wp-content/uploads/2023/06/lila_camera_trap_datasets.csv'
+ lila_taxonomy_mapping_url = 'https://lila.science/public/lila-taxonomy-mapping_release.csv'
+ lila_all_images_url = 'https://lila.science/public/lila_image_urls_and_labels.csv.zip'
+
+ wildlife_insights_page_size = 30000
+ wildlife_insights_taxonomy_url = 'https://api.wildlifeinsights.org/api/v1/taxonomy/taxonomies-all?fields=class,order,family,genus,species,authority,taxonomyType,uniqueIdentifier,commonNameEnglish&page[size]={}'.format(
+     wildlife_insights_page_size)
+ wildlife_insights_taxonomy_local_json_filename = 'wi_taxonomy.json'
+ wildlife_insights_taxonomy_local_csv_filename = \
+     wildlife_insights_taxonomy_local_json_filename.replace('.json','.csv')
+
+ # Filenames are consistent across clouds relative to these URLs
+ lila_base_urls = {
+     'azure':'https://lilawildlife.blob.core.windows.net/lila-wildlife/',
+     'gcp':'https://storage.googleapis.com/public-datasets-lila/',
+     'aws':'http://us-west-2.opendata.source.coop.s3.amazonaws.com/agentmorris/lila-wildlife/'
+ }
+
+ lila_cloud_urls = {
+     'azure':'https://lilawildlife.blob.core.windows.net/lila-wildlife/',
+     'gcp':'gs://public-datasets-lila/',
+     'aws':'s3://us-west-2.opendata.source.coop/agentmorris/lila-wildlife/'
+ }
+
+ for url in lila_base_urls.values():
+     assert url.endswith('/')
+
+
+ #%% Common functions
+
+ def read_wildlife_insights_taxonomy_mapping(metadata_dir, force_download=False):
+     """
+     Reads the WI taxonomy mapping file, downloading the .json data (and writing to .csv) if necessary.
+
+     Args:
+         metadata_dir (str): folder to use for temporary LILA metadata files
+         force_download (bool, optional): download the taxonomy mapping file
+             even if the local file exists.
+
+     Returns:
+         pd.DataFrame: a DataFrame with taxonomy information
+     """
+
+     wi_taxonomy_csv_path = os.path.join(metadata_dir,wildlife_insights_taxonomy_local_csv_filename)
+
+     if os.path.exists(wi_taxonomy_csv_path) and (not force_download):
+         df = pd.read_csv(wi_taxonomy_csv_path)
+     else:
+         wi_taxonomy_json_path = os.path.join(metadata_dir,wildlife_insights_taxonomy_local_json_filename)
+         download_url(wildlife_insights_taxonomy_url, wi_taxonomy_json_path,
+                      force_download=force_download)
+         with open(wi_taxonomy_json_path,'r') as f:
+             d = json.load(f)
+
+         # We haven't implemented paging, make sure that's not an issue
+         assert d['meta']['totalItems'] < wildlife_insights_page_size
+
+         # d['data'] is a list of items that look like:
+         """
+         {'id': 2000003,
+          'class': 'Mammalia',
+          'order': 'Rodentia',
+          'family': 'Abrocomidae',
+          'genus': 'Abrocoma',
+          'species': 'bennettii',
+          'authority': 'Waterhouse, 1837',
+          'commonNameEnglish': "Bennett's Chinchilla Rat",
+          'taxonomyType': 'biological',
+          'uniqueIdentifier': '7a6c93a5-bdf7-4182-82f9-7a67d23f7fe1'}
+         """
+         df = pd.DataFrame(d['data'])
+         df.to_csv(wi_taxonomy_csv_path,index=False)
+
+     return df
+
+
+ def read_lila_taxonomy_mapping(metadata_dir, force_download=False):
+     """
+     Reads the LILA taxonomy mapping file, downloading the .csv file if necessary.
+
+     Args:
+         metadata_dir (str): folder to use for temporary LILA metadata files
+         force_download (bool, optional): download the taxonomy mapping file
+             even if the local file exists.
+
+     Returns:
+         pd.DataFrame: a DataFrame with one row per identification
+     """
+
+     p = urlparse(lila_taxonomy_mapping_url)
+     taxonomy_filename = os.path.join(metadata_dir,os.path.basename(p.path))
+     download_url(lila_taxonomy_mapping_url, taxonomy_filename,
+                  force_download=force_download)
+
+     df = pd.read_csv(taxonomy_filename)
+
+     return df
+
+
+ def read_lila_metadata(metadata_dir, force_download=False):
+     """
+     Reads LILA metadata (URLs to each dataset), downloading the .csv file if necessary.
+
+     Args:
+         metadata_dir (str): folder to use for temporary LILA metadata files
+         force_download (bool, optional): download the metadata file even if
+             the local file exists.
+
+     Returns:
+         dict: a dict mapping dataset names (e.g. "Caltech Camera Traps") to dicts
+         with keys corresponding to the headers in the .csv file, currently:
+
+         - name
+         - short_name
+         - continent
+         - country
+         - region
+         - image_base_url_relative
+         - bbox_url_relative
+         - image_base_url_gcp
+         - metadata_url_gcp
+         - bbox_url_gcp
+         - image_base_url_aws
+         - metadata_url_aws
+         - bbox_url_aws
+         - image_base_url_azure
+         - metadata_url_azure
+         - bbox_url_azure
+         - mdv4_results_raw
+         - mdv5b_results_raw
+         - md_results_with_rde
+         - json_filename
+     """
+
+     # Put the master metadata file in the same folder where we're putting images
+     p = urlparse(lila_metadata_url)
+     metadata_filename = os.path.join(metadata_dir,os.path.basename(p.path))
+     download_url(lila_metadata_url, metadata_filename, force_download=force_download)
+
+     df = pd.read_csv(metadata_filename)
+
+     records = df.to_dict('records')
+
+     # Parse into a table keyed by dataset name
+     metadata_table = {}
+
+     # r = records[0]
+     for r in records:
+         if is_empty(r['name']):
+             continue
+
+         # Convert NaN's to None
+         for k in r.keys():
+             if is_empty(r[k]):
+                 r[k] = None
+
+         metadata_table[r['name']] = r
+
+     return metadata_table
+
+
+ def read_lila_all_images_file(metadata_dir, force_download=False):
+     """
+     Downloads if necessary - then unzips if necessary - the .csv file with label mappings for
+     all LILA files, and opens the resulting .csv file as a Pandas DataFrame.
+
+     Args:
+         metadata_dir (str): folder to use for temporary LILA metadata files
+         force_download (bool, optional): download the metadata file even if
+             the local file exists.
+
+     Returns:
+         pd.DataFrame: a DataFrame containing one row per identification in a LILA camera trap image
+     """
+
+     p = urlparse(lila_all_images_url)
+     lila_all_images_zip_filename = os.path.join(metadata_dir,os.path.basename(p.path))
+     download_url(lila_all_images_url, lila_all_images_zip_filename,
+                  force_download=force_download)
+
+     with zipfile.ZipFile(lila_all_images_zip_filename,'r') as z:
+         files = z.namelist()
+     assert len(files) == 1
+
+     unzipped_csv_filename = os.path.join(metadata_dir,files[0])
+     if not os.path.isfile(unzipped_csv_filename):
+         unzip_file(lila_all_images_zip_filename,metadata_dir)
+     else:
+         print('{} already unzipped'.format(unzipped_csv_filename))
+
+     df = pd.read_csv(unzipped_csv_filename)
+
+     return df
+
+
+ def read_metadata_file_for_dataset(ds_name,
+                                    metadata_dir,
+                                    metadata_table=None,
+                                    json_url=None,
+                                    preferred_cloud='gcp',
+                                    force_download=False):
+     """
+     Downloads if necessary - then unzips if necessary - the .json file for a specific dataset.
+
+     Args:
+         ds_name (str): the name of the dataset for which you want to retrieve metadata (e.g.
+             "Caltech Camera Traps")
+         metadata_dir (str): folder to use for temporary LILA metadata files
+         metadata_table (dict, optional): an optional dictionary already loaded via
+             read_lila_metadata()
+         json_url (str, optional): the URL of the metadata file; if None, will be retrieved
+             via read_lila_metadata()
+         preferred_cloud (str, optional): 'gcp' (default), 'azure', or 'aws'
+         force_download (bool, optional): download the metadata file even if
+             the local file exists.
+
+     Returns:
+         str: the .json filename on the local disk
+
+     """
+
+     if preferred_cloud is None:
+         preferred_cloud = 'gcp'
+
+     assert preferred_cloud in lila_base_urls.keys()
+
+     if json_url is None:
+
+         if metadata_table is None:
+             metadata_table = read_lila_metadata(metadata_dir)
+
+         json_url = metadata_table[ds_name]['metadata_url_' + preferred_cloud]
+
+     p = urlparse(json_url)
+     json_filename = os.path.join(metadata_dir,os.path.basename(p.path))
+     download_url(json_url, json_filename, force_download=force_download)
+
+     # Unzip if necessary
+     if json_filename.endswith('.zip'):
+
+         with zipfile.ZipFile(json_filename,'r') as z:
+             files = z.namelist()
+         assert len(files) == 1
+         unzipped_json_filename = os.path.join(metadata_dir,files[0])
+         if not os.path.isfile(unzipped_json_filename):
+             unzip_file(json_filename,metadata_dir)
+         else:
+             print('{} already unzipped'.format(unzipped_json_filename))
+         json_filename = unzipped_json_filename
+
+     return json_filename
+
+
+ #%% Interactive test driver
+
+ if False:
+
+     pass
+
+     #%% Verify that all base URLs exist
+
+     # LILA camera trap primary metadata file
+     urls = (lila_metadata_url,
+             lila_taxonomy_mapping_url,
+             lila_all_images_url,
+             wildlife_insights_taxonomy_url)
+
+     from megadetector.utils import url_utils
+
+     status_codes = url_utils.test_urls(urls,timeout=2.0)
+     assert all([code == 200 for code in status_codes])
+
+
+     #%% Verify that the metadata URLs exist for individual datasets
+
+     metadata_dir = os.path.expanduser('~/lila/metadata')
+
+     dataset_metadata = read_lila_metadata(metadata_dir)
+
+     urls_to_test = []
+
+     # ds_name = next(iter(dataset_metadata.keys()))
+     for ds_name in dataset_metadata.keys():
+
+         ds_info = dataset_metadata[ds_name]
+         for cloud_name in lila_base_urls.keys():
+             urls_to_test.append(ds_info['metadata_url_' + cloud_name])
+             if ds_info['bbox_url_relative'] is not None:
+                 urls_to_test.append(ds_info['bbox_url_' + cloud_name])
+
+     status_codes = url_utils.test_urls(urls_to_test,
+                                        error_on_failure=True,
+                                        n_workers=10,
+                                        pool_type='process',
+                                        timeout=2.0)
+     assert all([code == 200 for code in status_codes])
+
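
Taken together, the functions above support a short end-to-end flow: fetch the dataset table, pull down one dataset's COCO-formatted .json file, and join relative image paths against a per-dataset, per-cloud base URL (get_lila_annotation_counts.py asserts that these base URLs do not end in '/'). A minimal usage sketch; 'Caltech Camera Traps' is just the example dataset name used in the docstrings above, and the printed URL depends on the live LILA metadata:

    import json
    import os

    from megadetector.data_management.lila.lila_common import \
        read_lila_metadata, read_metadata_file_for_dataset

    metadata_dir = os.path.expanduser('~/lila/metadata')

    # Download (if necessary) the master metadata table and one dataset's .json file
    metadata_table = read_lila_metadata(metadata_dir)
    json_filename = read_metadata_file_for_dataset(ds_name='Caltech Camera Traps',
                                                   metadata_dir=metadata_dir,
                                                   metadata_table=metadata_table,
                                                   preferred_cloud='gcp')

    with open(json_filename, 'r') as f:
        data = json.load(f)

    # Image file names are stored relative to the per-dataset, per-cloud base URL
    base_url = metadata_table['Caltech Camera Traps']['image_base_url_gcp']
    first_image_url = base_url + '/' + data['images'][0]['file_name']
    print(first_image_url)
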