megadetector-10.0.13-py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of megadetector might be problematic.

Files changed (147)
  1. megadetector/__init__.py +0 -0
  2. megadetector/api/__init__.py +0 -0
  3. megadetector/api/batch_processing/integration/digiKam/setup.py +6 -0
  4. megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +465 -0
  5. megadetector/api/batch_processing/integration/eMammal/test_scripts/config_template.py +5 -0
  6. megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +125 -0
  7. megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +55 -0
  8. megadetector/classification/__init__.py +0 -0
  9. megadetector/classification/aggregate_classifier_probs.py +108 -0
  10. megadetector/classification/analyze_failed_images.py +227 -0
  11. megadetector/classification/cache_batchapi_outputs.py +198 -0
  12. megadetector/classification/create_classification_dataset.py +626 -0
  13. megadetector/classification/crop_detections.py +516 -0
  14. megadetector/classification/csv_to_json.py +226 -0
  15. megadetector/classification/detect_and_crop.py +853 -0
  16. megadetector/classification/efficientnet/__init__.py +9 -0
  17. megadetector/classification/efficientnet/model.py +415 -0
  18. megadetector/classification/efficientnet/utils.py +608 -0
  19. megadetector/classification/evaluate_model.py +520 -0
  20. megadetector/classification/identify_mislabeled_candidates.py +152 -0
  21. megadetector/classification/json_to_azcopy_list.py +63 -0
  22. megadetector/classification/json_validator.py +696 -0
  23. megadetector/classification/map_classification_categories.py +276 -0
  24. megadetector/classification/merge_classification_detection_output.py +509 -0
  25. megadetector/classification/prepare_classification_script.py +194 -0
  26. megadetector/classification/prepare_classification_script_mc.py +228 -0
  27. megadetector/classification/run_classifier.py +287 -0
  28. megadetector/classification/save_mislabeled.py +110 -0
  29. megadetector/classification/train_classifier.py +827 -0
  30. megadetector/classification/train_classifier_tf.py +725 -0
  31. megadetector/classification/train_utils.py +323 -0
  32. megadetector/data_management/__init__.py +0 -0
  33. megadetector/data_management/animl_to_md.py +161 -0
  34. megadetector/data_management/annotations/__init__.py +0 -0
  35. megadetector/data_management/annotations/annotation_constants.py +33 -0
  36. megadetector/data_management/camtrap_dp_to_coco.py +270 -0
  37. megadetector/data_management/cct_json_utils.py +566 -0
  38. megadetector/data_management/cct_to_md.py +184 -0
  39. megadetector/data_management/cct_to_wi.py +293 -0
  40. megadetector/data_management/coco_to_labelme.py +284 -0
  41. megadetector/data_management/coco_to_yolo.py +702 -0
  42. megadetector/data_management/databases/__init__.py +0 -0
  43. megadetector/data_management/databases/add_width_and_height_to_db.py +107 -0
  44. megadetector/data_management/databases/combine_coco_camera_traps_files.py +210 -0
  45. megadetector/data_management/databases/integrity_check_json_db.py +528 -0
  46. megadetector/data_management/databases/subset_json_db.py +195 -0
  47. megadetector/data_management/generate_crops_from_cct.py +200 -0
  48. megadetector/data_management/get_image_sizes.py +164 -0
  49. megadetector/data_management/labelme_to_coco.py +559 -0
  50. megadetector/data_management/labelme_to_yolo.py +349 -0
  51. megadetector/data_management/lila/__init__.py +0 -0
  52. megadetector/data_management/lila/create_lila_blank_set.py +556 -0
  53. megadetector/data_management/lila/create_lila_test_set.py +187 -0
  54. megadetector/data_management/lila/create_links_to_md_results_files.py +106 -0
  55. megadetector/data_management/lila/download_lila_subset.py +182 -0
  56. megadetector/data_management/lila/generate_lila_per_image_labels.py +777 -0
  57. megadetector/data_management/lila/get_lila_annotation_counts.py +174 -0
  58. megadetector/data_management/lila/get_lila_image_counts.py +112 -0
  59. megadetector/data_management/lila/lila_common.py +319 -0
  60. megadetector/data_management/lila/test_lila_metadata_urls.py +164 -0
  61. megadetector/data_management/mewc_to_md.py +344 -0
  62. megadetector/data_management/ocr_tools.py +873 -0
  63. megadetector/data_management/read_exif.py +964 -0
  64. megadetector/data_management/remap_coco_categories.py +195 -0
  65. megadetector/data_management/remove_exif.py +156 -0
  66. megadetector/data_management/rename_images.py +194 -0
  67. megadetector/data_management/resize_coco_dataset.py +663 -0
  68. megadetector/data_management/speciesnet_to_md.py +41 -0
  69. megadetector/data_management/wi_download_csv_to_coco.py +247 -0
  70. megadetector/data_management/yolo_output_to_md_output.py +594 -0
  71. megadetector/data_management/yolo_to_coco.py +876 -0
  72. megadetector/data_management/zamba_to_md.py +188 -0
  73. megadetector/detection/__init__.py +0 -0
  74. megadetector/detection/change_detection.py +840 -0
  75. megadetector/detection/process_video.py +479 -0
  76. megadetector/detection/pytorch_detector.py +1451 -0
  77. megadetector/detection/run_detector.py +1267 -0
  78. megadetector/detection/run_detector_batch.py +2159 -0
  79. megadetector/detection/run_inference_with_yolov5_val.py +1314 -0
  80. megadetector/detection/run_md_and_speciesnet.py +1494 -0
  81. megadetector/detection/run_tiled_inference.py +1038 -0
  82. megadetector/detection/tf_detector.py +209 -0
  83. megadetector/detection/video_utils.py +1379 -0
  84. megadetector/postprocessing/__init__.py +0 -0
  85. megadetector/postprocessing/add_max_conf.py +72 -0
  86. megadetector/postprocessing/categorize_detections_by_size.py +166 -0
  87. megadetector/postprocessing/classification_postprocessing.py +1752 -0
  88. megadetector/postprocessing/combine_batch_outputs.py +249 -0
  89. megadetector/postprocessing/compare_batch_results.py +2110 -0
  90. megadetector/postprocessing/convert_output_format.py +403 -0
  91. megadetector/postprocessing/create_crop_folder.py +629 -0
  92. megadetector/postprocessing/detector_calibration.py +570 -0
  93. megadetector/postprocessing/generate_csv_report.py +522 -0
  94. megadetector/postprocessing/load_api_results.py +223 -0
  95. megadetector/postprocessing/md_to_coco.py +428 -0
  96. megadetector/postprocessing/md_to_labelme.py +351 -0
  97. megadetector/postprocessing/md_to_wi.py +41 -0
  98. megadetector/postprocessing/merge_detections.py +392 -0
  99. megadetector/postprocessing/postprocess_batch_results.py +2077 -0
  100. megadetector/postprocessing/remap_detection_categories.py +226 -0
  101. megadetector/postprocessing/render_detection_confusion_matrix.py +677 -0
  102. megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +206 -0
  103. megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +82 -0
  104. megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +1665 -0
  105. megadetector/postprocessing/separate_detections_into_folders.py +795 -0
  106. megadetector/postprocessing/subset_json_detector_output.py +964 -0
  107. megadetector/postprocessing/top_folders_to_bottom.py +238 -0
  108. megadetector/postprocessing/validate_batch_results.py +332 -0
  109. megadetector/taxonomy_mapping/__init__.py +0 -0
  110. megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +491 -0
  111. megadetector/taxonomy_mapping/map_new_lila_datasets.py +213 -0
  112. megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +165 -0
  113. megadetector/taxonomy_mapping/preview_lila_taxonomy.py +543 -0
  114. megadetector/taxonomy_mapping/retrieve_sample_image.py +71 -0
  115. megadetector/taxonomy_mapping/simple_image_download.py +224 -0
  116. megadetector/taxonomy_mapping/species_lookup.py +1008 -0
  117. megadetector/taxonomy_mapping/taxonomy_csv_checker.py +159 -0
  118. megadetector/taxonomy_mapping/taxonomy_graph.py +346 -0
  119. megadetector/taxonomy_mapping/validate_lila_category_mappings.py +83 -0
  120. megadetector/tests/__init__.py +0 -0
  121. megadetector/tests/test_nms_synthetic.py +335 -0
  122. megadetector/utils/__init__.py +0 -0
  123. megadetector/utils/ct_utils.py +1857 -0
  124. megadetector/utils/directory_listing.py +199 -0
  125. megadetector/utils/extract_frames_from_video.py +307 -0
  126. megadetector/utils/gpu_test.py +125 -0
  127. megadetector/utils/md_tests.py +2072 -0
  128. megadetector/utils/path_utils.py +2832 -0
  129. megadetector/utils/process_utils.py +172 -0
  130. megadetector/utils/split_locations_into_train_val.py +237 -0
  131. megadetector/utils/string_utils.py +234 -0
  132. megadetector/utils/url_utils.py +825 -0
  133. megadetector/utils/wi_platform_utils.py +968 -0
  134. megadetector/utils/wi_taxonomy_utils.py +1759 -0
  135. megadetector/utils/write_html_image_list.py +239 -0
  136. megadetector/visualization/__init__.py +0 -0
  137. megadetector/visualization/plot_utils.py +309 -0
  138. megadetector/visualization/render_images_with_thumbnails.py +243 -0
  139. megadetector/visualization/visualization_utils.py +1940 -0
  140. megadetector/visualization/visualize_db.py +630 -0
  141. megadetector/visualization/visualize_detector_output.py +479 -0
  142. megadetector/visualization/visualize_video_output.py +705 -0
  143. megadetector-10.0.13.dist-info/METADATA +134 -0
  144. megadetector-10.0.13.dist-info/RECORD +147 -0
  145. megadetector-10.0.13.dist-info/WHEEL +5 -0
  146. megadetector-10.0.13.dist-info/licenses/LICENSE +19 -0
  147. megadetector-10.0.13.dist-info/top_level.txt +1 -0
megadetector/data_management/lila/generate_lila_per_image_labels.py
@@ -0,0 +1,777 @@
"""

generate_lila_per_image_labels.py

Generate a .csv file with one row per annotation, containing full URLs to every
camera trap image on LILA, with taxonomically expanded labels.

Typically there will be one row per image, though images with multiple annotations
will have multiple rows.

Some images may not physically exist, particularly images that are labeled as "human".
This script does not validate image URLs.

Does not include bounding box annotations.

"""
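
# Note (illustrative summary added for clarity, not part of the original module): each row of
# the output .csv pairs one annotation with that image's URLs on GCP, AWS, and Azure, plus its
# expanded taxonomy. The columns, in order, are:
#
#   dataset_name, url_gcp, url_aws, url_azure, image_id, sequence_id, location_id, frame_num,
#   original_label, scientific_name, common_name, datetime, annotation_level,
#   followed by one column per taxonomy level (kingdom ... variety)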

#%% Constants and imports

import os
import json
import pandas as pd
import numpy as np
import dateparser # type: ignore
import csv

from collections import defaultdict
from tqdm import tqdm

from megadetector.data_management.lila.lila_common import \
    read_lila_metadata, \
    read_metadata_file_for_dataset, \
    read_lila_taxonomy_mapping

from megadetector.utils import write_html_image_list
from megadetector.utils.path_utils import zip_file
from megadetector.utils.path_utils import open_file
from megadetector.utils.url_utils import parallel_download_urls

# We'll write images, metadata downloads, and temporary files here
lila_local_base = os.path.expanduser('~/lila')
preview_folder = os.path.join(lila_local_base,'csv_preview')

os.makedirs(lila_local_base,exist_ok=True)

metadata_dir = os.path.join(lila_local_base,'metadata')
os.makedirs(metadata_dir,exist_ok=True)

output_file = os.path.join(lila_local_base,'lila_image_urls_and_labels.csv')

# Some datasets don't have "sequence_level_annotation" fields populated, but we know their
# annotation level
ds_name_to_annotation_level = {}
ds_name_to_annotation_level['Caltech Camera Traps'] = 'image'
ds_name_to_annotation_level['ENA24'] = 'image'
ds_name_to_annotation_level['Island Conservation Camera Traps'] = 'image'
ds_name_to_annotation_level['Channel Islands Camera Traps'] = 'image'
ds_name_to_annotation_level['WCS Camera Traps'] = 'sequence'
ds_name_to_annotation_level['Wellington Camera Traps'] = 'sequence'
ds_name_to_annotation_level['NACTI'] = 'unknown'
ds_name_to_annotation_level['Seattle(ish) Camera Traps'] = 'image'

known_unmapped_labels = set(['WCS Camera Traps:#ref!'])

debug_max_images_per_dataset = -1
if debug_max_images_per_dataset > 0:
    print('Running in debug mode')
    output_file = output_file.replace('.csv','_debug.csv')

taxonomy_levels_to_include = \
    ['kingdom','phylum','subphylum','superclass','class','subclass','infraclass','superorder','order',
     'suborder','infraorder','superfamily','family','subfamily','tribe','genus','subgenus',
     'species','subspecies','variety']

def _clearnan(v):
    if isinstance(v,float):
        assert np.isnan(v)
        v = ''
    assert isinstance(v,str)
    return v


#%% Download and parse the metadata file

metadata_table = read_lila_metadata(metadata_dir)

# To select an individual data set for debugging
if False:
    k = 'Idaho Camera Traps'
    metadata_table = {k:metadata_table[k]}


#%% Download and extract metadata for each dataset

for ds_name in metadata_table.keys():
    metadata_table[ds_name]['metadata_filename'] = read_metadata_file_for_dataset(ds_name=ds_name,
                                                                                  metadata_dir=metadata_dir,
                                                                                  metadata_table=metadata_table)

#%% Load taxonomy data

taxonomy_df = read_lila_taxonomy_mapping(metadata_dir, force_download=True)


#%% Build a dictionary that maps each [dataset,query] pair to the full taxonomic label set

ds_label_to_taxonomy = {}

# i_row = 0; row = taxonomy_df.iloc[i_row]
for i_row,row in taxonomy_df.iterrows():

    ds_label = row['dataset_name'] + ':' + row['query']
    assert ds_label.strip() == ds_label
    assert ds_label not in ds_label_to_taxonomy
    ds_label_to_taxonomy[ds_label] = row.to_dict()


#%% Process annotations for each dataset

# Takes a few hours

# The order of these headers needs to match the order in which fields are added later in this cell;
# don't mess with this order.
header = ['dataset_name','url_gcp','url_aws','url_azure',
          'image_id','sequence_id','location_id','frame_num',
          'original_label','scientific_name','common_name','datetime','annotation_level']

header.extend(taxonomy_levels_to_include)

missing_annotations = set()

with open(output_file,'w',encoding='utf-8',newline='') as f:

    csv_writer = csv.writer(f)
    csv_writer.writerow(header)

    # ds_name = list(metadata_table.keys())[0]
    for ds_name in metadata_table.keys():

        if 'bbox' in ds_name:
            print('Skipping bbox dataset {}'.format(ds_name))
            continue

        print('Processing dataset {}'.format(ds_name))

        json_filename = metadata_table[ds_name]['metadata_filename']
        with open(json_filename, 'r') as f:
            data = json.load(f)

        categories = data['categories']
        category_id_to_name = {c['id']:c['name'] for c in categories}

        annotations = data['annotations']
        images = data['images']

        image_id_to_annotations = defaultdict(list)

        # Go through annotations, marking each image with the categories that are present
        #
        # ann = annotations[0]
        for ann in annotations:
            image_id_to_annotations[ann['image_id']].append(ann)

        unannotated_images = []

        found_date = False
        found_location = False
        found_annotation_level = False

        if ds_name in ds_name_to_annotation_level:
            expected_annotation_level = ds_name_to_annotation_level[ds_name]
        else:
            expected_annotation_level = None

        # im = images[10]
        for i_image,im in tqdm(enumerate(images),total=len(images)):

            if (debug_max_images_per_dataset is not None) and (debug_max_images_per_dataset > 0) \
                    and (i_image >= debug_max_images_per_dataset):
                break

            file_name = im['file_name'].replace('\\','/')
            base_url_gcp = metadata_table[ds_name]['image_base_url_gcp']
            base_url_aws = metadata_table[ds_name]['image_base_url_aws']
            base_url_azure = metadata_table[ds_name]['image_base_url_azure']
            assert not base_url_gcp.endswith('/')
            assert not base_url_aws.endswith('/')
            assert not base_url_azure.endswith('/')

            url_gcp = base_url_gcp + '/' + file_name
            url_aws = base_url_aws + '/' + file_name
            url_azure = base_url_azure + '/' + file_name

            for k in im.keys():
                if ('date' in k or 'time' in k) and (k not in ['datetime','date_captured']):
                    raise ValueError('Unrecognized datetime field')

            # This field name was only used for Caltech Camera Traps
            if 'date_captured' in im:
                assert ds_name == 'Caltech Camera Traps'
                im['datetime'] = im['date_captured']

            def _has_valid_datetime(im):
                if 'datetime' not in im:
                    return False
                v = im['datetime']
                if v is None:
                    return False
                if isinstance(v,str):
                    return len(v) > 0
                else:
                    assert isinstance(v,float) and np.isnan(v)
                    return False

            dt_string = ''
            if (_has_valid_datetime(im)):

                dt = dateparser.parse(im['datetime'])

                if dt is None or dt.year < 1990 or dt.year > 2025:

                    # raise ValueError('Suspicious date parsing result')

                    # Special case we don't want to print a warning about... this is
                    # an invalid date that very likely originates on the camera, not at
                    # some intermediate processing step.
                    #
                    # print('Suspicious date for image {}: {} ({})'.format(
                    #     im['id'], im['datetime'], ds_name))
                    pass

                else:

                    found_date = True
                    dt_string = dt.strftime("%m-%d-%Y %H:%M:%S")

            # Location, sequence, and image IDs are only guaranteed to be unique within
            # a dataset, so for the output .csv file, include both
            if 'location' in im:
                found_location = True
                location_id = ds_name + ' : ' + str(im['location'])
            else:
                location_id = ds_name

            image_id = ds_name + ' : ' + str(im['id'])

            if 'seq_id' in im:
                sequence_id = ds_name + ' : ' + str(im['seq_id'])
            else:
                sequence_id = ds_name + ' : ' + 'unknown'

            if 'frame_num' in im:
                frame_num = im['frame_num']
            else:
                frame_num = -1

            annotations_this_image = image_id_to_annotations[im['id']]

            categories_this_image = set()

            annotation_level = 'unknown'

            for ann in annotations_this_image:
                assert ann['image_id'] == im['id']
                categories_this_image.add(category_id_to_name[ann['category_id']])
                if 'sequence_level_annotation' in ann:
                    found_annotation_level = True
                    if ann['sequence_level_annotation']:
                        annotation_level = 'sequence'
                    else:
                        annotation_level = 'image'
                    if expected_annotation_level is not None:
                        assert expected_annotation_level == annotation_level,\
                            'Unexpected annotation level'
                elif expected_annotation_level is not None:
                    annotation_level = expected_annotation_level

            if len(categories_this_image) == 0:
                unannotated_images.append(im)
                continue

            # category_name = list(categories_this_image)[0]
            for category_name in categories_this_image:

                ds_label = ds_name + ':' + category_name.lower()

                if ds_label not in ds_label_to_taxonomy:

                    assert ds_label in known_unmapped_labels

                    # Only print a warning the first time we see an unmapped label
                    if ds_label not in missing_annotations:
                        print('Warning: {} not in taxonomy file'.format(ds_label))
                        missing_annotations.add(ds_label)
                    continue

                taxonomy_labels = ds_label_to_taxonomy[ds_label]

                """
                header =
                ['dataset_name','url_gcp','url_aws','url_azure',
                 'image_id','sequence_id','location_id','frame_num',
                 'original_label','scientific_name','common_name','datetime','annotation_level']
                """

                row = []
                row.append(ds_name)
                row.append(url_gcp)
                row.append(url_aws)
                row.append(url_azure)
                row.append(image_id)
                row.append(sequence_id)
                row.append(location_id)
                row.append(frame_num)
                row.append(taxonomy_labels['query'])
                row.append(_clearnan(taxonomy_labels['scientific_name']))
                row.append(_clearnan(taxonomy_labels['common_name']))
                row.append(dt_string)
                row.append(annotation_level)

                for s in taxonomy_levels_to_include:
                    row.append(_clearnan(taxonomy_labels[s]))

                assert len(row) == len(header)

                csv_writer.writerow(row)

            # ...for each category that was applied at least once to this image

        # ...for each image in this dataset

        if not found_date:
            pass
            # print('Warning: no date information available for this dataset')

        if not found_location:
            pass
            # print('Warning: no location information available for this dataset')

        if not found_annotation_level and (ds_name not in ds_name_to_annotation_level):
            print('Warning: no annotation level information available for this dataset')

        if len(unannotated_images) > 0:
            print('Warning: {} of {} images are un-annotated\n'.\
                  format(len(unannotated_images),len(images)))

    # ...for each dataset

# ...with open()

print('\nProcessed {} datasets'.format(len(metadata_table)))


#%% Read the .csv back

df = pd.read_csv(output_file, low_memory=False)
print('Read {} rows from {}'.format(len(df),output_file))


#%% Do some post-hoc integrity checking

# Takes ~5 minutes with apply(), or ~10 minutes without apply()
#
# Using apply() is faster, but more annoying to debug.
use_pandas_apply_for_integrity_checking = True

tqdm.pandas()

def _isint(v):
    return isinstance(v,int) or isinstance(v,np.int64)

valid_annotation_levels = set(['sequence','image','unknown'])

# Collect a list of locations within each dataset; we'll use this
# in the next cell to look for datasets that only have a single location
dataset_name_to_locations = defaultdict(set)

def _check_row(row):

    assert row['dataset_name'] in metadata_table.keys()
    for url_column in ['url_gcp','url_aws','url_azure']:
        assert row[url_column].startswith('https://') or row[url_column].startswith('http://')
    assert ' : ' in row['image_id']
    assert 'seq' not in row['location_id'].lower()
    assert row['annotation_level'] in valid_annotation_levels

    # frame_num should either be NaN or an integer
    if isinstance(row['frame_num'],float):
        assert np.isnan(row['frame_num'])
    else:
        # -1 is sometimes used for sequences of unknown length
        assert _isint(row['frame_num']) and row['frame_num'] >= -1

    ds_name = row['dataset_name']
    dataset_name_to_locations[ds_name].add(row['location_id'])

if use_pandas_apply_for_integrity_checking:

    df.progress_apply(_check_row, axis=1)

else:

    # i_row = 0; row = df.iloc[i_row]
    for i_row,row in tqdm(df.iterrows(),total=len(df)):
        _check_row(row)


#%% Check for datasets that have only one location string (typically "unknown")

# Expected: ENA24, Missouri Camera Traps, Desert Lion Conservation Camera Traps

for ds_name in dataset_name_to_locations.keys():
    if len(dataset_name_to_locations[ds_name]) == 1:
        print('No location information for {}'.format(ds_name))


#%% Preview constants

n_empty_images_per_dataset = 3
n_non_empty_images_per_dataset = 10

os.makedirs(preview_folder,exist_ok=True)


#%% Choose images to download

# Takes ~60 seconds

np.random.seed(0)
images_to_download = []

# ds_name = list(metadata_table.keys())[2]
for ds_name in metadata_table.keys():

    if 'bbox' in ds_name:
        continue

    # Find all rows for this dataset
    ds_rows = df.loc[df['dataset_name'] == ds_name]

    print('{} rows available for {}'.format(len(ds_rows),ds_name))
    assert len(ds_rows) > 0

    empty_rows = ds_rows[ds_rows['scientific_name'].isnull()]
    non_empty_rows = ds_rows[~ds_rows['scientific_name'].isnull()]

    if len(empty_rows) == 0:
        print('No empty images available for {}'.format(ds_name))
    elif len(empty_rows) > n_empty_images_per_dataset:
        empty_rows = empty_rows.sample(n=n_empty_images_per_dataset)
    images_to_download.extend(empty_rows.to_dict('records'))

    # All LILA datasets have non-empty images
    if len(non_empty_rows) == 0:
        raise ValueError('No non-empty images available for {}'.format(ds_name))
    elif len(non_empty_rows) > n_non_empty_images_per_dataset:
        non_empty_rows = non_empty_rows.sample(n=n_non_empty_images_per_dataset)
    images_to_download.extend(non_empty_rows.to_dict('records'))

# ...for each dataset

print('Selected {} total images'.format(len(images_to_download)))


#%% Download images (prep)

# Expect a few errors for images with human or vehicle labels (or things like "ignore" that *could* be humans)

preferred_cloud = 'gcp'

url_to_target_file = {}

# i_image = 10; image = images_to_download[i_image]
for i_image,image in tqdm(enumerate(images_to_download),total=len(images_to_download)):

    url = image['url_' + preferred_cloud]
    ext = os.path.splitext(url)[1]
    fn_relative = 'image_{}'.format(str(i_image).zfill(4)) + ext
    fn_abs = os.path.join(preview_folder,fn_relative)
    image['relative_file'] = fn_relative
    image['url'] = url
    url_to_target_file[url] = fn_abs


#%% Download images (execution)

download_results = parallel_download_urls(url_to_target_file,verbose=False,overwrite=True,
                                          n_workers=20,pool_type='thread')

# 10-20 errors is normal; they should all be images that are labeled as "human"
errors = []

for r in download_results:
    if r['status'] != 'success':
        errors.append(r)

assert len(download_results) == len(url_to_target_file)
print('Errors on {} of {} downloads:\n'.format(len(errors),len(download_results)))

for err in errors:
    print(err['url'])


#%% Write preview HTML

html_filename = os.path.join(preview_folder,'index.html')

html_images = []

# im = images_to_download[0]
for im in images_to_download:

    if im['relative_file'] is None:
        continue

    output_im = {}
    output_im['filename'] = im['relative_file']
    output_im['linkTarget'] = im['url']
    output_im['title'] = '<b>{}: {}</b><br/><br/>'.format(im['dataset_name'],im['original_label']) + str(im)
    output_im['imageStyle'] = 'width:600px;'
    output_im['textStyle'] = 'font-weight:normal;font-size:100%;'
    html_images.append(output_im)

write_html_image_list.write_html_image_list(html_filename,html_images)

open_file(html_filename)


#%% Zip output file

zipped_output_file = zip_file(output_file,verbose=True,overwrite=True)

print('Zipped {} to {}'.format(output_file,zipped_output_file))


#%% Experimental: convert to .json

"""
The .csv file "output_file" (already loaded into the variable "df" at this point) has the following columns:

dataset_name,url_gcp,url_aws,url_azure,image_id,sequence_id,location_id,frame_num,original_label,scientific_name,common_name,datetime,annotation_level,kingdom,phylum,subphylum,superclass,class,subclass,infraclass,superorder,order,suborder,infraorder,superfamily,family,subfamily,tribe,genus,subgenus,species,subspecies,variety

Each row in the .csv represents an image. The URL columns represent the location of that
image on three different clouds; for a given image, the value of those columns differs only
in the prefix. The columns starting with "kingdom" represent a taxonomic wildlife identifier. Not
all rows have values in all of these columns; some rows represent non-wildlife images where all of these
columns are blank.

This cell converts this to a .json dictionary, with the following top-level keys:

## datasets (dict)

A dict mapping integer IDs to strings.

Each unique value in the "dataset_name" column should become an element in this dict with a unique ID.

## sequences (dict)

A dict mapping integer IDs to strings.

Each unique value in the "sequence_id" column should become an element in this dict with a unique ID.

## locations (dict)

A dict mapping integer IDs to strings.

Each unique value in the "location_id" column should become an element in this dict with a unique ID.

## base_urls (dict)

This key should point to the following dict:

{
    "gcp": "https://storage.googleapis.com/public-datasets-lila/",
    "aws": "http://us-west-2.opendata.source.coop.s3.amazonaws.com/agentmorris/lila-wildlife/",
    "azure": "https://lilawildlife.blob.core.windows.net/lila-wildlife/",
}

All values in the url_gcp, url_aws, and url_azure columns start with these values, respectively.

## taxa (dict)

A dict mapping integer IDs to dicts, where each dict has the fields:

kingdom,phylum,subphylum,superclass,class,subclass,infraclass,superorder,order,suborder,infraorder,superfamily,family,subfamily,tribe,genus,subgenus,species,subspecies,variety

The value of each of these fields in each row is either a string or None.

## images (list)

A list of images, where each image is a dict with the following fields:

### dataset (int)

The integer ID corresponding to the dataset_name column for this image

### path (str)

The suffix for this image's URL, which should be the same across the three URL columns.

### seq (int)

The integer ID corresponding to the sequence_id column for this image

### loc (int)

The integer ID corresponding to the location_id column for this image

### frame_num

The value of the frame_num column for this image, unless the original value was -1,
in which case this is omitted.

### original_label

The value of the original_label column for this image

### common_name

The value of the common_name column for this image, if not empty

### datetime

The value of the datetime column for this image

### ann_level

The value of the annotation_level column for this image

### taxon

The integer ID corresponding to the taxonomic identifier columns for this image

--

The original .csv file is large (~15GB); this may impact the implementation of the .json conversion. Speed of
conversion is not a priority.

"""

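# Illustrative example of the target structure described above (added for clarity, not part
# of the original module; all values are hypothetical):
#
# {
#   "datasets":  {"0": "Caltech Camera Traps"},
#   "sequences": {"0": "Caltech Camera Traps : seq_0001"},
#   "locations": {"0": "Caltech Camera Traps : 38"},
#   "base_urls": {"gcp": "https://storage.googleapis.com/public-datasets-lila/", ...},
#   "taxa":      {"0": {"kingdom": "animalia", "class": "mammalia", ..., "variety": null}},
#   "images": [
#     {"dataset": 0, "path": "some-dataset-folder/image_0001.jpg", "seq": 0, "loc": 0,
#      "frame_num": 0, "original_label": "bobcat", "common_name": "bobcat",
#      "datetime": "07-20-2012 06:10:52", "ann_level": "image", "taxon": 0}
#   ]
# }
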
print('Converting to JSON...')

output_json_file = output_file.replace('.csv', '.json')

json_data = {}

# Create mappings for datasets, sequences, and locations
dataset_to_id = {}
sequence_to_id = {}
location_to_id = {}
taxa_to_id = {}

next_dataset_id = 0
next_sequence_id = 0
next_location_id = 0
next_taxa_id = 0

json_data['datasets'] = {}
json_data['sequences'] = {}
json_data['locations'] = {}
json_data['taxa'] = {}

json_data['base_urls'] = {
    "gcp": "https://storage.googleapis.com/public-datasets-lila/",
    "aws": "http://us-west-2.opendata.source.coop.s3.amazonaws.com/agentmorris/lila-wildlife/",
    "azure": "https://lilawildlife.blob.core.windows.net/lila-wildlife/",
}

json_data['images'] = []

debug_max_json_conversion_rows = None

print('Counting rows in .csv file...')

# Get total number of lines for progress bar (optional, but helpful for large files)
def _count_lines(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        return sum(1 for line in f) - 1

total_rows = _count_lines(output_file)
print('Total rows to process: {}'.format(total_rows))

# Read CSV file line by line
with open(output_file, 'r', encoding='utf-8') as csvfile:

    reader = csv.DictReader(csvfile)

    # Process each row
    for i_row, row in enumerate(tqdm(reader, total=total_rows, desc="Processing rows")):

        if (debug_max_json_conversion_rows is not None) and (i_row >= debug_max_json_conversion_rows):
            break

        # Datasets
        dataset_name = row['dataset_name']
        if dataset_name not in dataset_to_id:
            dataset_to_id[dataset_name] = next_dataset_id
            json_data['datasets'][str(next_dataset_id)] = dataset_name
            next_dataset_id += 1
        dataset_id = dataset_to_id[dataset_name]

        # Sequences
        sequence_id_str = row['sequence_id']
        assert sequence_id_str.startswith(dataset_name + ' : ')
        if sequence_id_str not in sequence_to_id:
            sequence_to_id[sequence_id_str] = next_sequence_id
            json_data['sequences'][str(next_sequence_id)] = sequence_id_str
            next_sequence_id += 1
        sequence_id = sequence_to_id[sequence_id_str]

        # Locations
        location_id_str = row['location_id']
        assert location_id_str.startswith(dataset_name) # + ' : ')
        if location_id_str not in location_to_id:
            location_to_id[location_id_str] = next_location_id
            json_data['locations'][str(next_location_id)] = location_id_str
            next_location_id += 1
        location_id = location_to_id[location_id_str]

        # Taxa
        taxa_data = {level: _clearnan(row[level]) for level in taxonomy_levels_to_include}
        taxa_tuple = tuple(taxa_data.items()) # use tuple for hashable key
        if taxa_tuple not in taxa_to_id:
            taxa_to_id[taxa_tuple] = next_taxa_id
            json_data['taxa'][str(next_taxa_id)] = taxa_data
            next_taxa_id += 1
        taxa_id = taxa_to_id[taxa_tuple]

        # Image path
        url_gcp = row['url_gcp']
        assert url_gcp.startswith(json_data['base_urls']['gcp'])
        path = url_gcp.replace(json_data['base_urls']['gcp'], '')

        common_name = _clearnan(row['common_name'])

        # Convert to float first in case this appears in the .csv file as, e.g. "3.0";
        # an empty value (NaN when read with pandas) means "unknown", i.e. -1
        frame_num = int(float(row['frame_num'])) if len(row['frame_num']) > 0 else -1

        # Image data
        image_entry = {
            'dataset': dataset_id,
            'path': path,
            'seq': sequence_id,
            'loc': location_id,
            'ann_level': row['annotation_level'],
            'original_label': row['original_label'],
            'datetime': row['datetime'],
            'taxon': taxa_id
        }

        if frame_num >= 0:
            image_entry['frame_num'] = frame_num

        if len(common_name) > 0:
            image_entry['common_name'] = common_name

        json_data['images'].append(image_entry)

    # ...for each line

# ...with open(...)

# Save the JSON data
print('Saving JSON file...')
with open(output_json_file, 'w', encoding='utf-8') as f:
    json.dump(json_data, f, indent=1)

print(f'Converted to JSON and saved to {output_json_file}')
print(f'JSON file size: {os.path.getsize(output_json_file)/(1024*1024*1024):.2f} GB')

# Print summary statistics
print(f'Total datasets: {len(json_data["datasets"])}')
print(f'Total sequences: {len(json_data["sequences"])}')
print(f'Total locations: {len(json_data["locations"])}')
print(f'Total taxa: {len(json_data["taxa"])}')
print(f'Total images: {len(json_data["images"])}')
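
For reference, a minimal sketch (not part of the package) of how the resulting
lila_image_urls_and_labels.csv might be consumed, assuming the column names written by the
script above; the species filter value is purely illustrative:

import pandas as pd

# Load the per-image label file written by generate_lila_per_image_labels.py
df = pd.read_csv('lila_image_urls_and_labels.csv', low_memory=False)

# Keep rows whose expanded taxonomy matches a species of interest
# ('puma concolor' is an illustrative value, not a guaranteed label)
rows = df[df['scientific_name'] == 'puma concolor']

# Each row carries the same image path on three clouds; pick one column, e.g. GCP
urls = rows['url_gcp'].tolist()
print('Found {} matching annotations'.format(len(urls)))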