megadetector-5.0.28-py3-none-any.whl → megadetector-10.0.0-py3-none-any.whl

This diff represents the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.

Files changed (197)
  1. megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +2 -2
  2. megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +1 -1
  3. megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +1 -1
  4. megadetector/classification/aggregate_classifier_probs.py +3 -3
  5. megadetector/classification/analyze_failed_images.py +5 -5
  6. megadetector/classification/cache_batchapi_outputs.py +5 -5
  7. megadetector/classification/create_classification_dataset.py +11 -12
  8. megadetector/classification/crop_detections.py +10 -10
  9. megadetector/classification/csv_to_json.py +8 -8
  10. megadetector/classification/detect_and_crop.py +13 -15
  11. megadetector/classification/efficientnet/model.py +8 -8
  12. megadetector/classification/efficientnet/utils.py +6 -5
  13. megadetector/classification/evaluate_model.py +7 -7
  14. megadetector/classification/identify_mislabeled_candidates.py +6 -6
  15. megadetector/classification/json_to_azcopy_list.py +1 -1
  16. megadetector/classification/json_validator.py +29 -32
  17. megadetector/classification/map_classification_categories.py +9 -9
  18. megadetector/classification/merge_classification_detection_output.py +12 -9
  19. megadetector/classification/prepare_classification_script.py +19 -19
  20. megadetector/classification/prepare_classification_script_mc.py +26 -26
  21. megadetector/classification/run_classifier.py +4 -4
  22. megadetector/classification/save_mislabeled.py +6 -6
  23. megadetector/classification/train_classifier.py +1 -1
  24. megadetector/classification/train_classifier_tf.py +9 -9
  25. megadetector/classification/train_utils.py +10 -10
  26. megadetector/data_management/annotations/annotation_constants.py +1 -2
  27. megadetector/data_management/camtrap_dp_to_coco.py +79 -46
  28. megadetector/data_management/cct_json_utils.py +103 -103
  29. megadetector/data_management/cct_to_md.py +49 -49
  30. megadetector/data_management/cct_to_wi.py +33 -33
  31. megadetector/data_management/coco_to_labelme.py +75 -75
  32. megadetector/data_management/coco_to_yolo.py +210 -193
  33. megadetector/data_management/databases/add_width_and_height_to_db.py +86 -12
  34. megadetector/data_management/databases/combine_coco_camera_traps_files.py +40 -40
  35. megadetector/data_management/databases/integrity_check_json_db.py +228 -200
  36. megadetector/data_management/databases/subset_json_db.py +33 -33
  37. megadetector/data_management/generate_crops_from_cct.py +88 -39
  38. megadetector/data_management/get_image_sizes.py +54 -49
  39. megadetector/data_management/labelme_to_coco.py +133 -125
  40. megadetector/data_management/labelme_to_yolo.py +159 -73
  41. megadetector/data_management/lila/create_lila_blank_set.py +81 -83
  42. megadetector/data_management/lila/create_lila_test_set.py +32 -31
  43. megadetector/data_management/lila/create_links_to_md_results_files.py +18 -18
  44. megadetector/data_management/lila/download_lila_subset.py +21 -24
  45. megadetector/data_management/lila/generate_lila_per_image_labels.py +365 -107
  46. megadetector/data_management/lila/get_lila_annotation_counts.py +35 -33
  47. megadetector/data_management/lila/get_lila_image_counts.py +22 -22
  48. megadetector/data_management/lila/lila_common.py +73 -70
  49. megadetector/data_management/lila/test_lila_metadata_urls.py +28 -19
  50. megadetector/data_management/mewc_to_md.py +344 -340
  51. megadetector/data_management/ocr_tools.py +262 -255
  52. megadetector/data_management/read_exif.py +249 -227
  53. megadetector/data_management/remap_coco_categories.py +90 -28
  54. megadetector/data_management/remove_exif.py +81 -21
  55. megadetector/data_management/rename_images.py +187 -187
  56. megadetector/data_management/resize_coco_dataset.py +588 -120
  57. megadetector/data_management/speciesnet_to_md.py +41 -41
  58. megadetector/data_management/wi_download_csv_to_coco.py +55 -55
  59. megadetector/data_management/yolo_output_to_md_output.py +248 -122
  60. megadetector/data_management/yolo_to_coco.py +333 -191
  61. megadetector/detection/change_detection.py +832 -0
  62. megadetector/detection/process_video.py +340 -337
  63. megadetector/detection/pytorch_detector.py +358 -278
  64. megadetector/detection/run_detector.py +399 -186
  65. megadetector/detection/run_detector_batch.py +404 -377
  66. megadetector/detection/run_inference_with_yolov5_val.py +340 -327
  67. megadetector/detection/run_tiled_inference.py +257 -249
  68. megadetector/detection/tf_detector.py +24 -24
  69. megadetector/detection/video_utils.py +332 -295
  70. megadetector/postprocessing/add_max_conf.py +19 -11
  71. megadetector/postprocessing/categorize_detections_by_size.py +45 -45
  72. megadetector/postprocessing/classification_postprocessing.py +468 -433
  73. megadetector/postprocessing/combine_batch_outputs.py +23 -23
  74. megadetector/postprocessing/compare_batch_results.py +590 -525
  75. megadetector/postprocessing/convert_output_format.py +106 -102
  76. megadetector/postprocessing/create_crop_folder.py +347 -147
  77. megadetector/postprocessing/detector_calibration.py +173 -168
  78. megadetector/postprocessing/generate_csv_report.py +508 -499
  79. megadetector/postprocessing/load_api_results.py +48 -27
  80. megadetector/postprocessing/md_to_coco.py +133 -102
  81. megadetector/postprocessing/md_to_labelme.py +107 -90
  82. megadetector/postprocessing/md_to_wi.py +40 -40
  83. megadetector/postprocessing/merge_detections.py +92 -114
  84. megadetector/postprocessing/postprocess_batch_results.py +319 -301
  85. megadetector/postprocessing/remap_detection_categories.py +91 -38
  86. megadetector/postprocessing/render_detection_confusion_matrix.py +214 -205
  87. megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +57 -57
  88. megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +27 -28
  89. megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +704 -679
  90. megadetector/postprocessing/separate_detections_into_folders.py +226 -211
  91. megadetector/postprocessing/subset_json_detector_output.py +265 -262
  92. megadetector/postprocessing/top_folders_to_bottom.py +45 -45
  93. megadetector/postprocessing/validate_batch_results.py +70 -70
  94. megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +52 -52
  95. megadetector/taxonomy_mapping/map_new_lila_datasets.py +18 -19
  96. megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +54 -33
  97. megadetector/taxonomy_mapping/preview_lila_taxonomy.py +67 -67
  98. megadetector/taxonomy_mapping/retrieve_sample_image.py +16 -16
  99. megadetector/taxonomy_mapping/simple_image_download.py +8 -8
  100. megadetector/taxonomy_mapping/species_lookup.py +156 -74
  101. megadetector/taxonomy_mapping/taxonomy_csv_checker.py +14 -14
  102. megadetector/taxonomy_mapping/taxonomy_graph.py +10 -10
  103. megadetector/taxonomy_mapping/validate_lila_category_mappings.py +13 -13
  104. megadetector/utils/ct_utils.py +1049 -211
  105. megadetector/utils/directory_listing.py +21 -77
  106. megadetector/utils/gpu_test.py +22 -22
  107. megadetector/utils/md_tests.py +632 -529
  108. megadetector/utils/path_utils.py +1520 -431
  109. megadetector/utils/process_utils.py +41 -41
  110. megadetector/utils/split_locations_into_train_val.py +62 -62
  111. megadetector/utils/string_utils.py +148 -27
  112. megadetector/utils/url_utils.py +489 -176
  113. megadetector/utils/wi_utils.py +2658 -2526
  114. megadetector/utils/write_html_image_list.py +137 -137
  115. megadetector/visualization/plot_utils.py +34 -30
  116. megadetector/visualization/render_images_with_thumbnails.py +39 -74
  117. megadetector/visualization/visualization_utils.py +487 -435
  118. megadetector/visualization/visualize_db.py +232 -198
  119. megadetector/visualization/visualize_detector_output.py +82 -76
  120. {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/METADATA +5 -2
  121. megadetector-10.0.0.dist-info/RECORD +139 -0
  122. {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/WHEEL +1 -1
  123. megadetector/api/batch_processing/api_core/__init__.py +0 -0
  124. megadetector/api/batch_processing/api_core/batch_service/__init__.py +0 -0
  125. megadetector/api/batch_processing/api_core/batch_service/score.py +0 -439
  126. megadetector/api/batch_processing/api_core/server.py +0 -294
  127. megadetector/api/batch_processing/api_core/server_api_config.py +0 -97
  128. megadetector/api/batch_processing/api_core/server_app_config.py +0 -55
  129. megadetector/api/batch_processing/api_core/server_batch_job_manager.py +0 -220
  130. megadetector/api/batch_processing/api_core/server_job_status_table.py +0 -149
  131. megadetector/api/batch_processing/api_core/server_orchestration.py +0 -360
  132. megadetector/api/batch_processing/api_core/server_utils.py +0 -88
  133. megadetector/api/batch_processing/api_core_support/__init__.py +0 -0
  134. megadetector/api/batch_processing/api_core_support/aggregate_results_manually.py +0 -46
  135. megadetector/api/batch_processing/api_support/__init__.py +0 -0
  136. megadetector/api/batch_processing/api_support/summarize_daily_activity.py +0 -152
  137. megadetector/api/batch_processing/data_preparation/__init__.py +0 -0
  138. megadetector/api/synchronous/__init__.py +0 -0
  139. megadetector/api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
  140. megadetector/api/synchronous/api_core/animal_detection_api/api_backend.py +0 -151
  141. megadetector/api/synchronous/api_core/animal_detection_api/api_frontend.py +0 -263
  142. megadetector/api/synchronous/api_core/animal_detection_api/config.py +0 -35
  143. megadetector/api/synchronous/api_core/tests/__init__.py +0 -0
  144. megadetector/api/synchronous/api_core/tests/load_test.py +0 -110
  145. megadetector/data_management/importers/add_nacti_sizes.py +0 -52
  146. megadetector/data_management/importers/add_timestamps_to_icct.py +0 -79
  147. megadetector/data_management/importers/animl_results_to_md_results.py +0 -158
  148. megadetector/data_management/importers/auckland_doc_test_to_json.py +0 -373
  149. megadetector/data_management/importers/auckland_doc_to_json.py +0 -201
  150. megadetector/data_management/importers/awc_to_json.py +0 -191
  151. megadetector/data_management/importers/bellevue_to_json.py +0 -272
  152. megadetector/data_management/importers/cacophony-thermal-importer.py +0 -793
  153. megadetector/data_management/importers/carrizo_shrubfree_2018.py +0 -269
  154. megadetector/data_management/importers/carrizo_trail_cam_2017.py +0 -289
  155. megadetector/data_management/importers/cct_field_adjustments.py +0 -58
  156. megadetector/data_management/importers/channel_islands_to_cct.py +0 -913
  157. megadetector/data_management/importers/eMammal/copy_and_unzip_emammal.py +0 -180
  158. megadetector/data_management/importers/eMammal/eMammal_helpers.py +0 -249
  159. megadetector/data_management/importers/eMammal/make_eMammal_json.py +0 -223
  160. megadetector/data_management/importers/ena24_to_json.py +0 -276
  161. megadetector/data_management/importers/filenames_to_json.py +0 -386
  162. megadetector/data_management/importers/helena_to_cct.py +0 -283
  163. megadetector/data_management/importers/idaho-camera-traps.py +0 -1407
  164. megadetector/data_management/importers/idfg_iwildcam_lila_prep.py +0 -294
  165. megadetector/data_management/importers/import_desert_lion_conservation_camera_traps.py +0 -387
  166. megadetector/data_management/importers/jb_csv_to_json.py +0 -150
  167. megadetector/data_management/importers/mcgill_to_json.py +0 -250
  168. megadetector/data_management/importers/missouri_to_json.py +0 -490
  169. megadetector/data_management/importers/nacti_fieldname_adjustments.py +0 -79
  170. megadetector/data_management/importers/noaa_seals_2019.py +0 -181
  171. megadetector/data_management/importers/osu-small-animals-to-json.py +0 -364
  172. megadetector/data_management/importers/pc_to_json.py +0 -365
  173. megadetector/data_management/importers/plot_wni_giraffes.py +0 -123
  174. megadetector/data_management/importers/prepare_zsl_imerit.py +0 -131
  175. megadetector/data_management/importers/raic_csv_to_md_results.py +0 -416
  176. megadetector/data_management/importers/rspb_to_json.py +0 -356
  177. megadetector/data_management/importers/save_the_elephants_survey_A.py +0 -320
  178. megadetector/data_management/importers/save_the_elephants_survey_B.py +0 -329
  179. megadetector/data_management/importers/snapshot_safari_importer.py +0 -758
  180. megadetector/data_management/importers/snapshot_serengeti_lila.py +0 -1067
  181. megadetector/data_management/importers/snapshotserengeti/make_full_SS_json.py +0 -150
  182. megadetector/data_management/importers/snapshotserengeti/make_per_season_SS_json.py +0 -153
  183. megadetector/data_management/importers/sulross_get_exif.py +0 -65
  184. megadetector/data_management/importers/timelapse_csv_set_to_json.py +0 -490
  185. megadetector/data_management/importers/ubc_to_json.py +0 -399
  186. megadetector/data_management/importers/umn_to_json.py +0 -507
  187. megadetector/data_management/importers/wellington_to_json.py +0 -263
  188. megadetector/data_management/importers/wi_to_json.py +0 -442
  189. megadetector/data_management/importers/zamba_results_to_md_results.py +0 -180
  190. megadetector/data_management/lila/add_locations_to_island_camera_traps.py +0 -101
  191. megadetector/data_management/lila/add_locations_to_nacti.py +0 -151
  192. megadetector/utils/azure_utils.py +0 -178
  193. megadetector/utils/sas_blob_utils.py +0 -509
  194. megadetector-5.0.28.dist-info/RECORD +0 -209
  195. /megadetector/{api/batch_processing/__init__.py → __init__.py} +0 -0
  196. {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/licenses/LICENSE +0 -0
  197. {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/top_level.txt +0 -0
@@ -35,6 +35,7 @@ from megadetector.data_management.lila.lila_common import \
  from megadetector.utils import write_html_image_list
  from megadetector.utils.path_utils import zip_file
  from megadetector.utils.path_utils import open_file
+ from megadetector.utils.url_utils import parallel_download_urls
 
  # We'll write images, metadata downloads, and temporary files here
  lila_local_base = os.path.expanduser('~/lila')
@@ -47,7 +48,7 @@ os.makedirs(metadata_dir,exist_ok=True)
 
  output_file = os.path.join(lila_local_base,'lila_image_urls_and_labels.csv')
 
- # Some datasets don't have "sequence_level_annotation" fields populated, but we know their
+ # Some datasets don't have "sequence_level_annotation" fields populated, but we know their
  # annotation level
  ds_name_to_annotation_level = {}
  ds_name_to_annotation_level['Caltech Camera Traps'] = 'image'
@@ -66,6 +67,18 @@ if debug_max_images_per_dataset > 0:
      print('Running in debug mode')
      output_file = output_file.replace('.csv','_debug.csv')
 
+ taxonomy_levels_to_include = \
+     ['kingdom','phylum','subphylum','superclass','class','subclass','infraclass','superorder','order',
+      'suborder','infraorder','superfamily','family','subfamily','tribe','genus','subgenus',
+      'species','subspecies','variety']
+
+ def _clearnan(v):
+     if isinstance(v,float):
+         assert np.isnan(v)
+         v = ''
+     assert isinstance(v,str)
+     return v
+
 
  #%% Download and parse the metadata file
 
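Review note: the clearnan helper was renamed _clearnan and hoisted above the per-dataset cells (it is removed from its old location in a later hunk), since the new JSON-conversion cell at the end of the file also needs it. A minimal standalone sketch of what it does, with hypothetical values:

    import numpy as np

    def _clearnan(v):
        # pandas substitutes float('nan') for empty cells in string columns;
        # normalize those to empty strings so every value is a str
        if isinstance(v, float):
            assert np.isnan(v)
            v = ''
        assert isinstance(v, str)
        return v

    assert _clearnan(float('nan')) == ''
    assert _clearnan('lynx rufus') == 'lynx rufus'
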
@@ -79,14 +92,14 @@ if False:
 
  #%% Download and extract metadata for each dataset
 
- for ds_name in metadata_table.keys():
+ for ds_name in metadata_table.keys():
      metadata_table[ds_name]['metadata_filename'] = read_metadata_file_for_dataset(ds_name=ds_name,
                                                                                    metadata_dir=metadata_dir,
                                                                                    metadata_table=metadata_table)
-
+
  #%% Load taxonomy data
 
- taxonomy_df = read_lila_taxonomy_mapping(metadata_dir)
+ taxonomy_df = read_lila_taxonomy_mapping(metadata_dir, force_download=True)
 
 
  #%% Build a dictionary that maps each [dataset,query] pair to the full taxonomic label set
@@ -95,12 +108,12 @@ ds_label_to_taxonomy = {}
 
  # i_row = 0; row = taxonomy_df.iloc[i_row]
  for i_row,row in taxonomy_df.iterrows():
-
+
      ds_label = row['dataset_name'] + ':' + row['query']
      assert ds_label.strip() == ds_label
      assert ds_label not in ds_label_to_taxonomy
      ds_label_to_taxonomy[ds_label] = row.to_dict()
-
+
 
  #%% Process annotations for each dataset
 
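Review note: ds_label_to_taxonomy keys on '<dataset_name>:<query>' and stores each taxonomy row as a dict. A sketch of the resulting structure, with hypothetical values:

    # Hypothetical entry; real rows come from taxonomy_df
    ds_label_to_taxonomy = {
        'Caltech Camera Traps:bobcat': {
            'dataset_name': 'Caltech Camera Traps',
            'query': 'bobcat',
            'scientific_name': 'lynx rufus',
            'common_name': 'bobcat',
            # ...plus one key per taxonomic level (kingdom, phylum, ..., variety)
        }
    }
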
@@ -112,74 +125,62 @@ header = ['dataset_name','url_gcp','url_aws','url_azure',
            'image_id','sequence_id','location_id','frame_num',
            'original_label','scientific_name','common_name','datetime','annotation_level']
 
- taxonomy_levels_to_include = \
-     ['kingdom','phylum','subphylum','superclass','class','subclass','infraclass','superorder','order',
-      'suborder','infraorder','superfamily','family','subfamily','tribe','genus','species','subspecies',
-      'variety']
-
  header.extend(taxonomy_levels_to_include)
 
  missing_annotations = set()
 
- def clearnan(v):
-     if isinstance(v,float):
-         assert np.isnan(v)
-         v = ''
-     assert isinstance(v,str)
-     return v
-
  with open(output_file,'w',encoding='utf-8',newline='') as f:
-
+
      csv_writer = csv.writer(f)
      csv_writer.writerow(header)
-
+
      # ds_name = list(metadata_table.keys())[0]
      for ds_name in metadata_table.keys():
-
+
          if 'bbox' in ds_name:
              print('Skipping bbox dataset {}'.format(ds_name))
              continue
-
+
          print('Processing dataset {}'.format(ds_name))
-
+
          json_filename = metadata_table[ds_name]['metadata_filename']
          with open(json_filename, 'r') as f:
              data = json.load(f)
-
+
          categories = data['categories']
          category_ids = [c['id'] for c in categories]
          for c in categories:
              category_id_to_name = {c['id']:c['name'] for c in categories}
-
+
          annotations = data['annotations']
          images = data['images']
-
+
          image_id_to_annotations = defaultdict(list)
-
+
          # Go through annotations, marking each image with the categories that are present
          #
          # ann = annotations[0]
-         for ann in annotations:
+         for ann in annotations:
              image_id_to_annotations[ann['image_id']].append(ann)
-
+
          unannotated_images = []
-
+
          found_date = False
          found_location = False
          found_annotation_level = False
-
+
          if ds_name in ds_name_to_annotation_level:
              expected_annotation_level = ds_name_to_annotation_level[ds_name]
          else:
              expected_annotation_level = None
-
+
          # im = images[10]
          for i_image,im in tqdm(enumerate(images),total=len(images)):
-
+
              if (debug_max_images_per_dataset is not None) and (debug_max_images_per_dataset > 0) \
                  and (i_image >= debug_max_images_per_dataset):
                  break
-
+
              file_name = im['file_name'].replace('\\','/')
              base_url_gcp = metadata_table[ds_name]['image_base_url_gcp']
              base_url_aws = metadata_table[ds_name]['image_base_url_aws']
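Review note: taxonomy_levels_to_include moved to the top of the file (see the earlier hunk), and the new copy inserts 'subgenus' between 'genus' and 'species', so the output .csv gains a column. A quick check of the delta, using the two lists exactly as they appear in this diff:

    old_levels = ['kingdom','phylum','subphylum','superclass','class','subclass','infraclass',
                  'superorder','order','suborder','infraorder','superfamily','family','subfamily',
                  'tribe','genus','species','subspecies','variety']
    new_levels = ['kingdom','phylum','subphylum','superclass','class','subclass','infraclass',
                  'superorder','order','suborder','infraorder','superfamily','family','subfamily',
                  'tribe','genus','subgenus','species','subspecies','variety']

    assert set(new_levels) - set(old_levels) == {'subgenus'}
    assert len(new_levels) == len(old_levels) + 1
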
@@ -187,21 +188,21 @@ with open(output_file,'w',encoding='utf-8',newline='') as f:
              assert not base_url_gcp.endswith('/')
              assert not base_url_aws.endswith('/')
              assert not base_url_azure.endswith('/')
-
+
              url_gcp = base_url_gcp + '/' + file_name
              url_aws = base_url_aws + '/' + file_name
              url_azure = base_url_azure + '/' + file_name
-
+
              for k in im.keys():
                  if ('date' in k or 'time' in k) and (k not in ['datetime','date_captured']):
                      raise ValueError('Unrecognized datetime field')
-
+
              # This field name was only used for Caltech Camera Traps
              if 'date_captured' in im:
                  assert ds_name == 'Caltech Camera Traps'
                  im['datetime'] = im['date_captured']
-
-             def has_valid_datetime(im):
+
+             def _has_valid_datetime(im):
                  if 'datetime' not in im:
                      return False
                  v = im['datetime']
@@ -212,29 +213,29 @@ with open(output_file,'w',encoding='utf-8',newline='') as f:
                  else:
                      assert isinstance(v,float) and np.isnan(v)
                      return False
-
-             dt_string = ''
-             if (has_valid_datetime(im)):
-
+
+             dt_string = ''
+             if (_has_valid_datetime(im)):
+
                  dt = dateparser.parse(im['datetime'])
-
+
                  if dt is None or dt.year < 1990 or dt.year > 2025:
-
+
                      # raise ValueError('Suspicious date parsing result')
-
-                     # Special case we don't want to print a warning about... this is
+
+                     # Special case we don't want to print a warning about... this is
                      # an invalid date that very likely originates on the camera, not at
                      # some intermediate processing step.
                      #
                      # print('Suspicious date for image {}: {} ({})'.format(
                      #     im['id'], im['datetime'], ds_name))
-                     pass
-
+                     pass
+
                  else:
-
+
                      found_date = True
                      dt_string = dt.strftime("%m-%d-%Y %H:%M:%S")
-
+
              # Location, sequence, and image IDs are only guaranteed to be unique within
              # a dataset, so for the output .csv file, include both
              if 'location' in im:
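Review note: has_valid_datetime was renamed _has_valid_datetime, and the surrounding logic silently drops dates that fail to parse or fall outside 1990-2025. A standalone sketch of the same validation; this assumes dateparser refers to dateutil.parser (the actual import is outside this diff):

    from dateutil import parser as dateparser  # assumption; the real import is not shown in this diff

    def parse_datetime_or_empty(s, min_year=1990, max_year=2025):
        # Returns a formatted date string, or '' for unparseable/implausible dates
        try:
            dt = dateparser.parse(s)
        except (ValueError, OverflowError):
            return ''
        if dt is None or dt.year < min_year or dt.year > max_year:
            return ''
        return dt.strftime('%m-%d-%Y %H:%M:%S')

    assert parse_datetime_or_empty('2013-06-01 10:30:00') == '06-01-2013 10:30:00'
    assert parse_datetime_or_empty('not a date') == ''
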
@@ -242,25 +243,25 @@ with open(output_file,'w',encoding='utf-8',newline='') as f:
                  location_id = ds_name + ' : ' + str(im['location'])
              else:
                  location_id = ds_name
-
+
              image_id = ds_name + ' : ' + str(im['id'])
-
+
              if 'seq_id' in im:
                  sequence_id = ds_name + ' : ' + str(im['seq_id'])
              else:
                  sequence_id = ds_name + ' : ' + 'unknown'
-
+
              if 'frame_num' in im:
                  frame_num = im['frame_num']
              else:
                  frame_num = -1
-
+
              annotations_this_image = image_id_to_annotations[im['id']]
-
+
              categories_this_image = set()
-
+
              annotation_level = 'unknown'
-
+
              for ann in annotations_this_image:
                  assert ann['image_id'] == im['id']
                  categories_this_image.add(category_id_to_name[ann['category_id']])
@@ -275,35 +276,35 @@ with open(output_file,'w',encoding='utf-8',newline='') as f:
                      'Unexpected annotation level'
              elif expected_annotation_level is not None:
                  annotation_level = expected_annotation_level
-
+
              if len(categories_this_image) == 0:
                  unannotated_images.append(im)
                  continue
-
+
              # category_name = list(categories_this_image)[0]
              for category_name in categories_this_image:
-
+
                  ds_label = ds_name + ':' + category_name.lower()
-
+
                  if ds_label not in ds_label_to_taxonomy:
-
+
                      assert ds_label in known_unmapped_labels
-
+
                      # Only print a warning the first time we see an unmapped label
                      if ds_label not in missing_annotations:
                          print('Warning: {} not in taxonomy file'.format(ds_label))
                          missing_annotations.add(ds_label)
                      continue
-
+
                  taxonomy_labels = ds_label_to_taxonomy[ds_label]
-
+
                  """
-                 header =
+                 header =
                  ['dataset_name','url','image_id','sequence_id','location_id',
                   'frame_num','original_label','scientific_name','common_name',
                   'datetime','annotation_level']
                  """
-
+
                  row = []
                  row.append(ds_name)
                  row.append(url_gcp)
@@ -314,37 +315,37 @@ with open(output_file,'w',encoding='utf-8',newline='') as f:
                  row.append(location_id)
                  row.append(frame_num)
                  row.append(taxonomy_labels['query'])
-                 row.append(clearnan(taxonomy_labels['scientific_name']))
-                 row.append(clearnan(taxonomy_labels['common_name']))
+                 row.append(_clearnan(taxonomy_labels['scientific_name']))
+                 row.append(_clearnan(taxonomy_labels['common_name']))
                  row.append(dt_string)
                  row.append(annotation_level)
-
+
                  for s in taxonomy_levels_to_include:
-                     row.append(clearnan(taxonomy_labels[s]))
-
+                     row.append(_clearnan(taxonomy_labels[s]))
+
                  assert len(row) == len(header)
-
+
                  csv_writer.writerow(row)
-
+
              # ...for each category that was applied at least once to this image
-
+
          # ...for each image in this dataset
-
+
          if not found_date:
              pass
              # print('Warning: no date information available for this dataset')
-
+
          if not found_location:
              pass
              # print('Warning: no location information available for this dataset')
-
+
          if not found_annotation_level and (ds_name not in ds_name_to_annotation_level):
              print('Warning: no annotation level information available for this dataset')
-
+
          if len(unannotated_images) > 0:
              print('Warning: {} of {} images are un-annotated\n'.\
                  format(len(unannotated_images),len(images)))
-
+
      # ...for each dataset
 
  # ...with open()
@@ -360,11 +361,14 @@ print('Read {} rows from {}'.format(len(df),output_file))
 
  #%% Do some post-hoc integrity checking
 
- # Takes ~10 minutes without using apply()
+ # Takes ~5 minutes with apply(), or ~10 minutes without apply()
+ #
+ # Using apply() is faster, but more annoying to debug.
+ use_pandas_apply_for_integrity_checking = True
 
  tqdm.pandas()
 
- def isint(v):
+ def _isint(v):
      return isinstance(v,int) or isinstance(v,np.int64)
 
  valid_annotation_levels = set(['sequence','image','unknown'])
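Review note: the hard-coded "if True:" below becomes a named flag, and the timing comment now quantifies both paths. tqdm.pandas() patches pandas so DataFrame.progress_apply works like apply() with a progress bar. A toy sketch of the two equivalent paths:

    import pandas as pd
    from tqdm import tqdm

    tqdm.pandas()  # adds DataFrame.progress_apply

    df_demo = pd.DataFrame({'frame_num': [0, 1, -1]})  # hypothetical data

    def _check_row_demo(row):
        assert row['frame_num'] >= -1

    # Path 1: apply() (faster, but harder to step through in a debugger)
    df_demo.progress_apply(_check_row_demo, axis=1)

    # Path 2: explicit iteration (slower, easier to debug)
    for _, row in tqdm(df_demo.iterrows(), total=len(df_demo)):
        _check_row_demo(row)
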
@@ -373,8 +377,8 @@ valid_annotation_levels = set(['sequence','image','unknown'])
  # in the next cell to look for datasets that only have a single location
  dataset_name_to_locations = defaultdict(set)
 
- def check_row(row):
-
+ def _check_row(row):
+
      assert row['dataset_name'] in metadata_table.keys()
      for url_column in ['url_gcp','url_aws','url_azure']:
          assert row[url_column].startswith('https://') or row[url_column].startswith('http://')
@@ -387,21 +391,20 @@ def check_row(row):
          assert np.isnan(row['frame_num'])
      else:
          # -1 is sometimes used for sequences of unknown length
-         assert isint(row['frame_num']) and row['frame_num'] >= -1
+         assert _isint(row['frame_num']) and row['frame_num'] >= -1
 
      ds_name = row['dataset_name']
      dataset_name_to_locations[ds_name].add(row['location_id'])
-
- # Faster, but more annoying to debug
- if True:
-
-     df.progress_apply(check_row, axis=1)
+
+ if use_pandas_apply_for_integrity_checking:
+
+     df.progress_apply(_check_row, axis=1)
 
  else:
-
+
      # i_row = 0; row = df.iloc[i_row]
      for i_row,row in tqdm(df.iterrows(),total=len(df)):
-         check_row(row)
+         _check_row(row)
 
 
  #%% Check for datasets that have only one location string (typically "unknown")
@@ -428,31 +431,32 @@ images_to_download = []
 
  # ds_name = list(metadata_table.keys())[2]
  for ds_name in metadata_table.keys():
-
+
      if 'bbox' in ds_name:
          continue
-
+
      # Find all rows for this dataset
      ds_rows = df.loc[df['dataset_name'] == ds_name]
-
+
      print('{} rows available for {}'.format(len(ds_rows),ds_name))
      assert len(ds_rows) > 0
-
+
      empty_rows = ds_rows[ds_rows['scientific_name'].isnull()]
      non_empty_rows = ds_rows[~ds_rows['scientific_name'].isnull()]
-
+
      if len(empty_rows) == 0:
          print('No empty images available for {}'.format(ds_name))
      elif len(empty_rows) > n_empty_images_per_dataset:
          empty_rows = empty_rows.sample(n=n_empty_images_per_dataset)
      images_to_download.extend(empty_rows.to_dict('records'))
 
+     # All LILA datasets have non-empty images
      if len(non_empty_rows) == 0:
-         print('No non-empty images available for {}'.format(ds_name))
+         raise ValueError('No non-empty images available for {}'.format(ds_name))
      elif len(non_empty_rows) > n_non_empty_images_per_dataset:
          non_empty_rows = non_empty_rows.sample(n=n_non_empty_images_per_dataset)
      images_to_download.extend(non_empty_rows.to_dict('records'))
-
+
  # ...for each dataset
 
  print('Selected {} total images'.format(len(images_to_download)))
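Review note: the behavioral change in this hunk is that a dataset with zero non-empty rows is now a hard error instead of a printed warning; the per-dataset caps still use DataFrame.sample. A toy sketch of the capped-sampling pattern, with hypothetical data and cap:

    import pandas as pd

    n_cap = 2  # hypothetical per-dataset cap
    ds_rows = pd.DataFrame(
        {'scientific_name': [None, 'lynx rufus', 'puma concolor', None, 'ursus americanus']})

    non_empty_rows = ds_rows[~ds_rows['scientific_name'].isnull()]

    if len(non_empty_rows) == 0:
        raise ValueError('No non-empty images available')
    elif len(non_empty_rows) > n_cap:
        non_empty_rows = non_empty_rows.sample(n=n_cap)

    assert len(non_empty_rows) == n_cap
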
@@ -462,13 +466,13 @@ print('Selected {} total images'.format(len(images_to_download)))
 
  # Expect a few errors for images with human or vehicle labels (or things like "ignore" that *could* be humans)
 
- preferred_cloud = 'aws'
+ preferred_cloud = 'gcp'
 
  url_to_target_file = {}
 
  # i_image = 10; image = images_to_download[i_image]
  for i_image,image in tqdm(enumerate(images_to_download),total=len(images_to_download)):
-
+
      url = image['url_' + preferred_cloud]
      ext = os.path.splitext(url)[1]
      fn_relative = 'image_{}'.format(str(i_image).zfill(4)) + ext
@@ -476,14 +480,26 @@ for i_image,image in tqdm(enumerate(images_to_download),total=len(images_to_down
      image['relative_file'] = fn_relative
      image['url'] = url
      url_to_target_file[url] = fn_abs
-
+
 
  #%% Download images (execution)
 
- from megadetector.utils.url_utils import parallel_download_urls
  download_results = parallel_download_urls(url_to_target_file,verbose=False,overwrite=True,
                                            n_workers=20,pool_type='thread')
 
+ # 10-20 errors is normal; they should all be images that are labeled as "human"
+ errors = []
+
+ for r in download_results:
+     if r['status'] != 'success':
+         errors.append(r)
+
+ assert len(download_results) == len(url_to_target_file)
+ print('Errors on {} of {} downloads:\n'.format(len(errors),len(download_results)))
+
+ for err in errors:
+     print(err['url'])
+
 
  #%% Write preview HTML
 
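Review note: the new error tally assumes each element of download_results is a dict with at least 'status' and 'url' keys; that shape comes from megadetector.utils.url_utils.parallel_download_urls and is not visible in this diff, so treat it as an assumption. A tiny sketch of the tally against mocked results:

    # Mocked results standing in for parallel_download_urls() output (assumed shape)
    download_results = [
        {'url': 'https://example.com/a.jpg', 'status': 'success'},
        {'url': 'https://example.com/b.jpg', 'status': 'error'},
    ]

    errors = [r for r in download_results if r['status'] != 'success']
    print('Errors on {} of {} downloads'.format(len(errors), len(download_results)))
    for err in errors:
        print(err['url'])
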
@@ -493,10 +509,10 @@ html_images = []
 
  # im = images_to_download[0]
  for im in images_to_download:
-
+
      if im['relative_file'] is None:
          continue
-
+
      output_im = {}
      output_im['filename'] = im['relative_file']
      output_im['linkTarget'] = im['url']
@@ -504,7 +520,7 @@ for im in images_to_download:
      output_im['imageStyle'] = 'width:600px;'
      output_im['textStyle'] = 'font-weight:normal;font-size:100%;'
      html_images.append(output_im)
-
+
  write_html_image_list.write_html_image_list(html_filename,html_images)
 
  open_file(html_filename)
@@ -515,3 +531,245 @@ open_file(html_filename)
  zipped_output_file = zip_file(output_file,verbose=True,overwrite=True)
 
  print('Zipped {} to {}'.format(output_file,zipped_output_file))
+
+
+ #%% Convert to .json
+
+ """
+ The .csv file "output_file" (already loaded into the variable "df" at this point) has the following columns:
+
+ dataset_name,url_gcp,url_aws,url_azure,image_id,sequence_id,location_id,frame_num,original_label,scientific_name,common_name,datetime,annotation_level,kingdom,phylum,subphylum,superclass,class,subclass,infraclass,superorder,order,suborder,infraorder,superfamily,family,subfamily,tribe,genus,subgenus,species,subspecies,variety
+
+ Each row in the .csv represents an image. The URL columns represent the location of that
+ image on three different clouds; for a given image, the value of those columns differs only
+ in the prefix. The columns starting with "kingdom" represent a taxonomic wildlife identifier. Not
+ all rows have values in all of these columns; some rows represent non-wildlife images where all of these
+ columns are blank.
+
+ This cell converts this to a .json dictionary, with the following top-level keys:
+
+ ## datasets (dict)
+
+ A dict mapping integer IDs to strings.
+
+ Each unique value in the "dataset_name" column should become an element in this dict with a unique ID.
+
+ ## sequences (dict)
+
+ A dict mapping integer IDs to strings.
+
+ Each unique value in the "sequence_id" column should become an element in this dict with a unique ID.
+
+ ## locations (dict)
+
+ A dict mapping integer IDs to strings.
+
+ Each unique value in the "location_id" column should become an element in this dict with a unique ID.
+
+ ## base_urls (dict)
+
+ This key should point to the following dict:
+
+ {
+     "gcp": "https://storage.googleapis.com/public-datasets-lila/",
+     "aws": "http://us-west-2.opendata.source.coop.s3.amazonaws.com/agentmorris/lila-wildlife/",
+     "azure": "https://lilawildlife.blob.core.windows.net/lila-wildlife/",
+ }
+
+ All values in the url_gcp, url_aws, and url_azure columns start with these values, respectively.
+
+ ## taxa (dict)
+
+ A dict mapping integer IDs to dicts, where each dict has the fields:
+
+ kingdom,phylum,subphylum,superclass,class,subclass,infraclass,superorder,order,suborder,infraorder,superfamily,family,subfamily,tribe,genus,subgenus,species,subspecies,variety
+
+ The value of each of these fields in each row is either a string or None.
+
+ ## images (list)
+
+ A list of images, where each image is a dict with the following fields:
+
+ ### dataset (int)
+
+ The integer ID corresponding to the dataset_name column for this image
+
+ ### path (str)
+
+ The suffix for this image's URL, which should be the same across the three URL columns.
+
+ ### seq (int)
+
+ The integer ID corresponding to the sequence_id column for this image
+
+ ### loc (int)
+
+ The integer ID corresponding to the location_id column for this image
+
+ ### frame_num
+
+ The value of the frame_num column for this image, unless the original value was -1,
+ in which case this is omitted.
+
+ ### original_label
+
+ The value of the original_label column for this image
+
+ ### common_name
+
+ The value of the common_name column for this image, if not empty
+
+ ### datetime
+
+ The value of the datetime column for this image
+
+ ### ann_level
+
+ The value of the annotation_level column for this image
+
+ ### taxon
+
+ The integer ID corresponding to the taxonomic identifier columns for this image
+
+ --
+
+ The original .csv file is large (~15GB); this may impact the implementation of the .json conversion.
+ Speed of conversion is not a priority.
+ """
+
+ print('Converting to JSON...')
+
+ output_json_file = output_file.replace('.csv', '.json')
+
+ json_data = {}
+
+ # Create mappings for datasets, sequences, and locations
+ dataset_to_id = {}
+ sequence_to_id = {}
+ location_to_id = {}
+ taxa_to_id = {}
+
+ next_dataset_id = 0
+ next_sequence_id = 0
+ next_location_id = 0
+ next_taxa_id = 0
+
+ json_data['datasets'] = {}
+ json_data['sequences'] = {}
+ json_data['locations'] = {}
+ json_data['taxa'] = {}
+
+ json_data['base_urls'] = {
+     "gcp": "https://storage.googleapis.com/public-datasets-lila/",
+     "aws": "http://us-west-2.opendata.source.coop.s3.amazonaws.com/agentmorris/lila-wildlife/",
+     "azure": "https://lilawildlife.blob.core.windows.net/lila-wildlife/",
+ }
+
+ json_data['images'] = []
+
+ debug_max_json_conversion_rows = None
+
+ print('Counting rows in .csv file...')
+
+ # Get total number of lines for progress bar (optional, but helpful for large files)
+ def _count_lines(filename):
+     with open(filename, 'r', encoding='utf-8') as f:
+         return sum(1 for line in f) - 1
+
+ total_rows = _count_lines(output_file)
+ print('Total rows to process: {}'.format(total_rows))
+
+ # Read CSV file line by line
+ with open(output_file, 'r', encoding='utf-8') as csvfile:
+
+     reader = csv.DictReader(csvfile)
+
+     # Process each row
+     for i_row, row in enumerate(tqdm(reader, total=total_rows, desc="Processing rows")):
+
+         if (debug_max_json_conversion_rows is not None) and (i_row >= debug_max_json_conversion_rows):
+             break
+
+         # Datasets
+         dataset_name = row['dataset_name']
+         if dataset_name not in dataset_to_id:
+             dataset_to_id[dataset_name] = next_dataset_id
+             json_data['datasets'][str(next_dataset_id)] = dataset_name
+             next_dataset_id += 1
+         dataset_id = dataset_to_id[dataset_name]
+
+         # Sequences
+         sequence_id_str = row['sequence_id']
+         assert sequence_id_str.startswith(dataset_name + ' : ')
+         if sequence_id_str not in sequence_to_id:
+             sequence_to_id[sequence_id_str] = next_sequence_id
+             json_data['sequences'][str(next_sequence_id)] = sequence_id_str
+             next_sequence_id += 1
+         sequence_id = sequence_to_id[sequence_id_str]
+
+         # Locations
+         location_id_str = row['location_id']
+         assert location_id_str.startswith(dataset_name)  # + ' : ')
+         if location_id_str not in location_to_id:
+             location_to_id[location_id_str] = next_location_id
+             json_data['locations'][str(next_location_id)] = location_id_str
+             next_location_id += 1
+         location_id = location_to_id[location_id_str]
+
+         # Taxa
+         taxa_data = {level: _clearnan(row[level]) for level in taxonomy_levels_to_include}
+         taxa_tuple = tuple(taxa_data.items())  # use a tuple as a hashable key
+         if taxa_tuple not in taxa_to_id:
+             taxa_to_id[taxa_tuple] = next_taxa_id
+             json_data['taxa'][str(next_taxa_id)] = taxa_data
+             next_taxa_id += 1
+         taxa_id = taxa_to_id[taxa_tuple]
+
+         # Image path
+         url_gcp = row['url_gcp']
+         assert url_gcp.startswith(json_data['base_urls']['gcp'])
+         path = url_gcp.replace(json_data['base_urls']['gcp'], '')
+
+         common_name = _clearnan(row['common_name'])
+
+         frame_num = int(row['frame_num'])
+
+         # Image data
+         image_entry = {
+             'dataset': dataset_id,
+             'path': path,
+             'seq': sequence_id,
+             'loc': location_id,
+             'ann_level': row['annotation_level'],
+             'original_label': row['original_label'],
+             'datetime': row['datetime'],
+             'taxon': taxa_id
+         }
+
+         if frame_num >= 0:
+             image_entry['frame_num'] = frame_num
+
+         if len(common_name) > 0:
+             image_entry['common_name'] = common_name
+
+         json_data['images'].append(image_entry)
+
+     # ...for each line
+
+ # ...with open(...)
+
+ # Save the JSON data
+ print('Saving JSON file...')
+ with open(output_json_file, 'w', encoding='utf-8') as f:
+     json.dump(json_data, f, indent=1)
+
+ print(f'Converted to JSON and saved to {output_json_file}')
+ print(f'JSON file size: {os.path.getsize(output_json_file)/(1024*1024*1024):.2f} GB')
+
+ # Print summary statistics
+ print(f'Total datasets: {len(json_data["datasets"])}')
+ print(f'Total sequences: {len(json_data["sequences"])}')
+ print(f'Total locations: {len(json_data["locations"])}')
+ print(f'Total taxa: {len(json_data["taxa"])}')
+ print(f'Total images: {len(json_data["images"])}')
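Review note: the compact .json layout trades readability for size: each image record stores small integer IDs into the datasets/sequences/locations/taxa tables and a single path suffix instead of three full URLs. A sketch of rehydrating one record, assuming the structure documented in the docstring above (the filename here is hypothetical):

    import json

    with open('lila_image_urls_and_labels.json', 'r', encoding='utf-8') as f:
        data = json.load(f)

    im = data['images'][0]
    dataset_name = data['datasets'][str(im['dataset'])]
    location = data['locations'][str(im['loc'])]
    taxon = data['taxa'][str(im['taxon'])]
    url_gcp = data['base_urls']['gcp'] + im['path']  # the same suffix works for the aws/azure bases

    print(dataset_name, location, taxon.get('species'), url_gcp)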