megadetector 5.0.7__py3-none-any.whl → 5.0.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of megadetector was flagged by the registry diff service.
Files changed (191)
  1. api/__init__.py +0 -0
  2. api/batch_processing/__init__.py +0 -0
  3. api/batch_processing/api_core/__init__.py +0 -0
  4. api/batch_processing/api_core/batch_service/__init__.py +0 -0
  5. api/batch_processing/api_core/batch_service/score.py +0 -1
  6. api/batch_processing/api_core/server_job_status_table.py +0 -1
  7. api/batch_processing/api_core_support/__init__.py +0 -0
  8. api/batch_processing/api_core_support/aggregate_results_manually.py +0 -1
  9. api/batch_processing/api_support/__init__.py +0 -0
  10. api/batch_processing/api_support/summarize_daily_activity.py +0 -1
  11. api/batch_processing/data_preparation/__init__.py +0 -0
  12. api/batch_processing/data_preparation/manage_local_batch.py +93 -79
  13. api/batch_processing/data_preparation/manage_video_batch.py +8 -8
  14. api/batch_processing/integration/digiKam/xmp_integration.py +0 -1
  15. api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +0 -1
  16. api/batch_processing/postprocessing/__init__.py +0 -0
  17. api/batch_processing/postprocessing/add_max_conf.py +12 -12
  18. api/batch_processing/postprocessing/categorize_detections_by_size.py +32 -14
  19. api/batch_processing/postprocessing/combine_api_outputs.py +69 -55
  20. api/batch_processing/postprocessing/compare_batch_results.py +114 -44
  21. api/batch_processing/postprocessing/convert_output_format.py +62 -19
  22. api/batch_processing/postprocessing/load_api_results.py +17 -20
  23. api/batch_processing/postprocessing/md_to_coco.py +31 -21
  24. api/batch_processing/postprocessing/md_to_labelme.py +165 -68
  25. api/batch_processing/postprocessing/merge_detections.py +40 -15
  26. api/batch_processing/postprocessing/postprocess_batch_results.py +270 -186
  27. api/batch_processing/postprocessing/remap_detection_categories.py +170 -0
  28. api/batch_processing/postprocessing/render_detection_confusion_matrix.py +75 -39
  29. api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +53 -44
  30. api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +25 -14
  31. api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +244 -160
  32. api/batch_processing/postprocessing/separate_detections_into_folders.py +159 -114
  33. api/batch_processing/postprocessing/subset_json_detector_output.py +146 -169
  34. api/batch_processing/postprocessing/top_folders_to_bottom.py +77 -43
  35. api/synchronous/__init__.py +0 -0
  36. api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
  37. api/synchronous/api_core/animal_detection_api/api_backend.py +0 -2
  38. api/synchronous/api_core/animal_detection_api/api_frontend.py +266 -268
  39. api/synchronous/api_core/animal_detection_api/config.py +35 -35
  40. api/synchronous/api_core/tests/__init__.py +0 -0
  41. api/synchronous/api_core/tests/load_test.py +109 -109
  42. classification/__init__.py +0 -0
  43. classification/aggregate_classifier_probs.py +21 -24
  44. classification/analyze_failed_images.py +11 -13
  45. classification/cache_batchapi_outputs.py +51 -51
  46. classification/create_classification_dataset.py +69 -68
  47. classification/crop_detections.py +54 -53
  48. classification/csv_to_json.py +97 -100
  49. classification/detect_and_crop.py +105 -105
  50. classification/evaluate_model.py +43 -42
  51. classification/identify_mislabeled_candidates.py +47 -46
  52. classification/json_to_azcopy_list.py +10 -10
  53. classification/json_validator.py +72 -71
  54. classification/map_classification_categories.py +44 -43
  55. classification/merge_classification_detection_output.py +68 -68
  56. classification/prepare_classification_script.py +157 -154
  57. classification/prepare_classification_script_mc.py +228 -228
  58. classification/run_classifier.py +27 -26
  59. classification/save_mislabeled.py +30 -30
  60. classification/train_classifier.py +20 -20
  61. classification/train_classifier_tf.py +21 -22
  62. classification/train_utils.py +10 -10
  63. data_management/__init__.py +0 -0
  64. data_management/annotations/__init__.py +0 -0
  65. data_management/annotations/annotation_constants.py +18 -31
  66. data_management/camtrap_dp_to_coco.py +238 -0
  67. data_management/cct_json_utils.py +107 -59
  68. data_management/cct_to_md.py +176 -158
  69. data_management/cct_to_wi.py +247 -219
  70. data_management/coco_to_labelme.py +272 -0
  71. data_management/coco_to_yolo.py +86 -62
  72. data_management/databases/__init__.py +0 -0
  73. data_management/databases/add_width_and_height_to_db.py +20 -16
  74. data_management/databases/combine_coco_camera_traps_files.py +35 -31
  75. data_management/databases/integrity_check_json_db.py +130 -83
  76. data_management/databases/subset_json_db.py +25 -16
  77. data_management/generate_crops_from_cct.py +27 -45
  78. data_management/get_image_sizes.py +188 -144
  79. data_management/importers/add_nacti_sizes.py +8 -8
  80. data_management/importers/add_timestamps_to_icct.py +78 -78
  81. data_management/importers/animl_results_to_md_results.py +158 -160
  82. data_management/importers/auckland_doc_test_to_json.py +9 -9
  83. data_management/importers/auckland_doc_to_json.py +8 -8
  84. data_management/importers/awc_to_json.py +7 -7
  85. data_management/importers/bellevue_to_json.py +15 -15
  86. data_management/importers/cacophony-thermal-importer.py +13 -13
  87. data_management/importers/carrizo_shrubfree_2018.py +8 -8
  88. data_management/importers/carrizo_trail_cam_2017.py +8 -8
  89. data_management/importers/cct_field_adjustments.py +9 -9
  90. data_management/importers/channel_islands_to_cct.py +10 -10
  91. data_management/importers/eMammal/copy_and_unzip_emammal.py +1 -0
  92. data_management/importers/ena24_to_json.py +7 -7
  93. data_management/importers/filenames_to_json.py +8 -8
  94. data_management/importers/helena_to_cct.py +7 -7
  95. data_management/importers/idaho-camera-traps.py +7 -7
  96. data_management/importers/idfg_iwildcam_lila_prep.py +10 -10
  97. data_management/importers/jb_csv_to_json.py +9 -9
  98. data_management/importers/mcgill_to_json.py +8 -8
  99. data_management/importers/missouri_to_json.py +18 -18
  100. data_management/importers/nacti_fieldname_adjustments.py +10 -10
  101. data_management/importers/noaa_seals_2019.py +8 -8
  102. data_management/importers/pc_to_json.py +7 -7
  103. data_management/importers/plot_wni_giraffes.py +7 -7
  104. data_management/importers/prepare-noaa-fish-data-for-lila.py +359 -359
  105. data_management/importers/prepare_zsl_imerit.py +7 -7
  106. data_management/importers/rspb_to_json.py +8 -8
  107. data_management/importers/save_the_elephants_survey_A.py +8 -8
  108. data_management/importers/save_the_elephants_survey_B.py +9 -9
  109. data_management/importers/snapshot_safari_importer.py +26 -26
  110. data_management/importers/snapshot_safari_importer_reprise.py +665 -665
  111. data_management/importers/snapshot_serengeti_lila.py +14 -14
  112. data_management/importers/sulross_get_exif.py +8 -9
  113. data_management/importers/timelapse_csv_set_to_json.py +11 -11
  114. data_management/importers/ubc_to_json.py +13 -13
  115. data_management/importers/umn_to_json.py +7 -7
  116. data_management/importers/wellington_to_json.py +8 -8
  117. data_management/importers/wi_to_json.py +9 -9
  118. data_management/importers/zamba_results_to_md_results.py +181 -181
  119. data_management/labelme_to_coco.py +309 -159
  120. data_management/labelme_to_yolo.py +103 -60
  121. data_management/lila/__init__.py +0 -0
  122. data_management/lila/add_locations_to_island_camera_traps.py +9 -9
  123. data_management/lila/add_locations_to_nacti.py +147 -147
  124. data_management/lila/create_lila_blank_set.py +114 -31
  125. data_management/lila/create_lila_test_set.py +8 -8
  126. data_management/lila/create_links_to_md_results_files.py +106 -106
  127. data_management/lila/download_lila_subset.py +92 -90
  128. data_management/lila/generate_lila_per_image_labels.py +56 -43
  129. data_management/lila/get_lila_annotation_counts.py +18 -15
  130. data_management/lila/get_lila_image_counts.py +11 -11
  131. data_management/lila/lila_common.py +103 -70
  132. data_management/lila/test_lila_metadata_urls.py +132 -116
  133. data_management/ocr_tools.py +173 -128
  134. data_management/read_exif.py +161 -99
  135. data_management/remap_coco_categories.py +84 -0
  136. data_management/remove_exif.py +58 -62
  137. data_management/resize_coco_dataset.py +32 -44
  138. data_management/wi_download_csv_to_coco.py +246 -0
  139. data_management/yolo_output_to_md_output.py +86 -73
  140. data_management/yolo_to_coco.py +535 -95
  141. detection/__init__.py +0 -0
  142. detection/detector_training/__init__.py +0 -0
  143. detection/process_video.py +85 -33
  144. detection/pytorch_detector.py +43 -25
  145. detection/run_detector.py +157 -72
  146. detection/run_detector_batch.py +189 -114
  147. detection/run_inference_with_yolov5_val.py +118 -51
  148. detection/run_tiled_inference.py +113 -42
  149. detection/tf_detector.py +51 -28
  150. detection/video_utils.py +606 -521
  151. docs/source/conf.py +43 -0
  152. md_utils/__init__.py +0 -0
  153. md_utils/azure_utils.py +9 -9
  154. md_utils/ct_utils.py +249 -70
  155. md_utils/directory_listing.py +59 -64
  156. md_utils/md_tests.py +968 -862
  157. md_utils/path_utils.py +655 -155
  158. md_utils/process_utils.py +157 -133
  159. md_utils/sas_blob_utils.py +20 -20
  160. md_utils/split_locations_into_train_val.py +45 -32
  161. md_utils/string_utils.py +33 -10
  162. md_utils/url_utils.py +208 -27
  163. md_utils/write_html_image_list.py +51 -35
  164. md_visualization/__init__.py +0 -0
  165. md_visualization/plot_utils.py +102 -109
  166. md_visualization/render_images_with_thumbnails.py +34 -34
  167. md_visualization/visualization_utils.py +908 -311
  168. md_visualization/visualize_db.py +109 -58
  169. md_visualization/visualize_detector_output.py +61 -42
  170. {megadetector-5.0.7.dist-info → megadetector-5.0.9.dist-info}/METADATA +21 -17
  171. megadetector-5.0.9.dist-info/RECORD +224 -0
  172. {megadetector-5.0.7.dist-info → megadetector-5.0.9.dist-info}/WHEEL +1 -1
  173. {megadetector-5.0.7.dist-info → megadetector-5.0.9.dist-info}/top_level.txt +1 -0
  174. taxonomy_mapping/__init__.py +0 -0
  175. taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +342 -335
  176. taxonomy_mapping/map_new_lila_datasets.py +154 -154
  177. taxonomy_mapping/prepare_lila_taxonomy_release.py +142 -134
  178. taxonomy_mapping/preview_lila_taxonomy.py +591 -591
  179. taxonomy_mapping/retrieve_sample_image.py +12 -12
  180. taxonomy_mapping/simple_image_download.py +11 -11
  181. taxonomy_mapping/species_lookup.py +10 -10
  182. taxonomy_mapping/taxonomy_csv_checker.py +18 -18
  183. taxonomy_mapping/taxonomy_graph.py +47 -47
  184. taxonomy_mapping/validate_lila_category_mappings.py +83 -76
  185. data_management/cct_json_to_filename_json.py +0 -89
  186. data_management/cct_to_csv.py +0 -140
  187. data_management/databases/remove_corrupted_images_from_db.py +0 -191
  188. detection/detector_training/copy_checkpoints.py +0 -43
  189. md_visualization/visualize_megadb.py +0 -183
  190. megadetector-5.0.7.dist-info/RECORD +0 -202
  191. {megadetector-5.0.7.dist-info → megadetector-5.0.9.dist-info}/LICENSE +0 -0
data_management/lila/lila_common.py
@@ -1,10 +1,10 @@
-########
-#
-# lila_common.py
-#
-# Common constants and functions related to LILA data management/retrieval.
-#
-########
+"""
+
+lila_common.py
+
+Common constants and functions related to LILA data management/retrieval.
+
+"""
 
 #%% Imports and constants
 
@@ -12,12 +12,12 @@ import os
 import json
 import zipfile
 import pandas as pd
-import numpy as np
 
 from urllib.parse import urlparse
 
 from md_utils.url_utils import download_url
 from md_utils.path_utils import unzip_file
+from md_utils.ct_utils import is_empty
 
 # LILA camera trap primary metadata file
 lila_metadata_url = 'http://lila.science/wp-content/uploads/2023/06/lila_camera_trap_datasets.csv'
@@ -31,9 +31,21 @@ wildlife_insights_taxonomy_local_json_filename = 'wi_taxonomy.json'
 wildlife_insights_taxonomy_local_csv_filename = \
     wildlife_insights_taxonomy_local_json_filename.replace('.json','.csv')
 
-lila_azure_storage_account = 'https://lilablobssc.blob.core.windows.net'
-gcp_bucket_api_url = 'https://storage.googleapis.com/public-datasets-lila'
-gcp_bucket_gs_url = 'gs://public-datasets-lila'
+# Filenames are consistent across clouds relative to these URLs
+lila_base_urls = {
+    'azure':'https://lilawildlife.blob.core.windows.net/lila-wildlife/',
+    'gcp':'https://storage.googleapis.com/public-datasets-lila/',
+    'aws':'http://us-west-2.opendata.source.coop.s3.amazonaws.com/agentmorris/lila-wildlife/'
+}
+
+lila_cloud_urls = {
+    'azure':'https://lilawildlife.blob.core.windows.net/lila-wildlife/',
+    'gcp':'gs://public-datasets-lila/',
+    'aws':'s3://us-west-2.opendata.source.coop/agentmorris/lila-wildlife/'
+}
+
+for url in lila_base_urls.values():
+    assert url.endswith('/')
 
 
 #%% Common functions
@@ -42,7 +54,11 @@ def read_wildlife_insights_taxonomy_mapping(metadata_dir):
     """
     Reads the WI taxonomy mapping file, downloading the .json data (and writing to .csv) if necessary.
 
-    Returns a Pandas dataframe.
+    Args:
+        metadata_dir (str): folder to use for temporary LILA metadata files
+
+    Returns:
+        pd.dataframe: A DataFrame with taxonomy information
     """
 
     wi_taxonomy_csv_path = os.path.join(metadata_dir,wildlife_insights_taxonomy_local_csv_filename)
@@ -81,7 +97,11 @@ def read_lila_taxonomy_mapping(metadata_dir):
     """
     Reads the LILA taxonomy mapping file, downloading the .csv file if necessary.
 
-    Returns a Pandas dataframe, with one row per identification.
+    Args:
+        metadata_dir (str): folder to use for temporary LILA metadata files
+
+    Returns:
+        pd.DataFrame: a DataFrame with one row per identification
     """
 
     p = urlparse(lila_taxonomy_mapping_url)
@@ -93,24 +113,38 @@
     return df
 
 
-def is_empty(v):
-    if v is None:
-        return True
-    if isinstance(v,str) and v == '':
-        return True
-    if isinstance(v,float) and np.isnan(v):
-        return True
-    return False
-
-
 def read_lila_metadata(metadata_dir):
     """
-    Reads LILA metadata (URLs to each dataset), downloading the txt file if necessary.
+    Reads LILA metadata (URLs to each dataset), downloading the .csv file if necessary.
 
-    Returns a dict mapping dataset names (e.g. "Caltech Camera Traps") to dicts
-    with keys corresponding to the headers in the .csv file, currently:
+    Args:
+        metadata_dir (str): folder to use for temporary LILA metadata files
+
+    Returns:
+        dict: a dict mapping dataset names (e.g. "Caltech Camera Traps") to dicts
+        with keys corresponding to the headers in the .csv file, currently:
 
-    name,image_base_url,metadata_url,bbox_url,continent,country,region
+        - name
+        - short_name
+        - continent
+        - country
+        - region
+        - image_base_url_relative
+        - metadata_url_relative
+        - bbox_url_relative
+        - image_base_url_gcp
+        - metadata_url_gcp
+        - bbox_url_gcp
+        - image_base_url_aws
+        - metadata_url_aws
+        - bbox_url_aws
+        - image_base_url_azure
+        - metadata_url_azure
+        - box_url_azure
+        - mdv4_results_raw
+        - mdv5b_results_raw
+        - md_results_with_rde
+        - json_filename
     """
 
     # Put the master metadata file in the same folder where we're putting images
@@ -144,6 +178,12 @@ def read_lila_all_images_file(metadata_dir):
     """
     Downloads if necessary - then unzips if necessary - the .csv file with label mappings for
     all LILA files, and opens the resulting .csv file as a Pandas DataFrame.
+
+    Args:
+        metadata_dir (str): folder to use for temporary LILA metadata files
+
+    Returns:
+        pd.DataFrame: a DataFrame containing one row per identification in a LILA camera trap image
     """
 
     p = urlparse(lila_all_images_url)
@@ -165,18 +205,37 @@
     return df
 
 
-def read_metadata_file_for_dataset(ds_name,metadata_dir,metadata_table=None,json_url=None):
+def read_metadata_file_for_dataset(ds_name,
+                                   metadata_dir,
+                                   metadata_table=None,
+                                   json_url=None,
+                                   preferred_cloud='gcp'):
     """
     Downloads if necessary - then unzips if necessary - the .json file for a specific dataset.
-    Returns the .json filename on the local disk.
+
+    Args:
+        ds_name (str): the name of the dataset for which you want to retrieve metadata (e.g.
+            "Caltech Camera Traps")
+        metadata_dir (str): folder to use for temporary LILA metadata files
+        metadata_table (dict, optional): an optional dictionary already loaded via
+            read_lila_metadata()
+        json_url (str, optional): the URL of the metadata file, if None will be retrieved
+            via read_lila_metadata()
+        preferred_cloud (str, optional): 'gcp' (default), 'azure', or 'aws'
+
+    Returns:
+        str: the .json filename on the local disk
+
     """
 
+    assert preferred_cloud in lila_base_urls.keys()
+
     if json_url is None:
 
         if metadata_table is None:
            metadata_table = read_lila_metadata(metadata_dir)
 
-        json_url = metadata_table[ds_name]['metadata_url']
+        json_url = metadata_table[ds_name]['metadata_url_' + preferred_cloud]
 
     p = urlparse(json_url)
     json_filename = os.path.join(metadata_dir,os.path.basename(p.path))
@@ -198,28 +257,6 @@ def read_metadata_file_for_dataset(ds_name,metadata_dir,metadata_table=None,json
     return json_filename
 
 
-def azure_url_to_gcp_http_url(url,error_if_not_azure_url=True):
-    """
-    Most URLs point to Azure by default, but most files are available on both Azure and GCP.
-    This function converts an Azure URL to the corresponding GCP http:// url.
-    """
-
-    if error_if_not_azure_url:
-        assert url.startswith(lila_azure_storage_account)
-    gcp_url = url.replace(lila_azure_storage_account,gcp_bucket_api_url,1)
-    return gcp_url
-
-
-def azure_url_to_gcp_gs_url(url,error_if_not_azure_url=True):
-    """
-    Most URLs point to Azure by default, but most files are available on both Azure and GCP.
-    This function converts an Azure URL to the corresponding GCP gs:// url.
-    """
-
-    return azure_url_to_gcp_http_url(url,error_if_not_azure_url).\
-        replace(gcp_bucket_api_url,gcp_bucket_gs_url,1)
-
-
 #%% Interactive test driver
 
 if False:
@@ -233,7 +270,8 @@
 
     from md_utils import url_utils
 
-    status_codes = url_utils.test_urls(urls)
+    status_codes = url_utils.test_urls(urls,timeout=2.0)
+    assert all([code == 200 for code in status_codes])
 
 
     #%% Verify that the metadata URLs exist for individual datasets
@@ -243,25 +281,20 @@
    dataset_metadata = read_lila_metadata(metadata_dir)
 
     urls_to_test = []
+
     # ds_name = next(iter(dataset_metadata.keys()))
     for ds_name in dataset_metadata.keys():
 
         ds_info = dataset_metadata[ds_name]
-        urls_to_test.append(ds_info['metadata_url'])
-        if ds_info['bbox_url'] != None:
-            urls_to_test.append(ds_info['bbox_url'])
+        for cloud_name in lila_base_urls.keys():
+            urls_to_test.append(ds_info['metadata_url_' + cloud_name])
+            if ds_info['bbox_url_relative'] != None:
+                urls_to_test.append(ds_info['bbox_url_' + cloud_name])
 
-    status_codes = url_utils.test_urls(urls_to_test)
-
-
-    #%% Verify that the GCP versions of all metadata files exist
-
-    gcp_urls = []
-
-    # url = urls_to_test[0]
-    for url in urls_to_test:
-        assert url.startswith(lila_azure_storage_account)
-        gcp_url = url.replace(lila_azure_storage_account,gcp_bucket_api_url,1)
-        gcp_urls.append(gcp_url)
-
-    status_codes = url_utils.test_urls(gcp_urls)
+    status_codes = url_utils.test_urls(urls_to_test,
+                                       error_on_failure=True,
+                                       n_workers=10,
+                                       pool_type='process',
+                                       timeout=2.0)
+    assert all([code == 200 for code in status_codes])
+
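As an aside, here is a minimal sketch (not part of the diff) of how the new per-cloud constants compose into full URLs: filenames are consistent across clouds relative to lila_base_urls, so the same relative path resolves against any of the three providers. The helper name and the relative path below are hypothetical, for illustration only; real relative paths come from the *_url_relative fields returned by read_lila_metadata().

    from data_management.lila.lila_common import lila_base_urls

    def relative_path_to_url(relative_path, cloud='gcp'):
        # Every entry in lila_base_urls ends with '/' (asserted at module load),
        # so plain concatenation yields a well-formed URL
        assert cloud in lila_base_urls
        return lila_base_urls[cloud] + relative_path.lstrip('/')

    # Hypothetical relative path, for illustration only
    print(relative_path_to_url('example-dataset/images/0001.jpg', cloud='aws'))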
data_management/lila/test_lila_metadata_urls.py
@@ -1,116 +1,132 @@
-########
-#
-# test_lila_metadata_urls.py
-#
-# Test that all the metadata URLs for LILA camera trap datasets are valid, and
-# test that at least one image within each URL is valid, including MegaDetector results
-# files.
-#
-########
-
-#%% Constants and imports
-
-import json
-import os
-
-from data_management.lila.lila_common import read_lila_metadata,\
-    read_metadata_file_for_dataset, read_lila_taxonomy_mapping
-
-# We'll write images, metadata downloads, and temporary files here
-lila_local_base = os.path.expanduser('~/lila')
-
-output_dir = os.path.join(lila_local_base,'lila_metadata_tests')
-os.makedirs(output_dir,exist_ok=True)
-
-metadata_dir = os.path.join(lila_local_base,'metadata')
-os.makedirs(metadata_dir,exist_ok=True)
-
-md_results_dir = os.path.join(lila_local_base,'md_results')
-os.makedirs(md_results_dir,exist_ok=True)
-
-md_results_keys = ['mdv4_results_raw','mdv5a_results_raw','mdv5b_results_raw','md_results_with_rde']
-
-
-#%% Load category and taxonomy files
-
-taxonomy_df = read_lila_taxonomy_mapping(metadata_dir)
-
-
-#%% Download and parse the metadata file
-
-metadata_table = read_lila_metadata(metadata_dir)
-
-print('Loaded metadata URLs for {} datasets'.format(len(metadata_table)))
-
-
-#%% Download and extract metadata and MD results for each dataset
-
-for ds_name in metadata_table.keys():
-
-    metadata_table[ds_name]['json_filename'] = read_metadata_file_for_dataset(ds_name=ds_name,
-                                                                              metadata_dir=metadata_dir,
-                                                                              metadata_table=metadata_table)
-    for k in md_results_keys:
-        md_results_url = metadata_table[ds_name][k]
-        if md_results_url is None:
-            metadata_table[ds_name][k + '_filename'] = None
-        else:
-            metadata_table[ds_name][k + '_filename'] = read_metadata_file_for_dataset(ds_name=ds_name,
-                                                                                      metadata_dir=md_results_dir,
-                                                                                      json_url=md_results_url)
-
-
-#%% Build up a list of URLs to test
-
-url_to_source = {}
-
-# The first image in a dataset is disproportionately likely to be human (and thus 404)
-image_index = 1000
-
-# ds_name = list(metadata_table.keys())[0]
-for ds_name in metadata_table.keys():
-
-    if 'bbox' in ds_name:
-        print('Skipping bbox dataset {}'.format(ds_name))
-        continue
-
-    print('Processing dataset {}'.format(ds_name))
-
-    json_filename = metadata_table[ds_name]['json_filename']
-    with open(json_filename, 'r') as f:
-        data = json.load(f)
-
-    image_base_url = metadata_table[ds_name]['image_base_url']
-    assert not image_base_url.endswith('/')
-    # Download a test image
-    test_image_relative_path = data['images'][image_index]['file_name']
-    test_image_url = image_base_url + '/' + test_image_relative_path
-
-    url_to_source[test_image_url] = ds_name + ' metadata'
-
-    # k = md_results_keys[2]
-    for k in md_results_keys:
-        k_fn = k + '_filename'
-        if metadata_table[ds_name][k_fn] is not None:
-            with open(metadata_table[ds_name][k_fn],'r') as f:
-                md_results = json.load(f)
-            im = md_results['images'][image_index]
-            md_image_url = image_base_url + '/' + im['file']
-            url_to_source[md_image_url] = ds_name + ' ' + k
-
-# ...for each dataset
-
-
-#%% Test URLs
-
-from md_utils.url_utils import test_urls
-
-urls_to_test = sorted(url_to_source.keys())
-urls_to_test = [fn.replace('\\','/') for fn in urls_to_test]
-
-status_codes = test_urls(urls_to_test,error_on_failure=False)
-
-for i_url,url in enumerate(urls_to_test):
-    if status_codes[i_url] != 200:
-        print('Status {} for {} ({})'.format(
-            status_codes[i_url],url,url_to_source[url]))
+"""
+
+test_lila_metadata_urls.py
+
+Test that all the metadata URLs for LILA camera trap datasets are valid, including MegaDetector
+results files.
+
+Also pick an arbitrary image from each dataset and make sure that URL is valid.
+
+Also picks an arbitrary image from each dataset's MD results and make sure the corresponding URL is valid.
+
+"""
+
+#%% Constants and imports
+
+import json
+import os
+
+from data_management.lila.lila_common import read_lila_metadata,\
+    read_metadata_file_for_dataset, read_lila_taxonomy_mapping
+
+# We'll write images, metadata downloads, and temporary files here
+lila_local_base = os.path.expanduser('~/lila')
+
+output_dir = os.path.join(lila_local_base,'lila_metadata_tests')
+os.makedirs(output_dir,exist_ok=True)
+
+metadata_dir = os.path.join(lila_local_base,'metadata')
+os.makedirs(metadata_dir,exist_ok=True)
+
+md_results_dir = os.path.join(lila_local_base,'md_results')
+os.makedirs(md_results_dir,exist_ok=True)
+
+md_results_keys = ['mdv4_results_raw','mdv5a_results_raw','mdv5b_results_raw','md_results_with_rde']
+
+preferred_cloud = 'gcp' # 'azure', 'aws'
+
+
+#%% Load category and taxonomy files
+
+taxonomy_df = read_lila_taxonomy_mapping(metadata_dir)
+
+
+#%% Download and parse the metadata file
+
+metadata_table = read_lila_metadata(metadata_dir)
+
+print('Loaded metadata URLs for {} datasets'.format(len(metadata_table)))
+
+
+#%% Download and extract metadata and MD results for each dataset
+
+for ds_name in metadata_table.keys():
+
+    metadata_table[ds_name]['json_filename'] = read_metadata_file_for_dataset(ds_name=ds_name,
+                                                                              metadata_dir=metadata_dir,
+                                                                              metadata_table=metadata_table)
+    for k in md_results_keys:
+        md_results_url = metadata_table[ds_name][k]
+        if md_results_url is None:
+            metadata_table[ds_name][k + '_filename'] = None
+        else:
+            metadata_table[ds_name][k + '_filename'] = read_metadata_file_for_dataset(ds_name=ds_name,
+                                                                                      metadata_dir=md_results_dir,
+                                                                                      json_url=md_results_url)
+
+
+#%% Build up a list of URLs to test
+
+# Takes ~15 mins, since it has to open all the giant .json files
+
+url_to_source = {}
+
+# The first image in a dataset is disproportionately likely to be human (and thus 404),
+# so we pick a semi-arbitrary image that isn't the first. How about the 1000th?
+image_index = 1000
+
+# ds_name = list(metadata_table.keys())[0]
+for ds_name in metadata_table.keys():
+
+    if 'bbox' in ds_name:
+        print('Skipping bbox dataset {}'.format(ds_name))
+        continue
+
+    print('Processing dataset {}'.format(ds_name))
+
+    json_filename = metadata_table[ds_name]['json_filename']
+    with open(json_filename, 'r') as f:
+        data = json.load(f)
+
+    image_base_url = metadata_table[ds_name]['image_base_url_' + preferred_cloud]
+    assert not image_base_url.endswith('/')
+    # Download a test image
+    test_image_relative_path = data['images'][image_index]['file_name']
+    test_image_url = image_base_url + '/' + test_image_relative_path
+
+    url_to_source[test_image_url] = ds_name + ' metadata'
+
+    # Grab an image from the MegaDetector results
+
+    # k = md_results_keys[2]
+    for k in md_results_keys:
+        k_fn = k + '_filename'
+        if metadata_table[ds_name][k_fn] is not None:
+            with open(metadata_table[ds_name][k_fn],'r') as f:
+                md_results = json.load(f)
+            im = md_results['images'][image_index]
+            md_image_url = image_base_url + '/' + im['file']
+            url_to_source[md_image_url] = ds_name + ' ' + k
+            del md_results
+    del data
+
+# ...for each dataset
+
+
+#%% Test URLs
+
+from md_utils.url_utils import test_urls
+
+urls_to_test = sorted(url_to_source.keys())
+urls_to_test = [fn.replace('\\','/') for fn in urls_to_test]
+
+status_codes = test_urls(urls_to_test,
+                         error_on_failure=False,
+                         pool_type='thread',
+                         n_workers=10,
+                         timeout=2.0)
+
+for i_url,url in enumerate(urls_to_test):
+    if status_codes[i_url] != 200:
+        print('Status {} for {} ({})'.format(
+            status_codes[i_url],url,url_to_source[url]))
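For completeness, a minimal usage sketch of the revised read_metadata_file_for_dataset(), assuming the package layout shown above is importable; 'Caltech Camera Traps' is the example dataset name used in the docstrings, and preferred_cloud selects among the mirrors defined in lila_base_urls.

    import os
    from data_management.lila.lila_common import read_metadata_file_for_dataset

    metadata_dir = os.path.expanduser('~/lila/metadata')
    os.makedirs(metadata_dir, exist_ok=True)

    # Fetch one dataset's .json metadata from the AWS mirror rather than the
    # GCP default; the master metadata table is downloaded on demand
    json_filename = read_metadata_file_for_dataset(ds_name='Caltech Camera Traps',
                                                   metadata_dir=metadata_dir,
                                                   preferred_cloud='aws')
    print('Wrote metadata to {}'.format(json_filename))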