megadetector-5.0.8-py3-none-any.whl → megadetector-5.0.9-py3-none-any.whl

This diff compares publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.


Files changed (190)
  1. api/__init__.py +0 -0
  2. api/batch_processing/__init__.py +0 -0
  3. api/batch_processing/api_core/__init__.py +0 -0
  4. api/batch_processing/api_core/batch_service/__init__.py +0 -0
  5. api/batch_processing/api_core/batch_service/score.py +0 -1
  6. api/batch_processing/api_core/server_job_status_table.py +0 -1
  7. api/batch_processing/api_core_support/__init__.py +0 -0
  8. api/batch_processing/api_core_support/aggregate_results_manually.py +0 -1
  9. api/batch_processing/api_support/__init__.py +0 -0
  10. api/batch_processing/api_support/summarize_daily_activity.py +0 -1
  11. api/batch_processing/data_preparation/__init__.py +0 -0
  12. api/batch_processing/data_preparation/manage_local_batch.py +65 -65
  13. api/batch_processing/data_preparation/manage_video_batch.py +8 -8
  14. api/batch_processing/integration/digiKam/xmp_integration.py +0 -1
  15. api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +0 -1
  16. api/batch_processing/postprocessing/__init__.py +0 -0
  17. api/batch_processing/postprocessing/add_max_conf.py +12 -12
  18. api/batch_processing/postprocessing/categorize_detections_by_size.py +32 -14
  19. api/batch_processing/postprocessing/combine_api_outputs.py +68 -54
  20. api/batch_processing/postprocessing/compare_batch_results.py +113 -43
  21. api/batch_processing/postprocessing/convert_output_format.py +41 -16
  22. api/batch_processing/postprocessing/load_api_results.py +16 -17
  23. api/batch_processing/postprocessing/md_to_coco.py +31 -21
  24. api/batch_processing/postprocessing/md_to_labelme.py +52 -22
  25. api/batch_processing/postprocessing/merge_detections.py +14 -14
  26. api/batch_processing/postprocessing/postprocess_batch_results.py +246 -174
  27. api/batch_processing/postprocessing/remap_detection_categories.py +32 -25
  28. api/batch_processing/postprocessing/render_detection_confusion_matrix.py +60 -27
  29. api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +53 -44
  30. api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +25 -14
  31. api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +242 -158
  32. api/batch_processing/postprocessing/separate_detections_into_folders.py +159 -114
  33. api/batch_processing/postprocessing/subset_json_detector_output.py +146 -169
  34. api/batch_processing/postprocessing/top_folders_to_bottom.py +77 -43
  35. api/synchronous/__init__.py +0 -0
  36. api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
  37. api/synchronous/api_core/animal_detection_api/api_backend.py +0 -2
  38. api/synchronous/api_core/animal_detection_api/api_frontend.py +266 -268
  39. api/synchronous/api_core/animal_detection_api/config.py +35 -35
  40. api/synchronous/api_core/tests/__init__.py +0 -0
  41. api/synchronous/api_core/tests/load_test.py +109 -109
  42. classification/__init__.py +0 -0
  43. classification/aggregate_classifier_probs.py +21 -24
  44. classification/analyze_failed_images.py +11 -13
  45. classification/cache_batchapi_outputs.py +51 -51
  46. classification/create_classification_dataset.py +69 -68
  47. classification/crop_detections.py +54 -53
  48. classification/csv_to_json.py +97 -100
  49. classification/detect_and_crop.py +105 -105
  50. classification/evaluate_model.py +43 -42
  51. classification/identify_mislabeled_candidates.py +47 -46
  52. classification/json_to_azcopy_list.py +10 -10
  53. classification/json_validator.py +72 -71
  54. classification/map_classification_categories.py +44 -43
  55. classification/merge_classification_detection_output.py +68 -68
  56. classification/prepare_classification_script.py +157 -154
  57. classification/prepare_classification_script_mc.py +228 -228
  58. classification/run_classifier.py +27 -26
  59. classification/save_mislabeled.py +30 -30
  60. classification/train_classifier.py +20 -20
  61. classification/train_classifier_tf.py +21 -22
  62. classification/train_utils.py +10 -10
  63. data_management/__init__.py +0 -0
  64. data_management/annotations/__init__.py +0 -0
  65. data_management/annotations/annotation_constants.py +18 -31
  66. data_management/camtrap_dp_to_coco.py +238 -0
  67. data_management/cct_json_utils.py +102 -59
  68. data_management/cct_to_md.py +176 -158
  69. data_management/cct_to_wi.py +247 -219
  70. data_management/coco_to_labelme.py +272 -263
  71. data_management/coco_to_yolo.py +79 -58
  72. data_management/databases/__init__.py +0 -0
  73. data_management/databases/add_width_and_height_to_db.py +20 -16
  74. data_management/databases/combine_coco_camera_traps_files.py +35 -31
  75. data_management/databases/integrity_check_json_db.py +62 -24
  76. data_management/databases/subset_json_db.py +24 -15
  77. data_management/generate_crops_from_cct.py +27 -45
  78. data_management/get_image_sizes.py +188 -162
  79. data_management/importers/add_nacti_sizes.py +8 -8
  80. data_management/importers/add_timestamps_to_icct.py +78 -78
  81. data_management/importers/animl_results_to_md_results.py +158 -158
  82. data_management/importers/auckland_doc_test_to_json.py +9 -9
  83. data_management/importers/auckland_doc_to_json.py +8 -8
  84. data_management/importers/awc_to_json.py +7 -7
  85. data_management/importers/bellevue_to_json.py +15 -15
  86. data_management/importers/cacophony-thermal-importer.py +13 -13
  87. data_management/importers/carrizo_shrubfree_2018.py +8 -8
  88. data_management/importers/carrizo_trail_cam_2017.py +8 -8
  89. data_management/importers/cct_field_adjustments.py +9 -9
  90. data_management/importers/channel_islands_to_cct.py +10 -10
  91. data_management/importers/eMammal/copy_and_unzip_emammal.py +1 -0
  92. data_management/importers/ena24_to_json.py +7 -7
  93. data_management/importers/filenames_to_json.py +8 -8
  94. data_management/importers/helena_to_cct.py +7 -7
  95. data_management/importers/idaho-camera-traps.py +7 -7
  96. data_management/importers/idfg_iwildcam_lila_prep.py +10 -10
  97. data_management/importers/jb_csv_to_json.py +9 -9
  98. data_management/importers/mcgill_to_json.py +8 -8
  99. data_management/importers/missouri_to_json.py +18 -18
  100. data_management/importers/nacti_fieldname_adjustments.py +10 -10
  101. data_management/importers/noaa_seals_2019.py +7 -7
  102. data_management/importers/pc_to_json.py +7 -7
  103. data_management/importers/plot_wni_giraffes.py +7 -7
  104. data_management/importers/prepare-noaa-fish-data-for-lila.py +359 -359
  105. data_management/importers/prepare_zsl_imerit.py +7 -7
  106. data_management/importers/rspb_to_json.py +8 -8
  107. data_management/importers/save_the_elephants_survey_A.py +8 -8
  108. data_management/importers/save_the_elephants_survey_B.py +9 -9
  109. data_management/importers/snapshot_safari_importer.py +26 -26
  110. data_management/importers/snapshot_safari_importer_reprise.py +665 -665
  111. data_management/importers/snapshot_serengeti_lila.py +14 -14
  112. data_management/importers/sulross_get_exif.py +8 -9
  113. data_management/importers/timelapse_csv_set_to_json.py +11 -11
  114. data_management/importers/ubc_to_json.py +13 -13
  115. data_management/importers/umn_to_json.py +7 -7
  116. data_management/importers/wellington_to_json.py +8 -8
  117. data_management/importers/wi_to_json.py +9 -9
  118. data_management/importers/zamba_results_to_md_results.py +181 -181
  119. data_management/labelme_to_coco.py +65 -24
  120. data_management/labelme_to_yolo.py +8 -8
  121. data_management/lila/__init__.py +0 -0
  122. data_management/lila/add_locations_to_island_camera_traps.py +9 -9
  123. data_management/lila/add_locations_to_nacti.py +147 -147
  124. data_management/lila/create_lila_blank_set.py +13 -13
  125. data_management/lila/create_lila_test_set.py +8 -8
  126. data_management/lila/create_links_to_md_results_files.py +106 -106
  127. data_management/lila/download_lila_subset.py +44 -110
  128. data_management/lila/generate_lila_per_image_labels.py +55 -42
  129. data_management/lila/get_lila_annotation_counts.py +18 -15
  130. data_management/lila/get_lila_image_counts.py +11 -11
  131. data_management/lila/lila_common.py +96 -33
  132. data_management/lila/test_lila_metadata_urls.py +132 -116
  133. data_management/ocr_tools.py +173 -128
  134. data_management/read_exif.py +110 -97
  135. data_management/remap_coco_categories.py +83 -83
  136. data_management/remove_exif.py +58 -62
  137. data_management/resize_coco_dataset.py +30 -23
  138. data_management/wi_download_csv_to_coco.py +246 -239
  139. data_management/yolo_output_to_md_output.py +86 -73
  140. data_management/yolo_to_coco.py +300 -60
  141. detection/__init__.py +0 -0
  142. detection/detector_training/__init__.py +0 -0
  143. detection/process_video.py +85 -33
  144. detection/pytorch_detector.py +43 -25
  145. detection/run_detector.py +157 -72
  146. detection/run_detector_batch.py +179 -113
  147. detection/run_inference_with_yolov5_val.py +108 -48
  148. detection/run_tiled_inference.py +111 -40
  149. detection/tf_detector.py +51 -29
  150. detection/video_utils.py +606 -521
  151. docs/source/conf.py +43 -0
  152. md_utils/__init__.py +0 -0
  153. md_utils/azure_utils.py +9 -9
  154. md_utils/ct_utils.py +228 -68
  155. md_utils/directory_listing.py +59 -64
  156. md_utils/md_tests.py +968 -871
  157. md_utils/path_utils.py +460 -134
  158. md_utils/process_utils.py +157 -133
  159. md_utils/sas_blob_utils.py +20 -20
  160. md_utils/split_locations_into_train_val.py +45 -32
  161. md_utils/string_utils.py +33 -10
  162. md_utils/url_utils.py +176 -60
  163. md_utils/write_html_image_list.py +40 -33
  164. md_visualization/__init__.py +0 -0
  165. md_visualization/plot_utils.py +102 -109
  166. md_visualization/render_images_with_thumbnails.py +34 -34
  167. md_visualization/visualization_utils.py +597 -291
  168. md_visualization/visualize_db.py +76 -48
  169. md_visualization/visualize_detector_output.py +61 -42
  170. {megadetector-5.0.8.dist-info → megadetector-5.0.9.dist-info}/METADATA +13 -7
  171. megadetector-5.0.9.dist-info/RECORD +224 -0
  172. {megadetector-5.0.8.dist-info → megadetector-5.0.9.dist-info}/top_level.txt +1 -0
  173. taxonomy_mapping/__init__.py +0 -0
  174. taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +342 -335
  175. taxonomy_mapping/map_new_lila_datasets.py +154 -154
  176. taxonomy_mapping/prepare_lila_taxonomy_release.py +142 -134
  177. taxonomy_mapping/preview_lila_taxonomy.py +591 -591
  178. taxonomy_mapping/retrieve_sample_image.py +12 -12
  179. taxonomy_mapping/simple_image_download.py +11 -11
  180. taxonomy_mapping/species_lookup.py +10 -10
  181. taxonomy_mapping/taxonomy_csv_checker.py +18 -18
  182. taxonomy_mapping/taxonomy_graph.py +47 -47
  183. taxonomy_mapping/validate_lila_category_mappings.py +83 -76
  184. data_management/cct_json_to_filename_json.py +0 -89
  185. data_management/cct_to_csv.py +0 -140
  186. data_management/databases/remove_corrupted_images_from_db.py +0 -191
  187. detection/detector_training/copy_checkpoints.py +0 -43
  188. megadetector-5.0.8.dist-info/RECORD +0 -205
  189. {megadetector-5.0.8.dist-info → megadetector-5.0.9.dist-info}/LICENSE +0 -0
  190. {megadetector-5.0.8.dist-info → megadetector-5.0.9.dist-info}/WHEEL +0 -0
data_management/lila/get_lila_image_counts.py
@@ -1,14 +1,14 @@
-########
-#
-# get_lila_image_counts.py
-#
-# Count the number of images and bounding boxes with each label in one or more LILA datasets.
-#
-# This script doesn't write these counts out anywhere other than the console, it's just intended
-# as a template for doing operations like this on LILA data. get_lila_annotation_counts.py writes
-# information out to a .json file, but it counts *annotations*, not *images*, for each category.
-#
-########
+"""
+
+get_lila_image_counts.py
+
+Count the number of images and bounding boxes with each label in one or more LILA datasets.
+
+This script doesn't write these counts out anywhere other than the console, it's just intended
+as a template for doing operations like this on LILA data. get_lila_annotation_counts.py writes
+information out to a .json file, but it counts *annotations*, not *images*, for each category.
+
+"""
 
 #%% Constants and imports
 
data_management/lila/lila_common.py
@@ -1,10 +1,10 @@
-########
-#
-# lila_common.py
-#
-# Common constants and functions related to LILA data management/retrieval.
-#
-########
+"""
+
+lila_common.py
+
+Common constants and functions related to LILA data management/retrieval.
+
+"""
 
 #%% Imports and constants
 
@@ -12,12 +12,12 @@ import os
 import json
 import zipfile
 import pandas as pd
-import numpy as np
 
 from urllib.parse import urlparse
 
 from md_utils.url_utils import download_url
 from md_utils.path_utils import unzip_file
+from md_utils.ct_utils import is_empty
 
 # LILA camera trap primary metadata file
 lila_metadata_url = 'http://lila.science/wp-content/uploads/2023/06/lila_camera_trap_datasets.csv'
@@ -33,11 +33,19 @@ wildlife_insights_taxonomy_local_csv_filename = \
 
 # Filenames are consistent across clouds relative to these URLs
 lila_base_urls = {
-    'azure':'https://lilablobssc.blob.core.windows.net/',
+    'azure':'https://lilawildlife.blob.core.windows.net/lila-wildlife/',
     'gcp':'https://storage.googleapis.com/public-datasets-lila/',
    'aws':'http://us-west-2.opendata.source.coop.s3.amazonaws.com/agentmorris/lila-wildlife/'
 }
 
+lila_cloud_urls = {
+    'azure':'https://lilawildlife.blob.core.windows.net/lila-wildlife/',
+    'gcp':'gs://public-datasets-lila/',
+    'aws':'s3://us-west-2.opendata.source.coop/agentmorris/lila-wildlife/'
+}
+
+for url in lila_base_urls.values():
+    assert url.endswith('/')
 
 
 #%% Common functions
@@ -46,7 +54,11 @@ def read_wildlife_insights_taxonomy_mapping(metadata_dir):
     """
     Reads the WI taxonomy mapping file, downloading the .json data (and writing to .csv) if necessary.
 
-    Returns a Pandas dataframe.
+    Args:
+        metadata_dir (str): folder to use for temporary LILA metadata files
+
+    Returns:
+        pd.dataframe: A DataFrame with taxonomy information
     """
 
     wi_taxonomy_csv_path = os.path.join(metadata_dir,wildlife_insights_taxonomy_local_csv_filename)
@@ -85,7 +97,11 @@ def read_lila_taxonomy_mapping(metadata_dir):
     """
     Reads the LILA taxonomy mapping file, downloading the .csv file if necessary.
 
-    Returns a Pandas dataframe, with one row per identification.
+    Args:
+        metadata_dir (str): folder to use for temporary LILA metadata files
+
+    Returns:
+        pd.DataFrame: a DataFrame with one row per identification
     """
 
     p = urlparse(lila_taxonomy_mapping_url)
@@ -97,24 +113,38 @@ def read_lila_taxonomy_mapping(metadata_dir):
     return df
 
 
-def is_empty(v):
-    if v is None:
-        return True
-    if isinstance(v,str) and v == '':
-        return True
-    if isinstance(v,float) and np.isnan(v):
-        return True
-    return False
-
-
 def read_lila_metadata(metadata_dir):
     """
-    Reads LILA metadata (URLs to each dataset), downloading the txt file if necessary.
+    Reads LILA metadata (URLs to each dataset), downloading the .csv file if necessary.
 
-    Returns a dict mapping dataset names (e.g. "Caltech Camera Traps") to dicts
-    with keys corresponding to the headers in the .csv file, currently:
+    Args:
+        metadata_dir (str): folder to use for temporary LILA metadata files
 
-    name,image_base_url,metadata_url,bbox_url,continent,country,region
+    Returns:
+        dict: a dict mapping dataset names (e.g. "Caltech Camera Traps") to dicts
+        with keys corresponding to the headers in the .csv file, currently:
+
+        - name
+        - short_name
+        - continent
+        - country
+        - region
+        - image_base_url_relative
+        - metadata_url_relative
+        - bbox_url_relative
+        - image_base_url_gcp
+        - metadata_url_gcp
+        - bbox_url_gcp
+        - image_base_url_aws
+        - metadata_url_aws
+        - bbox_url_aws
+        - image_base_url_azure
+        - metadata_url_azure
+        - box_url_azure
+        - mdv4_results_raw
+        - mdv5b_results_raw
+        - md_results_with_rde
+        - json_filename
     """
 
     # Put the master metadata file in the same folder where we're putting images
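The is_empty() helper removed in the hunk above now lives in md_utils.ct_utils (see the import added in the earlier lila_common.py hunk). A minimal usage sketch, assuming the relocated helper keeps the semantics of the removed definition:

    import numpy as np
    from md_utils.ct_utils import is_empty

    # Mirrors the checks in the removed definition: None, empty string, and NaN are "empty"
    assert is_empty(None)
    assert is_empty('')
    assert is_empty(np.nan)
    assert not is_empty('caltech-camera-traps')
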
data_management/lila/lila_common.py
@@ -148,6 +178,12 @@ def read_lila_all_images_file(metadata_dir):
     """
     Downloads if necessary - then unzips if necessary - the .csv file with label mappings for
     all LILA files, and opens the resulting .csv file as a Pandas DataFrame.
+
+    Args:
+        metadata_dir (str): folder to use for temporary LILA metadata files
+
+    Returns:
+        pd.DataFrame: a DataFrame containing one row per identification in a LILA camera trap image
     """
 
     p = urlparse(lila_all_images_url)
@@ -169,18 +205,37 @@ def read_lila_all_images_file(metadata_dir):
     return df
 
 
-def read_metadata_file_for_dataset(ds_name,metadata_dir,metadata_table=None,json_url=None):
+def read_metadata_file_for_dataset(ds_name,
+                                   metadata_dir,
+                                   metadata_table=None,
+                                   json_url=None,
+                                   preferred_cloud='gcp'):
     """
     Downloads if necessary - then unzips if necessary - the .json file for a specific dataset.
-    Returns the .json filename on the local disk.
+
+    Args:
+        ds_name (str): the name of the dataset for which you want to retrieve metadata (e.g.
+            "Caltech Camera Traps")
+        metadata_dir (str): folder to use for temporary LILA metadata files
+        metadata_table (dict, optional): an optional dictionary already loaded via
+            read_lila_metadata()
+        json_url (str, optional): the URL of the metadata file, if None will be retrieved
+            via read_lila_metadata()
+        preferred_cloud (str, optional): 'gcp' (default), 'azure', or 'aws'
+
+    Returns:
+        str: the .json filename on the local disk
+
     """
 
+    assert preferred_cloud in lila_base_urls.keys()
+
     if json_url is None:
 
         if metadata_table is None:
             metadata_table = read_lila_metadata(metadata_dir)
 
-        json_url = metadata_table[ds_name]['metadata_url']
+        json_url = metadata_table[ds_name]['metadata_url_' + preferred_cloud]
 
     p = urlparse(json_url)
     json_filename = os.path.join(metadata_dir,os.path.basename(p.path))
@@ -215,7 +270,8 @@ if False:
 
     from md_utils import url_utils
 
-    status_codes = url_utils.test_urls(urls)
+    status_codes = url_utils.test_urls(urls,timeout=2.0)
+    assert all([code == 200 for code in status_codes])
 
 
     #%% Verify that the metadata URLs exist for individual datasets
@@ -225,13 +281,20 @@ if False:
     dataset_metadata = read_lila_metadata(metadata_dir)
 
     urls_to_test = []
+
     # ds_name = next(iter(dataset_metadata.keys()))
     for ds_name in dataset_metadata.keys():
 
         ds_info = dataset_metadata[ds_name]
-        urls_to_test.append(ds_info['metadata_url'])
-        if ds_info['bbox_url'] != None:
-            urls_to_test.append(ds_info['bbox_url'])
+        for cloud_name in lila_base_urls.keys():
+            urls_to_test.append(ds_info['metadata_url_' + cloud_name])
+            if ds_info['bbox_url_relative'] != None:
+                urls_to_test.append(ds_info['bbox_url_' + cloud_name])
 
-    status_codes = url_utils.test_urls(urls_to_test)
+    status_codes = url_utils.test_urls(urls_to_test,
+                                       error_on_failure=True,
+                                       n_workers=10,
+                                       pool_type='process',
+                                       timeout=2.0)
+    assert all([code == 200 for code in status_codes])
 
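Taken together, the lila_common.py changes above replace the single metadata_url / bbox_url fields with per-cloud keys and add a preferred_cloud argument to read_metadata_file_for_dataset(). A minimal sketch of the new calling pattern, assuming only the function names, key names, and example dataset shown in this diff; the local metadata folder is arbitrary:

    import os
    from data_management.lila.lila_common import read_lila_metadata, read_metadata_file_for_dataset

    metadata_dir = os.path.expanduser('~/lila/metadata')
    os.makedirs(metadata_dir, exist_ok=True)

    metadata_table = read_lila_metadata(metadata_dir)
    ds_name = 'Caltech Camera Traps'

    # Metadata URLs are now stored per cloud: 'metadata_url_gcp', 'metadata_url_aws', 'metadata_url_azure'
    print(metadata_table[ds_name]['metadata_url_gcp'])

    # read_metadata_file_for_dataset() picks the URL for the requested cloud ('gcp' by default)
    json_filename = read_metadata_file_for_dataset(ds_name=ds_name,
                                                   metadata_dir=metadata_dir,
                                                   metadata_table=metadata_table,
                                                   preferred_cloud='gcp')
    print(json_filename)
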
data_management/lila/test_lila_metadata_urls.py
@@ -1,116 +1,132 @@
-########
-#
-# test_lila_metadata_urls.py
-#
-# Test that all the metadata URLs for LILA camera trap datasets are valid, and
-# test that at least one image within each URL is valid, including MegaDetector results
-# files.
-#
-########
-
-#%% Constants and imports
-
-import json
-import os
-
-from data_management.lila.lila_common import read_lila_metadata,\
-    read_metadata_file_for_dataset, read_lila_taxonomy_mapping
-
-# We'll write images, metadata downloads, and temporary files here
-lila_local_base = os.path.expanduser('~/lila')
-
-output_dir = os.path.join(lila_local_base,'lila_metadata_tests')
-os.makedirs(output_dir,exist_ok=True)
-
-metadata_dir = os.path.join(lila_local_base,'metadata')
-os.makedirs(metadata_dir,exist_ok=True)
-
-md_results_dir = os.path.join(lila_local_base,'md_results')
-os.makedirs(md_results_dir,exist_ok=True)
-
-md_results_keys = ['mdv4_results_raw','mdv5a_results_raw','mdv5b_results_raw','md_results_with_rde']
-
-
-#%% Load category and taxonomy files
-
-taxonomy_df = read_lila_taxonomy_mapping(metadata_dir)
-
-
-#%% Download and parse the metadata file
-
-metadata_table = read_lila_metadata(metadata_dir)
-
-print('Loaded metadata URLs for {} datasets'.format(len(metadata_table)))
-
-
-#%% Download and extract metadata and MD results for each dataset
-
-for ds_name in metadata_table.keys():
-
-    metadata_table[ds_name]['json_filename'] = read_metadata_file_for_dataset(ds_name=ds_name,
-                                                                              metadata_dir=metadata_dir,
-                                                                              metadata_table=metadata_table)
-    for k in md_results_keys:
-        md_results_url = metadata_table[ds_name][k]
-        if md_results_url is None:
-            metadata_table[ds_name][k + '_filename'] = None
-        else:
-            metadata_table[ds_name][k + '_filename'] = read_metadata_file_for_dataset(ds_name=ds_name,
-                                                                                      metadata_dir=md_results_dir,
-                                                                                      json_url=md_results_url)
-
-
-#%% Build up a list of URLs to test
-
-url_to_source = {}
-
-# The first image in a dataset is disproportionately likely to be human (and thus 404)
-image_index = 1000
-
-# ds_name = list(metadata_table.keys())[0]
-for ds_name in metadata_table.keys():
-
-    if 'bbox' in ds_name:
-        print('Skipping bbox dataset {}'.format(ds_name))
-        continue
-
-    print('Processing dataset {}'.format(ds_name))
-
-    json_filename = metadata_table[ds_name]['json_filename']
-    with open(json_filename, 'r') as f:
-        data = json.load(f)
-
-    image_base_url = metadata_table[ds_name]['image_base_url']
-    assert not image_base_url.endswith('/')
-    # Download a test image
-    test_image_relative_path = data['images'][image_index]['file_name']
-    test_image_url = image_base_url + '/' + test_image_relative_path
-
-    url_to_source[test_image_url] = ds_name + ' metadata'
-
-    # k = md_results_keys[2]
-    for k in md_results_keys:
-        k_fn = k + '_filename'
-        if metadata_table[ds_name][k_fn] is not None:
-            with open(metadata_table[ds_name][k_fn],'r') as f:
-                md_results = json.load(f)
-            im = md_results['images'][image_index]
-            md_image_url = image_base_url + '/' + im['file']
-            url_to_source[md_image_url] = ds_name + ' ' + k
-
-# ...for each dataset
-
-
-#%% Test URLs
-
-from md_utils.url_utils import test_urls
-
-urls_to_test = sorted(url_to_source.keys())
-urls_to_test = [fn.replace('\\','/') for fn in urls_to_test]
-
-status_codes = test_urls(urls_to_test,error_on_failure=False)
-
-for i_url,url in enumerate(urls_to_test):
-    if status_codes[i_url] != 200:
-        print('Status {} for {} ({})'.format(
-            status_codes[i_url],url,url_to_source[url]))
+"""
+
+test_lila_metadata_urls.py
+
+Test that all the metadata URLs for LILA camera trap datasets are valid, including MegaDetector
+results files.
+
+Also pick an arbitrary image from each dataset and make sure that URL is valid.
+
+Also picks an arbitrary image from each dataset's MD results and make sure the corresponding URL is valid.
+
+"""
+
+#%% Constants and imports
+
+import json
+import os
+
+from data_management.lila.lila_common import read_lila_metadata,\
+    read_metadata_file_for_dataset, read_lila_taxonomy_mapping
+
+# We'll write images, metadata downloads, and temporary files here
+lila_local_base = os.path.expanduser('~/lila')
+
+output_dir = os.path.join(lila_local_base,'lila_metadata_tests')
+os.makedirs(output_dir,exist_ok=True)
+
+metadata_dir = os.path.join(lila_local_base,'metadata')
+os.makedirs(metadata_dir,exist_ok=True)
+
+md_results_dir = os.path.join(lila_local_base,'md_results')
+os.makedirs(md_results_dir,exist_ok=True)
+
+md_results_keys = ['mdv4_results_raw','mdv5a_results_raw','mdv5b_results_raw','md_results_with_rde']
+
+preferred_cloud = 'gcp' # 'azure', 'aws'
+
+
+#%% Load category and taxonomy files
+
+taxonomy_df = read_lila_taxonomy_mapping(metadata_dir)
+
+
+#%% Download and parse the metadata file
+
+metadata_table = read_lila_metadata(metadata_dir)
+
+print('Loaded metadata URLs for {} datasets'.format(len(metadata_table)))
+
+
+#%% Download and extract metadata and MD results for each dataset
+
+for ds_name in metadata_table.keys():
+
+    metadata_table[ds_name]['json_filename'] = read_metadata_file_for_dataset(ds_name=ds_name,
+                                                                              metadata_dir=metadata_dir,
+                                                                              metadata_table=metadata_table)
+    for k in md_results_keys:
+        md_results_url = metadata_table[ds_name][k]
+        if md_results_url is None:
+            metadata_table[ds_name][k + '_filename'] = None
+        else:
+            metadata_table[ds_name][k + '_filename'] = read_metadata_file_for_dataset(ds_name=ds_name,
+                                                                                      metadata_dir=md_results_dir,
+                                                                                      json_url=md_results_url)
+
+
+#%% Build up a list of URLs to test
+
+# Takes ~15 mins, since it has to open all the giant .json files
+
+url_to_source = {}
+
+# The first image in a dataset is disproportionately likely to be human (and thus 404),
+# so we pick a semi-arbitrary image that isn't the first. How about the 1000th?
+image_index = 1000
+
+# ds_name = list(metadata_table.keys())[0]
+for ds_name in metadata_table.keys():
+
+    if 'bbox' in ds_name:
+        print('Skipping bbox dataset {}'.format(ds_name))
+        continue
+
+    print('Processing dataset {}'.format(ds_name))
+
+    json_filename = metadata_table[ds_name]['json_filename']
+    with open(json_filename, 'r') as f:
+        data = json.load(f)
+
+    image_base_url = metadata_table[ds_name]['image_base_url_' + preferred_cloud]
+    assert not image_base_url.endswith('/')
+    # Download a test image
+    test_image_relative_path = data['images'][image_index]['file_name']
+    test_image_url = image_base_url + '/' + test_image_relative_path
+
+    url_to_source[test_image_url] = ds_name + ' metadata'
+
+    # Grab an image from the MegaDetector results
+
+    # k = md_results_keys[2]
+    for k in md_results_keys:
+        k_fn = k + '_filename'
+        if metadata_table[ds_name][k_fn] is not None:
+            with open(metadata_table[ds_name][k_fn],'r') as f:
+                md_results = json.load(f)
+            im = md_results['images'][image_index]
+            md_image_url = image_base_url + '/' + im['file']
+            url_to_source[md_image_url] = ds_name + ' ' + k
+            del md_results
+    del data
+
+# ...for each dataset
+
+
+#%% Test URLs
+
+from md_utils.url_utils import test_urls
+
+urls_to_test = sorted(url_to_source.keys())
+urls_to_test = [fn.replace('\\','/') for fn in urls_to_test]
+
+status_codes = test_urls(urls_to_test,
+                         error_on_failure=False,
+                         pool_type='thread',
+                         n_workers=10,
+                         timeout=2.0)
+
+for i_url,url in enumerate(urls_to_test):
+    if status_codes[i_url] != 200:
+        print('Status {} for {} ({})'.format(
+            status_codes[i_url],url,url_to_source[url]))
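The URL check at the end of the new script relies on the expanded test_urls() signature (error_on_failure, pool_type, n_workers, timeout). A minimal sketch of the same call against a single URL taken from lila_common.py above; the choice of URL and the printing are illustrative only:

    from md_utils.url_utils import test_urls

    # The LILA master metadata .csv referenced in lila_common.py
    urls = ['http://lila.science/wp-content/uploads/2023/06/lila_camera_trap_datasets.csv']

    status_codes = test_urls(urls,
                             error_on_failure=False,
                             pool_type='thread',
                             n_workers=10,
                             timeout=2.0)

    for url, code in zip(urls, status_codes):
        if code != 200:
            print('Status {} for {}'.format(code, url))
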