megadetector 5.0.11__py3-none-any.whl → 5.0.13__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (203)
  1. megadetector/api/__init__.py +0 -0
  2. megadetector/api/batch_processing/__init__.py +0 -0
  3. megadetector/api/batch_processing/api_core/__init__.py +0 -0
  4. megadetector/api/batch_processing/api_core/batch_service/__init__.py +0 -0
  5. megadetector/api/batch_processing/api_core/batch_service/score.py +439 -0
  6. megadetector/api/batch_processing/api_core/server.py +294 -0
  7. megadetector/api/batch_processing/api_core/server_api_config.py +97 -0
  8. megadetector/api/batch_processing/api_core/server_app_config.py +55 -0
  9. megadetector/api/batch_processing/api_core/server_batch_job_manager.py +220 -0
  10. megadetector/api/batch_processing/api_core/server_job_status_table.py +149 -0
  11. megadetector/api/batch_processing/api_core/server_orchestration.py +360 -0
  12. megadetector/api/batch_processing/api_core/server_utils.py +88 -0
  13. megadetector/api/batch_processing/api_core_support/__init__.py +0 -0
  14. megadetector/api/batch_processing/api_core_support/aggregate_results_manually.py +46 -0
  15. megadetector/api/batch_processing/api_support/__init__.py +0 -0
  16. megadetector/api/batch_processing/api_support/summarize_daily_activity.py +152 -0
  17. megadetector/api/batch_processing/data_preparation/__init__.py +0 -0
  18. megadetector/api/batch_processing/integration/digiKam/setup.py +6 -0
  19. megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +465 -0
  20. megadetector/api/batch_processing/integration/eMammal/test_scripts/config_template.py +5 -0
  21. megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +125 -0
  22. megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +55 -0
  23. megadetector/api/synchronous/__init__.py +0 -0
  24. megadetector/api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
  25. megadetector/api/synchronous/api_core/animal_detection_api/api_backend.py +152 -0
  26. megadetector/api/synchronous/api_core/animal_detection_api/api_frontend.py +263 -0
  27. megadetector/api/synchronous/api_core/animal_detection_api/config.py +35 -0
  28. megadetector/api/synchronous/api_core/tests/__init__.py +0 -0
  29. megadetector/api/synchronous/api_core/tests/load_test.py +110 -0
  30. megadetector/classification/__init__.py +0 -0
  31. megadetector/classification/aggregate_classifier_probs.py +108 -0
  32. megadetector/classification/analyze_failed_images.py +227 -0
  33. megadetector/classification/cache_batchapi_outputs.py +198 -0
  34. megadetector/classification/create_classification_dataset.py +627 -0
  35. megadetector/classification/crop_detections.py +516 -0
  36. megadetector/classification/csv_to_json.py +226 -0
  37. megadetector/classification/detect_and_crop.py +855 -0
  38. megadetector/classification/efficientnet/__init__.py +9 -0
  39. megadetector/classification/efficientnet/model.py +415 -0
  40. megadetector/classification/efficientnet/utils.py +607 -0
  41. megadetector/classification/evaluate_model.py +520 -0
  42. megadetector/classification/identify_mislabeled_candidates.py +152 -0
  43. megadetector/classification/json_to_azcopy_list.py +63 -0
  44. megadetector/classification/json_validator.py +699 -0
  45. megadetector/classification/map_classification_categories.py +276 -0
  46. megadetector/classification/merge_classification_detection_output.py +506 -0
  47. megadetector/classification/prepare_classification_script.py +194 -0
  48. megadetector/classification/prepare_classification_script_mc.py +228 -0
  49. megadetector/classification/run_classifier.py +287 -0
  50. megadetector/classification/save_mislabeled.py +110 -0
  51. megadetector/classification/train_classifier.py +827 -0
  52. megadetector/classification/train_classifier_tf.py +725 -0
  53. megadetector/classification/train_utils.py +323 -0
  54. megadetector/data_management/__init__.py +0 -0
  55. megadetector/data_management/annotations/__init__.py +0 -0
  56. megadetector/data_management/annotations/annotation_constants.py +34 -0
  57. megadetector/data_management/camtrap_dp_to_coco.py +237 -0
  58. megadetector/data_management/cct_json_utils.py +404 -0
  59. megadetector/data_management/cct_to_md.py +176 -0
  60. megadetector/data_management/cct_to_wi.py +289 -0
  61. megadetector/data_management/coco_to_labelme.py +283 -0
  62. megadetector/data_management/coco_to_yolo.py +662 -0
  63. megadetector/data_management/databases/__init__.py +0 -0
  64. megadetector/data_management/databases/add_width_and_height_to_db.py +33 -0
  65. megadetector/data_management/databases/combine_coco_camera_traps_files.py +206 -0
  66. megadetector/data_management/databases/integrity_check_json_db.py +493 -0
  67. megadetector/data_management/databases/subset_json_db.py +115 -0
  68. megadetector/data_management/generate_crops_from_cct.py +149 -0
  69. megadetector/data_management/get_image_sizes.py +189 -0
  70. megadetector/data_management/importers/add_nacti_sizes.py +52 -0
  71. megadetector/data_management/importers/add_timestamps_to_icct.py +79 -0
  72. megadetector/data_management/importers/animl_results_to_md_results.py +158 -0
  73. megadetector/data_management/importers/auckland_doc_test_to_json.py +373 -0
  74. megadetector/data_management/importers/auckland_doc_to_json.py +201 -0
  75. megadetector/data_management/importers/awc_to_json.py +191 -0
  76. megadetector/data_management/importers/bellevue_to_json.py +273 -0
  77. megadetector/data_management/importers/cacophony-thermal-importer.py +793 -0
  78. megadetector/data_management/importers/carrizo_shrubfree_2018.py +269 -0
  79. megadetector/data_management/importers/carrizo_trail_cam_2017.py +289 -0
  80. megadetector/data_management/importers/cct_field_adjustments.py +58 -0
  81. megadetector/data_management/importers/channel_islands_to_cct.py +913 -0
  82. megadetector/data_management/importers/eMammal/copy_and_unzip_emammal.py +180 -0
  83. megadetector/data_management/importers/eMammal/eMammal_helpers.py +249 -0
  84. megadetector/data_management/importers/eMammal/make_eMammal_json.py +223 -0
  85. megadetector/data_management/importers/ena24_to_json.py +276 -0
  86. megadetector/data_management/importers/filenames_to_json.py +386 -0
  87. megadetector/data_management/importers/helena_to_cct.py +283 -0
  88. megadetector/data_management/importers/idaho-camera-traps.py +1407 -0
  89. megadetector/data_management/importers/idfg_iwildcam_lila_prep.py +294 -0
  90. megadetector/data_management/importers/jb_csv_to_json.py +150 -0
  91. megadetector/data_management/importers/mcgill_to_json.py +250 -0
  92. megadetector/data_management/importers/missouri_to_json.py +490 -0
  93. megadetector/data_management/importers/nacti_fieldname_adjustments.py +79 -0
  94. megadetector/data_management/importers/noaa_seals_2019.py +181 -0
  95. megadetector/data_management/importers/pc_to_json.py +365 -0
  96. megadetector/data_management/importers/plot_wni_giraffes.py +123 -0
  97. megadetector/data_management/importers/prepare-noaa-fish-data-for-lila.py +359 -0
  98. megadetector/data_management/importers/prepare_zsl_imerit.py +131 -0
  99. megadetector/data_management/importers/rspb_to_json.py +356 -0
  100. megadetector/data_management/importers/save_the_elephants_survey_A.py +320 -0
  101. megadetector/data_management/importers/save_the_elephants_survey_B.py +329 -0
  102. megadetector/data_management/importers/snapshot_safari_importer.py +758 -0
  103. megadetector/data_management/importers/snapshot_safari_importer_reprise.py +665 -0
  104. megadetector/data_management/importers/snapshot_serengeti_lila.py +1067 -0
  105. megadetector/data_management/importers/snapshotserengeti/make_full_SS_json.py +150 -0
  106. megadetector/data_management/importers/snapshotserengeti/make_per_season_SS_json.py +153 -0
  107. megadetector/data_management/importers/sulross_get_exif.py +65 -0
  108. megadetector/data_management/importers/timelapse_csv_set_to_json.py +490 -0
  109. megadetector/data_management/importers/ubc_to_json.py +399 -0
  110. megadetector/data_management/importers/umn_to_json.py +507 -0
  111. megadetector/data_management/importers/wellington_to_json.py +263 -0
  112. megadetector/data_management/importers/wi_to_json.py +442 -0
  113. megadetector/data_management/importers/zamba_results_to_md_results.py +181 -0
  114. megadetector/data_management/labelme_to_coco.py +547 -0
  115. megadetector/data_management/labelme_to_yolo.py +272 -0
  116. megadetector/data_management/lila/__init__.py +0 -0
  117. megadetector/data_management/lila/add_locations_to_island_camera_traps.py +97 -0
  118. megadetector/data_management/lila/add_locations_to_nacti.py +147 -0
  119. megadetector/data_management/lila/create_lila_blank_set.py +558 -0
  120. megadetector/data_management/lila/create_lila_test_set.py +152 -0
  121. megadetector/data_management/lila/create_links_to_md_results_files.py +106 -0
  122. megadetector/data_management/lila/download_lila_subset.py +178 -0
  123. megadetector/data_management/lila/generate_lila_per_image_labels.py +516 -0
  124. megadetector/data_management/lila/get_lila_annotation_counts.py +170 -0
  125. megadetector/data_management/lila/get_lila_image_counts.py +112 -0
  126. megadetector/data_management/lila/lila_common.py +300 -0
  127. megadetector/data_management/lila/test_lila_metadata_urls.py +132 -0
  128. megadetector/data_management/ocr_tools.py +870 -0
  129. megadetector/data_management/read_exif.py +809 -0
  130. megadetector/data_management/remap_coco_categories.py +84 -0
  131. megadetector/data_management/remove_exif.py +66 -0
  132. megadetector/data_management/rename_images.py +187 -0
  133. megadetector/data_management/resize_coco_dataset.py +189 -0
  134. megadetector/data_management/wi_download_csv_to_coco.py +247 -0
  135. megadetector/data_management/yolo_output_to_md_output.py +446 -0
  136. megadetector/data_management/yolo_to_coco.py +676 -0
  137. megadetector/detection/__init__.py +0 -0
  138. megadetector/detection/detector_training/__init__.py +0 -0
  139. megadetector/detection/detector_training/model_main_tf2.py +114 -0
  140. megadetector/detection/process_video.py +846 -0
  141. megadetector/detection/pytorch_detector.py +355 -0
  142. megadetector/detection/run_detector.py +779 -0
  143. megadetector/detection/run_detector_batch.py +1219 -0
  144. megadetector/detection/run_inference_with_yolov5_val.py +1087 -0
  145. megadetector/detection/run_tiled_inference.py +934 -0
  146. megadetector/detection/tf_detector.py +192 -0
  147. megadetector/detection/video_utils.py +698 -0
  148. megadetector/postprocessing/__init__.py +0 -0
  149. megadetector/postprocessing/add_max_conf.py +64 -0
  150. megadetector/postprocessing/categorize_detections_by_size.py +165 -0
  151. megadetector/postprocessing/classification_postprocessing.py +716 -0
  152. megadetector/postprocessing/combine_api_outputs.py +249 -0
  153. megadetector/postprocessing/compare_batch_results.py +966 -0
  154. megadetector/postprocessing/convert_output_format.py +396 -0
  155. megadetector/postprocessing/load_api_results.py +195 -0
  156. megadetector/postprocessing/md_to_coco.py +310 -0
  157. megadetector/postprocessing/md_to_labelme.py +330 -0
  158. megadetector/postprocessing/merge_detections.py +412 -0
  159. megadetector/postprocessing/postprocess_batch_results.py +1908 -0
  160. megadetector/postprocessing/remap_detection_categories.py +170 -0
  161. megadetector/postprocessing/render_detection_confusion_matrix.py +660 -0
  162. megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +211 -0
  163. megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +83 -0
  164. megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +1635 -0
  165. megadetector/postprocessing/separate_detections_into_folders.py +730 -0
  166. megadetector/postprocessing/subset_json_detector_output.py +700 -0
  167. megadetector/postprocessing/top_folders_to_bottom.py +223 -0
  168. megadetector/taxonomy_mapping/__init__.py +0 -0
  169. megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +491 -0
  170. megadetector/taxonomy_mapping/map_new_lila_datasets.py +150 -0
  171. megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +142 -0
  172. megadetector/taxonomy_mapping/preview_lila_taxonomy.py +588 -0
  173. megadetector/taxonomy_mapping/retrieve_sample_image.py +71 -0
  174. megadetector/taxonomy_mapping/simple_image_download.py +219 -0
  175. megadetector/taxonomy_mapping/species_lookup.py +834 -0
  176. megadetector/taxonomy_mapping/taxonomy_csv_checker.py +159 -0
  177. megadetector/taxonomy_mapping/taxonomy_graph.py +346 -0
  178. megadetector/taxonomy_mapping/validate_lila_category_mappings.py +83 -0
  179. megadetector/utils/__init__.py +0 -0
  180. megadetector/utils/azure_utils.py +178 -0
  181. megadetector/utils/ct_utils.py +613 -0
  182. megadetector/utils/directory_listing.py +246 -0
  183. megadetector/utils/md_tests.py +1164 -0
  184. megadetector/utils/path_utils.py +1045 -0
  185. megadetector/utils/process_utils.py +160 -0
  186. megadetector/utils/sas_blob_utils.py +509 -0
  187. megadetector/utils/split_locations_into_train_val.py +228 -0
  188. megadetector/utils/string_utils.py +92 -0
  189. megadetector/utils/url_utils.py +323 -0
  190. megadetector/utils/write_html_image_list.py +225 -0
  191. megadetector/visualization/__init__.py +0 -0
  192. megadetector/visualization/plot_utils.py +293 -0
  193. megadetector/visualization/render_images_with_thumbnails.py +275 -0
  194. megadetector/visualization/visualization_utils.py +1536 -0
  195. megadetector/visualization/visualize_db.py +552 -0
  196. megadetector/visualization/visualize_detector_output.py +405 -0
  197. {megadetector-5.0.11.dist-info → megadetector-5.0.13.dist-info}/LICENSE +0 -0
  198. {megadetector-5.0.11.dist-info → megadetector-5.0.13.dist-info}/METADATA +2 -2
  199. megadetector-5.0.13.dist-info/RECORD +201 -0
  200. megadetector-5.0.13.dist-info/top_level.txt +1 -0
  201. megadetector-5.0.11.dist-info/RECORD +0 -5
  202. megadetector-5.0.11.dist-info/top_level.txt +0 -1
  203. {megadetector-5.0.11.dist-info → megadetector-5.0.13.dist-info}/WHEEL +0 -0
@@ -0,0 +1,112 @@
+"""
+
+get_lila_image_counts.py
+
+Count the number of images and bounding boxes with each label in one or more LILA datasets.
+
+This script doesn't write these counts out anywhere other than the console; it's just intended
+as a template for doing operations like this on LILA data. get_lila_annotation_counts.py writes
+information out to a .json file, but it counts *annotations*, not *images*, for each category.
+
+"""
+
+#%% Constants and imports
+
+import json
+import os
+
+from collections import defaultdict
+
+from megadetector.data_management.lila.lila_common import \
+    read_lila_metadata, read_metadata_file_for_dataset
+
+# If None, will use all datasets
+datasets_of_interest = None
+
+# We'll write images, metadata downloads, and temporary files here
+lila_local_base = os.path.expanduser('~/lila')
+
+metadata_dir = os.path.join(lila_local_base,'metadata')
+os.makedirs(metadata_dir,exist_ok=True)
+
+
+#%% Download and parse the metadata file
+
+metadata_table = read_lila_metadata(metadata_dir)
+
+
+#%% Download and extract metadata for the datasets we're interested in
+
+if datasets_of_interest is None:
+    datasets_of_interest = list(metadata_table.keys())
+
+for ds_name in datasets_of_interest:
+    metadata_table[ds_name]['json_filename'] = read_metadata_file_for_dataset(ds_name=ds_name,
+                                                                              metadata_dir=metadata_dir,
+                                                                              metadata_table=metadata_table)
+
+
+#%% Count categories
+
+ds_name_to_category_counts = {}
+
+# ds_name = datasets_of_interest[0]
+for ds_name in datasets_of_interest:
+
+    category_to_image_count = {}
+    category_to_bbox_count = {}
+
+    print('Counting categories in: ' + ds_name)
+
+    json_filename = metadata_table[ds_name]['json_filename']
+    with open(json_filename, 'r') as f:
+        data = json.load(f)
+
+    categories = data['categories']
+    category_ids = [c['id'] for c in categories]
+    category_id_to_name = {c['id']:c['name'] for c in categories}
+    annotations = data['annotations']
+    images = data['images']
+
+    for category_id in category_ids:
+        category_name = category_id_to_name[category_id]
+        category_to_image_count[category_name] = 0
+        category_to_bbox_count[category_name] = 0
+
+    image_id_to_category_names = defaultdict(set)
+
+    # Go through annotations, marking each image with the categories that are present
+    #
+    # ann = annotations[0]
+    for ann in annotations:
+
+        category_name = category_id_to_name[ann['category_id']]
+        image_id_to_category_names[ann['image_id']].add(category_name)
+
+    # Now go through images and count categories
+    category_to_count = defaultdict(int)
+
+    # im = images[0]
+    for im in images:
+        categories_this_image = image_id_to_category_names[im['id']]
+        for category_name in categories_this_image:
+            category_to_count[category_name] += 1
+
+    ds_name_to_category_counts[ds_name] = category_to_count
+
+# ...for each dataset
+
+
+#%% Print the results
+
+for ds_name in ds_name_to_category_counts:
+
+    print('\n** Category counts for {} **\n'.format(ds_name))
+
+    category_to_count = ds_name_to_category_counts[ds_name]
+    category_to_count = {k: v for k, v in sorted(category_to_count.items(), reverse=True,
+                                                 key=lambda item: item[1])}
+
+    for category_name in category_to_count.keys():
+        print('{}: {}'.format(category_name,category_to_count[category_name]))
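get_lila_image_counts.py prints its counts to the console and is explicitly intended as a template. A minimal sketch of how the results could be persisted instead, assuming ds_name_to_category_counts has been populated as in the script (the sample data and output filename here are hypothetical):

import json
from collections import defaultdict

# Stand-in for the ds_name_to_category_counts dict built by the script above
ds_name_to_category_counts = {
    'Example Camera Traps': defaultdict(int, {'coyote': 120, 'empty': 4500})
}

# Convert the inner defaultdicts to plain dicts before serializing
serializable_counts = {ds: dict(counts) for ds, counts in ds_name_to_category_counts.items()}

with open('lila_image_counts.json', 'w') as f:
    json.dump(serializable_counts, f, indent=1)

This mirrors what get_lila_annotation_counts.py does for annotation counts, per the docstring above.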
@@ -0,0 +1,300 @@
+"""
+
+lila_common.py
+
+Common constants and functions related to LILA data management/retrieval.
+
+"""
+
+#%% Imports and constants
+
+import os
+import json
+import zipfile
+import pandas as pd
+
+from urllib.parse import urlparse
+
+from megadetector.utils.url_utils import download_url
+from megadetector.utils.path_utils import unzip_file
+from megadetector.utils.ct_utils import is_empty
+
+# LILA camera trap primary metadata file
+lila_metadata_url = 'http://lila.science/wp-content/uploads/2023/06/lila_camera_trap_datasets.csv'
+lila_taxonomy_mapping_url = 'https://lila.science/public/lila-taxonomy-mapping_release.csv'
+lila_all_images_url = 'https://lila.science/public/lila_image_urls_and_labels.csv.zip'
+
+wildlife_insights_page_size = 30000
+wildlife_insights_taxonomy_url = 'https://api.wildlifeinsights.org/api/v1/taxonomy/taxonomies-all?fields=class,order,family,genus,species,authority,taxonomyType,uniqueIdentifier,commonNameEnglish&page[size]={}'.format(
+    wildlife_insights_page_size)
+wildlife_insights_taxonomy_local_json_filename = 'wi_taxonomy.json'
+wildlife_insights_taxonomy_local_csv_filename = \
+    wildlife_insights_taxonomy_local_json_filename.replace('.json','.csv')
+
+# Filenames are consistent across clouds relative to these URLs
+lila_base_urls = {
+    'azure':'https://lilawildlife.blob.core.windows.net/lila-wildlife/',
+    'gcp':'https://storage.googleapis.com/public-datasets-lila/',
+    'aws':'http://us-west-2.opendata.source.coop.s3.amazonaws.com/agentmorris/lila-wildlife/'
+}
+
+lila_cloud_urls = {
+    'azure':'https://lilawildlife.blob.core.windows.net/lila-wildlife/',
+    'gcp':'gs://public-datasets-lila/',
+    'aws':'s3://us-west-2.opendata.source.coop/agentmorris/lila-wildlife/'
+}
+
+for url in lila_base_urls.values():
+    assert url.endswith('/')
+
+
+#%% Common functions
+
+def read_wildlife_insights_taxonomy_mapping(metadata_dir):
+    """
+    Reads the WI taxonomy mapping file, downloading the .json data (and writing to .csv) if necessary.
+
+    Args:
+        metadata_dir (str): folder to use for temporary LILA metadata files
+
+    Returns:
+        pd.DataFrame: a DataFrame with taxonomy information
+    """
+
+    wi_taxonomy_csv_path = os.path.join(metadata_dir,wildlife_insights_taxonomy_local_csv_filename)
+
+    if os.path.exists(wi_taxonomy_csv_path):
+        df = pd.read_csv(wi_taxonomy_csv_path)
+    else:
+        wi_taxonomy_json_path = os.path.join(metadata_dir,wildlife_insights_taxonomy_local_json_filename)
+        download_url(wildlife_insights_taxonomy_url, wi_taxonomy_json_path)
+        with open(wi_taxonomy_json_path,'r') as f:
+            d = json.load(f)
+
+        # We haven't implemented paging; make sure that's not an issue
+        assert d['meta']['totalItems'] < wildlife_insights_page_size
+
+        # d['data'] is a list of items that look like:
+        """
+        {'id': 2000003,
+         'class': 'Mammalia',
+         'order': 'Rodentia',
+         'family': 'Abrocomidae',
+         'genus': 'Abrocoma',
+         'species': 'bennettii',
+         'authority': 'Waterhouse, 1837',
+         'commonNameEnglish': "Bennett's Chinchilla Rat",
+         'taxonomyType': 'biological',
+         'uniqueIdentifier': '7a6c93a5-bdf7-4182-82f9-7a67d23f7fe1'}
+        """
+        df = pd.DataFrame(d['data'])
+        df.to_csv(wi_taxonomy_csv_path,index=False)
+
+    return df
+
+
+def read_lila_taxonomy_mapping(metadata_dir):
+    """
+    Reads the LILA taxonomy mapping file, downloading the .csv file if necessary.
+
+    Args:
+        metadata_dir (str): folder to use for temporary LILA metadata files
+
+    Returns:
+        pd.DataFrame: a DataFrame with one row per identification
+    """
+
+    p = urlparse(lila_taxonomy_mapping_url)
+    taxonomy_filename = os.path.join(metadata_dir,os.path.basename(p.path))
+    download_url(lila_taxonomy_mapping_url, taxonomy_filename)
+
+    # Read the local copy we just downloaded, rather than re-fetching the URL
+    df = pd.read_csv(taxonomy_filename)
+
+    return df
+
+
+def read_lila_metadata(metadata_dir):
+    """
+    Reads LILA metadata (URLs to each dataset), downloading the .csv file if necessary.
+
+    Args:
+        metadata_dir (str): folder to use for temporary LILA metadata files
+
+    Returns:
+        dict: a dict mapping dataset names (e.g. "Caltech Camera Traps") to dicts
+        with keys corresponding to the headers in the .csv file, currently:
+
+        - name
+        - short_name
+        - continent
+        - country
+        - region
+        - image_base_url_relative
+        - metadata_url_relative
+        - bbox_url_relative
+        - image_base_url_gcp
+        - metadata_url_gcp
+        - bbox_url_gcp
+        - image_base_url_aws
+        - metadata_url_aws
+        - bbox_url_aws
+        - image_base_url_azure
+        - metadata_url_azure
+        - box_url_azure
+        - mdv4_results_raw
+        - mdv5b_results_raw
+        - md_results_with_rde
+        - json_filename
+    """
+
+    # Put the master metadata file in the same folder where we're putting images
+    p = urlparse(lila_metadata_url)
+    metadata_filename = os.path.join(metadata_dir,os.path.basename(p.path))
+    download_url(lila_metadata_url, metadata_filename)
+
+    df = pd.read_csv(metadata_filename)
+
+    records = df.to_dict('records')
+
+    # Parse into a table keyed by dataset name
+    metadata_table = {}
+
+    # r = records[0]
+    for r in records:
+
+        if is_empty(r['name']):
+            continue
+
+        # Convert NaN's to None
+        for k in r.keys():
+            if is_empty(r[k]):
+                r[k] = None
+
+        metadata_table[r['name']] = r
+
+    return metadata_table
+
+
+def read_lila_all_images_file(metadata_dir):
+    """
+    Downloads if necessary - then unzips if necessary - the .csv file with label mappings for
+    all LILA files, and opens the resulting .csv file as a Pandas DataFrame.
+
+    Args:
+        metadata_dir (str): folder to use for temporary LILA metadata files
+
+    Returns:
+        pd.DataFrame: a DataFrame containing one row per identification in a LILA camera trap image
+    """
+
+    p = urlparse(lila_all_images_url)
+    lila_all_images_zip_filename = os.path.join(metadata_dir,os.path.basename(p.path))
+    download_url(lila_all_images_url, lila_all_images_zip_filename)
+
+    with zipfile.ZipFile(lila_all_images_zip_filename,'r') as z:
+        files = z.namelist()
+        assert len(files) == 1
+
+    unzipped_csv_filename = os.path.join(metadata_dir,files[0])
+    if not os.path.isfile(unzipped_csv_filename):
+        unzip_file(lila_all_images_zip_filename,metadata_dir)
+    else:
+        print('{} already unzipped'.format(unzipped_csv_filename))
+
+    df = pd.read_csv(unzipped_csv_filename)
+
+    return df
+
+
+def read_metadata_file_for_dataset(ds_name,
+                                   metadata_dir,
+                                   metadata_table=None,
+                                   json_url=None,
+                                   preferred_cloud='gcp'):
+    """
+    Downloads if necessary - then unzips if necessary - the .json metadata file for a
+    specific dataset.
+
+    Args:
+        ds_name (str): the name of the dataset for which you want to retrieve metadata (e.g.
+            "Caltech Camera Traps")
+        metadata_dir (str): folder to use for temporary LILA metadata files
+        metadata_table (dict, optional): a dictionary already loaded via read_lila_metadata()
+        json_url (str, optional): the URL of the metadata file; if None, will be retrieved
+            via read_lila_metadata()
+        preferred_cloud (str, optional): 'gcp' (default), 'azure', or 'aws'
+
+    Returns:
+        str: the .json filename on the local disk
+    """
+
+    assert preferred_cloud in lila_base_urls.keys()
+
+    if json_url is None:
+
+        if metadata_table is None:
+            metadata_table = read_lila_metadata(metadata_dir)
+
+        json_url = metadata_table[ds_name]['metadata_url_' + preferred_cloud]
+
+    p = urlparse(json_url)
+    json_filename = os.path.join(metadata_dir,os.path.basename(p.path))
+    download_url(json_url, json_filename)
+
+    # Unzip if necessary
+    if json_filename.endswith('.zip'):
+
+        with zipfile.ZipFile(json_filename,'r') as z:
+            files = z.namelist()
+            assert len(files) == 1
+        unzipped_json_filename = os.path.join(metadata_dir,files[0])
+        if not os.path.isfile(unzipped_json_filename):
+            unzip_file(json_filename,metadata_dir)
+        else:
+            print('{} already unzipped'.format(unzipped_json_filename))
+        json_filename = unzipped_json_filename
+
+    return json_filename
+
+
+#%% Interactive test driver
+
+if False:
+
+    pass
+
+    #%% Verify that all base URLs exist
+
+    # LILA camera trap primary metadata file
+    urls = (lila_metadata_url,lila_taxonomy_mapping_url,lila_all_images_url,wildlife_insights_taxonomy_url)
+
+    from megadetector.utils import url_utils
+
+    status_codes = url_utils.test_urls(urls,timeout=2.0)
+    assert all([code == 200 for code in status_codes])
+
+
+    #%% Verify that the metadata URLs exist for individual datasets
+
+    metadata_dir = os.path.expanduser('~/lila/metadata')
+
+    dataset_metadata = read_lila_metadata(metadata_dir)
+
+    urls_to_test = []
+
+    # ds_name = next(iter(dataset_metadata.keys()))
+    for ds_name in dataset_metadata.keys():
+
+        ds_info = dataset_metadata[ds_name]
+        for cloud_name in lila_base_urls.keys():
+            urls_to_test.append(ds_info['metadata_url_' + cloud_name])
+            if ds_info['bbox_url_relative'] is not None:
+                urls_to_test.append(ds_info['bbox_url_' + cloud_name])
+
+    status_codes = url_utils.test_urls(urls_to_test,
+                                       error_on_failure=True,
+                                       n_workers=10,
+                                       pool_type='process',
+                                       timeout=2.0)
+    assert all([code == 200 for code in status_codes])
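Because filenames are consistent across clouds relative to lila_base_urls (each of which is asserted above to end in '/'), the same relative path resolves on Azure, GCP, and AWS. A small illustrative sketch; the relative path below is hypothetical:

from megadetector.data_management.lila.lila_common import lila_base_urls

# Hypothetical relative path; real paths come from each dataset's metadata file
relative_path = 'example-dataset/images/image_000001.jpg'

for cloud, base_url in lila_base_urls.items():
    # Simple concatenation is safe because every base URL ends with '/'
    print('{}: {}'.format(cloud, base_url + relative_path))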
@@ -0,0 +1,132 @@
+"""
+
+test_lila_metadata_urls.py
+
+Test that all the metadata URLs for LILA camera trap datasets are valid, including MegaDetector
+results files.
+
+Also picks an arbitrary image from each dataset and makes sure that URL is valid.
+
+Also picks an arbitrary image from each dataset's MD results and makes sure the corresponding
+URL is valid.
+
+"""
+
+#%% Constants and imports
+
+import json
+import os
+
+from megadetector.data_management.lila.lila_common import read_lila_metadata,\
+    read_metadata_file_for_dataset, read_lila_taxonomy_mapping
+
+# We'll write images, metadata downloads, and temporary files here
+lila_local_base = os.path.expanduser('~/lila')
+
+output_dir = os.path.join(lila_local_base,'lila_metadata_tests')
+os.makedirs(output_dir,exist_ok=True)
+
+metadata_dir = os.path.join(lila_local_base,'metadata')
+os.makedirs(metadata_dir,exist_ok=True)
+
+md_results_dir = os.path.join(lila_local_base,'md_results')
+os.makedirs(md_results_dir,exist_ok=True)
+
+md_results_keys = ['mdv4_results_raw','mdv5a_results_raw','mdv5b_results_raw','md_results_with_rde']
+
+preferred_cloud = 'gcp' # 'azure', 'aws'
+
+
+#%% Load category and taxonomy files
+
+taxonomy_df = read_lila_taxonomy_mapping(metadata_dir)
+
+
+#%% Download and parse the metadata file
+
+metadata_table = read_lila_metadata(metadata_dir)
+
+print('Loaded metadata URLs for {} datasets'.format(len(metadata_table)))
+
+
+#%% Download and extract metadata and MD results for each dataset
+
+for ds_name in metadata_table.keys():
+
+    metadata_table[ds_name]['json_filename'] = read_metadata_file_for_dataset(ds_name=ds_name,
+                                                                              metadata_dir=metadata_dir,
+                                                                              metadata_table=metadata_table)
+    for k in md_results_keys:
+        md_results_url = metadata_table[ds_name][k]
+        if md_results_url is None:
+            metadata_table[ds_name][k + '_filename'] = None
+        else:
+            metadata_table[ds_name][k + '_filename'] = read_metadata_file_for_dataset(ds_name=ds_name,
+                                                                                      metadata_dir=md_results_dir,
+                                                                                      json_url=md_results_url)
+
+
+#%% Build up a list of URLs to test
+
+# Takes ~15 mins, since it has to open all the giant .json files
+
+url_to_source = {}
+
+# The first image in a dataset is disproportionately likely to be human (and thus 404),
+# so we pick a semi-arbitrary image that isn't the first. How about the 1000th?
+image_index = 1000
+
+# ds_name = list(metadata_table.keys())[0]
+for ds_name in metadata_table.keys():
+
+    if 'bbox' in ds_name:
+        print('Skipping bbox dataset {}'.format(ds_name))
+        continue
+
+    print('Processing dataset {}'.format(ds_name))
+
+    json_filename = metadata_table[ds_name]['json_filename']
+    with open(json_filename, 'r') as f:
+        data = json.load(f)
+
+    image_base_url = metadata_table[ds_name]['image_base_url_' + preferred_cloud]
+    assert not image_base_url.endswith('/')
+
+    # Build the URL for a test image
+    test_image_relative_path = data['images'][image_index]['file_name']
+    test_image_url = image_base_url + '/' + test_image_relative_path
+
+    url_to_source[test_image_url] = ds_name + ' metadata'
+
+    # Grab an image from the MegaDetector results
+
+    # k = md_results_keys[2]
+    for k in md_results_keys:
+        k_fn = k + '_filename'
+        if metadata_table[ds_name][k_fn] is not None:
+            with open(metadata_table[ds_name][k_fn],'r') as f:
+                md_results = json.load(f)
+            im = md_results['images'][image_index]
+            md_image_url = image_base_url + '/' + im['file']
+            url_to_source[md_image_url] = ds_name + ' ' + k
+            del md_results
+
+    del data
+
+# ...for each dataset
+
+
+#%% Test URLs
+
+from megadetector.utils.url_utils import test_urls
+
+urls_to_test = sorted(url_to_source.keys())
+urls_to_test = [fn.replace('\\','/') for fn in urls_to_test]
+
+status_codes = test_urls(urls_to_test,
+                         error_on_failure=False,
+                         pool_type='thread',
+                         n_workers=10,
+                         timeout=2.0)
+
+for i_url,url in enumerate(urls_to_test):
+    if status_codes[i_url] != 200:
+        print('Status {} for {} ({})'.format(
+            status_codes[i_url],url,url_to_source[url]))
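A possible follow-up to the per-URL printout above, summarizing failures by status code; this Counter-based histogram is a sketch, not part of the script, and the sample values stand in for the test_urls() results:

from collections import Counter

status_codes = [200, 200, 404, 200, 403]  # stand-in for the results above

failure_histogram = Counter(code for code in status_codes if code != 200)
for code, count in sorted(failure_histogram.items()):
    print('HTTP {}: {} failing URL(s)'.format(code, count))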