megadetector 5.0.11__py3-none-any.whl → 5.0.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of megadetector might be problematic.

Files changed (203)
  1. megadetector/api/__init__.py +0 -0
  2. megadetector/api/batch_processing/__init__.py +0 -0
  3. megadetector/api/batch_processing/api_core/__init__.py +0 -0
  4. megadetector/api/batch_processing/api_core/batch_service/__init__.py +0 -0
  5. megadetector/api/batch_processing/api_core/batch_service/score.py +439 -0
  6. megadetector/api/batch_processing/api_core/server.py +294 -0
  7. megadetector/api/batch_processing/api_core/server_api_config.py +97 -0
  8. megadetector/api/batch_processing/api_core/server_app_config.py +55 -0
  9. megadetector/api/batch_processing/api_core/server_batch_job_manager.py +220 -0
  10. megadetector/api/batch_processing/api_core/server_job_status_table.py +149 -0
  11. megadetector/api/batch_processing/api_core/server_orchestration.py +360 -0
  12. megadetector/api/batch_processing/api_core/server_utils.py +88 -0
  13. megadetector/api/batch_processing/api_core_support/__init__.py +0 -0
  14. megadetector/api/batch_processing/api_core_support/aggregate_results_manually.py +46 -0
  15. megadetector/api/batch_processing/api_support/__init__.py +0 -0
  16. megadetector/api/batch_processing/api_support/summarize_daily_activity.py +152 -0
  17. megadetector/api/batch_processing/data_preparation/__init__.py +0 -0
  18. megadetector/api/batch_processing/integration/digiKam/setup.py +6 -0
  19. megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +465 -0
  20. megadetector/api/batch_processing/integration/eMammal/test_scripts/config_template.py +5 -0
  21. megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +125 -0
  22. megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +55 -0
  23. megadetector/api/synchronous/__init__.py +0 -0
  24. megadetector/api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
  25. megadetector/api/synchronous/api_core/animal_detection_api/api_backend.py +152 -0
  26. megadetector/api/synchronous/api_core/animal_detection_api/api_frontend.py +263 -0
  27. megadetector/api/synchronous/api_core/animal_detection_api/config.py +35 -0
  28. megadetector/api/synchronous/api_core/tests/__init__.py +0 -0
  29. megadetector/api/synchronous/api_core/tests/load_test.py +110 -0
  30. megadetector/classification/__init__.py +0 -0
  31. megadetector/classification/aggregate_classifier_probs.py +108 -0
  32. megadetector/classification/analyze_failed_images.py +227 -0
  33. megadetector/classification/cache_batchapi_outputs.py +198 -0
  34. megadetector/classification/create_classification_dataset.py +627 -0
  35. megadetector/classification/crop_detections.py +516 -0
  36. megadetector/classification/csv_to_json.py +226 -0
  37. megadetector/classification/detect_and_crop.py +855 -0
  38. megadetector/classification/efficientnet/__init__.py +9 -0
  39. megadetector/classification/efficientnet/model.py +415 -0
  40. megadetector/classification/efficientnet/utils.py +607 -0
  41. megadetector/classification/evaluate_model.py +520 -0
  42. megadetector/classification/identify_mislabeled_candidates.py +152 -0
  43. megadetector/classification/json_to_azcopy_list.py +63 -0
  44. megadetector/classification/json_validator.py +699 -0
  45. megadetector/classification/map_classification_categories.py +276 -0
  46. megadetector/classification/merge_classification_detection_output.py +506 -0
  47. megadetector/classification/prepare_classification_script.py +194 -0
  48. megadetector/classification/prepare_classification_script_mc.py +228 -0
  49. megadetector/classification/run_classifier.py +287 -0
  50. megadetector/classification/save_mislabeled.py +110 -0
  51. megadetector/classification/train_classifier.py +827 -0
  52. megadetector/classification/train_classifier_tf.py +725 -0
  53. megadetector/classification/train_utils.py +323 -0
  54. megadetector/data_management/__init__.py +0 -0
  55. megadetector/data_management/annotations/__init__.py +0 -0
  56. megadetector/data_management/annotations/annotation_constants.py +34 -0
  57. megadetector/data_management/camtrap_dp_to_coco.py +237 -0
  58. megadetector/data_management/cct_json_utils.py +404 -0
  59. megadetector/data_management/cct_to_md.py +176 -0
  60. megadetector/data_management/cct_to_wi.py +289 -0
  61. megadetector/data_management/coco_to_labelme.py +283 -0
  62. megadetector/data_management/coco_to_yolo.py +662 -0
  63. megadetector/data_management/databases/__init__.py +0 -0
  64. megadetector/data_management/databases/add_width_and_height_to_db.py +33 -0
  65. megadetector/data_management/databases/combine_coco_camera_traps_files.py +206 -0
  66. megadetector/data_management/databases/integrity_check_json_db.py +493 -0
  67. megadetector/data_management/databases/subset_json_db.py +115 -0
  68. megadetector/data_management/generate_crops_from_cct.py +149 -0
  69. megadetector/data_management/get_image_sizes.py +189 -0
  70. megadetector/data_management/importers/add_nacti_sizes.py +52 -0
  71. megadetector/data_management/importers/add_timestamps_to_icct.py +79 -0
  72. megadetector/data_management/importers/animl_results_to_md_results.py +158 -0
  73. megadetector/data_management/importers/auckland_doc_test_to_json.py +373 -0
  74. megadetector/data_management/importers/auckland_doc_to_json.py +201 -0
  75. megadetector/data_management/importers/awc_to_json.py +191 -0
  76. megadetector/data_management/importers/bellevue_to_json.py +273 -0
  77. megadetector/data_management/importers/cacophony-thermal-importer.py +793 -0
  78. megadetector/data_management/importers/carrizo_shrubfree_2018.py +269 -0
  79. megadetector/data_management/importers/carrizo_trail_cam_2017.py +289 -0
  80. megadetector/data_management/importers/cct_field_adjustments.py +58 -0
  81. megadetector/data_management/importers/channel_islands_to_cct.py +913 -0
  82. megadetector/data_management/importers/eMammal/copy_and_unzip_emammal.py +180 -0
  83. megadetector/data_management/importers/eMammal/eMammal_helpers.py +249 -0
  84. megadetector/data_management/importers/eMammal/make_eMammal_json.py +223 -0
  85. megadetector/data_management/importers/ena24_to_json.py +276 -0
  86. megadetector/data_management/importers/filenames_to_json.py +386 -0
  87. megadetector/data_management/importers/helena_to_cct.py +283 -0
  88. megadetector/data_management/importers/idaho-camera-traps.py +1407 -0
  89. megadetector/data_management/importers/idfg_iwildcam_lila_prep.py +294 -0
  90. megadetector/data_management/importers/jb_csv_to_json.py +150 -0
  91. megadetector/data_management/importers/mcgill_to_json.py +250 -0
  92. megadetector/data_management/importers/missouri_to_json.py +490 -0
  93. megadetector/data_management/importers/nacti_fieldname_adjustments.py +79 -0
  94. megadetector/data_management/importers/noaa_seals_2019.py +181 -0
  95. megadetector/data_management/importers/pc_to_json.py +365 -0
  96. megadetector/data_management/importers/plot_wni_giraffes.py +123 -0
  97. megadetector/data_management/importers/prepare-noaa-fish-data-for-lila.py +359 -0
  98. megadetector/data_management/importers/prepare_zsl_imerit.py +131 -0
  99. megadetector/data_management/importers/rspb_to_json.py +356 -0
  100. megadetector/data_management/importers/save_the_elephants_survey_A.py +320 -0
  101. megadetector/data_management/importers/save_the_elephants_survey_B.py +329 -0
  102. megadetector/data_management/importers/snapshot_safari_importer.py +758 -0
  103. megadetector/data_management/importers/snapshot_safari_importer_reprise.py +665 -0
  104. megadetector/data_management/importers/snapshot_serengeti_lila.py +1067 -0
  105. megadetector/data_management/importers/snapshotserengeti/make_full_SS_json.py +150 -0
  106. megadetector/data_management/importers/snapshotserengeti/make_per_season_SS_json.py +153 -0
  107. megadetector/data_management/importers/sulross_get_exif.py +65 -0
  108. megadetector/data_management/importers/timelapse_csv_set_to_json.py +490 -0
  109. megadetector/data_management/importers/ubc_to_json.py +399 -0
  110. megadetector/data_management/importers/umn_to_json.py +507 -0
  111. megadetector/data_management/importers/wellington_to_json.py +263 -0
  112. megadetector/data_management/importers/wi_to_json.py +442 -0
  113. megadetector/data_management/importers/zamba_results_to_md_results.py +181 -0
  114. megadetector/data_management/labelme_to_coco.py +547 -0
  115. megadetector/data_management/labelme_to_yolo.py +272 -0
  116. megadetector/data_management/lila/__init__.py +0 -0
  117. megadetector/data_management/lila/add_locations_to_island_camera_traps.py +97 -0
  118. megadetector/data_management/lila/add_locations_to_nacti.py +147 -0
  119. megadetector/data_management/lila/create_lila_blank_set.py +558 -0
  120. megadetector/data_management/lila/create_lila_test_set.py +152 -0
  121. megadetector/data_management/lila/create_links_to_md_results_files.py +106 -0
  122. megadetector/data_management/lila/download_lila_subset.py +178 -0
  123. megadetector/data_management/lila/generate_lila_per_image_labels.py +516 -0
  124. megadetector/data_management/lila/get_lila_annotation_counts.py +170 -0
  125. megadetector/data_management/lila/get_lila_image_counts.py +112 -0
  126. megadetector/data_management/lila/lila_common.py +300 -0
  127. megadetector/data_management/lila/test_lila_metadata_urls.py +132 -0
  128. megadetector/data_management/ocr_tools.py +870 -0
  129. megadetector/data_management/read_exif.py +809 -0
  130. megadetector/data_management/remap_coco_categories.py +84 -0
  131. megadetector/data_management/remove_exif.py +66 -0
  132. megadetector/data_management/rename_images.py +187 -0
  133. megadetector/data_management/resize_coco_dataset.py +189 -0
  134. megadetector/data_management/wi_download_csv_to_coco.py +247 -0
  135. megadetector/data_management/yolo_output_to_md_output.py +446 -0
  136. megadetector/data_management/yolo_to_coco.py +676 -0
  137. megadetector/detection/__init__.py +0 -0
  138. megadetector/detection/detector_training/__init__.py +0 -0
  139. megadetector/detection/detector_training/model_main_tf2.py +114 -0
  140. megadetector/detection/process_video.py +846 -0
  141. megadetector/detection/pytorch_detector.py +355 -0
  142. megadetector/detection/run_detector.py +779 -0
  143. megadetector/detection/run_detector_batch.py +1219 -0
  144. megadetector/detection/run_inference_with_yolov5_val.py +1087 -0
  145. megadetector/detection/run_tiled_inference.py +934 -0
  146. megadetector/detection/tf_detector.py +192 -0
  147. megadetector/detection/video_utils.py +698 -0
  148. megadetector/postprocessing/__init__.py +0 -0
  149. megadetector/postprocessing/add_max_conf.py +64 -0
  150. megadetector/postprocessing/categorize_detections_by_size.py +165 -0
  151. megadetector/postprocessing/classification_postprocessing.py +716 -0
  152. megadetector/postprocessing/combine_api_outputs.py +249 -0
  153. megadetector/postprocessing/compare_batch_results.py +966 -0
  154. megadetector/postprocessing/convert_output_format.py +396 -0
  155. megadetector/postprocessing/load_api_results.py +195 -0
  156. megadetector/postprocessing/md_to_coco.py +310 -0
  157. megadetector/postprocessing/md_to_labelme.py +330 -0
  158. megadetector/postprocessing/merge_detections.py +412 -0
  159. megadetector/postprocessing/postprocess_batch_results.py +1908 -0
  160. megadetector/postprocessing/remap_detection_categories.py +170 -0
  161. megadetector/postprocessing/render_detection_confusion_matrix.py +660 -0
  162. megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +211 -0
  163. megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +83 -0
  164. megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +1635 -0
  165. megadetector/postprocessing/separate_detections_into_folders.py +730 -0
  166. megadetector/postprocessing/subset_json_detector_output.py +700 -0
  167. megadetector/postprocessing/top_folders_to_bottom.py +223 -0
  168. megadetector/taxonomy_mapping/__init__.py +0 -0
  169. megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +491 -0
  170. megadetector/taxonomy_mapping/map_new_lila_datasets.py +150 -0
  171. megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +142 -0
  172. megadetector/taxonomy_mapping/preview_lila_taxonomy.py +588 -0
  173. megadetector/taxonomy_mapping/retrieve_sample_image.py +71 -0
  174. megadetector/taxonomy_mapping/simple_image_download.py +219 -0
  175. megadetector/taxonomy_mapping/species_lookup.py +834 -0
  176. megadetector/taxonomy_mapping/taxonomy_csv_checker.py +159 -0
  177. megadetector/taxonomy_mapping/taxonomy_graph.py +346 -0
  178. megadetector/taxonomy_mapping/validate_lila_category_mappings.py +83 -0
  179. megadetector/utils/__init__.py +0 -0
  180. megadetector/utils/azure_utils.py +178 -0
  181. megadetector/utils/ct_utils.py +613 -0
  182. megadetector/utils/directory_listing.py +246 -0
  183. megadetector/utils/md_tests.py +1164 -0
  184. megadetector/utils/path_utils.py +1045 -0
  185. megadetector/utils/process_utils.py +160 -0
  186. megadetector/utils/sas_blob_utils.py +509 -0
  187. megadetector/utils/split_locations_into_train_val.py +228 -0
  188. megadetector/utils/string_utils.py +92 -0
  189. megadetector/utils/url_utils.py +323 -0
  190. megadetector/utils/write_html_image_list.py +225 -0
  191. megadetector/visualization/__init__.py +0 -0
  192. megadetector/visualization/plot_utils.py +293 -0
  193. megadetector/visualization/render_images_with_thumbnails.py +275 -0
  194. megadetector/visualization/visualization_utils.py +1536 -0
  195. megadetector/visualization/visualize_db.py +552 -0
  196. megadetector/visualization/visualize_detector_output.py +405 -0
  197. {megadetector-5.0.11.dist-info → megadetector-5.0.13.dist-info}/LICENSE +0 -0
  198. {megadetector-5.0.11.dist-info → megadetector-5.0.13.dist-info}/METADATA +2 -2
  199. megadetector-5.0.13.dist-info/RECORD +201 -0
  200. megadetector-5.0.13.dist-info/top_level.txt +1 -0
  201. megadetector-5.0.11.dist-info/RECORD +0 -5
  202. megadetector-5.0.11.dist-info/top_level.txt +0 -1
  203. {megadetector-5.0.11.dist-info → megadetector-5.0.13.dist-info}/WHEEL +0 -0
megadetector/api/synchronous/api_core/tests/load_test.py
@@ -0,0 +1,110 @@
+
+ import os
+ import json
+ import io
+ import random
+ import requests
+
+ from PIL import Image
+ from multiprocessing import Pool
+ from datetime import datetime
+ from requests_toolbelt import MultipartEncoder
+ from requests_toolbelt.multipart import decoder
+
+
+ ip_address = '100.100.200.200'
+ port = 5050
+
+ base_url = 'http://{}:{}/v1/camera-trap/sync/'.format(ip_address, port)
+
+
+ def call_api(args):
+     start = datetime.now()
+
+     index, url, params, data, headers = args['index'], args['url'], args['params'], args['data'], args['headers']
+     print('calling api: {} starttime: {}'.format(index, start))
+
+     response = requests.post(url, params=params, data=data, headers=headers)
+     elapsed_time = datetime.now() - start
+     print('\napi {} status code: {}, elapsed time in seconds {}'.format(index, response.status_code, elapsed_time.total_seconds()))
+
+     get_detections(response)
+     return response
+
+ def get_detections(response):
+     results = decoder.MultipartDecoder.from_response(response)
+     text_results = {}
+     images = {}
+     for part in results.parts:
+         # part is a BodyPart object with b'Content-Type' and b'Content-Disposition' headers; the latter includes 'name' and 'filename' info
+         headers = {}
+         for k, v in part.headers.items():
+             headers[k.decode(part.encoding)] = v.decode(part.encoding)
+
+         if headers.get('Content-Type', None) == 'application/json':
+             text_result = json.loads(part.content.decode())
+
+             print(text_result)
+
+
+ def test_load(num_requests, params, max_images=1):
+     requests = []
+
+     # read the images anew for each request
+     index = 0
+     for i in range(num_requests):
+         index += 1
+         files = {}
+         sample_input_dir = '../../../api/synchronous/sample_input/test_images'
+
+         image_files = os.listdir(sample_input_dir)
+         random.shuffle(image_files)
+
+         num_images = 0
+         for i, image_name in enumerate(image_files):
+             if not image_name.lower().endswith('.jpg'):
+                 continue
+
+             if num_images >= max_images:
+                 break
+             else:
+                 num_images += 1
+
+             img_path = os.path.join(sample_input_dir, image_name)
+             with open(img_path, 'rb') as f:
+                 content = f.read()
+             files[image_name] = (image_name, content, 'image/jpeg')
+
+         m = MultipartEncoder(fields=files)
+         args = {
+             'index': index,
+             'url': base_url + 'detect',
+             'params': params,
+             'data': m,
+             'headers': {'Content-Type': m.content_type}
+         }
+         requests.append(args)
+
+     print('starting', num_requests, 'threads...')
+     # images are read and included in each request by the time we call the API in map()
+     with Pool(num_requests) as pool:
+         results = pool.map(call_api, requests)
+
+     return results
+
+
+ if __name__ == "__main__":
+     params = {
+         'min_confidence': 0.05,
+         'min_rendering_confidence': 0.2,
+         'render': True
+     }
+
+     num_requests = 10
+     max_images = 1
+
+     start = datetime.now()
+     responses = test_load(num_requests, params, max_images=max_images)
+     end = datetime.now()
+     total_time = end - start
+     print('Total time for {} requests: {}'.format(num_requests, total_time))
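For reference, a single request against the synchronous API above can be issued without the multiprocessing pool. The sketch below is a minimal, non-authoritative example assuming the same base_url, 'detect' endpoint, parameter names, and multipart response handling used in load_test.py; the image path is hypothetical.

    import json
    import requests
    from requests_toolbelt import MultipartEncoder
    from requests_toolbelt.multipart import decoder

    base_url = 'http://100.100.200.200:5050/v1/camera-trap/sync/'

    # attach one image as a multipart field: (filename, bytes, content type)
    with open('test_images/example.jpg', 'rb') as f:  # hypothetical image path
        fields = {'example.jpg': ('example.jpg', f.read(), 'image/jpeg')}
    m = MultipartEncoder(fields=fields)

    params = {'min_confidence': 0.05, 'min_rendering_confidence': 0.2, 'render': True}
    response = requests.post(base_url + 'detect', params=params, data=m,
                             headers={'Content-Type': m.content_type})

    # the response is multipart; JSON parts carry the detection results
    for part in decoder.MultipartDecoder.from_response(response).parts:
        headers = {k.decode(part.encoding): v.decode(part.encoding)
                   for k, v in part.headers.items()}
        if headers.get('Content-Type') == 'application/json':
            print(json.loads(part.content.decode()))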
File without changes
megadetector/classification/aggregate_classifier_probs.py
@@ -0,0 +1,108 @@
+ """
+
+ aggregate_classifier_probs.py
+
+ Aggregate probabilities from a classifier's outputs according to a mapping
+ from the desired (target) categories to the classifier's categories.
+
+ Using the mapping, create a new version of the classifier output CSV with
+ probabilities summed within each target category. Also output a new
+ "index-to-name" JSON file which identifies the sequential order of the target
+ categories.
+
+ """
+
+ #%% Imports
+
+ from __future__ import annotations
+
+ import argparse
+ import json
+
+ import pandas as pd
+ from tqdm import tqdm
+
+ #%% Example usage
+
+ """
+ python aggregate_classifier_probs.py \
+     classifier_output.csv.gz \
+     --target-mapping target_to_classifier_labels.json \
+     --output-csv classifier_output_remapped.csv.gz \
+     --output-label-index label_index_remapped.json
+ """
+
+ #%% Main function
+
+ def main(classifier_results_csv_path: str,
+          target_mapping_json_path: str,
+          output_csv_path: str,
+          output_label_index_json_path: str) -> None:
+     """
+     Main function.
+
+     Because the output CSV is often very large, we process it in chunks of 1000
+     rows at a time.
+     """
+
+     chunked_df_iterator = pd.read_csv(
+         classifier_results_csv_path, chunksize=1000, float_precision='high',
+         index_col='path')
+
+     with open(target_mapping_json_path, 'r') as f:
+         target_mapping = json.load(f)
+     target_names = sorted(target_mapping.keys())
+
+     all_classifier_labels: set[str] = set()
+     for classifier_labels in target_mapping.values():
+         assert all_classifier_labels.isdisjoint(classifier_labels)
+         all_classifier_labels.update(classifier_labels)
+
+     for i, chunk_df in tqdm(enumerate(chunked_df_iterator)):
+         if i == 0:
+             assert set(chunk_df.columns) == all_classifier_labels
+             header, mode = True, 'w'
+         else:
+             header, mode = False, 'a'
+
+         agg_df = pd.DataFrame(
+             data=0., index=chunk_df.index, columns=target_names)
+         for target in target_names:
+             classifier_labels = target_mapping[target]
+             agg_df[target] = chunk_df[classifier_labels].sum(axis=1)
+
+         agg_df.to_csv(output_csv_path, index=True, header=header, mode=mode)
+
+     with open(output_label_index_json_path, 'w') as f:
+         json.dump(dict(enumerate(target_names)), f, indent=1)
+
+
+ #%% Command-line driver
+
+ def _parse_args() -> argparse.Namespace:
+
+     parser = argparse.ArgumentParser(
+         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+         description='Aggregate classifier probabilities to target classes.')
+     parser.add_argument(
+         'classifier_results_csv',
+         help='path to CSV with classifier probabilities')
+     parser.add_argument(
+         '-t', '--target-mapping', required=True,
+         help='path to JSON file mapping target categories to classifier labels')
+     parser.add_argument(
+         '-o', '--output-csv', required=True,
+         help='path to save output CSV with aggregated probabilities')
+     parser.add_argument(
+         '-i', '--output-label-index', required=True,
+         help='path to save output label index JSON')
+     return parser.parse_args()
+
+
+ if __name__ == '__main__':
+
+     args = _parse_args()
+     main(classifier_results_csv_path=args.classifier_results_csv,
+          target_mapping_json_path=args.target_mapping,
+          output_csv_path=args.output_csv,
+          output_label_index_json_path=args.output_label_index)
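To make the aggregation concrete, the sketch below walks through one chunk of main() with hypothetical category names; the target mapping dict stands in for the file passed as --target-mapping, and the index-to-name dict mirrors what main() writes at the end.

    import json
    import pandas as pd

    # hypothetical target-to-classifier-label mapping (the --target-mapping JSON);
    # classifier labels may not be shared between targets (enforced by the isdisjoint assert)
    target_mapping = {
        'deer': ['white_tailed_deer', 'mule_deer'],
        'bird': ['crow', 'raven'],
    }

    # a tiny stand-in for one chunk of the classifier output CSV, indexed by 'path',
    # with one probability column per classifier label
    chunk_df = pd.DataFrame(
        {'white_tailed_deer': [0.6, 0.1], 'mule_deer': [0.2, 0.1],
         'crow': [0.1, 0.5], 'raven': [0.1, 0.3]},
        index=pd.Index(['img1.jpg', 'img2.jpg'], name='path'))

    # probabilities are summed within each target category, as in main()
    target_names = sorted(target_mapping.keys())
    agg_df = pd.DataFrame(data=0., index=chunk_df.index, columns=target_names)
    for target in target_names:
        agg_df[target] = chunk_df[target_mapping[target]].sum(axis=1)

    print(agg_df)                                      # bird/deer columns; each row still sums to 1.0
    print(json.dumps(dict(enumerate(target_names))))   # index-to-name JSON: {"0": "bird", "1": "deer"}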
megadetector/classification/analyze_failed_images.py
@@ -0,0 +1,227 @@
+ """
+
+ analyze_failed_images.py
+
+ """
+
+ #%% Imports and constants
+
+ import argparse
+ from collections.abc import Mapping, Sequence
+ from concurrent import futures
+ import json
+ from pprint import pprint
+ import threading
+ from typing import Any, Optional
+
+ from PIL import Image, ImageFile
+ import requests
+ from tqdm import tqdm
+
+ from megadetector.data_management.megadb.megadb_utils import MegadbUtils
+ from megadetector.utils import path_utils
+ from megadetector.utils import sas_blob_utils
+
+
+ #%% Example usage
+
+ """
+ python analyze_failed_images.py failed.json \
+     -a ACCOUNT -c CONTAINER -s SAS_TOKEN
+ """
+
+ ImageFile.LOAD_TRUNCATED_IMAGES = False
+
+
+ #%% Support functions
+
+ def check_image_condition(img_path: str,
+                           truncated_images_lock: threading.Lock,
+                           account: Optional[str] = None,
+                           container: Optional[str] = None,
+                           sas_token: Optional[str] = None,
+                           datasets_table: Optional[Mapping[str, Any]] = None
+                           ) -> tuple[str, str]:
+     """
+     Args:
+         img_path: str, either <blob_name> if datasets_table is None, or
+             <dataset>/<blob_name> if datasets_table is given
+         account: str, name of Azure Blob Storage account
+         container: str, name of Azure Blob Storage container
+         sas_token: str, optional SAS token (without leading '?') if the
+             container is not publicly accessible
+         datasets_table: dict, maps dataset name to dict of information
+
+     Returns: (img_file, status) tuple, where status is one of
+         'nonexistent': blob does not exist in the container
+         'non_image': img_file does not have valid file extension
+         'good': image exists and is able to be opened without setting
+             ImageFile.LOAD_TRUNCATED_IMAGES=True
+         'truncated': image exists but can only be opened by setting
+             ImageFile.LOAD_TRUNCATED_IMAGES=True
+         'bad': image exists, but cannot be opened even when setting
+             ImageFile.LOAD_TRUNCATED_IMAGES=True
+     """
+
+     if (account is None) or (container is None) or (datasets_table is not None):
+         assert account is None
+         assert container is None
+         assert sas_token is None
+         assert datasets_table is not None
+
+         dataset, img_file = img_path.split('/', maxsplit=1)
+         account = datasets_table[dataset]['storage_account']
+         container = datasets_table[dataset]['container']
+         sas_token = datasets_table[dataset]['container_sas_key']
+         if sas_token[0] == '?':  # strip leading '?' from SAS token
+             sas_token = sas_token[1:]
+     else:
+         img_file = img_path
+
+     if not path_utils.is_image_file(img_file):
+         return img_file, 'non_image'
+
+     blob_url = sas_blob_utils.build_azure_storage_uri(
+         account=account, container=container, sas_token=sas_token,
+         blob=img_file)
+     blob_exists = sas_blob_utils.check_blob_exists(blob_url)
+     if not blob_exists:
+         return img_file, 'nonexistent'
+
+     stream, _ = sas_blob_utils.download_blob_to_stream(blob_url)
+     stream.seek(0)
+     try:
+         with truncated_images_lock:
+             ImageFile.LOAD_TRUNCATED_IMAGES = False
+             with Image.open(stream) as img:
+                 img.load()
+             return img_file, 'good'
+     except OSError:  # PIL.UnidentifiedImageError is a subclass of OSError
+         try:
+             stream.seek(0)
+             with truncated_images_lock:
+                 ImageFile.LOAD_TRUNCATED_IMAGES = True
+                 with Image.open(stream) as img:
+                     img.load()
+                 return img_file, 'truncated'
+         except Exception as e:  # pylint: disable=broad-except
+             exception_type = type(e).__name__
+             tqdm.write(f'Unable to load {img_file}. {exception_type}: {e}.')
+             return img_file, 'bad'
+
+
+ #%% Main function
+
+ def analyze_images(url_or_path: str, json_keys: Optional[Sequence[str]] = None,
+                    account: Optional[str] = None,
+                    container: Optional[str] = None,
+                    sas_token: Optional[str] = None) -> None:
+     """
+     Args:
+         url_or_path: str, URL or local path to a file containing a list
+             of image paths. Each image path is either <blob_name> if account and
+             container are given, or <dataset>/<blob_name> if account and
+             container are None. File can either be a list of image paths, or a
+             JSON file containing image paths.
+         json_keys: optional list of str, only relevant if url_or_path is a JSON
+             file. If json_keys=None, then the JSON file at url_or_path is
+             assumed to be a JSON list of image paths. If json_keys is not None,
+             then the JSON file should be a dict, whose values corresponding to
+             json_keys are lists of image paths.
+         account: str, name of Azure Blob Storage account
+         container: str, name of Azure Blob Storage container
+         sas_token: str, optional SAS token (without leading '?') if the
+             container is not publicly accessible
+     """
+
+     datasets_table = None
+     if (account is None) or (container is None):
+         assert account is None
+         assert container is None
+         assert sas_token is None
+         datasets_table = MegadbUtils().get_datasets_table()
+
+     is_json = ('.json' in url_or_path)
+     if url_or_path.startswith(('http://', 'https://')):
+         r = requests.get(url_or_path)
+         if is_json:
+             img_paths = r.json()
+         else:
+             img_paths = r.text.splitlines()
+     else:
+         with open(url_or_path, 'r') as f:
+             if is_json:
+                 img_paths = json.load(f)
+             else:
+                 img_paths = f.readlines()
+
+     if is_json and json_keys is not None:
+         img_paths_json = img_paths
+         img_paths = []
+         for k in json_keys:
+             img_paths += img_paths_json[k]
+
+     mapping: dict[str, list[str]] = {
+         status: []
+         for status in ['good', 'nonexistent', 'non_image', 'truncated', 'bad']
+     }
+
+     pool = futures.ThreadPoolExecutor(max_workers=100)
+
+     # lock before changing ImageFile.LOAD_TRUNCATED_IMAGES
+     truncated_images_lock = threading.Lock()
+
+     futures_list = []
+     for img_path in tqdm(img_paths):
+         future = pool.submit(
+             check_image_condition, img_path, truncated_images_lock, account,
+             container, sas_token, datasets_table)
+         futures_list.append(future)
+
+     total = len(futures_list)
+     for future in tqdm(futures.as_completed(futures_list), total=total):
+         img_file, status = future.result()
+         mapping[status].append(img_file)
+
+     for status, img_list in mapping.items():
+         print(f'{status}: {len(img_list)}')
+         pprint(sorted(img_list))
+
+
+ #%% Command-line driver
+
+ def _parse_args() -> argparse.Namespace:
+
+     parser = argparse.ArgumentParser(
+         description='Analyze a list of images that failed to download or crop.')
+     parser.add_argument(
+         'failed_images', metavar='URL_OR_PATH',
+         help='URL or path to text or JSON file containing list of image paths')
+     parser.add_argument(
+         '-k', '--json-keys', nargs='*',
+         help='list of keys in JSON file containing image paths')
+     parser.add_argument(
+         '-a', '--account',
+         help='name of Azure Blob Storage account. If not given, then image '
+              'paths are assumed to start with the dataset name, so we can look '
+              'up the account from MegaDB.')
+     parser.add_argument(
+         '-c', '--container',
+         help='name of Azure Blob Storage container. If not given, then image '
+              'paths are assumed to start with the dataset name, so we can look '
+              'up the container from MegaDB.')
+     parser.add_argument(
+         '-s', '--sas-token',
+         help='optional SAS token (without leading "?") if the container is not '
+              'publicly accessible. If account and container not given, then '
+              'image paths are assumed to start with the dataset name, so we '
+              'can look up the SAS Token from MegaDB.')
+     return parser.parse_args()
+
+
+ if __name__ == '__main__':
+
+     args = _parse_args()
+     analyze_images(url_or_path=args.failed_images, json_keys=args.json_keys,
+                    account=args.account, container=args.container,
+                    sas_token=args.sas_token)
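The good/truncated/bad distinction made by check_image_condition() can be reproduced on a local file without the Azure blob plumbing or the thread lock; a minimal single-threaded sketch, assuming a hypothetical local path:

    from PIL import Image, ImageFile

    def classify_local_image(path):
        # mirror check_image_condition: first a strict load, then retry allowing truncated files
        try:
            ImageFile.LOAD_TRUNCATED_IMAGES = False
            with Image.open(path) as img:
                img.load()
            return 'good'
        except OSError:  # PIL.UnidentifiedImageError is a subclass of OSError
            try:
                ImageFile.LOAD_TRUNCATED_IMAGES = True
                with Image.open(path) as img:
                    img.load()
                return 'truncated'
            except Exception:
                return 'bad'
            finally:
                ImageFile.LOAD_TRUNCATED_IMAGES = False

    print(classify_local_image('example.jpg'))  # hypothetical path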
megadetector/classification/cache_batchapi_outputs.py
@@ -0,0 +1,198 @@
+ """
+
+ cache_batchapi_outputs.py
+
+ Script to cache Batch Detection API outputs.
+
+ This script can handle either the Batch Detection API JSON Response or the
+ detections JSON.
+
+ Batch Detection API Response format:
+
+ {
+     "Status": {
+         "request_status": "completed",
+         "message": {
+             "num_failed_shards": 0,
+             "output_file_urls": {
+                 "detections": "https://url/to/detections.json",
+                 "failed_images": "https://url/to/failed_images.json",
+                 "images": "https://url/to/images.json"
+             }
+         }
+     },
+     "Endpoint": "/v3/camera-trap/detection-batch/request_detections",
+     "TaskId": "ea26326e-7e0d-4524-a9ea-f57a5799d4ba"
+ }
+
+ Detections JSON format:
+
+ {
+     "info": {...},
+     "detection_categories": {...},
+     "classification_categories": {...},
+     "images": [
+         {
+             "file": "path/from/base/dir/image1.jpg",
+             "max_detection_conf": 0.926,
+             "detections": [{
+                 "category": "1",
+                 "conf": 0.061,
+                 "bbox": [0.0451, 0.1849, 0.3642, 0.4636]
+             }]
+         }
+     ]
+ }
+
+ Batch Detection API Output Format:
+
+ github.com/agentmorris/MegaDetector/tree/main/megadetector/api/batch_processing#api-outputs
+
+ """
+
+ #%% Imports
+
+ from __future__ import annotations
+
+ import argparse
+ from collections.abc import Mapping
+ import json
+ import os
+ from typing import Any, Optional
+
+ import requests
+
+ from api.batch_processing.data_preparation.prepare_api_submission import (
+     TaskStatus, Task)
+ from api.batch_processing.postprocessing.combine_api_outputs import (
+     combine_api_output_dictionaries)
+
+
+ #%% Support functions
+
+ def cache_json(json_path: str,
+                is_detections: bool,
+                dataset: str,
+                detector_output_cache_base_dir: str,
+                detector_version: Optional[str]) -> None:
+     """
+     Args:
+         json_path: str, path to JSON file
+         is_detections: bool, True if <json_path> is a detections JSON file,
+             False if <json_path> is an API response JSON file
+         dataset: str
+         detector_output_cache_base_dir: str
+         detector_version: str
+     """
+
+     with open(json_path, 'r') as f:
+         js = json.load(f)
+
+     if is_detections:
+         detections = js
+
+     else:
+         response = js
+
+         # task finished successfully
+         status = TaskStatus(response['Status']['request_status'])
+         assert status == TaskStatus.COMPLETED
+
+         # parse the task ID
+         task_id = response['TaskId']
+
+         message = response['Status']['message']
+         detections_url = message['output_file_urls']['detections']
+         assert detections_url.split('/')[-2] == task_id
+
+         # print info about missing and failed images
+         task = Task(name=task_id, task_id=task_id)
+         task.response = response
+         task.status = status
+         task.get_missing_images(verbose=True)
+
+         # get the detections
+         detections = requests.get(detections_url).json()
+
+     # add detections to the detections cache
+     api_det_version = detections['info']['detector'].rsplit('v', maxsplit=1)[1]
+     if detector_version is not None:
+         assert api_det_version == detector_version
+     detector_output_cache_dir = os.path.join(
+         detector_output_cache_base_dir, f'v{api_det_version}')
+     msg = cache_detections(
+         detections=detections, dataset=dataset,
+         detector_output_cache_dir=detector_output_cache_dir)
+     print(msg)
+
+
+ def cache_detections(detections: Mapping[str, Any], dataset: str,
+                      detector_output_cache_dir: str) -> str:
+     """
+     Args:
+         detections: dict, represents JSON output of detector
+         dataset: str, name of dataset
+         detector_output_cache_dir: str, path to folder where detector outputs
+             are cached, stored as 1 JSON file per dataset, directory must
+             already exist
+
+     Returns: str, message
+     """
+
+     # combine detections with cache
+     dataset_cache_path = os.path.join(
+         detector_output_cache_dir, f'{dataset}.json')
+     merged_dataset_cache: Mapping[str, Any]
+     if os.path.exists(dataset_cache_path):
+         with open(dataset_cache_path, 'r') as f:
+             dataset_cache = json.load(f)
+         merged_dataset_cache = combine_api_output_dictionaries(
+             input_dicts=[dataset_cache, detections], require_uniqueness=False)
+         msg = f'Merging detection output with {dataset_cache_path}'
+     else:
+         merged_dataset_cache = detections
+         msg = ('No cached detection outputs found. Saving detection output to '
+                f'{dataset_cache_path}')
+
+     # write combined detections back out to cache
+     with open(dataset_cache_path, 'w') as f:
+         json.dump(merged_dataset_cache, f, indent=1)
+     return msg
+
+
+ #%% Command-line driver
+
+ def _parse_args() -> argparse.Namespace:
+
+     parser = argparse.ArgumentParser(
+         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+         description='Caches detector outputs.')
+     parser.add_argument(
+         'json_file',
+         help='path to JSON file containing response of Batch Detection API')
+     parser.add_argument(
+         '-f', '--format', choices=['response', 'detections'], required=True,
+         help='(required) whether <json_file> is a Batch API response or a '
+              'detections JSON file')
+     parser.add_argument(
+         '-d', '--dataset', required=True,
+         help='(required) name of dataset corresponding to the API task')
+     parser.add_argument(
+         '-c', '--detector-output-cache-dir', required=True,
+         help='(required) path to directory where detector outputs are cached')
+     parser.add_argument(
+         '-v', '--detector-version',
+         help='detector version string, e.g., "4.1", inferred from detections '
+              'file if not given')
+     return parser.parse_args()
+
+
+ if __name__ == '__main__':
+
+     args = _parse_args()
+     cache_json(
+         json_path=args.json_file,
+         is_detections=(args.format == 'detections'),
+         dataset=args.dataset,
+         detector_output_cache_base_dir=args.detector_output_cache_dir,
+         detector_version=args.detector_version)
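Following the "#%% Example usage" convention used by the other scripts in this release, a hedged invocation of cache_batchapi_outputs.py in 'detections' mode; the file names and cache directory are hypothetical, and the flags are the ones defined in _parse_args() above.

    python cache_batchapi_outputs.py detections.json \
        --format detections \
        --dataset my_dataset \
        --detector-output-cache-dir /path/to/detector_output_cache \
        --detector-version 4.1

With these arguments, cache_detections() merges the new results into (or creates) /path/to/detector_output_cache/v4.1/my_dataset.json, keeping one cached JSON file per dataset.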