megadetector-5.0.10-py3-none-any.whl → megadetector-5.0.11-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (226)
  1. {megadetector-5.0.10.dist-info → megadetector-5.0.11.dist-info}/LICENSE +0 -0
  2. {megadetector-5.0.10.dist-info → megadetector-5.0.11.dist-info}/METADATA +12 -11
  3. megadetector-5.0.11.dist-info/RECORD +5 -0
  4. megadetector-5.0.11.dist-info/top_level.txt +1 -0
  5. api/__init__.py +0 -0
  6. api/batch_processing/__init__.py +0 -0
  7. api/batch_processing/api_core/__init__.py +0 -0
  8. api/batch_processing/api_core/batch_service/__init__.py +0 -0
  9. api/batch_processing/api_core/batch_service/score.py +0 -439
  10. api/batch_processing/api_core/server.py +0 -294
  11. api/batch_processing/api_core/server_api_config.py +0 -98
  12. api/batch_processing/api_core/server_app_config.py +0 -55
  13. api/batch_processing/api_core/server_batch_job_manager.py +0 -220
  14. api/batch_processing/api_core/server_job_status_table.py +0 -152
  15. api/batch_processing/api_core/server_orchestration.py +0 -360
  16. api/batch_processing/api_core/server_utils.py +0 -92
  17. api/batch_processing/api_core_support/__init__.py +0 -0
  18. api/batch_processing/api_core_support/aggregate_results_manually.py +0 -46
  19. api/batch_processing/api_support/__init__.py +0 -0
  20. api/batch_processing/api_support/summarize_daily_activity.py +0 -152
  21. api/batch_processing/data_preparation/__init__.py +0 -0
  22. api/batch_processing/data_preparation/manage_local_batch.py +0 -2391
  23. api/batch_processing/data_preparation/manage_video_batch.py +0 -327
  24. api/batch_processing/integration/digiKam/setup.py +0 -6
  25. api/batch_processing/integration/digiKam/xmp_integration.py +0 -465
  26. api/batch_processing/integration/eMammal/test_scripts/config_template.py +0 -5
  27. api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +0 -126
  28. api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +0 -55
  29. api/batch_processing/postprocessing/__init__.py +0 -0
  30. api/batch_processing/postprocessing/add_max_conf.py +0 -64
  31. api/batch_processing/postprocessing/categorize_detections_by_size.py +0 -163
  32. api/batch_processing/postprocessing/combine_api_outputs.py +0 -249
  33. api/batch_processing/postprocessing/compare_batch_results.py +0 -958
  34. api/batch_processing/postprocessing/convert_output_format.py +0 -397
  35. api/batch_processing/postprocessing/load_api_results.py +0 -195
  36. api/batch_processing/postprocessing/md_to_coco.py +0 -310
  37. api/batch_processing/postprocessing/md_to_labelme.py +0 -330
  38. api/batch_processing/postprocessing/merge_detections.py +0 -401
  39. api/batch_processing/postprocessing/postprocess_batch_results.py +0 -1904
  40. api/batch_processing/postprocessing/remap_detection_categories.py +0 -170
  41. api/batch_processing/postprocessing/render_detection_confusion_matrix.py +0 -661
  42. api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +0 -211
  43. api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +0 -82
  44. api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +0 -1631
  45. api/batch_processing/postprocessing/separate_detections_into_folders.py +0 -731
  46. api/batch_processing/postprocessing/subset_json_detector_output.py +0 -696
  47. api/batch_processing/postprocessing/top_folders_to_bottom.py +0 -223
  48. api/synchronous/__init__.py +0 -0
  49. api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
  50. api/synchronous/api_core/animal_detection_api/api_backend.py +0 -152
  51. api/synchronous/api_core/animal_detection_api/api_frontend.py +0 -266
  52. api/synchronous/api_core/animal_detection_api/config.py +0 -35
  53. api/synchronous/api_core/animal_detection_api/data_management/annotations/annotation_constants.py +0 -47
  54. api/synchronous/api_core/animal_detection_api/detection/detector_training/copy_checkpoints.py +0 -43
  55. api/synchronous/api_core/animal_detection_api/detection/detector_training/model_main_tf2.py +0 -114
  56. api/synchronous/api_core/animal_detection_api/detection/process_video.py +0 -543
  57. api/synchronous/api_core/animal_detection_api/detection/pytorch_detector.py +0 -304
  58. api/synchronous/api_core/animal_detection_api/detection/run_detector.py +0 -627
  59. api/synchronous/api_core/animal_detection_api/detection/run_detector_batch.py +0 -1029
  60. api/synchronous/api_core/animal_detection_api/detection/run_inference_with_yolov5_val.py +0 -581
  61. api/synchronous/api_core/animal_detection_api/detection/run_tiled_inference.py +0 -754
  62. api/synchronous/api_core/animal_detection_api/detection/tf_detector.py +0 -165
  63. api/synchronous/api_core/animal_detection_api/detection/video_utils.py +0 -495
  64. api/synchronous/api_core/animal_detection_api/md_utils/azure_utils.py +0 -174
  65. api/synchronous/api_core/animal_detection_api/md_utils/ct_utils.py +0 -262
  66. api/synchronous/api_core/animal_detection_api/md_utils/directory_listing.py +0 -251
  67. api/synchronous/api_core/animal_detection_api/md_utils/matlab_porting_tools.py +0 -97
  68. api/synchronous/api_core/animal_detection_api/md_utils/path_utils.py +0 -416
  69. api/synchronous/api_core/animal_detection_api/md_utils/process_utils.py +0 -110
  70. api/synchronous/api_core/animal_detection_api/md_utils/sas_blob_utils.py +0 -509
  71. api/synchronous/api_core/animal_detection_api/md_utils/string_utils.py +0 -59
  72. api/synchronous/api_core/animal_detection_api/md_utils/url_utils.py +0 -144
  73. api/synchronous/api_core/animal_detection_api/md_utils/write_html_image_list.py +0 -226
  74. api/synchronous/api_core/animal_detection_api/md_visualization/visualization_utils.py +0 -841
  75. api/synchronous/api_core/tests/__init__.py +0 -0
  76. api/synchronous/api_core/tests/load_test.py +0 -110
  77. classification/__init__.py +0 -0
  78. classification/aggregate_classifier_probs.py +0 -108
  79. classification/analyze_failed_images.py +0 -227
  80. classification/cache_batchapi_outputs.py +0 -198
  81. classification/create_classification_dataset.py +0 -627
  82. classification/crop_detections.py +0 -516
  83. classification/csv_to_json.py +0 -226
  84. classification/detect_and_crop.py +0 -855
  85. classification/efficientnet/__init__.py +0 -9
  86. classification/efficientnet/model.py +0 -415
  87. classification/efficientnet/utils.py +0 -610
  88. classification/evaluate_model.py +0 -520
  89. classification/identify_mislabeled_candidates.py +0 -152
  90. classification/json_to_azcopy_list.py +0 -63
  91. classification/json_validator.py +0 -695
  92. classification/map_classification_categories.py +0 -276
  93. classification/merge_classification_detection_output.py +0 -506
  94. classification/prepare_classification_script.py +0 -194
  95. classification/prepare_classification_script_mc.py +0 -228
  96. classification/run_classifier.py +0 -286
  97. classification/save_mislabeled.py +0 -110
  98. classification/train_classifier.py +0 -825
  99. classification/train_classifier_tf.py +0 -724
  100. classification/train_utils.py +0 -322
  101. data_management/__init__.py +0 -0
  102. data_management/annotations/__init__.py +0 -0
  103. data_management/annotations/annotation_constants.py +0 -34
  104. data_management/camtrap_dp_to_coco.py +0 -238
  105. data_management/cct_json_utils.py +0 -395
  106. data_management/cct_to_md.py +0 -176
  107. data_management/cct_to_wi.py +0 -289
  108. data_management/coco_to_labelme.py +0 -272
  109. data_management/coco_to_yolo.py +0 -662
  110. data_management/databases/__init__.py +0 -0
  111. data_management/databases/add_width_and_height_to_db.py +0 -33
  112. data_management/databases/combine_coco_camera_traps_files.py +0 -206
  113. data_management/databases/integrity_check_json_db.py +0 -477
  114. data_management/databases/subset_json_db.py +0 -115
  115. data_management/generate_crops_from_cct.py +0 -149
  116. data_management/get_image_sizes.py +0 -188
  117. data_management/importers/add_nacti_sizes.py +0 -52
  118. data_management/importers/add_timestamps_to_icct.py +0 -79
  119. data_management/importers/animl_results_to_md_results.py +0 -158
  120. data_management/importers/auckland_doc_test_to_json.py +0 -372
  121. data_management/importers/auckland_doc_to_json.py +0 -200
  122. data_management/importers/awc_to_json.py +0 -189
  123. data_management/importers/bellevue_to_json.py +0 -273
  124. data_management/importers/cacophony-thermal-importer.py +0 -796
  125. data_management/importers/carrizo_shrubfree_2018.py +0 -268
  126. data_management/importers/carrizo_trail_cam_2017.py +0 -287
  127. data_management/importers/cct_field_adjustments.py +0 -57
  128. data_management/importers/channel_islands_to_cct.py +0 -913
  129. data_management/importers/eMammal/copy_and_unzip_emammal.py +0 -180
  130. data_management/importers/eMammal/eMammal_helpers.py +0 -249
  131. data_management/importers/eMammal/make_eMammal_json.py +0 -223
  132. data_management/importers/ena24_to_json.py +0 -275
  133. data_management/importers/filenames_to_json.py +0 -385
  134. data_management/importers/helena_to_cct.py +0 -282
  135. data_management/importers/idaho-camera-traps.py +0 -1407
  136. data_management/importers/idfg_iwildcam_lila_prep.py +0 -294
  137. data_management/importers/jb_csv_to_json.py +0 -150
  138. data_management/importers/mcgill_to_json.py +0 -250
  139. data_management/importers/missouri_to_json.py +0 -489
  140. data_management/importers/nacti_fieldname_adjustments.py +0 -79
  141. data_management/importers/noaa_seals_2019.py +0 -181
  142. data_management/importers/pc_to_json.py +0 -365
  143. data_management/importers/plot_wni_giraffes.py +0 -123
  144. data_management/importers/prepare-noaa-fish-data-for-lila.py +0 -359
  145. data_management/importers/prepare_zsl_imerit.py +0 -131
  146. data_management/importers/rspb_to_json.py +0 -356
  147. data_management/importers/save_the_elephants_survey_A.py +0 -320
  148. data_management/importers/save_the_elephants_survey_B.py +0 -332
  149. data_management/importers/snapshot_safari_importer.py +0 -758
  150. data_management/importers/snapshot_safari_importer_reprise.py +0 -665
  151. data_management/importers/snapshot_serengeti_lila.py +0 -1067
  152. data_management/importers/snapshotserengeti/make_full_SS_json.py +0 -150
  153. data_management/importers/snapshotserengeti/make_per_season_SS_json.py +0 -153
  154. data_management/importers/sulross_get_exif.py +0 -65
  155. data_management/importers/timelapse_csv_set_to_json.py +0 -490
  156. data_management/importers/ubc_to_json.py +0 -399
  157. data_management/importers/umn_to_json.py +0 -507
  158. data_management/importers/wellington_to_json.py +0 -263
  159. data_management/importers/wi_to_json.py +0 -441
  160. data_management/importers/zamba_results_to_md_results.py +0 -181
  161. data_management/labelme_to_coco.py +0 -548
  162. data_management/labelme_to_yolo.py +0 -272
  163. data_management/lila/__init__.py +0 -0
  164. data_management/lila/add_locations_to_island_camera_traps.py +0 -97
  165. data_management/lila/add_locations_to_nacti.py +0 -147
  166. data_management/lila/create_lila_blank_set.py +0 -557
  167. data_management/lila/create_lila_test_set.py +0 -151
  168. data_management/lila/create_links_to_md_results_files.py +0 -106
  169. data_management/lila/download_lila_subset.py +0 -177
  170. data_management/lila/generate_lila_per_image_labels.py +0 -515
  171. data_management/lila/get_lila_annotation_counts.py +0 -170
  172. data_management/lila/get_lila_image_counts.py +0 -111
  173. data_management/lila/lila_common.py +0 -300
  174. data_management/lila/test_lila_metadata_urls.py +0 -132
  175. data_management/ocr_tools.py +0 -874
  176. data_management/read_exif.py +0 -681
  177. data_management/remap_coco_categories.py +0 -84
  178. data_management/remove_exif.py +0 -66
  179. data_management/resize_coco_dataset.py +0 -189
  180. data_management/wi_download_csv_to_coco.py +0 -246
  181. data_management/yolo_output_to_md_output.py +0 -441
  182. data_management/yolo_to_coco.py +0 -676
  183. detection/__init__.py +0 -0
  184. detection/detector_training/__init__.py +0 -0
  185. detection/detector_training/model_main_tf2.py +0 -114
  186. detection/process_video.py +0 -703
  187. detection/pytorch_detector.py +0 -337
  188. detection/run_detector.py +0 -779
  189. detection/run_detector_batch.py +0 -1219
  190. detection/run_inference_with_yolov5_val.py +0 -917
  191. detection/run_tiled_inference.py +0 -935
  192. detection/tf_detector.py +0 -188
  193. detection/video_utils.py +0 -606
  194. docs/source/conf.py +0 -43
  195. md_utils/__init__.py +0 -0
  196. md_utils/azure_utils.py +0 -174
  197. md_utils/ct_utils.py +0 -612
  198. md_utils/directory_listing.py +0 -246
  199. md_utils/md_tests.py +0 -968
  200. md_utils/path_utils.py +0 -1044
  201. md_utils/process_utils.py +0 -157
  202. md_utils/sas_blob_utils.py +0 -509
  203. md_utils/split_locations_into_train_val.py +0 -228
  204. md_utils/string_utils.py +0 -92
  205. md_utils/url_utils.py +0 -323
  206. md_utils/write_html_image_list.py +0 -225
  207. md_visualization/__init__.py +0 -0
  208. md_visualization/plot_utils.py +0 -293
  209. md_visualization/render_images_with_thumbnails.py +0 -275
  210. md_visualization/visualization_utils.py +0 -1537
  211. md_visualization/visualize_db.py +0 -551
  212. md_visualization/visualize_detector_output.py +0 -406
  213. megadetector-5.0.10.dist-info/RECORD +0 -224
  214. megadetector-5.0.10.dist-info/top_level.txt +0 -8
  215. taxonomy_mapping/__init__.py +0 -0
  216. taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +0 -491
  217. taxonomy_mapping/map_new_lila_datasets.py +0 -154
  218. taxonomy_mapping/prepare_lila_taxonomy_release.py +0 -142
  219. taxonomy_mapping/preview_lila_taxonomy.py +0 -591
  220. taxonomy_mapping/retrieve_sample_image.py +0 -71
  221. taxonomy_mapping/simple_image_download.py +0 -218
  222. taxonomy_mapping/species_lookup.py +0 -834
  223. taxonomy_mapping/taxonomy_csv_checker.py +0 -159
  224. taxonomy_mapping/taxonomy_graph.py +0 -346
  225. taxonomy_mapping/validate_lila_category_mappings.py +0 -83
  226. {megadetector-5.0.10.dist-info → megadetector-5.0.11.dist-info}/WHEEL +0 -0
api/batch_processing/api_core/server_job_status_table.py
@@ -1,152 +0,0 @@
- # Copyright (c) Microsoft Corporation. All rights reserved.
- # Licensed under the MIT License.
-
- """
- A class to manage updating the status of an API request / Azure Batch Job using
- the Cosmos DB table "batch_api_jobs".
- """
-
- import logging
- import os
- import unittest
- import uuid
- from typing import Union, Optional
-
- from azure.cosmos.cosmos_client import CosmosClient
- from azure.cosmos.exceptions import CosmosResourceNotFoundError
-
- from server_api_config import API_INSTANCE_NAME, COSMOS_ENDPOINT, COSMOS_WRITE_KEY
- from server_utils import get_utc_time
-
-
- log = logging.getLogger(os.environ['FLASK_APP'])
-
-
- class JobStatusTable:
-     """
-     A wrapper around the Cosmos DB client. Each item in the table "batch_api_jobs" represents
-     a request/Batch Job, and should have the following fields:
-     - id: this is the job_id
-     - api_instance
-     - status
-     - last_updated
-     - call_params: the dict representing the body of the POST request from the user
-     The 'status' field is a dict with the following fields:
-     - request_status
-     - message
-     - num_tasks (present after Batch Job created)
-     - num_images (present after Batch Job created)
-     """
-     # a job moves from created to running/problem after the Batch Job has been submitted
-     allowed_statuses = ['created', 'running', 'failed', 'problem', 'completed', 'canceled']
-
-     def __init__(self, api_instance=None):
-         self.api_instance = api_instance if api_instance is not None else API_INSTANCE_NAME
-         cosmos_client = CosmosClient(COSMOS_ENDPOINT, credential=COSMOS_WRITE_KEY)
-         db_client = cosmos_client.get_database_client('camera-trap')
-         self.db_jobs_client = db_client.get_container_client('batch_api_jobs')
-
-     def create_job_status(self, job_id: str, status: Union[dict, str], call_params: dict) -> dict:
-         assert 'request_status' in status and 'message' in status
-         assert status['request_status'] in JobStatusTable.allowed_statuses
-
-         # job_id should be unique across all instances, and is also the partition key
-         cur_time = get_utc_time()
-         item = {
-             'id': job_id,
-             'api_instance': self.api_instance,
-             'status': status,
-             'job_submission_time': cur_time,
-             'last_updated': cur_time,
-             'call_params': call_params
-         }
-         created_item = self.db_jobs_client.create_item(item)
-         return created_item
-
-     def update_job_status(self, job_id: str, status: Union[dict, str]) -> dict:
-         assert 'request_status' in status and 'message' in status
-         assert status['request_status'] in JobStatusTable.allowed_statuses
-
-         # TODO do not read the entry first to get the call_params when the Cosmos SDK add a
-         # patching functionality:
-         # https://feedback.azure.com/forums/263030-azure-cosmos-db/suggestions/6693091-be-able-to-do-partial-updates-on-document
-         item_old = self.read_job_status(job_id)
-         if item_old is None:
-             raise ValueError
-
-         # need to retain other fields in 'status' to be able to restart monitoring thread
-         if 'status' in item_old and isinstance(item_old['status'], dict):
-             # retain existing fields; update as needed
-             for k, v in item_old['status'].items():
-                 if k not in status:
-                     status[k] = v
-         item = {
-             'id': job_id,
-             'api_instance': self.api_instance,
-             'status': status,
-             'job_submission_time': item_old['job_submission_time'],
-             'last_updated': get_utc_time(),
-             'call_params': item_old['call_params']
-         }
-         replaced_item = self.db_jobs_client.replace_item(job_id, item)
-         return replaced_item
-
-     def read_job_status(self, job_id) -> Optional[dict]:
-         """
-         Read the status of the job from the Cosmos DB table of job status.
-         Note that it does not check the actual status of the job on Batch, and just returns what
-         the monitoring thread wrote to the database.
-         job_id is also the partition key
-         """
-         try:
-             read_item = self.db_jobs_client.read_item(job_id, partition_key=job_id)
-             assert read_item['api_instance'] == self.api_instance, 'Job does not belong to this API instance'
-         except CosmosResourceNotFoundError:
-             return None # job_id not a key
-         except Exception as e:
-             logging.error(f'server_job_status_table, read_job_status, exception: {e}')
-             raise
-         else:
-             item = {k: v for k, v in read_item.items() if not k.startswith('_')}
-             return item
-
-
- class TestJobStatusTable(unittest.TestCase):
-     api_instance = 'api_test'
-
-     def test_insert(self):
-         table = JobStatusTable(TestJobStatusTable.api_instance)
-         status = {
-             'request_status': 'running',
-             'message': 'this is a test'
-         }
-         job_id = uuid.uuid4().hex
-         item = table.create_job_status(job_id, status, {'container_sas': 'random_string'})
-         self.assertTrue(job_id == item['id'], 'Expect job_id to be the id of the item')
-         self.assertTrue(item['status']['request_status'] == 'running', 'Expect fields to be inserted correctly')
-
-     def test_update_and_read(self):
-         table = JobStatusTable(TestJobStatusTable.api_instance)
-         status = {
-             'request_status': 'running',
-             'message': 'this is a test'
-         }
-         job_id = uuid.uuid4().hex
-         res = table.create_job_status(job_id, status, {'container_sas': 'random_string'})
-
-         status = {
-             'request_status': 'completed',
-             'message': 'this is a test again'
-         }
-         res = table.update_job_status(job_id, status)
-         item_read = table.read_job_status(job_id)
-         self.assertTrue(item_read['status']['request_status'] == 'completed', 'Expect field to have updated')
-
-     def test_read_invalid_id(self):
-         table = JobStatusTable(TestJobStatusTable.api_instance)
-         job_id = uuid.uuid4().hex # should not be in the database
-         item_read = table.read_job_status(job_id)
-         self.assertIsNone(item_read)
-
- if __name__ == '__main__':
-     unittest.main()
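The deleted server_job_status_table.py above wrapped a Cosmos DB container that tracks one item per batch job. As a minimal, dependency-free sketch of the status payload that wrapper expected — the keys and allowed states come from the class docstring and assertions above, while the concrete values here are hypothetical:

# Sketch only: illustrates the 'status' dict stored per job; values are made up.
allowed_statuses = ['created', 'running', 'failed', 'problem', 'completed', 'canceled']

status = {
    'request_status': 'running',   # must be one of allowed_statuses
    'message': '1000 images listed; submitting the job...',
    'num_tasks': 5,                # added by the orchestrator once the Batch job exists
    'num_images': 1000             # likewise, recorded for reporting
}

# The same sanity checks create_job_status() and update_job_status() perform:
assert 'request_status' in status and 'message' in status
assert status['request_status'] in allowed_statuses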
api/batch_processing/api_core/server_orchestration.py
@@ -1,360 +0,0 @@
- # Copyright (c) Microsoft Corporation. All rights reserved.
- # Licensed under the MIT License.
-
- """
- Functions to submit images to the Azure Batch node pool for processing, monitor
- the Job and fetch results when completed.
- """
-
- import io
- import json
- import threading
- import time
- import logging
- import os
- import urllib.parse
- from datetime import timedelta
- from random import shuffle
-
- import sas_blob_utils
- import requests
- from azure.storage.blob import ContainerClient, BlobSasPermissions, generate_blob_sas
- from tqdm import tqdm
-
- from server_utils import *
- import server_api_config as api_config
- from server_batch_job_manager import BatchJobManager
- from server_job_status_table import JobStatusTable
-
-
- # Gunicorn logger handler will get attached if needed in server.py
- log = logging.getLogger(os.environ['FLASK_APP'])
-
-
- def create_batch_job(job_id: str, body: dict):
-     """
-     This is the target to be run in a thread to submit a batch processing job and monitor progress
-     """
-     job_status_table = JobStatusTable()
-     try:
-         log.info(f'server_job, create_batch_job, job_id {job_id}, {body}')
-
-         input_container_sas = body.get('input_container_sas', None)
-
-         use_url = body.get('use_url', False)
-
-         images_requested_json_sas = body.get('images_requested_json_sas', None)
-
-         image_path_prefix = body.get('image_path_prefix', None)
-
-         first_n = body.get('first_n', None)
-         first_n = int(first_n) if first_n else None
-
-         sample_n = body.get('sample_n', None)
-         sample_n = int(sample_n) if sample_n else None
-
-         model_version = body.get('model_version', '')
-         if model_version == '':
-             model_version = api_config.DEFAULT_MD_VERSION
-
-         # request_name and request_submission_timestamp are for appending to
-         # output file names
-         job_name = body.get('request_name', '') # in earlier versions we used "request" to mean a "job"
-         job_submission_timestamp = get_utc_time()
-
-         # image_paths can be a list of strings (Azure blob names or public URLs)
-         # or a list of length-2 lists where each is a [image_id, metadata] pair
-
-         # Case 1: listing all images in the container
-         # - not possible to have attached metadata if listing images in a blob
-         if images_requested_json_sas is None:
-             log.info('server_job, create_batch_job, listing all images to process.')
-
-             # list all images to process
-             image_paths = sas_blob_utils.list_blobs_in_container(
-                 container_uri=input_container_sas,
-                 blob_prefix=image_path_prefix, # check will be case-sensitive
-                 blob_suffix=api_config.IMAGE_SUFFIXES_ACCEPTED, # check will be case-insensitive
-                 limit=api_config.MAX_NUMBER_IMAGES_ACCEPTED_PER_JOB + 1
-                 # + 1 so if the number of images listed > MAX_NUMBER_IMAGES_ACCEPTED_PER_JOB
-                 # we will know and not proceed
-             )
-
-         # Case 2: user supplied a list of images to process; can include metadata
-         else:
-             log.info('server_job, create_batch_job, using provided list of images.')
-
-             response = requests.get(images_requested_json_sas) # could be a file hosted anywhere
-             image_paths = response.json()
-
-             log.info('server_job, create_batch_job, length of image_paths provided by the user: {}'.format(
-                 len(image_paths)))
-             if len(image_paths) == 0:
-                 job_status = get_job_status(
-                     'completed', '0 images found in provided list of images.')
-                 job_status_table.update_job_status(job_id, job_status)
-                 return
-
-             error, metadata_available = validate_provided_image_paths(image_paths)
-             if error is not None:
-                 msg = 'image paths provided in the json are not valid: {}'.format(error)
-                 raise ValueError(msg)
-
-             # filter down to those conforming to the provided prefix and accepted suffixes (image file types)
-             valid_image_paths = []
-             for p in image_paths:
-                 locator = p[0] if metadata_available else p
-
-                 # prefix is case-sensitive; suffix is not
-                 if image_path_prefix is not None and not locator.startswith(image_path_prefix):
-                     continue
-
-                 # Although urlparse(p).path preserves the extension on local paths, it will not work for
-                 # blob file names that contains "#", which will be treated as indication of a query.
-                 # If the URL is generated via Azure Blob Storage, the "#" char will be properly encoded
-                 path = urllib.parse.urlparse(locator).path if use_url else locator
-
-                 if path.lower().endswith(api_config.IMAGE_SUFFIXES_ACCEPTED):
-                     valid_image_paths.append(p)
-             image_paths = valid_image_paths
-             log.info(('server_job, create_batch_job, length of image_paths provided by user, '
-                       f'after filtering to jpg: {len(image_paths)}'))
-
-         # apply the first_n and sample_n filters
-         if first_n:
-             assert first_n > 0, 'parameter first_n is 0.'
-             # OK if first_n > total number of images
-             image_paths = image_paths[:first_n]
-
-         if sample_n:
-             assert sample_n > 0, 'parameter sample_n is 0.'
-             if sample_n > len(image_paths):
-                 msg = ('parameter sample_n specifies more images than '
-                        'available (after filtering by other provided params).')
-                 raise ValueError(msg)
-
-             # sample by shuffling image paths and take the first sample_n images
-             log.info('First path before shuffling:', image_paths[0])
-             shuffle(image_paths)
-             log.info('First path after shuffling:', image_paths[0])
-             image_paths = image_paths[:sample_n]
-
-         num_images = len(image_paths)
-         log.info(f'server_job, create_batch_job, num_images after applying all filters: {num_images}')
-
-         if num_images < 1:
-             job_status = get_job_status('completed', (
-                 'Zero images found in container or in provided list of images '
-                 'after filtering with the provided parameters.'))
-             job_status_table.update_job_status(job_id, job_status)
-             return
-         if num_images > api_config.MAX_NUMBER_IMAGES_ACCEPTED_PER_JOB:
-             job_status = get_job_status(
-                 'failed',
-                 (f'The number of images ({num_images}) requested for processing exceeds the maximum '
-                  f'accepted {api_config.MAX_NUMBER_IMAGES_ACCEPTED_PER_JOB} in one call'))
-             job_status_table.update_job_status(job_id, job_status)
-             return
-
-         # upload the image list to the container, which is also mounted on all nodes
-         # all sharding and scoring use the uploaded list
-         images_list_str_as_bytes = bytes(json.dumps(image_paths, ensure_ascii=False), encoding='utf-8')
-
-         container_url = sas_blob_utils.build_azure_storage_uri(account=api_config.STORAGE_ACCOUNT_NAME,
-                                                                container=api_config.STORAGE_CONTAINER_API)
-         with ContainerClient.from_container_url(container_url,
-                                                 credential=api_config.STORAGE_ACCOUNT_KEY) as api_container_client:
-             _ = api_container_client.upload_blob(
-                 name=f'api_{api_config.API_INSTANCE_NAME}/job_{job_id}/{job_id}_images.json',
-                 data=images_list_str_as_bytes)
-
-         job_status = get_job_status('created', f'{num_images} images listed; submitting the job...')
-         job_status_table.update_job_status(job_id, job_status)
-
-     except Exception as e:
-         job_status = get_job_status('failed', f'Error occurred while preparing the Batch job: {e}')
-         job_status_table.update_job_status(job_id, job_status)
-         log.error(f'server_job, create_batch_job, Error occurred while preparing the Batch job: {e}')
-         return # do not start monitoring
-
-     try:
-         batch_job_manager = BatchJobManager()
-
-         model_rel_path = api_config.MD_VERSIONS_TO_REL_PATH[model_version]
-         batch_job_manager.create_job(job_id,
-                                      model_rel_path,
-                                      input_container_sas,
-                                      use_url)
-
-         num_tasks, task_ids_failed_to_submit = batch_job_manager.submit_tasks(job_id, num_images)
-
-         # now request_status moves from created to running
-         job_status = get_job_status('running',
-                                     (f'Submitted {num_images} images to cluster in {num_tasks} shards. '
-                                      f'Number of shards failed to be submitted: {len(task_ids_failed_to_submit)}'))
-
-         # an extra field to allow the monitoring thread to restart after an API restart: total number of tasks
-         job_status['num_tasks'] = num_tasks
-         # also record the number of images to process for reporting
-         job_status['num_images'] = num_images
-
-         job_status_table.update_job_status(job_id, job_status)
-     except Exception as e:
-         job_status = get_job_status('problem', f'Please contact us. Error occurred while submitting the Batch job: {e}')
-         job_status_table.update_job_status(job_id, job_status)
-         log.error(f'server_job, create_batch_job, Error occurred while submitting the Batch job: {e}')
-         return
-
-     # start the monitor thread with the same name
-     try:
-         thread = threading.Thread(
-             target=monitor_batch_job,
-             name=f'job_{job_id}',
-             kwargs={
-                 'job_id': job_id,
-                 'num_tasks': num_tasks,
-                 'model_version': model_version,
-                 'job_name': job_name,
-                 'job_submission_timestamp': job_submission_timestamp
-             }
-         )
-         thread.start()
-     except Exception as e:
-         job_status = get_job_status('problem', f'Error occurred while starting the monitoring thread: {e}')
-         job_status_table.update_job_status(job_id, job_status)
-         log.error(f'server_job, create_batch_job, Error occurred while starting the monitoring thread: {e}')
-         return
-
-
- def monitor_batch_job(job_id: str,
-                       num_tasks: int,
-                       model_version: str,
-                       job_name: str,
-                       job_submission_timestamp: str):
-
-     job_status_table = JobStatusTable()
-     batch_job_manager = BatchJobManager()
-
-     try:
-         num_checks = 0
-
-         while True:
-             time.sleep(api_config.MONITOR_PERIOD_MINUTES * 60)
-             num_checks += 1
-
-             # both succeeded and failed tasks are marked "completed" on Batch
-             num_tasks_succeeded, num_tasks_failed = batch_job_manager.get_num_completed_tasks(job_id)
-             job_status = get_job_status('running',
-                                         (f'Check number {num_checks}, '
-                                          f'{num_tasks_succeeded} out of {num_tasks} shards have completed '
-                                          f'successfully, {num_tasks_failed} shards have failed.'))
-             job_status_table.update_job_status(job_id, job_status)
-             log.info(f'job_id {job_id}. '
-                      f'Check number {num_checks}, {num_tasks_succeeded} out of {num_tasks} shards completed, '
-                      f'{num_tasks_failed} shards failed.')
-
-             if (num_tasks_succeeded + num_tasks_failed) >= num_tasks:
-                 break
-
-             if num_checks > api_config.MAX_MONITOR_CYCLES:
-                 job_status = get_job_status('problem',
-                                             (
-                     f'Job unfinished after {num_checks} x {api_config.MONITOR_PERIOD_MINUTES} minutes, '
-                     f'please contact us to retrieve the results. Number of succeeded shards: {num_tasks_succeeded}')
-                 )
-                 job_status_table.update_job_status(job_id, job_status)
-                 log.warning(f'server_job, create_batch_job, MAX_MONITOR_CYCLES reached, ending thread')
-                 break # still aggregate the Tasks' outputs
-
-     except Exception as e:
-         job_status = get_job_status('problem', f'Error occurred while monitoring the Batch job: {e}')
-         job_status_table.update_job_status(job_id, job_status)
-         log.error(f'server_job, create_batch_job, Error occurred while monitoring the Batch job: {e}')
-         return
-
-     try:
-         output_sas_url = aggregate_results(job_id, model_version, job_name, job_submission_timestamp)
-         # preserving format from before, but SAS URL to 'failed_images' and 'images' are no longer provided
-         # failures should be contained in the output entries, indicated by an 'error' field
-         msg = {
-             'num_failed_shards': num_tasks_failed,
-             'output_file_urls': {
-                 'detections': output_sas_url
-             }
-         }
-         job_status = get_job_status('completed', msg)
-         job_status_table.update_job_status(job_id, job_status)
-
-     except Exception as e:
-         job_status = get_job_status('problem',
-                                     f'Please contact us to retrieve the results. Error occurred while aggregating results: {e}')
-         job_status_table.update_job_status(job_id, job_status)
-         log.error(f'server_job, create_batch_job, Error occurred while aggregating results: {e}')
-         return
-
-
- def aggregate_results(job_id: str,
-                       model_version: str,
-                       job_name: str,
-                       job_submission_timestamp: str) -> str:
-     log.info(f'server_job, aggregate_results starting, job_id: {job_id}')
-
-     container_url = sas_blob_utils.build_azure_storage_uri(account=api_config.STORAGE_ACCOUNT_NAME,
-                                                            container=api_config.STORAGE_CONTAINER_API)
-     # when people download this, the timestamp will have : replaced by _
-     output_file_path = f'api_{api_config.API_INSTANCE_NAME}/job_{job_id}/{job_id}_detections_{job_name}_{job_submission_timestamp}.json'
-
-     with ContainerClient.from_container_url(container_url,
-                                             credential=api_config.STORAGE_ACCOUNT_KEY) as container_client:
-         # check if the result blob has already been written (could be another instance of the API / worker thread)
-         # and if so, skip aggregating and uploading the results, and just generate the SAS URL, which
-         # could be needed still if the previous request_status was `problem`.
-         blob_client = container_client.get_blob_client(output_file_path)
-         if blob_client.exists():
-             log.warning(f'The output file already exists, likely because another monitoring thread already wrote it.')
-         else:
-             task_outputs_dir = f'api_{api_config.API_INSTANCE_NAME}/job_{job_id}/task_outputs/'
-             generator = container_client.list_blobs(name_starts_with=task_outputs_dir)
-
-             blobs = [i for i in generator if i.name.endswith('.json')]
-
-             all_results = []
-             for blob_props in tqdm(blobs):
-                 with container_client.get_blob_client(blob_props) as blob_client:
-                     stream = io.BytesIO()
-                     blob_client.download_blob().readinto(stream)
-                     stream.seek(0)
-                     task_results = json.load(stream)
-                     all_results.extend(task_results)
-
-             api_output = {
-                 'info': {
-                     'detector': f'megadetector_v{model_version}',
-                     'detection_completion_time': get_utc_time(),
-                     'format_version': api_config.OUTPUT_FORMAT_VERSION
-                 },
-                 'detection_categories': api_config.DETECTOR_LABEL_MAP,
-                 'images': all_results
-             }
-
-             # upload the output JSON to the Job folder
-             api_output_as_bytes = bytes(json.dumps(api_output, ensure_ascii=False, indent=1), encoding='utf-8')
-             _ = container_client.upload_blob(name=output_file_path, data=api_output_as_bytes)
-
-     output_sas = generate_blob_sas(
-         account_name=api_config.STORAGE_ACCOUNT_NAME,
-         container_name=api_config.STORAGE_CONTAINER_API,
-         blob_name=output_file_path,
-         account_key=api_config.STORAGE_ACCOUNT_KEY,
-         permission=BlobSasPermissions(read=True, write=False),
-         expiry=datetime.utcnow() + timedelta(days=api_config.OUTPUT_SAS_EXPIRATION_DAYS)
-     )
-     output_sas_url = sas_blob_utils.build_azure_storage_uri(
-         account=api_config.STORAGE_ACCOUNT_NAME,
-         container=api_config.STORAGE_CONTAINER_API,
-         blob=output_file_path,
-         sas_token=output_sas
-     )
-     log.info(f'server_job, aggregate_results done, job_id: {job_id}')
-     log.info(f'output_sas_url: {output_sas_url}')
-     return output_sas_url
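For orientation, aggregate_results() above stitches the per-shard task outputs into a single MegaDetector batch output file. A hedged sketch of that structure, mirroring the api_output dict in the deleted code — the concrete values, the category map, and the format version are illustrative stand-ins for whatever api_config supplies:

# Sketch only: shape of the aggregated output JSON; all values below are hypothetical.
example_output = {
    'info': {
        'detector': 'megadetector_v4.1',                       # f'megadetector_v{model_version}'
        'detection_completion_time': '2021-02-08T20:02:05.699689Z',
        'format_version': '1.1'                                # api_config.OUTPUT_FORMAT_VERSION
    },
    'detection_categories': {'1': 'animal', '2': 'person', '3': 'vehicle'},  # api_config.DETECTOR_LABEL_MAP
    'images': [
        {
            'file': 'camera01/IMG_0001.JPG',
            'detections': [
                {'category': '1', 'conf': 0.92, 'bbox': [0.1, 0.2, 0.3, 0.4]}
            ]
        },
        # per the comment in monitor_batch_job(), a failed image carries an 'error' field
        {'file': 'camera01/IMG_0002.JPG', 'error': 'image could not be read'}
    ]
}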
api/batch_processing/api_core/server_utils.py
@@ -1,92 +0,0 @@
- # Copyright (c) Microsoft Corporation. All rights reserved.
- # Licensed under the MIT License.
-
- """
- Helper functions for the batch processing API.
- """
-
- import logging
- import os
- from datetime import datetime
- from typing import Tuple, Any, Sequence, Optional
-
- import sas_blob_utils
-
-
- log = logging.getLogger(os.environ['FLASK_APP'])
-
-
- #%% helper classes and functions
-
- def make_error(error_code: int, error_message: str) -> Tuple[dict, int]:
-     # TODO log exception when we have more telemetry
-     log.error(f'Error {error_code} - {error_message}')
-     return {'error': error_message}, error_code
-
-
- def check_data_container_sas(input_container_sas: str) -> Optional[Tuple[int, str]]:
-     """
-     Returns a tuple (error_code, msg) if not a usable SAS URL, else returns None
-     """
-     # TODO check that the expiry date of input_container_sas is at least a month
-     # into the future
-     permissions = sas_blob_utils.get_permissions_from_uri(input_container_sas)
-     data = sas_blob_utils.get_all_query_parts(input_container_sas)
-
-     msg = ('input_container_sas provided does not have both read and list '
-            'permissions.')
-     if 'read' not in permissions or 'list' not in permissions:
-         if 'si' in data:
-             # if no permission specified explicitly but has an access policy, assumes okay
-             # TODO - check based on access policy as well
-             return None
-
-         return 400, msg
-
-     return None
-
-
- def get_utc_time() -> str:
-     # return current UTC time as a string in the ISO 8601 format (so we can query by
-     # timestamp in the Cosmos DB job status table.
-     # example: '2021-02-08T20:02:05.699689Z'
-     return datetime.utcnow().isoformat(timespec='microseconds') + 'Z'
-
-
- def get_job_status(request_status: str, message: Any) -> dict:
-     return {
-         'request_status': request_status,
-         'message': message
-     }
-
-
- def validate_provided_image_paths(image_paths: Sequence[Any]) -> Tuple[Optional[str], bool]:
-     """Given a list of image_paths (list length at least 1), validate them and
-     determine if metadata is available.
-     Args:
-         image_paths: a list of string (image_id) or a list of 2-item lists
-             ([image_id, image_metadata])
-     Returns:
-         error: None if checks passed, otherwise a string error message
-         metadata_available: bool, True if available
-     """
-     # image_paths will have length at least 1, otherwise would have ended before this step
-     first_item = image_paths[0]
-     metadata_available = False
-     if isinstance(first_item, str):
-         for i in image_paths:
-             if not isinstance(i, str):
-                 error = 'Not all items in image_paths are of type string.'
-                 return error, metadata_available
-         return None, metadata_available
-     elif isinstance(first_item, list):
-         metadata_available = True
-         for i in image_paths:
-             if len(i) != 2: # i should be [image_id, metadata_string]
-                 error = ('Items in image_paths are lists, but not all lists '
-                          'are of length 2 [image locator, metadata].')
-                 return error, metadata_available
-         return None, metadata_available
-     else:
-         error = 'image_paths contain items that are not strings nor lists.'
-         return error, metadata_available
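As a quick illustration of the two input shapes validate_provided_image_paths() above accepts (per its docstring), with hypothetical entries and the expected results noted in comments:

# Sketch only: the two accepted image_paths shapes; entries below are made up.

# Shape 1: plain blob names or URLs -> returns (None, metadata_available=False)
image_paths_plain = ['cam01/IMG_0001.JPG', 'cam01/IMG_0002.JPG']

# Shape 2: [locator, metadata] pairs -> returns (None, metadata_available=True)
image_paths_with_metadata = [
    ['cam01/IMG_0001.JPG', 'station A'],
    ['cam01/IMG_0002.JPG', 'station B'],
]

# Mixed shapes, or pairs that are not exactly length 2, yield an error message
# as the first element of the returned tuple instead of None.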
File without changes
api/batch_processing/api_core_support/aggregate_results_manually.py
@@ -1,46 +0,0 @@
- #
- # If a request has been sent to AML for batch scoring but the monitoring thread of the API was
- # interrupted (uncaught exception or having to re-start the API container), we could manually
- # aggregate results from each shard using this script, assuming all jobs submitted to AML have finished.
- #
- # Need to have set environment variables STORAGE_ACCOUNT_NAME and STORAGE_ACCOUNT_KEY to those of the
- # storage account backing the API. Also need to adjust the INTERNAL_CONTAINER, AML_CONTAINER and
- # AML_CONFIG fields in api_core/orchestrator_api/api_config.py to match the instance of the API that this
- # request was submitted to.
- #
- # May need to change the import statement in api_core/orchestrator_api/orchestrator.py
- # "from sas_blob_utils import SasBlob" to
- # "from .sas_blob_utils import SasBlob" to not confuse with the module in AI4Eutils;
- # and change "import api_config" to
- # "from api.batch_processing.api_core.orchestrator_api import api_config"
-
- # Execute this script from the root of the repository. You may need to add the repository to PYTHONPATH.
-
- import argparse
- import json
-
- from api.batch_processing.api_core.orchestrator_api.orchestrator import AMLMonitor
-
-
- def main():
-     parser = argparse.ArgumentParser()
-     parser.add_argument('shortened_request_id', type=str,
-                         help='the request ID to restart monitoring')
-     parser.add_argument('model_version', type=str, help='version of megadetector used; this is used to fill in the meta info section of the output file')
-     parser.add_argument('request_name', type=str, help='easy to remember name for that job, optional', default='')
-     args = parser.parse_args()
-
-
-     # list_jobs_submitted cannot be serialized ("can't pickle _thread.RLock objects "), but
-     # do not need it for aggregating results
-     aml_monitor = AMLMonitor(request_id=args.request_id,
-                              list_jobs_submitted=None,
-                              request_name=args.request_name,
-                              request_submission_timestamp='',
-                              model_version=args.model_version)
-     output_file_urls = aml_monitor.aggregate_results()
-     output_file_urls_str = json.dumps(output_file_urls)
-     print(output_file_urls_str)
-
- if __name__ == '__main__':
-     main()
File without changes