megadetector 5.0.28 (py3-none-any.whl) → 10.0.0 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +2 -2
- megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +1 -1
- megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +1 -1
- megadetector/classification/aggregate_classifier_probs.py +3 -3
- megadetector/classification/analyze_failed_images.py +5 -5
- megadetector/classification/cache_batchapi_outputs.py +5 -5
- megadetector/classification/create_classification_dataset.py +11 -12
- megadetector/classification/crop_detections.py +10 -10
- megadetector/classification/csv_to_json.py +8 -8
- megadetector/classification/detect_and_crop.py +13 -15
- megadetector/classification/efficientnet/model.py +8 -8
- megadetector/classification/efficientnet/utils.py +6 -5
- megadetector/classification/evaluate_model.py +7 -7
- megadetector/classification/identify_mislabeled_candidates.py +6 -6
- megadetector/classification/json_to_azcopy_list.py +1 -1
- megadetector/classification/json_validator.py +29 -32
- megadetector/classification/map_classification_categories.py +9 -9
- megadetector/classification/merge_classification_detection_output.py +12 -9
- megadetector/classification/prepare_classification_script.py +19 -19
- megadetector/classification/prepare_classification_script_mc.py +26 -26
- megadetector/classification/run_classifier.py +4 -4
- megadetector/classification/save_mislabeled.py +6 -6
- megadetector/classification/train_classifier.py +1 -1
- megadetector/classification/train_classifier_tf.py +9 -9
- megadetector/classification/train_utils.py +10 -10
- megadetector/data_management/annotations/annotation_constants.py +1 -2
- megadetector/data_management/camtrap_dp_to_coco.py +79 -46
- megadetector/data_management/cct_json_utils.py +103 -103
- megadetector/data_management/cct_to_md.py +49 -49
- megadetector/data_management/cct_to_wi.py +33 -33
- megadetector/data_management/coco_to_labelme.py +75 -75
- megadetector/data_management/coco_to_yolo.py +210 -193
- megadetector/data_management/databases/add_width_and_height_to_db.py +86 -12
- megadetector/data_management/databases/combine_coco_camera_traps_files.py +40 -40
- megadetector/data_management/databases/integrity_check_json_db.py +228 -200
- megadetector/data_management/databases/subset_json_db.py +33 -33
- megadetector/data_management/generate_crops_from_cct.py +88 -39
- megadetector/data_management/get_image_sizes.py +54 -49
- megadetector/data_management/labelme_to_coco.py +133 -125
- megadetector/data_management/labelme_to_yolo.py +159 -73
- megadetector/data_management/lila/create_lila_blank_set.py +81 -83
- megadetector/data_management/lila/create_lila_test_set.py +32 -31
- megadetector/data_management/lila/create_links_to_md_results_files.py +18 -18
- megadetector/data_management/lila/download_lila_subset.py +21 -24
- megadetector/data_management/lila/generate_lila_per_image_labels.py +365 -107
- megadetector/data_management/lila/get_lila_annotation_counts.py +35 -33
- megadetector/data_management/lila/get_lila_image_counts.py +22 -22
- megadetector/data_management/lila/lila_common.py +73 -70
- megadetector/data_management/lila/test_lila_metadata_urls.py +28 -19
- megadetector/data_management/mewc_to_md.py +344 -340
- megadetector/data_management/ocr_tools.py +262 -255
- megadetector/data_management/read_exif.py +249 -227
- megadetector/data_management/remap_coco_categories.py +90 -28
- megadetector/data_management/remove_exif.py +81 -21
- megadetector/data_management/rename_images.py +187 -187
- megadetector/data_management/resize_coco_dataset.py +588 -120
- megadetector/data_management/speciesnet_to_md.py +41 -41
- megadetector/data_management/wi_download_csv_to_coco.py +55 -55
- megadetector/data_management/yolo_output_to_md_output.py +248 -122
- megadetector/data_management/yolo_to_coco.py +333 -191
- megadetector/detection/change_detection.py +832 -0
- megadetector/detection/process_video.py +340 -337
- megadetector/detection/pytorch_detector.py +358 -278
- megadetector/detection/run_detector.py +399 -186
- megadetector/detection/run_detector_batch.py +404 -377
- megadetector/detection/run_inference_with_yolov5_val.py +340 -327
- megadetector/detection/run_tiled_inference.py +257 -249
- megadetector/detection/tf_detector.py +24 -24
- megadetector/detection/video_utils.py +332 -295
- megadetector/postprocessing/add_max_conf.py +19 -11
- megadetector/postprocessing/categorize_detections_by_size.py +45 -45
- megadetector/postprocessing/classification_postprocessing.py +468 -433
- megadetector/postprocessing/combine_batch_outputs.py +23 -23
- megadetector/postprocessing/compare_batch_results.py +590 -525
- megadetector/postprocessing/convert_output_format.py +106 -102
- megadetector/postprocessing/create_crop_folder.py +347 -147
- megadetector/postprocessing/detector_calibration.py +173 -168
- megadetector/postprocessing/generate_csv_report.py +508 -499
- megadetector/postprocessing/load_api_results.py +48 -27
- megadetector/postprocessing/md_to_coco.py +133 -102
- megadetector/postprocessing/md_to_labelme.py +107 -90
- megadetector/postprocessing/md_to_wi.py +40 -40
- megadetector/postprocessing/merge_detections.py +92 -114
- megadetector/postprocessing/postprocess_batch_results.py +319 -301
- megadetector/postprocessing/remap_detection_categories.py +91 -38
- megadetector/postprocessing/render_detection_confusion_matrix.py +214 -205
- megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +57 -57
- megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +27 -28
- megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +704 -679
- megadetector/postprocessing/separate_detections_into_folders.py +226 -211
- megadetector/postprocessing/subset_json_detector_output.py +265 -262
- megadetector/postprocessing/top_folders_to_bottom.py +45 -45
- megadetector/postprocessing/validate_batch_results.py +70 -70
- megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +52 -52
- megadetector/taxonomy_mapping/map_new_lila_datasets.py +18 -19
- megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +54 -33
- megadetector/taxonomy_mapping/preview_lila_taxonomy.py +67 -67
- megadetector/taxonomy_mapping/retrieve_sample_image.py +16 -16
- megadetector/taxonomy_mapping/simple_image_download.py +8 -8
- megadetector/taxonomy_mapping/species_lookup.py +156 -74
- megadetector/taxonomy_mapping/taxonomy_csv_checker.py +14 -14
- megadetector/taxonomy_mapping/taxonomy_graph.py +10 -10
- megadetector/taxonomy_mapping/validate_lila_category_mappings.py +13 -13
- megadetector/utils/ct_utils.py +1049 -211
- megadetector/utils/directory_listing.py +21 -77
- megadetector/utils/gpu_test.py +22 -22
- megadetector/utils/md_tests.py +632 -529
- megadetector/utils/path_utils.py +1520 -431
- megadetector/utils/process_utils.py +41 -41
- megadetector/utils/split_locations_into_train_val.py +62 -62
- megadetector/utils/string_utils.py +148 -27
- megadetector/utils/url_utils.py +489 -176
- megadetector/utils/wi_utils.py +2658 -2526
- megadetector/utils/write_html_image_list.py +137 -137
- megadetector/visualization/plot_utils.py +34 -30
- megadetector/visualization/render_images_with_thumbnails.py +39 -74
- megadetector/visualization/visualization_utils.py +487 -435
- megadetector/visualization/visualize_db.py +232 -198
- megadetector/visualization/visualize_detector_output.py +82 -76
- {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/METADATA +5 -2
- megadetector-10.0.0.dist-info/RECORD +139 -0
- {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/WHEEL +1 -1
- megadetector/api/batch_processing/api_core/__init__.py +0 -0
- megadetector/api/batch_processing/api_core/batch_service/__init__.py +0 -0
- megadetector/api/batch_processing/api_core/batch_service/score.py +0 -439
- megadetector/api/batch_processing/api_core/server.py +0 -294
- megadetector/api/batch_processing/api_core/server_api_config.py +0 -97
- megadetector/api/batch_processing/api_core/server_app_config.py +0 -55
- megadetector/api/batch_processing/api_core/server_batch_job_manager.py +0 -220
- megadetector/api/batch_processing/api_core/server_job_status_table.py +0 -149
- megadetector/api/batch_processing/api_core/server_orchestration.py +0 -360
- megadetector/api/batch_processing/api_core/server_utils.py +0 -88
- megadetector/api/batch_processing/api_core_support/__init__.py +0 -0
- megadetector/api/batch_processing/api_core_support/aggregate_results_manually.py +0 -46
- megadetector/api/batch_processing/api_support/__init__.py +0 -0
- megadetector/api/batch_processing/api_support/summarize_daily_activity.py +0 -152
- megadetector/api/batch_processing/data_preparation/__init__.py +0 -0
- megadetector/api/synchronous/__init__.py +0 -0
- megadetector/api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
- megadetector/api/synchronous/api_core/animal_detection_api/api_backend.py +0 -151
- megadetector/api/synchronous/api_core/animal_detection_api/api_frontend.py +0 -263
- megadetector/api/synchronous/api_core/animal_detection_api/config.py +0 -35
- megadetector/api/synchronous/api_core/tests/__init__.py +0 -0
- megadetector/api/synchronous/api_core/tests/load_test.py +0 -110
- megadetector/data_management/importers/add_nacti_sizes.py +0 -52
- megadetector/data_management/importers/add_timestamps_to_icct.py +0 -79
- megadetector/data_management/importers/animl_results_to_md_results.py +0 -158
- megadetector/data_management/importers/auckland_doc_test_to_json.py +0 -373
- megadetector/data_management/importers/auckland_doc_to_json.py +0 -201
- megadetector/data_management/importers/awc_to_json.py +0 -191
- megadetector/data_management/importers/bellevue_to_json.py +0 -272
- megadetector/data_management/importers/cacophony-thermal-importer.py +0 -793
- megadetector/data_management/importers/carrizo_shrubfree_2018.py +0 -269
- megadetector/data_management/importers/carrizo_trail_cam_2017.py +0 -289
- megadetector/data_management/importers/cct_field_adjustments.py +0 -58
- megadetector/data_management/importers/channel_islands_to_cct.py +0 -913
- megadetector/data_management/importers/eMammal/copy_and_unzip_emammal.py +0 -180
- megadetector/data_management/importers/eMammal/eMammal_helpers.py +0 -249
- megadetector/data_management/importers/eMammal/make_eMammal_json.py +0 -223
- megadetector/data_management/importers/ena24_to_json.py +0 -276
- megadetector/data_management/importers/filenames_to_json.py +0 -386
- megadetector/data_management/importers/helena_to_cct.py +0 -283
- megadetector/data_management/importers/idaho-camera-traps.py +0 -1407
- megadetector/data_management/importers/idfg_iwildcam_lila_prep.py +0 -294
- megadetector/data_management/importers/import_desert_lion_conservation_camera_traps.py +0 -387
- megadetector/data_management/importers/jb_csv_to_json.py +0 -150
- megadetector/data_management/importers/mcgill_to_json.py +0 -250
- megadetector/data_management/importers/missouri_to_json.py +0 -490
- megadetector/data_management/importers/nacti_fieldname_adjustments.py +0 -79
- megadetector/data_management/importers/noaa_seals_2019.py +0 -181
- megadetector/data_management/importers/osu-small-animals-to-json.py +0 -364
- megadetector/data_management/importers/pc_to_json.py +0 -365
- megadetector/data_management/importers/plot_wni_giraffes.py +0 -123
- megadetector/data_management/importers/prepare_zsl_imerit.py +0 -131
- megadetector/data_management/importers/raic_csv_to_md_results.py +0 -416
- megadetector/data_management/importers/rspb_to_json.py +0 -356
- megadetector/data_management/importers/save_the_elephants_survey_A.py +0 -320
- megadetector/data_management/importers/save_the_elephants_survey_B.py +0 -329
- megadetector/data_management/importers/snapshot_safari_importer.py +0 -758
- megadetector/data_management/importers/snapshot_serengeti_lila.py +0 -1067
- megadetector/data_management/importers/snapshotserengeti/make_full_SS_json.py +0 -150
- megadetector/data_management/importers/snapshotserengeti/make_per_season_SS_json.py +0 -153
- megadetector/data_management/importers/sulross_get_exif.py +0 -65
- megadetector/data_management/importers/timelapse_csv_set_to_json.py +0 -490
- megadetector/data_management/importers/ubc_to_json.py +0 -399
- megadetector/data_management/importers/umn_to_json.py +0 -507
- megadetector/data_management/importers/wellington_to_json.py +0 -263
- megadetector/data_management/importers/wi_to_json.py +0 -442
- megadetector/data_management/importers/zamba_results_to_md_results.py +0 -180
- megadetector/data_management/lila/add_locations_to_island_camera_traps.py +0 -101
- megadetector/data_management/lila/add_locations_to_nacti.py +0 -151
- megadetector/utils/azure_utils.py +0 -178
- megadetector/utils/sas_blob_utils.py +0 -509
- megadetector-5.0.28.dist-info/RECORD +0 -209
- /megadetector/{api/batch_processing/__init__.py → __init__.py} +0 -0
- {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/licenses/LICENSE +0 -0
- {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/top_level.txt +0 -0
--- a/megadetector/api/batch_processing/api_core/server.py
+++ /dev/null
@@ -1,294 +0,0 @@
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-
-import string
-import uuid
-import threading
-from datetime import timedelta
-
-import sas_blob_utils
-from flask import Flask, request, jsonify
-
-import server_api_config as api_config
-from server_app_config import AppConfig
-from server_batch_job_manager import BatchJobManager
-from server_orchestration import create_batch_job, monitor_batch_job
-from server_job_status_table import JobStatusTable
-from server_utils import *
-
-# %% Flask app
-app = Flask(__name__)
-
-# reference: https://trstringer.com/logging-flask-gunicorn-the-manageable-way/
-if __name__ != '__main__':
-    gunicorn_logger = logging.getLogger('gunicorn.error')
-    app.logger.handlers = gunicorn_logger.handlers
-    app.logger.setLevel(gunicorn_logger.level)
-
-
-API_PREFIX = api_config.API_PREFIX
-app.logger.info('server, created Flask application...')
-
-# %% Helper classes
-
-app_config = AppConfig()
-job_status_table = JobStatusTable()
-batch_job_manager = BatchJobManager()
-app.logger.info('server, finished instantiating helper classes')
-
-
-# %% Flask endpoints
-
-@app.route(f'{API_PREFIX}/')
-def hello():
-    return f'Camera traps batch processing API. Instance: {api_config.API_INSTANCE_NAME}'
-
-
-@app.route(f'{API_PREFIX}/request_detections', methods=['POST'])
-def request_detections():
-    """
-    Checks that the input parameters to this endpoint are valid, starts a thread
-    to launch the batch processing job, and return the job_id/request_id to the user.
-    """
-    if not request.is_json:
-        msg = 'Body needs to have a JSON mimetype (e.g., application/json).'
-        return make_error(415, msg)
-
-    try:
-        post_body = request.get_json()
-    except Exception as e:
-        return make_error(415, f'Error occurred reading POST request body: {e}.')
-
-    app.logger.info(f'server, request_detections, post_body: {post_body}')
-
-    # required params
-
-    caller_id = post_body.get('caller', None)
-    if caller_id is None or caller_id not in app_config.get_allowlist():
-        msg = ('Parameter caller is not supplied or is not on our allowlist. '
-               'Please email cameratraps@lila.science to request access.')
-        return make_error(401, msg)
-
-    use_url = post_body.get('use_url', False)
-    if use_url and isinstance(use_url, str):  # in case it is included but is intended to be False
-        if use_url.lower() in ['false', 'f', 'no', 'n']:
-            use_url = False
-
-    input_container_sas = post_body.get('input_container_sas', None)
-    if not input_container_sas and not use_url:
-        msg = ('input_container_sas with read and list access is a required '
-               'field when not using image URLs.')
-        return make_error(400, msg)
-
-    if input_container_sas is not None:
-        if not sas_blob_utils.is_container_uri(input_container_sas):
-            return make_error(400, 'input_container_sas provided is not for a container.')
-
-        result = check_data_container_sas(input_container_sas)
-        if result is not None:
-            return make_error(result[0], result[1])
-
-    # can be an URL to a file not hosted in an Azure blob storage container
-    images_requested_json_sas = post_body.get('images_requested_json_sas', None)
-
-    if images_requested_json_sas is not None:
-        if not images_requested_json_sas.startswith(('http://', 'https://')):
-            return make_error(400, 'images_requested_json_sas needs to be an URL.')
-
-    # if use_url, then images_requested_json_sas is required
-    if use_url and images_requested_json_sas is None:
-        return make_error(400, 'images_requested_json_sas is required since use_url is true.')
-
-    # optional params
-
-    # check model_version is among the available model versions
-    model_version = post_body.get('model_version', '')
-    if model_version != '':
-        model_version = str(model_version)  # in case user used an int
-        if model_version not in api_config.MD_VERSIONS_TO_REL_PATH:
-            return make_error(400, f'model_version {model_version} is not supported.')
-
-    # check request_name has only allowed characters
-    request_name = post_body.get('request_name', '')
-    if request_name != '':
-        if len(request_name) > 92:
-            return make_error(400, 'request_name is longer than 92 characters.')
-        allowed = set(string.ascii_letters + string.digits + '_' + '-')
-        if not set(request_name) <= allowed:
-            msg = ('request_name contains invalid characters (only letters, '
-                   'digits, - and _ are allowed).')
-            return make_error(400, msg)
-
-    # optional params for telemetry collection - logged to status table for now as part of call_params
-    country = post_body.get('country', None)
-    organization_name = post_body.get('organization_name', None)
-
-    # All API instances / node pools share a quota on total number of active Jobs;
-    # we cannot accept new Job submissions if we are at the quota
-    try:
-        num_active_jobs = batch_job_manager.get_num_active_jobs()
-        if num_active_jobs >= api_config.MAX_BATCH_ACCOUNT_ACTIVE_JOBS:
-            return make_error(503, f'Too many active jobs, please try again later')
-    except Exception as e:
-        return make_error(500, f'Error checking number of active jobs: {e}')
-
-    try:
-        job_id = uuid.uuid4().hex
-        job_status_table.create_job_status(
-            job_id=job_id,
-            status=get_job_status('created', 'Request received. Listing images next...'),
-            call_params=post_body
-        )
-    except Exception as e:
-        return make_error(500, f'Error creating a job status entry: {e}')
-
-    try:
-        thread = threading.Thread(
-            target=create_batch_job,
-            name=f'job_{job_id}',
-            kwargs={'job_id': job_id, 'body': post_body}
-        )
-        thread.start()
-    except Exception as e:
-        return make_error(500, f'Error creating or starting the batch processing thread: {e}')
-
-    return {'request_id': job_id}
-
-
-@app.route(f'{API_PREFIX}/cancel_request', methods=['POST'])
-def cancel_request():
-    """
-    Cancels a request/job given the job_id and caller_id
-    """
-    if not request.is_json:
-        msg = 'Body needs to have a JSON mimetype (e.g., application/json).'
-        return make_error(415, msg)
-    try:
-        post_body = request.get_json()
-    except Exception as e:
-        return make_error(415, f'Error occurred reading POST request body: {e}.')
-
-    app.logger.info(f'server, cancel_request received, body: {post_body}')
-
-    # required fields
-    job_id = post_body.get('request_id', None)
-    if job_id is None:
-        return make_error(400, 'request_id is a required field.')
-
-    caller_id = post_body.get('caller', None)
-    if caller_id is None or caller_id not in app_config.get_allowlist():
-        return make_error(401, 'Parameter caller is not supplied or is not on our allowlist.')
-
-    item_read = job_status_table.read_job_status(job_id)
-    if item_read is None:
-        return make_error(404, 'Task is not found.')
-    if 'status' not in item_read:
-        return make_error(404, 'Something went wrong. This task does not have a status field.')
-
-    request_status = item_read['status']['request_status']
-    if request_status not in ['running', 'problem']:
-        # request_status is either completed or failed
-        return make_error(400, f'Task has {request_status} and cannot be canceled')
-
-    try:
-        batch_job_manager.cancel_batch_job(job_id)
-        # the create_batch_job thread will stop when it wakes up the next time
-    except Exception as e:
-        return make_error(500, f'Error when canceling the request: {e}')
-    else:
-        job_status_table.update_job_status(job_id, {
-            'request_status': 'canceled',
-            'message': 'Request has been canceled by the user.'
-        })
-        return 'Canceling signal has been sent. You can verify the status at the /task endpoint'
-
-
-@app.route(f'{API_PREFIX}/task/<job_id>')
-def retrieve_job_status(job_id: str):
-    """
-    Does not require the "caller" field to avoid checking the allowlist in App Configurations.
-    Retains the /task endpoint name to be compatible with previous versions.
-    """
-    # Fix for Zooniverse - deleting any "-" characters in the job_id
-    job_id = job_id.replace('-', '')
-
-    item_read = job_status_table.read_job_status(job_id)  # just what the monitoring thread wrote to the DB
-    if item_read is None:
-        return make_error(404, 'Task is not found.')
-    if 'status' not in item_read or 'last_updated' not in item_read or 'call_params' not in item_read:
-        return make_error(404, 'Something went wrong. This task does not have a valid status.')
-
-    # If the status is running, it could be a Job submitted before the last restart of this
-    # API instance. If that is the case, we should start to monitor its progress again.
-    status = item_read['status']
-
-    last_updated = datetime.fromisoformat(item_read['last_updated'][:-1])  # get rid of "Z" (required by Cosmos DB)
-    time_passed = datetime.utcnow() - last_updated
-    job_is_unmonitored = True if time_passed > timedelta(minutes=(api_config.MONITOR_PERIOD_MINUTES + 1)) else False
-
-    if isinstance(status, dict) and \
-            'request_status' in status and \
-            status['request_status'] in ['running', 'problem'] and \
-            'num_tasks' in status and \
-            job_id not in get_thread_names() and \
-            job_is_unmonitored:
-        # WARNING model_version could be wrong (a newer version number gets written to the output file) around
-        # the time that the model is updated, if this request was submitted before the model update
-        # and the API restart; this should be quite rare
-        model_version = item_read['call_params'].get('model_version', api_config.DEFAULT_MD_VERSION)
-
-        num_tasks = status['num_tasks']
-        job_name = item_read['call_params'].get('request_name', '')
-        job_submission_timestamp = item_read.get('job_submission_time', '')
-
-        thread = threading.Thread(
-            target=monitor_batch_job,
-            name=f'job_{job_id}',
-            kwargs={
-                'job_id': job_id,
-                'num_tasks': num_tasks,
-                'model_version': model_version,
-                'job_name': job_name,
-                'job_submission_timestamp': job_submission_timestamp
-            }
-        )
-        thread.start()
-        app.logger.info(f'server, started a new thread to monitor job {job_id}')
-
-    # conform to previous schemes
-    if 'num_tasks' in status:
-        del status['num_tasks']
-    item_to_return = {
-        'Status': status,
-        'Endpoint': f'{API_PREFIX}/request_detections',
-        'TaskId': job_id,
-        'Timestamp': item_read['last_updated']
-    }
-    return item_to_return
-
-
-@app.route(f'{API_PREFIX}/default_model_version')
-def get_default_model_version() -> str:
-    return api_config.DEFAULT_MD_VERSION
-
-
-@app.route(f'{API_PREFIX}/supported_model_versions')
-def get_supported_model_versions() -> str:
-    return jsonify(sorted(list(api_config.MD_VERSIONS_TO_REL_PATH.keys())))
-
-
-# %% undocumented endpoints
-
-def get_thread_names() -> list:
-    thread_names = []
-    for thread in threading.enumerate():
-        if thread.name.startswith('job_'):
-            thread_names.append(thread.name.split('_')[1])
-    return sorted(thread_names)
-
-
-@app.route(f'{API_PREFIX}/all_jobs')
-def get_all_jobs():
-    """List all Jobs being monitored since this API instance started"""
-    thread_names = get_thread_names()
-    return jsonify(thread_names)
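Note: the removed request_detections endpoint above validated a JSON POST body field by field (caller allowlisting, SAS checks, model_version, request_name). As a reading aid only, here is a minimal client sketch; the host name and caller ID are hypothetical, and the field set is taken directly from the validation code above:

import requests

# Hypothetical deployment URL; API_PREFIX comes from server_api_config.py.
url = 'http://example-host/v4/camera-trap/detection-batch/request_detections'

body = {
    'caller': 'example_caller',  # hypothetical; must be on the allowlist
    'input_container_sas': 'https://<account>.blob.core.windows.net/<container>?<sas>',
    'model_version': '4.1',      # must be a key of MD_VERSIONS_TO_REL_PATH
    'request_name': 'example_run',  # <= 92 chars; only letters, digits, - and _
}

resp = requests.post(url, json=body)
print(resp.json())  # on success: {'request_id': '<hex job id>'}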
--- a/megadetector/api/batch_processing/api_core/server_api_config.py
+++ /dev/null
@@ -1,97 +0,0 @@
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-
-"""
-A module to hold the configurations specific to an instance of the API.
-"""
-
-import os
-
-
-#%% instance-specific API settings
-# you likely need to modify these when deploying a new instance of the API
-
-API_INSTANCE_NAME = 'cm'  # 'internal', 'cm', 'camelot', 'zooniverse'
-POOL_ID = 'cm_1'  # name of the Batch pool created for this API instance
-
-MAX_NUMBER_IMAGES_ACCEPTED_PER_JOB = 4 * 1000 * 1000  # inclusive
-
-# Azure Batch for batch processing
-BATCH_ACCOUNT_NAME = 'cameratrapssc'
-BATCH_ACCOUNT_URL = 'https://cameratrapssc.southcentralus.batch.azure.com'
-
-
-#%% general API settings
-API_PREFIX = '/v4/camera-trap/detection-batch'  # URL to root is http://127.0.0.1:5000/v4/camera-trap/detection-batch/
-
-MONITOR_PERIOD_MINUTES = 10
-
-# if this number of times the thread wakes up to check is exceeded, stop the monitoring thread
-MAX_MONITOR_CYCLES = 4 * 7 * int((60 * 24) / MONITOR_PERIOD_MINUTES)  # 4 weeks
-
-IMAGE_SUFFIXES_ACCEPTED = ('.jpg', '.jpeg', '.png')  # case-insensitive
-assert isinstance(IMAGE_SUFFIXES_ACCEPTED, tuple)
-
-OUTPUT_FORMAT_VERSION = '1.1'
-
-NUM_IMAGES_PER_TASK = 2000
-
-OUTPUT_SAS_EXPIRATION_DAYS = 180
-
-# quota of active Jobs in our Batch account, which all node pools i.e. API instances share;
-# cannot accept job submissions if there are this many active Jobs already
-MAX_BATCH_ACCOUNT_ACTIVE_JOBS = 300
-
-
-#%% MegaDetector info
-DETECTION_CONF_THRESHOLD = 0.1
-
-# relative to the `megadetector_copies` folder in the container `models`
-MD_VERSIONS_TO_REL_PATH = {
-    '4.1': 'megadetector_v4_1/md_v4.1.0.pb',
-    '3': 'megadetector_v3/megadetector_v3_tf19.pb',
-    '2': 'megadetector_v2/frozen_inference_graph.pb'
-}
-DEFAULT_MD_VERSION = '4.1'
-assert DEFAULT_MD_VERSION in MD_VERSIONS_TO_REL_PATH
-
-# copied from TFDetector class in detection/run_detector.py
-DETECTOR_LABEL_MAP = {
-    '1': 'animal',
-    '2': 'person',
-    '3': 'vehicle'
-}
-
-
-#%% Azure Batch settings
-NUM_TASKS_PER_SUBMISSION = 20  # max for the Python SDK without extension is 100
-
-NUM_TASKS_PER_RESUBMISSION = 5
-
-
-#%% env variables for service credentials, and info related to these services
-
-# Cosmos DB `batch-api-jobs` table for job status
-COSMOS_ENDPOINT = os.environ['COSMOS_ENDPOINT']
-COSMOS_WRITE_KEY = os.environ['COSMOS_WRITE_KEY']
-
-# Service principal of this "application", authorized to use Azure Batch
-APP_TENANT_ID = os.environ['APP_TENANT_ID']
-APP_CLIENT_ID = os.environ['APP_CLIENT_ID']
-APP_CLIENT_SECRET = os.environ['APP_CLIENT_SECRET']
-
-# Blob storage account for storing Batch tasks' outputs and scoring script
-STORAGE_ACCOUNT_NAME = os.environ['STORAGE_ACCOUNT_NAME']
-STORAGE_ACCOUNT_KEY = os.environ['STORAGE_ACCOUNT_KEY']
-
-# STORAGE_CONTAINER_MODELS = 'models'  # names of the two containers supporting Batch
-STORAGE_CONTAINER_API = 'batch-api'
-
-# Azure Container Registry for Docker image used by our Batch node pools
-REGISTRY_SERVER = os.environ['REGISTRY_SERVER']
-REGISTRY_PASSWORD = os.environ['REGISTRY_PASSWORD']
-CONTAINER_IMAGE_NAME = 'cameratracrsppftkje.azurecr.io/tensorflow:1.14.0-gpu-py3'
-
-# Azure App Configuration instance to get configurations specific to
-# this instance of the API
-APP_CONFIG_CONNECTION_STR = os.environ['APP_CONFIG_CONNECTION_STR']
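Note: as a sanity check on the monitoring constants above (plain arithmetic, not code from the package): the monitor thread wakes every MONITOR_PERIOD_MINUTES and gives up after MAX_MONITOR_CYCLES wake-ups, i.e. four weeks; the same two constants size the log-container SAS lifetime in server_batch_job_manager.py below:

MONITOR_PERIOD_MINUTES = 10
MAX_MONITOR_CYCLES = 4 * 7 * int((60 * 24) / MONITOR_PERIOD_MINUTES)
assert MAX_MONITOR_CYCLES == 4032            # 144 wake-ups/day * 28 days

access_duration_hrs = MONITOR_PERIOD_MINUTES * MAX_MONITOR_CYCLES / 60
assert access_duration_hrs == 672.0          # 28 days expressed in hours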
--- a/megadetector/api/batch_processing/api_core/server_app_config.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-
-"""
-A class wrapping the Azure App Configuration client to get configurations
-for each instance of the API.
-"""
-import logging
-import os
-
-from server_api_config import APP_CONFIG_CONNECTION_STR, API_INSTANCE_NAME
-
-from azure.appconfiguration import AzureAppConfigurationClient
-
-
-log = logging.getLogger(os.environ['FLASK_APP'])
-
-
-class AppConfig:
-    """Wrapper around the Azure App Configuration client"""
-
-    def __init__(self):
-        self.client = AzureAppConfigurationClient.from_connection_string(APP_CONFIG_CONNECTION_STR)
-
-        self.api_instance = API_INSTANCE_NAME
-
-        # sentinel should change if new configurations are available
-        self.sentinel = self._get_sentinel()  # get initial sentinel and allowlist values
-        self.allowlist = self._get_allowlist()
-
-    def _get_sentinel(self):
-        return self.client.get_configuration_setting(key='batch_api:sentinel').value
-
-    def _get_allowlist(self):
-        filtered_listed = self.client.list_configuration_settings(key_filter='batch_api_allow:*')
-        allowlist = []
-        for item in filtered_listed:
-            if item.value == self.api_instance:
-                allowlist.append(item.key.split('batch_api_allow:')[1])
-        return allowlist
-
-    def get_allowlist(self):
-        try:
-            cur_sentinel = self._get_sentinel()
-            if cur_sentinel == self.sentinel:
-                # configs have not changed
-                return self.allowlist
-            else:
-                self.sentinel = cur_sentinel
-                self.allowlist = self._get_allowlist()
-                return self.allowlist
-
-        except Exception as e:
-            log.error(f'AppConfig, get_allowlist, exception so using old allowlist: {e}')
-            return self.allowlist
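Note: the sentinel logic in AppConfig.get_allowlist() above is a generic change-detection cache: re-fetch the expensive key listing only when one cheap 'batch_api:sentinel' value changes. A minimal in-memory sketch of the same pattern, with no Azure dependency (illustrative only):

class SentinelCache:
    """Re-fetch an expensive value only when a cheap sentinel changes."""

    def __init__(self, get_sentinel, get_value):
        self._get_sentinel = get_sentinel   # cheap read (one config key)
        self._get_value = get_value         # expensive read (key listing)
        self.sentinel = get_sentinel()
        self.value = get_value()

    def get(self):
        current = self._get_sentinel()
        if current != self.sentinel:        # operators bumped the sentinel
            self.sentinel = current
            self.value = self._get_value()
        return self.value

# Usage mirrors AppConfig: after editing 'batch_api_allow:*' keys, an operator
# updates 'batch_api:sentinel', and every API instance refreshes its allowlist
# on the next get() call.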
--- a/megadetector/api/batch_processing/api_core/server_batch_job_manager.py
+++ /dev/null
@@ -1,220 +0,0 @@
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-
-"""
-A class wrapping the Azure Batch client.
-"""
-
-import logging
-import os
-import math
-from typing import Tuple
-from datetime import datetime, timedelta
-
-import sas_blob_utils
-from azure.storage.blob import ContainerClient, ContainerSasPermissions, generate_container_sas
-from azure.batch import BatchServiceClient
-from azure.batch.models import *
-from azure.common.credentials import ServicePrincipalCredentials
-
-import server_api_config as api_config
-
-
-# Gunicorn logger handler will get attached if needed in server.py
-log = logging.getLogger(os.environ['FLASK_APP'])
-
-
-class BatchJobManager:
-    """Wrapper around the Azure App Configuration client"""
-
-    def __init__(self):
-        credentials = ServicePrincipalCredentials(
-            client_id=api_config.APP_CLIENT_ID,
-            secret=api_config.APP_CLIENT_SECRET,
-            tenant=api_config.APP_TENANT_ID,
-            resource='https://batch.core.windows.net/'
-        )
-        self.batch_client = BatchServiceClient(credentials=credentials,
-                                               batch_url=api_config.BATCH_ACCOUNT_URL)
-
-    def create_job(self, job_id: str, detector_model_rel_path: str,
-                   input_container_sas: str, use_url: bool):
-        log.info(f'BatchJobManager, create_job, job_id: {job_id}')
-        job = JobAddParameter(
-            id=job_id,
-            pool_info=PoolInformation(pool_id=api_config.POOL_ID),
-
-            # set for all tasks in the job
-            common_environment_settings=[
-                EnvironmentSetting(name='DETECTOR_REL_PATH', value=detector_model_rel_path),
-                EnvironmentSetting(name='API_INSTANCE_NAME', value=api_config.API_INSTANCE_NAME),
-                EnvironmentSetting(name='JOB_CONTAINER_SAS', value=input_container_sas),
-                EnvironmentSetting(name='JOB_USE_URL', value=str(use_url)),
-                EnvironmentSetting(name='DETECTION_CONF_THRESHOLD', value=api_config.DETECTION_CONF_THRESHOLD)
-            ]
-        )
-        self.batch_client.job.add(job)
-
-    def submit_tasks(self, job_id: str, num_images: int) -> Tuple[int, list]:
-        """
-        Shard the images and submit each shard as a Task under the Job pointed to by this job_id
-        Args:
-            job_id: ID of the Batch Job to submit the tasks to
-            num_images: total number of images to be processed in this Job
-
-        Returns:
-            num_task: total number of Tasks that should be in this Job
-            task_ids_failed_to_submit: which Tasks from the above failed to be submitted
-        """
-        log.info('BatchJobManager, submit_tasks')
-
-        # cannot execute the scoring script that is in the mounted directory; has to be copied to cwd
-        # not luck giving the commandline arguments via formatted string - set as env vars instead
-        score_command = '/bin/bash -c \"cp $AZ_BATCH_NODE_MOUNTS_DIR/batch-api/scripts/score.py . && python score.py\" '
-
-        num_images_per_task = api_config.NUM_IMAGES_PER_TASK
-
-        # form shards of images and assign each shard to a Task
-        num_tasks = math.ceil(num_images / num_images_per_task)
-
-        # for persisting stdout and stderr
-        permissions = ContainerSasPermissions(read=True, write=True, list=True)
-        access_duration_hrs = api_config.MONITOR_PERIOD_MINUTES * api_config.MAX_MONITOR_CYCLES / 60
-        container_sas_token = generate_container_sas(
-            account_name=api_config.STORAGE_ACCOUNT_NAME,
-            container_name=api_config.STORAGE_CONTAINER_API,
-            account_key=api_config.STORAGE_ACCOUNT_KEY,
-            permission=permissions,
-            expiry=datetime.utcnow() + timedelta(hours=access_duration_hrs))
-        container_sas_url = sas_blob_utils.build_azure_storage_uri(
-            account=api_config.STORAGE_ACCOUNT_NAME,
-            container=api_config.STORAGE_CONTAINER_API,
-            sas_token=container_sas_token)
-
-        tasks = []
-        for task_id in range(num_tasks):
-            begin_index = task_id * num_images_per_task
-            end_index = begin_index + num_images_per_task
-
-            # persist stdout and stderr (will be removed when node removed)
-            # paths are relative to the Task working directory
-            stderr_destination = OutputFileDestination(
-                container=OutputFileBlobContainerDestination(
-                    container_url=container_sas_url,
-                    path=f'api_{api_config.API_INSTANCE_NAME}/job_{job_id}/task_logs/job_{job_id}_task_{task_id}_stderr.txt'
-                )
-            )
-            stdout_destination = OutputFileDestination(
-                container=OutputFileBlobContainerDestination(
-                    container_url=container_sas_url,
-                    path=f'api_{api_config.API_INSTANCE_NAME}/job_{job_id}/task_logs/job_{job_id}_task_{task_id}_stdout.txt'
-                )
-            )
-            std_err_and_out = [
-                OutputFile(
-                    file_pattern='../stderr.txt',  # stderr.txt is at the same level as wd
-                    destination=stderr_destination,
-                    upload_options=OutputFileUploadOptions(upload_condition=OutputFileUploadCondition.task_completion)
-                    # can also just upload on failure
-                ),
-                OutputFile(
-                    file_pattern='../stdout.txt',
-                    destination=stdout_destination,
-                    upload_options=OutputFileUploadOptions(upload_condition=OutputFileUploadCondition.task_completion)
-                )
-            ]
-
-            task = TaskAddParameter(
-                id=str(task_id),
-                command_line=score_command,
-                container_settings=TaskContainerSettings(
-                    image_name=api_config.CONTAINER_IMAGE_NAME,
-                    working_directory='taskWorkingDirectory'
-                ),
-                environment_settings=[
-                    EnvironmentSetting(name='TASK_BEGIN_INDEX', value=begin_index),
-                    EnvironmentSetting(name='TASK_END_INDEX', value=end_index),
-                ],
-                output_files=std_err_and_out
-            )
-            tasks.append(task)
-
-        # first try submitting Tasks
-        task_ids_failed_to_submit = self._create_tasks(job_id, tasks, api_config.NUM_TASKS_PER_SUBMISSION, 1)
-
-        # retry submitting Tasks
-        if len(task_ids_failed_to_submit) > 0:
-            task_ids_failed_to_submit_set = set(task_ids_failed_to_submit)
-            tasks_to_retry = [t for t in tasks if t.id in task_ids_failed_to_submit_set]
-            task_ids_failed_to_submit = self._create_tasks(job_id,
-                                                           tasks_to_retry,
-                                                           api_config.NUM_TASKS_PER_RESUBMISSION,
-                                                           2)
-
-            if len(task_ids_failed_to_submit) > 0:
-                log.info('BatchJobManager, submit_tasks, after retry, '
-                         f'len of task_ids_failed_to_submit: {len(task_ids_failed_to_submit)}')
-            else:
-                log.info('BatchJobManager, submit_tasks, after retry, all Tasks submitted')
-        else:
-            log.info('BatchJobManager, submit_tasks, all Tasks submitted after first try')
-
-        # Change the Job's on_all_tasks_complete option to 'terminateJob' so the Job's status changes automatically
-        # after all submitted tasks are done
-        # This is so that we do not take up the quota for active Jobs in the Batch account.
-        job_patch_params = JobPatchParameter(
-            on_all_tasks_complete=OnAllTasksComplete.terminate_job
-        )
-        self.batch_client.job.patch(job_id, job_patch_params)
-
-        return num_tasks, task_ids_failed_to_submit
-
-    def _create_tasks(self, job_id, tasks, num_tasks_per_submission, n_th_try) -> list:
-        task_ids_failed_to_submit = []
-
-        for i in range(0, len(tasks), num_tasks_per_submission):
-            tasks_to_submit = tasks[i: i + num_tasks_per_submission]
-
-            # return type: TaskAddCollectionResult
-            collection_results = self.batch_client.task.add_collection(job_id, tasks_to_submit, threads=10)
-
-            for task_result in collection_results.value:
-                if task_result.status is not TaskAddStatus.success:
-                    # actually we should probably only re-submit if it's a server_error
-                    task_ids_failed_to_submit.append(task_result.task_id)
-                    log.info(f'task {task_result.task_id} failed to submitted after {n_th_try} try/tries, '
-                             f'status: {task_result.status}, error: {task_result.error}')
-
-        return task_ids_failed_to_submit
-
-    def get_num_completed_tasks(self, job_id: str) -> Tuple[int, int]:
-        """
-        Returns the number of completed tasks for the job of job_id, as a tuple:
-        (number of succeeded jobs, number of failed jobs) - both are considered "completed".
-        """
-        # docs: https://docs.microsoft.com/en-us/rest/api/batchservice/odata-filters-in-batch#list-tasks
-        tasks = self.batch_client.task.list(job_id,
-                                            task_list_options=TaskListOptions(
-                                                filter='state eq \'completed\'',
-                                                select='id, executionInfo'  # only the id field will be non-empty
-                                            ))
-        num_succeeded, num_failed = 0, 0
-        for task in tasks:
-            exit_code: int = task.execution_info.exit_code
-            if exit_code == 0:
-                num_succeeded += 1
-            else:
-                num_failed += 1
-        return num_succeeded, num_failed
-
-    def cancel_batch_job(self, job_id: str):
-        self.batch_client.job.terminate(job_id, terminate_reason='APIUserCanceled')
-
-    def get_num_active_jobs(self) -> int:
-        jobs_generator = self.batch_client.job.list(
-            job_list_options=JobListOptions(
-                filter='state eq \'active\'',
-                select='id'
-            ))
-        jobs_list = [j for j in jobs_generator]
-        return len(jobs_list)
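Note: the sharding in submit_tasks() above is fixed-size chunking over image indices. A standalone sketch of the index math, reusing NUM_IMAGES_PER_TASK = 2000 from server_api_config.py; the end index of the last shard may overshoot num_images, which presumably the scoring script (score.py, deleted in this release but not shown in this diff) clips against the actual image list:

import math

NUM_IMAGES_PER_TASK = 2000  # value from server_api_config.py

def shard_indices(num_images):
    """Yield one (begin_index, end_index) pair per Batch Task."""
    num_tasks = math.ceil(num_images / NUM_IMAGES_PER_TASK)
    for task_id in range(num_tasks):
        begin = task_id * NUM_IMAGES_PER_TASK
        yield begin, begin + NUM_IMAGES_PER_TASK  # last end may overshoot

# e.g. 5000 images -> 3 tasks
assert list(shard_indices(5000)) == [(0, 2000), (2000, 4000), (4000, 6000)]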