megadetector 5.0.28__py3-none-any.whl → 10.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
The registry flags this version of megadetector as potentially problematic.
- megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +2 -2
- megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +1 -1
- megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +1 -1
- megadetector/classification/aggregate_classifier_probs.py +3 -3
- megadetector/classification/analyze_failed_images.py +5 -5
- megadetector/classification/cache_batchapi_outputs.py +5 -5
- megadetector/classification/create_classification_dataset.py +11 -12
- megadetector/classification/crop_detections.py +10 -10
- megadetector/classification/csv_to_json.py +8 -8
- megadetector/classification/detect_and_crop.py +13 -15
- megadetector/classification/efficientnet/model.py +8 -8
- megadetector/classification/efficientnet/utils.py +6 -5
- megadetector/classification/evaluate_model.py +7 -7
- megadetector/classification/identify_mislabeled_candidates.py +6 -6
- megadetector/classification/json_to_azcopy_list.py +1 -1
- megadetector/classification/json_validator.py +29 -32
- megadetector/classification/map_classification_categories.py +9 -9
- megadetector/classification/merge_classification_detection_output.py +12 -9
- megadetector/classification/prepare_classification_script.py +19 -19
- megadetector/classification/prepare_classification_script_mc.py +26 -26
- megadetector/classification/run_classifier.py +4 -4
- megadetector/classification/save_mislabeled.py +6 -6
- megadetector/classification/train_classifier.py +1 -1
- megadetector/classification/train_classifier_tf.py +9 -9
- megadetector/classification/train_utils.py +10 -10
- megadetector/data_management/annotations/annotation_constants.py +1 -2
- megadetector/data_management/camtrap_dp_to_coco.py +79 -46
- megadetector/data_management/cct_json_utils.py +103 -103
- megadetector/data_management/cct_to_md.py +49 -49
- megadetector/data_management/cct_to_wi.py +33 -33
- megadetector/data_management/coco_to_labelme.py +75 -75
- megadetector/data_management/coco_to_yolo.py +210 -193
- megadetector/data_management/databases/add_width_and_height_to_db.py +86 -12
- megadetector/data_management/databases/combine_coco_camera_traps_files.py +40 -40
- megadetector/data_management/databases/integrity_check_json_db.py +228 -200
- megadetector/data_management/databases/subset_json_db.py +33 -33
- megadetector/data_management/generate_crops_from_cct.py +88 -39
- megadetector/data_management/get_image_sizes.py +54 -49
- megadetector/data_management/labelme_to_coco.py +133 -125
- megadetector/data_management/labelme_to_yolo.py +159 -73
- megadetector/data_management/lila/create_lila_blank_set.py +81 -83
- megadetector/data_management/lila/create_lila_test_set.py +32 -31
- megadetector/data_management/lila/create_links_to_md_results_files.py +18 -18
- megadetector/data_management/lila/download_lila_subset.py +21 -24
- megadetector/data_management/lila/generate_lila_per_image_labels.py +365 -107
- megadetector/data_management/lila/get_lila_annotation_counts.py +35 -33
- megadetector/data_management/lila/get_lila_image_counts.py +22 -22
- megadetector/data_management/lila/lila_common.py +73 -70
- megadetector/data_management/lila/test_lila_metadata_urls.py +28 -19
- megadetector/data_management/mewc_to_md.py +344 -340
- megadetector/data_management/ocr_tools.py +262 -255
- megadetector/data_management/read_exif.py +249 -227
- megadetector/data_management/remap_coco_categories.py +90 -28
- megadetector/data_management/remove_exif.py +81 -21
- megadetector/data_management/rename_images.py +187 -187
- megadetector/data_management/resize_coco_dataset.py +588 -120
- megadetector/data_management/speciesnet_to_md.py +41 -41
- megadetector/data_management/wi_download_csv_to_coco.py +55 -55
- megadetector/data_management/yolo_output_to_md_output.py +248 -122
- megadetector/data_management/yolo_to_coco.py +333 -191
- megadetector/detection/change_detection.py +832 -0
- megadetector/detection/process_video.py +340 -337
- megadetector/detection/pytorch_detector.py +358 -278
- megadetector/detection/run_detector.py +399 -186
- megadetector/detection/run_detector_batch.py +404 -377
- megadetector/detection/run_inference_with_yolov5_val.py +340 -327
- megadetector/detection/run_tiled_inference.py +257 -249
- megadetector/detection/tf_detector.py +24 -24
- megadetector/detection/video_utils.py +332 -295
- megadetector/postprocessing/add_max_conf.py +19 -11
- megadetector/postprocessing/categorize_detections_by_size.py +45 -45
- megadetector/postprocessing/classification_postprocessing.py +468 -433
- megadetector/postprocessing/combine_batch_outputs.py +23 -23
- megadetector/postprocessing/compare_batch_results.py +590 -525
- megadetector/postprocessing/convert_output_format.py +106 -102
- megadetector/postprocessing/create_crop_folder.py +347 -147
- megadetector/postprocessing/detector_calibration.py +173 -168
- megadetector/postprocessing/generate_csv_report.py +508 -499
- megadetector/postprocessing/load_api_results.py +48 -27
- megadetector/postprocessing/md_to_coco.py +133 -102
- megadetector/postprocessing/md_to_labelme.py +107 -90
- megadetector/postprocessing/md_to_wi.py +40 -40
- megadetector/postprocessing/merge_detections.py +92 -114
- megadetector/postprocessing/postprocess_batch_results.py +319 -301
- megadetector/postprocessing/remap_detection_categories.py +91 -38
- megadetector/postprocessing/render_detection_confusion_matrix.py +214 -205
- megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +57 -57
- megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +27 -28
- megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +704 -679
- megadetector/postprocessing/separate_detections_into_folders.py +226 -211
- megadetector/postprocessing/subset_json_detector_output.py +265 -262
- megadetector/postprocessing/top_folders_to_bottom.py +45 -45
- megadetector/postprocessing/validate_batch_results.py +70 -70
- megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +52 -52
- megadetector/taxonomy_mapping/map_new_lila_datasets.py +18 -19
- megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +54 -33
- megadetector/taxonomy_mapping/preview_lila_taxonomy.py +67 -67
- megadetector/taxonomy_mapping/retrieve_sample_image.py +16 -16
- megadetector/taxonomy_mapping/simple_image_download.py +8 -8
- megadetector/taxonomy_mapping/species_lookup.py +156 -74
- megadetector/taxonomy_mapping/taxonomy_csv_checker.py +14 -14
- megadetector/taxonomy_mapping/taxonomy_graph.py +10 -10
- megadetector/taxonomy_mapping/validate_lila_category_mappings.py +13 -13
- megadetector/utils/ct_utils.py +1049 -211
- megadetector/utils/directory_listing.py +21 -77
- megadetector/utils/gpu_test.py +22 -22
- megadetector/utils/md_tests.py +632 -529
- megadetector/utils/path_utils.py +1520 -431
- megadetector/utils/process_utils.py +41 -41
- megadetector/utils/split_locations_into_train_val.py +62 -62
- megadetector/utils/string_utils.py +148 -27
- megadetector/utils/url_utils.py +489 -176
- megadetector/utils/wi_utils.py +2658 -2526
- megadetector/utils/write_html_image_list.py +137 -137
- megadetector/visualization/plot_utils.py +34 -30
- megadetector/visualization/render_images_with_thumbnails.py +39 -74
- megadetector/visualization/visualization_utils.py +487 -435
- megadetector/visualization/visualize_db.py +232 -198
- megadetector/visualization/visualize_detector_output.py +82 -76
- {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/METADATA +5 -2
- megadetector-10.0.0.dist-info/RECORD +139 -0
- {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/WHEEL +1 -1
- megadetector/api/batch_processing/api_core/__init__.py +0 -0
- megadetector/api/batch_processing/api_core/batch_service/__init__.py +0 -0
- megadetector/api/batch_processing/api_core/batch_service/score.py +0 -439
- megadetector/api/batch_processing/api_core/server.py +0 -294
- megadetector/api/batch_processing/api_core/server_api_config.py +0 -97
- megadetector/api/batch_processing/api_core/server_app_config.py +0 -55
- megadetector/api/batch_processing/api_core/server_batch_job_manager.py +0 -220
- megadetector/api/batch_processing/api_core/server_job_status_table.py +0 -149
- megadetector/api/batch_processing/api_core/server_orchestration.py +0 -360
- megadetector/api/batch_processing/api_core/server_utils.py +0 -88
- megadetector/api/batch_processing/api_core_support/__init__.py +0 -0
- megadetector/api/batch_processing/api_core_support/aggregate_results_manually.py +0 -46
- megadetector/api/batch_processing/api_support/__init__.py +0 -0
- megadetector/api/batch_processing/api_support/summarize_daily_activity.py +0 -152
- megadetector/api/batch_processing/data_preparation/__init__.py +0 -0
- megadetector/api/synchronous/__init__.py +0 -0
- megadetector/api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
- megadetector/api/synchronous/api_core/animal_detection_api/api_backend.py +0 -151
- megadetector/api/synchronous/api_core/animal_detection_api/api_frontend.py +0 -263
- megadetector/api/synchronous/api_core/animal_detection_api/config.py +0 -35
- megadetector/api/synchronous/api_core/tests/__init__.py +0 -0
- megadetector/api/synchronous/api_core/tests/load_test.py +0 -110
- megadetector/data_management/importers/add_nacti_sizes.py +0 -52
- megadetector/data_management/importers/add_timestamps_to_icct.py +0 -79
- megadetector/data_management/importers/animl_results_to_md_results.py +0 -158
- megadetector/data_management/importers/auckland_doc_test_to_json.py +0 -373
- megadetector/data_management/importers/auckland_doc_to_json.py +0 -201
- megadetector/data_management/importers/awc_to_json.py +0 -191
- megadetector/data_management/importers/bellevue_to_json.py +0 -272
- megadetector/data_management/importers/cacophony-thermal-importer.py +0 -793
- megadetector/data_management/importers/carrizo_shrubfree_2018.py +0 -269
- megadetector/data_management/importers/carrizo_trail_cam_2017.py +0 -289
- megadetector/data_management/importers/cct_field_adjustments.py +0 -58
- megadetector/data_management/importers/channel_islands_to_cct.py +0 -913
- megadetector/data_management/importers/eMammal/copy_and_unzip_emammal.py +0 -180
- megadetector/data_management/importers/eMammal/eMammal_helpers.py +0 -249
- megadetector/data_management/importers/eMammal/make_eMammal_json.py +0 -223
- megadetector/data_management/importers/ena24_to_json.py +0 -276
- megadetector/data_management/importers/filenames_to_json.py +0 -386
- megadetector/data_management/importers/helena_to_cct.py +0 -283
- megadetector/data_management/importers/idaho-camera-traps.py +0 -1407
- megadetector/data_management/importers/idfg_iwildcam_lila_prep.py +0 -294
- megadetector/data_management/importers/import_desert_lion_conservation_camera_traps.py +0 -387
- megadetector/data_management/importers/jb_csv_to_json.py +0 -150
- megadetector/data_management/importers/mcgill_to_json.py +0 -250
- megadetector/data_management/importers/missouri_to_json.py +0 -490
- megadetector/data_management/importers/nacti_fieldname_adjustments.py +0 -79
- megadetector/data_management/importers/noaa_seals_2019.py +0 -181
- megadetector/data_management/importers/osu-small-animals-to-json.py +0 -364
- megadetector/data_management/importers/pc_to_json.py +0 -365
- megadetector/data_management/importers/plot_wni_giraffes.py +0 -123
- megadetector/data_management/importers/prepare_zsl_imerit.py +0 -131
- megadetector/data_management/importers/raic_csv_to_md_results.py +0 -416
- megadetector/data_management/importers/rspb_to_json.py +0 -356
- megadetector/data_management/importers/save_the_elephants_survey_A.py +0 -320
- megadetector/data_management/importers/save_the_elephants_survey_B.py +0 -329
- megadetector/data_management/importers/snapshot_safari_importer.py +0 -758
- megadetector/data_management/importers/snapshot_serengeti_lila.py +0 -1067
- megadetector/data_management/importers/snapshotserengeti/make_full_SS_json.py +0 -150
- megadetector/data_management/importers/snapshotserengeti/make_per_season_SS_json.py +0 -153
- megadetector/data_management/importers/sulross_get_exif.py +0 -65
- megadetector/data_management/importers/timelapse_csv_set_to_json.py +0 -490
- megadetector/data_management/importers/ubc_to_json.py +0 -399
- megadetector/data_management/importers/umn_to_json.py +0 -507
- megadetector/data_management/importers/wellington_to_json.py +0 -263
- megadetector/data_management/importers/wi_to_json.py +0 -442
- megadetector/data_management/importers/zamba_results_to_md_results.py +0 -180
- megadetector/data_management/lila/add_locations_to_island_camera_traps.py +0 -101
- megadetector/data_management/lila/add_locations_to_nacti.py +0 -151
- megadetector/utils/azure_utils.py +0 -178
- megadetector/utils/sas_blob_utils.py +0 -509
- megadetector-5.0.28.dist-info/RECORD +0 -209
- /megadetector/{api/batch_processing/__init__.py → __init__.py} +0 -0
- {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/licenses/LICENSE +0 -0
- {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/top_level.txt +0 -0
megadetector/utils/process_utils.py
(… marks old-side text that was truncated in this rendering; paired blank -/+ lines differ only in trailing whitespace.)

@@ -18,33 +18,33 @@ import subprocess
 def execute(cmd,encoding=None,errors=None,env=None,verbose=False):
     """
     Run [cmd] (a single string) in a shell, yielding each line of output to the caller.
-
+
     The "encoding", "errors", and "env" parameters are passed directly to subprocess.Popen().
-
+
     "verbose" only impacts output about process management, it is not related to printing
     output from the child process.
-
+
     Args:
         cmd (str): command to run
         encoding (str, optional): stdout encoding, see Popen() documentation
         errors (str, optional): error handling, see Popen() documentation
         env (dict, optional): environment variables, see Popen() documentation
         verbose (bool, optional): enable additional debug console output
-
+
     Returns:
-        int: the command's return code, always zero, otherwise a CalledProcessError is raised
+        int: the command's return code, always zero, otherwise a CalledProcessError is raised
     """
-
+
     os.environ["PYTHONUNBUFFERED"] = "1"
-
-    if verbose:
+
+    if verbose:
         if encoding is not None:
             print('Launching child process with non-default encoding {}'.format(encoding))
         if errors is not None:
             print('Launching child process with non-default text error handling {}'.format(errors))
         if env is not None:
             print('Launching child process with non-default environment {}'.format(str(env)))
-
+
     # https://stackoverflow.com/questions/4417546/constantly-print-subprocess-output-while-process-is-running
     popen = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                              shell=True, universal_newlines=True, encoding=encoding,

@@ -55,7 +55,7 @@ def execute(cmd,encoding=None,errors=None,env=None,verbose=False):
     return_code = popen.wait()
     if return_code:
         raise subprocess.CalledProcessError(return_code, cmd)
-
+
     return return_code


@@ -70,15 +70,15 @@ def execute_and_print(cmd,
     """
     Run [cmd] (a single string) in a shell, capturing and printing output. Returns
     a dictionary with fields "status" and "output".
-
+
     The "encoding", "errors", and "env" parameters are passed directly to subprocess.Popen().
-
+
     "verbose" only impacts output about process management, it is not related to printing
     output from the child process.
-
+
     Args:
         cmd (str): command to run
-        print_output (bool, optional): whether to print output from [cmd] (stdout is
+        print_output (bool, optional): whether to print output from [cmd] (stdout is
             captured regardless of the value of print_output)
         encoding (str, optional): stdout encoding, see Popen() documentation
         errors (str, optional): error handling, see Popen() documentation

@@ -86,15 +86,15 @@ def execute_and_print(cmd,
         verbose (bool, optional): enable additional debug console output
         catch_exceptions (bool, optional): catch exceptions and include in the output, otherwise raise
         echo_command (bool, optional): print the command before executing
-
+
     Returns:
         dict: a dictionary with fields "status" (the process return code) and "output"
-            (the content of stdout)
+            (the content of stdout)
     """

     if echo_command:
         print('Running command:\n{}\n'.format(cmd))
-
+
     to_return = {'status':'unknown','output':''}
     output = []
     try:

@@ -109,64 +109,64 @@ def execute_and_print(cmd,
             print('execute_and_print caught error: {} ({})'.format(cpe.output,str(cpe)))
         to_return['status'] = cpe.returncode
         to_return['output'] = output
-
+
     return to_return


 #%% Single-threaded test driver for execute_and_print

 if False:
-
+
     pass

     #%%
-
+
     if os.name == 'nt':
-        execute_and_print('echo hello && ping -n 5 127.0.0.1 && echo goodbye')
+        execute_and_print('echo hello && ping -n 5 127.0.0.1 && echo goodbye')
     else:
-        execute_and_print('echo hello && sleep 1 && echo goodbye')
-
+        execute_and_print('echo hello && sleep 1 && echo goodbye')
+

 #%% Parallel test driver for execute_and_print

 if False:
-
+
     pass

     #%%
-
+
     from functools import partial
     from multiprocessing.pool import ThreadPool as ThreadPool
     from multiprocessing.pool import Pool as Pool
-
+
     n_workers = 10
-
+
     # Should we use threads (vs. processes) for parallelization?
     use_threads = True
-
+
     test_data = ['a','b','c','d']
-
-    def …
+
+    def _process_sample(s):
        return execute_and_print('echo ' + s,True)
-
-    if n_workers == 1:
-
+
+    if n_workers == 1:
+
        results = []
-        for i_sample,sample in enumerate(test_data):
-            results.append(…
-
+        for i_sample,sample in enumerate(test_data):
+            results.append(_process_sample(sample))
+
     else:
-
+
        n_threads = min(n_workers,len(test_data))
-
+
        if use_threads:
            print('Starting parallel thread pool with {} workers'.format(n_threads))
            pool = ThreadPool(n_threads)
        else:
            print('Starting parallel process pool with {} workers'.format(n_threads))
            pool = Pool(n_threads)
-
-        results = list(pool.map(partial(…
-
+
+        results = list(pool.map(partial(_process_sample),test_data))
+
     for r in results:
        print(r)
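For orientation, here is a minimal usage sketch of the two functions touched above. The generator behavior of execute() and the {'status', 'output'} return shape of execute_and_print() come from the docstrings in this diff; the example commands are illustrative only.

# Assumes megadetector >= 10.0.0 is installed
from megadetector.utils.process_utils import execute, execute_and_print

# execute() yields child-process output line by line, then returns the
# process's return code (raising CalledProcessError on a nonzero code)
for line in execute('echo hello'):
    print(line, end='')

# execute_and_print() captures stdout and returns a dict with fields
# "status" (the return code) and "output" (the captured lines)
result = execute_and_print('echo hello', print_output=True)
print(result['status'])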
megadetector/utils/split_locations_into_train_val.py

@@ -4,8 +4,8 @@ split_locations_into_train_val.py

 Splits a list of location IDs into training and validation, targeting a specific
 train/val split for each category, but allowing some categories to be tighter or looser
-than others. Does nothing particularly clever, just randomly splits locations into
-train/val lots of times using the target val fraction, and picks the one that meets the
+than others. Does nothing particularly clever, just randomly splits locations into
+train/val lots of times using the target val fraction, and picks the one that meets the
 specified constraints and minimizes weighted error, where "error" is defined as the
 sum of each class's absolute divergence from the target val fraction.

@@ -26,63 +26,63 @@ from tqdm import tqdm
 def split_locations_into_train_val(location_to_category_counts,
                                    n_random_seeds=10000,
                                    target_val_fraction=0.15,
-                                   category_to_max_allowable_error=None,
+                                   category_to_max_allowable_error=None,
                                    category_to_error_weight=None,
                                    default_max_allowable_error=0.1,
                                    require_complete_coverage=True):
     """
     Splits a list of location IDs into training and validation, targeting a specific
     train/val split for each category, but allowing some categories to be tighter or looser
-    than others. Does nothing particularly clever, just randomly splits locations into
-    train/val lots of times using the target val fraction, and picks the one that meets the
+    than others. Does nothing particularly clever, just randomly splits locations into
+    train/val lots of times using the target val fraction, and picks the one that meets the
     specified constraints and minimizes weighted error, where "error" is defined as the
-    sum of each class's absolute divergence from the target val fraction.
-
+    sum of each class's absolute divergence from the target val fraction.
+
     Args:
         location_to_category_counts (dict): a dict mapping location IDs to dicts,
-            with each dict mapping a category name to a count. Any categories not present
+            with each dict mapping a category name to a count. Any categories not present
             in a particular dict are assumed to have a count of zero for that location.
-
+
             For example:
-
+
             .. code-block:: none

                {'location-000': {'bear':4,'wolf':10},
                 'location-001': {'bear':12,'elk':20}}
-
+
         n_random_seeds (int, optional): number of random seeds to try, always starting from zero
         target_val_fraction (float, optional): fraction of images containing each species we'd
             like to put in the val split
         category_to_max_allowable_error (dict, optional): a dict mapping category names
             to maximum allowable errors. These are hard constraints (i.e., we will error
-            if we can't meet them). Does not need to include all categories; categories not
+            if we can't meet them). Does not need to include all categories; categories not
             included will be assigned a maximum error according to [default_max_allowable_error].
             If this is None, no hard constraints are applied.
         category_to_error_weight (dict, optional): a dict mapping category names to
             error weights. You can specify a subset of categories; categories not included here
             have a weight of 1.0. If None, all categories have the same weight.
-        default_max_allowable_error (float, optional): the maximum allowable error for categories not
-            present in [category_to_max_allowable_error]. Set to None (or >= 1.0) to disable hard
+        default_max_allowable_error (float, optional): the maximum allowable error for categories not
+            present in [category_to_max_allowable_error]. Set to None (or >= 1.0) to disable hard
             constraints for categories not present in [category_to_max_allowable_error]
-        require_complete_coverage (bool, optional): require that every category appear in both train
-            val
-
+        require_complete_coverage (bool, optional): require that every category appear in both train
+            and val
+
     Returns:
         tuple: A two-element tuple:
           - list of location IDs in the val split
-          - a dict mapping category names to the fraction of images in the val split
+          - a dict mapping category names to the fraction of images in the val split
     """
-
+
     location_ids = list(location_to_category_counts.keys())
-
+
     n_val_locations = int(target_val_fraction*len(location_ids))
-
+
     if category_to_max_allowable_error is None:
         category_to_max_allowable_error = {}
-
+
     if category_to_error_weight is None:
         category_to_error_weight = {}
-
+
     # category ID to total count; the total count is used only for printouts
     category_id_to_count = {}
     for location_id in location_to_category_counts:

@@ -91,28 +91,28 @@ def split_locations_into_train_val(location_to_category_counts,
             category_id_to_count[category_id] = 0
         category_id_to_count[category_id] += \
             location_to_category_counts[location_id][category_id]
-
+
     category_ids = set(category_id_to_count.keys())
-
+
     print('Splitting {} categories over {} locations'.format(
         len(category_ids),len(location_ids)))
-
+
     # random_seed = 0
     def compute_seed_errors(random_seed):
         """
         Computes the per-category error for a specific random seed.
-
+
         returns weighted_average_error,category_to_val_fraction
         """
-
+
         # Randomly split into train/val
         random.seed(random_seed)
         val_locations = random.sample(location_ids,k=n_val_locations)
         val_locations_set = set(val_locations)
-
+
         # For each category, measure the % of images that went into the val set
         category_to_val_fraction = defaultdict(float)
-
+
         for category_id in category_ids:
             category_val_count = 0
             category_train_count = 0

@@ -127,44 +127,44 @@ def split_locations_into_train_val(location_to_category_counts,
                     category_train_count += location_category_count
             category_val_fraction = category_val_count / (category_val_count + category_train_count)
             category_to_val_fraction[category_id] = category_val_fraction
-
+
         # Absolute deviation from the target val fraction for each category
         category_errors = {}
         weighted_category_errors = {}
-
+
         # category = next(iter(category_to_val_fraction))
         for category in category_to_val_fraction:
-
+
             category_val_fraction = category_to_val_fraction[category]
-
+
             category_error = abs(category_val_fraction-target_val_fraction)
             category_errors[category] = category_error
-
+
             category_weight = 1.0
             if category in category_to_error_weight:
                 category_weight = category_to_error_weight[category]
             weighted_category_error = category_error * category_weight
             weighted_category_errors[category] = weighted_category_error
-
+
         weighted_average_error = np.mean(list(weighted_category_errors.values()))
-
+
         return weighted_average_error,weighted_category_errors,category_to_val_fraction
-
+
     # ... def compute_seed_errors(...)
-
+
     # This will only include random seeds that satisfy the hard constraints
     random_seed_to_weighted_average_error = {}
-
+
     # random_seed = 0
     for random_seed in tqdm(range(0,n_random_seeds)):
-
+
         weighted_average_error,weighted_category_errors,category_to_val_fraction = \
             compute_seed_errors(random_seed)
-
+
         seed_satisfies_hard_constraints = True
-
+
         for category in category_to_val_fraction:
-            if category in category_to_max_allowable_error:
+            if category in category_to_max_allowable_error:
                 max_allowable_error = category_to_max_allowable_error[category]
             else:
                 if default_max_allowable_error is None:

@@ -183,59 +183,59 @@ def split_locations_into_train_val(location_to_category_counts,
             if category_error > max_allowable_error:
                 seed_satisfies_hard_constraints = False
                 break
-
+
         # ...for each category
-
-        if seed_satisfies_hard_constraints:
+
+        if seed_satisfies_hard_constraints:
             random_seed_to_weighted_average_error[random_seed] = weighted_average_error
-
+
     # ...for each random seed
-
+
     assert len(random_seed_to_weighted_average_error) > 0, \
         'No random seed met all the hard constraints'
-
+
     print('\n{} of {} random seeds satisfied hard constraints'.format(
         len(random_seed_to_weighted_average_error),n_random_seeds))
-
+
     min_error = None
     min_error_seed = None
-
+
     for random_seed in random_seed_to_weighted_average_error.keys():
         error_metric = random_seed_to_weighted_average_error[random_seed]
         if min_error is None or error_metric < min_error:
             min_error = error_metric
             min_error_seed = random_seed
-
+
     random.seed(min_error_seed)
     val_locations = random.sample(location_ids,k=n_val_locations)
     train_locations = []
     for location_id in location_ids:
         if location_id not in val_locations:
             train_locations.append(location_id)
-
-    print('\nVal locations:\n')
+
+    print('\nVal locations:\n')
     for loc in val_locations:
         print('{}'.format(loc))
     print('')
-
+
     weighted_average_error,weighted_category_errors,category_to_val_fraction = \
         compute_seed_errors(min_error_seed)
-
+
     random_seed = min_error_seed
-
+
     category_to_val_fraction = sort_dictionary_by_value(category_to_val_fraction,reverse=True)
     category_to_val_fraction = sort_dictionary_by_value(category_to_val_fraction,
                                                         sort_values=category_id_to_count,
                                                         reverse=True)
-
-
+
+
     print('Val fractions by category:\n')
-
+
     for category in category_to_val_fraction:
         print('{} ({}) {:.2f}'.format(
             category,category_id_to_count[category],
             category_to_val_fraction[category]))
-
+
     return val_locations,category_to_val_fraction

 # ...def split_locations_into_train_val(...)
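To make the contract above concrete, here is an illustrative call. The input shape, parameter names, and return tuple come from the docstring in this diff; the synthetic data is made up, and with a real dataset you would pass your own per-location counts.

import random

from megadetector.utils.split_locations_into_train_val import \
    split_locations_into_train_val

# Synthetic example: 100 locations, each with random 'bear' and 'wolf' counts
location_to_category_counts = {}
for i_location in range(100):
    location_to_category_counts['location-{:03d}'.format(i_location)] = \
        {'bear': random.randint(0, 10), 'wolf': random.randint(0, 10)}

# Put ~15% of each category's images in val, tolerating up to 10% deviation
val_locations, category_to_val_fraction = split_locations_into_train_val(
    location_to_category_counts,
    n_random_seeds=1000,
    target_val_fraction=0.15,
    default_max_allowable_error=0.1)

print('Chose {} val locations'.format(len(val_locations)))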
megadetector/utils/string_utils.py

@@ -14,16 +14,19 @@ import re
 #%% Functions

 def is_float(s):
-    """
+    """
     Checks whether [s] is an object (typically a string) that can be cast to a float
-
+
     Args:
         s (object): object to evaluate
-
+
     Returns:
         bool: True if s successfully casts to a float, otherwise False
     """
-
+
+    if s is None:
+        return False
+
     try:
         _ = float(s)
     except ValueError:

@@ -36,57 +39,175 @@ def human_readable_to_bytes(size):
     Given a human-readable byte string (e.g. 2G, 10GB, 30MB, 20KB),
     returns the number of bytes. Will return 0 if the argument has
     unexpected form.
-
+
     https://gist.github.com/beugley/ccd69945346759eb6142272a6d69b4e0
-
+
     Args:
         size (str): string representing a size
-
+
     Returns:
         int: the corresponding size in bytes
     """
-
+
     size = re.sub(r'\s+', '', size)
-
+
+    if not size:  # Handle empty string case after stripping spaces
+        return 0
+
     if (size[-1] == 'B'):
         size = size[:-1]
-
+
+    if not size:  # Handle case where size was just "B"
+        return 0
+
     if (size.isdigit()):
-        …
+        bytes_val = int(size)  # Renamed to avoid conflict with built-in 'bytes'
     elif (is_float(size)):
-        …
+        bytes_val = float(size)  # Renamed
     else:
-        …
-        …
-        …
-        …
+        # Handle cases like "1KB" where size[:-1] might be "1K" before this block.
+        # The original code would try to float("1K") which fails.
+        # Need to separate numeric part from unit more carefully.
+        numeric_part = ''
+        unit_part = ''
+
+        # Iterate from the end to find the unit (K, M, G, T)
+        # This handles cases like "10KB" or "2.5GB"
+        for i in range(len(size) -1, -1, -1):
+            if size[i].isalpha():
+                unit_part = size[i] + unit_part
+            else:
+                numeric_part = size[:i+1]
+                break
+
+        # If no unit found, or numeric part is empty after stripping unit
+        if not unit_part or not numeric_part:
+            return 0
+
+        try:
+            bytes_val = float(numeric_part)
+            unit = unit_part
             if (unit == 'T'):
-                …
+                bytes_val *= 1024*1024*1024*1024
             elif (unit == 'G'):
-                …
+                bytes_val *= 1024*1024*1024
             elif (unit == 'M'):
-                …
+                bytes_val *= 1024*1024
             elif (unit == 'K'):
-                …
+                bytes_val *= 1024
             else:
-                …
+                # If it's a known unit (like 'B' already stripped) but not T/G/M/K,
+                # and it was floatable, it's just bytes.  If it's an unknown unit, it's
+                # an error.
+                if unit not in ['B', '']:  # 'B' was stripped, '' means just a number
+                    bytes_val = 0
         except ValueError:
-            …
-            …
-            return …
+            bytes_val = 0
+
+    return bytes_val


 def remove_ansi_codes(s):
     """
     Removes ANSI escape codes from a string.
-
+
     https://stackoverflow.com/questions/14693701/how-can-i-remove-the-ansi-escape-sequences-from-a-string-in-python#14693789
-
+
     Args:
         s (str): the string to de-ANSI-i-fy
-
+
     Returns:
         str: A copy of [s] without ANSI codes
     """
+
     ansi_escape = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')
     return ansi_escape.sub('', s)
+
+
+#%% Tests
+
+
+class TestStringUtils:
+    """
+    Tests for string_utils.py
+    """
+
+
+    def test_is_float(self):
+        """
+        Test the is_float function.
+        """
+
+        assert is_float("1.23")
+        assert is_float("-0.5")
+        assert is_float("0")
+        assert is_float(1.23)
+        assert is_float(0)
+        assert not is_float("abc")
+        assert not is_float("1.2.3")
+        assert not is_float("")
+        assert not is_float(None)
+        assert not is_float("1,23")
+
+
+    def test_human_readable_to_bytes(self):
+        """
+        Test the human_readable_to_bytes function.
+        """
+
+        assert human_readable_to_bytes("10B") == 10
+        assert human_readable_to_bytes("10") == 10
+        assert human_readable_to_bytes("1K") == 1024
+        assert human_readable_to_bytes("1KB") == 1024
+        assert human_readable_to_bytes("1M") == 1024*1024
+        assert human_readable_to_bytes("1MB") == 1024*1024
+        assert human_readable_to_bytes("1G") == 1024*1024*1024
+        assert human_readable_to_bytes("1GB") == 1024*1024*1024
+        assert human_readable_to_bytes("1T") == 1024*1024*1024*1024
+        assert human_readable_to_bytes("1TB") == 1024*1024*1024*1024
+
+        assert human_readable_to_bytes("2.5K") == 2.5 * 1024
+        assert human_readable_to_bytes("0.5MB") == 0.5 * 1024 * 1024
+
+        # Test with spaces
+        assert human_readable_to_bytes(" 2 G ") == 2 * 1024*1024*1024
+        assert human_readable_to_bytes("500 KB") == 500 * 1024
+
+        # Invalid inputs
+        assert human_readable_to_bytes("abc") == 0
+        assert human_readable_to_bytes("1X") == 0
+        assert human_readable_to_bytes("1KBB") == 0
+        assert human_readable_to_bytes("K1") == 0
+        assert human_readable_to_bytes("") == 0
+        assert human_readable_to_bytes("1.2.3K") == 0
+        assert human_readable_to_bytes("B") == 0
+
+
+    def test_remove_ansi_codes(self):
+        """
+        Test the remove_ansi_codes function.
+        """
+
+        assert remove_ansi_codes("text without codes") == "text without codes"
+        assert remove_ansi_codes("\x1b[31mRed text\x1b[0m") == "Red text"
+        assert remove_ansi_codes("\x1b[1m\x1b[4mBold and Underline\x1b[0m") == "Bold and Underline"
+        assert remove_ansi_codes("Mixed \x1b[32mgreen\x1b[0m and normal") == "Mixed green and normal"
+        assert remove_ansi_codes("") == ""
+
+        # More complex/varied ANSI codes
+        assert remove_ansi_codes("text\x1b[1Aup") == "textup"
+        assert remove_ansi_codes("\x1b[2Jclearscreen") == "clearscreen"
+
+
+def test_string_utils():
+    """
+    Runs all tests in the TestStringUtils class.
+    """
+
+    test_instance = TestStringUtils()
+    test_instance.test_is_float()
+    test_instance.test_human_readable_to_bytes()
+    test_instance.test_remove_ansi_codes()
+
+# from IPython import embed; embed()
+# test_string_utils()