megadetector 5.0.8__py3-none-any.whl → 5.0.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of megadetector might be problematic. Click here for more details.

Files changed (190) hide show
  1. api/__init__.py +0 -0
  2. api/batch_processing/__init__.py +0 -0
  3. api/batch_processing/api_core/__init__.py +0 -0
  4. api/batch_processing/api_core/batch_service/__init__.py +0 -0
  5. api/batch_processing/api_core/batch_service/score.py +0 -1
  6. api/batch_processing/api_core/server_job_status_table.py +0 -1
  7. api/batch_processing/api_core_support/__init__.py +0 -0
  8. api/batch_processing/api_core_support/aggregate_results_manually.py +0 -1
  9. api/batch_processing/api_support/__init__.py +0 -0
  10. api/batch_processing/api_support/summarize_daily_activity.py +0 -1
  11. api/batch_processing/data_preparation/__init__.py +0 -0
  12. api/batch_processing/data_preparation/manage_local_batch.py +65 -65
  13. api/batch_processing/data_preparation/manage_video_batch.py +8 -8
  14. api/batch_processing/integration/digiKam/xmp_integration.py +0 -1
  15. api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +0 -1
  16. api/batch_processing/postprocessing/__init__.py +0 -0
  17. api/batch_processing/postprocessing/add_max_conf.py +12 -12
  18. api/batch_processing/postprocessing/categorize_detections_by_size.py +32 -14
  19. api/batch_processing/postprocessing/combine_api_outputs.py +68 -54
  20. api/batch_processing/postprocessing/compare_batch_results.py +113 -43
  21. api/batch_processing/postprocessing/convert_output_format.py +41 -16
  22. api/batch_processing/postprocessing/load_api_results.py +16 -17
  23. api/batch_processing/postprocessing/md_to_coco.py +31 -21
  24. api/batch_processing/postprocessing/md_to_labelme.py +52 -22
  25. api/batch_processing/postprocessing/merge_detections.py +14 -14
  26. api/batch_processing/postprocessing/postprocess_batch_results.py +246 -174
  27. api/batch_processing/postprocessing/remap_detection_categories.py +32 -25
  28. api/batch_processing/postprocessing/render_detection_confusion_matrix.py +60 -27
  29. api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +53 -44
  30. api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +25 -14
  31. api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +242 -158
  32. api/batch_processing/postprocessing/separate_detections_into_folders.py +159 -114
  33. api/batch_processing/postprocessing/subset_json_detector_output.py +146 -169
  34. api/batch_processing/postprocessing/top_folders_to_bottom.py +77 -43
  35. api/synchronous/__init__.py +0 -0
  36. api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
  37. api/synchronous/api_core/animal_detection_api/api_backend.py +0 -2
  38. api/synchronous/api_core/animal_detection_api/api_frontend.py +266 -268
  39. api/synchronous/api_core/animal_detection_api/config.py +35 -35
  40. api/synchronous/api_core/tests/__init__.py +0 -0
  41. api/synchronous/api_core/tests/load_test.py +109 -109
  42. classification/__init__.py +0 -0
  43. classification/aggregate_classifier_probs.py +21 -24
  44. classification/analyze_failed_images.py +11 -13
  45. classification/cache_batchapi_outputs.py +51 -51
  46. classification/create_classification_dataset.py +69 -68
  47. classification/crop_detections.py +54 -53
  48. classification/csv_to_json.py +97 -100
  49. classification/detect_and_crop.py +105 -105
  50. classification/evaluate_model.py +43 -42
  51. classification/identify_mislabeled_candidates.py +47 -46
  52. classification/json_to_azcopy_list.py +10 -10
  53. classification/json_validator.py +72 -71
  54. classification/map_classification_categories.py +44 -43
  55. classification/merge_classification_detection_output.py +68 -68
  56. classification/prepare_classification_script.py +157 -154
  57. classification/prepare_classification_script_mc.py +228 -228
  58. classification/run_classifier.py +27 -26
  59. classification/save_mislabeled.py +30 -30
  60. classification/train_classifier.py +20 -20
  61. classification/train_classifier_tf.py +21 -22
  62. classification/train_utils.py +10 -10
  63. data_management/__init__.py +0 -0
  64. data_management/annotations/__init__.py +0 -0
  65. data_management/annotations/annotation_constants.py +18 -31
  66. data_management/camtrap_dp_to_coco.py +238 -0
  67. data_management/cct_json_utils.py +102 -59
  68. data_management/cct_to_md.py +176 -158
  69. data_management/cct_to_wi.py +247 -219
  70. data_management/coco_to_labelme.py +272 -263
  71. data_management/coco_to_yolo.py +79 -58
  72. data_management/databases/__init__.py +0 -0
  73. data_management/databases/add_width_and_height_to_db.py +20 -16
  74. data_management/databases/combine_coco_camera_traps_files.py +35 -31
  75. data_management/databases/integrity_check_json_db.py +62 -24
  76. data_management/databases/subset_json_db.py +24 -15
  77. data_management/generate_crops_from_cct.py +27 -45
  78. data_management/get_image_sizes.py +188 -162
  79. data_management/importers/add_nacti_sizes.py +8 -8
  80. data_management/importers/add_timestamps_to_icct.py +78 -78
  81. data_management/importers/animl_results_to_md_results.py +158 -158
  82. data_management/importers/auckland_doc_test_to_json.py +9 -9
  83. data_management/importers/auckland_doc_to_json.py +8 -8
  84. data_management/importers/awc_to_json.py +7 -7
  85. data_management/importers/bellevue_to_json.py +15 -15
  86. data_management/importers/cacophony-thermal-importer.py +13 -13
  87. data_management/importers/carrizo_shrubfree_2018.py +8 -8
  88. data_management/importers/carrizo_trail_cam_2017.py +8 -8
  89. data_management/importers/cct_field_adjustments.py +9 -9
  90. data_management/importers/channel_islands_to_cct.py +10 -10
  91. data_management/importers/eMammal/copy_and_unzip_emammal.py +1 -0
  92. data_management/importers/ena24_to_json.py +7 -7
  93. data_management/importers/filenames_to_json.py +8 -8
  94. data_management/importers/helena_to_cct.py +7 -7
  95. data_management/importers/idaho-camera-traps.py +7 -7
  96. data_management/importers/idfg_iwildcam_lila_prep.py +10 -10
  97. data_management/importers/jb_csv_to_json.py +9 -9
  98. data_management/importers/mcgill_to_json.py +8 -8
  99. data_management/importers/missouri_to_json.py +18 -18
  100. data_management/importers/nacti_fieldname_adjustments.py +10 -10
  101. data_management/importers/noaa_seals_2019.py +7 -7
  102. data_management/importers/pc_to_json.py +7 -7
  103. data_management/importers/plot_wni_giraffes.py +7 -7
  104. data_management/importers/prepare-noaa-fish-data-for-lila.py +359 -359
  105. data_management/importers/prepare_zsl_imerit.py +7 -7
  106. data_management/importers/rspb_to_json.py +8 -8
  107. data_management/importers/save_the_elephants_survey_A.py +8 -8
  108. data_management/importers/save_the_elephants_survey_B.py +9 -9
  109. data_management/importers/snapshot_safari_importer.py +26 -26
  110. data_management/importers/snapshot_safari_importer_reprise.py +665 -665
  111. data_management/importers/snapshot_serengeti_lila.py +14 -14
  112. data_management/importers/sulross_get_exif.py +8 -9
  113. data_management/importers/timelapse_csv_set_to_json.py +11 -11
  114. data_management/importers/ubc_to_json.py +13 -13
  115. data_management/importers/umn_to_json.py +7 -7
  116. data_management/importers/wellington_to_json.py +8 -8
  117. data_management/importers/wi_to_json.py +9 -9
  118. data_management/importers/zamba_results_to_md_results.py +181 -181
  119. data_management/labelme_to_coco.py +65 -24
  120. data_management/labelme_to_yolo.py +8 -8
  121. data_management/lila/__init__.py +0 -0
  122. data_management/lila/add_locations_to_island_camera_traps.py +9 -9
  123. data_management/lila/add_locations_to_nacti.py +147 -147
  124. data_management/lila/create_lila_blank_set.py +13 -13
  125. data_management/lila/create_lila_test_set.py +8 -8
  126. data_management/lila/create_links_to_md_results_files.py +106 -106
  127. data_management/lila/download_lila_subset.py +44 -110
  128. data_management/lila/generate_lila_per_image_labels.py +55 -42
  129. data_management/lila/get_lila_annotation_counts.py +18 -15
  130. data_management/lila/get_lila_image_counts.py +11 -11
  131. data_management/lila/lila_common.py +96 -33
  132. data_management/lila/test_lila_metadata_urls.py +132 -116
  133. data_management/ocr_tools.py +173 -128
  134. data_management/read_exif.py +110 -97
  135. data_management/remap_coco_categories.py +83 -83
  136. data_management/remove_exif.py +58 -62
  137. data_management/resize_coco_dataset.py +30 -23
  138. data_management/wi_download_csv_to_coco.py +246 -239
  139. data_management/yolo_output_to_md_output.py +86 -73
  140. data_management/yolo_to_coco.py +300 -60
  141. detection/__init__.py +0 -0
  142. detection/detector_training/__init__.py +0 -0
  143. detection/process_video.py +85 -33
  144. detection/pytorch_detector.py +43 -25
  145. detection/run_detector.py +157 -72
  146. detection/run_detector_batch.py +179 -113
  147. detection/run_inference_with_yolov5_val.py +108 -48
  148. detection/run_tiled_inference.py +111 -40
  149. detection/tf_detector.py +51 -29
  150. detection/video_utils.py +606 -521
  151. docs/source/conf.py +43 -0
  152. md_utils/__init__.py +0 -0
  153. md_utils/azure_utils.py +9 -9
  154. md_utils/ct_utils.py +228 -68
  155. md_utils/directory_listing.py +59 -64
  156. md_utils/md_tests.py +968 -871
  157. md_utils/path_utils.py +460 -134
  158. md_utils/process_utils.py +157 -133
  159. md_utils/sas_blob_utils.py +20 -20
  160. md_utils/split_locations_into_train_val.py +45 -32
  161. md_utils/string_utils.py +33 -10
  162. md_utils/url_utils.py +176 -60
  163. md_utils/write_html_image_list.py +40 -33
  164. md_visualization/__init__.py +0 -0
  165. md_visualization/plot_utils.py +102 -109
  166. md_visualization/render_images_with_thumbnails.py +34 -34
  167. md_visualization/visualization_utils.py +597 -291
  168. md_visualization/visualize_db.py +76 -48
  169. md_visualization/visualize_detector_output.py +61 -42
  170. {megadetector-5.0.8.dist-info → megadetector-5.0.10.dist-info}/METADATA +13 -7
  171. megadetector-5.0.10.dist-info/RECORD +224 -0
  172. {megadetector-5.0.8.dist-info → megadetector-5.0.10.dist-info}/top_level.txt +1 -0
  173. taxonomy_mapping/__init__.py +0 -0
  174. taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +342 -335
  175. taxonomy_mapping/map_new_lila_datasets.py +154 -154
  176. taxonomy_mapping/prepare_lila_taxonomy_release.py +142 -134
  177. taxonomy_mapping/preview_lila_taxonomy.py +591 -591
  178. taxonomy_mapping/retrieve_sample_image.py +12 -12
  179. taxonomy_mapping/simple_image_download.py +11 -11
  180. taxonomy_mapping/species_lookup.py +10 -10
  181. taxonomy_mapping/taxonomy_csv_checker.py +18 -18
  182. taxonomy_mapping/taxonomy_graph.py +47 -47
  183. taxonomy_mapping/validate_lila_category_mappings.py +83 -76
  184. data_management/cct_json_to_filename_json.py +0 -89
  185. data_management/cct_to_csv.py +0 -140
  186. data_management/databases/remove_corrupted_images_from_db.py +0 -191
  187. detection/detector_training/copy_checkpoints.py +0 -43
  188. megadetector-5.0.8.dist-info/RECORD +0 -205
  189. {megadetector-5.0.8.dist-info → megadetector-5.0.10.dist-info}/LICENSE +0 -0
  190. {megadetector-5.0.8.dist-info → megadetector-5.0.10.dist-info}/WHEEL +0 -0
md_utils/process_utils.py CHANGED
@@ -1,133 +1,157 @@
1
- ########
2
- #
3
- # process_utils.py
4
- #
5
- # Run something at the command line and capture the output, based on:
6
- #
7
- # https://stackoverflow.com/questions/4417546/constantly-print-subprocess-output-while-process-is-running
8
- #
9
- # Includes handy example code for doing this on multiple processes/threads.
10
- #
11
- ########
12
-
13
- #%% Constants, imports, and environment
14
-
15
- import os
16
- import subprocess
17
-
18
- os.environ["PYTHONUNBUFFERED"] = "1"
19
-
20
- def execute(cmd,encoding=None,errors=None,env=None,verbose=False):
21
- """
22
- Run [cmd] (a single string) in a shell, yielding each line of output to the caller.
23
-
24
- The "encoding", "errors", and "env" parameters are passed directly to subprocess.Popen().
25
-
26
- "verbose" only impacts output about process management, it is not related to printing
27
- output from the child process.
28
- """
29
-
30
- if verbose:
31
- if encoding is not None:
32
- print('Launching child process with non-default encoding {}'.format(encoding))
33
- if errors is not None:
34
- print('Launching child process with non-default text error handling {}'.format(errors))
35
- if env is not None:
36
- print('Launching child process with non-default environment {}'.format(str(env)))
37
-
38
- # https://stackoverflow.com/questions/4417546/constantly-print-subprocess-output-while-process-is-running
39
- popen = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
40
- shell=True, universal_newlines=True, encoding=encoding,
41
- errors=errors, env=env)
42
- for stdout_line in iter(popen.stdout.readline, ""):
43
- yield stdout_line
44
- popen.stdout.close()
45
- return_code = popen.wait()
46
- if return_code:
47
- raise subprocess.CalledProcessError(return_code, cmd)
48
-
49
-
50
- def execute_and_print(cmd,print_output=True,encoding=None,errors=None,env=None,verbose=False):
51
- """
52
- Run [cmd] (a single string) in a shell, capturing and printing output. Returns
53
- a dictionary with fields "status" and "output".
54
-
55
- The "encoding", "errors", and "env" parameters are passed directly to subprocess.Popen().
56
-
57
- "verbose" only impacts output about process management, it is not related to printing
58
- output from the child process.
59
- """
60
-
61
- to_return = {'status':'unknown','output':''}
62
- output = []
63
- try:
64
- for s in execute(cmd,encoding=encoding,errors=errors,env=env,verbose=verbose):
65
- output.append(s)
66
- if print_output:
67
- print(s,end='',flush=True)
68
- to_return['status'] = 0
69
- except subprocess.CalledProcessError as cpe:
70
- print('execute_and_print caught error: {} ({})'.format(cpe.output,str(cpe)))
71
- to_return['status'] = cpe.returncode
72
- to_return['output'] = output
73
-
74
- return to_return
75
-
76
-
77
- #%% Single-threaded test driver for execute_and_print
78
-
79
- if False:
80
-
81
- pass
82
-
83
- #%%
84
-
85
- if os.name == 'nt':
86
- execute_and_print('echo hello && ping -n 5 127.0.0.1 && echo goodbye')
87
- else:
88
- execute_and_print('echo hello && sleep 1 && echo goodbye')
89
-
90
-
91
- #%% Parallel test driver for execute_and_print
92
-
93
- if False:
94
-
95
- pass
96
-
97
- #%%
98
-
99
- from functools import partial
100
- from multiprocessing.pool import ThreadPool as ThreadPool
101
- from multiprocessing.pool import Pool as Pool
102
-
103
- n_workers = 10
104
-
105
- # Should we use threads (vs. processes) for parallelization?
106
- use_threads = True
107
-
108
- test_data = ['a','b','c','d']
109
-
110
- def process_sample(s):
111
- return execute_and_print('echo ' + s,True)
112
-
113
- if n_workers == 1:
114
-
115
- results = []
116
- for i_sample,sample in enumerate(test_data):
117
- results.append(process_sample(sample))
118
-
119
- else:
120
-
121
- n_threads = min(n_workers,len(test_data))
122
-
123
- if use_threads:
124
- print('Starting parallel thread pool with {} workers'.format(n_threads))
125
- pool = ThreadPool(n_threads)
126
- else:
127
- print('Starting parallel process pool with {} workers'.format(n_threads))
128
- pool = Pool(n_threads)
129
-
130
- results = list(pool.map(partial(process_sample),test_data))
131
-
132
- for r in results:
133
- print(r)
1
+ """
2
+
3
+ process_utils.py
4
+
5
+ Run something at the command line and capture the output, based on:
6
+
7
+ https://stackoverflow.com/questions/4417546/constantly-print-subprocess-output-while-process-is-running
8
+
9
+ Includes handy example code for doing this on multiple processes/threads.
10
+
11
+ """
12
+
13
+ #%% Constants, imports, and environment
14
+
15
+ import os
16
+ import subprocess
17
+
18
+ os.environ["PYTHONUNBUFFERED"] = "1"
19
+
20
def execute(cmd,encoding=None,errors=None,env=None,verbose=False):
    """
    Runs [cmd] (a single string) in a shell, yielding each line of output to the caller.

    This is a generator: the caller consumes output line by line while the child
    process runs.  If the child process exits with a non-zero return code, a
    CalledProcessError is raised after the last line of output has been yielded.

    The "encoding", "errors", and "env" parameters are passed directly to subprocess.Popen().

    "verbose" only impacts output about process management, it is not related to printing
    output from the child process.

    Args:
        cmd (str): command to run
        encoding (str, optional): stdout encoding, see Popen() documentation
        errors (str, optional): error handling, see Popen() documentation
        env (dict, optional): environment variables, see Popen() documentation
        verbose (bool, optional): enable additional debug console output

    Yields:
        str: one line of output from the child process at a time

    Raises:
        subprocess.CalledProcessError: if the child process exits with a non-zero
            return code
    """

    if verbose:
        if encoding is not None:
            print('Launching child process with non-default encoding {}'.format(encoding))
        if errors is not None:
            print('Launching child process with non-default text error handling {}'.format(errors))
        if env is not None:
            print('Launching child process with non-default environment {}'.format(str(env)))

    # https://stackoverflow.com/questions/4417546/constantly-print-subprocess-output-while-process-is-running
    #
    # stderr is merged into stdout so the caller sees a single interleaved stream.
    popen = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                             shell=True, universal_newlines=True, encoding=encoding,
                             errors=errors, env=env)
    for stdout_line in iter(popen.stdout.readline, ""):
        yield stdout_line
    popen.stdout.close()
    return_code = popen.wait()
    if return_code:
        raise subprocess.CalledProcessError(return_code, cmd)

    # Only reachable on success (return_code == 0).  Because this function is a
    # generator, this value is delivered as the StopIteration value, not as a
    # conventional return value; it exists mostly for symmetry/debugging.
    return return_code
60
+
61
+
62
def execute_and_print(cmd,print_output=True,encoding=None,errors=None,env=None,verbose=False):
    """
    Runs [cmd] (a single string) in a shell, capturing and optionally printing output.

    The "encoding", "errors", and "env" parameters are passed directly to subprocess.Popen().

    "verbose" only impacts output about process management, it is not related to printing
    output from the child process.

    Args:
        cmd (str): command to run
        print_output (bool, optional): whether to print output from [cmd]
        encoding (str, optional): stdout encoding, see Popen() documentation
        errors (str, optional): error handling, see Popen() documentation
        env (dict, optional): environment variables, see Popen() documentation
        verbose (bool, optional): enable additional debug console output

    Returns:
        dict: a dictionary with fields "status" (the process return code: 0 on
        success, the child's non-zero return code on failure) and "output"
        (a list of output lines, including trailing newlines, in the order they
        were emitted; note this is a list, not a single string)
    """

    to_return = {'status':'unknown','output':''}
    output = []
    try:
        for s in execute(cmd,encoding=encoding,errors=errors,env=env,verbose=verbose):
            output.append(s)
            if print_output:
                print(s,end='',flush=True)
        to_return['status'] = 0
    except subprocess.CalledProcessError as cpe:
        # NOTE(review): cpe.output appears to always be None here, since execute()
        # raises CalledProcessError without attaching output; the lines captured
        # before the failure are still returned via to_return['output'].
        print('execute_and_print caught error: {} ({})'.format(cpe.output,str(cpe)))
        to_return['status'] = cpe.returncode
    to_return['output'] = output

    return to_return
99
+
100
+
101
#%% Single-threaded test driver for execute_and_print

if False:

    pass

    #%%

    # Run a command that prints, pauses, then prints again, so we can confirm
    # that output streams to the console while the child process is running.
    if os.name == 'nt':
        execute_and_print('echo hello && ping -n 5 127.0.0.1 && echo goodbye')
    else:
        execute_and_print('echo hello && sleep 1 && echo goodbye')
113
+
114
+
115
#%% Parallel test driver for execute_and_print

if False:

    pass

    #%%

    from functools import partial
    from multiprocessing.pool import ThreadPool as ThreadPool
    from multiprocessing.pool import Pool as Pool

    n_workers = 10

    # Should we use threads (vs. processes) for parallelization?
    use_threads = True

    test_data = ['a','b','c','d']

    # Each sample just echoes itself, which is enough to exercise the
    # capture/print path in execute_and_print
    def process_sample(s):
        return execute_and_print('echo ' + s,True)

    if n_workers == 1:

        # Serial case: no pool required
        results = [process_sample(sample) for sample in test_data]

    else:

        # Never allocate more workers than we have samples
        n_threads = min(n_workers,len(test_data))

        if use_threads:
            print('Starting parallel thread pool with {} workers'.format(n_threads))
            pool = ThreadPool(n_threads)
        else:
            print('Starting parallel process pool with {} workers'.format(n_threads))
            pool = Pool(n_threads)

        results = list(pool.map(partial(process_sample),test_data))

    for r in results:
        print(r)
@@ -1,23 +1,23 @@
1
- ########
2
- #
3
- # sas_blob_utils.py
4
- #
5
- # This module contains helper functions for dealing with Shared Access Signatures
6
- # (SAS) tokens for Azure Blob Storage.
7
- #
8
- # The default Azure Storage SAS URI format is:
9
- #
10
- # https://<account>.blob.core.windows.net/<container>/<blob>?<sas_token>
11
- #
12
- # This module assumes azure-storage-blob version 12.5.
13
- #
14
- # Documentation for Azure Blob Storage:
15
- # docs.microsoft.com/en-us/azure/developer/python/sdk/storage/storage-blob-readme
16
- #
17
- # Documentation for SAS:
18
- # docs.microsoft.com/en-us/azure/storage/common/storage-sas-overview
19
- #
20
- ########
1
+ """
2
+
3
+ sas_blob_utils.py
4
+
5
+ This module contains helper functions for dealing with Shared Access Signatures
6
+ (SAS) tokens for Azure Blob Storage.
7
+
8
+ The default Azure Storage SAS URI format is:
9
+
10
+ https://<account>.blob.core.windows.net/<container>/<blob>?<sas_token>
11
+
12
+ This module assumes azure-storage-blob version 12.5.
13
+
14
+ Documentation for Azure Blob Storage:
15
+ docs.microsoft.com/en-us/azure/developer/python/sdk/storage/storage-blob-readme
16
+
17
+ Documentation for SAS:
18
+ docs.microsoft.com/en-us/azure/storage/common/storage-sas-overview
19
+
20
+ """
21
21
 
22
22
  #%% Imports
23
23
 
@@ -1,15 +1,15 @@
1
- ########
2
- #
3
- # split_locations_into_train_val.py
4
- #
5
- # Split a list of location IDs into training and validation, targeting a specific
6
- # train/val split for each category, but allowing some categories to be tighter or looser
7
- # than others. Does nothing particularly clever, just randomly splits locations into
8
- # train/val lots of times using the target val fraction, and picks the one that meets the
9
- # specified constraints and minimizes weighted error, where "error" is defined as the
10
- # sum of each class's absolute divergence from the target val fraction.
11
- #
12
- ########
1
+ """
2
+
3
+ split_locations_into_train_val.py
4
+
5
+ Splits a list of location IDs into training and validation, targeting a specific
6
+ train/val split for each category, but allowing some categories to be tighter or looser
7
+ than others. Does nothing particularly clever, just randomly splits locations into
8
+ train/val lots of times using the target val fraction, and picks the one that meets the
9
+ specified constraints and minimizes weighted error, where "error" is defined as the
10
+ sum of each class's absolute divergence from the target val fraction.
11
+
12
+ """
13
13
 
14
14
  #%% Imports/constants
15
15
 
@@ -30,31 +30,44 @@ def split_locations_into_train_val(location_to_category_counts,
30
30
  category_to_error_weight=None,
31
31
  default_max_allowable_error=0.1):
32
32
  """
33
- Split a list of location IDs into training and validation, targeting a specific
33
+ Splits a list of location IDs into training and validation, targeting a specific
34
34
  train/val split for each category, but allowing some categories to be tighter or looser
35
35
  than others. Does nothing particularly clever, just randomly splits locations into
36
36
  train/val lots of times using the target val fraction, and picks the one that meets the
37
37
  specified constraints and minimizes weighted error, where "error" is defined as the
38
38
  sum of each class's absolute divergence from the target val fraction.
39
39
 
40
- location_to_category_counts should be a dict mapping location IDs to dicts,
41
- with each dict mapping a category name to a count. Any categories not present in a
42
- particular dict are assumed to have a count of zero for that location.
43
-
44
- If not None, category_to_max_allowable_error should be a dict mapping category names
45
- to maximum allowable errors. These are hard constraints, but you can specify a subset
46
- of categories. Categories not included here have a maximum error of Inf.
47
-
48
- If not None, category_to_error_weight should be a dict mapping category names to
49
- error weights. You can specify a subset of categories. Categories not included here
50
- have a weight of 1.0.
51
-
52
- default_max_allowable_error is the maximum allowable error for categories not present in
53
- category_to_max_allowable_error. Set to None (or >= 1.0) to disable hard constraints for
54
- categories not present in category_to_max_allowable_error
55
-
56
- returns val_locations,category_to_val_fraction
57
-
40
+ Args:
41
+ location_to_category_counts (dict): a dict mapping location IDs to dicts,
42
+ with each dict mapping a category name to a count. Any categories not present
43
+ in a particular dict are assumed to have a count of zero for that location.
44
+
45
+ For example:
46
+
47
+ .. code-block:: none
48
+
49
+ {'location-000': {'bear':4,'wolf':10},
50
+ 'location-001': {'bear':12,'elk':20}}
51
+
52
+ n_random_seeds (int, optional): number of random seeds to try, always starting from zero
53
+ target_val_fraction (float, optional): fraction of images containing each species we'd
54
+ like to put in the val split
55
+ category_to_max_allowable_error (dict, optional): a dict mapping category names
56
+ to maximum allowable errors. These are hard constraints (i.e., we will error
57
+ if we can't meet them). Does not need to include all categories; categories not
58
+ included will be assigned a maximum error according to [default_max_allowable_error].
59
+ If this is None, no hard constraints are applied.
60
+ category_to_error_weight (dict, optional): a dict mapping category names to
61
+ error weights. You can specify a subset of categories; categories not included here
62
+ have a weight of 1.0. If None, all categories have the same weight.
63
+ default_max_allowable_error (float, optional): the maximum allowable error for categories not
64
+ present in [category_to_max_allowable_error]. Set to None (or >= 1.0) to disable hard
65
+ constraints for categories not present in [category_to_max_allowable_error]
66
+
67
+ Returns:
68
+ tuple: A two-element tuple:
69
+ - list of location IDs in the val split
70
+ - a dict mapping category names to the fraction of images in the val split
58
71
  """
59
72
 
60
73
  location_ids = list(location_to_category_counts.keys())
@@ -84,7 +97,7 @@ def split_locations_into_train_val(location_to_category_counts,
84
97
  # random_seed = 0
85
98
  def compute_seed_errors(random_seed):
86
99
  """
87
- Compute the per-category error for a specific random seed.
100
+ Computes the per-category error for a specific random seed.
88
101
 
89
102
  returns weighted_average_error,category_to_val_fraction
90
103
  """
md_utils/string_utils.py CHANGED
@@ -1,16 +1,27 @@
1
- ########
2
- #
3
- # string_utils.py
4
- #
5
- # Miscellaneous string utilities
6
- #
7
- ########
1
+ """
2
+
3
+ string_utils.py
4
+
5
+ Miscellaneous string utilities.
6
+
7
+ """
8
+
9
+ #%% Imports
8
10
 
9
11
  import re
10
12
 
13
+
14
+ #%% Functions
15
+
11
16
  def is_float(s):
12
17
  """
13
- Checks whether a string represents a valid float
18
+ Checks whether [s] is an object (typically a string) that can be cast to a float
19
+
20
+ Args:
21
+ s (object): object to evaluate
22
+
23
+ Returns:
24
+ bool: True if s successfully casts to a float, otherwise False
14
25
  """
15
26
 
16
27
  try:
@@ -23,10 +34,16 @@ def is_float(s):
23
34
  def human_readable_to_bytes(size):
24
35
  """
25
36
  Given a human-readable byte string (e.g. 2G, 10GB, 30MB, 20KB),
26
- return the number of bytes. Will return 0 if the argument has
37
+ returns the number of bytes. Will return 0 if the argument has
27
38
  unexpected form.
28
39
 
29
40
  https://gist.github.com/beugley/ccd69945346759eb6142272a6d69b4e0
41
+
42
+ Args:
43
+ size (str): string representing a size
44
+
45
+ Returns:
46
+ int: the corresponding size in bytes
30
47
  """
31
48
 
32
49
  size = re.sub(r'\s+', '', size)
@@ -61,9 +78,15 @@ def human_readable_to_bytes(size):
61
78
 
62
79
  def remove_ansi_codes(s):
63
80
  """
64
- Remove ANSI escape codes from a string.
81
+ Removes ANSI escape codes from a string.
65
82
 
66
83
  https://stackoverflow.com/questions/14693701/how-can-i-remove-the-ansi-escape-sequences-from-a-string-in-python#14693789
84
+
85
+ Args:
86
+ s (str): the string to de-ANSI-i-fy
87
+
88
+ Returns:
89
+ str: A copy of [s] without ANSI codes
67
90
  """
68
91
  ansi_escape = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')
69
92
  return ansi_escape.sub('', s)