megadetector 5.0.28__py3-none-any.whl → 10.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of megadetector might be problematic. Click here for more details.

Files changed (197)
  1. megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +2 -2
  2. megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +1 -1
  3. megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +1 -1
  4. megadetector/classification/aggregate_classifier_probs.py +3 -3
  5. megadetector/classification/analyze_failed_images.py +5 -5
  6. megadetector/classification/cache_batchapi_outputs.py +5 -5
  7. megadetector/classification/create_classification_dataset.py +11 -12
  8. megadetector/classification/crop_detections.py +10 -10
  9. megadetector/classification/csv_to_json.py +8 -8
  10. megadetector/classification/detect_and_crop.py +13 -15
  11. megadetector/classification/efficientnet/model.py +8 -8
  12. megadetector/classification/efficientnet/utils.py +6 -5
  13. megadetector/classification/evaluate_model.py +7 -7
  14. megadetector/classification/identify_mislabeled_candidates.py +6 -6
  15. megadetector/classification/json_to_azcopy_list.py +1 -1
  16. megadetector/classification/json_validator.py +29 -32
  17. megadetector/classification/map_classification_categories.py +9 -9
  18. megadetector/classification/merge_classification_detection_output.py +12 -9
  19. megadetector/classification/prepare_classification_script.py +19 -19
  20. megadetector/classification/prepare_classification_script_mc.py +26 -26
  21. megadetector/classification/run_classifier.py +4 -4
  22. megadetector/classification/save_mislabeled.py +6 -6
  23. megadetector/classification/train_classifier.py +1 -1
  24. megadetector/classification/train_classifier_tf.py +9 -9
  25. megadetector/classification/train_utils.py +10 -10
  26. megadetector/data_management/annotations/annotation_constants.py +1 -2
  27. megadetector/data_management/camtrap_dp_to_coco.py +79 -46
  28. megadetector/data_management/cct_json_utils.py +103 -103
  29. megadetector/data_management/cct_to_md.py +49 -49
  30. megadetector/data_management/cct_to_wi.py +33 -33
  31. megadetector/data_management/coco_to_labelme.py +75 -75
  32. megadetector/data_management/coco_to_yolo.py +210 -193
  33. megadetector/data_management/databases/add_width_and_height_to_db.py +86 -12
  34. megadetector/data_management/databases/combine_coco_camera_traps_files.py +40 -40
  35. megadetector/data_management/databases/integrity_check_json_db.py +228 -200
  36. megadetector/data_management/databases/subset_json_db.py +33 -33
  37. megadetector/data_management/generate_crops_from_cct.py +88 -39
  38. megadetector/data_management/get_image_sizes.py +54 -49
  39. megadetector/data_management/labelme_to_coco.py +133 -125
  40. megadetector/data_management/labelme_to_yolo.py +159 -73
  41. megadetector/data_management/lila/create_lila_blank_set.py +81 -83
  42. megadetector/data_management/lila/create_lila_test_set.py +32 -31
  43. megadetector/data_management/lila/create_links_to_md_results_files.py +18 -18
  44. megadetector/data_management/lila/download_lila_subset.py +21 -24
  45. megadetector/data_management/lila/generate_lila_per_image_labels.py +365 -107
  46. megadetector/data_management/lila/get_lila_annotation_counts.py +35 -33
  47. megadetector/data_management/lila/get_lila_image_counts.py +22 -22
  48. megadetector/data_management/lila/lila_common.py +73 -70
  49. megadetector/data_management/lila/test_lila_metadata_urls.py +28 -19
  50. megadetector/data_management/mewc_to_md.py +344 -340
  51. megadetector/data_management/ocr_tools.py +262 -255
  52. megadetector/data_management/read_exif.py +249 -227
  53. megadetector/data_management/remap_coco_categories.py +90 -28
  54. megadetector/data_management/remove_exif.py +81 -21
  55. megadetector/data_management/rename_images.py +187 -187
  56. megadetector/data_management/resize_coco_dataset.py +588 -120
  57. megadetector/data_management/speciesnet_to_md.py +41 -41
  58. megadetector/data_management/wi_download_csv_to_coco.py +55 -55
  59. megadetector/data_management/yolo_output_to_md_output.py +248 -122
  60. megadetector/data_management/yolo_to_coco.py +333 -191
  61. megadetector/detection/change_detection.py +832 -0
  62. megadetector/detection/process_video.py +340 -337
  63. megadetector/detection/pytorch_detector.py +358 -278
  64. megadetector/detection/run_detector.py +399 -186
  65. megadetector/detection/run_detector_batch.py +404 -377
  66. megadetector/detection/run_inference_with_yolov5_val.py +340 -327
  67. megadetector/detection/run_tiled_inference.py +257 -249
  68. megadetector/detection/tf_detector.py +24 -24
  69. megadetector/detection/video_utils.py +332 -295
  70. megadetector/postprocessing/add_max_conf.py +19 -11
  71. megadetector/postprocessing/categorize_detections_by_size.py +45 -45
  72. megadetector/postprocessing/classification_postprocessing.py +468 -433
  73. megadetector/postprocessing/combine_batch_outputs.py +23 -23
  74. megadetector/postprocessing/compare_batch_results.py +590 -525
  75. megadetector/postprocessing/convert_output_format.py +106 -102
  76. megadetector/postprocessing/create_crop_folder.py +347 -147
  77. megadetector/postprocessing/detector_calibration.py +173 -168
  78. megadetector/postprocessing/generate_csv_report.py +508 -499
  79. megadetector/postprocessing/load_api_results.py +48 -27
  80. megadetector/postprocessing/md_to_coco.py +133 -102
  81. megadetector/postprocessing/md_to_labelme.py +107 -90
  82. megadetector/postprocessing/md_to_wi.py +40 -40
  83. megadetector/postprocessing/merge_detections.py +92 -114
  84. megadetector/postprocessing/postprocess_batch_results.py +319 -301
  85. megadetector/postprocessing/remap_detection_categories.py +91 -38
  86. megadetector/postprocessing/render_detection_confusion_matrix.py +214 -205
  87. megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +57 -57
  88. megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +27 -28
  89. megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +704 -679
  90. megadetector/postprocessing/separate_detections_into_folders.py +226 -211
  91. megadetector/postprocessing/subset_json_detector_output.py +265 -262
  92. megadetector/postprocessing/top_folders_to_bottom.py +45 -45
  93. megadetector/postprocessing/validate_batch_results.py +70 -70
  94. megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +52 -52
  95. megadetector/taxonomy_mapping/map_new_lila_datasets.py +18 -19
  96. megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +54 -33
  97. megadetector/taxonomy_mapping/preview_lila_taxonomy.py +67 -67
  98. megadetector/taxonomy_mapping/retrieve_sample_image.py +16 -16
  99. megadetector/taxonomy_mapping/simple_image_download.py +8 -8
  100. megadetector/taxonomy_mapping/species_lookup.py +156 -74
  101. megadetector/taxonomy_mapping/taxonomy_csv_checker.py +14 -14
  102. megadetector/taxonomy_mapping/taxonomy_graph.py +10 -10
  103. megadetector/taxonomy_mapping/validate_lila_category_mappings.py +13 -13
  104. megadetector/utils/ct_utils.py +1049 -211
  105. megadetector/utils/directory_listing.py +21 -77
  106. megadetector/utils/gpu_test.py +22 -22
  107. megadetector/utils/md_tests.py +632 -529
  108. megadetector/utils/path_utils.py +1520 -431
  109. megadetector/utils/process_utils.py +41 -41
  110. megadetector/utils/split_locations_into_train_val.py +62 -62
  111. megadetector/utils/string_utils.py +148 -27
  112. megadetector/utils/url_utils.py +489 -176
  113. megadetector/utils/wi_utils.py +2658 -2526
  114. megadetector/utils/write_html_image_list.py +137 -137
  115. megadetector/visualization/plot_utils.py +34 -30
  116. megadetector/visualization/render_images_with_thumbnails.py +39 -74
  117. megadetector/visualization/visualization_utils.py +487 -435
  118. megadetector/visualization/visualize_db.py +232 -198
  119. megadetector/visualization/visualize_detector_output.py +82 -76
  120. {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/METADATA +5 -2
  121. megadetector-10.0.0.dist-info/RECORD +139 -0
  122. {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/WHEEL +1 -1
  123. megadetector/api/batch_processing/api_core/__init__.py +0 -0
  124. megadetector/api/batch_processing/api_core/batch_service/__init__.py +0 -0
  125. megadetector/api/batch_processing/api_core/batch_service/score.py +0 -439
  126. megadetector/api/batch_processing/api_core/server.py +0 -294
  127. megadetector/api/batch_processing/api_core/server_api_config.py +0 -97
  128. megadetector/api/batch_processing/api_core/server_app_config.py +0 -55
  129. megadetector/api/batch_processing/api_core/server_batch_job_manager.py +0 -220
  130. megadetector/api/batch_processing/api_core/server_job_status_table.py +0 -149
  131. megadetector/api/batch_processing/api_core/server_orchestration.py +0 -360
  132. megadetector/api/batch_processing/api_core/server_utils.py +0 -88
  133. megadetector/api/batch_processing/api_core_support/__init__.py +0 -0
  134. megadetector/api/batch_processing/api_core_support/aggregate_results_manually.py +0 -46
  135. megadetector/api/batch_processing/api_support/__init__.py +0 -0
  136. megadetector/api/batch_processing/api_support/summarize_daily_activity.py +0 -152
  137. megadetector/api/batch_processing/data_preparation/__init__.py +0 -0
  138. megadetector/api/synchronous/__init__.py +0 -0
  139. megadetector/api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
  140. megadetector/api/synchronous/api_core/animal_detection_api/api_backend.py +0 -151
  141. megadetector/api/synchronous/api_core/animal_detection_api/api_frontend.py +0 -263
  142. megadetector/api/synchronous/api_core/animal_detection_api/config.py +0 -35
  143. megadetector/api/synchronous/api_core/tests/__init__.py +0 -0
  144. megadetector/api/synchronous/api_core/tests/load_test.py +0 -110
  145. megadetector/data_management/importers/add_nacti_sizes.py +0 -52
  146. megadetector/data_management/importers/add_timestamps_to_icct.py +0 -79
  147. megadetector/data_management/importers/animl_results_to_md_results.py +0 -158
  148. megadetector/data_management/importers/auckland_doc_test_to_json.py +0 -373
  149. megadetector/data_management/importers/auckland_doc_to_json.py +0 -201
  150. megadetector/data_management/importers/awc_to_json.py +0 -191
  151. megadetector/data_management/importers/bellevue_to_json.py +0 -272
  152. megadetector/data_management/importers/cacophony-thermal-importer.py +0 -793
  153. megadetector/data_management/importers/carrizo_shrubfree_2018.py +0 -269
  154. megadetector/data_management/importers/carrizo_trail_cam_2017.py +0 -289
  155. megadetector/data_management/importers/cct_field_adjustments.py +0 -58
  156. megadetector/data_management/importers/channel_islands_to_cct.py +0 -913
  157. megadetector/data_management/importers/eMammal/copy_and_unzip_emammal.py +0 -180
  158. megadetector/data_management/importers/eMammal/eMammal_helpers.py +0 -249
  159. megadetector/data_management/importers/eMammal/make_eMammal_json.py +0 -223
  160. megadetector/data_management/importers/ena24_to_json.py +0 -276
  161. megadetector/data_management/importers/filenames_to_json.py +0 -386
  162. megadetector/data_management/importers/helena_to_cct.py +0 -283
  163. megadetector/data_management/importers/idaho-camera-traps.py +0 -1407
  164. megadetector/data_management/importers/idfg_iwildcam_lila_prep.py +0 -294
  165. megadetector/data_management/importers/import_desert_lion_conservation_camera_traps.py +0 -387
  166. megadetector/data_management/importers/jb_csv_to_json.py +0 -150
  167. megadetector/data_management/importers/mcgill_to_json.py +0 -250
  168. megadetector/data_management/importers/missouri_to_json.py +0 -490
  169. megadetector/data_management/importers/nacti_fieldname_adjustments.py +0 -79
  170. megadetector/data_management/importers/noaa_seals_2019.py +0 -181
  171. megadetector/data_management/importers/osu-small-animals-to-json.py +0 -364
  172. megadetector/data_management/importers/pc_to_json.py +0 -365
  173. megadetector/data_management/importers/plot_wni_giraffes.py +0 -123
  174. megadetector/data_management/importers/prepare_zsl_imerit.py +0 -131
  175. megadetector/data_management/importers/raic_csv_to_md_results.py +0 -416
  176. megadetector/data_management/importers/rspb_to_json.py +0 -356
  177. megadetector/data_management/importers/save_the_elephants_survey_A.py +0 -320
  178. megadetector/data_management/importers/save_the_elephants_survey_B.py +0 -329
  179. megadetector/data_management/importers/snapshot_safari_importer.py +0 -758
  180. megadetector/data_management/importers/snapshot_serengeti_lila.py +0 -1067
  181. megadetector/data_management/importers/snapshotserengeti/make_full_SS_json.py +0 -150
  182. megadetector/data_management/importers/snapshotserengeti/make_per_season_SS_json.py +0 -153
  183. megadetector/data_management/importers/sulross_get_exif.py +0 -65
  184. megadetector/data_management/importers/timelapse_csv_set_to_json.py +0 -490
  185. megadetector/data_management/importers/ubc_to_json.py +0 -399
  186. megadetector/data_management/importers/umn_to_json.py +0 -507
  187. megadetector/data_management/importers/wellington_to_json.py +0 -263
  188. megadetector/data_management/importers/wi_to_json.py +0 -442
  189. megadetector/data_management/importers/zamba_results_to_md_results.py +0 -180
  190. megadetector/data_management/lila/add_locations_to_island_camera_traps.py +0 -101
  191. megadetector/data_management/lila/add_locations_to_nacti.py +0 -151
  192. megadetector/utils/azure_utils.py +0 -178
  193. megadetector/utils/sas_blob_utils.py +0 -509
  194. megadetector-5.0.28.dist-info/RECORD +0 -209
  195. /megadetector/{api/batch_processing/__init__.py → __init__.py} +0 -0
  196. {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/licenses/LICENSE +0 -0
  197. {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/top_level.txt +0 -0
@@ -18,33 +18,33 @@ import subprocess
18
18
  def execute(cmd,encoding=None,errors=None,env=None,verbose=False):
19
19
  """
20
20
  Run [cmd] (a single string) in a shell, yielding each line of output to the caller.
21
-
21
+
22
22
  The "encoding", "errors", and "env" parameters are passed directly to subprocess.Popen().
23
-
23
+
24
24
  "verbose" only impacts output about process management, it is not related to printing
25
25
  output from the child process.
26
-
26
+
27
27
  Args:
28
28
  cmd (str): command to run
29
29
  encoding (str, optional): stdout encoding, see Popen() documentation
30
30
  errors (str, optional): error handling, see Popen() documentation
31
31
  env (dict, optional): environment variables, see Popen() documentation
32
32
  verbose (bool, optional): enable additional debug console output
33
-
33
+
34
34
  Returns:
35
- int: the command's return code, always zero, otherwise a CalledProcessError is raised
35
+ int: the command's return code, always zero, otherwise a CalledProcessError is raised
36
36
  """
37
-
37
+
38
38
  os.environ["PYTHONUNBUFFERED"] = "1"
39
-
40
- if verbose:
39
+
40
+ if verbose:
41
41
  if encoding is not None:
42
42
  print('Launching child process with non-default encoding {}'.format(encoding))
43
43
  if errors is not None:
44
44
  print('Launching child process with non-default text error handling {}'.format(errors))
45
45
  if env is not None:
46
46
  print('Launching child process with non-default environment {}'.format(str(env)))
47
-
47
+
48
48
  # https://stackoverflow.com/questions/4417546/constantly-print-subprocess-output-while-process-is-running
49
49
  popen = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
50
50
  shell=True, universal_newlines=True, encoding=encoding,
@@ -55,7 +55,7 @@ def execute(cmd,encoding=None,errors=None,env=None,verbose=False):
55
55
  return_code = popen.wait()
56
56
  if return_code:
57
57
  raise subprocess.CalledProcessError(return_code, cmd)
58
-
58
+
59
59
  return return_code
60
60
 
61
61
 
@@ -70,15 +70,15 @@ def execute_and_print(cmd,
70
70
  """
71
71
  Run [cmd] (a single string) in a shell, capturing and printing output. Returns
72
72
  a dictionary with fields "status" and "output".
73
-
73
+
74
74
  The "encoding", "errors", and "env" parameters are passed directly to subprocess.Popen().
75
-
75
+
76
76
  "verbose" only impacts output about process management, it is not related to printing
77
77
  output from the child process.
78
-
78
+
79
79
  Args:
80
80
  cmd (str): command to run
81
- print_output (bool, optional): whether to print output from [cmd] (stdout is
81
+ print_output (bool, optional): whether to print output from [cmd] (stdout is
82
82
  captured regardless of the value of print_output)
83
83
  encoding (str, optional): stdout encoding, see Popen() documentation
84
84
  errors (str, optional): error handling, see Popen() documentation
@@ -86,15 +86,15 @@ def execute_and_print(cmd,
86
86
  verbose (bool, optional): enable additional debug console output
87
87
  catch_exceptions (bool, optional): catch exceptions and include in the output, otherwise raise
88
88
  echo_command (bool, optional): print the command before executing
89
-
89
+
90
90
  Returns:
91
91
  dict: a dictionary with fields "status" (the process return code) and "output"
92
- (the content of stdout)
92
+ (the content of stdout)
93
93
  """
94
94
 
95
95
  if echo_command:
96
96
  print('Running command:\n{}\n'.format(cmd))
97
-
97
+
98
98
  to_return = {'status':'unknown','output':''}
99
99
  output = []
100
100
  try:
@@ -109,64 +109,64 @@ def execute_and_print(cmd,
109
109
  print('execute_and_print caught error: {} ({})'.format(cpe.output,str(cpe)))
110
110
  to_return['status'] = cpe.returncode
111
111
  to_return['output'] = output
112
-
112
+
113
113
  return to_return
114
114
 
115
115
 
116
116
  #%% Single-threaded test driver for execute_and_print
117
117
 
118
118
  if False:
119
-
119
+
120
120
  pass
121
121
 
122
122
  #%%
123
-
123
+
124
124
  if os.name == 'nt':
125
- execute_and_print('echo hello && ping -n 5 127.0.0.1 && echo goodbye')
125
+ execute_and_print('echo hello && ping -n 5 127.0.0.1 && echo goodbye')
126
126
  else:
127
- execute_and_print('echo hello && sleep 1 && echo goodbye')
128
-
127
+ execute_and_print('echo hello && sleep 1 && echo goodbye')
128
+
129
129
 
130
130
  #%% Parallel test driver for execute_and_print
131
131
 
132
132
  if False:
133
-
133
+
134
134
  pass
135
135
 
136
136
  #%%
137
-
137
+
138
138
  from functools import partial
139
139
  from multiprocessing.pool import ThreadPool as ThreadPool
140
140
  from multiprocessing.pool import Pool as Pool
141
-
141
+
142
142
  n_workers = 10
143
-
143
+
144
144
  # Should we use threads (vs. processes) for parallelization?
145
145
  use_threads = True
146
-
146
+
147
147
  test_data = ['a','b','c','d']
148
-
149
- def process_sample(s):
148
+
149
+ def _process_sample(s):
150
150
  return execute_and_print('echo ' + s,True)
151
-
152
- if n_workers == 1:
153
-
151
+
152
+ if n_workers == 1:
153
+
154
154
  results = []
155
- for i_sample,sample in enumerate(test_data):
156
- results.append(process_sample(sample))
157
-
155
+ for i_sample,sample in enumerate(test_data):
156
+ results.append(_process_sample(sample))
157
+
158
158
  else:
159
-
159
+
160
160
  n_threads = min(n_workers,len(test_data))
161
-
161
+
162
162
  if use_threads:
163
163
  print('Starting parallel thread pool with {} workers'.format(n_threads))
164
164
  pool = ThreadPool(n_threads)
165
165
  else:
166
166
  print('Starting parallel process pool with {} workers'.format(n_threads))
167
167
  pool = Pool(n_threads)
168
-
169
- results = list(pool.map(partial(process_sample),test_data))
170
-
168
+
169
+ results = list(pool.map(partial(_process_sample),test_data))
170
+
171
171
  for r in results:
172
172
  print(r)
@@ -4,8 +4,8 @@ split_locations_into_train_val.py
4
4
 
5
5
  Splits a list of location IDs into training and validation, targeting a specific
6
6
  train/val split for each category, but allowing some categories to be tighter or looser
7
- than others. Does nothing particularly clever, just randomly splits locations into
8
- train/val lots of times using the target val fraction, and picks the one that meets the
7
+ than others. Does nothing particularly clever, just randomly splits locations into
8
+ train/val lots of times using the target val fraction, and picks the one that meets the
9
9
  specified constraints and minimizes weighted error, where "error" is defined as the
10
10
  sum of each class's absolute divergence from the target val fraction.
11
11
 
@@ -26,63 +26,63 @@ from tqdm import tqdm
26
26
  def split_locations_into_train_val(location_to_category_counts,
27
27
  n_random_seeds=10000,
28
28
  target_val_fraction=0.15,
29
- category_to_max_allowable_error=None,
29
+ category_to_max_allowable_error=None,
30
30
  category_to_error_weight=None,
31
31
  default_max_allowable_error=0.1,
32
32
  require_complete_coverage=True):
33
33
  """
34
34
  Splits a list of location IDs into training and validation, targeting a specific
35
35
  train/val split for each category, but allowing some categories to be tighter or looser
36
- than others. Does nothing particularly clever, just randomly splits locations into
37
- train/val lots of times using the target val fraction, and picks the one that meets the
36
+ than others. Does nothing particularly clever, just randomly splits locations into
37
+ train/val lots of times using the target val fraction, and picks the one that meets the
38
38
  specified constraints and minimizes weighted error, where "error" is defined as the
39
- sum of each class's absolute divergence from the target val fraction.
40
-
39
+ sum of each class's absolute divergence from the target val fraction.
40
+
41
41
  Args:
42
42
  location_to_category_counts (dict): a dict mapping location IDs to dicts,
43
- with each dict mapping a category name to a count. Any categories not present
43
+ with each dict mapping a category name to a count. Any categories not present
44
44
  in a particular dict are assumed to have a count of zero for that location.
45
-
45
+
46
46
  For example:
47
-
47
+
48
48
  .. code-block:: none
49
49
 
50
50
  {'location-000': {'bear':4,'wolf':10},
51
51
  'location-001': {'bear':12,'elk':20}}
52
-
52
+
53
53
  n_random_seeds (int, optional): number of random seeds to try, always starting from zero
54
54
  target_val_fraction (float, optional): fraction of images containing each species we'd
55
55
  like to put in the val split
56
56
  category_to_max_allowable_error (dict, optional): a dict mapping category names
57
57
  to maximum allowable errors. These are hard constraints (i.e., we will error
58
- if we can't meet them). Does not need to include all categories; categories not
58
+ if we can't meet them). Does not need to include all categories; categories not
59
59
  included will be assigned a maximum error according to [default_max_allowable_error].
60
60
  If this is None, no hard constraints are applied.
61
61
  category_to_error_weight (dict, optional): a dict mapping category names to
62
62
  error weights. You can specify a subset of categories; categories not included here
63
63
  have a weight of 1.0. If None, all categories have the same weight.
64
- default_max_allowable_error (float, optional): the maximum allowable error for categories not
65
- present in [category_to_max_allowable_error]. Set to None (or >= 1.0) to disable hard
64
+ default_max_allowable_error (float, optional): the maximum allowable error for categories not
65
+ present in [category_to_max_allowable_error]. Set to None (or >= 1.0) to disable hard
66
66
  constraints for categories not present in [category_to_max_allowable_error]
67
- require_complete_coverage (bool, optional): require that every category appear in both train and
68
- val
69
-
67
+ require_complete_coverage (bool, optional): require that every category appear in both train
68
+ and val
69
+
70
70
  Returns:
71
71
  tuple: A two-element tuple:
72
72
  - list of location IDs in the val split
73
- - a dict mapping category names to the fraction of images in the val split
73
+ - a dict mapping category names to the fraction of images in the val split
74
74
  """
75
-
75
+
76
76
  location_ids = list(location_to_category_counts.keys())
77
-
77
+
78
78
  n_val_locations = int(target_val_fraction*len(location_ids))
79
-
79
+
80
80
  if category_to_max_allowable_error is None:
81
81
  category_to_max_allowable_error = {}
82
-
82
+
83
83
  if category_to_error_weight is None:
84
84
  category_to_error_weight = {}
85
-
85
+
86
86
  # category ID to total count; the total count is used only for printouts
87
87
  category_id_to_count = {}
88
88
  for location_id in location_to_category_counts:
@@ -91,28 +91,28 @@ def split_locations_into_train_val(location_to_category_counts,
91
91
  category_id_to_count[category_id] = 0
92
92
  category_id_to_count[category_id] += \
93
93
  location_to_category_counts[location_id][category_id]
94
-
94
+
95
95
  category_ids = set(category_id_to_count.keys())
96
-
96
+
97
97
  print('Splitting {} categories over {} locations'.format(
98
98
  len(category_ids),len(location_ids)))
99
-
99
+
100
100
  # random_seed = 0
101
101
  def compute_seed_errors(random_seed):
102
102
  """
103
103
  Computes the per-category error for a specific random seed.
104
-
104
+
105
105
  returns weighted_average_error,category_to_val_fraction
106
106
  """
107
-
107
+
108
108
  # Randomly split into train/val
109
109
  random.seed(random_seed)
110
110
  val_locations = random.sample(location_ids,k=n_val_locations)
111
111
  val_locations_set = set(val_locations)
112
-
112
+
113
113
  # For each category, measure the % of images that went into the val set
114
114
  category_to_val_fraction = defaultdict(float)
115
-
115
+
116
116
  for category_id in category_ids:
117
117
  category_val_count = 0
118
118
  category_train_count = 0
@@ -127,44 +127,44 @@ def split_locations_into_train_val(location_to_category_counts,
127
127
  category_train_count += location_category_count
128
128
  category_val_fraction = category_val_count / (category_val_count + category_train_count)
129
129
  category_to_val_fraction[category_id] = category_val_fraction
130
-
130
+
131
131
  # Absolute deviation from the target val fraction for each category
132
132
  category_errors = {}
133
133
  weighted_category_errors = {}
134
-
134
+
135
135
  # category = next(iter(category_to_val_fraction))
136
136
  for category in category_to_val_fraction:
137
-
137
+
138
138
  category_val_fraction = category_to_val_fraction[category]
139
-
139
+
140
140
  category_error = abs(category_val_fraction-target_val_fraction)
141
141
  category_errors[category] = category_error
142
-
142
+
143
143
  category_weight = 1.0
144
144
  if category in category_to_error_weight:
145
145
  category_weight = category_to_error_weight[category]
146
146
  weighted_category_error = category_error * category_weight
147
147
  weighted_category_errors[category] = weighted_category_error
148
-
148
+
149
149
  weighted_average_error = np.mean(list(weighted_category_errors.values()))
150
-
150
+
151
151
  return weighted_average_error,weighted_category_errors,category_to_val_fraction
152
-
152
+
153
153
  # ... def compute_seed_errors(...)
154
-
154
+
155
155
  # This will only include random seeds that satisfy the hard constraints
156
156
  random_seed_to_weighted_average_error = {}
157
-
157
+
158
158
  # random_seed = 0
159
159
  for random_seed in tqdm(range(0,n_random_seeds)):
160
-
160
+
161
161
  weighted_average_error,weighted_category_errors,category_to_val_fraction = \
162
162
  compute_seed_errors(random_seed)
163
-
163
+
164
164
  seed_satisfies_hard_constraints = True
165
-
165
+
166
166
  for category in category_to_val_fraction:
167
- if category in category_to_max_allowable_error:
167
+ if category in category_to_max_allowable_error:
168
168
  max_allowable_error = category_to_max_allowable_error[category]
169
169
  else:
170
170
  if default_max_allowable_error is None:
@@ -183,59 +183,59 @@ def split_locations_into_train_val(location_to_category_counts,
183
183
  if category_error > max_allowable_error:
184
184
  seed_satisfies_hard_constraints = False
185
185
  break
186
-
186
+
187
187
  # ...for each category
188
-
189
- if seed_satisfies_hard_constraints:
188
+
189
+ if seed_satisfies_hard_constraints:
190
190
  random_seed_to_weighted_average_error[random_seed] = weighted_average_error
191
-
191
+
192
192
  # ...for each random seed
193
-
193
+
194
194
  assert len(random_seed_to_weighted_average_error) > 0, \
195
195
  'No random seed met all the hard constraints'
196
-
196
+
197
197
  print('\n{} of {} random seeds satisfied hard constraints'.format(
198
198
  len(random_seed_to_weighted_average_error),n_random_seeds))
199
-
199
+
200
200
  min_error = None
201
201
  min_error_seed = None
202
-
202
+
203
203
  for random_seed in random_seed_to_weighted_average_error.keys():
204
204
  error_metric = random_seed_to_weighted_average_error[random_seed]
205
205
  if min_error is None or error_metric < min_error:
206
206
  min_error = error_metric
207
207
  min_error_seed = random_seed
208
-
208
+
209
209
  random.seed(min_error_seed)
210
210
  val_locations = random.sample(location_ids,k=n_val_locations)
211
211
  train_locations = []
212
212
  for location_id in location_ids:
213
213
  if location_id not in val_locations:
214
214
  train_locations.append(location_id)
215
-
216
- print('\nVal locations:\n')
215
+
216
+ print('\nVal locations:\n')
217
217
  for loc in val_locations:
218
218
  print('{}'.format(loc))
219
219
  print('')
220
-
220
+
221
221
  weighted_average_error,weighted_category_errors,category_to_val_fraction = \
222
222
  compute_seed_errors(min_error_seed)
223
-
223
+
224
224
  random_seed = min_error_seed
225
-
225
+
226
226
  category_to_val_fraction = sort_dictionary_by_value(category_to_val_fraction,reverse=True)
227
227
  category_to_val_fraction = sort_dictionary_by_value(category_to_val_fraction,
228
228
  sort_values=category_id_to_count,
229
229
  reverse=True)
230
-
231
-
230
+
231
+
232
232
  print('Val fractions by category:\n')
233
-
233
+
234
234
  for category in category_to_val_fraction:
235
235
  print('{} ({}) {:.2f}'.format(
236
236
  category,category_id_to_count[category],
237
237
  category_to_val_fraction[category]))
238
-
238
+
239
239
  return val_locations,category_to_val_fraction
240
240
 
241
241
  # ...def split_locations_into_train_val(...)
@@ -14,16 +14,19 @@ import re
14
14
  #%% Functions
15
15
 
16
16
  def is_float(s):
17
- """
17
+ """
18
18
  Checks whether [s] is an object (typically a string) that can be cast to a float
19
-
19
+
20
20
  Args:
21
21
  s (object): object to evaluate
22
-
22
+
23
23
  Returns:
24
24
  bool: True if s successfully casts to a float, otherwise False
25
25
  """
26
-
26
+
27
+ if s is None:
28
+ return False
29
+
27
30
  try:
28
31
  _ = float(s)
29
32
  except ValueError:
@@ -36,57 +39,175 @@ def human_readable_to_bytes(size):
36
39
  Given a human-readable byte string (e.g. 2G, 10GB, 30MB, 20KB),
37
40
  returns the number of bytes. Will return 0 if the argument has
38
41
  unexpected form.
39
-
42
+
40
43
  https://gist.github.com/beugley/ccd69945346759eb6142272a6d69b4e0
41
-
44
+
42
45
  Args:
43
46
  size (str): string representing a size
44
-
47
+
45
48
  Returns:
46
49
  int: the corresponding size in bytes
47
50
  """
48
-
51
+
49
52
  size = re.sub(r'\s+', '', size)
50
-
53
+
54
+ if not size: # Handle empty string case after stripping spaces
55
+ return 0
56
+
51
57
  if (size[-1] == 'B'):
52
58
  size = size[:-1]
53
-
59
+
60
+ if not size: # Handle case where size was just "B"
61
+ return 0
62
+
54
63
  if (size.isdigit()):
55
- bytes = int(size)
64
+ bytes_val = int(size) # Renamed to avoid conflict with built-in 'bytes'
56
65
  elif (is_float(size)):
57
- bytes = float(size)
66
+ bytes_val = float(size) # Renamed
58
67
  else:
59
- bytes = size[:-1]
60
- unit = size[-1]
61
- try:
62
- bytes = float(bytes)
68
+ # Handle cases like "1KB" where size[:-1] might be "1K" before this block
69
+ # The original code would try to float("1K") which fails.
70
+ # Need to separate numeric part from unit more carefully.
71
+ numeric_part = ''
72
+ unit_part = ''
73
+
74
+ # Iterate from the end to find the unit (K, M, G, T)
75
+ # This handles cases like "10KB" or "2.5GB"
76
+ for i in range(len(size) -1, -1, -1):
77
+ if size[i].isalpha():
78
+ unit_part = size[i] + unit_part
79
+ else:
80
+ numeric_part = size[:i+1]
81
+ break
82
+
83
+ # If no unit found, or numeric part is empty after stripping unit
84
+ if not unit_part or not numeric_part:
85
+ return 0
86
+
87
+ try:
88
+ bytes_val = float(numeric_part)
89
+ unit = unit_part
63
90
  if (unit == 'T'):
64
- bytes *= 1024*1024*1024*1024
91
+ bytes_val *= 1024*1024*1024*1024
65
92
  elif (unit == 'G'):
66
- bytes *= 1024*1024*1024
93
+ bytes_val *= 1024*1024*1024
67
94
  elif (unit == 'M'):
68
- bytes *= 1024*1024
95
+ bytes_val *= 1024*1024
69
96
  elif (unit == 'K'):
70
- bytes *= 1024
97
+ bytes_val *= 1024
71
98
  else:
72
- bytes = 0
99
+ # If it's a known unit (like 'B' already stripped) but not T/G/M/K,
100
+ # and it was floatable, it's just bytes. If it's an unknown unit, it's
101
+ # an error.
102
+ if unit not in ['B', '']: # 'B' was stripped, '' means just a number
103
+ bytes_val = 0
73
104
  except ValueError:
74
- bytes = 0
75
-
76
- return bytes
105
+ bytes_val = 0
106
+
107
+ return bytes_val
77
108
 
78
109
 
79
110
  def remove_ansi_codes(s):
80
111
  """
81
112
  Removes ANSI escape codes from a string.
82
-
113
+
83
114
  https://stackoverflow.com/questions/14693701/how-can-i-remove-the-ansi-escape-sequences-from-a-string-in-python#14693789
84
-
115
+
85
116
  Args:
86
117
  s (str): the string to de-ANSI-i-fy
87
-
118
+
88
119
  Returns:
89
120
  str: A copy of [s] without ANSI codes
90
121
  """
122
+
91
123
  ansi_escape = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')
92
124
  return ansi_escape.sub('', s)
125
+
126
+
127
+ #%% Tests
128
+
129
+
130
+ class TestStringUtils:
131
+ """
132
+ Tests for string_utils.py
133
+ """
134
+
135
+
136
+ def test_is_float(self):
137
+ """
138
+ Test the is_float function.
139
+ """
140
+
141
+ assert is_float("1.23")
142
+ assert is_float("-0.5")
143
+ assert is_float("0")
144
+ assert is_float(1.23)
145
+ assert is_float(0)
146
+ assert not is_float("abc")
147
+ assert not is_float("1.2.3")
148
+ assert not is_float("")
149
+ assert not is_float(None)
150
+ assert not is_float("1,23")
151
+
152
+
153
+ def test_human_readable_to_bytes(self):
154
+ """
155
+ Test the human_readable_to_bytes function.
156
+ """
157
+
158
+ assert human_readable_to_bytes("10B") == 10
159
+ assert human_readable_to_bytes("10") == 10
160
+ assert human_readable_to_bytes("1K") == 1024
161
+ assert human_readable_to_bytes("1KB") == 1024
162
+ assert human_readable_to_bytes("1M") == 1024*1024
163
+ assert human_readable_to_bytes("1MB") == 1024*1024
164
+ assert human_readable_to_bytes("1G") == 1024*1024*1024
165
+ assert human_readable_to_bytes("1GB") == 1024*1024*1024
166
+ assert human_readable_to_bytes("1T") == 1024*1024*1024*1024
167
+ assert human_readable_to_bytes("1TB") == 1024*1024*1024*1024
168
+
169
+ assert human_readable_to_bytes("2.5K") == 2.5 * 1024
170
+ assert human_readable_to_bytes("0.5MB") == 0.5 * 1024 * 1024
171
+
172
+ # Test with spaces
173
+ assert human_readable_to_bytes(" 2 G ") == 2 * 1024*1024*1024
174
+ assert human_readable_to_bytes("500 KB") == 500 * 1024
175
+
176
+ # Invalid inputs
177
+ assert human_readable_to_bytes("abc") == 0
178
+ assert human_readable_to_bytes("1X") == 0
179
+ assert human_readable_to_bytes("1KBB") == 0
180
+ assert human_readable_to_bytes("K1") == 0
181
+ assert human_readable_to_bytes("") == 0
182
+ assert human_readable_to_bytes("1.2.3K") == 0
183
+ assert human_readable_to_bytes("B") == 0
184
+
185
+
186
+ def test_remove_ansi_codes(self):
187
+ """
188
+ Test the remove_ansi_codes function.
189
+ """
190
+
191
+ assert remove_ansi_codes("text without codes") == "text without codes"
192
+ assert remove_ansi_codes("\x1b[31mRed text\x1b[0m") == "Red text"
193
+ assert remove_ansi_codes("\x1b[1m\x1b[4mBold and Underline\x1b[0m") == "Bold and Underline"
194
+ assert remove_ansi_codes("Mixed \x1b[32mgreen\x1b[0m and normal") == "Mixed green and normal"
195
+ assert remove_ansi_codes("") == ""
196
+
197
+ # More complex/varied ANSI codes
198
+ assert remove_ansi_codes("text\x1b[1Aup") == "textup"
199
+ assert remove_ansi_codes("\x1b[2Jclearscreen") == "clearscreen"
200
+
201
+
202
+ def test_string_utils():
203
+ """
204
+ Runs all tests in the TestStringUtils class.
205
+ """
206
+
207
+ test_instance = TestStringUtils()
208
+ test_instance.test_is_float()
209
+ test_instance.test_human_readable_to_bytes()
210
+ test_instance.test_remove_ansi_codes()
211
+
212
+ # from IPython import embed; embed()
213
+ # test_string_utils()