megadetector 5.0.6-py3-none-any.whl → 5.0.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (62)
  1. api/batch_processing/data_preparation/manage_local_batch.py +278 -197
  2. api/batch_processing/data_preparation/manage_video_batch.py +7 -2
  3. api/batch_processing/postprocessing/add_max_conf.py +1 -0
  4. api/batch_processing/postprocessing/compare_batch_results.py +110 -60
  5. api/batch_processing/postprocessing/load_api_results.py +55 -69
  6. api/batch_processing/postprocessing/md_to_labelme.py +1 -0
  7. api/batch_processing/postprocessing/postprocess_batch_results.py +158 -50
  8. api/batch_processing/postprocessing/render_detection_confusion_matrix.py +625 -0
  9. api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +71 -23
  10. api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +1 -1
  11. api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +222 -74
  12. api/batch_processing/postprocessing/subset_json_detector_output.py +132 -5
  13. api/batch_processing/postprocessing/top_folders_to_bottom.py +1 -1
  14. classification/prepare_classification_script.py +191 -191
  15. data_management/coco_to_yolo.py +65 -44
  16. data_management/databases/integrity_check_json_db.py +7 -5
  17. data_management/generate_crops_from_cct.py +1 -1
  18. data_management/importers/animl_results_to_md_results.py +2 -2
  19. data_management/importers/noaa_seals_2019.py +1 -1
  20. data_management/importers/zamba_results_to_md_results.py +2 -2
  21. data_management/labelme_to_coco.py +34 -6
  22. data_management/labelme_to_yolo.py +1 -1
  23. data_management/lila/create_lila_blank_set.py +474 -0
  24. data_management/lila/create_lila_test_set.py +2 -1
  25. data_management/lila/create_links_to_md_results_files.py +1 -1
  26. data_management/lila/download_lila_subset.py +46 -21
  27. data_management/lila/generate_lila_per_image_labels.py +23 -14
  28. data_management/lila/get_lila_annotation_counts.py +16 -10
  29. data_management/lila/lila_common.py +14 -11
  30. data_management/lila/test_lila_metadata_urls.py +116 -0
  31. data_management/resize_coco_dataset.py +12 -10
  32. data_management/yolo_output_to_md_output.py +40 -13
  33. data_management/yolo_to_coco.py +34 -21
  34. detection/process_video.py +36 -14
  35. detection/pytorch_detector.py +1 -1
  36. detection/run_detector.py +73 -18
  37. detection/run_detector_batch.py +104 -24
  38. detection/run_inference_with_yolov5_val.py +127 -26
  39. detection/run_tiled_inference.py +153 -43
  40. detection/video_utils.py +3 -1
  41. md_utils/ct_utils.py +79 -3
  42. md_utils/md_tests.py +253 -15
  43. md_utils/path_utils.py +129 -24
  44. md_utils/process_utils.py +26 -7
  45. md_utils/split_locations_into_train_val.py +215 -0
  46. md_utils/string_utils.py +10 -0
  47. md_utils/url_utils.py +0 -2
  48. md_utils/write_html_image_list.py +1 -0
  49. md_visualization/visualization_utils.py +17 -2
  50. md_visualization/visualize_db.py +8 -0
  51. md_visualization/visualize_detector_output.py +185 -104
  52. {megadetector-5.0.6.dist-info → megadetector-5.0.7.dist-info}/METADATA +2 -2
  53. {megadetector-5.0.6.dist-info → megadetector-5.0.7.dist-info}/RECORD +62 -58
  54. {megadetector-5.0.6.dist-info → megadetector-5.0.7.dist-info}/WHEEL +1 -1
  55. taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +1 -1
  56. taxonomy_mapping/map_new_lila_datasets.py +43 -39
  57. taxonomy_mapping/prepare_lila_taxonomy_release.py +5 -2
  58. taxonomy_mapping/preview_lila_taxonomy.py +27 -27
  59. taxonomy_mapping/species_lookup.py +33 -13
  60. taxonomy_mapping/taxonomy_csv_checker.py +7 -5
  61. {megadetector-5.0.6.dist-info → megadetector-5.0.7.dist-info}/LICENSE +0 -0
  62. {megadetector-5.0.6.dist-info → megadetector-5.0.7.dist-info}/top_level.txt +0 -0
md_utils/process_utils.py CHANGED
@@ -17,14 +17,28 @@ import subprocess
 
 os.environ["PYTHONUNBUFFERED"] = "1"
 
-def execute(cmd):
+def execute(cmd,encoding=None,errors=None,env=None,verbose=False):
     """
     Run [cmd] (a single string) in a shell, yielding each line of output to the caller.
+
+    The "encoding", "errors", and "env" parameters are passed directly to subprocess.Popen().
+
+    "verbose" only impacts output about process management; it is not related to printing
+    output from the child process.
     """
-
+
+    if verbose:
+        if encoding is not None:
+            print('Launching child process with non-default encoding {}'.format(encoding))
+        if errors is not None:
+            print('Launching child process with non-default text error handling {}'.format(errors))
+        if env is not None:
+            print('Launching child process with non-default environment {}'.format(str(env)))
+
     # https://stackoverflow.com/questions/4417546/constantly-print-subprocess-output-while-process-is-running
     popen = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
-                             shell=True, universal_newlines=True)
+                             shell=True, universal_newlines=True, encoding=encoding,
+                             errors=errors, env=env)
     for stdout_line in iter(popen.stdout.readline, ""):
         yield stdout_line
     popen.stdout.close()
@@ -33,22 +47,27 @@ def execute(cmd):
         raise subprocess.CalledProcessError(return_code, cmd)
 
 
-def execute_and_print(cmd,print_output=True):
+def execute_and_print(cmd,print_output=True,encoding=None,errors=None,env=None,verbose=False):
     """
     Run [cmd] (a single string) in a shell, capturing and printing output. Returns
     a dictionary with fields "status" and "output".
+
+    The "encoding", "errors", and "env" parameters are passed directly to subprocess.Popen().
+
+    "verbose" only impacts output about process management; it is not related to printing
+    output from the child process.
     """
 
     to_return = {'status':'unknown','output':''}
-    output=[]
+    output = []
     try:
-        for s in execute(cmd):
+        for s in execute(cmd,encoding=encoding,errors=errors,env=env,verbose=verbose):
             output.append(s)
             if print_output:
                 print(s,end='',flush=True)
         to_return['status'] = 0
     except subprocess.CalledProcessError as cpe:
-        print('execute_and_print caught error: {}'.format(cpe.output))
+        print('execute_and_print caught error: {} ({})'.format(cpe.output,str(cpe)))
         to_return['status'] = cpe.returncode
         to_return['output'] = output
 
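A minimal usage sketch of the new parameters (the command string and the encoding choices here are placeholders, not values suggested by the diff):

from md_utils.process_utils import execute_and_print

# Run a command whose output may not be valid UTF-8; errors='replace'
# substitutes undecodable bytes instead of raising. 'ls -l' is just a
# placeholder command.
result = execute_and_print('ls -l',
                           print_output=True,
                           encoding='utf-8',
                           errors='replace',
                           verbose=True)

# execute_and_print returns {'status': ..., 'output': ...}, where 'output'
# is the list of captured lines
if result['status'] == 0:
    print('Command succeeded with {} lines of output'.format(len(result['output'])))
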
md_utils/split_locations_into_train_val.py ADDED
@@ -0,0 +1,215 @@
+########
+#
+# split_locations_into_train_val.py
+#
+# Split a list of location IDs into training and validation, targeting a specific
+# train/val split for each category, but allowing some categories to be tighter or looser
+# than others. Does nothing particularly clever; it just randomly splits locations into
+# train/val lots of times using the target val fraction, and picks the one that meets the
+# specified constraints and minimizes weighted error, where "error" is defined as the
+# sum of each class's absolute divergence from the target val fraction.
+#
+########
+
+#%% Imports/constants
+
+import random
+import numpy as np
+
+from collections import defaultdict
+from md_utils.ct_utils import sort_dictionary_by_value
+from tqdm import tqdm
+
+
+#%% Main function
+
+def split_locations_into_train_val(location_to_category_counts,
+                                   n_random_seeds=10000,
+                                   target_val_fraction=0.15,
+                                   category_to_max_allowable_error=None,
+                                   category_to_error_weight=None,
+                                   default_max_allowable_error=0.1):
+    """
+    Split a list of location IDs into training and validation, targeting a specific
+    train/val split for each category, but allowing some categories to be tighter or looser
+    than others. Does nothing particularly clever; it just randomly splits locations into
+    train/val lots of times using the target val fraction, and picks the one that meets the
+    specified constraints and minimizes weighted error, where "error" is defined as the
+    sum of each class's absolute divergence from the target val fraction.
+
+    location_to_category_counts should be a dict mapping location IDs to dicts,
+    with each dict mapping a category name to a count. Any categories not present in a
+    particular dict are assumed to have a count of zero for that location.
+
+    If not None, category_to_max_allowable_error should be a dict mapping category names
+    to maximum allowable errors. These are hard constraints, but you can specify a subset
+    of categories. Categories not included here have a maximum error of Inf.
+
+    If not None, category_to_error_weight should be a dict mapping category names to
+    error weights. You can specify a subset of categories. Categories not included here
+    have a weight of 1.0.
+
+    default_max_allowable_error is the maximum allowable error for categories not present in
+    category_to_max_allowable_error. Set to None (or >= 1.0) to disable hard constraints for
+    categories not present in category_to_max_allowable_error.
+
+    returns val_locations,category_to_val_fraction
+    """
+
+    location_ids = list(location_to_category_counts.keys())
+
+    n_val_locations = int(target_val_fraction*len(location_ids))
+
+    if category_to_max_allowable_error is None:
+        category_to_max_allowable_error = {}
+
+    if category_to_error_weight is None:
+        category_to_error_weight = {}
+
+    # Category ID to total count; the total count is used only for printouts
+    category_id_to_count = {}
+    for location_id in location_to_category_counts:
+        for category_id in location_to_category_counts[location_id].keys():
+            if category_id not in category_id_to_count:
+                category_id_to_count[category_id] = 0
+            category_id_to_count[category_id] += \
+                location_to_category_counts[location_id][category_id]
+
+    category_ids = set(category_id_to_count.keys())
+
+    print('Splitting {} categories over {} locations'.format(
+        len(category_ids),len(location_ids)))
+
+    # random_seed = 0
+    def compute_seed_errors(random_seed):
+        """
+        Compute the per-category error for a specific random seed.
+
+        returns weighted_average_error,weighted_category_errors,category_to_val_fraction
+        """
+
+        # Randomly split into train/val
+        random.seed(random_seed)
+        val_locations = random.sample(location_ids,k=n_val_locations)
+        val_locations_set = set(val_locations)
+
+        # For each category, measure the % of images that went into the val set
+        category_to_val_fraction = defaultdict(float)
+
+        for category_id in category_ids:
+            category_val_count = 0
+            category_train_count = 0
+            for location_id in location_to_category_counts:
+                if category_id not in location_to_category_counts[location_id]:
+                    location_category_count = 0
+                else:
+                    location_category_count = location_to_category_counts[location_id][category_id]
+                if location_id in val_locations_set:
+                    category_val_count += location_category_count
+                else:
+                    category_train_count += location_category_count
+            category_val_fraction = category_val_count / (category_val_count + category_train_count)
+            category_to_val_fraction[category_id] = category_val_fraction
+
+        # Absolute deviation from the target val fraction for each category
+        category_errors = {}
+        weighted_category_errors = {}
+
+        # category = next(iter(category_to_val_fraction))
+        for category in category_to_val_fraction:
+
+            category_val_fraction = category_to_val_fraction[category]
+
+            category_error = abs(category_val_fraction-target_val_fraction)
+            category_errors[category] = category_error
+
+            category_weight = 1.0
+            if category in category_to_error_weight:
+                category_weight = category_to_error_weight[category]
+            weighted_category_error = category_error * category_weight
+            weighted_category_errors[category] = weighted_category_error
+
+        weighted_average_error = np.mean(list(weighted_category_errors.values()))
+
+        return weighted_average_error,weighted_category_errors,category_to_val_fraction
+
+    # ...def compute_seed_errors(...)
+
+    # This will only include random seeds that satisfy the hard constraints
+    random_seed_to_weighted_average_error = {}
+
+    # random_seed = 0
+    for random_seed in tqdm(range(0,n_random_seeds)):
+
+        weighted_average_error,weighted_category_errors,category_to_val_fraction = \
+            compute_seed_errors(random_seed)
+
+        seed_satisfies_hard_constraints = True
+
+        for category in category_to_val_fraction:
+            if category in category_to_max_allowable_error:
+                max_allowable_error = category_to_max_allowable_error[category]
+            else:
+                if default_max_allowable_error is None:
+                    continue
+                max_allowable_error = default_max_allowable_error
+            val_fraction = category_to_val_fraction[category]
+            category_error = abs(val_fraction - target_val_fraction)
+            if category_error > max_allowable_error:
+                seed_satisfies_hard_constraints = False
+                break
+
+        if seed_satisfies_hard_constraints:
+            random_seed_to_weighted_average_error[random_seed] = weighted_average_error
+
+    # ...for each random seed
+
+    assert len(random_seed_to_weighted_average_error) > 0, \
+        'No random seed met all the hard constraints'
+
+    print('\n{} of {} random seeds satisfied hard constraints'.format(
+        len(random_seed_to_weighted_average_error),n_random_seeds))
+
+    min_error = None
+    min_error_seed = None
+
+    for random_seed in random_seed_to_weighted_average_error.keys():
+        error_metric = random_seed_to_weighted_average_error[random_seed]
+        if min_error is None or error_metric < min_error:
+            min_error = error_metric
+            min_error_seed = random_seed
+
+    random.seed(min_error_seed)
+    val_locations = random.sample(location_ids,k=n_val_locations)
+    train_locations = []
+    for location_id in location_ids:
+        if location_id not in val_locations:
+            train_locations.append(location_id)
+
+    print('\nVal locations:\n')
+    for loc in val_locations:
+        print('{}'.format(loc))
+    print('')
+
+    weighted_average_error,weighted_category_errors,category_to_val_fraction = \
+        compute_seed_errors(min_error_seed)
+
+    random_seed = min_error_seed
+
+    category_to_val_fraction = sort_dictionary_by_value(category_to_val_fraction,reverse=True)
+    category_to_val_fraction = sort_dictionary_by_value(category_to_val_fraction,
+                                                        sort_values=category_id_to_count,
+                                                        reverse=True)
+
+
+    print('Val fractions by category:\n')
+
+    for category in category_to_val_fraction:
+        print('{} ({}) {:.2f}'.format(
+            category,category_id_to_count[category],
+            category_to_val_fraction[category]))
+
+    return val_locations,category_to_val_fraction
+
+# ...def split_locations_into_train_val(...)
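A usage sketch for the new function; the location IDs, category names, and counts below are invented for illustration:

from md_utils.split_locations_into_train_val import split_locations_into_train_val

# Hypothetical per-location image counts for two categories; categories
# missing from a location's dict are treated as zero
location_to_category_counts = {
    'loc_00': {'deer': 100, 'boar': 10},
    'loc_01': {'deer': 50},
    'loc_02': {'deer': 40, 'boar': 5},
    'loc_03': {'boar': 30},
    'loc_04': {'deer': 60, 'boar': 20},
    'loc_05': {'deer': 25},
    'loc_06': {'deer': 80, 'boar': 15}
}

val_locations, category_to_val_fraction = split_locations_into_train_val(
    location_to_category_counts,
    n_random_seeds=1000,
    target_val_fraction=0.15,
    category_to_max_allowable_error={'deer': 0.05},  # hard constraint for 'deer'
    category_to_error_weight={'boar': 2.0},          # 'boar' errors count double
    default_max_allowable_error=0.2)

Note that the per-category constraints are a hard filter: if no seed satisfies them, the function asserts, so small datasets may need a looser default_max_allowable_error or a larger n_random_seeds.
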
md_utils/string_utils.py CHANGED
@@ -57,3 +57,13 @@ def human_readable_to_bytes(size):
         bytes = 0
 
     return bytes
+
+
+def remove_ansi_codes(s):
+    """
+    Remove ANSI escape codes from a string.
+
+    https://stackoverflow.com/questions/14693701/how-can-i-remove-the-ansi-escape-sequences-from-a-string-in-python#14693789
+    """
+    ansi_escape = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')
+    return ansi_escape.sub('', s)
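A quick sketch of the new helper (this assumes string_utils.py imports re at the top of the file, which is outside this hunk):

from md_utils.string_utils import remove_ansi_codes

# Strip color codes from captured console output, e.g. output captured
# from a child process via process_utils.execute_and_print
colored = '\x1b[31mError:\x1b[0m something went wrong'
print(remove_ansi_codes(colored))  # prints 'Error: something went wrong'
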
md_utils/url_utils.py CHANGED
@@ -140,5 +140,3 @@ def test_urls(urls, error_on_failure=True):
 
     return status_codes
 
-
-
md_utils/write_html_image_list.py CHANGED
@@ -177,6 +177,7 @@ def write_html_image_list(filename=None,images=None,options=None):
         filename = filename.encode('ascii','ignore').decode('ascii')
 
     if options['urlEncodeFilenames']:
+        filename = filename.replace('\\','/')
         filename = urllib.parse.quote(filename)
 
     if len(title) > 0:
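A hedged usage sketch of the behavior this hunk changes; the image paths are hypothetical, and the no-argument call returning a default options dict is the same idiom DbVizOptions uses below:

from md_utils.write_html_image_list import write_html_image_list

options = write_html_image_list()  # no-arg call returns the default options dict
options['urlEncodeFilenames'] = True

# Windows-style paths now get their backslashes normalized to forward
# slashes before URL-encoding
images = ['images\\site_01\\IMG_0001.JPG', 'images\\site_01\\IMG_0002.JPG']
write_html_image_list(filename='preview.html', images=images, options=options)
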
md_visualization/visualization_utils.py CHANGED
@@ -172,12 +172,20 @@ def resize_image(image, target_width, target_height=-1, output_file=None):
     in place. If either width or height is -1, resizes with aspect ratio preservation.
     If both are -1, returns the original image (does not copy in this case).
 
+    None is equivalent to -1 for target_width and target_height.
+
     [image] can be a PIL image or a filename.
     """
 
    if isinstance(image,str):
        image = load_image(image)
 
+    if target_width is None:
+        target_width = -1
+
+    if target_height is None:
+        target_height = -1
+
     # Null operation
     if target_width == -1 and target_height == -1:
         return image
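A minimal sketch of the None-handling change (the filenames are hypothetical):

from md_visualization.visualization_utils import resize_image

# target_height=None now behaves like -1: resize to width 800 and
# preserve the aspect ratio
img = resize_image('example.jpg', target_width=800, target_height=None)
img.save('example_800w.jpg')
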
@@ -371,7 +379,8 @@ def render_detection_bounding_boxes(detections, image,
         The type of the numerical label (default string) needs to be consistent with the keys in
         label_map; no casting is carried out. If this is None, no classification labels are shown.
 
-        confidence_threshold: optional, threshold above which the bounding box is rendered.
+        confidence_threshold: optional, threshold above which boxes are rendered. Can also be a dictionary
+        mapping category IDs to thresholds.
 
         thickness: line thickness in pixels. Default value is 4.
 
@@ -405,9 +414,15 @@
 
         score = detection['conf']
 
+        if isinstance(confidence_threshold,dict):
+            rendering_threshold = confidence_threshold[detection['category']]
+        else:
+            rendering_threshold = confidence_threshold
+
+
         # Always render objects with a confidence of "None"; this is typically used
         # for ground truth data.
-        if score is None or score >= confidence_threshold:
+        if score is None or score >= rendering_threshold:
 
             x1, y1, w_box, h_box = detection['bbox']
             display_boxes.append([y1, x1, y1 + h_box, x1 + w_box])
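A sketch of the new per-category threshold support, using MegaDetector-style category IDs ('1' = animal, '2' = person); the image path and detections are hypothetical. Note that when a dict is passed, the lookup in the hunk above raises a KeyError for any category ID not present in the dict, so it should cover every category that appears in the detections:

from md_visualization import visualization_utils as vis_utils

image = vis_utils.load_image('example.jpg')

# MD-formatted detections: normalized [x, y, width, height] boxes
detections = [
    {'category': '1', 'conf': 0.90, 'bbox': [0.10, 0.10, 0.30, 0.30]},
    {'category': '2', 'conf': 0.25, 'bbox': [0.55, 0.50, 0.20, 0.25]}
]

# Render animals only above 0.8, but people above 0.2
per_category_thresholds = {'1': 0.8, '2': 0.2, '3': 0.5}

vis_utils.render_detection_bounding_boxes(detections, image,
                                          confidence_threshold=per_category_thresholds)
image.save('example_annotated.jpg')
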
md_visualization/visualize_db.py CHANGED
@@ -41,7 +41,15 @@ class DbVizOptions:
     #
     # If viz_size is None or (-1,-1), the original image size is used.
     viz_size = (675, -1)
+
+    # The most relevant option one might want to set here is:
+    #
+    # htmlOptions['maxFiguresPerHtmlFile']
+    #
+    # ...which can be used to paginate previews to a number of images that will load well
+    # in a browser (5000 is a reasonable limit).
     htmlOptions = write_html_image_list()
+
     sort_by_filename = True
     trim_to_images_with_bboxes = False
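To illustrate the pagination option called out in the new comment, a minimal sketch; how the options object is consumed downstream is outside this hunk:

from md_visualization.visualize_db import DbVizOptions

options = DbVizOptions()

# Paginate the HTML preview so each page stays browser-friendly
options.htmlOptions['maxFiguresPerHtmlFile'] = 5000

# Render images at width 675, preserving aspect ratio
options.viz_size = (675, -1)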