megadetector 5.0.6__py3-none-any.whl → 5.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of megadetector might be problematic.

Files changed (75)
  1. api/batch_processing/data_preparation/manage_local_batch.py +297 -202
  2. api/batch_processing/data_preparation/manage_video_batch.py +7 -2
  3. api/batch_processing/postprocessing/add_max_conf.py +1 -0
  4. api/batch_processing/postprocessing/combine_api_outputs.py +2 -2
  5. api/batch_processing/postprocessing/compare_batch_results.py +111 -61
  6. api/batch_processing/postprocessing/convert_output_format.py +24 -6
  7. api/batch_processing/postprocessing/load_api_results.py +56 -72
  8. api/batch_processing/postprocessing/md_to_labelme.py +119 -51
  9. api/batch_processing/postprocessing/merge_detections.py +30 -5
  10. api/batch_processing/postprocessing/postprocess_batch_results.py +175 -55
  11. api/batch_processing/postprocessing/remap_detection_categories.py +163 -0
  12. api/batch_processing/postprocessing/render_detection_confusion_matrix.py +628 -0
  13. api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +71 -23
  14. api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +1 -1
  15. api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +224 -76
  16. api/batch_processing/postprocessing/subset_json_detector_output.py +132 -5
  17. api/batch_processing/postprocessing/top_folders_to_bottom.py +1 -1
  18. classification/prepare_classification_script.py +191 -191
  19. data_management/cct_json_utils.py +7 -2
  20. data_management/coco_to_labelme.py +263 -0
  21. data_management/coco_to_yolo.py +72 -48
  22. data_management/databases/integrity_check_json_db.py +75 -64
  23. data_management/databases/subset_json_db.py +1 -1
  24. data_management/generate_crops_from_cct.py +1 -1
  25. data_management/get_image_sizes.py +44 -26
  26. data_management/importers/animl_results_to_md_results.py +3 -5
  27. data_management/importers/noaa_seals_2019.py +2 -2
  28. data_management/importers/zamba_results_to_md_results.py +2 -2
  29. data_management/labelme_to_coco.py +264 -127
  30. data_management/labelme_to_yolo.py +96 -53
  31. data_management/lila/create_lila_blank_set.py +557 -0
  32. data_management/lila/create_lila_test_set.py +2 -1
  33. data_management/lila/create_links_to_md_results_files.py +1 -1
  34. data_management/lila/download_lila_subset.py +138 -45
  35. data_management/lila/generate_lila_per_image_labels.py +23 -14
  36. data_management/lila/get_lila_annotation_counts.py +16 -10
  37. data_management/lila/lila_common.py +15 -42
  38. data_management/lila/test_lila_metadata_urls.py +116 -0
  39. data_management/read_exif.py +65 -16
  40. data_management/remap_coco_categories.py +84 -0
  41. data_management/resize_coco_dataset.py +14 -31
  42. data_management/wi_download_csv_to_coco.py +239 -0
  43. data_management/yolo_output_to_md_output.py +40 -13
  44. data_management/yolo_to_coco.py +313 -100
  45. detection/process_video.py +36 -14
  46. detection/pytorch_detector.py +1 -1
  47. detection/run_detector.py +73 -18
  48. detection/run_detector_batch.py +116 -27
  49. detection/run_inference_with_yolov5_val.py +135 -27
  50. detection/run_tiled_inference.py +153 -43
  51. detection/tf_detector.py +2 -1
  52. detection/video_utils.py +4 -2
  53. md_utils/ct_utils.py +101 -6
  54. md_utils/md_tests.py +264 -17
  55. md_utils/path_utils.py +326 -47
  56. md_utils/process_utils.py +26 -7
  57. md_utils/split_locations_into_train_val.py +215 -0
  58. md_utils/string_utils.py +10 -0
  59. md_utils/url_utils.py +66 -3
  60. md_utils/write_html_image_list.py +12 -2
  61. md_visualization/visualization_utils.py +380 -74
  62. md_visualization/visualize_db.py +41 -10
  63. md_visualization/visualize_detector_output.py +185 -104
  64. {megadetector-5.0.6.dist-info → megadetector-5.0.8.dist-info}/METADATA +11 -13
  65. {megadetector-5.0.6.dist-info → megadetector-5.0.8.dist-info}/RECORD +74 -67
  66. {megadetector-5.0.6.dist-info → megadetector-5.0.8.dist-info}/WHEEL +1 -1
  67. taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +1 -1
  68. taxonomy_mapping/map_new_lila_datasets.py +43 -39
  69. taxonomy_mapping/prepare_lila_taxonomy_release.py +5 -2
  70. taxonomy_mapping/preview_lila_taxonomy.py +27 -27
  71. taxonomy_mapping/species_lookup.py +33 -13
  72. taxonomy_mapping/taxonomy_csv_checker.py +7 -5
  73. md_visualization/visualize_megadb.py +0 -183
  74. {megadetector-5.0.6.dist-info → megadetector-5.0.8.dist-info}/LICENSE +0 -0
  75. {megadetector-5.0.6.dist-info → megadetector-5.0.8.dist-info}/top_level.txt +0 -0
md_utils/split_locations_into_train_val.py ADDED
@@ -0,0 +1,215 @@
+########
+#
+# split_locations_into_train_val.py
+#
+# Split a list of location IDs into training and validation, targeting a specific
+# train/val split for each category, but allowing some categories to be tighter or looser
+# than others. Does nothing particularly clever, just randomly splits locations into
+# train/val lots of times using the target val fraction, and picks the one that meets the
+# specified constraints and minimizes weighted error, where "error" is defined as the
+# sum of each class's absolute divergence from the target val fraction.
+#
+########
+
+#%% Imports/constants
+
+import random
+import numpy as np
+
+from collections import defaultdict
+from md_utils.ct_utils import sort_dictionary_by_value
+from tqdm import tqdm
+
+
+#%% Main function
+
+def split_locations_into_train_val(location_to_category_counts,
+                                   n_random_seeds=10000,
+                                   target_val_fraction=0.15,
+                                   category_to_max_allowable_error=None,
+                                   category_to_error_weight=None,
+                                   default_max_allowable_error=0.1):
+    """
+    Split a list of location IDs into training and validation, targeting a specific
+    train/val split for each category, but allowing some categories to be tighter or looser
+    than others. Does nothing particularly clever, just randomly splits locations into
+    train/val lots of times using the target val fraction, and picks the one that meets the
+    specified constraints and minimizes weighted error, where "error" is defined as the
+    sum of each class's absolute divergence from the target val fraction.
+
+    location_to_category_counts should be a dict mapping location IDs to dicts,
+    with each dict mapping a category name to a count. Any categories not present in a
+    particular dict are assumed to have a count of zero for that location.
+
+    If not None, category_to_max_allowable_error should be a dict mapping category names
+    to maximum allowable errors. These are hard constraints, but you can specify a subset
+    of categories. Categories not included here have a maximum error of Inf.
+
+    If not None, category_to_error_weight should be a dict mapping category names to
+    error weights. You can specify a subset of categories. Categories not included here
+    have a weight of 1.0.
+
+    default_max_allowable_error is the maximum allowable error for categories not present in
+    category_to_max_allowable_error. Set to None (or >= 1.0) to disable hard constraints for
+    categories not present in category_to_max_allowable_error.
+
+    returns val_locations,category_to_val_fraction
+
+    """
+
+    location_ids = list(location_to_category_counts.keys())
+
+    n_val_locations = int(target_val_fraction*len(location_ids))
+
+    if category_to_max_allowable_error is None:
+        category_to_max_allowable_error = {}
+
+    if category_to_error_weight is None:
+        category_to_error_weight = {}
+
+    # category ID to total count; the total count is used only for printouts
+    category_id_to_count = {}
+    for location_id in location_to_category_counts:
+        for category_id in location_to_category_counts[location_id].keys():
+            if category_id not in category_id_to_count:
+                category_id_to_count[category_id] = 0
+            category_id_to_count[category_id] += \
+                location_to_category_counts[location_id][category_id]
+
+    category_ids = set(category_id_to_count.keys())
+
+    print('Splitting {} categories over {} locations'.format(
+        len(category_ids),len(location_ids)))
+
+    # random_seed = 0
+    def compute_seed_errors(random_seed):
+        """
+        Compute the per-category error for a specific random seed.
+
+        returns weighted_average_error,weighted_category_errors,category_to_val_fraction
+        """
+
+        # Randomly split into train/val
+        random.seed(random_seed)
+        val_locations = random.sample(location_ids,k=n_val_locations)
+        val_locations_set = set(val_locations)
+
+        # For each category, measure the % of images that went into the val set
+        category_to_val_fraction = defaultdict(float)
+
+        for category_id in category_ids:
+            category_val_count = 0
+            category_train_count = 0
+            for location_id in location_to_category_counts:
+                if category_id not in location_to_category_counts[location_id]:
+                    location_category_count = 0
+                else:
+                    location_category_count = location_to_category_counts[location_id][category_id]
+                if location_id in val_locations_set:
+                    category_val_count += location_category_count
+                else:
+                    category_train_count += location_category_count
+            category_val_fraction = category_val_count / (category_val_count + category_train_count)
+            category_to_val_fraction[category_id] = category_val_fraction
+
+        # Absolute deviation from the target val fraction for each category
+        category_errors = {}
+        weighted_category_errors = {}
+
+        # category = next(iter(category_to_val_fraction))
+        for category in category_to_val_fraction:
+
+            category_val_fraction = category_to_val_fraction[category]
+
+            category_error = abs(category_val_fraction-target_val_fraction)
+            category_errors[category] = category_error
+
+            category_weight = 1.0
+            if category in category_to_error_weight:
+                category_weight = category_to_error_weight[category]
+            weighted_category_error = category_error * category_weight
+            weighted_category_errors[category] = weighted_category_error
+
+        weighted_average_error = np.mean(list(weighted_category_errors.values()))
+
+        return weighted_average_error,weighted_category_errors,category_to_val_fraction
+
+    # ... def compute_seed_errors(...)
+
+    # This will only include random seeds that satisfy the hard constraints
+    random_seed_to_weighted_average_error = {}
+
+    # random_seed = 0
+    for random_seed in tqdm(range(0,n_random_seeds)):
+
+        weighted_average_error,weighted_category_errors,category_to_val_fraction = \
+            compute_seed_errors(random_seed)
+
+        seed_satisfies_hard_constraints = True
+
+        for category in category_to_val_fraction:
+            if category in category_to_max_allowable_error:
+                max_allowable_error = category_to_max_allowable_error[category]
+            else:
+                if default_max_allowable_error is None:
+                    continue
+                max_allowable_error = default_max_allowable_error
+            val_fraction = category_to_val_fraction[category]
+            category_error = abs(val_fraction - target_val_fraction)
+            if category_error > max_allowable_error:
+                seed_satisfies_hard_constraints = False
+                break
+
+        if seed_satisfies_hard_constraints:
+            random_seed_to_weighted_average_error[random_seed] = weighted_average_error
+
+    # ...for each random seed
+
+    assert len(random_seed_to_weighted_average_error) > 0, \
+        'No random seed met all the hard constraints'
+
+    print('\n{} of {} random seeds satisfied hard constraints'.format(
+        len(random_seed_to_weighted_average_error),n_random_seeds))
+
+    min_error = None
+    min_error_seed = None
+
+    for random_seed in random_seed_to_weighted_average_error.keys():
+        error_metric = random_seed_to_weighted_average_error[random_seed]
+        if min_error is None or error_metric < min_error:
+            min_error = error_metric
+            min_error_seed = random_seed
+
+    random.seed(min_error_seed)
+    val_locations = random.sample(location_ids,k=n_val_locations)
+    train_locations = []
+    for location_id in location_ids:
+        if location_id not in val_locations:
+            train_locations.append(location_id)
+
+    print('\nVal locations:\n')
+    for loc in val_locations:
+        print('{}'.format(loc))
+    print('')
+
+    weighted_average_error,weighted_category_errors,category_to_val_fraction = \
+        compute_seed_errors(min_error_seed)
+
+    random_seed = min_error_seed
+
+    category_to_val_fraction = sort_dictionary_by_value(category_to_val_fraction,reverse=True)
+    category_to_val_fraction = sort_dictionary_by_value(category_to_val_fraction,
+                                                        sort_values=category_id_to_count,
+                                                        reverse=True)
+
+
+    print('Val fractions by category:\n')
+
+    for category in category_to_val_fraction:
+        print('{} ({}) {:.2f}'.format(
+            category,category_id_to_count[category],
+            category_to_val_fraction[category]))
+
+    return val_locations,category_to_val_fraction
+
+# ...def split_locations_into_train_val(...)
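
For reference, a minimal usage sketch of the new splitter. The location IDs, category names, counts, constraints, and weights below are all invented for illustration; only the function itself comes from the diff above.

import random
from md_utils.split_locations_into_train_val import split_locations_into_train_val

# Build a hypothetical dataset: 100 locations with random per-category image counts
random.seed(0)
location_to_category_counts = {}
for i_location in range(100):
    counts = {'deer': random.randint(0, 200), 'empty': random.randint(0, 500)}
    if random.random() < 0.2:
        # 'fox' is deliberately rare, so it benefits from a hard constraint
        counts['fox'] = random.randint(1, 20)
    location_to_category_counts['loc_{:03d}'.format(i_location)] = counts

val_locations, category_to_val_fraction = split_locations_into_train_val(
    location_to_category_counts,
    n_random_seeds=2000,
    target_val_fraction=0.15,
    # Hard constraint: the rare 'fox' class must land within +/- 0.05 of the target
    category_to_max_allowable_error={'fox': 0.05},
    # Soft preference: weight 'deer' errors twice as heavily when scoring candidate splits
    category_to_error_weight={'deer': 2.0})

Because candidate splits are generated from a fixed range of seeds and the best seed is chosen deterministically, re-running with the same inputs reproduces the same split.
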
md_utils/string_utils.py CHANGED
@@ -57,3 +57,13 @@ def human_readable_to_bytes(size):
         bytes = 0
 
     return bytes
+
+
+def remove_ansi_codes(s):
+    """
+    Remove ANSI escape codes from a string.
+
+    https://stackoverflow.com/questions/14693701/how-can-i-remove-the-ansi-escape-sequences-from-a-string-in-python#14693789
+    """
+    ansi_escape = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')
+    return ansi_escape.sub('', s)
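
A quick sanity check for the new helper (the example string is made up; the function is the one added above):

from md_utils.string_utils import remove_ansi_codes

# '\x1b[31m' switches terminal output to red, '\x1b[0m' resets formatting
colored = '\x1b[31mProcessing failed\x1b[0m'
print(remove_ansi_codes(colored))  # prints: Processing failed
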
md_utils/url_utils.py CHANGED
@@ -16,6 +16,7 @@ import requests
 
 from tqdm import tqdm
 from urllib.parse import urlparse
+from multiprocessing.pool import ThreadPool
 
 url_utils_temp_dir = None
 max_path_len = 255
@@ -109,7 +110,14 @@ def download_url(url, destination_filename=None, progress_updater=None,
 
 def download_relative_filename(url, output_base, verbose=False):
     """
-    Download a URL to output_base, preserving relative path
+    Download a URL to output_base, preserving relative path. Path is relative to
+    the site, so:
+
+    https://abc.com/xyz/123.txt
+
+    ...will get downloaded to:
+
+    output_base/xyz/123.txt
     """
 
     p = urlparse(url)
@@ -119,6 +127,63 @@
     download_url(url, destination_filename, verbose=verbose)
 
 
+def parallel_download_urls(url_to_target_file,verbose=False,overwrite=False,
+                           n_workers=20):
+    """
+    Download a list of URLs to local files. url_to_target_file should
+    be a dict mapping URLs to output files. Catches exceptions and reports
+    them in the returned "results" array.
+    """
+
+    def _do_parallelized_download(download_info,overwrite=False):
+        url = download_info['url']
+        target_file = download_info['target_file']
+        result = {'status':'unknown','url':url,'target_file':target_file}
+
+        if ((os.path.isfile(target_file)) and (not overwrite)):
+            result['status'] = 'skipped'
+            return result
+        try:
+            download_url(url=url,
+                         destination_filename=target_file,
+                         verbose=verbose, force_download=overwrite)
+        except Exception as e:
+            print('Warning: error downloading URL {}: {}'.format(
+                url,str(e)))
+            result['status'] = 'error: {}'.format(str(e))
+            return result
+
+        result['status'] = 'success'
+        return result
+
+    all_download_info = []
+    for url in url_to_target_file:
+        download_info = {}
+        download_info['url'] = url
+        download_info['target_file'] = url_to_target_file[url]
+        all_download_info.append(download_info)
+
+    print('Downloading {} images on {} workers'.format(
+        len(all_download_info),n_workers))
+
+    if n_workers <= 1:
+
+        results = []
+
+        for download_info in tqdm(all_download_info):
+            result = _do_parallelized_download(download_info,overwrite=overwrite)
+            results.append(result)
+
+    else:
+
+        pool = ThreadPool(n_workers)
+        results = list(tqdm(pool.imap(lambda download_info: _do_parallelized_download(
+            download_info,overwrite=overwrite),all_download_info),
+            total=len(all_download_info)))
+
+    return results
+
+
 def test_urls(urls, error_on_failure=True):
     """
     Verify that a list of URLs is available (returns status 200). By default,
@@ -140,5 +205,3 @@
 
     return status_codes
 
-
-
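
A minimal usage sketch for the new parallel downloader; the URLs and local paths below are placeholders, and only parallel_download_urls itself comes from the diff above:

from md_utils.url_utils import parallel_download_urls

# Placeholder URL -> local-path mapping; any {url: target_file} dict works
url_to_target_file = {
    'https://example.com/images/0001.jpg': '/tmp/downloads/images/0001.jpg',
    'https://example.com/images/0002.jpg': '/tmp/downloads/images/0002.jpg'
}

results = parallel_download_urls(url_to_target_file, n_workers=10, overwrite=False)

errors = [r for r in results if r['status'].startswith('error')]
print('{} of {} downloads failed'.format(len(errors), len(results)))

Existing files are skipped unless overwrite=True, and per-URL failures are recorded in the returned result dicts rather than raised.
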
md_utils/write_html_image_list.py CHANGED
@@ -42,6 +42,7 @@ def write_html_image_list(filename=None,images=None,options=None):
         defaultImageStyle
         maxFiguresPerHtmlFile
         urlEncodeFilenames (default True, e.g. '#' will be replaced by '%23')
+        urlEncodeLinkTargets (default True, e.g. '#' will be replaced by '%23')
 
     """
 
@@ -68,7 +69,10 @@
 
    if 'urlEncodeFilenames' not in options or options['urlEncodeFilenames'] is None:
        options['urlEncodeFilenames'] = True
-
+
+    if 'urlEncodeLinkTargets' not in options or options['urlEncodeLinkTargets'] is None:
+        options['urlEncodeLinkTargets'] = True
+
     # Possibly split the html output for figures into multiple files; Chrome gets sad with
     # thousands of images in a single tab.
     if 'maxFiguresPerHtmlFile' not in options or options['maxFiguresPerHtmlFile'] is None:
@@ -176,7 +180,8 @@
            title = title.encode('ascii','ignore').decode('ascii')
            filename = filename.encode('ascii','ignore').decode('ascii')
 
-        if options['urlEncodeFilenames']:
+        filename = filename.replace('\\','/')
+        if options['urlEncodeFilenames']:
             filename = urllib.parse.quote(filename)
 
         if len(title) > 0:
@@ -184,6 +189,11 @@
                '<p style="{}">{}</p>\n'\
                .format(textStyle,title))
 
+        linkTarget = linkTarget.replace('\\','/')
+        if options['urlEncodeLinkTargets']:
+            # These are typically absolute paths, so we only want to mess with certain characters
+            linkTarget = urllib.parse.quote(linkTarget,safe=':/')
+
         if len(linkTarget) > 0:
             fHtml.write('<a href="{}">'.format(linkTarget))
             # imageStyle.append(';border:0px;')
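
For context, the effect of the new link-target handling can be reproduced directly with urllib; the path below is made up:

import urllib.parse

# Backslashes are normalized to forward slashes first, then everything except
# ':' and '/' is percent-encoded, so drive letters and path separators survive
# while spaces and '#' are escaped.
link_target = 'C:\\images\\img #1.jpg'.replace('\\', '/')
print(urllib.parse.quote(link_target, safe=':/'))
# prints: C:/images/img%20%231.jpg
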