megadetector-10.0.15-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147)
  1. megadetector/__init__.py +0 -0
  2. megadetector/api/__init__.py +0 -0
  3. megadetector/api/batch_processing/integration/digiKam/setup.py +6 -0
  4. megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +465 -0
  5. megadetector/api/batch_processing/integration/eMammal/test_scripts/config_template.py +5 -0
  6. megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +125 -0
  7. megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +55 -0
  8. megadetector/classification/__init__.py +0 -0
  9. megadetector/classification/aggregate_classifier_probs.py +108 -0
  10. megadetector/classification/analyze_failed_images.py +227 -0
  11. megadetector/classification/cache_batchapi_outputs.py +198 -0
  12. megadetector/classification/create_classification_dataset.py +626 -0
  13. megadetector/classification/crop_detections.py +516 -0
  14. megadetector/classification/csv_to_json.py +226 -0
  15. megadetector/classification/detect_and_crop.py +853 -0
  16. megadetector/classification/efficientnet/__init__.py +9 -0
  17. megadetector/classification/efficientnet/model.py +415 -0
  18. megadetector/classification/efficientnet/utils.py +608 -0
  19. megadetector/classification/evaluate_model.py +520 -0
  20. megadetector/classification/identify_mislabeled_candidates.py +152 -0
  21. megadetector/classification/json_to_azcopy_list.py +63 -0
  22. megadetector/classification/json_validator.py +696 -0
  23. megadetector/classification/map_classification_categories.py +276 -0
  24. megadetector/classification/merge_classification_detection_output.py +509 -0
  25. megadetector/classification/prepare_classification_script.py +194 -0
  26. megadetector/classification/prepare_classification_script_mc.py +228 -0
  27. megadetector/classification/run_classifier.py +287 -0
  28. megadetector/classification/save_mislabeled.py +110 -0
  29. megadetector/classification/train_classifier.py +827 -0
  30. megadetector/classification/train_classifier_tf.py +725 -0
  31. megadetector/classification/train_utils.py +323 -0
  32. megadetector/data_management/__init__.py +0 -0
  33. megadetector/data_management/animl_to_md.py +161 -0
  34. megadetector/data_management/annotations/__init__.py +0 -0
  35. megadetector/data_management/annotations/annotation_constants.py +33 -0
  36. megadetector/data_management/camtrap_dp_to_coco.py +270 -0
  37. megadetector/data_management/cct_json_utils.py +566 -0
  38. megadetector/data_management/cct_to_md.py +184 -0
  39. megadetector/data_management/cct_to_wi.py +293 -0
  40. megadetector/data_management/coco_to_labelme.py +284 -0
  41. megadetector/data_management/coco_to_yolo.py +701 -0
  42. megadetector/data_management/databases/__init__.py +0 -0
  43. megadetector/data_management/databases/add_width_and_height_to_db.py +107 -0
  44. megadetector/data_management/databases/combine_coco_camera_traps_files.py +210 -0
  45. megadetector/data_management/databases/integrity_check_json_db.py +563 -0
  46. megadetector/data_management/databases/subset_json_db.py +195 -0
  47. megadetector/data_management/generate_crops_from_cct.py +200 -0
  48. megadetector/data_management/get_image_sizes.py +164 -0
  49. megadetector/data_management/labelme_to_coco.py +559 -0
  50. megadetector/data_management/labelme_to_yolo.py +349 -0
  51. megadetector/data_management/lila/__init__.py +0 -0
  52. megadetector/data_management/lila/create_lila_blank_set.py +556 -0
  53. megadetector/data_management/lila/create_lila_test_set.py +192 -0
  54. megadetector/data_management/lila/create_links_to_md_results_files.py +106 -0
  55. megadetector/data_management/lila/download_lila_subset.py +182 -0
  56. megadetector/data_management/lila/generate_lila_per_image_labels.py +777 -0
  57. megadetector/data_management/lila/get_lila_annotation_counts.py +174 -0
  58. megadetector/data_management/lila/get_lila_image_counts.py +112 -0
  59. megadetector/data_management/lila/lila_common.py +319 -0
  60. megadetector/data_management/lila/test_lila_metadata_urls.py +164 -0
  61. megadetector/data_management/mewc_to_md.py +344 -0
  62. megadetector/data_management/ocr_tools.py +873 -0
  63. megadetector/data_management/read_exif.py +964 -0
  64. megadetector/data_management/remap_coco_categories.py +195 -0
  65. megadetector/data_management/remove_exif.py +156 -0
  66. megadetector/data_management/rename_images.py +194 -0
  67. megadetector/data_management/resize_coco_dataset.py +665 -0
  68. megadetector/data_management/speciesnet_to_md.py +41 -0
  69. megadetector/data_management/wi_download_csv_to_coco.py +247 -0
  70. megadetector/data_management/yolo_output_to_md_output.py +594 -0
  71. megadetector/data_management/yolo_to_coco.py +984 -0
  72. megadetector/data_management/zamba_to_md.py +188 -0
  73. megadetector/detection/__init__.py +0 -0
  74. megadetector/detection/change_detection.py +840 -0
  75. megadetector/detection/process_video.py +479 -0
  76. megadetector/detection/pytorch_detector.py +1451 -0
  77. megadetector/detection/run_detector.py +1267 -0
  78. megadetector/detection/run_detector_batch.py +2172 -0
  79. megadetector/detection/run_inference_with_yolov5_val.py +1314 -0
  80. megadetector/detection/run_md_and_speciesnet.py +1604 -0
  81. megadetector/detection/run_tiled_inference.py +1044 -0
  82. megadetector/detection/tf_detector.py +209 -0
  83. megadetector/detection/video_utils.py +1379 -0
  84. megadetector/postprocessing/__init__.py +0 -0
  85. megadetector/postprocessing/add_max_conf.py +72 -0
  86. megadetector/postprocessing/categorize_detections_by_size.py +166 -0
  87. megadetector/postprocessing/classification_postprocessing.py +1943 -0
  88. megadetector/postprocessing/combine_batch_outputs.py +249 -0
  89. megadetector/postprocessing/compare_batch_results.py +2110 -0
  90. megadetector/postprocessing/convert_output_format.py +403 -0
  91. megadetector/postprocessing/create_crop_folder.py +629 -0
  92. megadetector/postprocessing/detector_calibration.py +570 -0
  93. megadetector/postprocessing/generate_csv_report.py +522 -0
  94. megadetector/postprocessing/load_api_results.py +223 -0
  95. megadetector/postprocessing/md_to_coco.py +428 -0
  96. megadetector/postprocessing/md_to_labelme.py +351 -0
  97. megadetector/postprocessing/md_to_wi.py +41 -0
  98. megadetector/postprocessing/merge_detections.py +392 -0
  99. megadetector/postprocessing/postprocess_batch_results.py +2140 -0
  100. megadetector/postprocessing/remap_detection_categories.py +226 -0
  101. megadetector/postprocessing/render_detection_confusion_matrix.py +677 -0
  102. megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +206 -0
  103. megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +82 -0
  104. megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +1665 -0
  105. megadetector/postprocessing/separate_detections_into_folders.py +795 -0
  106. megadetector/postprocessing/subset_json_detector_output.py +964 -0
  107. megadetector/postprocessing/top_folders_to_bottom.py +238 -0
  108. megadetector/postprocessing/validate_batch_results.py +332 -0
  109. megadetector/taxonomy_mapping/__init__.py +0 -0
  110. megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +491 -0
  111. megadetector/taxonomy_mapping/map_new_lila_datasets.py +211 -0
  112. megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +165 -0
  113. megadetector/taxonomy_mapping/preview_lila_taxonomy.py +543 -0
  114. megadetector/taxonomy_mapping/retrieve_sample_image.py +71 -0
  115. megadetector/taxonomy_mapping/simple_image_download.py +231 -0
  116. megadetector/taxonomy_mapping/species_lookup.py +1008 -0
  117. megadetector/taxonomy_mapping/taxonomy_csv_checker.py +159 -0
  118. megadetector/taxonomy_mapping/taxonomy_graph.py +346 -0
  119. megadetector/taxonomy_mapping/validate_lila_category_mappings.py +83 -0
  120. megadetector/tests/__init__.py +0 -0
  121. megadetector/tests/test_nms_synthetic.py +335 -0
  122. megadetector/utils/__init__.py +0 -0
  123. megadetector/utils/ct_utils.py +1857 -0
  124. megadetector/utils/directory_listing.py +199 -0
  125. megadetector/utils/extract_frames_from_video.py +307 -0
  126. megadetector/utils/gpu_test.py +125 -0
  127. megadetector/utils/md_tests.py +2072 -0
  128. megadetector/utils/path_utils.py +2872 -0
  129. megadetector/utils/process_utils.py +172 -0
  130. megadetector/utils/split_locations_into_train_val.py +237 -0
  131. megadetector/utils/string_utils.py +234 -0
  132. megadetector/utils/url_utils.py +825 -0
  133. megadetector/utils/wi_platform_utils.py +968 -0
  134. megadetector/utils/wi_taxonomy_utils.py +1766 -0
  135. megadetector/utils/write_html_image_list.py +239 -0
  136. megadetector/visualization/__init__.py +0 -0
  137. megadetector/visualization/plot_utils.py +309 -0
  138. megadetector/visualization/render_images_with_thumbnails.py +243 -0
  139. megadetector/visualization/visualization_utils.py +1973 -0
  140. megadetector/visualization/visualize_db.py +630 -0
  141. megadetector/visualization/visualize_detector_output.py +498 -0
  142. megadetector/visualization/visualize_video_output.py +705 -0
  143. megadetector-10.0.15.dist-info/METADATA +115 -0
  144. megadetector-10.0.15.dist-info/RECORD +147 -0
  145. megadetector-10.0.15.dist-info/WHEEL +5 -0
  146. megadetector-10.0.15.dist-info/licenses/LICENSE +19 -0
  147. megadetector-10.0.15.dist-info/top_level.txt +1 -0
megadetector/data_management/lila/create_lila_blank_set.py (new file)
@@ -0,0 +1,556 @@
"""

create_lila_blank_set.py

Create a folder of blank images sampled from LILA. We'll aim for diversity, so less-common
locations will be oversampled relative to more common locations. We'll also run MegaDetector
(with manual review) to remove some incorrectly-labeled, not-actually-empty images from our
blank set.

We'll store location information for each image in a .json file, so we can split locations
into train/val in downstream tasks.

"""

#%% Constants and imports

import os
import random
import math
import json

import numpy as np
from tqdm import tqdm
from multiprocessing.pool import ThreadPool
from urllib.parse import urlparse
from collections import defaultdict

from megadetector.data_management.lila.lila_common import read_lila_all_images_file
from megadetector.utils.url_utils import download_url
from megadetector.utils.ct_utils import sort_dictionary_by_value
from megadetector.utils.path_utils import is_image_file
from megadetector.utils.path_utils import find_images
from megadetector.visualization import visualization_utils as vis_utils
from megadetector.utils.path_utils import recursive_file_list
from megadetector.utils import ct_utils


#%% Environment

# We'll write images, metadata downloads, and temporary files here
lila_local_base = os.path.expanduser('~/lila')

metadata_dir = os.path.join(lila_local_base,'metadata')
os.makedirs(metadata_dir,exist_ok=True)

project_base = os.path.join(lila_local_base,'lila_blanks')

candidate_blanks_base = os.path.join(project_base,'candidate_blanks')
os.makedirs(candidate_blanks_base,exist_ok=True)

confirmed_blanks_base = os.path.join(project_base,'confirmed_blanks')
os.makedirs(confirmed_blanks_base,exist_ok=True)

md_possible_non_blanks_folder = os.path.join(project_base,'candidate_non_blanks')
os.makedirs(md_possible_non_blanks_folder,exist_ok=True)

location_to_blank_image_urls_cache_file = os.path.join(project_base,
                                                       'location_to_blank_image_urls.json')

md_results_file = os.path.join(project_base,'lila_blanks_md_results.json')

all_fn_relative_to_location_file = os.path.join(project_base,'all_fn_relative_to_location.json')
confirmed_fn_relative_to_location_file = os.path.join(project_base,'confirmed_fn_relative_to_location.json')

preferred_image_download_source = 'gcp'

# Number of concurrent download threads
n_download_threads = 20

n_blanks = 100000

random.seed(0)


#%% Download and open the giant table of image URLs and labels

# ~60 seconds to download, unzip, and open
df = read_lila_all_images_file(metadata_dir)


#%% Explore blank labels

# Original labels we're treating as blank:
blank_original_labels = (
    'empty','misfire'
)

# Some notable original labels we're *not* treating as blank:
nonblank_original_labels = (
    'unclassifiable', 'unidentifiable', 'unidentified', 'unknown', 'fire',
    'foggy lens', 'foggy weather', 'blurred', 'end', 'eye_shine', 'ignore',
    'lens obscured', 'misdirected', 'other', 'start', 'sun', 'problem',
    'tilted', 'vegetation obstruction', 'snow on lens', 'malfunction'
)

other_labels_without_common_names = (
    'car', 'motorcycle', 'vehicle'
)

common_names = sorted(list(df['common_name'].unique()),
                      key=lambda x:str(x) if isinstance(x,float) else x)
original_labels = sorted(list(df['original_label'].unique()),
                         key=lambda x:str(x) if isinstance(x,float) else x)

# Blanks are represented as NaN in the "common_name" column (though not all NaN's are blanks)
assert '' not in common_names
assert all([s not in common_names for s in blank_original_labels])
assert all([s not in common_names for s in nonblank_original_labels])
assert np.nan in common_names

# Blanks are represented as "empty" or "misfire" in the "original_label" column
assert all([s in original_labels for s in blank_original_labels])
assert all([s in original_labels for s in nonblank_original_labels])
assert all([s in original_labels for s in other_labels_without_common_names])
assert all([s not in original_labels for s in ('','blank','none',np.nan)])


#%% Count empty labels and common names

common_names_with_empty_original_labels = set()
original_labels_with_nan_common_names = set()

common_name_to_count = defaultdict(int)
original_label_to_count = defaultdict(int)

# This loop takes ~10 mins
for i_row,row in tqdm(df.iterrows(),total=len(df)):

    common_name = row['common_name']
    original_label = row['original_label']

    if isinstance(common_name,float):
        assert np.isnan(common_name)
        original_labels_with_nan_common_names.add(original_label)

    common_name = str(common_name)

    assert isinstance(original_label,str)
    if original_label in blank_original_labels:
        common_names_with_empty_original_labels.add(common_name)
    common_name_to_count[common_name] += 1
    original_label_to_count[original_label] += 1


#%% Look at the most common labels and common names

common_name_to_count = sort_dictionary_by_value(common_name_to_count,reverse=True)
original_label_to_count = sort_dictionary_by_value(original_label_to_count,reverse=True)

k = 10

print('\nMost frequent common names:\n')

i_label = 0
for i_label,s in enumerate(common_name_to_count):
    if i_label >= k:
        break
    print('{}: {}'.format(s,common_name_to_count[s]))

print('\nMost frequent original labels:\n')

i_label = 0
for i_label,s in enumerate(original_label_to_count):
    if i_label >= k:
        break
    print('{}: {}'.format(s,original_label_to_count[s]))


#%% Do some consistency checks over the empty labels and stats

# All images called 'empty' should have NaN as their common name
assert (len(common_names_with_empty_original_labels) == 1)
assert next(iter(common_names_with_empty_original_labels)) == 'nan'

# 'empty' should be the most frequent original label overall
assert next(iter(original_label_to_count)) == 'empty'

# NaN should be the most frequent common name overall
assert next(iter(common_name_to_count)) == 'nan'

for s in original_labels_with_nan_common_names:
    assert \
        (s in blank_original_labels) or \
        (s in nonblank_original_labels) or \
        (s in other_labels_without_common_names)


#%% Map locations to blank images

force_map_locations = False

# Load from .json if available
if (not force_map_locations) and (os.path.isfile(location_to_blank_image_urls_cache_file)):

    with open(location_to_blank_image_urls_cache_file,'r') as f:
        location_to_blank_image_urls = json.load(f)

else:

    location_to_blank_image_urls = defaultdict(list)

    # i_row = 0; row = df.iloc[i_row]
    for i_row,row in tqdm(df.iterrows(),total=len(df)):

        location_id = row['location_id']
        url = row['url']

        original_label = row['original_label']
        if original_label in blank_original_labels:
            assert np.isnan(row['common_name'])
            location_to_blank_image_urls[location_id].append(url)

    ct_utils.write_json(location_to_blank_image_urls_cache_file, location_to_blank_image_urls)

n_locations_with_blanks = len(location_to_blank_image_urls)
print('Found {} locations with blank images'.format(n_locations_with_blanks))


#%% Sample blanks

random.seed(0)

# Make a fresh copy of the lists
location_to_unsampled_blank_image_urls = {}

# location = next(iter(location_to_blank_image_urls.keys()))
for location in location_to_blank_image_urls:
    blank_image_urls_this_location = location_to_blank_image_urls[location]
    unsampled_blank_image_urls_this_location = blank_image_urls_this_location.copy()
    location_to_unsampled_blank_image_urls[location] = unsampled_blank_image_urls_this_location

# Put locations in a random order
location_ids = list(location_to_unsampled_blank_image_urls.keys())
random.shuffle(location_ids)

blank_urls = []
location_to_sampled_blanks = defaultdict(list)
fully_sampled_locations = set()

# Round-robin over locations, picking one image per location per pass, until we
# hit our limit or have no blanks left
while(True):

    found_sample = False

    # location = location_ids[0]
    for location in location_ids:

        unsampled_images_this_location = location_to_unsampled_blank_image_urls[location]
        if len(unsampled_images_this_location) == 0:
            fully_sampled_locations.add(location)
            continue

        url = random.choice(unsampled_images_this_location)
        blank_urls.append(url)
        location_to_unsampled_blank_image_urls[location].remove(url)
        location_to_sampled_blanks[location].append(url)
        found_sample = True

        if len(blank_urls) == n_blanks:
            break

    # ...for each location

    # If no location yielded a sample, every location is exhausted, so stop
    # (without this break, the loop would never terminate)
    if not found_sample:
        print('Terminating after {} blanks, we ran out before hitting {}'.format(
            len(blank_urls),n_blanks))
        break

    if len(blank_urls) == n_blanks:
        break

# ...while(True)

assert len(blank_urls) <= n_blanks
min_blanks_per_location = math.floor(n_blanks/n_locations_with_blanks)
max_blanks_per_location = -1
for location in location_to_sampled_blanks:
    n_blanks_this_location = len(location_to_sampled_blanks[location])
    if n_blanks_this_location >= max_blanks_per_location:
        max_blanks_per_location = n_blanks_this_location
    assert (location in fully_sampled_locations) or \
        n_blanks_this_location >= min_blanks_per_location

print('Chose {} blanks from {} locations'.format(len(blank_urls),len(location_ids)))
print('Fully sampled {} locations'.format(len(fully_sampled_locations)))
print('Max samples per location: {}'.format(max_blanks_per_location))
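
# Optional sanity check (a minimal sketch, not part of the sampling logic): look at
# the per-location sample counts, to confirm that rare locations were fully sampled
# and common locations were capped near max_blanks_per_location.
from collections import Counter
samples_per_location = Counter({loc:len(urls) for loc,urls in location_to_sampled_blanks.items()})
print(samples_per_location.most_common(5))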


#%% Download those image files (prep)

container_to_url_base = {
    'lilawildlife.blob.core.windows.net':'/lila-wildlife/',
    'storage.googleapis.com':'/public-datasets-lila/'
}

def download_relative_filename(url, output_base, verbose=False, url_base=None, overwrite=False):
    """
    Download a URL to output_base, preserving the URL's relative path
    """

    result = {'status':'unknown','url':url,'destination_filename':None}

    if url_base is None:
        assert url.startswith('https://')
        container = url.split('/')[2]
        assert container in container_to_url_base
        url_base = container_to_url_base[container]

    assert url_base.startswith('/') and url_base.endswith('/')

    p = urlparse(url)
    relative_filename = str(p.path)
    # Remove url_base (including the leading '/') from the path
    assert relative_filename.startswith(url_base)
    relative_filename = relative_filename.replace(url_base,'',1)

    destination_filename = os.path.join(output_base,relative_filename)
    result['destination_filename'] = destination_filename

    if ((os.path.isfile(destination_filename)) and (not overwrite)):
        result['status'] = 'skipped'
        return result

    try:
        download_url(url, destination_filename, verbose=verbose)
    except Exception as e:
        print('Warning: error downloading URL {}: {}'.format(
            url,str(e)))
        result['status'] = 'error: {}'.format(str(e))
        return result

    result['status'] = 'success'
    return result
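
# Hypothetical usage (a sketch; the URL below is illustrative, not a real file):
#
# r = download_relative_filename(
#     'https://storage.googleapis.com/public-datasets-lila/some-dataset/images/0001.jpg',
#     candidate_blanks_base)
#
# ...writes to <candidate_blanks_base>/some-dataset/images/0001.jpg and returns
# a dict whose 'status' is 'success', 'skipped', or 'error: ...'.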


def azure_url_to_gcp_http_url(url,error_if_not_azure_url=True):
    """
    Most URLs point to Azure by default, but most files are available on both Azure and GCP.
    This function converts an Azure URL to the corresponding GCP http:// URL.
    """

    lila_azure_storage_account = 'https://lilawildlife.blob.core.windows.net'
    gcp_bucket_api_url = 'https://storage.googleapis.com/public-datasets-lila'

    if error_if_not_azure_url:
        assert url.startswith(lila_azure_storage_account)
    gcp_url = url.replace(lila_azure_storage_account,gcp_bucket_api_url,1)
    return gcp_url
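
# Mechanically, given the constants above, this turns:
#
#   https://lilawildlife.blob.core.windows.net/<path>
#
# ...into:
#
#   https://storage.googleapis.com/public-datasets-lila/<path>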

# Convert Azure URLs to GCP URLs if necessary
if preferred_image_download_source != 'azure':
    assert preferred_image_download_source == 'gcp'
    blank_urls = [azure_url_to_gcp_http_url(url) for url in blank_urls]


#%% Download those image files (execution)

print('Downloading {} images on {} workers'.format(len(blank_urls),n_download_threads))

if n_download_threads <= 1:

    results = []

    # url = blank_urls[0]
    for url in tqdm(blank_urls):
        results.append(download_relative_filename(url,candidate_blanks_base,url_base=None))

else:

    pool = ThreadPool(n_download_threads)
    results = list(tqdm(pool.imap(lambda s: download_relative_filename(
        s,candidate_blanks_base,url_base=None),
        blank_urls), total=len(blank_urls)))

    # pool.terminate()


#%% Review results

error_urls = []
for r in results:
    if r['status'] != 'success':
        error_urls.append(r['url'])

print('Errors on {} of {} downloads'.format(len(error_urls),len(results)))
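
# Optionally (a sketch), failed downloads can be retried once, serially:
#
# for url in tqdm(error_urls):
#     download_relative_filename(url,candidate_blanks_base,url_base=None)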


#%% Run MegaDetector on the folder

cmd = 'python run_detector_batch.py MDV5A "{}" "{}"'.format(
    candidate_blanks_base,md_results_file)
cmd += ' --recursive --output_relative_filenames'

# import clipboard; clipboard.copy(cmd); print(cmd)
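
# If run_detector_batch.py isn't in the current directory, the same entry point can
# be invoked as a module from the installed package (a sketch of the equivalent command):
#
# cmd = 'python -m megadetector.detection.run_detector_batch MDV5A "{}" "{}"'.format(
#     candidate_blanks_base,md_results_file)
# cmd += ' --recursive --output_relative_filenames'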


#%% Review MD results that suggest images are non-empty

assert os.path.isfile(md_results_file)

category_name_to_threshold = {'animal':0.25,'person':0.25,'vehicle':0.25}
min_threshold = min(category_name_to_threshold.values())
with open(md_results_file,'r') as f:
    md_results = json.load(f)

images_to_review_to_detections = {}

category_id_to_threshold = {}
for category_id in md_results['detection_categories']:
    category_name = md_results['detection_categories'][category_id]
    category_id_to_threshold[category_id] = category_name_to_threshold[category_name]

# im = md_results['images'][0]
for im in md_results['images']:

    if 'detections' not in im:
        continue

    found_object = False
    for det in im['detections']:
        threshold = category_id_to_threshold[det['category']]
        if det['conf'] >= threshold:
            found_object = True
            break
    if found_object:
        images_to_review_to_detections[im['file']] = im['detections']

print('Flagging {} of {} images for review'.format(len(images_to_review_to_detections),
                                                   len(md_results['images'])))

output_file_to_source_file = {}

# i_fn = 0; source_file_relative = next(iter(images_to_review_to_detections))
for i_fn,source_file_relative in tqdm(enumerate(images_to_review_to_detections),
                                      total=len(images_to_review_to_detections)):

    source_file_abs = os.path.join(candidate_blanks_base,source_file_relative)
    assert os.path.isfile(source_file_abs)
    ext = os.path.splitext(source_file_abs)[1]
    target_file_relative = str(i_fn).zfill(8) + ext
    target_file_abs = os.path.join(md_possible_non_blanks_folder,target_file_relative)
    output_file_to_source_file[target_file_relative] = source_file_relative
    # shutil.copyfile(source_file_abs,target_file_abs)
    vis_utils.draw_bounding_boxes_on_file(input_file=source_file_abs,
                                          output_file=target_file_abs,
                                          detections=images_to_review_to_detections[source_file_relative],
                                          confidence_threshold=min_threshold,
                                          target_size=(1280,-1))

# This is a temporary file I just used during debugging
ct_utils.write_json(os.path.join(project_base,'output_file_to_source_file.json'), output_file_to_source_file)


#%% Manual review

# Manually delete images from the "candidate non-blanks" folder that are *not* actually empty


#%% Figure out which images are still there; these are the actually-blank ones

remaining_images = set(os.listdir(md_possible_non_blanks_folder))
print('Kept {} of {} candidate blank images'.format(len(remaining_images),
                                                    len(images_to_review_to_detections)))

removed_blank_images_relative = []

# output_file = next(iter(output_file_to_source_file.keys()))
for output_file in tqdm(output_file_to_source_file.keys()):
    if output_file not in remaining_images:
        source_file_relative = output_file_to_source_file[output_file]
        removed_blank_images_relative.append(source_file_relative)

removed_blank_images_relative_set = set(removed_blank_images_relative)
assert len(removed_blank_images_relative) + len(remaining_images) == len(output_file_to_source_file)


#%% Copy only the confirmed blanks to the confirmed folder

all_candidate_blanks = recursive_file_list(candidate_blanks_base,return_relative_paths=True)
print('Found {} candidate blanks'.format(len(all_candidate_blanks)))

skipped_images_relative = []
skipped_non_images = []

for source_fn_relative in tqdm(all_candidate_blanks):

    # Skip anything we removed from the "candidate non-blanks" folder; these weren't really
    # blank.
    if source_fn_relative in removed_blank_images_relative_set:
        skipped_images_relative.append(source_fn_relative)
        continue

    if not is_image_file(source_fn_relative):
        # Not a typo; "skipped images" really means "skipped files"
        skipped_images_relative.append(source_fn_relative)
        skipped_non_images.append(source_fn_relative)
        continue

    source_fn_abs = os.path.join(candidate_blanks_base,source_fn_relative)
    assert os.path.isfile(source_fn_abs)
    target_fn_abs = os.path.join(confirmed_blanks_base,source_fn_relative)
    os.makedirs(os.path.dirname(target_fn_abs),exist_ok=True)
    # shutil.copyfile(source_fn_abs,target_fn_abs)

print('Skipped {} files ({} non-image files)'.format(len(skipped_images_relative),
                                                     len(skipped_non_images)))


#%% Validate the folder of confirmed blanks

# all_confirmed_blanks = recursive_file_list(confirmed_blanks_base,return_relative_paths=True)
all_confirmed_blanks = find_images(confirmed_blanks_base,return_relative_paths=True,recursive=True)
assert len(all_confirmed_blanks) < len(all_candidate_blanks)
print('Found {} confirmed blanks'.format(len(all_confirmed_blanks)))


#%% Manually review a few of the images we skipped

# ...to make sure they're non-blank
i_image = random.randint(0, len(skipped_images_relative)-1)
fn_relative = skipped_images_relative[i_image]
fn_abs = os.path.join(candidate_blanks_base,fn_relative)
assert os.path.isfile(fn_abs)

# import clipboard; clipboard.copy('feh --scale-down "{}"'.format(fn_abs))


#%% Record location information for each confirmed file

# Map every URL's path to the corresponding location
#
# This is *all empty URLs*, not just the ones we downloaded
all_fn_relative_to_location = {}

# location = next(iter(location_to_blank_image_urls.keys()))
for location in tqdm(location_to_blank_image_urls):

    urls_this_location = location_to_blank_image_urls[location]

    # url = urls_this_location[0]
    for url in urls_this_location:

        # Turn:
        #
        # https://lilablobssc.blob.core.windows.net/caltech-unzipped/cct_images/5968c0f9-23d2-11e8-a6a3-ec086b02610b.jpg
        #
        # ...into:
        #
        # caltech-unzipped/cct_images/5968c0f9-23d2-11e8-a6a3-ec086b02610b.jpg
        p = urlparse(url)
        fn_relative = str(p.path)[1:]
        all_fn_relative_to_location[fn_relative] = location

# Build a much smaller mapping of just the confirmed blanks
confirmed_fn_relative_to_location = {}
for i_fn,fn_relative in tqdm(enumerate(all_confirmed_blanks),total=len(all_confirmed_blanks)):
    confirmed_fn_relative_to_location[fn_relative] = all_fn_relative_to_location[fn_relative]

ct_utils.write_json(all_fn_relative_to_location_file, all_fn_relative_to_location)
ct_utils.write_json(confirmed_fn_relative_to_location_file, confirmed_fn_relative_to_location)
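
# Downstream, a minimal sketch of the location-based train/val split mentioned in the
# file header, using only the mapping written above (the 0.85 train fraction is an
# arbitrary example, not a value from this script):
#
# locations = sorted(set(confirmed_fn_relative_to_location.values()))
# random.shuffle(locations)
# n_train = int(0.85*len(locations))
# train_locations = set(locations[:n_train])
# train_files = [fn for fn,loc in confirmed_fn_relative_to_location.items()
#                if loc in train_locations]
# val_files = [fn for fn,loc in confirmed_fn_relative_to_location.items()
#              if loc not in train_locations]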