megadetector-5.0.7-py3-none-any.whl → megadetector-5.0.8-py3-none-any.whl
- api/batch_processing/data_preparation/manage_local_batch.py +28 -14
- api/batch_processing/postprocessing/combine_api_outputs.py +2 -2
- api/batch_processing/postprocessing/compare_batch_results.py +1 -1
- api/batch_processing/postprocessing/convert_output_format.py +24 -6
- api/batch_processing/postprocessing/load_api_results.py +1 -3
- api/batch_processing/postprocessing/md_to_labelme.py +118 -51
- api/batch_processing/postprocessing/merge_detections.py +30 -5
- api/batch_processing/postprocessing/postprocess_batch_results.py +24 -12
- api/batch_processing/postprocessing/remap_detection_categories.py +163 -0
- api/batch_processing/postprocessing/render_detection_confusion_matrix.py +15 -12
- api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +2 -2
- data_management/cct_json_utils.py +7 -2
- data_management/coco_to_labelme.py +263 -0
- data_management/coco_to_yolo.py +7 -4
- data_management/databases/integrity_check_json_db.py +68 -59
- data_management/databases/subset_json_db.py +1 -1
- data_management/get_image_sizes.py +44 -26
- data_management/importers/animl_results_to_md_results.py +1 -3
- data_management/importers/noaa_seals_2019.py +1 -1
- data_management/labelme_to_coco.py +252 -143
- data_management/labelme_to_yolo.py +95 -52
- data_management/lila/create_lila_blank_set.py +106 -23
- data_management/lila/download_lila_subset.py +133 -65
- data_management/lila/generate_lila_per_image_labels.py +1 -1
- data_management/lila/lila_common.py +8 -38
- data_management/read_exif.py +65 -16
- data_management/remap_coco_categories.py +84 -0
- data_management/resize_coco_dataset.py +3 -22
- data_management/wi_download_csv_to_coco.py +239 -0
- data_management/yolo_to_coco.py +283 -83
- detection/run_detector_batch.py +12 -3
- detection/run_inference_with_yolov5_val.py +10 -3
- detection/run_tiled_inference.py +2 -2
- detection/tf_detector.py +2 -1
- detection/video_utils.py +1 -1
- md_utils/ct_utils.py +22 -3
- md_utils/md_tests.py +11 -2
- md_utils/path_utils.py +206 -32
- md_utils/url_utils.py +66 -1
- md_utils/write_html_image_list.py +12 -3
- md_visualization/visualization_utils.py +363 -72
- md_visualization/visualize_db.py +33 -10
- {megadetector-5.0.7.dist-info → megadetector-5.0.8.dist-info}/METADATA +10 -12
- {megadetector-5.0.7.dist-info → megadetector-5.0.8.dist-info}/RECORD +47 -44
- {megadetector-5.0.7.dist-info → megadetector-5.0.8.dist-info}/WHEEL +1 -1
- md_visualization/visualize_megadb.py +0 -183
- {megadetector-5.0.7.dist-info → megadetector-5.0.8.dist-info}/LICENSE +0 -0
- {megadetector-5.0.7.dist-info → megadetector-5.0.8.dist-info}/top_level.txt +0 -0
data_management/labelme_to_yolo.py

@@ -11,6 +11,9 @@
 import os
 import json
 
+from multiprocessing.pool import Pool, ThreadPool
+from functools import partial
+
 from md_utils.path_utils import recursive_file_list
 from tqdm import tqdm
 
@@ -21,22 +24,21 @@ def labelme_file_to_yolo_file(labelme_file,
                               category_name_to_category_id,
                               yolo_file=None,
                               required_token=None,
-                              right_edge_quantization_threshold=None,
                               overwrite_behavior='overwrite'):
     """
     Convert the single .json file labelme_file to yolo format, writing the results to the text
     file yolo_file (defaults to s/json/txt).
 
-    If required_token is not None and the labelme_file does not contain the key [required_token],
-    no-ops.
+    If required_token is not None and the dict in labelme_file does not contain the key [required_token],
+    this function no-ops (i.e., does not generate a YOLO file).
 
-
-    boxes that really should be running off the right side of the image only extend like 99%
-    of the way there, due to what appears to be a slight bias inherent to MD. If a box extends
-    within [right_edge_quantization_threshold] (a small number, from 0 to 1, but probably around
-    0.02) of the right edge of the image, it will be extended to the far right edge.
+    overwrite_behavior should be 'skip' or 'overwrite' (default).
     """
 
+    result = {}
+    result['labelme_file'] = labelme_file
+    result['status'] = 'unknown'
+
     assert os.path.isfile(labelme_file), 'Could not find labelme .json file {}'.format(labelme_file)
     assert labelme_file.endswith('.json'), 'Illegal labelme .json file {}'.format(labelme_file)
 
@@ -45,7 +47,8 @@ def labelme_file_to_yolo_file(labelme_file,
 
     if os.path.isfile(yolo_file):
         if overwrite_behavior == 'skip':
-            return
+            result['status'] = 'skip-exists'
+            return result
         else:
             assert overwrite_behavior == 'overwrite', \
                 'Unrecognized overwrite behavior {}'.format(overwrite_behavior)
@@ -54,7 +57,8 @@ def labelme_file_to_yolo_file(labelme_file,
         labelme_data = json.load(f)
 
     if required_token is not None and required_token not in labelme_data:
-        return
+        result['status'] = 'skip-no-required-token'
+        return result
 
     im_height = labelme_data['imageHeight']
     im_width = labelme_data['imageWidth']
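The new result dict gives callers a machine-readable outcome for each file. A minimal sketch of consuming it, based only on the signature and statuses visible in the hunks above (the file path and category mapping here are hypothetical):

```python
from data_management.labelme_to_yolo import labelme_file_to_yolo_file

# Hypothetical example file; any labelme .json file works here
labelme_file = '/data/labels/example.json'

result = labelme_file_to_yolo_file(labelme_file,
                                   category_name_to_category_id={'animal': 0},
                                   yolo_file=None,
                                   required_token='saved_by_labelme',
                                   overwrite_behavior='skip')

# Statuses set in the diffed code: 'skip-exists' (the .txt file already
# exists and overwrite_behavior is 'skip'), 'skip-no-required-token'
# (the .json file lacks the required key), or 'converted'.
if result['status'].startswith('skip'):
    print('Skipped {}: {}'.format(result['labelme_file'], result['status']))
else:
    assert result['status'] == 'converted'
```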
@@ -83,10 +87,12 @@ def labelme_file_to_yolo_file(labelme_file,
 
         if (minx_abs >= (im_width-1)) or (maxx_abs <= 0) or \
            (miny_abs >= (im_height-1)) or (maxy_abs <= 0):
-            print('Skipping invalid shape in {}'.format(labelme_file))
+            print('Skipping invalid shape in {}'.format(labelme_file))
             continue
 
-        # Clip to [0,1]
+        # Clip to [0,1]... it's not obvious that the YOLO format doesn't allow bounding
+        # boxes to extend outside the image, but YOLOv5 and YOLOv8 get sad about boxes
+        # that extend outside the image.
         maxx_abs = min(maxx_abs,im_width-1)
         maxy_abs = min(maxy_abs,im_height-1)
         minx_abs = max(minx_abs,0.0)
@@ -97,11 +103,6 @@ def labelme_file_to_yolo_file(labelme_file,
         miny_rel = miny_abs / (im_height-1)
         maxy_rel = maxy_abs / (im_height-1)
 
-        if (right_edge_quantization_threshold is not None):
-            right_edge_distance = 1.0 - maxx_rel
-            if right_edge_distance < right_edge_quantization_threshold:
-                maxx_rel = 1.0
-
         assert maxx_rel >= minx_rel
         assert maxy_rel >= miny_rel
 
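For reference, the clipping and normalization in the two hunks above amount to the following arithmetic. This is a standalone sketch, not the library code; it assumes a labelme 'rectangle' shape given as two absolute-pixel corners:

```python
def labelme_rect_to_yolo_line(points, im_width, im_height, category_id):
    """
    Convert a labelme rectangle (two [x,y] corners, in absolute pixels)
    to a YOLO-format line.  Sketch only; mirrors the clipping and
    normalization shown in the hunks above.
    """
    xs = [p[0] for p in points]
    ys = [p[1] for p in points]
    minx_abs, maxx_abs = min(xs), max(xs)
    miny_abs, maxy_abs = min(ys), max(ys)

    # Clip to the image; YOLOv5/YOLOv8 reject boxes that extend outside it
    maxx_abs = min(maxx_abs, im_width - 1)
    maxy_abs = min(maxy_abs, im_height - 1)
    minx_abs = max(minx_abs, 0.0)
    miny_abs = max(miny_abs, 0.0)

    # Normalize to [0,1], dividing by (dimension - 1) as the library does
    minx_rel = minx_abs / (im_width - 1)
    maxx_rel = maxx_abs / (im_width - 1)
    miny_rel = miny_abs / (im_height - 1)
    maxy_rel = maxy_abs / (im_height - 1)

    # YOLO wants a normalized box center and size
    x_center = (minx_rel + maxx_rel) / 2.0
    y_center = (miny_rel + maxy_rel) / 2.0
    w = maxx_rel - minx_rel
    h = maxy_rel - miny_rel

    return '{} {} {} {} {}'.format(category_id, x_center, y_center, w, h)

print(labelme_rect_to_yolo_line([[10, 20], [110, 220]], 640, 480, 0))
```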
@@ -119,32 +120,45 @@ def labelme_file_to_yolo_file(labelme_file,
     with open(yolo_file,'w') as f:
         for s in yolo_lines:
             f.write(s + '\n')
-
+
+    result['status'] = 'converted'
+    return result
+
 
 def labelme_folder_to_yolo(labelme_folder,
                            category_name_to_category_id=None,
                            required_token=None,
-                           right_edge_quantization_threshold=None,
-                           overwrite_behavior='overwrite'):
+                           overwrite_behavior='overwrite',
+                           relative_filenames_to_convert=None,
+                           n_workers=1,
+                           use_threads=True):
     """
     Given a folder with images and labelme .json files, convert the .json files
     to YOLO .txt format. If category_name_to_category_id is None, first reads
     all the labels in the folder to build a zero-indexed name --> ID mapping.
 
     If required_token is not None and a labelme_file does not contain the key [required_token],
-    it won't be converted.
+    it won't be converted. Typically used to specify a field that indicates which files have
+    been reviewed.
 
-
-
-    of the way there, due to what appears to be a slight bias inherent to MD. If a box extends
-    within [right_edge_quantization_threshold] (a small number, from 0 to 1, but probably around
-    0.02) of the right edge of the image, it will be extended to the far right edge.
+    If relative_filenames_to_convert is not None, this should be a list of .json (not image)
+    files that should get converted, relative to the base folder.
 
-
+    overwrite_behavior should be 'skip' or 'overwrite' (default).
+
+    returns a dict with:
+        'category_name_to_category_id', whether it was passed in or constructed
+        'image_results': a list of results for each image (converted, skipped, error)
+
     """
 
-    labelme_files_relative = recursive_file_list(labelme_folder,return_relative_paths=True)
-    labelme_files_relative = [fn for fn in labelme_files_relative if fn.endswith('.json')]
+    if relative_filenames_to_convert is not None:
+        labelme_files_relative = relative_filenames_to_convert
+        assert all([fn.endswith('.json') for fn in labelme_files_relative]), \
+            'relative_filenames_to_convert contains non-json files'
+    else:
+        labelme_files_relative = recursive_file_list(labelme_folder,return_relative_paths=True)
+        labelme_files_relative = [fn for fn in labelme_files_relative if fn.endswith('.json')]
 
     if required_token is None:
         valid_labelme_files_relative = labelme_files_relative
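Because labelme_folder_to_yolo now returns a results dict rather than None, a caller can tally per-file outcomes. A sketch assuming the statuses set in labelme_file_to_yolo_file above (the folder path is hypothetical):

```python
from collections import Counter

from data_management.labelme_to_yolo import labelme_folder_to_yolo

# Hypothetical folder of images plus labelme .json files
results = labelme_folder_to_yolo('/data/labels',
                                 required_token='saved_by_labelme',
                                 n_workers=4,
                                 use_threads=True)

# The mapping is returned even when it was built from the folder contents
print(results['category_name_to_category_id'])

# Tally per-file outcomes ('converted', 'skip-exists', etc.)
status_counts = Counter(r['status'] for r in results['image_results'])
print(status_counts)
```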
@@ -163,9 +177,9 @@ def labelme_folder_to_yolo(labelme_folder,
 
             valid_labelme_files_relative.append(fn_relative)
 
-
-
-
+    print('{} of {} files are valid'.format(len(valid_labelme_files_relative),
+                                            len(labelme_files_relative)))
+
     del labelme_files_relative
 
     if category_name_to_category_id is None:
@@ -184,26 +198,54 @@ def labelme_folder_to_yolo(labelme_folder,
     # ...for each file
 
     # ...if we need to build a category mapping
-
-    for fn_relative in tqdm(valid_labelme_files_relative):
-
-        fn_abs = os.path.join(labelme_folder,fn_relative)
-        labelme_file_to_yolo_file(fn_abs,
-                                  category_name_to_category_id,
-                                  yolo_file=None,
-                                  required_token=required_token,
-                                  right_edge_quantization_threshold=\
-                                      right_edge_quantization_threshold,
-                                  overwrite_behavior=overwrite_behavior)
 
-
+    image_results = []
+
+    n_workers = min(n_workers,len(valid_labelme_files_relative))
+
+    if n_workers <= 1:
+        for fn_relative in tqdm(valid_labelme_files_relative):
+
+            fn_abs = os.path.join(labelme_folder,fn_relative)
+            image_result = labelme_file_to_yolo_file(fn_abs,
+                                                     category_name_to_category_id,
+                                                     yolo_file=None,
+                                                     required_token=required_token,
+                                                     overwrite_behavior=overwrite_behavior)
+            image_results.append(image_result)
+        # ...for each file
+    else:
+        if use_threads:
+            pool = ThreadPool(n_workers)
+        else:
+            pool = Pool(n_workers)
+
+        valid_labelme_files_abs = [os.path.join(labelme_folder,fn_relative) for \
+                                   fn_relative in valid_labelme_files_relative]
+
+        image_results = list(tqdm(pool.imap(
+            partial(labelme_file_to_yolo_file,
+                    category_name_to_category_id=category_name_to_category_id,
+                    yolo_file=None,
+                    required_token=required_token,
+                    overwrite_behavior=overwrite_behavior),
+            valid_labelme_files_abs),
+            total=len(valid_labelme_files_abs)))
+
+    assert len(valid_labelme_files_relative) == len(image_results)
 
     print('Converted {} labelme .json files to YOLO'.format(
         len(valid_labelme_files_relative)))
 
-
+    labelme_to_yolo_results = {}
+    labelme_to_yolo_results['category_name_to_category_id'] = category_name_to_category_id
+    labelme_to_yolo_results['image_results'] = image_results
 
-
+    return labelme_to_yolo_results
+
+# ...def labelme_folder_to_yolo(...)
+
+
 #%% Interactive driver
 
 if False:
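The parallel branch above uses a standard pattern: functools.partial freezes the keyword arguments, and pool.imap streams results lazily so tqdm can show progress. A self-contained sketch of the same pattern with a toy worker function (all names here are illustrative, not from the package):

```python
from functools import partial
from multiprocessing.pool import Pool, ThreadPool

from tqdm import tqdm

def process_file(fn, suffix=None):
    # Stand-in for labelme_file_to_yolo_file
    return {'file': fn, 'status': 'converted', 'suffix': suffix}

def process_all(filenames, n_workers=4, use_threads=True):
    n_workers = min(n_workers, len(filenames))
    if n_workers <= 1:
        return [process_file(fn, suffix='.txt') for fn in tqdm(filenames)]
    pool = ThreadPool(n_workers) if use_threads else Pool(n_workers)
    try:
        # imap yields results one at a time, so tqdm can track progress;
        # partial pins every argument except the filename
        return list(tqdm(pool.imap(partial(process_file, suffix='.txt'),
                                   filenames),
                         total=len(filenames)))
    finally:
        pool.close()
        pool.join()

if __name__ == '__main__':
    print(process_all(['a.json', 'b.json']))
```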
@@ -212,18 +254,19 @@ if False:
 
     #%%
 
-    import os
     labelme_file = os.path.expanduser('~/tmp/labels/x.json')
-    yolo_file = None
     required_token = 'saved_by_labelme'
-    right_edge_quantization_threshold = 0.015
     category_name_to_category_id = {'animal':0}
+    labelme_folder = os.path.expanduser('~/tmp/labels')
 
     #%%
 
-
-
-
+    category_name_to_category_id = \
+        labelme_folder_to_yolo(labelme_folder,
+                               category_name_to_category_id=category_name_to_category_id,
+                               required_token=required_token,
+                               overwrite_behavior='overwrite')
+
 #%% Command-line driver
 
 # TODO
data_management/lila/create_lila_blank_set.py

@@ -4,7 +4,11 @@
 #
 # Create a folder of blank images sampled from LILA. We'll aim for diversity, so less-common
 # locations will be oversampled relative to more common locations. We'll also run MegaDetector
-#
+# (with manual review) to remove some incorrectly-labeled, not-actually-empty images from our
+# blank set.
+#
+# We'll store location information for each image in a .json file, so we can split locations
+# into train/val in downstream tasks.
 #
 ########
 
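The expanded header comment describes the sampling strategy. As a rough illustration of how capping each location at a fixed budget oversamples rare locations, here is a sketch with hypothetical data (the real script builds location_to_blank_image_urls from the LILA index):

```python
import random

# Hypothetical mapping from location IDs to blank-image URLs
location_to_blank_image_urls = {
    'loc-a': ['u1', 'u2', 'u3', 'u4', 'u5', 'u6'],
    'loc-b': ['u7', 'u8'],
}

max_blanks_per_location = 3

sampled_urls = []
for location, urls in location_to_blank_image_urls.items():
    # Capping every location at the same budget means rare locations
    # contribute a larger fraction of their images than common ones
    n_samples = min(len(urls), max_blanks_per_location)
    sampled_urls.extend(random.sample(urls, n_samples))

print(sampled_urls)
```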
@@ -14,7 +18,6 @@ import os
 import random
 import math
 import json
-import shutil
 
 import numpy as np
 from tqdm import tqdm
@@ -22,8 +25,7 @@ from multiprocessing.pool import ThreadPool
 from urllib.parse import urlparse
 from collections import defaultdict
 
-from data_management.lila.lila_common import \
-    read_lila_all_images_file, azure_url_to_gcp_http_url
+from data_management.lila.lila_common import read_lila_all_images_file
 from md_utils.url_utils import download_url
 from md_visualization import visualization_utils as vis_utils
 from md_utils.path_utils import recursive_file_list
@@ -45,6 +47,14 @@ os.makedirs(confirmed_blanks_base,exist_ok=True)
 md_possible_non_blanks_folder = os.path.join(project_base,'candidate_non_blanks')
 os.makedirs(md_possible_non_blanks_folder,exist_ok=True)
 
+location_to_blank_image_urls_cache_file = os.path.join(project_base,
+                                                       'location_to_blank_image_urls.json')
+
+md_results_file = os.path.join(project_base,'lila_blanks_md_results.json')
+
+all_fn_relative_to_location_file = os.path.join(project_base,'all_fn_relative_to_location.json')
+confirmed_fn_relative_to_location_file = os.path.join(project_base,'confirmed_fn_relative_to_location.json')
+
 preferred_image_download_source = 'gcp'
 
 # Number of concurrent download threads
@@ -171,9 +181,6 @@ for s in original_labels_with_nan_common_names:
 
 #%% Map locations to blank images
 
-location_to_blank_image_urls_cache_file = os.path.join(project_base,
-                                                       'location_to_blank_image_urls.json')
-
 force_map_locations = False
 
 # Load from .json if available
@@ -275,7 +282,7 @@ print('Max samples per location: {}'.format(max_blanks_per_location))
 
 #%% Download those image files (prep)
 
-container_to_url_base = {
+container_to_url_base = {
     'lilablobssc.blob.core.windows.net':'/',
     'storage.googleapis.com':'/public-datasets-lila/'
 }
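container_to_url_base maps a URL's host to the path prefix that gets stripped when deriving an output-relative filename. A sketch of how such a mapping might be applied; the helper name here is hypothetical, and the real download_relative_filename signature is only partially visible in this diff:

```python
from urllib.parse import urlparse

container_to_url_base = {
    'lilablobssc.blob.core.windows.net': '/',
    'storage.googleapis.com': '/public-datasets-lila/'
}

def url_to_relative_path(url):
    # Hypothetical helper: strip the per-host prefix from the URL path
    p = urlparse(url)
    url_base = container_to_url_base[p.netloc]
    assert p.path.startswith(url_base)
    return p.path[len(url_base):]

print(url_to_relative_path(
    'https://storage.googleapis.com/public-datasets-lila/some-dataset/image.jpg'))
# -> 'some-dataset/image.jpg'
```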
@@ -318,6 +325,21 @@ def download_relative_filename(url, output_base, verbose=False, url_base=None, o
     result['status'] = 'success'
     return result
 
+def azure_url_to_gcp_http_url(url,error_if_not_azure_url=True):
+    """
+    Most URLs point to Azure by default, but most files are available on both Azure and GCP.
+    This function converts an Azure URL to the corresponding GCP http:// url.
+    """
+
+    lila_azure_storage_account = 'https://lilablobssc.blob.core.windows.net'
+    gcp_bucket_api_url = 'https://storage.googleapis.com/public-datasets-lila'
+    error_if_not_azure_url = False
+
+    if error_if_not_azure_url:
+        assert url.startswith(lila_azure_storage_account)
+    gcp_url = url.replace(lila_azure_storage_account,gcp_bucket_api_url,1)
+    return gcp_url
+
 # Convert Azure URLs to GCP URLs if necessary
 if preferred_image_download_source != 'azure':
     assert preferred_image_download_source == 'gcp'
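The inlined azure_url_to_gcp_http_url is a prefix substitution (note that, as committed, it sets error_if_not_azure_url to False internally, which disables the assert). Its effect on the example URL that appears later in this script:

```python
azure_url = ('https://lilablobssc.blob.core.windows.net/'
             'caltech-unzipped/cct_images/5968c0f9-23d2-11e8-a6a3-ec086b02610b.jpg')

lila_azure_storage_account = 'https://lilablobssc.blob.core.windows.net'
gcp_bucket_api_url = 'https://storage.googleapis.com/public-datasets-lila'

# Replace only the first occurrence of the Azure prefix
gcp_url = azure_url.replace(lila_azure_storage_account, gcp_bucket_api_url, 1)
print(gcp_url)
# https://storage.googleapis.com/public-datasets-lila/caltech-unzipped/cct_images/5968c0f9-23d2-11e8-a6a3-ec086b02610b.jpg
```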
@@ -358,8 +380,6 @@ print('Errors on {} of {} downloads'.format(len(error_urls),len(results)))
 
 #%% Run MegaDetector on the folder
 
-md_results_file = os.path.join(project_base,'lila_blanks_md_results.json')
-
 cmd = 'python run_detector_batch.py MDV5A "{}" "{}"'.format(
     candidate_blanks_base,md_results_file)
 cmd += ' --recursive --output_relative_filenames'
@@ -419,6 +439,7 @@ for i_fn,source_file_relative in tqdm(enumerate(images_to_review_to_detections),
                                       confidence_threshold=min_threshold,
                                       target_size=(1280,-1))
 
+# This is a temporary file I just used during debugging
 with open(os.path.join(project_base,'output_file_to_source_file.json'),'w') as f:
     json.dump(output_file_to_source_file,f,indent=1)
 
@@ -442,33 +463,95 @@ for output_file in tqdm(output_file_to_source_file.keys()):
         source_file_relative = output_file_to_source_file[output_file]
         removed_blank_images_relative.append(source_file_relative)
 
+removed_blank_images_relative_set = set(removed_blank_images_relative)
 assert len(removed_blank_images_relative) + len(remaining_images) == len(output_file_to_source_file)
 
 
-#%% Copy
+#%% Copy only the confirmed blanks to the confirmed folder
+
+from md_utils.path_utils import is_image_file
 
 all_candidate_blanks = recursive_file_list(candidate_blanks_base,return_relative_paths=True)
 print('Found {} candidate blanks'.format(len(all_candidate_blanks)))
 
+skipped_images_relative = []
+skipped_non_images = []
+
 for source_fn_relative in tqdm(all_candidate_blanks):
+
+    # Skip anything we removed from the "candidate non-blanks" folder; these weren't really
+    # blank.
+    if source_fn_relative in removed_blank_images_relative_set:
+        skipped_images_relative.append(source_fn_relative)
+        continue
+
+    if not is_image_file(source_fn_relative):
+        # Not a typo; "skipped images" really means "skipped files"
+        skipped_images_relative.append(source_fn_relative)
+        skipped_non_images.append(source_fn_relative)
+
+
     source_fn_abs = os.path.join(candidate_blanks_base,source_fn_relative)
     assert os.path.isfile(source_fn_abs)
     target_fn_abs = os.path.join(confirmed_blanks_base,source_fn_relative)
     os.makedirs(os.path.dirname(target_fn_abs),exist_ok=True)
-    shutil.copyfile(source_fn_abs,target_fn_abs)
+    # shutil.copyfile(source_fn_abs,target_fn_abs)
 
+print('Skipped {} files ({} non-image files)'.format(len(skipped_images_relative),
+                                                     len(skipped_non_images)))
 
-#%% Record location information for each file
 
-
-
-
-
-
-
-
-all_confirmed_blanks = recursive_file_list(confirmed_blanks_base,return_relative_paths=True)
+#%% Validate the folder of confirmed blanks
+
+from md_utils.path_utils import find_images
+# all_confirmed_blanks = recursive_file_list(confirmed_blanks_base,return_relative_paths=True)
+all_confirmed_blanks = find_images(confirmed_blanks_base,return_relative_paths=True,recursive=True)
+assert len(all_confirmed_blanks) < len(all_candidate_blanks)
 print('Found {} confirmed blanks'.format(len(all_confirmed_blanks)))
 
-
-
+
+#%% Manually review a few of the images we skipped
+
+# ...to make sure they're non-blank
+i_image = random.randint(0, len(skipped_images_relative))
+fn_relative = skipped_images_relative[i_image]
+fn_abs = os.path.join(candidate_blanks_base,fn_relative)
+assert os.path.isfile(fn_abs)
+import clipboard
+clipboard.copy('feh --scale-down "{}"'.format(fn_abs))
+
+
+#%% Record location information for each confirmed file
+
+# Map every URL's path to the corresponding location
+#
+# This is *all empty URLs*, not just the ones we downloaded
+all_fn_relative_to_location = {}
+
+# location = next(iter(location_to_blank_image_urls.keys()))
+for location in tqdm(location_to_blank_image_urls):
+    urls_this_location = location_to_blank_image_urls[location]
+
+    # url = urls_this_location[0]
+    for url in urls_this_location:
+        # Turn:
+        #
+        # https://lilablobssc.blob.core.windows.net/caltech-unzipped/cct_images/5968c0f9-23d2-11e8-a6a3-ec086b02610b.jpg'
+        #
+        # ...into:
+        #
+        # caltech-unzipped/cct_images/5968c0f9-23d2-11e8-a6a3-ec086b02610b.jpg'
+        p = urlparse(url)
+        fn_relative = str(p.path)[1:]
+        all_fn_relative_to_location[fn_relative] = location
+
+# Build a much smaller mapping of just the confirmed blanks
+confirmed_fn_relative_to_location = {}
+for i_fn,fn_relative in tqdm(enumerate(all_confirmed_blanks),total=len(all_confirmed_blanks)):
+    confirmed_fn_relative_to_location[fn_relative] = all_fn_relative_to_location[fn_relative]
+
+with open(all_fn_relative_to_location_file,'w') as f:
+    json.dump(all_fn_relative_to_location,f,indent=1)
+
+with open(confirmed_fn_relative_to_location_file,'w') as f:
+    json.dump(confirmed_fn_relative_to_location,f,indent=1)