megadetector 5.0.7__py3-none-any.whl → 5.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- api/batch_processing/data_preparation/manage_local_batch.py +28 -14
- api/batch_processing/postprocessing/combine_api_outputs.py +2 -2
- api/batch_processing/postprocessing/compare_batch_results.py +1 -1
- api/batch_processing/postprocessing/convert_output_format.py +24 -6
- api/batch_processing/postprocessing/load_api_results.py +1 -3
- api/batch_processing/postprocessing/md_to_labelme.py +118 -51
- api/batch_processing/postprocessing/merge_detections.py +30 -5
- api/batch_processing/postprocessing/postprocess_batch_results.py +24 -12
- api/batch_processing/postprocessing/remap_detection_categories.py +163 -0
- api/batch_processing/postprocessing/render_detection_confusion_matrix.py +15 -12
- api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +2 -2
- data_management/cct_json_utils.py +7 -2
- data_management/coco_to_labelme.py +263 -0
- data_management/coco_to_yolo.py +7 -4
- data_management/databases/integrity_check_json_db.py +68 -59
- data_management/databases/subset_json_db.py +1 -1
- data_management/get_image_sizes.py +44 -26
- data_management/importers/animl_results_to_md_results.py +1 -3
- data_management/importers/noaa_seals_2019.py +1 -1
- data_management/labelme_to_coco.py +252 -143
- data_management/labelme_to_yolo.py +95 -52
- data_management/lila/create_lila_blank_set.py +106 -23
- data_management/lila/download_lila_subset.py +133 -65
- data_management/lila/generate_lila_per_image_labels.py +1 -1
- data_management/lila/lila_common.py +8 -38
- data_management/read_exif.py +65 -16
- data_management/remap_coco_categories.py +84 -0
- data_management/resize_coco_dataset.py +3 -22
- data_management/wi_download_csv_to_coco.py +239 -0
- data_management/yolo_to_coco.py +283 -83
- detection/run_detector_batch.py +12 -3
- detection/run_inference_with_yolov5_val.py +10 -3
- detection/run_tiled_inference.py +2 -2
- detection/tf_detector.py +2 -1
- detection/video_utils.py +1 -1
- md_utils/ct_utils.py +22 -3
- md_utils/md_tests.py +11 -2
- md_utils/path_utils.py +206 -32
- md_utils/url_utils.py +66 -1
- md_utils/write_html_image_list.py +12 -3
- md_visualization/visualization_utils.py +363 -72
- md_visualization/visualize_db.py +33 -10
- {megadetector-5.0.7.dist-info → megadetector-5.0.8.dist-info}/METADATA +10 -12
- {megadetector-5.0.7.dist-info → megadetector-5.0.8.dist-info}/RECORD +47 -44
- {megadetector-5.0.7.dist-info → megadetector-5.0.8.dist-info}/WHEEL +1 -1
- md_visualization/visualize_megadb.py +0 -183
- {megadetector-5.0.7.dist-info → megadetector-5.0.8.dist-info}/LICENSE +0 -0
- {megadetector-5.0.7.dist-info → megadetector-5.0.8.dist-info}/top_level.txt +0 -0
data_management/lila/download_lila_subset.py CHANGED

@@ -9,7 +9,7 @@
 # what you want to query for, etc., is very application-specific; this is just meant as a
 # demo.
 #
-# Can download from
+# Can download from GCP (all datasets), AWS (all datasets), or Azure (most datasets).
 #
 ########

@@ -20,15 +20,16 @@ import random

 from tqdm import tqdm
 from multiprocessing.pool import ThreadPool
-from urllib.parse import urlparse
 from collections import defaultdict

-from data_management.lila.lila_common import
-    read_lila_all_images_file, is_empty, azure_url_to_gcp_http_url
+from data_management.lila.lila_common import read_lila_all_images_file, is_empty, lila_base_urls
 from md_utils.url_utils import download_url

+for s in lila_base_urls.values():
+    assert s.endswith('/')
+
 # If any of these strings appear in the common name of a species, we'll download that image
-species_of_interest = ['grey fox','
+species_of_interest = ['grey fox','gray fox','cape fox','red fox','kit fox']

 # We'll write images, metadata downloads, and temporary files here
 lila_local_base = os.path.expanduser('~/lila')
@@ -42,17 +43,61 @@ os.makedirs(output_dir,exist_ok=True)
 # Number of concurrent download threads
 n_download_threads = 20

+verbose = False
+
 max_images_per_dataset = 10 # None

 # This impacts the data download, but not the metadata download
 #
-# "Azure" really means "Azure if available";
-# on
-
+# Setting this to "Azure" really means "Azure if available"; some datasets are
+# not available on Azure.
+preferred_provider = 'gcp' # 'azure', 'gcp', 'aws'

 random.seed(0)


+#%% Support functions
+
+def download_relative_url(relative_url, output_base, provider='gcp',
+                          verbose=False, overwrite=False):
+    """
+    Download a URL to output_base, preserving the path relative to the common LILA root.
+    """
+
+    assert not relative_url.startswith('/')
+
+    # Not all datasets are available on Azure, fall back in these cases. The decision
+    # to fall back to GCP rather than AWS is arbitrary.
+    if provider == 'azure':
+        nominal_provider = relative_url_to_nominal_provider[relative_url]
+        if nominal_provider != 'azure':
+            if verbose:
+                print('URL {} not available on Azure, falling back to GCP'.format(
+                    relative_url))
+            provider = 'gcp'
+
+    url = lila_base_urls[provider] + relative_url
+
+    result = {'status':'unknown','url':url,'destination_filename':None}
+
+    destination_filename = os.path.join(output_base,relative_url)
+    result['destination_filename'] = destination_filename
+
+    if ((os.path.isfile(destination_filename)) and (not overwrite)):
+        result['status'] = 'skipped'
+        return result
+    try:
+        download_url(url, destination_filename, verbose=verbose, force_download=overwrite)
+    except Exception as e:
+        print('Warning: error downloading URL {}: {}'.format(
+            url,str(e)))
+        result['status'] = 'error: {}'.format(str(e))
+        return result
+
+    result['status'] = 'success'
+    return result
+
+
 #%% Download and open the giant table of image URLs and labels

 # ~60 seconds to download, unzip, and open
@@ -63,6 +108,8 @@ df = read_lila_all_images_file(metadata_dir)

 # ~2 minutes

+common_name_to_count = defaultdict(int)
+
 ds_name_to_urls = defaultdict(list)

 def find_items(row):
@@ -75,6 +122,7 @@ def find_items(row):
     for species_name in species_of_interest:
         if species_name in row['common_name']:
             match = True
+            common_name_to_count[species_name] += 1
             break

     if match:
@@ -83,15 +131,19 @@ def find_items(row):
 tqdm.pandas()
 _ = df.progress_apply(find_items,axis=1)

+# We have a list of URLs for each dataset, flatten them all into a list of URLs
 all_urls = list(ds_name_to_urls.values())
 all_urls = [item for sublist in all_urls for item in sublist]
 print('Found {} matching URLs across {} datasets'.format(len(all_urls),len(ds_name_to_urls)))

+for common_name in common_name_to_count:
+    print('{}: {}'.format(common_name,common_name_to_count[common_name]))
+
 from copy import deepcopy
 ds_name_to_urls_raw = deepcopy(ds_name_to_urls)


-#%%
+#%% Optionally trim to a fixed number of URLs per dataset

 if max_images_per_dataset is None:
     pass
@@ -102,74 +154,90 @@ else:
         ds_name_to_urls[ds_name] = random.sample(ds_name_to_urls[ds_name],max_images_per_dataset)


-#%%
+#%% Convert URLs to be relative to the common LILA base

-
-
-    'storage.googleapis.com':'/public-datasets-lila/'
-}
+all_urls = list(ds_name_to_urls.values())
+all_urls = [item for sublist in all_urls for item in sublist]

-
-"""
-Download a URL to output_base, preserving relative path
-"""
-
-result = {'status':'unknown','url':url,'destination_filename':None}
-
-if url_base is None:
-    assert url.startswith('https://')
-    container = url.split('/')[2]
-    assert container in container_to_url_base
-    url_base = container_to_url_base[container]
-
-assert url_base.startswith('/') and url_base.endswith('/')
-
-p = urlparse(url)
-relative_filename = str(p.path)
-# remove the leading '/'
-assert relative_filename.startswith(url_base)
-relative_filename = relative_filename.replace(url_base,'',1)
-
-destination_filename = os.path.join(output_base,relative_filename)
-result['destination_filename'] = destination_filename
-
-if ((os.path.isfile(destination_filename)) and (not overwrite)):
-    result['status'] = 'skipped'
-    return result
-try:
-    download_url(url, destination_filename, verbose=verbose)
-except Exception as e:
-    print('Warning: error downloading URL {}: {}'.format(
-        url,str(e)))
-    result['status'] = 'error: {}'.format(str(e))
-    return result
-
-result['status'] = 'success'
-return result
+all_urls_relative = []

+# Each file has a nominal URL in the .csv file. For now, the only thing this tells is
+# is that if the nominal URL isn't an Azure URL, the file isn't on Azure. All files are on
+# GCP and AWS.
+#
+# Keep track of the nominal provider for each URL.
+relative_url_to_nominal_provider = {}
+
+for url in all_urls:
+    found_base = False
+    for provider in lila_base_urls.keys():
+        base = lila_base_urls[provider]
+        if url.startswith(base):
+            relative_url = url.replace(base,'')
+            all_urls_relative.append(relative_url)
+            relative_url_to_nominal_provider[relative_url] = provider
+            found_base = True
+            break
+    assert found_base
+
+assert len(all_urls) == len(all_urls_relative)

-# ds_name_to_urls maps dataset names to lists of URLs; flatten to a single list of URLs
-all_urls = list(ds_name_to_urls.values())
-all_urls = [item for sublist in all_urls for item in sublist]

-
-if image_download_source != 'azure':
-    assert image_download_source == 'gcp'
-    all_urls = [azure_url_to_gcp_http_url(url) for url in all_urls]
+#%% Download image files

-print('Downloading {} images on {} workers'.format(
+print('Downloading {} images on {} workers, preferred provider is {}'.format(
+    len(all_urls),n_download_threads,preferred_provider))

 if n_download_threads <= 1:

     results = []

-    #
-    for
-
+    # url_relative = all_urls_relative[0]
+    for url_relative in tqdm(all_urls_relative):
+        result = download_relative_url(url_relative,
+                                       output_base=output_dir,
+                                       provider=preferred_provider,
+                                       verbose=verbose)
+        results.append(result)

 else:

     pool = ThreadPool(n_download_threads)
-    results = list(tqdm(pool.imap(lambda s:
-        s,output_dir,
-
+    results = list(tqdm(pool.imap(lambda s: download_relative_url(
+        s,output_base=output_dir,provider=preferred_provider,verbose=verbose),
+        all_urls_relative), total=len(all_urls_relative)))
+
+
+#%% Scrap
+
+if False:
+
+    pass
+
+    #%% Find all the reptiles on LILA
+
+    reptile_rows = df.loc[df['class'] == 'reptilia']
+
+    # i_row = 0; row = reptile_rows.iloc[i_row]
+
+    common_name_to_count = defaultdict(int)
+    dataset_to_count = defaultdict(int)
+    for i_row,row in reptile_rows.iterrows():
+        common_name_to_count[row['common_name']] += 1
+        dataset_to_count[row['dataset_name']] += 1
+
+    from md_utils.ct_utils import sort_dictionary_by_value
+
+    print('Found {} reptiles\n'.format(len(reptile_rows)))
+
+    common_name_to_count = sort_dictionary_by_value(common_name_to_count,reverse=True)
+    dataset_to_count = sort_dictionary_by_value(dataset_to_count,reverse=True)
+
+    print('Common names by count:\n')
+    for k in common_name_to_count:
+        print('{} ({})'.format(k,common_name_to_count[k]))
+
+    print('\nDatasets by count:\n')
+    for k in dataset_to_count:
+        print('{} ({})'.format(k,dataset_to_count[k]))
+
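The hunks above replace the old Azure-specific download path with a provider-agnostic scheme: each full URL is reduced to a path relative to the common LILA root, then re-expanded against whichever provider is preferred. The following standalone sketch illustrates that round trip; it is not part of the package, the helper name to_relative_url and the example image path are hypothetical, and the base URLs are copied from the lila_common.py hunk below.

    # Standalone sketch of the relative-URL round trip (hypothetical helper and example path)
    lila_base_urls = {
        'azure':'https://lilablobssc.blob.core.windows.net/',
        'gcp':'https://storage.googleapis.com/public-datasets-lila/',
        'aws':'http://us-west-2.opendata.source.coop.s3.amazonaws.com/agentmorris/lila-wildlife/'
    }

    def to_relative_url(url):
        # Strip whichever LILA base URL this URL starts with; return (relative_url, provider)
        for provider, base in lila_base_urls.items():
            if url.startswith(base):
                return url.replace(base, '', 1), provider
        raise ValueError('Unrecognized LILA URL: {}'.format(url))

    # Hypothetical image path, for illustration only
    nominal_url = lila_base_urls['azure'] + 'some-dataset/images/0001.jpg'
    relative_url, nominal_provider = to_relative_url(nominal_url)

    # Re-expand against the preferred provider (GCP here)
    print(lila_base_urls['gcp'] + relative_url)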
data_management/lila/lila_common.py CHANGED

@@ -31,9 +31,13 @@ wildlife_insights_taxonomy_local_json_filename = 'wi_taxonomy.json'
 wildlife_insights_taxonomy_local_csv_filename = \
     wildlife_insights_taxonomy_local_json_filename.replace('.json','.csv')

-
-
-
+# Filenames are consistent across clouds relative to these URLs
+lila_base_urls = {
+    'azure':'https://lilablobssc.blob.core.windows.net/',
+    'gcp':'https://storage.googleapis.com/public-datasets-lila/',
+    'aws':'http://us-west-2.opendata.source.coop.s3.amazonaws.com/agentmorris/lila-wildlife/'
+}
+


 #%% Common functions
@@ -198,28 +202,6 @@ def read_metadata_file_for_dataset(ds_name,metadata_dir,metadata_table=None,json
     return json_filename


-def azure_url_to_gcp_http_url(url,error_if_not_azure_url=True):
-    """
-    Most URLs point to Azure by default, but most files are available on both Azure and GCP.
-    This function converts an Azure URL to the corresponding GCP http:// url.
-    """
-
-    if error_if_not_azure_url:
-        assert url.startswith(lila_azure_storage_account)
-    gcp_url = url.replace(lila_azure_storage_account,gcp_bucket_api_url,1)
-    return gcp_url
-
-
-def azure_url_to_gcp_gs_url(url,error_if_not_azure_url=True):
-    """
-    Most URLs point to Azure by default, but most files are available on both Azure and GCP.
-    This function converts an Azure URL to the corresponding GCP gs:// url.
-    """
-
-    return azure_url_to_gcp_http_url(url,error_if_not_azure_url).\
-        replace(gcp_bucket_api_url,gcp_bucket_gs_url,1)
-
-
 #%% Interactive test driver

 if False:
@@ -252,16 +234,4 @@ if False:
         urls_to_test.append(ds_info['bbox_url'])

     status_codes = url_utils.test_urls(urls_to_test)
-
-
-    #%% Verify that the GCP versions of all metadata files exist
-
-    gcp_urls = []
-
-    # url = urls_to_test[0]
-    for url in urls_to_test:
-        assert url.startswith(lila_azure_storage_account)
-        gcp_url = url.replace(lila_azure_storage_account,gcp_bucket_api_url,1)
-        gcp_urls.append(gcp_url)
-
-    status_codes = url_utils.test_urls(gcp_urls)
+
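Because azure_url_to_gcp_http_url() and azure_url_to_gcp_gs_url() are removed above, code that converted Azure URLs to GCP URLs can use the new lila_base_urls dict instead. A minimal replacement sketch, not part of the package, assuming only the base URL values shown in the hunk above:

    # Standalone sketch: equivalent of the removed azure_url_to_gcp_http_url(),
    # written against the new lila_base_urls dict
    lila_base_urls = {
        'azure':'https://lilablobssc.blob.core.windows.net/',
        'gcp':'https://storage.googleapis.com/public-datasets-lila/'
    }

    def azure_url_to_gcp_http_url(url):
        # Swap the Azure base URL for the GCP base URL, leaving the relative path unchanged
        assert url.startswith(lila_base_urls['azure'])
        return url.replace(lila_base_urls['azure'], lila_base_urls['gcp'], 1)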
data_management/read_exif.py CHANGED

@@ -48,9 +48,18 @@ class ReadExifOptions:
     #
     # Not relevant if n_workers is 1.
     use_threads = True
-
+
+    # "File" and "ExifTool" are tag types used by ExifTool to report data that
+    # doesn't come from EXIF, rather from the file (e.g. file size).
     tag_types_to_ignore = set(['File','ExifTool'])

+    # Include/exclude specific tags (mutually incompatible)
+    tags_to_include = None
+    tags_to_exclude = None
+
+    # A useful set of tags one might want to limit queries for
+    # options.tags_to_include = ['DateTime','Model','Make','ExifImageWidth','ExifImageHeight','DateTime','DateTimeOriginal','Orientation']
+
     exiftool_command_name = 'exiftool'

     # How should we handle byte-formatted EXIF tags?
@@ -62,16 +71,17 @@ class ReadExifOptions:

     # Should we use exiftool or pil?
     processing_library = 'pil' # 'exiftool','pil'
-
+
+

 #%% Functions

-def enumerate_files(input_folder):
+def enumerate_files(input_folder,recursive=True):
     """
     Enumerates all image files in input_folder, returning relative paths
     """

-    image_files = find_images(input_folder,recursive=
+    image_files = find_images(input_folder,recursive=recursive)
     image_files = [os.path.relpath(s,input_folder) for s in image_files]
     image_files = [s.replace('\\','/') for s in image_files]
     print('Enumerated {} files'.format(len(image_files)))
@@ -99,7 +109,7 @@ def get_exif_ifd(exif):
 def read_pil_exif(im,options=None):
     """
     Read all the EXIF data we know how to read from [im] (path or PIL Image), whether it's
-    in the PIL default EXIF data or not.
+    in the PIL default EXIF data or not. Returns a dict.
     """

     if options is None:
@@ -192,6 +202,32 @@ def parse_exif_datetime_string(s,verbose=False):
     return dt


+def _filter_tags(tags,options):
+    """
+    Internal function used to include/exclude specific tags from the exif_tags
+    dict.
+    """
+
+    if options is None:
+        return tags
+    if options.tags_to_include is None and options.tags_to_exclude is None:
+        return tags
+    if options.tags_to_include is not None:
+        assert options.tags_to_exclude is None, "tags_to_include and tags_to_exclude are incompatible"
+        tags_to_return = {}
+        for tag_name in tags.keys():
+            if tag_name in options.tags_to_include:
+                tags_to_return[tag_name] = tags[tag_name]
+        return tags_to_return
+    if options.tags_to_exclude is not None:
+        assert options.tags_to_include is None, "tags_to_include and tags_to_exclude are incompatible"
+        tags_to_return = {}
+        for tag_name in tags.keys():
+            if tag_name not in options.tags_to_exclude:
+                tags_to_return[tag_name] = tags[tag_name]
+        return tags_to_return
+
+
 def read_exif_tags_for_image(file_path,options=None):
     """
     Get relevant fields from EXIF data for an image
@@ -227,8 +263,8 @@ def read_exif_tags_for_image(file_path,options=None):
             result['status'] = 'empty_read'
         else:
             result['status'] = 'success'
-            result['tags'] = exif_tags
-
+            result['tags'] = _filter_tags(exif_tags,options)
+
         return result

     elif options.processing_library == 'exiftool':
@@ -283,9 +319,12 @@ def read_exif_tags_for_image(file_path,options=None):
                 print('Ignoring tag with type {}'.format(field_type))
                 continue

-
-
-
+            field_name = field_name_type_tokens[1].strip()
+            if options.tags_to_exclude is not None and field_name in options.tags_to_exclude:
+                continue
+            if options.tags_to_include is not None and field_name not in options.tags_to_include:
+                continue
+            tag = [field_type,field_name,field_value]

             exif_tags.append(tag)

@@ -350,20 +389,22 @@ def populate_exif_data(im, image_base, options=None):
 # ...populate_exif_data()


-def create_image_objects(image_files):
+def create_image_objects(image_files,recursive=True):
     """
     Create empty image objects for every image in [image_files], which can be a
     list of relative paths (which will get stored without processing, so the base
     path doesn't matter here), or a folder name.

     Returns a list of dicts with field 'file_name' (a relative path).
+
+    "recursive" is ignored if "image_files" is a list.
     """

     # Enumerate *relative* paths
     if isinstance(image_files,str):
         print('Enumerating image files in {}'.format(image_files))
         assert os.path.isdir(image_files), 'Invalid image folder {}'.format(image_files)
-        image_files = enumerate_files(image_files)
+        image_files = enumerate_files(image_files,recursive=recursive)

     images = []
     for fn in image_files:
@@ -499,7 +540,7 @@ def is_executable(name):
     return which(name) is not None


-def read_exif_from_folder(input_folder,output_file=None,options=None,filenames=None):
+def read_exif_from_folder(input_folder,output_file=None,options=None,filenames=None,recursive=True):
     """
     Read EXIF data for all images in input_folder.

@@ -516,6 +557,12 @@ def read_exif_from_folder(input_folder,output_file=None,options=None,filenames=N
     if options is None:
         options = ReadExifOptions()

+    # Validate options
+    if options.tags_to_include is not None:
+        assert options.tags_to_exclude is None, "tags_to_include and tags_to_exclude are incompatible"
+    if options.tags_to_exclude is not None:
+        assert options.tags_to_include is None, "tags_to_include and tags_to_exclude are incompatible"
+
     if input_folder is None:
         input_folder = ''
     if len(input_folder) > 0:
@@ -542,7 +589,7 @@ def read_exif_from_folder(input_folder,output_file=None,options=None,filenames=N
         assert is_executable(options.exiftool_command_name), 'exiftool not available'

     if filenames is None:
-        images = create_image_objects(input_folder)
+        images = create_image_objects(input_folder,recursive=recursive)
     else:
         assert isinstance(filenames,list)
         images = create_image_objects(filenames)
@@ -567,14 +614,16 @@ if False:

     #%%

-    input_folder =
-    output_file =
+    input_folder = r'C:\temp\md-name-testing'
+    output_file = None # r'C:\temp\md-name-testing\exif.json'
     options = ReadExifOptions()
     options.verbose = False
     options.n_workers = 10
     options.use_threads = False
     options.processing_library = 'pil'
     # options.processing_library = 'exiftool'
+    options.tags_to_include = ['DateTime','Model','Make','ExifImageWidth','ExifImageHeight','DateTime','DateTimeOriginal','Orientation']
+    # options.tags_to_exclude = ['MakerNote']

     results = read_exif_from_folder(input_folder,output_file,options)

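To illustrate the new tag-filtering and recursion options added to read_exif.py above, here is a minimal usage sketch. The folder path is hypothetical; the option names, the tag list, and the read_exif_from_folder() signature come from the diff above.

    from data_management.read_exif import ReadExifOptions, read_exif_from_folder

    options = ReadExifOptions()
    options.processing_library = 'pil'
    # Keep only a small set of commonly useful tags (list mirrors the comment in the diff)
    options.tags_to_include = ['DateTime','Model','Make','ExifImageWidth','ExifImageHeight',
                               'DateTimeOriginal','Orientation']

    # Hypothetical folder; recursive=True matches the new default
    results = read_exif_from_folder('/path/to/images', output_file=None,
                                    options=options, recursive=True)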
data_management/remap_coco_categories.py ADDED

@@ -0,0 +1,84 @@
+########
+#
+# remap_coco_categories.py
+#
+# Given a COCO-formatted dataset, remap the categories to a new mapping.
+#
+########
+
+#%% Imports and constants
+
+import os
+import json
+
+from copy import deepcopy
+
+
+#%% Main function
+
+def remap_coco_categories(input_data,
+                          output_category_name_to_id,
+                          input_category_name_to_output_category_name,
+                          output_file=None):
+    """
+    Given a COCO-formatted dataset, remap the categories to a new categories mapping, optionally
+    writing the results to a new file.
+
+    output_category_name_to_id is a dict mapping strings to ints.
+
+    input_category_name_to_output_category_name is a dict mapping strings to strings.
+
+    [input_data] can be a COCO-formatted dict or a filename. If it's a dict, it will be copied,
+    not modified in place.
+    """
+
+    if isinstance(input_data,str):
+        assert os.path.isfile(input_data), "Can't find file {}".format(input_data)
+        with open(input_data,'r') as f:
+            input_data = json.load(f)
+        assert isinstance(input_data,dict), 'Illegal COCO input data'
+    else:
+        assert isinstance(input_data,dict), 'Illegal COCO input data'
+        input_data = deepcopy(input_data)
+
+    # It's safe to modify in-place now
+    output_data = input_data
+
+    # Read input name --> ID mapping
+    input_category_name_to_input_category_id = {}
+    for c in input_data['categories']:
+        input_category_name_to_input_category_id[c['name']] = c['id']
+
+    # Map input IDs --> output IDs
+    input_category_id_to_output_category_id = {}
+    for input_name in input_category_name_to_output_category_name.keys():
+        output_name = input_category_name_to_output_category_name[input_name]
+        assert output_name in output_category_name_to_id, \
+            'No output ID for {} --> {}'.format(input_name,output_name)
+        input_id = input_category_name_to_input_category_id[input_name]
+        output_id = output_category_name_to_id[output_name]
+        input_category_id_to_output_category_id[input_id] = output_id
+
+    # Map annotations
+    for ann in output_data['annotations']:
+        assert ann['category_id'] in input_category_id_to_output_category_id, \
+            'Unrecognized category ID {}'.format(ann['category_id'])
+        ann['category_id'] = input_category_id_to_output_category_id[ann['category_id']]
+
+    # Update the category list
+    output_categories = []
+    for output_name in output_category_name_to_id:
+        category = {'name':output_name,'id':output_category_name_to_id[output_name]}
+        output_categories.append(category)
+    output_data['categories'] = output_categories
+
+    if output_file is not None:
+        with open(output_file,'w') as f:
+            json.dump(output_data,f,indent=1)
+
+    return input_data
+
+
+#%% Command-line driver
+
+# TODO
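A minimal usage sketch for the new remap_coco_categories() module shown above. The category names and file paths are hypothetical; the function signature is taken from the diff. Note that every input category referenced by an annotation must appear in the name mapping, since the function asserts on unrecognized category IDs.

    from data_management.remap_coco_categories import remap_coco_categories

    # Collapse several input categories into two output categories (hypothetical names/paths)
    output_category_name_to_id = {'animal':0, 'person':1}
    input_category_name_to_output_category_name = {
        'deer':'animal',
        'elk':'animal',
        'hiker':'person'
    }

    remapped_data = remap_coco_categories('/path/to/input_coco.json',
                                          output_category_name_to_id,
                                          input_category_name_to_output_category_name,
                                          output_file='/path/to/output_coco.json')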
data_management/resize_coco_dataset.py CHANGED

@@ -26,8 +26,7 @@ from md_visualization.visualization_utils import \
 def resize_coco_dataset(input_folder,input_filename,
                         output_folder,output_filename,
                         target_size=(-1,-1),
-                        correct_size_image_handling='copy'
-                        right_edge_quantization_threshold=None):
+                        correct_size_image_handling='copy'):
     """
     Given a COCO-formatted dataset (images in input_folder, data in input_filename), resize
     all the images to a target size (in output_folder) and scale bounding boxes accordingly
@@ -36,7 +35,7 @@ def resize_coco_dataset(input_folder,input_filename,
     target_size should be a tuple/list of ints, length 2. If either dimension is -1, aspect ratio
     will be preserved. If both dimensions are -1, this means "keep the original size". If
     both dimensions are -1 and correct_size_image_handling is copy, this function is basically
-    a no-op
+    a no-op.

     correct_size_image_handling can be 'copy' (in which case the original image is just copied
     to the output folder) or 'rewrite' (in which case the image is opened via PIL and re-written,
@@ -44,12 +43,6 @@ def resize_coco_dataset(input_folder,input_filename,
     you're superstitious about biases coming from images in a training set being written
     by different image encoders.

-    right_edge_quantization_threshold is an off-by-default hack to adjust large datasets where
-    boxes that really should be running off the right side of the image only extend like 99%
-    of the way there, due to what appears to be a slight bias inherent to MD. If a box extends
-    within [right_edge_quantization_threshold] (a small number, from 0 to 1, but probably around
-    0.02) of the right edge of the image, it will be extended to the far right edge.
-
     Returns the COCO database with resized images.
     """

@@ -126,15 +119,6 @@ def resize_coco_dataset(input_folder,input_filename,
                     bbox[2] * width_scale,
                     bbox[3] * height_scale]

-            # Do we need to quantize this box?
-            if right_edge_quantization_threshold is not None and \
-                right_edge_quantization_threshold > 0:
-                bbox_right_edge_abs = bbox[0] + bbox[2]
-                bbox_right_edge_norm = bbox_right_edge_abs / output_w
-                bbox_right_edge_distance = (1.0 - bbox_right_edge_norm)
-                if bbox_right_edge_distance < right_edge_quantization_threshold:
-                    bbox[2] = output_w - bbox[0]
-
             ann['bbox'] = bbox

         # ...if this annotation has a box
@@ -169,13 +153,10 @@ if False:

     correct_size_image_handling = 'rewrite'

-    right_edge_quantization_threshold = 0.015
-
     resize_coco_dataset(input_folder,input_filename,
                         output_folder,output_filename,
                         target_size=target_size,
-                        correct_size_image_handling=correct_size_image_handling
-                        right_edge_quantization_threshold=right_edge_quantization_threshold)
+                        correct_size_image_handling=correct_size_image_handling)


     #%% Preview