megadetector 5.0.7__py3-none-any.whl → 5.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of megadetector might be problematic.

Files changed (48)
  1. api/batch_processing/data_preparation/manage_local_batch.py +28 -14
  2. api/batch_processing/postprocessing/combine_api_outputs.py +2 -2
  3. api/batch_processing/postprocessing/compare_batch_results.py +1 -1
  4. api/batch_processing/postprocessing/convert_output_format.py +24 -6
  5. api/batch_processing/postprocessing/load_api_results.py +1 -3
  6. api/batch_processing/postprocessing/md_to_labelme.py +118 -51
  7. api/batch_processing/postprocessing/merge_detections.py +30 -5
  8. api/batch_processing/postprocessing/postprocess_batch_results.py +24 -12
  9. api/batch_processing/postprocessing/remap_detection_categories.py +163 -0
  10. api/batch_processing/postprocessing/render_detection_confusion_matrix.py +15 -12
  11. api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +2 -2
  12. data_management/cct_json_utils.py +7 -2
  13. data_management/coco_to_labelme.py +263 -0
  14. data_management/coco_to_yolo.py +7 -4
  15. data_management/databases/integrity_check_json_db.py +68 -59
  16. data_management/databases/subset_json_db.py +1 -1
  17. data_management/get_image_sizes.py +44 -26
  18. data_management/importers/animl_results_to_md_results.py +1 -3
  19. data_management/importers/noaa_seals_2019.py +1 -1
  20. data_management/labelme_to_coco.py +252 -143
  21. data_management/labelme_to_yolo.py +95 -52
  22. data_management/lila/create_lila_blank_set.py +106 -23
  23. data_management/lila/download_lila_subset.py +133 -65
  24. data_management/lila/generate_lila_per_image_labels.py +1 -1
  25. data_management/lila/lila_common.py +8 -38
  26. data_management/read_exif.py +65 -16
  27. data_management/remap_coco_categories.py +84 -0
  28. data_management/resize_coco_dataset.py +3 -22
  29. data_management/wi_download_csv_to_coco.py +239 -0
  30. data_management/yolo_to_coco.py +283 -83
  31. detection/run_detector_batch.py +12 -3
  32. detection/run_inference_with_yolov5_val.py +10 -3
  33. detection/run_tiled_inference.py +2 -2
  34. detection/tf_detector.py +2 -1
  35. detection/video_utils.py +1 -1
  36. md_utils/ct_utils.py +22 -3
  37. md_utils/md_tests.py +11 -2
  38. md_utils/path_utils.py +206 -32
  39. md_utils/url_utils.py +66 -1
  40. md_utils/write_html_image_list.py +12 -3
  41. md_visualization/visualization_utils.py +363 -72
  42. md_visualization/visualize_db.py +33 -10
  43. {megadetector-5.0.7.dist-info → megadetector-5.0.8.dist-info}/METADATA +10 -12
  44. {megadetector-5.0.7.dist-info → megadetector-5.0.8.dist-info}/RECORD +47 -44
  45. {megadetector-5.0.7.dist-info → megadetector-5.0.8.dist-info}/WHEEL +1 -1
  46. md_visualization/visualize_megadb.py +0 -183
  47. {megadetector-5.0.7.dist-info → megadetector-5.0.8.dist-info}/LICENSE +0 -0
  48. {megadetector-5.0.7.dist-info → megadetector-5.0.8.dist-info}/top_level.txt +0 -0
data_management/lila/download_lila_subset.py

@@ -9,7 +9,7 @@
  # what you want to query for, etc., is very application-specific; this is just meant as a
  # demo.
  #
- # Can download from either Azure or GCP.
+ # Can download from GCP (all datasets), AWS (all datasets), or Azure (most datasets).
  #
  ########

@@ -20,15 +20,16 @@ import random

  from tqdm import tqdm
  from multiprocessing.pool import ThreadPool
- from urllib.parse import urlparse
  from collections import defaultdict

- from data_management.lila.lila_common import \
-     read_lila_all_images_file, is_empty, azure_url_to_gcp_http_url
+ from data_management.lila.lila_common import read_lila_all_images_file, is_empty, lila_base_urls
  from md_utils.url_utils import download_url

+ for s in lila_base_urls.values():
+     assert s.endswith('/')
+
  # If any of these strings appear in the common name of a species, we'll download that image
- species_of_interest = ['grey fox','red fox','leopard cat','kiwi']
+ species_of_interest = ['grey fox','gray fox','cape fox','red fox','kit fox']

  # We'll write images, metadata downloads, and temporary files here
  lila_local_base = os.path.expanduser('~/lila')
@@ -42,17 +43,61 @@ os.makedirs(output_dir,exist_ok=True)
  # Number of concurrent download threads
  n_download_threads = 20

+ verbose = False
+
  max_images_per_dataset = 10 # None

  # This impacts the data download, but not the metadata download
  #
- # "Azure" really means "Azure if available"; recent datasets are only available
- # on GCP.
- image_download_source = 'azure' # 'azure' or 'gcp'
+ # Setting this to "Azure" really means "Azure if available"; some datasets are
+ # not available on Azure.
+ preferred_provider = 'gcp' # 'azure', 'gcp', 'aws'

  random.seed(0)


+ #%% Support functions
+
+ def download_relative_url(relative_url, output_base, provider='gcp',
+                           verbose=False, overwrite=False):
+     """
+     Download a URL to output_base, preserving the path relative to the common LILA root.
+     """
+
+     assert not relative_url.startswith('/')
+
+     # Not all datasets are available on Azure, fall back in these cases. The decision
+     # to fall back to GCP rather than AWS is arbitrary.
+     if provider == 'azure':
+         nominal_provider = relative_url_to_nominal_provider[relative_url]
+         if nominal_provider != 'azure':
+             if verbose:
+                 print('URL {} not available on Azure, falling back to GCP'.format(
+                     relative_url))
+             provider = 'gcp'
+
+     url = lila_base_urls[provider] + relative_url
+
+     result = {'status':'unknown','url':url,'destination_filename':None}
+
+     destination_filename = os.path.join(output_base,relative_url)
+     result['destination_filename'] = destination_filename
+
+     if ((os.path.isfile(destination_filename)) and (not overwrite)):
+         result['status'] = 'skipped'
+         return result
+     try:
+         download_url(url, destination_filename, verbose=verbose, force_download=overwrite)
+     except Exception as e:
+         print('Warning: error downloading URL {}: {}'.format(
+             url,str(e)))
+         result['status'] = 'error: {}'.format(str(e))
+         return result
+
+     result['status'] = 'success'
+     return result
+
+
  #%% Download and open the giant table of image URLs and labels

  # ~60 seconds to download, unzip, and open
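
For orientation, here is a minimal sketch (not part of the diff) of calling the new download_relative_url helper directly; the relative path below is hypothetical, and output_dir, lila_base_urls, and download_url are the names defined earlier in this script:

    # Hypothetical relative path; any path from the LILA index would work the same way
    example_relative_url = 'some-dataset/images/example_image.jpg'

    r = download_relative_url(example_relative_url,
                              output_base=output_dir,
                              provider='gcp',
                              verbose=True)

    # 'status' is 'success', 'skipped', or an 'error: ...' string
    print(r['status'], r['destination_filename'])
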
@@ -63,6 +108,8 @@ df = read_lila_all_images_file(metadata_dir)

  # ~2 minutes

+ common_name_to_count = defaultdict(int)
+
  ds_name_to_urls = defaultdict(list)

  def find_items(row):
@@ -75,6 +122,7 @@ def find_items(row):
      for species_name in species_of_interest:
          if species_name in row['common_name']:
              match = True
+             common_name_to_count[species_name] += 1
              break

      if match:
@@ -83,15 +131,19 @@
  tqdm.pandas()
  _ = df.progress_apply(find_items,axis=1)

+ # We have a list of URLs for each dataset, flatten them all into a list of URLs
  all_urls = list(ds_name_to_urls.values())
  all_urls = [item for sublist in all_urls for item in sublist]
  print('Found {} matching URLs across {} datasets'.format(len(all_urls),len(ds_name_to_urls)))

+ for common_name in common_name_to_count:
+     print('{}: {}'.format(common_name,common_name_to_count[common_name]))
+
  from copy import deepcopy
  ds_name_to_urls_raw = deepcopy(ds_name_to_urls)


- #%% Trim to a fixed number of URLs per dataset
+ #%% Optionally trim to a fixed number of URLs per dataset

  if max_images_per_dataset is None:
      pass
@@ -102,74 +154,90 @@ else:
          ds_name_to_urls[ds_name] = random.sample(ds_name_to_urls[ds_name],max_images_per_dataset)


- #%% Download those image files
+ #%% Convert URLs to be relative to the common LILA base

- container_to_url_base = {
-     'lilablobssc.blob.core.windows.net':'/',
-     'storage.googleapis.com':'/public-datasets-lila/'
-     }
+ all_urls = list(ds_name_to_urls.values())
+ all_urls = [item for sublist in all_urls for item in sublist]

- def download_relative_filename(url, output_base, verbose=False, url_base=None, overwrite=False):
-     """
-     Download a URL to output_base, preserving relative path
-     """
-
-     result = {'status':'unknown','url':url,'destination_filename':None}
-
-     if url_base is None:
-         assert url.startswith('https://')
-         container = url.split('/')[2]
-         assert container in container_to_url_base
-         url_base = container_to_url_base[container]
-
-     assert url_base.startswith('/') and url_base.endswith('/')
-
-     p = urlparse(url)
-     relative_filename = str(p.path)
-     # remove the leading '/'
-     assert relative_filename.startswith(url_base)
-     relative_filename = relative_filename.replace(url_base,'',1)
-
-     destination_filename = os.path.join(output_base,relative_filename)
-     result['destination_filename'] = destination_filename
-
-     if ((os.path.isfile(destination_filename)) and (not overwrite)):
-         result['status'] = 'skipped'
-         return result
-     try:
-         download_url(url, destination_filename, verbose=verbose)
-     except Exception as e:
-         print('Warning: error downloading URL {}: {}'.format(
-             url,str(e)))
-         result['status'] = 'error: {}'.format(str(e))
-         return result
-
-     result['status'] = 'success'
-     return result
+ all_urls_relative = []

+ # Each file has a nominal URL in the .csv file. For now, the only thing this tells us
+ # is that if the nominal URL isn't an Azure URL, the file isn't on Azure. All files are on
+ # GCP and AWS.
+ #
+ # Keep track of the nominal provider for each URL.
+ relative_url_to_nominal_provider = {}
+
+ for url in all_urls:
+     found_base = False
+     for provider in lila_base_urls.keys():
+         base = lila_base_urls[provider]
+         if url.startswith(base):
+             relative_url = url.replace(base,'')
+             all_urls_relative.append(relative_url)
+             relative_url_to_nominal_provider[relative_url] = provider
+             found_base = True
+             break
+     assert found_base
+
+ assert len(all_urls) == len(all_urls_relative)

- # ds_name_to_urls maps dataset names to lists of URLs; flatten to a single list of URLs
- all_urls = list(ds_name_to_urls.values())
- all_urls = [item for sublist in all_urls for item in sublist]

- # Convert Azure URLs to GCP URLs if necessary
- if image_download_source != 'azure':
-     assert image_download_source == 'gcp'
-     all_urls = [azure_url_to_gcp_http_url(url) for url in all_urls]
+ #%% Download image files

- print('Downloading {} images on {} workers'.format(len(all_urls),n_download_threads))
+ print('Downloading {} images on {} workers, preferred provider is {}'.format(
+     len(all_urls),n_download_threads,preferred_provider))

  if n_download_threads <= 1:

      results = []

-     # url = all_urls[0]
-     for url in tqdm(all_urls):
-         results.append(download_relative_filename(url,output_dir,url_base=None))
+     # url_relative = all_urls_relative[0]
+     for url_relative in tqdm(all_urls_relative):
+         result = download_relative_url(url_relative,
+                                        output_base=output_dir,
+                                        provider=preferred_provider,
+                                        verbose=verbose)
+         results.append(result)

  else:

      pool = ThreadPool(n_download_threads)
-     results = list(tqdm(pool.imap(lambda s: download_relative_filename(
-         s,output_dir,url_base=None),
-         all_urls), total=len(all_urls)))
+     results = list(tqdm(pool.imap(lambda s: download_relative_url(
+         s,output_base=output_dir,provider=preferred_provider,verbose=verbose),
+         all_urls_relative), total=len(all_urls_relative)))
+
+
+ #%% Scrap
+
+ if False:
+
+     pass
+
+     #%% Find all the reptiles on LILA
+
+     reptile_rows = df.loc[df['class'] == 'reptilia']
+
+     # i_row = 0; row = reptile_rows.iloc[i_row]
+
+     common_name_to_count = defaultdict(int)
+     dataset_to_count = defaultdict(int)
+     for i_row,row in reptile_rows.iterrows():
+         common_name_to_count[row['common_name']] += 1
+         dataset_to_count[row['dataset_name']] += 1
+
+     from md_utils.ct_utils import sort_dictionary_by_value
+
+     print('Found {} reptiles\n'.format(len(reptile_rows)))
+
+     common_name_to_count = sort_dictionary_by_value(common_name_to_count,reverse=True)
+     dataset_to_count = sort_dictionary_by_value(dataset_to_count,reverse=True)
+
+     print('Common names by count:\n')
+     for k in common_name_to_count:
+         print('{} ({})'.format(k,common_name_to_count[k]))
+
+     print('\nDatasets by count:\n')
+     for k in dataset_to_count:
+         print('{} ({})'.format(k,dataset_to_count[k]))
+
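
As a small follow-on sketch (not part of the diff), the results list built above could be summarized by status; only the result dicts returned by download_relative_url are assumed:

    from collections import defaultdict

    status_to_count = defaultdict(int)
    for r in results:
        # Collapse 'error: ...' strings into a single bucket
        s = 'error' if r['status'].startswith('error') else r['status']
        status_to_count[s] += 1

    for s in status_to_count:
        print('{}: {}'.format(s, status_to_count[s]))
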
data_management/lila/generate_lila_per_image_labels.py

@@ -338,7 +338,7 @@ with open(output_file,'w',encoding='utf-8',newline='') as f:

  # ...with open()

- print('Processed {} datsets'.format(len(metadata_table)))
+ print('Processed {} datasets'.format(len(metadata_table)))


  #%% Read the .csv back
data_management/lila/lila_common.py

@@ -31,9 +31,13 @@ wildlife_insights_taxonomy_local_json_filename = 'wi_taxonomy.json'
  wildlife_insights_taxonomy_local_csv_filename = \
      wildlife_insights_taxonomy_local_json_filename.replace('.json','.csv')

- lila_azure_storage_account = 'https://lilablobssc.blob.core.windows.net'
- gcp_bucket_api_url = 'https://storage.googleapis.com/public-datasets-lila'
- gcp_bucket_gs_url = 'gs://public-datasets-lila'
+ # Filenames are consistent across clouds relative to these URLs
+ lila_base_urls = {
+     'azure':'https://lilablobssc.blob.core.windows.net/',
+     'gcp':'https://storage.googleapis.com/public-datasets-lila/',
+     'aws':'http://us-west-2.opendata.source.coop.s3.amazonaws.com/agentmorris/lila-wildlife/'
+ }
+


  #%% Common functions
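
To illustrate the convention behind the new lila_base_urls dict: the same container-relative filename resolves to a provider-specific URL by simple concatenation. A minimal sketch; the relative path here is made up:

    from data_management.lila.lila_common import lila_base_urls

    relative_path = 'some-dataset/images/example_image.jpg'  # hypothetical
    for provider, base in lila_base_urls.items():
        print('{}: {}'.format(provider, base + relative_path))
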
@@ -198,28 +202,6 @@ def read_metadata_file_for_dataset(ds_name,metadata_dir,metadata_table=None,json
      return json_filename


- def azure_url_to_gcp_http_url(url,error_if_not_azure_url=True):
-     """
-     Most URLs point to Azure by default, but most files are available on both Azure and GCP.
-     This function converts an Azure URL to the corresponding GCP http:// url.
-     """
-
-     if error_if_not_azure_url:
-         assert url.startswith(lila_azure_storage_account)
-     gcp_url = url.replace(lila_azure_storage_account,gcp_bucket_api_url,1)
-     return gcp_url
-
-
- def azure_url_to_gcp_gs_url(url,error_if_not_azure_url=True):
-     """
-     Most URLs point to Azure by default, but most files are available on both Azure and GCP.
-     This function converts an Azure URL to the corresponding GCP gs:// url.
-     """
-
-     return azure_url_to_gcp_http_url(url,error_if_not_azure_url).\
-         replace(gcp_bucket_api_url,gcp_bucket_gs_url,1)
-
-
  #%% Interactive test driver

  if False:
@@ -252,16 +234,4 @@ if False:
          urls_to_test.append(ds_info['bbox_url'])

      status_codes = url_utils.test_urls(urls_to_test)
-
-
-     #%% Verify that the GCP versions of all metadata files exist
-
-     gcp_urls = []
-
-     # url = urls_to_test[0]
-     for url in urls_to_test:
-         assert url.startswith(lila_azure_storage_account)
-         gcp_url = url.replace(lila_azure_storage_account,gcp_bucket_api_url,1)
-         gcp_urls.append(gcp_url)
-
-     status_codes = url_utils.test_urls(gcp_urls)
+
data_management/read_exif.py

@@ -48,9 +48,18 @@ class ReadExifOptions:
      #
      # Not relevant if n_workers is 1.
      use_threads = True
-
+
+     # "File" and "ExifTool" are tag types used by ExifTool to report data that
+     # doesn't come from EXIF, rather from the file (e.g. file size).
      tag_types_to_ignore = set(['File','ExifTool'])

+     # Include/exclude specific tags (mutually incompatible)
+     tags_to_include = None
+     tags_to_exclude = None
+
+     # A useful set of tags one might want to limit queries for
+     # options.tags_to_include = ['DateTime','Model','Make','ExifImageWidth','ExifImageHeight','DateTime','DateTimeOriginal','Orientation']
+
      exiftool_command_name = 'exiftool'

      # How should we handle byte-formatted EXIF tags?
@@ -62,16 +71,17 @@ class ReadExifOptions:

      # Should we use exiftool or pil?
      processing_library = 'pil' # 'exiftool','pil'
-
+
+

  #%% Functions

- def enumerate_files(input_folder):
+ def enumerate_files(input_folder,recursive=True):
      """
      Enumerates all image files in input_folder, returning relative paths
      """

-     image_files = find_images(input_folder,recursive=True)
+     image_files = find_images(input_folder,recursive=recursive)
      image_files = [os.path.relpath(s,input_folder) for s in image_files]
      image_files = [s.replace('\\','/') for s in image_files]
      print('Enumerated {} files'.format(len(image_files)))
@@ -99,7 +109,7 @@ def get_exif_ifd(exif):
  def read_pil_exif(im,options=None):
      """
      Read all the EXIF data we know how to read from [im] (path or PIL Image), whether it's
-     in the PIL default EXIF data or not.
+     in the PIL default EXIF data or not. Returns a dict.
      """

      if options is None:
@@ -192,6 +202,32 @@ def parse_exif_datetime_string(s,verbose=False):
      return dt


+ def _filter_tags(tags,options):
+     """
+     Internal function used to include/exclude specific tags from the exif_tags
+     dict.
+     """
+
+     if options is None:
+         return tags
+     if options.tags_to_include is None and options.tags_to_exclude is None:
+         return tags
+     if options.tags_to_include is not None:
+         assert options.tags_to_exclude is None, "tags_to_include and tags_to_exclude are incompatible"
+         tags_to_return = {}
+         for tag_name in tags.keys():
+             if tag_name in options.tags_to_include:
+                 tags_to_return[tag_name] = tags[tag_name]
+         return tags_to_return
+     if options.tags_to_exclude is not None:
+         assert options.tags_to_include is None, "tags_to_include and tags_to_exclude are incompatible"
+         tags_to_return = {}
+         for tag_name in tags.keys():
+             if tag_name not in options.tags_to_exclude:
+                 tags_to_return[tag_name] = tags[tag_name]
+         return tags_to_return
+
+
  def read_exif_tags_for_image(file_path,options=None):
      """
      Get relevant fields from EXIF data for an image
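
A toy illustration (not part of the diff) of what the new _filter_tags helper and the tags_to_include/tags_to_exclude options do; the tag dict below is invented:

    options = ReadExifOptions()
    options.tags_to_include = ['Make','Model']

    tags = {'Make':'ACME','Model':'TrailCam 3000','MakerNote':b'\x00\x01'}  # made-up EXIF dict
    print(_filter_tags(tags,options))   # keeps only 'Make' and 'Model'

    options = ReadExifOptions()
    options.tags_to_exclude = ['MakerNote']
    print(_filter_tags(tags,options))   # drops only 'MakerNote'
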
@@ -227,8 +263,8 @@ def read_exif_tags_for_image(file_path,options=None):
              result['status'] = 'empty_read'
          else:
              result['status'] = 'success'
-             result['tags'] = exif_tags
-
+             result['tags'] = _filter_tags(exif_tags,options)
+
          return result

      elif options.processing_library == 'exiftool':
@@ -283,9 +319,12 @@ def read_exif_tags_for_image(file_path,options=None):
              print('Ignoring tag with type {}'.format(field_type))
              continue

-         field_tag = field_name_type_tokens[1].strip()
-
-         tag = [field_type,field_tag,field_value]
+         field_name = field_name_type_tokens[1].strip()
+         if options.tags_to_exclude is not None and field_name in options.tags_to_exclude:
+             continue
+         if options.tags_to_include is not None and field_name not in options.tags_to_include:
+             continue
+         tag = [field_type,field_name,field_value]

          exif_tags.append(tag)

@@ -350,20 +389,22 @@ def populate_exif_data(im, image_base, options=None):
  # ...populate_exif_data()


- def create_image_objects(image_files):
+ def create_image_objects(image_files,recursive=True):
      """
      Create empty image objects for every image in [image_files], which can be a
      list of relative paths (which will get stored without processing, so the base
      path doesn't matter here), or a folder name.

      Returns a list of dicts with field 'file_name' (a relative path).
+
+     "recursive" is ignored if "image_files" is a list.
      """

      # Enumerate *relative* paths
      if isinstance(image_files,str):
          print('Enumerating image files in {}'.format(image_files))
          assert os.path.isdir(image_files), 'Invalid image folder {}'.format(image_files)
-         image_files = enumerate_files(image_files)
+         image_files = enumerate_files(image_files,recursive=recursive)

      images = []
      for fn in image_files:
@@ -499,7 +540,7 @@ def is_executable(name):
      return which(name) is not None


- def read_exif_from_folder(input_folder,output_file=None,options=None,filenames=None):
+ def read_exif_from_folder(input_folder,output_file=None,options=None,filenames=None,recursive=True):
      """
      Read EXIF data for all images in input_folder.

@@ -516,6 +557,12 @@ def read_exif_from_folder(input_folder,output_file=None,options=None,filenames=N
      if options is None:
          options = ReadExifOptions()

+     # Validate options
+     if options.tags_to_include is not None:
+         assert options.tags_to_exclude is None, "tags_to_include and tags_to_exclude are incompatible"
+     if options.tags_to_exclude is not None:
+         assert options.tags_to_include is None, "tags_to_include and tags_to_exclude are incompatible"
+
      if input_folder is None:
          input_folder = ''
      if len(input_folder) > 0:
@@ -542,7 +589,7 @@ def read_exif_from_folder(input_folder,output_file=None,options=None,filenames=N
      assert is_executable(options.exiftool_command_name), 'exiftool not available'

      if filenames is None:
-         images = create_image_objects(input_folder)
+         images = create_image_objects(input_folder,recursive=recursive)
      else:
          assert isinstance(filenames,list)
          images = create_image_objects(filenames)
@@ -567,14 +614,16 @@ if False:

      #%%

-     input_folder = os.path.expanduser('~/data/KRU-test')
-     output_file = os.path.expanduser('~/data/test-exif.json')
+     input_folder = r'C:\temp\md-name-testing'
+     output_file = None # r'C:\temp\md-name-testing\exif.json'
      options = ReadExifOptions()
      options.verbose = False
      options.n_workers = 10
      options.use_threads = False
      options.processing_library = 'pil'
      # options.processing_library = 'exiftool'
+     options.tags_to_include = ['DateTime','Model','Make','ExifImageWidth','ExifImageHeight','DateTime','DateTimeOriginal','Orientation']
+     # options.tags_to_exclude = ['MakerNote']

      results = read_exif_from_folder(input_folder,output_file,options)

data_management/remap_coco_categories.py (new file)

@@ -0,0 +1,84 @@
+ ########
+ #
+ # remap_coco_categories.py
+ #
+ # Given a COCO-formatted dataset, remap the categories to a new mapping.
+ #
+ ########
+
+ #%% Imports and constants
+
+ import os
+ import json
+
+ from copy import deepcopy
+
+
+ #%% Main function
+
+ def remap_coco_categories(input_data,
+                           output_category_name_to_id,
+                           input_category_name_to_output_category_name,
+                           output_file=None):
+     """
+     Given a COCO-formatted dataset, remap the categories to a new categories mapping, optionally
+     writing the results to a new file.
+
+     output_category_name_to_id is a dict mapping strings to ints.
+
+     input_category_name_to_output_category_name is a dict mapping strings to strings.
+
+     [input_data] can be a COCO-formatted dict or a filename. If it's a dict, it will be copied,
+     not modified in place.
+     """
+
+     if isinstance(input_data,str):
+         assert os.path.isfile(input_data), "Can't find file {}".format(input_data)
+         with open(input_data,'r') as f:
+             input_data = json.load(f)
+         assert isinstance(input_data,dict), 'Illegal COCO input data'
+     else:
+         assert isinstance(input_data,dict), 'Illegal COCO input data'
+         input_data = deepcopy(input_data)
+
+     # It's safe to modify in-place now
+     output_data = input_data
+
+     # Read input name --> ID mapping
+     input_category_name_to_input_category_id = {}
+     for c in input_data['categories']:
+         input_category_name_to_input_category_id[c['name']] = c['id']
+
+     # Map input IDs --> output IDs
+     input_category_id_to_output_category_id = {}
+     for input_name in input_category_name_to_output_category_name.keys():
+         output_name = input_category_name_to_output_category_name[input_name]
+         assert output_name in output_category_name_to_id, \
+             'No output ID for {} --> {}'.format(input_name,output_name)
+         input_id = input_category_name_to_input_category_id[input_name]
+         output_id = output_category_name_to_id[output_name]
+         input_category_id_to_output_category_id[input_id] = output_id
+
+     # Map annotations
+     for ann in output_data['annotations']:
+         assert ann['category_id'] in input_category_id_to_output_category_id, \
+             'Unrecognized category ID {}'.format(ann['category_id'])
+         ann['category_id'] = input_category_id_to_output_category_id[ann['category_id']]
+
+     # Update the category list
+     output_categories = []
+     for output_name in output_category_name_to_id:
+         category = {'name':output_name,'id':output_category_name_to_id[output_name]}
+         output_categories.append(category)
+     output_data['categories'] = output_categories
+
+     if output_file is not None:
+         with open(output_file,'w') as f:
+             json.dump(output_data,f,indent=1)
+
+     return input_data
+
+
+ #%% Command-line driver
+
+ # TODO
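
A minimal usage sketch for the new remap_coco_categories module; the toy COCO dict and category names below are invented for illustration:

    from data_management.remap_coco_categories import remap_coco_categories

    coco_data = {
        'images':[{'id':0,'file_name':'img_0000.jpg'}],
        'annotations':[{'id':0,'image_id':0,'category_id':1,'bbox':[10,10,50,50]}],
        'categories':[{'id':1,'name':'red fox'},{'id':2,'name':'grey fox'}]
    }

    # Collapse both fox categories into a single 'fox' category
    remapped = remap_coco_categories(coco_data,
                                     output_category_name_to_id={'fox':1},
                                     input_category_name_to_output_category_name={'red fox':'fox','grey fox':'fox'},
                                     output_file=None)

    print(remapped['categories'])   # [{'name': 'fox', 'id': 1}]
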
data_management/resize_coco_dataset.py

@@ -26,8 +26,7 @@ from md_visualization.visualization_utils import \
  def resize_coco_dataset(input_folder,input_filename,
                          output_folder,output_filename,
                          target_size=(-1,-1),
-                         correct_size_image_handling='copy',
-                         right_edge_quantization_threshold=None):
+                         correct_size_image_handling='copy'):
      """
      Given a COCO-formatted dataset (images in input_folder, data in input_filename), resize
      all the images to a target size (in output_folder) and scale bounding boxes accordingly
@@ -36,7 +35,7 @@ def resize_coco_dataset(input_folder,input_filename,
      target_size should be a tuple/list of ints, length 2. If either dimension is -1, aspect ratio
      will be preserved. If both dimensions are -1, this means "keep the original size". If
      both dimensions are -1 and correct_size_image_handling is copy, this function is basically
-     a no-op, although you might still use it for right_edge_quantization_threshold.
+     a no-op.

      correct_size_image_handling can be 'copy' (in which case the original image is just copied
      to the output folder) or 'rewrite' (in which case the image is opened via PIL and re-written,
@@ -44,12 +43,6 @@ def resize_coco_dataset(input_folder,input_filename,
      you're superstitious about biases coming from images in a training set being written
      by different image encoders.

-     right_edge_quantization_threshold is an off-by-default hack to adjust large datasets where
-     boxes that really should be running off the right side of the image only extend like 99%
-     of the way there, due to what appears to be a slight bias inherent to MD. If a box extends
-     within [right_edge_quantization_threshold] (a small number, from 0 to 1, but probably around
-     0.02) of the right edge of the image, it will be extended to the far right edge.
-
      Returns the COCO database with resized images.
      """

@@ -126,15 +119,6 @@
              bbox[2] * width_scale,
              bbox[3] * height_scale]

-         # Do we need to quantize this box?
-         if right_edge_quantization_threshold is not None and \
-            right_edge_quantization_threshold > 0:
-             bbox_right_edge_abs = bbox[0] + bbox[2]
-             bbox_right_edge_norm = bbox_right_edge_abs / output_w
-             bbox_right_edge_distance = (1.0 - bbox_right_edge_norm)
-             if bbox_right_edge_distance < right_edge_quantization_threshold:
-                 bbox[2] = output_w - bbox[0]
-
          ann['bbox'] = bbox

      # ...if this annotation has a box
@@ -169,13 +153,10 @@ if False:

      correct_size_image_handling = 'rewrite'

-     right_edge_quantization_threshold = 0.015
-
      resize_coco_dataset(input_folder,input_filename,
                          output_folder,output_filename,
                          target_size=target_size,
-                         correct_size_image_handling=correct_size_image_handling,
-                         right_edge_quantization_threshold=right_edge_quantization_threshold)
+                         correct_size_image_handling=correct_size_image_handling)


  #%% Preview