megadetector 5.0.29__py3-none-any.whl → 10.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of megadetector might be problematic.
- megadetector/classification/efficientnet/model.py +8 -8
- megadetector/classification/efficientnet/utils.py +6 -5
- megadetector/classification/prepare_classification_script_mc.py +3 -3
- megadetector/data_management/annotations/annotation_constants.py +0 -1
- megadetector/data_management/camtrap_dp_to_coco.py +34 -1
- megadetector/data_management/cct_json_utils.py +2 -2
- megadetector/data_management/coco_to_yolo.py +22 -5
- megadetector/data_management/databases/add_width_and_height_to_db.py +85 -12
- megadetector/data_management/databases/combine_coco_camera_traps_files.py +2 -2
- megadetector/data_management/databases/integrity_check_json_db.py +29 -15
- megadetector/data_management/generate_crops_from_cct.py +50 -1
- megadetector/data_management/labelme_to_coco.py +4 -2
- megadetector/data_management/labelme_to_yolo.py +82 -2
- megadetector/data_management/lila/generate_lila_per_image_labels.py +276 -18
- megadetector/data_management/lila/get_lila_annotation_counts.py +5 -3
- megadetector/data_management/lila/lila_common.py +3 -0
- megadetector/data_management/lila/test_lila_metadata_urls.py +15 -5
- megadetector/data_management/mewc_to_md.py +5 -0
- megadetector/data_management/ocr_tools.py +4 -3
- megadetector/data_management/read_exif.py +20 -5
- megadetector/data_management/remap_coco_categories.py +66 -4
- megadetector/data_management/remove_exif.py +50 -1
- megadetector/data_management/rename_images.py +3 -3
- megadetector/data_management/resize_coco_dataset.py +563 -95
- megadetector/data_management/yolo_output_to_md_output.py +131 -2
- megadetector/data_management/yolo_to_coco.py +140 -5
- megadetector/detection/change_detection.py +4 -3
- megadetector/detection/pytorch_detector.py +60 -22
- megadetector/detection/run_detector.py +225 -25
- megadetector/detection/run_detector_batch.py +42 -16
- megadetector/detection/run_inference_with_yolov5_val.py +12 -2
- megadetector/detection/run_tiled_inference.py +1 -0
- megadetector/detection/video_utils.py +53 -24
- megadetector/postprocessing/add_max_conf.py +4 -0
- megadetector/postprocessing/categorize_detections_by_size.py +1 -1
- megadetector/postprocessing/classification_postprocessing.py +55 -20
- megadetector/postprocessing/combine_batch_outputs.py +3 -2
- megadetector/postprocessing/compare_batch_results.py +64 -10
- megadetector/postprocessing/convert_output_format.py +12 -8
- megadetector/postprocessing/create_crop_folder.py +137 -10
- megadetector/postprocessing/load_api_results.py +26 -8
- megadetector/postprocessing/md_to_coco.py +4 -4
- megadetector/postprocessing/md_to_labelme.py +18 -7
- megadetector/postprocessing/merge_detections.py +5 -0
- megadetector/postprocessing/postprocess_batch_results.py +6 -3
- megadetector/postprocessing/remap_detection_categories.py +55 -2
- megadetector/postprocessing/render_detection_confusion_matrix.py +9 -6
- megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +2 -2
- megadetector/taxonomy_mapping/map_new_lila_datasets.py +3 -4
- megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +40 -19
- megadetector/taxonomy_mapping/preview_lila_taxonomy.py +1 -1
- megadetector/taxonomy_mapping/species_lookup.py +123 -41
- megadetector/utils/ct_utils.py +133 -113
- megadetector/utils/md_tests.py +93 -13
- megadetector/utils/path_utils.py +137 -107
- megadetector/utils/split_locations_into_train_val.py +2 -2
- megadetector/utils/string_utils.py +7 -7
- megadetector/utils/url_utils.py +81 -58
- megadetector/utils/wi_utils.py +46 -17
- megadetector/visualization/plot_utils.py +13 -9
- megadetector/visualization/render_images_with_thumbnails.py +2 -1
- megadetector/visualization/visualization_utils.py +94 -46
- megadetector/visualization/visualize_db.py +36 -9
- megadetector/visualization/visualize_detector_output.py +4 -4
- {megadetector-5.0.29.dist-info → megadetector-10.0.1.dist-info}/METADATA +135 -135
- megadetector-10.0.1.dist-info/RECORD +139 -0
- {megadetector-5.0.29.dist-info → megadetector-10.0.1.dist-info}/licenses/LICENSE +0 -0
- {megadetector-5.0.29.dist-info → megadetector-10.0.1.dist-info}/top_level.txt +0 -0
- megadetector/api/batch_processing/api_core/__init__.py +0 -0
- megadetector/api/batch_processing/api_core/batch_service/__init__.py +0 -0
- megadetector/api/batch_processing/api_core/batch_service/score.py +0 -438
- megadetector/api/batch_processing/api_core/server.py +0 -294
- megadetector/api/batch_processing/api_core/server_api_config.py +0 -97
- megadetector/api/batch_processing/api_core/server_app_config.py +0 -55
- megadetector/api/batch_processing/api_core/server_batch_job_manager.py +0 -220
- megadetector/api/batch_processing/api_core/server_job_status_table.py +0 -149
- megadetector/api/batch_processing/api_core/server_orchestration.py +0 -360
- megadetector/api/batch_processing/api_core/server_utils.py +0 -88
- megadetector/api/batch_processing/api_core_support/__init__.py +0 -0
- megadetector/api/batch_processing/api_core_support/aggregate_results_manually.py +0 -46
- megadetector/api/batch_processing/api_support/__init__.py +0 -0
- megadetector/api/batch_processing/api_support/summarize_daily_activity.py +0 -152
- megadetector/api/batch_processing/data_preparation/__init__.py +0 -0
- megadetector/api/synchronous/__init__.py +0 -0
- megadetector/api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
- megadetector/api/synchronous/api_core/animal_detection_api/api_backend.py +0 -151
- megadetector/api/synchronous/api_core/animal_detection_api/api_frontend.py +0 -263
- megadetector/api/synchronous/api_core/animal_detection_api/config.py +0 -35
- megadetector/api/synchronous/api_core/tests/__init__.py +0 -0
- megadetector/api/synchronous/api_core/tests/load_test.py +0 -109
- megadetector/utils/azure_utils.py +0 -178
- megadetector/utils/sas_blob_utils.py +0 -513
- megadetector-5.0.29.dist-info/RECORD +0 -163
- /megadetector/{api/batch_processing/__init__.py → __init__.py} +0 -0
- {megadetector-5.0.29.dist-info → megadetector-10.0.1.dist-info}/WHEEL +0 -0
megadetector/data_management/labelme_to_yolo.py

@@ -10,12 +10,14 @@ Create YOLO .txt files in a folder containing labelme .json files.
 
 import os
 import json
+import argparse
 
 from multiprocessing.pool import Pool, ThreadPool
 from functools import partial
 from tqdm import tqdm
 
 from megadetector.utils.path_utils import recursive_file_list
+from megadetector.utils.ct_utils import write_json
 
 
 #%% Main function
@@ -33,6 +35,13 @@ def labelme_file_to_yolo_file(labelme_file,
     this function no-ops (i.e., does not generate a YOLO file).
 
     overwrite_behavior should be 'skip' or 'overwrite' (default).
+
+    Args:
+        labelme_file (str): .json file to convert
+        category_name_to_category_id (dict): category name --> ID mapping
+        yolo_file (str, optional): output .txt file defaults to s/json/txt
+        required_token (str, optional): only process filenames containing this token
+        overwrite_behavior (str, optional): "skip" or "overwrite"
     """
 
     result = {}
@@ -150,6 +159,15 @@ def labelme_folder_to_yolo(labelme_folder,
         'category_name_to_category_id', whether it was passed in or constructed
         'image_results': a list of results for each image (converted, skipped, error)
 
+    Args:
+        labelme_folder (str): folder of .json files to convert
+        category_name_to_category_id (dict): category name --> ID mapping
+        required_token (str, optional): only process filenames containing this token
+        overwrite_behavior (str, optional): "skip" or "overwrite"
+        relative_filenames_to_convert (list of str, optional): only process filenames on this list
+        n_workers (int, optional): parallelism level
+        use_threads (bool, optional): whether to use threads (True) or processes (False) for
+            parallelism
     """
 
     if relative_filenames_to_convert is not None:
@@ -236,7 +254,7 @@ def labelme_folder_to_yolo(labelme_folder,
     finally:
         pool.close()
         pool.join()
-        print(
+        print('Pool closed and joined for labelme conversion to YOLO')
 
     assert len(valid_labelme_files_relative) == len(image_results)
 
@@ -275,4 +293,66 @@ if False:
 
 #%% Command-line driver
 
-
+def main():
+    """
+    Command-line interface to convert Labelme JSON files to YOLO format
+    """
+
+    parser = argparse.ArgumentParser(
+        description='Convert a folder of Labelme .json files to YOLO .txt format'
+    )
+    parser.add_argument(
+        'labelme_folder',
+        type=str,
+        help='Folder of Labelme .json files to convert'
+    )
+    parser.add_argument(
+        '--output_category_file',
+        type=str,
+        default=None,
+        help='Path to save the generated category mapping (.json)'
+    )
+    parser.add_argument(
+        '--required_token',
+        type=str,
+        default=None,
+        help='Only process files containing this token as a key in the Labelme JSON dict'
+    )
+    parser.add_argument(
+        '--overwrite_behavior',
+        type=str,
+        default='overwrite',
+        choices=['skip', 'overwrite'],
+        help="Behavior if YOLO .txt files exist (default: 'overwrite')"
+    )
+    parser.add_argument(
+        '--n_workers',
+        type=int,
+        default=1,
+        help='Number of workers for parallel processing (default: 1)'
+    )
+    parser.add_argument(
+        '--use_processes',
+        action='store_true',
+        help='Use processes instead of threads for parallelization (defaults to threads)'
+    )
+
+    args = parser.parse_args()
+
+    results = labelme_folder_to_yolo(
+        labelme_folder=args.labelme_folder,
+        category_name_to_category_id=None,
+        required_token=args.required_token,
+        overwrite_behavior=args.overwrite_behavior,
+        relative_filenames_to_convert=None,
+        n_workers=args.n_workers,
+        use_threads=(not args.use_processes)
+    )
+
+    if args.output_category_file:
+        category_map = results['category_name_to_category_id']
+        write_json(args.output_category_file,category_map)
+        print(f'Saved category mapping to {args.output_category_file}')
+
+if __name__ == '__main__':
+    main()
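For orientation, here is a minimal usage sketch for the conversion entry point added above; it is not part of the diff itself, and the folder path and category-file name are examples only. The same conversion can presumably also be run from the command line via the new main() driver (e.g. python -m megadetector.data_management.labelme_to_yolo /data/labelme_annotations --output_category_file categories.json).

# Sketch: converting a folder of Labelme .json files with the new API (paths are hypothetical)
from megadetector.data_management.labelme_to_yolo import labelme_folder_to_yolo
from megadetector.utils.ct_utils import write_json

# Write YOLO .txt files next to the .json files, constructing the category mapping as we go
results = labelme_folder_to_yolo('/data/labelme_annotations',
                                 category_name_to_category_id=None,
                                 overwrite_behavior='overwrite',
                                 n_workers=4,
                                 use_threads=True)

# Persist the mapping that the conversion constructed (what --output_category_file does in the CLI)
write_json('/data/labelme_annotations/categories.json',
           results['category_name_to_category_id'])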
megadetector/data_management/lila/generate_lila_per_image_labels.py

@@ -67,6 +67,18 @@ if debug_max_images_per_dataset > 0:
     print('Running in debug mode')
     output_file = output_file.replace('.csv','_debug.csv')
 
+taxonomy_levels_to_include = \
+    ['kingdom','phylum','subphylum','superclass','class','subclass','infraclass','superorder','order',
+     'suborder','infraorder','superfamily','family','subfamily','tribe','genus','subgenus',
+     'species','subspecies','variety']
+
+def _clearnan(v):
+    if isinstance(v,float):
+        assert np.isnan(v)
+        v = ''
+    assert isinstance(v,str)
+    return v
+
 
 #%% Download and parse the metadata file
 
@@ -87,7 +99,7 @@ for ds_name in metadata_table.keys():
 
 #%% Load taxonomy data
 
-taxonomy_df = read_lila_taxonomy_mapping(metadata_dir)
+taxonomy_df = read_lila_taxonomy_mapping(metadata_dir, force_download=True)
 
 
 #%% Build a dictionary that maps each [dataset,query] pair to the full taxonomic label set
@@ -113,22 +125,10 @@ header = ['dataset_name','url_gcp','url_aws','url_azure',
           'image_id','sequence_id','location_id','frame_num',
          'original_label','scientific_name','common_name','datetime','annotation_level']
 
-taxonomy_levels_to_include = \
-    ['kingdom','phylum','subphylum','superclass','class','subclass','infraclass','superorder','order',
-     'suborder','infraorder','superfamily','family','subfamily','tribe','genus','species','subspecies',
-     'variety']
-
 header.extend(taxonomy_levels_to_include)
 
 missing_annotations = set()
 
-def _clearnan(v):
-    if isinstance(v,float):
-        assert np.isnan(v)
-        v = ''
-    assert isinstance(v,str)
-    return v
-
 with open(output_file,'w',encoding='utf-8',newline='') as f:
 
     csv_writer = csv.writer(f)
@@ -361,7 +361,10 @@ print('Read {} rows from {}'.format(len(df),output_file))
 
 #%% Do some post-hoc integrity checking
 
-# Takes ~10 minutes without
+# Takes ~5 minutes with apply(), or ~10 minutes without apply()
+#
+# Using apply() is faster, but more annoying to debug.
+use_pandas_apply_for_integrity_checking = True
 
 tqdm.pandas()
 
@@ -393,8 +396,7 @@ def _check_row(row):
     ds_name = row['dataset_name']
     dataset_name_to_locations[ds_name].add(row['location_id'])
 
-
-if True:
+if use_pandas_apply_for_integrity_checking:
 
     df.progress_apply(_check_row, axis=1)
 
@@ -448,8 +450,9 @@ for ds_name in metadata_table.keys():
         empty_rows = empty_rows.sample(n=n_empty_images_per_dataset)
     images_to_download.extend(empty_rows.to_dict('records'))
 
+    # All LILA datasets have non-empty images
     if len(non_empty_rows) == 0:
-
+        raise ValueError('No non-empty images available for {}'.format(ds_name))
     elif len(non_empty_rows) > n_non_empty_images_per_dataset:
         non_empty_rows = non_empty_rows.sample(n=n_non_empty_images_per_dataset)
     images_to_download.extend(non_empty_rows.to_dict('records'))
@@ -463,7 +466,7 @@ print('Selected {} total images'.format(len(images_to_download)))
 
 # Expect a few errors for images with human or vehicle labels (or things like "ignore" that *could* be humans)
 
-preferred_cloud = '
+preferred_cloud = 'gcp'
 
 url_to_target_file = {}
 
@@ -484,6 +487,19 @@ for i_image,image in tqdm(enumerate(images_to_download),total=len(images_to_down
 download_results = parallel_download_urls(url_to_target_file,verbose=False,overwrite=True,
                                           n_workers=20,pool_type='thread')
 
+# 10-20 errors is normal; they should all be images that are labeled as "human"
+errors = []
+
+for r in download_results:
+    if r['status'] != 'success':
+        errors.append(r)
+
+assert len(download_results) == len(url_to_target_file)
+print('Errors on {} of {} downloads:\n'.format(len(errors),len(download_results)))
+
+for err in errors:
+    print(err['url'])
+
 
 #%% Write preview HTML
 
@@ -515,3 +531,245 @@ open_file(html_filename)
 zipped_output_file = zip_file(output_file,verbose=True,overwrite=True)
 
 print('Zipped {} to {}'.format(output_file,zipped_output_file))
+
+
+#%% Convert to .json
+
+"""
+The .csv file "output_file" (already loaded into the variable "df" at this point) has the following columns:
+
+dataset_name,url_gcp,url_aws,url_azure,image_id,sequence_id,location_id,frame_num,original_label,scientific_name,common_name,datetime,annotation_level,kingdom,phylum,subphylum,superclass,class,subclass,infraclass,superorder,order,suborder,infraorder,superfamily,family,subfamily,tribe,genus,subgenus,species,subspecies,variety
+
+Each row in the .csv represents an image. The URL columns represent the location of that
+image on three different clouds; for a given image, the value of those columns differs only
+in the prefix. The columns starting with "kingdom" represent a taxonomic wildlife identifier. Not
+all rows have values in all of these columns; some rows represent non-wildlife images where all of these
+columns are blank.
+
+This cell converts this to a .json dictionary, with the following top-level keys:
+
+## datasets (dict)
+
+A dict mapping integer IDs to strings.
+
+Each unique value in the "dataset_name" column should become an element in this dict with a unique ID.
+
+## sequences (dict)
+
+A dict mapping integer IDs to strings.
+
+Each unique value in the "sequence_id" column should become an element in this dict with a unique ID.
+
+## locations (dict)
+
+A dict mapping integer IDs to strings.
+
+Each unique value in the "location_id" column should become an element in this dict with a unique ID.
+
+## base_urls (dict)
+
+This key should point to the following dict:
+
+{
+    "gcp": "https://storage.googleapis.com/public-datasets-lila/",
+    "aws": "http://us-west-2.opendata.source.coop.s3.amazonaws.com/agentmorris/lila-wildlife/",
+    "azure": "https://lilawildlife.blob.core.windows.net/lila-wildlife/",
+}
+
+All values in the url_gcp, url_aws, and url_azure columns start with these values, respectively.
+
+## taxa (dict)
+
+A dict mapping integer IDs to dicts, where each dict has the fields:
+
+kingdom,phylum,subphylum,superclass,class,subclass,infraclass,superorder,order,suborder,infraorder,superfamily,family,subfamily,tribe,genus,subgenus,species,subspecies,variety
+
+The value of each of these fields in each row is either a string or None.
+
+## images (list)
+
+A list of images, where each image is a dict with the following fields:
+
+### dataset (int)
+
+The integer ID corresponding to the dataset_name column for this image
+
+### path (str)
+
+The suffix for this image's URL, which should be the same across the three URL columns.
+
+### seq (int)
+
+The integer ID corresponding to the sequence_id column for this image
+
+### loc (int)
+
+The integer ID corresponding to the location_id column for this image
+
+### frame_num
+
+The value of the frame_num column for this image, unless the original value was -1,
+in which case this is omitted.
+
+### original_label
+
+The value of the original_label column for this image
+
+### common_name
+
+The value of the common_name column for this image, if not empty
+
+### datetime
+
+The value of the datetime column for this image
+
+### ann_level
+
+The value of the annotation_level column for this image
+
+### taxon
+
+The integer ID corresponding to the taxonomic identifier columns for this image
+
+--
+
+The original .csv file is large (~15GB); this may impact the implementation of the .json conversion. Speed of
+conversion is not a priority.
+
+"""
+
+print('Converting to JSON...')
+
+output_json_file = output_file.replace('.csv', '.json')
+
+json_data = {}
+
+# Create mappings for datasets, sequences, and locations
+dataset_to_id = {}
+sequence_to_id = {}
+location_to_id = {}
+taxa_to_id = {}
+
+next_dataset_id = 0
+next_sequence_id = 0
+next_location_id = 0
+next_taxa_id = 0
+
+json_data['datasets'] = {}
+json_data['sequences'] = {}
+json_data['locations'] = {}
+json_data['taxa'] = {}
+
+json_data['base_urls'] = {
+    "gcp": "https://storage.googleapis.com/public-datasets-lila/",
+    "aws": "http://us-west-2.opendata.source.coop.s3.amazonaws.com/agentmorris/lila-wildlife/",
+    "azure": "https://lilawildlife.blob.core.windows.net/lila-wildlife/",
+}
+
+json_data['images'] = []
+
+debug_max_json_conversion_rows = None
+
+print('Counting rows in .csv file...')
+
+# Get total number of lines for progress bar (optional, but helpful for large files)
+def _count_lines(filename):
+    with open(filename, 'r', encoding='utf-8') as f:
+        return sum(1 for line in f) - 1
+
+total_rows = _count_lines(output_file)
+print('Total rows to process: {}'.format(total_rows))
+
+# Read CSV file line by line
+with open(output_file, 'r', encoding='utf-8') as csvfile:
+
+    reader = csv.DictReader(csvfile)
+
+    # Process each row
+    for i_row, row in enumerate(tqdm(reader, total=total_rows, desc="Processing rows")):
+
+        if (debug_max_json_conversion_rows is not None) and (i_row >= debug_max_json_conversion_rows):
+            break
+
+        # Datasets
+        dataset_name = row['dataset_name']
+        if dataset_name not in dataset_to_id:
+            dataset_to_id[dataset_name] = next_dataset_id
+            json_data['datasets'][str(next_dataset_id)] = dataset_name
+            next_dataset_id += 1
+        dataset_id = dataset_to_id[dataset_name]
+
+        # Sequences
+        sequence_id_str = row['sequence_id']
+        assert sequence_id_str.startswith(dataset_name + ' : ')
+        if sequence_id_str not in sequence_to_id:
+            sequence_to_id[sequence_id_str] = next_sequence_id
+            json_data['sequences'][str(next_sequence_id)] = sequence_id_str
+            next_sequence_id += 1
+        sequence_id = sequence_to_id[sequence_id_str]
+
+        # Locations
+        location_id_str = row['location_id']
+        assert location_id_str.startswith(dataset_name) # + ' : ')
+        if location_id_str not in location_to_id:
+            location_to_id[location_id_str] = next_location_id
+            json_data['locations'][str(next_location_id)] = location_id_str
+            next_location_id += 1
+        location_id = location_to_id[location_id_str]
+
+        # Taxa
+        taxa_data = {level: _clearnan(row[level]) for level in taxonomy_levels_to_include}
+        taxa_tuple = tuple(taxa_data.items()) # use tuple for hashable key
+        if taxa_tuple not in taxa_to_id:
+            taxa_to_id[taxa_tuple] = next_taxa_id
+            json_data['taxa'][str(next_taxa_id)] = taxa_data
+            next_taxa_id += 1
+        taxa_id = taxa_to_id[taxa_tuple]
+
+        # Image path
+        url_gcp = row['url_gcp']
+        assert url_gcp.startswith(json_data['base_urls']['gcp'])
+        path = url_gcp.replace(json_data['base_urls']['gcp'], '')
+
+        common_name = _clearnan(row['common_name'])
+
+        frame_num = int(row['frame_num'])
+
+        # Image data
+        image_entry = {
+            'dataset': dataset_id,
+            'path': path,
+            'seq': sequence_id,
+            'loc': location_id,
+            'ann_level': row['annotation_level'],
+            'original_label': row['original_label'],
+            'datetime': row['datetime'],
+            'taxon': taxa_id
+        }
+
+        if frame_num >= 0:
+            image_entry['frame_num'] = frame_num
+
+        if len(common_name) > 0:
+            image_entry['common_name'] = common_name
+
+        json_data['images'].append(image_entry)
+
+    # ...for each line
+
+# ...with open(...)
+
+# Save the JSON data
+print('Saving JSON file...')
+with open(output_json_file, 'w', encoding='utf-8') as f:
+    json.dump(json_data, f, indent=1)
+
+print(f'Converted to JSON and saved to {output_json_file}')
+print(f'JSON file size: {os.path.getsize(output_json_file)/(1024*1024*1024):.2f} GB')
+
+# Print summary statistics
+print(f'Total datasets: {len(json_data["datasets"])}')
+print(f'Total sequences: {len(json_data["sequences"])}')
+print(f'Total locations: {len(json_data["locations"])}')
+print(f'Total taxa: {len(json_data["taxa"])}')
+print(f'Total images: {len(json_data["images"])}')
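For orientation only, here is a short sketch (not part of the diff) of how the converted .json index described in the comment block above might be consumed, based on the schema that block documents; the input filename is hypothetical.

import json

# Load the converted LILA index produced by the new '#%% Convert to .json' cell (filename is an example)
with open('lila_image_urls_and_labels.json', 'r', encoding='utf-8') as f:
    lila_index = json.load(f)

# Rebuild full URLs by prepending a preferred cloud's base URL to each image's path suffix
base_url = lila_index['base_urls']['gcp']  # or 'aws' / 'azure'

for im in lila_index['images'][:5]:
    url = base_url + im['path']
    dataset_name = lila_index['datasets'][str(im['dataset'])]
    taxon = lila_index['taxa'][str(im['taxon'])]
    print(dataset_name, url, taxon.get('species'))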
megadetector/data_management/lila/get_lila_annotation_counts.py

@@ -82,9 +82,11 @@ print('Loaded metadata URLs for {} datasets'.format(len(metadata_table)))
 #%% Download and extract metadata for each dataset
 
 for ds_name in metadata_table.keys():
-    metadata_table[ds_name]['json_filename'] =
-
-
+    metadata_table[ds_name]['json_filename'] = \
+        read_metadata_file_for_dataset(ds_name=ds_name,
+                                       metadata_dir=metadata_dir,
+                                       metadata_table=metadata_table,
+                                       preferred_cloud=preferred_cloud)
 
 
 #%% Get category names and counts for each dataset
megadetector/data_management/lila/test_lila_metadata_urls.py

@@ -32,7 +32,7 @@ os.makedirs(metadata_dir,exist_ok=True)
 md_results_dir = os.path.join(lila_local_base,'md_results')
 os.makedirs(md_results_dir,exist_ok=True)
 
-md_results_keys = ['
+md_results_keys = ['mdv5a_results_raw','mdv5b_results_raw',
                    'md1000-redwood_results_raw','md_results_with_rde']
 
 preferred_cloud = None # 'gcp' # 'azure', 'aws'
@@ -54,7 +54,7 @@ print('Loaded metadata URLs for {} datasets'.format(len(metadata_table)))
 
 #%% Download and extract metadata and MD results for each dataset
 
-# Takes ~
+# Takes ~10 minutes if everything needs to be downloaded and unzipped
 
 for ds_name in metadata_table.keys():
 
@@ -63,10 +63,12 @@ for ds_name in metadata_table.keys():
         read_metadata_file_for_dataset(ds_name=ds_name,
                                        metadata_dir=metadata_dir,
                                        metadata_table=metadata_table,
-                                       force_download=force_download
+                                       force_download=force_download,
+                                       preferred_cloud=preferred_cloud)
 
     # Download MD results for this dataset
     for k in md_results_keys:
+
         md_results_url = metadata_table[ds_name][k]
         if md_results_url is None:
             metadata_table[ds_name][k + '_filename'] = None
@@ -75,7 +77,10 @@ for ds_name in metadata_table.keys():
             read_metadata_file_for_dataset(ds_name=ds_name,
                                            metadata_dir=md_results_dir,
                                            json_url=md_results_url,
-                                           force_download=force_download
+                                           force_download=force_download,
+                                           preferred_cloud=preferred_cloud)
+
+    # ...for each MD results file
 
 # ...for each dataset
 
@@ -121,9 +126,11 @@ for ds_name in metadata_table.keys():
 
         url_to_source[test_image_url] = ds_name + ' metadata ({})'.format(cloud)
 
+    # ...for each cloud
+
     # Grab an image from the MegaDetector results
 
-    # k = md_results_keys[
+    # k = md_results_keys[0]
     for k in md_results_keys:
         k_fn = k + '_filename'
         if metadata_table[ds_name][k_fn] is not None:
@@ -153,3 +160,6 @@ for i_url,url in enumerate(urls_to_test):
     if status_codes[i_url] != 200:
         print('Status {} for {} ({})'.format(
             status_codes[i_url],url,url_to_source[url]))
+
+print('Tested {} URLs'.format(len(urls_to_test)))
+
megadetector/data_management/mewc_to_md.py

@@ -35,6 +35,7 @@ def mewc_to_md(mewc_input_folder,
                mewc_out_filename='mewc_out.csv',
                md_out_filename='md_out.json'):
     """
+    Converts the output of the MEWC inference scripts to the MD output format.
 
     Args:
         mewc_input_folder (str): the folder we'll search for MEWC output files
@@ -43,6 +44,10 @@ def mewc_to_md(mewc_input_folder,
             .json file, typically the prefix used to mount the image folder.
         category_name_column (str, optional): column in the MEWC results .csv to use for
             category naming.
+        mewc_out_filename (str, optional): MEWC-formatted .csv file that should be
+            in [mewc_input_folder]
+        md_out_filename (str, optional): MD-formatted .json file (without classification
+            information) that should be in [mewc_input_folder]
 
     Returns:
         dict: an MD-formatted dict, the same as what's written to [output_file]
megadetector/data_management/ocr_tools.py

@@ -39,6 +39,7 @@ Known limitations:
 
 """
 
+
 #%% Constants and imports
 
 import os
@@ -64,7 +65,7 @@ from megadetector.visualization import visualization_utils as vis_utils
 #
 # Also install tesseract from: https://github.com/UB-Mannheim/tesseract/wiki, and add
 # the installation dir to your path (on Windows, typically C:\Program Files (x86)\Tesseract-OCR)
-import pytesseract
+import pytesseract # type: ignore
 
 
 #%% Extraction options
@@ -546,8 +547,8 @@ def try_get_datetime_from_image(filename,include_crops=False,options=None):
     until we find a datetime.
 
     Args:
-
-        datetime information.
+        filename (Image or str): the PIL Image object or image filename in which we should look
+            for datetime information.
         include_crops (bool, optional): whether to include cropped images in the return dict (set
             this to False if you're worried about size and you're processing a zillion images)
         options (DatetimeExtractionOptions or list, optional): OCR parameters, either one
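A brief, hedged usage sketch for the try_get_datetime_from_image() signature shown in the hunk above; the image path is an example, and the exact structure of the returned dict is not shown in this diff.

from megadetector.data_management.ocr_tools import try_get_datetime_from_image

# Attempt OCR-based datetime extraction from a camera trap image (path is hypothetical)
ocr_result = try_get_datetime_from_image('/data/images/IMG_0001.JPG',
                                         include_crops=False,
                                         options=None)
print(ocr_result)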
megadetector/data_management/read_exif.py

@@ -36,7 +36,8 @@ from megadetector.data_management.cct_json_utils import write_object_with_serial
 debug_max_images = None
 
 minimal_exif_tags = \
-    ['DateTime','Model','Make','ExifImageWidth','ExifImageHeight',
+    ['DateTime','Model','Make','ExifImageWidth','ExifImageHeight',
+     'DateTimeOriginal','Orientation', 'GPSInfo']
 
 
 #%% Options
@@ -186,6 +187,7 @@ def read_pil_exif(im,options=None):
     Args:
         im (str or PIL.Image.Image): image (as a filename or an Image object) from which
             we should read EXIF data.
+        options (ReadExifOptions, optional): see ReadExifOptions
 
     Returns:
         dict: a dictionary mapping EXIF tag names to their values
@@ -288,6 +290,12 @@ def format_datetime_as_exif_datetime_string(dt):
     """
     Returns a Python datetime object rendered using the standard EXIF datetime
     string format ('%Y:%m:%d %H:%M:%S')
+
+    Args:
+        dt (datetime): datetime object to format
+
+    Returns:
+        str: [dt] as a string in standard EXIF format
     """
 
     return datetime.strftime(dt, '%Y:%m:%d %H:%M:%S')
@@ -348,6 +356,10 @@ def read_exif_tags_for_image(file_path,options=None):
     """
     Get relevant fields from EXIF data for an image
 
+    Args:
+        file_path (str): image from which we should read EXIF data
+        options (ReadExifOptions, optional): see ReadExifOptions
+
     Returns:
         dict: a dict with fields 'status' (str) and 'tags'. The exact format of 'tags' depends on
             options (ReadExifOptions, optional): parameters controlling metadata extraction
@@ -656,7 +668,11 @@ def _write_exif_results(results,output_file):
 # ..._write_exif_results(...)
 
 
-def read_exif_from_folder(input_folder,
+def read_exif_from_folder(input_folder,
+                          output_file=None,
+                          options=None,
+                          filenames=None,
+                          recursive=True):
     """
     Read EXIF data for a folder of images.
 
@@ -670,7 +686,6 @@ def read_exif_from_folder(input_folder,output_file=None,options=None,filenames=N
             a list of absolute filenames (if [input_folder] is None)
         recursive (bool, optional): whether to recurse into [input_folder], not relevant if [input_folder]
             is None.
-        verbose (bool, optional): enable additional debug output
 
     Returns:
         list: list of dicts, each of which contains EXIF information for one images. Fields include at least:
@@ -704,7 +719,7 @@ def read_exif_from_folder(input_folder,output_file=None,options=None,filenames=N
     try:
         with open(output_file, 'a') as f:
             if not f.writable():
-                raise
+                raise OSError('File not writable')
     except Exception:
         print('Could not write to file {}'.format(output_file))
         raise
@@ -743,7 +758,7 @@ def exif_results_to_cct(exif_results,cct_output_file=None,options=None):
     Args:
         exif_results (str or list): the filename (or loaded list) containing the results
             from read_exif_from_folder
-        cct_output_file (str,optional): the filename to which we should write
+        cct_output_file (str, optional): the filename to which we should write
            COCO-Camera-Traps-formatted data
        options (ExifResultsToCCTOptions, optional): options guiding the generation
            of the CCT file, particularly location mapping