megadetector 5.0.29__py3-none-any.whl → 10.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (95)
  1. megadetector/classification/efficientnet/model.py +8 -8
  2. megadetector/classification/efficientnet/utils.py +6 -5
  3. megadetector/classification/prepare_classification_script_mc.py +3 -3
  4. megadetector/data_management/annotations/annotation_constants.py +0 -1
  5. megadetector/data_management/camtrap_dp_to_coco.py +34 -1
  6. megadetector/data_management/cct_json_utils.py +2 -2
  7. megadetector/data_management/coco_to_yolo.py +22 -5
  8. megadetector/data_management/databases/add_width_and_height_to_db.py +85 -12
  9. megadetector/data_management/databases/combine_coco_camera_traps_files.py +2 -2
  10. megadetector/data_management/databases/integrity_check_json_db.py +29 -15
  11. megadetector/data_management/generate_crops_from_cct.py +50 -1
  12. megadetector/data_management/labelme_to_coco.py +4 -2
  13. megadetector/data_management/labelme_to_yolo.py +82 -2
  14. megadetector/data_management/lila/generate_lila_per_image_labels.py +276 -18
  15. megadetector/data_management/lila/get_lila_annotation_counts.py +5 -3
  16. megadetector/data_management/lila/lila_common.py +3 -0
  17. megadetector/data_management/lila/test_lila_metadata_urls.py +15 -5
  18. megadetector/data_management/mewc_to_md.py +5 -0
  19. megadetector/data_management/ocr_tools.py +4 -3
  20. megadetector/data_management/read_exif.py +20 -5
  21. megadetector/data_management/remap_coco_categories.py +66 -4
  22. megadetector/data_management/remove_exif.py +50 -1
  23. megadetector/data_management/rename_images.py +3 -3
  24. megadetector/data_management/resize_coco_dataset.py +563 -95
  25. megadetector/data_management/yolo_output_to_md_output.py +131 -2
  26. megadetector/data_management/yolo_to_coco.py +140 -5
  27. megadetector/detection/change_detection.py +4 -3
  28. megadetector/detection/pytorch_detector.py +60 -22
  29. megadetector/detection/run_detector.py +225 -25
  30. megadetector/detection/run_detector_batch.py +42 -16
  31. megadetector/detection/run_inference_with_yolov5_val.py +12 -2
  32. megadetector/detection/run_tiled_inference.py +1 -0
  33. megadetector/detection/video_utils.py +53 -24
  34. megadetector/postprocessing/add_max_conf.py +4 -0
  35. megadetector/postprocessing/categorize_detections_by_size.py +1 -1
  36. megadetector/postprocessing/classification_postprocessing.py +55 -20
  37. megadetector/postprocessing/combine_batch_outputs.py +3 -2
  38. megadetector/postprocessing/compare_batch_results.py +64 -10
  39. megadetector/postprocessing/convert_output_format.py +12 -8
  40. megadetector/postprocessing/create_crop_folder.py +137 -10
  41. megadetector/postprocessing/load_api_results.py +26 -8
  42. megadetector/postprocessing/md_to_coco.py +4 -4
  43. megadetector/postprocessing/md_to_labelme.py +18 -7
  44. megadetector/postprocessing/merge_detections.py +5 -0
  45. megadetector/postprocessing/postprocess_batch_results.py +6 -3
  46. megadetector/postprocessing/remap_detection_categories.py +55 -2
  47. megadetector/postprocessing/render_detection_confusion_matrix.py +9 -6
  48. megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +2 -2
  49. megadetector/taxonomy_mapping/map_new_lila_datasets.py +3 -4
  50. megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +40 -19
  51. megadetector/taxonomy_mapping/preview_lila_taxonomy.py +1 -1
  52. megadetector/taxonomy_mapping/species_lookup.py +123 -41
  53. megadetector/utils/ct_utils.py +133 -113
  54. megadetector/utils/md_tests.py +93 -13
  55. megadetector/utils/path_utils.py +137 -107
  56. megadetector/utils/split_locations_into_train_val.py +2 -2
  57. megadetector/utils/string_utils.py +7 -7
  58. megadetector/utils/url_utils.py +81 -58
  59. megadetector/utils/wi_utils.py +46 -17
  60. megadetector/visualization/plot_utils.py +13 -9
  61. megadetector/visualization/render_images_with_thumbnails.py +2 -1
  62. megadetector/visualization/visualization_utils.py +94 -46
  63. megadetector/visualization/visualize_db.py +36 -9
  64. megadetector/visualization/visualize_detector_output.py +4 -4
  65. {megadetector-5.0.29.dist-info → megadetector-10.0.0.dist-info}/METADATA +135 -135
  66. megadetector-10.0.0.dist-info/RECORD +139 -0
  67. {megadetector-5.0.29.dist-info → megadetector-10.0.0.dist-info}/licenses/LICENSE +0 -0
  68. {megadetector-5.0.29.dist-info → megadetector-10.0.0.dist-info}/top_level.txt +0 -0
  69. megadetector/api/batch_processing/api_core/__init__.py +0 -0
  70. megadetector/api/batch_processing/api_core/batch_service/__init__.py +0 -0
  71. megadetector/api/batch_processing/api_core/batch_service/score.py +0 -438
  72. megadetector/api/batch_processing/api_core/server.py +0 -294
  73. megadetector/api/batch_processing/api_core/server_api_config.py +0 -97
  74. megadetector/api/batch_processing/api_core/server_app_config.py +0 -55
  75. megadetector/api/batch_processing/api_core/server_batch_job_manager.py +0 -220
  76. megadetector/api/batch_processing/api_core/server_job_status_table.py +0 -149
  77. megadetector/api/batch_processing/api_core/server_orchestration.py +0 -360
  78. megadetector/api/batch_processing/api_core/server_utils.py +0 -88
  79. megadetector/api/batch_processing/api_core_support/__init__.py +0 -0
  80. megadetector/api/batch_processing/api_core_support/aggregate_results_manually.py +0 -46
  81. megadetector/api/batch_processing/api_support/__init__.py +0 -0
  82. megadetector/api/batch_processing/api_support/summarize_daily_activity.py +0 -152
  83. megadetector/api/batch_processing/data_preparation/__init__.py +0 -0
  84. megadetector/api/synchronous/__init__.py +0 -0
  85. megadetector/api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
  86. megadetector/api/synchronous/api_core/animal_detection_api/api_backend.py +0 -151
  87. megadetector/api/synchronous/api_core/animal_detection_api/api_frontend.py +0 -263
  88. megadetector/api/synchronous/api_core/animal_detection_api/config.py +0 -35
  89. megadetector/api/synchronous/api_core/tests/__init__.py +0 -0
  90. megadetector/api/synchronous/api_core/tests/load_test.py +0 -109
  91. megadetector/utils/azure_utils.py +0 -178
  92. megadetector/utils/sas_blob_utils.py +0 -513
  93. megadetector-5.0.29.dist-info/RECORD +0 -163
  94. /megadetector/{api/batch_processing/__init__.py → __init__.py} +0 -0
  95. {megadetector-5.0.29.dist-info → megadetector-10.0.0.dist-info}/WHEEL +0 -0
megadetector/data_management/labelme_to_yolo.py
@@ -10,12 +10,14 @@ Create YOLO .txt files in a folder containing labelme .json files.
 
 import os
 import json
+import argparse
 
 from multiprocessing.pool import Pool, ThreadPool
 from functools import partial
 from tqdm import tqdm
 
 from megadetector.utils.path_utils import recursive_file_list
+from megadetector.utils.ct_utils import write_json
 
 
 #%% Main function
@@ -33,6 +35,13 @@ def labelme_file_to_yolo_file(labelme_file,
     this function no-ops (i.e., does not generate a YOLO file).
 
     overwrite_behavior should be 'skip' or 'overwrite' (default).
+
+    Args:
+        labelme_file (str): .json file to convert
+        category_name_to_category_id (dict): category name --> ID mapping
+        yolo_file (str, optional): output .txt file defaults to s/json/txt
+        required_token (str, optional): only process filenames containing this token
+        overwrite_behavior (str, optional): "skip" or "overwrite"
     """
 
     result = {}
@@ -150,6 +159,15 @@ def labelme_folder_to_yolo(labelme_folder,
     'category_name_to_category_id', whether it was passed in or constructed
     'image_results': a list of results for each image (converted, skipped, error)
 
+    Args:
+        labelme_folder (str): folder of .json files to convert
+        category_name_to_category_id (dict): category name --> ID mapping
+        required_token (str, optional): only process filenames containing this token
+        overwrite_behavior (str, optional): "skip" or "overwrite"
+        relative_filenames_to_convert (list of str, optional): only process filenames on this list
+        n_workers (int, optional): parallelism level
+        use_threads (bool, optional): whether to use threads (True) or processes (False) for
+            parallelism
     """
 
     if relative_filenames_to_convert is not None:
@@ -236,7 +254,7 @@ def labelme_folder_to_yolo(labelme_folder,
         finally:
             pool.close()
             pool.join()
-            print("Pool closed and joined for labelme conversion to YOLO")
+            print('Pool closed and joined for labelme conversion to YOLO')
 
     assert len(valid_labelme_files_relative) == len(image_results)
 
@@ -275,4 +293,66 @@ if False:
 
 #%% Command-line driver
 
-# TODO
+def main():
+    """
+    Command-line interface to convert Labelme JSON files to YOLO format
+    """
+
+    parser = argparse.ArgumentParser(
+        description='Convert a folder of Labelme .json files to YOLO .txt format'
+    )
+    parser.add_argument(
+        'labelme_folder',
+        type=str,
+        help='Folder of Labelme .json files to convert'
+    )
+    parser.add_argument(
+        '--output_category_file',
+        type=str,
+        default=None,
+        help='Path to save the generated category mapping (.json)'
+    )
+    parser.add_argument(
+        '--required_token',
+        type=str,
+        default=None,
+        help='Only process files containing this token as a key in the Labelme JSON dict'
+    )
+    parser.add_argument(
+        '--overwrite_behavior',
+        type=str,
+        default='overwrite',
+        choices=['skip', 'overwrite'],
+        help="Behavior if YOLO .txt files exist (default: 'overwrite')"
+    )
+    parser.add_argument(
+        '--n_workers',
+        type=int,
+        default=1,
+        help='Number of workers for parallel processing (default: 1)'
+    )
+    parser.add_argument(
+        '--use_processes',
+        action='store_true',
+        help='Use processes instead of threads for parallelization (defaults to threads)'
+    )
+
+    args = parser.parse_args()
+
+    results = labelme_folder_to_yolo(
+        labelme_folder=args.labelme_folder,
+        category_name_to_category_id=None,
+        required_token=args.required_token,
+        overwrite_behavior=args.overwrite_behavior,
+        relative_filenames_to_convert=None,
+        n_workers=args.n_workers,
+        use_threads=(not args.use_processes)
+    )
+
+    if args.output_category_file:
+        category_map = results['category_name_to_category_id']
+        write_json(args.output_category_file,category_map)
+        print(f'Saved category mapping to {args.output_category_file}')
+
+if __name__ == '__main__':
+    main()
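
(For reference, a minimal usage sketch of the converter behind the new driver; the folder path below is hypothetical, and passing category_name_to_category_id=None lets the function build the mapping, just as main() does above.)

# Illustrative sketch only; paths are hypothetical
from megadetector.data_management.labelme_to_yolo import labelme_folder_to_yolo

results = labelme_folder_to_yolo('/data/labelme_annotations',
                                 category_name_to_category_id=None,
                                 overwrite_behavior='skip',
                                 n_workers=4,
                                 use_threads=True)
print('Converted {} Labelme files'.format(len(results['image_results'])))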
megadetector/data_management/lila/generate_lila_per_image_labels.py
@@ -67,6 +67,18 @@ if debug_max_images_per_dataset > 0:
     print('Running in debug mode')
     output_file = output_file.replace('.csv','_debug.csv')
 
+taxonomy_levels_to_include = \
+    ['kingdom','phylum','subphylum','superclass','class','subclass','infraclass','superorder','order',
+     'suborder','infraorder','superfamily','family','subfamily','tribe','genus','subgenus',
+     'species','subspecies','variety']
+
+def _clearnan(v):
+    if isinstance(v,float):
+        assert np.isnan(v)
+        v = ''
+    assert isinstance(v,str)
+    return v
+
 
 #%% Download and parse the metadata file
 
@@ -87,7 +99,7 @@ for ds_name in metadata_table.keys():
 
 #%% Load taxonomy data
 
-taxonomy_df = read_lila_taxonomy_mapping(metadata_dir)
+taxonomy_df = read_lila_taxonomy_mapping(metadata_dir, force_download=True)
 
 
 #%% Build a dictionary that maps each [dataset,query] pair to the full taxonomic label set
@@ -113,22 +125,10 @@ header = ['dataset_name','url_gcp','url_aws','url_azure',
           'image_id','sequence_id','location_id','frame_num',
           'original_label','scientific_name','common_name','datetime','annotation_level']
 
-taxonomy_levels_to_include = \
-    ['kingdom','phylum','subphylum','superclass','class','subclass','infraclass','superorder','order',
-     'suborder','infraorder','superfamily','family','subfamily','tribe','genus','species','subspecies',
-     'variety']
-
 header.extend(taxonomy_levels_to_include)
 
 missing_annotations = set()
 
-def _clearnan(v):
-    if isinstance(v,float):
-        assert np.isnan(v)
-        v = ''
-    assert isinstance(v,str)
-    return v
-
 with open(output_file,'w',encoding='utf-8',newline='') as f:
 
     csv_writer = csv.writer(f)
@@ -361,7 +361,10 @@ print('Read {} rows from {}'.format(len(df),output_file))
 
 #%% Do some post-hoc integrity checking
 
-# Takes ~10 minutes without using apply()
+# Takes ~5 minutes with apply(), or ~10 minutes without apply()
+#
+# Using apply() is faster, but more annoying to debug.
+use_pandas_apply_for_integrity_checking = True
 
 tqdm.pandas()
 
@@ -393,8 +396,7 @@ def _check_row(row):
     ds_name = row['dataset_name']
     dataset_name_to_locations[ds_name].add(row['location_id'])
 
-# Faster, but more annoying to debug
-if True:
+if use_pandas_apply_for_integrity_checking:
 
     df.progress_apply(_check_row, axis=1)
 
@@ -448,8 +450,9 @@ for ds_name in metadata_table.keys():
         empty_rows = empty_rows.sample(n=n_empty_images_per_dataset)
     images_to_download.extend(empty_rows.to_dict('records'))
 
+    # All LILA datasets have non-empty images
     if len(non_empty_rows) == 0:
-        print('No non-empty images available for {}'.format(ds_name))
+        raise ValueError('No non-empty images available for {}'.format(ds_name))
     elif len(non_empty_rows) > n_non_empty_images_per_dataset:
         non_empty_rows = non_empty_rows.sample(n=n_non_empty_images_per_dataset)
     images_to_download.extend(non_empty_rows.to_dict('records'))
@@ -463,7 +466,7 @@ print('Selected {} total images'.format(len(images_to_download)))
 
 # Expect a few errors for images with human or vehicle labels (or things like "ignore" that *could* be humans)
 
-preferred_cloud = 'aws'
+preferred_cloud = 'gcp'
 
 url_to_target_file = {}
 
@@ -484,6 +487,19 @@ for i_image,image in tqdm(enumerate(images_to_download),total=len(images_to_down
 download_results = parallel_download_urls(url_to_target_file,verbose=False,overwrite=True,
                                           n_workers=20,pool_type='thread')
 
+# 10-20 errors is normal; they should all be images that are labeled as "human"
+errors = []
+
+for r in download_results:
+    if r['status'] != 'success':
+        errors.append(r)
+
+assert len(download_results) == len(url_to_target_file)
+print('Errors on {} of {} downloads:\n'.format(len(errors),len(download_results)))
+
+for err in errors:
+    print(err['url'])
+
 
 #%% Write preview HTML
 
@@ -515,3 +531,245 @@ open_file(html_filename)
 zipped_output_file = zip_file(output_file,verbose=True,overwrite=True)
 
 print('Zipped {} to {}'.format(output_file,zipped_output_file))
+
+
+#%% Convert to .json
+
+"""
+The .csv file "output_file" (already loaded into the variable "df" at this point) has the following columns:
+
+dataset_name,url_gcp,url_aws,url_azure,image_id,sequence_id,location_id,frame_num,original_label,scientific_name,common_name,datetime,annotation_level,kingdom,phylum,subphylum,superclass,class,subclass,infraclass,superorder,order,suborder,infraorder,superfamily,family,subfamily,tribe,genus,subgenus,species,subspecies,variety
+
+Each row in the .csv represents an image. The URL columns represent the location of that
+image on three different clouds; for a given image, the value of those columns differs only
+in the prefix. The columns starting with "kingdom" represent a taxonomic wildlife identifier. Not
+all rows have values in all of these columns; some rows represent non-wildlife images where all of these
+columns are blank.
+
+This cell converts this to a .json dictionary, with the following top-level keys:
+
+## datasets (dict)
+
+A dict mapping integer IDs to strings.
+
+Each unique value in the "dataset_name" column should become an element in this dict with a unique ID.
+
+## sequences (dict)
+
+A dict mapping integer IDs to strings.
+
+Each unique value in the "sequence_id" column should become an element in this dict with a unique ID.
+
+## locations (dict)
+
+A dict mapping integer IDs to strings.
+
+Each unique value in the "location_id" column should become an element in this dict with a unique ID.
+
+## base_urls (dict)
+
+This key should point to the following dict:
+
+{
+    "gcp": "https://storage.googleapis.com/public-datasets-lila/",
+    "aws": "http://us-west-2.opendata.source.coop.s3.amazonaws.com/agentmorris/lila-wildlife/",
+    "azure": "https://lilawildlife.blob.core.windows.net/lila-wildlife/",
+}
+
+All values in the url_gcp, url_aws, and url_azure columns start with these values, respectively.
+
+## taxa (dict)
+
+A dict mapping integer IDs to dicts, where each dict has the fields:
+
+kingdom,phylum,subphylum,superclass,class,subclass,infraclass,superorder,order,suborder,infraorder,superfamily,family,subfamily,tribe,genus,subgenus,species,subspecies,variety
+
+The value of each of these fields in each row is either a string or None.
+
+## images (list)
+
+A list of images, where each image is a dict with the following fields:
+
+### dataset (int)
+
+The integer ID corresponding to the dataset_name column for this image
+
+### path (str)
+
+The suffix for this image's URL, which should be the same across the three URL columns.
+
+### seq (int)
+
+The integer ID corresponding to the sequence_id column for this image
+
+### loc (int)
+
+The integer ID corresponding to the location_id column for this image
+
+### frame_num
+
+The value of the frame_num column for this image, unless the original value was -1,
+in which case this is omitted.
+
+### original_label
+
+The value of the original_label column for this image
+
+### common_name
+
+The value of the common_name column for this image, if not empty
+
+### datetime
+
+The value of the datetime column for this image
+
+### ann_level
+
+The value of the annotation_level column for this image
+
+### taxon
+
+The integer ID corresponding to the taxonomic identifier columns for this image
+
+--
+
+The original .csv file is large (~15GB); this may impact the implementation of the .json conversion. Speed of
+conversion is not a priority.
+
+"""
+
+print('Converting to JSON...')
+
+output_json_file = output_file.replace('.csv', '.json')
+
+json_data = {}
+
+# Create mappings for datasets, sequences, and locations
+dataset_to_id = {}
+sequence_to_id = {}
+location_to_id = {}
+taxa_to_id = {}
+
+next_dataset_id = 0
+next_sequence_id = 0
+next_location_id = 0
+next_taxa_id = 0
+
+json_data['datasets'] = {}
+json_data['sequences'] = {}
+json_data['locations'] = {}
+json_data['taxa'] = {}
+
+json_data['base_urls'] = {
+    "gcp": "https://storage.googleapis.com/public-datasets-lila/",
+    "aws": "http://us-west-2.opendata.source.coop.s3.amazonaws.com/agentmorris/lila-wildlife/",
+    "azure": "https://lilawildlife.blob.core.windows.net/lila-wildlife/",
+}
+
+json_data['images'] = []
+
+debug_max_json_conversion_rows = None
+
+print('Counting rows in .csv file...')
+
+# Get total number of lines for progress bar (optional, but helpful for large files)
+def _count_lines(filename):
+    with open(filename, 'r', encoding='utf-8') as f:
+        return sum(1 for line in f) - 1
+
+total_rows = _count_lines(output_file)
+print('Total rows to process: {}'.format(total_rows))
+
+# Read CSV file line by line
+with open(output_file, 'r', encoding='utf-8') as csvfile:
+
+    reader = csv.DictReader(csvfile)
+
+    # Process each row
+    for i_row, row in enumerate(tqdm(reader, total=total_rows, desc="Processing rows")):
+
+        if (debug_max_json_conversion_rows is not None) and (i_row >= debug_max_json_conversion_rows):
+            break
+
+        # Datasets
+        dataset_name = row['dataset_name']
+        if dataset_name not in dataset_to_id:
+            dataset_to_id[dataset_name] = next_dataset_id
+            json_data['datasets'][str(next_dataset_id)] = dataset_name
+            next_dataset_id += 1
+        dataset_id = dataset_to_id[dataset_name]
+
+        # Sequences
+        sequence_id_str = row['sequence_id']
+        assert sequence_id_str.startswith(dataset_name + ' : ')
+        if sequence_id_str not in sequence_to_id:
+            sequence_to_id[sequence_id_str] = next_sequence_id
+            json_data['sequences'][str(next_sequence_id)] = sequence_id_str
+            next_sequence_id += 1
+        sequence_id = sequence_to_id[sequence_id_str]
+
+        # Locations
+        location_id_str = row['location_id']
+        assert location_id_str.startswith(dataset_name) # + ' : ')
+        if location_id_str not in location_to_id:
+            location_to_id[location_id_str] = next_location_id
+            json_data['locations'][str(next_location_id)] = location_id_str
+            next_location_id += 1
+        location_id = location_to_id[location_id_str]
+
+        # Taxa
+        taxa_data = {level: _clearnan(row[level]) for level in taxonomy_levels_to_include}
+        taxa_tuple = tuple(taxa_data.items()) # use tuple for hashable key
+        if taxa_tuple not in taxa_to_id:
+            taxa_to_id[taxa_tuple] = next_taxa_id
+            json_data['taxa'][str(next_taxa_id)] = taxa_data
+            next_taxa_id += 1
+        taxa_id = taxa_to_id[taxa_tuple]
+
+        # Image path
+        url_gcp = row['url_gcp']
+        assert url_gcp.startswith(json_data['base_urls']['gcp'])
+        path = url_gcp.replace(json_data['base_urls']['gcp'], '')
+
+        common_name = _clearnan(row['common_name'])
+
+        frame_num = int(row['frame_num'])
+
+        # Image data
+        image_entry = {
+            'dataset': dataset_id,
+            'path': path,
+            'seq': sequence_id,
+            'loc': location_id,
+            'ann_level': row['annotation_level'],
+            'original_label': row['original_label'],
+            'datetime': row['datetime'],
+            'taxon': taxa_id
+        }
+
+        if frame_num >= 0:
+            image_entry['frame_num'] = frame_num
+
+        if len(common_name) > 0:
+            image_entry['common_name'] = common_name
+
+        json_data['images'].append(image_entry)
+
+    # ...for each line
+
+# ...with open(...)
+
+# Save the JSON data
+print('Saving JSON file...')
+with open(output_json_file, 'w', encoding='utf-8') as f:
+    json.dump(json_data, f, indent=1)
+
+print(f'Converted to JSON and saved to {output_json_file}')
+print(f'JSON file size: {os.path.getsize(output_json_file)/(1024*1024*1024):.2f} GB')
+
+# Print summary statistics
+print(f'Total datasets: {len(json_data["datasets"])}')
+print(f'Total sequences: {len(json_data["sequences"])}')
+print(f'Total locations: {len(json_data["locations"])}')
+print(f'Total taxa: {len(json_data["taxa"])}')
+print(f'Total images: {len(json_data["images"])}')
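
(A hedged read-back sketch for the .json produced above, using only the schema documented in that cell's docstring; the filename is a placeholder for whatever output_json_file resolves to.)

# Illustrative sketch only; the filename is a placeholder for output_json_file
import json

with open('lila_image_urls_and_labels.json', 'r', encoding='utf-8') as f:
    d = json.load(f)

im = d['images'][0]
url = d['base_urls']['gcp'] + im['path']           # rebuild the full GCP URL for this image
dataset_name = d['datasets'][str(im['dataset'])]   # integer IDs are stored as string keys
taxon = d['taxa'][str(im['taxon'])]
print(dataset_name, url, taxon['species'])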
megadetector/data_management/lila/get_lila_annotation_counts.py
@@ -82,9 +82,11 @@ print('Loaded metadata URLs for {} datasets'.format(len(metadata_table)))
 #%% Download and extract metadata for each dataset
 
 for ds_name in metadata_table.keys():
-    metadata_table[ds_name]['json_filename'] = read_metadata_file_for_dataset(ds_name=ds_name,
-                                                                              metadata_dir=metadata_dir,
-                                                                              metadata_table=metadata_table)
+    metadata_table[ds_name]['json_filename'] = \
+        read_metadata_file_for_dataset(ds_name=ds_name,
+                                       metadata_dir=metadata_dir,
+                                       metadata_table=metadata_table,
+                                       preferred_cloud=preferred_cloud)
 
 
 #%% Get category names and counts for each dataset
megadetector/data_management/lila/lila_common.py
@@ -241,6 +241,9 @@ def read_metadata_file_for_dataset(ds_name,
 
     """
 
+    if preferred_cloud is None:
+        preferred_cloud = 'gcp'
+
     assert preferred_cloud in lila_base_urls.keys()
 
     if json_url is None:
megadetector/data_management/lila/test_lila_metadata_urls.py
@@ -32,7 +32,7 @@ os.makedirs(metadata_dir,exist_ok=True)
 md_results_dir = os.path.join(lila_local_base,'md_results')
 os.makedirs(md_results_dir,exist_ok=True)
 
-md_results_keys = ['mdv4_results_raw','mdv5a_results_raw','mdv5b_results_raw',
+md_results_keys = ['mdv5a_results_raw','mdv5b_results_raw',
                    'md1000-redwood_results_raw','md_results_with_rde']
 
 preferred_cloud = None # 'gcp' # 'azure', 'aws'
@@ -54,7 +54,7 @@ print('Loaded metadata URLs for {} datasets'.format(len(metadata_table)))
 
 #%% Download and extract metadata and MD results for each dataset
 
-# Takes ~60 seconds if everything needs to be downloaded and unzipped
+# Takes ~10 minutes if everything needs to be downloaded and unzipped
 
 for ds_name in metadata_table.keys():
 
@@ -63,10 +63,12 @@ for ds_name in metadata_table.keys():
         read_metadata_file_for_dataset(ds_name=ds_name,
                                        metadata_dir=metadata_dir,
                                        metadata_table=metadata_table,
-                                       force_download=force_download)
+                                       force_download=force_download,
+                                       preferred_cloud=preferred_cloud)
 
     # Download MD results for this dataset
     for k in md_results_keys:
+
         md_results_url = metadata_table[ds_name][k]
         if md_results_url is None:
             metadata_table[ds_name][k + '_filename'] = None
@@ -75,7 +77,10 @@ for ds_name in metadata_table.keys():
             read_metadata_file_for_dataset(ds_name=ds_name,
                                            metadata_dir=md_results_dir,
                                            json_url=md_results_url,
-                                           force_download=force_download)
+                                           force_download=force_download,
+                                           preferred_cloud=preferred_cloud)
+
+    # ...for each MD results file
 
 # ...for each dataset
 
@@ -121,9 +126,11 @@ for ds_name in metadata_table.keys():
 
         url_to_source[test_image_url] = ds_name + ' metadata ({})'.format(cloud)
 
+    # ...for each cloud
+
     # Grab an image from the MegaDetector results
 
-    # k = md_results_keys[2]
+    # k = md_results_keys[0]
     for k in md_results_keys:
         k_fn = k + '_filename'
         if metadata_table[ds_name][k_fn] is not None:
@@ -153,3 +160,6 @@ for i_url,url in enumerate(urls_to_test):
     if status_codes[i_url] != 200:
         print('Status {} for {} ({})'.format(
             status_codes[i_url],url,url_to_source[url]))
+
+print('Tested {} URLs'.format(len(urls_to_test)))
+
megadetector/data_management/mewc_to_md.py
@@ -35,6 +35,7 @@ def mewc_to_md(mewc_input_folder,
                mewc_out_filename='mewc_out.csv',
                md_out_filename='md_out.json'):
     """
+    Converts the output of the MEWC inference scripts to the MD output format.
 
     Args:
         mewc_input_folder (str): the folder we'll search for MEWC output files
@@ -43,6 +44,10 @@ def mewc_to_md(mewc_input_folder,
             .json file, typically the prefix used to mount the image folder.
         category_name_column (str, optional): column in the MEWC results .csv to use for
             category naming.
+        mewc_out_filename (str, optional): MEWC-formatted .csv file that should be
+            in [mewc_input_folder]
+        md_out_filename (str, optional): MD-formatted .json file (without classification
+            information) that should be in [mewc_input_folder]
 
     Returns:
         dict: an MD-formatted dict, the same as what's written to [output_file]
megadetector/data_management/ocr_tools.py
@@ -39,6 +39,7 @@ Known limitations:
 
 """
 
+
 #%% Constants and imports
 
 import os
@@ -64,7 +65,7 @@ from megadetector.visualization import visualization_utils as vis_utils
 #
 # Also install tesseract from: https://github.com/UB-Mannheim/tesseract/wiki, and add
 # the installation dir to your path (on Windows, typically C:\Program Files (x86)\Tesseract-OCR)
-import pytesseract
+import pytesseract # type: ignore
 
 
 #%% Extraction options
@@ -546,8 +547,8 @@ def try_get_datetime_from_image(filename,include_crops=False,options=None):
     until we find a datetime.
 
     Args:
-        image (Image or str): the PIL Image object or image filename in which we should look for
-            datetime information.
+        filename (Image or str): the PIL Image object or image filename in which we should look
+            for datetime information.
         include_crops (bool, optional): whether to include cropped images in the return dict (set
             this to False if you're worried about size and you're processing a zillion images)
         options (DatetimeExtractionOptions or list, optional): OCR parameters, either one
megadetector/data_management/read_exif.py
@@ -36,7 +36,8 @@ from megadetector.data_management.cct_json_utils import write_object_with_serial
 debug_max_images = None
 
 minimal_exif_tags = \
-    ['DateTime','Model','Make','ExifImageWidth','ExifImageHeight','DateTimeOriginal','Orientation']
+    ['DateTime','Model','Make','ExifImageWidth','ExifImageHeight',
+     'DateTimeOriginal','Orientation', 'GPSInfo']
 
 
 #%% Options
@@ -186,6 +187,7 @@ def read_pil_exif(im,options=None):
     Args:
         im (str or PIL.Image.Image): image (as a filename or an Image object) from which
             we should read EXIF data.
+        options (ReadExifOptions, optional): see ReadExifOptions
 
     Returns:
         dict: a dictionary mapping EXIF tag names to their values
@@ -288,6 +290,12 @@ def format_datetime_as_exif_datetime_string(dt):
     """
     Returns a Python datetime object rendered using the standard EXIF datetime
     string format ('%Y:%m:%d %H:%M:%S')
+
+    Args:
+        dt (datetime): datetime object to format
+
+    Returns:
+        str: [dt] as a string in standard EXIF format
     """
 
     return datetime.strftime(dt, '%Y:%m:%d %H:%M:%S')
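
(A quick illustration of the EXIF datetime format documented above; the expected output follows directly from the '%Y:%m:%d %H:%M:%S' format string.)

# Illustration only
from datetime import datetime
from megadetector.data_management.read_exif import format_datetime_as_exif_datetime_string

s = format_datetime_as_exif_datetime_string(datetime(2024, 6, 1, 13, 45, 0))
print(s)  # '2024:06:01 13:45:00'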
@@ -348,6 +356,10 @@ def read_exif_tags_for_image(file_path,options=None):
     """
     Get relevant fields from EXIF data for an image
 
+    Args:
+        file_path (str): image from which we should read EXIF data
+        options (ReadExifOptions, optional): see ReadExifOptions
+
     Returns:
         dict: a dict with fields 'status' (str) and 'tags'. The exact format of 'tags' depends on
         options (ReadExifOptions, optional): parameters controlling metadata extraction
@@ -656,7 +668,11 @@ def _write_exif_results(results,output_file):
 # ..._write_exif_results(...)
 
 
-def read_exif_from_folder(input_folder,output_file=None,options=None,filenames=None,recursive=True):
+def read_exif_from_folder(input_folder,
+                          output_file=None,
+                          options=None,
+                          filenames=None,
+                          recursive=True):
     """
     Read EXIF data for a folder of images.
 
@@ -670,7 +686,6 @@ def read_exif_from_folder(input_folder,output_file=None,options=None,filenames=N
             a list of absolute filenames (if [input_folder] is None)
         recursive (bool, optional): whether to recurse into [input_folder], not relevant if [input_folder]
             is None.
-        verbose (bool, optional): enable additional debug output
 
     Returns:
         list: list of dicts, each of which contains EXIF information for one images. Fields include at least:
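
(A hedged usage sketch for read_exif_from_folder, based only on the signature and arguments shown in this diff; the input folder is hypothetical.)

# Illustrative sketch only; the folder path is hypothetical
from megadetector.data_management.read_exif import read_exif_from_folder

exif_results = read_exif_from_folder('/data/camera_trap_images',
                                     output_file='exif_results.json',
                                     recursive=True)
print('Read EXIF data for {} images'.format(len(exif_results)))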
@@ -704,7 +719,7 @@ def read_exif_from_folder(input_folder,output_file=None,options=None,filenames=N
     try:
         with open(output_file, 'a') as f:
             if not f.writable():
-                raise IOError('File not writable')
+                raise OSError('File not writable')
     except Exception:
         print('Could not write to file {}'.format(output_file))
         raise
@@ -743,7 +758,7 @@ def exif_results_to_cct(exif_results,cct_output_file=None,options=None):
     Args:
         exif_results (str or list): the filename (or loaded list) containing the results
             from read_exif_from_folder
-        cct_output_file (str,optional): the filename to which we should write
+        cct_output_file (str, optional): the filename to which we should write
             COCO-Camera-Traps-formatted data
         options (ExifResultsToCCTOptions, optional): options guiding the generation
             of the CCT file, particularly location mapping