megadetector-5.0.19-py3-none-any.whl → megadetector-5.0.21-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34)
  1. megadetector/data_management/importers/bellevue_to_json.py +0 -1
  2. megadetector/data_management/importers/osu-small-animals-to-json.py +364 -0
  3. megadetector/data_management/lila/generate_lila_per_image_labels.py +1 -1
  4. megadetector/data_management/lila/get_lila_annotation_counts.py +2 -0
  5. megadetector/data_management/lila/lila_common.py +28 -12
  6. megadetector/data_management/lila/test_lila_metadata_urls.py +17 -8
  7. megadetector/data_management/read_exif.py +73 -0
  8. megadetector/data_management/yolo_output_to_md_output.py +18 -5
  9. megadetector/detection/process_video.py +84 -16
  10. megadetector/detection/run_detector.py +36 -13
  11. megadetector/detection/run_detector_batch.py +104 -15
  12. megadetector/detection/run_inference_with_yolov5_val.py +20 -23
  13. megadetector/detection/video_utils.py +79 -44
  14. megadetector/postprocessing/combine_api_outputs.py +1 -1
  15. megadetector/postprocessing/detector_calibration.py +367 -0
  16. megadetector/postprocessing/md_to_coco.py +2 -1
  17. megadetector/postprocessing/postprocess_batch_results.py +32 -20
  18. megadetector/postprocessing/validate_batch_results.py +118 -58
  19. megadetector/taxonomy_mapping/map_new_lila_datasets.py +8 -3
  20. megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +3 -2
  21. megadetector/taxonomy_mapping/preview_lila_taxonomy.py +3 -1
  22. megadetector/utils/ct_utils.py +20 -0
  23. megadetector/utils/md_tests.py +63 -17
  24. megadetector/utils/path_utils.py +139 -30
  25. megadetector/utils/write_html_image_list.py +16 -5
  26. megadetector/visualization/visualization_utils.py +126 -23
  27. megadetector/visualization/visualize_db.py +104 -63
  28. {megadetector-5.0.19.dist-info → megadetector-5.0.21.dist-info}/METADATA +2 -2
  29. {megadetector-5.0.19.dist-info → megadetector-5.0.21.dist-info}/RECORD +32 -32
  30. {megadetector-5.0.19.dist-info → megadetector-5.0.21.dist-info}/WHEEL +1 -1
  31. megadetector/data_management/importers/prepare-noaa-fish-data-for-lila.py +0 -359
  32. megadetector/data_management/importers/snapshot_safari_importer_reprise.py +0 -677
  33. {megadetector-5.0.19.dist-info → megadetector-5.0.21.dist-info}/LICENSE +0 -0
  34. {megadetector-5.0.19.dist-info → megadetector-5.0.21.dist-info}/top_level.txt +0 -0
megadetector/utils/path_utils.py

@@ -17,6 +17,7 @@ import platform
 import string
 import json
 import shutil
+import hashlib
 import unicodedata
 import zipfile
 import tarfile
@@ -236,6 +237,30 @@ def path_is_abs(p):
     return (len(p) > 1) and (p[0] == '/' or p[1] == ':' or p[0] == '\\')
 
 
+def safe_create_link(link_exists,link_new):
+    """
+    Creates a symlink at [link_new] pointing to [link_exists].
+
+    If [link_new] already exists, make sure it's a link (not a file),
+    and if it has a different target than [link_exists], removes and re-creates
+    it.
+
+    Errors if [link_new] already exists but it's not a link.
+
+    Args:
+        link_exists (str): the source of the (possibly-new) symlink
+        link_new (str): the target of the (possibly-new) symlink
+    """
+
+    if os.path.exists(link_new) or os.path.islink(link_new):
+        assert os.path.islink(link_new)
+        if not os.readlink(link_new) == link_exists:
+            os.remove(link_new)
+            os.symlink(link_exists,link_new)
+    else:
+        os.symlink(link_exists,link_new)
+
+
 def top_level_folder(p):
     r"""
     Gets the top-level folder from the path *p*.
@@ -296,31 +321,6 @@ if False:
     p = r'c:/foo'; s = top_level_folder(p); print(s); assert s == 'c:/foo'
     p = r'c:\foo/bar'; s = top_level_folder(p); print(s); assert s == 'c:\\foo'
 
-    #%%
-
-    def safe_create_link(link_exists,link_new):
-        """
-        Creates a symlink at [link_new] pointing to [link_exists].
-
-        If [link_new] already exists, make sure it's a link (not a file),
-        and if it has a different target than [link_exists], removes and re-creates
-        it.
-
-        Errors if [link_new] already exists but it's not a link.
-
-        Args:
-            link_exists (str): the source of the (possibly-new) symlink
-            link_new (str): the target of the (possibly-new) symlink
-        """
-
-        if os.path.exists(link_new) or os.path.islink(link_new):
-            assert os.path.islink(link_new)
-            if not os.readlink(link_new) == link_exists:
-                os.remove(link_new)
-                os.symlink(link_exists,link_new)
-        else:
-            os.symlink(link_exists,link_new)
-
 
 #%% Image-related path functions
 
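The net effect of these two hunks is that safe_create_link moves out of the interactive `if False:` test cell and becomes a public function in megadetector.utils.path_utils. A minimal usage sketch (the paths here are hypothetical):

```python
from megadetector.utils.path_utils import safe_create_link

# Hypothetical paths: keep a stable 'latest.json' name pointing at the most
# recent results file. If latest.json exists as a symlink with a different
# target, it is removed and re-created; if it exists as a regular file, the
# function's assert fires.
safe_create_link('/results/md-run-2024-06-01.json', '/results/latest.json')
```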
@@ -598,7 +598,9 @@ def open_file(filename, attempt_to_open_in_wsl_host=False, browser_name=None):
 
         opener = 'xdg-open'
         subprocess.call([opener, filename])
-        
+
+# ...def open_file(...)
+
 
 #%% File list functions
 
@@ -649,8 +651,12 @@ def _copy_file(input_output_tuple,overwrite=True,verbose=False):
     target_fn = input_output_tuple[1]
     if (not overwrite) and (os.path.isfile(target_fn)):
         if verbose:
-            print('Skipping existing file {}'.format(target_fn))
-        return
+            print('Skipping existing target file {}'.format(target_fn))
+        return
+
+    if verbose:
+        print('Copying to target file {}'.format(target_fn))
+
     os.makedirs(os.path.dirname(target_fn),exist_ok=True)
     shutil.copyfile(source_fn,target_fn)
 
@@ -667,7 +673,7 @@ def parallel_copy_files(input_file_to_output_file, max_workers=16,
         use_threads (bool, optional): whether to use threads (True) or processes (False) for
             parallel copying; ignored if max_workers <= 1
         overwrite (bool, optional): whether to overwrite existing destination files
-        verbose (bool, optional): enable additionald debug output
+        verbose (bool, optional): enable additional debug output
     """
 
     n_workers = min(max_workers,len(input_file_to_output_file))
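For context, a sketch of how _copy_file is typically reached via parallel_copy_files, whose signature appears in the hunk above (the file paths are hypothetical):

```python
from megadetector.utils.path_utils import parallel_copy_files

# Hypothetical source-to-destination mapping
input_file_to_output_file = {
    '/data/raw/cam01/img_0001.jpg': '/data/flat/cam01_img_0001.jpg',
    '/data/raw/cam02/img_0001.jpg': '/data/flat/cam02_img_0001.jpg'
}

# With verbose=True, 5.0.21 now prints a line for every file copied, in
# addition to the (reworded) message for files skipped when overwrite=False
parallel_copy_files(input_file_to_output_file,
                    max_workers=8,
                    use_threads=True,
                    overwrite=False,
                    verbose=True)
```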
@@ -750,7 +756,7 @@ def parallel_get_file_sizes(filenames,
         max_workers (int, optional): number of concurrent workers; set to <=1 to disable parallelism
         use_threads (bool, optional): whether to use threads (True) or processes (False) for
             parallel copying; ignored if max_workers <= 1
-        verbose (bool, optional): enable additionald debug output
+        verbose (bool, optional): enable additional debug output
         recursive (bool, optional): enumerate recursively, only relevant if [filenames] is a folder.
         convert_slashes (bool, optional): convert backslashes to forward slashes
         return_relative_paths (bool, optional): return relative paths; only relevant if [filenames]
@@ -804,6 +810,8 @@ def parallel_get_file_sizes(filenames,
 
     return to_return
 
+# ...def parallel_get_file_sizes(...)
+
 
 #%% Zip functions
 
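A usage sketch for parallel_get_file_sizes, based only on the parameters visible in the docstring hunk above (the folder path is hypothetical, and the exact return semantics are assumed from the docstring fragment):

```python
from megadetector.utils.path_utils import parallel_get_file_sizes

# Hypothetical folder; returns a dict mapping each enumerated file to its size
filename_to_size = parallel_get_file_sizes('/data/camera-traps',
                                           max_workers=8,
                                           use_threads=True,
                                           recursive=True,
                                           convert_slashes=True,
                                           return_relative_paths=True)
```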
@@ -1075,3 +1083,104 @@ def unzip_file(input_file, output_folder=None):
 
     with zipfile.ZipFile(input_file, 'r') as zf:
         zf.extractall(output_folder)
+
+
+#%% File hashing functions
+
+def compute_file_hash(file_path, algorithm='sha256', allow_failures=True):
+    """
+    Compute the hash of a file.
+
+    Adapted from:
+
+    https://www.geeksforgeeks.org/python-program-to-find-hash-of-file/
+
+    Args:
+        file_path (str): the file to hash
+        algorithm (str, optional): the hashing algorithm to use (e.g. md5, sha256)
+
+    Returns:
+        str: the hash value for this file
+    """
+
+    try:
+
+        hash_func = hashlib.new(algorithm)
+
+        with open(file_path, 'rb') as file:
+            while chunk := file.read(8192): # Read the file in chunks of 8192 bytes
+                hash_func.update(chunk)
+
+        return str(hash_func.hexdigest())
+
+    except Exception:
+
+        if allow_failures:
+            return None
+        else:
+            raise
+
+# ...def compute_file_hash(...)
+
+
+def parallel_compute_file_hashes(filenames,
+                                 max_workers=16,
+                                 use_threads=True,
+                                 recursive=True,
+                                 algorithm='sha256',
+                                 verbose=False):
+    """
+    Compute file hashes for a list or folder of images.
+
+    Args:
+        filenames (list or str): a list of filenames or a folder
+        max_workers (int, optional): the number of parallel workers to use; set to <=1 to disable
+            parallelization
+        use_threads (bool, optional): whether to use threads (True) or processes (False) for
+            parallelization
+        algorithm (str, optional): the hashing algorithm to use (e.g. md5, sha256)
+        recursive (bool, optional): if [filenames] is a folder, whether to enumerate recursively.
+            Ignored if [filenames] is a list.
+        verbose (bool, optional): enable additional debug output
+
+    Returns:
+        dict: a dict mapping filenames to hash values; values will be None for files that fail
+        to load.
+    """
+
+    if isinstance(filenames,str) and os.path.isdir(filenames):
+        if verbose:
+            print('Enumerating files in {}'.format(filenames))
+        filenames = recursive_file_list(filenames,recursive=recursive,return_relative_paths=False)
+
+    n_workers = min(max_workers,len(filenames))
+
+    if verbose:
+        print('Computing hashes for {} files on {} workers'.format(len(filenames),n_workers))
+
+    if n_workers <= 1:
+
+        results = []
+        for filename in filenames:
+            results.append(compute_file_hash(filename,algorithm=algorithm,allow_failures=True))
+
+    else:
+
+        if use_threads:
+            pool = ThreadPool(n_workers)
+        else:
+            pool = Pool(n_workers)
+
+        results = list(tqdm(pool.imap(
+            partial(compute_file_hash,algorithm=algorithm,allow_failures=True),
+            filenames), total=len(filenames)))
+
+    assert len(filenames) == len(results), 'Internal error in parallel_compute_file_hashes'
+
+    to_return = {}
+    for i_file,filename in enumerate(filenames):
+        to_return[filename] = results[i_file]
+
+    return to_return
+
+# ...def parallel_compute_file_hashes(...)
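The hashing helpers above are new in 5.0.21. A short sketch of both entry points (paths hypothetical):

```python
from collections import defaultdict
from megadetector.utils.path_utils import (
    compute_file_hash, parallel_compute_file_hashes)

# Hash one file; with allow_failures=True (the default), unreadable files
# yield None instead of raising
h = compute_file_hash('/data/images/img_0001.jpg', algorithm='sha256')

# Hash a whole (hypothetical) folder in parallel
filename_to_hash = parallel_compute_file_hashes('/data/images',
                                                max_workers=8,
                                                algorithm='md5')

# One natural application: group files by hash to find exact duplicates
hash_to_files = defaultdict(list)
for fn, file_hash in filename_to_hash.items():
    if file_hash is not None:
        hash_to_files[file_hash].append(fn)
duplicates = {k: v for k, v in hash_to_files.items() if len(v) > 1}
```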
megadetector/utils/write_html_image_list.py

@@ -42,6 +42,7 @@ def write_html_image_list(filename=None,images=None,options=None):
     options (dict, optional): a dict with one or more of the following fields:
 
         - fHtml (file pointer to write to, used for splitting write operations over multiple calls)
+        - pageTitle (HTML page title)
         - headerHtml (html text to include before the image list)
         - trailerHtml (html text to include after the image list)
         - defaultImageStyle (default css style for images)
@@ -60,11 +61,14 @@ def write_html_image_list(filename=None,images=None,options=None):
     if 'fHtml' not in options:
         options['fHtml'] = -1
 
+    if 'pageTitle' not in options or options['pageTitle'] is None:
+        options['pageTitle'] = ''
+
     if 'headerHtml' not in options or options['headerHtml'] is None:
-        options['headerHtml'] = ''
+        options['headerHtml'] = ''
 
     if 'trailerHtml' not in options or options['trailerHtml'] is None:
-        options['trailerHtml'] = ''
+        options['trailerHtml'] = ''
 
     if 'defaultTextStyle' not in options or options['defaultTextStyle'] is None:
         options['defaultTextStyle'] = \
@@ -114,7 +118,7 @@ def write_html_image_list(filename=None,images=None,options=None):
         # You can't supply your own file handle in this case
         if options['fHtml'] != -1:
             raise ValueError(
-                'You can''t supply your own file handle if we have to page the image set')
+                "You can't supply your own file handle if we have to page the image set")
 
         figureFileStartingIndices = list(range(0,nImages,options['maxFiguresPerHtmlFile']))
 
@@ -124,7 +128,10 @@ def write_html_image_list(filename=None,images=None,options=None):
         fMeta = open(filename,'w')
 
         # Write header stuff
-        fMeta.write('<html><body>\n')
+        titleString = '<title>Index page</title>'
+        if len(options['pageTitle']) > 0:
+            titleString = '<title>Index page for: {}</title>'.format(options['pageTitle'])
+        fMeta.write('<html><head>{}</head><body>\n'.format(titleString))
         fMeta.write(options['headerHtml'])
         fMeta.write('<table border = 0 cellpadding = 2>\n')
 
@@ -170,7 +177,11 @@ def write_html_image_list(filename=None,images=None,options=None):
     else:
         fHtml = options['fHtml']
 
-    fHtml.write('<html><body>\n')
+    titleString = ''
+    if len(options['pageTitle']) > 0:
+        titleString = '<title>{}</title>'.format(options['pageTitle'])
+
+    fHtml.write('<html>{}<body>\n'.format(titleString))
 
     fHtml.write(options['headerHtml'])
 
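Taken together, these four hunks add the pageTitle option and fix the paging error message, whose old doubled-apostrophe literal ('You can''t ...') was a Pascal-ism that Python parses as two adjacent strings, rendering as "You cant ...". A sketch of the new option (image filenames hypothetical):

```python
from megadetector.utils.write_html_image_list import write_html_image_list

# Hypothetical image list; entries can also be dicts with per-image options
images = ['cam01/img_0001.jpg', 'cam01/img_0002.jpg']

# New in 5.0.21: pageTitle is written into an HTML <title>; previously the
# page header was a bare '<html><body>' with no <head> or <title> at all
write_html_image_list(filename='preview.html',
                      images=images,
                      options={'pageTitle': 'Camera 01 preview'})
```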
megadetector/visualization/visualization_utils.py

@@ -672,6 +672,36 @@ def draw_bounding_boxes_on_image(image,
 # ...draw_bounding_boxes_on_image(...)
 
 
+def get_text_size(font,s):
+    """
+    Get the expected width and height when rendering the string [s] in the font
+    [font].
+
+    Args:
+        font (PIL.ImageFont): the font whose size we should query
+        s (str): the string whose size we should query
+
+    Returns:
+        tuple: (w,h), both floats in pixel coordinates
+    """
+
+    # This is what we did w/Pillow 9
+    # w,h = font.getsize(s)
+
+    # I would *think* this would be the equivalent for Pillow 10
+    # l,t,r,b = font.getbbox(s); w = r-l; h=b-t
+
+    # ...but this actually produces the most similar results to Pillow 9
+    # l,t,r,b = font.getbbox(s); w = r; h=b
+
+    try:
+        l,t,r,b = font.getbbox(s); w = r; h=b
+    except Exception:
+        w,h = font.getsize(s)
+
+    return w,h
+
+
 def draw_bounding_box_on_image(image,
                                ymin,
                                xmin,
@@ -773,24 +803,6 @@ def draw_bounding_box_on_image(image,
     except IOError:
         font = ImageFont.load_default()
 
-    def get_text_size(font,s):
-
-        # This is what we did w/Pillow 9
-        # w,h = font.getsize(s)
-
-        # I would *think* this would be the equivalent for Pillow 10
-        # l,t,r,b = font.getbbox(s); w = r-l; h=b-t
-
-        # ...but this actually produces the most similar results to Pillow 9
-        # l,t,r,b = font.getbbox(s); w = r; h=b
-
-        try:
-            l,t,r,b = font.getbbox(s); w = r; h=b
-        except Exception:
-            w,h = font.getsize(s)
-
-        return w,h
-
     # If the total height of the display strings added to the top of the bounding
     # box exceeds the top of the image, stack the strings below the bounding box
     # instead of above.
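These two hunks promote get_text_size from a closure inside draw_bounding_box_on_image to a module-level function, with unchanged behavior: it tries Pillow 10's font.getbbox() (using (r, b) rather than (r-l, b-t), which empirically matches Pillow 9 better), and falls back to font.getsize() where getbbox is unavailable. A sketch of calling it directly:

```python
from PIL import ImageFont
from megadetector.visualization.visualization_utils import get_text_size

font = ImageFont.load_default()

# Returns (w,h) in pixels; on Pillow 10+ this goes through font.getbbox(),
# and through the older font.getsize() API where getbbox doesn't exist
w, h = get_text_size(font, 'animal: 0.92')
```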
@@ -972,7 +984,7 @@ def draw_bounding_boxes_on_file(input_file,
         boxes are length-four arrays formatted as [x,y,w,h], normalized,
         upper-left origin (this is the standard MD detection format)
     detector_label_map (dict, optional): a dict mapping category IDs to strings. If this
-        is None, no confidence values or identifiers are shown If this is {}, just category
+        is None, no confidence values or identifiers are shown. If this is {}, just category
         indices and confidence values are shown.
     thickness (int, optional): line width in pixels for box rendering
     expansion (int, optional): box expansion in pixels
@@ -1043,7 +1055,7 @@ def draw_db_boxes_on_file(input_file,
         classes = [0] * len(boxes)
 
     render_db_bounding_boxes(boxes, classes, image, original_size=None,
-        label_map=label_map, thickness=thickness, expansion=expansion)
+                             label_map=label_map, thickness=thickness, expansion=expansion)
 
     image.save(output_file)
 
@@ -1125,7 +1137,6 @@ def gray_scale_fraction(image,crop_size=(0.1,0.1)):
             if r == g and r == b and g == b:
                 n_gray_pixels += 1
 
-
 # ...def gray_scale_fraction(...)
 
 
@@ -1376,6 +1387,98 @@ def resize_image_folder(input_folder,
 # ...def resize_image_folder(...)
 
 
+def get_image_size(im,verbose=False):
+    """
+    Retrieve the size of an image. Returns None if the image fails to load.
+
+    Args:
+        im (str or PIL.Image): filename or PIL image
+
+    Returns:
+        tuple (w,h), or None if the image fails to load.
+    """
+
+    image_name = '[in memory]'
+
+    try:
+        if isinstance(im,str):
+            image_name = im
+            im = load_image(im)
+        w = im.width
+        h = im.height
+        if w <= 0 or h <= 0:
+            if verbose:
+                print('Error reading width from image {}: {},{}'.format(
+                    image_name,w,h))
+            return None
+        return (w,h)
+    except Exception as e:
+        if verbose:
+            print('Error reading width from image {}: {}'.format(
+                image_name,str(e)))
+        return None
+
+# ...def get_image_size(...)
+
+
+def parallel_get_image_sizes(filenames,
+                             max_workers=16,
+                             use_threads=True,
+                             recursive=True,
+                             verbose=False):
+    """
+    Retrieve image sizes for a list or folder of images
+
+    Args:
+        filenames (list or str): a list of image filenames or a folder
+        max_workers (int, optional): the number of parallel workers to use; set to <=1 to disable
+            parallelization
+        use_threads (bool, optional): whether to use threads (True) or processes (False) for
+            parallelization
+        recursive (bool, optional): if [filenames] is a folder, whether to search recursively for images.
+            Ignored if [filenames] is a list.
+        verbose (bool, optional): enable additional debug output
+
+    Returns:
+        dict: a dict mapping filenames to (w,h) tuples; values will be None for images that fail
+        to load.
+    """
+
+    if isinstance(filenames,str) and os.path.isdir(filenames):
+        if verbose:
+            print('Enumerating images in {}'.format(filenames))
+        filenames = find_images(filenames,recursive=recursive,return_relative_paths=False)
+
+    n_workers = min(max_workers,len(filenames))
+
+    if verbose:
+        print('Getting image sizes for {} images'.format(len(filenames)))
+
+    if n_workers <= 1:
+
+        results = []
+        for filename in filenames:
+            results.append(get_image_size(filename,verbose=verbose))
+
+    else:
+
+        if use_threads:
+            pool = ThreadPool(n_workers)
+        else:
+            pool = Pool(n_workers)
+
+        results = list(tqdm(pool.imap(
+            partial(get_image_size,verbose=verbose),filenames), total=len(filenames)))
+
+    assert len(filenames) == len(results), 'Internal error in parallel_get_image_sizes'
+
+    to_return = {}
+    for i_file,filename in enumerate(filenames):
+        to_return[filename] = results[i_file]
+
+    return to_return
+
+
 #%% Image integrity checking functions
 
 def check_image_integrity(filename,modes=None):
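A sketch of the new image-size helpers added in this hunk (folder and file paths hypothetical):

```python
from megadetector.visualization.visualization_utils import (
    get_image_size, parallel_get_image_sizes)

# Single image (hypothetical path); returns (w,h), or None on failure
size = get_image_size('/data/images/img_0001.jpg', verbose=True)

# Whole (hypothetical) folder; returns {filename: (w,h) or None}
filename_to_size = parallel_get_image_sizes('/data/images',
                                            max_workers=8,
                                            use_threads=True,
                                            recursive=True)

# Images that failed to load come back as None
failed_images = [fn for fn, wh in filename_to_size.items() if wh is None]
```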
@@ -1494,13 +1597,13 @@ def parallel_check_image_integrity(filenames,
         with either 'success' or 'error').
     """
 
-    n_workers = min(max_workers,len(filenames))
-
     if isinstance(filenames,str) and os.path.isdir(filenames):
         if verbose:
             print('Enumerating images in {}'.format(filenames))
         filenames = find_images(filenames,recursive=recursive,return_relative_paths=False)
 
+    n_workers = min(max_workers,len(filenames))
+
     if verbose:
         print('Checking image integrity for {} filenames'.format(len(filenames)))
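This last hunk is a real fix rather than a cosmetic move: when [filenames] is passed as a folder path, computing n_workers before enumeration measured the length of the path string, not the number of images. A standalone sketch of the difference (folder path hypothetical):

```python
filenames = '/data/cams'   # hypothetical folder path, not yet a list of files
max_workers = 16

# 5.0.19 ordering: len() of the *string* -> min(16, len('/data/cams')) == 10
n_workers_buggy = min(max_workers, len(filenames))

# 5.0.21 ordering: enumerate images first (find_images, as in the hunk above),
# then size the worker pool on the actual number of images found:
# filenames = find_images(filenames, recursive=True, return_relative_paths=False)
# n_workers_fixed = min(max_workers, len(filenames))
```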