megadetector 5.0.20__py3-none-any.whl → 5.0.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- megadetector/data_management/cct_json_utils.py +143 -7
- megadetector/data_management/cct_to_md.py +12 -5
- megadetector/data_management/databases/integrity_check_json_db.py +83 -77
- megadetector/data_management/importers/osu-small-animals-to-json.py +4 -4
- megadetector/data_management/importers/raic_csv_to_md_results.py +416 -0
- megadetector/data_management/importers/zamba_results_to_md_results.py +1 -2
- megadetector/data_management/lila/create_lila_test_set.py +25 -11
- megadetector/data_management/lila/download_lila_subset.py +9 -2
- megadetector/data_management/lila/generate_lila_per_image_labels.py +3 -2
- megadetector/data_management/lila/test_lila_metadata_urls.py +5 -1
- megadetector/data_management/read_exif.py +10 -14
- megadetector/data_management/rename_images.py +1 -1
- megadetector/data_management/yolo_output_to_md_output.py +18 -5
- megadetector/detection/process_video.py +14 -3
- megadetector/detection/pytorch_detector.py +15 -3
- megadetector/detection/run_detector.py +4 -3
- megadetector/detection/run_inference_with_yolov5_val.py +121 -13
- megadetector/detection/video_utils.py +40 -17
- megadetector/postprocessing/classification_postprocessing.py +1 -1
- megadetector/postprocessing/combine_api_outputs.py +1 -1
- megadetector/postprocessing/compare_batch_results.py +931 -142
- megadetector/postprocessing/detector_calibration.py +565 -0
- megadetector/postprocessing/md_to_coco.py +85 -19
- megadetector/postprocessing/postprocess_batch_results.py +32 -21
- megadetector/postprocessing/validate_batch_results.py +174 -64
- megadetector/taxonomy_mapping/map_new_lila_datasets.py +15 -12
- megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +1 -1
- megadetector/taxonomy_mapping/preview_lila_taxonomy.py +3 -1
- megadetector/utils/ct_utils.py +64 -2
- megadetector/utils/md_tests.py +15 -13
- megadetector/utils/path_utils.py +153 -37
- megadetector/utils/process_utils.py +9 -3
- megadetector/utils/write_html_image_list.py +21 -6
- megadetector/visualization/visualization_utils.py +329 -102
- megadetector/visualization/visualize_db.py +104 -63
- {megadetector-5.0.20.dist-info → megadetector-5.0.22.dist-info}/LICENSE +0 -0
- {megadetector-5.0.20.dist-info → megadetector-5.0.22.dist-info}/METADATA +143 -142
- {megadetector-5.0.20.dist-info → megadetector-5.0.22.dist-info}/RECORD +40 -39
- {megadetector-5.0.20.dist-info → megadetector-5.0.22.dist-info}/WHEEL +1 -1
- {megadetector-5.0.20.dist-info → megadetector-5.0.22.dist-info}/top_level.txt +0 -0
- megadetector/data_management/importers/prepare-noaa-fish-data-for-lila.py +0 -359
megadetector/taxonomy_mapping/map_new_lila_datasets.py
CHANGED
```diff
@@ -15,15 +15,17 @@ import json
 # Created by get_lila_category_list.py
 input_lila_category_list_file = os.path.expanduser('~/lila/lila_categories_list/lila_dataset_to_categories.json')
 
-output_file = os.path.expanduser('~/lila/lila_additions_2024.
+output_file = os.path.expanduser('~/lila/lila_additions_2024.12.31.csv')
 
 datasets_to_map = [
-    '
+    'Seattle(ish) Camera Traps'
 ]
 
 
 #%% Initialize taxonomic lookup
 
+# Takes ~2 mins
+
 from megadetector.taxonomy_mapping.species_lookup import \
     initialize_taxonomy_lookup, get_preferred_taxonomic_match
 
@@ -39,27 +41,27 @@ lila_datasets = set()
 
 for dataset_name in input_lila_categories.keys():
     # The script that generates this dictionary creates a separate entry for bounding box
-    # metadata files, but those don't represent new dataset names
+    # metadata files, but those don't represent new dataset names, so we ignore them here.
     lila_datasets.add(dataset_name.replace('_bbox',''))
-
+
 for s in datasets_to_map:
     assert s in lila_datasets
-
-
+
+
 #%% Find all categories
 
 category_mappings = []
 
 # dataset_name = datasets_to_map[0]
 for dataset_name in datasets_to_map:
-
+
     ds_categories = input_lila_categories[dataset_name]
     for category in ds_categories:
         category_name = category['name']
         assert ':' not in category_name
         mapping_name = dataset_name + ':' + category_name
         category_mappings.append(mapping_name)
-
+
 print('Need to create {} mappings'.format(len(category_mappings)))
@@ -128,22 +130,23 @@ output_df.to_csv(output_file, index=None, header=True)
 
 if False:
 
-    #%%
-
+    #%% You probably want to open the .csv file first
+
     from megadetector.utils.path_utils import open_file
     open_file(output_file)
+
 
     #%%
 
     # q = 'white-throated monkey'
     # q = 'cingulata'
     # q = 'notamacropus'
-    q = '
+    q = 'insects'
     taxonomy_preference = 'inat'
     m = get_preferred_taxonomic_match(q,taxonomy_preference)
     # print(m.scientific_name); import clipboard; clipboard.copy(m.scientific_name)
 
-    if m is None:
+    if (m is None) or (len(m.taxonomy_string) == 0):
        print('No match')
    else:
        if m.source != taxonomy_preference:
```
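The revised guard in the last hunk treats a match with an empty taxonomy string the same as no match at all. A minimal sketch of the pattern, using only names that appear in the diff ('insects' is the query shown above):

```python
from megadetector.taxonomy_mapping.species_lookup import \
    initialize_taxonomy_lookup, get_preferred_taxonomic_match

initialize_taxonomy_lookup()  # takes ~2 mins, per the new comment above

m = get_preferred_taxonomic_match('insects', 'inat')

# A non-None match can still carry an empty taxonomy string, so check both
if (m is None) or (len(m.taxonomy_string) == 0):
    print('No match')
else:
    print(m.scientific_name)
```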
megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py
CHANGED
```diff
@@ -89,7 +89,7 @@ if False:
         'genus',
         'species','subspecies','variety']
 
-    levels_to_exclude = ['stateofmatter','zoosection','parvorder','complex']
+    levels_to_exclude = ['stateofmatter','zoosection','parvorder','complex','epifamily']
 
     for s in levels_to_exclude:
         assert s not in levels_to_include
```
megadetector/taxonomy_mapping/preview_lila_taxonomy.py
CHANGED
```diff
@@ -16,7 +16,7 @@ import os
 import pandas as pd
 
 # lila_taxonomy_file = r"c:\git\agentmorrisprivate\lila-taxonomy\lila-taxonomy-mapping.csv"
-lila_taxonomy_file = os.path.expanduser('~/lila/lila_additions_2024.
+lila_taxonomy_file = os.path.expanduser('~/lila/lila_additions_2024.12.31.csv')
 
 preview_base = os.path.expanduser('~/lila/lila_taxonomy_preview')
 os.makedirs(preview_base,exist_ok=True)
@@ -399,6 +399,8 @@ images_per_query = 15
 min_valid_images_per_query = 3
 min_valid_image_size = 3000
 
+# TODO: parallelize this loop
+#
 # i_row = 0; row = df.iloc[i_row]
 for i_row,row in df.iterrows():
```
megadetector/utils/ct_utils.py
CHANGED
```diff
@@ -12,6 +12,7 @@ import inspect
 import json
 import math
 import os
+import builtins
 
 import jsonpickle
 import numpy as np
@@ -613,6 +614,50 @@ def is_empty(v):
     return False
 
 
+def min_none(a,b):
+    """
+    Returns the minimum of a and b. If both are None, returns None. If one is None,
+    returns the other.
+
+    Args:
+        a (numeric): the first value to compare
+        b (numeric): the second value to compare
+
+    Returns:
+        numeric: the minimum of a and b, or None
+    """
+    if a is None and b is None:
+        return None
+    elif a is None:
+        return b
+    elif b is None:
+        return a
+    else:
+        return min(a,b)
+
+
+def max_none(a,b):
+    """
+    Returns the maximum of a and b. If both are None, returns None. If one is None,
+    returns the other.
+
+    Args:
+        a (numeric): the first value to compare
+        b (numeric): the second value to compare
+
+    Returns:
+        numeric: the maximum of a and b, or None
+    """
+    if a is None and b is None:
+        return None
+    elif a is None:
+        return b
+    elif b is None:
+        return a
+    else:
+        return max(a,b)
+
+
 def isnan(v):
     """
     Returns True if v is a nan-valued float, otherwise returns False.
@@ -645,7 +690,24 @@ def sets_overlap(set1, set2):
     return not set(set1).isdisjoint(set(set2))
 
 
-
+def is_function_name(s,calling_namespace):
+    """
+    Determines whether [s] is a callable function in the global or local scope, or a
+    built-in function.
+
+    Args:
+        s (str): the string to test for function-ness
+        calling_namespace (dict): typically pass the output of locals()
+    """
+
+    assert isinstance(s,str), 'Input is not a string'
+
+    return callable(globals().get(s)) or \
+        callable(locals().get(s)) or \
+        callable(calling_namespace.get(s)) or \
+        callable(getattr(builtins, s, None))
+
+
 #%% Test drivers
 
 if False:
@@ -678,4 +740,4 @@ if False:
     L = [{'a':5},{'a':0},{'a':10}]
     k = 'a'
     sort_list_of_dicts_by_key(L, k, reverse=True)
-
+
```
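A usage sketch for the three new ct_utils helpers, based only on the definitions above (min_none/max_none treat a single None as "missing", not as an extreme value):

```python
from megadetector.utils.ct_utils import min_none, max_none, is_function_name

assert min_none(None, None) is None   # both missing
assert min_none(3, None) == 3         # one missing: return the other
assert max_none(None, 5) == 5
assert max_none(2, 7) == 7            # both present: plain max()

def _local_helper():
    pass

# 'len' resolves via builtins, '_local_helper' via the caller's namespace
assert is_function_name('len', locals())
assert is_function_name('_local_helper', locals())
assert not is_function_name('no_such_function', locals())
```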
megadetector/utils/md_tests.py
CHANGED
```diff
@@ -29,10 +29,6 @@ import subprocess
 import argparse
 import inspect
 
-#: IoU threshold used to determine whether boxes in two detection files likely correspond
-#: to the same box.
-iou_threshold_for_file_comparison = 0.9
-
 
 #%% Classes
 
@@ -106,6 +102,10 @@ class MDTestOptions:
         #: PYTHONPATH to set for CLI tests; if None, inherits from the parent process. Only
         #: impacts the called functions, not the parent process.
         self.cli_test_pythonpath = None
+
+        #: IoU threshold used to determine whether boxes in two detection files likely correspond
+        #: to the same box.
+        self.iou_threshold_for_file_comparison = 0.85
 
 # ...class MDTestOptions()
 
@@ -410,7 +410,7 @@ def compare_detection_lists(detections_a,detections_b,options,bidirectional_comp
             iou = get_iou(det_a['bbox'],b_det['bbox'])
 
             # Is this likely the same detection as det_a?
-            if iou >= iou_threshold_for_file_comparison and iou > highest_iou:
+            if iou >= options.iou_threshold_for_file_comparison and iou > highest_iou:
                 matching_det_b = b_det
                 highest_iou = iou
 
@@ -529,12 +529,14 @@ def compare_results(inference_output_file,expected_results_file,options):
     if not options.warning_mode:
 
         assert max_conf_error <= options.max_conf_error, \
-            'Confidence error {} is greater than allowable ({}), on file:\n{}'.format(
-                max_conf_error,options.max_conf_error,max_conf_error_file
+            'Confidence error {} is greater than allowable ({}), on file:\n{} ({},{})'.format(
+                max_conf_error,options.max_conf_error,max_conf_error_file,
+                inference_output_file,expected_results_file)
 
         assert max_coord_error <= options.max_coord_error, \
-            'Coord error {} is greater than allowable ({}), on file:\n{}'.format(
-                max_coord_error,options.max_coord_error,max_coord_error_file
+            'Coord error {} is greater than allowable ({}), on file:\n{} ({},{})'.format(
+                max_coord_error,options.max_coord_error,max_coord_error_file,
+                inference_output_file,expected_results_file)
 
     print('Max conf error: {} (file {})'.format(
         max_conf_error,max_conf_error_file))
@@ -847,7 +849,7 @@ def run_python_tests(options):
     video_options.frame_rendering_folder = os.path.join(options.scratch_dir,'video_scratch/rendered_frame_folder')
     video_options.render_output_video = True
     # video_options.keep_rendered_frames = False
-    # video_options.
+    # video_options.keep_extracted_frames = False
     video_options.force_extracted_frame_folder_deletion = True
     video_options.force_rendered_frame_folder_deletion = True
     # video_options.reuse_results_if_available = False
@@ -887,7 +889,7 @@ def run_python_tests(options):
     video_options.frame_rendering_folder = os.path.join(options.scratch_dir,'video_scratch/rendered_frame_folder')
     video_options.render_output_video = False
     video_options.keep_rendered_frames = False
-    video_options.
+    video_options.keep_extracted_frames = False
     video_options.force_extracted_frame_folder_deletion = True
     video_options.force_rendered_frame_folder_deletion = True
     video_options.reuse_results_if_available = False
@@ -1208,7 +1210,7 @@ def run_cli_tests(options):
     cmd += ' --overwrite_handling overwrite'
     cmd_results = execute_and_print(cmd)
 
-    # Run again with checkpointing, make sure the
+    # Run again with checkpointing, make sure the outputs are identical
     cmd += ' --checkpoint_frequency 5'
     inference_output_file_yolo_val_checkpoint = \
         os.path.join(options.scratch_dir,'folder_inference_output_yolo_val_checkpoint.json')
@@ -1353,7 +1355,7 @@ if False:
     # options.cli_working_dir = r'c:\git\MegaDetector'
     # options.yolo_working_dir = r'c:\git\yolov5-md'
     options.cli_working_dir = os.path.expanduser('~')
-    options.yolo_working_dir = '/mnt/c/git/yolov5-md'
+    # options.yolo_working_dir = '/mnt/c/git/yolov5-md'
    options = download_test_data(options)
 
    #%%
```
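With the IoU threshold moved from a module-level constant onto MDTestOptions (and its default lowered from 0.9 to 0.85), it can now be tuned per test run; a minimal sketch:

```python
from megadetector.utils.md_tests import MDTestOptions

options = MDTestOptions()
# Default is now 0.85; restore the old module-level value if stricter box matching is wanted
options.iou_threshold_for_file_comparison = 0.9
```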
megadetector/utils/path_utils.py
CHANGED
```diff
@@ -17,6 +17,7 @@ import platform
 import string
 import json
 import shutil
+import hashlib
 import unicodedata
 import zipfile
 import tarfile
@@ -31,6 +32,8 @@ from functools import partial
 from shutil import which
 from tqdm import tqdm
 
+from megadetector.utils.ct_utils import is_iterable
+
 # Should all be lower-case
 IMG_EXTENSIONS = ('.jpg', '.jpeg', '.gif', '.png', '.tif', '.tiff', '.bmp')
 
@@ -236,6 +239,30 @@ def path_is_abs(p):
     return (len(p) > 1) and (p[0] == '/' or p[1] == ':' or p[0] == '\\')
 
 
+def safe_create_link(link_exists,link_new):
+    """
+    Creates a symlink at [link_new] pointing to [link_exists].
+
+    If [link_new] already exists, make sure it's a link (not a file),
+    and if it has a different target than [link_exists], removes and re-creates
+    it.
+
+    Errors if [link_new] already exists but it's not a link.
+
+    Args:
+        link_exists (str): the source of the (possibly-new) symlink
+        link_new (str): the target of the (possibly-new) symlink
+    """
+
+    if os.path.exists(link_new) or os.path.islink(link_new):
+        assert os.path.islink(link_new)
+        if not os.readlink(link_new) == link_exists:
+            os.remove(link_new)
+            os.symlink(link_exists,link_new)
+    else:
+        os.symlink(link_exists,link_new)
+
+
 def top_level_folder(p):
     r"""
     Gets the top-level folder from the path *p*.
@@ -296,31 +323,6 @@ if False:
     p = r'c:/foo'; s = top_level_folder(p); print(s); assert s == 'c:/foo'
     p = r'c:\foo/bar'; s = top_level_folder(p); print(s); assert s == 'c:\\foo'
 
-    #%%
-
-def safe_create_link(link_exists,link_new):
-    """
-    Creates a symlink at [link_new] pointing to [link_exists].
-
-    If [link_new] already exists, make sure it's a link (not a file),
-    and if it has a different target than [link_exists], removes and re-creates
-    it.
-
-    Errors if [link_new] already exists but it's not a link.
-
-    Args:
-        link_exists (str): the source of the (possibly-new) symlink
-        link_new (str): the target of the (possibly-new) symlink
-    """
-
-    if os.path.exists(link_new) or os.path.islink(link_new):
-        assert os.path.islink(link_new)
-        if not os.readlink(link_new) == link_exists:
-            os.remove(link_new)
-            os.symlink(link_exists,link_new)
-    else:
-        os.symlink(link_exists,link_new)
-
 
 #%% Image-related path functions
 
```
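safe_create_link is unchanged in behavior; it has just moved earlier in the file (next to path_is_abs), out of the area following the interactive test block. A sketch with hypothetical paths:

```python
import os
from megadetector.utils.path_utils import safe_create_link

# Hypothetical paths, for illustration only
target = os.path.expanduser('~/data/current-release')
link = os.path.expanduser('~/data/latest')

safe_create_link(target, link)  # creates the link
safe_create_link(target, link)  # no-op: the link already points at [target]

# If [link] pointed somewhere else, it would be removed and re-created;
# if [link] were a regular file, the assert inside safe_create_link would fire.
```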
```diff
@@ -598,7 +600,9 @@ def open_file(filename, attempt_to_open_in_wsl_host=False, browser_name=None):
 
         opener = 'xdg-open'
         subprocess.call([opener, filename])
-
+
+# ...def open_file(...)
+
 
 #%% File list functions
 
@@ -649,8 +653,12 @@ def _copy_file(input_output_tuple,overwrite=True,verbose=False):
     target_fn = input_output_tuple[1]
     if (not overwrite) and (os.path.isfile(target_fn)):
         if verbose:
-            print('Skipping existing file {}'.format(target_fn))
-        return
+            print('Skipping existing target file {}'.format(target_fn))
+        return
+
+    if verbose:
+        print('Copying to target file {}'.format(target_fn))
+
     os.makedirs(os.path.dirname(target_fn),exist_ok=True)
     shutil.copyfile(source_fn,target_fn)
 
```
```diff
@@ -667,7 +675,7 @@ def parallel_copy_files(input_file_to_output_file, max_workers=16,
         use_threads (bool, optional): whether to use threads (True) or processes (False) for
             parallel copying; ignored if max_workers <= 1
         overwrite (bool, optional): whether to overwrite existing destination files
-        verbose (bool, optional): enable
+        verbose (bool, optional): enable additional debug output
     """
 
     n_workers = min(max_workers,len(input_file_to_output_file))
```
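For context, parallel_copy_files takes a dict mapping source paths to destination paths; a hedged sketch with hypothetical filenames:

```python
from megadetector.utils.path_utils import parallel_copy_files

# Hypothetical source -> destination mapping
input_file_to_output_file = {
    '/data/in/a.jpg': '/data/out/a.jpg',
    '/data/in/b.jpg': '/data/out/b.jpg'
}

parallel_copy_files(input_file_to_output_file,
                    max_workers=8,
                    use_threads=True,
                    overwrite=False,
                    verbose=True)
```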
```diff
@@ -750,7 +758,7 @@ def parallel_get_file_sizes(filenames,
         max_workers (int, optional): number of concurrent workers; set to <=1 to disable parallelism
         use_threads (bool, optional): whether to use threads (True) or processes (False) for
             parallel copying; ignored if max_workers <= 1
-        verbose (bool, optional): enable
+        verbose (bool, optional): enable additional debug output
         recursive (bool, optional): enumerate recursively, only relevant if [filenames] is a folder.
         convert_slashes (bool, optional): convert backslashes to forward slashes
         return_relative_paths (bool, optional): return relative paths; only relevant if [filenames]
@@ -764,16 +772,21 @@ def parallel_get_file_sizes(filenames,
 
     folder_name = None
 
-    if
-
-
-    if isinstance(filenames,str) and os.path.isdir(filenames):
-
+    if isinstance(filenames,str):
+
         folder_name = filenames
+        assert os.path.isdir(filenames), 'Could not find folder {}'.format(folder_name)
 
+        if verbose:
+            print('Enumerating files in {}'.format(folder_name))
+
         # Enumerate absolute paths here, we'll convert to relative later if requested
-        filenames = recursive_file_list(
+        filenames = recursive_file_list(folder_name,recursive=recursive,return_relative_paths=False)
 
+    else:
+
+        assert is_iterable(filenames), '[filenames] argument is neither a folder nor an iterable'
+
     if verbose:
         print('Creating worker pool')
@@ -804,6 +817,8 @@ def parallel_get_file_sizes(filenames,
 
     return to_return
 
+# ...def parallel_get_file_sizes(...)
+
 
 #%% Zip functions
 
```
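After this rework, parallel_get_file_sizes accepts either a folder (enumerated via recursive_file_list) or any iterable of paths, and fails fast otherwise; a sketch with hypothetical paths:

```python
from megadetector.utils.path_utils import parallel_get_file_sizes

# A folder: enumerated (recursively here) before sizes are read
sizes = parallel_get_file_sizes('/data/images', recursive=True, verbose=True)

# ...or any iterable of filenames
sizes = parallel_get_file_sizes(['/data/images/a.jpg', '/data/images/b.jpg'])

# A string that isn't a folder now trips the new isdir assert,
# and a non-iterable argument trips the is_iterable assert.
```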
```diff
@@ -932,7 +947,7 @@ def zip_folder(input_folder, output_fn=None, overwrite=False, verbose=False, com
         output_fn (str, optional): output filename; if this is None, we'll write to [input_folder].zip
         overwrite (bool, optional): whether to overwrite an existing .tar file
         verbose (bool, optional): enable additional debug console output
-        compresslevel (int, optional): compression level to use, between 0 and 9
+        compresslevel (int, optional): compression level to use, between 0 and 9
 
     Returns:
         str: the output zipfile, whether we created it or determined that it already exists
```
```diff
@@ -1075,3 +1090,104 @@ def unzip_file(input_file, output_folder=None):
 
     with zipfile.ZipFile(input_file, 'r') as zf:
         zf.extractall(output_folder)
+
+
+#%% File hashing functions
+
+def compute_file_hash(file_path, algorithm='sha256', allow_failures=True):
+    """
+    Compute the hash of a file.
+
+    Adapted from:
+
+    https://www.geeksforgeeks.org/python-program-to-find-hash-of-file/
+
+    Args:
+        file_path (str): the file to hash
+        algorithm (str, optional): the hashing algorithm to use (e.g. md5, sha256)
+
+    Returns:
+        str: the hash value for this file
+    """
+
+    try:
+
+        hash_func = hashlib.new(algorithm)
+
+        with open(file_path, 'rb') as file:
+            while chunk := file.read(8192):  # Read the file in chunks of 8192 bytes
+                hash_func.update(chunk)
+
+        return str(hash_func.hexdigest())
+
+    except Exception:
+
+        if allow_failures:
+            return None
+        else:
+            raise
+
+# ...def compute_file_hash(...)
+
+
+def parallel_compute_file_hashes(filenames,
+                                 max_workers=16,
+                                 use_threads=True,
+                                 recursive=True,
+                                 algorithm='sha256',
+                                 verbose=False):
+    """
+    Compute file hashes for a list or folder of images.
+
+    Args:
+        filenames (list or str): a list of filenames or a folder
+        max_workers (int, optional): the number of parallel workers to use; set to <=1 to disable
+            parallelization
+        use_threads (bool, optional): whether to use threads (True) or processes (False) for
+            parallelization
+        algorithm (str, optional): the hashing algorithm to use (e.g. md5, sha256)
+        recursive (bool, optional): if [filenames] is a folder, whether to enumerate recursively.
+            Ignored if [filenames] is a list.
+        verbose (bool, optional): enable additional debug output
+
+    Returns:
+        dict: a dict mapping filenames to hash values; values will be None for files that fail
+        to load.
+    """
+
+    if isinstance(filenames,str) and os.path.isdir(filenames):
+        if verbose:
+            print('Enumerating files in {}'.format(filenames))
+        filenames = recursive_file_list(filenames,recursive=recursive,return_relative_paths=False)
+
+    n_workers = min(max_workers,len(filenames))
+
+    if verbose:
+        print('Computing hashes for {} files on {} workers'.format(len(filenames),n_workers))
+
+    if n_workers <= 1:
+
+        results = []
+        for filename in filenames:
+            results.append(compute_file_hash(filename,algorithm=algorithm,allow_failures=True))
+
+    else:
+
+        if use_threads:
+            pool = ThreadPool(n_workers)
+        else:
+            pool = Pool(n_workers)
+
+        results = list(tqdm(pool.imap(
+            partial(compute_file_hash,algorithm=algorithm,allow_failures=True),
+            filenames), total=len(filenames)))
+
+    assert len(filenames) == len(results), 'Internal error in parallel_compute_file_hashes'
+
+    to_return = {}
+    for i_file,filename in enumerate(filenames):
+        to_return[filename] = results[i_file]
+
+    return to_return
+
+# ...def parallel_compute_file_hashes(...)
```
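The new hashing helpers read files in 8 KB chunks, so memory stays flat regardless of file size (the `:=` walrus operator requires Python 3.8+). A usage sketch with a hypothetical folder:

```python
from megadetector.utils.path_utils import compute_file_hash, parallel_compute_file_hashes

# One file; returns None on failure because allow_failures defaults to True
h = compute_file_hash('/data/images/a.jpg', algorithm='md5')

# A whole folder, hashed on a thread pool; returns {filename: hash or None}
filename_to_hash = parallel_compute_file_hashes('/data/images',
                                                max_workers=8,
                                                use_threads=True,
                                                algorithm='sha256',
                                                verbose=True)

# e.g., flag possible exact-duplicate files by grouping on hash
n_unique = len(set(v for v in filename_to_hash.values() if v is not None))
```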
megadetector/utils/process_utils.py
CHANGED
```diff
@@ -59,8 +59,13 @@ def execute(cmd,encoding=None,errors=None,env=None,verbose=False):
     return return_code
 
 
-def execute_and_print(cmd,
-
+def execute_and_print(cmd,
+                      print_output=True,
+                      encoding=None,
+                      errors=None,
+                      env=None,
+                      verbose=False,
+                      catch_exceptions=True,
                       echo_command=False):
     """
     Run [cmd] (a single string) in a shell, capturing and printing output. Returns
@@ -73,7 +78,8 @@ def execute_and_print(cmd,print_output=True,encoding=None,errors=None,
 
     Args:
         cmd (str): command to run
-        print_output (bool, optional): whether to print output from [cmd]
+        print_output (bool, optional): whether to print output from [cmd] (stdout is
+            captured regardless of the value of print_output)
         encoding (str, optional): stdout encoding, see Popen() documentation
         errors (str, optional): error handling, see Popen() documentation
         env (dict, optional): environment variables, see Popen() documentation
```
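The expanded signature makes the keyword arguments explicit and adds catch_exceptions; per the clarified docstring, stdout is captured even when printing is suppressed. A sketch (the return value's exact structure is not shown in this diff):

```python
from megadetector.utils.process_utils import execute_and_print

# stdout is captured regardless of print_output, per the updated docstring
cmd_results = execute_and_print('echo hello',
                                print_output=False,
                                catch_exceptions=True,
                                echo_command=True)
```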
megadetector/utils/write_html_image_list.py
CHANGED
```diff
@@ -42,7 +42,9 @@ def write_html_image_list(filename=None,images=None,options=None):
     options (dict, optional): a dict with one or more of the following fields:
 
         - fHtml (file pointer to write to, used for splitting write operations over multiple calls)
+        - pageTitle (HTML page title)
         - headerHtml (html text to include before the image list)
+        - subPageHeaderHtml (html text to include before the images when images are broken into pages)
         - trailerHtml (html text to include after the image list)
         - defaultImageStyle (default css style for images)
         - defaultTextStyle (default css style for image titles)
@@ -60,11 +62,17 @@ def write_html_image_list(filename=None,images=None,options=None):
     if 'fHtml' not in options:
         options['fHtml'] = -1
 
+    if 'pageTitle' not in options or options['pageTitle'] is None:
+        options['pageTitle'] = ''
+
     if 'headerHtml' not in options or options['headerHtml'] is None:
-        options['headerHtml'] = ''
+        options['headerHtml'] = ''
 
+    if 'subPageHeaderHtml' not in options or options['subPageHeaderHtml'] is None:
+        options['subPageHeaderHtml'] = ''
+
     if 'trailerHtml' not in options or options['trailerHtml'] is None:
-        options['trailerHtml'] = ''
+        options['trailerHtml'] = ''
 
     if 'defaultTextStyle' not in options or options['defaultTextStyle'] is None:
         options['defaultTextStyle'] = \
@@ -114,7 +122,7 @@ def write_html_image_list(filename=None,images=None,options=None):
         # You can't supply your own file handle in this case
         if options['fHtml'] != -1:
             raise ValueError(
-
+                "You can't supply your own file handle if we have to page the image set")
 
         figureFileStartingIndices = list(range(0,nImages,options['maxFiguresPerHtmlFile']))
 
@@ -124,7 +132,10 @@ def write_html_image_list(filename=None,images=None,options=None):
         fMeta = open(filename,'w')
 
         # Write header stuff
-
+        titleString = '<title>Index page</title>'
+        if len(options['pageTitle']) > 0:
+            titleString = '<title>Index page for: {}</title>'.format(options['pageTitle'])
+        fMeta.write('<html><head>{}</head><body>\n'.format(titleString))
         fMeta.write(options['headerHtml'])
         fMeta.write('<table border = 0 cellpadding = 2>\n')
 
@@ -145,7 +156,7 @@ def write_html_image_list(filename=None,images=None,options=None):
         localImages = images[iStart:iEnd+1]
 
         localOptions = options.copy();
-        localOptions['headerHtml'] = '';
+        localOptions['headerHtml'] = options['subPageHeaderHtml'];
         localOptions['trailerHtml'] = '';
 
         # Make a recursive call for this image set
@@ -170,7 +181,11 @@ def write_html_image_list(filename=None,images=None,options=None):
     else:
         fHtml = options['fHtml']
 
-
+    titleString = ''
+    if len(options['pageTitle']) > 0:
+        titleString = '<title>{}</title>'.format(options['pageTitle'])
+
+    fHtml.write('<html>{}<body>\n'.format(titleString))
 
     fHtml.write(options['headerHtml'])
 
```
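The new pageTitle and subPageHeaderHtml options slot into the existing options dict: pageTitle feeds the `<title>` tags written above, and subPageHeaderHtml replaces headerHtml on the per-page files when the list is split via maxFiguresPerHtmlFile. A sketch with hypothetical content:

```python
from megadetector.utils.write_html_image_list import write_html_image_list

options = {
    # New in this release
    'pageTitle': 'Detection preview',
    'subPageHeaderHtml': '<p>Continued from the index page</p>',
    # Pre-existing options
    'headerHtml': '<h1>Detection preview</h1>',
    'maxFiguresPerHtmlFile': 100
}

write_html_image_list(filename='preview/index.html',
                      images=['images/im0001.jpg', 'images/im0002.jpg'],
                      options=options)
```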