megadetector 5.0.20__py3-none-any.whl → 5.0.21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of megadetector might be problematic.
- megadetector/data_management/importers/osu-small-animals-to-json.py +4 -4
- megadetector/data_management/yolo_output_to_md_output.py +18 -5
- megadetector/detection/video_utils.py +19 -7
- megadetector/postprocessing/combine_api_outputs.py +1 -1
- megadetector/postprocessing/detector_calibration.py +367 -0
- megadetector/postprocessing/md_to_coco.py +2 -1
- megadetector/postprocessing/postprocess_batch_results.py +32 -20
- megadetector/postprocessing/validate_batch_results.py +118 -58
- megadetector/utils/md_tests.py +14 -12
- megadetector/utils/path_utils.py +139 -30
- megadetector/utils/write_html_image_list.py +16 -5
- megadetector/visualization/visualization_utils.py +126 -23
- megadetector/visualization/visualize_db.py +104 -63
- {megadetector-5.0.20.dist-info → megadetector-5.0.21.dist-info}/METADATA +1 -1
- {megadetector-5.0.20.dist-info → megadetector-5.0.21.dist-info}/RECORD +18 -18
- {megadetector-5.0.20.dist-info → megadetector-5.0.21.dist-info}/WHEEL +1 -1
- megadetector/data_management/importers/prepare-noaa-fish-data-for-lila.py +0 -359
- {megadetector-5.0.20.dist-info → megadetector-5.0.21.dist-info}/LICENSE +0 -0
- {megadetector-5.0.20.dist-info → megadetector-5.0.21.dist-info}/top_level.txt +0 -0
megadetector/postprocessing/validate_batch_results.py CHANGED
@@ -42,11 +42,13 @@ class ValidateBatchResultsOptions:
         #:
         #: If None, assumes absolute paths.
         self.relative_path_base = None
+
+        #: Should we return the loaded data, or just the validation results?
+        self.return_data = False
 
 # ...class ValidateBatchResultsOptions
 
 
-
 #%% Main function
 
 def validate_batch_results(json_filename,options=None):
@@ -55,11 +57,17 @@ def validate_batch_results(json_filename,options=None):
 
     Args:
         json_filename (str): the filename to validate
-        options (ValidateBatchResultsOptions,
+        options (ValidateBatchResultsOptions, optional): all the parameters used to control this
             process, see ValidateBatchResultsOptions for details
 
     Returns:
-
+        dict: a dict with a field called "validation_results", which is itself a dict. The reason
+        it's a dict inside a dict is that if return_data is True, the outer dict also contains all
+        the loaded data. The "validation_results" dict contains fields called "errors", "warnings",
+        and "filename". "errors" and "warnings" are lists of strings, although "errors" will never
+        be longer than N=1, since validation fails at the first error.
+
+
     """
 
     if options is None:
@@ -68,75 +76,127 @@ def validate_batch_results(json_filename,options=None):
     with open(json_filename,'r') as f:
         d = json.load(f)
 
-
+    validation_results = {}
+    validation_results['filename'] = json_filename
+    validation_results['warnings'] = []
+    validation_results['errors'] = []
 
-
-
+    if not isinstance(d,dict):
+
+        validation_results['errors'].append('Input data is not a dict')
+        to_return = {}
+        to_return['validation_results'] = validation_results
+        return to_return
 
-
-
-
-
+    try:
+
+        ## Info validation
+
+        if not 'info' in d:
+            raise ValueError('Input does not contain info field')
 
-
-
-
-
-
-
-
-
-
-
+        info = d['info']
+
+        if not isinstance(info,dict):
+            raise ValueError('Input contains invalid info field')
+
+        if 'format_version' not in info :
+            raise ValueError('Input does not specify format version')
+
+        format_version = float(info['format_version'])
+        if format_version < 1.3:
+            raise ValueError('This validator can only be used with format version 1.3 or later')
 
-
-
-
-
+
+        ## Category validation
+
+        if 'detection_categories' not in d:
+            raise ValueError('Input does not contain detection_categories field')
+
+        for k in d['detection_categories'].keys():
+            # Category ID should be string-formatted ints
+            if not isinstance(k,str):
+                raise ValueError('Invalid detection category ID: {}'.format(k))
             _ = int(k)
-
-
-
-
-
-
-
-
-
-
+            if not isinstance(d['detection_categories'][k],str):
+                raise ValueError('Invalid detection category name: {}'.format(
+                    d['detection_categories'][k]))
+
+        if 'classification_categories' in d:
+            for k in d['classification_categories'].keys():
+                # Categories should be string-formatted ints
+                if not isinstance(k,str):
+                    raise ValueError('Invalid classification category ID: {}'.format(k))
+                _ = int(k)
+                if not isinstance(d['classification_categories'][k],str):
+                    raise ValueError('Invalid classification category name: {}'.format(
+                        d['classification_categories'][k]))
 
-        assert isinstance(im,dict)
-        assert 'file' in im
 
-
+        ## Image validation
 
-        if
-
-
-
-
-
+        if 'images' not in d:
+            raise ValueError('images field not present')
+        if not isinstance(d['images'],list):
+            raise ValueError('Invalid images field')
+
+        # im = d['images'][0]
+        for i_im,im in enumerate(d['images']):
 
-
-
-
-
+            if not isinstance(im,dict):
+                raise ValueError('Invalid image at index {}'.format(i_im))
+            if 'file' not in im:
+                raise ValueError('Image without filename at index {}'.format(i_im))
 
-
-            assert 'frame_rate' in im
-            if 'detections' in im and im['detections'] is not None:
-                for det in im['detections']:
-                    assert 'frame_number' in det
+            file = im['file']
 
-
+            if options.check_image_existence:
+                if options.relative_path_base is None:
+                    file_abs = file
+                else:
+                    file_abs = os.path.join(options.relative_path_base,file)
+                if not os.path.isfile(file_abs):
+                    raise ValueError('Cannot find file {}'.format(file_abs))
+
+            if ('detections' not in im) or (im['detections'] is None):
+                if not ('failure' in im and isinstance(im['failure'],str)):
+                    raise ValueError('Image {} has no detections and no failure'.format(im['file']))
+            else:
+                if not isinstance(im['detections'],list):
+                    raise ValueError('Invalid detections list for image {}'.format(im['file']))
+
+            if is_video_file(im['file']) and (format_version >= 1.4):
+                if 'frame_rate' not in im:
+                    raise ValueError('Video without frame rate: {}'.format(im['file']))
+                if 'detections' in im and im['detections'] is not None:
+                    for det in im['detections']:
+                        if 'frame_number' not in det:
+                            raise ValueError('Frame without frame number in video {}'.format(
+                                im['file']))
+
+        # ...for each image
+
+
+        ## Checking on other keys
+
+        for k in d.keys():
+            if (k not in typical_keys) and (k not in required_keys):
+                validation_results['warnings'].append(
+                    'Warning: non-standard key {} present at file level'.format(k))
 
+    except Exception as e:
+
+        validation_results['errors'].append(str(e))
+
+    if options.return_data:
+        to_return = d
+    else:
+        to_return = {}
 
-
+    to_return['validation_results'] = validation_results
 
-
-
-            print('Warning: non-standard key {} present at file level'.format(k))
-
+    return to_return
+
 # ...def validate_batch_results(...)
 
 
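For context, a minimal sketch of how the reworked validator might be called; the option and field names come from the diff above, while the filenames and surrounding script are illustrative only:

    from megadetector.postprocessing.validate_batch_results import (
        ValidateBatchResultsOptions, validate_batch_results)

    options = ValidateBatchResultsOptions()
    options.check_image_existence = True
    options.relative_path_base = '/datasets/camera-traps'  # hypothetical image root
    options.return_data = False

    # 'md-results.json' is a hypothetical MD-format results file
    results = validate_batch_results('md-results.json', options)

    # With the new behavior, validation failures no longer raise or assert; the
    # first error (if any) and all warnings land in the returned dict.
    for s in results['validation_results']['errors']:
        print('Error: {}'.format(s))
    for s in results['validation_results']['warnings']:
        print('Warning: {}'.format(s))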
megadetector/utils/md_tests.py CHANGED
@@ -29,10 +29,6 @@ import subprocess
 import argparse
 import inspect
 
-#: IoU threshold used to determine whether boxes in two detection files likely correspond
-#: to the same box.
-iou_threshold_for_file_comparison = 0.9
-
 
 #%% Classes
 
@@ -106,6 +102,10 @@ class MDTestOptions:
         #: PYTHONPATH to set for CLI tests; if None, inherits from the parent process. Only
         #: impacts the called functions, not the parent process.
         self.cli_test_pythonpath = None
+
+        #: IoU threshold used to determine whether boxes in two detection files likely correspond
+        #: to the same box.
+        self.iou_threshold_for_file_comparison = 0.85
 
 # ...class MDTestOptions()
 
@@ -410,7 +410,7 @@ def compare_detection_lists(detections_a,detections_b,options,bidirectional_comp
             iou = get_iou(det_a['bbox'],b_det['bbox'])
 
             # Is this likely the same detection as det_a?
-            if iou >= iou_threshold_for_file_comparison and iou > highest_iou:
+            if iou >= options.iou_threshold_for_file_comparison and iou > highest_iou:
                 matching_det_b = b_det
                 highest_iou = iou
 
@@ -529,12 +529,14 @@ def compare_results(inference_output_file,expected_results_file,options):
     if not options.warning_mode:
 
         assert max_conf_error <= options.max_conf_error, \
-            'Confidence error {} is greater than allowable ({}), on file:\n{}'.format(
-            max_conf_error,options.max_conf_error,max_conf_error_file
+            'Confidence error {} is greater than allowable ({}), on file:\n{} ({},{})'.format(
+            max_conf_error,options.max_conf_error,max_conf_error_file,
+            inference_output_file,expected_results_file)
 
         assert max_coord_error <= options.max_coord_error, \
-            'Coord error {} is greater than allowable ({}), on file:\n{}'.format(
-            max_coord_error,options.max_coord_error,max_coord_error_file
+            'Coord error {} is greater than allowable ({}), on file:\n{} ({},{})'.format(
+            max_coord_error,options.max_coord_error,max_coord_error_file,
+            inference_output_file,expected_results_file)
 
     print('Max conf error: {} (file {})'.format(
         max_conf_error,max_conf_error_file))
@@ -847,7 +849,7 @@ def run_python_tests(options):
     video_options.frame_rendering_folder = os.path.join(options.scratch_dir,'video_scratch/rendered_frame_folder')
     video_options.render_output_video = True
     # video_options.keep_rendered_frames = False
-    # video_options.
+    # video_options.keep_extracted_frames = False
     video_options.force_extracted_frame_folder_deletion = True
     video_options.force_rendered_frame_folder_deletion = True
     # video_options.reuse_results_if_available = False
@@ -887,7 +889,7 @@ def run_python_tests(options):
     video_options.frame_rendering_folder = os.path.join(options.scratch_dir,'video_scratch/rendered_frame_folder')
     video_options.render_output_video = False
     video_options.keep_rendered_frames = False
-    video_options.
+    video_options.keep_extracted_frames = False
     video_options.force_extracted_frame_folder_deletion = True
     video_options.force_rendered_frame_folder_deletion = True
     video_options.reuse_results_if_available = False
@@ -1353,7 +1355,7 @@ if False:
     # options.cli_working_dir = r'c:\git\MegaDetector'
     # options.yolo_working_dir = r'c:\git\yolov5-md'
     options.cli_working_dir = os.path.expanduser('~')
-    options.yolo_working_dir = '/mnt/c/git/yolov5-md'
+    # options.yolo_working_dir = '/mnt/c/git/yolov5-md'
     options = download_test_data(options)
 
     #%%
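The IoU threshold used when matching boxes across two results files is now an instance option on MDTestOptions rather than a module-level constant (with the default loosened from 0.9 to 0.85), so it can be tuned per run. A sketch, assuming the usual no-argument options constructor:

    from megadetector.utils.md_tests import MDTestOptions

    options = MDTestOptions()
    # Restore the old, stricter matching behavior for this run only
    options.iou_threshold_for_file_comparison = 0.9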
megadetector/utils/path_utils.py CHANGED
@@ -17,6 +17,7 @@ import platform
 import string
 import json
 import shutil
+import hashlib
 import unicodedata
 import zipfile
 import tarfile
@@ -236,6 +237,30 @@ def path_is_abs(p):
     return (len(p) > 1) and (p[0] == '/' or p[1] == ':' or p[0] == '\\')
 
 
+def safe_create_link(link_exists,link_new):
+    """
+    Creates a symlink at [link_new] pointing to [link_exists].
+
+    If [link_new] already exists, make sure it's a link (not a file),
+    and if it has a different target than [link_exists], removes and re-creates
+    it.
+
+    Errors if [link_new] already exists but it's not a link.
+
+    Args:
+        link_exists (str): the source of the (possibly-new) symlink
+        link_new (str): the target of the (possibly-new) symlink
+    """
+
+    if os.path.exists(link_new) or os.path.islink(link_new):
+        assert os.path.islink(link_new)
+        if not os.readlink(link_new) == link_exists:
+            os.remove(link_new)
+            os.symlink(link_exists,link_new)
+    else:
+        os.symlink(link_exists,link_new)
+
+
 def top_level_folder(p):
     r"""
     Gets the top-level folder from the path *p*.
@@ -296,31 +321,6 @@ if False:
     p = r'c:/foo'; s = top_level_folder(p); print(s); assert s == 'c:/foo'
     p = r'c:\foo/bar'; s = top_level_folder(p); print(s); assert s == 'c:\\foo'
 
-#%%
-
-def safe_create_link(link_exists,link_new):
-    """
-    Creates a symlink at [link_new] pointing to [link_exists].
-
-    If [link_new] already exists, make sure it's a link (not a file),
-    and if it has a different target than [link_exists], removes and re-creates
-    it.
-
-    Errors if [link_new] already exists but it's not a link.
-
-    Args:
-        link_exists (str): the source of the (possibly-new) symlink
-        link_new (str): the target of the (possibly-new) symlink
-    """
-
-    if os.path.exists(link_new) or os.path.islink(link_new):
-        assert os.path.islink(link_new)
-        if not os.readlink(link_new) == link_exists:
-            os.remove(link_new)
-            os.symlink(link_exists,link_new)
-    else:
-        os.symlink(link_exists,link_new)
-
 
 #%% Image-related path functions
 
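safe_create_link itself is unchanged; it just moved out of the interactive test scaffolding (the `if False:` block) into the module body, so it is now importable. A minimal sketch, with hypothetical paths:

    from megadetector.utils.path_utils import safe_create_link

    # Idempotent: re-running with the same target is a no-op, a stale link
    # pointing elsewhere is removed and re-created, and a regular file at
    # the link location trips the assertion.
    safe_create_link('/data/images/site-a', '/data/current-site')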
@@ -598,7 +598,9 @@ def open_file(filename, attempt_to_open_in_wsl_host=False, browser_name=None):
 
         opener = 'xdg-open'
         subprocess.call([opener, filename])
-
+
+# ...def open_file(...)
+
 
 #%% File list functions
 
@@ -649,8 +651,12 @@ def _copy_file(input_output_tuple,overwrite=True,verbose=False):
     target_fn = input_output_tuple[1]
     if (not overwrite) and (os.path.isfile(target_fn)):
         if verbose:
-            print('Skipping existing file {}'.format(target_fn))
-        return
+            print('Skipping existing target file {}'.format(target_fn))
+        return
+
+    if verbose:
+        print('Copying to target file {}'.format(target_fn))
+
     os.makedirs(os.path.dirname(target_fn),exist_ok=True)
     shutil.copyfile(source_fn,target_fn)
 
@@ -667,7 +673,7 @@ def parallel_copy_files(input_file_to_output_file, max_workers=16,
         use_threads (bool, optional): whether to use threads (True) or processes (False) for
             parallel copying; ignored if max_workers <= 1
         overwrite (bool, optional): whether to overwrite existing destination files
-        verbose (bool, optional): enable
+        verbose (bool, optional): enable additional debug output
     """
 
     n_workers = min(max_workers,len(input_file_to_output_file))
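For reference, a sketch of parallel_copy_files showing the now-documented verbose flag interacting with the new per-file messages in _copy_file; the file mapping is hypothetical:

    from megadetector.utils.path_utils import parallel_copy_files

    # Maps source paths to destination paths (hypothetical filenames)
    input_file_to_output_file = {
        '/src/img_0001.jpg': '/dst/img_0001.jpg',
        '/src/img_0002.jpg': '/dst/img_0002.jpg'
    }
    parallel_copy_files(input_file_to_output_file,
                        max_workers=8,
                        use_threads=True,
                        overwrite=False,  # existing targets are skipped (and reported if verbose)
                        verbose=True)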
@@ -750,7 +756,7 @@ def parallel_get_file_sizes(filenames,
         max_workers (int, optional): number of concurrent workers; set to <=1 to disable parallelism
         use_threads (bool, optional): whether to use threads (True) or processes (False) for
             parallel copying; ignored if max_workers <= 1
-        verbose (bool, optional): enable
+        verbose (bool, optional): enable additional debug output
         recursive (bool, optional): enumerate recursively, only relevant if [filenames] is a folder.
         convert_slashes (bool, optional): convert backslashes to forward slashes
         return_relative_paths (bool, optional): return relative paths; only relevant if [filenames]
@@ -804,6 +810,8 @@ def parallel_get_file_sizes(filenames,
 
     return to_return
 
+# ...def parallel_get_file_sizes(...)
+
 
 #%% Zip functions
 
@@ -1075,3 +1083,104 @@ def unzip_file(input_file, output_folder=None):
 
     with zipfile.ZipFile(input_file, 'r') as zf:
         zf.extractall(output_folder)
+
+
+#%% File hashing functions
+
+def compute_file_hash(file_path, algorithm='sha256', allow_failures=True):
+    """
+    Compute the hash of a file.
+
+    Adapted from:
+
+    https://www.geeksforgeeks.org/python-program-to-find-hash-of-file/
+
+    Args:
+        file_path (str): the file to hash
+        algorithm (str, optional): the hashing algorithm to use (e.g. md5, sha256)
+
+    Returns:
+        str: the hash value for this file
+    """
+
+    try:
+
+        hash_func = hashlib.new(algorithm)
+
+        with open(file_path, 'rb') as file:
+            while chunk := file.read(8192): # Read the file in chunks of 8192 bytes
+                hash_func.update(chunk)
+
+        return str(hash_func.hexdigest())
+
+    except Exception:
+
+        if allow_failures:
+            return None
+        else:
+            raise
+
+# ...def compute_file_hash(...)
+
+
+def parallel_compute_file_hashes(filenames,
+                                 max_workers=16,
+                                 use_threads=True,
+                                 recursive=True,
+                                 algorithm='sha256',
+                                 verbose=False):
+    """
+    Compute file hashes for a list or folder of images.
+
+    Args:
+        filenames (list or str): a list of filenames or a folder
+        max_workers (int, optional): the number of parallel workers to use; set to <=1 to disable
+            parallelization
+        use_threads (bool, optional): whether to use threads (True) or processes (False) for
+            parallelization
+        algorithm (str, optional): the hashing algorithm to use (e.g. md5, sha256)
+        recursive (bool, optional): if [filenames] is a folder, whether to enumerate recursively.
+            Ignored if [filenames] is a list.
+        verbose (bool, optional): enable additional debug output
+
+    Returns:
+        dict: a dict mapping filenames to hash values; values will be None for files that fail
+        to load.
+    """
+
+    if isinstance(filenames,str) and os.path.isdir(filenames):
+        if verbose:
+            print('Enumerating files in {}'.format(filenames))
+        filenames = recursive_file_list(filenames,recursive=recursive,return_relative_paths=False)
+
+    n_workers = min(max_workers,len(filenames))
+
+    if verbose:
+        print('Computing hashes for {} files on {} workers'.format(len(filenames),n_workers))
+
+    if n_workers <= 1:
+
+        results = []
+        for filename in filenames:
+            results.append(compute_file_hash(filename,algorithm=algorithm,allow_failures=True))
+
+    else:
+
+        if use_threads:
+            pool = ThreadPool(n_workers)
+        else:
+            pool = Pool(n_workers)
+
+        results = list(tqdm(pool.imap(
+            partial(compute_file_hash,algorithm=algorithm,allow_failures=True),
+            filenames), total=len(filenames)))
+
+    assert len(filenames) == len(results), 'Internal error in parallel_compute_file_hashes'
+
+    to_return = {}
+    for i_file,filename in enumerate(filenames):
+        to_return[filename] = results[i_file]
+
+    return to_return
+
+# ...def parallel_compute_file_hashes(...)
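The new hashing helpers can be exercised as below; the paths are hypothetical, while both signatures are taken directly from the added code:

    from megadetector.utils.path_utils import (
        compute_file_hash, parallel_compute_file_hashes)

    # Single file; returns None on I/O failure because allow_failures defaults to True
    h = compute_file_hash('/data/images/img_0001.jpg', algorithm='md5')

    # Hash a whole folder recursively on 16 worker threads; the result maps
    # each filename to its hash (None for files that failed to load)
    filename_to_hash = parallel_compute_file_hashes('/data/images',
                                                    max_workers=16,
                                                    use_threads=True,
                                                    recursive=True,
                                                    algorithm='sha256',
                                                    verbose=True)
    failed_files = [fn for fn in filename_to_hash if filename_to_hash[fn] is None]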
megadetector/utils/write_html_image_list.py CHANGED
@@ -42,6 +42,7 @@ def write_html_image_list(filename=None,images=None,options=None):
         options (dict, optional): a dict with one or more of the following fields:
 
             - fHtml (file pointer to write to, used for splitting write operations over multiple calls)
+            - pageTitle (HTML page title)
             - headerHtml (html text to include before the image list)
             - trailerHtml (html text to include after the image list)
             - defaultImageStyle (default css style for images)
@@ -60,11 +61,14 @@ def write_html_image_list(filename=None,images=None,options=None):
     if 'fHtml' not in options:
         options['fHtml'] = -1
 
+    if 'pageTitle' not in options or options['pageTitle'] is None:
+        options['pageTitle'] = ''
+
     if 'headerHtml' not in options or options['headerHtml'] is None:
-        options['headerHtml'] = ''
+        options['headerHtml'] = ''
 
     if 'trailerHtml' not in options or options['trailerHtml'] is None:
-        options['trailerHtml'] = ''
+        options['trailerHtml'] = ''
 
     if 'defaultTextStyle' not in options or options['defaultTextStyle'] is None:
         options['defaultTextStyle'] = \
@@ -114,7 +118,7 @@ def write_html_image_list(filename=None,images=None,options=None):
         # You can't supply your own file handle in this case
         if options['fHtml'] != -1:
             raise ValueError(
-
+                "You can't supply your own file handle if we have to page the image set")
 
         figureFileStartingIndices = list(range(0,nImages,options['maxFiguresPerHtmlFile']))
 
@@ -124,7 +128,10 @@ def write_html_image_list(filename=None,images=None,options=None):
         fMeta = open(filename,'w')
 
         # Write header stuff
-
+        titleString = '<title>Index page</title>'
+        if len(options['pageTitle']) > 0:
+            titleString = '<title>Index page for: {}</title>'.format(options['pageTitle'])
+        fMeta.write('<html><head>{}</head><body>\n'.format(titleString))
         fMeta.write(options['headerHtml'])
         fMeta.write('<table border = 0 cellpadding = 2>\n')
 
@@ -170,7 +177,11 @@ def write_html_image_list(filename=None,images=None,options=None):
     else:
         fHtml = options['fHtml']
 
-
+    titleString = ''
+    if len(options['pageTitle']) > 0:
+        titleString = '<title>{}</title>'.format(options['pageTitle'])
+
+    fHtml.write('<html>{}<body>\n'.format(titleString))
 
     fHtml.write(options['headerHtml'])
 
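Finally, a sketch of the new pageTitle option, which becomes the HTML title of the generated page (including each sub-page when the list is split across multiple HTML files). The filenames are hypothetical, and the entries of [images] are assumed here to be plain image filenames:

    from megadetector.utils.write_html_image_list import write_html_image_list

    options = {'pageTitle': 'Detection review',
               'headerHtml': '<h1>Detection review</h1>'}
    write_html_image_list(filename='index.html',
                          images=['img_0001.jpg', 'img_0002.jpg'],
                          options=options)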