megadetector 5.0.6__py3-none-any.whl → 5.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of megadetector might be problematic. Click here for more details.
- api/batch_processing/data_preparation/manage_local_batch.py +297 -202
- api/batch_processing/data_preparation/manage_video_batch.py +7 -2
- api/batch_processing/postprocessing/add_max_conf.py +1 -0
- api/batch_processing/postprocessing/combine_api_outputs.py +2 -2
- api/batch_processing/postprocessing/compare_batch_results.py +111 -61
- api/batch_processing/postprocessing/convert_output_format.py +24 -6
- api/batch_processing/postprocessing/load_api_results.py +56 -72
- api/batch_processing/postprocessing/md_to_labelme.py +119 -51
- api/batch_processing/postprocessing/merge_detections.py +30 -5
- api/batch_processing/postprocessing/postprocess_batch_results.py +175 -55
- api/batch_processing/postprocessing/remap_detection_categories.py +163 -0
- api/batch_processing/postprocessing/render_detection_confusion_matrix.py +628 -0
- api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +71 -23
- api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +1 -1
- api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +224 -76
- api/batch_processing/postprocessing/subset_json_detector_output.py +132 -5
- api/batch_processing/postprocessing/top_folders_to_bottom.py +1 -1
- classification/prepare_classification_script.py +191 -191
- data_management/cct_json_utils.py +7 -2
- data_management/coco_to_labelme.py +263 -0
- data_management/coco_to_yolo.py +72 -48
- data_management/databases/integrity_check_json_db.py +75 -64
- data_management/databases/subset_json_db.py +1 -1
- data_management/generate_crops_from_cct.py +1 -1
- data_management/get_image_sizes.py +44 -26
- data_management/importers/animl_results_to_md_results.py +3 -5
- data_management/importers/noaa_seals_2019.py +2 -2
- data_management/importers/zamba_results_to_md_results.py +2 -2
- data_management/labelme_to_coco.py +264 -127
- data_management/labelme_to_yolo.py +96 -53
- data_management/lila/create_lila_blank_set.py +557 -0
- data_management/lila/create_lila_test_set.py +2 -1
- data_management/lila/create_links_to_md_results_files.py +1 -1
- data_management/lila/download_lila_subset.py +138 -45
- data_management/lila/generate_lila_per_image_labels.py +23 -14
- data_management/lila/get_lila_annotation_counts.py +16 -10
- data_management/lila/lila_common.py +15 -42
- data_management/lila/test_lila_metadata_urls.py +116 -0
- data_management/read_exif.py +65 -16
- data_management/remap_coco_categories.py +84 -0
- data_management/resize_coco_dataset.py +14 -31
- data_management/wi_download_csv_to_coco.py +239 -0
- data_management/yolo_output_to_md_output.py +40 -13
- data_management/yolo_to_coco.py +313 -100
- detection/process_video.py +36 -14
- detection/pytorch_detector.py +1 -1
- detection/run_detector.py +73 -18
- detection/run_detector_batch.py +116 -27
- detection/run_inference_with_yolov5_val.py +135 -27
- detection/run_tiled_inference.py +153 -43
- detection/tf_detector.py +2 -1
- detection/video_utils.py +4 -2
- md_utils/ct_utils.py +101 -6
- md_utils/md_tests.py +264 -17
- md_utils/path_utils.py +326 -47
- md_utils/process_utils.py +26 -7
- md_utils/split_locations_into_train_val.py +215 -0
- md_utils/string_utils.py +10 -0
- md_utils/url_utils.py +66 -3
- md_utils/write_html_image_list.py +12 -2
- md_visualization/visualization_utils.py +380 -74
- md_visualization/visualize_db.py +41 -10
- md_visualization/visualize_detector_output.py +185 -104
- {megadetector-5.0.6.dist-info → megadetector-5.0.8.dist-info}/METADATA +11 -13
- {megadetector-5.0.6.dist-info → megadetector-5.0.8.dist-info}/RECORD +74 -67
- {megadetector-5.0.6.dist-info → megadetector-5.0.8.dist-info}/WHEEL +1 -1
- taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +1 -1
- taxonomy_mapping/map_new_lila_datasets.py +43 -39
- taxonomy_mapping/prepare_lila_taxonomy_release.py +5 -2
- taxonomy_mapping/preview_lila_taxonomy.py +27 -27
- taxonomy_mapping/species_lookup.py +33 -13
- taxonomy_mapping/taxonomy_csv_checker.py +7 -5
- md_visualization/visualize_megadb.py +0 -183
- {megadetector-5.0.6.dist-info → megadetector-5.0.8.dist-info}/LICENSE +0 -0
- {megadetector-5.0.6.dist-info → megadetector-5.0.8.dist-info}/top_level.txt +0 -0
md_utils/path_utils.py
CHANGED
|
@@ -12,16 +12,23 @@
|
|
|
12
12
|
import glob
|
|
13
13
|
import ntpath
|
|
14
14
|
import os
|
|
15
|
+
import sys
|
|
16
|
+
import platform
|
|
15
17
|
import posixpath
|
|
16
18
|
import string
|
|
17
19
|
import json
|
|
20
|
+
import shutil
|
|
18
21
|
import unicodedata
|
|
19
22
|
import zipfile
|
|
23
|
+
import webbrowser
|
|
24
|
+
import subprocess
|
|
25
|
+
import re
|
|
20
26
|
|
|
21
27
|
from zipfile import ZipFile
|
|
22
28
|
from datetime import datetime
|
|
23
29
|
from typing import Container, Iterable, List, Optional, Tuple, Sequence
|
|
24
|
-
from multiprocessing.pool import ThreadPool
|
|
30
|
+
from multiprocessing.pool import Pool, ThreadPool
|
|
31
|
+
from functools import partial
|
|
25
32
|
from tqdm import tqdm
|
|
26
33
|
|
|
27
34
|
IMG_EXTENSIONS = ('.jpg', '.jpeg', '.gif', '.png', '.tif', '.tiff', '.bmp')
|
|
@@ -34,31 +41,53 @@ CHAR_LIMIT = 255
|
|
|
34
41
|
|
|
35
42
|
#%% General path functions
|
|
36
43
|
|
|
37
|
-
def recursive_file_list(base_dir, convert_slashes=True,
|
|
38
|
-
|
|
44
|
+
def recursive_file_list(base_dir, convert_slashes=True,
|
|
45
|
+
return_relative_paths=False, sort_files=True,
|
|
46
|
+
recursive=True):
|
|
47
|
+
r"""
|
|
39
48
|
Enumerate files (not directories) in [base_dir], optionally converting
|
|
40
49
|
\ to /
|
|
41
50
|
"""
|
|
42
51
|
|
|
52
|
+
assert os.path.isdir(base_dir), '{} is not a folder'.format(base_dir)
|
|
53
|
+
|
|
43
54
|
all_files = []
|
|
44
55
|
|
|
45
|
-
|
|
46
|
-
for
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
56
|
+
if recursive:
|
|
57
|
+
for root, _, filenames in os.walk(base_dir):
|
|
58
|
+
for filename in filenames:
|
|
59
|
+
full_path = os.path.join(root, filename)
|
|
60
|
+
all_files.append(full_path)
|
|
61
|
+
else:
|
|
62
|
+
all_files_relative = os.listdir(base_dir)
|
|
63
|
+
all_files = [os.path.join(base_dir,fn) for fn in all_files_relative]
|
|
64
|
+
all_files = [fn for fn in all_files if os.path.isfile(fn)]
|
|
65
|
+
|
|
50
66
|
if return_relative_paths:
|
|
51
67
|
all_files = [os.path.relpath(fn,base_dir) for fn in all_files]
|
|
52
68
|
|
|
53
69
|
if convert_slashes:
|
|
54
70
|
all_files = [fn.replace('\\', '/') for fn in all_files]
|
|
71
|
+
|
|
72
|
+
if sort_files:
|
|
73
|
+
all_files = sorted(all_files)
|
|
55
74
|
|
|
56
|
-
all_files = sorted(all_files)
|
|
57
75
|
return all_files
|
|
58
76
|
|
|
59
77
|
|
|
60
|
-
def
|
|
78
|
+
def file_list(base_dir, convert_slashes=True, return_relative_paths=False, sort_files=True,
|
|
79
|
+
recursive=False):
|
|
61
80
|
"""
|
|
81
|
+
Trivial wrapper for recursive_file_list, which was a poor function name choice at the time,
|
|
82
|
+
it doesn't really make sense to have a "recursive" option in a function called "recursive_file_list".
|
|
83
|
+
"""
|
|
84
|
+
|
|
85
|
+
return recursive_file_list(base_dir,convert_slashes,return_relative_paths,sort_files,
|
|
86
|
+
recursive=recursive)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def split_path(path: str) -> List[str]:
|
|
90
|
+
r"""
|
|
62
91
|
Splits [path] into all its constituent tokens.
|
|
63
92
|
|
|
64
93
|
Non-recursive version of:
|
|
@@ -88,7 +117,7 @@ def split_path(path: str) -> List[str]:
|
|
|
88
117
|
|
|
89
118
|
|
|
90
119
|
def fileparts(path: str) -> Tuple[str, str, str]:
|
|
91
|
-
"""
|
|
120
|
+
r"""
|
|
92
121
|
Breaks down a path into the directory path, filename, and extension.
|
|
93
122
|
|
|
94
123
|
Note that the '.' lives with the extension, and separators are removed.
|
|
@@ -187,7 +216,8 @@ def safe_create_link(link_exists,link_new):
|
|
|
187
216
|
it.
|
|
188
217
|
|
|
189
218
|
Errors if link_new already exists but it's not a link.
|
|
190
|
-
"""
|
|
219
|
+
"""
|
|
220
|
+
|
|
191
221
|
if os.path.exists(link_new) or os.path.islink(link_new):
|
|
192
222
|
assert os.path.islink(link_new)
|
|
193
223
|
if not os.readlink(link_new) == link_exists:
|
|
@@ -197,23 +227,6 @@ def safe_create_link(link_exists,link_new):
|
|
|
197
227
|
os.symlink(link_exists,link_new)
|
|
198
228
|
|
|
199
229
|
|
|
200
|
-
def get_file_sizes(base_dir, convert_slashes=True):
|
|
201
|
-
"""
|
|
202
|
-
Get sizes recursively for all files in base_dir, returning a dict mapping
|
|
203
|
-
relative filenames to size.
|
|
204
|
-
"""
|
|
205
|
-
|
|
206
|
-
relative_filenames = recursive_file_list(base_dir, convert_slashes=convert_slashes,
|
|
207
|
-
return_relative_paths=True)
|
|
208
|
-
|
|
209
|
-
fn_to_size = {}
|
|
210
|
-
for fn_relative in tqdm(relative_filenames):
|
|
211
|
-
fn_abs = os.path.join(base_dir,fn_relative)
|
|
212
|
-
fn_to_size[fn_relative] = os.path.getsize(fn_abs)
|
|
213
|
-
|
|
214
|
-
return fn_to_size
|
|
215
|
-
|
|
216
|
-
|
|
217
230
|
#%% Image-related path functions
|
|
218
231
|
|
|
219
232
|
def is_image_file(s: str, img_extensions: Container[str] = IMG_EXTENSIONS
|
|
@@ -240,14 +253,17 @@ def find_image_strings(strings: Iterable[str]) -> List[str]:
|
|
|
240
253
|
|
|
241
254
|
|
|
242
255
|
def find_images(dirname: str, recursive: bool = False,
|
|
243
|
-
return_relative_paths: bool = False,
|
|
256
|
+
return_relative_paths: bool = False,
|
|
257
|
+
convert_slashes: bool = False) -> List[str]:
|
|
244
258
|
"""
|
|
245
259
|
Finds all files in a directory that look like image file names. Returns
|
|
246
260
|
absolute paths unless return_relative_paths is set. Uses the OS-native
|
|
247
|
-
path separator unless
|
|
261
|
+
path separator unless convert_slashes is set, in which case will always
|
|
248
262
|
use '/'.
|
|
249
263
|
"""
|
|
250
264
|
|
|
265
|
+
assert os.path.isdir(dirname), '{} is not a folder'.format(dirname)
|
|
266
|
+
|
|
251
267
|
if recursive:
|
|
252
268
|
strings = glob.glob(os.path.join(dirname, '**', '*.*'), recursive=True)
|
|
253
269
|
else:
|
|
@@ -270,11 +286,11 @@ def find_images(dirname: str, recursive: bool = False,
|
|
|
270
286
|
|
|
271
287
|
def clean_filename(filename: str, allow_list: str = VALID_FILENAME_CHARS,
|
|
272
288
|
char_limit: int = CHAR_LIMIT, force_lower: bool = False) -> str:
|
|
273
|
-
"""
|
|
289
|
+
r"""
|
|
274
290
|
Removes non-ASCII and other invalid filename characters (on any
|
|
275
291
|
reasonable OS) from a filename, then trims to a maximum length.
|
|
276
292
|
|
|
277
|
-
Does not allow
|
|
293
|
+
Does not allow :\/ by default, use clean_path if you want to preserve those.
|
|
278
294
|
|
|
279
295
|
Adapted from
|
|
280
296
|
https://gist.github.com/wassname/1393c4a57cfcbf03641dbc31886123b8
|
|
@@ -319,15 +335,91 @@ def flatten_path(pathname: str, separator_chars: str = SEPARATOR_CHARS) -> str:
|
|
|
319
335
|
|
|
320
336
|
#%% Platform-independent way to open files in their associated application
|
|
321
337
|
|
|
322
|
-
|
|
338
|
+
def environment_is_wsl():
|
|
339
|
+
"""
|
|
340
|
+
Returns True if we're running in WSL
|
|
341
|
+
"""
|
|
342
|
+
|
|
343
|
+
if sys.platform not in ('linux','posix'):
|
|
344
|
+
return False
|
|
345
|
+
platform_string = ' '.join(platform.uname()).lower()
|
|
346
|
+
return 'microsoft' in platform_string and 'wsl' in platform_string
|
|
347
|
+
|
|
323
348
|
|
|
324
|
-
def
|
|
325
|
-
|
|
349
|
+
def wsl_path_to_windows_path(filename):
|
|
350
|
+
"""
|
|
351
|
+
Converts a WSL path to a Windows path, or returns None if that's not possible. E.g.
|
|
352
|
+
converts:
|
|
353
|
+
|
|
354
|
+
/mnt/e/a/b/c
|
|
355
|
+
|
|
356
|
+
...to:
|
|
357
|
+
|
|
358
|
+
e:\a\b\c
|
|
359
|
+
"""
|
|
360
|
+
|
|
361
|
+
result = subprocess.run(['wslpath', '-w', filename], text=True, capture_output=True)
|
|
362
|
+
if result.returncode != 0:
|
|
363
|
+
print('Could not convert path {} from WSL to Windows'.format(filename))
|
|
364
|
+
return None
|
|
365
|
+
return result.stdout.strip()
|
|
366
|
+
|
|
367
|
+
|
|
368
|
+
def open_file(filename, attempt_to_open_in_wsl_host=False, browser_name=None):
|
|
369
|
+
"""
|
|
370
|
+
Opens [filename] in the default OS file handler for this file type.
|
|
371
|
+
|
|
372
|
+
If attempt_to_open_in_wsl_host is True, and we're in WSL, attempts to open
|
|
373
|
+
[filename] in the Windows host environment.
|
|
374
|
+
|
|
375
|
+
If browser_name is not None, uses the webbrowser module to open the filename
|
|
376
|
+
in the specified browser; see https://docs.python.org/3/library/webbrowser.html
|
|
377
|
+
for supported browsers. Falls back to the default file handler if webbrowser.open()
|
|
378
|
+
fails. In this case, attempt_to_open_in_wsl_host is ignored unless webbrowser.open() fails.
|
|
379
|
+
|
|
380
|
+
If browser_name is 'default', use the system default. This is different from the
|
|
381
|
+
parameter to webbrowser.get(), where None implies the system default.
|
|
382
|
+
"""
|
|
383
|
+
|
|
384
|
+
if browser_name is not None:
|
|
385
|
+
if browser_name == 'chrome':
|
|
386
|
+
browser_name = 'google-chrome'
|
|
387
|
+
elif browser_name == 'default':
|
|
388
|
+
browser_name = None
|
|
389
|
+
try:
|
|
390
|
+
result = webbrowser.get(using=browser_name).open(filename)
|
|
391
|
+
except Exception:
|
|
392
|
+
result = False
|
|
393
|
+
if result:
|
|
394
|
+
return
|
|
395
|
+
|
|
396
|
+
if sys.platform == 'win32':
|
|
397
|
+
|
|
326
398
|
os.startfile(filename)
|
|
399
|
+
|
|
400
|
+
elif sys.platform == 'darwin':
|
|
401
|
+
|
|
402
|
+
opener = 'open'
|
|
403
|
+
subprocess.call([opener, filename])
|
|
404
|
+
|
|
405
|
+
elif attempt_to_open_in_wsl_host and environment_is_wsl():
|
|
406
|
+
|
|
407
|
+
windows_path = wsl_path_to_windows_path(filename)
|
|
408
|
+
|
|
409
|
+
# Fall back to xdg-open
|
|
410
|
+
if windows_path is None:
|
|
411
|
+
subprocess.call(['xdg-open', filename])
|
|
412
|
+
|
|
413
|
+
if os.path.isdir(filename):
|
|
414
|
+
subprocess.run(["explorer.exe", windows_path])
|
|
415
|
+
else:
|
|
416
|
+
os.system("cmd.exe /C start %s" % (re.escape(windows_path)))
|
|
417
|
+
|
|
327
418
|
else:
|
|
328
|
-
|
|
419
|
+
|
|
420
|
+
opener = 'xdg-open'
|
|
329
421
|
subprocess.call([opener, filename])
|
|
330
|
-
|
|
422
|
+
|
|
331
423
|
|
|
332
424
|
#%% File list functions
|
|
333
425
|
|
|
@@ -358,6 +450,107 @@ def read_list_from_file(filename: str) -> List[str]:
|
|
|
358
450
|
return file_list
|
|
359
451
|
|
|
360
452
|
|
|
453
|
+
def _copy_file(input_output_tuple,overwrite=True,verbose=False):
|
|
454
|
+
assert len(input_output_tuple) == 2
|
|
455
|
+
source_fn = input_output_tuple[0]
|
|
456
|
+
target_fn = input_output_tuple[1]
|
|
457
|
+
if (not overwrite) and (os.path.isfile(target_fn)):
|
|
458
|
+
if verbose:
|
|
459
|
+
print('Skipping existing file {}'.format(target_fn))
|
|
460
|
+
return
|
|
461
|
+
os.makedirs(os.path.dirname(target_fn),exist_ok=True)
|
|
462
|
+
shutil.copyfile(source_fn,target_fn)
|
|
463
|
+
|
|
464
|
+
|
|
465
|
+
def parallel_copy_files(input_file_to_output_file, max_workers=16,
|
|
466
|
+
use_threads=True, overwrite=False, verbose=False):
|
|
467
|
+
"""
|
|
468
|
+
Copy files from source to target according to the dict input_file_to_output_file.
|
|
469
|
+
"""
|
|
470
|
+
|
|
471
|
+
n_workers = min(max_workers,len(input_file_to_output_file))
|
|
472
|
+
|
|
473
|
+
# Package the dictionary as a set of 2-tuples
|
|
474
|
+
input_output_tuples = []
|
|
475
|
+
for input_fn in input_file_to_output_file:
|
|
476
|
+
input_output_tuples.append((input_fn,input_file_to_output_file[input_fn]))
|
|
477
|
+
|
|
478
|
+
if use_threads:
|
|
479
|
+
pool = ThreadPool(n_workers)
|
|
480
|
+
else:
|
|
481
|
+
pool = Pool(n_workers)
|
|
482
|
+
|
|
483
|
+
with tqdm(total=len(input_output_tuples)) as pbar:
|
|
484
|
+
for i,_ in enumerate(pool.imap_unordered(partial(_copy_file,overwrite=overwrite,verbose=verbose),
|
|
485
|
+
input_output_tuples)):
|
|
486
|
+
pbar.update()
|
|
487
|
+
|
|
488
|
+
# ...def parallel_copy_files(...)
|
|
489
|
+
|
|
490
|
+
|
|
491
|
+
def get_file_sizes(base_dir, convert_slashes=True):
|
|
492
|
+
"""
|
|
493
|
+
Get sizes recursively for all files in base_dir, returning a dict mapping
|
|
494
|
+
relative filenames to size.
|
|
495
|
+
|
|
496
|
+
TODO: merge the functionality here with parallel_get_file_sizes, which uses slightly
|
|
497
|
+
different semantics.
|
|
498
|
+
"""
|
|
499
|
+
|
|
500
|
+
relative_filenames = recursive_file_list(base_dir, convert_slashes=convert_slashes,
|
|
501
|
+
return_relative_paths=True)
|
|
502
|
+
|
|
503
|
+
fn_to_size = {}
|
|
504
|
+
for fn_relative in tqdm(relative_filenames):
|
|
505
|
+
fn_abs = os.path.join(base_dir,fn_relative)
|
|
506
|
+
fn_to_size[fn_relative] = os.path.getsize(fn_abs)
|
|
507
|
+
|
|
508
|
+
return fn_to_size
|
|
509
|
+
|
|
510
|
+
|
|
511
|
+
def _get_file_size(filename,verbose=False):
|
|
512
|
+
"""
|
|
513
|
+
Internal function for safely getting the size of a file. Returns a (filename,size)
|
|
514
|
+
tuple, where size is None if there is an error.
|
|
515
|
+
"""
|
|
516
|
+
|
|
517
|
+
try:
|
|
518
|
+
size = os.path.getsize(filename)
|
|
519
|
+
except Exception as e:
|
|
520
|
+
if verbose:
|
|
521
|
+
print('Error reading file size for {}: {}'.format(filename,str(e)))
|
|
522
|
+
size = None
|
|
523
|
+
return (filename,size)
|
|
524
|
+
|
|
525
|
+
|
|
526
|
+
def parallel_get_file_sizes(filenames, max_workers=16,
|
|
527
|
+
use_threads=True, verbose=False,
|
|
528
|
+
recursive=True):
|
|
529
|
+
"""
|
|
530
|
+
Return a dictionary mapping every file in [filenames] to the corresponding file size,
|
|
531
|
+
or None for errors. If [filenames] is a folder, will enumerate the folder (optionally recursively).
|
|
532
|
+
"""
|
|
533
|
+
|
|
534
|
+
n_workers = min(max_workers,len(filenames))
|
|
535
|
+
|
|
536
|
+
if isinstance(filenames,str) and os.path.isdir(filenames):
|
|
537
|
+
filenames = recursive_file_list(filenames,recursive=recursive,return_relative_paths=False)
|
|
538
|
+
|
|
539
|
+
if use_threads:
|
|
540
|
+
pool = ThreadPool(n_workers)
|
|
541
|
+
else:
|
|
542
|
+
pool = Pool(n_workers)
|
|
543
|
+
|
|
544
|
+
resize_results = list(tqdm(pool.imap(
|
|
545
|
+
partial(_get_file_size,verbose=verbose),filenames), total=len(filenames)))
|
|
546
|
+
|
|
547
|
+
to_return = {}
|
|
548
|
+
for r in resize_results:
|
|
549
|
+
to_return[r[0]] = r[1]
|
|
550
|
+
|
|
551
|
+
return to_return
|
|
552
|
+
|
|
553
|
+
|
|
361
554
|
#%% Zip functions
|
|
362
555
|
|
|
363
556
|
def zip_file(input_fn, output_fn=None, overwrite=False, verbose=False, compresslevel=9):
|
|
@@ -375,7 +568,7 @@ def zip_file(input_fn, output_fn=None, overwrite=False, verbose=False, compressl
|
|
|
375
568
|
return
|
|
376
569
|
|
|
377
570
|
if verbose:
|
|
378
|
-
print('Zipping {} to {}'.format(input_fn,output_fn))
|
|
571
|
+
print('Zipping {} to {} with level {}'.format(input_fn,output_fn,compresslevel))
|
|
379
572
|
|
|
380
573
|
with ZipFile(output_fn,'w',zipfile.ZIP_DEFLATED) as zipf:
|
|
381
574
|
zipf.write(input_fn,arcname=basename,compresslevel=compresslevel,
|
|
@@ -384,9 +577,37 @@ def zip_file(input_fn, output_fn=None, overwrite=False, verbose=False, compressl
|
|
|
384
577
|
return output_fn
|
|
385
578
|
|
|
386
579
|
|
|
580
|
+
def zip_files_into_single_zipfile(input_files, output_fn, arc_name_base,
|
|
581
|
+
overwrite=False, verbose=False, compresslevel=9):
|
|
582
|
+
"""
|
|
583
|
+
Zip all the files in [input_files] into [output_fn]. Archive names are relative to
|
|
584
|
+
arc_name_base.
|
|
585
|
+
"""
|
|
586
|
+
|
|
587
|
+
if not overwrite:
|
|
588
|
+
if os.path.isfile(output_fn):
|
|
589
|
+
print('Zip file {} exists, skipping'.format(output_fn))
|
|
590
|
+
return
|
|
591
|
+
|
|
592
|
+
if verbose:
|
|
593
|
+
print('Zipping {} files to {} (compression level {})'.format(
|
|
594
|
+
len(input_files),output_fn,compresslevel))
|
|
595
|
+
|
|
596
|
+
with ZipFile(output_fn,'w',zipfile.ZIP_DEFLATED) as zipf:
|
|
597
|
+
for input_fn_abs in tqdm(input_files,disable=(not verbose)):
|
|
598
|
+
input_fn_relative = os.path.relpath(input_fn_abs,arc_name_base)
|
|
599
|
+
zipf.write(input_fn_abs,
|
|
600
|
+
arcname=input_fn_relative,
|
|
601
|
+
compresslevel=compresslevel,
|
|
602
|
+
compress_type=zipfile.ZIP_DEFLATED)
|
|
603
|
+
|
|
604
|
+
return output_fn
|
|
605
|
+
|
|
606
|
+
|
|
387
607
|
def zip_folder(input_folder, output_fn=None, overwrite=False, verbose=False, compresslevel=9):
|
|
388
608
|
"""
|
|
389
|
-
Recursively zip everything in [input_folder], storing outputs as relative
|
|
609
|
+
Recursively zip everything in [input_folder] into a single zipfile, storing outputs as relative
|
|
610
|
+
paths.
|
|
390
611
|
|
|
391
612
|
Defaults to writing to [input_folder].zip
|
|
392
613
|
"""
|
|
@@ -395,15 +616,18 @@ def zip_folder(input_folder, output_fn=None, overwrite=False, verbose=False, com
|
|
|
395
616
|
output_fn = input_folder + '.zip'
|
|
396
617
|
|
|
397
618
|
if not overwrite:
|
|
398
|
-
|
|
619
|
+
if os.path.isfile(output_fn):
|
|
620
|
+
print('Zip file {} exists, skipping'.format(output_fn))
|
|
621
|
+
return
|
|
399
622
|
|
|
400
623
|
if verbose:
|
|
401
|
-
print('Zipping {} to {}'.format(
|
|
624
|
+
print('Zipping {} to {} (compression level {})'.format(
|
|
625
|
+
input_folder,output_fn,compresslevel))
|
|
402
626
|
|
|
403
627
|
relative_filenames = recursive_file_list(input_folder,return_relative_paths=True)
|
|
404
628
|
|
|
405
629
|
with ZipFile(output_fn,'w',zipfile.ZIP_DEFLATED) as zipf:
|
|
406
|
-
for input_fn_relative in relative_filenames:
|
|
630
|
+
for input_fn_relative in tqdm(relative_filenames,disable=(not verbose)):
|
|
407
631
|
input_fn_abs = os.path.join(input_folder,input_fn_relative)
|
|
408
632
|
zipf.write(input_fn_abs,
|
|
409
633
|
arcname=input_fn_relative,
|
|
@@ -413,19 +637,74 @@ def zip_folder(input_folder, output_fn=None, overwrite=False, verbose=False, com
|
|
|
413
637
|
return output_fn
|
|
414
638
|
|
|
415
639
|
|
|
416
|
-
def parallel_zip_files(input_files,max_workers=16
|
|
640
|
+
def parallel_zip_files(input_files, max_workers=16, use_threads=True, compresslevel=9,
|
|
641
|
+
overwrite=False, verbose=False):
|
|
417
642
|
"""
|
|
418
643
|
Zip one or more files to separate output files in parallel, leaving the
|
|
419
|
-
original files in place.
|
|
644
|
+
original files in place. Each file is zipped to [filename].zip.
|
|
420
645
|
"""
|
|
421
646
|
|
|
422
647
|
n_workers = min(max_workers,len(input_files))
|
|
423
|
-
|
|
648
|
+
|
|
649
|
+
if use_threads:
|
|
650
|
+
pool = ThreadPool(n_workers)
|
|
651
|
+
else:
|
|
652
|
+
pool = Pool(n_workers)
|
|
653
|
+
|
|
424
654
|
with tqdm(total=len(input_files)) as pbar:
|
|
425
|
-
for i,_ in enumerate(pool.imap_unordered(zip_file,
|
|
655
|
+
for i,_ in enumerate(pool.imap_unordered(partial(zip_file,
|
|
656
|
+
output_fn=None,overwrite=overwrite,verbose=verbose,compresslevel=compresslevel),
|
|
657
|
+
input_files)):
|
|
426
658
|
pbar.update()
|
|
427
659
|
|
|
428
660
|
|
|
661
|
+
def parallel_zip_folders(input_folders, max_workers=16, use_threads=True,
|
|
662
|
+
compresslevel=9, overwrite=False, verbose=False):
|
|
663
|
+
"""
|
|
664
|
+
Zip one or more folders to separate output files in parallel, leaving the
|
|
665
|
+
original folders in place. Each folder is zipped to [folder_name].zip.
|
|
666
|
+
"""
|
|
667
|
+
|
|
668
|
+
n_workers = min(max_workers,len(input_folders))
|
|
669
|
+
|
|
670
|
+
if use_threads:
|
|
671
|
+
pool = ThreadPool(n_workers)
|
|
672
|
+
else:
|
|
673
|
+
pool = Pool(n_workers)
|
|
674
|
+
|
|
675
|
+
with tqdm(total=len(input_folders)) as pbar:
|
|
676
|
+
for i,_ in enumerate(pool.imap_unordered(
|
|
677
|
+
partial(zip_folder,overwrite=overwrite,
|
|
678
|
+
compresslevel=compresslevel,verbose=verbose),
|
|
679
|
+
input_folders)):
|
|
680
|
+
pbar.update()
|
|
681
|
+
|
|
682
|
+
|
|
683
|
+
def zip_each_file_in_folder(folder_name,recursive=False,max_workers=16,use_threads=True,
|
|
684
|
+
compresslevel=9,overwrite=False,required_token=None,verbose=False,
|
|
685
|
+
exclude_zip=True):
|
|
686
|
+
"""
|
|
687
|
+
Zip each file in [folder_name] to its own zipfile (filename.zip), optionally recursing. To zip a whole
|
|
688
|
+
folder into a single zipfile, use zip_folder().
|
|
689
|
+
|
|
690
|
+
If required_token is not None, include only files that contain that token.
|
|
691
|
+
"""
|
|
692
|
+
|
|
693
|
+
assert os.path.isdir(folder_name), '{} is not a folder'.format(folder_name)
|
|
694
|
+
|
|
695
|
+
input_files = recursive_file_list(folder_name,recursive=recursive,return_relative_paths=False)
|
|
696
|
+
|
|
697
|
+
if required_token is not None:
|
|
698
|
+
input_files = [fn for fn in input_files if required_token in fn]
|
|
699
|
+
|
|
700
|
+
if exclude_zip:
|
|
701
|
+
input_files = [fn for fn in input_files if (not fn.endswith('.zip'))]
|
|
702
|
+
|
|
703
|
+
parallel_zip_files(input_files=input_files,max_workers=max_workers,
|
|
704
|
+
use_threads=use_threads,compresslevel=compresslevel,
|
|
705
|
+
overwrite=overwrite,verbose=verbose)
|
|
706
|
+
|
|
707
|
+
|
|
429
708
|
def unzip_file(input_file, output_folder=None):
|
|
430
709
|
"""
|
|
431
710
|
Unzip a zipfile to the specified output folder, defaulting to the same location as
|
md_utils/process_utils.py
CHANGED
|
@@ -17,14 +17,28 @@ import subprocess
|
|
|
17
17
|
|
|
18
18
|
os.environ["PYTHONUNBUFFERED"] = "1"
|
|
19
19
|
|
|
20
|
-
def execute(cmd):
|
|
20
|
+
def execute(cmd,encoding=None,errors=None,env=None,verbose=False):
|
|
21
21
|
"""
|
|
22
22
|
Run [cmd] (a single string) in a shell, yielding each line of output to the caller.
|
|
23
|
+
|
|
24
|
+
The "encoding", "errors", and "env" parameters are passed directly to subprocess.Popen().
|
|
25
|
+
|
|
26
|
+
"verbose" only impacts output about process management, it is not related to printing
|
|
27
|
+
output from the child process.
|
|
23
28
|
"""
|
|
24
|
-
|
|
29
|
+
|
|
30
|
+
if verbose:
|
|
31
|
+
if encoding is not None:
|
|
32
|
+
print('Launching child process with non-default encoding {}'.format(encoding))
|
|
33
|
+
if errors is not None:
|
|
34
|
+
print('Launching child process with non-default text error handling {}'.format(errors))
|
|
35
|
+
if env is not None:
|
|
36
|
+
print('Launching child process with non-default environment {}'.format(str(env)))
|
|
37
|
+
|
|
25
38
|
# https://stackoverflow.com/questions/4417546/constantly-print-subprocess-output-while-process-is-running
|
|
26
39
|
popen = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
|
|
27
|
-
shell=True, universal_newlines=True
|
|
40
|
+
shell=True, universal_newlines=True, encoding=encoding,
|
|
41
|
+
errors=errors, env=env)
|
|
28
42
|
for stdout_line in iter(popen.stdout.readline, ""):
|
|
29
43
|
yield stdout_line
|
|
30
44
|
popen.stdout.close()
|
|
@@ -33,22 +47,27 @@ def execute(cmd):
|
|
|
33
47
|
raise subprocess.CalledProcessError(return_code, cmd)
|
|
34
48
|
|
|
35
49
|
|
|
36
|
-
def execute_and_print(cmd,print_output=True):
|
|
50
|
+
def execute_and_print(cmd,print_output=True,encoding=None,errors=None,env=None,verbose=False):
|
|
37
51
|
"""
|
|
38
52
|
Run [cmd] (a single string) in a shell, capturing and printing output. Returns
|
|
39
53
|
a dictionary with fields "status" and "output".
|
|
54
|
+
|
|
55
|
+
The "encoding", "errors", and "env" parameters are passed directly to subprocess.Popen().
|
|
56
|
+
|
|
57
|
+
"verbose" only impacts output about process management, it is not related to printing
|
|
58
|
+
output from the child process.
|
|
40
59
|
"""
|
|
41
60
|
|
|
42
61
|
to_return = {'status':'unknown','output':''}
|
|
43
|
-
output=[]
|
|
62
|
+
output = []
|
|
44
63
|
try:
|
|
45
|
-
for s in execute(cmd):
|
|
64
|
+
for s in execute(cmd,encoding=encoding,errors=errors,env=env,verbose=verbose):
|
|
46
65
|
output.append(s)
|
|
47
66
|
if print_output:
|
|
48
67
|
print(s,end='',flush=True)
|
|
49
68
|
to_return['status'] = 0
|
|
50
69
|
except subprocess.CalledProcessError as cpe:
|
|
51
|
-
print('execute_and_print caught error: {}'.format(cpe.output))
|
|
70
|
+
print('execute_and_print caught error: {} ({})'.format(cpe.output,str(cpe)))
|
|
52
71
|
to_return['status'] = cpe.returncode
|
|
53
72
|
to_return['output'] = output
|
|
54
73
|
|