megadetector 5.0.6__py3-none-any.whl → 5.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (75)
  1. api/batch_processing/data_preparation/manage_local_batch.py +297 -202
  2. api/batch_processing/data_preparation/manage_video_batch.py +7 -2
  3. api/batch_processing/postprocessing/add_max_conf.py +1 -0
  4. api/batch_processing/postprocessing/combine_api_outputs.py +2 -2
  5. api/batch_processing/postprocessing/compare_batch_results.py +111 -61
  6. api/batch_processing/postprocessing/convert_output_format.py +24 -6
  7. api/batch_processing/postprocessing/load_api_results.py +56 -72
  8. api/batch_processing/postprocessing/md_to_labelme.py +119 -51
  9. api/batch_processing/postprocessing/merge_detections.py +30 -5
  10. api/batch_processing/postprocessing/postprocess_batch_results.py +175 -55
  11. api/batch_processing/postprocessing/remap_detection_categories.py +163 -0
  12. api/batch_processing/postprocessing/render_detection_confusion_matrix.py +628 -0
  13. api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +71 -23
  14. api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +1 -1
  15. api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +224 -76
  16. api/batch_processing/postprocessing/subset_json_detector_output.py +132 -5
  17. api/batch_processing/postprocessing/top_folders_to_bottom.py +1 -1
  18. classification/prepare_classification_script.py +191 -191
  19. data_management/cct_json_utils.py +7 -2
  20. data_management/coco_to_labelme.py +263 -0
  21. data_management/coco_to_yolo.py +72 -48
  22. data_management/databases/integrity_check_json_db.py +75 -64
  23. data_management/databases/subset_json_db.py +1 -1
  24. data_management/generate_crops_from_cct.py +1 -1
  25. data_management/get_image_sizes.py +44 -26
  26. data_management/importers/animl_results_to_md_results.py +3 -5
  27. data_management/importers/noaa_seals_2019.py +2 -2
  28. data_management/importers/zamba_results_to_md_results.py +2 -2
  29. data_management/labelme_to_coco.py +264 -127
  30. data_management/labelme_to_yolo.py +96 -53
  31. data_management/lila/create_lila_blank_set.py +557 -0
  32. data_management/lila/create_lila_test_set.py +2 -1
  33. data_management/lila/create_links_to_md_results_files.py +1 -1
  34. data_management/lila/download_lila_subset.py +138 -45
  35. data_management/lila/generate_lila_per_image_labels.py +23 -14
  36. data_management/lila/get_lila_annotation_counts.py +16 -10
  37. data_management/lila/lila_common.py +15 -42
  38. data_management/lila/test_lila_metadata_urls.py +116 -0
  39. data_management/read_exif.py +65 -16
  40. data_management/remap_coco_categories.py +84 -0
  41. data_management/resize_coco_dataset.py +14 -31
  42. data_management/wi_download_csv_to_coco.py +239 -0
  43. data_management/yolo_output_to_md_output.py +40 -13
  44. data_management/yolo_to_coco.py +313 -100
  45. detection/process_video.py +36 -14
  46. detection/pytorch_detector.py +1 -1
  47. detection/run_detector.py +73 -18
  48. detection/run_detector_batch.py +116 -27
  49. detection/run_inference_with_yolov5_val.py +135 -27
  50. detection/run_tiled_inference.py +153 -43
  51. detection/tf_detector.py +2 -1
  52. detection/video_utils.py +4 -2
  53. md_utils/ct_utils.py +101 -6
  54. md_utils/md_tests.py +264 -17
  55. md_utils/path_utils.py +326 -47
  56. md_utils/process_utils.py +26 -7
  57. md_utils/split_locations_into_train_val.py +215 -0
  58. md_utils/string_utils.py +10 -0
  59. md_utils/url_utils.py +66 -3
  60. md_utils/write_html_image_list.py +12 -2
  61. md_visualization/visualization_utils.py +380 -74
  62. md_visualization/visualize_db.py +41 -10
  63. md_visualization/visualize_detector_output.py +185 -104
  64. {megadetector-5.0.6.dist-info → megadetector-5.0.8.dist-info}/METADATA +11 -13
  65. {megadetector-5.0.6.dist-info → megadetector-5.0.8.dist-info}/RECORD +74 -67
  66. {megadetector-5.0.6.dist-info → megadetector-5.0.8.dist-info}/WHEEL +1 -1
  67. taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +1 -1
  68. taxonomy_mapping/map_new_lila_datasets.py +43 -39
  69. taxonomy_mapping/prepare_lila_taxonomy_release.py +5 -2
  70. taxonomy_mapping/preview_lila_taxonomy.py +27 -27
  71. taxonomy_mapping/species_lookup.py +33 -13
  72. taxonomy_mapping/taxonomy_csv_checker.py +7 -5
  73. md_visualization/visualize_megadb.py +0 -183
  74. {megadetector-5.0.6.dist-info → megadetector-5.0.8.dist-info}/LICENSE +0 -0
  75. {megadetector-5.0.6.dist-info → megadetector-5.0.8.dist-info}/top_level.txt +0 -0
md_utils/path_utils.py CHANGED
@@ -12,16 +12,23 @@
 import glob
 import ntpath
 import os
+import sys
+import platform
 import posixpath
 import string
 import json
+import shutil
 import unicodedata
 import zipfile
+import webbrowser
+import subprocess
+import re
 
 from zipfile import ZipFile
 from datetime import datetime
 from typing import Container, Iterable, List, Optional, Tuple, Sequence
-from multiprocessing.pool import ThreadPool
+from multiprocessing.pool import Pool, ThreadPool
+from functools import partial
 from tqdm import tqdm
 
 IMG_EXTENSIONS = ('.jpg', '.jpeg', '.gif', '.png', '.tif', '.tiff', '.bmp')
@@ -34,31 +41,53 @@ CHAR_LIMIT = 255
 
 #%% General path functions
 
-def recursive_file_list(base_dir, convert_slashes=True, return_relative_paths=False):
-    """
+def recursive_file_list(base_dir, convert_slashes=True,
+                        return_relative_paths=False, sort_files=True,
+                        recursive=True):
+    r"""
     Enumerate files (not directories) in [base_dir], optionally converting
     \ to /
     """
 
+    assert os.path.isdir(base_dir), '{} is not a folder'.format(base_dir)
+
     all_files = []
 
-    for root, _, filenames in os.walk(base_dir):
-        for filename in filenames:
-            full_path = os.path.join(root, filename)
-            all_files.append(full_path)
-
+    if recursive:
+        for root, _, filenames in os.walk(base_dir):
+            for filename in filenames:
+                full_path = os.path.join(root, filename)
+                all_files.append(full_path)
+    else:
+        all_files_relative = os.listdir(base_dir)
+        all_files = [os.path.join(base_dir,fn) for fn in all_files_relative]
+        all_files = [fn for fn in all_files if os.path.isfile(fn)]
+
     if return_relative_paths:
         all_files = [os.path.relpath(fn,base_dir) for fn in all_files]
 
     if convert_slashes:
         all_files = [fn.replace('\\', '/') for fn in all_files]
+
+    if sort_files:
+        all_files = sorted(all_files)
 
-    all_files = sorted(all_files)
     return all_files
 
 
-def split_path(path: str) -> List[str]:
+def file_list(base_dir, convert_slashes=True, return_relative_paths=False, sort_files=True,
+              recursive=False):
     """
+    Trivial wrapper for recursive_file_list, which was a poor function name choice at the time;
+    it doesn't really make sense to have a "recursive" option in a function called "recursive_file_list".
+    """
+
+    return recursive_file_list(base_dir,convert_slashes,return_relative_paths,sort_files,
+                               recursive=recursive)
+
+
+def split_path(path: str) -> List[str]:
+    r"""
     Splits [path] into all its constituent tokens.
 
     Non-recursive version of:
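The hunk above adds sort_files and recursive options to recursive_file_list, plus the file_list wrapper. A minimal usage sketch (the folder path below is hypothetical):

    from md_utils.path_utils import file_list, recursive_file_list

    base_dir = '/data/camera-traps'

    # All files under base_dir, recursively, sorted, with '\' converted to '/'
    all_files = recursive_file_list(base_dir)

    # Only the files directly in base_dir, in enumeration order rather than sorted order
    top_level_files = file_list(base_dir, sort_files=False)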
@@ -88,7 +117,7 @@ def split_path(path: str) -> List[str]:
 
 
 def fileparts(path: str) -> Tuple[str, str, str]:
-    """
+    r"""
     Breaks down a path into the directory path, filename, and extension.
 
     Note that the '.' lives with the extension, and separators are removed.
@@ -187,7 +216,8 @@ def safe_create_link(link_exists,link_new):
     it.
 
     Errors if link_new already exists but it's not a link.
-    """
+    """
+
     if os.path.exists(link_new) or os.path.islink(link_new):
         assert os.path.islink(link_new)
         if not os.readlink(link_new) == link_exists:
@@ -197,23 +227,6 @@ def safe_create_link(link_exists,link_new):
         os.symlink(link_exists,link_new)
 
 
-def get_file_sizes(base_dir, convert_slashes=True):
-    """
-    Get sizes recursively for all files in base_dir, returning a dict mapping
-    relative filenames to size.
-    """
-
-    relative_filenames = recursive_file_list(base_dir, convert_slashes=convert_slashes,
-                                             return_relative_paths=True)
-
-    fn_to_size = {}
-    for fn_relative in tqdm(relative_filenames):
-        fn_abs = os.path.join(base_dir,fn_relative)
-        fn_to_size[fn_relative] = os.path.getsize(fn_abs)
-
-    return fn_to_size
-
-
 #%% Image-related path functions
 
 def is_image_file(s: str, img_extensions: Container[str] = IMG_EXTENSIONS
@@ -240,14 +253,17 @@ def find_image_strings(strings: Iterable[str]) -> List[str]:
 
 
 def find_images(dirname: str, recursive: bool = False,
-                return_relative_paths: bool = False, convert_slashes: bool = False) -> List[str]:
+                return_relative_paths: bool = False,
+                convert_slashes: bool = False) -> List[str]:
     """
     Finds all files in a directory that look like image file names. Returns
     absolute paths unless return_relative_paths is set. Uses the OS-native
-    path separator unless convert_slahes is set, in which case will always
+    path separator unless convert_slashes is set, in which case will always
     use '/'.
     """
 
+    assert os.path.isdir(dirname), '{} is not a folder'.format(dirname)
+
     if recursive:
         strings = glob.glob(os.path.join(dirname, '**', '*.*'), recursive=True)
     else:
@@ -270,11 +286,11 @@ def find_images(dirname: str, recursive: bool = False,
 
 def clean_filename(filename: str, allow_list: str = VALID_FILENAME_CHARS,
                    char_limit: int = CHAR_LIMIT, force_lower: bool = False) -> str:
-    """
+    r"""
     Removes non-ASCII and other invalid filename characters (on any
     reasonable OS) from a filename, then trims to a maximum length.
 
-    Does not allow :\/, use clean_path if you want to preserve those.
+    Does not allow :\/ by default, use clean_path if you want to preserve those.
 
     Adapted from
     https://gist.github.com/wassname/1393c4a57cfcbf03641dbc31886123b8
@@ -319,15 +335,91 @@ def flatten_path(pathname: str, separator_chars: str = SEPARATOR_CHARS) -> str:
 
 #%% Platform-independent way to open files in their associated application
 
-import sys,subprocess
+def environment_is_wsl():
+    """
+    Returns True if we're running in WSL
+    """
+
+    if sys.platform not in ('linux','posix'):
+        return False
+    platform_string = ' '.join(platform.uname()).lower()
+    return 'microsoft' in platform_string and 'wsl' in platform_string
+
 
-def open_file(filename):
-    if sys.platform == "win32":
+def wsl_path_to_windows_path(filename):
+    """
+    Converts a WSL path to a Windows path, or returns None if that's not possible. E.g.
+    converts:
+
+    /mnt/e/a/b/c
+
+    ...to:
+
+    e:\a\b\c
+    """
+
+    result = subprocess.run(['wslpath', '-w', filename], text=True, capture_output=True)
+    if result.returncode != 0:
+        print('Could not convert path {} from WSL to Windows'.format(filename))
+        return None
+    return result.stdout.strip()
+
+
+def open_file(filename, attempt_to_open_in_wsl_host=False, browser_name=None):
+    """
+    Opens [filename] in the default OS file handler for this file type.
+
+    If attempt_to_open_in_wsl_host is True, and we're in WSL, attempts to open
+    [filename] in the Windows host environment.
+
+    If browser_name is not None, uses the webbrowser module to open the filename
+    in the specified browser; see https://docs.python.org/3/library/webbrowser.html
+    for supported browsers. Falls back to the default file handler if webbrowser.open()
+    fails. In this case, attempt_to_open_in_wsl_host is ignored unless webbrowser.open() fails.
+
+    If browser_name is 'default', use the system default. This is different from the
+    parameter to webbrowser.get(), where None implies the system default.
+    """
+
+    if browser_name is not None:
+        if browser_name == 'chrome':
+            browser_name = 'google-chrome'
+        elif browser_name == 'default':
+            browser_name = None
+        try:
+            result = webbrowser.get(using=browser_name).open(filename)
+        except Exception:
+            result = False
+        if result:
+            return
+
+    if sys.platform == 'win32':
+
         os.startfile(filename)
+
+    elif sys.platform == 'darwin':
+
+        opener = 'open'
+        subprocess.call([opener, filename])
+
+    elif attempt_to_open_in_wsl_host and environment_is_wsl():
+
+        windows_path = wsl_path_to_windows_path(filename)
+
+        # Fall back to xdg-open
+        if windows_path is None:
+            subprocess.call(['xdg-open', filename])
+
+        if os.path.isdir(filename):
+            subprocess.run(["explorer.exe", windows_path])
+        else:
+            os.system("cmd.exe /C start %s" % (re.escape(windows_path)))
+
     else:
-        opener = "open" if sys.platform == "darwin" else "xdg-open"
+
+        opener = 'xdg-open'
         subprocess.call([opener, filename])
-
+
 
 #%% File list functions
 
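The new open_file signature above supports opening in a named browser and handing files off to a Windows host from WSL. A minimal sketch (the file paths below are hypothetical):

    from md_utils.path_utils import open_file

    # Open an HTML report in Chrome, falling back to the default handler
    open_file('/home/user/results/index.html', browser_name='chrome')

    # From within WSL, open the file with the Windows host's default handler
    open_file('/mnt/e/images/demo.jpg', attempt_to_open_in_wsl_host=True)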
@@ -358,6 +450,107 @@ def read_list_from_file(filename: str) -> List[str]:
     return file_list
 
 
+def _copy_file(input_output_tuple,overwrite=True,verbose=False):
+    assert len(input_output_tuple) == 2
+    source_fn = input_output_tuple[0]
+    target_fn = input_output_tuple[1]
+    if (not overwrite) and (os.path.isfile(target_fn)):
+        if verbose:
+            print('Skipping existing file {}'.format(target_fn))
+        return
+    os.makedirs(os.path.dirname(target_fn),exist_ok=True)
+    shutil.copyfile(source_fn,target_fn)
+
+
+def parallel_copy_files(input_file_to_output_file, max_workers=16,
+                        use_threads=True, overwrite=False, verbose=False):
+    """
+    Copy files from source to target according to the dict input_file_to_output_file.
+    """
+
+    n_workers = min(max_workers,len(input_file_to_output_file))
+
+    # Package the dictionary as a set of 2-tuples
+    input_output_tuples = []
+    for input_fn in input_file_to_output_file:
+        input_output_tuples.append((input_fn,input_file_to_output_file[input_fn]))
+
+    if use_threads:
+        pool = ThreadPool(n_workers)
+    else:
+        pool = Pool(n_workers)
+
+    with tqdm(total=len(input_output_tuples)) as pbar:
+        for i,_ in enumerate(pool.imap_unordered(partial(_copy_file,overwrite=overwrite,verbose=verbose),
+                                                 input_output_tuples)):
+            pbar.update()
+
+# ...def parallel_copy_files(...)
+
+
+def get_file_sizes(base_dir, convert_slashes=True):
+    """
+    Get sizes recursively for all files in base_dir, returning a dict mapping
+    relative filenames to size.
+
+    TODO: merge the functionality here with parallel_get_file_sizes, which uses slightly
+    different semantics.
+    """
+
+    relative_filenames = recursive_file_list(base_dir, convert_slashes=convert_slashes,
+                                             return_relative_paths=True)
+
+    fn_to_size = {}
+    for fn_relative in tqdm(relative_filenames):
+        fn_abs = os.path.join(base_dir,fn_relative)
+        fn_to_size[fn_relative] = os.path.getsize(fn_abs)
+
+    return fn_to_size
+
+
+def _get_file_size(filename,verbose=False):
+    """
+    Internal function for safely getting the size of a file. Returns a (filename,size)
+    tuple, where size is None if there is an error.
+    """
+
+    try:
+        size = os.path.getsize(filename)
+    except Exception as e:
+        if verbose:
+            print('Error reading file size for {}: {}'.format(filename,str(e)))
+        size = None
+    return (filename,size)
+
+
+def parallel_get_file_sizes(filenames, max_workers=16,
+                            use_threads=True, verbose=False,
+                            recursive=True):
+    """
+    Return a dictionary mapping every file in [filenames] to the corresponding file size,
+    or None for errors. If [filenames] is a folder, will enumerate the folder (optionally recursively).
+    """
+
+    n_workers = min(max_workers,len(filenames))
+
+    if isinstance(filenames,str) and os.path.isdir(filenames):
+        filenames = recursive_file_list(filenames,recursive=recursive,return_relative_paths=False)
+
+    if use_threads:
+        pool = ThreadPool(n_workers)
+    else:
+        pool = Pool(n_workers)
+
+    resize_results = list(tqdm(pool.imap(
+        partial(_get_file_size,verbose=verbose),filenames), total=len(filenames)))
+
+    to_return = {}
+    for r in resize_results:
+        to_return[r[0]] = r[1]
+
+    return to_return
+
+
 #%% Zip functions
 
 def zip_file(input_fn, output_fn=None, overwrite=False, verbose=False, compresslevel=9):
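The hunk above adds pool-based file copying and size enumeration. A minimal sketch of the new parallel_copy_files and parallel_get_file_sizes (all paths below are hypothetical):

    from md_utils.path_utils import parallel_copy_files, parallel_get_file_sizes

    # Hypothetical source-to-target mapping
    copy_map = {
        '/data/in/a.jpg': '/data/out/a.jpg',
        '/data/in/b.jpg': '/data/out/b.jpg'
    }
    parallel_copy_files(copy_map, max_workers=8, use_threads=True, overwrite=False)

    # Map every file under a folder to its size; values are None where stat fails
    size_map = parallel_get_file_sizes('/data/out', max_workers=8)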
@@ -375,7 +568,7 @@ def zip_file(input_fn, output_fn=None, overwrite=False, verbose=False, compressl
         return
 
     if verbose:
-        print('Zipping {} to {}'.format(input_fn,output_fn))
+        print('Zipping {} to {} with level {}'.format(input_fn,output_fn,compresslevel))
 
     with ZipFile(output_fn,'w',zipfile.ZIP_DEFLATED) as zipf:
         zipf.write(input_fn,arcname=basename,compresslevel=compresslevel,
@@ -384,9 +577,37 @@ def zip_file(input_fn, output_fn=None, overwrite=False, verbose=False, compressl
     return output_fn
 
 
+def zip_files_into_single_zipfile(input_files, output_fn, arc_name_base,
+                                  overwrite=False, verbose=False, compresslevel=9):
+    """
+    Zip all the files in [input_files] into [output_fn]. Archive names are relative to
+    arc_name_base.
+    """
+
+    if not overwrite:
+        if os.path.isfile(output_fn):
+            print('Zip file {} exists, skipping'.format(output_fn))
+            return
+
+    if verbose:
+        print('Zipping {} files to {} (compression level {})'.format(
+            len(input_files),output_fn,compresslevel))
+
+    with ZipFile(output_fn,'w',zipfile.ZIP_DEFLATED) as zipf:
+        for input_fn_abs in tqdm(input_files,disable=(not verbose)):
+            input_fn_relative = os.path.relpath(input_fn_abs,arc_name_base)
+            zipf.write(input_fn_abs,
+                       arcname=input_fn_relative,
+                       compresslevel=compresslevel,
+                       compress_type=zipfile.ZIP_DEFLATED)
+
+    return output_fn
+
+
 def zip_folder(input_folder, output_fn=None, overwrite=False, verbose=False, compresslevel=9):
     """
-    Recursively zip everything in [input_folder], storing outputs as relative paths.
+    Recursively zip everything in [input_folder] into a single zipfile, storing outputs as relative
+    paths.
 
     Defaults to writing to [input_folder].zip
     """
@@ -395,15 +616,18 @@ def zip_folder(input_folder, output_fn=None, overwrite=False, verbose=False, com
         output_fn = input_folder + '.zip'
 
     if not overwrite:
-        assert not os.path.isfile(output_fn), 'Zip file {} exists'.format(output_fn)
+        if os.path.isfile(output_fn):
+            print('Zip file {} exists, skipping'.format(output_fn))
+            return
 
     if verbose:
-        print('Zipping {} to {}'.format(input_folder,output_fn))
+        print('Zipping {} to {} (compression level {})'.format(
+            input_folder,output_fn,compresslevel))
 
     relative_filenames = recursive_file_list(input_folder,return_relative_paths=True)
 
     with ZipFile(output_fn,'w',zipfile.ZIP_DEFLATED) as zipf:
-        for input_fn_relative in relative_filenames:
+        for input_fn_relative in tqdm(relative_filenames,disable=(not verbose)):
             input_fn_abs = os.path.join(input_folder,input_fn_relative)
             zipf.write(input_fn_abs,
                        arcname=input_fn_relative,
@@ -413,19 +637,74 @@ def zip_folder(input_folder, output_fn=None, overwrite=False, verbose=False, com
     return output_fn
 
 
-def parallel_zip_files(input_files,max_workers=16):
+def parallel_zip_files(input_files, max_workers=16, use_threads=True, compresslevel=9,
+                       overwrite=False, verbose=False):
     """
     Zip one or more files to separate output files in parallel, leaving the
-    original files in place.
+    original files in place. Each file is zipped to [filename].zip.
     """
 
     n_workers = min(max_workers,len(input_files))
-    pool = ThreadPool(n_workers)
+
+    if use_threads:
+        pool = ThreadPool(n_workers)
+    else:
+        pool = Pool(n_workers)
+
     with tqdm(total=len(input_files)) as pbar:
-        for i,_ in enumerate(pool.imap_unordered(zip_file,input_files)):
+        for i,_ in enumerate(pool.imap_unordered(partial(zip_file,
+            output_fn=None,overwrite=overwrite,verbose=verbose,compresslevel=compresslevel),
+            input_files)):
             pbar.update()
 
 
+def parallel_zip_folders(input_folders, max_workers=16, use_threads=True,
+                         compresslevel=9, overwrite=False, verbose=False):
+    """
+    Zip one or more folders to separate output files in parallel, leaving the
+    original folders in place. Each folder is zipped to [folder_name].zip.
+    """
+
+    n_workers = min(max_workers,len(input_folders))
+
+    if use_threads:
+        pool = ThreadPool(n_workers)
+    else:
+        pool = Pool(n_workers)
+
+    with tqdm(total=len(input_folders)) as pbar:
+        for i,_ in enumerate(pool.imap_unordered(
+            partial(zip_folder,overwrite=overwrite,
+                    compresslevel=compresslevel,verbose=verbose),
+            input_folders)):
+            pbar.update()
+
+
+def zip_each_file_in_folder(folder_name,recursive=False,max_workers=16,use_threads=True,
+                            compresslevel=9,overwrite=False,required_token=None,verbose=False,
+                            exclude_zip=True):
+    """
+    Zip each file in [folder_name] to its own zipfile (filename.zip), optionally recursing. To zip a whole
+    folder into a single zipfile, use zip_folder().
+
+    If required_token is not None, include only files that contain that token.
+    """
+
+    assert os.path.isdir(folder_name), '{} is not a folder'.format(folder_name)
+
+    input_files = recursive_file_list(folder_name,recursive=recursive,return_relative_paths=False)
+
+    if required_token is not None:
+        input_files = [fn for fn in input_files if required_token in fn]
+
+    if exclude_zip:
+        input_files = [fn for fn in input_files if (not fn.endswith('.zip'))]
+
+    parallel_zip_files(input_files=input_files,max_workers=max_workers,
+                       use_threads=use_threads,compresslevel=compresslevel,
+                       overwrite=overwrite,verbose=verbose)
+
+
 def unzip_file(input_file, output_folder=None):
     """
     Unzip a zipfile to the specified output folder, defaulting to the same location as
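The zip-related hunks above add parallel zipping of individual files and folders. A minimal sketch of the new parallel_zip_folders and zip_each_file_in_folder (the paths below are hypothetical):

    from md_utils.path_utils import parallel_zip_folders, zip_each_file_in_folder

    # Each folder is zipped in place to [folder_name].zip
    parallel_zip_folders(['/data/site-a', '/data/site-b'], max_workers=4, verbose=True)

    # Zip each .json file under a folder to its own [filename].zip
    zip_each_file_in_folder('/data/results', required_token='.json', recursive=True)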
md_utils/process_utils.py CHANGED
@@ -17,14 +17,28 @@ import subprocess
 
 os.environ["PYTHONUNBUFFERED"] = "1"
 
-def execute(cmd):
+def execute(cmd,encoding=None,errors=None,env=None,verbose=False):
     """
     Run [cmd] (a single string) in a shell, yielding each line of output to the caller.
+
+    The "encoding", "errors", and "env" parameters are passed directly to subprocess.Popen().
+
+    "verbose" only impacts output about process management; it is not related to printing
+    output from the child process.
     """
-
+
+    if verbose:
+        if encoding is not None:
+            print('Launching child process with non-default encoding {}'.format(encoding))
+        if errors is not None:
+            print('Launching child process with non-default text error handling {}'.format(errors))
+        if env is not None:
+            print('Launching child process with non-default environment {}'.format(str(env)))
+
     # https://stackoverflow.com/questions/4417546/constantly-print-subprocess-output-while-process-is-running
     popen = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
-                             shell=True, universal_newlines=True)
+                             shell=True, universal_newlines=True, encoding=encoding,
+                             errors=errors, env=env)
     for stdout_line in iter(popen.stdout.readline, ""):
         yield stdout_line
     popen.stdout.close()
@@ -33,22 +47,27 @@ def execute(cmd):
         raise subprocess.CalledProcessError(return_code, cmd)
 
 
-def execute_and_print(cmd,print_output=True):
+def execute_and_print(cmd,print_output=True,encoding=None,errors=None,env=None,verbose=False):
     """
     Run [cmd] (a single string) in a shell, capturing and printing output. Returns
     a dictionary with fields "status" and "output".
+
+    The "encoding", "errors", and "env" parameters are passed directly to subprocess.Popen().
+
+    "verbose" only impacts output about process management; it is not related to printing
+    output from the child process.
     """
 
     to_return = {'status':'unknown','output':''}
-    output=[]
+    output = []
     try:
-        for s in execute(cmd):
+        for s in execute(cmd,encoding=encoding,errors=errors,env=env,verbose=verbose):
             output.append(s)
             if print_output:
                 print(s,end='',flush=True)
         to_return['status'] = 0
     except subprocess.CalledProcessError as cpe:
-        print('execute_and_print caught error: {}'.format(cpe.output))
+        print('execute_and_print caught error: {} ({})'.format(cpe.output,str(cpe)))
         to_return['status'] = cpe.returncode
         to_return['output'] = output