megadetector 5.0.7__py3-none-any.whl → 5.0.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (191)
  1. api/__init__.py +0 -0
  2. api/batch_processing/__init__.py +0 -0
  3. api/batch_processing/api_core/__init__.py +0 -0
  4. api/batch_processing/api_core/batch_service/__init__.py +0 -0
  5. api/batch_processing/api_core/batch_service/score.py +0 -1
  6. api/batch_processing/api_core/server_job_status_table.py +0 -1
  7. api/batch_processing/api_core_support/__init__.py +0 -0
  8. api/batch_processing/api_core_support/aggregate_results_manually.py +0 -1
  9. api/batch_processing/api_support/__init__.py +0 -0
  10. api/batch_processing/api_support/summarize_daily_activity.py +0 -1
  11. api/batch_processing/data_preparation/__init__.py +0 -0
  12. api/batch_processing/data_preparation/manage_local_batch.py +93 -79
  13. api/batch_processing/data_preparation/manage_video_batch.py +8 -8
  14. api/batch_processing/integration/digiKam/xmp_integration.py +0 -1
  15. api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +0 -1
  16. api/batch_processing/postprocessing/__init__.py +0 -0
  17. api/batch_processing/postprocessing/add_max_conf.py +12 -12
  18. api/batch_processing/postprocessing/categorize_detections_by_size.py +32 -14
  19. api/batch_processing/postprocessing/combine_api_outputs.py +69 -55
  20. api/batch_processing/postprocessing/compare_batch_results.py +114 -44
  21. api/batch_processing/postprocessing/convert_output_format.py +62 -19
  22. api/batch_processing/postprocessing/load_api_results.py +17 -20
  23. api/batch_processing/postprocessing/md_to_coco.py +31 -21
  24. api/batch_processing/postprocessing/md_to_labelme.py +165 -68
  25. api/batch_processing/postprocessing/merge_detections.py +40 -15
  26. api/batch_processing/postprocessing/postprocess_batch_results.py +270 -186
  27. api/batch_processing/postprocessing/remap_detection_categories.py +170 -0
  28. api/batch_processing/postprocessing/render_detection_confusion_matrix.py +75 -39
  29. api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +53 -44
  30. api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +25 -14
  31. api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +244 -160
  32. api/batch_processing/postprocessing/separate_detections_into_folders.py +159 -114
  33. api/batch_processing/postprocessing/subset_json_detector_output.py +146 -169
  34. api/batch_processing/postprocessing/top_folders_to_bottom.py +77 -43
  35. api/synchronous/__init__.py +0 -0
  36. api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
  37. api/synchronous/api_core/animal_detection_api/api_backend.py +0 -2
  38. api/synchronous/api_core/animal_detection_api/api_frontend.py +266 -268
  39. api/synchronous/api_core/animal_detection_api/config.py +35 -35
  40. api/synchronous/api_core/tests/__init__.py +0 -0
  41. api/synchronous/api_core/tests/load_test.py +109 -109
  42. classification/__init__.py +0 -0
  43. classification/aggregate_classifier_probs.py +21 -24
  44. classification/analyze_failed_images.py +11 -13
  45. classification/cache_batchapi_outputs.py +51 -51
  46. classification/create_classification_dataset.py +69 -68
  47. classification/crop_detections.py +54 -53
  48. classification/csv_to_json.py +97 -100
  49. classification/detect_and_crop.py +105 -105
  50. classification/evaluate_model.py +43 -42
  51. classification/identify_mislabeled_candidates.py +47 -46
  52. classification/json_to_azcopy_list.py +10 -10
  53. classification/json_validator.py +72 -71
  54. classification/map_classification_categories.py +44 -43
  55. classification/merge_classification_detection_output.py +68 -68
  56. classification/prepare_classification_script.py +157 -154
  57. classification/prepare_classification_script_mc.py +228 -228
  58. classification/run_classifier.py +27 -26
  59. classification/save_mislabeled.py +30 -30
  60. classification/train_classifier.py +20 -20
  61. classification/train_classifier_tf.py +21 -22
  62. classification/train_utils.py +10 -10
  63. data_management/__init__.py +0 -0
  64. data_management/annotations/__init__.py +0 -0
  65. data_management/annotations/annotation_constants.py +18 -31
  66. data_management/camtrap_dp_to_coco.py +238 -0
  67. data_management/cct_json_utils.py +107 -59
  68. data_management/cct_to_md.py +176 -158
  69. data_management/cct_to_wi.py +247 -219
  70. data_management/coco_to_labelme.py +272 -0
  71. data_management/coco_to_yolo.py +86 -62
  72. data_management/databases/__init__.py +0 -0
  73. data_management/databases/add_width_and_height_to_db.py +20 -16
  74. data_management/databases/combine_coco_camera_traps_files.py +35 -31
  75. data_management/databases/integrity_check_json_db.py +130 -83
  76. data_management/databases/subset_json_db.py +25 -16
  77. data_management/generate_crops_from_cct.py +27 -45
  78. data_management/get_image_sizes.py +188 -144
  79. data_management/importers/add_nacti_sizes.py +8 -8
  80. data_management/importers/add_timestamps_to_icct.py +78 -78
  81. data_management/importers/animl_results_to_md_results.py +158 -160
  82. data_management/importers/auckland_doc_test_to_json.py +9 -9
  83. data_management/importers/auckland_doc_to_json.py +8 -8
  84. data_management/importers/awc_to_json.py +7 -7
  85. data_management/importers/bellevue_to_json.py +15 -15
  86. data_management/importers/cacophony-thermal-importer.py +13 -13
  87. data_management/importers/carrizo_shrubfree_2018.py +8 -8
  88. data_management/importers/carrizo_trail_cam_2017.py +8 -8
  89. data_management/importers/cct_field_adjustments.py +9 -9
  90. data_management/importers/channel_islands_to_cct.py +10 -10
  91. data_management/importers/eMammal/copy_and_unzip_emammal.py +1 -0
  92. data_management/importers/ena24_to_json.py +7 -7
  93. data_management/importers/filenames_to_json.py +8 -8
  94. data_management/importers/helena_to_cct.py +7 -7
  95. data_management/importers/idaho-camera-traps.py +7 -7
  96. data_management/importers/idfg_iwildcam_lila_prep.py +10 -10
  97. data_management/importers/jb_csv_to_json.py +9 -9
  98. data_management/importers/mcgill_to_json.py +8 -8
  99. data_management/importers/missouri_to_json.py +18 -18
  100. data_management/importers/nacti_fieldname_adjustments.py +10 -10
  101. data_management/importers/noaa_seals_2019.py +8 -8
  102. data_management/importers/pc_to_json.py +7 -7
  103. data_management/importers/plot_wni_giraffes.py +7 -7
  104. data_management/importers/prepare-noaa-fish-data-for-lila.py +359 -359
  105. data_management/importers/prepare_zsl_imerit.py +7 -7
  106. data_management/importers/rspb_to_json.py +8 -8
  107. data_management/importers/save_the_elephants_survey_A.py +8 -8
  108. data_management/importers/save_the_elephants_survey_B.py +9 -9
  109. data_management/importers/snapshot_safari_importer.py +26 -26
  110. data_management/importers/snapshot_safari_importer_reprise.py +665 -665
  111. data_management/importers/snapshot_serengeti_lila.py +14 -14
  112. data_management/importers/sulross_get_exif.py +8 -9
  113. data_management/importers/timelapse_csv_set_to_json.py +11 -11
  114. data_management/importers/ubc_to_json.py +13 -13
  115. data_management/importers/umn_to_json.py +7 -7
  116. data_management/importers/wellington_to_json.py +8 -8
  117. data_management/importers/wi_to_json.py +9 -9
  118. data_management/importers/zamba_results_to_md_results.py +181 -181
  119. data_management/labelme_to_coco.py +309 -159
  120. data_management/labelme_to_yolo.py +103 -60
  121. data_management/lila/__init__.py +0 -0
  122. data_management/lila/add_locations_to_island_camera_traps.py +9 -9
  123. data_management/lila/add_locations_to_nacti.py +147 -147
  124. data_management/lila/create_lila_blank_set.py +114 -31
  125. data_management/lila/create_lila_test_set.py +8 -8
  126. data_management/lila/create_links_to_md_results_files.py +106 -106
  127. data_management/lila/download_lila_subset.py +92 -90
  128. data_management/lila/generate_lila_per_image_labels.py +56 -43
  129. data_management/lila/get_lila_annotation_counts.py +18 -15
  130. data_management/lila/get_lila_image_counts.py +11 -11
  131. data_management/lila/lila_common.py +103 -70
  132. data_management/lila/test_lila_metadata_urls.py +132 -116
  133. data_management/ocr_tools.py +173 -128
  134. data_management/read_exif.py +161 -99
  135. data_management/remap_coco_categories.py +84 -0
  136. data_management/remove_exif.py +58 -62
  137. data_management/resize_coco_dataset.py +32 -44
  138. data_management/wi_download_csv_to_coco.py +246 -0
  139. data_management/yolo_output_to_md_output.py +86 -73
  140. data_management/yolo_to_coco.py +535 -95
  141. detection/__init__.py +0 -0
  142. detection/detector_training/__init__.py +0 -0
  143. detection/process_video.py +85 -33
  144. detection/pytorch_detector.py +43 -25
  145. detection/run_detector.py +157 -72
  146. detection/run_detector_batch.py +189 -114
  147. detection/run_inference_with_yolov5_val.py +118 -51
  148. detection/run_tiled_inference.py +113 -42
  149. detection/tf_detector.py +51 -28
  150. detection/video_utils.py +606 -521
  151. docs/source/conf.py +43 -0
  152. md_utils/__init__.py +0 -0
  153. md_utils/azure_utils.py +9 -9
  154. md_utils/ct_utils.py +249 -70
  155. md_utils/directory_listing.py +59 -64
  156. md_utils/md_tests.py +968 -862
  157. md_utils/path_utils.py +655 -155
  158. md_utils/process_utils.py +157 -133
  159. md_utils/sas_blob_utils.py +20 -20
  160. md_utils/split_locations_into_train_val.py +45 -32
  161. md_utils/string_utils.py +33 -10
  162. md_utils/url_utils.py +208 -27
  163. md_utils/write_html_image_list.py +51 -35
  164. md_visualization/__init__.py +0 -0
  165. md_visualization/plot_utils.py +102 -109
  166. md_visualization/render_images_with_thumbnails.py +34 -34
  167. md_visualization/visualization_utils.py +908 -311
  168. md_visualization/visualize_db.py +109 -58
  169. md_visualization/visualize_detector_output.py +61 -42
  170. {megadetector-5.0.7.dist-info → megadetector-5.0.9.dist-info}/METADATA +21 -17
  171. megadetector-5.0.9.dist-info/RECORD +224 -0
  172. {megadetector-5.0.7.dist-info → megadetector-5.0.9.dist-info}/WHEEL +1 -1
  173. {megadetector-5.0.7.dist-info → megadetector-5.0.9.dist-info}/top_level.txt +1 -0
  174. taxonomy_mapping/__init__.py +0 -0
  175. taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +342 -335
  176. taxonomy_mapping/map_new_lila_datasets.py +154 -154
  177. taxonomy_mapping/prepare_lila_taxonomy_release.py +142 -134
  178. taxonomy_mapping/preview_lila_taxonomy.py +591 -591
  179. taxonomy_mapping/retrieve_sample_image.py +12 -12
  180. taxonomy_mapping/simple_image_download.py +11 -11
  181. taxonomy_mapping/species_lookup.py +10 -10
  182. taxonomy_mapping/taxonomy_csv_checker.py +18 -18
  183. taxonomy_mapping/taxonomy_graph.py +47 -47
  184. taxonomy_mapping/validate_lila_category_mappings.py +83 -76
  185. data_management/cct_json_to_filename_json.py +0 -89
  186. data_management/cct_to_csv.py +0 -140
  187. data_management/databases/remove_corrupted_images_from_db.py +0 -191
  188. detection/detector_training/copy_checkpoints.py +0 -43
  189. md_visualization/visualize_megadb.py +0 -183
  190. megadetector-5.0.7.dist-info/RECORD +0 -202
  191. {megadetector-5.0.7.dist-info → megadetector-5.0.9.dist-info}/LICENSE +0 -0
md_utils/path_utils.py CHANGED
@@ -1,30 +1,37 @@
-########
-#
-# path_utils.py
-#
-# Miscellaneous useful utils for path manipulation, things that could *almost*
-# be in os.path, but aren't.
-#
-########
+"""
+
+path_utils.py
+
+Miscellaneous useful utils for path manipulation, i.e. things that could *almost*
+be in os.path, but aren't.
+
+"""
 
 #%% Imports and constants
 
 import glob
 import ntpath
 import os
-import posixpath
+import sys
+import platform
 import string
 import json
+import shutil
 import unicodedata
 import zipfile
+import tarfile
+import webbrowser
+import subprocess
+import re
 
 from zipfile import ZipFile
 from datetime import datetime
-from typing import Container, Iterable, List, Optional, Tuple, Sequence
 from multiprocessing.pool import Pool, ThreadPool
 from functools import partial
+from shutil import which
 from tqdm import tqdm
 
+# Should all be lower-case
 IMG_EXTENSIONS = ('.jpg', '.jpeg', '.gif', '.png', '.tif', '.tiff', '.bmp')
 
 VALID_FILENAME_CHARS = f"~-_.() {string.ascii_letters}{string.digits}"
@@ -35,14 +42,31 @@ CHAR_LIMIT = 255
 
 #%% General path functions
 
-def recursive_file_list(base_dir, convert_slashes=True,
-                        return_relative_paths=False, sort_files=True,
+def recursive_file_list(base_dir,
+                        convert_slashes=True,
+                        return_relative_paths=False,
+                        sort_files=True,
                         recursive=True):
     r"""
-    Enumerate files (not directories) in [base_dir], optionally converting
-    \ to /
+    Enumerates files (not directories) in [base_dir], optionally converting
+    backslashes to slashes
+
+    Args:
+        base_dir (str): folder to enumerate
+        convert_slashes (bool, optional): force forward slashes; if this is False, will use
+            the native path separator
+        return_relative_paths (bool, optional): return paths that are relative to [base_dir],
+            rather than absolute paths
+        sort_files (bool, optional): force files to be sorted, otherwise uses the sorting
+            provided by os.walk()
+        recursive (bool, optional): enumerate recursively
+
+    Returns:
+        list: list of filenames
     """
 
+    assert os.path.isdir(base_dir), '{} is not a folder'.format(base_dir)
+
     all_files = []
 
     if recursive:
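
A minimal usage sketch of the new keyword-argument form, assuming the wheel's md_utils package layout; the folder path here is hypothetical:

    from md_utils import path_utils

    # Enumerate every file under a (hypothetical) folder, returning sorted,
    # forward-slash paths relative to the base folder
    files = path_utils.recursive_file_list('/data/camera-traps',
                                           convert_slashes=True,
                                           return_relative_paths=True,
                                           sort_files=True,
                                           recursive=True)
    print('Found {} files'.format(len(files)))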
@@ -71,61 +95,51 @@ def file_list(base_dir, convert_slashes=True, return_relative_paths=False, sort_
               recursive=False):
     """
     Trivial wrapper for recursive_file_list, which was a poor function name choice at the time,
-    it doesn't really make sense to have a "recursive" option in a function called "recursive_file_list".
+    since it doesn't really make sense to have a "recursive" option in a function called
+    "recursive_file_list".
+
+    Args:
+        base_dir (str): folder to enumerate
+        convert_slashes (bool, optional): force forward slashes; if this is False, will use
+            the native path separator
+        return_relative_paths (bool, optional): return paths that are relative to [base_dir],
+            rather than absolute paths
+        sort_files (bool, optional): force files to be sorted, otherwise uses the sorting
+            provided by os.walk()
+        recursive (bool, optional): enumerate recursively
+
+    Returns:
+        list: list of filenames
     """
 
     return recursive_file_list(base_dir,convert_slashes,return_relative_paths,sort_files,
                                recursive=recursive)
 
 
-def split_path(path: str) -> List[str]:
-    r"""
-    Splits [path] into all its constituent tokens.
-
-    Non-recursive version of:
-    http://nicks-liquid-soapbox.blogspot.com/2011/03/splitting-path-to-list-in-python.html
-
-    Examples
-    >>> split_path(r'c:\dir\subdir\file.txt')
-    ['c:\\', 'dir', 'subdir', 'file.txt']
-    >>> split_path('/dir/subdir/file.jpg')
-    ['/', 'dir', 'subdir', 'file.jpg']
-    >>> split_path('c:\\')
-    ['c:\\']
-    >>> split_path('/')
-    ['/']
-    """
-
-    parts = []
-    while True:
-        # ntpath seems to do the right thing for both Windows and Unix paths
-        head, tail = ntpath.split(path)
-        if head == '' or head == path:
-            break
-        parts.append(tail)
-        path = head
-    parts.append(head or tail)
-    return parts[::-1]  # reverse
-
-
-def fileparts(path: str) -> Tuple[str, str, str]:
+def fileparts(path):
     r"""
     Breaks down a path into the directory path, filename, and extension.
 
     Note that the '.' lives with the extension, and separators are removed.
 
-    Examples
-    >>> fileparts('file')
-    ('', 'file', '')
-    >>> fileparts(r'c:\dir\file.jpg')
-    ('c:\\dir', 'file', '.jpg')
-    >>> fileparts('/dir/subdir/file.jpg')
-    ('/dir/subdir', 'file', '.jpg')
+    Examples:
+
+    .. code-block:: none
+
+        >>> fileparts('file')
+        ('', 'file', '')
+        >>> fileparts(r'c:/dir/file.jpg')
+        ('c:/dir', 'file', '.jpg')
+        >>> fileparts('/dir/subdir/file.jpg')
+        ('/dir/subdir', 'file', '.jpg')
 
+    Args:
+        path (str): path name to separate into parts
     Returns:
-        p: str, directory path
-        n: str, filename without extension
-        e: str, extension including the '.'
+        tuple: tuple containing (p,n,e):
+            - p: str, directory path
+            - n: str, filename without extension
+            - e: str, extension including the '.'
     """
 
     # ntpath seems to do the right thing for both Windows and Unix paths
@@ -135,79 +149,168 @@ def fileparts(path: str) -> Tuple[str, str, str]:
     return p, n, e
 
 
-def insert_before_extension(filename: str, s: str = '', separator='.') -> str:
+def insert_before_extension(filename, s=None, separator='.'):
     """
     Insert string [s] before the extension in [filename], separated with [separator].
 
     If [s] is empty, generates a date/timestamp. If [filename] has no extension,
     appends [s].
 
-    Examples
-    >>> insert_before_extension('/dir/subdir/file.ext', 'insert')
-    '/dir/subdir/file.insert.ext'
-    >>> insert_before_extension('/dir/subdir/file', 'insert')
-    '/dir/subdir/file.insert'
-    >>> insert_before_extension('/dir/subdir/file')
-    '/dir/subdir/file.2020.07.20.10.54.38'
+    Examples:
+
+    .. code-block:: none
+
+        >>> insert_before_extension('/dir/subdir/file.ext', 'insert')
+        '/dir/subdir/file.insert.ext'
+        >>> insert_before_extension('/dir/subdir/file', 'insert')
+        '/dir/subdir/file.insert'
+        >>> insert_before_extension('/dir/subdir/file')
+        '/dir/subdir/file.2020.07.20.10.54.38'
+
+    Args:
+        filename (str): filename to manipulate
+        s (str, optional): string to insert before the extension in [filename], or
+            None to insert a datestamp
+        separator (str, optional): separator to place between the filename base
+            and the inserted string
+
+    Returns:
+        str: modified string
     """
 
     assert len(filename) > 0
-    if len(s) == 0:
+    if s is None or len(s) == 0:
         s = datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
     name, ext = os.path.splitext(filename)
     return f'{name}{separator}{s}{ext}'
 
 
-def top_level_folder(p: str, windows: Optional[bool] = None) -> str:
+def split_path(path):
+    r"""
+    Splits [path] into all its constituent file/folder tokens.
+
+    Examples:
+
+    .. code-block:: none
+
+        >>> split_path(r'c:\dir\subdir\file.txt')
+        ['c:\\', 'dir', 'subdir', 'file.txt']
+        >>> split_path('/dir/subdir/file.jpg')
+        ['/', 'dir', 'subdir', 'file.jpg']
+        >>> split_path('c:\\')
+        ['c:\\']
+        >>> split_path('/')
+        ['/']
+
+    Args:
+        path (str): path to split into tokens
+
+    Returns:
+        list: list of path tokens
     """
-    Gets the top-level folder from path [p].
+
+    parts = []
+    while True:
+        # ntpath seems to do the right thing for both Windows and Unix paths
+        head, tail = ntpath.split(path)
+        if head == '' or head == path:
+            break
+        parts.append(tail)
+        path = head
+    parts.append(head or tail)
+    return parts[::-1]  # reverse
 
-    This function behaves differently for Windows vs. Unix paths. Set
-    windows=True if [p] is a Windows path. Set windows=None (default) to treat
-    [p] as a native system path.
 
-    On Windows, will use the top-level folder that isn't the drive.
-    >>> top_level_folder(r'c:\blah\foo')
-    'c:\blah'
+def path_is_abs(p):
+    """
+    Determines whether [p] is an absolute path. An absolute path is defined as
+    one that starts with slash, backslash, or a letter followed by a colon.
+
+    Args:
+        p (str): path to evaluate
+
+    Returns:
+        bool: True if [p] is an absolute path, else False
+    """
+
+    return (len(p) > 1) and (p[0] == '/' or p[1] == ':' or p[0] == '\\')
+
 
-    On Unix, does not include the leaf node.
-    >>> top_level_folder('/blah/foo')
-    '/blah'
+def top_level_folder(p):
+    r"""
+    Gets the top-level folder from the path *p*.
+
+    On UNIX, this is straightforward:
+
+    /blah/foo
+
+    ...returns '/blah'
+
+    On Windows, we define this as the top-level folder that isn't the drive, so:
+
+    c:\blah\foo
+
+    ...returns 'c:\blah'.
+
+    Args:
+        p (str): filename to evaluate
+
+    Returns:
+        str: the top-level folder in [p], see above for details on how this is defined
     """
 
     if p == '':
         return ''
-
-    default_lib = os.path  # save default os.path
-    if windows is not None:
-        os.path = ntpath if windows else posixpath
-
-    # Path('/blah').parts is ('/', 'blah')
+
+    # Path('/blah').parts is ('/','blah')
     parts = split_path(p)
+
+    if len(parts) == 1:
+        return parts[0]
 
+    # Handle paths like:
+    #
+    # /, \, /stuff, c:, c:\stuff
     drive = os.path.splitdrive(p)[0]
-    if len(parts) > 1 and (
-            parts[0] == drive
-            or parts[0] == drive + '/'
-            or parts[0] == drive + '\\'
-            or parts[0] in ['\\', '/']):
-        result = os.path.join(parts[0], parts[1])
+    if parts[0] == drive or parts[0] == drive + '/' or parts[0] == drive + '\\' or parts[0] in ['\\', '/']:
+        return os.path.join(parts[0], parts[1])
     else:
-        result = parts[0]
+        return parts[0]
+
+# ...top_level_folder()
 
-    os.path = default_lib  # restore default os.path
-    return result
 
+#%% Test driver for top_level_folder
+
+if False:
+
+    #%%
+
+    p = 'blah/foo/bar'; s = top_level_folder(p); print(s); assert s == 'blah'
+    p = '/blah/foo/bar'; s = top_level_folder(p); print(s); assert s == '/blah'
+    p = 'bar'; s = top_level_folder(p); print(s); assert s == 'bar'
+    p = ''; s = top_level_folder(p); print(s); assert s == ''
+    p = 'c:\\'; s = top_level_folder(p); print(s); assert s == 'c:\\'
+    p = r'c:\blah'; s = top_level_folder(p); print(s); assert s == 'c:\\blah'
+    p = r'c:\foo'; s = top_level_folder(p); print(s); assert s == 'c:\\foo'
+    p = r'c:/foo'; s = top_level_folder(p); print(s); assert s == 'c:/foo'
+    p = r'c:\foo/bar'; s = top_level_folder(p); print(s); assert s == 'c:\\foo'
+
+    #%%
 
 def safe_create_link(link_exists,link_new):
     """
-    Create a symlink at link_new pointing to link_exists.
+    Creates a symlink at [link_new] pointing to [link_exists].
 
-    If link_new already exists, make sure it's a link (not a file),
-    and if it has a different target than link_exists, remove and re-create
+    If [link_new] already exists, make sure it's a link (not a file),
+    and if it has a different target than [link_exists], removes and re-creates
     it.
 
-    Errors if link_new already exists but it's not a link.
+    Errors if [link_new] already exists but it's not a link.
+
+    Args:
+        link_exists (str): the source of the (possibly-new) symlink
+        link_new (str): the target of the (possibly-new) symlink
    """
 
     if os.path.exists(link_new) or os.path.islink(link_new):
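
A short sketch of the new path_is_abs() helper, the now-explicit s=None datestamp behavior of insert_before_extension(), and top_level_folder(); all file paths here are hypothetical:

    from md_utils.path_utils import path_is_abs, insert_before_extension, top_level_folder

    assert path_is_abs('/data/results.json')
    assert path_is_abs(r'c:\data\results.json')
    assert not path_is_abs('data/results.json')

    # With s=None (the new default), a datestamp is inserted before the
    # extension, e.g. '/data/results.2024.01.01.12.00.00.json'
    stamped = insert_before_extension('/data/results.json')

    assert top_level_folder('/data/results/foo') == '/data'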
@@ -219,58 +322,66 @@ def safe_create_link(link_exists,link_new):
         os.symlink(link_exists,link_new)
 
 
-def get_file_sizes(base_dir, convert_slashes=True):
-    """
-    Get sizes recursively for all files in base_dir, returning a dict mapping
-    relative filenames to size.
-    """
-
-    relative_filenames = recursive_file_list(base_dir, convert_slashes=convert_slashes,
-                                             return_relative_paths=True)
-
-    fn_to_size = {}
-    for fn_relative in tqdm(relative_filenames):
-        fn_abs = os.path.join(base_dir,fn_relative)
-        fn_to_size[fn_relative] = os.path.getsize(fn_abs)
-
-    return fn_to_size
-
-
 #%% Image-related path functions
 
-def is_image_file(s: str, img_extensions: Container[str] = IMG_EXTENSIONS
-                  ) -> bool:
+def is_image_file(s, img_extensions=IMG_EXTENSIONS):
     """
     Checks a file's extension against a hard-coded set of image file
-    extensions.
+    extensions. Uses case-insensitive comparison.
 
     Does not check whether the file exists, only determines whether the filename
     implies it's an image file.
+
+    Args:
+        s (str): filename to evaluate for image-ness
+        img_extensions (list, optional): list of known image file extensions
+
+    Returns:
+        bool: True if [s] appears to be an image file, else False
     """
 
     ext = os.path.splitext(s)[1]
     return ext.lower() in img_extensions
 
 
-def find_image_strings(strings: Iterable[str]) -> List[str]:
+def find_image_strings(strings):
     """
     Given a list of strings that are potentially image file names, looks for
     strings that actually look like image file names (based on extension).
+
+    Args:
+        strings (list): list of filenames to check for image-ness
+
+    Returns:
+        list: the subset of [strings] that appear to be image filenames
     """
 
     return [s for s in strings if is_image_file(s)]
 
 
-def find_images(dirname: str, recursive: bool = False,
-                return_relative_paths: bool = False,
-                convert_slashes: bool = False) -> List[str]:
+def find_images(dirname,
+                recursive=False,
+                return_relative_paths=False,
+                convert_slashes=True):
     """
     Finds all files in a directory that look like image file names. Returns
     absolute paths unless return_relative_paths is set. Uses the OS-native
-    path separator unless convert_slahes is set, in which case will always
+    path separator unless convert_slashes is set, in which case will always
     use '/'.
+
+    Args:
+        dirname (str): the folder to search for images
+        recursive (bool, optional): whether to search recursively
+        return_relative_paths (bool, optional): return paths that are relative
+            to [dirname], rather than absolute paths
+        convert_slashes (bool, optional): force forward slashes in return values
+
+    Returns:
+        list: list of image filenames found in [dirname]
     """
 
+    assert os.path.isdir(dirname), '{} is not a folder'.format(dirname)
+
     if recursive:
         strings = glob.glob(os.path.join(dirname, '**', '*.*'), recursive=True)
     else:
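
A typical call to the updated find_images(), which now asserts that the input folder exists and defaults to forward slashes; the folder is hypothetical:

    from md_utils import path_utils

    # Recursively find images, returned as forward-slash paths relative
    # to the input folder
    image_files = path_utils.find_images('/data/camera-traps',
                                         recursive=True,
                                         return_relative_paths=True,
                                         convert_slashes=True)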
@@ -291,16 +402,28 @@ def find_images(dirname: str, recursive: bool = False,
 
 #%% Filename cleaning functions
 
-def clean_filename(filename: str, allow_list: str = VALID_FILENAME_CHARS,
-                   char_limit: int = CHAR_LIMIT, force_lower: bool = False) -> str:
+def clean_filename(filename,
+                   allow_list=VALID_FILENAME_CHARS,
+                   char_limit=CHAR_LIMIT,
+                   force_lower=False):
     r"""
     Removes non-ASCII and other invalid filename characters (on any
-    reasonable OS) from a filename, then trims to a maximum length.
+    reasonable OS) from a filename, then optionally trims to a maximum length.
 
     Does not allow :\/ by default, use clean_path if you want to preserve those.
 
     Adapted from
     https://gist.github.com/wassname/1393c4a57cfcbf03641dbc31886123b8
+
+    Args:
+        filename (str): filename to clean
+        allow_list (str, optional): string containing all allowable filename characters
+        char_limit (int, optional): maximum allowable filename length, if None will skip this
+            step
+        force_lower (bool, optional): convert the resulting filename to lowercase
+
+    Returns:
+        str: cleaned version of [filename]
     """
 
     # keep only valid ascii chars
@@ -316,37 +439,75 @@ def clean_filename(filename: str, allow_list: str = VALID_FILENAME_CHARS,
     return cleaned_filename
 
 
-def clean_path(pathname: str, allow_list: str = VALID_PATH_CHARS,
-               char_limit: int = CHAR_LIMIT, force_lower: bool = False) -> str:
+def clean_path(pathname,
+               allow_list=VALID_PATH_CHARS,
+               char_limit=CHAR_LIMIT,
+               force_lower=False):
     """
     Removes non-ASCII and other invalid path characters (on any reasonable
-    OS) from a path, then trims to a maximum length.
+    OS) from a path, then optionally trims to a maximum length.
+
+    Args:
+        pathname (str): path name to clean
+        allow_list (str, optional): string containing all allowable filename characters
+        char_limit (int, optional): maximum allowable filename length, if None will skip this
+            step
+        force_lower (bool, optional): convert the resulting filename to lowercase
+
+    Returns:
+        str: cleaned version of [pathname]
     """
 
     return clean_filename(pathname, allow_list=allow_list,
                           char_limit=char_limit, force_lower=force_lower)
 
 
-def flatten_path(pathname: str, separator_chars: str = SEPARATOR_CHARS) -> str:
-    """
+def flatten_path(pathname,separator_chars=SEPARATOR_CHARS,separator_char_replacement='~'):
+    r"""
     Removes non-ASCII and other invalid path characters (on any reasonable
     OS) from a path, then trims to a maximum length. Replaces all valid
-    separators with '~'.
+    separators with [separator_char_replacement].
+
+    Args:
+        pathname (str): path name to flatten
+        separator_chars (str, optional): string containing all known path separators
+        separator_char_replacement (str, optional): string to insert in place of
+            path separators.
+
+    Returns:
+        str: flattened version of [pathname]
     """
 
     s = clean_path(pathname)
     for c in separator_chars:
-        s = s.replace(c, '~')
+        s = s.replace(c, separator_char_replacement)
     return s
 
 
-#%% Platform-independent way to open files in their associated application
+def is_executable(filename):
+    """
+    Checks whether [filename] is on the system path and marked as executable.
+
+    Args:
+        filename (str): filename to check for executable status
+
+    Returns:
+        bool: True if [filename] is on the system path and marked as executable, otherwise False
+    """
+
+    # https://stackoverflow.com/questions/11210104/check-if-a-program-exists-from-a-python-script
+
+    return which(filename) is not None
 
-import sys,subprocess,platform,re
+
+#%% Platform-independent way to open files in their associated application
 
 def environment_is_wsl():
     """
-    Returns True if we're running in WSL
+    Determines whether we're running in WSL.
+
+    Returns:
+        True if we're running in WSL.
     """
 
     if sys.platform not in ('linux','posix'):
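
A sketch of the new separator_char_replacement parameter and of is_executable(); the values are hypothetical:

    from md_utils.path_utils import flatten_path, is_executable

    # 'a/b/c.jpg' becomes 'a~b~c.jpg'; the replacement character is now configurable
    flat = flatten_path('a/b/c.jpg', separator_char_replacement='~')

    # True if 'ffmpeg' is on the system path and marked executable
    if is_executable('ffmpeg'):
        print('ffmpeg is available')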
@@ -356,7 +517,7 @@ def environment_is_wsl():
 
 
 def wsl_path_to_windows_path(filename):
-    """
+    r"""
     Converts a WSL path to a Windows path, or returns None if that's not possible. E.g.
     converts:
 
@@ -365,6 +526,12 @@ def wsl_path_to_windows_path(filename):
     ...to:
 
         e:\a\b\c
+
+    Args:
+        filename (str): filename to convert
+
+    Returns:
+        str: Windows equivalent to the WSL path [filename]
     """
 
     result = subprocess.run(['wslpath', '-w', filename], text=True, capture_output=True)
@@ -373,13 +540,38 @@ def wsl_path_to_windows_path(filename):
         return None
     return result.stdout.strip()
 
-
-def open_file(filename,attempt_to_open_in_wsl_host=False):
+
+def open_file(filename, attempt_to_open_in_wsl_host=False, browser_name=None):
     """
-    Opens [filename] in the native OS file handler. If attempt_to_open_in_wsl_host
-    is True, and we're in WSL, attempts to open [filename] in Windows.
+    Opens [filename] in the default OS file handler for this file type.
+
+    If browser_name is not None, uses the webbrowser module to open the filename
+    in the specified browser; see https://docs.python.org/3/library/webbrowser.html
+    for supported browsers. Falls back to the default file handler if webbrowser.open()
+    fails; when a browser is specified, attempt_to_open_in_wsl_host is ignored unless
+    webbrowser.open() fails.
+
+    If browser_name is 'default', uses the system default. This is different from the
+    parameter to webbrowser.get(), where None implies the system default.
+
+    Args:
+        filename (str): file to open
+        attempt_to_open_in_wsl_host: if this is True, and we're in WSL, attempts to open
+            [filename] in the Windows host environment
+        browser_name: see above
     """
 
+    if browser_name is not None:
+        if browser_name == 'chrome':
+            browser_name = 'google-chrome'
+        elif browser_name == 'default':
+            browser_name = None
+        try:
+            result = webbrowser.get(using=browser_name).open(filename)
+        except Exception:
+            result = False
+        if result:
+            return
+
     if sys.platform == 'win32':
 
         os.startfile(filename)
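
A sketch of the new browser_name parameter; the file path is hypothetical:

    from md_utils.path_utils import open_file

    # Open an HTML report in Chrome ('chrome' is mapped to webbrowser's
    # 'google-chrome'); falls back to the OS default handler if that fails
    open_file('/data/preview/index.html', browser_name='chrome')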
@@ -410,10 +602,14 @@ def open_file(filename,attempt_to_open_in_wsl_host=False):
 
 #%% File list functions
 
-def write_list_to_file(output_file: str, strings: Sequence[str]) -> None:
+def write_list_to_file(output_file,strings):
     """
     Writes a list of strings to either a JSON file or text file,
     depending on extension of the given file name.
+
+    Args:
+        output_file (str): file to write
+        strings (list): list of strings to write to [output_file]
     """
 
     with open(output_file, 'w') as f:
@@ -423,9 +619,15 @@ def write_list_to_file(output_file: str, strings: Sequence[str]) -> None:
             f.write('\n'.join(strings))
 
 
-def read_list_from_file(filename: str) -> List[str]:
+def read_list_from_file(filename):
     """
     Reads a json-formatted list of strings from a file.
+
+    Args:
+        filename (str): .json filename to read
+
+    Returns:
+        list: list of strings read from [filename]
     """
 
     assert filename.endswith('.json')
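
These two functions round-trip through .json; a minimal sketch with a hypothetical filename:

    from md_utils.path_utils import write_list_to_file, read_list_from_file

    filenames = ['a.jpg', 'b.jpg']
    write_list_to_file('filenames.json', filenames)  # .json extension selects JSON output
    assert read_list_from_file('filenames.json') == filenames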
@@ -437,11 +639,155 @@ def read_list_from_file(filename: str) -> List[str]:
     return file_list
 
 
+def _copy_file(input_output_tuple,overwrite=True,verbose=False):
+    """
+    Internal function for copying files from within parallel_copy_files.
+    """
+
+    assert len(input_output_tuple) == 2
+    source_fn = input_output_tuple[0]
+    target_fn = input_output_tuple[1]
+    if (not overwrite) and (os.path.isfile(target_fn)):
+        if verbose:
+            print('Skipping existing file {}'.format(target_fn))
+        return
+    os.makedirs(os.path.dirname(target_fn),exist_ok=True)
+    shutil.copyfile(source_fn,target_fn)
+
+
+def parallel_copy_files(input_file_to_output_file, max_workers=16,
+                        use_threads=True, overwrite=False, verbose=False):
+    """
+    Copies files from source to target according to the dict input_file_to_output_file.
+
+    Args:
+        input_file_to_output_file (dict): dictionary mapping source files to the target files
+            to which they should be copied
+        max_workers (int, optional): number of concurrent workers; set to <=1 to disable parallelism
+        use_threads (bool, optional): whether to use threads (True) or processes (False) for
+            parallel copying; ignored if max_workers <= 1
+        overwrite (bool, optional): whether to overwrite existing destination files
+        verbose (bool, optional): enable additional debug output
+    """
+
+    n_workers = min(max_workers,len(input_file_to_output_file))
+
+    # Package the dictionary as a set of 2-tuples
+    input_output_tuples = []
+    for input_fn in input_file_to_output_file:
+        input_output_tuples.append((input_fn,input_file_to_output_file[input_fn]))
+
+    if use_threads:
+        pool = ThreadPool(n_workers)
+    else:
+        pool = Pool(n_workers)
+
+    with tqdm(total=len(input_output_tuples)) as pbar:
+        for i,_ in enumerate(pool.imap_unordered(partial(_copy_file,overwrite=overwrite,verbose=verbose),
+                             input_output_tuples)):
+            pbar.update()
+
+# ...def parallel_copy_files(...)
+
+
+def get_file_sizes(base_dir, convert_slashes=True):
+    """
+    Gets sizes recursively for all files in base_dir, returning a dict mapping
+    relative filenames to size.
+
+    TODO: merge the functionality here with parallel_get_file_sizes, which uses slightly
+    different semantics.
+
+    Args:
+        base_dir (str): folder within which we want all file sizes
+        convert_slashes (bool, optional): force forward slashes in return strings,
+            otherwise uses the native path separator
+
+    Returns:
+        dict: dictionary mapping filenames to file sizes in bytes
+    """
+
+    relative_filenames = recursive_file_list(base_dir, convert_slashes=convert_slashes,
+                                             return_relative_paths=True)
+
+    fn_to_size = {}
+    for fn_relative in tqdm(relative_filenames):
+        fn_abs = os.path.join(base_dir,fn_relative)
+        fn_to_size[fn_relative] = os.path.getsize(fn_abs)
+
+    return fn_to_size
+
+
+def _get_file_size(filename,verbose=False):
+    """
+    Internal function for safely getting the size of a file. Returns a (filename,size)
+    tuple, where size is None if there is an error.
+    """
+
+    try:
+        size = os.path.getsize(filename)
+    except Exception as e:
+        if verbose:
+            print('Error reading file size for {}: {}'.format(filename,str(e)))
+        size = None
+    return (filename,size)
+
+
+def parallel_get_file_sizes(filenames, max_workers=16,
+                            use_threads=True, verbose=False,
+                            recursive=True):
+    """
+    Returns a dictionary mapping every file in [filenames] to the corresponding file size,
+    or None for errors. If [filenames] is a folder, will enumerate the folder (optionally recursively).
+
+    Args:
+        filenames (list or str): list of filenames for which we should read sizes, or a folder
+            within which we should read all file sizes recursively
+        max_workers (int, optional): number of concurrent workers; set to <=1 to disable parallelism
+        use_threads (bool, optional): whether to use threads (True) or processes (False) for
+            parallel size reads; ignored if max_workers <= 1
+        verbose (bool, optional): enable additional debug output
+
+    Returns:
+        dict: dictionary mapping filenames to file sizes in bytes
+    """
+
+    n_workers = min(max_workers,len(filenames))
+
+    if isinstance(filenames,str) and os.path.isdir(filenames):
+        filenames = recursive_file_list(filenames,recursive=recursive,return_relative_paths=False)
+
+    if use_threads:
+        pool = ThreadPool(n_workers)
+    else:
+        pool = Pool(n_workers)
+
+    resize_results = list(tqdm(pool.imap(
+        partial(_get_file_size,verbose=verbose),filenames), total=len(filenames)))
+
+    to_return = {}
+    for r in resize_results:
+        to_return[r[0]] = r[1]
+
+    return to_return
+
+
 #%% Zip functions
 
 def zip_file(input_fn, output_fn=None, overwrite=False, verbose=False, compresslevel=9):
     """
-    Zip a single file, by default writing to a new file called [input_fn].zip
+    Zips a single file.
+
+    Args:
+        input_fn (str): file to zip
+        output_fn (str, optional): target zipfile; if this is None, we'll use
+            [input_fn].zip
+        overwrite (bool, optional): whether to overwrite an existing target file
+        verbose (bool, optional): enable additional debug console output
+        compresslevel (int, optional): compression level to use, between 0 and 9
+
+    Returns:
+        str: the output zipfile, whether we created it or determined that it already exists
     """
 
     basename = os.path.basename(input_fn)
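
A sketch of the new parallel copy/size helpers; all paths are hypothetical:

    from md_utils import path_utils

    # Copy two files to a staging folder, 8 threads at a time
    mapping = {'/data/a.jpg': '/staging/a.jpg',
               '/data/b.jpg': '/staging/b.jpg'}
    path_utils.parallel_copy_files(mapping, max_workers=8, use_threads=True)

    # Sizes (in bytes) for every file in a folder; failed reads map to None
    sizes = path_utils.parallel_get_file_sizes('/staging')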
@@ -451,10 +797,10 @@ def zip_file(input_fn, output_fn=None, overwrite=False, verbose=False, compressl
 
     if (not overwrite) and (os.path.isfile(output_fn)):
         print('Skipping existing file {}'.format(output_fn))
-        return
+        return output_fn
 
     if verbose:
-        print('Zipping {} to {}'.format(input_fn,output_fn))
+        print('Zipping {} to {} with level {}'.format(input_fn,output_fn,compresslevel))
 
     with ZipFile(output_fn,'w',zipfile.ZIP_DEFLATED) as zipf:
         zipf.write(input_fn,arcname=basename,compresslevel=compresslevel,
@@ -463,21 +809,113 @@ def zip_file(input_fn, output_fn=None, overwrite=False, verbose=False, compressl
     return output_fn
 
 
+def add_files_to_single_tar_file(input_files, output_fn, arc_name_base,
+                                 overwrite=False, verbose=False, mode='x'):
+    """
+    Adds all the files in [input_files] to the tar file [output_fn].
+    Archive names are relative to arc_name_base.
+
+    Args:
+        input_files (list): list of absolute filenames to include in the .tar file
+        output_fn (str): .tar file to create
+        arc_name_base (str): absolute folder from which relative paths should be determined;
+            behavior is undefined if there are files in [input_files] that don't live within
+            [arc_name_base]
+        overwrite (bool, optional): whether to overwrite an existing .tar file
+        verbose (bool, optional): enable additional debug console output
+        mode (str, optional): compression type, can be 'x' (no compression), 'x:gz', or 'x:bz2'.
+
+    Returns:
+        str: the output tar file, whether we created it or determined that it already exists
+    """
+
+    if os.path.isfile(output_fn):
+        if not overwrite:
+            print('Tar file {} exists, skipping'.format(output_fn))
+            return output_fn
+        else:
+            print('Tar file {} exists, deleting and re-creating'.format(output_fn))
+            os.remove(output_fn)
+
+    if verbose:
+        print('Adding {} files to {} (mode {})'.format(
+            len(input_files),output_fn,mode))
+
+    with tarfile.open(output_fn,mode) as tarf:
+        for input_fn_abs in tqdm(input_files,disable=(not verbose)):
+            input_fn_relative = os.path.relpath(input_fn_abs,arc_name_base)
+            tarf.add(input_fn_abs,arcname=input_fn_relative)
+
+    return output_fn
+
+
+def zip_files_into_single_zipfile(input_files, output_fn, arc_name_base,
+                                  overwrite=False, verbose=False, compresslevel=9):
+    """
+    Zip all the files in [input_files] into [output_fn]. Archive names are relative to
+    arc_name_base.
+
+    Args:
+        input_files (list): list of absolute filenames to include in the .zip file
+        output_fn (str): .zip file to create
+        arc_name_base (str): absolute folder from which relative paths should be determined;
+            behavior is undefined if there are files in [input_files] that don't live within
+            [arc_name_base]
+        overwrite (bool, optional): whether to overwrite an existing .zip file
+        verbose (bool, optional): enable additional debug console output
+        compresslevel (int, optional): compression level to use, between 0 and 9
+
+    Returns:
+        str: the output zipfile, whether we created it or determined that it already exists
+    """
+
+    if not overwrite:
+        if os.path.isfile(output_fn):
+            print('Zip file {} exists, skipping'.format(output_fn))
+            return output_fn
+
+    if verbose:
+        print('Zipping {} files to {} (compression level {})'.format(
+            len(input_files),output_fn,compresslevel))
+
+    with ZipFile(output_fn,'w',zipfile.ZIP_DEFLATED) as zipf:
+        for input_fn_abs in tqdm(input_files,disable=(not verbose)):
+            input_fn_relative = os.path.relpath(input_fn_abs,arc_name_base)
+            zipf.write(input_fn_abs,
+                       arcname=input_fn_relative,
+                       compresslevel=compresslevel,
+                       compress_type=zipfile.ZIP_DEFLATED)
+
+    return output_fn
+
+
 def zip_folder(input_folder, output_fn=None, overwrite=False, verbose=False, compresslevel=9):
     """
-    Recursively zip everything in [input_folder], storing outputs as relative paths.
+    Recursively zip everything in [input_folder] into a single zipfile, storing outputs as relative
+    paths.
 
-    Defaults to writing to [input_folder].zip
+    Args:
+        input_folder (str): folder to zip
+        output_fn (str, optional): output filename; if this is None, we'll write to [input_folder].zip
+        overwrite (bool, optional): whether to overwrite an existing .zip file
+        verbose (bool, optional): enable additional debug console output
+        compresslevel (int, optional): compression level to use, between 0 and 9
+
+    Returns:
+        str: the output zipfile, whether we created it or determined that it already exists
     """
 
     if output_fn is None:
        output_fn = input_folder + '.zip'
 
     if not overwrite:
-        assert not os.path.isfile(output_fn), 'Zip file {} exists'.format(output_fn)
+        if os.path.isfile(output_fn):
+            print('Zip file {} exists, skipping'.format(output_fn))
+            return
 
     if verbose:
-        print('Zipping {} to {}'.format(input_folder,output_fn))
+        print('Zipping {} to {} (compression level {})'.format(
+            input_folder,output_fn,compresslevel))
 
     relative_filenames = recursive_file_list(input_folder,return_relative_paths=True)
 
@@ -492,10 +930,20 @@ def zip_folder(input_folder, output_fn=None, overwrite=False, verbose=False, com
     return output_fn
 
 
-def parallel_zip_files(input_files, max_workers=16, use_threads=True):
+def parallel_zip_files(input_files, max_workers=16, use_threads=True, compresslevel=9,
+                       overwrite=False, verbose=False):
     """
-    Zip one or more files to separate output files in parallel, leaving the
+    Zips one or more files to separate output files in parallel, leaving the
     original files in place. Each file is zipped to [filename].zip.
+
+    Args:
+        input_files (list): list of files to zip
+        max_workers (int, optional): number of concurrent workers, set to <= 1 to disable parallelism
+        use_threads (bool, optional): whether to use threads (True) or processes (False); ignored if
+            max_workers <= 1
+        compresslevel (int, optional): zip compression level between 0 and 9
+        overwrite (bool, optional): whether to overwrite an existing .zip file
+        verbose (bool, optional): enable additional debug console output
     """
 
     n_workers = min(max_workers,len(input_files))
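
A sketch of the two new single-archive helpers; paths are hypothetical:

    from md_utils import path_utils

    files = ['/data/project/images/a.jpg', '/data/project/images/b.jpg']

    # Archive names are stored relative to arc_name_base, e.g. 'images/a.jpg'
    path_utils.zip_files_into_single_zipfile(files, '/backups/images.zip',
                                             arc_name_base='/data/project')
    path_utils.add_files_to_single_tar_file(files, '/backups/images.tar.gz',
                                            arc_name_base='/data/project',
                                            mode='x:gz')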
@@ -506,15 +954,26 @@ def parallel_zip_files(input_files, max_workers=16, use_threads=True,
         pool = Pool(n_workers)
 
     with tqdm(total=len(input_files)) as pbar:
-        for i,_ in enumerate(pool.imap_unordered(zip_file,input_files)):
+        for i,_ in enumerate(pool.imap_unordered(partial(zip_file,
+            output_fn=None,overwrite=overwrite,verbose=verbose,compresslevel=compresslevel),
+            input_files)):
             pbar.update()
 
 
 def parallel_zip_folders(input_folders, max_workers=16, use_threads=True,
-                         compresslevel=9, overwrite=False):
+                         compresslevel=9, overwrite=False, verbose=False):
     """
-    Zip one or more folders to separate output files in parallel, leaving the
+    Zips one or more folders to separate output files in parallel, leaving the
     original folders in place. Each folder is zipped to [folder_name].zip.
+
+    Args:
+        input_folders (list): list of folders to zip
+        max_workers (int, optional): number of concurrent workers, set to <= 1 to disable parallelism
+        use_threads (bool, optional): whether to use threads (True) or processes (False); ignored if
+            max_workers <= 1
+        compresslevel (int, optional): zip compression level between 0 and 9
+        overwrite (bool, optional): whether to overwrite an existing .zip file
+        verbose (bool, optional): enable additional debug console output
     """
 
     n_workers = min(max_workers,len(input_folders))
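
Typical use of the per-folder parallel zipper; the folders are hypothetical:

    from md_utils import path_utils

    # Produces /data/2023-01.zip and /data/2023-02.zip next to the originals
    path_utils.parallel_zip_folders(['/data/2023-01', '/data/2023-02'],
                                    max_workers=4, compresslevel=9)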
@@ -526,15 +985,56 @@ def parallel_zip_folders(input_folders, max_workers=16, use_threads=True,
 
     with tqdm(total=len(input_folders)) as pbar:
         for i,_ in enumerate(pool.imap_unordered(
-            partial(zip_folder,overwrite=overwrite,compresslevel=compresslevel),
+            partial(zip_folder,overwrite=overwrite,
+                    compresslevel=compresslevel,verbose=verbose),
             input_folders)):
             pbar.update()
 
 
+def zip_each_file_in_folder(folder_name,recursive=False,max_workers=16,use_threads=True,
+                            compresslevel=9,overwrite=False,required_token=None,verbose=False,
+                            exclude_zip=True):
+    """
+    Zips each file in [folder_name] to its own zipfile (filename.zip), optionally recursing. To
+    zip a whole folder into a single zipfile, use zip_folder().
+
+    Args:
+        folder_name (str): the folder within which we should zip files
+        recursive (bool, optional): whether to recurse within [folder_name]
+        max_workers (int, optional): number of concurrent workers, set to <= 1 to disable parallelism
+        use_threads (bool, optional): whether to use threads (True) or processes (False); ignored if
+            max_workers <= 1
+        compresslevel (int, optional): zip compression level between 0 and 9
+        overwrite (bool, optional): whether to overwrite an existing .zip file
+        required_token (str, optional): only zip files whose names contain this string
+        verbose (bool, optional): enable additional debug console output
+        exclude_zip (bool, optional): skip files ending in .zip
+    """
+
+    assert os.path.isdir(folder_name), '{} is not a folder'.format(folder_name)
+
+    input_files = recursive_file_list(folder_name,recursive=recursive,return_relative_paths=False)
+
+    if required_token is not None:
+        input_files = [fn for fn in input_files if required_token in fn]
+
+    if exclude_zip:
+        input_files = [fn for fn in input_files if (not fn.endswith('.zip'))]
+
+    parallel_zip_files(input_files=input_files,max_workers=max_workers,
+                       use_threads=use_threads,compresslevel=compresslevel,
+                       overwrite=overwrite,verbose=verbose)
+
+
 def unzip_file(input_file, output_folder=None):
     """
-    Unzip a zipfile to the specified output folder, defaulting to the same location as
-    the input file
+    Unzips a zipfile to the specified output folder, defaulting to the same location as
+    the input file.
+
+    Args:
+        input_file (str): zipfile to unzip
+        output_folder (str, optional): folder to which we should unzip [input_file], defaults
+            to unzipping to the folder where [input_file] lives
     """
 
     if output_folder is None:
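
A closing sketch of the new zip_each_file_in_folder() next to unzip_file(); paths are hypothetical:

    from md_utils import path_utils

    # Zip every .json results file in a folder to its own filename.zip,
    # skipping anything already zipped
    path_utils.zip_each_file_in_folder('/data/results',
                                       required_token='.json',
                                       exclude_zip=True)

    # Unzip one of them back in place (defaults to the zipfile's own folder)
    path_utils.unzip_file('/data/results/summary.json.zip')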