megadetector-5.0.11-py3-none-any.whl → megadetector-5.0.13-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of megadetector might be problematic.

Files changed (203)
  1. megadetector/api/__init__.py +0 -0
  2. megadetector/api/batch_processing/__init__.py +0 -0
  3. megadetector/api/batch_processing/api_core/__init__.py +0 -0
  4. megadetector/api/batch_processing/api_core/batch_service/__init__.py +0 -0
  5. megadetector/api/batch_processing/api_core/batch_service/score.py +439 -0
  6. megadetector/api/batch_processing/api_core/server.py +294 -0
  7. megadetector/api/batch_processing/api_core/server_api_config.py +97 -0
  8. megadetector/api/batch_processing/api_core/server_app_config.py +55 -0
  9. megadetector/api/batch_processing/api_core/server_batch_job_manager.py +220 -0
  10. megadetector/api/batch_processing/api_core/server_job_status_table.py +149 -0
  11. megadetector/api/batch_processing/api_core/server_orchestration.py +360 -0
  12. megadetector/api/batch_processing/api_core/server_utils.py +88 -0
  13. megadetector/api/batch_processing/api_core_support/__init__.py +0 -0
  14. megadetector/api/batch_processing/api_core_support/aggregate_results_manually.py +46 -0
  15. megadetector/api/batch_processing/api_support/__init__.py +0 -0
  16. megadetector/api/batch_processing/api_support/summarize_daily_activity.py +152 -0
  17. megadetector/api/batch_processing/data_preparation/__init__.py +0 -0
  18. megadetector/api/batch_processing/integration/digiKam/setup.py +6 -0
  19. megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +465 -0
  20. megadetector/api/batch_processing/integration/eMammal/test_scripts/config_template.py +5 -0
  21. megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +125 -0
  22. megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +55 -0
  23. megadetector/api/synchronous/__init__.py +0 -0
  24. megadetector/api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
  25. megadetector/api/synchronous/api_core/animal_detection_api/api_backend.py +152 -0
  26. megadetector/api/synchronous/api_core/animal_detection_api/api_frontend.py +263 -0
  27. megadetector/api/synchronous/api_core/animal_detection_api/config.py +35 -0
  28. megadetector/api/synchronous/api_core/tests/__init__.py +0 -0
  29. megadetector/api/synchronous/api_core/tests/load_test.py +110 -0
  30. megadetector/classification/__init__.py +0 -0
  31. megadetector/classification/aggregate_classifier_probs.py +108 -0
  32. megadetector/classification/analyze_failed_images.py +227 -0
  33. megadetector/classification/cache_batchapi_outputs.py +198 -0
  34. megadetector/classification/create_classification_dataset.py +627 -0
  35. megadetector/classification/crop_detections.py +516 -0
  36. megadetector/classification/csv_to_json.py +226 -0
  37. megadetector/classification/detect_and_crop.py +855 -0
  38. megadetector/classification/efficientnet/__init__.py +9 -0
  39. megadetector/classification/efficientnet/model.py +415 -0
  40. megadetector/classification/efficientnet/utils.py +607 -0
  41. megadetector/classification/evaluate_model.py +520 -0
  42. megadetector/classification/identify_mislabeled_candidates.py +152 -0
  43. megadetector/classification/json_to_azcopy_list.py +63 -0
  44. megadetector/classification/json_validator.py +699 -0
  45. megadetector/classification/map_classification_categories.py +276 -0
  46. megadetector/classification/merge_classification_detection_output.py +506 -0
  47. megadetector/classification/prepare_classification_script.py +194 -0
  48. megadetector/classification/prepare_classification_script_mc.py +228 -0
  49. megadetector/classification/run_classifier.py +287 -0
  50. megadetector/classification/save_mislabeled.py +110 -0
  51. megadetector/classification/train_classifier.py +827 -0
  52. megadetector/classification/train_classifier_tf.py +725 -0
  53. megadetector/classification/train_utils.py +323 -0
  54. megadetector/data_management/__init__.py +0 -0
  55. megadetector/data_management/annotations/__init__.py +0 -0
  56. megadetector/data_management/annotations/annotation_constants.py +34 -0
  57. megadetector/data_management/camtrap_dp_to_coco.py +237 -0
  58. megadetector/data_management/cct_json_utils.py +404 -0
  59. megadetector/data_management/cct_to_md.py +176 -0
  60. megadetector/data_management/cct_to_wi.py +289 -0
  61. megadetector/data_management/coco_to_labelme.py +283 -0
  62. megadetector/data_management/coco_to_yolo.py +662 -0
  63. megadetector/data_management/databases/__init__.py +0 -0
  64. megadetector/data_management/databases/add_width_and_height_to_db.py +33 -0
  65. megadetector/data_management/databases/combine_coco_camera_traps_files.py +206 -0
  66. megadetector/data_management/databases/integrity_check_json_db.py +493 -0
  67. megadetector/data_management/databases/subset_json_db.py +115 -0
  68. megadetector/data_management/generate_crops_from_cct.py +149 -0
  69. megadetector/data_management/get_image_sizes.py +189 -0
  70. megadetector/data_management/importers/add_nacti_sizes.py +52 -0
  71. megadetector/data_management/importers/add_timestamps_to_icct.py +79 -0
  72. megadetector/data_management/importers/animl_results_to_md_results.py +158 -0
  73. megadetector/data_management/importers/auckland_doc_test_to_json.py +373 -0
  74. megadetector/data_management/importers/auckland_doc_to_json.py +201 -0
  75. megadetector/data_management/importers/awc_to_json.py +191 -0
  76. megadetector/data_management/importers/bellevue_to_json.py +273 -0
  77. megadetector/data_management/importers/cacophony-thermal-importer.py +793 -0
  78. megadetector/data_management/importers/carrizo_shrubfree_2018.py +269 -0
  79. megadetector/data_management/importers/carrizo_trail_cam_2017.py +289 -0
  80. megadetector/data_management/importers/cct_field_adjustments.py +58 -0
  81. megadetector/data_management/importers/channel_islands_to_cct.py +913 -0
  82. megadetector/data_management/importers/eMammal/copy_and_unzip_emammal.py +180 -0
  83. megadetector/data_management/importers/eMammal/eMammal_helpers.py +249 -0
  84. megadetector/data_management/importers/eMammal/make_eMammal_json.py +223 -0
  85. megadetector/data_management/importers/ena24_to_json.py +276 -0
  86. megadetector/data_management/importers/filenames_to_json.py +386 -0
  87. megadetector/data_management/importers/helena_to_cct.py +283 -0
  88. megadetector/data_management/importers/idaho-camera-traps.py +1407 -0
  89. megadetector/data_management/importers/idfg_iwildcam_lila_prep.py +294 -0
  90. megadetector/data_management/importers/jb_csv_to_json.py +150 -0
  91. megadetector/data_management/importers/mcgill_to_json.py +250 -0
  92. megadetector/data_management/importers/missouri_to_json.py +490 -0
  93. megadetector/data_management/importers/nacti_fieldname_adjustments.py +79 -0
  94. megadetector/data_management/importers/noaa_seals_2019.py +181 -0
  95. megadetector/data_management/importers/pc_to_json.py +365 -0
  96. megadetector/data_management/importers/plot_wni_giraffes.py +123 -0
  97. megadetector/data_management/importers/prepare-noaa-fish-data-for-lila.py +359 -0
  98. megadetector/data_management/importers/prepare_zsl_imerit.py +131 -0
  99. megadetector/data_management/importers/rspb_to_json.py +356 -0
  100. megadetector/data_management/importers/save_the_elephants_survey_A.py +320 -0
  101. megadetector/data_management/importers/save_the_elephants_survey_B.py +329 -0
  102. megadetector/data_management/importers/snapshot_safari_importer.py +758 -0
  103. megadetector/data_management/importers/snapshot_safari_importer_reprise.py +665 -0
  104. megadetector/data_management/importers/snapshot_serengeti_lila.py +1067 -0
  105. megadetector/data_management/importers/snapshotserengeti/make_full_SS_json.py +150 -0
  106. megadetector/data_management/importers/snapshotserengeti/make_per_season_SS_json.py +153 -0
  107. megadetector/data_management/importers/sulross_get_exif.py +65 -0
  108. megadetector/data_management/importers/timelapse_csv_set_to_json.py +490 -0
  109. megadetector/data_management/importers/ubc_to_json.py +399 -0
  110. megadetector/data_management/importers/umn_to_json.py +507 -0
  111. megadetector/data_management/importers/wellington_to_json.py +263 -0
  112. megadetector/data_management/importers/wi_to_json.py +442 -0
  113. megadetector/data_management/importers/zamba_results_to_md_results.py +181 -0
  114. megadetector/data_management/labelme_to_coco.py +547 -0
  115. megadetector/data_management/labelme_to_yolo.py +272 -0
  116. megadetector/data_management/lila/__init__.py +0 -0
  117. megadetector/data_management/lila/add_locations_to_island_camera_traps.py +97 -0
  118. megadetector/data_management/lila/add_locations_to_nacti.py +147 -0
  119. megadetector/data_management/lila/create_lila_blank_set.py +558 -0
  120. megadetector/data_management/lila/create_lila_test_set.py +152 -0
  121. megadetector/data_management/lila/create_links_to_md_results_files.py +106 -0
  122. megadetector/data_management/lila/download_lila_subset.py +178 -0
  123. megadetector/data_management/lila/generate_lila_per_image_labels.py +516 -0
  124. megadetector/data_management/lila/get_lila_annotation_counts.py +170 -0
  125. megadetector/data_management/lila/get_lila_image_counts.py +112 -0
  126. megadetector/data_management/lila/lila_common.py +300 -0
  127. megadetector/data_management/lila/test_lila_metadata_urls.py +132 -0
  128. megadetector/data_management/ocr_tools.py +870 -0
  129. megadetector/data_management/read_exif.py +809 -0
  130. megadetector/data_management/remap_coco_categories.py +84 -0
  131. megadetector/data_management/remove_exif.py +66 -0
  132. megadetector/data_management/rename_images.py +187 -0
  133. megadetector/data_management/resize_coco_dataset.py +189 -0
  134. megadetector/data_management/wi_download_csv_to_coco.py +247 -0
  135. megadetector/data_management/yolo_output_to_md_output.py +446 -0
  136. megadetector/data_management/yolo_to_coco.py +676 -0
  137. megadetector/detection/__init__.py +0 -0
  138. megadetector/detection/detector_training/__init__.py +0 -0
  139. megadetector/detection/detector_training/model_main_tf2.py +114 -0
  140. megadetector/detection/process_video.py +846 -0
  141. megadetector/detection/pytorch_detector.py +355 -0
  142. megadetector/detection/run_detector.py +779 -0
  143. megadetector/detection/run_detector_batch.py +1219 -0
  144. megadetector/detection/run_inference_with_yolov5_val.py +1087 -0
  145. megadetector/detection/run_tiled_inference.py +934 -0
  146. megadetector/detection/tf_detector.py +192 -0
  147. megadetector/detection/video_utils.py +698 -0
  148. megadetector/postprocessing/__init__.py +0 -0
  149. megadetector/postprocessing/add_max_conf.py +64 -0
  150. megadetector/postprocessing/categorize_detections_by_size.py +165 -0
  151. megadetector/postprocessing/classification_postprocessing.py +716 -0
  152. megadetector/postprocessing/combine_api_outputs.py +249 -0
  153. megadetector/postprocessing/compare_batch_results.py +966 -0
  154. megadetector/postprocessing/convert_output_format.py +396 -0
  155. megadetector/postprocessing/load_api_results.py +195 -0
  156. megadetector/postprocessing/md_to_coco.py +310 -0
  157. megadetector/postprocessing/md_to_labelme.py +330 -0
  158. megadetector/postprocessing/merge_detections.py +412 -0
  159. megadetector/postprocessing/postprocess_batch_results.py +1908 -0
  160. megadetector/postprocessing/remap_detection_categories.py +170 -0
  161. megadetector/postprocessing/render_detection_confusion_matrix.py +660 -0
  162. megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +211 -0
  163. megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +83 -0
  164. megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +1635 -0
  165. megadetector/postprocessing/separate_detections_into_folders.py +730 -0
  166. megadetector/postprocessing/subset_json_detector_output.py +700 -0
  167. megadetector/postprocessing/top_folders_to_bottom.py +223 -0
  168. megadetector/taxonomy_mapping/__init__.py +0 -0
  169. megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +491 -0
  170. megadetector/taxonomy_mapping/map_new_lila_datasets.py +150 -0
  171. megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +142 -0
  172. megadetector/taxonomy_mapping/preview_lila_taxonomy.py +588 -0
  173. megadetector/taxonomy_mapping/retrieve_sample_image.py +71 -0
  174. megadetector/taxonomy_mapping/simple_image_download.py +219 -0
  175. megadetector/taxonomy_mapping/species_lookup.py +834 -0
  176. megadetector/taxonomy_mapping/taxonomy_csv_checker.py +159 -0
  177. megadetector/taxonomy_mapping/taxonomy_graph.py +346 -0
  178. megadetector/taxonomy_mapping/validate_lila_category_mappings.py +83 -0
  179. megadetector/utils/__init__.py +0 -0
  180. megadetector/utils/azure_utils.py +178 -0
  181. megadetector/utils/ct_utils.py +613 -0
  182. megadetector/utils/directory_listing.py +246 -0
  183. megadetector/utils/md_tests.py +1164 -0
  184. megadetector/utils/path_utils.py +1045 -0
  185. megadetector/utils/process_utils.py +160 -0
  186. megadetector/utils/sas_blob_utils.py +509 -0
  187. megadetector/utils/split_locations_into_train_val.py +228 -0
  188. megadetector/utils/string_utils.py +92 -0
  189. megadetector/utils/url_utils.py +323 -0
  190. megadetector/utils/write_html_image_list.py +225 -0
  191. megadetector/visualization/__init__.py +0 -0
  192. megadetector/visualization/plot_utils.py +293 -0
  193. megadetector/visualization/render_images_with_thumbnails.py +275 -0
  194. megadetector/visualization/visualization_utils.py +1536 -0
  195. megadetector/visualization/visualize_db.py +552 -0
  196. megadetector/visualization/visualize_detector_output.py +405 -0
  197. {megadetector-5.0.11.dist-info → megadetector-5.0.13.dist-info}/LICENSE +0 -0
  198. {megadetector-5.0.11.dist-info → megadetector-5.0.13.dist-info}/METADATA +2 -2
  199. megadetector-5.0.13.dist-info/RECORD +201 -0
  200. megadetector-5.0.13.dist-info/top_level.txt +1 -0
  201. megadetector-5.0.11.dist-info/RECORD +0 -5
  202. megadetector-5.0.11.dist-info/top_level.txt +0 -1
  203. {megadetector-5.0.11.dist-info → megadetector-5.0.13.dist-info}/WHEEL +0 -0
megadetector/utils/path_utils.py
@@ -0,0 +1,1045 @@
+ """
+
+ path_utils.py
+
+ Miscellaneous useful utils for path manipulation, i.e. things that could *almost*
+ be in os.path, but aren't.
+
+ """
+
+ #%% Imports and constants
+
+ import glob
+ import ntpath
+ import os
+ import sys
+ import platform
+ import string
+ import json
+ import shutil
+ import unicodedata
+ import zipfile
+ import tarfile
+ import webbrowser
+ import subprocess
+ import re
+
+ from zipfile import ZipFile
+ from datetime import datetime
+ from multiprocessing.pool import Pool, ThreadPool
+ from functools import partial
+ from shutil import which
+ from tqdm import tqdm
+
+ # Should all be lower-case
+ IMG_EXTENSIONS = ('.jpg', '.jpeg', '.gif', '.png', '.tif', '.tiff', '.bmp')
+
+ VALID_FILENAME_CHARS = f"~-_.() {string.ascii_letters}{string.digits}"
+ SEPARATOR_CHARS = r":\/"
+ VALID_PATH_CHARS = VALID_FILENAME_CHARS + SEPARATOR_CHARS
+ CHAR_LIMIT = 255
+
+
+ #%% General path functions
+
+ def recursive_file_list(base_dir,
+                         convert_slashes=True,
+                         return_relative_paths=False,
+                         sort_files=True,
+                         recursive=True):
+     r"""
+     Enumerates files (not directories) in [base_dir], optionally converting
+     backslashes to forward slashes.
+
+     Args:
+         base_dir (str): folder to enumerate
+         convert_slashes (bool, optional): force forward slashes; if this is False, will use
+             the native path separator
+         return_relative_paths (bool, optional): return paths that are relative to [base_dir],
+             rather than absolute paths
+         sort_files (bool, optional): force files to be sorted, otherwise uses the sorting
+             provided by os.walk()
+         recursive (bool, optional): enumerate recursively
+
+     Returns:
+         list: list of filenames
+     """
+
+     assert os.path.isdir(base_dir), '{} is not a folder'.format(base_dir)
+
+     all_files = []
+
+     if recursive:
+         for root, _, filenames in os.walk(base_dir):
+             for filename in filenames:
+                 full_path = os.path.join(root, filename)
+                 all_files.append(full_path)
+     else:
+         all_files_relative = os.listdir(base_dir)
+         all_files = [os.path.join(base_dir,fn) for fn in all_files_relative]
+         all_files = [fn for fn in all_files if os.path.isfile(fn)]
+
+     if return_relative_paths:
+         all_files = [os.path.relpath(fn,base_dir) for fn in all_files]
+
+     if convert_slashes:
+         all_files = [fn.replace('\\', '/') for fn in all_files]
+
+     if sort_files:
+         all_files = sorted(all_files)
+
+     return all_files
+
+
+ def file_list(base_dir, convert_slashes=True, return_relative_paths=False, sort_files=True,
+               recursive=False):
+     """
+     Trivial wrapper for recursive_file_list, which was a poor function name choice at the time,
+     since it doesn't really make sense to have a "recursive" option in a function called
+     "recursive_file_list".
+
+     Args:
+         base_dir (str): folder to enumerate
+         convert_slashes (bool, optional): force forward slashes; if this is False, will use
+             the native path separator
+         return_relative_paths (bool, optional): return paths that are relative to [base_dir],
+             rather than absolute paths
+         sort_files (bool, optional): force files to be sorted, otherwise uses the sorting
+             provided by os.walk()
+         recursive (bool, optional): enumerate recursively
+
+     Returns:
+         list: list of filenames
+     """
+
+     return recursive_file_list(base_dir,convert_slashes,return_relative_paths,sort_files,
+                                recursive=recursive)
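
# A minimal usage sketch for the two enumeration helpers above; the folder
# name is hypothetical, and the import path follows this wheel's layout.
from megadetector.utils.path_utils import recursive_file_list, file_list

# Relative paths to every file under the tree, with forward slashes
relative_files = recursive_file_list('/data/camera-traps', return_relative_paths=True)

# Absolute paths to files in the top-level folder only
top_level_files = file_list('/data/camera-traps', recursive=False)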
+
+
+ def fileparts(path):
+     r"""
+     Breaks down a path into the directory path, filename, and extension.
+
+     Note that the '.' lives with the extension, and separators are removed.
+
+     Examples:
+
+     .. code-block:: none
+
+         >>> fileparts('file')
+         ('', 'file', '')
+         >>> fileparts(r'c:/dir/file.jpg')
+         ('c:/dir', 'file', '.jpg')
+         >>> fileparts('/dir/subdir/file.jpg')
+         ('/dir/subdir', 'file', '.jpg')
+
+     Args:
+         path (str): path name to separate into parts
+
+     Returns:
+         tuple: tuple containing (p,n,e):
+             - p: str, directory path
+             - n: str, filename without extension
+             - e: str, extension including the '.'
+     """
+
+     # ntpath seems to do the right thing for both Windows and Unix paths
+     p = ntpath.dirname(path)
+     basename = ntpath.basename(path)
+     n, e = ntpath.splitext(basename)
+     return p, n, e
+
+
+ def insert_before_extension(filename, s=None, separator='.'):
+     """
+     Inserts string [s] before the extension in [filename], separated with [separator].
+
+     If [s] is empty, generates a date/timestamp. If [filename] has no extension,
+     appends [s].
+
+     Examples:
+
+     .. code-block:: none
+
+         >>> insert_before_extension('/dir/subdir/file.ext', 'insert')
+         '/dir/subdir/file.insert.ext'
+         >>> insert_before_extension('/dir/subdir/file', 'insert')
+         '/dir/subdir/file.insert'
+         >>> insert_before_extension('/dir/subdir/file')
+         '/dir/subdir/file.2020.07.20.10.54.38'
+
+     Args:
+         filename (str): filename to manipulate
+         s (str, optional): string to insert before the extension in [filename], or
+             None to insert a datestamp
+         separator (str, optional): separator to place between the filename base
+             and the inserted string
+
+     Returns:
+         str: modified string
+     """
+
+     assert len(filename) > 0
+     if s is None or len(s) == 0:
+         s = datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
+     name, ext = os.path.splitext(filename)
+     return f'{name}{separator}{s}{ext}'
+
+
+ def split_path(path):
+     r"""
+     Splits [path] into all its constituent file/folder tokens.
+
+     Examples:
+
+     .. code-block:: none
+
+         >>> split_path(r'c:\dir\subdir\file.txt')
+         ['c:\\', 'dir', 'subdir', 'file.txt']
+         >>> split_path('/dir/subdir/file.jpg')
+         ['/', 'dir', 'subdir', 'file.jpg']
+         >>> split_path('c:\\')
+         ['c:\\']
+         >>> split_path('/')
+         ['/']
+
+     Args:
+         path (str): path to split into tokens
+
+     Returns:
+         list: list of path tokens
+     """
+
+     parts = []
+     while True:
+         # ntpath seems to do the right thing for both Windows and Unix paths
+         head, tail = ntpath.split(path)
+         if head == '' or head == path:
+             break
+         parts.append(tail)
+         path = head
+     parts.append(head or tail)
+     return parts[::-1]  # reverse
+
+
+ def path_is_abs(p):
+     """
+     Determines whether [p] is an absolute path. An absolute path is defined as
+     one that starts with slash, backslash, or a letter followed by a colon.
+
+     Args:
+         p (str): path to evaluate
+
+     Returns:
+         bool: True if [p] is an absolute path, else False
+     """
+
+     return (len(p) > 1) and (p[0] == '/' or p[1] == ':' or p[0] == '\\')
+
+
+ def top_level_folder(p):
+     r"""
+     Gets the top-level folder from the path *p*.
+
+     On UNIX, this is straightforward:
+
+     /blah/foo
+
+     ...returns '/blah'
+
+     On Windows, we define this as the top-level folder that isn't the drive, so:
+
+     c:\blah\foo
+
+     ...returns 'c:\blah'.
+
+     Args:
+         p (str): filename to evaluate
+
+     Returns:
+         str: the top-level folder in [p], see above for details on how this is defined
+     """
+
+     if p == '':
+         return ''
+
+     # Path('/blah').parts is ('/','blah')
+     parts = split_path(p)
+
+     if len(parts) == 1:
+         return parts[0]
+
+     # Handle paths like:
+     #
+     # /, \, /stuff, c:, c:\stuff
+     drive = os.path.splitdrive(p)[0]
+     if parts[0] == drive or parts[0] == drive + '/' or parts[0] == drive + '\\' or parts[0] in ['\\', '/']:
+         return os.path.join(parts[0], parts[1])
+     else:
+         return parts[0]
+
+ # ...top_level_folder()
+
+
+ #%% Test driver for top_level_folder
+
+ if False:
+
+     #%%
+
+     p = 'blah/foo/bar'; s = top_level_folder(p); print(s); assert s == 'blah'
+     p = '/blah/foo/bar'; s = top_level_folder(p); print(s); assert s == '/blah'
+     p = 'bar'; s = top_level_folder(p); print(s); assert s == 'bar'
+     p = ''; s = top_level_folder(p); print(s); assert s == ''
+     p = 'c:\\'; s = top_level_folder(p); print(s); assert s == 'c:\\'
+     p = r'c:\blah'; s = top_level_folder(p); print(s); assert s == 'c:\\blah'
+     p = r'c:\foo'; s = top_level_folder(p); print(s); assert s == 'c:\\foo'
+     p = r'c:/foo'; s = top_level_folder(p); print(s); assert s == 'c:/foo'
+     p = r'c:\foo/bar'; s = top_level_folder(p); print(s); assert s == 'c:\\foo'
+
+     #%%
+
+ def safe_create_link(link_exists,link_new):
+     """
+     Creates a symlink at [link_new] pointing to [link_exists].
+
+     If [link_new] already exists, makes sure it's a link (not a file), and if it
+     points to a different target than [link_exists], removes and re-creates it.
+
+     Errors if [link_new] already exists but is not a link.
+
+     Args:
+         link_exists (str): the existing file/folder the symlink should point to
+         link_new (str): the symlink to create (or validate, if it already exists)
+     """
+
+     if os.path.exists(link_new) or os.path.islink(link_new):
+         assert os.path.islink(link_new)
+         if not os.readlink(link_new) == link_exists:
+             os.remove(link_new)
+             os.symlink(link_exists,link_new)
+     else:
+         os.symlink(link_exists,link_new)
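
# A POSIX-only sketch of safe_create_link (hypothetical paths): re-running is a
# no-op when the link already points to the right target, and re-points it otherwise.
from megadetector.utils.path_utils import safe_create_link

safe_create_link('/data/models/md_v5a.0.0.pt', '/data/models/current.pt')
safe_create_link('/data/models/md_v5a.0.0.pt', '/data/models/current.pt')  # no-op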
+
+
+ #%% Image-related path functions
+
+ def is_image_file(s, img_extensions=IMG_EXTENSIONS):
+     """
+     Checks a file's extension against a hard-coded set of image file
+     extensions. Uses case-insensitive comparison.
+
+     Does not check whether the file exists, only determines whether the filename
+     implies it's an image file.
+
+     Args:
+         s (str): filename to evaluate for image-ness
+         img_extensions (list, optional): list of known image file extensions
+
+     Returns:
+         bool: True if [s] appears to be an image file, else False
+     """
+
+     ext = os.path.splitext(s)[1]
+     return ext.lower() in img_extensions
+
+
+ def find_image_strings(strings):
+     """
+     Given a list of strings that are potentially image file names, looks for
+     strings that actually look like image file names (based on extension).
+
+     Args:
+         strings (list): list of filenames to check for image-ness
+
+     Returns:
+         list: the subset of [strings] that appear to be image filenames
+     """
+
+     return [s for s in strings if is_image_file(s)]
+
+
+ def find_images(dirname,
+                 recursive=False,
+                 return_relative_paths=False,
+                 convert_slashes=True):
+     """
+     Finds all files in a directory that look like image file names. Returns
+     absolute paths unless return_relative_paths is set. Uses the OS-native
+     path separator unless convert_slashes is set, in which case will always
+     use '/'.
+
+     Args:
+         dirname (str): the folder to search for images
+         recursive (bool, optional): whether to search recursively
+         return_relative_paths (bool, optional): return paths that are relative
+             to [dirname], rather than absolute paths
+         convert_slashes (bool, optional): force forward slashes in return values
+
+     Returns:
+         list: list of image filenames found in [dirname]
+     """
+
+     assert os.path.isdir(dirname), '{} is not a folder'.format(dirname)
+
+     if recursive:
+         strings = glob.glob(os.path.join(dirname, '**', '*.*'), recursive=True)
+     else:
+         strings = glob.glob(os.path.join(dirname, '*.*'))
+
+     image_files = find_image_strings(strings)
+
+     if return_relative_paths:
+         image_files = [os.path.relpath(fn,dirname) for fn in image_files]
+
+     image_files = sorted(image_files)
+
+     if convert_slashes:
+         image_files = [fn.replace('\\', '/') for fn in image_files]
+
+     return image_files
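
# Usage sketch for find_images (hypothetical folder); extensions are matched
# case-insensitively against IMG_EXTENSIONS.
from megadetector.utils.path_utils import find_images

images = find_images('/data/camera-traps', recursive=True, return_relative_paths=True)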
+
+
+ #%% Filename cleaning functions
+
+ def clean_filename(filename,
+                    allow_list=VALID_FILENAME_CHARS,
+                    char_limit=CHAR_LIMIT,
+                    force_lower=False):
+     r"""
+     Removes non-ASCII and other invalid filename characters (on any
+     reasonable OS) from a filename, then optionally trims to a maximum length.
+
+     Does not allow :\/ by default; use clean_path if you want to preserve those.
+
+     Adapted from
+     https://gist.github.com/wassname/1393c4a57cfcbf03641dbc31886123b8
+
+     Args:
+         filename (str): filename to clean
+         allow_list (str, optional): string containing all allowable filename characters
+         char_limit (int, optional): maximum allowable filename length, if None will skip this
+             step
+         force_lower (bool, optional): convert the resulting filename to lowercase
+
+     Returns:
+         str: cleaned version of [filename]
+     """
+
+     # Keep only valid ASCII chars
+     cleaned_filename = (unicodedata.normalize('NFKD', filename)
+                         .encode('ASCII', 'ignore').decode())
+
+     # Keep only allow-listed chars
+     cleaned_filename = ''.join([c for c in cleaned_filename if c in allow_list])
+     if char_limit is not None:
+         cleaned_filename = cleaned_filename[:char_limit]
+     if force_lower:
+         cleaned_filename = cleaned_filename.lower()
+     return cleaned_filename
+
+
+ def clean_path(pathname,
+                allow_list=VALID_PATH_CHARS,
+                char_limit=CHAR_LIMIT,
+                force_lower=False):
+     """
+     Removes non-ASCII and other invalid path characters (on any reasonable
+     OS) from a path, then optionally trims to a maximum length.
+
+     Args:
+         pathname (str): path name to clean
+         allow_list (str, optional): string containing all allowable path characters
+         char_limit (int, optional): maximum allowable path length, if None will skip this
+             step
+         force_lower (bool, optional): convert the resulting path to lowercase
+
+     Returns:
+         str: cleaned version of [pathname]
+     """
+
+     return clean_filename(pathname, allow_list=allow_list,
+                           char_limit=char_limit, force_lower=force_lower)
+
+
+ def flatten_path(pathname,separator_chars=SEPARATOR_CHARS,separator_char_replacement='~'):
+     r"""
+     Removes non-ASCII and other invalid path characters (on any reasonable
+     OS) from a path, then trims to a maximum length. Replaces all valid
+     separators with [separator_char_replacement].
+
+     Args:
+         pathname (str): path name to flatten
+         separator_chars (str, optional): string containing all known path separators
+         separator_char_replacement (str, optional): string to insert in place of
+             path separators
+
+     Returns:
+         str: flattened version of [pathname]
+     """
+
+     s = clean_path(pathname)
+     for c in separator_chars:
+         s = s.replace(c, separator_char_replacement)
+     return s
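
# Expected behavior of the cleaning helpers on a made-up path: clean_filename
# strips separators, clean_path keeps them, flatten_path replaces them.
from megadetector.utils.path_utils import clean_filename, clean_path, flatten_path

assert clean_filename('trail:cam/IMG_0001.jpg') == 'trailcamIMG_0001.jpg'
assert clean_path('trail:cam/IMG_0001.jpg') == 'trail:cam/IMG_0001.jpg'
assert flatten_path('trail:cam/IMG_0001.jpg') == 'trail~cam~IMG_0001.jpg'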
+
+
+ def is_executable(filename):
+     """
+     Checks whether [filename] is on the system path and marked as executable.
+
+     Args:
+         filename (str): filename to check for executable status
+
+     Returns:
+         bool: True if [filename] is on the system path and marked as executable, otherwise False
+     """
+
+     # https://stackoverflow.com/questions/11210104/check-if-a-program-exists-from-a-python-script
+     return which(filename) is not None
+
+
+ #%% Platform-independent way to open files in their associated application
+
+ def environment_is_wsl():
+     """
+     Determines whether we're running in WSL.
+
+     Returns:
+         bool: True if we're running in WSL
+     """
+
+     if sys.platform not in ('linux','posix'):
+         return False
+     platform_string = ' '.join(platform.uname()).lower()
+     return 'microsoft' in platform_string and 'wsl' in platform_string
+
+
+ def wsl_path_to_windows_path(filename):
+     r"""
+     Converts a WSL path to a Windows path, or returns None if that's not possible.
+     E.g. converts:
+
+     /mnt/e/a/b/c
+
+     ...to:
+
+     e:\a\b\c
+
+     Args:
+         filename (str): filename to convert
+
+     Returns:
+         str: Windows equivalent to the WSL path [filename], or None if the
+             conversion fails
+     """
+
+     result = subprocess.run(['wslpath', '-w', filename], text=True, capture_output=True)
+     if result.returncode != 0:
+         print('Could not convert path {} from WSL to Windows'.format(filename))
+         return None
+     return result.stdout.strip()
+
+
+ def open_file(filename, attempt_to_open_in_wsl_host=False, browser_name=None):
+     """
+     Opens [filename] in the default OS file handler for this file type.
+
+     If browser_name is not None, uses the webbrowser module to open the filename
+     in the specified browser; see https://docs.python.org/3/library/webbrowser.html
+     for supported browsers. Falls back to the default file handler if
+     webbrowser.open() fails; when a browser is specified, attempt_to_open_in_wsl_host
+     is only considered if webbrowser.open() fails.
+
+     If browser_name is 'default', uses the system default browser. This is different
+     from the parameter to webbrowser.get(), where None implies the system default.
+
+     Args:
+         filename (str): file to open
+         attempt_to_open_in_wsl_host (bool, optional): if this is True, and we're in WSL,
+             attempts to open [filename] in the Windows host environment
+         browser_name (str, optional): see above
+     """
+
+     if browser_name is not None:
+         if browser_name == 'chrome':
+             browser_name = 'google-chrome'
+         elif browser_name == 'default':
+             browser_name = None
+         try:
+             result = webbrowser.get(using=browser_name).open(filename)
+         except Exception:
+             result = False
+         if result:
+             return
+
+     if sys.platform == 'win32':
+
+         os.startfile(filename)
+
+     elif sys.platform == 'darwin':
+
+         opener = 'open'
+         subprocess.call([opener, filename])
+
+     elif attempt_to_open_in_wsl_host and environment_is_wsl():
+
+         windows_path = wsl_path_to_windows_path(filename)
+
+         # Fall back to xdg-open if we couldn't convert the path
+         if windows_path is None:
+             subprocess.call(['xdg-open', filename])
+             return
+
+         if os.path.isdir(filename):
+             subprocess.run(["explorer.exe", windows_path])
+         else:
+             os.system("cmd.exe /C start %s" % (re.escape(windows_path)))
+
+     else:
+
+         opener = 'xdg-open'
+         subprocess.call([opener, filename])
+
+
+ #%% File list functions
+
+ def write_list_to_file(output_file,strings):
+     """
+     Writes a list of strings to either a JSON file or a text file,
+     depending on the extension of the given file name.
+
+     Args:
+         output_file (str): file to write
+         strings (list): list of strings to write to [output_file]
+     """
+
+     with open(output_file, 'w') as f:
+         if output_file.endswith('.json'):
+             json.dump(strings, f, indent=1)
+         else:
+             f.write('\n'.join(strings))
+
+
+ def read_list_from_file(filename):
+     """
+     Reads a json-formatted list of strings from a file.
+
+     Args:
+         filename (str): .json filename to read
+
+     Returns:
+         list: list of strings read from [filename]
+     """
+
+     assert filename.endswith('.json')
+     with open(filename, 'r') as f:
+         file_list = json.load(f)
+     assert isinstance(file_list, list)
+     for s in file_list:
+         assert isinstance(s, str)
+     return file_list
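
# Round-trip sketch for the list I/O helpers (hypothetical filename): .json
# output round-trips through read_list_from_file; any other extension is
# written as newline-delimited text.
from megadetector.utils.path_utils import write_list_to_file, read_list_from_file

write_list_to_file('/tmp/images.json', ['a.jpg', 'b.jpg'])
assert read_list_from_file('/tmp/images.json') == ['a.jpg', 'b.jpg']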
+
+
+ def _copy_file(input_output_tuple,overwrite=True,verbose=False):
+     """
+     Internal function for copying files from within parallel_copy_files.
+     """
+
+     assert len(input_output_tuple) == 2
+     source_fn = input_output_tuple[0]
+     target_fn = input_output_tuple[1]
+     if (not overwrite) and (os.path.isfile(target_fn)):
+         if verbose:
+             print('Skipping existing file {}'.format(target_fn))
+         return
+     os.makedirs(os.path.dirname(target_fn),exist_ok=True)
+     shutil.copyfile(source_fn,target_fn)
+
+
+ def parallel_copy_files(input_file_to_output_file, max_workers=16,
+                         use_threads=True, overwrite=False, verbose=False):
+     """
+     Copies files from source to target according to the dict input_file_to_output_file.
+
+     Args:
+         input_file_to_output_file (dict): dictionary mapping source files to the target files
+             to which they should be copied
+         max_workers (int, optional): number of concurrent workers; set to <=1 to disable parallelism
+         use_threads (bool, optional): whether to use threads (True) or processes (False) for
+             parallel copying; ignored if max_workers <= 1
+         overwrite (bool, optional): whether to overwrite existing destination files
+         verbose (bool, optional): enable additional debug output
+     """
+
+     n_workers = min(max_workers,len(input_file_to_output_file))
+
+     # Package the dictionary as a set of 2-tuples
+     input_output_tuples = []
+     for input_fn in input_file_to_output_file:
+         input_output_tuples.append((input_fn,input_file_to_output_file[input_fn]))
+
+     if use_threads:
+         pool = ThreadPool(n_workers)
+     else:
+         pool = Pool(n_workers)
+
+     with tqdm(total=len(input_output_tuples)) as pbar:
+         for i,_ in enumerate(pool.imap_unordered(partial(_copy_file,overwrite=overwrite,verbose=verbose),
+                                                  input_output_tuples)):
+             pbar.update()
+
+ # ...def parallel_copy_files(...)
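
# Sketch of parallel_copy_files, building the source->target dict from an
# enumeration (hypothetical folders); target folders are created as needed.
from megadetector.utils.path_utils import recursive_file_list, parallel_copy_files

source_folder = '/data/in'
target_folder = '/data/out'
mapping = {fn: fn.replace(source_folder, target_folder, 1)
           for fn in recursive_file_list(source_folder)}
parallel_copy_files(mapping, max_workers=8, use_threads=True)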
+
+
+ def get_file_sizes(base_dir, convert_slashes=True):
+     """
+     Gets sizes recursively for all files in base_dir, returning a dict mapping
+     relative filenames to size.
+
+     TODO: merge the functionality here with parallel_get_file_sizes, which uses slightly
+     different semantics.
+
+     Args:
+         base_dir (str): folder within which we want all file sizes
+         convert_slashes (bool, optional): force forward slashes in return strings,
+             otherwise uses the native path separator
+
+     Returns:
+         dict: dictionary mapping filenames to file sizes in bytes
+     """
+
+     relative_filenames = recursive_file_list(base_dir, convert_slashes=convert_slashes,
+                                              return_relative_paths=True)
+
+     fn_to_size = {}
+     for fn_relative in tqdm(relative_filenames):
+         fn_abs = os.path.join(base_dir,fn_relative)
+         fn_to_size[fn_relative] = os.path.getsize(fn_abs)
+
+     return fn_to_size
+
+
+ def _get_file_size(filename,verbose=False):
+     """
+     Internal function for safely getting the size of a file. Returns a (filename,size)
+     tuple, where size is None if there is an error.
+     """
+
+     try:
+         size = os.path.getsize(filename)
+     except Exception as e:
+         if verbose:
+             print('Error reading file size for {}: {}'.format(filename,str(e)))
+         size = None
+     return (filename,size)
+
+
+ def parallel_get_file_sizes(filenames, max_workers=16,
+                             use_threads=True, verbose=False,
+                             recursive=True):
+     """
+     Returns a dictionary mapping every file in [filenames] to the corresponding file size,
+     or None for errors. If [filenames] is a folder, enumerates the folder (optionally
+     recursively).
+
+     Args:
+         filenames (list or str): list of filenames for which we should read sizes, or a folder
+             within which we should read all file sizes recursively
+         max_workers (int, optional): number of concurrent workers; set to <=1 to disable parallelism
+         use_threads (bool, optional): whether to use threads (True) or processes (False) for
+             parallel size reads; ignored if max_workers <= 1
+         verbose (bool, optional): enable additional debug output
+         recursive (bool, optional): enumerate recursively, only relevant if [filenames] is a folder
+
+     Returns:
+         dict: dictionary mapping filenames to file sizes in bytes
+     """
+
+     if isinstance(filenames,str) and os.path.isdir(filenames):
+         filenames = recursive_file_list(filenames,recursive=recursive,return_relative_paths=False)
+
+     # Compute the worker count only after [filenames] has (possibly) been expanded
+     # from a folder into a file list
+     n_workers = min(max_workers,len(filenames))
+
+     if use_threads:
+         pool = ThreadPool(n_workers)
+     else:
+         pool = Pool(n_workers)
+
+     resize_results = list(tqdm(pool.imap(
+         partial(_get_file_size,verbose=verbose),filenames), total=len(filenames)))
+
+     to_return = {}
+     for r in resize_results:
+         to_return[r[0]] = r[1]
+
+     return to_return
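
# Sketch of parallel_get_file_sizes on a folder (hypothetical path); values are
# sizes in bytes, or None where a size could not be read.
from megadetector.utils.path_utils import parallel_get_file_sizes

size_by_file = parallel_get_file_sizes('/data/camera-traps', max_workers=8)
total_bytes = sum(v for v in size_by_file.values() if v is not None)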
+
+
+ #%% Zip functions
+
+ def zip_file(input_fn, output_fn=None, overwrite=False, verbose=False, compresslevel=9):
+     """
+     Zips a single file.
+
+     Args:
+         input_fn (str): file to zip
+         output_fn (str, optional): target zipfile; if this is None, we'll use
+             [input_fn].zip
+         overwrite (bool, optional): whether to overwrite an existing target file
+         verbose (bool, optional): enable additional debug console output
+         compresslevel (int, optional): compression level to use, between 0 and 9
+
+     Returns:
+         str: the output zipfile, whether we created it or determined that it already exists
+     """
+
+     basename = os.path.basename(input_fn)
+
+     if output_fn is None:
+         output_fn = input_fn + '.zip'
+
+     if (not overwrite) and (os.path.isfile(output_fn)):
+         print('Skipping existing file {}'.format(output_fn))
+         return output_fn
+
+     if verbose:
+         print('Zipping {} to {} with level {}'.format(input_fn,output_fn,compresslevel))
+
+     with ZipFile(output_fn,'w',zipfile.ZIP_DEFLATED) as zipf:
+         zipf.write(input_fn,arcname=basename,compresslevel=compresslevel,
+                    compress_type=zipfile.ZIP_DEFLATED)
+
+     return output_fn
+
+
+ def add_files_to_single_tar_file(input_files, output_fn, arc_name_base,
+                                  overwrite=False, verbose=False, mode='x'):
+     """
+     Adds all the files in [input_files] to the tar file [output_fn].
+     Archive names are relative to arc_name_base.
+
+     Args:
+         input_files (list): list of absolute filenames to include in the .tar file
+         output_fn (str): .tar file to create
+         arc_name_base (str): absolute folder from which relative paths should be determined;
+             behavior is undefined if there are files in [input_files] that don't live within
+             [arc_name_base]
+         overwrite (bool, optional): whether to overwrite an existing .tar file
+         verbose (bool, optional): enable additional debug console output
+         mode (str, optional): compression type, can be 'x' (no compression), 'x:gz', or 'x:bz2'
+
+     Returns:
+         str: the output tar file, whether we created it or determined that it already exists
+     """
+
+     if os.path.isfile(output_fn):
+         if not overwrite:
+             print('Tar file {} exists, skipping'.format(output_fn))
+             return output_fn
+         else:
+             print('Tar file {} exists, deleting and re-creating'.format(output_fn))
+             os.remove(output_fn)
+
+     if verbose:
+         print('Adding {} files to {} (mode {})'.format(
+             len(input_files),output_fn,mode))
+
+     with tarfile.open(output_fn,mode) as tarf:
+         for input_fn_abs in tqdm(input_files,disable=(not verbose)):
+             input_fn_relative = os.path.relpath(input_fn_abs,arc_name_base)
+             tarf.add(input_fn_abs,arcname=input_fn_relative)
+
+     return output_fn
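
# Sketch of add_files_to_single_tar_file (hypothetical paths); 'x:gz' writes a
# gzip-compressed tarball, and archive names are stored relative to /data/in.
from megadetector.utils.path_utils import recursive_file_list, add_files_to_single_tar_file

files = recursive_file_list('/data/in')
add_files_to_single_tar_file(files, '/data/in.tar.gz', '/data/in', mode='x:gz')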
+
+
+ def zip_files_into_single_zipfile(input_files, output_fn, arc_name_base,
+                                   overwrite=False, verbose=False, compresslevel=9):
+     """
+     Zips all the files in [input_files] into [output_fn]. Archive names are relative to
+     arc_name_base.
+
+     Args:
+         input_files (list): list of absolute filenames to include in the .zip file
+         output_fn (str): .zip file to create
+         arc_name_base (str): absolute folder from which relative paths should be determined;
+             behavior is undefined if there are files in [input_files] that don't live within
+             [arc_name_base]
+         overwrite (bool, optional): whether to overwrite an existing .zip file
+         verbose (bool, optional): enable additional debug console output
+         compresslevel (int, optional): compression level to use, between 0 and 9
+
+     Returns:
+         str: the output zipfile, whether we created it or determined that it already exists
+     """
+
+     if not overwrite:
+         if os.path.isfile(output_fn):
+             print('Zip file {} exists, skipping'.format(output_fn))
+             return output_fn
+
+     if verbose:
+         print('Zipping {} files to {} (compression level {})'.format(
+             len(input_files),output_fn,compresslevel))
+
+     with ZipFile(output_fn,'w',zipfile.ZIP_DEFLATED) as zipf:
+         for input_fn_abs in tqdm(input_files,disable=(not verbose)):
+             input_fn_relative = os.path.relpath(input_fn_abs,arc_name_base)
+             zipf.write(input_fn_abs,
+                        arcname=input_fn_relative,
+                        compresslevel=compresslevel,
+                        compress_type=zipfile.ZIP_DEFLATED)
+
+     return output_fn
+
+
+ def zip_folder(input_folder, output_fn=None, overwrite=False, verbose=False, compresslevel=9):
+     """
+     Recursively zips everything in [input_folder] into a single zipfile, storing outputs as
+     relative paths.
+
+     Args:
+         input_folder (str): folder to zip
+         output_fn (str, optional): output filename; if this is None, we'll write to [input_folder].zip
+         overwrite (bool, optional): whether to overwrite an existing .zip file
+         verbose (bool, optional): enable additional debug console output
+         compresslevel (int, optional): compression level to use, between 0 and 9
+
+     Returns:
+         str: the output zipfile, whether we created it or determined that it already exists
+     """
+
+     if output_fn is None:
+         output_fn = input_folder + '.zip'
+
+     if not overwrite:
+         if os.path.isfile(output_fn):
+             print('Zip file {} exists, skipping'.format(output_fn))
+             return output_fn
+
+     if verbose:
+         print('Zipping {} to {} (compression level {})'.format(
+             input_folder,output_fn,compresslevel))
+
+     relative_filenames = recursive_file_list(input_folder,return_relative_paths=True)
+
+     with ZipFile(output_fn,'w',zipfile.ZIP_DEFLATED) as zipf:
+         for input_fn_relative in tqdm(relative_filenames,disable=(not verbose)):
+             input_fn_abs = os.path.join(input_folder,input_fn_relative)
+             zipf.write(input_fn_abs,
+                        arcname=input_fn_relative,
+                        compresslevel=compresslevel,
+                        compress_type=zipfile.ZIP_DEFLATED)
+
+     return output_fn
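
# Sketch of zip_folder (hypothetical folder); with output_fn=None this writes
# /data/in.zip, storing entries as paths relative to /data/in.
from megadetector.utils.path_utils import zip_folder

zip_folder('/data/in', compresslevel=9, verbose=True)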
+
+
+ def parallel_zip_files(input_files, max_workers=16, use_threads=True, compresslevel=9,
+                        overwrite=False, verbose=False):
+     """
+     Zips one or more files to separate output files in parallel, leaving the
+     original files in place. Each file is zipped to [filename].zip.
+
+     Args:
+         input_files (list): list of files to zip
+         max_workers (int, optional): number of concurrent workers, set to <= 1 to disable parallelism
+         use_threads (bool, optional): whether to use threads (True) or processes (False); ignored if
+             max_workers <= 1
+         compresslevel (int, optional): zip compression level between 0 and 9
+         overwrite (bool, optional): whether to overwrite existing .zip files
+         verbose (bool, optional): enable additional debug console output
+     """
+
+     n_workers = min(max_workers,len(input_files))
+
+     if use_threads:
+         pool = ThreadPool(n_workers)
+     else:
+         pool = Pool(n_workers)
+
+     with tqdm(total=len(input_files)) as pbar:
+         for i,_ in enumerate(pool.imap_unordered(partial(zip_file,
+                 output_fn=None,overwrite=overwrite,verbose=verbose,compresslevel=compresslevel),
+                 input_files)):
+             pbar.update()
+
+
+ def parallel_zip_folders(input_folders, max_workers=16, use_threads=True,
+                          compresslevel=9, overwrite=False, verbose=False):
+     """
+     Zips one or more folders to separate output files in parallel, leaving the
+     original folders in place. Each folder is zipped to [folder_name].zip.
+
+     Args:
+         input_folders (list): list of folders to zip
+         max_workers (int, optional): number of concurrent workers, set to <= 1 to disable parallelism
+         use_threads (bool, optional): whether to use threads (True) or processes (False); ignored if
+             max_workers <= 1
+         compresslevel (int, optional): zip compression level between 0 and 9
+         overwrite (bool, optional): whether to overwrite existing .zip files
+         verbose (bool, optional): enable additional debug console output
+     """
+
+     n_workers = min(max_workers,len(input_folders))
+
+     if use_threads:
+         pool = ThreadPool(n_workers)
+     else:
+         pool = Pool(n_workers)
+
+     with tqdm(total=len(input_folders)) as pbar:
+         for i,_ in enumerate(pool.imap_unordered(
+                 partial(zip_folder,overwrite=overwrite,
+                         compresslevel=compresslevel,verbose=verbose),
+                 input_folders)):
+             pbar.update()
+
+
+ def zip_each_file_in_folder(folder_name,recursive=False,max_workers=16,use_threads=True,
+                             compresslevel=9,overwrite=False,required_token=None,verbose=False,
+                             exclude_zip=True):
+     """
+     Zips each file in [folder_name] to its own zipfile (filename.zip), optionally recursing. To
+     zip a whole folder into a single zipfile, use zip_folder().
+
+     Args:
+         folder_name (str): the folder within which we should zip files
+         recursive (bool, optional): whether to recurse within [folder_name]
+         max_workers (int, optional): number of concurrent workers, set to <= 1 to disable parallelism
+         use_threads (bool, optional): whether to use threads (True) or processes (False); ignored if
+             max_workers <= 1
+         compresslevel (int, optional): zip compression level between 0 and 9
+         overwrite (bool, optional): whether to overwrite existing .zip files
+         required_token (str, optional): only zip files whose names contain this string
+         verbose (bool, optional): enable additional debug console output
+         exclude_zip (bool, optional): skip files ending in .zip
+     """
+
+     assert os.path.isdir(folder_name), '{} is not a folder'.format(folder_name)
+
+     input_files = recursive_file_list(folder_name,recursive=recursive,return_relative_paths=False)
+
+     if required_token is not None:
+         input_files = [fn for fn in input_files if required_token in fn]
+
+     if exclude_zip:
+         input_files = [fn for fn in input_files if (not fn.endswith('.zip'))]
+
+     parallel_zip_files(input_files=input_files,max_workers=max_workers,
+                        use_threads=use_threads,compresslevel=compresslevel,
+                        overwrite=overwrite,verbose=verbose)
+
+
+ def unzip_file(input_file, output_folder=None):
+     """
+     Unzips a zipfile to the specified output folder, defaulting to the same location as
+     the input file.
+
+     Args:
+         input_file (str): zipfile to unzip
+         output_folder (str, optional): folder to which we should unzip [input_file], defaults
+             to unzipping to the folder where [input_file] lives
+     """
+
+     if output_folder is None:
+         output_folder = os.path.dirname(input_file)
+
+     with zipfile.ZipFile(input_file, 'r') as zf:
+         zf.extractall(output_folder)
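
# Round-trip sketch for the single-file zip helpers (hypothetical paths): zip
# one file, then extract it into a separate folder.
from megadetector.utils.path_utils import zip_file, unzip_file

archive = zip_file('/data/report.json')   # writes /data/report.json.zip
unzip_file(archive, '/data/extracted')    # restores report.json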