megadetector 10.0.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147) hide show
  1. megadetector/__init__.py +0 -0
  2. megadetector/api/__init__.py +0 -0
  3. megadetector/api/batch_processing/integration/digiKam/setup.py +6 -0
  4. megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +465 -0
  5. megadetector/api/batch_processing/integration/eMammal/test_scripts/config_template.py +5 -0
  6. megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +125 -0
  7. megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +55 -0
  8. megadetector/classification/__init__.py +0 -0
  9. megadetector/classification/aggregate_classifier_probs.py +108 -0
  10. megadetector/classification/analyze_failed_images.py +227 -0
  11. megadetector/classification/cache_batchapi_outputs.py +198 -0
  12. megadetector/classification/create_classification_dataset.py +626 -0
  13. megadetector/classification/crop_detections.py +516 -0
  14. megadetector/classification/csv_to_json.py +226 -0
  15. megadetector/classification/detect_and_crop.py +853 -0
  16. megadetector/classification/efficientnet/__init__.py +9 -0
  17. megadetector/classification/efficientnet/model.py +415 -0
  18. megadetector/classification/efficientnet/utils.py +608 -0
  19. megadetector/classification/evaluate_model.py +520 -0
  20. megadetector/classification/identify_mislabeled_candidates.py +152 -0
  21. megadetector/classification/json_to_azcopy_list.py +63 -0
  22. megadetector/classification/json_validator.py +696 -0
  23. megadetector/classification/map_classification_categories.py +276 -0
  24. megadetector/classification/merge_classification_detection_output.py +509 -0
  25. megadetector/classification/prepare_classification_script.py +194 -0
  26. megadetector/classification/prepare_classification_script_mc.py +228 -0
  27. megadetector/classification/run_classifier.py +287 -0
  28. megadetector/classification/save_mislabeled.py +110 -0
  29. megadetector/classification/train_classifier.py +827 -0
  30. megadetector/classification/train_classifier_tf.py +725 -0
  31. megadetector/classification/train_utils.py +323 -0
  32. megadetector/data_management/__init__.py +0 -0
  33. megadetector/data_management/animl_to_md.py +161 -0
  34. megadetector/data_management/annotations/__init__.py +0 -0
  35. megadetector/data_management/annotations/annotation_constants.py +33 -0
  36. megadetector/data_management/camtrap_dp_to_coco.py +270 -0
  37. megadetector/data_management/cct_json_utils.py +566 -0
  38. megadetector/data_management/cct_to_md.py +184 -0
  39. megadetector/data_management/cct_to_wi.py +293 -0
  40. megadetector/data_management/coco_to_labelme.py +284 -0
  41. megadetector/data_management/coco_to_yolo.py +701 -0
  42. megadetector/data_management/databases/__init__.py +0 -0
  43. megadetector/data_management/databases/add_width_and_height_to_db.py +107 -0
  44. megadetector/data_management/databases/combine_coco_camera_traps_files.py +210 -0
  45. megadetector/data_management/databases/integrity_check_json_db.py +563 -0
  46. megadetector/data_management/databases/subset_json_db.py +195 -0
  47. megadetector/data_management/generate_crops_from_cct.py +200 -0
  48. megadetector/data_management/get_image_sizes.py +164 -0
  49. megadetector/data_management/labelme_to_coco.py +559 -0
  50. megadetector/data_management/labelme_to_yolo.py +349 -0
  51. megadetector/data_management/lila/__init__.py +0 -0
  52. megadetector/data_management/lila/create_lila_blank_set.py +556 -0
  53. megadetector/data_management/lila/create_lila_test_set.py +192 -0
  54. megadetector/data_management/lila/create_links_to_md_results_files.py +106 -0
  55. megadetector/data_management/lila/download_lila_subset.py +182 -0
  56. megadetector/data_management/lila/generate_lila_per_image_labels.py +777 -0
  57. megadetector/data_management/lila/get_lila_annotation_counts.py +174 -0
  58. megadetector/data_management/lila/get_lila_image_counts.py +112 -0
  59. megadetector/data_management/lila/lila_common.py +319 -0
  60. megadetector/data_management/lila/test_lila_metadata_urls.py +164 -0
  61. megadetector/data_management/mewc_to_md.py +344 -0
  62. megadetector/data_management/ocr_tools.py +873 -0
  63. megadetector/data_management/read_exif.py +964 -0
  64. megadetector/data_management/remap_coco_categories.py +195 -0
  65. megadetector/data_management/remove_exif.py +156 -0
  66. megadetector/data_management/rename_images.py +194 -0
  67. megadetector/data_management/resize_coco_dataset.py +665 -0
  68. megadetector/data_management/speciesnet_to_md.py +41 -0
  69. megadetector/data_management/wi_download_csv_to_coco.py +247 -0
  70. megadetector/data_management/yolo_output_to_md_output.py +594 -0
  71. megadetector/data_management/yolo_to_coco.py +984 -0
  72. megadetector/data_management/zamba_to_md.py +188 -0
  73. megadetector/detection/__init__.py +0 -0
  74. megadetector/detection/change_detection.py +840 -0
  75. megadetector/detection/process_video.py +479 -0
  76. megadetector/detection/pytorch_detector.py +1451 -0
  77. megadetector/detection/run_detector.py +1267 -0
  78. megadetector/detection/run_detector_batch.py +2172 -0
  79. megadetector/detection/run_inference_with_yolov5_val.py +1314 -0
  80. megadetector/detection/run_md_and_speciesnet.py +1604 -0
  81. megadetector/detection/run_tiled_inference.py +1044 -0
  82. megadetector/detection/tf_detector.py +209 -0
  83. megadetector/detection/video_utils.py +1379 -0
  84. megadetector/postprocessing/__init__.py +0 -0
  85. megadetector/postprocessing/add_max_conf.py +72 -0
  86. megadetector/postprocessing/categorize_detections_by_size.py +166 -0
  87. megadetector/postprocessing/classification_postprocessing.py +1943 -0
  88. megadetector/postprocessing/combine_batch_outputs.py +249 -0
  89. megadetector/postprocessing/compare_batch_results.py +2110 -0
  90. megadetector/postprocessing/convert_output_format.py +403 -0
  91. megadetector/postprocessing/create_crop_folder.py +629 -0
  92. megadetector/postprocessing/detector_calibration.py +570 -0
  93. megadetector/postprocessing/generate_csv_report.py +522 -0
  94. megadetector/postprocessing/load_api_results.py +223 -0
  95. megadetector/postprocessing/md_to_coco.py +428 -0
  96. megadetector/postprocessing/md_to_labelme.py +351 -0
  97. megadetector/postprocessing/md_to_wi.py +41 -0
  98. megadetector/postprocessing/merge_detections.py +392 -0
  99. megadetector/postprocessing/postprocess_batch_results.py +2140 -0
  100. megadetector/postprocessing/remap_detection_categories.py +226 -0
  101. megadetector/postprocessing/render_detection_confusion_matrix.py +677 -0
  102. megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +206 -0
  103. megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +82 -0
  104. megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +1665 -0
  105. megadetector/postprocessing/separate_detections_into_folders.py +795 -0
  106. megadetector/postprocessing/subset_json_detector_output.py +964 -0
  107. megadetector/postprocessing/top_folders_to_bottom.py +238 -0
  108. megadetector/postprocessing/validate_batch_results.py +332 -0
  109. megadetector/taxonomy_mapping/__init__.py +0 -0
  110. megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +491 -0
  111. megadetector/taxonomy_mapping/map_new_lila_datasets.py +211 -0
  112. megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +165 -0
  113. megadetector/taxonomy_mapping/preview_lila_taxonomy.py +543 -0
  114. megadetector/taxonomy_mapping/retrieve_sample_image.py +71 -0
  115. megadetector/taxonomy_mapping/simple_image_download.py +231 -0
  116. megadetector/taxonomy_mapping/species_lookup.py +1008 -0
  117. megadetector/taxonomy_mapping/taxonomy_csv_checker.py +159 -0
  118. megadetector/taxonomy_mapping/taxonomy_graph.py +346 -0
  119. megadetector/taxonomy_mapping/validate_lila_category_mappings.py +83 -0
  120. megadetector/tests/__init__.py +0 -0
  121. megadetector/tests/test_nms_synthetic.py +335 -0
  122. megadetector/utils/__init__.py +0 -0
  123. megadetector/utils/ct_utils.py +1857 -0
  124. megadetector/utils/directory_listing.py +199 -0
  125. megadetector/utils/extract_frames_from_video.py +307 -0
  126. megadetector/utils/gpu_test.py +125 -0
  127. megadetector/utils/md_tests.py +2072 -0
  128. megadetector/utils/path_utils.py +2872 -0
  129. megadetector/utils/process_utils.py +172 -0
  130. megadetector/utils/split_locations_into_train_val.py +237 -0
  131. megadetector/utils/string_utils.py +234 -0
  132. megadetector/utils/url_utils.py +825 -0
  133. megadetector/utils/wi_platform_utils.py +968 -0
  134. megadetector/utils/wi_taxonomy_utils.py +1766 -0
  135. megadetector/utils/write_html_image_list.py +239 -0
  136. megadetector/visualization/__init__.py +0 -0
  137. megadetector/visualization/plot_utils.py +309 -0
  138. megadetector/visualization/render_images_with_thumbnails.py +243 -0
  139. megadetector/visualization/visualization_utils.py +1973 -0
  140. megadetector/visualization/visualize_db.py +630 -0
  141. megadetector/visualization/visualize_detector_output.py +498 -0
  142. megadetector/visualization/visualize_video_output.py +705 -0
  143. megadetector-10.0.15.dist-info/METADATA +115 -0
  144. megadetector-10.0.15.dist-info/RECORD +147 -0
  145. megadetector-10.0.15.dist-info/WHEEL +5 -0
  146. megadetector-10.0.15.dist-info/licenses/LICENSE +19 -0
  147. megadetector-10.0.15.dist-info/top_level.txt +1 -0
@@ -0,0 +1,2872 @@
1
+ """
2
+
3
+ path_utils.py
4
+
5
+ Miscellaneous useful utils for path manipulation, i.e. things that could *almost*
6
+ be in os.path, but aren't.
7
+
8
+ """
9
+
10
+ #%% Imports and constants
11
+
12
+ import glob
13
+ import ntpath
14
+ import os
15
+ import sys
16
+ import platform
17
+ import string
18
+ import json
19
+ import shutil
20
+ import hashlib
21
+ import unicodedata
22
+ import zipfile
23
+ import tarfile
24
+ import webbrowser
25
+ import subprocess
26
+ import re
27
+ import pytest
28
+
29
+ from zipfile import ZipFile
30
+ from datetime import datetime
31
+ from collections import defaultdict
32
+ from multiprocessing.pool import Pool, ThreadPool
33
+ from functools import partial
34
+ from shutil import which
35
+ from tqdm import tqdm
36
+
37
+ from megadetector.utils.ct_utils import is_iterable
38
+ from megadetector.utils.ct_utils import make_test_folder
39
+ from megadetector.utils.ct_utils import sort_dictionary_by_value
40
+ from megadetector.utils.ct_utils import environment_is_wsl
41
+
42
# Image extensions recognized by is_image_file(); should all be lower-case
IMG_EXTENSIONS = ('.jpg', '.jpeg', '.gif', '.png', '.tif', '.tiff', '.bmp')

# Characters allowed in a single filename component (no path separators)
VALID_FILENAME_CHARS = f"~-_.() {string.ascii_letters}{string.digits}"
# Characters that act as path separators (drive colon, backslash, forward slash)
SEPARATOR_CHARS = r":\/"
# Characters allowed in a full path: filename characters plus separators
VALID_PATH_CHARS = VALID_FILENAME_CHARS + SEPARATOR_CHARS
# Maximum length to which clean_filename()/clean_path() trim their output
CHAR_LIMIT = 255
49
+
50
+
51
+ #%% General path functions
52
+
53
def recursive_file_list(base_dir,
                        convert_slashes=True,
                        return_relative_paths=False,
                        sort_files=True,
                        recursive=True):
    r"""
    Enumerates files (not directories) in [base_dir].

    Args:
        base_dir (str): folder to enumerate
        convert_slashes (bool, optional): force forward slashes; if this is False, will use
            the native path separator
        return_relative_paths (bool, optional): return paths that are relative to [base_dir],
            rather than absolute paths
        sort_files (bool, optional): force files to be sorted, otherwise uses the sorting
            provided by os.walk()
        recursive (bool, optional): enumerate recursively

    Returns:
        list: list of filenames
    """

    assert os.path.isdir(base_dir), '{} is not a folder'.format(base_dir)

    if recursive:
        # os.walk only yields files, so no isfile() filtering is needed here
        found_files = [os.path.join(root, fname)
                       for root, _, fnames in os.walk(base_dir)
                       for fname in fnames]
    else:
        # listdir returns both files and folders; keep only the files
        candidates = (os.path.join(base_dir, name) for name in os.listdir(base_dir))
        found_files = [p for p in candidates if os.path.isfile(p)]

    if return_relative_paths:
        found_files = [os.path.relpath(p, base_dir) for p in found_files]

    if convert_slashes:
        found_files = [p.replace('\\', '/') for p in found_files]

    # Sorting happens after slash conversion, so sort order is based on
    # the returned (possibly slash-converted) strings
    return sorted(found_files) if sort_files else found_files
99
+
100
+
101
def file_list(base_dir,
              convert_slashes=True,
              return_relative_paths=False,
              sort_files=True,
              recursive=False):
    """
    Trivial wrapper for recursive_file_list, which was a poor function name choice
    at the time, since I later wanted to add non-recursive lists, but it doesn't
    make sense to have a "recursive" option in a function called "recursive_file_list".

    Args:
        base_dir (str): folder to enumerate
        convert_slashes (bool, optional): force forward slashes; if this is False, will use
            the native path separator
        return_relative_paths (bool, optional): return paths that are relative to [base_dir],
            rather than absolute paths
        sort_files (bool, optional): force files to be sorted, otherwise uses the sorting
            provided by os.walk()
        recursive (bool, optional): enumerate recursively

    Returns:
        list: list of filenames
    """

    # Delegate everything; the only difference from recursive_file_list is that
    # [recursive] defaults to False here.
    return recursive_file_list(base_dir,
                               convert_slashes=convert_slashes,
                               return_relative_paths=return_relative_paths,
                               sort_files=sort_files,
                               recursive=recursive)
127
+
128
+
129
def folder_list(base_dir,
                convert_slashes=True,
                return_relative_paths=False,
                sort_folders=True,
                recursive=False):
    """
    Enumerates folders (not files) in [base_dir].

    Args:
        base_dir (str): folder to enumerate
        convert_slashes (bool, optional): force forward slashes; if this is False, will use
            the native path separator
        return_relative_paths (bool, optional): return paths that are relative to [base_dir],
            rather than absolute paths
        sort_folders (bool, optional): force folders to be sorted, otherwise uses the sorting
            provided by os.walk()
        recursive (bool, optional): enumerate recursively

    Returns:
        list: list of folder names
    """

    assert os.path.isdir(base_dir), '{} is not a folder'.format(base_dir)

    if recursive:
        # os.walk's "dirs" entries are always folders; no isdir() filter needed
        found_folders = [os.path.join(root, d)
                         for root, dirs, _ in os.walk(base_dir)
                         for d in dirs]
    else:
        # listdir returns both files and folders; keep only the folders
        candidates = (os.path.join(base_dir, name) for name in os.listdir(base_dir))
        found_folders = [p for p in candidates if os.path.isdir(p)]

    if return_relative_paths:
        found_folders = [os.path.relpath(p, base_dir) for p in found_folders]

    if convert_slashes:
        found_folders = [p.replace('\\', '/') for p in found_folders]

    return sorted(found_folders) if sort_folders else found_folders
174
+
175
+
176
def folder_summary(folder,print_summary=True):
    """
    Returns (and optionally prints) a summary of [folder], including:

    * The total number of files
    * The total number of folders
    * The number of files for each extension

    Args:
        folder (str): folder to summarize
        print_summary (bool, optional): whether to print the summary

    Returns:
        dict: with fields "n_files", "n_folders", and "extension_to_count"
    """

    assert os.path.isdir(folder), '{} is not a folder'.format(folder)

    subfolders_relative = folder_list(folder,return_relative_paths=True,recursive=True)
    files_relative = file_list(folder,return_relative_paths=True,recursive=True)

    # Tally files by extension (extension includes the leading '.')
    extension_to_count = defaultdict(int)
    for fn in files_relative:
        extension_to_count[os.path.splitext(fn)[1]] += 1

    # Most common extensions first
    extension_to_count = sort_dictionary_by_value(extension_to_count,reverse=True)

    if print_summary:
        for ext, count in extension_to_count.items():
            print('{}: {}'.format(ext,count))
        print('')
        print('Total files: {}'.format(len(files_relative)))
        print('Total folders: {}'.format(len(subfolders_relative)))

    return {'n_files':len(files_relative),
            'n_folders':len(subfolders_relative),
            'extension_to_count':extension_to_count}
218
+
219
+
220
def fileparts(path):
    r"""
    Breaks down a path into the directory path, filename, and extension.

    Note that the '.' lives with the extension, and separators are removed.

    Examples:

    .. code-block:: none

        >>> fileparts('file')
        ('', 'file', '')
        >>> fileparts(r'c:/dir/file.jpg')
        ('c:/dir', 'file', '.jpg')
        >>> fileparts('/dir/subdir/file.jpg')
        ('/dir/subdir', 'file', '.jpg')

    Args:
        path (str): path name to separate into parts
    Returns:
        tuple: tuple containing (p,n,e):
            - p: str, directory path
            - n: str, filename without extension
            - e: str, extension including the '.'
    """

    # ntpath handles both Windows and Unix separators correctly,
    # regardless of the OS we're running on
    directory = ntpath.dirname(path)
    stem, extension = ntpath.splitext(ntpath.basename(path))
    return directory, stem, extension
251
+
252
+
253
def insert_before_extension(filename, s=None, separator='.'):
    """
    Insert string [s] before the extension in [filename], separated with [separator].

    If [s] is empty, generates a date/timestamp. If [filename] has no extension,
    appends [s].

    Examples:

    .. code-block:: none

        >>> insert_before_extension('/dir/subdir/file.ext', 'insert')
        '/dir/subdir/file.insert.ext'
        >>> insert_before_extension('/dir/subdir/file', 'insert')
        '/dir/subdir/file.insert'
        >>> insert_before_extension('/dir/subdir/file')
        '/dir/subdir/file.2020.07.20.10.54.38'

    Args:
        filename (str): filename to manipulate
        s (str, optional): string to insert before the extension in [filename], or
            None to insert a datestamp
        separator (str, optional): separator to place between the filename base
            and the inserted string

    Returns:
        str: modified string
    """

    assert len(filename) > 0

    token = s
    # Fall back to a timestamp when no insertion string is supplied
    if token is None or len(token) == 0:
        token = datetime.now().strftime('%Y.%m.%d.%H.%M.%S')

    base, extension = os.path.splitext(filename)
    return base + separator + token + extension
287
+
288
+
289
def split_path(path):
    r"""
    Splits [path] into all its constituent file/folder tokens.

    Examples:

    .. code-block:: none

        >>> split_path(r'c:\dir\subdir\file.txt')
        ['c:\\', 'dir', 'subdir', 'file.txt']
        >>> split_path('/dir/subdir/file.jpg')
        ['/', 'dir', 'subdir', 'file.jpg']
        >>> split_path('c:\\')
        ['c:\\']
        >>> split_path('/')
        ['/']

    Args:
        path (str): path to split into tokens

    Returns:
        list: list of path tokens
    """

    # Edge cases (note that the empty string returns '' rather than a
    # list, for historical compatibility)
    if path == '':
        return ''
    if path is None:
        return None

    tokens = []
    remainder = path

    # Peel one component off the end at a time; ntpath handles both
    # Windows and Unix separators correctly
    while True:
        head, tail = ntpath.split(remainder)
        # head == remainder happens at a root like '/' or 'c:\'
        if head == '' or head == remainder:
            break
        tokens.append(tail)
        remainder = head

    # Either the root (head) or, for a bare filename, the name itself (tail)
    tokens.append(head or tail)
    tokens.reverse()
    return tokens
329
+
330
+
331
def path_is_abs(p):
    """
    Determines whether [p] is an absolute path. An absolute path is defined as
    one that starts with slash, backslash, or a letter followed by a colon.

    Args:
        p (str): path to evaluate

    Returns:
        bool: True if [p] is an absolute path, else False
    """

    if p is None or len(p) == 0:
        return False

    # A leading separator ('/unix/path', '\\server\share') is absolute, as is
    # a drive-letter prefix ('c:\dir', 'c:/dir').
    #
    # The previous implementation required len(p) > 1, which misclassified the
    # root paths '/' and '\' as relative.
    return p[0] in ('/', '\\') or (len(p) > 1 and p[1] == ':')
344
+
345
+
346
def safe_create_link(link_exists,link_new):
    """
    Creates a symlink at [link_new] pointing to [link_exists].

    If [link_new] already exists, make sure it's a link (not a file),
    and if it has a different target than [link_exists], removes and re-creates
    it.

    Creates a *real* directory if necessary.

    Errors if [link_new] already exists but it's not a link.

    Args:
        link_exists (str): the source of the (possibly-new) symlink
        link_new (str): the target of the (possibly-new) symlink
    """

    # islink() is checked separately because exists() is False for broken links
    already_present = os.path.exists(link_new) or os.path.islink(link_new)

    if already_present:
        # Error if whatever is at [link_new] is not a link
        assert os.path.islink(link_new)
        # If the existing link already points to [link_exists], there's
        # nothing to do; otherwise re-point it
        if os.readlink(link_new) != link_exists:
            os.remove(link_new)
            os.symlink(link_exists,link_new)
        return

    # Make sure the folder that will contain the link exists
    # (dirname is '' for a bare filename)
    containing_dir = os.path.dirname(link_new)
    if len(containing_dir) > 0:
        os.makedirs(containing_dir,exist_ok=True)
    os.symlink(link_exists,link_new)

# ...def safe_create_link(...)
379
+
380
+
381
def remove_empty_folders(path, remove_root=False):
    """
    Recursively removes empty folders within the specified path.

    Args:
        path (str): the folder from which we should recursively remove
            empty folders.
        remove_root (bool, optional): whether to remove the root directory if
            it's empty after removing all empty subdirectories. This will always
            be True during recursive calls.

    Returns:
        bool: True if the directory is empty after processing, False otherwise
    """

    # Non-folders are never "empty folders"
    if not os.path.isdir(path):
        return False

    empty_so_far = True

    for entry in os.listdir(path):

        entry_path = os.path.join(path, entry)

        if os.path.isdir(entry_path):
            # Recurse; if the child couldn't be fully emptied (and removed),
            # this folder isn't empty either
            if not remove_empty_folders(entry_path, True):
                empty_so_far = False
        else:
            # Any file means this folder is non-empty
            empty_so_far = False

    if empty_so_far and remove_root:
        try:
            os.rmdir(path)
        except Exception as e:
            # Best-effort removal; report but don't propagate failures
            print('Error removing directory {}: {}'.format(path,str(e)))
            empty_so_far = False

    return empty_so_far

# ...def remove_empty_folders(...)
429
+
430
+
431
def path_join(*paths, convert_slashes=True):
    r"""
    Wrapper for os.path.join that optionally converts backslashes to forward slashes.

    Args:
        *paths (variable-length set of strings): Path components to be joined.
        convert_slashes (bool, optional): whether to convert \\ to /

    Returns:
        A string with the joined path components.
    """

    result = os.path.join(*paths)
    return result.replace('\\', '/') if convert_slashes else result
448
+
449
+
450
# The skip marker keeps pytest from collecting this function as a unit test
# (its name starts with "test_"); it's a library utility, not a test.
@pytest.mark.skip(reason="This is not a test function")
def test_file_write(fn, overwrite=True):
    """
    Writes an empty file to [fn], used to test that we have
    appropriate permissions. If [fn] exists and overwrite is False,
    this function errors. Creates the directory containing [fn] if
    necessary. Does not delete the test file.

    Args:
        fn (str): the filename to which we should perform a test write
        overwrite (bool, optional): if [fn] exists, whether we should
            overwrite (True) or error (False)

    Returns:
        bool: currently always returns True or errors

    Raises:
        ValueError: if [fn] exists and [overwrite] is False, or if [fn]
            is an existing directory
    """

    # NOTE(review): the error messages below say 'test_write_file', but the
    # function is named 'test_file_write' — looks like a copy/paste slip.
    if os.path.isfile(fn) and (not overwrite):
        raise ValueError(
            'test_write_file: target file {} exists'.format(fn))
    if os.path.isdir(fn):
        raise ValueError(
            'test_write_file: target file {} is a directory'.format(fn))

    # Make sure the containing folder exists (dirname is '' for a bare filename)
    target_dir = os.path.dirname(fn)
    if len(target_dir) > 0:
        os.makedirs(target_dir,exist_ok=True)

    # Create an empty file at the destination "fn"
    with open(fn, 'w') as f:
        f.write('')

    return True
483
+
484
+
485
+ #%% Image-related path functions
486
+
487
def is_image_file(s, img_extensions=IMG_EXTENSIONS):
    """
    Checks a file's extension against a hard-coded set of image file
    extensions. Uses case-insensitive comparison.

    Does not check whether the file exists, only determines whether the filename
    implies it's an image file.

    Args:
        s (str): filename to evaluate for image-ness
        img_extensions (list, optional): list of known image file extensions

    Returns:
        bool: True if [s] appears to be an image file, else False
    """

    # splitext's extension includes the leading '.', matching IMG_EXTENSIONS
    extension = os.path.splitext(s)[1].lower()
    return extension in img_extensions
505
+
506
+
507
def find_image_strings(strings):
    """
    Given a list of strings that are potentially image file names, looks for
    strings that actually look like image file names (based on extension).

    Args:
        strings (list): list of filenames to check for image-ness

    Returns:
        list: the subset of [strings] that appear to be image filenames
    """

    return list(filter(is_image_file, strings))
520
+
521
+
522
def find_images(dirname,
                recursive=False,
                return_relative_paths=False,
                convert_slashes=True):
    """
    Finds all files in a directory that look like image file names. Returns
    absolute paths unless return_relative_paths is set. Uses the OS-native
    path separator unless convert_slashes is set, in which case will always
    use '/'.

    Args:
        dirname (str): the folder to search for images
        recursive (bool, optional): whether to search recursively
        return_relative_paths (str, optional): return paths that are relative
            to [dirname], rather than absolute paths
        convert_slashes (bool, optional): force forward slashes in return values

    Returns:
        list: list of image filenames found in [dirname]
    """

    assert os.path.isdir(dirname), '{} is not a folder'.format(dirname)

    # Only files with a '.' in the name can match; that's fine, since we
    # are only looking for files with known image extensions anyway
    if recursive:
        pattern = os.path.join(dirname, '**', '*.*')
    else:
        pattern = os.path.join(dirname, '*.*')
    candidates = glob.glob(pattern, recursive=recursive)

    image_files = find_image_strings(candidates)

    if return_relative_paths:
        image_files = [os.path.relpath(fn,dirname) for fn in image_files]

    # Note: sorting happens before slash conversion here
    image_files = sorted(image_files)

    if convert_slashes:
        image_files = [fn.replace('\\', '/') for fn in image_files]

    return image_files
561
+
562
+
563
+ #%% Filename cleaning functions
564
+
565
def clean_filename(filename,
                   allow_list=VALID_FILENAME_CHARS,
                   char_limit=CHAR_LIMIT,
                   force_lower=False,
                   remove_trailing_leading_whitespace=True):
    r"""
    Removes non-ASCII and other invalid filename characters (on any
    reasonable OS) from a filename, then optionally trims to a maximum length.

    Does not allow :\/ by default, use clean_path if you want to preserve those.

    Adapted from
    https://gist.github.com/wassname/1393c4a57cfcbf03641dbc31886123b8

    Args:
        filename (str): filename to clean
        allow_list (str, optional): string containing all allowable filename characters
        char_limit (int, optional): maximum allowable filename length, if None will skip this
            step
        force_lower (bool, optional): convert the resulting filename to lowercase
        remove_trailing_leading_whitespace (bool, optional): remove trailing and
            leading whitespace from each component of a path, e.g. does not allow
            a/b/c /d.jpg
    Returns:
        str: cleaned version of [filename]
    """

    if remove_trailing_leading_whitespace:

        # Best effort to preserve the original separator style: treat the
        # path as backslash-separated if any backslash is present
        sep = '\\' if '\\' in filename else '/'

        # Normalize to forward slashes, strip whitespace from each component,
        # then re-join using the original separator
        stripped_components = [c.strip() for c in filename.replace('\\','/').split('/')]
        filename = sep.join(stripped_components)

    # Keep only valid ASCII chars (NFKD transliterates accents etc. where possible)
    ascii_only = (unicodedata.normalize('NFKD', filename)
                  .encode('ASCII', 'ignore').decode())

    # Keep only allow-listed chars
    result = ''.join(c for c in ascii_only if c in allow_list)
    if char_limit is not None:
        result = result[:char_limit]
    if force_lower:
        result = result.lower()
    return result
617
+
618
+
619
def clean_path(pathname,
               allow_list=VALID_PATH_CHARS,
               char_limit=CHAR_LIMIT,
               force_lower=False,
               remove_trailing_leading_whitespace=True):
    """
    Removes non-ASCII and other invalid path characters (on any reasonable
    OS) from a path, then optionally trims to a maximum length.

    Args:
        pathname (str): path name to clean
        allow_list (str, optional): string containing all allowable filename characters
        char_limit (int, optional): maximum allowable filename length, if None will skip this
            step
        force_lower (bool, optional): convert the resulting filename to lowercase
        remove_trailing_leading_whitespace (bool, optional): remove trailing and
            leading whitespace from each component of a path, e.g. does not allow
            a/b/c /d.jpg

    Returns:
        str: cleaned version of [filename]
    """

    # Same as clean_filename, but with an allow-list that includes separators
    return clean_filename(pathname,
                          allow_list=allow_list,
                          char_limit=char_limit,
                          force_lower=force_lower,
                          remove_trailing_leading_whitespace=remove_trailing_leading_whitespace)
648
+
649
+
650
def flatten_path(pathname,separator_chars=SEPARATOR_CHARS,separator_char_replacement='~'):
    r"""
    Removes non-ASCII and other invalid path characters (on any reasonable
    OS) from a path, then trims to a maximum length. Replaces all valid
    separators with [separator_char_replacement.]

    Args:
        pathname (str): path name to flatten
        separator_chars (str, optional): string containing all known path separators
        separator_char_replacement (str, optional): string to insert in place of
            path separators.

    Returns:
        str: flattened version of [pathname]
    """

    flattened = clean_path(pathname)
    # Replace each separator character in turn with the replacement token
    for sep in separator_chars:
        flattened = flattened.replace(sep, separator_char_replacement)
    return flattened
670
+
671
+
672
def is_executable(filename):
    """
    Checks whether [filename] is on the system path and marked as executable.

    Args:
        filename (str): filename to check for executable status

    Returns:
        bool: True if [filename] is on the system path and marked as executable, otherwise False
    """

    # https://stackoverflow.com/questions/11210104/check-if-a-program-exists-from-a-python-script
    resolved = which(filename)
    return resolved is not None
686
+
687
+
688
+ #%% WSL utilities
689
+
690
def wsl_path_to_windows_path(filename, failure_behavior='none'):
    r"""
    Converts a WSL path to a Windows path.  For example, converts:

    /mnt/e/a/b/c

    ...to:

    e:\a\b\c

    Args:
        filename (str): filename to convert
        failure_behavior (str, optional): what to do if the path can't be processed as a
            WSL path. 'none' to return None in this case, 'original' to return the original path.

    Returns:
        str: Windows equivalent to the WSL path [filename]
    """

    assert failure_behavior in ('none','original'), \
        'Unrecognized failure_behavior value {}'.format(failure_behavior)

    # Standard WSL drive mounts look like /mnt/<drive letter>[/rest-of-path]
    m = re.match(r'^/mnt/([a-zA-Z])(/.*)?$', filename)

    if m is None:
        # Not a recognizable WSL mount path
        return None if (failure_behavior == 'none') else filename

    drive_letter = m.group(1)
    remainder = m.group(2) or ''

    # Windows uses backslashes as path separators
    return '{}:{}'.format(drive_letter, remainder.replace('/', '\\'))

# ...def wsl_path_to_windows_path(...)
735
+
736
+
737
def windows_path_to_wsl_path(filename, failure_behavior='none'):
    r"""
    Converts a Windows path to a WSL path, or returns None if that's not possible.  E.g.
    converts:

    e:\a\b\c

    ...to:

    /mnt/e/a/b/c

    Args:
        filename (str): filename to convert
        failure_behavior (str, optional): what to do if the path can't be processed as a Windows path.
            'none' to return None in this case, 'original' to return the original path.

    Returns:
        str: WSL equivalent to the Windows path [filename]
    """

    assert failure_behavior in ('none','original'), \
        'Unrecognized failure_behavior value {}'.format(failure_behavior)

    # Normalize to forward slashes first
    normalized = filename.replace('\\', '/')

    # Windows absolute paths start with a drive letter and a colon
    m = re.match(r'^([a-zA-Z]):(/.*)?$', normalized)

    if m is None:
        # 'original' mode returns the slash-normalized path, matching historical behavior
        return None if (failure_behavior == 'none') else normalized

    # WSL mounts use lowercase drive letters
    drive_letter = m.group(1).lower()
    remainder = m.group(2) or ''

    return '/mnt/{}{}'.format(drive_letter, remainder)

# ...def windows_path_to_wsl_path(...)
781
+
782
+
783
+ #%% Platform-independent file openers
784
+
785
def open_file_in_chrome(filename):
    """
    Open a file in chrome, regardless of file type. I typically use this to open
    .md files in Chrome.

    Tries a cascade of strategies (well-known install locations, registry lookups
    on Windows, PATH commands, OS protocol handlers) and stops at the first one
    that appears to launch; exit codes of the launched processes are not checked,
    so "True" means "a launcher was invoked", not "the file rendered".

    Args:
        filename (str): file to open

    Return:
        bool: whether the operation was successful
    """

    # Create URL
    abs_path = os.path.abspath(filename)

    system = platform.system()
    if system == 'Windows':
        # file:/// URLs on Windows need forward slashes
        url = f'file:///{abs_path.replace(os.sep, "/")}'
    else: # macOS and Linux
        url = f'file://{abs_path}'

    # Determine the Chrome path
    if system == 'Windows':

        # This is a native Python module, but it only exists on Windows
        import winreg

        chrome_paths = [
            os.path.expanduser("~") + r"\AppData\Local\Google\Chrome\Application\chrome.exe",
            r"C:\Program Files\Google\Chrome\Application\chrome.exe",
            r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe"
        ]

        # Default approach: run from a typical chrome location
        for path in chrome_paths:
            if os.path.exists(path):
                subprocess.run([path, url])
                return True

        # Method 2: Check registry for Chrome path
        try:
            with winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE,
                                r"SOFTWARE\Microsoft\Windows\CurrentVersion\App Paths\chrome.exe") as key:
                chrome_path = winreg.QueryValue(key, None)
                if chrome_path and os.path.exists(chrome_path):
                    subprocess.run([chrome_path, url])
                    return True
        except Exception:
            pass

        # Method 3: Try alternate registry location
        #
        # The BLBeacon key stores Chrome's version string; chrome.exe is assumed
        # to live next to the versioned folder.
        try:
            with winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                                r"Software\Google\Chrome\BLBeacon") as key:
                chrome_path = os.path.join(os.path.dirname(winreg.QueryValueEx(key, "version")[0]), "chrome.exe")
                if os.path.exists(chrome_path):
                    subprocess.run([chrome_path, url])
                    return True
        except Exception:
            pass

        # Method 4: Try system path or command
        #
        # NOTE(review): passing a list together with shell=True is only
        # well-defined on Windows (the list is joined via list2cmdline), and
        # subprocess.run only raises here if the shell itself can't launch, so
        # this may return True even when the command wasn't found -- confirm.
        for chrome_cmd in ["chrome", "chrome.exe", "googlechrome", "google-chrome"]:
            try:
                subprocess.run([chrome_cmd, url], shell=True)
                return True
            except Exception:
                continue

        # Method 5: Use Windows URL protocol handler
        #
        # This opens the URL in the *default* browser, which may not be Chrome.
        try:
            os.startfile(url)
            return True
        except Exception:
            pass

        # Method 6: Use rundll32
        try:
            cmd = f'rundll32 url.dll,FileProtocolHandler {url}'
            subprocess.run(cmd, shell=True)
            return True
        except Exception:
            pass

    elif system == 'Darwin':

        chrome_paths = [
            '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
            os.path.expanduser('~/Applications/Google Chrome.app/Contents/MacOS/Google Chrome')
        ]

        for path in chrome_paths:
            if os.path.exists(path):
                subprocess.run([path, url])
                return True

        # Fallback to 'open' command with Chrome as the app
        try:
            subprocess.run(['open', '-a', 'Google Chrome', url])
            return True
        except Exception:
            pass

    elif system == 'Linux':

        chrome_commands = ['google-chrome', 'chrome', 'chromium', 'chromium-browser']

        # A missing command raises FileNotFoundError (caught below); a command
        # that exists but fails still returns True, since exit codes aren't checked.
        for cmd in chrome_commands:
            try:
                subprocess.run([cmd, url], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
                return True
            except Exception:
                continue

    # Every strategy failed, or we're on an unrecognized OS.
    #
    # NOTE(review): this message doesn't include the filename; it looks like a
    # format placeholder went missing -- confirm against upstream.
    print(f"Could not open (unknown) in Chrome on {system}.")
    return False
901
+
902
+
903
def open_file(filename,
              attempt_to_open_in_wsl_host=False,
              browser_name=None):
    """
    Opens [filename] in the default OS file handler for this file type.

    If browser_name is not None, uses the webbrowser module to open the filename
    in the specified browser; see https://docs.python.org/3/library/webbrowser.html
    for supported browsers.  Falls back to the default file handler if webbrowser.open()
    fails.  In this case, attempt_to_open_in_wsl_host is ignored unless webbrowser.open() fails.

    If browser_name is 'default', uses the system default.  This is different from the
    parameter to webbrowser.get(), where None implies the system default.

    Args:
        filename (str): file to open
        attempt_to_open_in_wsl_host (bool, optional): if this is True, and we're in WSL, attempts
            to open [filename] in the Windows host environment
        browser_name (str, optional): see above
    """

    if browser_name is not None:
        # webbrowser registers Chrome under the name 'google-chrome'
        if browser_name == 'chrome':
            browser_name = 'google-chrome'
        elif browser_name == 'default':
            # For webbrowser.get(), None means "system default"
            browser_name = None
        try:
            result = webbrowser.get(using=browser_name).open(filename)
        except Exception:
            result = False
        if result:
            return

    if sys.platform == 'win32':

        os.startfile(filename)

    elif sys.platform == 'darwin':

        subprocess.call(['open', filename])

    elif attempt_to_open_in_wsl_host and environment_is_wsl():

        windows_path = wsl_path_to_windows_path(filename)

        if windows_path is None:
            # This isn't a path the Windows host can see; fall back to xdg-open
            # within WSL.  Previously we fell through after this fallback and
            # also invoked explorer.exe/cmd.exe with a None path.
            subprocess.call(['xdg-open', filename])
        elif os.path.isdir(filename):
            subprocess.run(["explorer.exe", windows_path])
        else:
            # Quote the path for cmd.exe.  re.escape() (used previously) is for
            # regular expressions, not shell arguments, and mangles backslashes.
            # The empty "" is the window title expected by "start".
            os.system('cmd.exe /C start "" "{}"'.format(windows_path))

    else:

        subprocess.call(['xdg-open', filename])

# ...def open_file(...)
964
+
965
+
966
+ #%% File list functions (as in, files that are lists of other filenames)
967
+
968
def write_list_to_file(output_file,strings):
    """
    Writes a list of strings to a file: JSON if [output_file] ends in ".json",
    otherwise newline-delimited text.

    Args:
        output_file (str): file to write
        strings (list): list of strings to write to [output_file]
    """

    write_as_json = output_file.endswith('.json')

    with open(output_file, 'w') as f:
        if write_as_json:
            json.dump(strings, f, indent=1)
        else:
            f.write('\n'.join(strings))
983
+
984
+
985
def read_list_from_file(filename):
    """
    Reads a json-formatted list of strings from a file.

    Args:
        filename (str): .json filename to read

    Returns:
        list: list of strings read from [filename]
    """

    assert filename.endswith('.json')

    with open(filename, 'r') as f:
        items = json.load(f)

    # Validate that this is really a flat list of strings
    assert isinstance(items, list)
    assert all(isinstance(item, str) for item in items)

    return items
1003
+
1004
+
1005
+ #%% File copying functions
1006
+
1007
+ def _copy_file(input_output_tuple,overwrite=True,verbose=False,move=False):
1008
+ """
1009
+ Internal function for copying files from within parallel_copy_files.
1010
+ """
1011
+
1012
+ assert len(input_output_tuple) == 2
1013
+ source_fn = input_output_tuple[0]
1014
+ target_fn = input_output_tuple[1]
1015
+ if (not overwrite) and (os.path.isfile(target_fn)):
1016
+ if verbose:
1017
+ print('Skipping existing target file {}'.format(target_fn))
1018
+ return
1019
+
1020
+ if move:
1021
+ action_string = 'Moving'
1022
+ else:
1023
+ action_string = 'Copying'
1024
+
1025
+ if verbose:
1026
+ print('{} to {}'.format(action_string,target_fn))
1027
+
1028
+ target_dir = os.path.dirname(target_fn)
1029
+ if len(target_dir) > 0:
1030
+ os.makedirs(target_dir,exist_ok=True)
1031
+ if move:
1032
+ shutil.move(source_fn, target_fn)
1033
+ else:
1034
+ shutil.copyfile(source_fn,target_fn)
1035
+
1036
+
1037
def parallel_copy_files(input_file_to_output_file,
                        max_workers=16,
                        use_threads=True,
                        overwrite=False,
                        verbose=False,
                        move=False):
    """
    Copy (or move) files from source to target according to the dict input_file_to_output_file.

    Args:
        input_file_to_output_file (dict): dictionary mapping source files to the target files
            to which they should be copied
        max_workers (int, optional): number of concurrent workers; set to <=1 to disable parallelism
        use_threads (bool, optional): whether to use threads (True) or processes (False) for
            parallel copying; ignored if max_workers <= 1
        overwrite (bool, optional): whether to overwrite existing destination files
        verbose (bool, optional): enable additional debug output
        move (bool, optional): move instead of copying
    """

    if not input_file_to_output_file:
        print('Warning: parallel_copy_files called with an empty copy list')
        return

    n_workers = min(max_workers,len(input_file_to_output_file))

    # Flatten the source-to-target mapping into (source,target) pairs
    input_output_tuples = list(input_file_to_output_file.items())

    worker = partial(_copy_file,overwrite=overwrite,verbose=verbose,move=move)

    pool = None

    try:
        pool = ThreadPool(n_workers) if use_threads else Pool(n_workers)
        with tqdm(total=len(input_output_tuples)) as pbar:
            for _ in pool.imap_unordered(worker,input_output_tuples):
                pbar.update()
    finally:
        # Always release pool resources, even if a copy fails
        if pool is not None:
            pool.close()
            pool.join()
            if verbose:
                print('Pool closed and joined for parallel file copying')

# ...def parallel_copy_files(...)
1091
+
1092
+
1093
+ #%% File deletion functions
1094
+
1095
def delete_file(input_file, verbose=False):
    """
    Deletes a single file.

    Args:
        input_file (str): file to delete
        verbose (bool, optional): enable additional debug console output

    Returns:
        bool: True if file was deleted successfully, False otherwise
    """

    try:
        if verbose:
            print('Deleting file {}'.format(input_file))

        # A non-existent file is a "failed" deletion, not an error
        if not os.path.isfile(input_file):
            if verbose:
                print('File {} does not exist'.format(input_file))
            return False

        os.remove(input_file)
        return True

    except Exception as e:
        if verbose:
            print('Error deleting file {}: {}'.format(input_file, str(e)))
        return False

# ...def delete_file(...)
1125
+
1126
+
1127
def parallel_delete_files(input_files,
                          max_workers=16,
                          use_threads=True,
                          verbose=False):
    """
    Deletes one or more files in parallel.

    Args:
        input_files (list): list of files to delete
        max_workers (int, optional): number of concurrent workers, set to <= 1 to disable parallelism
        use_threads (bool, optional): whether to use threads (True) or processes (False); ignored if
            max_workers <= 1
        verbose (bool, optional): enable additional debug console output
    """

    if not input_files:
        return

    n_workers = min(max_workers, len(input_files))

    worker = partial(delete_file, verbose=verbose)

    pool = None

    try:
        pool = ThreadPool(n_workers) if use_threads else Pool(n_workers)
        with tqdm(total=len(input_files)) as pbar:
            for _ in pool.imap_unordered(worker, input_files):
                pbar.update()
    finally:
        # Always release pool resources, even if a deletion fails
        if pool is not None:
            pool.close()
            pool.join()
            if verbose:
                print('Pool closed and joined for file deletion')

# ...def parallel_delete_files(...)
1167
+
1168
+
1169
+ #%% File size functions
1170
+
1171
def get_file_sizes(base_dir, convert_slashes=True):
    """
    Gets sizes recursively for all files in base_dir, returning a dict mapping
    relative filenames to size.

    TODO: merge the functionality here with parallel_get_file_sizes, which uses slightly
    different semantics.

    Args:
        base_dir (str): folder within which we want all file sizes
        convert_slashes (bool, optional): force forward slashes in return strings,
            otherwise uses the native path separator

    Returns:
        dict: dictionary mapping filenames to file sizes in bytes
    """

    relative_paths = recursive_file_list(base_dir,
                                         convert_slashes=convert_slashes,
                                         return_relative_paths=True)

    # Keys are the relative paths; sizes are read from the absolute paths
    return {fn: os.path.getsize(os.path.join(base_dir, fn)) \
            for fn in tqdm(relative_paths)}
1197
+
1198
+
1199
+ def _get_file_size(filename,verbose=False):
1200
+ """
1201
+ Internal function for safely getting the size of a file. Returns a (filename,size)
1202
+ tuple, where size is None if there is an error.
1203
+ """
1204
+
1205
+ try:
1206
+ size = os.path.getsize(filename)
1207
+ except Exception as e:
1208
+ if verbose:
1209
+ print('Error reading file size for {}: {}'.format(filename,str(e)))
1210
+ size = None
1211
+ return (filename,size)
1212
+
1213
+
1214
def parallel_get_file_sizes(filenames,
                            max_workers=16,
                            use_threads=True,
                            verbose=False,
                            recursive=True,
                            convert_slashes=True,
                            return_relative_paths=False):
    """
    Returns a dictionary mapping every file in [filenames] to the corresponding file size,
    or None for errors.  If [filenames] is a folder, will enumerate the folder (optionally
    recursively).

    Args:
        filenames (list or str): list of filenames for which we should read sizes, or a folder
            within which we should read all file sizes recursively
        max_workers (int, optional): number of concurrent workers; set to <=1 to disable parallelism
        use_threads (bool, optional): whether to use threads (True) or processes (False) for
            parallel copying; ignored if max_workers <= 1
        verbose (bool, optional): enable additional debug output
        recursive (bool, optional): enumerate recursively, only relevant if [filenames] is a folder.
        convert_slashes (bool, optional): convert backslashes to forward slashes
        return_relative_paths (bool, optional): return relative paths; only relevant if [filenames]
            is a folder.

    Returns:
        dict: dictionary mapping filenames to file sizes in bytes (None for unreadable files)
    """

    folder_name = None

    if isinstance(filenames,str):

        folder_name = filenames
        assert os.path.isdir(filenames), 'Could not find folder {}'.format(folder_name)

        if verbose:
            print('Enumerating files in {}'.format(folder_name))

        # Enumerate absolute paths here, we'll convert to relative later if requested
        filenames = recursive_file_list(folder_name,recursive=recursive,return_relative_paths=False)

    else:

        assert is_iterable(filenames), '[filenames] argument is neither a folder nor an iterable'

        # Materialize generators/iterators so len() below is well-defined
        filenames = list(filenames)

    # Creating a pool with zero workers raises an error, and there's nothing to do anyway
    if len(filenames) == 0:
        return {}

    n_workers = min(max_workers,len(filenames))

    if verbose:
        print('Creating worker pool')

    pool = None

    try:

        if use_threads:
            pool_string = 'thread'
            pool = ThreadPool(n_workers)
        else:
            pool_string = 'process'
            pool = Pool(n_workers)

        if verbose:
            print('Created a {} pool of {} workers'.format(
                pool_string,n_workers))

        # This returns (filename,size) tuples
        get_size_results = list(tqdm(pool.imap(
            partial(_get_file_size,verbose=verbose),filenames), total=len(filenames)))

    finally:

        # Always release pool resources, even on failure
        if pool is not None:
            pool.close()
            pool.join()
            if verbose:
                print('Pool closed and joined for file size collection')

    # Assemble the output dict, optionally relativizing and slash-normalizing keys
    to_return = {}
    for r in get_size_results:
        fn = r[0]
        if return_relative_paths and (folder_name is not None):
            fn = os.path.relpath(fn,folder_name)
        if convert_slashes:
            fn = fn.replace('\\','/')
        size = r[1]
        to_return[fn] = size

    return to_return

# ...def parallel_get_file_sizes(...)
1303
+
1304
+
1305
+ #%% Compression (zip/tar) functions
1306
+
1307
def zip_file(input_fn, output_fn=None, overwrite=False, verbose=False, compress_level=9):
    """
    Zips a single file.

    Args:
        input_fn (str): file to zip
        output_fn (str, optional): target zipfile; if this is None, we'll use
            [input_fn].zip
        overwrite (bool, optional): whether to overwrite an existing target file
        verbose (bool, optional): enable existing debug console output
        compress_level (int, optional): compression level to use, between 0 and 9

    Returns:
        str: the output zipfile, whether we created it or determined that it already exists
    """

    if output_fn is None:
        output_fn = input_fn + '.zip'

    if os.path.isfile(output_fn) and (not overwrite):
        print('Skipping existing file {}'.format(output_fn))
        return output_fn

    if verbose:
        print('Zipping {} to {} with level {}'.format(input_fn,output_fn,compress_level))

    # Store the file under its basename within the archive
    arcname = os.path.basename(input_fn)

    with ZipFile(output_fn,'w',zipfile.ZIP_DEFLATED) as zipf:
        zipf.write(input_fn,
                   arcname=arcname,
                   compresslevel=compress_level,
                   compress_type=zipfile.ZIP_DEFLATED)

    return output_fn

# ...def zip_file(...)
1344
+
1345
+
1346
def add_files_to_single_tar_file(input_files, output_fn, arc_name_base,
                                 overwrite=False, verbose=False, mode='x'):
    """
    Adds all the files in [input_files] to the tar file [output_fn].
    Archive names are relative to arc_name_base.

    Args:
        input_files (list): list of absolute filenames to include in the .tar file
        output_fn (str): .tar file to create
        arc_name_base (str): absolute folder from which relative paths should be determined;
            behavior is undefined if there are files in [input_files] that don't live within
            [arc_name_base]
        overwrite (bool, optional): whether to overwrite an existing .tar file
        verbose (bool, optional): enable additional debug console output
        mode (str, optional): compression type, can be 'x' (no compression), 'x:gz', or 'x:bz2'.

    Returns:
        str: the output tar file, whether we created it or determined that it already exists
    """

    if os.path.isfile(output_fn):
        if not overwrite:
            print('Tar file {} exists, skipping'.format(output_fn))
            return output_fn
        # 'x' modes refuse to open existing files, so remove before re-creating
        print('Tar file {} exists, deleting and re-creating'.format(output_fn))
        os.remove(output_fn)

    if verbose:
        print('Adding {} files to {} (mode {})'.format(
            len(input_files),output_fn,mode))

    with tarfile.open(output_fn,mode) as tarf:
        for fn_abs in tqdm(input_files,disable=(not verbose)):
            tarf.add(fn_abs,arcname=os.path.relpath(fn_abs,arc_name_base))

    return output_fn

# ...def add_files_to_single_tar_file(...)
1386
+
1387
+
1388
def zip_files_into_single_zipfile(input_files,
                                  output_fn,
                                  arc_name_base,
                                  overwrite=False,
                                  verbose=False,
                                  compress_level=9):
    """
    Zip all the files in [input_files] into [output_fn].  Archive names are relative to
    arc_name_base.

    Args:
        input_files (list): list of absolute filenames to include in the zipfile
        output_fn (str): zipfile to create
        arc_name_base (str): absolute folder from which relative paths should be determined;
            behavior is undefined if there are files in [input_files] that don't live within
            [arc_name_base]
        overwrite (bool, optional): whether to overwrite an existing zipfile
        verbose (bool, optional): enable additional debug console output
        compress_level (int, optional): compression level to use, between 0 and 9

    Returns:
        str: the output zipfile, whether we created it or determined that it already exists
    """

    if (not overwrite) and os.path.isfile(output_fn):
        print('Zip file {} exists, skipping'.format(output_fn))
        return output_fn

    if verbose:
        print('Zipping {} files to {} (compression level {})'.format(
            len(input_files),output_fn,compress_level))

    with ZipFile(output_fn,'w',zipfile.ZIP_DEFLATED) as zipf:
        for fn_abs in tqdm(input_files,disable=(not verbose)):
            # Archive members are stored relative to arc_name_base
            zipf.write(fn_abs,
                       arcname=os.path.relpath(fn_abs,arc_name_base),
                       compresslevel=compress_level,
                       compress_type=zipfile.ZIP_DEFLATED)

    return output_fn

# ...def zip_files_into_single_zipfile(...)
1432
+
1433
+
1434
def zip_folder(input_folder, output_fn=None, overwrite=False, verbose=False, compress_level=9):
    """
    Recursively zip everything in [input_folder] into a single zipfile, storing files as paths
    relative to [input_folder].

    Args:
        input_folder (str): folder to zip
        output_fn (str, optional): output filename; if this is None, we'll write to [input_folder].zip
        overwrite (bool, optional): whether to overwrite an existing zipfile
        verbose (bool, optional): enable additional debug console output
        compress_level (int, optional): compression level to use, between 0 and 9

    Returns:
        str: the output zipfile, whether we created it or determined that it already exists
    """

    if output_fn is None:
        output_fn = input_folder + '.zip'

    if (not overwrite) and os.path.isfile(output_fn):
        print('Zip file {} exists, skipping'.format(output_fn))
        return output_fn

    if verbose:
        print('Zipping {} to {} (compression level {})'.format(
            input_folder,output_fn,compress_level))

    filenames_relative = recursive_file_list(input_folder,return_relative_paths=True)

    with ZipFile(output_fn,'w',zipfile.ZIP_DEFLATED) as zipf:
        for fn_relative in tqdm(filenames_relative,disable=(not verbose)):
            # Relative paths double as archive member names
            zipf.write(os.path.join(input_folder,fn_relative),
                       arcname=fn_relative,
                       compresslevel=compress_level,
                       compress_type=zipfile.ZIP_DEFLATED)

    return output_fn

# ...def zip_folder(...)
1475
+
1476
+
1477
def parallel_zip_files(input_files,
                       max_workers=16,
                       use_threads=True,
                       compress_level=9,
                       overwrite=False,
                       verbose=False):
    """
    Zips one or more files to separate output files in parallel, leaving the
    original files in place.  Each file is zipped to [filename].zip.

    Args:
        input_files (str): list of files to zip
        max_workers (int, optional): number of concurrent workers, set to <= 1 to disable parallelism
        use_threads (bool, optional): whether to use threads (True) or processes (False); ignored if
            max_workers <= 1
        compress_level (int, optional): zip compression level between 0 and 9
        overwrite (bool, optional): whether to overwrite existing .zip files
        verbose (bool, optional): enable additional debug console output
    """

    # Creating a pool with zero workers raises ValueError, and there's
    # nothing to do anyway (parallel_copy_files guards this the same way)
    if len(input_files) == 0:
        print('Warning: parallel_zip_files called with an empty file list')
        return

    n_workers = min(max_workers,len(input_files))

    pool = None

    try:

        # Create the pool inside the try so cleanup always runs
        if use_threads:
            pool = ThreadPool(n_workers)
        else:
            pool = Pool(n_workers)

        with tqdm(total=len(input_files)) as pbar:
            for i,_ in enumerate(pool.imap_unordered(partial(zip_file,
                output_fn=None,overwrite=overwrite,verbose=verbose,compress_level=compress_level),
                input_files)):
                pbar.update()

    finally:

        if pool is not None:
            pool.close()
            pool.join()
            if verbose:
                print('Pool closed and joined for parallel zipping')

# ...def parallel_zip_files(...)
1520
+
1521
+
1522
def parallel_zip_folders(input_folders,
                         max_workers=16,
                         use_threads=True,
                         compress_level=9,
                         overwrite=False,
                         verbose=False):
    """
    Zips one or more folders to separate output files in parallel, leaving the
    original folders in place.  Each folder is zipped to [folder_name].zip.

    Args:
        input_folders (list): list of folders to zip
        max_workers (int, optional): number of concurrent workers, set to <= 1 to disable parallelism
        use_threads (bool, optional): whether to use threads (True) or processes (False); ignored if
            max_workers <= 1
        compress_level (int, optional): zip compression level between 0 and 9
        overwrite (bool, optional): whether to overwrite existing .zip files
        verbose (bool, optional): enable additional debug console output
    """

    # Creating a pool with zero workers raises ValueError, and there's
    # nothing to do anyway (parallel_copy_files guards this the same way)
    if len(input_folders) == 0:
        print('Warning: parallel_zip_folders called with an empty folder list')
        return

    n_workers = min(max_workers,len(input_folders))

    pool = None

    try:

        # Create the pool inside the try so cleanup always runs
        if use_threads:
            pool = ThreadPool(n_workers)
        else:
            pool = Pool(n_workers)

        with tqdm(total=len(input_folders)) as pbar:
            for i,_ in enumerate(pool.imap_unordered(
                    partial(zip_folder,overwrite=overwrite,
                            compress_level=compress_level,verbose=verbose),
                    input_folders)):
                pbar.update()

    finally:

        if pool is not None:
            pool.close()
            pool.join()
            if verbose:
                print('Pool closed and joined for parallel folder zipping')

# ...def parallel_zip_folders(...)
1566
+
1567
+
1568
def zip_each_file_in_folder(folder_name,
                            recursive=False,
                            max_workers=16,
                            use_threads=True,
                            compress_level=9,
                            overwrite=False,
                            required_token=None,
                            verbose=False,
                            exclude_zip=True):
    """
    Zips each file in [folder_name] to its own zipfile (filename.zip), optionally recursing.  To
    zip a whole folder into a single zipfile, use zip_folder().

    Args:
        folder_name (str): the folder within which we should zip files
        recursive (bool, optional): whether to recurse within [folder_name]
        max_workers (int, optional): number of concurrent workers, set to <= 1 to disable parallelism
        use_threads (bool, optional): whether to use threads (True) or processes (False); ignored if
            max_workers <= 1
        compress_level (int, optional): zip compression level between 0 and 9
        overwrite (bool, optional): whether to overwrite existing .zip files
        required_token (str, optional): only zip files whose names contain this string
        verbose (bool, optional): enable additional debug console output
        exclude_zip (bool, optional): skip files ending in .zip
    """

    assert os.path.isdir(folder_name), '{} is not a folder'.format(folder_name)

    files_to_zip = recursive_file_list(folder_name,recursive=recursive,return_relative_paths=False)

    # Apply optional filename filters
    if required_token is not None:
        files_to_zip = [fn for fn in files_to_zip if (required_token in fn)]
    if exclude_zip:
        files_to_zip = [fn for fn in files_to_zip if (not fn.endswith('.zip'))]

    parallel_zip_files(input_files=files_to_zip,
                       max_workers=max_workers,
                       use_threads=use_threads,
                       compress_level=compress_level,
                       overwrite=overwrite,
                       verbose=verbose)

# ...def zip_each_file_in_folder(...)
1609
+
1610
+
1611
def unzip_file(input_file, output_folder=None):
    """
    Unzips a zipfile to the specified output folder, defaulting to the same location as
    the input file.

    Args:
        input_file (str): zipfile to unzip
        output_folder (str, optional): folder to which we should unzip [input_file], defaults
            to unzipping to the folder where [input_file] lives
    """

    # Default to extracting alongside the input file
    target_folder = output_folder if output_folder is not None \
        else os.path.dirname(input_file)

    with zipfile.ZipFile(input_file, 'r') as zf:
        zf.extractall(target_folder)
1627
+
1628
+
1629
+ #%% File hashing functions
1630
+
1631
def compute_file_hash(file_path, algorithm='sha256', allow_failures=True):
    """
    Compute the hash of a file.

    Adapted from:

    https://www.geeksforgeeks.org/python-program-to-find-hash-of-file/

    Args:
        file_path (str): the file to hash
        algorithm (str, optional): the hashing algorithm to use (e.g. md5, sha256)
        allow_failures (bool, optional): if True, read failures will silently return
            None; if false, read failures will raise exceptions

    Returns:
        str: the hash value for this file
    """

    try:
        digest = hashlib.new(algorithm)
        with open(file_path, 'rb') as f:
            # Hash in 8 KB chunks so large files never load into memory at once
            for chunk in iter(lambda: f.read(8192), b''):
                digest.update(chunk)
        return str(digest.hexdigest())
    except Exception:
        if allow_failures:
            return None
        raise

# ...def compute_file_hash(...)
1667
+
1668
+
1669
def parallel_compute_file_hashes(filenames,
                                 max_workers=16,
                                 use_threads=True,
                                 recursive=True,
                                 algorithm='sha256',
                                 verbose=False):
    """
    Compute file hashes for a list or folder of images.

    Args:
        filenames (list or str): a list of filenames or a folder
        max_workers (int, optional): the number of parallel workers to use; set to <=1 to disable
            parallelization
        use_threads (bool, optional): whether to use threads (True) or processes (False) for
            parallelization
        algorithm (str, optional): the hashing algorithm to use (e.g. md5, sha256)
        recursive (bool, optional): if [filenames] is a folder, whether to enumerate recursively.
            Ignored if [filenames] is a list.
        verbose (bool, optional): enable additional debug output

    Returns:
        dict: a dict mapping filenames to hash values; values will be None for files that fail
        to load.
    """

    if isinstance(filenames,str) and os.path.isdir(filenames):
        if verbose:
            print('Enumerating files in {}'.format(filenames))
        filenames = recursive_file_list(filenames,recursive=recursive,return_relative_paths=False)

    # For an empty input list this is zero, which falls through to the serial branch
    n_workers = min(max_workers,len(filenames))

    if verbose:
        print('Computing hashes for {} files on {} workers'.format(len(filenames),n_workers))

    if n_workers <= 1:

        results = []
        for filename in filenames:
            results.append(compute_file_hash(filename,algorithm=algorithm,allow_failures=True))

    else:

        if use_threads:
            pool = ThreadPool(n_workers)
        else:
            pool = Pool(n_workers)

        try:

            # allow_failures=True so one unreadable file yields None rather than
            # aborting the whole batch
            results = list(tqdm(pool.imap(
                partial(compute_file_hash,algorithm=algorithm,allow_failures=True),
                filenames), total=len(filenames)))

        finally:

            pool.close()
            pool.join()
            if verbose:
                # This message previously said "parallel zipping" (copy-paste from the
                # zip utilities); corrected to reflect what this function does.
                print('Pool closed and joined for parallel hashing')

    # ...if we are/aren't parallelizing

    assert len(filenames) == len(results), 'Internal error in parallel_compute_file_hashes'

    to_return = {}
    for i_file,filename in enumerate(filenames):
        to_return[filename] = results[i_file]

    return to_return

# ...def parallel_compute_file_hashes(...)
1741
+
1742
+
1743
+ #%% Tests
1744
+
1745
+ class TestPathUtils:
1746
+ """
1747
+ Tests for path_utils.py
1748
+ """
1749
+
1750
+ def set_up(self):
1751
+ """
1752
+ Create a temporary directory for testing.
1753
+ """
1754
+
1755
+ self.test_dir = make_test_folder(subfolder='megadetector/path_utils_tests')
1756
+ print('Using temporary folder {} for path utils testing'.format(self.test_dir))
1757
+ os.makedirs(self.test_dir, exist_ok=True)
1758
+
1759
+
1760
+ def tear_down(self):
1761
+ """
1762
+ Remove the temporary directory after tests.
1763
+ """
1764
+
1765
+ if os.path.exists(self.test_dir):
1766
+ shutil.rmtree(self.test_dir)
1767
+
1768
+
1769
+ def test_is_image_file(self):
1770
+ """
1771
+ Test the is_image_file function.
1772
+ """
1773
+
1774
+ assert is_image_file('test.jpg')
1775
+ assert is_image_file('test.jpeg')
1776
+ assert is_image_file('test.png')
1777
+ assert is_image_file('test.gif')
1778
+ assert is_image_file('test.bmp')
1779
+ assert is_image_file('test.tiff')
1780
+ assert is_image_file('test.TIF')
1781
+ assert not is_image_file('test.txt')
1782
+ assert not is_image_file('test.doc')
1783
+ assert is_image_file('path/to/image.JPG')
1784
+ assert not is_image_file('image')
1785
+ assert is_image_file('test.custom', img_extensions=['.custom'])
1786
+ assert not is_image_file('test.jpg', img_extensions=['.custom'])
1787
+
1788
+
1789
+ def test_find_image_strings(self):
1790
+ """
1791
+ Test the find_image_strings function.
1792
+ """
1793
+
1794
+ strings = ['a.jpg', 'b.txt', 'c.PNG', 'd.gif', 'e.jpeg', 'f.doc']
1795
+ expected = ['a.jpg', 'c.PNG', 'd.gif', 'e.jpeg']
1796
+ assert sorted(find_image_strings(strings)) == sorted(expected)
1797
+ assert find_image_strings([]) == []
1798
+ assert find_image_strings(['no_image.txt', 'another.doc']) == []
1799
+
1800
+
1801
+ def test_find_images(self):
1802
+ """
1803
+ Test the find_images function.
1804
+ """
1805
+
1806
+ # Create some dummy files
1807
+ img1_abs = os.path.join(self.test_dir, 'img1.jpg')
1808
+ img2_abs = os.path.join(self.test_dir, 'img2.PNG')
1809
+ txt1_abs = os.path.join(self.test_dir, 'text1.txt')
1810
+ open(img1_abs, 'w').close()
1811
+ open(img2_abs, 'w').close()
1812
+ open(txt1_abs, 'w').close()
1813
+
1814
+ subdir = os.path.join(self.test_dir, 'subdir')
1815
+ os.makedirs(subdir, exist_ok=True)
1816
+ img3_abs = os.path.join(subdir, 'img3.jpeg')
1817
+ txt2_abs = os.path.join(subdir, 'text2.txt')
1818
+ open(img3_abs, 'w').close()
1819
+ open(txt2_abs, 'w').close()
1820
+
1821
+ # Test non-recursive
1822
+ expected_non_recursive_abs = sorted([img1_abs.replace('\\', '/'), img2_abs.replace('\\', '/')])
1823
+ found_non_recursive_abs = find_images(self.test_dir, recursive=False, return_relative_paths=False)
1824
+ assert sorted(found_non_recursive_abs) == expected_non_recursive_abs
1825
+
1826
+ # Test non-recursive, relative paths
1827
+ expected_non_recursive_rel = sorted(['img1.jpg', 'img2.PNG'])
1828
+ found_non_recursive_rel = find_images(self.test_dir, recursive=False, return_relative_paths=True)
1829
+ assert sorted(found_non_recursive_rel) == expected_non_recursive_rel
1830
+
1831
+ # Test recursive
1832
+ expected_recursive_abs = sorted([
1833
+ img1_abs.replace('\\', '/'),
1834
+ img2_abs.replace('\\', '/'),
1835
+ img3_abs.replace('\\', '/')
1836
+ ])
1837
+ found_recursive_abs = find_images(self.test_dir, recursive=True, return_relative_paths=False)
1838
+ assert sorted(found_recursive_abs) == expected_recursive_abs
1839
+
1840
+ # Test recursive, relative paths
1841
+ expected_recursive_rel = sorted([
1842
+ 'img1.jpg',
1843
+ 'img2.PNG',
1844
+ os.path.join('subdir', 'img3.jpeg').replace('\\', '/')
1845
+ ])
1846
+ found_recursive_rel = find_images(self.test_dir, recursive=True, return_relative_paths=True)
1847
+ assert sorted(found_recursive_rel) == expected_recursive_rel
1848
+
1849
+ # Test with an empty directory
1850
+ empty_dir = os.path.join(self.test_dir, 'empty_dir')
1851
+ os.makedirs(empty_dir, exist_ok=True)
1852
+ assert find_images(empty_dir, recursive=True) == []
1853
+
1854
+ # Test with a directory that doesn't exist (should assert)
1855
+ try:
1856
+ find_images(os.path.join(self.test_dir, 'non_existent_dir'))
1857
+ raise AssertionError("AssertionError not raised for non_existent_dir")
1858
+ except AssertionError:
1859
+ pass
1860
+
1861
+
1862
+ def test_recursive_file_list_and_file_list(self):
1863
+ """
1864
+ Test the recursive_file_list and file_list functions.
1865
+ """
1866
+
1867
+ # Setup directory structure
1868
+ # test_dir/
1869
+ # file1.txt
1870
+ # file2.jpg
1871
+ # subdir1/
1872
+ # file3.txt
1873
+ # subsubdir/
1874
+ # file4.png
1875
+ # subdir2/
1876
+ # file5.doc
1877
+
1878
+ list_dir = os.path.join(self.test_dir,'recursive_list')
1879
+
1880
+ f1 = os.path.join(list_dir, 'file1.txt')
1881
+ f2 = os.path.join(list_dir, 'file2.jpg')
1882
+ subdir1 = os.path.join(list_dir, 'subdir1')
1883
+ os.makedirs(subdir1, exist_ok=True)
1884
+ f3 = os.path.join(subdir1, 'file3.txt')
1885
+ subsubdir = os.path.join(subdir1, 'subsubdir')
1886
+ os.makedirs(subsubdir, exist_ok=True)
1887
+ f4 = os.path.join(subsubdir, 'file4.png')
1888
+ subdir2 = os.path.join(list_dir, 'subdir2')
1889
+ os.makedirs(subdir2, exist_ok=True)
1890
+ f5 = os.path.join(subdir2, 'file5.doc')
1891
+
1892
+ for filepath in [f1, f2, f3, f4, f5]:
1893
+ with open(filepath, 'w') as f:
1894
+ f.write('test')
1895
+
1896
+ # Test recursive_file_list (recursive=True by default)
1897
+ expected_all_files_abs = sorted([
1898
+ f1.replace('\\', '/'), f2.replace('\\', '/'), f3.replace('\\', '/'),
1899
+ f4.replace('\\', '/'), f5.replace('\\', '/')
1900
+ ])
1901
+ all_files_abs = recursive_file_list(list_dir, convert_slashes=True,
1902
+ return_relative_paths=False)
1903
+ assert sorted(all_files_abs) == expected_all_files_abs
1904
+
1905
+ # Test recursive_file_list with relative paths
1906
+ expected_all_files_rel = sorted([
1907
+ 'file1.txt', 'file2.jpg',
1908
+ os.path.join('subdir1', 'file3.txt').replace('\\', '/'),
1909
+ os.path.join('subdir1', 'subsubdir', 'file4.png').replace('\\', '/'),
1910
+ os.path.join('subdir2', 'file5.doc').replace('\\', '/')
1911
+ ])
1912
+ all_files_rel = recursive_file_list(list_dir, convert_slashes=True,
1913
+ return_relative_paths=True)
1914
+ assert sorted(all_files_rel) == expected_all_files_rel
1915
+
1916
+ # Test file_list (non-recursive by default via wrapper)
1917
+ expected_top_level_files_abs = sorted([f1.replace('\\', '/'), f2.replace('\\', '/')])
1918
+ top_level_files_abs = file_list(list_dir, convert_slashes=True,
1919
+ return_relative_paths=False, recursive=False)
1920
+ assert sorted(top_level_files_abs) == expected_top_level_files_abs
1921
+
1922
+ # Test file_list (recursive explicitly) - should be same as recursive_file_list
1923
+ recursive_via_file_list = file_list(list_dir, convert_slashes=True,
1924
+ return_relative_paths=False, recursive=True)
1925
+ assert sorted(recursive_via_file_list) == expected_all_files_abs
1926
+
1927
+ # Test with convert_slashes=False (use os.sep)
1928
+ #
1929
+ # Note: This test might be tricky if os.sep is '/', as no replacement happens. We'll check
1930
+ # that backslashes remain on Windows.
1931
+ if os.sep == '\\':
1932
+ f1_raw = os.path.join(list_dir, 'file1.txt')
1933
+ # Only one file for simplicity
1934
+ files_no_slash_conversion = file_list(list_dir, convert_slashes=False, recursive=False)
1935
+ assert any(f1_raw in s for s in files_no_slash_conversion)
1936
+
1937
+ # Test with an empty directory
1938
+ empty_dir = os.path.join(list_dir, "empty_dir_for_files")
1939
+ os.makedirs(empty_dir, exist_ok=True)
1940
+ assert recursive_file_list(empty_dir) == []
1941
+ assert file_list(empty_dir, recursive=False) == []
1942
+
1943
+ # Test with a non-existent directory
1944
+ try:
1945
+ recursive_file_list(os.path.join(list_dir, "non_existent_dir"))
1946
+ raise AssertionError("AssertionError not raised for non_existent_dir in recursive_file_list")
1947
+ except AssertionError:
1948
+ pass
1949
+
1950
+
1951
+ def test_folder_list(self):
1952
+ """
1953
+ Test the folder_list function.
1954
+ """
1955
+
1956
+ # Setup directory structure
1957
+ # test_dir/
1958
+ # subdir1/
1959
+ # subsubdir1/
1960
+ # subdir2/
1961
+ # file1.txt (should be ignored)
1962
+
1963
+ folder_list_dir = os.path.join(self.test_dir,'folder_list')
1964
+
1965
+ subdir1 = os.path.join(folder_list_dir, 'subdir1')
1966
+ subsubdir1 = os.path.join(subdir1, 'subsubdir1')
1967
+ subdir2 = os.path.join(folder_list_dir, 'subdir2')
1968
+ os.makedirs(subdir1, exist_ok=True)
1969
+ os.makedirs(subsubdir1, exist_ok=True)
1970
+ os.makedirs(subdir2, exist_ok=True)
1971
+ with open(os.path.join(folder_list_dir, 'file1.txt'), 'w') as f:
1972
+ f.write('test')
1973
+
1974
+ # Test non-recursive
1975
+ expected_folders_non_recursive_abs = sorted([
1976
+ subdir1.replace('\\', '/'), subdir2.replace('\\', '/')
1977
+ ])
1978
+ folders_non_recursive_abs = folder_list(folder_list_dir, recursive=False,
1979
+ return_relative_paths=False)
1980
+ assert sorted(folders_non_recursive_abs) == expected_folders_non_recursive_abs, \
1981
+ 'Non-recursive folder list failed, expected:\n\n{}\n\nFound:\n\n{}'.format(
1982
+ str(expected_folders_non_recursive_abs),
1983
+ str(folders_non_recursive_abs)
1984
+ )
1985
+
1986
+ # Test non-recursive, relative paths
1987
+ expected_folders_non_recursive_rel = sorted(['subdir1', 'subdir2'])
1988
+ folders_non_recursive_rel = folder_list(folder_list_dir, recursive=False,
1989
+ return_relative_paths=True)
1990
+ assert sorted(folders_non_recursive_rel) == expected_folders_non_recursive_rel
1991
+
1992
+ # Test recursive
1993
+ expected_folders_recursive_abs = sorted([
1994
+ subdir1.replace('\\', '/'),
1995
+ subsubdir1.replace('\\', '/'),
1996
+ subdir2.replace('\\', '/')
1997
+ ])
1998
+ folders_recursive_abs = folder_list(folder_list_dir, recursive=True,
1999
+ return_relative_paths=False)
2000
+ assert sorted(folders_recursive_abs) == expected_folders_recursive_abs
2001
+
2002
+ # Test recursive, relative paths
2003
+ expected_folders_recursive_rel = sorted([
2004
+ 'subdir1',
2005
+ os.path.join('subdir1', 'subsubdir1').replace('\\', '/'),
2006
+ 'subdir2'
2007
+ ])
2008
+ folders_recursive_rel = folder_list(folder_list_dir, recursive=True,
2009
+ return_relative_paths=True)
2010
+ assert sorted(folders_recursive_rel) == expected_folders_recursive_rel
2011
+
2012
+ # Test with an empty directory (except for the file)
2013
+ empty_dir_for_folders = os.path.join(folder_list_dir, "empty_for_folders")
2014
+ os.makedirs(empty_dir_for_folders, exist_ok=True)
2015
+ with open(os.path.join(empty_dir_for_folders, 'temp.txt'), 'w') as f: f.write('t')
2016
+ assert folder_list(empty_dir_for_folders, recursive=True) == []
2017
+ assert folder_list(empty_dir_for_folders, recursive=False) == []
2018
+
2019
+ # Test with a non-existent directory
2020
+ try:
2021
+ folder_list(os.path.join(self.test_dir, "non_existent_dir"))
2022
+ raise AssertionError("AssertionError not raised for non_existent_dir in folder_list")
2023
+ except AssertionError:
2024
+ pass
2025
+
2026
+
2027
    def test_folder_summary(self):
        """
        Test the folder_summary function.

        Verifies file/folder counts and per-extension counts for a small tree,
        and the degenerate case of an empty folder.
        """

        # test_dir/
        #   file1.txt
        #   img1.jpg
        #   subdir/
        #     file2.txt
        #     img2.png
        #     img3.png

        folder_summary_dir = os.path.join(self.test_dir,'folder_summary')

        f1 = os.path.join(folder_summary_dir, 'file1.txt')
        img1 = os.path.join(folder_summary_dir, 'img1.jpg')
        subdir = os.path.join(folder_summary_dir, 'subdir')
        # This also creates folder_summary_dir itself (the parent)
        os.makedirs(subdir, exist_ok=True)
        f2 = os.path.join(subdir, 'file2.txt')
        img2 = os.path.join(subdir, 'img2.png')
        img3 = os.path.join(subdir, 'img3.png')

        for filepath in [f1, img1, f2, img2, img3]:
            with open(filepath, 'w') as f:
                f.write('test')

        summary = folder_summary(folder_summary_dir, print_summary=False)

        assert summary['n_files'] == 5
        assert summary['n_folders'] == 1 # 'subdir'
        assert summary['extension_to_count']['.txt'] == 2
        assert summary['extension_to_count']['.jpg'] == 1
        assert summary['extension_to_count']['.png'] == 2

        # Check order (sorted by value, desc)
        #
        # The specific order of keys with the same counts can vary based on file system list
        # order. We'll check that the counts are correct and the number of unique extensions is
        # right.
        assert len(summary['extension_to_count']) == 3

        # An empty folder should produce all-zero counts and an empty extension map
        empty_dir = os.path.join(folder_summary_dir, "empty_summary_dir")
        os.makedirs(empty_dir, exist_ok=True)
        empty_summary = folder_summary(empty_dir, print_summary=False)
        assert empty_summary['n_files'] == 0
        assert empty_summary['n_folders'] == 0
        assert empty_summary['extension_to_count'] == {}
2076
+
2077
+
2078
+ def test_fileparts(self):
2079
+ """
2080
+ Test the fileparts function.
2081
+ """
2082
+
2083
+ assert fileparts('file') == ('', 'file', '')
2084
+ assert fileparts('file.txt') == ('', 'file', '.txt')
2085
+ assert fileparts(r'c:/dir/file.jpg') == ('c:/dir', 'file', '.jpg')
2086
+ assert fileparts('/dir/subdir/file.jpg') == ('/dir/subdir', 'file', '.jpg')
2087
+ assert fileparts(r'c:\dir\file') == (r'c:\dir', 'file', '')
2088
+ assert fileparts(r'c:\dir\file.tar.gz') == (r'c:\dir', 'file.tar', '.gz')
2089
+ assert fileparts('.bashrc') == ('', '.bashrc', '') # Hidden file, no extension
2090
+ assert fileparts('nodir/.bashrc') == ('nodir', '.bashrc', '')
2091
+ assert fileparts('a/b/c.d.e') == ('a/b', 'c.d', '.e')
2092
+
2093
+
2094
    def test_insert_before_extension(self):
        """
        Test the insert_before_extension function.

        Covers explicit tokens, custom separators, and the timestamp fallback
        used when no token is supplied.
        """

        assert insert_before_extension('file.ext', 'inserted') == 'file.inserted.ext'
        assert insert_before_extension('file', 'inserted') == 'file.inserted'
        assert insert_before_extension('path/to/file.ext', 'tag') == 'path/to/file.tag.ext'
        assert insert_before_extension('path/to/file', 'tag') == 'path/to/file.tag'
        # Insertion happens before the *last* extension only
        assert insert_before_extension('file.tar.gz', 'new') == 'file.tar.new.gz'

        # Test with custom separator
        assert insert_before_extension('file.ext', 'inserted', separator='_') == 'file_inserted.ext'

        # Test with s=None (timestamp) - check format roughly
        fname_with_ts = insert_before_extension('file.ext', None)
        parts = fname_with_ts.split('.')
        # file.YYYY.MM.DD.HH.MM.SS.ext
        assert len(parts) >= 8 # file, Y, M, D, H, M, S, ext
        assert parts[0] == 'file'
        assert parts[-1] == 'ext'
        assert all(p.isdigit() for p in parts[1:-1])

        fname_no_ext_ts = insert_before_extension('file', '') # s is empty string, should also use timestamp
        parts_no_ext = fname_no_ext_ts.split('.')
        assert len(parts_no_ext) >= 7 # file, Y, M, D, H, M, S
        assert parts_no_ext[0] == 'file'
        assert all(p.isdigit() for p in parts_no_ext[1:])
2122
+
2123
+
2124
    def test_split_path(self):
        """
        Test the split_path function.

        Platform-specific cases (drive letters, UNC-style roots) are only run on
        the matching platform, since split_path defers to the native path module.
        """

        if os.name == 'nt':
            assert split_path(r'c:\dir\subdir\file.txt') == ['c:\\', 'dir', 'subdir', 'file.txt']
            assert split_path('c:\\') == ['c:\\']
            # Test with mixed slashes, ntpath.split handles them
            assert split_path(r'c:/dir/subdir/file.txt') == ['c:/', 'dir', 'subdir', 'file.txt']
        else: # POSIX
            assert split_path('/dir/subdir/file.jpg') == ['/', 'dir', 'subdir', 'file.jpg']
            assert split_path('/') == ['/']

        assert split_path('dir/file.txt') == ['dir', 'file.txt']
        assert split_path('file.txt') == ['file.txt']
        # NB: the empty string comes back unchanged (a str, not a list)
        assert split_path('') == ''
        assert split_path('.') == ['.']
        assert split_path('..') == ['..']
        assert split_path('../a/b') == ['..', 'a', 'b']
2144
+
2145
+
2146
+ def test_path_is_abs(self):
2147
+ """
2148
+ Test the path_is_abs function.
2149
+ """
2150
+
2151
+ assert path_is_abs('/absolute/path')
2152
+ assert path_is_abs('c:/absolute/path')
2153
+ assert path_is_abs('C:\\absolute\\path')
2154
+ assert path_is_abs('\\\\server\\share\\path') # UNC path
2155
+ assert path_is_abs('c:file_without_slash_after_drive')
2156
+
2157
+ assert not path_is_abs('relative/path')
2158
+ assert not path_is_abs('file.txt')
2159
+ assert not path_is_abs('../relative')
2160
+ assert not path_is_abs('')
2161
+
2162
+
2163
+
2164
+ def test_safe_create_link_unix(self):
2165
+ """
2166
+ Test the safe_create_link function on Unix-like systems.
2167
+ """
2168
+
2169
+ if os.name == 'nt':
2170
+ # print("Skipping test_safe_create_link_unix on Windows.")
2171
+ return
2172
+
2173
+ source_file_path = os.path.join(self.test_dir, 'source.txt')
2174
+ link_path = os.path.join(self.test_dir, 'link.txt')
2175
+ other_source_path = os.path.join(self.test_dir, 'other_source.txt')
2176
+
2177
+ with open(source_file_path, 'w') as f:
2178
+ f.write('source data')
2179
+ with open(other_source_path, 'w') as f:
2180
+ f.write('other data')
2181
+
2182
+ # Create new link
2183
+ safe_create_link(source_file_path, link_path)
2184
+ assert os.path.islink(link_path)
2185
+ assert os.readlink(link_path) == source_file_path
2186
+
2187
+ # Link already exists and points to the correct source
2188
+ safe_create_link(source_file_path, link_path) # Should do nothing
2189
+ assert os.path.islink(link_path)
2190
+ assert os.readlink(link_path) == source_file_path
2191
+
2192
+ # Link already exists but points to a different source
2193
+ safe_create_link(other_source_path, link_path) # Should remove and re-create
2194
+ assert os.path.islink(link_path)
2195
+ assert os.readlink(link_path) == other_source_path
2196
+
2197
+ # Link_new path exists and is a file (not a link)
2198
+ file_path_conflict = os.path.join(self.test_dir, 'conflict_file.txt')
2199
+ with open(file_path_conflict, 'w') as f:
2200
+ f.write('actual file')
2201
+ try:
2202
+ safe_create_link(source_file_path, file_path_conflict)
2203
+ raise AssertionError("AssertionError not raised for file conflict")
2204
+ except AssertionError:
2205
+ pass
2206
+ os.remove(file_path_conflict)
2207
+
2208
+ # Link_new path exists and is a directory
2209
+ dir_path_conflict = os.path.join(self.test_dir, 'conflict_dir')
2210
+ os.makedirs(dir_path_conflict, exist_ok=True)
2211
+ try:
2212
+ safe_create_link(source_file_path, dir_path_conflict)
2213
+ raise AssertionError("AssertionError not raised for directory conflict")
2214
+ except AssertionError: # islink will be false
2215
+ pass
2216
+ shutil.rmtree(dir_path_conflict)
2217
+
2218
+
2219
    def test_remove_empty_folders(self):
        """
        Test the remove_empty_folders function.

        Covers fully-empty trees, trees mixing empty and non-empty branches,
        a plain file path, and the remove_root=False case.
        """

        # test_dir/
        #   empty_top/
        #     empty_mid/
        #       empty_leaf/
        #   mixed_top/
        #     empty_mid_in_mixed/
        #       empty_leaf_in_mixed/
        #     non_empty_mid/
        #       file.txt
        #   non_empty_top/
        #     file_in_top.txt

        empty_top = os.path.join(self.test_dir, 'empty_top')
        empty_mid = os.path.join(empty_top, 'empty_mid')
        empty_leaf = os.path.join(empty_mid, 'empty_leaf')
        os.makedirs(empty_leaf, exist_ok=True)

        mixed_top = os.path.join(self.test_dir, 'mixed_top')
        empty_mid_in_mixed = os.path.join(mixed_top, 'empty_mid_in_mixed')
        empty_leaf_in_mixed = os.path.join(empty_mid_in_mixed, 'empty_leaf_in_mixed')
        os.makedirs(empty_leaf_in_mixed, exist_ok=True)
        non_empty_mid = os.path.join(mixed_top, 'non_empty_mid')
        os.makedirs(non_empty_mid, exist_ok=True)
        with open(os.path.join(non_empty_mid, 'file.txt'), 'w') as f:
            f.write('data')

        non_empty_top = os.path.join(self.test_dir, 'non_empty_top')
        os.makedirs(non_empty_top, exist_ok=True)
        with open(os.path.join(non_empty_top, 'file_in_top.txt'), 'w') as f:
            f.write('data')

        # Process empty_top - should remove all three
        remove_empty_folders(empty_top, remove_root=True)
        assert not os.path.exists(empty_top)
        assert not os.path.exists(empty_mid)
        assert not os.path.exists(empty_leaf)

        # Process mixed_top; should remove empty_leaf_in_mixed and empty_mid_in_mixed
        # but not mixed_top or non_empty_mid.
        remove_empty_folders(mixed_top, remove_root=True)
        assert os.path.exists(mixed_top) # mixed_top itself should remain
        assert not os.path.exists(empty_mid_in_mixed)
        assert not os.path.exists(empty_leaf_in_mixed)
        assert os.path.exists(non_empty_mid)
        assert os.path.exists(os.path.join(non_empty_mid, 'file.txt'))

        # Process non_empty_top; should remove nothing.
        remove_empty_folders(non_empty_top, remove_root=True)
        assert os.path.exists(non_empty_top)
        assert os.path.exists(os.path.join(non_empty_top, 'file_in_top.txt'))

        # Test with a file path (should do nothing and return False)
        file_path_for_removal = os.path.join(self.test_dir, 'a_file.txt')
        with open(file_path_for_removal, 'w') as f: f.write('t')
        assert not remove_empty_folders(file_path_for_removal, remove_root=True)
        assert os.path.exists(file_path_for_removal)

        # Test with remove_root=False for the top level
        another_empty_top = os.path.join(self.test_dir, 'another_empty_top')
        another_empty_mid = os.path.join(another_empty_top, 'another_empty_mid')
        os.makedirs(another_empty_mid)
        remove_empty_folders(another_empty_top, remove_root=False)
        assert os.path.exists(another_empty_top) # Root not removed
        assert not os.path.exists(another_empty_mid) # Mid removed
2288
+
2289
+
2290
    def test_path_join(self):
        """
        Test the path_join function.

        Verifies forward-slash normalization behavior (convert_slashes) on both
        Windows and POSIX, plus os.path.join-style edge cases.
        """

        assert path_join('a', 'b', 'c') == 'a/b/c'
        assert path_join('a/b', 'c', 'd.txt') == 'a/b/c/d.txt'
        if os.name == 'nt':
            # On Windows, os.path.join uses '\', so convert_slashes=True should change it
            assert path_join('a', 'b', convert_slashes=True) == 'a/b'
            assert path_join('a', 'b', convert_slashes=False) == 'a\\b'
            assert path_join('c:\\', 'foo', 'bar', convert_slashes=True) == 'c:/foo/bar'
            assert path_join('c:\\', 'foo', 'bar', convert_slashes=False) == 'c:\\foo\\bar'
        else:
            # On POSIX, os.path.join uses '/', so convert_slashes=False should still be '/'
            assert path_join('a', 'b', convert_slashes=False) == 'a/b'

        assert path_join('a', '', 'b') == 'a/b' # os.path.join behavior
        assert path_join('/a', 'b') == '/a/b'
        # An absolute component discards everything before it, as in os.path.join
        assert path_join('a', '/b') == '/b' # '/b' is absolute
2310
+
2311
+
2312
+ def test_filename_cleaning(self):
2313
+ """
2314
+ Test clean_filename, clean_path, and flatten_path functions.
2315
+ """
2316
+
2317
+ # clean_filename
2318
+ assert clean_filename("test file.txt") == "test file.txt"
2319
+ assert clean_filename("test*file?.txt", char_limit=10) == "testfile.t"
2320
+ assert clean_filename("TestFile.TXT", force_lower=True) == "testfile.txt"
2321
+ assert clean_filename("file:with<illegal>chars.txt") == "filewithillegalchars.txt"
2322
+
2323
+ s = " accented_name_éà.txt"
2324
+
2325
+ assert clean_filename(s,
2326
+ remove_trailing_leading_whitespace=False) == " accented_name_ea.txt", \
2327
+ 'clean_filename with remove_trailing_leading_whitespace=False: {}'.format(
2328
+ clean_filename(s, remove_trailing_leading_whitespace=False))
2329
+
2330
+ assert clean_filename(s, remove_trailing_leading_whitespace=True) == "accented_name_ea.txt", \
2331
+ 'clean_filename with remove_trailing_leading_whitespace=False: {}'.format(
2332
+ clean_filename(s, remove_trailing_leading_whitespace=True))
2333
+
2334
+ # Separators are not allowed by default in clean_filename
2335
+ assert clean_filename("path/to/file.txt") == "pathtofile.txt"
2336
+
2337
+ # clean_path
2338
+ assert clean_path("path/to/file.txt") == "path/to/file.txt" # slashes allowed
2339
+ assert clean_path("path\\to\\file.txt") == "path\\to\\file.txt" # backslashes allowed
2340
+ assert clean_path("path:to:file.txt") == "path:to:file.txt" # colons allowed
2341
+ assert clean_path("path/to<illegal>/file.txt") == "path/toillegal/file.txt"
2342
+
2343
+ # flatten_path
2344
+ assert flatten_path("path/to/file.txt") == "path~to~file.txt"
2345
+ assert flatten_path("path:to:file.txt", separator_char_replacement='_') == "path_to_file.txt"
2346
+ assert flatten_path("path\\to/file:name.txt") == "path~to~file~name.txt"
2347
+ assert flatten_path("path/to<illegal>/file.txt") == "path~toillegal~file.txt"
2348
+
2349
+
2350
+ def test_is_executable(self):
2351
+ """
2352
+ Test the is_executable function.
2353
+ This is a basic test; comprehensive testing is environment-dependent.
2354
+ """
2355
+
2356
+ # Hard to test reliably across all systems without knowing what's on PATH.
2357
+ if os.name == 'nt':
2358
+ assert is_executable('cmd.exe')
2359
+ assert not is_executable('non_existent_executable_blah_blah')
2360
+ else:
2361
+ assert is_executable('ls')
2362
+ assert is_executable('sh')
2363
+ assert not is_executable('non_existent_executable_blah_blah')
2364
+
2365
+
2366
    def test_write_read_list_to_file(self):
        """
        Test write_list_to_file and read_list_from_file functions.

        Covers .json and .txt round trips, a missing file, and malformed JSON.
        """

        test_list = ["item1", "item2 with space", "item3/with/slash"]

        # Test with .json
        json_file_path = os.path.join(self.test_dir, "test_list.json")
        write_list_to_file(json_file_path, test_list)
        read_list_json = read_list_from_file(json_file_path)
        assert test_list == read_list_json

        # Test with .txt
        txt_file_path = os.path.join(self.test_dir, "test_list.txt")
        write_list_to_file(txt_file_path, test_list)
        # read_list_from_file is specifically for JSON, so we read .txt manually
        with open(txt_file_path, 'r') as f:
            read_list_txt = [line.strip() for line in f.readlines()]
        assert test_list == read_list_txt

        # Test reading non-existent json
        #
        # The sentinel AssertionError propagates correctly here, since only
        # FileNotFoundError is caught.
        try:
            read_list_from_file(os.path.join(self.test_dir,"non_existent.json"))
            raise AssertionError("FileNotFoundError not raised")
        except FileNotFoundError:
            pass

        # Test reading a non-json file with read_list_from_file (should fail parsing)
        non_json_path = os.path.join(self.test_dir, "not_a_list.json")
        with open(non_json_path, 'w') as f: f.write("this is not json")
        try:
            read_list_from_file(non_json_path)
            raise AssertionError("json.JSONDecodeError not raised")
        except json.JSONDecodeError:
            pass
2402
+
2403
+
2404
    def test_parallel_copy_files(self):
        """
        Test the parallel_copy_files function (with max_workers=1 for test simplicity).

        Exercises plain copy, overwrite=False/True semantics, and move=True.
        """

        source_dir = os.path.join(self.test_dir, "copy_source")
        target_dir = os.path.join(self.test_dir, "copy_target")
        os.makedirs(source_dir, exist_ok=True)

        # Maps source path -> target path; targets include both a flat file and
        # nested subfolders that parallel_copy_files must create.
        file_mappings = {}
        source_files_content = {}

        for i in range(3):
            src_fn = f"file{i}.txt"
            src_path = os.path.join(source_dir, src_fn)
            if i == 0:
                tgt_fn = f"copied_file{i}.txt"
                tgt_path = os.path.join(target_dir, tgt_fn)
            else:
                tgt_fn = f"copied_file{i}_subdir.txt"
                tgt_path = os.path.join(target_dir, f"sub{i}", tgt_fn)

            content = f"content of file {i}"
            with open(src_path, 'w') as f:
                f.write(content)

            file_mappings[src_path] = tgt_path
            source_files_content[tgt_path] = content

        # Test copy
        parallel_copy_files(file_mappings, max_workers=1, use_threads=True, overwrite=False)
        for tgt_path, expected_content in source_files_content.items():
            assert os.path.exists(tgt_path)
            with open(tgt_path, 'r') as f:
                assert f.read() == expected_content

        # overwrite=False should leave an existing target untouched
        existing_target_path = list(source_files_content.keys())[0]
        with open(existing_target_path, 'w') as f:
            f.write("old content")

        parallel_copy_files(file_mappings, max_workers=1, use_threads=True, overwrite=False)
        with open(existing_target_path, 'r') as f:
            assert f.read() == "old content"

        # overwrite=True should replace it
        parallel_copy_files(file_mappings, max_workers=1, use_threads=True, overwrite=True)
        with open(existing_target_path, 'r') as f:
            assert f.read() == source_files_content[existing_target_path]

        for src_path_orig, tgt_path_orig in file_mappings.items(): # Re-create source for move
            with open(src_path_orig, 'w') as f:
                f.write(source_files_content[tgt_path_orig])

        # move=True should remove the sources after copying
        parallel_copy_files(file_mappings, max_workers=1, use_threads=True, move=True, overwrite=True)
        for src_path, tgt_path in file_mappings.items():
            assert not os.path.exists(src_path)
            assert os.path.exists(tgt_path)
            with open(tgt_path, 'r') as f:
                assert f.read() == source_files_content[tgt_path]
2462
+
2463
+
2464
+ def test_get_file_sizes(self):
2465
+ """
2466
+ Test get_file_sizes and parallel_get_file_sizes functions.
2467
+ """
2468
+
2469
+ file_sizes_test_dir = os.path.join(self.test_dir,'file_sizes')
2470
+ os.makedirs(file_sizes_test_dir,exist_ok=True)
2471
+
2472
+ f1_path = os.path.join(file_sizes_test_dir, 'file1.txt')
2473
+ content1 = "0123456789" # 10 bytes
2474
+ with open(f1_path, 'w') as f:
2475
+ f.write(content1)
2476
+
2477
+ subdir_path = os.path.join(file_sizes_test_dir, 'subdir')
2478
+ os.makedirs(subdir_path, exist_ok=True)
2479
+ f2_path = os.path.join(subdir_path, 'file2.txt')
2480
+ content2 = "01234567890123456789" # 20 bytes
2481
+ with open(f2_path, 'w') as f:
2482
+ f.write(content2)
2483
+
2484
+ sizes_relative = get_file_sizes(file_sizes_test_dir)
2485
+ expected_sizes_relative = {
2486
+ 'file1.txt': len(content1),
2487
+ os.path.join('subdir', 'file2.txt').replace('\\','/'): len(content2)
2488
+ }
2489
+ assert sizes_relative == expected_sizes_relative
2490
+
2491
+ file_list_abs = [f1_path, f2_path]
2492
+ sizes_parallel_abs = parallel_get_file_sizes(file_list_abs, max_workers=1)
2493
+ expected_sizes_parallel_abs = {
2494
+ f1_path.replace('\\','/'): len(content1),
2495
+ f2_path.replace('\\','/'): len(content2)
2496
+ }
2497
+ assert sizes_parallel_abs == expected_sizes_parallel_abs
2498
+
2499
+ sizes_parallel_folder_abs = parallel_get_file_sizes(file_sizes_test_dir,
2500
+ max_workers=1,
2501
+ return_relative_paths=False)
2502
+ assert sizes_parallel_folder_abs == expected_sizes_parallel_abs
2503
+
2504
+ sizes_parallel_folder_rel = parallel_get_file_sizes(file_sizes_test_dir,
2505
+ max_workers=1,
2506
+ return_relative_paths=True)
2507
+ assert sizes_parallel_folder_rel == expected_sizes_relative
2508
+
2509
+ non_existent_file = os.path.join(file_sizes_test_dir, "no_such_file.txt")
2510
+ sizes_with_error = parallel_get_file_sizes([f1_path, non_existent_file],
2511
+ max_workers=1)
2512
+ expected_with_error = {
2513
+ f1_path.replace('\\','/'): len(content1),
2514
+ non_existent_file.replace('\\','/'): None
2515
+ }
2516
+ assert sizes_with_error == expected_with_error
2517
+
2518
+
2519
+ def test_zip_file_and_unzip_file(self):
2520
+ """
2521
+ Test zip_file and unzip_file functions.
2522
+ """
2523
+
2524
+ file_to_zip_name = "test_zip_me.txt"
2525
+ file_to_zip_path = os.path.join(self.test_dir, file_to_zip_name)
2526
+ content = "This is the content to be zipped."
2527
+ with open(file_to_zip_path, 'w') as f:
2528
+ f.write(content)
2529
+
2530
+ default_zip_output_path = file_to_zip_path + ".zip"
2531
+ returned_zip_path = zip_file(file_to_zip_path)
2532
+ assert returned_zip_path == default_zip_output_path
2533
+ assert os.path.exists(default_zip_output_path)
2534
+
2535
+ unzip_dir_default = os.path.join(self.test_dir, "unzip_default")
2536
+ os.makedirs(unzip_dir_default, exist_ok=True)
2537
+ unzip_file(default_zip_output_path, unzip_dir_default)
2538
+ unzipped_file_path_default = os.path.join(unzip_dir_default, file_to_zip_name)
2539
+ assert os.path.exists(unzipped_file_path_default)
2540
+ with open(unzipped_file_path_default, 'r') as f:
2541
+ assert f.read() == content
2542
+
2543
+ custom_zip_output_name = "custom_archive.zip"
2544
+ custom_zip_output_path = os.path.join(self.test_dir, custom_zip_output_name)
2545
+ zip_file(file_to_zip_path, output_fn=custom_zip_output_path, overwrite=True)
2546
+ assert os.path.exists(custom_zip_output_path)
2547
+
2548
+ zip_in_subdir_path = os.path.join(self.test_dir, "subdir_zip", "my.zip")
2549
+ file_in_subdir_name = "file_for_subdir_zip.txt"
2550
+ file_in_subdir_path = os.path.join(self.test_dir,"subdir_zip", file_in_subdir_name)
2551
+ os.makedirs(os.path.dirname(zip_in_subdir_path), exist_ok=True)
2552
+ with open(file_in_subdir_path, "w") as f: f.write("sub dir content")
2553
+ zip_file(file_in_subdir_path, output_fn=zip_in_subdir_path)
2554
+
2555
+ unzip_file(zip_in_subdir_path, output_folder=None)
2556
+ unzipped_in_same_dir_path = os.path.join(os.path.dirname(zip_in_subdir_path), file_in_subdir_name)
2557
+ assert os.path.exists(unzipped_in_same_dir_path)
2558
+ with open(unzipped_in_same_dir_path, 'r') as f:
2559
+ assert f.read() == "sub dir content"
2560
+
2561
+
2562
+ def test_zip_folder(self):
2563
+ """
2564
+ Test the zip_folder function.
2565
+ """
2566
+
2567
+ folder_to_zip = os.path.join(self.test_dir, "folder_to_zip")
2568
+ os.makedirs(folder_to_zip, exist_ok=True)
2569
+
2570
+ file1_name = "file1.txt"; path1 = os.path.join(folder_to_zip, file1_name)
2571
+ file2_name = "file2.log"; path2 = os.path.join(folder_to_zip, file2_name)
2572
+ subdir_name = "sub"; subdir_path = os.path.join(folder_to_zip, subdir_name)
2573
+ os.makedirs(subdir_path, exist_ok=True)
2574
+ file3_name = "file3.dat"; path3 = os.path.join(subdir_path, file3_name)
2575
+
2576
+ content1 = "content1"; content2 = "content2"; content3 = "content3"
2577
+ with open(path1, 'w') as f: f.write(content1)
2578
+ with open(path2, 'w') as f: f.write(content2)
2579
+ with open(path3, 'w') as f: f.write(content3)
2580
+
2581
+ default_zip_path = folder_to_zip + ".zip"
2582
+ zip_folder(folder_to_zip, output_fn=None, overwrite=True)
2583
+ assert os.path.exists(default_zip_path)
2584
+
2585
+ unzip_output_dir = os.path.join(self.test_dir, "unzipped_folder_content")
2586
+ os.makedirs(unzip_output_dir, exist_ok=True)
2587
+ unzip_file(default_zip_path, unzip_output_dir)
2588
+
2589
+ assert os.path.exists(os.path.join(unzip_output_dir, file1_name))
2590
+ assert os.path.exists(os.path.join(unzip_output_dir, file2_name))
2591
+ assert os.path.exists(os.path.join(unzip_output_dir, subdir_name, file3_name))
2592
+ with open(os.path.join(unzip_output_dir, file1_name), 'r')as f: assert f.read() == content1
2593
+ with open(os.path.join(unzip_output_dir, file2_name), 'r')as f: assert f.read() == content2
2594
+ with open(os.path.join(unzip_output_dir, subdir_name, file3_name), 'r')as f: assert f.read() == content3
2595
+
2596
+ mtime_before = os.path.getmtime(default_zip_path)
2597
+ zip_folder(folder_to_zip, output_fn=None, overwrite=False)
2598
+ mtime_after = os.path.getmtime(default_zip_path)
2599
+ assert mtime_before == mtime_after
2600
+
2601
+
2602
+ def test_zip_files_into_single_zipfile(self):
2603
+ """
2604
+ Test zip_files_into_single_zipfile.
2605
+ """
2606
+
2607
+ file1_path = os.path.join(self.test_dir, "zfs_file1.txt")
2608
+ content1 = "content for zfs1"
2609
+ with open(file1_path, 'w') as f: f.write(content1)
2610
+
2611
+ subdir_for_zfs = os.path.join(self.test_dir, "zfs_subdir")
2612
+ os.makedirs(subdir_for_zfs, exist_ok=True)
2613
+ file2_path = os.path.join(subdir_for_zfs, "zfs_file2.log")
2614
+ content2 = "content for zfs2"
2615
+ with open(file2_path, 'w') as f: f.write(content2)
2616
+
2617
+ input_files = [file1_path, file2_path]
2618
+ output_zip_path = os.path.join(self.test_dir, "multi_file_archive.zip")
2619
+ zip_files_into_single_zipfile(input_files, output_zip_path, arc_name_base=self.test_dir, overwrite=True)
2620
+ assert os.path.exists(output_zip_path)
2621
+
2622
+ unzip_dir = os.path.join(self.test_dir, "unzip_multi_file")
2623
+ os.makedirs(unzip_dir, exist_ok=True)
2624
+ unzip_file(output_zip_path, unzip_dir)
2625
+
2626
+ expected_unzipped_file1 = os.path.join(unzip_dir, os.path.relpath(file1_path, self.test_dir))
2627
+ expected_unzipped_file2 = os.path.join(unzip_dir, os.path.relpath(file2_path, self.test_dir))
2628
+
2629
+ assert os.path.exists(expected_unzipped_file1)
2630
+ with open(expected_unzipped_file1, 'r') as f: assert f.read() == content1
2631
+ assert os.path.exists(expected_unzipped_file2)
2632
+ assert os.path.basename(expected_unzipped_file2) == "zfs_file2.log"
2633
+ assert os.path.basename(os.path.dirname(expected_unzipped_file2)) == "zfs_subdir"
2634
+ with open(expected_unzipped_file2, 'r') as f: assert f.read() == content2
2635
+
2636
+
2637
+ def test_add_files_to_single_tar_file(self):
2638
+ """
2639
+ Test add_files_to_single_tar_file.
2640
+ """
2641
+
2642
+ file1_path = os.path.join(self.test_dir, "tar_file1.txt")
2643
+ content1 = "content for tar1"
2644
+ with open(file1_path, 'w') as f: f.write(content1)
2645
+
2646
+ subdir_for_tar = os.path.join(self.test_dir, "tar_subdir")
2647
+ os.makedirs(subdir_for_tar, exist_ok=True)
2648
+ file2_path = os.path.join(subdir_for_tar, "tar_file2.log")
2649
+ content2 = "content for tar2"
2650
+ with open(file2_path, 'w') as f: f.write(content2)
2651
+
2652
+ input_files = [file1_path, file2_path]
2653
+ output_tar_path = os.path.join(self.test_dir, "archive.tar.gz")
2654
+
2655
+ add_files_to_single_tar_file(input_files, output_tar_path, arc_name_base=self.test_dir,
2656
+ overwrite=True, mode='x:gz')
2657
+ assert os.path.exists(output_tar_path)
2658
+
2659
+ un_tar_dir = os.path.join(self.test_dir, "un_tar_contents")
2660
+ os.makedirs(un_tar_dir, exist_ok=True)
2661
+ with tarfile.open(output_tar_path, 'r:gz') as tf:
2662
+ # The "filter" option was added as of Python 3.12, and *not* specifying
2663
+ # filter=None will change behavior as of Python 3.14. We want the unmodified
2664
+ # behavior, but we want to support Python <3.12, so we do a version check.
2665
+ if sys.version_info >= (3, 12):
2666
+ tf.extractall(path=un_tar_dir, filter=None)
2667
+ else:
2668
+ tf.extractall(path=un_tar_dir)
2669
+
2670
+ expected_untarred_file1 = os.path.join(un_tar_dir, os.path.relpath(file1_path, self.test_dir))
2671
+ expected_untarred_file2 = os.path.join(un_tar_dir, os.path.relpath(file2_path, self.test_dir))
2672
+
2673
+ assert os.path.exists(expected_untarred_file1)
2674
+ with open(expected_untarred_file1, 'r') as f: assert f.read() == content1
2675
+ assert os.path.exists(expected_untarred_file2)
2676
+ with open(expected_untarred_file2, 'r') as f: assert f.read() == content2
2677
+
2678
+
2679
+ def test_parallel_zip_individual_files_and_folders(self):
2680
+ """
2681
+ Test parallel_zip_files, parallel_zip_folders, and zip_each_file_in_folder.
2682
+ """
2683
+
2684
+ file1_to_zip = os.path.join(self.test_dir, "pz_file1.txt")
2685
+ file2_to_zip = os.path.join(self.test_dir, "pz_file2.txt")
2686
+ with open(file1_to_zip, 'w') as f: f.write("pz_content1")
2687
+ with open(file2_to_zip, 'w') as f: f.write("pz_content2")
2688
+
2689
+ parallel_zip_files([file1_to_zip, file2_to_zip], max_workers=1, overwrite=True)
2690
+ assert os.path.exists(file1_to_zip + ".zip")
2691
+ assert os.path.exists(file2_to_zip + ".zip")
2692
+ unzip_dir_pz = os.path.join(self.test_dir, "unzip_pz")
2693
+ unzip_file(file1_to_zip + ".zip", unzip_dir_pz)
2694
+ assert os.path.exists(os.path.join(unzip_dir_pz, os.path.basename(file1_to_zip)))
2695
+
2696
+ folder1_to_zip = os.path.join(self.test_dir, "pz_folder1")
2697
+ os.makedirs(folder1_to_zip, exist_ok=True)
2698
+ with open(os.path.join(folder1_to_zip, "pf1.txt"), 'w') as f: f.write("pf1_content")
2699
+ folder2_to_zip = os.path.join(self.test_dir, "pz_folder2")
2700
+ os.makedirs(folder2_to_zip, exist_ok=True)
2701
+ with open(os.path.join(folder2_to_zip, "pf2.txt"), 'w') as f: f.write("pf2_content")
2702
+
2703
+ parallel_zip_folders([folder1_to_zip, folder2_to_zip], max_workers=1, overwrite=True)
2704
+ assert os.path.exists(folder1_to_zip + ".zip")
2705
+ assert os.path.exists(folder2_to_zip + ".zip")
2706
+ unzip_dir_pzf = os.path.join(self.test_dir, "unzip_pzf")
2707
+ unzip_file(folder1_to_zip + ".zip", unzip_dir_pzf)
2708
+ assert os.path.exists(os.path.join(unzip_dir_pzf, "pf1.txt"))
2709
+
2710
+ zef_folder = os.path.join(self.test_dir, "zef_test_folder")
2711
+ os.makedirs(zef_folder, exist_ok=True)
2712
+ zef_file1 = os.path.join(zef_folder, "zef1.txt")
2713
+ zef_file2_png = os.path.join(zef_folder, "zef2.png")
2714
+ zef_file3_zip = os.path.join(zef_folder, "zef3.zip")
2715
+ zef_subdir = os.path.join(zef_folder, "zef_sub")
2716
+ os.makedirs(zef_subdir, exist_ok=True)
2717
+ zef_file_in_sub = os.path.join(zef_subdir, "zef_subfile.txt")
2718
+
2719
+ for p_path in [zef_file1, zef_file2_png, zef_file3_zip, zef_file_in_sub]:
2720
+ with open(p_path, 'w') as f: f.write(f"content of {os.path.basename(p_path)}")
2721
+
2722
+ zip_each_file_in_folder(zef_folder, recursive=False, max_workers=1, overwrite=True)
2723
+ assert os.path.exists(zef_file1 + ".zip")
2724
+ assert os.path.exists(zef_file2_png + ".zip")
2725
+ assert not os.path.exists(zef_file3_zip + ".zip")
2726
+ assert not os.path.exists(zef_file_in_sub + ".zip")
2727
+
2728
+ if os.path.exists(zef_file1 + ".zip"): os.remove(zef_file1 + ".zip")
2729
+ if os.path.exists(zef_file2_png + ".zip"): os.remove(zef_file2_png + ".zip")
2730
+
2731
+ zip_each_file_in_folder(zef_folder, recursive=True, max_workers=1, overwrite=True)
2732
+ assert os.path.exists(zef_file1 + ".zip")
2733
+ assert os.path.exists(zef_file2_png + ".zip")
2734
+ assert not os.path.exists(zef_file3_zip + ".zip")
2735
+ assert os.path.exists(zef_file_in_sub + ".zip")
2736
+
2737
+ if os.path.exists(zef_file1 + ".zip"): os.remove(zef_file1 + ".zip")
2738
+ if os.path.exists(zef_file2_png + ".zip"): os.remove(zef_file2_png + ".zip")
2739
+ if os.path.exists(zef_file_in_sub + ".zip"): os.remove(zef_file_in_sub + ".zip")
2740
+ zip_each_file_in_folder(zef_folder, recursive=True, required_token="zef1", max_workers=1, overwrite=True)
2741
+ assert os.path.exists(zef_file1 + ".zip")
2742
+ assert not os.path.exists(zef_file2_png + ".zip")
2743
+ assert not os.path.exists(zef_file_in_sub + ".zip")
2744
+
2745
+ if os.path.exists(zef_file1 + ".zip"): os.remove(zef_file1 + ".zip")
2746
+ dummy_to_zip = os.path.join(zef_folder,"dummy.txt")
2747
+ with open(dummy_to_zip,'w') as f: f.write('d')
2748
+ zip_each_file_in_folder(zef_folder, recursive=False, exclude_zip=False, max_workers=1, overwrite=True)
2749
+ assert os.path.exists(dummy_to_zip + ".zip")
2750
+ assert os.path.exists(zef_file3_zip + ".zip")
2751
+ if os.path.exists(dummy_to_zip + ".zip"): os.remove(dummy_to_zip + ".zip")
2752
+ if os.path.exists(zef_file3_zip + ".zip"): os.remove(zef_file3_zip + ".zip")
2753
+
2754
+
2755
+ def test_compute_file_hash(self):
2756
+ """
2757
+ Test compute_file_hash and parallel_compute_file_hashes.
2758
+ """
2759
+
2760
+ file1_name = "hash_me1.txt"
2761
+ file1_path = os.path.join(self.test_dir, file1_name)
2762
+ content1 = "This is a test string for hashing."
2763
+ with open(file1_path, 'w') as f:
2764
+ f.write(content1)
2765
+
2766
+ file2_name = "hash_me2.txt"
2767
+ file2_path = os.path.join(self.test_dir, file2_name)
2768
+ with open(file2_path, 'w') as f:
2769
+ f.write(content1)
2770
+
2771
+ file3_name = "hash_me3.txt"
2772
+ file3_path = os.path.join(self.test_dir, file3_name)
2773
+ content3 = "This is a different test string for hashing."
2774
+ with open(file3_path, 'w') as f:
2775
+ f.write(content3)
2776
+
2777
+ expected_hash_content1_sha256 = \
2778
+ "c56f19d76df6a09e49fe0d9ce7b1bc7f1dbd582f668742bede65c54c47d5bcf4".lower()
2779
+ expected_hash_content3_sha256 = \
2780
+ "23013ff7e93264317f7b2fc0e9a217649f2dc0b11ca7e0bd49632424b70b6680".lower()
2781
+
2782
+ hash1 = compute_file_hash(file1_path)
2783
+ hash2 = compute_file_hash(file2_path)
2784
+ hash3 = compute_file_hash(file3_path)
2785
+ assert hash1 == expected_hash_content1_sha256
2786
+ assert hash2 == expected_hash_content1_sha256
2787
+ assert hash1 != hash3
2788
+ assert hash3 == expected_hash_content3_sha256
2789
+
2790
+ expected_hash_content1_md5 = "94b971f1f8cdb23c2af82af73160d4b0".lower()
2791
+ hash1_md5 = compute_file_hash(file1_path, algorithm='md5')
2792
+ assert hash1_md5 == expected_hash_content1_md5
2793
+
2794
+ non_existent_path = os.path.join(self.test_dir, "no_such_file.txt")
2795
+ assert compute_file_hash(non_existent_path, allow_failures=True) is None
2796
+ try:
2797
+ compute_file_hash(non_existent_path, allow_failures=False)
2798
+ raise AssertionError("FileNotFoundError not raised for compute_file_hash")
2799
+ except FileNotFoundError:
2800
+ pass
2801
+
2802
+ files_to_hash = [file1_path, file3_path, non_existent_path]
2803
+ hashes_parallel = parallel_compute_file_hashes(files_to_hash, max_workers=1)
2804
+
2805
+ norm_f1 = file1_path.replace('\\','/')
2806
+ norm_f3 = file3_path.replace('\\','/')
2807
+ norm_non = non_existent_path.replace('\\','/')
2808
+
2809
+ expected_parallel_hashes = {
2810
+ norm_f1: expected_hash_content1_sha256,
2811
+ norm_f3: expected_hash_content3_sha256,
2812
+ norm_non: None
2813
+ }
2814
+ hashes_parallel_norm = {k.replace('\\','/'): v for k,v in hashes_parallel.items()}
2815
+ assert hashes_parallel_norm == expected_parallel_hashes
2816
+
2817
+ hash_folder = os.path.join(self.test_dir, "hash_test_folder")
2818
+ os.makedirs(hash_folder, exist_ok=True)
2819
+ h_f1_name = "h_f1.txt"; h_f1_path = os.path.join(hash_folder, h_f1_name)
2820
+ h_f2_name = "h_f2.txt"; h_f2_path = os.path.join(hash_folder, h_f2_name)
2821
+ with open(h_f1_path, 'w') as f: f.write(content1)
2822
+ with open(h_f2_path, 'w') as f: f.write(content3)
2823
+
2824
+ hashes_folder_parallel = parallel_compute_file_hashes(hash_folder, recursive=False, max_workers=1)
2825
+ norm_hf1 = h_f1_path.replace('\\','/')
2826
+ norm_hf2 = h_f2_path.replace('\\','/')
2827
+ expected_folder_hashes = {
2828
+ norm_hf1: expected_hash_content1_sha256,
2829
+ norm_hf2: expected_hash_content3_sha256
2830
+ }
2831
+ hashes_folder_parallel_norm = {k.replace('\\','/'): v for k,v in hashes_folder_parallel.items()}
2832
+ assert hashes_folder_parallel_norm == expected_folder_hashes
2833
+
2834
+
2835
def test_path_utils():
    """
    Runs all tests in the TestPathUtils class.
    """

    instance = TestPathUtils()
    instance.set_up()

    try:

        # Run every test method, in the same order as before
        for run_test in (
                instance.test_is_image_file,
                instance.test_find_image_strings,
                instance.test_find_images,
                instance.test_recursive_file_list_and_file_list,
                instance.test_folder_list,
                instance.test_folder_summary,
                instance.test_fileparts,
                instance.test_insert_before_extension,
                instance.test_split_path,
                instance.test_path_is_abs,
                instance.test_safe_create_link_unix,
                instance.test_remove_empty_folders,
                instance.test_path_join,
                instance.test_filename_cleaning,
                instance.test_is_executable,
                instance.test_write_read_list_to_file,
                instance.test_parallel_copy_files,
                instance.test_get_file_sizes,
                instance.test_zip_file_and_unzip_file,
                instance.test_zip_folder,
                instance.test_zip_files_into_single_zipfile,
                instance.test_add_files_to_single_tar_file,
                instance.test_parallel_zip_individual_files_and_folders,
                instance.test_compute_file_hash):
            run_test()

    finally:

        instance.tear_down()