megadetector 5.0.28-py3-none-any.whl → 5.0.29-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of megadetector might be problematic.

Files changed (176)
  1. megadetector/api/batch_processing/api_core/batch_service/score.py +4 -5
  2. megadetector/api/batch_processing/api_core_support/aggregate_results_manually.py +1 -1
  3. megadetector/api/batch_processing/api_support/summarize_daily_activity.py +1 -1
  4. megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +2 -2
  5. megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +1 -1
  6. megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +1 -1
  7. megadetector/api/synchronous/api_core/tests/load_test.py +2 -3
  8. megadetector/classification/aggregate_classifier_probs.py +3 -3
  9. megadetector/classification/analyze_failed_images.py +5 -5
  10. megadetector/classification/cache_batchapi_outputs.py +5 -5
  11. megadetector/classification/create_classification_dataset.py +11 -12
  12. megadetector/classification/crop_detections.py +10 -10
  13. megadetector/classification/csv_to_json.py +8 -8
  14. megadetector/classification/detect_and_crop.py +13 -15
  15. megadetector/classification/evaluate_model.py +7 -7
  16. megadetector/classification/identify_mislabeled_candidates.py +6 -6
  17. megadetector/classification/json_to_azcopy_list.py +1 -1
  18. megadetector/classification/json_validator.py +29 -32
  19. megadetector/classification/map_classification_categories.py +9 -9
  20. megadetector/classification/merge_classification_detection_output.py +12 -9
  21. megadetector/classification/prepare_classification_script.py +19 -19
  22. megadetector/classification/prepare_classification_script_mc.py +23 -23
  23. megadetector/classification/run_classifier.py +4 -4
  24. megadetector/classification/save_mislabeled.py +6 -6
  25. megadetector/classification/train_classifier.py +1 -1
  26. megadetector/classification/train_classifier_tf.py +9 -9
  27. megadetector/classification/train_utils.py +10 -10
  28. megadetector/data_management/annotations/annotation_constants.py +1 -1
  29. megadetector/data_management/camtrap_dp_to_coco.py +45 -45
  30. megadetector/data_management/cct_json_utils.py +101 -101
  31. megadetector/data_management/cct_to_md.py +49 -49
  32. megadetector/data_management/cct_to_wi.py +33 -33
  33. megadetector/data_management/coco_to_labelme.py +75 -75
  34. megadetector/data_management/coco_to_yolo.py +189 -189
  35. megadetector/data_management/databases/add_width_and_height_to_db.py +3 -2
  36. megadetector/data_management/databases/combine_coco_camera_traps_files.py +38 -38
  37. megadetector/data_management/databases/integrity_check_json_db.py +202 -188
  38. megadetector/data_management/databases/subset_json_db.py +33 -33
  39. megadetector/data_management/generate_crops_from_cct.py +38 -38
  40. megadetector/data_management/get_image_sizes.py +54 -49
  41. megadetector/data_management/labelme_to_coco.py +130 -124
  42. megadetector/data_management/labelme_to_yolo.py +78 -72
  43. megadetector/data_management/lila/create_lila_blank_set.py +81 -83
  44. megadetector/data_management/lila/create_lila_test_set.py +32 -31
  45. megadetector/data_management/lila/create_links_to_md_results_files.py +18 -18
  46. megadetector/data_management/lila/download_lila_subset.py +21 -24
  47. megadetector/data_management/lila/generate_lila_per_image_labels.py +91 -91
  48. megadetector/data_management/lila/get_lila_annotation_counts.py +30 -30
  49. megadetector/data_management/lila/get_lila_image_counts.py +22 -22
  50. megadetector/data_management/lila/lila_common.py +70 -70
  51. megadetector/data_management/lila/test_lila_metadata_urls.py +13 -14
  52. megadetector/data_management/mewc_to_md.py +339 -340
  53. megadetector/data_management/ocr_tools.py +258 -252
  54. megadetector/data_management/read_exif.py +231 -224
  55. megadetector/data_management/remap_coco_categories.py +26 -26
  56. megadetector/data_management/remove_exif.py +31 -20
  57. megadetector/data_management/rename_images.py +187 -187
  58. megadetector/data_management/resize_coco_dataset.py +41 -41
  59. megadetector/data_management/speciesnet_to_md.py +41 -41
  60. megadetector/data_management/wi_download_csv_to_coco.py +55 -55
  61. megadetector/data_management/yolo_output_to_md_output.py +117 -120
  62. megadetector/data_management/yolo_to_coco.py +195 -188
  63. megadetector/detection/change_detection.py +831 -0
  64. megadetector/detection/process_video.py +340 -337
  65. megadetector/detection/pytorch_detector.py +304 -262
  66. megadetector/detection/run_detector.py +177 -164
  67. megadetector/detection/run_detector_batch.py +364 -363
  68. megadetector/detection/run_inference_with_yolov5_val.py +328 -325
  69. megadetector/detection/run_tiled_inference.py +256 -249
  70. megadetector/detection/tf_detector.py +24 -24
  71. megadetector/detection/video_utils.py +290 -282
  72. megadetector/postprocessing/add_max_conf.py +15 -11
  73. megadetector/postprocessing/categorize_detections_by_size.py +44 -44
  74. megadetector/postprocessing/classification_postprocessing.py +415 -415
  75. megadetector/postprocessing/combine_batch_outputs.py +20 -21
  76. megadetector/postprocessing/compare_batch_results.py +528 -517
  77. megadetector/postprocessing/convert_output_format.py +97 -97
  78. megadetector/postprocessing/create_crop_folder.py +219 -146
  79. megadetector/postprocessing/detector_calibration.py +173 -168
  80. megadetector/postprocessing/generate_csv_report.py +508 -499
  81. megadetector/postprocessing/load_api_results.py +23 -20
  82. megadetector/postprocessing/md_to_coco.py +129 -98
  83. megadetector/postprocessing/md_to_labelme.py +89 -83
  84. megadetector/postprocessing/md_to_wi.py +40 -40
  85. megadetector/postprocessing/merge_detections.py +87 -114
  86. megadetector/postprocessing/postprocess_batch_results.py +313 -298
  87. megadetector/postprocessing/remap_detection_categories.py +36 -36
  88. megadetector/postprocessing/render_detection_confusion_matrix.py +205 -199
  89. megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +57 -57
  90. megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +27 -28
  91. megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +702 -677
  92. megadetector/postprocessing/separate_detections_into_folders.py +226 -211
  93. megadetector/postprocessing/subset_json_detector_output.py +265 -262
  94. megadetector/postprocessing/top_folders_to_bottom.py +45 -45
  95. megadetector/postprocessing/validate_batch_results.py +70 -70
  96. megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +52 -52
  97. megadetector/taxonomy_mapping/map_new_lila_datasets.py +15 -15
  98. megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +14 -14
  99. megadetector/taxonomy_mapping/preview_lila_taxonomy.py +66 -66
  100. megadetector/taxonomy_mapping/retrieve_sample_image.py +16 -16
  101. megadetector/taxonomy_mapping/simple_image_download.py +8 -8
  102. megadetector/taxonomy_mapping/species_lookup.py +33 -33
  103. megadetector/taxonomy_mapping/taxonomy_csv_checker.py +14 -14
  104. megadetector/taxonomy_mapping/taxonomy_graph.py +10 -10
  105. megadetector/taxonomy_mapping/validate_lila_category_mappings.py +13 -13
  106. megadetector/utils/azure_utils.py +22 -22
  107. megadetector/utils/ct_utils.py +1018 -200
  108. megadetector/utils/directory_listing.py +21 -77
  109. megadetector/utils/gpu_test.py +22 -22
  110. megadetector/utils/md_tests.py +541 -518
  111. megadetector/utils/path_utils.py +1457 -398
  112. megadetector/utils/process_utils.py +41 -41
  113. megadetector/utils/sas_blob_utils.py +53 -49
  114. megadetector/utils/split_locations_into_train_val.py +61 -61
  115. megadetector/utils/string_utils.py +147 -26
  116. megadetector/utils/url_utils.py +463 -173
  117. megadetector/utils/wi_utils.py +2629 -2526
  118. megadetector/utils/write_html_image_list.py +137 -137
  119. megadetector/visualization/plot_utils.py +21 -21
  120. megadetector/visualization/render_images_with_thumbnails.py +37 -73
  121. megadetector/visualization/visualization_utils.py +401 -397
  122. megadetector/visualization/visualize_db.py +197 -190
  123. megadetector/visualization/visualize_detector_output.py +79 -73
  124. {megadetector-5.0.28.dist-info → megadetector-5.0.29.dist-info}/METADATA +135 -132
  125. megadetector-5.0.29.dist-info/RECORD +163 -0
  126. {megadetector-5.0.28.dist-info → megadetector-5.0.29.dist-info}/WHEEL +1 -1
  127. {megadetector-5.0.28.dist-info → megadetector-5.0.29.dist-info}/licenses/LICENSE +0 -0
  128. {megadetector-5.0.28.dist-info → megadetector-5.0.29.dist-info}/top_level.txt +0 -0
  129. megadetector/data_management/importers/add_nacti_sizes.py +0 -52
  130. megadetector/data_management/importers/add_timestamps_to_icct.py +0 -79
  131. megadetector/data_management/importers/animl_results_to_md_results.py +0 -158
  132. megadetector/data_management/importers/auckland_doc_test_to_json.py +0 -373
  133. megadetector/data_management/importers/auckland_doc_to_json.py +0 -201
  134. megadetector/data_management/importers/awc_to_json.py +0 -191
  135. megadetector/data_management/importers/bellevue_to_json.py +0 -272
  136. megadetector/data_management/importers/cacophony-thermal-importer.py +0 -793
  137. megadetector/data_management/importers/carrizo_shrubfree_2018.py +0 -269
  138. megadetector/data_management/importers/carrizo_trail_cam_2017.py +0 -289
  139. megadetector/data_management/importers/cct_field_adjustments.py +0 -58
  140. megadetector/data_management/importers/channel_islands_to_cct.py +0 -913
  141. megadetector/data_management/importers/eMammal/copy_and_unzip_emammal.py +0 -180
  142. megadetector/data_management/importers/eMammal/eMammal_helpers.py +0 -249
  143. megadetector/data_management/importers/eMammal/make_eMammal_json.py +0 -223
  144. megadetector/data_management/importers/ena24_to_json.py +0 -276
  145. megadetector/data_management/importers/filenames_to_json.py +0 -386
  146. megadetector/data_management/importers/helena_to_cct.py +0 -283
  147. megadetector/data_management/importers/idaho-camera-traps.py +0 -1407
  148. megadetector/data_management/importers/idfg_iwildcam_lila_prep.py +0 -294
  149. megadetector/data_management/importers/import_desert_lion_conservation_camera_traps.py +0 -387
  150. megadetector/data_management/importers/jb_csv_to_json.py +0 -150
  151. megadetector/data_management/importers/mcgill_to_json.py +0 -250
  152. megadetector/data_management/importers/missouri_to_json.py +0 -490
  153. megadetector/data_management/importers/nacti_fieldname_adjustments.py +0 -79
  154. megadetector/data_management/importers/noaa_seals_2019.py +0 -181
  155. megadetector/data_management/importers/osu-small-animals-to-json.py +0 -364
  156. megadetector/data_management/importers/pc_to_json.py +0 -365
  157. megadetector/data_management/importers/plot_wni_giraffes.py +0 -123
  158. megadetector/data_management/importers/prepare_zsl_imerit.py +0 -131
  159. megadetector/data_management/importers/raic_csv_to_md_results.py +0 -416
  160. megadetector/data_management/importers/rspb_to_json.py +0 -356
  161. megadetector/data_management/importers/save_the_elephants_survey_A.py +0 -320
  162. megadetector/data_management/importers/save_the_elephants_survey_B.py +0 -329
  163. megadetector/data_management/importers/snapshot_safari_importer.py +0 -758
  164. megadetector/data_management/importers/snapshot_serengeti_lila.py +0 -1067
  165. megadetector/data_management/importers/snapshotserengeti/make_full_SS_json.py +0 -150
  166. megadetector/data_management/importers/snapshotserengeti/make_per_season_SS_json.py +0 -153
  167. megadetector/data_management/importers/sulross_get_exif.py +0 -65
  168. megadetector/data_management/importers/timelapse_csv_set_to_json.py +0 -490
  169. megadetector/data_management/importers/ubc_to_json.py +0 -399
  170. megadetector/data_management/importers/umn_to_json.py +0 -507
  171. megadetector/data_management/importers/wellington_to_json.py +0 -263
  172. megadetector/data_management/importers/wi_to_json.py +0 -442
  173. megadetector/data_management/importers/zamba_results_to_md_results.py +0 -180
  174. megadetector/data_management/lila/add_locations_to_island_camera_traps.py +0 -101
  175. megadetector/data_management/lila/add_locations_to_nacti.py +0 -151
  176. megadetector-5.0.28.dist-info/RECORD +0 -209
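
The per-line diff below covers megadetector/utils/path_utils.py (+1457 −398); many of its hunks are trailing-whitespace cleanups, so some removed/added line pairs read identically.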
@@ -24,6 +24,7 @@ import tarfile
 import webbrowser
 import subprocess
 import re
+import tempfile

 from zipfile import ZipFile
 from datetime import datetime
@@ -34,6 +35,7 @@ from shutil import which
 from tqdm import tqdm

 from megadetector.utils.ct_utils import is_iterable
+from megadetector.utils.ct_utils import make_test_folder
 from megadetector.utils.ct_utils import sort_dictionary_by_value

 # Should all be lower-case
@@ -47,14 +49,14 @@ CHAR_LIMIT = 255

 #%% General path functions

-def recursive_file_list(base_dir,
-                        convert_slashes=True,
-                        return_relative_paths=False,
+def recursive_file_list(base_dir,
+                        convert_slashes=True,
+                        return_relative_paths=False,
                         sort_files=True,
                         recursive=True):
     r"""
     Enumerates files (not directories) in [base_dir].
-
+
     Args:
         base_dir (str): folder to enumerate
         convert_slashes (bool, optional): force forward slashes; if this is False, will use
@@ -64,15 +66,15 @@ def recursive_file_list(base_dir,
         sort_files (bool, optional): force files to be sorted, otherwise uses the sorting
             provided by os.walk()
         recursive (bool, optional): enumerate recursively
-
+
     Returns:
         list: list of filenames
     """
-
+
     assert os.path.isdir(base_dir), '{} is not a folder'.format(base_dir)
-
+
     all_files = []
-
+
     if recursive:
         for root, _, filenames in os.walk(base_dir):
             for filename in filenames:
@@ -82,29 +84,29 @@
         all_files_relative = os.listdir(base_dir)
         all_files = [os.path.join(base_dir,fn) for fn in all_files_relative]
         all_files = [fn for fn in all_files if os.path.isfile(fn)]
-
+
     if return_relative_paths:
         all_files = [os.path.relpath(fn,base_dir) for fn in all_files]

     if convert_slashes:
         all_files = [fn.replace('\\', '/') for fn in all_files]
-
+
     if sort_files:
         all_files = sorted(all_files)
-
+
     return all_files


-def file_list(base_dir,
+def file_list(base_dir,
               convert_slashes=True,
-              return_relative_paths=False,
-              sort_files=True,
+              return_relative_paths=False,
+              sort_files=True,
               recursive=False):
     """
-    Trivial wrapper for recursive_file_list, which was a poor function name choice
-    at the time, since I later wanted to add non-recursive lists, but it doesn't
+    Trivial wrapper for recursive_file_list, which was a poor function name choice
+    at the time, since I later wanted to add non-recursive lists, but it doesn't
     make sense to have a "recursive" option in a function called "recursive_file_list".
-
+
     Args:
         base_dir (str): folder to enumerate
         convert_slashes (bool, optional): force forward slashes; if this is False, will use
@@ -114,11 +116,11 @@ def file_list(base_dir,
         sort_files (bool, optional): force files to be sorted, otherwise uses the sorting
             provided by os.walk()
         recursive (bool, optional): enumerate recursively
-
+
     Returns:
-        list: list of filenames
+        list: list of filenames
     """
-
+
     return recursive_file_list(base_dir,convert_slashes,return_relative_paths,sort_files,
                                recursive=recursive)

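For orientation, a minimal usage sketch of the two enumeration functions above (the folder path is hypothetical):

    from megadetector.utils.path_utils import file_list, recursive_file_list

    # Recursive by default; returns sorted paths with forward slashes
    all_files = recursive_file_list('/data/camera-traps')

    # The wrapper defaults to non-recursive enumeration
    top_level_files = file_list('/data/camera-traps', return_relative_paths=True)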
@@ -128,10 +130,9 @@ def folder_list(base_dir,
                 return_relative_paths=False,
                 sort_folders=True,
                 recursive=False):
-
     """
     Enumerates folders (not files) in [base_dir].
-
+
     Args:
         base_dir (str): folder to enumerate
         convert_slashes (bool, optional): force forward slashes; if this is False, will use
@@ -141,81 +142,81 @@
         sort_files (bool, optional): force folders to be sorted, otherwise uses the sorting
             provided by os.walk()
         recursive (bool, optional): enumerate recursively
-
+
     Returns:
         list: list of folder names
     """
-
+
     assert os.path.isdir(base_dir), '{} is not a folder'.format(base_dir)
-
+
     folders = []

-    if recursive:
+    if recursive:
         folders = []
         for root, dirs, _ in os.walk(base_dir):
             for d in dirs:
-                folders.append(os.path.join(root, d))
+                folders.append(os.path.join(root, d))
     else:
         folders = os.listdir(base_dir)
         folders = [os.path.join(base_dir,fn) for fn in folders]
         folders = [fn for fn in folders if os.path.isdir(fn)]
-
+
     if return_relative_paths:
         folders = [os.path.relpath(fn,base_dir) for fn in folders]

     if convert_slashes:
         folders = [fn.replace('\\', '/') for fn in folders]
-
+
     if sort_folders:
-        folders = sorted(folders)
-
+        folders = sorted(folders)
+
     return folders


 def folder_summary(folder,print_summary=True):
     """
     Returns (and optionally prints) a summary of [folder], including:
-
+
     * The total number of files
     * The total number of folders
-    * The number of files for each extension
-
+    * The number of files for each extension
+
     Args:
         folder (str): folder to summarize
         print_summary (bool, optional): whether to print the summary
-
+
     Returns:
         dict: with fields "n_files", "n_folders", and "extension_to_count"
     """
-
+
     assert os.path.isdir(folder), '{} is not a folder'.format(folder)
-
+
     folders_relative = folder_list(folder,return_relative_paths=True,recursive=True)
     files_relative = file_list(folder,return_relative_paths=True,recursive=True)
-
+
     extension_to_count = defaultdict(int)
-
+
     for fn in files_relative:
         ext = os.path.splitext(fn)[1]
         extension_to_count[ext] += 1
-
+
     extension_to_count = sort_dictionary_by_value(extension_to_count,reverse=True)
-
+
     if print_summary:
         for extension in extension_to_count.keys():
             print('{}: {}'.format(extension,extension_to_count[extension]))
         print('')
         print('Total files: {}'.format(len(files_relative)))
         print('Total folders: {}'.format(len(folders_relative)))
-
+
     to_return = {}
     to_return['n_files'] = len(files_relative)
     to_return['n_folders'] = len(folders_relative)
-    to_return['extension_to_count'] = extension_to_count
-
+    to_return['extension_to_count'] = extension_to_count
+
     return to_return
-
-
+
+
 def fileparts(path):
     r"""
     Breaks down a path into the directory path, filename, and extension.
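A short sketch of folder_summary(), whose return fields appear in the docstring above (the path is hypothetical):

    from megadetector.utils.path_utils import folder_summary

    summary = folder_summary('/data/camera-traps', print_summary=False)
    print(summary['n_files'], summary['n_folders'])
    # extension_to_count maps extensions (e.g. '.jpg') to counts, sorted descending
    print(summary['extension_to_count'])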
@@ -223,25 +224,25 @@ def fileparts(path):
     Note that the '.' lives with the extension, and separators are removed.

     Examples:
-
+
     .. code-block:: none

-        >>> fileparts('file')
+        >>> fileparts('file')
         ('', 'file', '')
         >>> fileparts(r'c:/dir/file.jpg')
         ('c:/dir', 'file', '.jpg')
         >>> fileparts('/dir/subdir/file.jpg')
-        ('/dir/subdir', 'file', '.jpg')
+        ('/dir/subdir', 'file', '.jpg')

     Args:
         path (str): path name to separate into parts
     Returns:
-        tuple: tuple containing (p,n,e):
+        tuple: tuple containing (p,n,e):
             - p: str, directory path
             - n: str, filename without extension
             - e: str, extension including the '.'
     """
-
+
     # ntpath seems to do the right thing for both Windows and Unix paths
     p = ntpath.dirname(path)
     basename = ntpath.basename(path)
@@ -257,27 +258,27 @@ def insert_before_extension(filename, s=None, separator='.'):
     appends [s].

     Examples:
-
+
     .. code-block:: none
-
+
         >>> insert_before_extension('/dir/subdir/file.ext', 'insert')
         '/dir/subdir/file.insert.ext'
         >>> insert_before_extension('/dir/subdir/file', 'insert')
         '/dir/subdir/file.insert'
         >>> insert_before_extension('/dir/subdir/file')
         '/dir/subdir/file.2020.07.20.10.54.38'
-
+
     Args:
         filename (str): filename to manipulate
         s (str, optional): string to insert before the extension in [filename], or
             None to insert a datestamp
         separator (str, optional): separator to place between the filename base
             and the inserted string
-
+
     Returns:
         str: modified string
     """
-
+
     assert len(filename) > 0
     if s is None or len(s) == 0:
         s = datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
@@ -290,9 +291,9 @@ def split_path(path):
     Splits [path] into all its constituent file/folder tokens.

     Examples:
-
+
     .. code-block:: none
-
+
         >>> split_path(r'c:\dir\subdir\file.txt')
         ['c:\\', 'dir', 'subdir', 'file.txt']
         >>> split_path('/dir/subdir/file.jpg')
@@ -301,13 +302,19 @@
         ['c:\\']
         >>> split_path('/')
         ['/']
-
+
     Args:
         path (str): path to split into tokens
-
+
     Returns:
         list: list of path tokens
     """
+
+    # Edge cases
+    if path == '':
+        return ''
+    if path is None:
+        return None

     parts = []
     while True:
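Note that the new guards in split_path() echo degenerate inputs back rather than returning a list: '' yields '' and None yields None. A sketch of the 5.0.29 behavior (expected values follow the diff above):

    from megadetector.utils.path_utils import split_path

    split_path('/dir/subdir/file.jpg')  # ['/', 'dir', 'subdir', 'file.jpg']
    split_path('')                      # '' (a string, not an empty list)
    split_path(None)                    # None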
@@ -325,32 +332,32 @@ def path_is_abs(p):
     """
     Determines whether [p] is an absolute path. An absolute path is defined as
     one that starts with slash, backslash, or a letter followed by a colon.
-
+
     Args:
         p (str): path to evaluate
-
+
     Returns:
         bool: True if [p] is an absolute path, else False
     """
-
+
     return (len(p) > 1) and (p[0] == '/' or p[1] == ':' or p[0] == '\\')


 def safe_create_link(link_exists,link_new):
     """
     Creates a symlink at [link_new] pointing to [link_exists].
-
+
     If [link_new] already exists, make sure it's a link (not a file),
     and if it has a different target than [link_exists], removes and re-creates
     it.
-
+
     Errors if [link_new] already exists but it's not a link.
-
+
     Args:
         link_exists (str): the source of the (possibly-new) symlink
         link_new (str): the target of the (possibly-new) symlink
     """
-
+
     if os.path.exists(link_new) or os.path.islink(link_new):
         assert os.path.islink(link_new)
         if not os.readlink(link_new) == link_exists:
@@ -358,35 +365,35 @@ def safe_create_link(link_exists,link_new):
             os.symlink(link_exists,link_new)
     else:
         os.symlink(link_exists,link_new)
-
+

 def remove_empty_folders(path, remove_root=False):
     """
     Recursively removes empty folders within the specified path.
-
+
     Args:
-        path (str): the folder from which we should recursively remove
+        path (str): the folder from which we should recursively remove
             empty folders.
-        remove_root (bool, optional): whether to remove the root directory if
+        remove_root (bool, optional): whether to remove the root directory if
             it's empty after removing all empty subdirectories. This will always
             be True during recursive calls.
-
+
     Returns:
         bool: True if the directory is empty after processing, False otherwise
     """
-
+
     # Verify that [path] is a directory
     if not os.path.isdir(path):
         return False
-
+
     # Track whether the current directory is empty
     is_empty = True
-
+
     # Iterate through all items in the directory
     for item in os.listdir(path):
-
+
         item_path = os.path.join(path, item)
-
+
         # If it's a directory, process it recursively
         if os.path.isdir(item_path):
             # If the subdirectory is empty after processing, it will be removed
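Usage is straightforward; a minimal sketch (the staging path is hypothetical):

    from megadetector.utils.path_utils import remove_empty_folders

    # Recursively prune empty subfolders; the root itself is kept unless remove_root=True
    remove_empty_folders('/data/staging', remove_root=False)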
@@ -396,118 +403,57 @@ def remove_empty_folders(path, remove_root=False):
         else:
             # If there's a file, the directory is not empty
             is_empty = False
-
+
     # If the directory is empty and we're supposed to remove it
     if is_empty and remove_root:
         try:
-            os.rmdir(path)
+            os.rmdir(path)
         except Exception as e:
             print('Error removing directory {}: {}'.format(path,str(e)))
             is_empty = False
-
+
     return is_empty

 # ...def remove_empty_folders(...)


-def top_level_folder(p):
-    r"""
-    Gets the top-level folder from the path *p*.
-
-    On UNIX, this is straightforward:
-
-        /blah/foo
-
-    ...returns '/blah'
-
-    On Windows, we define this as the top-level folder that isn't the drive, so:
-
-        c:\blah\foo
-
-    ...returns 'c:\blah'.
-
-    Args:
-        p (str): filename to evaluate
-
-    Returns:
-        str: the top-level folder in [p], see above for details on how this is defined
-    """
-
-    if p == '':
-        return ''
-
-    # Path('/blah').parts is ('/','blah')
-    parts = split_path(p)
-
-    if len(parts) == 1:
-        return parts[0]
-
-    # Handle paths like:
-    #
-    # /, \, /stuff, c:, c:\stuff
-    drive = os.path.splitdrive(p)[0]
-    if parts[0] == drive or parts[0] == drive + '/' or parts[0] == drive + '\\' or parts[0] in ['\\', '/']:
-        return os.path.join(parts[0], parts[1])
-    else:
-        return parts[0]
-
-# ...top_level_folder()
-
-
 def path_join(*paths, convert_slashes=True):
     r"""
     Wrapper for os.path.join that optionally converts backslashes to forward slashes.
-
+
     Args:
         *paths (variable-length set of strings): Path components to be joined.
         convert_slashes (bool, optional): whether to convert \\ to /
-
+
     Returns:
         A string with the joined path components.
     """
-
+
     joined_path = os.path.join(*paths)
     if convert_slashes:
         return joined_path.replace('\\', '/')
     else:
         return joined_path

-
-#%% Test driver for top_level_folder
-
-if False:
-
-    #%%

-    p = 'blah/foo/bar'; s = top_level_folder(p); print(s); assert s == 'blah'
-    p = '/blah/foo/bar'; s = top_level_folder(p); print(s); assert s == '/blah'
-    p = 'bar'; s = top_level_folder(p); print(s); assert s == 'bar'
-    p = ''; s = top_level_folder(p); print(s); assert s == ''
-    p = 'c:\\'; s = top_level_folder(p); print(s); assert s == 'c:\\'
-    p = r'c:\blah'; s = top_level_folder(p); print(s); assert s == 'c:\\blah'
-    p = r'c:\foo'; s = top_level_folder(p); print(s); assert s == 'c:\\foo'
-    p = r'c:/foo'; s = top_level_folder(p); print(s); assert s == 'c:/foo'
-    p = r'c:\foo/bar'; s = top_level_folder(p); print(s); assert s == 'c:\\foo'
-
-
 #%% Image-related path functions

 def is_image_file(s, img_extensions=IMG_EXTENSIONS):
     """
     Checks a file's extension against a hard-coded set of image file
     extensions. Uses case-insensitive comparison.
-
+
     Does not check whether the file exists, only determines whether the filename
     implies it's an image file.
-
+
     Args:
         s (str): filename to evaluate for image-ness
         img_extensions (list, optional): list of known image file extensions
-
+
     Returns:
         bool: True if [s] appears to be an image file, else False
     """
-
+
     ext = os.path.splitext(s)[1]
     return ext.lower() in img_extensions

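5.0.29 removes top_level_folder() and its inline test driver. Downstream code that still needs it can reproduce the deleted implementation; the following stand-in is copied from the removed lines above and is not part of 5.0.29:

    import os
    from megadetector.utils.path_utils import split_path

    def top_level_folder(p):
        """Stand-in for the function removed in 5.0.29 (see the deleted lines above)."""
        if p == '':
            return ''
        parts = split_path(p)
        if len(parts) == 1:
            return parts[0]
        # Handle paths like /, \, /stuff, c:, c:\stuff
        drive = os.path.splitdrive(p)[0]
        if parts[0] == drive or parts[0] == drive + '/' or parts[0] == drive + '\\' \
                or parts[0] in ['\\', '/']:
            return os.path.join(parts[0], parts[1])
        else:
            return parts[0]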
@@ -516,27 +462,27 @@ def find_image_strings(strings):
     """
     Given a list of strings that are potentially image file names, looks for
     strings that actually look like image file names (based on extension).
-
+
     Args:
         strings (list): list of filenames to check for image-ness
-
+
     Returns:
         list: the subset of [strings] that appear to be image filenames
     """
-
+
     return [s for s in strings if is_image_file(s)]


-def find_images(dirname,
-                recursive=False,
-                return_relative_paths=False,
+def find_images(dirname,
+                recursive=False,
+                return_relative_paths=False,
                 convert_slashes=True):
     """
     Finds all files in a directory that look like image file names. Returns
     absolute paths unless return_relative_paths is set. Uses the OS-native
     path separator unless convert_slashes is set, in which case will always
     use '/'.
-
+
     Args:
         dirname (str): the folder to search for images
         recursive (bool, optional): whether to search recursively
@@ -547,30 +493,30 @@ def find_images(dirname,
     Returns:
         list: list of image filenames found in [dirname]
     """
-
+
     assert os.path.isdir(dirname), '{} is not a folder'.format(dirname)
-
+
     if recursive:
         strings = glob.glob(os.path.join(dirname, '**', '*.*'), recursive=True)
     else:
         strings = glob.glob(os.path.join(dirname, '*.*'))
-
+
     image_files = find_image_strings(strings)
-
+
     if return_relative_paths:
         image_files = [os.path.relpath(fn,dirname) for fn in image_files]
-
+
     image_files = sorted(image_files)
-
+
     if convert_slashes:
         image_files = [fn.replace('\\', '/') for fn in image_files]
-
+
     return image_files


 #%% Filename cleaning functions

-def clean_filename(filename,
+def clean_filename(filename,
                    allow_list=VALID_FILENAME_CHARS,
                    char_limit=CHAR_LIMIT,
                    force_lower= False):
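For reference, a minimal sketch of find_images() as documented above (the folder is hypothetical):

    from megadetector.utils.path_utils import find_images

    # Returns sorted image filenames, absolute unless return_relative_paths=True
    images = find_images('/data/camera-traps', recursive=True)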
@@ -582,18 +528,18 @@ def clean_filename(filename,

     Adapted from
     https://gist.github.com/wassname/1393c4a57cfcbf03641dbc31886123b8
-
+
     Args:
         filename (str): filename to clean
         allow_list (str, optional): string containing all allowable filename characters
         char_limit (int, optional): maximum allowable filename length, if None will skip this
             step
         force_lower (bool, optional): convert the resulting filename to lowercase
-
-    returns:
-        str: cleaned version of [filename]
+
+    Returns:
+        str: cleaned version of [filename]
     """
-
+
     # keep only valid ascii chars
     cleaned_filename = (unicodedata.normalize('NFKD', filename)
                         .encode('ASCII', 'ignore').decode())
@@ -607,26 +553,26 @@
     return cleaned_filename


-def clean_path(pathname,
+def clean_path(pathname,
                allow_list=VALID_PATH_CHARS,
                char_limit=CHAR_LIMIT,
                force_lower=False):
     """
     Removes non-ASCII and other invalid path characters (on any reasonable
     OS) from a path, then optionally trims to a maximum length.
-
+
     Args:
         pathname (str): path name to clean
         allow_list (str, optional): string containing all allowable filename characters
         char_limit (int, optional): maximum allowable filename length, if None will skip this
             step
         force_lower (bool, optional): convert the resulting filename to lowercase
-
-    returns:
-        str: cleaned version of [filename]
+
+    Returns:
+        str: cleaned version of [filename]
     """
-
-    return clean_filename(pathname, allow_list=allow_list,
+
+    return clean_filename(pathname, allow_list=allow_list,
                           char_limit=char_limit, force_lower=force_lower)


@@ -635,34 +581,34 @@ def flatten_path(pathname,separator_chars=SEPARATOR_CHARS,separator_char_replace
     Removes non-ASCII and other invalid path characters (on any reasonable
     OS) from a path, then trims to a maximum length. Replaces all valid
     separators with [separator_char_replacement.]
-
+
     Args:
         pathname (str): path name to flatten
         separator_chars (str, optional): string containing all known path separators
-        separator_char_replacement (str, optional): string to insert in place of
+        separator_char_replacement (str, optional): string to insert in place of
             path separators.
-
+
     Returns:
         str: flattened version of [pathname]
     """
-
+
     s = clean_path(pathname)
     for c in separator_chars:
         s = s.replace(c, separator_char_replacement)
     return s


-def is_executable(filename):
+def is_executable(filename):
     """
     Checks whether [filename] is on the system path and marked as executable.
-
+
     Args:
         filename (str): filename to check for executable status
-
+
     Returns:
         bool: True if [filename] is on the system path and marked as executable, otherwise False
     """
-
+
     # https://stackoverflow.com/questions/11210104/check-if-a-program-exists-from-a-python-script

     return which(filename) is not None
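A sketch of the filename-cleaning helpers above (the inputs and the '~' replacement character are illustrative):

    from megadetector.utils.path_utils import clean_filename, flatten_path, is_executable

    safe = clean_filename('rep@rt 2024?.csv')   # drops characters outside the allow list
    flat = flatten_path('2024/site-a/img001.jpg',
                        separator_char_replacement='~')  # path separators become '~'
    have_ffmpeg = is_executable('ffmpeg')       # True if ffmpeg is on the system path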
@@ -673,247 +619,247 @@ def is_executable(filename):
 def environment_is_wsl():
     """
     Determines whether we're running in WSL.
-
+
     Returns:
-        True if we're running in WSL.
+        True if we're running in WSL.
     """
-
+
     if sys.platform not in ('linux','posix'):
         return False
     platform_string = ' '.join(platform.uname()).lower()
     return 'microsoft' in platform_string and 'wsl' in platform_string
-
+

 def wsl_path_to_windows_path(filename, failure_behavior='none'):
     r"""
     Converts a WSL path to a Windows path. For example, converts:
-
+
         /mnt/e/a/b/c
-
+
     ...to:
-
+
         e:\a\b\c
-
+
     Args:
         filename (str): filename to convert
         failure_behavior (str): what to do if the path can't be processed as a WSL path.
             'none' to return None in this case, 'original' to return the original path.
-
+
     Returns:
         str: Windows equivalent to the WSL path [filename]
     """
-
+
     assert failure_behavior in ('none','original'), \
         'Unrecognized failure_behavior value {}'.format(failure_behavior)
-
+
     # Check whether the path follows the standard WSL mount pattern
     wsl_path_pattern = r'^/mnt/([a-zA-Z])(/.*)?$'
     match = re.match(wsl_path_pattern, filename)
-
+
     if match:

         # Extract the drive letter and the rest of the path
         drive_letter = match.group(1)
         path_remainder = match.group(2) if match.group(2) else ''
-
+
         # Convert forward slashes to backslashes for Windows
         path_remainder = path_remainder.replace('/', '\\')
-
+
         # Format the Windows path
         windows_path = f"{drive_letter}:{path_remainder}"
         return windows_path
-
+
     if failure_behavior == 'none':
         return None
     else:
         return filename

 # ...def wsl_path_to_windows_path(...)
-
-
+
+
 def windows_path_to_wsl_path(filename, failure_behavior='none'):
     r"""
     Converts a Windows path to a WSL path, or returns None if that's not possible. E.g.
     converts:
-
+
         e:\a\b\c
-
+
     ...to:
-
+
         /mnt/e/a/b/c
-
+
     Args:
         filename (str): filename to convert
         failure_behavior (str): what to do if the path can't be processed as a Windows path.
             'none' to return None in this case, 'original' to return the original path.
-
+
     Returns:
         str: WSL equivalent to the Windows path [filename]
     """
-
+
     assert failure_behavior in ('none','original'), \
         'Unrecognized failure_behavior value {}'.format(failure_behavior)
-
+
     filename = filename.replace('\\', '/')
-
+
     # Check whether the path follows a Windows drive letter pattern
     windows_path_pattern = r'^([a-zA-Z]):(/.*)?$'
     match = re.match(windows_path_pattern, filename)
-
+
     if match:
         # Extract the drive letter and the rest of the path
         drive_letter = match.group(1).lower() # Convert to lowercase for WSL
         path_remainder = match.group(2) if match.group(2) else ''
-
+
         # Format the WSL path
         wsl_path = f"/mnt/{drive_letter}{path_remainder}"
         return wsl_path
-
+
     if failure_behavior == 'none':
         return None
     else:
         return filename
-
+
 # ...def window_path_to_wsl_path(...)


 def open_file_in_chrome(filename):
     """
-    Open a file in chrome, regardless of file type. I typically use this to open
+    Open a file in chrome, regardless of file type. I typically use this to open
     .md files in Chrome.
-
+
     Args:
         filename (str): file to open
-
+
     Return:
         bool: whether the operation was successful
     """
-
+
     # Create URL
     abs_path = os.path.abspath(filename)
-
+
     system = platform.system()
     if system == 'Windows':
         url = f'file:///{abs_path.replace(os.sep, "/")}'
     else: # macOS and Linux
         url = f'file://{abs_path}'
-
+
     # Determine the Chrome path
     if system == 'Windows':
-
+
         # This is a native Python module, but it only exists on Windows
         import winreg
-
+
         chrome_paths = [
             os.path.expanduser("~") + r"\AppData\Local\Google\Chrome\Application\chrome.exe",
             r"C:\Program Files\Google\Chrome\Application\chrome.exe",
             r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe"
         ]
-
+
         # Default approach: run from a typical chrome location
         for path in chrome_paths:
             if os.path.exists(path):
                 subprocess.run([path, url])
                 return True
-
+
         # Method 2: Check registry for Chrome path
         try:
-            with winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE,
+            with winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE,
                                 r"SOFTWARE\Microsoft\Windows\CurrentVersion\App Paths\chrome.exe") as key:
                 chrome_path = winreg.QueryValue(key, None)
                 if chrome_path and os.path.exists(chrome_path):
                     subprocess.run([chrome_path, url])
                     return True
-        except:
+        except Exception:
             pass
-
+
         # Method 3: Try alternate registry location
         try:
-            with winreg.OpenKey(winreg.HKEY_CURRENT_USER,
+            with winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                                 r"Software\Google\Chrome\BLBeacon") as key:
                 chrome_path = os.path.join(os.path.dirname(winreg.QueryValueEx(key, "version")[0]), "chrome.exe")
                 if os.path.exists(chrome_path):
                     subprocess.run([chrome_path, url])
                     return True
-        except:
+        except Exception:
             pass
-
+
         # Method 4: Try system path or command
         for chrome_cmd in ["chrome", "chrome.exe", "googlechrome", "google-chrome"]:
             try:
                 subprocess.run([chrome_cmd, url], shell=True)
                 return True
-            except:
+            except Exception:
                 continue
-
+
         # Method 5: Use Windows URL protocol handler
         try:
             os.startfile(url)
             return True
-        except:
+        except Exception:
             pass
-
-        # Method 6: Use rundll32
+
+        # Method 6: Use rundll32
         try:
             cmd = f'rundll32 url.dll,FileProtocolHandler {url}'
             subprocess.run(cmd, shell=True)
             return True
-        except:
+        except Exception:
             pass
-
+
     elif system == 'Darwin':
-
+
         chrome_paths = [
             '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
             os.path.expanduser('~/Applications/Google Chrome.app/Contents/MacOS/Google Chrome')
         ]
-
+
         for path in chrome_paths:
             if os.path.exists(path):
                 subprocess.run([path, url])
                 return True
-
+
         # Fallback to 'open' command with Chrome as the app
         try:
             subprocess.run(['open', '-a', 'Google Chrome', url])
             return True
-        except:
+        except Exception:
             pass
-
+
     elif system == 'Linux':
-
+
         chrome_commands = ['google-chrome', 'chrome', 'chromium', 'chromium-browser']
-
+
         for cmd in chrome_commands:
             try:
                 subprocess.run([cmd, url], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
                 return True
-            except:
+            except Exception:
                 continue
-
+
     print(f"Could not open {filename} in Chrome on {system}.")
     return False

-
+
 def open_file(filename, attempt_to_open_in_wsl_host=False, browser_name=None):
     """
     Opens [filename] in the default OS file handler for this file type.
-
+
     If browser_name is not None, uses the webbrowser module to open the filename
     in the specified browser; see https://docs.python.org/3/library/webbrowser.html
     for supported browsers. Falls back to the default file handler if webbrowser.open()
     fails. In this case, attempt_to_open_in_wsl_host is ignored unless webbrowser.open() fails.
-
-    If browser_name is 'default', uses the system default. This is different from the
+
+    If browser_name is 'default', uses the system default. This is different from the
     parameter to webbrowser.get(), where None implies the system default.
-
+
     Args:
         filename (str): file to open
         attempt_to_open_in_wsl_host: if this is True, and we're in WSL, attempts to open
             [filename] in the Windows host environment
         browser_name: see above
     """
-
+
     if browser_name is not None:
         if browser_name == 'chrome':
             browser_name = 'google-chrome'
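Two things to note in this hunk: the bare except: clauses in open_file_in_chrome() become except Exception:, which no longer swallows KeyboardInterrupt or SystemExit, and the WSL/Windows path converters behave as sketched below (expected values follow the docstrings above):

    from megadetector.utils.path_utils import (
        wsl_path_to_windows_path, windows_path_to_wsl_path)

    wsl_path_to_windows_path('/mnt/e/a/b/c')    # 'e:\\a\\b\\c'
    windows_path_to_wsl_path(r'e:\a\b\c')       # '/mnt/e/a/b/c'

    # Paths outside /mnt/<drive> can't be converted
    wsl_path_to_windows_path('/home/user/x')                               # None
    wsl_path_to_windows_path('/home/user/x', failure_behavior='original')  # '/home/user/x'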
@@ -925,32 +871,32 @@ def open_file(filename, attempt_to_open_in_wsl_host=False, browser_name=None):
         result = False
     if result:
         return
-
+
     if sys.platform == 'win32':
-
+
         os.startfile(filename)

     elif sys.platform == 'darwin':
-
+
         opener = 'open'
         subprocess.call([opener, filename])
-
+
     elif attempt_to_open_in_wsl_host and environment_is_wsl():
-
+
         windows_path = wsl_path_to_windows_path(filename)
-
+
         # Fall back to xdg-open
         if windows_path is None:
             subprocess.call(['xdg-open', filename])
-
-        if os.path.isdir(filename):
+
+        if os.path.isdir(filename):
             subprocess.run(["explorer.exe", windows_path])
         else:
-            os.system("cmd.exe /C start %s" % (re.escape(windows_path)))
-
+            os.system("cmd.exe /C start {}".format(re.escape(windows_path)))
+
     else:
-
-        opener = 'xdg-open'
+
+        opener = 'xdg-open'
         subprocess.call([opener, filename])

 # ...def open_file(...)
@@ -962,12 +908,12 @@ def write_list_to_file(output_file,strings):
     """
     Writes a list of strings to either a JSON file or text file,
     depending on extension of the given file name.
-
+
     Args:
         output_file (str): file to write
         strings (list): list of strings to write to [output_file]
     """
-
+
     with open(output_file, 'w') as f:
         if output_file.endswith('.json'):
             json.dump(strings, f, indent=1)
@@ -978,14 +924,14 @@ def read_list_from_file(filename):
 def read_list_from_file(filename):
     """
     Reads a json-formatted list of strings from a file.
-
+
     Args:
         filename (str): .json filename to read
-
+
     Returns:
         list: list of strings read from [filename]
     """
-
+
     assert filename.endswith('.json')
     with open(filename, 'r') as f:
         file_list = json.load(f)
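These two helpers round-trip a list of strings through JSON when the extension is .json; a minimal sketch (the filename is hypothetical):

    from megadetector.utils.path_utils import write_list_to_file, read_list_from_file

    filenames = ['a.jpg', 'b.jpg']
    write_list_to_file('filenames.json', filenames)  # JSON because of the extension
    assert read_list_from_file('filenames.json') == filenames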
@@ -1001,39 +947,39 @@ def _copy_file(input_output_tuple,overwrite=True,verbose=False,move=False):
     """
     Internal function for copying files from within parallel_copy_files.
     """
-
+
     assert len(input_output_tuple) == 2
     source_fn = input_output_tuple[0]
     target_fn = input_output_tuple[1]
     if (not overwrite) and (os.path.isfile(target_fn)):
         if verbose:
             print('Skipping existing target file {}'.format(target_fn))
-        return
-
+        return
+
     if move:
         action_string = 'Moving'
     else:
         action_string = 'Copying'
-
+
     if verbose:
         print('{} to {}'.format(action_string,target_fn))
-
+
     os.makedirs(os.path.dirname(target_fn),exist_ok=True)
     if move:
         shutil.move(source_fn, target_fn)
     else:
         shutil.copyfile(source_fn,target_fn)
-

-def parallel_copy_files(input_file_to_output_file,
-                        max_workers=16,
-                        use_threads=True,
-                        overwrite=False,
+
+def parallel_copy_files(input_file_to_output_file,
+                        max_workers=16,
+                        use_threads=True,
+                        overwrite=False,
                         verbose=False,
                         move=False):
     """
     Copy (or move) files from source to target according to the dict input_file_to_output_file.
-
+
     Args:
         input_file_to_output_file (dict): dictionary mapping source files to the target files
             to which they should be copied
@@ -1046,24 +992,32 @@ def parallel_copy_files(input_file_to_output_file,
     """

     n_workers = min(max_workers,len(input_file_to_output_file))
-
+
     # Package the dictionary as a set of 2-tuples
     input_output_tuples = []
     for input_fn in input_file_to_output_file:
         input_output_tuples.append((input_fn,input_file_to_output_file[input_fn]))

-    if use_threads:
-        pool = ThreadPool(n_workers)
-    else:
-        pool = Pool(n_workers)
+    pool = None

-    with tqdm(total=len(input_output_tuples)) as pbar:
-        for i,_ in enumerate(pool.imap_unordered(partial(_copy_file,
-                                                         overwrite=overwrite,
-                                                         verbose=verbose,
-                                                         move=move),
-                                                 input_output_tuples)):
-            pbar.update()
+    try:
+        if use_threads:
+            pool = ThreadPool(n_workers)
+        else:
+            pool = Pool(n_workers)
+
+        with tqdm(total=len(input_output_tuples)) as pbar:
+            for i,_ in enumerate(pool.imap_unordered(partial(_copy_file,
+                                                             overwrite=overwrite,
+                                                             verbose=verbose,
+                                                             move=move),
+                                                     input_output_tuples)):
+                pbar.update()
+    finally:
+        pool.close()
+        pool.join()
+        if verbose:
+            print("Pool closed and joined parallel file copying")

 # ...def parallel_copy_files(...)

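The 5.0.29 change wraps the worker pool in try/finally so it is closed and joined even when a copy raises. Typical usage, as a sketch (paths are hypothetical):

    from megadetector.utils.path_utils import parallel_copy_files

    mapping = {'/data/src/a.jpg': '/data/dst/a.jpg',
               '/data/src/b.jpg': '/data/dst/b.jpg'}
    parallel_copy_files(mapping, max_workers=8, use_threads=True, overwrite=False)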
@@ -1074,36 +1028,36 @@ def get_file_sizes(base_dir, convert_slashes=True):
     """
     Gets sizes recursively for all files in base_dir, returning a dict mapping
     relative filenames to size.
-
+
     TODO: merge the functionality here with parallel_get_file_sizes, which uses slightly
     different semantics.
-
+
     Args:
         base_dir (str): folder within which we want all file sizes
         convert_slashes (bool, optional): force forward slashes in return strings,
             otherwise uses the native path separator
-
+
     Returns:
         dict: dictionary mapping filenames to file sizes in bytes
     """
-
-    relative_filenames = recursive_file_list(base_dir, convert_slashes=convert_slashes,
+
+    relative_filenames = recursive_file_list(base_dir, convert_slashes=convert_slashes,
                                              return_relative_paths=True)
-
+
     fn_to_size = {}
     for fn_relative in tqdm(relative_filenames):
         fn_abs = os.path.join(base_dir,fn_relative)
         fn_to_size[fn_relative] = os.path.getsize(fn_abs)
-
+
     return fn_to_size
-
+

 def _get_file_size(filename,verbose=False):
     """
     Internal function for safely getting the size of a file. Returns a (filename,size)
     tuple, where size is None if there is an error.
     """
-
+
     try:
         size = os.path.getsize(filename)
     except Exception as e:
@@ -1112,18 +1066,18 @@ def _get_file_size(filename,verbose=False):
         size = None
     return (filename,size)

-
-def parallel_get_file_sizes(filenames,
-                            max_workers=16,
-                            use_threads=True,
+
+def parallel_get_file_sizes(filenames,
+                            max_workers=16,
+                            use_threads=True,
                             verbose=False,
-                            recursive=True,
+                            recursive=True,
                             convert_slashes=True,
                             return_relative_paths=False):
     """
     Returns a dictionary mapping every file in [filenames] to the corresponding file size,
     or None for errors. If [filenames] is a folder, will enumerate the folder (optionally recursively).
-
+
     Args:
         filenames (list or str): list of filenames for which we should read sizes, or a folder
             within which we should read all file sizes recursively
@@ -1135,33 +1089,33 @@ def parallel_get_file_sizes(filenames,
         convert_slashes (bool, optional): convert backslashes to forward slashes
         return_relative_paths (bool, optional): return relative paths; only relevant if [filenames]
             is a folder.
-
+
     Returns:
         dict: dictionary mapping filenames to file sizes in bytes
     """

     n_workers = min(max_workers,len(filenames))
-
+
     folder_name = None
-
+
     if isinstance(filenames,str):
-
+
         folder_name = filenames
-        assert os.path.isdir(filenames), 'Could not find folder {}'.format(folder_name)
-
+        assert os.path.isdir(filenames), 'Could not find folder {}'.format(folder_name)
+
         if verbose:
             print('Enumerating files in {}'.format(folder_name))
-
+
         # Enumerate absolute paths here, we'll convert to relative later if requested
         filenames = recursive_file_list(folder_name,recursive=recursive,return_relative_paths=False)

     else:
-
+
         assert is_iterable(filenames), '[filenames] argument is neither a folder nor an iterable'
-
+
     if verbose:
         print('Creating worker pool')
-
+
     if use_threads:
         pool_string = 'thread'
         pool = ThreadPool(n_workers)
@@ -1172,11 +1126,11 @@ def parallel_get_file_sizes(filenames,
     if verbose:
         print('Created a {} pool of {} workers'.format(
             pool_string,n_workers))
-
+
     # This returns (filename,size) tuples
     get_size_results = list(tqdm(pool.imap(
         partial(_get_file_size,verbose=verbose),filenames), total=len(filenames)))
-
+
     to_return = {}
     for r in get_size_results:
         fn = r[0]
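A sketch of parallel_get_file_sizes(), which accepts either a folder or a list of filenames (the folder is hypothetical):

    from megadetector.utils.path_utils import parallel_get_file_sizes

    # Values are sizes in bytes, or None for files that couldn't be read
    size_map = parallel_get_file_sizes('/data/camera-traps', return_relative_paths=True)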
@@ -1197,7 +1151,7 @@
 def zip_file(input_fn, output_fn=None, overwrite=False, verbose=False, compresslevel=9):
     """
     Zips a single file.
-
+
     Args:
         input_fn (str): file to zip
         output_fn (str, optional): target zipfile; if this is None, we'll use
@@ -1205,23 +1159,23 @@ def zip_file(input_fn, output_fn=None, overwrite=False, verbose=False, compressl
         overwrite (bool, optional): whether to overwrite an existing target file
         verbose (bool, optional): enable existing debug console output
         compresslevel (int, optional): compression level to use, between 0 and 9
-
+
     Returns:
         str: the output zipfile, whether we created it or determined that it already exists
     """
-
+
     basename = os.path.basename(input_fn)
-
+
     if output_fn is None:
         output_fn = input_fn + '.zip'
-
+
     if (not overwrite) and (os.path.isfile(output_fn)):
         print('Skipping existing file {}'.format(output_fn))
         return output_fn
-
+
     if verbose:
         print('Zipping {} to {} with level {}'.format(input_fn,output_fn,compresslevel))
-
+
     with ZipFile(output_fn,'w',zipfile.ZIP_DEFLATED) as zipf:
         zipf.write(input_fn,arcname=basename,compresslevel=compresslevel,
                    compress_type=zipfile.ZIP_DEFLATED)
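A sketch of zip_file() (the filename is hypothetical); by default the output lands next to the input:

    from megadetector.utils.path_utils import zip_file

    zipped = zip_file('results.json', overwrite=True, compresslevel=9)
    # zipped == 'results.json.zip' unless output_fn was given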
@@ -1232,9 +1186,9 @@ def add_files_to_single_tar_file(input_files, output_fn, arc_name_base,
                                  overwrite=False, verbose=False, mode='x'):
     """
-    Adds all the files in [input_files] to the tar file [output_fn].
+    Adds all the files in [input_files] to the tar file [output_fn].
     Archive names are relative to arc_name_base.
-
+
     Args:
         input_files (list): list of absolute filenames to include in the .tar file
         output_fn (str): .tar file to create
@@ -1244,11 +1198,11 @@ def add_files_to_single_tar_file(input_files, output_fn, arc_name_base,
         overwrite (bool, optional): whether to overwrite an existing .tar file
         verbose (bool, optional): enable additional debug console output
         mode (str, optional): compression type, can be 'x' (no compression), 'x:gz', or 'x:bz2'.
-
+
     Returns:
         str: the output tar file, whether we created it or determined that it already exists
     """
-
+
     if os.path.isfile(output_fn):
         if not overwrite:
             print('Tar file {} exists, skipping'.format(output_fn))
@@ -1256,11 +1210,11 @@ def add_files_to_single_tar_file(input_files, output_fn, arc_name_base,
1256
1210
  else:
1257
1211
  print('Tar file {} exists, deleting and re-creating'.format(output_fn))
1258
1212
  os.remove(output_fn)
1259
-
1213
+
1260
1214
  if verbose:
1261
1215
  print('Adding {} files to {} (mode {})'.format(
1262
1216
  len(input_files),output_fn,mode))
1263
-
1217
+
1264
1218
  with tarfile.open(output_fn,mode) as tarf:
1265
1219
  for input_fn_abs in tqdm(input_files,disable=(not verbose)):
1266
1220
  input_fn_relative = os.path.relpath(input_fn_abs,arc_name_base)
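A minimal usage sketch for add_files_to_single_tar_file (all paths are hypothetical):

    from megadetector.utils.path_utils import add_files_to_single_tar_file  # assumed path

    # Bundle two files into a gzip-compressed tarball; inside the archive
    # they are stored relative to /data/project
    add_files_to_single_tar_file(['/data/project/images/a.jpg',
                                  '/data/project/images/b.jpg'],
                                 '/data/project-images.tar.gz',
                                 arc_name_base='/data/project',
                                 mode='x:gz')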
@@ -1272,9 +1226,9 @@ def add_files_to_single_tar_file(input_files, output_fn, arc_name_base,
1272
1226
  def zip_files_into_single_zipfile(input_files, output_fn, arc_name_base,
1273
1227
  overwrite=False, verbose=False, compresslevel=9):
1274
1228
  """
1275
- Zip all the files in [input_files] into [output_fn]. Archive names are relative to
1229
+ Zip all the files in [input_files] into [output_fn]. Archive names are relative to
1276
1230
  arc_name_base.
1277
-
1231
+
1278
1232
  Args:
1279
1233
  input_files (list): list of absolute filenames to include in the .tar file
1280
1234
  output_fn (str): .tar file to create
@@ -1284,20 +1238,20 @@ def zip_files_into_single_zipfile(input_files, output_fn, arc_name_base,
1284
1238
  overwrite (bool, optional): whether to overwrite an existing .tar file
1285
1239
  verbose (bool, optional): enable additional debug console output
1286
1240
  compresslevel (int, optional): compression level to use, between 0 and 9
1287
-
1241
+
1288
1242
  Returns:
1289
1243
  str: the output zipfile, whether we created it or determined that it already exists
1290
1244
  """
1291
-
1245
+
1292
1246
  if not overwrite:
1293
1247
  if os.path.isfile(output_fn):
1294
1248
  print('Zip file {} exists, skipping'.format(output_fn))
1295
1249
  return output_fn
1296
-
1250
+
1297
1251
  if verbose:
1298
1252
  print('Zipping {} files to {} (compression level {})'.format(
1299
1253
  len(input_files),output_fn,compresslevel))
1300
-
1254
+
1301
1255
  with ZipFile(output_fn,'w',zipfile.ZIP_DEFLATED) as zipf:
1302
1256
  for input_fn_abs in tqdm(input_files,disable=(not verbose)):
1303
1257
  input_fn_relative = os.path.relpath(input_fn_abs,arc_name_base)
@@ -1307,41 +1261,41 @@ def zip_files_into_single_zipfile(input_files, output_fn, arc_name_base,
1307
1261
  compress_type=zipfile.ZIP_DEFLATED)
1308
1262
 
1309
1263
  return output_fn
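A minimal usage sketch for zip_files_into_single_zipfile (paths are hypothetical):

    from megadetector.utils.path_utils import zip_files_into_single_zipfile  # assumed path

    # Collect scattered .json files into one archive; in-archive paths are
    # relative to /data/run
    zip_files_into_single_zipfile(['/data/run/a/detections.json',
                                   '/data/run/b/detections.json'],
                                  '/data/run/detections.zip',
                                  arc_name_base='/data/run')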
-
-
+
+
 def zip_folder(input_folder, output_fn=None, overwrite=False, verbose=False, compresslevel=9):
     """
-    Recursively zip everything in [input_folder] into a single zipfile, storing files as paths
+    Recursively zip everything in [input_folder] into a single zipfile, storing files as paths
     relative to [input_folder].
-
-    Args:
+
+    Args:
         input_folder (str): folder to zip
         output_fn (str, optional): output filename; if this is None, we'll write to [input_folder].zip
         overwrite (bool, optional): whether to overwrite an existing .zip file
         verbose (bool, optional): enable additional debug console output
-        compresslevel (int, optional): compression level to use, between 0 and 9
-
+        compresslevel (int, optional): compression level to use, between 0 and 9
+
     Returns:
-        str: the output zipfile, whether we created it or determined that it already exists
+        str: the output zipfile, whether we created it or determined that it already exists
     """
-
+
     if output_fn is None:
         output_fn = input_folder + '.zip'
-
+
     if not overwrite:
         if os.path.isfile(output_fn):
             print('Zip file {} exists, skipping'.format(output_fn))
-            return
-
+            return output_fn
+
     if verbose:
         print('Zipping {} to {} (compression level {})'.format(
             input_folder,output_fn,compresslevel))
-
+
     relative_filenames = recursive_file_list(input_folder,return_relative_paths=True)
-
+
     with ZipFile(output_fn,'w',zipfile.ZIP_DEFLATED) as zipf:
         for input_fn_relative in tqdm(relative_filenames,disable=(not verbose)):
-            input_fn_abs = os.path.join(input_folder,input_fn_relative)
+            input_fn_abs = os.path.join(input_folder,input_fn_relative)
             zipf.write(input_fn_abs,
                        arcname=input_fn_relative,
                        compresslevel=compresslevel,
@@ -1349,17 +1303,17 @@ def zip_folder(input_folder, output_fn=None, overwrite=False, verbose=False, com
 
     return output_fn
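A minimal usage sketch for zip_folder (the folder name is hypothetical):

    from megadetector.utils.path_utils import zip_folder  # assumed import path

    # Recursively zip a folder to /data/snapshots.zip, with in-archive paths
    # relative to the folder itself
    zip_folder('/data/snapshots', overwrite=True, verbose=True)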
 
-
- def parallel_zip_files(input_files,
-                        max_workers=16,
-                        use_threads=True,
-                        compresslevel=9,
-                        overwrite=False,
+
+ def parallel_zip_files(input_files,
+                        max_workers=16,
+                        use_threads=True,
+                        compresslevel=9,
+                        overwrite=False,
                         verbose=False):
     """
-    Zips one or more files to separate output files in parallel, leaving the
+    Zips one or more files to separate output files in parallel, leaving the
     original files in place. Each file is zipped to [filename].zip.
-
+
     Args:
         input_files (list): list of files to zip
         max_workers (int, optional): number of concurrent workers, set to <= 1 to disable parallelism
@@ -1387,9 +1341,9 @@ def parallel_zip_files(input_files,
 def parallel_zip_folders(input_folders, max_workers=16, use_threads=True,
                          compresslevel=9, overwrite=False, verbose=False):
     """
-    Zips one or more folders to separate output files in parallel, leaving the
+    Zips one or more folders to separate output files in parallel, leaving the
     original folders in place. Each folder is zipped to [folder_name].zip.
-
+
     Args:
         input_folders (list): list of folders to zip
         max_workers (int, optional): number of concurrent workers, set to <= 1 to disable parallelism
@@ -1406,7 +1360,7 @@ def parallel_zip_folders(input_folders, max_workers=16, use_threads=True,
         pool = ThreadPool(n_workers)
     else:
         pool = Pool(n_workers)
-
+
     with tqdm(total=len(input_folders)) as pbar:
         for i,_ in enumerate(pool.imap_unordered(
             partial(zip_folder,overwrite=overwrite,
@@ -1419,9 +1373,9 @@ def zip_each_file_in_folder(folder_name,recursive=False,max_workers=16,use_threa
                             compresslevel=9,overwrite=False,required_token=None,verbose=False,
                             exclude_zip=True):
     """
-    Zips each file in [folder_name] to its own zipfile (filename.zip), optionally recursing. To
+    Zips each file in [folder_name] to its own zipfile (filename.zip), optionally recursing. To
     zip a whole folder into a single zipfile, use zip_folder().
-
+
     Args:
         folder_name (str): the folder within which we should zip files
         recursive (bool, optional): whether to recurse within [folder_name]
@@ -1432,19 +1386,19 @@ def zip_each_file_in_folder(folder_name,recursive=False,max_workers=16,use_threa
         overwrite (bool, optional): whether to overwrite existing .zip files
         required_token (str, optional): only zip files whose names contain this string
         verbose (bool, optional): enable additional debug console output
-        exclude_zip (bool, optional): skip files ending in .zip
+        exclude_zip (bool, optional): skip files ending in .zip
     """
-
+
     assert os.path.isdir(folder_name), '{} is not a folder'.format(folder_name)
-
+
     input_files = recursive_file_list(folder_name,recursive=recursive,return_relative_paths=False)
-
+
     if required_token is not None:
         input_files = [fn for fn in input_files if required_token in fn]
-
+
     if exclude_zip:
         input_files = [fn for fn in input_files if (not fn.endswith('.zip'))]
-
+
     parallel_zip_files(input_files=input_files,max_workers=max_workers,
                        use_threads=use_threads,compresslevel=compresslevel,
                        overwrite=overwrite,verbose=verbose)
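A minimal usage sketch for zip_each_file_in_folder (the folder name and token are hypothetical):

    from megadetector.utils.path_utils import zip_each_file_in_folder  # assumed path

    # Zip each .json file under /data/results to its own [filename].zip,
    # recursing into subfolders and skipping files that are already .zip
    zip_each_file_in_folder('/data/results',
                            recursive=True,
                            required_token='.json',
                            overwrite=False,
                            verbose=True)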
@@ -1454,16 +1408,16 @@ def unzip_file(input_file, output_folder=None):
     """
     Unzips a zipfile to the specified output folder, defaulting to the same location as
     the input file.
-
+
     Args:
         input_file (str): zipfile to unzip
         output_folder (str, optional): folder to which we should unzip [input_file], defaults
             to unzipping to the folder where [input_file] lives
     """
-
+
     if output_folder is None:
         output_folder = os.path.dirname(input_file)
-
+
     with zipfile.ZipFile(input_file, 'r') as zf:
         zf.extractall(output_folder)
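A minimal usage sketch for unzip_file (paths are hypothetical):

    from megadetector.utils.path_utils import unzip_file  # assumed import path

    # Extract next to the archive (the default), then again to an explicit folder
    unzip_file('/data/archive.zip')
    unzip_file('/data/archive.zip', output_folder='/tmp/extracted')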
 
@@ -1473,31 +1427,31 @@ def unzip_file(input_file, output_folder=None):
 def compute_file_hash(file_path, algorithm='sha256', allow_failures=True):
     """
     Compute the hash of a file.
-
+
     Adapted from:
-
+
     https://www.geeksforgeeks.org/python-program-to-find-hash-of-file/
-
+
     Args:
         file_path (str): the file to hash
         algorithm (str, optional): the hashing algorithm to use (e.g. md5, sha256)
         allow_failures (bool, optional): whether to return None (rather than raising) if hashing fails
-
+
     Returns:
         str: the hash value for this file
     """
-
+
     try:
-
+
         hash_func = hashlib.new(algorithm)
-
+
         with open(file_path, 'rb') as file:
             while chunk := file.read(8192): # Read the file in chunks of 8192 bytes
                 hash_func.update(chunk)
-
+
         return str(hash_func.hexdigest())
-
+
     except Exception:
-
+
         if allow_failures:
             return None
         else:
@@ -1507,14 +1461,14 @@ def compute_file_hash(file_path, algorithm='sha256', allow_failures=True):
 
 
 def parallel_compute_file_hashes(filenames,
-                                 max_workers=16,
-                                 use_threads=True,
+                                 max_workers=16,
+                                 use_threads=True,
                                  recursive=True,
                                  algorithm='sha256',
                                  verbose=False):
     """
     Compute file hashes for a list or folder of images.
-
+
     Args:
         filenames (list or str): a list of filenames or a folder
         max_workers (int, optional): the number of parallel workers to use; set to <=1 to disable
@@ -1524,8 +1478,8 @@ def parallel_compute_file_hashes(filenames,
         algorithm (str, optional): the hashing algorithm to use (e.g. md5, sha256)
         recursive (bool, optional): if [filenames] is a folder, whether to enumerate recursively.
            Ignored if [filenames] is a list.
-        verbose (bool, optional): enable additional debug output
-
+        verbose (bool, optional): enable additional debug output
+
     Returns:
         dict: a dict mapping filenames to hash values; values will be None for files that fail
            to load.
@@ -1535,35 +1489,1140 @@ def parallel_compute_file_hashes(filenames,
         if verbose:
             print('Enumerating files in {}'.format(filenames))
         filenames = recursive_file_list(filenames,recursive=recursive,return_relative_paths=False)
-
+
     n_workers = min(max_workers,len(filenames))
-
+
     if verbose:
         print('Computing hashes for {} files on {} workers'.format(len(filenames),n_workers))
-
+
     if n_workers <= 1:
-
+
         results = []
         for filename in filenames:
             results.append(compute_file_hash(filename,algorithm=algorithm,allow_failures=True))
-
+
     else:
-
+
         if use_threads:
             pool = ThreadPool(n_workers)
         else:
             pool = Pool(n_workers)
-
+
         results = list(tqdm(pool.imap(
             partial(compute_file_hash,algorithm=algorithm,allow_failures=True),
             filenames), total=len(filenames)))
-
+
     assert len(filenames) == len(results), 'Internal error in parallel_compute_file_hashes'
-
+
     to_return = {}
     for i_file,filename in enumerate(filenames):
         to_return[filename] = results[i_file]
-
+
     return to_return
 
 # ...def parallel_compute_file_hashes(...)
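A minimal usage sketch for the hashing helpers (paths and import path are assumptions):

    from megadetector.utils.path_utils import compute_file_hash, parallel_compute_file_hashes

    # Hash one file, then hash a whole folder in parallel; files that fail
    # to load map to None in the returned dict
    h = compute_file_hash('/data/images/IMG_0001.JPG', algorithm='md5')
    folder_hashes = parallel_compute_file_hashes('/data/images', max_workers=8, recursive=True)
    duplicates = [fn for fn,v in folder_hashes.items() if v == h]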
1524
+
1525
+
1526
+ #%% Tests
1527
+
1528
+ class TestPathUtils:
1529
+ """
1530
+ Tests for path_utils.py
1531
+ """
1532
+
1533
+ def set_up(self):
1534
+ """
1535
+ Create a temporary directory for testing.
1536
+ """
1537
+
1538
+ self.test_dir = make_test_folder(subfolder='megadetector/path_utils_tests')
1539
+ os.makedirs(self.test_dir, exist_ok=True)
1540
+
1541
+
1542
+ def tear_down(self):
1543
+ """
1544
+ Remove the temporary directory after tests.
1545
+ """
1546
+
1547
+ if os.path.exists(self.test_dir):
1548
+ shutil.rmtree(self.test_dir)
1549
+
1550
+
1551
+ def test_is_image_file(self):
1552
+ """
1553
+ Test the is_image_file function.
1554
+ """
1555
+
1556
+ assert is_image_file('test.jpg')
1557
+ assert is_image_file('test.jpeg')
1558
+ assert is_image_file('test.png')
1559
+ assert is_image_file('test.gif')
1560
+ assert is_image_file('test.bmp')
1561
+ assert is_image_file('test.tiff')
1562
+ assert is_image_file('test.TIF')
1563
+ assert not is_image_file('test.txt')
1564
+ assert not is_image_file('test.doc')
1565
+ assert is_image_file('path/to/image.JPG')
1566
+ assert not is_image_file('image')
1567
+ assert is_image_file('test.custom', img_extensions=['.custom'])
1568
+ assert not is_image_file('test.jpg', img_extensions=['.custom'])
1569
+
1570
+
1571
+ def test_find_image_strings(self):
1572
+ """
1573
+ Test the find_image_strings function.
1574
+ """
1575
+
1576
+ strings = ['a.jpg', 'b.txt', 'c.PNG', 'd.gif', 'e.jpeg', 'f.doc']
1577
+ expected = ['a.jpg', 'c.PNG', 'd.gif', 'e.jpeg']
1578
+ assert sorted(find_image_strings(strings)) == sorted(expected)
1579
+ assert find_image_strings([]) == []
1580
+ assert find_image_strings(['no_image.txt', 'another.doc']) == []
1581
+
1582
+
1583
+ def test_find_images(self):
1584
+ """
1585
+ Test the find_images function.
1586
+ """
1587
+
1588
+ # Create some dummy files
1589
+ img1_abs = os.path.join(self.test_dir, 'img1.jpg')
1590
+ img2_abs = os.path.join(self.test_dir, 'img2.PNG')
1591
+ txt1_abs = os.path.join(self.test_dir, 'text1.txt')
1592
+ open(img1_abs, 'w').close()
1593
+ open(img2_abs, 'w').close()
1594
+ open(txt1_abs, 'w').close()
1595
+
1596
+ subdir = os.path.join(self.test_dir, 'subdir')
1597
+ os.makedirs(subdir, exist_ok=True)
1598
+ img3_abs = os.path.join(subdir, 'img3.jpeg')
1599
+ txt2_abs = os.path.join(subdir, 'text2.txt')
1600
+ open(img3_abs, 'w').close()
1601
+ open(txt2_abs, 'w').close()
1602
+
1603
+ # Test non-recursive
1604
+ expected_non_recursive_abs = sorted([img1_abs.replace('\\', '/'), img2_abs.replace('\\', '/')])
1605
+ found_non_recursive_abs = find_images(self.test_dir, recursive=False, return_relative_paths=False)
1606
+ assert sorted(found_non_recursive_abs) == expected_non_recursive_abs
1607
+
1608
+ # Test non-recursive, relative paths
1609
+ expected_non_recursive_rel = sorted(['img1.jpg', 'img2.PNG'])
1610
+ found_non_recursive_rel = find_images(self.test_dir, recursive=False, return_relative_paths=True)
1611
+ assert sorted(found_non_recursive_rel) == expected_non_recursive_rel
1612
+
1613
+ # Test recursive
1614
+ expected_recursive_abs = sorted([
1615
+ img1_abs.replace('\\', '/'),
1616
+ img2_abs.replace('\\', '/'),
1617
+ img3_abs.replace('\\', '/')
1618
+ ])
1619
+ found_recursive_abs = find_images(self.test_dir, recursive=True, return_relative_paths=False)
1620
+ assert sorted(found_recursive_abs) == expected_recursive_abs
1621
+
1622
+ # Test recursive, relative paths
1623
+ expected_recursive_rel = sorted([
1624
+ 'img1.jpg',
1625
+ 'img2.PNG',
1626
+ os.path.join('subdir', 'img3.jpeg').replace('\\', '/')
1627
+ ])
1628
+ found_recursive_rel = find_images(self.test_dir, recursive=True, return_relative_paths=True)
1629
+ assert sorted(found_recursive_rel) == expected_recursive_rel
1630
+
1631
+ # Test with an empty directory
1632
+ empty_dir = os.path.join(self.test_dir, 'empty_dir')
1633
+ os.makedirs(empty_dir, exist_ok=True)
1634
+ assert find_images(empty_dir, recursive=True) == []
1635
+
1636
+ # Test with a directory that doesn't exist (should assert)
1637
+ try:
1638
+ find_images(os.path.join(self.test_dir, 'non_existent_dir'))
1639
+ raise AssertionError("AssertionError not raised for non_existent_dir")
1640
+ except AssertionError:
1641
+ pass
1642
+
1643
+
1644
+ def test_recursive_file_list_and_file_list(self):
1645
+ """
1646
+ Test the recursive_file_list and file_list functions.
1647
+ """
1648
+
1649
+ # Setup directory structure
1650
+ # test_dir/
1651
+ # file1.txt
1652
+ # file2.jpg
1653
+ # subdir1/
1654
+ # file3.txt
1655
+ # subsubdir/
1656
+ # file4.png
1657
+ # subdir2/
1658
+ # file5.doc
1659
+
1660
+ list_dir = os.path.join(self.test_dir,'recursive_list')
1661
+
1662
+ f1 = os.path.join(list_dir, 'file1.txt')
1663
+ f2 = os.path.join(list_dir, 'file2.jpg')
1664
+ subdir1 = os.path.join(list_dir, 'subdir1')
1665
+ os.makedirs(subdir1, exist_ok=True)
1666
+ f3 = os.path.join(subdir1, 'file3.txt')
1667
+ subsubdir = os.path.join(subdir1, 'subsubdir')
1668
+ os.makedirs(subsubdir, exist_ok=True)
1669
+ f4 = os.path.join(subsubdir, 'file4.png')
1670
+ subdir2 = os.path.join(list_dir, 'subdir2')
1671
+ os.makedirs(subdir2, exist_ok=True)
1672
+ f5 = os.path.join(subdir2, 'file5.doc')
1673
+
1674
+ for filepath in [f1, f2, f3, f4, f5]:
1675
+ with open(filepath, 'w') as f:
1676
+ f.write('test')
1677
+
1678
+ # Test recursive_file_list (recursive=True by default)
1679
+ expected_all_files_abs = sorted([
1680
+ f1.replace('\\', '/'), f2.replace('\\', '/'), f3.replace('\\', '/'),
1681
+ f4.replace('\\', '/'), f5.replace('\\', '/')
1682
+ ])
1683
+ all_files_abs = recursive_file_list(list_dir, convert_slashes=True,
1684
+ return_relative_paths=False)
1685
+ assert sorted(all_files_abs) == expected_all_files_abs
1686
+
1687
+ # Test recursive_file_list with relative paths
1688
+ expected_all_files_rel = sorted([
1689
+ 'file1.txt', 'file2.jpg',
1690
+ os.path.join('subdir1', 'file3.txt').replace('\\', '/'),
1691
+ os.path.join('subdir1', 'subsubdir', 'file4.png').replace('\\', '/'),
1692
+ os.path.join('subdir2', 'file5.doc').replace('\\', '/')
1693
+ ])
1694
+ all_files_rel = recursive_file_list(list_dir, convert_slashes=True,
1695
+ return_relative_paths=True)
1696
+ assert sorted(all_files_rel) == expected_all_files_rel
1697
+
1698
+ # Test file_list (non-recursive by default via wrapper)
1699
+ expected_top_level_files_abs = sorted([f1.replace('\\', '/'), f2.replace('\\', '/')])
1700
+ top_level_files_abs = file_list(list_dir, convert_slashes=True,
1701
+ return_relative_paths=False, recursive=False)
1702
+ assert sorted(top_level_files_abs) == expected_top_level_files_abs
1703
+
1704
+ # Test file_list (recursive explicitly) - should be same as recursive_file_list
1705
+ recursive_via_file_list = file_list(list_dir, convert_slashes=True,
1706
+ return_relative_paths=False, recursive=True)
1707
+ assert sorted(recursive_via_file_list) == expected_all_files_abs
1708
+
1709
+ # Test with convert_slashes=False (use os.sep)
1710
+ #
1711
+ # Note: This test might be tricky if os.sep is '/', as no replacement happens. We'll check
1712
+ # that backslashes remain on Windows.
1713
+ if os.sep == '\\':
1714
+ f1_raw = os.path.join(list_dir, 'file1.txt')
1715
+ # Only one file for simplicity
1716
+ files_no_slash_conversion = file_list(list_dir, convert_slashes=False, recursive=False)
1717
+ assert any(f1_raw in s for s in files_no_slash_conversion)
1718
+
1719
+ # Test with an empty directory
1720
+ empty_dir = os.path.join(list_dir, "empty_dir_for_files")
1721
+ os.makedirs(empty_dir, exist_ok=True)
1722
+ assert recursive_file_list(empty_dir) == []
1723
+ assert file_list(empty_dir, recursive=False) == []
1724
+
1725
+ # Test with a non-existent directory
1726
+ try:
1727
+ recursive_file_list(os.path.join(list_dir, "non_existent_dir"))
1728
+ raise AssertionError("AssertionError not raised for non_existent_dir in recursive_file_list")
1729
+ except AssertionError:
1730
+ pass
1731
+
1732
+
1733
+ def test_folder_list(self):
1734
+ """
1735
+ Test the folder_list function.
1736
+ """
1737
+
1738
+ # Setup directory structure
1739
+ # test_dir/
1740
+ # subdir1/
1741
+ # subsubdir1/
1742
+ # subdir2/
1743
+ # file1.txt (should be ignored)
1744
+
1745
+ folder_list_dir = os.path.join(self.test_dir,'folder_list')
1746
+
1747
+ subdir1 = os.path.join(folder_list_dir, 'subdir1')
1748
+ subsubdir1 = os.path.join(subdir1, 'subsubdir1')
1749
+ subdir2 = os.path.join(folder_list_dir, 'subdir2')
1750
+ os.makedirs(subdir1, exist_ok=True)
1751
+ os.makedirs(subsubdir1, exist_ok=True)
1752
+ os.makedirs(subdir2, exist_ok=True)
1753
+ with open(os.path.join(folder_list_dir, 'file1.txt'), 'w') as f:
1754
+ f.write('test')
1755
+
1756
+ # Test non-recursive
1757
+ expected_folders_non_recursive_abs = sorted([
1758
+ subdir1.replace('\\', '/'), subdir2.replace('\\', '/')
1759
+ ])
1760
+ folders_non_recursive_abs = folder_list(folder_list_dir, recursive=False,
1761
+ return_relative_paths=False)
1762
+ assert sorted(folders_non_recursive_abs) == expected_folders_non_recursive_abs
1763
+
1764
+ # Test non-recursive, relative paths
1765
+ expected_folders_non_recursive_rel = sorted(['subdir1', 'subdir2'])
1766
+ folders_non_recursive_rel = folder_list(folder_list_dir, recursive=False,
1767
+ return_relative_paths=True)
1768
+ assert sorted(folders_non_recursive_rel) == expected_folders_non_recursive_rel
1769
+
1770
+ # Test recursive
1771
+ expected_folders_recursive_abs = sorted([
1772
+ subdir1.replace('\\', '/'),
1773
+ subsubdir1.replace('\\', '/'),
1774
+ subdir2.replace('\\', '/')
1775
+ ])
1776
+ folders_recursive_abs = folder_list(folder_list_dir, recursive=True,
1777
+ return_relative_paths=False)
1778
+ assert sorted(folders_recursive_abs) == expected_folders_recursive_abs
1779
+
1780
+ # Test recursive, relative paths
1781
+ expected_folders_recursive_rel = sorted([
1782
+ 'subdir1',
1783
+ os.path.join('subdir1', 'subsubdir1').replace('\\', '/'),
1784
+ 'subdir2'
1785
+ ])
1786
+ folders_recursive_rel = folder_list(folder_list_dir, recursive=True,
1787
+ return_relative_paths=True)
1788
+ assert sorted(folders_recursive_rel) == expected_folders_recursive_rel
1789
+
1790
+ # Test with an empty directory (except for the file)
1791
+ empty_dir_for_folders = os.path.join(folder_list_dir, "empty_for_folders")
1792
+ os.makedirs(empty_dir_for_folders, exist_ok=True)
1793
+ with open(os.path.join(empty_dir_for_folders, 'temp.txt'), 'w') as f: f.write('t')
1794
+ assert folder_list(empty_dir_for_folders, recursive=True) == []
1795
+ assert folder_list(empty_dir_for_folders, recursive=False) == []
1796
+
1797
+ # Test with a non-existent directory
1798
+ try:
1799
+ folder_list(os.path.join(self.test_dir, "non_existent_dir"))
1800
+ raise AssertionError("AssertionError not raised for non_existent_dir in folder_list")
1801
+ except AssertionError:
1802
+ pass
1803
+
1804
+
+ def test_folder_summary(self):
+ """
+ Test the folder_summary function.
+ """
+
+ # test_dir/
+ # file1.txt
+ # img1.jpg
+ # subdir/
+ # file2.txt
+ # img2.png
+ # img3.png
+
+ folder_summary_dir = os.path.join(self.test_dir,'folder_summary')
+
+ f1 = os.path.join(folder_summary_dir, 'file1.txt')
+ img1 = os.path.join(folder_summary_dir, 'img1.jpg')
+ subdir = os.path.join(folder_summary_dir, 'subdir')
+ os.makedirs(subdir, exist_ok=True)
+ f2 = os.path.join(subdir, 'file2.txt')
+ img2 = os.path.join(subdir, 'img2.png')
+ img3 = os.path.join(subdir, 'img3.png')
+
+ for filepath in [f1, img1, f2, img2, img3]:
+ with open(filepath, 'w') as f:
+ f.write('test')
+
+ summary = folder_summary(folder_summary_dir, print_summary=False)
+
+ assert summary['n_files'] == 5
+ assert summary['n_folders'] == 1 # 'subdir'
+ assert summary['extension_to_count']['.txt'] == 2
+ assert summary['extension_to_count']['.jpg'] == 1
+ assert summary['extension_to_count']['.png'] == 2
+
+ # Check order (sorted by value, desc)
+ #
+ # The specific order of keys with the same counts can vary based on file system list
+ # order. We'll check that the counts are correct and the number of unique extensions is
+ # right.
+ assert len(summary['extension_to_count']) == 3
+
+
+ empty_dir = os.path.join(folder_summary_dir, "empty_summary_dir")
+ os.makedirs(empty_dir, exist_ok=True)
+ empty_summary = folder_summary(empty_dir, print_summary=False)
+ assert empty_summary['n_files'] == 0
+ assert empty_summary['n_folders'] == 0
+ assert empty_summary['extension_to_count'] == {}
1854
+
1855
+
1856
+ def test_fileparts(self):
1857
+ """
1858
+ Test the fileparts function.
1859
+ """
1860
+
1861
+ assert fileparts('file') == ('', 'file', '')
1862
+ assert fileparts('file.txt') == ('', 'file', '.txt')
1863
+ assert fileparts(r'c:/dir/file.jpg') == ('c:/dir', 'file', '.jpg')
1864
+ assert fileparts('/dir/subdir/file.jpg') == ('/dir/subdir', 'file', '.jpg')
1865
+ assert fileparts(r'c:\dir\file') == (r'c:\dir', 'file', '')
1866
+ assert fileparts(r'c:\dir\file.tar.gz') == (r'c:\dir', 'file.tar', '.gz')
1867
+ assert fileparts('.bashrc') == ('', '.bashrc', '') # Hidden file, no extension
1868
+ assert fileparts('nodir/.bashrc') == ('nodir', '.bashrc', '')
1869
+ assert fileparts('a/b/c.d.e') == ('a/b', 'c.d', '.e')
1870
+
1871
+
1872
+ def test_insert_before_extension(self):
1873
+ """
1874
+ Test the insert_before_extension function.
1875
+ """
1876
+
1877
+ assert insert_before_extension('file.ext', 'inserted') == 'file.inserted.ext'
1878
+ assert insert_before_extension('file', 'inserted') == 'file.inserted'
1879
+ assert insert_before_extension('path/to/file.ext', 'tag') == 'path/to/file.tag.ext'
1880
+ assert insert_before_extension('path/to/file', 'tag') == 'path/to/file.tag'
1881
+ assert insert_before_extension('file.tar.gz', 'new') == 'file.tar.new.gz'
1882
+
1883
+ # Test with custom separator
1884
+ assert insert_before_extension('file.ext', 'inserted', separator='_') == 'file_inserted.ext'
1885
+
1886
+ # Test with s=None (timestamp) - check format roughly
1887
+ fname_with_ts = insert_before_extension('file.ext', None)
1888
+ parts = fname_with_ts.split('.')
1889
+ # file.YYYY.MM.DD.HH.MM.SS.ext
1890
+ assert len(parts) >= 8 # file, Y, M, D, H, M, S, ext
1891
+ assert parts[0] == 'file'
1892
+ assert parts[-1] == 'ext'
1893
+ assert all(p.isdigit() for p in parts[1:-1])
1894
+
1895
+ fname_no_ext_ts = insert_before_extension('file', '') # s is empty string, should also use timestamp
1896
+ parts_no_ext = fname_no_ext_ts.split('.')
1897
+ assert len(parts_no_ext) >= 7 # file, Y, M, D, H, M, S
1898
+ assert parts_no_ext[0] == 'file'
1899
+ assert all(p.isdigit() for p in parts_no_ext[1:])
1900
+
1901
+
1902
+ def test_split_path(self):
1903
+ """
1904
+ Test the split_path function.
1905
+ """
1906
+
1907
+ if os.name == 'nt':
1908
+ assert split_path(r'c:\dir\subdir\file.txt') == ['c:\\', 'dir', 'subdir', 'file.txt']
1909
+ assert split_path('c:\\') == ['c:\\']
1910
+ # Test with mixed slashes, ntpath.split handles them
1911
+ assert split_path(r'c:/dir/subdir/file.txt') == ['c:/', 'dir', 'subdir', 'file.txt']
1912
+ else: # POSIX
1913
+ assert split_path('/dir/subdir/file.jpg') == ['/', 'dir', 'subdir', 'file.jpg']
1914
+ assert split_path('/') == ['/']
1915
+
1916
+ assert split_path('dir/file.txt') == ['dir', 'file.txt']
1917
+ assert split_path('file.txt') == ['file.txt']
1918
+ assert split_path('') == ''
1919
+ assert split_path('.') == ['.']
1920
+ assert split_path('..') == ['..']
1921
+ assert split_path('../a/b') == ['..', 'a', 'b']
1922
+
1923
+
1924
+ def test_path_is_abs(self):
1925
+ """
1926
+ Test the path_is_abs function.
1927
+ """
1928
+
1929
+ assert path_is_abs('/absolute/path')
1930
+ assert path_is_abs('c:/absolute/path')
1931
+ assert path_is_abs('C:\\absolute\\path')
1932
+ assert path_is_abs('\\\\server\\share\\path') # UNC path
1933
+ assert path_is_abs('c:file_without_slash_after_drive')
1934
+
1935
+ assert not path_is_abs('relative/path')
1936
+ assert not path_is_abs('file.txt')
1937
+ assert not path_is_abs('../relative')
1938
+ assert not path_is_abs('')
1939
+
1940
+
1941
+
1942
+ def test_safe_create_link_unix(self):
1943
+ """
1944
+ Test the safe_create_link function on Unix-like systems.
1945
+ """
1946
+
1947
+ if os.name == 'nt':
1948
+ # print("Skipping test_safe_create_link_unix on Windows.")
1949
+ return
1950
+
1951
+ source_file_path = os.path.join(self.test_dir, 'source.txt')
1952
+ link_path = os.path.join(self.test_dir, 'link.txt')
1953
+ other_source_path = os.path.join(self.test_dir, 'other_source.txt')
1954
+
1955
+ with open(source_file_path, 'w') as f:
1956
+ f.write('source data')
1957
+ with open(other_source_path, 'w') as f:
1958
+ f.write('other data')
1959
+
1960
+ # Create new link
1961
+ safe_create_link(source_file_path, link_path)
1962
+ assert os.path.islink(link_path)
1963
+ assert os.readlink(link_path) == source_file_path
1964
+
1965
+ # Link already exists and points to the correct source
1966
+ safe_create_link(source_file_path, link_path) # Should do nothing
1967
+ assert os.path.islink(link_path)
1968
+ assert os.readlink(link_path) == source_file_path
1969
+
1970
+ # Link already exists but points to a different source
1971
+ safe_create_link(other_source_path, link_path) # Should remove and re-create
1972
+ assert os.path.islink(link_path)
1973
+ assert os.readlink(link_path) == other_source_path
1974
+
1975
+ # Link_new path exists and is a file (not a link)
1976
+ file_path_conflict = os.path.join(self.test_dir, 'conflict_file.txt')
1977
+ with open(file_path_conflict, 'w') as f:
1978
+ f.write('actual file')
1979
+ try:
1980
+ safe_create_link(source_file_path, file_path_conflict)
1981
+ raise AssertionError("AssertionError not raised for file conflict")
1982
+ except AssertionError:
1983
+ pass
1984
+ os.remove(file_path_conflict)
1985
+
1986
+ # Link_new path exists and is a directory
1987
+ dir_path_conflict = os.path.join(self.test_dir, 'conflict_dir')
1988
+ os.makedirs(dir_path_conflict, exist_ok=True)
1989
+ try:
1990
+ safe_create_link(source_file_path, dir_path_conflict)
1991
+ raise AssertionError("AssertionError not raised for directory conflict")
1992
+ except AssertionError: # islink will be false
1993
+ pass
1994
+ shutil.rmtree(dir_path_conflict)
1995
+
1996
+
1997
+ def test_remove_empty_folders(self):
1998
+ """
1999
+ Test the remove_empty_folders function.
2000
+ """
2001
+
2002
+ # test_dir/
2003
+ # empty_top/
2004
+ # empty_mid/
2005
+ # empty_leaf/
2006
+ # mixed_top/
2007
+ # empty_mid_in_mixed/
2008
+ # empty_leaf_in_mixed/
2009
+ # non_empty_mid/
2010
+ # file.txt
2011
+ # non_empty_top/
2012
+ # file_in_top.txt
2013
+
2014
+ empty_top = os.path.join(self.test_dir, 'empty_top')
2015
+ empty_mid = os.path.join(empty_top, 'empty_mid')
2016
+ empty_leaf = os.path.join(empty_mid, 'empty_leaf')
2017
+ os.makedirs(empty_leaf, exist_ok=True)
2018
+
2019
+ mixed_top = os.path.join(self.test_dir, 'mixed_top')
2020
+ empty_mid_in_mixed = os.path.join(mixed_top, 'empty_mid_in_mixed')
2021
+ empty_leaf_in_mixed = os.path.join(empty_mid_in_mixed, 'empty_leaf_in_mixed')
2022
+ os.makedirs(empty_leaf_in_mixed, exist_ok=True)
2023
+ non_empty_mid = os.path.join(mixed_top, 'non_empty_mid')
2024
+ os.makedirs(non_empty_mid, exist_ok=True)
2025
+ with open(os.path.join(non_empty_mid, 'file.txt'), 'w') as f:
2026
+ f.write('data')
2027
+
2028
+ non_empty_top = os.path.join(self.test_dir, 'non_empty_top')
2029
+ os.makedirs(non_empty_top, exist_ok=True)
2030
+ with open(os.path.join(non_empty_top, 'file_in_top.txt'), 'w') as f:
2031
+ f.write('data')
2032
+
2033
+ # Process empty_top - should remove all three
2034
+ remove_empty_folders(empty_top, remove_root=True)
2035
+ assert not os.path.exists(empty_top)
2036
+ assert not os.path.exists(empty_mid)
2037
+ assert not os.path.exists(empty_leaf)
2038
+
2039
+ # Process mixed_top; should remove empty_leaf_in_mixed and empty_mid_in_mixed
2040
+ # but not mixed_top or non_empty_mid.
2041
+ remove_empty_folders(mixed_top, remove_root=True)
2042
+ assert os.path.exists(mixed_top) # mixed_top itself should remain
2043
+ assert not os.path.exists(empty_mid_in_mixed)
2044
+ assert not os.path.exists(empty_leaf_in_mixed)
2045
+ assert os.path.exists(non_empty_mid)
2046
+ assert os.path.exists(os.path.join(non_empty_mid, 'file.txt'))
2047
+
2048
+ # Process non_empty_top; should remove nothing.
2049
+ remove_empty_folders(non_empty_top, remove_root=True)
2050
+ assert os.path.exists(non_empty_top)
2051
+ assert os.path.exists(os.path.join(non_empty_top, 'file_in_top.txt'))
2052
+
2053
+ # Test with a file path (should do nothing and return False)
2054
+ file_path_for_removal = os.path.join(self.test_dir, 'a_file.txt')
2055
+ with open(file_path_for_removal, 'w') as f: f.write('t')
2056
+ assert not remove_empty_folders(file_path_for_removal, remove_root=True)
2057
+ assert os.path.exists(file_path_for_removal)
2058
+
2059
+ # Test with remove_root=False for the top level
2060
+ another_empty_top = os.path.join(self.test_dir, 'another_empty_top')
2061
+ another_empty_mid = os.path.join(another_empty_top, 'another_empty_mid')
2062
+ os.makedirs(another_empty_mid)
2063
+ remove_empty_folders(another_empty_top, remove_root=False)
2064
+ assert os.path.exists(another_empty_top) # Root not removed
2065
+ assert not os.path.exists(another_empty_mid) # Mid removed
2066
+
2067
+
2068
+ def test_path_join(self):
2069
+ """
2070
+ Test the path_join function.
2071
+ """
2072
+
2073
+ assert path_join('a', 'b', 'c') == 'a/b/c'
2074
+ assert path_join('a/b', 'c', 'd.txt') == 'a/b/c/d.txt'
2075
+ if os.name == 'nt':
2076
+ # On Windows, os.path.join uses '\', so convert_slashes=True should change it
2077
+ assert path_join('a', 'b', convert_slashes=True) == 'a/b'
2078
+ assert path_join('a', 'b', convert_slashes=False) == 'a\\b'
2079
+ assert path_join('c:\\', 'foo', 'bar', convert_slashes=True) == 'c:/foo/bar'
2080
+ assert path_join('c:\\', 'foo', 'bar', convert_slashes=False) == 'c:\\foo\\bar'
2081
+ else:
2082
+ # On POSIX, os.path.join uses '/', so convert_slashes=False should still be '/'
2083
+ assert path_join('a', 'b', convert_slashes=False) == 'a/b'
2084
+
2085
+ assert path_join('a', '', 'b') == 'a/b' # os.path.join behavior
2086
+ assert path_join('/a', 'b') == '/a/b'
2087
+ assert path_join('a', '/b') == '/b' # '/b' is absolute
2088
+
2089
+
2090
+ def test_filename_cleaning(self):
2091
+ """
2092
+ Test clean_filename, clean_path, and flatten_path functions.
2093
+ """
2094
+
2095
+ # clean_filename
2096
+ assert clean_filename("test file.txt") == "test file.txt"
2097
+ assert clean_filename("test*file?.txt", char_limit=10) == "testfile.t"
2098
+ assert clean_filename("TestFile.TXT", force_lower=True) == "testfile.txt"
2099
+ assert clean_filename("file:with<illegal>chars.txt") == "filewithillegalchars.txt"
2100
+ assert clean_filename(" accented_name_éà.txt") == " accented_name_ea.txt"
2101
+
2102
+ # Separators are not allowed by default in clean_filename
2103
+ assert clean_filename("path/to/file.txt") == "pathtofile.txt"
2104
+
2105
+ # clean_path
2106
+ assert clean_path("path/to/file.txt") == "path/to/file.txt" # slashes allowed
2107
+ assert clean_path("path\\to\\file.txt") == "path\\to\\file.txt" # backslashes allowed
2108
+ assert clean_path("path:to:file.txt") == "path:to:file.txt" # colons allowed
2109
+ assert clean_path("path/to<illegal>/file.txt") == "path/toillegal/file.txt"
2110
+
2111
+ # flatten_path
2112
+ assert flatten_path("path/to/file.txt") == "path~to~file.txt"
2113
+ assert flatten_path("path:to:file.txt", separator_char_replacement='_') == "path_to_file.txt"
2114
+ assert flatten_path("path\\to/file:name.txt") == "path~to~file~name.txt"
2115
+ assert flatten_path("path/to<illegal>/file.txt") == "path~toillegal~file.txt"
2116
+
2117
+
2118
+ def test_is_executable(self):
2119
+ """
2120
+ Test the is_executable function.
2121
+ This is a basic test; comprehensive testing is environment-dependent.
2122
+ """
2123
+
2124
+ # Hard to test reliably across all systems without knowing what's on PATH.
2125
+ if os.name == 'nt':
2126
+ assert is_executable('cmd.exe')
2127
+ assert not is_executable('non_existent_executable_blah_blah')
2128
+ else:
2129
+ assert is_executable('ls')
2130
+ assert is_executable('sh')
2131
+ assert not is_executable('non_existent_executable_blah_blah')
2132
+
2133
+
2134
+ def test_write_read_list_to_file(self):
2135
+ """
2136
+ Test write_list_to_file and read_list_from_file functions.
2137
+ """
2138
+
2139
+ test_list = ["item1", "item2 with space", "item3/with/slash"]
2140
+
2141
+ # Test with .json
2142
+ json_file_path = os.path.join(self.test_dir, "test_list.json")
2143
+ write_list_to_file(json_file_path, test_list)
2144
+ read_list_json = read_list_from_file(json_file_path)
2145
+ assert test_list == read_list_json
2146
+
2147
+ # Test with .txt
2148
+ txt_file_path = os.path.join(self.test_dir, "test_list.txt")
2149
+ write_list_to_file(txt_file_path, test_list)
2150
+ # read_list_from_file is specifically for JSON, so we read .txt manually
2151
+ with open(txt_file_path, 'r') as f:
2152
+ read_list_txt = [line.strip() for line in f.readlines()]
2153
+ assert test_list == read_list_txt
2154
+
2155
+ # Test reading non-existent json
2156
+ try:
2157
+ read_list_from_file(os.path.join(self.test_dir,"non_existent.json"))
2158
+ raise AssertionError("FileNotFoundError not raised")
2159
+ except FileNotFoundError:
2160
+ pass
2161
+
2162
+ # Test reading a non-json file with read_list_from_file (should fail parsing)
2163
+ non_json_path = os.path.join(self.test_dir, "not_a_list.json")
2164
+ with open(non_json_path, 'w') as f: f.write("this is not json")
2165
+ try:
2166
+ read_list_from_file(non_json_path)
2167
+ raise AssertionError("json.JSONDecodeError not raised")
2168
+ except json.JSONDecodeError:
2169
+ pass
2170
+
2171
+
2172
+ def test_parallel_copy_files(self):
2173
+ """
2174
+ Test the parallel_copy_files function (with max_workers=1 for test simplicity).
2175
+ """
2176
+
2177
+ source_dir = os.path.join(self.test_dir, "copy_source")
2178
+ target_dir = os.path.join(self.test_dir, "copy_target")
2179
+ os.makedirs(source_dir, exist_ok=True)
2180
+
2181
+ file_mappings = {}
2182
+ source_files_content = {}
2183
+
2184
+ for i in range(3):
2185
+ src_fn = f"file{i}.txt"
2186
+ src_path = os.path.join(source_dir, src_fn)
2187
+ if i == 0:
2188
+ tgt_fn = f"copied_file{i}.txt"
2189
+ tgt_path = os.path.join(target_dir, tgt_fn)
2190
+ else:
2191
+ tgt_fn = f"copied_file{i}_subdir.txt"
2192
+ tgt_path = os.path.join(target_dir, f"sub{i}", tgt_fn)
2193
+
2194
+ content = f"content of file {i}"
2195
+ with open(src_path, 'w') as f:
2196
+ f.write(content)
2197
+
2198
+ file_mappings[src_path] = tgt_path
2199
+ source_files_content[tgt_path] = content
2200
+
2201
+ # Test copy
2202
+ parallel_copy_files(file_mappings, max_workers=1, use_threads=True, overwrite=False)
2203
+ for tgt_path, expected_content in source_files_content.items():
2204
+ assert os.path.exists(tgt_path)
2205
+ with open(tgt_path, 'r') as f:
2206
+ assert f.read() == expected_content
2207
+
2208
+ existing_target_path = list(source_files_content.keys())[0]
2209
+ with open(existing_target_path, 'w') as f:
2210
+ f.write("old content")
2211
+
2212
+ parallel_copy_files(file_mappings, max_workers=1, use_threads=True, overwrite=False)
2213
+ with open(existing_target_path, 'r') as f:
2214
+ assert f.read() == "old content"
2215
+
2216
+ parallel_copy_files(file_mappings, max_workers=1, use_threads=True, overwrite=True)
2217
+ with open(existing_target_path, 'r') as f:
2218
+ assert f.read() == source_files_content[existing_target_path]
2219
+
2220
+ for src_path_orig, tgt_path_orig in file_mappings.items(): # Re-create source for move
2221
+ with open(src_path_orig, 'w') as f:
2222
+ f.write(source_files_content[tgt_path_orig])
2223
+
2224
+ parallel_copy_files(file_mappings, max_workers=1, use_threads=True, move=True, overwrite=True)
2225
+ for src_path, tgt_path in file_mappings.items():
2226
+ assert not os.path.exists(src_path)
2227
+ assert os.path.exists(tgt_path)
2228
+ with open(tgt_path, 'r') as f:
2229
+ assert f.read() == source_files_content[tgt_path]
2230
+
2231
+
2232
+ def test_get_file_sizes(self):
2233
+ """
2234
+ Test get_file_sizes and parallel_get_file_sizes functions.
2235
+ """
2236
+
2237
+ file_sizes_test_dir = os.path.join(self.test_dir,'file_sizes')
2238
+ os.makedirs(file_sizes_test_dir,exist_ok=True)
2239
+
2240
+ f1_path = os.path.join(file_sizes_test_dir, 'file1.txt')
2241
+ content1 = "0123456789" # 10 bytes
2242
+ with open(f1_path, 'w') as f:
2243
+ f.write(content1)
2244
+
2245
+ subdir_path = os.path.join(file_sizes_test_dir, 'subdir')
2246
+ os.makedirs(subdir_path, exist_ok=True)
2247
+ f2_path = os.path.join(subdir_path, 'file2.txt')
2248
+ content2 = "01234567890123456789" # 20 bytes
2249
+ with open(f2_path, 'w') as f:
2250
+ f.write(content2)
2251
+
2252
+ sizes_relative = get_file_sizes(file_sizes_test_dir)
2253
+ expected_sizes_relative = {
2254
+ 'file1.txt': len(content1),
2255
+ os.path.join('subdir', 'file2.txt').replace('\\','/'): len(content2)
2256
+ }
2257
+ assert sizes_relative == expected_sizes_relative
2258
+
2259
+ file_list_abs = [f1_path, f2_path]
2260
+ sizes_parallel_abs = parallel_get_file_sizes(file_list_abs, max_workers=1)
2261
+ expected_sizes_parallel_abs = {
2262
+ f1_path.replace('\\','/'): len(content1),
2263
+ f2_path.replace('\\','/'): len(content2)
2264
+ }
2265
+ assert sizes_parallel_abs == expected_sizes_parallel_abs
2266
+
2267
+ sizes_parallel_folder_abs = parallel_get_file_sizes(file_sizes_test_dir, max_workers=1, return_relative_paths=False)
2268
+ assert sizes_parallel_folder_abs == expected_sizes_parallel_abs
2269
+
2270
+ sizes_parallel_folder_rel = parallel_get_file_sizes(file_sizes_test_dir, max_workers=1, return_relative_paths=True)
2271
+ assert sizes_parallel_folder_rel == expected_sizes_relative
2272
+
2273
+ non_existent_file = os.path.join(file_sizes_test_dir, "no_such_file.txt")
2274
+ sizes_with_error = parallel_get_file_sizes([f1_path, non_existent_file], max_workers=1)
2275
+ expected_with_error = {
2276
+ f1_path.replace('\\','/'): len(content1),
2277
+ non_existent_file.replace('\\','/'): None
2278
+ }
2279
+ assert sizes_with_error == expected_with_error
2280
+
2281
+
2282
+ def test_zip_file_and_unzip_file(self):
2283
+ """
2284
+ Test zip_file and unzip_file functions.
2285
+ """
2286
+
2287
+ file_to_zip_name = "test_zip_me.txt"
2288
+ file_to_zip_path = os.path.join(self.test_dir, file_to_zip_name)
2289
+ content = "This is the content to be zipped."
2290
+ with open(file_to_zip_path, 'w') as f:
2291
+ f.write(content)
2292
+
2293
+ default_zip_output_path = file_to_zip_path + ".zip"
2294
+ returned_zip_path = zip_file(file_to_zip_path)
2295
+ assert returned_zip_path == default_zip_output_path
2296
+ assert os.path.exists(default_zip_output_path)
2297
+
2298
+ unzip_dir_default = os.path.join(self.test_dir, "unzip_default")
2299
+ os.makedirs(unzip_dir_default, exist_ok=True)
2300
+ unzip_file(default_zip_output_path, unzip_dir_default)
2301
+ unzipped_file_path_default = os.path.join(unzip_dir_default, file_to_zip_name)
2302
+ assert os.path.exists(unzipped_file_path_default)
2303
+ with open(unzipped_file_path_default, 'r') as f:
2304
+ assert f.read() == content
2305
+
2306
+ custom_zip_output_name = "custom_archive.zip"
2307
+ custom_zip_output_path = os.path.join(self.test_dir, custom_zip_output_name)
2308
+ zip_file(file_to_zip_path, output_fn=custom_zip_output_path, overwrite=True)
2309
+ assert os.path.exists(custom_zip_output_path)
2310
+
2311
+ zip_in_subdir_path = os.path.join(self.test_dir, "subdir_zip", "my.zip")
2312
+ file_in_subdir_name = "file_for_subdir_zip.txt"
2313
+ file_in_subdir_path = os.path.join(self.test_dir,"subdir_zip", file_in_subdir_name)
2314
+ os.makedirs(os.path.dirname(zip_in_subdir_path), exist_ok=True)
2315
+ with open(file_in_subdir_path, "w") as f: f.write("sub dir content")
2316
+ zip_file(file_in_subdir_path, output_fn=zip_in_subdir_path)
2317
+
2318
+ unzip_file(zip_in_subdir_path, output_folder=None)
2319
+ unzipped_in_same_dir_path = os.path.join(os.path.dirname(zip_in_subdir_path), file_in_subdir_name)
2320
+ assert os.path.exists(unzipped_in_same_dir_path)
2321
+ with open(unzipped_in_same_dir_path, 'r') as f:
2322
+ assert f.read() == "sub dir content"
2323
+
2324
+
2325
+ def test_zip_folder(self):
2326
+ """
2327
+ Test the zip_folder function.
2328
+ """
2329
+
2330
+ folder_to_zip = os.path.join(self.test_dir, "folder_to_zip")
2331
+ os.makedirs(folder_to_zip, exist_ok=True)
2332
+
2333
+ file1_name = "file1.txt"; path1 = os.path.join(folder_to_zip, file1_name)
2334
+ file2_name = "file2.log"; path2 = os.path.join(folder_to_zip, file2_name)
2335
+ subdir_name = "sub"; subdir_path = os.path.join(folder_to_zip, subdir_name)
2336
+ os.makedirs(subdir_path, exist_ok=True)
2337
+ file3_name = "file3.dat"; path3 = os.path.join(subdir_path, file3_name)
2338
+
2339
+ content1 = "content1"; content2 = "content2"; content3 = "content3"
2340
+ with open(path1, 'w') as f: f.write(content1)
2341
+ with open(path2, 'w') as f: f.write(content2)
2342
+ with open(path3, 'w') as f: f.write(content3)
2343
+
2344
+ default_zip_path = folder_to_zip + ".zip"
2345
+ zip_folder(folder_to_zip, output_fn=None, overwrite=True)
2346
+ assert os.path.exists(default_zip_path)
2347
+
2348
+ unzip_output_dir = os.path.join(self.test_dir, "unzipped_folder_content")
2349
+ os.makedirs(unzip_output_dir, exist_ok=True)
2350
+ unzip_file(default_zip_path, unzip_output_dir)
2351
+
2352
+ assert os.path.exists(os.path.join(unzip_output_dir, file1_name))
2353
+ assert os.path.exists(os.path.join(unzip_output_dir, file2_name))
2354
+ assert os.path.exists(os.path.join(unzip_output_dir, subdir_name, file3_name))
+ with open(os.path.join(unzip_output_dir, file1_name), 'r') as f: assert f.read() == content1
+ with open(os.path.join(unzip_output_dir, file2_name), 'r') as f: assert f.read() == content2
+ with open(os.path.join(unzip_output_dir, subdir_name, file3_name), 'r') as f: assert f.read() == content3
2358
+
2359
+ mtime_before = os.path.getmtime(default_zip_path)
2360
+ zip_folder(folder_to_zip, output_fn=None, overwrite=False)
2361
+ mtime_after = os.path.getmtime(default_zip_path)
2362
+ assert mtime_before == mtime_after
2363
+
2364
+
2365
+ def test_zip_files_into_single_zipfile(self):
2366
+ """
2367
+ Test zip_files_into_single_zipfile.
2368
+ """
2369
+
2370
+ file1_path = os.path.join(self.test_dir, "zfs_file1.txt")
2371
+ content1 = "content for zfs1"
2372
+ with open(file1_path, 'w') as f: f.write(content1)
2373
+
2374
+ subdir_for_zfs = os.path.join(self.test_dir, "zfs_subdir")
2375
+ os.makedirs(subdir_for_zfs, exist_ok=True)
2376
+ file2_path = os.path.join(subdir_for_zfs, "zfs_file2.log")
2377
+ content2 = "content for zfs2"
2378
+ with open(file2_path, 'w') as f: f.write(content2)
2379
+
2380
+ input_files = [file1_path, file2_path]
2381
+ output_zip_path = os.path.join(self.test_dir, "multi_file_archive.zip")
2382
+ zip_files_into_single_zipfile(input_files, output_zip_path, arc_name_base=self.test_dir, overwrite=True)
2383
+ assert os.path.exists(output_zip_path)
2384
+
2385
+ unzip_dir = os.path.join(self.test_dir, "unzip_multi_file")
2386
+ os.makedirs(unzip_dir, exist_ok=True)
2387
+ unzip_file(output_zip_path, unzip_dir)
2388
+
2389
+ expected_unzipped_file1 = os.path.join(unzip_dir, os.path.relpath(file1_path, self.test_dir))
2390
+ expected_unzipped_file2 = os.path.join(unzip_dir, os.path.relpath(file2_path, self.test_dir))
2391
+
2392
+ assert os.path.exists(expected_unzipped_file1)
2393
+ with open(expected_unzipped_file1, 'r') as f: assert f.read() == content1
2394
+ assert os.path.exists(expected_unzipped_file2)
2395
+ assert os.path.basename(expected_unzipped_file2) == "zfs_file2.log"
2396
+ assert os.path.basename(os.path.dirname(expected_unzipped_file2)) == "zfs_subdir"
2397
+ with open(expected_unzipped_file2, 'r') as f: assert f.read() == content2
2398
+
2399
+
2400
+ def test_add_files_to_single_tar_file(self):
2401
+ """
2402
+ Test add_files_to_single_tar_file.
2403
+ """
2404
+
2405
+ file1_path = os.path.join(self.test_dir, "tar_file1.txt")
2406
+ content1 = "content for tar1"
2407
+ with open(file1_path, 'w') as f: f.write(content1)
2408
+
2409
+ subdir_for_tar = os.path.join(self.test_dir, "tar_subdir")
2410
+ os.makedirs(subdir_for_tar, exist_ok=True)
2411
+ file2_path = os.path.join(subdir_for_tar, "tar_file2.log")
2412
+ content2 = "content for tar2"
2413
+ with open(file2_path, 'w') as f: f.write(content2)
2414
+
2415
+ input_files = [file1_path, file2_path]
2416
+ output_tar_path = os.path.join(self.test_dir, "archive.tar.gz")
2417
+
2418
+ add_files_to_single_tar_file(input_files, output_tar_path, arc_name_base=self.test_dir,
2419
+ overwrite=True, mode='x:gz')
2420
+ assert os.path.exists(output_tar_path)
2421
+
2422
+ un_tar_dir = os.path.join(self.test_dir, "un_tar_contents")
2423
+ os.makedirs(un_tar_dir, exist_ok=True)
2424
+ with tarfile.open(output_tar_path, 'r:gz') as tf:
2425
+ tf.extractall(path=un_tar_dir)
2426
+
2427
+ expected_untarred_file1 = os.path.join(un_tar_dir, os.path.relpath(file1_path, self.test_dir))
2428
+ expected_untarred_file2 = os.path.join(un_tar_dir, os.path.relpath(file2_path, self.test_dir))
2429
+
2430
+ assert os.path.exists(expected_untarred_file1)
2431
+ with open(expected_untarred_file1, 'r') as f: assert f.read() == content1
2432
+ assert os.path.exists(expected_untarred_file2)
2433
+ with open(expected_untarred_file2, 'r') as f: assert f.read() == content2
2434
+
2435
+
2436
+ def test_parallel_zip_individual_files_and_folders(self):
2437
+ """
2438
+ Test parallel_zip_files, parallel_zip_folders, and zip_each_file_in_folder.
2439
+ """
2440
+
2441
+ file1_to_zip = os.path.join(self.test_dir, "pz_file1.txt")
2442
+ file2_to_zip = os.path.join(self.test_dir, "pz_file2.txt")
2443
+ with open(file1_to_zip, 'w') as f: f.write("pz_content1")
2444
+ with open(file2_to_zip, 'w') as f: f.write("pz_content2")
2445
+
2446
+ parallel_zip_files([file1_to_zip, file2_to_zip], max_workers=1, overwrite=True)
2447
+ assert os.path.exists(file1_to_zip + ".zip")
2448
+ assert os.path.exists(file2_to_zip + ".zip")
2449
+ unzip_dir_pz = os.path.join(self.test_dir, "unzip_pz")
2450
+ unzip_file(file1_to_zip + ".zip", unzip_dir_pz)
2451
+ assert os.path.exists(os.path.join(unzip_dir_pz, os.path.basename(file1_to_zip)))
2452
+
2453
+ folder1_to_zip = os.path.join(self.test_dir, "pz_folder1")
2454
+ os.makedirs(folder1_to_zip, exist_ok=True)
2455
+ with open(os.path.join(folder1_to_zip, "pf1.txt"), 'w') as f: f.write("pf1_content")
2456
+ folder2_to_zip = os.path.join(self.test_dir, "pz_folder2")
2457
+ os.makedirs(folder2_to_zip, exist_ok=True)
2458
+ with open(os.path.join(folder2_to_zip, "pf2.txt"), 'w') as f: f.write("pf2_content")
2459
+
2460
+ parallel_zip_folders([folder1_to_zip, folder2_to_zip], max_workers=1, overwrite=True)
2461
+ assert os.path.exists(folder1_to_zip + ".zip")
2462
+ assert os.path.exists(folder2_to_zip + ".zip")
2463
+ unzip_dir_pzf = os.path.join(self.test_dir, "unzip_pzf")
2464
+ unzip_file(folder1_to_zip + ".zip", unzip_dir_pzf)
2465
+ assert os.path.exists(os.path.join(unzip_dir_pzf, "pf1.txt"))
2466
+
2467
+ zef_folder = os.path.join(self.test_dir, "zef_test_folder")
2468
+ os.makedirs(zef_folder, exist_ok=True)
2469
+ zef_file1 = os.path.join(zef_folder, "zef1.txt")
2470
+ zef_file2_png = os.path.join(zef_folder, "zef2.png")
2471
+ zef_file3_zip = os.path.join(zef_folder, "zef3.zip")
2472
+ zef_subdir = os.path.join(zef_folder, "zef_sub")
2473
+ os.makedirs(zef_subdir, exist_ok=True)
2474
+ zef_file_in_sub = os.path.join(zef_subdir, "zef_subfile.txt")
2475
+
2476
+ for p_path in [zef_file1, zef_file2_png, zef_file3_zip, zef_file_in_sub]:
2477
+ with open(p_path, 'w') as f: f.write(f"content of {os.path.basename(p_path)}")
2478
+
2479
+ zip_each_file_in_folder(zef_folder, recursive=False, max_workers=1, overwrite=True)
2480
+ assert os.path.exists(zef_file1 + ".zip")
2481
+ assert os.path.exists(zef_file2_png + ".zip")
2482
+ assert not os.path.exists(zef_file3_zip + ".zip")
2483
+ assert not os.path.exists(zef_file_in_sub + ".zip")
2484
+
2485
+ if os.path.exists(zef_file1 + ".zip"): os.remove(zef_file1 + ".zip")
2486
+ if os.path.exists(zef_file2_png + ".zip"): os.remove(zef_file2_png + ".zip")
2487
+
2488
+ zip_each_file_in_folder(zef_folder, recursive=True, max_workers=1, overwrite=True)
2489
+ assert os.path.exists(zef_file1 + ".zip")
2490
+ assert os.path.exists(zef_file2_png + ".zip")
2491
+ assert not os.path.exists(zef_file3_zip + ".zip")
2492
+ assert os.path.exists(zef_file_in_sub + ".zip")
2493
+
2494
+ if os.path.exists(zef_file1 + ".zip"): os.remove(zef_file1 + ".zip")
2495
+ if os.path.exists(zef_file2_png + ".zip"): os.remove(zef_file2_png + ".zip")
2496
+ if os.path.exists(zef_file_in_sub + ".zip"): os.remove(zef_file_in_sub + ".zip")
2497
+ zip_each_file_in_folder(zef_folder, recursive=True, required_token="zef1", max_workers=1, overwrite=True)
2498
+ assert os.path.exists(zef_file1 + ".zip")
2499
+ assert not os.path.exists(zef_file2_png + ".zip")
2500
+ assert not os.path.exists(zef_file_in_sub + ".zip")
2501
+
2502
+ if os.path.exists(zef_file1 + ".zip"): os.remove(zef_file1 + ".zip")
2503
+ dummy_to_zip = os.path.join(zef_folder,"dummy.txt")
2504
+ with open(dummy_to_zip,'w') as f: f.write('d')
2505
+ zip_each_file_in_folder(zef_folder, recursive=False, exclude_zip=False, max_workers=1, overwrite=True)
2506
+ assert os.path.exists(dummy_to_zip + ".zip")
2507
+ assert os.path.exists(zef_file3_zip + ".zip")
2508
+ if os.path.exists(dummy_to_zip + ".zip"): os.remove(dummy_to_zip + ".zip")
2509
+ if os.path.exists(zef_file3_zip + ".zip"): os.remove(zef_file3_zip + ".zip")
2510
+
2511
+
2512
+ def test_compute_file_hash(self):
2513
+ """
2514
+ Test compute_file_hash and parallel_compute_file_hashes.
2515
+ """
2516
+
2517
+ file1_name = "hash_me1.txt"
2518
+ file1_path = os.path.join(self.test_dir, file1_name)
2519
+ content1 = "This is a test string for hashing."
2520
+ with open(file1_path, 'w') as f:
2521
+ f.write(content1)
2522
+
2523
+ file2_name = "hash_me2.txt"
2524
+ file2_path = os.path.join(self.test_dir, file2_name)
2525
+ with open(file2_path, 'w') as f:
2526
+ f.write(content1)
2527
+
2528
+ file3_name = "hash_me3.txt"
2529
+ file3_path = os.path.join(self.test_dir, file3_name)
2530
+ content3 = "This is a different test string for hashing."
2531
+ with open(file3_path, 'w') as f:
2532
+ f.write(content3)
2533
+
2534
+ expected_hash_content1_sha256 = \
2535
+ "c56f19d76df6a09e49fe0d9ce7b1bc7f1dbd582f668742bede65c54c47d5bcf4".lower()
2536
+ expected_hash_content3_sha256 = \
2537
+ "23013ff7e93264317f7b2fc0e9a217649f2dc0b11ca7e0bd49632424b70b6680".lower()
2538
+
2539
+ hash1 = compute_file_hash(file1_path)
2540
+ hash2 = compute_file_hash(file2_path)
2541
+ hash3 = compute_file_hash(file3_path)
2542
+ assert hash1 == expected_hash_content1_sha256
2543
+ assert hash2 == expected_hash_content1_sha256
2544
+ assert hash1 != hash3
2545
+ assert hash3 == expected_hash_content3_sha256
2546
+
2547
+ expected_hash_content1_md5 = "94b971f1f8cdb23c2af82af73160d4b0".lower()
2548
+ hash1_md5 = compute_file_hash(file1_path, algorithm='md5')
2549
+ assert hash1_md5 == expected_hash_content1_md5
2550
+
2551
+ non_existent_path = os.path.join(self.test_dir, "no_such_file.txt")
2552
+ assert compute_file_hash(non_existent_path, allow_failures=True) is None
2553
+ try:
2554
+ compute_file_hash(non_existent_path, allow_failures=False)
2555
+ raise AssertionError("FileNotFoundError not raised for compute_file_hash")
2556
+ except FileNotFoundError:
2557
+ pass
2558
+
2559
+ files_to_hash = [file1_path, file3_path, non_existent_path]
2560
+ hashes_parallel = parallel_compute_file_hashes(files_to_hash, max_workers=1)
2561
+
2562
+ norm_f1 = file1_path.replace('\\','/')
2563
+ norm_f3 = file3_path.replace('\\','/')
2564
+ norm_non = non_existent_path.replace('\\','/')
2565
+
2566
+ expected_parallel_hashes = {
2567
+ norm_f1: expected_hash_content1_sha256,
2568
+ norm_f3: expected_hash_content3_sha256,
2569
+ norm_non: None
2570
+ }
2571
+ hashes_parallel_norm = {k.replace('\\','/'): v for k,v in hashes_parallel.items()}
2572
+ assert hashes_parallel_norm == expected_parallel_hashes
2573
+
2574
+ hash_folder = os.path.join(self.test_dir, "hash_test_folder")
2575
+ os.makedirs(hash_folder, exist_ok=True)
2576
+ h_f1_name = "h_f1.txt"; h_f1_path = os.path.join(hash_folder, h_f1_name)
2577
+ h_f2_name = "h_f2.txt"; h_f2_path = os.path.join(hash_folder, h_f2_name)
2578
+ with open(h_f1_path, 'w') as f: f.write(content1)
2579
+ with open(h_f2_path, 'w') as f: f.write(content3)
2580
+
2581
+ hashes_folder_parallel = parallel_compute_file_hashes(hash_folder, recursive=False, max_workers=1)
2582
+ norm_hf1 = h_f1_path.replace('\\','/')
2583
+ norm_hf2 = h_f2_path.replace('\\','/')
2584
+ expected_folder_hashes = {
2585
+ norm_hf1: expected_hash_content1_sha256,
2586
+ norm_hf2: expected_hash_content3_sha256
2587
+ }
2588
+ hashes_folder_parallel_norm = {k.replace('\\','/'): v for k,v in hashes_folder_parallel.items()}
2589
+ assert hashes_folder_parallel_norm == expected_folder_hashes
2590
+
2591
+
2592
+ def test_path_utils():
2593
+ """
2594
+ Runs all tests in the TestPathUtils class.
2595
+ """
2596
+
2597
+ test_instance = TestPathUtils()
2598
+ test_instance.set_up()
2599
+ try:
2600
+ test_instance.test_is_image_file()
2601
+ test_instance.test_find_image_strings()
2602
+ test_instance.test_find_images()
2603
+ test_instance.test_recursive_file_list_and_file_list()
2604
+ test_instance.test_folder_list()
2605
+ test_instance.test_folder_summary()
2606
+ test_instance.test_fileparts()
2607
+ test_instance.test_insert_before_extension()
2608
+ test_instance.test_split_path()
2609
+ test_instance.test_path_is_abs()
2610
+ test_instance.test_safe_create_link_unix()
2611
+ test_instance.test_remove_empty_folders()
2612
+ test_instance.test_path_join()
2613
+ test_instance.test_filename_cleaning()
2614
+ test_instance.test_is_executable()
2615
+ test_instance.test_write_read_list_to_file()
2616
+ test_instance.test_parallel_copy_files()
2617
+ test_instance.test_get_file_sizes()
2618
+ test_instance.test_zip_file_and_unzip_file()
2619
+ test_instance.test_zip_folder()
2620
+ test_instance.test_zip_files_into_single_zipfile()
2621
+ test_instance.test_add_files_to_single_tar_file()
2622
+ test_instance.test_parallel_zip_individual_files_and_folders()
2623
+ test_instance.test_compute_file_hash()
2624
+ finally:
2625
+ test_instance.tear_down()
2626
+
2627
+ # from IPython import embed; embed()
2628
+ # test_path_utils()
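If this suite is run stand-alone, a minimal entry point might look like the sketch below; test_path_utils() calls set_up() and tear_down() explicitly, so no test framework is required (the methods deliberately don't follow pytest's setup_method/teardown_method naming):

    # Stand-alone invocation sketch; test_path_utils() manages setup/teardown itself
    if __name__ == '__main__':
        test_path_utils()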