megadetector 5.0.27__py3-none-any.whl → 5.0.29__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of megadetector might be problematic.

Files changed (176)
  1. megadetector/api/batch_processing/api_core/batch_service/score.py +4 -5
  2. megadetector/api/batch_processing/api_core_support/aggregate_results_manually.py +1 -1
  3. megadetector/api/batch_processing/api_support/summarize_daily_activity.py +1 -1
  4. megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +2 -2
  5. megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +1 -1
  6. megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +1 -1
  7. megadetector/api/synchronous/api_core/tests/load_test.py +2 -3
  8. megadetector/classification/aggregate_classifier_probs.py +3 -3
  9. megadetector/classification/analyze_failed_images.py +5 -5
  10. megadetector/classification/cache_batchapi_outputs.py +5 -5
  11. megadetector/classification/create_classification_dataset.py +11 -12
  12. megadetector/classification/crop_detections.py +10 -10
  13. megadetector/classification/csv_to_json.py +8 -8
  14. megadetector/classification/detect_and_crop.py +13 -15
  15. megadetector/classification/evaluate_model.py +7 -7
  16. megadetector/classification/identify_mislabeled_candidates.py +6 -6
  17. megadetector/classification/json_to_azcopy_list.py +1 -1
  18. megadetector/classification/json_validator.py +29 -32
  19. megadetector/classification/map_classification_categories.py +9 -9
  20. megadetector/classification/merge_classification_detection_output.py +12 -9
  21. megadetector/classification/prepare_classification_script.py +19 -19
  22. megadetector/classification/prepare_classification_script_mc.py +23 -23
  23. megadetector/classification/run_classifier.py +4 -4
  24. megadetector/classification/save_mislabeled.py +6 -6
  25. megadetector/classification/train_classifier.py +1 -1
  26. megadetector/classification/train_classifier_tf.py +9 -9
  27. megadetector/classification/train_utils.py +10 -10
  28. megadetector/data_management/annotations/annotation_constants.py +1 -1
  29. megadetector/data_management/camtrap_dp_to_coco.py +45 -45
  30. megadetector/data_management/cct_json_utils.py +101 -101
  31. megadetector/data_management/cct_to_md.py +49 -49
  32. megadetector/data_management/cct_to_wi.py +33 -33
  33. megadetector/data_management/coco_to_labelme.py +75 -75
  34. megadetector/data_management/coco_to_yolo.py +189 -189
  35. megadetector/data_management/databases/add_width_and_height_to_db.py +3 -2
  36. megadetector/data_management/databases/combine_coco_camera_traps_files.py +38 -38
  37. megadetector/data_management/databases/integrity_check_json_db.py +202 -188
  38. megadetector/data_management/databases/subset_json_db.py +33 -33
  39. megadetector/data_management/generate_crops_from_cct.py +38 -38
  40. megadetector/data_management/get_image_sizes.py +54 -49
  41. megadetector/data_management/labelme_to_coco.py +130 -124
  42. megadetector/data_management/labelme_to_yolo.py +78 -72
  43. megadetector/data_management/lila/create_lila_blank_set.py +81 -83
  44. megadetector/data_management/lila/create_lila_test_set.py +32 -31
  45. megadetector/data_management/lila/create_links_to_md_results_files.py +18 -18
  46. megadetector/data_management/lila/download_lila_subset.py +21 -24
  47. megadetector/data_management/lila/generate_lila_per_image_labels.py +91 -91
  48. megadetector/data_management/lila/get_lila_annotation_counts.py +30 -30
  49. megadetector/data_management/lila/get_lila_image_counts.py +22 -22
  50. megadetector/data_management/lila/lila_common.py +70 -70
  51. megadetector/data_management/lila/test_lila_metadata_urls.py +13 -14
  52. megadetector/data_management/mewc_to_md.py +339 -340
  53. megadetector/data_management/ocr_tools.py +258 -252
  54. megadetector/data_management/read_exif.py +232 -223
  55. megadetector/data_management/remap_coco_categories.py +26 -26
  56. megadetector/data_management/remove_exif.py +31 -20
  57. megadetector/data_management/rename_images.py +187 -187
  58. megadetector/data_management/resize_coco_dataset.py +41 -41
  59. megadetector/data_management/speciesnet_to_md.py +41 -41
  60. megadetector/data_management/wi_download_csv_to_coco.py +55 -55
  61. megadetector/data_management/yolo_output_to_md_output.py +117 -120
  62. megadetector/data_management/yolo_to_coco.py +195 -188
  63. megadetector/detection/change_detection.py +831 -0
  64. megadetector/detection/process_video.py +341 -338
  65. megadetector/detection/pytorch_detector.py +308 -266
  66. megadetector/detection/run_detector.py +186 -166
  67. megadetector/detection/run_detector_batch.py +366 -364
  68. megadetector/detection/run_inference_with_yolov5_val.py +328 -325
  69. megadetector/detection/run_tiled_inference.py +312 -253
  70. megadetector/detection/tf_detector.py +24 -24
  71. megadetector/detection/video_utils.py +291 -283
  72. megadetector/postprocessing/add_max_conf.py +15 -11
  73. megadetector/postprocessing/categorize_detections_by_size.py +44 -44
  74. megadetector/postprocessing/classification_postprocessing.py +808 -311
  75. megadetector/postprocessing/combine_batch_outputs.py +20 -21
  76. megadetector/postprocessing/compare_batch_results.py +528 -517
  77. megadetector/postprocessing/convert_output_format.py +97 -97
  78. megadetector/postprocessing/create_crop_folder.py +220 -147
  79. megadetector/postprocessing/detector_calibration.py +173 -168
  80. megadetector/postprocessing/generate_csv_report.py +508 -0
  81. megadetector/postprocessing/load_api_results.py +25 -22
  82. megadetector/postprocessing/md_to_coco.py +129 -98
  83. megadetector/postprocessing/md_to_labelme.py +89 -83
  84. megadetector/postprocessing/md_to_wi.py +40 -40
  85. megadetector/postprocessing/merge_detections.py +87 -114
  86. megadetector/postprocessing/postprocess_batch_results.py +319 -302
  87. megadetector/postprocessing/remap_detection_categories.py +36 -36
  88. megadetector/postprocessing/render_detection_confusion_matrix.py +205 -199
  89. megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +57 -57
  90. megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +27 -28
  91. megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +702 -677
  92. megadetector/postprocessing/separate_detections_into_folders.py +226 -211
  93. megadetector/postprocessing/subset_json_detector_output.py +265 -262
  94. megadetector/postprocessing/top_folders_to_bottom.py +45 -45
  95. megadetector/postprocessing/validate_batch_results.py +70 -70
  96. megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +52 -52
  97. megadetector/taxonomy_mapping/map_new_lila_datasets.py +15 -15
  98. megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +14 -14
  99. megadetector/taxonomy_mapping/preview_lila_taxonomy.py +66 -69
  100. megadetector/taxonomy_mapping/retrieve_sample_image.py +16 -16
  101. megadetector/taxonomy_mapping/simple_image_download.py +8 -8
  102. megadetector/taxonomy_mapping/species_lookup.py +33 -33
  103. megadetector/taxonomy_mapping/taxonomy_csv_checker.py +14 -14
  104. megadetector/taxonomy_mapping/taxonomy_graph.py +11 -11
  105. megadetector/taxonomy_mapping/validate_lila_category_mappings.py +13 -13
  106. megadetector/utils/azure_utils.py +22 -22
  107. megadetector/utils/ct_utils.py +1019 -200
  108. megadetector/utils/directory_listing.py +21 -77
  109. megadetector/utils/gpu_test.py +22 -22
  110. megadetector/utils/md_tests.py +541 -518
  111. megadetector/utils/path_utils.py +1511 -406
  112. megadetector/utils/process_utils.py +41 -41
  113. megadetector/utils/sas_blob_utils.py +53 -49
  114. megadetector/utils/split_locations_into_train_val.py +73 -60
  115. megadetector/utils/string_utils.py +147 -26
  116. megadetector/utils/url_utils.py +463 -173
  117. megadetector/utils/wi_utils.py +2629 -2868
  118. megadetector/utils/write_html_image_list.py +137 -137
  119. megadetector/visualization/plot_utils.py +21 -21
  120. megadetector/visualization/render_images_with_thumbnails.py +37 -73
  121. megadetector/visualization/visualization_utils.py +424 -404
  122. megadetector/visualization/visualize_db.py +197 -190
  123. megadetector/visualization/visualize_detector_output.py +126 -98
  124. {megadetector-5.0.27.dist-info → megadetector-5.0.29.dist-info}/METADATA +6 -3
  125. megadetector-5.0.29.dist-info/RECORD +163 -0
  126. {megadetector-5.0.27.dist-info → megadetector-5.0.29.dist-info}/WHEEL +1 -1
  127. megadetector/data_management/importers/add_nacti_sizes.py +0 -52
  128. megadetector/data_management/importers/add_timestamps_to_icct.py +0 -79
  129. megadetector/data_management/importers/animl_results_to_md_results.py +0 -158
  130. megadetector/data_management/importers/auckland_doc_test_to_json.py +0 -373
  131. megadetector/data_management/importers/auckland_doc_to_json.py +0 -201
  132. megadetector/data_management/importers/awc_to_json.py +0 -191
  133. megadetector/data_management/importers/bellevue_to_json.py +0 -272
  134. megadetector/data_management/importers/cacophony-thermal-importer.py +0 -793
  135. megadetector/data_management/importers/carrizo_shrubfree_2018.py +0 -269
  136. megadetector/data_management/importers/carrizo_trail_cam_2017.py +0 -289
  137. megadetector/data_management/importers/cct_field_adjustments.py +0 -58
  138. megadetector/data_management/importers/channel_islands_to_cct.py +0 -913
  139. megadetector/data_management/importers/eMammal/copy_and_unzip_emammal.py +0 -180
  140. megadetector/data_management/importers/eMammal/eMammal_helpers.py +0 -249
  141. megadetector/data_management/importers/eMammal/make_eMammal_json.py +0 -223
  142. megadetector/data_management/importers/ena24_to_json.py +0 -276
  143. megadetector/data_management/importers/filenames_to_json.py +0 -386
  144. megadetector/data_management/importers/helena_to_cct.py +0 -283
  145. megadetector/data_management/importers/idaho-camera-traps.py +0 -1407
  146. megadetector/data_management/importers/idfg_iwildcam_lila_prep.py +0 -294
  147. megadetector/data_management/importers/import_desert_lion_conservation_camera_traps.py +0 -387
  148. megadetector/data_management/importers/jb_csv_to_json.py +0 -150
  149. megadetector/data_management/importers/mcgill_to_json.py +0 -250
  150. megadetector/data_management/importers/missouri_to_json.py +0 -490
  151. megadetector/data_management/importers/nacti_fieldname_adjustments.py +0 -79
  152. megadetector/data_management/importers/noaa_seals_2019.py +0 -181
  153. megadetector/data_management/importers/osu-small-animals-to-json.py +0 -364
  154. megadetector/data_management/importers/pc_to_json.py +0 -365
  155. megadetector/data_management/importers/plot_wni_giraffes.py +0 -123
  156. megadetector/data_management/importers/prepare_zsl_imerit.py +0 -131
  157. megadetector/data_management/importers/raic_csv_to_md_results.py +0 -416
  158. megadetector/data_management/importers/rspb_to_json.py +0 -356
  159. megadetector/data_management/importers/save_the_elephants_survey_A.py +0 -320
  160. megadetector/data_management/importers/save_the_elephants_survey_B.py +0 -329
  161. megadetector/data_management/importers/snapshot_safari_importer.py +0 -758
  162. megadetector/data_management/importers/snapshot_serengeti_lila.py +0 -1067
  163. megadetector/data_management/importers/snapshotserengeti/make_full_SS_json.py +0 -150
  164. megadetector/data_management/importers/snapshotserengeti/make_per_season_SS_json.py +0 -153
  165. megadetector/data_management/importers/sulross_get_exif.py +0 -65
  166. megadetector/data_management/importers/timelapse_csv_set_to_json.py +0 -490
  167. megadetector/data_management/importers/ubc_to_json.py +0 -399
  168. megadetector/data_management/importers/umn_to_json.py +0 -507
  169. megadetector/data_management/importers/wellington_to_json.py +0 -263
  170. megadetector/data_management/importers/wi_to_json.py +0 -442
  171. megadetector/data_management/importers/zamba_results_to_md_results.py +0 -180
  172. megadetector/data_management/lila/add_locations_to_island_camera_traps.py +0 -101
  173. megadetector/data_management/lila/add_locations_to_nacti.py +0 -151
  174. megadetector-5.0.27.dist-info/RECORD +0 -208
  175. {megadetector-5.0.27.dist-info → megadetector-5.0.29.dist-info}/licenses/LICENSE +0 -0
  176. {megadetector-5.0.27.dist-info → megadetector-5.0.29.dist-info}/top_level.txt +0 -0

megadetector/utils/path_utils.py

@@ -24,6 +24,7 @@ import tarfile
  import webbrowser
  import subprocess
  import re
+ import tempfile
 
  from zipfile import ZipFile
  from datetime import datetime
@@ -34,6 +35,7 @@ from shutil import which
  from tqdm import tqdm
 
  from megadetector.utils.ct_utils import is_iterable
+ from megadetector.utils.ct_utils import make_test_folder
  from megadetector.utils.ct_utils import sort_dictionary_by_value
 
  # Should all be lower-case
@@ -47,14 +49,14 @@ CHAR_LIMIT = 255
 
  #%% General path functions
 
- def recursive_file_list(base_dir,
-                         convert_slashes=True,
-                         return_relative_paths=False,
+ def recursive_file_list(base_dir,
+                         convert_slashes=True,
+                         return_relative_paths=False,
                          sort_files=True,
                          recursive=True):
      r"""
      Enumerates files (not directories) in [base_dir].
-
+
      Args:
          base_dir (str): folder to enumerate
          convert_slashes (bool, optional): force forward slashes; if this is False, will use
@@ -64,13 +66,13 @@ def recursive_file_list(base_dir,
          sort_files (bool, optional): force files to be sorted, otherwise uses the sorting
              provided by os.walk()
          recursive (bool, optional): enumerate recursively
-
+
      Returns:
          list: list of filenames
      """
-
+
      assert os.path.isdir(base_dir), '{} is not a folder'.format(base_dir)
-
+
      all_files = []
 
      if recursive:
@@ -82,29 +84,29 @@
          all_files_relative = os.listdir(base_dir)
          all_files = [os.path.join(base_dir,fn) for fn in all_files_relative]
          all_files = [fn for fn in all_files if os.path.isfile(fn)]
-
+
      if return_relative_paths:
          all_files = [os.path.relpath(fn,base_dir) for fn in all_files]
 
      if convert_slashes:
          all_files = [fn.replace('\\', '/') for fn in all_files]
-
+
      if sort_files:
          all_files = sorted(all_files)
-
+
      return all_files
 
 
- def file_list(base_dir,
+ def file_list(base_dir,
                convert_slashes=True,
-               return_relative_paths=False,
-               sort_files=True,
+               return_relative_paths=False,
+               sort_files=True,
                recursive=False):
      """
-     Trivial wrapper for recursive_file_list, which was a poor function name choice
-     at the time, since I later wanted to add non-recursive lists, but it doesn't
+     Trivial wrapper for recursive_file_list, which was a poor function name choice
+     at the time, since I later wanted to add non-recursive lists, but it doesn't
      make sense to have a "recursive" option in a function called "recursive_file_list".
-
+
      Args:
          base_dir (str): folder to enumerate
          convert_slashes (bool, optional): force forward slashes; if this is False, will use
@@ -114,11 +116,11 @@ def file_list(base_dir,
          sort_files (bool, optional): force files to be sorted, otherwise uses the sorting
              provided by os.walk()
          recursive (bool, optional): enumerate recursively
-
+
      Returns:
-         list: list of filenames
+         list: list of filenames
      """
-
+
      return recursive_file_list(base_dir,convert_slashes,return_relative_paths,sort_files,
                                 recursive=recursive)
 
@@ -128,10 +130,9 @@ def folder_list(base_dir,
                  return_relative_paths=False,
                  sort_folders=True,
                  recursive=False):
-
      """
      Enumerates folders (not files) in [base_dir].
-
+
      Args:
          base_dir (str): folder to enumerate
          convert_slashes (bool, optional): force forward slashes; if this is False, will use
@@ -141,81 +142,81 @@
          sort_files (bool, optional): force folders to be sorted, otherwise uses the sorting
              provided by os.walk()
          recursive (bool, optional): enumerate recursively
-
+
      Returns:
          list: list of folder names
      """
-
+
      assert os.path.isdir(base_dir), '{} is not a folder'.format(base_dir)
-
+
      folders = []
 
-     if recursive:
+     if recursive:
          folders = []
          for root, dirs, _ in os.walk(base_dir):
              for d in dirs:
-                 folders.append(os.path.join(root, d))
+                 folders.append(os.path.join(root, d))
      else:
          folders = os.listdir(base_dir)
          folders = [os.path.join(base_dir,fn) for fn in folders]
          folders = [fn for fn in folders if os.path.isdir(fn)]
-
+
      if return_relative_paths:
          folders = [os.path.relpath(fn,base_dir) for fn in folders]
 
      if convert_slashes:
          folders = [fn.replace('\\', '/') for fn in folders]
-
+
      if sort_folders:
-         folders = sorted(folders)
-
+         folders = sorted(folders)
+
      return folders
 
 
  def folder_summary(folder,print_summary=True):
      """
      Returns (and optionally prints) a summary of [folder], including:
-
+
      * The total number of files
      * The total number of folders
-     * The number of files for each extension
-
+     * The number of files for each extension
+
      Args:
          folder (str): folder to summarize
          print_summary (bool, optional): whether to print the summary
-
+
      Returns:
          dict: with fields "n_files", "n_folders", and "extension_to_count"
      """
-
+
      assert os.path.isdir(folder), '{} is not a folder'.format(folder)
-
+
      folders_relative = folder_list(folder,return_relative_paths=True,recursive=True)
      files_relative = file_list(folder,return_relative_paths=True,recursive=True)
-
+
      extension_to_count = defaultdict(int)
-
+
      for fn in files_relative:
          ext = os.path.splitext(fn)[1]
          extension_to_count[ext] += 1
-
+
      extension_to_count = sort_dictionary_by_value(extension_to_count,reverse=True)
-
+
      if print_summary:
          for extension in extension_to_count.keys():
              print('{}: {}'.format(extension,extension_to_count[extension]))
          print('')
          print('Total files: {}'.format(len(files_relative)))
          print('Total folders: {}'.format(len(folders_relative)))
-
+
      to_return = {}
      to_return['n_files'] = len(files_relative)
      to_return['n_folders'] = len(folders_relative)
-     to_return['extension_to_count'] = extension_to_count
-
+     to_return['extension_to_count'] = extension_to_count
+
      return to_return
-
-
+
+
  def fileparts(path):
      r"""
      Breaks down a path into the directory path, filename, and extension.
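
The folder_summary() changes above are whitespace-only; for reference, it returns a dict with fields "n_files", "n_folders", and "extension_to_count". A minimal usage sketch; the '/data/images' path is hypothetical:

    from megadetector.utils.path_utils import folder_summary

    # Count files and folders, and tally files by extension
    summary = folder_summary('/data/images', print_summary=False)
    print(summary['n_files'], summary['n_folders'])
    print(summary['extension_to_count'])  # e.g. {'.jpg': 1500, '.json': 2}
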
@@ -223,25 +224,25 @@ def fileparts(path):
      Note that the '.' lives with the extension, and separators are removed.
 
      Examples:
-
+
      .. code-block:: none
 
-         >>> fileparts('file')
+         >>> fileparts('file')
          ('', 'file', '')
          >>> fileparts(r'c:/dir/file.jpg')
          ('c:/dir', 'file', '.jpg')
          >>> fileparts('/dir/subdir/file.jpg')
-         ('/dir/subdir', 'file', '.jpg')
+         ('/dir/subdir', 'file', '.jpg')
 
      Args:
          path (str): path name to separate into parts
      Returns:
-         tuple: tuple containing (p,n,e):
+         tuple: tuple containing (p,n,e):
          - p: str, directory path
          - n: str, filename without extension
          - e: str, extension including the '.'
      """
-
+
      # ntpath seems to do the right thing for both Windows and Unix paths
      p = ntpath.dirname(path)
      basename = ntpath.basename(path)
@@ -257,27 +258,27 @@ def insert_before_extension(filename, s=None, separator='.'):
      appends [s].
 
      Examples:
-
+
      .. code-block:: none
-
+
          >>> insert_before_extension('/dir/subdir/file.ext', 'insert')
          '/dir/subdir/file.insert.ext'
          >>> insert_before_extension('/dir/subdir/file', 'insert')
          '/dir/subdir/file.insert'
          >>> insert_before_extension('/dir/subdir/file')
          '/dir/subdir/file.2020.07.20.10.54.38'
-
+
      Args:
          filename (str): filename to manipulate
          s (str, optional): string to insert before the extension in [filename], or
              None to insert a datestamp
          separator (str, optional): separator to place between the filename base
              and the inserted string
-
+
      Returns:
          str: modified string
      """
-
+
      assert len(filename) > 0
      if s is None or len(s) == 0:
          s = datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
@@ -290,9 +291,9 @@ def split_path(path):
      Splits [path] into all its constituent file/folder tokens.
 
      Examples:
-
+
      .. code-block:: none
-
+
          >>> split_path(r'c:\dir\subdir\file.txt')
          ['c:\\', 'dir', 'subdir', 'file.txt']
          >>> split_path('/dir/subdir/file.jpg')
@@ -301,13 +302,19 @@
          ['c:\\']
          >>> split_path('/')
          ['/']
-
+
      Args:
          path (str): path to split into tokens
-
+
      Returns:
          list: list of path tokens
      """
+
+     # Edge cases
+     if path == '':
+         return ''
+     if path is None:
+         return None
 
      parts = []
      while True:
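
The new edge-case handling in split_path() short-circuits degenerate inputs before the tokenizing loop. A minimal sketch of the added behavior (note that these inputs are returned unchanged, not wrapped in a list):

    from megadetector.utils.path_utils import split_path

    # Added between 5.0.27 and 5.0.29: degenerate inputs no longer reach the loop
    assert split_path('') == ''
    assert split_path(None) is None
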
@@ -325,32 +332,32 @@ def path_is_abs(p):
      """
      Determines whether [p] is an absolute path. An absolute path is defined as
      one that starts with slash, backslash, or a letter followed by a colon.
-
+
      Args:
          p (str): path to evaluate
-
+
      Returns:
          bool: True if [p] is an absolute path, else False
      """
-
+
      return (len(p) > 1) and (p[0] == '/' or p[1] == ':' or p[0] == '\\')
 
 
  def safe_create_link(link_exists,link_new):
      """
      Creates a symlink at [link_new] pointing to [link_exists].
-
+
      If [link_new] already exists, make sure it's a link (not a file),
      and if it has a different target than [link_exists], removes and re-creates
      it.
-
+
      Errors if [link_new] already exists but it's not a link.
-
+
      Args:
          link_exists (str): the source of the (possibly-new) symlink
          link_new (str): the target of the (possibly-new) symlink
      """
-
+
      if os.path.exists(link_new) or os.path.islink(link_new):
          assert os.path.islink(link_new)
          if not os.readlink(link_new) == link_exists:
@@ -358,35 +365,35 @@ def safe_create_link(link_exists,link_new):
              os.symlink(link_exists,link_new)
      else:
          os.symlink(link_exists,link_new)
-
+
 
  def remove_empty_folders(path, remove_root=False):
      """
      Recursively removes empty folders within the specified path.
-
+
      Args:
-         path (str): the folder from which we should recursively remove
+         path (str): the folder from which we should recursively remove
              empty folders.
-         remove_root (bool, optional): whether to remove the root directory if
+         remove_root (bool, optional): whether to remove the root directory if
              it's empty after removing all empty subdirectories. This will always
              be True during recursive calls.
-
+
      Returns:
          bool: True if the directory is empty after processing, False otherwise
      """
-
+
      # Verify that [path] is a directory
      if not os.path.isdir(path):
          return False
-
+
      # Track whether the current directory is empty
      is_empty = True
-
+
      # Iterate through all items in the directory
      for item in os.listdir(path):
-
+
          item_path = os.path.join(path, item)
-
+
          # If it's a directory, process it recursively
          if os.path.isdir(item_path):
              # If the subdirectory is empty after processing, it will be removed
@@ -396,99 +403,57 @@ def remove_empty_folders(path, remove_root=False):
      else:
          # If there's a file, the directory is not empty
          is_empty = False
-
+
      # If the directory is empty and we're supposed to remove it
      if is_empty and remove_root:
          try:
-             os.rmdir(path)
+             os.rmdir(path)
          except Exception as e:
              print('Error removing directory {}: {}'.format(path,str(e)))
              is_empty = False
-
+
      return is_empty
 
  # ...def remove_empty_folders(...)
 
 
- def top_level_folder(p):
+ def path_join(*paths, convert_slashes=True):
      r"""
-     Gets the top-level folder from the path *p*.
-
-     On UNIX, this is straightforward:
-
-         /blah/foo
-
-     ...returns '/blah'
-
-     On Windows, we define this as the top-level folder that isn't the drive, so:
-
-         c:\blah\foo
-
-     ...returns 'c:\blah'.
-
+     Wrapper for os.path.join that optionally converts backslashes to forward slashes.
+
      Args:
-         p (str): filename to evaluate
-
+         *paths (variable-length set of strings): Path components to be joined.
+         convert_slashes (bool, optional): whether to convert \\ to /
+
      Returns:
-         str: the top-level folder in [p], see above for details on how this is defined
+         A string with the joined path components.
      """
-
-     if p == '':
-         return ''
-
-     # Path('/blah').parts is ('/','blah')
-     parts = split_path(p)
-
-     if len(parts) == 1:
-         return parts[0]
-
-     # Handle paths like:
-     #
-     # /, \, /stuff, c:, c:\stuff
-     drive = os.path.splitdrive(p)[0]
-     if parts[0] == drive or parts[0] == drive + '/' or parts[0] == drive + '\\' or parts[0] in ['\\', '/']:
-         return os.path.join(parts[0], parts[1])
-     else:
-         return parts[0]
-
- # ...top_level_folder()
-
-
- #%% Test driver for top_level_folder
 
- if False:
+     joined_path = os.path.join(*paths)
+     if convert_slashes:
+         return joined_path.replace('\\', '/')
+     else:
+         return joined_path
 
-     #%%
 
-     p = 'blah/foo/bar'; s = top_level_folder(p); print(s); assert s == 'blah'
-     p = '/blah/foo/bar'; s = top_level_folder(p); print(s); assert s == '/blah'
-     p = 'bar'; s = top_level_folder(p); print(s); assert s == 'bar'
-     p = ''; s = top_level_folder(p); print(s); assert s == ''
-     p = 'c:\\'; s = top_level_folder(p); print(s); assert s == 'c:\\'
-     p = r'c:\blah'; s = top_level_folder(p); print(s); assert s == 'c:\\blah'
-     p = r'c:\foo'; s = top_level_folder(p); print(s); assert s == 'c:\\foo'
-     p = r'c:/foo'; s = top_level_folder(p); print(s); assert s == 'c:/foo'
-     p = r'c:\foo/bar'; s = top_level_folder(p); print(s); assert s == 'c:\\foo'
-
-
  #%% Image-related path functions
 
  def is_image_file(s, img_extensions=IMG_EXTENSIONS):
      """
      Checks a file's extension against a hard-coded set of image file
      extensions. Uses case-insensitive comparison.
-
+
      Does not check whether the file exists, only determines whether the filename
      implies it's an image file.
-
+
      Args:
          s (str): filename to evaluate for image-ness
          img_extensions (list, optional): list of known image file extensions
-
+
      Returns:
          bool: True if [s] appears to be an image file, else False
      """
-
+
      ext = os.path.splitext(s)[1]
      return ext.lower() in img_extensions
 
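The hunk above removes top_level_folder() (and its inline test driver) and adds path_join(), a thin wrapper around os.path.join that converts backslashes to forward slashes by default. A minimal usage sketch; the path components are hypothetical:

    from megadetector.utils.path_utils import path_join

    # Forward slashes regardless of platform
    assert path_join('a', 'b', 'c') == 'a/b/c'

    # convert_slashes=False preserves os.path.join's native separator
    native = path_join('a', 'b', convert_slashes=False)
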
@@ -497,27 +462,27 @@ def find_image_strings(strings):
      """
      Given a list of strings that are potentially image file names, looks for
      strings that actually look like image file names (based on extension).
-
+
      Args:
          strings (list): list of filenames to check for image-ness
-
+
      Returns:
          list: the subset of [strings] that appear to be image filenames
      """
-
+
      return [s for s in strings if is_image_file(s)]
 
 
- def find_images(dirname,
-                 recursive=False,
-                 return_relative_paths=False,
+ def find_images(dirname,
+                 recursive=False,
+                 return_relative_paths=False,
                  convert_slashes=True):
      """
      Finds all files in a directory that look like image file names. Returns
      absolute paths unless return_relative_paths is set. Uses the OS-native
      path separator unless convert_slashes is set, in which case will always
      use '/'.
-
+
      Args:
          dirname (str): the folder to search for images
          recursive (bool, optional): whether to search recursively
@@ -528,30 +493,30 @@ def find_images(dirname,
      Returns:
          list: list of image filenames found in [dirname]
      """
-
+
      assert os.path.isdir(dirname), '{} is not a folder'.format(dirname)
-
+
      if recursive:
          strings = glob.glob(os.path.join(dirname, '**', '*.*'), recursive=True)
      else:
          strings = glob.glob(os.path.join(dirname, '*.*'))
-
+
      image_files = find_image_strings(strings)
-
+
      if return_relative_paths:
          image_files = [os.path.relpath(fn,dirname) for fn in image_files]
-
+
      image_files = sorted(image_files)
-
+
      if convert_slashes:
          image_files = [fn.replace('\\', '/') for fn in image_files]
-
+
      return image_files
 
 
  #%% Filename cleaning functions
 
- def clean_filename(filename,
+ def clean_filename(filename,
                     allow_list=VALID_FILENAME_CHARS,
                     char_limit=CHAR_LIMIT,
                     force_lower= False):
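
For context, find_images() combines the glob enumeration and extension filtering shown above. A minimal usage sketch; the folder name is hypothetical:

    from megadetector.utils.path_utils import find_images

    # Sorted, slash-normalized image paths, relative to the input folder
    images = find_images('/data/camera-trap-images',
                         recursive=True,
                         return_relative_paths=True)
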
@@ -563,18 +528,18 @@ def clean_filename(filename,
 
      Adapted from
      https://gist.github.com/wassname/1393c4a57cfcbf03641dbc31886123b8
-
+
      Args:
          filename (str): filename to clean
          allow_list (str, optional): string containing all allowable filename characters
          char_limit (int, optional): maximum allowable filename length, if None will skip this
              step
          force_lower (bool, optional): convert the resulting filename to lowercase
-
-     returns:
-         str: cleaned version of [filename]
+
+     Returns:
+         str: cleaned version of [filename]
      """
-
+
      # keep only valid ascii chars
      cleaned_filename = (unicodedata.normalize('NFKD', filename)
                          .encode('ASCII', 'ignore').decode())
@@ -588,26 +553,26 @@
      return cleaned_filename
 
 
- def clean_path(pathname,
+ def clean_path(pathname,
                 allow_list=VALID_PATH_CHARS,
                 char_limit=CHAR_LIMIT,
                 force_lower=False):
      """
      Removes non-ASCII and other invalid path characters (on any reasonable
      OS) from a path, then optionally trims to a maximum length.
-
+
      Args:
          pathname (str): path name to clean
          allow_list (str, optional): string containing all allowable filename characters
          char_limit (int, optional): maximum allowable filename length, if None will skip this
              step
          force_lower (bool, optional): convert the resulting filename to lowercase
-
-     returns:
-         str: cleaned version of [filename]
+
+     Returns:
+         str: cleaned version of [filename]
      """
-
-     return clean_filename(pathname, allow_list=allow_list,
+
+     return clean_filename(pathname, allow_list=allow_list,
                            char_limit=char_limit, force_lower=force_lower)
 
 
@@ -616,34 +581,34 @@ def flatten_path(pathname,separator_chars=SEPARATOR_CHARS,separator_char_replace
      Removes non-ASCII and other invalid path characters (on any reasonable
      OS) from a path, then trims to a maximum length. Replaces all valid
      separators with [separator_char_replacement.]
-
+
      Args:
          pathname (str): path name to flatten
          separator_chars (str, optional): string containing all known path separators
-         separator_char_replacement (str, optional): string to insert in place of
+         separator_char_replacement (str, optional): string to insert in place of
              path separators.
-
+
      Returns:
          str: flattened version of [pathname]
      """
-
+
      s = clean_path(pathname)
      for c in separator_chars:
          s = s.replace(c, separator_char_replacement)
      return s
 
 
- def is_executable(filename):
+ def is_executable(filename):
      """
      Checks whether [filename] is on the system path and marked as executable.
-
+
      Args:
          filename (str): filename to check for executable status
-
+
      Returns:
          bool: True if [filename] is on the system path and marked as executable, otherwise False
      """
-
+
      # https://stackoverflow.com/questions/11210104/check-if-a-program-exists-from-a-python-script
 
      return which(filename) is not None
@@ -654,220 +619,247 @@ def is_executable(filename):
  def environment_is_wsl():
      """
      Determines whether we're running in WSL.
-
+
      Returns:
-         True if we're running in WSL.
+         True if we're running in WSL.
      """
-
+
      if sys.platform not in ('linux','posix'):
          return False
      platform_string = ' '.join(platform.uname()).lower()
      return 'microsoft' in platform_string and 'wsl' in platform_string
-
 
- def wsl_path_to_windows_path(filename):
+
+ def wsl_path_to_windows_path(filename, failure_behavior='none'):
      r"""
-     Converts a WSL path to a Windows path, or returns None if that's not possible. E.g.
-     converts:
-
+     Converts a WSL path to a Windows path. For example, converts:
+
          /mnt/e/a/b/c
-
+
      ...to:
-
+
          e:\a\b\c
-
+
      Args:
          filename (str): filename to convert
-
+         failure_behavior (str): what to do if the path can't be processed as a WSL path.
+             'none' to return None in this case, 'original' to return the original path.
+
      Returns:
-         str: Windows equivalent to the WSL path [filename], or [filename] if the current
-         environment is neither Windows nor WSL.
+         str: Windows equivalent to the WSL path [filename]
      """
-
-     if (not environment_is_wsl()) and (os.name != 'nt'):
-         return filename
-
-     if environment_is_wsl():
-         result = subprocess.run(['wslpath', '-w', filename], text=True, capture_output=True)
-     else:
-         result = subprocess.run(['wsl', 'wslpath', '-w', filename], text=True, capture_output=True)
-     if result.returncode != 0:
-         print('Could not convert path {} from WSL to Windows'.format(filename))
+
+     assert failure_behavior in ('none','original'), \
+         'Unrecognized failure_behavior value {}'.format(failure_behavior)
+
+     # Check whether the path follows the standard WSL mount pattern
+     wsl_path_pattern = r'^/mnt/([a-zA-Z])(/.*)?$'
+     match = re.match(wsl_path_pattern, filename)
+
+     if match:
+
+         # Extract the drive letter and the rest of the path
+         drive_letter = match.group(1)
+         path_remainder = match.group(2) if match.group(2) else ''
+
+         # Convert forward slashes to backslashes for Windows
+         path_remainder = path_remainder.replace('/', '\\')
+
+         # Format the Windows path
+         windows_path = f"{drive_letter}:{path_remainder}"
+         return windows_path
+
+     if failure_behavior == 'none':
          return None
-
-     return result.stdout.strip()
-
+     else:
+         return filename
+
+ # ...def wsl_path_to_windows_path(...)
 
- def windows_path_to_wsl_path(filename):
+
+ def windows_path_to_wsl_path(filename, failure_behavior='none'):
      r"""
      Converts a Windows path to a WSL path, or returns None if that's not possible. E.g.
      converts:
-
+
          e:\a\b\c
-
+
      ...to:
-
+
          /mnt/e/a/b/c
-
+
      Args:
          filename (str): filename to convert
-
+         failure_behavior (str): what to do if the path can't be processed as a Windows path.
+             'none' to return None in this case, 'original' to return the original path.
+
      Returns:
-         str: WSL equivalent to the Windows path [filename], or [filename] if the current
-         environment is neither Windows nor WSL.
+         str: WSL equivalent to the Windows path [filename]
      """
-
-     if (not environment_is_wsl()) and (os.name != 'nt'):
-         return filename
-
-     if environment_is_wsl():
-         result = subprocess.run(['wslpath', '-u', filename], text=True, capture_output=True)
-     else:
-         result = subprocess.run(['wsl', 'wslpath', '-u', filename], text=True, capture_output=True)
-     if result.returncode != 0:
-         print('Could not convert path {} from Windows to WSL'.format(filename))
+
+     assert failure_behavior in ('none','original'), \
+         'Unrecognized failure_behavior value {}'.format(failure_behavior)
+
+     filename = filename.replace('\\', '/')
+
+     # Check whether the path follows a Windows drive letter pattern
+     windows_path_pattern = r'^([a-zA-Z]):(/.*)?$'
+     match = re.match(windows_path_pattern, filename)
+
+     if match:
+         # Extract the drive letter and the rest of the path
+         drive_letter = match.group(1).lower() # Convert to lowercase for WSL
+         path_remainder = match.group(2) if match.group(2) else ''
+
+         # Format the WSL path
+         wsl_path = f"/mnt/{drive_letter}{path_remainder}"
+         return wsl_path
+
+     if failure_behavior == 'none':
          return None
-
-     return result.stdout.strip()
-
+     else:
+         return filename
+
+ # ...def window_path_to_wsl_path(...)
+
 
  def open_file_in_chrome(filename):
      """
-     Open a file in chrome, regardless of file type. I typically use this to open
+     Open a file in chrome, regardless of file type. I typically use this to open
      .md files in Chrome.
-
+
      Args:
          filename (str): file to open
-
+
      Return:
          bool: whether the operation was successful
      """
-
+
      # Create URL
      abs_path = os.path.abspath(filename)
-
+
      system = platform.system()
      if system == 'Windows':
          url = f'file:///{abs_path.replace(os.sep, "/")}'
      else: # macOS and Linux
          url = f'file://{abs_path}'
-
+
      # Determine the Chrome path
      if system == 'Windows':
-
+
          # This is a native Python module, but it only exists on Windows
          import winreg
-
+
          chrome_paths = [
              os.path.expanduser("~") + r"\AppData\Local\Google\Chrome\Application\chrome.exe",
              r"C:\Program Files\Google\Chrome\Application\chrome.exe",
              r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe"
          ]
-
+
          # Default approach: run from a typical chrome location
          for path in chrome_paths:
              if os.path.exists(path):
                  subprocess.run([path, url])
                  return True
-
+
          # Method 2: Check registry for Chrome path
          try:
-             with winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE,
+             with winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE,
                  r"SOFTWARE\Microsoft\Windows\CurrentVersion\App Paths\chrome.exe") as key:
                  chrome_path = winreg.QueryValue(key, None)
                  if chrome_path and os.path.exists(chrome_path):
                      subprocess.run([chrome_path, url])
                      return True
-         except:
+         except Exception:
              pass
-
+
          # Method 3: Try alternate registry location
          try:
-             with winreg.OpenKey(winreg.HKEY_CURRENT_USER,
+             with winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                  r"Software\Google\Chrome\BLBeacon") as key:
                  chrome_path = os.path.join(os.path.dirname(winreg.QueryValueEx(key, "version")[0]), "chrome.exe")
                  if os.path.exists(chrome_path):
                      subprocess.run([chrome_path, url])
                      return True
-         except:
+         except Exception:
              pass
-
+
          # Method 4: Try system path or command
          for chrome_cmd in ["chrome", "chrome.exe", "googlechrome", "google-chrome"]:
              try:
                  subprocess.run([chrome_cmd, url], shell=True)
                  return True
-             except:
+             except Exception:
                  continue
-
+
          # Method 5: Use Windows URL protocol handler
          try:
              os.startfile(url)
              return True
-         except:
+         except Exception:
              pass
-
-         # Method 6: Use rundll32
+
+         # Method 6: Use rundll32
          try:
              cmd = f'rundll32 url.dll,FileProtocolHandler {url}'
              subprocess.run(cmd, shell=True)
              return True
-         except:
+         except Exception:
              pass
-
+
      elif system == 'Darwin':
-
+
          chrome_paths = [
              '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
              os.path.expanduser('~/Applications/Google Chrome.app/Contents/MacOS/Google Chrome')
          ]
-
+
          for path in chrome_paths:
              if os.path.exists(path):
                  subprocess.run([path, url])
                  return True
-
+
          # Fallback to 'open' command with Chrome as the app
          try:
              subprocess.run(['open', '-a', 'Google Chrome', url])
              return True
-         except:
+         except Exception:
              pass
-
+
      elif system == 'Linux':
-
+
          chrome_commands = ['google-chrome', 'chrome', 'chromium', 'chromium-browser']
-
+
          for cmd in chrome_commands:
              try:
                  subprocess.run([cmd, url], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
                  return True
-             except:
+             except Exception:
                  continue
-
+
      print(f"Could not open {filename} in Chrome on {system}.")
      return False
 
-
+
  def open_file(filename, attempt_to_open_in_wsl_host=False, browser_name=None):
      """
      Opens [filename] in the default OS file handler for this file type.
-
+
      If browser_name is not None, uses the webbrowser module to open the filename
      in the specified browser; see https://docs.python.org/3/library/webbrowser.html
      for supported browsers. Falls back to the default file handler if webbrowser.open()
      fails. In this case, attempt_to_open_in_wsl_host is ignored unless webbrowser.open() fails.
-
-     If browser_name is 'default', uses the system default. This is different from the
+
+     If browser_name is 'default', uses the system default. This is different from the
      parameter to webbrowser.get(), where None implies the system default.
-
+
      Args:
          filename (str): file to open
          attempt_to_open_in_wsl_host: if this is True, and we're in WSL, attempts to open
              [filename] in the Windows host environment
          browser_name: see above
      """
-
+
      if browser_name is not None:
          if browser_name == 'chrome':
              browser_name = 'google-chrome'
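
The WSL helpers above previously shelled out to wslpath; between 5.0.27 and 5.0.29 they were rewritten as pure string parsing against the /mnt/<drive> mount pattern, with the new failure_behavior parameter controlling what happens when a path doesn't match. A minimal sketch of the behavior implemented in the hunk above:

    from megadetector.utils.path_utils import (
        wsl_path_to_windows_path, windows_path_to_wsl_path)

    assert wsl_path_to_windows_path('/mnt/e/a/b/c') == 'e:\\a\\b\\c'
    assert windows_path_to_wsl_path(r'e:\a\b\c') == '/mnt/e/a/b/c'

    # Paths that don't match return None by default...
    assert wsl_path_to_windows_path('/home/user') is None
    # ...or the original string with failure_behavior='original'
    assert wsl_path_to_windows_path('/home/user', failure_behavior='original') == '/home/user'
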
@@ -879,32 +871,32 @@ def open_file(filename, attempt_to_open_in_wsl_host=False, browser_name=None):
              result = False
          if result:
              return
-
+
      if sys.platform == 'win32':
-
+
          os.startfile(filename)
 
      elif sys.platform == 'darwin':
-
+
          opener = 'open'
          subprocess.call([opener, filename])
-
+
      elif attempt_to_open_in_wsl_host and environment_is_wsl():
-
+
          windows_path = wsl_path_to_windows_path(filename)
-
+
          # Fall back to xdg-open
          if windows_path is None:
              subprocess.call(['xdg-open', filename])
-
-         if os.path.isdir(filename):
+
+         if os.path.isdir(filename):
              subprocess.run(["explorer.exe", windows_path])
          else:
-             os.system("cmd.exe /C start %s" % (re.escape(windows_path)))
-
+             os.system("cmd.exe /C start {}".format(re.escape(windows_path)))
+
      else:
-
-         opener = 'xdg-open'
+
+         opener = 'xdg-open'
          subprocess.call([opener, filename])
 
  # ...def open_file(...)
@@ -916,12 +908,12 @@ def write_list_to_file(output_file,strings):
      """
      Writes a list of strings to either a JSON file or text file,
      depending on extension of the given file name.
-
+
      Args:
          output_file (str): file to write
          strings (list): list of strings to write to [output_file]
      """
-
+
      with open(output_file, 'w') as f:
          if output_file.endswith('.json'):
              json.dump(strings, f, indent=1)
@@ -932,14 +924,14 @@ def write_list_to_file(output_file,strings):
  def read_list_from_file(filename):
      """
      Reads a json-formatted list of strings from a file.
-
+
      Args:
          filename (str): .json filename to read
-
+
      Returns:
          list: list of strings read from [filename]
      """
-
+
      assert filename.endswith('.json')
      with open(filename, 'r') as f:
          file_list = json.load(f)
@@ -955,39 +947,39 @@ def _copy_file(input_output_tuple,overwrite=True,verbose=False,move=False):
      """
      Internal function for copying files from within parallel_copy_files.
      """
-
+
      assert len(input_output_tuple) == 2
      source_fn = input_output_tuple[0]
      target_fn = input_output_tuple[1]
      if (not overwrite) and (os.path.isfile(target_fn)):
          if verbose:
              print('Skipping existing target file {}'.format(target_fn))
-         return
-
+         return
+
      if move:
          action_string = 'Moving'
      else:
          action_string = 'Copying'
-
+
      if verbose:
          print('{} to {}'.format(action_string,target_fn))
-
+
      os.makedirs(os.path.dirname(target_fn),exist_ok=True)
      if move:
          shutil.move(source_fn, target_fn)
      else:
          shutil.copyfile(source_fn,target_fn)
-
 
- def parallel_copy_files(input_file_to_output_file,
-                         max_workers=16,
-                         use_threads=True,
-                         overwrite=False,
+
+ def parallel_copy_files(input_file_to_output_file,
+                         max_workers=16,
+                         use_threads=True,
+                         overwrite=False,
                          verbose=False,
                          move=False):
      """
      Copy (or move) files from source to target according to the dict input_file_to_output_file.
-
+
      Args:
          input_file_to_output_file (dict): dictionary mapping source files to the target files
              to which they should be copied
@@ -1000,24 +992,32 @@ def parallel_copy_files(input_file_to_output_file,
      """
 
      n_workers = min(max_workers,len(input_file_to_output_file))
-
+
      # Package the dictionary as a set of 2-tuples
      input_output_tuples = []
      for input_fn in input_file_to_output_file:
          input_output_tuples.append((input_fn,input_file_to_output_file[input_fn]))
 
-     if use_threads:
-         pool = ThreadPool(n_workers)
-     else:
-         pool = Pool(n_workers)
+     pool = None
 
-     with tqdm(total=len(input_output_tuples)) as pbar:
-         for i,_ in enumerate(pool.imap_unordered(partial(_copy_file,
-                                                          overwrite=overwrite,
-                                                          verbose=verbose,
-                                                          move=move),
-                                                  input_output_tuples)):
-             pbar.update()
+     try:
+         if use_threads:
+             pool = ThreadPool(n_workers)
+         else:
+             pool = Pool(n_workers)
+
+         with tqdm(total=len(input_output_tuples)) as pbar:
+             for i,_ in enumerate(pool.imap_unordered(partial(_copy_file,
+                                                              overwrite=overwrite,
+                                                              verbose=verbose,
+                                                              move=move),
+                                                      input_output_tuples)):
+                 pbar.update()
+     finally:
+         pool.close()
+         pool.join()
+         if verbose:
+             print("Pool closed and joined parallel file copying")
 
  # ...def parallel_copy_files(...)
 
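The parallel_copy_files() change above wraps the worker pool in try/finally so the pool is always closed and joined. Call sites are unchanged; a minimal usage sketch with hypothetical filenames:

    from megadetector.utils.path_utils import parallel_copy_files

    # Maps source files to destinations; target folders are created as needed
    mapping = {'/data/in/a.jpg': '/data/out/a.jpg',
               '/data/in/b.jpg': '/data/out/b.jpg'}
    parallel_copy_files(mapping, max_workers=8, use_threads=True)
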
@@ -1028,36 +1028,36 @@ def get_file_sizes(base_dir, convert_slashes=True):
      """
      Gets sizes recursively for all files in base_dir, returning a dict mapping
      relative filenames to size.
-
+
      TODO: merge the functionality here with parallel_get_file_sizes, which uses slightly
      different semantics.
-
+
      Args:
          base_dir (str): folder within which we want all file sizes
          convert_slashes (bool, optional): force forward slashes in return strings,
              otherwise uses the native path separator
-
+
      Returns:
          dict: dictionary mapping filenames to file sizes in bytes
      """
-
-     relative_filenames = recursive_file_list(base_dir, convert_slashes=convert_slashes,
+
+     relative_filenames = recursive_file_list(base_dir, convert_slashes=convert_slashes,
                                               return_relative_paths=True)
-
+
      fn_to_size = {}
      for fn_relative in tqdm(relative_filenames):
          fn_abs = os.path.join(base_dir,fn_relative)
          fn_to_size[fn_relative] = os.path.getsize(fn_abs)
-
+
      return fn_to_size
-
+
 
  def _get_file_size(filename,verbose=False):
      """
      Internal function for safely getting the size of a file. Returns a (filename,size)
      tuple, where size is None if there is an error.
      """
-
+
      try:
          size = os.path.getsize(filename)
      except Exception as e:
@@ -1066,18 +1066,18 @@ def _get_file_size(filename,verbose=False):
          size = None
      return (filename,size)
 
-
- def parallel_get_file_sizes(filenames,
-                             max_workers=16,
-                             use_threads=True,
+
+ def parallel_get_file_sizes(filenames,
+                             max_workers=16,
+                             use_threads=True,
                              verbose=False,
-                             recursive=True,
+                             recursive=True,
                              convert_slashes=True,
                              return_relative_paths=False):
      """
      Returns a dictionary mapping every file in [filenames] to the corresponding file size,
      or None for errors. If [filenames] is a folder, will enumerate the folder (optionally recursively).
-
+
      Args:
          filenames (list or str): list of filenames for which we should read sizes, or a folder
              within which we should read all file sizes recursively
@@ -1089,33 +1089,33 @@ def parallel_get_file_sizes(filenames,
          convert_slashes (bool, optional): convert backslashes to forward slashes
          return_relative_paths (bool, optional): return relative paths; only relevant if [filenames]
              is a folder.
-
+
      Returns:
          dict: dictionary mapping filenames to file sizes in bytes
      """
 
      n_workers = min(max_workers,len(filenames))
-
+
      folder_name = None
-
+
      if isinstance(filenames,str):
-
+
          folder_name = filenames
-         assert os.path.isdir(filenames), 'Could not find folder {}'.format(folder_name)
-
+         assert os.path.isdir(filenames), 'Could not find folder {}'.format(folder_name)
+
          if verbose:
              print('Enumerating files in {}'.format(folder_name))
-
+
          # Enumerate absolute paths here, we'll convert to relative later if requested
          filenames = recursive_file_list(folder_name,recursive=recursive,return_relative_paths=False)
 
      else:
-
+
          assert is_iterable(filenames), '[filenames] argument is neither a folder nor an iterable'
-
+
      if verbose:
          print('Creating worker pool')
-
+
      if use_threads:
          pool_string = 'thread'
          pool = ThreadPool(n_workers)
@@ -1126,11 +1126,11 @@ def parallel_get_file_sizes(filenames,
      if verbose:
          print('Created a {} pool of {} workers'.format(
              pool_string,n_workers))
-
+
      # This returns (filename,size) tuples
      get_size_results = list(tqdm(pool.imap(
          partial(_get_file_size,verbose=verbose),filenames), total=len(filenames)))
-
+
      to_return = {}
      for r in get_size_results:
          fn = r[0]
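
parallel_get_file_sizes() accepts either a list of filenames or a single folder to enumerate. A minimal usage sketch; the folder name is hypothetical:

    from megadetector.utils.path_utils import parallel_get_file_sizes

    # Returns {filename: size in bytes}, with None for unreadable files
    sizes = parallel_get_file_sizes('/data/images',
                                    recursive=True,
                                    return_relative_paths=True)
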
@@ -1151,7 +1151,7 @@ def zip_file(input_fn, output_fn=None, overwrite=False, verbose=False, compresslevel=9):
  def zip_file(input_fn, output_fn=None, overwrite=False, verbose=False, compresslevel=9):
      """
      Zips a single file.
-
+
      Args:
          input_fn (str): file to zip
          output_fn (str, optional): target zipfile; if this is None, we'll use
@@ -1159,23 +1159,23 @@ def zip_file(input_fn, output_fn=None, overwrite=False, verbose=False, compressl
          overwrite (bool, optional): whether to overwrite an existing target file
          verbose (bool, optional): enable existing debug console output
          compresslevel (int, optional): compression level to use, between 0 and 9
-
+
      Returns:
          str: the output zipfile, whether we created it or determined that it already exists
      """
-
+
      basename = os.path.basename(input_fn)
-
+
      if output_fn is None:
          output_fn = input_fn + '.zip'
-
+
      if (not overwrite) and (os.path.isfile(output_fn)):
          print('Skipping existing file {}'.format(output_fn))
          return output_fn
-
+
      if verbose:
          print('Zipping {} to {} with level {}'.format(input_fn,output_fn,compresslevel))
-
+
      with ZipFile(output_fn,'w',zipfile.ZIP_DEFLATED) as zipf:
          zipf.write(input_fn,arcname=basename,compresslevel=compresslevel,
                     compress_type=zipfile.ZIP_DEFLATED)
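
For reference, zip_file() zips a single file and returns the output path, skipping an existing target unless overwrite=True. A minimal usage sketch; the input filename is hypothetical:

    from megadetector.utils.path_utils import zip_file

    # Writes /data/results.json.zip next to the input, at maximum compression
    output_fn = zip_file('/data/results.json', compresslevel=9)
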
@@ -1186,9 +1186,9 @@ def add_files_to_single_tar_file(input_files, output_fn, arc_name_base,
  def add_files_to_single_tar_file(input_files, output_fn, arc_name_base,
                                   overwrite=False, verbose=False, mode='x'):
      """
-     Adds all the files in [input_files] to the tar file [output_fn].
+     Adds all the files in [input_files] to the tar file [output_fn].
      Archive names are relative to arc_name_base.
-
+
      Args:
          input_files (list): list of absolute filenames to include in the .tar file
          output_fn (str): .tar file to create
@@ -1198,11 +1198,11 @@ def add_files_to_single_tar_file(input_files, output_fn, arc_name_base,
1198
1198
  overwrite (bool, optional): whether to overwrite an existing .tar file
1199
1199
  verbose (bool, optional): enable additional debug console output
1200
1200
  mode (str, optional): compression type, can be 'x' (no compression), 'x:gz', or 'x:bz2'.
1201
-
1201
+
1202
1202
  Returns:
1203
1203
  str: the output tar file, whether we created it or determined that it already exists
1204
1204
  """
1205
-
1205
+
1206
1206
  if os.path.isfile(output_fn):
1207
1207
  if not overwrite:
1208
1208
  print('Tar file {} exists, skipping'.format(output_fn))
@@ -1210,11 +1210,11 @@ def add_files_to_single_tar_file(input_files, output_fn, arc_name_base,
1210
1210
  else:
1211
1211
  print('Tar file {} exists, deleting and re-creating'.format(output_fn))
1212
1212
  os.remove(output_fn)
1213
-
1213
+
1214
1214
  if verbose:
1215
1215
  print('Adding {} files to {} (mode {})'.format(
1216
1216
  len(input_files),output_fn,mode))
1217
-
1217
+
1218
1218
  with tarfile.open(output_fn,mode) as tarf:
1219
1219
  for input_fn_abs in tqdm(input_files,disable=(not verbose)):
1220
1220
  input_fn_relative = os.path.relpath(input_fn_abs,arc_name_base)
@@ -1226,9 +1226,9 @@ def add_files_to_single_tar_file(input_files, output_fn, arc_name_base,
1226
1226
  def zip_files_into_single_zipfile(input_files, output_fn, arc_name_base,
1227
1227
  overwrite=False, verbose=False, compresslevel=9):
1228
1228
  """
1229
- Zip all the files in [input_files] into [output_fn]. Archive names are relative to
1229
+ Zip all the files in [input_files] into [output_fn]. Archive names are relative to
1230
1230
  arc_name_base.
1231
-
1231
+
1232
1232
  Args:
1233
1233
  input_files (list): list of absolute filenames to include in the .zip file
1234
1234
  output_fn (str): .zip file to create
@@ -1238,20 +1238,20 @@ def zip_files_into_single_zipfile(input_files, output_fn, arc_name_base,
1238
1238
  overwrite (bool, optional): whether to overwrite an existing .zip file
1239
1239
  verbose (bool, optional): enable additional debug console output
1240
1240
  compresslevel (int, optional): compression level to use, between 0 and 9
1241
-
1241
+
1242
1242
  Returns:
1243
1243
  str: the output zipfile, whether we created it or determined that it already exists
1244
1244
  """
1245
-
1245
+
1246
1246
  if not overwrite:
1247
1247
  if os.path.isfile(output_fn):
1248
1248
  print('Zip file {} exists, skipping'.format(output_fn))
1249
1249
  return output_fn
1250
-
1250
+
1251
1251
  if verbose:
1252
1252
  print('Zipping {} files to {} (compression level {})'.format(
1253
1253
  len(input_files),output_fn,compresslevel))
1254
-
1254
+
1255
1255
  with ZipFile(output_fn,'w',zipfile.ZIP_DEFLATED) as zipf:
1256
1256
  for input_fn_abs in tqdm(input_files,disable=(not verbose)):
1257
1257
  input_fn_relative = os.path.relpath(input_fn_abs,arc_name_base)
@@ -1261,41 +1261,41 @@ def zip_files_into_single_zipfile(input_files, output_fn, arc_name_base,
1261
1261
  compress_type=zipfile.ZIP_DEFLATED)
1262
1262
 
1263
1263
  return output_fn
1264
-
1265
-
1264
+
1265
+
1266
1266
  def zip_folder(input_folder, output_fn=None, overwrite=False, verbose=False, compresslevel=9):
1267
1267
  """
1268
- Recursively zip everything in [input_folder] into a single zipfile, storing files as paths
1268
+ Recursively zip everything in [input_folder] into a single zipfile, storing files as paths
1269
1269
  relative to [input_folder].
1270
-
1271
- Args:
1270
+
1271
+ Args:
1272
1272
  input_folder (str): folder to zip
1273
1273
  output_fn (str, optional): output filename; if this is None, we'll write to [input_folder].zip
1274
1274
  overwrite (bool, optional): whether to overwrite an existing .zip file
1275
1275
  verbose (bool, optional): enable additional debug console output
1276
- compresslevel (int, optional): compression level to use, between 0 and 9
1277
-
1276
+ compresslevel (int, optional): compression level to use, between 0 and 9
1277
+
1278
1278
  Returns:
1279
- str: the output zipfile, whether we created it or determined that it already exists
1279
+ str: the output zipfile, whether we created it or determined that it already exists
1280
1280
  """
1281
-
1281
+
1282
1282
  if output_fn is None:
1283
1283
  output_fn = input_folder + '.zip'
1284
-
1284
+
1285
1285
  if not overwrite:
1286
1286
  if os.path.isfile(output_fn):
1287
1287
  print('Zip file {} exists, skipping'.format(output_fn))
1288
- return
1289
-
1288
+ return output_fn
1289
+
1290
1290
  if verbose:
1291
1291
  print('Zipping {} to {} (compression level {})'.format(
1292
1292
  input_folder,output_fn,compresslevel))
1293
-
1293
+
1294
1294
  relative_filenames = recursive_file_list(input_folder,return_relative_paths=True)
1295
-
1295
+
1296
1296
  with ZipFile(output_fn,'w',zipfile.ZIP_DEFLATED) as zipf:
1297
1297
  for input_fn_relative in tqdm(relative_filenames,disable=(not verbose)):
1298
- input_fn_abs = os.path.join(input_folder,input_fn_relative)
1298
+ input_fn_abs = os.path.join(input_folder,input_fn_relative)
1299
1299
  zipf.write(input_fn_abs,
1300
1300
  arcname=input_fn_relative,
1301
1301
  compresslevel=compresslevel,
@@ -1303,17 +1303,17 @@ def zip_folder(input_folder, output_fn=None, overwrite=False, verbose=False, com
1303
1303
 
1304
1304
  return output_fn
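A minimal sketch of zip_folder in use, under the same module-path assumption as the zip_file sketch above, with hypothetical paths.

# Minimal sketch: archive a whole folder into a single .zip whose entries
# are stored relative to the folder root (paths are hypothetical)
from megadetector.utils.path_utils import zip_folder

zip_fn = zip_folder('/tmp/session_01', overwrite=True)
# zip_fn == '/tmp/session_01.zip'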
1305
1305
 
1306
-
1307
- def parallel_zip_files(input_files,
1308
- max_workers=16,
1309
- use_threads=True,
1310
- compresslevel=9,
1311
- overwrite=False,
1306
+
1307
+ def parallel_zip_files(input_files,
1308
+ max_workers=16,
1309
+ use_threads=True,
1310
+ compresslevel=9,
1311
+ overwrite=False,
1312
1312
  verbose=False):
1313
1313
  """
1314
- Zips one or more files to separate output files in parallel, leaving the
1314
+ Zips one or more files to separate output files in parallel, leaving the
1315
1315
  original files in place. Each file is zipped to [filename].zip.
1316
-
1316
+
1317
1317
  Args:
1318
1318
  input_files (list): list of files to zip
1319
1319
  max_workers (int, optional): number of concurrent workers, set to <= 1 to disable parallelism
@@ -1341,9 +1341,9 @@ def parallel_zip_files(input_files,
1341
1341
  def parallel_zip_folders(input_folders, max_workers=16, use_threads=True,
1342
1342
  compresslevel=9, overwrite=False, verbose=False):
1343
1343
  """
1344
- Zips one or more folders to separate output files in parallel, leaving the
1344
+ Zips one or more folders to separate output files in parallel, leaving the
1345
1345
  original folders in place. Each folder is zipped to [folder_name].zip.
1346
-
1346
+
1347
1347
  Args:
1348
1348
  input_folders (list): list of folders to zip
1349
1349
  max_workers (int, optional): number of concurrent workers, set to <= 1 to disable parallelism
@@ -1360,7 +1360,7 @@ def parallel_zip_folders(input_folders, max_workers=16, use_threads=True,
1360
1360
  pool = ThreadPool(n_workers)
1361
1361
  else:
1362
1362
  pool = Pool(n_workers)
1363
-
1363
+
1364
1364
  with tqdm(total=len(input_folders)) as pbar:
1365
1365
  for i,_ in enumerate(pool.imap_unordered(
1366
1366
  partial(zip_folder,overwrite=overwrite,
@@ -1373,9 +1373,9 @@ def zip_each_file_in_folder(folder_name,recursive=False,max_workers=16,use_threa
1373
1373
  compresslevel=9,overwrite=False,required_token=None,verbose=False,
1374
1374
  exclude_zip=True):
1375
1375
  """
1376
- Zips each file in [folder_name] to its own zipfile (filename.zip), optionally recursing. To
1376
+ Zips each file in [folder_name] to its own zipfile (filename.zip), optionally recursing. To
1377
1377
  zip a whole folder into a single zipfile, use zip_folder().
1378
-
1378
+
1379
1379
  Args:
1380
1380
  folder_name (str): the folder within which we should zip files
1381
1381
  recursive (bool, optional): whether to recurse within [folder_name]
@@ -1386,19 +1386,19 @@ def zip_each_file_in_folder(folder_name,recursive=False,max_workers=16,use_threa
1386
1386
  overwrite (bool, optional): whether to overwrite existing .zip files
1387
1387
  required_token (str, optional): only zip files whose names contain this string
1388
1388
  verbose (bool, optional): enable additional debug console output
1389
- exclude_zip (bool, optional): skip files ending in .zip
1389
+ exclude_zip (bool, optional): skip files ending in .zip
1390
1390
  """
1391
-
1391
+
1392
1392
  assert os.path.isdir(folder_name), '{} is not a folder'.format(folder_name)
1393
-
1393
+
1394
1394
  input_files = recursive_file_list(folder_name,recursive=recursive,return_relative_paths=False)
1395
-
1395
+
1396
1396
  if required_token is not None:
1397
1397
  input_files = [fn for fn in input_files if required_token in fn]
1398
-
1398
+
1399
1399
  if exclude_zip:
1400
1400
  input_files = [fn for fn in input_files if (not fn.endswith('.zip'))]
1401
-
1401
+
1402
1402
  parallel_zip_files(input_files=input_files,max_workers=max_workers,
1403
1403
  use_threads=use_threads,compresslevel=compresslevel,
1404
1404
  overwrite=overwrite,verbose=verbose)
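For contrast with zip_folder above, a sketch of zip_each_file_in_folder, which produces one .zip per file rather than one archive for the whole folder (module path assumed, paths hypothetical).

# Minimal sketch: zip each .json file under /tmp/results to its own
# [filename].zip, skipping files that are already .zip archives
from megadetector.utils.path_utils import zip_each_file_in_folder

zip_each_file_in_folder('/tmp/results',
                        recursive=True,
                        required_token='.json',
                        max_workers=8,
                        use_threads=True)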
@@ -1408,16 +1408,16 @@ def unzip_file(input_file, output_folder=None):
1408
1408
  """
1409
1409
  Unzips a zipfile to the specified output folder, defaulting to the same location as
1410
1410
  the input file.
1411
-
1411
+
1412
1412
  Args:
1413
1413
  input_file (str): zipfile to unzip
1414
1414
  output_folder (str, optional): folder to which we should unzip [input_file], defaults
1415
1415
  to unzipping to the folder where [input_file] lives
1416
1416
  """
1417
-
1417
+
1418
1418
  if output_folder is None:
1419
1419
  output_folder = os.path.dirname(input_file)
1420
-
1420
+
1421
1421
  with zipfile.ZipFile(input_file, 'r') as zf:
1422
1422
  zf.extractall(output_folder)
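A round-trip sketch pairing zip_file with unzip_file (module path assumed, paths hypothetical).

# Minimal round-trip: zip a file, then extract it to a new folder
from megadetector.utils.path_utils import zip_file, unzip_file

zip_fn = zip_file('/tmp/report.txt')
unzip_file(zip_fn, output_folder='/tmp/extracted')
# /tmp/extracted/report.txt now exists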
1423
1423
 
@@ -1427,31 +1427,31 @@ def unzip_file(input_file, output_folder=None):
1427
1427
  def compute_file_hash(file_path, algorithm='sha256', allow_failures=True):
1428
1428
  """
1429
1429
  Compute the hash of a file.
1430
-
1430
+
1431
1431
  Adapted from:
1432
-
1432
+
1433
1433
  https://www.geeksforgeeks.org/python-program-to-find-hash-of-file/
1434
-
1434
+
1435
1435
  Args:
1436
1436
  file_path (str): the file to hash
1437
1437
  algorithm (str, optional): the hashing algorithm to use (e.g. md5, sha256)
  allow_failures (bool, optional): whether to return None (rather than raising) when a file fails to hash
1438
-
1438
+
1439
1439
  Returns:
1440
1440
  str: the hash value for this file
1441
1441
  """
1442
-
1442
+
1443
1443
  try:
1444
-
1444
+
1445
1445
  hash_func = hashlib.new(algorithm)
1446
-
1446
+
1447
1447
  with open(file_path, 'rb') as file:
1448
1448
  while chunk := file.read(8192): # Read the file in chunks of 8192 bytes
1449
1449
  hash_func.update(chunk)
1450
-
1450
+
1451
1451
  return str(hash_func.hexdigest())
1452
-
1452
+
1453
1453
  except Exception:
1454
-
1454
+
1455
1455
  if allow_failures:
1456
1456
  return None
1457
1457
  else:
@@ -1461,14 +1461,14 @@ def compute_file_hash(file_path, algorithm='sha256', allow_failures=True):
1461
1461
 
1462
1462
 
1463
1463
  def parallel_compute_file_hashes(filenames,
1464
- max_workers=16,
1465
- use_threads=True,
1464
+ max_workers=16,
1465
+ use_threads=True,
1466
1466
  recursive=True,
1467
1467
  algorithm='sha256',
1468
1468
  verbose=False):
1469
1469
  """
1470
1470
  Compute file hashes for a list or folder of images.
1471
-
1471
+
1472
1472
  Args:
1473
1473
  filenames (list or str): a list of filenames or a folder
1474
1474
  max_workers (int, optional): the number of parallel workers to use; set to <=1 to disable
@@ -1478,8 +1478,8 @@ def parallel_compute_file_hashes(filenames,
1478
1478
  algorithm (str, optional): the hashing algorithm to use (e.g. md5, sha256)
1479
1479
  recursive (bool, optional): if [filenames] is a folder, whether to enumerate recursively.
1480
1480
  Ignored if [filenames] is a list.
1481
- verbose (bool, optional): enable additional debug output
1482
-
1481
+ verbose (bool, optional): enable additional debug output
1482
+
1483
1483
  Returns:
1484
1484
  dict: a dict mapping filenames to hash values; values will be None for files that fail
1485
1485
  to load.
@@ -1489,35 +1489,1140 @@ def parallel_compute_file_hashes(filenames,
1489
1489
  if verbose:
1490
1490
  print('Enumerating files in {}'.format(filenames))
1491
1491
  filenames = recursive_file_list(filenames,recursive=recursive,return_relative_paths=False)
1492
-
1492
+
1493
1493
  n_workers = min(max_workers,len(filenames))
1494
-
1494
+
1495
1495
  if verbose:
1496
1496
  print('Computing hashes for {} files on {} workers'.format(len(filenames),n_workers))
1497
-
1497
+
1498
1498
  if n_workers <= 1:
1499
-
1499
+
1500
1500
  results = []
1501
1501
  for filename in filenames:
1502
1502
  results.append(compute_file_hash(filename,algorithm=algorithm,allow_failures=True))
1503
-
1503
+
1504
1504
  else:
1505
-
1505
+
1506
1506
  if use_threads:
1507
1507
  pool = ThreadPool(n_workers)
1508
1508
  else:
1509
1509
  pool = Pool(n_workers)
1510
-
1510
+
1511
1511
  results = list(tqdm(pool.imap(
1512
1512
  partial(compute_file_hash,algorithm=algorithm,allow_failures=True),
1513
1513
  filenames), total=len(filenames)))
1514
-
1514
+
1515
1515
  assert len(filenames) == len(results), 'Internal error in parallel_compute_file_hashes'
1516
-
1516
+
1517
1517
  to_return = {}
1518
1518
  for i_file,filename in enumerate(filenames):
1519
1519
  to_return[filename] = results[i_file]
1520
-
1520
+
1521
1521
  return to_return
1522
1522
 
1523
1523
  # ...def parallel_compute_file_hashes(...)
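One natural application of parallel_compute_file_hashes is duplicate detection; a sketch under the same module-path assumption as above, with hypothetical paths.

# Minimal sketch: hash a folder in parallel, then invert the
# filename -> hash mapping to find duplicate files
from collections import defaultdict
from megadetector.utils.path_utils import parallel_compute_file_hashes

filename_to_hash = parallel_compute_file_hashes('/tmp/images', max_workers=8)

hash_to_filenames = defaultdict(list)
for fn, h in filename_to_hash.items():
    if h is not None:  # None indicates a file that failed to hash
        hash_to_filenames[h].append(fn)

duplicates = {h: fns for h, fns in hash_to_filenames.items() if len(fns) > 1}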
1524
+
1525
+
1526
+ #%% Tests
1527
+
1528
+ class TestPathUtils:
1529
+ """
1530
+ Tests for path_utils.py
1531
+ """
1532
+
1533
+ def set_up(self):
1534
+ """
1535
+ Create a temporary directory for testing.
1536
+ """
1537
+
1538
+ self.test_dir = make_test_folder(subfolder='megadetector/path_utils_tests')
1539
+ os.makedirs(self.test_dir, exist_ok=True)
1540
+
1541
+
1542
+ def tear_down(self):
1543
+ """
1544
+ Remove the temporary directory after tests.
1545
+ """
1546
+
1547
+ if os.path.exists(self.test_dir):
1548
+ shutil.rmtree(self.test_dir)
1549
+
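These tests use plain asserts with explicitly-invoked set_up()/tear_down() methods rather than a unittest or pytest fixture, so a single test can be run in isolation; a sketch, assuming TestPathUtils is importable:

# Minimal sketch: run one test method by hand
t = TestPathUtils()
t.set_up()
try:
    t.test_fileparts()
finally:
    t.tear_down()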
1550
+
1551
+ def test_is_image_file(self):
1552
+ """
1553
+ Test the is_image_file function.
1554
+ """
1555
+
1556
+ assert is_image_file('test.jpg')
1557
+ assert is_image_file('test.jpeg')
1558
+ assert is_image_file('test.png')
1559
+ assert is_image_file('test.gif')
1560
+ assert is_image_file('test.bmp')
1561
+ assert is_image_file('test.tiff')
1562
+ assert is_image_file('test.TIF')
1563
+ assert not is_image_file('test.txt')
1564
+ assert not is_image_file('test.doc')
1565
+ assert is_image_file('path/to/image.JPG')
1566
+ assert not is_image_file('image')
1567
+ assert is_image_file('test.custom', img_extensions=['.custom'])
1568
+ assert not is_image_file('test.jpg', img_extensions=['.custom'])
1569
+
1570
+
1571
+ def test_find_image_strings(self):
1572
+ """
1573
+ Test the find_image_strings function.
1574
+ """
1575
+
1576
+ strings = ['a.jpg', 'b.txt', 'c.PNG', 'd.gif', 'e.jpeg', 'f.doc']
1577
+ expected = ['a.jpg', 'c.PNG', 'd.gif', 'e.jpeg']
1578
+ assert sorted(find_image_strings(strings)) == sorted(expected)
1579
+ assert find_image_strings([]) == []
1580
+ assert find_image_strings(['no_image.txt', 'another.doc']) == []
1581
+
1582
+
1583
+ def test_find_images(self):
1584
+ """
1585
+ Test the find_images function.
1586
+ """
1587
+
1588
+ # Create some dummy files
1589
+ img1_abs = os.path.join(self.test_dir, 'img1.jpg')
1590
+ img2_abs = os.path.join(self.test_dir, 'img2.PNG')
1591
+ txt1_abs = os.path.join(self.test_dir, 'text1.txt')
1592
+ open(img1_abs, 'w').close()
1593
+ open(img2_abs, 'w').close()
1594
+ open(txt1_abs, 'w').close()
1595
+
1596
+ subdir = os.path.join(self.test_dir, 'subdir')
1597
+ os.makedirs(subdir, exist_ok=True)
1598
+ img3_abs = os.path.join(subdir, 'img3.jpeg')
1599
+ txt2_abs = os.path.join(subdir, 'text2.txt')
1600
+ open(img3_abs, 'w').close()
1601
+ open(txt2_abs, 'w').close()
1602
+
1603
+ # Test non-recursive
1604
+ expected_non_recursive_abs = sorted([img1_abs.replace('\\', '/'), img2_abs.replace('\\', '/')])
1605
+ found_non_recursive_abs = find_images(self.test_dir, recursive=False, return_relative_paths=False)
1606
+ assert sorted(found_non_recursive_abs) == expected_non_recursive_abs
1607
+
1608
+ # Test non-recursive, relative paths
1609
+ expected_non_recursive_rel = sorted(['img1.jpg', 'img2.PNG'])
1610
+ found_non_recursive_rel = find_images(self.test_dir, recursive=False, return_relative_paths=True)
1611
+ assert sorted(found_non_recursive_rel) == expected_non_recursive_rel
1612
+
1613
+ # Test recursive
1614
+ expected_recursive_abs = sorted([
1615
+ img1_abs.replace('\\', '/'),
1616
+ img2_abs.replace('\\', '/'),
1617
+ img3_abs.replace('\\', '/')
1618
+ ])
1619
+ found_recursive_abs = find_images(self.test_dir, recursive=True, return_relative_paths=False)
1620
+ assert sorted(found_recursive_abs) == expected_recursive_abs
1621
+
1622
+ # Test recursive, relative paths
1623
+ expected_recursive_rel = sorted([
1624
+ 'img1.jpg',
1625
+ 'img2.PNG',
1626
+ os.path.join('subdir', 'img3.jpeg').replace('\\', '/')
1627
+ ])
1628
+ found_recursive_rel = find_images(self.test_dir, recursive=True, return_relative_paths=True)
1629
+ assert sorted(found_recursive_rel) == expected_recursive_rel
1630
+
1631
+ # Test with an empty directory
1632
+ empty_dir = os.path.join(self.test_dir, 'empty_dir')
1633
+ os.makedirs(empty_dir, exist_ok=True)
1634
+ assert find_images(empty_dir, recursive=True) == []
1635
+
1636
+ # Test with a directory that doesn't exist (should assert)
1637
+ try:
+ find_images(os.path.join(self.test_dir, 'non_existent_dir'))
+ except AssertionError:
+ pass
+ else:
+ raise RuntimeError("AssertionError not raised for non_existent_dir")
1642
+
1643
+
1644
+ def test_recursive_file_list_and_file_list(self):
1645
+ """
1646
+ Test the recursive_file_list and file_list functions.
1647
+ """
1648
+
1649
+ # Setup directory structure
1650
+ # test_dir/
1651
+ # file1.txt
1652
+ # file2.jpg
1653
+ # subdir1/
1654
+ # file3.txt
1655
+ # subsubdir/
1656
+ # file4.png
1657
+ # subdir2/
1658
+ # file5.doc
1659
+
1660
+ list_dir = os.path.join(self.test_dir,'recursive_list')
1661
+
1662
+ f1 = os.path.join(list_dir, 'file1.txt')
1663
+ f2 = os.path.join(list_dir, 'file2.jpg')
1664
+ subdir1 = os.path.join(list_dir, 'subdir1')
1665
+ os.makedirs(subdir1, exist_ok=True)
1666
+ f3 = os.path.join(subdir1, 'file3.txt')
1667
+ subsubdir = os.path.join(subdir1, 'subsubdir')
1668
+ os.makedirs(subsubdir, exist_ok=True)
1669
+ f4 = os.path.join(subsubdir, 'file4.png')
1670
+ subdir2 = os.path.join(list_dir, 'subdir2')
1671
+ os.makedirs(subdir2, exist_ok=True)
1672
+ f5 = os.path.join(subdir2, 'file5.doc')
1673
+
1674
+ for filepath in [f1, f2, f3, f4, f5]:
1675
+ with open(filepath, 'w') as f:
1676
+ f.write('test')
1677
+
1678
+ # Test recursive_file_list (recursive=True by default)
1679
+ expected_all_files_abs = sorted([
1680
+ f1.replace('\\', '/'), f2.replace('\\', '/'), f3.replace('\\', '/'),
1681
+ f4.replace('\\', '/'), f5.replace('\\', '/')
1682
+ ])
1683
+ all_files_abs = recursive_file_list(list_dir, convert_slashes=True,
1684
+ return_relative_paths=False)
1685
+ assert sorted(all_files_abs) == expected_all_files_abs
1686
+
1687
+ # Test recursive_file_list with relative paths
1688
+ expected_all_files_rel = sorted([
1689
+ 'file1.txt', 'file2.jpg',
1690
+ os.path.join('subdir1', 'file3.txt').replace('\\', '/'),
1691
+ os.path.join('subdir1', 'subsubdir', 'file4.png').replace('\\', '/'),
1692
+ os.path.join('subdir2', 'file5.doc').replace('\\', '/')
1693
+ ])
1694
+ all_files_rel = recursive_file_list(list_dir, convert_slashes=True,
1695
+ return_relative_paths=True)
1696
+ assert sorted(all_files_rel) == expected_all_files_rel
1697
+
1698
+ # Test file_list (non-recursive by default via wrapper)
1699
+ expected_top_level_files_abs = sorted([f1.replace('\\', '/'), f2.replace('\\', '/')])
1700
+ top_level_files_abs = file_list(list_dir, convert_slashes=True,
1701
+ return_relative_paths=False, recursive=False)
1702
+ assert sorted(top_level_files_abs) == expected_top_level_files_abs
1703
+
1704
+ # Test file_list (recursive explicitly) - should be same as recursive_file_list
1705
+ recursive_via_file_list = file_list(list_dir, convert_slashes=True,
1706
+ return_relative_paths=False, recursive=True)
1707
+ assert sorted(recursive_via_file_list) == expected_all_files_abs
1708
+
1709
+ # Test with convert_slashes=False (use os.sep)
1710
+ #
1711
+ # Note: This test might be tricky if os.sep is '/', as no replacement happens. We'll check
1712
+ # that backslashes remain on Windows.
1713
+ if os.sep == '\\':
1714
+ f1_raw = os.path.join(list_dir, 'file1.txt')
1715
+ # Only one file for simplicity
1716
+ files_no_slash_conversion = file_list(list_dir, convert_slashes=False, recursive=False)
1717
+ assert any(f1_raw in s for s in files_no_slash_conversion)
1718
+
1719
+ # Test with an empty directory
1720
+ empty_dir = os.path.join(list_dir, "empty_dir_for_files")
1721
+ os.makedirs(empty_dir, exist_ok=True)
1722
+ assert recursive_file_list(empty_dir) == []
1723
+ assert file_list(empty_dir, recursive=False) == []
1724
+
1725
+ # Test with a non-existent directory
1726
+ try:
+ recursive_file_list(os.path.join(list_dir, "non_existent_dir"))
+ except AssertionError:
+ pass
+ else:
+ raise RuntimeError("AssertionError not raised for non_existent_dir in recursive_file_list")
1731
+
1732
+
1733
+ def test_folder_list(self):
1734
+ """
1735
+ Test the folder_list function.
1736
+ """
1737
+
1738
+ # Setup directory structure
1739
+ # test_dir/
1740
+ # subdir1/
1741
+ # subsubdir1/
1742
+ # subdir2/
1743
+ # file1.txt (should be ignored)
1744
+
1745
+ folder_list_dir = os.path.join(self.test_dir,'folder_list')
1746
+
1747
+ subdir1 = os.path.join(folder_list_dir, 'subdir1')
1748
+ subsubdir1 = os.path.join(subdir1, 'subsubdir1')
1749
+ subdir2 = os.path.join(folder_list_dir, 'subdir2')
1750
+ os.makedirs(subdir1, exist_ok=True)
1751
+ os.makedirs(subsubdir1, exist_ok=True)
1752
+ os.makedirs(subdir2, exist_ok=True)
1753
+ with open(os.path.join(folder_list_dir, 'file1.txt'), 'w') as f:
1754
+ f.write('test')
1755
+
1756
+ # Test non-recursive
1757
+ expected_folders_non_recursive_abs = sorted([
1758
+ subdir1.replace('\\', '/'), subdir2.replace('\\', '/')
1759
+ ])
1760
+ folders_non_recursive_abs = folder_list(folder_list_dir, recursive=False,
1761
+ return_relative_paths=False)
1762
+ assert sorted(folders_non_recursive_abs) == expected_folders_non_recursive_abs
1763
+
1764
+ # Test non-recursive, relative paths
1765
+ expected_folders_non_recursive_rel = sorted(['subdir1', 'subdir2'])
1766
+ folders_non_recursive_rel = folder_list(folder_list_dir, recursive=False,
1767
+ return_relative_paths=True)
1768
+ assert sorted(folders_non_recursive_rel) == expected_folders_non_recursive_rel
1769
+
1770
+ # Test recursive
1771
+ expected_folders_recursive_abs = sorted([
1772
+ subdir1.replace('\\', '/'),
1773
+ subsubdir1.replace('\\', '/'),
1774
+ subdir2.replace('\\', '/')
1775
+ ])
1776
+ folders_recursive_abs = folder_list(folder_list_dir, recursive=True,
1777
+ return_relative_paths=False)
1778
+ assert sorted(folders_recursive_abs) == expected_folders_recursive_abs
1779
+
1780
+ # Test recursive, relative paths
1781
+ expected_folders_recursive_rel = sorted([
1782
+ 'subdir1',
1783
+ os.path.join('subdir1', 'subsubdir1').replace('\\', '/'),
1784
+ 'subdir2'
1785
+ ])
1786
+ folders_recursive_rel = folder_list(folder_list_dir, recursive=True,
1787
+ return_relative_paths=True)
1788
+ assert sorted(folders_recursive_rel) == expected_folders_recursive_rel
1789
+
1790
+ # Test with an empty directory (except for the file)
1791
+ empty_dir_for_folders = os.path.join(folder_list_dir, "empty_for_folders")
1792
+ os.makedirs(empty_dir_for_folders, exist_ok=True)
1793
+ with open(os.path.join(empty_dir_for_folders, 'temp.txt'), 'w') as f: f.write('t')
1794
+ assert folder_list(empty_dir_for_folders, recursive=True) == []
1795
+ assert folder_list(empty_dir_for_folders, recursive=False) == []
1796
+
1797
+ # Test with a non-existent directory
1798
+ try:
+ folder_list(os.path.join(self.test_dir, "non_existent_dir"))
+ except AssertionError:
+ pass
+ else:
+ raise RuntimeError("AssertionError not raised for non_existent_dir in folder_list")
1803
+
1804
+
1805
+ def test_folder_summary(self):
1806
+ """
1807
+ Test the folder_summary function.
1808
+ """
1809
+
1810
+ # test_dir/
1811
+ # file1.txt
1812
+ # img1.jpg
1813
+ # subdir/
1814
+ # file2.txt
1815
+ # img2.png
1816
+ # img3.png
1817
+
1818
+ folder_summary_dir = os.path.join(self.test_dir,'folder_summary')
1819
+
1820
+ f1 = os.path.join(folder_summary_dir, 'file1.txt')
1821
+ img1 = os.path.join(folder_summary_dir, 'img1.jpg')
1822
+ subdir = os.path.join(folder_summary_dir, 'subdir')
1823
+ os.makedirs(subdir, exist_ok=True)
1824
+ f2 = os.path.join(subdir, 'file2.txt')
1825
+ img2 = os.path.join(subdir, 'img2.png')
1826
+ img3 = os.path.join(subdir, 'img3.png')
1827
+
1828
+ for filepath in [f1, img1, f2, img2, img3]:
1829
+ with open(filepath, 'w') as f:
1830
+ f.write('test')
1831
+
1832
+ summary = folder_summary(folder_summary_dir, print_summary=False)
1833
+
1834
+ assert summary['n_files'] == 5
1835
+ assert summary['n_folders'] == 1 # 'subdir'
1836
+ assert summary['extension_to_count']['.txt'] == 2
1837
+ assert summary['extension_to_count']['.jpg'] == 1
1838
+ assert summary['extension_to_count']['.png'] == 2
1839
+
1840
+ # Check order (sorted by value, desc)
1841
+ #
1842
+ # The specific order of keys with the same counts can vary based on file system list
1843
+ # order. We'll check that the counts are correct and the number of unique extensions is
1844
+ # right.
1845
+ assert len(summary['extension_to_count']) == 3
1846
+
1847
+
1848
+ empty_dir = os.path.join(folder_summary_dir, "empty_summary_dir")
1849
+ os.makedirs(empty_dir, exist_ok=True)
1850
+ empty_summary = folder_summary(empty_dir, print_summary=False)
1851
+ assert empty_summary['n_files'] == 0
1852
+ assert empty_summary['n_folders'] == 0
1853
+ assert empty_summary['extension_to_count'] == {}
1854
+
1855
+
1856
+ def test_fileparts(self):
1857
+ """
1858
+ Test the fileparts function.
1859
+ """
1860
+
1861
+ assert fileparts('file') == ('', 'file', '')
1862
+ assert fileparts('file.txt') == ('', 'file', '.txt')
1863
+ assert fileparts(r'c:/dir/file.jpg') == ('c:/dir', 'file', '.jpg')
1864
+ assert fileparts('/dir/subdir/file.jpg') == ('/dir/subdir', 'file', '.jpg')
1865
+ assert fileparts(r'c:\dir\file') == (r'c:\dir', 'file', '')
1866
+ assert fileparts(r'c:\dir\file.tar.gz') == (r'c:\dir', 'file.tar', '.gz')
1867
+ assert fileparts('.bashrc') == ('', '.bashrc', '') # Hidden file, no extension
1868
+ assert fileparts('nodir/.bashrc') == ('nodir', '.bashrc', '')
1869
+ assert fileparts('a/b/c.d.e') == ('a/b', 'c.d', '.e')
1870
+
1871
+
1872
+ def test_insert_before_extension(self):
1873
+ """
1874
+ Test the insert_before_extension function.
1875
+ """
1876
+
1877
+ assert insert_before_extension('file.ext', 'inserted') == 'file.inserted.ext'
1878
+ assert insert_before_extension('file', 'inserted') == 'file.inserted'
1879
+ assert insert_before_extension('path/to/file.ext', 'tag') == 'path/to/file.tag.ext'
1880
+ assert insert_before_extension('path/to/file', 'tag') == 'path/to/file.tag'
1881
+ assert insert_before_extension('file.tar.gz', 'new') == 'file.tar.new.gz'
1882
+
1883
+ # Test with custom separator
1884
+ assert insert_before_extension('file.ext', 'inserted', separator='_') == 'file_inserted.ext'
1885
+
1886
+ # Test with s=None (timestamp) - check format roughly
1887
+ fname_with_ts = insert_before_extension('file.ext', None)
1888
+ parts = fname_with_ts.split('.')
1889
+ # file.YYYY.MM.DD.HH.MM.SS.ext
1890
+ assert len(parts) >= 8 # file, Y, M, D, H, M, S, ext
1891
+ assert parts[0] == 'file'
1892
+ assert parts[-1] == 'ext'
1893
+ assert all(p.isdigit() for p in parts[1:-1])
1894
+
1895
+ fname_no_ext_ts = insert_before_extension('file', '') # s is empty string, should also use timestamp
1896
+ parts_no_ext = fname_no_ext_ts.split('.')
1897
+ assert len(parts_no_ext) >= 7 # file, Y, M, D, H, M, S
1898
+ assert parts_no_ext[0] == 'file'
1899
+ assert all(p.isdigit() for p in parts_no_ext[1:])
1900
+
1901
+
1902
+ def test_split_path(self):
1903
+ """
1904
+ Test the split_path function.
1905
+ """
1906
+
1907
+ if os.name == 'nt':
1908
+ assert split_path(r'c:\dir\subdir\file.txt') == ['c:\\', 'dir', 'subdir', 'file.txt']
1909
+ assert split_path('c:\\') == ['c:\\']
1910
+ # Test with mixed slashes, ntpath.split handles them
1911
+ assert split_path(r'c:/dir/subdir/file.txt') == ['c:/', 'dir', 'subdir', 'file.txt']
1912
+ else: # POSIX
1913
+ assert split_path('/dir/subdir/file.jpg') == ['/', 'dir', 'subdir', 'file.jpg']
1914
+ assert split_path('/') == ['/']
1915
+
1916
+ assert split_path('dir/file.txt') == ['dir', 'file.txt']
1917
+ assert split_path('file.txt') == ['file.txt']
1918
+ assert split_path('') == ''
1919
+ assert split_path('.') == ['.']
1920
+ assert split_path('..') == ['..']
1921
+ assert split_path('../a/b') == ['..', 'a', 'b']
1922
+
1923
+
1924
+ def test_path_is_abs(self):
1925
+ """
1926
+ Test the path_is_abs function.
1927
+ """
1928
+
1929
+ assert path_is_abs('/absolute/path')
1930
+ assert path_is_abs('c:/absolute/path')
1931
+ assert path_is_abs('C:\\absolute\\path')
1932
+ assert path_is_abs('\\\\server\\share\\path') # UNC path
1933
+ assert path_is_abs('c:file_without_slash_after_drive')
1934
+
1935
+ assert not path_is_abs('relative/path')
1936
+ assert not path_is_abs('file.txt')
1937
+ assert not path_is_abs('../relative')
1938
+ assert not path_is_abs('')
1939
+
1940
+
1941
+
1942
+ def test_safe_create_link_unix(self):
1943
+ """
1944
+ Test the safe_create_link function on Unix-like systems.
1945
+ """
1946
+
1947
+ if os.name == 'nt':
1948
+ # print("Skipping test_safe_create_link_unix on Windows.")
1949
+ return
1950
+
1951
+ source_file_path = os.path.join(self.test_dir, 'source.txt')
1952
+ link_path = os.path.join(self.test_dir, 'link.txt')
1953
+ other_source_path = os.path.join(self.test_dir, 'other_source.txt')
1954
+
1955
+ with open(source_file_path, 'w') as f:
1956
+ f.write('source data')
1957
+ with open(other_source_path, 'w') as f:
1958
+ f.write('other data')
1959
+
1960
+ # Create new link
1961
+ safe_create_link(source_file_path, link_path)
1962
+ assert os.path.islink(link_path)
1963
+ assert os.readlink(link_path) == source_file_path
1964
+
1965
+ # Link already exists and points to the correct source
1966
+ safe_create_link(source_file_path, link_path) # Should do nothing
1967
+ assert os.path.islink(link_path)
1968
+ assert os.readlink(link_path) == source_file_path
1969
+
1970
+ # Link already exists but points to a different source
1971
+ safe_create_link(other_source_path, link_path) # Should remove and re-create
1972
+ assert os.path.islink(link_path)
1973
+ assert os.readlink(link_path) == other_source_path
1974
+
1975
+ # Link_new path exists and is a file (not a link)
1976
+ file_path_conflict = os.path.join(self.test_dir, 'conflict_file.txt')
1977
+ with open(file_path_conflict, 'w') as f:
1978
+ f.write('actual file')
1979
+ try:
+ safe_create_link(source_file_path, file_path_conflict)
+ except AssertionError:
+ pass
+ else:
+ raise RuntimeError("AssertionError not raised for file conflict")
1984
+ os.remove(file_path_conflict)
1985
+
1986
+ # Link_new path exists and is a directory
1987
+ dir_path_conflict = os.path.join(self.test_dir, 'conflict_dir')
1988
+ os.makedirs(dir_path_conflict, exist_ok=True)
1989
+ try:
+ safe_create_link(source_file_path, dir_path_conflict)
+ except AssertionError: # islink will be false
+ pass
+ else:
+ raise RuntimeError("AssertionError not raised for directory conflict")
1994
+ shutil.rmtree(dir_path_conflict)
1995
+
1996
+
1997
+ def test_remove_empty_folders(self):
1998
+ """
1999
+ Test the remove_empty_folders function.
2000
+ """
2001
+
2002
+ # test_dir/
2003
+ # empty_top/
2004
+ # empty_mid/
2005
+ # empty_leaf/
2006
+ # mixed_top/
2007
+ # empty_mid_in_mixed/
2008
+ # empty_leaf_in_mixed/
2009
+ # non_empty_mid/
2010
+ # file.txt
2011
+ # non_empty_top/
2012
+ # file_in_top.txt
2013
+
2014
+ empty_top = os.path.join(self.test_dir, 'empty_top')
2015
+ empty_mid = os.path.join(empty_top, 'empty_mid')
2016
+ empty_leaf = os.path.join(empty_mid, 'empty_leaf')
2017
+ os.makedirs(empty_leaf, exist_ok=True)
2018
+
2019
+ mixed_top = os.path.join(self.test_dir, 'mixed_top')
2020
+ empty_mid_in_mixed = os.path.join(mixed_top, 'empty_mid_in_mixed')
2021
+ empty_leaf_in_mixed = os.path.join(empty_mid_in_mixed, 'empty_leaf_in_mixed')
2022
+ os.makedirs(empty_leaf_in_mixed, exist_ok=True)
2023
+ non_empty_mid = os.path.join(mixed_top, 'non_empty_mid')
2024
+ os.makedirs(non_empty_mid, exist_ok=True)
2025
+ with open(os.path.join(non_empty_mid, 'file.txt'), 'w') as f:
2026
+ f.write('data')
2027
+
2028
+ non_empty_top = os.path.join(self.test_dir, 'non_empty_top')
2029
+ os.makedirs(non_empty_top, exist_ok=True)
2030
+ with open(os.path.join(non_empty_top, 'file_in_top.txt'), 'w') as f:
2031
+ f.write('data')
2032
+
2033
+ # Process empty_top - should remove all three
2034
+ remove_empty_folders(empty_top, remove_root=True)
2035
+ assert not os.path.exists(empty_top)
2036
+ assert not os.path.exists(empty_mid)
2037
+ assert not os.path.exists(empty_leaf)
2038
+
2039
+ # Process mixed_top; should remove empty_leaf_in_mixed and empty_mid_in_mixed
2040
+ # but not mixed_top or non_empty_mid.
2041
+ remove_empty_folders(mixed_top, remove_root=True)
2042
+ assert os.path.exists(mixed_top) # mixed_top itself should remain
2043
+ assert not os.path.exists(empty_mid_in_mixed)
2044
+ assert not os.path.exists(empty_leaf_in_mixed)
2045
+ assert os.path.exists(non_empty_mid)
2046
+ assert os.path.exists(os.path.join(non_empty_mid, 'file.txt'))
2047
+
2048
+ # Process non_empty_top; should remove nothing.
2049
+ remove_empty_folders(non_empty_top, remove_root=True)
2050
+ assert os.path.exists(non_empty_top)
2051
+ assert os.path.exists(os.path.join(non_empty_top, 'file_in_top.txt'))
2052
+
2053
+ # Test with a file path (should do nothing and return False)
2054
+ file_path_for_removal = os.path.join(self.test_dir, 'a_file.txt')
2055
+ with open(file_path_for_removal, 'w') as f: f.write('t')
2056
+ assert not remove_empty_folders(file_path_for_removal, remove_root=True)
2057
+ assert os.path.exists(file_path_for_removal)
2058
+
2059
+ # Test with remove_root=False for the top level
2060
+ another_empty_top = os.path.join(self.test_dir, 'another_empty_top')
2061
+ another_empty_mid = os.path.join(another_empty_top, 'another_empty_mid')
2062
+ os.makedirs(another_empty_mid)
2063
+ remove_empty_folders(another_empty_top, remove_root=False)
2064
+ assert os.path.exists(another_empty_top) # Root not removed
2065
+ assert not os.path.exists(another_empty_mid) # Mid removed
2066
+
2067
+
2068
+ def test_path_join(self):
2069
+ """
2070
+ Test the path_join function.
2071
+ """
2072
+
2073
+ assert path_join('a', 'b', 'c') == 'a/b/c'
2074
+ assert path_join('a/b', 'c', 'd.txt') == 'a/b/c/d.txt'
2075
+ if os.name == 'nt':
2076
+ # On Windows, os.path.join uses '\', so convert_slashes=True should change it
2077
+ assert path_join('a', 'b', convert_slashes=True) == 'a/b'
2078
+ assert path_join('a', 'b', convert_slashes=False) == 'a\\b'
2079
+ assert path_join('c:\\', 'foo', 'bar', convert_slashes=True) == 'c:/foo/bar'
2080
+ assert path_join('c:\\', 'foo', 'bar', convert_slashes=False) == 'c:\\foo\\bar'
2081
+ else:
2082
+ # On POSIX, os.path.join uses '/', so convert_slashes=False should still be '/'
2083
+ assert path_join('a', 'b', convert_slashes=False) == 'a/b'
2084
+
2085
+ assert path_join('a', '', 'b') == 'a/b' # os.path.join behavior
2086
+ assert path_join('/a', 'b') == '/a/b'
2087
+ assert path_join('a', '/b') == '/b' # '/b' is absolute
2088
+
2089
+
2090
+ def test_filename_cleaning(self):
2091
+ """
2092
+ Test clean_filename, clean_path, and flatten_path functions.
2093
+ """
2094
+
2095
+ # clean_filename
2096
+ assert clean_filename("test file.txt") == "test file.txt"
2097
+ assert clean_filename("test*file?.txt", char_limit=10) == "testfile.t"
2098
+ assert clean_filename("TestFile.TXT", force_lower=True) == "testfile.txt"
2099
+ assert clean_filename("file:with<illegal>chars.txt") == "filewithillegalchars.txt"
2100
+ assert clean_filename(" accented_name_éà.txt") == " accented_name_ea.txt"
2101
+
2102
+ # Separators are not allowed by default in clean_filename
2103
+ assert clean_filename("path/to/file.txt") == "pathtofile.txt"
2104
+
2105
+ # clean_path
2106
+ assert clean_path("path/to/file.txt") == "path/to/file.txt" # slashes allowed
2107
+ assert clean_path("path\\to\\file.txt") == "path\\to\\file.txt" # backslashes allowed
2108
+ assert clean_path("path:to:file.txt") == "path:to:file.txt" # colons allowed
2109
+ assert clean_path("path/to<illegal>/file.txt") == "path/toillegal/file.txt"
2110
+
2111
+ # flatten_path
2112
+ assert flatten_path("path/to/file.txt") == "path~to~file.txt"
2113
+ assert flatten_path("path:to:file.txt", separator_char_replacement='_') == "path_to_file.txt"
2114
+ assert flatten_path("path\\to/file:name.txt") == "path~to~file~name.txt"
2115
+ assert flatten_path("path/to<illegal>/file.txt") == "path~toillegal~file.txt"
2116
+
2117
+
2118
+ def test_is_executable(self):
2119
+ """
2120
+ Test the is_executable function.
2121
+ This is a basic test; comprehensive testing is environment-dependent.
2122
+ """
2123
+
2124
+ # Hard to test reliably across all systems without knowing what's on PATH.
2125
+ if os.name == 'nt':
2126
+ assert is_executable('cmd.exe')
2127
+ assert not is_executable('non_existent_executable_blah_blah')
2128
+ else:
2129
+ assert is_executable('ls')
2130
+ assert is_executable('sh')
2131
+ assert not is_executable('non_existent_executable_blah_blah')
2132
+
2133
+
2134
+ def test_write_read_list_to_file(self):
2135
+ """
2136
+ Test write_list_to_file and read_list_from_file functions.
2137
+ """
2138
+
2139
+ test_list = ["item1", "item2 with space", "item3/with/slash"]
2140
+
2141
+ # Test with .json
2142
+ json_file_path = os.path.join(self.test_dir, "test_list.json")
2143
+ write_list_to_file(json_file_path, test_list)
2144
+ read_list_json = read_list_from_file(json_file_path)
2145
+ assert test_list == read_list_json
2146
+
2147
+ # Test with .txt
2148
+ txt_file_path = os.path.join(self.test_dir, "test_list.txt")
2149
+ write_list_to_file(txt_file_path, test_list)
2150
+ # read_list_from_file is specifically for JSON, so we read .txt manually
2151
+ with open(txt_file_path, 'r') as f:
2152
+ read_list_txt = [line.strip() for line in f.readlines()]
2153
+ assert test_list == read_list_txt
2154
+
2155
+ # Test reading non-existent json
2156
+ try:
2157
+ read_list_from_file(os.path.join(self.test_dir,"non_existent.json"))
2158
+ raise AssertionError("FileNotFoundError not raised")
2159
+ except FileNotFoundError:
2160
+ pass
2161
+
2162
+ # Test reading a non-json file with read_list_from_file (should fail parsing)
2163
+ non_json_path = os.path.join(self.test_dir, "not_a_list.json")
2164
+ with open(non_json_path, 'w') as f: f.write("this is not json")
2165
+ try:
2166
+ read_list_from_file(non_json_path)
2167
+ raise AssertionError("json.JSONDecodeError not raised")
2168
+ except json.JSONDecodeError:
2169
+ pass
2170
+
2171
+
2172
+ def test_parallel_copy_files(self):
2173
+ """
2174
+ Test the parallel_copy_files function (with max_workers=1 for test simplicity).
2175
+ """
2176
+
2177
+ source_dir = os.path.join(self.test_dir, "copy_source")
2178
+ target_dir = os.path.join(self.test_dir, "copy_target")
2179
+ os.makedirs(source_dir, exist_ok=True)
2180
+
2181
+ file_mappings = {}
2182
+ source_files_content = {}
2183
+
2184
+ for i in range(3):
2185
+ src_fn = f"file{i}.txt"
2186
+ src_path = os.path.join(source_dir, src_fn)
2187
+ if i == 0:
2188
+ tgt_fn = f"copied_file{i}.txt"
2189
+ tgt_path = os.path.join(target_dir, tgt_fn)
2190
+ else:
2191
+ tgt_fn = f"copied_file{i}_subdir.txt"
2192
+ tgt_path = os.path.join(target_dir, f"sub{i}", tgt_fn)
2193
+
2194
+ content = f"content of file {i}"
2195
+ with open(src_path, 'w') as f:
2196
+ f.write(content)
2197
+
2198
+ file_mappings[src_path] = tgt_path
2199
+ source_files_content[tgt_path] = content
2200
+
2201
+ # Test copy
2202
+ parallel_copy_files(file_mappings, max_workers=1, use_threads=True, overwrite=False)
2203
+ for tgt_path, expected_content in source_files_content.items():
2204
+ assert os.path.exists(tgt_path)
2205
+ with open(tgt_path, 'r') as f:
2206
+ assert f.read() == expected_content
2207
+
2208
+ existing_target_path = list(source_files_content.keys())[0]
2209
+ with open(existing_target_path, 'w') as f:
2210
+ f.write("old content")
2211
+
2212
+ parallel_copy_files(file_mappings, max_workers=1, use_threads=True, overwrite=False)
2213
+ with open(existing_target_path, 'r') as f:
2214
+ assert f.read() == "old content"
2215
+
2216
+ parallel_copy_files(file_mappings, max_workers=1, use_threads=True, overwrite=True)
2217
+ with open(existing_target_path, 'r') as f:
2218
+ assert f.read() == source_files_content[existing_target_path]
2219
+
2220
+ for src_path_orig, tgt_path_orig in file_mappings.items(): # Re-create source for move
2221
+ with open(src_path_orig, 'w') as f:
2222
+ f.write(source_files_content[tgt_path_orig])
2223
+
2224
+ parallel_copy_files(file_mappings, max_workers=1, use_threads=True, move=True, overwrite=True)
2225
+ for src_path, tgt_path in file_mappings.items():
2226
+ assert not os.path.exists(src_path)
2227
+ assert os.path.exists(tgt_path)
2228
+ with open(tgt_path, 'r') as f:
2229
+ assert f.read() == source_files_content[tgt_path]
2230
+
2231
+
2232
+ def test_get_file_sizes(self):
2233
+ """
2234
+ Test get_file_sizes and parallel_get_file_sizes functions.
2235
+ """
2236
+
2237
+ file_sizes_test_dir = os.path.join(self.test_dir,'file_sizes')
2238
+ os.makedirs(file_sizes_test_dir,exist_ok=True)
2239
+
2240
+ f1_path = os.path.join(file_sizes_test_dir, 'file1.txt')
2241
+ content1 = "0123456789" # 10 bytes
2242
+ with open(f1_path, 'w') as f:
2243
+ f.write(content1)
2244
+
2245
+ subdir_path = os.path.join(file_sizes_test_dir, 'subdir')
2246
+ os.makedirs(subdir_path, exist_ok=True)
2247
+ f2_path = os.path.join(subdir_path, 'file2.txt')
2248
+ content2 = "01234567890123456789" # 20 bytes
2249
+ with open(f2_path, 'w') as f:
2250
+ f.write(content2)
2251
+
2252
+ sizes_relative = get_file_sizes(file_sizes_test_dir)
2253
+ expected_sizes_relative = {
2254
+ 'file1.txt': len(content1),
2255
+ os.path.join('subdir', 'file2.txt').replace('\\','/'): len(content2)
2256
+ }
2257
+ assert sizes_relative == expected_sizes_relative
2258
+
2259
+ file_list_abs = [f1_path, f2_path]
2260
+ sizes_parallel_abs = parallel_get_file_sizes(file_list_abs, max_workers=1)
2261
+ expected_sizes_parallel_abs = {
2262
+ f1_path.replace('\\','/'): len(content1),
2263
+ f2_path.replace('\\','/'): len(content2)
2264
+ }
2265
+ assert sizes_parallel_abs == expected_sizes_parallel_abs
2266
+
2267
+ sizes_parallel_folder_abs = parallel_get_file_sizes(file_sizes_test_dir, max_workers=1, return_relative_paths=False)
2268
+ assert sizes_parallel_folder_abs == expected_sizes_parallel_abs
2269
+
2270
+ sizes_parallel_folder_rel = parallel_get_file_sizes(file_sizes_test_dir, max_workers=1, return_relative_paths=True)
2271
+ assert sizes_parallel_folder_rel == expected_sizes_relative
2272
+
2273
+ non_existent_file = os.path.join(file_sizes_test_dir, "no_such_file.txt")
2274
+ sizes_with_error = parallel_get_file_sizes([f1_path, non_existent_file], max_workers=1)
2275
+ expected_with_error = {
2276
+ f1_path.replace('\\','/'): len(content1),
2277
+ non_existent_file.replace('\\','/'): None
2278
+ }
2279
+ assert sizes_with_error == expected_with_error
2280
+
2281
+
2282
+ def test_zip_file_and_unzip_file(self):
2283
+ """
2284
+ Test zip_file and unzip_file functions.
2285
+ """
2286
+
2287
+ file_to_zip_name = "test_zip_me.txt"
2288
+ file_to_zip_path = os.path.join(self.test_dir, file_to_zip_name)
2289
+ content = "This is the content to be zipped."
2290
+ with open(file_to_zip_path, 'w') as f:
2291
+ f.write(content)
2292
+
2293
+ default_zip_output_path = file_to_zip_path + ".zip"
2294
+ returned_zip_path = zip_file(file_to_zip_path)
2295
+ assert returned_zip_path == default_zip_output_path
2296
+ assert os.path.exists(default_zip_output_path)
2297
+
2298
+ unzip_dir_default = os.path.join(self.test_dir, "unzip_default")
2299
+ os.makedirs(unzip_dir_default, exist_ok=True)
2300
+ unzip_file(default_zip_output_path, unzip_dir_default)
2301
+ unzipped_file_path_default = os.path.join(unzip_dir_default, file_to_zip_name)
2302
+ assert os.path.exists(unzipped_file_path_default)
2303
+ with open(unzipped_file_path_default, 'r') as f:
2304
+ assert f.read() == content
2305
+
2306
+ custom_zip_output_name = "custom_archive.zip"
2307
+ custom_zip_output_path = os.path.join(self.test_dir, custom_zip_output_name)
2308
+ zip_file(file_to_zip_path, output_fn=custom_zip_output_path, overwrite=True)
2309
+ assert os.path.exists(custom_zip_output_path)
2310
+
2311
+ zip_in_subdir_path = os.path.join(self.test_dir, "subdir_zip", "my.zip")
2312
+ file_in_subdir_name = "file_for_subdir_zip.txt"
2313
+ file_in_subdir_path = os.path.join(self.test_dir,"subdir_zip", file_in_subdir_name)
2314
+ os.makedirs(os.path.dirname(zip_in_subdir_path), exist_ok=True)
2315
+ with open(file_in_subdir_path, "w") as f: f.write("sub dir content")
2316
+ zip_file(file_in_subdir_path, output_fn=zip_in_subdir_path)
2317
+
2318
+ unzip_file(zip_in_subdir_path, output_folder=None)
2319
+ unzipped_in_same_dir_path = os.path.join(os.path.dirname(zip_in_subdir_path), file_in_subdir_name)
2320
+ assert os.path.exists(unzipped_in_same_dir_path)
2321
+ with open(unzipped_in_same_dir_path, 'r') as f:
2322
+ assert f.read() == "sub dir content"
2323
+
2324
+
2325
+ def test_zip_folder(self):
2326
+ """
2327
+ Test the zip_folder function.
2328
+ """
2329
+
2330
+ folder_to_zip = os.path.join(self.test_dir, "folder_to_zip")
2331
+ os.makedirs(folder_to_zip, exist_ok=True)
2332
+
2333
+ file1_name = "file1.txt"; path1 = os.path.join(folder_to_zip, file1_name)
2334
+ file2_name = "file2.log"; path2 = os.path.join(folder_to_zip, file2_name)
2335
+ subdir_name = "sub"; subdir_path = os.path.join(folder_to_zip, subdir_name)
2336
+ os.makedirs(subdir_path, exist_ok=True)
2337
+ file3_name = "file3.dat"; path3 = os.path.join(subdir_path, file3_name)
2338
+
2339
+ content1 = "content1"; content2 = "content2"; content3 = "content3"
2340
+ with open(path1, 'w') as f: f.write(content1)
2341
+ with open(path2, 'w') as f: f.write(content2)
2342
+ with open(path3, 'w') as f: f.write(content3)
2343
+
2344
+ default_zip_path = folder_to_zip + ".zip"
2345
+ zip_folder(folder_to_zip, output_fn=None, overwrite=True)
2346
+ assert os.path.exists(default_zip_path)
2347
+
2348
+ unzip_output_dir = os.path.join(self.test_dir, "unzipped_folder_content")
2349
+ os.makedirs(unzip_output_dir, exist_ok=True)
2350
+ unzip_file(default_zip_path, unzip_output_dir)
2351
+
2352
+ assert os.path.exists(os.path.join(unzip_output_dir, file1_name))
2353
+ assert os.path.exists(os.path.join(unzip_output_dir, file2_name))
2354
+ assert os.path.exists(os.path.join(unzip_output_dir, subdir_name, file3_name))
2355
+ with open(os.path.join(unzip_output_dir, file1_name), 'r') as f: assert f.read() == content1
2356
+ with open(os.path.join(unzip_output_dir, file2_name), 'r') as f: assert f.read() == content2
2357
+ with open(os.path.join(unzip_output_dir, subdir_name, file3_name), 'r') as f: assert f.read() == content3
2358
+
2359
+ mtime_before = os.path.getmtime(default_zip_path)
2360
+ zip_folder(folder_to_zip, output_fn=None, overwrite=False)
2361
+ mtime_after = os.path.getmtime(default_zip_path)
2362
+ assert mtime_before == mtime_after
2363
+
2364
+
2365
+ def test_zip_files_into_single_zipfile(self):
2366
+ """
2367
+ Test zip_files_into_single_zipfile.
2368
+ """
2369
+
2370
+ file1_path = os.path.join(self.test_dir, "zfs_file1.txt")
2371
+ content1 = "content for zfs1"
2372
+ with open(file1_path, 'w') as f: f.write(content1)
2373
+
2374
+ subdir_for_zfs = os.path.join(self.test_dir, "zfs_subdir")
2375
+ os.makedirs(subdir_for_zfs, exist_ok=True)
2376
+ file2_path = os.path.join(subdir_for_zfs, "zfs_file2.log")
2377
+ content2 = "content for zfs2"
2378
+ with open(file2_path, 'w') as f: f.write(content2)
2379
+
2380
+ input_files = [file1_path, file2_path]
2381
+ output_zip_path = os.path.join(self.test_dir, "multi_file_archive.zip")
2382
+ zip_files_into_single_zipfile(input_files, output_zip_path, arc_name_base=self.test_dir, overwrite=True)
2383
+ assert os.path.exists(output_zip_path)
2384
+
2385
+ unzip_dir = os.path.join(self.test_dir, "unzip_multi_file")
2386
+ os.makedirs(unzip_dir, exist_ok=True)
2387
+ unzip_file(output_zip_path, unzip_dir)
2388
+
2389
+ expected_unzipped_file1 = os.path.join(unzip_dir, os.path.relpath(file1_path, self.test_dir))
2390
+ expected_unzipped_file2 = os.path.join(unzip_dir, os.path.relpath(file2_path, self.test_dir))
2391
+
2392
+ assert os.path.exists(expected_unzipped_file1)
2393
+ with open(expected_unzipped_file1, 'r') as f: assert f.read() == content1
2394
+ assert os.path.exists(expected_unzipped_file2)
2395
+ assert os.path.basename(expected_unzipped_file2) == "zfs_file2.log"
2396
+ assert os.path.basename(os.path.dirname(expected_unzipped_file2)) == "zfs_subdir"
2397
+ with open(expected_unzipped_file2, 'r') as f: assert f.read() == content2
2398
+
2399
+
2400
+ def test_add_files_to_single_tar_file(self):
2401
+ """
2402
+ Test add_files_to_single_tar_file.
2403
+ """
2404
+
2405
+ file1_path = os.path.join(self.test_dir, "tar_file1.txt")
2406
+ content1 = "content for tar1"
2407
+ with open(file1_path, 'w') as f: f.write(content1)
2408
+
2409
+ subdir_for_tar = os.path.join(self.test_dir, "tar_subdir")
2410
+ os.makedirs(subdir_for_tar, exist_ok=True)
2411
+ file2_path = os.path.join(subdir_for_tar, "tar_file2.log")
2412
+ content2 = "content for tar2"
2413
+ with open(file2_path, 'w') as f: f.write(content2)
2414
+
2415
+ input_files = [file1_path, file2_path]
2416
+ output_tar_path = os.path.join(self.test_dir, "archive.tar.gz")
2417
+
2418
+ add_files_to_single_tar_file(input_files, output_tar_path, arc_name_base=self.test_dir,
2419
+ overwrite=True, mode='x:gz')
2420
+ assert os.path.exists(output_tar_path)
2421
+
2422
+ un_tar_dir = os.path.join(self.test_dir, "un_tar_contents")
2423
+ os.makedirs(un_tar_dir, exist_ok=True)
2424
+ with tarfile.open(output_tar_path, 'r:gz') as tf:
2425
+ tf.extractall(path=un_tar_dir)
2426
+
2427
+ expected_untarred_file1 = os.path.join(un_tar_dir, os.path.relpath(file1_path, self.test_dir))
2428
+ expected_untarred_file2 = os.path.join(un_tar_dir, os.path.relpath(file2_path, self.test_dir))
2429
+
2430
+ assert os.path.exists(expected_untarred_file1)
2431
+ with open(expected_untarred_file1, 'r') as f: assert f.read() == content1
2432
+ assert os.path.exists(expected_untarred_file2)
2433
+ with open(expected_untarred_file2, 'r') as f: assert f.read() == content2
2434
+
2435
+
2436
+ def test_parallel_zip_individual_files_and_folders(self):
2437
+ """
2438
+ Test parallel_zip_files, parallel_zip_folders, and zip_each_file_in_folder.
2439
+ """
2440
+
2441
+ file1_to_zip = os.path.join(self.test_dir, "pz_file1.txt")
2442
+ file2_to_zip = os.path.join(self.test_dir, "pz_file2.txt")
2443
+ with open(file1_to_zip, 'w') as f: f.write("pz_content1")
2444
+ with open(file2_to_zip, 'w') as f: f.write("pz_content2")
2445
+
2446
+ parallel_zip_files([file1_to_zip, file2_to_zip], max_workers=1, overwrite=True)
2447
+ assert os.path.exists(file1_to_zip + ".zip")
2448
+ assert os.path.exists(file2_to_zip + ".zip")
2449
+ unzip_dir_pz = os.path.join(self.test_dir, "unzip_pz")
2450
+ unzip_file(file1_to_zip + ".zip", unzip_dir_pz)
2451
+ assert os.path.exists(os.path.join(unzip_dir_pz, os.path.basename(file1_to_zip)))
2452
+
2453
+ folder1_to_zip = os.path.join(self.test_dir, "pz_folder1")
2454
+ os.makedirs(folder1_to_zip, exist_ok=True)
2455
+ with open(os.path.join(folder1_to_zip, "pf1.txt"), 'w') as f: f.write("pf1_content")
2456
+ folder2_to_zip = os.path.join(self.test_dir, "pz_folder2")
2457
+ os.makedirs(folder2_to_zip, exist_ok=True)
2458
+ with open(os.path.join(folder2_to_zip, "pf2.txt"), 'w') as f: f.write("pf2_content")
2459
+
2460
+ parallel_zip_folders([folder1_to_zip, folder2_to_zip], max_workers=1, overwrite=True)
2461
+ assert os.path.exists(folder1_to_zip + ".zip")
2462
+ assert os.path.exists(folder2_to_zip + ".zip")
2463
+ unzip_dir_pzf = os.path.join(self.test_dir, "unzip_pzf")
2464
+ unzip_file(folder1_to_zip + ".zip", unzip_dir_pzf)
2465
+ assert os.path.exists(os.path.join(unzip_dir_pzf, "pf1.txt"))
2466
+
2467
+ zef_folder = os.path.join(self.test_dir, "zef_test_folder")
2468
+ os.makedirs(zef_folder, exist_ok=True)
2469
+ zef_file1 = os.path.join(zef_folder, "zef1.txt")
2470
+ zef_file2_png = os.path.join(zef_folder, "zef2.png")
2471
+ zef_file3_zip = os.path.join(zef_folder, "zef3.zip")
2472
+ zef_subdir = os.path.join(zef_folder, "zef_sub")
2473
+ os.makedirs(zef_subdir, exist_ok=True)
2474
+ zef_file_in_sub = os.path.join(zef_subdir, "zef_subfile.txt")
2475
+
2476
+ for p_path in [zef_file1, zef_file2_png, zef_file3_zip, zef_file_in_sub]:
2477
+ with open(p_path, 'w') as f: f.write(f"content of {os.path.basename(p_path)}")
2478
+
2479
+ zip_each_file_in_folder(zef_folder, recursive=False, max_workers=1, overwrite=True)
2480
+ assert os.path.exists(zef_file1 + ".zip")
2481
+ assert os.path.exists(zef_file2_png + ".zip")
2482
+ assert not os.path.exists(zef_file3_zip + ".zip")
2483
+ assert not os.path.exists(zef_file_in_sub + ".zip")
2484
+
2485
+ if os.path.exists(zef_file1 + ".zip"): os.remove(zef_file1 + ".zip")
2486
+ if os.path.exists(zef_file2_png + ".zip"): os.remove(zef_file2_png + ".zip")
2487
+
2488
+ zip_each_file_in_folder(zef_folder, recursive=True, max_workers=1, overwrite=True)
2489
+ assert os.path.exists(zef_file1 + ".zip")
2490
+ assert os.path.exists(zef_file2_png + ".zip")
2491
+ assert not os.path.exists(zef_file3_zip + ".zip")
2492
+ assert os.path.exists(zef_file_in_sub + ".zip")
2493
+
2494
+ if os.path.exists(zef_file1 + ".zip"): os.remove(zef_file1 + ".zip")
2495
+ if os.path.exists(zef_file2_png + ".zip"): os.remove(zef_file2_png + ".zip")
2496
+ if os.path.exists(zef_file_in_sub + ".zip"): os.remove(zef_file_in_sub + ".zip")
2497
+ zip_each_file_in_folder(zef_folder, recursive=True, required_token="zef1", max_workers=1, overwrite=True)
2498
+ assert os.path.exists(zef_file1 + ".zip")
2499
+ assert not os.path.exists(zef_file2_png + ".zip")
2500
+ assert not os.path.exists(zef_file_in_sub + ".zip")
2501
+
2502
+ if os.path.exists(zef_file1 + ".zip"): os.remove(zef_file1 + ".zip")
2503
+ dummy_to_zip = os.path.join(zef_folder,"dummy.txt")
2504
+ with open(dummy_to_zip,'w') as f: f.write('d')
2505
+ zip_each_file_in_folder(zef_folder, recursive=False, exclude_zip=False, max_workers=1, overwrite=True)
2506
+ assert os.path.exists(dummy_to_zip + ".zip")
2507
+ assert os.path.exists(zef_file3_zip + ".zip")
2508
+ if os.path.exists(dummy_to_zip + ".zip"): os.remove(dummy_to_zip + ".zip")
2509
+ if os.path.exists(zef_file3_zip + ".zip"): os.remove(zef_file3_zip + ".zip")
2510
+
2511
+
+
+    def test_compute_file_hash(self):
+        """
+        Test compute_file_hash and parallel_compute_file_hashes.
+        """
+
+        file1_name = "hash_me1.txt"
+        file1_path = os.path.join(self.test_dir, file1_name)
+        content1 = "This is a test string for hashing."
+        with open(file1_path, 'w') as f:
+            f.write(content1)
+
+        file2_name = "hash_me2.txt"
+        file2_path = os.path.join(self.test_dir, file2_name)
+        with open(file2_path, 'w') as f:
+            f.write(content1)
+
+        file3_name = "hash_me3.txt"
+        file3_path = os.path.join(self.test_dir, file3_name)
+        content3 = "This is a different test string for hashing."
+        with open(file3_path, 'w') as f:
+            f.write(content3)
+
+        expected_hash_content1_sha256 = \
+            "c56f19d76df6a09e49fe0d9ce7b1bc7f1dbd582f668742bede65c54c47d5bcf4"
+        expected_hash_content3_sha256 = \
+            "23013ff7e93264317f7b2fc0e9a217649f2dc0b11ca7e0bd49632424b70b6680"
+
+        # Identical content should hash identically, regardless of filename
+        hash1 = compute_file_hash(file1_path)
+        hash2 = compute_file_hash(file2_path)
+        hash3 = compute_file_hash(file3_path)
+        assert hash1 == expected_hash_content1_sha256
+        assert hash2 == expected_hash_content1_sha256
+        assert hash1 != hash3
+        assert hash3 == expected_hash_content3_sha256
+
+        expected_hash_content1_md5 = "94b971f1f8cdb23c2af82af73160d4b0"
+        hash1_md5 = compute_file_hash(file1_path, algorithm='md5')
+        assert hash1_md5 == expected_hash_content1_md5
+
+        # Missing files should return None with allow_failures=True, and raise otherwise
+        non_existent_path = os.path.join(self.test_dir, "no_such_file.txt")
+        assert compute_file_hash(non_existent_path, allow_failures=True) is None
+        try:
+            compute_file_hash(non_existent_path, allow_failures=False)
+            raise AssertionError("FileNotFoundError not raised for compute_file_hash")
+        except FileNotFoundError:
+            pass
+
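The implementation itself isn't shown in this hunk, but the contract the assertions describe (chunked hashing via hashlib, SHA-256 by default, a selectable algorithm, None on failure when allow_failures is set) might be sketched as follows; the chunk size and the breadth of the exception handling are assumptions:

    import hashlib

    def compute_file_hash_sketch(file_path, algorithm='sha256', allow_failures=False):
        try:
            h = hashlib.new(algorithm)
            with open(file_path, 'rb') as f:
                # Read in chunks so large files never need to fit in memory
                for chunk in iter(lambda: f.read(1024 * 1024), b''):
                    h.update(chunk)
            return h.hexdigest()
        except Exception:
            if allow_failures:
                return None
            raise

The expected-hash constants above can be regenerated the same way, e.g. hashlib.sha256(content1.encode()).hexdigest().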
+        # Hash a list of files in parallel; failures map to None
+        files_to_hash = [file1_path, file3_path, non_existent_path]
+        hashes_parallel = parallel_compute_file_hashes(files_to_hash, max_workers=1)
+
+        # Normalize path separators so the comparison is platform-independent
+        norm_f1 = file1_path.replace('\\', '/')
+        norm_f3 = file3_path.replace('\\', '/')
+        norm_non = non_existent_path.replace('\\', '/')
+
+        expected_parallel_hashes = {
+            norm_f1: expected_hash_content1_sha256,
+            norm_f3: expected_hash_content3_sha256,
+            norm_non: None
+        }
+        hashes_parallel_norm = {k.replace('\\', '/'): v for k, v in hashes_parallel.items()}
+        assert hashes_parallel_norm == expected_parallel_hashes
+
+        # parallel_compute_file_hashes should also accept a folder
+        hash_folder = os.path.join(self.test_dir, "hash_test_folder")
+        os.makedirs(hash_folder, exist_ok=True)
+        h_f1_name = "h_f1.txt"
+        h_f1_path = os.path.join(hash_folder, h_f1_name)
+        h_f2_name = "h_f2.txt"
+        h_f2_path = os.path.join(hash_folder, h_f2_name)
+        with open(h_f1_path, 'w') as f:
+            f.write(content1)
+        with open(h_f2_path, 'w') as f:
+            f.write(content3)
+
+        hashes_folder_parallel = parallel_compute_file_hashes(hash_folder, recursive=False, max_workers=1)
+        norm_hf1 = h_f1_path.replace('\\', '/')
+        norm_hf2 = h_f2_path.replace('\\', '/')
+        expected_folder_hashes = {
+            norm_hf1: expected_hash_content1_sha256,
+            norm_hf2: expected_hash_content3_sha256
+        }
+        hashes_folder_parallel_norm = {k.replace('\\', '/'): v for k, v in hashes_folder_parallel.items()}
+        assert hashes_folder_parallel_norm == expected_folder_hashes
+
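The parallel variant can be sketched in the same spirit, reusing compute_file_hash_sketch from the previous sketch; the thread-pool choice, the folder-enumeration details, and the helper's name are illustrative rather than taken from the package:

    import os
    from concurrent.futures import ThreadPoolExecutor

    def parallel_compute_file_hashes_sketch(files_or_folder, max_workers=4,
                                            recursive=False):
        # Accept either a list of files or a folder to enumerate
        if isinstance(files_or_folder, str) and os.path.isdir(files_or_folder):
            files = []
            for dirpath, _, filenames in os.walk(files_or_folder):
                files.extend(os.path.join(dirpath, fn) for fn in filenames)
                if not recursive:
                    break
        else:
            files = list(files_or_folder)
        # Hash files concurrently; unreadable files map to None
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            hashes = list(pool.map(
                lambda p: compute_file_hash_sketch(p, allow_failures=True), files))
        return dict(zip(files, hashes))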
+
+def test_path_utils():
+    """
+    Runs all tests in the TestPathUtils class.
+    """
+
+    test_instance = TestPathUtils()
+    test_instance.set_up()
+    try:
+        test_instance.test_is_image_file()
+        test_instance.test_find_image_strings()
+        test_instance.test_find_images()
+        test_instance.test_recursive_file_list_and_file_list()
+        test_instance.test_folder_list()
+        test_instance.test_folder_summary()
+        test_instance.test_fileparts()
+        test_instance.test_insert_before_extension()
+        test_instance.test_split_path()
+        test_instance.test_path_is_abs()
+        test_instance.test_safe_create_link_unix()
+        test_instance.test_remove_empty_folders()
+        test_instance.test_path_join()
+        test_instance.test_filename_cleaning()
+        test_instance.test_is_executable()
+        test_instance.test_write_read_list_to_file()
+        test_instance.test_parallel_copy_files()
+        test_instance.test_get_file_sizes()
+        test_instance.test_zip_file_and_unzip_file()
+        test_instance.test_zip_folder()
+        test_instance.test_zip_files_into_single_zipfile()
+        test_instance.test_add_files_to_single_tar_file()
+        test_instance.test_parallel_zip_individual_files_and_folders()
+        test_instance.test_compute_file_hash()
+    finally:
+        test_instance.tear_down()
+
+# from IPython import embed; embed()
+# test_path_utils()
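Per the commented-out call above, the suite is invoked as a plain function rather than through unittest discovery; assuming these tests live in megadetector.utils.path_utils (the module path is a guess from the package layout, not confirmed by this hunk), a manual run would look like:

    # Hypothetical invocation; the module path is an assumption
    from megadetector.utils.path_utils import test_path_utils
    test_path_utils()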