megadetector-5.0.28-py3-none-any.whl → megadetector-10.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of megadetector might be problematic.
Files changed (197)
  1. megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +2 -2
  2. megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +1 -1
  3. megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +1 -1
  4. megadetector/classification/aggregate_classifier_probs.py +3 -3
  5. megadetector/classification/analyze_failed_images.py +5 -5
  6. megadetector/classification/cache_batchapi_outputs.py +5 -5
  7. megadetector/classification/create_classification_dataset.py +11 -12
  8. megadetector/classification/crop_detections.py +10 -10
  9. megadetector/classification/csv_to_json.py +8 -8
  10. megadetector/classification/detect_and_crop.py +13 -15
  11. megadetector/classification/efficientnet/model.py +8 -8
  12. megadetector/classification/efficientnet/utils.py +6 -5
  13. megadetector/classification/evaluate_model.py +7 -7
  14. megadetector/classification/identify_mislabeled_candidates.py +6 -6
  15. megadetector/classification/json_to_azcopy_list.py +1 -1
  16. megadetector/classification/json_validator.py +29 -32
  17. megadetector/classification/map_classification_categories.py +9 -9
  18. megadetector/classification/merge_classification_detection_output.py +12 -9
  19. megadetector/classification/prepare_classification_script.py +19 -19
  20. megadetector/classification/prepare_classification_script_mc.py +26 -26
  21. megadetector/classification/run_classifier.py +4 -4
  22. megadetector/classification/save_mislabeled.py +6 -6
  23. megadetector/classification/train_classifier.py +1 -1
  24. megadetector/classification/train_classifier_tf.py +9 -9
  25. megadetector/classification/train_utils.py +10 -10
  26. megadetector/data_management/annotations/annotation_constants.py +1 -2
  27. megadetector/data_management/camtrap_dp_to_coco.py +79 -46
  28. megadetector/data_management/cct_json_utils.py +103 -103
  29. megadetector/data_management/cct_to_md.py +49 -49
  30. megadetector/data_management/cct_to_wi.py +33 -33
  31. megadetector/data_management/coco_to_labelme.py +75 -75
  32. megadetector/data_management/coco_to_yolo.py +210 -193
  33. megadetector/data_management/databases/add_width_and_height_to_db.py +86 -12
  34. megadetector/data_management/databases/combine_coco_camera_traps_files.py +40 -40
  35. megadetector/data_management/databases/integrity_check_json_db.py +228 -200
  36. megadetector/data_management/databases/subset_json_db.py +33 -33
  37. megadetector/data_management/generate_crops_from_cct.py +88 -39
  38. megadetector/data_management/get_image_sizes.py +54 -49
  39. megadetector/data_management/labelme_to_coco.py +133 -125
  40. megadetector/data_management/labelme_to_yolo.py +159 -73
  41. megadetector/data_management/lila/create_lila_blank_set.py +81 -83
  42. megadetector/data_management/lila/create_lila_test_set.py +32 -31
  43. megadetector/data_management/lila/create_links_to_md_results_files.py +18 -18
  44. megadetector/data_management/lila/download_lila_subset.py +21 -24
  45. megadetector/data_management/lila/generate_lila_per_image_labels.py +365 -107
  46. megadetector/data_management/lila/get_lila_annotation_counts.py +35 -33
  47. megadetector/data_management/lila/get_lila_image_counts.py +22 -22
  48. megadetector/data_management/lila/lila_common.py +73 -70
  49. megadetector/data_management/lila/test_lila_metadata_urls.py +28 -19
  50. megadetector/data_management/mewc_to_md.py +344 -340
  51. megadetector/data_management/ocr_tools.py +262 -255
  52. megadetector/data_management/read_exif.py +249 -227
  53. megadetector/data_management/remap_coco_categories.py +90 -28
  54. megadetector/data_management/remove_exif.py +81 -21
  55. megadetector/data_management/rename_images.py +187 -187
  56. megadetector/data_management/resize_coco_dataset.py +588 -120
  57. megadetector/data_management/speciesnet_to_md.py +41 -41
  58. megadetector/data_management/wi_download_csv_to_coco.py +55 -55
  59. megadetector/data_management/yolo_output_to_md_output.py +248 -122
  60. megadetector/data_management/yolo_to_coco.py +333 -191
  61. megadetector/detection/change_detection.py +832 -0
  62. megadetector/detection/process_video.py +340 -337
  63. megadetector/detection/pytorch_detector.py +358 -278
  64. megadetector/detection/run_detector.py +399 -186
  65. megadetector/detection/run_detector_batch.py +404 -377
  66. megadetector/detection/run_inference_with_yolov5_val.py +340 -327
  67. megadetector/detection/run_tiled_inference.py +257 -249
  68. megadetector/detection/tf_detector.py +24 -24
  69. megadetector/detection/video_utils.py +332 -295
  70. megadetector/postprocessing/add_max_conf.py +19 -11
  71. megadetector/postprocessing/categorize_detections_by_size.py +45 -45
  72. megadetector/postprocessing/classification_postprocessing.py +468 -433
  73. megadetector/postprocessing/combine_batch_outputs.py +23 -23
  74. megadetector/postprocessing/compare_batch_results.py +590 -525
  75. megadetector/postprocessing/convert_output_format.py +106 -102
  76. megadetector/postprocessing/create_crop_folder.py +347 -147
  77. megadetector/postprocessing/detector_calibration.py +173 -168
  78. megadetector/postprocessing/generate_csv_report.py +508 -499
  79. megadetector/postprocessing/load_api_results.py +48 -27
  80. megadetector/postprocessing/md_to_coco.py +133 -102
  81. megadetector/postprocessing/md_to_labelme.py +107 -90
  82. megadetector/postprocessing/md_to_wi.py +40 -40
  83. megadetector/postprocessing/merge_detections.py +92 -114
  84. megadetector/postprocessing/postprocess_batch_results.py +319 -301
  85. megadetector/postprocessing/remap_detection_categories.py +91 -38
  86. megadetector/postprocessing/render_detection_confusion_matrix.py +214 -205
  87. megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +57 -57
  88. megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +27 -28
  89. megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +704 -679
  90. megadetector/postprocessing/separate_detections_into_folders.py +226 -211
  91. megadetector/postprocessing/subset_json_detector_output.py +265 -262
  92. megadetector/postprocessing/top_folders_to_bottom.py +45 -45
  93. megadetector/postprocessing/validate_batch_results.py +70 -70
  94. megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +52 -52
  95. megadetector/taxonomy_mapping/map_new_lila_datasets.py +18 -19
  96. megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +54 -33
  97. megadetector/taxonomy_mapping/preview_lila_taxonomy.py +67 -67
  98. megadetector/taxonomy_mapping/retrieve_sample_image.py +16 -16
  99. megadetector/taxonomy_mapping/simple_image_download.py +8 -8
  100. megadetector/taxonomy_mapping/species_lookup.py +156 -74
  101. megadetector/taxonomy_mapping/taxonomy_csv_checker.py +14 -14
  102. megadetector/taxonomy_mapping/taxonomy_graph.py +10 -10
  103. megadetector/taxonomy_mapping/validate_lila_category_mappings.py +13 -13
  104. megadetector/utils/ct_utils.py +1049 -211
  105. megadetector/utils/directory_listing.py +21 -77
  106. megadetector/utils/gpu_test.py +22 -22
  107. megadetector/utils/md_tests.py +632 -529
  108. megadetector/utils/path_utils.py +1520 -431
  109. megadetector/utils/process_utils.py +41 -41
  110. megadetector/utils/split_locations_into_train_val.py +62 -62
  111. megadetector/utils/string_utils.py +148 -27
  112. megadetector/utils/url_utils.py +489 -176
  113. megadetector/utils/wi_utils.py +2658 -2526
  114. megadetector/utils/write_html_image_list.py +137 -137
  115. megadetector/visualization/plot_utils.py +34 -30
  116. megadetector/visualization/render_images_with_thumbnails.py +39 -74
  117. megadetector/visualization/visualization_utils.py +487 -435
  118. megadetector/visualization/visualize_db.py +232 -198
  119. megadetector/visualization/visualize_detector_output.py +82 -76
  120. {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/METADATA +5 -2
  121. megadetector-10.0.0.dist-info/RECORD +139 -0
  122. {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/WHEEL +1 -1
  123. megadetector/api/batch_processing/api_core/__init__.py +0 -0
  124. megadetector/api/batch_processing/api_core/batch_service/__init__.py +0 -0
  125. megadetector/api/batch_processing/api_core/batch_service/score.py +0 -439
  126. megadetector/api/batch_processing/api_core/server.py +0 -294
  127. megadetector/api/batch_processing/api_core/server_api_config.py +0 -97
  128. megadetector/api/batch_processing/api_core/server_app_config.py +0 -55
  129. megadetector/api/batch_processing/api_core/server_batch_job_manager.py +0 -220
  130. megadetector/api/batch_processing/api_core/server_job_status_table.py +0 -149
  131. megadetector/api/batch_processing/api_core/server_orchestration.py +0 -360
  132. megadetector/api/batch_processing/api_core/server_utils.py +0 -88
  133. megadetector/api/batch_processing/api_core_support/__init__.py +0 -0
  134. megadetector/api/batch_processing/api_core_support/aggregate_results_manually.py +0 -46
  135. megadetector/api/batch_processing/api_support/__init__.py +0 -0
  136. megadetector/api/batch_processing/api_support/summarize_daily_activity.py +0 -152
  137. megadetector/api/batch_processing/data_preparation/__init__.py +0 -0
  138. megadetector/api/synchronous/__init__.py +0 -0
  139. megadetector/api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
  140. megadetector/api/synchronous/api_core/animal_detection_api/api_backend.py +0 -151
  141. megadetector/api/synchronous/api_core/animal_detection_api/api_frontend.py +0 -263
  142. megadetector/api/synchronous/api_core/animal_detection_api/config.py +0 -35
  143. megadetector/api/synchronous/api_core/tests/__init__.py +0 -0
  144. megadetector/api/synchronous/api_core/tests/load_test.py +0 -110
  145. megadetector/data_management/importers/add_nacti_sizes.py +0 -52
  146. megadetector/data_management/importers/add_timestamps_to_icct.py +0 -79
  147. megadetector/data_management/importers/animl_results_to_md_results.py +0 -158
  148. megadetector/data_management/importers/auckland_doc_test_to_json.py +0 -373
  149. megadetector/data_management/importers/auckland_doc_to_json.py +0 -201
  150. megadetector/data_management/importers/awc_to_json.py +0 -191
  151. megadetector/data_management/importers/bellevue_to_json.py +0 -272
  152. megadetector/data_management/importers/cacophony-thermal-importer.py +0 -793
  153. megadetector/data_management/importers/carrizo_shrubfree_2018.py +0 -269
  154. megadetector/data_management/importers/carrizo_trail_cam_2017.py +0 -289
  155. megadetector/data_management/importers/cct_field_adjustments.py +0 -58
  156. megadetector/data_management/importers/channel_islands_to_cct.py +0 -913
  157. megadetector/data_management/importers/eMammal/copy_and_unzip_emammal.py +0 -180
  158. megadetector/data_management/importers/eMammal/eMammal_helpers.py +0 -249
  159. megadetector/data_management/importers/eMammal/make_eMammal_json.py +0 -223
  160. megadetector/data_management/importers/ena24_to_json.py +0 -276
  161. megadetector/data_management/importers/filenames_to_json.py +0 -386
  162. megadetector/data_management/importers/helena_to_cct.py +0 -283
  163. megadetector/data_management/importers/idaho-camera-traps.py +0 -1407
  164. megadetector/data_management/importers/idfg_iwildcam_lila_prep.py +0 -294
  165. megadetector/data_management/importers/import_desert_lion_conservation_camera_traps.py +0 -387
  166. megadetector/data_management/importers/jb_csv_to_json.py +0 -150
  167. megadetector/data_management/importers/mcgill_to_json.py +0 -250
  168. megadetector/data_management/importers/missouri_to_json.py +0 -490
  169. megadetector/data_management/importers/nacti_fieldname_adjustments.py +0 -79
  170. megadetector/data_management/importers/noaa_seals_2019.py +0 -181
  171. megadetector/data_management/importers/osu-small-animals-to-json.py +0 -364
  172. megadetector/data_management/importers/pc_to_json.py +0 -365
  173. megadetector/data_management/importers/plot_wni_giraffes.py +0 -123
  174. megadetector/data_management/importers/prepare_zsl_imerit.py +0 -131
  175. megadetector/data_management/importers/raic_csv_to_md_results.py +0 -416
  176. megadetector/data_management/importers/rspb_to_json.py +0 -356
  177. megadetector/data_management/importers/save_the_elephants_survey_A.py +0 -320
  178. megadetector/data_management/importers/save_the_elephants_survey_B.py +0 -329
  179. megadetector/data_management/importers/snapshot_safari_importer.py +0 -758
  180. megadetector/data_management/importers/snapshot_serengeti_lila.py +0 -1067
  181. megadetector/data_management/importers/snapshotserengeti/make_full_SS_json.py +0 -150
  182. megadetector/data_management/importers/snapshotserengeti/make_per_season_SS_json.py +0 -153
  183. megadetector/data_management/importers/sulross_get_exif.py +0 -65
  184. megadetector/data_management/importers/timelapse_csv_set_to_json.py +0 -490
  185. megadetector/data_management/importers/ubc_to_json.py +0 -399
  186. megadetector/data_management/importers/umn_to_json.py +0 -507
  187. megadetector/data_management/importers/wellington_to_json.py +0 -263
  188. megadetector/data_management/importers/wi_to_json.py +0 -442
  189. megadetector/data_management/importers/zamba_results_to_md_results.py +0 -180
  190. megadetector/data_management/lila/add_locations_to_island_camera_traps.py +0 -101
  191. megadetector/data_management/lila/add_locations_to_nacti.py +0 -151
  192. megadetector/utils/azure_utils.py +0 -178
  193. megadetector/utils/sas_blob_utils.py +0 -509
  194. megadetector-5.0.28.dist-info/RECORD +0 -209
  195. /megadetector/{api/batch_processing/__init__.py → __init__.py} +0 -0
  196. {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/licenses/LICENSE +0 -0
  197. {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/top_level.txt +0 -0
megadetector/utils/path_utils.py

@@ -34,6 +34,7 @@ from shutil import which
 from tqdm import tqdm
 
 from megadetector.utils.ct_utils import is_iterable
+from megadetector.utils.ct_utils import make_test_folder
 from megadetector.utils.ct_utils import sort_dictionary_by_value
 
 # Should all be lower-case
@@ -47,14 +48,14 @@ CHAR_LIMIT = 255
 
 #%% General path functions
 
-def recursive_file_list(base_dir,
-                        convert_slashes=True,
-                        return_relative_paths=False,
+def recursive_file_list(base_dir,
+                        convert_slashes=True,
+                        return_relative_paths=False,
                         sort_files=True,
                         recursive=True):
     r"""
     Enumerates files (not directories) in [base_dir].
-
+
     Args:
         base_dir (str): folder to enumerate
         convert_slashes (bool, optional): force forward slashes; if this is False, will use
@@ -64,15 +65,15 @@ def recursive_file_list(base_dir,
         sort_files (bool, optional): force files to be sorted, otherwise uses the sorting
             provided by os.walk()
         recursive (bool, optional): enumerate recursively
-
+
     Returns:
         list: list of filenames
     """
-
+
     assert os.path.isdir(base_dir), '{} is not a folder'.format(base_dir)
-
+
     all_files = []
-
+
     if recursive:
         for root, _, filenames in os.walk(base_dir):
             for filename in filenames:
@@ -82,29 +83,29 @@
         all_files_relative = os.listdir(base_dir)
         all_files = [os.path.join(base_dir,fn) for fn in all_files_relative]
         all_files = [fn for fn in all_files if os.path.isfile(fn)]
-
+
     if return_relative_paths:
         all_files = [os.path.relpath(fn,base_dir) for fn in all_files]
 
     if convert_slashes:
         all_files = [fn.replace('\\', '/') for fn in all_files]
-
+
     if sort_files:
         all_files = sorted(all_files)
-
+
     return all_files
 
 
-def file_list(base_dir,
+def file_list(base_dir,
               convert_slashes=True,
-              return_relative_paths=False,
-              sort_files=True,
+              return_relative_paths=False,
+              sort_files=True,
               recursive=False):
     """
-    Trivial wrapper for recursive_file_list, which was a poor function name choice
-    at the time, since I later wanted to add non-recursive lists, but it doesn't
+    Trivial wrapper for recursive_file_list, which was a poor function name choice
+    at the time, since I later wanted to add non-recursive lists, but it doesn't
     make sense to have a "recursive" option in a function called "recursive_file_list".
-
+
     Args:
         base_dir (str): folder to enumerate
         convert_slashes (bool, optional): force forward slashes; if this is False, will use
@@ -114,11 +115,11 @@ def file_list(base_dir,
         sort_files (bool, optional): force files to be sorted, otherwise uses the sorting
             provided by os.walk()
         recursive (bool, optional): enumerate recursively
-
+
     Returns:
-        list: list of filenames
+        list: list of filenames
     """
-
+
     return recursive_file_list(base_dir,convert_slashes,return_relative_paths,sort_files,
                                recursive=recursive)
 
@@ -128,94 +129,93 @@ def folder_list(base_dir,
                 return_relative_paths=False,
                 sort_folders=True,
                 recursive=False):
-
     """
     Enumerates folders (not files) in [base_dir].
-
+
     Args:
         base_dir (str): folder to enumerate
         convert_slashes (bool, optional): force forward slashes; if this is False, will use
             the native path separator
         return_relative_paths (bool, optional): return paths that are relative to [base_dir],
             rather than absolute paths
-        sort_files (bool, optional): force folders to be sorted, otherwise uses the sorting
+        sort_folders (bool, optional): force folders to be sorted, otherwise uses the sorting
             provided by os.walk()
         recursive (bool, optional): enumerate recursively
-
+
     Returns:
         list: list of folder names
     """
-
+
     assert os.path.isdir(base_dir), '{} is not a folder'.format(base_dir)
-
+
     folders = []
 
-    if recursive:
+    if recursive:
         folders = []
         for root, dirs, _ in os.walk(base_dir):
             for d in dirs:
-                folders.append(os.path.join(root, d))
+                folders.append(os.path.join(root, d))
     else:
         folders = os.listdir(base_dir)
         folders = [os.path.join(base_dir,fn) for fn in folders]
         folders = [fn for fn in folders if os.path.isdir(fn)]
-
+
     if return_relative_paths:
         folders = [os.path.relpath(fn,base_dir) for fn in folders]
 
     if convert_slashes:
         folders = [fn.replace('\\', '/') for fn in folders]
-
+
     if sort_folders:
-        folders = sorted(folders)
-
+        folders = sorted(folders)
+
     return folders
 
 
 def folder_summary(folder,print_summary=True):
     """
     Returns (and optionally prints) a summary of [folder], including:
-
+
     * The total number of files
     * The total number of folders
-    * The number of files for each extension
-
+    * The number of files for each extension
+
     Args:
         folder (str): folder to summarize
         print_summary (bool, optional): whether to print the summary
-
+
     Returns:
         dict: with fields "n_files", "n_folders", and "extension_to_count"
     """
-
+
     assert os.path.isdir(folder), '{} is not a folder'.format(folder)
-
+
     folders_relative = folder_list(folder,return_relative_paths=True,recursive=True)
     files_relative = file_list(folder,return_relative_paths=True,recursive=True)
-
+
     extension_to_count = defaultdict(int)
-
+
     for fn in files_relative:
         ext = os.path.splitext(fn)[1]
         extension_to_count[ext] += 1
-
+
     extension_to_count = sort_dictionary_by_value(extension_to_count,reverse=True)
-
+
     if print_summary:
         for extension in extension_to_count.keys():
             print('{}: {}'.format(extension,extension_to_count[extension]))
         print('')
         print('Total files: {}'.format(len(files_relative)))
         print('Total folders: {}'.format(len(folders_relative)))
-
+
     to_return = {}
     to_return['n_files'] = len(files_relative)
     to_return['n_folders'] = len(folders_relative)
-    to_return['extension_to_count'] = extension_to_count
-
+    to_return['extension_to_count'] = extension_to_count
+
     return to_return
-
-
+
+
 def fileparts(path):
     r"""
     Breaks down a path into the directory path, filename, and extension.
@@ -223,25 +223,25 @@ def fileparts(path):
     Note that the '.' lives with the extension, and separators are removed.
 
     Examples:
-
+
     .. code-block:: none
 
-        >>> fileparts('file')
+        >>> fileparts('file')
         ('', 'file', '')
         >>> fileparts(r'c:/dir/file.jpg')
         ('c:/dir', 'file', '.jpg')
         >>> fileparts('/dir/subdir/file.jpg')
-        ('/dir/subdir', 'file', '.jpg')
+        ('/dir/subdir', 'file', '.jpg')
 
     Args:
         path (str): path name to separate into parts
     Returns:
-        tuple: tuple containing (p,n,e):
+        tuple: tuple containing (p,n,e):
         - p: str, directory path
         - n: str, filename without extension
         - e: str, extension including the '.'
     """
-
+
     # ntpath seems to do the right thing for both Windows and Unix paths
     p = ntpath.dirname(path)
     basename = ntpath.basename(path)
@@ -257,27 +257,27 @@ def insert_before_extension(filename, s=None, separator='.'):
     appends [s].
 
     Examples:
-
+
     .. code-block:: none
-
+
         >>> insert_before_extension('/dir/subdir/file.ext', 'insert')
         '/dir/subdir/file.insert.ext'
         >>> insert_before_extension('/dir/subdir/file', 'insert')
         '/dir/subdir/file.insert'
         >>> insert_before_extension('/dir/subdir/file')
         '/dir/subdir/file.2020.07.20.10.54.38'
-
+
     Args:
         filename (str): filename to manipulate
         s (str, optional): string to insert before the extension in [filename], or
             None to insert a datestamp
         separator (str, optional): separator to place between the filename base
            and the inserted string
-
+
     Returns:
         str: modified string
     """
-
+
     assert len(filename) > 0
     if s is None or len(s) == 0:
         s = datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
@@ -290,9 +290,9 @@ def split_path(path):
     Splits [path] into all its constituent file/folder tokens.
 
     Examples:
-
+
     .. code-block:: none
-
+
         >>> split_path(r'c:\dir\subdir\file.txt')
         ['c:\\', 'dir', 'subdir', 'file.txt']
         >>> split_path('/dir/subdir/file.jpg')
@@ -301,14 +301,20 @@ def split_path(path):
         ['c:\\']
         >>> split_path('/')
         ['/']
-
+
     Args:
         path (str): path to split into tokens
-
+
     Returns:
         list: list of path tokens
     """
-
+
+    # Edge cases
+    if path == '':
+        return ''
+    if path is None:
+        return None
+
     parts = []
     while True:
         # ntpath seems to do the right thing for both Windows and Unix paths
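
The edge-case handling added here is worth a note: empty and None inputs now return early instead of reaching the tokenizing loop. A minimal sketch of the resulting behavior (assuming megadetector 10.0.0 is installed):

```python
from megadetector.utils.path_utils import split_path

# Documented case, per the docstring above
assert split_path(r'c:\dir\subdir\file.txt') == ['c:\\', 'dir', 'subdir', 'file.txt']

# New in 10.0.0: early returns for degenerate inputs; note that '' comes
# back as a string and None as None, not as empty lists
assert split_path('') == ''
assert split_path(None) is None
```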
@@ -325,68 +331,77 @@ def path_is_abs(p):
     """
     Determines whether [p] is an absolute path. An absolute path is defined as
     one that starts with slash, backslash, or a letter followed by a colon.
-
+
     Args:
         p (str): path to evaluate
-
+
     Returns:
         bool: True if [p] is an absolute path, else False
     """
-
+
     return (len(p) > 1) and (p[0] == '/' or p[1] == ':' or p[0] == '\\')
 
 
 def safe_create_link(link_exists,link_new):
     """
     Creates a symlink at [link_new] pointing to [link_exists].
-
+
     If [link_new] already exists, make sure it's a link (not a file),
     and if it has a different target than [link_exists], removes and re-creates
     it.
-
+
+    Creates a *real* directory if necessary.
+
     Errors if [link_new] already exists but it's not a link.
-
+
     Args:
         link_exists (str): the source of the (possibly-new) symlink
         link_new (str): the target of the (possibly-new) symlink
     """
-
+
+    # If the new file already exists...
     if os.path.exists(link_new) or os.path.islink(link_new):
+        # Error if it's not already a link
         assert os.path.islink(link_new)
+        # If it's already a link, and it points to the "exists" file,
+        # leave it alone, otherwise redirect it.
         if not os.readlink(link_new) == link_exists:
             os.remove(link_new)
             os.symlink(link_exists,link_new)
     else:
+        os.makedirs(os.path.dirname(link_new),exist_ok=True)
         os.symlink(link_exists,link_new)
-
+
+# ...def safe_create_link(...)
+
 
 def remove_empty_folders(path, remove_root=False):
     """
     Recursively removes empty folders within the specified path.
-
+
     Args:
-        path (str): the folder from which we should recursively remove
+        path (str): the folder from which we should recursively remove
             empty folders.
-        remove_root (bool, optional): whether to remove the root directory if
+        remove_root (bool, optional): whether to remove the root directory if
             it's empty after removing all empty subdirectories. This will always
             be True during recursive calls.
-
+
     Returns:
         bool: True if the directory is empty after processing, False otherwise
     """
-
+
     # Verify that [path] is a directory
     if not os.path.isdir(path):
         return False
-
+
     # Track whether the current directory is empty
     is_empty = True
-
+
     # Iterate through all items in the directory
     for item in os.listdir(path):
-
+
         item_path = os.path.join(path, item)
-
+
         # If it's a directory, process it recursively
         if os.path.isdir(item_path):
             # If the subdirectory is empty after processing, it will be removed
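
One behavioral change in this hunk is easy to miss: when the link does not already exist, safe_create_link now creates the parent folder of [link_new] before calling os.symlink. A usage sketch (paths are hypothetical):

```python
import os
from megadetector.utils.path_utils import safe_create_link

source = os.path.expanduser('~/data/results.json')         # existing file
link = os.path.expanduser('~/links/by-date/results.json')  # link to create

# In 5.0.x this failed if ~/links/by-date didn't exist; in 10.0.0 the parent
# folder is created first via os.makedirs(..., exist_ok=True).
safe_create_link(source, link)
```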
@@ -396,76 +411,32 @@ def remove_empty_folders(path, remove_root=False):
         else:
             # If there's a file, the directory is not empty
             is_empty = False
-
+
     # If the directory is empty and we're supposed to remove it
     if is_empty and remove_root:
         try:
-            os.rmdir(path)
+            os.rmdir(path)
         except Exception as e:
             print('Error removing directory {}: {}'.format(path,str(e)))
             is_empty = False
-
+
     return is_empty
 
 # ...def remove_empty_folders(...)
 
 
-def top_level_folder(p):
-    r"""
-    Gets the top-level folder from the path *p*.
-
-    On UNIX, this is straightforward:
-
-    /blah/foo
-
-    ...returns '/blah'
-
-    On Windows, we define this as the top-level folder that isn't the drive, so:
-
-    c:\blah\foo
-
-    ...returns 'c:\blah'.
-
-    Args:
-        p (str): filename to evaluate
-
-    Returns:
-        str: the top-level folder in [p], see above for details on how this is defined
-    """
-
-    if p == '':
-        return ''
-
-    # Path('/blah').parts is ('/','blah')
-    parts = split_path(p)
-
-    if len(parts) == 1:
-        return parts[0]
-
-    # Handle paths like:
-    #
-    # /, \, /stuff, c:, c:\stuff
-    drive = os.path.splitdrive(p)[0]
-    if parts[0] == drive or parts[0] == drive + '/' or parts[0] == drive + '\\' or parts[0] in ['\\', '/']:
-        return os.path.join(parts[0], parts[1])
-    else:
-        return parts[0]
-
-# ...top_level_folder()
-
-
 def path_join(*paths, convert_slashes=True):
     r"""
     Wrapper for os.path.join that optionally converts backslashes to forward slashes.
-
+
     Args:
         *paths (variable-length set of strings): Path components to be joined.
         convert_slashes (bool, optional): whether to convert \\ to /
-
+
     Returns:
         A string with the joined path components.
     """
-
+
     joined_path = os.path.join(*paths)
     if convert_slashes:
         return joined_path.replace('\\', '/')
@@ -473,41 +444,24 @@ def path_join(*paths, convert_slashes=True):
     return joined_path
 
 
-#%% Test driver for top_level_folder
-
-if False:
-
-    #%%
-
-    p = 'blah/foo/bar'; s = top_level_folder(p); print(s); assert s == 'blah'
-    p = '/blah/foo/bar'; s = top_level_folder(p); print(s); assert s == '/blah'
-    p = 'bar'; s = top_level_folder(p); print(s); assert s == 'bar'
-    p = ''; s = top_level_folder(p); print(s); assert s == ''
-    p = 'c:\\'; s = top_level_folder(p); print(s); assert s == 'c:\\'
-    p = r'c:\blah'; s = top_level_folder(p); print(s); assert s == 'c:\\blah'
-    p = r'c:\foo'; s = top_level_folder(p); print(s); assert s == 'c:\\foo'
-    p = r'c:/foo'; s = top_level_folder(p); print(s); assert s == 'c:/foo'
-    p = r'c:\foo/bar'; s = top_level_folder(p); print(s); assert s == 'c:\\foo'
-
-
 #%% Image-related path functions
 
 def is_image_file(s, img_extensions=IMG_EXTENSIONS):
     """
     Checks a file's extension against a hard-coded set of image file
     extensions. Uses case-insensitive comparison.
-
+
     Does not check whether the file exists, only determines whether the filename
     implies it's an image file.
-
+
     Args:
         s (str): filename to evaluate for image-ness
         img_extensions (list, optional): list of known image file extensions
-
+
     Returns:
         bool: True if [s] appears to be an image file, else False
     """
-
+
     ext = os.path.splitext(s)[1]
     return ext.lower() in img_extensions
 
@@ -516,27 +470,27 @@ def find_image_strings(strings):
     """
     Given a list of strings that are potentially image file names, looks for
     strings that actually look like image file names (based on extension).
-
+
     Args:
         strings (list): list of filenames to check for image-ness
-
+
     Returns:
         list: the subset of [strings] that appear to be image filenames
     """
-
+
     return [s for s in strings if is_image_file(s)]
 
 
-def find_images(dirname,
-                recursive=False,
-                return_relative_paths=False,
+def find_images(dirname,
+                recursive=False,
+                return_relative_paths=False,
                 convert_slashes=True):
     """
     Finds all files in a directory that look like image file names. Returns
     absolute paths unless return_relative_paths is set. Uses the OS-native
     path separator unless convert_slashes is set, in which case will always
     use '/'.
-
+
     Args:
         dirname (str): the folder to search for images
         recursive (bool, optional): whether to search recursively
@@ -547,30 +501,30 @@ def find_images(dirname,
     Returns:
         list: list of image filenames found in [dirname]
     """
-
+
     assert os.path.isdir(dirname), '{} is not a folder'.format(dirname)
-
+
     if recursive:
         strings = glob.glob(os.path.join(dirname, '**', '*.*'), recursive=True)
     else:
         strings = glob.glob(os.path.join(dirname, '*.*'))
-
+
     image_files = find_image_strings(strings)
-
+
     if return_relative_paths:
         image_files = [os.path.relpath(fn,dirname) for fn in image_files]
-
+
     image_files = sorted(image_files)
-
+
     if convert_slashes:
         image_files = [fn.replace('\\', '/') for fn in image_files]
-
+
     return image_files
 
 
 #%% Filename cleaning functions
 
-def clean_filename(filename,
+def clean_filename(filename,
                    allow_list=VALID_FILENAME_CHARS,
                    char_limit=CHAR_LIMIT,
                    force_lower= False):
@@ -582,18 +536,18 @@ def clean_filename(filename,
 
     Adapted from
     https://gist.github.com/wassname/1393c4a57cfcbf03641dbc31886123b8
-
+
     Args:
         filename (str): filename to clean
         allow_list (str, optional): string containing all allowable filename characters
         char_limit (int, optional): maximum allowable filename length, if None will skip this
             step
         force_lower (bool, optional): convert the resulting filename to lowercase
-
-    returns:
-        str: cleaned version of [filename]
+
+    Returns:
+        str: cleaned version of [filename]
     """
-
+
     # keep only valid ascii chars
     cleaned_filename = (unicodedata.normalize('NFKD', filename)
                         .encode('ASCII', 'ignore').decode())
@@ -607,26 +561,26 @@
     return cleaned_filename
 
 
-def clean_path(pathname,
+def clean_path(pathname,
                allow_list=VALID_PATH_CHARS,
                char_limit=CHAR_LIMIT,
                force_lower=False):
     """
     Removes non-ASCII and other invalid path characters (on any reasonable
     OS) from a path, then optionally trims to a maximum length.
-
+
     Args:
         pathname (str): path name to clean
         allow_list (str, optional): string containing all allowable filename characters
         char_limit (int, optional): maximum allowable filename length, if None will skip this
             step
         force_lower (bool, optional): convert the resulting filename to lowercase
-
-    returns:
-        str: cleaned version of [filename]
+
+    Returns:
+        str: cleaned version of [filename]
     """
-
-    return clean_filename(pathname, allow_list=allow_list,
+
+    return clean_filename(pathname, allow_list=allow_list,
                           char_limit=char_limit, force_lower=force_lower)
 
 
@@ -635,34 +589,34 @@ def flatten_path(pathname,separator_chars=SEPARATOR_CHARS,separator_char_replace
     Removes non-ASCII and other invalid path characters (on any reasonable
     OS) from a path, then trims to a maximum length. Replaces all valid
     separators with [separator_char_replacement.]
-
+
     Args:
         pathname (str): path name to flatten
         separator_chars (str, optional): string containing all known path separators
-        separator_char_replacement (str, optional): string to insert in place of
+        separator_char_replacement (str, optional): string to insert in place of
             path separators.
-
+
     Returns:
         str: flattened version of [pathname]
     """
-
+
     s = clean_path(pathname)
     for c in separator_chars:
         s = s.replace(c, separator_char_replacement)
     return s
 
 
-def is_executable(filename):
+def is_executable(filename):
     """
     Checks whether [filename] is on the system path and marked as executable.
-
+
     Args:
         filename (str): filename to check for executable status
-
+
     Returns:
         bool: True if [filename] is on the system path and marked as executable, otherwise False
     """
-
+
     # https://stackoverflow.com/questions/11210104/check-if-a-program-exists-from-a-python-script
 
     return which(filename) is not None
@@ -673,247 +627,249 @@ def is_executable(filename):
 def environment_is_wsl():
     """
     Determines whether we're running in WSL.
-
+
     Returns:
-        True if we're running in WSL.
+        True if we're running in WSL.
     """
-
+
     if sys.platform not in ('linux','posix'):
         return False
     platform_string = ' '.join(platform.uname()).lower()
     return 'microsoft' in platform_string and 'wsl' in platform_string
-
+
 
 def wsl_path_to_windows_path(filename, failure_behavior='none'):
     r"""
     Converts a WSL path to a Windows path. For example, converts:
-
+
     /mnt/e/a/b/c
-
+
     ...to:
-
+
     e:\a\b\c
-
+
     Args:
         filename (str): filename to convert
-        failure_behavior (str): what to do if the path can't be processed as a WSL path.
-            'none' to return None in this case, 'original' to return the original path.
-
+        failure_behavior (str, optional): what to do if the path can't be processed as a
+            WSL path. 'none' to return None in this case, 'original' to return the original path.
+
     Returns:
         str: Windows equivalent to the WSL path [filename]
     """
-
+
     assert failure_behavior in ('none','original'), \
         'Unrecognized failure_behavior value {}'.format(failure_behavior)
-
+
     # Check whether the path follows the standard WSL mount pattern
     wsl_path_pattern = r'^/mnt/([a-zA-Z])(/.*)?$'
     match = re.match(wsl_path_pattern, filename)
-
+
     if match:
 
         # Extract the drive letter and the rest of the path
         drive_letter = match.group(1)
         path_remainder = match.group(2) if match.group(2) else ''
-
+
         # Convert forward slashes to backslashes for Windows
         path_remainder = path_remainder.replace('/', '\\')
-
+
         # Format the Windows path
         windows_path = f"{drive_letter}:{path_remainder}"
         return windows_path
-
+
     if failure_behavior == 'none':
         return None
     else:
         return filename
 
 # ...def wsl_path_to_windows_path(...)
-
-
+
+
 def windows_path_to_wsl_path(filename, failure_behavior='none'):
     r"""
     Converts a Windows path to a WSL path, or returns None if that's not possible. E.g.
     converts:
-
+
     e:\a\b\c
-
+
     ...to:
-
+
     /mnt/e/a/b/c
-
+
     Args:
         filename (str): filename to convert
-        failure_behavior (str): what to do if the path can't be processed as a Windows path.
+        failure_behavior (str, optional): what to do if the path can't be processed as a Windows path.
             'none' to return None in this case, 'original' to return the original path.
-
+
     Returns:
         str: WSL equivalent to the Windows path [filename]
     """
-
+
     assert failure_behavior in ('none','original'), \
         'Unrecognized failure_behavior value {}'.format(failure_behavior)
-
+
     filename = filename.replace('\\', '/')
-
+
     # Check whether the path follows a Windows drive letter pattern
     windows_path_pattern = r'^([a-zA-Z]):(/.*)?$'
     match = re.match(windows_path_pattern, filename)
-
+
     if match:
         # Extract the drive letter and the rest of the path
         drive_letter = match.group(1).lower() # Convert to lowercase for WSL
         path_remainder = match.group(2) if match.group(2) else ''
-
+
         # Format the WSL path
         wsl_path = f"/mnt/{drive_letter}{path_remainder}"
         return wsl_path
-
+
     if failure_behavior == 'none':
         return None
     else:
         return filename
-
+
 # ...def window_path_to_wsl_path(...)
 
 
 def open_file_in_chrome(filename):
     """
-    Open a file in chrome, regardless of file type. I typically use this to open
+    Open a file in chrome, regardless of file type. I typically use this to open
     .md files in Chrome.
-
+
     Args:
         filename (str): file to open
-
+
     Return:
         bool: whether the operation was successful
     """
-
+
     # Create URL
     abs_path = os.path.abspath(filename)
-
+
     system = platform.system()
     if system == 'Windows':
         url = f'file:///{abs_path.replace(os.sep, "/")}'
     else: # macOS and Linux
         url = f'file://{abs_path}'
-
+
     # Determine the Chrome path
     if system == 'Windows':
-
+
         # This is a native Python module, but it only exists on Windows
         import winreg
-
+
         chrome_paths = [
             os.path.expanduser("~") + r"\AppData\Local\Google\Chrome\Application\chrome.exe",
             r"C:\Program Files\Google\Chrome\Application\chrome.exe",
             r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe"
         ]
-
+
         # Default approach: run from a typical chrome location
         for path in chrome_paths:
             if os.path.exists(path):
                 subprocess.run([path, url])
                 return True
-
+
         # Method 2: Check registry for Chrome path
         try:
-            with winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE,
+            with winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE,
                 r"SOFTWARE\Microsoft\Windows\CurrentVersion\App Paths\chrome.exe") as key:
                 chrome_path = winreg.QueryValue(key, None)
                 if chrome_path and os.path.exists(chrome_path):
                     subprocess.run([chrome_path, url])
                     return True
-        except:
+        except Exception:
             pass
-
+
         # Method 3: Try alternate registry location
         try:
-            with winreg.OpenKey(winreg.HKEY_CURRENT_USER,
+            with winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                 r"Software\Google\Chrome\BLBeacon") as key:
                 chrome_path = os.path.join(os.path.dirname(winreg.QueryValueEx(key, "version")[0]), "chrome.exe")
                 if os.path.exists(chrome_path):
                     subprocess.run([chrome_path, url])
                     return True
-        except:
+        except Exception:
             pass
-
+
         # Method 4: Try system path or command
         for chrome_cmd in ["chrome", "chrome.exe", "googlechrome", "google-chrome"]:
             try:
                 subprocess.run([chrome_cmd, url], shell=True)
                 return True
-            except:
+            except Exception:
                 continue
-
+
         # Method 5: Use Windows URL protocol handler
         try:
             os.startfile(url)
             return True
-        except:
+        except Exception:
             pass
-
-        # Method 6: Use rundll32
+
+        # Method 6: Use rundll32
         try:
             cmd = f'rundll32 url.dll,FileProtocolHandler {url}'
             subprocess.run(cmd, shell=True)
             return True
-        except:
+        except Exception:
             pass
-
+
     elif system == 'Darwin':
-
+
         chrome_paths = [
             '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
             os.path.expanduser('~/Applications/Google Chrome.app/Contents/MacOS/Google Chrome')
         ]
-
+
         for path in chrome_paths:
             if os.path.exists(path):
                 subprocess.run([path, url])
                 return True
-
+
         # Fallback to 'open' command with Chrome as the app
         try:
             subprocess.run(['open', '-a', 'Google Chrome', url])
             return True
-        except:
+        except Exception:
             pass
-
+
     elif system == 'Linux':
-
+
         chrome_commands = ['google-chrome', 'chrome', 'chromium', 'chromium-browser']
-
+
         for cmd in chrome_commands:
             try:
                 subprocess.run([cmd, url], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
                 return True
-            except:
+            except Exception:
                 continue
-
+
     print(f"Could not open {filename} in Chrome on {system}.")
     return False
 
-
-def open_file(filename, attempt_to_open_in_wsl_host=False, browser_name=None):
+
+def open_file(filename,
+              attempt_to_open_in_wsl_host=False,
+              browser_name=None):
     """
     Opens [filename] in the default OS file handler for this file type.
-
+
     If browser_name is not None, uses the webbrowser module to open the filename
     in the specified browser; see https://docs.python.org/3/library/webbrowser.html
     for supported browsers. Falls back to the default file handler if webbrowser.open()
     fails. In this case, attempt_to_open_in_wsl_host is ignored unless webbrowser.open() fails.
-
-    If browser_name is 'default', uses the system default. This is different from the
+
+    If browser_name is 'default', uses the system default. This is different from the
     parameter to webbrowser.get(), where None implies the system default.
-
+
     Args:
         filename (str): file to open
-        attempt_to_open_in_wsl_host: if this is True, and we're in WSL, attempts to open
-            [filename] in the Windows host environment
-        browser_name: see above
+        attempt_to_open_in_wsl_host (bool, optional): if this is True, and we're in WSL, attempts
+            to open [filename] in the Windows host environment
+        browser_name (str, optional): see above
     """
-
+
     if browser_name is not None:
         if browser_name == 'chrome':
             browser_name = 'google-chrome'
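
The docstring edits above are cosmetic, but the conversion behavior follows directly from the two regexes shown; a quick sketch:

```python
from megadetector.utils.path_utils import (
    windows_path_to_wsl_path, wsl_path_to_windows_path)

assert wsl_path_to_windows_path('/mnt/e/a/b/c') == r'e:\a\b\c'
assert windows_path_to_wsl_path(r'e:\a\b\c') == '/mnt/e/a/b/c'

# Paths that don't match the /mnt/<drive> pattern follow failure_behavior
assert wsl_path_to_windows_path('/home/user/x') is None
assert wsl_path_to_windows_path('/home/user/x',
                                failure_behavior='original') == '/home/user/x'
```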
@@ -925,32 +881,32 @@ def open_file(filename, attempt_to_open_in_wsl_host=False, browser_name=None):
             result = False
         if result:
             return
-
+
     if sys.platform == 'win32':
-
+
         os.startfile(filename)
 
     elif sys.platform == 'darwin':
-
+
         opener = 'open'
         subprocess.call([opener, filename])
-
+
     elif attempt_to_open_in_wsl_host and environment_is_wsl():
-
+
         windows_path = wsl_path_to_windows_path(filename)
-
+
         # Fall back to xdg-open
         if windows_path is None:
             subprocess.call(['xdg-open', filename])
-
-        if os.path.isdir(filename):
+
+        if os.path.isdir(filename):
             subprocess.run(["explorer.exe", windows_path])
         else:
-            os.system("cmd.exe /C start %s" % (re.escape(windows_path)))
-
+            os.system("cmd.exe /C start {}".format(re.escape(windows_path)))
+
     else:
-
-        opener = 'xdg-open'
+
+        opener = 'xdg-open'
         subprocess.call([opener, filename])
 
 # ...def open_file(...)
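
A usage sketch for open_file with the newly documented keyword types (the filename is hypothetical):

```python
from megadetector.utils.path_utils import open_file

# Open an HTML report in Chrome; per the docstring, the WSL-host fallback
# only kicks in if webbrowser.open() fails when browser_name is set.
open_file('index.html', browser_name='chrome',
          attempt_to_open_in_wsl_host=True)
```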
@@ -962,12 +918,12 @@ def write_list_to_file(output_file,strings):
     """
     Writes a list of strings to either a JSON file or text file,
     depending on extension of the given file name.
-
+
     Args:
         output_file (str): file to write
         strings (list): list of strings to write to [output_file]
     """
-
+
     with open(output_file, 'w') as f:
         if output_file.endswith('.json'):
             json.dump(strings, f, indent=1)
@@ -978,14 +934,14 @@
 def read_list_from_file(filename):
     """
     Reads a json-formatted list of strings from a file.
-
+
     Args:
         filename (str): .json filename to read
-
+
     Returns:
         list: list of strings read from [filename]
     """
-
+
     assert filename.endswith('.json')
     with open(filename, 'r') as f:
         file_list = json.load(f)
@@ -1001,39 +957,39 @@ def _copy_file(input_output_tuple,overwrite=True,verbose=False,move=False):
     """
     Internal function for copying files from within parallel_copy_files.
     """
-
+
     assert len(input_output_tuple) == 2
     source_fn = input_output_tuple[0]
     target_fn = input_output_tuple[1]
     if (not overwrite) and (os.path.isfile(target_fn)):
         if verbose:
             print('Skipping existing target file {}'.format(target_fn))
-        return
-
+        return
+
     if move:
         action_string = 'Moving'
     else:
         action_string = 'Copying'
-
+
     if verbose:
         print('{} to {}'.format(action_string,target_fn))
-
+
     os.makedirs(os.path.dirname(target_fn),exist_ok=True)
     if move:
         shutil.move(source_fn, target_fn)
     else:
         shutil.copyfile(source_fn,target_fn)
-
 
-def parallel_copy_files(input_file_to_output_file,
-                        max_workers=16,
-                        use_threads=True,
-                        overwrite=False,
+
+def parallel_copy_files(input_file_to_output_file,
+                        max_workers=16,
+                        use_threads=True,
+                        overwrite=False,
                         verbose=False,
                         move=False):
     """
     Copy (or move) files from source to target according to the dict input_file_to_output_file.
-
+
     Args:
         input_file_to_output_file (dict): dictionary mapping source files to the target files
             to which they should be copied
@@ -1046,24 +1002,32 @@ def parallel_copy_files(input_file_to_output_file,
     """
 
     n_workers = min(max_workers,len(input_file_to_output_file))
-
+
     # Package the dictionary as a set of 2-tuples
     input_output_tuples = []
     for input_fn in input_file_to_output_file:
         input_output_tuples.append((input_fn,input_file_to_output_file[input_fn]))
 
-    if use_threads:
-        pool = ThreadPool(n_workers)
-    else:
-        pool = Pool(n_workers)
+    pool = None
 
-    with tqdm(total=len(input_output_tuples)) as pbar:
-        for i,_ in enumerate(pool.imap_unordered(partial(_copy_file,
-                                                         overwrite=overwrite,
-                                                         verbose=verbose,
-                                                         move=move),
-                                                 input_output_tuples)):
-            pbar.update()
+    try:
+        if use_threads:
+            pool = ThreadPool(n_workers)
+        else:
+            pool = Pool(n_workers)
+
+        with tqdm(total=len(input_output_tuples)) as pbar:
+            for i,_ in enumerate(pool.imap_unordered(partial(_copy_file,
+                                                             overwrite=overwrite,
+                                                             verbose=verbose,
+                                                             move=move),
+                                                     input_output_tuples)):
+                pbar.update()
+    finally:
+        pool.close()
+        pool.join()
+        if verbose:
+            print("Pool closed and joined parallel file copying")
 
 # ...def parallel_copy_files(...)
@@ -1074,36 +1038,36 @@ def get_file_sizes(base_dir, convert_slashes=True):
     """
     Gets sizes recursively for all files in base_dir, returning a dict mapping
     relative filenames to size.
-
+
     TODO: merge the functionality here with parallel_get_file_sizes, which uses slightly
     different semantics.
-
+
     Args:
         base_dir (str): folder within which we want all file sizes
         convert_slashes (bool, optional): force forward slashes in return strings,
             otherwise uses the native path separator
-
+
     Returns:
         dict: dictionary mapping filenames to file sizes in bytes
     """
-
-    relative_filenames = recursive_file_list(base_dir, convert_slashes=convert_slashes,
+
+    relative_filenames = recursive_file_list(base_dir, convert_slashes=convert_slashes,
                                              return_relative_paths=True)
-
+
     fn_to_size = {}
     for fn_relative in tqdm(relative_filenames):
         fn_abs = os.path.join(base_dir,fn_relative)
         fn_to_size[fn_relative] = os.path.getsize(fn_abs)
-
+
     return fn_to_size
-
+
 
 def _get_file_size(filename,verbose=False):
     """
     Internal function for safely getting the size of a file. Returns a (filename,size)
     tuple, where size is None if there is an error.
     """
-
+
     try:
         size = os.path.getsize(filename)
     except Exception as e:
@@ -1112,18 +1076,18 @@ def _get_file_size(filename,verbose=False):
         size = None
     return (filename,size)
 
-
-def parallel_get_file_sizes(filenames,
-                            max_workers=16,
-                            use_threads=True,
+
+def parallel_get_file_sizes(filenames,
+                            max_workers=16,
+                            use_threads=True,
                             verbose=False,
-                            recursive=True,
+                            recursive=True,
                             convert_slashes=True,
                             return_relative_paths=False):
     """
     Returns a dictionary mapping every file in [filenames] to the corresponding file size,
     or None for errors. If [filenames] is a folder, will enumerate the folder (optionally recursively).
-
+
     Args:
         filenames (list or str): list of filenames for which we should read sizes, or a folder
             within which we should read all file sizes recursively
@@ -1135,33 +1099,33 @@ def parallel_get_file_sizes(filenames,
         convert_slashes (bool, optional): convert backslashes to forward slashes
         return_relative_paths (bool, optional): return relative paths; only relevant if [filenames]
             is a folder.
-
+
     Returns:
         dict: dictionary mapping filenames to file sizes in bytes
     """
 
     n_workers = min(max_workers,len(filenames))
-
+
     folder_name = None
-
+
     if isinstance(filenames,str):
-
+
         folder_name = filenames
-        assert os.path.isdir(filenames), 'Could not find folder {}'.format(folder_name)
-
+        assert os.path.isdir(filenames), 'Could not find folder {}'.format(folder_name)
+
         if verbose:
             print('Enumerating files in {}'.format(folder_name))
-
+
         # Enumerate absolute paths here, we'll convert to relative later if requested
         filenames = recursive_file_list(folder_name,recursive=recursive,return_relative_paths=False)
 
     else:
-
+
         assert is_iterable(filenames), '[filenames] argument is neither a folder nor an iterable'
-
+
     if verbose:
         print('Creating worker pool')
-
+
     if use_threads:
         pool_string = 'thread'
         pool = ThreadPool(n_workers)
@@ -1172,11 +1136,11 @@ def parallel_get_file_sizes(filenames,
     if verbose:
         print('Created a {} pool of {} workers'.format(
             pool_string,n_workers))
-
+
     # This returns (filename,size) tuples
     get_size_results = list(tqdm(pool.imap(
         partial(_get_file_size,verbose=verbose),filenames), total=len(filenames)))
-
+
     to_return = {}
     for r in get_size_results:
         fn = r[0]
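
Per the docstring above, [filenames] can be a list or a folder; a sketch of the folder form (the path is hypothetical):

```python
from megadetector.utils.path_utils import parallel_get_file_sizes

# Map every file under /data/images to its size in bytes; values are None
# for files whose size could not be read.
fn_to_size = parallel_get_file_sizes('/data/images',
                                     max_workers=8,
                                     use_threads=True,
                                     recursive=True,
                                     return_relative_paths=True)
```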
@@ -1194,36 +1158,38 @@ def parallel_get_file_sizes(filenames,

 #%% Compression (zip/tar) functions

-def zip_file(input_fn, output_fn=None, overwrite=False, verbose=False, compresslevel=9):
+def zip_file(input_fn, output_fn=None, overwrite=False, verbose=False, compress_level=9):
     """
     Zips a single file.
-
+
     Args:
         input_fn (str): file to zip
         output_fn (str, optional): target zipfile; if this is None, we'll use
             [input_fn].zip
         overwrite (bool, optional): whether to overwrite an existing target file
         verbose (bool, optional): enable additional debug console output
-        compresslevel (int, optional): compression level to use, between 0 and 9
-
+        compress_level (int, optional): compression level to use, between 0 and 9
+
     Returns:
         str: the output zipfile, whether we created it or determined that it already exists
     """
-
+
     basename = os.path.basename(input_fn)
-
+
     if output_fn is None:
         output_fn = input_fn + '.zip'
-
+
     if (not overwrite) and (os.path.isfile(output_fn)):
         print('Skipping existing file {}'.format(output_fn))
         return output_fn
-
+
     if verbose:
-        print('Zipping {} to {} with level {}'.format(input_fn,output_fn,compresslevel))
-
+        print('Zipping {} to {} with level {}'.format(input_fn,output_fn,compress_level))
+
     with ZipFile(output_fn,'w',zipfile.ZIP_DEFLATED) as zipf:
-        zipf.write(input_fn,arcname=basename,compresslevel=compresslevel,
+        zipf.write(input_fn,
+                   arcname=basename,
+                   compresslevel=compress_level,
                    compress_type=zipfile.ZIP_DEFLATED)

     return output_fn
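
A sketch of the keyword renamed in 10.0.0 (5.0.x spelled it compresslevel); same assumed import path, hypothetical input file:

    from megadetector.utils.path_utils import zip_file

    # Writes results.json.zip next to the (hypothetical) input file
    zipped_fn = zip_file('results.json', compress_level=5, verbose=True)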
@@ -1232,9 +1198,9 @@ def zip_file(input_fn, output_fn=None, overwrite=False, verbose=False, compressl
 def add_files_to_single_tar_file(input_files, output_fn, arc_name_base,
                                  overwrite=False, verbose=False, mode='x'):
     """
-    Adds all the files in [input_files] to the tar file [output_fn].
+    Adds all the files in [input_files] to the tar file [output_fn].
     Archive names are relative to arc_name_base.
-
+
     Args:
         input_files (list): list of absolute filenames to include in the .tar file
         output_fn (str): .tar file to create
@@ -1244,11 +1210,11 @@ def add_files_to_single_tar_file(input_files, output_fn, arc_name_base,
         overwrite (bool, optional): whether to overwrite an existing .tar file
         verbose (bool, optional): enable additional debug console output
         mode (str, optional): compression type, can be 'x' (no compression), 'x:gz', or 'x:bz2'.
-
+
     Returns:
         str: the output tar file, whether we created it or determined that it already exists
     """
-
+
     if os.path.isfile(output_fn):
         if not overwrite:
             print('Tar file {} exists, skipping'.format(output_fn))
@@ -1256,11 +1222,11 @@ def add_files_to_single_tar_file(input_files, output_fn, arc_name_base,
         else:
             print('Tar file {} exists, deleting and re-creating'.format(output_fn))
             os.remove(output_fn)
-
+
     if verbose:
         print('Adding {} files to {} (mode {})'.format(
             len(input_files),output_fn,mode))
-
+
     with tarfile.open(output_fn,mode) as tarf:
         for input_fn_abs in tqdm(input_files,disable=(not verbose)):
             input_fn_relative = os.path.relpath(input_fn_abs,arc_name_base)
@@ -1269,12 +1235,16 @@ def add_files_to_single_tar_file(input_files, output_fn, arc_name_base,
     return output_fn
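
A sketch of the tar variant, under the same assumptions (hypothetical paths):

    from megadetector.utils.path_utils import add_files_to_single_tar_file

    # Archive names are stored relative to arc_name_base, so the archive
    # contains imgs/a.jpg and imgs/b.jpg
    add_files_to_single_tar_file(['/data/imgs/a.jpg', '/data/imgs/b.jpg'],
                                 '/tmp/images.tar.gz',
                                 arc_name_base='/data',
                                 mode='x:gz')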


-def zip_files_into_single_zipfile(input_files, output_fn, arc_name_base,
-                                  overwrite=False, verbose=False, compresslevel=9):
+def zip_files_into_single_zipfile(input_files,
+                                  output_fn,
+                                  arc_name_base,
+                                  overwrite=False,
+                                  verbose=False,
+                                  compress_level=9):
     """
-    Zip all the files in [input_files] into [output_fn]. Archive names are relative to
+    Zip all the files in [input_files] into [output_fn]. Archive names are relative to
     arc_name_base.
-
+
     Args:
         input_files (list): list of absolute filenames to include in the .zip file
         output_fn (str): .zip file to create
@@ -1283,89 +1253,89 @@ def zip_files_into_single_zipfile(input_files, output_fn, arc_name_base,
             [arc_name_base]
         overwrite (bool, optional): whether to overwrite an existing .zip file
         verbose (bool, optional): enable additional debug console output
-        compresslevel (int, optional): compression level to use, between 0 and 9
-
+        compress_level (int, optional): compression level to use, between 0 and 9
+
     Returns:
         str: the output zipfile, whether we created it or determined that it already exists
     """
-
+
     if not overwrite:
         if os.path.isfile(output_fn):
             print('Zip file {} exists, skipping'.format(output_fn))
             return output_fn
-
+
     if verbose:
         print('Zipping {} files to {} (compression level {})'.format(
-            len(input_files),output_fn,compresslevel))
-
+            len(input_files),output_fn,compress_level))
+
     with ZipFile(output_fn,'w',zipfile.ZIP_DEFLATED) as zipf:
         for input_fn_abs in tqdm(input_files,disable=(not verbose)):
             input_fn_relative = os.path.relpath(input_fn_abs,arc_name_base)
             zipf.write(input_fn_abs,
                        arcname=input_fn_relative,
-                       compresslevel=compresslevel,
+                       compresslevel=compress_level,
                        compress_type=zipfile.ZIP_DEFLATED)

     return output_fn
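
The zip counterpart of the tar sketch above (same assumptions):

    from megadetector.utils.path_utils import zip_files_into_single_zipfile

    zip_files_into_single_zipfile(['/data/imgs/a.jpg', '/data/imgs/b.jpg'],
                                  '/tmp/images.zip',
                                  arc_name_base='/data',
                                  compress_level=9)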
-
-
-def zip_folder(input_folder, output_fn=None, overwrite=False, verbose=False, compresslevel=9):
+
+
+def zip_folder(input_folder, output_fn=None, overwrite=False, verbose=False, compress_level=9):
     """
-    Recursively zip everything in [input_folder] into a single zipfile, storing files as paths
+    Recursively zip everything in [input_folder] into a single zipfile, storing files as paths
     relative to [input_folder].
-
-    Args:
+
+    Args:
         input_folder (str): folder to zip
         output_fn (str, optional): output filename; if this is None, we'll write to [input_folder].zip
         overwrite (bool, optional): whether to overwrite an existing .zip file
         verbose (bool, optional): enable additional debug console output
-        compresslevel (int, optional): compression level to use, between 0 and 9
-
+        compress_level (int, optional): compression level to use, between 0 and 9
+
     Returns:
-        str: the output zipfile, whether we created it or determined that it already exists
+        str: the output zipfile, whether we created it or determined that it already exists
     """
-
+
     if output_fn is None:
         output_fn = input_folder + '.zip'
-
+
     if not overwrite:
         if os.path.isfile(output_fn):
             print('Zip file {} exists, skipping'.format(output_fn))
-            return
-
+            return
+
     if verbose:
         print('Zipping {} to {} (compression level {})'.format(
-            input_folder,output_fn,compresslevel))
-
+            input_folder,output_fn,compress_level))
+
     relative_filenames = recursive_file_list(input_folder,return_relative_paths=True)
-
+
     with ZipFile(output_fn,'w',zipfile.ZIP_DEFLATED) as zipf:
         for input_fn_relative in tqdm(relative_filenames,disable=(not verbose)):
-            input_fn_abs = os.path.join(input_folder,input_fn_relative)
+            input_fn_abs = os.path.join(input_folder,input_fn_relative)
             zipf.write(input_fn_abs,
                        arcname=input_fn_relative,
-                       compresslevel=compresslevel,
+                       compresslevel=compress_level,
                        compress_type=zipfile.ZIP_DEFLATED)

     return output_fn
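
A sketch for whole-folder zipping (hypothetical folder, assumed import path as above):

    from megadetector.utils.path_utils import zip_folder

    # With output_fn=None, this writes /data/my_folder.zip
    zipped = zip_folder('/data/my_folder', compress_level=1, verbose=True)

Note that when the target already exists and overwrite=False, the bare "return" above means the function returns None rather than the existing filename, despite what the docstring says.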

-
-def parallel_zip_files(input_files,
-                       max_workers=16,
-                       use_threads=True,
-                       compresslevel=9,
-                       overwrite=False,
+
+def parallel_zip_files(input_files,
+                       max_workers=16,
+                       use_threads=True,
+                       compress_level=9,
+                       overwrite=False,
                        verbose=False):
     """
-    Zips one or more files to separate output files in parallel, leaving the
+    Zips one or more files to separate output files in parallel, leaving the
     original files in place. Each file is zipped to [filename].zip.
-
+
     Args:
-        input_file (str): list of files to zip
+        input_files (list): list of files to zip
         max_workers (int, optional): number of concurrent workers, set to <= 1 to disable parallelism
         use_threads (bool, optional): whether to use threads (True) or processes (False); ignored if
             max_workers <= 1
-        compresslevel (int, optional): zip compression level between 0 and 9
+        compress_level (int, optional): zip compression level between 0 and 9
         overwrite (bool, optional): whether to overwrite an existing .zip file
         verbose (bool, optional): enable additional debug console output
     """
@@ -1379,23 +1349,27 @@ def parallel_zip_files(input_files,

     with tqdm(total=len(input_files)) as pbar:
         for i,_ in enumerate(pool.imap_unordered(partial(zip_file,
-                output_fn=None,overwrite=overwrite,verbose=verbose,compresslevel=compresslevel),
+                output_fn=None,overwrite=overwrite,verbose=verbose,compress_level=compress_level),
                 input_files)):
             pbar.update()
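
A sketch of the parallel single-file variant (same assumptions):

    from megadetector.utils.path_utils import parallel_zip_files

    # Zips each hypothetical file in place, to a.json.zip and b.json.zip
    parallel_zip_files(['/data/a.json', '/data/b.json'],
                       max_workers=8,
                       compress_level=9)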


-def parallel_zip_folders(input_folders, max_workers=16, use_threads=True,
-                         compresslevel=9, overwrite=False, verbose=False):
+def parallel_zip_folders(input_folders,
+                         max_workers=16,
+                         use_threads=True,
+                         compress_level=9,
+                         overwrite=False,
+                         verbose=False):
     """
-    Zips one or more folders to separate output files in parallel, leaving the
+    Zips one or more folders to separate output files in parallel, leaving the
     original folders in place. Each folder is zipped to [folder_name].zip.
-
+
     Args:
-        input_folder (list): list of folders to zip
+        input_folders (list): list of folders to zip
         max_workers (int, optional): number of concurrent workers, set to <= 1 to disable parallelism
         use_threads (bool, optional): whether to use threads (True) or processes (False); ignored if
             max_workers <= 1
-        compresslevel (int, optional): zip compression level between 0 and 9
+        compress_level (int, optional): zip compression level between 0 and 9
         overwrite (bool, optional): whether to overwrite an existing .zip file
         verbose (bool, optional): enable additional debug console output
     """
@@ -1406,47 +1380,53 @@ def parallel_zip_folders(input_folders, max_workers=16, use_threads=True,
         pool = ThreadPool(n_workers)
     else:
         pool = Pool(n_workers)
-
+
     with tqdm(total=len(input_folders)) as pbar:
         for i,_ in enumerate(pool.imap_unordered(
                 partial(zip_folder,overwrite=overwrite,
-                        compresslevel=compresslevel,verbose=verbose),
+                        compress_level=compress_level,verbose=verbose),
                 input_folders)):
             pbar.update()


-def zip_each_file_in_folder(folder_name,recursive=False,max_workers=16,use_threads=True,
-                            compresslevel=9,overwrite=False,required_token=None,verbose=False,
+def zip_each_file_in_folder(folder_name,
+                            recursive=False,
+                            max_workers=16,
+                            use_threads=True,
+                            compress_level=9,
+                            overwrite=False,
+                            required_token=None,
+                            verbose=False,
                             exclude_zip=True):
     """
-    Zips each file in [folder_name] to its own zipfile (filename.zip), optionally recursing. To
+    Zips each file in [folder_name] to its own zipfile (filename.zip), optionally recursing. To
     zip a whole folder into a single zipfile, use zip_folder().
-
+
     Args:
         folder_name (str): the folder within which we should zip files
         recursive (bool, optional): whether to recurse within [folder_name]
         max_workers (int, optional): number of concurrent workers, set to <= 1 to disable parallelism
         use_threads (bool, optional): whether to use threads (True) or processes (False); ignored if
             max_workers <= 1
-        compresslevel (int, optional): zip compression level between 0 and 9
+        compress_level (int, optional): zip compression level between 0 and 9
         overwrite (bool, optional): whether to overwrite an existing .zip file
         required_token (str, optional): only zip files whose names contain this string
         verbose (bool, optional): enable additional debug console output
-        exclude_zip (bool, optional): skip files ending in .zip
+        exclude_zip (bool, optional): skip files ending in .zip
     """
-
+
     assert os.path.isdir(folder_name), '{} is not a folder'.format(folder_name)
-
+
     input_files = recursive_file_list(folder_name,recursive=recursive,return_relative_paths=False)
-
+
     if required_token is not None:
         input_files = [fn for fn in input_files if required_token in fn]
-
+
     if exclude_zip:
         input_files = [fn for fn in input_files if (not fn.endswith('.zip'))]
-
+
     parallel_zip_files(input_files=input_files,max_workers=max_workers,
-                       use_threads=use_threads,compresslevel=compresslevel,
+                       use_threads=use_threads,compress_level=compress_level,
                        overwrite=overwrite,verbose=verbose)
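
A sketch of the folder-walking wrapper above (same assumptions; required_token is a plain substring filter):

    from megadetector.utils.path_utils import zip_each_file_in_folder

    # Zip every file under a hypothetical results folder whose name contains
    # '.json', skipping files that already end in .zip
    zip_each_file_in_folder('/data/results',
                            recursive=True,
                            required_token='.json',
                            compress_level=9)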


@@ -1454,16 +1434,16 @@ def unzip_file(input_file, output_folder=None):
     """
     Unzips a zipfile to the specified output folder, defaulting to the same location as
     the input file.
-
+
     Args:
         input_file (str): zipfile to unzip
         output_folder (str, optional): folder to which we should unzip [input_file], defaults
             to unzipping to the folder where [input_file] lives
     """
-
+
     if output_folder is None:
         output_folder = os.path.dirname(input_file)
-
+
     with zipfile.ZipFile(input_file, 'r') as zf:
         zf.extractall(output_folder)
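
A round-trip sketch pairing zip_file with unzip_file (same assumptions):

    from megadetector.utils.path_utils import zip_file, unzip_file

    # Create results.json.zip, then expand it somewhere other than the
    # default (the folder containing the archive)
    archive = zip_file('/data/results.json')
    unzip_file(archive, output_folder='/tmp/restored')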

@@ -1473,31 +1453,33 @@ def unzip_file(input_file, output_folder=None):

 def compute_file_hash(file_path, algorithm='sha256', allow_failures=True):
     """
     Compute the hash of a file.
-
+
     Adapted from:
-
+
     https://www.geeksforgeeks.org/python-program-to-find-hash-of-file/
-
+
     Args:
         file_path (str): the file to hash
         algorithm (str, optional): the hashing algorithm to use (e.g. md5, sha256)
-
+        allow_failures (bool, optional): if True, read failures will silently return
+            None; if False, read failures will raise exceptions
+
     Returns:
         str: the hash value for this file
     """
-
+
     try:
-
+
         hash_func = hashlib.new(algorithm)
-
+
         with open(file_path, 'rb') as file:
             while chunk := file.read(8192): # Read the file in chunks of 8192 bytes
                 hash_func.update(chunk)
-
+
         return str(hash_func.hexdigest())
-
+
     except Exception:
-
+
         if allow_failures:
             return None
         else:
@@ -1507,14 +1489,14 @@ def compute_file_hash(file_path, algorithm='sha256', allow_failures=True):
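
A sketch of the function documented in this hunk (hypothetical file, assumed import path):

    from megadetector.utils.path_utils import compute_file_hash

    # With allow_failures=True (the default), an unreadable file yields
    # None rather than raising
    h = compute_file_hash('/data/results.json', algorithm='md5')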


 def parallel_compute_file_hashes(filenames,
-                                 max_workers=16,
-                                 use_threads=True,
+                                 max_workers=16,
+                                 use_threads=True,
                                  recursive=True,
                                  algorithm='sha256',
                                  verbose=False):
     """
     Compute file hashes for a list or folder of files.
-
+
     Args:
         filenames (list or str): a list of filenames or a folder
         max_workers (int, optional): the number of parallel workers to use; set to <=1 to disable
@@ -1524,8 +1506,8 @@ def parallel_compute_file_hashes(filenames,
         algorithm (str, optional): the hashing algorithm to use (e.g. md5, sha256)
         recursive (bool, optional): if [filenames] is a folder, whether to enumerate recursively.
             Ignored if [filenames] is a list.
-        verbose (bool, optional): enable additional debug output
-
+        verbose (bool, optional): enable additional debug output
+
     Returns:
         dict: a dict mapping filenames to hash values; values will be None for files that fail
             to load.
@@ -1535,35 +1517,1142 @@ def parallel_compute_file_hashes(filenames,
         if verbose:
             print('Enumerating files in {}'.format(filenames))
         filenames = recursive_file_list(filenames,recursive=recursive,return_relative_paths=False)
-
+
     n_workers = min(max_workers,len(filenames))
-
+
     if verbose:
         print('Computing hashes for {} files on {} workers'.format(len(filenames),n_workers))
-
+
     if n_workers <= 1:
-
+
         results = []
         for filename in filenames:
             results.append(compute_file_hash(filename,algorithm=algorithm,allow_failures=True))
-
+
     else:
-
+
         if use_threads:
             pool = ThreadPool(n_workers)
         else:
             pool = Pool(n_workers)
-
+
         results = list(tqdm(pool.imap(
             partial(compute_file_hash,algorithm=algorithm,allow_failures=True),
             filenames), total=len(filenames)))
-
+
     assert len(filenames) == len(results), 'Internal error in parallel_compute_file_hashes'
-
+
     to_return = {}
     for i_file,filename in enumerate(filenames):
         to_return[filename] = results[i_file]
-
+
     return to_return

 # ...def parallel_compute_file_hashes(...)
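
And the parallel wrapper (same assumptions):

    from megadetector.utils.path_utils import parallel_compute_file_hashes

    # Hash every file under a hypothetical folder; values are None for
    # files that fail to read
    hash_by_file = parallel_compute_file_hashes('/data/camera-traps', max_workers=8)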
+
+
+#%% Tests
+
+class TestPathUtils:
+    """
+    Tests for path_utils.py
+    """
+
+    def set_up(self):
+        """
+        Create a temporary directory for testing.
+        """
+
+        self.test_dir = make_test_folder(subfolder='megadetector/path_utils_tests')
+        os.makedirs(self.test_dir, exist_ok=True)
+
+
+    def tear_down(self):
+        """
+        Remove the temporary directory after tests.
+        """
+
+        if os.path.exists(self.test_dir):
+            shutil.rmtree(self.test_dir)
+
+
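
Note that set_up and tear_down are not hook names pytest invokes automatically (those would be setup_method and teardown_method), so these tests presumably rely on a driver that calls them explicitly; a minimal sketch of such a driver:

    t = TestPathUtils()
    t.set_up()
    try:
        t.test_is_image_file()
    finally:
        t.tear_down()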
+    def test_is_image_file(self):
+        """
+        Test the is_image_file function.
+        """
+
+        assert is_image_file('test.jpg')
+        assert is_image_file('test.jpeg')
+        assert is_image_file('test.png')
+        assert is_image_file('test.gif')
+        assert is_image_file('test.bmp')
+        assert is_image_file('test.tiff')
+        assert is_image_file('test.TIF')
+        assert not is_image_file('test.txt')
+        assert not is_image_file('test.doc')
+        assert is_image_file('path/to/image.JPG')
+        assert not is_image_file('image')
+        assert is_image_file('test.custom', img_extensions=['.custom'])
+        assert not is_image_file('test.jpg', img_extensions=['.custom'])
+
+
+    def test_find_image_strings(self):
+        """
+        Test the find_image_strings function.
+        """
+
+        strings = ['a.jpg', 'b.txt', 'c.PNG', 'd.gif', 'e.jpeg', 'f.doc']
+        expected = ['a.jpg', 'c.PNG', 'd.gif', 'e.jpeg']
+        assert sorted(find_image_strings(strings)) == sorted(expected)
+        assert find_image_strings([]) == []
+        assert find_image_strings(['no_image.txt', 'another.doc']) == []
+
+
+    def test_find_images(self):
+        """
+        Test the find_images function.
+        """
+
+        # Create some dummy files
+        img1_abs = os.path.join(self.test_dir, 'img1.jpg')
+        img2_abs = os.path.join(self.test_dir, 'img2.PNG')
+        txt1_abs = os.path.join(self.test_dir, 'text1.txt')
+        open(img1_abs, 'w').close()
+        open(img2_abs, 'w').close()
+        open(txt1_abs, 'w').close()
+
+        subdir = os.path.join(self.test_dir, 'subdir')
+        os.makedirs(subdir, exist_ok=True)
+        img3_abs = os.path.join(subdir, 'img3.jpeg')
+        txt2_abs = os.path.join(subdir, 'text2.txt')
+        open(img3_abs, 'w').close()
+        open(txt2_abs, 'w').close()
+
+        # Test non-recursive
+        expected_non_recursive_abs = sorted([img1_abs.replace('\\', '/'), img2_abs.replace('\\', '/')])
+        found_non_recursive_abs = find_images(self.test_dir, recursive=False, return_relative_paths=False)
+        assert sorted(found_non_recursive_abs) == expected_non_recursive_abs
+
+        # Test non-recursive, relative paths
+        expected_non_recursive_rel = sorted(['img1.jpg', 'img2.PNG'])
+        found_non_recursive_rel = find_images(self.test_dir, recursive=False, return_relative_paths=True)
+        assert sorted(found_non_recursive_rel) == expected_non_recursive_rel
+
+        # Test recursive
+        expected_recursive_abs = sorted([
+            img1_abs.replace('\\', '/'),
+            img2_abs.replace('\\', '/'),
+            img3_abs.replace('\\', '/')
+        ])
+        found_recursive_abs = find_images(self.test_dir, recursive=True, return_relative_paths=False)
+        assert sorted(found_recursive_abs) == expected_recursive_abs
+
+        # Test recursive, relative paths
+        expected_recursive_rel = sorted([
+            'img1.jpg',
+            'img2.PNG',
+            os.path.join('subdir', 'img3.jpeg').replace('\\', '/')
+        ])
+        found_recursive_rel = find_images(self.test_dir, recursive=True, return_relative_paths=True)
+        assert sorted(found_recursive_rel) == expected_recursive_rel
+
+        # Test with an empty directory
+        empty_dir = os.path.join(self.test_dir, 'empty_dir')
+        os.makedirs(empty_dir, exist_ok=True)
+        assert find_images(empty_dir, recursive=True) == []
+
+        # Test with a directory that doesn't exist (should assert)
+        try:
+            find_images(os.path.join(self.test_dir, 'non_existent_dir'))
+        except AssertionError:
+            pass
+        else:
+            raise AssertionError("AssertionError not raised for non_existent_dir")
+
+
+    def test_recursive_file_list_and_file_list(self):
+        """
+        Test the recursive_file_list and file_list functions.
+        """
+
+        # Setup directory structure
+        #
+        # test_dir/
+        #   file1.txt
+        #   file2.jpg
+        #   subdir1/
+        #     file3.txt
+        #     subsubdir/
+        #       file4.png
+        #   subdir2/
+        #     file5.doc
+
+        list_dir = os.path.join(self.test_dir,'recursive_list')
+
+        f1 = os.path.join(list_dir, 'file1.txt')
+        f2 = os.path.join(list_dir, 'file2.jpg')
+        subdir1 = os.path.join(list_dir, 'subdir1')
+        os.makedirs(subdir1, exist_ok=True)
+        f3 = os.path.join(subdir1, 'file3.txt')
+        subsubdir = os.path.join(subdir1, 'subsubdir')
+        os.makedirs(subsubdir, exist_ok=True)
+        f4 = os.path.join(subsubdir, 'file4.png')
+        subdir2 = os.path.join(list_dir, 'subdir2')
+        os.makedirs(subdir2, exist_ok=True)
+        f5 = os.path.join(subdir2, 'file5.doc')
+
+        for filepath in [f1, f2, f3, f4, f5]:
+            with open(filepath, 'w') as f:
+                f.write('test')
+
+        # Test recursive_file_list (recursive=True by default)
+        expected_all_files_abs = sorted([
+            f1.replace('\\', '/'), f2.replace('\\', '/'), f3.replace('\\', '/'),
+            f4.replace('\\', '/'), f5.replace('\\', '/')
+        ])
+        all_files_abs = recursive_file_list(list_dir, convert_slashes=True,
+                                            return_relative_paths=False)
+        assert sorted(all_files_abs) == expected_all_files_abs
+
+        # Test recursive_file_list with relative paths
+        expected_all_files_rel = sorted([
+            'file1.txt', 'file2.jpg',
+            os.path.join('subdir1', 'file3.txt').replace('\\', '/'),
+            os.path.join('subdir1', 'subsubdir', 'file4.png').replace('\\', '/'),
+            os.path.join('subdir2', 'file5.doc').replace('\\', '/')
+        ])
+        all_files_rel = recursive_file_list(list_dir, convert_slashes=True,
+                                            return_relative_paths=True)
+        assert sorted(all_files_rel) == expected_all_files_rel
+
+        # Test file_list (non-recursive by default via wrapper)
+        expected_top_level_files_abs = sorted([f1.replace('\\', '/'), f2.replace('\\', '/')])
+        top_level_files_abs = file_list(list_dir, convert_slashes=True,
+                                        return_relative_paths=False, recursive=False)
+        assert sorted(top_level_files_abs) == expected_top_level_files_abs
+
+        # Test file_list (recursive explicitly) - should be same as recursive_file_list
+        recursive_via_file_list = file_list(list_dir, convert_slashes=True,
+                                            return_relative_paths=False, recursive=True)
+        assert sorted(recursive_via_file_list) == expected_all_files_abs
+
+        # Test with convert_slashes=False (use os.sep)
+        #
+        # Note: This test might be tricky if os.sep is '/', as no replacement happens. We'll check
+        # that backslashes remain on Windows.
+        if os.sep == '\\':
+            f1_raw = os.path.join(list_dir, 'file1.txt')
+            # Only one file for simplicity
+            files_no_slash_conversion = file_list(list_dir, convert_slashes=False, recursive=False)
+            assert any(f1_raw in s for s in files_no_slash_conversion)
+
+        # Test with an empty directory
+        empty_dir = os.path.join(list_dir, "empty_dir_for_files")
+        os.makedirs(empty_dir, exist_ok=True)
+        assert recursive_file_list(empty_dir) == []
+        assert file_list(empty_dir, recursive=False) == []
+
+        # Test with a non-existent directory
+        try:
+            recursive_file_list(os.path.join(list_dir, "non_existent_dir"))
+        except AssertionError:
+            pass
+        else:
+            raise AssertionError("AssertionError not raised for non_existent_dir in recursive_file_list")
+
+
+    def test_folder_list(self):
+        """
+        Test the folder_list function.
+        """
+
+        # Setup directory structure
+        #
+        # test_dir/
+        #   subdir1/
+        #     subsubdir1/
+        #   subdir2/
+        #   file1.txt (should be ignored)
+
+        folder_list_dir = os.path.join(self.test_dir,'folder_list')
+
+        subdir1 = os.path.join(folder_list_dir, 'subdir1')
+        subsubdir1 = os.path.join(subdir1, 'subsubdir1')
+        subdir2 = os.path.join(folder_list_dir, 'subdir2')
+        os.makedirs(subdir1, exist_ok=True)
+        os.makedirs(subsubdir1, exist_ok=True)
+        os.makedirs(subdir2, exist_ok=True)
+        with open(os.path.join(folder_list_dir, 'file1.txt'), 'w') as f:
+            f.write('test')
+
+        # Test non-recursive
+        expected_folders_non_recursive_abs = sorted([
+            subdir1.replace('\\', '/'), subdir2.replace('\\', '/')
+        ])
+        folders_non_recursive_abs = folder_list(folder_list_dir, recursive=False,
+                                                return_relative_paths=False)
+        assert sorted(folders_non_recursive_abs) == expected_folders_non_recursive_abs
+
+        # Test non-recursive, relative paths
+        expected_folders_non_recursive_rel = sorted(['subdir1', 'subdir2'])
+        folders_non_recursive_rel = folder_list(folder_list_dir, recursive=False,
+                                                return_relative_paths=True)
+        assert sorted(folders_non_recursive_rel) == expected_folders_non_recursive_rel
+
+        # Test recursive
+        expected_folders_recursive_abs = sorted([
+            subdir1.replace('\\', '/'),
+            subsubdir1.replace('\\', '/'),
+            subdir2.replace('\\', '/')
+        ])
+        folders_recursive_abs = folder_list(folder_list_dir, recursive=True,
+                                            return_relative_paths=False)
+        assert sorted(folders_recursive_abs) == expected_folders_recursive_abs
+
+        # Test recursive, relative paths
+        expected_folders_recursive_rel = sorted([
+            'subdir1',
+            os.path.join('subdir1', 'subsubdir1').replace('\\', '/'),
+            'subdir2'
+        ])
+        folders_recursive_rel = folder_list(folder_list_dir, recursive=True,
+                                            return_relative_paths=True)
+        assert sorted(folders_recursive_rel) == expected_folders_recursive_rel
+
+        # Test with an empty directory (except for the file)
+        empty_dir_for_folders = os.path.join(folder_list_dir, "empty_for_folders")
+        os.makedirs(empty_dir_for_folders, exist_ok=True)
+        with open(os.path.join(empty_dir_for_folders, 'temp.txt'), 'w') as f:
+            f.write('t')
+        assert folder_list(empty_dir_for_folders, recursive=True) == []
+        assert folder_list(empty_dir_for_folders, recursive=False) == []
+
+        # Test with a non-existent directory
+        try:
+            folder_list(os.path.join(self.test_dir, "non_existent_dir"))
+        except AssertionError:
+            pass
+        else:
+            raise AssertionError("AssertionError not raised for non_existent_dir in folder_list")
+
+
+    def test_folder_summary(self):
+        """
+        Test the folder_summary function.
+        """
+
+        # test_dir/
+        #   file1.txt
+        #   img1.jpg
+        #   subdir/
+        #     file2.txt
+        #     img2.png
+        #     img3.png
+
+        folder_summary_dir = os.path.join(self.test_dir,'folder_summary')
+
+        f1 = os.path.join(folder_summary_dir, 'file1.txt')
+        img1 = os.path.join(folder_summary_dir, 'img1.jpg')
+        subdir = os.path.join(folder_summary_dir, 'subdir')
+        os.makedirs(subdir, exist_ok=True)
+        f2 = os.path.join(subdir, 'file2.txt')
+        img2 = os.path.join(subdir, 'img2.png')
+        img3 = os.path.join(subdir, 'img3.png')
+
+        for filepath in [f1, img1, f2, img2, img3]:
+            with open(filepath, 'w') as f:
+                f.write('test')
+
+        summary = folder_summary(folder_summary_dir, print_summary=False)
+
+        assert summary['n_files'] == 5
+        assert summary['n_folders'] == 1 # 'subdir'
+        assert summary['extension_to_count']['.txt'] == 2
+        assert summary['extension_to_count']['.jpg'] == 1
+        assert summary['extension_to_count']['.png'] == 2
+
+        # Check order (sorted by value, desc)
+        #
+        # The specific order of keys with the same counts can vary based on file system list
+        # order. We'll check that the counts are correct and the number of unique extensions is
+        # right.
+        assert len(summary['extension_to_count']) == 3
+
+        empty_dir = os.path.join(folder_summary_dir, "empty_summary_dir")
+        os.makedirs(empty_dir, exist_ok=True)
+        empty_summary = folder_summary(empty_dir, print_summary=False)
+        assert empty_summary['n_files'] == 0
+        assert empty_summary['n_folders'] == 0
+        assert empty_summary['extension_to_count'] == {}
+
+
+    def test_fileparts(self):
+        """
+        Test the fileparts function.
+        """
+
+        assert fileparts('file') == ('', 'file', '')
+        assert fileparts('file.txt') == ('', 'file', '.txt')
+        assert fileparts(r'c:/dir/file.jpg') == ('c:/dir', 'file', '.jpg')
+        assert fileparts('/dir/subdir/file.jpg') == ('/dir/subdir', 'file', '.jpg')
+        assert fileparts(r'c:\dir\file') == (r'c:\dir', 'file', '')
+        assert fileparts(r'c:\dir\file.tar.gz') == (r'c:\dir', 'file.tar', '.gz')
+        assert fileparts('.bashrc') == ('', '.bashrc', '') # Hidden file, no extension
+        assert fileparts('nodir/.bashrc') == ('nodir', '.bashrc', '')
+        assert fileparts('a/b/c.d.e') == ('a/b', 'c.d', '.e')
+
+
+    def test_insert_before_extension(self):
+        """
+        Test the insert_before_extension function.
+        """
+
+        assert insert_before_extension('file.ext', 'inserted') == 'file.inserted.ext'
+        assert insert_before_extension('file', 'inserted') == 'file.inserted'
+        assert insert_before_extension('path/to/file.ext', 'tag') == 'path/to/file.tag.ext'
+        assert insert_before_extension('path/to/file', 'tag') == 'path/to/file.tag'
+        assert insert_before_extension('file.tar.gz', 'new') == 'file.tar.new.gz'
+
+        # Test with custom separator
+        assert insert_before_extension('file.ext', 'inserted', separator='_') == 'file_inserted.ext'
+
+        # Test with s=None (timestamp) - check format roughly
+        fname_with_ts = insert_before_extension('file.ext', None)
+        parts = fname_with_ts.split('.')
+        # file.YYYY.MM.DD.HH.MM.SS.ext
+        assert len(parts) >= 8 # file, Y, M, D, H, M, S, ext
+        assert parts[0] == 'file'
+        assert parts[-1] == 'ext'
+        assert all(p.isdigit() for p in parts[1:-1])
+
+        fname_no_ext_ts = insert_before_extension('file', '') # s is empty string, should also use timestamp
+        parts_no_ext = fname_no_ext_ts.split('.')
+        assert len(parts_no_ext) >= 7 # file, Y, M, D, H, M, S
+        assert parts_no_ext[0] == 'file'
+        assert all(p.isdigit() for p in parts_no_ext[1:])
+
+
+    def test_split_path(self):
+        """
+        Test the split_path function.
+        """
+
+        if os.name == 'nt':
+            assert split_path(r'c:\dir\subdir\file.txt') == ['c:\\', 'dir', 'subdir', 'file.txt']
+            assert split_path('c:\\') == ['c:\\']
+            # Test with mixed slashes, ntpath.split handles them
+            assert split_path(r'c:/dir/subdir/file.txt') == ['c:/', 'dir', 'subdir', 'file.txt']
+        else: # POSIX
+            assert split_path('/dir/subdir/file.jpg') == ['/', 'dir', 'subdir', 'file.jpg']
+            assert split_path('/') == ['/']
+
+        assert split_path('dir/file.txt') == ['dir', 'file.txt']
+        assert split_path('file.txt') == ['file.txt']
+        assert split_path('') == ''
+        assert split_path('.') == ['.']
+        assert split_path('..') == ['..']
+        assert split_path('../a/b') == ['..', 'a', 'b']
+
+
+    def test_path_is_abs(self):
+        """
+        Test the path_is_abs function.
+        """
+
+        assert path_is_abs('/absolute/path')
+        assert path_is_abs('c:/absolute/path')
+        assert path_is_abs('C:\\absolute\\path')
+        assert path_is_abs('\\\\server\\share\\path') # UNC path
+        assert path_is_abs('c:file_without_slash_after_drive')
+
+        assert not path_is_abs('relative/path')
+        assert not path_is_abs('file.txt')
+        assert not path_is_abs('../relative')
+        assert not path_is_abs('')
+
+
+    def test_safe_create_link_unix(self):
+        """
+        Test the safe_create_link function on Unix-like systems.
+        """
+
+        if os.name == 'nt':
+            # print("Skipping test_safe_create_link_unix on Windows.")
+            return
+
+        source_file_path = os.path.join(self.test_dir, 'source.txt')
+        link_path = os.path.join(self.test_dir, 'link.txt')
+        other_source_path = os.path.join(self.test_dir, 'other_source.txt')
+
+        with open(source_file_path, 'w') as f:
+            f.write('source data')
+        with open(other_source_path, 'w') as f:
+            f.write('other data')
+
+        # Create new link
+        safe_create_link(source_file_path, link_path)
+        assert os.path.islink(link_path)
+        assert os.readlink(link_path) == source_file_path
+
+        # Link already exists and points to the correct source
+        safe_create_link(source_file_path, link_path) # Should do nothing
+        assert os.path.islink(link_path)
+        assert os.readlink(link_path) == source_file_path
+
+        # Link already exists but points to a different source
+        safe_create_link(other_source_path, link_path) # Should remove and re-create
+        assert os.path.islink(link_path)
+        assert os.readlink(link_path) == other_source_path
+
+        # Link_new path exists and is a file (not a link)
+        file_path_conflict = os.path.join(self.test_dir, 'conflict_file.txt')
+        with open(file_path_conflict, 'w') as f:
+            f.write('actual file')
+        try:
+            safe_create_link(source_file_path, file_path_conflict)
+        except AssertionError:
+            pass
+        else:
+            raise AssertionError("AssertionError not raised for file conflict")
+        os.remove(file_path_conflict)
+
+        # Link_new path exists and is a directory
+        dir_path_conflict = os.path.join(self.test_dir, 'conflict_dir')
+        os.makedirs(dir_path_conflict, exist_ok=True)
+        try:
+            safe_create_link(source_file_path, dir_path_conflict)
+        except AssertionError: # islink will be false
+            pass
+        else:
+            raise AssertionError("AssertionError not raised for directory conflict")
+        shutil.rmtree(dir_path_conflict)
+
+
+    def test_remove_empty_folders(self):
+        """
+        Test the remove_empty_folders function.
+        """
+
+        # test_dir/
+        #   empty_top/
+        #     empty_mid/
+        #       empty_leaf/
+        #   mixed_top/
+        #     empty_mid_in_mixed/
+        #       empty_leaf_in_mixed/
+        #     non_empty_mid/
+        #       file.txt
+        #   non_empty_top/
+        #     file_in_top.txt
+
+        empty_top = os.path.join(self.test_dir, 'empty_top')
+        empty_mid = os.path.join(empty_top, 'empty_mid')
+        empty_leaf = os.path.join(empty_mid, 'empty_leaf')
+        os.makedirs(empty_leaf, exist_ok=True)
+
+        mixed_top = os.path.join(self.test_dir, 'mixed_top')
+        empty_mid_in_mixed = os.path.join(mixed_top, 'empty_mid_in_mixed')
+        empty_leaf_in_mixed = os.path.join(empty_mid_in_mixed, 'empty_leaf_in_mixed')
+        os.makedirs(empty_leaf_in_mixed, exist_ok=True)
+        non_empty_mid = os.path.join(mixed_top, 'non_empty_mid')
+        os.makedirs(non_empty_mid, exist_ok=True)
+        with open(os.path.join(non_empty_mid, 'file.txt'), 'w') as f:
+            f.write('data')
+
+        non_empty_top = os.path.join(self.test_dir, 'non_empty_top')
+        os.makedirs(non_empty_top, exist_ok=True)
+        with open(os.path.join(non_empty_top, 'file_in_top.txt'), 'w') as f:
+            f.write('data')
+
+        # Process empty_top - should remove all three
+        remove_empty_folders(empty_top, remove_root=True)
+        assert not os.path.exists(empty_top)
+        assert not os.path.exists(empty_mid)
+        assert not os.path.exists(empty_leaf)
+
+        # Process mixed_top; should remove empty_leaf_in_mixed and empty_mid_in_mixed,
+        # but not mixed_top or non_empty_mid.
+        remove_empty_folders(mixed_top, remove_root=True)
+        assert os.path.exists(mixed_top) # mixed_top itself should remain
+        assert not os.path.exists(empty_mid_in_mixed)
+        assert not os.path.exists(empty_leaf_in_mixed)
+        assert os.path.exists(non_empty_mid)
+        assert os.path.exists(os.path.join(non_empty_mid, 'file.txt'))
+
+        # Process non_empty_top; should remove nothing.
+        remove_empty_folders(non_empty_top, remove_root=True)
+        assert os.path.exists(non_empty_top)
+        assert os.path.exists(os.path.join(non_empty_top, 'file_in_top.txt'))
+
+        # Test with a file path (should do nothing and return False)
+        file_path_for_removal = os.path.join(self.test_dir, 'a_file.txt')
+        with open(file_path_for_removal, 'w') as f:
+            f.write('t')
+        assert not remove_empty_folders(file_path_for_removal, remove_root=True)
+        assert os.path.exists(file_path_for_removal)
+
+        # Test with remove_root=False for the top level
+        another_empty_top = os.path.join(self.test_dir, 'another_empty_top')
+        another_empty_mid = os.path.join(another_empty_top, 'another_empty_mid')
+        os.makedirs(another_empty_mid)
+        remove_empty_folders(another_empty_top, remove_root=False)
+        assert os.path.exists(another_empty_top) # Root not removed
+        assert not os.path.exists(another_empty_mid) # Mid removed
+
+
+    def test_path_join(self):
+        """
+        Test the path_join function.
+        """
+
+        assert path_join('a', 'b', 'c') == 'a/b/c'
+        assert path_join('a/b', 'c', 'd.txt') == 'a/b/c/d.txt'
+        if os.name == 'nt':
+            # On Windows, os.path.join uses '\', so convert_slashes=True should change it
+            assert path_join('a', 'b', convert_slashes=True) == 'a/b'
+            assert path_join('a', 'b', convert_slashes=False) == 'a\\b'
+            assert path_join('c:\\', 'foo', 'bar', convert_slashes=True) == 'c:/foo/bar'
+            assert path_join('c:\\', 'foo', 'bar', convert_slashes=False) == 'c:\\foo\\bar'
+        else:
+            # On POSIX, os.path.join uses '/', so convert_slashes=False should still be '/'
+            assert path_join('a', 'b', convert_slashes=False) == 'a/b'
+
+        assert path_join('a', '', 'b') == 'a/b' # os.path.join behavior
+        assert path_join('/a', 'b') == '/a/b'
+        assert path_join('a', '/b') == '/b' # '/b' is absolute
+
+
+    def test_filename_cleaning(self):
+        """
+        Test clean_filename, clean_path, and flatten_path functions.
+        """
+
+        # clean_filename
+        assert clean_filename("test file.txt") == "test file.txt"
+        assert clean_filename("test*file?.txt", char_limit=10) == "testfile.t"
+        assert clean_filename("TestFile.TXT", force_lower=True) == "testfile.txt"
+        assert clean_filename("file:with<illegal>chars.txt") == "filewithillegalchars.txt"
+        assert clean_filename(" accented_name_éà.txt") == " accented_name_ea.txt"
+
+        # Separators are not allowed by default in clean_filename
+        assert clean_filename("path/to/file.txt") == "pathtofile.txt"
+
+        # clean_path
+        assert clean_path("path/to/file.txt") == "path/to/file.txt" # slashes allowed
+        assert clean_path("path\\to\\file.txt") == "path\\to\\file.txt" # backslashes allowed
+        assert clean_path("path:to:file.txt") == "path:to:file.txt" # colons allowed
+        assert clean_path("path/to<illegal>/file.txt") == "path/toillegal/file.txt"
+
+        # flatten_path
+        assert flatten_path("path/to/file.txt") == "path~to~file.txt"
+        assert flatten_path("path:to:file.txt", separator_char_replacement='_') == "path_to_file.txt"
+        assert flatten_path("path\\to/file:name.txt") == "path~to~file~name.txt"
+        assert flatten_path("path/to<illegal>/file.txt") == "path~toillegal~file.txt"
+
+
+    def test_is_executable(self):
+        """
+        Test the is_executable function.
+        This is a basic test; comprehensive testing is environment-dependent.
+        """
+
+        # Hard to test reliably across all systems without knowing what's on PATH.
+        if os.name == 'nt':
+            assert is_executable('cmd.exe')
+            assert not is_executable('non_existent_executable_blah_blah')
+        else:
+            assert is_executable('ls')
+            assert is_executable('sh')
+            assert not is_executable('non_existent_executable_blah_blah')
+
+
+    def test_write_read_list_to_file(self):
+        """
+        Test write_list_to_file and read_list_from_file functions.
+        """
+
+        test_list = ["item1", "item2 with space", "item3/with/slash"]
+
+        # Test with .json
+        json_file_path = os.path.join(self.test_dir, "test_list.json")
+        write_list_to_file(json_file_path, test_list)
+        read_list_json = read_list_from_file(json_file_path)
+        assert test_list == read_list_json
+
+        # Test with .txt
+        txt_file_path = os.path.join(self.test_dir, "test_list.txt")
+        write_list_to_file(txt_file_path, test_list)
+        # read_list_from_file is specifically for JSON, so we read .txt manually
+        with open(txt_file_path, 'r') as f:
+            read_list_txt = [line.strip() for line in f.readlines()]
+        assert test_list == read_list_txt
+
+        # Test reading non-existent json
+        try:
+            read_list_from_file(os.path.join(self.test_dir,"non_existent.json"))
+            raise AssertionError("FileNotFoundError not raised")
+        except FileNotFoundError:
+            pass
+
+        # Test reading a non-json file with read_list_from_file (should fail parsing)
+        non_json_path = os.path.join(self.test_dir, "not_a_list.json")
+        with open(non_json_path, 'w') as f:
+            f.write("this is not json")
+        try:
+            read_list_from_file(non_json_path)
+            raise AssertionError("json.JSONDecodeError not raised")
+        except json.JSONDecodeError:
+            pass
+
+
+    def test_parallel_copy_files(self):
+        """
+        Test the parallel_copy_files function (with max_workers=1 for test simplicity).
+        """
+
+        source_dir = os.path.join(self.test_dir, "copy_source")
+        target_dir = os.path.join(self.test_dir, "copy_target")
+        os.makedirs(source_dir, exist_ok=True)
+
+        file_mappings = {}
+        source_files_content = {}
+
+        for i in range(3):
+            src_fn = f"file{i}.txt"
+            src_path = os.path.join(source_dir, src_fn)
+            if i == 0:
+                tgt_fn = f"copied_file{i}.txt"
+                tgt_path = os.path.join(target_dir, tgt_fn)
+            else:
+                tgt_fn = f"copied_file{i}_subdir.txt"
+                tgt_path = os.path.join(target_dir, f"sub{i}", tgt_fn)
+
+            content = f"content of file {i}"
+            with open(src_path, 'w') as f:
+                f.write(content)
+
+            file_mappings[src_path] = tgt_path
+            source_files_content[tgt_path] = content
+
+        # Test copy
+        parallel_copy_files(file_mappings, max_workers=1, use_threads=True, overwrite=False)
+        for tgt_path, expected_content in source_files_content.items():
+            assert os.path.exists(tgt_path)
+            with open(tgt_path, 'r') as f:
+                assert f.read() == expected_content
+
+        existing_target_path = list(source_files_content.keys())[0]
+        with open(existing_target_path, 'w') as f:
+            f.write("old content")
+
+        parallel_copy_files(file_mappings, max_workers=1, use_threads=True, overwrite=False)
+        with open(existing_target_path, 'r') as f:
+            assert f.read() == "old content"
+
+        parallel_copy_files(file_mappings, max_workers=1, use_threads=True, overwrite=True)
+        with open(existing_target_path, 'r') as f:
+            assert f.read() == source_files_content[existing_target_path]
+
+        for src_path_orig, tgt_path_orig in file_mappings.items(): # Re-create source for move
+            with open(src_path_orig, 'w') as f:
+                f.write(source_files_content[tgt_path_orig])
+
+        parallel_copy_files(file_mappings, max_workers=1, use_threads=True, move=True, overwrite=True)
+        for src_path, tgt_path in file_mappings.items():
+            assert not os.path.exists(src_path)
+            assert os.path.exists(tgt_path)
+            with open(tgt_path, 'r') as f:
+                assert f.read() == source_files_content[tgt_path]
+
+
+    def test_get_file_sizes(self):
+        """
+        Test get_file_sizes and parallel_get_file_sizes functions.
+        """
+
+        file_sizes_test_dir = os.path.join(self.test_dir,'file_sizes')
+        os.makedirs(file_sizes_test_dir,exist_ok=True)
+
+        f1_path = os.path.join(file_sizes_test_dir, 'file1.txt')
+        content1 = "0123456789" # 10 bytes
+        with open(f1_path, 'w') as f:
+            f.write(content1)
+
+        subdir_path = os.path.join(file_sizes_test_dir, 'subdir')
+        os.makedirs(subdir_path, exist_ok=True)
+        f2_path = os.path.join(subdir_path, 'file2.txt')
+        content2 = "01234567890123456789" # 20 bytes
+        with open(f2_path, 'w') as f:
+            f.write(content2)
+
+        sizes_relative = get_file_sizes(file_sizes_test_dir)
+        expected_sizes_relative = {
+            'file1.txt': len(content1),
+            os.path.join('subdir', 'file2.txt').replace('\\','/'): len(content2)
+        }
+        assert sizes_relative == expected_sizes_relative
+
+        file_list_abs = [f1_path, f2_path]
+        sizes_parallel_abs = parallel_get_file_sizes(file_list_abs, max_workers=1)
+        expected_sizes_parallel_abs = {
+            f1_path.replace('\\','/'): len(content1),
+            f2_path.replace('\\','/'): len(content2)
+        }
+        assert sizes_parallel_abs == expected_sizes_parallel_abs
+
+        sizes_parallel_folder_abs = parallel_get_file_sizes(file_sizes_test_dir,
+                                                            max_workers=1,
+                                                            return_relative_paths=False)
+        assert sizes_parallel_folder_abs == expected_sizes_parallel_abs
+
+        sizes_parallel_folder_rel = parallel_get_file_sizes(file_sizes_test_dir,
+                                                            max_workers=1,
+                                                            return_relative_paths=True)
+        assert sizes_parallel_folder_rel == expected_sizes_relative
+
+        non_existent_file = os.path.join(file_sizes_test_dir, "no_such_file.txt")
+        sizes_with_error = parallel_get_file_sizes([f1_path, non_existent_file],
+                                                   max_workers=1)
+        expected_with_error = {
+            f1_path.replace('\\','/'): len(content1),
+            non_existent_file.replace('\\','/'): None
+        }
+        assert sizes_with_error == expected_with_error
+
+
+ def test_zip_file_and_unzip_file(self):
2316
+ """
2317
+ Test zip_file and unzip_file functions.
2318
+ """
2319
+
2320
+ file_to_zip_name = "test_zip_me.txt"
2321
+ file_to_zip_path = os.path.join(self.test_dir, file_to_zip_name)
2322
+ content = "This is the content to be zipped."
2323
+ with open(file_to_zip_path, 'w') as f:
2324
+ f.write(content)
2325
+
2326
+ default_zip_output_path = file_to_zip_path + ".zip"
2327
+ returned_zip_path = zip_file(file_to_zip_path)
2328
+ assert returned_zip_path == default_zip_output_path
2329
+ assert os.path.exists(default_zip_output_path)
2330
+
2331
+ unzip_dir_default = os.path.join(self.test_dir, "unzip_default")
2332
+ os.makedirs(unzip_dir_default, exist_ok=True)
2333
+ unzip_file(default_zip_output_path, unzip_dir_default)
2334
+ unzipped_file_path_default = os.path.join(unzip_dir_default, file_to_zip_name)
2335
+ assert os.path.exists(unzipped_file_path_default)
2336
+ with open(unzipped_file_path_default, 'r') as f:
2337
+ assert f.read() == content
2338
+
2339
+ custom_zip_output_name = "custom_archive.zip"
2340
+ custom_zip_output_path = os.path.join(self.test_dir, custom_zip_output_name)
2341
+ zip_file(file_to_zip_path, output_fn=custom_zip_output_path, overwrite=True)
2342
+ assert os.path.exists(custom_zip_output_path)
2343
+
2344
+ zip_in_subdir_path = os.path.join(self.test_dir, "subdir_zip", "my.zip")
2345
+ file_in_subdir_name = "file_for_subdir_zip.txt"
2346
+ file_in_subdir_path = os.path.join(self.test_dir,"subdir_zip", file_in_subdir_name)
2347
+ os.makedirs(os.path.dirname(zip_in_subdir_path), exist_ok=True)
2348
+ with open(file_in_subdir_path, "w") as f: f.write("sub dir content")
2349
+ zip_file(file_in_subdir_path, output_fn=zip_in_subdir_path)
2350
+
2351
+ unzip_file(zip_in_subdir_path, output_folder=None)
2352
+ unzipped_in_same_dir_path = os.path.join(os.path.dirname(zip_in_subdir_path), file_in_subdir_name)
2353
+ assert os.path.exists(unzipped_in_same_dir_path)
2354
+ with open(unzipped_in_same_dir_path, 'r') as f:
2355
+ assert f.read() == "sub dir content"
2356
+
2357
+
2358
+ def test_zip_folder(self):
2359
+ """
2360
+ Test the zip_folder function.
2361
+ """
2362
+
2363
+ folder_to_zip = os.path.join(self.test_dir, "folder_to_zip")
2364
+ os.makedirs(folder_to_zip, exist_ok=True)
2365
+
2366
+ file1_name = "file1.txt"; path1 = os.path.join(folder_to_zip, file1_name)
2367
+ file2_name = "file2.log"; path2 = os.path.join(folder_to_zip, file2_name)
2368
+ subdir_name = "sub"; subdir_path = os.path.join(folder_to_zip, subdir_name)
2369
+ os.makedirs(subdir_path, exist_ok=True)
2370
+ file3_name = "file3.dat"; path3 = os.path.join(subdir_path, file3_name)
2371
+
2372
+ content1 = "content1"; content2 = "content2"; content3 = "content3"
2373
+ with open(path1, 'w') as f: f.write(content1)
2374
+ with open(path2, 'w') as f: f.write(content2)
2375
+ with open(path3, 'w') as f: f.write(content3)
2376
+
2377
+ default_zip_path = folder_to_zip + ".zip"
2378
+ zip_folder(folder_to_zip, output_fn=None, overwrite=True)
2379
+ assert os.path.exists(default_zip_path)
2380
+
2381
+ unzip_output_dir = os.path.join(self.test_dir, "unzipped_folder_content")
2382
+ os.makedirs(unzip_output_dir, exist_ok=True)
2383
+ unzip_file(default_zip_path, unzip_output_dir)
2384
+
2385
+ assert os.path.exists(os.path.join(unzip_output_dir, file1_name))
2386
+ assert os.path.exists(os.path.join(unzip_output_dir, file2_name))
2387
+ assert os.path.exists(os.path.join(unzip_output_dir, subdir_name, file3_name))
2388
+ with open(os.path.join(unzip_output_dir, file1_name), 'r')as f: assert f.read() == content1
2389
+ with open(os.path.join(unzip_output_dir, file2_name), 'r')as f: assert f.read() == content2
2390
+ with open(os.path.join(unzip_output_dir, subdir_name, file3_name), 'r')as f: assert f.read() == content3
2391
+
2392
+ mtime_before = os.path.getmtime(default_zip_path)
2393
+ zip_folder(folder_to_zip, output_fn=None, overwrite=False)
2394
+ mtime_after = os.path.getmtime(default_zip_path)
2395
+ assert mtime_before == mtime_after
2396
+
2397
+
2398
+ def test_zip_files_into_single_zipfile(self):
2399
+ """
2400
+ Test zip_files_into_single_zipfile.
2401
+ """
2402
+
2403
+ file1_path = os.path.join(self.test_dir, "zfs_file1.txt")
2404
+ content1 = "content for zfs1"
2405
+ with open(file1_path, 'w') as f: f.write(content1)
2406
+
2407
+ subdir_for_zfs = os.path.join(self.test_dir, "zfs_subdir")
2408
+ os.makedirs(subdir_for_zfs, exist_ok=True)
2409
+ file2_path = os.path.join(subdir_for_zfs, "zfs_file2.log")
2410
+ content2 = "content for zfs2"
2411
+ with open(file2_path, 'w') as f: f.write(content2)
2412
+
2413
+ input_files = [file1_path, file2_path]
2414
+ output_zip_path = os.path.join(self.test_dir, "multi_file_archive.zip")
2415
+ zip_files_into_single_zipfile(input_files, output_zip_path, arc_name_base=self.test_dir, overwrite=True)
2416
+ assert os.path.exists(output_zip_path)
2417
+
2418
+ unzip_dir = os.path.join(self.test_dir, "unzip_multi_file")
2419
+ os.makedirs(unzip_dir, exist_ok=True)
2420
+ unzip_file(output_zip_path, unzip_dir)
2421
+
2422
+ expected_unzipped_file1 = os.path.join(unzip_dir, os.path.relpath(file1_path, self.test_dir))
2423
+ expected_unzipped_file2 = os.path.join(unzip_dir, os.path.relpath(file2_path, self.test_dir))
2424
+
2425
+ assert os.path.exists(expected_unzipped_file1)
2426
+ with open(expected_unzipped_file1, 'r') as f: assert f.read() == content1
2427
+ assert os.path.exists(expected_unzipped_file2)
2428
+ assert os.path.basename(expected_unzipped_file2) == "zfs_file2.log"
2429
+ assert os.path.basename(os.path.dirname(expected_unzipped_file2)) == "zfs_subdir"
2430
+ with open(expected_unzipped_file2, 'r') as f: assert f.read() == content2
2431
+
2432
+
2433
+ def test_add_files_to_single_tar_file(self):
2434
+ """
2435
+ Test add_files_to_single_tar_file.
2436
+ """
2437
+
2438
+ file1_path = os.path.join(self.test_dir, "tar_file1.txt")
2439
+ content1 = "content for tar1"
2440
+ with open(file1_path, 'w') as f: f.write(content1)
2441
+
2442
+ subdir_for_tar = os.path.join(self.test_dir, "tar_subdir")
2443
+ os.makedirs(subdir_for_tar, exist_ok=True)
2444
+ file2_path = os.path.join(subdir_for_tar, "tar_file2.log")
2445
+ content2 = "content for tar2"
2446
+ with open(file2_path, 'w') as f: f.write(content2)
2447
+
2448
+ input_files = [file1_path, file2_path]
2449
+ output_tar_path = os.path.join(self.test_dir, "archive.tar.gz")
2450
+
2451
+ add_files_to_single_tar_file(input_files, output_tar_path, arc_name_base=self.test_dir,
2452
+ overwrite=True, mode='x:gz')
2453
+ assert os.path.exists(output_tar_path)
2454
+
2455
+ un_tar_dir = os.path.join(self.test_dir, "un_tar_contents")
2456
+ os.makedirs(un_tar_dir, exist_ok=True)
2457
+ with tarfile.open(output_tar_path, 'r:gz') as tf:
2458
+ tf.extractall(path=un_tar_dir)
2459
+
2460
+ expected_untarred_file1 = os.path.join(un_tar_dir, os.path.relpath(file1_path, self.test_dir))
2461
+ expected_untarred_file2 = os.path.join(un_tar_dir, os.path.relpath(file2_path, self.test_dir))
2462
+
2463
+ assert os.path.exists(expected_untarred_file1)
2464
+ with open(expected_untarred_file1, 'r') as f: assert f.read() == content1
2465
+ assert os.path.exists(expected_untarred_file2)
2466
+ with open(expected_untarred_file2, 'r') as f: assert f.read() == content2
2467
+
2468
+
2469
+     def test_parallel_zip_individual_files_and_folders(self):
+         """
+         Test parallel_zip_files, parallel_zip_folders, and zip_each_file_in_folder.
+         """
+
+         # Zip two individual files in parallel; each should produce <file>.zip
+         file1_to_zip = os.path.join(self.test_dir, "pz_file1.txt")
+         file2_to_zip = os.path.join(self.test_dir, "pz_file2.txt")
+         with open(file1_to_zip, 'w') as f:
+             f.write("pz_content1")
+         with open(file2_to_zip, 'w') as f:
+             f.write("pz_content2")
+
+         parallel_zip_files([file1_to_zip, file2_to_zip], max_workers=1, overwrite=True)
+         assert os.path.exists(file1_to_zip + ".zip")
+         assert os.path.exists(file2_to_zip + ".zip")
+         unzip_dir_pz = os.path.join(self.test_dir, "unzip_pz")
+         unzip_file(file1_to_zip + ".zip", unzip_dir_pz)
+         assert os.path.exists(os.path.join(unzip_dir_pz, os.path.basename(file1_to_zip)))
+
+         # Zip two folders in parallel; each should produce <folder>.zip
+         folder1_to_zip = os.path.join(self.test_dir, "pz_folder1")
+         os.makedirs(folder1_to_zip, exist_ok=True)
+         with open(os.path.join(folder1_to_zip, "pf1.txt"), 'w') as f:
+             f.write("pf1_content")
+         folder2_to_zip = os.path.join(self.test_dir, "pz_folder2")
+         os.makedirs(folder2_to_zip, exist_ok=True)
+         with open(os.path.join(folder2_to_zip, "pf2.txt"), 'w') as f:
+             f.write("pf2_content")
+
+         parallel_zip_folders([folder1_to_zip, folder2_to_zip], max_workers=1, overwrite=True)
+         assert os.path.exists(folder1_to_zip + ".zip")
+         assert os.path.exists(folder2_to_zip + ".zip")
+         unzip_dir_pzf = os.path.join(self.test_dir, "unzip_pzf")
+         unzip_file(folder1_to_zip + ".zip", unzip_dir_pzf)
+         assert os.path.exists(os.path.join(unzip_dir_pzf, "pf1.txt"))
+
+         # Set up a folder tree for zip_each_file_in_folder: three files at the
+         # top level (one of them already a .zip) and one file in a subdirectory
+         zef_folder = os.path.join(self.test_dir, "zef_test_folder")
+         os.makedirs(zef_folder, exist_ok=True)
+         zef_file1 = os.path.join(zef_folder, "zef1.txt")
+         zef_file2_png = os.path.join(zef_folder, "zef2.png")
+         zef_file3_zip = os.path.join(zef_folder, "zef3.zip")
+         zef_subdir = os.path.join(zef_folder, "zef_sub")
+         os.makedirs(zef_subdir, exist_ok=True)
+         zef_file_in_sub = os.path.join(zef_subdir, "zef_subfile.txt")
+
+         for p_path in [zef_file1, zef_file2_png, zef_file3_zip, zef_file_in_sub]:
+             with open(p_path, 'w') as f:
+                 f.write(f"content of {os.path.basename(p_path)}")
+
+         # Non-recursive: top-level files are zipped; existing .zip files and
+         # files in subdirectories are skipped
+         zip_each_file_in_folder(zef_folder, recursive=False, max_workers=1, overwrite=True)
+         assert os.path.exists(zef_file1 + ".zip")
+         assert os.path.exists(zef_file2_png + ".zip")
+         assert not os.path.exists(zef_file3_zip + ".zip")
+         assert not os.path.exists(zef_file_in_sub + ".zip")
+
+         if os.path.exists(zef_file1 + ".zip"):
+             os.remove(zef_file1 + ".zip")
+         if os.path.exists(zef_file2_png + ".zip"):
+             os.remove(zef_file2_png + ".zip")
+
+         # Recursive: files in subdirectories are zipped as well
+         zip_each_file_in_folder(zef_folder, recursive=True, max_workers=1, overwrite=True)
+         assert os.path.exists(zef_file1 + ".zip")
+         assert os.path.exists(zef_file2_png + ".zip")
+         assert not os.path.exists(zef_file3_zip + ".zip")
+         assert os.path.exists(zef_file_in_sub + ".zip")
+
+         if os.path.exists(zef_file1 + ".zip"):
+             os.remove(zef_file1 + ".zip")
+         if os.path.exists(zef_file2_png + ".zip"):
+             os.remove(zef_file2_png + ".zip")
+         if os.path.exists(zef_file_in_sub + ".zip"):
+             os.remove(zef_file_in_sub + ".zip")
+
+         # required_token: only filenames containing the token are zipped
+         zip_each_file_in_folder(zef_folder, recursive=True, required_token="zef1", max_workers=1, overwrite=True)
+         assert os.path.exists(zef_file1 + ".zip")
+         assert not os.path.exists(zef_file2_png + ".zip")
+         assert not os.path.exists(zef_file_in_sub + ".zip")
+
+         if os.path.exists(zef_file1 + ".zip"):
+             os.remove(zef_file1 + ".zip")
+
+         # exclude_zip=False: existing .zip files are zipped too
+         dummy_to_zip = os.path.join(zef_folder, "dummy.txt")
+         with open(dummy_to_zip, 'w') as f:
+             f.write('d')
+         zip_each_file_in_folder(zef_folder, recursive=False, exclude_zip=False, max_workers=1, overwrite=True)
+         assert os.path.exists(dummy_to_zip + ".zip")
+         assert os.path.exists(zef_file3_zip + ".zip")
+         if os.path.exists(dummy_to_zip + ".zip"):
+             os.remove(dummy_to_zip + ".zip")
+         if os.path.exists(zef_file3_zip + ".zip"):
+             os.remove(zef_file3_zip + ".zip")
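+         # Editorial summary (hedged, inferred from the assertions above):
+         # zip_each_file_in_folder() appears to zip each file F to F + ".zip",
+         # skipping existing .zip files unless exclude_zip=False, skipping
+         # subdirectories unless recursive=True, and, when required_token is
+         # given, skipping filenames that don't contain the token.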
+
+
+     def test_compute_file_hash(self):
+         """
+         Test compute_file_hash and parallel_compute_file_hashes.
+         """
+
+         file1_name = "hash_me1.txt"
+         file1_path = os.path.join(self.test_dir, file1_name)
+         content1 = "This is a test string for hashing."
+         with open(file1_path, 'w') as f:
+             f.write(content1)
+
+         file2_name = "hash_me2.txt"
+         file2_path = os.path.join(self.test_dir, file2_name)
+         with open(file2_path, 'w') as f:
+             f.write(content1)
+
+         file3_name = "hash_me3.txt"
+         file3_path = os.path.join(self.test_dir, file3_name)
+         content3 = "This is a different test string for hashing."
+         with open(file3_path, 'w') as f:
+             f.write(content3)
+
+         expected_hash_content1_sha256 = \
+             "c56f19d76df6a09e49fe0d9ce7b1bc7f1dbd582f668742bede65c54c47d5bcf4".lower()
+         expected_hash_content3_sha256 = \
+             "23013ff7e93264317f7b2fc0e9a217649f2dc0b11ca7e0bd49632424b70b6680".lower()
+
+         # Identical content should hash identically; different content should not
+         hash1 = compute_file_hash(file1_path)
+         hash2 = compute_file_hash(file2_path)
+         hash3 = compute_file_hash(file3_path)
+         assert hash1 == expected_hash_content1_sha256
+         assert hash2 == expected_hash_content1_sha256
+         assert hash1 != hash3
+         assert hash3 == expected_hash_content3_sha256
+
+         expected_hash_content1_md5 = "94b971f1f8cdb23c2af82af73160d4b0".lower()
+         hash1_md5 = compute_file_hash(file1_path, algorithm='md5')
+         assert hash1_md5 == expected_hash_content1_md5
+
+         # Missing files: None with allow_failures=True, an exception otherwise
+         non_existent_path = os.path.join(self.test_dir, "no_such_file.txt")
+         assert compute_file_hash(non_existent_path, allow_failures=True) is None
+         try:
+             compute_file_hash(non_existent_path, allow_failures=False)
+             raise AssertionError("FileNotFoundError not raised for compute_file_hash")
+         except FileNotFoundError:
+             pass
+
+         files_to_hash = [file1_path, file3_path, non_existent_path]
+         hashes_parallel = parallel_compute_file_hashes(files_to_hash, max_workers=1)
+
+         # Normalize path separators so the comparison is platform-independent
+         norm_f1 = file1_path.replace('\\', '/')
+         norm_f3 = file3_path.replace('\\', '/')
+         norm_non = non_existent_path.replace('\\', '/')
+
+         expected_parallel_hashes = {
+             norm_f1: expected_hash_content1_sha256,
+             norm_f3: expected_hash_content3_sha256,
+             norm_non: None
+         }
+         hashes_parallel_norm = {k.replace('\\', '/'): v for k, v in hashes_parallel.items()}
+         assert hashes_parallel_norm == expected_parallel_hashes
+
+         # parallel_compute_file_hashes also accepts a folder
+         hash_folder = os.path.join(self.test_dir, "hash_test_folder")
+         os.makedirs(hash_folder, exist_ok=True)
+         h_f1_name = "h_f1.txt"
+         h_f1_path = os.path.join(hash_folder, h_f1_name)
+         h_f2_name = "h_f2.txt"
+         h_f2_path = os.path.join(hash_folder, h_f2_name)
+         with open(h_f1_path, 'w') as f:
+             f.write(content1)
+         with open(h_f2_path, 'w') as f:
+             f.write(content3)
+
+         hashes_folder_parallel = parallel_compute_file_hashes(hash_folder, recursive=False, max_workers=1)
+         norm_hf1 = h_f1_path.replace('\\', '/')
+         norm_hf2 = h_f2_path.replace('\\', '/')
+         expected_folder_hashes = {
+             norm_hf1: expected_hash_content1_sha256,
+             norm_hf2: expected_hash_content3_sha256
+         }
+         hashes_folder_parallel_norm = {k.replace('\\', '/'): v for k, v in hashes_folder_parallel.items()}
+         assert hashes_folder_parallel_norm == expected_folder_hashes
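+         # Editorial sketch (hedged, not from the original file): the expected
+         # digests above are consistent with compute_file_hash() defaulting to
+         # SHA-256 over the raw file bytes and returning a lowercase hex string,
+         # i.e., assuming that default:
+         #
+         #   import hashlib
+         #   with open(file1_path, 'rb') as f:
+         #       assert compute_file_hash(file1_path) == hashlib.sha256(f.read()).hexdigest()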
+
+
+ def test_path_utils():
+     """
+     Runs all tests in the TestPathUtils class.
+     """
+
+     test_instance = TestPathUtils()
+     test_instance.set_up()
+     try:
+         test_instance.test_is_image_file()
+         test_instance.test_find_image_strings()
+         test_instance.test_find_images()
+         test_instance.test_recursive_file_list_and_file_list()
+         test_instance.test_folder_list()
+         test_instance.test_folder_summary()
+         test_instance.test_fileparts()
+         test_instance.test_insert_before_extension()
+         test_instance.test_split_path()
+         test_instance.test_path_is_abs()
+         test_instance.test_safe_create_link_unix()
+         test_instance.test_remove_empty_folders()
+         test_instance.test_path_join()
+         test_instance.test_filename_cleaning()
+         test_instance.test_is_executable()
+         test_instance.test_write_read_list_to_file()
+         test_instance.test_parallel_copy_files()
+         test_instance.test_get_file_sizes()
+         test_instance.test_zip_file_and_unzip_file()
+         test_instance.test_zip_folder()
+         test_instance.test_zip_files_into_single_zipfile()
+         test_instance.test_add_files_to_single_tar_file()
+         test_instance.test_parallel_zip_individual_files_and_folders()
+         test_instance.test_compute_file_hash()
+     finally:
+         test_instance.tear_down()
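+ # Editorial note (hedged): this driver lets the suite run without a test
+ # runner, e.g. by calling test_path_utils() directly; set_up() presumably
+ # creates the shared temporary directory (self.test_dir) and tear_down()
+ # removes it, so the try/finally guarantees cleanup even when an assertion
+ # fails partway through.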