megadetector 5.0.28__py3-none-any.whl → 10.0.0__py3-none-any.whl
This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release: this version of megadetector might be problematic.
- megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +2 -2
- megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +1 -1
- megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +1 -1
- megadetector/classification/aggregate_classifier_probs.py +3 -3
- megadetector/classification/analyze_failed_images.py +5 -5
- megadetector/classification/cache_batchapi_outputs.py +5 -5
- megadetector/classification/create_classification_dataset.py +11 -12
- megadetector/classification/crop_detections.py +10 -10
- megadetector/classification/csv_to_json.py +8 -8
- megadetector/classification/detect_and_crop.py +13 -15
- megadetector/classification/efficientnet/model.py +8 -8
- megadetector/classification/efficientnet/utils.py +6 -5
- megadetector/classification/evaluate_model.py +7 -7
- megadetector/classification/identify_mislabeled_candidates.py +6 -6
- megadetector/classification/json_to_azcopy_list.py +1 -1
- megadetector/classification/json_validator.py +29 -32
- megadetector/classification/map_classification_categories.py +9 -9
- megadetector/classification/merge_classification_detection_output.py +12 -9
- megadetector/classification/prepare_classification_script.py +19 -19
- megadetector/classification/prepare_classification_script_mc.py +26 -26
- megadetector/classification/run_classifier.py +4 -4
- megadetector/classification/save_mislabeled.py +6 -6
- megadetector/classification/train_classifier.py +1 -1
- megadetector/classification/train_classifier_tf.py +9 -9
- megadetector/classification/train_utils.py +10 -10
- megadetector/data_management/annotations/annotation_constants.py +1 -2
- megadetector/data_management/camtrap_dp_to_coco.py +79 -46
- megadetector/data_management/cct_json_utils.py +103 -103
- megadetector/data_management/cct_to_md.py +49 -49
- megadetector/data_management/cct_to_wi.py +33 -33
- megadetector/data_management/coco_to_labelme.py +75 -75
- megadetector/data_management/coco_to_yolo.py +210 -193
- megadetector/data_management/databases/add_width_and_height_to_db.py +86 -12
- megadetector/data_management/databases/combine_coco_camera_traps_files.py +40 -40
- megadetector/data_management/databases/integrity_check_json_db.py +228 -200
- megadetector/data_management/databases/subset_json_db.py +33 -33
- megadetector/data_management/generate_crops_from_cct.py +88 -39
- megadetector/data_management/get_image_sizes.py +54 -49
- megadetector/data_management/labelme_to_coco.py +133 -125
- megadetector/data_management/labelme_to_yolo.py +159 -73
- megadetector/data_management/lila/create_lila_blank_set.py +81 -83
- megadetector/data_management/lila/create_lila_test_set.py +32 -31
- megadetector/data_management/lila/create_links_to_md_results_files.py +18 -18
- megadetector/data_management/lila/download_lila_subset.py +21 -24
- megadetector/data_management/lila/generate_lila_per_image_labels.py +365 -107
- megadetector/data_management/lila/get_lila_annotation_counts.py +35 -33
- megadetector/data_management/lila/get_lila_image_counts.py +22 -22
- megadetector/data_management/lila/lila_common.py +73 -70
- megadetector/data_management/lila/test_lila_metadata_urls.py +28 -19
- megadetector/data_management/mewc_to_md.py +344 -340
- megadetector/data_management/ocr_tools.py +262 -255
- megadetector/data_management/read_exif.py +249 -227
- megadetector/data_management/remap_coco_categories.py +90 -28
- megadetector/data_management/remove_exif.py +81 -21
- megadetector/data_management/rename_images.py +187 -187
- megadetector/data_management/resize_coco_dataset.py +588 -120
- megadetector/data_management/speciesnet_to_md.py +41 -41
- megadetector/data_management/wi_download_csv_to_coco.py +55 -55
- megadetector/data_management/yolo_output_to_md_output.py +248 -122
- megadetector/data_management/yolo_to_coco.py +333 -191
- megadetector/detection/change_detection.py +832 -0
- megadetector/detection/process_video.py +340 -337
- megadetector/detection/pytorch_detector.py +358 -278
- megadetector/detection/run_detector.py +399 -186
- megadetector/detection/run_detector_batch.py +404 -377
- megadetector/detection/run_inference_with_yolov5_val.py +340 -327
- megadetector/detection/run_tiled_inference.py +257 -249
- megadetector/detection/tf_detector.py +24 -24
- megadetector/detection/video_utils.py +332 -295
- megadetector/postprocessing/add_max_conf.py +19 -11
- megadetector/postprocessing/categorize_detections_by_size.py +45 -45
- megadetector/postprocessing/classification_postprocessing.py +468 -433
- megadetector/postprocessing/combine_batch_outputs.py +23 -23
- megadetector/postprocessing/compare_batch_results.py +590 -525
- megadetector/postprocessing/convert_output_format.py +106 -102
- megadetector/postprocessing/create_crop_folder.py +347 -147
- megadetector/postprocessing/detector_calibration.py +173 -168
- megadetector/postprocessing/generate_csv_report.py +508 -499
- megadetector/postprocessing/load_api_results.py +48 -27
- megadetector/postprocessing/md_to_coco.py +133 -102
- megadetector/postprocessing/md_to_labelme.py +107 -90
- megadetector/postprocessing/md_to_wi.py +40 -40
- megadetector/postprocessing/merge_detections.py +92 -114
- megadetector/postprocessing/postprocess_batch_results.py +319 -301
- megadetector/postprocessing/remap_detection_categories.py +91 -38
- megadetector/postprocessing/render_detection_confusion_matrix.py +214 -205
- megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +57 -57
- megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +27 -28
- megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +704 -679
- megadetector/postprocessing/separate_detections_into_folders.py +226 -211
- megadetector/postprocessing/subset_json_detector_output.py +265 -262
- megadetector/postprocessing/top_folders_to_bottom.py +45 -45
- megadetector/postprocessing/validate_batch_results.py +70 -70
- megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +52 -52
- megadetector/taxonomy_mapping/map_new_lila_datasets.py +18 -19
- megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +54 -33
- megadetector/taxonomy_mapping/preview_lila_taxonomy.py +67 -67
- megadetector/taxonomy_mapping/retrieve_sample_image.py +16 -16
- megadetector/taxonomy_mapping/simple_image_download.py +8 -8
- megadetector/taxonomy_mapping/species_lookup.py +156 -74
- megadetector/taxonomy_mapping/taxonomy_csv_checker.py +14 -14
- megadetector/taxonomy_mapping/taxonomy_graph.py +10 -10
- megadetector/taxonomy_mapping/validate_lila_category_mappings.py +13 -13
- megadetector/utils/ct_utils.py +1049 -211
- megadetector/utils/directory_listing.py +21 -77
- megadetector/utils/gpu_test.py +22 -22
- megadetector/utils/md_tests.py +632 -529
- megadetector/utils/path_utils.py +1520 -431
- megadetector/utils/process_utils.py +41 -41
- megadetector/utils/split_locations_into_train_val.py +62 -62
- megadetector/utils/string_utils.py +148 -27
- megadetector/utils/url_utils.py +489 -176
- megadetector/utils/wi_utils.py +2658 -2526
- megadetector/utils/write_html_image_list.py +137 -137
- megadetector/visualization/plot_utils.py +34 -30
- megadetector/visualization/render_images_with_thumbnails.py +39 -74
- megadetector/visualization/visualization_utils.py +487 -435
- megadetector/visualization/visualize_db.py +232 -198
- megadetector/visualization/visualize_detector_output.py +82 -76
- {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/METADATA +5 -2
- megadetector-10.0.0.dist-info/RECORD +139 -0
- {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/WHEEL +1 -1
- megadetector/api/batch_processing/api_core/__init__.py +0 -0
- megadetector/api/batch_processing/api_core/batch_service/__init__.py +0 -0
- megadetector/api/batch_processing/api_core/batch_service/score.py +0 -439
- megadetector/api/batch_processing/api_core/server.py +0 -294
- megadetector/api/batch_processing/api_core/server_api_config.py +0 -97
- megadetector/api/batch_processing/api_core/server_app_config.py +0 -55
- megadetector/api/batch_processing/api_core/server_batch_job_manager.py +0 -220
- megadetector/api/batch_processing/api_core/server_job_status_table.py +0 -149
- megadetector/api/batch_processing/api_core/server_orchestration.py +0 -360
- megadetector/api/batch_processing/api_core/server_utils.py +0 -88
- megadetector/api/batch_processing/api_core_support/__init__.py +0 -0
- megadetector/api/batch_processing/api_core_support/aggregate_results_manually.py +0 -46
- megadetector/api/batch_processing/api_support/__init__.py +0 -0
- megadetector/api/batch_processing/api_support/summarize_daily_activity.py +0 -152
- megadetector/api/batch_processing/data_preparation/__init__.py +0 -0
- megadetector/api/synchronous/__init__.py +0 -0
- megadetector/api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
- megadetector/api/synchronous/api_core/animal_detection_api/api_backend.py +0 -151
- megadetector/api/synchronous/api_core/animal_detection_api/api_frontend.py +0 -263
- megadetector/api/synchronous/api_core/animal_detection_api/config.py +0 -35
- megadetector/api/synchronous/api_core/tests/__init__.py +0 -0
- megadetector/api/synchronous/api_core/tests/load_test.py +0 -110
- megadetector/data_management/importers/add_nacti_sizes.py +0 -52
- megadetector/data_management/importers/add_timestamps_to_icct.py +0 -79
- megadetector/data_management/importers/animl_results_to_md_results.py +0 -158
- megadetector/data_management/importers/auckland_doc_test_to_json.py +0 -373
- megadetector/data_management/importers/auckland_doc_to_json.py +0 -201
- megadetector/data_management/importers/awc_to_json.py +0 -191
- megadetector/data_management/importers/bellevue_to_json.py +0 -272
- megadetector/data_management/importers/cacophony-thermal-importer.py +0 -793
- megadetector/data_management/importers/carrizo_shrubfree_2018.py +0 -269
- megadetector/data_management/importers/carrizo_trail_cam_2017.py +0 -289
- megadetector/data_management/importers/cct_field_adjustments.py +0 -58
- megadetector/data_management/importers/channel_islands_to_cct.py +0 -913
- megadetector/data_management/importers/eMammal/copy_and_unzip_emammal.py +0 -180
- megadetector/data_management/importers/eMammal/eMammal_helpers.py +0 -249
- megadetector/data_management/importers/eMammal/make_eMammal_json.py +0 -223
- megadetector/data_management/importers/ena24_to_json.py +0 -276
- megadetector/data_management/importers/filenames_to_json.py +0 -386
- megadetector/data_management/importers/helena_to_cct.py +0 -283
- megadetector/data_management/importers/idaho-camera-traps.py +0 -1407
- megadetector/data_management/importers/idfg_iwildcam_lila_prep.py +0 -294
- megadetector/data_management/importers/import_desert_lion_conservation_camera_traps.py +0 -387
- megadetector/data_management/importers/jb_csv_to_json.py +0 -150
- megadetector/data_management/importers/mcgill_to_json.py +0 -250
- megadetector/data_management/importers/missouri_to_json.py +0 -490
- megadetector/data_management/importers/nacti_fieldname_adjustments.py +0 -79
- megadetector/data_management/importers/noaa_seals_2019.py +0 -181
- megadetector/data_management/importers/osu-small-animals-to-json.py +0 -364
- megadetector/data_management/importers/pc_to_json.py +0 -365
- megadetector/data_management/importers/plot_wni_giraffes.py +0 -123
- megadetector/data_management/importers/prepare_zsl_imerit.py +0 -131
- megadetector/data_management/importers/raic_csv_to_md_results.py +0 -416
- megadetector/data_management/importers/rspb_to_json.py +0 -356
- megadetector/data_management/importers/save_the_elephants_survey_A.py +0 -320
- megadetector/data_management/importers/save_the_elephants_survey_B.py +0 -329
- megadetector/data_management/importers/snapshot_safari_importer.py +0 -758
- megadetector/data_management/importers/snapshot_serengeti_lila.py +0 -1067
- megadetector/data_management/importers/snapshotserengeti/make_full_SS_json.py +0 -150
- megadetector/data_management/importers/snapshotserengeti/make_per_season_SS_json.py +0 -153
- megadetector/data_management/importers/sulross_get_exif.py +0 -65
- megadetector/data_management/importers/timelapse_csv_set_to_json.py +0 -490
- megadetector/data_management/importers/ubc_to_json.py +0 -399
- megadetector/data_management/importers/umn_to_json.py +0 -507
- megadetector/data_management/importers/wellington_to_json.py +0 -263
- megadetector/data_management/importers/wi_to_json.py +0 -442
- megadetector/data_management/importers/zamba_results_to_md_results.py +0 -180
- megadetector/data_management/lila/add_locations_to_island_camera_traps.py +0 -101
- megadetector/data_management/lila/add_locations_to_nacti.py +0 -151
- megadetector/utils/azure_utils.py +0 -178
- megadetector/utils/sas_blob_utils.py +0 -509
- megadetector-5.0.28.dist-info/RECORD +0 -209
- /megadetector/{api/batch_processing/__init__.py → __init__.py} +0 -0
- {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/licenses/LICENSE +0 -0
- {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/top_level.txt +0 -0
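Note that the megadetector.api.* tree, the data_management/importers scripts, and the Azure helpers (azure_utils.py, sas_blob_utils.py) listed above are removed entirely in 10.0.0. A minimal, hypothetical upgrade guard for code that still imports the removed Azure helpers (the fallback behavior here is illustrative, not part of the package):

    # Hypothetical guard for code migrating from 5.0.28: azure_utils exists in
    # 5.0.28 but is removed from the 10.0.0 wheel, so treat it as optional.
    try:
        from megadetector.utils import azure_utils
    except ImportError:
        azure_utils = None  # removed in 10.0.0; use azure-storage-blob directly if needed

    if azure_utils is None:
        print('megadetector.utils.azure_utils is unavailable in this version')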
megadetector/utils/path_utils.py
CHANGED
|
@@ -34,6 +34,7 @@ from shutil import which
|
|
|
34
34
|
from tqdm import tqdm
|
|
35
35
|
|
|
36
36
|
from megadetector.utils.ct_utils import is_iterable
|
|
37
|
+
from megadetector.utils.ct_utils import make_test_folder
|
|
37
38
|
from megadetector.utils.ct_utils import sort_dictionary_by_value
|
|
38
39
|
|
|
39
40
|
# Should all be lower-case
|
|
@@ -47,14 +48,14 @@ CHAR_LIMIT = 255
|
|
|
47
48
|
|
|
48
49
|
#%% General path functions
|
|
49
50
|
|
|
50
|
-
def recursive_file_list(base_dir,
|
|
51
|
-
convert_slashes=True,
|
|
52
|
-
return_relative_paths=False,
|
|
51
|
+
def recursive_file_list(base_dir,
|
|
52
|
+
convert_slashes=True,
|
|
53
|
+
return_relative_paths=False,
|
|
53
54
|
sort_files=True,
|
|
54
55
|
recursive=True):
|
|
55
56
|
r"""
|
|
56
57
|
Enumerates files (not directories) in [base_dir].
|
|
57
|
-
|
|
58
|
+
|
|
58
59
|
Args:
|
|
59
60
|
base_dir (str): folder to enumerate
|
|
60
61
|
convert_slashes (bool, optional): force forward slashes; if this is False, will use
|
|
@@ -64,15 +65,15 @@ def recursive_file_list(base_dir,
|
|
|
64
65
|
sort_files (bool, optional): force files to be sorted, otherwise uses the sorting
|
|
65
66
|
provided by os.walk()
|
|
66
67
|
recursive (bool, optional): enumerate recursively
|
|
67
|
-
|
|
68
|
+
|
|
68
69
|
Returns:
|
|
69
70
|
list: list of filenames
|
|
70
71
|
"""
|
|
71
|
-
|
|
72
|
+
|
|
72
73
|
assert os.path.isdir(base_dir), '{} is not a folder'.format(base_dir)
|
|
73
|
-
|
|
74
|
+
|
|
74
75
|
all_files = []
|
|
75
|
-
|
|
76
|
+
|
|
76
77
|
if recursive:
|
|
77
78
|
for root, _, filenames in os.walk(base_dir):
|
|
78
79
|
for filename in filenames:
|
|
@@ -82,29 +83,29 @@ def recursive_file_list(base_dir,
|
|
|
82
83
|
all_files_relative = os.listdir(base_dir)
|
|
83
84
|
all_files = [os.path.join(base_dir,fn) for fn in all_files_relative]
|
|
84
85
|
all_files = [fn for fn in all_files if os.path.isfile(fn)]
|
|
85
|
-
|
|
86
|
+
|
|
86
87
|
if return_relative_paths:
|
|
87
88
|
all_files = [os.path.relpath(fn,base_dir) for fn in all_files]
|
|
88
89
|
|
|
89
90
|
if convert_slashes:
|
|
90
91
|
all_files = [fn.replace('\\', '/') for fn in all_files]
|
|
91
|
-
|
|
92
|
+
|
|
92
93
|
if sort_files:
|
|
93
94
|
all_files = sorted(all_files)
|
|
94
|
-
|
|
95
|
+
|
|
95
96
|
return all_files
|
|
96
97
|
|
|
97
98
|
|
|
98
|
-
def file_list(base_dir,
|
|
99
|
+
def file_list(base_dir,
|
|
99
100
|
convert_slashes=True,
|
|
100
|
-
return_relative_paths=False,
|
|
101
|
-
sort_files=True,
|
|
101
|
+
return_relative_paths=False,
|
|
102
|
+
sort_files=True,
|
|
102
103
|
recursive=False):
|
|
103
104
|
"""
|
|
104
|
-
Trivial wrapper for recursive_file_list, which was a poor function name choice
|
|
105
|
-
at the time, since I later wanted to add non-recursive lists, but it doesn't
|
|
105
|
+
Trivial wrapper for recursive_file_list, which was a poor function name choice
|
|
106
|
+
at the time, since I later wanted to add non-recursive lists, but it doesn't
|
|
106
107
|
make sense to have a "recursive" option in a function called "recursive_file_list".
|
|
107
|
-
|
|
108
|
+
|
|
108
109
|
Args:
|
|
109
110
|
base_dir (str): folder to enumerate
|
|
110
111
|
convert_slashes (bool, optional): force forward slashes; if this is False, will use
|
|
@@ -114,11 +115,11 @@ def file_list(base_dir,
|
|
|
114
115
|
sort_files (bool, optional): force files to be sorted, otherwise uses the sorting
|
|
115
116
|
provided by os.walk()
|
|
116
117
|
recursive (bool, optional): enumerate recursively
|
|
117
|
-
|
|
118
|
+
|
|
118
119
|
Returns:
|
|
119
|
-
list: list of filenames
|
|
120
|
+
list: list of filenames
|
|
120
121
|
"""
|
|
121
|
-
|
|
122
|
+
|
|
122
123
|
return recursive_file_list(base_dir,convert_slashes,return_relative_paths,sort_files,
|
|
123
124
|
recursive=recursive)
|
|
124
125
|
|
|
@@ -128,94 +129,93 @@ def folder_list(base_dir,
|
|
|
128
129
|
return_relative_paths=False,
|
|
129
130
|
sort_folders=True,
|
|
130
131
|
recursive=False):
|
|
131
|
-
|
|
132
132
|
"""
|
|
133
133
|
Enumerates folders (not files) in [base_dir].
|
|
134
|
-
|
|
134
|
+
|
|
135
135
|
Args:
|
|
136
136
|
base_dir (str): folder to enumerate
|
|
137
137
|
convert_slashes (bool, optional): force forward slashes; if this is False, will use
|
|
138
138
|
the native path separator
|
|
139
139
|
return_relative_paths (bool, optional): return paths that are relative to [base_dir],
|
|
140
140
|
rather than absolute paths
|
|
141
|
-
|
|
141
|
+
sort_folders (bool, optional): force folders to be sorted, otherwise uses the sorting
|
|
142
142
|
provided by os.walk()
|
|
143
143
|
recursive (bool, optional): enumerate recursively
|
|
144
|
-
|
|
144
|
+
|
|
145
145
|
Returns:
|
|
146
146
|
list: list of folder names
|
|
147
147
|
"""
|
|
148
|
-
|
|
148
|
+
|
|
149
149
|
assert os.path.isdir(base_dir), '{} is not a folder'.format(base_dir)
|
|
150
|
-
|
|
150
|
+
|
|
151
151
|
folders = []
|
|
152
152
|
|
|
153
|
-
if recursive:
|
|
153
|
+
if recursive:
|
|
154
154
|
folders = []
|
|
155
155
|
for root, dirs, _ in os.walk(base_dir):
|
|
156
156
|
for d in dirs:
|
|
157
|
-
folders.append(os.path.join(root, d))
|
|
157
|
+
folders.append(os.path.join(root, d))
|
|
158
158
|
else:
|
|
159
159
|
folders = os.listdir(base_dir)
|
|
160
160
|
folders = [os.path.join(base_dir,fn) for fn in folders]
|
|
161
161
|
folders = [fn for fn in folders if os.path.isdir(fn)]
|
|
162
|
-
|
|
162
|
+
|
|
163
163
|
if return_relative_paths:
|
|
164
164
|
folders = [os.path.relpath(fn,base_dir) for fn in folders]
|
|
165
165
|
|
|
166
166
|
if convert_slashes:
|
|
167
167
|
folders = [fn.replace('\\', '/') for fn in folders]
|
|
168
|
-
|
|
168
|
+
|
|
169
169
|
if sort_folders:
|
|
170
|
-
folders = sorted(folders)
|
|
171
|
-
|
|
170
|
+
folders = sorted(folders)
|
|
171
|
+
|
|
172
172
|
return folders
|
|
173
173
|
|
|
174
174
|
|
|
175
175
|
def folder_summary(folder,print_summary=True):
|
|
176
176
|
"""
|
|
177
177
|
Returns (and optionally prints) a summary of [folder], including:
|
|
178
|
-
|
|
178
|
+
|
|
179
179
|
* The total number of files
|
|
180
180
|
* The total number of folders
|
|
181
|
-
* The number of files for each extension
|
|
182
|
-
|
|
181
|
+
* The number of files for each extension
|
|
182
|
+
|
|
183
183
|
Args:
|
|
184
184
|
folder (str): folder to summarize
|
|
185
185
|
print_summary (bool, optional): whether to print the summary
|
|
186
|
-
|
|
186
|
+
|
|
187
187
|
Returns:
|
|
188
188
|
dict: with fields "n_files", "n_folders", and "extension_to_count"
|
|
189
189
|
"""
|
|
190
|
-
|
|
190
|
+
|
|
191
191
|
assert os.path.isdir(folder), '{} is not a folder'.format(folder)
|
|
192
|
-
|
|
192
|
+
|
|
193
193
|
folders_relative = folder_list(folder,return_relative_paths=True,recursive=True)
|
|
194
194
|
files_relative = file_list(folder,return_relative_paths=True,recursive=True)
|
|
195
|
-
|
|
195
|
+
|
|
196
196
|
extension_to_count = defaultdict(int)
|
|
197
|
-
|
|
197
|
+
|
|
198
198
|
for fn in files_relative:
|
|
199
199
|
ext = os.path.splitext(fn)[1]
|
|
200
200
|
extension_to_count[ext] += 1
|
|
201
|
-
|
|
201
|
+
|
|
202
202
|
extension_to_count = sort_dictionary_by_value(extension_to_count,reverse=True)
|
|
203
|
-
|
|
203
|
+
|
|
204
204
|
if print_summary:
|
|
205
205
|
for extension in extension_to_count.keys():
|
|
206
206
|
print('{}: {}'.format(extension,extension_to_count[extension]))
|
|
207
207
|
print('')
|
|
208
208
|
print('Total files: {}'.format(len(files_relative)))
|
|
209
209
|
print('Total folders: {}'.format(len(folders_relative)))
|
|
210
|
-
|
|
210
|
+
|
|
211
211
|
to_return = {}
|
|
212
212
|
to_return['n_files'] = len(files_relative)
|
|
213
213
|
to_return['n_folders'] = len(folders_relative)
|
|
214
|
-
to_return['extension_to_count'] = extension_to_count
|
|
215
|
-
|
|
214
|
+
to_return['extension_to_count'] = extension_to_count
|
|
215
|
+
|
|
216
216
|
return to_return
|
|
217
|
-
|
|
218
|
-
|
|
217
|
+
|
|
218
|
+
|
|
219
219
|
def fileparts(path):
|
|
220
220
|
r"""
|
|
221
221
|
Breaks down a path into the directory path, filename, and extension.
|
|
@@ -223,25 +223,25 @@ def fileparts(path):
|
|
|
223
223
|
Note that the '.' lives with the extension, and separators are removed.
|
|
224
224
|
|
|
225
225
|
Examples:
|
|
226
|
-
|
|
226
|
+
|
|
227
227
|
.. code-block:: none
|
|
228
228
|
|
|
229
|
-
>>> fileparts('file')
|
|
229
|
+
>>> fileparts('file')
|
|
230
230
|
('', 'file', '')
|
|
231
231
|
>>> fileparts(r'c:/dir/file.jpg')
|
|
232
232
|
('c:/dir', 'file', '.jpg')
|
|
233
233
|
>>> fileparts('/dir/subdir/file.jpg')
|
|
234
|
-
('/dir/subdir', 'file', '.jpg')
|
|
234
|
+
('/dir/subdir', 'file', '.jpg')
|
|
235
235
|
|
|
236
236
|
Args:
|
|
237
237
|
path (str): path name to separate into parts
|
|
238
238
|
Returns:
|
|
239
|
-
tuple: tuple containing (p,n,e):
|
|
239
|
+
tuple: tuple containing (p,n,e):
|
|
240
240
|
- p: str, directory path
|
|
241
241
|
- n: str, filename without extension
|
|
242
242
|
- e: str, extension including the '.'
|
|
243
243
|
"""
|
|
244
|
-
|
|
244
|
+
|
|
245
245
|
# ntpath seems to do the right thing for both Windows and Unix paths
|
|
246
246
|
p = ntpath.dirname(path)
|
|
247
247
|
basename = ntpath.basename(path)
|
|
@@ -257,27 +257,27 @@ def insert_before_extension(filename, s=None, separator='.'):
|
|
|
257
257
|
appends [s].
|
|
258
258
|
|
|
259
259
|
Examples:
|
|
260
|
-
|
|
260
|
+
|
|
261
261
|
.. code-block:: none
|
|
262
|
-
|
|
262
|
+
|
|
263
263
|
>>> insert_before_extension('/dir/subdir/file.ext', 'insert')
|
|
264
264
|
'/dir/subdir/file.insert.ext'
|
|
265
265
|
>>> insert_before_extension('/dir/subdir/file', 'insert')
|
|
266
266
|
'/dir/subdir/file.insert'
|
|
267
267
|
>>> insert_before_extension('/dir/subdir/file')
|
|
268
268
|
'/dir/subdir/file.2020.07.20.10.54.38'
|
|
269
|
-
|
|
269
|
+
|
|
270
270
|
Args:
|
|
271
271
|
filename (str): filename to manipulate
|
|
272
272
|
s (str, optional): string to insert before the extension in [filename], or
|
|
273
273
|
None to insert a datestamp
|
|
274
274
|
separator (str, optional): separator to place between the filename base
|
|
275
275
|
and the inserted string
|
|
276
|
-
|
|
276
|
+
|
|
277
277
|
Returns:
|
|
278
278
|
str: modified string
|
|
279
279
|
"""
|
|
280
|
-
|
|
280
|
+
|
|
281
281
|
assert len(filename) > 0
|
|
282
282
|
if s is None or len(s) == 0:
|
|
283
283
|
s = datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
|
|
@@ -290,9 +290,9 @@ def split_path(path):
|
|
|
290
290
|
Splits [path] into all its constituent file/folder tokens.
|
|
291
291
|
|
|
292
292
|
Examples:
|
|
293
|
-
|
|
293
|
+
|
|
294
294
|
.. code-block:: none
|
|
295
|
-
|
|
295
|
+
|
|
296
296
|
>>> split_path(r'c:\dir\subdir\file.txt')
|
|
297
297
|
['c:\\', 'dir', 'subdir', 'file.txt']
|
|
298
298
|
>>> split_path('/dir/subdir/file.jpg')
|
|
@@ -301,14 +301,20 @@ def split_path(path):
|
|
|
301
301
|
['c:\\']
|
|
302
302
|
>>> split_path('/')
|
|
303
303
|
['/']
|
|
304
|
-
|
|
304
|
+
|
|
305
305
|
Args:
|
|
306
306
|
path (str): path to split into tokens
|
|
307
|
-
|
|
307
|
+
|
|
308
308
|
Returns:
|
|
309
309
|
list: list of path tokens
|
|
310
310
|
"""
|
|
311
|
-
|
|
311
|
+
|
|
312
|
+
# Edge cases
|
|
313
|
+
if path == '':
|
|
314
|
+
return ''
|
|
315
|
+
if path is None:
|
|
316
|
+
return None
|
|
317
|
+
|
|
312
318
|
parts = []
|
|
313
319
|
while True:
|
|
314
320
|
# ntpath seems to do the right thing for both Windows and Unix paths
|
|
@@ -325,68 +331,77 @@ def path_is_abs(p):
|
|
|
325
331
|
"""
|
|
326
332
|
Determines whether [p] is an absolute path. An absolute path is defined as
|
|
327
333
|
one that starts with slash, backslash, or a letter followed by a colon.
|
|
328
|
-
|
|
334
|
+
|
|
329
335
|
Args:
|
|
330
336
|
p (str): path to evaluate
|
|
331
|
-
|
|
337
|
+
|
|
332
338
|
Returns:
|
|
333
339
|
bool: True if [p] is an absolute path, else False
|
|
334
340
|
"""
|
|
335
|
-
|
|
341
|
+
|
|
336
342
|
return (len(p) > 1) and (p[0] == '/' or p[1] == ':' or p[0] == '\\')
|
|
337
343
|
|
|
338
344
|
|
|
339
345
|
def safe_create_link(link_exists,link_new):
|
|
340
346
|
"""
|
|
341
347
|
Creates a symlink at [link_new] pointing to [link_exists].
|
|
342
|
-
|
|
348
|
+
|
|
343
349
|
If [link_new] already exists, make sure it's a link (not a file),
|
|
344
350
|
and if it has a different target than [link_exists], removes and re-creates
|
|
345
351
|
it.
|
|
346
|
-
|
|
352
|
+
|
|
353
|
+
Creates a *real* directory if necessary.
|
|
354
|
+
|
|
347
355
|
Errors if [link_new] already exists but it's not a link.
|
|
348
|
-
|
|
356
|
+
|
|
349
357
|
Args:
|
|
350
358
|
link_exists (str): the source of the (possibly-new) symlink
|
|
351
359
|
link_new (str): the target of the (possibly-new) symlink
|
|
352
360
|
"""
|
|
353
|
-
|
|
361
|
+
|
|
362
|
+
# If the new file already exists...
|
|
354
363
|
if os.path.exists(link_new) or os.path.islink(link_new):
|
|
364
|
+
# Error if it's not already a link
|
|
355
365
|
assert os.path.islink(link_new)
|
|
366
|
+
# If it's already a link, and it points to the "exists" file,
|
|
367
|
+
# leave it alone, otherwise redirect it.
|
|
356
368
|
if not os.readlink(link_new) == link_exists:
|
|
357
369
|
os.remove(link_new)
|
|
358
370
|
os.symlink(link_exists,link_new)
|
|
359
371
|
else:
|
|
372
|
+
os.makedirs(os.path.dirname(link_new),exist_ok=True)
|
|
360
373
|
os.symlink(link_exists,link_new)
|
|
361
|
-
|
|
374
|
+
|
|
375
|
+
# ...def safe_create_link(...)
|
|
376
|
+
|
|
362
377
|
|
|
363
378
|
def remove_empty_folders(path, remove_root=False):
|
|
364
379
|
"""
|
|
365
380
|
Recursively removes empty folders within the specified path.
|
|
366
|
-
|
|
381
|
+
|
|
367
382
|
Args:
|
|
368
|
-
path (str): the folder from which we should recursively remove
|
|
383
|
+
path (str): the folder from which we should recursively remove
|
|
369
384
|
empty folders.
|
|
370
|
-
remove_root (bool, optional): whether to remove the root directory if
|
|
385
|
+
remove_root (bool, optional): whether to remove the root directory if
|
|
371
386
|
it's empty after removing all empty subdirectories. This will always
|
|
372
387
|
be True during recursive calls.
|
|
373
|
-
|
|
388
|
+
|
|
374
389
|
Returns:
|
|
375
390
|
bool: True if the directory is empty after processing, False otherwise
|
|
376
391
|
"""
|
|
377
|
-
|
|
392
|
+
|
|
378
393
|
# Verify that [path] is a directory
|
|
379
394
|
if not os.path.isdir(path):
|
|
380
395
|
return False
|
|
381
|
-
|
|
396
|
+
|
|
382
397
|
# Track whether the current directory is empty
|
|
383
398
|
is_empty = True
|
|
384
|
-
|
|
399
|
+
|
|
385
400
|
# Iterate through all items in the directory
|
|
386
401
|
for item in os.listdir(path):
|
|
387
|
-
|
|
402
|
+
|
|
388
403
|
item_path = os.path.join(path, item)
|
|
389
|
-
|
|
404
|
+
|
|
390
405
|
# If it's a directory, process it recursively
|
|
391
406
|
if os.path.isdir(item_path):
|
|
392
407
|
# If the subdirectory is empty after processing, it will be removed
|
|
@@ -396,76 +411,32 @@ def remove_empty_folders(path, remove_root=False):
|
|
|
396
411
|
else:
|
|
397
412
|
# If there's a file, the directory is not empty
|
|
398
413
|
is_empty = False
|
|
399
|
-
|
|
414
|
+
|
|
400
415
|
# If the directory is empty and we're supposed to remove it
|
|
401
416
|
if is_empty and remove_root:
|
|
402
417
|
try:
|
|
403
|
-
os.rmdir(path)
|
|
418
|
+
os.rmdir(path)
|
|
404
419
|
except Exception as e:
|
|
405
420
|
print('Error removing directory {}: {}'.format(path,str(e)))
|
|
406
421
|
is_empty = False
|
|
407
|
-
|
|
422
|
+
|
|
408
423
|
return is_empty
|
|
409
424
|
|
|
410
425
|
# ...def remove_empty_folders(...)
|
|
411
426
|
|
|
412
427
|
|
|
413
|
-
def top_level_folder(p):
|
|
414
|
-
r"""
|
|
415
|
-
Gets the top-level folder from the path *p*.
|
|
416
|
-
|
|
417
|
-
On UNIX, this is straightforward:
|
|
418
|
-
|
|
419
|
-
/blah/foo
|
|
420
|
-
|
|
421
|
-
...returns '/blah'
|
|
422
|
-
|
|
423
|
-
On Windows, we define this as the top-level folder that isn't the drive, so:
|
|
424
|
-
|
|
425
|
-
c:\blah\foo
|
|
426
|
-
|
|
427
|
-
...returns 'c:\blah'.
|
|
428
|
-
|
|
429
|
-
Args:
|
|
430
|
-
p (str): filename to evaluate
|
|
431
|
-
|
|
432
|
-
Returns:
|
|
433
|
-
str: the top-level folder in [p], see above for details on how this is defined
|
|
434
|
-
"""
|
|
435
|
-
|
|
436
|
-
if p == '':
|
|
437
|
-
return ''
|
|
438
|
-
|
|
439
|
-
# Path('/blah').parts is ('/','blah')
|
|
440
|
-
parts = split_path(p)
|
|
441
|
-
|
|
442
|
-
if len(parts) == 1:
|
|
443
|
-
return parts[0]
|
|
444
|
-
|
|
445
|
-
# Handle paths like:
|
|
446
|
-
#
|
|
447
|
-
# /, \, /stuff, c:, c:\stuff
|
|
448
|
-
drive = os.path.splitdrive(p)[0]
|
|
449
|
-
if parts[0] == drive or parts[0] == drive + '/' or parts[0] == drive + '\\' or parts[0] in ['\\', '/']:
|
|
450
|
-
return os.path.join(parts[0], parts[1])
|
|
451
|
-
else:
|
|
452
|
-
return parts[0]
|
|
453
|
-
|
|
454
|
-
# ...top_level_folder()
|
|
455
|
-
|
|
456
|
-
|
|
457
428
|
def path_join(*paths, convert_slashes=True):
|
|
458
429
|
r"""
|
|
459
430
|
Wrapper for os.path.join that optionally converts backslashes to forward slashes.
|
|
460
|
-
|
|
431
|
+
|
|
461
432
|
Args:
|
|
462
433
|
*paths (variable-length set of strings): Path components to be joined.
|
|
463
434
|
convert_slashes (bool, optional): whether to convert \\ to /
|
|
464
|
-
|
|
435
|
+
|
|
465
436
|
Returns:
|
|
466
437
|
A string with the joined path components.
|
|
467
438
|
"""
|
|
468
|
-
|
|
439
|
+
|
|
469
440
|
joined_path = os.path.join(*paths)
|
|
470
441
|
if convert_slashes:
|
|
471
442
|
return joined_path.replace('\\', '/')
|
|
@@ -473,41 +444,24 @@ def path_join(*paths, convert_slashes=True):
|
|
|
473
444
|
return joined_path
|
|
474
445
|
|
|
475
446
|
|
|
476
|
-
#%% Test driver for top_level_folder
|
|
477
|
-
|
|
478
|
-
if False:
|
|
479
|
-
|
|
480
|
-
#%%
|
|
481
|
-
|
|
482
|
-
p = 'blah/foo/bar'; s = top_level_folder(p); print(s); assert s == 'blah'
|
|
483
|
-
p = '/blah/foo/bar'; s = top_level_folder(p); print(s); assert s == '/blah'
|
|
484
|
-
p = 'bar'; s = top_level_folder(p); print(s); assert s == 'bar'
|
|
485
|
-
p = ''; s = top_level_folder(p); print(s); assert s == ''
|
|
486
|
-
p = 'c:\\'; s = top_level_folder(p); print(s); assert s == 'c:\\'
|
|
487
|
-
p = r'c:\blah'; s = top_level_folder(p); print(s); assert s == 'c:\\blah'
|
|
488
|
-
p = r'c:\foo'; s = top_level_folder(p); print(s); assert s == 'c:\\foo'
|
|
489
|
-
p = r'c:/foo'; s = top_level_folder(p); print(s); assert s == 'c:/foo'
|
|
490
|
-
p = r'c:\foo/bar'; s = top_level_folder(p); print(s); assert s == 'c:\\foo'
|
|
491
|
-
|
|
492
|
-
|
|
493
447
|
#%% Image-related path functions
|
|
494
448
|
|
|
495
449
|
def is_image_file(s, img_extensions=IMG_EXTENSIONS):
|
|
496
450
|
"""
|
|
497
451
|
Checks a file's extension against a hard-coded set of image file
|
|
498
452
|
extensions. Uses case-insensitive comparison.
|
|
499
|
-
|
|
453
|
+
|
|
500
454
|
Does not check whether the file exists, only determines whether the filename
|
|
501
455
|
implies it's an image file.
|
|
502
|
-
|
|
456
|
+
|
|
503
457
|
Args:
|
|
504
458
|
s (str): filename to evaluate for image-ness
|
|
505
459
|
img_extensions (list, optional): list of known image file extensions
|
|
506
|
-
|
|
460
|
+
|
|
507
461
|
Returns:
|
|
508
462
|
bool: True if [s] appears to be an image file, else False
|
|
509
463
|
"""
|
|
510
|
-
|
|
464
|
+
|
|
511
465
|
ext = os.path.splitext(s)[1]
|
|
512
466
|
return ext.lower() in img_extensions
|
|
513
467
|
|
|
@@ -516,27 +470,27 @@ def find_image_strings(strings):
|
|
|
516
470
|
"""
|
|
517
471
|
Given a list of strings that are potentially image file names, looks for
|
|
518
472
|
strings that actually look like image file names (based on extension).
|
|
519
|
-
|
|
473
|
+
|
|
520
474
|
Args:
|
|
521
475
|
strings (list): list of filenames to check for image-ness
|
|
522
|
-
|
|
476
|
+
|
|
523
477
|
Returns:
|
|
524
478
|
list: the subset of [strings] that appear to be image filenames
|
|
525
479
|
"""
|
|
526
|
-
|
|
480
|
+
|
|
527
481
|
return [s for s in strings if is_image_file(s)]
|
|
528
482
|
|
|
529
483
|
|
|
530
|
-
def find_images(dirname,
|
|
531
|
-
recursive=False,
|
|
532
|
-
return_relative_paths=False,
|
|
484
|
+
def find_images(dirname,
|
|
485
|
+
recursive=False,
|
|
486
|
+
return_relative_paths=False,
|
|
533
487
|
convert_slashes=True):
|
|
534
488
|
"""
|
|
535
489
|
Finds all files in a directory that look like image file names. Returns
|
|
536
490
|
absolute paths unless return_relative_paths is set. Uses the OS-native
|
|
537
491
|
path separator unless convert_slashes is set, in which case will always
|
|
538
492
|
use '/'.
|
|
539
|
-
|
|
493
|
+
|
|
540
494
|
Args:
|
|
541
495
|
dirname (str): the folder to search for images
|
|
542
496
|
recursive (bool, optional): whether to search recursively
|
|
@@ -547,30 +501,30 @@ def find_images(dirname,
|
|
|
547
501
|
Returns:
|
|
548
502
|
list: list of image filenames found in [dirname]
|
|
549
503
|
"""
|
|
550
|
-
|
|
504
|
+
|
|
551
505
|
assert os.path.isdir(dirname), '{} is not a folder'.format(dirname)
|
|
552
|
-
|
|
506
|
+
|
|
553
507
|
if recursive:
|
|
554
508
|
strings = glob.glob(os.path.join(dirname, '**', '*.*'), recursive=True)
|
|
555
509
|
else:
|
|
556
510
|
strings = glob.glob(os.path.join(dirname, '*.*'))
|
|
557
|
-
|
|
511
|
+
|
|
558
512
|
image_files = find_image_strings(strings)
|
|
559
|
-
|
|
513
|
+
|
|
560
514
|
if return_relative_paths:
|
|
561
515
|
image_files = [os.path.relpath(fn,dirname) for fn in image_files]
|
|
562
|
-
|
|
516
|
+
|
|
563
517
|
image_files = sorted(image_files)
|
|
564
|
-
|
|
518
|
+
|
|
565
519
|
if convert_slashes:
|
|
566
520
|
image_files = [fn.replace('\\', '/') for fn in image_files]
|
|
567
|
-
|
|
521
|
+
|
|
568
522
|
return image_files
|
|
569
523
|
|
|
570
524
|
|
|
571
525
|
#%% Filename cleaning functions
|
|
572
526
|
|
|
573
|
-
def clean_filename(filename,
|
|
527
|
+
def clean_filename(filename,
|
|
574
528
|
allow_list=VALID_FILENAME_CHARS,
|
|
575
529
|
char_limit=CHAR_LIMIT,
|
|
576
530
|
force_lower= False):
|
|
@@ -582,18 +536,18 @@ def clean_filename(filename,
|
|
|
582
536
|
|
|
583
537
|
Adapted from
|
|
584
538
|
https://gist.github.com/wassname/1393c4a57cfcbf03641dbc31886123b8
|
|
585
|
-
|
|
539
|
+
|
|
586
540
|
Args:
|
|
587
541
|
filename (str): filename to clean
|
|
588
542
|
allow_list (str, optional): string containing all allowable filename characters
|
|
589
543
|
char_limit (int, optional): maximum allowable filename length, if None will skip this
|
|
590
544
|
step
|
|
591
545
|
force_lower (bool, optional): convert the resulting filename to lowercase
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
str: cleaned version of [filename]
|
|
546
|
+
|
|
547
|
+
Returns:
|
|
548
|
+
str: cleaned version of [filename]
|
|
595
549
|
"""
|
|
596
|
-
|
|
550
|
+
|
|
597
551
|
# keep only valid ascii chars
|
|
598
552
|
cleaned_filename = (unicodedata.normalize('NFKD', filename)
|
|
599
553
|
.encode('ASCII', 'ignore').decode())
|
|
@@ -607,26 +561,26 @@ def clean_filename(filename,
|
|
|
607
561
|
return cleaned_filename
|
|
608
562
|
|
|
609
563
|
|
|
610
|
-
def clean_path(pathname,
|
|
564
|
+
def clean_path(pathname,
|
|
611
565
|
allow_list=VALID_PATH_CHARS,
|
|
612
566
|
char_limit=CHAR_LIMIT,
|
|
613
567
|
force_lower=False):
|
|
614
568
|
"""
|
|
615
569
|
Removes non-ASCII and other invalid path characters (on any reasonable
|
|
616
570
|
OS) from a path, then optionally trims to a maximum length.
|
|
617
|
-
|
|
571
|
+
|
|
618
572
|
Args:
|
|
619
573
|
pathname (str): path name to clean
|
|
620
574
|
allow_list (str, optional): string containing all allowable filename characters
|
|
621
575
|
char_limit (int, optional): maximum allowable filename length, if None will skip this
|
|
622
576
|
step
|
|
623
577
|
force_lower (bool, optional): convert the resulting filename to lowercase
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
str: cleaned version of [filename]
|
|
578
|
+
|
|
579
|
+
Returns:
|
|
580
|
+
str: cleaned version of [filename]
|
|
627
581
|
"""
|
|
628
|
-
|
|
629
|
-
return clean_filename(pathname, allow_list=allow_list,
|
|
582
|
+
|
|
583
|
+
return clean_filename(pathname, allow_list=allow_list,
|
|
630
584
|
char_limit=char_limit, force_lower=force_lower)
|
|
631
585
|
|
|
632
586
|
|
|
@@ -635,34 +589,34 @@ def flatten_path(pathname,separator_chars=SEPARATOR_CHARS,separator_char_replace
|
|
|
635
589
|
Removes non-ASCII and other invalid path characters (on any reasonable
|
|
636
590
|
OS) from a path, then trims to a maximum length. Replaces all valid
|
|
637
591
|
separators with [separator_char_replacement.]
|
|
638
|
-
|
|
592
|
+
|
|
639
593
|
Args:
|
|
640
594
|
pathname (str): path name to flatten
|
|
641
595
|
separator_chars (str, optional): string containing all known path separators
|
|
642
|
-
separator_char_replacement (str, optional): string to insert in place of
|
|
596
|
+
separator_char_replacement (str, optional): string to insert in place of
|
|
643
597
|
path separators.
|
|
644
|
-
|
|
598
|
+
|
|
645
599
|
Returns:
|
|
646
600
|
str: flattened version of [pathname]
|
|
647
601
|
"""
|
|
648
|
-
|
|
602
|
+
|
|
649
603
|
s = clean_path(pathname)
|
|
650
604
|
for c in separator_chars:
|
|
651
605
|
s = s.replace(c, separator_char_replacement)
|
|
652
606
|
return s
|
|
653
607
|
|
|
654
608
|
|
|
655
|
-
def is_executable(filename):
|
|
609
|
+
def is_executable(filename):
|
|
656
610
|
"""
|
|
657
611
|
Checks whether [filename] is on the system path and marked as executable.
|
|
658
|
-
|
|
612
|
+
|
|
659
613
|
Args:
|
|
660
614
|
filename (str): filename to check for executable status
|
|
661
|
-
|
|
615
|
+
|
|
662
616
|
Returns:
|
|
663
617
|
bool: True if [filename] is on the system path and marked as executable, otherwise False
|
|
664
618
|
"""
|
|
665
|
-
|
|
619
|
+
|
|
666
620
|
# https://stackoverflow.com/questions/11210104/check-if-a-program-exists-from-a-python-script
|
|
667
621
|
|
|
668
622
|
return which(filename) is not None
|
|
@@ -673,247 +627,249 @@ def is_executable(filename):
|
|
|
673
627
|
def environment_is_wsl():
|
|
674
628
|
"""
|
|
675
629
|
Determines whether we're running in WSL.
|
|
676
|
-
|
|
630
|
+
|
|
677
631
|
Returns:
|
|
678
|
-
True if we're running in WSL.
|
|
632
|
+
True if we're running in WSL.
|
|
679
633
|
"""
|
|
680
|
-
|
|
634
|
+
|
|
681
635
|
if sys.platform not in ('linux','posix'):
|
|
682
636
|
return False
|
|
683
637
|
platform_string = ' '.join(platform.uname()).lower()
|
|
684
638
|
return 'microsoft' in platform_string and 'wsl' in platform_string
|
|
685
|
-
|
|
639
|
+
|
|
686
640
|
|
|
687
641
|
def wsl_path_to_windows_path(filename, failure_behavior='none'):
|
|
688
642
|
r"""
|
|
689
643
|
Converts a WSL path to a Windows path. For example, converts:
|
|
690
|
-
|
|
644
|
+
|
|
691
645
|
/mnt/e/a/b/c
|
|
692
|
-
|
|
646
|
+
|
|
693
647
|
...to:
|
|
694
|
-
|
|
648
|
+
|
|
695
649
|
e:\a\b\c
|
|
696
|
-
|
|
650
|
+
|
|
697
651
|
Args:
|
|
698
652
|
filename (str): filename to convert
|
|
699
|
-
failure_behavior (str): what to do if the path can't be processed as a
|
|
700
|
-
'none' to return None in this case, 'original' to return the original path.
|
|
701
|
-
|
|
653
|
+
failure_behavior (str, optional): what to do if the path can't be processed as a
|
|
654
|
+
WSL path. 'none' to return None in this case, 'original' to return the original path.
|
|
655
|
+
|
|
702
656
|
Returns:
|
|
703
657
|
str: Windows equivalent to the WSL path [filename]
|
|
704
658
|
"""
|
|
705
|
-
|
|
659
|
+
|
|
706
660
|
assert failure_behavior in ('none','original'), \
|
|
707
661
|
'Unrecognized failure_behavior value {}'.format(failure_behavior)
|
|
708
|
-
|
|
662
|
+
|
|
709
663
|
# Check whether the path follows the standard WSL mount pattern
|
|
710
664
|
wsl_path_pattern = r'^/mnt/([a-zA-Z])(/.*)?$'
|
|
711
665
|
match = re.match(wsl_path_pattern, filename)
|
|
712
|
-
|
|
666
|
+
|
|
713
667
|
if match:
|
|
714
668
|
|
|
715
669
|
# Extract the drive letter and the rest of the path
|
|
716
670
|
drive_letter = match.group(1)
|
|
717
671
|
path_remainder = match.group(2) if match.group(2) else ''
|
|
718
|
-
|
|
672
|
+
|
|
719
673
|
# Convert forward slashes to backslashes for Windows
|
|
720
674
|
path_remainder = path_remainder.replace('/', '\\')
|
|
721
|
-
|
|
675
|
+
|
|
722
676
|
# Format the Windows path
|
|
723
677
|
windows_path = f"{drive_letter}:{path_remainder}"
|
|
724
678
|
return windows_path
|
|
725
|
-
|
|
679
|
+
|
|
726
680
|
if failure_behavior == 'none':
|
|
727
681
|
return None
|
|
728
682
|
else:
|
|
729
683
|
return filename
|
|
730
684
|
|
|
731
685
|
# ...def wsl_path_to_windows_path(...)
|
|
732
|
-
|
|
733
|
-
|
|
686
|
+
|
|
687
|
+
|
|
734
688
|
def windows_path_to_wsl_path(filename, failure_behavior='none'):
|
|
735
689
|
r"""
|
|
736
690
|
Converts a Windows path to a WSL path, or returns None if that's not possible. E.g.
|
|
737
691
|
converts:
|
|
738
|
-
|
|
692
|
+
|
|
739
693
|
e:\a\b\c
|
|
740
|
-
|
|
694
|
+
|
|
741
695
|
...to:
|
|
742
|
-
|
|
696
|
+
|
|
743
697
|
/mnt/e/a/b/c
|
|
744
|
-
|
|
698
|
+
|
|
745
699
|
Args:
|
|
746
700
|
filename (str): filename to convert
|
|
747
|
-
failure_behavior (str): what to do if the path can't be processed as a Windows path.
|
|
701
|
+
failure_behavior (str, optional): what to do if the path can't be processed as a Windows path.
|
|
748
702
|
'none' to return None in this case, 'original' to return the original path.
|
|
749
|
-
|
|
703
|
+
|
|
750
704
|
Returns:
|
|
751
705
|
str: WSL equivalent to the Windows path [filename]
|
|
752
706
|
"""
|
|
753
|
-
|
|
707
|
+
|
|
754
708
|
assert failure_behavior in ('none','original'), \
|
|
755
709
|
'Unrecognized failure_behavior value {}'.format(failure_behavior)
|
|
756
|
-
|
|
710
|
+
|
|
757
711
|
filename = filename.replace('\\', '/')
|
|
758
|
-
|
|
712
|
+
|
|
759
713
|
# Check whether the path follows a Windows drive letter pattern
|
|
760
714
|
windows_path_pattern = r'^([a-zA-Z]):(/.*)?$'
|
|
761
715
|
match = re.match(windows_path_pattern, filename)
|
|
762
|
-
|
|
716
|
+
|
|
763
717
|
if match:
|
|
764
718
|
# Extract the drive letter and the rest of the path
|
|
765
719
|
drive_letter = match.group(1).lower() # Convert to lowercase for WSL
|
|
766
720
|
path_remainder = match.group(2) if match.group(2) else ''
|
|
767
|
-
|
|
721
|
+
|
|
768
722
|
# Format the WSL path
|
|
769
723
|
wsl_path = f"/mnt/{drive_letter}{path_remainder}"
|
|
770
724
|
return wsl_path
|
|
771
|
-
|
|
725
|
+
|
|
772
726
|
if failure_behavior == 'none':
|
|
773
727
|
return None
|
|
774
728
|
else:
|
|
775
729
|
return filename
|
|
776
|
-
|
|
730
|
+
|
|
777
731
|
# ...def window_path_to_wsl_path(...)
|
|
778
732
|
|
|
779
733
|
|
|
780
734
|
def open_file_in_chrome(filename):
|
|
781
735
|
"""
|
|
782
|
-
Open a file in chrome, regardless of file type. I typically use this to open
|
|
736
|
+
Open a file in chrome, regardless of file type. I typically use this to open
|
|
783
737
|
.md files in Chrome.
|
|
784
|
-
|
|
738
|
+
|
|
785
739
|
Args:
|
|
786
740
|
filename (str): file to open
|
|
787
|
-
|
|
741
|
+
|
|
788
742
|
Return:
|
|
789
743
|
bool: whether the operation was successful
|
|
790
744
|
"""
|
|
791
|
-
|
|
745
|
+
|
|
792
746
|
# Create URL
|
|
793
747
|
abs_path = os.path.abspath(filename)
|
|
794
|
-
|
|
748
|
+
|
|
795
749
|
system = platform.system()
|
|
796
750
|
if system == 'Windows':
|
|
797
751
|
url = f'file:///{abs_path.replace(os.sep, "/")}'
|
|
798
752
|
else: # macOS and Linux
|
|
799
753
|
url = f'file://{abs_path}'
|
|
800
|
-
|
|
754
|
+
|
|
801
755
|
# Determine the Chrome path
|
|
802
756
|
if system == 'Windows':
|
|
803
|
-
|
|
757
|
+
|
|
804
758
|
# This is a native Python module, but it only exists on Windows
|
|
805
759
|
import winreg
|
|
806
|
-
|
|
760
|
+
|
|
807
761
|
chrome_paths = [
|
|
808
762
|
os.path.expanduser("~") + r"\AppData\Local\Google\Chrome\Application\chrome.exe",
|
|
809
763
|
r"C:\Program Files\Google\Chrome\Application\chrome.exe",
|
|
810
764
|
r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe"
|
|
811
765
|
]
|
|
812
|
-
|
|
766
|
+
|
|
813
767
|
# Default approach: run from a typical chrome location
|
|
814
768
|
for path in chrome_paths:
|
|
815
769
|
if os.path.exists(path):
|
|
816
770
|
subprocess.run([path, url])
|
|
817
771
|
return True
|
|
818
|
-
|
|
772
|
+
|
|
819
773
|
# Method 2: Check registry for Chrome path
|
|
820
774
|
try:
|
|
821
|
-
with winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE,
|
|
775
|
+
with winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE,
|
|
822
776
|
r"SOFTWARE\Microsoft\Windows\CurrentVersion\App Paths\chrome.exe") as key:
|
|
823
777
|
chrome_path = winreg.QueryValue(key, None)
|
|
824
778
|
if chrome_path and os.path.exists(chrome_path):
|
|
825
779
|
subprocess.run([chrome_path, url])
|
|
826
780
|
return True
|
|
827
|
-
except:
|
|
781
|
+
except Exception:
|
|
828
782
|
pass
|
|
829
|
-
|
|
783
|
+
|
|
830
784
|
# Method 3: Try alternate registry location
|
|
831
785
|
try:
|
|
832
|
-
with winreg.OpenKey(winreg.HKEY_CURRENT_USER,
|
|
786
|
+
with winreg.OpenKey(winreg.HKEY_CURRENT_USER,
|
|
833
787
|
r"Software\Google\Chrome\BLBeacon") as key:
|
|
834
788
|
chrome_path = os.path.join(os.path.dirname(winreg.QueryValueEx(key, "version")[0]), "chrome.exe")
|
|
835
789
|
if os.path.exists(chrome_path):
|
|
836
790
|
subprocess.run([chrome_path, url])
|
|
837
791
|
return True
|
|
838
|
-
except:
|
|
792
|
+
except Exception:
|
|
839
793
|
pass
|
|
840
|
-
|
|
794
|
+
|
|
841
795
|
# Method 4: Try system path or command
|
|
842
796
|
for chrome_cmd in ["chrome", "chrome.exe", "googlechrome", "google-chrome"]:
|
|
843
797
|
try:
|
|
844
798
|
subprocess.run([chrome_cmd, url], shell=True)
|
|
845
799
|
return True
|
|
846
|
-
except:
|
|
800
|
+
except Exception:
|
|
847
801
|
continue
|
|
848
|
-
|
|
802
|
+
|
|
849
803
|
# Method 5: Use Windows URL protocol handler
|
|
850
804
|
try:
|
|
851
805
|
os.startfile(url)
|
|
852
806
|
return True
|
|
853
|
-
except:
|
|
807
|
+
except Exception:
|
|
854
808
|
pass
|
|
855
|
-
|
|
856
|
-
# Method 6: Use rundll32
|
|
809
|
+
|
|
810
|
+
# Method 6: Use rundll32
|
|
857
811
|
try:
|
|
858
812
|
cmd = f'rundll32 url.dll,FileProtocolHandler {url}'
|
|
859
813
|
subprocess.run(cmd, shell=True)
|
|
860
814
|
return True
|
|
861
|
-
except:
|
|
815
|
+
except Exception:
|
|
862
816
|
pass
|
|
863
|
-
|
|
817
|
+
|
|
864
818
|
elif system == 'Darwin':
|
|
865
|
-
|
|
819
|
+
|
|
866
820
|
chrome_paths = [
|
|
867
821
|
'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
|
|
868
822
|
os.path.expanduser('~/Applications/Google Chrome.app/Contents/MacOS/Google Chrome')
|
|
869
823
|
]
|
|
870
|
-
|
|
824
|
+
|
|
871
825
|
for path in chrome_paths:
|
|
872
826
|
if os.path.exists(path):
|
|
873
827
|
subprocess.run([path, url])
|
|
874
828
|
return True
|
|
875
|
-
|
|
829
|
+
|
|
876
830
|
# Fallback to 'open' command with Chrome as the app
|
|
877
831
|
try:
|
|
878
832
|
subprocess.run(['open', '-a', 'Google Chrome', url])
|
|
879
833
|
return True
|
|
880
|
-
except:
|
|
834
|
+
except Exception:
|
|
881
835
|
pass
|
|
882
|
-
|
|
836
|
+
|
|
883
837
|
elif system == 'Linux':
|
|
884
|
-
|
|
838
|
+
|
|
885
839
|
chrome_commands = ['google-chrome', 'chrome', 'chromium', 'chromium-browser']
|
|
886
|
-
|
|
840
|
+
|
|
887
841
|
for cmd in chrome_commands:
|
|
888
842
|
try:
|
|
889
843
|
subprocess.run([cmd, url], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
|
|
890
844
|
return True
|
|
891
|
-
except:
|
|
845
|
+
except Exception:
|
|
892
846
|
continue
|
|
893
|
-
|
|
847
|
+
|
|
894
848
|
print(f"Could not open {filename} in Chrome on {system}.")
|
|
895
849
|
return False
|
|
896
850
|
|
|
897
|
-
|
|
898
|
-
def open_file(filename,
|
|
851
|
+
|
|
852
|
+
def open_file(filename,
|
|
853
|
+
attempt_to_open_in_wsl_host=False,
|
|
854
|
+
browser_name=None):
|
|
899
855
|
"""
|
|
900
856
|
Opens [filename] in the default OS file handler for this file type.
|
|
901
|
-
|
|
857
|
+
|
|
902
858
|
If browser_name is not None, uses the webbrowser module to open the filename
|
|
903
859
|
in the specified browser; see https://docs.python.org/3/library/webbrowser.html
|
|
904
860
|
for supported browsers. Falls back to the default file handler if webbrowser.open()
|
|
905
861
|
fails. In this case, attempt_to_open_in_wsl_host is ignored unless webbrowser.open() fails.
|
|
906
|
-
|
|
907
|
-
If browser_name is 'default', uses the system default. This is different from the
|
|
862
|
+
|
|
863
|
+
If browser_name is 'default', uses the system default. This is different from the
|
|
908
864
|
parameter to webbrowser.get(), where None implies the system default.
|
|
909
|
-
|
|
865
|
+
|
|
910
866
|
Args:
|
|
911
867
|
filename (str): file to open
|
|
912
|
-
attempt_to_open_in_wsl_host: if this is True, and we're in WSL, attempts
|
|
913
|
-
[filename] in the Windows host environment
|
|
914
|
-
browser_name: see above
|
|
868
|
+
attempt_to_open_in_wsl_host (bool, optional): if this is True, and we're in WSL, attempts
|
|
869
|
+
to open [filename] in the Windows host environment
|
|
870
|
+
browser_name (str, optional): see above
|
|
915
871
|
"""
|
|
916
|
-
|
|
872
|
+
|
|
917
873
|
if browser_name is not None:
|
|
918
874
|
if browser_name == 'chrome':
|
|
919
875
|
browser_name = 'google-chrome'
|
|
@@ -925,32 +881,32 @@ def open_file(filename, attempt_to_open_in_wsl_host=False, browser_name=None):
|
|
|
925
881
|
result = False
|
|
926
882
|
if result:
|
|
927
883
|
return
|
|
928
|
-
|
|
884
|
+
|
|
929
885
|
if sys.platform == 'win32':
|
|
930
|
-
|
|
886
|
+
|
|
931
887
|
os.startfile(filename)
|
|
932
888
|
|
|
933
889
|
elif sys.platform == 'darwin':
|
|
934
|
-
|
|
890
|
+
|
|
935
891
|
opener = 'open'
|
|
936
892
|
subprocess.call([opener, filename])
|
|
937
|
-
|
|
893
|
+
|
|
938
894
|
elif attempt_to_open_in_wsl_host and environment_is_wsl():
|
|
939
|
-
|
|
895
|
+
|
|
940
896
|
windows_path = wsl_path_to_windows_path(filename)
|
|
941
|
-
|
|
897
|
+
|
|
942
898
|
# Fall back to xdg-open
|
|
943
899
|
if windows_path is None:
|
|
944
900
|
subprocess.call(['xdg-open', filename])
|
|
945
|
-
|
|
946
|
-
if os.path.isdir(filename):
|
|
901
|
+
|
|
902
|
+
if os.path.isdir(filename):
|
|
947
903
|
subprocess.run(["explorer.exe", windows_path])
|
|
948
904
|
else:
|
|
949
|
-
os.system("cmd.exe /C start
|
|
950
|
-
|
|
905
|
+
os.system("cmd.exe /C start {}".format(re.escape(windows_path)))
|
|
906
|
+
|
|
951
907
|
else:
|
|
952
|
-
|
|
953
|
-
opener = 'xdg-open'
|
|
908
|
+
|
|
909
|
+
opener = 'xdg-open'
|
|
954
910
|
subprocess.call([opener, filename])
|
|
955
911
|
|
|
956
912
|
# ...def open_file(...)
|
|
@@ -962,12 +918,12 @@ def write_list_to_file(output_file,strings):
|
|
|
962
918
|
"""
|
|
963
919
|
Writes a list of strings to either a JSON file or text file,
|
|
964
920
|
depending on extension of the given file name.
|
|
965
|
-
|
|
921
|
+
|
|
966
922
|
Args:
|
|
967
923
|
output_file (str): file to write
|
|
968
924
|
strings (list): list of strings to write to [output_file]
|
|
969
925
|
"""
|
|
970
|
-
|
|
926
|
+
|
|
971
927
|
with open(output_file, 'w') as f:
|
|
972
928
|
if output_file.endswith('.json'):
|
|
973
929
|
json.dump(strings, f, indent=1)
|
|
@@ -978,14 +934,14 @@ def write_list_to_file(output_file,strings):
|
|
|
978
934
|
def read_list_from_file(filename):
|
|
979
935
|
"""
|
|
980
936
|
Reads a json-formatted list of strings from a file.
|
|
981
|
-
|
|
937
|
+
|
|
982
938
|
Args:
|
|
983
939
|
filename (str): .json filename to read
|
|
984
|
-
|
|
940
|
+
|
|
985
941
|
Returns:
|
|
986
942
|
list: list of strings read from [filename]
|
|
987
943
|
"""
|
|
988
|
-
|
|
944
|
+
|
|
989
945
|
assert filename.endswith('.json')
|
|
990
946
|
with open(filename, 'r') as f:
|
|
991
947
|
file_list = json.load(f)
|
|
@@ -1001,39 +957,39 @@ def _copy_file(input_output_tuple,overwrite=True,verbose=False,move=False):
|
|
|
1001
957
|
"""
|
|
1002
958
|
Internal function for copying files from within parallel_copy_files.
|
|
1003
959
|
"""
|
|
1004
|
-
|
|
960
|
+
|
|
1005
961
|
assert len(input_output_tuple) == 2
|
|
1006
962
|
source_fn = input_output_tuple[0]
|
|
1007
963
|
target_fn = input_output_tuple[1]
|
|
1008
964
|
if (not overwrite) and (os.path.isfile(target_fn)):
|
|
1009
965
|
if verbose:
|
|
1010
966
|
print('Skipping existing target file {}'.format(target_fn))
|
|
1011
|
-
return
|
|
1012
|
-
|
|
967
|
+
return
|
|
968
|
+
|
|
1013
969
|
if move:
|
|
1014
970
|
action_string = 'Moving'
|
|
1015
971
|
else:
|
|
1016
972
|
action_string = 'Copying'
|
|
1017
|
-
|
|
973
|
+
|
|
1018
974
|
if verbose:
|
|
1019
975
|
print('{} to {}'.format(action_string,target_fn))
|
|
1020
|
-
|
|
976
|
+
|
|
1021
977
|
os.makedirs(os.path.dirname(target_fn),exist_ok=True)
|
|
1022
978
|
if move:
|
|
1023
979
|
shutil.move(source_fn, target_fn)
|
|
1024
980
|
else:
|
|
1025
981
|
shutil.copyfile(source_fn,target_fn)
|
|
1026
|
-
|
|
1027
982
|
|
|
1028
|
-
|
|
1029
|
-
|
|
1030
|
-
|
|
1031
|
-
|
|
983
|
+
|
|
984
|
+
def parallel_copy_files(input_file_to_output_file,
|
|
985
|
+
max_workers=16,
|
|
986
|
+
use_threads=True,
|
|
987
|
+
overwrite=False,
|
|
1032
988
|
verbose=False,
|
|
1033
989
|
move=False):
|
|
1034
990
|
"""
|
|
1035
991
|
Copy (or move) files from source to target according to the dict input_file_to_output_file.
|
|
1036
|
-
|
|
992
|
+
|
|
1037
993
|
Args:
|
|
1038
994
|
input_file_to_output_file (dict): dictionary mapping source files to the target files
|
|
1039
995
|
to which they should be copied
|
|
@@ -1046,24 +1002,32 @@ def parallel_copy_files(input_file_to_output_file,
     """

     n_workers = min(max_workers,len(input_file_to_output_file))
-
+
     # Package the dictionary as a set of 2-tuples
     input_output_tuples = []
     for input_fn in input_file_to_output_file:
         input_output_tuples.append((input_fn,input_file_to_output_file[input_fn]))

-    if use_threads:
-        pool = ThreadPool(n_workers)
-    else:
-        pool = Pool(n_workers)
+    pool = None

-    with tqdm(total=len(input_output_tuples)) as pbar:
-        for i,_ in enumerate(pool.imap_unordered(partial(_copy_file,
-                                                         overwrite=overwrite,
-                                                         verbose=verbose,
-                                                         move=move),
-                                                 input_output_tuples)):
-            pbar.update()
+    try:
+        if use_threads:
+            pool = ThreadPool(n_workers)
+        else:
+            pool = Pool(n_workers)
+
+        with tqdm(total=len(input_output_tuples)) as pbar:
+            for i,_ in enumerate(pool.imap_unordered(partial(_copy_file,
+                                                             overwrite=overwrite,
+                                                             verbose=verbose,
+                                                             move=move),
+                                                     input_output_tuples)):
+                pbar.update()
+    finally:
+        pool.close()
+        pool.join()
+        if verbose:
+            print("Pool closed and joined parallel file copying")

 # ...def parallel_copy_files(...)

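
A minimal usage sketch for the refactored parallel_copy_files above (hypothetical paths; the module path megadetector.utils.path_utils is an assumption, inferred from the test docstring added later in this diff, which names this file path_utils.py):

    # Hypothetical example, not part of the diff
    from megadetector.utils.path_utils import parallel_copy_files

    # Map each source file to its target; target folders are created as needed
    mapping = {
        '/data/in/a.jpg': '/data/out/a.jpg',
        '/data/in/b.jpg': '/data/out/b.jpg'
    }

    # Copy on 8 worker threads; move=True would move instead of copying
    parallel_copy_files(mapping, max_workers=8, use_threads=True, overwrite=False)
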
@@ -1074,36 +1038,36 @@ def get_file_sizes(base_dir, convert_slashes=True):
     """
     Gets sizes recursively for all files in base_dir, returning a dict mapping
     relative filenames to size.
-
+
     TODO: merge the functionality here with parallel_get_file_sizes, which uses slightly
     different semantics.
-
+
     Args:
         base_dir (str): folder within which we want all file sizes
         convert_slashes (bool, optional): force forward slashes in return strings,
             otherwise uses the native path separator
-
+
     Returns:
         dict: dictionary mapping filenames to file sizes in bytes
     """
-
-    relative_filenames = recursive_file_list(base_dir, convert_slashes=convert_slashes,
+
+    relative_filenames = recursive_file_list(base_dir, convert_slashes=convert_slashes,
                                              return_relative_paths=True)
-
+
     fn_to_size = {}
     for fn_relative in tqdm(relative_filenames):
         fn_abs = os.path.join(base_dir,fn_relative)
         fn_to_size[fn_relative] = os.path.getsize(fn_abs)
-
+
     return fn_to_size
-
+

 def _get_file_size(filename,verbose=False):
     """
     Internal function for safely getting the size of a file. Returns a (filename,size)
     tuple, where size is None if there is an error.
     """
-
+
     try:
         size = os.path.getsize(filename)
     except Exception as e:
@@ -1112,18 +1076,18 @@ def _get_file_size(filename,verbose=False):
         size = None
     return (filename,size)

-
-def parallel_get_file_sizes(filenames,
-                            max_workers=16,
-                            use_threads=True,
+
+def parallel_get_file_sizes(filenames,
+                            max_workers=16,
+                            use_threads=True,
                             verbose=False,
-                            recursive=True,
+                            recursive=True,
                             convert_slashes=True,
                             return_relative_paths=False):
     """
     Returns a dictionary mapping every file in [filenames] to the corresponding file size,
     or None for errors. If [filenames] is a folder, will enumerate the folder (optionally recursively).
-
+
     Args:
         filenames (list or str): list of filenames for which we should read sizes, or a folder
             within which we should read all file sizes recursively
@@ -1135,33 +1099,33 @@ def parallel_get_file_sizes(filenames,
         convert_slashes (bool, optional): convert backslashes to forward slashes
         return_relative_paths (bool, optional): return relative paths; only relevant if [filenames]
             is a folder.
-
+
     Returns:
         dict: dictionary mapping filenames to file sizes in bytes
     """

     n_workers = min(max_workers,len(filenames))
-
+
     folder_name = None
-
+
     if isinstance(filenames,str):
-
+
         folder_name = filenames
-        assert os.path.isdir(filenames), 'Could not find folder {}'.format(folder_name)
-
+        assert os.path.isdir(filenames), 'Could not find folder {}'.format(folder_name)
+
         if verbose:
             print('Enumerating files in {}'.format(folder_name))
-
+
         # Enumerate absolute paths here, we'll convert to relative later if requested
         filenames = recursive_file_list(folder_name,recursive=recursive,return_relative_paths=False)

     else:
-
+
         assert is_iterable(filenames), '[filenames] argument is neither a folder nor an iterable'
-
+
     if verbose:
         print('Creating worker pool')
-
+
     if use_threads:
         pool_string = 'thread'
         pool = ThreadPool(n_workers)
@@ -1172,11 +1136,11 @@ def parallel_get_file_sizes(filenames,
     if verbose:
         print('Created a {} pool of {} workers'.format(
             pool_string,n_workers))
-
+
     # This returns (filename,size) tuples
     get_size_results = list(tqdm(pool.imap(
         partial(_get_file_size,verbose=verbose),filenames), total=len(filenames)))
-
+
     to_return = {}
     for r in get_size_results:
         fn = r[0]
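
A short sketch of the folder-input path through parallel_get_file_sizes (hypothetical folder; the returned dict maps each filename to its size in bytes, or None where reading failed):

    # Hypothetical example, not part of the diff
    sizes = parallel_get_file_sizes('/data/camera_traps',
                                    max_workers=8,
                                    recursive=True,
                                    return_relative_paths=True)
    total_bytes = sum(v for v in sizes.values() if v is not None)
    print('{} files, {} bytes'.format(len(sizes), total_bytes))
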
@@ -1194,36 +1158,38 @@ def parallel_get_file_sizes(filenames,

 #%% Compression (zip/tar) functions

-def zip_file(input_fn, output_fn=None, overwrite=False, verbose=False,
+def zip_file(input_fn, output_fn=None, overwrite=False, verbose=False, compress_level=9):
     """
     Zips a single file.
-
+
     Args:
         input_fn (str): file to zip
         output_fn (str, optional): target zipfile; if this is None, we'll use
             [input_fn].zip
         overwrite (bool, optional): whether to overwrite an existing target file
         verbose (bool, optional): enable existing debug console output
-
-
+        compress_level (int, optional): compression level to use, between 0 and 9
+
     Returns:
         str: the output zipfile, whether we created it or determined that it already exists
     """
-
+
     basename = os.path.basename(input_fn)
-
+
     if output_fn is None:
         output_fn = input_fn + '.zip'
-
+
     if (not overwrite) and (os.path.isfile(output_fn)):
         print('Skipping existing file {}'.format(output_fn))
         return output_fn
-
+
     if verbose:
-        print('Zipping {} to {} with level {}'.format(input_fn,output_fn,
-
+        print('Zipping {} to {} with level {}'.format(input_fn,output_fn,compress_level))
+
     with ZipFile(output_fn,'w',zipfile.ZIP_DEFLATED) as zipf:
-        zipf.write(input_fn,
+        zipf.write(input_fn,
+                   arcname=basename,
+                   compresslevel=compress_level,
                    compress_type=zipfile.ZIP_DEFLATED)

     return output_fn
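
For reference, ZipFile.write() has accepted a per-member compresslevel argument since Python 3.7; for ZIP_DEFLATED it maps to zlib levels 0 (no compression) through 9 (smallest output, slowest). A minimal call against the new signature (hypothetical filename):

    # Hypothetical example, not part of the diff
    out_fn = zip_file('/data/results.json', overwrite=True, compress_level=9)
    # out_fn == '/data/results.json.zip'
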
@@ -1232,9 +1198,9 @@ def zip_file(input_fn, output_fn=None, overwrite=False, verbose=False, compress_level=9):
 def add_files_to_single_tar_file(input_files, output_fn, arc_name_base,
                                  overwrite=False, verbose=False, mode='x'):
     """
-    Adds all the files in [input_files] to the tar file [output_fn].
+    Adds all the files in [input_files] to the tar file [output_fn].
     Archive names are relative to arc_name_base.
-
+
     Args:
         input_files (list): list of absolute filenames to include in the .tar file
         output_fn (str): .tar file to create
@@ -1244,11 +1210,11 @@ def add_files_to_single_tar_file(input_files, output_fn, arc_name_base,
         overwrite (bool, optional): whether to overwrite an existing .tar file
         verbose (bool, optional): enable additional debug console output
         mode (str, optional): compression type, can be 'x' (no compression), 'x:gz', or 'x:bz2'.
-
+
     Returns:
         str: the output tar file, whether we created it or determined that it already exists
     """
-
+
     if os.path.isfile(output_fn):
         if not overwrite:
             print('Tar file {} exists, skipping'.format(output_fn))
@@ -1256,11 +1222,11 @@ def add_files_to_single_tar_file(input_files, output_fn, arc_name_base,
         else:
             print('Tar file {} exists, deleting and re-creating'.format(output_fn))
             os.remove(output_fn)
-
+
     if verbose:
         print('Adding {} files to {} (mode {})'.format(
             len(input_files),output_fn,mode))
-
+
     with tarfile.open(output_fn,mode) as tarf:
         for input_fn_abs in tqdm(input_files,disable=(not verbose)):
             input_fn_relative = os.path.relpath(input_fn_abs,arc_name_base)
@@ -1269,12 +1235,16 @@ def add_files_to_single_tar_file(input_files, output_fn, arc_name_base,
     return output_fn


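Note that the modes documented above are exclusive-creation modes: tarfile.open() with 'x', 'x:gz', or 'x:bz2' raises FileExistsError if the target already exists, which is why the function deletes an existing file first when overwrite=True. A minimal call (hypothetical paths):

    # Hypothetical example, not part of the diff
    files = ['/data/project/images/a.jpg', '/data/project/images/b.jpg']
    add_files_to_single_tar_file(files, '/data/images.tar.gz',
                                 arc_name_base='/data/project',
                                 overwrite=True, mode='x:gz')
    # Members are stored as 'images/a.jpg' and 'images/b.jpg'
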
-def zip_files_into_single_zipfile(input_files,
-
+def zip_files_into_single_zipfile(input_files,
+                                  output_fn,
+                                  arc_name_base,
+                                  overwrite=False,
+                                  verbose=False,
+                                  compress_level=9):
     """
-    Zip all the files in [input_files] into [output_fn]. Archive names are relative to
+    Zip all the files in [input_files] into [output_fn]. Archive names are relative to
     arc_name_base.
-
+
     Args:
         input_files (list): list of absolute filenames to include in the .tar file
         output_fn (str): .tar file to create
@@ -1283,89 +1253,89 @@ def zip_files_into_single_zipfile(input_files, output_fn, arc_name_base,
             [arc_name_base]
         overwrite (bool, optional): whether to overwrite an existing .tar file
         verbose (bool, optional): enable additional debug console output
-
-
+        compress_level (int, optional): compression level to use, between 0 and 9
+
     Returns:
         str: the output zipfile, whether we created it or determined that it already exists
     """
-
+
     if not overwrite:
         if os.path.isfile(output_fn):
             print('Zip file {} exists, skipping'.format(output_fn))
             return output_fn
-
+
     if verbose:
         print('Zipping {} files to {} (compression level {})'.format(
-            len(input_files),output_fn,
-
+            len(input_files),output_fn,compress_level))
+
     with ZipFile(output_fn,'w',zipfile.ZIP_DEFLATED) as zipf:
         for input_fn_abs in tqdm(input_files,disable=(not verbose)):
             input_fn_relative = os.path.relpath(input_fn_abs,arc_name_base)
             zipf.write(input_fn_abs,
                        arcname=input_fn_relative,
-                       compresslevel=
+                       compresslevel=compress_level,
                        compress_type=zipfile.ZIP_DEFLATED)

     return output_fn
1279
|
return output_fn
|
|
1310
|
-
|
|
1311
|
-
|
|
1312
|
-
def zip_folder(input_folder, output_fn=None, overwrite=False, verbose=False,
|
|
1280
|
+
|
|
1281
|
+
|
|
1282
|
+
def zip_folder(input_folder, output_fn=None, overwrite=False, verbose=False, compress_level=9):
|
|
1313
1283
|
"""
|
|
1314
|
-
Recursively zip everything in [input_folder] into a single zipfile, storing files as paths
|
|
1284
|
+
Recursively zip everything in [input_folder] into a single zipfile, storing files as paths
|
|
1315
1285
|
relative to [input_folder].
|
|
1316
|
-
|
|
1317
|
-
Args:
|
|
1286
|
+
|
|
1287
|
+
Args:
|
|
1318
1288
|
input_folder (str): folder to zip
|
|
1319
1289
|
output_fn (str, optional): output filename; if this is None, we'll write to [input_folder].zip
|
|
1320
1290
|
overwrite (bool, optional): whether to overwrite an existing .tar file
|
|
1321
1291
|
verbose (bool, optional): enable additional debug console output
|
|
1322
|
-
|
|
1323
|
-
|
|
1292
|
+
compress_level (int, optional): compression level to use, between 0 and 9
|
|
1293
|
+
|
|
1324
1294
|
Returns:
|
|
1325
|
-
str: the output zipfile, whether we created it or determined that it already exists
|
|
1295
|
+
str: the output zipfile, whether we created it or determined that it already exists
|
|
1326
1296
|
"""
|
|
1327
|
-
|
|
1297
|
+
|
|
1328
1298
|
if output_fn is None:
|
|
1329
1299
|
output_fn = input_folder + '.zip'
|
|
1330
|
-
|
|
1300
|
+
|
|
1331
1301
|
if not overwrite:
|
|
1332
1302
|
if os.path.isfile(output_fn):
|
|
1333
1303
|
print('Zip file {} exists, skipping'.format(output_fn))
|
|
1334
|
-
return
|
|
1335
|
-
|
|
1304
|
+
return
|
|
1305
|
+
|
|
1336
1306
|
if verbose:
|
|
1337
1307
|
print('Zipping {} to {} (compression level {})'.format(
|
|
1338
|
-
input_folder,output_fn,
|
|
1339
|
-
|
|
1308
|
+
input_folder,output_fn,compress_level))
|
|
1309
|
+
|
|
1340
1310
|
relative_filenames = recursive_file_list(input_folder,return_relative_paths=True)
|
|
1341
|
-
|
|
1311
|
+
|
|
1342
1312
|
with ZipFile(output_fn,'w',zipfile.ZIP_DEFLATED) as zipf:
|
|
1343
1313
|
for input_fn_relative in tqdm(relative_filenames,disable=(not verbose)):
|
|
1344
|
-
input_fn_abs = os.path.join(input_folder,input_fn_relative)
|
|
1314
|
+
input_fn_abs = os.path.join(input_folder,input_fn_relative)
|
|
1345
1315
|
zipf.write(input_fn_abs,
|
|
1346
1316
|
arcname=input_fn_relative,
|
|
1347
|
-
compresslevel=
|
|
1317
|
+
compresslevel=compress_level,
|
|
1348
1318
|
compress_type=zipfile.ZIP_DEFLATED)
|
|
1349
1319
|
|
|
1350
1320
|
return output_fn
|
|
1351
1321
|
|
|
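zip_folder is the whole-folder counterpart of zip_files_into_single_zipfile, defaulting to [input_folder].zip. A sketch (hypothetical paths):

    # Hypothetical example, not part of the diff
    zip_folder('/data/run1', overwrite=True, compress_level=9)    # writes /data/run1.zip
    zip_folder('/data/run2', output_fn='/backups/run2.zip', verbose=True)
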
-
-def parallel_zip_files(input_files,
-                       max_workers=16,
-                       use_threads=True,
-
-                       overwrite=False,
+
+def parallel_zip_files(input_files,
+                       max_workers=16,
+                       use_threads=True,
+                       compress_level=9,
+                       overwrite=False,
                        verbose=False):
     """
-    Zips one or more files to separate output files in parallel, leaving the
+    Zips one or more files to separate output files in parallel, leaving the
     original files in place. Each file is zipped to [filename].zip.
-
+
     Args:
-
+        input_files (str): list of files to zip
         max_workers (int, optional): number of concurrent workers, set to <= 1 to disable parallelism
         use_threads (bool, optional): whether to use threads (True) or processes (False); ignored if
             max_workers <= 1
-
+        compress_level (int, optional): zip compression level between 0 and 9
         overwrite (bool, optional): whether to overwrite an existing .tar file
         verbose (bool, optional): enable additional debug console output
     """
@@ -1379,23 +1349,27 @@ def parallel_zip_files(input_files,

     with tqdm(total=len(input_files)) as pbar:
         for i,_ in enumerate(pool.imap_unordered(partial(zip_file,
-            output_fn=None,overwrite=overwrite,verbose=verbose,
+            output_fn=None,overwrite=overwrite,verbose=verbose,compress_level=compress_level),
             input_files)):
             pbar.update()


-def parallel_zip_folders(input_folders,
-
+def parallel_zip_folders(input_folders,
+                         max_workers=16,
+                         use_threads=True,
+                         compress_level=9,
+                         overwrite=False,
+                         verbose=False):
     """
-    Zips one or more folders to separate output files in parallel, leaving the
+    Zips one or more folders to separate output files in parallel, leaving the
     original folders in place. Each folder is zipped to [folder_name].zip.
-
+
     Args:
-
+        input_folders (list): list of folders to zip
         max_workers (int, optional): number of concurrent workers, set to <= 1 to disable parallelism
         use_threads (bool, optional): whether to use threads (True) or processes (False); ignored if
             max_workers <= 1
-
+        compress_level (int, optional): zip compression level between 0 and 9
         overwrite (bool, optional): whether to overwrite an existing .tar file
         verbose (bool, optional): enable additional debug console output
     """
@@ -1406,47 +1380,53 @@ def parallel_zip_folders(input_folders, max_workers=16, use_threads=True,
         pool = ThreadPool(n_workers)
     else:
         pool = Pool(n_workers)
-
+
     with tqdm(total=len(input_folders)) as pbar:
         for i,_ in enumerate(pool.imap_unordered(
                 partial(zip_folder,overwrite=overwrite,
-
+                        compress_level=compress_level,verbose=verbose),
                 input_folders)):
             pbar.update()


-def zip_each_file_in_folder(folder_name,
-
+def zip_each_file_in_folder(folder_name,
+                            recursive=False,
+                            max_workers=16,
+                            use_threads=True,
+                            compress_level=9,
+                            overwrite=False,
+                            required_token=None,
+                            verbose=False,
                             exclude_zip=True):
     """
-    Zips each file in [folder_name] to its own zipfile (filename.zip), optionally recursing. To
+    Zips each file in [folder_name] to its own zipfile (filename.zip), optionally recursing. To
     zip a whole folder into a single zipfile, use zip_folder().
-
+
     Args:
         folder_name (str): the folder within which we should zip files
         recursive (bool, optional): whether to recurse within [folder_name]
         max_workers (int, optional): number of concurrent workers, set to <= 1 to disable parallelism
         use_threads (bool, optional): whether to use threads (True) or processes (False); ignored if
             max_workers <= 1
-
+        compress_level (int, optional): zip compression level between 0 and 9
         overwrite (bool, optional): whether to overwrite an existing .tar file
         required_token (str, optional): only zip files whose names contain this string
         verbose (bool, optional): enable additional debug console output
-        exclude_zip (bool, optional): skip files ending in .zip
+        exclude_zip (bool, optional): skip files ending in .zip
     """
-
+
     assert os.path.isdir(folder_name), '{} is not a folder'.format(folder_name)
-
+
     input_files = recursive_file_list(folder_name,recursive=recursive,return_relative_paths=False)
-
+
     if required_token is not None:
         input_files = [fn for fn in input_files if required_token in fn]
-
+
     if exclude_zip:
         input_files = [fn for fn in input_files if (not fn.endswith('.zip'))]
-
+
     parallel_zip_files(input_files=input_files,max_workers=max_workers,
-                       use_threads=use_threads,
+                       use_threads=use_threads,compress_level=compress_level,
                        overwrite=overwrite,verbose=verbose)

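zip_each_file_in_folder is essentially a filter-then-dispatch wrapper over parallel_zip_files; a sketch of a typical call (hypothetical folder and token):

    # Hypothetical example, not part of the diff: zip every .json file under a folder,
    # each to its own sibling .zip, on 4 worker threads
    zip_each_file_in_folder('/data/md_results',
                            recursive=True,
                            max_workers=4,
                            required_token='.json',
                            compress_level=9)
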
@@ -1454,16 +1434,16 @@ def unzip_file(input_file, output_folder=None):
     """
     Unzips a zipfile to the specified output folder, defaulting to the same location as
     the input file.
-
+
     Args:
         input_file (str): zipfile to unzip
         output_folder (str, optional): folder to which we should unzip [input_file], defaults
             to unzipping to the folder where [input_file] lives
     """
-
+
     if output_folder is None:
         output_folder = os.path.dirname(input_file)
-
+
     with zipfile.ZipFile(input_file, 'r') as zf:
         zf.extractall(output_folder)

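For completeness, the extraction side (hypothetical paths):

    # Hypothetical example, not part of the diff
    unzip_file('/data/run1.zip')                       # extracts next to the zipfile
    unzip_file('/data/run1.zip', '/tmp/run1_extract')  # extracts to an explicit folder
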
@@ -1473,31 +1453,33 @@ def unzip_file(input_file, output_folder=None):
 def compute_file_hash(file_path, algorithm='sha256', allow_failures=True):
     """
     Compute the hash of a file.
-
+
     Adapted from:
-
+
     https://www.geeksforgeeks.org/python-program-to-find-hash-of-file/
-
+
     Args:
         file_path (str): the file to hash
         algorithm (str, optional): the hashing algorithm to use (e.g. md5, sha256)
-
+        allow_failures (bool, optional): if True, read failures will silently return
+            None; if false, read failures will raise exceptions
+
     Returns:
         str: the hash value for this file
     """
-
+
     try:
-
+
         hash_func = hashlib.new(algorithm)
-
+
         with open(file_path, 'rb') as file:
             while chunk := file.read(8192): # Read the file in chunks of 8192 bytes
                 hash_func.update(chunk)
-
+
         return str(hash_func.hexdigest())
-
+
     except Exception:
-
+
         if allow_failures:
             return None
         else:
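The walrus-operator read loop above requires Python 3.8+, and hashlib.new() accepts any algorithm name in hashlib.algorithms_available. A small integrity-check sketch (hypothetical paths):

    # Hypothetical example, not part of the diff: verify that a copy matches its source
    src_hash = compute_file_hash('/data/a.jpg', algorithm='sha256')
    dst_hash = compute_file_hash('/backup/a.jpg', algorithm='sha256')
    assert src_hash is not None and src_hash == dst_hash, 'Copy verification failed'
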
@@ -1507,14 +1489,14 @@ def compute_file_hash(file_path, algorithm='sha256', allow_failures=True):


 def parallel_compute_file_hashes(filenames,
-                                 max_workers=16,
-                                 use_threads=True,
+                                 max_workers=16,
+                                 use_threads=True,
                                  recursive=True,
                                  algorithm='sha256',
                                  verbose=False):
     """
     Compute file hashes for a list or folder of images.
-
+
     Args:
         filenames (list or str): a list of filenames or a folder
         max_workers (int, optional): the number of parallel workers to use; set to <=1 to disable
@@ -1524,8 +1506,8 @@ def parallel_compute_file_hashes(filenames,
         algorithm (str, optional): the hashing algorithm to use (e.g. md5, sha256)
         recursive (bool, optional): if [filenames] is a folder, whether to enumerate recursively.
             Ignored if [filenames] is a list.
-        verbose (bool, optional): enable additional debug output
-
+        verbose (bool, optional): enable additional debug output
+
     Returns:
         dict: a dict mapping filenames to hash values; values will be None for files that fail
             to load.
@@ -1535,35 +1517,1142 @@ def parallel_compute_file_hashes(filenames,
         if verbose:
             print('Enumerating files in {}'.format(filenames))
         filenames = recursive_file_list(filenames,recursive=recursive,return_relative_paths=False)
-
+
     n_workers = min(max_workers,len(filenames))
-
+
     if verbose:
         print('Computing hashes for {} files on {} workers'.format(len(filenames),n_workers))
-
+
     if n_workers <= 1:
-
+
         results = []
         for filename in filenames:
             results.append(compute_file_hash(filename,algorithm=algorithm,allow_failures=True))
-
+
     else:
-
+
         if use_threads:
             pool = ThreadPool(n_workers)
         else:
             pool = Pool(n_workers)
-
+
         results = list(tqdm(pool.imap(
             partial(compute_file_hash,algorithm=algorithm,allow_failures=True),
             filenames), total=len(filenames)))
-
+
     assert len(filenames) == len(results), 'Internal error in parallel_compute_file_hashes'
-
+
     to_return = {}
     for i_file,filename in enumerate(filenames):
         to_return[filename] = results[i_file]
-
+
     return to_return

 # ...def parallel_compute_file_hashes(...)
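
A common use for the returned filename-to-hash dict is duplicate detection; a sketch (hypothetical folder):

    # Hypothetical example, not part of the diff: group files by hash to find duplicates
    from collections import defaultdict
    fn_to_hash = parallel_compute_file_hashes('/data/images', max_workers=8)
    hash_to_fns = defaultdict(list)
    for fn, h in fn_to_hash.items():
        if h is not None:
            hash_to_fns[h].append(fn)
    duplicates = {h: fns for h, fns in hash_to_fns.items() if len(fns) > 1}
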
+
+
+#%% Tests
+
+class TestPathUtils:
+    """
+    Tests for path_utils.py
+    """
+
+    def set_up(self):
+        """
+        Create a temporary directory for testing.
+        """
+
+        self.test_dir = make_test_folder(subfolder='megadetector/path_utils_tests')
+        os.makedirs(self.test_dir, exist_ok=True)
+
+
+    def tear_down(self):
+        """
+        Remove the temporary directory after tests.
+        """
+
+        if os.path.exists(self.test_dir):
+            shutil.rmtree(self.test_dir)
+
+
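Two caveats about the test class that this release introduces: pytest does not invoke set_up/tear_down automatically (its hooks are named setup_method/teardown_method), so these fixtures presumably rely on an external driver; and the try/except AssertionError pattern used in several tests below appears unable to fail, since the AssertionError it raises when the expected assertion is missing is caught by its own except clause. A driver sketch under those assumptions:

    # Hypothetical driver, not part of the diff: run one test with explicit setup/teardown
    t = TestPathUtils()
    t.set_up()
    try:
        t.test_is_image_file()
    finally:
        t.tear_down()
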
1579
|
+
def test_is_image_file(self):
|
|
1580
|
+
"""
|
|
1581
|
+
Test the is_image_file function.
|
|
1582
|
+
"""
|
|
1583
|
+
|
|
1584
|
+
assert is_image_file('test.jpg')
|
|
1585
|
+
assert is_image_file('test.jpeg')
|
|
1586
|
+
assert is_image_file('test.png')
|
|
1587
|
+
assert is_image_file('test.gif')
|
|
1588
|
+
assert is_image_file('test.bmp')
|
|
1589
|
+
assert is_image_file('test.tiff')
|
|
1590
|
+
assert is_image_file('test.TIF')
|
|
1591
|
+
assert not is_image_file('test.txt')
|
|
1592
|
+
assert not is_image_file('test.doc')
|
|
1593
|
+
assert is_image_file('path/to/image.JPG')
|
|
1594
|
+
assert not is_image_file('image')
|
|
1595
|
+
assert is_image_file('test.custom', img_extensions=['.custom'])
|
|
1596
|
+
assert not is_image_file('test.jpg', img_extensions=['.custom'])
|
|
1597
|
+
|
|
1598
|
+
|
|
1599
|
+
def test_find_image_strings(self):
|
|
1600
|
+
"""
|
|
1601
|
+
Test the find_image_strings function.
|
|
1602
|
+
"""
|
|
1603
|
+
|
|
1604
|
+
strings = ['a.jpg', 'b.txt', 'c.PNG', 'd.gif', 'e.jpeg', 'f.doc']
|
|
1605
|
+
expected = ['a.jpg', 'c.PNG', 'd.gif', 'e.jpeg']
|
|
1606
|
+
assert sorted(find_image_strings(strings)) == sorted(expected)
|
|
1607
|
+
assert find_image_strings([]) == []
|
|
1608
|
+
assert find_image_strings(['no_image.txt', 'another.doc']) == []
|
|
1609
|
+
|
|
1610
|
+
|
|
1611
|
+
def test_find_images(self):
|
|
1612
|
+
"""
|
|
1613
|
+
Test the find_images function.
|
|
1614
|
+
"""
|
|
1615
|
+
|
|
1616
|
+
# Create some dummy files
|
|
1617
|
+
img1_abs = os.path.join(self.test_dir, 'img1.jpg')
|
|
1618
|
+
img2_abs = os.path.join(self.test_dir, 'img2.PNG')
|
|
1619
|
+
txt1_abs = os.path.join(self.test_dir, 'text1.txt')
|
|
1620
|
+
open(img1_abs, 'w').close()
|
|
1621
|
+
open(img2_abs, 'w').close()
|
|
1622
|
+
open(txt1_abs, 'w').close()
|
|
1623
|
+
|
|
1624
|
+
subdir = os.path.join(self.test_dir, 'subdir')
|
|
1625
|
+
os.makedirs(subdir, exist_ok=True)
|
|
1626
|
+
img3_abs = os.path.join(subdir, 'img3.jpeg')
|
|
1627
|
+
txt2_abs = os.path.join(subdir, 'text2.txt')
|
|
1628
|
+
open(img3_abs, 'w').close()
|
|
1629
|
+
open(txt2_abs, 'w').close()
|
|
1630
|
+
|
|
1631
|
+
# Test non-recursive
|
|
1632
|
+
expected_non_recursive_abs = sorted([img1_abs.replace('\\', '/'), img2_abs.replace('\\', '/')])
|
|
1633
|
+
found_non_recursive_abs = find_images(self.test_dir, recursive=False, return_relative_paths=False)
|
|
1634
|
+
assert sorted(found_non_recursive_abs) == expected_non_recursive_abs
|
|
1635
|
+
|
|
1636
|
+
# Test non-recursive, relative paths
|
|
1637
|
+
expected_non_recursive_rel = sorted(['img1.jpg', 'img2.PNG'])
|
|
1638
|
+
found_non_recursive_rel = find_images(self.test_dir, recursive=False, return_relative_paths=True)
|
|
1639
|
+
assert sorted(found_non_recursive_rel) == expected_non_recursive_rel
|
|
1640
|
+
|
|
1641
|
+
# Test recursive
|
|
1642
|
+
expected_recursive_abs = sorted([
|
|
1643
|
+
img1_abs.replace('\\', '/'),
|
|
1644
|
+
img2_abs.replace('\\', '/'),
|
|
1645
|
+
img3_abs.replace('\\', '/')
|
|
1646
|
+
])
|
|
1647
|
+
found_recursive_abs = find_images(self.test_dir, recursive=True, return_relative_paths=False)
|
|
1648
|
+
assert sorted(found_recursive_abs) == expected_recursive_abs
|
|
1649
|
+
|
|
1650
|
+
# Test recursive, relative paths
|
|
1651
|
+
expected_recursive_rel = sorted([
|
|
1652
|
+
'img1.jpg',
|
|
1653
|
+
'img2.PNG',
|
|
1654
|
+
os.path.join('subdir', 'img3.jpeg').replace('\\', '/')
|
|
1655
|
+
])
|
|
1656
|
+
found_recursive_rel = find_images(self.test_dir, recursive=True, return_relative_paths=True)
|
|
1657
|
+
assert sorted(found_recursive_rel) == expected_recursive_rel
|
|
1658
|
+
|
|
1659
|
+
# Test with an empty directory
|
|
1660
|
+
empty_dir = os.path.join(self.test_dir, 'empty_dir')
|
|
1661
|
+
os.makedirs(empty_dir, exist_ok=True)
|
|
1662
|
+
assert find_images(empty_dir, recursive=True) == []
|
|
1663
|
+
|
|
1664
|
+
# Test with a directory that doesn't exist (should assert)
|
|
1665
|
+
try:
|
|
1666
|
+
find_images(os.path.join(self.test_dir, 'non_existent_dir'))
|
|
1667
|
+
raise AssertionError("AssertionError not raised for non_existent_dir")
|
|
1668
|
+
except AssertionError:
|
|
1669
|
+
pass
|
|
1670
|
+
|
|
1671
|
+
|
|
1672
|
+
def test_recursive_file_list_and_file_list(self):
|
|
1673
|
+
"""
|
|
1674
|
+
Test the recursive_file_list and file_list functions.
|
|
1675
|
+
"""
|
|
1676
|
+
|
|
1677
|
+
# Setup directory structure
|
|
1678
|
+
# test_dir/
|
|
1679
|
+
# file1.txt
|
|
1680
|
+
# file2.jpg
|
|
1681
|
+
# subdir1/
|
|
1682
|
+
# file3.txt
|
|
1683
|
+
# subsubdir/
|
|
1684
|
+
# file4.png
|
|
1685
|
+
# subdir2/
|
|
1686
|
+
# file5.doc
|
|
1687
|
+
|
|
1688
|
+
list_dir = os.path.join(self.test_dir,'recursive_list')
|
|
1689
|
+
|
|
1690
|
+
f1 = os.path.join(list_dir, 'file1.txt')
|
|
1691
|
+
f2 = os.path.join(list_dir, 'file2.jpg')
|
|
1692
|
+
subdir1 = os.path.join(list_dir, 'subdir1')
|
|
1693
|
+
os.makedirs(subdir1, exist_ok=True)
|
|
1694
|
+
f3 = os.path.join(subdir1, 'file3.txt')
|
|
1695
|
+
subsubdir = os.path.join(subdir1, 'subsubdir')
|
|
1696
|
+
os.makedirs(subsubdir, exist_ok=True)
|
|
1697
|
+
f4 = os.path.join(subsubdir, 'file4.png')
|
|
1698
|
+
subdir2 = os.path.join(list_dir, 'subdir2')
|
|
1699
|
+
os.makedirs(subdir2, exist_ok=True)
|
|
1700
|
+
f5 = os.path.join(subdir2, 'file5.doc')
|
|
1701
|
+
|
|
1702
|
+
for filepath in [f1, f2, f3, f4, f5]:
|
|
1703
|
+
with open(filepath, 'w') as f:
|
|
1704
|
+
f.write('test')
|
|
1705
|
+
|
|
1706
|
+
# Test recursive_file_list (recursive=True by default)
|
|
1707
|
+
expected_all_files_abs = sorted([
|
|
1708
|
+
f1.replace('\\', '/'), f2.replace('\\', '/'), f3.replace('\\', '/'),
|
|
1709
|
+
f4.replace('\\', '/'), f5.replace('\\', '/')
|
|
1710
|
+
])
|
|
1711
|
+
all_files_abs = recursive_file_list(list_dir, convert_slashes=True,
|
|
1712
|
+
return_relative_paths=False)
|
|
1713
|
+
assert sorted(all_files_abs) == expected_all_files_abs
|
|
1714
|
+
|
|
1715
|
+
# Test recursive_file_list with relative paths
|
|
1716
|
+
expected_all_files_rel = sorted([
|
|
1717
|
+
'file1.txt', 'file2.jpg',
|
|
1718
|
+
os.path.join('subdir1', 'file3.txt').replace('\\', '/'),
|
|
1719
|
+
os.path.join('subdir1', 'subsubdir', 'file4.png').replace('\\', '/'),
|
|
1720
|
+
os.path.join('subdir2', 'file5.doc').replace('\\', '/')
|
|
1721
|
+
])
|
|
1722
|
+
all_files_rel = recursive_file_list(list_dir, convert_slashes=True,
|
|
1723
|
+
return_relative_paths=True)
|
|
1724
|
+
assert sorted(all_files_rel) == expected_all_files_rel
|
|
1725
|
+
|
|
1726
|
+
# Test file_list (non-recursive by default via wrapper)
|
|
1727
|
+
expected_top_level_files_abs = sorted([f1.replace('\\', '/'), f2.replace('\\', '/')])
|
|
1728
|
+
top_level_files_abs = file_list(list_dir, convert_slashes=True,
|
|
1729
|
+
return_relative_paths=False, recursive=False)
|
|
1730
|
+
assert sorted(top_level_files_abs) == expected_top_level_files_abs
|
|
1731
|
+
|
|
1732
|
+
# Test file_list (recursive explicitly) - should be same as recursive_file_list
|
|
1733
|
+
recursive_via_file_list = file_list(list_dir, convert_slashes=True,
|
|
1734
|
+
return_relative_paths=False, recursive=True)
|
|
1735
|
+
assert sorted(recursive_via_file_list) == expected_all_files_abs
|
|
1736
|
+
|
|
1737
|
+
# Test with convert_slashes=False (use os.sep)
|
|
1738
|
+
#
|
|
1739
|
+
# Note: This test might be tricky if os.sep is '/', as no replacement happens. We'll check
|
|
1740
|
+
# that backslashes remain on Windows.
|
|
1741
|
+
if os.sep == '\\':
|
|
1742
|
+
f1_raw = os.path.join(list_dir, 'file1.txt')
|
|
1743
|
+
# Only one file for simplicity
|
|
1744
|
+
files_no_slash_conversion = file_list(list_dir, convert_slashes=False, recursive=False)
|
|
1745
|
+
assert any(f1_raw in s for s in files_no_slash_conversion)
|
|
1746
|
+
|
|
1747
|
+
# Test with an empty directory
|
|
1748
|
+
empty_dir = os.path.join(list_dir, "empty_dir_for_files")
|
|
1749
|
+
os.makedirs(empty_dir, exist_ok=True)
|
|
1750
|
+
assert recursive_file_list(empty_dir) == []
|
|
1751
|
+
assert file_list(empty_dir, recursive=False) == []
|
|
1752
|
+
|
|
1753
|
+
# Test with a non-existent directory
|
|
1754
|
+
try:
|
|
1755
|
+
recursive_file_list(os.path.join(list_dir, "non_existent_dir"))
|
|
1756
|
+
raise AssertionError("AssertionError not raised for non_existent_dir in recursive_file_list")
|
|
1757
|
+
except AssertionError:
|
|
1758
|
+
pass
|
|
1759
|
+
|
|
1760
|
+
|
|
1761
|
+
def test_folder_list(self):
|
|
1762
|
+
"""
|
|
1763
|
+
Test the folder_list function.
|
|
1764
|
+
"""
|
|
1765
|
+
|
|
1766
|
+
# Setup directory structure
|
|
1767
|
+
# test_dir/
|
|
1768
|
+
# subdir1/
|
|
1769
|
+
# subsubdir1/
|
|
1770
|
+
# subdir2/
|
|
1771
|
+
# file1.txt (should be ignored)
|
|
1772
|
+
|
|
1773
|
+
folder_list_dir = os.path.join(self.test_dir,'folder_list')
|
|
1774
|
+
|
|
1775
|
+
subdir1 = os.path.join(folder_list_dir, 'subdir1')
|
|
1776
|
+
subsubdir1 = os.path.join(subdir1, 'subsubdir1')
|
|
1777
|
+
subdir2 = os.path.join(folder_list_dir, 'subdir2')
|
|
1778
|
+
os.makedirs(subdir1, exist_ok=True)
|
|
1779
|
+
os.makedirs(subsubdir1, exist_ok=True)
|
|
1780
|
+
os.makedirs(subdir2, exist_ok=True)
|
|
1781
|
+
with open(os.path.join(folder_list_dir, 'file1.txt'), 'w') as f:
|
|
1782
|
+
f.write('test')
|
|
1783
|
+
|
|
1784
|
+
# Test non-recursive
|
|
1785
|
+
expected_folders_non_recursive_abs = sorted([
|
|
1786
|
+
subdir1.replace('\\', '/'), subdir2.replace('\\', '/')
|
|
1787
|
+
])
|
|
1788
|
+
folders_non_recursive_abs = folder_list(folder_list_dir, recursive=False,
|
|
1789
|
+
return_relative_paths=False)
|
|
1790
|
+
assert sorted(folders_non_recursive_abs) == expected_folders_non_recursive_abs
|
|
1791
|
+
|
|
1792
|
+
# Test non-recursive, relative paths
|
|
1793
|
+
expected_folders_non_recursive_rel = sorted(['subdir1', 'subdir2'])
|
|
1794
|
+
folders_non_recursive_rel = folder_list(folder_list_dir, recursive=False,
|
|
1795
|
+
return_relative_paths=True)
|
|
1796
|
+
assert sorted(folders_non_recursive_rel) == expected_folders_non_recursive_rel
|
|
1797
|
+
|
|
1798
|
+
# Test recursive
|
|
1799
|
+
expected_folders_recursive_abs = sorted([
|
|
1800
|
+
subdir1.replace('\\', '/'),
|
|
1801
|
+
subsubdir1.replace('\\', '/'),
|
|
1802
|
+
subdir2.replace('\\', '/')
|
|
1803
|
+
])
|
|
1804
|
+
folders_recursive_abs = folder_list(folder_list_dir, recursive=True,
|
|
1805
|
+
return_relative_paths=False)
|
|
1806
|
+
assert sorted(folders_recursive_abs) == expected_folders_recursive_abs
|
|
1807
|
+
|
|
1808
|
+
# Test recursive, relative paths
|
|
1809
|
+
expected_folders_recursive_rel = sorted([
|
|
1810
|
+
'subdir1',
|
|
1811
|
+
os.path.join('subdir1', 'subsubdir1').replace('\\', '/'),
|
|
1812
|
+
'subdir2'
|
|
1813
|
+
])
|
|
1814
|
+
folders_recursive_rel = folder_list(folder_list_dir, recursive=True,
|
|
1815
|
+
return_relative_paths=True)
|
|
1816
|
+
assert sorted(folders_recursive_rel) == expected_folders_recursive_rel
|
|
1817
|
+
|
|
1818
|
+
# Test with an empty directory (except for the file)
|
|
1819
|
+
empty_dir_for_folders = os.path.join(folder_list_dir, "empty_for_folders")
|
|
1820
|
+
os.makedirs(empty_dir_for_folders, exist_ok=True)
|
|
1821
|
+
with open(os.path.join(empty_dir_for_folders, 'temp.txt'), 'w') as f: f.write('t')
|
|
1822
|
+
assert folder_list(empty_dir_for_folders, recursive=True) == []
|
|
1823
|
+
assert folder_list(empty_dir_for_folders, recursive=False) == []
|
|
1824
|
+
|
|
1825
|
+
# Test with a non-existent directory
|
|
1826
|
+
try:
|
|
1827
|
+
folder_list(os.path.join(self.test_dir, "non_existent_dir"))
|
|
1828
|
+
raise AssertionError("AssertionError not raised for non_existent_dir in folder_list")
|
|
1829
|
+
except AssertionError:
|
|
1830
|
+
pass
|
|
1831
|
+
|
|
1832
|
+
|
|
1833
|
+
def test_folder_summary(self):
|
|
1834
|
+
"""
|
|
1835
|
+
Test the folder_summary function.
|
|
1836
|
+
"""
|
|
1837
|
+
|
|
1838
|
+
# test_dir/
|
|
1839
|
+
# file1.txt
|
|
1840
|
+
# img1.jpg
|
|
1841
|
+
# subdir/
|
|
1842
|
+
# file2.txt
|
|
1843
|
+
# img2.png
|
|
1844
|
+
# img3.png
|
|
1845
|
+
|
|
1846
|
+
folder_summary_dir = os.path.join(self.test_dir,'folder_summary')
|
|
1847
|
+
|
|
1848
|
+
f1 = os.path.join(folder_summary_dir, 'file1.txt')
|
|
1849
|
+
img1 = os.path.join(folder_summary_dir, 'img1.jpg')
|
|
1850
|
+
subdir = os.path.join(folder_summary_dir, 'subdir')
|
|
1851
|
+
os.makedirs(subdir, exist_ok=True)
|
|
1852
|
+
f2 = os.path.join(subdir, 'file2.txt')
|
|
1853
|
+
img2 = os.path.join(subdir, 'img2.png')
|
|
1854
|
+
img3 = os.path.join(subdir, 'img3.png')
|
|
1855
|
+
|
|
1856
|
+
for filepath in [f1, img1, f2, img2, img3]:
|
|
1857
|
+
with open(filepath, 'w') as f:
|
|
1858
|
+
f.write('test')
|
|
1859
|
+
|
|
1860
|
+
summary = folder_summary(folder_summary_dir, print_summary=False)
|
|
1861
|
+
|
|
1862
|
+
assert summary['n_files'] == 5
|
|
1863
|
+
assert summary['n_folders'] == 1 # 'subdir'
|
|
1864
|
+
assert summary['extension_to_count']['.txt'] == 2
|
|
1865
|
+
assert summary['extension_to_count']['.jpg'] == 1
|
|
1866
|
+
assert summary['extension_to_count']['.png'] == 2
|
|
1867
|
+
|
|
1868
|
+
# Check order (sorted by value, desc)
|
|
1869
|
+
#
|
|
1870
|
+
# The specific order of keys with the same counts can vary based on file system list
|
|
1871
|
+
# order. We'll check that the counts are correct and the number of unique extensions is
|
|
1872
|
+
# right.
|
|
1873
|
+
assert len(summary['extension_to_count']) == 3
|
|
1874
|
+
|
|
1875
|
+
|
|
1876
|
+
empty_dir = os.path.join(folder_summary_dir, "empty_summary_dir")
|
|
1877
|
+
os.makedirs(empty_dir, exist_ok=True)
|
|
1878
|
+
empty_summary = folder_summary(empty_dir, print_summary=False)
|
|
1879
|
+
assert empty_summary['n_files'] == 0
|
|
1880
|
+
assert empty_summary['n_folders'] == 0
|
|
1881
|
+
assert empty_summary['extension_to_count'] == {}
|
|
1882
|
+
|
|
1883
|
+
|
|
1884
|
+
def test_fileparts(self):
|
|
1885
|
+
"""
|
|
1886
|
+
Test the fileparts function.
|
|
1887
|
+
"""
|
|
1888
|
+
|
|
1889
|
+
assert fileparts('file') == ('', 'file', '')
|
|
1890
|
+
assert fileparts('file.txt') == ('', 'file', '.txt')
|
|
1891
|
+
assert fileparts(r'c:/dir/file.jpg') == ('c:/dir', 'file', '.jpg')
|
|
1892
|
+
assert fileparts('/dir/subdir/file.jpg') == ('/dir/subdir', 'file', '.jpg')
|
|
1893
|
+
assert fileparts(r'c:\dir\file') == (r'c:\dir', 'file', '')
|
|
1894
|
+
assert fileparts(r'c:\dir\file.tar.gz') == (r'c:\dir', 'file.tar', '.gz')
|
|
1895
|
+
assert fileparts('.bashrc') == ('', '.bashrc', '') # Hidden file, no extension
|
|
1896
|
+
assert fileparts('nodir/.bashrc') == ('nodir', '.bashrc', '')
|
|
1897
|
+
assert fileparts('a/b/c.d.e') == ('a/b', 'c.d', '.e')
|
|
1898
|
+
|
|
1899
|
+
|
|
1900
|
+
def test_insert_before_extension(self):
|
|
1901
|
+
"""
|
|
1902
|
+
Test the insert_before_extension function.
|
|
1903
|
+
"""
|
|
1904
|
+
|
|
1905
|
+
assert insert_before_extension('file.ext', 'inserted') == 'file.inserted.ext'
|
|
1906
|
+
assert insert_before_extension('file', 'inserted') == 'file.inserted'
|
|
1907
|
+
assert insert_before_extension('path/to/file.ext', 'tag') == 'path/to/file.tag.ext'
|
|
1908
|
+
assert insert_before_extension('path/to/file', 'tag') == 'path/to/file.tag'
|
|
1909
|
+
assert insert_before_extension('file.tar.gz', 'new') == 'file.tar.new.gz'
|
|
1910
|
+
|
|
1911
|
+
# Test with custom separator
|
|
1912
|
+
assert insert_before_extension('file.ext', 'inserted', separator='_') == 'file_inserted.ext'
|
|
1913
|
+
|
|
1914
|
+
# Test with s=None (timestamp) - check format roughly
|
|
1915
|
+
fname_with_ts = insert_before_extension('file.ext', None)
|
|
1916
|
+
parts = fname_with_ts.split('.')
|
|
1917
|
+
# file.YYYY.MM.DD.HH.MM.SS.ext
|
|
1918
|
+
assert len(parts) >= 8 # file, Y, M, D, H, M, S, ext
|
|
1919
|
+
assert parts[0] == 'file'
|
|
1920
|
+
assert parts[-1] == 'ext'
|
|
1921
|
+
assert all(p.isdigit() for p in parts[1:-1])
|
|
1922
|
+
|
|
1923
|
+
fname_no_ext_ts = insert_before_extension('file', '') # s is empty string, should also use timestamp
|
|
1924
|
+
parts_no_ext = fname_no_ext_ts.split('.')
|
|
1925
|
+
assert len(parts_no_ext) >= 7 # file, Y, M, D, H, M, S
|
|
1926
|
+
assert parts_no_ext[0] == 'file'
|
|
1927
|
+
assert all(p.isdigit() for p in parts_no_ext[1:])
|
|
1928
|
+
|
|
1929
|
+
|
|
1930
|
+
def test_split_path(self):
|
|
1931
|
+
"""
|
|
1932
|
+
Test the split_path function.
|
|
1933
|
+
"""
|
|
1934
|
+
|
|
1935
|
+
if os.name == 'nt':
|
|
1936
|
+
assert split_path(r'c:\dir\subdir\file.txt') == ['c:\\', 'dir', 'subdir', 'file.txt']
|
|
1937
|
+
assert split_path('c:\\') == ['c:\\']
|
|
1938
|
+
# Test with mixed slashes, ntpath.split handles them
|
|
1939
|
+
assert split_path(r'c:/dir/subdir/file.txt') == ['c:/', 'dir', 'subdir', 'file.txt']
|
|
1940
|
+
else: # POSIX
|
|
1941
|
+
assert split_path('/dir/subdir/file.jpg') == ['/', 'dir', 'subdir', 'file.jpg']
|
|
1942
|
+
assert split_path('/') == ['/']
|
|
1943
|
+
|
|
1944
|
+
assert split_path('dir/file.txt') == ['dir', 'file.txt']
|
|
1945
|
+
assert split_path('file.txt') == ['file.txt']
|
|
1946
|
+
assert split_path('') == ''
|
|
1947
|
+
assert split_path('.') == ['.']
|
|
1948
|
+
assert split_path('..') == ['..']
|
|
1949
|
+
assert split_path('../a/b') == ['..', 'a', 'b']
|
|
1950
|
+
|
|
1951
|
+
|
|
1952
|
+
def test_path_is_abs(self):
|
|
1953
|
+
"""
|
|
1954
|
+
Test the path_is_abs function.
|
|
1955
|
+
"""
|
|
1956
|
+
|
|
1957
|
+
assert path_is_abs('/absolute/path')
|
|
1958
|
+
assert path_is_abs('c:/absolute/path')
|
|
1959
|
+
assert path_is_abs('C:\\absolute\\path')
|
|
1960
|
+
assert path_is_abs('\\\\server\\share\\path') # UNC path
|
|
1961
|
+
assert path_is_abs('c:file_without_slash_after_drive')
|
|
1962
|
+
|
|
1963
|
+
assert not path_is_abs('relative/path')
|
|
1964
|
+
assert not path_is_abs('file.txt')
|
|
1965
|
+
assert not path_is_abs('../relative')
|
|
1966
|
+
assert not path_is_abs('')
|
|
1967
|
+
|
|
1968
|
+
|
|
1969
|
+
|
|
1970
|
+
def test_safe_create_link_unix(self):
|
|
1971
|
+
"""
|
|
1972
|
+
Test the safe_create_link function on Unix-like systems.
|
|
1973
|
+
"""
|
|
1974
|
+
|
|
1975
|
+
if os.name == 'nt':
|
|
1976
|
+
# print("Skipping test_safe_create_link_unix on Windows.")
|
|
1977
|
+
return
|
|
1978
|
+
|
|
1979
|
+
source_file_path = os.path.join(self.test_dir, 'source.txt')
|
|
1980
|
+
link_path = os.path.join(self.test_dir, 'link.txt')
|
|
1981
|
+
other_source_path = os.path.join(self.test_dir, 'other_source.txt')
|
|
1982
|
+
|
|
1983
|
+
with open(source_file_path, 'w') as f:
|
|
1984
|
+
f.write('source data')
|
|
1985
|
+
with open(other_source_path, 'w') as f:
|
|
1986
|
+
f.write('other data')
|
|
1987
|
+
|
|
1988
|
+
# Create new link
|
|
1989
|
+
safe_create_link(source_file_path, link_path)
|
|
1990
|
+
assert os.path.islink(link_path)
|
|
1991
|
+
assert os.readlink(link_path) == source_file_path
|
|
1992
|
+
|
|
1993
|
+
# Link already exists and points to the correct source
|
|
1994
|
+
safe_create_link(source_file_path, link_path) # Should do nothing
|
|
1995
|
+
assert os.path.islink(link_path)
|
|
1996
|
+
assert os.readlink(link_path) == source_file_path
|
|
1997
|
+
|
|
1998
|
+
# Link already exists but points to a different source
|
|
1999
|
+
safe_create_link(other_source_path, link_path) # Should remove and re-create
|
|
2000
|
+
assert os.path.islink(link_path)
|
|
2001
|
+
assert os.readlink(link_path) == other_source_path
|
|
2002
|
+
|
|
2003
|
+
# Link_new path exists and is a file (not a link)
|
|
2004
|
+
file_path_conflict = os.path.join(self.test_dir, 'conflict_file.txt')
|
|
2005
|
+
with open(file_path_conflict, 'w') as f:
|
|
2006
|
+
f.write('actual file')
|
|
2007
|
+
try:
|
|
2008
|
+
safe_create_link(source_file_path, file_path_conflict)
|
|
2009
|
+
raise AssertionError("AssertionError not raised for file conflict")
|
|
2010
|
+
except AssertionError:
|
|
2011
|
+
pass
|
|
2012
|
+
os.remove(file_path_conflict)
|
|
2013
|
+
|
|
2014
|
+
# Link_new path exists and is a directory
|
|
2015
|
+
dir_path_conflict = os.path.join(self.test_dir, 'conflict_dir')
|
|
2016
|
+
os.makedirs(dir_path_conflict, exist_ok=True)
|
|
2017
|
+
try:
|
|
2018
|
+
safe_create_link(source_file_path, dir_path_conflict)
|
|
2019
|
+
raise AssertionError("AssertionError not raised for directory conflict")
|
|
2020
|
+
except AssertionError: # islink will be false
|
|
2021
|
+
pass
|
|
2022
|
+
shutil.rmtree(dir_path_conflict)
|
|
2023
|
+
|
|
2024
|
+
|
|
2025
|
+
def test_remove_empty_folders(self):
|
|
2026
|
+
"""
|
|
2027
|
+
Test the remove_empty_folders function.
|
|
2028
|
+
"""
|
|
2029
|
+
|
|
2030
|
+
# test_dir/
|
|
2031
|
+
# empty_top/
|
|
2032
|
+
# empty_mid/
|
|
2033
|
+
# empty_leaf/
|
|
2034
|
+
# mixed_top/
|
|
2035
|
+
# empty_mid_in_mixed/
|
|
2036
|
+
# empty_leaf_in_mixed/
|
|
2037
|
+
# non_empty_mid/
|
|
2038
|
+
# file.txt
|
|
2039
|
+
# non_empty_top/
|
|
2040
|
+
# file_in_top.txt
|
|
2041
|
+
|
|
2042
|
+
empty_top = os.path.join(self.test_dir, 'empty_top')
|
|
2043
|
+
empty_mid = os.path.join(empty_top, 'empty_mid')
|
|
2044
|
+
empty_leaf = os.path.join(empty_mid, 'empty_leaf')
|
|
2045
|
+
os.makedirs(empty_leaf, exist_ok=True)
|
|
2046
|
+
|
|
2047
|
+
mixed_top = os.path.join(self.test_dir, 'mixed_top')
|
|
2048
|
+
empty_mid_in_mixed = os.path.join(mixed_top, 'empty_mid_in_mixed')
|
|
2049
|
+
empty_leaf_in_mixed = os.path.join(empty_mid_in_mixed, 'empty_leaf_in_mixed')
|
|
2050
|
+
os.makedirs(empty_leaf_in_mixed, exist_ok=True)
|
|
2051
|
+
non_empty_mid = os.path.join(mixed_top, 'non_empty_mid')
|
|
2052
|
+
os.makedirs(non_empty_mid, exist_ok=True)
|
|
2053
|
+
with open(os.path.join(non_empty_mid, 'file.txt'), 'w') as f:
|
|
2054
|
+
f.write('data')
|
|
2055
|
+
|
|
2056
|
+
non_empty_top = os.path.join(self.test_dir, 'non_empty_top')
|
|
2057
|
+
os.makedirs(non_empty_top, exist_ok=True)
|
|
2058
|
+
with open(os.path.join(non_empty_top, 'file_in_top.txt'), 'w') as f:
|
|
2059
|
+
f.write('data')
|
|
2060
|
+
|
|
2061
|
+
# Process empty_top - should remove all three
|
|
2062
|
+
remove_empty_folders(empty_top, remove_root=True)
|
|
2063
|
+
assert not os.path.exists(empty_top)
|
|
2064
|
+
assert not os.path.exists(empty_mid)
|
|
2065
|
+
assert not os.path.exists(empty_leaf)
|
|
2066
|
+
|
|
2067
|
+
# Process mixed_top; should remove empty_leaf_in_mixed and empty_mid_in_mixed
|
|
2068
|
+
# but not mixed_top or non_empty_mid.
|
|
2069
|
+
remove_empty_folders(mixed_top, remove_root=True)
|
|
2070
|
+
assert os.path.exists(mixed_top) # mixed_top itself should remain
|
|
2071
|
+
assert not os.path.exists(empty_mid_in_mixed)
|
|
2072
|
+
assert not os.path.exists(empty_leaf_in_mixed)
|
|
2073
|
+
assert os.path.exists(non_empty_mid)
|
|
2074
|
+
assert os.path.exists(os.path.join(non_empty_mid, 'file.txt'))
|
|
2075
|
+
|
|
2076
|
+
# Process non_empty_top; should remove nothing.
|
|
2077
|
+
remove_empty_folders(non_empty_top, remove_root=True)
|
|
2078
|
+
assert os.path.exists(non_empty_top)
|
|
2079
|
+
assert os.path.exists(os.path.join(non_empty_top, 'file_in_top.txt'))
|
|
2080
|
+
|
|
2081
|
+
# Test with a file path (should do nothing and return False)
|
|
2082
|
+
file_path_for_removal = os.path.join(self.test_dir, 'a_file.txt')
|
|
2083
|
+
with open(file_path_for_removal, 'w') as f: f.write('t')
|
|
2084
|
+
assert not remove_empty_folders(file_path_for_removal, remove_root=True)
|
|
2085
|
+
assert os.path.exists(file_path_for_removal)
|
|
2086
|
+
|
|
2087
|
+
# Test with remove_root=False for the top level
|
|
2088
|
+
another_empty_top = os.path.join(self.test_dir, 'another_empty_top')
|
|
2089
|
+
another_empty_mid = os.path.join(another_empty_top, 'another_empty_mid')
|
|
2090
|
+
os.makedirs(another_empty_mid)
|
|
2091
|
+
remove_empty_folders(another_empty_top, remove_root=False)
|
|
2092
|
+
assert os.path.exists(another_empty_top) # Root not removed
|
|
2093
|
+
assert not os.path.exists(another_empty_mid) # Mid removed
|
|
2094
|
+
|
|
2095
|
+
|
|
2096
|
+
def test_path_join(self):
|
|
2097
|
+
"""
|
|
2098
|
+
Test the path_join function.
|
|
2099
|
+
"""
|
|
2100
|
+
|
|
2101
|
+
assert path_join('a', 'b', 'c') == 'a/b/c'
|
|
2102
|
+
assert path_join('a/b', 'c', 'd.txt') == 'a/b/c/d.txt'
|
|
2103
|
+
if os.name == 'nt':
|
|
2104
|
+
# On Windows, os.path.join uses '\', so convert_slashes=True should change it
|
|
2105
|
+
assert path_join('a', 'b', convert_slashes=True) == 'a/b'
|
|
2106
|
+
assert path_join('a', 'b', convert_slashes=False) == 'a\\b'
|
|
2107
|
+
assert path_join('c:\\', 'foo', 'bar', convert_slashes=True) == 'c:/foo/bar'
|
|
2108
|
+
assert path_join('c:\\', 'foo', 'bar', convert_slashes=False) == 'c:\\foo\\bar'
|
|
2109
|
+
else:
|
|
2110
|
+
# On POSIX, os.path.join uses '/', so convert_slashes=False should still be '/'
|
|
2111
|
+
assert path_join('a', 'b', convert_slashes=False) == 'a/b'
|
|
2112
|
+
|
|
2113
|
+
assert path_join('a', '', 'b') == 'a/b' # os.path.join behavior
|
|
2114
|
+
assert path_join('/a', 'b') == '/a/b'
|
|
2115
|
+
assert path_join('a', '/b') == '/b' # '/b' is absolute
|
|
2116
|
+
|
|
2117
|
+
|
|
2118
|
+
def test_filename_cleaning(self):
|
|
2119
|
+
"""
|
|
2120
|
+
Test clean_filename, clean_path, and flatten_path functions.
|
|
2121
|
+
"""
|
|
2122
|
+
|
|
2123
|
+
# clean_filename
|
|
2124
|
+
assert clean_filename("test file.txt") == "test file.txt"
|
|
2125
|
+
assert clean_filename("test*file?.txt", char_limit=10) == "testfile.t"
|
|
2126
|
+
assert clean_filename("TestFile.TXT", force_lower=True) == "testfile.txt"
|
|
2127
|
+
assert clean_filename("file:with<illegal>chars.txt") == "filewithillegalchars.txt"
|
|
2128
|
+
assert clean_filename(" accented_name_éà.txt") == " accented_name_ea.txt"
|
|
2129
|
+
|
|
2130
|
+
# Separators are not allowed by default in clean_filename
|
|
2131
|
+
assert clean_filename("path/to/file.txt") == "pathtofile.txt"
|
|
2132
|
+
|
|
2133
|
+
# clean_path
|
|
2134
|
+
assert clean_path("path/to/file.txt") == "path/to/file.txt" # slashes allowed
|
|
2135
|
+
assert clean_path("path\\to\\file.txt") == "path\\to\\file.txt" # backslashes allowed
|
|
2136
|
+
assert clean_path("path:to:file.txt") == "path:to:file.txt" # colons allowed
|
|
2137
|
+
assert clean_path("path/to<illegal>/file.txt") == "path/toillegal/file.txt"
|
|
2138
|
+
|
|
2139
|
+
# flatten_path
|
|
2140
|
+
assert flatten_path("path/to/file.txt") == "path~to~file.txt"
|
|
2141
|
+
assert flatten_path("path:to:file.txt", separator_char_replacement='_') == "path_to_file.txt"
|
|
2142
|
+
assert flatten_path("path\\to/file:name.txt") == "path~to~file~name.txt"
|
|
2143
|
+
assert flatten_path("path/to<illegal>/file.txt") == "path~toillegal~file.txt"
|
|
2144
|
+
|
|
2145
|
+
|
|
2146
|
+
def test_is_executable(self):
|
|
2147
|
+
"""
|
|
2148
|
+
Test the is_executable function.
|
|
2149
|
+
This is a basic test; comprehensive testing is environment-dependent.
|
|
2150
|
+
"""
|
|
2151
|
+
|
|
2152
|
+
# Hard to test reliably across all systems without knowing what's on PATH.
|
|
2153
|
+
if os.name == 'nt':
|
|
2154
|
+
assert is_executable('cmd.exe')
|
|
2155
|
+
assert not is_executable('non_existent_executable_blah_blah')
|
|
2156
|
+
else:
|
|
2157
|
+
assert is_executable('ls')
|
|
2158
|
+
assert is_executable('sh')
|
|
2159
|
+
assert not is_executable('non_existent_executable_blah_blah')
|
|
2160
|
+
|
|
2161
|
+
|
|
2162
|
+
def test_write_read_list_to_file(self):
|
|
2163
|
+
"""
|
|
2164
|
+
Test write_list_to_file and read_list_from_file functions.
|
|
2165
|
+
"""
|
|
2166
|
+
|
|
2167
|
+
test_list = ["item1", "item2 with space", "item3/with/slash"]
|
|
2168
|
+
|
|
2169
|
+
# Test with .json
|
|
2170
|
+
json_file_path = os.path.join(self.test_dir, "test_list.json")
|
|
2171
|
+
write_list_to_file(json_file_path, test_list)
|
|
2172
|
+
read_list_json = read_list_from_file(json_file_path)
|
|
2173
|
+
assert test_list == read_list_json
|
|
2174
|
+
|
|
2175
|
+
# Test with .txt
|
|
2176
|
+
txt_file_path = os.path.join(self.test_dir, "test_list.txt")
|
|
2177
|
+
write_list_to_file(txt_file_path, test_list)
|
|
2178
|
+
# read_list_from_file is specifically for JSON, so we read .txt manually
|
|
2179
|
+
with open(txt_file_path, 'r') as f:
|
|
2180
|
+
read_list_txt = [line.strip() for line in f.readlines()]
|
|
2181
|
+
assert test_list == read_list_txt
|
|
2182
|
+
|
|
2183
|
+
# Test reading non-existent json
|
|
2184
|
+
try:
|
|
2185
|
+
read_list_from_file(os.path.join(self.test_dir,"non_existent.json"))
|
|
2186
|
+
raise AssertionError("FileNotFoundError not raised")
|
|
2187
|
+
except FileNotFoundError:
|
|
2188
|
+
pass
|
|
2189
|
+
|
|
2190
|
+
# Test reading a non-json file with read_list_from_file (should fail parsing)
|
|
2191
|
+
non_json_path = os.path.join(self.test_dir, "not_a_list.json")
|
|
2192
|
+
with open(non_json_path, 'w') as f: f.write("this is not json")
|
|
2193
|
+
try:
|
|
2194
|
+
read_list_from_file(non_json_path)
|
|
2195
|
+
raise AssertionError("json.JSONDecodeError not raised")
|
|
2196
|
+
except json.JSONDecodeError:
|
|
2197
|
+
pass
|
|
2198
|
+
|
|
2199
|
+
|
|
2200
|
+
+    def test_parallel_copy_files(self):
+        """
+        Test the parallel_copy_files function (with max_workers=1 for test simplicity).
+        """
+
+        source_dir = os.path.join(self.test_dir, "copy_source")
+        target_dir = os.path.join(self.test_dir, "copy_target")
+        os.makedirs(source_dir, exist_ok=True)
+
+        file_mappings = {}
+        source_files_content = {}
+
+        for i in range(3):
+            src_fn = f"file{i}.txt"
+            src_path = os.path.join(source_dir, src_fn)
+            if i == 0:
+                tgt_fn = f"copied_file{i}.txt"
+                tgt_path = os.path.join(target_dir, tgt_fn)
+            else:
+                tgt_fn = f"copied_file{i}_subdir.txt"
+                tgt_path = os.path.join(target_dir, f"sub{i}", tgt_fn)
+
+            content = f"content of file {i}"
+            with open(src_path, 'w') as f:
+                f.write(content)
+
+            file_mappings[src_path] = tgt_path
+            source_files_content[tgt_path] = content
+
+        # Test copy
+        parallel_copy_files(file_mappings, max_workers=1, use_threads=True, overwrite=False)
+        for tgt_path, expected_content in source_files_content.items():
+            assert os.path.exists(tgt_path)
+            with open(tgt_path, 'r') as f:
+                assert f.read() == expected_content
+
+        # Verify that overwrite=False leaves an existing target file untouched...
+        existing_target_path = list(source_files_content.keys())[0]
+        with open(existing_target_path, 'w') as f:
+            f.write("old content")
+
+        parallel_copy_files(file_mappings, max_workers=1, use_threads=True, overwrite=False)
+        with open(existing_target_path, 'r') as f:
+            assert f.read() == "old content"
+
+        # ...and that overwrite=True replaces it
+        parallel_copy_files(file_mappings, max_workers=1, use_threads=True, overwrite=True)
+        with open(existing_target_path, 'r') as f:
+            assert f.read() == source_files_content[existing_target_path]
+
+        # Re-create source files for the move test
+        for src_path_orig, tgt_path_orig in file_mappings.items():
+            with open(src_path_orig, 'w') as f:
+                f.write(source_files_content[tgt_path_orig])
+
+        parallel_copy_files(file_mappings, max_workers=1, use_threads=True, move=True, overwrite=True)
+        for src_path, tgt_path in file_mappings.items():
+            assert not os.path.exists(src_path)
+            assert os.path.exists(tgt_path)
+            with open(tgt_path, 'r') as f:
+                assert f.read() == source_files_content[tgt_path]
+
+
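parallel_copy_files takes a flat source-to-target mapping and handles target-folder creation, overwrite policy, and optional move semantics, as the test above demonstrates. A hypothetical usage sketch (the paths are illustrative, and the import path is an assumption):

    from megadetector.utils.path_utils import parallel_copy_files

    # Copy camera-trap images into a new layout; target folders are created
    # as needed, and existing targets are preserved with overwrite=False.
    mappings = {
        '/data/raw/cam01/IMG0001.JPG': '/data/sorted/cam01/IMG0001.JPG',
        '/data/raw/cam02/IMG0042.JPG': '/data/sorted/cam02/IMG0042.JPG'
    }
    parallel_copy_files(mappings, max_workers=8, use_threads=True, overwrite=False)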
+    def test_get_file_sizes(self):
+        """
+        Test get_file_sizes and parallel_get_file_sizes functions.
+        """
+
+        file_sizes_test_dir = os.path.join(self.test_dir, 'file_sizes')
+        os.makedirs(file_sizes_test_dir, exist_ok=True)
+
+        f1_path = os.path.join(file_sizes_test_dir, 'file1.txt')
+        content1 = "0123456789" # 10 bytes
+        with open(f1_path, 'w') as f:
+            f.write(content1)
+
+        subdir_path = os.path.join(file_sizes_test_dir, 'subdir')
+        os.makedirs(subdir_path, exist_ok=True)
+        f2_path = os.path.join(subdir_path, 'file2.txt')
+        content2 = "01234567890123456789" # 20 bytes
+        with open(f2_path, 'w') as f:
+            f.write(content2)
+
+        sizes_relative = get_file_sizes(file_sizes_test_dir)
+        expected_sizes_relative = {
+            'file1.txt': len(content1),
+            os.path.join('subdir', 'file2.txt').replace('\\', '/'): len(content2)
+        }
+        assert sizes_relative == expected_sizes_relative
+
+        file_list_abs = [f1_path, f2_path]
+        sizes_parallel_abs = parallel_get_file_sizes(file_list_abs, max_workers=1)
+        expected_sizes_parallel_abs = {
+            f1_path.replace('\\', '/'): len(content1),
+            f2_path.replace('\\', '/'): len(content2)
+        }
+        assert sizes_parallel_abs == expected_sizes_parallel_abs
+
+        sizes_parallel_folder_abs = parallel_get_file_sizes(file_sizes_test_dir,
+                                                            max_workers=1,
+                                                            return_relative_paths=False)
+        assert sizes_parallel_folder_abs == expected_sizes_parallel_abs
+
+        sizes_parallel_folder_rel = parallel_get_file_sizes(file_sizes_test_dir,
+                                                            max_workers=1,
+                                                            return_relative_paths=True)
+        assert sizes_parallel_folder_rel == expected_sizes_relative
+
+        non_existent_file = os.path.join(file_sizes_test_dir, "no_such_file.txt")
+        sizes_with_error = parallel_get_file_sizes([f1_path, non_existent_file],
+                                                   max_workers=1)
+        expected_with_error = {
+            f1_path.replace('\\', '/'): len(content1),
+            non_existent_file.replace('\\', '/'): None
+        }
+        assert sizes_with_error == expected_with_error
+
+
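The test above pins down the contract of the size helpers: get_file_sizes returns relative, forward-slash-normalized keys; parallel_get_file_sizes accepts either a folder or a file list; and a file that can't be read maps to None rather than raising. A brief sketch under those assumptions (the folder path is illustrative, and the import path is an assumption):

    from megadetector.utils.path_utils import get_file_sizes, parallel_get_file_sizes

    # Relative paths (forward slashes) mapped to sizes in bytes
    sizes = get_file_sizes('/data/images')

    # The same information computed in parallel; unreadable files map to None
    sizes_parallel = parallel_get_file_sizes('/data/images',
                                             max_workers=8,
                                             return_relative_paths=True)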
+    def test_zip_file_and_unzip_file(self):
+        """
+        Test zip_file and unzip_file functions.
+        """
+
+        file_to_zip_name = "test_zip_me.txt"
+        file_to_zip_path = os.path.join(self.test_dir, file_to_zip_name)
+        content = "This is the content to be zipped."
+        with open(file_to_zip_path, 'w') as f:
+            f.write(content)
+
+        default_zip_output_path = file_to_zip_path + ".zip"
+        returned_zip_path = zip_file(file_to_zip_path)
+        assert returned_zip_path == default_zip_output_path
+        assert os.path.exists(default_zip_output_path)
+
+        unzip_dir_default = os.path.join(self.test_dir, "unzip_default")
+        os.makedirs(unzip_dir_default, exist_ok=True)
+        unzip_file(default_zip_output_path, unzip_dir_default)
+        unzipped_file_path_default = os.path.join(unzip_dir_default, file_to_zip_name)
+        assert os.path.exists(unzipped_file_path_default)
+        with open(unzipped_file_path_default, 'r') as f:
+            assert f.read() == content
+
+        custom_zip_output_name = "custom_archive.zip"
+        custom_zip_output_path = os.path.join(self.test_dir, custom_zip_output_name)
+        zip_file(file_to_zip_path, output_fn=custom_zip_output_path, overwrite=True)
+        assert os.path.exists(custom_zip_output_path)
+
+        zip_in_subdir_path = os.path.join(self.test_dir, "subdir_zip", "my.zip")
+        file_in_subdir_name = "file_for_subdir_zip.txt"
+        file_in_subdir_path = os.path.join(self.test_dir, "subdir_zip", file_in_subdir_name)
+        os.makedirs(os.path.dirname(zip_in_subdir_path), exist_ok=True)
+        with open(file_in_subdir_path, "w") as f:
+            f.write("sub dir content")
+        zip_file(file_in_subdir_path, output_fn=zip_in_subdir_path)
+
+        # With output_folder=None, unzip_file extracts next to the zipfile
+        unzip_file(zip_in_subdir_path, output_folder=None)
+        unzipped_in_same_dir_path = os.path.join(os.path.dirname(zip_in_subdir_path),
+                                                 file_in_subdir_name)
+        assert os.path.exists(unzipped_in_same_dir_path)
+        with open(unzipped_in_same_dir_path, 'r') as f:
+            assert f.read() == "sub dir content"
+
+
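zip_file and unzip_file form a symmetric pair: zip_file defaults to writing <input>.zip and returns the archive path, and unzip_file with output_folder=None extracts alongside the archive. A compact sketch of that default flow (filenames illustrative, import path assumed):

    from megadetector.utils.path_utils import zip_file, unzip_file

    archive = zip_file('results.json')   # writes results.json.zip by default
    unzip_file(archive, 'restored')      # extracts results.json into restored/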
+    def test_zip_folder(self):
+        """
+        Test the zip_folder function.
+        """
+
+        folder_to_zip = os.path.join(self.test_dir, "folder_to_zip")
+        os.makedirs(folder_to_zip, exist_ok=True)
+
+        file1_name = "file1.txt"
+        path1 = os.path.join(folder_to_zip, file1_name)
+        file2_name = "file2.log"
+        path2 = os.path.join(folder_to_zip, file2_name)
+        subdir_name = "sub"
+        subdir_path = os.path.join(folder_to_zip, subdir_name)
+        os.makedirs(subdir_path, exist_ok=True)
+        file3_name = "file3.dat"
+        path3 = os.path.join(subdir_path, file3_name)
+
+        content1 = "content1"
+        content2 = "content2"
+        content3 = "content3"
+        with open(path1, 'w') as f:
+            f.write(content1)
+        with open(path2, 'w') as f:
+            f.write(content2)
+        with open(path3, 'w') as f:
+            f.write(content3)
+
+        default_zip_path = folder_to_zip + ".zip"
+        zip_folder(folder_to_zip, output_fn=None, overwrite=True)
+        assert os.path.exists(default_zip_path)
+
+        unzip_output_dir = os.path.join(self.test_dir, "unzipped_folder_content")
+        os.makedirs(unzip_output_dir, exist_ok=True)
+        unzip_file(default_zip_path, unzip_output_dir)
+
+        assert os.path.exists(os.path.join(unzip_output_dir, file1_name))
+        assert os.path.exists(os.path.join(unzip_output_dir, file2_name))
+        assert os.path.exists(os.path.join(unzip_output_dir, subdir_name, file3_name))
+        with open(os.path.join(unzip_output_dir, file1_name), 'r') as f:
+            assert f.read() == content1
+        with open(os.path.join(unzip_output_dir, file2_name), 'r') as f:
+            assert f.read() == content2
+        with open(os.path.join(unzip_output_dir, subdir_name, file3_name), 'r') as f:
+            assert f.read() == content3
+
+        # With overwrite=False, an existing archive should be left untouched
+        mtime_before = os.path.getmtime(default_zip_path)
+        zip_folder(folder_to_zip, output_fn=None, overwrite=False)
+        mtime_after = os.path.getmtime(default_zip_path)
+        assert mtime_before == mtime_after
+
+
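zip_folder archives a directory tree recursively, defaulting the output to <folder>.zip, and leaves an existing archive alone when overwrite=False. A minimal sketch (paths illustrative, import path assumed):

    from megadetector.utils.path_utils import zip_folder, unzip_file

    zip_folder('/data/detections', output_fn=None, overwrite=True)
    # -> /data/detections.zip, preserving the internal folder structure
    unzip_file('/data/detections.zip', '/data/restored')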
+    def test_zip_files_into_single_zipfile(self):
+        """
+        Test zip_files_into_single_zipfile.
+        """
+
+        file1_path = os.path.join(self.test_dir, "zfs_file1.txt")
+        content1 = "content for zfs1"
+        with open(file1_path, 'w') as f:
+            f.write(content1)
+
+        subdir_for_zfs = os.path.join(self.test_dir, "zfs_subdir")
+        os.makedirs(subdir_for_zfs, exist_ok=True)
+        file2_path = os.path.join(subdir_for_zfs, "zfs_file2.log")
+        content2 = "content for zfs2"
+        with open(file2_path, 'w') as f:
+            f.write(content2)
+
+        input_files = [file1_path, file2_path]
+        output_zip_path = os.path.join(self.test_dir, "multi_file_archive.zip")
+        zip_files_into_single_zipfile(input_files, output_zip_path,
+                                      arc_name_base=self.test_dir, overwrite=True)
+        assert os.path.exists(output_zip_path)
+
+        unzip_dir = os.path.join(self.test_dir, "unzip_multi_file")
+        os.makedirs(unzip_dir, exist_ok=True)
+        unzip_file(output_zip_path, unzip_dir)
+
+        # Archive names are relative to arc_name_base, so the subfolder survives
+        expected_unzipped_file1 = os.path.join(unzip_dir, os.path.relpath(file1_path, self.test_dir))
+        expected_unzipped_file2 = os.path.join(unzip_dir, os.path.relpath(file2_path, self.test_dir))
+
+        assert os.path.exists(expected_unzipped_file1)
+        with open(expected_unzipped_file1, 'r') as f:
+            assert f.read() == content1
+        assert os.path.exists(expected_unzipped_file2)
+        assert os.path.basename(expected_unzipped_file2) == "zfs_file2.log"
+        assert os.path.basename(os.path.dirname(expected_unzipped_file2)) == "zfs_subdir"
+        with open(expected_unzipped_file2, 'r') as f:
+            assert f.read() == content2
+
+
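zip_files_into_single_zipfile gathers an arbitrary list of files into one archive, with arc_name_base controlling the prefix stripped from each archived name. A sketch under those assumptions (paths illustrative, import path assumed):

    from megadetector.utils.path_utils import zip_files_into_single_zipfile

    files = ['/data/run1/results.json', '/data/run1/logs/run.log']
    zip_files_into_single_zipfile(files, '/data/run1_bundle.zip',
                                  arc_name_base='/data/run1', overwrite=True)
    # Archive members: results.json, logs/run.log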
+    def test_add_files_to_single_tar_file(self):
+        """
+        Test add_files_to_single_tar_file.
+        """
+
+        file1_path = os.path.join(self.test_dir, "tar_file1.txt")
+        content1 = "content for tar1"
+        with open(file1_path, 'w') as f:
+            f.write(content1)
+
+        subdir_for_tar = os.path.join(self.test_dir, "tar_subdir")
+        os.makedirs(subdir_for_tar, exist_ok=True)
+        file2_path = os.path.join(subdir_for_tar, "tar_file2.log")
+        content2 = "content for tar2"
+        with open(file2_path, 'w') as f:
+            f.write(content2)
+
+        input_files = [file1_path, file2_path]
+        output_tar_path = os.path.join(self.test_dir, "archive.tar.gz")
+
+        add_files_to_single_tar_file(input_files, output_tar_path, arc_name_base=self.test_dir,
+                                     overwrite=True, mode='x:gz')
+        assert os.path.exists(output_tar_path)
+
+        un_tar_dir = os.path.join(self.test_dir, "un_tar_contents")
+        os.makedirs(un_tar_dir, exist_ok=True)
+        with tarfile.open(output_tar_path, 'r:gz') as tf:
+            tf.extractall(path=un_tar_dir)
+
+        expected_untarred_file1 = os.path.join(un_tar_dir, os.path.relpath(file1_path, self.test_dir))
+        expected_untarred_file2 = os.path.join(un_tar_dir, os.path.relpath(file2_path, self.test_dir))
+
+        assert os.path.exists(expected_untarred_file1)
+        with open(expected_untarred_file1, 'r') as f:
+            assert f.read() == content1
+        assert os.path.exists(expected_untarred_file2)
+        with open(expected_untarred_file2, 'r') as f:
+            assert f.read() == content2
+
+
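The tar counterpart mirrors the zip bundling, with a standard tarfile mode string selecting compression, and the result readable with ordinary tarfile tooling. A sketch under the same assumptions (paths illustrative, import path assumed):

    import tarfile
    from megadetector.utils.path_utils import add_files_to_single_tar_file

    files = ['/data/run1/results.json', '/data/run1/logs/run.log']
    add_files_to_single_tar_file(files, '/data/run1_bundle.tar.gz',
                                 arc_name_base='/data/run1',
                                 overwrite=True, mode='x:gz')

    # Member names are relative to arc_name_base, e.g. results.json, logs/run.log
    with tarfile.open('/data/run1_bundle.tar.gz', 'r:gz') as tf:
        print(tf.getnames())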
+    def test_parallel_zip_individual_files_and_folders(self):
+        """
+        Test parallel_zip_files, parallel_zip_folders, and zip_each_file_in_folder.
+        """
+
+        file1_to_zip = os.path.join(self.test_dir, "pz_file1.txt")
+        file2_to_zip = os.path.join(self.test_dir, "pz_file2.txt")
+        with open(file1_to_zip, 'w') as f:
+            f.write("pz_content1")
+        with open(file2_to_zip, 'w') as f:
+            f.write("pz_content2")
+
+        parallel_zip_files([file1_to_zip, file2_to_zip], max_workers=1, overwrite=True)
+        assert os.path.exists(file1_to_zip + ".zip")
+        assert os.path.exists(file2_to_zip + ".zip")
+        unzip_dir_pz = os.path.join(self.test_dir, "unzip_pz")
+        unzip_file(file1_to_zip + ".zip", unzip_dir_pz)
+        assert os.path.exists(os.path.join(unzip_dir_pz, os.path.basename(file1_to_zip)))
+
+        folder1_to_zip = os.path.join(self.test_dir, "pz_folder1")
+        os.makedirs(folder1_to_zip, exist_ok=True)
+        with open(os.path.join(folder1_to_zip, "pf1.txt"), 'w') as f:
+            f.write("pf1_content")
+        folder2_to_zip = os.path.join(self.test_dir, "pz_folder2")
+        os.makedirs(folder2_to_zip, exist_ok=True)
+        with open(os.path.join(folder2_to_zip, "pf2.txt"), 'w') as f:
+            f.write("pf2_content")
+
+        parallel_zip_folders([folder1_to_zip, folder2_to_zip], max_workers=1, overwrite=True)
+        assert os.path.exists(folder1_to_zip + ".zip")
+        assert os.path.exists(folder2_to_zip + ".zip")
+        unzip_dir_pzf = os.path.join(self.test_dir, "unzip_pzf")
+        unzip_file(folder1_to_zip + ".zip", unzip_dir_pzf)
+        assert os.path.exists(os.path.join(unzip_dir_pzf, "pf1.txt"))
+
+        zef_folder = os.path.join(self.test_dir, "zef_test_folder")
+        os.makedirs(zef_folder, exist_ok=True)
+        zef_file1 = os.path.join(zef_folder, "zef1.txt")
+        zef_file2_png = os.path.join(zef_folder, "zef2.png")
+        zef_file3_zip = os.path.join(zef_folder, "zef3.zip")
+        zef_subdir = os.path.join(zef_folder, "zef_sub")
+        os.makedirs(zef_subdir, exist_ok=True)
+        zef_file_in_sub = os.path.join(zef_subdir, "zef_subfile.txt")
+
+        for p_path in [zef_file1, zef_file2_png, zef_file3_zip, zef_file_in_sub]:
+            with open(p_path, 'w') as f:
+                f.write(f"content of {os.path.basename(p_path)}")
+
+        # Non-recursive: zips files in the top-level folder, skipping .zip files
+        zip_each_file_in_folder(zef_folder, recursive=False, max_workers=1, overwrite=True)
+        assert os.path.exists(zef_file1 + ".zip")
+        assert os.path.exists(zef_file2_png + ".zip")
+        assert not os.path.exists(zef_file3_zip + ".zip")
+        assert not os.path.exists(zef_file_in_sub + ".zip")
+
+        if os.path.exists(zef_file1 + ".zip"):
+            os.remove(zef_file1 + ".zip")
+        if os.path.exists(zef_file2_png + ".zip"):
+            os.remove(zef_file2_png + ".zip")
+
+        # Recursive: also zips files in subfolders
+        zip_each_file_in_folder(zef_folder, recursive=True, max_workers=1, overwrite=True)
+        assert os.path.exists(zef_file1 + ".zip")
+        assert os.path.exists(zef_file2_png + ".zip")
+        assert not os.path.exists(zef_file3_zip + ".zip")
+        assert os.path.exists(zef_file_in_sub + ".zip")
+
+        if os.path.exists(zef_file1 + ".zip"):
+            os.remove(zef_file1 + ".zip")
+        if os.path.exists(zef_file2_png + ".zip"):
+            os.remove(zef_file2_png + ".zip")
+        if os.path.exists(zef_file_in_sub + ".zip"):
+            os.remove(zef_file_in_sub + ".zip")
+
+        # required_token restricts zipping to matching filenames
+        zip_each_file_in_folder(zef_folder, recursive=True, required_token="zef1", max_workers=1, overwrite=True)
+        assert os.path.exists(zef_file1 + ".zip")
+        assert not os.path.exists(zef_file2_png + ".zip")
+        assert not os.path.exists(zef_file_in_sub + ".zip")
+
+        if os.path.exists(zef_file1 + ".zip"):
+            os.remove(zef_file1 + ".zip")
+        # exclude_zip=False also zips .zip files
+        dummy_to_zip = os.path.join(zef_folder, "dummy.txt")
+        with open(dummy_to_zip, 'w') as f:
+            f.write('d')
+        zip_each_file_in_folder(zef_folder, recursive=False, exclude_zip=False, max_workers=1, overwrite=True)
+        assert os.path.exists(dummy_to_zip + ".zip")
+        assert os.path.exists(zef_file3_zip + ".zip")
+        if os.path.exists(dummy_to_zip + ".zip"):
+            os.remove(dummy_to_zip + ".zip")
+        if os.path.exists(zef_file3_zip + ".zip"):
+            os.remove(zef_file3_zip + ".zip")
+
+
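zip_each_file_in_folder produces one .zip per file, with recursive, required_token, and exclude_zip controlling which files qualify; the test suggests required_token is a simple substring filter and that .zip files are skipped by default. A sketch under those assumptions (folder path illustrative, import path assumed):

    from megadetector.utils.path_utils import zip_each_file_in_folder

    # Zip every .json results file under /data/runs, one archive per file,
    # leaving existing .zip files alone (the apparent default behavior).
    zip_each_file_in_folder('/data/runs',
                            recursive=True,
                            required_token='.json',
                            max_workers=4,
                            overwrite=True)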
+    def test_compute_file_hash(self):
+        """
+        Test compute_file_hash and parallel_compute_file_hashes.
+        """
+
+        file1_name = "hash_me1.txt"
+        file1_path = os.path.join(self.test_dir, file1_name)
+        content1 = "This is a test string for hashing."
+        with open(file1_path, 'w') as f:
+            f.write(content1)
+
+        file2_name = "hash_me2.txt"
+        file2_path = os.path.join(self.test_dir, file2_name)
+        with open(file2_path, 'w') as f:
+            f.write(content1)
+
+        file3_name = "hash_me3.txt"
+        file3_path = os.path.join(self.test_dir, file3_name)
+        content3 = "This is a different test string for hashing."
+        with open(file3_path, 'w') as f:
+            f.write(content3)
+
+        expected_hash_content1_sha256 = \
+            "c56f19d76df6a09e49fe0d9ce7b1bc7f1dbd582f668742bede65c54c47d5bcf4".lower()
+        expected_hash_content3_sha256 = \
+            "23013ff7e93264317f7b2fc0e9a217649f2dc0b11ca7e0bd49632424b70b6680".lower()
+
+        # Identical content yields identical hashes, regardless of filename
+        hash1 = compute_file_hash(file1_path)
+        hash2 = compute_file_hash(file2_path)
+        hash3 = compute_file_hash(file3_path)
+        assert hash1 == expected_hash_content1_sha256
+        assert hash2 == expected_hash_content1_sha256
+        assert hash1 != hash3
+        assert hash3 == expected_hash_content3_sha256
+
+        expected_hash_content1_md5 = "94b971f1f8cdb23c2af82af73160d4b0".lower()
+        hash1_md5 = compute_file_hash(file1_path, algorithm='md5')
+        assert hash1_md5 == expected_hash_content1_md5
+
+        non_existent_path = os.path.join(self.test_dir, "no_such_file.txt")
+        assert compute_file_hash(non_existent_path, allow_failures=True) is None
+        try:
+            compute_file_hash(non_existent_path, allow_failures=False)
+            raise AssertionError("FileNotFoundError not raised for compute_file_hash")
+        except FileNotFoundError:
+            pass
+
+        files_to_hash = [file1_path, file3_path, non_existent_path]
+        hashes_parallel = parallel_compute_file_hashes(files_to_hash, max_workers=1)
+
+        norm_f1 = file1_path.replace('\\', '/')
+        norm_f3 = file3_path.replace('\\', '/')
+        norm_non = non_existent_path.replace('\\', '/')
+
+        expected_parallel_hashes = {
+            norm_f1: expected_hash_content1_sha256,
+            norm_f3: expected_hash_content3_sha256,
+            norm_non: None
+        }
+        hashes_parallel_norm = {k.replace('\\', '/'): v for k, v in hashes_parallel.items()}
+        assert hashes_parallel_norm == expected_parallel_hashes
+
+        hash_folder = os.path.join(self.test_dir, "hash_test_folder")
+        os.makedirs(hash_folder, exist_ok=True)
+        h_f1_name = "h_f1.txt"
+        h_f1_path = os.path.join(hash_folder, h_f1_name)
+        h_f2_name = "h_f2.txt"
+        h_f2_path = os.path.join(hash_folder, h_f2_name)
+        with open(h_f1_path, 'w') as f:
+            f.write(content1)
+        with open(h_f2_path, 'w') as f:
+            f.write(content3)
+
+        hashes_folder_parallel = parallel_compute_file_hashes(hash_folder, recursive=False, max_workers=1)
+        norm_hf1 = h_f1_path.replace('\\', '/')
+        norm_hf2 = h_f2_path.replace('\\', '/')
+        expected_folder_hashes = {
+            norm_hf1: expected_hash_content1_sha256,
+            norm_hf2: expected_hash_content3_sha256
+        }
+        hashes_folder_parallel_norm = {k.replace('\\', '/'): v for k, v in hashes_folder_parallel.items()}
+        assert hashes_folder_parallel_norm == expected_folder_hashes
+
+
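compute_file_hash defaults to SHA-256, with an algorithm parameter selecting other hashlib digests, and the parallel variant accepts either a file list or a folder, mapping unreadable files to None. A sketch of content-based duplicate detection built on that contract (paths illustrative, import path assumed):

    from collections import defaultdict
    from megadetector.utils.path_utils import parallel_compute_file_hashes

    hashes = parallel_compute_file_hashes('/data/images', recursive=True, max_workers=8)

    # Group filenames by digest; any group with more than one member is a
    # set of byte-identical files. Skip files that failed to hash (None).
    by_digest = defaultdict(list)
    for fn, digest in hashes.items():
        if digest is not None:
            by_digest[digest].append(fn)
    duplicates = {d: fns for d, fns in by_digest.items() if len(fns) > 1}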
+def test_path_utils():
+    """
+    Runs all tests in the TestPathUtils class.
+    """
+
+    test_instance = TestPathUtils()
+    test_instance.set_up()
+    try:
+        test_instance.test_is_image_file()
+        test_instance.test_find_image_strings()
+        test_instance.test_find_images()
+        test_instance.test_recursive_file_list_and_file_list()
+        test_instance.test_folder_list()
+        test_instance.test_folder_summary()
+        test_instance.test_fileparts()
+        test_instance.test_insert_before_extension()
+        test_instance.test_split_path()
+        test_instance.test_path_is_abs()
+        test_instance.test_safe_create_link_unix()
+        test_instance.test_remove_empty_folders()
+        test_instance.test_path_join()
+        test_instance.test_filename_cleaning()
+        test_instance.test_is_executable()
+        test_instance.test_write_read_list_to_file()
+        test_instance.test_parallel_copy_files()
+        test_instance.test_get_file_sizes()
+        test_instance.test_zip_file_and_unzip_file()
+        test_instance.test_zip_folder()
+        test_instance.test_zip_files_into_single_zipfile()
+        test_instance.test_add_files_to_single_tar_file()
+        test_instance.test_parallel_zip_individual_files_and_folders()
+        test_instance.test_compute_file_hash()
+    finally:
+        test_instance.tear_down()
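The try/finally wrapper guarantees tear_down runs even when an assertion fails mid-sequence, so the temporary test directory is always cleaned up. Because these are plain assert-based tests, the whole suite can be driven without a test runner; a minimal invocation sketch (the module path is an assumption, and pytest would also collect test_path_utils() on its own):

    from megadetector.utils.path_utils import test_path_utils
    test_path_utils()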