megadetector 5.0.7__py3-none-any.whl → 5.0.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- api/__init__.py +0 -0
- api/batch_processing/__init__.py +0 -0
- api/batch_processing/api_core/__init__.py +0 -0
- api/batch_processing/api_core/batch_service/__init__.py +0 -0
- api/batch_processing/api_core/batch_service/score.py +0 -1
- api/batch_processing/api_core/server_job_status_table.py +0 -1
- api/batch_processing/api_core_support/__init__.py +0 -0
- api/batch_processing/api_core_support/aggregate_results_manually.py +0 -1
- api/batch_processing/api_support/__init__.py +0 -0
- api/batch_processing/api_support/summarize_daily_activity.py +0 -1
- api/batch_processing/data_preparation/__init__.py +0 -0
- api/batch_processing/data_preparation/manage_local_batch.py +93 -79
- api/batch_processing/data_preparation/manage_video_batch.py +8 -8
- api/batch_processing/integration/digiKam/xmp_integration.py +0 -1
- api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +0 -1
- api/batch_processing/postprocessing/__init__.py +0 -0
- api/batch_processing/postprocessing/add_max_conf.py +12 -12
- api/batch_processing/postprocessing/categorize_detections_by_size.py +32 -14
- api/batch_processing/postprocessing/combine_api_outputs.py +69 -55
- api/batch_processing/postprocessing/compare_batch_results.py +114 -44
- api/batch_processing/postprocessing/convert_output_format.py +62 -19
- api/batch_processing/postprocessing/load_api_results.py +17 -20
- api/batch_processing/postprocessing/md_to_coco.py +31 -21
- api/batch_processing/postprocessing/md_to_labelme.py +165 -68
- api/batch_processing/postprocessing/merge_detections.py +40 -15
- api/batch_processing/postprocessing/postprocess_batch_results.py +270 -186
- api/batch_processing/postprocessing/remap_detection_categories.py +170 -0
- api/batch_processing/postprocessing/render_detection_confusion_matrix.py +75 -39
- api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +53 -44
- api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +25 -14
- api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +244 -160
- api/batch_processing/postprocessing/separate_detections_into_folders.py +159 -114
- api/batch_processing/postprocessing/subset_json_detector_output.py +146 -169
- api/batch_processing/postprocessing/top_folders_to_bottom.py +77 -43
- api/synchronous/__init__.py +0 -0
- api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
- api/synchronous/api_core/animal_detection_api/api_backend.py +0 -2
- api/synchronous/api_core/animal_detection_api/api_frontend.py +266 -268
- api/synchronous/api_core/animal_detection_api/config.py +35 -35
- api/synchronous/api_core/tests/__init__.py +0 -0
- api/synchronous/api_core/tests/load_test.py +109 -109
- classification/__init__.py +0 -0
- classification/aggregate_classifier_probs.py +21 -24
- classification/analyze_failed_images.py +11 -13
- classification/cache_batchapi_outputs.py +51 -51
- classification/create_classification_dataset.py +69 -68
- classification/crop_detections.py +54 -53
- classification/csv_to_json.py +97 -100
- classification/detect_and_crop.py +105 -105
- classification/evaluate_model.py +43 -42
- classification/identify_mislabeled_candidates.py +47 -46
- classification/json_to_azcopy_list.py +10 -10
- classification/json_validator.py +72 -71
- classification/map_classification_categories.py +44 -43
- classification/merge_classification_detection_output.py +68 -68
- classification/prepare_classification_script.py +157 -154
- classification/prepare_classification_script_mc.py +228 -228
- classification/run_classifier.py +27 -26
- classification/save_mislabeled.py +30 -30
- classification/train_classifier.py +20 -20
- classification/train_classifier_tf.py +21 -22
- classification/train_utils.py +10 -10
- data_management/__init__.py +0 -0
- data_management/annotations/__init__.py +0 -0
- data_management/annotations/annotation_constants.py +18 -31
- data_management/camtrap_dp_to_coco.py +238 -0
- data_management/cct_json_utils.py +107 -59
- data_management/cct_to_md.py +176 -158
- data_management/cct_to_wi.py +247 -219
- data_management/coco_to_labelme.py +272 -0
- data_management/coco_to_yolo.py +86 -62
- data_management/databases/__init__.py +0 -0
- data_management/databases/add_width_and_height_to_db.py +20 -16
- data_management/databases/combine_coco_camera_traps_files.py +35 -31
- data_management/databases/integrity_check_json_db.py +130 -83
- data_management/databases/subset_json_db.py +25 -16
- data_management/generate_crops_from_cct.py +27 -45
- data_management/get_image_sizes.py +188 -144
- data_management/importers/add_nacti_sizes.py +8 -8
- data_management/importers/add_timestamps_to_icct.py +78 -78
- data_management/importers/animl_results_to_md_results.py +158 -160
- data_management/importers/auckland_doc_test_to_json.py +9 -9
- data_management/importers/auckland_doc_to_json.py +8 -8
- data_management/importers/awc_to_json.py +7 -7
- data_management/importers/bellevue_to_json.py +15 -15
- data_management/importers/cacophony-thermal-importer.py +13 -13
- data_management/importers/carrizo_shrubfree_2018.py +8 -8
- data_management/importers/carrizo_trail_cam_2017.py +8 -8
- data_management/importers/cct_field_adjustments.py +9 -9
- data_management/importers/channel_islands_to_cct.py +10 -10
- data_management/importers/eMammal/copy_and_unzip_emammal.py +1 -0
- data_management/importers/ena24_to_json.py +7 -7
- data_management/importers/filenames_to_json.py +8 -8
- data_management/importers/helena_to_cct.py +7 -7
- data_management/importers/idaho-camera-traps.py +7 -7
- data_management/importers/idfg_iwildcam_lila_prep.py +10 -10
- data_management/importers/jb_csv_to_json.py +9 -9
- data_management/importers/mcgill_to_json.py +8 -8
- data_management/importers/missouri_to_json.py +18 -18
- data_management/importers/nacti_fieldname_adjustments.py +10 -10
- data_management/importers/noaa_seals_2019.py +8 -8
- data_management/importers/pc_to_json.py +7 -7
- data_management/importers/plot_wni_giraffes.py +7 -7
- data_management/importers/prepare-noaa-fish-data-for-lila.py +359 -359
- data_management/importers/prepare_zsl_imerit.py +7 -7
- data_management/importers/rspb_to_json.py +8 -8
- data_management/importers/save_the_elephants_survey_A.py +8 -8
- data_management/importers/save_the_elephants_survey_B.py +9 -9
- data_management/importers/snapshot_safari_importer.py +26 -26
- data_management/importers/snapshot_safari_importer_reprise.py +665 -665
- data_management/importers/snapshot_serengeti_lila.py +14 -14
- data_management/importers/sulross_get_exif.py +8 -9
- data_management/importers/timelapse_csv_set_to_json.py +11 -11
- data_management/importers/ubc_to_json.py +13 -13
- data_management/importers/umn_to_json.py +7 -7
- data_management/importers/wellington_to_json.py +8 -8
- data_management/importers/wi_to_json.py +9 -9
- data_management/importers/zamba_results_to_md_results.py +181 -181
- data_management/labelme_to_coco.py +309 -159
- data_management/labelme_to_yolo.py +103 -60
- data_management/lila/__init__.py +0 -0
- data_management/lila/add_locations_to_island_camera_traps.py +9 -9
- data_management/lila/add_locations_to_nacti.py +147 -147
- data_management/lila/create_lila_blank_set.py +114 -31
- data_management/lila/create_lila_test_set.py +8 -8
- data_management/lila/create_links_to_md_results_files.py +106 -106
- data_management/lila/download_lila_subset.py +92 -90
- data_management/lila/generate_lila_per_image_labels.py +56 -43
- data_management/lila/get_lila_annotation_counts.py +18 -15
- data_management/lila/get_lila_image_counts.py +11 -11
- data_management/lila/lila_common.py +103 -70
- data_management/lila/test_lila_metadata_urls.py +132 -116
- data_management/ocr_tools.py +173 -128
- data_management/read_exif.py +161 -99
- data_management/remap_coco_categories.py +84 -0
- data_management/remove_exif.py +58 -62
- data_management/resize_coco_dataset.py +32 -44
- data_management/wi_download_csv_to_coco.py +246 -0
- data_management/yolo_output_to_md_output.py +86 -73
- data_management/yolo_to_coco.py +535 -95
- detection/__init__.py +0 -0
- detection/detector_training/__init__.py +0 -0
- detection/process_video.py +85 -33
- detection/pytorch_detector.py +43 -25
- detection/run_detector.py +157 -72
- detection/run_detector_batch.py +189 -114
- detection/run_inference_with_yolov5_val.py +118 -51
- detection/run_tiled_inference.py +113 -42
- detection/tf_detector.py +51 -28
- detection/video_utils.py +606 -521
- docs/source/conf.py +43 -0
- md_utils/__init__.py +0 -0
- md_utils/azure_utils.py +9 -9
- md_utils/ct_utils.py +249 -70
- md_utils/directory_listing.py +59 -64
- md_utils/md_tests.py +968 -862
- md_utils/path_utils.py +655 -155
- md_utils/process_utils.py +157 -133
- md_utils/sas_blob_utils.py +20 -20
- md_utils/split_locations_into_train_val.py +45 -32
- md_utils/string_utils.py +33 -10
- md_utils/url_utils.py +208 -27
- md_utils/write_html_image_list.py +51 -35
- md_visualization/__init__.py +0 -0
- md_visualization/plot_utils.py +102 -109
- md_visualization/render_images_with_thumbnails.py +34 -34
- md_visualization/visualization_utils.py +908 -311
- md_visualization/visualize_db.py +109 -58
- md_visualization/visualize_detector_output.py +61 -42
- {megadetector-5.0.7.dist-info → megadetector-5.0.9.dist-info}/METADATA +21 -17
- megadetector-5.0.9.dist-info/RECORD +224 -0
- {megadetector-5.0.7.dist-info → megadetector-5.0.9.dist-info}/WHEEL +1 -1
- {megadetector-5.0.7.dist-info → megadetector-5.0.9.dist-info}/top_level.txt +1 -0
- taxonomy_mapping/__init__.py +0 -0
- taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +342 -335
- taxonomy_mapping/map_new_lila_datasets.py +154 -154
- taxonomy_mapping/prepare_lila_taxonomy_release.py +142 -134
- taxonomy_mapping/preview_lila_taxonomy.py +591 -591
- taxonomy_mapping/retrieve_sample_image.py +12 -12
- taxonomy_mapping/simple_image_download.py +11 -11
- taxonomy_mapping/species_lookup.py +10 -10
- taxonomy_mapping/taxonomy_csv_checker.py +18 -18
- taxonomy_mapping/taxonomy_graph.py +47 -47
- taxonomy_mapping/validate_lila_category_mappings.py +83 -76
- data_management/cct_json_to_filename_json.py +0 -89
- data_management/cct_to_csv.py +0 -140
- data_management/databases/remove_corrupted_images_from_db.py +0 -191
- detection/detector_training/copy_checkpoints.py +0 -43
- md_visualization/visualize_megadb.py +0 -183
- megadetector-5.0.7.dist-info/RECORD +0 -202
- {megadetector-5.0.7.dist-info → megadetector-5.0.9.dist-info}/LICENSE +0 -0
md_utils/path_utils.py
CHANGED
@@ -1,30 +1,37 @@
-
-
-
-
-
-
-
-
+"""
+
+path_utils.py
+
+Miscellaneous useful utils for path manipulation, i.e. things that could *almost*
+be in os.path, but aren't.
+
+"""

 #%% Imports and constants

 import glob
 import ntpath
 import os
-import
+import sys
+import platform
 import string
 import json
+import shutil
 import unicodedata
 import zipfile
+import tarfile
+import webbrowser
+import subprocess
+import re

 from zipfile import ZipFile
 from datetime import datetime
-from typing import Container, Iterable, List, Optional, Tuple, Sequence
 from multiprocessing.pool import Pool, ThreadPool
 from functools import partial
+from shutil import which
 from tqdm import tqdm

+# Should all be lower-case
 IMG_EXTENSIONS = ('.jpg', '.jpeg', '.gif', '.png', '.tif', '.tiff', '.bmp')

 VALID_FILENAME_CHARS = f"~-_.() {string.ascii_letters}{string.digits}"
@@ -35,14 +42,31 @@ CHAR_LIMIT = 255

 #%% General path functions

-def recursive_file_list(base_dir,
-
+def recursive_file_list(base_dir,
+                        convert_slashes=True,
+                        return_relative_paths=False,
+                        sort_files=True,
                         recursive=True):
     r"""
-
-
+    Enumerates files (not directories) in [base_dir], optionally converting
+    backslahes to slashes
+
+    Args:
+        base_dir (str): folder to enumerate
+        convert_slashes (bool, optional): force forward slashes; if this is False, will use
+            the native path separator
+        return_relative_paths (bool, optional): return paths that are relative to [base_dir],
+            rather than absolute paths
+        sort_files (bool, optional): force files to be sorted, otherwise uses the sorting
+            provided by os.walk()
+        recursive (bool, optional): enumerate recursively
+
+    Returns:
+        list: list of filenames
     """

+    assert os.path.isdir(base_dir), '{} is not a folder'.format(base_dir)
+
     all_files = []

     if recursive:
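As a hedged illustration (not part of the diff itself), the new enumeration keywords shown in this hunk might be used as follows; the md_utils.path_utils import path is assumed from the wheel's top-level layout shown above, and the folder name is illustrative.

from md_utils.path_utils import recursive_file_list

# Sorted, forward-slashed paths relative to the base folder
rel_files = recursive_file_list('/data/camera-traps',
                                convert_slashes=True,
                                return_relative_paths=True,
                                sort_files=True,
                                recursive=True)
print('Found {} files'.format(len(rel_files)))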
@@ -71,61 +95,51 @@ def file_list(base_dir, convert_slashes=True, return_relative_paths=False, sort_
                recursive=False):
     """
     Trivial wrapper for recursive_file_list, which was a poor function name choice at the time,
-    it doesn't really make sense to have a "recursive" option in a function called
+    since it doesn't really make sense to have a "recursive" option in a function called
+    "recursive_file_list".
+
+    Args:
+        base_dir (str): folder to enumerate
+        convert_slashes (bool, optional): force forward slashes; if this is False, will use
+            the native path separator
+        return_relative_paths (bool, optional): return paths that are relative to [base_dir],
+            rather than absolute paths
+        sort_files (bool, optional): force files to be sorted, otherwise uses the sorting
+            provided by os.walk()
+        recursive (bool, optional): enumerate recursively
+
+    Returns:
+        list: list of filenames
     """

     return recursive_file_list(base_dir,convert_slashes,return_relative_paths,sort_files,
                                recursive=recursive)


-def
-    r"""
-    Splits [path] into all its constituent tokens.
-
-    Non-recursive version of:
-    http://nicks-liquid-soapbox.blogspot.com/2011/03/splitting-path-to-list-in-python.html
-
-    Examples
-    >>> split_path(r'c:\dir\subdir\file.txt')
-    ['c:\\', 'dir', 'subdir', 'file.txt']
-    >>> split_path('/dir/subdir/file.jpg')
-    ['/', 'dir', 'subdir', 'file.jpg']
-    >>> split_path('c:\\')
-    ['c:\\']
-    >>> split_path('/')
-    ['/']
-    """
-
-    parts = []
-    while True:
-        # ntpath seems to do the right thing for both Windows and Unix paths
-        head, tail = ntpath.split(path)
-        if head == '' or head == path:
-            break
-        parts.append(tail)
-        path = head
-    parts.append(head or tail)
-    return parts[::-1]  # reverse
-
-
-def fileparts(path: str) -> Tuple[str, str, str]:
+def fileparts(path):
     r"""
     Breaks down a path into the directory path, filename, and extension.

     Note that the '.' lives with the extension, and separators are removed.

-    Examples
-
-
-
-
-
-
+    Examples:
+
+    .. code-block:: none
+
+        >>> fileparts('file')
+        ('', 'file', '')
+        >>> fileparts(r'c:/dir/file.jpg')
+        ('c:/dir', 'file', '.jpg')
+        >>> fileparts('/dir/subdir/file.jpg')
+        ('/dir/subdir', 'file', '.jpg')

+    Args:
+        path (str): path name to separate into parts
     Returns:
-
-
-
+        tuple: tuple containing (p,n,e):
+            - p: str, directory path
+            - n: str, filename without extension
+            - e: str, extension including the '.'
     """

     # ntpath seems to do the right thing for both Windows and Unix paths
@@ -135,79 +149,168 @@ def fileparts(path: str) -> Tuple[str, str, str]:
     return p, n, e


-def insert_before_extension(filename
+def insert_before_extension(filename, s=None, separator='.'):
     """
     Insert string [s] before the extension in [filename], separated with [separator].

     If [s] is empty, generates a date/timestamp. If [filename] has no extension,
     appends [s].

-    Examples
-
-
-
-
-
-
+    Examples:
+
+    .. code-block:: none
+
+        >>> insert_before_extension('/dir/subdir/file.ext', 'insert')
+        '/dir/subdir/file.insert.ext'
+        >>> insert_before_extension('/dir/subdir/file', 'insert')
+        '/dir/subdir/file.insert'
+        >>> insert_before_extension('/dir/subdir/file')
+        '/dir/subdir/file.2020.07.20.10.54.38'
+
+    Args:
+        filename (str): filename to manipulate
+        s (str, optional): string to insert before the extension in [filename], or
+            None to insert a datestamp
+        separator (str, optional): separator to place between the filename base
+            and the inserted string
+
+    Returns:
+        str: modified string
     """

     assert len(filename) > 0
-    if len(s) == 0:
+    if s is None or len(s) == 0:
         s = datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
     name, ext = os.path.splitext(filename)
     return f'{name}{separator}{s}{ext}'


-def
+def split_path(path):
+    r"""
+    Splits [path] into all its constituent file/folder tokens.
+
+    Examples:
+
+    .. code-block:: none
+
+        >>> split_path(r'c:\dir\subdir\file.txt')
+        ['c:\\', 'dir', 'subdir', 'file.txt']
+        >>> split_path('/dir/subdir/file.jpg')
+        ['/', 'dir', 'subdir', 'file.jpg']
+        >>> split_path('c:\\')
+        ['c:\\']
+        >>> split_path('/')
+        ['/']
+
+    Args:
+        path (str): path to split into tokens
+
+    Returns:
+        list: list of path tokens
     """
-
+
+    parts = []
+    while True:
+        # ntpath seems to do the right thing for both Windows and Unix paths
+        head, tail = ntpath.split(path)
+        if head == '' or head == path:
+            break
+        parts.append(tail)
+        path = head
+    parts.append(head or tail)
+    return parts[::-1]  # reverse

-    This function behaves differently for Windows vs. Unix paths. Set
-    windows=True if [p] is a Windows path. Set windows=None (default) to treat
-    [p] as a native system path.

-
-
-
+def path_is_abs(p):
+    """
+    Determines whether [p] is an absolute path. An absolute path is defined as
+    one that starts with slash, backslash, or a letter followed by a colon.
+
+    Args:
+        p (str): path to evaluate
+
+    Returns:
+        bool: True if [p] is an absolute path, else False
+    """
+
+    return (len(p) > 1) and (p[0] == '/' or p[1] == ':' or p[0] == '\\')
+

-
-
-
+def top_level_folder(p):
+    r"""
+    Gets the top-level folder from the path *p*.
+
+    On UNIX, this is straightforward:
+
+        /blah/foo
+
+    ...returns '/blah'
+
+    On Windows, we define this as the top-level folder that isn't the drive, so:
+
+        c:\blah\foo
+
+    ...returns 'c:\blah'.
+
+    Args:
+        p (str): filename to evaluate
+
+    Returns:
+        str: the top-level folder in [p], see above for details on how this is defined
     """

     if p == '':
         return ''
-
-
-    if windows is not None:
-        os.path = ntpath if windows else posixpath
-
-    # Path('/blah').parts is ('/', 'blah')
+
+    # Path('/blah').parts is ('/','blah')
     parts = split_path(p)
+
+    if len(parts) == 1:
+        return parts[0]

+    # Handle paths like:
+    #
+    # /, \, /stuff, c:, c:\stuff
     drive = os.path.splitdrive(p)[0]
-    if
-
-        or parts[0] == drive + '/'
-        or parts[0] == drive + '\\'
-        or parts[0] in ['\\', '/']):
-        result = os.path.join(parts[0], parts[1])
+    if parts[0] == drive or parts[0] == drive + '/' or parts[0] == drive + '\\' or parts[0] in ['\\', '/']:
+        return os.path.join(parts[0], parts[1])
     else:
-
+        return parts[0]
+
+# ...top_level_folder()

-    os.path = default_lib  # restore default os.path
-    return result

+#%% Test driver for top_level_folder
+
+if False:
+
+    #%%
+
+    p = 'blah/foo/bar'; s = top_level_folder(p); print(s); assert s == 'blah'
+    p = '/blah/foo/bar'; s = top_level_folder(p); print(s); assert s == '/blah'
+    p = 'bar'; s = top_level_folder(p); print(s); assert s == 'bar'
+    p = ''; s = top_level_folder(p); print(s); assert s == ''
+    p = 'c:\\'; s = top_level_folder(p); print(s); assert s == 'c:\\'
+    p = r'c:\blah'; s = top_level_folder(p); print(s); assert s == 'c:\\blah'
+    p = r'c:\foo'; s = top_level_folder(p); print(s); assert s == 'c:\\foo'
+    p = r'c:/foo'; s = top_level_folder(p); print(s); assert s == 'c:/foo'
+    p = r'c:\foo/bar'; s = top_level_folder(p); print(s); assert s == 'c:\\foo'
+
+    #%%

 def safe_create_link(link_exists,link_new):
     """
-
+    Creates a symlink at [link_new] pointing to [link_exists].

-    If link_new already exists, make sure it's a link (not a file),
-    and if it has a different target than link_exists,
+    If [link_new] already exists, make sure it's a link (not a file),
+    and if it has a different target than [link_exists], removes and re-creates
     it.

-    Errors if link_new already exists but it's not a link.
+    Errors if [link_new] already exists but it's not a link.
+
+    Args:
+        link_exists (str): the source of the (possibly-new) symlink
+        link_new (str): the target of the (possibly-new) symlink
     """

     if os.path.exists(link_new) or os.path.islink(link_new):
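The hunk above adds path_is_abs() and reworks top_level_folder(); a brief sketch of the documented behavior follows (assumed import path, illustrative values, verified against the implementation shown in the hunk).

from md_utils.path_utils import path_is_abs, top_level_folder

assert path_is_abs('/data/images')       # leading slash
assert path_is_abs(r'c:\data\images')    # drive letter followed by a colon
assert not path_is_abs('data/images')    # relative path
assert top_level_folder('/blah/foo') == '/blah'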
@@ -219,58 +322,66 @@ def safe_create_link(link_exists,link_new):
         os.symlink(link_exists,link_new)


-def get_file_sizes(base_dir, convert_slashes=True):
-    """
-    Get sizes recursively for all files in base_dir, returning a dict mapping
-    relative filenames to size.
-    """
-
-    relative_filenames = recursive_file_list(base_dir, convert_slashes=convert_slashes,
-                                             return_relative_paths=True)
-
-    fn_to_size = {}
-    for fn_relative in tqdm(relative_filenames):
-        fn_abs = os.path.join(base_dir,fn_relative)
-        fn_to_size[fn_relative] = os.path.getsize(fn_abs)
-
-    return fn_to_size
-
-
 #%% Image-related path functions

-def is_image_file(s
-    ) -> bool:
+def is_image_file(s, img_extensions=IMG_EXTENSIONS):
     """
     Checks a file's extension against a hard-coded set of image file
-    extensions.
+    extensions. Uses case-insensitive comparison.

     Does not check whether the file exists, only determines whether the filename
     implies it's an image file.
+
+    Args:
+        s (str): filename to evaluate for image-ness
+        img_extensions (list, optional): list of known image file extensions
+
+    Returns:
+        bool: True if [s] appears to be an image file, else False
     """

     ext = os.path.splitext(s)[1]
     return ext.lower() in img_extensions


-def find_image_strings(strings
+def find_image_strings(strings):
     """
     Given a list of strings that are potentially image file names, looks for
     strings that actually look like image file names (based on extension).
+
+    Args:
+        strings (list): list of filenames to check for image-ness
+
+    Returns:
+        list: the subset of [strings] that appear to be image filenames
     """

     return [s for s in strings if is_image_file(s)]


-def find_images(dirname
-
-
+def find_images(dirname,
+                recursive=False,
+                return_relative_paths=False,
+                convert_slashes=True):
     """
     Finds all files in a directory that look like image file names. Returns
     absolute paths unless return_relative_paths is set. Uses the OS-native
-    path separator unless
+    path separator unless convert_slashes is set, in which case will always
     use '/'.
+
+    Args:
+        dirname (str): the folder to search for images
+        recursive (bool, optional): whether to search recursively
+        return_relative_paths (str, optional): return paths that are relative
+            to [dirname], rather than absolute paths
+        convert_slashes (bool, optional): force forward slashes in return values
+
+    Returns:
+        list: list of image filenames found in [dirname]
     """

+    assert os.path.isdir(dirname), '{} is not a folder'.format(dirname)
+
     if recursive:
         strings = glob.glob(os.path.join(dirname, '**', '*.*'), recursive=True)
     else:
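A usage sketch for the updated find_images() signature (assumed import path; the folder name is illustrative).

from md_utils.path_utils import find_images

# Recursively enumerate image files as forward-slashed relative paths
images = find_images('/data/camera-traps',
                     recursive=True,
                     return_relative_paths=True,
                     convert_slashes=True)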
@@ -291,16 +402,28 @@ def find_images(dirname: str, recursive: bool = False,

 #%% Filename cleaning functions

-def clean_filename(filename
-
+def clean_filename(filename,
+                   allow_list=VALID_FILENAME_CHARS,
+                   char_limit=CHAR_LIMIT,
+                   force_lower= False):
     r"""
     Removes non-ASCII and other invalid filename characters (on any
-    reasonable OS) from a filename, then trims to a maximum length.
+    reasonable OS) from a filename, then optionally trims to a maximum length.

     Does not allow :\/ by default, use clean_path if you want to preserve those.

     Adapted from
     https://gist.github.com/wassname/1393c4a57cfcbf03641dbc31886123b8
+
+    Args:
+        filename (str): filename to clean
+        allow_list (str, optional): string containing all allowable filename characters
+        char_limit (int, optional): maximum allowable filename length, if None will skip this
+            step
+        force_lower (bool, optional): convert the resulting filename to lowercase
+
+    returns:
+        str: cleaned version of [filename]
     """

     # keep only valid ascii chars
@@ -316,37 +439,75 @@ def clean_filename(filename: str, allow_list: str = VALID_FILENAME_CHARS,
     return cleaned_filename


-def clean_path(pathname
-
+def clean_path(pathname,
+               allow_list=VALID_PATH_CHARS,
+               char_limit=CHAR_LIMIT,
+               force_lower=False):
     """
     Removes non-ASCII and other invalid path characters (on any reasonable
-    OS) from a path, then trims to a maximum length.
+    OS) from a path, then optionally trims to a maximum length.
+
+    Args:
+        pathname (str): path name to clean
+        allow_list (str, optional): string containing all allowable filename characters
+        char_limit (int, optional): maximum allowable filename length, if None will skip this
+            step
+        force_lower (bool, optional): convert the resulting filename to lowercase
+
+    returns:
+        str: cleaned version of [filename]
     """

     return clean_filename(pathname, allow_list=allow_list,
                           char_limit=char_limit, force_lower=force_lower)


-def flatten_path(pathname
-    """
+def flatten_path(pathname,separator_chars=SEPARATOR_CHARS,separator_char_replacement='~'):
+    r"""
     Removes non-ASCII and other invalid path characters (on any reasonable
     OS) from a path, then trims to a maximum length. Replaces all valid
-    separators with
+    separators with [separator_char_replacement.]
+
+    Args:
+        pathname (str): path name to flatten
+        separator_chars (str, optional): string containing all known path separators
+        separator_char_replacement (str, optional): string to insert in place of
+            path separators.
+
+    Returns:
+        str: flattened version of [pathname]
     """

     s = clean_path(pathname)
     for c in separator_chars:
-        s = s.replace(c,
+        s = s.replace(c, separator_char_replacement)
     return s


-
+def is_executable(filename):
+    """
+    Checks whether [filename] is on the system path and marked as executable.
+
+    Args:
+        filename (str): filename to check for executable status
+
+    Returns:
+        bool: True if [filename] is on the system path and marked as executable, otherwise False
+    """
+
+    # https://stackoverflow.com/questions/11210104/check-if-a-program-exists-from-a-python-script
+
+    return which(filename) is not None

-
+
+#%% Platform-independent way to open files in their associated application

 def environment_is_wsl():
     """
-
+    Determines whether we're running in WSL.
+
+    Returns:
+        True if we're running in WSL.
     """

     if sys.platform not in ('linux','posix'):
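A sketch of the filename-cleaning helpers and the new is_executable() (assumed import path; inputs are illustrative, and exact outputs depend on the module's allow-list and separator constants).

from md_utils.path_utils import clean_filename, flatten_path, is_executable

print(clean_filename('my file:v2?.jpg'))  # drops characters outside the allow-list
print(flatten_path('a/b/c.jpg'))          # replaces path separators, e.g. 'a~b~c.jpg'
print(is_executable('ffmpeg'))            # True if ffmpeg is on the system path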
@@ -356,7 +517,7 @@ def environment_is_wsl():


 def wsl_path_to_windows_path(filename):
-    """
+    r"""
     Converts a WSL path to a Windows path, or returns None if that's not possible. E.g.
     converts:

@@ -365,6 +526,12 @@ def wsl_path_to_windows_path(filename):
     ...to:

     e:\a\b\c
+
+    Args:
+        filename (str): filename to convert
+
+    Returns:
+        str: Windows equivalent to the WSL path [filename]
     """

     result = subprocess.run(['wslpath', '-w', filename], text=True, capture_output=True)
@@ -373,13 +540,38 @@
         return None
     return result.stdout.strip()

-
-def open_file(filename,attempt_to_open_in_wsl_host=False):
+
+def open_file(filename, attempt_to_open_in_wsl_host=False, browser_name=None):
     """
-    Opens [filename] in the
-
+    Opens [filename] in the default OS file handler for this file type.
+
+    If browser_name is not None, uses the webbrowser module to open the filename
+    in the specified browser; see https://docs.python.org/3/library/webbrowser.html
+    for supported browsers. Falls back to the default file handler if webbrowser.open()
+    fails. In this case, attempt_to_open_in_wsl_host is ignored unless webbrowser.open() fails.
+
+    If browser_name is 'default', uses the system default. This is different from the
+    parameter to webbrowser.get(), where None implies the system default.
+
+    Args:
+        filename (str): file to open
+        attempt_to_open_in_wsl_host: if this is True, and we're in WSL, attempts to open
+            [filename] in the Windows host environment
+        browser_name: see above
     """

+    if browser_name is not None:
+        if browser_name == 'chrome':
+            browser_name = 'google-chrome'
+        elif browser_name == 'default':
+            browser_name = None
+        try:
+            result = webbrowser.get(using=browser_name).open(filename)
+        except Exception:
+            result = False
+        if result:
+            return
+
     if sys.platform == 'win32':

         os.startfile(filename)
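A sketch of the new browser_name parameter to open_file() (assumed import path; the file is illustrative).

from md_utils.path_utils import open_file

# Try to open an HTML report in Chrome ('chrome' is mapped to 'google-chrome'
# internally); falls back to the default handler if webbrowser.open() fails
open_file('/tmp/report/index.html', browser_name='chrome')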
@@ -410,10 +602,14 @@ def open_file(filename,attempt_to_open_in_wsl_host=False):

 #%% File list functions

-def write_list_to_file(output_file
+def write_list_to_file(output_file,strings):
     """
     Writes a list of strings to either a JSON file or text file,
     depending on extension of the given file name.
+
+    Args:
+        output_file (str): file to write
+        strings (list): list of strings to write to [output_file]
     """

     with open(output_file, 'w') as f:
@@ -423,9 +619,15 @@ def write_list_to_file(output_file: str, strings: Sequence[str]) -> None:
         f.write('\n'.join(strings))


-def read_list_from_file(filename
+def read_list_from_file(filename):
     """
     Reads a json-formatted list of strings from a file.
+
+    Args:
+        filename (str): .json filename to read
+
+    Returns:
+        list: list of strings read from [filename]
     """

     assert filename.endswith('.json')
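A round-trip sketch for the list I/O helpers (assumed import path; per the docstrings above, a .json extension selects JSON output).

from md_utils.path_utils import write_list_to_file, read_list_from_file

write_list_to_file('/tmp/files.json', ['a.jpg', 'b.jpg'])
assert read_list_from_file('/tmp/files.json') == ['a.jpg', 'b.jpg']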
@@ -437,11 +639,155 @@ def read_list_from_file(filename: str) -> List[str]:
     return file_list


+def _copy_file(input_output_tuple,overwrite=True,verbose=False):
+    """
+    Internal function for copying files from within parallel_copy_files.
+    """
+
+    assert len(input_output_tuple) == 2
+    source_fn = input_output_tuple[0]
+    target_fn = input_output_tuple[1]
+    if (not overwrite) and (os.path.isfile(target_fn)):
+        if verbose:
+            print('Skipping existing file {}'.format(target_fn))
+        return
+    os.makedirs(os.path.dirname(target_fn),exist_ok=True)
+    shutil.copyfile(source_fn,target_fn)
+
+
+def parallel_copy_files(input_file_to_output_file, max_workers=16,
+                        use_threads=True, overwrite=False, verbose=False):
+    """
+    Copies files from source to target according to the dict input_file_to_output_file.
+
+    Args:
+        input_file_to_output_file (dict): dictionary mapping source files to the target files
+            to which they should be copied
+        max_workers (int, optional): number of concurrent workers; set to <=1 to disable parallelism
+        use_threads (bool, optional): whether to use threads (True) or processes (False) for
+            parallel copying; ignored if max_workers <= 1
+        overwrite (bool, optional): whether to overwrite existing destination files
+        verbose (bool, optional): enable additionald debug output
+    """
+
+    n_workers = min(max_workers,len(input_file_to_output_file))
+
+    # Package the dictionary as a set of 2-tuples
+    input_output_tuples = []
+    for input_fn in input_file_to_output_file:
+        input_output_tuples.append((input_fn,input_file_to_output_file[input_fn]))
+
+    if use_threads:
+        pool = ThreadPool(n_workers)
+    else:
+        pool = Pool(n_workers)
+
+    with tqdm(total=len(input_output_tuples)) as pbar:
+        for i,_ in enumerate(pool.imap_unordered(partial(_copy_file,overwrite=overwrite,verbose=verbose),
+                                                 input_output_tuples)):
+            pbar.update()
+
+# ...def parallel_copy_files(...)
+
+
+def get_file_sizes(base_dir, convert_slashes=True):
+    """
+    Gets sizes recursively for all files in base_dir, returning a dict mapping
+    relative filenames to size.
+
+    TODO: merge the functionality here with parallel_get_file_sizes, which uses slightly
+    different semantics.
+
+    Args:
+        base_dir (str): folder within which we want all file sizes
+        convert_slashes (bool, optional): force forward slashes in return strings,
+            otherwise uses the native path separator
+
+    Returns:
+        dict: dictionary mapping filenames to file sizes in bytes
+    """
+
+    relative_filenames = recursive_file_list(base_dir, convert_slashes=convert_slashes,
+                                             return_relative_paths=True)
+
+    fn_to_size = {}
+    for fn_relative in tqdm(relative_filenames):
+        fn_abs = os.path.join(base_dir,fn_relative)
+        fn_to_size[fn_relative] = os.path.getsize(fn_abs)
+
+    return fn_to_size
+
+
+def _get_file_size(filename,verbose=False):
+    """
+    Internal function for safely getting the size of a file. Returns a (filename,size)
+    tuple, where size is None if there is an error.
+    """
+
+    try:
+        size = os.path.getsize(filename)
+    except Exception as e:
+        if verbose:
+            print('Error reading file size for {}: {}'.format(filename,str(e)))
+        size = None
+    return (filename,size)
+
+
+def parallel_get_file_sizes(filenames, max_workers=16,
+                            use_threads=True, verbose=False,
+                            recursive=True):
+    """
+    Returns a dictionary mapping every file in [filenames] to the corresponding file size,
+    or None for errors. If [filenames] is a folder, will enumerate the folder (optionally recursively).
+
+    Args:
+        filenames (list or str): list of filenames for which we should read sizes, or a folder
+            within which we should read all file sizes recursively
+        max_workers (int, optional): number of concurrent workers; set to <=1 to disable parallelism
+        use_threads (bool, optional): whether to use threads (True) or processes (False) for
+            parallel copying; ignored if max_workers <= 1
+        verbose (bool, optional): enable additionald debug output
+
+    Returns:
+        dict: dictionary mapping filenames to file sizes in bytes
+    """
+
+    n_workers = min(max_workers,len(filenames))
+
+    if isinstance(filenames,str) and os.path.isdir(filenames):
+        filenames = recursive_file_list(filenames,recursive=recursive,return_relative_paths=False)
+
+    if use_threads:
+        pool = ThreadPool(n_workers)
+    else:
+        pool = Pool(n_workers)
+
+    resize_results = list(tqdm(pool.imap(
+        partial(_get_file_size,verbose=verbose),filenames), total=len(filenames)))
+
+    to_return = {}
+    for r in resize_results:
+        to_return[r[0]] = r[1]
+
+    return to_return
+
+
 #%% Zip functions

 def zip_file(input_fn, output_fn=None, overwrite=False, verbose=False, compresslevel=9):
     """
-
+    Zips a single file.
+
+    Args:
+        input_fn (str): file to zip
+        output_fn (str, optional): target zipfile; if this is None, we'll use
+            [input_fn].zip
+        overwrite (bool, optional): whether to overwrite an existing target file
+        verbose (bool, optional): enable existing debug console output
+        compresslevel (int, optional): compression level to use, between 0 and 9
+
+    Returns:
+        str: the output zipfile, whether we created it or determined that it already exists
     """

     basename = os.path.basename(input_fn)
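A sketch of the new parallel copy/size helpers added in this hunk (assumed import path; source and target paths are illustrative).

from md_utils.path_utils import parallel_copy_files, parallel_get_file_sizes

# Copy a batch of files over 8 worker threads, skipping existing targets
mapping = {'/data/src/a.jpg': '/data/dst/a.jpg',
           '/data/src/b.jpg': '/data/dst/b.jpg'}
parallel_copy_files(mapping, max_workers=8, use_threads=True, overwrite=False)

# Read sizes for every file under a folder in parallel; failed reads map to None
sizes = parallel_get_file_sizes('/data/dst', max_workers=8)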
@@ -451,10 +797,10 @@ def zip_file(input_fn, output_fn=None, overwrite=False, verbose=False, compressl

     if (not overwrite) and (os.path.isfile(output_fn)):
         print('Skipping existing file {}'.format(output_fn))
-        return
+        return output_fn

     if verbose:
-        print('Zipping {} to {}'.format(input_fn,output_fn))
+        print('Zipping {} to {} with level {}'.format(input_fn,output_fn,compresslevel))

     with ZipFile(output_fn,'w',zipfile.ZIP_DEFLATED) as zipf:
         zipf.write(input_fn,arcname=basename,compresslevel=compresslevel,
@@ -463,21 +809,113 @@ def zip_file(input_fn, output_fn=None, overwrite=False, verbose=False, compressl
     return output_fn


+def add_files_to_single_tar_file(input_files, output_fn, arc_name_base,
+                                 overwrite=False, verbose=False, mode='x'):
+    """
+    Adds all the files in [input_files] to the tar file [output_fn].
+    Archive names are relative to arc_name_base.
+
+    Args:
+        input_files (list): list of absolute filenames to include in the .tar file
+        output_fn (str): .tar file to create
+        arc_name_base (str): absolute folder from which relative paths should be determined;
+            behavior is undefined if there are files in [input_files] that don't live within
+            [arc_name_base]
+        overwrite (bool, optional): whether to overwrite an existing .tar file
+        verbose (bool, optional): enable additional debug console output
+        mode (str, optional): compression type, can be 'x' (no compression), 'x:gz', or 'x:bz2'.
+
+    Returns:
+        str: the output tar file, whether we created it or determined that it already exists
+    """
+
+    if os.path.isfile(output_fn):
+        if not overwrite:
+            print('Tar file {} exists, skipping'.format(output_fn))
+            return output_fn
+        else:
+            print('Tar file {} exists, deleting and re-creating'.format(output_fn))
+            os.remove(output_fn)
+
+    if verbose:
+        print('Adding {} files to {} (mode {})'.format(
+            len(input_files),output_fn,mode))
+
+    with tarfile.open(output_fn,mode) as tarf:
+        for input_fn_abs in tqdm(input_files,disable=(not verbose)):
+            input_fn_relative = os.path.relpath(input_fn_abs,arc_name_base)
+            tarf.add(input_fn_abs,arcname=input_fn_relative)
+
+    return output_fn
+
+
+def zip_files_into_single_zipfile(input_files, output_fn, arc_name_base,
+                                  overwrite=False, verbose=False, compresslevel=9):
+    """
+    Zip all the files in [input_files] into [output_fn]. Archive names are relative to
+    arc_name_base.
+
+    Args:
+        input_files (list): list of absolute filenames to include in the .tar file
+        output_fn (str): .tar file to create
+        arc_name_base (str): absolute folder from which relative paths should be determined;
+            behavior is undefined if there are files in [input_files] that don't live within
+            [arc_name_base]
+        overwrite (bool, optional): whether to overwrite an existing .tar file
+        verbose (bool, optional): enable additional debug console output
+        compresslevel (int, optional): compression level to use, between 0 and 9
+
+    Returns:
+        str: the output zipfile, whether we created it or determined that it already exists
+    """
+
+    if not overwrite:
+        if os.path.isfile(output_fn):
+            print('Zip file {} exists, skipping'.format(output_fn))
+            return output_fn
+
+    if verbose:
+        print('Zipping {} files to {} (compression level {})'.format(
+            len(input_files),output_fn,compresslevel))
+
+    with ZipFile(output_fn,'w',zipfile.ZIP_DEFLATED) as zipf:
+        for input_fn_abs in tqdm(input_files,disable=(not verbose)):
+            input_fn_relative = os.path.relpath(input_fn_abs,arc_name_base)
+            zipf.write(input_fn_abs,
+                       arcname=input_fn_relative,
+                       compresslevel=compresslevel,
+                       compress_type=zipfile.ZIP_DEFLATED)
+
+    return output_fn
+
+
 def zip_folder(input_folder, output_fn=None, overwrite=False, verbose=False, compresslevel=9):
     """
-    Recursively zip everything in [input_folder], storing outputs as relative
+    Recursively zip everything in [input_folder] into a single zipfile, storing outputs as relative
+    paths.

-
+    Args:
+        input_folder (str): folder to zip
+        output_fn (str, optional): output filename; if this is None, we'll write to [input_folder].zip
+        overwrite (bool, optional): whether to overwrite an existing .tar file
+        verbose (bool, optional): enable additional debug console output
+        compresslevel (int, optional): compression level to use, between 0 and 9
+
+    Returns:
+        str: the output zipfile, whether we created it or determined that it already exists
     """

     if output_fn is None:
         output_fn = input_folder + '.zip'

     if not overwrite:
-
+        if os.path.isfile(output_fn):
+            print('Zip file {} exists, skipping'.format(output_fn))
+            return

     if verbose:
-        print('Zipping {} to {}'.format(
+        print('Zipping {} to {} (compression level {})'.format(
+            input_folder,output_fn,compresslevel))

     relative_filenames = recursive_file_list(input_folder,return_relative_paths=True)

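A sketch of the new multi-file archive helpers (assumed import path; filenames are illustrative). Archive members are stored under names relative to arc_name_base.

from md_utils.path_utils import zip_files_into_single_zipfile

zip_files_into_single_zipfile(['/data/run1/results.json', '/data/run1/log.txt'],
                              '/data/run1.zip',
                              arc_name_base='/data',
                              verbose=True)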
@@ -492,10 +930,20 @@ def zip_folder(input_folder, output_fn=None, overwrite=False, verbose=False, com
     return output_fn


-def parallel_zip_files(input_files, max_workers=16, use_threads=True
+def parallel_zip_files(input_files, max_workers=16, use_threads=True, compresslevel=9,
+                       overwrite=False, verbose=False):
     """
-
+    Zips one or more files to separate output files in parallel, leaving the
     original files in place. Each file is zipped to [filename].zip.
+
+    Args:
+        input_file (str): list of files to zip
+        max_workers (int, optional): number of concurrent workers, set to <= 1 to disable parallelism
+        use_threads (bool, optional): whether to use threads (True) or processes (False); ignored if
+            max_workers <= 1
+        compresslevel (int, optional): zip compression level between 0 and 9
+        overwrite (bool, optional): whether to overwrite an existing .tar file
+        verbose (bool, optional): enable additional debug console output
     """

     n_workers = min(max_workers,len(input_files))
@@ -506,15 +954,26 @@ def parallel_zip_files(input_files, max_workers=16, use_threads=True):
         pool = Pool(n_workers)

     with tqdm(total=len(input_files)) as pbar:
-        for i,_ in enumerate(pool.imap_unordered(zip_file,
+        for i,_ in enumerate(pool.imap_unordered(partial(zip_file,
+            output_fn=None,overwrite=overwrite,verbose=verbose,compresslevel=compresslevel),
+            input_files)):
             pbar.update()


 def parallel_zip_folders(input_folders, max_workers=16, use_threads=True,
-                         compresslevel=9, overwrite=False):
+                         compresslevel=9, overwrite=False, verbose=False):
     """
-
+    Zips one or more folders to separate output files in parallel, leaving the
     original folders in place. Each folder is zipped to [folder_name].zip.
+
+    Args:
+        input_folder (list): list of folders to zip
+        max_workers (int, optional): number of concurrent workers, set to <= 1 to disable parallelism
+        use_threads (bool, optional): whether to use threads (True) or processes (False); ignored if
+            max_workers <= 1
+        compresslevel (int, optional): zip compression level between 0 and 9
+        overwrite (bool, optional): whether to overwrite an existing .tar file
+        verbose (bool, optional): enable additional debug console output
     """

     n_workers = min(max_workers,len(input_folders))
@@ -526,15 +985,56 @@ def parallel_zip_folders(input_folders, max_workers=16, use_threads=True,

     with tqdm(total=len(input_folders)) as pbar:
         for i,_ in enumerate(pool.imap_unordered(
-            partial(zip_folder,overwrite=overwrite,
+            partial(zip_folder,overwrite=overwrite,
+                    compresslevel=compresslevel,verbose=verbose),
             input_folders)):
             pbar.update()


+def zip_each_file_in_folder(folder_name,recursive=False,max_workers=16,use_threads=True,
+                            compresslevel=9,overwrite=False,required_token=None,verbose=False,
+                            exclude_zip=True):
+    """
+    Zips each file in [folder_name] to its own zipfile (filename.zip), optionally recursing. To
+    zip a whole folder into a single zipfile, use zip_folder().
+
+    Args:
+        folder_name (str): the folder within which we should zip files
+        recursive (bool, optional): whether to recurse within [folder_name]
+        max_workers (int, optional): number of concurrent workers, set to <= 1 to disable parallelism
+        use_threads (bool, optional): whether to use threads (True) or processes (False); ignored if
+            max_workers <= 1
+        compresslevel (int, optional): zip compression level between 0 and 9
+        overwrite (bool, optional): whether to overwrite an existing .tar file
+        required_token (str, optional): only zip files whose names contain this string
+        verbose (bool, optional): enable additional debug console output
+        exclude_zip (bool, optional): skip files ending in .zip
+    """
+
+    assert os.path.isdir(folder_name), '{} is not a folder'.format(folder_name)
+
+    input_files = recursive_file_list(folder_name,recursive=recursive,return_relative_paths=False)
+
+    if required_token is not None:
+        input_files = [fn for fn in input_files if required_token in fn]
+
+    if exclude_zip:
+        input_files = [fn for fn in input_files if (not fn.endswith('.zip'))]
+
+    parallel_zip_files(input_files=input_files,max_workers=max_workers,
+                       use_threads=use_threads,compresslevel=compresslevel,
+                       overwrite=overwrite,verbose=verbose)
+
+
 def unzip_file(input_file, output_folder=None):
     """
-
-    the input file
+    Unzips a zipfile to the specified output folder, defaulting to the same location as
+    the input file.
+
+    Args:
+        input_file (str): zipfile to unzip
+        output_folder (str, optional): folder to which we should unzip [input_file], defaults
+            to unzipping to the folder where [input_file] lives
     """

     if output_folder is None: