zhmiscellanylite 0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of zhmiscellanylite might be problematic. Click here for more details.
- zhmiscellanylite/__init__.py +1 -0
- zhmiscellanylite/_fileio_supportfuncs.py +11 -0
- zhmiscellanylite/dict.py +3 -0
- zhmiscellanylite/fileio.py +673 -0
- zhmiscellanylite/list.py +58 -0
- zhmiscellanylite/math.py +87 -0
- zhmiscellanylite/misc.py +797 -0
- zhmiscellanylite/processing.py +134 -0
- zhmiscellanylite/string.py +161 -0
- zhmiscellanylite-0.0.3.dist-info/METADATA +26 -0
- zhmiscellanylite-0.0.3.dist-info/RECORD +13 -0
- zhmiscellanylite-0.0.3.dist-info/WHEEL +5 -0
- zhmiscellanylite-0.0.3.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from . import processing, misc, fileio, string, math, list, dict
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
def is_junction(entry):
    """
    Report whether a directory entry is a Windows reparse point (e.g. a junction).

    Always returns False on platforms other than win32, and whenever the
    entry cannot be stat'ed.
    """
    import sys

    if sys.platform == "win32":
        try:
            info = entry.stat(follow_symlinks=False)
            if not hasattr(info, "st_file_attributes"):
                return False
            # 0x400 is FILE_ATTRIBUTE_REPARSE_POINT.
            return bool(info.st_file_attributes & 0x400)
        except Exception:
            return False
    return False
|
zhmiscellanylite/dict.py
ADDED
|
@@ -0,0 +1,673 @@
|
|
|
1
|
+
def read_json_file(file_path):
    """
    Reads JSON data from a file and returns the parsed value.

    If the file does not exist it is created containing an empty JSON
    object and an empty dict is returned.

    The file is opened directly instead of being probed with
    os.path.exists first, so a file appearing or disappearing between the
    check and the open can no longer cause a spurious failure. Text is
    decoded as UTF-8 so the result does not depend on the platform locale.

    Raises:
        json.JSONDecodeError: if the existing file does not hold valid JSON.
    """
    import json

    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)
    except FileNotFoundError:
        # First use: seed the file so later reads and writes find it.
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write('{}')
        return {}
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def write_json_file(file_path, data):
    """
    Serialise *data* as JSON with a 4-space indent into *file_path*,
    replacing any previous content.
    """
    import json

    with open(file_path, 'w') as out:
        json.dump(obj=data, fp=out, indent=4)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def create_folder(folder_name):
    """
    Create *folder_name* (including missing parents) if it does not exist.

    Uses exist_ok=True rather than an exists-then-create check, so two
    processes or threads creating the same folder at the same time no
    longer race into a FileExistsError.
    """
    import os

    try:
        os.makedirs(folder_name, exist_ok=True)
    except FileExistsError:
        # The path exists but is not a directory. The previous
        # exists()-based check silently did nothing here; keep that.
        pass
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def remove_folder(folder_name):
    """Delete *folder_name* and everything below it; do nothing if it is absent."""
    import os
    import shutil

    if not os.path.exists(folder_name):
        return
    shutil.rmtree(folder_name)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def base_name_no_ext(file_path):
    """Return the final path component of *file_path* with its last extension removed."""
    import os

    return os.path.splitext(os.path.basename(file_path))[0]
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def convert_name_to_filename(name):
    """
    Map a name to a file-name-safe string: "/" -> "[", ":" -> "]", "." -> "+".

    Implemented with str.translate so the function no longer imports the
    separate ``zhmiscellany`` distribution just for three single-character
    substitutions.
    NOTE(review): assumes the previously used
    zhmiscellany.string.multi_replace applied the pairs as plain sequential
    replacements - confirm. None of the replacement characters is itself a
    search character, so sequential and simultaneous replacement agree.
    """
    return name.translate(str.maketrans({"/": "[", ":": "]", ".": "+"}))
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def convert_filename_to_name(filename):
    """
    Map a file-name-safe string back to a name: "[" -> "/", "]" -> ":", "+" -> ".".

    Implemented with str.translate so the function no longer imports the
    separate ``zhmiscellany`` distribution just for three single-character
    substitutions.
    NOTE(review): assumes the previously used
    zhmiscellany.string.multi_replace applied the pairs as plain sequential
    replacements - confirm. None of the replacement characters is itself a
    search character, so sequential and simultaneous replacement agree.
    """
    return filename.translate(str.maketrans({"[": "/", "]": ":", "+": "."}))
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def recursive_copy_files(source_dir, destination_dir, prints=False):
    """
    Mirror the files under *source_dir* into *destination_dir*.

    Missing directories are created first. A file is then copied (with
    shutil.copy2, so metadata is kept) when it is absent from the
    destination or when its modification time differs from the source's.
    Files that exist only in the destination are left alone.

    Args:
        source_dir: directory tree to copy from.
        destination_dir: directory tree to copy into; created if missing.
        prints: when True, progress messages are printed. When False the
            function prints nothing.
    """
    import os
    import shutil

    if prints:
        print('Validating matching directory structure')
    # Without this, a source holding only top-level files and a missing
    # destination made the first copy fail.
    os.makedirs(destination_dir, exist_ok=True)
    for root, dirs, _ in os.walk(source_dir):
        for dir_name in dirs:
            dir_path = os.path.join(root, dir_name)
            dest_dir_path = os.path.join(destination_dir, os.path.relpath(dir_path, source_dir))
            if not os.path.exists(dest_dir_path):
                if prints:  # this message used to be printed unconditionally
                    print(f'Creating missing directory {dest_dir_path}')
                os.makedirs(dest_dir_path)

    # Two full directory walks that only built lists nobody read were
    # removed here, together with their progress messages.

    if prints:
        print('Copying files from source to destination, skipping duplicates')
    for root, _, files in os.walk(source_dir):
        for file_name in files:
            source_file = os.path.join(root, file_name)
            rel_path = os.path.relpath(source_file, source_dir)
            dest_file = os.path.join(destination_dir, rel_path)
            if os.path.exists(dest_file) and os.path.getmtime(source_file) == os.path.getmtime(dest_file):
                continue  # same mtime: treated as already copied
            if prints:
                print(f'Copying {source_file}')
            shutil.copy2(source_file, dest_file)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def empty_directory(directory_path):
    """
    Delete everything inside *directory_path*, keeping the directory itself.

    Real sub-directories are removed recursively. Every other entry,
    including symbolic links (to files or directories, valid or broken),
    is unlinked without following it. Previously a symlink to a directory
    was passed to shutil.rmtree, which refuses symlinks and raises, and
    broken symlinks were skipped and left behind.
    """
    import os
    import shutil

    # Snapshot the entries first so deletion does not run while the
    # directory iterator is still open.
    with os.scandir(directory_path) as iterator:
        entries = list(iterator)

    for entry in entries:
        if entry.is_dir(follow_symlinks=False):
            shutil.rmtree(entry.path)
        else:
            os.unlink(entry.path)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def abs_listdir(path):
    """
    List the entries of *path* as absolute paths.

    Raises:
        FileNotFoundError: if the path does not exist.
        NotADirectoryError: if the path exists but is not a directory.
    """
    import os

    base = os.path.abspath(path)

    if not os.path.exists(base):
        raise FileNotFoundError(f"Directory not found: {base}")
    if not os.path.isdir(base):
        raise NotADirectoryError(f"Path is not a directory: {base}")

    return [os.path.join(base, name) for name in os.listdir(base)]
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def delete_ends_with(directory, string_endswith, avoid=None):
    """
    Delete every entry directly inside *directory* whose absolute path ends
    with *string_endswith*.

    Args:
        directory: directory to scan (not recursive).
        string_endswith: suffix, or tuple of suffixes, passed to str.endswith.
        avoid: optional collection of paths to keep. Entries are compared
            against the absolute paths produced by abs_listdir, so the
            collection must hold absolute paths to have any effect.
            Defaults to nothing (the former mutable ``[]`` default was
            replaced by None).
    """
    import os

    protected = () if avoid is None else avoid
    for file in abs_listdir(directory):
        if file.endswith(string_endswith) and file not in protected:
            os.remove(file)
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def read_bytes_section(file_path, section_start, section_end):
    """
    Return the bytes of *file_path* from offset *section_start* up to, but
    not including, *section_end*.
    """
    length = section_end - section_start
    with open(file_path, 'rb') as handle:
        handle.seek(section_start)
        return handle.read(length)
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def copy_file_with_overwrite(src, dst):
    """
    Copy *src* to *dst* (data and metadata, via shutil.copy2), replacing an
    existing *dst*.

    If *src* and *dst* refer to the same file nothing is done. Previously
    the file was deleted first and the copy then failed, destroying the
    data. A missing *src* is now detected before *dst* is removed.

    Raises:
        FileNotFoundError: if *src* does not exist.
    """
    import os
    import shutil

    if os.path.exists(dst):
        # samefile also raises FileNotFoundError for a missing src, which
        # happens before the existing destination is touched.
        if os.path.samefile(src, dst):
            return
        os.remove(dst)
    shutil.copy2(src, dst)
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def fast_dill_dumps(object):
    """
    Serialise *object* to bytes, trying pickle first and dill second.

    pickle (protocol 5) is attempted first because it is much faster. dill
    is imported and used only when pickle raises, so objects that pickle
    can handle no longer require dill to be installed.

    Only Exception subclasses trigger the fallback. The former bare
    ``except`` also intercepted KeyboardInterrupt and SystemExit.
    """
    import pickle

    try:
        return pickle.dumps(object, protocol=5)
    except Exception:
        import dill
        return dill.dumps(object, protocol=5)
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def fast_dill_loads(data):
    """
    Deserialise bytes produced by pickle or dill.

    pickle is attempted first because it is much faster. dill is imported
    and used only when pickle raises, so plain pickle payloads no longer
    require dill to be installed.

    Only Exception subclasses trigger the fallback. The former bare
    ``except`` also intercepted KeyboardInterrupt and SystemExit.

    SECURITY: unpickling executes code chosen by whoever produced *data*.
    Only pass data from a trusted source.
    """
    import pickle

    try:
        return pickle.loads(data)
    except Exception:
        import dill
        return dill.loads(data)
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
# Module-wide zstandard compressor/decompressor, built on first use.
zstd_comp = None
zstd_decomp = None


def _get_std_objects():
    """
    Return the shared (compressor, decompressor) pair, creating both with
    zstandard (compression level 4) the first time either is missing.
    """
    global zstd_comp, zstd_decomp
    if zstd_comp is not None and zstd_decomp is not None:
        return zstd_comp, zstd_decomp
    import zstandard as zstd
    zstd_comp = zstd.ZstdCompressor(level=4)
    zstd_decomp = zstd.ZstdDecompressor()
    return zstd_comp, zstd_decomp
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def save_object_to_file(object, file_name, compressed=False):
    """
    Serialise *object* with fast_dill_dumps and write it to *file_name*.

    Args:
        object: the value to store.
        file_name: destination path; overwritten if it exists.
        compressed: when True the serialised bytes are zstd-compressed
            before being written.

    The zstandard objects are fetched only when *compressed* is True, so an
    uncompressed save no longer needs zstandard to be importable. The
    payload is built before the file is opened, so a serialisation failure
    no longer leaves a truncated file behind.
    """
    payload = fast_dill_dumps(object)
    if compressed:
        compressor, _ = _get_std_objects()
        payload = compressor.compress(payload)
    with open(file_name, 'wb') as f:
        f.write(payload)
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def load_object_from_file(file_name, compressed=False):
    """
    Read *file_name* and deserialise its content with fast_dill_loads.

    Args:
        file_name: path written by save_object_to_file.
        compressed: must match the flag used when saving; when True the
            bytes are zstd-decompressed first.

    The zstandard objects are fetched only when *compressed* is True, so an
    uncompressed load no longer needs zstandard to be importable.

    SECURITY: the content is unpickled; only load files from a trusted
    source.
    """
    with open(file_name, 'rb') as f:
        raw = f.read()
    if compressed:
        _, decompressor = _get_std_objects()
        raw = decompressor.decompress(raw)
    return fast_dill_loads(raw)
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def pickle_and_encode(obj):
    """
    Pickles an object and URL-safe encodes it.

    The object is serialised with fast_dill_dumps, zstd-compressed, and
    returned as URL-safe Base64 text (str).
    """
    # The docstring used to follow the first statement, which made it a
    # discarded string expression instead of the function's __doc__.
    import base64

    zstd_comp, _ = _get_std_objects()
    pickled_data = zstd_comp.compress(fast_dill_dumps(obj))  # Serialize, then compress
    return base64.urlsafe_b64encode(pickled_data).decode()  # Base64 encode
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def decode_and_unpickle(encoded_str):
    """
    Decodes a URL-safe encoded string and unpickles the object.

    Reverses pickle_and_encode: URL-safe Base64 decode, zstd decompress,
    then fast_dill_loads.

    SECURITY: the payload is unpickled; only decode strings from a trusted
    source.
    """
    # The docstring used to follow the first statement, which made it a
    # discarded string expression instead of the function's __doc__.
    import base64

    _, zstd_decomp = _get_std_objects()
    pickled_data = base64.urlsafe_b64decode(encoded_str)  # Decode from Base64
    return fast_dill_loads(zstd_decomp.decompress(pickled_data))  # Deserialize
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def list_files_by_modified_time(directory):
    """
    Return the absolute paths of the regular files directly inside
    *directory*, most recently modified first.
    """
    import os

    stamped = []
    for path in abs_listdir(directory):
        # abs_listdir already yields absolute paths, so joining them onto
        # *directory* again would return the same path.
        if os.path.isfile(path):
            stamped.append((os.path.getmtime(path), path))

    stamped.sort(key=lambda pair: pair[0], reverse=True)
    return [path for _, path in stamped]
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def get_script_path():
    """
    Returns the path to the current script or executable.

    When ``sys.frozen`` is set (standalone executable) this is
    sys.executable, otherwise it is sys.argv[0].
    """
    import sys

    running_frozen = getattr(sys, 'frozen', False)
    return sys.executable if running_frozen else sys.argv[0]
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
def chdir_to_script_dir():
    """
    Change the working directory to the directory holding the running
    script or executable (as reported by get_script_path).

    The script path is made absolute first. When the script was started
    with a bare file name, its dirname is the empty string and
    os.chdir('') raises FileNotFoundError.
    """
    import os

    os.chdir(os.path.dirname(os.path.abspath(get_script_path())))
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def cache(function, *args, _cache_compressed=False, **kwargs):
    """
    Caches the result of a function call to disk.

    A hash is built from the function (its source text when available),
    the positional and keyword arguments and the compression flag. If a
    cache file for that hash exists in the ``zhmiscellany_cache`` folder
    (relative to the current working directory) its content is returned.
    Otherwise the function is called, the result is stored, and it is
    returned.

    Args:
        function: the callable to run or look up.
        *args: positional arguments forwarded to *function*.
        _cache_compressed: store/load the cache file zstd-compressed.
        **kwargs: keyword arguments forwarded to *function*.

    Returns:
        The cached or freshly computed result of ``function(*args, **kwargs)``.

    Notes:
        * pandas DataFrame/Series arguments are hashed from their metadata
          plus the first 100 rows only. Two frames that differ only beyond
          row 100 therefore share a cache entry.
        * Dictionary keys are converted with str() before hashing, so keys
          such as 1 and '1' are not distinguished.
    """
    import os
    import inspect
    import orjson
    import hashlib
    from datetime import datetime

    cache_folder = 'zhmiscellany_cache'

    def normalize_for_json(obj):
        """Recursively normalize objects to be JSON-serializable."""
        if callable(obj):
            try:
                return ('__callable__', inspect.getsource(obj))
            except (OSError, TypeError):
                # No retrievable source (builtins, some callables): fall
                # back to the string form.
                return ('__callable__', str(obj))

        # Handle dict-like objects (including bidict, defaultdict, etc.)
        if isinstance(obj, dict):
            # Convert non-string keys to strings
            return {str(k): normalize_for_json(v) for k, v in obj.items()}

        # Handle lists and tuples
        if isinstance(obj, (list, tuple)):
            return type(obj)(normalize_for_json(item) for item in obj)

        # Handle sets (sorted so that iteration order does not affect the hash)
        if isinstance(obj, set):
            return sorted([normalize_for_json(item) for item in obj])

        # Handle pandas DataFrames - OPTIMIZED FOR HASHING
        if hasattr(obj, '__class__') and obj.__class__.__name__ == 'DataFrame':
            try:
                import pandas as pd
                if isinstance(obj, pd.DataFrame):
                    import pandas.util as pd_util

                    # Fast fingerprint: shape, columns, dtypes (as a single
                    # string) and a hash of at most the first 100 rows.
                    return {
                        '__type__': 'DataFrame',
                        'shape': obj.shape,
                        'columns': list(obj.columns),
                        'dtypes_str': str(obj.dtypes.to_dict()),  # Single conversion instead of dict comp
                        'index_name': obj.index.name,
                        # Use pandas' built-in hash on a sample for speed
                        'hash': str(pd_util.hash_pandas_object(obj.iloc[:min(100, len(obj))]).sum())
                    }
            except ImportError:
                pass

        # Handle pandas Series - OPTIMIZED
        if hasattr(obj, '__class__') and obj.__class__.__name__ == 'Series':
            try:
                import pandas as pd
                if isinstance(obj, pd.Series):
                    import pandas.util as pd_util
                    return {
                        '__type__': 'Series',
                        'dtype': str(obj.dtype),
                        'name': obj.name,
                        'shape': obj.shape,
                        'hash': str(pd_util.hash_pandas_object(obj.iloc[:min(100, len(obj))]).sum())
                    }
            except ImportError:
                pass

        # Handle bytes
        if isinstance(obj, bytes):
            return ('__bytes__', obj.hex())

        # Handle datetime
        if isinstance(obj, datetime):
            return ('__datetime__', obj.isoformat())

        # Handle custom objects with __dict__
        if hasattr(obj, '__dict__') and not isinstance(obj, type):
            return {f'__{obj.__class__.__name__}__': normalize_for_json(obj.__dict__)}

        # Return primitives as-is
        return obj

    def get_hash_orjson(data):
        # Pre-process the data to handle non-string keys and other issues
        normalized_data = normalize_for_json(data)

        json_bytes = orjson.dumps(
            normalized_data,
            option=orjson.OPT_SORT_KEYS
        )
        # md5 is used only as a cache key here, not for security.
        return hashlib.md5(json_bytes).hexdigest()

    seed = {
        'function': function,
        'args': args,
        'kwargs': kwargs,
        'compressed': _cache_compressed
    }

    seed_hash = get_hash_orjson(seed)

    # Callables such as functools.partial objects have no __name__; fall
    # back to the type name instead of raising AttributeError.
    function_name = getattr(function, '__name__', type(function).__name__)
    cache_file = f'{cache_folder}/cache_{function_name}_{seed_hash}.pkl'

    if os.path.exists(cache_file):
        return load_object_from_file(cache_file, compressed=_cache_compressed)

    result = function(*args, **kwargs)
    # create_folder is defined in this module; it used to be reached
    # through an import of the separate ``zhmiscellany`` distribution.
    create_folder(cache_folder)
    save_object_to_file(result, cache_file, compressed=_cache_compressed)
    return result
|
|
366
|
+
|
|
367
|
+
|
|
368
|
+
def load_all_cached():
    """
    Loads all cached objects from the cache folder.

    Every file in ``zhmiscellany_cache`` whose name starts with ``cache_``
    (the prefix cache() writes) is loaded. Names containing ``fn_cache_``
    are still accepted as well. Previously only ``fn_cache_`` was matched,
    which cache() never produces, so the function always reported an empty
    cache.

    Files are loaded uncompressed.
    NOTE(review): entries written with _cache_compressed=True will not
    load this way - confirm whether they should be supported.

    Raises:
        Exception: if the cache folder is missing or holds no cache files.
    """
    import os

    cache_folder = 'zhmiscellany_cache'
    if not os.path.exists(cache_folder):
        raise Exception('Nothing has been cached yet')

    files = []
    for file in abs_listdir(cache_folder):
        # Match on the file name only, so a parent directory's name cannot
        # influence the result.
        file_name = os.path.basename(file)
        if file_name.startswith('cache_') or 'fn_cache_' in file_name:
            files.append(file)

    if not files:
        raise Exception('Nothing has been cached yet')
    return [load_object_from_file(file) for file in files]
|
|
383
|
+
|
|
384
|
+
|
|
385
|
+
def list_files_recursive(folder):
    """
    Recursively lists all files in a directory without descending into
    symlinked directories or junctions.

    Entries that resolve to a file are listed, including symbolic links
    that point at files, because the file check runs first and follows
    links. Symlinked directories and junctions are skipped, which keeps
    the walk from following link cycles. Folders that cannot be read or
    that vanish during the scan are silently skipped.

    Returns:
        list of paths (built from *folder* by os.scandir).
    """
    import os
    from ._fileio_supportfuncs import is_junction

    files = []
    try:
        # The context manager closes the directory handle promptly, even
        # when an exception interrupts the iteration.
        with os.scandir(folder) as entries:
            for entry in entries:
                if entry.is_file():
                    files.append(entry.path)
                elif entry.is_symlink() or is_junction(entry):
                    continue
                elif entry.is_dir():
                    files.extend(list_files_recursive(entry.path))
    except (PermissionError, FileNotFoundError):
        pass
    return files
|
|
403
|
+
|
|
404
|
+
|
|
405
|
+
def list_files_recursive_multiprocessed(dir_path, return_folders=False):
    """
    List all files below *dir_path*, scanning the top levels in this
    process and handing deeper folders to
    zhmiscellany.processing.batch_multiprocess.

    Args:
        dir_path: root directory to scan.
        return_folders: when True, return ``(files, folders)`` instead of
            only ``files``.

    Returns:
        A list of file paths, or a ``(files, folders)`` tuple of lists.

    Symlinked directories and reparse points are not descended into.
    Folders that raise PermissionError or FileNotFoundError are skipped.
    """
    import os
    import zhmiscellany.processing

    def is_junction(entry):
        # Local copy of the reparse-point check, defined inside the function.
        # presumably so it travels with `traversal` to the workers - confirm.
        try:
            st = entry.stat(follow_symlinks=False)
            # On Windows, st_file_attributes is available.
            # FILE_ATTRIBUTE_REPARSE_POINT (0x400) indicates a reparse point (e.g. junction).
            return hasattr(st, "st_file_attributes") and bool(st.st_file_attributes & 0x400)
        except Exception:
            return False

    def traversal(dir_path, depth):
        # Returns (files, folders, tasks) for dir_path. `tasks` holds
        # (callable, args) pairs for sub-folders that were not scanned here.
        depth += 1
        files = []
        folders = []
        tasks = []
        try:
            for entry in os.scandir(dir_path):
                if entry.is_file():
                    files.append(entry.path)
                elif entry.is_symlink() or is_junction(entry):
                    continue
                elif entry.is_dir():
                    folders.append(entry.path)
                    if depth > max_python_depth:
                        # Too deep for the in-process pass: queue it. The
                        # large negative depth keeps the queued call from
                        # ever reaching this branch again, so it recurses
                        # fully instead of queueing further tasks.
                        tasks.append((traversal, (entry.path, -99999)))
                    else:
                        new_files, new_folders, new_tasks = traversal(entry.path, depth)
                        files.extend(new_files)
                        folders.extend(new_folders)
                        tasks.extend(new_tasks)
        except (PermissionError, FileNotFoundError):
            pass
        return (files, folders, tasks)

    # With a limit of 1 the root and its direct sub-folders are scanned in
    # this process; anything deeper is queued as a task.
    max_python_depth = 1
    files, folders, tasks = traversal(dir_path, 0)
    # presumably runs each (callable, args) pair in a worker process and
    # returns their results in a list - confirm against batch_multiprocess.
    file_groups = zhmiscellany.processing.batch_multiprocess(tasks)
    for group in file_groups:
        # Each result is a (files, folders, tasks) tuple; the tasks element
        # is not used here.
        files.extend(group[0])
        folders.extend(group[1])
    if return_folders:
        return files, folders
    else:
        return files
|
|
452
|
+
|
|
453
|
+
|
|
454
|
+
def encode_safe_filename(s, max_length=16):
    """
    Encodes a string into a short, URL-safe, and file name-safe string.

    The result is the unpadded URL-safe Base64 form of *s* when that fits
    in *max_length* characters. Otherwise it is the first *max_length*
    characters of the MD5 hex digest of *s*.
    """
    import base64
    import hashlib

    raw = s.encode()
    token = base64.urlsafe_b64encode(raw).decode().rstrip("=")
    if len(token) <= max_length:
        return token
    # Too long to keep verbatim: switch to a fixed-size digest prefix.
    return hashlib.md5(raw).hexdigest()[:max_length]
|
|
462
|
+
|
|
463
|
+
|
|
464
|
+
def list_files_recursive_cache_optimised_multiprocessed(dir_path, show_timings=False, cache_in_temp=True):
    """
    List all files below *dir_path*, using an on-disk cache of a previous
    scan to avoid re-reading folders whose modification time is unchanged.

    First call (no cache file): the tree is scanned (top levels in this
    process, deeper folders through zhmiscellany.processing.batch_multiprocess),
    the modification time of every folder is recorded, and both are saved.

    Later calls: the folder modification times are re-read. Folders whose
    time changed are re-listed, and sub-folders not seen before are scanned
    recursively. The cache file is rewritten only when more than
    ``fully_update_cache_threshold`` folders changed.

    Args:
        dir_path: root directory to scan.
        show_timings: when True, timing checkpoints are reported through
            zhmiscellany.misc.time_it.
        cache_in_temp: store the cache file in the system temp directory
            (True) or in ``zhmiscellany_cache`` under the current working
            directory (False).

    Returns:
        list of file paths.

    Changes from the previous implementation:
        * The root directory's own modification time is now tracked, so
          files added or removed directly in *dir_path* are noticed.
        * Files of folders that no longer exist are dropped from the result
          instead of being returned from the stale cache.
        * The cache rewrite passes the same compression flag as the read.
        * create_folder / save_object_to_file / load_object_from_file are
          called from this module instead of through the separate
          ``zhmiscellany`` distribution.
    """
    import os
    import zhmiscellany.processing
    import tempfile
    from collections import defaultdict
    import random
    from itertools import chain
    import zhmiscellany.misc

    def is_junction(entry):
        try:
            st = entry.stat(follow_symlinks=False)
            # On Windows, st_file_attributes is available.
            # FILE_ATTRIBUTE_REPARSE_POINT (0x400) indicates a reparse point (e.g. junction).
            return hasattr(st, "st_file_attributes") and bool(st.st_file_attributes & 0x400)
        except Exception:
            return False

    def traversal(dir_path, depth=0):
        # Returns (files-by-folder, folders, tasks). `tasks` holds
        # (callable, args) pairs for sub-folders that were not scanned here.
        depth += 1
        files = defaultdict(list)
        folders = []
        tasks = []
        try:
            for entry in os.scandir(dir_path):
                if entry.is_file():
                    files[dir_path].append(entry.path)
                elif entry.is_symlink() or is_junction(entry):
                    continue
                elif entry.is_dir():
                    folders.append(entry.path)
                    if depth > max_python_depth:
                        # The large negative depth keeps the queued call
                        # from queueing further tasks; it recurses fully.
                        tasks.append((traversal, (entry.path, -99999)))
                    else:
                        new_files, new_folders, new_tasks = traversal(entry.path, depth)
                        files.update(new_files)
                        folders.extend(new_folders)
                        tasks.extend(new_tasks)
        except (PermissionError, FileNotFoundError):
            pass
        return (files, folders, tasks)

    def list_folder(folder):
        # Non-recursive listing of one folder: (files-by-folder, sub-folders).
        files, folders = defaultdict(list), []
        try:
            for entry in os.scandir(folder):
                if entry.is_file():
                    files[folder].append(entry.path)
                elif entry.is_symlink() or is_junction(entry):
                    continue
                elif entry.is_dir():
                    folders.append(entry.path)
        except (PermissionError, FileNotFoundError):
            pass
        return files, folders

    def split_into_n_groups(lst, n):
        # Split lst into at most n contiguous, near-equal, non-empty slices.
        avg_size = len(lst) // n
        remainder = len(lst) % n
        sublists = []

        start = 0
        for i in range(n):
            end = start + avg_size + (1 if i < remainder else 0)  # Distribute remainder
            sublists.append(lst[start:end])
            start = end
        sublists = [sublist for sublist in sublists if sublist]
        return sublists

    def get_m_times(folders):
        # Map folder -> modification time; folders that cannot be stat'ed
        # (for example because they were deleted) are left out.
        groups = split_into_n_groups(folders, scan_mtime_worker_count)

        def atom(folders):
            mtimes = {}
            for folder in folders:
                try:
                    mtimes[folder] = os.path.getmtime(folder)
                except:
                    pass
            return mtimes

        tasks = [(atom, (group,)) for group in groups]
        results = zhmiscellany.processing.batch_multiprocess(tasks)
        mtimes = {}
        for i in results:
            mtimes.update(i)
        return mtimes

    # parameters
    scan_mtime_worker_count = 8
    scan_changed_folders_thread_group_count = 64
    fully_update_cache_threshold = 2**10
    cache_compression = False
    # end parameters

    if cache_in_temp:
        cache_folder = tempfile.gettempdir()
    else:
        cache_folder = 'zhmiscellany_cache'
    create_folder(cache_folder)

    cache_id = encode_safe_filename(dir_path)
    cache_file = f'GFI_{cache_id}.pkl'
    cache_file = os.path.join(cache_folder, cache_file)

    if show_timings: zhmiscellany.misc.time_it(None, 'lfrcom')
    if show_timings: zhmiscellany.misc.time_it(None, 'lfrcomt')

    max_python_depth = 1
    if not os.path.exists(cache_file):
        files, folders, tasks = traversal(dir_path)
        if show_timings: zhmiscellany.misc.time_it('initial traversal', 'lfrcom')

        file_groups = zhmiscellany.processing.batch_multiprocess(tasks)
        if show_timings: zhmiscellany.misc.time_it('multiprocessed deep traversal', 'lfrcom')
        for group in file_groups:
            files.update(group[0])
            folders.extend(group[1])
        if show_timings: zhmiscellany.misc.time_it('extending data', 'lfrcom')

        # Track the root too: traversal() only reports sub-folders, so
        # changes made directly in dir_path would otherwise go unnoticed.
        folders.append(dir_path)
        folders = get_m_times(folders)
        save_object_to_file((files, folders), cache_file, compressed=cache_compression)
        if show_timings: zhmiscellany.misc.time_it('creating cache', 'lfrcom')
        return list(chain.from_iterable(files.values()))
    else:
        files, folders = load_object_from_file(cache_file, compressed=cache_compression)
        if show_timings: zhmiscellany.misc.time_it('loading cache', 'lfrcom')

        fl_list = list(folders.keys())
        if dir_path not in folders:
            # Cache written before the root was tracked: check it now.
            fl_list.append(dir_path)
        new_folders = get_m_times(fl_list)
        if show_timings: zhmiscellany.misc.time_it(f'getting m times of {len(fl_list)} folders', 'lfrcom')
        changed_folders = []
        for folder, mtime in new_folders.items():
            # .get(): a root missing from an older cache counts as changed.
            if folders.get(folder) != mtime:
                changed_folders.append(folder)
        random.shuffle(changed_folders)
        if show_timings: zhmiscellany.misc.time_it(f'creating {len(changed_folders)} changed folders', 'lfrcom')

        for i in changed_folders:  # clear files that might not exist
            # it is possible that the only thing that changed in a folder is another folder, so in that case it would not be inside the file dict
            files.pop(i, None)

        for folder in folders:
            # Folders whose mtime could not be read any more (deleted or
            # unreadable) must not keep contributing cached files.
            if folder not in new_folders:
                files.pop(folder, None)

        if show_timings: zhmiscellany.misc.time_it(f'filtering files for changes', 'lfrcom')

        def atom(_folders):
            # Re-list each changed folder and fully scan any sub-folder
            # that the cache did not know about.
            atom_files, atom_folders = defaultdict(list), []
            for _folder in _folders:
                fil, fol = list_folder(_folder)
                atom_files.update(fil)
                for fold in fol:
                    if fold not in new_folders:
                        atom_folders.append(fold)
                        fil, fo, _ = traversal(fold, -99999)
                        atom_files.update(fil)
                        atom_folders.extend(fo)
            return atom_files, atom_folders

        groups = split_into_n_groups(changed_folders, scan_changed_folders_thread_group_count)
        tasks = [(atom, (group,)) for group in groups]
        if not tasks:
            results = []
        else:
            results = zhmiscellany.processing.batch_threading(tasks)
        if show_timings: zhmiscellany.misc.time_it('multithreading processing changed folders', 'lfrcom')

        new_new_folders = []
        for fi, fo in results:
            files.update(fi)
            new_new_folders.extend(fo)

        if len(changed_folders) > fully_update_cache_threshold:
            new_folders.update(get_m_times(new_new_folders))
            # NOTE(review): the next two time_it calls omit the 'lfrcom'
            # argument every other checkpoint passes - confirm intent.
            if show_timings: zhmiscellany.misc.time_it(f'get m times of {len(new_new_folders)} new folders')
            # Written with the same compression flag the cache is read with.
            save_object_to_file((files, new_folders), cache_file, compressed=cache_compression)
            if show_timings: zhmiscellany.misc.time_it(f'writing to cache')

        ret = list(chain.from_iterable(files.values()))
        if show_timings: zhmiscellany.misc.time_it('Everything together', 'lfrcomt')
        return ret
|
|
646
|
+
|
|
647
|
+
|
|
648
|
+
def save_chunk(name, data):
    """
    Store *data* as a new, uniquely named chunk file inside the folder
    *name*, creating the folder when needed.
    """
    import zhmiscellany.string

    create_folder(name)
    unique_part = zhmiscellany.string.get_universally_unique_string()
    save_object_to_file(data, f'{name}/chunk_{unique_part}.pkl')
|
|
653
|
+
|
|
654
|
+
|
|
655
|
+
def load_chunks(name):
    """
    Load every file found in the folder *name* and return the loaded
    objects as a list. The folder is created first if it does not exist.
    """
    create_folder(name)
    return [load_object_from_file(chunk_path) for chunk_path in abs_listdir(name)]
|
|
662
|
+
|
|
663
|
+
|
|
664
|
+
def clear_chunks(name):
    """Remove everything inside the chunk folder *name*; do nothing if it is absent."""
    import os

    if not os.path.exists(name):
        return
    empty_directory(name)
|
|
668
|
+
|
|
669
|
+
|
|
670
|
+
def list_drives():
    """
    Return the drive roots ("A:\\" through "Z:\\") that exist, in
    alphabetical order.
    """
    import os
    import string

    found = []
    for letter in string.ascii_uppercase:
        root = f"{letter}:\\"
        if os.path.exists(root):
            found.append(root)
    return found
|