cdxcore 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cdxcore might be problematic. Click here for more details.
- cdxcore/__init__.py +15 -0
- cdxcore/config.py +1633 -0
- cdxcore/crman.py +105 -0
- cdxcore/deferred.py +220 -0
- cdxcore/dynaplot.py +1155 -0
- cdxcore/filelock.py +430 -0
- cdxcore/jcpool.py +411 -0
- cdxcore/logger.py +319 -0
- cdxcore/np.py +1098 -0
- cdxcore/npio.py +270 -0
- cdxcore/prettydict.py +388 -0
- cdxcore/prettyobject.py +64 -0
- cdxcore/sharedarray.py +285 -0
- cdxcore/subdir.py +2963 -0
- cdxcore/uniquehash.py +970 -0
- cdxcore/util.py +1041 -0
- cdxcore/verbose.py +403 -0
- cdxcore/version.py +402 -0
- cdxcore-0.1.5.dist-info/METADATA +1418 -0
- cdxcore-0.1.5.dist-info/RECORD +30 -0
- cdxcore-0.1.5.dist-info/WHEEL +5 -0
- cdxcore-0.1.5.dist-info/licenses/LICENSE +21 -0
- cdxcore-0.1.5.dist-info/top_level.txt +4 -0
- conda/conda_exists.py +10 -0
- conda/conda_modify_yaml.py +42 -0
- tests/_cdxbasics.py +1086 -0
- tests/test_uniquehash.py +469 -0
- tests/test_util.py +329 -0
- up/git_message.py +7 -0
- up/pip_modify_setup.py +55 -0
cdxcore/uniquehash.py
ADDED
|
@@ -0,0 +1,970 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Basic utilities for Python
|
|
3
|
+
Hans Buehler 2022
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import datetime as datetime
|
|
7
|
+
import types as types
|
|
8
|
+
import hashlib as hashlib
|
|
9
|
+
import inspect as inspect
|
|
10
|
+
from collections.abc import Mapping, Collection, Sequence, Iterator, Callable
|
|
11
|
+
from collections import OrderedDict
|
|
12
|
+
import numpy as np
|
|
13
|
+
import pandas as pd
|
|
14
|
+
import struct as struct
|
|
15
|
+
from .util import isFunction, DEF_FILE_NAME_MAP, fmt_filename
|
|
16
|
+
from .prettyobject import PrettyObject
|
|
17
|
+
|
|
18
|
+
def _qual_name(x, with_mod=False):
|
|
19
|
+
q = getattr(x, '__qualname__', x.__name__)
|
|
20
|
+
if with_mod:
|
|
21
|
+
m = getattr(x, "__module__", None)
|
|
22
|
+
if not m is None:
|
|
23
|
+
q += "@" + m
|
|
24
|
+
return q
|
|
25
|
+
|
|
26
|
+
# =============================================================================
|
|
27
|
+
# Hashing
|
|
28
|
+
# =============================================================================
|
|
29
|
+
|
|
30
|
+
class UniqueHash( object ):
    """
    Object to compute recursively unique hashes with.
    See UniqueHash.__init__ for documentation
    """

    def __init__(self, length : int = 32, *,
                       parse_underscore       : str  = "none",
                       sort_dicts             : bool = True,
                       parse_functions        : bool = False,
                       # micro settings
                       pd_ignore_column_order : bool = True,
                       np_nan_equal           : bool = False,
                       f_include_defaults     : bool = True,
                       f_include_closure      : bool = True,
                       f_include_globals      : bool = True,
                       ):
        """
        Initializes a callable object which iteratively generates hashes of length at most 'length'.
        The algorithm is meant to be mostly hands-off, but there are a few considerations:

        Private and Protected members
        -----------------------------
        When an object is passed to this functional its members are iterated using __dict__ or __slots__, respectively.
        By default this process ignores any fields in objects or dictionaries which start with "_". The idea here is
        that 'functional' parameters are stored as members, but any derived data is stored in protected members.
        This behaviour can be changed with 'parse_underscore'.

        Implementing custom object hashing
        ----------------------------------
        Objects can optionally implement their own hashing scheme by implementing

            __unique_hash__( self, uniqueHash : UniqueHash, debug_trace : DebugTrace )

        The function is passed a clone of this object (which contains all current run time parameters) and,
        as keyword argument 'debug_trace', the current tracing context or None.
        It may return a unique string, or any other non-None Python object which will then be hashed
        recursively. A common use case is to return a tuple of the members of the class which are
        pertinent for hashing.
        Alternatively, '__unique_hash__' may simply be a string member.

        Dictionaries
        ------------
        Since Python 3.7 dictionaries preserve the order in which they were constructed https://mail.python.org/pipermail/python-dev/2017-December/151283.html.
        However, Python semantics otherwise remain order invariant, i.e. {'x':1, 'y':2} tests equal to {'y':2, 'x':1}.
        For this reason the default behaviour for dictionaries is to sort them before hashing their content
        (also recall that objects are typically treated via their __dict__).
        This can be turned off with 'sort_dicts'.
        OrderedDicts are not sorted in any case.

        Functions
        ---------
        By default function members of objects and dictionaries (which include @properties) are
        ignored. You can set 'parse_functions' = True to parse a reduced text of the function code.
        There are a number of expert settings for handling functions, see below.

        Numpy, Pandas
        -------------
        Hashing of large datasets is not advised. Use hashes on the generating parameter set instead.

        Parameters
        ----------
            length : int
                Intended length of the hash. Must be between 2 and 128.
                The hash has 2*(length//2) characters, i.e. odd lengths are rounded down.
            parse_underscore : str
                How to handle object members starting with '_'.
                    * 'none' : ignore members starting with '_' (the default)
                    * 'protected' : ignore 'private' members declared starting with '_' and containing '__'
                    * 'private' : consider all members
            sort_dicts : bool
                From python 3.7 dictionaries are ordered. That means that strictly speaking
                the two dictionaries {'x':1, 'y':2} and {'y':2, 'x':1} are not identical;
                however Python will semantically assume they are as == between the two will return True.
                Accordingly, this function by default sorts first the keys of mappings before
                hashing their items. (If dictionaries are derived from OrderedDict, the function will still process those
                in order.)
                This can be turned off by setting sort_dicts=False.
            parse_functions : bool
                If True, then the function will attempt to generate
                unique hashes for function and property objects
                from their whitespace-reduced source code.

        Fine tuning
        -----------
            pd_ignore_column_order : bool
                Whether to ignore the order of pandas columns. The default is True.
            np_nan_equal : bool
                Whether to ignore the specific type of a NaN. The default is False.
            f_include_defaults : bool
                When parsing functions whether to include default values. Default is True.
            f_include_closure : bool
                When parsing functions whether to include the function closure. This can be expensive. Default is True.
            f_include_globals : bool
                When parsing functions whether to include globals used by the function. This can be expensive. Default is True.

        Raises
        ------
            ValueError if 'length' is out of range or 'parse_underscore' is not one of the supported strings.
        """
        self.length = int(length)

        digest_size = self.length//2
        if digest_size <= 0:
            raise ValueError("'length' must be at least 2")
        if digest_size > 64:
            raise ValueError("'length' can be at most 128 (limitation of 'hashlib.blake2b')")

        self.parse_underscore       = str(parse_underscore)
        self.sort_dicts             = bool(sort_dicts)
        self.parse_functions        = bool(parse_functions)

        self.pd_ignore_column_order = bool(pd_ignore_column_order)
        self.np_nan_equal           = bool(np_nan_equal)

        self.f_include_defaults     = bool(f_include_defaults)
        self.f_include_closure      = bool(f_include_closure)
        self.f_include_globals      = bool(f_include_globals)

        # _pi: 0 = skip all '_' members, 1 = skip only '_...__...' members, 2 = skip nothing
        if parse_underscore == "none":
            self._pi = 0
        elif parse_underscore == "protected":
            self._pi = 1
        else:
            if parse_underscore != "private":
                raise ValueError(f"'parse_underscore' must be 'none', 'private', or 'protected'. Found '{self.parse_underscore}'")
            self._pi = 2

    @property
    def name(self) -> str:
        """ Short descriptive string built from the length and the three main settings """
        return f"uniqueHash({self.length};{self.parse_underscore},{self.sort_dicts},{self.parse_functions})"

    def clone(self):
        """ Return copy of 'self', constructed from all public (non-underscore) attributes """
        return UniqueHash( **{ k:v for k,v in self.__dict__.items() if not k[:1] == "_"} )

    def __call__(self, *args, debug_trace = None, **kwargs):
        """
        Returns a unique hash for the parameters passed to this function.

        Hashing can have curious side effect, in particular when handling objects and functions.
        For this reason this function allows tracing all hashing activity using its debug_trace
        parameter.

        Parameters
        ----------
            args, kwargs:
                Parameters to hash.
            debug_trace : DebugTrace
                Allows tracing of hashing activity.
                Two classes are implemented:
                    DebugTraceVerbose()
                        Simply prints out hashing activity to stdout.
                    DebugTraceCollect()
                        Collects an array of tracing information.
                        The object itself is an iterable which contains
                        the respective tracing information.

        Returns
        -------
            String of at most length 'length'
        """
        h, _ = self._mk_blake( h=self.length//2 )
        if len(args) > 0:
            self._hash_any( h, args, debug_trace = debug_trace )
        if len(kwargs) > 0:
            self._hash_any( h, kwargs, debug_trace = debug_trace )
        return h.hexdigest()

    # Utility functions
    # -----------------

    @staticmethod
    def _mk_blake( h ):
        """
        Utility function to allow passing a hash 'h' or an 'int'.

        Parameters
        ----------
            h :
                Either an existing hash object, which is returned as is, or an int
                giving the digest size in bytes (1 to 64) of a new hash object.

        Returns
        -------
            Tuple (hash, created) where 'created' is True if a new hash object was created.

        Raises
        ------
            ValueError if 'h' is an int outside 1,...,64.
        """
        if not isinstance(h, int):
            return h, False
        h = int(h)
        # 'h' is a digest size in bytes; each byte yields two hex characters
        if h < 1 or h > 64:
            raise ValueError(f"'h' must be a digest size between 1 and 64; found {h}")
        # blake2s supports digests of up to 32 bytes, blake2b of up to 64 bytes
        h = hashlib.blake2b( digest_size=h ) if h > 16 else hashlib.blake2s( digest_size=h )
        return h, True

    def _skip_key( self, k ) -> bool:
        """ Whether member or key 'k' is excluded from hashing by the 'parse_underscore' setting """
        if not isinstance(k, str) or k[:1] != '_':
            return False
        if self._pi == 0:
            return True
        return self._pi == 1 and k.find("__") != -1

    def _hash_any(self, h, x, *, debug_trace = None ):
        """
        Recursive function to hash "any" object.

        Values are fed to 'h' back-to-back without type or length tags, so for example
        the tuples ('ab','c') and ('a','bc') produce the same hash.

        Parameters
        ----------
            h : hash
                Hashlib algorithm; updated in place.
            x : any
                Value to hash.
            debug_trace :
                Optional DebugTrace object to debug uniqueHash calculations.

        Raises
        ------
            TypeError if 'x' (or one of its elements) cannot be hashed.
        """
        if x is None:
            h.update(b'\x00')
            return
        # numpy atomic
        if isinstance(x, np.generic):
            # for the common item sizes hash (and trace) an integer view of the scalar;
            # any other size (e.g. complex128, longdouble) is hashed from its raw bytes
            int_view = {1: np.int8, 2: np.int16, 4: np.int32, 8: np.int64}.get(x.itemsize)
            if int_view is not None:
                x = x.view(int_view)
            h.update(x.tobytes())
            if debug_trace is not None: debug_trace.update( x )
            return
        # basic elements
        if isinstance( x, bool ):
            h.update( x.to_bytes(1,'little', signed=True) )
            if debug_trace is not None: debug_trace.update( x )
            return
        if isinstance( x, int ):
            try:
                raw = x.to_bytes(8,'little', signed=True)
            except OverflowError:
                # Python ints are unbounded: use as many bytes as required for the sign bit and value
                raw = x.to_bytes((x.bit_length() + 8)//8,'little', signed=True)
            h.update( raw )
            if debug_trace is not None: debug_trace.update( x )
            return
        if isinstance( x, float ):
            h.update( struct.pack('<d', x) )  # little-endian double
            if debug_trace is not None: debug_trace.update( x )
            return
        if isinstance( x, complex ):
            # struct cannot pack a complex number directly: pack real and imaginary part
            h.update( struct.pack('<dd', x.real, x.imag) )
            if debug_trace is not None: debug_trace.update( x )
            return
        if isinstance( x, bytes ):
            h.update( x )
            if debug_trace is not None: debug_trace.update( x )
            return
        if isinstance( x, str ):
            h.update( x.encode('utf-8') )
            if debug_trace is not None: debug_trace.update( x )
            return
        # datetime etc
        # datetime.datetime derives from datetime.date and must therefore be tested first
        if isinstance(x,datetime.datetime):
            if debug_trace is not None: debug_trace = debug_trace.update_topic( x )
            ts = float( x.timestamp() )
            td = x.utcoffset()   # None for naive datetimes
            self._hash_any(h, ts, debug_trace=debug_trace)
            if td is not None:
                self._hash_any(h, td.total_seconds(), debug_trace=debug_trace)
            return
        if isinstance(x,datetime.time):
            # tzinfo of datetime.time objects is ignored; use datetime.datetime for time zone aware hashing
            if debug_trace is not None: debug_trace = debug_trace.update_topic( x )
            total_seconds = float(x.hour*60*60+x.minute*60+x.second) +\
                            float(x.microsecond) / 1000000.
            self._hash_any(h, total_seconds, debug_trace=debug_trace)
            return
        if isinstance(x,datetime.date):
            if debug_trace is not None: debug_trace = debug_trace.update_topic( x )
            full = x.year * 10000 + x.month * 100 + x.day
            self._hash_any(h, full, debug_trace=debug_trace)
            return
        if isinstance(x,datetime.timedelta):
            if debug_trace is not None: debug_trace = debug_trace.update_topic( x )
            self._hash_any(h, x.total_seconds(), debug_trace=debug_trace )
            return
        # functions
        if isFunction(x) or isinstance(x,property):
            if self.parse_functions:
                self._hash_function( h, x, debug_trace=debug_trace )
            elif debug_trace is not None:
                # not every callable or property provides '__qualname__'
                fname = getattr(x, "__qualname__", None) or type(x).__qualname__
                debug_trace.warning( f"Ignored function: {fname}")
            return
        # slice -> tuple
        if isinstance(x,slice):
            if debug_trace is not None: debug_trace = debug_trace.update_topic( x )
            self._hash_any(h, (x.start,x.stop,x.step), debug_trace=debug_trace )
            return
        # test presence of __unique_hash__()
        # objects can now simply set this member to a string
        if hasattr(x,"__unique_hash__"):
            unique_hash = x.__unique_hash__
            if isinstance(unique_hash, str):
                h.update(unique_hash.encode('utf-8') )
                if debug_trace is not None:
                    debug_trace = debug_trace.update_topic( x, msg="__unique_hash__ str" )
                    debug_trace.update( unique_hash )
                return
            debug_trace = None if debug_trace is None else debug_trace.update_topic( x, msg="__unique_hash__ function" )
            try:
                unique_hash = unique_hash( self.clone(), debug_trace=debug_trace )
            except Exception as e:
                raise type(e)( e, f"Exception encountered while calling '__unique_hash__' of object of type {type(x)}.") from e
            if unique_hash is None:
                raise TypeError(f"{type(x).__qualname__}: __unique_hash__() cannot return None")
            if isinstance(unique_hash, str):
                h.update(unique_hash.encode('utf-8') )
                if debug_trace is not None:
                    debug_trace.update( unique_hash )
            else:
                if debug_trace is not None:
                    debug_trace = debug_trace.update_topic( unique_hash )
                self._hash_any(h, unique_hash, debug_trace=debug_trace )
            return
        # numpy
        if isinstance(x,np.ndarray):
            self._hash_numpy(h, x, debug_trace=debug_trace )
            return
        # pandas
        if isinstance(x,pd.DataFrame):
            self._hash_dataFrame(h, x, debug_trace=debug_trace )
            return
        # dictionaries, and similar
        # note that objects with a __dict__ will
        # be hashed using that dictionary
        if isinstance(x,Mapping):
            assert not isinstance(x, Sequence)
            # from Python 3.7 onwards, dictionaries are ordered.
            # however, we assume here that unless they are
            # specified as ordered, the order does not matter.
            if debug_trace is not None: debug_trace = debug_trace.update_topic( x )
            keys = sorted(x) if self.sort_dicts and not isinstance(x,OrderedDict) else list(x)
            for k in keys:
                if self._skip_key(k):
                    continue
                self._hash_any(h, (k, x[k]), debug_trace=debug_trace)
            return
        # lists, tuples and everything which looks like it --> lists
        if isinstance(x, (Sequence, Iterator)):
            assert not isinstance(x, dict)
            if debug_trace is not None: debug_trace = debug_trace.update_topic( x )
            for k in x:
                self._hash_any(h, k, debug_trace=debug_trace)
            return
        # all others such as sets need sorting first
        if isinstance(x, Collection):
            assert not isinstance(x, dict)
            if debug_trace is not None: debug_trace = debug_trace.update_topic( x )
            x = sorted(x)
            for k in x:
                self._hash_any(h, k, debug_trace=debug_trace)
            return
        # objects: treat like dictionaries
        if hasattr(x,"__dict__"):
            # 1) from python 3.7 onwards dictionaries are ordered.
            #    however, except in rare cases that order should not
            #    impede the equivalence of objects
            # 2) private member handling in Python is subject to name space mangling which can have curious effects.
            #    That's why we consider private any members starting with '_' and containing '__':
            #        class A(object):
            #            def f(self):
            #                class X(object):
            #                    pass
            #                x = X()
            #                x.__p = 1
            #                print(x.__dict__)
            #    A().f() will print '{'_A__p': 1}' even though 'x.__p' is a private member to X.
            if debug_trace is not None: debug_trace = debug_trace.update_topic( x, "object with __dict__" )
            self._hash_any( h, _qual_name( type(x),False), debug_trace=debug_trace)
            x = x.__dict__
            keys = sorted(x) if self.sort_dicts else list(x)
            for k in keys:
                if self._skip_key(k):
                    continue
                self._hash_any(h, k, debug_trace=debug_trace)
                self._hash_any(h, x[k], debug_trace=debug_trace)
            return
        if hasattr(x,"__slots__"):
            if debug_trace is not None: debug_trace = debug_trace.update_topic( x, "object with __slots__" )
            self._hash_any( h, _qual_name( type(x),False), debug_trace=debug_trace)
            slots = x.__slots__
            if isinstance(slots, str):
                # '__slots__' may be declared as a single string naming one slot
                slots = (slots,)
            for k in slots:
                if self._skip_key(k):
                    continue
                self._hash_any(h, k, debug_trace=debug_trace)
                self._hash_any(h, getattr(x,k), debug_trace=debug_trace)
            return
        raise TypeError(f"Cannot generate unique hash for type '{_qual_name(type(x),True)}': it does not have __dict__ or __slots__")

    def _hash_function( self, h, fn : Callable, *, debug_trace = None ):
        """
        Hash a function, property, or callable object.

        Python functions are hashed by their qualified name and their source code with all
        whitespace removed, plus, depending on the 'f_include_*' settings, their default values,
        closure contents and the globals referenced by name in their code.
        Builtin functions are hashed by their qualified name only.
        Properties are hashed via their getter, setter and deleter functions.
        Callable objects are hashed by their type name and their '__call__' function.

        Parameters
        ----------
            h : hash
                Hashlib algorithm; updated in place.
            fn :
                Function, property or callable object.
            debug_trace :
                Optional DebugTrace object to debug uniqueHash calculations.

        Raises
        ------
            TypeError if 'fn' cannot be hashed as a function.
            Any exception raised by inspect.getsourcelines() if the source code of 'fn' is not available.
        """
        if isinstance(fn, property):
            # a property is not callable itself: hash its accessor functions
            if debug_trace is not None: debug_trace = debug_trace.update_topic( fn, "property" )
            for accessor in (fn.fget, fn.fset, fn.fdel):
                if accessor is not None:
                    self._hash_function( h, accessor, debug_trace=debug_trace )
            return

        fn = inspect.unwrap(getattr(fn, "__func__", fn))
        if inspect.isbuiltin(fn):
            # Builtins: best we can do is identity by qualname
            ident = _qual_name(fn,False)
            h.update( ident.encode("utf-8") )
            if debug_trace is not None: debug_trace.update( ident, "builtin function" )
            return

        if not inspect.isfunction(fn):
            if hasattr(fn, "__call__"):
                call = fn.__call__
                if isinstance(call, types.MethodWrapperType):
                    # '__call__' is implemented in C: there is no Python function to descend into,
                    # and the '__call__' of a method wrapper is again a method wrapper
                    raise TypeError(f"'fn' is not a function but of type {type(fn)}, and its '__call__' is not a Python function.")
                obj_name = _qual_name(type(fn),False)
                h.update( obj_name.encode("utf-8") )
                if debug_trace is not None:
                    debug_trace = debug_trace.update_topic( fn, "using __call__" )
                    debug_trace.update( obj_name )
                return self._hash_function(h, call, debug_trace = debug_trace )
            raise TypeError(f"'fn' is not a function but of type {type(fn)}.")

        debug_trace = None if debug_trace is None else debug_trace.update_topic( fn )
        func_name = _qual_name(fn,False)
        self._hash_any( h, func_name )

        src = inspect.getsourcelines( fn )[0]
        if isinstance(fn,types.LambdaType) and fn.__name__ == "<lambda>":
            assert len(src) > 0, "No source code ??"
            # drop everything up to and including the 'lambda' keyword, e.g. the name
            # of the variable a lambda is assigned to
            first = src[0]
            i = first.lower().find("lambda ")
            if i == -1:
                # lambda without arguments
                i = first.lower().find("lambda:")
            if i != -1:
                # the blank following 'lambda', if any, is removed below with all other whitespace
                src[0] = first[i+len("lambda"):]
            # otherwise the keyword is not on the first source line: hash the source as is
        # Compressed version of the code of the function where all blanks are removed
        src = [ line.replace("\t"," ").replace(" ","").replace("\n","") for line in src ]
        self._hash_any( h, src )
        if debug_trace is not None:
            debug_trace.update( func_name )
            debug_trace.update( src, "reduced source code")
        del src, func_name

        if self.f_include_defaults:
            # Defaults
            if fn.__defaults__ is not None and len(fn.__defaults__) > 0:
                def_debug_trace = None if debug_trace is None else debug_trace.update_topic( fn.__defaults__, "position defaults")
                self._hash_any( h, fn.__defaults__, debug_trace = def_debug_trace )
                del def_debug_trace

            if fn.__kwdefaults__ is not None and len(fn.__kwdefaults__) > 0:
                def_debug_trace = None if debug_trace is None else debug_trace.update_topic(fn.__kwdefaults__, "keyword defauls")
                self._hash_any( h, fn.__kwdefaults__, debug_trace = def_debug_trace )
                del def_debug_trace

        if self.f_include_closure and fn.__closure__ is not None and len(fn.__closure__) > 0:
            # Closure cells (can be large; disable if that is a concern)
            closure_debug_trace = None if debug_trace is None else debug_trace.update_topic( fn.__closure__, "closure" )
            for cell in fn.__closure__:
                self._hash_any( h, cell.cell_contents, debug_trace=closure_debug_trace )
            del closure_debug_trace

        if self.f_include_globals and len(fn.__globals__) > 0 and len(fn.__code__.co_names) > 0:
            # Referenced globals (names actually used by the code)
            g = fn.__globals__
            glb_debug_trace = None if debug_trace is None else debug_trace.update_topic( fn.__code__.co_names, "linked globals" )
            for name in sorted(fn.__code__.co_names):
                if name in g:
                    value = g[name]
                    if value is fn:
                        # a function calling itself: hashing it again would never terminate.
                        # NOTE: mutually recursive functions are not detected here.
                        self._hash_any( h, name, debug_trace=glb_debug_trace )
                    else:
                        self._hash_any( h, (name, value), debug_trace=glb_debug_trace )
            del glb_debug_trace
            del g

    def _hash_dataFrame( self, h, df : pd.DataFrame, *, debug_trace = None ):
        """
        Update 'h' with the content of a dataframe, c.f. https://stackoverflow.com/questions/49883236/how-to-generate-a-hash-or-checksum-value-on-python-dataframe-created-from-a-fix

        Hashes the index, then for each column its name, dtype and values, and finally 'df.attrs'.
        If 'pd_ignore_column_order' is set, columns are processed sorted by name.

        Parameters
        ----------
            h : hash
                Hashlib algorithm; updated in place.
            df : pd.DataFrame
                DataFrame to hash.
            debug_trace :
                Optional DebugTrace object to debug uniqueHash calculations.
        """
        assert isinstance(df, pd.DataFrame), ("DataFrame expected", type(df))
        debug_trace = None if debug_trace is None else debug_trace.update_topic( df )
        if self.pd_ignore_column_order:
            df = df.reindex(sorted(df.columns), axis=1)

        # hash index
        idx_h = pd.util.hash_pandas_object(df.index, index=False, categorize=True).values
        h.update(idx_h.tobytes())
        if debug_trace is not None: debug_trace.update( idx_h )

        # hash each column's content + its name + dtype
        for name, col in df.items():
            h.update(str(name).encode('utf-8'))
            h.update(str(col.dtype).encode('utf-8'))
            col_h = pd.util.hash_pandas_object(col, index=False, categorize=True).values
            h.update(col_h.tobytes())
            if debug_trace is not None:
                debug_trace.update( str(name) )
                debug_trace.update( str(col.dtype) )
                debug_trace.update( col_h )

        # attrs, if any
        attrs = getattr(df, "attrs", None)
        if attrs is not None:
            self._hash_any(h, attrs)
            if debug_trace is not None: debug_trace.update( attrs, "attrs" )

    def _hash_numpy( self, h, a : np.ndarray, *, debug_trace = None ):
        """
        Update 'h' with the number of dimensions, shape, dtype string and raw content of a numpy array.

        Data is normalized to little-endian byte order before hashing.
        Datetime and timedelta arrays are hashed via their int64 representation.
        If 'np_nan_equal' is set, then all NaNs of float and complex arrays are replaced by one
        canonical quiet NaN so that arrays which differ only in their NaN bit patterns hash the same.

        Parameters
        ----------
            h : hash
                Hashlib algorithm; updated in place.
            a : np.ndarray
                Array to hash. Arrays of dtype 'object' are not supported.
            debug_trace :
                Optional DebugTrace object to debug uniqueHash calculations.

        Raises
        ------
            TypeError for arrays of dtype 'object', or, if 'np_nan_equal' is set, for floating
            point types which are not based on 2, 4 or 8 byte floats.
        """
        assert isinstance(a, np.ndarray), ("ndarray expected", type(a))
        a = np.asarray(a)

        debug_trace = None if debug_trace is None else debug_trace.update_topic( a )
        # Disallow arbitrary Python objects (define your own encoding first)
        if a.dtype.kind == 'O':
            raise TypeError("object-dtype array: map elements to bytes first (e.g., via str/utf-8).")

        # Datetime/timedelta: hash their int64 representation.
        # Keep the byte order of the data so that it is normalized correctly below.
        if a.dtype.kind in 'Mm':
            a = a.view(np.dtype(np.int64).newbyteorder(a.dtype.byteorder))

        # Normalize to little-endian for numeric types.
        # (ndarray.newbyteorder() was removed in numpy 2.0; view with the swapped dtype instead.)
        if a.dtype.byteorder == '>' or (a.dtype.byteorder == '=' and not np.little_endian):
            a = a.byteswap().view(a.dtype.newbyteorder())

        # Canonicalize NaN bits so all NaNs hash the same (float16/32/64, complex64/128)
        if self.np_nan_equal and a.dtype.kind in 'fc':
            base_bytes = (a.dtype.itemsize // (2 if a.dtype.kind == 'c' else 1))
            # little-endian float type, matching unsigned integer type, and quiet NaN bit pattern
            layout = { 2: ('<f2', '<u2', 0x7e00),
                       4: ('<f4', '<u4', 0x7fc00000),
                       8: ('<f8', '<u8', 0x7ff8000000000000) }.get(base_bytes)
            if layout is None:
                raise TypeError(f"Cannot canonicalize NaNs for dtype '{a.dtype}': unsupported item size {base_bytes}")
            if np.isnan(a).any():
                float_type, uint_type, qnan_bits = layout
                shape = a.shape
                # private, contiguous, flat copy: this supports non-contiguous and 0-dimensional
                # input, and the caller's data is not modified
                flat  = np.array(a, order='C', copy=True).reshape(-1)
                parts = flat.view(float_type)   # complex numbers become (real, imag) pairs
                parts.view(uint_type)[np.isnan(parts)] = qnan_bits
                a = flat.reshape(shape)

        raw = a.tobytes()
        # shapes
        h.update( len(a.shape).to_bytes(4,'little', signed=False) )
        for i in a.shape:
            h.update( i.to_bytes(4,'little', signed=False) )
        # dtype
        h.update(a.dtype.str.encode('utf-8'))
        h.update(raw)
        if debug_trace is not None:
            debug_trace.update( a.shape )
            debug_trace.update( a.dtype.str )
            debug_trace.update( raw )
|
|
604
|
+
|
|
605
|
+
# Debugging
|
|
606
|
+
# =========
|
|
607
|
+
|
|
608
|
+
class DebugTrace(object):
    """
    Interface for objects which receive notifications about hashing activity.
    Every method raises NotImplementedError and has to be provided by derived classes.
    """

    def update( self, x, msg : str = None ):
        """ Report that 'x' was processed; 'msg' is an optional description of the step """
        raise NotImplementedError

    def update_topic( self, x, msg : str = None ):
        """ Report that 'x' was processed and return a trace object for its sub-elements """
        raise NotImplementedError

    def warning( self, msg : str):
        """ Report the warning 'msg' """
        raise NotImplementedError
|
|
618
|
+
|
|
619
|
+
class DebugTraceCollect(DebugTrace):
    """
    Trace which records every notification in the list 'self.trace'.
    Sub-topics are recorded in nested DebugTraceCollect objects.
    """

    def __init__(self, tostr : int = None ):
        """
        Set up an empty trace.

        Parameters
        ----------
            tostr : int
                If set, record the type of each object and the first 'tostr' characters of
                its repr() instead of the object itself.
        """
        if tostr and tostr<=0:
            raise ValueError("'tostr' must be None or a positive integer")
        self.tostr = tostr
        self.trace = []

    def _update( self, x, msg, child ):
        """ Append a record for 'x' with message 'msg' and optional sub-trace 'child' """
        if self.tostr:
            fields = dict( typex = type(x), reprx = repr(x)[:self.tostr] )
        else:
            fields = dict( x = x )
        fields['msg']   = msg
        fields['child'] = child
        self.trace.append( PrettyObject( **fields ) )

    def update( self, x, msg : str = None ):
        """ Record that 'x' was processed; 'msg' is an optional description of the step """
        self._update( x, msg, None )

    def update_topic( self, x, msg : str = None ):
        """ Record that 'x' was processed and return a new trace for its sub-elements """
        sub_trace = DebugTraceCollect(tostr=self.tostr)
        self._update( x, msg, sub_trace )
        return sub_trace

    def warning( self, msg : str):
        """ Record the warning 'msg' """
        self._update( None, msg, None )

    # results
    # -------

    def __getitem__(self, item):
        """ Access recorded entries by index or slice """
        return self.trace[item]

    def __len__(self):
        """ Number of recorded entries """
        return len(self.trace)

    def __iter__(self):
        """ Iterate over the recorded entries """
        yield from self.trace

    def __str__(self):
        return repr(self.trace)

    def __repr__(self):
        return f"DebugTraceCollect({str(self.trace)})"
|
|
666
|
+
|
|
667
|
+
class DebugTraceVerbose(DebugTrace):
    """
    Live printing of tracing information with cdxcore.verbose.Context
    for some formatting. All objects will be reported by type and
    their string representation, sufficiently reduced if necessary.
    """

    def __init__(self, strsize : int = 50, verbose = None ):
        """
        Initialize tracer.

        Parameters
        ----------
            strsize : int
                Maximum string size when using repr() on reported objects.
                Must exceed 3 as longer strings are cut and terminated with '...'.
            verbose :
                Context object or None for a new context object.

        Raises
        ------
            ValueError if 'strsize' does not exceed 3.
        """
        # the exception must be raised, not merely constructed, for the validation to have any effect
        if strsize<=3: raise ValueError("'strsize' must exceed 3")
        from .verbose import Context
        self.strsize = strsize
        self.verbose = Context("all") if verbose is None else verbose

    def update( self, x, msg : str = None ):
        """ Notify processing of 'x', with an optional process 'msg' """
        xstr = repr(x)
        # drop the quotes repr() puts around strings; the output below adds its own
        if xstr[:1] == "'" and xstr[-1] == "'":
            xstr = xstr[1:-1]
        # shorten to at most 'strsize' characters including the trailing dots
        if len(xstr) > self.strsize:
            xstr = xstr[:self.strsize-3] + "..."
        prefix = f"{msg} " if msg else ""
        self.verbose.write( f"{prefix}{type(x).__name__}: '{xstr}'" )

    def update_topic( self, x, msg : str = None ):
        """ Notify and return a sub-trace context which writes to the sub-context self.verbose(1) """
        self.update( x, msg )
        return DebugTraceVerbose( self.strsize, self.verbose(1) )

    def warning( self, msg : str):
        """ Issue warning """
        self.verbose.write( msg )
|
|
706
|
+
|
|
707
|
+
# =============================================================================
|
|
708
|
+
# Utility wrappers
|
|
709
|
+
# =============================================================================
|
|
710
|
+
|
|
711
|
+
def uniqueHashExt(**parameters) -> UniqueHash:
    # Plain factory: all keyword arguments are handed to UniqueHash unchanged.
    hasher = UniqueHash(**parameters)
    return hasher

# the factory takes the same parameters as UniqueHash.__init__, hence it shares its documentation
uniqueHashExt.__doc__ = UniqueHash.__init__.__doc__
|
|
714
|
+
|
|
715
|
+
def namedUniqueHashExt( max_length : int = 60,
                        id_length : int = 16, *,
                        separator : str = ' ',
                        filename_by : str = None,
                        **unique_hash_arguments
                        ):
    """
    Returns a function

        f( label, *args, **kwargs )

    which generates unique strings of at most a length of max_length of the format
        label + separator + ID
    where ID has length id_length.

    The maximum length of the returned string is 'max_length'.
    If max_length does not exceed id_length+len(separator) then the function just returns the ID of length max_length.

    This function does not suppose that 'label' is unique, hence the ID is prioritized.
    See uniqueLabelExt() for a function which assumes the label is unique.

    The function optionally makes sure that the returned string is a valid file name using cdxcore.util.fmt_filename.

    Important
    ---------
    It is strongly recommended to read the documentation for UniqueHash.__init__() for details on hashing logic
    and the available parameters

    Parameters
    ----------
        max_length : int
            Total length of the returned string including the ID.
            Defaults to 60 to allow file names with extensions with three letters.
        id_length : int
            Intended length of the hash function, default 16
        separator : str
            Separator between label and ID.
            Note that the separator will be included in the ID calculation, hence different separators
            lead to different IDs.
        filename_by : str, None
            If not None, use fmt_filename( *, by=filename_by ) to ensure the returned string is a valid
            filename for both windows and linux, of at most 'max_length' size.
            If set to the string "default", use DEF_FILE_NAME_MAP as the default mapping of cdxcore.util.fmt_filename
        **unique_hash_arguments:
            Parameters passed to UniqueHash.__init__. Must not contain 'length'.

    Returns
    -------
        hash function with signature (label, *args, **kwargs).
        All arguments including label and separator will be used to generate the hash key.

    Raises
    ------
        ValueError if 'id_length' is less than 4 or exceeds 'max_length', or if 'length' is
        passed as part of 'unique_hash_arguments'.
    """
    if id_length < 4: raise ValueError(f"'id_length' must be at least 4. Found {id_length}")
    if id_length > max_length: raise ValueError(f"'max_length' must not be less than 'id_length'. Found {max_length} and {id_length}, respectively")
    if 'length' in unique_hash_arguments: raise ValueError("Cannot specify 'length' here. Use 'id_length' and 'max_length'")
    if filename_by == "default":
        filename_by = DEF_FILE_NAME_MAP
    fseparator = fmt_filename( separator, by=filename_by ) if filename_by is not None else separator

    # space left for the label once ID and separator are accounted for
    label_length = max_length-id_length-len(fseparator)
    if label_length<=0:
        # no room for a label: the whole string is made up of the ID
        id_length    = max_length
        label_length = 0
    unique_hash = UniqueHash( length=id_length, **unique_hash_arguments )

    def named_unique_hash(label, *args, **kwargs) -> str:
        """ Returns label + separator + ID, or just the ID if there is no room for a label """
        if label_length>0:
            assert label is not None, ("'label' cannot be None", args, kwargs)
            label     = fmt_filename( label, by=filename_by ) if filename_by is not None else label
            # the ID is computed from the (converted) label and the original separator
            base_hash = unique_hash( label, separator, *args, **kwargs )
            label     = label[:label_length] + fseparator + base_hash
        else:
            label     = unique_hash( separator, *args, **kwargs )  # using 'separator' here to allow distinction at that level
        return label
    return named_unique_hash
|
|
788
|
+
|
|
789
|
+
def uniqueLabelExt( max_length : int = 60,
                    id_length : int = 8,
                    separator : str = ' ',
                    filename_by : str = None ):
    """
    Returns a function

        f( unique_label )

    which generates strings of at most max_length of the format:
        If len(unique_label) <= max_length:
            unique_label
        else:
            unique_label + separator + ID
    where the ID is of maximum length 'id_length'.

    This function assumes that 'unique_label' is unique, hence the ID is dropped if 'unique_label' is not empty and
    does not exceed 'max_length'.
    See namedUniqueHashExt() for a function does not assume the label is unique, hence the ID is always appended.
    Note that if file name conversion is used, then this function will always attach the unique ID to the filename because
    the reduction of the label to a filename is no longer guaranteed to be unique. If your label is unique as a filename, do not
    use 'filename_by'. The function will return valid file names if label is a valid file name.

    If separator and ID together do not fit into 'max_length', or if the label is empty, only the ID is returned.

    Parameters
    ----------
        max_length : int
            Total length of the returned string including the ID.
            Defaults to 60 to allow file names with extensions with three letters.
        id_length : int
            Indicative length of the hash function, default 8.
            Must be at least 4 and must not exceed max_length.
        separator : str
            Separator between label and ID.
            Note that the separator will be included in the ID calculation, hence different separators
            lead to different IDs.
        filename_by : str, None
            If not None, use fmt_filename( *, by=filename_by ) to ensure the returned string is a valid
            filename for both windows and linux, of at most 'max_length' size.
            If set to the string "default", use DEF_FILE_NAME_MAP as the default mapping of cdxcore.util.fmt_filename

    Returns
    -------
        hash function with signature (unique_label).

    Raises
    ------
        ValueError if 'id_length' is less than 4 or exceeds 'max_length'.
    """
    if id_length < 4: raise ValueError(f"'id_length' must be at least 4. Found {id_length}")
    if id_length > max_length: raise ValueError(f"'max_length' must not be less than 'id_length'. Found {max_length} and {id_length}, respectively")

    if filename_by == "default":
        filename_by = DEF_FILE_NAME_MAP
    fseparator = fmt_filename( separator, by=filename_by ) if filename_by is not None else separator

    # Note: no further clamping of 'id_length' is required. The validation above guarantees
    # id_length <= max_length; the case that separator and ID together do not fit is handled
    # in unique_label_hash() below.
    unique_hash = UniqueHash( length=id_length )

    def unique_label_hash(label) -> str:
        """ Returns 'label' itself if short enough and no filename conversion is used, otherwise label + separator + ID """
        if filename_by is None and 0 < len(label) <= max_length:
            # no filename conversion and label is short enough --> use this name
            return label

        # the ID is computed from the original label and the original separator
        base_hash  = unique_hash( label, separator )
        label_hash = fseparator + base_hash
        if len(label_hash) >= max_length or len(label) == 0:
            # hash and separator exceed total length. Note that len(base_hash) <= max_length
            return base_hash

        # convert label to filename, then cut it such that separator and ID still fit
        label = fmt_filename( label, by=filename_by ) if filename_by is not None else label
        return label[:max_length-len(label_hash)] + label_hash
    return unique_label_hash
|
|
859
|
+
|
|
860
|
+
# =============================================================================
|
|
861
|
+
# Short cuts
|
|
862
|
+
# =============================================================================
|
|
863
|
+
|
|
864
|
+
def uniqueHash8( *args, **argv ) -> str:
    """
    Return a unique ID of length 8 computed from all provided arguments.

    Hashing rules:
        1) Objects are fed to the hash algorithm via repr();
           floats are therefore only distinguished up to str conversion precision.
        2) Keys of dictionaries, and sets, are sorted to ensure equality of hashes
           across different memory setups of strings.
        3) Members with leading '_' are ignored (*)
        4) Functions and properties are ignored (*)
    (*) use uniqueHashExt() to create a hash function with different behaviour.

    To support hashing directly in one of your objects, implement

        __unique_hash__( length : int, parse_functions : bool, parse_underscore : str )

    whose parameters are the same as for uniqueHashExt.
    That function is expected to return a hashable object, ideally a string.
    """
    hasher = UniqueHash(8)
    return hasher(*args, **argv)
|
|
885
|
+
|
|
886
|
+
def uniqueHash16( *args, **argv ) -> str:
    """
    Return a unique ID of length 16 computed from all provided arguments.

    Hashing rules:
        1) Objects are fed to the hash algorithm via repr();
           floats are therefore only distinguished up to str conversion precision.
        2) Keys of dictionaries, and sets, are sorted to ensure equality of hashes
           across different memory setups of strings.
        3) Members with leading '_' are ignored (*)
        4) Functions and properties are ignored (*)
    (*) use uniqueHashExt() to create a hash function with different behaviour.

    To support hashing directly in one of your objects, implement

        __unique_hash__( length : int, parse_functions : bool, parse_underscore : str )

    whose parameters are the same as for uniqueHashExt.
    That function is expected to return a hashable object, ideally a string.
    """
    hasher = UniqueHash(16)
    return hasher(*args, **argv)
|
|
906
|
+
|
|
907
|
+
def uniqueHash32( *args, **argv ) -> str:
    """
    Return a unique ID of length 32 computed from all provided arguments.

    Hashing rules:
        1) Objects are fed to the hash algorithm via repr();
           floats are therefore only distinguished up to str conversion precision.
        2) Keys of dictionaries, and sets, are sorted to ensure equality of hashes
           across different memory setups of strings.
        3) Members with leading '_' are ignored (*)
        4) Functions and properties are ignored (*)
    (*) use uniqueHashExt() to create a hash function with different behaviour.

    To support hashing directly in one of your objects, implement

        __unique_hash__( length : int, parse_functions : bool, parse_underscore : str )

    whose parameters are the same as for uniqueHashExt.
    That function is expected to return a hashable object, ideally a string.
    """
    hasher = UniqueHash(32)
    return hasher(*args, **argv)

# 'uniqueHash' is an alias for the version producing IDs of length 32
uniqueHash = uniqueHash32
|
|
929
|
+
|
|
930
|
+
def uniqueHash48( *args, **argv ) -> str:
    """
    Return a unique ID of length 48 computed from all provided arguments.

    Hashing rules:
        1) Objects are fed to the hash algorithm via repr();
           floats are therefore only distinguished up to str conversion precision.
        2) Keys of dictionaries, and sets, are sorted to ensure equality of hashes
           across different memory setups of strings.
        3) Members with leading '_' are ignored (*)
        4) Functions and properties are ignored (*)
    (*) use uniqueHashExt() to create a hash function with different behaviour.

    To support hashing directly in one of your objects, implement

        __unique_hash__( length : int, parse_functions : bool, parse_underscore : str )

    whose parameters are the same as for uniqueHashExt.
    That function is expected to return a hashable object, ideally a string.
    """
    hasher = UniqueHash(48)
    return hasher(*args, **argv)
|
|
950
|
+
|
|
951
|
+
def uniqueHash64( *args, **argv ) -> str:
    """
    Return a unique ID of length 64 computed from all provided arguments.

    Hashing rules:
        1) Objects are fed to the hash algorithm via repr();
           floats are therefore only distinguished up to str conversion precision.
        2) Keys of dictionaries, and sets, are sorted to ensure equality of hashes
           across different memory setups of strings.
        3) Members with leading '_' are ignored (*)
        4) Functions and properties are ignored (*)
    (*) use uniqueHashExt() to create a hash function with different behaviour.

    To support hashing directly in one of your objects, implement

        __unique_hash__( length : int, parse_functions : bool, parse_underscore : str )

    whose parameters are the same as for uniqueHashExt.
    That function is expected to return a hashable object, ideally a string.
    """
    hasher = UniqueHash(64)
    return hasher(*args, **argv)
|