cdxcore 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cdxcore might be problematic. Click here for more details.

cdxcore/uniquehash.py ADDED
@@ -0,0 +1,970 @@
1
+ """
2
+ Basic utilities for Python
3
+ Hans Buehler 2022
4
+ """
5
+
6
+ import datetime as datetime
7
+ import types as types
8
+ import hashlib as hashlib
9
+ import inspect as inspect
10
+ from collections.abc import Mapping, Collection, Sequence, Iterator, Callable
11
+ from collections import OrderedDict
12
+ import numpy as np
13
+ import pandas as pd
14
+ import struct as struct
15
+ from .util import isFunction, DEF_FILE_NAME_MAP, fmt_filename
16
+ from .prettyobject import PrettyObject
17
+
18
def _qual_name(x, with_mod=False):
    """
    Return a readable name for 'x' (typically a function or a type).

    Parameters
    ----------
        x :
            Object to name. Its '__qualname__' is used if present, otherwise its '__name__'.
        with_mod : bool
            If True and 'x' has a non-None '__module__', append "@" + module name.

    Returns
    -------
        The qualified name, optionally followed by "@module".

    Raises
    ------
        AttributeError if 'x' has neither '__qualname__' nor '__name__'.
    """
    # Look up '__name__' only when '__qualname__' is missing: passing x.__name__ as the
    # getattr() default would evaluate it eagerly and fail for objects which carry
    # a '__qualname__' but no '__name__'.
    q = getattr(x, '__qualname__', None)
    if q is None:
        q = x.__name__
    if with_mod:
        m = getattr(x, "__module__", None)
        if m is not None:
            q += "@" + m
    return q
25
+
26
+ # =============================================================================
27
+ # Hashing
28
+ # =============================================================================
29
+
30
class UniqueHash( object ):
    """
    Callable object which recursively computes a hex-string hash for arbitrary (nested) Python objects.
    See UniqueHash.__init__ for the available settings.
    """

    def __init__(self, length : int = 32, *,
                       parse_underscore       : str  = "none",
                       sort_dicts             : bool = True,
                       parse_functions        : bool = False,
                       # micro settings
                       pd_ignore_column_order : bool = True,
                       np_nan_equal           : bool = False,
                       f_include_defaults     : bool = True,
                       f_include_closure      : bool = True,
                       f_include_globals      : bool = True,
                       ):
        """
        Initializes a callable object which iteratively generates hashes of length at most 'length'.
        The algorithm is meant to be mostly hands-off, but there are a few considerations:

        Private and Protected members
        -----------------------------
            When an object is passed to this functional its members are iterated using __dict__ or __slots__, respectively.
            By default this process ignores any fields in objects or dictionaries which start with "_". The idea here is
            that 'functional' parameters are stored as members, but any derived data is stored in protected members.
            This behaviour can be changed with 'parse_underscore'.

        Implementing custom object hashing
        ----------------------------------
            Objects can optionally implement their own hashing scheme by implementing

                __unique_hash__( self, uniqueHash : UniqueHash, debug_trace : DebugTrace )

            It is passed a clone of this object (carrying all current settings) and the current debug trace.
            The function may return a unique string, or any other non-None Python object which will then be hashed
            recursively. A common use case is to return a tuple of the members of the class which are
            pertinent for hashing.
            '__unique_hash__' may also simply be a string attribute, which is then hashed as is.

        Dictionaries
        ------------
            Since Python 3.7 dictionaries preserve the order in which they were constructed
            https://mail.python.org/pipermail/python-dev/2017-December/151283.html.
            However, Python semantics otherwise remain order invariant, i.e. {'x':1, 'y':2} tests equal to {'y':2, 'x':1}.
            For this reason the default behaviour for dictionaries is to sort their keys before hashing their content
            (also recall that objects are typically treated via their __dict__).
            This can be turned off with 'sort_dicts'.
            OrderedDicts are not sorted in any case.

        Functions
        ---------
            By default function members of objects and dictionaries (which include @properties) are
            ignored. You can set 'parse_functions' = True to parse a reduced text of the function code.
            There are a number of expert settings for handling functions, see below.

        Numpy, Pandas
        -------------
            Hashing of large datasets is not advised. Use hashes on the generating parameter set instead.

        Parameters
        ----------
            length : int
                Intended length of the returned hex string; must be between 2 and 129.
            parse_underscore : str
                How to handle object members and string dictionary keys starting with '_'.
                    * 'none'      : ignore members starting with '_' (the default)
                    * 'protected' : ignore 'private' members, i.e. those starting with '_' and containing '__'
                    * 'private'   : consider all members
            sort_dicts : bool
                Whether to sort the keys of mappings and of object __dict__s before hashing their items.
                Mappings derived from OrderedDict are always processed in their own order.
            parse_functions : bool
                If True, then the function will attempt to generate unique hashes for function and
                property objects based on their whitespace-stripped source code.

        Fine tuning
        -----------
            pd_ignore_column_order : bool
                Whether to ignore the order of pandas columns. The default is True.
            np_nan_equal : bool
                Whether to ignore the specific bit pattern of a NaN in numpy arrays. The default is False.
            f_include_defaults : bool
                When parsing functions whether to include default values. Default is True.
            f_include_closure : bool
                When parsing functions whether to include the function closure. This can be expensive. Default is True.
            f_include_globals : bool
                When parsing functions whether to include globals used by the function. This can be expensive. Default is True.

        Raises
        ------
            ValueError if 'length' is out of range or 'parse_underscore' is not one of the supported values.
        """
        self.length = int(length)

        digest_size = self.length//2
        if digest_size <= 0:
            raise ValueError("'length' must be at least 2")
        if digest_size > 64:
            raise ValueError("'length' can be at most 128 (limitation of 'haslib.blake2b')")

        self.parse_underscore       = str(parse_underscore)
        self.sort_dicts             = bool(sort_dicts)
        self.parse_functions        = bool(parse_functions)

        self.pd_ignore_column_order = bool(pd_ignore_column_order)
        self.np_nan_equal           = bool(np_nan_equal)

        self.f_include_defaults     = bool(f_include_defaults)
        self.f_include_closure      = bool(f_include_closure)
        self.f_include_globals      = bool(f_include_globals)

        # '_pi' encodes 'parse_underscore' as an int; it starts with '_' so that clone() skips it
        if parse_underscore == "none":
            self._pi = 0
        elif parse_underscore == "protected":
            self._pi = 1
        else:
            # the message must be an f-string, otherwise the placeholder is printed literally
            if parse_underscore != "private": raise ValueError(f"'parse_underscore' must be 'none', 'private', or 'protected'. Found '{self.parse_underscore}'")
            self._pi = 2

    @property
    def name(self) -> str:
        """ Short string describing the main settings of this object """
        return f"uniqueHash({self.length};{self.parse_underscore},{self.sort_dicts},{self.parse_functions})"

    def clone(self):
        """ Return copy of 'self', constructed from all members not starting with '_' """
        return UniqueHash( **{ k:v for k,v in self.__dict__.items() if not k[:1] == "_"} )

    def __call__(self, *args, debug_trace = None, **kwargs):
        """
        Returns a unique hash for the parameters passed to this function.

        Hashing can have curious side effect, in particular when handling objects and functions.
        For this reason this function allows tracing all hashing activity using its debug_trace
        parameter.

        Parameters
        ----------
            args, kwargs:
                Parameters to hash.
            debug_trace : DebugTrace
                Allows tracing of hashing activity.
                Two classes are implemented:
                    DebugTraceVerbose()
                        Simply prints out hashing activity to stdout.
                    DebugTraceCollect()
                        Collects an array of tracing information.
                        The object itself is an iterable which contains
                        the respective tracing information.

        Returns
        -------
            String of at most length 'length'
        """
        h, _ = self._mk_blake( h=self.length//2 )
        if len(args) > 0:
            self._hash_any( h, args, debug_trace = debug_trace )
        if len(kwargs) > 0:
            self._hash_any( h, kwargs, debug_trace = debug_trace )
        return h.hexdigest()

    # Utility functions
    # -----------------

    @staticmethod
    def _mk_blake( h ):
        """
        Utility function to allow passing a hash 'h' or an 'int'.

        If 'h' is not an int it is returned unchanged together with False.
        Otherwise 'h' is the digest size in bytes (1..64, i.e. half the length of the
        hex digest) and a new blake2 hash object is returned together with True.
        """
        if not isinstance(h, int):
            return h, False
        h = int(h)
        # 'h' is already the digest size in bytes: __call__ passes length//2. A digest size of
        # 1 (length 2 or 3) is accepted by __init__ and must therefore be accepted here, too.
        if h < 1 or h > 64:
            raise ValueError(f"digest size 'h' must be between 1 and 64 bytes. Found {h}")
        # blake2s supports digests of up to 32 bytes, blake2b of up to 64 bytes
        h = hashlib.blake2b( digest_size=h ) if h > 16 else hashlib.blake2s( digest_size=h )
        return h, True

    def _skip_key(self, k) -> bool:
        """ Whether the member/key 'k' is excluded from hashing by the 'parse_underscore' setting """
        if not isinstance(k, str) or k[:1] != '_':
            return False
        if self._pi == 0:
            return True
        return self._pi == 1 and k.find("__") != -1

    def _hash_any(self, h, x, *, debug_trace = None ):
        """
        Recursive function to hash "any" object.

        Parameters
        ----------
            h : hash
                Hashlib algorithm; updated in place.
            x : any
                Value to hash.
            debug_trace :
                Optional DebugTrace object to debug uniqueHash calculations.

        Raises
        ------
            TypeError if 'x' is of a type for which no hashing scheme is available.
        """
        if x is None:
            h.update(b'\x00')
            return
        # numpy atomic
        if isinstance(x, np.generic):
            sz = x.itemsize
            if sz==1:
                x = x.view(np.int8)
            elif sz==2:
                x = x.view(np.int16)
            elif sz==4:
                x = x.view(np.int32)
            else:
                assert sz==8, ("Cannot handle itemsize",sz,"for numpy generic", type(x), "with value", x)
                x = x.view(np.int64)
            h.update(x.tobytes())
            if debug_trace is not None: debug_trace.update( x )
            return
        # basic elements
        # note: bool must be tested before int as it is a subclass of int
        if isinstance( x, bool ):
            h.update( x.to_bytes(1,'little', signed=True) )
            if debug_trace is not None: debug_trace.update( x )
            return
        if isinstance( x, int ):
            try:
                raw = x.to_bytes(8,'little', signed=True)
            except OverflowError:
                # Python ints are unbounded: use the smallest signed representation for
                # values which do not fit into 64 bits (bit_length() excludes the sign bit).
                raw = x.to_bytes((x.bit_length()+8)//8,'little', signed=True)
            h.update( raw )
            if debug_trace is not None: debug_trace.update( x )
            return
        if isinstance( x, float ):
            h.update( struct.pack('<d', x) )  # little-endian double
            if debug_trace is not None: debug_trace.update( x )
            return
        if isinstance( x, complex ):
            # struct has no complex format: pack real and imaginary part as two doubles
            h.update( struct.pack('<dd', x.real, x.imag) )
            if debug_trace is not None: debug_trace.update( x )
            return
        if isinstance( x, bytes ):
            h.update( x )
            if debug_trace is not None: debug_trace.update( x )
            return
        if isinstance( x, str ):
            h.update( x.encode('utf-8') )
            if debug_trace is not None: debug_trace.update( x )
            return
        # datetime etc
        # note: datetime.datetime must be tested before datetime.date as it is a subclass of the latter
        if isinstance(x,datetime.datetime):
            if debug_trace is not None: debug_trace = debug_trace.update_topic( x )
            ts = float( x.timestamp() )
            td = x.tzinfo.utcoffset(x) if x.tzinfo is not None else None
            self._hash_any(h, ts, debug_trace=debug_trace)
            if td is not None:
                # total_seconds is a method: it must be called, otherwise the bound method
                # object is passed on and the time zone offset never enters the hash.
                self._hash_any(h, td.total_seconds(), debug_trace=debug_trace)
            return
        if isinstance(x,datetime.time):
            # tzinfo of a datetime.time is not taken into account
            if debug_trace is not None: debug_trace = debug_trace.update_topic( x )
            total_seconds = float(x.hour*60*60+x.minute*60+x.second) +\
                            float(x.microsecond) / 1000000.
            self._hash_any(h, total_seconds, debug_trace=debug_trace)
            return
        if isinstance(x,datetime.date):
            if debug_trace is not None: debug_trace = debug_trace.update_topic( x )
            full = x.year * 10000 + x.month * 100 + x.day
            self._hash_any(h, full, debug_trace=debug_trace)
            return
        if isinstance(x,datetime.timedelta):
            if debug_trace is not None: debug_trace = debug_trace.update_topic( x )
            self._hash_any(h, x.total_seconds(), debug_trace=debug_trace )
            return
        # functions
        if isFunction(x) or isinstance(x,property):
            if self.parse_functions:
                self._hash_function( h, x, debug_trace=debug_trace )
            elif debug_trace is not None:
                # not every object handled here has a __qualname__ (e.g. property objects)
                debug_trace.warning( f"Ignored function: {getattr(x, '__qualname__', type(x).__name__)}")
            return
        # slice -> tuple
        if isinstance(x,slice):
            if debug_trace is not None: debug_trace = debug_trace.update_topic( x )
            self._hash_any(h, (x.start,x.stop,x.step), debug_trace=debug_trace )
            return
        # test presence of __unique_hash__()
        # objects can now simply set this member to a string
        if hasattr(x,"__unique_hash__"):
            unique_hash = x.__unique_hash__
            if isinstance(unique_hash, str):
                h.update(unique_hash.encode('utf-8') )
                if debug_trace is not None:
                    debug_trace = debug_trace.update_topic( x, msg="__unique_hash__ str" )
                    debug_trace.update( unique_hash )
                return
            debug_trace = None if debug_trace is None else debug_trace.update_topic( x, msg="__unique_hash__ function" )
            try:
                unique_hash = unique_hash( self.clone(), debug_trace=debug_trace )
            except Exception as e:
                # same exception type as before, now chained so the original traceback is kept
                raise type(e)( e, f"Exception encountered while calling '__unique_hash__' of object of type {type(x)}.") from e
            if unique_hash is None:
                raise TypeError(f"{type(x).__qualname__}: __unique_hash__() cannot return None")
            if isinstance(unique_hash, str):
                h.update(unique_hash.encode('utf-8') )
                if debug_trace is not None:
                    debug_trace.update( unique_hash )
            else:
                if debug_trace is not None:
                    debug_trace = debug_trace.update_topic( unique_hash )
                self._hash_any(h, unique_hash, debug_trace=debug_trace )
            return
        # numpy
        if isinstance(x,np.ndarray):
            self._hash_numpy(h, x, debug_trace=debug_trace )
            return
        # pandas
        if isinstance(x,pd.DataFrame):
            self._hash_dataFrame(h, x, debug_trace=debug_trace )
            return
        # dictionaries, and similar
        # note that objects with a __dict__ will
        # be hashed using that dictionary
        if isinstance(x,Mapping):
            assert not isinstance(x, Sequence)
            # from Python 3.7 onwards, dictionaries are ordered.
            # however, we assume here that unless they are
            # specified as ordered, the order does not matter.
            if debug_trace is not None: debug_trace = debug_trace.update_topic( x )
            keys = sorted(x) if self.sort_dicts and not isinstance(x,OrderedDict) else list(x)
            for k in keys:
                if self._skip_key(k):
                    continue
                self._hash_any(h, (k, x[k]), debug_trace=debug_trace)
            return
        # lists, tuples and everything which looks like it --> lists
        if isinstance(x, (Sequence, Iterator)):
            assert not isinstance(x, dict)
            if debug_trace is not None: debug_trace = debug_trace.update_topic( x )
            for k in x:
                self._hash_any(h, k, debug_trace=debug_trace)
            return
        # all others such as sets need sorting first
        if isinstance(x, Collection):
            assert not isinstance(x, dict)
            if debug_trace is not None: debug_trace = debug_trace.update_topic( x )
            x = sorted(x)
            for k in x:
                self._hash_any(h, k, debug_trace=debug_trace)
            return
        # objects: treat like dictionaries
        if hasattr(x,"__dict__"):
            # 1) from python 3.7 onwards dictionaries are ordered.
            #    however, except in rare cases that order should not
            #    impede the equivalence of objects
            # 2) private member handling in Python is subject to name space mangling which can
            #    have curious effects. That's why we consider private any members starting
            #    with '_' and containing '__':
            #        class A(object):
            #            def f(self):
            #                class X(object):
            #                    pass
            #                x = X()
            #                x.__p = 1
            #                print(x.__dict__)
            #    A().f() will print '{'_A__p': 1}' even though 'x.__p' is a private member to X.
            if debug_trace is not None: debug_trace = debug_trace.update_topic( x, "object with __dict__" )
            self._hash_any( h, _qual_name( type(x),False), debug_trace=debug_trace)
            x = x.__dict__
            keys = sorted(x) if self.sort_dicts else list(x)
            for k in keys:
                if self._skip_key(k):
                    continue
                self._hash_any(h, k, debug_trace=debug_trace)
                self._hash_any(h, x[k], debug_trace=debug_trace)
            return
        if hasattr(x,"__slots__"):
            if debug_trace is not None: debug_trace = debug_trace.update_topic( x, "object with __slots__" )
            self._hash_any( h, _qual_name( type(x),False), debug_trace=debug_trace)
            for k in x.__slots__:
                if self._skip_key(k):
                    continue
                self._hash_any(h, k, debug_trace=debug_trace)
                self._hash_any(h, getattr(x,k), debug_trace=debug_trace)
            return
        raise TypeError(f"Cannot generate unique hash for type '{_qual_name(type(x),True)}': it does not have __dict__ or __slots__")

    def _hash_function( self, h, fn : Callable, *, debug_trace = None ):
        """
        Hash a function, a property, or a callable object.

        The hash is built from the qualified name, the source code with all blanks removed and,
        depending on the 'f_include_*' settings, from default values, closure contents and the
        globals referenced by the code. Builtins are identified by their qualified name only.

        Raises
        ------
            TypeError if 'fn' is neither a function, a property, nor callable.
        """
        if isinstance(fn, property):
            # properties are not callable themselves: hash the accessors which are present
            prop_trace = None if debug_trace is None else debug_trace.update_topic( fn, "property" )
            for accessor in (fn.fget, fn.fset, fn.fdel):
                if accessor is not None:
                    self._hash_function( h, accessor, debug_trace=prop_trace )
            return

        fn = inspect.unwrap(getattr(fn, "__func__", fn))
        if inspect.isbuiltin(fn):
            # Builtins: best we can do is identity by qualname
            ident = _qual_name(fn,False)
            h.update( ident.encode("utf-8") )
            if debug_trace is not None: debug_trace.update( ident, "builtin function" )
            return

        if not inspect.isfunction(fn):
            if hasattr(fn, "__call__"):
                obj_name = _qual_name(type(fn),False)
                h.update( obj_name.encode("utf-8") )
                if debug_trace is not None:
                    debug_trace = debug_trace.update_topic( fn, "using __call__" )
                    debug_trace.update( obj_name )
                call = fn.__call__
                if isinstance(call, types.MethodWrapperType):
                    # C-level callables: the __call__ of a method-wrapper is again a
                    # method-wrapper, so recursing would never terminate. The type name
                    # hashed above is the best identity available.
                    return
                return self._hash_function(h, call, debug_trace = debug_trace )
            raise TypeError(f"'fn' is not a function but of type {type(fn)}.")

        debug_trace = None if debug_trace is None else debug_trace.update_topic( fn )
        func_name = _qual_name(fn,False)
        self._hash_any( h, func_name )

        src = inspect.getsourcelines( fn )[0]
        if isinstance(fn,types.LambdaType) and fn.__name__ == "<lambda>":
            # strip everything up to and including the 'lambda' keyword from the first line
            assert len(src) > 0, "No source code ??"
            first = src[0]
            lowered = first.lower()
            i, skip = lowered.find("lambda "), len("lambda ")
            if i == -1:
                # lambdas without arguments are written 'lambda:'; keep the ':' just as it is
                # kept for lambdas with arguments
                i, skip = lowered.find("lambda:"), len("lambda")
            assert i!=-1, (f"Cannot find keyword 'lambda' even though {func_name} is a LambdaType?")
            src[0] = first[i+skip:]
        # Compressed version of the code of the function 'fn' where all blanks are removed
        src = [ l.replace("\t"," ").replace(" ","").replace("\n","") for l in src ]
        self._hash_any( h, src )
        if debug_trace is not None:
            debug_trace.update( func_name )
            debug_trace.update( src, "reduced source code")

        if self.f_include_defaults:
            # Defaults
            if fn.__defaults__ is not None and len(fn.__defaults__) > 0:
                def_debug_trace = None if debug_trace is None else debug_trace.update_topic( fn.__defaults__, "position defaults")
                self._hash_any( h, fn.__defaults__, debug_trace = def_debug_trace )

            if fn.__kwdefaults__ is not None and len(fn.__kwdefaults__) > 0:
                def_debug_trace = None if debug_trace is None else debug_trace.update_topic(fn.__kwdefaults__, "keyword defauls")
                self._hash_any( h, fn.__kwdefaults__, debug_trace = def_debug_trace )

        if self.f_include_closure and fn.__closure__ is not None and len(fn.__closure__) > 0:
            # Closure cells (can be large; disable if that's a concern)
            closure_debug_trace = None if debug_trace is None else debug_trace.update_topic( fn.__closure__, "closure" )
            for cell in fn.__closure__:
                self._hash_any( h, cell.cell_contents, debug_trace=closure_debug_trace )

        if self.f_include_globals and len(fn.__globals__) > 0 and len(fn.__code__.co_names) > 0:
            # Referenced globals (names actually used by the code)
            g = fn.__globals__
            glb_debug_trace = None if debug_trace is None else debug_trace.update_topic( fn.__code__.co_names, "linked globals" )
            for name in sorted(fn.__code__.co_names):
                if name in g:
                    self._hash_any( h, (name, g[name]), debug_trace=glb_debug_trace )

    def _hash_dataFrame( self, h, df : pd.DataFrame, *, debug_trace = None ):
        """
        Compute hash for a dataframe, c.f. https://stackoverflow.com/questions/49883236/how-to-generate-a-hash-or-checksum-value-on-python-dataframe-created-from-a-fix

        Updates 'h' with the hashed index, then name, dtype and hashed content of each column,
        and finally the frame's 'attrs' dictionary (if present).
        Columns are processed in sorted order if 'pd_ignore_column_order' is set.
        """
        assert isinstance(df, pd.DataFrame), ("DataFrame expected", type(df))
        debug_trace = None if debug_trace is None else debug_trace.update_topic( df )
        if self.pd_ignore_column_order:
            df = df.reindex(sorted(df.columns), axis=1)

        # hash index
        idx_h = pd.util.hash_pandas_object(df.index, index=False, categorize=True).values
        h.update(idx_h.tobytes())
        if debug_trace is not None: debug_trace.update( idx_h )

        # hash each column's content + its name + dtype
        for name, col in df.items():
            h.update(str(name).encode('utf-8'))
            h.update(str(col.dtype).encode('utf-8'))
            col_h = pd.util.hash_pandas_object(col, index=False, categorize=True).values
            h.update(col_h.tobytes())
            if debug_trace is not None:
                debug_trace.update( str(name) )
                debug_trace.update( str(col.dtype) )
                debug_trace.update( col_h )

        # attrs, if any
        attrs = getattr(df, "attrs", None)
        if attrs is not None:
            self._hash_any(h, attrs)
            if debug_trace is not None: debug_trace.update( attrs, "attrs" )

    def _hash_numpy( self, h, a : np.ndarray, *, debug_trace = None ):
        """
        Numpy hash: updates 'h' with rank, shape, dtype string and the raw bytes of 'a'.

        The data is normalized to little-endian first. If 'np_nan_equal' is set, all NaNs in
        float and complex arrays are replaced by one canonical quiet NaN before hashing.

        Raises
        ------
            TypeError for arrays of dtype object.
        """
        assert isinstance(a, np.ndarray), ("ndarray expected", type(a))
        a = np.asarray(a)

        debug_trace = None if debug_trace is None else debug_trace.update_topic( a )
        # Disallow arbitrary Python objects (define your own encoding first)
        if a.dtype.kind == 'O':
            raise TypeError("object-dtype array: map elements to bytes first (e.g., via str/utf-8).")

        # Datetime/timedelta: hash their int64 representation
        if a.dtype.kind in 'Mm':
            a = a.view(np.int64)

        # Normalize to little-endian.
        # ndarray.newbyteorder() was removed in NumPy 2.0; swapping the bytes and viewing the
        # result with the byte-swapped dtype is the equivalent available in all versions.
        if a.dtype.byteorder == '>' or (a.dtype.byteorder == '=' and not np.little_endian):
            a = a.byteswap().view(a.dtype.newbyteorder())
        # remember the normalized dtype, to be restored after the NaN handling below
        a_dtype = a.dtype

        # Canonicalize NaN bits so all NaNs hash the same (float16/32/64, complex64/128)
        if self.np_nan_equal and a.dtype.kind in 'fc':
            base_bytes = (a.dtype.itemsize // (2 if a.dtype.kind == 'c' else 1))
            if base_bytes == 2:
                base = np.float16
            elif base_bytes == 4:
                base = np.float32
            else:
                assert base_bytes == 8, ("Internal error: cannot handle base_bytes", base_bytes)
                base = np.float64
            a = a.view(base)
            if np.isnan(a).any():
                a = a.copy()
                if base is np.float16:
                    qnan = np.frombuffer(np.uint16(0x7e00).tobytes(), dtype=np.float16)[0]
                elif base is np.float32:
                    qnan = np.frombuffer(np.uint32(0x7fc00000).tobytes(), dtype=np.float32)[0]
                else:  # float64
                    qnan = np.frombuffer(np.uint64(0x7ff8000000000000).tobytes(), dtype=np.float64)[0]
                a[np.isnan(a)] = qnan
            a = a.view(a_dtype)

        # shapes
        h.update( len(a.shape).to_bytes(4,'little', signed=False) )
        for i in a.shape:
            h.update( i.to_bytes(4,'little', signed=False) )
        # dtype
        h.update(a.dtype.str.encode('utf-8'))
        h.update(a.tobytes())
        if debug_trace is not None:
            debug_trace.update( a.shape )
            debug_trace.update( a.dtype.str )
            debug_trace.update( a.tobytes() )
604
+
605
+ # Debugging
606
+ # =========
607
+
608
class DebugTrace(object):
    """
    Interface for tracing hashing activity.
    Every method raises NotImplementedError here and is meant to be overridden.
    """

    def update( self, x, msg : str = None ):
        """ Notify processing of 'x', with an optional process 'msg' """
        raise NotImplementedError

    def update_topic( self, x, msg : str = None ):
        """ Notify and return a sub-trace context """
        raise NotImplementedError

    def warning( self, msg : str):
        """ Issue warning """
        raise NotImplementedError
618
+
619
class DebugTraceCollect(DebugTrace):
    """
    DebugTrace which records every reported item in the list 'trace'.
    The object can be indexed, iterated and measured with len() like that list.
    """

    def __init__(self, tostr : int = None ):
        """
        Initialize data collection.

        Parameters
        ----------
            tostr : int
                If set, record the type and the first 'tostr' characters of repr() of
                each reported object instead of the object itself.
        """
        if tostr:
            if tostr <= 0:
                raise ValueError("'tostr' must be None or a positive integer")
        self.tostr = tostr
        self.trace = []

    def _update( self, x, msg, child ):
        """ Append a record for 'x' with message 'msg' and optional sub-trace 'child' """
        if self.tostr:
            fields = dict( typex = type(x), reprx = repr(x)[:self.tostr] )
        else:
            fields = dict( x = x )
        fields['msg']   = msg
        fields['child'] = child
        self.trace.append( PrettyObject( **fields ) )

    def update( self, x, msg : str = None ):
        """ Notify processing of 'x', with an optional process 'msg' """
        self._update( x, msg, None )

    def update_topic( self, x, msg : str = None ):
        """ Notify and return a sub-trace context """
        sub_trace = DebugTraceCollect(tostr=self.tostr)
        self._update( x, msg, sub_trace )
        return sub_trace

    def warning( self, msg : str):
        """ Issue warning """
        self._update( None, msg, None )

    # results
    # -------

    def __getitem__(self, item):
        return self.trace[item]

    def __len__(self):
        return len(self.trace)

    def __iter__(self):
        yield from self.trace

    def __str__(self):
        return repr(self.trace)

    def __repr__(self):
        return f"DebugTraceCollect({str(self.trace)})"
666
+
667
class DebugTraceVerbose(DebugTrace):
    """
    Live printing of tracing information using a verbose Context object
    for some formatting. All objects will be reported by type and
    their string representation, sufficiently reduced if necessary.
    """

    def __init__(self, strsize : int = 50, verbose = None ):
        """
        Initialize tracer.

        Parameters
        ----------
            strsize : int
                Maximum string size when using repr() on reported objects. Must exceed 3.
            verbose :
                Context object or None for a new context object.

        Raises
        ------
            ValueError if 'strsize' does not exceed 3.
        """
        # the exception must be raised, not merely constructed
        if strsize<=3: raise ValueError("'strsize' must exceed 3")
        self.strsize = strsize
        if verbose is None:
            # only needed when no context was provided
            from .verbose import Context
            verbose = Context("all")
        self.verbose = verbose

    def update( self, x, msg : str = None ):
        """ Notify processing of 'x', with an optional process 'msg' """
        xstr = repr(x)
        # drop the quotes repr() adds around strings
        if xstr[:1] == "'" and xstr[-1] == "'":
            xstr = xstr[1:-1]
        # shorten to 'strsize' characters including the trailing dots
        if len(xstr) > self.strsize:
            xstr = xstr[:self.strsize-3] + "..."
        if msg is None or len(msg) == 0:
            self.verbose.write( f"{type(x).__name__}: '{xstr}'" )
        else:
            self.verbose.write( f"{msg} {type(x).__name__}: '{xstr}'" )

    def update_topic( self, x, msg : str = None ):
        """ Notify and return a sub-trace context """
        self.update( x, msg )
        return DebugTraceVerbose( self.strsize, self.verbose(1) )

    def warning( self, msg : str):
        """ Issue warning """
        self.verbose.write( msg )
706
+
707
+ # =============================================================================
708
+ # Utility wrappers
709
+ # =============================================================================
710
+
711
def uniqueHashExt(**parameters) -> UniqueHash:
    # thin factory: all keyword arguments are forwarded to UniqueHash.__init__
    hasher = UniqueHash(**parameters)
    return hasher
# the factory shares the documentation of the constructor it wraps
uniqueHashExt.__doc__ = UniqueHash.__init__.__doc__
714
+
715
def namedUniqueHashExt( max_length : int = 60,
                        id_length  : int = 16, *,
                        separator  : str = ' ',
                        filename_by : str = None,
                        **unique_hash_arguments
                        ):
    """
    Returns a function

        f( label, *args, **kwargs )

    which generates unique strings of at most a length of max_length of the format
        label + separator + ID
    where ID has at most length id_length. The label is truncated as required.

    If max_length leaves no room for a label, i.e. max_length <= id_length+len(separator),
    then the function just returns an ID of at most length max_length.

    This function does not suppose that 'label' is unique, hence the ID is prioritized.
    See uniqueLabelExt() for a function which assumes the label is unique.

    The function optionally makes sure that the returned string is a valid file name using fmt_filename.

    Important
    ---------
        It is strongly recommended to read the documentation for UniqueHash.__init__() for details on hashing logic
        and the available parameters

    Parameters
    ----------
        max_length : int
            Total length of the returned string including the ID.
            Defaults to 60 to allow file names with extensions with three letters.
        id_length : int
            Intended length of the hash function, default 16. Must be at least 4 and not exceed max_length.
        separator : str
            Separator between label and ID.
            Note that the separator will be included in the ID calculation, hence different separators
            lead to different IDs.
        filename_by : str, None
            If not None, use fmt_filename( *, by=filename_by ) on label and separator to ensure the
            returned string is a valid filename.
            If set to the string "default", DEF_FILE_NAME_MAP is used as the mapping.
        **unique_hash_arguments:
            Parameters passed to UniqueHash.__init__. 'length' cannot be specified.

    Returns
    -------
        hash function with signature (label, *args, **kwargs).
        All arguments including label and separator will be used to generate the hash key.

    Raises
    ------
        ValueError if 'id_length' is less than 4 or exceeds 'max_length', or if 'length' is passed
        as part of 'unique_hash_arguments'.
    """
    # the message must be an f-string, otherwise the placeholder is printed literally
    if id_length < 4: raise ValueError(f"'id_length' must be at least 4. Found {id_length}")
    if id_length > max_length: raise ValueError(f"'max_length' must not be less than 'id_length'. Founb {max_length} and {id_length}, respectivelty")
    if 'length' in unique_hash_arguments: raise ValueError("Cannot specify 'length' here. Used 'id_length' and 'max_length'")
    if filename_by == "default":
        filename_by = DEF_FILE_NAME_MAP
    fseparator = fmt_filename( separator, by=filename_by ) if filename_by is not None else separator

    label_length = max_length-id_length-len(fseparator)
    if label_length<=0:
        # no room for a label: use all available space for the ID
        id_length    = max_length
        label_length = 0
    unique_hash = UniqueHash( length=id_length, **unique_hash_arguments )

    def named_unique_hash(label, *args, **kwargs) -> str:
        if label_length>0:
            assert not label is None, ("'label' cannot be None", args, kwargs)
            label     = fmt_filename( label, by=filename_by ) if filename_by is not None else label
            # the ID is computed from the full label, before the label is truncated
            base_hash = unique_hash( label, separator, *args, **kwargs )
            label     = label[:label_length] + fseparator + base_hash
        else:
            label     = unique_hash( separator, *args, **kwargs )  # using 'separator' here to allow distinction at that level
        return label
    return named_unique_hash
788
+
789
def uniqueLabelExt( max_length       : int = 60,
                    id_length        : int = 8,
                    separator        : str = ' ',
                    filename_by      : str = None ):
    """
    Returns a function

        f( unique_label )

    which generates strings of at most 'max_length' characters of the format:

        If len(unique_label) <= max_length (and no file name conversion is used):
            unique_label
        else:
            unique_label + separator + ID

    where the ID is a hash of maximum length 'id_length'.

    This function assumes that 'unique_label' is unique, hence the ID is dropped if
    'unique_label' is not longer than 'max_length'.
    See namedUniqueHashExt() for a function which does not assume the label is unique,
    hence the ID is always appended.

    Note that if file name conversion is used ('filename_by' is not None), then the returned
    function always attaches the ID because the reduction of the label to a file name is no
    longer guaranteed to be unique. If your label is already unique as a file name, do not
    use 'filename_by'.

    Parameters
    ----------
        max_length : int
            Total length of the returned string including the ID.
            Defaults to 60 to allow file names with extensions with three letters.
        id_length : int
            Length passed to UniqueHash for the ID, default 8.
            Must be at least 4 and must not exceed 'max_length'.
        separator : str
            Separator between label and ID.
            Note that the separator is included in the ID calculation, hence different
            separators lead to different IDs.
        filename_by : str, None
            If not None, use fmt_filename( *, by=filename_by ) (imported from this package's
            'util' module) to convert both the separator and the label.
            If set to the string "default", DEF_FILE_NAME_MAP is passed as 'by'.

    Returns
    -------
        Function with signature (unique_label) returning a str.

    Raises
    ------
        ValueError
            If 'id_length' is less than 4, or if 'id_length' exceeds 'max_length'.
    """
    # NOTE: the first message used to lack the 'f' prefix and therefore printed the
    # literal text '{id_length}' instead of the offending value.
    if id_length < 4:
        raise ValueError(f"'id_length' must be at least 4. Found {id_length}")
    if id_length > max_length:
        raise ValueError(f"'max_length' must not be less than 'id_length'. Found {max_length} and {id_length}, respectively")

    # resolve the "default" keyword; None means: no file name conversion at all
    if filename_by is not None and filename_by == "default":
        filename_by = DEF_FILE_NAME_MAP
    fseparator = fmt_filename( separator, by=filename_by ) if filename_by is not None else separator

    # The original code clamped id_length to max_length+len(fseparator) at this point.
    # Given id_length <= max_length (checked above) that clamp could never change id_length,
    # so it has been removed. A separator+ID that does not fit is handled per call below.
    unique_hash = UniqueHash( length=id_length )

    def unique_label_hash(label) -> str:
        # no file name conversion and the label is non-empty and short enough --> use it as is
        if filename_by is None and 0 < len(label) <= max_length:
            return label

        # the ID is computed from the unconverted label and the unconverted separator
        base_hash  = unique_hash( label, separator )
        label_hash = fseparator + base_hash

        if len(label_hash) >= max_length or len(label) == 0:
            # separator and ID leave no room for any label text (or there is no label):
            # return the bare ID; id_length <= max_length was enforced above.
            return base_hash

        # convert the label to a file name if requested, then truncate it such that
        # label + separator + ID has at most max_length characters
        if filename_by is not None:
            label = fmt_filename( label, by=filename_by )
        return label[:max_length-len(label_hash)] + label_hash

    return unique_label_hash
859
+
860
+ # =============================================================================
861
+ # Short cuts
862
+ # =============================================================================
863
+
864
def uniqueHash8( *args, **argv ) -> str:
    """
    Short cut: hash all positional and keyword arguments into an ID of length (at most) 8.

    This is equivalent to constructing a UniqueHash object with length 8 and otherwise
    default parameters, and calling it with the provided arguments. With those defaults
        1) dictionaries are sorted (sort_dicts=True),
        2) members with leading '_' are not parsed (parse_underscore="none"),
        3) functions are not parsed (parse_functions=False).
    To obtain different behaviour construct a UniqueHash object with the desired parameters.

    See UniqueHash.__init__ for the details of the hashing logic, including how objects
    can customise their own hashing via a '__unique_hash__' member.

    Returns
    -------
        The hash as a string.
    """
    hasher = UniqueHash( length=8 )
    return hasher( *args, **argv )
885
+
886
def uniqueHash16( *args, **argv ) -> str:
    """
    Short cut: hash all positional and keyword arguments into an ID of length (at most) 16.

    This is equivalent to constructing a UniqueHash object with length 16 and otherwise
    default parameters, and calling it with the provided arguments. With those defaults
        1) dictionaries are sorted (sort_dicts=True),
        2) members with leading '_' are not parsed (parse_underscore="none"),
        3) functions are not parsed (parse_functions=False).
    To obtain different behaviour construct a UniqueHash object with the desired parameters.

    See UniqueHash.__init__ for the details of the hashing logic, including how objects
    can customise their own hashing via a '__unique_hash__' member.

    Returns
    -------
        The hash as a string.
    """
    hasher = UniqueHash( length=16 )
    return hasher( *args, **argv )
906
+
907
def uniqueHash32( *args, **argv ) -> str:
    """
    Short cut: hash all positional and keyword arguments into an ID of length (at most) 32.

    This is equivalent to constructing a UniqueHash object with length 32 and otherwise
    default parameters, and calling it with the provided arguments. With those defaults
        1) dictionaries are sorted (sort_dicts=True),
        2) members with leading '_' are not parsed (parse_underscore="none"),
        3) functions are not parsed (parse_functions=False).
    To obtain different behaviour construct a UniqueHash object with the desired parameters.

    See UniqueHash.__init__ for the details of the hashing logic, including how objects
    can customise their own hashing via a '__unique_hash__' member.

    Returns
    -------
        The hash as a string.
    """
    hasher = UniqueHash( length=32 )
    return hasher( *args, **argv )

# 'uniqueHash' is an alias for the 32 character version
uniqueHash = uniqueHash32
929
+
930
def uniqueHash48( *args, **argv ) -> str:
    """
    Short cut: hash all positional and keyword arguments into an ID of length (at most) 48.

    This is equivalent to constructing a UniqueHash object with length 48 and otherwise
    default parameters, and calling it with the provided arguments. With those defaults
        1) dictionaries are sorted (sort_dicts=True),
        2) members with leading '_' are not parsed (parse_underscore="none"),
        3) functions are not parsed (parse_functions=False).
    To obtain different behaviour construct a UniqueHash object with the desired parameters.

    See UniqueHash.__init__ for the details of the hashing logic, including how objects
    can customise their own hashing via a '__unique_hash__' member.

    Returns
    -------
        The hash as a string.
    """
    hasher = UniqueHash( length=48 )
    return hasher( *args, **argv )
950
+
951
def uniqueHash64( *args, **argv ) -> str:
    """
    Short cut: hash all positional and keyword arguments into an ID of length (at most) 64.

    This is equivalent to constructing a UniqueHash object with length 64 and otherwise
    default parameters, and calling it with the provided arguments. With those defaults
        1) dictionaries are sorted (sort_dicts=True),
        2) members with leading '_' are not parsed (parse_underscore="none"),
        3) functions are not parsed (parse_functions=False).
    To obtain different behaviour construct a UniqueHash object with the desired parameters.

    See UniqueHash.__init__ for the details of the hashing logic, including how objects
    can customise their own hashing via a '__unique_hash__' member.

    Returns
    -------
        The hash as a string.
    """
    hasher = UniqueHash( length=64 )
    return hasher( *args, **argv )