relib 1.2.0__tar.gz → 1.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {relib-1.2.0 → relib-1.2.1}/PKG-INFO +3 -2
- {relib-1.2.0 → relib-1.2.1}/pyproject.toml +1 -1
- {relib-1.2.0 → relib-1.2.1}/relib/__init__.py +3 -1
- relib-1.2.1/relib/hashing.py +179 -0
- {relib-1.2.0 → relib-1.2.1}/relib/utils.py +37 -42
- {relib-1.2.0 → relib-1.2.1}/uv.lock +1 -1
- relib-1.2.0/relib/hashing.py +0 -255
- {relib-1.2.0 → relib-1.2.1}/.gitignore +0 -0
- {relib-1.2.0 → relib-1.2.1}/LICENSE +0 -0
- {relib-1.2.0 → relib-1.2.1}/README.md +0 -0
- {relib-1.2.0 → relib-1.2.1}/relib/measure_duration.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
|
-
Metadata-Version: 2.
|
1
|
+
Metadata-Version: 2.4
|
2
2
|
Name: relib
|
3
|
-
Version: 1.2.
|
3
|
+
Version: 1.2.1
|
4
4
|
Project-URL: Repository, https://github.com/Reddan/relib.git
|
5
5
|
Author: Hampus Hallman
|
6
6
|
License: Copyright 2023 Hampus Hallman
|
@@ -10,4 +10,5 @@ License: Copyright 2023 Hampus Hallman
|
|
10
10
|
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
11
11
|
|
12
12
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
13
|
+
License-File: LICENSE
|
13
14
|
Requires-Python: >=3.12
|
@@ -1,6 +1,8 @@
|
|
1
1
|
from .utils import (
|
2
2
|
clear_console,
|
3
|
+
console_link,
|
3
4
|
non_none,
|
5
|
+
as_any,
|
4
6
|
list_split,
|
5
7
|
drop_none,
|
6
8
|
distinct,
|
@@ -31,5 +33,5 @@ from .utils import (
|
|
31
33
|
StrFilter,
|
32
34
|
str_filterer,
|
33
35
|
)
|
34
|
-
from .hashing import hash
|
36
|
+
from .hashing import hash, hash_obj
|
35
37
|
from .measure_duration import measure_duration
|
@@ -0,0 +1,179 @@
|
|
1
|
+
# Author: Gael Varoquaux <gael dot varoquaux at normalesup dot org>
|
2
|
+
# Copyright (c) 2009 Gael Varoquaux
|
3
|
+
# License: BSD Style, 3 clauses.
|
4
|
+
|
5
|
+
import pickle
|
6
|
+
import hashlib
|
7
|
+
import sys
|
8
|
+
import types
|
9
|
+
import io
|
10
|
+
import decimal
|
11
|
+
|
12
|
+
# numpy is an optional dependency: record whether it can be imported so that
# hash_obj can choose between Hasher and NumpyHasher.
try:
    import numpy
except Exception:
    # Kept deliberately broad (best effort: any failure while importing numpy
    # means "no numpy support"), but unlike a bare ``except:`` this no longer
    # swallows KeyboardInterrupt or SystemExit.
    has_numpy = False
else:
    has_numpy = True

# Hasher overrides save / memoize / dispatch, so it subclasses the
# pickle._Pickler class rather than pickle.Pickler.
Pickler = pickle._Pickler
|
20
|
+
|
21
|
+
|
22
|
+
class _ConsistentSet(object):
|
23
|
+
def __init__(self, set_sequence):
|
24
|
+
try:
|
25
|
+
self._sequence = sorted(set_sequence)
|
26
|
+
except (TypeError, decimal.InvalidOperation):
|
27
|
+
self._sequence = sorted(map(hash_obj, set_sequence))
|
28
|
+
|
29
|
+
|
30
|
+
class _MyHash(object):
|
31
|
+
""" Class used to hash objects that won't normally pickle """
|
32
|
+
|
33
|
+
def __init__(self, *args):
|
34
|
+
self.args = args
|
35
|
+
|
36
|
+
|
37
|
+
class Hasher(Pickler):
    """A subclass of pickler, to do cryptographic hashing, rather than pickling.

    The object is pickled into an in-memory buffer and the buffer is fed to a
    ``hashlib`` hash. Dict items and set items are written in sorted order so
    that the digest does not depend on insertion / iteration order.

    The buffer is never reset, so use a fresh instance for every object.
    """

    def __init__(self, hash_name="md5"):
        """Create a hasher using the ``hashlib`` algorithm named *hash_name*."""
        self.stream = io.BytesIO()
        # We want a pickle protocol that only changes with major Python versions
        protocol = pickle.HIGHEST_PROTOCOL
        Pickler.__init__(self, self.stream, protocol=protocol)
        self._hash = hashlib.new(hash_name)

    def hash(self, obj) -> str:
        """Return the hexadecimal digest of *obj*.

        Raises:
            pickle.PicklingError: if *obj* cannot be pickled. A message naming
                the object is appended to the exception's ``args`` before it
                is re-raised.
        """
        try:
            self.dump(obj)
        except pickle.PicklingError as e:
            e.args += ("PicklingError while hashing %r: %r" % (obj, e),)
            raise
        dumps = self.stream.getvalue()
        self._hash.update(dumps)
        return self._hash.hexdigest()

    def save(self, obj):
        """Pickle *obj*, first replacing methods by a picklable description."""
        if isinstance(obj, (types.MethodType, type({}.pop))):
            # the Pickler cannot pickle instance methods; here we decompose
            # them into components that make them uniquely identifiable
            if hasattr(obj, "__func__"):
                func_name = obj.__func__.__name__
            else:
                func_name = obj.__name__
            inst = obj.__self__
            if isinstance(inst, types.ModuleType):
                # modules do not pickle: identify them by name
                obj = _MyHash(func_name, inst.__name__)
            elif inst is None:
                # type(None) does not pickle
                obj = _MyHash(func_name, inst)
            else:
                cls = obj.__self__.__class__
                obj = _MyHash(func_name, inst, cls)
        Pickler.save(self, obj)

    def memoize(self, obj):
        """Memoize *obj*, except for strings and bytes."""
        # We want hashing to be sensitive to value instead of reference.
        # For example we want ["aa", "aa"] and ["aa", "aaZ"[:2]]
        # to hash to the same value and that's why we disable memoization
        # for strings
        if isinstance(obj, (bytes, str)):
            return
        Pickler.memoize(self, obj)

    def save_global(self, obj, name=None):
        """Pickle *obj* by reference, with a fallback for ``__main__`` objects.

        We have to override this method in order to deal with objects
        defined interactively in IPython that are not injected in
        ``__main__``: when the normal lookup fails for such an object, it is
        injected into ``__main__`` and the lookup is retried once.

        Raises:
            pickle.PicklingError: if the object cannot be pickled by reference
                and the ``__main__`` workaround does not apply or does not help.
        """
        try:
            Pickler.save_global(self, obj, name=name)
        except pickle.PicklingError:
            module = getattr(obj, "__module__", None)
            if module != "__main__":
                raise
            my_name = name
            if my_name is None:
                my_name = obj.__name__
            mod = sys.modules[module]
            if hasattr(mod, my_name):
                # The name is already taken in __main__; injecting would not
                # change the outcome, so report the original error.
                raise
            # IPython doesn't inject the variables defined
            # interactively in __main__
            setattr(mod, my_name, obj)
            # Retry only AFTER the injection; retrying before it (as the
            # previous code did) fails again and never reaches the workaround.
            Pickler.save_global(self, obj, name=name)

    def _batch_setitems(self, items, *args):
        """Write dict items in sorted order so equal dicts hash equally.

        ``*args`` is forwarded untouched to the base implementation: newer
        CPython versions call this hook with an extra positional argument in
        addition to ``items``.
        """
        pairs = list(items)
        # Only the sort is guarded: a TypeError raised while pickling a value
        # must propagate instead of triggering a second, partial write.
        try:
            ordered = sorted(pairs)
        except TypeError:
            # Keys are not orderable: order by the digest of each key instead
            # (the digest is written in place of the key).
            ordered = sorted((hash_obj(k), v) for k, v in pairs)
        Pickler._batch_setitems(self, iter(ordered), *args)

    def save_set(self, set_items):
        """Pickle a set through _ConsistentSet so item order is deterministic."""
        Pickler.save(self, _ConsistentSet(set_items))

    dispatch = Pickler.dispatch.copy()
    dispatch[type(len)] = save_global  # builtin
    dispatch[type(object)] = save_global  # type
    dispatch[type(Pickler)] = save_global  # classobj
    dispatch[type(pickle.dump)] = save_global  # function
    dispatch[type(set())] = save_set
|
121
|
+
|
122
|
+
|
123
|
+
class NumpyHasher(Hasher):
    """Hasher that feeds numpy array data straight into the hash."""

    def __init__(self, hash_name="md5"):
        super().__init__(hash_name=hash_name)

    def save(self, obj):
        """Hash ndarray data directly instead of pickling it.

        For an ndarray without object dtype, the raw bytes are fed to the hash
        and a small descriptive tuple is pickled in the array's place. A numpy
        dtype is replaced by a tuple built from its ``descr``. Of course, this
        is a total abuse of the Pickler class.
        """
        import numpy as np

        if isinstance(obj, np.ndarray) and not obj.dtype.hasobject:
            # The update function of the hash requires a c_contiguous buffer.
            flags = obj.flags
            if obj.shape == () or not (flags.c_contiguous or flags.f_contiguous):
                # 0d arrays need to be flattened because viewing them as bytes
                # raises a ValueError exception. Non-single-segment arrays are
                # flattened too: this creates a copy, and thus alleviates the
                # contiguity issue.
                # XXX: There might be a more efficient way of doing this
                contiguous = obj.flatten()
            elif flags.c_contiguous:
                contiguous = obj
            else:
                # Fortran-contiguous: the transpose is C-contiguous.
                contiguous = obj.T

            # View the array as bytes to support dtypes like datetime64
            raw_bytes = contiguous.view(np.uint8)
            self._hash.update(memoryview(raw_bytes))

            # The object will be pickled by the pickler hashed at the end.
            description = ("HASHED", obj.dtype, obj.shape, obj.strides)
            obj = (obj.__class__, description)
        elif isinstance(obj, np.dtype):
            # Atomic dtype objects are interned by their default constructor:
            # np.dtype("f8") is np.dtype("f8")
            # This interning is not maintained by a
            # pickle.loads + pickle.dumps cycle, because __reduce__
            # uses copy=True in the dtype constructor. This
            # non-deterministic behavior causes the internal memoizer
            # of the hasher to generate different hash values
            # depending on the history of the dtype object.
            # To prevent the hash from being sensitive to this, we use
            # .descr which is a full (and never interned) description of
            # the array dtype according to the numpy doc.
            obj = (obj.__class__, ("HASHED", obj.descr))

        Hasher.save(self, obj)
|
171
|
+
|
172
|
+
|
173
|
+
def hash_obj(obj, hash_name="md5") -> str:
    """Return the hex digest of *obj* computed with the *hash_name* algorithm.

    NumpyHasher is used when numpy could be imported (``has_numpy``),
    otherwise the plain Hasher.
    """
    hasher_cls = NumpyHasher if has_numpy else Hasher
    return hasher_cls(hash_name=hash_name).hash(obj)


# Alias for hash_obj; note that inside this module it shadows the builtin hash().
hash = hash_obj
|
@@ -1,21 +1,22 @@
|
|
1
1
|
import os
|
2
2
|
import re
|
3
|
-
from typing import
|
3
|
+
from typing import Iterable, Callable, Any, overload
|
4
4
|
from itertools import chain
|
5
5
|
|
6
|
-
T = TypeVar('T')
|
7
|
-
U = TypeVar('U')
|
8
|
-
K = TypeVar('K')
|
9
|
-
K1, K2, K3, K4, K5, K6 = TypeVar('K1'), TypeVar('K2'), TypeVar('K3'), TypeVar('K4'), TypeVar('K5'), TypeVar('K6')
|
10
|
-
|
11
6
|
def clear_console():
|
12
7
|
os.system("cls" if os.name == "nt" else "clear")
|
13
8
|
|
14
|
-
def
|
9
|
+
def console_link(text, url):
|
10
|
+
return f"\033]8;;{url}\033\\{text}\033]8;;\033\\"
|
11
|
+
|
12
|
+
def non_none[T](obj: T | None) -> T:
|
15
13
|
assert obj is not None
|
16
14
|
return obj
|
17
15
|
|
18
|
-
def
|
16
|
+
def as_any(obj: Any) -> Any:
|
17
|
+
return obj
|
18
|
+
|
19
|
+
def list_split[T](l: list[T], sep: T) -> list[list[T]]:
|
19
20
|
l = [sep, *l, sep]
|
20
21
|
split_at = [i for i, x in enumerate(l) if x is sep]
|
21
22
|
ranges = list(zip(split_at[0:-1], split_at[1:]))
|
@@ -24,16 +25,16 @@ def list_split(l: list[T], sep: T) -> list[list[T]]:
|
|
24
25
|
for start, end in ranges
|
25
26
|
]
|
26
27
|
|
27
|
-
def drop_none(l: Iterable[T | None]) -> list[T]:
|
28
|
+
def drop_none[T](l: Iterable[T | None]) -> list[T]:
|
28
29
|
return [x for x in l if x is not None]
|
29
30
|
|
30
|
-
def distinct(items: Iterable[T]) -> list[T]:
|
31
|
-
return list(
|
31
|
+
def distinct[T](items: Iterable[T]) -> list[T]:
|
32
|
+
return list(dict.fromkeys(items))
|
32
33
|
|
33
|
-
def first(iterable: Iterable[T]) -> T | None:
|
34
|
+
def first[T](iterable: Iterable[T]) -> T | None:
|
34
35
|
return next(iter(iterable), None)
|
35
36
|
|
36
|
-
def move_value(l: Iterable[T], from_i: int, to_i: int) -> list[T]:
|
37
|
+
def move_value[T](l: Iterable[T], from_i: int, to_i: int) -> list[T]:
|
37
38
|
l = list(l)
|
38
39
|
l.insert(to_i, l.pop(from_i))
|
39
40
|
return l
|
@@ -53,7 +54,7 @@ def transpose_dict(des):
|
|
53
54
|
{key: des[key][i] for key in keys}
|
54
55
|
for i in range(length)
|
55
56
|
]
|
56
|
-
raise ValueError(
|
57
|
+
raise ValueError("transpose_dict only accepts dict or list")
|
57
58
|
|
58
59
|
def make_combinations_by_dict(des, keys=None, pairs=[]):
|
59
60
|
keys = sorted(des.keys()) if keys == None else keys
|
@@ -67,7 +68,7 @@ def make_combinations_by_dict(des, keys=None, pairs=[]):
|
|
67
68
|
for pair in new_pairs
|
68
69
|
])
|
69
70
|
|
70
|
-
def merge_dicts(*dicts: dict[K, T]) -> dict[K, T]:
|
71
|
+
def merge_dicts[T, K](*dicts: dict[K, T]) -> dict[K, T]:
|
71
72
|
if len(dicts) == 1:
|
72
73
|
return dicts[0]
|
73
74
|
result = {}
|
@@ -75,32 +76,32 @@ def merge_dicts(*dicts: dict[K, T]) -> dict[K, T]:
|
|
75
76
|
result.update(d)
|
76
77
|
return result
|
77
78
|
|
78
|
-
def intersect(*lists: Iterable[T]) -> list[T]:
|
79
|
+
def intersect[T](*lists: Iterable[T]) -> list[T]:
|
79
80
|
return list(set.intersection(*map(set, lists)))
|
80
81
|
|
81
|
-
def ensure_tuple(value: T | tuple[T, ...]) -> tuple[T, ...]:
|
82
|
+
def ensure_tuple[T](value: T | tuple[T, ...]) -> tuple[T, ...]:
|
82
83
|
return value if isinstance(value, tuple) else (value,)
|
83
84
|
|
84
|
-
def key_of(dicts: Iterable[dict[T, U]], key: T) -> list[U]:
|
85
|
+
def key_of[T, U](dicts: Iterable[dict[T, U]], key: T) -> list[U]:
|
85
86
|
return [d[key] for d in dicts]
|
86
87
|
|
87
|
-
def omit(d: dict[K, T], keys: Iterable[K]) -> dict[K, T]:
|
88
|
+
def omit[T, K](d: dict[K, T], keys: Iterable[K]) -> dict[K, T]:
|
88
89
|
if keys:
|
89
90
|
d = dict(d)
|
90
91
|
for key in keys:
|
91
92
|
del d[key]
|
92
93
|
return d
|
93
94
|
|
94
|
-
def pick(d: dict[K, T], keys: Iterable[K]) -> dict[K, T]:
|
95
|
+
def pick[T, K](d: dict[K, T], keys: Iterable[K]) -> dict[K, T]:
|
95
96
|
return {key: d[key] for key in keys}
|
96
97
|
|
97
|
-
def dict_by(keys: Iterable[K], values: Iterable[T]) -> dict[K, T]:
|
98
|
+
def dict_by[T, K](keys: Iterable[K], values: Iterable[T]) -> dict[K, T]:
|
98
99
|
return dict(zip(keys, values))
|
99
100
|
|
100
|
-
def tuple_by(d: dict[K, T], keys: Iterable[K]) -> tuple[T, ...]:
|
101
|
+
def tuple_by[T, K](d: dict[K, T], keys: Iterable[K]) -> tuple[T, ...]:
|
101
102
|
return tuple(d[key] for key in keys)
|
102
103
|
|
103
|
-
def flatten(l: Iterable[Iterable[T]]) -> list[T]:
|
104
|
+
def flatten[T](l: Iterable[Iterable[T]]) -> list[T]:
|
104
105
|
return list(chain.from_iterable(l))
|
105
106
|
|
106
107
|
def transpose(tuples, default_num_returns=0):
|
@@ -109,27 +110,21 @@ def transpose(tuples, default_num_returns=0):
|
|
109
110
|
return ([],) * default_num_returns
|
110
111
|
return tuple(map(list, output))
|
111
112
|
|
112
|
-
def map_dict(fn: Callable[[T], U], d: dict[K, T]) -> dict[K, U]:
|
113
|
+
def map_dict[T, U, K](fn: Callable[[T], U], d: dict[K, T]) -> dict[K, U]:
|
113
114
|
return {key: fn(value) for key, value in d.items()}
|
114
115
|
|
115
116
|
@overload
|
116
|
-
def deepen_dict(d: dict[tuple[K1], U]) -> dict[K1, U]: ...
|
117
|
-
|
117
|
+
def deepen_dict[K1, U](d: dict[tuple[K1], U]) -> dict[K1, U]: ...
|
118
118
|
@overload
|
119
|
-
def deepen_dict(d: dict[tuple[K1, K2], U]) -> dict[K1, dict[K2, U]]: ...
|
120
|
-
|
119
|
+
def deepen_dict[K1, K2, U](d: dict[tuple[K1, K2], U]) -> dict[K1, dict[K2, U]]: ...
|
121
120
|
@overload
|
122
|
-
def deepen_dict(d: dict[tuple[K1, K2, K3], U]) -> dict[K1, dict[K2, dict[K3, U]]]: ...
|
123
|
-
|
121
|
+
def deepen_dict[K1, K2, K3, U](d: dict[tuple[K1, K2, K3], U]) -> dict[K1, dict[K2, dict[K3, U]]]: ...
|
124
122
|
@overload
|
125
|
-
def deepen_dict(d: dict[tuple[K1, K2, K3, K4], U]) -> dict[K1, dict[K2, dict[K3, dict[K4, U]]]]: ...
|
126
|
-
|
123
|
+
def deepen_dict[K1, K2, K3, K4, U](d: dict[tuple[K1, K2, K3, K4], U]) -> dict[K1, dict[K2, dict[K3, dict[K4, U]]]]: ...
|
127
124
|
@overload
|
128
|
-
def deepen_dict(d: dict[tuple[K1, K2, K3, K4, K5], U]) -> dict[K1, dict[K2, dict[K3, dict[K4, dict[K5, U]]]]]: ...
|
129
|
-
|
125
|
+
def deepen_dict[K1, K2, K3, K4, K5, U](d: dict[tuple[K1, K2, K3, K4, K5], U]) -> dict[K1, dict[K2, dict[K3, dict[K4, dict[K5, U]]]]]: ...
|
130
126
|
@overload
|
131
|
-
def deepen_dict(d: dict[tuple[K1, K2, K3, K4, K5, K6], U]) -> dict[K1, dict[K2, dict[K3, dict[K4, dict[K5, dict[K6, U]]]]]]: ...
|
132
|
-
|
127
|
+
def deepen_dict[K1, K2, K3, K4, K5, K6, U](d: dict[tuple[K1, K2, K3, K4, K5, K6], U]) -> dict[K1, dict[K2, dict[K3, dict[K4, dict[K5, dict[K6, U]]]]]]: ...
|
133
128
|
def deepen_dict(d: dict[tuple[Any, ...], Any]) -> dict:
|
134
129
|
output = {}
|
135
130
|
if () in d:
|
@@ -151,31 +146,31 @@ def flatten_dict_inner(d, prefix=()):
|
|
151
146
|
def flatten_dict(deep_dict: dict, prefix=()) -> dict:
|
152
147
|
return dict(flatten_dict_inner(deep_dict, prefix))
|
153
148
|
|
154
|
-
def group(pairs: Iterable[tuple[K, T]]) -> dict[K, list[T]]:
|
149
|
+
def group[T, K](pairs: Iterable[tuple[K, T]]) -> dict[K, list[T]]:
|
155
150
|
values_by_key = {}
|
156
151
|
for key, value in pairs:
|
157
152
|
values_by_key.setdefault(key, []).append(value)
|
158
153
|
return values_by_key
|
159
154
|
|
160
|
-
def reversed_enumerate(l: list[T] | tuple[T, ...]) -> Iterable[tuple[int, T]]:
|
155
|
+
def reversed_enumerate[T](l: list[T] | tuple[T, ...]) -> Iterable[tuple[int, T]]:
|
161
156
|
return zip(reversed(range(len(l))), reversed(l))
|
162
157
|
|
163
|
-
def get_at(d: dict, keys: Iterable[Any], default: T) -> T:
|
158
|
+
def get_at[T](d: dict, keys: Iterable[Any], default: T) -> T:
|
164
159
|
try:
|
165
160
|
for key in keys:
|
166
161
|
d = d[key]
|
167
162
|
except KeyError:
|
168
163
|
return default
|
169
|
-
return
|
164
|
+
return as_any(d)
|
170
165
|
|
171
|
-
def sized_partitions(values: Iterable[T], part_size: int) -> list[list[T]]:
|
166
|
+
def sized_partitions[T](values: Iterable[T], part_size: int) -> list[list[T]]:
|
172
167
|
# "chunk"
|
173
168
|
if not isinstance(values, list):
|
174
169
|
values = list(values)
|
175
170
|
num_parts = (len(values) / part_size).__ceil__()
|
176
171
|
return [values[i * part_size:(i + 1) * part_size] for i in range(num_parts)]
|
177
172
|
|
178
|
-
def num_partitions(values: Iterable[T], num_parts: int) -> list[list[T]]:
|
173
|
+
def num_partitions[T](values: Iterable[T], num_parts: int) -> list[list[T]]:
|
179
174
|
if not isinstance(values, list):
|
180
175
|
values = list(values)
|
181
176
|
part_size = (len(values) / num_parts).__ceil__()
|
relib-1.2.0/relib/hashing.py
DELETED
@@ -1,255 +0,0 @@
|
|
1
|
-
"""
|
2
|
-
Fast cryptographic hash of Python objects, with a special case for fast
|
3
|
-
hashing of numpy arrays.
|
4
|
-
"""
|
5
|
-
|
6
|
-
# Author: Gael Varoquaux <gael dot varoquaux at normalesup dot org>
|
7
|
-
# Copyright (c) 2009 Gael Varoquaux
|
8
|
-
# License: BSD Style, 3 clauses.
|
9
|
-
|
10
|
-
import pickle
|
11
|
-
import hashlib
|
12
|
-
import sys
|
13
|
-
import types
|
14
|
-
import struct
|
15
|
-
import io
|
16
|
-
import decimal
|
17
|
-
|
18
|
-
Pickler = pickle._Pickler
|
19
|
-
_bytes_or_unicode = (bytes, str)
|
20
|
-
|
21
|
-
|
22
|
-
class _ConsistentSet(object):
|
23
|
-
""" Class used to ensure the hash of Sets is preserved
|
24
|
-
whatever the order of its items.
|
25
|
-
"""
|
26
|
-
def __init__(self, set_sequence):
|
27
|
-
# Forces order of elements in set to ensure consistent hash.
|
28
|
-
try:
|
29
|
-
# Trying first to order the set assuming the type of elements is
|
30
|
-
# consistent and orderable.
|
31
|
-
# This fails on python 3 when elements are unorderable
|
32
|
-
# but we keep it in a try as it's faster.
|
33
|
-
self._sequence = sorted(set_sequence)
|
34
|
-
except (TypeError, decimal.InvalidOperation):
|
35
|
-
# If elements are unorderable, sorting them using their hash.
|
36
|
-
# This is slower but works in any case.
|
37
|
-
self._sequence = sorted((hash(e) for e in set_sequence))
|
38
|
-
|
39
|
-
|
40
|
-
class _MyHash(object):
|
41
|
-
""" Class used to hash objects that won't normally pickle """
|
42
|
-
|
43
|
-
def __init__(self, *args):
|
44
|
-
self.args = args
|
45
|
-
|
46
|
-
|
47
|
-
class Hasher(Pickler):
|
48
|
-
""" A subclass of pickler, to do cryptographic hashing, rather than
|
49
|
-
pickling.
|
50
|
-
"""
|
51
|
-
|
52
|
-
def __init__(self, hash_name='md5'):
|
53
|
-
self.stream = io.BytesIO()
|
54
|
-
# By default we want a pickle protocol that only changes with
|
55
|
-
# the major python version and not the minor one
|
56
|
-
protocol = pickle.HIGHEST_PROTOCOL
|
57
|
-
Pickler.__init__(self, self.stream, protocol=protocol)
|
58
|
-
# Initialise the hash obj
|
59
|
-
self._hash = hashlib.new(hash_name)
|
60
|
-
|
61
|
-
def hash(self, obj, return_digest=True):
|
62
|
-
try:
|
63
|
-
self.dump(obj)
|
64
|
-
except pickle.PicklingError as e:
|
65
|
-
e.args += ('PicklingError while hashing %r: %r' % (obj, e),)
|
66
|
-
raise
|
67
|
-
dumps = self.stream.getvalue()
|
68
|
-
self._hash.update(dumps)
|
69
|
-
if return_digest:
|
70
|
-
return self._hash.hexdigest()
|
71
|
-
|
72
|
-
def save(self, obj):
|
73
|
-
if isinstance(obj, (types.MethodType, type({}.pop))):
|
74
|
-
# the Pickler cannot pickle instance methods; here we decompose
|
75
|
-
# them into components that make them uniquely identifiable
|
76
|
-
if hasattr(obj, '__func__'):
|
77
|
-
func_name = obj.__func__.__name__
|
78
|
-
else:
|
79
|
-
func_name = obj.__name__
|
80
|
-
inst = obj.__self__
|
81
|
-
if type(inst) == type(pickle):
|
82
|
-
obj = _MyHash(func_name, inst.__name__)
|
83
|
-
elif inst is None:
|
84
|
-
# type(None) or type(module) do not pickle
|
85
|
-
obj = _MyHash(func_name, inst)
|
86
|
-
else:
|
87
|
-
cls = obj.__self__.__class__
|
88
|
-
obj = _MyHash(func_name, inst, cls)
|
89
|
-
Pickler.save(self, obj)
|
90
|
-
|
91
|
-
def memoize(self, obj):
|
92
|
-
# We want hashing to be sensitive to value instead of reference.
|
93
|
-
# For example we want ['aa', 'aa'] and ['aa', 'aaZ'[:2]]
|
94
|
-
# to hash to the same value and that's why we disable memoization
|
95
|
-
# for strings
|
96
|
-
if isinstance(obj, _bytes_or_unicode):
|
97
|
-
return
|
98
|
-
Pickler.memoize(self, obj)
|
99
|
-
|
100
|
-
# The dispatch table of the pickler is not accessible in Python
|
101
|
-
# 3, as these lines are only bugware for IPython, we skip them.
|
102
|
-
def save_global(self, obj, name=None, pack=struct.pack):
|
103
|
-
# We have to override this method in order to deal with objects
|
104
|
-
# defined interactively in IPython that are not injected in
|
105
|
-
# __main__
|
106
|
-
kwargs = dict(name=name, pack=pack)
|
107
|
-
if sys.version_info >= (3, 4):
|
108
|
-
del kwargs['pack']
|
109
|
-
try:
|
110
|
-
Pickler.save_global(self, obj, **kwargs)
|
111
|
-
except pickle.PicklingError:
|
112
|
-
Pickler.save_global(self, obj, **kwargs)
|
113
|
-
module = getattr(obj, "__module__", None)
|
114
|
-
if module == '__main__':
|
115
|
-
my_name = name
|
116
|
-
if my_name is None:
|
117
|
-
my_name = obj.__name__
|
118
|
-
mod = sys.modules[module]
|
119
|
-
if not hasattr(mod, my_name):
|
120
|
-
# IPython doesn't inject the variables define
|
121
|
-
# interactively in __main__
|
122
|
-
setattr(mod, my_name, obj)
|
123
|
-
|
124
|
-
dispatch = Pickler.dispatch.copy()
|
125
|
-
# builtin
|
126
|
-
dispatch[type(len)] = save_global
|
127
|
-
# type
|
128
|
-
dispatch[type(object)] = save_global
|
129
|
-
# classobj
|
130
|
-
dispatch[type(Pickler)] = save_global
|
131
|
-
# function
|
132
|
-
dispatch[type(pickle.dump)] = save_global
|
133
|
-
|
134
|
-
def _batch_setitems(self, items):
|
135
|
-
# forces order of keys in dict to ensure consistent hash.
|
136
|
-
try:
|
137
|
-
# Trying first to compare dict assuming the type of keys is
|
138
|
-
# consistent and orderable.
|
139
|
-
# This fails on python 3 when keys are unorderable
|
140
|
-
# but we keep it in a try as it's faster.
|
141
|
-
Pickler._batch_setitems(self, iter(sorted(items)))
|
142
|
-
except TypeError:
|
143
|
-
# If keys are unorderable, sorting them using their hash. This is
|
144
|
-
# slower but works in any case.
|
145
|
-
Pickler._batch_setitems(self, iter(sorted((hash(k), v)
|
146
|
-
for k, v in items)))
|
147
|
-
|
148
|
-
def save_set(self, set_items):
|
149
|
-
# forces order of items in Set to ensure consistent hash
|
150
|
-
Pickler.save(self, _ConsistentSet(set_items))
|
151
|
-
|
152
|
-
dispatch[type(set())] = save_set
|
153
|
-
|
154
|
-
|
155
|
-
class NumpyHasher(Hasher):
|
156
|
-
""" Special case the hasher for when numpy is loaded.
|
157
|
-
"""
|
158
|
-
|
159
|
-
def __init__(self, hash_name='md5', coerce_mmap=False):
|
160
|
-
"""
|
161
|
-
Parameters
|
162
|
-
----------
|
163
|
-
hash_name: string
|
164
|
-
The hash algorithm to be used
|
165
|
-
coerce_mmap: boolean
|
166
|
-
Make no difference between np.memmap and np.ndarray
|
167
|
-
objects.
|
168
|
-
"""
|
169
|
-
self.coerce_mmap = coerce_mmap
|
170
|
-
Hasher.__init__(self, hash_name=hash_name)
|
171
|
-
# delayed import of numpy, to avoid tight coupling
|
172
|
-
import numpy as np
|
173
|
-
self.np = np
|
174
|
-
if hasattr(np, 'getbuffer'):
|
175
|
-
self._getbuffer = np.getbuffer
|
176
|
-
else:
|
177
|
-
self._getbuffer = memoryview
|
178
|
-
|
179
|
-
def save(self, obj):
|
180
|
-
""" Subclass the save method, to hash ndarray subclass, rather
|
181
|
-
than pickling them. Off course, this is a total abuse of
|
182
|
-
the Pickler class.
|
183
|
-
"""
|
184
|
-
if isinstance(obj, self.np.ndarray) and not obj.dtype.hasobject:
|
185
|
-
# Compute a hash of the object
|
186
|
-
# The update function of the hash requires a c_contiguous buffer.
|
187
|
-
if obj.shape == ():
|
188
|
-
# 0d arrays need to be flattened because viewing them as bytes
|
189
|
-
# raises a ValueError exception.
|
190
|
-
obj_c_contiguous = obj.flatten()
|
191
|
-
elif obj.flags.c_contiguous:
|
192
|
-
obj_c_contiguous = obj
|
193
|
-
elif obj.flags.f_contiguous:
|
194
|
-
obj_c_contiguous = obj.T
|
195
|
-
else:
|
196
|
-
# Cater for non-single-segment arrays: this creates a
|
197
|
-
# copy, and thus aleviates this issue.
|
198
|
-
# XXX: There might be a more efficient way of doing this
|
199
|
-
obj_c_contiguous = obj.flatten()
|
200
|
-
|
201
|
-
# memoryview is not supported for some dtypes, e.g. datetime64, see
|
202
|
-
# https://github.com/numpy/numpy/issues/4983. The
|
203
|
-
# workaround is to view the array as bytes before
|
204
|
-
# taking the memoryview.
|
205
|
-
self._hash.update(
|
206
|
-
self._getbuffer(obj_c_contiguous.view(self.np.uint8)))
|
207
|
-
|
208
|
-
# We store the class, to be able to distinguish between
|
209
|
-
# Objects with the same binary content, but different
|
210
|
-
# classes.
|
211
|
-
if self.coerce_mmap and isinstance(obj, self.np.memmap):
|
212
|
-
# We don't make the difference between memmap and
|
213
|
-
# normal ndarrays, to be able to reload previously
|
214
|
-
# computed results with memmap.
|
215
|
-
klass = self.np.ndarray
|
216
|
-
else:
|
217
|
-
klass = obj.__class__
|
218
|
-
# We also return the dtype and the shape, to distinguish
|
219
|
-
# different views on the same data with different dtypes.
|
220
|
-
|
221
|
-
# The object will be pickled by the pickler hashed at the end.
|
222
|
-
obj = (klass, ('HASHED', obj.dtype, obj.shape, obj.strides))
|
223
|
-
elif isinstance(obj, self.np.dtype):
|
224
|
-
# Atomic dtype objects are interned by their default constructor:
|
225
|
-
# np.dtype('f8') is np.dtype('f8')
|
226
|
-
# This interning is not maintained by a
|
227
|
-
# pickle.loads + pickle.dumps cycle, because __reduce__
|
228
|
-
# uses copy=True in the dtype constructor. This
|
229
|
-
# non-deterministic behavior causes the internal memoizer
|
230
|
-
# of the hasher to generate different hash values
|
231
|
-
# depending on the history of the dtype object.
|
232
|
-
# To prevent the hash from being sensitive to this, we use
|
233
|
-
# .descr which is a full (and never interned) description of
|
234
|
-
# the array dtype according to the numpy doc.
|
235
|
-
klass = obj.__class__
|
236
|
-
obj = (klass, ('HASHED', obj.descr))
|
237
|
-
Hasher.save(self, obj)
|
238
|
-
|
239
|
-
|
240
|
-
def hash(obj, hash_name='md5', coerce_mmap=False) -> str:
|
241
|
-
""" Quick calculation of a hash to identify uniquely Python objects
|
242
|
-
containing numpy arrays.
|
243
|
-
Parameters
|
244
|
-
-----------
|
245
|
-
hash_name: 'md5' or 'sha1'
|
246
|
-
Hashing algorithm used. sha1 is supposedly safer, but md5 is
|
247
|
-
faster.
|
248
|
-
coerce_mmap: boolean
|
249
|
-
Make no difference between np.memmap and np.ndarray
|
250
|
-
"""
|
251
|
-
if 'numpy' in sys.modules:
|
252
|
-
hasher = NumpyHasher(hash_name=hash_name, coerce_mmap=coerce_mmap)
|
253
|
-
else:
|
254
|
-
hasher = Hasher(hash_name=hash_name)
|
255
|
-
return hasher.hash(obj)
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|