datasketch 1.10.0__tar.gz → 2.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datasketch-1.10.0 → datasketch-2.0.0}/PKG-INFO +12 -1
- {datasketch-1.10.0 → datasketch-2.0.0}/README.rst +11 -0
- {datasketch-1.10.0 → datasketch-2.0.0}/datasketch/__init__.py +2 -1
- {datasketch-1.10.0 → datasketch-2.0.0}/datasketch/aio/lsh.py +8 -0
- {datasketch-1.10.0 → datasketch-2.0.0}/datasketch/b_bit_minhash.py +64 -11
- {datasketch-1.10.0 → datasketch-2.0.0}/datasketch/hyperloglog.py +1 -1
- datasketch-2.0.0/datasketch/lean_minhash.py +359 -0
- {datasketch-1.10.0 → datasketch-2.0.0}/datasketch/lsh.py +17 -1
- {datasketch-1.10.0 → datasketch-2.0.0}/datasketch/lsh_bloom.py +18 -8
- {datasketch-1.10.0 → datasketch-2.0.0}/datasketch/lshensemble.py +5 -0
- {datasketch-1.10.0 → datasketch-2.0.0}/datasketch/lshforest.py +26 -6
- {datasketch-1.10.0 → datasketch-2.0.0}/datasketch/minhash.py +314 -56
- {datasketch-1.10.0 → datasketch-2.0.0}/datasketch/weighted_minhash.py +2 -0
- {datasketch-1.10.0 → datasketch-2.0.0}/pyproject.toml +1 -1
- datasketch-1.10.0/datasketch/lean_minhash.py +0 -253
- {datasketch-1.10.0 → datasketch-2.0.0}/.gitignore +0 -0
- {datasketch-1.10.0 → datasketch-2.0.0}/LICENSE +0 -0
- {datasketch-1.10.0 → datasketch-2.0.0}/datasketch/aio/__init__.py +0 -0
- {datasketch-1.10.0 → datasketch-2.0.0}/datasketch/aio/storage.py +0 -0
- {datasketch-1.10.0 → datasketch-2.0.0}/datasketch/experimental/__init__.py +0 -0
- {datasketch-1.10.0 → datasketch-2.0.0}/datasketch/experimental/aio/__init__.py +0 -0
- {datasketch-1.10.0 → datasketch-2.0.0}/datasketch/experimental/aio/lsh.py +0 -0
- {datasketch-1.10.0 → datasketch-2.0.0}/datasketch/hashfunc.py +0 -0
- {datasketch-1.10.0 → datasketch-2.0.0}/datasketch/hnsw.py +0 -0
- {datasketch-1.10.0 → datasketch-2.0.0}/datasketch/hyperloglog_const.py +0 -0
- {datasketch-1.10.0 → datasketch-2.0.0}/datasketch/lshensemble_partition.py +0 -0
- {datasketch-1.10.0 → datasketch-2.0.0}/datasketch/storage.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datasketch
|
|
3
|
-
Version: 1.10.0
|
|
3
|
+
Version: 2.0.0
|
|
4
4
|
Summary: Probabilistic data structures for processing and searching very large datasets
|
|
5
5
|
Project-URL: Homepage, https://ekzhu.github.io/datasketch
|
|
6
6
|
Project-URL: Bug Tracker, https://github.com/ekzhu/datasketch/issues
|
|
@@ -80,6 +80,17 @@ datasketch gives you probabilistic data structures that can process and
|
|
|
80
80
|
search very large amount of data super fast, with little loss of
|
|
81
81
|
accuracy.
|
|
82
82
|
|
|
83
|
+
.. note::
|
|
84
|
+
**Version 2.0.0** changes the default MinHash permutation scheme to
|
|
85
|
+
``"affine32"``, which fixes a similarity over-estimation bias on large
|
|
86
|
+
sets (`issue #212 <https://github.com/ekzhu/datasketch/issues/212>`__),
|
|
87
|
+
halves sketch memory, and speeds up updates by roughly 4x. A 64-bit
|
|
88
|
+
``"affine64"`` scheme is available for billion-scale sets. Hash values
|
|
89
|
+
differ from earlier versions: rebuild persisted sketches and LSH
|
|
90
|
+
indexes, or pass ``MinHash(..., scheme="legacy")`` to interoperate with
|
|
91
|
+
existing data. See the `MinHash documentation
|
|
92
|
+
<https://ekzhu.github.io/datasketch/minhash.html>`__ for details.
|
|
93
|
+
|
|
83
94
|
This package contains the following data sketches:
|
|
84
95
|
|
|
85
96
|
+-------------------------+-----------------------------------------------+
|
|
@@ -14,6 +14,17 @@ datasketch gives you probabilistic data structures that can process and
|
|
|
14
14
|
search very large amount of data super fast, with little loss of
|
|
15
15
|
accuracy.
|
|
16
16
|
|
|
17
|
+
.. note::
|
|
18
|
+
**Version 2.0.0** changes the default MinHash permutation scheme to
|
|
19
|
+
``"affine32"``, which fixes a similarity over-estimation bias on large
|
|
20
|
+
sets (`issue #212 <https://github.com/ekzhu/datasketch/issues/212>`__),
|
|
21
|
+
halves sketch memory, and speeds up updates by roughly 4x. A 64-bit
|
|
22
|
+
``"affine64"`` scheme is available for billion-scale sets. Hash values
|
|
23
|
+
differ from earlier versions: rebuild persisted sketches and LSH
|
|
24
|
+
indexes, or pass ``MinHash(..., scheme="legacy")`` to interoperate with
|
|
25
|
+
existing data. See the `MinHash documentation
|
|
26
|
+
<https://ekzhu.github.io/datasketch/minhash.html>`__ for details.
|
|
27
|
+
|
|
17
28
|
This package contains the following data sketches:
|
|
18
29
|
|
|
19
30
|
+-------------------------+-----------------------------------------------+
|
|
@@ -9,7 +9,7 @@ __version__: Final[str] = _version
|
|
|
9
9
|
|
|
10
10
|
from datasketch.aio import AsyncMinHashLSH # Instantiation requires motor/redis.asyncio; import itself is always safe.
|
|
11
11
|
from datasketch.b_bit_minhash import bBitMinHash
|
|
12
|
-
from datasketch.hashfunc import sha1_hash32
|
|
12
|
+
from datasketch.hashfunc import sha1_hash32, sha1_hash64
|
|
13
13
|
from datasketch.hnsw import HNSW
|
|
14
14
|
from datasketch.hyperloglog import HyperLogLog, HyperLogLogPlusPlus
|
|
15
15
|
from datasketch.lean_minhash import LeanMinHash
|
|
@@ -41,4 +41,5 @@ __all__ = [
|
|
|
41
41
|
"WeightedMinHashLSHForest",
|
|
42
42
|
"bBitMinHash",
|
|
43
43
|
"sha1_hash32",
|
|
44
|
+
"sha1_hash64",
|
|
44
45
|
]
|
|
@@ -14,6 +14,7 @@ from datasketch.aio.storage import (
|
|
|
14
14
|
async_unordered_storage,
|
|
15
15
|
)
|
|
16
16
|
from datasketch.lsh import _optimal_param
|
|
17
|
+
from datasketch.minhash import _check_scheme_consistency
|
|
17
18
|
from datasketch.storage import _random_name, unordered_storage
|
|
18
19
|
|
|
19
20
|
|
|
@@ -88,6 +89,10 @@ class AsyncMinHashLSH:
|
|
|
88
89
|
self.hashranges = [(i * self.r, (i + 1) * self.r) for i in range(self.b)]
|
|
89
90
|
self.hashtables = None
|
|
90
91
|
self.keys = None
|
|
92
|
+
# The permutation scheme of the indexed MinHash, learned from the
|
|
93
|
+
# first insert. Note that an index attached to pre-existing external
|
|
94
|
+
# storage re-learns the scheme on its first insert.
|
|
95
|
+
self._minhash_scheme: Optional[str] = None
|
|
91
96
|
|
|
92
97
|
self._lock = asyncio.Lock()
|
|
93
98
|
self._initialized = False
|
|
@@ -248,6 +253,7 @@ class AsyncMinHashLSH:
|
|
|
248
253
|
async def _insert(self, key, minhash, check_duplication=True, buffer=False):
|
|
249
254
|
if len(minhash) != self.h:
|
|
250
255
|
raise ValueError("Expecting minhash with length %d, got %d" % (self.h, len(minhash)))
|
|
256
|
+
self._minhash_scheme = _check_scheme_consistency(getattr(self, "_minhash_scheme", None), minhash)
|
|
251
257
|
if self._require_bytes_keys and not isinstance(key, bytes):
|
|
252
258
|
raise TypeError(
|
|
253
259
|
f"prepickle=False requires bytes keys for non-dict storage, got {type(key).__name__}. "
|
|
@@ -272,6 +278,7 @@ class AsyncMinHashLSH:
|
|
|
272
278
|
"""See :class:`datasketch.MinHashLSH`."""
|
|
273
279
|
if len(minhash) != self.h:
|
|
274
280
|
raise ValueError("Expecting minhash with length %d, got %d" % (self.h, len(minhash)))
|
|
281
|
+
_check_scheme_consistency(getattr(self, "_minhash_scheme", None), minhash)
|
|
275
282
|
|
|
276
283
|
fs = (
|
|
277
284
|
hashtable.get(self._H(minhash.hashvalues[start:end]))
|
|
@@ -322,6 +329,7 @@ class AsyncMinHashLSH:
|
|
|
322
329
|
async def _query_b(self, minhash, b):
|
|
323
330
|
if len(minhash) != self.h:
|
|
324
331
|
raise ValueError("Expecting minhash with length %d, got %d" % (self.h, len(minhash)))
|
|
332
|
+
_check_scheme_consistency(getattr(self, "_minhash_scheme", None), minhash)
|
|
325
333
|
if b > len(self.hashtables):
|
|
326
334
|
raise ValueError("b must be less or equal to the number of hash tables")
|
|
327
335
|
fs = []
|
|
@@ -9,16 +9,37 @@ import struct
|
|
|
9
9
|
|
|
10
10
|
import numpy as np
|
|
11
11
|
|
|
12
|
+
from datasketch.minhash import _SCHEME_CODES, _SCHEME_CODES_INV, _SCHEME_LEGACY, _VALID_SCHEMES
|
|
13
|
+
|
|
12
14
|
|
|
13
15
|
class bBitMinHash:
|
|
14
|
-
"""
|
|
16
|
+
"""b-bit MinHash of an existing :class:`datasketch.MinHash`.
|
|
17
|
+
|
|
18
|
+
b-bit MinHash reduces storage by keeping only the lowest `b` bits of
|
|
19
|
+
each minimum hash value, at some loss of accuracy. It supports
|
|
20
|
+
:meth:`jaccard` and pickle serialization, but cannot be updated with
|
|
21
|
+
new values and cannot be used with the LSH indexes. See `b-Bit Minwise
|
|
22
|
+
Hashing <http://research.microsoft.com/pubs/120078/wfc0398-liPS.pdf>`_
|
|
23
|
+
by Ping Li and Arnd Christian König.
|
|
24
|
+
|
|
25
|
+
Args:
|
|
26
|
+
minhash (datasketch.MinHash): The MinHash to compress.
|
|
27
|
+
b (int): The number of lowest bits to keep for each minimum hash
|
|
28
|
+
value, between 0 and 32.
|
|
29
|
+
r (float): The expected ratio of set size to the size of the
|
|
30
|
+
universe of all values, used by the Jaccard estimator. Leave
|
|
31
|
+
at 0.0 if unknown: the estimator then uses the limit as the
|
|
32
|
+
ratio goes to zero.
|
|
15
33
|
|
|
16
|
-
|
|
34
|
+
"""
|
|
35
|
+
|
|
36
|
+
__slots__ = ("b", "hashvalues", "r", "scheme", "seed")
|
|
17
37
|
|
|
18
38
|
# seed as int64
|
|
19
39
|
# b as uint8
|
|
20
40
|
# r as float64
|
|
21
|
-
# num_perm as int32
|
|
41
|
+
# num_perm as int32 (negated and followed by a scheme code byte for
|
|
42
|
+
# non-legacy permutation schemes; legacy payloads predate the field)
|
|
22
43
|
_serial_fmt_params = "<qBdi"
|
|
23
44
|
# each block as uint64
|
|
24
45
|
_serial_fmt_block = "Q"
|
|
@@ -37,6 +58,11 @@ class bBitMinHash:
|
|
|
37
58
|
bmask = (1 << b) - 1
|
|
38
59
|
self.hashvalues = np.bitwise_and(minhash.hashvalues, bmask).astype(np.uint32)
|
|
39
60
|
self.seed = minhash.seed
|
|
61
|
+
# Requiring the attribute (rather than assuming a default) keeps a
|
|
62
|
+
# sketch type without a scheme from being silently mislabeled.
|
|
63
|
+
self.scheme = minhash.scheme
|
|
64
|
+
if self.scheme not in _VALID_SCHEMES:
|
|
65
|
+
raise ValueError("scheme must be one of %s, got %r" % (", ".join(_VALID_SCHEMES), self.scheme))
|
|
40
66
|
self.b = b
|
|
41
67
|
self.r = r
|
|
42
68
|
|
|
@@ -44,6 +70,7 @@ class bBitMinHash:
|
|
|
44
70
|
"""Check for full equality of two b-bit MinHash objects."""
|
|
45
71
|
return (
|
|
46
72
|
type(self) is type(other)
|
|
73
|
+
and self.scheme == other.scheme
|
|
47
74
|
and self.seed == other.seed
|
|
48
75
|
and self.b == other.b
|
|
49
76
|
and self.r == other.r
|
|
@@ -56,13 +83,15 @@ class bBitMinHash:
|
|
|
56
83
|
"""
|
|
57
84
|
if self.b != other.b:
|
|
58
85
|
raise ValueError(
|
|
59
|
-
"Cannot compare two b-bit MinHashes with different
|
|
60
|
-
|
|
86
|
+
"Cannot compare two b-bit MinHashes with different b values"
|
|
87
|
+
)
|
|
88
|
+
if self.scheme != other.scheme:
|
|
89
|
+
raise ValueError(
|
|
90
|
+
"Cannot compare two b-bit MinHashes with different permutation schemes"
|
|
61
91
|
)
|
|
62
92
|
if self.seed != other.seed:
|
|
63
93
|
raise ValueError(
|
|
64
|
-
"Cannot compare two b-bit MinHashes with different
|
|
65
|
-
set of permutations"
|
|
94
|
+
"Cannot compare two b-bit MinHashes with different set of permutations"
|
|
66
95
|
)
|
|
67
96
|
intersection = np.count_nonzero(self.hashvalues == other.hashvalues)
|
|
68
97
|
raw_est = float(intersection) / float(self.hashvalues.size)
|
|
@@ -75,6 +104,13 @@ class bBitMinHash:
|
|
|
75
104
|
"""Get the serialized size of this b-bit MinHash in number of bytes."""
|
|
76
105
|
return self._bytesize()[-1]
|
|
77
106
|
|
|
107
|
+
def _params_fmt(self):
|
|
108
|
+
"""The struct format of the parameter header for this scheme."""
|
|
109
|
+
if self.scheme == _SCHEME_LEGACY:
|
|
110
|
+
return self._serial_fmt_params
|
|
111
|
+
# Non-legacy schemes append a scheme code byte to the parameters.
|
|
112
|
+
return self._serial_fmt_params + "B"
|
|
113
|
+
|
|
78
114
|
def __getstate__(self):
|
|
79
115
|
"""Called when pickling the b-bit MinHash object.
|
|
80
116
|
Returns a bytearray which will then be pickled.
|
|
@@ -96,8 +132,14 @@ class bBitMinHash:
|
|
|
96
132
|
# Doing this in BigInteger guarantees we do not experience overflow and still
|
|
97
133
|
# coerces to np.uint64 as expected.
|
|
98
134
|
blocks[i] = int(blocks[i]) | (int(hv) << (n - 1 - j) * slot_size)
|
|
99
|
-
fmt = self._serial_fmt_params + "%d%s" % (num_blocks, self._serial_fmt_block)
|
|
100
|
-
|
|
135
|
+
fmt = self._params_fmt() + "%d%s" % (num_blocks, self._serial_fmt_block)
|
|
136
|
+
if self.scheme == _SCHEME_LEGACY:
|
|
137
|
+
struct.pack_into(fmt, buffer, 0, self.seed, self.b, self.r, self.hashvalues.size, *blocks)
|
|
138
|
+
else:
|
|
139
|
+
# A negated size marks the post-2.0.0 format carrying a scheme code.
|
|
140
|
+
struct.pack_into(
|
|
141
|
+
fmt, buffer, 0, self.seed, self.b, self.r, -self.hashvalues.size, _SCHEME_CODES[self.scheme], *blocks
|
|
142
|
+
)
|
|
101
143
|
return buffer
|
|
102
144
|
|
|
103
145
|
def __setstate__(self, buf):
|
|
@@ -107,8 +149,19 @@ class bBitMinHash:
|
|
|
107
149
|
try:
|
|
108
150
|
self.seed, self.b, self.r, num_perm = struct.unpack_from(self._serial_fmt_params, buf, 0)
|
|
109
151
|
except TypeError:
|
|
110
|
-
|
|
152
|
+
buf = memoryview(buf)
|
|
153
|
+
self.seed, self.b, self.r, num_perm = struct.unpack_from(self._serial_fmt_params, buf, 0)
|
|
111
154
|
offset = struct.calcsize(self._serial_fmt_params)
|
|
155
|
+
if num_perm >= 0:
|
|
156
|
+
# Payloads from before version 2.0.0 have no scheme field.
|
|
157
|
+
self.scheme = _SCHEME_LEGACY
|
|
158
|
+
else:
|
|
159
|
+
num_perm = -num_perm
|
|
160
|
+
(scheme_code,) = struct.unpack_from("<B", buf, offset)
|
|
161
|
+
if scheme_code not in _SCHEME_CODES_INV:
|
|
162
|
+
raise ValueError("Unknown permutation scheme code: %d" % scheme_code)
|
|
163
|
+
self.scheme = _SCHEME_CODES_INV[scheme_code]
|
|
164
|
+
offset += 1
|
|
112
165
|
self.hashvalues = np.zeros((num_perm,), dtype=np.uint32)
|
|
113
166
|
# Reconstruct the hash values
|
|
114
167
|
slot_size, n, num_blocks, _total = self._bytesize()
|
|
@@ -168,5 +221,5 @@ class bBitMinHash:
|
|
|
168
221
|
# Get the number of blocks required
|
|
169
222
|
num_blocks = int(np.ceil(float(self.hashvalues.size) / num_slots_per_block))
|
|
170
223
|
# Get the total serialized size
|
|
171
|
-
total = struct.calcsize(self._serial_fmt_params + "%d%s" % (num_blocks, self._serial_fmt_block))
|
|
224
|
+
total = struct.calcsize(self._params_fmt() + "%d%s" % (num_blocks, self._serial_fmt_block))
|
|
172
225
|
return slot_size, num_slots_per_block, num_blocks, total
|
|
@@ -265,7 +265,7 @@ class HyperLogLog:
|
|
|
265
265
|
different precisions"
|
|
266
266
|
)
|
|
267
267
|
reg = np.maximum.reduce([h.reg for h in hyperloglogs])
|
|
268
|
-
return cls(reg=reg)
|
|
268
|
+
return cls(reg=reg, hashfunc=hyperloglogs[0].hashfunc)
|
|
269
269
|
|
|
270
270
|
def bytesize(self) -> int:
|
|
271
271
|
"""Get the size of the HyperLogLog in bytes."""
|
|
@@ -0,0 +1,359 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import struct
|
|
4
|
+
from collections.abc import Iterable
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
|
|
9
|
+
from datasketch.minhash import (
|
|
10
|
+
_SCHEME_AFFINE32,
|
|
11
|
+
_SCHEME_AFFINE64,
|
|
12
|
+
_SCHEME_CODES,
|
|
13
|
+
_SCHEME_CODES_INV,
|
|
14
|
+
_SCHEME_LEGACY,
|
|
15
|
+
_VALID_SCHEMES,
|
|
16
|
+
MinHash,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
# Byte-format notes: legacy payloads have no scheme field and are identified
|
|
20
|
+
# by a non-negative number-of-hash-values field, while the affine formats
|
|
21
|
+
# store the negated number followed by a scheme code byte. This keeps legacy
|
|
22
|
+
# sketches bit-identical to (and readable by) versions before 2.0.0.
|
|
23
|
+
# struct format character of one hash value, per scheme.
|
|
24
|
+
_SCHEME_VALUE_FMTS = {
|
|
25
|
+
_SCHEME_LEGACY: "I",
|
|
26
|
+
_SCHEME_AFFINE32: "I",
|
|
27
|
+
_SCHEME_AFFINE64: "Q",
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class LeanMinHash(MinHash):
|
|
32
|
+
"""Lean MinHash is MinHash with a smaller memory footprint
|
|
33
|
+
and faster deserialization, but with its internal state frozen
|
|
34
|
+
-- no `update()`.
|
|
35
|
+
|
|
36
|
+
Lean MinHash inherits all methods from :class:`datasketch.MinHash`.
|
|
37
|
+
It does not store the `permutations` and the `hashfunc` needed for updating.
|
|
38
|
+
If a MinHash does not need further updates, convert it into a lean MinHash
|
|
39
|
+
to save memory.
|
|
40
|
+
|
|
41
|
+
Example:
|
|
42
|
+
To create a lean MinHash from an existing MinHash:
|
|
43
|
+
|
|
44
|
+
.. code-block:: python
|
|
45
|
+
|
|
46
|
+
lean_minhash = LeanMinHash(minhash)
|
|
47
|
+
|
|
48
|
+
# You can compute the Jaccard similarity between two lean MinHash
|
|
49
|
+
lean_minhash.jaccard(lean_minhash2)
|
|
50
|
+
|
|
51
|
+
# Or between a lean MinHash and a MinHash
|
|
52
|
+
lean_minhash.jaccard(minhash2)
|
|
53
|
+
|
|
54
|
+
To create a lean MinHash from the hash values, seed, and scheme of an
|
|
55
|
+
existing MinHash:
|
|
56
|
+
|
|
57
|
+
.. code-block:: python
|
|
58
|
+
|
|
59
|
+
lean_minhash = LeanMinHash(
|
|
60
|
+
seed=minhash.seed,
|
|
61
|
+
hashvalues=minhash.hashvalues,
|
|
62
|
+
scheme=minhash.scheme,
|
|
63
|
+
)
|
|
64
|
+
|
|
65
|
+
To create a MinHash from a lean MinHash:
|
|
66
|
+
|
|
67
|
+
.. code-block:: python
|
|
68
|
+
|
|
69
|
+
minhash = MinHash(
|
|
70
|
+
seed=lean_minhash.seed,
|
|
71
|
+
hashvalues=lean_minhash.hashvalues,
|
|
72
|
+
scheme=lean_minhash.scheme,
|
|
73
|
+
)
|
|
74
|
+
|
|
75
|
+
# Or if you want to prevent further updates on minhash
|
|
76
|
+
# from affecting the state of lean_minhash
|
|
77
|
+
minhash = MinHash(
|
|
78
|
+
seed=lean_minhash.seed,
|
|
79
|
+
hashvalues=lean_minhash.digest(),
|
|
80
|
+
scheme=lean_minhash.scheme,
|
|
81
|
+
)
|
|
82
|
+
|
|
83
|
+
Note:
|
|
84
|
+
Lean MinHash can also be used in :class:`datasketch.MinHashLSH`,
|
|
85
|
+
:class:`datasketch.MinHashLSHForest`, and :class:`datasketch.MinHashLSHEnsemble`.
|
|
86
|
+
|
|
87
|
+
Args:
|
|
88
|
+
minhash (optional): The :class:`datasketch.MinHash` object used to
|
|
89
|
+
initialize the LeanMinHash. If this is not set, then `seed`
|
|
90
|
+
and `hashvalues` must be set.
|
|
91
|
+
seed (optional): The random seed that controls the set of random
|
|
92
|
+
permutation functions generated for this LeanMinHash. This parameter
|
|
93
|
+
must be used together with `hashvalues`.
|
|
94
|
+
hashvalues (optional): The hash values used to initialize the state
|
|
95
|
+
of the LeanMinHash. This parameter must be used together with
|
|
96
|
+
`seed`.
|
|
97
|
+
scheme (optional): The permutation scheme of the MinHash the
|
|
98
|
+
`hashvalues` were taken from. Required when initializing from
|
|
99
|
+
`seed` and `hashvalues` (use ``"legacy"`` for hash values created
|
|
100
|
+
by datasketch before 2.0.0), because hash values carry no trace
|
|
101
|
+
of the scheme that produced them. When `minhash` is set the
|
|
102
|
+
scheme is taken from the MinHash object instead, and this
|
|
103
|
+
argument may only repeat it.
|
|
104
|
+
|
|
105
|
+
"""
|
|
106
|
+
|
|
107
|
+
__slots__ = ("hashvalues", "scheme", "seed")
|
|
108
|
+
|
|
109
|
+
def _initialize_slots(self, seed, hashvalues, scheme=_SCHEME_LEGACY):
|
|
110
|
+
"""Initialize the slots of the LeanMinHash.
|
|
111
|
+
|
|
112
|
+
Args:
|
|
113
|
+
seed (int): The random seed controls the set of random
|
|
114
|
+
permutation functions generated for this LeanMinHash.
|
|
115
|
+
hashvalues (Iterable): The hash values is the internal state of the LeanMinHash.
|
|
116
|
+
scheme (str): The permutation scheme of the hash values.
|
|
117
|
+
|
|
118
|
+
"""
|
|
119
|
+
if scheme not in _VALID_SCHEMES:
|
|
120
|
+
raise ValueError("scheme must be one of %s, got %r" % (", ".join(_VALID_SCHEMES), scheme))
|
|
121
|
+
self.seed = seed
|
|
122
|
+
self.scheme = scheme
|
|
123
|
+
self.hashvalues = self._parse_hashvalues(hashvalues)
|
|
124
|
+
if scheme != _SCHEME_LEGACY and len(self.hashvalues) == 0:
|
|
125
|
+
# An empty sketch would serialize with a hash value count of 0,
|
|
126
|
+
# which the deserializer cannot tell apart from the legacy format
|
|
127
|
+
# (identified by a non-negative count).
|
|
128
|
+
raise ValueError("hashvalues must not be empty")
|
|
129
|
+
|
|
130
|
+
def __init__(
|
|
131
|
+
self,
|
|
132
|
+
minhash: MinHash = None,
|
|
133
|
+
seed: Optional[int] = None,
|
|
134
|
+
hashvalues: Optional[Iterable] = None,
|
|
135
|
+
scheme: Optional[str] = None,
|
|
136
|
+
):
|
|
137
|
+
if minhash is not None:
|
|
138
|
+
if scheme is not None and scheme != minhash.scheme:
|
|
139
|
+
raise ValueError(
|
|
140
|
+
"scheme %r conflicts with the scheme %r of the given MinHash" % (scheme, minhash.scheme)
|
|
141
|
+
)
|
|
142
|
+
self._initialize_slots(minhash.seed, minhash.hashvalues, minhash.scheme)
|
|
143
|
+
elif hashvalues is not None and seed is not None:
|
|
144
|
+
if scheme is None:
|
|
145
|
+
# Hash values carry no trace of the scheme that produced
|
|
146
|
+
# them, so a default here would silently mislabel pre-2.0.0
|
|
147
|
+
# values and defeat the cross-scheme comparison guards.
|
|
148
|
+
raise ValueError(
|
|
149
|
+
"scheme must be specified explicitly when initializing from existing "
|
|
150
|
+
"hash values: pass the scheme of the MinHash they came from, or "
|
|
151
|
+
"scheme='legacy' for hash values created by datasketch before 2.0.0."
|
|
152
|
+
)
|
|
153
|
+
self._initialize_slots(seed, hashvalues, scheme)
|
|
154
|
+
else:
|
|
155
|
+
raise ValueError(
|
|
156
|
+
"Init parameters cannot be None: make sure to set either minhash or both of hash values and seed"
|
|
157
|
+
)
|
|
158
|
+
|
|
159
|
+
def update(self, b) -> None:
|
|
160
|
+
"""Not available on a LeanMinHash.
|
|
161
|
+
Calling it raises a TypeError.
|
|
162
|
+
"""
|
|
163
|
+
raise TypeError("Cannot update a LeanMinHash")
|
|
164
|
+
|
|
165
|
+
def copy(self) -> LeanMinHash:
|
|
166
|
+
lmh = object.__new__(LeanMinHash)
|
|
167
|
+
lmh._initialize_slots(self.seed, self.hashvalues, self.scheme)
|
|
168
|
+
return lmh
|
|
169
|
+
|
|
170
|
+
def _value_fmt(self) -> str:
|
|
171
|
+
return _SCHEME_VALUE_FMTS[self.scheme]
|
|
172
|
+
|
|
173
|
+
def bytesize(self, byteorder="@") -> int:
|
|
174
|
+
"""Compute the byte size after serialization.
|
|
175
|
+
|
|
176
|
+
Args:
|
|
177
|
+
byteorder (str, optional): This is byte order of the serialized data. Use one
|
|
178
|
+
of the `byte order characters
|
|
179
|
+
<https://docs.python.org/3/library/struct.html#byte-order-size-and-alignment>`_:
|
|
180
|
+
``@``, ``=``, ``<``, ``>``, and ``!``.
|
|
181
|
+
Default is ``@`` -- the native order.
|
|
182
|
+
|
|
183
|
+
Returns:
|
|
184
|
+
int: Size in number of bytes after serialization.
|
|
185
|
+
|
|
186
|
+
"""
|
|
187
|
+
if self.scheme == _SCHEME_LEGACY:
|
|
188
|
+
# 8 bytes for the seed, 4 bytes for the number of hash values,
|
|
189
|
+
# and 4 bytes for each hash value.
|
|
190
|
+
return struct.calcsize("%sqi%dI" % (byteorder, len(self)))
|
|
191
|
+
# The affine formats add a 1-byte scheme code after the number of
|
|
192
|
+
# hash values, and store each hash value in 4 ("affine32") or
|
|
193
|
+
# 8 ("affine64") bytes.
|
|
194
|
+
return struct.calcsize("%sqiB%d%s" % (byteorder, len(self), self._value_fmt()))
|
|
195
|
+
|
|
196
|
+
def serialize(self, buf, byteorder="@") -> None:
|
|
197
|
+
"""Serialize this lean MinHash and store the result in an allocated buffer.
|
|
198
|
+
|
|
199
|
+
Args:
|
|
200
|
+
buf (buffer): `buf` must implement the `buffer`_ interface.
|
|
201
|
+
One such example is the built-in `bytearray`_ class.
|
|
202
|
+
byteorder (str, optional): This is byte order of the serialized data. Use one
|
|
203
|
+
of the `byte order characters
|
|
204
|
+
<https://docs.python.org/3/library/struct.html#byte-order-size-and-alignment>`_:
|
|
205
|
+
``@``, ``=``, ``<``, ``>``, and ``!``.
|
|
206
|
+
Default is ``@`` -- the native order.
|
|
207
|
+
|
|
208
|
+
This is preferred over using `pickle`_ if the serialized lean MinHash needs
|
|
209
|
+
to be used by another program in a different programming language.
|
|
210
|
+
|
|
211
|
+
The serialization schema for the ``"legacy"`` scheme (identical to
|
|
212
|
+
versions before 2.0.0):
|
|
213
|
+
1. The first 8 bytes is the seed integer
|
|
214
|
+
2. The next 4 bytes is the number of hash values
|
|
215
|
+
3. The rest is the serialized hash values, each uses 4 bytes
|
|
216
|
+
|
|
217
|
+
The serialization schema for the affine schemes:
|
|
218
|
+
1. The first 8 bytes is the seed integer
|
|
219
|
+
2. The next 4 bytes is the **negated** number of hash values
|
|
220
|
+
(a negative value marks the post-2.0.0 format)
|
|
221
|
+
3. The next byte is the scheme code (1 for ``"affine32"``,
|
|
222
|
+
2 for ``"affine64"``)
|
|
223
|
+
4. The rest is the serialized hash values, each uses 4 bytes
|
|
224
|
+
for ``"affine32"`` and 8 bytes for ``"affine64"``
|
|
225
|
+
|
|
226
|
+
Example:
|
|
227
|
+
To serialize a single lean MinHash into a `bytearray`_ buffer.
|
|
228
|
+
|
|
229
|
+
.. code-block:: python
|
|
230
|
+
|
|
231
|
+
buf = bytearray(lean_minhash.bytesize())
|
|
232
|
+
lean_minhash.serialize(buf)
|
|
233
|
+
|
|
234
|
+
To serialize multiple lean MinHash into a `bytearray`_ buffer.
|
|
235
|
+
|
|
236
|
+
.. code-block:: python
|
|
237
|
+
|
|
238
|
+
# assuming lean_minhashs is a list of LeanMinHash with the same size
|
|
239
|
+
size = lean_minhashs[0].bytesize()
|
|
240
|
+
buf = bytearray(size * len(lean_minhashs))
|
|
241
|
+
for i, lean_minhash in enumerate(lean_minhashs):
|
|
242
|
+
lean_minhash.serialize(buf[i * size :])
|
|
243
|
+
|
|
244
|
+
.. _`buffer`: https://docs.python.org/3/c-api/buffer.html
|
|
245
|
+
.. _`bytearray`: https://docs.python.org/3.6/library/functions.html#bytearray
|
|
246
|
+
.. _`byteorder`: https://docs.python.org/3/library/struct.html
|
|
247
|
+
|
|
248
|
+
"""
|
|
249
|
+
if len(buf) < self.bytesize(byteorder):
|
|
250
|
+
raise ValueError(
|
|
251
|
+
"The buffer does not have enough space for holding this MinHash."
|
|
252
|
+
)
|
|
253
|
+
if self.scheme == _SCHEME_LEGACY:
|
|
254
|
+
fmt = "%sqi%dI" % (byteorder, len(self))
|
|
255
|
+
struct.pack_into(fmt, buf, 0, self.seed, len(self), *self.hashvalues)
|
|
256
|
+
else:
|
|
257
|
+
fmt = "%sqiB%d%s" % (byteorder, len(self), self._value_fmt())
|
|
258
|
+
struct.pack_into(fmt, buf, 0, self.seed, -len(self), _SCHEME_CODES[self.scheme], *self.hashvalues)
|
|
259
|
+
|
|
260
|
+
@classmethod
|
|
261
|
+
def deserialize(cls, buf, byteorder="@") -> LeanMinHash:
|
|
262
|
+
"""Deserialize a lean MinHash from a buffer.
|
|
263
|
+
|
|
264
|
+
Buffers written by versions before 2.0.0 (which had no scheme field)
|
|
265
|
+
deserialize with ``scheme="legacy"``.
|
|
266
|
+
|
|
267
|
+
Args:
|
|
268
|
+
buf (buffer): `buf` must implement the `buffer`_ interface.
|
|
269
|
+
One such example is the built-in `bytearray`_ class.
|
|
270
|
+
byteorder (str, optional): This is byte order of the serialized data. Use one
|
|
271
|
+
of the `byte order characters
|
|
272
|
+
<https://docs.python.org/3/library/struct.html#byte-order-size-and-alignment>`_:
|
|
273
|
+
``@``, ``=``, ``<``, ``>``, and ``!``.
|
|
274
|
+
Default is ``@`` -- the native order.
|
|
275
|
+
|
|
276
|
+
Return:
|
|
277
|
+
datasketch.LeanMinHash: The deserialized lean MinHash
|
|
278
|
+
|
|
279
|
+
Example:
|
|
280
|
+
To deserialize a lean MinHash from a buffer.
|
|
281
|
+
|
|
282
|
+
.. code-block:: python
|
|
283
|
+
|
|
284
|
+
lean_minhash = LeanMinHash.deserialize(buf)
|
|
285
|
+
|
|
286
|
+
"""
|
|
287
|
+
fmt_seed_size = "%sqi" % byteorder
|
|
288
|
+
try:
|
|
289
|
+
seed, num_perm = struct.unpack_from(fmt_seed_size, buf, 0)
|
|
290
|
+
except TypeError:
|
|
291
|
+
buf = memoryview(buf)
|
|
292
|
+
seed, num_perm = struct.unpack_from(fmt_seed_size, buf, 0)
|
|
293
|
+
if num_perm >= 0:
|
|
294
|
+
scheme = _SCHEME_LEGACY
|
|
295
|
+
offset = struct.calcsize(fmt_seed_size)
|
|
296
|
+
else:
|
|
297
|
+
num_perm = -num_perm
|
|
298
|
+
(scheme_code,) = struct.unpack_from(byteorder + "B", buf, struct.calcsize(fmt_seed_size))
|
|
299
|
+
if scheme_code not in _SCHEME_CODES_INV:
|
|
300
|
+
raise ValueError("Unknown permutation scheme code: %d" % scheme_code)
|
|
301
|
+
scheme = _SCHEME_CODES_INV[scheme_code]
|
|
302
|
+
# The 0-count value entry aligns the offset without consuming data
|
|
303
|
+
# (only relevant for the native byte order "@").
|
|
304
|
+
offset = struct.calcsize("%sqiB0%s" % (byteorder, _SCHEME_VALUE_FMTS[scheme]))
|
|
305
|
+
fmt_hash = "%s%d%s" % (byteorder, num_perm, _SCHEME_VALUE_FMTS[scheme])
|
|
306
|
+
hashvalues = struct.unpack_from(fmt_hash, buf, offset)
|
|
307
|
+
lmh = object.__new__(LeanMinHash)
|
|
308
|
+
lmh._initialize_slots(seed, hashvalues, scheme)
|
|
309
|
+
return lmh
|
|
310
|
+
|
|
311
|
+
def __getstate__(self):
|
|
312
|
+
buf = bytearray(self.bytesize())
|
|
313
|
+
if self.scheme == _SCHEME_LEGACY:
|
|
314
|
+
fmt = "qi%dI" % len(self)
|
|
315
|
+
struct.pack_into(fmt, buf, 0, self.seed, len(self), *self.hashvalues)
|
|
316
|
+
else:
|
|
317
|
+
fmt = "qiB%d%s" % (len(self), self._value_fmt())
|
|
318
|
+
struct.pack_into(fmt, buf, 0, self.seed, -len(self), _SCHEME_CODES[self.scheme], *self.hashvalues)
|
|
319
|
+
return buf
|
|
320
|
+
|
|
321
|
+
def __setstate__(self, buf):
|
|
322
|
+
try:
|
|
323
|
+
seed, num_perm = struct.unpack_from("qi", buf, 0)
|
|
324
|
+
except TypeError:
|
|
325
|
+
buf = memoryview(buf)
|
|
326
|
+
seed, num_perm = struct.unpack_from("qi", buf, 0)
|
|
327
|
+
if num_perm >= 0:
|
|
328
|
+
scheme = _SCHEME_LEGACY
|
|
329
|
+
offset = struct.calcsize("qi")
|
|
330
|
+
else:
|
|
331
|
+
num_perm = -num_perm
|
|
332
|
+
(scheme_code,) = struct.unpack_from("B", buf, struct.calcsize("qi"))
|
|
333
|
+
if scheme_code not in _SCHEME_CODES_INV:
|
|
334
|
+
raise ValueError("Unknown permutation scheme code: %d" % scheme_code)
|
|
335
|
+
scheme = _SCHEME_CODES_INV[scheme_code]
|
|
336
|
+
offset = struct.calcsize("qiB0%s" % _SCHEME_VALUE_FMTS[scheme])
|
|
337
|
+
hashvalues = struct.unpack_from("%d%s" % (num_perm, _SCHEME_VALUE_FMTS[scheme]), buf, offset)
|
|
338
|
+
self._initialize_slots(seed, hashvalues, scheme)
|
|
339
|
+
|
|
340
|
+
def __hash__(self) -> int:
|
|
341
|
+
return hash((self.scheme, self.seed, tuple(self.hashvalues)))
|
|
342
|
+
|
|
343
|
+
@classmethod
|
|
344
|
+
def union(cls, *lmhs: LeanMinHash) -> LeanMinHash:
|
|
345
|
+
"""Create a new lean MinHash by unioning multiple lean MinHash."""
|
|
346
|
+
if len(lmhs) < 2:
|
|
347
|
+
raise ValueError("Cannot union less than 2 MinHash")
|
|
348
|
+
num_perm = len(lmhs[0])
|
|
349
|
+
seed = lmhs[0].seed
|
|
350
|
+
scheme = lmhs[0].scheme
|
|
351
|
+
if any((seed != m.seed or num_perm != len(m) or scheme != m.scheme) for m in lmhs):
|
|
352
|
+
raise ValueError(
|
|
353
|
+
"The unioning MinHash must have the same seed, number of permutation functions and scheme."
|
|
354
|
+
)
|
|
355
|
+
hashvalues = np.minimum.reduce([m.hashvalues for m in lmhs])
|
|
356
|
+
|
|
357
|
+
lmh = object.__new__(LeanMinHash)
|
|
358
|
+
lmh._initialize_slots(seed, hashvalues, scheme)
|
|
359
|
+
return lmh
|
|
@@ -7,7 +7,7 @@ from typing import Callable, List, Optional, Union
|
|
|
7
7
|
|
|
8
8
|
from scipy.integrate import quad as integrate
|
|
9
9
|
|
|
10
|
-
from datasketch.minhash import MinHash
|
|
10
|
+
from datasketch.minhash import MinHash, _check_scheme_consistency
|
|
11
11
|
from datasketch.storage import (
|
|
12
12
|
OrderedStorage,
|
|
13
13
|
UnorderedStorage,
|
|
@@ -198,6 +198,10 @@ class MinHashLSH:
|
|
|
198
198
|
]
|
|
199
199
|
self.hashranges = [(i * self.r, (i + 1) * self.r) for i in range(self.b)]
|
|
200
200
|
self.keys: OrderedStorage = ordered_storage(storage_config, name=b"".join([basename, b"_keys"]))
|
|
201
|
+
# The permutation scheme of the indexed MinHash, learned from the
|
|
202
|
+
# first insert. Note that an index attached to pre-existing external
|
|
203
|
+
# storage (e.g. Redis) re-learns the scheme on its first insert.
|
|
204
|
+
self._minhash_scheme: Optional[str] = None
|
|
201
205
|
|
|
202
206
|
@property
|
|
203
207
|
def buffer_size(self) -> int:
|
|
@@ -332,6 +336,7 @@ class MinHashLSH:
|
|
|
332
336
|
):
|
|
333
337
|
if len(minhash) != self.h:
|
|
334
338
|
raise ValueError("Expecting minhash with length %d, got %d" % (self.h, len(minhash)))
|
|
339
|
+
self._minhash_scheme = _check_scheme_consistency(getattr(self, "_minhash_scheme", None), minhash)
|
|
335
340
|
if self._require_bytes_keys and not isinstance(key, bytes):
|
|
336
341
|
raise TypeError(
|
|
337
342
|
f"prepickle=False requires bytes keys for non-dict storage, got {type(key).__name__}. "
|
|
@@ -355,8 +360,16 @@ class MinHashLSH:
|
|
|
355
360
|
|
|
356
361
|
def _merge(self, other: MinHashLSH, check_overlap: bool = False, buffer: bool = False) -> None:
|
|
357
362
|
if self.__equivalent(other):
|
|
363
|
+
known, other_known = getattr(self, "_minhash_scheme", None), getattr(other, "_minhash_scheme", None)
|
|
364
|
+
if known is not None and other_known is not None and known != other_known:
|
|
365
|
+
raise ValueError(
|
|
366
|
+
"Cannot merge MinHashLSH indexed with MinHash scheme %r into one indexed with scheme %r"
|
|
367
|
+
% (other_known, known)
|
|
368
|
+
)
|
|
358
369
|
if check_overlap and set(self.keys).intersection(set(other.keys)):
|
|
359
370
|
raise ValueError("The keys are overlapping, duplicate key exists.")
|
|
371
|
+
if known is None:
|
|
372
|
+
self._minhash_scheme = other_known
|
|
360
373
|
for key in other.keys:
|
|
361
374
|
Hs = other.keys.get(key)
|
|
362
375
|
self.keys.insert(key, *Hs, buffer=buffer)
|
|
@@ -422,6 +435,7 @@ class MinHashLSH:
|
|
|
422
435
|
"""
|
|
423
436
|
if len(minhash) != self.h:
|
|
424
437
|
raise ValueError("Expecting minhash with length %d, got %d" % (self.h, len(minhash)))
|
|
438
|
+
_check_scheme_consistency(getattr(self, "_minhash_scheme", None), minhash)
|
|
425
439
|
candidates = set()
|
|
426
440
|
for (start, end), hashtable in zip(self.hashranges, self.hashtables):
|
|
427
441
|
H = self._H(minhash.hashvalues[start:end])
|
|
@@ -448,6 +462,7 @@ class MinHashLSH:
|
|
|
448
462
|
"""
|
|
449
463
|
if len(minhash) != self.h:
|
|
450
464
|
raise ValueError("Expecting minhash with length %d, got %d" % (self.h, len(minhash)))
|
|
465
|
+
_check_scheme_consistency(getattr(self, "_minhash_scheme", None), minhash)
|
|
451
466
|
for (start, end), hashtable in zip(self.hashranges, self.hashtables):
|
|
452
467
|
H = self._H(minhash.hashvalues[start:end])
|
|
453
468
|
hashtable.add_to_select_buffer([H])
|
|
@@ -545,6 +560,7 @@ class MinHashLSH:
|
|
|
545
560
|
def _query_b(self, minhash, b):
|
|
546
561
|
if len(minhash) != self.h:
|
|
547
562
|
raise ValueError("Expecting minhash with length %d, got %d" % (self.h, len(minhash)))
|
|
563
|
+
_check_scheme_consistency(getattr(self, "_minhash_scheme", None), minhash)
|
|
548
564
|
if b > len(self.hashtables):
|
|
549
565
|
raise ValueError("b must be less or equal to the number of hash tables")
|
|
550
566
|
candidates = set()
|