datasketch 1.10.0__tar.gz → 2.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27) hide show
  1. {datasketch-1.10.0 → datasketch-2.0.0}/PKG-INFO +12 -1
  2. {datasketch-1.10.0 → datasketch-2.0.0}/README.rst +11 -0
  3. {datasketch-1.10.0 → datasketch-2.0.0}/datasketch/__init__.py +2 -1
  4. {datasketch-1.10.0 → datasketch-2.0.0}/datasketch/aio/lsh.py +8 -0
  5. {datasketch-1.10.0 → datasketch-2.0.0}/datasketch/b_bit_minhash.py +64 -11
  6. {datasketch-1.10.0 → datasketch-2.0.0}/datasketch/hyperloglog.py +1 -1
  7. datasketch-2.0.0/datasketch/lean_minhash.py +359 -0
  8. {datasketch-1.10.0 → datasketch-2.0.0}/datasketch/lsh.py +17 -1
  9. {datasketch-1.10.0 → datasketch-2.0.0}/datasketch/lsh_bloom.py +18 -8
  10. {datasketch-1.10.0 → datasketch-2.0.0}/datasketch/lshensemble.py +5 -0
  11. {datasketch-1.10.0 → datasketch-2.0.0}/datasketch/lshforest.py +26 -6
  12. {datasketch-1.10.0 → datasketch-2.0.0}/datasketch/minhash.py +314 -56
  13. {datasketch-1.10.0 → datasketch-2.0.0}/datasketch/weighted_minhash.py +2 -0
  14. {datasketch-1.10.0 → datasketch-2.0.0}/pyproject.toml +1 -1
  15. datasketch-1.10.0/datasketch/lean_minhash.py +0 -253
  16. {datasketch-1.10.0 → datasketch-2.0.0}/.gitignore +0 -0
  17. {datasketch-1.10.0 → datasketch-2.0.0}/LICENSE +0 -0
  18. {datasketch-1.10.0 → datasketch-2.0.0}/datasketch/aio/__init__.py +0 -0
  19. {datasketch-1.10.0 → datasketch-2.0.0}/datasketch/aio/storage.py +0 -0
  20. {datasketch-1.10.0 → datasketch-2.0.0}/datasketch/experimental/__init__.py +0 -0
  21. {datasketch-1.10.0 → datasketch-2.0.0}/datasketch/experimental/aio/__init__.py +0 -0
  22. {datasketch-1.10.0 → datasketch-2.0.0}/datasketch/experimental/aio/lsh.py +0 -0
  23. {datasketch-1.10.0 → datasketch-2.0.0}/datasketch/hashfunc.py +0 -0
  24. {datasketch-1.10.0 → datasketch-2.0.0}/datasketch/hnsw.py +0 -0
  25. {datasketch-1.10.0 → datasketch-2.0.0}/datasketch/hyperloglog_const.py +0 -0
  26. {datasketch-1.10.0 → datasketch-2.0.0}/datasketch/lshensemble_partition.py +0 -0
  27. {datasketch-1.10.0 → datasketch-2.0.0}/datasketch/storage.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datasketch
3
- Version: 1.10.0
3
+ Version: 2.0.0
4
4
  Summary: Probabilistic data structures for processing and searching very large datasets
5
5
  Project-URL: Homepage, https://ekzhu.github.io/datasketch
6
6
  Project-URL: Bug Tracker, https://github.com/ekzhu/datasketch/issues
@@ -80,6 +80,17 @@ datasketch gives you probabilistic data structures that can process and
80
80
  search very large amount of data super fast, with little loss of
81
81
  accuracy.
82
82
 
83
+ .. note::
84
+ **Version 2.0.0** changes the default MinHash permutation scheme to
85
+ ``"affine32"``, which fixes a similarity over-estimation bias on large
86
+ sets (`issue #212 <https://github.com/ekzhu/datasketch/issues/212>`__),
87
+ halves sketch memory, and speeds up updates by roughly 4x. A 64-bit
88
+ ``"affine64"`` scheme is available for billion-scale sets. Hash values
89
+ differ from earlier versions: rebuild persisted sketches and LSH
90
+ indexes, or pass ``MinHash(..., scheme="legacy")`` to interoperate with
91
+ existing data. See the `MinHash documentation
92
+ <https://ekzhu.github.io/datasketch/minhash.html>`__ for details.
93
+
83
94
  This package contains the following data sketches:
84
95
 
85
96
  +-------------------------+-----------------------------------------------+
@@ -14,6 +14,17 @@ datasketch gives you probabilistic data structures that can process and
14
14
  search very large amount of data super fast, with little loss of
15
15
  accuracy.
16
16
 
17
+ .. note::
18
+ **Version 2.0.0** changes the default MinHash permutation scheme to
19
+ ``"affine32"``, which fixes a similarity over-estimation bias on large
20
+ sets (`issue #212 <https://github.com/ekzhu/datasketch/issues/212>`__),
21
+ halves sketch memory, and speeds up updates by roughly 4x. A 64-bit
22
+ ``"affine64"`` scheme is available for billion-scale sets. Hash values
23
+ differ from earlier versions: rebuild persisted sketches and LSH
24
+ indexes, or pass ``MinHash(..., scheme="legacy")`` to interoperate with
25
+ existing data. See the `MinHash documentation
26
+ <https://ekzhu.github.io/datasketch/minhash.html>`__ for details.
27
+
17
28
  This package contains the following data sketches:
18
29
 
19
30
  +-------------------------+-----------------------------------------------+
@@ -9,7 +9,7 @@ __version__: Final[str] = _version
9
9
 
10
10
  from datasketch.aio import AsyncMinHashLSH # Instantiation requires motor/redis.asyncio; import itself is always safe.
11
11
  from datasketch.b_bit_minhash import bBitMinHash
12
- from datasketch.hashfunc import sha1_hash32
12
+ from datasketch.hashfunc import sha1_hash32, sha1_hash64
13
13
  from datasketch.hnsw import HNSW
14
14
  from datasketch.hyperloglog import HyperLogLog, HyperLogLogPlusPlus
15
15
  from datasketch.lean_minhash import LeanMinHash
@@ -41,4 +41,5 @@ __all__ = [
41
41
  "WeightedMinHashLSHForest",
42
42
  "bBitMinHash",
43
43
  "sha1_hash32",
44
+ "sha1_hash64",
44
45
  ]
@@ -14,6 +14,7 @@ from datasketch.aio.storage import (
14
14
  async_unordered_storage,
15
15
  )
16
16
  from datasketch.lsh import _optimal_param
17
+ from datasketch.minhash import _check_scheme_consistency
17
18
  from datasketch.storage import _random_name, unordered_storage
18
19
 
19
20
 
@@ -88,6 +89,10 @@ class AsyncMinHashLSH:
88
89
  self.hashranges = [(i * self.r, (i + 1) * self.r) for i in range(self.b)]
89
90
  self.hashtables = None
90
91
  self.keys = None
92
+ # The permutation scheme of the indexed MinHash, learned from the
93
+ # first insert. Note that an index attached to pre-existing external
94
+ # storage re-learns the scheme on its first insert.
95
+ self._minhash_scheme: Optional[str] = None
91
96
 
92
97
  self._lock = asyncio.Lock()
93
98
  self._initialized = False
@@ -248,6 +253,7 @@ class AsyncMinHashLSH:
248
253
  async def _insert(self, key, minhash, check_duplication=True, buffer=False):
249
254
  if len(minhash) != self.h:
250
255
  raise ValueError("Expecting minhash with length %d, got %d" % (self.h, len(minhash)))
256
+ self._minhash_scheme = _check_scheme_consistency(getattr(self, "_minhash_scheme", None), minhash)
251
257
  if self._require_bytes_keys and not isinstance(key, bytes):
252
258
  raise TypeError(
253
259
  f"prepickle=False requires bytes keys for non-dict storage, got {type(key).__name__}. "
@@ -272,6 +278,7 @@ class AsyncMinHashLSH:
272
278
  """See :class:`datasketch.MinHashLSH`."""
273
279
  if len(minhash) != self.h:
274
280
  raise ValueError("Expecting minhash with length %d, got %d" % (self.h, len(minhash)))
281
+ _check_scheme_consistency(getattr(self, "_minhash_scheme", None), minhash)
275
282
 
276
283
  fs = (
277
284
  hashtable.get(self._H(minhash.hashvalues[start:end]))
@@ -322,6 +329,7 @@ class AsyncMinHashLSH:
322
329
  async def _query_b(self, minhash, b):
323
330
  if len(minhash) != self.h:
324
331
  raise ValueError("Expecting minhash with length %d, got %d" % (self.h, len(minhash)))
332
+ _check_scheme_consistency(getattr(self, "_minhash_scheme", None), minhash)
325
333
  if b > len(self.hashtables):
326
334
  raise ValueError("b must be less or equal to the number of hash tables")
327
335
  fs = []
@@ -9,16 +9,37 @@ import struct
9
9
 
10
10
  import numpy as np
11
11
 
12
+ from datasketch.minhash import _SCHEME_CODES, _SCHEME_CODES_INV, _SCHEME_LEGACY, _VALID_SCHEMES
13
+
12
14
 
13
15
  class bBitMinHash:
14
- """The b-bit MinHash object."""
16
+ """b-bit MinHash of an existing :class:`datasketch.MinHash`.
17
+
18
+ b-bit MinHash reduces storage by keeping only the lowest `b` bits of
19
+ each minimum hash value, at some loss of accuracy. It supports
20
+ :meth:`jaccard` and pickle serialization, but cannot be updated with
21
+ new values and cannot be used with the LSH indexes. See `b-Bit Minwise
22
+ Hashing <http://research.microsoft.com/pubs/120078/wfc0398-liPS.pdf>`_
23
+ by Ping Li and Arnd Christian König.
24
+
25
+ Args:
26
+ minhash (datasketch.MinHash): The MinHash to compress.
27
+ b (int): The number of lowest bits to keep for each minimum hash
28
+ value, between 0 and 32.
29
+ r (float): The expected ratio of set size to the size of the
30
+ universe of all values, used by the Jaccard estimator. Leave
31
+ at 0.0 if unknown: the estimator then uses the limit as the
32
+ ratio goes to zero.
15
33
 
16
- __slots__ = ("b", "hashvalues", "r", "seed")
34
+ """
35
+
36
+ __slots__ = ("b", "hashvalues", "r", "scheme", "seed")
17
37
 
18
38
  # seed as int64
19
39
  # b as uint8
20
40
  # r as float64
21
- # num_perm as int32
41
+ # num_perm as int32 (negated and followed by a scheme code byte for
42
+ # non-legacy permutation schemes; legacy payloads predate the field)
22
43
  _serial_fmt_params = "<qBdi"
23
44
  # each block as uint64
24
45
  _serial_fmt_block = "Q"
@@ -37,6 +58,11 @@ class bBitMinHash:
37
58
  bmask = (1 << b) - 1
38
59
  self.hashvalues = np.bitwise_and(minhash.hashvalues, bmask).astype(np.uint32)
39
60
  self.seed = minhash.seed
61
+ # Requiring the attribute (rather than assuming a default) keeps a
62
+ # sketch type without a scheme from being silently mislabeled.
63
+ self.scheme = minhash.scheme
64
+ if self.scheme not in _VALID_SCHEMES:
65
+ raise ValueError("scheme must be one of %s, got %r" % (", ".join(_VALID_SCHEMES), self.scheme))
40
66
  self.b = b
41
67
  self.r = r
42
68
 
@@ -44,6 +70,7 @@ class bBitMinHash:
44
70
  """Check for full equality of two b-bit MinHash objects."""
45
71
  return (
46
72
  type(self) is type(other)
73
+ and self.scheme == other.scheme
47
74
  and self.seed == other.seed
48
75
  and self.b == other.b
49
76
  and self.r == other.r
@@ -56,13 +83,15 @@ class bBitMinHash:
56
83
  """
57
84
  if self.b != other.b:
58
85
  raise ValueError(
59
- "Cannot compare two b-bit MinHashes with different\
60
- b values"
86
+ "Cannot compare two b-bit MinHashes with different b values"
87
+ )
88
+ if self.scheme != other.scheme:
89
+ raise ValueError(
90
+ "Cannot compare two b-bit MinHashes with different permutation schemes"
61
91
  )
62
92
  if self.seed != other.seed:
63
93
  raise ValueError(
64
- "Cannot compare two b-bit MinHashes with different\
65
- set of permutations"
94
+ "Cannot compare two b-bit MinHashes with different set of permutations"
66
95
  )
67
96
  intersection = np.count_nonzero(self.hashvalues == other.hashvalues)
68
97
  raw_est = float(intersection) / float(self.hashvalues.size)
@@ -75,6 +104,13 @@ class bBitMinHash:
75
104
  """Get the serialized size of this b-bit MinHash in number of bytes."""
76
105
  return self._bytesize()[-1]
77
106
 
107
+ def _params_fmt(self):
108
+ """The struct format of the parameter header for this scheme."""
109
+ if self.scheme == _SCHEME_LEGACY:
110
+ return self._serial_fmt_params
111
+ # Non-legacy schemes append a scheme code byte to the parameters.
112
+ return self._serial_fmt_params + "B"
113
+
78
114
  def __getstate__(self):
79
115
  """Called when pickling the b-bit MinHash object.
80
116
  Returns a bytearray which will then be pickled.
@@ -96,8 +132,14 @@ class bBitMinHash:
96
132
  # Doing this in BigInteger guarantees we do not experience overflow and still
97
133
  # coerces to np.uint64 as expected.
98
134
  blocks[i] = int(blocks[i]) | (int(hv) << (n - 1 - j) * slot_size)
99
- fmt = self._serial_fmt_params + "%d%s" % (num_blocks, self._serial_fmt_block)
100
- struct.pack_into(fmt, buffer, 0, self.seed, self.b, self.r, self.hashvalues.size, *blocks)
135
+ fmt = self._params_fmt() + "%d%s" % (num_blocks, self._serial_fmt_block)
136
+ if self.scheme == _SCHEME_LEGACY:
137
+ struct.pack_into(fmt, buffer, 0, self.seed, self.b, self.r, self.hashvalues.size, *blocks)
138
+ else:
139
+ # A negated size marks the post-2.0.0 format carrying a scheme code.
140
+ struct.pack_into(
141
+ fmt, buffer, 0, self.seed, self.b, self.r, -self.hashvalues.size, _SCHEME_CODES[self.scheme], *blocks
142
+ )
101
143
  return buffer
102
144
 
103
145
  def __setstate__(self, buf):
@@ -107,8 +149,19 @@ class bBitMinHash:
107
149
  try:
108
150
  self.seed, self.b, self.r, num_perm = struct.unpack_from(self._serial_fmt_params, buf, 0)
109
151
  except TypeError:
110
- self.seed, self.b, self.r, num_perm = struct.unpack_from(self._serial_fmt_params, memoryview(buf), 0)
152
+ buf = memoryview(buf)
153
+ self.seed, self.b, self.r, num_perm = struct.unpack_from(self._serial_fmt_params, buf, 0)
111
154
  offset = struct.calcsize(self._serial_fmt_params)
155
+ if num_perm >= 0:
156
+ # Payloads from before version 2.0.0 have no scheme field.
157
+ self.scheme = _SCHEME_LEGACY
158
+ else:
159
+ num_perm = -num_perm
160
+ (scheme_code,) = struct.unpack_from("<B", buf, offset)
161
+ if scheme_code not in _SCHEME_CODES_INV:
162
+ raise ValueError("Unknown permutation scheme code: %d" % scheme_code)
163
+ self.scheme = _SCHEME_CODES_INV[scheme_code]
164
+ offset += 1
112
165
  self.hashvalues = np.zeros((num_perm,), dtype=np.uint32)
113
166
  # Reconstruct the hash values
114
167
  slot_size, n, num_blocks, _total = self._bytesize()
@@ -168,5 +221,5 @@ class bBitMinHash:
168
221
  # Get the number of blocks required
169
222
  num_blocks = int(np.ceil(float(self.hashvalues.size) / num_slots_per_block))
170
223
  # Get the total serialized size
171
- total = struct.calcsize(self._serial_fmt_params + "%d%s" % (num_blocks, self._serial_fmt_block))
224
+ total = struct.calcsize(self._params_fmt() + "%d%s" % (num_blocks, self._serial_fmt_block))
172
225
  return slot_size, num_slots_per_block, num_blocks, total
@@ -265,7 +265,7 @@ class HyperLogLog:
265
265
  different precisions"
266
266
  )
267
267
  reg = np.maximum.reduce([h.reg for h in hyperloglogs])
268
- return cls(reg=reg)
268
+ return cls(reg=reg, hashfunc=hyperloglogs[0].hashfunc)
269
269
 
270
270
  def bytesize(self) -> int:
271
271
  """Get the size of the HyperLogLog in bytes."""
@@ -0,0 +1,359 @@
1
+ from __future__ import annotations
2
+
3
+ import struct
4
+ from collections.abc import Iterable
5
+ from typing import Optional
6
+
7
+ import numpy as np
8
+
9
+ from datasketch.minhash import (
10
+ _SCHEME_AFFINE32,
11
+ _SCHEME_AFFINE64,
12
+ _SCHEME_CODES,
13
+ _SCHEME_CODES_INV,
14
+ _SCHEME_LEGACY,
15
+ _VALID_SCHEMES,
16
+ MinHash,
17
+ )
18
+
19
+ # Byte-format notes: legacy payloads have no scheme field and are identified
20
+ # by a non-negative number-of-hash-values field, while the affine formats
21
+ # store the negated number followed by a scheme code byte. This keeps legacy
22
+ # sketches bit-identical to (and readable by) versions before 2.0.0.
23
+ # struct format character of one hash value, per scheme.
24
+ _SCHEME_VALUE_FMTS = {
25
+ _SCHEME_LEGACY: "I",
26
+ _SCHEME_AFFINE32: "I",
27
+ _SCHEME_AFFINE64: "Q",
28
+ }
29
+
30
+
31
+ class LeanMinHash(MinHash):
32
+ """Lean MinHash is MinHash with a smaller memory footprint
33
+ and faster deserialization, but with its internal state frozen
34
+ -- no `update()`.
35
+
36
+ Lean MinHash inherits all methods from :class:`datasketch.MinHash`.
37
+ It does not store the `permutations` and the `hashfunc` needed for updating.
38
+ If a MinHash does not need further updates, convert it into a lean MinHash
39
+ to save memory.
40
+
41
+ Example:
42
+ To create a lean MinHash from an existing MinHash:
43
+
44
+ .. code-block:: python
45
+
46
+ lean_minhash = LeanMinHash(minhash)
47
+
48
+ # You can compute the Jaccard similarity between two lean MinHash
49
+ lean_minhash.jaccard(lean_minhash2)
50
+
51
+ # Or between a lean MinHash and a MinHash
52
+ lean_minhash.jaccard(minhash2)
53
+
54
+ To create a lean MinHash from the hash values, seed, and scheme of an
55
+ existing MinHash:
56
+
57
+ .. code-block:: python
58
+
59
+ lean_minhash = LeanMinHash(
60
+ seed=minhash.seed,
61
+ hashvalues=minhash.hashvalues,
62
+ scheme=minhash.scheme,
63
+ )
64
+
65
+ To create a MinHash from a lean MinHash:
66
+
67
+ .. code-block:: python
68
+
69
+ minhash = MinHash(
70
+ seed=lean_minhash.seed,
71
+ hashvalues=lean_minhash.hashvalues,
72
+ scheme=lean_minhash.scheme,
73
+ )
74
+
75
+ # Or if you want to prevent further updates on minhash
76
+ # from affecting the state of lean_minhash
77
+ minhash = MinHash(
78
+ seed=lean_minhash.seed,
79
+ hashvalues=lean_minhash.digest(),
80
+ scheme=lean_minhash.scheme,
81
+ )
82
+
83
+ Note:
84
+ Lean MinHash can also be used in :class:`datasketch.MinHashLSH`,
85
+ :class:`datasketch.MinHashLSHForest`, and :class:`datasketch.MinHashLSHEnsemble`.
86
+
87
+ Args:
88
+ minhash (optional): The :class:`datasketch.MinHash` object used to
89
+ initialize the LeanMinHash. If this is not set, then `seed`
90
+ and `hashvalues` must be set.
91
+ seed (optional): The random seed that controls the set of random
92
+ permutation functions generated for this LeanMinHash. This parameter
93
+ must be used together with `hashvalues`.
94
+ hashvalues (optional): The hash values used to initialize the state
95
+ of the LeanMinHash. This parameter must be used together with
96
+ `seed`.
97
+ scheme (optional): The permutation scheme of the MinHash the
98
+ `hashvalues` were taken from. Required when initializing from
99
+ `seed` and `hashvalues` (use ``"legacy"`` for hash values created
100
+ by datasketch before 2.0.0), because hash values carry no trace
101
+ of the scheme that produced them. When `minhash` is set the
102
+ scheme is taken from the MinHash object instead, and this
103
+ argument may only repeat it.
104
+
105
+ """
106
+
107
+ __slots__ = ("hashvalues", "scheme", "seed")
108
+
109
+ def _initialize_slots(self, seed, hashvalues, scheme=_SCHEME_LEGACY):
110
+ """Initialize the slots of the LeanMinHash.
111
+
112
+ Args:
113
+ seed (int): The random seed controls the set of random
114
+ permutation functions generated for this LeanMinHash.
115
+ hashvalues (Iterable): The hash values are the internal state of the LeanMinHash.
116
+ scheme (str): The permutation scheme of the hash values.
117
+
118
+ """
119
+ if scheme not in _VALID_SCHEMES:
120
+ raise ValueError("scheme must be one of %s, got %r" % (", ".join(_VALID_SCHEMES), scheme))
121
+ self.seed = seed
122
+ self.scheme = scheme
123
+ self.hashvalues = self._parse_hashvalues(hashvalues)
124
+ if scheme != _SCHEME_LEGACY and len(self.hashvalues) == 0:
125
+ # An empty sketch would serialize with a hash value count of 0,
126
+ # which the deserializer cannot tell apart from the legacy format
127
+ # (identified by a non-negative count).
128
+ raise ValueError("hashvalues must not be empty")
129
+
130
+ def __init__(
131
+ self,
132
+ minhash: MinHash = None,
133
+ seed: Optional[int] = None,
134
+ hashvalues: Optional[Iterable] = None,
135
+ scheme: Optional[str] = None,
136
+ ):
137
+ if minhash is not None:
138
+ if scheme is not None and scheme != minhash.scheme:
139
+ raise ValueError(
140
+ "scheme %r conflicts with the scheme %r of the given MinHash" % (scheme, minhash.scheme)
141
+ )
142
+ self._initialize_slots(minhash.seed, minhash.hashvalues, minhash.scheme)
143
+ elif hashvalues is not None and seed is not None:
144
+ if scheme is None:
145
+ # Hash values carry no trace of the scheme that produced
146
+ # them, so a default here would silently mislabel pre-2.0.0
147
+ # values and defeat the cross-scheme comparison guards.
148
+ raise ValueError(
149
+ "scheme must be specified explicitly when initializing from existing "
150
+ "hash values: pass the scheme of the MinHash they came from, or "
151
+ "scheme='legacy' for hash values created by datasketch before 2.0.0."
152
+ )
153
+ self._initialize_slots(seed, hashvalues, scheme)
154
+ else:
155
+ raise ValueError(
156
+ "Init parameters cannot be None: make sure to set either minhash or both of hash values and seed"
157
+ )
158
+
159
+ def update(self, b) -> None:
160
+ """Not available on a LeanMinHash.
161
+ Calling it raises a TypeError.
162
+ """
163
+ raise TypeError("Cannot update a LeanMinHash")
164
+
165
+ def copy(self) -> LeanMinHash:
166
+ lmh = object.__new__(LeanMinHash)
167
+ lmh._initialize_slots(self.seed, self.hashvalues, self.scheme)
168
+ return lmh
169
+
170
+ def _value_fmt(self) -> str:
171
+ return _SCHEME_VALUE_FMTS[self.scheme]
172
+
173
+ def bytesize(self, byteorder="@") -> int:
174
+ """Compute the byte size after serialization.
175
+
176
+ Args:
177
+ byteorder (str, optional): This is byte order of the serialized data. Use one
178
+ of the `byte order characters
179
+ <https://docs.python.org/3/library/struct.html#byte-order-size-and-alignment>`_:
180
+ ``@``, ``=``, ``<``, ``>``, and ``!``.
181
+ Default is ``@`` -- the native order.
182
+
183
+ Returns:
184
+ int: Size in number of bytes after serialization.
185
+
186
+ """
187
+ if self.scheme == _SCHEME_LEGACY:
188
+ # 8 bytes for the seed, 4 bytes for the number of hash values,
189
+ # and 4 bytes for each hash value.
190
+ return struct.calcsize("%sqi%dI" % (byteorder, len(self)))
191
+ # The affine formats add a 1-byte scheme code after the number of
192
+ # hash values, and store each hash value in 4 ("affine32") or
193
+ # 8 ("affine64") bytes.
194
+ return struct.calcsize("%sqiB%d%s" % (byteorder, len(self), self._value_fmt()))
195
+
196
+ def serialize(self, buf, byteorder="@") -> None:
197
+ """Serialize this lean MinHash and store the result in an allocated buffer.
198
+
199
+ Args:
200
+ buf (buffer): `buf` must implement the `buffer`_ interface.
201
+ One such example is the built-in `bytearray`_ class.
202
+ byteorder (str, optional): This is byte order of the serialized data. Use one
203
+ of the `byte order characters
204
+ <https://docs.python.org/3/library/struct.html#byte-order-size-and-alignment>`_:
205
+ ``@``, ``=``, ``<``, ``>``, and ``!``.
206
+ Default is ``@`` -- the native order.
207
+
208
+ This is preferred over using `pickle`_ if the serialized lean MinHash needs
209
+ to be used by another program in a different programming language.
210
+
211
+ The serialization schema for the ``"legacy"`` scheme (identical to
212
+ versions before 2.0.0):
213
+ 1. The first 8 bytes is the seed integer
214
+ 2. The next 4 bytes is the number of hash values
215
+ 3. The rest is the serialized hash values, each uses 4 bytes
216
+
217
+ The serialization schema for the affine schemes:
218
+ 1. The first 8 bytes is the seed integer
219
+ 2. The next 4 bytes is the **negated** number of hash values
220
+ (a negative value marks the post-2.0.0 format)
221
+ 3. The next byte is the scheme code (1 for ``"affine32"``,
222
+ 2 for ``"affine64"``)
223
+ 4. The rest is the serialized hash values, each uses 4 bytes
224
+ for ``"affine32"`` and 8 bytes for ``"affine64"``
225
+
226
+ Example:
227
+ To serialize a single lean MinHash into a `bytearray`_ buffer.
228
+
229
+ .. code-block:: python
230
+
231
+ buf = bytearray(lean_minhash.bytesize())
232
+ lean_minhash.serialize(buf)
233
+
234
+ To serialize multiple lean MinHash into a `bytearray`_ buffer.
235
+
236
+ .. code-block:: python
237
+
238
+ # assuming lean_minhashs is a list of LeanMinHash with the same size
239
+ size = lean_minhashs[0].bytesize()
240
+ buf = bytearray(size * len(lean_minhashs))
241
+ for i, lean_minhash in enumerate(lean_minhashs):
242
+ lean_minhash.serialize(memoryview(buf)[i * size :])
243
+
244
+ .. _`buffer`: https://docs.python.org/3/c-api/buffer.html
245
+ .. _`bytearray`: https://docs.python.org/3.6/library/functions.html#bytearray
246
+ .. _`byteorder`: https://docs.python.org/3/library/struct.html
247
+
248
+ """
249
+ if len(buf) < self.bytesize(byteorder):
250
+ raise ValueError(
251
+ "The buffer does not have enough space for holding this MinHash."
252
+ )
253
+ if self.scheme == _SCHEME_LEGACY:
254
+ fmt = "%sqi%dI" % (byteorder, len(self))
255
+ struct.pack_into(fmt, buf, 0, self.seed, len(self), *self.hashvalues)
256
+ else:
257
+ fmt = "%sqiB%d%s" % (byteorder, len(self), self._value_fmt())
258
+ struct.pack_into(fmt, buf, 0, self.seed, -len(self), _SCHEME_CODES[self.scheme], *self.hashvalues)
259
+
260
+ @classmethod
261
+ def deserialize(cls, buf, byteorder="@") -> LeanMinHash:
262
+ """Deserialize a lean MinHash from a buffer.
263
+
264
+ Buffers written by versions before 2.0.0 (which had no scheme field)
265
+ deserialize with ``scheme="legacy"``.
266
+
267
+ Args:
268
+ buf (buffer): `buf` must implement the `buffer`_ interface.
269
+ One such example is the built-in `bytearray`_ class.
270
+ byteorder (str, optional): This is byte order of the serialized data. Use one
271
+ of the `byte order characters
272
+ <https://docs.python.org/3/library/struct.html#byte-order-size-and-alignment>`_:
273
+ ``@``, ``=``, ``<``, ``>``, and ``!``.
274
+ Default is ``@`` -- the native order.
275
+
276
+ Return:
277
+ datasketch.LeanMinHash: The deserialized lean MinHash
278
+
279
+ Example:
280
+ To deserialize a lean MinHash from a buffer.
281
+
282
+ .. code-block:: python
283
+
284
+ lean_minhash = LeanMinHash.deserialize(buf)
285
+
286
+ """
287
+ fmt_seed_size = "%sqi" % byteorder
288
+ try:
289
+ seed, num_perm = struct.unpack_from(fmt_seed_size, buf, 0)
290
+ except TypeError:
291
+ buf = memoryview(buf)
292
+ seed, num_perm = struct.unpack_from(fmt_seed_size, buf, 0)
293
+ if num_perm >= 0:
294
+ scheme = _SCHEME_LEGACY
295
+ offset = struct.calcsize(fmt_seed_size)
296
+ else:
297
+ num_perm = -num_perm
298
+ (scheme_code,) = struct.unpack_from(byteorder + "B", buf, struct.calcsize(fmt_seed_size))
299
+ if scheme_code not in _SCHEME_CODES_INV:
300
+ raise ValueError("Unknown permutation scheme code: %d" % scheme_code)
301
+ scheme = _SCHEME_CODES_INV[scheme_code]
302
+ # The 0-count value entry aligns the offset without consuming data
303
+ # (only relevant for the native byte order "@").
304
+ offset = struct.calcsize("%sqiB0%s" % (byteorder, _SCHEME_VALUE_FMTS[scheme]))
305
+ fmt_hash = "%s%d%s" % (byteorder, num_perm, _SCHEME_VALUE_FMTS[scheme])
306
+ hashvalues = struct.unpack_from(fmt_hash, buf, offset)
307
+ lmh = object.__new__(LeanMinHash)
308
+ lmh._initialize_slots(seed, hashvalues, scheme)
309
+ return lmh
310
+
311
+ def __getstate__(self):
312
+ buf = bytearray(self.bytesize())
313
+ if self.scheme == _SCHEME_LEGACY:
314
+ fmt = "qi%dI" % len(self)
315
+ struct.pack_into(fmt, buf, 0, self.seed, len(self), *self.hashvalues)
316
+ else:
317
+ fmt = "qiB%d%s" % (len(self), self._value_fmt())
318
+ struct.pack_into(fmt, buf, 0, self.seed, -len(self), _SCHEME_CODES[self.scheme], *self.hashvalues)
319
+ return buf
320
+
321
+ def __setstate__(self, buf):
322
+ try:
323
+ seed, num_perm = struct.unpack_from("qi", buf, 0)
324
+ except TypeError:
325
+ buf = memoryview(buf)
326
+ seed, num_perm = struct.unpack_from("qi", buf, 0)
327
+ if num_perm >= 0:
328
+ scheme = _SCHEME_LEGACY
329
+ offset = struct.calcsize("qi")
330
+ else:
331
+ num_perm = -num_perm
332
+ (scheme_code,) = struct.unpack_from("B", buf, struct.calcsize("qi"))
333
+ if scheme_code not in _SCHEME_CODES_INV:
334
+ raise ValueError("Unknown permutation scheme code: %d" % scheme_code)
335
+ scheme = _SCHEME_CODES_INV[scheme_code]
336
+ offset = struct.calcsize("qiB0%s" % _SCHEME_VALUE_FMTS[scheme])
337
+ hashvalues = struct.unpack_from("%d%s" % (num_perm, _SCHEME_VALUE_FMTS[scheme]), buf, offset)
338
+ self._initialize_slots(seed, hashvalues, scheme)
339
+
340
+ def __hash__(self) -> int:
341
+ return hash((self.scheme, self.seed, tuple(self.hashvalues)))
342
+
343
+ @classmethod
344
+ def union(cls, *lmhs: LeanMinHash) -> LeanMinHash:
345
+ """Create a new lean MinHash by unioning multiple lean MinHash."""
346
+ if len(lmhs) < 2:
347
+ raise ValueError("Cannot union less than 2 MinHash")
348
+ num_perm = len(lmhs[0])
349
+ seed = lmhs[0].seed
350
+ scheme = lmhs[0].scheme
351
+ if any((seed != m.seed or num_perm != len(m) or scheme != m.scheme) for m in lmhs):
352
+ raise ValueError(
353
+ "The unioning MinHash must have the same seed, number of permutation functions and scheme."
354
+ )
355
+ hashvalues = np.minimum.reduce([m.hashvalues for m in lmhs])
356
+
357
+ lmh = object.__new__(LeanMinHash)
358
+ lmh._initialize_slots(seed, hashvalues, scheme)
359
+ return lmh
@@ -7,7 +7,7 @@ from typing import Callable, List, Optional, Union
7
7
 
8
8
  from scipy.integrate import quad as integrate
9
9
 
10
- from datasketch.minhash import MinHash
10
+ from datasketch.minhash import MinHash, _check_scheme_consistency
11
11
  from datasketch.storage import (
12
12
  OrderedStorage,
13
13
  UnorderedStorage,
@@ -198,6 +198,10 @@ class MinHashLSH:
198
198
  ]
199
199
  self.hashranges = [(i * self.r, (i + 1) * self.r) for i in range(self.b)]
200
200
  self.keys: OrderedStorage = ordered_storage(storage_config, name=b"".join([basename, b"_keys"]))
201
+ # The permutation scheme of the indexed MinHash, learned from the
202
+ # first insert. Note that an index attached to pre-existing external
203
+ # storage (e.g. Redis) re-learns the scheme on its first insert.
204
+ self._minhash_scheme: Optional[str] = None
201
205
 
202
206
  @property
203
207
  def buffer_size(self) -> int:
@@ -332,6 +336,7 @@ class MinHashLSH:
332
336
  ):
333
337
  if len(minhash) != self.h:
334
338
  raise ValueError("Expecting minhash with length %d, got %d" % (self.h, len(minhash)))
339
+ self._minhash_scheme = _check_scheme_consistency(getattr(self, "_minhash_scheme", None), minhash)
335
340
  if self._require_bytes_keys and not isinstance(key, bytes):
336
341
  raise TypeError(
337
342
  f"prepickle=False requires bytes keys for non-dict storage, got {type(key).__name__}. "
@@ -355,8 +360,16 @@ class MinHashLSH:
355
360
 
356
361
  def _merge(self, other: MinHashLSH, check_overlap: bool = False, buffer: bool = False) -> None:
357
362
  if self.__equivalent(other):
363
+ known, other_known = getattr(self, "_minhash_scheme", None), getattr(other, "_minhash_scheme", None)
364
+ if known is not None and other_known is not None and known != other_known:
365
+ raise ValueError(
366
+ "Cannot merge MinHashLSH indexed with MinHash scheme %r into one indexed with scheme %r"
367
+ % (other_known, known)
368
+ )
358
369
  if check_overlap and set(self.keys).intersection(set(other.keys)):
359
370
  raise ValueError("The keys are overlapping, duplicate key exists.")
371
+ if known is None:
372
+ self._minhash_scheme = other_known
360
373
  for key in other.keys:
361
374
  Hs = other.keys.get(key)
362
375
  self.keys.insert(key, *Hs, buffer=buffer)
@@ -422,6 +435,7 @@ class MinHashLSH:
422
435
  """
423
436
  if len(minhash) != self.h:
424
437
  raise ValueError("Expecting minhash with length %d, got %d" % (self.h, len(minhash)))
438
+ _check_scheme_consistency(getattr(self, "_minhash_scheme", None), minhash)
425
439
  candidates = set()
426
440
  for (start, end), hashtable in zip(self.hashranges, self.hashtables):
427
441
  H = self._H(minhash.hashvalues[start:end])
@@ -448,6 +462,7 @@ class MinHashLSH:
448
462
  """
449
463
  if len(minhash) != self.h:
450
464
  raise ValueError("Expecting minhash with length %d, got %d" % (self.h, len(minhash)))
465
+ _check_scheme_consistency(getattr(self, "_minhash_scheme", None), minhash)
451
466
  for (start, end), hashtable in zip(self.hashranges, self.hashtables):
452
467
  H = self._H(minhash.hashvalues[start:end])
453
468
  hashtable.add_to_select_buffer([H])
@@ -545,6 +560,7 @@ class MinHashLSH:
545
560
  def _query_b(self, minhash, b):
546
561
  if len(minhash) != self.h:
547
562
  raise ValueError("Expecting minhash with length %d, got %d" % (self.h, len(minhash)))
563
+ _check_scheme_consistency(getattr(self, "_minhash_scheme", None), minhash)
548
564
  if b > len(self.hashtables):
549
565
  raise ValueError("b must be less or equal to the number of hash tables")
550
566
  candidates = set()