PyPI - datasketch - Versions diffs - 1.10.0__tar.gz → 2.0.0__tar.gz - Mend

datasketch 1.10.0__tar.gz → 2.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27) hide show

{datasketch-1.10.0 → datasketch-2.0.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datasketch
-Version: 1.10.0
+Version: 2.0.0
 Summary: Probabilistic data structures for processing and searching very large datasets
 Project-URL: Homepage, https://ekzhu.github.io/datasketch
 Project-URL: Bug Tracker, https://github.com/ekzhu/datasketch/issues
@@ -80,6 +80,17 @@ datasketch gives you probabilistic data structures that can process and
 search very large amount of data super fast, with little loss of
 accuracy.
+.. note::
+    **Version 2.0.0** changes the default MinHash permutation scheme to
+    ``"affine32"``, which fixes a similarity over-estimation bias on large
+    sets (`issue #212 <https://github.com/ekzhu/datasketch/issues/212>`__),
+    halves sketch memory, and speeds up updates by roughly 4x. A 64-bit
+    ``"affine64"`` scheme is available for billion-scale sets. Hash values
+    differ from earlier versions: rebuild persisted sketches and LSH
+    indexes, or pass ``MinHash(..., scheme="legacy")`` to interoperate with
+    existing data. See the `MinHash documentation
+    <https://ekzhu.github.io/datasketch/minhash.html>`__ for details.
 This package contains the following data sketches:
 +-------------------------+-----------------------------------------------+

{datasketch-1.10.0 → datasketch-2.0.0}/README.rst RENAMED Viewed

@@ -14,6 +14,17 @@ datasketch gives you probabilistic data structures that can process and
 search very large amount of data super fast, with little loss of
 accuracy.
+.. note::
+    **Version 2.0.0** changes the default MinHash permutation scheme to
+    ``"affine32"``, which fixes a similarity over-estimation bias on large
+    sets (`issue #212 <https://github.com/ekzhu/datasketch/issues/212>`__),
+    halves sketch memory, and speeds up updates by roughly 4x. A 64-bit
+    ``"affine64"`` scheme is available for billion-scale sets. Hash values
+    differ from earlier versions: rebuild persisted sketches and LSH
+    indexes, or pass ``MinHash(..., scheme="legacy")`` to interoperate with
+    existing data. See the `MinHash documentation
+    <https://ekzhu.github.io/datasketch/minhash.html>`__ for details.
 This package contains the following data sketches:
 +-------------------------+-----------------------------------------------+

{datasketch-1.10.0 → datasketch-2.0.0}/datasketch/__init__.py RENAMED Viewed

@@ -9,7 +9,7 @@ __version__: Final[str] = _version
 from datasketch.aio import AsyncMinHashLSH  # Instantiation requires motor/redis.asyncio; import itself is always safe.
 from datasketch.b_bit_minhash import bBitMinHash
-from datasketch.hashfunc import sha1_hash32
+from datasketch.hashfunc import sha1_hash32, sha1_hash64
 from datasketch.hnsw import HNSW
 from datasketch.hyperloglog import HyperLogLog, HyperLogLogPlusPlus
 from datasketch.lean_minhash import LeanMinHash
@@ -41,4 +41,5 @@ __all__ = [
     "WeightedMinHashLSHForest",
     "bBitMinHash",
     "sha1_hash32",
+    "sha1_hash64",
 ]

{datasketch-1.10.0 → datasketch-2.0.0}/datasketch/aio/lsh.py RENAMED Viewed

@@ -14,6 +14,7 @@ from datasketch.aio.storage import (
     async_unordered_storage,
 )
 from datasketch.lsh import _optimal_param
+from datasketch.minhash import _check_scheme_consistency
 from datasketch.storage import _random_name, unordered_storage
@@ -88,6 +89,10 @@ class AsyncMinHashLSH:
         self.hashranges = [(i * self.r, (i + 1) * self.r) for i in range(self.b)]
         self.hashtables = None
         self.keys = None
+        # The permutation scheme of the indexed MinHash, learned from the
+        # first insert. Note that an index attached to pre-existing external
+        # storage re-learns the scheme on its first insert.
+        self._minhash_scheme: Optional[str] = None
         self._lock = asyncio.Lock()
         self._initialized = False
@@ -248,6 +253,7 @@ class AsyncMinHashLSH:
     async def _insert(self, key, minhash, check_duplication=True, buffer=False):
         if len(minhash) != self.h:
             raise ValueError("Expecting minhash with length %d, got %d" % (self.h, len(minhash)))
+        self._minhash_scheme = _check_scheme_consistency(getattr(self, "_minhash_scheme", None), minhash)
         if self._require_bytes_keys and not isinstance(key, bytes):
             raise TypeError(
                 f"prepickle=False requires bytes keys for non-dict storage, got {type(key).__name__}. "
@@ -272,6 +278,7 @@ class AsyncMinHashLSH:
         """See :class:`datasketch.MinHashLSH`."""
         if len(minhash) != self.h:
             raise ValueError("Expecting minhash with length %d, got %d" % (self.h, len(minhash)))
+        _check_scheme_consistency(getattr(self, "_minhash_scheme", None), minhash)
         fs = (
             hashtable.get(self._H(minhash.hashvalues[start:end]))
@@ -322,6 +329,7 @@ class AsyncMinHashLSH:
     async def _query_b(self, minhash, b):
         if len(minhash) != self.h:
             raise ValueError("Expecting minhash with length %d, got %d" % (self.h, len(minhash)))
+        _check_scheme_consistency(getattr(self, "_minhash_scheme", None), minhash)
         if b > len(self.hashtables):
             raise ValueError("b must be less or equal to the number of hash tables")
         fs = []

{datasketch-1.10.0 → datasketch-2.0.0}/datasketch/b_bit_minhash.py RENAMED Viewed

@@ -9,16 +9,37 @@ import struct
 import numpy as np
+from datasketch.minhash import _SCHEME_CODES, _SCHEME_CODES_INV, _SCHEME_LEGACY, _VALID_SCHEMES
 class bBitMinHash:
-    """The b-bit MinHash object."""
+    """b-bit MinHash of an existing :class:`datasketch.MinHash`.
+    b-bit MinHash reduces storage by keeping only the lowest `b` bits of
+    each minimum hash value, at some loss of accuracy. It supports
+    :meth:`jaccard` and pickle serialization, but cannot be updated with
+    new values and cannot be used with the LSH indexes. See `b-Bit Minwise
+    Hashing <http://research.microsoft.com/pubs/120078/wfc0398-liPS.pdf>`_
+    by Ping Li and Arnd Christian König.
+    Args:
+        minhash (datasketch.MinHash): The MinHash to compress.
+        b (int): The number of lowest bits to keep for each minimum hash
+            value, between 0 and 32.
+        r (float): The expected ratio of set size to the size of the
+            universe of all values, used by the Jaccard estimator. Leave
+            at 0.0 if unknown: the estimator then uses the limit as the
+            ratio goes to zero.
-    __slots__ = ("b", "hashvalues", "r", "seed")
+    """
+    __slots__ = ("b", "hashvalues", "r", "scheme", "seed")
     # seed as int64
     # b as uint8
     # r as float64
-    # num_perm as int32
+    # num_perm as int32 (negated and followed by a scheme code byte for
+    # non-legacy permutation schemes; legacy payloads predate the field)
     _serial_fmt_params = "<qBdi"
     # each block as uint64
     _serial_fmt_block = "Q"
@@ -37,6 +58,11 @@ class bBitMinHash:
         bmask = (1 << b) - 1
         self.hashvalues = np.bitwise_and(minhash.hashvalues, bmask).astype(np.uint32)
         self.seed = minhash.seed
+        # Requiring the attribute (rather than assuming a default) keeps a
+        # sketch type without a scheme from being silently mislabeled.
+        self.scheme = minhash.scheme
+        if self.scheme not in _VALID_SCHEMES:
+            raise ValueError("scheme must be one of %s, got %r" % (", ".join(_VALID_SCHEMES), self.scheme))
         self.b = b
         self.r = r
@@ -44,6 +70,7 @@ class bBitMinHash:
         """Check for full equality of two b-bit MinHash objects."""
         return (
             type(self) is type(other)
+            and self.scheme == other.scheme
             and self.seed == other.seed
             and self.b == other.b
             and self.r == other.r
@@ -56,13 +83,15 @@ class bBitMinHash:
         """
         if self.b != other.b:
             raise ValueError(
-                "Cannot compare two b-bit MinHashes with different\
-                    b values"
+                "Cannot compare two b-bit MinHashes with different b values"
+            )
+        if self.scheme != other.scheme:
+            raise ValueError(
+                "Cannot compare two b-bit MinHashes with different permutation schemes"
             )
         if self.seed != other.seed:
             raise ValueError(
-                "Cannot compare two b-bit MinHashes with different\
-                    set of permutations"
+                "Cannot compare two b-bit MinHashes with different set of permutations"
             )
         intersection = np.count_nonzero(self.hashvalues == other.hashvalues)
         raw_est = float(intersection) / float(self.hashvalues.size)
@@ -75,6 +104,13 @@ class bBitMinHash:
         """Get the serialized size of this b-bit MinHash in number of bytes."""
         return self._bytesize()[-1]
+    def _params_fmt(self):
+        """The struct format of the parameter header for this scheme."""
+        if self.scheme == _SCHEME_LEGACY:
+            return self._serial_fmt_params
+        # Non-legacy schemes append a scheme code byte to the parameters.
+        return self._serial_fmt_params + "B"
     def __getstate__(self):
         """Called when pickling the b-bit MinHash object.
         Returns a bytearray which will then be pickled.
@@ -96,8 +132,14 @@ class bBitMinHash:
                 # Doing this in BigInteger guarantees we do not experience overflow and still
                 # coerces to np.uint64 as expected.
                 blocks[i] = int(blocks[i]) | (int(hv) << (n - 1 - j) * slot_size)
-        fmt = self._serial_fmt_params + "%d%s" % (num_blocks, self._serial_fmt_block)
-        struct.pack_into(fmt, buffer, 0, self.seed, self.b, self.r, self.hashvalues.size, *blocks)
+        fmt = self._params_fmt() + "%d%s" % (num_blocks, self._serial_fmt_block)
+        if self.scheme == _SCHEME_LEGACY:
+            struct.pack_into(fmt, buffer, 0, self.seed, self.b, self.r, self.hashvalues.size, *blocks)
+        else:
+            # A negated size marks the post-2.0.0 format carrying a scheme code.
+            struct.pack_into(
+                fmt, buffer, 0, self.seed, self.b, self.r, -self.hashvalues.size, _SCHEME_CODES[self.scheme], *blocks
+            )
         return buffer
     def __setstate__(self, buf):
@@ -107,8 +149,19 @@ class bBitMinHash:
         try:
             self.seed, self.b, self.r, num_perm = struct.unpack_from(self._serial_fmt_params, buf, 0)
         except TypeError:
-            self.seed, self.b, self.r, num_perm = struct.unpack_from(self._serial_fmt_params, memoryview(buf), 0)
+            buf = memoryview(buf)
+            self.seed, self.b, self.r, num_perm = struct.unpack_from(self._serial_fmt_params, buf, 0)
         offset = struct.calcsize(self._serial_fmt_params)
+        if num_perm >= 0:
+            # Payloads from before version 2.0.0 have no scheme field.
+            self.scheme = _SCHEME_LEGACY
+        else:
+            num_perm = -num_perm
+            (scheme_code,) = struct.unpack_from("<B", buf, offset)
+            if scheme_code not in _SCHEME_CODES_INV:
+                raise ValueError("Unknown permutation scheme code: %d" % scheme_code)
+            self.scheme = _SCHEME_CODES_INV[scheme_code]
+            offset += 1
         self.hashvalues = np.zeros((num_perm,), dtype=np.uint32)
         # Reconstruct the hash values
         slot_size, n, num_blocks, _total = self._bytesize()
@@ -168,5 +221,5 @@ class bBitMinHash:
         # Get the number of blocks required
         num_blocks = int(np.ceil(float(self.hashvalues.size) / num_slots_per_block))
         # Get the total serialized size
-        total = struct.calcsize(self._serial_fmt_params + "%d%s" % (num_blocks, self._serial_fmt_block))
+        total = struct.calcsize(self._params_fmt() + "%d%s" % (num_blocks, self._serial_fmt_block))
         return slot_size, num_slots_per_block, num_blocks, total

{datasketch-1.10.0 → datasketch-2.0.0}/datasketch/hyperloglog.py RENAMED Viewed

@@ -265,7 +265,7 @@ class HyperLogLog:
                     different precisions"
             )
         reg = np.maximum.reduce([h.reg for h in hyperloglogs])
-        return cls(reg=reg)
+        return cls(reg=reg, hashfunc=hyperloglogs[0].hashfunc)
     def bytesize(self) -> int:
         """Get the size of the HyperLogLog in bytes."""

datasketch-2.0.0/datasketch/lean_minhash.py ADDED Viewed

@@ -0,0 +1,359 @@
+from __future__ import annotations
+import struct
+from collections.abc import Iterable
+from typing import Optional
+import numpy as np
+from datasketch.minhash import (
+    _SCHEME_AFFINE32,
+    _SCHEME_AFFINE64,
+    _SCHEME_CODES,
+    _SCHEME_CODES_INV,
+    _SCHEME_LEGACY,
+    _VALID_SCHEMES,
+    MinHash,
+)
+# Byte-format notes: legacy payloads have no scheme field and are identified
+# by a non-negative number-of-hash-values field, while the affine formats
+# store the negated number followed by a scheme code byte. This keeps legacy
+# sketches bit-identical to (and readable by) versions before 2.0.0.
+# struct format character of one hash value, per scheme.
+_SCHEME_VALUE_FMTS = {
+    _SCHEME_LEGACY: "I",
+    _SCHEME_AFFINE32: "I",
+    _SCHEME_AFFINE64: "Q",
+}
+class LeanMinHash(MinHash):
+    """Lean MinHash is MinHash with a smaller memory footprint
+    and faster deserialization, but with its internal state frozen
+    -- no `update()`.
+    Lean MinHash inherits all methods from :class:`datasketch.MinHash`.
+    It does not store the `permutations` and the `hashfunc` needed for updating.
+    If a MinHash does not need further updates, convert it into a lean MinHash
+    to save memory.
+    Example:
+        To create a lean MinHash from an existing MinHash:
+        .. code-block:: python
+            lean_minhash = LeanMinHash(minhash)
+            # You can compute the Jaccard similarity between two lean MinHash
+            lean_minhash.jaccard(lean_minhash2)
+            # Or between a lean MinHash and a MinHash
+            lean_minhash.jaccard(minhash2)
+        To create a lean MinHash from the hash values, seed, and scheme of an
+        existing MinHash:
+        .. code-block:: python
+            lean_minhash = LeanMinHash(
+                seed=minhash.seed,
+                hashvalues=minhash.hashvalues,
+                scheme=minhash.scheme,
+            )
+        To create a MinHash from a lean MinHash:
+        .. code-block:: python
+            minhash = MinHash(
+                seed=lean_minhash.seed,
+                hashvalues=lean_minhash.hashvalues,
+                scheme=lean_minhash.scheme,
+            )
+            # Or if you want to prevent further updates on minhash
+            # from affecting the state of lean_minhash
+            minhash = MinHash(
+                seed=lean_minhash.seed,
+                hashvalues=lean_minhash.digest(),
+                scheme=lean_minhash.scheme,
+            )
+    Note:
+        Lean MinHash can also be used in :class:`datasketch.MinHashLSH`,
+        :class:`datasketch.MinHashLSHForest`, and :class:`datasketch.MinHashLSHEnsemble`.
+    Args:
+        minhash (optional): The :class:`datasketch.MinHash` object used to
+            initialize the LeanMinHash. If this is not set, then `seed`
+            and `hashvalues` must be set.
+        seed (optional): The random seed that controls the set of random
+            permutation functions generated for this LeanMinHash. This parameter
+            must be used together with `hashvalues`.
+        hashvalues (optional): The hash values used to initialize the state
+            of the LeanMinHash. This parameter must be used together with
+            `seed`.
+        scheme (optional): The permutation scheme of the MinHash the
+            `hashvalues` were taken from. Required when initializing from
+            `seed` and `hashvalues` (use ``"legacy"`` for hash values created
+            by datasketch before 2.0.0), because hash values carry no trace
+            of the scheme that produced them. When `minhash` is set the
+            scheme is taken from the MinHash object instead, and this
+            argument may only repeat it.
+    """
+    __slots__ = ("hashvalues", "scheme", "seed")
+    def _initialize_slots(self, seed, hashvalues, scheme=_SCHEME_LEGACY):
+        """Initialize the slots of the LeanMinHash.
+        Args:
+            seed (int): The random seed controls the set of random
+                permutation functions generated for this LeanMinHash.
+            hashvalues (Iterable): The hash values is the internal state of the LeanMinHash.
+            scheme (str): The permutation scheme of the hash values.
+        """
+        if scheme not in _VALID_SCHEMES:
+            raise ValueError("scheme must be one of %s, got %r" % (", ".join(_VALID_SCHEMES), scheme))
+        self.seed = seed
+        self.scheme = scheme
+        self.hashvalues = self._parse_hashvalues(hashvalues)
+        if scheme != _SCHEME_LEGACY and len(self.hashvalues) == 0:
+            # An empty sketch would serialize with a hash value count of 0,
+            # which the deserializer cannot tell apart from the legacy format
+            # (identified by a non-negative count).
+            raise ValueError("hashvalues must not be empty")
+    def __init__(
+        self,
+        minhash: MinHash = None,
+        seed: Optional[int] = None,
+        hashvalues: Optional[Iterable] = None,
+        scheme: Optional[str] = None,
+    ):
+        if minhash is not None:
+            if scheme is not None and scheme != minhash.scheme:
+                raise ValueError(
+                    "scheme %r conflicts with the scheme %r of the given MinHash" % (scheme, minhash.scheme)
+                )
+            self._initialize_slots(minhash.seed, minhash.hashvalues, minhash.scheme)
+        elif hashvalues is not None and seed is not None:
+            if scheme is None:
+                # Hash values carry no trace of the scheme that produced
+                # them, so a default here would silently mislabel pre-2.0.0
+                # values and defeat the cross-scheme comparison guards.
+                raise ValueError(
+                    "scheme must be specified explicitly when initializing from existing "
+                    "hash values: pass the scheme of the MinHash they came from, or "
+                    "scheme='legacy' for hash values created by datasketch before 2.0.0."
+                )
+            self._initialize_slots(seed, hashvalues, scheme)
+        else:
+            raise ValueError(
+                "Init parameters cannot be None: make sure to set either minhash or both of hash values and seed"
+            )
+    def update(self, b) -> None:
+        """Not available on a LeanMinHash.
+        Calling it raises a TypeError.
+        """
+        raise TypeError("Cannot update a LeanMinHash")
+    def copy(self) -> LeanMinHash:
+        lmh = object.__new__(LeanMinHash)
+        lmh._initialize_slots(self.seed, self.hashvalues, self.scheme)
+        return lmh
+    def _value_fmt(self) -> str:
+        return _SCHEME_VALUE_FMTS[self.scheme]
+    def bytesize(self, byteorder="@") -> int:
+        """Compute the byte size after serialization.
+        Args:
+            byteorder (str, optional): This is byte order of the serialized data. Use one
+                of the `byte order characters
+                <https://docs.python.org/3/library/struct.html#byte-order-size-and-alignment>`_:
+                ``@``, ``=``, ``<``, ``>``, and ``!``.
+                Default is ``@`` -- the native order.
+        Returns:
+            int: Size in number of bytes after serialization.
+        """
+        if self.scheme == _SCHEME_LEGACY:
+            # 8 bytes for the seed, 4 bytes for the number of hash values,
+            # and 4 bytes for each hash value.
+            return struct.calcsize("%sqi%dI" % (byteorder, len(self)))
+        # The affine formats add a 1-byte scheme code after the number of
+        # hash values, and store each hash value in 4 ("affine32") or
+        # 8 ("affine64") bytes.
+        return struct.calcsize("%sqiB%d%s" % (byteorder, len(self), self._value_fmt()))
+    def serialize(self, buf, byteorder="@") -> None:
+        """Serialize this lean MinHash and store the result in an allocated buffer.
+        Args:
+            buf (buffer): `buf` must implement the `buffer`_ interface.
+                One such example is the built-in `bytearray`_ class.
+            byteorder (str, optional): This is byte order of the serialized data. Use one
+                of the `byte order characters
+                <https://docs.python.org/3/library/struct.html#byte-order-size-and-alignment>`_:
+                ``@``, ``=``, ``<``, ``>``, and ``!``.
+                Default is ``@`` -- the native order.
+        This is preferred over using `pickle`_ if the serialized lean MinHash needs
+        to be used by another program in a different programming language.
+        The serialization schema for the ``"legacy"`` scheme (identical to
+        versions before 2.0.0):
+            1. The first 8 bytes is the seed integer
+            2. The next 4 bytes is the number of hash values
+            3. The rest is the serialized hash values, each uses 4 bytes
+        The serialization schema for the affine schemes:
+            1. The first 8 bytes is the seed integer
+            2. The next 4 bytes is the **negated** number of hash values
+               (a negative value marks the post-2.0.0 format)
+            3. The next byte is the scheme code (1 for ``"affine32"``,
+               2 for ``"affine64"``)
+            4. The rest is the serialized hash values, each uses 4 bytes
+               for ``"affine32"`` and 8 bytes for ``"affine64"``
+        Example:
+            To serialize a single lean MinHash into a `bytearray`_ buffer.
+            .. code-block:: python
+                buf = bytearray(lean_minhash.bytesize())
+                lean_minhash.serialize(buf)
+            To serialize multiple lean MinHash into a `bytearray`_ buffer.
+            .. code-block:: python
+                # assuming lean_minhashs is a list of LeanMinHash with the same size
+                size = lean_minhashs[0].bytesize()
+                buf = bytearray(size * len(lean_minhashs))
+                for i, lean_minhash in enumerate(lean_minhashs):
+                    lean_minhash.serialize(buf[i * size :])
+        .. _`buffer`: https://docs.python.org/3/c-api/buffer.html
+        .. _`bytearray`: https://docs.python.org/3.6/library/functions.html#bytearray
+        .. _`byteorder`: https://docs.python.org/3/library/struct.html
+        """
+        if len(buf) < self.bytesize(byteorder):
+            raise ValueError(
+                "The buffer does not have enough space for holding this MinHash."
+            )
+        if self.scheme == _SCHEME_LEGACY:
+            fmt = "%sqi%dI" % (byteorder, len(self))
+            struct.pack_into(fmt, buf, 0, self.seed, len(self), *self.hashvalues)
+        else:
+            fmt = "%sqiB%d%s" % (byteorder, len(self), self._value_fmt())
+            struct.pack_into(fmt, buf, 0, self.seed, -len(self), _SCHEME_CODES[self.scheme], *self.hashvalues)
+    @classmethod
+    def deserialize(cls, buf, byteorder="@") -> LeanMinHash:
+        """Deserialize a lean MinHash from a buffer.
+        Buffers written by versions before 2.0.0 (which had no scheme field)
+        deserialize with ``scheme="legacy"``.
+        Args:
+            buf (buffer): `buf` must implement the `buffer`_ interface.
+                One such example is the built-in `bytearray`_ class.
+            byteorder (str, optional): This is byte order of the serialized data. Use one
+                of the `byte order characters
+                <https://docs.python.org/3/library/struct.html#byte-order-size-and-alignment>`_:
+                ``@``, ``=``, ``<``, ``>``, and ``!``.
+                Default is ``@`` -- the native order.
+        Return:
+            datasketch.LeanMinHash: The deserialized lean MinHash
+        Example:
+            To deserialize a lean MinHash from a buffer.
+            .. code-block:: python
+                lean_minhash = LeanMinHash.deserialize(buf)
+        """
+        fmt_seed_size = "%sqi" % byteorder
+        try:
+            seed, num_perm = struct.unpack_from(fmt_seed_size, buf, 0)
+        except TypeError:
+            buf = memoryview(buf)
+            seed, num_perm = struct.unpack_from(fmt_seed_size, buf, 0)
+        if num_perm >= 0:
+            scheme = _SCHEME_LEGACY
+            offset = struct.calcsize(fmt_seed_size)
+        else:
+            num_perm = -num_perm
+            (scheme_code,) = struct.unpack_from(byteorder + "B", buf, struct.calcsize(fmt_seed_size))
+            if scheme_code not in _SCHEME_CODES_INV:
+                raise ValueError("Unknown permutation scheme code: %d" % scheme_code)
+            scheme = _SCHEME_CODES_INV[scheme_code]
+            # The 0-count value entry aligns the offset without consuming data
+            # (only relevant for the native byte order "@").
+            offset = struct.calcsize("%sqiB0%s" % (byteorder, _SCHEME_VALUE_FMTS[scheme]))
+        fmt_hash = "%s%d%s" % (byteorder, num_perm, _SCHEME_VALUE_FMTS[scheme])
+        hashvalues = struct.unpack_from(fmt_hash, buf, offset)
+        lmh = object.__new__(LeanMinHash)
+        lmh._initialize_slots(seed, hashvalues, scheme)
+        return lmh
+    def __getstate__(self):
+        buf = bytearray(self.bytesize())
+        if self.scheme == _SCHEME_LEGACY:
+            fmt = "qi%dI" % len(self)
+            struct.pack_into(fmt, buf, 0, self.seed, len(self), *self.hashvalues)
+        else:
+            fmt = "qiB%d%s" % (len(self), self._value_fmt())
+            struct.pack_into(fmt, buf, 0, self.seed, -len(self), _SCHEME_CODES[self.scheme], *self.hashvalues)
+        return buf
+    def __setstate__(self, buf):
+        try:
+            seed, num_perm = struct.unpack_from("qi", buf, 0)
+        except TypeError:
+            buf = memoryview(buf)
+            seed, num_perm = struct.unpack_from("qi", buf, 0)
+        if num_perm >= 0:
+            scheme = _SCHEME_LEGACY
+            offset = struct.calcsize("qi")
+        else:
+            num_perm = -num_perm
+            (scheme_code,) = struct.unpack_from("B", buf, struct.calcsize("qi"))
+            if scheme_code not in _SCHEME_CODES_INV:
+                raise ValueError("Unknown permutation scheme code: %d" % scheme_code)
+            scheme = _SCHEME_CODES_INV[scheme_code]
+            offset = struct.calcsize("qiB0%s" % _SCHEME_VALUE_FMTS[scheme])
+        hashvalues = struct.unpack_from("%d%s" % (num_perm, _SCHEME_VALUE_FMTS[scheme]), buf, offset)
+        self._initialize_slots(seed, hashvalues, scheme)
+    def __hash__(self) -> int:
+        return hash((self.scheme, self.seed, tuple(self.hashvalues)))
+    @classmethod
+    def union(cls, *lmhs: LeanMinHash) -> LeanMinHash:
+        """Create a new lean MinHash by unioning multiple lean MinHash."""
+        if len(lmhs) < 2:
+            raise ValueError("Cannot union less than 2 MinHash")
+        num_perm = len(lmhs[0])
+        seed = lmhs[0].seed
+        scheme = lmhs[0].scheme
+        if any((seed != m.seed or num_perm != len(m) or scheme != m.scheme) for m in lmhs):
+            raise ValueError(
+                "The unioning MinHash must have the same seed, number of permutation functions and scheme."
+            )
+        hashvalues = np.minimum.reduce([m.hashvalues for m in lmhs])
+        lmh = object.__new__(LeanMinHash)
+        lmh._initialize_slots(seed, hashvalues, scheme)
+        return lmh

{datasketch-1.10.0 → datasketch-2.0.0}/datasketch/lsh.py RENAMED Viewed

@@ -7,7 +7,7 @@ from typing import Callable, List, Optional, Union
 from scipy.integrate import quad as integrate
-from datasketch.minhash import MinHash
+from datasketch.minhash import MinHash, _check_scheme_consistency
 from datasketch.storage import (
     OrderedStorage,
     UnorderedStorage,
@@ -198,6 +198,10 @@ class MinHashLSH:
         ]
         self.hashranges = [(i * self.r, (i + 1) * self.r) for i in range(self.b)]
         self.keys: OrderedStorage = ordered_storage(storage_config, name=b"".join([basename, b"_keys"]))
+        # The permutation scheme of the indexed MinHash, learned from the
+        # first insert. Note that an index attached to pre-existing external
+        # storage (e.g. Redis) re-learns the scheme on its first insert.
+        self._minhash_scheme: Optional[str] = None
     @property
     def buffer_size(self) -> int:
@@ -332,6 +336,7 @@ class MinHashLSH:
     ):
         if len(minhash) != self.h:
             raise ValueError("Expecting minhash with length %d, got %d" % (self.h, len(minhash)))
+        self._minhash_scheme = _check_scheme_consistency(getattr(self, "_minhash_scheme", None), minhash)
         if self._require_bytes_keys and not isinstance(key, bytes):
             raise TypeError(
                 f"prepickle=False requires bytes keys for non-dict storage, got {type(key).__name__}. "
@@ -355,8 +360,16 @@ class MinHashLSH:
     def _merge(self, other: MinHashLSH, check_overlap: bool = False, buffer: bool = False) -> None:
         if self.__equivalent(other):
+            known, other_known = getattr(self, "_minhash_scheme", None), getattr(other, "_minhash_scheme", None)
+            if known is not None and other_known is not None and known != other_known:
+                raise ValueError(
+                    "Cannot merge MinHashLSH indexed with MinHash scheme %r into one indexed with scheme %r"
+                    % (other_known, known)
+                )
             if check_overlap and set(self.keys).intersection(set(other.keys)):
                 raise ValueError("The keys are overlapping, duplicate key exists.")
+            if known is None:
+                self._minhash_scheme = other_known
             for key in other.keys:
                 Hs = other.keys.get(key)
                 self.keys.insert(key, *Hs, buffer=buffer)
@@ -422,6 +435,7 @@ class MinHashLSH:
         """
         if len(minhash) != self.h:
             raise ValueError("Expecting minhash with length %d, got %d" % (self.h, len(minhash)))
+        _check_scheme_consistency(getattr(self, "_minhash_scheme", None), minhash)
         candidates = set()
         for (start, end), hashtable in zip(self.hashranges, self.hashtables):
             H = self._H(minhash.hashvalues[start:end])
@@ -448,6 +462,7 @@ class MinHashLSH:
         """
         if len(minhash) != self.h:
             raise ValueError("Expecting minhash with length %d, got %d" % (self.h, len(minhash)))
+        _check_scheme_consistency(getattr(self, "_minhash_scheme", None), minhash)
         for (start, end), hashtable in zip(self.hashranges, self.hashtables):
             H = self._H(minhash.hashvalues[start:end])
             hashtable.add_to_select_buffer([H])
@@ -545,6 +560,7 @@ class MinHashLSH:
     def _query_b(self, minhash, b):
         if len(minhash) != self.h:
             raise ValueError("Expecting minhash with length %d, got %d" % (self.h, len(minhash)))
+        _check_scheme_consistency(getattr(self, "_minhash_scheme", None), minhash)
         if b > len(self.hashtables):
             raise ValueError("b must be less or equal to the number of hash tables")
         candidates = set()

datasketch 1.10.0__tar.gz → 2.0.0__tar.gz

datasketch 1.10.0__tar.gz → 2.0.0__tar.gz