PyPI - datasketch - Versions diffs - 1.6.4__tar.gz → 1.7.0__tar.gz - Mend

datasketch 1.6.4tar.gz → 1.7.0tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41) hide show

{datasketch-1.6.4 → datasketch-1.7.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.4
 Name: datasketch
-Version: 1.6.4
+Version: 1.7.0
 Summary: Probabilistic data structures for processing and searching very large datasets
 Home-page: https://ekzhu.github.io/datasketch
 Author: ekzhu
@@ -13,11 +13,11 @@ Classifier: Intended Audience :: Developers
 Classifier: Topic :: Database
 Classifier: Topic :: Scientific/Engineering :: Information Analysis
 Classifier: License :: OSI Approved :: MIT License
-Classifier: Programming Language :: Python :: 3.7
 Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
 License-File: LICENSE
 Requires-Dist: numpy>=1.11
 Requires-Dist: scipy>=1.0.0
@@ -25,6 +25,9 @@ Provides-Extra: cassandra
 Requires-Dist: cassandra-driver>=3.20; extra == "cassandra"
 Provides-Extra: redis
 Requires-Dist: redis>=2.10.0; extra == "redis"
+Provides-Extra: bloom
+Requires-Dist: pybloomfilter3>=0.7.2; python_version >= "3.9" and extra == "bloom"
+Requires-Dist: pybloomfiltermmap3==0.6.0; python_version < "3.9" and extra == "bloom"
 Provides-Extra: benchmark
 Requires-Dist: pyhash>=0.9.3; extra == "benchmark"
 Requires-Dist: matplotlib>=3.1.2; extra == "benchmark"
@@ -45,17 +48,29 @@ Requires-Dist: nose>=1.3.7; extra == "test"
 Requires-Dist: nose-exclude>=0.5.0; extra == "test"
 Requires-Dist: pytest; extra == "test"
 Provides-Extra: experimental-aio
-Requires-Dist: aiounittest; python_version >= "3.6" and extra == "experimental-aio"
-Requires-Dist: motor; python_version >= "3.6" and extra == "experimental-aio"
+Requires-Dist: aiounittest; python_version >= "3.8" and extra == "experimental-aio"
+Requires-Dist: motor>3.6.0; python_version >= "3.8" and extra == "experimental-aio"
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: home-page
+Dynamic: keywords
+Dynamic: license
+Dynamic: license-file
+Dynamic: project-url
+Dynamic: provides-extra
+Dynamic: requires-dist
+Dynamic: summary
 datasketch: Big Data Looks Small
 ================================
-.. image:: https://github.com/ekzhu/datasketch/workflows/Python%20package/badge.svg
-    :target: https://github.com/ekzhu/datasketch/actions
+.. image:: https://static.pepy.tech/badge/datasketch/month
+    :target: https://pepy.tech/project/datasketch
-.. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.290602.svg
-   :target: https://doi.org/10.5281/zenodo.290602
+.. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.598238.svg
+   :target: https://zenodo.org/doi/10.5281/zenodo.598238
 datasketch gives you probabilistic data structures that can process and
 search very large amount of data super fast, with little loss of
@@ -83,6 +98,8 @@ sub-linear query time:
 +===========================+=============================+========================+
 | `MinHash LSH`_            | MinHash, Weighted MinHash   | Jaccard Threshold      |
 +---------------------------+-----------------------------+------------------------+
+| `LSHBloom`_               | MinHash, Weighted MinHash   | Jaccard Threshold      |
++---------------------------+-----------------------------+------------------------+
 | `MinHash LSH Forest`_     | MinHash, Weighted MinHash   | Jaccard Top-K          |
 +---------------------------+-----------------------------+------------------------+
 | `MinHash LSH Ensemble`_   | MinHash                     | Containment Threshold  |
@@ -90,7 +107,7 @@ sub-linear query time:
 | `HNSW`_                   | Any                         | Custom Metric Top-K    |
 +---------------------------+-----------------------------+------------------------+
-datasketch must be used with Python 3.7 or above, NumPy 1.11 or above, and Scipy.
+datasketch must be used with Python 3.8 or above, NumPy 1.11 or above, and Scipy.
 Note that `MinHash LSH`_ and `MinHash LSH Ensemble`_ also support Redis and Cassandra
 storage layer (see `MinHash LSH at Scale`_).
@@ -118,6 +135,11 @@ To install with Cassandra dependency:
     pip install datasketch[cassandra]
+To install with Bloom filter dependency:
+::
+    pip install datasketch[bloom]
 .. _`MinHash`: https://ekzhu.github.io/datasketch/minhash.html
 .. _`Weighted MinHash`: https://ekzhu.github.io/datasketch/weightedminhash.html
@@ -126,5 +148,6 @@ To install with Cassandra dependency:
 .. _`MinHash LSH`: https://ekzhu.github.io/datasketch/lsh.html
 .. _`MinHash LSH Forest`: https://ekzhu.github.io/datasketch/lshforest.html
 .. _`MinHash LSH Ensemble`: https://ekzhu.github.io/datasketch/lshensemble.html
+.. _`LSHBloom`: https://ekzhu.github.io/datasketch/lshbloom.html
 .. _`Minhash LSH at Scale`: http://ekzhu.github.io/datasketch/lsh.html#minhash-lsh-at-scale
 .. _`HNSW`: https://ekzhu.github.io/datasketch/documentation.html#hnsw

{datasketch-1.6.4 → datasketch-1.7.0}/README.rst RENAMED Viewed

@@ -1,11 +1,11 @@
 datasketch: Big Data Looks Small
 ================================
-.. image:: https://github.com/ekzhu/datasketch/workflows/Python%20package/badge.svg
-    :target: https://github.com/ekzhu/datasketch/actions
+.. image:: https://static.pepy.tech/badge/datasketch/month
+    :target: https://pepy.tech/project/datasketch
-.. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.290602.svg
-   :target: https://doi.org/10.5281/zenodo.290602
+.. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.598238.svg
+   :target: https://zenodo.org/doi/10.5281/zenodo.598238
 datasketch gives you probabilistic data structures that can process and
 search very large amount of data super fast, with little loss of
@@ -33,6 +33,8 @@ sub-linear query time:
 +===========================+=============================+========================+
 | `MinHash LSH`_            | MinHash, Weighted MinHash   | Jaccard Threshold      |
 +---------------------------+-----------------------------+------------------------+
+| `LSHBloom`_               | MinHash, Weighted MinHash   | Jaccard Threshold      |
++---------------------------+-----------------------------+------------------------+
 | `MinHash LSH Forest`_     | MinHash, Weighted MinHash   | Jaccard Top-K          |
 +---------------------------+-----------------------------+------------------------+
 | `MinHash LSH Ensemble`_   | MinHash                     | Containment Threshold  |
@@ -40,7 +42,7 @@ sub-linear query time:
 | `HNSW`_                   | Any                         | Custom Metric Top-K    |
 +---------------------------+-----------------------------+------------------------+
-datasketch must be used with Python 3.7 or above, NumPy 1.11 or above, and Scipy.
+datasketch must be used with Python 3.8 or above, NumPy 1.11 or above, and Scipy.
 Note that `MinHash LSH`_ and `MinHash LSH Ensemble`_ also support Redis and Cassandra
 storage layer (see `MinHash LSH at Scale`_).
@@ -68,6 +70,11 @@ To install with Cassandra dependency:
     pip install datasketch[cassandra]
+To install with Bloom filter dependency:
+::
+    pip install datasketch[bloom]
 .. _`MinHash`: https://ekzhu.github.io/datasketch/minhash.html
 .. _`Weighted MinHash`: https://ekzhu.github.io/datasketch/weightedminhash.html
@@ -76,5 +83,6 @@ To install with Cassandra dependency:
 .. _`MinHash LSH`: https://ekzhu.github.io/datasketch/lsh.html
 .. _`MinHash LSH Forest`: https://ekzhu.github.io/datasketch/lshforest.html
 .. _`MinHash LSH Ensemble`: https://ekzhu.github.io/datasketch/lshensemble.html
+.. _`LSHBloom`: https://ekzhu.github.io/datasketch/lshbloom.html
 .. _`Minhash LSH at Scale`: http://ekzhu.github.io/datasketch/lsh.html#minhash-lsh-at-scale
 .. _`HNSW`: https://ekzhu.github.io/datasketch/documentation.html#hnsw

{datasketch-1.6.4 → datasketch-1.7.0}/datasketch/__init__.py RENAMED Viewed

@@ -2,6 +2,7 @@ from datasketch.hyperloglog import HyperLogLog, HyperLogLogPlusPlus
 from datasketch.minhash import MinHash
 from datasketch.b_bit_minhash import bBitMinHash
 from datasketch.lsh import MinHashLSH
+from datasketch.lsh_bloom import MinHashLSHBloom
 from datasketch.weighted_minhash import WeightedMinHash, WeightedMinHashGenerator
 from datasketch.lshforest import MinHashLSHForest
 from datasketch.lshensemble import MinHashLSHEnsemble

{datasketch-1.6.4 → datasketch-1.7.0}/datasketch/b_bit_minhash.py RENAMED Viewed

@@ -92,7 +92,14 @@ class bBitMinHash(object):
             hvs = self.hashvalues[start:start+n]
             # Store the n b-bit hashed values in the current block
             for j, hv in enumerate(hvs):
-                blocks[i] |= np.uint64(hv << (n - 1 - j) * slot_size)
+                # We do this with Python's arbitrary-precision int rather than np.uint64 because of
+                # inconsistencies in NumPy type coercion rules between NumPy 1.x and NumPy 2.x environments.
+                # In NumPy 2.x, implicit type conversion during bitwise operations is not
+                # performed, which can cause integer overflows. This, in turn, can corrupt
+                # hashvalues and cause pickled bBitMinHash objects to have the wrong representation.
+                # Doing this with Python ints guarantees we do not experience overflow and still
+                # coerces to np.uint64 as expected.
+                blocks[i] = int(blocks[i]) | (int(hv) << (n - 1 - j) * slot_size)
         fmt = self._serial_fmt_params + \
                 "%d%s" % (num_blocks, self._serial_fmt_block)
         struct.pack_into(fmt, buffer, 0, self.seed, self.b, self.r, \

{datasketch-1.6.4 → datasketch-1.7.0}/datasketch/lsh.py RENAMED Viewed

@@ -226,6 +226,29 @@ class MinHashLSH(object):
         """
         self._insert(key, minhash, check_duplication=check_duplication, buffer=False)
+    def merge(
+            self,
+            other: MinHashLSH,
+            check_overlap: bool = False
+    ):
+        """Merge the other MinHashLSH with this one, making this one the union
+        of both.
+        Note:
+            Only num_perm, the number of bands and the size of each band are checked for equivalence of the two MinHashLSH indexes.
+            Other initialization parameters threshold, weights, storage_config, prepickle and hash_func are not checked.
+        Args:
+            other (MinHashLSH): The other MinHashLSH.
+            check_overlap (bool): Check if there are any overlapping keys before merging and raise if there are any.
+                (`default=False`)
+        Raises:
+            ValueError: If the two MinHashLSH have different initialization
+                parameters, or if `check_overlap` is `True` and there are overlapping keys.
+        """
+        self._merge(other, check_overlap=check_overlap, buffer=False)
     def insertion_session(self, buffer_size: int = 50000) -> MinHashLSHInsertionSession:
         """
         Create a context manager for fast insertion into this index.
@@ -282,6 +305,38 @@ class MinHashLSH(object):
         for H, hashtable in zip(Hs, self.hashtables):
             hashtable.insert(H, key, buffer=buffer)
+    def __equivalent(self, other:MinHashLSH) -> bool:
+        """
+        Returns:
+            bool: If the two MinHashLSH have equal num_perm, number of bands, size of each band then two are equivalent.
+        """
+        return (
+            type(self) is type(other) and
+            self.h == other.h and
+            self.b == other.b and
+            self.r == other.r
+        )
+    def _merge(
+        self,
+        other: MinHashLSH,
+        check_overlap: bool = False,
+        buffer: bool = False
+    ) -> MinHashLSH:
+        if self.__equivalent(other):
+            if check_overlap and set(self.keys).intersection(set(other.keys)):
+                raise ValueError("The keys are overlapping, duplicate key exists.")
+            for key in other.keys:
+                Hs = other.keys.get(key)
+                self.keys.insert(key, *Hs, buffer=buffer)
+                for H, hashtable in zip(Hs, self.hashtables):
+                    hashtable.insert(H, key, buffer=buffer)
+        else:
+            if type(self) is not type(other):
+                raise ValueError(f"Cannot merge type MinHashLSH and type {type(other).__name__}.")
+            raise ValueError(
+                "Cannot merge MinHashLSH with different initialization parameters.")
     def query(self, minhash) -> List[Hashable]:
         """
         Giving the MinHash of the query set, retrieve

datasketch-1.7.0/datasketch/lsh_bloom.py ADDED Viewed

@@ -0,0 +1,335 @@
+from __future__ import annotations
+from typing import Callable, List, Optional, Tuple
+from datasketch.minhash import MinHash
+from scipy.integrate import quad as integrate
+import numpy as np
+import warnings
+import os
+try:
+	import pybloomfilter
+except ImportError:
+	pybloomfilter = None
+_mersenne_prime = np.uint64((1 << 61) - 1)
+def _false_positive_probability(threshold, b, r):
+	_probability = lambda s: 1 - (1 - s ** float(r)) ** float(b)
+	a, err = integrate(_probability, 0.0, threshold)
+	return a
+def _false_negative_probability(threshold, b, r):
+	_probability = lambda s: 1 - (1 - (1 - s ** float(r)) ** float(b))
+	a, err = integrate(_probability, threshold, 1.0)
+	return a
+def _optimal_param(threshold, num_perm, false_positive_weight, false_negative_weight):
+	"""
+	Compute the optimal `MinHashLSH` parameter that minimizes the weighted sum
+	of probabilities of false positive and false negative.
+	"""
+	min_error = float("inf")
+	opt = (0, 0)
+	for b in range(1, num_perm + 1):
+		max_r = int(num_perm / b)
+		for r in range(1, max_r + 1):
+			fp = _false_positive_probability(threshold, b, r)
+			fn = _false_negative_probability(threshold, b, r)
+			error = fp * false_positive_weight + fn * false_negative_weight
+			if error < min_error:
+				min_error = error
+				opt = (b, r)
+	return opt
+if pybloomfilter is not None:
+	class BloomTable:
+		"""
+		Interface to a Bloom Filter meant to model a single band of the MinHash signature matrix
+		Args:
+			item_count (int): Number of items expected to be inserted (size of dataset). Used to create Bloom filter.
+			fp (float): False positive rate for Bloom filter in (0,1).
+			band_size (int): Size of band from MinHash signature matrix this filter is meant to model.
+			fname (str): File path where Bloom filter will be saved. If this file already exists, will initialize the Bloom filter from this path.
+			max_size (int): Maximum number of elements we should plan to insert into this Bloom filter. Upper bounds the size of the Bloom filter.
+		"""
+		def __init__(self, item_count: int, fp: float, band_size: int, fname: str = None):
+			self.r = band_size
+			self.fname = fname
+			if fname is not None and os.path.exists(fname):
+				print(f"Loading Bloom Filter at {fname}...")
+				self.bloom_filter = pybloomfilter.BloomFilter.open(fname)
+			else:
+				self.bloom_filter = pybloomfilter.BloomFilter(
+					capacity=item_count,
+					error_rate=fp,
+					filename=self.fname
+				)
+		def sync(self):
+			if self.fname is not None:
+				self.bloom_filter.sync()
+			else:
+				warnings.warn("Attempting to save in-memory Bloom filter, this is a no-op.", RuntimeWarning)
+		def assert_size(self, hashvalues: List[int]):
+			if not len(hashvalues) == self.r:
+				raise RuntimeError(f"Invalid length for indices, {len(hashvalues)}, expected {self.r} hashvalues in band")
+		def insert(self, hashvalues: List[int]) -> None:
+			"""
+			Takes as input the indices for a single band and inserts them into the corresponding bit arrays
+			Args:
+				hashvalues (List[int]): The hashvalues from a single band of a MinHash object.
+			"""
+			self.assert_size(hashvalues)
+			# https://en.wikipedia.org/wiki/Universal_hashing#Hashing_vectors
+			# as the hashvalues are the result of a universal hashing function, their sum is also a universal hash function
+			x = sum(hashvalues) % _mersenne_prime
+			self.bloom_filter.add(x)
+		def query(self, hashvalues: List[int]) -> bool:
+			"""
+			Takes as input the hashvalues for a single band and queries them against the corresponding Bloom filter.
+			Returns True if the query returns True, otherwise returns False.
+			Args:
+				hashvalues (List[int]): The hashvalues from a single band of a MinHash object.
+			"""
+			self.assert_size(hashvalues)
+			x = sum(hashvalues) % _mersenne_prime
+			return x in self.bloom_filter
+else:
+	class BloomTable:
+		def __init__(self, item_count: int, fp: float, band_size: int, fname: str = None):
+			raise ImportError("Required dependency pybloomfilter is missing, did you `pip install datasketch[bloom]`?")
+class MinHashLSHBloom(object):
+	"""
+	The :ref:`lsh_bloom` index.
+	It supports query with `Jaccard similarity`_ threshold.
+	Reference: `LSHBloom paper
+	<https://arxiv.org/abs/2411.04257>`_.
+	Args:
+		threshold (float): The Jaccard similarity threshold between 0.0 and
+			1.0. The initialized LSH index will be optimized for the threshold by
+			minimizing the false positives and false negatives.
+		num_perm (int): The number of permutation functions used
+			by the MinHash to be indexed. For weighted MinHash, this
+			is the sample size (`sample_size`).
+		n (int): The number of elements to be inserted (estimate of dataset size).
+		fp (float): The false positive rate for each Bloom filter. Must be in (0,1).
+		save_dir (str): The directory to save the Bloom filter index to. If Bloom filters
+			already exist in this directory, the index will be loaded from here. If None,
+			an in-memory index will be created - this index can not be persisted.
+		weights (Tuple[float, float]): Used to adjust the relative importance of
+			minimizing false positive and false negative when optimizing
+			for the Jaccard similarity threshold.
+			`weights` is a tuple in the format of
+			:code:`(false_positive_weight, false_negative_weight)`.
+		params (Optional[Tuple[int, int]]): The LSH parameters (i.e., number of bands and size
+			of each band). This is used to bypass the parameter optimization
+			step in the constructor. `threshold` and `weights` will be ignored
+			if this is given.
+	Note:
+		This algorithm is a space optimized version of MinHashLSH.
+		For more details on :ref:`minhash_lsh`, see the documentation.
+		This algorithm uses Bloom filters to drastically reduce the space
+		that the LSH index occupies on disk. However, it loses the ability
+		to retrieve candidate duplicate keys. Rather, it can only tell you
+		whether a query set is a duplicate of a set that was inserted previously.
+		This enables scaling to datasets of many hundreds of millions or billions
+		of documents, but may not be appropriate for all use cases.
+	Examples:
+		Create an index with 128 permutation functions optimized for Jaccard
+		threshold 0.9:
+		.. code-block:: python
+			from datasketch import MinHash, MinHashLSHBloom
+			set1 = set(['minhash', 'is', 'a', 'probabilistic', 'data', 'structure', 'for',
+            			'estimating', 'the', 'similarity', 'between', 'datasets'])
+			set2 = set(['minhash', 'is', 'a', 'probability', 'data', 'structure', 'for',
+						'estimating', 'the', 'similarity', 'between', 'documents'])
+			set3 = set(['minhash', 'is', 'probability', 'data', 'structure', 'for',
+						'estimating', 'the', 'similarity', 'between', 'documents'])
+			m1 = MinHash(num_perm=128)
+			m2 = MinHash(num_perm=128)
+			m3 = MinHash(num_perm=128)
+			for d in set1:
+				m1.update(d.encode('utf8'))
+			for d in set2:
+				m2.update(d.encode('utf8'))
+			for d in set3:
+				m3.update(d.encode('utf8'))
+			# Create LSHBloom index
+			lsh = MinHashLSHBloom(threshold=0.5, num_perm=128, n=100, fp=0.0001, save_dir="./index/")
+			lsh.insert(m2)
+			lsh.insert(m3)
+			# Query whether m1 is a duplicate according to the given threshold
+			is_duplicate = lsh.query(m1)
+	"""
+	def __init__(
+		self,
+		threshold: float = 0.9,
+		num_perm: int = 128,
+		n: int = None,
+		fp: float = None,
+		save_dir: str = None,
+		weights: Tuple[float, float] = (0.5, 0.5),
+		params: Optional[Tuple[int, int]] = None,
+	) -> None:
+		if threshold > 1.0 or threshold < 0.0:
+			raise ValueError("threshold must be in [0.0, 1.0]")
+		if num_perm < 2:
+			raise ValueError("Too few permutation functions")
+		if n <= 0:
+			raise ValueError("n for LSHBloom must be >= 0")
+		if fp >= 1.0 or fp <= 0.0:
+			raise ValueError("fp must be in (0.0, 1.0)")
+		if save_dir is None:
+			warnings.warn("Creating LSHBloom index without save directory, this index will not be persisted.", RuntimeWarning)
+		if any(w < 0.0 or w > 1.0 for w in weights):
+			raise ValueError("Weight must be in [0.0, 1.0]")
+		if sum(weights) != 1.0:
+			raise ValueError("Weights must sum to 1.0")
+		self.h = num_perm
+		if params is not None:
+			self.b, self.r = params
+			if self.b * self.r > num_perm:
+				raise ValueError(
+					"The product of b and r in params is "
+					"{} * {} = {} -- it must be less than num_perm {}. "
+					"Did you forget to specify num_perm?".format(
+						self.b, self.r, self.b * self.r, num_perm
+					)
+				)
+		else:
+			false_positive_weight, false_negative_weight = weights
+			self.b, self.r = _optimal_param(
+				threshold, num_perm, false_positive_weight, false_negative_weight
+			)
+		if self.b < 2:
+			raise ValueError("The number of bands are too small (b < 2)")
+		# create a Bloom filter for each band in the signature matrix
+		if save_dir is not None:
+			os.makedirs(save_dir, exist_ok=True)
+		self.hashtables = [
+			BloomTable(
+					item_count=n,
+					fp=fp, band_size=self.r,
+					fname=os.path.join(save_dir, f"band-{i}.bf") if save_dir is not None else None,
+				)
+			for i in range(self.b)
+		]
+		self.hashranges = [(i * self.r, (i + 1) * self.r) for i in range(self.b)]
+	def insert(
+		self,
+		minhash: MinHash
+	):
+		"""
+		Insert the MinHash or Weighted MinHash
+		of a set to the index.
+		Args:
+			minhash (Union[MinHash, WeightedMinHash]): The MinHash of the set.
+		"""
+		self._insert(minhash)
+	def _insert(
+		self,
+		minhash: MinHash
+	):
+		if len(minhash) != self.h:
+			raise ValueError(
+				"Expecting minhash with length %d, got %d" % (self.h, len(minhash))
+			)
+		Hs = [minhash.hashvalues[start:end] for start, end in self.hashranges]
+		for H, hashtable in zip(Hs, self.hashtables):
+			hashtable.insert(H)
+	def query(self, minhash) -> bool:
+		"""
+		Given the MinHash of the query set, determine
+		whether any previously inserted sets have
+		Jaccard similarity with the query that is
+		likely greater than the threshold.
+		Results are based on minhash segment collision
+		and are thus approximate.
+		Args:
+			minhash (MinHash): The MinHash of the query set.
+		Returns:
+			bool: Whether the item is a duplicate or not, based on the given threshold.
+		Example:
+			.. code-block:: python
+				from datasketch import MinHash, MinHashLSHBloom
+				import numpy as np
+				# Generate 100 random MinHashes.
+				minhashes = MinHash.bulk(
+					np.random.randint(low=0, high=30, size=(100, 10)),
+					num_perm=128
+				)
+				# Create LSHBloom index.
+				lsh = MinHashLSHBloom(threshold=0.5, num_perm=128, n=100, fp=0.0001, save_dir="./index/")
+				for m in minhashes:
+					lsh.insert(m)
+				# Get the duplication result from LSHBloom.
+				query = minhashes[0]
+				is_duplicate = lsh.query(query)
+				print(is_duplicate)
+			Output:
+			.. code-block::
+				True
+			Note that although the threshold is set to 0.5, the results are not
+			guaranteed to be above 0.5 because the LSHBloom index is approximate and
+			the Jaccard similarity is estimated by MinHash.
+		"""
+		if len(minhash) != self.h:
+			raise ValueError(
+				"Expecting minhash with length %d, got %d" % (self.h, len(minhash))
+			)
+		# if we match in any band, this is a candidate pair
+		for (start, end), hashtable in zip(self.hashranges, self.hashtables):
+			H = minhash.hashvalues[start:end]
+			collision = hashtable.query(H)
+			if collision:
+				return True
+		return False
+	def sync(self):
+		print("Saving Bloom Index...")
+		for table in self.hashtables:
+			table.sync()

{datasketch-1.6.4 → datasketch-1.7.0}/datasketch/lshforest.py RENAMED Viewed

@@ -1,5 +1,6 @@
 from collections import defaultdict
 from typing import Hashable, List
+import numpy as np
 from datasketch.minhash import MinHash
@@ -128,6 +129,30 @@ class MinHashLSHForest(object):
             r -= 1
         return list(results)
+    def get_minhash_hashvalues(self, key: Hashable) -> np.ndarray:
+        """
+        Returns the hashvalues from the MinHash object that corresponds to the given key in the LSHForest,
+        if it exists. This is useful for when we want to reconstruct the original MinHash
+        object to manually check the Jaccard Similarity for the top-k results from a query.
+        Args:
+            key (Hashable): The key whose MinHash hashvalues we want to retrieve.
+        Returns:
+            hashvalues: The hashvalues for the MinHash object corresponding to the given key.
+        """
+        byteslist = self.keys.get(key, None)
+        if byteslist is None:
+            raise KeyError(f"The provided key does not exist in the LSHForest: {key}")
+        hashvalue_byte_size = len(byteslist[0])//8
+        hashvalues = np.empty(len(byteslist)*hashvalue_byte_size, dtype=np.uint64)
+        for index, item in enumerate(byteslist):
+            # unswap the bytes, as their representation is flipped during storage
+            hv_segment = np.frombuffer(item, dtype=np.uint64).byteswap()
+            curr_index = index*hashvalue_byte_size
+            hashvalues[curr_index:curr_index+hashvalue_byte_size] = hv_segment
+        return hashvalues
     def _binary_search(self, n, func):
         """
         https://golang.org/src/sort/search.go?s=2247:2287#L49

datasketch-1.7.0/datasketch/version.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ __version__ = "1.7.0"

{datasketch-1.6.4 → datasketch-1.7.0}/datasketch.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.4
 Name: datasketch
-Version: 1.6.4
+Version: 1.7.0
 Summary: Probabilistic data structures for processing and searching very large datasets
 Home-page: https://ekzhu.github.io/datasketch
 Author: ekzhu
@@ -13,11 +13,11 @@ Classifier: Intended Audience :: Developers
 Classifier: Topic :: Database
 Classifier: Topic :: Scientific/Engineering :: Information Analysis
 Classifier: License :: OSI Approved :: MIT License
-Classifier: Programming Language :: Python :: 3.7
 Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
 License-File: LICENSE
 Requires-Dist: numpy>=1.11
 Requires-Dist: scipy>=1.0.0
@@ -25,6 +25,9 @@ Provides-Extra: cassandra
 Requires-Dist: cassandra-driver>=3.20; extra == "cassandra"
 Provides-Extra: redis
 Requires-Dist: redis>=2.10.0; extra == "redis"
+Provides-Extra: bloom
+Requires-Dist: pybloomfilter3>=0.7.2; python_version >= "3.9" and extra == "bloom"
+Requires-Dist: pybloomfiltermmap3==0.6.0; python_version < "3.9" and extra == "bloom"
 Provides-Extra: benchmark
 Requires-Dist: pyhash>=0.9.3; extra == "benchmark"
 Requires-Dist: matplotlib>=3.1.2; extra == "benchmark"
@@ -45,17 +48,29 @@ Requires-Dist: nose>=1.3.7; extra == "test"
 Requires-Dist: nose-exclude>=0.5.0; extra == "test"
 Requires-Dist: pytest; extra == "test"
 Provides-Extra: experimental-aio
-Requires-Dist: aiounittest; python_version >= "3.6" and extra == "experimental-aio"
-Requires-Dist: motor; python_version >= "3.6" and extra == "experimental-aio"
+Requires-Dist: aiounittest; python_version >= "3.8" and extra == "experimental-aio"
+Requires-Dist: motor>3.6.0; python_version >= "3.8" and extra == "experimental-aio"
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: home-page
+Dynamic: keywords
+Dynamic: license
+Dynamic: license-file
+Dynamic: project-url
+Dynamic: provides-extra
+Dynamic: requires-dist
+Dynamic: summary
 datasketch: Big Data Looks Small
 ================================
-.. image:: https://github.com/ekzhu/datasketch/workflows/Python%20package/badge.svg
-    :target: https://github.com/ekzhu/datasketch/actions
+.. image:: https://static.pepy.tech/badge/datasketch/month
+    :target: https://pepy.tech/project/datasketch
-.. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.290602.svg
-   :target: https://doi.org/10.5281/zenodo.290602
+.. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.598238.svg
+   :target: https://zenodo.org/doi/10.5281/zenodo.598238
 datasketch gives you probabilistic data structures that can process and
 search very large amount of data super fast, with little loss of
@@ -83,6 +98,8 @@ sub-linear query time:
 +===========================+=============================+========================+
 | `MinHash LSH`_            | MinHash, Weighted MinHash   | Jaccard Threshold      |
 +---------------------------+-----------------------------+------------------------+
+| `LSHBloom`_               | MinHash, Weighted MinHash   | Jaccard Threshold      |
++---------------------------+-----------------------------+------------------------+
 | `MinHash LSH Forest`_     | MinHash, Weighted MinHash   | Jaccard Top-K          |
 +---------------------------+-----------------------------+------------------------+
 | `MinHash LSH Ensemble`_   | MinHash                     | Containment Threshold  |
@@ -90,7 +107,7 @@ sub-linear query time:
 | `HNSW`_                   | Any                         | Custom Metric Top-K    |
 +---------------------------+-----------------------------+------------------------+
-datasketch must be used with Python 3.7 or above, NumPy 1.11 or above, and Scipy.
+datasketch must be used with Python 3.8 or above, NumPy 1.11 or above, and Scipy.
 Note that `MinHash LSH`_ and `MinHash LSH Ensemble`_ also support Redis and Cassandra
 storage layer (see `MinHash LSH at Scale`_).
@@ -118,6 +135,11 @@ To install with Cassandra dependency:
     pip install datasketch[cassandra]
+To install with Bloom filter dependency:
+::
+    pip install datasketch[bloom]
 .. _`MinHash`: https://ekzhu.github.io/datasketch/minhash.html
 .. _`Weighted MinHash`: https://ekzhu.github.io/datasketch/weightedminhash.html
@@ -126,5 +148,6 @@ To install with Cassandra dependency:
 .. _`MinHash LSH`: https://ekzhu.github.io/datasketch/lsh.html
 .. _`MinHash LSH Forest`: https://ekzhu.github.io/datasketch/lshforest.html
 .. _`MinHash LSH Ensemble`: https://ekzhu.github.io/datasketch/lshensemble.html
+.. _`LSHBloom`: https://ekzhu.github.io/datasketch/lshbloom.html
 .. _`Minhash LSH at Scale`: http://ekzhu.github.io/datasketch/lsh.html#minhash-lsh-at-scale
 .. _`HNSW`: https://ekzhu.github.io/datasketch/documentation.html#hnsw

{datasketch-1.6.4 → datasketch-1.7.0}/datasketch.egg-info/SOURCES.txt RENAMED Viewed

@@ -9,6 +9,7 @@ datasketch/hyperloglog.py
 datasketch/hyperloglog_const.py
 datasketch/lean_minhash.py
 datasketch/lsh.py
+datasketch/lsh_bloom.py
 datasketch/lshensemble.py
 datasketch/lshensemble_partition.py
 datasketch/lshforest.py
@@ -30,6 +31,7 @@ test/test_hyperloglog.py
 test/test_lean_minhash.py
 test/test_lsh.py
 test/test_lsh_cassandra.py
+test/test_lshbloom.py
 test/test_lshensemble.py
 test/test_lshforest.py
 test/test_minhash.py

{datasketch-1.6.4 → datasketch-1.7.0}/datasketch.egg-info/requires.txt RENAMED Viewed

@@ -11,14 +11,22 @@ SetSimilaritySearch>=0.1.7
 pyfarmhash>=0.2.2
 nltk>=3.4.5
+[bloom]
+[bloom:python_version < "3.9"]
+pybloomfiltermmap3==0.6.0
+[bloom:python_version >= "3.9"]
+pybloomfilter3>=0.7.2
 [cassandra]
 cassandra-driver>=3.20
 [experimental_aio]
-[experimental_aio:python_version >= "3.6"]
+[experimental_aio:python_version >= "3.8"]
 aiounittest
-motor
+motor>3.6.0
 [redis]
 redis>=2.10.0

{datasketch-1.6.4 → datasketch-1.7.0}/setup.py RENAMED Viewed

@@ -39,11 +39,11 @@ setup(
         'Topic :: Database',
         'Topic :: Scientific/Engineering :: Information Analysis',
         'License :: OSI Approved :: MIT License',
-        'Programming Language :: Python :: 3.7',
         'Programming Language :: Python :: 3.8',
         'Programming Language :: Python :: 3.9',
         'Programming Language :: Python :: 3.10',
         'Programming Language :: Python :: 3.11',
+        'Programming Language :: Python :: 3.12',
     ],
     keywords='database datamining',
     packages=find_packages(include=['datasketch*']),
@@ -58,6 +58,10 @@ setup(
         'redis': [
             'redis>=2.10.0',
         ],
+        'bloom': [
+            'pybloomfilter3>=0.7.2 ; python_version>="3.9"',
+            'pybloomfiltermmap3==0.6.0 ; python_version<"3.9"',
+        ],
         'benchmark': [
             'pyhash>=0.9.3',
             'matplotlib>=3.1.2',
@@ -80,8 +84,8 @@ setup(
             'pytest',
         ],
         'experimental_aio': [
-            "aiounittest ; python_version>='3.6'",
-            "motor ; python_version>='3.6'",
+            "aiounittest ; python_version>='3.8'",
+            "motor>3.6.0 ; python_version>='3.8'",
         ],
     },
 )

{datasketch-1.6.4 → datasketch-1.7.0}/test/test_lsh.py RENAMED Viewed

@@ -240,6 +240,117 @@ class TestMinHashLSH(unittest.TestCase):
         for table in counts:
             self.assertEqual(sum(table.values()), 2)
+    def test_merge(self):
+        lsh1 = MinHashLSH(threshold=0.5, num_perm=16)
+        m1 = MinHash(16)
+        m1.update("a".encode("utf-8"))
+        m2 = MinHash(16)
+        m2.update("b".encode("utf-8"))
+        lsh1.insert("a",m1)
+        lsh1.insert("b",m2)
+        lsh2 = MinHashLSH(threshold=0.5, num_perm=16)
+        m3 = MinHash(16)
+        m3.update("c".encode("utf-8"))
+        m4 = MinHash(16)
+        m4.update("d".encode("utf-8"))
+        lsh2.insert("c",m1)
+        lsh2.insert("d",m2)
+        lsh1.merge(lsh2)
+        for t in lsh1.hashtables:
+            self.assertTrue(len(t) >= 1)
+            items = []
+            for H in t:
+                items.extend(t[H])
+            self.assertTrue("c" in items)
+            self.assertTrue("d" in items)
+        self.assertTrue("a" in lsh1)
+        self.assertTrue("b" in lsh1)
+        self.assertTrue("c" in lsh1)
+        self.assertTrue("d" in lsh1)
+        for i, H in enumerate(lsh1.keys["c"]):
+            self.assertTrue("c" in lsh1.hashtables[i][H])
+        self.assertTrue(lsh1.merge, lsh2)
+        self.assertRaises(ValueError, lsh1.merge, lsh2, check_overlap=True)
+        m5 = MinHash(16)
+        m5.update("e".encode("utf-8"))
+        lsh3 = MinHashLSH(threshold=0.5, num_perm=16)
+        lsh3.insert("a",m5)
+        self.assertRaises(ValueError, lsh1.merge, lsh3, check_overlap=True)
+        lsh1.merge(lsh3)
+        m6 = MinHash(16)
+        m6.update("e".encode("utf-8"))
+        lsh4 = MinHashLSH(threshold=0.5, num_perm=16)
+        lsh4.insert("a",m6)
+        lsh1.merge(lsh4, check_overlap=False)
+    def test_merge_redis(self):
+        with patch('redis.Redis', fake_redis) as mock_redis:
+            lsh1 = MinHashLSH(threshold=0.5, num_perm=16, storage_config={
+                'type': 'redis', 'redis': {'host': 'localhost', 'port': 6379}
+            })
+            lsh2 = MinHashLSH(threshold=0.5, num_perm=16, storage_config={
+                'type': 'redis', 'redis': {'host': 'localhost', 'port': 6379}
+            })
+            m1 = MinHash(16)
+            m1.update("a".encode("utf8"))
+            m2 = MinHash(16)
+            m2.update("b".encode("utf8"))
+            lsh1.insert("a", m1)
+            lsh1.insert("b", m2)
+            m3 = MinHash(16)
+            m3.update("c".encode("utf8"))
+            m4 = MinHash(16)
+            m4.update("d".encode("utf8"))
+            lsh2.insert("c", m3)
+            lsh2.insert("d", m4)
+            lsh1.merge(lsh2)
+            for t in lsh1.hashtables:
+                self.assertTrue(len(t) >= 1)
+                items = []
+                for H in t:
+                    items.extend(t[H])
+                self.assertTrue(pickle.dumps("c") in items)
+                self.assertTrue(pickle.dumps("d") in items)
+            self.assertTrue("a" in lsh1)
+            self.assertTrue("b" in lsh1)
+            self.assertTrue("c" in lsh1)
+            self.assertTrue("d" in lsh1)
+            for i, H in enumerate(lsh1.keys[pickle.dumps("c")]):
+                self.assertTrue(pickle.dumps("c") in lsh1.hashtables[i][H])
+            self.assertTrue(lsh1.merge, lsh2)
+            self.assertRaises(ValueError, lsh1.merge, lsh2, check_overlap=True)
+            m5 = MinHash(16)
+            m5.update("e".encode("utf-8"))
+            lsh3 = MinHashLSH(threshold=0.5, num_perm=16, storage_config={
+                'type': 'redis', 'redis': {'host': 'localhost', 'port': 6379}
+            })
+            lsh3.insert("a",m5)
+            self.assertRaises(ValueError, lsh1.merge, lsh3, check_overlap=True)
+            m6 = MinHash(16)
+            m6.update("e".encode("utf-8"))
+            lsh4 = MinHashLSH(threshold=0.5, num_perm=16, storage_config={
+                'type': 'redis', 'redis': {'host': 'localhost', 'port': 6379}
+            })
+            lsh4.insert("a",m6)
+            lsh1.merge(lsh4, check_overlap=False)
 class TestWeightedMinHashLSH(unittest.TestCase):

datasketch-1.7.0/test/test_lshbloom.py ADDED Viewed

@@ -0,0 +1,126 @@
+import unittest
+import pickle
+from glob import glob
+from datasketch.lsh_bloom import BloomTable, MinHashLSHBloom
+from datasketch.minhash import MinHash
+import numpy as np
+import os
+class TestBloomTable(unittest.TestCase):  # Unit tests for BloomTable insert/query and file-backed persistence.
+	def test_insert(self):  # Inserting a 3-element band succeeds; a 2-element array must raise RuntimeError.
+		r = 3  # band size handed to BloomTable via band_size
+		x = np.array([2,3,31], dtype=np.uint32)
+		b = BloomTable(10, 0.01, band_size=r)  # presumably (capacity, false-positive rate) - TODO confirm against BloomTable
+		b.insert(x)
+		self.assertRaises(RuntimeError, b.insert, np.array([2,2], dtype=np.uint32))  # length 2 != band_size 3
+	def test_query(self):  # An inserted band is found, a different band is not, and a wrong-length input raises.
+		r = 3
+		x = np.array([2,3,31], dtype=np.uint32)
+		b = BloomTable(10, 0.01, band_size=r)
+		b.insert(x)
+		self.assertTrue(b.query(x))
+		self.assertFalse(b.query(np.array([2,3,30], dtype=np.uint32)))  # differs from x only in the last element
+		self.assertRaises(RuntimeError, b.query, [2,2])  # plain list with 2 elements, not a 3-element array
+	def test_save(self):  # Items inserted into a file-backed table must still be found after reopening the file.
+		fname = "/tmp/bloomfilter.bf"  # NOTE(review): hard-coded POSIX temp path; consider tempfile for portability
+		if os.path.exists(fname):  # start from a clean slate if an earlier run left the file behind
+			os.remove(fname)
+		r = 3
+		x = np.array([2,3,31], dtype=np.uint32)
+		y = np.array([12,10,29], dtype=np.uint32)
+		z = np.array([27,30,8], dtype=np.uint32)
+		items = [x,y,z]
+		b = BloomTable(10, 0.01, band_size=r, fname=fname)
+		for item in items:
+			b.insert(item)
+		for item in items:
+			self.assertTrue(b.query(item))
+		b.sync()  # presumably flushes the filter to fname - TODO confirm
+		del b  # drop the first handle before reopening the same file
+		b_ = BloomTable(10, 0.01, band_size=r, fname=fname)  # second table built on the same fname
+		for item in items:
+			self.assertTrue(b_.query(item))
+class TestMinHashLSHBloom(unittest.TestCase):  # Unit tests for MinHashLSHBloom construction, insert/query and persistence.
+	def test_init(self):  # weights=(0.2, 0.8) must yield more bands (b) and fewer rows (r) than the default.
+		lsh = MinHashLSHBloom(threshold=0.8, n=10, fp=0.01)
+		b1, r1 = lsh.b, lsh.r
+		lsh = MinHashLSHBloom(threshold=0.8, weights=(0.2,0.8), n=10, fp=0.01)
+		b2, r2 = lsh.b, lsh.r
+		self.assertTrue(b1 < b2)
+		self.assertTrue(r1 > r2)
+		self.assertTrue(len(lsh.hashtables) == lsh.b)  # one hashtable per band
+	def test_insert(self):  # Matching-size MinHashes insert cleanly; a mismatched one raises ValueError.
+		lsh = MinHashLSHBloom(threshold=0.5, num_perm=16, n=10, fp=0.01)
+		m1 = MinHash(16)
+		m1.update("a".encode("utf8"))
+		m2 = MinHash(16)
+		m2.update("b".encode("utf8"))
+		lsh.insert(m1)
+		lsh.insert(m2)
+		m3 = MinHash(18)  # 18 does not match the index's num_perm=16
+		self.assertRaises(ValueError, lsh.insert, m3)
+	def test_query(self):  # Inserted MinHashes query truthy; a mismatched MinHash raises ValueError.
+		lsh = MinHashLSHBloom(threshold=0.5, num_perm=16, n=10, fp=0.01)
+		m1 = MinHash(16)
+		m1.update("a".encode("utf8"))
+		m2 = MinHash(16)
+		m2.update("b".encode("utf8"))
+		lsh.insert(m1)
+		lsh.insert(m2)
+		result = lsh.query(m1)
+		self.assertTrue(result)
+		result = lsh.query(m2)
+		self.assertTrue(result)
+		m3 = MinHash(18)  # 18 does not match the index's num_perm=16
+		self.assertRaises(ValueError, lsh.query, m3)
+	def test_save(self):  # A second index created on the same save_dir must find the earlier inserts.
+		save_path = "./test_save/"  # NOTE(review): relative to the current working directory and not cleaned up afterwards
+		for item in glob(f"{save_path}/*.bf"):  # remove .bf files left by a previous run
+			os.remove(item)
+		lsh = MinHashLSHBloom(threshold=0.5, num_perm=16, n=10, fp=0.01, save_dir=save_path)
+		m1 = MinHash(16)
+		m1.update("a".encode("utf8"))
+		m2 = MinHash(16)
+		m2.update("b".encode("utf8"))
+		lsh.insert(m1)
+		lsh.insert(m2)
+		lsh.sync()  # presumably persists the filters under save_dir - TODO confirm
+		lsh2 = MinHashLSHBloom(threshold=0.5, num_perm=16, n=10, fp=0.01, save_dir=save_path)
+		result = lsh2.query(m1)
+		self.assertTrue(result)
+		result = lsh2.query(m2)
+		self.assertTrue(result)
+	def test_save_in_memory(self):  # With save_dir=None both construction and sync() must emit RuntimeWarning.
+		with self.assertWarns(RuntimeWarning):
+			lsh = MinHashLSHBloom(threshold=0.5, num_perm=16, n=10, fp=0.01, save_dir=None)
+		m1 = MinHash(16)
+		m1.update("a".encode("utf8"))
+		m2 = MinHash(16)
+		m2.update("b".encode("utf8"))
+		lsh.insert(m1)
+		lsh.insert(m2)
+		with self.assertWarns(RuntimeWarning):
+			lsh.sync()
+if __name__ == "__main__":  # run the tests only when this file is executed as a script
+	unittest.main()

{datasketch-1.6.4 → datasketch-1.7.0}/test/test_lshforest.py RENAMED Viewed

@@ -62,6 +62,18 @@ class TestMinHashLSHForest(unittest.TestCase):
             results = forest.query(data[key], 10)
             self.assertIn(key, results)
+    def test_get_minhash_hashvalues(self):
+        forest, data = self._setup()
+        for key in data:
+            minhash_ori = data[key]
+            hashvalues = forest.get_minhash_hashvalues(key)
+            minhash_retrieved = MinHash(hashvalues=hashvalues)
+            retrieved_hashvalues = minhash_retrieved.hashvalues
+            self.assertEqual(len(hashvalues), len(retrieved_hashvalues))
+            self.assertEqual(minhash_retrieved.jaccard(minhash_ori), 1.0)
+            for i in range(len(retrieved_hashvalues)):
+                self.assertEqual(hashvalues[i], retrieved_hashvalues[i])
     def test_pickle(self):
         forest, _ = self._setup()
         forest2 = pickle.loads(pickle.dumps(forest))