datasketch 1.6.5__tar.gz → 1.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. {datasketch-1.6.5 → datasketch-1.7.0}/PKG-INFO +29 -6
  2. {datasketch-1.6.5 → datasketch-1.7.0}/README.rst +9 -1
  3. {datasketch-1.6.5 → datasketch-1.7.0}/datasketch/__init__.py +1 -0
  4. {datasketch-1.6.5 → datasketch-1.7.0}/datasketch/b_bit_minhash.py +8 -1
  5. datasketch-1.7.0/datasketch/lsh_bloom.py +335 -0
  6. datasketch-1.7.0/datasketch/version.py +1 -0
  7. {datasketch-1.6.5 → datasketch-1.7.0}/datasketch.egg-info/PKG-INFO +29 -6
  8. {datasketch-1.6.5 → datasketch-1.7.0}/datasketch.egg-info/SOURCES.txt +2 -0
  9. {datasketch-1.6.5 → datasketch-1.7.0}/datasketch.egg-info/requires.txt +10 -2
  10. {datasketch-1.6.5 → datasketch-1.7.0}/setup.py +7 -3
  11. datasketch-1.7.0/test/test_lshbloom.py +126 -0
  12. datasketch-1.6.5/datasketch/version.py +0 -1
  13. {datasketch-1.6.5 → datasketch-1.7.0}/LICENSE +0 -0
  14. {datasketch-1.6.5 → datasketch-1.7.0}/datasketch/experimental/__init__.py +0 -0
  15. {datasketch-1.6.5 → datasketch-1.7.0}/datasketch/experimental/aio/__init__.py +0 -0
  16. {datasketch-1.6.5 → datasketch-1.7.0}/datasketch/experimental/aio/lsh.py +0 -0
  17. {datasketch-1.6.5 → datasketch-1.7.0}/datasketch/experimental/aio/storage.py +0 -0
  18. {datasketch-1.6.5 → datasketch-1.7.0}/datasketch/hashfunc.py +0 -0
  19. {datasketch-1.6.5 → datasketch-1.7.0}/datasketch/hnsw.py +0 -0
  20. {datasketch-1.6.5 → datasketch-1.7.0}/datasketch/hyperloglog.py +0 -0
  21. {datasketch-1.6.5 → datasketch-1.7.0}/datasketch/hyperloglog_const.py +0 -0
  22. {datasketch-1.6.5 → datasketch-1.7.0}/datasketch/lean_minhash.py +0 -0
  23. {datasketch-1.6.5 → datasketch-1.7.0}/datasketch/lsh.py +0 -0
  24. {datasketch-1.6.5 → datasketch-1.7.0}/datasketch/lshensemble.py +0 -0
  25. {datasketch-1.6.5 → datasketch-1.7.0}/datasketch/lshensemble_partition.py +0 -0
  26. {datasketch-1.6.5 → datasketch-1.7.0}/datasketch/lshforest.py +0 -0
  27. {datasketch-1.6.5 → datasketch-1.7.0}/datasketch/minhash.py +0 -0
  28. {datasketch-1.6.5 → datasketch-1.7.0}/datasketch/storage.py +0 -0
  29. {datasketch-1.6.5 → datasketch-1.7.0}/datasketch/weighted_minhash.py +0 -0
  30. {datasketch-1.6.5 → datasketch-1.7.0}/datasketch.egg-info/dependency_links.txt +0 -0
  31. {datasketch-1.6.5 → datasketch-1.7.0}/datasketch.egg-info/top_level.txt +0 -0
  32. {datasketch-1.6.5 → datasketch-1.7.0}/setup.cfg +0 -0
  33. {datasketch-1.6.5 → datasketch-1.7.0}/test/test_hnsw.py +0 -0
  34. {datasketch-1.6.5 → datasketch-1.7.0}/test/test_hyperloglog.py +0 -0
  35. {datasketch-1.6.5 → datasketch-1.7.0}/test/test_lean_minhash.py +0 -0
  36. {datasketch-1.6.5 → datasketch-1.7.0}/test/test_lsh.py +0 -0
  37. {datasketch-1.6.5 → datasketch-1.7.0}/test/test_lsh_cassandra.py +0 -0
  38. {datasketch-1.6.5 → datasketch-1.7.0}/test/test_lshensemble.py +0 -0
  39. {datasketch-1.6.5 → datasketch-1.7.0}/test/test_lshforest.py +0 -0
  40. {datasketch-1.6.5 → datasketch-1.7.0}/test/test_minhash.py +0 -0
  41. {datasketch-1.6.5 → datasketch-1.7.0}/test/test_weighted_minhash.py +0 -0
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.4
2
2
  Name: datasketch
3
- Version: 1.6.5
3
+ Version: 1.7.0
4
4
  Summary: Probabilistic data structures for processing and searching very large datasets
5
5
  Home-page: https://ekzhu.github.io/datasketch
6
6
  Author: ekzhu
@@ -13,11 +13,11 @@ Classifier: Intended Audience :: Developers
13
13
  Classifier: Topic :: Database
14
14
  Classifier: Topic :: Scientific/Engineering :: Information Analysis
15
15
  Classifier: License :: OSI Approved :: MIT License
16
- Classifier: Programming Language :: Python :: 3.7
17
16
  Classifier: Programming Language :: Python :: 3.8
18
17
  Classifier: Programming Language :: Python :: 3.9
19
18
  Classifier: Programming Language :: Python :: 3.10
20
19
  Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
21
  License-File: LICENSE
22
22
  Requires-Dist: numpy>=1.11
23
23
  Requires-Dist: scipy>=1.0.0
@@ -25,6 +25,9 @@ Provides-Extra: cassandra
25
25
  Requires-Dist: cassandra-driver>=3.20; extra == "cassandra"
26
26
  Provides-Extra: redis
27
27
  Requires-Dist: redis>=2.10.0; extra == "redis"
28
+ Provides-Extra: bloom
29
+ Requires-Dist: pybloomfilter3>=0.7.2; python_version >= "3.9" and extra == "bloom"
30
+ Requires-Dist: pybloomfiltermmap3==0.6.0; python_version < "3.9" and extra == "bloom"
28
31
  Provides-Extra: benchmark
29
32
  Requires-Dist: pyhash>=0.9.3; extra == "benchmark"
30
33
  Requires-Dist: matplotlib>=3.1.2; extra == "benchmark"
@@ -45,8 +48,20 @@ Requires-Dist: nose>=1.3.7; extra == "test"
45
48
  Requires-Dist: nose-exclude>=0.5.0; extra == "test"
46
49
  Requires-Dist: pytest; extra == "test"
47
50
  Provides-Extra: experimental-aio
48
- Requires-Dist: aiounittest; python_version >= "3.6" and extra == "experimental-aio"
49
- Requires-Dist: motor; python_version >= "3.6" and extra == "experimental-aio"
51
+ Requires-Dist: aiounittest; python_version >= "3.8" and extra == "experimental-aio"
52
+ Requires-Dist: motor>3.6.0; python_version >= "3.8" and extra == "experimental-aio"
53
+ Dynamic: author
54
+ Dynamic: author-email
55
+ Dynamic: classifier
56
+ Dynamic: description
57
+ Dynamic: home-page
58
+ Dynamic: keywords
59
+ Dynamic: license
60
+ Dynamic: license-file
61
+ Dynamic: project-url
62
+ Dynamic: provides-extra
63
+ Dynamic: requires-dist
64
+ Dynamic: summary
50
65
 
51
66
  datasketch: Big Data Looks Small
52
67
  ================================
@@ -83,6 +98,8 @@ sub-linear query time:
83
98
  +===========================+=============================+========================+
84
99
  | `MinHash LSH`_ | MinHash, Weighted MinHash | Jaccard Threshold |
85
100
  +---------------------------+-----------------------------+------------------------+
101
+ | `LSHBloom`_ | MinHash, Weighted MinHash | Jaccard Threshold |
102
+ +---------------------------+-----------------------------+------------------------+
86
103
  | `MinHash LSH Forest`_ | MinHash, Weighted MinHash | Jaccard Top-K |
87
104
  +---------------------------+-----------------------------+------------------------+
88
105
  | `MinHash LSH Ensemble`_ | MinHash | Containment Threshold |
@@ -90,7 +107,7 @@ sub-linear query time:
90
107
  | `HNSW`_ | Any | Custom Metric Top-K |
91
108
  +---------------------------+-----------------------------+------------------------+
92
109
 
93
- datasketch must be used with Python 3.7 or above, NumPy 1.11 or above, and Scipy.
110
+ datasketch must be used with Python 3.8 or above, NumPy 1.11 or above, and Scipy.
94
111
 
95
112
  Note that `MinHash LSH`_ and `MinHash LSH Ensemble`_ also support Redis and Cassandra
96
113
  storage layer (see `MinHash LSH at Scale`_).
@@ -118,6 +135,11 @@ To install with Cassandra dependency:
118
135
 
119
136
  pip install datasketch[cassandra]
120
137
 
138
+ To install with Bloom filter dependency:
139
+
140
+ ::
141
+
142
+ pip install datasketch[bloom]
121
143
 
122
144
  .. _`MinHash`: https://ekzhu.github.io/datasketch/minhash.html
123
145
  .. _`Weighted MinHash`: https://ekzhu.github.io/datasketch/weightedminhash.html
@@ -126,5 +148,6 @@ To install with Cassandra dependency:
126
148
  .. _`MinHash LSH`: https://ekzhu.github.io/datasketch/lsh.html
127
149
  .. _`MinHash LSH Forest`: https://ekzhu.github.io/datasketch/lshforest.html
128
150
  .. _`MinHash LSH Ensemble`: https://ekzhu.github.io/datasketch/lshensemble.html
151
+ .. _`LSHBloom`: https://ekzhu.github.io/datasketch/lshbloom.html
129
152
  .. _`Minhash LSH at Scale`: http://ekzhu.github.io/datasketch/lsh.html#minhash-lsh-at-scale
130
153
  .. _`HNSW`: https://ekzhu.github.io/datasketch/documentation.html#hnsw
@@ -33,6 +33,8 @@ sub-linear query time:
33
33
  +===========================+=============================+========================+
34
34
  | `MinHash LSH`_ | MinHash, Weighted MinHash | Jaccard Threshold |
35
35
  +---------------------------+-----------------------------+------------------------+
36
+ | `LSHBloom`_ | MinHash, Weighted MinHash | Jaccard Threshold |
37
+ +---------------------------+-----------------------------+------------------------+
36
38
  | `MinHash LSH Forest`_ | MinHash, Weighted MinHash | Jaccard Top-K |
37
39
  +---------------------------+-----------------------------+------------------------+
38
40
  | `MinHash LSH Ensemble`_ | MinHash | Containment Threshold |
@@ -40,7 +42,7 @@ sub-linear query time:
40
42
  | `HNSW`_ | Any | Custom Metric Top-K |
41
43
  +---------------------------+-----------------------------+------------------------+
42
44
 
43
- datasketch must be used with Python 3.7 or above, NumPy 1.11 or above, and Scipy.
45
+ datasketch must be used with Python 3.8 or above, NumPy 1.11 or above, and Scipy.
44
46
 
45
47
  Note that `MinHash LSH`_ and `MinHash LSH Ensemble`_ also support Redis and Cassandra
46
48
  storage layer (see `MinHash LSH at Scale`_).
@@ -68,6 +70,11 @@ To install with Cassandra dependency:
68
70
 
69
71
  pip install datasketch[cassandra]
70
72
 
73
+ To install with Bloom filter dependency:
74
+
75
+ ::
76
+
77
+ pip install datasketch[bloom]
71
78
 
72
79
  .. _`MinHash`: https://ekzhu.github.io/datasketch/minhash.html
73
80
  .. _`Weighted MinHash`: https://ekzhu.github.io/datasketch/weightedminhash.html
@@ -76,5 +83,6 @@ To install with Cassandra dependency:
76
83
  .. _`MinHash LSH`: https://ekzhu.github.io/datasketch/lsh.html
77
84
  .. _`MinHash LSH Forest`: https://ekzhu.github.io/datasketch/lshforest.html
78
85
  .. _`MinHash LSH Ensemble`: https://ekzhu.github.io/datasketch/lshensemble.html
86
+ .. _`LSHBloom`: https://ekzhu.github.io/datasketch/lshbloom.html
79
87
  .. _`Minhash LSH at Scale`: http://ekzhu.github.io/datasketch/lsh.html#minhash-lsh-at-scale
80
88
  .. _`HNSW`: https://ekzhu.github.io/datasketch/documentation.html#hnsw
@@ -2,6 +2,7 @@ from datasketch.hyperloglog import HyperLogLog, HyperLogLogPlusPlus
2
2
  from datasketch.minhash import MinHash
3
3
  from datasketch.b_bit_minhash import bBitMinHash
4
4
  from datasketch.lsh import MinHashLSH
5
+ from datasketch.lsh_bloom import MinHashLSHBloom
5
6
  from datasketch.weighted_minhash import WeightedMinHash, WeightedMinHashGenerator
6
7
  from datasketch.lshforest import MinHashLSHForest
7
8
  from datasketch.lshensemble import MinHashLSHEnsemble
@@ -92,7 +92,14 @@ class bBitMinHash(object):
92
92
  hvs = self.hashvalues[start:start+n]
93
93
  # Store the n b-bit hashed values in the current block
94
94
  for j, hv in enumerate(hvs):
95
- blocks[i] |= np.uint64(hv << (n - 1 - j) * slot_size)
95
+ # We do this in BigInteger rather than np.uint64 because of inconsistencies
96
+ # in NumPy type coercion rules between NumPy 1.x and NumPy 2.x environments.
97
+ # In NumPy 2.x, implicit type conversion during bitwise operations is not
98
+ # performed which can cause integer overflows. This, in turn can corrupt
99
+ # hashvalues and cause pickled bBitMinHash objects to have the wrong representation.
100
+ # Doing this in BigInteger guarantees we do not experience overflow and still
101
+ # coerces to np.uint64 as expected.
102
+ blocks[i] = int(blocks[i]) | (int(hv) << (n - 1 - j) * slot_size)
96
103
  fmt = self._serial_fmt_params + \
97
104
  "%d%s" % (num_blocks, self._serial_fmt_block)
98
105
  struct.pack_into(fmt, buffer, 0, self.seed, self.b, self.r, \
@@ -0,0 +1,335 @@
1
+ from __future__ import annotations
2
+ from typing import Callable, List, Optional, Tuple
3
+ from datasketch.minhash import MinHash
4
+ from scipy.integrate import quad as integrate
5
+ import numpy as np
6
+ import warnings
7
+ import os
8
+
9
+ try:
10
+ import pybloomfilter
11
+ except ImportError:
12
+ pybloomfilter = None
13
+
14
+ _mersenne_prime = np.uint64((1 << 61) - 1)
15
+
16
+ def _false_positive_probability(threshold, b, r):
17
+ _probability = lambda s: 1 - (1 - s ** float(r)) ** float(b)
18
+ a, err = integrate(_probability, 0.0, threshold)
19
+ return a
20
+
21
+
22
+ def _false_negative_probability(threshold, b, r):
23
+ _probability = lambda s: 1 - (1 - (1 - s ** float(r)) ** float(b))
24
+ a, err = integrate(_probability, threshold, 1.0)
25
+ return a
26
+
27
+
28
+ def _optimal_param(threshold, num_perm, false_positive_weight, false_negative_weight):
29
+ """
30
+ Compute the optimal `MinHashLSH` parameter that minimizes the weighted sum
31
+ of probabilities of false positive and false negative.
32
+ """
33
+ min_error = float("inf")
34
+ opt = (0, 0)
35
+ for b in range(1, num_perm + 1):
36
+ max_r = int(num_perm / b)
37
+ for r in range(1, max_r + 1):
38
+ fp = _false_positive_probability(threshold, b, r)
39
+ fn = _false_negative_probability(threshold, b, r)
40
+ error = fp * false_positive_weight + fn * false_negative_weight
41
+ if error < min_error:
42
+ min_error = error
43
+ opt = (b, r)
44
+ return opt
45
+
46
+ if pybloomfilter is not None:
47
+ class BloomTable:
48
+ """
49
+ Interface to a Bloom Filter meant to model a single band of the MinHash signature matrix
50
+
51
+ Args:
52
+ item_count (int): Number of items expected to be inserted (size of dataset). Used to create Bloom filter.
53
+ fp (float): False positive rate for Bloom filter in (0,1).
54
+ band_size (int): Size of band from MinHash signature matrix this filter is meant to model.
55
+ fname (str): File path where Bloom filter will be saved. If this file already exists, will initialize the Bloom filter from this path.
56
+ max_size (int): Maximum number of elements we should plan to insert into this Bloom filter. Upper bounds the size of the Bloom filter.
57
+ """
58
+ def __init__(self, item_count: int, fp: float, band_size: int, fname: str = None):
59
+ self.r = band_size
60
+ self.fname = fname
61
+ if fname is not None and os.path.exists(fname):
62
+ print(f"Loading Bloom Filter at {fname}...")
63
+ self.bloom_filter = pybloomfilter.BloomFilter.open(fname)
64
+ else:
65
+ self.bloom_filter = pybloomfilter.BloomFilter(
66
+ capacity=item_count,
67
+ error_rate=fp,
68
+ filename=self.fname
69
+ )
70
+
71
+ def sync(self):
72
+ if self.fname is not None:
73
+ self.bloom_filter.sync()
74
+ else:
75
+ warnings.warn("Attempting to save in-memory Bloom filter, this is a no-op.", RuntimeWarning)
76
+
77
+ def assert_size(self, hashvalues: List[int]):
78
+ if not len(hashvalues) == self.r:
79
+ raise RuntimeError(f"Invalid length for indices, {len(hashvalues)}, expected {self.r} hashvalues in band")
80
+
81
+ def insert(self, hashvalues: List[int]) -> None:
82
+ """
83
+ Takes as input the indices for a single band and inserts them into the corresponding bit arrays
84
+
85
+ Args:
86
+ hashvalues (List[int]): The hashvalues from a single band of a MinHash object.
87
+ """
88
+ self.assert_size(hashvalues)
89
+ # https://en.wikipedia.org/wiki/Universal_hashing#Hashing_vectors
90
+ # as the hashvalues are the result of a universal hashing function, their sum is also a universal hash function
91
+ x = sum(hashvalues) % _mersenne_prime
92
+ self.bloom_filter.add(x)
93
+
94
+ def query(self, hashvalues: List[int]) -> bool:
95
+ """
96
+ Takes as input the indices for a single band and queries them against the corresponding arrays
97
+ returns True if each query returns True, otherwise returns False
98
+
99
+ Args:
100
+ hashvalues (List[int]): The hashvalues from a single band of a MinHash object.
101
+ """
102
+ self.assert_size(hashvalues)
103
+ x = sum(hashvalues) % _mersenne_prime
104
+ return x in self.bloom_filter
105
+ else:
106
+ class BloomTable:
107
+ def __init__(self, item_count: int, fp: float, band_size: int, fname: str = None):
108
+ raise ImportError("Required dependency pybloomfilter is missing, did you `pip install datasketch[bloom]`?")
109
+
110
+ class MinHashLSHBloom(object):
111
+ """
112
+ The :ref:`lsh_bloom` index.
113
+ It supports query with `Jaccard similarity`_ threshold.
114
+ Reference: `LSHBloom paper
115
+ <https://arxiv.org/abs/2411.04257>`_.
116
+
117
+ Args:
118
+ threshold (float): The Jaccard similarity threshold between 0.0 and
119
+ 1.0. The initialized LSH index will be optimized for the threshold by
120
+ minimizing the false positive and false negative.
121
+ num_perm (int): The number of permutation functions used
122
+ by the MinHash to be indexed. For weighted MinHash, this
123
+ is the sample size (`sample_size`).
124
+ n (int): The number of elements to be inserted (estimate of dataset size).
125
+ fp (float): The false positive rate for each Bloom filter. Must be in (0,1).
126
+ save_dir (str): The directory to save the Bloom filter index to. If Bloom filters
127
+ already exist in this directory, the index will be loaded from here. If None,
128
+ an in-memory index will be created - this index can not be persisted.
129
+ weights (Tuple[float, float]): Used to adjust the relative importance of
130
+ minimizing false positive and false negative when optimizing
131
+ for the Jaccard similarity threshold.
132
+ `weights` is a tuple in the format of
133
+ :code:`(false_positive_weight, false_negative_weight)`.
134
+ params (Optional[Tuple[int, int]]): The LSH parameters (i.e., number of bands and size
135
+ of each band). This is used to bypass the parameter optimization
136
+ step in the constructor. `threshold` and `weights` will be ignored
137
+ if this is given.
138
+
139
+ Note:
140
+ This algorithm is a space optimized version of MinHashLSH.
141
+ For more details on :ref:`minhash_lsh`, see the documentation.
142
+
143
+ This algorithm uses Bloom filters to drastically reduce the space
144
+ that the LSH index occupies on disk. However, it loses the ability
145
+ to retrieve candidate duplicate keys. Rather, it can only tell you
146
+ whether a query set is a duplicate of a set that was inserted previously.
147
+ This enables scaling to datasets of many hundreds of millions or billions
148
+ of documents, but may not be appropriate for all use cases.
149
+
150
+ Examples:
151
+
152
+ Create an index with 128 permutation functions optimized for Jaccard
153
+ threshold 0.9:
154
+
155
+ .. code-block:: python
156
+
157
+ from datasketch import MinHash, MinHashLSHBloom
158
+
159
+ set1 = set(['minhash', 'is', 'a', 'probabilistic', 'data', 'structure', 'for',
160
+ 'estimating', 'the', 'similarity', 'between', 'datasets'])
161
+ set2 = set(['minhash', 'is', 'a', 'probability', 'data', 'structure', 'for',
162
+ 'estimating', 'the', 'similarity', 'between', 'documents'])
163
+ set3 = set(['minhash', 'is', 'probability', 'data', 'structure', 'for',
164
+ 'estimating', 'the', 'similarity', 'between', 'documents'])
165
+
166
+ m1 = MinHash(num_perm=128)
167
+ m2 = MinHash(num_perm=128)
168
+ m3 = MinHash(num_perm=128)
169
+ for d in set1:
170
+ m1.update(d.encode('utf8'))
171
+ for d in set2:
172
+ m2.update(d.encode('utf8'))
173
+ for d in set3:
174
+ m3.update(d.encode('utf8'))
175
+
176
+ # Create LSHBloom index
177
+ lsh = MinHashLSHBloom(threshold=0.5, num_perm=128, n=100, fp=0.0001, save_dir="./index/")
178
+ lsh.insert(m2)
179
+ lsh.insert(m3)
180
+
181
+ # Query whether m1 is a duplicate according to the given threshold
182
+ is_duplicate = lsh.query(m1)
183
+ """
184
+
185
+ def __init__(
186
+ self,
187
+ threshold: float = 0.9,
188
+ num_perm: int = 128,
189
+ n: int = None,
190
+ fp: float = None,
191
+ save_dir: str = None,
192
+ weights: Tuple[float, float] = (0.5, 0.5),
193
+ params: Optional[Tuple[int, int]] = None,
194
+ ) -> None:
195
+ if threshold > 1.0 or threshold < 0.0:
196
+ raise ValueError("threshold must be in [0.0, 1.0]")
197
+ if num_perm < 2:
198
+ raise ValueError("Too few permutation functions")
199
+ if n <= 0:
200
+ raise ValueError("n for LSHBloom must be >= 0")
201
+ if fp >= 1.0 or fp <= 0.0:
202
+ raise ValueError("fp must be in (0.0, 1.0)")
203
+ if save_dir is None:
204
+ warnings.warn("Creating LSHBloom index without save directory, this index will not be persisted.", RuntimeWarning)
205
+ if any(w < 0.0 or w > 1.0 for w in weights):
206
+ raise ValueError("Weight must be in [0.0, 1.0]")
207
+ if sum(weights) != 1.0:
208
+ raise ValueError("Weights must sum to 1.0")
209
+ self.h = num_perm
210
+ if params is not None:
211
+ self.b, self.r = params
212
+ if self.b * self.r > num_perm:
213
+ raise ValueError(
214
+ "The product of b and r in params is "
215
+ "{} * {} = {} -- it must be less than num_perm {}. "
216
+ "Did you forget to specify num_perm?".format(
217
+ self.b, self.r, self.b * self.r, num_perm
218
+ )
219
+ )
220
+ else:
221
+ false_positive_weight, false_negative_weight = weights
222
+ self.b, self.r = _optimal_param(
223
+ threshold, num_perm, false_positive_weight, false_negative_weight
224
+ )
225
+ if self.b < 2:
226
+ raise ValueError("The number of bands are too small (b < 2)")
227
+
228
+ # create a Bloom filter for each band in the signature matrix
229
+ if save_dir is not None:
230
+ os.makedirs(save_dir, exist_ok=True)
231
+ self.hashtables = [
232
+ BloomTable(
233
+ item_count=n,
234
+ fp=fp, band_size=self.r,
235
+ fname=os.path.join(save_dir, f"band-{i}.bf") if save_dir is not None else None,
236
+ )
237
+ for i in range(self.b)
238
+ ]
239
+ self.hashranges = [(i * self.r, (i + 1) * self.r) for i in range(self.b)]
240
+
241
+ def insert(
242
+ self,
243
+ minhash: MinHash
244
+ ):
245
+ """
246
+ Insert the MinHash or Weighted MinHash
247
+ of a set to the index.
248
+
249
+ Args:
250
+ minhash (Union[MinHash, WeightedMinHash]): The MinHash of the set.
251
+
252
+ """
253
+ self._insert(minhash)
254
+
255
+ def _insert(
256
+ self,
257
+ minhash: MinHash
258
+ ):
259
+ if len(minhash) != self.h:
260
+ raise ValueError(
261
+ "Expecting minhash with length %d, got %d" % (self.h, len(minhash))
262
+ )
263
+
264
+ Hs = [minhash.hashvalues[start:end] for start, end in self.hashranges]
265
+
266
+ for H, hashtable in zip(Hs, self.hashtables):
267
+ hashtable.insert(H)
268
+
269
+ def query(self, minhash) -> bool:
270
+ """
271
+ Given the MinHash of the query set, determine
272
+ whether any previously inserted sets have
273
+ Jaccard similarity with the query that is
274
+ likely greater than the threshold.
275
+
276
+ Results are based on minhash segment collision
277
+ and are thus approximate.
278
+
279
+ Args:
280
+ minhash (MinHash): The MinHash of the query set.
281
+
282
+ Returns:
283
+ bool: Whether the item is a duplicate or not, based on the given threshold.
284
+
285
+ Example:
286
+
287
+ .. code-block:: python
288
+
289
+ from datasketch import MinHash, MinHashLSHBloom
290
+ import numpy as np
291
+
292
+ # Generate 100 random MinHashes.
293
+ minhashes = MinHash.bulk(
294
+ np.random.randint(low=0, high=30, size=(100, 10)),
295
+ num_perm=128
296
+ )
297
+
298
+ # Create LSHBloom index.
299
+ lsh = MinHashLSHBloom(threshold=0.5, num_perm=128, n=100, fp=0.0001, save_dir="./index/")
300
+ for m in minhashes:
301
+ lsh.insert(m)
302
+
303
+ # Get the duplication result from LSHBloom.
304
+ query = minhashes[0]
305
+ is_duplicate = lsh.query(query)
306
+ print(is_duplicate)
307
+
308
+ Output:
309
+
310
+ .. code-block::
311
+
312
+ True
313
+
314
+ Note that although the threshold is set to 0.5, the results are not
315
+ guaranteed to be above 0.5 because the LSHBloom index is approximate and
316
+ the Jaccard similarity is estimated by MinHash.
317
+
318
+ """
319
+ if len(minhash) != self.h:
320
+ raise ValueError(
321
+ "Expecting minhash with length %d, got %d" % (self.h, len(minhash))
322
+ )
323
+
324
+ # if we match in any band, this is a candidate pair
325
+ for (start, end), hashtable in zip(self.hashranges, self.hashtables):
326
+ H = minhash.hashvalues[start:end]
327
+ collision = hashtable.query(H)
328
+ if collision:
329
+ return True
330
+ return False
331
+
332
+ def sync(self):
333
+ print("Saving Bloom Index...")
334
+ for table in self.hashtables:
335
+ table.sync()
@@ -0,0 +1 @@
1
+ __version__ = "1.7.0"
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.4
2
2
  Name: datasketch
3
- Version: 1.6.5
3
+ Version: 1.7.0
4
4
  Summary: Probabilistic data structures for processing and searching very large datasets
5
5
  Home-page: https://ekzhu.github.io/datasketch
6
6
  Author: ekzhu
@@ -13,11 +13,11 @@ Classifier: Intended Audience :: Developers
13
13
  Classifier: Topic :: Database
14
14
  Classifier: Topic :: Scientific/Engineering :: Information Analysis
15
15
  Classifier: License :: OSI Approved :: MIT License
16
- Classifier: Programming Language :: Python :: 3.7
17
16
  Classifier: Programming Language :: Python :: 3.8
18
17
  Classifier: Programming Language :: Python :: 3.9
19
18
  Classifier: Programming Language :: Python :: 3.10
20
19
  Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
21
  License-File: LICENSE
22
22
  Requires-Dist: numpy>=1.11
23
23
  Requires-Dist: scipy>=1.0.0
@@ -25,6 +25,9 @@ Provides-Extra: cassandra
25
25
  Requires-Dist: cassandra-driver>=3.20; extra == "cassandra"
26
26
  Provides-Extra: redis
27
27
  Requires-Dist: redis>=2.10.0; extra == "redis"
28
+ Provides-Extra: bloom
29
+ Requires-Dist: pybloomfilter3>=0.7.2; python_version >= "3.9" and extra == "bloom"
30
+ Requires-Dist: pybloomfiltermmap3==0.6.0; python_version < "3.9" and extra == "bloom"
28
31
  Provides-Extra: benchmark
29
32
  Requires-Dist: pyhash>=0.9.3; extra == "benchmark"
30
33
  Requires-Dist: matplotlib>=3.1.2; extra == "benchmark"
@@ -45,8 +48,20 @@ Requires-Dist: nose>=1.3.7; extra == "test"
45
48
  Requires-Dist: nose-exclude>=0.5.0; extra == "test"
46
49
  Requires-Dist: pytest; extra == "test"
47
50
  Provides-Extra: experimental-aio
48
- Requires-Dist: aiounittest; python_version >= "3.6" and extra == "experimental-aio"
49
- Requires-Dist: motor; python_version >= "3.6" and extra == "experimental-aio"
51
+ Requires-Dist: aiounittest; python_version >= "3.8" and extra == "experimental-aio"
52
+ Requires-Dist: motor>3.6.0; python_version >= "3.8" and extra == "experimental-aio"
53
+ Dynamic: author
54
+ Dynamic: author-email
55
+ Dynamic: classifier
56
+ Dynamic: description
57
+ Dynamic: home-page
58
+ Dynamic: keywords
59
+ Dynamic: license
60
+ Dynamic: license-file
61
+ Dynamic: project-url
62
+ Dynamic: provides-extra
63
+ Dynamic: requires-dist
64
+ Dynamic: summary
50
65
 
51
66
  datasketch: Big Data Looks Small
52
67
  ================================
@@ -83,6 +98,8 @@ sub-linear query time:
83
98
  +===========================+=============================+========================+
84
99
  | `MinHash LSH`_ | MinHash, Weighted MinHash | Jaccard Threshold |
85
100
  +---------------------------+-----------------------------+------------------------+
101
+ | `LSHBloom`_ | MinHash, Weighted MinHash | Jaccard Threshold |
102
+ +---------------------------+-----------------------------+------------------------+
86
103
  | `MinHash LSH Forest`_ | MinHash, Weighted MinHash | Jaccard Top-K |
87
104
  +---------------------------+-----------------------------+------------------------+
88
105
  | `MinHash LSH Ensemble`_ | MinHash | Containment Threshold |
@@ -90,7 +107,7 @@ sub-linear query time:
90
107
  | `HNSW`_ | Any | Custom Metric Top-K |
91
108
  +---------------------------+-----------------------------+------------------------+
92
109
 
93
- datasketch must be used with Python 3.7 or above, NumPy 1.11 or above, and Scipy.
110
+ datasketch must be used with Python 3.8 or above, NumPy 1.11 or above, and Scipy.
94
111
 
95
112
  Note that `MinHash LSH`_ and `MinHash LSH Ensemble`_ also support Redis and Cassandra
96
113
  storage layer (see `MinHash LSH at Scale`_).
@@ -118,6 +135,11 @@ To install with Cassandra dependency:
118
135
 
119
136
  pip install datasketch[cassandra]
120
137
 
138
+ To install with Bloom filter dependency:
139
+
140
+ ::
141
+
142
+ pip install datasketch[bloom]
121
143
 
122
144
  .. _`MinHash`: https://ekzhu.github.io/datasketch/minhash.html
123
145
  .. _`Weighted MinHash`: https://ekzhu.github.io/datasketch/weightedminhash.html
@@ -126,5 +148,6 @@ To install with Cassandra dependency:
126
148
  .. _`MinHash LSH`: https://ekzhu.github.io/datasketch/lsh.html
127
149
  .. _`MinHash LSH Forest`: https://ekzhu.github.io/datasketch/lshforest.html
128
150
  .. _`MinHash LSH Ensemble`: https://ekzhu.github.io/datasketch/lshensemble.html
151
+ .. _`LSHBloom`: https://ekzhu.github.io/datasketch/lshbloom.html
129
152
  .. _`Minhash LSH at Scale`: http://ekzhu.github.io/datasketch/lsh.html#minhash-lsh-at-scale
130
153
  .. _`HNSW`: https://ekzhu.github.io/datasketch/documentation.html#hnsw
@@ -9,6 +9,7 @@ datasketch/hyperloglog.py
9
9
  datasketch/hyperloglog_const.py
10
10
  datasketch/lean_minhash.py
11
11
  datasketch/lsh.py
12
+ datasketch/lsh_bloom.py
12
13
  datasketch/lshensemble.py
13
14
  datasketch/lshensemble_partition.py
14
15
  datasketch/lshforest.py
@@ -30,6 +31,7 @@ test/test_hyperloglog.py
30
31
  test/test_lean_minhash.py
31
32
  test/test_lsh.py
32
33
  test/test_lsh_cassandra.py
34
+ test/test_lshbloom.py
33
35
  test/test_lshensemble.py
34
36
  test/test_lshforest.py
35
37
  test/test_minhash.py
@@ -11,14 +11,22 @@ SetSimilaritySearch>=0.1.7
11
11
  pyfarmhash>=0.2.2
12
12
  nltk>=3.4.5
13
13
 
14
+ [bloom]
15
+
16
+ [bloom:python_version < "3.9"]
17
+ pybloomfiltermmap3==0.6.0
18
+
19
+ [bloom:python_version >= "3.9"]
20
+ pybloomfilter3>=0.7.2
21
+
14
22
  [cassandra]
15
23
  cassandra-driver>=3.20
16
24
 
17
25
  [experimental_aio]
18
26
 
19
- [experimental_aio:python_version >= "3.6"]
27
+ [experimental_aio:python_version >= "3.8"]
20
28
  aiounittest
21
- motor
29
+ motor>3.6.0
22
30
 
23
31
  [redis]
24
32
  redis>=2.10.0
@@ -39,11 +39,11 @@ setup(
39
39
  'Topic :: Database',
40
40
  'Topic :: Scientific/Engineering :: Information Analysis',
41
41
  'License :: OSI Approved :: MIT License',
42
- 'Programming Language :: Python :: 3.7',
43
42
  'Programming Language :: Python :: 3.8',
44
43
  'Programming Language :: Python :: 3.9',
45
44
  'Programming Language :: Python :: 3.10',
46
45
  'Programming Language :: Python :: 3.11',
46
+ 'Programming Language :: Python :: 3.12',
47
47
  ],
48
48
  keywords='database datamining',
49
49
  packages=find_packages(include=['datasketch*']),
@@ -58,6 +58,10 @@ setup(
58
58
  'redis': [
59
59
  'redis>=2.10.0',
60
60
  ],
61
+ 'bloom': [
62
+ 'pybloomfilter3>=0.7.2 ; python_version>="3.9"',
63
+ 'pybloomfiltermmap3==0.6.0 ; python_version<"3.9"',
64
+ ],
61
65
  'benchmark': [
62
66
  'pyhash>=0.9.3',
63
67
  'matplotlib>=3.1.2',
@@ -80,8 +84,8 @@ setup(
80
84
  'pytest',
81
85
  ],
82
86
  'experimental_aio': [
83
- "aiounittest ; python_version>='3.6'",
84
- "motor ; python_version>='3.6'",
87
+ "aiounittest ; python_version>='3.8'",
88
+ "motor>3.6.0 ; python_version>='3.8'",
85
89
  ],
86
90
  },
87
91
  )
@@ -0,0 +1,126 @@
1
+ import unittest
2
+ import pickle
3
+ from glob import glob
4
+ from datasketch.lsh_bloom import BloomTable, MinHashLSHBloom
5
+ from datasketch.minhash import MinHash
6
+ import numpy as np
7
+ import os
8
+
9
+ class TestBloomTable(unittest.TestCase):
10
+ def test_insert(self):
11
+ r = 3
12
+ x = np.array([2,3,31], dtype=np.uint32)
13
+ b = BloomTable(10, 0.01, band_size=r)
14
+ b.insert(x)
15
+ self.assertRaises(RuntimeError, b.insert, np.array([2,2], dtype=np.uint32))
16
+
17
+ def test_query(self):
18
+ r = 3
19
+ x = np.array([2,3,31], dtype=np.uint32)
20
+ b = BloomTable(10, 0.01, band_size=r)
21
+ b.insert(x)
22
+ self.assertTrue(b.query(x))
23
+ self.assertFalse(b.query(np.array([2,3,30], dtype=np.uint32)))
24
+ self.assertRaises(RuntimeError, b.query, [2,2])
25
+
26
+ def test_save(self):
27
+ fname = "/tmp/bloomfilter.bf"
28
+ if os.path.exists(fname):
29
+ os.remove(fname)
30
+ r = 3
31
+ x = np.array([2,3,31], dtype=np.uint32)
32
+ y = np.array([12,10,29], dtype=np.uint32)
33
+ z = np.array([27,30,8], dtype=np.uint32)
34
+ items = [x,y,z]
35
+ b = BloomTable(10, 0.01, band_size=r, fname=fname)
36
+ for item in items:
37
+ b.insert(item)
38
+ for item in items:
39
+ self.assertTrue(b.query(item))
40
+ b.sync()
41
+
42
+ del b
43
+
44
+ b_ = BloomTable(10, 0.01, band_size=r, fname=fname)
45
+ for item in items:
46
+ self.assertTrue(b_.query(item))
47
+
48
+
49
+ class TestMinHashLSHBloom(unittest.TestCase):
50
+
51
+ def test_init(self):
52
+ lsh = MinHashLSHBloom(threshold=0.8, n=10, fp=0.01)
53
+ b1, r1 = lsh.b, lsh.r
54
+ lsh = MinHashLSHBloom(threshold=0.8, weights=(0.2,0.8), n=10, fp=0.01)
55
+ b2, r2 = lsh.b, lsh.r
56
+ self.assertTrue(b1 < b2)
57
+ self.assertTrue(r1 > r2)
58
+ self.assertTrue(len(lsh.hashtables) == lsh.b)
59
+
60
+
61
+ def test_insert(self):
62
+ lsh = MinHashLSHBloom(threshold=0.5, num_perm=16, n=10, fp=0.01)
63
+ m1 = MinHash(16)
64
+ m1.update("a".encode("utf8"))
65
+ m2 = MinHash(16)
66
+ m2.update("b".encode("utf8"))
67
+ lsh.insert(m1)
68
+ lsh.insert(m2)
69
+
70
+ m3 = MinHash(18)
71
+ self.assertRaises(ValueError, lsh.insert, m3)
72
+
73
+ def test_query(self):
74
+ lsh = MinHashLSHBloom(threshold=0.5, num_perm=16, n=10, fp=0.01)
75
+ m1 = MinHash(16)
76
+ m1.update("a".encode("utf8"))
77
+ m2 = MinHash(16)
78
+ m2.update("b".encode("utf8"))
79
+ lsh.insert(m1)
80
+ lsh.insert(m2)
81
+ result = lsh.query(m1)
82
+ self.assertTrue(result)
83
+ result = lsh.query(m2)
84
+ self.assertTrue(result)
85
+
86
+ m3 = MinHash(18)
87
+ self.assertRaises(ValueError, lsh.query, m3)
88
+
89
+ def test_save(self):
90
+ save_path = "./test_save/"
91
+ for item in glob(f"{save_path}/*.bf"):
92
+ os.remove(item)
93
+
94
+ lsh = MinHashLSHBloom(threshold=0.5, num_perm=16, n=10, fp=0.01, save_dir=save_path)
95
+ m1 = MinHash(16)
96
+ m1.update("a".encode("utf8"))
97
+ m2 = MinHash(16)
98
+ m2.update("b".encode("utf8"))
99
+ lsh.insert(m1)
100
+ lsh.insert(m2)
101
+ lsh.sync()
102
+
103
+ lsh2 = MinHashLSHBloom(threshold=0.5, num_perm=16, n=10, fp=0.01, save_dir=save_path)
104
+ result = lsh2.query(m1)
105
+ self.assertTrue(result)
106
+ result = lsh2.query(m2)
107
+ self.assertTrue(result)
108
+
109
+ def test_save_in_memory(self):
110
+
111
+ with self.assertWarns(RuntimeWarning):
112
+ lsh = MinHashLSHBloom(threshold=0.5, num_perm=16, n=10, fp=0.01, save_dir=None)
113
+
114
+ m1 = MinHash(16)
115
+ m1.update("a".encode("utf8"))
116
+ m2 = MinHash(16)
117
+ m2.update("b".encode("utf8"))
118
+ lsh.insert(m1)
119
+ lsh.insert(m2)
120
+
121
+ with self.assertWarns(RuntimeWarning):
122
+ lsh.sync()
123
+
124
+
125
+ if __name__ == "__main__":
126
+ unittest.main()
@@ -1 +0,0 @@
1
- __version__ = "1.6.5"
File without changes
File without changes
File without changes
File without changes
File without changes