datasketch 1.6.4__tar.gz → 1.7.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datasketch-1.6.4 → datasketch-1.7.0}/PKG-INFO +33 -10
- {datasketch-1.6.4 → datasketch-1.7.0}/README.rst +13 -5
- {datasketch-1.6.4 → datasketch-1.7.0}/datasketch/__init__.py +1 -0
- {datasketch-1.6.4 → datasketch-1.7.0}/datasketch/b_bit_minhash.py +8 -1
- {datasketch-1.6.4 → datasketch-1.7.0}/datasketch/lsh.py +55 -0
- datasketch-1.7.0/datasketch/lsh_bloom.py +335 -0
- {datasketch-1.6.4 → datasketch-1.7.0}/datasketch/lshforest.py +25 -0
- datasketch-1.7.0/datasketch/version.py +1 -0
- {datasketch-1.6.4 → datasketch-1.7.0}/datasketch.egg-info/PKG-INFO +33 -10
- {datasketch-1.6.4 → datasketch-1.7.0}/datasketch.egg-info/SOURCES.txt +2 -0
- {datasketch-1.6.4 → datasketch-1.7.0}/datasketch.egg-info/requires.txt +10 -2
- {datasketch-1.6.4 → datasketch-1.7.0}/setup.py +7 -3
- {datasketch-1.6.4 → datasketch-1.7.0}/test/test_lsh.py +111 -0
- datasketch-1.7.0/test/test_lshbloom.py +126 -0
- {datasketch-1.6.4 → datasketch-1.7.0}/test/test_lshforest.py +12 -0
- datasketch-1.6.4/datasketch/version.py +0 -1
- {datasketch-1.6.4 → datasketch-1.7.0}/LICENSE +0 -0
- {datasketch-1.6.4 → datasketch-1.7.0}/datasketch/experimental/__init__.py +0 -0
- {datasketch-1.6.4 → datasketch-1.7.0}/datasketch/experimental/aio/__init__.py +0 -0
- {datasketch-1.6.4 → datasketch-1.7.0}/datasketch/experimental/aio/lsh.py +0 -0
- {datasketch-1.6.4 → datasketch-1.7.0}/datasketch/experimental/aio/storage.py +0 -0
- {datasketch-1.6.4 → datasketch-1.7.0}/datasketch/hashfunc.py +0 -0
- {datasketch-1.6.4 → datasketch-1.7.0}/datasketch/hnsw.py +0 -0
- {datasketch-1.6.4 → datasketch-1.7.0}/datasketch/hyperloglog.py +0 -0
- {datasketch-1.6.4 → datasketch-1.7.0}/datasketch/hyperloglog_const.py +0 -0
- {datasketch-1.6.4 → datasketch-1.7.0}/datasketch/lean_minhash.py +0 -0
- {datasketch-1.6.4 → datasketch-1.7.0}/datasketch/lshensemble.py +0 -0
- {datasketch-1.6.4 → datasketch-1.7.0}/datasketch/lshensemble_partition.py +0 -0
- {datasketch-1.6.4 → datasketch-1.7.0}/datasketch/minhash.py +0 -0
- {datasketch-1.6.4 → datasketch-1.7.0}/datasketch/storage.py +0 -0
- {datasketch-1.6.4 → datasketch-1.7.0}/datasketch/weighted_minhash.py +0 -0
- {datasketch-1.6.4 → datasketch-1.7.0}/datasketch.egg-info/dependency_links.txt +0 -0
- {datasketch-1.6.4 → datasketch-1.7.0}/datasketch.egg-info/top_level.txt +0 -0
- {datasketch-1.6.4 → datasketch-1.7.0}/setup.cfg +0 -0
- {datasketch-1.6.4 → datasketch-1.7.0}/test/test_hnsw.py +0 -0
- {datasketch-1.6.4 → datasketch-1.7.0}/test/test_hyperloglog.py +0 -0
- {datasketch-1.6.4 → datasketch-1.7.0}/test/test_lean_minhash.py +0 -0
- {datasketch-1.6.4 → datasketch-1.7.0}/test/test_lsh_cassandra.py +0 -0
- {datasketch-1.6.4 → datasketch-1.7.0}/test/test_lshensemble.py +0 -0
- {datasketch-1.6.4 → datasketch-1.7.0}/test/test_minhash.py +0 -0
- {datasketch-1.6.4 → datasketch-1.7.0}/test/test_weighted_minhash.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: datasketch
|
|
3
|
-
Version: 1.6.4
|
|
3
|
+
Version: 1.7.0
|
|
4
4
|
Summary: Probabilistic data structures for processing and searching very large datasets
|
|
5
5
|
Home-page: https://ekzhu.github.io/datasketch
|
|
6
6
|
Author: ekzhu
|
|
@@ -13,11 +13,11 @@ Classifier: Intended Audience :: Developers
|
|
|
13
13
|
Classifier: Topic :: Database
|
|
14
14
|
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
15
15
|
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
-
Classifier: Programming Language :: Python :: 3.7
|
|
17
16
|
Classifier: Programming Language :: Python :: 3.8
|
|
18
17
|
Classifier: Programming Language :: Python :: 3.9
|
|
19
18
|
Classifier: Programming Language :: Python :: 3.10
|
|
20
19
|
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
21
|
License-File: LICENSE
|
|
22
22
|
Requires-Dist: numpy>=1.11
|
|
23
23
|
Requires-Dist: scipy>=1.0.0
|
|
@@ -25,6 +25,9 @@ Provides-Extra: cassandra
|
|
|
25
25
|
Requires-Dist: cassandra-driver>=3.20; extra == "cassandra"
|
|
26
26
|
Provides-Extra: redis
|
|
27
27
|
Requires-Dist: redis>=2.10.0; extra == "redis"
|
|
28
|
+
Provides-Extra: bloom
|
|
29
|
+
Requires-Dist: pybloomfilter3>=0.7.2; python_version >= "3.9" and extra == "bloom"
|
|
30
|
+
Requires-Dist: pybloomfiltermmap3==0.6.0; python_version < "3.9" and extra == "bloom"
|
|
28
31
|
Provides-Extra: benchmark
|
|
29
32
|
Requires-Dist: pyhash>=0.9.3; extra == "benchmark"
|
|
30
33
|
Requires-Dist: matplotlib>=3.1.2; extra == "benchmark"
|
|
@@ -45,17 +48,29 @@ Requires-Dist: nose>=1.3.7; extra == "test"
|
|
|
45
48
|
Requires-Dist: nose-exclude>=0.5.0; extra == "test"
|
|
46
49
|
Requires-Dist: pytest; extra == "test"
|
|
47
50
|
Provides-Extra: experimental-aio
|
|
48
|
-
Requires-Dist: aiounittest; python_version >= "3.
|
|
49
|
-
Requires-Dist: motor; python_version >= "3.
|
|
51
|
+
Requires-Dist: aiounittest; python_version >= "3.8" and extra == "experimental-aio"
|
|
52
|
+
Requires-Dist: motor>3.6.0; python_version >= "3.8" and extra == "experimental-aio"
|
|
53
|
+
Dynamic: author
|
|
54
|
+
Dynamic: author-email
|
|
55
|
+
Dynamic: classifier
|
|
56
|
+
Dynamic: description
|
|
57
|
+
Dynamic: home-page
|
|
58
|
+
Dynamic: keywords
|
|
59
|
+
Dynamic: license
|
|
60
|
+
Dynamic: license-file
|
|
61
|
+
Dynamic: project-url
|
|
62
|
+
Dynamic: provides-extra
|
|
63
|
+
Dynamic: requires-dist
|
|
64
|
+
Dynamic: summary
|
|
50
65
|
|
|
51
66
|
datasketch: Big Data Looks Small
|
|
52
67
|
================================
|
|
53
68
|
|
|
54
|
-
.. image:: https://
|
|
55
|
-
:target: https://
|
|
69
|
+
.. image:: https://static.pepy.tech/badge/datasketch/month
|
|
70
|
+
:target: https://pepy.tech/project/datasketch
|
|
56
71
|
|
|
57
|
-
.. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.
|
|
58
|
-
:target: https://
|
|
72
|
+
.. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.598238.svg
|
|
73
|
+
:target: https://zenodo.org/doi/10.5281/zenodo.598238
|
|
59
74
|
|
|
60
75
|
datasketch gives you probabilistic data structures that can process and
|
|
61
76
|
search very large amounts of data super fast, with little loss of
|
|
@@ -83,6 +98,8 @@ sub-linear query time:
|
|
|
83
98
|
+===========================+=============================+========================+
|
|
84
99
|
| `MinHash LSH`_ | MinHash, Weighted MinHash | Jaccard Threshold |
|
|
85
100
|
+---------------------------+-----------------------------+------------------------+
|
|
101
|
+
| `LSHBloom`_ | MinHash, Weighted MinHash | Jaccard Threshold |
|
|
102
|
+
+---------------------------+-----------------------------+------------------------+
|
|
86
103
|
| `MinHash LSH Forest`_ | MinHash, Weighted MinHash | Jaccard Top-K |
|
|
87
104
|
+---------------------------+-----------------------------+------------------------+
|
|
88
105
|
| `MinHash LSH Ensemble`_ | MinHash | Containment Threshold |
|
|
@@ -90,7 +107,7 @@ sub-linear query time:
|
|
|
90
107
|
| `HNSW`_ | Any | Custom Metric Top-K |
|
|
91
108
|
+---------------------------+-----------------------------+------------------------+
|
|
92
109
|
|
|
93
|
-
datasketch must be used with Python 3.
|
|
110
|
+
datasketch must be used with Python 3.8 or above, NumPy 1.11 or above, and Scipy.
|
|
94
111
|
|
|
95
112
|
Note that `MinHash LSH`_ and `MinHash LSH Ensemble`_ also support Redis and Cassandra
|
|
96
113
|
storage layer (see `MinHash LSH at Scale`_).
|
|
@@ -118,6 +135,11 @@ To install with Cassandra dependency:
|
|
|
118
135
|
|
|
119
136
|
pip install datasketch[cassandra]
|
|
120
137
|
|
|
138
|
+
To install with Bloom filter dependency:
|
|
139
|
+
|
|
140
|
+
::
|
|
141
|
+
|
|
142
|
+
pip install datasketch[bloom]
|
|
121
143
|
|
|
122
144
|
.. _`MinHash`: https://ekzhu.github.io/datasketch/minhash.html
|
|
123
145
|
.. _`Weighted MinHash`: https://ekzhu.github.io/datasketch/weightedminhash.html
|
|
@@ -126,5 +148,6 @@ To install with Cassandra dependency:
|
|
|
126
148
|
.. _`MinHash LSH`: https://ekzhu.github.io/datasketch/lsh.html
|
|
127
149
|
.. _`MinHash LSH Forest`: https://ekzhu.github.io/datasketch/lshforest.html
|
|
128
150
|
.. _`MinHash LSH Ensemble`: https://ekzhu.github.io/datasketch/lshensemble.html
|
|
151
|
+
.. _`LSHBloom`: https://ekzhu.github.io/datasketch/lshbloom.html
|
|
129
152
|
.. _`Minhash LSH at Scale`: http://ekzhu.github.io/datasketch/lsh.html#minhash-lsh-at-scale
|
|
130
153
|
.. _`HNSW`: https://ekzhu.github.io/datasketch/documentation.html#hnsw
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
datasketch: Big Data Looks Small
|
|
2
2
|
================================
|
|
3
3
|
|
|
4
|
-
.. image:: https://
|
|
5
|
-
:target: https://
|
|
4
|
+
.. image:: https://static.pepy.tech/badge/datasketch/month
|
|
5
|
+
:target: https://pepy.tech/project/datasketch
|
|
6
6
|
|
|
7
|
-
.. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.
|
|
8
|
-
:target: https://
|
|
7
|
+
.. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.598238.svg
|
|
8
|
+
:target: https://zenodo.org/doi/10.5281/zenodo.598238
|
|
9
9
|
|
|
10
10
|
datasketch gives you probabilistic data structures that can process and
|
|
11
11
|
search very large amounts of data super fast, with little loss of
|
|
@@ -33,6 +33,8 @@ sub-linear query time:
|
|
|
33
33
|
+===========================+=============================+========================+
|
|
34
34
|
| `MinHash LSH`_ | MinHash, Weighted MinHash | Jaccard Threshold |
|
|
35
35
|
+---------------------------+-----------------------------+------------------------+
|
|
36
|
+
| `LSHBloom`_ | MinHash, Weighted MinHash | Jaccard Threshold |
|
|
37
|
+
+---------------------------+-----------------------------+------------------------+
|
|
36
38
|
| `MinHash LSH Forest`_ | MinHash, Weighted MinHash | Jaccard Top-K |
|
|
37
39
|
+---------------------------+-----------------------------+------------------------+
|
|
38
40
|
| `MinHash LSH Ensemble`_ | MinHash | Containment Threshold |
|
|
@@ -40,7 +42,7 @@ sub-linear query time:
|
|
|
40
42
|
| `HNSW`_ | Any | Custom Metric Top-K |
|
|
41
43
|
+---------------------------+-----------------------------+------------------------+
|
|
42
44
|
|
|
43
|
-
datasketch must be used with Python 3.
|
|
45
|
+
datasketch must be used with Python 3.8 or above, NumPy 1.11 or above, and Scipy.
|
|
44
46
|
|
|
45
47
|
Note that `MinHash LSH`_ and `MinHash LSH Ensemble`_ also support Redis and Cassandra
|
|
46
48
|
storage layer (see `MinHash LSH at Scale`_).
|
|
@@ -68,6 +70,11 @@ To install with Cassandra dependency:
|
|
|
68
70
|
|
|
69
71
|
pip install datasketch[cassandra]
|
|
70
72
|
|
|
73
|
+
To install with Bloom filter dependency:
|
|
74
|
+
|
|
75
|
+
::
|
|
76
|
+
|
|
77
|
+
pip install datasketch[bloom]
|
|
71
78
|
|
|
72
79
|
.. _`MinHash`: https://ekzhu.github.io/datasketch/minhash.html
|
|
73
80
|
.. _`Weighted MinHash`: https://ekzhu.github.io/datasketch/weightedminhash.html
|
|
@@ -76,5 +83,6 @@ To install with Cassandra dependency:
|
|
|
76
83
|
.. _`MinHash LSH`: https://ekzhu.github.io/datasketch/lsh.html
|
|
77
84
|
.. _`MinHash LSH Forest`: https://ekzhu.github.io/datasketch/lshforest.html
|
|
78
85
|
.. _`MinHash LSH Ensemble`: https://ekzhu.github.io/datasketch/lshensemble.html
|
|
86
|
+
.. _`LSHBloom`: https://ekzhu.github.io/datasketch/lshbloom.html
|
|
79
87
|
.. _`Minhash LSH at Scale`: http://ekzhu.github.io/datasketch/lsh.html#minhash-lsh-at-scale
|
|
80
88
|
.. _`HNSW`: https://ekzhu.github.io/datasketch/documentation.html#hnsw
|
|
@@ -2,6 +2,7 @@ from datasketch.hyperloglog import HyperLogLog, HyperLogLogPlusPlus
|
|
|
2
2
|
from datasketch.minhash import MinHash
|
|
3
3
|
from datasketch.b_bit_minhash import bBitMinHash
|
|
4
4
|
from datasketch.lsh import MinHashLSH
|
|
5
|
+
from datasketch.lsh_bloom import MinHashLSHBloom
|
|
5
6
|
from datasketch.weighted_minhash import WeightedMinHash, WeightedMinHashGenerator
|
|
6
7
|
from datasketch.lshforest import MinHashLSHForest
|
|
7
8
|
from datasketch.lshensemble import MinHashLSHEnsemble
|
|
@@ -92,7 +92,14 @@ class bBitMinHash(object):
|
|
|
92
92
|
hvs = self.hashvalues[start:start+n]
|
|
93
93
|
# Store the n b-bit hashed values in the current block
|
|
94
94
|
for j, hv in enumerate(hvs):
|
|
95
|
-
|
|
95
|
+
# We do this in BigInteger rather than np.uint64 because of inconsistencies
|
|
96
|
+
# in NumPy type coercion rules between NumPy 1.x and NumPy 2.x environments.
|
|
97
|
+
# In NumPy 2.x, implicit type conversion during bitwise operations is not
|
|
98
|
+
# performed, which can cause integer overflows. This, in turn, can corrupt
|
|
99
|
+
# hashvalues and cause pickled bBitMinHash objects to have the wrong representation.
|
|
100
|
+
# Doing this in BigInteger guarantees we do not experience overflow and still
|
|
101
|
+
# coerces to np.uint64 as expected.
|
|
102
|
+
blocks[i] = int(blocks[i]) | (int(hv) << (n - 1 - j) * slot_size)
|
|
96
103
|
fmt = self._serial_fmt_params + \
|
|
97
104
|
"%d%s" % (num_blocks, self._serial_fmt_block)
|
|
98
105
|
struct.pack_into(fmt, buffer, 0, self.seed, self.b, self.r, \
|
|
@@ -226,6 +226,29 @@ class MinHashLSH(object):
|
|
|
226
226
|
"""
|
|
227
227
|
self._insert(key, minhash, check_duplication=check_duplication, buffer=False)
|
|
228
228
|
|
|
229
|
+
def merge(
|
|
230
|
+
self,
|
|
231
|
+
other: MinHashLSH,
|
|
232
|
+
check_overlap: bool = False
|
|
233
|
+
):
|
|
234
|
+
"""Merge the other MinHashLSH with this one, making this one the union
|
|
235
|
+
of both.
|
|
236
|
+
|
|
237
|
+
Note:
|
|
238
|
+
Only num_perm, number of bands and size of each band are checked for equivalence of two MinHashLSH indexes.
|
|
239
|
+
Other initialization parameters threshold, weights, storage_config, prepickle and hash_func are not checked.
|
|
240
|
+
|
|
241
|
+
Args:
|
|
242
|
+
other (MinHashLSH): The other MinHashLSH.
|
|
243
|
+
check_overlap (bool): Check if there are any overlapping keys before merging and raise if there are any.
|
|
244
|
+
(`default=False`)
|
|
245
|
+
|
|
246
|
+
Raises:
|
|
247
|
+
ValueError: If the two MinHashLSH have different initialization
|
|
248
|
+
parameters, or if `check_overlap` is `True` and there are overlapping keys.
|
|
249
|
+
"""
|
|
250
|
+
self._merge(other, check_overlap=check_overlap, buffer=False)
|
|
251
|
+
|
|
229
252
|
def insertion_session(self, buffer_size: int = 50000) -> MinHashLSHInsertionSession:
|
|
230
253
|
"""
|
|
231
254
|
Create a context manager for fast insertion into this index.
|
|
@@ -282,6 +305,38 @@ class MinHashLSH(object):
|
|
|
282
305
|
for H, hashtable in zip(Hs, self.hashtables):
|
|
283
306
|
hashtable.insert(H, key, buffer=buffer)
|
|
284
307
|
|
|
308
|
+
def __equivalent(self, other:MinHashLSH) -> bool:
|
|
309
|
+
"""
|
|
310
|
+
Returns:
|
|
311
|
+
bool: True if the two MinHashLSH are of the same type and have equal num_perm, number of bands, and size of each band; False otherwise.
|
|
312
|
+
"""
|
|
313
|
+
return (
|
|
314
|
+
type(self) is type(other) and
|
|
315
|
+
self.h == other.h and
|
|
316
|
+
self.b == other.b and
|
|
317
|
+
self.r == other.r
|
|
318
|
+
)
|
|
319
|
+
|
|
320
|
+
def _merge(
|
|
321
|
+
self,
|
|
322
|
+
other: MinHashLSH,
|
|
323
|
+
check_overlap: bool = False,
|
|
324
|
+
buffer: bool = False
|
|
325
|
+
) -> MinHashLSH:
|
|
326
|
+
if self.__equivalent(other):
|
|
327
|
+
if check_overlap and set(self.keys).intersection(set(other.keys)):
|
|
328
|
+
raise ValueError("The keys are overlapping, duplicate key exists.")
|
|
329
|
+
for key in other.keys:
|
|
330
|
+
Hs = other.keys.get(key)
|
|
331
|
+
self.keys.insert(key, *Hs, buffer=buffer)
|
|
332
|
+
for H, hashtable in zip(Hs, self.hashtables):
|
|
333
|
+
hashtable.insert(H, key, buffer=buffer)
|
|
334
|
+
else:
|
|
335
|
+
if type(self) is not type(other):
|
|
336
|
+
raise ValueError(f"Cannot merge type MinHashLSH and type {type(other).__name__}.")
|
|
337
|
+
raise ValueError(
|
|
338
|
+
"Cannot merge MinHashLSH with different initialization parameters.")
|
|
339
|
+
|
|
285
340
|
def query(self, minhash) -> List[Hashable]:
|
|
286
341
|
"""
|
|
287
342
|
Giving the MinHash of the query set, retrieve
|
|
@@ -0,0 +1,335 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from typing import Callable, List, Optional, Tuple
|
|
3
|
+
from datasketch.minhash import MinHash
|
|
4
|
+
from scipy.integrate import quad as integrate
|
|
5
|
+
import numpy as np
|
|
6
|
+
import warnings
|
|
7
|
+
import os
|
|
8
|
+
|
|
9
|
+
try:
|
|
10
|
+
import pybloomfilter
|
|
11
|
+
except ImportError:
|
|
12
|
+
pybloomfilter = None
|
|
13
|
+
|
|
14
|
+
_mersenne_prime = np.uint64((1 << 61) - 1)
|
|
15
|
+
|
|
16
|
+
def _false_positive_probability(threshold, b, r):
|
|
17
|
+
_probability = lambda s: 1 - (1 - s ** float(r)) ** float(b)
|
|
18
|
+
a, err = integrate(_probability, 0.0, threshold)
|
|
19
|
+
return a
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _false_negative_probability(threshold, b, r):
|
|
23
|
+
_probability = lambda s: 1 - (1 - (1 - s ** float(r)) ** float(b))
|
|
24
|
+
a, err = integrate(_probability, threshold, 1.0)
|
|
25
|
+
return a
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _optimal_param(threshold, num_perm, false_positive_weight, false_negative_weight):
|
|
29
|
+
"""
|
|
30
|
+
Compute the optimal `MinHashLSH` parameter that minimizes the weighted sum
|
|
31
|
+
of probabilities of false positive and false negative.
|
|
32
|
+
"""
|
|
33
|
+
min_error = float("inf")
|
|
34
|
+
opt = (0, 0)
|
|
35
|
+
for b in range(1, num_perm + 1):
|
|
36
|
+
max_r = int(num_perm / b)
|
|
37
|
+
for r in range(1, max_r + 1):
|
|
38
|
+
fp = _false_positive_probability(threshold, b, r)
|
|
39
|
+
fn = _false_negative_probability(threshold, b, r)
|
|
40
|
+
error = fp * false_positive_weight + fn * false_negative_weight
|
|
41
|
+
if error < min_error:
|
|
42
|
+
min_error = error
|
|
43
|
+
opt = (b, r)
|
|
44
|
+
return opt
|
|
45
|
+
|
|
46
|
+
if pybloomfilter is not None:
|
|
47
|
+
class BloomTable:
|
|
48
|
+
"""
|
|
49
|
+
Interface to a Bloom Filter meant to model a single band of the MinHash signature matrix
|
|
50
|
+
|
|
51
|
+
Args:
|
|
52
|
+
item_count (int): Number of items expected to be inserted (size of dataset). Used to create Bloom filter.
|
|
53
|
+
fp (float): False positive rate for Bloom filter in (0,1).
|
|
54
|
+
band_size (int): Size of band from MinHash signature matrix this filter is meant to model.
|
|
55
|
+
fname (str): File path where Bloom filter will be saved. If this file already exists, will initialize the Bloom filter from this path.
|
|
56
|
+
max_size (int): Maximum number of elements we should plan to insert into this Bloom filter. Upper bounds the size of the Bloom filter.
|
|
57
|
+
"""
|
|
58
|
+
def __init__(self, item_count: int, fp: float, band_size: int, fname: str = None):
|
|
59
|
+
self.r = band_size
|
|
60
|
+
self.fname = fname
|
|
61
|
+
if fname is not None and os.path.exists(fname):
|
|
62
|
+
print(f"Loading Bloom Filter at {fname}...")
|
|
63
|
+
self.bloom_filter = pybloomfilter.BloomFilter.open(fname)
|
|
64
|
+
else:
|
|
65
|
+
self.bloom_filter = pybloomfilter.BloomFilter(
|
|
66
|
+
capacity=item_count,
|
|
67
|
+
error_rate=fp,
|
|
68
|
+
filename=self.fname
|
|
69
|
+
)
|
|
70
|
+
|
|
71
|
+
def sync(self):
|
|
72
|
+
if self.fname is not None:
|
|
73
|
+
self.bloom_filter.sync()
|
|
74
|
+
else:
|
|
75
|
+
warnings.warn("Attempting to save in-memory Bloom filter, this is a no-op.", RuntimeWarning)
|
|
76
|
+
|
|
77
|
+
def assert_size(self, hashvalues: List[int]):
|
|
78
|
+
if not len(hashvalues) == self.r:
|
|
79
|
+
raise RuntimeError(f"Invalid length for indices, {len(hashvalues)}, expected {self.r} hashvalues in band")
|
|
80
|
+
|
|
81
|
+
def insert(self, hashvalues: List[int]) -> None:
|
|
82
|
+
"""
|
|
83
|
+
Takes as input the indices for a single band and inserts them into the corresponding bit arrays
|
|
84
|
+
|
|
85
|
+
Args:
|
|
86
|
+
hashvalues (List[int]): The hashvalues from a single band of a MinHash object.
|
|
87
|
+
"""
|
|
88
|
+
self.assert_size(hashvalues)
|
|
89
|
+
# https://en.wikipedia.org/wiki/Universal_hashing#Hashing_vectors
|
|
90
|
+
# as the hashvalues are the result of a universal hashing function, their sum is also a universal hash function
|
|
91
|
+
x = sum(hashvalues) % _mersenne_prime
|
|
92
|
+
self.bloom_filter.add(x)
|
|
93
|
+
|
|
94
|
+
def query(self, hashvalues: List[int]) -> bool:
|
|
95
|
+
"""
|
|
96
|
+
Takes as input the indices for a single band and queries them against the corresponding arrays
|
|
97
|
+
returns True if the band's combined hash value is in the Bloom filter, otherwise returns False
|
|
98
|
+
|
|
99
|
+
Args:
|
|
100
|
+
hashvalues (List[int]): The hashvalues from a single band of a MinHash object.
|
|
101
|
+
"""
|
|
102
|
+
self.assert_size(hashvalues)
|
|
103
|
+
x = sum(hashvalues) % _mersenne_prime
|
|
104
|
+
return x in self.bloom_filter
|
|
105
|
+
else:
|
|
106
|
+
class BloomTable:
|
|
107
|
+
def __init__(self, item_count: int, fp: float, band_size: int, fname: str = None):
|
|
108
|
+
raise ImportError("Required dependency pybloomfilter is missing, did you `pip install datasketch[bloom]`?")
|
|
109
|
+
|
|
110
|
+
class MinHashLSHBloom(object):
|
|
111
|
+
"""
|
|
112
|
+
The :ref:`lsh_bloom` index.
|
|
113
|
+
It supports query with `Jaccard similarity`_ threshold.
|
|
114
|
+
Reference: `LSHBloom paper
|
|
115
|
+
<https://arxiv.org/abs/2411.04257>`_.
|
|
116
|
+
|
|
117
|
+
Args:
|
|
118
|
+
threshold (float): The Jaccard similarity threshold between 0.0 and
|
|
119
|
+
1.0. The initialized LSH index will be optimized for the threshold by
|
|
120
|
+
minimizing the false positive and false negative.
|
|
121
|
+
num_perm (int): The number of permutation functions used
|
|
122
|
+
by the MinHash to be indexed. For weighted MinHash, this
|
|
123
|
+
is the sample size (`sample_size`).
|
|
124
|
+
n (int): The number of elements to be inserted (estimate of dataset size).
|
|
125
|
+
fp (float): The false positive rate for each Bloom filter. Must be in (0,1).
|
|
126
|
+
save_dir (str): The directory to save the Bloom filter index to. If Bloom filters
|
|
127
|
+
already exist in this directory, the index will be loaded from here. If None,
|
|
128
|
+
an in-memory index will be created - this index can not be persisted.
|
|
129
|
+
weights (Tuple[float, float]): Used to adjust the relative importance of
|
|
130
|
+
minimizing false positive and false negative when optimizing
|
|
131
|
+
for the Jaccard similarity threshold.
|
|
132
|
+
`weights` is a tuple in the format of
|
|
133
|
+
:code:`(false_positive_weight, false_negative_weight)`.
|
|
134
|
+
params (Optional[Tuple[int, int]]): The LSH parameters (i.e., number of bands and size
|
|
135
|
+
of each band). This is used to bypass the parameter optimization
|
|
136
|
+
step in the constructor. `threshold` and `weights` will be ignored
|
|
137
|
+
if this is given.
|
|
138
|
+
|
|
139
|
+
Note:
|
|
140
|
+
This algorithm is a space optimized version of MinHashLSH.
|
|
141
|
+
For more details on :ref:`minhash_lsh`, see the documentation.
|
|
142
|
+
|
|
143
|
+
This algorithm uses Bloom filters to drastically reduce the space
|
|
144
|
+
that the LSH index occupies on disk. However, it loses the ability
|
|
145
|
+
to retrieve candidate duplicate keys. Rather, it can only tell you
|
|
146
|
+
whether a query set is a duplicate of a set that was inserted previously.
|
|
147
|
+
This enables scaling to datasets of many hundreds of millions or billions
|
|
148
|
+
of documents, but may not be appropriate for all use cases.
|
|
149
|
+
|
|
150
|
+
Examples:
|
|
151
|
+
|
|
152
|
+
Create an index with 128 permutation functions optimized for Jaccard
|
|
153
|
+
threshold 0.5:
|
|
154
|
+
|
|
155
|
+
.. code-block:: python
|
|
156
|
+
|
|
157
|
+
from datasketch import MinHash, MinHashLSHBloom
|
|
158
|
+
|
|
159
|
+
set1 = set(['minhash', 'is', 'a', 'probabilistic', 'data', 'structure', 'for',
|
|
160
|
+
'estimating', 'the', 'similarity', 'between', 'datasets'])
|
|
161
|
+
set2 = set(['minhash', 'is', 'a', 'probability', 'data', 'structure', 'for',
|
|
162
|
+
'estimating', 'the', 'similarity', 'between', 'documents'])
|
|
163
|
+
set3 = set(['minhash', 'is', 'probability', 'data', 'structure', 'for',
|
|
164
|
+
'estimating', 'the', 'similarity', 'between', 'documents'])
|
|
165
|
+
|
|
166
|
+
m1 = MinHash(num_perm=128)
|
|
167
|
+
m2 = MinHash(num_perm=128)
|
|
168
|
+
m3 = MinHash(num_perm=128)
|
|
169
|
+
for d in set1:
|
|
170
|
+
m1.update(d.encode('utf8'))
|
|
171
|
+
for d in set2:
|
|
172
|
+
m2.update(d.encode('utf8'))
|
|
173
|
+
for d in set3:
|
|
174
|
+
m3.update(d.encode('utf8'))
|
|
175
|
+
|
|
176
|
+
# Create LSHBloom index
|
|
177
|
+
lsh = MinHashLSHBloom(threshold=0.5, num_perm=128, n=100, fp=0.0001, save_dir="./index/")
|
|
178
|
+
lsh.insert(m2)
|
|
179
|
+
lsh.insert(m3)
|
|
180
|
+
|
|
181
|
+
# Query whether m1 is a duplicate according to the given threshold
|
|
182
|
+
is_duplicate = lsh.query(m1)
|
|
183
|
+
"""
|
|
184
|
+
|
|
185
|
+
def __init__(
|
|
186
|
+
self,
|
|
187
|
+
threshold: float = 0.9,
|
|
188
|
+
num_perm: int = 128,
|
|
189
|
+
n: int = None,
|
|
190
|
+
fp: float = None,
|
|
191
|
+
save_dir: str = None,
|
|
192
|
+
weights: Tuple[float, float] = (0.5, 0.5),
|
|
193
|
+
params: Optional[Tuple[int, int]] = None,
|
|
194
|
+
) -> None:
|
|
195
|
+
if threshold > 1.0 or threshold < 0.0:
|
|
196
|
+
raise ValueError("threshold must be in [0.0, 1.0]")
|
|
197
|
+
if num_perm < 2:
|
|
198
|
+
raise ValueError("Too few permutation functions")
|
|
199
|
+
if n <= 0:
|
|
200
|
+
raise ValueError("n for LSHBloom must be >= 0")
|
|
201
|
+
if fp >= 1.0 or fp <= 0.0:
|
|
202
|
+
raise ValueError("fp must be in (0.0, 1.0)")
|
|
203
|
+
if save_dir is None:
|
|
204
|
+
warnings.warn("Creating LSHBloom index without save directory, this index will not be persisted.", RuntimeWarning)
|
|
205
|
+
if any(w < 0.0 or w > 1.0 for w in weights):
|
|
206
|
+
raise ValueError("Weight must be in [0.0, 1.0]")
|
|
207
|
+
if sum(weights) != 1.0:
|
|
208
|
+
raise ValueError("Weights must sum to 1.0")
|
|
209
|
+
self.h = num_perm
|
|
210
|
+
if params is not None:
|
|
211
|
+
self.b, self.r = params
|
|
212
|
+
if self.b * self.r > num_perm:
|
|
213
|
+
raise ValueError(
|
|
214
|
+
"The product of b and r in params is "
|
|
215
|
+
"{} * {} = {} -- it must be less than num_perm {}. "
|
|
216
|
+
"Did you forget to specify num_perm?".format(
|
|
217
|
+
self.b, self.r, self.b * self.r, num_perm
|
|
218
|
+
)
|
|
219
|
+
)
|
|
220
|
+
else:
|
|
221
|
+
false_positive_weight, false_negative_weight = weights
|
|
222
|
+
self.b, self.r = _optimal_param(
|
|
223
|
+
threshold, num_perm, false_positive_weight, false_negative_weight
|
|
224
|
+
)
|
|
225
|
+
if self.b < 2:
|
|
226
|
+
raise ValueError("The number of bands are too small (b < 2)")
|
|
227
|
+
|
|
228
|
+
# create a Bloom filter for each band in the signature matrix
|
|
229
|
+
if save_dir is not None:
|
|
230
|
+
os.makedirs(save_dir, exist_ok=True)
|
|
231
|
+
self.hashtables = [
|
|
232
|
+
BloomTable(
|
|
233
|
+
item_count=n,
|
|
234
|
+
fp=fp, band_size=self.r,
|
|
235
|
+
fname=os.path.join(save_dir, f"band-{i}.bf") if save_dir is not None else None,
|
|
236
|
+
)
|
|
237
|
+
for i in range(self.b)
|
|
238
|
+
]
|
|
239
|
+
self.hashranges = [(i * self.r, (i + 1) * self.r) for i in range(self.b)]
|
|
240
|
+
|
|
241
|
+
def insert(
|
|
242
|
+
self,
|
|
243
|
+
minhash: MinHash
|
|
244
|
+
):
|
|
245
|
+
"""
|
|
246
|
+
Insert the MinHash or Weighted MinHash
|
|
247
|
+
of a set to the index.
|
|
248
|
+
|
|
249
|
+
Args:
|
|
250
|
+
minhash (Union[MinHash, WeightedMinHash]): The MinHash of the set.
|
|
251
|
+
|
|
252
|
+
"""
|
|
253
|
+
self._insert(minhash)
|
|
254
|
+
|
|
255
|
+
def _insert(
|
|
256
|
+
self,
|
|
257
|
+
minhash: MinHash
|
|
258
|
+
):
|
|
259
|
+
if len(minhash) != self.h:
|
|
260
|
+
raise ValueError(
|
|
261
|
+
"Expecting minhash with length %d, got %d" % (self.h, len(minhash))
|
|
262
|
+
)
|
|
263
|
+
|
|
264
|
+
Hs = [minhash.hashvalues[start:end] for start, end in self.hashranges]
|
|
265
|
+
|
|
266
|
+
for H, hashtable in zip(Hs, self.hashtables):
|
|
267
|
+
hashtable.insert(H)
|
|
268
|
+
|
|
269
|
+
def query(self, minhash) -> bool:
|
|
270
|
+
"""
|
|
271
|
+
Given the MinHash of the query set, determine
|
|
272
|
+
whether any previously inserted sets have
|
|
273
|
+
Jaccard similarity with the query that is
|
|
274
|
+
likely greater than the threshold.
|
|
275
|
+
|
|
276
|
+
Results are based on minhash segment collision
|
|
277
|
+
and are thus approximate.
|
|
278
|
+
|
|
279
|
+
Args:
|
|
280
|
+
minhash (MinHash): The MinHash of the query set.
|
|
281
|
+
|
|
282
|
+
Returns:
|
|
283
|
+
bool: Whether the item is a duplicate or not, based on the given threshold.
|
|
284
|
+
|
|
285
|
+
Example:
|
|
286
|
+
|
|
287
|
+
.. code-block:: python
|
|
288
|
+
|
|
289
|
+
from datasketch import MinHash, MinHashLSHBloom
|
|
290
|
+
import numpy as np
|
|
291
|
+
|
|
292
|
+
# Generate 100 random MinHashes.
|
|
293
|
+
minhashes = MinHash.bulk(
|
|
294
|
+
np.random.randint(low=0, high=30, size=(100, 10)),
|
|
295
|
+
num_perm=128
|
|
296
|
+
)
|
|
297
|
+
|
|
298
|
+
# Create LSHBloom index.
|
|
299
|
+
lsh = MinHashLSHBloom(threshold=0.5, num_perm=128, n=100, fp=0.0001, save_dir="./index/")
|
|
300
|
+
for i, m in enumerate(minhashes):
|
|
301
|
+
lsh.insert(m)
|
|
302
|
+
|
|
303
|
+
# Get the duplication result from LSHBloom.
|
|
304
|
+
query = minhashes[0]
|
|
305
|
+
is_duplicate = lsh.query(query)
|
|
306
|
+
print(is_duplicate)
|
|
307
|
+
|
|
308
|
+
Output:
|
|
309
|
+
|
|
310
|
+
.. code-block::
|
|
311
|
+
|
|
312
|
+
True
|
|
313
|
+
|
|
314
|
+
Note that although the threshold is set to 0.5, the results are not
|
|
315
|
+
guaranteed to be above 0.5 because the LSHBloom index is approximate and
|
|
316
|
+
the Jaccard similarity is estimated by MinHash.
|
|
317
|
+
|
|
318
|
+
"""
|
|
319
|
+
if len(minhash) != self.h:
|
|
320
|
+
raise ValueError(
|
|
321
|
+
"Expecting minhash with length %d, got %d" % (self.h, len(minhash))
|
|
322
|
+
)
|
|
323
|
+
|
|
324
|
+
# if we match in any band, this is a candidate pair
|
|
325
|
+
for (start, end), hashtable in zip(self.hashranges, self.hashtables):
|
|
326
|
+
H = minhash.hashvalues[start:end]
|
|
327
|
+
collision = hashtable.query(H)
|
|
328
|
+
if collision:
|
|
329
|
+
return True
|
|
330
|
+
return False
|
|
331
|
+
|
|
332
|
+
def sync(self):
|
|
333
|
+
print("Saving Bloom Index...")
|
|
334
|
+
for table in self.hashtables:
|
|
335
|
+
table.sync()
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
from collections import defaultdict
|
|
2
2
|
from typing import Hashable, List
|
|
3
|
+
import numpy as np
|
|
3
4
|
|
|
4
5
|
from datasketch.minhash import MinHash
|
|
5
6
|
|
|
@@ -128,6 +129,30 @@ class MinHashLSHForest(object):
|
|
|
128
129
|
r -= 1
|
|
129
130
|
return list(results)
|
|
130
131
|
|
|
132
|
+
def get_minhash_hashvalues(self, key: Hashable) -> np.ndarray:
|
|
133
|
+
"""
|
|
134
|
+
Returns the hashvalues from the MinHash object that corresponds to the given key in the LSHForest,
|
|
135
|
+
if it exists. This is useful for when we want to reconstruct the original MinHash
|
|
136
|
+
object to manually check the Jaccard Similarity for the top-k results from a query.
|
|
137
|
+
|
|
138
|
+
Args:
|
|
139
|
+
key (Hashable): The key whose MinHash hashvalues we want to retrieve.
|
|
140
|
+
|
|
141
|
+
Returns:
|
|
142
|
+
hashvalues: The hashvalues for the MinHash object corresponding to the given key.
|
|
143
|
+
"""
|
|
144
|
+
byteslist = self.keys.get(key, None)
|
|
145
|
+
if byteslist is None:
|
|
146
|
+
raise KeyError(f"The provided key does not exist in the LSHForest: {key}")
|
|
147
|
+
hashvalue_byte_size = len(byteslist[0])//8
|
|
148
|
+
hashvalues = np.empty(len(byteslist)*hashvalue_byte_size, dtype=np.uint64)
|
|
149
|
+
for index, item in enumerate(byteslist):
|
|
150
|
+
# unswap the bytes, as their representation is flipped during storage
|
|
151
|
+
hv_segment = np.frombuffer(item, dtype=np.uint64).byteswap()
|
|
152
|
+
curr_index = index*hashvalue_byte_size
|
|
153
|
+
hashvalues[curr_index:curr_index+hashvalue_byte_size] = hv_segment
|
|
154
|
+
return hashvalues
|
|
155
|
+
|
|
131
156
|
def _binary_search(self, n, func):
|
|
132
157
|
"""
|
|
133
158
|
https://golang.org/src/sort/search.go?s=2247:2287#L49
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.7.0"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: datasketch
|
|
3
|
-
Version: 1.
|
|
3
|
+
Version: 1.7.0
|
|
4
4
|
Summary: Probabilistic data structures for processing and searching very large datasets
|
|
5
5
|
Home-page: https://ekzhu.github.io/datasketch
|
|
6
6
|
Author: ekzhu
|
|
@@ -13,11 +13,11 @@ Classifier: Intended Audience :: Developers
|
|
|
13
13
|
Classifier: Topic :: Database
|
|
14
14
|
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
15
15
|
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
-
Classifier: Programming Language :: Python :: 3.7
|
|
17
16
|
Classifier: Programming Language :: Python :: 3.8
|
|
18
17
|
Classifier: Programming Language :: Python :: 3.9
|
|
19
18
|
Classifier: Programming Language :: Python :: 3.10
|
|
20
19
|
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
21
|
License-File: LICENSE
|
|
22
22
|
Requires-Dist: numpy>=1.11
|
|
23
23
|
Requires-Dist: scipy>=1.0.0
|
|
@@ -25,6 +25,9 @@ Provides-Extra: cassandra
|
|
|
25
25
|
Requires-Dist: cassandra-driver>=3.20; extra == "cassandra"
|
|
26
26
|
Provides-Extra: redis
|
|
27
27
|
Requires-Dist: redis>=2.10.0; extra == "redis"
|
|
28
|
+
Provides-Extra: bloom
|
|
29
|
+
Requires-Dist: pybloomfilter3>=0.7.2; python_version >= "3.9" and extra == "bloom"
|
|
30
|
+
Requires-Dist: pybloomfiltermmap3==0.6.0; python_version < "3.9" and extra == "bloom"
|
|
28
31
|
Provides-Extra: benchmark
|
|
29
32
|
Requires-Dist: pyhash>=0.9.3; extra == "benchmark"
|
|
30
33
|
Requires-Dist: matplotlib>=3.1.2; extra == "benchmark"
|
|
@@ -45,17 +48,29 @@ Requires-Dist: nose>=1.3.7; extra == "test"
|
|
|
45
48
|
Requires-Dist: nose-exclude>=0.5.0; extra == "test"
|
|
46
49
|
Requires-Dist: pytest; extra == "test"
|
|
47
50
|
Provides-Extra: experimental-aio
|
|
48
|
-
Requires-Dist: aiounittest; python_version >= "3.
|
|
49
|
-
Requires-Dist: motor; python_version >= "3.
|
|
51
|
+
Requires-Dist: aiounittest; python_version >= "3.8" and extra == "experimental-aio"
|
|
52
|
+
Requires-Dist: motor>3.6.0; python_version >= "3.8" and extra == "experimental-aio"
|
|
53
|
+
Dynamic: author
|
|
54
|
+
Dynamic: author-email
|
|
55
|
+
Dynamic: classifier
|
|
56
|
+
Dynamic: description
|
|
57
|
+
Dynamic: home-page
|
|
58
|
+
Dynamic: keywords
|
|
59
|
+
Dynamic: license
|
|
60
|
+
Dynamic: license-file
|
|
61
|
+
Dynamic: project-url
|
|
62
|
+
Dynamic: provides-extra
|
|
63
|
+
Dynamic: requires-dist
|
|
64
|
+
Dynamic: summary
|
|
50
65
|
|
|
51
66
|
datasketch: Big Data Looks Small
|
|
52
67
|
================================
|
|
53
68
|
|
|
54
|
-
.. image:: https://
|
|
55
|
-
:target: https://
|
|
69
|
+
.. image:: https://static.pepy.tech/badge/datasketch/month
|
|
70
|
+
:target: https://pepy.tech/project/datasketch
|
|
56
71
|
|
|
57
|
-
.. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.
|
|
58
|
-
:target: https://
|
|
72
|
+
.. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.598238.svg
|
|
73
|
+
:target: https://zenodo.org/doi/10.5281/zenodo.598238
|
|
59
74
|
|
|
60
75
|
datasketch gives you probabilistic data structures that can process and
|
|
61
76
|
search very large amounts of data super fast, with little loss of
|
|
@@ -83,6 +98,8 @@ sub-linear query time:
|
|
|
83
98
|
+===========================+=============================+========================+
|
|
84
99
|
| `MinHash LSH`_ | MinHash, Weighted MinHash | Jaccard Threshold |
|
|
85
100
|
+---------------------------+-----------------------------+------------------------+
|
|
101
|
+
| `LSHBloom`_ | MinHash, Weighted MinHash | Jaccard Threshold |
|
|
102
|
+
+---------------------------+-----------------------------+------------------------+
|
|
86
103
|
| `MinHash LSH Forest`_ | MinHash, Weighted MinHash | Jaccard Top-K |
|
|
87
104
|
+---------------------------+-----------------------------+------------------------+
|
|
88
105
|
| `MinHash LSH Ensemble`_ | MinHash | Containment Threshold |
|
|
@@ -90,7 +107,7 @@ sub-linear query time:
|
|
|
90
107
|
| `HNSW`_ | Any | Custom Metric Top-K |
|
|
91
108
|
+---------------------------+-----------------------------+------------------------+
|
|
92
109
|
|
|
93
|
-
datasketch must be used with Python 3.
|
|
110
|
+
datasketch must be used with Python 3.8 or above, NumPy 1.11 or above, and SciPy.
|
|
94
111
|
|
|
95
112
|
Note that `MinHash LSH`_ and `MinHash LSH Ensemble`_ also support Redis and Cassandra
|
|
96
113
|
storage layer (see `MinHash LSH at Scale`_).
|
|
@@ -118,6 +135,11 @@ To install with Cassandra dependency:
|
|
|
118
135
|
|
|
119
136
|
pip install datasketch[cassandra]
|
|
120
137
|
|
|
138
|
+
To install with Bloom filter dependency:
|
|
139
|
+
|
|
140
|
+
::
|
|
141
|
+
|
|
142
|
+
pip install datasketch[bloom]
|
|
121
143
|
|
|
122
144
|
.. _`MinHash`: https://ekzhu.github.io/datasketch/minhash.html
|
|
123
145
|
.. _`Weighted MinHash`: https://ekzhu.github.io/datasketch/weightedminhash.html
|
|
@@ -126,5 +148,6 @@ To install with Cassandra dependency:
|
|
|
126
148
|
.. _`MinHash LSH`: https://ekzhu.github.io/datasketch/lsh.html
|
|
127
149
|
.. _`MinHash LSH Forest`: https://ekzhu.github.io/datasketch/lshforest.html
|
|
128
150
|
.. _`MinHash LSH Ensemble`: https://ekzhu.github.io/datasketch/lshensemble.html
|
|
151
|
+
.. _`LSHBloom`: https://ekzhu.github.io/datasketch/lshbloom.html
|
|
129
152
|
.. _`Minhash LSH at Scale`: http://ekzhu.github.io/datasketch/lsh.html#minhash-lsh-at-scale
|
|
130
153
|
.. _`HNSW`: https://ekzhu.github.io/datasketch/documentation.html#hnsw
|
|
@@ -9,6 +9,7 @@ datasketch/hyperloglog.py
|
|
|
9
9
|
datasketch/hyperloglog_const.py
|
|
10
10
|
datasketch/lean_minhash.py
|
|
11
11
|
datasketch/lsh.py
|
|
12
|
+
datasketch/lsh_bloom.py
|
|
12
13
|
datasketch/lshensemble.py
|
|
13
14
|
datasketch/lshensemble_partition.py
|
|
14
15
|
datasketch/lshforest.py
|
|
@@ -30,6 +31,7 @@ test/test_hyperloglog.py
|
|
|
30
31
|
test/test_lean_minhash.py
|
|
31
32
|
test/test_lsh.py
|
|
32
33
|
test/test_lsh_cassandra.py
|
|
34
|
+
test/test_lshbloom.py
|
|
33
35
|
test/test_lshensemble.py
|
|
34
36
|
test/test_lshforest.py
|
|
35
37
|
test/test_minhash.py
|
|
@@ -11,14 +11,22 @@ SetSimilaritySearch>=0.1.7
|
|
|
11
11
|
pyfarmhash>=0.2.2
|
|
12
12
|
nltk>=3.4.5
|
|
13
13
|
|
|
14
|
+
[bloom]
|
|
15
|
+
|
|
16
|
+
[bloom:python_version < "3.9"]
|
|
17
|
+
pybloomfiltermmap3==0.6.0
|
|
18
|
+
|
|
19
|
+
[bloom:python_version >= "3.9"]
|
|
20
|
+
pybloomfilter3>=0.7.2
|
|
21
|
+
|
|
14
22
|
[cassandra]
|
|
15
23
|
cassandra-driver>=3.20
|
|
16
24
|
|
|
17
25
|
[experimental_aio]
|
|
18
26
|
|
|
19
|
-
[experimental_aio:python_version >= "3.
|
|
27
|
+
[experimental_aio:python_version >= "3.8"]
|
|
20
28
|
aiounittest
|
|
21
|
-
motor
|
|
29
|
+
motor>3.6.0
|
|
22
30
|
|
|
23
31
|
[redis]
|
|
24
32
|
redis>=2.10.0
|
|
@@ -39,11 +39,11 @@ setup(
|
|
|
39
39
|
'Topic :: Database',
|
|
40
40
|
'Topic :: Scientific/Engineering :: Information Analysis',
|
|
41
41
|
'License :: OSI Approved :: MIT License',
|
|
42
|
-
'Programming Language :: Python :: 3.7',
|
|
43
42
|
'Programming Language :: Python :: 3.8',
|
|
44
43
|
'Programming Language :: Python :: 3.9',
|
|
45
44
|
'Programming Language :: Python :: 3.10',
|
|
46
45
|
'Programming Language :: Python :: 3.11',
|
|
46
|
+
'Programming Language :: Python :: 3.12',
|
|
47
47
|
],
|
|
48
48
|
keywords='database datamining',
|
|
49
49
|
packages=find_packages(include=['datasketch*']),
|
|
@@ -58,6 +58,10 @@ setup(
|
|
|
58
58
|
'redis': [
|
|
59
59
|
'redis>=2.10.0',
|
|
60
60
|
],
|
|
61
|
+
'bloom': [
|
|
62
|
+
'pybloomfilter3>=0.7.2 ; python_version>="3.9"',
|
|
63
|
+
'pybloomfiltermmap3==0.6.0 ; python_version<"3.9"',
|
|
64
|
+
],
|
|
61
65
|
'benchmark': [
|
|
62
66
|
'pyhash>=0.9.3',
|
|
63
67
|
'matplotlib>=3.1.2',
|
|
@@ -80,8 +84,8 @@ setup(
|
|
|
80
84
|
'pytest',
|
|
81
85
|
],
|
|
82
86
|
'experimental_aio': [
|
|
83
|
-
"aiounittest ; python_version>='3.
|
|
84
|
-
"motor ; python_version>='3.
|
|
87
|
+
"aiounittest ; python_version>='3.8'",
|
|
88
|
+
"motor>3.6.0 ; python_version>='3.8'",
|
|
85
89
|
],
|
|
86
90
|
},
|
|
87
91
|
)
|
|
@@ -240,6 +240,117 @@ class TestMinHashLSH(unittest.TestCase):
|
|
|
240
240
|
for table in counts:
|
|
241
241
|
self.assertEqual(sum(table.values()), 2)
|
|
242
242
|
|
|
243
|
+
def test_merge(self):
    """Merge in-memory indexes and check the resulting keys and buckets.

    Covers a merge of disjoint key sets, the ``ValueError`` raised when
    ``check_overlap=True`` meets overlapping keys, and merges of
    overlapping keys with the overlap check left off.
    """
    lsh1 = MinHashLSH(threshold=0.5, num_perm=16)
    m1 = MinHash(16)
    m1.update("a".encode("utf-8"))
    m2 = MinHash(16)
    m2.update("b".encode("utf-8"))
    lsh1.insert("a", m1)
    lsh1.insert("b", m2)

    lsh2 = MinHashLSH(threshold=0.5, num_perm=16)
    m3 = MinHash(16)
    m3.update("c".encode("utf-8"))
    m4 = MinHash(16)
    m4.update("d".encode("utf-8"))
    # Index the MinHashes built for "c" and "d"; m3 and m4 were previously
    # created but never used, with m1/m2 inserted in their place.
    lsh2.insert("c", m3)
    lsh2.insert("d", m4)

    lsh1.merge(lsh2)
    for t in lsh1.hashtables:
        self.assertGreaterEqual(len(t), 1)
        items = []
        for H in t:
            items.extend(t[H])
        self.assertIn("c", items)
        self.assertIn("d", items)
    self.assertIn("a", lsh1)
    self.assertIn("b", lsh1)
    self.assertIn("c", lsh1)
    self.assertIn("d", lsh1)
    for i, H in enumerate(lsh1.keys["c"]):
        self.assertIn("c", lsh1.hashtables[i][H])

    # NOTE(review): this only asserts that the bound method object is truthy
    # (lsh2 is passed as the failure message); merge() is not called here.
    # Confirm the intended check before replacing it.
    self.assertTrue(lsh1.merge, lsh2)
    self.assertRaises(ValueError, lsh1.merge, lsh2, check_overlap=True)

    m5 = MinHash(16)
    m5.update("e".encode("utf-8"))
    lsh3 = MinHashLSH(threshold=0.5, num_perm=16)
    lsh3.insert("a", m5)

    # "a" already exists in lsh1, so the overlap check must reject the merge.
    self.assertRaises(ValueError, lsh1.merge, lsh3, check_overlap=True)

    # Without the overlap check the same merge is expected not to raise.
    lsh1.merge(lsh3)

    m6 = MinHash(16)
    m6.update("e".encode("utf-8"))
    lsh4 = MinHashLSH(threshold=0.5, num_perm=16)
    lsh4.insert("a", m6)

    lsh1.merge(lsh4, check_overlap=False)
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
def test_merge_redis(self):
    """Merge redis-backed indexes (with ``redis.Redis`` patched) and check the result.

    Keys and bucket members are compared in their pickled form, matching
    what the assertions read back from the storage.
    """
    # The whole body runs under the patch because further redis-backed
    # indexes are constructed later in the test.
    with patch('redis.Redis', fake_redis):
        lsh1 = MinHashLSH(threshold=0.5, num_perm=16, storage_config={
            'type': 'redis', 'redis': {'host': 'localhost', 'port': 6379}
        })
        lsh2 = MinHashLSH(threshold=0.5, num_perm=16, storage_config={
            'type': 'redis', 'redis': {'host': 'localhost', 'port': 6379}
        })

        m1 = MinHash(16)
        m1.update("a".encode("utf8"))
        m2 = MinHash(16)
        m2.update("b".encode("utf8"))
        lsh1.insert("a", m1)
        lsh1.insert("b", m2)

        m3 = MinHash(16)
        m3.update("c".encode("utf8"))
        m4 = MinHash(16)
        m4.update("d".encode("utf8"))
        lsh2.insert("c", m3)
        lsh2.insert("d", m4)

        lsh1.merge(lsh2)
        for t in lsh1.hashtables:
            self.assertGreaterEqual(len(t), 1)
            items = []
            for H in t:
                items.extend(t[H])
            self.assertIn(pickle.dumps("c"), items)
            self.assertIn(pickle.dumps("d"), items)
        self.assertIn("a", lsh1)
        self.assertIn("b", lsh1)
        self.assertIn("c", lsh1)
        self.assertIn("d", lsh1)
        for i, H in enumerate(lsh1.keys[pickle.dumps("c")]):
            self.assertIn(pickle.dumps("c"), lsh1.hashtables[i][H])

        # NOTE(review): this only asserts that the bound method object is
        # truthy (lsh2 is passed as the failure message); merge() is not
        # called here. Confirm the intended check before replacing it.
        self.assertTrue(lsh1.merge, lsh2)
        self.assertRaises(ValueError, lsh1.merge, lsh2, check_overlap=True)

        m5 = MinHash(16)
        m5.update("e".encode("utf-8"))
        lsh3 = MinHashLSH(threshold=0.5, num_perm=16, storage_config={
            'type': 'redis', 'redis': {'host': 'localhost', 'port': 6379}
        })
        lsh3.insert("a", m5)

        # "a" already exists in lsh1, so the overlap check must reject the merge.
        self.assertRaises(ValueError, lsh1.merge, lsh3, check_overlap=True)

        m6 = MinHash(16)
        m6.update("e".encode("utf-8"))
        lsh4 = MinHashLSH(threshold=0.5, num_perm=16, storage_config={
            'type': 'redis', 'redis': {'host': 'localhost', 'port': 6379}
        })
        lsh4.insert("a", m6)

        lsh1.merge(lsh4, check_overlap=False)
|
|
353
|
+
|
|
243
354
|
|
|
244
355
|
class TestWeightedMinHashLSH(unittest.TestCase):
|
|
245
356
|
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
import unittest
|
|
2
|
+
import pickle
|
|
3
|
+
from glob import glob
|
|
4
|
+
from datasketch.lsh_bloom import BloomTable, MinHashLSHBloom
|
|
5
|
+
from datasketch.minhash import MinHash
|
|
6
|
+
import numpy as np
|
|
7
|
+
import os
|
|
8
|
+
|
|
9
|
+
class TestBloomTable(unittest.TestCase):
    """Tests for BloomTable insertion, querying and file-backed persistence."""

    def test_insert(self):
        """A 3-element band inserts; a 2-element one must raise RuntimeError."""
        r = 3
        x = np.array([2, 3, 31], dtype=np.uint32)
        b = BloomTable(10, 0.01, band_size=r)
        b.insert(x)
        self.assertRaises(RuntimeError, b.insert, np.array([2, 2], dtype=np.uint32))

    def test_query(self):
        """An inserted band is found, a different band is not, a short one raises."""
        r = 3
        x = np.array([2, 3, 31], dtype=np.uint32)
        b = BloomTable(10, 0.01, band_size=r)
        b.insert(x)
        self.assertTrue(b.query(x))
        self.assertFalse(b.query(np.array([2, 3, 30], dtype=np.uint32)))
        self.assertRaises(RuntimeError, b.query, [2, 2])

    def test_save(self):
        """Items inserted into a file-backed table are found after reopening it."""
        # Local import keeps this block self-contained. A private temporary
        # directory replaces the fixed /tmp path so the test is portable,
        # cannot collide with a concurrent run and cleans up after itself.
        import tempfile

        r = 3
        x = np.array([2, 3, 31], dtype=np.uint32)
        y = np.array([12, 10, 29], dtype=np.uint32)
        z = np.array([27, 30, 8], dtype=np.uint32)
        items = [x, y, z]
        with tempfile.TemporaryDirectory() as tmp_dir:
            fname = os.path.join(tmp_dir, "bloomfilter.bf")
            b = BloomTable(10, 0.01, band_size=r, fname=fname)
            for item in items:
                b.insert(item)
            for item in items:
                self.assertTrue(b.query(item))
            b.sync()

            del b

            # A second table built on the same file name must see the items.
            b_ = BloomTable(10, 0.01, band_size=r, fname=fname)
            for item in items:
                self.assertTrue(b_.query(item))
            # Drop the table before the directory is removed.
            del b_
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class TestMinHashLSHBloom(unittest.TestCase):
    """Tests for MinHashLSHBloom construction, insert, query and persistence."""

    def test_init(self):
        """Weights (0.2, 0.8) must yield more bands and fewer rows than the default."""
        lsh = MinHashLSHBloom(threshold=0.8, n=10, fp=0.01)
        b1, r1 = lsh.b, lsh.r
        lsh = MinHashLSHBloom(threshold=0.8, weights=(0.2, 0.8), n=10, fp=0.01)
        b2, r2 = lsh.b, lsh.r
        self.assertLess(b1, b2)
        self.assertGreater(r1, r2)
        self.assertEqual(len(lsh.hashtables), lsh.b)

    def test_insert(self):
        """Matching MinHashes insert; one with a different num_perm raises ValueError."""
        lsh = MinHashLSHBloom(threshold=0.5, num_perm=16, n=10, fp=0.01)
        m1 = MinHash(16)
        m1.update("a".encode("utf8"))
        m2 = MinHash(16)
        m2.update("b".encode("utf8"))
        lsh.insert(m1)
        lsh.insert(m2)

        m3 = MinHash(18)
        self.assertRaises(ValueError, lsh.insert, m3)

    def test_query(self):
        """Inserted MinHashes are reported; a mismatched num_perm raises ValueError."""
        lsh = MinHashLSHBloom(threshold=0.5, num_perm=16, n=10, fp=0.01)
        m1 = MinHash(16)
        m1.update("a".encode("utf8"))
        m2 = MinHash(16)
        m2.update("b".encode("utf8"))
        lsh.insert(m1)
        lsh.insert(m2)
        result = lsh.query(m1)
        self.assertTrue(result)
        result = lsh.query(m2)
        self.assertTrue(result)

        m3 = MinHash(18)
        self.assertRaises(ValueError, lsh.query, m3)

    def test_save(self):
        """An index synced to a directory answers queries after being rebuilt from it."""
        # Local import keeps this block self-contained. A fresh temporary
        # directory replaces ./test_save/ so the test neither depends on nor
        # pollutes the current working directory, and needs no cleanup pass.
        import tempfile

        with tempfile.TemporaryDirectory() as tmp_dir:
            # Keep a trailing separator, as the original "./test_save/" had.
            save_path = os.path.join(tmp_dir, "")

            lsh = MinHashLSHBloom(threshold=0.5, num_perm=16, n=10, fp=0.01, save_dir=save_path)
            m1 = MinHash(16)
            m1.update("a".encode("utf8"))
            m2 = MinHash(16)
            m2.update("b".encode("utf8"))
            lsh.insert(m1)
            lsh.insert(m2)
            lsh.sync()

            lsh2 = MinHashLSHBloom(threshold=0.5, num_perm=16, n=10, fp=0.01, save_dir=save_path)
            result = lsh2.query(m1)
            self.assertTrue(result)
            result = lsh2.query(m2)
            self.assertTrue(result)
            # Drop both indexes before the directory is removed.
            del lsh, lsh2

    def test_save_in_memory(self):
        """With save_dir=None, both construction and sync() must emit RuntimeWarning."""
        with self.assertWarns(RuntimeWarning):
            lsh = MinHashLSHBloom(threshold=0.5, num_perm=16, n=10, fp=0.01, save_dir=None)

        m1 = MinHash(16)
        m1.update("a".encode("utf8"))
        m2 = MinHash(16)
        m2.update("b".encode("utf8"))
        lsh.insert(m1)
        lsh.insert(m2)

        with self.assertWarns(RuntimeWarning):
            lsh.sync()
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
if __name__ == "__main__":
|
|
126
|
+
unittest.main()
|
|
@@ -62,6 +62,18 @@ class TestMinHashLSHForest(unittest.TestCase):
|
|
|
62
62
|
results = forest.query(data[key], 10)
|
|
63
63
|
self.assertIn(key, results)
|
|
64
64
|
|
|
65
|
+
def test_get_minhash_hashvalues(self):
    """Hashvalues read back from the forest must rebuild an identical MinHash."""
    forest, data = self._setup()
    for key in data:
        original = data[key]
        hashvalues = forest.get_minhash_hashvalues(key)
        rebuilt = MinHash(hashvalues=hashvalues)
        retrieved_hashvalues = rebuilt.hashvalues
        self.assertEqual(len(hashvalues), len(retrieved_hashvalues))
        self.assertEqual(rebuilt.jaccard(original), 1.0)
        # Lengths were asserted equal above, so zip visits every position.
        for expected, actual in zip(hashvalues, retrieved_hashvalues):
            self.assertEqual(expected, actual)
|
|
76
|
+
|
|
65
77
|
def test_pickle(self):
|
|
66
78
|
forest, _ = self._setup()
|
|
67
79
|
forest2 = pickle.loads(pickle.dumps(forest))
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "1.6.4"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|