datasketch 1.6.5__tar.gz → 1.7.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datasketch-1.6.5 → datasketch-1.7.0}/PKG-INFO +29 -6
- {datasketch-1.6.5 → datasketch-1.7.0}/README.rst +9 -1
- {datasketch-1.6.5 → datasketch-1.7.0}/datasketch/__init__.py +1 -0
- {datasketch-1.6.5 → datasketch-1.7.0}/datasketch/b_bit_minhash.py +8 -1
- datasketch-1.7.0/datasketch/lsh_bloom.py +335 -0
- datasketch-1.7.0/datasketch/version.py +1 -0
- {datasketch-1.6.5 → datasketch-1.7.0}/datasketch.egg-info/PKG-INFO +29 -6
- {datasketch-1.6.5 → datasketch-1.7.0}/datasketch.egg-info/SOURCES.txt +2 -0
- {datasketch-1.6.5 → datasketch-1.7.0}/datasketch.egg-info/requires.txt +10 -2
- {datasketch-1.6.5 → datasketch-1.7.0}/setup.py +7 -3
- datasketch-1.7.0/test/test_lshbloom.py +126 -0
- datasketch-1.6.5/datasketch/version.py +0 -1
- {datasketch-1.6.5 → datasketch-1.7.0}/LICENSE +0 -0
- {datasketch-1.6.5 → datasketch-1.7.0}/datasketch/experimental/__init__.py +0 -0
- {datasketch-1.6.5 → datasketch-1.7.0}/datasketch/experimental/aio/__init__.py +0 -0
- {datasketch-1.6.5 → datasketch-1.7.0}/datasketch/experimental/aio/lsh.py +0 -0
- {datasketch-1.6.5 → datasketch-1.7.0}/datasketch/experimental/aio/storage.py +0 -0
- {datasketch-1.6.5 → datasketch-1.7.0}/datasketch/hashfunc.py +0 -0
- {datasketch-1.6.5 → datasketch-1.7.0}/datasketch/hnsw.py +0 -0
- {datasketch-1.6.5 → datasketch-1.7.0}/datasketch/hyperloglog.py +0 -0
- {datasketch-1.6.5 → datasketch-1.7.0}/datasketch/hyperloglog_const.py +0 -0
- {datasketch-1.6.5 → datasketch-1.7.0}/datasketch/lean_minhash.py +0 -0
- {datasketch-1.6.5 → datasketch-1.7.0}/datasketch/lsh.py +0 -0
- {datasketch-1.6.5 → datasketch-1.7.0}/datasketch/lshensemble.py +0 -0
- {datasketch-1.6.5 → datasketch-1.7.0}/datasketch/lshensemble_partition.py +0 -0
- {datasketch-1.6.5 → datasketch-1.7.0}/datasketch/lshforest.py +0 -0
- {datasketch-1.6.5 → datasketch-1.7.0}/datasketch/minhash.py +0 -0
- {datasketch-1.6.5 → datasketch-1.7.0}/datasketch/storage.py +0 -0
- {datasketch-1.6.5 → datasketch-1.7.0}/datasketch/weighted_minhash.py +0 -0
- {datasketch-1.6.5 → datasketch-1.7.0}/datasketch.egg-info/dependency_links.txt +0 -0
- {datasketch-1.6.5 → datasketch-1.7.0}/datasketch.egg-info/top_level.txt +0 -0
- {datasketch-1.6.5 → datasketch-1.7.0}/setup.cfg +0 -0
- {datasketch-1.6.5 → datasketch-1.7.0}/test/test_hnsw.py +0 -0
- {datasketch-1.6.5 → datasketch-1.7.0}/test/test_hyperloglog.py +0 -0
- {datasketch-1.6.5 → datasketch-1.7.0}/test/test_lean_minhash.py +0 -0
- {datasketch-1.6.5 → datasketch-1.7.0}/test/test_lsh.py +0 -0
- {datasketch-1.6.5 → datasketch-1.7.0}/test/test_lsh_cassandra.py +0 -0
- {datasketch-1.6.5 → datasketch-1.7.0}/test/test_lshensemble.py +0 -0
- {datasketch-1.6.5 → datasketch-1.7.0}/test/test_lshforest.py +0 -0
- {datasketch-1.6.5 → datasketch-1.7.0}/test/test_minhash.py +0 -0
- {datasketch-1.6.5 → datasketch-1.7.0}/test/test_weighted_minhash.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: datasketch
|
|
3
|
-
Version: 1.6.5
|
|
3
|
+
Version: 1.7.0
|
|
4
4
|
Summary: Probabilistic data structures for processing and searching very large datasets
|
|
5
5
|
Home-page: https://ekzhu.github.io/datasketch
|
|
6
6
|
Author: ekzhu
|
|
@@ -13,11 +13,11 @@ Classifier: Intended Audience :: Developers
|
|
|
13
13
|
Classifier: Topic :: Database
|
|
14
14
|
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
15
15
|
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
-
Classifier: Programming Language :: Python :: 3.7
|
|
17
16
|
Classifier: Programming Language :: Python :: 3.8
|
|
18
17
|
Classifier: Programming Language :: Python :: 3.9
|
|
19
18
|
Classifier: Programming Language :: Python :: 3.10
|
|
20
19
|
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
21
|
License-File: LICENSE
|
|
22
22
|
Requires-Dist: numpy>=1.11
|
|
23
23
|
Requires-Dist: scipy>=1.0.0
|
|
@@ -25,6 +25,9 @@ Provides-Extra: cassandra
|
|
|
25
25
|
Requires-Dist: cassandra-driver>=3.20; extra == "cassandra"
|
|
26
26
|
Provides-Extra: redis
|
|
27
27
|
Requires-Dist: redis>=2.10.0; extra == "redis"
|
|
28
|
+
Provides-Extra: bloom
|
|
29
|
+
Requires-Dist: pybloomfilter3>=0.7.2; python_version >= "3.9" and extra == "bloom"
|
|
30
|
+
Requires-Dist: pybloomfiltermmap3==0.6.0; python_version < "3.9" and extra == "bloom"
|
|
28
31
|
Provides-Extra: benchmark
|
|
29
32
|
Requires-Dist: pyhash>=0.9.3; extra == "benchmark"
|
|
30
33
|
Requires-Dist: matplotlib>=3.1.2; extra == "benchmark"
|
|
@@ -45,8 +48,20 @@ Requires-Dist: nose>=1.3.7; extra == "test"
|
|
|
45
48
|
Requires-Dist: nose-exclude>=0.5.0; extra == "test"
|
|
46
49
|
Requires-Dist: pytest; extra == "test"
|
|
47
50
|
Provides-Extra: experimental-aio
|
|
48
|
-
Requires-Dist: aiounittest; python_version >= "3.
|
|
49
|
-
Requires-Dist: motor; python_version >= "3.
|
|
51
|
+
Requires-Dist: aiounittest; python_version >= "3.8" and extra == "experimental-aio"
|
|
52
|
+
Requires-Dist: motor>3.6.0; python_version >= "3.8" and extra == "experimental-aio"
|
|
53
|
+
Dynamic: author
|
|
54
|
+
Dynamic: author-email
|
|
55
|
+
Dynamic: classifier
|
|
56
|
+
Dynamic: description
|
|
57
|
+
Dynamic: home-page
|
|
58
|
+
Dynamic: keywords
|
|
59
|
+
Dynamic: license
|
|
60
|
+
Dynamic: license-file
|
|
61
|
+
Dynamic: project-url
|
|
62
|
+
Dynamic: provides-extra
|
|
63
|
+
Dynamic: requires-dist
|
|
64
|
+
Dynamic: summary
|
|
50
65
|
|
|
51
66
|
datasketch: Big Data Looks Small
|
|
52
67
|
================================
|
|
@@ -83,6 +98,8 @@ sub-linear query time:
|
|
|
83
98
|
+===========================+=============================+========================+
|
|
84
99
|
| `MinHash LSH`_ | MinHash, Weighted MinHash | Jaccard Threshold |
|
|
85
100
|
+---------------------------+-----------------------------+------------------------+
|
|
101
|
+
| `LSHBloom`_ | MinHash, Weighted MinHash | Jaccard Threshold |
|
|
102
|
+
+---------------------------+-----------------------------+------------------------+
|
|
86
103
|
| `MinHash LSH Forest`_ | MinHash, Weighted MinHash | Jaccard Top-K |
|
|
87
104
|
+---------------------------+-----------------------------+------------------------+
|
|
88
105
|
| `MinHash LSH Ensemble`_ | MinHash | Containment Threshold |
|
|
@@ -90,7 +107,7 @@ sub-linear query time:
|
|
|
90
107
|
| `HNSW`_ | Any | Custom Metric Top-K |
|
|
91
108
|
+---------------------------+-----------------------------+------------------------+
|
|
92
109
|
|
|
93
|
-
datasketch must be used with Python 3.7 or above, NumPy 1.11 or above, and Scipy.
|
|
110
|
+
datasketch must be used with Python 3.8 or above, NumPy 1.11 or above, and Scipy.
|
|
94
111
|
|
|
95
112
|
Note that `MinHash LSH`_ and `MinHash LSH Ensemble`_ also support Redis and Cassandra
|
|
96
113
|
storage layer (see `MinHash LSH at Scale`_).
|
|
@@ -118,6 +135,11 @@ To install with Cassandra dependency:
|
|
|
118
135
|
|
|
119
136
|
pip install datasketch[cassandra]
|
|
120
137
|
|
|
138
|
+
To install with Bloom filter dependency:
|
|
139
|
+
|
|
140
|
+
::
|
|
141
|
+
|
|
142
|
+
pip install datasketch[bloom]
|
|
121
143
|
|
|
122
144
|
.. _`MinHash`: https://ekzhu.github.io/datasketch/minhash.html
|
|
123
145
|
.. _`Weighted MinHash`: https://ekzhu.github.io/datasketch/weightedminhash.html
|
|
@@ -126,5 +148,6 @@ To install with Cassandra dependency:
|
|
|
126
148
|
.. _`MinHash LSH`: https://ekzhu.github.io/datasketch/lsh.html
|
|
127
149
|
.. _`MinHash LSH Forest`: https://ekzhu.github.io/datasketch/lshforest.html
|
|
128
150
|
.. _`MinHash LSH Ensemble`: https://ekzhu.github.io/datasketch/lshensemble.html
|
|
151
|
+
.. _`LSHBloom`: https://ekzhu.github.io/datasketch/lshbloom.html
|
|
129
152
|
.. _`Minhash LSH at Scale`: http://ekzhu.github.io/datasketch/lsh.html#minhash-lsh-at-scale
|
|
130
153
|
.. _`HNSW`: https://ekzhu.github.io/datasketch/documentation.html#hnsw
|
|
@@ -33,6 +33,8 @@ sub-linear query time:
|
|
|
33
33
|
+===========================+=============================+========================+
|
|
34
34
|
| `MinHash LSH`_ | MinHash, Weighted MinHash | Jaccard Threshold |
|
|
35
35
|
+---------------------------+-----------------------------+------------------------+
|
|
36
|
+
| `LSHBloom`_ | MinHash, Weighted MinHash | Jaccard Threshold |
|
|
37
|
+
+---------------------------+-----------------------------+------------------------+
|
|
36
38
|
| `MinHash LSH Forest`_ | MinHash, Weighted MinHash | Jaccard Top-K |
|
|
37
39
|
+---------------------------+-----------------------------+------------------------+
|
|
38
40
|
| `MinHash LSH Ensemble`_ | MinHash | Containment Threshold |
|
|
@@ -40,7 +42,7 @@ sub-linear query time:
|
|
|
40
42
|
| `HNSW`_ | Any | Custom Metric Top-K |
|
|
41
43
|
+---------------------------+-----------------------------+------------------------+
|
|
42
44
|
|
|
43
|
-
datasketch must be used with Python 3.7 or above, NumPy 1.11 or above, and Scipy.
|
|
45
|
+
datasketch must be used with Python 3.8 or above, NumPy 1.11 or above, and Scipy.
|
|
44
46
|
|
|
45
47
|
Note that `MinHash LSH`_ and `MinHash LSH Ensemble`_ also support Redis and Cassandra
|
|
46
48
|
storage layer (see `MinHash LSH at Scale`_).
|
|
@@ -68,6 +70,11 @@ To install with Cassandra dependency:
|
|
|
68
70
|
|
|
69
71
|
pip install datasketch[cassandra]
|
|
70
72
|
|
|
73
|
+
To install with Bloom filter dependency:
|
|
74
|
+
|
|
75
|
+
::
|
|
76
|
+
|
|
77
|
+
pip install datasketch[bloom]
|
|
71
78
|
|
|
72
79
|
.. _`MinHash`: https://ekzhu.github.io/datasketch/minhash.html
|
|
73
80
|
.. _`Weighted MinHash`: https://ekzhu.github.io/datasketch/weightedminhash.html
|
|
@@ -76,5 +83,6 @@ To install with Cassandra dependency:
|
|
|
76
83
|
.. _`MinHash LSH`: https://ekzhu.github.io/datasketch/lsh.html
|
|
77
84
|
.. _`MinHash LSH Forest`: https://ekzhu.github.io/datasketch/lshforest.html
|
|
78
85
|
.. _`MinHash LSH Ensemble`: https://ekzhu.github.io/datasketch/lshensemble.html
|
|
86
|
+
.. _`LSHBloom`: https://ekzhu.github.io/datasketch/lshbloom.html
|
|
79
87
|
.. _`Minhash LSH at Scale`: http://ekzhu.github.io/datasketch/lsh.html#minhash-lsh-at-scale
|
|
80
88
|
.. _`HNSW`: https://ekzhu.github.io/datasketch/documentation.html#hnsw
|
|
@@ -2,6 +2,7 @@ from datasketch.hyperloglog import HyperLogLog, HyperLogLogPlusPlus
|
|
|
2
2
|
from datasketch.minhash import MinHash
|
|
3
3
|
from datasketch.b_bit_minhash import bBitMinHash
|
|
4
4
|
from datasketch.lsh import MinHashLSH
|
|
5
|
+
from datasketch.lsh_bloom import MinHashLSHBloom
|
|
5
6
|
from datasketch.weighted_minhash import WeightedMinHash, WeightedMinHashGenerator
|
|
6
7
|
from datasketch.lshforest import MinHashLSHForest
|
|
7
8
|
from datasketch.lshensemble import MinHashLSHEnsemble
|
|
@@ -92,7 +92,14 @@ class bBitMinHash(object):
|
|
|
92
92
|
hvs = self.hashvalues[start:start+n]
|
|
93
93
|
# Store the n b-bit hashed values in the current block
|
|
94
94
|
for j, hv in enumerate(hvs):
|
|
95
|
-
|
|
95
|
+
# We do this in BigInteger rather than np.uint64 because of inconsistencies
|
|
96
|
+
# in NumPy type coercion rules between NumPy 1.x and NumPy 2.x environments.
|
|
97
|
+
# In NumPy 2.x, implicit type conversion during bitwise operations is not
|
|
98
|
+
# performed which can cause integer overflows. This, in turn can corrupt
|
|
99
|
+
# hashvalues and cause pickled bBitMinHash objects to have the wrong representation.
|
|
100
|
+
# Doing this in BigInteger guarantees we do not experience overflow and still
|
|
101
|
+
# coerces to np.uint64 as expected.
|
|
102
|
+
blocks[i] = int(blocks[i]) | (int(hv) << (n - 1 - j) * slot_size)
|
|
96
103
|
fmt = self._serial_fmt_params + \
|
|
97
104
|
"%d%s" % (num_blocks, self._serial_fmt_block)
|
|
98
105
|
struct.pack_into(fmt, buffer, 0, self.seed, self.b, self.r, \
|
|
@@ -0,0 +1,335 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from typing import Callable, List, Optional, Tuple
|
|
3
|
+
from datasketch.minhash import MinHash
|
|
4
|
+
from scipy.integrate import quad as integrate
|
|
5
|
+
import numpy as np
|
|
6
|
+
import warnings
|
|
7
|
+
import os
|
|
8
|
+
|
|
9
|
+
try:
|
|
10
|
+
import pybloomfilter
|
|
11
|
+
except ImportError:
|
|
12
|
+
pybloomfilter = None
|
|
13
|
+
|
|
14
|
+
_mersenne_prime = np.uint64((1 << 61) - 1)
|
|
15
|
+
|
|
16
|
+
def _false_positive_probability(threshold, b, r):
|
|
17
|
+
_probability = lambda s: 1 - (1 - s ** float(r)) ** float(b)
|
|
18
|
+
a, err = integrate(_probability, 0.0, threshold)
|
|
19
|
+
return a
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _false_negative_probability(threshold, b, r):
|
|
23
|
+
_probability = lambda s: 1 - (1 - (1 - s ** float(r)) ** float(b))
|
|
24
|
+
a, err = integrate(_probability, threshold, 1.0)
|
|
25
|
+
return a
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _optimal_param(threshold, num_perm, false_positive_weight, false_negative_weight):
|
|
29
|
+
"""
|
|
30
|
+
Compute the optimal `MinHashLSH` parameter that minimizes the weighted sum
|
|
31
|
+
of probabilities of false positive and false negative.
|
|
32
|
+
"""
|
|
33
|
+
min_error = float("inf")
|
|
34
|
+
opt = (0, 0)
|
|
35
|
+
for b in range(1, num_perm + 1):
|
|
36
|
+
max_r = int(num_perm / b)
|
|
37
|
+
for r in range(1, max_r + 1):
|
|
38
|
+
fp = _false_positive_probability(threshold, b, r)
|
|
39
|
+
fn = _false_negative_probability(threshold, b, r)
|
|
40
|
+
error = fp * false_positive_weight + fn * false_negative_weight
|
|
41
|
+
if error < min_error:
|
|
42
|
+
min_error = error
|
|
43
|
+
opt = (b, r)
|
|
44
|
+
return opt
|
|
45
|
+
|
|
46
|
+
if pybloomfilter is not None:
|
|
47
|
+
class BloomTable:
|
|
48
|
+
"""
|
|
49
|
+
Interface to a Bloom Filter meant to model a single band of the MinHash signature matrix
|
|
50
|
+
|
|
51
|
+
Args:
|
|
52
|
+
item_count (int): Number of items expected to be inserted (size of dataset). Used to create Bloom filter.
|
|
53
|
+
fp (float): False positive rate for Bloom filter in (0,1).
|
|
54
|
+
band_size (int): Size of band from MinHash signature matrix this filter is meant to model.
|
|
55
|
+
fname (str): File path where Bloom filter will be saved. If this file already exists, will initialize the Bloom filter from this path.
|
|
56
|
+
max_size (int): Maximum number of elements we should plan to insert into this Bloom filter. Upper bounds the size of the Bloom filter.
|
|
57
|
+
"""
|
|
58
|
+
def __init__(self, item_count: int, fp: float, band_size: int, fname: str = None):
|
|
59
|
+
self.r = band_size
|
|
60
|
+
self.fname = fname
|
|
61
|
+
if fname is not None and os.path.exists(fname):
|
|
62
|
+
print(f"Loading Bloom Filter at {fname}...")
|
|
63
|
+
self.bloom_filter = pybloomfilter.BloomFilter.open(fname)
|
|
64
|
+
else:
|
|
65
|
+
self.bloom_filter = pybloomfilter.BloomFilter(
|
|
66
|
+
capacity=item_count,
|
|
67
|
+
error_rate=fp,
|
|
68
|
+
filename=self.fname
|
|
69
|
+
)
|
|
70
|
+
|
|
71
|
+
def sync(self):
|
|
72
|
+
if self.fname is not None:
|
|
73
|
+
self.bloom_filter.sync()
|
|
74
|
+
else:
|
|
75
|
+
warnings.warn("Attempting to save in-memory Bloom filter, this is a no-op.", RuntimeWarning)
|
|
76
|
+
|
|
77
|
+
def assert_size(self, hashvalues: List[int]):
|
|
78
|
+
if not len(hashvalues) == self.r:
|
|
79
|
+
raise RuntimeError(f"Invalid length for indices, {len(hashvalues)}, expected {self.r} hashvalues in band")
|
|
80
|
+
|
|
81
|
+
def insert(self, hashvalues: List[int]) -> None:
|
|
82
|
+
"""
|
|
83
|
+
Takes as input the indices for a single band and inserts them into the corresponding bit arrays
|
|
84
|
+
|
|
85
|
+
Args:
|
|
86
|
+
hashvalues (List[int]): The hashvalues from a single band of a MinHash object.
|
|
87
|
+
"""
|
|
88
|
+
self.assert_size(hashvalues)
|
|
89
|
+
# https://en.wikipedia.org/wiki/Universal_hashing#Hashing_vectors
|
|
90
|
+
# as the hashvalues are the result of a universal hashing function, their sum is also a univeral hash function
|
|
91
|
+
x = sum(hashvalues) % _mersenne_prime
|
|
92
|
+
self.bloom_filter.add(x)
|
|
93
|
+
|
|
94
|
+
def query(self, hashvalues: List[int]) -> bool:
|
|
95
|
+
"""
|
|
96
|
+
Takes as input the indices for a single band and queries them against the corresponding arrays
|
|
97
|
+
returns True if the each query returns True, otherwise returns False
|
|
98
|
+
|
|
99
|
+
Args:
|
|
100
|
+
hashvalues (List[int]): The hashvalues from a single band of a MinHash object.
|
|
101
|
+
"""
|
|
102
|
+
self.assert_size(hashvalues)
|
|
103
|
+
x = sum(hashvalues) % _mersenne_prime
|
|
104
|
+
return x in self.bloom_filter
|
|
105
|
+
else:
|
|
106
|
+
class BloomTable:
|
|
107
|
+
def __init__(self, item_count: int, fp: float, band_size: int, fname: str = None):
|
|
108
|
+
raise ImportError("Required dependency pybloomfilter is missing, did you `pip install datasketch[bloom]`?")
|
|
109
|
+
|
|
110
|
+
class MinHashLSHBloom(object):
|
|
111
|
+
"""
|
|
112
|
+
The :ref:`lsh_bloom` index.
|
|
113
|
+
It supports query with `Jaccard similarity`_ threshold.
|
|
114
|
+
Reference: `LSHBloom paper
|
|
115
|
+
<https://arxiv.org/abs/2411.04257>`_.
|
|
116
|
+
|
|
117
|
+
Args:
|
|
118
|
+
threshold (float): The Jaccard similarity threshold between 0.0 and
|
|
119
|
+
1.0. The initialized LSH index will be optimized for the threshold by
|
|
120
|
+
minizing the false positive and false negative.
|
|
121
|
+
num_perm (int): The number of permutation functions used
|
|
122
|
+
by the MinHash to be indexed. For weighted MinHash, this
|
|
123
|
+
is the sample size (`sample_size`).
|
|
124
|
+
n (int): The number of elements to be inserted (estimate of dataset size).
|
|
125
|
+
fp (float): The false positive rate for each Bloom filter. Must be in (0,1).
|
|
126
|
+
save_dir (str): The directory to save the Bloom filter index to. If Bloom filters
|
|
127
|
+
already exist in this directory, the index will be loaded from here. If None,
|
|
128
|
+
an in-memory index will be created - this index can not be persisted.
|
|
129
|
+
weights (Tuple[float, float]): Used to adjust the relative importance of
|
|
130
|
+
minimizing false positive and false negative when optimizing
|
|
131
|
+
for the Jaccard similarity threshold.
|
|
132
|
+
`weights` is a tuple in the format of
|
|
133
|
+
:code:`(false_positive_weight, false_negative_weight)`.
|
|
134
|
+
params (Optiona[Tuple[int, int]]): The LSH parameters (i.e., number of bands and size
|
|
135
|
+
of each bands). This is used to bypass the parameter optimization
|
|
136
|
+
step in the constructor. `threshold` and `weights` will be ignored
|
|
137
|
+
if this is given.
|
|
138
|
+
|
|
139
|
+
Note:
|
|
140
|
+
This algorithm is a space optimized version of MinHashLSH.
|
|
141
|
+
For more details on :ref:`minhash_lsh`, see the documentation.
|
|
142
|
+
|
|
143
|
+
This algorithm uses Bloom filters to drastically reduce the space
|
|
144
|
+
that the LSH index occupies on disk. However, it loses the ability
|
|
145
|
+
to retrieve candidate duplicate keys. Rather, it can only tell you
|
|
146
|
+
whether a query set is a duplicate of a set that was inserted previously.
|
|
147
|
+
This enables scaling to datasets of many hundreds of millions or billions
|
|
148
|
+
of documents, but may not be appropriate for all use cases.
|
|
149
|
+
|
|
150
|
+
Examples:
|
|
151
|
+
|
|
152
|
+
Create an index with 128 permutation functions optimized for Jaccard
|
|
153
|
+
threshold 0.9:
|
|
154
|
+
|
|
155
|
+
.. code-block:: python
|
|
156
|
+
|
|
157
|
+
from datasketch import MinHash, MinHashLSH
|
|
158
|
+
|
|
159
|
+
set1 = set(['minhash', 'is', 'a', 'probabilistic', 'data', 'structure', 'for',
|
|
160
|
+
'estimating', 'the', 'similarity', 'between', 'datasets'])
|
|
161
|
+
set2 = set(['minhash', 'is', 'a', 'probability', 'data', 'structure', 'for',
|
|
162
|
+
'estimating', 'the', 'similarity', 'between', 'documents'])
|
|
163
|
+
set3 = set(['minhash', 'is', 'probability', 'data', 'structure', 'for',
|
|
164
|
+
'estimating', 'the', 'similarity', 'between', 'documents'])
|
|
165
|
+
|
|
166
|
+
m1 = MinHash(num_perm=128)
|
|
167
|
+
m2 = MinHash(num_perm=128)
|
|
168
|
+
m3 = MinHash(num_perm=128)
|
|
169
|
+
for d in set1:
|
|
170
|
+
m1.update(d.encode('utf8'))
|
|
171
|
+
for d in set2:
|
|
172
|
+
m2.update(d.encode('utf8'))
|
|
173
|
+
for d in set3:
|
|
174
|
+
m3.update(d.encode('utf8'))
|
|
175
|
+
|
|
176
|
+
# Create LSHBloom index
|
|
177
|
+
lsh = MinHashLSHBloom(threshold=0.5, num_perm=128, n=100, fp=0.0001, save_dir="./index/")
|
|
178
|
+
lsh.insert(m2)
|
|
179
|
+
lsh.insert(m3)
|
|
180
|
+
|
|
181
|
+
# Query whether m1 is a duplicate according to the given threshold
|
|
182
|
+
is_duplicate = lsh.query(m1)
|
|
183
|
+
"""
|
|
184
|
+
|
|
185
|
+
def __init__(
|
|
186
|
+
self,
|
|
187
|
+
threshold: float = 0.9,
|
|
188
|
+
num_perm: int = 128,
|
|
189
|
+
n: int = None,
|
|
190
|
+
fp: float = None,
|
|
191
|
+
save_dir: str = None,
|
|
192
|
+
weights: Tuple[float, float] = (0.5, 0.5),
|
|
193
|
+
params: Optional[Tuple[int, int]] = None,
|
|
194
|
+
) -> None:
|
|
195
|
+
if threshold > 1.0 or threshold < 0.0:
|
|
196
|
+
raise ValueError("threshold must be in [0.0, 1.0]")
|
|
197
|
+
if num_perm < 2:
|
|
198
|
+
raise ValueError("Too few permutation functions")
|
|
199
|
+
if n <= 0:
|
|
200
|
+
raise ValueError("n for LSHBloom must be >= 0")
|
|
201
|
+
if fp >= 1.0 or fp <= 0.0:
|
|
202
|
+
raise ValueError("fp must be in (0.0, 1.0)")
|
|
203
|
+
if save_dir is None:
|
|
204
|
+
warnings.warn("Creating LSHBloom index without save directory, this index will not be persisted.", RuntimeWarning)
|
|
205
|
+
if any(w < 0.0 or w > 1.0 for w in weights):
|
|
206
|
+
raise ValueError("Weight must be in [0.0, 1.0]")
|
|
207
|
+
if sum(weights) != 1.0:
|
|
208
|
+
raise ValueError("Weights must sum to 1.0")
|
|
209
|
+
self.h = num_perm
|
|
210
|
+
if params is not None:
|
|
211
|
+
self.b, self.r = params
|
|
212
|
+
if self.b * self.r > num_perm:
|
|
213
|
+
raise ValueError(
|
|
214
|
+
"The product of b and r in params is "
|
|
215
|
+
"{} * {} = {} -- it must be less than num_perm {}. "
|
|
216
|
+
"Did you forget to specify num_perm?".format(
|
|
217
|
+
self.b, self.r, self.b * self.r, num_perm
|
|
218
|
+
)
|
|
219
|
+
)
|
|
220
|
+
else:
|
|
221
|
+
false_positive_weight, false_negative_weight = weights
|
|
222
|
+
self.b, self.r = _optimal_param(
|
|
223
|
+
threshold, num_perm, false_positive_weight, false_negative_weight
|
|
224
|
+
)
|
|
225
|
+
if self.b < 2:
|
|
226
|
+
raise ValueError("The number of bands are too small (b < 2)")
|
|
227
|
+
|
|
228
|
+
# create a Bloom filter for each band in the signature matrix
|
|
229
|
+
if save_dir is not None:
|
|
230
|
+
os.makedirs(save_dir, exist_ok=True)
|
|
231
|
+
self.hashtables = [
|
|
232
|
+
BloomTable(
|
|
233
|
+
item_count=n,
|
|
234
|
+
fp=fp, band_size=self.r,
|
|
235
|
+
fname=os.path.join(save_dir, f"band-{i}.bf") if save_dir is not None else None,
|
|
236
|
+
)
|
|
237
|
+
for i in range(self.b)
|
|
238
|
+
]
|
|
239
|
+
self.hashranges = [(i * self.r, (i + 1) * self.r) for i in range(self.b)]
|
|
240
|
+
|
|
241
|
+
def insert(
|
|
242
|
+
self,
|
|
243
|
+
minhash: MinHash
|
|
244
|
+
):
|
|
245
|
+
"""
|
|
246
|
+
Insert the MinHash or Weighted MinHash
|
|
247
|
+
of a set to the index.
|
|
248
|
+
|
|
249
|
+
Args:
|
|
250
|
+
minhash (Union[MinHash, WeightedMinHash]): The MinHash of the set.
|
|
251
|
+
|
|
252
|
+
"""
|
|
253
|
+
self._insert(minhash)
|
|
254
|
+
|
|
255
|
+
def _insert(
|
|
256
|
+
self,
|
|
257
|
+
minhash: MinHash
|
|
258
|
+
):
|
|
259
|
+
if len(minhash) != self.h:
|
|
260
|
+
raise ValueError(
|
|
261
|
+
"Expecting minhash with length %d, got %d" % (self.h, len(minhash))
|
|
262
|
+
)
|
|
263
|
+
|
|
264
|
+
Hs = [minhash.hashvalues[start:end] for start, end in self.hashranges]
|
|
265
|
+
|
|
266
|
+
for H, hashtable in zip(Hs, self.hashtables):
|
|
267
|
+
hashtable.insert(H)
|
|
268
|
+
|
|
269
|
+
def query(self, minhash) -> bool:
|
|
270
|
+
"""
|
|
271
|
+
Given the MinHash of the query set, determine
|
|
272
|
+
whether any previously inserted sets have
|
|
273
|
+
Jaccard similarity with the query that is
|
|
274
|
+
likely greater than the threshold.
|
|
275
|
+
|
|
276
|
+
Results are based on minhash segment collision
|
|
277
|
+
and are thus approximate.
|
|
278
|
+
|
|
279
|
+
Args:
|
|
280
|
+
minhash (MinHash): The MinHash of the query set.
|
|
281
|
+
|
|
282
|
+
Returns:
|
|
283
|
+
bool: Whether the item is a duplicate or not, based on the given threshold.
|
|
284
|
+
|
|
285
|
+
Example:
|
|
286
|
+
|
|
287
|
+
.. code-block:: python
|
|
288
|
+
|
|
289
|
+
from datasketch import MinHash, MinHashLSHBloom
|
|
290
|
+
import numpy as np
|
|
291
|
+
|
|
292
|
+
# Generate 100 random MinHashes.
|
|
293
|
+
minhashes = MinHash.bulk(
|
|
294
|
+
np.random.randint(low=0, high=30, size=(100, 10)),
|
|
295
|
+
num_perm=128
|
|
296
|
+
)
|
|
297
|
+
|
|
298
|
+
# Create LSHBloom index.
|
|
299
|
+
lsh = MinHashLSHBloom(threshold=0.5, num_perm=128, n=100, fp=0.0001, save_dir="./index/")
|
|
300
|
+
for i, m in enumerate(minhashes):
|
|
301
|
+
lsh.insert(i, m)
|
|
302
|
+
|
|
303
|
+
# Get the duplication result from LSHBloom.
|
|
304
|
+
query = minhashes[0]
|
|
305
|
+
is_duplicate = lsh.query(query)
|
|
306
|
+
print(is_duplicate)
|
|
307
|
+
|
|
308
|
+
Output:
|
|
309
|
+
|
|
310
|
+
.. code-block::
|
|
311
|
+
|
|
312
|
+
True
|
|
313
|
+
|
|
314
|
+
Note that although the threshold is set to 0.5, the results are not
|
|
315
|
+
guaranteed to be above 0.5 because the LSHBloom index is approximate and
|
|
316
|
+
the Jaccard similarity is estimated by MinHash.
|
|
317
|
+
|
|
318
|
+
"""
|
|
319
|
+
if len(minhash) != self.h:
|
|
320
|
+
raise ValueError(
|
|
321
|
+
"Expecting minhash with length %d, got %d" % (self.h, len(minhash))
|
|
322
|
+
)
|
|
323
|
+
|
|
324
|
+
# if we match in any band, this is a candidate pair
|
|
325
|
+
for (start, end), hashtable in zip(self.hashranges, self.hashtables):
|
|
326
|
+
H = minhash.hashvalues[start:end]
|
|
327
|
+
collision = hashtable.query(H)
|
|
328
|
+
if collision:
|
|
329
|
+
return True
|
|
330
|
+
return False
|
|
331
|
+
|
|
332
|
+
def sync(self):
|
|
333
|
+
print("Saving Bloom Index...")
|
|
334
|
+
for table in self.hashtables:
|
|
335
|
+
table.sync()
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.7.0"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: datasketch
|
|
3
|
-
Version: 1.6.5
|
|
3
|
+
Version: 1.7.0
|
|
4
4
|
Summary: Probabilistic data structures for processing and searching very large datasets
|
|
5
5
|
Home-page: https://ekzhu.github.io/datasketch
|
|
6
6
|
Author: ekzhu
|
|
@@ -13,11 +13,11 @@ Classifier: Intended Audience :: Developers
|
|
|
13
13
|
Classifier: Topic :: Database
|
|
14
14
|
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
15
15
|
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
-
Classifier: Programming Language :: Python :: 3.7
|
|
17
16
|
Classifier: Programming Language :: Python :: 3.8
|
|
18
17
|
Classifier: Programming Language :: Python :: 3.9
|
|
19
18
|
Classifier: Programming Language :: Python :: 3.10
|
|
20
19
|
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
21
|
License-File: LICENSE
|
|
22
22
|
Requires-Dist: numpy>=1.11
|
|
23
23
|
Requires-Dist: scipy>=1.0.0
|
|
@@ -25,6 +25,9 @@ Provides-Extra: cassandra
|
|
|
25
25
|
Requires-Dist: cassandra-driver>=3.20; extra == "cassandra"
|
|
26
26
|
Provides-Extra: redis
|
|
27
27
|
Requires-Dist: redis>=2.10.0; extra == "redis"
|
|
28
|
+
Provides-Extra: bloom
|
|
29
|
+
Requires-Dist: pybloomfilter3>=0.7.2; python_version >= "3.9" and extra == "bloom"
|
|
30
|
+
Requires-Dist: pybloomfiltermmap3==0.6.0; python_version < "3.9" and extra == "bloom"
|
|
28
31
|
Provides-Extra: benchmark
|
|
29
32
|
Requires-Dist: pyhash>=0.9.3; extra == "benchmark"
|
|
30
33
|
Requires-Dist: matplotlib>=3.1.2; extra == "benchmark"
|
|
@@ -45,8 +48,20 @@ Requires-Dist: nose>=1.3.7; extra == "test"
|
|
|
45
48
|
Requires-Dist: nose-exclude>=0.5.0; extra == "test"
|
|
46
49
|
Requires-Dist: pytest; extra == "test"
|
|
47
50
|
Provides-Extra: experimental-aio
|
|
48
|
-
Requires-Dist: aiounittest; python_version >= "3.
|
|
49
|
-
Requires-Dist: motor; python_version >= "3.
|
|
51
|
+
Requires-Dist: aiounittest; python_version >= "3.8" and extra == "experimental-aio"
|
|
52
|
+
Requires-Dist: motor>3.6.0; python_version >= "3.8" and extra == "experimental-aio"
|
|
53
|
+
Dynamic: author
|
|
54
|
+
Dynamic: author-email
|
|
55
|
+
Dynamic: classifier
|
|
56
|
+
Dynamic: description
|
|
57
|
+
Dynamic: home-page
|
|
58
|
+
Dynamic: keywords
|
|
59
|
+
Dynamic: license
|
|
60
|
+
Dynamic: license-file
|
|
61
|
+
Dynamic: project-url
|
|
62
|
+
Dynamic: provides-extra
|
|
63
|
+
Dynamic: requires-dist
|
|
64
|
+
Dynamic: summary
|
|
50
65
|
|
|
51
66
|
datasketch: Big Data Looks Small
|
|
52
67
|
================================
|
|
@@ -83,6 +98,8 @@ sub-linear query time:
|
|
|
83
98
|
+===========================+=============================+========================+
|
|
84
99
|
| `MinHash LSH`_ | MinHash, Weighted MinHash | Jaccard Threshold |
|
|
85
100
|
+---------------------------+-----------------------------+------------------------+
|
|
101
|
+
| `LSHBloom`_ | MinHash, Weighted MinHash | Jaccard Threshold |
|
|
102
|
+
+---------------------------+-----------------------------+------------------------+
|
|
86
103
|
| `MinHash LSH Forest`_ | MinHash, Weighted MinHash | Jaccard Top-K |
|
|
87
104
|
+---------------------------+-----------------------------+------------------------+
|
|
88
105
|
| `MinHash LSH Ensemble`_ | MinHash | Containment Threshold |
|
|
@@ -90,7 +107,7 @@ sub-linear query time:
|
|
|
90
107
|
| `HNSW`_ | Any | Custom Metric Top-K |
|
|
91
108
|
+---------------------------+-----------------------------+------------------------+
|
|
92
109
|
|
|
93
|
-
datasketch must be used with Python 3.7 or above, NumPy 1.11 or above, and Scipy.
|
|
110
|
+
datasketch must be used with Python 3.8 or above, NumPy 1.11 or above, and Scipy.
|
|
94
111
|
|
|
95
112
|
Note that `MinHash LSH`_ and `MinHash LSH Ensemble`_ also support Redis and Cassandra
|
|
96
113
|
storage layer (see `MinHash LSH at Scale`_).
|
|
@@ -118,6 +135,11 @@ To install with Cassandra dependency:
|
|
|
118
135
|
|
|
119
136
|
pip install datasketch[cassandra]
|
|
120
137
|
|
|
138
|
+
To install with Bloom filter dependency:
|
|
139
|
+
|
|
140
|
+
::
|
|
141
|
+
|
|
142
|
+
pip install datasketch[bloom]
|
|
121
143
|
|
|
122
144
|
.. _`MinHash`: https://ekzhu.github.io/datasketch/minhash.html
|
|
123
145
|
.. _`Weighted MinHash`: https://ekzhu.github.io/datasketch/weightedminhash.html
|
|
@@ -126,5 +148,6 @@ To install with Cassandra dependency:
|
|
|
126
148
|
.. _`MinHash LSH`: https://ekzhu.github.io/datasketch/lsh.html
|
|
127
149
|
.. _`MinHash LSH Forest`: https://ekzhu.github.io/datasketch/lshforest.html
|
|
128
150
|
.. _`MinHash LSH Ensemble`: https://ekzhu.github.io/datasketch/lshensemble.html
|
|
151
|
+
.. _`LSHBloom`: https://ekzhu.github.io/datasketch/lshbloom.html
|
|
129
152
|
.. _`Minhash LSH at Scale`: http://ekzhu.github.io/datasketch/lsh.html#minhash-lsh-at-scale
|
|
130
153
|
.. _`HNSW`: https://ekzhu.github.io/datasketch/documentation.html#hnsw
|
|
@@ -9,6 +9,7 @@ datasketch/hyperloglog.py
|
|
|
9
9
|
datasketch/hyperloglog_const.py
|
|
10
10
|
datasketch/lean_minhash.py
|
|
11
11
|
datasketch/lsh.py
|
|
12
|
+
datasketch/lsh_bloom.py
|
|
12
13
|
datasketch/lshensemble.py
|
|
13
14
|
datasketch/lshensemble_partition.py
|
|
14
15
|
datasketch/lshforest.py
|
|
@@ -30,6 +31,7 @@ test/test_hyperloglog.py
|
|
|
30
31
|
test/test_lean_minhash.py
|
|
31
32
|
test/test_lsh.py
|
|
32
33
|
test/test_lsh_cassandra.py
|
|
34
|
+
test/test_lshbloom.py
|
|
33
35
|
test/test_lshensemble.py
|
|
34
36
|
test/test_lshforest.py
|
|
35
37
|
test/test_minhash.py
|
|
@@ -11,14 +11,22 @@ SetSimilaritySearch>=0.1.7
|
|
|
11
11
|
pyfarmhash>=0.2.2
|
|
12
12
|
nltk>=3.4.5
|
|
13
13
|
|
|
14
|
+
[bloom]
|
|
15
|
+
|
|
16
|
+
[bloom:python_version < "3.9"]
|
|
17
|
+
pybloomfiltermmap3==0.6.0
|
|
18
|
+
|
|
19
|
+
[bloom:python_version >= "3.9"]
|
|
20
|
+
pybloomfilter3>=0.7.2
|
|
21
|
+
|
|
14
22
|
[cassandra]
|
|
15
23
|
cassandra-driver>=3.20
|
|
16
24
|
|
|
17
25
|
[experimental_aio]
|
|
18
26
|
|
|
19
|
-
[experimental_aio:python_version >= "3.
|
|
27
|
+
[experimental_aio:python_version >= "3.8"]
|
|
20
28
|
aiounittest
|
|
21
|
-
motor
|
|
29
|
+
motor>3.6.0
|
|
22
30
|
|
|
23
31
|
[redis]
|
|
24
32
|
redis>=2.10.0
|
|
@@ -39,11 +39,11 @@ setup(
|
|
|
39
39
|
'Topic :: Database',
|
|
40
40
|
'Topic :: Scientific/Engineering :: Information Analysis',
|
|
41
41
|
'License :: OSI Approved :: MIT License',
|
|
42
|
-
'Programming Language :: Python :: 3.7',
|
|
43
42
|
'Programming Language :: Python :: 3.8',
|
|
44
43
|
'Programming Language :: Python :: 3.9',
|
|
45
44
|
'Programming Language :: Python :: 3.10',
|
|
46
45
|
'Programming Language :: Python :: 3.11',
|
|
46
|
+
'Programming Language :: Python :: 3.12',
|
|
47
47
|
],
|
|
48
48
|
keywords='database datamining',
|
|
49
49
|
packages=find_packages(include=['datasketch*']),
|
|
@@ -58,6 +58,10 @@ setup(
|
|
|
58
58
|
'redis': [
|
|
59
59
|
'redis>=2.10.0',
|
|
60
60
|
],
|
|
61
|
+
'bloom': [
|
|
62
|
+
'pybloomfilter3>=0.7.2 ; python_version>="3.9"',
|
|
63
|
+
'pybloomfiltermmap3==0.6.0 ; python_version<"3.9"',
|
|
64
|
+
],
|
|
61
65
|
'benchmark': [
|
|
62
66
|
'pyhash>=0.9.3',
|
|
63
67
|
'matplotlib>=3.1.2',
|
|
@@ -80,8 +84,8 @@ setup(
|
|
|
80
84
|
'pytest',
|
|
81
85
|
],
|
|
82
86
|
'experimental_aio': [
|
|
83
|
-
"aiounittest ; python_version>='3.
|
|
84
|
-
"motor ; python_version>='3.
|
|
87
|
+
"aiounittest ; python_version>='3.8'",
|
|
88
|
+
"motor>3.6.0 ; python_version>='3.8'",
|
|
85
89
|
],
|
|
86
90
|
},
|
|
87
91
|
)
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
import unittest
|
|
2
|
+
import pickle
|
|
3
|
+
from glob import glob
|
|
4
|
+
from datasketch.lsh_bloom import BloomTable, MinHashLSHBloom
|
|
5
|
+
from datasketch.minhash import MinHash
|
|
6
|
+
import numpy as np
|
|
7
|
+
import os
|
|
8
|
+
|
|
9
|
+
class TestBloomTable(unittest.TestCase):
|
|
10
|
+
def test_insert(self):
|
|
11
|
+
r = 3
|
|
12
|
+
x = np.array([2,3,31], dtype=np.uint32)
|
|
13
|
+
b = BloomTable(10, 0.01, band_size=r)
|
|
14
|
+
b.insert(x)
|
|
15
|
+
self.assertRaises(RuntimeError, b.insert, np.array([2,2], dtype=np.uint32))
|
|
16
|
+
|
|
17
|
+
def test_query(self):
|
|
18
|
+
r = 3
|
|
19
|
+
x = np.array([2,3,31], dtype=np.uint32)
|
|
20
|
+
b = BloomTable(10, 0.01, band_size=r)
|
|
21
|
+
b.insert(x)
|
|
22
|
+
self.assertTrue(b.query(x))
|
|
23
|
+
self.assertFalse(b.query(np.array([2,3,30], dtype=np.uint32)))
|
|
24
|
+
self.assertRaises(RuntimeError, b.query, [2,2])
|
|
25
|
+
|
|
26
|
+
def test_save(self):
|
|
27
|
+
fname = "/tmp/bloomfilter.bf"
|
|
28
|
+
if os.path.exists(fname):
|
|
29
|
+
os.remove(fname)
|
|
30
|
+
r = 3
|
|
31
|
+
x = np.array([2,3,31], dtype=np.uint32)
|
|
32
|
+
y = np.array([12,10,29], dtype=np.uint32)
|
|
33
|
+
z = np.array([27,30,8], dtype=np.uint32)
|
|
34
|
+
items = [x,y,z]
|
|
35
|
+
b = BloomTable(10, 0.01, band_size=r, fname=fname)
|
|
36
|
+
for item in items:
|
|
37
|
+
b.insert(item)
|
|
38
|
+
for item in items:
|
|
39
|
+
self.assertTrue(b.query(item))
|
|
40
|
+
b.sync()
|
|
41
|
+
|
|
42
|
+
del b
|
|
43
|
+
|
|
44
|
+
b_ = BloomTable(10, 0.01, band_size=r, fname=fname)
|
|
45
|
+
for item in items:
|
|
46
|
+
self.assertTrue(b_.query(item))
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class TestMinHashLSHBloom(unittest.TestCase):
|
|
50
|
+
|
|
51
|
+
def test_init(self):
|
|
52
|
+
lsh = MinHashLSHBloom(threshold=0.8, n=10, fp=0.01)
|
|
53
|
+
b1, r1 = lsh.b, lsh.r
|
|
54
|
+
lsh = MinHashLSHBloom(threshold=0.8, weights=(0.2,0.8), n=10, fp=0.01)
|
|
55
|
+
b2, r2 = lsh.b, lsh.r
|
|
56
|
+
self.assertTrue(b1 < b2)
|
|
57
|
+
self.assertTrue(r1 > r2)
|
|
58
|
+
self.assertTrue(len(lsh.hashtables) == lsh.b)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def test_insert(self):
|
|
62
|
+
lsh = MinHashLSHBloom(threshold=0.5, num_perm=16, n=10, fp=0.01)
|
|
63
|
+
m1 = MinHash(16)
|
|
64
|
+
m1.update("a".encode("utf8"))
|
|
65
|
+
m2 = MinHash(16)
|
|
66
|
+
m2.update("b".encode("utf8"))
|
|
67
|
+
lsh.insert(m1)
|
|
68
|
+
lsh.insert(m2)
|
|
69
|
+
|
|
70
|
+
m3 = MinHash(18)
|
|
71
|
+
self.assertRaises(ValueError, lsh.insert, m3)
|
|
72
|
+
|
|
73
|
+
def test_query(self):
|
|
74
|
+
lsh = MinHashLSHBloom(threshold=0.5, num_perm=16, n=10, fp=0.01)
|
|
75
|
+
m1 = MinHash(16)
|
|
76
|
+
m1.update("a".encode("utf8"))
|
|
77
|
+
m2 = MinHash(16)
|
|
78
|
+
m2.update("b".encode("utf8"))
|
|
79
|
+
lsh.insert(m1)
|
|
80
|
+
lsh.insert(m2)
|
|
81
|
+
result = lsh.query(m1)
|
|
82
|
+
self.assertTrue(result)
|
|
83
|
+
result = lsh.query(m2)
|
|
84
|
+
self.assertTrue(result)
|
|
85
|
+
|
|
86
|
+
m3 = MinHash(18)
|
|
87
|
+
self.assertRaises(ValueError, lsh.query, m3)
|
|
88
|
+
|
|
89
|
+
def test_save(self):
|
|
90
|
+
save_path = "./test_save/"
|
|
91
|
+
for item in glob(f"{save_path}/*.bf"):
|
|
92
|
+
os.remove(item)
|
|
93
|
+
|
|
94
|
+
lsh = MinHashLSHBloom(threshold=0.5, num_perm=16, n=10, fp=0.01, save_dir=save_path)
|
|
95
|
+
m1 = MinHash(16)
|
|
96
|
+
m1.update("a".encode("utf8"))
|
|
97
|
+
m2 = MinHash(16)
|
|
98
|
+
m2.update("b".encode("utf8"))
|
|
99
|
+
lsh.insert(m1)
|
|
100
|
+
lsh.insert(m2)
|
|
101
|
+
lsh.sync()
|
|
102
|
+
|
|
103
|
+
lsh2 = MinHashLSHBloom(threshold=0.5, num_perm=16, n=10, fp=0.01, save_dir=save_path)
|
|
104
|
+
result = lsh2.query(m1)
|
|
105
|
+
self.assertTrue(result)
|
|
106
|
+
result = lsh2.query(m2)
|
|
107
|
+
self.assertTrue(result)
|
|
108
|
+
|
|
109
|
+
def test_save_in_memory(self):
|
|
110
|
+
|
|
111
|
+
with self.assertWarns(RuntimeWarning):
|
|
112
|
+
lsh = MinHashLSHBloom(threshold=0.5, num_perm=16, n=10, fp=0.01, save_dir=None)
|
|
113
|
+
|
|
114
|
+
m1 = MinHash(16)
|
|
115
|
+
m1.update("a".encode("utf8"))
|
|
116
|
+
m2 = MinHash(16)
|
|
117
|
+
m2.update("b".encode("utf8"))
|
|
118
|
+
lsh.insert(m1)
|
|
119
|
+
lsh.insert(m2)
|
|
120
|
+
|
|
121
|
+
with self.assertWarns(RuntimeWarning):
|
|
122
|
+
lsh.sync()
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
if __name__ == "__main__":
|
|
126
|
+
unittest.main()
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "1.6.5"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|