datasketch 1.6.4__tar.gz → 1.6.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datasketch-1.6.4 → datasketch-1.6.5}/PKG-INFO +5 -5
- {datasketch-1.6.4 → datasketch-1.6.5}/README.rst +4 -4
- {datasketch-1.6.4 → datasketch-1.6.5}/datasketch/lsh.py +55 -0
- {datasketch-1.6.4 → datasketch-1.6.5}/datasketch/lshforest.py +25 -0
- datasketch-1.6.5/datasketch/version.py +1 -0
- {datasketch-1.6.4 → datasketch-1.6.5}/datasketch.egg-info/PKG-INFO +5 -5
- {datasketch-1.6.4 → datasketch-1.6.5}/test/test_lsh.py +111 -0
- {datasketch-1.6.4 → datasketch-1.6.5}/test/test_lshforest.py +12 -0
- datasketch-1.6.4/datasketch/version.py +0 -1
- {datasketch-1.6.4 → datasketch-1.6.5}/LICENSE +0 -0
- {datasketch-1.6.4 → datasketch-1.6.5}/datasketch/__init__.py +0 -0
- {datasketch-1.6.4 → datasketch-1.6.5}/datasketch/b_bit_minhash.py +0 -0
- {datasketch-1.6.4 → datasketch-1.6.5}/datasketch/experimental/__init__.py +0 -0
- {datasketch-1.6.4 → datasketch-1.6.5}/datasketch/experimental/aio/__init__.py +0 -0
- {datasketch-1.6.4 → datasketch-1.6.5}/datasketch/experimental/aio/lsh.py +0 -0
- {datasketch-1.6.4 → datasketch-1.6.5}/datasketch/experimental/aio/storage.py +0 -0
- {datasketch-1.6.4 → datasketch-1.6.5}/datasketch/hashfunc.py +0 -0
- {datasketch-1.6.4 → datasketch-1.6.5}/datasketch/hnsw.py +0 -0
- {datasketch-1.6.4 → datasketch-1.6.5}/datasketch/hyperloglog.py +0 -0
- {datasketch-1.6.4 → datasketch-1.6.5}/datasketch/hyperloglog_const.py +0 -0
- {datasketch-1.6.4 → datasketch-1.6.5}/datasketch/lean_minhash.py +0 -0
- {datasketch-1.6.4 → datasketch-1.6.5}/datasketch/lshensemble.py +0 -0
- {datasketch-1.6.4 → datasketch-1.6.5}/datasketch/lshensemble_partition.py +0 -0
- {datasketch-1.6.4 → datasketch-1.6.5}/datasketch/minhash.py +0 -0
- {datasketch-1.6.4 → datasketch-1.6.5}/datasketch/storage.py +0 -0
- {datasketch-1.6.4 → datasketch-1.6.5}/datasketch/weighted_minhash.py +0 -0
- {datasketch-1.6.4 → datasketch-1.6.5}/datasketch.egg-info/SOURCES.txt +0 -0
- {datasketch-1.6.4 → datasketch-1.6.5}/datasketch.egg-info/dependency_links.txt +0 -0
- {datasketch-1.6.4 → datasketch-1.6.5}/datasketch.egg-info/requires.txt +0 -0
- {datasketch-1.6.4 → datasketch-1.6.5}/datasketch.egg-info/top_level.txt +0 -0
- {datasketch-1.6.4 → datasketch-1.6.5}/setup.cfg +0 -0
- {datasketch-1.6.4 → datasketch-1.6.5}/setup.py +0 -0
- {datasketch-1.6.4 → datasketch-1.6.5}/test/test_hnsw.py +0 -0
- {datasketch-1.6.4 → datasketch-1.6.5}/test/test_hyperloglog.py +0 -0
- {datasketch-1.6.4 → datasketch-1.6.5}/test/test_lean_minhash.py +0 -0
- {datasketch-1.6.4 → datasketch-1.6.5}/test/test_lsh_cassandra.py +0 -0
- {datasketch-1.6.4 → datasketch-1.6.5}/test/test_lshensemble.py +0 -0
- {datasketch-1.6.4 → datasketch-1.6.5}/test/test_minhash.py +0 -0
- {datasketch-1.6.4 → datasketch-1.6.5}/test/test_weighted_minhash.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datasketch
|
|
3
|
-
Version: 1.6.4
|
|
3
|
+
Version: 1.6.5
|
|
4
4
|
Summary: Probabilistic data structures for processing and searching very large datasets
|
|
5
5
|
Home-page: https://ekzhu.github.io/datasketch
|
|
6
6
|
Author: ekzhu
|
|
@@ -51,11 +51,11 @@ Requires-Dist: motor; python_version >= "3.6" and extra == "experimental-aio"
|
|
|
51
51
|
datasketch: Big Data Looks Small
|
|
52
52
|
================================
|
|
53
53
|
|
|
54
|
-
.. image:: https://
|
|
55
|
-
:target: https://
|
|
54
|
+
.. image:: https://static.pepy.tech/badge/datasketch/month
|
|
55
|
+
:target: https://pepy.tech/project/datasketch
|
|
56
56
|
|
|
57
|
-
.. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.
|
|
58
|
-
:target: https://
|
|
57
|
+
.. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.598238.svg
|
|
58
|
+
:target: https://zenodo.org/doi/10.5281/zenodo.598238
|
|
59
59
|
|
|
60
60
|
datasketch gives you probabilistic data structures that can process and
|
|
61
61
|
search very large amount of data super fast, with little loss of
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
datasketch: Big Data Looks Small
|
|
2
2
|
================================
|
|
3
3
|
|
|
4
|
-
.. image:: https://
|
|
5
|
-
:target: https://
|
|
4
|
+
.. image:: https://static.pepy.tech/badge/datasketch/month
|
|
5
|
+
:target: https://pepy.tech/project/datasketch
|
|
6
6
|
|
|
7
|
-
.. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.
|
|
8
|
-
:target: https://
|
|
7
|
+
.. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.598238.svg
|
|
8
|
+
:target: https://zenodo.org/doi/10.5281/zenodo.598238
|
|
9
9
|
|
|
10
10
|
datasketch gives you probabilistic data structures that can process and
|
|
11
11
|
search very large amount of data super fast, with little loss of
|
|
@@ -226,6 +226,29 @@ class MinHashLSH(object):
|
|
|
226
226
|
"""
|
|
227
227
|
self._insert(key, minhash, check_duplication=check_duplication, buffer=False)
|
|
228
228
|
|
|
229
|
+
def merge(
|
|
230
|
+
self,
|
|
231
|
+
other: MinHashLSH,
|
|
232
|
+
check_overlap: bool = False
|
|
233
|
+
):
|
|
234
|
+
"""Merge the other MinHashLSH with this one, making this one the union
|
|
235
|
+
of both.
|
|
236
|
+
|
|
237
|
+
Note:
|
|
238
|
+
Only num_perm, number of bands and sizes of each band is checked for equivalency of two MinHashLSH indexes.
|
|
239
|
+
Other initialization parameters threshold, weights, storage_config, prepickle and hash_func are not checked.
|
|
240
|
+
|
|
241
|
+
Args:
|
|
242
|
+
other (MinHashLSH): The other MinHashLSH.
|
|
243
|
+
check_overlap (bool): Check if there are any overlapping keys before merging and raise if there are any.
|
|
244
|
+
(`default=False`)
|
|
245
|
+
|
|
246
|
+
Raises:
|
|
247
|
+
ValueError: If the two MinHashLSH have different initialization
|
|
248
|
+
parameters, or if `check_overlap` is `True` and there are overlapping keys.
|
|
249
|
+
"""
|
|
250
|
+
self._merge(other, check_overlap=check_overlap, buffer=False)
|
|
251
|
+
|
|
229
252
|
def insertion_session(self, buffer_size: int = 50000) -> MinHashLSHInsertionSession:
|
|
230
253
|
"""
|
|
231
254
|
Create a context manager for fast insertion into this index.
|
|
@@ -282,6 +305,38 @@ class MinHashLSH(object):
|
|
|
282
305
|
for H, hashtable in zip(Hs, self.hashtables):
|
|
283
306
|
hashtable.insert(H, key, buffer=buffer)
|
|
284
307
|
|
|
308
|
+
def __equivalent(self, other:MinHashLSH) -> bool:
|
|
309
|
+
"""
|
|
310
|
+
Returns:
|
|
311
|
+
bool: If the two MinHashLSH have equal num_perm, number of bands, size of each band then two are equivalent.
|
|
312
|
+
"""
|
|
313
|
+
return (
|
|
314
|
+
type(self) is type(other) and
|
|
315
|
+
self.h == other.h and
|
|
316
|
+
self.b == other.b and
|
|
317
|
+
self.r == other.r
|
|
318
|
+
)
|
|
319
|
+
|
|
320
|
+
def _merge(
|
|
321
|
+
self,
|
|
322
|
+
other: MinHashLSH,
|
|
323
|
+
check_overlap: bool = False,
|
|
324
|
+
buffer: bool = False
|
|
325
|
+
) -> MinHashLSH:
|
|
326
|
+
if self.__equivalent(other):
|
|
327
|
+
if check_overlap and set(self.keys).intersection(set(other.keys)):
|
|
328
|
+
raise ValueError("The keys are overlapping, duplicate key exists.")
|
|
329
|
+
for key in other.keys:
|
|
330
|
+
Hs = other.keys.get(key)
|
|
331
|
+
self.keys.insert(key, *Hs, buffer=buffer)
|
|
332
|
+
for H, hashtable in zip(Hs, self.hashtables):
|
|
333
|
+
hashtable.insert(H, key, buffer=buffer)
|
|
334
|
+
else:
|
|
335
|
+
if type(self) is not type(other):
|
|
336
|
+
raise ValueError(f"Cannot merge type MinHashLSH and type {type(other).__name__}.")
|
|
337
|
+
raise ValueError(
|
|
338
|
+
"Cannot merge MinHashLSH with different initialization parameters.")
|
|
339
|
+
|
|
285
340
|
def query(self, minhash) -> List[Hashable]:
|
|
286
341
|
"""
|
|
287
342
|
Giving the MinHash of the query set, retrieve
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
from collections import defaultdict
|
|
2
2
|
from typing import Hashable, List
|
|
3
|
+
import numpy as np
|
|
3
4
|
|
|
4
5
|
from datasketch.minhash import MinHash
|
|
5
6
|
|
|
@@ -128,6 +129,30 @@ class MinHashLSHForest(object):
|
|
|
128
129
|
r -= 1
|
|
129
130
|
return list(results)
|
|
130
131
|
|
|
132
|
+
def get_minhash_hashvalues(self, key: Hashable) -> np.ndarray:
|
|
133
|
+
"""
|
|
134
|
+
Returns the hashvalues from the MinHash object that corresponds to the given key in the LSHForest,
|
|
135
|
+
if it exists. This is useful for when we want to reconstruct the original MinHash
|
|
136
|
+
object to manually check the Jaccard Similarity for the top-k results from a query.
|
|
137
|
+
|
|
138
|
+
Args:
|
|
139
|
+
key (Hashable): The key whose MinHash hashvalues we want to retrieve.
|
|
140
|
+
|
|
141
|
+
Returns:
|
|
142
|
+
hashvalues: The hashvalues for the MinHash object corresponding to the given key.
|
|
143
|
+
"""
|
|
144
|
+
byteslist = self.keys.get(key, None)
|
|
145
|
+
if byteslist is None:
|
|
146
|
+
raise KeyError(f"The provided key does not exist in the LSHForest: {key}")
|
|
147
|
+
hashvalue_byte_size = len(byteslist[0])//8
|
|
148
|
+
hashvalues = np.empty(len(byteslist)*hashvalue_byte_size, dtype=np.uint64)
|
|
149
|
+
for index, item in enumerate(byteslist):
|
|
150
|
+
# unswap the bytes, as their representation is flipped during storage
|
|
151
|
+
hv_segment = np.frombuffer(item, dtype=np.uint64).byteswap()
|
|
152
|
+
curr_index = index*hashvalue_byte_size
|
|
153
|
+
hashvalues[curr_index:curr_index+hashvalue_byte_size] = hv_segment
|
|
154
|
+
return hashvalues
|
|
155
|
+
|
|
131
156
|
def _binary_search(self, n, func):
|
|
132
157
|
"""
|
|
133
158
|
https://golang.org/src/sort/search.go?s=2247:2287#L49
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.6.5"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datasketch
|
|
3
|
-
Version: 1.6.4
|
|
3
|
+
Version: 1.6.5
|
|
4
4
|
Summary: Probabilistic data structures for processing and searching very large datasets
|
|
5
5
|
Home-page: https://ekzhu.github.io/datasketch
|
|
6
6
|
Author: ekzhu
|
|
@@ -51,11 +51,11 @@ Requires-Dist: motor; python_version >= "3.6" and extra == "experimental-aio"
|
|
|
51
51
|
datasketch: Big Data Looks Small
|
|
52
52
|
================================
|
|
53
53
|
|
|
54
|
-
.. image:: https://
|
|
55
|
-
:target: https://
|
|
54
|
+
.. image:: https://static.pepy.tech/badge/datasketch/month
|
|
55
|
+
:target: https://pepy.tech/project/datasketch
|
|
56
56
|
|
|
57
|
-
.. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.
|
|
58
|
-
:target: https://
|
|
57
|
+
.. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.598238.svg
|
|
58
|
+
:target: https://zenodo.org/doi/10.5281/zenodo.598238
|
|
59
59
|
|
|
60
60
|
datasketch gives you probabilistic data structures that can process and
|
|
61
61
|
search very large amount of data super fast, with little loss of
|
|
@@ -240,6 +240,117 @@ class TestMinHashLSH(unittest.TestCase):
|
|
|
240
240
|
for table in counts:
|
|
241
241
|
self.assertEqual(sum(table.values()), 2)
|
|
242
242
|
|
|
243
|
+
def test_merge(self):
|
|
244
|
+
lsh1 = MinHashLSH(threshold=0.5, num_perm=16)
|
|
245
|
+
m1 = MinHash(16)
|
|
246
|
+
m1.update("a".encode("utf-8"))
|
|
247
|
+
m2 = MinHash(16)
|
|
248
|
+
m2.update("b".encode("utf-8"))
|
|
249
|
+
lsh1.insert("a",m1)
|
|
250
|
+
lsh1.insert("b",m2)
|
|
251
|
+
|
|
252
|
+
lsh2 = MinHashLSH(threshold=0.5, num_perm=16)
|
|
253
|
+
m3 = MinHash(16)
|
|
254
|
+
m3.update("c".encode("utf-8"))
|
|
255
|
+
m4 = MinHash(16)
|
|
256
|
+
m4.update("d".encode("utf-8"))
|
|
257
|
+
lsh2.insert("c",m1)
|
|
258
|
+
lsh2.insert("d",m2)
|
|
259
|
+
|
|
260
|
+
lsh1.merge(lsh2)
|
|
261
|
+
for t in lsh1.hashtables:
|
|
262
|
+
self.assertTrue(len(t) >= 1)
|
|
263
|
+
items = []
|
|
264
|
+
for H in t:
|
|
265
|
+
items.extend(t[H])
|
|
266
|
+
self.assertTrue("c" in items)
|
|
267
|
+
self.assertTrue("d" in items)
|
|
268
|
+
self.assertTrue("a" in lsh1)
|
|
269
|
+
self.assertTrue("b" in lsh1)
|
|
270
|
+
self.assertTrue("c" in lsh1)
|
|
271
|
+
self.assertTrue("d" in lsh1)
|
|
272
|
+
for i, H in enumerate(lsh1.keys["c"]):
|
|
273
|
+
self.assertTrue("c" in lsh1.hashtables[i][H])
|
|
274
|
+
|
|
275
|
+
self.assertTrue(lsh1.merge, lsh2)
|
|
276
|
+
self.assertRaises(ValueError, lsh1.merge, lsh2, check_overlap=True)
|
|
277
|
+
|
|
278
|
+
m5 = MinHash(16)
|
|
279
|
+
m5.update("e".encode("utf-8"))
|
|
280
|
+
lsh3 = MinHashLSH(threshold=0.5, num_perm=16)
|
|
281
|
+
lsh3.insert("a",m5)
|
|
282
|
+
|
|
283
|
+
self.assertRaises(ValueError, lsh1.merge, lsh3, check_overlap=True)
|
|
284
|
+
|
|
285
|
+
lsh1.merge(lsh3)
|
|
286
|
+
|
|
287
|
+
m6 = MinHash(16)
|
|
288
|
+
m6.update("e".encode("utf-8"))
|
|
289
|
+
lsh4 = MinHashLSH(threshold=0.5, num_perm=16)
|
|
290
|
+
lsh4.insert("a",m6)
|
|
291
|
+
|
|
292
|
+
lsh1.merge(lsh4, check_overlap=False)
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
def test_merge_redis(self):
|
|
296
|
+
with patch('redis.Redis', fake_redis) as mock_redis:
|
|
297
|
+
lsh1 = MinHashLSH(threshold=0.5, num_perm=16, storage_config={
|
|
298
|
+
'type': 'redis', 'redis': {'host': 'localhost', 'port': 6379}
|
|
299
|
+
})
|
|
300
|
+
lsh2 = MinHashLSH(threshold=0.5, num_perm=16, storage_config={
|
|
301
|
+
'type': 'redis', 'redis': {'host': 'localhost', 'port': 6379}
|
|
302
|
+
})
|
|
303
|
+
|
|
304
|
+
m1 = MinHash(16)
|
|
305
|
+
m1.update("a".encode("utf8"))
|
|
306
|
+
m2 = MinHash(16)
|
|
307
|
+
m2.update("b".encode("utf8"))
|
|
308
|
+
lsh1.insert("a", m1)
|
|
309
|
+
lsh1.insert("b", m2)
|
|
310
|
+
|
|
311
|
+
m3 = MinHash(16)
|
|
312
|
+
m3.update("c".encode("utf8"))
|
|
313
|
+
m4 = MinHash(16)
|
|
314
|
+
m4.update("d".encode("utf8"))
|
|
315
|
+
lsh2.insert("c", m3)
|
|
316
|
+
lsh2.insert("d", m4)
|
|
317
|
+
|
|
318
|
+
lsh1.merge(lsh2)
|
|
319
|
+
for t in lsh1.hashtables:
|
|
320
|
+
self.assertTrue(len(t) >= 1)
|
|
321
|
+
items = []
|
|
322
|
+
for H in t:
|
|
323
|
+
items.extend(t[H])
|
|
324
|
+
self.assertTrue(pickle.dumps("c") in items)
|
|
325
|
+
self.assertTrue(pickle.dumps("d") in items)
|
|
326
|
+
self.assertTrue("a" in lsh1)
|
|
327
|
+
self.assertTrue("b" in lsh1)
|
|
328
|
+
self.assertTrue("c" in lsh1)
|
|
329
|
+
self.assertTrue("d" in lsh1)
|
|
330
|
+
for i, H in enumerate(lsh1.keys[pickle.dumps("c")]):
|
|
331
|
+
self.assertTrue(pickle.dumps("c") in lsh1.hashtables[i][H])
|
|
332
|
+
|
|
333
|
+
self.assertTrue(lsh1.merge, lsh2)
|
|
334
|
+
self.assertRaises(ValueError, lsh1.merge, lsh2, check_overlap=True)
|
|
335
|
+
|
|
336
|
+
m5 = MinHash(16)
|
|
337
|
+
m5.update("e".encode("utf-8"))
|
|
338
|
+
lsh3 = MinHashLSH(threshold=0.5, num_perm=16, storage_config={
|
|
339
|
+
'type': 'redis', 'redis': {'host': 'localhost', 'port': 6379}
|
|
340
|
+
})
|
|
341
|
+
lsh3.insert("a",m5)
|
|
342
|
+
|
|
343
|
+
self.assertRaises(ValueError, lsh1.merge, lsh3, check_overlap=True)
|
|
344
|
+
|
|
345
|
+
m6 = MinHash(16)
|
|
346
|
+
m6.update("e".encode("utf-8"))
|
|
347
|
+
lsh4 = MinHashLSH(threshold=0.5, num_perm=16, storage_config={
|
|
348
|
+
'type': 'redis', 'redis': {'host': 'localhost', 'port': 6379}
|
|
349
|
+
})
|
|
350
|
+
lsh4.insert("a",m6)
|
|
351
|
+
|
|
352
|
+
lsh1.merge(lsh4, check_overlap=False)
|
|
353
|
+
|
|
243
354
|
|
|
244
355
|
class TestWeightedMinHashLSH(unittest.TestCase):
|
|
245
356
|
|
|
@@ -62,6 +62,18 @@ class TestMinHashLSHForest(unittest.TestCase):
|
|
|
62
62
|
results = forest.query(data[key], 10)
|
|
63
63
|
self.assertIn(key, results)
|
|
64
64
|
|
|
65
|
+
def test_get_minhash_hashvalues(self):
|
|
66
|
+
forest, data = self._setup()
|
|
67
|
+
for key in data:
|
|
68
|
+
minhash_ori = data[key]
|
|
69
|
+
hashvalues = forest.get_minhash_hashvalues(key)
|
|
70
|
+
minhash_retrieved = MinHash(hashvalues=hashvalues)
|
|
71
|
+
retrieved_hashvalues = minhash_retrieved.hashvalues
|
|
72
|
+
self.assertEqual(len(hashvalues), len(retrieved_hashvalues))
|
|
73
|
+
self.assertEqual(minhash_retrieved.jaccard(minhash_ori), 1.0)
|
|
74
|
+
for i in range(len(retrieved_hashvalues)):
|
|
75
|
+
self.assertEqual(hashvalues[i], retrieved_hashvalues[i])
|
|
76
|
+
|
|
65
77
|
def test_pickle(self):
|
|
66
78
|
forest, _ = self._setup()
|
|
67
79
|
forest2 = pickle.loads(pickle.dumps(forest))
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "1.6.4"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|